{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4871, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00020529665366454526, "grad_norm": 110.00955745696662, "kd_ratio": 0.5, "learning_rate": 1.360544217687075e-07, "loss": 5.859516143798828, "loss/kd": 9.194616317749023, "loss/lm": 2.524415969848633, "step": 1 }, { "epoch": 0.0004105933073290905, "grad_norm": 109.70607926963655, "kd_ratio": 0.5, "learning_rate": 2.72108843537415e-07, "loss": 5.883566856384277, "loss/kd": 9.119073867797852, "loss/lm": 2.648059844970703, "step": 2 }, { "epoch": 0.0006158899609936358, "grad_norm": 112.78940799131291, "kd_ratio": 0.5, "learning_rate": 4.0816326530612243e-07, "loss": 6.069156169891357, "loss/kd": 9.638937950134277, "loss/lm": 2.4993743896484375, "step": 3 }, { "epoch": 0.000821186614658181, "grad_norm": 100.35898429568525, "kd_ratio": 0.5, "learning_rate": 5.4421768707483e-07, "loss": 5.9113969802856445, "loss/kd": 9.330845832824707, "loss/lm": 2.4919486045837402, "step": 4 }, { "epoch": 0.0010264832683227264, "grad_norm": 94.69097726148632, "kd_ratio": 0.5, "learning_rate": 6.802721088435376e-07, "loss": 6.132721424102783, "loss/kd": 9.664273262023926, "loss/lm": 2.6011693477630615, "step": 5 }, { "epoch": 0.0012317799219872716, "grad_norm": 117.11589454186006, "kd_ratio": 0.5, "learning_rate": 8.163265306122449e-07, "loss": 6.239492416381836, "loss/kd": 9.762786865234375, "loss/lm": 2.716197967529297, "step": 6 }, { "epoch": 0.0014370765756518168, "grad_norm": 104.23911889122019, "kd_ratio": 0.5, "learning_rate": 9.523809523809525e-07, "loss": 6.02845573425293, "loss/kd": 9.495902061462402, "loss/lm": 2.561008930206299, "step": 7 }, { "epoch": 0.001642373229316362, "grad_norm": 101.61402559351386, "kd_ratio": 0.5, "learning_rate": 1.08843537414966e-06, "loss": 6.048721790313721, "loss/kd": 9.427233695983887, "loss/lm": 2.670210123062134, "step": 8 }, { "epoch": 0.0018476698829809075, "grad_norm": 106.58032652322707, "kd_ratio": 0.5, "learning_rate": 1.2244897959183673e-06, "loss": 5.756770133972168, "loss/kd": 9.112539291381836, "loss/lm": 2.401001214981079, "step": 9 }, { "epoch": 0.0020529665366454527, "grad_norm": 120.03715586535003, "kd_ratio": 0.5, "learning_rate": 1.3605442176870751e-06, "loss": 5.74399471282959, "loss/kd": 8.675286293029785, "loss/lm": 2.8127033710479736, "step": 10 }, { "epoch": 0.0022582631903099977, "grad_norm": 108.14737580535729, "kd_ratio": 0.5, "learning_rate": 1.4965986394557825e-06, "loss": 5.447847843170166, "loss/kd": 8.356966972351074, "loss/lm": 2.538728713989258, "step": 11 }, { "epoch": 0.002463559843974543, "grad_norm": 98.40141591689208, "kd_ratio": 0.5, "learning_rate": 1.6326530612244897e-06, "loss": 5.534481525421143, "loss/kd": 8.52441120147705, "loss/lm": 2.5445520877838135, "step": 12 }, { "epoch": 0.0026688564976390886, "grad_norm": 155.07795063820828, "kd_ratio": 0.5, "learning_rate": 1.7687074829931975e-06, "loss": 4.390925407409668, "loss/kd": 6.631160259246826, "loss/lm": 2.1506903171539307, "step": 13 }, { "epoch": 0.0028741531513036336, "grad_norm": 95.29321789570794, "kd_ratio": 0.5, "learning_rate": 1.904761904761905e-06, "loss": 4.105798244476318, "loss/kd": 5.962360382080078, "loss/lm": 2.2492358684539795, "step": 14 }, { "epoch": 0.003079449804968179, "grad_norm": 77.94368882443325, "kd_ratio": 0.5, "learning_rate": 2.0408163265306125e-06, "loss": 4.1667070388793945, "loss/kd": 6.013872146606445, "loss/lm": 2.3195419311523438, "step": 15 }, { "epoch": 0.003284746458632724, "grad_norm": 78.39917784525726, "kd_ratio": 0.5, "learning_rate": 2.17687074829932e-06, "loss": 4.421575546264648, "loss/kd": 6.543628215789795, "loss/lm": 2.299523115158081, "step": 16 }, { "epoch": 0.0034900431122972695, "grad_norm": 35.48812387625263, "kd_ratio": 0.5, "learning_rate": 2.3129251700680273e-06, "loss": 3.126570224761963, "loss/kd": 4.228987216949463, "loss/lm": 2.024153232574463, "step": 17 }, { "epoch": 0.003695339765961815, "grad_norm": 28.517825056854026, "kd_ratio": 0.5, "learning_rate": 2.4489795918367347e-06, "loss": 2.8652663230895996, "loss/kd": 3.699425220489502, "loss/lm": 2.0311074256896973, "step": 18 }, { "epoch": 0.00390063641962636, "grad_norm": 26.07119822059023, "kd_ratio": 0.5, "learning_rate": 2.5850340136054425e-06, "loss": 3.4145774841308594, "loss/kd": 4.714385509490967, "loss/lm": 2.114769220352173, "step": 19 }, { "epoch": 0.0041059330732909054, "grad_norm": 94.86650664371027, "kd_ratio": 0.5, "learning_rate": 2.7210884353741503e-06, "loss": 2.954554319381714, "loss/kd": 3.843651533126831, "loss/lm": 2.0654571056365967, "step": 20 }, { "epoch": 0.0043112297269554505, "grad_norm": 21.166914330078015, "kd_ratio": 0.5, "learning_rate": 2.8571428571428573e-06, "loss": 3.1916770935058594, "loss/kd": 4.382627010345459, "loss/lm": 2.0007269382476807, "step": 21 }, { "epoch": 0.0045165263806199955, "grad_norm": 20.998859486742056, "kd_ratio": 0.5, "learning_rate": 2.993197278911565e-06, "loss": 2.7634119987487793, "loss/kd": 3.510927677154541, "loss/lm": 2.0158963203430176, "step": 22 }, { "epoch": 0.004721823034284541, "grad_norm": 14.985882910972089, "kd_ratio": 0.5, "learning_rate": 3.1292517006802725e-06, "loss": 2.7154757976531982, "loss/kd": 3.5082690715789795, "loss/lm": 1.9226824045181274, "step": 23 }, { "epoch": 0.004927119687949086, "grad_norm": 22.608086775536044, "kd_ratio": 0.5, "learning_rate": 3.2653061224489794e-06, "loss": 2.929765224456787, "loss/kd": 4.039677143096924, "loss/lm": 1.8198530673980713, "step": 24 }, { "epoch": 0.005132416341613631, "grad_norm": 21.133001871849938, "kd_ratio": 0.5, "learning_rate": 3.4013605442176872e-06, "loss": 2.6377410888671875, "loss/kd": 3.5890252590179443, "loss/lm": 1.6864566802978516, "step": 25 }, { "epoch": 0.005337712995278177, "grad_norm": 16.068958202337974, "kd_ratio": 0.5, "learning_rate": 3.537414965986395e-06, "loss": 2.6654551029205322, "loss/kd": 3.6679294109344482, "loss/lm": 1.6629809141159058, "step": 26 }, { "epoch": 0.005543009648942722, "grad_norm": 13.502640799009756, "kd_ratio": 0.5, "learning_rate": 3.6734693877551024e-06, "loss": 3.0846428871154785, "loss/kd": 4.36391544342041, "loss/lm": 1.805370569229126, "step": 27 }, { "epoch": 0.005748306302607267, "grad_norm": 10.438187543315948, "kd_ratio": 0.5, "learning_rate": 3.80952380952381e-06, "loss": 2.9357733726501465, "loss/kd": 4.035686492919922, "loss/lm": 1.8358601331710815, "step": 28 }, { "epoch": 0.005953602956271813, "grad_norm": 17.293964887677568, "kd_ratio": 0.5, "learning_rate": 3.945578231292517e-06, "loss": 2.7410643100738525, "loss/kd": 3.8442158699035645, "loss/lm": 1.637912631034851, "step": 29 }, { "epoch": 0.006158899609936358, "grad_norm": 9.382214622094228, "kd_ratio": 0.5, "learning_rate": 4.081632653061225e-06, "loss": 2.510615348815918, "loss/kd": 3.3608391284942627, "loss/lm": 1.6603914499282837, "step": 30 }, { "epoch": 0.006364196263600903, "grad_norm": 12.047980906413274, "kd_ratio": 0.5, "learning_rate": 4.217687074829933e-06, "loss": 2.679048776626587, "loss/kd": 3.671302556991577, "loss/lm": 1.6867948770523071, "step": 31 }, { "epoch": 0.006569492917265448, "grad_norm": 10.956535113989949, "kd_ratio": 0.5, "learning_rate": 4.35374149659864e-06, "loss": 2.4586057662963867, "loss/kd": 3.2859578132629395, "loss/lm": 1.631253719329834, "step": 32 }, { "epoch": 0.006774789570929994, "grad_norm": 13.875495950320282, "kd_ratio": 0.5, "learning_rate": 4.489795918367348e-06, "loss": 2.7146804332733154, "loss/kd": 3.7657430171966553, "loss/lm": 1.6636178493499756, "step": 33 }, { "epoch": 0.006980086224594539, "grad_norm": 12.758813525265525, "kd_ratio": 0.5, "learning_rate": 4.6258503401360546e-06, "loss": 2.7240958213806152, "loss/kd": 3.926669120788574, "loss/lm": 1.5215225219726562, "step": 34 }, { "epoch": 0.007185382878259084, "grad_norm": 16.77764515446079, "kd_ratio": 0.5, "learning_rate": 4.761904761904762e-06, "loss": 2.61466646194458, "loss/kd": 3.5953588485717773, "loss/lm": 1.6339741945266724, "step": 35 }, { "epoch": 0.00739067953192363, "grad_norm": 10.526746158534658, "kd_ratio": 0.5, "learning_rate": 4.897959183673469e-06, "loss": 2.719836711883545, "loss/kd": 3.8498594760894775, "loss/lm": 1.5898138284683228, "step": 36 }, { "epoch": 0.007595976185588175, "grad_norm": 9.067265107292695, "kd_ratio": 0.5, "learning_rate": 5.034013605442177e-06, "loss": 2.609747886657715, "loss/kd": 3.7301406860351562, "loss/lm": 1.4893550872802734, "step": 37 }, { "epoch": 0.00780127283925272, "grad_norm": 7.013775381529852, "kd_ratio": 0.5, "learning_rate": 5.170068027210885e-06, "loss": 2.328523874282837, "loss/kd": 3.074021100997925, "loss/lm": 1.5830267667770386, "step": 38 }, { "epoch": 0.008006569492917266, "grad_norm": 5.194678673056856, "kd_ratio": 0.5, "learning_rate": 5.306122448979593e-06, "loss": 2.496345281600952, "loss/kd": 3.458807945251465, "loss/lm": 1.5338826179504395, "step": 39 }, { "epoch": 0.008211866146581811, "grad_norm": 6.595216133764594, "kd_ratio": 0.5, "learning_rate": 5.442176870748301e-06, "loss": 2.453526735305786, "loss/kd": 3.4567415714263916, "loss/lm": 1.4503117799758911, "step": 40 }, { "epoch": 0.008417162800246356, "grad_norm": 7.184700574324739, "kd_ratio": 0.5, "learning_rate": 5.578231292517007e-06, "loss": 2.454983711242676, "loss/kd": 3.3159501552581787, "loss/lm": 1.594017505645752, "step": 41 }, { "epoch": 0.008622459453910901, "grad_norm": 7.32171654844896, "kd_ratio": 0.5, "learning_rate": 5.7142857142857145e-06, "loss": 2.430558681488037, "loss/kd": 3.358459234237671, "loss/lm": 1.5026578903198242, "step": 42 }, { "epoch": 0.008827756107575446, "grad_norm": 5.513445557390148, "kd_ratio": 0.5, "learning_rate": 5.850340136054422e-06, "loss": 2.2783164978027344, "loss/kd": 3.171860933303833, "loss/lm": 1.3847721815109253, "step": 43 }, { "epoch": 0.009033052761239991, "grad_norm": 4.216288178643194, "kd_ratio": 0.5, "learning_rate": 5.98639455782313e-06, "loss": 2.366410493850708, "loss/kd": 3.395423173904419, "loss/lm": 1.337397813796997, "step": 44 }, { "epoch": 0.009238349414904538, "grad_norm": 10.213706666144116, "kd_ratio": 0.5, "learning_rate": 6.122448979591837e-06, "loss": 2.215071678161621, "loss/kd": 3.073274612426758, "loss/lm": 1.356868863105774, "step": 45 }, { "epoch": 0.009443646068569083, "grad_norm": 2.940261291663705, "kd_ratio": 0.5, "learning_rate": 6.258503401360545e-06, "loss": 2.529839515686035, "loss/kd": 3.7125680446624756, "loss/lm": 1.3471111059188843, "step": 46 }, { "epoch": 0.009648942722233628, "grad_norm": 4.695768416049388, "kd_ratio": 0.5, "learning_rate": 6.394557823129253e-06, "loss": 2.2241621017456055, "loss/kd": 3.011923313140869, "loss/lm": 1.4364007711410522, "step": 47 }, { "epoch": 0.009854239375898173, "grad_norm": 4.3569827212785475, "kd_ratio": 0.5, "learning_rate": 6.530612244897959e-06, "loss": 2.046186923980713, "loss/kd": 2.8041110038757324, "loss/lm": 1.288262963294983, "step": 48 }, { "epoch": 0.010059536029562718, "grad_norm": 3.6339204258111923, "kd_ratio": 0.5, "learning_rate": 6.666666666666667e-06, "loss": 2.256417751312256, "loss/kd": 3.1312317848205566, "loss/lm": 1.3816039562225342, "step": 49 }, { "epoch": 0.010264832683227263, "grad_norm": 4.09936516906029, "kd_ratio": 0.5, "learning_rate": 6.8027210884353745e-06, "loss": 2.8349123001098633, "loss/kd": 4.591951847076416, "loss/lm": 1.0778727531433105, "step": 50 }, { "epoch": 0.01047012933689181, "grad_norm": 3.4158946989737915, "kd_ratio": 0.5, "learning_rate": 6.938775510204082e-06, "loss": 2.2342681884765625, "loss/kd": 3.0598738193511963, "loss/lm": 1.4086623191833496, "step": 51 }, { "epoch": 0.010675425990556354, "grad_norm": 3.3420145499558327, "kd_ratio": 0.5, "learning_rate": 7.07482993197279e-06, "loss": 2.345767021179199, "loss/kd": 3.296490430831909, "loss/lm": 1.3950438499450684, "step": 52 }, { "epoch": 0.0108807226442209, "grad_norm": 2.6792222670956494, "kd_ratio": 0.5, "learning_rate": 7.210884353741497e-06, "loss": 2.6368937492370605, "loss/kd": 3.9868428707122803, "loss/lm": 1.28694486618042, "step": 53 }, { "epoch": 0.011086019297885445, "grad_norm": 4.110918792364995, "kd_ratio": 0.5, "learning_rate": 7.346938775510205e-06, "loss": 2.144416093826294, "loss/kd": 2.98728084564209, "loss/lm": 1.3015512228012085, "step": 54 }, { "epoch": 0.01129131595154999, "grad_norm": 2.7495490773436426, "kd_ratio": 0.5, "learning_rate": 7.482993197278913e-06, "loss": 2.0472910404205322, "loss/kd": 2.9248690605163574, "loss/lm": 1.169713020324707, "step": 55 }, { "epoch": 0.011496612605214535, "grad_norm": 2.5349896235918092, "kd_ratio": 0.5, "learning_rate": 7.61904761904762e-06, "loss": 2.244516372680664, "loss/kd": 3.262197971343994, "loss/lm": 1.2268348932266235, "step": 56 }, { "epoch": 0.01170190925887908, "grad_norm": 3.5051142124740817, "kd_ratio": 0.5, "learning_rate": 7.755102040816327e-06, "loss": 2.28420090675354, "loss/kd": 3.3878753185272217, "loss/lm": 1.1805263757705688, "step": 57 }, { "epoch": 0.011907205912543626, "grad_norm": 2.813565879053469, "kd_ratio": 0.5, "learning_rate": 7.891156462585034e-06, "loss": 2.2862329483032227, "loss/kd": 3.2333874702453613, "loss/lm": 1.339078426361084, "step": 58 }, { "epoch": 0.012112502566208171, "grad_norm": 2.09504579551497, "kd_ratio": 0.5, "learning_rate": 8.027210884353741e-06, "loss": 1.9965089559555054, "loss/kd": 2.6793372631073, "loss/lm": 1.313680648803711, "step": 59 }, { "epoch": 0.012317799219872716, "grad_norm": 2.3588332639831067, "kd_ratio": 0.5, "learning_rate": 8.16326530612245e-06, "loss": 2.1130211353302, "loss/kd": 2.9763453006744385, "loss/lm": 1.2496970891952515, "step": 60 }, { "epoch": 0.012523095873537261, "grad_norm": 2.7241591693385634, "kd_ratio": 0.5, "learning_rate": 8.299319727891157e-06, "loss": 2.3627171516418457, "loss/kd": 3.5626771450042725, "loss/lm": 1.162757158279419, "step": 61 }, { "epoch": 0.012728392527201806, "grad_norm": 2.3500808129193236, "kd_ratio": 0.5, "learning_rate": 8.435374149659866e-06, "loss": 2.0505175590515137, "loss/kd": 2.9345178604125977, "loss/lm": 1.1665172576904297, "step": 62 }, { "epoch": 0.012933689180866351, "grad_norm": 11.909620917585608, "kd_ratio": 0.5, "learning_rate": 8.571428571428571e-06, "loss": 2.1856977939605713, "loss/kd": 3.1379852294921875, "loss/lm": 1.2334104776382446, "step": 63 }, { "epoch": 0.013138985834530896, "grad_norm": 2.5852471332666003, "kd_ratio": 0.5, "learning_rate": 8.70748299319728e-06, "loss": 2.277280330657959, "loss/kd": 3.384164810180664, "loss/lm": 1.1703957319259644, "step": 64 }, { "epoch": 0.013344282488195443, "grad_norm": 2.4004583783530093, "kd_ratio": 0.5, "learning_rate": 8.843537414965987e-06, "loss": 2.362316131591797, "loss/kd": 3.46648907661438, "loss/lm": 1.258143424987793, "step": 65 }, { "epoch": 0.013549579141859988, "grad_norm": 2.366816357575599, "kd_ratio": 0.5, "learning_rate": 8.979591836734695e-06, "loss": 2.30002498626709, "loss/kd": 3.3750925064086914, "loss/lm": 1.2249574661254883, "step": 66 }, { "epoch": 0.013754875795524533, "grad_norm": 2.1994679005476776, "kd_ratio": 0.5, "learning_rate": 9.115646258503402e-06, "loss": 2.0496037006378174, "loss/kd": 2.9994940757751465, "loss/lm": 1.0997133255004883, "step": 67 }, { "epoch": 0.013960172449189078, "grad_norm": 2.1041749184591305, "kd_ratio": 0.5, "learning_rate": 9.251700680272109e-06, "loss": 2.1625051498413086, "loss/kd": 3.1448302268981934, "loss/lm": 1.1801798343658447, "step": 68 }, { "epoch": 0.014165469102853623, "grad_norm": 18.711817413991636, "kd_ratio": 0.5, "learning_rate": 9.387755102040818e-06, "loss": 2.2342684268951416, "loss/kd": 3.192456007003784, "loss/lm": 1.2760807275772095, "step": 69 }, { "epoch": 0.014370765756518168, "grad_norm": 2.5698037683470205, "kd_ratio": 0.5, "learning_rate": 9.523809523809525e-06, "loss": 2.2212071418762207, "loss/kd": 3.1838901042938232, "loss/lm": 1.2585244178771973, "step": 70 }, { "epoch": 0.014576062410182713, "grad_norm": 1.7747237416079027, "kd_ratio": 0.5, "learning_rate": 9.659863945578232e-06, "loss": 2.3044135570526123, "loss/kd": 3.51994252204895, "loss/lm": 1.0888845920562744, "step": 71 }, { "epoch": 0.01478135906384726, "grad_norm": 2.0799226642384134, "kd_ratio": 0.5, "learning_rate": 9.795918367346939e-06, "loss": 2.259995222091675, "loss/kd": 3.3029093742370605, "loss/lm": 1.2170809507369995, "step": 72 }, { "epoch": 0.014986655717511805, "grad_norm": 2.379502073472001, "kd_ratio": 0.5, "learning_rate": 9.931972789115647e-06, "loss": 2.410987615585327, "loss/kd": 3.5721113681793213, "loss/lm": 1.2498639822006226, "step": 73 }, { "epoch": 0.01519195237117635, "grad_norm": 3.7801792130691476, "kd_ratio": 0.5, "learning_rate": 1.0068027210884354e-05, "loss": 2.1773719787597656, "loss/kd": 3.265259027481079, "loss/lm": 1.0894851684570312, "step": 74 }, { "epoch": 0.015397249024840895, "grad_norm": 3.3326636772928206, "kd_ratio": 0.5, "learning_rate": 1.0204081632653063e-05, "loss": 2.0098001956939697, "loss/kd": 2.9902756214141846, "loss/lm": 1.0293247699737549, "step": 75 }, { "epoch": 0.01560254567850544, "grad_norm": 2.586390093853056, "kd_ratio": 0.5, "learning_rate": 1.034013605442177e-05, "loss": 2.071682929992676, "loss/kd": 2.9823269844055176, "loss/lm": 1.1610387563705444, "step": 76 }, { "epoch": 0.015807842332169985, "grad_norm": 2.6705784936550656, "kd_ratio": 0.5, "learning_rate": 1.0476190476190477e-05, "loss": 2.3282251358032227, "loss/kd": 3.6129798889160156, "loss/lm": 1.0434702634811401, "step": 77 }, { "epoch": 0.016013138985834532, "grad_norm": 1.5904909792493853, "kd_ratio": 0.5, "learning_rate": 1.0612244897959186e-05, "loss": 1.9747506380081177, "loss/kd": 2.9158272743225098, "loss/lm": 1.0336740016937256, "step": 78 }, { "epoch": 0.016218435639499075, "grad_norm": 4.378083085146, "kd_ratio": 0.5, "learning_rate": 1.0748299319727893e-05, "loss": 2.2640490531921387, "loss/kd": 3.3414037227630615, "loss/lm": 1.1866945028305054, "step": 79 }, { "epoch": 0.016423732293163622, "grad_norm": 1.7236947658270945, "kd_ratio": 0.5, "learning_rate": 1.0884353741496601e-05, "loss": 2.00189471244812, "loss/kd": 2.959028720855713, "loss/lm": 1.0447607040405273, "step": 80 }, { "epoch": 0.016629028946828165, "grad_norm": 1.9385865157018256, "kd_ratio": 0.5, "learning_rate": 1.1020408163265306e-05, "loss": 1.938751220703125, "loss/kd": 2.8065176010131836, "loss/lm": 1.0709847211837769, "step": 81 }, { "epoch": 0.016834325600492712, "grad_norm": 2.801664760186557, "kd_ratio": 0.5, "learning_rate": 1.1156462585034013e-05, "loss": 2.0907225608825684, "loss/kd": 3.0308609008789062, "loss/lm": 1.1505842208862305, "step": 82 }, { "epoch": 0.01703962225415726, "grad_norm": 2.5402272164922817, "kd_ratio": 0.5, "learning_rate": 1.1292517006802722e-05, "loss": 2.068282127380371, "loss/kd": 3.1496176719665527, "loss/lm": 0.9869464635848999, "step": 83 }, { "epoch": 0.017244918907821802, "grad_norm": 3.6885296418195903, "kd_ratio": 0.5, "learning_rate": 1.1428571428571429e-05, "loss": 2.04002046585083, "loss/kd": 2.8571789264678955, "loss/lm": 1.2228622436523438, "step": 84 }, { "epoch": 0.01745021556148635, "grad_norm": 2.073931391357448, "kd_ratio": 0.5, "learning_rate": 1.1564625850340136e-05, "loss": 2.0939974784851074, "loss/kd": 3.103199005126953, "loss/lm": 1.0847959518432617, "step": 85 }, { "epoch": 0.017655512215150892, "grad_norm": 2.4470542972940157, "kd_ratio": 0.5, "learning_rate": 1.1700680272108845e-05, "loss": 2.082110643386841, "loss/kd": 3.0362508296966553, "loss/lm": 1.127970576286316, "step": 86 }, { "epoch": 0.01786080886881544, "grad_norm": 4.344143056220164, "kd_ratio": 0.5, "learning_rate": 1.1836734693877552e-05, "loss": 1.9167475700378418, "loss/kd": 2.7454586029052734, "loss/lm": 1.0880365371704102, "step": 87 }, { "epoch": 0.018066105522479982, "grad_norm": 4.7099712655728645, "kd_ratio": 0.5, "learning_rate": 1.197278911564626e-05, "loss": 2.1820459365844727, "loss/kd": 3.308696985244751, "loss/lm": 1.0553947687149048, "step": 88 }, { "epoch": 0.01827140217614453, "grad_norm": 4.157686865484906, "kd_ratio": 0.5, "learning_rate": 1.2108843537414967e-05, "loss": 1.9100685119628906, "loss/kd": 2.8081672191619873, "loss/lm": 1.0119699239730835, "step": 89 }, { "epoch": 0.018476698829809075, "grad_norm": 2.129912110564647, "kd_ratio": 0.5, "learning_rate": 1.2244897959183674e-05, "loss": 2.7021193504333496, "loss/kd": 4.602091312408447, "loss/lm": 0.8021475076675415, "step": 90 }, { "epoch": 0.01868199548347362, "grad_norm": 2.5170132111320056, "kd_ratio": 0.5, "learning_rate": 1.2380952380952383e-05, "loss": 1.9618372917175293, "loss/kd": 2.865227699279785, "loss/lm": 1.0584468841552734, "step": 91 }, { "epoch": 0.018887292137138165, "grad_norm": 3.491491825619453, "kd_ratio": 0.5, "learning_rate": 1.251700680272109e-05, "loss": 2.064626455307007, "loss/kd": 2.9413509368896484, "loss/lm": 1.1879019737243652, "step": 92 }, { "epoch": 0.01909258879080271, "grad_norm": 3.080034637837237, "kd_ratio": 0.5, "learning_rate": 1.2653061224489798e-05, "loss": 2.118464708328247, "loss/kd": 3.163142204284668, "loss/lm": 1.0737870931625366, "step": 93 }, { "epoch": 0.019297885444467255, "grad_norm": 2.0910242680492694, "kd_ratio": 0.5, "learning_rate": 1.2789115646258505e-05, "loss": 1.9853051900863647, "loss/kd": 2.862565040588379, "loss/lm": 1.1080453395843506, "step": 94 }, { "epoch": 0.019503182098131802, "grad_norm": 1.8027613245011782, "kd_ratio": 0.5, "learning_rate": 1.2925170068027212e-05, "loss": 2.140313148498535, "loss/kd": 3.1635730266571045, "loss/lm": 1.1170530319213867, "step": 95 }, { "epoch": 0.019708478751796345, "grad_norm": 1.9298894542852927, "kd_ratio": 0.5, "learning_rate": 1.3061224489795918e-05, "loss": 2.092926502227783, "loss/kd": 3.196476697921753, "loss/lm": 0.9893763661384583, "step": 96 }, { "epoch": 0.019913775405460892, "grad_norm": 2.0414872871876013, "kd_ratio": 0.5, "learning_rate": 1.3197278911564626e-05, "loss": 2.188217878341675, "loss/kd": 3.3857650756835938, "loss/lm": 0.9906708002090454, "step": 97 }, { "epoch": 0.020119072059125435, "grad_norm": 2.9348603492763705, "kd_ratio": 0.5, "learning_rate": 1.3333333333333333e-05, "loss": 2.145463466644287, "loss/kd": 3.277559757232666, "loss/lm": 1.0133670568466187, "step": 98 }, { "epoch": 0.020324368712789982, "grad_norm": 2.3021262562590437, "kd_ratio": 0.5, "learning_rate": 1.3469387755102042e-05, "loss": 1.7963128089904785, "loss/kd": 2.429844617843628, "loss/lm": 1.1627811193466187, "step": 99 }, { "epoch": 0.020529665366454525, "grad_norm": 2.1156485916003653, "kd_ratio": 0.5, "learning_rate": 1.3605442176870749e-05, "loss": 1.7762656211853027, "loss/kd": 2.576434850692749, "loss/lm": 0.9760962724685669, "step": 100 }, { "epoch": 0.020734962020119072, "grad_norm": 3.9186333112845206, "kd_ratio": 0.5, "learning_rate": 1.3741496598639456e-05, "loss": 2.109128713607788, "loss/kd": 3.3096694946289062, "loss/lm": 0.9085878729820251, "step": 101 }, { "epoch": 0.02094025867378362, "grad_norm": 4.000531615767079, "kd_ratio": 0.5, "learning_rate": 1.3877551020408165e-05, "loss": 1.8411298990249634, "loss/kd": 2.680809736251831, "loss/lm": 1.0014500617980957, "step": 102 }, { "epoch": 0.021145555327448162, "grad_norm": 6.6174176381188525, "kd_ratio": 0.5, "learning_rate": 1.4013605442176872e-05, "loss": 2.025067090988159, "loss/kd": 3.0250327587127686, "loss/lm": 1.0251015424728394, "step": 103 }, { "epoch": 0.02135085198111271, "grad_norm": 2.3173036221337884, "kd_ratio": 0.5, "learning_rate": 1.414965986394558e-05, "loss": 2.199903726577759, "loss/kd": 3.359792709350586, "loss/lm": 1.0400148630142212, "step": 104 }, { "epoch": 0.021556148634777252, "grad_norm": 2.7418321640623122, "kd_ratio": 0.5, "learning_rate": 1.4285714285714287e-05, "loss": 1.7156376838684082, "loss/kd": 2.4675276279449463, "loss/lm": 0.9637476801872253, "step": 105 }, { "epoch": 0.0217614452884418, "grad_norm": 2.9518971009532944, "kd_ratio": 0.5, "learning_rate": 1.4421768707482994e-05, "loss": 2.086059331893921, "loss/kd": 3.1781742572784424, "loss/lm": 0.9939444065093994, "step": 106 }, { "epoch": 0.021966741942106342, "grad_norm": 2.3970468706793175, "kd_ratio": 0.5, "learning_rate": 1.4557823129251703e-05, "loss": 2.121190309524536, "loss/kd": 3.1146907806396484, "loss/lm": 1.1276897192001343, "step": 107 }, { "epoch": 0.02217203859577089, "grad_norm": 4.64685363358938, "kd_ratio": 0.5, "learning_rate": 1.469387755102041e-05, "loss": 1.88392174243927, "loss/kd": 2.7123310565948486, "loss/lm": 1.0555124282836914, "step": 108 }, { "epoch": 0.022377335249435436, "grad_norm": 5.706587557694418, "kd_ratio": 0.5, "learning_rate": 1.4829931972789118e-05, "loss": 2.2287585735321045, "loss/kd": 3.4100263118743896, "loss/lm": 1.0474909543991089, "step": 109 }, { "epoch": 0.02258263190309998, "grad_norm": 3.854076676112795, "kd_ratio": 0.5, "learning_rate": 1.4965986394557825e-05, "loss": 2.007672071456909, "loss/kd": 3.039680004119873, "loss/lm": 0.9756640195846558, "step": 110 }, { "epoch": 0.022787928556764526, "grad_norm": 4.738403559758473, "kd_ratio": 0.5, "learning_rate": 1.510204081632653e-05, "loss": 2.1069388389587402, "loss/kd": 3.278177261352539, "loss/lm": 0.9357002973556519, "step": 111 }, { "epoch": 0.02299322521042907, "grad_norm": 6.937714313366444, "kd_ratio": 0.5, "learning_rate": 1.523809523809524e-05, "loss": 1.7146366834640503, "loss/kd": 2.496892213821411, "loss/lm": 0.9323811531066895, "step": 112 }, { "epoch": 0.023198521864093616, "grad_norm": 3.1478139655496182, "kd_ratio": 0.5, "learning_rate": 1.5374149659863945e-05, "loss": 1.7947659492492676, "loss/kd": 2.715986967086792, "loss/lm": 0.8735448122024536, "step": 113 }, { "epoch": 0.02340381851775816, "grad_norm": 8.915113920110203, "kd_ratio": 0.5, "learning_rate": 1.5510204081632655e-05, "loss": 2.632624864578247, "loss/kd": 4.500158786773682, "loss/lm": 0.765090823173523, "step": 114 }, { "epoch": 0.023609115171422706, "grad_norm": 9.692065346156117, "kd_ratio": 0.5, "learning_rate": 1.5646258503401362e-05, "loss": 2.184978723526001, "loss/kd": 3.320091962814331, "loss/lm": 1.0498656034469604, "step": 115 }, { "epoch": 0.023814411825087253, "grad_norm": 3.713875051703335, "kd_ratio": 0.5, "learning_rate": 1.578231292517007e-05, "loss": 1.9924310445785522, "loss/kd": 3.061701536178589, "loss/lm": 0.9231606125831604, "step": 116 }, { "epoch": 0.024019708478751796, "grad_norm": 11.265176918188327, "kd_ratio": 0.5, "learning_rate": 1.5918367346938776e-05, "loss": 2.0636372566223145, "loss/kd": 3.22147274017334, "loss/lm": 0.9058015942573547, "step": 117 }, { "epoch": 0.024225005132416343, "grad_norm": 2.525424655382895, "kd_ratio": 0.5, "learning_rate": 1.6054421768707483e-05, "loss": 1.8581167459487915, "loss/kd": 2.7636427879333496, "loss/lm": 0.9525906443595886, "step": 118 }, { "epoch": 0.024430301786080886, "grad_norm": 7.542089818383863, "kd_ratio": 0.5, "learning_rate": 1.6190476190476193e-05, "loss": 2.0825881958007812, "loss/kd": 3.1827824115753174, "loss/lm": 0.982393741607666, "step": 119 }, { "epoch": 0.024635598439745433, "grad_norm": 7.116270786956529, "kd_ratio": 0.5, "learning_rate": 1.63265306122449e-05, "loss": 2.245659112930298, "loss/kd": 3.460615396499634, "loss/lm": 1.030702829360962, "step": 120 }, { "epoch": 0.024840895093409976, "grad_norm": 5.16855689477879, "kd_ratio": 0.5, "learning_rate": 1.6462585034013607e-05, "loss": 1.988242745399475, "loss/kd": 2.966012477874756, "loss/lm": 1.0104730129241943, "step": 121 }, { "epoch": 0.025046191747074523, "grad_norm": 10.840971480451664, "kd_ratio": 0.5, "learning_rate": 1.6598639455782314e-05, "loss": 1.928604006767273, "loss/kd": 2.955533266067505, "loss/lm": 0.9016748070716858, "step": 122 }, { "epoch": 0.02525148840073907, "grad_norm": 2.7878034082855185, "kd_ratio": 0.5, "learning_rate": 1.673469387755102e-05, "loss": 1.8366976976394653, "loss/kd": 2.6859920024871826, "loss/lm": 0.9874033331871033, "step": 123 }, { "epoch": 0.025456785054403613, "grad_norm": 7.5754206697632025, "kd_ratio": 0.5, "learning_rate": 1.687074829931973e-05, "loss": 2.0839624404907227, "loss/kd": 3.188669204711914, "loss/lm": 0.9792555570602417, "step": 124 }, { "epoch": 0.02566208170806816, "grad_norm": 4.20998213538023, "kd_ratio": 0.5, "learning_rate": 1.7006802721088435e-05, "loss": 2.3131628036499023, "loss/kd": 3.592339515686035, "loss/lm": 1.0339863300323486, "step": 125 }, { "epoch": 0.025867378361732703, "grad_norm": 6.11471736856931, "kd_ratio": 0.5, "learning_rate": 1.7142857142857142e-05, "loss": 1.9673759937286377, "loss/kd": 2.937596321105957, "loss/lm": 0.9971557259559631, "step": 126 }, { "epoch": 0.02607267501539725, "grad_norm": 7.060627482004923, "kd_ratio": 0.5, "learning_rate": 1.7278911564625852e-05, "loss": 1.7398861646652222, "loss/kd": 2.6160712242126465, "loss/lm": 0.8637011051177979, "step": 127 }, { "epoch": 0.026277971669061793, "grad_norm": 2.585784257517249, "kd_ratio": 0.5, "learning_rate": 1.741496598639456e-05, "loss": 1.8566721677780151, "loss/kd": 2.88348388671875, "loss/lm": 0.829860508441925, "step": 128 }, { "epoch": 0.02648326832272634, "grad_norm": 8.046021602173257, "kd_ratio": 0.5, "learning_rate": 1.7551020408163266e-05, "loss": 2.0635900497436523, "loss/kd": 3.2601025104522705, "loss/lm": 0.8670778274536133, "step": 129 }, { "epoch": 0.026688564976390886, "grad_norm": 3.3250023338106445, "kd_ratio": 0.5, "learning_rate": 1.7687074829931973e-05, "loss": 1.8421529531478882, "loss/kd": 2.703843355178833, "loss/lm": 0.9804626107215881, "step": 130 }, { "epoch": 0.02689386163005543, "grad_norm": 6.224818046992526, "kd_ratio": 0.5, "learning_rate": 1.782312925170068e-05, "loss": 1.9178297519683838, "loss/kd": 2.968468189239502, "loss/lm": 0.8671911954879761, "step": 131 }, { "epoch": 0.027099158283719976, "grad_norm": 6.933630439455684, "kd_ratio": 0.5, "learning_rate": 1.795918367346939e-05, "loss": 2.0675578117370605, "loss/kd": 3.1933701038360596, "loss/lm": 0.9417455196380615, "step": 132 }, { "epoch": 0.02730445493738452, "grad_norm": 3.901864507530629, "kd_ratio": 0.5, "learning_rate": 1.8095238095238097e-05, "loss": 1.980143427848816, "loss/kd": 3.059619188308716, "loss/lm": 0.900667667388916, "step": 133 }, { "epoch": 0.027509751591049066, "grad_norm": 7.45340344120725, "kd_ratio": 0.5, "learning_rate": 1.8231292517006804e-05, "loss": 1.8193912506103516, "loss/kd": 2.6796486377716064, "loss/lm": 0.9591337442398071, "step": 134 }, { "epoch": 0.02771504824471361, "grad_norm": 1.4946986881905988, "kd_ratio": 0.5, "learning_rate": 1.836734693877551e-05, "loss": 1.7332465648651123, "loss/kd": 2.5311439037323, "loss/lm": 0.9353492259979248, "step": 135 }, { "epoch": 0.027920344898378156, "grad_norm": 4.423083822129225, "kd_ratio": 0.5, "learning_rate": 1.8503401360544218e-05, "loss": 1.8017314672470093, "loss/kd": 2.712143659591675, "loss/lm": 0.8913192749023438, "step": 136 }, { "epoch": 0.028125641552042703, "grad_norm": 4.085195598670423, "kd_ratio": 0.5, "learning_rate": 1.863945578231293e-05, "loss": 1.766051173210144, "loss/kd": 2.6476144790649414, "loss/lm": 0.8844878077507019, "step": 137 }, { "epoch": 0.028330938205707246, "grad_norm": 1.9729655144948541, "kd_ratio": 0.5, "learning_rate": 1.8775510204081636e-05, "loss": 1.9561278820037842, "loss/kd": 3.0398685932159424, "loss/lm": 0.872387170791626, "step": 138 }, { "epoch": 0.028536234859371793, "grad_norm": 4.815735041785913, "kd_ratio": 0.5, "learning_rate": 1.8911564625850343e-05, "loss": 2.1332168579101562, "loss/kd": 3.326634407043457, "loss/lm": 0.9397995471954346, "step": 139 }, { "epoch": 0.028741531513036336, "grad_norm": 2.4501598847850445, "kd_ratio": 0.5, "learning_rate": 1.904761904761905e-05, "loss": 2.067904233932495, "loss/kd": 3.1604905128479004, "loss/lm": 0.9753178954124451, "step": 140 }, { "epoch": 0.028946828166700883, "grad_norm": 4.372582800168516, "kd_ratio": 0.5, "learning_rate": 1.9183673469387756e-05, "loss": 1.8647043704986572, "loss/kd": 2.882758855819702, "loss/lm": 0.8466500043869019, "step": 141 }, { "epoch": 0.029152124820365426, "grad_norm": 4.208352720474093, "kd_ratio": 0.5, "learning_rate": 1.9319727891156463e-05, "loss": 1.9127813577651978, "loss/kd": 2.9469335079193115, "loss/lm": 0.878629207611084, "step": 142 }, { "epoch": 0.029357421474029973, "grad_norm": 3.2998518530218197, "kd_ratio": 0.5, "learning_rate": 1.945578231292517e-05, "loss": 1.7805297374725342, "loss/kd": 2.751681327819824, "loss/lm": 0.8093780279159546, "step": 143 }, { "epoch": 0.02956271812769452, "grad_norm": 3.8469187158476026, "kd_ratio": 0.5, "learning_rate": 1.9591836734693877e-05, "loss": 2.1517627239227295, "loss/kd": 3.3293633460998535, "loss/lm": 0.9741622805595398, "step": 144 }, { "epoch": 0.029768014781359063, "grad_norm": 1.87298297793878, "kd_ratio": 0.5, "learning_rate": 1.9727891156462588e-05, "loss": 2.039757013320923, "loss/kd": 3.0058951377868652, "loss/lm": 1.0736188888549805, "step": 145 }, { "epoch": 0.02997331143502361, "grad_norm": 2.7894354126295404, "kd_ratio": 0.5, "learning_rate": 1.9863945578231295e-05, "loss": 1.73590087890625, "loss/kd": 2.7099921703338623, "loss/lm": 0.7618095874786377, "step": 146 }, { "epoch": 0.030178608088688153, "grad_norm": 4.158222985955652, "kd_ratio": 0.5, "learning_rate": 2e-05, "loss": 1.7448409795761108, "loss/kd": 2.5498626232147217, "loss/lm": 0.9398193359375, "step": 147 }, { "epoch": 0.0303839047423527, "grad_norm": 1.6865821869525688, "kd_ratio": 0.5, "learning_rate": 1.9999997788688342e-05, "loss": 1.8605033159255981, "loss/kd": 2.8161399364471436, "loss/lm": 0.9048666954040527, "step": 148 }, { "epoch": 0.030589201396017243, "grad_norm": 5.56854737330322, "kd_ratio": 0.5, "learning_rate": 1.9999991154754343e-05, "loss": 1.8475157022476196, "loss/kd": 2.8157155513763428, "loss/lm": 0.8793159127235413, "step": 149 }, { "epoch": 0.03079449804968179, "grad_norm": 2.361464179728931, "kd_ratio": 0.5, "learning_rate": 1.9999980098200934e-05, "loss": 1.8201484680175781, "loss/kd": 2.7508459091186523, "loss/lm": 0.8894510865211487, "step": 150 }, { "epoch": 0.030999794703346337, "grad_norm": 4.457575602211877, "kd_ratio": 0.5, "learning_rate": 1.999996461903301e-05, "loss": 1.7819031476974487, "loss/kd": 2.7136313915252686, "loss/lm": 0.8501749038696289, "step": 151 }, { "epoch": 0.03120509135701088, "grad_norm": 5.35146269412281, "kd_ratio": 0.5, "learning_rate": 1.9999944717257408e-05, "loss": 1.7237871885299683, "loss/kd": 2.5336780548095703, "loss/lm": 0.913896381855011, "step": 152 }, { "epoch": 0.03141038801067542, "grad_norm": 4.118962025134679, "kd_ratio": 0.5, "learning_rate": 1.9999920392882944e-05, "loss": 1.8547526597976685, "loss/kd": 2.790715217590332, "loss/lm": 0.9187901020050049, "step": 153 }, { "epoch": 0.03161568466433997, "grad_norm": 5.001480481439521, "kd_ratio": 0.5, "learning_rate": 1.9999891645920363e-05, "loss": 1.7216424942016602, "loss/kd": 2.660634756088257, "loss/lm": 0.7826501131057739, "step": 154 }, { "epoch": 0.03182098131800452, "grad_norm": 1.9981239487610105, "kd_ratio": 0.5, "learning_rate": 1.9999858476382388e-05, "loss": 1.8819317817687988, "loss/kd": 2.8218441009521484, "loss/lm": 0.9420194625854492, "step": 155 }, { "epoch": 0.032026277971669063, "grad_norm": 5.798775685282853, "kd_ratio": 0.5, "learning_rate": 1.999982088428368e-05, "loss": 2.0501604080200195, "loss/kd": 3.1614913940429688, "loss/lm": 0.9388293623924255, "step": 156 }, { "epoch": 0.03223157462533361, "grad_norm": 2.6743512621015237, "kd_ratio": 0.5, "learning_rate": 1.999977886964087e-05, "loss": 2.226210832595825, "loss/kd": 3.633990526199341, "loss/lm": 0.8184309601783752, "step": 157 }, { "epoch": 0.03243687127899815, "grad_norm": 4.266205062592955, "kd_ratio": 0.5, "learning_rate": 1.9999732432472544e-05, "loss": 1.6909661293029785, "loss/kd": 2.522190570831299, "loss/lm": 0.8597418069839478, "step": 158 }, { "epoch": 0.0326421679326627, "grad_norm": 3.8005976442238856, "kd_ratio": 0.5, "learning_rate": 1.9999681572799226e-05, "loss": 1.837475061416626, "loss/kd": 2.675368070602417, "loss/lm": 0.9995819330215454, "step": 159 }, { "epoch": 0.032847464586327244, "grad_norm": 3.9972700439442996, "kd_ratio": 0.5, "learning_rate": 1.999962629064342e-05, "loss": 1.7429676055908203, "loss/kd": 2.6865079402923584, "loss/lm": 0.7994271516799927, "step": 160 }, { "epoch": 0.03305276123999179, "grad_norm": 4.989264327987154, "kd_ratio": 0.5, "learning_rate": 1.9999566586029575e-05, "loss": 1.7821226119995117, "loss/kd": 2.747391700744629, "loss/lm": 0.8168534636497498, "step": 161 }, { "epoch": 0.03325805789365633, "grad_norm": 3.5956211469157235, "kd_ratio": 0.5, "learning_rate": 1.999950245898409e-05, "loss": 1.7090071439743042, "loss/kd": 2.63417649269104, "loss/lm": 0.7838377356529236, "step": 162 }, { "epoch": 0.03346335454732088, "grad_norm": 2.694809624714514, "kd_ratio": 0.5, "learning_rate": 1.9999433909535333e-05, "loss": 1.822548747062683, "loss/kd": 2.7562103271484375, "loss/lm": 0.8888871669769287, "step": 163 }, { "epoch": 0.033668651200985424, "grad_norm": 1.6019108332790568, "kd_ratio": 0.5, "learning_rate": 1.9999360937713615e-05, "loss": 2.164584159851074, "loss/kd": 3.3977601528167725, "loss/lm": 0.9314082264900208, "step": 164 }, { "epoch": 0.03387394785464997, "grad_norm": 3.248757575262085, "kd_ratio": 0.5, "learning_rate": 1.999928354355121e-05, "loss": 1.8923251628875732, "loss/kd": 3.015277147293091, "loss/lm": 0.7693732976913452, "step": 165 }, { "epoch": 0.03407924450831452, "grad_norm": 1.6602980800092082, "kd_ratio": 0.5, "learning_rate": 1.9999201727082348e-05, "loss": 1.7690415382385254, "loss/kd": 2.5252187252044678, "loss/lm": 1.012864351272583, "step": 166 }, { "epoch": 0.03428454116197906, "grad_norm": 2.13120710660986, "kd_ratio": 0.5, "learning_rate": 1.9999115488343213e-05, "loss": 1.9530200958251953, "loss/kd": 3.117199659347534, "loss/lm": 0.7888405323028564, "step": 167 }, { "epoch": 0.034489837815643604, "grad_norm": 3.4208583402915447, "kd_ratio": 0.5, "learning_rate": 1.9999024827371946e-05, "loss": 1.8194950819015503, "loss/kd": 2.831695079803467, "loss/lm": 0.8072951436042786, "step": 168 }, { "epoch": 0.03469513446930815, "grad_norm": 3.394924114747004, "kd_ratio": 0.5, "learning_rate": 1.999892974420864e-05, "loss": 1.8499228954315186, "loss/kd": 2.8581302165985107, "loss/lm": 0.8417156934738159, "step": 169 }, { "epoch": 0.0349004311229727, "grad_norm": 3.402192535655255, "kd_ratio": 0.5, "learning_rate": 1.999883023889535e-05, "loss": 1.6265515089035034, "loss/kd": 2.5117063522338867, "loss/lm": 0.7413967251777649, "step": 170 }, { "epoch": 0.035105727776637244, "grad_norm": 3.5868698217127997, "kd_ratio": 0.5, "learning_rate": 1.999872631147608e-05, "loss": 1.5382853746414185, "loss/kd": 2.3148951530456543, "loss/lm": 0.7616755366325378, "step": 171 }, { "epoch": 0.035311024430301784, "grad_norm": 3.099619421109341, "kd_ratio": 0.5, "learning_rate": 1.9998617961996797e-05, "loss": 1.786040186882019, "loss/kd": 2.8107879161834717, "loss/lm": 0.7612924575805664, "step": 172 }, { "epoch": 0.03551632108396633, "grad_norm": 3.2545615951527136, "kd_ratio": 0.5, "learning_rate": 1.9998505190505423e-05, "loss": 1.8434911966323853, "loss/kd": 2.8312041759490967, "loss/lm": 0.8557782173156738, "step": 173 }, { "epoch": 0.03572161773763088, "grad_norm": 2.7241050409331646, "kd_ratio": 0.5, "learning_rate": 1.999838799705182e-05, "loss": 1.952164888381958, "loss/kd": 3.051138401031494, "loss/lm": 0.8531914949417114, "step": 174 }, { "epoch": 0.035926914391295424, "grad_norm": 2.5589028725264145, "kd_ratio": 0.5, "learning_rate": 1.999826638168783e-05, "loss": 2.049166440963745, "loss/kd": 3.1274077892303467, "loss/lm": 0.9709250330924988, "step": 175 }, { "epoch": 0.036132211044959964, "grad_norm": 2.659703740257912, "kd_ratio": 0.5, "learning_rate": 1.9998140344467233e-05, "loss": 1.9833964109420776, "loss/kd": 3.165487289428711, "loss/lm": 0.8013055324554443, "step": 176 }, { "epoch": 0.03633750769862451, "grad_norm": 2.095616056136659, "kd_ratio": 0.5, "learning_rate": 1.999800988544577e-05, "loss": 1.7370246648788452, "loss/kd": 2.6381263732910156, "loss/lm": 0.83592289686203, "step": 177 }, { "epoch": 0.03654280435228906, "grad_norm": 2.2188512843804498, "kd_ratio": 0.5, "learning_rate": 1.9997875004681147e-05, "loss": 1.8465970754623413, "loss/kd": 2.8196845054626465, "loss/lm": 0.8735096454620361, "step": 178 }, { "epoch": 0.036748101005953604, "grad_norm": 1.9997224797792272, "kd_ratio": 0.5, "learning_rate": 1.9997735702233006e-05, "loss": 2.008509397506714, "loss/kd": 3.1989340782165527, "loss/lm": 0.8180845379829407, "step": 179 }, { "epoch": 0.03695339765961815, "grad_norm": 2.478240598529132, "kd_ratio": 0.5, "learning_rate": 1.999759197816296e-05, "loss": 1.8396481275558472, "loss/kd": 2.7867696285247803, "loss/lm": 0.8925265669822693, "step": 180 }, { "epoch": 0.03715869431328269, "grad_norm": 1.6260273513948318, "kd_ratio": 0.5, "learning_rate": 1.9997443832534573e-05, "loss": 1.7486834526062012, "loss/kd": 2.6203701496124268, "loss/lm": 0.8769967555999756, "step": 181 }, { "epoch": 0.03736399096694724, "grad_norm": 3.1696599685617683, "kd_ratio": 0.5, "learning_rate": 1.999729126541336e-05, "loss": 1.8986576795578003, "loss/kd": 2.8241825103759766, "loss/lm": 0.973132848739624, "step": 182 }, { "epoch": 0.037569287620611784, "grad_norm": 3.2063726725651627, "kd_ratio": 0.5, "learning_rate": 1.99971342768668e-05, "loss": 1.9317128658294678, "loss/kd": 3.100665807723999, "loss/lm": 0.7627599239349365, "step": 183 }, { "epoch": 0.03777458427427633, "grad_norm": 2.5588363547788218, "kd_ratio": 0.5, "learning_rate": 1.999697286696433e-05, "loss": 1.9449591636657715, "loss/kd": 3.154680013656616, "loss/lm": 0.735238254070282, "step": 184 }, { "epoch": 0.03797988092794088, "grad_norm": 1.8831868125116706, "kd_ratio": 0.5, "learning_rate": 1.999680703577732e-05, "loss": 1.7533385753631592, "loss/kd": 2.627945899963379, "loss/lm": 0.8787311911582947, "step": 185 }, { "epoch": 0.03818517758160542, "grad_norm": 2.111163132982261, "kd_ratio": 0.5, "learning_rate": 1.999663678337912e-05, "loss": 1.625373125076294, "loss/kd": 2.4580483436584473, "loss/lm": 0.7926977872848511, "step": 186 }, { "epoch": 0.038390474235269964, "grad_norm": 1.746680395878304, "kd_ratio": 0.5, "learning_rate": 1.999646210984502e-05, "loss": 2.474456548690796, "loss/kd": 4.284259796142578, "loss/lm": 0.6646534204483032, "step": 187 }, { "epoch": 0.03859577088893451, "grad_norm": 2.2033965362726327, "kd_ratio": 0.5, "learning_rate": 1.9996283015252286e-05, "loss": 2.557138681411743, "loss/kd": 4.448230266571045, "loss/lm": 0.6660470962524414, "step": 188 }, { "epoch": 0.03880106754259906, "grad_norm": 1.8280409230410821, "kd_ratio": 0.5, "learning_rate": 1.999609949968011e-05, "loss": 1.9099900722503662, "loss/kd": 2.926781177520752, "loss/lm": 0.8931988477706909, "step": 189 }, { "epoch": 0.039006364196263604, "grad_norm": 2.67472827456577, "kd_ratio": 0.5, "learning_rate": 1.999591156320966e-05, "loss": 1.85675048828125, "loss/kd": 2.8431742191314697, "loss/lm": 0.8703266978263855, "step": 190 }, { "epoch": 0.039211660849928144, "grad_norm": 1.5893693608228432, "kd_ratio": 0.5, "learning_rate": 1.999571920592405e-05, "loss": 1.8165045976638794, "loss/kd": 2.964160680770874, "loss/lm": 0.66884845495224, "step": 191 }, { "epoch": 0.03941695750359269, "grad_norm": 1.7229210766185825, "kd_ratio": 0.5, "learning_rate": 1.9995522427908355e-05, "loss": 1.9170914888381958, "loss/kd": 3.1209802627563477, "loss/lm": 0.7132026553153992, "step": 192 }, { "epoch": 0.03962225415725724, "grad_norm": 1.505033898428979, "kd_ratio": 0.5, "learning_rate": 1.9995321229249605e-05, "loss": 1.5615078210830688, "loss/kd": 2.332746744155884, "loss/lm": 0.7902688980102539, "step": 193 }, { "epoch": 0.039827550810921784, "grad_norm": 1.6199402269998615, "kd_ratio": 0.5, "learning_rate": 1.999511561003678e-05, "loss": 1.8305615186691284, "loss/kd": 2.952789068222046, "loss/lm": 0.7083339095115662, "step": 194 }, { "epoch": 0.040032847464586324, "grad_norm": 2.5664544779560154, "kd_ratio": 0.5, "learning_rate": 1.9994905570360817e-05, "loss": 1.8722467422485352, "loss/kd": 2.800032615661621, "loss/lm": 0.944460928440094, "step": 195 }, { "epoch": 0.04023814411825087, "grad_norm": 2.5795830244987226, "kd_ratio": 0.5, "learning_rate": 1.9994691110314605e-05, "loss": 2.0104317665100098, "loss/kd": 2.96094012260437, "loss/lm": 1.0599232912063599, "step": 196 }, { "epoch": 0.04044344077191542, "grad_norm": 1.6238132476609024, "kd_ratio": 0.5, "learning_rate": 1.9994472229993e-05, "loss": 1.6692795753479004, "loss/kd": 2.5895273685455322, "loss/lm": 0.749031662940979, "step": 197 }, { "epoch": 0.040648737425579964, "grad_norm": 1.993862744445086, "kd_ratio": 0.5, "learning_rate": 1.9994248929492798e-05, "loss": 1.870476245880127, "loss/kd": 2.720855951309204, "loss/lm": 1.0200964212417603, "step": 198 }, { "epoch": 0.04085403407924451, "grad_norm": 1.9204322593618148, "kd_ratio": 0.5, "learning_rate": 1.999402120891276e-05, "loss": 1.8197952508926392, "loss/kd": 2.7795584201812744, "loss/lm": 0.8600320219993591, "step": 199 }, { "epoch": 0.04105933073290905, "grad_norm": 1.9484511206447659, "kd_ratio": 0.5, "learning_rate": 1.99937890683536e-05, "loss": 1.824784278869629, "loss/kd": 2.876267194747925, "loss/lm": 0.7733014822006226, "step": 200 }, { "epoch": 0.0412646273865736, "grad_norm": 1.5687545246663044, "kd_ratio": 0.5, "learning_rate": 1.9993552507917977e-05, "loss": 1.9030978679656982, "loss/kd": 3.026764154434204, "loss/lm": 0.7794315814971924, "step": 201 }, { "epoch": 0.041469924040238144, "grad_norm": 1.7299352833469865, "kd_ratio": 0.5, "learning_rate": 1.999331152771052e-05, "loss": 1.478719711303711, "loss/kd": 2.168848752975464, "loss/lm": 0.788590669631958, "step": 202 }, { "epoch": 0.04167522069390269, "grad_norm": 1.8198726515061583, "kd_ratio": 0.5, "learning_rate": 1.99930661278378e-05, "loss": 1.741032361984253, "loss/kd": 2.631458044052124, "loss/lm": 0.8506067395210266, "step": 203 }, { "epoch": 0.04188051734756724, "grad_norm": 1.4895229835848938, "kd_ratio": 0.5, "learning_rate": 1.9992816308408355e-05, "loss": 1.852175235748291, "loss/kd": 2.8608782291412354, "loss/lm": 0.8434723019599915, "step": 204 }, { "epoch": 0.04208581400123178, "grad_norm": 2.045979101324664, "kd_ratio": 0.5, "learning_rate": 1.9992562069532665e-05, "loss": 2.0528204441070557, "loss/kd": 3.3340725898742676, "loss/lm": 0.7715684771537781, "step": 205 }, { "epoch": 0.042291110654896324, "grad_norm": 1.9935675588334905, "kd_ratio": 0.5, "learning_rate": 1.9992303411323177e-05, "loss": 1.902407169342041, "loss/kd": 2.9196090698242188, "loss/lm": 0.8852053284645081, "step": 206 }, { "epoch": 0.04249640730856087, "grad_norm": 1.4778501991219175, "kd_ratio": 0.5, "learning_rate": 1.9992040333894273e-05, "loss": 1.8829175233840942, "loss/kd": 3.039324998855591, "loss/lm": 0.7265101075172424, "step": 207 }, { "epoch": 0.04270170396222542, "grad_norm": 1.394212531388074, "kd_ratio": 0.5, "learning_rate": 1.9991772837362315e-05, "loss": 1.7364177703857422, "loss/kd": 2.5884735584259033, "loss/lm": 0.8843618631362915, "step": 208 }, { "epoch": 0.04290700061588996, "grad_norm": 1.456092674939953, "kd_ratio": 0.5, "learning_rate": 1.99915009218456e-05, "loss": 1.6590352058410645, "loss/kd": 2.5609214305877686, "loss/lm": 0.7571489214897156, "step": 209 }, { "epoch": 0.043112297269554505, "grad_norm": 1.585869342391742, "kd_ratio": 0.5, "learning_rate": 1.999122458746439e-05, "loss": 2.002904176712036, "loss/kd": 3.239692449569702, "loss/lm": 0.7661157846450806, "step": 210 }, { "epoch": 0.04331759392321905, "grad_norm": 1.376864506196728, "kd_ratio": 0.5, "learning_rate": 1.9990943834340893e-05, "loss": 1.6506177186965942, "loss/kd": 2.629735231399536, "loss/lm": 0.6715002059936523, "step": 211 }, { "epoch": 0.0435228905768836, "grad_norm": 1.3544490711248485, "kd_ratio": 0.5, "learning_rate": 1.9990658662599282e-05, "loss": 2.015136241912842, "loss/kd": 3.3049726486206055, "loss/lm": 0.7253000736236572, "step": 212 }, { "epoch": 0.043728187230548145, "grad_norm": 2.63168449302951, "kd_ratio": 0.5, "learning_rate": 1.9990369072365666e-05, "loss": 1.860628366470337, "loss/kd": 2.885082244873047, "loss/lm": 0.8361744284629822, "step": 213 }, { "epoch": 0.043933483884212685, "grad_norm": 3.7307957623410246, "kd_ratio": 0.5, "learning_rate": 1.9990075063768135e-05, "loss": 1.8750131130218506, "loss/kd": 2.8994534015655518, "loss/lm": 0.850572943687439, "step": 214 }, { "epoch": 0.04413878053787723, "grad_norm": 1.462926567918004, "kd_ratio": 0.5, "learning_rate": 1.9989776636936705e-05, "loss": 1.5785481929779053, "loss/kd": 2.400527238845825, "loss/lm": 0.7565692067146301, "step": 215 }, { "epoch": 0.04434407719154178, "grad_norm": 3.902319200623343, "kd_ratio": 0.5, "learning_rate": 1.998947379200337e-05, "loss": 1.499224305152893, "loss/kd": 2.2926247119903564, "loss/lm": 0.7058238387107849, "step": 216 }, { "epoch": 0.044549373845206325, "grad_norm": 1.3393036745920206, "kd_ratio": 0.5, "learning_rate": 1.9989166529102056e-05, "loss": 1.8623610734939575, "loss/kd": 2.7701616287231445, "loss/lm": 0.9545605182647705, "step": 217 }, { "epoch": 0.04475467049887087, "grad_norm": 4.30117210763719, "kd_ratio": 0.5, "learning_rate": 1.998885484836866e-05, "loss": 1.7748974561691284, "loss/kd": 2.7703166007995605, "loss/lm": 0.7794782519340515, "step": 218 }, { "epoch": 0.04495996715253541, "grad_norm": 1.7294288688427362, "kd_ratio": 0.5, "learning_rate": 1.9988538749941024e-05, "loss": 1.6991441249847412, "loss/kd": 2.535189390182495, "loss/lm": 0.8630987405776978, "step": 219 }, { "epoch": 0.04516526380619996, "grad_norm": 3.591848628669552, "kd_ratio": 0.5, "learning_rate": 1.998821823395895e-05, "loss": 1.8632668256759644, "loss/kd": 2.872903823852539, "loss/lm": 0.8536298871040344, "step": 220 }, { "epoch": 0.045370560459864505, "grad_norm": 1.7189961535864449, "kd_ratio": 0.5, "learning_rate": 1.9987893300564186e-05, "loss": 1.7858482599258423, "loss/kd": 2.8115017414093018, "loss/lm": 0.760194718837738, "step": 221 }, { "epoch": 0.04557585711352905, "grad_norm": 1.8411025450172653, "kd_ratio": 0.5, "learning_rate": 1.998756394990044e-05, "loss": 1.9990228414535522, "loss/kd": 3.088792562484741, "loss/lm": 0.9092530608177185, "step": 222 }, { "epoch": 0.04578115376719359, "grad_norm": 1.8615789971403653, "kd_ratio": 0.5, "learning_rate": 1.9987230182113374e-05, "loss": 1.8030507564544678, "loss/kd": 2.888908863067627, "loss/lm": 0.717192530632019, "step": 223 }, { "epoch": 0.04598645042085814, "grad_norm": 3.9091378742190224, "kd_ratio": 0.5, "learning_rate": 1.9986891997350598e-05, "loss": 1.6637734174728394, "loss/kd": 2.5452747344970703, "loss/lm": 0.7822721004486084, "step": 224 }, { "epoch": 0.046191747074522685, "grad_norm": 2.9178867786910905, "kd_ratio": 0.5, "learning_rate": 1.9986549395761677e-05, "loss": 1.893010139465332, "loss/kd": 3.016423463821411, "loss/lm": 0.7695967555046082, "step": 225 }, { "epoch": 0.04639704372818723, "grad_norm": 2.30772416008835, "kd_ratio": 0.5, "learning_rate": 1.9986202377498133e-05, "loss": 1.6868590116500854, "loss/kd": 2.5818984508514404, "loss/lm": 0.7918196320533752, "step": 226 }, { "epoch": 0.04660234038185178, "grad_norm": 4.307737760677594, "kd_ratio": 0.5, "learning_rate": 1.998585094271344e-05, "loss": 1.8237199783325195, "loss/kd": 2.879458427429199, "loss/lm": 0.7679815292358398, "step": 227 }, { "epoch": 0.04680763703551632, "grad_norm": 1.7911087950071884, "kd_ratio": 0.5, "learning_rate": 1.998549509156302e-05, "loss": 1.7558649778366089, "loss/kd": 2.780686378479004, "loss/lm": 0.7310435175895691, "step": 228 }, { "epoch": 0.047012933689180865, "grad_norm": 1.9187849804964872, "kd_ratio": 0.5, "learning_rate": 1.9985134824204256e-05, "loss": 1.8197721242904663, "loss/kd": 2.9308574199676514, "loss/lm": 0.708686888217926, "step": 229 }, { "epoch": 0.04721823034284541, "grad_norm": 2.3048838144273223, "kd_ratio": 0.5, "learning_rate": 1.998477014079648e-05, "loss": 1.598859190940857, "loss/kd": 2.439883232116699, "loss/lm": 0.7578350901603699, "step": 230 }, { "epoch": 0.04742352699650996, "grad_norm": 3.5774770134654785, "kd_ratio": 0.5, "learning_rate": 1.998440104150098e-05, "loss": 1.6251909732818604, "loss/kd": 2.6050870418548584, "loss/lm": 0.6452949643135071, "step": 231 }, { "epoch": 0.047628823650174505, "grad_norm": 1.813966096702953, "kd_ratio": 0.5, "learning_rate": 1.998402752648099e-05, "loss": 1.8450679779052734, "loss/kd": 2.975710153579712, "loss/lm": 0.714425802230835, "step": 232 }, { "epoch": 0.047834120303839045, "grad_norm": 3.108482842829256, "kd_ratio": 0.5, "learning_rate": 1.9983649595901706e-05, "loss": 2.108384132385254, "loss/kd": 3.417876958847046, "loss/lm": 0.7988914847373962, "step": 233 }, { "epoch": 0.04803941695750359, "grad_norm": 2.144527592884591, "kd_ratio": 0.5, "learning_rate": 1.998326724993027e-05, "loss": 1.5819610357284546, "loss/kd": 2.3511157035827637, "loss/lm": 0.8128064274787903, "step": 234 }, { "epoch": 0.04824471361116814, "grad_norm": 2.0501784959345914, "kd_ratio": 0.5, "learning_rate": 1.998288048873578e-05, "loss": 1.7052929401397705, "loss/kd": 2.64416766166687, "loss/lm": 0.7664180994033813, "step": 235 }, { "epoch": 0.048450010264832685, "grad_norm": 2.4021474131343195, "kd_ratio": 0.5, "learning_rate": 1.9982489312489286e-05, "loss": 1.8546009063720703, "loss/kd": 2.979661464691162, "loss/lm": 0.7295402884483337, "step": 236 }, { "epoch": 0.048655306918497225, "grad_norm": 1.9523314750529903, "kd_ratio": 0.5, "learning_rate": 1.9982093721363792e-05, "loss": 1.665312647819519, "loss/kd": 2.5359156131744385, "loss/lm": 0.7947096824645996, "step": 237 }, { "epoch": 0.04886060357216177, "grad_norm": 3.1414302751445096, "kd_ratio": 0.5, "learning_rate": 1.998169371553425e-05, "loss": 2.043761968612671, "loss/kd": 3.288501262664795, "loss/lm": 0.7990224957466125, "step": 238 }, { "epoch": 0.04906590022582632, "grad_norm": 4.874179288083487, "kd_ratio": 0.5, "learning_rate": 1.9981289295177566e-05, "loss": 1.7751905918121338, "loss/kd": 2.7753031253814697, "loss/lm": 0.7750781774520874, "step": 239 }, { "epoch": 0.049271196879490865, "grad_norm": 2.525748357222383, "kd_ratio": 0.5, "learning_rate": 1.9980880460472605e-05, "loss": 1.6125175952911377, "loss/kd": 2.5602641105651855, "loss/lm": 0.6647710800170898, "step": 240 }, { "epoch": 0.04947649353315541, "grad_norm": 3.9831989925734437, "kd_ratio": 0.5, "learning_rate": 1.9980467211600176e-05, "loss": 1.7043628692626953, "loss/kd": 2.5885210037231445, "loss/lm": 0.8202048540115356, "step": 241 }, { "epoch": 0.04968179018681995, "grad_norm": 2.292447117831205, "kd_ratio": 0.5, "learning_rate": 1.9980049548743047e-05, "loss": 1.676912784576416, "loss/kd": 2.391294240951538, "loss/lm": 0.9625314474105835, "step": 242 }, { "epoch": 0.0498870868404845, "grad_norm": 4.768706722924629, "kd_ratio": 0.5, "learning_rate": 1.9979627472085927e-05, "loss": 1.7255005836486816, "loss/kd": 2.662968397140503, "loss/lm": 0.7880328297615051, "step": 243 }, { "epoch": 0.050092383494149045, "grad_norm": 2.0058652093082645, "kd_ratio": 0.5, "learning_rate": 1.9979200981815493e-05, "loss": 1.5082218647003174, "loss/kd": 2.2791433334350586, "loss/lm": 0.7373002767562866, "step": 244 }, { "epoch": 0.05029768014781359, "grad_norm": 2.4847406750473326, "kd_ratio": 0.5, "learning_rate": 1.9978770078120362e-05, "loss": 1.5983916521072388, "loss/kd": 2.425185441970825, "loss/lm": 0.7715979218482971, "step": 245 }, { "epoch": 0.05050297680147814, "grad_norm": 2.8367768277970375, "kd_ratio": 0.5, "learning_rate": 1.9978334761191106e-05, "loss": 1.586836338043213, "loss/kd": 2.4452295303344727, "loss/lm": 0.7284432649612427, "step": 246 }, { "epoch": 0.05070827345514268, "grad_norm": 2.7154850012888736, "kd_ratio": 0.5, "learning_rate": 1.997789503122025e-05, "loss": 1.4347492456436157, "loss/kd": 2.0193731784820557, "loss/lm": 0.8501253128051758, "step": 247 }, { "epoch": 0.050913570108807225, "grad_norm": 4.216053815759349, "kd_ratio": 0.5, "learning_rate": 1.997745088840227e-05, "loss": 1.6273517608642578, "loss/kd": 2.6062331199645996, "loss/lm": 0.6484704613685608, "step": 248 }, { "epoch": 0.05111886676247177, "grad_norm": 2.893300596181721, "kd_ratio": 0.5, "learning_rate": 1.9977002332933592e-05, "loss": 1.5852535963058472, "loss/kd": 2.3368980884552, "loss/lm": 0.8336091637611389, "step": 249 }, { "epoch": 0.05132416341613632, "grad_norm": 2.4362714990193144, "kd_ratio": 0.5, "learning_rate": 1.9976549365012597e-05, "loss": 1.7637708187103271, "loss/kd": 2.8031275272369385, "loss/lm": 0.7244141697883606, "step": 250 }, { "epoch": 0.051529460069800866, "grad_norm": 3.3571810830709583, "kd_ratio": 0.5, "learning_rate": 1.9976091984839616e-05, "loss": 1.692673683166504, "loss/kd": 2.6372365951538086, "loss/lm": 0.7481108903884888, "step": 251 }, { "epoch": 0.051734756723465405, "grad_norm": 2.6756115624641836, "kd_ratio": 0.5, "learning_rate": 1.997563019261693e-05, "loss": 1.6596932411193848, "loss/kd": 2.4371073246002197, "loss/lm": 0.8822792768478394, "step": 252 }, { "epoch": 0.05194005337712995, "grad_norm": 2.514384273618872, "kd_ratio": 0.5, "learning_rate": 1.9975163988548775e-05, "loss": 1.6547311544418335, "loss/kd": 2.5005552768707275, "loss/lm": 0.8089070320129395, "step": 253 }, { "epoch": 0.0521453500307945, "grad_norm": 3.091307168842892, "kd_ratio": 0.5, "learning_rate": 1.997469337284133e-05, "loss": 1.7222270965576172, "loss/kd": 2.7130627632141113, "loss/lm": 0.7313914895057678, "step": 254 }, { "epoch": 0.052350646684459046, "grad_norm": 1.3693789684553144, "kd_ratio": 0.5, "learning_rate": 1.9974218345702733e-05, "loss": 1.7772350311279297, "loss/kd": 2.7145864963531494, "loss/lm": 0.8398836255073547, "step": 255 }, { "epoch": 0.052555943338123585, "grad_norm": 2.5460985104261784, "kd_ratio": 0.5, "learning_rate": 1.9973738907343074e-05, "loss": 1.5255088806152344, "loss/kd": 2.324805498123169, "loss/lm": 0.726212203502655, "step": 256 }, { "epoch": 0.05276123999178813, "grad_norm": 2.271324714560755, "kd_ratio": 0.5, "learning_rate": 1.997325505797439e-05, "loss": 1.7014482021331787, "loss/kd": 2.7484939098358154, "loss/lm": 0.6544023752212524, "step": 257 }, { "epoch": 0.05296653664545268, "grad_norm": 1.6923614678494434, "kd_ratio": 0.5, "learning_rate": 1.997276679781066e-05, "loss": 1.8621052503585815, "loss/kd": 2.892076253890991, "loss/lm": 0.8321341872215271, "step": 258 }, { "epoch": 0.053171833299117226, "grad_norm": 2.6132700182953283, "kd_ratio": 0.5, "learning_rate": 1.9972274127067838e-05, "loss": 1.484854817390442, "loss/kd": 2.2574527263641357, "loss/lm": 0.7122568488121033, "step": 259 }, { "epoch": 0.05337712995278177, "grad_norm": 2.0065663359105694, "kd_ratio": 0.5, "learning_rate": 1.99717770459638e-05, "loss": 1.7073851823806763, "loss/kd": 2.588604688644409, "loss/lm": 0.8261657357215881, "step": 260 }, { "epoch": 0.05358242660644631, "grad_norm": 2.35852539717841, "kd_ratio": 0.5, "learning_rate": 1.9971275554718395e-05, "loss": 1.7325025796890259, "loss/kd": 2.7255196571350098, "loss/lm": 0.739485502243042, "step": 261 }, { "epoch": 0.05378772326011086, "grad_norm": 3.7260204060886273, "kd_ratio": 0.5, "learning_rate": 1.997076965355341e-05, "loss": 1.5338906049728394, "loss/kd": 2.4934473037719727, "loss/lm": 0.574333906173706, "step": 262 }, { "epoch": 0.053993019913775406, "grad_norm": 2.383525041102166, "kd_ratio": 0.5, "learning_rate": 1.997025934269259e-05, "loss": 1.7238517999649048, "loss/kd": 2.721195936203003, "loss/lm": 0.7265076041221619, "step": 263 }, { "epoch": 0.05419831656743995, "grad_norm": 1.5918357518923134, "kd_ratio": 0.5, "learning_rate": 1.996974462236162e-05, "loss": 1.4654597043991089, "loss/kd": 2.2910478115081787, "loss/lm": 0.6398715972900391, "step": 264 }, { "epoch": 0.0544036132211045, "grad_norm": 3.2624137285268087, "kd_ratio": 0.5, "learning_rate": 1.9969225492788146e-05, "loss": 1.7407951354980469, "loss/kd": 2.8039333820343018, "loss/lm": 0.6776567697525024, "step": 265 }, { "epoch": 0.05460890987476904, "grad_norm": 1.6074715852065773, "kd_ratio": 0.5, "learning_rate": 1.9968701954201758e-05, "loss": 1.990717887878418, "loss/kd": 3.2102456092834473, "loss/lm": 0.7711901068687439, "step": 266 }, { "epoch": 0.054814206528433586, "grad_norm": 2.2401826916324845, "kd_ratio": 0.5, "learning_rate": 1.9968174006833996e-05, "loss": 1.8976107835769653, "loss/kd": 2.988750696182251, "loss/lm": 0.8064708709716797, "step": 267 }, { "epoch": 0.05501950318209813, "grad_norm": 1.608012462552153, "kd_ratio": 0.5, "learning_rate": 1.9967641650918352e-05, "loss": 1.8384625911712646, "loss/kd": 3.104959726333618, "loss/lm": 0.5719654560089111, "step": 268 }, { "epoch": 0.05522479983576268, "grad_norm": 1.4845967535568148, "kd_ratio": 0.5, "learning_rate": 1.996710488669027e-05, "loss": 1.9172114133834839, "loss/kd": 3.103095293045044, "loss/lm": 0.731327474117279, "step": 269 }, { "epoch": 0.05543009648942722, "grad_norm": 1.6043471916035752, "kd_ratio": 0.5, "learning_rate": 1.996656371438714e-05, "loss": 1.7301284074783325, "loss/kd": 2.785702705383301, "loss/lm": 0.674554169178009, "step": 270 }, { "epoch": 0.055635393143091766, "grad_norm": 1.4503857111475502, "kd_ratio": 0.5, "learning_rate": 1.9966018134248296e-05, "loss": 1.849181890487671, "loss/kd": 2.8365895748138428, "loss/lm": 0.8617741465568542, "step": 271 }, { "epoch": 0.05584068979675631, "grad_norm": 1.8424823785591764, "kd_ratio": 0.5, "learning_rate": 1.9965468146515036e-05, "loss": 1.7064955234527588, "loss/kd": 2.6336660385131836, "loss/lm": 0.7793248891830444, "step": 272 }, { "epoch": 0.05604598645042086, "grad_norm": 1.7872128725790022, "kd_ratio": 0.5, "learning_rate": 1.9964913751430593e-05, "loss": 1.5665761232376099, "loss/kd": 2.308602809906006, "loss/lm": 0.8245494365692139, "step": 273 }, { "epoch": 0.056251283104085406, "grad_norm": 3.3194723640162493, "kd_ratio": 0.5, "learning_rate": 1.9964354949240153e-05, "loss": 1.623845100402832, "loss/kd": 2.4662771224975586, "loss/lm": 0.781413197517395, "step": 274 }, { "epoch": 0.056456579757749946, "grad_norm": 2.934660075867764, "kd_ratio": 0.5, "learning_rate": 1.9963791740190863e-05, "loss": 1.3525481224060059, "loss/kd": 2.0593762397766113, "loss/lm": 0.6457198858261108, "step": 275 }, { "epoch": 0.05666187641141449, "grad_norm": 2.0488315821550915, "kd_ratio": 0.5, "learning_rate": 1.99632241245318e-05, "loss": 1.6878669261932373, "loss/kd": 2.589035987854004, "loss/lm": 0.7866979837417603, "step": 276 }, { "epoch": 0.05686717306507904, "grad_norm": 3.1494182104754556, "kd_ratio": 0.5, "learning_rate": 1.9962652102514005e-05, "loss": 1.8099398612976074, "loss/kd": 2.8469274044036865, "loss/lm": 0.7729521989822388, "step": 277 }, { "epoch": 0.057072469718743586, "grad_norm": 2.5028878449602794, "kd_ratio": 0.5, "learning_rate": 1.9962075674390456e-05, "loss": 1.5985395908355713, "loss/kd": 2.3917794227600098, "loss/lm": 0.805299699306488, "step": 278 }, { "epoch": 0.05727776637240813, "grad_norm": 1.867809272612178, "kd_ratio": 0.5, "learning_rate": 1.996149484041609e-05, "loss": 1.4587697982788086, "loss/kd": 2.2200191020965576, "loss/lm": 0.6975204944610596, "step": 279 }, { "epoch": 0.05748306302607267, "grad_norm": 3.240870504391221, "kd_ratio": 0.5, "learning_rate": 1.9960909600847787e-05, "loss": 1.5309360027313232, "loss/kd": 2.37026309967041, "loss/lm": 0.6916090250015259, "step": 280 }, { "epoch": 0.05768835967973722, "grad_norm": 2.7626949404480223, "kd_ratio": 0.5, "learning_rate": 1.9960319955944377e-05, "loss": 1.7485064268112183, "loss/kd": 2.7210233211517334, "loss/lm": 0.7759894728660583, "step": 281 }, { "epoch": 0.057893656333401766, "grad_norm": 3.2997517521239903, "kd_ratio": 0.5, "learning_rate": 1.9959725905966633e-05, "loss": 1.694644808769226, "loss/kd": 2.730889320373535, "loss/lm": 0.6584003567695618, "step": 282 }, { "epoch": 0.05809895298706631, "grad_norm": 2.7659570325627625, "kd_ratio": 0.5, "learning_rate": 1.9959127451177287e-05, "loss": 1.5558314323425293, "loss/kd": 2.5138180255889893, "loss/lm": 0.5978447794914246, "step": 283 }, { "epoch": 0.05830424964073085, "grad_norm": 1.8365175865572538, "kd_ratio": 0.5, "learning_rate": 1.995852459184101e-05, "loss": 1.7949343919754028, "loss/kd": 2.7855308055877686, "loss/lm": 0.8043380379676819, "step": 284 }, { "epoch": 0.0585095462943954, "grad_norm": 1.8538246944800996, "kd_ratio": 0.5, "learning_rate": 1.9957917328224424e-05, "loss": 1.7892392873764038, "loss/kd": 2.883887529373169, "loss/lm": 0.6945909857749939, "step": 285 }, { "epoch": 0.058714842948059946, "grad_norm": 2.2388399079585883, "kd_ratio": 0.5, "learning_rate": 1.99573056605961e-05, "loss": 1.693166732788086, "loss/kd": 2.5861382484436035, "loss/lm": 0.8001950979232788, "step": 286 }, { "epoch": 0.05892013960172449, "grad_norm": 2.0254096016990353, "kd_ratio": 0.5, "learning_rate": 1.9956689589226555e-05, "loss": 1.5118072032928467, "loss/kd": 2.2710931301116943, "loss/lm": 0.7525213360786438, "step": 287 }, { "epoch": 0.05912543625538904, "grad_norm": 1.5631297134027093, "kd_ratio": 0.5, "learning_rate": 1.995606911438825e-05, "loss": 1.8672181367874146, "loss/kd": 2.986370325088501, "loss/lm": 0.7480659484863281, "step": 288 }, { "epoch": 0.05933073290905358, "grad_norm": 2.4946395736288434, "kd_ratio": 0.5, "learning_rate": 1.995544423635561e-05, "loss": 1.6355810165405273, "loss/kd": 2.6938583850860596, "loss/lm": 0.5773035883903503, "step": 289 }, { "epoch": 0.059536029562718126, "grad_norm": 1.6891279742805916, "kd_ratio": 0.5, "learning_rate": 1.995481495540498e-05, "loss": 1.6285367012023926, "loss/kd": 2.5406224727630615, "loss/lm": 0.7164509892463684, "step": 290 }, { "epoch": 0.05974132621638267, "grad_norm": 2.2524056352241635, "kd_ratio": 0.5, "learning_rate": 1.9954181271814673e-05, "loss": 1.3884544372558594, "loss/kd": 2.038482189178467, "loss/lm": 0.7384266257286072, "step": 291 }, { "epoch": 0.05994662287004722, "grad_norm": 1.2374631849634845, "kd_ratio": 0.5, "learning_rate": 1.995354318586495e-05, "loss": 1.647318959236145, "loss/kd": 2.604184627532959, "loss/lm": 0.6904533505439758, "step": 292 }, { "epoch": 0.06015191952371177, "grad_norm": 1.9305175424485879, "kd_ratio": 0.5, "learning_rate": 1.9952900697838004e-05, "loss": 1.4491702318191528, "loss/kd": 2.1272072792053223, "loss/lm": 0.7711332440376282, "step": 293 }, { "epoch": 0.060357216177376306, "grad_norm": 1.7947053121490353, "kd_ratio": 0.5, "learning_rate": 1.9952253808017986e-05, "loss": 1.5231072902679443, "loss/kd": 2.330944776535034, "loss/lm": 0.7152697443962097, "step": 294 }, { "epoch": 0.06056251283104085, "grad_norm": 3.103555112841317, "kd_ratio": 0.5, "learning_rate": 1.9951602516690988e-05, "loss": 1.7568775415420532, "loss/kd": 2.7685115337371826, "loss/lm": 0.7452436089515686, "step": 295 }, { "epoch": 0.0607678094847054, "grad_norm": 1.7523366113732561, "kd_ratio": 0.5, "learning_rate": 1.9950946824145058e-05, "loss": 1.633457899093628, "loss/kd": 2.447814464569092, "loss/lm": 0.8191012740135193, "step": 296 }, { "epoch": 0.06097310613836995, "grad_norm": 1.3732583478681644, "kd_ratio": 0.5, "learning_rate": 1.9950286730670183e-05, "loss": 1.6022872924804688, "loss/kd": 2.4923617839813232, "loss/lm": 0.7122127413749695, "step": 297 }, { "epoch": 0.061178402792034486, "grad_norm": 1.4630784424892214, "kd_ratio": 0.5, "learning_rate": 1.9949622236558294e-05, "loss": 1.5185550451278687, "loss/kd": 2.3637380599975586, "loss/lm": 0.6733720302581787, "step": 298 }, { "epoch": 0.06138369944569903, "grad_norm": 2.1688996869249437, "kd_ratio": 0.5, "learning_rate": 1.9948953342103268e-05, "loss": 1.862894058227539, "loss/kd": 3.091749429702759, "loss/lm": 0.6340386867523193, "step": 299 }, { "epoch": 0.06158899609936358, "grad_norm": 3.016930935405189, "kd_ratio": 0.5, "learning_rate": 1.994828004760094e-05, "loss": 1.6817396879196167, "loss/kd": 2.6458239555358887, "loss/lm": 0.7176553606987, "step": 300 }, { "epoch": 0.06179429275302813, "grad_norm": 1.4354942840984095, "kd_ratio": 0.5, "learning_rate": 1.994760235334908e-05, "loss": 1.6244680881500244, "loss/kd": 2.5537800788879395, "loss/lm": 0.6951562166213989, "step": 301 }, { "epoch": 0.06199958940669267, "grad_norm": 3.1815245520988866, "kd_ratio": 0.5, "learning_rate": 1.9946920259647408e-05, "loss": 1.756564974784851, "loss/kd": 2.6158010959625244, "loss/lm": 0.897328794002533, "step": 302 }, { "epoch": 0.06220488606035721, "grad_norm": 3.213567141961111, "kd_ratio": 0.5, "learning_rate": 1.994623376679758e-05, "loss": 1.5641082525253296, "loss/kd": 2.5653669834136963, "loss/lm": 0.5628495216369629, "step": 303 }, { "epoch": 0.06241018271402176, "grad_norm": 1.7491093044285977, "kd_ratio": 0.5, "learning_rate": 1.9945542875103218e-05, "loss": 1.7764267921447754, "loss/kd": 2.810523271560669, "loss/lm": 0.7423303723335266, "step": 304 }, { "epoch": 0.06261547936768631, "grad_norm": 4.393490906347425, "kd_ratio": 0.5, "learning_rate": 1.994484758486987e-05, "loss": 1.65035080909729, "loss/kd": 2.579761028289795, "loss/lm": 0.7209406495094299, "step": 305 }, { "epoch": 0.06282077602135085, "grad_norm": 4.530224159735039, "kd_ratio": 0.5, "learning_rate": 1.994414789640504e-05, "loss": 1.5997533798217773, "loss/kd": 2.5745108127593994, "loss/lm": 0.6249960660934448, "step": 306 }, { "epoch": 0.0630260726750154, "grad_norm": 1.689047947417952, "kd_ratio": 0.5, "learning_rate": 1.9943443810018174e-05, "loss": 1.5925244092941284, "loss/kd": 2.5032875537872314, "loss/lm": 0.6817612648010254, "step": 307 }, { "epoch": 0.06323136932867994, "grad_norm": 4.103128372948419, "kd_ratio": 0.5, "learning_rate": 1.9942735326020658e-05, "loss": 1.7637364864349365, "loss/kd": 2.792513847351074, "loss/lm": 0.7349592447280884, "step": 308 }, { "epoch": 0.06343666598234449, "grad_norm": 3.079523463724084, "kd_ratio": 0.5, "learning_rate": 1.9942022444725833e-05, "loss": 1.792527198791504, "loss/kd": 2.866885185241699, "loss/lm": 0.7181692719459534, "step": 309 }, { "epoch": 0.06364196263600903, "grad_norm": 2.8543707112127694, "kd_ratio": 0.5, "learning_rate": 1.9941305166448973e-05, "loss": 1.4537588357925415, "loss/kd": 2.3055930137634277, "loss/lm": 0.6019246578216553, "step": 310 }, { "epoch": 0.06384725928967358, "grad_norm": 5.216639961606918, "kd_ratio": 0.5, "learning_rate": 1.9940583491507314e-05, "loss": 1.5588364601135254, "loss/kd": 2.4226176738739014, "loss/lm": 0.6950552463531494, "step": 311 }, { "epoch": 0.06405255594333813, "grad_norm": 3.08315514383033, "kd_ratio": 0.5, "learning_rate": 1.9939857420220016e-05, "loss": 1.5329983234405518, "loss/kd": 2.2945899963378906, "loss/lm": 0.7714066505432129, "step": 312 }, { "epoch": 0.06425785259700267, "grad_norm": 2.4032374692618674, "kd_ratio": 0.5, "learning_rate": 1.9939126952908198e-05, "loss": 1.5559462308883667, "loss/kd": 2.4095263481140137, "loss/lm": 0.702366054058075, "step": 313 }, { "epoch": 0.06446314925066722, "grad_norm": 3.569549298727343, "kd_ratio": 0.5, "learning_rate": 1.9938392089894912e-05, "loss": 1.539535403251648, "loss/kd": 2.389275074005127, "loss/lm": 0.689795732498169, "step": 314 }, { "epoch": 0.06466844590433175, "grad_norm": 1.4350359613168628, "kd_ratio": 0.5, "learning_rate": 1.993765283150517e-05, "loss": 1.4886877536773682, "loss/kd": 2.377107858657837, "loss/lm": 0.6002677083015442, "step": 315 }, { "epoch": 0.0648737425579963, "grad_norm": 3.429500286784062, "kd_ratio": 0.5, "learning_rate": 1.9936909178065912e-05, "loss": 1.6252360343933105, "loss/kd": 2.505064010620117, "loss/lm": 0.7454081177711487, "step": 316 }, { "epoch": 0.06507903921166085, "grad_norm": 1.7532436967073564, "kd_ratio": 0.5, "learning_rate": 1.9936161129906025e-05, "loss": 1.4328532218933105, "loss/kd": 2.2792067527770996, "loss/lm": 0.5864997506141663, "step": 317 }, { "epoch": 0.0652843358653254, "grad_norm": 2.3214361686278786, "kd_ratio": 0.5, "learning_rate": 1.993540868735635e-05, "loss": 1.517982006072998, "loss/kd": 2.4532883167266846, "loss/lm": 0.5826758146286011, "step": 318 }, { "epoch": 0.06548963251898994, "grad_norm": 2.3329325475641176, "kd_ratio": 0.5, "learning_rate": 1.9934651850749663e-05, "loss": 1.5179765224456787, "loss/kd": 2.3453433513641357, "loss/lm": 0.6906098127365112, "step": 319 }, { "epoch": 0.06569492917265449, "grad_norm": 2.854398991337633, "kd_ratio": 0.5, "learning_rate": 1.9933890620420678e-05, "loss": 2.4946436882019043, "loss/kd": 4.320828914642334, "loss/lm": 0.6684586405754089, "step": 320 }, { "epoch": 0.06590022582631903, "grad_norm": 2.6186829815248807, "kd_ratio": 0.5, "learning_rate": 1.993312499670606e-05, "loss": 1.4582629203796387, "loss/kd": 2.298892021179199, "loss/lm": 0.6176337003707886, "step": 321 }, { "epoch": 0.06610552247998358, "grad_norm": 1.6409901398752444, "kd_ratio": 0.5, "learning_rate": 1.993235497994442e-05, "loss": 1.5514249801635742, "loss/kd": 2.469120740890503, "loss/lm": 0.6337293386459351, "step": 322 }, { "epoch": 0.06631081913364813, "grad_norm": 2.157649988799275, "kd_ratio": 0.5, "learning_rate": 1.9931580570476306e-05, "loss": 1.617522954940796, "loss/kd": 2.6006011962890625, "loss/lm": 0.6344445943832397, "step": 323 }, { "epoch": 0.06651611578731266, "grad_norm": 1.7076388213153373, "kd_ratio": 0.5, "learning_rate": 1.9930801768644207e-05, "loss": 1.5160818099975586, "loss/kd": 2.251540422439575, "loss/lm": 0.780623197555542, "step": 324 }, { "epoch": 0.0667214124409772, "grad_norm": 3.3539492318266872, "kd_ratio": 0.5, "learning_rate": 1.9930018574792563e-05, "loss": 1.319311499595642, "loss/kd": 1.9969043731689453, "loss/lm": 0.6417185664176941, "step": 325 }, { "epoch": 0.06692670909464175, "grad_norm": 1.4287372582847746, "kd_ratio": 0.5, "learning_rate": 1.9929230989267747e-05, "loss": 1.6993553638458252, "loss/kd": 2.7766780853271484, "loss/lm": 0.6220327615737915, "step": 326 }, { "epoch": 0.0671320057483063, "grad_norm": 3.4885530041248427, "kd_ratio": 0.5, "learning_rate": 1.9928439012418076e-05, "loss": 1.4787541627883911, "loss/kd": 2.3202426433563232, "loss/lm": 0.6372657418251038, "step": 327 }, { "epoch": 0.06733730240197085, "grad_norm": 2.4975124887223994, "kd_ratio": 0.5, "learning_rate": 1.9927642644593818e-05, "loss": 1.7873777151107788, "loss/kd": 2.887314796447754, "loss/lm": 0.6874406933784485, "step": 328 }, { "epoch": 0.0675425990556354, "grad_norm": 2.0568034968855846, "kd_ratio": 0.5, "learning_rate": 1.9926841886147176e-05, "loss": 1.6055097579956055, "loss/kd": 2.510470390319824, "loss/lm": 0.7005492448806763, "step": 329 }, { "epoch": 0.06774789570929994, "grad_norm": 2.157724245990829, "kd_ratio": 0.5, "learning_rate": 1.992603673743229e-05, "loss": 1.6184892654418945, "loss/kd": 2.466291904449463, "loss/lm": 0.7706867456436157, "step": 330 }, { "epoch": 0.06795319236296449, "grad_norm": 1.446775060273043, "kd_ratio": 0.5, "learning_rate": 1.9925227198805247e-05, "loss": 1.965736746788025, "loss/kd": 3.136406183242798, "loss/lm": 0.7950672507286072, "step": 331 }, { "epoch": 0.06815848901662903, "grad_norm": 1.7173559717669324, "kd_ratio": 0.5, "learning_rate": 1.9924413270624083e-05, "loss": 1.3792332410812378, "loss/kd": 2.062103748321533, "loss/lm": 0.6963627934455872, "step": 332 }, { "epoch": 0.06836378567029358, "grad_norm": 2.2136164412930466, "kd_ratio": 0.5, "learning_rate": 1.992359495324876e-05, "loss": 1.57779061794281, "loss/kd": 2.3649535179138184, "loss/lm": 0.790627658367157, "step": 333 }, { "epoch": 0.06856908232395811, "grad_norm": 2.235814970505945, "kd_ratio": 0.5, "learning_rate": 1.9922772247041196e-05, "loss": 1.6490479707717896, "loss/kd": 2.4993622303009033, "loss/lm": 0.7987337112426758, "step": 334 }, { "epoch": 0.06877437897762266, "grad_norm": 1.6840154374637815, "kd_ratio": 0.5, "learning_rate": 1.9921945152365235e-05, "loss": 1.7595525979995728, "loss/kd": 2.8185572624206543, "loss/lm": 0.7005479335784912, "step": 335 }, { "epoch": 0.06897967563128721, "grad_norm": 1.0683367192021427, "kd_ratio": 0.5, "learning_rate": 1.9921113669586677e-05, "loss": 2.077440023422241, "loss/kd": 3.231379985809326, "loss/lm": 0.9235000014305115, "step": 336 }, { "epoch": 0.06918497228495175, "grad_norm": 1.3302179324334955, "kd_ratio": 0.5, "learning_rate": 1.9920277799073247e-05, "loss": 1.5832817554473877, "loss/kd": 2.5419373512268066, "loss/lm": 0.624626100063324, "step": 337 }, { "epoch": 0.0693902689386163, "grad_norm": 1.410876464659953, "kd_ratio": 0.5, "learning_rate": 1.9919437541194628e-05, "loss": 1.5783885717391968, "loss/kd": 2.5369515419006348, "loss/lm": 0.619825541973114, "step": 338 }, { "epoch": 0.06959556559228085, "grad_norm": 2.0439619050903555, "kd_ratio": 0.5, "learning_rate": 1.9918592896322432e-05, "loss": 1.6664248704910278, "loss/kd": 2.5728371143341064, "loss/lm": 0.760012686252594, "step": 339 }, { "epoch": 0.0698008622459454, "grad_norm": 1.4688757197859872, "kd_ratio": 0.5, "learning_rate": 1.991774386483021e-05, "loss": 1.6313178539276123, "loss/kd": 2.539435625076294, "loss/lm": 0.7232002019882202, "step": 340 }, { "epoch": 0.07000615889960994, "grad_norm": 1.7856284584012219, "kd_ratio": 0.5, "learning_rate": 1.9916890447093458e-05, "loss": 2.060617685317993, "loss/kd": 3.397411584854126, "loss/lm": 0.7238236665725708, "step": 341 }, { "epoch": 0.07021145555327449, "grad_norm": 3.0023601892059544, "kd_ratio": 0.5, "learning_rate": 1.991603264348961e-05, "loss": 1.8308783769607544, "loss/kd": 2.955456256866455, "loss/lm": 0.7063004374504089, "step": 342 }, { "epoch": 0.07041675220693902, "grad_norm": 1.8352477674427614, "kd_ratio": 0.5, "learning_rate": 1.9915170454398045e-05, "loss": 1.4655145406723022, "loss/kd": 2.354445219039917, "loss/lm": 0.5765838027000427, "step": 343 }, { "epoch": 0.07062204886060357, "grad_norm": 2.307405877253676, "kd_ratio": 0.5, "learning_rate": 1.9914303880200072e-05, "loss": 1.4921107292175293, "loss/kd": 2.304994821548462, "loss/lm": 0.6792265176773071, "step": 344 }, { "epoch": 0.07082734551426811, "grad_norm": 2.675300099507052, "kd_ratio": 0.5, "learning_rate": 1.9913432921278945e-05, "loss": 1.4655041694641113, "loss/kd": 2.0901520252227783, "loss/lm": 0.8408561944961548, "step": 345 }, { "epoch": 0.07103264216793266, "grad_norm": 2.387232637454804, "kd_ratio": 0.5, "learning_rate": 1.991255757801986e-05, "loss": 1.428192138671875, "loss/kd": 2.1990129947662354, "loss/lm": 0.6573712825775146, "step": 346 }, { "epoch": 0.07123793882159721, "grad_norm": 1.4437850204461626, "kd_ratio": 0.5, "learning_rate": 1.9911677850809943e-05, "loss": 2.4139044284820557, "loss/kd": 4.268982887268066, "loss/lm": 0.5588260889053345, "step": 347 }, { "epoch": 0.07144323547526175, "grad_norm": 2.9064267339112027, "kd_ratio": 0.5, "learning_rate": 1.9910793740038266e-05, "loss": 1.4049572944641113, "loss/kd": 2.2736527919769287, "loss/lm": 0.5362616777420044, "step": 348 }, { "epoch": 0.0716485321289263, "grad_norm": 1.8129567590498836, "kd_ratio": 0.5, "learning_rate": 1.990990524609584e-05, "loss": 1.6755534410476685, "loss/kd": 2.685736656188965, "loss/lm": 0.6653702259063721, "step": 349 }, { "epoch": 0.07185382878259085, "grad_norm": 2.6937186622601694, "kd_ratio": 0.5, "learning_rate": 1.990901236937561e-05, "loss": 1.6502304077148438, "loss/kd": 2.6497139930725098, "loss/lm": 0.6507468819618225, "step": 350 }, { "epoch": 0.0720591254362554, "grad_norm": 4.644686476913009, "kd_ratio": 0.5, "learning_rate": 1.9908115110272463e-05, "loss": 1.6421279907226562, "loss/kd": 2.6595945358276367, "loss/lm": 0.6246614456176758, "step": 351 }, { "epoch": 0.07226442208991993, "grad_norm": 2.571732261785064, "kd_ratio": 0.5, "learning_rate": 1.990721346918322e-05, "loss": 1.6570829153060913, "loss/kd": 2.540865182876587, "loss/lm": 0.7733005881309509, "step": 352 }, { "epoch": 0.07246971874358447, "grad_norm": 3.3300631161722443, "kd_ratio": 0.5, "learning_rate": 1.9906307446506647e-05, "loss": 1.5674678087234497, "loss/kd": 2.342181921005249, "loss/lm": 0.7927536964416504, "step": 353 }, { "epoch": 0.07267501539724902, "grad_norm": 1.7509640198720555, "kd_ratio": 0.5, "learning_rate": 1.9905397042643443e-05, "loss": 1.6418163776397705, "loss/kd": 2.525907278060913, "loss/lm": 0.7577254772186279, "step": 354 }, { "epoch": 0.07288031205091357, "grad_norm": 3.818672045473283, "kd_ratio": 0.5, "learning_rate": 1.9904482257996244e-05, "loss": 1.5764386653900146, "loss/kd": 2.580152988433838, "loss/lm": 0.572724461555481, "step": 355 }, { "epoch": 0.07308560870457811, "grad_norm": 2.2782693171918567, "kd_ratio": 0.5, "learning_rate": 1.9903563092969624e-05, "loss": 1.8766502141952515, "loss/kd": 3.055820941925049, "loss/lm": 0.6974795460700989, "step": 356 }, { "epoch": 0.07329090535824266, "grad_norm": 2.635146399578228, "kd_ratio": 0.5, "learning_rate": 1.9902639547970098e-05, "loss": 1.6200016736984253, "loss/kd": 2.617610216140747, "loss/lm": 0.6223931312561035, "step": 357 }, { "epoch": 0.07349620201190721, "grad_norm": 3.9768504941114107, "kd_ratio": 0.5, "learning_rate": 1.990171162340611e-05, "loss": 1.535634994506836, "loss/kd": 2.516634702682495, "loss/lm": 0.554635226726532, "step": 358 }, { "epoch": 0.07370149866557175, "grad_norm": 2.631055294217634, "kd_ratio": 0.5, "learning_rate": 1.990077931968805e-05, "loss": 1.588702917098999, "loss/kd": 2.4497454166412354, "loss/lm": 0.7276603579521179, "step": 359 }, { "epoch": 0.0739067953192363, "grad_norm": 3.3752795271180314, "kd_ratio": 0.5, "learning_rate": 1.9899842637228243e-05, "loss": 1.560733437538147, "loss/kd": 2.5726194381713867, "loss/lm": 0.5488473773002625, "step": 360 }, { "epoch": 0.07411209197290085, "grad_norm": 4.1837726945525215, "kd_ratio": 0.5, "learning_rate": 1.9898901576440943e-05, "loss": 1.4649115800857544, "loss/kd": 2.278350591659546, "loss/lm": 0.6514725685119629, "step": 361 }, { "epoch": 0.07431738862656538, "grad_norm": 2.586637091268951, "kd_ratio": 0.5, "learning_rate": 1.989795613774235e-05, "loss": 1.6403957605361938, "loss/kd": 2.61712646484375, "loss/lm": 0.6636651158332825, "step": 362 }, { "epoch": 0.07452268528022993, "grad_norm": 3.156829030651973, "kd_ratio": 0.5, "learning_rate": 1.9897006321550592e-05, "loss": 1.361005425453186, "loss/kd": 2.20761775970459, "loss/lm": 0.514393150806427, "step": 363 }, { "epoch": 0.07472798193389447, "grad_norm": 1.5868407353128218, "kd_ratio": 0.5, "learning_rate": 1.989605212828574e-05, "loss": 1.6743900775909424, "loss/kd": 2.6737618446350098, "loss/lm": 0.675018310546875, "step": 364 }, { "epoch": 0.07493327858755902, "grad_norm": 4.025652964169458, "kd_ratio": 0.5, "learning_rate": 1.98950935583698e-05, "loss": 1.6521525382995605, "loss/kd": 2.636991500854492, "loss/lm": 0.6673136949539185, "step": 365 }, { "epoch": 0.07513857524122357, "grad_norm": 2.0822340278185276, "kd_ratio": 0.5, "learning_rate": 1.9894130612226705e-05, "loss": 1.4153846502304077, "loss/kd": 2.2732410430908203, "loss/lm": 0.5575282573699951, "step": 366 }, { "epoch": 0.07534387189488811, "grad_norm": 2.691650176687364, "kd_ratio": 0.5, "learning_rate": 1.9893163290282335e-05, "loss": 1.4763407707214355, "loss/kd": 2.3478689193725586, "loss/lm": 0.604812502861023, "step": 367 }, { "epoch": 0.07554916854855266, "grad_norm": 3.325414991347236, "kd_ratio": 0.5, "learning_rate": 1.9892191592964498e-05, "loss": 1.5080795288085938, "loss/kd": 2.175478935241699, "loss/lm": 0.8406800627708435, "step": 368 }, { "epoch": 0.07575446520221721, "grad_norm": 1.480377370135002, "kd_ratio": 0.5, "learning_rate": 1.9891215520702938e-05, "loss": 1.64625084400177, "loss/kd": 2.732609748840332, "loss/lm": 0.5598918795585632, "step": 369 }, { "epoch": 0.07595976185588175, "grad_norm": 2.9755794838922323, "kd_ratio": 0.5, "learning_rate": 1.9890235073929334e-05, "loss": 1.6288310289382935, "loss/kd": 2.538524866104126, "loss/lm": 0.7191371321678162, "step": 370 }, { "epoch": 0.07616505850954629, "grad_norm": 1.6934404072903415, "kd_ratio": 0.5, "learning_rate": 1.9889250253077306e-05, "loss": 1.5664622783660889, "loss/kd": 2.4551913738250732, "loss/lm": 0.6777332425117493, "step": 371 }, { "epoch": 0.07637035516321083, "grad_norm": 2.2121103425486592, "kd_ratio": 0.5, "learning_rate": 1.9888261058582402e-05, "loss": 1.5616130828857422, "loss/kd": 2.520045042037964, "loss/lm": 0.6031811833381653, "step": 372 }, { "epoch": 0.07657565181687538, "grad_norm": 1.4645249617619316, "kd_ratio": 0.5, "learning_rate": 1.98872674908821e-05, "loss": 1.6021723747253418, "loss/kd": 2.5648410320281982, "loss/lm": 0.6395036578178406, "step": 373 }, { "epoch": 0.07678094847053993, "grad_norm": 2.3134706869645467, "kd_ratio": 0.5, "learning_rate": 1.9886269550415824e-05, "loss": 1.476367473602295, "loss/kd": 2.1960811614990234, "loss/lm": 0.7566538453102112, "step": 374 }, { "epoch": 0.07698624512420447, "grad_norm": 2.333223674182503, "kd_ratio": 0.5, "learning_rate": 1.9885267237624923e-05, "loss": 1.575329303741455, "loss/kd": 2.5577542781829834, "loss/lm": 0.592904269695282, "step": 375 }, { "epoch": 0.07719154177786902, "grad_norm": 1.174127468964521, "kd_ratio": 0.5, "learning_rate": 1.988426055295268e-05, "loss": 1.628918170928955, "loss/kd": 2.4894893169403076, "loss/lm": 0.7683470845222473, "step": 376 }, { "epoch": 0.07739683843153357, "grad_norm": 1.4657936599186496, "kd_ratio": 0.5, "learning_rate": 1.9883249496844316e-05, "loss": 1.5806818008422852, "loss/kd": 2.467810869216919, "loss/lm": 0.6935528516769409, "step": 377 }, { "epoch": 0.07760213508519812, "grad_norm": 1.7055457693114662, "kd_ratio": 0.5, "learning_rate": 1.988223406974698e-05, "loss": 1.492768406867981, "loss/kd": 2.3670382499694824, "loss/lm": 0.6184986233711243, "step": 378 }, { "epoch": 0.07780743173886266, "grad_norm": 2.017039735085791, "kd_ratio": 0.5, "learning_rate": 1.988121427210976e-05, "loss": 1.7496423721313477, "loss/kd": 2.807307481765747, "loss/lm": 0.691977322101593, "step": 379 }, { "epoch": 0.07801272839252721, "grad_norm": 1.6136985738226401, "kd_ratio": 0.5, "learning_rate": 1.9880190104383677e-05, "loss": 1.641109585762024, "loss/kd": 2.636976957321167, "loss/lm": 0.6452422738075256, "step": 380 }, { "epoch": 0.07821802504619174, "grad_norm": 1.6417813182066174, "kd_ratio": 0.5, "learning_rate": 1.9879161567021677e-05, "loss": 1.3375309705734253, "loss/kd": 1.844996452331543, "loss/lm": 0.8300654292106628, "step": 381 }, { "epoch": 0.07842332169985629, "grad_norm": 1.2073642977897185, "kd_ratio": 0.5, "learning_rate": 1.9878128660478645e-05, "loss": 1.469428300857544, "loss/kd": 2.301034450531006, "loss/lm": 0.6378220915794373, "step": 382 }, { "epoch": 0.07862861835352083, "grad_norm": 1.8420884368509811, "kd_ratio": 0.5, "learning_rate": 1.98770913852114e-05, "loss": 1.6652135848999023, "loss/kd": 2.7032458782196045, "loss/lm": 0.6271814107894897, "step": 383 }, { "epoch": 0.07883391500718538, "grad_norm": 1.5343124483006585, "kd_ratio": 0.5, "learning_rate": 1.9876049741678682e-05, "loss": 1.7283473014831543, "loss/kd": 2.716738700866699, "loss/lm": 0.7399559020996094, "step": 384 }, { "epoch": 0.07903921166084993, "grad_norm": 1.933896014961453, "kd_ratio": 0.5, "learning_rate": 1.9875003730341176e-05, "loss": 1.598374366760254, "loss/kd": 2.482527017593384, "loss/lm": 0.7142215967178345, "step": 385 }, { "epoch": 0.07924450831451448, "grad_norm": 2.3486716346523138, "kd_ratio": 0.5, "learning_rate": 1.987395335166149e-05, "loss": 1.8993885517120361, "loss/kd": 3.1402487754821777, "loss/lm": 0.6585284471511841, "step": 386 }, { "epoch": 0.07944980496817902, "grad_norm": 1.2411463656920898, "kd_ratio": 0.5, "learning_rate": 1.9872898606104175e-05, "loss": 1.73720121383667, "loss/kd": 2.8006880283355713, "loss/lm": 0.673714280128479, "step": 387 }, { "epoch": 0.07965510162184357, "grad_norm": 1.550659989765838, "kd_ratio": 0.5, "learning_rate": 1.9871839494135696e-05, "loss": 1.5751138925552368, "loss/kd": 2.4913861751556396, "loss/lm": 0.658841609954834, "step": 388 }, { "epoch": 0.07986039827550812, "grad_norm": 1.8667987647054496, "kd_ratio": 0.5, "learning_rate": 1.9870776016224466e-05, "loss": 1.6329309940338135, "loss/kd": 2.566654920578003, "loss/lm": 0.6992071866989136, "step": 389 }, { "epoch": 0.08006569492917265, "grad_norm": 1.7689564942933749, "kd_ratio": 0.5, "learning_rate": 1.9869708172840812e-05, "loss": 1.2605652809143066, "loss/kd": 1.8877372741699219, "loss/lm": 0.6333931684494019, "step": 390 }, { "epoch": 0.0802709915828372, "grad_norm": 1.907035790642156, "kd_ratio": 0.5, "learning_rate": 1.9868635964457007e-05, "loss": 1.2651174068450928, "loss/kd": 1.916133165359497, "loss/lm": 0.6141015291213989, "step": 391 }, { "epoch": 0.08047628823650174, "grad_norm": 1.7445131623938412, "kd_ratio": 0.5, "learning_rate": 1.986755939154725e-05, "loss": 1.5998601913452148, "loss/kd": 2.3805203437805176, "loss/lm": 0.8192000985145569, "step": 392 }, { "epoch": 0.08068158489016629, "grad_norm": 2.5399384782081404, "kd_ratio": 0.5, "learning_rate": 1.986647845458766e-05, "loss": 1.3921988010406494, "loss/kd": 1.9732892513275146, "loss/lm": 0.8111082315444946, "step": 393 }, { "epoch": 0.08088688154383084, "grad_norm": 1.5293848785257984, "kd_ratio": 0.5, "learning_rate": 1.9865393154056308e-05, "loss": 1.6706857681274414, "loss/kd": 2.6647980213165283, "loss/lm": 0.6765733957290649, "step": 394 }, { "epoch": 0.08109217819749538, "grad_norm": 1.2740703349721134, "kd_ratio": 0.5, "learning_rate": 1.986430349043317e-05, "loss": 1.4012391567230225, "loss/kd": 2.2130143642425537, "loss/lm": 0.5894638299942017, "step": 395 }, { "epoch": 0.08129747485115993, "grad_norm": 2.3674436228890756, "kd_ratio": 0.5, "learning_rate": 1.9863209464200172e-05, "loss": 1.791179895401001, "loss/kd": 2.9199626445770264, "loss/lm": 0.6623970866203308, "step": 396 }, { "epoch": 0.08150277150482448, "grad_norm": 2.4542272593918453, "kd_ratio": 0.5, "learning_rate": 1.9862111075841152e-05, "loss": 1.581034541130066, "loss/kd": 2.3678674697875977, "loss/lm": 0.794201672077179, "step": 397 }, { "epoch": 0.08170806815848902, "grad_norm": 1.4121710000439167, "kd_ratio": 0.5, "learning_rate": 1.9861008325841893e-05, "loss": 1.3327200412750244, "loss/kd": 2.1387131214141846, "loss/lm": 0.526727020740509, "step": 398 }, { "epoch": 0.08191336481215356, "grad_norm": 1.657510259845241, "kd_ratio": 0.5, "learning_rate": 1.9859901214690094e-05, "loss": 1.3741403818130493, "loss/kd": 1.9988465309143066, "loss/lm": 0.7494341731071472, "step": 399 }, { "epoch": 0.0821186614658181, "grad_norm": 1.957485520349326, "kd_ratio": 0.5, "learning_rate": 1.9858789742875393e-05, "loss": 1.466583013534546, "loss/kd": 2.298706531524658, "loss/lm": 0.6344594359397888, "step": 400 }, { "epoch": 0.08232395811948265, "grad_norm": 1.821120919730615, "kd_ratio": 0.5, "learning_rate": 1.985767391088935e-05, "loss": 1.4046766757965088, "loss/kd": 2.089362382888794, "loss/lm": 0.7199910879135132, "step": 401 }, { "epoch": 0.0825292547731472, "grad_norm": 1.2290271358318772, "kd_ratio": 0.5, "learning_rate": 1.9856553719225454e-05, "loss": 1.7027289867401123, "loss/kd": 2.693314790725708, "loss/lm": 0.712143063545227, "step": 402 }, { "epoch": 0.08273455142681174, "grad_norm": 2.9105086768591875, "kd_ratio": 0.5, "learning_rate": 1.9855429168379127e-05, "loss": 1.5783779621124268, "loss/kd": 2.5662641525268555, "loss/lm": 0.5904916524887085, "step": 403 }, { "epoch": 0.08293984808047629, "grad_norm": 3.781123914543241, "kd_ratio": 0.5, "learning_rate": 1.9854300258847713e-05, "loss": 1.6133860349655151, "loss/kd": 2.6138217449188232, "loss/lm": 0.6129503846168518, "step": 404 }, { "epoch": 0.08314514473414084, "grad_norm": 1.5577146750007498, "kd_ratio": 0.5, "learning_rate": 1.985316699113049e-05, "loss": 1.6150872707366943, "loss/kd": 2.652237892150879, "loss/lm": 0.5779366493225098, "step": 405 }, { "epoch": 0.08335044138780538, "grad_norm": 3.6999910464096866, "kd_ratio": 0.5, "learning_rate": 1.9852029365728652e-05, "loss": 1.734922170639038, "loss/kd": 2.855520725250244, "loss/lm": 0.6143234968185425, "step": 406 }, { "epoch": 0.08355573804146993, "grad_norm": 4.059665720944271, "kd_ratio": 0.5, "learning_rate": 1.985088738314534e-05, "loss": 1.345781683921814, "loss/kd": 2.054471969604492, "loss/lm": 0.6370914578437805, "step": 407 }, { "epoch": 0.08376103469513448, "grad_norm": 1.3002231298293963, "kd_ratio": 0.5, "learning_rate": 1.9849741043885596e-05, "loss": 1.5344996452331543, "loss/kd": 2.366882801055908, "loss/lm": 0.7021166086196899, "step": 408 }, { "epoch": 0.08396633134879901, "grad_norm": 4.871669101873444, "kd_ratio": 0.5, "learning_rate": 1.984859034845641e-05, "loss": 1.4361215829849243, "loss/kd": 2.3532447814941406, "loss/lm": 0.5189984440803528, "step": 409 }, { "epoch": 0.08417162800246356, "grad_norm": 3.902831475650466, "kd_ratio": 0.5, "learning_rate": 1.984743529736669e-05, "loss": 1.5190898180007935, "loss/kd": 2.4944167137145996, "loss/lm": 0.5437629818916321, "step": 410 }, { "epoch": 0.0843769246561281, "grad_norm": 2.8512041491745137, "kd_ratio": 0.5, "learning_rate": 1.9846275891127275e-05, "loss": 1.6478400230407715, "loss/kd": 2.5928618907928467, "loss/lm": 0.7028182148933411, "step": 411 }, { "epoch": 0.08458222130979265, "grad_norm": 4.4655814981951325, "kd_ratio": 0.5, "learning_rate": 1.984511213025092e-05, "loss": 1.4703105688095093, "loss/kd": 2.1849780082702637, "loss/lm": 0.7556430697441101, "step": 412 }, { "epoch": 0.0847875179634572, "grad_norm": 2.4148719527456595, "kd_ratio": 0.5, "learning_rate": 1.9843944015252318e-05, "loss": 1.3597524166107178, "loss/kd": 2.0430967807769775, "loss/lm": 0.6764079928398132, "step": 413 }, { "epoch": 0.08499281461712174, "grad_norm": 4.511879928118578, "kd_ratio": 0.5, "learning_rate": 1.984277154664808e-05, "loss": 1.3711469173431396, "loss/kd": 2.0830957889556885, "loss/lm": 0.6591979265213013, "step": 414 }, { "epoch": 0.08519811127078629, "grad_norm": 1.7724689498134008, "kd_ratio": 0.5, "learning_rate": 1.9841594724956746e-05, "loss": 1.7166894674301147, "loss/kd": 2.6874895095825195, "loss/lm": 0.7458894848823547, "step": 415 }, { "epoch": 0.08540340792445084, "grad_norm": 3.9204624627354265, "kd_ratio": 0.5, "learning_rate": 1.984041355069878e-05, "loss": 1.4529742002487183, "loss/kd": 2.1897642612457275, "loss/lm": 0.7161840796470642, "step": 416 }, { "epoch": 0.08560870457811538, "grad_norm": 3.1129473576404822, "kd_ratio": 0.5, "learning_rate": 1.983922802439657e-05, "loss": 1.5493035316467285, "loss/kd": 2.526207685470581, "loss/lm": 0.5723992586135864, "step": 417 }, { "epoch": 0.08581400123177992, "grad_norm": 2.815671743246748, "kd_ratio": 0.5, "learning_rate": 1.9838038146574426e-05, "loss": 1.216132640838623, "loss/kd": 1.8589359521865845, "loss/lm": 0.5733294486999512, "step": 418 }, { "epoch": 0.08601929788544446, "grad_norm": 4.62657711990906, "kd_ratio": 0.5, "learning_rate": 1.9836843917758593e-05, "loss": 1.4878196716308594, "loss/kd": 2.2716939449310303, "loss/lm": 0.7039454579353333, "step": 419 }, { "epoch": 0.08622459453910901, "grad_norm": 2.048126924979341, "kd_ratio": 0.5, "learning_rate": 1.983564533847723e-05, "loss": 1.4576261043548584, "loss/kd": 2.31498646736145, "loss/lm": 0.6002656817436218, "step": 420 }, { "epoch": 0.08642989119277356, "grad_norm": 3.3056309361811693, "kd_ratio": 0.5, "learning_rate": 1.983444240926042e-05, "loss": 1.4108185768127441, "loss/kd": 2.164536952972412, "loss/lm": 0.657100260257721, "step": 421 }, { "epoch": 0.0866351878464381, "grad_norm": 1.8003223882427808, "kd_ratio": 0.5, "learning_rate": 1.983323513064018e-05, "loss": 1.250888705253601, "loss/kd": 1.9090884923934937, "loss/lm": 0.5926889181137085, "step": 422 }, { "epoch": 0.08684048450010265, "grad_norm": 2.910579710518235, "kd_ratio": 0.5, "learning_rate": 1.983202350315044e-05, "loss": 1.4237314462661743, "loss/kd": 2.2645139694213867, "loss/lm": 0.5829489231109619, "step": 423 }, { "epoch": 0.0870457811537672, "grad_norm": 2.196613885149625, "kd_ratio": 0.5, "learning_rate": 1.983080752732706e-05, "loss": 1.356027364730835, "loss/kd": 2.0804107189178467, "loss/lm": 0.6316438913345337, "step": 424 }, { "epoch": 0.08725107780743174, "grad_norm": 2.406746906417158, "kd_ratio": 0.5, "learning_rate": 1.9829587203707817e-05, "loss": 1.488078236579895, "loss/kd": 2.349416971206665, "loss/lm": 0.6267394423484802, "step": 425 }, { "epoch": 0.08745637446109629, "grad_norm": 2.3927881283477537, "kd_ratio": 0.5, "learning_rate": 1.9828362532832415e-05, "loss": 1.404346227645874, "loss/kd": 2.250859498977661, "loss/lm": 0.5578330755233765, "step": 426 }, { "epoch": 0.08766167111476084, "grad_norm": 2.732574664979054, "kd_ratio": 0.5, "learning_rate": 1.982713351524248e-05, "loss": 1.3332089185714722, "loss/kd": 2.108992099761963, "loss/lm": 0.5574257969856262, "step": 427 }, { "epoch": 0.08786696776842537, "grad_norm": 1.8487032876260792, "kd_ratio": 0.5, "learning_rate": 1.9825900151481562e-05, "loss": 1.5118550062179565, "loss/kd": 2.3875579833984375, "loss/lm": 0.6361520290374756, "step": 428 }, { "epoch": 0.08807226442208992, "grad_norm": 1.9538626014296168, "kd_ratio": 0.5, "learning_rate": 1.9824662442095128e-05, "loss": 1.7108094692230225, "loss/kd": 2.6717400550842285, "loss/lm": 0.7498788237571716, "step": 429 }, { "epoch": 0.08827756107575446, "grad_norm": 2.5777308859975356, "kd_ratio": 0.5, "learning_rate": 1.9823420387630573e-05, "loss": 1.5250747203826904, "loss/kd": 2.4330661296844482, "loss/lm": 0.6170834302902222, "step": 430 }, { "epoch": 0.08848285772941901, "grad_norm": 2.2230667111534546, "kd_ratio": 0.5, "learning_rate": 1.982217398863721e-05, "loss": 1.4177439212799072, "loss/kd": 2.307887077331543, "loss/lm": 0.5276007652282715, "step": 431 }, { "epoch": 0.08868815438308356, "grad_norm": 1.5998468751262203, "kd_ratio": 0.5, "learning_rate": 1.982092324566627e-05, "loss": 1.541382074356079, "loss/kd": 2.46427845954895, "loss/lm": 0.618485689163208, "step": 432 }, { "epoch": 0.0888934510367481, "grad_norm": 1.5430499362016366, "kd_ratio": 0.5, "learning_rate": 1.981966815927092e-05, "loss": 1.2830970287322998, "loss/kd": 1.9417033195495605, "loss/lm": 0.6244907379150391, "step": 433 }, { "epoch": 0.08909874769041265, "grad_norm": 2.124933591584945, "kd_ratio": 0.5, "learning_rate": 1.981840873000623e-05, "loss": 1.6118932962417603, "loss/kd": 2.5375916957855225, "loss/lm": 0.6861949563026428, "step": 434 }, { "epoch": 0.0893040443440772, "grad_norm": 1.223573652674705, "kd_ratio": 0.5, "learning_rate": 1.98171449584292e-05, "loss": 1.3060308694839478, "loss/kd": 2.0000224113464355, "loss/lm": 0.61203932762146, "step": 435 }, { "epoch": 0.08950934099774174, "grad_norm": 3.8244872968755756, "kd_ratio": 0.5, "learning_rate": 1.9815876845098746e-05, "loss": 1.3375910520553589, "loss/kd": 2.07999324798584, "loss/lm": 0.5951887965202332, "step": 436 }, { "epoch": 0.08971463765140628, "grad_norm": 2.943191760977826, "kd_ratio": 0.5, "learning_rate": 1.9814604390575707e-05, "loss": 1.3812183141708374, "loss/kd": 2.156521797180176, "loss/lm": 0.6059147715568542, "step": 437 }, { "epoch": 0.08991993430507082, "grad_norm": 1.869037119862598, "kd_ratio": 0.5, "learning_rate": 1.9813327595422843e-05, "loss": 1.3062859773635864, "loss/kd": 1.9495197534561157, "loss/lm": 0.6630521416664124, "step": 438 }, { "epoch": 0.09012523095873537, "grad_norm": 2.441775338794373, "kd_ratio": 0.5, "learning_rate": 1.9812046460204837e-05, "loss": 1.6095497608184814, "loss/kd": 2.551250696182251, "loss/lm": 0.6678488254547119, "step": 439 }, { "epoch": 0.09033052761239992, "grad_norm": 1.472959869606592, "kd_ratio": 0.5, "learning_rate": 1.981076098548828e-05, "loss": 1.4519321918487549, "loss/kd": 2.1840908527374268, "loss/lm": 0.7197734713554382, "step": 440 }, { "epoch": 0.09053582426606446, "grad_norm": 1.5028814769221168, "kd_ratio": 0.5, "learning_rate": 1.9809471171841692e-05, "loss": 1.5619652271270752, "loss/kd": 2.4508352279663086, "loss/lm": 0.6730953454971313, "step": 441 }, { "epoch": 0.09074112091972901, "grad_norm": 1.6370714339436558, "kd_ratio": 0.5, "learning_rate": 1.9808177019835507e-05, "loss": 1.5008409023284912, "loss/kd": 2.440093755722046, "loss/lm": 0.561587929725647, "step": 442 }, { "epoch": 0.09094641757339356, "grad_norm": 2.021772361506955, "kd_ratio": 0.5, "learning_rate": 1.9806878530042083e-05, "loss": 1.4551690816879272, "loss/kd": 2.2740793228149414, "loss/lm": 0.6362588405609131, "step": 443 }, { "epoch": 0.0911517142270581, "grad_norm": 1.7290114934142709, "kd_ratio": 0.5, "learning_rate": 1.980557570303569e-05, "loss": 1.6083046197891235, "loss/kd": 2.6516199111938477, "loss/lm": 0.5649893879890442, "step": 444 }, { "epoch": 0.09135701088072265, "grad_norm": 1.4843685628090735, "kd_ratio": 0.5, "learning_rate": 1.9804268539392524e-05, "loss": 1.410529375076294, "loss/kd": 2.1738173961639404, "loss/lm": 0.6472412347793579, "step": 445 }, { "epoch": 0.09156230753438718, "grad_norm": 1.901502959423661, "kd_ratio": 0.5, "learning_rate": 1.980295703969069e-05, "loss": 1.4826184511184692, "loss/kd": 2.41387677192688, "loss/lm": 0.5513601899147034, "step": 446 }, { "epoch": 0.09176760418805173, "grad_norm": 2.8538451239692635, "kd_ratio": 0.5, "learning_rate": 1.9801641204510216e-05, "loss": 1.2200745344161987, "loss/kd": 1.8094797134399414, "loss/lm": 0.630669355392456, "step": 447 }, { "epoch": 0.09197290084171628, "grad_norm": 1.5215809458947462, "kd_ratio": 0.5, "learning_rate": 1.9800321034433043e-05, "loss": 1.3290271759033203, "loss/kd": 2.147080421447754, "loss/lm": 0.5109739899635315, "step": 448 }, { "epoch": 0.09217819749538082, "grad_norm": 1.8418870536848566, "kd_ratio": 0.5, "learning_rate": 1.9798996530043037e-05, "loss": 1.3374048471450806, "loss/kd": 2.0972020626068115, "loss/lm": 0.5776075720787048, "step": 449 }, { "epoch": 0.09238349414904537, "grad_norm": 2.3385065546398014, "kd_ratio": 0.5, "learning_rate": 1.9797667691925976e-05, "loss": 1.5328128337860107, "loss/kd": 2.3175055980682373, "loss/lm": 0.7481200695037842, "step": 450 }, { "epoch": 0.09258879080270992, "grad_norm": 1.4089300889054786, "kd_ratio": 0.5, "learning_rate": 1.9796334520669555e-05, "loss": 1.3705936670303345, "loss/kd": 2.1369752883911133, "loss/lm": 0.6042120456695557, "step": 451 }, { "epoch": 0.09279408745637446, "grad_norm": 1.9484824994737446, "kd_ratio": 0.5, "learning_rate": 1.9794997016863384e-05, "loss": 1.367258071899414, "loss/kd": 2.105912923812866, "loss/lm": 0.6286032199859619, "step": 452 }, { "epoch": 0.09299938411003901, "grad_norm": 2.025911382188894, "kd_ratio": 0.5, "learning_rate": 1.9793655181098992e-05, "loss": 2.147043228149414, "loss/kd": 3.838399887084961, "loss/lm": 0.4556867778301239, "step": 453 }, { "epoch": 0.09320468076370356, "grad_norm": 2.5136400364535785, "kd_ratio": 0.5, "learning_rate": 1.9792309013969818e-05, "loss": 1.2892353534698486, "loss/kd": 2.053528308868408, "loss/lm": 0.5249423384666443, "step": 454 }, { "epoch": 0.0934099774173681, "grad_norm": 1.5495577026554408, "kd_ratio": 0.5, "learning_rate": 1.9790958516071228e-05, "loss": 1.408053994178772, "loss/kd": 2.317014217376709, "loss/lm": 0.49909380078315735, "step": 455 }, { "epoch": 0.09361527407103264, "grad_norm": 1.4234195630921995, "kd_ratio": 0.5, "learning_rate": 1.978960368800049e-05, "loss": 1.460726261138916, "loss/kd": 2.320601463317871, "loss/lm": 0.6008510589599609, "step": 456 }, { "epoch": 0.09382057072469718, "grad_norm": 1.6833101886067718, "kd_ratio": 0.5, "learning_rate": 1.97882445303568e-05, "loss": 1.5779430866241455, "loss/kd": 2.623016357421875, "loss/lm": 0.5328698754310608, "step": 457 }, { "epoch": 0.09402586737836173, "grad_norm": 1.9872691735036805, "kd_ratio": 0.5, "learning_rate": 1.9786881043741256e-05, "loss": 1.3706061840057373, "loss/kd": 2.155452013015747, "loss/lm": 0.5857602953910828, "step": 458 }, { "epoch": 0.09423116403202628, "grad_norm": 1.0866613412141917, "kd_ratio": 0.5, "learning_rate": 1.978551322875688e-05, "loss": 2.243367910385132, "loss/kd": 4.035746097564697, "loss/lm": 0.45098966360092163, "step": 459 }, { "epoch": 0.09443646068569082, "grad_norm": 2.1925570241192878, "kd_ratio": 0.5, "learning_rate": 1.9784141086008608e-05, "loss": 1.099234700202942, "loss/kd": 1.6670171022415161, "loss/lm": 0.5314523577690125, "step": 460 }, { "epoch": 0.09464175733935537, "grad_norm": 1.6634923982233205, "kd_ratio": 0.5, "learning_rate": 1.978276461610328e-05, "loss": 1.3099015951156616, "loss/kd": 1.9139171838760376, "loss/lm": 0.7058860659599304, "step": 461 }, { "epoch": 0.09484705399301992, "grad_norm": 1.4099752462910407, "kd_ratio": 0.5, "learning_rate": 1.978138381964966e-05, "loss": 1.4358336925506592, "loss/kd": 2.326586961746216, "loss/lm": 0.5450805425643921, "step": 462 }, { "epoch": 0.09505235064668446, "grad_norm": 1.987765472803316, "kd_ratio": 0.5, "learning_rate": 1.977999869725842e-05, "loss": 1.3415937423706055, "loss/kd": 2.118406057357788, "loss/lm": 0.5647814273834229, "step": 463 }, { "epoch": 0.09525764730034901, "grad_norm": 2.6999379642680488, "kd_ratio": 0.5, "learning_rate": 1.9778609249542153e-05, "loss": 1.117956519126892, "loss/kd": 1.673289179801941, "loss/lm": 0.562623918056488, "step": 464 }, { "epoch": 0.09546294395401354, "grad_norm": 3.069622246172856, "kd_ratio": 0.5, "learning_rate": 1.9777215477115355e-05, "loss": 1.4558799266815186, "loss/kd": 2.3860766887664795, "loss/lm": 0.5256830453872681, "step": 465 }, { "epoch": 0.09566824060767809, "grad_norm": 1.8605133469893018, "kd_ratio": 0.5, "learning_rate": 1.9775817380594446e-05, "loss": 2.3268895149230957, "loss/kd": 4.134962558746338, "loss/lm": 0.5188166499137878, "step": 466 }, { "epoch": 0.09587353726134264, "grad_norm": 1.813813854735109, "kd_ratio": 0.5, "learning_rate": 1.977441496059774e-05, "loss": 1.4661014080047607, "loss/kd": 2.4017722606658936, "loss/lm": 0.5304306745529175, "step": 467 }, { "epoch": 0.09607883391500718, "grad_norm": 3.132314248933139, "kd_ratio": 0.5, "learning_rate": 1.9773008217745483e-05, "loss": 1.7665607929229736, "loss/kd": 2.8244943618774414, "loss/lm": 0.7086273431777954, "step": 468 }, { "epoch": 0.09628413056867173, "grad_norm": 3.117095183779629, "kd_ratio": 0.5, "learning_rate": 1.9771597152659817e-05, "loss": 1.5951507091522217, "loss/kd": 2.539788007736206, "loss/lm": 0.6505133509635925, "step": 469 }, { "epoch": 0.09648942722233628, "grad_norm": 1.5366632255206862, "kd_ratio": 0.5, "learning_rate": 1.9770181765964814e-05, "loss": 1.541905403137207, "loss/kd": 2.3823492527008057, "loss/lm": 0.7014614343643188, "step": 470 }, { "epoch": 0.09669472387600082, "grad_norm": 1.9972059956265598, "kd_ratio": 0.5, "learning_rate": 1.9768762058286433e-05, "loss": 1.1416229009628296, "loss/kd": 1.7298054695129395, "loss/lm": 0.553440272808075, "step": 471 }, { "epoch": 0.09690002052966537, "grad_norm": 2.8666754035621085, "kd_ratio": 0.5, "learning_rate": 1.976733803025257e-05, "loss": 1.4806312322616577, "loss/kd": 2.4441795349121094, "loss/lm": 0.517082929611206, "step": 472 }, { "epoch": 0.09710531718332992, "grad_norm": 2.172644969458462, "kd_ratio": 0.5, "learning_rate": 1.976590968249301e-05, "loss": 1.395958662033081, "loss/kd": 2.162752151489258, "loss/lm": 0.6291651725769043, "step": 473 }, { "epoch": 0.09731061383699445, "grad_norm": 1.1460838167514125, "kd_ratio": 0.5, "learning_rate": 1.976447701563946e-05, "loss": 1.4482535123825073, "loss/kd": 2.418018102645874, "loss/lm": 0.47848886251449585, "step": 474 }, { "epoch": 0.097515910490659, "grad_norm": 1.7036598231769449, "kd_ratio": 0.5, "learning_rate": 1.976304003032554e-05, "loss": 1.3441524505615234, "loss/kd": 2.1385748386383057, "loss/lm": 0.5497300028800964, "step": 475 }, { "epoch": 0.09772120714432354, "grad_norm": 2.02259959700887, "kd_ratio": 0.5, "learning_rate": 1.9761598727186766e-05, "loss": 1.3841207027435303, "loss/kd": 2.0165460109710693, "loss/lm": 0.7516952753067017, "step": 476 }, { "epoch": 0.09792650379798809, "grad_norm": 2.0615464681184648, "kd_ratio": 0.5, "learning_rate": 1.9760153106860575e-05, "loss": 1.486498236656189, "loss/kd": 2.422079563140869, "loss/lm": 0.5509169101715088, "step": 477 }, { "epoch": 0.09813180045165264, "grad_norm": 2.3695212240622685, "kd_ratio": 0.5, "learning_rate": 1.975870316998631e-05, "loss": 1.4727824926376343, "loss/kd": 2.2941195964813232, "loss/lm": 0.6514453291893005, "step": 478 }, { "epoch": 0.09833709710531718, "grad_norm": 3.252277140976667, "kd_ratio": 0.5, "learning_rate": 1.9757248917205228e-05, "loss": 1.7715506553649902, "loss/kd": 2.792943239212036, "loss/lm": 0.7501579523086548, "step": 479 }, { "epoch": 0.09854239375898173, "grad_norm": 1.9072515777139791, "kd_ratio": 0.5, "learning_rate": 1.9755790349160487e-05, "loss": 1.5076780319213867, "loss/kd": 2.4202797412872314, "loss/lm": 0.5950762033462524, "step": 480 }, { "epoch": 0.09874769041264628, "grad_norm": 1.6048190914752214, "kd_ratio": 0.5, "learning_rate": 1.9754327466497154e-05, "loss": 1.4018938541412354, "loss/kd": 2.2625555992126465, "loss/lm": 0.541232168674469, "step": 481 }, { "epoch": 0.09895298706631082, "grad_norm": 2.6138086047773084, "kd_ratio": 0.5, "learning_rate": 1.975286026986221e-05, "loss": 1.5551974773406982, "loss/kd": 2.482534408569336, "loss/lm": 0.6278605461120605, "step": 482 }, { "epoch": 0.09915828371997537, "grad_norm": 1.8495460606955465, "kd_ratio": 0.5, "learning_rate": 1.975138875990454e-05, "loss": 1.1947108507156372, "loss/kd": 1.8636229038238525, "loss/lm": 0.5257987976074219, "step": 483 }, { "epoch": 0.0993635803736399, "grad_norm": 1.3413928042657317, "kd_ratio": 0.5, "learning_rate": 1.9749912937274938e-05, "loss": 1.520735740661621, "loss/kd": 2.440037488937378, "loss/lm": 0.601434051990509, "step": 484 }, { "epoch": 0.09956887702730445, "grad_norm": 1.7892233661434818, "kd_ratio": 0.5, "learning_rate": 1.9748432802626103e-05, "loss": 1.5829806327819824, "loss/kd": 2.4768919944763184, "loss/lm": 0.6890691518783569, "step": 485 }, { "epoch": 0.099774173680969, "grad_norm": 1.5309694065648072, "kd_ratio": 0.5, "learning_rate": 1.9746948356612645e-05, "loss": 1.3734383583068848, "loss/kd": 2.168860673904419, "loss/lm": 0.578015923500061, "step": 486 }, { "epoch": 0.09997947033463354, "grad_norm": 1.6021927394772006, "kd_ratio": 0.5, "learning_rate": 1.974545959989108e-05, "loss": 1.4496920108795166, "loss/kd": 2.262660503387451, "loss/lm": 0.6367236375808716, "step": 487 }, { "epoch": 0.10018476698829809, "grad_norm": 1.1709395173333808, "kd_ratio": 0.5, "learning_rate": 1.9743966533119823e-05, "loss": 1.2979882955551147, "loss/kd": 2.0934700965881348, "loss/lm": 0.5025065541267395, "step": 488 }, { "epoch": 0.10039006364196264, "grad_norm": 1.1787580070126138, "kd_ratio": 0.5, "learning_rate": 1.9742469156959204e-05, "loss": 1.6194889545440674, "loss/kd": 2.6522583961486816, "loss/lm": 0.5867195129394531, "step": 489 }, { "epoch": 0.10059536029562718, "grad_norm": 1.0662319791286021, "kd_ratio": 0.5, "learning_rate": 1.974096747207146e-05, "loss": 1.3760719299316406, "loss/kd": 2.210111141204834, "loss/lm": 0.542032778263092, "step": 490 }, { "epoch": 0.10080065694929173, "grad_norm": 1.5003883170260046, "kd_ratio": 0.5, "learning_rate": 1.9739461479120727e-05, "loss": 1.4611458778381348, "loss/kd": 2.367318630218506, "loss/lm": 0.5549731850624084, "step": 491 }, { "epoch": 0.10100595360295628, "grad_norm": 1.279098833100474, "kd_ratio": 0.5, "learning_rate": 1.973795117877305e-05, "loss": 1.2362691164016724, "loss/kd": 1.9176498651504517, "loss/lm": 0.5548884272575378, "step": 492 }, { "epoch": 0.10121125025662081, "grad_norm": 1.1980656149752416, "kd_ratio": 0.5, "learning_rate": 1.973643657169637e-05, "loss": 1.4743521213531494, "loss/kd": 2.3492014408111572, "loss/lm": 0.5995028018951416, "step": 493 }, { "epoch": 0.10141654691028536, "grad_norm": 1.2396546832898894, "kd_ratio": 0.5, "learning_rate": 1.9734917658560554e-05, "loss": 1.4419867992401123, "loss/kd": 2.26202654838562, "loss/lm": 0.6219470500946045, "step": 494 }, { "epoch": 0.1016218435639499, "grad_norm": 1.307460596489826, "kd_ratio": 0.5, "learning_rate": 1.973339444003735e-05, "loss": 1.4531141519546509, "loss/kd": 2.3587639331817627, "loss/lm": 0.5474643707275391, "step": 495 }, { "epoch": 0.10182714021761445, "grad_norm": 2.018492129402567, "kd_ratio": 0.5, "learning_rate": 1.9731866916800426e-05, "loss": 1.3022103309631348, "loss/kd": 2.082960605621338, "loss/lm": 0.5214599967002869, "step": 496 }, { "epoch": 0.102032436871279, "grad_norm": 1.7698568177971241, "kd_ratio": 0.5, "learning_rate": 1.973033508952534e-05, "loss": 1.186326503753662, "loss/kd": 1.81851065158844, "loss/lm": 0.5541424751281738, "step": 497 }, { "epoch": 0.10223773352494354, "grad_norm": 1.2322114268677014, "kd_ratio": 0.5, "learning_rate": 1.972879895888957e-05, "loss": 1.3718730211257935, "loss/kd": 2.1481778621673584, "loss/lm": 0.5955681800842285, "step": 498 }, { "epoch": 0.10244303017860809, "grad_norm": 2.5161321109531865, "kd_ratio": 0.5, "learning_rate": 1.9727258525572487e-05, "loss": 1.3449816703796387, "loss/kd": 2.1392905712127686, "loss/lm": 0.5506727695465088, "step": 499 }, { "epoch": 0.10264832683227264, "grad_norm": 2.1913237068686393, "kd_ratio": 0.5, "learning_rate": 1.9725713790255362e-05, "loss": 1.5963749885559082, "loss/kd": 2.4747869968414307, "loss/lm": 0.7179630994796753, "step": 500 }, { "epoch": 0.10285362348593718, "grad_norm": 1.86980516145026, "kd_ratio": 0.5, "learning_rate": 1.9724164753621383e-05, "loss": 1.4633804559707642, "loss/kd": 2.303619623184204, "loss/lm": 0.6231412291526794, "step": 501 }, { "epoch": 0.10305892013960173, "grad_norm": 3.0477236623699944, "kd_ratio": 0.5, "learning_rate": 1.9722611416355616e-05, "loss": 1.1563634872436523, "loss/kd": 1.794532060623169, "loss/lm": 0.5181947946548462, "step": 502 }, { "epoch": 0.10326421679326626, "grad_norm": 1.6790808551103003, "kd_ratio": 0.5, "learning_rate": 1.9721053779145057e-05, "loss": 2.1914634704589844, "loss/kd": 3.8545315265655518, "loss/lm": 0.5283952355384827, "step": 503 }, { "epoch": 0.10346951344693081, "grad_norm": 3.5606370594271124, "kd_ratio": 0.5, "learning_rate": 1.9719491842678583e-05, "loss": 1.421980619430542, "loss/kd": 2.2282638549804688, "loss/lm": 0.61569744348526, "step": 504 }, { "epoch": 0.10367481010059536, "grad_norm": 3.201411146615011, "kd_ratio": 0.5, "learning_rate": 1.971792560764698e-05, "loss": 1.329468011856079, "loss/kd": 2.096282482147217, "loss/lm": 0.5626535415649414, "step": 505 }, { "epoch": 0.1038801067542599, "grad_norm": 1.76848783287171, "kd_ratio": 0.5, "learning_rate": 1.971635507474294e-05, "loss": 1.5488855838775635, "loss/kd": 2.411229133605957, "loss/lm": 0.6865421533584595, "step": 506 }, { "epoch": 0.10408540340792445, "grad_norm": 1.9808873402321439, "kd_ratio": 0.5, "learning_rate": 1.9714780244661044e-05, "loss": 1.3845562934875488, "loss/kd": 2.2235395908355713, "loss/lm": 0.5455729961395264, "step": 507 }, { "epoch": 0.104290700061589, "grad_norm": 1.742269301789763, "kd_ratio": 0.5, "learning_rate": 1.9713201118097784e-05, "loss": 1.3769227266311646, "loss/kd": 2.139338970184326, "loss/lm": 0.6145065426826477, "step": 508 }, { "epoch": 0.10449599671525354, "grad_norm": 2.1464206983883147, "kd_ratio": 0.5, "learning_rate": 1.9711617695751548e-05, "loss": 1.3024793863296509, "loss/kd": 2.0760326385498047, "loss/lm": 0.5289261937141418, "step": 509 }, { "epoch": 0.10470129336891809, "grad_norm": 1.7930258872931957, "kd_ratio": 0.5, "learning_rate": 1.9710029978322622e-05, "loss": 1.5649914741516113, "loss/kd": 2.405478000640869, "loss/lm": 0.7245048880577087, "step": 510 }, { "epoch": 0.10490659002258264, "grad_norm": 2.5427450366846585, "kd_ratio": 0.5, "learning_rate": 1.9708437966513196e-05, "loss": 1.1707749366760254, "loss/kd": 1.806776523590088, "loss/lm": 0.5347732305526733, "step": 511 }, { "epoch": 0.10511188667624717, "grad_norm": 1.9198840302908708, "kd_ratio": 0.5, "learning_rate": 1.9706841661027353e-05, "loss": 1.3066377639770508, "loss/kd": 2.0597033500671387, "loss/lm": 0.5535721778869629, "step": 512 }, { "epoch": 0.10531718332991172, "grad_norm": 3.640137182596108, "kd_ratio": 0.5, "learning_rate": 1.9705241062571084e-05, "loss": 1.3852183818817139, "loss/kd": 2.2149817943573, "loss/lm": 0.5554549098014832, "step": 513 }, { "epoch": 0.10552247998357626, "grad_norm": 2.0065041440468, "kd_ratio": 0.5, "learning_rate": 1.970363617185227e-05, "loss": 1.3424146175384521, "loss/kd": 2.1668591499328613, "loss/lm": 0.517970085144043, "step": 514 }, { "epoch": 0.10572777663724081, "grad_norm": 3.108201849864798, "kd_ratio": 0.5, "learning_rate": 1.9702026989580694e-05, "loss": 1.4524757862091064, "loss/kd": 2.2622010707855225, "loss/lm": 0.6427504420280457, "step": 515 }, { "epoch": 0.10593307329090536, "grad_norm": 3.978915899785666, "kd_ratio": 0.5, "learning_rate": 1.9700413516468034e-05, "loss": 1.3580132722854614, "loss/kd": 2.1736457347869873, "loss/lm": 0.5423808693885803, "step": 516 }, { "epoch": 0.1061383699445699, "grad_norm": 1.9064023079564323, "kd_ratio": 0.5, "learning_rate": 1.969879575322788e-05, "loss": 1.448489785194397, "loss/kd": 2.261963367462158, "loss/lm": 0.635016143321991, "step": 517 }, { "epoch": 0.10634366659823445, "grad_norm": 3.3377138244738154, "kd_ratio": 0.5, "learning_rate": 1.9697173700575694e-05, "loss": 1.3432703018188477, "loss/kd": 2.130401611328125, "loss/lm": 0.5561389327049255, "step": 518 }, { "epoch": 0.106548963251899, "grad_norm": 2.5173452750359124, "kd_ratio": 0.5, "learning_rate": 1.969554735922885e-05, "loss": 1.3874257802963257, "loss/kd": 2.1913721561431885, "loss/lm": 0.5834793448448181, "step": 519 }, { "epoch": 0.10675425990556354, "grad_norm": 2.1370379206743264, "kd_ratio": 0.5, "learning_rate": 1.969391672990663e-05, "loss": 1.4462977647781372, "loss/kd": 2.3012008666992188, "loss/lm": 0.5913947224617004, "step": 520 }, { "epoch": 0.10695955655922808, "grad_norm": 1.6860069176268917, "kd_ratio": 0.5, "learning_rate": 1.9692281813330186e-05, "loss": 2.254072427749634, "loss/kd": 4.063918590545654, "loss/lm": 0.4442262053489685, "step": 521 }, { "epoch": 0.10716485321289262, "grad_norm": 1.8548585535258986, "kd_ratio": 0.5, "learning_rate": 1.969064261022259e-05, "loss": 2.294882297515869, "loss/kd": 4.004128456115723, "loss/lm": 0.5856362581253052, "step": 522 }, { "epoch": 0.10737014986655717, "grad_norm": 1.6706183011206046, "kd_ratio": 0.5, "learning_rate": 1.968899912130879e-05, "loss": 1.3248755931854248, "loss/kd": 2.040034532546997, "loss/lm": 0.6097167730331421, "step": 523 }, { "epoch": 0.10757544652022172, "grad_norm": 1.7188745557419938, "kd_ratio": 0.5, "learning_rate": 1.9687351347315648e-05, "loss": 1.4205001592636108, "loss/kd": 2.158088445663452, "loss/lm": 0.6829118132591248, "step": 524 }, { "epoch": 0.10778074317388626, "grad_norm": 1.4150741132394091, "kd_ratio": 0.5, "learning_rate": 1.9685699288971908e-05, "loss": 1.58267343044281, "loss/kd": 2.6014764308929443, "loss/lm": 0.5638704299926758, "step": 525 }, { "epoch": 0.10798603982755081, "grad_norm": 1.6594618326661927, "kd_ratio": 0.5, "learning_rate": 1.9684042947008214e-05, "loss": 2.085136651992798, "loss/kd": 3.7339248657226562, "loss/lm": 0.43634840846061707, "step": 526 }, { "epoch": 0.10819133648121536, "grad_norm": 1.8364961957726558, "kd_ratio": 0.5, "learning_rate": 1.9682382322157103e-05, "loss": 1.4622149467468262, "loss/kd": 2.2790629863739014, "loss/lm": 0.6453670263290405, "step": 527 }, { "epoch": 0.1083966331348799, "grad_norm": 1.562869028021996, "kd_ratio": 0.5, "learning_rate": 1.968071741515301e-05, "loss": 1.218731164932251, "loss/kd": 1.9010709524154663, "loss/lm": 0.5363912582397461, "step": 528 }, { "epoch": 0.10860192978854445, "grad_norm": 1.554991406025663, "kd_ratio": 0.5, "learning_rate": 1.967904822673226e-05, "loss": 1.5640392303466797, "loss/kd": 2.5960044860839844, "loss/lm": 0.5320739150047302, "step": 529 }, { "epoch": 0.108807226442209, "grad_norm": 1.7304115417388084, "kd_ratio": 0.5, "learning_rate": 1.9677374757633066e-05, "loss": 1.4558296203613281, "loss/kd": 2.3320770263671875, "loss/lm": 0.5795823335647583, "step": 530 }, { "epoch": 0.10901252309587353, "grad_norm": 1.8732481878102782, "kd_ratio": 0.5, "learning_rate": 1.9675697008595545e-05, "loss": 1.3030459880828857, "loss/kd": 1.966294527053833, "loss/lm": 0.6397975087165833, "step": 531 }, { "epoch": 0.10921781974953808, "grad_norm": 2.1065902800489282, "kd_ratio": 0.5, "learning_rate": 1.9674014980361703e-05, "loss": 1.4208667278289795, "loss/kd": 2.204319715499878, "loss/lm": 0.6374137997627258, "step": 532 }, { "epoch": 0.10942311640320262, "grad_norm": 1.7622454384001367, "kd_ratio": 0.5, "learning_rate": 1.9672328673675438e-05, "loss": 1.2416030168533325, "loss/kd": 1.9937328100204468, "loss/lm": 0.48947322368621826, "step": 533 }, { "epoch": 0.10962841305686717, "grad_norm": 1.5732653007308326, "kd_ratio": 0.5, "learning_rate": 1.967063808928254e-05, "loss": 1.4021728038787842, "loss/kd": 2.2457003593444824, "loss/lm": 0.5586453676223755, "step": 534 }, { "epoch": 0.10983370971053172, "grad_norm": 1.365215492332246, "kd_ratio": 0.5, "learning_rate": 1.9668943227930686e-05, "loss": 1.2692244052886963, "loss/kd": 1.8943486213684082, "loss/lm": 0.6441002488136292, "step": 535 }, { "epoch": 0.11003900636419627, "grad_norm": 1.0288426502278305, "kd_ratio": 0.5, "learning_rate": 1.9667244090369455e-05, "loss": 1.6375432014465332, "loss/kd": 2.7203969955444336, "loss/lm": 0.554689347743988, "step": 536 }, { "epoch": 0.11024430301786081, "grad_norm": 1.3037694799916983, "kd_ratio": 0.5, "learning_rate": 1.966554067735031e-05, "loss": 1.4084144830703735, "loss/kd": 2.4149458408355713, "loss/lm": 0.4018830955028534, "step": 537 }, { "epoch": 0.11044959967152536, "grad_norm": 1.702683656248729, "kd_ratio": 0.5, "learning_rate": 1.96638329896266e-05, "loss": 1.572335124015808, "loss/kd": 2.591324806213379, "loss/lm": 0.5533455014228821, "step": 538 }, { "epoch": 0.1106548963251899, "grad_norm": 1.4644455159563448, "kd_ratio": 0.5, "learning_rate": 1.966212102795358e-05, "loss": 1.2442998886108398, "loss/kd": 1.996452808380127, "loss/lm": 0.4921469986438751, "step": 539 }, { "epoch": 0.11086019297885444, "grad_norm": 1.0968973817255105, "kd_ratio": 0.5, "learning_rate": 1.9660404793088387e-05, "loss": 1.3289982080459595, "loss/kd": 2.1380062103271484, "loss/lm": 0.5199902653694153, "step": 540 }, { "epoch": 0.11106548963251898, "grad_norm": 1.582305631236641, "kd_ratio": 0.5, "learning_rate": 1.965868428579004e-05, "loss": 1.509984016418457, "loss/kd": 2.4252243041992188, "loss/lm": 0.5947436690330505, "step": 541 }, { "epoch": 0.11127078628618353, "grad_norm": 2.244314101168145, "kd_ratio": 0.5, "learning_rate": 1.9656959506819456e-05, "loss": 1.3248099088668823, "loss/kd": 2.0838663578033447, "loss/lm": 0.5657534599304199, "step": 542 }, { "epoch": 0.11147608293984808, "grad_norm": 2.105881319233153, "kd_ratio": 0.5, "learning_rate": 1.965523045693944e-05, "loss": 1.4556002616882324, "loss/kd": 2.4394116401672363, "loss/lm": 0.4717889726161957, "step": 543 }, { "epoch": 0.11168137959351263, "grad_norm": 1.530171040447792, "kd_ratio": 0.5, "learning_rate": 1.965349713691469e-05, "loss": 1.5809319019317627, "loss/kd": 2.4317092895507812, "loss/lm": 0.7301545739173889, "step": 544 }, { "epoch": 0.11188667624717717, "grad_norm": 1.0581818812835824, "kd_ratio": 0.5, "learning_rate": 1.9651759547511785e-05, "loss": 1.2187050580978394, "loss/kd": 1.8757492303848267, "loss/lm": 0.5616608262062073, "step": 545 }, { "epoch": 0.11209197290084172, "grad_norm": 2.2991695668563232, "kd_ratio": 0.5, "learning_rate": 1.9650017689499195e-05, "loss": 1.4948610067367554, "loss/kd": 2.3927195072174072, "loss/lm": 0.5970025062561035, "step": 546 }, { "epoch": 0.11229726955450627, "grad_norm": 1.9877236609730964, "kd_ratio": 0.5, "learning_rate": 1.964827156364728e-05, "loss": 1.2201870679855347, "loss/kd": 1.8516079187393188, "loss/lm": 0.5887662768363953, "step": 547 }, { "epoch": 0.11250256620817081, "grad_norm": 1.0766979769758638, "kd_ratio": 0.5, "learning_rate": 1.9646521170728283e-05, "loss": 1.18389093875885, "loss/kd": 1.8173589706420898, "loss/lm": 0.5504228472709656, "step": 548 }, { "epoch": 0.11270786286183536, "grad_norm": 2.952556110647858, "kd_ratio": 0.5, "learning_rate": 1.9644766511516335e-05, "loss": 1.4423688650131226, "loss/kd": 2.397766351699829, "loss/lm": 0.4869714677333832, "step": 549 }, { "epoch": 0.11291315951549989, "grad_norm": 3.520962451808975, "kd_ratio": 0.5, "learning_rate": 1.9643007586787462e-05, "loss": 1.616905689239502, "loss/kd": 2.649233102798462, "loss/lm": 0.5845783352851868, "step": 550 }, { "epoch": 0.11311845616916444, "grad_norm": 2.2013606091351967, "kd_ratio": 0.5, "learning_rate": 1.964124439731957e-05, "loss": 1.3832155466079712, "loss/kd": 2.2046897411346436, "loss/lm": 0.561741292476654, "step": 551 }, { "epoch": 0.11332375282282899, "grad_norm": 1.4643539910576595, "kd_ratio": 0.5, "learning_rate": 1.963947694389244e-05, "loss": 1.4154272079467773, "loss/kd": 2.1332859992980957, "loss/lm": 0.697568416595459, "step": 552 }, { "epoch": 0.11352904947649353, "grad_norm": 3.123310475833794, "kd_ratio": 0.5, "learning_rate": 1.9637705227287763e-05, "loss": 1.5170774459838867, "loss/kd": 2.47969126701355, "loss/lm": 0.5544635057449341, "step": 553 }, { "epoch": 0.11373434613015808, "grad_norm": 3.739346743829235, "kd_ratio": 0.5, "learning_rate": 1.96359292482891e-05, "loss": 1.4982274770736694, "loss/kd": 2.374375343322754, "loss/lm": 0.622079610824585, "step": 554 }, { "epoch": 0.11393964278382263, "grad_norm": 1.7629567124652241, "kd_ratio": 0.5, "learning_rate": 1.9634149007681894e-05, "loss": 1.2458539009094238, "loss/kd": 2.00954270362854, "loss/lm": 0.4821651577949524, "step": 555 }, { "epoch": 0.11414493943748717, "grad_norm": 2.397457462492206, "kd_ratio": 0.5, "learning_rate": 1.963236450625348e-05, "loss": 1.5195116996765137, "loss/kd": 2.2875845432281494, "loss/lm": 0.7514389157295227, "step": 556 }, { "epoch": 0.11435023609115172, "grad_norm": 3.673270534807553, "kd_ratio": 0.5, "learning_rate": 1.9630575744793082e-05, "loss": 1.4933204650878906, "loss/kd": 2.4087746143341064, "loss/lm": 0.57786625623703, "step": 557 }, { "epoch": 0.11455553274481627, "grad_norm": 1.6936462529550007, "kd_ratio": 0.5, "learning_rate": 1.9628782724091795e-05, "loss": 1.1691148281097412, "loss/kd": 1.8765778541564941, "loss/lm": 0.46165189146995544, "step": 558 }, { "epoch": 0.1147608293984808, "grad_norm": 2.5051970288435133, "kd_ratio": 0.5, "learning_rate": 1.962698544494261e-05, "loss": 1.2287391424179077, "loss/kd": 1.9325374364852905, "loss/lm": 0.5249408483505249, "step": 559 }, { "epoch": 0.11496612605214535, "grad_norm": 3.4838367458326926, "kd_ratio": 0.5, "learning_rate": 1.9625183908140387e-05, "loss": 1.7691503763198853, "loss/kd": 2.914250135421753, "loss/lm": 0.6240505576133728, "step": 560 }, { "epoch": 0.11517142270580989, "grad_norm": 1.1687893608243891, "kd_ratio": 0.5, "learning_rate": 1.962337811448189e-05, "loss": 1.288439154624939, "loss/kd": 2.009348154067993, "loss/lm": 0.5675302147865295, "step": 561 }, { "epoch": 0.11537671935947444, "grad_norm": 3.1007199475593588, "kd_ratio": 0.5, "learning_rate": 1.9621568064765743e-05, "loss": 1.228562593460083, "loss/kd": 1.8165440559387207, "loss/lm": 0.6405810117721558, "step": 562 }, { "epoch": 0.11558201601313899, "grad_norm": 3.437108858467257, "kd_ratio": 0.5, "learning_rate": 1.9619753759792466e-05, "loss": 1.332515001296997, "loss/kd": 1.9552611112594604, "loss/lm": 0.7097688913345337, "step": 563 }, { "epoch": 0.11578731266680353, "grad_norm": 1.3756051997049825, "kd_ratio": 0.5, "learning_rate": 1.961793520036446e-05, "loss": 1.3660277128219604, "loss/kd": 2.1840031147003174, "loss/lm": 0.5480523705482483, "step": 564 }, { "epoch": 0.11599260932046808, "grad_norm": 3.5737719202524314, "kd_ratio": 0.5, "learning_rate": 1.9616112387286004e-05, "loss": 1.2468639612197876, "loss/kd": 1.9678210020065308, "loss/lm": 0.5259068608283997, "step": 565 }, { "epoch": 0.11619790597413263, "grad_norm": 2.5146256060257963, "kd_ratio": 0.5, "learning_rate": 1.9614285321363262e-05, "loss": 1.7592271566390991, "loss/kd": 2.907564163208008, "loss/lm": 0.6108900904655457, "step": 566 }, { "epoch": 0.11640320262779717, "grad_norm": 2.0174904347578426, "kd_ratio": 0.5, "learning_rate": 1.961245400340427e-05, "loss": 1.317613124847412, "loss/kd": 2.094343662261963, "loss/lm": 0.5408826470375061, "step": 567 }, { "epoch": 0.1166084992814617, "grad_norm": 2.8349349807480473, "kd_ratio": 0.5, "learning_rate": 1.961061843421896e-05, "loss": 1.203471302986145, "loss/kd": 1.7826581001281738, "loss/lm": 0.6242844462394714, "step": 568 }, { "epoch": 0.11681379593512625, "grad_norm": 1.8092714154648755, "kd_ratio": 0.5, "learning_rate": 1.9608778614619125e-05, "loss": 1.8404319286346436, "loss/kd": 3.146077871322632, "loss/lm": 0.5347859859466553, "step": 569 }, { "epoch": 0.1170190925887908, "grad_norm": 2.1661175256784486, "kd_ratio": 0.5, "learning_rate": 1.9606934545418457e-05, "loss": 1.4045295715332031, "loss/kd": 2.3061788082122803, "loss/lm": 0.502880334854126, "step": 570 }, { "epoch": 0.11722438924245535, "grad_norm": 1.3204412365569977, "kd_ratio": 0.5, "learning_rate": 1.9605086227432512e-05, "loss": 1.2138503789901733, "loss/kd": 1.9350937604904175, "loss/lm": 0.49260690808296204, "step": 571 }, { "epoch": 0.11742968589611989, "grad_norm": 1.6704275950710992, "kd_ratio": 0.5, "learning_rate": 1.9603233661478734e-05, "loss": 1.2559565305709839, "loss/kd": 2.035288095474243, "loss/lm": 0.47662490606307983, "step": 572 }, { "epoch": 0.11763498254978444, "grad_norm": 2.007458066237923, "kd_ratio": 0.5, "learning_rate": 1.9601376848376443e-05, "loss": 1.5303587913513184, "loss/kd": 2.4297637939453125, "loss/lm": 0.630953848361969, "step": 573 }, { "epoch": 0.11784027920344899, "grad_norm": 1.1567229694660321, "kd_ratio": 0.5, "learning_rate": 1.9599515788946838e-05, "loss": 1.4065794944763184, "loss/kd": 2.2362048625946045, "loss/lm": 0.5769541263580322, "step": 574 }, { "epoch": 0.11804557585711353, "grad_norm": 2.2049731781950457, "kd_ratio": 0.5, "learning_rate": 1.9597650484012997e-05, "loss": 1.3285017013549805, "loss/kd": 2.0781052112579346, "loss/lm": 0.5788981318473816, "step": 575 }, { "epoch": 0.11825087251077808, "grad_norm": 1.4866173444502149, "kd_ratio": 0.5, "learning_rate": 1.9595780934399867e-05, "loss": 1.2501821517944336, "loss/kd": 1.979967474937439, "loss/lm": 0.520396888256073, "step": 576 }, { "epoch": 0.11845616916444263, "grad_norm": 1.2248191007961768, "kd_ratio": 0.5, "learning_rate": 1.9593907140934286e-05, "loss": 1.4288227558135986, "loss/kd": 2.2625207901000977, "loss/lm": 0.5951247215270996, "step": 577 }, { "epoch": 0.11866146581810716, "grad_norm": 1.0849546786233355, "kd_ratio": 0.5, "learning_rate": 1.9592029104444964e-05, "loss": 1.2108187675476074, "loss/kd": 1.94416081905365, "loss/lm": 0.47747665643692017, "step": 578 }, { "epoch": 0.1188667624717717, "grad_norm": 1.0698967458646744, "kd_ratio": 0.5, "learning_rate": 1.9590146825762476e-05, "loss": 1.3151988983154297, "loss/kd": 2.023245096206665, "loss/lm": 0.6071525812149048, "step": 579 }, { "epoch": 0.11907205912543625, "grad_norm": 1.5281251216581895, "kd_ratio": 0.5, "learning_rate": 1.9588260305719294e-05, "loss": 1.3801230192184448, "loss/kd": 2.2155959606170654, "loss/lm": 0.544650137424469, "step": 580 }, { "epoch": 0.1192773557791008, "grad_norm": 1.8096413297346499, "kd_ratio": 0.5, "learning_rate": 1.958636954514975e-05, "loss": 1.4110045433044434, "loss/kd": 2.319584608078003, "loss/lm": 0.5024245381355286, "step": 581 }, { "epoch": 0.11948265243276535, "grad_norm": 1.0725998921363242, "kd_ratio": 0.5, "learning_rate": 1.9584474544890055e-05, "loss": 1.1376011371612549, "loss/kd": 1.7082622051239014, "loss/lm": 0.5669399499893188, "step": 582 }, { "epoch": 0.11968794908642989, "grad_norm": 1.7407698031712917, "kd_ratio": 0.5, "learning_rate": 1.9582575305778297e-05, "loss": 1.3877460956573486, "loss/kd": 2.20692777633667, "loss/lm": 0.5685644745826721, "step": 583 }, { "epoch": 0.11989324574009444, "grad_norm": 1.7674895701343392, "kd_ratio": 0.5, "learning_rate": 1.958067182865444e-05, "loss": 1.248513102531433, "loss/kd": 1.913575291633606, "loss/lm": 0.583450973033905, "step": 584 }, { "epoch": 0.12009854239375899, "grad_norm": 1.8462526153846435, "kd_ratio": 0.5, "learning_rate": 1.9578764114360318e-05, "loss": 1.397388219833374, "loss/kd": 2.2565691471099854, "loss/lm": 0.5382072925567627, "step": 585 }, { "epoch": 0.12030383904742353, "grad_norm": 1.7123573493561577, "kd_ratio": 0.5, "learning_rate": 1.9576852163739645e-05, "loss": 1.3184748888015747, "loss/kd": 2.1610236167907715, "loss/lm": 0.4759262502193451, "step": 586 }, { "epoch": 0.12050913570108807, "grad_norm": 1.8960809153443872, "kd_ratio": 0.5, "learning_rate": 1.9574935977637994e-05, "loss": 1.190643072128296, "loss/kd": 1.7957323789596558, "loss/lm": 0.585553765296936, "step": 587 }, { "epoch": 0.12071443235475261, "grad_norm": 2.8003049901629686, "kd_ratio": 0.5, "learning_rate": 1.9573015556902836e-05, "loss": 1.1686409711837769, "loss/kd": 1.7730803489685059, "loss/lm": 0.5642015337944031, "step": 588 }, { "epoch": 0.12091972900841716, "grad_norm": 2.5264714977022185, "kd_ratio": 0.5, "learning_rate": 1.9571090902383493e-05, "loss": 1.7160639762878418, "loss/kd": 2.804765462875366, "loss/lm": 0.6273624300956726, "step": 589 }, { "epoch": 0.1211250256620817, "grad_norm": 2.1807718207840017, "kd_ratio": 0.5, "learning_rate": 1.9569162014931166e-05, "loss": 1.3554552793502808, "loss/kd": 2.0621237754821777, "loss/lm": 0.648786723613739, "step": 590 }, { "epoch": 0.12133032231574625, "grad_norm": 2.6280653868895802, "kd_ratio": 0.5, "learning_rate": 1.9567228895398936e-05, "loss": 1.4166433811187744, "loss/kd": 2.320821762084961, "loss/lm": 0.5124651193618774, "step": 591 }, { "epoch": 0.1215356189694108, "grad_norm": 1.6212428871022646, "kd_ratio": 0.5, "learning_rate": 1.9565291544641744e-05, "loss": 1.428295612335205, "loss/kd": 2.317514657974243, "loss/lm": 0.5390764474868774, "step": 592 }, { "epoch": 0.12174091562307535, "grad_norm": 3.3995721462280044, "kd_ratio": 0.5, "learning_rate": 1.9563349963516403e-05, "loss": 1.352876901626587, "loss/kd": 2.055478572845459, "loss/lm": 0.6502752304077148, "step": 593 }, { "epoch": 0.1219462122767399, "grad_norm": 2.9160654064753055, "kd_ratio": 0.5, "learning_rate": 1.956140415288161e-05, "loss": 1.3934495449066162, "loss/kd": 2.1856801509857178, "loss/lm": 0.6012188196182251, "step": 594 }, { "epoch": 0.12215150893040444, "grad_norm": 1.7462590210453495, "kd_ratio": 0.5, "learning_rate": 1.955945411359792e-05, "loss": 1.1068745851516724, "loss/kd": 1.746208906173706, "loss/lm": 0.4675402045249939, "step": 595 }, { "epoch": 0.12235680558406897, "grad_norm": 3.5134236540502886, "kd_ratio": 0.5, "learning_rate": 1.9557499846527757e-05, "loss": 1.6097652912139893, "loss/kd": 2.573835611343384, "loss/lm": 0.6456949710845947, "step": 596 }, { "epoch": 0.12256210223773352, "grad_norm": 1.0019781171480473, "kd_ratio": 0.5, "learning_rate": 1.955554135253543e-05, "loss": 1.477194905281067, "loss/kd": 2.337270498275757, "loss/lm": 0.617119312286377, "step": 597 }, { "epoch": 0.12276739889139807, "grad_norm": 2.9986167214859436, "kd_ratio": 0.5, "learning_rate": 1.9553578632487103e-05, "loss": 1.4216585159301758, "loss/kd": 2.123171329498291, "loss/lm": 0.7201457023620605, "step": 598 }, { "epoch": 0.12297269554506261, "grad_norm": 1.2304634396260734, "kd_ratio": 0.5, "learning_rate": 1.9551611687250808e-05, "loss": 1.3811315298080444, "loss/kd": 2.030649423599243, "loss/lm": 0.7316135764122009, "step": 599 }, { "epoch": 0.12317799219872716, "grad_norm": 2.938553775948689, "kd_ratio": 0.5, "learning_rate": 1.9549640517696457e-05, "loss": 1.2708112001419067, "loss/kd": 2.035454750061035, "loss/lm": 0.5061677098274231, "step": 600 }, { "epoch": 0.1233832888523917, "grad_norm": 1.9224331980271177, "kd_ratio": 0.5, "learning_rate": 1.9547665124695817e-05, "loss": 1.3273563385009766, "loss/kd": 2.078702211380005, "loss/lm": 0.5760104060173035, "step": 601 }, { "epoch": 0.12358858550605625, "grad_norm": 2.2043533665058743, "kd_ratio": 0.5, "learning_rate": 1.954568550912254e-05, "loss": 1.172882080078125, "loss/kd": 1.7933659553527832, "loss/lm": 0.552398145198822, "step": 602 }, { "epoch": 0.1237938821597208, "grad_norm": 2.210273615126951, "kd_ratio": 0.5, "learning_rate": 1.9543701671852127e-05, "loss": 1.524968147277832, "loss/kd": 2.459048271179199, "loss/lm": 0.5908880829811096, "step": 603 }, { "epoch": 0.12399917881338535, "grad_norm": 1.5504320637081086, "kd_ratio": 0.5, "learning_rate": 1.954171361376196e-05, "loss": 1.6327619552612305, "loss/kd": 2.705253839492798, "loss/lm": 0.5602701306343079, "step": 604 }, { "epoch": 0.1242044754670499, "grad_norm": 1.9223848701897868, "kd_ratio": 0.5, "learning_rate": 1.9539721335731276e-05, "loss": 1.36929190158844, "loss/kd": 2.239760398864746, "loss/lm": 0.49882349371910095, "step": 605 }, { "epoch": 0.12440977212071443, "grad_norm": 1.275086952650864, "kd_ratio": 0.5, "learning_rate": 1.953772483864119e-05, "loss": 1.236740231513977, "loss/kd": 1.9678782224655151, "loss/lm": 0.5056021809577942, "step": 606 }, { "epoch": 0.12461506877437897, "grad_norm": 1.5528620871437357, "kd_ratio": 0.5, "learning_rate": 1.9535724123374674e-05, "loss": 1.3166790008544922, "loss/kd": 2.054638147354126, "loss/lm": 0.5787197351455688, "step": 607 }, { "epoch": 0.12482036542804352, "grad_norm": 1.3166101506911378, "kd_ratio": 0.5, "learning_rate": 1.9533719190816575e-05, "loss": 1.2457001209259033, "loss/kd": 1.926783800125122, "loss/lm": 0.5646163821220398, "step": 608 }, { "epoch": 0.12502566208170807, "grad_norm": 2.115760371242464, "kd_ratio": 0.5, "learning_rate": 1.953171004185359e-05, "loss": 1.2361526489257812, "loss/kd": 1.9610272645950317, "loss/lm": 0.5112781524658203, "step": 609 }, { "epoch": 0.12523095873537263, "grad_norm": 1.3969818479853642, "kd_ratio": 0.5, "learning_rate": 1.95296966773743e-05, "loss": 1.5748997926712036, "loss/kd": 2.554915428161621, "loss/lm": 0.5948840975761414, "step": 610 }, { "epoch": 0.12543625538903716, "grad_norm": 2.768813206528842, "kd_ratio": 0.5, "learning_rate": 1.952767909826913e-05, "loss": 1.4693132638931274, "loss/kd": 2.4360361099243164, "loss/lm": 0.5025904178619385, "step": 611 }, { "epoch": 0.1256415520427017, "grad_norm": 2.535216556220481, "kd_ratio": 0.5, "learning_rate": 1.9525657305430385e-05, "loss": 1.2845708131790161, "loss/kd": 2.05173921585083, "loss/lm": 0.5174024701118469, "step": 612 }, { "epoch": 0.12584684869636625, "grad_norm": 1.330260026131191, "kd_ratio": 0.5, "learning_rate": 1.952363129975223e-05, "loss": 1.3478741645812988, "loss/kd": 2.1205029487609863, "loss/lm": 0.5752454996109009, "step": 613 }, { "epoch": 0.1260521453500308, "grad_norm": 3.653777876458729, "kd_ratio": 0.5, "learning_rate": 1.9521601082130682e-05, "loss": 1.0641894340515137, "loss/kd": 1.5470484495162964, "loss/lm": 0.5813302993774414, "step": 614 }, { "epoch": 0.12625744200369535, "grad_norm": 2.9128687491414613, "kd_ratio": 0.5, "learning_rate": 1.951956665346364e-05, "loss": 1.2256523370742798, "loss/kd": 1.928807020187378, "loss/lm": 0.5224975943565369, "step": 615 }, { "epoch": 0.12646273865735988, "grad_norm": 1.3370077406475238, "kd_ratio": 0.5, "learning_rate": 1.9517528014650852e-05, "loss": 1.2993695735931396, "loss/kd": 2.04482102394104, "loss/lm": 0.553918182849884, "step": 616 }, { "epoch": 0.12666803531102444, "grad_norm": 1.3211578872974905, "kd_ratio": 0.5, "learning_rate": 1.9515485166593927e-05, "loss": 1.6742634773254395, "loss/kd": 2.7548179626464844, "loss/lm": 0.5937091112136841, "step": 617 }, { "epoch": 0.12687333196468897, "grad_norm": 1.9906782050248315, "kd_ratio": 0.5, "learning_rate": 1.9513438110196346e-05, "loss": 1.241999626159668, "loss/kd": 1.9531280994415283, "loss/lm": 0.5308712124824524, "step": 618 }, { "epoch": 0.12707862861835353, "grad_norm": 1.826305585884594, "kd_ratio": 0.5, "learning_rate": 1.951138684636344e-05, "loss": 1.1384148597717285, "loss/kd": 1.7654085159301758, "loss/lm": 0.5114212036132812, "step": 619 }, { "epoch": 0.12728392527201807, "grad_norm": 1.4823116831368128, "kd_ratio": 0.5, "learning_rate": 1.9509331376002406e-05, "loss": 1.347562551498413, "loss/kd": 2.1467628479003906, "loss/lm": 0.5483623147010803, "step": 620 }, { "epoch": 0.1274892219256826, "grad_norm": 2.031531060096886, "kd_ratio": 0.5, "learning_rate": 1.9507271700022308e-05, "loss": 1.3220961093902588, "loss/kd": 2.1155073642730713, "loss/lm": 0.5286847949028015, "step": 621 }, { "epoch": 0.12769451857934716, "grad_norm": 3.3403462646279696, "kd_ratio": 0.5, "learning_rate": 1.9505207819334052e-05, "loss": 1.240973949432373, "loss/kd": 2.011705160140991, "loss/lm": 0.4702426791191101, "step": 622 }, { "epoch": 0.1278998152330117, "grad_norm": 1.5191515784139313, "kd_ratio": 0.5, "learning_rate": 1.9503139734850426e-05, "loss": 1.1172853708267212, "loss/kd": 1.7934188842773438, "loss/lm": 0.44115176796913147, "step": 623 }, { "epoch": 0.12810511188667625, "grad_norm": 2.2827578620016227, "kd_ratio": 0.5, "learning_rate": 1.9501067447486054e-05, "loss": 1.373993158340454, "loss/kd": 2.0599684715270996, "loss/lm": 0.6880178451538086, "step": 624 }, { "epoch": 0.1283104085403408, "grad_norm": 1.8782648409538392, "kd_ratio": 0.5, "learning_rate": 1.9498990958157443e-05, "loss": 1.2190301418304443, "loss/kd": 1.9148072004318237, "loss/lm": 0.5232532024383545, "step": 625 }, { "epoch": 0.12851570519400535, "grad_norm": 1.6827382338456838, "kd_ratio": 0.5, "learning_rate": 1.9496910267782934e-05, "loss": 1.2352688312530518, "loss/kd": 1.9962869882583618, "loss/lm": 0.47425055503845215, "step": 626 }, { "epoch": 0.12872100184766988, "grad_norm": 2.1893030003427305, "kd_ratio": 0.5, "learning_rate": 1.9494825377282746e-05, "loss": 1.3169794082641602, "loss/kd": 2.076293468475342, "loss/lm": 0.5576652884483337, "step": 627 }, { "epoch": 0.12892629850133444, "grad_norm": 1.800621891952745, "kd_ratio": 0.5, "learning_rate": 1.9492736287578947e-05, "loss": 1.210123062133789, "loss/kd": 1.9199001789093018, "loss/lm": 0.5003460645675659, "step": 628 }, { "epoch": 0.12913159515499897, "grad_norm": 2.0119701876091645, "kd_ratio": 0.5, "learning_rate": 1.949064299959546e-05, "loss": 1.3724498748779297, "loss/kd": 2.1159982681274414, "loss/lm": 0.6289016008377075, "step": 629 }, { "epoch": 0.1293368918086635, "grad_norm": 1.3938843837619102, "kd_ratio": 0.5, "learning_rate": 1.9488545514258068e-05, "loss": 1.139295220375061, "loss/kd": 1.7984387874603271, "loss/lm": 0.4801516532897949, "step": 630 }, { "epoch": 0.12954218846232807, "grad_norm": 2.212600061509354, "kd_ratio": 0.5, "learning_rate": 1.9486443832494414e-05, "loss": 1.3612791299819946, "loss/kd": 2.197234869003296, "loss/lm": 0.5253233909606934, "step": 631 }, { "epoch": 0.1297474851159926, "grad_norm": 1.81526233885116, "kd_ratio": 0.5, "learning_rate": 1.9484337955233987e-05, "loss": 1.239557147026062, "loss/kd": 1.9170886278152466, "loss/lm": 0.5620257258415222, "step": 632 }, { "epoch": 0.12995278176965716, "grad_norm": 1.458959107116136, "kd_ratio": 0.5, "learning_rate": 1.9482227883408135e-05, "loss": 1.196505069732666, "loss/kd": 1.8828885555267334, "loss/lm": 0.5101216435432434, "step": 633 }, { "epoch": 0.1301580784233217, "grad_norm": 2.2455039841398805, "kd_ratio": 0.5, "learning_rate": 1.948011361795007e-05, "loss": 1.1450321674346924, "loss/kd": 1.744858980178833, "loss/lm": 0.5452053546905518, "step": 634 }, { "epoch": 0.13036337507698625, "grad_norm": 3.3003960062158444, "kd_ratio": 0.5, "learning_rate": 1.9477995159794854e-05, "loss": 1.397840976715088, "loss/kd": 2.1783337593078613, "loss/lm": 0.6173482537269592, "step": 635 }, { "epoch": 0.1305686717306508, "grad_norm": 1.8794964124966687, "kd_ratio": 0.5, "learning_rate": 1.947587250987939e-05, "loss": 1.4448949098587036, "loss/kd": 2.3599042892456055, "loss/lm": 0.529885470867157, "step": 636 }, { "epoch": 0.13077396838431535, "grad_norm": 2.360208000542608, "kd_ratio": 0.5, "learning_rate": 1.9473745669142455e-05, "loss": 1.0476181507110596, "loss/kd": 1.6715785264968872, "loss/lm": 0.4236578643321991, "step": 637 }, { "epoch": 0.13097926503797988, "grad_norm": 3.155207322666124, "kd_ratio": 0.5, "learning_rate": 1.947161463852467e-05, "loss": 1.1325838565826416, "loss/kd": 1.797461748123169, "loss/lm": 0.46770599484443665, "step": 638 }, { "epoch": 0.1311845616916444, "grad_norm": 1.5985912894526737, "kd_ratio": 0.5, "learning_rate": 1.9469479418968506e-05, "loss": 1.2230874300003052, "loss/kd": 1.903023600578308, "loss/lm": 0.543151319026947, "step": 639 }, { "epoch": 0.13138985834530897, "grad_norm": 2.133207623633984, "kd_ratio": 0.5, "learning_rate": 1.9467340011418297e-05, "loss": 1.691777229309082, "loss/kd": 2.7409257888793945, "loss/lm": 0.6426287293434143, "step": 640 }, { "epoch": 0.1315951549989735, "grad_norm": 1.1328227394675234, "kd_ratio": 0.5, "learning_rate": 1.946519641682021e-05, "loss": 1.2703306674957275, "loss/kd": 2.0063467025756836, "loss/lm": 0.5343145132064819, "step": 641 }, { "epoch": 0.13180045165263807, "grad_norm": 1.6558305454805493, "kd_ratio": 0.5, "learning_rate": 1.9463048636122287e-05, "loss": 1.0542221069335938, "loss/kd": 1.5576525926589966, "loss/lm": 0.5507915019989014, "step": 642 }, { "epoch": 0.1320057483063026, "grad_norm": 1.0597146116487774, "kd_ratio": 0.5, "learning_rate": 1.9460896670274408e-05, "loss": 1.281808853149414, "loss/kd": 2.053196430206299, "loss/lm": 0.5104212760925293, "step": 643 }, { "epoch": 0.13221104495996716, "grad_norm": 2.149814720257583, "kd_ratio": 0.5, "learning_rate": 1.9458740520228307e-05, "loss": 1.2000517845153809, "loss/kd": 1.78633713722229, "loss/lm": 0.6137665510177612, "step": 644 }, { "epoch": 0.1324163416136317, "grad_norm": 1.8306650030720424, "kd_ratio": 0.5, "learning_rate": 1.9456580186937564e-05, "loss": 1.2495083808898926, "loss/kd": 1.9945095777511597, "loss/lm": 0.5045070648193359, "step": 645 }, { "epoch": 0.13262163826729625, "grad_norm": 1.084245936577246, "kd_ratio": 0.5, "learning_rate": 1.9454415671357615e-05, "loss": 1.1907744407653809, "loss/kd": 1.9011539220809937, "loss/lm": 0.4803948402404785, "step": 646 }, { "epoch": 0.1328269349209608, "grad_norm": 1.2959620771734002, "kd_ratio": 0.5, "learning_rate": 1.9452246974445743e-05, "loss": 1.1848303079605103, "loss/kd": 1.9197020530700684, "loss/lm": 0.44995853304862976, "step": 647 }, { "epoch": 0.13303223157462532, "grad_norm": 1.3959862485294092, "kd_ratio": 0.5, "learning_rate": 1.9450074097161087e-05, "loss": 1.395146369934082, "loss/kd": 2.2423765659332275, "loss/lm": 0.5479161143302917, "step": 648 }, { "epoch": 0.13323752822828988, "grad_norm": 2.1066823938132706, "kd_ratio": 0.5, "learning_rate": 1.944789704046462e-05, "loss": 1.3065863847732544, "loss/kd": 2.07505202293396, "loss/lm": 0.538120687007904, "step": 649 }, { "epoch": 0.1334428248819544, "grad_norm": 1.9001559202286789, "kd_ratio": 0.5, "learning_rate": 1.9445715805319177e-05, "loss": 1.3240256309509277, "loss/kd": 2.1126673221588135, "loss/lm": 0.5353840589523315, "step": 650 }, { "epoch": 0.13364812153561897, "grad_norm": 1.1939223802305068, "kd_ratio": 0.5, "learning_rate": 1.9443530392689434e-05, "loss": 1.3808997869491577, "loss/kd": 2.24094820022583, "loss/lm": 0.5208514332771301, "step": 651 }, { "epoch": 0.1338534181892835, "grad_norm": 1.2171904587840323, "kd_ratio": 0.5, "learning_rate": 1.9441340803541918e-05, "loss": 1.486943244934082, "loss/kd": 2.481268882751465, "loss/lm": 0.4926176071166992, "step": 652 }, { "epoch": 0.13405871484294807, "grad_norm": 1.0834927074999678, "kd_ratio": 0.5, "learning_rate": 1.9439147038845006e-05, "loss": 1.350818395614624, "loss/kd": 2.147491216659546, "loss/lm": 0.5541455149650574, "step": 653 }, { "epoch": 0.1342640114966126, "grad_norm": 1.2190285655578494, "kd_ratio": 0.5, "learning_rate": 1.943694909956891e-05, "loss": 1.2677509784698486, "loss/kd": 1.9822760820388794, "loss/lm": 0.5532257556915283, "step": 654 }, { "epoch": 0.13446930815027716, "grad_norm": 1.26181852559201, "kd_ratio": 0.5, "learning_rate": 1.94347469866857e-05, "loss": 1.1439027786254883, "loss/kd": 1.7641195058822632, "loss/lm": 0.5236860513687134, "step": 655 }, { "epoch": 0.1346746048039417, "grad_norm": 0.9230325493473417, "kd_ratio": 0.5, "learning_rate": 1.9432540701169283e-05, "loss": 1.4475337266921997, "loss/kd": 2.3679890632629395, "loss/lm": 0.52707839012146, "step": 656 }, { "epoch": 0.13487990145760623, "grad_norm": 1.462906567244686, "kd_ratio": 0.5, "learning_rate": 1.9430330243995424e-05, "loss": 1.507298469543457, "loss/kd": 2.5121381282806396, "loss/lm": 0.5024588704109192, "step": 657 }, { "epoch": 0.1350851981112708, "grad_norm": 1.4131274116066261, "kd_ratio": 0.5, "learning_rate": 1.942811561614172e-05, "loss": 1.3616350889205933, "loss/kd": 2.1634650230407715, "loss/lm": 0.559805154800415, "step": 658 }, { "epoch": 0.13529049476493532, "grad_norm": 2.285456751624547, "kd_ratio": 0.5, "learning_rate": 1.9425896818587615e-05, "loss": 1.268155813217163, "loss/kd": 2.0903899669647217, "loss/lm": 0.4459215998649597, "step": 659 }, { "epoch": 0.13549579141859988, "grad_norm": 1.5196279760143678, "kd_ratio": 0.5, "learning_rate": 1.9423673852314406e-05, "loss": 1.1992504596710205, "loss/kd": 1.9970839023590088, "loss/lm": 0.4014171063899994, "step": 660 }, { "epoch": 0.13570108807226441, "grad_norm": 1.39976798291926, "kd_ratio": 0.5, "learning_rate": 1.9421446718305223e-05, "loss": 1.275602102279663, "loss/kd": 2.0291008949279785, "loss/lm": 0.5221033096313477, "step": 661 }, { "epoch": 0.13590638472592897, "grad_norm": 2.679312999114413, "kd_ratio": 0.5, "learning_rate": 1.9419215417545044e-05, "loss": 1.3589282035827637, "loss/kd": 2.1935830116271973, "loss/lm": 0.5242733359336853, "step": 662 }, { "epoch": 0.1361116813795935, "grad_norm": 2.4718869068722875, "kd_ratio": 0.5, "learning_rate": 1.941697995102069e-05, "loss": 1.4752172231674194, "loss/kd": 2.1171317100524902, "loss/lm": 0.8333027362823486, "step": 663 }, { "epoch": 0.13631697803325807, "grad_norm": 1.945607150683844, "kd_ratio": 0.5, "learning_rate": 1.941474031972082e-05, "loss": 1.3652379512786865, "loss/kd": 2.226464033126831, "loss/lm": 0.5040117502212524, "step": 664 }, { "epoch": 0.1365222746869226, "grad_norm": 2.1500706470619386, "kd_ratio": 0.5, "learning_rate": 1.9412496524635944e-05, "loss": 1.312656044960022, "loss/kd": 2.130866050720215, "loss/lm": 0.4944460988044739, "step": 665 }, { "epoch": 0.13672757134058716, "grad_norm": 1.1063350512980612, "kd_ratio": 0.5, "learning_rate": 1.9410248566758406e-05, "loss": 1.2497804164886475, "loss/kd": 2.011120557785034, "loss/lm": 0.48844021558761597, "step": 666 }, { "epoch": 0.1369328679942517, "grad_norm": 1.9824073629812955, "kd_ratio": 0.5, "learning_rate": 1.9407996447082394e-05, "loss": 1.2979878187179565, "loss/kd": 2.1092376708984375, "loss/lm": 0.4867379665374756, "step": 667 }, { "epoch": 0.13713816464791623, "grad_norm": 1.0634398884870484, "kd_ratio": 0.5, "learning_rate": 1.9405740166603936e-05, "loss": 1.2164937257766724, "loss/kd": 1.8533973693847656, "loss/lm": 0.5795901417732239, "step": 668 }, { "epoch": 0.1373434613015808, "grad_norm": 2.0376153970233672, "kd_ratio": 0.5, "learning_rate": 1.9403479726320894e-05, "loss": 1.3529558181762695, "loss/kd": 2.1924378871917725, "loss/lm": 0.5134736895561218, "step": 669 }, { "epoch": 0.13754875795524532, "grad_norm": 1.1376055095947324, "kd_ratio": 0.5, "learning_rate": 1.9401215127232983e-05, "loss": 1.3057613372802734, "loss/kd": 2.1150643825531006, "loss/lm": 0.49645835161209106, "step": 670 }, { "epoch": 0.13775405460890988, "grad_norm": 2.6598057412504703, "kd_ratio": 0.5, "learning_rate": 1.939894637034174e-05, "loss": 1.2381377220153809, "loss/kd": 1.8857886791229248, "loss/lm": 0.5904867053031921, "step": 671 }, { "epoch": 0.13795935126257441, "grad_norm": 2.244844984910515, "kd_ratio": 0.5, "learning_rate": 1.939667345665057e-05, "loss": 1.1601494550704956, "loss/kd": 1.814915418624878, "loss/lm": 0.5053834915161133, "step": 672 }, { "epoch": 0.13816464791623898, "grad_norm": 1.7274270431358083, "kd_ratio": 0.5, "learning_rate": 1.9394396387164677e-05, "loss": 1.8371983766555786, "loss/kd": 3.296361207962036, "loss/lm": 0.37803545594215393, "step": 673 }, { "epoch": 0.1383699445699035, "grad_norm": 2.494038388584794, "kd_ratio": 0.5, "learning_rate": 1.9392115162891132e-05, "loss": 1.4648646116256714, "loss/kd": 2.2920684814453125, "loss/lm": 0.6376607418060303, "step": 674 }, { "epoch": 0.13857524122356807, "grad_norm": 1.5298587944238398, "kd_ratio": 0.5, "learning_rate": 1.9389829784838833e-05, "loss": 1.289789080619812, "loss/kd": 1.9968703985214233, "loss/lm": 0.5827078223228455, "step": 675 }, { "epoch": 0.1387805378772326, "grad_norm": 1.5259327174904485, "kd_ratio": 0.5, "learning_rate": 1.9387540254018516e-05, "loss": 1.2297756671905518, "loss/kd": 1.8749573230743408, "loss/lm": 0.5845940709114075, "step": 676 }, { "epoch": 0.13898583453089713, "grad_norm": 1.025390997301615, "kd_ratio": 0.5, "learning_rate": 1.9385246571442756e-05, "loss": 1.0746772289276123, "loss/kd": 1.7160265445709229, "loss/lm": 0.43332797288894653, "step": 677 }, { "epoch": 0.1391911311845617, "grad_norm": 2.013837132057695, "kd_ratio": 0.5, "learning_rate": 1.9382948738125966e-05, "loss": 1.4308912754058838, "loss/kd": 2.398383140563965, "loss/lm": 0.46339935064315796, "step": 678 }, { "epoch": 0.13939642783822623, "grad_norm": 1.69656801309809, "kd_ratio": 0.5, "learning_rate": 1.938064675508438e-05, "loss": 1.2826831340789795, "loss/kd": 2.1124107837677, "loss/lm": 0.45295554399490356, "step": 679 }, { "epoch": 0.1396017244918908, "grad_norm": 1.9989402276460917, "kd_ratio": 0.5, "learning_rate": 1.937834062333609e-05, "loss": 1.2812955379486084, "loss/kd": 2.109370470046997, "loss/lm": 0.45322057604789734, "step": 680 }, { "epoch": 0.13980702114555532, "grad_norm": 3.278590890337317, "kd_ratio": 0.5, "learning_rate": 1.9376030343901005e-05, "loss": 1.2468341588974, "loss/kd": 2.0711491107940674, "loss/lm": 0.42251917719841003, "step": 681 }, { "epoch": 0.14001231779921988, "grad_norm": 1.2498684824509811, "kd_ratio": 0.5, "learning_rate": 1.9373715917800874e-05, "loss": 1.2749583721160889, "loss/kd": 2.0940511226654053, "loss/lm": 0.4558657109737396, "step": 682 }, { "epoch": 0.14021761445288441, "grad_norm": 2.575369319052647, "kd_ratio": 0.5, "learning_rate": 1.9371397346059286e-05, "loss": 1.3323993682861328, "loss/kd": 2.0204432010650635, "loss/lm": 0.6443554759025574, "step": 683 }, { "epoch": 0.14042291110654898, "grad_norm": 2.670187851637896, "kd_ratio": 0.5, "learning_rate": 1.9369074629701653e-05, "loss": 1.0336939096450806, "loss/kd": 1.5664457082748413, "loss/lm": 0.500942051410675, "step": 684 }, { "epoch": 0.1406282077602135, "grad_norm": 1.7116950143436505, "kd_ratio": 0.5, "learning_rate": 1.9366747769755222e-05, "loss": 1.126167893409729, "loss/kd": 1.7706122398376465, "loss/lm": 0.4817236363887787, "step": 685 }, { "epoch": 0.14083350441387804, "grad_norm": 2.02925734963643, "kd_ratio": 0.5, "learning_rate": 1.9364416767249085e-05, "loss": 1.3233227729797363, "loss/kd": 2.120194435119629, "loss/lm": 0.5264511704444885, "step": 686 }, { "epoch": 0.1410388010675426, "grad_norm": 1.2692334320460323, "kd_ratio": 0.5, "learning_rate": 1.936208162321415e-05, "loss": 1.0839239358901978, "loss/kd": 1.6120128631591797, "loss/lm": 0.5558350086212158, "step": 687 }, { "epoch": 0.14124409772120713, "grad_norm": 2.172462397033306, "kd_ratio": 0.5, "learning_rate": 1.9359742338683165e-05, "loss": 1.2255191802978516, "loss/kd": 1.8859540224075317, "loss/lm": 0.5650842785835266, "step": 688 }, { "epoch": 0.1414493943748717, "grad_norm": 1.1627921165888002, "kd_ratio": 0.5, "learning_rate": 1.9357398914690707e-05, "loss": 1.911754846572876, "loss/kd": 3.3801305294036865, "loss/lm": 0.44337913393974304, "step": 689 }, { "epoch": 0.14165469102853623, "grad_norm": 1.9097621275782668, "kd_ratio": 0.5, "learning_rate": 1.9355051352273183e-05, "loss": 1.2393993139266968, "loss/kd": 1.9837113618850708, "loss/lm": 0.49508726596832275, "step": 690 }, { "epoch": 0.1418599876822008, "grad_norm": 1.2701728133650574, "kd_ratio": 0.5, "learning_rate": 1.9352699652468835e-05, "loss": 1.431436538696289, "loss/kd": 2.285585403442383, "loss/lm": 0.5772877931594849, "step": 691 }, { "epoch": 0.14206528433586532, "grad_norm": 1.3441887486455228, "kd_ratio": 0.5, "learning_rate": 1.9350343816317728e-05, "loss": 1.9130122661590576, "loss/kd": 3.132373332977295, "loss/lm": 0.6936512589454651, "step": 692 }, { "epoch": 0.14227058098952988, "grad_norm": 1.5464201761606058, "kd_ratio": 0.5, "learning_rate": 1.934798384486176e-05, "loss": 1.1949903964996338, "loss/kd": 1.7664566040039062, "loss/lm": 0.6235242486000061, "step": 693 }, { "epoch": 0.14247587764319442, "grad_norm": 1.2389243271719865, "kd_ratio": 0.5, "learning_rate": 1.9345619739144655e-05, "loss": 1.487256646156311, "loss/kd": 2.3672821521759033, "loss/lm": 0.607231080532074, "step": 694 }, { "epoch": 0.14268117429685895, "grad_norm": 1.053501192446315, "kd_ratio": 0.5, "learning_rate": 1.9343251500211977e-05, "loss": 1.378990888595581, "loss/kd": 2.233769416809082, "loss/lm": 0.5242123007774353, "step": 695 }, { "epoch": 0.1428864709505235, "grad_norm": 1.0617321370161912, "kd_ratio": 0.5, "learning_rate": 1.93408791291111e-05, "loss": 1.0504775047302246, "loss/kd": 1.637190818786621, "loss/lm": 0.4637643098831177, "step": 696 }, { "epoch": 0.14309176760418804, "grad_norm": 1.9780822178885316, "kd_ratio": 0.5, "learning_rate": 1.9338502626891236e-05, "loss": 1.2249480485916138, "loss/kd": 1.9510455131530762, "loss/lm": 0.498850554227829, "step": 697 }, { "epoch": 0.1432970642578526, "grad_norm": 2.6219231732997796, "kd_ratio": 0.5, "learning_rate": 1.9336121994603424e-05, "loss": 1.3427209854125977, "loss/kd": 2.219088554382324, "loss/lm": 0.46635350584983826, "step": 698 }, { "epoch": 0.14350236091151714, "grad_norm": 2.0100624579814323, "kd_ratio": 0.5, "learning_rate": 1.933373723330053e-05, "loss": 1.441514253616333, "loss/kd": 2.344966411590576, "loss/lm": 0.5380620956420898, "step": 699 }, { "epoch": 0.1437076575651817, "grad_norm": 1.5398242834755178, "kd_ratio": 0.5, "learning_rate": 1.9331348344037238e-05, "loss": 1.210823893547058, "loss/kd": 1.8498907089233398, "loss/lm": 0.5717571377754211, "step": 700 }, { "epoch": 0.14391295421884623, "grad_norm": 1.3216029704961985, "kd_ratio": 0.5, "learning_rate": 1.932895532787007e-05, "loss": 1.0994054079055786, "loss/kd": 1.721545934677124, "loss/lm": 0.4772648811340332, "step": 701 }, { "epoch": 0.1441182508725108, "grad_norm": 1.2635035222024689, "kd_ratio": 0.5, "learning_rate": 1.9326558185857363e-05, "loss": 1.258115530014038, "loss/kd": 2.00154447555542, "loss/lm": 0.5146867036819458, "step": 702 }, { "epoch": 0.14432354752617532, "grad_norm": 2.5824263224660275, "kd_ratio": 0.5, "learning_rate": 1.9324156919059286e-05, "loss": 1.3889153003692627, "loss/kd": 2.2318665981292725, "loss/lm": 0.5459639430046082, "step": 703 }, { "epoch": 0.14452884417983985, "grad_norm": 1.798423174135004, "kd_ratio": 0.5, "learning_rate": 1.9321751528537823e-05, "loss": 1.5166200399398804, "loss/kd": 2.5105652809143066, "loss/lm": 0.5226748585700989, "step": 704 }, { "epoch": 0.14473414083350442, "grad_norm": 3.2630272986247446, "kd_ratio": 0.5, "learning_rate": 1.9319342015356793e-05, "loss": 1.2619199752807617, "loss/kd": 1.9965490102767944, "loss/lm": 0.5272910594940186, "step": 705 }, { "epoch": 0.14493943748716895, "grad_norm": 3.565709498720084, "kd_ratio": 0.5, "learning_rate": 1.9316928380581835e-05, "loss": 1.2766788005828857, "loss/kd": 2.0965824127197266, "loss/lm": 0.45677512884140015, "step": 706 }, { "epoch": 0.1451447341408335, "grad_norm": 2.0591040904202904, "kd_ratio": 0.5, "learning_rate": 1.93145106252804e-05, "loss": 1.2310200929641724, "loss/kd": 1.9105589389801025, "loss/lm": 0.5514811873435974, "step": 707 }, { "epoch": 0.14535003079449804, "grad_norm": 2.6781903991671507, "kd_ratio": 0.5, "learning_rate": 1.9312088750521778e-05, "loss": 1.2980413436889648, "loss/kd": 2.1556899547576904, "loss/lm": 0.4403926432132721, "step": 708 }, { "epoch": 0.1455553274481626, "grad_norm": 1.0290480757248666, "kd_ratio": 0.5, "learning_rate": 1.9309662757377066e-05, "loss": 1.2604349851608276, "loss/kd": 2.0702714920043945, "loss/lm": 0.4505985379219055, "step": 709 }, { "epoch": 0.14576062410182714, "grad_norm": 2.315338934063642, "kd_ratio": 0.5, "learning_rate": 1.9307232646919202e-05, "loss": 1.2600611448287964, "loss/kd": 1.8978487253189087, "loss/lm": 0.6222735643386841, "step": 710 }, { "epoch": 0.1459659207554917, "grad_norm": 2.032179921109917, "kd_ratio": 0.5, "learning_rate": 1.9304798420222918e-05, "loss": 1.566495418548584, "loss/kd": 2.6439054012298584, "loss/lm": 0.4890854060649872, "step": 711 }, { "epoch": 0.14617121740915623, "grad_norm": 3.3118158322589113, "kd_ratio": 0.5, "learning_rate": 1.9302360078364785e-05, "loss": 1.1051206588745117, "loss/kd": 1.6206670999526978, "loss/lm": 0.5895742774009705, "step": 712 }, { "epoch": 0.1463765140628208, "grad_norm": 2.7223227027937504, "kd_ratio": 0.5, "learning_rate": 1.9299917622423196e-05, "loss": 1.3659093379974365, "loss/kd": 2.334944248199463, "loss/lm": 0.396874338388443, "step": 713 }, { "epoch": 0.14658181071648532, "grad_norm": 1.214352621224898, "kd_ratio": 0.5, "learning_rate": 1.9297471053478347e-05, "loss": 1.27422034740448, "loss/kd": 2.108881711959839, "loss/lm": 0.43955907225608826, "step": 714 }, { "epoch": 0.14678710737014986, "grad_norm": 1.7979469832438542, "kd_ratio": 0.5, "learning_rate": 1.9295020372612276e-05, "loss": 1.0685755014419556, "loss/kd": 1.57216477394104, "loss/lm": 0.5649862289428711, "step": 715 }, { "epoch": 0.14699240402381442, "grad_norm": 2.485364427240352, "kd_ratio": 0.5, "learning_rate": 1.9292565580908812e-05, "loss": 1.1422959566116333, "loss/kd": 1.6547539234161377, "loss/lm": 0.6298379302024841, "step": 716 }, { "epoch": 0.14719770067747895, "grad_norm": 2.4669229824835748, "kd_ratio": 0.5, "learning_rate": 1.9290106679453628e-05, "loss": 1.124616265296936, "loss/kd": 1.8166325092315674, "loss/lm": 0.4325999915599823, "step": 717 }, { "epoch": 0.1474029973311435, "grad_norm": 1.6560474905734068, "kd_ratio": 0.5, "learning_rate": 1.9287643669334202e-05, "loss": 1.4113216400146484, "loss/kd": 2.1517558097839355, "loss/lm": 0.6708874702453613, "step": 718 }, { "epoch": 0.14760829398480804, "grad_norm": 1.3103981826769129, "kd_ratio": 0.5, "learning_rate": 1.9285176551639826e-05, "loss": 1.2476636171340942, "loss/kd": 2.0042598247528076, "loss/lm": 0.4910673499107361, "step": 719 }, { "epoch": 0.1478135906384726, "grad_norm": 2.3147926287146627, "kd_ratio": 0.5, "learning_rate": 1.928270532746162e-05, "loss": 1.397464394569397, "loss/kd": 2.146257162094116, "loss/lm": 0.6486716270446777, "step": 720 }, { "epoch": 0.14801888729213714, "grad_norm": 2.3809129911280325, "kd_ratio": 0.5, "learning_rate": 1.9280229997892505e-05, "loss": 1.377620816230774, "loss/kd": 2.1650235652923584, "loss/lm": 0.5902180671691895, "step": 721 }, { "epoch": 0.1482241839458017, "grad_norm": 1.275164289349481, "kd_ratio": 0.5, "learning_rate": 1.9277750564027232e-05, "loss": 1.4057197570800781, "loss/kd": 2.317739725112915, "loss/lm": 0.4936998784542084, "step": 722 }, { "epoch": 0.14842948059946623, "grad_norm": 1.7657549188725798, "kd_ratio": 0.5, "learning_rate": 1.9275267026962358e-05, "loss": 1.4870002269744873, "loss/kd": 2.358689308166504, "loss/lm": 0.6153112649917603, "step": 723 }, { "epoch": 0.14863477725313076, "grad_norm": 3.3009731829647446, "kd_ratio": 0.5, "learning_rate": 1.9272779387796263e-05, "loss": 1.3396425247192383, "loss/kd": 2.2301058769226074, "loss/lm": 0.44917917251586914, "step": 724 }, { "epoch": 0.14884007390679532, "grad_norm": 2.077455794644503, "kd_ratio": 0.5, "learning_rate": 1.9270287647629132e-05, "loss": 1.1591482162475586, "loss/kd": 1.9122586250305176, "loss/lm": 0.40603771805763245, "step": 725 }, { "epoch": 0.14904537056045986, "grad_norm": 1.371019206226764, "kd_ratio": 0.5, "learning_rate": 1.9267791807562964e-05, "loss": 1.1401911973953247, "loss/kd": 1.6929113864898682, "loss/lm": 0.5874710083007812, "step": 726 }, { "epoch": 0.14925066721412442, "grad_norm": 2.235677799605497, "kd_ratio": 0.5, "learning_rate": 1.9265291868701584e-05, "loss": 1.247720718383789, "loss/kd": 1.9622901678085327, "loss/lm": 0.5331512689590454, "step": 727 }, { "epoch": 0.14945596386778895, "grad_norm": 1.2145494380686022, "kd_ratio": 0.5, "learning_rate": 1.9262787832150615e-05, "loss": 1.2781754732131958, "loss/kd": 1.9510855674743652, "loss/lm": 0.6052654385566711, "step": 728 }, { "epoch": 0.1496612605214535, "grad_norm": 1.9367878100440346, "kd_ratio": 0.5, "learning_rate": 1.9260279699017496e-05, "loss": 1.2490744590759277, "loss/kd": 1.937443494796753, "loss/lm": 0.5607054233551025, "step": 729 }, { "epoch": 0.14986655717511804, "grad_norm": 2.3676264236551536, "kd_ratio": 0.5, "learning_rate": 1.9257767470411487e-05, "loss": 1.2889240980148315, "loss/kd": 2.0606954097747803, "loss/lm": 0.5171527862548828, "step": 730 }, { "epoch": 0.1500718538287826, "grad_norm": 1.1002341654920795, "kd_ratio": 0.5, "learning_rate": 1.9255251147443646e-05, "loss": 1.0954068899154663, "loss/kd": 1.630959153175354, "loss/lm": 0.5598546862602234, "step": 731 }, { "epoch": 0.15027715048244714, "grad_norm": 1.9093786450265413, "kd_ratio": 0.5, "learning_rate": 1.925273073122685e-05, "loss": 1.4676482677459717, "loss/kd": 2.3936662673950195, "loss/lm": 0.5416303873062134, "step": 732 }, { "epoch": 0.15048244713611167, "grad_norm": 2.8644624182471663, "kd_ratio": 0.5, "learning_rate": 1.9250206222875785e-05, "loss": 1.1767945289611816, "loss/kd": 1.8462625741958618, "loss/lm": 0.5073263645172119, "step": 733 }, { "epoch": 0.15068774378977623, "grad_norm": 1.913413993433021, "kd_ratio": 0.5, "learning_rate": 1.924767762350695e-05, "loss": 1.4692442417144775, "loss/kd": 2.337515354156494, "loss/lm": 0.6009731888771057, "step": 734 }, { "epoch": 0.15089304044344076, "grad_norm": 1.6577124983089524, "kd_ratio": 0.5, "learning_rate": 1.924514493423864e-05, "loss": 1.363079309463501, "loss/kd": 2.2074527740478516, "loss/lm": 0.5187059640884399, "step": 735 }, { "epoch": 0.15109833709710532, "grad_norm": 2.455057045984639, "kd_ratio": 0.5, "learning_rate": 1.924260815619097e-05, "loss": 1.2480144500732422, "loss/kd": 2.0327112674713135, "loss/lm": 0.4633175730705261, "step": 736 }, { "epoch": 0.15130363375076986, "grad_norm": 2.280133623133325, "kd_ratio": 0.5, "learning_rate": 1.9240067290485865e-05, "loss": 1.3019754886627197, "loss/kd": 2.129760503768921, "loss/lm": 0.47419053316116333, "step": 737 }, { "epoch": 0.15150893040443442, "grad_norm": 1.3395718945521553, "kd_ratio": 0.5, "learning_rate": 1.9237522338247053e-05, "loss": 1.0448458194732666, "loss/kd": 1.5761184692382812, "loss/lm": 0.5135732293128967, "step": 738 }, { "epoch": 0.15171422705809895, "grad_norm": 2.888839049347668, "kd_ratio": 0.5, "learning_rate": 1.9234973300600074e-05, "loss": 1.3868894577026367, "loss/kd": 2.189049243927002, "loss/lm": 0.5847296118736267, "step": 739 }, { "epoch": 0.1519195237117635, "grad_norm": 3.594126610492624, "kd_ratio": 0.5, "learning_rate": 1.9232420178672263e-05, "loss": 1.3886523246765137, "loss/kd": 2.069004535675049, "loss/lm": 0.7083001136779785, "step": 740 }, { "epoch": 0.15212482036542804, "grad_norm": 0.9662953053551554, "kd_ratio": 0.5, "learning_rate": 1.9229862973592778e-05, "loss": 1.31399667263031, "loss/kd": 2.1988730430603027, "loss/lm": 0.429120272397995, "step": 741 }, { "epoch": 0.15233011701909258, "grad_norm": 3.9406332959138473, "kd_ratio": 0.5, "learning_rate": 1.922730168649257e-05, "loss": 1.1877102851867676, "loss/kd": 1.8890316486358643, "loss/lm": 0.48638904094696045, "step": 742 }, { "epoch": 0.15253541367275714, "grad_norm": 3.1369939583843793, "kd_ratio": 0.5, "learning_rate": 1.92247363185044e-05, "loss": 1.363332986831665, "loss/kd": 2.2314159870147705, "loss/lm": 0.4952501058578491, "step": 743 }, { "epoch": 0.15274071032642167, "grad_norm": 1.1535850849445477, "kd_ratio": 0.5, "learning_rate": 1.9222166870762833e-05, "loss": 1.7554823160171509, "loss/kd": 2.8812713623046875, "loss/lm": 0.6296932697296143, "step": 744 }, { "epoch": 0.15294600698008623, "grad_norm": 2.7279002330994655, "kd_ratio": 0.5, "learning_rate": 1.9219593344404242e-05, "loss": 1.3229995965957642, "loss/kd": 2.208712339401245, "loss/lm": 0.4372867941856384, "step": 745 }, { "epoch": 0.15315130363375076, "grad_norm": 1.9647904999841979, "kd_ratio": 0.5, "learning_rate": 1.92170157405668e-05, "loss": 1.1478785276412964, "loss/kd": 1.9260568618774414, "loss/lm": 0.36970025300979614, "step": 746 }, { "epoch": 0.15335660028741532, "grad_norm": 1.6262135494252643, "kd_ratio": 0.5, "learning_rate": 1.9214434060390484e-05, "loss": 1.0484198331832886, "loss/kd": 1.708614468574524, "loss/lm": 0.3882252275943756, "step": 747 }, { "epoch": 0.15356189694107986, "grad_norm": 2.2241918018638565, "kd_ratio": 0.5, "learning_rate": 1.9211848305017072e-05, "loss": 1.2529160976409912, "loss/kd": 1.9580111503601074, "loss/lm": 0.5478209257125854, "step": 748 }, { "epoch": 0.15376719359474442, "grad_norm": 2.266507562093413, "kd_ratio": 0.5, "learning_rate": 1.9209258475590146e-05, "loss": 1.358754277229309, "loss/kd": 2.145803689956665, "loss/lm": 0.5717049241065979, "step": 749 }, { "epoch": 0.15397249024840895, "grad_norm": 1.7958266149303592, "kd_ratio": 0.5, "learning_rate": 1.9206664573255095e-05, "loss": 1.1590654850006104, "loss/kd": 1.6884160041809082, "loss/lm": 0.629714846611023, "step": 750 }, { "epoch": 0.15417778690207348, "grad_norm": 2.3304966732971484, "kd_ratio": 0.5, "learning_rate": 1.9204066599159094e-05, "loss": 1.080683946609497, "loss/kd": 1.7776315212249756, "loss/lm": 0.383736252784729, "step": 751 }, { "epoch": 0.15438308355573804, "grad_norm": 2.491842109742185, "kd_ratio": 0.5, "learning_rate": 1.9201464554451142e-05, "loss": 1.5244309902191162, "loss/kd": 2.548150062561035, "loss/lm": 0.5007119178771973, "step": 752 }, { "epoch": 0.15458838020940258, "grad_norm": 2.3531020368221998, "kd_ratio": 0.5, "learning_rate": 1.9198858440282016e-05, "loss": 1.0697652101516724, "loss/kd": 1.6353074312210083, "loss/lm": 0.5042230486869812, "step": 753 }, { "epoch": 0.15479367686306714, "grad_norm": 1.3654481116948918, "kd_ratio": 0.5, "learning_rate": 1.9196248257804305e-05, "loss": 2.128002882003784, "loss/kd": 3.7859225273132324, "loss/lm": 0.4700833261013031, "step": 754 }, { "epoch": 0.15499897351673167, "grad_norm": 2.9050809009737857, "kd_ratio": 0.5, "learning_rate": 1.9193634008172396e-05, "loss": 1.1945925951004028, "loss/kd": 1.8257561922073364, "loss/lm": 0.563429057598114, "step": 755 }, { "epoch": 0.15520427017039623, "grad_norm": 1.2255812423565036, "kd_ratio": 0.5, "learning_rate": 1.919101569254247e-05, "loss": 1.529819369316101, "loss/kd": 2.4476170539855957, "loss/lm": 0.6120216846466064, "step": 756 }, { "epoch": 0.15540956682406076, "grad_norm": 2.4845471688062064, "kd_ratio": 0.5, "learning_rate": 1.9188393312072513e-05, "loss": 1.4818979501724243, "loss/kd": 2.470501661300659, "loss/lm": 0.4932941496372223, "step": 757 }, { "epoch": 0.15561486347772532, "grad_norm": 1.869334164169525, "kd_ratio": 0.5, "learning_rate": 1.9185766867922303e-05, "loss": 1.4285454750061035, "loss/kd": 2.2823729515075684, "loss/lm": 0.5747178792953491, "step": 758 }, { "epoch": 0.15582016013138986, "grad_norm": 1.664714175550213, "kd_ratio": 0.5, "learning_rate": 1.9183136361253417e-05, "loss": 1.2545782327651978, "loss/kd": 1.9169195890426636, "loss/lm": 0.5922368168830872, "step": 759 }, { "epoch": 0.15602545678505442, "grad_norm": 1.0273715766463531, "kd_ratio": 0.5, "learning_rate": 1.9180501793229228e-05, "loss": 1.2176275253295898, "loss/kd": 2.033721446990967, "loss/lm": 0.4015336036682129, "step": 760 }, { "epoch": 0.15623075343871895, "grad_norm": 1.9401087262822945, "kd_ratio": 0.5, "learning_rate": 1.917786316501491e-05, "loss": 1.3694231510162354, "loss/kd": 2.2399981021881104, "loss/lm": 0.4988481104373932, "step": 761 }, { "epoch": 0.15643605009238348, "grad_norm": 1.6208510994488399, "kd_ratio": 0.5, "learning_rate": 1.9175220477777425e-05, "loss": 1.20201575756073, "loss/kd": 1.9156665802001953, "loss/lm": 0.4883649945259094, "step": 762 }, { "epoch": 0.15664134674604804, "grad_norm": 1.824799010818013, "kd_ratio": 0.5, "learning_rate": 1.917257373268554e-05, "loss": 1.2675278186798096, "loss/kd": 2.094855546951294, "loss/lm": 0.4402001202106476, "step": 763 }, { "epoch": 0.15684664339971258, "grad_norm": 1.641805340477464, "kd_ratio": 0.5, "learning_rate": 1.91699229309098e-05, "loss": 1.1960077285766602, "loss/kd": 1.852177381515503, "loss/lm": 0.5398379564285278, "step": 764 }, { "epoch": 0.15705194005337714, "grad_norm": 2.0164903534022187, "kd_ratio": 0.5, "learning_rate": 1.9167268073622563e-05, "loss": 1.1581388711929321, "loss/kd": 1.841364860534668, "loss/lm": 0.47491294145584106, "step": 765 }, { "epoch": 0.15725723670704167, "grad_norm": 1.472996664812874, "kd_ratio": 0.5, "learning_rate": 1.9164609161997972e-05, "loss": 1.0523583889007568, "loss/kd": 1.5626088380813599, "loss/lm": 0.5421080589294434, "step": 766 }, { "epoch": 0.15746253336070623, "grad_norm": 1.9687572717664212, "kd_ratio": 0.5, "learning_rate": 1.916194619721196e-05, "loss": 1.3922057151794434, "loss/kd": 2.288695812225342, "loss/lm": 0.49571549892425537, "step": 767 }, { "epoch": 0.15766783001437076, "grad_norm": 1.4850926544917458, "kd_ratio": 0.5, "learning_rate": 1.9159279180442257e-05, "loss": 1.2156888246536255, "loss/kd": 1.980737566947937, "loss/lm": 0.45064008235931396, "step": 768 }, { "epoch": 0.15787312666803532, "grad_norm": 1.8273456467223714, "kd_ratio": 0.5, "learning_rate": 1.9156608112868388e-05, "loss": 1.2441142797470093, "loss/kd": 1.937146782875061, "loss/lm": 0.5510817766189575, "step": 769 }, { "epoch": 0.15807842332169986, "grad_norm": 2.0935950586903633, "kd_ratio": 0.5, "learning_rate": 1.915393299567166e-05, "loss": 1.3003077507019043, "loss/kd": 2.1254043579101562, "loss/lm": 0.4752110242843628, "step": 770 }, { "epoch": 0.1582837199753644, "grad_norm": 1.6838387529063974, "kd_ratio": 0.5, "learning_rate": 1.915125383003518e-05, "loss": 1.3481154441833496, "loss/kd": 2.2178330421447754, "loss/lm": 0.47839781641960144, "step": 771 }, { "epoch": 0.15848901662902895, "grad_norm": 2.735290038078483, "kd_ratio": 0.5, "learning_rate": 1.914857061714384e-05, "loss": 1.4289690256118774, "loss/kd": 2.3068149089813232, "loss/lm": 0.5511231422424316, "step": 772 }, { "epoch": 0.15869431328269348, "grad_norm": 1.1367400350383137, "kd_ratio": 0.5, "learning_rate": 1.914588335818433e-05, "loss": 1.1987611055374146, "loss/kd": 1.987466812133789, "loss/lm": 0.41005539894104004, "step": 773 }, { "epoch": 0.15889960993635804, "grad_norm": 2.389863001767214, "kd_ratio": 0.5, "learning_rate": 1.9143192054345114e-05, "loss": 1.195733666419983, "loss/kd": 1.81514573097229, "loss/lm": 0.576321542263031, "step": 774 }, { "epoch": 0.15910490659002258, "grad_norm": 1.0593172268666, "kd_ratio": 0.5, "learning_rate": 1.914049670681646e-05, "loss": 1.3814313411712646, "loss/kd": 2.236016035079956, "loss/lm": 0.5268467664718628, "step": 775 }, { "epoch": 0.15931020324368714, "grad_norm": 3.3350791941786437, "kd_ratio": 0.5, "learning_rate": 1.9137797316790417e-05, "loss": 1.3393528461456299, "loss/kd": 2.1284685134887695, "loss/lm": 0.5502372980117798, "step": 776 }, { "epoch": 0.15951549989735167, "grad_norm": 2.0569287671270438, "kd_ratio": 0.5, "learning_rate": 1.9135093885460826e-05, "loss": 1.2166781425476074, "loss/kd": 1.9710723161697388, "loss/lm": 0.4622838497161865, "step": 777 }, { "epoch": 0.15972079655101623, "grad_norm": 1.9192765035409296, "kd_ratio": 0.5, "learning_rate": 1.9132386414023306e-05, "loss": 1.5295597314834595, "loss/kd": 2.3061838150024414, "loss/lm": 0.7529355883598328, "step": 778 }, { "epoch": 0.15992609320468076, "grad_norm": 1.5546544529955826, "kd_ratio": 0.5, "learning_rate": 1.912967490367528e-05, "loss": 1.3302009105682373, "loss/kd": 2.208620071411133, "loss/lm": 0.4517817795276642, "step": 779 }, { "epoch": 0.1601313898583453, "grad_norm": 2.173187703206896, "kd_ratio": 0.5, "learning_rate": 1.912695935561594e-05, "loss": 1.1186130046844482, "loss/kd": 1.7665742635726929, "loss/lm": 0.470651775598526, "step": 780 }, { "epoch": 0.16033668651200986, "grad_norm": 1.905143525338707, "kd_ratio": 0.5, "learning_rate": 1.912423977104627e-05, "loss": 1.2846862077713013, "loss/kd": 2.039991617202759, "loss/lm": 0.5293807983398438, "step": 781 }, { "epoch": 0.1605419831656744, "grad_norm": 1.815081468004292, "kd_ratio": 0.5, "learning_rate": 1.9121516151169045e-05, "loss": 1.1259146928787231, "loss/kd": 1.7898967266082764, "loss/lm": 0.4619326591491699, "step": 782 }, { "epoch": 0.16074727981933895, "grad_norm": 1.4552612147676436, "kd_ratio": 0.5, "learning_rate": 1.9118788497188815e-05, "loss": 1.3679593801498413, "loss/kd": 2.1931002140045166, "loss/lm": 0.5428186058998108, "step": 783 }, { "epoch": 0.16095257647300348, "grad_norm": 1.2856510505256793, "kd_ratio": 0.5, "learning_rate": 1.911605681031192e-05, "loss": 1.4260519742965698, "loss/kd": 2.302248001098633, "loss/lm": 0.5498559474945068, "step": 784 }, { "epoch": 0.16115787312666804, "grad_norm": 1.5119611811881855, "kd_ratio": 0.5, "learning_rate": 1.9113321091746478e-05, "loss": 1.345354437828064, "loss/kd": 2.1422483921051025, "loss/lm": 0.5484605431556702, "step": 785 }, { "epoch": 0.16136316978033258, "grad_norm": 1.5755352149987947, "kd_ratio": 0.5, "learning_rate": 1.9110581342702403e-05, "loss": 1.0579473972320557, "loss/kd": 1.6626038551330566, "loss/lm": 0.4532909393310547, "step": 786 }, { "epoch": 0.16156846643399714, "grad_norm": 1.599125866450504, "kd_ratio": 0.5, "learning_rate": 1.9107837564391376e-05, "loss": 1.2584056854248047, "loss/kd": 1.9410563707351685, "loss/lm": 0.5757550597190857, "step": 787 }, { "epoch": 0.16177376308766167, "grad_norm": 1.3599341492450254, "kd_ratio": 0.5, "learning_rate": 1.9105089758026872e-05, "loss": 1.3827457427978516, "loss/kd": 2.2549540996551514, "loss/lm": 0.5105372667312622, "step": 788 }, { "epoch": 0.1619790597413262, "grad_norm": 1.0967077659157367, "kd_ratio": 0.5, "learning_rate": 1.9102337924824134e-05, "loss": 1.6388510465621948, "loss/kd": 2.682199239730835, "loss/lm": 0.5955027937889099, "step": 789 }, { "epoch": 0.16218435639499076, "grad_norm": 1.968078807055547, "kd_ratio": 0.5, "learning_rate": 1.9099582066000204e-05, "loss": 1.238771677017212, "loss/kd": 1.873626708984375, "loss/lm": 0.6039166450500488, "step": 790 }, { "epoch": 0.1623896530486553, "grad_norm": 2.919113617441476, "kd_ratio": 0.5, "learning_rate": 1.9096822182773887e-05, "loss": 1.4662142992019653, "loss/kd": 2.422398090362549, "loss/lm": 0.5100304484367371, "step": 791 }, { "epoch": 0.16259494970231986, "grad_norm": 3.01340178481669, "kd_ratio": 0.5, "learning_rate": 1.9094058276365782e-05, "loss": 1.3049590587615967, "loss/kd": 2.0698184967041016, "loss/lm": 0.540099561214447, "step": 792 }, { "epoch": 0.1628002463559844, "grad_norm": 1.6441722121174587, "kd_ratio": 0.5, "learning_rate": 1.9091290347998256e-05, "loss": 1.2886909246444702, "loss/kd": 2.0357866287231445, "loss/lm": 0.5415952205657959, "step": 793 }, { "epoch": 0.16300554300964895, "grad_norm": 1.4957594318194207, "kd_ratio": 0.5, "learning_rate": 1.908851839889546e-05, "loss": 1.4764655828475952, "loss/kd": 2.4060912132263184, "loss/lm": 0.5468398928642273, "step": 794 }, { "epoch": 0.16321083966331348, "grad_norm": 2.2914032802421413, "kd_ratio": 0.5, "learning_rate": 1.9085742430283322e-05, "loss": 1.4444068670272827, "loss/kd": 2.3151540756225586, "loss/lm": 0.5736595988273621, "step": 795 }, { "epoch": 0.16341613631697804, "grad_norm": 2.4939094979353147, "kd_ratio": 0.5, "learning_rate": 1.908296244338955e-05, "loss": 1.1692253351211548, "loss/kd": 1.7876218557357788, "loss/lm": 0.5508288145065308, "step": 796 }, { "epoch": 0.16362143297064258, "grad_norm": 1.3368317875789497, "kd_ratio": 0.5, "learning_rate": 1.9080178439443628e-05, "loss": 1.3516089916229248, "loss/kd": 2.2410454750061035, "loss/lm": 0.4621725380420685, "step": 797 }, { "epoch": 0.1638267296243071, "grad_norm": 1.7945194881786413, "kd_ratio": 0.5, "learning_rate": 1.9077390419676813e-05, "loss": 1.0567468404769897, "loss/kd": 1.6540309190750122, "loss/lm": 0.4594626724720001, "step": 798 }, { "epoch": 0.16403202627797167, "grad_norm": 2.8639600030402734, "kd_ratio": 0.5, "learning_rate": 1.907459838532215e-05, "loss": 1.1916539669036865, "loss/kd": 1.866633415222168, "loss/lm": 0.5166745781898499, "step": 799 }, { "epoch": 0.1642373229316362, "grad_norm": 2.414604180040847, "kd_ratio": 0.5, "learning_rate": 1.9071802337614436e-05, "loss": 1.2865490913391113, "loss/kd": 2.1147515773773193, "loss/lm": 0.45834648609161377, "step": 800 }, { "epoch": 0.16444261958530076, "grad_norm": 1.6007291410615296, "kd_ratio": 0.5, "learning_rate": 1.9069002277790268e-05, "loss": 1.1606870889663696, "loss/kd": 1.8419052362442017, "loss/lm": 0.4794689118862152, "step": 801 }, { "epoch": 0.1646479162389653, "grad_norm": 1.2455440747859194, "kd_ratio": 0.5, "learning_rate": 1.9066198207088004e-05, "loss": 1.0843290090560913, "loss/kd": 1.7492318153381348, "loss/lm": 0.419426292181015, "step": 802 }, { "epoch": 0.16485321289262986, "grad_norm": 1.236161200871825, "kd_ratio": 0.5, "learning_rate": 1.9063390126747778e-05, "loss": 1.3379762172698975, "loss/kd": 2.125781774520874, "loss/lm": 0.5501706004142761, "step": 803 }, { "epoch": 0.1650585095462944, "grad_norm": 2.3246768226900256, "kd_ratio": 0.5, "learning_rate": 1.9060578038011503e-05, "loss": 1.3253227472305298, "loss/kd": 2.2425429821014404, "loss/lm": 0.40810251235961914, "step": 804 }, { "epoch": 0.16526380619995895, "grad_norm": 2.737339380024289, "kd_ratio": 0.5, "learning_rate": 1.9057761942122854e-05, "loss": 1.280998706817627, "loss/kd": 2.077517032623291, "loss/lm": 0.4844802916049957, "step": 805 }, { "epoch": 0.16546910285362348, "grad_norm": 1.3296593784831294, "kd_ratio": 0.5, "learning_rate": 1.9054941840327287e-05, "loss": 1.3571360111236572, "loss/kd": 2.237670421600342, "loss/lm": 0.47660157084465027, "step": 806 }, { "epoch": 0.16567439950728804, "grad_norm": 1.9945758196621146, "kd_ratio": 0.5, "learning_rate": 1.9052117733872025e-05, "loss": 1.3456836938858032, "loss/kd": 2.203117609024048, "loss/lm": 0.4882497489452362, "step": 807 }, { "epoch": 0.16587969616095258, "grad_norm": 2.469068534948543, "kd_ratio": 0.5, "learning_rate": 1.904928962400607e-05, "loss": 1.2614295482635498, "loss/kd": 1.8888607025146484, "loss/lm": 0.6339982748031616, "step": 808 }, { "epoch": 0.1660849928146171, "grad_norm": 1.1332240727928677, "kd_ratio": 0.5, "learning_rate": 1.9046457511980175e-05, "loss": 1.3777170181274414, "loss/kd": 2.2753396034240723, "loss/lm": 0.4800943434238434, "step": 809 }, { "epoch": 0.16629028946828167, "grad_norm": 2.1481977375797734, "kd_ratio": 0.5, "learning_rate": 1.9043621399046894e-05, "loss": 1.186751365661621, "loss/kd": 1.824379801750183, "loss/lm": 0.5491228103637695, "step": 810 }, { "epoch": 0.1664955861219462, "grad_norm": 1.2669748058769261, "kd_ratio": 0.5, "learning_rate": 1.904078128646052e-05, "loss": 1.3100850582122803, "loss/kd": 2.1187539100646973, "loss/lm": 0.5014162659645081, "step": 811 }, { "epoch": 0.16670088277561076, "grad_norm": 2.7683221477507547, "kd_ratio": 0.5, "learning_rate": 1.9037937175477133e-05, "loss": 1.1684463024139404, "loss/kd": 1.8698930740356445, "loss/lm": 0.46699944138526917, "step": 812 }, { "epoch": 0.1669061794292753, "grad_norm": 2.980452981974217, "kd_ratio": 0.5, "learning_rate": 1.9035089067354573e-05, "loss": 1.221429467201233, "loss/kd": 1.9316784143447876, "loss/lm": 0.5111804604530334, "step": 813 }, { "epoch": 0.16711147608293986, "grad_norm": 1.825519316119304, "kd_ratio": 0.5, "learning_rate": 1.9032236963352454e-05, "loss": 1.1725679636001587, "loss/kd": 1.958310604095459, "loss/lm": 0.3868253231048584, "step": 814 }, { "epoch": 0.1673167727366044, "grad_norm": 1.16755683836992, "kd_ratio": 0.5, "learning_rate": 1.902938086473215e-05, "loss": 1.3928343057632446, "loss/kd": 2.3152713775634766, "loss/lm": 0.4703972041606903, "step": 815 }, { "epoch": 0.16752206939026895, "grad_norm": 1.8384740216521922, "kd_ratio": 0.5, "learning_rate": 1.9026520772756814e-05, "loss": 1.0733755826950073, "loss/kd": 1.6473076343536377, "loss/lm": 0.49944353103637695, "step": 816 }, { "epoch": 0.16772736604393348, "grad_norm": 1.9327861720499369, "kd_ratio": 0.5, "learning_rate": 1.902365668869135e-05, "loss": 1.1492787599563599, "loss/kd": 1.8701471090316772, "loss/lm": 0.42841044068336487, "step": 817 }, { "epoch": 0.16793266269759802, "grad_norm": 1.5977602348239395, "kd_ratio": 0.5, "learning_rate": 1.9020788613802435e-05, "loss": 1.1232988834381104, "loss/kd": 1.7244776487350464, "loss/lm": 0.5221200585365295, "step": 818 }, { "epoch": 0.16813795935126258, "grad_norm": 1.1894954034828922, "kd_ratio": 0.5, "learning_rate": 1.901791654935852e-05, "loss": 1.2789158821105957, "loss/kd": 2.1462018489837646, "loss/lm": 0.411629855632782, "step": 819 }, { "epoch": 0.1683432560049271, "grad_norm": 1.1423719165895796, "kd_ratio": 0.5, "learning_rate": 1.9015040496629792e-05, "loss": 1.4156993627548218, "loss/kd": 2.3417961597442627, "loss/lm": 0.489602655172348, "step": 820 }, { "epoch": 0.16854855265859167, "grad_norm": 1.420888865673212, "kd_ratio": 0.5, "learning_rate": 1.9012160456888235e-05, "loss": 1.2751657962799072, "loss/kd": 2.001755952835083, "loss/lm": 0.5485756993293762, "step": 821 }, { "epoch": 0.1687538493122562, "grad_norm": 1.1861175439541585, "kd_ratio": 0.5, "learning_rate": 1.900927643140758e-05, "loss": 1.4258941411972046, "loss/kd": 2.33050537109375, "loss/lm": 0.521282970905304, "step": 822 }, { "epoch": 0.16895914596592077, "grad_norm": 1.4462956419395991, "kd_ratio": 0.5, "learning_rate": 1.9006388421463322e-05, "loss": 1.166949987411499, "loss/kd": 1.8995507955551147, "loss/lm": 0.4343492388725281, "step": 823 }, { "epoch": 0.1691644426195853, "grad_norm": 1.5195071922026262, "kd_ratio": 0.5, "learning_rate": 1.9003496428332714e-05, "loss": 1.3054981231689453, "loss/kd": 2.1685614585876465, "loss/lm": 0.4424346685409546, "step": 824 }, { "epoch": 0.16936973927324986, "grad_norm": 1.8462030620878822, "kd_ratio": 0.5, "learning_rate": 1.9000600453294787e-05, "loss": 1.23576819896698, "loss/kd": 1.9317456483840942, "loss/lm": 0.539790689945221, "step": 825 }, { "epoch": 0.1695750359269144, "grad_norm": 1.3288386857116783, "kd_ratio": 0.5, "learning_rate": 1.8997700497630308e-05, "loss": 1.284054160118103, "loss/kd": 1.8930209875106812, "loss/lm": 0.6750873327255249, "step": 826 }, { "epoch": 0.16978033258057892, "grad_norm": 1.3661566582644595, "kd_ratio": 0.5, "learning_rate": 1.899479656262183e-05, "loss": 1.1762584447860718, "loss/kd": 1.8018865585327148, "loss/lm": 0.5506303310394287, "step": 827 }, { "epoch": 0.16998562923424348, "grad_norm": 1.8214728354921244, "kd_ratio": 0.5, "learning_rate": 1.899188864955365e-05, "loss": 1.3988211154937744, "loss/kd": 2.3468549251556396, "loss/lm": 0.45078742504119873, "step": 828 }, { "epoch": 0.17019092588790802, "grad_norm": 1.815136313165199, "kd_ratio": 0.5, "learning_rate": 1.898897675971182e-05, "loss": 1.3322213888168335, "loss/kd": 2.1028685569763184, "loss/lm": 0.5615741610527039, "step": 829 }, { "epoch": 0.17039622254157258, "grad_norm": 1.3890145124858455, "kd_ratio": 0.5, "learning_rate": 1.8986060894384174e-05, "loss": 1.131162166595459, "loss/kd": 1.7764333486557007, "loss/lm": 0.4858908951282501, "step": 830 }, { "epoch": 0.1706015191952371, "grad_norm": 1.482661044694518, "kd_ratio": 0.5, "learning_rate": 1.898314105486028e-05, "loss": 1.2795826196670532, "loss/kd": 2.064096450805664, "loss/lm": 0.49506884813308716, "step": 831 }, { "epoch": 0.17080681584890167, "grad_norm": 2.2689853301974576, "kd_ratio": 0.5, "learning_rate": 1.8980217242431473e-05, "loss": 1.1496843099594116, "loss/kd": 1.8583340644836426, "loss/lm": 0.44103458523750305, "step": 832 }, { "epoch": 0.1710121125025662, "grad_norm": 2.5408751153158486, "kd_ratio": 0.5, "learning_rate": 1.897728945839085e-05, "loss": 1.2846463918685913, "loss/kd": 2.0939080715179443, "loss/lm": 0.4753846228122711, "step": 833 }, { "epoch": 0.17121740915623077, "grad_norm": 1.1704414276841373, "kd_ratio": 0.5, "learning_rate": 1.8974357704033255e-05, "loss": 1.1840786933898926, "loss/kd": 1.9179978370666504, "loss/lm": 0.4501595199108124, "step": 834 }, { "epoch": 0.1714227058098953, "grad_norm": 2.186482805486249, "kd_ratio": 0.5, "learning_rate": 1.8971421980655295e-05, "loss": 1.1922948360443115, "loss/kd": 1.8005799055099487, "loss/lm": 0.5840098261833191, "step": 835 }, { "epoch": 0.17162800246355983, "grad_norm": 1.8428198125167758, "kd_ratio": 0.5, "learning_rate": 1.896848228955533e-05, "loss": 1.4408138990402222, "loss/kd": 2.3836522102355957, "loss/lm": 0.49797549843788147, "step": 836 }, { "epoch": 0.1718332991172244, "grad_norm": 1.2091843483096711, "kd_ratio": 0.5, "learning_rate": 1.896553863203347e-05, "loss": 1.181064248085022, "loss/kd": 1.8455703258514404, "loss/lm": 0.5165581703186035, "step": 837 }, { "epoch": 0.17203859577088892, "grad_norm": 1.9112590642985965, "kd_ratio": 0.5, "learning_rate": 1.8962591009391595e-05, "loss": 1.0101662874221802, "loss/kd": 1.6191538572311401, "loss/lm": 0.4011788070201874, "step": 838 }, { "epoch": 0.17224389242455349, "grad_norm": 1.82311284304679, "kd_ratio": 0.5, "learning_rate": 1.8959639422933316e-05, "loss": 1.3287580013275146, "loss/kd": 1.9753955602645874, "loss/lm": 0.6821205019950867, "step": 839 }, { "epoch": 0.17244918907821802, "grad_norm": 1.9505160868857196, "kd_ratio": 0.5, "learning_rate": 1.895668387396401e-05, "loss": 1.4356014728546143, "loss/kd": 2.384672164916992, "loss/lm": 0.4865308105945587, "step": 840 }, { "epoch": 0.17265448573188258, "grad_norm": 2.410488144631461, "kd_ratio": 0.5, "learning_rate": 1.8953724363790812e-05, "loss": 1.295626163482666, "loss/kd": 2.0762834548950195, "loss/lm": 0.5149689316749573, "step": 841 }, { "epoch": 0.1728597823855471, "grad_norm": 2.9955152227536064, "kd_ratio": 0.5, "learning_rate": 1.8950760893722595e-05, "loss": 1.3004230260849, "loss/kd": 2.1579232215881348, "loss/lm": 0.44292283058166504, "step": 842 }, { "epoch": 0.17306507903921167, "grad_norm": 1.5101701298653456, "kd_ratio": 0.5, "learning_rate": 1.894779346506999e-05, "loss": 1.1859240531921387, "loss/kd": 1.9418842792510986, "loss/lm": 0.42996394634246826, "step": 843 }, { "epoch": 0.1732703756928762, "grad_norm": 2.1191480013178756, "kd_ratio": 0.5, "learning_rate": 1.8944822079145385e-05, "loss": 1.1396950483322144, "loss/kd": 1.8248296976089478, "loss/lm": 0.45456036925315857, "step": 844 }, { "epoch": 0.17347567234654074, "grad_norm": 1.9486647179720888, "kd_ratio": 0.5, "learning_rate": 1.894184673726291e-05, "loss": 1.4281859397888184, "loss/kd": 2.1672143936157227, "loss/lm": 0.6891574263572693, "step": 845 }, { "epoch": 0.1736809690002053, "grad_norm": 1.8705312522987096, "kd_ratio": 0.5, "learning_rate": 1.8938867440738445e-05, "loss": 1.3075891733169556, "loss/kd": 2.10034441947937, "loss/lm": 0.5148339867591858, "step": 846 }, { "epoch": 0.17388626565386983, "grad_norm": 1.489670574458769, "kd_ratio": 0.5, "learning_rate": 1.893588419088962e-05, "loss": 1.3901426792144775, "loss/kd": 2.3877367973327637, "loss/lm": 0.39254850149154663, "step": 847 }, { "epoch": 0.1740915623075344, "grad_norm": 2.9937344661258933, "kd_ratio": 0.5, "learning_rate": 1.8932896989035814e-05, "loss": 1.19473397731781, "loss/kd": 1.951971173286438, "loss/lm": 0.4374968111515045, "step": 848 }, { "epoch": 0.17429685896119892, "grad_norm": 2.2748739458819363, "kd_ratio": 0.5, "learning_rate": 1.8929905836498155e-05, "loss": 1.2247787714004517, "loss/kd": 2.0018627643585205, "loss/lm": 0.44769471883773804, "step": 849 }, { "epoch": 0.17450215561486349, "grad_norm": 1.5542293489186274, "kd_ratio": 0.5, "learning_rate": 1.8926910734599516e-05, "loss": 1.2777317762374878, "loss/kd": 1.999848484992981, "loss/lm": 0.5556150674819946, "step": 850 }, { "epoch": 0.17470745226852802, "grad_norm": 1.9055755944125339, "kd_ratio": 0.5, "learning_rate": 1.892391168466452e-05, "loss": 1.17008376121521, "loss/kd": 1.812033772468567, "loss/lm": 0.5281338095664978, "step": 851 }, { "epoch": 0.17491274892219258, "grad_norm": 1.9603865054002596, "kd_ratio": 0.5, "learning_rate": 1.8920908688019532e-05, "loss": 1.3765835762023926, "loss/kd": 2.205564498901367, "loss/lm": 0.5476025938987732, "step": 852 }, { "epoch": 0.1751180455758571, "grad_norm": 1.5787462236130774, "kd_ratio": 0.5, "learning_rate": 1.8917901745992667e-05, "loss": 1.3134113550186157, "loss/kd": 2.0802736282348633, "loss/lm": 0.5465490818023682, "step": 853 }, { "epoch": 0.17532334222952167, "grad_norm": 1.7236342537214429, "kd_ratio": 0.5, "learning_rate": 1.8914890859913777e-05, "loss": 1.1467987298965454, "loss/kd": 1.8259482383728027, "loss/lm": 0.4676492214202881, "step": 854 }, { "epoch": 0.1755286388831862, "grad_norm": 2.219645760562214, "kd_ratio": 0.5, "learning_rate": 1.891187603111447e-05, "loss": 1.1671242713928223, "loss/kd": 1.8729643821716309, "loss/lm": 0.4612842798233032, "step": 855 }, { "epoch": 0.17573393553685074, "grad_norm": 1.449291343956787, "kd_ratio": 0.5, "learning_rate": 1.8908857260928083e-05, "loss": 1.2845112085342407, "loss/kd": 2.062326669692993, "loss/lm": 0.5066956877708435, "step": 856 }, { "epoch": 0.1759392321905153, "grad_norm": 1.680987160575823, "kd_ratio": 0.5, "learning_rate": 1.8905834550689713e-05, "loss": 1.483727216720581, "loss/kd": 2.4487314224243164, "loss/lm": 0.5187230706214905, "step": 857 }, { "epoch": 0.17614452884417983, "grad_norm": 2.61976266766821, "kd_ratio": 0.5, "learning_rate": 1.8902807901736185e-05, "loss": 1.3449783325195312, "loss/kd": 2.1825709342956543, "loss/lm": 0.5073856711387634, "step": 858 }, { "epoch": 0.1763498254978444, "grad_norm": 2.37558245461016, "kd_ratio": 0.5, "learning_rate": 1.8899777315406073e-05, "loss": 1.0952234268188477, "loss/kd": 1.6803019046783447, "loss/lm": 0.5101450681686401, "step": 859 }, { "epoch": 0.17655512215150893, "grad_norm": 1.0777728031705018, "kd_ratio": 0.5, "learning_rate": 1.8896742793039692e-05, "loss": 1.2237622737884521, "loss/kd": 1.9593887329101562, "loss/lm": 0.4881357252597809, "step": 860 }, { "epoch": 0.17676041880517349, "grad_norm": 1.7191613310419929, "kd_ratio": 0.5, "learning_rate": 1.8893704335979097e-05, "loss": 1.3694769144058228, "loss/kd": 2.183255434036255, "loss/lm": 0.5556984543800354, "step": 861 }, { "epoch": 0.17696571545883802, "grad_norm": 2.152536009603623, "kd_ratio": 0.5, "learning_rate": 1.8890661945568085e-05, "loss": 1.0327422618865967, "loss/kd": 1.6583703756332397, "loss/lm": 0.4071141481399536, "step": 862 }, { "epoch": 0.17717101211250258, "grad_norm": 1.0998893287274458, "kd_ratio": 0.5, "learning_rate": 1.8887615623152188e-05, "loss": 1.2037756443023682, "loss/kd": 1.9394901990890503, "loss/lm": 0.4680611491203308, "step": 863 }, { "epoch": 0.1773763087661671, "grad_norm": 1.7999032018523011, "kd_ratio": 0.5, "learning_rate": 1.888456537007868e-05, "loss": 2.1324844360351562, "loss/kd": 3.697164297103882, "loss/lm": 0.5678043365478516, "step": 864 }, { "epoch": 0.17758160541983165, "grad_norm": 1.7076115908268457, "kd_ratio": 0.5, "learning_rate": 1.888151118769657e-05, "loss": 1.1973490715026855, "loss/kd": 1.9351037740707397, "loss/lm": 0.4595944285392761, "step": 865 }, { "epoch": 0.1777869020734962, "grad_norm": 1.0990074958675442, "kd_ratio": 0.5, "learning_rate": 1.8878453077356616e-05, "loss": 1.1615575551986694, "loss/kd": 1.9501469135284424, "loss/lm": 0.3729681074619293, "step": 866 }, { "epoch": 0.17799219872716074, "grad_norm": 1.426644735913748, "kd_ratio": 0.5, "learning_rate": 1.88753910404113e-05, "loss": 1.3290526866912842, "loss/kd": 2.0726499557495117, "loss/lm": 0.5854552984237671, "step": 867 }, { "epoch": 0.1781974953808253, "grad_norm": 2.379935805535458, "kd_ratio": 0.5, "learning_rate": 1.887232507821484e-05, "loss": 1.1935489177703857, "loss/kd": 1.8871089220046997, "loss/lm": 0.499988853931427, "step": 868 }, { "epoch": 0.17840279203448983, "grad_norm": 1.5897468781783797, "kd_ratio": 0.5, "learning_rate": 1.8869255192123206e-05, "loss": 1.227349877357483, "loss/kd": 1.9959065914154053, "loss/lm": 0.4587932527065277, "step": 869 }, { "epoch": 0.1786080886881544, "grad_norm": 2.1558160462411426, "kd_ratio": 0.5, "learning_rate": 1.886618138349409e-05, "loss": 1.2484132051467896, "loss/kd": 1.9951469898223877, "loss/lm": 0.5016794204711914, "step": 870 }, { "epoch": 0.17881338534181893, "grad_norm": 3.085988625978358, "kd_ratio": 0.5, "learning_rate": 1.8863103653686917e-05, "loss": 1.4801629781723022, "loss/kd": 2.4392173290252686, "loss/lm": 0.5211086869239807, "step": 871 }, { "epoch": 0.1790186819954835, "grad_norm": 1.860696567089104, "kd_ratio": 0.5, "learning_rate": 1.8860022004062854e-05, "loss": 1.4612400531768799, "loss/kd": 2.322134256362915, "loss/lm": 0.6003458499908447, "step": 872 }, { "epoch": 0.17922397864914802, "grad_norm": 2.1986168231490804, "kd_ratio": 0.5, "learning_rate": 1.88569364359848e-05, "loss": 1.2903162240982056, "loss/kd": 1.8689301013946533, "loss/lm": 0.7117023468017578, "step": 873 }, { "epoch": 0.17942927530281255, "grad_norm": 1.5803974228210158, "kd_ratio": 0.5, "learning_rate": 1.8853846950817382e-05, "loss": 1.4088096618652344, "loss/kd": 2.257668972015381, "loss/lm": 0.5599504709243774, "step": 874 }, { "epoch": 0.1796345719564771, "grad_norm": 1.9790728806831688, "kd_ratio": 0.5, "learning_rate": 1.8850753549926967e-05, "loss": 1.232973575592041, "loss/kd": 1.877440094947815, "loss/lm": 0.5885071754455566, "step": 875 }, { "epoch": 0.17983986861014165, "grad_norm": 1.3542434702019523, "kd_ratio": 0.5, "learning_rate": 1.8847656234681647e-05, "loss": 1.2670094966888428, "loss/kd": 2.0986547470092773, "loss/lm": 0.4353641867637634, "step": 876 }, { "epoch": 0.1800451652638062, "grad_norm": 2.408626675808733, "kd_ratio": 0.5, "learning_rate": 1.884455500645125e-05, "loss": 1.326429009437561, "loss/kd": 2.167083501815796, "loss/lm": 0.48577451705932617, "step": 877 }, { "epoch": 0.18025046191747074, "grad_norm": 1.5770017439234942, "kd_ratio": 0.5, "learning_rate": 1.884144986660733e-05, "loss": 1.2873830795288086, "loss/kd": 1.9528931379318237, "loss/lm": 0.6218729615211487, "step": 878 }, { "epoch": 0.1804557585711353, "grad_norm": 2.5885540744615647, "kd_ratio": 0.5, "learning_rate": 1.8838340816523175e-05, "loss": 1.373990535736084, "loss/kd": 2.2433247566223145, "loss/lm": 0.5046563148498535, "step": 879 }, { "epoch": 0.18066105522479983, "grad_norm": 3.0753876392240054, "kd_ratio": 0.5, "learning_rate": 1.88352278575738e-05, "loss": 1.3701846599578857, "loss/kd": 2.1257545948028564, "loss/lm": 0.6146146059036255, "step": 880 }, { "epoch": 0.1808663518784644, "grad_norm": 1.6607598474788041, "kd_ratio": 0.5, "learning_rate": 1.8832110991135945e-05, "loss": 1.2306134700775146, "loss/kd": 1.9635690450668335, "loss/lm": 0.49765798449516296, "step": 881 }, { "epoch": 0.18107164853212893, "grad_norm": 1.6436898961859654, "kd_ratio": 0.5, "learning_rate": 1.8828990218588092e-05, "loss": 1.0189564228057861, "loss/kd": 1.6476855278015137, "loss/lm": 0.39022743701934814, "step": 882 }, { "epoch": 0.18127694518579346, "grad_norm": 1.4905556881992887, "kd_ratio": 0.5, "learning_rate": 1.8825865541310438e-05, "loss": 1.4607937335968018, "loss/kd": 2.5273079872131348, "loss/lm": 0.3942795991897583, "step": 883 }, { "epoch": 0.18148224183945802, "grad_norm": 2.2097965650243343, "kd_ratio": 0.5, "learning_rate": 1.8822736960684905e-05, "loss": 1.1215896606445312, "loss/kd": 1.7825037240982056, "loss/lm": 0.4606754779815674, "step": 884 }, { "epoch": 0.18168753849312255, "grad_norm": 1.309391126496168, "kd_ratio": 0.5, "learning_rate": 1.8819604478095153e-05, "loss": 1.1925387382507324, "loss/kd": 1.9830338954925537, "loss/lm": 0.40204349160194397, "step": 885 }, { "epoch": 0.1818928351467871, "grad_norm": 1.6970235020394124, "kd_ratio": 0.5, "learning_rate": 1.8816468094926553e-05, "loss": 1.1189055442810059, "loss/kd": 1.7631080150604248, "loss/lm": 0.47470319271087646, "step": 886 }, { "epoch": 0.18209813180045165, "grad_norm": 1.1813161556843315, "kd_ratio": 0.5, "learning_rate": 1.8813327812566217e-05, "loss": 1.3387306928634644, "loss/kd": 2.2496142387390137, "loss/lm": 0.4278472363948822, "step": 887 }, { "epoch": 0.1823034284541162, "grad_norm": 1.124324608960703, "kd_ratio": 0.5, "learning_rate": 1.8810183632402972e-05, "loss": 1.4252651929855347, "loss/kd": 2.1706619262695312, "loss/lm": 0.6798684000968933, "step": 888 }, { "epoch": 0.18250872510778074, "grad_norm": 1.478175917178407, "kd_ratio": 0.5, "learning_rate": 1.8807035555827367e-05, "loss": 1.4068267345428467, "loss/kd": 2.119518756866455, "loss/lm": 0.6941347718238831, "step": 889 }, { "epoch": 0.1827140217614453, "grad_norm": 1.9160956593234677, "kd_ratio": 0.5, "learning_rate": 1.880388358423168e-05, "loss": 1.1126189231872559, "loss/kd": 1.728317379951477, "loss/lm": 0.4969203472137451, "step": 890 }, { "epoch": 0.18291931841510983, "grad_norm": 1.3132217172035177, "kd_ratio": 0.5, "learning_rate": 1.880072771900991e-05, "loss": 1.1061638593673706, "loss/kd": 1.8098751306533813, "loss/lm": 0.4024525582790375, "step": 891 }, { "epoch": 0.18312461506877437, "grad_norm": 1.507607336016149, "kd_ratio": 0.5, "learning_rate": 1.8797567961557776e-05, "loss": 1.4566855430603027, "loss/kd": 2.497612953186035, "loss/lm": 0.4157581329345703, "step": 892 }, { "epoch": 0.18332991172243893, "grad_norm": 2.8089475713767813, "kd_ratio": 0.5, "learning_rate": 1.879440431327272e-05, "loss": 1.2510110139846802, "loss/kd": 2.0163989067077637, "loss/lm": 0.4856230616569519, "step": 893 }, { "epoch": 0.18353520837610346, "grad_norm": 2.6075867269540254, "kd_ratio": 0.5, "learning_rate": 1.8791236775553906e-05, "loss": 1.362593650817871, "loss/kd": 2.255751371383667, "loss/lm": 0.46943581104278564, "step": 894 }, { "epoch": 0.18374050502976802, "grad_norm": 1.345877765369567, "kd_ratio": 0.5, "learning_rate": 1.878806534980221e-05, "loss": 1.184347152709961, "loss/kd": 1.8196308612823486, "loss/lm": 0.5490635633468628, "step": 895 }, { "epoch": 0.18394580168343255, "grad_norm": 1.5324253615710741, "kd_ratio": 0.5, "learning_rate": 1.8784890037420245e-05, "loss": 1.2212353944778442, "loss/kd": 2.0128772258758545, "loss/lm": 0.429593563079834, "step": 896 }, { "epoch": 0.1841510983370971, "grad_norm": 2.4185183675139803, "kd_ratio": 0.5, "learning_rate": 1.8781710839812324e-05, "loss": 1.2588862180709839, "loss/kd": 2.06514573097229, "loss/lm": 0.4526267945766449, "step": 897 }, { "epoch": 0.18435639499076165, "grad_norm": 1.406803670692781, "kd_ratio": 0.5, "learning_rate": 1.8778527758384492e-05, "loss": 1.3251919746398926, "loss/kd": 2.1409921646118164, "loss/lm": 0.5093916654586792, "step": 898 }, { "epoch": 0.1845616916444262, "grad_norm": 1.6094479588153128, "kd_ratio": 0.5, "learning_rate": 1.8775340794544497e-05, "loss": 1.192732334136963, "loss/kd": 1.9728577136993408, "loss/lm": 0.4126070439815521, "step": 899 }, { "epoch": 0.18476698829809074, "grad_norm": 2.815030739219912, "kd_ratio": 0.5, "learning_rate": 1.8772149949701824e-05, "loss": 1.3604403734207153, "loss/kd": 2.2705776691436768, "loss/lm": 0.4503030776977539, "step": 900 }, { "epoch": 0.18497228495175527, "grad_norm": 2.1910689751395513, "kd_ratio": 0.5, "learning_rate": 1.8768955225267653e-05, "loss": 1.423772931098938, "loss/kd": 2.2718238830566406, "loss/lm": 0.5757219195365906, "step": 901 }, { "epoch": 0.18517758160541983, "grad_norm": 1.0694547721580498, "kd_ratio": 0.5, "learning_rate": 1.87657566226549e-05, "loss": 1.0837621688842773, "loss/kd": 1.779706597328186, "loss/lm": 0.3878178596496582, "step": 902 }, { "epoch": 0.18538287825908437, "grad_norm": 1.5689932412525747, "kd_ratio": 0.5, "learning_rate": 1.876255414327818e-05, "loss": 1.2935508489608765, "loss/kd": 2.105724811553955, "loss/lm": 0.48137691617012024, "step": 903 }, { "epoch": 0.18558817491274893, "grad_norm": 1.4016034513826656, "kd_ratio": 0.5, "learning_rate": 1.875934778855383e-05, "loss": 1.2014384269714355, "loss/kd": 2.0076754093170166, "loss/lm": 0.39520153403282166, "step": 904 }, { "epoch": 0.18579347156641346, "grad_norm": 1.269136202109844, "kd_ratio": 0.5, "learning_rate": 1.8756137559899904e-05, "loss": 1.1193217039108276, "loss/kd": 1.7129406929016113, "loss/lm": 0.525702714920044, "step": 905 }, { "epoch": 0.18599876822007802, "grad_norm": 1.91395791286549, "kd_ratio": 0.5, "learning_rate": 1.875292345873616e-05, "loss": 1.2151671648025513, "loss/kd": 1.961764931678772, "loss/lm": 0.4685693085193634, "step": 906 }, { "epoch": 0.18620406487374255, "grad_norm": 1.3452273282224294, "kd_ratio": 0.5, "learning_rate": 1.8749705486484074e-05, "loss": 1.099381685256958, "loss/kd": 1.7724287509918213, "loss/lm": 0.42633453011512756, "step": 907 }, { "epoch": 0.1864093615274071, "grad_norm": 1.1011168779938605, "kd_ratio": 0.5, "learning_rate": 1.8746483644566842e-05, "loss": 1.344096302986145, "loss/kd": 2.2623186111450195, "loss/lm": 0.4258740246295929, "step": 908 }, { "epoch": 0.18661465818107165, "grad_norm": 1.4472636070741678, "kd_ratio": 0.5, "learning_rate": 1.8743257934409352e-05, "loss": 1.1174352169036865, "loss/kd": 1.8110101222991943, "loss/lm": 0.4238602817058563, "step": 909 }, { "epoch": 0.1868199548347362, "grad_norm": 1.5209208336377829, "kd_ratio": 0.5, "learning_rate": 1.874002835743822e-05, "loss": 1.1584630012512207, "loss/kd": 1.8392339944839478, "loss/lm": 0.4776919484138489, "step": 910 }, { "epoch": 0.18702525148840074, "grad_norm": 1.3351721869220945, "kd_ratio": 0.5, "learning_rate": 1.8736794915081765e-05, "loss": 1.387655258178711, "loss/kd": 2.1306145191192627, "loss/lm": 0.644696056842804, "step": 911 }, { "epoch": 0.18723054814206527, "grad_norm": 1.2464661680222497, "kd_ratio": 0.5, "learning_rate": 1.873355760877002e-05, "loss": 1.144467830657959, "loss/kd": 1.873875379562378, "loss/lm": 0.4150604009628296, "step": 912 }, { "epoch": 0.18743584479572983, "grad_norm": 1.2810191587835078, "kd_ratio": 0.5, "learning_rate": 1.8730316439934723e-05, "loss": 1.348559856414795, "loss/kd": 2.3263111114501953, "loss/lm": 0.3708086907863617, "step": 913 }, { "epoch": 0.18764114144939437, "grad_norm": 1.1924491788441776, "kd_ratio": 0.5, "learning_rate": 1.8727071410009313e-05, "loss": 1.2651102542877197, "loss/kd": 2.049931526184082, "loss/lm": 0.4802889823913574, "step": 914 }, { "epoch": 0.18784643810305893, "grad_norm": 1.5098222585163754, "kd_ratio": 0.5, "learning_rate": 1.8723822520428954e-05, "loss": 1.2490224838256836, "loss/kd": 1.9494069814682007, "loss/lm": 0.5486379265785217, "step": 915 }, { "epoch": 0.18805173475672346, "grad_norm": 1.3108591125437736, "kd_ratio": 0.5, "learning_rate": 1.8720569772630505e-05, "loss": 1.3552671670913696, "loss/kd": 2.28546404838562, "loss/lm": 0.42507031559944153, "step": 916 }, { "epoch": 0.18825703141038802, "grad_norm": 1.2098200683158953, "kd_ratio": 0.5, "learning_rate": 1.871731316805253e-05, "loss": 1.241329550743103, "loss/kd": 1.8873213529586792, "loss/lm": 0.5953378081321716, "step": 917 }, { "epoch": 0.18846232806405255, "grad_norm": 1.0168587997783192, "kd_ratio": 0.5, "learning_rate": 1.8714052708135305e-05, "loss": 1.3679835796356201, "loss/kd": 2.223651170730591, "loss/lm": 0.5123160481452942, "step": 918 }, { "epoch": 0.18866762471771711, "grad_norm": 0.9852805078198205, "kd_ratio": 0.5, "learning_rate": 1.8710788394320807e-05, "loss": 1.2846803665161133, "loss/kd": 2.0174036026000977, "loss/lm": 0.5519571900367737, "step": 919 }, { "epoch": 0.18887292137138165, "grad_norm": 1.0604223403009794, "kd_ratio": 0.5, "learning_rate": 1.8707520228052726e-05, "loss": 1.1646779775619507, "loss/kd": 1.8080984354019165, "loss/lm": 0.5212574601173401, "step": 920 }, { "epoch": 0.18907821802504618, "grad_norm": 0.9857651942630495, "kd_ratio": 0.5, "learning_rate": 1.8704248210776442e-05, "loss": 1.2126392126083374, "loss/kd": 1.8399721384048462, "loss/lm": 0.5853062868118286, "step": 921 }, { "epoch": 0.18928351467871074, "grad_norm": 1.2892338673870207, "kd_ratio": 0.5, "learning_rate": 1.8700972343939042e-05, "loss": 2.033107280731201, "loss/kd": 3.734034776687622, "loss/lm": 0.3321800231933594, "step": 922 }, { "epoch": 0.18948881133237527, "grad_norm": 1.796038472569194, "kd_ratio": 0.5, "learning_rate": 1.8697692628989327e-05, "loss": 1.1501491069793701, "loss/kd": 1.927925705909729, "loss/lm": 0.3723726272583008, "step": 923 }, { "epoch": 0.18969410798603983, "grad_norm": 1.1859415229907027, "kd_ratio": 0.5, "learning_rate": 1.8694409067377788e-05, "loss": 1.3882503509521484, "loss/kd": 2.3247599601745605, "loss/lm": 0.4517408311367035, "step": 924 }, { "epoch": 0.18989940463970437, "grad_norm": 1.1137178556201535, "kd_ratio": 0.5, "learning_rate": 1.869112166055662e-05, "loss": 1.1948933601379395, "loss/kd": 1.9499523639678955, "loss/lm": 0.43983444571495056, "step": 925 }, { "epoch": 0.19010470129336893, "grad_norm": 1.1848035907930305, "kd_ratio": 0.5, "learning_rate": 1.8687830409979718e-05, "loss": 1.2357016801834106, "loss/kd": 2.0315916538238525, "loss/lm": 0.4398117959499359, "step": 926 }, { "epoch": 0.19030999794703346, "grad_norm": 1.394532432649141, "kd_ratio": 0.5, "learning_rate": 1.868453531710268e-05, "loss": 1.027358055114746, "loss/kd": 1.6038925647735596, "loss/lm": 0.45082345604896545, "step": 927 }, { "epoch": 0.19051529460069802, "grad_norm": 1.754832671829589, "kd_ratio": 0.5, "learning_rate": 1.8681236383382804e-05, "loss": 1.2097276449203491, "loss/kd": 1.916533350944519, "loss/lm": 0.5029218792915344, "step": 928 }, { "epoch": 0.19072059125436255, "grad_norm": 1.7457193423962158, "kd_ratio": 0.5, "learning_rate": 1.867793361027908e-05, "loss": 1.2490200996398926, "loss/kd": 1.8726463317871094, "loss/lm": 0.6253939867019653, "step": 929 }, { "epoch": 0.1909258879080271, "grad_norm": 1.1458228102197356, "kd_ratio": 0.5, "learning_rate": 1.8674626999252198e-05, "loss": 1.4104418754577637, "loss/kd": 2.2108418941497803, "loss/lm": 0.6100419759750366, "step": 930 }, { "epoch": 0.19113118456169165, "grad_norm": 1.4326015329382311, "kd_ratio": 0.5, "learning_rate": 1.8671316551764552e-05, "loss": 1.2393944263458252, "loss/kd": 2.1249172687530518, "loss/lm": 0.3538717031478882, "step": 931 }, { "epoch": 0.19133648121535618, "grad_norm": 1.4467954043440214, "kd_ratio": 0.5, "learning_rate": 1.8668002269280228e-05, "loss": 0.9768576622009277, "loss/kd": 1.4514522552490234, "loss/lm": 0.502263069152832, "step": 932 }, { "epoch": 0.19154177786902074, "grad_norm": 2.398160496041351, "kd_ratio": 0.5, "learning_rate": 1.866468415326501e-05, "loss": 1.2844693660736084, "loss/kd": 1.9954514503479004, "loss/lm": 0.5734872221946716, "step": 933 }, { "epoch": 0.19174707452268527, "grad_norm": 2.4388586993843857, "kd_ratio": 0.5, "learning_rate": 1.866136220518637e-05, "loss": 1.0661627054214478, "loss/kd": 1.7120636701583862, "loss/lm": 0.4202617108821869, "step": 934 }, { "epoch": 0.19195237117634983, "grad_norm": 1.1143637269980795, "kd_ratio": 0.5, "learning_rate": 1.865803642651348e-05, "loss": 1.389697551727295, "loss/kd": 2.2808139324188232, "loss/lm": 0.4985812306404114, "step": 935 }, { "epoch": 0.19215766783001437, "grad_norm": 1.7513462140670004, "kd_ratio": 0.5, "learning_rate": 1.8654706818717218e-05, "loss": 1.244870662689209, "loss/kd": 2.028671979904175, "loss/lm": 0.4610694646835327, "step": 936 }, { "epoch": 0.19236296448367893, "grad_norm": 1.8808314686019036, "kd_ratio": 0.5, "learning_rate": 1.8651373383270132e-05, "loss": 1.5631253719329834, "loss/kd": 2.424678087234497, "loss/lm": 0.7015726566314697, "step": 937 }, { "epoch": 0.19256826113734346, "grad_norm": 2.152372963795462, "kd_ratio": 0.5, "learning_rate": 1.8648036121646474e-05, "loss": 1.465088129043579, "loss/kd": 2.531298875808716, "loss/lm": 0.3988772928714752, "step": 938 }, { "epoch": 0.192773557791008, "grad_norm": 1.5924079953798944, "kd_ratio": 0.5, "learning_rate": 1.8644695035322203e-05, "loss": 1.2206940650939941, "loss/kd": 1.9811733961105347, "loss/lm": 0.4602147340774536, "step": 939 }, { "epoch": 0.19297885444467255, "grad_norm": 1.5066661029321684, "kd_ratio": 0.5, "learning_rate": 1.864135012577494e-05, "loss": 1.0330451726913452, "loss/kd": 1.6391271352767944, "loss/lm": 0.42696329951286316, "step": 940 }, { "epoch": 0.1931841510983371, "grad_norm": 1.4597281188658213, "kd_ratio": 0.5, "learning_rate": 1.8638001394484026e-05, "loss": 0.9793672561645508, "loss/kd": 1.6057488918304443, "loss/lm": 0.3529856204986572, "step": 941 }, { "epoch": 0.19338944775200165, "grad_norm": 2.600301599842269, "kd_ratio": 0.5, "learning_rate": 1.8634648842930466e-05, "loss": 1.3194169998168945, "loss/kd": 2.1494383811950684, "loss/lm": 0.4893956482410431, "step": 942 }, { "epoch": 0.19359474440566618, "grad_norm": 2.078290207371806, "kd_ratio": 0.5, "learning_rate": 1.8631292472596978e-05, "loss": 1.2928340435028076, "loss/kd": 2.164715528488159, "loss/lm": 0.4209524393081665, "step": 943 }, { "epoch": 0.19380004105933074, "grad_norm": 1.4351832531171262, "kd_ratio": 0.5, "learning_rate": 1.862793228496795e-05, "loss": 1.1018868684768677, "loss/kd": 1.7695884704589844, "loss/lm": 0.43418535590171814, "step": 944 }, { "epoch": 0.19400533771299527, "grad_norm": 1.819443819650502, "kd_ratio": 0.5, "learning_rate": 1.8624568281529473e-05, "loss": 1.2959051132202148, "loss/kd": 2.1492583751678467, "loss/lm": 0.4425518214702606, "step": 945 }, { "epoch": 0.19421063436665983, "grad_norm": 1.4870393425390342, "kd_ratio": 0.5, "learning_rate": 1.8621200463769313e-05, "loss": 1.0619553327560425, "loss/kd": 1.6785017251968384, "loss/lm": 0.4454089403152466, "step": 946 }, { "epoch": 0.19441593102032437, "grad_norm": 1.6464659341555565, "kd_ratio": 0.5, "learning_rate": 1.8617828833176935e-05, "loss": 1.404083013534546, "loss/kd": 2.332869529724121, "loss/lm": 0.4752964377403259, "step": 947 }, { "epoch": 0.1946212276739889, "grad_norm": 1.028018170510118, "kd_ratio": 0.5, "learning_rate": 1.8614453391243482e-05, "loss": 1.2399840354919434, "loss/kd": 1.964528203010559, "loss/lm": 0.5154399275779724, "step": 948 }, { "epoch": 0.19482652432765346, "grad_norm": 1.6656181431953558, "kd_ratio": 0.5, "learning_rate": 1.861107413946178e-05, "loss": 1.1781642436981201, "loss/kd": 1.8816066980361938, "loss/lm": 0.4747218191623688, "step": 949 }, { "epoch": 0.195031820981318, "grad_norm": 1.3167375130615744, "kd_ratio": 0.5, "learning_rate": 1.860769107932635e-05, "loss": 1.1343404054641724, "loss/kd": 1.7313299179077148, "loss/lm": 0.5373509526252747, "step": 950 }, { "epoch": 0.19523711763498255, "grad_norm": 1.8836280228680085, "kd_ratio": 0.5, "learning_rate": 1.860430421233339e-05, "loss": 1.0829741954803467, "loss/kd": 1.7341192960739136, "loss/lm": 0.4318291246891022, "step": 951 }, { "epoch": 0.1954424142886471, "grad_norm": 2.2830071184524714, "kd_ratio": 0.5, "learning_rate": 1.8600913539980786e-05, "loss": 1.1782892942428589, "loss/kd": 1.8890224695205688, "loss/lm": 0.4675561785697937, "step": 952 }, { "epoch": 0.19564771094231165, "grad_norm": 1.3525076885144574, "kd_ratio": 0.5, "learning_rate": 1.8597519063768104e-05, "loss": 1.3203198909759521, "loss/kd": 2.0495622158050537, "loss/lm": 0.5910775065422058, "step": 953 }, { "epoch": 0.19585300759597618, "grad_norm": 1.4380633314899351, "kd_ratio": 0.5, "learning_rate": 1.859412078519659e-05, "loss": 1.2279378175735474, "loss/kd": 2.00547456741333, "loss/lm": 0.4504011273384094, "step": 954 }, { "epoch": 0.19605830424964074, "grad_norm": 1.3434731383375815, "kd_ratio": 0.5, "learning_rate": 1.859071870576918e-05, "loss": 1.3503855466842651, "loss/kd": 2.264866352081299, "loss/lm": 0.43590468168258667, "step": 955 }, { "epoch": 0.19626360090330527, "grad_norm": 1.6272873176181446, "kd_ratio": 0.5, "learning_rate": 1.858731282699048e-05, "loss": 1.1098811626434326, "loss/kd": 1.6666009426116943, "loss/lm": 0.5531613826751709, "step": 956 }, { "epoch": 0.19646889755696983, "grad_norm": 1.3389507335563464, "kd_ratio": 0.5, "learning_rate": 1.8583903150366785e-05, "loss": 1.965562105178833, "loss/kd": 3.581740617752075, "loss/lm": 0.34938350319862366, "step": 957 }, { "epoch": 0.19667419421063437, "grad_norm": 1.4041048151960456, "kd_ratio": 0.5, "learning_rate": 1.8580489677406064e-05, "loss": 0.8940942287445068, "loss/kd": 1.369506597518921, "loss/lm": 0.4186818301677704, "step": 958 }, { "epoch": 0.1968794908642989, "grad_norm": 1.5079878355214962, "kd_ratio": 0.5, "learning_rate": 1.857707240961797e-05, "loss": 0.9880449771881104, "loss/kd": 1.6059532165527344, "loss/lm": 0.37013670802116394, "step": 959 }, { "epoch": 0.19708478751796346, "grad_norm": 1.7997235809186098, "kd_ratio": 0.5, "learning_rate": 1.857365134851383e-05, "loss": 1.128889799118042, "loss/kd": 1.7575727701187134, "loss/lm": 0.500206708908081, "step": 960 }, { "epoch": 0.197290084171628, "grad_norm": 2.441247151560499, "kd_ratio": 0.5, "learning_rate": 1.8570226495606657e-05, "loss": 1.2276519536972046, "loss/kd": 1.9783085584640503, "loss/lm": 0.4769953191280365, "step": 961 }, { "epoch": 0.19749538082529255, "grad_norm": 1.9542678227634636, "kd_ratio": 0.5, "learning_rate": 1.8566797852411124e-05, "loss": 1.303932547569275, "loss/kd": 2.0225236415863037, "loss/lm": 0.5853415131568909, "step": 962 }, { "epoch": 0.1977006774789571, "grad_norm": 1.6127694529266448, "kd_ratio": 0.5, "learning_rate": 1.8563365420443594e-05, "loss": 1.1728713512420654, "loss/kd": 1.9736831188201904, "loss/lm": 0.37205955386161804, "step": 963 }, { "epoch": 0.19790597413262165, "grad_norm": 1.100822080561681, "kd_ratio": 0.5, "learning_rate": 1.8559929201222107e-05, "loss": 1.1369597911834717, "loss/kd": 1.868516445159912, "loss/lm": 0.4054030776023865, "step": 964 }, { "epoch": 0.19811127078628618, "grad_norm": 1.5700344657730976, "kd_ratio": 0.5, "learning_rate": 1.855648919626637e-05, "loss": 1.5860669612884521, "loss/kd": 2.662367343902588, "loss/lm": 0.5097665786743164, "step": 965 }, { "epoch": 0.19831656743995074, "grad_norm": 2.3783467539444105, "kd_ratio": 0.5, "learning_rate": 1.855304540709777e-05, "loss": 1.257372260093689, "loss/kd": 2.135038137435913, "loss/lm": 0.3797062933444977, "step": 966 }, { "epoch": 0.19852186409361527, "grad_norm": 2.721267514863075, "kd_ratio": 0.5, "learning_rate": 1.854959783523936e-05, "loss": 1.206997275352478, "loss/kd": 1.9454147815704346, "loss/lm": 0.46857982873916626, "step": 967 }, { "epoch": 0.1987271607472798, "grad_norm": 2.144243271937135, "kd_ratio": 0.5, "learning_rate": 1.8546146482215875e-05, "loss": 1.029984712600708, "loss/kd": 1.671467900276184, "loss/lm": 0.3885016143321991, "step": 968 }, { "epoch": 0.19893245740094437, "grad_norm": 1.4620911819746447, "kd_ratio": 0.5, "learning_rate": 1.854269134955372e-05, "loss": 1.3344398736953735, "loss/kd": 2.2457215785980225, "loss/lm": 0.42315807938575745, "step": 969 }, { "epoch": 0.1991377540546089, "grad_norm": 1.5512207042820982, "kd_ratio": 0.5, "learning_rate": 1.8539232438780964e-05, "loss": 1.1125397682189941, "loss/kd": 1.7528800964355469, "loss/lm": 0.472199410200119, "step": 970 }, { "epoch": 0.19934305070827346, "grad_norm": 1.8841798294635679, "kd_ratio": 0.5, "learning_rate": 1.853576975142736e-05, "loss": 1.3140252828598022, "loss/kd": 2.109830379486084, "loss/lm": 0.5182201266288757, "step": 971 }, { "epoch": 0.199548347361938, "grad_norm": 1.9144309362627632, "kd_ratio": 0.5, "learning_rate": 1.853230328902432e-05, "loss": 1.2656036615371704, "loss/kd": 1.9990488290786743, "loss/lm": 0.5321584343910217, "step": 972 }, { "epoch": 0.19975364401560256, "grad_norm": 1.6642918816962, "kd_ratio": 0.5, "learning_rate": 1.852883305310493e-05, "loss": 1.2497297525405884, "loss/kd": 2.0883901119232178, "loss/lm": 0.41106948256492615, "step": 973 }, { "epoch": 0.1999589406692671, "grad_norm": 1.1070465843317563, "kd_ratio": 0.5, "learning_rate": 1.8525359045203944e-05, "loss": 1.3316948413848877, "loss/kd": 2.1574435234069824, "loss/lm": 0.5059462785720825, "step": 974 }, { "epoch": 0.20016423732293165, "grad_norm": 1.2713445197114035, "kd_ratio": 0.5, "learning_rate": 1.852188126685779e-05, "loss": 1.2755337953567505, "loss/kd": 2.087367534637451, "loss/lm": 0.46369999647140503, "step": 975 }, { "epoch": 0.20036953397659618, "grad_norm": 1.1417596361373574, "kd_ratio": 0.5, "learning_rate": 1.8518399719604554e-05, "loss": 1.407732367515564, "loss/kd": 2.424736261367798, "loss/lm": 0.39072853326797485, "step": 976 }, { "epoch": 0.20057483063026071, "grad_norm": 1.1773136577537342, "kd_ratio": 0.5, "learning_rate": 1.8514914404983992e-05, "loss": 1.141545057296753, "loss/kd": 1.6358033418655396, "loss/lm": 0.6472867131233215, "step": 977 }, { "epoch": 0.20078012728392527, "grad_norm": 1.8741867436687039, "kd_ratio": 0.5, "learning_rate": 1.851142532453753e-05, "loss": 1.0187674760818481, "loss/kd": 1.6460695266723633, "loss/lm": 0.39146551489830017, "step": 978 }, { "epoch": 0.2009854239375898, "grad_norm": 1.2322569263366507, "kd_ratio": 0.5, "learning_rate": 1.8507932479808254e-05, "loss": 1.4325642585754395, "loss/kd": 2.374480962753296, "loss/lm": 0.49064764380455017, "step": 979 }, { "epoch": 0.20119072059125437, "grad_norm": 1.3584317082142214, "kd_ratio": 0.5, "learning_rate": 1.8504435872340924e-05, "loss": 1.4438120126724243, "loss/kd": 2.370584011077881, "loss/lm": 0.5170400142669678, "step": 980 }, { "epoch": 0.2013960172449189, "grad_norm": 1.0830310777399454, "kd_ratio": 0.5, "learning_rate": 1.850093550368195e-05, "loss": 1.229151725769043, "loss/kd": 2.053316831588745, "loss/lm": 0.4049866795539856, "step": 981 }, { "epoch": 0.20160131389858346, "grad_norm": 1.3789981525239707, "kd_ratio": 0.5, "learning_rate": 1.8497431375379417e-05, "loss": 1.3284375667572021, "loss/kd": 2.095475196838379, "loss/lm": 0.5613998770713806, "step": 982 }, { "epoch": 0.201806610552248, "grad_norm": 2.539722366057221, "kd_ratio": 0.5, "learning_rate": 1.8493923488983066e-05, "loss": 1.0446033477783203, "loss/kd": 1.6309314966201782, "loss/lm": 0.45827516913414, "step": 983 }, { "epoch": 0.20201190720591256, "grad_norm": 1.5887936829608054, "kd_ratio": 0.5, "learning_rate": 1.8490411846044313e-05, "loss": 1.1143549680709839, "loss/kd": 1.7145240306854248, "loss/lm": 0.5141858458518982, "step": 984 }, { "epoch": 0.2022172038595771, "grad_norm": 1.7818240376915557, "kd_ratio": 0.5, "learning_rate": 1.848689644811621e-05, "loss": 1.0371904373168945, "loss/kd": 1.6044830083847046, "loss/lm": 0.46989795565605164, "step": 985 }, { "epoch": 0.20242250051324162, "grad_norm": 2.3953550476054346, "kd_ratio": 0.5, "learning_rate": 1.84833772967535e-05, "loss": 1.1754932403564453, "loss/kd": 1.927876591682434, "loss/lm": 0.4231100082397461, "step": 986 }, { "epoch": 0.20262779716690618, "grad_norm": 1.298192687760742, "kd_ratio": 0.5, "learning_rate": 1.847985439351256e-05, "loss": 1.1694369316101074, "loss/kd": 1.8863176107406616, "loss/lm": 0.45255616307258606, "step": 987 }, { "epoch": 0.20283309382057071, "grad_norm": 1.2321867932612955, "kd_ratio": 0.5, "learning_rate": 1.847632773995144e-05, "loss": 1.2630664110183716, "loss/kd": 2.1088664531707764, "loss/lm": 0.4172663986682892, "step": 988 }, { "epoch": 0.20303839047423528, "grad_norm": 1.4823989650695815, "kd_ratio": 0.5, "learning_rate": 1.8472797337629852e-05, "loss": 1.4198119640350342, "loss/kd": 2.245142936706543, "loss/lm": 0.5944809317588806, "step": 989 }, { "epoch": 0.2032436871278998, "grad_norm": 1.3162517893106436, "kd_ratio": 0.5, "learning_rate": 1.8469263188109153e-05, "loss": 1.1049623489379883, "loss/kd": 1.8016947507858276, "loss/lm": 0.40822985768318176, "step": 990 }, { "epoch": 0.20344898378156437, "grad_norm": 1.4350486273750713, "kd_ratio": 0.5, "learning_rate": 1.846572529295237e-05, "loss": 1.0791875123977661, "loss/kd": 1.7120001316070557, "loss/lm": 0.4463749825954437, "step": 991 }, { "epoch": 0.2036542804352289, "grad_norm": 1.1661398249805481, "kd_ratio": 0.5, "learning_rate": 1.8462183653724173e-05, "loss": 1.4300403594970703, "loss/kd": 2.3730430603027344, "loss/lm": 0.4870375394821167, "step": 992 }, { "epoch": 0.20385957708889346, "grad_norm": 1.0278908615972842, "kd_ratio": 0.5, "learning_rate": 1.84586382719909e-05, "loss": 1.278088927268982, "loss/kd": 1.9592190980911255, "loss/lm": 0.5969587564468384, "step": 993 }, { "epoch": 0.204064873742558, "grad_norm": 0.9674861928610722, "kd_ratio": 0.5, "learning_rate": 1.8455089149320544e-05, "loss": 1.3164942264556885, "loss/kd": 2.118014097213745, "loss/lm": 0.5149744153022766, "step": 994 }, { "epoch": 0.20427017039622253, "grad_norm": 0.9812582250725269, "kd_ratio": 0.5, "learning_rate": 1.845153628728274e-05, "loss": 1.4139891862869263, "loss/kd": 2.4208319187164307, "loss/lm": 0.4071464538574219, "step": 995 }, { "epoch": 0.2044754670498871, "grad_norm": 0.9820630482499438, "kd_ratio": 0.5, "learning_rate": 1.8447979687448795e-05, "loss": 1.2829678058624268, "loss/kd": 2.087597370147705, "loss/lm": 0.4783382713794708, "step": 996 }, { "epoch": 0.20468076370355162, "grad_norm": 1.2268416900369765, "kd_ratio": 0.5, "learning_rate": 1.8444419351391646e-05, "loss": 1.322801113128662, "loss/kd": 2.062511920928955, "loss/lm": 0.5830902457237244, "step": 997 }, { "epoch": 0.20488606035721618, "grad_norm": 1.5039755185298125, "kd_ratio": 0.5, "learning_rate": 1.8440855280685907e-05, "loss": 1.2294398546218872, "loss/kd": 1.8327741622924805, "loss/lm": 0.6261056065559387, "step": 998 }, { "epoch": 0.20509135701088071, "grad_norm": 0.9936597113191141, "kd_ratio": 0.5, "learning_rate": 1.8437287476907828e-05, "loss": 1.2389756441116333, "loss/kd": 1.951321005821228, "loss/lm": 0.5266302824020386, "step": 999 }, { "epoch": 0.20529665366454528, "grad_norm": 1.4614912768468258, "kd_ratio": 0.5, "learning_rate": 1.8433715941635317e-05, "loss": 1.152726411819458, "loss/kd": 1.887198805809021, "loss/lm": 0.41825392842292786, "step": 1000 }, { "epoch": 0.2055019503182098, "grad_norm": 1.7296691276760285, "kd_ratio": 0.5, "learning_rate": 1.8430140676447923e-05, "loss": 0.9520840644836426, "loss/kd": 1.466536045074463, "loss/lm": 0.43763211369514465, "step": 1001 }, { "epoch": 0.20570724697187437, "grad_norm": 1.9232344239280077, "kd_ratio": 0.5, "learning_rate": 1.8426561682926857e-05, "loss": 1.3211703300476074, "loss/kd": 1.9770143032073975, "loss/lm": 0.6653263568878174, "step": 1002 }, { "epoch": 0.2059125436255389, "grad_norm": 2.0365905615388864, "kd_ratio": 0.5, "learning_rate": 1.842297896265497e-05, "loss": 1.1357046365737915, "loss/kd": 1.745933175086975, "loss/lm": 0.5254760980606079, "step": 1003 }, { "epoch": 0.20611784027920346, "grad_norm": 1.5944351843206133, "kd_ratio": 0.5, "learning_rate": 1.8419392517216763e-05, "loss": 1.2470041513442993, "loss/kd": 2.1147890090942383, "loss/lm": 0.37921929359436035, "step": 1004 }, { "epoch": 0.206323136932868, "grad_norm": 1.0513776747922934, "kd_ratio": 0.5, "learning_rate": 1.841580234819839e-05, "loss": 1.1659272909164429, "loss/kd": 1.8601888418197632, "loss/lm": 0.4716656804084778, "step": 1005 }, { "epoch": 0.20652843358653253, "grad_norm": 1.4196514313605468, "kd_ratio": 0.5, "learning_rate": 1.8412208457187648e-05, "loss": 0.980004072189331, "loss/kd": 1.5656335353851318, "loss/lm": 0.3943746089935303, "step": 1006 }, { "epoch": 0.2067337302401971, "grad_norm": 1.6864512177115079, "kd_ratio": 0.5, "learning_rate": 1.8408610845773974e-05, "loss": 1.233079433441162, "loss/kd": 2.0091373920440674, "loss/lm": 0.45702141523361206, "step": 1007 }, { "epoch": 0.20693902689386162, "grad_norm": 1.4551117846016135, "kd_ratio": 0.5, "learning_rate": 1.840500951554846e-05, "loss": 1.1713578701019287, "loss/kd": 1.8653618097305298, "loss/lm": 0.4773540198802948, "step": 1008 }, { "epoch": 0.20714432354752618, "grad_norm": 1.0662381363073223, "kd_ratio": 0.5, "learning_rate": 1.8401404468103838e-05, "loss": 1.315654993057251, "loss/kd": 2.1239945888519287, "loss/lm": 0.5073152780532837, "step": 1009 }, { "epoch": 0.20734962020119072, "grad_norm": 1.228530021301926, "kd_ratio": 0.5, "learning_rate": 1.8397795705034482e-05, "loss": 1.1259430646896362, "loss/kd": 1.8328309059143066, "loss/lm": 0.4190552830696106, "step": 1010 }, { "epoch": 0.20755491685485528, "grad_norm": 1.3908679713900012, "kd_ratio": 0.5, "learning_rate": 1.8394183227936418e-05, "loss": 1.2278364896774292, "loss/kd": 1.984952688217163, "loss/lm": 0.4707202613353729, "step": 1011 }, { "epoch": 0.2077602135085198, "grad_norm": 1.5989739398216658, "kd_ratio": 0.5, "learning_rate": 1.8390567038407306e-05, "loss": 1.1607177257537842, "loss/kd": 1.8575623035430908, "loss/lm": 0.46387311816215515, "step": 1012 }, { "epoch": 0.20796551016218437, "grad_norm": 1.7428514642165325, "kd_ratio": 0.5, "learning_rate": 1.838694713804645e-05, "loss": 1.423801302909851, "loss/kd": 2.4611892700195312, "loss/lm": 0.3864133059978485, "step": 1013 }, { "epoch": 0.2081708068158489, "grad_norm": 1.400998488471713, "kd_ratio": 0.5, "learning_rate": 1.838332352845479e-05, "loss": 1.2728352546691895, "loss/kd": 2.112288236618042, "loss/lm": 0.43338221311569214, "step": 1014 }, { "epoch": 0.20837610346951344, "grad_norm": 1.0076739292541077, "kd_ratio": 0.5, "learning_rate": 1.8379696211234918e-05, "loss": 1.0574898719787598, "loss/kd": 1.7144283056259155, "loss/lm": 0.400551438331604, "step": 1015 }, { "epoch": 0.208581400123178, "grad_norm": 1.179273010633787, "kd_ratio": 0.5, "learning_rate": 1.837606518799106e-05, "loss": 1.1421759128570557, "loss/kd": 1.8062074184417725, "loss/lm": 0.47814440727233887, "step": 1016 }, { "epoch": 0.20878669677684253, "grad_norm": 1.4089724906869932, "kd_ratio": 0.5, "learning_rate": 1.837243046032908e-05, "loss": 1.2893974781036377, "loss/kd": 2.0721468925476074, "loss/lm": 0.5066481232643127, "step": 1017 }, { "epoch": 0.2089919934305071, "grad_norm": 2.0189661819604963, "kd_ratio": 0.5, "learning_rate": 1.8368792029856482e-05, "loss": 1.0494441986083984, "loss/kd": 1.5920590162277222, "loss/lm": 0.5068294405937195, "step": 1018 }, { "epoch": 0.20919729008417162, "grad_norm": 2.4342667367435293, "kd_ratio": 0.5, "learning_rate": 1.8365149898182403e-05, "loss": 1.2022478580474854, "loss/kd": 1.9719911813735962, "loss/lm": 0.43250444531440735, "step": 1019 }, { "epoch": 0.20940258673783618, "grad_norm": 1.5488035860688132, "kd_ratio": 0.5, "learning_rate": 1.8361504066917623e-05, "loss": 1.1692743301391602, "loss/kd": 1.808140754699707, "loss/lm": 0.5304080247879028, "step": 1020 }, { "epoch": 0.20960788339150072, "grad_norm": 1.5665183467415746, "kd_ratio": 0.5, "learning_rate": 1.8357854537674556e-05, "loss": 0.9988474249839783, "loss/kd": 1.603023648262024, "loss/lm": 0.394671231508255, "step": 1021 }, { "epoch": 0.20981318004516528, "grad_norm": 2.0716452706066457, "kd_ratio": 0.5, "learning_rate": 1.8354201312067254e-05, "loss": 1.1574245691299438, "loss/kd": 1.8545539379119873, "loss/lm": 0.46029528975486755, "step": 1022 }, { "epoch": 0.2100184766988298, "grad_norm": 1.858141961186047, "kd_ratio": 0.5, "learning_rate": 1.8350544391711396e-05, "loss": 1.2317436933517456, "loss/kd": 1.9805086851119995, "loss/lm": 0.4829787611961365, "step": 1023 }, { "epoch": 0.21022377335249434, "grad_norm": 2.1465570782595247, "kd_ratio": 0.5, "learning_rate": 1.8346883778224306e-05, "loss": 1.1352030038833618, "loss/kd": 1.7849900722503662, "loss/lm": 0.48541587591171265, "step": 1024 }, { "epoch": 0.2104290700061589, "grad_norm": 1.1860695697742603, "kd_ratio": 0.5, "learning_rate": 1.834321947322493e-05, "loss": 1.2436763048171997, "loss/kd": 2.08353853225708, "loss/lm": 0.40381407737731934, "step": 1025 }, { "epoch": 0.21063436665982344, "grad_norm": 2.2221404136633827, "kd_ratio": 0.5, "learning_rate": 1.8339551478333853e-05, "loss": 1.195007085800171, "loss/kd": 1.9409610033035278, "loss/lm": 0.44905316829681396, "step": 1026 }, { "epoch": 0.210839663313488, "grad_norm": 1.5629896875272449, "kd_ratio": 0.5, "learning_rate": 1.833587979517329e-05, "loss": 1.120133638381958, "loss/kd": 1.8487725257873535, "loss/lm": 0.3914946913719177, "step": 1027 }, { "epoch": 0.21104495996715253, "grad_norm": 2.1605346329523876, "kd_ratio": 0.5, "learning_rate": 1.8332204425367096e-05, "loss": 1.1915608644485474, "loss/kd": 1.9709752798080444, "loss/lm": 0.41214653849601746, "step": 1028 }, { "epoch": 0.2112502566208171, "grad_norm": 2.4174696757272165, "kd_ratio": 0.5, "learning_rate": 1.8328525370540737e-05, "loss": 1.5181035995483398, "loss/kd": 2.6387484073638916, "loss/lm": 0.39745888113975525, "step": 1029 }, { "epoch": 0.21145555327448162, "grad_norm": 1.354791332040653, "kd_ratio": 0.5, "learning_rate": 1.832484263232133e-05, "loss": 2.109441041946411, "loss/kd": 3.7526071071624756, "loss/lm": 0.46627485752105713, "step": 1030 }, { "epoch": 0.21166084992814618, "grad_norm": 2.38853613571787, "kd_ratio": 0.5, "learning_rate": 1.8321156212337604e-05, "loss": 1.7845289707183838, "loss/kd": 3.1147165298461914, "loss/lm": 0.454341322183609, "step": 1031 }, { "epoch": 0.21186614658181072, "grad_norm": 1.3853769891503966, "kd_ratio": 0.5, "learning_rate": 1.831746611221993e-05, "loss": 1.1221386194229126, "loss/kd": 1.8486855030059814, "loss/lm": 0.39559173583984375, "step": 1032 }, { "epoch": 0.21207144323547525, "grad_norm": 2.1925449318875607, "kd_ratio": 0.5, "learning_rate": 1.83137723336003e-05, "loss": 0.975955069065094, "loss/kd": 1.5292738676071167, "loss/lm": 0.4226362407207489, "step": 1033 }, { "epoch": 0.2122767398891398, "grad_norm": 2.9156959258937287, "kd_ratio": 0.5, "learning_rate": 1.8310074878112326e-05, "loss": 1.252642273902893, "loss/kd": 2.0688652992248535, "loss/lm": 0.43641915917396545, "step": 1034 }, { "epoch": 0.21248203654280434, "grad_norm": 4.130254661494914, "kd_ratio": 0.5, "learning_rate": 1.830637374739126e-05, "loss": 1.3029719591140747, "loss/kd": 2.1411538124084473, "loss/lm": 0.46479007601737976, "step": 1035 }, { "epoch": 0.2126873331964689, "grad_norm": 2.8585550544218377, "kd_ratio": 0.5, "learning_rate": 1.830266894307397e-05, "loss": 1.2392634153366089, "loss/kd": 2.0232534408569336, "loss/lm": 0.45527347922325134, "step": 1036 }, { "epoch": 0.21289262985013344, "grad_norm": 1.6899880536921974, "kd_ratio": 0.5, "learning_rate": 1.8298960466798952e-05, "loss": 1.1961923837661743, "loss/kd": 1.9405473470687866, "loss/lm": 0.4518373906612396, "step": 1037 }, { "epoch": 0.213097926503798, "grad_norm": 3.1209237827555283, "kd_ratio": 0.5, "learning_rate": 1.8295248320206323e-05, "loss": 1.3431178331375122, "loss/kd": 2.122049570083618, "loss/lm": 0.5641860961914062, "step": 1038 }, { "epoch": 0.21330322315746253, "grad_norm": 2.0236432928435777, "kd_ratio": 0.5, "learning_rate": 1.829153250493783e-05, "loss": 1.26642906665802, "loss/kd": 2.137512683868408, "loss/lm": 0.395345538854599, "step": 1039 }, { "epoch": 0.2135085198111271, "grad_norm": 1.9674198907058702, "kd_ratio": 0.5, "learning_rate": 1.8287813022636837e-05, "loss": 1.0798213481903076, "loss/kd": 1.6846317052841187, "loss/lm": 0.47501102089881897, "step": 1040 }, { "epoch": 0.21371381646479162, "grad_norm": 1.5147800883962599, "kd_ratio": 0.5, "learning_rate": 1.828408987494833e-05, "loss": 0.975717306137085, "loss/kd": 1.5479413270950317, "loss/lm": 0.40349334478378296, "step": 1041 }, { "epoch": 0.21391911311845616, "grad_norm": 1.9478142803799725, "kd_ratio": 0.5, "learning_rate": 1.828036306351892e-05, "loss": 1.3360636234283447, "loss/kd": 2.1489593982696533, "loss/lm": 0.5231678485870361, "step": 1042 }, { "epoch": 0.21412440977212072, "grad_norm": 1.4852451265118687, "kd_ratio": 0.5, "learning_rate": 1.827663258999683e-05, "loss": 1.0965651273727417, "loss/kd": 1.7502890825271606, "loss/lm": 0.44284117221832275, "step": 1043 }, { "epoch": 0.21432970642578525, "grad_norm": 1.4912760398226077, "kd_ratio": 0.5, "learning_rate": 1.8272898456031907e-05, "loss": 1.2219964265823364, "loss/kd": 1.9849553108215332, "loss/lm": 0.4590374529361725, "step": 1044 }, { "epoch": 0.2145350030794498, "grad_norm": 1.7063332705769632, "kd_ratio": 0.5, "learning_rate": 1.8269160663275626e-05, "loss": 1.0446302890777588, "loss/kd": 1.7314659357070923, "loss/lm": 0.35779452323913574, "step": 1045 }, { "epoch": 0.21474029973311434, "grad_norm": 2.495474605820119, "kd_ratio": 0.5, "learning_rate": 1.8265419213381065e-05, "loss": 1.1077089309692383, "loss/kd": 1.74244225025177, "loss/lm": 0.472975492477417, "step": 1046 }, { "epoch": 0.2149455963867789, "grad_norm": 2.844384877717041, "kd_ratio": 0.5, "learning_rate": 1.8261674108002925e-05, "loss": 1.2050400972366333, "loss/kd": 1.9781508445739746, "loss/lm": 0.43192940950393677, "step": 1047 }, { "epoch": 0.21515089304044344, "grad_norm": 1.185065316712956, "kd_ratio": 0.5, "learning_rate": 1.8257925348797534e-05, "loss": 1.147223711013794, "loss/kd": 1.9127970933914185, "loss/lm": 0.381650447845459, "step": 1048 }, { "epoch": 0.215356189694108, "grad_norm": 3.0800299475386805, "kd_ratio": 0.5, "learning_rate": 1.825417293742282e-05, "loss": 1.2445778846740723, "loss/kd": 2.0100231170654297, "loss/lm": 0.4791327118873596, "step": 1049 }, { "epoch": 0.21556148634777253, "grad_norm": 1.8513602742427195, "kd_ratio": 0.5, "learning_rate": 1.825041687553833e-05, "loss": 1.191755771636963, "loss/kd": 1.9377485513687134, "loss/lm": 0.44576311111450195, "step": 1050 }, { "epoch": 0.2157667830014371, "grad_norm": 2.0641493197853387, "kd_ratio": 0.5, "learning_rate": 1.824665716480524e-05, "loss": 1.0891633033752441, "loss/kd": 1.7692521810531616, "loss/lm": 0.40907442569732666, "step": 1051 }, { "epoch": 0.21597207965510162, "grad_norm": 1.8501978838429542, "kd_ratio": 0.5, "learning_rate": 1.824289380688632e-05, "loss": 1.0896098613739014, "loss/kd": 1.7443917989730835, "loss/lm": 0.4348279535770416, "step": 1052 }, { "epoch": 0.21617737630876616, "grad_norm": 1.6917635099922148, "kd_ratio": 0.5, "learning_rate": 1.823912680344596e-05, "loss": 1.0466394424438477, "loss/kd": 1.646252989768982, "loss/lm": 0.44702595472335815, "step": 1053 }, { "epoch": 0.21638267296243072, "grad_norm": 1.7143678851694872, "kd_ratio": 0.5, "learning_rate": 1.823535615615017e-05, "loss": 1.1608308553695679, "loss/kd": 1.747494101524353, "loss/lm": 0.5741675496101379, "step": 1054 }, { "epoch": 0.21658796961609525, "grad_norm": 1.1383595043099588, "kd_ratio": 0.5, "learning_rate": 1.823158186666656e-05, "loss": 1.0060783624649048, "loss/kd": 1.5267137289047241, "loss/lm": 0.48544296622276306, "step": 1055 }, { "epoch": 0.2167932662697598, "grad_norm": 1.5608673071228958, "kd_ratio": 0.5, "learning_rate": 1.8227803936664358e-05, "loss": 1.033624529838562, "loss/kd": 1.653739094734192, "loss/lm": 0.41350993514060974, "step": 1056 }, { "epoch": 0.21699856292342434, "grad_norm": 1.9845019768220449, "kd_ratio": 0.5, "learning_rate": 1.8224022367814402e-05, "loss": 1.0888038873672485, "loss/kd": 1.7053850889205933, "loss/lm": 0.4722226560115814, "step": 1057 }, { "epoch": 0.2172038595770889, "grad_norm": 2.1999468898562604, "kd_ratio": 0.5, "learning_rate": 1.8220237161789134e-05, "loss": 1.2390897274017334, "loss/kd": 2.0222413539886475, "loss/lm": 0.45593810081481934, "step": 1058 }, { "epoch": 0.21740915623075344, "grad_norm": 1.1126697483413723, "kd_ratio": 0.5, "learning_rate": 1.821644832026261e-05, "loss": 1.0112026929855347, "loss/kd": 1.5912830829620361, "loss/lm": 0.4311223030090332, "step": 1059 }, { "epoch": 0.217614452884418, "grad_norm": 1.2041399492619465, "kd_ratio": 0.5, "learning_rate": 1.8212655844910496e-05, "loss": 1.237128734588623, "loss/kd": 1.9720286130905151, "loss/lm": 0.5022289156913757, "step": 1060 }, { "epoch": 0.21781974953808253, "grad_norm": 1.1726517148723876, "kd_ratio": 0.5, "learning_rate": 1.820885973741005e-05, "loss": 1.0970661640167236, "loss/kd": 1.804565191268921, "loss/lm": 0.389567106962204, "step": 1061 }, { "epoch": 0.21802504619174706, "grad_norm": 1.110401246956404, "kd_ratio": 0.5, "learning_rate": 1.820505999944016e-05, "loss": 1.2066274881362915, "loss/kd": 1.9810001850128174, "loss/lm": 0.4322548508644104, "step": 1062 }, { "epoch": 0.21823034284541162, "grad_norm": 1.2661450640148926, "kd_ratio": 0.5, "learning_rate": 1.82012566326813e-05, "loss": 1.062292456626892, "loss/kd": 1.5284874439239502, "loss/lm": 0.5960974097251892, "step": 1063 }, { "epoch": 0.21843563949907616, "grad_norm": 1.5318446130726628, "kd_ratio": 0.5, "learning_rate": 1.8197449638815556e-05, "loss": 1.0778065919876099, "loss/kd": 1.73293936252594, "loss/lm": 0.4226737320423126, "step": 1064 }, { "epoch": 0.21864093615274072, "grad_norm": 1.3879260204240895, "kd_ratio": 0.5, "learning_rate": 1.8193639019526618e-05, "loss": 1.121518611907959, "loss/kd": 1.7022407054901123, "loss/lm": 0.5407966375350952, "step": 1065 }, { "epoch": 0.21884623280640525, "grad_norm": 1.1514630815903855, "kd_ratio": 0.5, "learning_rate": 1.818982477649978e-05, "loss": 1.3667019605636597, "loss/kd": 2.2196245193481445, "loss/lm": 0.5137794017791748, "step": 1066 }, { "epoch": 0.2190515294600698, "grad_norm": 2.129934026873999, "kd_ratio": 0.5, "learning_rate": 1.8186006911421937e-05, "loss": 1.1472432613372803, "loss/kd": 1.8999823331832886, "loss/lm": 0.39450424909591675, "step": 1067 }, { "epoch": 0.21925682611373434, "grad_norm": 2.9541216204462852, "kd_ratio": 0.5, "learning_rate": 1.8182185425981593e-05, "loss": 1.1806131601333618, "loss/kd": 1.8787990808486938, "loss/lm": 0.4824272394180298, "step": 1068 }, { "epoch": 0.2194621227673989, "grad_norm": 3.2323934230415188, "kd_ratio": 0.5, "learning_rate": 1.8178360321868843e-05, "loss": 1.0885145664215088, "loss/kd": 1.8099151849746704, "loss/lm": 0.36711385846138, "step": 1069 }, { "epoch": 0.21966741942106344, "grad_norm": 2.4738922913736787, "kd_ratio": 0.5, "learning_rate": 1.817453160077538e-05, "loss": 1.0587525367736816, "loss/kd": 1.6181116104125977, "loss/lm": 0.4993935525417328, "step": 1070 }, { "epoch": 0.21987271607472797, "grad_norm": 1.791009320750563, "kd_ratio": 0.5, "learning_rate": 1.817069926439451e-05, "loss": 1.3577380180358887, "loss/kd": 2.278749704360962, "loss/lm": 0.43672627210617065, "step": 1071 }, { "epoch": 0.22007801272839253, "grad_norm": 1.4791473567365312, "kd_ratio": 0.5, "learning_rate": 1.8166863314421133e-05, "loss": 1.1664068698883057, "loss/kd": 1.826913833618164, "loss/lm": 0.5059000253677368, "step": 1072 }, { "epoch": 0.22028330938205706, "grad_norm": 2.6281728300647127, "kd_ratio": 0.5, "learning_rate": 1.816302375255174e-05, "loss": 1.2400704622268677, "loss/kd": 2.0290400981903076, "loss/lm": 0.45110073685646057, "step": 1073 }, { "epoch": 0.22048860603572162, "grad_norm": 1.7532457349204649, "kd_ratio": 0.5, "learning_rate": 1.8159180580484427e-05, "loss": 1.2168850898742676, "loss/kd": 1.9972282648086548, "loss/lm": 0.43654191493988037, "step": 1074 }, { "epoch": 0.22069390268938616, "grad_norm": 1.5232624272201574, "kd_ratio": 0.5, "learning_rate": 1.8155333799918883e-05, "loss": 1.1508687734603882, "loss/kd": 1.8770411014556885, "loss/lm": 0.4246964752674103, "step": 1075 }, { "epoch": 0.22089919934305072, "grad_norm": 1.9393547806319265, "kd_ratio": 0.5, "learning_rate": 1.8151483412556397e-05, "loss": 1.1566849946975708, "loss/kd": 1.90936279296875, "loss/lm": 0.4040072560310364, "step": 1076 }, { "epoch": 0.22110449599671525, "grad_norm": 1.9446287202763244, "kd_ratio": 0.5, "learning_rate": 1.814762942009985e-05, "loss": 1.2249730825424194, "loss/kd": 1.9739189147949219, "loss/lm": 0.476027250289917, "step": 1077 }, { "epoch": 0.2213097926503798, "grad_norm": 1.648248100729226, "kd_ratio": 0.5, "learning_rate": 1.8143771824253712e-05, "loss": 1.281295657157898, "loss/kd": 2.077124834060669, "loss/lm": 0.4854663908481598, "step": 1078 }, { "epoch": 0.22151508930404434, "grad_norm": 1.5008831570069412, "kd_ratio": 0.5, "learning_rate": 1.8139910626724058e-05, "loss": 1.2700645923614502, "loss/kd": 2.154777765274048, "loss/lm": 0.385351300239563, "step": 1079 }, { "epoch": 0.22172038595770888, "grad_norm": 2.0951922783774424, "kd_ratio": 0.5, "learning_rate": 1.8136045829218547e-05, "loss": 1.0773112773895264, "loss/kd": 1.611629605293274, "loss/lm": 0.5429930686950684, "step": 1080 }, { "epoch": 0.22192568261137344, "grad_norm": 1.4835887979343552, "kd_ratio": 0.5, "learning_rate": 1.8132177433446437e-05, "loss": 1.1216200590133667, "loss/kd": 1.8314698934555054, "loss/lm": 0.411770224571228, "step": 1081 }, { "epoch": 0.22213097926503797, "grad_norm": 1.551214609005419, "kd_ratio": 0.5, "learning_rate": 1.812830544111857e-05, "loss": 1.0375181436538696, "loss/kd": 1.5904772281646729, "loss/lm": 0.48455899953842163, "step": 1082 }, { "epoch": 0.22233627591870253, "grad_norm": 2.5415490925860498, "kd_ratio": 0.5, "learning_rate": 1.8124429853947387e-05, "loss": 1.0490237474441528, "loss/kd": 1.6006293296813965, "loss/lm": 0.4974181354045868, "step": 1083 }, { "epoch": 0.22254157257236706, "grad_norm": 3.582597066133404, "kd_ratio": 0.5, "learning_rate": 1.812055067364691e-05, "loss": 1.327044129371643, "loss/kd": 2.170470714569092, "loss/lm": 0.4836176335811615, "step": 1084 }, { "epoch": 0.22274686922603162, "grad_norm": 1.9072504938496244, "kd_ratio": 0.5, "learning_rate": 1.8116667901932753e-05, "loss": 1.1460171937942505, "loss/kd": 1.817392349243164, "loss/lm": 0.47464197874069214, "step": 1085 }, { "epoch": 0.22295216587969616, "grad_norm": 1.9229959722941268, "kd_ratio": 0.5, "learning_rate": 1.8112781540522124e-05, "loss": 0.9990065097808838, "loss/kd": 1.5902934074401855, "loss/lm": 0.4077196717262268, "step": 1086 }, { "epoch": 0.22315746253336072, "grad_norm": 2.0336176378705675, "kd_ratio": 0.5, "learning_rate": 1.8108891591133812e-05, "loss": 0.943844199180603, "loss/kd": 1.5174896717071533, "loss/lm": 0.37019872665405273, "step": 1087 }, { "epoch": 0.22336275918702525, "grad_norm": 1.4035065895042327, "kd_ratio": 0.5, "learning_rate": 1.8104998055488198e-05, "loss": 1.028552532196045, "loss/kd": 1.5700486898422241, "loss/lm": 0.4870563745498657, "step": 1088 }, { "epoch": 0.22356805584068978, "grad_norm": 1.4758449687113833, "kd_ratio": 0.5, "learning_rate": 1.810110093530724e-05, "loss": 1.038696527481079, "loss/kd": 1.6180927753448486, "loss/lm": 0.4593002200126648, "step": 1089 }, { "epoch": 0.22377335249435434, "grad_norm": 2.0495220455939935, "kd_ratio": 0.5, "learning_rate": 1.8097200232314493e-05, "loss": 1.2165169715881348, "loss/kd": 2.057655096054077, "loss/lm": 0.3753787577152252, "step": 1090 }, { "epoch": 0.22397864914801888, "grad_norm": 1.7202249952768671, "kd_ratio": 0.5, "learning_rate": 1.809329594823509e-05, "loss": 1.1713675260543823, "loss/kd": 1.8736546039581299, "loss/lm": 0.46908038854599, "step": 1091 }, { "epoch": 0.22418394580168344, "grad_norm": 1.385459963436686, "kd_ratio": 0.5, "learning_rate": 1.808938808479575e-05, "loss": 1.262641429901123, "loss/kd": 2.068016767501831, "loss/lm": 0.4572660028934479, "step": 1092 }, { "epoch": 0.22438924245534797, "grad_norm": 1.3071490415789668, "kd_ratio": 0.5, "learning_rate": 1.8085476643724768e-05, "loss": 1.2542064189910889, "loss/kd": 2.1112003326416016, "loss/lm": 0.39721253514289856, "step": 1093 }, { "epoch": 0.22459453910901253, "grad_norm": 2.2709219976052233, "kd_ratio": 0.5, "learning_rate": 1.808156162675203e-05, "loss": 1.9149218797683716, "loss/kd": 3.533763885498047, "loss/lm": 0.2960797846317291, "step": 1094 }, { "epoch": 0.22479983576267706, "grad_norm": 2.104864453411654, "kd_ratio": 0.5, "learning_rate": 1.8077643035609006e-05, "loss": 1.2358946800231934, "loss/kd": 1.9529439210891724, "loss/lm": 0.5188454389572144, "step": 1095 }, { "epoch": 0.22500513241634162, "grad_norm": 1.7298732778183186, "kd_ratio": 0.5, "learning_rate": 1.807372087202873e-05, "loss": 1.266636610031128, "loss/kd": 2.1356120109558105, "loss/lm": 0.39766132831573486, "step": 1096 }, { "epoch": 0.22521042907000616, "grad_norm": 2.6687168091470914, "kd_ratio": 0.5, "learning_rate": 1.806979513774584e-05, "loss": 1.2353914976119995, "loss/kd": 1.9807991981506348, "loss/lm": 0.4899837374687195, "step": 1097 }, { "epoch": 0.22541572572367072, "grad_norm": 1.325061552220048, "kd_ratio": 0.5, "learning_rate": 1.8065865834496535e-05, "loss": 1.187432885169983, "loss/kd": 1.9368634223937988, "loss/lm": 0.4380023777484894, "step": 1098 }, { "epoch": 0.22562102237733525, "grad_norm": 2.8905095426860345, "kd_ratio": 0.5, "learning_rate": 1.806193296401859e-05, "loss": 1.258303165435791, "loss/kd": 2.0852859020233154, "loss/lm": 0.43132051825523376, "step": 1099 }, { "epoch": 0.22582631903099978, "grad_norm": 2.367414136909767, "kd_ratio": 0.5, "learning_rate": 1.805799652805138e-05, "loss": 1.2804811000823975, "loss/kd": 2.0341641902923584, "loss/lm": 0.526797890663147, "step": 1100 }, { "epoch": 0.22603161568466434, "grad_norm": 1.8233426621190685, "kd_ratio": 0.5, "learning_rate": 1.8054056528335832e-05, "loss": 1.251843810081482, "loss/kd": 2.047792434692383, "loss/lm": 0.45589518547058105, "step": 1101 }, { "epoch": 0.22623691233832888, "grad_norm": 1.8809666545915384, "kd_ratio": 0.5, "learning_rate": 1.805011296661446e-05, "loss": 1.1010620594024658, "loss/kd": 1.7151226997375488, "loss/lm": 0.4870013892650604, "step": 1102 }, { "epoch": 0.22644220899199344, "grad_norm": 1.3851283991573384, "kd_ratio": 0.5, "learning_rate": 1.804616584463136e-05, "loss": 1.2878284454345703, "loss/kd": 2.091773271560669, "loss/lm": 0.4838835597038269, "step": 1103 }, { "epoch": 0.22664750564565797, "grad_norm": 1.7253957788762597, "kd_ratio": 0.5, "learning_rate": 1.8042215164132186e-05, "loss": 1.3118410110473633, "loss/kd": 2.0947022438049316, "loss/lm": 0.5289798378944397, "step": 1104 }, { "epoch": 0.22685280229932253, "grad_norm": 1.0911609999775302, "kd_ratio": 0.5, "learning_rate": 1.8038260926864183e-05, "loss": 1.2941515445709229, "loss/kd": 2.1832549571990967, "loss/lm": 0.4050482213497162, "step": 1105 }, { "epoch": 0.22705809895298706, "grad_norm": 1.8831415204457504, "kd_ratio": 0.5, "learning_rate": 1.8034303134576154e-05, "loss": 1.0259596109390259, "loss/kd": 1.6250911951065063, "loss/lm": 0.4268280267715454, "step": 1106 }, { "epoch": 0.22726339560665162, "grad_norm": 1.5670081244218337, "kd_ratio": 0.5, "learning_rate": 1.803034178901849e-05, "loss": 1.0631455183029175, "loss/kd": 1.6826640367507935, "loss/lm": 0.44362691044807434, "step": 1107 }, { "epoch": 0.22746869226031616, "grad_norm": 2.4534462721416648, "kd_ratio": 0.5, "learning_rate": 1.8026376891943137e-05, "loss": 1.2562203407287598, "loss/kd": 2.025667190551758, "loss/lm": 0.4867735505104065, "step": 1108 }, { "epoch": 0.2276739889139807, "grad_norm": 1.7532073268608481, "kd_ratio": 0.5, "learning_rate": 1.802240844510362e-05, "loss": 1.0562243461608887, "loss/kd": 1.5185949802398682, "loss/lm": 0.593853771686554, "step": 1109 }, { "epoch": 0.22787928556764525, "grad_norm": 1.6820820806363022, "kd_ratio": 0.5, "learning_rate": 1.8018436450255042e-05, "loss": 1.233849287033081, "loss/kd": 1.9394646883010864, "loss/lm": 0.5282340049743652, "step": 1110 }, { "epoch": 0.22808458222130978, "grad_norm": 2.6592388867553285, "kd_ratio": 0.5, "learning_rate": 1.8014460909154058e-05, "loss": 1.2648289203643799, "loss/kd": 2.0129289627075195, "loss/lm": 0.5167287588119507, "step": 1111 }, { "epoch": 0.22828987887497434, "grad_norm": 2.273076526915434, "kd_ratio": 0.5, "learning_rate": 1.8010481823558907e-05, "loss": 1.2927110195159912, "loss/kd": 2.1824254989624023, "loss/lm": 0.4029965400695801, "step": 1112 }, { "epoch": 0.22849517552863888, "grad_norm": 1.8509578956384123, "kd_ratio": 0.5, "learning_rate": 1.800649919522938e-05, "loss": 1.2892098426818848, "loss/kd": 2.1649162769317627, "loss/lm": 0.4135034382343292, "step": 1113 }, { "epoch": 0.22870047218230344, "grad_norm": 0.9990186381420852, "kd_ratio": 0.5, "learning_rate": 1.8002513025926853e-05, "loss": 1.4154199361801147, "loss/kd": 2.306730031967163, "loss/lm": 0.5241098403930664, "step": 1114 }, { "epoch": 0.22890576883596797, "grad_norm": 1.5996089496111685, "kd_ratio": 0.5, "learning_rate": 1.799852331741425e-05, "loss": 1.083078145980835, "loss/kd": 1.7429159879684448, "loss/lm": 0.4232402443885803, "step": 1115 }, { "epoch": 0.22911106548963253, "grad_norm": 1.9293726664547988, "kd_ratio": 0.5, "learning_rate": 1.7994530071456076e-05, "loss": 1.0110218524932861, "loss/kd": 1.6098588705062866, "loss/lm": 0.4121848940849304, "step": 1116 }, { "epoch": 0.22931636214329706, "grad_norm": 2.0061977089423175, "kd_ratio": 0.5, "learning_rate": 1.7990533289818388e-05, "loss": 1.1975908279418945, "loss/kd": 1.968841314315796, "loss/lm": 0.4263403117656708, "step": 1117 }, { "epoch": 0.2295216587969616, "grad_norm": 1.322222680076398, "kd_ratio": 0.5, "learning_rate": 1.7986532974268814e-05, "loss": 0.9834321737289429, "loss/kd": 1.5359529256820679, "loss/lm": 0.4309113919734955, "step": 1118 }, { "epoch": 0.22972695545062616, "grad_norm": 2.3458653981566355, "kd_ratio": 0.5, "learning_rate": 1.7982529126576543e-05, "loss": 1.3833154439926147, "loss/kd": 2.356158494949341, "loss/lm": 0.41047239303588867, "step": 1119 }, { "epoch": 0.2299322521042907, "grad_norm": 1.583922187157095, "kd_ratio": 0.5, "learning_rate": 1.7978521748512324e-05, "loss": 1.306518316268921, "loss/kd": 2.106342315673828, "loss/lm": 0.5066942572593689, "step": 1120 }, { "epoch": 0.23013754875795525, "grad_norm": 1.4721284641275245, "kd_ratio": 0.5, "learning_rate": 1.7974510841848475e-05, "loss": 1.3726831674575806, "loss/kd": 2.2717645168304443, "loss/lm": 0.473601758480072, "step": 1121 }, { "epoch": 0.23034284541161978, "grad_norm": 1.059147260681169, "kd_ratio": 0.5, "learning_rate": 1.7970496408358866e-05, "loss": 1.2250065803527832, "loss/kd": 1.8456364870071411, "loss/lm": 0.6043766736984253, "step": 1122 }, { "epoch": 0.23054814206528434, "grad_norm": 1.3900451340871867, "kd_ratio": 0.5, "learning_rate": 1.7966478449818925e-05, "loss": 1.0075318813323975, "loss/kd": 1.6617084741592407, "loss/lm": 0.353355348110199, "step": 1123 }, { "epoch": 0.23075343871894888, "grad_norm": 1.2280526866471724, "kd_ratio": 0.5, "learning_rate": 1.796245696800565e-05, "loss": 1.0562576055526733, "loss/kd": 1.6177793741226196, "loss/lm": 0.49473583698272705, "step": 1124 }, { "epoch": 0.23095873537261344, "grad_norm": 1.7221971663559636, "kd_ratio": 0.5, "learning_rate": 1.7958431964697586e-05, "loss": 1.2580647468566895, "loss/kd": 1.99708092212677, "loss/lm": 0.5190485715866089, "step": 1125 }, { "epoch": 0.23116403202627797, "grad_norm": 1.366337284538514, "kd_ratio": 0.5, "learning_rate": 1.7954403441674844e-05, "loss": 1.1299984455108643, "loss/kd": 1.8013546466827393, "loss/lm": 0.4586421549320221, "step": 1126 }, { "epoch": 0.2313693286799425, "grad_norm": 2.2133664326271854, "kd_ratio": 0.5, "learning_rate": 1.7950371400719087e-05, "loss": 0.944158136844635, "loss/kd": 1.4899842739105225, "loss/lm": 0.39833196997642517, "step": 1127 }, { "epoch": 0.23157462533360706, "grad_norm": 1.7194599847937402, "kd_ratio": 0.5, "learning_rate": 1.7946335843613533e-05, "loss": 1.0697274208068848, "loss/kd": 1.7154308557510376, "loss/lm": 0.4240241050720215, "step": 1128 }, { "epoch": 0.2317799219872716, "grad_norm": 1.3298652270815496, "kd_ratio": 0.5, "learning_rate": 1.794229677214296e-05, "loss": 1.3750834465026855, "loss/kd": 2.2001352310180664, "loss/lm": 0.5500315427780151, "step": 1129 }, { "epoch": 0.23198521864093616, "grad_norm": 1.4815309377018528, "kd_ratio": 0.5, "learning_rate": 1.7938254188093696e-05, "loss": 0.9530582427978516, "loss/kd": 1.4664108753204346, "loss/lm": 0.43970564007759094, "step": 1130 }, { "epoch": 0.2321905152946007, "grad_norm": 1.418209481412432, "kd_ratio": 0.5, "learning_rate": 1.7934208093253625e-05, "loss": 1.1168164014816284, "loss/kd": 1.8399821519851685, "loss/lm": 0.3936505913734436, "step": 1131 }, { "epoch": 0.23239581194826525, "grad_norm": 1.5248348152385502, "kd_ratio": 0.5, "learning_rate": 1.793015848941218e-05, "loss": 1.2998406887054443, "loss/kd": 2.109325885772705, "loss/lm": 0.490355521440506, "step": 1132 }, { "epoch": 0.23260110860192978, "grad_norm": 1.4374668404256363, "kd_ratio": 0.5, "learning_rate": 1.792610537836035e-05, "loss": 1.0843555927276611, "loss/kd": 1.801759123802185, "loss/lm": 0.3669520914554596, "step": 1133 }, { "epoch": 0.23280640525559435, "grad_norm": 1.1345399526004258, "kd_ratio": 0.5, "learning_rate": 1.7922048761890666e-05, "loss": 1.2360872030258179, "loss/kd": 2.1030139923095703, "loss/lm": 0.3691605031490326, "step": 1134 }, { "epoch": 0.23301170190925888, "grad_norm": 1.3843101363685588, "kd_ratio": 0.5, "learning_rate": 1.7917988641797227e-05, "loss": 1.1974356174468994, "loss/kd": 1.9646672010421753, "loss/lm": 0.430203914642334, "step": 1135 }, { "epoch": 0.2332169985629234, "grad_norm": 1.940527675489603, "kd_ratio": 0.5, "learning_rate": 1.791392501987567e-05, "loss": 1.009734869003296, "loss/kd": 1.6631916761398315, "loss/lm": 0.3562779426574707, "step": 1136 }, { "epoch": 0.23342229521658797, "grad_norm": 2.3814226078462117, "kd_ratio": 0.5, "learning_rate": 1.7909857897923176e-05, "loss": 1.240744709968567, "loss/kd": 2.1352016925811768, "loss/lm": 0.34628766775131226, "step": 1137 }, { "epoch": 0.2336275918702525, "grad_norm": 1.8737490297274249, "kd_ratio": 0.5, "learning_rate": 1.7905787277738483e-05, "loss": 1.1514066457748413, "loss/kd": 1.8433232307434082, "loss/lm": 0.4594901204109192, "step": 1138 }, { "epoch": 0.23383288852391707, "grad_norm": 1.1301584067325887, "kd_ratio": 0.5, "learning_rate": 1.7901713161121873e-05, "loss": 1.0114047527313232, "loss/kd": 1.5406696796417236, "loss/lm": 0.4821397364139557, "step": 1139 }, { "epoch": 0.2340381851775816, "grad_norm": 2.4665414395495837, "kd_ratio": 0.5, "learning_rate": 1.7897635549875177e-05, "loss": 1.1223738193511963, "loss/kd": 1.8495506048202515, "loss/lm": 0.3951971232891083, "step": 1140 }, { "epoch": 0.23424348183124616, "grad_norm": 2.389621992518192, "kd_ratio": 0.5, "learning_rate": 1.7893554445801763e-05, "loss": 0.972686231136322, "loss/kd": 1.4654594659805298, "loss/lm": 0.47991302609443665, "step": 1141 }, { "epoch": 0.2344487784849107, "grad_norm": 1.448170538474498, "kd_ratio": 0.5, "learning_rate": 1.788946985070655e-05, "loss": 1.0820666551589966, "loss/kd": 1.8166838884353638, "loss/lm": 0.3474493622779846, "step": 1142 }, { "epoch": 0.23465407513857525, "grad_norm": 1.197701532452248, "kd_ratio": 0.5, "learning_rate": 1.7885381766396008e-05, "loss": 1.3367481231689453, "loss/kd": 2.2659060955047607, "loss/lm": 0.4075901508331299, "step": 1143 }, { "epoch": 0.23485937179223978, "grad_norm": 1.5498939365805893, "kd_ratio": 0.5, "learning_rate": 1.7881290194678136e-05, "loss": 0.9890567064285278, "loss/kd": 1.4968057870864868, "loss/lm": 0.48130759596824646, "step": 1144 }, { "epoch": 0.23506466844590435, "grad_norm": 2.0281184708670352, "kd_ratio": 0.5, "learning_rate": 1.7877195137362485e-05, "loss": 1.2213941812515259, "loss/kd": 1.9815959930419922, "loss/lm": 0.46119239926338196, "step": 1145 }, { "epoch": 0.23526996509956888, "grad_norm": 1.554938101253734, "kd_ratio": 0.5, "learning_rate": 1.787309659626014e-05, "loss": 1.2385815382003784, "loss/kd": 2.0057356357574463, "loss/lm": 0.47142741084098816, "step": 1146 }, { "epoch": 0.2354752617532334, "grad_norm": 1.8033646652895416, "kd_ratio": 0.5, "learning_rate": 1.786899457318374e-05, "loss": 1.3069796562194824, "loss/kd": 2.113772392272949, "loss/lm": 0.5001869201660156, "step": 1147 }, { "epoch": 0.23568055840689797, "grad_norm": 1.5320429320717732, "kd_ratio": 0.5, "learning_rate": 1.7864889069947448e-05, "loss": 1.0979846715927124, "loss/kd": 1.7382742166519165, "loss/lm": 0.4576951563358307, "step": 1148 }, { "epoch": 0.2358858550605625, "grad_norm": 2.608620732649123, "kd_ratio": 0.5, "learning_rate": 1.7860780088366975e-05, "loss": 1.0694481134414673, "loss/kd": 1.722774863243103, "loss/lm": 0.41612133383750916, "step": 1149 }, { "epoch": 0.23609115171422707, "grad_norm": 2.024821539036196, "kd_ratio": 0.5, "learning_rate": 1.785666763025957e-05, "loss": 1.1759352684020996, "loss/kd": 1.871246576309204, "loss/lm": 0.4806239604949951, "step": 1150 }, { "epoch": 0.2362964483678916, "grad_norm": 1.632339881438863, "kd_ratio": 0.5, "learning_rate": 1.7852551697444017e-05, "loss": 1.3477524518966675, "loss/kd": 2.14198637008667, "loss/lm": 0.5535185933113098, "step": 1151 }, { "epoch": 0.23650174502155616, "grad_norm": 2.178903458700015, "kd_ratio": 0.5, "learning_rate": 1.7848432291740642e-05, "loss": 1.340388298034668, "loss/kd": 2.3135504722595215, "loss/lm": 0.3672260344028473, "step": 1152 }, { "epoch": 0.2367070416752207, "grad_norm": 1.1599032178432627, "kd_ratio": 0.5, "learning_rate": 1.7844309414971296e-05, "loss": 1.2061996459960938, "loss/kd": 1.925557017326355, "loss/lm": 0.48684218525886536, "step": 1153 }, { "epoch": 0.23691233832888525, "grad_norm": 1.3898186281510863, "kd_ratio": 0.5, "learning_rate": 1.7840183068959377e-05, "loss": 1.1096618175506592, "loss/kd": 1.8256478309631348, "loss/lm": 0.3936757445335388, "step": 1154 }, { "epoch": 0.23711763498254979, "grad_norm": 1.1526919748743552, "kd_ratio": 0.5, "learning_rate": 1.783605325552981e-05, "loss": 1.1214933395385742, "loss/kd": 1.895460844039917, "loss/lm": 0.34752586483955383, "step": 1155 }, { "epoch": 0.23732293163621432, "grad_norm": 1.4595547373232813, "kd_ratio": 0.5, "learning_rate": 1.7831919976509058e-05, "loss": 1.0467890501022339, "loss/kd": 1.6917293071746826, "loss/lm": 0.40184879302978516, "step": 1156 }, { "epoch": 0.23752822828987888, "grad_norm": 1.456728864474276, "kd_ratio": 0.5, "learning_rate": 1.7827783233725114e-05, "loss": 1.184584379196167, "loss/kd": 1.989107370376587, "loss/lm": 0.38006144762039185, "step": 1157 }, { "epoch": 0.2377335249435434, "grad_norm": 1.9077550978037525, "kd_ratio": 0.5, "learning_rate": 1.78236430290075e-05, "loss": 0.943892240524292, "loss/kd": 1.4773874282836914, "loss/lm": 0.4103970229625702, "step": 1158 }, { "epoch": 0.23793882159720797, "grad_norm": 1.259415685274735, "kd_ratio": 0.5, "learning_rate": 1.7819499364187282e-05, "loss": 1.1472508907318115, "loss/kd": 1.9083386659622192, "loss/lm": 0.3861631453037262, "step": 1159 }, { "epoch": 0.2381441182508725, "grad_norm": 1.353154530780856, "kd_ratio": 0.5, "learning_rate": 1.7815352241097037e-05, "loss": 1.2604279518127441, "loss/kd": 2.0596086978912354, "loss/lm": 0.4612472653388977, "step": 1160 }, { "epoch": 0.23834941490453707, "grad_norm": 0.995702823512839, "kd_ratio": 0.5, "learning_rate": 1.7811201661570888e-05, "loss": 1.2013435363769531, "loss/kd": 1.9840691089630127, "loss/lm": 0.418617844581604, "step": 1161 }, { "epoch": 0.2385547115582016, "grad_norm": 1.747497908231377, "kd_ratio": 0.5, "learning_rate": 1.7807047627444473e-05, "loss": 1.2362598180770874, "loss/kd": 2.11507248878479, "loss/lm": 0.3574471175670624, "step": 1162 }, { "epoch": 0.23876000821186616, "grad_norm": 1.4293687744518304, "kd_ratio": 0.5, "learning_rate": 1.780289014055497e-05, "loss": 1.2630963325500488, "loss/kd": 2.058929443359375, "loss/lm": 0.46726328134536743, "step": 1163 }, { "epoch": 0.2389653048655307, "grad_norm": 1.8112809301219281, "kd_ratio": 0.5, "learning_rate": 1.7798729202741082e-05, "loss": 0.9173579216003418, "loss/kd": 1.4563469886779785, "loss/lm": 0.37836888432502747, "step": 1164 }, { "epoch": 0.23917060151919523, "grad_norm": 1.5385963016997364, "kd_ratio": 0.5, "learning_rate": 1.7794564815843025e-05, "loss": 1.2324552536010742, "loss/kd": 2.0845272541046143, "loss/lm": 0.38038337230682373, "step": 1165 }, { "epoch": 0.23937589817285979, "grad_norm": 1.8000454588811818, "kd_ratio": 0.5, "learning_rate": 1.779039698170256e-05, "loss": 1.197695016860962, "loss/kd": 1.903179407119751, "loss/lm": 0.4922105073928833, "step": 1166 }, { "epoch": 0.23958119482652432, "grad_norm": 1.869759995845063, "kd_ratio": 0.5, "learning_rate": 1.7786225702162955e-05, "loss": 1.0940715074539185, "loss/kd": 1.7739076614379883, "loss/lm": 0.4142354130744934, "step": 1167 }, { "epoch": 0.23978649148018888, "grad_norm": 1.4577550552240026, "kd_ratio": 0.5, "learning_rate": 1.778205097906902e-05, "loss": 1.1313267946243286, "loss/kd": 1.8384175300598145, "loss/lm": 0.4242360293865204, "step": 1168 }, { "epoch": 0.2399917881338534, "grad_norm": 1.1028092566199332, "kd_ratio": 0.5, "learning_rate": 1.7777872814267068e-05, "loss": 1.2663697004318237, "loss/kd": 2.128920555114746, "loss/lm": 0.40381890535354614, "step": 1169 }, { "epoch": 0.24019708478751797, "grad_norm": 1.6473936140011407, "kd_ratio": 0.5, "learning_rate": 1.7773691209604948e-05, "loss": 1.0645084381103516, "loss/kd": 1.702700138092041, "loss/lm": 0.4263167083263397, "step": 1170 }, { "epoch": 0.2404023814411825, "grad_norm": 1.8101053445331605, "kd_ratio": 0.5, "learning_rate": 1.7769506166932026e-05, "loss": 0.9956259727478027, "loss/kd": 1.5620970726013184, "loss/lm": 0.4291548430919647, "step": 1171 }, { "epoch": 0.24060767809484707, "grad_norm": 1.351599799628428, "kd_ratio": 0.5, "learning_rate": 1.776531768809919e-05, "loss": 1.2853449583053589, "loss/kd": 2.1336898803710938, "loss/lm": 0.4370000958442688, "step": 1172 }, { "epoch": 0.2408129747485116, "grad_norm": 2.6324631498639053, "kd_ratio": 0.5, "learning_rate": 1.7761125774958846e-05, "loss": 1.3507399559020996, "loss/kd": 2.3057315349578857, "loss/lm": 0.395748496055603, "step": 1173 }, { "epoch": 0.24101827140217613, "grad_norm": 1.1608167984775246, "kd_ratio": 0.5, "learning_rate": 1.7756930429364912e-05, "loss": 1.0842820405960083, "loss/kd": 1.7501040697097778, "loss/lm": 0.41846007108688354, "step": 1174 }, { "epoch": 0.2412235680558407, "grad_norm": 2.5007084974668676, "kd_ratio": 0.5, "learning_rate": 1.7752731653172847e-05, "loss": 1.3179941177368164, "loss/kd": 2.222424268722534, "loss/lm": 0.413563996553421, "step": 1175 }, { "epoch": 0.24142886470950523, "grad_norm": 1.4158433331470066, "kd_ratio": 0.5, "learning_rate": 1.7748529448239594e-05, "loss": 1.002995252609253, "loss/kd": 1.5718138217926025, "loss/lm": 0.43417680263519287, "step": 1176 }, { "epoch": 0.24163416136316979, "grad_norm": 1.7129119280126908, "kd_ratio": 0.5, "learning_rate": 1.7744323816423645e-05, "loss": 1.3653292655944824, "loss/kd": 2.392857313156128, "loss/lm": 0.3378012180328369, "step": 1177 }, { "epoch": 0.24183945801683432, "grad_norm": 2.1910754712979554, "kd_ratio": 0.5, "learning_rate": 1.7740114759584983e-05, "loss": 1.2381789684295654, "loss/kd": 2.050194501876831, "loss/lm": 0.42616331577301025, "step": 1178 }, { "epoch": 0.24204475467049888, "grad_norm": 1.9716425051380633, "kd_ratio": 0.5, "learning_rate": 1.7735902279585118e-05, "loss": 0.9733291864395142, "loss/kd": 1.483368992805481, "loss/lm": 0.46328943967819214, "step": 1179 }, { "epoch": 0.2422500513241634, "grad_norm": 1.627250186327469, "kd_ratio": 0.5, "learning_rate": 1.773168637828707e-05, "loss": 1.1149559020996094, "loss/kd": 1.812990665435791, "loss/lm": 0.4169212579727173, "step": 1180 }, { "epoch": 0.24245534797782797, "grad_norm": 1.6784523587600604, "kd_ratio": 0.5, "learning_rate": 1.7727467057555376e-05, "loss": 1.1050260066986084, "loss/kd": 1.750107765197754, "loss/lm": 0.4599441587924957, "step": 1181 }, { "epoch": 0.2426606446314925, "grad_norm": 1.1667298891203923, "kd_ratio": 0.5, "learning_rate": 1.772324431925608e-05, "loss": 1.221562147140503, "loss/kd": 1.9925150871276855, "loss/lm": 0.45060911774635315, "step": 1182 }, { "epoch": 0.24286594128515704, "grad_norm": 1.325137669484954, "kd_ratio": 0.5, "learning_rate": 1.7719018165256745e-05, "loss": 1.3895517587661743, "loss/kd": 2.353708505630493, "loss/lm": 0.4253949522972107, "step": 1183 }, { "epoch": 0.2430712379388216, "grad_norm": 1.7835382486252633, "kd_ratio": 0.5, "learning_rate": 1.771478859742643e-05, "loss": 1.001421570777893, "loss/kd": 1.612006425857544, "loss/lm": 0.3908367156982422, "step": 1184 }, { "epoch": 0.24327653459248613, "grad_norm": 2.1602646718343967, "kd_ratio": 0.5, "learning_rate": 1.7710555617635727e-05, "loss": 1.226070523262024, "loss/kd": 2.0482277870178223, "loss/lm": 0.4039131999015808, "step": 1185 }, { "epoch": 0.2434818312461507, "grad_norm": 2.5612455024054905, "kd_ratio": 0.5, "learning_rate": 1.770631922775671e-05, "loss": 1.091251015663147, "loss/kd": 1.8025834560394287, "loss/lm": 0.37991863489151, "step": 1186 }, { "epoch": 0.24368712789981523, "grad_norm": 2.5130668912928065, "kd_ratio": 0.5, "learning_rate": 1.7702079429662986e-05, "loss": 1.394978404045105, "loss/kd": 2.343898296356201, "loss/lm": 0.446058452129364, "step": 1187 }, { "epoch": 0.2438924245534798, "grad_norm": 1.922261205430925, "kd_ratio": 0.5, "learning_rate": 1.769783622522965e-05, "loss": 1.2757704257965088, "loss/kd": 2.1583592891693115, "loss/lm": 0.39318159222602844, "step": 1188 }, { "epoch": 0.24409772120714432, "grad_norm": 0.9841757402243734, "kd_ratio": 0.5, "learning_rate": 1.769358961633331e-05, "loss": 1.1625829935073853, "loss/kd": 1.901319146156311, "loss/lm": 0.42384690046310425, "step": 1189 }, { "epoch": 0.24430301786080888, "grad_norm": 1.3449269614869241, "kd_ratio": 0.5, "learning_rate": 1.7689339604852093e-05, "loss": 1.1223341226577759, "loss/kd": 1.854236364364624, "loss/lm": 0.39043179154396057, "step": 1190 }, { "epoch": 0.2445083145144734, "grad_norm": 1.6750863314765227, "kd_ratio": 0.5, "learning_rate": 1.7685086192665605e-05, "loss": 1.1516525745391846, "loss/kd": 1.9058113098144531, "loss/lm": 0.39749377965927124, "step": 1191 }, { "epoch": 0.24471361116813795, "grad_norm": 1.2805726972272418, "kd_ratio": 0.5, "learning_rate": 1.7680829381654982e-05, "loss": 1.1321258544921875, "loss/kd": 1.790138602256775, "loss/lm": 0.47411301732063293, "step": 1192 }, { "epoch": 0.2449189078218025, "grad_norm": 1.3667761867250583, "kd_ratio": 0.5, "learning_rate": 1.7676569173702844e-05, "loss": 0.9869122505187988, "loss/kd": 1.5733891725540161, "loss/lm": 0.40043535828590393, "step": 1193 }, { "epoch": 0.24512420447546704, "grad_norm": 1.3973756447220658, "kd_ratio": 0.5, "learning_rate": 1.767230557069332e-05, "loss": 1.2538965940475464, "loss/kd": 2.022052049636841, "loss/lm": 0.48574116826057434, "step": 1194 }, { "epoch": 0.2453295011291316, "grad_norm": 1.661168669350605, "kd_ratio": 0.5, "learning_rate": 1.7668038574512045e-05, "loss": 1.3897327184677124, "loss/kd": 2.341526985168457, "loss/lm": 0.4379383623600006, "step": 1195 }, { "epoch": 0.24553479778279613, "grad_norm": 1.3206425447535965, "kd_ratio": 0.5, "learning_rate": 1.7663768187046146e-05, "loss": 1.1716045141220093, "loss/kd": 1.9464225769042969, "loss/lm": 0.3967863917350769, "step": 1196 }, { "epoch": 0.2457400944364607, "grad_norm": 1.8956199849559494, "kd_ratio": 0.5, "learning_rate": 1.7659494410184263e-05, "loss": 1.1624597311019897, "loss/kd": 2.007694721221924, "loss/lm": 0.31722477078437805, "step": 1197 }, { "epoch": 0.24594539109012523, "grad_norm": 1.252179475579861, "kd_ratio": 0.5, "learning_rate": 1.7655217245816513e-05, "loss": 1.015536904335022, "loss/kd": 1.662867784500122, "loss/lm": 0.3682059645652771, "step": 1198 }, { "epoch": 0.2461506877437898, "grad_norm": 1.6124311016159771, "kd_ratio": 0.5, "learning_rate": 1.7650936695834536e-05, "loss": 1.1770113706588745, "loss/kd": 1.9455848932266235, "loss/lm": 0.4084378778934479, "step": 1199 }, { "epoch": 0.24635598439745432, "grad_norm": 1.9269272853142125, "kd_ratio": 0.5, "learning_rate": 1.7646652762131457e-05, "loss": 1.1149669885635376, "loss/kd": 1.8385735750198364, "loss/lm": 0.39136040210723877, "step": 1200 }, { "epoch": 0.24656128105111888, "grad_norm": 1.794983906869595, "kd_ratio": 0.5, "learning_rate": 1.7642365446601896e-05, "loss": 1.2719439268112183, "loss/kd": 2.098073959350586, "loss/lm": 0.4458138644695282, "step": 1201 }, { "epoch": 0.2467665777047834, "grad_norm": 1.5762723628032362, "kd_ratio": 0.5, "learning_rate": 1.7638074751141965e-05, "loss": 1.059622049331665, "loss/kd": 1.7495356798171997, "loss/lm": 0.3697082996368408, "step": 1202 }, { "epoch": 0.24697187435844795, "grad_norm": 1.3876955011188166, "kd_ratio": 0.5, "learning_rate": 1.763378067764929e-05, "loss": 1.028618335723877, "loss/kd": 1.6654216051101685, "loss/lm": 0.3918150067329407, "step": 1203 }, { "epoch": 0.2471771710121125, "grad_norm": 1.4039148597975586, "kd_ratio": 0.5, "learning_rate": 1.7629483228022968e-05, "loss": 1.1699274778366089, "loss/kd": 1.8608922958374023, "loss/lm": 0.4789626896381378, "step": 1204 }, { "epoch": 0.24738246766577704, "grad_norm": 1.6273260734080266, "kd_ratio": 0.5, "learning_rate": 1.7625182404163603e-05, "loss": 1.1372098922729492, "loss/kd": 1.7489097118377686, "loss/lm": 0.5255101919174194, "step": 1205 }, { "epoch": 0.2475877643194416, "grad_norm": 2.978268061325805, "kd_ratio": 0.5, "learning_rate": 1.7620878207973286e-05, "loss": 1.0271456241607666, "loss/kd": 1.5956008434295654, "loss/lm": 0.4586905241012573, "step": 1206 }, { "epoch": 0.24779306097310613, "grad_norm": 2.4324222143652796, "kd_ratio": 0.5, "learning_rate": 1.7616570641355602e-05, "loss": 1.3947112560272217, "loss/kd": 2.2968673706054688, "loss/lm": 0.49255505204200745, "step": 1207 }, { "epoch": 0.2479983576267707, "grad_norm": 1.9069106597843504, "kd_ratio": 0.5, "learning_rate": 1.7612259706215626e-05, "loss": 1.089651346206665, "loss/kd": 1.7028411626815796, "loss/lm": 0.47646158933639526, "step": 1208 }, { "epoch": 0.24820365428043523, "grad_norm": 1.54216209033752, "kd_ratio": 0.5, "learning_rate": 1.7607945404459917e-05, "loss": 1.2650146484375, "loss/kd": 2.0638015270233154, "loss/lm": 0.4662276804447174, "step": 1209 }, { "epoch": 0.2484089509340998, "grad_norm": 1.8437806546483724, "kd_ratio": 0.5, "learning_rate": 1.760362773799654e-05, "loss": 1.1598970890045166, "loss/kd": 1.9367656707763672, "loss/lm": 0.3830285668373108, "step": 1210 }, { "epoch": 0.24861424758776432, "grad_norm": 2.831933994454114, "kd_ratio": 0.5, "learning_rate": 1.759930670873502e-05, "loss": 1.234615445137024, "loss/kd": 2.019678831100464, "loss/lm": 0.44955214858055115, "step": 1211 }, { "epoch": 0.24881954424142885, "grad_norm": 3.5298385856639456, "kd_ratio": 0.5, "learning_rate": 1.75949823185864e-05, "loss": 1.2989507913589478, "loss/kd": 2.1303138732910156, "loss/lm": 0.46758776903152466, "step": 1212 }, { "epoch": 0.2490248408950934, "grad_norm": 1.1371415365560604, "kd_ratio": 0.5, "learning_rate": 1.7590654569463186e-05, "loss": 2.08896541595459, "loss/kd": 3.7223610877990723, "loss/lm": 0.4555697441101074, "step": 1213 }, { "epoch": 0.24923013754875795, "grad_norm": 3.0159302779735913, "kd_ratio": 0.5, "learning_rate": 1.7586323463279383e-05, "loss": 1.2991952896118164, "loss/kd": 2.0306107997894287, "loss/lm": 0.5677797794342041, "step": 1214 }, { "epoch": 0.2494354342024225, "grad_norm": 3.1419660786147476, "kd_ratio": 0.5, "learning_rate": 1.758198900195047e-05, "loss": 1.0332374572753906, "loss/kd": 1.6726924180984497, "loss/lm": 0.3937825858592987, "step": 1215 }, { "epoch": 0.24964073085608704, "grad_norm": 1.6268419425156175, "kd_ratio": 0.5, "learning_rate": 1.7577651187393424e-05, "loss": 1.1254876852035522, "loss/kd": 1.8054900169372559, "loss/lm": 0.445485383272171, "step": 1216 }, { "epoch": 0.2498460275097516, "grad_norm": 1.2377138674625177, "kd_ratio": 0.5, "learning_rate": 1.7573310021526696e-05, "loss": 1.1084530353546143, "loss/kd": 1.7568180561065674, "loss/lm": 0.4600878953933716, "step": 1217 }, { "epoch": 0.25005132416341613, "grad_norm": 1.934992384915025, "kd_ratio": 0.5, "learning_rate": 1.7568965506270212e-05, "loss": 1.227609395980835, "loss/kd": 2.0731284618377686, "loss/lm": 0.38209041953086853, "step": 1218 }, { "epoch": 0.25025662081708067, "grad_norm": 2.25533435430749, "kd_ratio": 0.5, "learning_rate": 1.7564617643545395e-05, "loss": 1.0886874198913574, "loss/kd": 1.6068154573440552, "loss/lm": 0.5705594420433044, "step": 1219 }, { "epoch": 0.25046191747074525, "grad_norm": 2.4075592970734823, "kd_ratio": 0.5, "learning_rate": 1.7560266435275143e-05, "loss": 1.169874668121338, "loss/kd": 1.9038814306259155, "loss/lm": 0.4358677864074707, "step": 1220 }, { "epoch": 0.2506672141244098, "grad_norm": 1.6324728635530794, "kd_ratio": 0.5, "learning_rate": 1.7555911883383823e-05, "loss": 1.050649642944336, "loss/kd": 1.6567327976226807, "loss/lm": 0.4445665180683136, "step": 1221 }, { "epoch": 0.2508725107780743, "grad_norm": 1.353267794446313, "kd_ratio": 0.5, "learning_rate": 1.7551553989797292e-05, "loss": 1.0865893363952637, "loss/kd": 1.710999608039856, "loss/lm": 0.46217894554138184, "step": 1222 }, { "epoch": 0.25107780743173885, "grad_norm": 1.7743736504905199, "kd_ratio": 0.5, "learning_rate": 1.7547192756442887e-05, "loss": 1.1097930669784546, "loss/kd": 1.7624469995498657, "loss/lm": 0.4571390748023987, "step": 1223 }, { "epoch": 0.2512831040854034, "grad_norm": 1.294685691948129, "kd_ratio": 0.5, "learning_rate": 1.7542828185249413e-05, "loss": 0.9201875329017639, "loss/kd": 1.4743603467941284, "loss/lm": 0.3660147190093994, "step": 1224 }, { "epoch": 0.251488400739068, "grad_norm": 1.0192436388098867, "kd_ratio": 0.5, "learning_rate": 1.7538460278147157e-05, "loss": 1.143908977508545, "loss/kd": 1.8951386213302612, "loss/lm": 0.3926793932914734, "step": 1225 }, { "epoch": 0.2516936973927325, "grad_norm": 1.1525993014793705, "kd_ratio": 0.5, "learning_rate": 1.753408903706788e-05, "loss": 1.3226473331451416, "loss/kd": 2.140378952026367, "loss/lm": 0.5049157738685608, "step": 1226 }, { "epoch": 0.25189899404639704, "grad_norm": 1.063578726995709, "kd_ratio": 0.5, "learning_rate": 1.7529714463944815e-05, "loss": 1.0982708930969238, "loss/kd": 1.8018581867218018, "loss/lm": 0.39468371868133545, "step": 1227 }, { "epoch": 0.2521042907000616, "grad_norm": 1.0025297962892552, "kd_ratio": 0.5, "learning_rate": 1.7525336560712675e-05, "loss": 1.1292052268981934, "loss/kd": 1.8423835039138794, "loss/lm": 0.4160270392894745, "step": 1228 }, { "epoch": 0.25230958735372616, "grad_norm": 1.1644183425515597, "kd_ratio": 0.5, "learning_rate": 1.7520955329307637e-05, "loss": 1.177880048751831, "loss/kd": 1.8873958587646484, "loss/lm": 0.4683641195297241, "step": 1229 }, { "epoch": 0.2525148840073907, "grad_norm": 0.9735989105068289, "kd_ratio": 0.5, "learning_rate": 1.7516570771667356e-05, "loss": 1.418558120727539, "loss/kd": 2.379700183868408, "loss/lm": 0.4574161171913147, "step": 1230 }, { "epoch": 0.2527201806610552, "grad_norm": 1.1642110082689392, "kd_ratio": 0.5, "learning_rate": 1.751218288973096e-05, "loss": 1.1727551221847534, "loss/kd": 1.9130326509475708, "loss/lm": 0.43247753381729126, "step": 1231 }, { "epoch": 0.25292547731471976, "grad_norm": 1.2508782285031546, "kd_ratio": 0.5, "learning_rate": 1.7507791685439038e-05, "loss": 1.1429897546768188, "loss/kd": 1.8651669025421143, "loss/lm": 0.4208126366138458, "step": 1232 }, { "epoch": 0.2531307739683843, "grad_norm": 1.4143005771063932, "kd_ratio": 0.5, "learning_rate": 1.750339716073366e-05, "loss": 1.3961925506591797, "loss/kd": 2.3002660274505615, "loss/lm": 0.4921189546585083, "step": 1233 }, { "epoch": 0.2533360706220489, "grad_norm": 1.1888330559763838, "kd_ratio": 0.5, "learning_rate": 1.7498999317558358e-05, "loss": 1.1582521200180054, "loss/kd": 1.837082028388977, "loss/lm": 0.4794222414493561, "step": 1234 }, { "epoch": 0.2535413672757134, "grad_norm": 1.170590590000046, "kd_ratio": 0.5, "learning_rate": 1.7494598157858127e-05, "loss": 1.0995235443115234, "loss/kd": 1.8570609092712402, "loss/lm": 0.3419860899448395, "step": 1235 }, { "epoch": 0.25374666392937795, "grad_norm": 1.3357100888990854, "kd_ratio": 0.5, "learning_rate": 1.749019368357944e-05, "loss": 1.1821491718292236, "loss/kd": 1.9727295637130737, "loss/lm": 0.39156877994537354, "step": 1236 }, { "epoch": 0.2539519605830425, "grad_norm": 1.1213542082752044, "kd_ratio": 0.5, "learning_rate": 1.7485785896670227e-05, "loss": 1.3782329559326172, "loss/kd": 2.2900943756103516, "loss/lm": 0.46637141704559326, "step": 1237 }, { "epoch": 0.25415725723670707, "grad_norm": 1.099527865436381, "kd_ratio": 0.5, "learning_rate": 1.748137479907989e-05, "loss": 1.0739543437957764, "loss/kd": 1.6789672374725342, "loss/lm": 0.4689413905143738, "step": 1238 }, { "epoch": 0.2543625538903716, "grad_norm": 1.0186675025256524, "kd_ratio": 0.5, "learning_rate": 1.7476960392759284e-05, "loss": 1.171476125717163, "loss/kd": 1.799246907234192, "loss/lm": 0.5437052249908447, "step": 1239 }, { "epoch": 0.25456785054403613, "grad_norm": 1.6027750184702012, "kd_ratio": 0.5, "learning_rate": 1.747254267966074e-05, "loss": 1.3474721908569336, "loss/kd": 2.2011916637420654, "loss/lm": 0.49375271797180176, "step": 1240 }, { "epoch": 0.25477314719770067, "grad_norm": 1.7379943799119968, "kd_ratio": 0.5, "learning_rate": 1.7468121661738046e-05, "loss": 0.9336268901824951, "loss/kd": 1.498882532119751, "loss/lm": 0.36837130784988403, "step": 1241 }, { "epoch": 0.2549784438513652, "grad_norm": 1.043162426420129, "kd_ratio": 0.5, "learning_rate": 1.7463697340946454e-05, "loss": 1.040440320968628, "loss/kd": 1.674315333366394, "loss/lm": 0.406565397977829, "step": 1242 }, { "epoch": 0.2551837405050298, "grad_norm": 1.1000919446498176, "kd_ratio": 0.5, "learning_rate": 1.7459269719242665e-05, "loss": 1.2767106294631958, "loss/kd": 2.193415403366089, "loss/lm": 0.3600058853626251, "step": 1243 }, { "epoch": 0.2553890371586943, "grad_norm": 1.157174523671609, "kd_ratio": 0.5, "learning_rate": 1.745483879858486e-05, "loss": 1.2266961336135864, "loss/kd": 2.0319435596466064, "loss/lm": 0.4214487671852112, "step": 1244 }, { "epoch": 0.25559433381235885, "grad_norm": 1.2159912924382204, "kd_ratio": 0.5, "learning_rate": 1.745040458093266e-05, "loss": 1.186066746711731, "loss/kd": 1.9114079475402832, "loss/lm": 0.4607256352901459, "step": 1245 }, { "epoch": 0.2557996304660234, "grad_norm": 1.3978783990701387, "kd_ratio": 0.5, "learning_rate": 1.744596706824716e-05, "loss": 1.126942753791809, "loss/kd": 1.7945290803909302, "loss/lm": 0.45935651659965515, "step": 1246 }, { "epoch": 0.256004927119688, "grad_norm": 1.0588094003097546, "kd_ratio": 0.5, "learning_rate": 1.74415262624909e-05, "loss": 1.192173719406128, "loss/kd": 1.9022862911224365, "loss/lm": 0.48206111788749695, "step": 1247 }, { "epoch": 0.2562102237733525, "grad_norm": 1.9031987976224518, "kd_ratio": 0.5, "learning_rate": 1.743708216562788e-05, "loss": 1.1482086181640625, "loss/kd": 1.8915833234786987, "loss/lm": 0.40483391284942627, "step": 1248 }, { "epoch": 0.25641552042701704, "grad_norm": 3.1454273930994194, "kd_ratio": 0.5, "learning_rate": 1.7432634779623564e-05, "loss": 1.089448094367981, "loss/kd": 1.7463431358337402, "loss/lm": 0.43255308270454407, "step": 1249 }, { "epoch": 0.2566208170806816, "grad_norm": 2.650969018551358, "kd_ratio": 0.5, "learning_rate": 1.742818410644485e-05, "loss": 1.249963402748108, "loss/kd": 2.046548366546631, "loss/lm": 0.4533785283565521, "step": 1250 }, { "epoch": 0.2568261137343461, "grad_norm": 1.2036755819174834, "kd_ratio": 0.5, "learning_rate": 1.742373014806012e-05, "loss": 0.9929270148277283, "loss/kd": 1.4213263988494873, "loss/lm": 0.5645276308059692, "step": 1251 }, { "epoch": 0.2570314103880107, "grad_norm": 1.5839658138496262, "kd_ratio": 0.5, "learning_rate": 1.7419272906439177e-05, "loss": 0.9535325169563293, "loss/kd": 1.445359706878662, "loss/lm": 0.4617052972316742, "step": 1252 }, { "epoch": 0.2572367070416752, "grad_norm": 2.3066571860509497, "kd_ratio": 0.5, "learning_rate": 1.7414812383553297e-05, "loss": 1.1820591688156128, "loss/kd": 1.851690649986267, "loss/lm": 0.5124277472496033, "step": 1253 }, { "epoch": 0.25744200369533976, "grad_norm": 2.6262296842064696, "kd_ratio": 0.5, "learning_rate": 1.741034858137521e-05, "loss": 1.110429286956787, "loss/kd": 1.8269559144973755, "loss/lm": 0.39390262961387634, "step": 1254 }, { "epoch": 0.2576473003490043, "grad_norm": 1.7335387813853909, "kd_ratio": 0.5, "learning_rate": 1.740588150187907e-05, "loss": 1.2519416809082031, "loss/kd": 2.050459384918213, "loss/lm": 0.45342400670051575, "step": 1255 }, { "epoch": 0.2578525970026689, "grad_norm": 1.5889155303131761, "kd_ratio": 0.5, "learning_rate": 1.7401411147040507e-05, "loss": 0.9760453701019287, "loss/kd": 1.5813506841659546, "loss/lm": 0.3707400858402252, "step": 1256 }, { "epoch": 0.2580578936563334, "grad_norm": 2.3584595108550452, "kd_ratio": 0.5, "learning_rate": 1.7396937518836595e-05, "loss": 1.2497403621673584, "loss/kd": 2.1542863845825195, "loss/lm": 0.34519439935684204, "step": 1257 }, { "epoch": 0.25826319030999795, "grad_norm": 1.3196827366518238, "kd_ratio": 0.5, "learning_rate": 1.7392460619245842e-05, "loss": 1.0978180170059204, "loss/kd": 1.7989836931228638, "loss/lm": 0.39665237069129944, "step": 1258 }, { "epoch": 0.2584684869636625, "grad_norm": 1.8349526602558013, "kd_ratio": 0.5, "learning_rate": 1.7387980450248222e-05, "loss": 1.9600619077682495, "loss/kd": 3.5344278812408447, "loss/lm": 0.38569602370262146, "step": 1259 }, { "epoch": 0.258673783617327, "grad_norm": 2.185938147694779, "kd_ratio": 0.5, "learning_rate": 1.738349701382514e-05, "loss": 1.2949248552322388, "loss/kd": 2.1344120502471924, "loss/lm": 0.45543769001960754, "step": 1260 }, { "epoch": 0.2588790802709916, "grad_norm": 2.1414998054426584, "kd_ratio": 0.5, "learning_rate": 1.7379010311959448e-05, "loss": 1.1514228582382202, "loss/kd": 1.9689176082611084, "loss/lm": 0.3339281678199768, "step": 1261 }, { "epoch": 0.25908437692465613, "grad_norm": 1.3398851422284366, "kd_ratio": 0.5, "learning_rate": 1.737452034663545e-05, "loss": 1.0918142795562744, "loss/kd": 1.7129707336425781, "loss/lm": 0.47065791487693787, "step": 1262 }, { "epoch": 0.25928967357832067, "grad_norm": 1.2897786592305944, "kd_ratio": 0.5, "learning_rate": 1.7370027119838884e-05, "loss": 1.1530109643936157, "loss/kd": 1.88149893283844, "loss/lm": 0.42452290654182434, "step": 1263 }, { "epoch": 0.2594949702319852, "grad_norm": 1.7586616612094277, "kd_ratio": 0.5, "learning_rate": 1.7365530633556938e-05, "loss": 1.1089881658554077, "loss/kd": 1.7263028621673584, "loss/lm": 0.4916735589504242, "step": 1264 }, { "epoch": 0.2597002668856498, "grad_norm": 2.2458235938457243, "kd_ratio": 0.5, "learning_rate": 1.736103088977824e-05, "loss": 0.883622407913208, "loss/kd": 1.3300734758377075, "loss/lm": 0.4371713399887085, "step": 1265 }, { "epoch": 0.2599055635393143, "grad_norm": 1.810962322928412, "kd_ratio": 0.5, "learning_rate": 1.7356527890492855e-05, "loss": 1.2228548526763916, "loss/kd": 1.989112377166748, "loss/lm": 0.4565972089767456, "step": 1266 }, { "epoch": 0.26011086019297885, "grad_norm": 1.0446216819158258, "kd_ratio": 0.5, "learning_rate": 1.735202163769229e-05, "loss": 1.2053124904632568, "loss/kd": 1.9862067699432373, "loss/lm": 0.4244181215763092, "step": 1267 }, { "epoch": 0.2603161568466434, "grad_norm": 1.5479173945446079, "kd_ratio": 0.5, "learning_rate": 1.7347512133369494e-05, "loss": 1.3385164737701416, "loss/kd": 2.3054542541503906, "loss/lm": 0.37157875299453735, "step": 1268 }, { "epoch": 0.2605214535003079, "grad_norm": 1.051774039909251, "kd_ratio": 0.5, "learning_rate": 1.7342999379518842e-05, "loss": 1.6102112531661987, "loss/kd": 2.7649593353271484, "loss/lm": 0.455463171005249, "step": 1269 }, { "epoch": 0.2607267501539725, "grad_norm": 1.1859850697233694, "kd_ratio": 0.5, "learning_rate": 1.7338483378136165e-05, "loss": 1.2110954523086548, "loss/kd": 1.981471300125122, "loss/lm": 0.4407196044921875, "step": 1270 }, { "epoch": 0.26093204680763704, "grad_norm": 1.3165115423330602, "kd_ratio": 0.5, "learning_rate": 1.7333964131218714e-05, "loss": 1.1075713634490967, "loss/kd": 1.8342187404632568, "loss/lm": 0.3809240758419037, "step": 1271 }, { "epoch": 0.2611373434613016, "grad_norm": 1.079175989979073, "kd_ratio": 0.5, "learning_rate": 1.7329441640765187e-05, "loss": 1.1184030771255493, "loss/kd": 1.7996678352355957, "loss/lm": 0.4371383488178253, "step": 1272 }, { "epoch": 0.2613426401149661, "grad_norm": 1.0759973061925774, "kd_ratio": 0.5, "learning_rate": 1.7324915908775708e-05, "loss": 1.027976155281067, "loss/kd": 1.6489081382751465, "loss/lm": 0.40704408288002014, "step": 1273 }, { "epoch": 0.2615479367686307, "grad_norm": 1.1780850330753194, "kd_ratio": 0.5, "learning_rate": 1.732038693725184e-05, "loss": 0.9837092757225037, "loss/kd": 1.5848259925842285, "loss/lm": 0.3825925290584564, "step": 1274 }, { "epoch": 0.26175323342229523, "grad_norm": 1.1621286830783935, "kd_ratio": 0.5, "learning_rate": 1.7315854728196568e-05, "loss": 1.095073938369751, "loss/kd": 1.7258026599884033, "loss/lm": 0.4643453359603882, "step": 1275 }, { "epoch": 0.26195853007595976, "grad_norm": 1.1865481846643786, "kd_ratio": 0.5, "learning_rate": 1.731131928361433e-05, "loss": 0.9938222765922546, "loss/kd": 1.5607494115829468, "loss/lm": 0.4268951416015625, "step": 1276 }, { "epoch": 0.2621638267296243, "grad_norm": 1.18179164448945, "kd_ratio": 0.5, "learning_rate": 1.7306780605510973e-05, "loss": 1.1560239791870117, "loss/kd": 1.9295421838760376, "loss/lm": 0.3825058937072754, "step": 1277 }, { "epoch": 0.2623691233832888, "grad_norm": 1.2707027253310814, "kd_ratio": 0.5, "learning_rate": 1.7302238695893788e-05, "loss": 1.1410378217697144, "loss/kd": 1.8757390975952148, "loss/lm": 0.40633657574653625, "step": 1278 }, { "epoch": 0.2625744200369534, "grad_norm": 1.0960151464834818, "kd_ratio": 0.5, "learning_rate": 1.729769355677149e-05, "loss": 1.13515305519104, "loss/kd": 1.8596694469451904, "loss/lm": 0.4106365740299225, "step": 1279 }, { "epoch": 0.26277971669061795, "grad_norm": 1.124960500984649, "kd_ratio": 0.5, "learning_rate": 1.729314519015422e-05, "loss": 1.1466063261032104, "loss/kd": 1.795422077178955, "loss/lm": 0.4977906346321106, "step": 1280 }, { "epoch": 0.2629850133442825, "grad_norm": 1.1967161028444178, "kd_ratio": 0.5, "learning_rate": 1.728859359805355e-05, "loss": 1.009652853012085, "loss/kd": 1.493116855621338, "loss/lm": 0.5261889696121216, "step": 1281 }, { "epoch": 0.263190309997947, "grad_norm": 1.588770616317561, "kd_ratio": 0.5, "learning_rate": 1.728403878248248e-05, "loss": 1.1748849153518677, "loss/kd": 1.924782633781433, "loss/lm": 0.424987256526947, "step": 1282 }, { "epoch": 0.2633956066516116, "grad_norm": 1.278178096445154, "kd_ratio": 0.5, "learning_rate": 1.7279480745455433e-05, "loss": 1.2913877964019775, "loss/kd": 2.182147979736328, "loss/lm": 0.4006275534629822, "step": 1283 }, { "epoch": 0.26360090330527614, "grad_norm": 1.2985647889447167, "kd_ratio": 0.5, "learning_rate": 1.7274919488988256e-05, "loss": 0.9829603433609009, "loss/kd": 1.6085907220840454, "loss/lm": 0.3573299050331116, "step": 1284 }, { "epoch": 0.26380619995894067, "grad_norm": 1.7269478480507723, "kd_ratio": 0.5, "learning_rate": 1.727035501509822e-05, "loss": 1.0991311073303223, "loss/kd": 1.6493538618087769, "loss/lm": 0.5489082932472229, "step": 1285 }, { "epoch": 0.2640114966126052, "grad_norm": 1.530268178010838, "kd_ratio": 0.5, "learning_rate": 1.7265787325804024e-05, "loss": 1.025407314300537, "loss/kd": 1.6387807130813599, "loss/lm": 0.4120337963104248, "step": 1286 }, { "epoch": 0.26421679326626973, "grad_norm": 1.2444930395564686, "kd_ratio": 0.5, "learning_rate": 1.7261216423125782e-05, "loss": 1.2306307554244995, "loss/kd": 2.106492280960083, "loss/lm": 0.35476914048194885, "step": 1287 }, { "epoch": 0.2644220899199343, "grad_norm": 1.0900331083053907, "kd_ratio": 0.5, "learning_rate": 1.725664230908503e-05, "loss": 1.0835399627685547, "loss/kd": 1.757896900177002, "loss/lm": 0.4091830849647522, "step": 1288 }, { "epoch": 0.26462738657359886, "grad_norm": 1.0812752629111213, "kd_ratio": 0.5, "learning_rate": 1.725206498570473e-05, "loss": 1.1403148174285889, "loss/kd": 1.9186248779296875, "loss/lm": 0.3620048463344574, "step": 1289 }, { "epoch": 0.2648326832272634, "grad_norm": 1.09078639359206, "kd_ratio": 0.5, "learning_rate": 1.7247484455009257e-05, "loss": 1.0770165920257568, "loss/kd": 1.741104245185852, "loss/lm": 0.4129290282726288, "step": 1290 }, { "epoch": 0.2650379798809279, "grad_norm": 1.0195526582462238, "kd_ratio": 0.5, "learning_rate": 1.724290071902441e-05, "loss": 1.1347073316574097, "loss/kd": 1.858191967010498, "loss/lm": 0.4112227261066437, "step": 1291 }, { "epoch": 0.2652432765345925, "grad_norm": 1.1523777012244523, "kd_ratio": 0.5, "learning_rate": 1.72383137797774e-05, "loss": 1.0202404260635376, "loss/kd": 1.624408483505249, "loss/lm": 0.41607245802879333, "step": 1292 }, { "epoch": 0.26544857318825704, "grad_norm": 1.0812986744557125, "kd_ratio": 0.5, "learning_rate": 1.7233723639296857e-05, "loss": 1.1015470027923584, "loss/kd": 1.8026551008224487, "loss/lm": 0.4004388451576233, "step": 1293 }, { "epoch": 0.2656538698419216, "grad_norm": 1.4378479216734648, "kd_ratio": 0.5, "learning_rate": 1.722913029961283e-05, "loss": 1.2821121215820312, "loss/kd": 2.083937644958496, "loss/lm": 0.4802866280078888, "step": 1294 }, { "epoch": 0.2658591664955861, "grad_norm": 1.1470436734462033, "kd_ratio": 0.5, "learning_rate": 1.7224533762756775e-05, "loss": 1.4068317413330078, "loss/kd": 2.2332851886749268, "loss/lm": 0.5803783535957336, "step": 1295 }, { "epoch": 0.26606446314925064, "grad_norm": 1.3308017851819918, "kd_ratio": 0.5, "learning_rate": 1.7219934030761578e-05, "loss": 1.9376747608184814, "loss/kd": 3.572310447692871, "loss/lm": 0.3030390441417694, "step": 1296 }, { "epoch": 0.26626975980291523, "grad_norm": 1.5451055784178769, "kd_ratio": 0.5, "learning_rate": 1.7215331105661518e-05, "loss": 1.0708669424057007, "loss/kd": 1.7711101770401, "loss/lm": 0.3706236779689789, "step": 1297 }, { "epoch": 0.26647505645657976, "grad_norm": 1.6684879666216386, "kd_ratio": 0.5, "learning_rate": 1.7210724989492298e-05, "loss": 1.3335411548614502, "loss/kd": 2.236464023590088, "loss/lm": 0.43061816692352295, "step": 1298 }, { "epoch": 0.2666803531102443, "grad_norm": 1.3979531831838494, "kd_ratio": 0.5, "learning_rate": 1.720611568429103e-05, "loss": 1.1990368366241455, "loss/kd": 1.9552737474441528, "loss/lm": 0.44280001521110535, "step": 1299 }, { "epoch": 0.2668856497639088, "grad_norm": 1.072048808374234, "kd_ratio": 0.5, "learning_rate": 1.7201503192096233e-05, "loss": 1.0953235626220703, "loss/kd": 1.8837683200836182, "loss/lm": 0.306878924369812, "step": 1300 }, { "epoch": 0.2670909464175734, "grad_norm": 1.004715459490765, "kd_ratio": 0.5, "learning_rate": 1.7196887514947838e-05, "loss": 0.9805500507354736, "loss/kd": 1.5920050144195557, "loss/lm": 0.369095116853714, "step": 1301 }, { "epoch": 0.26729624307123795, "grad_norm": 1.4108947743832294, "kd_ratio": 0.5, "learning_rate": 1.7192268654887193e-05, "loss": 0.9536053538322449, "loss/kd": 1.5058709383010864, "loss/lm": 0.4013397991657257, "step": 1302 }, { "epoch": 0.2675015397249025, "grad_norm": 1.3377801416253068, "kd_ratio": 0.5, "learning_rate": 1.718764661395704e-05, "loss": 0.9033289551734924, "loss/kd": 1.4232226610183716, "loss/lm": 0.3834352493286133, "step": 1303 }, { "epoch": 0.267706836378567, "grad_norm": 1.2020973913572754, "kd_ratio": 0.5, "learning_rate": 1.7183021394201533e-05, "loss": 1.1034080982208252, "loss/kd": 1.8232706785202026, "loss/lm": 0.3835454285144806, "step": 1304 }, { "epoch": 0.26791213303223155, "grad_norm": 1.2081321377520784, "kd_ratio": 0.5, "learning_rate": 1.7178392997666237e-05, "loss": 0.9839969873428345, "loss/kd": 1.5547014474868774, "loss/lm": 0.4132925868034363, "step": 1305 }, { "epoch": 0.26811742968589614, "grad_norm": 1.1812496823028906, "kd_ratio": 0.5, "learning_rate": 1.717376142639811e-05, "loss": 0.9354917407035828, "loss/kd": 1.4695045948028564, "loss/lm": 0.40147891640663147, "step": 1306 }, { "epoch": 0.26832272633956067, "grad_norm": 1.512908682973604, "kd_ratio": 0.5, "learning_rate": 1.716912668244553e-05, "loss": 1.1920490264892578, "loss/kd": 1.89142906665802, "loss/lm": 0.492669016122818, "step": 1307 }, { "epoch": 0.2685280229932252, "grad_norm": 1.3949123996581279, "kd_ratio": 0.5, "learning_rate": 1.7164488767858262e-05, "loss": 1.3712143898010254, "loss/kd": 2.275595188140869, "loss/lm": 0.4668337106704712, "step": 1308 }, { "epoch": 0.26873331964688973, "grad_norm": 1.828177483640373, "kd_ratio": 0.5, "learning_rate": 1.715984768468749e-05, "loss": 1.1150233745574951, "loss/kd": 1.796573281288147, "loss/lm": 0.43347346782684326, "step": 1309 }, { "epoch": 0.2689386163005543, "grad_norm": 1.2892232362325318, "kd_ratio": 0.5, "learning_rate": 1.715520343498578e-05, "loss": 1.1587724685668945, "loss/kd": 1.8737112283706665, "loss/lm": 0.4438338279724121, "step": 1310 }, { "epoch": 0.26914391295421886, "grad_norm": 1.2186610005621636, "kd_ratio": 0.5, "learning_rate": 1.715055602080711e-05, "loss": 1.1649417877197266, "loss/kd": 1.830023169517517, "loss/lm": 0.49986031651496887, "step": 1311 }, { "epoch": 0.2693492096078834, "grad_norm": 1.5847036746620995, "kd_ratio": 0.5, "learning_rate": 1.7145905444206868e-05, "loss": 1.255257487297058, "loss/kd": 2.0306127071380615, "loss/lm": 0.4799022674560547, "step": 1312 }, { "epoch": 0.2695545062615479, "grad_norm": 1.1657242509715025, "kd_ratio": 0.5, "learning_rate": 1.714125170724182e-05, "loss": 1.3085168600082397, "loss/kd": 2.202357292175293, "loss/lm": 0.41467639803886414, "step": 1313 }, { "epoch": 0.26975980291521245, "grad_norm": 1.4522352058769066, "kd_ratio": 0.5, "learning_rate": 1.7136594811970132e-05, "loss": 1.20489501953125, "loss/kd": 1.9899563789367676, "loss/lm": 0.4198336899280548, "step": 1314 }, { "epoch": 0.26996509956887704, "grad_norm": 1.4441950982389082, "kd_ratio": 0.5, "learning_rate": 1.7131934760451385e-05, "loss": 1.0579731464385986, "loss/kd": 1.6894562244415283, "loss/lm": 0.42649000883102417, "step": 1315 }, { "epoch": 0.2701703962225416, "grad_norm": 1.7060793141186639, "kd_ratio": 0.5, "learning_rate": 1.7127271554746538e-05, "loss": 1.3183443546295166, "loss/kd": 2.2336807250976562, "loss/lm": 0.40300804376602173, "step": 1316 }, { "epoch": 0.2703756928762061, "grad_norm": 1.0150728507141924, "kd_ratio": 0.5, "learning_rate": 1.7122605196917957e-05, "loss": 1.375773310661316, "loss/kd": 2.2396249771118164, "loss/lm": 0.5119217038154602, "step": 1317 }, { "epoch": 0.27058098952987064, "grad_norm": 1.6085578674200178, "kd_ratio": 0.5, "learning_rate": 1.7117935689029386e-05, "loss": 1.0214320421218872, "loss/kd": 1.6063188314437866, "loss/lm": 0.43654516339302063, "step": 1318 }, { "epoch": 0.27078628618353523, "grad_norm": 1.1638383101491359, "kd_ratio": 0.5, "learning_rate": 1.7113263033145985e-05, "loss": 1.101091742515564, "loss/kd": 1.8069078922271729, "loss/lm": 0.3952755630016327, "step": 1319 }, { "epoch": 0.27099158283719976, "grad_norm": 1.615168571961889, "kd_ratio": 0.5, "learning_rate": 1.7108587231334285e-05, "loss": 1.1234900951385498, "loss/kd": 1.8292170763015747, "loss/lm": 0.41776320338249207, "step": 1320 }, { "epoch": 0.2711968794908643, "grad_norm": 1.8438230027278002, "kd_ratio": 0.5, "learning_rate": 1.7103908285662216e-05, "loss": 1.0690191984176636, "loss/kd": 1.7283471822738647, "loss/lm": 0.4096912443637848, "step": 1321 }, { "epoch": 0.27140217614452883, "grad_norm": 2.1892659651511885, "kd_ratio": 0.5, "learning_rate": 1.7099226198199104e-05, "loss": 1.224934458732605, "loss/kd": 2.0852303504943848, "loss/lm": 0.36463862657546997, "step": 1322 }, { "epoch": 0.2716074727981934, "grad_norm": 2.7097608741189942, "kd_ratio": 0.5, "learning_rate": 1.7094540971015663e-05, "loss": 1.0905613899230957, "loss/kd": 1.7670646905899048, "loss/lm": 0.41405802965164185, "step": 1323 }, { "epoch": 0.27181276945185795, "grad_norm": 3.281554865383141, "kd_ratio": 0.5, "learning_rate": 1.7089852606183986e-05, "loss": 1.0888152122497559, "loss/kd": 1.7097139358520508, "loss/lm": 0.4679165780544281, "step": 1324 }, { "epoch": 0.2720180661055225, "grad_norm": 2.2766159624324716, "kd_ratio": 0.5, "learning_rate": 1.7085161105777563e-05, "loss": 1.135389804840088, "loss/kd": 1.911694049835205, "loss/lm": 0.35908564925193787, "step": 1325 }, { "epoch": 0.272223362759187, "grad_norm": 1.4153691480596484, "kd_ratio": 0.5, "learning_rate": 1.7080466471871264e-05, "loss": 1.1722559928894043, "loss/kd": 1.9467437267303467, "loss/lm": 0.3977682888507843, "step": 1326 }, { "epoch": 0.27242865941285155, "grad_norm": 1.540249223468573, "kd_ratio": 0.5, "learning_rate": 1.7075768706541355e-05, "loss": 1.361094355583191, "loss/kd": 2.34983491897583, "loss/lm": 0.372353732585907, "step": 1327 }, { "epoch": 0.27263395606651614, "grad_norm": 2.557196565017348, "kd_ratio": 0.5, "learning_rate": 1.7071067811865477e-05, "loss": 1.2814700603485107, "loss/kd": 2.2065255641937256, "loss/lm": 0.35641446709632874, "step": 1328 }, { "epoch": 0.27283925272018067, "grad_norm": 1.795073176948417, "kd_ratio": 0.5, "learning_rate": 1.706636378992266e-05, "loss": 0.9741994738578796, "loss/kd": 1.5638315677642822, "loss/lm": 0.38456740975379944, "step": 1329 }, { "epoch": 0.2730445493738452, "grad_norm": 1.2947960142733645, "kd_ratio": 0.5, "learning_rate": 1.7061656642793313e-05, "loss": 1.1668123006820679, "loss/kd": 1.9792085886001587, "loss/lm": 0.3544161021709442, "step": 1330 }, { "epoch": 0.27324984602750974, "grad_norm": 1.8553421696600318, "kd_ratio": 0.5, "learning_rate": 1.7056946372559234e-05, "loss": 1.2089145183563232, "loss/kd": 2.016201972961426, "loss/lm": 0.40162694454193115, "step": 1331 }, { "epoch": 0.2734551426811743, "grad_norm": 1.1698758753836114, "kd_ratio": 0.5, "learning_rate": 1.7052232981303594e-05, "loss": 1.0784111022949219, "loss/kd": 1.7509478330612183, "loss/lm": 0.4058743417263031, "step": 1332 }, { "epoch": 0.27366043933483886, "grad_norm": 1.358329183362983, "kd_ratio": 0.5, "learning_rate": 1.7047516471110953e-05, "loss": 1.01827871799469, "loss/kd": 1.6099234819412231, "loss/lm": 0.42663395404815674, "step": 1333 }, { "epoch": 0.2738657359885034, "grad_norm": 1.534668330850748, "kd_ratio": 0.5, "learning_rate": 1.7042796844067246e-05, "loss": 1.0519044399261475, "loss/kd": 1.7104954719543457, "loss/lm": 0.39331331849098206, "step": 1334 }, { "epoch": 0.2740710326421679, "grad_norm": 2.2410605984721457, "kd_ratio": 0.5, "learning_rate": 1.7038074102259775e-05, "loss": 0.9048126935958862, "loss/kd": 1.4933228492736816, "loss/lm": 0.31630247831344604, "step": 1335 }, { "epoch": 0.27427632929583246, "grad_norm": 2.3119431828823656, "kd_ratio": 0.5, "learning_rate": 1.7033348247777245e-05, "loss": 1.3225473165512085, "loss/kd": 2.2432844638824463, "loss/lm": 0.4018101990222931, "step": 1336 }, { "epoch": 0.27448162594949704, "grad_norm": 1.5792766999516703, "kd_ratio": 0.5, "learning_rate": 1.7028619282709718e-05, "loss": 1.3161447048187256, "loss/kd": 2.2092416286468506, "loss/lm": 0.42304766178131104, "step": 1337 }, { "epoch": 0.2746869226031616, "grad_norm": 1.0240675300631432, "kd_ratio": 0.5, "learning_rate": 1.7023887209148636e-05, "loss": 1.066137671470642, "loss/kd": 1.761297345161438, "loss/lm": 0.37097808718681335, "step": 1338 }, { "epoch": 0.2748922192568261, "grad_norm": 2.0061984326390108, "kd_ratio": 0.5, "learning_rate": 1.7019152029186817e-05, "loss": 0.9483902454376221, "loss/kd": 1.4928594827651978, "loss/lm": 0.403920978307724, "step": 1339 }, { "epoch": 0.27509751591049064, "grad_norm": 2.4527467152387463, "kd_ratio": 0.5, "learning_rate": 1.7014413744918453e-05, "loss": 1.116959571838379, "loss/kd": 1.7924394607543945, "loss/lm": 0.4414796531200409, "step": 1340 }, { "epoch": 0.27530281256415523, "grad_norm": 2.0786856240743186, "kd_ratio": 0.5, "learning_rate": 1.700967235843911e-05, "loss": 1.1331150531768799, "loss/kd": 1.8964182138442993, "loss/lm": 0.36981201171875, "step": 1341 }, { "epoch": 0.27550810921781976, "grad_norm": 1.3507058376459604, "kd_ratio": 0.5, "learning_rate": 1.7004927871845725e-05, "loss": 1.0326817035675049, "loss/kd": 1.6554101705551147, "loss/lm": 0.409953236579895, "step": 1342 }, { "epoch": 0.2757134058714843, "grad_norm": 0.9967888875884409, "kd_ratio": 0.5, "learning_rate": 1.70001802872366e-05, "loss": 1.5841916799545288, "loss/kd": 2.7149734497070312, "loss/lm": 0.453409880399704, "step": 1343 }, { "epoch": 0.27591870252514883, "grad_norm": 1.1140791703625843, "kd_ratio": 0.5, "learning_rate": 1.699542960671142e-05, "loss": 1.0869065523147583, "loss/kd": 1.7437266111373901, "loss/lm": 0.4300864636898041, "step": 1344 }, { "epoch": 0.27612399917881336, "grad_norm": 1.0547337539189645, "kd_ratio": 0.5, "learning_rate": 1.6990675832371232e-05, "loss": 1.0669804811477661, "loss/kd": 1.7778664827346802, "loss/lm": 0.3560943901538849, "step": 1345 }, { "epoch": 0.27632929583247795, "grad_norm": 1.080390927357975, "kd_ratio": 0.5, "learning_rate": 1.6985918966318444e-05, "loss": 1.0269074440002441, "loss/kd": 1.6571677923202515, "loss/lm": 0.3966471552848816, "step": 1346 }, { "epoch": 0.2765345924861425, "grad_norm": 1.0681187155670688, "kd_ratio": 0.5, "learning_rate": 1.6981159010656847e-05, "loss": 1.0561190843582153, "loss/kd": 1.7434254884719849, "loss/lm": 0.36881259083747864, "step": 1347 }, { "epoch": 0.276739889139807, "grad_norm": 1.072951461390765, "kd_ratio": 0.5, "learning_rate": 1.6976395967491585e-05, "loss": 1.7311131954193115, "loss/kd": 3.1383278369903564, "loss/lm": 0.3238985538482666, "step": 1348 }, { "epoch": 0.27694518579347155, "grad_norm": 1.3704106668408347, "kd_ratio": 0.5, "learning_rate": 1.697162983892917e-05, "loss": 1.190291404724121, "loss/kd": 1.939703106880188, "loss/lm": 0.44087958335876465, "step": 1349 }, { "epoch": 0.27715048244713614, "grad_norm": 1.1697459163100399, "kd_ratio": 0.5, "learning_rate": 1.6966860627077494e-05, "loss": 1.0782777070999146, "loss/kd": 1.7639374732971191, "loss/lm": 0.39261797070503235, "step": 1350 }, { "epoch": 0.27735577910080067, "grad_norm": 1.0463726966092046, "kd_ratio": 0.5, "learning_rate": 1.6962088334045785e-05, "loss": 1.0213501453399658, "loss/kd": 1.5472384691238403, "loss/lm": 0.49546194076538086, "step": 1351 }, { "epoch": 0.2775610757544652, "grad_norm": 1.3283192877252061, "kd_ratio": 0.5, "learning_rate": 1.6957312961944653e-05, "loss": 1.2581264972686768, "loss/kd": 2.110320806503296, "loss/lm": 0.4059321880340576, "step": 1352 }, { "epoch": 0.27776637240812974, "grad_norm": 1.6563970084114563, "kd_ratio": 0.5, "learning_rate": 1.695253451288607e-05, "loss": 1.084373116493225, "loss/kd": 1.7518937587738037, "loss/lm": 0.41685256361961365, "step": 1353 }, { "epoch": 0.27797166906179427, "grad_norm": 1.0621053889338383, "kd_ratio": 0.5, "learning_rate": 1.694775298898336e-05, "loss": 1.2590484619140625, "loss/kd": 2.0540292263031006, "loss/lm": 0.46406760811805725, "step": 1354 }, { "epoch": 0.27817696571545886, "grad_norm": 1.7273950238871052, "kd_ratio": 0.5, "learning_rate": 1.694296839235121e-05, "loss": 2.1117045879364014, "loss/kd": 3.8254756927490234, "loss/lm": 0.3979332745075226, "step": 1355 }, { "epoch": 0.2783822623691234, "grad_norm": 1.6315306490551125, "kd_ratio": 0.5, "learning_rate": 1.693818072510567e-05, "loss": 1.1630905866622925, "loss/kd": 1.9197945594787598, "loss/lm": 0.4063865542411804, "step": 1356 }, { "epoch": 0.2785875590227879, "grad_norm": 1.498676094594886, "kd_ratio": 0.5, "learning_rate": 1.6933389989364145e-05, "loss": 1.8959019184112549, "loss/kd": 3.501023054122925, "loss/lm": 0.29078081250190735, "step": 1357 }, { "epoch": 0.27879285567645246, "grad_norm": 1.2029302069010175, "kd_ratio": 0.5, "learning_rate": 1.692859618724539e-05, "loss": 1.1395864486694336, "loss/kd": 1.851898193359375, "loss/lm": 0.42727458477020264, "step": 1358 }, { "epoch": 0.27899815233011704, "grad_norm": 1.0687870340905326, "kd_ratio": 0.5, "learning_rate": 1.692379932086953e-05, "loss": 1.159960389137268, "loss/kd": 1.9252738952636719, "loss/lm": 0.39464691281318665, "step": 1359 }, { "epoch": 0.2792034489837816, "grad_norm": 1.1165205154703948, "kd_ratio": 0.5, "learning_rate": 1.6918999392358037e-05, "loss": 1.058595061302185, "loss/kd": 1.7279568910598755, "loss/lm": 0.389233261346817, "step": 1360 }, { "epoch": 0.2794087456374461, "grad_norm": 1.13617684396221, "kd_ratio": 0.5, "learning_rate": 1.691419640383374e-05, "loss": 0.890415370464325, "loss/kd": 1.4066251516342163, "loss/lm": 0.3742055892944336, "step": 1361 }, { "epoch": 0.27961404229111064, "grad_norm": 1.7827075531529468, "kd_ratio": 0.5, "learning_rate": 1.6909390357420816e-05, "loss": 1.0497711896896362, "loss/kd": 1.7301805019378662, "loss/lm": 0.36936184763908386, "step": 1362 }, { "epoch": 0.2798193389447752, "grad_norm": 1.8168005602478172, "kd_ratio": 0.5, "learning_rate": 1.6904581255244802e-05, "loss": 1.3011269569396973, "loss/kd": 2.0941977500915527, "loss/lm": 0.5080562233924866, "step": 1363 }, { "epoch": 0.28002463559843976, "grad_norm": 1.2514920467851962, "kd_ratio": 0.5, "learning_rate": 1.689976909943258e-05, "loss": 1.022472858428955, "loss/kd": 1.6097990274429321, "loss/lm": 0.43514660000801086, "step": 1364 }, { "epoch": 0.2802299322521043, "grad_norm": 2.1420796796408585, "kd_ratio": 0.5, "learning_rate": 1.6894953892112388e-05, "loss": 1.1639468669891357, "loss/kd": 1.9277043342590332, "loss/lm": 0.40018945932388306, "step": 1365 }, { "epoch": 0.28043522890576883, "grad_norm": 2.665940336900657, "kd_ratio": 0.5, "learning_rate": 1.6890135635413805e-05, "loss": 1.1449280977249146, "loss/kd": 1.8653111457824707, "loss/lm": 0.4245450496673584, "step": 1366 }, { "epoch": 0.28064052555943336, "grad_norm": 2.8109333412786417, "kd_ratio": 0.5, "learning_rate": 1.688531433146777e-05, "loss": 1.131895661354065, "loss/kd": 1.8186436891555786, "loss/lm": 0.4451475739479065, "step": 1367 }, { "epoch": 0.28084582221309795, "grad_norm": 1.815804391961722, "kd_ratio": 0.5, "learning_rate": 1.6880489982406568e-05, "loss": 0.969666063785553, "loss/kd": 1.5273696184158325, "loss/lm": 0.41196250915527344, "step": 1368 }, { "epoch": 0.2810511188667625, "grad_norm": 1.0317550403756393, "kd_ratio": 0.5, "learning_rate": 1.687566259036382e-05, "loss": 1.1043027639389038, "loss/kd": 1.7676621675491333, "loss/lm": 0.4409434497356415, "step": 1369 }, { "epoch": 0.281256415520427, "grad_norm": 1.981201455298469, "kd_ratio": 0.5, "learning_rate": 1.6870832157474496e-05, "loss": 1.078946828842163, "loss/kd": 1.6918585300445557, "loss/lm": 0.4660350978374481, "step": 1370 }, { "epoch": 0.28146171217409155, "grad_norm": 1.9249340897089307, "kd_ratio": 0.5, "learning_rate": 1.6865998685874923e-05, "loss": 1.2784126996994019, "loss/kd": 2.014699697494507, "loss/lm": 0.5421256422996521, "step": 1371 }, { "epoch": 0.2816670088277561, "grad_norm": 1.424043637501572, "kd_ratio": 0.5, "learning_rate": 1.686116217770276e-05, "loss": 1.100435495376587, "loss/kd": 1.736507773399353, "loss/lm": 0.4643631875514984, "step": 1372 }, { "epoch": 0.28187230548142067, "grad_norm": 2.1025357904420843, "kd_ratio": 0.5, "learning_rate": 1.6856322635097013e-05, "loss": 0.9512017965316772, "loss/kd": 1.4796311855316162, "loss/lm": 0.4227723777294159, "step": 1373 }, { "epoch": 0.2820776021350852, "grad_norm": 1.295873986170015, "kd_ratio": 0.5, "learning_rate": 1.685148006019803e-05, "loss": 1.214162826538086, "loss/kd": 2.03454852104187, "loss/lm": 0.39377716183662415, "step": 1374 }, { "epoch": 0.28228289878874974, "grad_norm": 1.864250724631448, "kd_ratio": 0.5, "learning_rate": 1.6846634455147498e-05, "loss": 0.9548526406288147, "loss/kd": 1.5213799476623535, "loss/lm": 0.3883253037929535, "step": 1375 }, { "epoch": 0.28248819544241427, "grad_norm": 2.6715139192411423, "kd_ratio": 0.5, "learning_rate": 1.6841785822088445e-05, "loss": 1.1042494773864746, "loss/kd": 1.762739896774292, "loss/lm": 0.4457589387893677, "step": 1376 }, { "epoch": 0.28269349209607886, "grad_norm": 1.8249056052996635, "kd_ratio": 0.5, "learning_rate": 1.683693416316524e-05, "loss": 1.1175603866577148, "loss/kd": 1.8788363933563232, "loss/lm": 0.35628440976142883, "step": 1377 }, { "epoch": 0.2828987887497434, "grad_norm": 1.122995306269613, "kd_ratio": 0.5, "learning_rate": 1.683207948052359e-05, "loss": 0.956916093826294, "loss/kd": 1.571276068687439, "loss/lm": 0.3425561189651489, "step": 1378 }, { "epoch": 0.2831040854034079, "grad_norm": 1.5973586310826204, "kd_ratio": 0.5, "learning_rate": 1.6827221776310532e-05, "loss": 1.0317405462265015, "loss/kd": 1.7033582925796509, "loss/lm": 0.3601228892803192, "step": 1379 }, { "epoch": 0.28330938205707246, "grad_norm": 1.9949523046219888, "kd_ratio": 0.5, "learning_rate": 1.6822361052674455e-05, "loss": 1.1368799209594727, "loss/kd": 1.8183577060699463, "loss/lm": 0.4554022550582886, "step": 1380 }, { "epoch": 0.283514678710737, "grad_norm": 1.6824473456274431, "kd_ratio": 0.5, "learning_rate": 1.681749731176507e-05, "loss": 1.0806101560592651, "loss/kd": 1.7820219993591309, "loss/lm": 0.3791983723640442, "step": 1381 }, { "epoch": 0.2837199753644016, "grad_norm": 1.66486825589785, "kd_ratio": 0.5, "learning_rate": 1.6812630555733425e-05, "loss": 1.4261151552200317, "loss/kd": 2.435544967651367, "loss/lm": 0.4166853427886963, "step": 1382 }, { "epoch": 0.2839252720180661, "grad_norm": 1.9426162182814395, "kd_ratio": 0.5, "learning_rate": 1.6807760786731905e-05, "loss": 1.00077223777771, "loss/kd": 1.6138120889663696, "loss/lm": 0.3877323567867279, "step": 1383 }, { "epoch": 0.28413056867173064, "grad_norm": 2.8016667103804784, "kd_ratio": 0.5, "learning_rate": 1.6802888006914223e-05, "loss": 1.057087779045105, "loss/kd": 1.692028522491455, "loss/lm": 0.42214709520339966, "step": 1384 }, { "epoch": 0.2843358653253952, "grad_norm": 1.4057649773599377, "kd_ratio": 0.5, "learning_rate": 1.6798012218435428e-05, "loss": 1.3663105964660645, "loss/kd": 2.291163921356201, "loss/lm": 0.44145721197128296, "step": 1385 }, { "epoch": 0.28454116197905976, "grad_norm": 1.5800230181167225, "kd_ratio": 0.5, "learning_rate": 1.6793133423451896e-05, "loss": 1.19535493850708, "loss/kd": 1.9414560794830322, "loss/lm": 0.4492539167404175, "step": 1386 }, { "epoch": 0.2847464586327243, "grad_norm": 1.9236276260571505, "kd_ratio": 0.5, "learning_rate": 1.6788251624121335e-05, "loss": 1.0610209703445435, "loss/kd": 1.6348165273666382, "loss/lm": 0.48722532391548157, "step": 1387 }, { "epoch": 0.28495175528638883, "grad_norm": 1.4984694696284457, "kd_ratio": 0.5, "learning_rate": 1.678336682260278e-05, "loss": 1.1843186616897583, "loss/kd": 1.9696332216262817, "loss/lm": 0.399004191160202, "step": 1388 }, { "epoch": 0.28515705194005336, "grad_norm": 1.895927117445641, "kd_ratio": 0.5, "learning_rate": 1.67784790210566e-05, "loss": 1.2200393676757812, "loss/kd": 2.012791156768799, "loss/lm": 0.42728760838508606, "step": 1389 }, { "epoch": 0.2853623485937179, "grad_norm": 1.3701180388228065, "kd_ratio": 0.5, "learning_rate": 1.6773588221644475e-05, "loss": 2.084820508956909, "loss/kd": 3.8394720554351807, "loss/lm": 0.33016887307167053, "step": 1390 }, { "epoch": 0.2855676452473825, "grad_norm": 1.9240394494930175, "kd_ratio": 0.5, "learning_rate": 1.6768694426529432e-05, "loss": 1.2134572267532349, "loss/kd": 1.919690489768982, "loss/lm": 0.507223904132843, "step": 1391 }, { "epoch": 0.285772941901047, "grad_norm": 1.7559205307018289, "kd_ratio": 0.5, "learning_rate": 1.676379763787581e-05, "loss": 1.0742862224578857, "loss/kd": 1.6659629344940186, "loss/lm": 0.4826095402240753, "step": 1392 }, { "epoch": 0.28597823855471155, "grad_norm": 1.1458342717474075, "kd_ratio": 0.5, "learning_rate": 1.6758897857849268e-05, "loss": 1.3089203834533691, "loss/kd": 2.21234393119812, "loss/lm": 0.4054969251155853, "step": 1393 }, { "epoch": 0.2861835352083761, "grad_norm": 1.4896837537522423, "kd_ratio": 0.5, "learning_rate": 1.67539950886168e-05, "loss": 1.3228150606155396, "loss/kd": 2.2469286918640137, "loss/lm": 0.3987014591693878, "step": 1394 }, { "epoch": 0.28638883186204067, "grad_norm": 1.7361721415135591, "kd_ratio": 0.5, "learning_rate": 1.6749089332346714e-05, "loss": 1.1277979612350464, "loss/kd": 1.8983122110366821, "loss/lm": 0.35728368163108826, "step": 1395 }, { "epoch": 0.2865941285157052, "grad_norm": 1.8277345617838234, "kd_ratio": 0.5, "learning_rate": 1.674418059120864e-05, "loss": 1.119110107421875, "loss/kd": 1.8682975769042969, "loss/lm": 0.3699225187301636, "step": 1396 }, { "epoch": 0.28679942516936974, "grad_norm": 1.1476317383726418, "kd_ratio": 0.5, "learning_rate": 1.6739268867373532e-05, "loss": 1.2335126399993896, "loss/kd": 1.9801921844482422, "loss/lm": 0.4868330657482147, "step": 1397 }, { "epoch": 0.28700472182303427, "grad_norm": 1.688198143319544, "kd_ratio": 0.5, "learning_rate": 1.673435416301366e-05, "loss": 1.0098767280578613, "loss/kd": 1.7208893299102783, "loss/lm": 0.2988640367984772, "step": 1398 }, { "epoch": 0.2872100184766988, "grad_norm": 1.1822904315230336, "kd_ratio": 0.5, "learning_rate": 1.672943648030261e-05, "loss": 1.1576942205429077, "loss/kd": 1.919866919517517, "loss/lm": 0.3955215513706207, "step": 1399 }, { "epoch": 0.2874153151303634, "grad_norm": 1.2408023483386659, "kd_ratio": 0.5, "learning_rate": 1.672451582141529e-05, "loss": 1.3545516729354858, "loss/kd": 2.2823598384857178, "loss/lm": 0.4267435669898987, "step": 1400 }, { "epoch": 0.2876206117840279, "grad_norm": 1.3792329998891348, "kd_ratio": 0.5, "learning_rate": 1.671959218852792e-05, "loss": 0.9953914880752563, "loss/kd": 1.5429140329360962, "loss/lm": 0.44786888360977173, "step": 1401 }, { "epoch": 0.28782590843769246, "grad_norm": 1.2652502484159174, "kd_ratio": 0.5, "learning_rate": 1.6714665583818047e-05, "loss": 1.421128511428833, "loss/kd": 2.3548202514648438, "loss/lm": 0.4874367117881775, "step": 1402 }, { "epoch": 0.288031205091357, "grad_norm": 1.5322316610993727, "kd_ratio": 0.5, "learning_rate": 1.6709736009464504e-05, "loss": 1.1718136072158813, "loss/kd": 1.9503130912780762, "loss/lm": 0.39331403374671936, "step": 1403 }, { "epoch": 0.2882365017450216, "grad_norm": 1.4636383035141272, "kd_ratio": 0.5, "learning_rate": 1.6704803467647478e-05, "loss": 1.153476595878601, "loss/kd": 1.8940165042877197, "loss/lm": 0.4129366874694824, "step": 1404 }, { "epoch": 0.2884417983986861, "grad_norm": 1.3192021131179268, "kd_ratio": 0.5, "learning_rate": 1.6699867960548426e-05, "loss": 1.0184557437896729, "loss/kd": 1.6716400384902954, "loss/lm": 0.36527156829833984, "step": 1405 }, { "epoch": 0.28864709505235064, "grad_norm": 1.2763951601051402, "kd_ratio": 0.5, "learning_rate": 1.6694929490350152e-05, "loss": 0.9086412787437439, "loss/kd": 1.3576184511184692, "loss/lm": 0.45966407656669617, "step": 1406 }, { "epoch": 0.2888523917060152, "grad_norm": 2.025893980506498, "kd_ratio": 0.5, "learning_rate": 1.668998805923675e-05, "loss": 1.1344749927520752, "loss/kd": 1.8717843294143677, "loss/lm": 0.39716577529907227, "step": 1407 }, { "epoch": 0.2890576883596797, "grad_norm": 1.439649468486444, "kd_ratio": 0.5, "learning_rate": 1.6685043669393622e-05, "loss": 1.8174493312835693, "loss/kd": 3.264458417892456, "loss/lm": 0.3704403042793274, "step": 1408 }, { "epoch": 0.2892629850133443, "grad_norm": 1.1653574082947031, "kd_ratio": 0.5, "learning_rate": 1.66800963230075e-05, "loss": 1.200324296951294, "loss/kd": 1.9393541812896729, "loss/lm": 0.4612944424152374, "step": 1409 }, { "epoch": 0.28946828166700883, "grad_norm": 1.695314501780985, "kd_ratio": 0.5, "learning_rate": 1.667514602226639e-05, "loss": 1.299674153327942, "loss/kd": 2.079779863357544, "loss/lm": 0.5195683836936951, "step": 1410 }, { "epoch": 0.28967357832067336, "grad_norm": 1.2132699354070842, "kd_ratio": 0.5, "learning_rate": 1.6670192769359643e-05, "loss": 1.1151037216186523, "loss/kd": 1.7641470432281494, "loss/lm": 0.46606045961380005, "step": 1411 }, { "epoch": 0.2898788749743379, "grad_norm": 1.346440356280193, "kd_ratio": 0.5, "learning_rate": 1.6665236566477884e-05, "loss": 1.1358627080917358, "loss/kd": 1.832531213760376, "loss/lm": 0.43919411301612854, "step": 1412 }, { "epoch": 0.2900841716280025, "grad_norm": 1.4545446062644873, "kd_ratio": 0.5, "learning_rate": 1.666027741581306e-05, "loss": 1.2374107837677002, "loss/kd": 2.1031365394592285, "loss/lm": 0.37168508768081665, "step": 1413 }, { "epoch": 0.290289468281667, "grad_norm": 1.9231965028102875, "kd_ratio": 0.5, "learning_rate": 1.6655315319558413e-05, "loss": 1.2625643014907837, "loss/kd": 2.0729687213897705, "loss/lm": 0.45215994119644165, "step": 1414 }, { "epoch": 0.29049476493533155, "grad_norm": 1.9337174142401738, "kd_ratio": 0.5, "learning_rate": 1.6650350279908497e-05, "loss": 1.1546434164047241, "loss/kd": 1.919791579246521, "loss/lm": 0.3894953429698944, "step": 1415 }, { "epoch": 0.2907000615889961, "grad_norm": 1.813372745253655, "kd_ratio": 0.5, "learning_rate": 1.6645382299059154e-05, "loss": 1.173239827156067, "loss/kd": 1.9945884943008423, "loss/lm": 0.3518911600112915, "step": 1416 }, { "epoch": 0.29090535824266067, "grad_norm": 1.4960405031483288, "kd_ratio": 0.5, "learning_rate": 1.664041137920754e-05, "loss": 1.2607998847961426, "loss/kd": 2.0767626762390137, "loss/lm": 0.44483721256256104, "step": 1417 }, { "epoch": 0.2911106548963252, "grad_norm": 2.779394022424927, "kd_ratio": 0.5, "learning_rate": 1.6635437522552106e-05, "loss": 1.2146037817001343, "loss/kd": 2.0418131351470947, "loss/lm": 0.3873944580554962, "step": 1418 }, { "epoch": 0.29131595154998974, "grad_norm": 2.7897271064715845, "kd_ratio": 0.5, "learning_rate": 1.6630460731292597e-05, "loss": 1.2106742858886719, "loss/kd": 1.9887303113937378, "loss/lm": 0.43261826038360596, "step": 1419 }, { "epoch": 0.29152124820365427, "grad_norm": 1.252294032325739, "kd_ratio": 0.5, "learning_rate": 1.6625481007630066e-05, "loss": 1.0732405185699463, "loss/kd": 1.6753326654434204, "loss/lm": 0.4711483418941498, "step": 1420 }, { "epoch": 0.2917265448573188, "grad_norm": 1.1790179986907774, "kd_ratio": 0.5, "learning_rate": 1.6620498353766853e-05, "loss": 1.0923210382461548, "loss/kd": 1.7326860427856445, "loss/lm": 0.4519561231136322, "step": 1421 }, { "epoch": 0.2919318415109834, "grad_norm": 1.3242917199808137, "kd_ratio": 0.5, "learning_rate": 1.66155127719066e-05, "loss": 1.1359832286834717, "loss/kd": 1.8827662467956543, "loss/lm": 0.3892003297805786, "step": 1422 }, { "epoch": 0.2921371381646479, "grad_norm": 1.2883648361905924, "kd_ratio": 0.5, "learning_rate": 1.661052426425424e-05, "loss": 1.1239805221557617, "loss/kd": 1.8660290241241455, "loss/lm": 0.38193202018737793, "step": 1423 }, { "epoch": 0.29234243481831246, "grad_norm": 1.1109095633393018, "kd_ratio": 0.5, "learning_rate": 1.660553283301601e-05, "loss": 1.397840142250061, "loss/kd": 2.4019768238067627, "loss/lm": 0.39370355010032654, "step": 1424 }, { "epoch": 0.292547731471977, "grad_norm": 1.5874096707418568, "kd_ratio": 0.5, "learning_rate": 1.660053848039942e-05, "loss": 1.4057680368423462, "loss/kd": 2.4111733436584473, "loss/lm": 0.4003627598285675, "step": 1425 }, { "epoch": 0.2927530281256416, "grad_norm": 1.7611694979890253, "kd_ratio": 0.5, "learning_rate": 1.659554120861329e-05, "loss": 1.2178387641906738, "loss/kd": 2.0130646228790283, "loss/lm": 0.4226127862930298, "step": 1426 }, { "epoch": 0.2929583247793061, "grad_norm": 1.3805174359471413, "kd_ratio": 0.5, "learning_rate": 1.6590541019867722e-05, "loss": 1.0355087518692017, "loss/kd": 1.6833107471466064, "loss/lm": 0.3877067267894745, "step": 1427 }, { "epoch": 0.29316362143297064, "grad_norm": 1.3980681962688768, "kd_ratio": 0.5, "learning_rate": 1.658553791637412e-05, "loss": 1.2329994440078735, "loss/kd": 2.0852303504943848, "loss/lm": 0.38076847791671753, "step": 1428 }, { "epoch": 0.2933689180866352, "grad_norm": 2.6325914043680485, "kd_ratio": 0.5, "learning_rate": 1.658053190034516e-05, "loss": 1.1178158521652222, "loss/kd": 1.8464311361312866, "loss/lm": 0.3892006278038025, "step": 1429 }, { "epoch": 0.2935742147402997, "grad_norm": 1.6090183445105093, "kd_ratio": 0.5, "learning_rate": 1.6575522973994815e-05, "loss": 1.1184332370758057, "loss/kd": 1.8098288774490356, "loss/lm": 0.4270375967025757, "step": 1430 }, { "epoch": 0.2937795113939643, "grad_norm": 1.3258705894646567, "kd_ratio": 0.5, "learning_rate": 1.6570511139538348e-05, "loss": 1.2093274593353271, "loss/kd": 1.9749586582183838, "loss/lm": 0.4436962604522705, "step": 1431 }, { "epoch": 0.29398480804762883, "grad_norm": 1.5305765092342303, "kd_ratio": 0.5, "learning_rate": 1.65654963991923e-05, "loss": 1.061826467514038, "loss/kd": 1.7265642881393433, "loss/lm": 0.3970886468887329, "step": 1432 }, { "epoch": 0.29419010470129336, "grad_norm": 1.8020631153205455, "kd_ratio": 0.5, "learning_rate": 1.6560478755174506e-05, "loss": 1.0142136812210083, "loss/kd": 1.6502504348754883, "loss/lm": 0.37817686796188354, "step": 1433 }, { "epoch": 0.2943954013549579, "grad_norm": 1.6713567267638485, "kd_ratio": 0.5, "learning_rate": 1.655545820970408e-05, "loss": 1.0840096473693848, "loss/kd": 1.7728333473205566, "loss/lm": 0.3951859772205353, "step": 1434 }, { "epoch": 0.2946006980086225, "grad_norm": 1.0283473878158345, "kd_ratio": 0.5, "learning_rate": 1.655043476500142e-05, "loss": 1.0615155696868896, "loss/kd": 1.6225731372833252, "loss/lm": 0.5004580020904541, "step": 1435 }, { "epoch": 0.294805994662287, "grad_norm": 1.2846473764781046, "kd_ratio": 0.5, "learning_rate": 1.6545408423288203e-05, "loss": 1.2112572193145752, "loss/kd": 1.9576905965805054, "loss/lm": 0.46482372283935547, "step": 1436 }, { "epoch": 0.29501129131595155, "grad_norm": 1.5028189880036904, "kd_ratio": 0.5, "learning_rate": 1.6540379186787395e-05, "loss": 0.9627628326416016, "loss/kd": 1.479691505432129, "loss/lm": 0.4458341598510742, "step": 1437 }, { "epoch": 0.2952165879696161, "grad_norm": 1.7134054944607127, "kd_ratio": 0.5, "learning_rate": 1.6535347057723235e-05, "loss": 1.0451109409332275, "loss/kd": 1.7621567249298096, "loss/lm": 0.3280651569366455, "step": 1438 }, { "epoch": 0.2954218846232806, "grad_norm": 1.0246505756236528, "kd_ratio": 0.5, "learning_rate": 1.6530312038321247e-05, "loss": 1.1910526752471924, "loss/kd": 1.8982629776000977, "loss/lm": 0.4838424026966095, "step": 1439 }, { "epoch": 0.2956271812769452, "grad_norm": 1.4386560398220531, "kd_ratio": 0.5, "learning_rate": 1.6525274130808228e-05, "loss": 1.2894573211669922, "loss/kd": 2.2339282035827637, "loss/lm": 0.3449864387512207, "step": 1440 }, { "epoch": 0.29583247793060974, "grad_norm": 1.5193587977257135, "kd_ratio": 0.5, "learning_rate": 1.6520233337412253e-05, "loss": 1.4138047695159912, "loss/kd": 2.3786370754241943, "loss/lm": 0.44897258281707764, "step": 1441 }, { "epoch": 0.29603777458427427, "grad_norm": 1.1896459395380745, "kd_ratio": 0.5, "learning_rate": 1.651518966036268e-05, "loss": 1.1323745250701904, "loss/kd": 1.9065282344818115, "loss/lm": 0.3582208454608917, "step": 1442 }, { "epoch": 0.2962430712379388, "grad_norm": 1.038350569778525, "kd_ratio": 0.5, "learning_rate": 1.6510143101890136e-05, "loss": 1.029556393623352, "loss/kd": 1.6894176006317139, "loss/lm": 0.369695246219635, "step": 1443 }, { "epoch": 0.2964483678916034, "grad_norm": 1.3516497262757834, "kd_ratio": 0.5, "learning_rate": 1.650509366422652e-05, "loss": 0.8989176154136658, "loss/kd": 1.4455182552337646, "loss/lm": 0.3523169457912445, "step": 1444 }, { "epoch": 0.2966536645452679, "grad_norm": 1.4139467079715105, "kd_ratio": 0.5, "learning_rate": 1.6500041349605012e-05, "loss": 1.0818294286727905, "loss/kd": 1.7763344049453735, "loss/lm": 0.38732439279556274, "step": 1445 }, { "epoch": 0.29685896119893246, "grad_norm": 1.2376340670322035, "kd_ratio": 0.5, "learning_rate": 1.649498616026006e-05, "loss": 1.1918325424194336, "loss/kd": 1.9597713947296143, "loss/lm": 0.42389369010925293, "step": 1446 }, { "epoch": 0.297064257852597, "grad_norm": 1.1165677917217922, "kd_ratio": 0.5, "learning_rate": 1.6489928098427383e-05, "loss": 1.1453981399536133, "loss/kd": 1.8887401819229126, "loss/lm": 0.4020562171936035, "step": 1447 }, { "epoch": 0.2972695545062615, "grad_norm": 1.0722354351375092, "kd_ratio": 0.5, "learning_rate": 1.648486716634397e-05, "loss": 0.9842793345451355, "loss/kd": 1.5279006958007812, "loss/lm": 0.44065794348716736, "step": 1448 }, { "epoch": 0.2974748511599261, "grad_norm": 1.5022725921486773, "kd_ratio": 0.5, "learning_rate": 1.647980336624808e-05, "loss": 1.0660816431045532, "loss/kd": 1.7317230701446533, "loss/lm": 0.4004402160644531, "step": 1449 }, { "epoch": 0.29768014781359065, "grad_norm": 1.712404328619881, "kd_ratio": 0.5, "learning_rate": 1.6474736700379247e-05, "loss": 1.1247748136520386, "loss/kd": 1.8086143732070923, "loss/lm": 0.44093528389930725, "step": 1450 }, { "epoch": 0.2978854444672552, "grad_norm": 1.8272304403263855, "kd_ratio": 0.5, "learning_rate": 1.6469667170978258e-05, "loss": 1.2342932224273682, "loss/kd": 1.978872537612915, "loss/lm": 0.48971396684646606, "step": 1451 }, { "epoch": 0.2980907411209197, "grad_norm": 1.7044227679802448, "kd_ratio": 0.5, "learning_rate": 1.6464594780287183e-05, "loss": 1.3036715984344482, "loss/kd": 2.152866840362549, "loss/lm": 0.45447638630867004, "step": 1452 }, { "epoch": 0.2982960377745843, "grad_norm": 1.3872582581184691, "kd_ratio": 0.5, "learning_rate": 1.6459519530549345e-05, "loss": 1.265559434890747, "loss/kd": 2.1198017597198486, "loss/lm": 0.41131705045700073, "step": 1453 }, { "epoch": 0.29850133442824883, "grad_norm": 1.2184922591410943, "kd_ratio": 0.5, "learning_rate": 1.6454441424009333e-05, "loss": 1.0279905796051025, "loss/kd": 1.7424529790878296, "loss/lm": 0.31352806091308594, "step": 1454 }, { "epoch": 0.29870663108191337, "grad_norm": 1.9388683285370554, "kd_ratio": 0.5, "learning_rate": 1.6449360462913005e-05, "loss": 1.2074919939041138, "loss/kd": 1.9168474674224854, "loss/lm": 0.4981364905834198, "step": 1455 }, { "epoch": 0.2989119277355779, "grad_norm": 1.4423011478980081, "kd_ratio": 0.5, "learning_rate": 1.6444276649507485e-05, "loss": 1.0583566427230835, "loss/kd": 1.7793149948120117, "loss/lm": 0.3373982608318329, "step": 1456 }, { "epoch": 0.29911722438924243, "grad_norm": 1.011193039009565, "kd_ratio": 0.5, "learning_rate": 1.643918998604114e-05, "loss": 1.174306035041809, "loss/kd": 1.953312873840332, "loss/lm": 0.3952992558479309, "step": 1457 }, { "epoch": 0.299322521042907, "grad_norm": 1.0664188592767414, "kd_ratio": 0.5, "learning_rate": 1.6434100474763623e-05, "loss": 0.8662461638450623, "loss/kd": 1.4267868995666504, "loss/lm": 0.30570539832115173, "step": 1458 }, { "epoch": 0.29952781769657155, "grad_norm": 1.2052621187978747, "kd_ratio": 0.5, "learning_rate": 1.642900811792582e-05, "loss": 1.0487905740737915, "loss/kd": 1.7017040252685547, "loss/lm": 0.39587709307670593, "step": 1459 }, { "epoch": 0.2997331143502361, "grad_norm": 1.220364584858195, "kd_ratio": 0.5, "learning_rate": 1.6423912917779897e-05, "loss": 1.0878318548202515, "loss/kd": 1.7495019435882568, "loss/lm": 0.42616167664527893, "step": 1460 }, { "epoch": 0.2999384110039006, "grad_norm": 0.9761566959122666, "kd_ratio": 0.5, "learning_rate": 1.641881487657927e-05, "loss": 1.260151982307434, "loss/kd": 2.0670015811920166, "loss/lm": 0.4533022940158844, "step": 1461 }, { "epoch": 0.3001437076575652, "grad_norm": 1.6188644018245104, "kd_ratio": 0.5, "learning_rate": 1.6413713996578604e-05, "loss": 1.1579201221466064, "loss/kd": 1.8719420433044434, "loss/lm": 0.4438982605934143, "step": 1462 }, { "epoch": 0.30034900431122974, "grad_norm": 1.4264338324375065, "kd_ratio": 0.5, "learning_rate": 1.640861028003383e-05, "loss": 1.0908249616622925, "loss/kd": 1.823526382446289, "loss/lm": 0.3581235408782959, "step": 1463 }, { "epoch": 0.30055430096489427, "grad_norm": 1.303505604379355, "kd_ratio": 0.5, "learning_rate": 1.6403503729202134e-05, "loss": 1.1990643739700317, "loss/kd": 2.043442487716675, "loss/lm": 0.35468626022338867, "step": 1464 }, { "epoch": 0.3007595976185588, "grad_norm": 1.0825814451184668, "kd_ratio": 0.5, "learning_rate": 1.639839434634194e-05, "loss": 1.2019253969192505, "loss/kd": 1.9939520359039307, "loss/lm": 0.40989866852760315, "step": 1465 }, { "epoch": 0.30096489427222334, "grad_norm": 1.3338921893494307, "kd_ratio": 0.5, "learning_rate": 1.6393282133712945e-05, "loss": 1.1406019926071167, "loss/kd": 1.8917710781097412, "loss/lm": 0.38943296670913696, "step": 1466 }, { "epoch": 0.3011701909258879, "grad_norm": 1.1364459241340052, "kd_ratio": 0.5, "learning_rate": 1.6388167093576083e-05, "loss": 1.2318230867385864, "loss/kd": 2.020172595977783, "loss/lm": 0.4434736669063568, "step": 1467 }, { "epoch": 0.30137548757955246, "grad_norm": 1.0549393880136153, "kd_ratio": 0.5, "learning_rate": 1.6383049228193545e-05, "loss": 1.1345592737197876, "loss/kd": 1.8640705347061157, "loss/lm": 0.40504810214042664, "step": 1468 }, { "epoch": 0.301580784233217, "grad_norm": 1.518896328577393, "kd_ratio": 0.5, "learning_rate": 1.6377928539828772e-05, "loss": 0.9375718832015991, "loss/kd": 1.4708749055862427, "loss/lm": 0.4042688012123108, "step": 1469 }, { "epoch": 0.3017860808868815, "grad_norm": 1.740261176596262, "kd_ratio": 0.5, "learning_rate": 1.637280503074645e-05, "loss": 1.0639066696166992, "loss/kd": 1.7425673007965088, "loss/lm": 0.38524603843688965, "step": 1470 }, { "epoch": 0.3019913775405461, "grad_norm": 1.5378421005324943, "kd_ratio": 0.5, "learning_rate": 1.6367678703212515e-05, "loss": 1.127672791481018, "loss/kd": 1.8261860609054565, "loss/lm": 0.4291594922542572, "step": 1471 }, { "epoch": 0.30219667419421065, "grad_norm": 1.01908214542176, "kd_ratio": 0.5, "learning_rate": 1.6362549559494144e-05, "loss": 1.1035387516021729, "loss/kd": 1.8176538944244385, "loss/lm": 0.3894236385822296, "step": 1472 }, { "epoch": 0.3024019708478752, "grad_norm": 1.3264930630268386, "kd_ratio": 0.5, "learning_rate": 1.6357417601859772e-05, "loss": 1.1307778358459473, "loss/kd": 1.8929306268692017, "loss/lm": 0.36862507462501526, "step": 1473 }, { "epoch": 0.3026072675015397, "grad_norm": 1.687108115471099, "kd_ratio": 0.5, "learning_rate": 1.635228283257907e-05, "loss": 1.1235660314559937, "loss/kd": 1.8822649717330933, "loss/lm": 0.36486712098121643, "step": 1474 }, { "epoch": 0.30281256415520424, "grad_norm": 1.4575574035109282, "kd_ratio": 0.5, "learning_rate": 1.6347145253922942e-05, "loss": 1.1695361137390137, "loss/kd": 1.8394283056259155, "loss/lm": 0.49964380264282227, "step": 1475 }, { "epoch": 0.30301786080886883, "grad_norm": 1.1846508423240858, "kd_ratio": 0.5, "learning_rate": 1.634200486816355e-05, "loss": 0.9769582748413086, "loss/kd": 1.557927131652832, "loss/lm": 0.39598947763442993, "step": 1476 }, { "epoch": 0.30322315746253337, "grad_norm": 1.5978220973769408, "kd_ratio": 0.5, "learning_rate": 1.6336861677574305e-05, "loss": 1.0903539657592773, "loss/kd": 1.8422679901123047, "loss/lm": 0.3384398818016052, "step": 1477 }, { "epoch": 0.3034284541161979, "grad_norm": 1.5557504408275526, "kd_ratio": 0.5, "learning_rate": 1.6331715684429834e-05, "loss": 1.0960328578948975, "loss/kd": 1.7747132778167725, "loss/lm": 0.4173524081707001, "step": 1478 }, { "epoch": 0.30363375076986243, "grad_norm": 1.8525332397908847, "kd_ratio": 0.5, "learning_rate": 1.632656689100602e-05, "loss": 1.1398017406463623, "loss/kd": 1.9423459768295288, "loss/lm": 0.3372575044631958, "step": 1479 }, { "epoch": 0.303839047423527, "grad_norm": 2.059993084635239, "kd_ratio": 0.5, "learning_rate": 1.632141529957998e-05, "loss": 1.160776138305664, "loss/kd": 1.870092511177063, "loss/lm": 0.45145973563194275, "step": 1480 }, { "epoch": 0.30404434407719155, "grad_norm": 1.9202007481859609, "kd_ratio": 0.5, "learning_rate": 1.6316260912430066e-05, "loss": 1.0416394472122192, "loss/kd": 1.6335713863372803, "loss/lm": 0.44970741868019104, "step": 1481 }, { "epoch": 0.3042496407308561, "grad_norm": 1.059905460052168, "kd_ratio": 0.5, "learning_rate": 1.6311103731835872e-05, "loss": 1.0560717582702637, "loss/kd": 1.6106934547424316, "loss/lm": 0.5014501810073853, "step": 1482 }, { "epoch": 0.3044549373845206, "grad_norm": 1.7112383842979992, "kd_ratio": 0.5, "learning_rate": 1.6305943760078226e-05, "loss": 1.0442330837249756, "loss/kd": 1.7033017873764038, "loss/lm": 0.3851644694805145, "step": 1483 }, { "epoch": 0.30466023403818515, "grad_norm": 2.3138957924864436, "kd_ratio": 0.5, "learning_rate": 1.6300780999439186e-05, "loss": 1.2339153289794922, "loss/kd": 2.0253100395202637, "loss/lm": 0.44252049922943115, "step": 1484 }, { "epoch": 0.30486553069184974, "grad_norm": 1.5165460204838732, "kd_ratio": 0.5, "learning_rate": 1.6295615452202052e-05, "loss": 1.1864093542099, "loss/kd": 1.9821438789367676, "loss/lm": 0.3906749188899994, "step": 1485 }, { "epoch": 0.3050708273455143, "grad_norm": 1.4223828147668687, "kd_ratio": 0.5, "learning_rate": 1.6290447120651344e-05, "loss": 0.9561863541603088, "loss/kd": 1.5721009969711304, "loss/lm": 0.3402717113494873, "step": 1486 }, { "epoch": 0.3052761239991788, "grad_norm": 2.0145025269133208, "kd_ratio": 0.5, "learning_rate": 1.628527600707283e-05, "loss": 1.1214534044265747, "loss/kd": 1.9210152626037598, "loss/lm": 0.3218914568424225, "step": 1487 }, { "epoch": 0.30548142065284334, "grad_norm": 2.015593970705144, "kd_ratio": 0.5, "learning_rate": 1.628010211375348e-05, "loss": 1.016815423965454, "loss/kd": 1.5931509733200073, "loss/lm": 0.4404798746109009, "step": 1488 }, { "epoch": 0.3056867173065079, "grad_norm": 1.349459603716158, "kd_ratio": 0.5, "learning_rate": 1.6274925442981535e-05, "loss": 1.0380398035049438, "loss/kd": 1.7821508646011353, "loss/lm": 0.2939288318157196, "step": 1489 }, { "epoch": 0.30589201396017246, "grad_norm": 2.0555231224847192, "kd_ratio": 0.5, "learning_rate": 1.6269745997046427e-05, "loss": 1.071655035018921, "loss/kd": 1.7339938879013062, "loss/lm": 0.4093162417411804, "step": 1490 }, { "epoch": 0.306097310613837, "grad_norm": 2.3191742051130113, "kd_ratio": 0.5, "learning_rate": 1.6264563778238834e-05, "loss": 1.0943396091461182, "loss/kd": 1.731292486190796, "loss/lm": 0.4573867619037628, "step": 1491 }, { "epoch": 0.3063026072675015, "grad_norm": 2.0041422615413444, "kd_ratio": 0.5, "learning_rate": 1.6259378788850656e-05, "loss": 1.3426765203475952, "loss/kd": 2.269348621368408, "loss/lm": 0.416004478931427, "step": 1492 }, { "epoch": 0.30650790392116606, "grad_norm": 1.343052151816719, "kd_ratio": 0.5, "learning_rate": 1.625419103117502e-05, "loss": 1.0227867364883423, "loss/kd": 1.670136570930481, "loss/lm": 0.37543684244155884, "step": 1493 }, { "epoch": 0.30671320057483065, "grad_norm": 1.592104215051678, "kd_ratio": 0.5, "learning_rate": 1.624900050750627e-05, "loss": 1.0100923776626587, "loss/kd": 1.6400240659713745, "loss/lm": 0.3801606297492981, "step": 1494 }, { "epoch": 0.3069184972284952, "grad_norm": 2.0641211563500144, "kd_ratio": 0.5, "learning_rate": 1.6243807220139988e-05, "loss": 1.4443027973175049, "loss/kd": 2.4411568641662598, "loss/lm": 0.4474486708641052, "step": 1495 }, { "epoch": 0.3071237938821597, "grad_norm": 2.842517986464559, "kd_ratio": 0.5, "learning_rate": 1.6238611171372964e-05, "loss": 0.99894118309021, "loss/kd": 1.6158806085586548, "loss/lm": 0.3820017874240875, "step": 1496 }, { "epoch": 0.30732909053582425, "grad_norm": 1.867795746311515, "kd_ratio": 0.5, "learning_rate": 1.6233412363503216e-05, "loss": 0.9408147931098938, "loss/kd": 1.4631907939910889, "loss/lm": 0.41843879222869873, "step": 1497 }, { "epoch": 0.30753438718948883, "grad_norm": 1.665084309828793, "kd_ratio": 0.5, "learning_rate": 1.6228210798829978e-05, "loss": 1.0862631797790527, "loss/kd": 1.728602409362793, "loss/lm": 0.44392406940460205, "step": 1498 }, { "epoch": 0.30773968384315337, "grad_norm": 2.650020510093592, "kd_ratio": 0.5, "learning_rate": 1.6223006479653708e-05, "loss": 1.0675711631774902, "loss/kd": 1.7667152881622314, "loss/lm": 0.368427038192749, "step": 1499 }, { "epoch": 0.3079449804968179, "grad_norm": 1.5188058070767445, "kd_ratio": 0.5, "learning_rate": 1.6217799408276084e-05, "loss": 1.2604587078094482, "loss/kd": 2.1397125720977783, "loss/lm": 0.38120484352111816, "step": 1500 }, { "epoch": 0.30815027715048243, "grad_norm": 1.7003720890439495, "kd_ratio": 0.5, "learning_rate": 1.621258958699999e-05, "loss": 0.8819398283958435, "loss/kd": 1.42592191696167, "loss/lm": 0.3379577398300171, "step": 1501 }, { "epoch": 0.30835557380414697, "grad_norm": 2.253725226088527, "kd_ratio": 0.5, "learning_rate": 1.620737701812954e-05, "loss": 1.300389289855957, "loss/kd": 2.239201068878174, "loss/lm": 0.3615776002407074, "step": 1502 }, { "epoch": 0.30856087045781155, "grad_norm": 1.7420453639637992, "kd_ratio": 0.5, "learning_rate": 1.6202161703970057e-05, "loss": 1.1773778200149536, "loss/kd": 1.9342467784881592, "loss/lm": 0.4205089211463928, "step": 1503 }, { "epoch": 0.3087661671114761, "grad_norm": 1.6688194927620268, "kd_ratio": 0.5, "learning_rate": 1.6196943646828072e-05, "loss": 1.1680169105529785, "loss/kd": 1.8840012550354004, "loss/lm": 0.45203250646591187, "step": 1504 }, { "epoch": 0.3089714637651406, "grad_norm": 1.8256202964582473, "kd_ratio": 0.5, "learning_rate": 1.619172284901134e-05, "loss": 1.226004958152771, "loss/kd": 1.9844825267791748, "loss/lm": 0.4675273001194, "step": 1505 }, { "epoch": 0.30917676041880515, "grad_norm": 1.5059743252108466, "kd_ratio": 0.5, "learning_rate": 1.6186499312828826e-05, "loss": 1.2925394773483276, "loss/kd": 2.261622667312622, "loss/lm": 0.323456346988678, "step": 1506 }, { "epoch": 0.30938205707246974, "grad_norm": 1.0433473237146211, "kd_ratio": 0.5, "learning_rate": 1.6181273040590696e-05, "loss": 1.103272795677185, "loss/kd": 1.7691123485565186, "loss/lm": 0.4374331533908844, "step": 1507 }, { "epoch": 0.3095873537261343, "grad_norm": 1.6495022358342657, "kd_ratio": 0.5, "learning_rate": 1.617604403460834e-05, "loss": 1.293345332145691, "loss/kd": 2.173435926437378, "loss/lm": 0.4132547378540039, "step": 1508 }, { "epoch": 0.3097926503797988, "grad_norm": 1.4231663955013663, "kd_ratio": 0.5, "learning_rate": 1.617081229719434e-05, "loss": 1.1782045364379883, "loss/kd": 1.9415316581726074, "loss/lm": 0.41487741470336914, "step": 1509 }, { "epoch": 0.30999794703346334, "grad_norm": 2.1378811359891334, "kd_ratio": 0.5, "learning_rate": 1.6165577830662508e-05, "loss": 1.2848820686340332, "loss/kd": 2.1378087997436523, "loss/lm": 0.43195536732673645, "step": 1510 }, { "epoch": 0.3102032436871279, "grad_norm": 2.0536822299211797, "kd_ratio": 0.5, "learning_rate": 1.616034063732785e-05, "loss": 1.4706661701202393, "loss/kd": 2.433283567428589, "loss/lm": 0.5080487132072449, "step": 1511 }, { "epoch": 0.31040854034079246, "grad_norm": 1.262022901178437, "kd_ratio": 0.5, "learning_rate": 1.615510071950657e-05, "loss": 1.306535005569458, "loss/kd": 2.1813433170318604, "loss/lm": 0.4317266345024109, "step": 1512 }, { "epoch": 0.310613836994457, "grad_norm": 1.8396704094586034, "kd_ratio": 0.5, "learning_rate": 1.6149858079516097e-05, "loss": 1.0424606800079346, "loss/kd": 1.7290256023406982, "loss/lm": 0.3558958172798157, "step": 1513 }, { "epoch": 0.3108191336481215, "grad_norm": 1.0875826226883527, "kd_ratio": 0.5, "learning_rate": 1.6144612719675046e-05, "loss": 1.0741775035858154, "loss/kd": 1.7505321502685547, "loss/lm": 0.3978227972984314, "step": 1514 }, { "epoch": 0.31102443030178606, "grad_norm": 2.42977626162665, "kd_ratio": 0.5, "learning_rate": 1.613936464230325e-05, "loss": 1.094460129737854, "loss/kd": 1.75968599319458, "loss/lm": 0.4292343258857727, "step": 1515 }, { "epoch": 0.31122972695545065, "grad_norm": 2.3903324165569853, "kd_ratio": 0.5, "learning_rate": 1.6134113849721725e-05, "loss": 1.558031678199768, "loss/kd": 2.6549315452575684, "loss/lm": 0.4611317217350006, "step": 1516 }, { "epoch": 0.3114350236091152, "grad_norm": 1.6512890822758706, "kd_ratio": 0.5, "learning_rate": 1.6128860344252707e-05, "loss": 1.3254003524780273, "loss/kd": 2.2308173179626465, "loss/lm": 0.4199833869934082, "step": 1517 }, { "epoch": 0.3116403202627797, "grad_norm": 1.5570356477633842, "kd_ratio": 0.5, "learning_rate": 1.612360412821962e-05, "loss": 1.157935380935669, "loss/kd": 1.8871718645095825, "loss/lm": 0.42869889736175537, "step": 1518 }, { "epoch": 0.31184561691644425, "grad_norm": 1.144233533842663, "kd_ratio": 0.5, "learning_rate": 1.6118345203947093e-05, "loss": 1.014717698097229, "loss/kd": 1.6311475038528442, "loss/lm": 0.3982878029346466, "step": 1519 }, { "epoch": 0.31205091357010883, "grad_norm": 1.6767885644761336, "kd_ratio": 0.5, "learning_rate": 1.611308357376095e-05, "loss": 1.0020345449447632, "loss/kd": 1.5765506029129028, "loss/lm": 0.42751842737197876, "step": 1520 }, { "epoch": 0.31225621022377337, "grad_norm": 1.1532210734573884, "kd_ratio": 0.5, "learning_rate": 1.6107819239988206e-05, "loss": 1.0079931020736694, "loss/kd": 1.644803762435913, "loss/lm": 0.3711824417114258, "step": 1521 }, { "epoch": 0.3124615068774379, "grad_norm": 1.974207172715277, "kd_ratio": 0.5, "learning_rate": 1.6102552204957083e-05, "loss": 1.128464698791504, "loss/kd": 1.7920358180999756, "loss/lm": 0.4648934602737427, "step": 1522 }, { "epoch": 0.31266680353110243, "grad_norm": 1.1086166139638445, "kd_ratio": 0.5, "learning_rate": 1.6097282470996997e-05, "loss": 0.9474606513977051, "loss/kd": 1.5159432888031006, "loss/lm": 0.37897807359695435, "step": 1523 }, { "epoch": 0.31287210018476697, "grad_norm": 1.9669633499432482, "kd_ratio": 0.5, "learning_rate": 1.609201004043854e-05, "loss": 1.0826330184936523, "loss/kd": 1.7781469821929932, "loss/lm": 0.38711896538734436, "step": 1524 }, { "epoch": 0.31307739683843155, "grad_norm": 1.4318308417490542, "kd_ratio": 0.5, "learning_rate": 1.6086734915613518e-05, "loss": 1.0625981092453003, "loss/kd": 1.778327465057373, "loss/lm": 0.3468688428401947, "step": 1525 }, { "epoch": 0.3132826934920961, "grad_norm": 1.3928901135419467, "kd_ratio": 0.5, "learning_rate": 1.6081457098854922e-05, "loss": 1.29863703250885, "loss/kd": 2.205071210861206, "loss/lm": 0.39220285415649414, "step": 1526 }, { "epoch": 0.3134879901457606, "grad_norm": 1.5116807485734896, "kd_ratio": 0.5, "learning_rate": 1.6076176592496926e-05, "loss": 1.2306164503097534, "loss/kd": 2.070821762084961, "loss/lm": 0.39041122794151306, "step": 1527 }, { "epoch": 0.31369328679942515, "grad_norm": 1.1158302019915114, "kd_ratio": 0.5, "learning_rate": 1.60708933988749e-05, "loss": 0.9938474893569946, "loss/kd": 1.5994819402694702, "loss/lm": 0.38821303844451904, "step": 1528 }, { "epoch": 0.31389858345308974, "grad_norm": 1.3852119782486232, "kd_ratio": 0.5, "learning_rate": 1.6065607520325404e-05, "loss": 1.1225807666778564, "loss/kd": 1.8365837335586548, "loss/lm": 0.40857774019241333, "step": 1529 }, { "epoch": 0.3141038801067543, "grad_norm": 1.3587405685823974, "kd_ratio": 0.5, "learning_rate": 1.606031895918618e-05, "loss": 1.1525074243545532, "loss/kd": 1.9401216506958008, "loss/lm": 0.3648932874202728, "step": 1530 }, { "epoch": 0.3143091767604188, "grad_norm": 1.3503835472277008, "kd_ratio": 0.5, "learning_rate": 1.605502771779616e-05, "loss": 1.0483931303024292, "loss/kd": 1.6806410551071167, "loss/lm": 0.4161452353000641, "step": 1531 }, { "epoch": 0.31451447341408334, "grad_norm": 1.5931797986410299, "kd_ratio": 0.5, "learning_rate": 1.6049733798495462e-05, "loss": 1.102854609489441, "loss/kd": 1.8099433183670044, "loss/lm": 0.3957659602165222, "step": 1532 }, { "epoch": 0.3147197700677479, "grad_norm": 1.7970529488727272, "kd_ratio": 0.5, "learning_rate": 1.604443720362539e-05, "loss": 1.2194963693618774, "loss/kd": 1.9821993112564087, "loss/lm": 0.4567933976650238, "step": 1533 }, { "epoch": 0.31492506672141246, "grad_norm": 1.4323216480053929, "kd_ratio": 0.5, "learning_rate": 1.603913793552842e-05, "loss": 1.1901754140853882, "loss/kd": 1.9629205465316772, "loss/lm": 0.4174303114414215, "step": 1534 }, { "epoch": 0.315130363375077, "grad_norm": 1.0122968177185263, "kd_ratio": 0.5, "learning_rate": 1.603383599654823e-05, "loss": 0.9977090954780579, "loss/kd": 1.5157891511917114, "loss/lm": 0.4796290695667267, "step": 1535 }, { "epoch": 0.3153356600287415, "grad_norm": 1.3710940114870511, "kd_ratio": 0.5, "learning_rate": 1.6028531389029658e-05, "loss": 0.9195882081985474, "loss/kd": 1.47151780128479, "loss/lm": 0.3676586151123047, "step": 1536 }, { "epoch": 0.31554095668240606, "grad_norm": 1.256072662279851, "kd_ratio": 0.5, "learning_rate": 1.6023224115318736e-05, "loss": 2.0327255725860596, "loss/kd": 3.7687153816223145, "loss/lm": 0.29673582315444946, "step": 1537 }, { "epoch": 0.31574625333607065, "grad_norm": 1.2898809591115141, "kd_ratio": 0.5, "learning_rate": 1.601791417776267e-05, "loss": 0.9375568628311157, "loss/kd": 1.5029386281967163, "loss/lm": 0.37217506766319275, "step": 1538 }, { "epoch": 0.3159515499897352, "grad_norm": 1.3058005854867696, "kd_ratio": 0.5, "learning_rate": 1.601260157870985e-05, "loss": 1.0707412958145142, "loss/kd": 1.761888861656189, "loss/lm": 0.37959370017051697, "step": 1539 }, { "epoch": 0.3161568466433997, "grad_norm": 1.325032366318593, "kd_ratio": 0.5, "learning_rate": 1.6007286320509834e-05, "loss": 1.059206247329712, "loss/kd": 1.7917505502700806, "loss/lm": 0.3266618549823761, "step": 1540 }, { "epoch": 0.31636214329706425, "grad_norm": 1.9919817284014403, "kd_ratio": 0.5, "learning_rate": 1.6001968405513357e-05, "loss": 1.1873235702514648, "loss/kd": 1.9975581169128418, "loss/lm": 0.3770889639854431, "step": 1541 }, { "epoch": 0.3165674399507288, "grad_norm": 1.6868873282184944, "kd_ratio": 0.5, "learning_rate": 1.599664783607234e-05, "loss": 0.9756805896759033, "loss/kd": 1.5416868925094604, "loss/lm": 0.4096742570400238, "step": 1542 }, { "epoch": 0.31677273660439337, "grad_norm": 1.189616066231635, "kd_ratio": 0.5, "learning_rate": 1.599132461453987e-05, "loss": 1.2315508127212524, "loss/kd": 2.053074836730957, "loss/lm": 0.4100266993045807, "step": 1543 }, { "epoch": 0.3169780332580579, "grad_norm": 1.2734359567322433, "kd_ratio": 0.5, "learning_rate": 1.5985998743270202e-05, "loss": 1.1540589332580566, "loss/kd": 1.7959390878677368, "loss/lm": 0.5121787190437317, "step": 1544 }, { "epoch": 0.31718332991172243, "grad_norm": 1.0599437265257425, "kd_ratio": 0.5, "learning_rate": 1.598067022461877e-05, "loss": 1.4184240102767944, "loss/kd": 2.4214348793029785, "loss/lm": 0.41541314125061035, "step": 1545 }, { "epoch": 0.31738862656538697, "grad_norm": 1.2873174836585133, "kd_ratio": 0.5, "learning_rate": 1.597533906094218e-05, "loss": 1.0083309412002563, "loss/kd": 1.5877705812454224, "loss/lm": 0.4288913905620575, "step": 1546 }, { "epoch": 0.31759392321905155, "grad_norm": 1.1620527836918322, "kd_ratio": 0.5, "learning_rate": 1.5970005254598204e-05, "loss": 1.0504748821258545, "loss/kd": 1.6858958005905151, "loss/lm": 0.415054053068161, "step": 1547 }, { "epoch": 0.3177992198727161, "grad_norm": 1.1879189898999372, "kd_ratio": 0.5, "learning_rate": 1.5964668807945777e-05, "loss": 1.234281301498413, "loss/kd": 2.0464088916778564, "loss/lm": 0.42215365171432495, "step": 1548 }, { "epoch": 0.3180045165263806, "grad_norm": 1.1032890521478091, "kd_ratio": 0.5, "learning_rate": 1.595932972334502e-05, "loss": 1.106476902961731, "loss/kd": 1.8369745016098022, "loss/lm": 0.3759792745113373, "step": 1549 }, { "epoch": 0.31820981318004515, "grad_norm": 0.9877241326461609, "kd_ratio": 0.5, "learning_rate": 1.59539880031572e-05, "loss": 1.2801018953323364, "loss/kd": 2.188981294631958, "loss/lm": 0.37122243642807007, "step": 1550 }, { "epoch": 0.3184151098337097, "grad_norm": 1.0092081505364219, "kd_ratio": 0.5, "learning_rate": 1.594864364974476e-05, "loss": 0.9239659905433655, "loss/kd": 1.4621508121490479, "loss/lm": 0.3857811391353607, "step": 1551 }, { "epoch": 0.3186204064873743, "grad_norm": 1.216476970423219, "kd_ratio": 0.5, "learning_rate": 1.594329666547131e-05, "loss": 0.9753957390785217, "loss/kd": 1.571586012840271, "loss/lm": 0.37920549511909485, "step": 1552 }, { "epoch": 0.3188257031410388, "grad_norm": 1.7320194366724957, "kd_ratio": 0.5, "learning_rate": 1.5937947052701615e-05, "loss": 0.944657564163208, "loss/kd": 1.5368469953536987, "loss/lm": 0.3524681031703949, "step": 1553 }, { "epoch": 0.31903099979470334, "grad_norm": 1.8619331183154035, "kd_ratio": 0.5, "learning_rate": 1.5932594813801613e-05, "loss": 1.2673170566558838, "loss/kd": 2.0984537601470947, "loss/lm": 0.4361802935600281, "step": 1554 }, { "epoch": 0.3192362964483679, "grad_norm": 1.0998419909778478, "kd_ratio": 0.5, "learning_rate": 1.592723995113839e-05, "loss": 1.8736778497695923, "loss/kd": 3.4299449920654297, "loss/lm": 0.3174106776714325, "step": 1555 }, { "epoch": 0.31944159310203246, "grad_norm": 1.0827300809597027, "kd_ratio": 0.5, "learning_rate": 1.5921882467080206e-05, "loss": 0.9992591738700867, "loss/kd": 1.6169486045837402, "loss/lm": 0.3815697133541107, "step": 1556 }, { "epoch": 0.319646889755697, "grad_norm": 1.7274990898899838, "kd_ratio": 0.5, "learning_rate": 1.5916522363996477e-05, "loss": 1.1137126684188843, "loss/kd": 1.800591230392456, "loss/lm": 0.42683419585227966, "step": 1557 }, { "epoch": 0.3198521864093615, "grad_norm": 2.1284589163138756, "kd_ratio": 0.5, "learning_rate": 1.5911159644257765e-05, "loss": 1.12232506275177, "loss/kd": 1.8702706098556519, "loss/lm": 0.37437954545021057, "step": 1558 }, { "epoch": 0.32005748306302606, "grad_norm": 1.3645387186984712, "kd_ratio": 0.5, "learning_rate": 1.5905794310235808e-05, "loss": 1.003210425376892, "loss/kd": 1.6457328796386719, "loss/lm": 0.3606879711151123, "step": 1559 }, { "epoch": 0.3202627797166906, "grad_norm": 1.3754255684498216, "kd_ratio": 0.5, "learning_rate": 1.5900426364303486e-05, "loss": 1.2836906909942627, "loss/kd": 2.1420063972473145, "loss/lm": 0.4253750741481781, "step": 1560 }, { "epoch": 0.3204680763703552, "grad_norm": 2.1776829007135214, "kd_ratio": 0.5, "learning_rate": 1.589505580883484e-05, "loss": 1.2147196531295776, "loss/kd": 1.9994927644729614, "loss/lm": 0.4299464523792267, "step": 1561 }, { "epoch": 0.3206733730240197, "grad_norm": 2.6602174226998287, "kd_ratio": 0.5, "learning_rate": 1.5889682646205066e-05, "loss": 1.1790677309036255, "loss/kd": 1.9521867036819458, "loss/lm": 0.405948668718338, "step": 1562 }, { "epoch": 0.32087866967768425, "grad_norm": 2.503924368405537, "kd_ratio": 0.5, "learning_rate": 1.5884306878790512e-05, "loss": 1.290306568145752, "loss/kd": 2.2249679565429688, "loss/lm": 0.3556452691555023, "step": 1563 }, { "epoch": 0.3210839663313488, "grad_norm": 1.643015859827164, "kd_ratio": 0.5, "learning_rate": 1.5878928508968676e-05, "loss": 0.9885281324386597, "loss/kd": 1.6042433977127075, "loss/lm": 0.37281283736228943, "step": 1564 }, { "epoch": 0.32128926298501337, "grad_norm": 1.2259701537257783, "kd_ratio": 0.5, "learning_rate": 1.5873547539118207e-05, "loss": 1.3879787921905518, "loss/kd": 2.31321382522583, "loss/lm": 0.462743878364563, "step": 1565 }, { "epoch": 0.3214945596386779, "grad_norm": 1.7729932184931534, "kd_ratio": 0.5, "learning_rate": 1.5868163971618904e-05, "loss": 1.2932664155960083, "loss/kd": 2.225094795227051, "loss/lm": 0.3614380955696106, "step": 1566 }, { "epoch": 0.32169985629234243, "grad_norm": 1.3783180881311943, "kd_ratio": 0.5, "learning_rate": 1.586277780885172e-05, "loss": 1.061660885810852, "loss/kd": 1.7202593088150024, "loss/lm": 0.4030625522136688, "step": 1567 }, { "epoch": 0.32190515294600697, "grad_norm": 1.2695561558325665, "kd_ratio": 0.5, "learning_rate": 1.5857389053198753e-05, "loss": 1.0710123777389526, "loss/kd": 1.7403711080551147, "loss/lm": 0.4016536772251129, "step": 1568 }, { "epoch": 0.3221104495996715, "grad_norm": 2.570882508698269, "kd_ratio": 0.5, "learning_rate": 1.585199770704324e-05, "loss": 1.0376243591308594, "loss/kd": 1.644372582435608, "loss/lm": 0.43087613582611084, "step": 1569 }, { "epoch": 0.3223157462533361, "grad_norm": 2.7293904490464236, "kd_ratio": 0.5, "learning_rate": 1.584660377276958e-05, "loss": 1.1267436742782593, "loss/kd": 1.8399354219436646, "loss/lm": 0.4135519862174988, "step": 1570 }, { "epoch": 0.3225210429070006, "grad_norm": 1.6437293904082775, "kd_ratio": 0.5, "learning_rate": 1.58412072527633e-05, "loss": 1.1368701457977295, "loss/kd": 1.887180209159851, "loss/lm": 0.3865601718425751, "step": 1571 }, { "epoch": 0.32272633956066515, "grad_norm": 1.116568759757198, "kd_ratio": 0.5, "learning_rate": 1.583580814941108e-05, "loss": 0.9529187679290771, "loss/kd": 1.460347294807434, "loss/lm": 0.4454902708530426, "step": 1572 }, { "epoch": 0.3229316362143297, "grad_norm": 2.0455569909758156, "kd_ratio": 0.5, "learning_rate": 1.583040646510074e-05, "loss": 1.075738549232483, "loss/kd": 1.7963846921920776, "loss/lm": 0.3550924062728882, "step": 1573 }, { "epoch": 0.3231369328679943, "grad_norm": 1.8053320913665531, "kd_ratio": 0.5, "learning_rate": 1.582500220222124e-05, "loss": 1.0562071800231934, "loss/kd": 1.6659901142120361, "loss/lm": 0.44642412662506104, "step": 1574 }, { "epoch": 0.3233422295216588, "grad_norm": 1.7353438017480352, "kd_ratio": 0.5, "learning_rate": 1.5819595363162682e-05, "loss": 0.9621950387954712, "loss/kd": 1.5223199129104614, "loss/lm": 0.4020701050758362, "step": 1575 }, { "epoch": 0.32354752617532334, "grad_norm": 1.665147624205623, "kd_ratio": 0.5, "learning_rate": 1.5814185950316307e-05, "loss": 0.9679046273231506, "loss/kd": 1.5379815101623535, "loss/lm": 0.39782774448394775, "step": 1576 }, { "epoch": 0.3237528228289879, "grad_norm": 2.3568607902769587, "kd_ratio": 0.5, "learning_rate": 1.5808773966074493e-05, "loss": 0.9155915975570679, "loss/kd": 1.4903266429901123, "loss/lm": 0.3408565819263458, "step": 1577 }, { "epoch": 0.3239581194826524, "grad_norm": 1.4993614367481136, "kd_ratio": 0.5, "learning_rate": 1.5803359412830763e-05, "loss": 1.054990291595459, "loss/kd": 1.7180379629135132, "loss/lm": 0.39194267988204956, "step": 1578 }, { "epoch": 0.324163416136317, "grad_norm": 1.4426272923457069, "kd_ratio": 0.5, "learning_rate": 1.5797942292979767e-05, "loss": 1.0817068815231323, "loss/kd": 1.7888977527618408, "loss/lm": 0.37451592087745667, "step": 1579 }, { "epoch": 0.32436871278998153, "grad_norm": 2.21166487115772, "kd_ratio": 0.5, "learning_rate": 1.579252260891729e-05, "loss": 1.0141950845718384, "loss/kd": 1.6081370115280151, "loss/lm": 0.4202532172203064, "step": 1580 }, { "epoch": 0.32457400944364606, "grad_norm": 1.753572349028404, "kd_ratio": 0.5, "learning_rate": 1.5787100363040256e-05, "loss": 1.0429041385650635, "loss/kd": 1.6853067874908447, "loss/lm": 0.4005015194416046, "step": 1581 }, { "epoch": 0.3247793060973106, "grad_norm": 1.0306388896648866, "kd_ratio": 0.5, "learning_rate": 1.578167555774672e-05, "loss": 1.2496006488800049, "loss/kd": 1.9888428449630737, "loss/lm": 0.510358452796936, "step": 1582 }, { "epoch": 0.3249846027509752, "grad_norm": 1.9407462693772888, "kd_ratio": 0.5, "learning_rate": 1.577624819543587e-05, "loss": 1.139809012413025, "loss/kd": 1.8904973268508911, "loss/lm": 0.38912075757980347, "step": 1583 }, { "epoch": 0.3251898994046397, "grad_norm": 2.4050537563634022, "kd_ratio": 0.5, "learning_rate": 1.5770818278508025e-05, "loss": 1.1370755434036255, "loss/kd": 1.8445032835006714, "loss/lm": 0.4296477437019348, "step": 1584 }, { "epoch": 0.32539519605830425, "grad_norm": 1.349282275067116, "kd_ratio": 0.5, "learning_rate": 1.576538580936463e-05, "loss": 1.260302186012268, "loss/kd": 2.137861967086792, "loss/lm": 0.38274240493774414, "step": 1585 }, { "epoch": 0.3256004927119688, "grad_norm": 1.5139701052912875, "kd_ratio": 0.5, "learning_rate": 1.5759950790408264e-05, "loss": 1.0268073081970215, "loss/kd": 1.6606358289718628, "loss/lm": 0.39297884702682495, "step": 1586 }, { "epoch": 0.3258057893656333, "grad_norm": 1.8905343582541758, "kd_ratio": 0.5, "learning_rate": 1.5754513224042625e-05, "loss": 1.0145413875579834, "loss/kd": 1.6404061317443848, "loss/lm": 0.38867658376693726, "step": 1587 }, { "epoch": 0.3260110860192979, "grad_norm": 1.34427822112437, "kd_ratio": 0.5, "learning_rate": 1.574907311267255e-05, "loss": 1.3067954778671265, "loss/kd": 2.2218687534332275, "loss/lm": 0.391722172498703, "step": 1588 }, { "epoch": 0.32621638267296243, "grad_norm": 1.4218377617035511, "kd_ratio": 0.5, "learning_rate": 1.5743630458703996e-05, "loss": 1.1141330003738403, "loss/kd": 1.8557287454605103, "loss/lm": 0.37253719568252563, "step": 1589 }, { "epoch": 0.32642167932662697, "grad_norm": 2.0357888563851065, "kd_ratio": 0.5, "learning_rate": 1.573818526454404e-05, "loss": 1.0946749448776245, "loss/kd": 1.7517801523208618, "loss/lm": 0.4375697374343872, "step": 1590 }, { "epoch": 0.3266269759802915, "grad_norm": 1.4851873070236066, "kd_ratio": 0.5, "learning_rate": 1.573273753260089e-05, "loss": 0.9685258865356445, "loss/kd": 1.5865825414657593, "loss/lm": 0.350469172000885, "step": 1591 }, { "epoch": 0.3268322726339561, "grad_norm": 1.4863498376254365, "kd_ratio": 0.5, "learning_rate": 1.572728726528387e-05, "loss": 1.41228449344635, "loss/kd": 2.4742283821105957, "loss/lm": 0.35034069418907166, "step": 1592 }, { "epoch": 0.3270375692876206, "grad_norm": 2.5814690777712186, "kd_ratio": 0.5, "learning_rate": 1.5721834465003425e-05, "loss": 1.1099002361297607, "loss/kd": 1.7175172567367554, "loss/lm": 0.5022831559181213, "step": 1593 }, { "epoch": 0.32724286594128515, "grad_norm": 1.351881563123976, "kd_ratio": 0.5, "learning_rate": 1.571637913417113e-05, "loss": 1.171389102935791, "loss/kd": 1.9456900358200073, "loss/lm": 0.3970882296562195, "step": 1594 }, { "epoch": 0.3274481625949497, "grad_norm": 2.146477417775098, "kd_ratio": 0.5, "learning_rate": 1.571092127519967e-05, "loss": 1.941074252128601, "loss/kd": 3.5647311210632324, "loss/lm": 0.3174173831939697, "step": 1595 }, { "epoch": 0.3276534592486142, "grad_norm": 2.9238893703527027, "kd_ratio": 0.5, "learning_rate": 1.5705460890502845e-05, "loss": 1.1240499019622803, "loss/kd": 1.7955436706542969, "loss/lm": 0.4525560140609741, "step": 1596 }, { "epoch": 0.3278587559022788, "grad_norm": 1.2819076333607846, "kd_ratio": 0.5, "learning_rate": 1.5699997982495586e-05, "loss": 0.9899414777755737, "loss/kd": 1.6471353769302368, "loss/lm": 0.33274754881858826, "step": 1597 }, { "epoch": 0.32806405255594334, "grad_norm": 2.2058461858959895, "kd_ratio": 0.5, "learning_rate": 1.5694532553593925e-05, "loss": 1.3559036254882812, "loss/kd": 2.2313950061798096, "loss/lm": 0.4804122745990753, "step": 1598 }, { "epoch": 0.3282693492096079, "grad_norm": 1.702253524083304, "kd_ratio": 0.5, "learning_rate": 1.568906460621502e-05, "loss": 0.9474708437919617, "loss/kd": 1.5268687009811401, "loss/lm": 0.3680729866027832, "step": 1599 }, { "epoch": 0.3284746458632724, "grad_norm": 1.7333134030076416, "kd_ratio": 0.5, "learning_rate": 1.568359414277713e-05, "loss": 1.0663783550262451, "loss/kd": 1.751625657081604, "loss/lm": 0.3811311423778534, "step": 1600 }, { "epoch": 0.328679942516937, "grad_norm": 1.5541667443477105, "kd_ratio": 0.5, "learning_rate": 1.567812116569965e-05, "loss": 1.1918816566467285, "loss/kd": 1.8858143091201782, "loss/lm": 0.49794894456863403, "step": 1601 }, { "epoch": 0.32888523917060153, "grad_norm": 2.107010055479111, "kd_ratio": 0.5, "learning_rate": 1.567264567740306e-05, "loss": 1.095778226852417, "loss/kd": 1.7488503456115723, "loss/lm": 0.4427061378955841, "step": 1602 }, { "epoch": 0.32909053582426606, "grad_norm": 2.5806073977897745, "kd_ratio": 0.5, "learning_rate": 1.566716768030896e-05, "loss": 1.189994215965271, "loss/kd": 2.045024871826172, "loss/lm": 0.3349635601043701, "step": 1603 }, { "epoch": 0.3292958324779306, "grad_norm": 1.8173398230315794, "kd_ratio": 0.5, "learning_rate": 1.5661687176840066e-05, "loss": 1.1670469045639038, "loss/kd": 1.8396598100662231, "loss/lm": 0.4944339394569397, "step": 1604 }, { "epoch": 0.3295011291315951, "grad_norm": 2.0784585052934985, "kd_ratio": 0.5, "learning_rate": 1.56562041694202e-05, "loss": 1.1104251146316528, "loss/kd": 1.8619141578674316, "loss/lm": 0.3589361608028412, "step": 1605 }, { "epoch": 0.3297064257852597, "grad_norm": 1.242665371764101, "kd_ratio": 0.5, "learning_rate": 1.5650718660474288e-05, "loss": 1.3268766403198242, "loss/kd": 2.1563775539398193, "loss/lm": 0.4973757565021515, "step": 1606 }, { "epoch": 0.32991172243892425, "grad_norm": 1.9784785331467911, "kd_ratio": 0.5, "learning_rate": 1.5645230652428367e-05, "loss": 1.238661527633667, "loss/kd": 2.089184522628784, "loss/lm": 0.3881385028362274, "step": 1607 }, { "epoch": 0.3301170190925888, "grad_norm": 1.6874781653122903, "kd_ratio": 0.5, "learning_rate": 1.563974014770957e-05, "loss": 1.0721423625946045, "loss/kd": 1.6996252536773682, "loss/lm": 0.44465959072113037, "step": 1608 }, { "epoch": 0.3303223157462533, "grad_norm": 1.8981732426796385, "kd_ratio": 0.5, "learning_rate": 1.5634247148746147e-05, "loss": 1.1439284086227417, "loss/kd": 1.9253543615341187, "loss/lm": 0.3625025153160095, "step": 1609 }, { "epoch": 0.3305276123999179, "grad_norm": 1.3547664070113337, "kd_ratio": 0.5, "learning_rate": 1.562875165796744e-05, "loss": 1.0892539024353027, "loss/kd": 1.7985057830810547, "loss/lm": 0.3800020217895508, "step": 1610 }, { "epoch": 0.33073290905358244, "grad_norm": 1.6645302476847532, "kd_ratio": 0.5, "learning_rate": 1.5623253677803897e-05, "loss": 1.0473049879074097, "loss/kd": 1.6217797994613647, "loss/lm": 0.47283023595809937, "step": 1611 }, { "epoch": 0.33093820570724697, "grad_norm": 1.4235845641615992, "kd_ratio": 0.5, "learning_rate": 1.5617753210687072e-05, "loss": 0.9351109266281128, "loss/kd": 1.5265485048294067, "loss/lm": 0.34367334842681885, "step": 1612 }, { "epoch": 0.3311435023609115, "grad_norm": 1.0163444769608967, "kd_ratio": 0.5, "learning_rate": 1.561225025904961e-05, "loss": 0.9368967413902283, "loss/kd": 1.5120995044708252, "loss/lm": 0.36169400811195374, "step": 1613 }, { "epoch": 0.3313487990145761, "grad_norm": 1.1835016143584118, "kd_ratio": 0.5, "learning_rate": 1.5606744825325263e-05, "loss": 0.7799957990646362, "loss/kd": 1.2195708751678467, "loss/lm": 0.3404206931591034, "step": 1614 }, { "epoch": 0.3315540956682406, "grad_norm": 1.055185568915288, "kd_ratio": 0.5, "learning_rate": 1.5601236911948876e-05, "loss": 1.0152900218963623, "loss/kd": 1.6742366552352905, "loss/lm": 0.35634344816207886, "step": 1615 }, { "epoch": 0.33175939232190516, "grad_norm": 1.3023391051755533, "kd_ratio": 0.5, "learning_rate": 1.5595726521356387e-05, "loss": 1.218833565711975, "loss/kd": 1.995219349861145, "loss/lm": 0.4424477219581604, "step": 1616 }, { "epoch": 0.3319646889755697, "grad_norm": 1.2241347390423645, "kd_ratio": 0.5, "learning_rate": 1.559021365598484e-05, "loss": 1.0034111738204956, "loss/kd": 1.6324611902236938, "loss/lm": 0.3743612468242645, "step": 1617 }, { "epoch": 0.3321699856292342, "grad_norm": 1.103163660643003, "kd_ratio": 0.5, "learning_rate": 1.5584698318272367e-05, "loss": 1.0586110353469849, "loss/kd": 1.7979360818862915, "loss/lm": 0.3192860782146454, "step": 1618 }, { "epoch": 0.3323752822828988, "grad_norm": 1.0944603864220357, "kd_ratio": 0.5, "learning_rate": 1.5579180510658187e-05, "loss": 1.0279016494750977, "loss/kd": 1.650151014328003, "loss/lm": 0.40565234422683716, "step": 1619 }, { "epoch": 0.33258057893656334, "grad_norm": 1.15663171776151, "kd_ratio": 0.5, "learning_rate": 1.557366023558263e-05, "loss": 1.0309603214263916, "loss/kd": 1.7230892181396484, "loss/lm": 0.3388315439224243, "step": 1620 }, { "epoch": 0.3327858755902279, "grad_norm": 1.4786725344925835, "kd_ratio": 0.5, "learning_rate": 1.55681374954871e-05, "loss": 1.2153786420822144, "loss/kd": 2.0332796573638916, "loss/lm": 0.3974776268005371, "step": 1621 }, { "epoch": 0.3329911722438924, "grad_norm": 1.1034999928045088, "kd_ratio": 0.5, "learning_rate": 1.556261229281409e-05, "loss": 1.147789478302002, "loss/kd": 1.9291046857833862, "loss/lm": 0.36647433042526245, "step": 1622 }, { "epoch": 0.333196468897557, "grad_norm": 1.5987251288714024, "kd_ratio": 0.5, "learning_rate": 1.5557084630007206e-05, "loss": 1.0439623594284058, "loss/kd": 1.7414679527282715, "loss/lm": 0.3464568257331848, "step": 1623 }, { "epoch": 0.33340176555122153, "grad_norm": 1.8167036835237544, "kd_ratio": 0.5, "learning_rate": 1.5551554509511107e-05, "loss": 1.1456584930419922, "loss/kd": 1.9076110124588013, "loss/lm": 0.38370588421821594, "step": 1624 }, { "epoch": 0.33360706220488606, "grad_norm": 1.173582316631859, "kd_ratio": 0.5, "learning_rate": 1.5546021933771568e-05, "loss": 1.034666895866394, "loss/kd": 1.662346601486206, "loss/lm": 0.40698713064193726, "step": 1625 }, { "epoch": 0.3338123588585506, "grad_norm": 1.124106714022086, "kd_ratio": 0.5, "learning_rate": 1.5540486905235434e-05, "loss": 1.063976526260376, "loss/kd": 1.6612908840179443, "loss/lm": 0.46666204929351807, "step": 1626 }, { "epoch": 0.33401765551221513, "grad_norm": 1.5814512778110679, "kd_ratio": 0.5, "learning_rate": 1.5534949426350642e-05, "loss": 1.0783978700637817, "loss/kd": 1.822451114654541, "loss/lm": 0.3343445658683777, "step": 1627 }, { "epoch": 0.3342229521658797, "grad_norm": 1.417622618527803, "kd_ratio": 0.5, "learning_rate": 1.552940949956621e-05, "loss": 0.996055543422699, "loss/kd": 1.6223595142364502, "loss/lm": 0.36975157260894775, "step": 1628 }, { "epoch": 0.33442824881954425, "grad_norm": 1.0946401072126806, "kd_ratio": 0.5, "learning_rate": 1.552386712733224e-05, "loss": 0.9344702959060669, "loss/kd": 1.476144552230835, "loss/lm": 0.39279603958129883, "step": 1629 }, { "epoch": 0.3346335454732088, "grad_norm": 1.5562205162429785, "kd_ratio": 0.5, "learning_rate": 1.5518322312099908e-05, "loss": 0.9560286998748779, "loss/kd": 1.4933832883834839, "loss/lm": 0.41867417097091675, "step": 1630 }, { "epoch": 0.3348388421268733, "grad_norm": 1.3869893333847636, "kd_ratio": 0.5, "learning_rate": 1.551277505632149e-05, "loss": 0.9950425624847412, "loss/kd": 1.5982950925827026, "loss/lm": 0.3917900025844574, "step": 1631 }, { "epoch": 0.3350441387805379, "grad_norm": 1.0838151456997858, "kd_ratio": 0.5, "learning_rate": 1.5507225362450312e-05, "loss": 0.952720582485199, "loss/kd": 1.5930715799331665, "loss/lm": 0.31236958503723145, "step": 1632 }, { "epoch": 0.33524943543420244, "grad_norm": 1.3423904422173225, "kd_ratio": 0.5, "learning_rate": 1.5501673232940807e-05, "loss": 0.9956599473953247, "loss/kd": 1.5797536373138428, "loss/lm": 0.41156628727912903, "step": 1633 }, { "epoch": 0.33545473208786697, "grad_norm": 1.3820265748391243, "kd_ratio": 0.5, "learning_rate": 1.549611867024847e-05, "loss": 1.2339009046554565, "loss/kd": 2.0747334957122803, "loss/lm": 0.39306822419166565, "step": 1634 }, { "epoch": 0.3356600287415315, "grad_norm": 0.9671918106933984, "kd_ratio": 0.5, "learning_rate": 1.549056167682987e-05, "loss": 1.0824792385101318, "loss/kd": 1.8272175788879395, "loss/lm": 0.33774086833000183, "step": 1635 }, { "epoch": 0.33586532539519603, "grad_norm": 1.154857290973034, "kd_ratio": 0.5, "learning_rate": 1.5485002255142662e-05, "loss": 1.2510100603103638, "loss/kd": 2.1038753986358643, "loss/lm": 0.3981446325778961, "step": 1636 }, { "epoch": 0.3360706220488606, "grad_norm": 1.5522415214703613, "kd_ratio": 0.5, "learning_rate": 1.5479440407645565e-05, "loss": 0.9349936842918396, "loss/kd": 1.562770962715149, "loss/lm": 0.30721643567085266, "step": 1637 }, { "epoch": 0.33627591870252516, "grad_norm": 1.7963995606361418, "kd_ratio": 0.5, "learning_rate": 1.5473876136798374e-05, "loss": 1.021751880645752, "loss/kd": 1.6606403589248657, "loss/lm": 0.38286352157592773, "step": 1638 }, { "epoch": 0.3364812153561897, "grad_norm": 1.6967022679037411, "kd_ratio": 0.5, "learning_rate": 1.546830944506196e-05, "loss": 0.9434210062026978, "loss/kd": 1.4505072832107544, "loss/lm": 0.4363346993923187, "step": 1639 }, { "epoch": 0.3366865120098542, "grad_norm": 1.2892828107000265, "kd_ratio": 0.5, "learning_rate": 1.5462740334898257e-05, "loss": 1.0863854885101318, "loss/kd": 1.773627519607544, "loss/lm": 0.3991435468196869, "step": 1640 }, { "epoch": 0.3368918086635188, "grad_norm": 2.454191572668273, "kd_ratio": 0.5, "learning_rate": 1.5457168808770278e-05, "loss": 0.9498311281204224, "loss/kd": 1.5500930547714233, "loss/lm": 0.349569171667099, "step": 1641 }, { "epoch": 0.33709710531718334, "grad_norm": 1.6111947177563561, "kd_ratio": 0.5, "learning_rate": 1.5451594869142096e-05, "loss": 0.9988285899162292, "loss/kd": 1.6150603294372559, "loss/lm": 0.38259682059288025, "step": 1642 }, { "epoch": 0.3373024019708479, "grad_norm": 1.6982665173853118, "kd_ratio": 0.5, "learning_rate": 1.544601851847885e-05, "loss": 1.112583875656128, "loss/kd": 1.7610231637954712, "loss/lm": 0.4641445279121399, "step": 1643 }, { "epoch": 0.3375076986245124, "grad_norm": 1.7085106641691428, "kd_ratio": 0.5, "learning_rate": 1.5440439759246756e-05, "loss": 1.9681077003479004, "loss/kd": 3.62288498878479, "loss/lm": 0.3133305311203003, "step": 1644 }, { "epoch": 0.33771299527817694, "grad_norm": 1.502861966235528, "kd_ratio": 0.5, "learning_rate": 1.5434858593913087e-05, "loss": 1.0527515411376953, "loss/kd": 1.8347795009613037, "loss/lm": 0.2707236707210541, "step": 1645 }, { "epoch": 0.33791829193184153, "grad_norm": 1.48411686750476, "kd_ratio": 0.5, "learning_rate": 1.5429275024946187e-05, "loss": 1.1435770988464355, "loss/kd": 1.8139909505844116, "loss/lm": 0.47316327691078186, "step": 1646 }, { "epoch": 0.33812358858550606, "grad_norm": 1.0055318964660864, "kd_ratio": 0.5, "learning_rate": 1.542368905481545e-05, "loss": 1.3515163660049438, "loss/kd": 2.3721516132354736, "loss/lm": 0.3308810591697693, "step": 1647 }, { "epoch": 0.3383288852391706, "grad_norm": 1.572266994361662, "kd_ratio": 0.5, "learning_rate": 1.5418100685991344e-05, "loss": 1.0440995693206787, "loss/kd": 1.6981297731399536, "loss/lm": 0.39006930589675903, "step": 1648 }, { "epoch": 0.33853418189283513, "grad_norm": 1.385796112390297, "kd_ratio": 0.5, "learning_rate": 1.5412509920945398e-05, "loss": 0.9620411396026611, "loss/kd": 1.6136656999588013, "loss/lm": 0.31041663885116577, "step": 1649 }, { "epoch": 0.3387394785464997, "grad_norm": 1.678904537973718, "kd_ratio": 0.5, "learning_rate": 1.5406916762150187e-05, "loss": 1.0892871618270874, "loss/kd": 1.8362919092178345, "loss/lm": 0.3422825038433075, "step": 1650 }, { "epoch": 0.33894477520016425, "grad_norm": 1.163654036767065, "kd_ratio": 0.5, "learning_rate": 1.5401321212079366e-05, "loss": 1.071509599685669, "loss/kd": 1.776570439338684, "loss/lm": 0.36644864082336426, "step": 1651 }, { "epoch": 0.3391500718538288, "grad_norm": 2.0689661956780245, "kd_ratio": 0.5, "learning_rate": 1.5395723273207626e-05, "loss": 1.1205521821975708, "loss/kd": 1.868022084236145, "loss/lm": 0.3730821907520294, "step": 1652 }, { "epoch": 0.3393553685074933, "grad_norm": 1.3933661243901265, "kd_ratio": 0.5, "learning_rate": 1.539012294801073e-05, "loss": 1.1661155223846436, "loss/kd": 1.9587520360946655, "loss/lm": 0.3734790086746216, "step": 1653 }, { "epoch": 0.33956066516115785, "grad_norm": 1.5492239440356776, "kd_ratio": 0.5, "learning_rate": 1.5384520238965487e-05, "loss": 0.957607626914978, "loss/kd": 1.6131478548049927, "loss/lm": 0.3020673990249634, "step": 1654 }, { "epoch": 0.33976596181482244, "grad_norm": 1.5148955164119493, "kd_ratio": 0.5, "learning_rate": 1.5378915148549772e-05, "loss": 1.551414966583252, "loss/kd": 2.5761635303497314, "loss/lm": 0.526666522026062, "step": 1655 }, { "epoch": 0.33997125846848697, "grad_norm": 1.6368618087869207, "kd_ratio": 0.5, "learning_rate": 1.5373307679242496e-05, "loss": 1.2290992736816406, "loss/kd": 2.056640625, "loss/lm": 0.40155795216560364, "step": 1656 }, { "epoch": 0.3401765551221515, "grad_norm": 1.5149603808373184, "kd_ratio": 0.5, "learning_rate": 1.536769783352364e-05, "loss": 1.0772649049758911, "loss/kd": 1.7364697456359863, "loss/lm": 0.4180600941181183, "step": 1657 }, { "epoch": 0.34038185177581604, "grad_norm": 1.2758389138508741, "kd_ratio": 0.5, "learning_rate": 1.536208561387422e-05, "loss": 1.3012306690216064, "loss/kd": 2.150729179382324, "loss/lm": 0.4517321288585663, "step": 1658 }, { "epoch": 0.3405871484294806, "grad_norm": 1.0698736919782914, "kd_ratio": 0.5, "learning_rate": 1.5356471022776315e-05, "loss": 1.098378300666809, "loss/kd": 1.8864843845367432, "loss/lm": 0.3102721869945526, "step": 1659 }, { "epoch": 0.34079244508314516, "grad_norm": 1.7215429094850205, "kd_ratio": 0.5, "learning_rate": 1.535085406271304e-05, "loss": 1.015791416168213, "loss/kd": 1.687217354774475, "loss/lm": 0.34436550736427307, "step": 1660 }, { "epoch": 0.3409977417368097, "grad_norm": 1.7206543486821555, "kd_ratio": 0.5, "learning_rate": 1.5345234736168572e-05, "loss": 1.3500123023986816, "loss/kd": 2.2579150199890137, "loss/lm": 0.44210970401763916, "step": 1661 }, { "epoch": 0.3412030383904742, "grad_norm": 1.5461659960737326, "kd_ratio": 0.5, "learning_rate": 1.5339613045628127e-05, "loss": 1.2897381782531738, "loss/kd": 2.076674699783325, "loss/lm": 0.5028015375137329, "step": 1662 }, { "epoch": 0.34140833504413876, "grad_norm": 1.3651896014688458, "kd_ratio": 0.5, "learning_rate": 1.5333988993577958e-05, "loss": 1.1278712749481201, "loss/kd": 1.8034121990203857, "loss/lm": 0.4523303210735321, "step": 1663 }, { "epoch": 0.34161363169780334, "grad_norm": 1.239905386356712, "kd_ratio": 0.5, "learning_rate": 1.5328362582505384e-05, "loss": 0.9775252938270569, "loss/kd": 1.6046476364135742, "loss/lm": 0.35040295124053955, "step": 1664 }, { "epoch": 0.3418189283514679, "grad_norm": 1.7606062791432369, "kd_ratio": 0.5, "learning_rate": 1.532273381489875e-05, "loss": 1.0710828304290771, "loss/kd": 1.7203874588012695, "loss/lm": 0.42177814245224, "step": 1665 }, { "epoch": 0.3420242250051324, "grad_norm": 1.9049555595553378, "kd_ratio": 0.5, "learning_rate": 1.5317102693247443e-05, "loss": 1.2434957027435303, "loss/kd": 2.161149501800537, "loss/lm": 0.3258419930934906, "step": 1666 }, { "epoch": 0.34222952165879694, "grad_norm": 1.5741770726888478, "kd_ratio": 0.5, "learning_rate": 1.5311469220041903e-05, "loss": 1.0989290475845337, "loss/kd": 1.7150013446807861, "loss/lm": 0.48285672068595886, "step": 1667 }, { "epoch": 0.34243481831246153, "grad_norm": 1.026516416032385, "kd_ratio": 0.5, "learning_rate": 1.5305833397773596e-05, "loss": 1.0815215110778809, "loss/kd": 1.7869600057601929, "loss/lm": 0.3760829567909241, "step": 1668 }, { "epoch": 0.34264011496612606, "grad_norm": 1.0556177472064254, "kd_ratio": 0.5, "learning_rate": 1.5300195228935044e-05, "loss": 1.3611540794372559, "loss/kd": 2.2926242351531982, "loss/lm": 0.42968401312828064, "step": 1669 }, { "epoch": 0.3428454116197906, "grad_norm": 1.2046765236996506, "kd_ratio": 0.5, "learning_rate": 1.5294554716019788e-05, "loss": 1.2949540615081787, "loss/kd": 2.170142650604248, "loss/lm": 0.4197655916213989, "step": 1670 }, { "epoch": 0.34305070827345513, "grad_norm": 1.1974813028878597, "kd_ratio": 0.5, "learning_rate": 1.5288911861522413e-05, "loss": 1.278981328010559, "loss/kd": 2.1456785202026367, "loss/lm": 0.4122840464115143, "step": 1671 }, { "epoch": 0.34325600492711966, "grad_norm": 1.2064277364243712, "kd_ratio": 0.5, "learning_rate": 1.5283266667938547e-05, "loss": 1.0760204792022705, "loss/kd": 1.7293277978897095, "loss/lm": 0.42271319031715393, "step": 1672 }, { "epoch": 0.34346130158078425, "grad_norm": 1.0478904956647002, "kd_ratio": 0.5, "learning_rate": 1.5277619137764843e-05, "loss": 0.9779921770095825, "loss/kd": 1.5569019317626953, "loss/lm": 0.3990824222564697, "step": 1673 }, { "epoch": 0.3436665982344488, "grad_norm": 1.2393062958460161, "kd_ratio": 0.5, "learning_rate": 1.5271969273498994e-05, "loss": 1.161069393157959, "loss/kd": 1.8353629112243652, "loss/lm": 0.4867759644985199, "step": 1674 }, { "epoch": 0.3438718948881133, "grad_norm": 1.4018686679536072, "kd_ratio": 0.5, "learning_rate": 1.526631707763972e-05, "loss": 1.5403993129730225, "loss/kd": 2.71354341506958, "loss/lm": 0.3672551214694977, "step": 1675 }, { "epoch": 0.34407719154177785, "grad_norm": 1.3149141397185153, "kd_ratio": 0.5, "learning_rate": 1.5260662552686776e-05, "loss": 1.0801293849945068, "loss/kd": 1.7694231271743774, "loss/lm": 0.3908356726169586, "step": 1676 }, { "epoch": 0.34428248819544244, "grad_norm": 1.3625592522653638, "kd_ratio": 0.5, "learning_rate": 1.5255005701140944e-05, "loss": 1.2994157075881958, "loss/kd": 2.1938488483428955, "loss/lm": 0.4049825966358185, "step": 1677 }, { "epoch": 0.34448778484910697, "grad_norm": 1.4935780221335515, "kd_ratio": 0.5, "learning_rate": 1.5249346525504032e-05, "loss": 1.0217187404632568, "loss/kd": 1.6385326385498047, "loss/lm": 0.4049048125743866, "step": 1678 }, { "epoch": 0.3446930815027715, "grad_norm": 1.630012547584631, "kd_ratio": 0.5, "learning_rate": 1.5243685028278888e-05, "loss": 1.0276217460632324, "loss/kd": 1.7624192237854004, "loss/lm": 0.2928243577480316, "step": 1679 }, { "epoch": 0.34489837815643604, "grad_norm": 1.3035629931174313, "kd_ratio": 0.5, "learning_rate": 1.5238021211969375e-05, "loss": 1.07335364818573, "loss/kd": 1.703441858291626, "loss/lm": 0.443265438079834, "step": 1680 }, { "epoch": 0.34510367481010057, "grad_norm": 1.3706594131788288, "kd_ratio": 0.5, "learning_rate": 1.5232355079080385e-05, "loss": 1.3046005964279175, "loss/kd": 2.200744390487671, "loss/lm": 0.4084567129611969, "step": 1681 }, { "epoch": 0.34530897146376516, "grad_norm": 2.284530576122698, "kd_ratio": 0.5, "learning_rate": 1.5226686632117837e-05, "loss": 0.8295267224311829, "loss/kd": 1.3021265268325806, "loss/lm": 0.35692688822746277, "step": 1682 }, { "epoch": 0.3455142681174297, "grad_norm": 2.0132531807808234, "kd_ratio": 0.5, "learning_rate": 1.5221015873588672e-05, "loss": 1.2099101543426514, "loss/kd": 2.0238304138183594, "loss/lm": 0.39598986506462097, "step": 1683 }, { "epoch": 0.3457195647710942, "grad_norm": 0.9719363216853072, "kd_ratio": 0.5, "learning_rate": 1.521534280600085e-05, "loss": 1.1414862871170044, "loss/kd": 1.8796111345291138, "loss/lm": 0.40336135029792786, "step": 1684 }, { "epoch": 0.34592486142475876, "grad_norm": 2.1490357685130905, "kd_ratio": 0.5, "learning_rate": 1.5209667431863357e-05, "loss": 1.2489899396896362, "loss/kd": 2.136141061782837, "loss/lm": 0.3618387281894684, "step": 1685 }, { "epoch": 0.34613015807842334, "grad_norm": 2.0210764910480723, "kd_ratio": 0.5, "learning_rate": 1.5203989753686198e-05, "loss": 0.8989713788032532, "loss/kd": 1.4637396335601807, "loss/lm": 0.3342031240463257, "step": 1686 }, { "epoch": 0.3463354547320879, "grad_norm": 1.1269786766656988, "kd_ratio": 0.5, "learning_rate": 1.5198309773980397e-05, "loss": 1.0402907133102417, "loss/kd": 1.6775418519973755, "loss/lm": 0.40303948521614075, "step": 1687 }, { "epoch": 0.3465407513857524, "grad_norm": 1.6610938575965113, "kd_ratio": 0.5, "learning_rate": 1.5192627495257992e-05, "loss": 0.9066832661628723, "loss/kd": 1.4664920568466187, "loss/lm": 0.346874475479126, "step": 1688 }, { "epoch": 0.34674604803941694, "grad_norm": 2.4304374688357844, "kd_ratio": 0.5, "learning_rate": 1.5186942920032039e-05, "loss": 1.014440894126892, "loss/kd": 1.65328848361969, "loss/lm": 0.375593364238739, "step": 1689 }, { "epoch": 0.3469513446930815, "grad_norm": 2.072317534451953, "kd_ratio": 0.5, "learning_rate": 1.5181256050816618e-05, "loss": 0.863060712814331, "loss/kd": 1.455895185470581, "loss/lm": 0.27022626996040344, "step": 1690 }, { "epoch": 0.34715664134674606, "grad_norm": 1.18123462079723, "kd_ratio": 0.5, "learning_rate": 1.5175566890126812e-05, "loss": 0.9860916137695312, "loss/kd": 1.5377720594406128, "loss/lm": 0.4344112277030945, "step": 1691 }, { "epoch": 0.3473619380004106, "grad_norm": 1.7358416855870327, "kd_ratio": 0.5, "learning_rate": 1.5169875440478725e-05, "loss": 1.1793245077133179, "loss/kd": 1.9478150606155396, "loss/lm": 0.4108339250087738, "step": 1692 }, { "epoch": 0.34756723465407513, "grad_norm": 1.981850469840646, "kd_ratio": 0.5, "learning_rate": 1.5164181704389471e-05, "loss": 1.0526230335235596, "loss/kd": 1.7012912034988403, "loss/lm": 0.4039548933506012, "step": 1693 }, { "epoch": 0.34777253130773966, "grad_norm": 1.0757621801197772, "kd_ratio": 0.5, "learning_rate": 1.5158485684377171e-05, "loss": 1.0063366889953613, "loss/kd": 1.650909662246704, "loss/lm": 0.36176377534866333, "step": 1694 }, { "epoch": 0.34797782796140425, "grad_norm": 1.7679995360360474, "kd_ratio": 0.5, "learning_rate": 1.5152787382960968e-05, "loss": 0.9888104200363159, "loss/kd": 1.6076339483261108, "loss/lm": 0.369986891746521, "step": 1695 }, { "epoch": 0.3481831246150688, "grad_norm": 2.2738591705195885, "kd_ratio": 0.5, "learning_rate": 1.5147086802660997e-05, "loss": 0.9617721438407898, "loss/kd": 1.5722522735595703, "loss/lm": 0.3512920141220093, "step": 1696 }, { "epoch": 0.3483884212687333, "grad_norm": 1.1042317062661222, "kd_ratio": 0.5, "learning_rate": 1.5141383945998414e-05, "loss": 0.8841731548309326, "loss/kd": 1.379934549331665, "loss/lm": 0.3884117901325226, "step": 1697 }, { "epoch": 0.34859371792239785, "grad_norm": 1.887854444601992, "kd_ratio": 0.5, "learning_rate": 1.5135678815495381e-05, "loss": 1.2100770473480225, "loss/kd": 2.0238900184631348, "loss/lm": 0.3962640166282654, "step": 1698 }, { "epoch": 0.3487990145760624, "grad_norm": 2.354087162180652, "kd_ratio": 0.5, "learning_rate": 1.5129971413675055e-05, "loss": 1.207667350769043, "loss/kd": 1.9843742847442627, "loss/lm": 0.43096038699150085, "step": 1699 }, { "epoch": 0.34900431122972697, "grad_norm": 1.2970144601784, "kd_ratio": 0.5, "learning_rate": 1.512426174306161e-05, "loss": 0.9870527982711792, "loss/kd": 1.5766350030899048, "loss/lm": 0.3974705934524536, "step": 1700 }, { "epoch": 0.3492096078833915, "grad_norm": 1.367948620987273, "kd_ratio": 0.5, "learning_rate": 1.5118549806180218e-05, "loss": 0.9637629985809326, "loss/kd": 1.6032061576843262, "loss/lm": 0.32431986927986145, "step": 1701 }, { "epoch": 0.34941490453705604, "grad_norm": 1.4616387602490586, "kd_ratio": 0.5, "learning_rate": 1.5112835605557052e-05, "loss": 1.3138810396194458, "loss/kd": 2.1932733058929443, "loss/lm": 0.43448886275291443, "step": 1702 }, { "epoch": 0.34962020119072057, "grad_norm": 1.0262872269535743, "kd_ratio": 0.5, "learning_rate": 1.510711914371929e-05, "loss": 1.1221755743026733, "loss/kd": 1.8216131925582886, "loss/lm": 0.4227379560470581, "step": 1703 }, { "epoch": 0.34982549784438516, "grad_norm": 1.5870282063162113, "kd_ratio": 0.5, "learning_rate": 1.5101400423195105e-05, "loss": 0.999963641166687, "loss/kd": 1.5983017683029175, "loss/lm": 0.40162548422813416, "step": 1704 }, { "epoch": 0.3500307944980497, "grad_norm": 1.5716632892993165, "kd_ratio": 0.5, "learning_rate": 1.5095679446513672e-05, "loss": 1.257581114768982, "loss/kd": 2.11061429977417, "loss/lm": 0.40454789996147156, "step": 1705 }, { "epoch": 0.3502360911517142, "grad_norm": 1.0751478092623996, "kd_ratio": 0.5, "learning_rate": 1.5089956216205166e-05, "loss": 1.4420948028564453, "loss/kd": 2.4027788639068604, "loss/lm": 0.4814107120037079, "step": 1706 }, { "epoch": 0.35044138780537876, "grad_norm": 1.7936740283101307, "kd_ratio": 0.5, "learning_rate": 1.5084230734800754e-05, "loss": 0.9806714057922363, "loss/kd": 1.5949259996414185, "loss/lm": 0.3664167523384094, "step": 1707 }, { "epoch": 0.35064668445904335, "grad_norm": 1.7933651007851008, "kd_ratio": 0.5, "learning_rate": 1.5078503004832599e-05, "loss": 1.1711370944976807, "loss/kd": 1.9904942512512207, "loss/lm": 0.35177987813949585, "step": 1708 }, { "epoch": 0.3508519811127079, "grad_norm": 1.0996152183938155, "kd_ratio": 0.5, "learning_rate": 1.5072773028833864e-05, "loss": 0.8931093811988831, "loss/kd": 1.3740532398223877, "loss/lm": 0.4121655225753784, "step": 1709 }, { "epoch": 0.3510572777663724, "grad_norm": 1.6278044961746174, "kd_ratio": 0.5, "learning_rate": 1.50670408093387e-05, "loss": 1.0731627941131592, "loss/kd": 1.7052737474441528, "loss/lm": 0.4410519003868103, "step": 1710 }, { "epoch": 0.35126257442003694, "grad_norm": 1.5843019111242898, "kd_ratio": 0.5, "learning_rate": 1.5061306348882252e-05, "loss": 1.0785009860992432, "loss/kd": 1.7862361669540405, "loss/lm": 0.37076571583747864, "step": 1711 }, { "epoch": 0.3514678710737015, "grad_norm": 1.0684620027568865, "kd_ratio": 0.5, "learning_rate": 1.5055569650000655e-05, "loss": 1.2643415927886963, "loss/kd": 2.1412670612335205, "loss/lm": 0.3874160945415497, "step": 1712 }, { "epoch": 0.35167316772736607, "grad_norm": 1.0395053459145631, "kd_ratio": 0.5, "learning_rate": 1.5049830715231038e-05, "loss": 0.9290595054626465, "loss/kd": 1.4857443571090698, "loss/lm": 0.37237465381622314, "step": 1713 }, { "epoch": 0.3518784643810306, "grad_norm": 1.6256770667541471, "kd_ratio": 0.5, "learning_rate": 1.5044089547111508e-05, "loss": 0.9379884004592896, "loss/kd": 1.4808859825134277, "loss/lm": 0.39509084820747375, "step": 1714 }, { "epoch": 0.35208376103469513, "grad_norm": 1.4300317621989738, "kd_ratio": 0.5, "learning_rate": 1.5038346148181178e-05, "loss": 1.089898943901062, "loss/kd": 1.7660670280456543, "loss/lm": 0.4137308895587921, "step": 1715 }, { "epoch": 0.35228905768835966, "grad_norm": 1.3533641781598074, "kd_ratio": 0.5, "learning_rate": 1.5032600520980127e-05, "loss": 1.1826684474945068, "loss/kd": 1.9935282468795776, "loss/lm": 0.3718087077140808, "step": 1716 }, { "epoch": 0.35249435434202425, "grad_norm": 2.203880799567296, "kd_ratio": 0.5, "learning_rate": 1.5026852668049436e-05, "loss": 0.9591441750526428, "loss/kd": 1.521726131439209, "loss/lm": 0.39656224846839905, "step": 1717 }, { "epoch": 0.3526996509956888, "grad_norm": 1.059897314324134, "kd_ratio": 0.5, "learning_rate": 1.502110259193116e-05, "loss": 1.1949207782745361, "loss/kd": 2.026735305786133, "loss/lm": 0.36310622096061707, "step": 1718 }, { "epoch": 0.3529049476493533, "grad_norm": 1.9561900765666584, "kd_ratio": 0.5, "learning_rate": 1.5015350295168344e-05, "loss": 0.9495060443878174, "loss/kd": 1.474303960800171, "loss/lm": 0.42470818758010864, "step": 1719 }, { "epoch": 0.35311024430301785, "grad_norm": 1.1370739578405964, "kd_ratio": 0.5, "learning_rate": 1.5009595780305014e-05, "loss": 1.1801600456237793, "loss/kd": 1.9162293672561646, "loss/lm": 0.4440908133983612, "step": 1720 }, { "epoch": 0.3533155409566824, "grad_norm": 1.5918752855704117, "kd_ratio": 0.5, "learning_rate": 1.5003839049886168e-05, "loss": 1.0913790464401245, "loss/kd": 1.8183847665786743, "loss/lm": 0.3643732964992523, "step": 1721 }, { "epoch": 0.35352083761034697, "grad_norm": 1.3986364176037092, "kd_ratio": 0.5, "learning_rate": 1.4998080106457798e-05, "loss": 1.0576508045196533, "loss/kd": 1.7656440734863281, "loss/lm": 0.3496575355529785, "step": 1722 }, { "epoch": 0.3537261342640115, "grad_norm": 1.2248850916878053, "kd_ratio": 0.5, "learning_rate": 1.4992318952566862e-05, "loss": 0.9958852529525757, "loss/kd": 1.527197241783142, "loss/lm": 0.4645732641220093, "step": 1723 }, { "epoch": 0.35393143091767604, "grad_norm": 1.2413449404100916, "kd_ratio": 0.5, "learning_rate": 1.4986555590761306e-05, "loss": 1.0311121940612793, "loss/kd": 1.6386868953704834, "loss/lm": 0.42353755235671997, "step": 1724 }, { "epoch": 0.35413672757134057, "grad_norm": 1.6012711842989107, "kd_ratio": 0.5, "learning_rate": 1.4980790023590049e-05, "loss": 1.0268954038619995, "loss/kd": 1.6339657306671143, "loss/lm": 0.41982513666152954, "step": 1725 }, { "epoch": 0.35434202422500516, "grad_norm": 1.157751833109761, "kd_ratio": 0.5, "learning_rate": 1.4975022253602977e-05, "loss": 1.038020133972168, "loss/kd": 1.7166422605514526, "loss/lm": 0.35939791798591614, "step": 1726 }, { "epoch": 0.3545473208786697, "grad_norm": 1.1477991329871984, "kd_ratio": 0.5, "learning_rate": 1.4969252283350964e-05, "loss": 0.9847185611724854, "loss/kd": 1.6486324071884155, "loss/lm": 0.3208047151565552, "step": 1727 }, { "epoch": 0.3547526175323342, "grad_norm": 1.2132173273039104, "kd_ratio": 0.5, "learning_rate": 1.4963480115385847e-05, "loss": 0.9143335223197937, "loss/kd": 1.4229978322982788, "loss/lm": 0.4056691825389862, "step": 1728 }, { "epoch": 0.35495791418599876, "grad_norm": 0.9814170848046142, "kd_ratio": 0.5, "learning_rate": 1.4957705752260441e-05, "loss": 1.4259241819381714, "loss/kd": 2.433932065963745, "loss/lm": 0.41791632771492004, "step": 1729 }, { "epoch": 0.3551632108396633, "grad_norm": 1.079894746693457, "kd_ratio": 0.5, "learning_rate": 1.4951929196528524e-05, "loss": 1.123094081878662, "loss/kd": 1.802535891532898, "loss/lm": 0.4436522126197815, "step": 1730 }, { "epoch": 0.3553685074933279, "grad_norm": 1.040988810420285, "kd_ratio": 0.5, "learning_rate": 1.4946150450744859e-05, "loss": 0.9353870153427124, "loss/kd": 1.5238618850708008, "loss/lm": 0.3469122052192688, "step": 1731 }, { "epoch": 0.3555738041469924, "grad_norm": 1.1478832725962838, "kd_ratio": 0.5, "learning_rate": 1.494036951746516e-05, "loss": 1.0349628925323486, "loss/kd": 1.6980918645858765, "loss/lm": 0.37183383107185364, "step": 1732 }, { "epoch": 0.35577910080065694, "grad_norm": 1.096032157108799, "kd_ratio": 0.5, "learning_rate": 1.4934586399246116e-05, "loss": 1.1268726587295532, "loss/kd": 1.909988284111023, "loss/lm": 0.34375712275505066, "step": 1733 }, { "epoch": 0.3559843974543215, "grad_norm": 1.0588797950155988, "kd_ratio": 0.5, "learning_rate": 1.4928801098645385e-05, "loss": 0.9495048522949219, "loss/kd": 1.5152934789657593, "loss/lm": 0.3837162256240845, "step": 1734 }, { "epoch": 0.35618969410798607, "grad_norm": 0.9469210159395588, "kd_ratio": 0.5, "learning_rate": 1.4923013618221584e-05, "loss": 1.2390375137329102, "loss/kd": 2.0967254638671875, "loss/lm": 0.38134947419166565, "step": 1735 }, { "epoch": 0.3563949907616506, "grad_norm": 1.0650286928654622, "kd_ratio": 0.5, "learning_rate": 1.4917223960534303e-05, "loss": 1.1566543579101562, "loss/kd": 1.8865199089050293, "loss/lm": 0.42678868770599365, "step": 1736 }, { "epoch": 0.35660028741531513, "grad_norm": 1.0772332388463723, "kd_ratio": 0.5, "learning_rate": 1.4911432128144088e-05, "loss": 1.2174102067947388, "loss/kd": 2.0585153102874756, "loss/lm": 0.37630507349967957, "step": 1737 }, { "epoch": 0.35680558406897966, "grad_norm": 1.0020394829913712, "kd_ratio": 0.5, "learning_rate": 1.4905638123612443e-05, "loss": 1.1093862056732178, "loss/kd": 1.8433423042297363, "loss/lm": 0.375430166721344, "step": 1738 }, { "epoch": 0.3570108807226442, "grad_norm": 1.0564970476933664, "kd_ratio": 0.5, "learning_rate": 1.4899841949501845e-05, "loss": 1.2327842712402344, "loss/kd": 2.043382167816162, "loss/lm": 0.4221864342689514, "step": 1739 }, { "epoch": 0.3572161773763088, "grad_norm": 0.9096443108593041, "kd_ratio": 0.5, "learning_rate": 1.4894043608375718e-05, "loss": 1.0942351818084717, "loss/kd": 1.7893764972686768, "loss/lm": 0.399093896150589, "step": 1740 }, { "epoch": 0.3574214740299733, "grad_norm": 1.245665325108332, "kd_ratio": 0.5, "learning_rate": 1.4888243102798455e-05, "loss": 1.0938209295272827, "loss/kd": 1.826830267906189, "loss/lm": 0.3608115613460541, "step": 1741 }, { "epoch": 0.35762677068363785, "grad_norm": 0.921790951630105, "kd_ratio": 0.5, "learning_rate": 1.4882440435335397e-05, "loss": 1.0530343055725098, "loss/kd": 1.7316495180130005, "loss/lm": 0.37441903352737427, "step": 1742 }, { "epoch": 0.3578320673373024, "grad_norm": 1.4617009116945363, "kd_ratio": 0.5, "learning_rate": 1.4876635608552845e-05, "loss": 1.156141757965088, "loss/kd": 1.8922351598739624, "loss/lm": 0.42004823684692383, "step": 1743 }, { "epoch": 0.358037363990967, "grad_norm": 1.6384214327541013, "kd_ratio": 0.5, "learning_rate": 1.4870828625018061e-05, "loss": 1.1270027160644531, "loss/kd": 1.7909040451049805, "loss/lm": 0.4631013870239258, "step": 1744 }, { "epoch": 0.3582426606446315, "grad_norm": 1.5789629403965866, "kd_ratio": 0.5, "learning_rate": 1.4865019487299247e-05, "loss": 1.2518441677093506, "loss/kd": 1.9846078157424927, "loss/lm": 0.5190804600715637, "step": 1745 }, { "epoch": 0.35844795729829604, "grad_norm": 1.2913729832684053, "kd_ratio": 0.5, "learning_rate": 1.485920819796557e-05, "loss": 0.9423242807388306, "loss/kd": 1.5600336790084839, "loss/lm": 0.324614942073822, "step": 1746 }, { "epoch": 0.35865325395196057, "grad_norm": 1.2494787819690958, "kd_ratio": 0.5, "learning_rate": 1.4853394759587146e-05, "loss": 0.9520132541656494, "loss/kd": 1.6089444160461426, "loss/lm": 0.29508206248283386, "step": 1747 }, { "epoch": 0.3588585506056251, "grad_norm": 1.8408730025519648, "kd_ratio": 0.5, "learning_rate": 1.4847579174735036e-05, "loss": 1.014596700668335, "loss/kd": 1.7053722143173218, "loss/lm": 0.32382115721702576, "step": 1748 }, { "epoch": 0.3590638472592897, "grad_norm": 1.9722998132908738, "kd_ratio": 0.5, "learning_rate": 1.4841761445981255e-05, "loss": 1.1151171922683716, "loss/kd": 1.8674520254135132, "loss/lm": 0.3627822697162628, "step": 1749 }, { "epoch": 0.3592691439129542, "grad_norm": 1.1446413114341252, "kd_ratio": 0.5, "learning_rate": 1.4835941575898768e-05, "loss": 1.8935080766677856, "loss/kd": 3.4000473022460938, "loss/lm": 0.38696885108947754, "step": 1750 }, { "epoch": 0.35947444056661876, "grad_norm": 1.7760925236711467, "kd_ratio": 0.5, "learning_rate": 1.4830119567061484e-05, "loss": 1.2097458839416504, "loss/kd": 2.04921293258667, "loss/lm": 0.37027883529663086, "step": 1751 }, { "epoch": 0.3596797372202833, "grad_norm": 2.5684312652973973, "kd_ratio": 0.5, "learning_rate": 1.4824295422044257e-05, "loss": 1.2845226526260376, "loss/kd": 2.137277841567993, "loss/lm": 0.4317674934864044, "step": 1752 }, { "epoch": 0.3598850338739479, "grad_norm": 2.5232619537163425, "kd_ratio": 0.5, "learning_rate": 1.4818469143422882e-05, "loss": 1.4198683500289917, "loss/kd": 2.3673903942108154, "loss/lm": 0.47234633564949036, "step": 1753 }, { "epoch": 0.3600903305276124, "grad_norm": 1.7374546337964492, "kd_ratio": 0.5, "learning_rate": 1.4812640733774112e-05, "loss": 1.4126451015472412, "loss/kd": 2.4633517265319824, "loss/lm": 0.3619384765625, "step": 1754 }, { "epoch": 0.36029562718127695, "grad_norm": 0.9876755770528418, "kd_ratio": 0.5, "learning_rate": 1.4806810195675627e-05, "loss": 1.0128974914550781, "loss/kd": 1.6678059101104736, "loss/lm": 0.35798898339271545, "step": 1755 }, { "epoch": 0.3605009238349415, "grad_norm": 1.5978005100460146, "kd_ratio": 0.5, "learning_rate": 1.4800977531706054e-05, "loss": 1.2641950845718384, "loss/kd": 2.1018099784851074, "loss/lm": 0.42658019065856934, "step": 1756 }, { "epoch": 0.360706220488606, "grad_norm": 1.5634612656204583, "kd_ratio": 0.5, "learning_rate": 1.4795142744444965e-05, "loss": 0.9568713307380676, "loss/kd": 1.5802229642868042, "loss/lm": 0.33351969718933105, "step": 1757 }, { "epoch": 0.3609115171422706, "grad_norm": 1.114634689224671, "kd_ratio": 0.5, "learning_rate": 1.4789305836472865e-05, "loss": 0.8398027420043945, "loss/kd": 1.3184726238250732, "loss/lm": 0.36113280057907104, "step": 1758 }, { "epoch": 0.36111681379593513, "grad_norm": 1.1402240233041014, "kd_ratio": 0.5, "learning_rate": 1.4783466810371195e-05, "loss": 0.8802304863929749, "loss/kd": 1.4389082193374634, "loss/lm": 0.32155275344848633, "step": 1759 }, { "epoch": 0.36132211044959966, "grad_norm": 1.5056819451554155, "kd_ratio": 0.5, "learning_rate": 1.477762566872234e-05, "loss": 1.0761945247650146, "loss/kd": 1.7750625610351562, "loss/lm": 0.3773265779018402, "step": 1760 }, { "epoch": 0.3615274071032642, "grad_norm": 1.3256166780217211, "kd_ratio": 0.5, "learning_rate": 1.4771782414109614e-05, "loss": 1.232060194015503, "loss/kd": 2.0741636753082275, "loss/lm": 0.38995683193206787, "step": 1761 }, { "epoch": 0.3617327037569288, "grad_norm": 0.9782208009462523, "kd_ratio": 0.5, "learning_rate": 1.4765937049117272e-05, "loss": 1.2012112140655518, "loss/kd": 2.0394175052642822, "loss/lm": 0.36300504207611084, "step": 1762 }, { "epoch": 0.3619380004105933, "grad_norm": 1.01131745544058, "kd_ratio": 0.5, "learning_rate": 1.4760089576330493e-05, "loss": 1.0624037981033325, "loss/kd": 1.7827839851379395, "loss/lm": 0.34202370047569275, "step": 1763 }, { "epoch": 0.36214329706425785, "grad_norm": 1.0806432711730896, "kd_ratio": 0.5, "learning_rate": 1.47542399983354e-05, "loss": 1.2364071607589722, "loss/kd": 2.0984885692596436, "loss/lm": 0.3743256628513336, "step": 1764 }, { "epoch": 0.3623485937179224, "grad_norm": 1.398423211019225, "kd_ratio": 0.5, "learning_rate": 1.474838831771904e-05, "loss": 1.0146594047546387, "loss/kd": 1.6880569458007812, "loss/lm": 0.3412618041038513, "step": 1765 }, { "epoch": 0.3625538903715869, "grad_norm": 1.2824718102804915, "kd_ratio": 0.5, "learning_rate": 1.474253453706939e-05, "loss": 1.8637515306472778, "loss/kd": 3.4698033332824707, "loss/lm": 0.25769975781440735, "step": 1766 }, { "epoch": 0.3627591870252515, "grad_norm": 1.396621106647542, "kd_ratio": 0.5, "learning_rate": 1.4736678658975357e-05, "loss": 1.0426126718521118, "loss/kd": 1.6894780397415161, "loss/lm": 0.3957473039627075, "step": 1767 }, { "epoch": 0.36296448367891604, "grad_norm": 1.2158488105042502, "kd_ratio": 0.5, "learning_rate": 1.4730820686026773e-05, "loss": 1.0614402294158936, "loss/kd": 1.7396903038024902, "loss/lm": 0.38319021463394165, "step": 1768 }, { "epoch": 0.36316978033258057, "grad_norm": 1.6418983754128498, "kd_ratio": 0.5, "learning_rate": 1.4724960620814402e-05, "loss": 1.03449285030365, "loss/kd": 1.7207988500595093, "loss/lm": 0.34818679094314575, "step": 1769 }, { "epoch": 0.3633750769862451, "grad_norm": 1.255751593908813, "kd_ratio": 0.5, "learning_rate": 1.4719098465929926e-05, "loss": 1.043495535850525, "loss/kd": 1.7354910373687744, "loss/lm": 0.3514999449253082, "step": 1770 }, { "epoch": 0.3635803736399097, "grad_norm": 1.5824563039597166, "kd_ratio": 0.5, "learning_rate": 1.471323422396596e-05, "loss": 1.0814064741134644, "loss/kd": 1.693987488746643, "loss/lm": 0.4688253700733185, "step": 1771 }, { "epoch": 0.3637856702935742, "grad_norm": 1.531129318724003, "kd_ratio": 0.5, "learning_rate": 1.4707367897516034e-05, "loss": 1.2454566955566406, "loss/kd": 2.141390085220337, "loss/lm": 0.3495233654975891, "step": 1772 }, { "epoch": 0.36399096694723876, "grad_norm": 1.0944598670914962, "kd_ratio": 0.5, "learning_rate": 1.4701499489174604e-05, "loss": 1.1146767139434814, "loss/kd": 1.796128273010254, "loss/lm": 0.43322503566741943, "step": 1773 }, { "epoch": 0.3641962636009033, "grad_norm": 1.3101925188054053, "kd_ratio": 0.5, "learning_rate": 1.4695629001537048e-05, "loss": 1.2214479446411133, "loss/kd": 1.9472074508666992, "loss/lm": 0.49568837881088257, "step": 1774 }, { "epoch": 0.3644015602545678, "grad_norm": 1.3081791803909542, "kd_ratio": 0.5, "learning_rate": 1.4689756437199658e-05, "loss": 1.1817669868469238, "loss/kd": 1.9342124462127686, "loss/lm": 0.4293214976787567, "step": 1775 }, { "epoch": 0.3646068569082324, "grad_norm": 1.02656525501586, "kd_ratio": 0.5, "learning_rate": 1.4683881798759653e-05, "loss": 1.0661375522613525, "loss/kd": 1.7036032676696777, "loss/lm": 0.42867183685302734, "step": 1776 }, { "epoch": 0.36481215356189695, "grad_norm": 1.1630701530138494, "kd_ratio": 0.5, "learning_rate": 1.467800508881516e-05, "loss": 0.9842616319656372, "loss/kd": 1.5525617599487305, "loss/lm": 0.41596153378486633, "step": 1777 }, { "epoch": 0.3650174502155615, "grad_norm": 1.3599287856719149, "kd_ratio": 0.5, "learning_rate": 1.4672126309965226e-05, "loss": 1.2470576763153076, "loss/kd": 2.1616640090942383, "loss/lm": 0.3324514627456665, "step": 1778 }, { "epoch": 0.365222746869226, "grad_norm": 1.390913604911537, "kd_ratio": 0.5, "learning_rate": 1.4666245464809818e-05, "loss": 1.1696538925170898, "loss/kd": 1.9369621276855469, "loss/lm": 0.4023456871509552, "step": 1779 }, { "epoch": 0.3654280435228906, "grad_norm": 1.7948532287704841, "kd_ratio": 0.5, "learning_rate": 1.4660362555949808e-05, "loss": 1.057949423789978, "loss/kd": 1.7309712171554565, "loss/lm": 0.38492757081985474, "step": 1780 }, { "epoch": 0.36563334017655513, "grad_norm": 2.272491072162824, "kd_ratio": 0.5, "learning_rate": 1.4654477585986983e-05, "loss": 1.388066053390503, "loss/kd": 2.36999249458313, "loss/lm": 0.4061395525932312, "step": 1781 }, { "epoch": 0.36583863683021967, "grad_norm": 1.9870438790702072, "kd_ratio": 0.5, "learning_rate": 1.4648590557524052e-05, "loss": 1.1887154579162598, "loss/kd": 1.9842274188995361, "loss/lm": 0.39320361614227295, "step": 1782 }, { "epoch": 0.3660439334838842, "grad_norm": 1.2178526728459351, "kd_ratio": 0.5, "learning_rate": 1.4642701473164618e-05, "loss": 1.164579153060913, "loss/kd": 1.8996024131774902, "loss/lm": 0.42955586314201355, "step": 1783 }, { "epoch": 0.36624923013754873, "grad_norm": 1.743661597351738, "kd_ratio": 0.5, "learning_rate": 1.4636810335513207e-05, "loss": 1.2481741905212402, "loss/kd": 2.0703060626983643, "loss/lm": 0.4260422885417938, "step": 1784 }, { "epoch": 0.3664545267912133, "grad_norm": 1.8609433694814173, "kd_ratio": 0.5, "learning_rate": 1.4630917147175241e-05, "loss": 0.9663325548171997, "loss/kd": 1.5043971538543701, "loss/lm": 0.4282678961753845, "step": 1785 }, { "epoch": 0.36665982344487785, "grad_norm": 1.5211891739537677, "kd_ratio": 0.5, "learning_rate": 1.4625021910757061e-05, "loss": 1.240412712097168, "loss/kd": 2.0615129470825195, "loss/lm": 0.41931259632110596, "step": 1786 }, { "epoch": 0.3668651200985424, "grad_norm": 1.100090316777514, "kd_ratio": 0.5, "learning_rate": 1.4619124628865904e-05, "loss": 1.0087330341339111, "loss/kd": 1.664845585823059, "loss/lm": 0.352620393037796, "step": 1787 }, { "epoch": 0.3670704167522069, "grad_norm": 1.4211932947629253, "kd_ratio": 0.5, "learning_rate": 1.4613225304109917e-05, "loss": 1.1669572591781616, "loss/kd": 1.863581657409668, "loss/lm": 0.4703327715396881, "step": 1788 }, { "epoch": 0.3672757134058715, "grad_norm": 1.5791221326601246, "kd_ratio": 0.5, "learning_rate": 1.4607323939098152e-05, "loss": 1.0420589447021484, "loss/kd": 1.698364496231079, "loss/lm": 0.3857535123825073, "step": 1789 }, { "epoch": 0.36748101005953604, "grad_norm": 1.404666174663106, "kd_ratio": 0.5, "learning_rate": 1.4601420536440553e-05, "loss": 1.144176721572876, "loss/kd": 1.9201180934906006, "loss/lm": 0.36823534965515137, "step": 1790 }, { "epoch": 0.3676863067132006, "grad_norm": 1.2365521846479375, "kd_ratio": 0.5, "learning_rate": 1.459551509874798e-05, "loss": 1.1421383619308472, "loss/kd": 1.9145822525024414, "loss/lm": 0.3696945011615753, "step": 1791 }, { "epoch": 0.3678916033668651, "grad_norm": 1.355199183785552, "kd_ratio": 0.5, "learning_rate": 1.4589607628632186e-05, "loss": 1.097785234451294, "loss/kd": 1.8149693012237549, "loss/lm": 0.38060128688812256, "step": 1792 }, { "epoch": 0.36809690002052964, "grad_norm": 2.064875115849127, "kd_ratio": 0.5, "learning_rate": 1.4583698128705815e-05, "loss": 0.9999876022338867, "loss/kd": 1.6487202644348145, "loss/lm": 0.35125499963760376, "step": 1793 }, { "epoch": 0.3683021966741942, "grad_norm": 1.0986990341255405, "kd_ratio": 0.5, "learning_rate": 1.4577786601582423e-05, "loss": 0.997852087020874, "loss/kd": 1.6630141735076904, "loss/lm": 0.33268994092941284, "step": 1794 }, { "epoch": 0.36850749332785876, "grad_norm": 4.200345364839994, "kd_ratio": 0.5, "learning_rate": 1.4571873049876452e-05, "loss": 1.0169880390167236, "loss/kd": 1.5506703853607178, "loss/lm": 0.4833056926727295, "step": 1795 }, { "epoch": 0.3687127899815233, "grad_norm": 2.0078396228766198, "kd_ratio": 0.5, "learning_rate": 1.4565957476203248e-05, "loss": 1.1937170028686523, "loss/kd": 2.0684525966644287, "loss/lm": 0.3189813792705536, "step": 1796 }, { "epoch": 0.3689180866351878, "grad_norm": 1.2104534598931262, "kd_ratio": 0.5, "learning_rate": 1.456003988317904e-05, "loss": 1.0721769332885742, "loss/kd": 1.7941263914108276, "loss/lm": 0.3502274453639984, "step": 1797 }, { "epoch": 0.3691233832888524, "grad_norm": 1.5073460670568886, "kd_ratio": 0.5, "learning_rate": 1.455412027342096e-05, "loss": 0.8611955642700195, "loss/kd": 1.412265419960022, "loss/lm": 0.3101257085800171, "step": 1798 }, { "epoch": 0.36932867994251695, "grad_norm": 1.6101500084256593, "kd_ratio": 0.5, "learning_rate": 1.454819864954703e-05, "loss": 1.2765984535217285, "loss/kd": 2.1818957328796387, "loss/lm": 0.3713012933731079, "step": 1799 }, { "epoch": 0.3695339765961815, "grad_norm": 1.9026146222506264, "kd_ratio": 0.5, "learning_rate": 1.4542275014176156e-05, "loss": 1.9420193433761597, "loss/kd": 3.623354911804199, "loss/lm": 0.2606837749481201, "step": 1800 }, { "epoch": 0.369739273249846, "grad_norm": 1.226465465039142, "kd_ratio": 0.5, "learning_rate": 1.4536349369928142e-05, "loss": 1.2218503952026367, "loss/kd": 1.9921014308929443, "loss/lm": 0.4515992999076843, "step": 1801 }, { "epoch": 0.36994456990351055, "grad_norm": 1.871928337529583, "kd_ratio": 0.5, "learning_rate": 1.453042171942368e-05, "loss": 1.0732169151306152, "loss/kd": 1.745137095451355, "loss/lm": 0.40129685401916504, "step": 1802 }, { "epoch": 0.37014986655717513, "grad_norm": 1.7484937071369662, "kd_ratio": 0.5, "learning_rate": 1.4524492065284344e-05, "loss": 1.3265597820281982, "loss/kd": 2.1597542762756348, "loss/lm": 0.4933653771877289, "step": 1803 }, { "epoch": 0.37035516321083967, "grad_norm": 1.5563476178601798, "kd_ratio": 0.5, "learning_rate": 1.4518560410132593e-05, "loss": 1.283691167831421, "loss/kd": 2.0346007347106934, "loss/lm": 0.5327816605567932, "step": 1804 }, { "epoch": 0.3705604598645042, "grad_norm": 1.325908651689899, "kd_ratio": 0.5, "learning_rate": 1.4512626756591784e-05, "loss": 1.170424461364746, "loss/kd": 1.9250441789627075, "loss/lm": 0.41580477356910706, "step": 1805 }, { "epoch": 0.37076575651816873, "grad_norm": 2.653452820833214, "kd_ratio": 0.5, "learning_rate": 1.4506691107286137e-05, "loss": 0.9839425683021545, "loss/kd": 1.6554827690124512, "loss/lm": 0.3124023675918579, "step": 1806 }, { "epoch": 0.3709710531718333, "grad_norm": 3.3357673639069345, "kd_ratio": 0.5, "learning_rate": 1.4500753464840775e-05, "loss": 1.1651039123535156, "loss/kd": 1.9109523296356201, "loss/lm": 0.4192555248737335, "step": 1807 }, { "epoch": 0.37117634982549785, "grad_norm": 2.0097275092993057, "kd_ratio": 0.5, "learning_rate": 1.4494813831881687e-05, "loss": 1.0930293798446655, "loss/kd": 1.8128888607025146, "loss/lm": 0.37316980957984924, "step": 1808 }, { "epoch": 0.3713816464791624, "grad_norm": 1.1603677340233338, "kd_ratio": 0.5, "learning_rate": 1.4488872211035751e-05, "loss": 1.099825143814087, "loss/kd": 1.8689961433410645, "loss/lm": 0.33065417408943176, "step": 1809 }, { "epoch": 0.3715869431328269, "grad_norm": 1.7368842766975567, "kd_ratio": 0.5, "learning_rate": 1.4482928604930729e-05, "loss": 1.2827644348144531, "loss/kd": 2.1036438941955566, "loss/lm": 0.4618850648403168, "step": 1810 }, { "epoch": 0.3717922397864915, "grad_norm": 2.1740809117669055, "kd_ratio": 0.5, "learning_rate": 1.4476983016195245e-05, "loss": 1.0227984189987183, "loss/kd": 1.627724289894104, "loss/lm": 0.41787248849868774, "step": 1811 }, { "epoch": 0.37199753644015604, "grad_norm": 1.4656457173549984, "kd_ratio": 0.5, "learning_rate": 1.4471035447458812e-05, "loss": 1.0756707191467285, "loss/kd": 1.7763738632202148, "loss/lm": 0.37496769428253174, "step": 1812 }, { "epoch": 0.3722028330938206, "grad_norm": 1.3616610807321081, "kd_ratio": 0.5, "learning_rate": 1.4465085901351819e-05, "loss": 1.1407641172409058, "loss/kd": 1.907309889793396, "loss/lm": 0.3742184340953827, "step": 1813 }, { "epoch": 0.3724081297474851, "grad_norm": 1.8719782660113193, "kd_ratio": 0.5, "learning_rate": 1.445913438050552e-05, "loss": 1.1261588335037231, "loss/kd": 1.821938395500183, "loss/lm": 0.43037930130958557, "step": 1814 }, { "epoch": 0.37261342640114964, "grad_norm": 1.3006017272476493, "kd_ratio": 0.5, "learning_rate": 1.4453180887552052e-05, "loss": 1.0883264541625977, "loss/kd": 1.793461799621582, "loss/lm": 0.3831910490989685, "step": 1815 }, { "epoch": 0.3728187230548142, "grad_norm": 1.247562235992685, "kd_ratio": 0.5, "learning_rate": 1.444722542512442e-05, "loss": 1.2252898216247559, "loss/kd": 2.0567233562469482, "loss/lm": 0.39385637640953064, "step": 1816 }, { "epoch": 0.37302401970847876, "grad_norm": 1.815622637622465, "kd_ratio": 0.5, "learning_rate": 1.4441267995856502e-05, "loss": 1.2281173467636108, "loss/kd": 2.047759771347046, "loss/lm": 0.40847501158714294, "step": 1817 }, { "epoch": 0.3732293163621433, "grad_norm": 1.424826062334779, "kd_ratio": 0.5, "learning_rate": 1.4435308602383043e-05, "loss": 1.3238179683685303, "loss/kd": 2.2784647941589355, "loss/lm": 0.36917123198509216, "step": 1818 }, { "epoch": 0.3734346130158078, "grad_norm": 1.0113637681651606, "kd_ratio": 0.5, "learning_rate": 1.4429347247339656e-05, "loss": 1.0854530334472656, "loss/kd": 1.7958664894104004, "loss/lm": 0.3750396966934204, "step": 1819 }, { "epoch": 0.3736399096694724, "grad_norm": 1.8745229885101997, "kd_ratio": 0.5, "learning_rate": 1.4423383933362832e-05, "loss": 1.0699992179870605, "loss/kd": 1.7952587604522705, "loss/lm": 0.3447396755218506, "step": 1820 }, { "epoch": 0.37384520632313695, "grad_norm": 1.5132858682801706, "kd_ratio": 0.5, "learning_rate": 1.4417418663089908e-05, "loss": 1.039975881576538, "loss/kd": 1.6641491651535034, "loss/lm": 0.41580256819725037, "step": 1821 }, { "epoch": 0.3740505029768015, "grad_norm": 1.042674654585731, "kd_ratio": 0.5, "learning_rate": 1.441145143915911e-05, "loss": 1.0960615873336792, "loss/kd": 1.740270733833313, "loss/lm": 0.45185238122940063, "step": 1822 }, { "epoch": 0.374255799630466, "grad_norm": 1.7039075978155027, "kd_ratio": 0.5, "learning_rate": 1.4405482264209512e-05, "loss": 0.9640483856201172, "loss/kd": 1.475743055343628, "loss/lm": 0.45235368609428406, "step": 1823 }, { "epoch": 0.37446109628413055, "grad_norm": 1.276460680256795, "kd_ratio": 0.5, "learning_rate": 1.439951114088105e-05, "loss": 0.9703794717788696, "loss/kd": 1.4756512641906738, "loss/lm": 0.46510764956474304, "step": 1824 }, { "epoch": 0.37466639293779513, "grad_norm": 1.087298057072029, "kd_ratio": 0.5, "learning_rate": 1.4393538071814534e-05, "loss": 1.0080879926681519, "loss/kd": 1.5383802652359009, "loss/lm": 0.4777957797050476, "step": 1825 }, { "epoch": 0.37487168959145967, "grad_norm": 1.4069892402727249, "kd_ratio": 0.5, "learning_rate": 1.4387563059651628e-05, "loss": 1.0246455669403076, "loss/kd": 1.6935076713562012, "loss/lm": 0.3557835817337036, "step": 1826 }, { "epoch": 0.3750769862451242, "grad_norm": 1.0410125572042117, "kd_ratio": 0.5, "learning_rate": 1.4381586107034849e-05, "loss": 1.022727131843567, "loss/kd": 1.7202479839324951, "loss/lm": 0.32520627975463867, "step": 1827 }, { "epoch": 0.37528228289878873, "grad_norm": 1.2176078963568886, "kd_ratio": 0.5, "learning_rate": 1.437560721660758e-05, "loss": 1.1403310298919678, "loss/kd": 1.8554420471191406, "loss/lm": 0.42521992325782776, "step": 1828 }, { "epoch": 0.3754875795524533, "grad_norm": 1.2953093421026831, "kd_ratio": 0.5, "learning_rate": 1.4369626391014058e-05, "loss": 1.157271385192871, "loss/kd": 1.9761302471160889, "loss/lm": 0.3384125828742981, "step": 1829 }, { "epoch": 0.37569287620611785, "grad_norm": 0.9804060224760264, "kd_ratio": 0.5, "learning_rate": 1.4363643632899383e-05, "loss": 1.1449631452560425, "loss/kd": 1.9619357585906982, "loss/lm": 0.32799050211906433, "step": 1830 }, { "epoch": 0.3758981728597824, "grad_norm": 1.0070070143405896, "kd_ratio": 0.5, "learning_rate": 1.4357658944909496e-05, "loss": 1.009903907775879, "loss/kd": 1.6591377258300781, "loss/lm": 0.3606700897216797, "step": 1831 }, { "epoch": 0.3761034695134469, "grad_norm": 1.1806670565984223, "kd_ratio": 0.5, "learning_rate": 1.4351672329691204e-05, "loss": 1.0772998332977295, "loss/kd": 1.7316617965698242, "loss/lm": 0.4229377806186676, "step": 1832 }, { "epoch": 0.37630876616711145, "grad_norm": 1.3143560919263972, "kd_ratio": 0.5, "learning_rate": 1.434568378989216e-05, "loss": 0.9719973802566528, "loss/kd": 1.6313502788543701, "loss/lm": 0.31264445185661316, "step": 1833 }, { "epoch": 0.37651406282077604, "grad_norm": 1.1433742833837017, "kd_ratio": 0.5, "learning_rate": 1.4339693328160866e-05, "loss": 1.1325103044509888, "loss/kd": 1.9007267951965332, "loss/lm": 0.36429381370544434, "step": 1834 }, { "epoch": 0.3767193594744406, "grad_norm": 1.0347873039894824, "kd_ratio": 0.5, "learning_rate": 1.4333700947146686e-05, "loss": 0.8411880731582642, "loss/kd": 1.3410742282867432, "loss/lm": 0.34130191802978516, "step": 1835 }, { "epoch": 0.3769246561281051, "grad_norm": 1.2146389420815957, "kd_ratio": 0.5, "learning_rate": 1.4327706649499815e-05, "loss": 0.987157940864563, "loss/kd": 1.6333225965499878, "loss/lm": 0.3409932255744934, "step": 1836 }, { "epoch": 0.37712995278176964, "grad_norm": 1.3562869931407961, "kd_ratio": 0.5, "learning_rate": 1.432171043787131e-05, "loss": 1.0407425165176392, "loss/kd": 1.6400750875473022, "loss/lm": 0.4414099454879761, "step": 1837 }, { "epoch": 0.37733524943543423, "grad_norm": 1.683970477153104, "kd_ratio": 0.5, "learning_rate": 1.431571231491307e-05, "loss": 1.0400673151016235, "loss/kd": 1.767077922821045, "loss/lm": 0.31305673718452454, "step": 1838 }, { "epoch": 0.37754054608909876, "grad_norm": 1.2086174525178344, "kd_ratio": 0.5, "learning_rate": 1.4309712283277839e-05, "loss": 1.252238392829895, "loss/kd": 2.1181342601776123, "loss/lm": 0.3863426148891449, "step": 1839 }, { "epoch": 0.3777458427427633, "grad_norm": 1.5782591900274616, "kd_ratio": 0.5, "learning_rate": 1.4303710345619201e-05, "loss": 1.094970941543579, "loss/kd": 1.8112494945526123, "loss/lm": 0.37869250774383545, "step": 1840 }, { "epoch": 0.3779511393964278, "grad_norm": 1.2356717021088321, "kd_ratio": 0.5, "learning_rate": 1.4297706504591593e-05, "loss": 1.1436527967453003, "loss/kd": 1.8815289735794067, "loss/lm": 0.4057765603065491, "step": 1841 }, { "epoch": 0.37815643605009236, "grad_norm": 1.169701161968788, "kd_ratio": 0.5, "learning_rate": 1.4291700762850282e-05, "loss": 1.6579155921936035, "loss/kd": 2.796051263809204, "loss/lm": 0.5197798609733582, "step": 1842 }, { "epoch": 0.37836173270375695, "grad_norm": 1.5209577908992984, "kd_ratio": 0.5, "learning_rate": 1.4285693123051385e-05, "loss": 1.9370667934417725, "loss/kd": 3.5143520832061768, "loss/lm": 0.3597813844680786, "step": 1843 }, { "epoch": 0.3785670293574215, "grad_norm": 2.593885044360303, "kd_ratio": 0.5, "learning_rate": 1.4279683587851853e-05, "loss": 1.0447325706481934, "loss/kd": 1.5804989337921143, "loss/lm": 0.5089660882949829, "step": 1844 }, { "epoch": 0.378772326011086, "grad_norm": 2.051244209197076, "kd_ratio": 0.5, "learning_rate": 1.4273672159909475e-05, "loss": 1.036909818649292, "loss/kd": 1.7018769979476929, "loss/lm": 0.37194252014160156, "step": 1845 }, { "epoch": 0.37897762266475055, "grad_norm": 1.4356922600424764, "kd_ratio": 0.5, "learning_rate": 1.4267658841882883e-05, "loss": 0.8055599927902222, "loss/kd": 1.2099112272262573, "loss/lm": 0.4012087285518646, "step": 1846 }, { "epoch": 0.37918291931841513, "grad_norm": 2.331890567775454, "kd_ratio": 0.5, "learning_rate": 1.4261643636431539e-05, "loss": 0.9587591290473938, "loss/kd": 1.523605227470398, "loss/lm": 0.39391306042671204, "step": 1847 }, { "epoch": 0.37938821597207967, "grad_norm": 1.4204555551788398, "kd_ratio": 0.5, "learning_rate": 1.4255626546215746e-05, "loss": 1.2395365238189697, "loss/kd": 2.082765817642212, "loss/lm": 0.39630722999572754, "step": 1848 }, { "epoch": 0.3795935126257442, "grad_norm": 1.28699409524951, "kd_ratio": 0.5, "learning_rate": 1.424960757389663e-05, "loss": 1.056200385093689, "loss/kd": 1.7372422218322754, "loss/lm": 0.37515854835510254, "step": 1849 }, { "epoch": 0.37979880927940873, "grad_norm": 1.3608421023314101, "kd_ratio": 0.5, "learning_rate": 1.424358672213616e-05, "loss": 1.2943029403686523, "loss/kd": 2.2469022274017334, "loss/lm": 0.3417035639286041, "step": 1850 }, { "epoch": 0.38000410593307327, "grad_norm": 1.8916946444487774, "kd_ratio": 0.5, "learning_rate": 1.4237563993597133e-05, "loss": 1.1370203495025635, "loss/kd": 1.92513108253479, "loss/lm": 0.3489095866680145, "step": 1851 }, { "epoch": 0.38020940258673785, "grad_norm": 1.4417331767676895, "kd_ratio": 0.5, "learning_rate": 1.4231539390943167e-05, "loss": 1.0614652633666992, "loss/kd": 1.763497233390808, "loss/lm": 0.3594333827495575, "step": 1852 }, { "epoch": 0.3804146992404024, "grad_norm": 1.2629520286694216, "kd_ratio": 0.5, "learning_rate": 1.4225512916838726e-05, "loss": 1.1227260828018188, "loss/kd": 1.9661158323287964, "loss/lm": 0.2793363034725189, "step": 1853 }, { "epoch": 0.3806199958940669, "grad_norm": 1.8338633142786265, "kd_ratio": 0.5, "learning_rate": 1.4219484573949088e-05, "loss": 1.086659550666809, "loss/kd": 1.8003737926483154, "loss/lm": 0.37294527888298035, "step": 1854 }, { "epoch": 0.38082529254773145, "grad_norm": 2.3849668591986815, "kd_ratio": 0.5, "learning_rate": 1.4213454364940362e-05, "loss": 1.0764139890670776, "loss/kd": 1.7698203325271606, "loss/lm": 0.38300755620002747, "step": 1855 }, { "epoch": 0.38103058920139604, "grad_norm": 1.2042162270744208, "kd_ratio": 0.5, "learning_rate": 1.4207422292479483e-05, "loss": 1.0669381618499756, "loss/kd": 1.7405107021331787, "loss/lm": 0.39336562156677246, "step": 1856 }, { "epoch": 0.3812358858550606, "grad_norm": 1.5501215947639135, "kd_ratio": 0.5, "learning_rate": 1.4201388359234211e-05, "loss": 1.220255732536316, "loss/kd": 2.0345518589019775, "loss/lm": 0.4059596061706543, "step": 1857 }, { "epoch": 0.3814411825087251, "grad_norm": 1.8293095694371984, "kd_ratio": 0.5, "learning_rate": 1.4195352567873124e-05, "loss": 1.1461131572723389, "loss/kd": 1.9389055967330933, "loss/lm": 0.3533206582069397, "step": 1858 }, { "epoch": 0.38164647916238964, "grad_norm": 1.059906139643859, "kd_ratio": 0.5, "learning_rate": 1.4189314921065629e-05, "loss": 1.0016932487487793, "loss/kd": 1.672677993774414, "loss/lm": 0.33070841431617737, "step": 1859 }, { "epoch": 0.3818517758160542, "grad_norm": 1.1114986880934743, "kd_ratio": 0.5, "learning_rate": 1.4183275421481946e-05, "loss": 0.8631497621536255, "loss/kd": 1.3281718492507935, "loss/lm": 0.3981277048587799, "step": 1860 }, { "epoch": 0.38205707246971876, "grad_norm": 0.9747894723470412, "kd_ratio": 0.5, "learning_rate": 1.4177234071793122e-05, "loss": 0.9191667437553406, "loss/kd": 1.4184911251068115, "loss/lm": 0.41984236240386963, "step": 1861 }, { "epoch": 0.3822623691233833, "grad_norm": 1.2976396908059957, "kd_ratio": 0.5, "learning_rate": 1.4171190874671017e-05, "loss": 1.8367677927017212, "loss/kd": 3.3156282901763916, "loss/lm": 0.35790738463401794, "step": 1862 }, { "epoch": 0.3824676657770478, "grad_norm": 1.0357555719432856, "kd_ratio": 0.5, "learning_rate": 1.4165145832788305e-05, "loss": 1.125678300857544, "loss/kd": 1.8999696969985962, "loss/lm": 0.3513868749141693, "step": 1863 }, { "epoch": 0.38267296243071236, "grad_norm": 1.1395729530774228, "kd_ratio": 0.5, "learning_rate": 1.4159098948818488e-05, "loss": 0.8914031982421875, "loss/kd": 1.4307212829589844, "loss/lm": 0.35208505392074585, "step": 1864 }, { "epoch": 0.38287825908437695, "grad_norm": 1.038614603050854, "kd_ratio": 0.5, "learning_rate": 1.4153050225435869e-05, "loss": 1.024277925491333, "loss/kd": 1.625175952911377, "loss/lm": 0.4233798086643219, "step": 1865 }, { "epoch": 0.3830835557380415, "grad_norm": 1.4362293233096801, "kd_ratio": 0.5, "learning_rate": 1.4146999665315573e-05, "loss": 1.0113471746444702, "loss/kd": 1.6996477842330933, "loss/lm": 0.32304665446281433, "step": 1866 }, { "epoch": 0.383288852391706, "grad_norm": 1.092699071863102, "kd_ratio": 0.5, "learning_rate": 1.4140947271133536e-05, "loss": 0.9395495653152466, "loss/kd": 1.5276728868484497, "loss/lm": 0.35142627358436584, "step": 1867 }, { "epoch": 0.38349414904537055, "grad_norm": 1.7565707974352338, "kd_ratio": 0.5, "learning_rate": 1.41348930455665e-05, "loss": 1.067739486694336, "loss/kd": 1.7061582803726196, "loss/lm": 0.4293207824230194, "step": 1868 }, { "epoch": 0.3836994456990351, "grad_norm": 1.259031984648603, "kd_ratio": 0.5, "learning_rate": 1.4128836991292027e-05, "loss": 1.002244472503662, "loss/kd": 1.632238507270813, "loss/lm": 0.37225034832954407, "step": 1869 }, { "epoch": 0.38390474235269967, "grad_norm": 1.408517621035933, "kd_ratio": 0.5, "learning_rate": 1.4122779110988476e-05, "loss": 1.0102988481521606, "loss/kd": 1.5961970090866089, "loss/lm": 0.4244006872177124, "step": 1870 }, { "epoch": 0.3841100390063642, "grad_norm": 1.243034749985775, "kd_ratio": 0.5, "learning_rate": 1.4116719407335022e-05, "loss": 1.0780812501907349, "loss/kd": 1.7793967723846436, "loss/lm": 0.37676572799682617, "step": 1871 }, { "epoch": 0.38431533566002873, "grad_norm": 1.3583140581055753, "kd_ratio": 0.5, "learning_rate": 1.4110657883011642e-05, "loss": 1.295432209968567, "loss/kd": 2.2161009311676025, "loss/lm": 0.3747633993625641, "step": 1872 }, { "epoch": 0.38452063231369327, "grad_norm": 1.1570614571590847, "kd_ratio": 0.5, "learning_rate": 1.4104594540699122e-05, "loss": 0.9374828338623047, "loss/kd": 1.4900535345077515, "loss/lm": 0.38491207361221313, "step": 1873 }, { "epoch": 0.38472592896735786, "grad_norm": 1.9105107400602646, "kd_ratio": 0.5, "learning_rate": 1.4098529383079048e-05, "loss": 1.1469162702560425, "loss/kd": 1.9071307182312012, "loss/lm": 0.386701762676239, "step": 1874 }, { "epoch": 0.3849312256210224, "grad_norm": 2.11689558295789, "kd_ratio": 0.5, "learning_rate": 1.4092462412833811e-05, "loss": 1.0896241664886475, "loss/kd": 1.7536754608154297, "loss/lm": 0.42557281255722046, "step": 1875 }, { "epoch": 0.3851365222746869, "grad_norm": 1.4631614881185893, "kd_ratio": 0.5, "learning_rate": 1.4086393632646605e-05, "loss": 1.2612941265106201, "loss/kd": 2.0764076709747314, "loss/lm": 0.446180522441864, "step": 1876 }, { "epoch": 0.38534181892835145, "grad_norm": 1.1732527274442417, "kd_ratio": 0.5, "learning_rate": 1.4080323045201423e-05, "loss": 1.1850297451019287, "loss/kd": 1.9887527227401733, "loss/lm": 0.3813067674636841, "step": 1877 }, { "epoch": 0.385547115582016, "grad_norm": 1.7627207835711767, "kd_ratio": 0.5, "learning_rate": 1.4074250653183055e-05, "loss": 0.9781777858734131, "loss/kd": 1.589177131652832, "loss/lm": 0.36717846989631653, "step": 1878 }, { "epoch": 0.3857524122356806, "grad_norm": 2.718751098149332, "kd_ratio": 0.5, "learning_rate": 1.406817645927709e-05, "loss": 1.1399067640304565, "loss/kd": 1.9644811153411865, "loss/lm": 0.31533244252204895, "step": 1879 }, { "epoch": 0.3859577088893451, "grad_norm": 2.6811315424368236, "kd_ratio": 0.5, "learning_rate": 1.4062100466169921e-05, "loss": 1.2667999267578125, "loss/kd": 1.9282313585281372, "loss/lm": 0.605368435382843, "step": 1880 }, { "epoch": 0.38616300554300964, "grad_norm": 1.0397547600381662, "kd_ratio": 0.5, "learning_rate": 1.4056022676548726e-05, "loss": 1.1301196813583374, "loss/kd": 1.852647066116333, "loss/lm": 0.4075922966003418, "step": 1881 }, { "epoch": 0.3863683021966742, "grad_norm": 3.153684597268122, "kd_ratio": 0.5, "learning_rate": 1.404994309310148e-05, "loss": 1.2136688232421875, "loss/kd": 2.077275037765503, "loss/lm": 0.3500625193119049, "step": 1882 }, { "epoch": 0.38657359885033876, "grad_norm": 2.5903884909880834, "kd_ratio": 0.5, "learning_rate": 1.4043861718516964e-05, "loss": 1.2399505376815796, "loss/kd": 2.0662264823913574, "loss/lm": 0.4136746823787689, "step": 1883 }, { "epoch": 0.3867788955040033, "grad_norm": 1.3458342614358882, "kd_ratio": 0.5, "learning_rate": 1.403777855548473e-05, "loss": 1.0696829557418823, "loss/kd": 1.7712230682373047, "loss/lm": 0.36814290285110474, "step": 1884 }, { "epoch": 0.38698419215766783, "grad_norm": 2.451358839820347, "kd_ratio": 0.5, "learning_rate": 1.403169360669514e-05, "loss": 1.046635389328003, "loss/kd": 1.6477656364440918, "loss/lm": 0.4455051124095917, "step": 1885 }, { "epoch": 0.38718948881133236, "grad_norm": 2.6108713557090626, "kd_ratio": 0.5, "learning_rate": 1.4025606874839331e-05, "loss": 0.9533456563949585, "loss/kd": 1.5431935787200928, "loss/lm": 0.3634977340698242, "step": 1886 }, { "epoch": 0.3873947854649969, "grad_norm": 1.326732315827028, "kd_ratio": 0.5, "learning_rate": 1.4019518362609239e-05, "loss": 0.991507887840271, "loss/kd": 1.654710054397583, "loss/lm": 0.3283056616783142, "step": 1887 }, { "epoch": 0.3876000821186615, "grad_norm": 2.1991012352871158, "kd_ratio": 0.5, "learning_rate": 1.4013428072697584e-05, "loss": 1.068442940711975, "loss/kd": 1.6964067220687866, "loss/lm": 0.44047924876213074, "step": 1888 }, { "epoch": 0.387805378772326, "grad_norm": 2.606721645062503, "kd_ratio": 0.5, "learning_rate": 1.4007336007797867e-05, "loss": 1.1899468898773193, "loss/kd": 2.006288528442383, "loss/lm": 0.37360531091690063, "step": 1889 }, { "epoch": 0.38801067542599055, "grad_norm": 1.1465991020491624, "kd_ratio": 0.5, "learning_rate": 1.4001242170604386e-05, "loss": 1.1858274936676025, "loss/kd": 1.9988962411880493, "loss/lm": 0.3727588355541229, "step": 1890 }, { "epoch": 0.3882159720796551, "grad_norm": 1.9377848703351974, "kd_ratio": 0.5, "learning_rate": 1.399514656381221e-05, "loss": 1.25515878200531, "loss/kd": 2.134532928466797, "loss/lm": 0.3757845461368561, "step": 1891 }, { "epoch": 0.38842126873331967, "grad_norm": 2.4381061251572684, "kd_ratio": 0.5, "learning_rate": 1.3989049190117197e-05, "loss": 1.1004726886749268, "loss/kd": 1.778091311454773, "loss/lm": 0.42285415530204773, "step": 1892 }, { "epoch": 0.3886265653869842, "grad_norm": 1.7883468690951951, "kd_ratio": 0.5, "learning_rate": 1.398295005221599e-05, "loss": 0.8943893909454346, "loss/kd": 1.4207743406295776, "loss/lm": 0.3680044710636139, "step": 1893 }, { "epoch": 0.38883186204064873, "grad_norm": 1.1233037049766972, "kd_ratio": 0.5, "learning_rate": 1.3976849152806003e-05, "loss": 1.1875706911087036, "loss/kd": 1.9208711385726929, "loss/lm": 0.4542701840400696, "step": 1894 }, { "epoch": 0.38903715869431327, "grad_norm": 1.8192081132900801, "kd_ratio": 0.5, "learning_rate": 1.3970746494585439e-05, "loss": 1.0316191911697388, "loss/kd": 1.6843363046646118, "loss/lm": 0.37890204787254333, "step": 1895 }, { "epoch": 0.3892424553479778, "grad_norm": 1.0013760798094324, "kd_ratio": 0.5, "learning_rate": 1.3964642080253268e-05, "loss": 1.101158618927002, "loss/kd": 1.7569797039031982, "loss/lm": 0.4453374147415161, "step": 1896 }, { "epoch": 0.3894477520016424, "grad_norm": 1.6553660477210692, "kd_ratio": 0.5, "learning_rate": 1.3958535912509246e-05, "loss": 1.1721491813659668, "loss/kd": 1.9521855115890503, "loss/lm": 0.39211273193359375, "step": 1897 }, { "epoch": 0.3896530486553069, "grad_norm": 1.1884498490714217, "kd_ratio": 0.5, "learning_rate": 1.39524279940539e-05, "loss": 0.9726046919822693, "loss/kd": 1.612316608428955, "loss/lm": 0.3328927755355835, "step": 1898 }, { "epoch": 0.38985834530897145, "grad_norm": 1.6200966306081352, "kd_ratio": 0.5, "learning_rate": 1.3946318327588534e-05, "loss": 1.1102664470672607, "loss/kd": 1.8483442068099976, "loss/lm": 0.37218859791755676, "step": 1899 }, { "epoch": 0.390063641962636, "grad_norm": 2.133959918477195, "kd_ratio": 0.5, "learning_rate": 1.394020691581522e-05, "loss": 1.3412213325500488, "loss/kd": 2.294430732727051, "loss/lm": 0.38801202178001404, "step": 1900 }, { "epoch": 0.3902689386163006, "grad_norm": 1.5696195682136493, "kd_ratio": 0.5, "learning_rate": 1.3934093761436807e-05, "loss": 1.3015459775924683, "loss/kd": 2.2043769359588623, "loss/lm": 0.398715078830719, "step": 1901 }, { "epoch": 0.3904742352699651, "grad_norm": 1.2768187038965957, "kd_ratio": 0.5, "learning_rate": 1.3927978867156913e-05, "loss": 1.1269611120224, "loss/kd": 1.8548020124435425, "loss/lm": 0.3991202712059021, "step": 1902 }, { "epoch": 0.39067953192362964, "grad_norm": 1.2098883333104944, "kd_ratio": 0.5, "learning_rate": 1.3921862235679929e-05, "loss": 0.9952186346054077, "loss/kd": 1.5948610305786133, "loss/lm": 0.39557626843452454, "step": 1903 }, { "epoch": 0.3908848285772942, "grad_norm": 1.0015863260484623, "kd_ratio": 0.5, "learning_rate": 1.3915743869711005e-05, "loss": 1.239179253578186, "loss/kd": 2.0452871322631836, "loss/lm": 0.4330713152885437, "step": 1904 }, { "epoch": 0.39109012523095876, "grad_norm": 1.1560750067583956, "kd_ratio": 0.5, "learning_rate": 1.3909623771956064e-05, "loss": 1.1167125701904297, "loss/kd": 1.908278226852417, "loss/lm": 0.32514703273773193, "step": 1905 }, { "epoch": 0.3912954218846233, "grad_norm": 1.0059593176433992, "kd_ratio": 0.5, "learning_rate": 1.39035019451218e-05, "loss": 1.0874083042144775, "loss/kd": 1.7657322883605957, "loss/lm": 0.409084290266037, "step": 1906 }, { "epoch": 0.39150071853828783, "grad_norm": 1.0199170507833293, "kd_ratio": 0.5, "learning_rate": 1.389737839191566e-05, "loss": 1.0678247213363647, "loss/kd": 1.6969712972640991, "loss/lm": 0.43867823481559753, "step": 1907 }, { "epoch": 0.39170601519195236, "grad_norm": 0.9542936794565466, "kd_ratio": 0.5, "learning_rate": 1.3891253115045867e-05, "loss": 1.2348793745040894, "loss/kd": 2.117448329925537, "loss/lm": 0.352310448884964, "step": 1908 }, { "epoch": 0.3919113118456169, "grad_norm": 0.9876361447856046, "kd_ratio": 0.5, "learning_rate": 1.3885126117221394e-05, "loss": 1.2367568016052246, "loss/kd": 2.0424115657806396, "loss/lm": 0.4311019778251648, "step": 1909 }, { "epoch": 0.3921166084992815, "grad_norm": 1.2238692902339532, "kd_ratio": 0.5, "learning_rate": 1.3878997401151985e-05, "loss": 1.1631426811218262, "loss/kd": 1.949774980545044, "loss/lm": 0.37651026248931885, "step": 1910 }, { "epoch": 0.392321905152946, "grad_norm": 1.2944901900163, "kd_ratio": 0.5, "learning_rate": 1.3872866969548143e-05, "loss": 1.0561096668243408, "loss/kd": 1.7893825769424438, "loss/lm": 0.3228367865085602, "step": 1911 }, { "epoch": 0.39252720180661055, "grad_norm": 0.9779431728339694, "kd_ratio": 0.5, "learning_rate": 1.3866734825121122e-05, "loss": 1.123051643371582, "loss/kd": 1.8805336952209473, "loss/lm": 0.36556947231292725, "step": 1912 }, { "epoch": 0.3927324984602751, "grad_norm": 1.1243762896140677, "kd_ratio": 0.5, "learning_rate": 1.386060097058294e-05, "loss": 0.9190587997436523, "loss/kd": 1.4767206907272339, "loss/lm": 0.3613969385623932, "step": 1913 }, { "epoch": 0.39293779511393967, "grad_norm": 1.3670018462988314, "kd_ratio": 0.5, "learning_rate": 1.3854465408646372e-05, "loss": 0.9652943015098572, "loss/kd": 1.6167247295379639, "loss/lm": 0.3138638436794281, "step": 1914 }, { "epoch": 0.3931430917676042, "grad_norm": 1.4306640620008049, "kd_ratio": 0.5, "learning_rate": 1.384832814202494e-05, "loss": 0.8828359246253967, "loss/kd": 1.4200475215911865, "loss/lm": 0.3456243574619293, "step": 1915 }, { "epoch": 0.39334838842126874, "grad_norm": 1.4421598484012124, "kd_ratio": 0.5, "learning_rate": 1.3842189173432934e-05, "loss": 0.8684791922569275, "loss/kd": 1.3834683895111084, "loss/lm": 0.35349002480506897, "step": 1916 }, { "epoch": 0.39355368507493327, "grad_norm": 1.4242789526238404, "kd_ratio": 0.5, "learning_rate": 1.3836048505585385e-05, "loss": 0.9642009735107422, "loss/kd": 1.5991466045379639, "loss/lm": 0.3292554020881653, "step": 1917 }, { "epoch": 0.3937589817285978, "grad_norm": 1.020950005783249, "kd_ratio": 0.5, "learning_rate": 1.3829906141198076e-05, "loss": 1.2316911220550537, "loss/kd": 2.1169564723968506, "loss/lm": 0.34642577171325684, "step": 1918 }, { "epoch": 0.3939642783822624, "grad_norm": 1.5533945251212653, "kd_ratio": 0.5, "learning_rate": 1.3823762082987544e-05, "loss": 0.9545629024505615, "loss/kd": 1.6378324031829834, "loss/lm": 0.27129337191581726, "step": 1919 }, { "epoch": 0.3941695750359269, "grad_norm": 1.1553834271184162, "kd_ratio": 0.5, "learning_rate": 1.3817616333671077e-05, "loss": 1.1993610858917236, "loss/kd": 1.952455997467041, "loss/lm": 0.4462660849094391, "step": 1920 }, { "epoch": 0.39437487168959146, "grad_norm": 1.7311594352652346, "kd_ratio": 0.5, "learning_rate": 1.381146889596671e-05, "loss": 0.9564181566238403, "loss/kd": 1.5986438989639282, "loss/lm": 0.31419235467910767, "step": 1921 }, { "epoch": 0.394580168343256, "grad_norm": 2.185846911643444, "kd_ratio": 0.5, "learning_rate": 1.3805319772593224e-05, "loss": 1.1016877889633179, "loss/kd": 1.8092862367630005, "loss/lm": 0.39408934116363525, "step": 1922 }, { "epoch": 0.3947854649969206, "grad_norm": 1.3553061859237552, "kd_ratio": 0.5, "learning_rate": 1.3799168966270139e-05, "loss": 1.240138053894043, "loss/kd": 2.0510613918304443, "loss/lm": 0.429214745759964, "step": 1923 }, { "epoch": 0.3949907616505851, "grad_norm": 1.6773566572318004, "kd_ratio": 0.5, "learning_rate": 1.3793016479717727e-05, "loss": 1.2012137174606323, "loss/kd": 2.055158853530884, "loss/lm": 0.347268670797348, "step": 1924 }, { "epoch": 0.39519605830424964, "grad_norm": 1.176411885280365, "kd_ratio": 0.5, "learning_rate": 1.3786862315657002e-05, "loss": 1.2463464736938477, "loss/kd": 2.1258962154388428, "loss/lm": 0.3667966425418854, "step": 1925 }, { "epoch": 0.3954013549579142, "grad_norm": 1.454026974582232, "kd_ratio": 0.5, "learning_rate": 1.3780706476809718e-05, "loss": 1.1708729267120361, "loss/kd": 1.9550663232803345, "loss/lm": 0.38667944073677063, "step": 1926 }, { "epoch": 0.3956066516115787, "grad_norm": 1.2219864314345463, "kd_ratio": 0.5, "learning_rate": 1.3774548965898371e-05, "loss": 1.0059634447097778, "loss/kd": 1.5967742204666138, "loss/lm": 0.41515272855758667, "step": 1927 }, { "epoch": 0.3958119482652433, "grad_norm": 1.6251304678261533, "kd_ratio": 0.5, "learning_rate": 1.3768389785646196e-05, "loss": 1.295702338218689, "loss/kd": 2.1723458766937256, "loss/lm": 0.41905874013900757, "step": 1928 }, { "epoch": 0.39601724491890783, "grad_norm": 1.2907942332772708, "kd_ratio": 0.5, "learning_rate": 1.3762228938777166e-05, "loss": 1.1001478433609009, "loss/kd": 1.8228217363357544, "loss/lm": 0.37747395038604736, "step": 1929 }, { "epoch": 0.39622254157257236, "grad_norm": 1.6764321287822697, "kd_ratio": 0.5, "learning_rate": 1.3756066428015995e-05, "loss": 0.9913004040718079, "loss/kd": 1.697077989578247, "loss/lm": 0.28552278876304626, "step": 1930 }, { "epoch": 0.3964278382262369, "grad_norm": 1.9088666898039761, "kd_ratio": 0.5, "learning_rate": 1.3749902256088125e-05, "loss": 1.11249840259552, "loss/kd": 1.8148726224899292, "loss/lm": 0.410124272108078, "step": 1931 }, { "epoch": 0.3966331348799015, "grad_norm": 2.1631978374125995, "kd_ratio": 0.5, "learning_rate": 1.3743736425719742e-05, "loss": 1.0894560813903809, "loss/kd": 1.807424545288086, "loss/lm": 0.371487557888031, "step": 1932 }, { "epoch": 0.396838431533566, "grad_norm": 2.7138767028337796, "kd_ratio": 0.5, "learning_rate": 1.3737568939637753e-05, "loss": 1.038655161857605, "loss/kd": 1.7082489728927612, "loss/lm": 0.36906129121780396, "step": 1933 }, { "epoch": 0.39704372818723055, "grad_norm": 0.9590624680683807, "kd_ratio": 0.5, "learning_rate": 1.3731399800569811e-05, "loss": 0.8446389436721802, "loss/kd": 1.3296175003051758, "loss/lm": 0.3596603572368622, "step": 1934 }, { "epoch": 0.3972490248408951, "grad_norm": 2.073528792520961, "kd_ratio": 0.5, "learning_rate": 1.3725229011244294e-05, "loss": 1.1506474018096924, "loss/kd": 1.9460718631744385, "loss/lm": 0.3552229702472687, "step": 1935 }, { "epoch": 0.3974543214945596, "grad_norm": 1.9368761820293763, "kd_ratio": 0.5, "learning_rate": 1.3719056574390304e-05, "loss": 1.1778932809829712, "loss/kd": 1.996396780014038, "loss/lm": 0.3593897819519043, "step": 1936 }, { "epoch": 0.3976596181482242, "grad_norm": 1.299219170300339, "kd_ratio": 0.5, "learning_rate": 1.3712882492737681e-05, "loss": 0.9088078141212463, "loss/kd": 1.4510518312454224, "loss/lm": 0.3665637969970703, "step": 1937 }, { "epoch": 0.39786491480188874, "grad_norm": 1.6690825832548035, "kd_ratio": 0.5, "learning_rate": 1.3706706769016991e-05, "loss": 1.0099718570709229, "loss/kd": 1.6600209474563599, "loss/lm": 0.35992273688316345, "step": 1938 }, { "epoch": 0.39807021145555327, "grad_norm": 1.268050627641687, "kd_ratio": 0.5, "learning_rate": 1.3700529405959517e-05, "loss": 0.9603940844535828, "loss/kd": 1.5880802869796753, "loss/lm": 0.33270785212516785, "step": 1939 }, { "epoch": 0.3982755081092178, "grad_norm": 1.283686711243023, "kd_ratio": 0.5, "learning_rate": 1.3694350406297284e-05, "loss": 0.9472630023956299, "loss/kd": 1.5785967111587524, "loss/lm": 0.3159293532371521, "step": 1940 }, { "epoch": 0.3984808047628824, "grad_norm": 1.3487393496136584, "kd_ratio": 0.5, "learning_rate": 1.3688169772763022e-05, "loss": 1.1068729162216187, "loss/kd": 1.8250763416290283, "loss/lm": 0.3886694610118866, "step": 1941 }, { "epoch": 0.3986861014165469, "grad_norm": 1.3847236510082133, "kd_ratio": 0.5, "learning_rate": 1.36819875080902e-05, "loss": 1.3684040307998657, "loss/kd": 2.2754223346710205, "loss/lm": 0.46138569712638855, "step": 1942 }, { "epoch": 0.39889139807021146, "grad_norm": 1.0464667793808566, "kd_ratio": 0.5, "learning_rate": 1.3675803615012993e-05, "loss": 1.2089709043502808, "loss/kd": 2.006887912750244, "loss/lm": 0.4110538959503174, "step": 1943 }, { "epoch": 0.399096694723876, "grad_norm": 1.3629838695574756, "kd_ratio": 0.5, "learning_rate": 1.3669618096266305e-05, "loss": 1.238083004951477, "loss/kd": 2.1510040760040283, "loss/lm": 0.32516199350357056, "step": 1944 }, { "epoch": 0.3993019913775405, "grad_norm": 1.6497470730318569, "kd_ratio": 0.5, "learning_rate": 1.3663430954585768e-05, "loss": 1.275399923324585, "loss/kd": 2.1354727745056152, "loss/lm": 0.4153270423412323, "step": 1945 }, { "epoch": 0.3995072880312051, "grad_norm": 1.0141553874993698, "kd_ratio": 0.5, "learning_rate": 1.3657242192707707e-05, "loss": 1.0770162343978882, "loss/kd": 1.7807921171188354, "loss/lm": 0.3732404410839081, "step": 1946 }, { "epoch": 0.39971258468486964, "grad_norm": 1.584166677172318, "kd_ratio": 0.5, "learning_rate": 1.3651051813369188e-05, "loss": 1.536409854888916, "loss/kd": 2.702469825744629, "loss/lm": 0.3703498840332031, "step": 1947 }, { "epoch": 0.3999178813385342, "grad_norm": 1.6903085026438165, "kd_ratio": 0.5, "learning_rate": 1.364485981930798e-05, "loss": 1.322993516921997, "loss/kd": 2.2632017135620117, "loss/lm": 0.38278523087501526, "step": 1948 }, { "epoch": 0.4001231779921987, "grad_norm": 1.1215203112009489, "kd_ratio": 0.5, "learning_rate": 1.3638666213262568e-05, "loss": 1.2565932273864746, "loss/kd": 2.117554187774658, "loss/lm": 0.3956322968006134, "step": 1949 }, { "epoch": 0.4003284746458633, "grad_norm": 1.399389001090236, "kd_ratio": 0.5, "learning_rate": 1.3632470997972153e-05, "loss": 0.9863914251327515, "loss/kd": 1.5762910842895508, "loss/lm": 0.3964917063713074, "step": 1950 }, { "epoch": 0.40053377129952783, "grad_norm": 1.732660157865558, "kd_ratio": 0.5, "learning_rate": 1.3626274176176645e-05, "loss": 1.2171525955200195, "loss/kd": 2.0412659645080566, "loss/lm": 0.39303913712501526, "step": 1951 }, { "epoch": 0.40073906795319236, "grad_norm": 1.2693880852067638, "kd_ratio": 0.5, "learning_rate": 1.3620075750616661e-05, "loss": 1.1084606647491455, "loss/kd": 1.8533579111099243, "loss/lm": 0.36356329917907715, "step": 1952 }, { "epoch": 0.4009443646068569, "grad_norm": 1.1998153267411111, "kd_ratio": 0.5, "learning_rate": 1.3613875724033536e-05, "loss": 1.0177171230316162, "loss/kd": 1.7367939949035645, "loss/lm": 0.2986403703689575, "step": 1953 }, { "epoch": 0.40114966126052143, "grad_norm": 1.384255801484776, "kd_ratio": 0.5, "learning_rate": 1.3607674099169304e-05, "loss": 0.9984021782875061, "loss/kd": 1.6042258739471436, "loss/lm": 0.39257848262786865, "step": 1954 }, { "epoch": 0.401354957914186, "grad_norm": 1.244801938395557, "kd_ratio": 0.5, "learning_rate": 1.3601470878766714e-05, "loss": 1.0323187112808228, "loss/kd": 1.6795655488967896, "loss/lm": 0.38507184386253357, "step": 1955 }, { "epoch": 0.40156025456785055, "grad_norm": 1.0086107125208423, "kd_ratio": 0.5, "learning_rate": 1.3595266065569212e-05, "loss": 1.0970934629440308, "loss/kd": 1.8192659616470337, "loss/lm": 0.3749210238456726, "step": 1956 }, { "epoch": 0.4017655512215151, "grad_norm": 1.4495492243011103, "kd_ratio": 0.5, "learning_rate": 1.3589059662320958e-05, "loss": 1.0212939977645874, "loss/kd": 1.6395477056503296, "loss/lm": 0.4030403196811676, "step": 1957 }, { "epoch": 0.4019708478751796, "grad_norm": 1.6268315074220763, "kd_ratio": 0.5, "learning_rate": 1.3582851671766808e-05, "loss": 2.119100570678711, "loss/kd": 3.9470105171203613, "loss/lm": 0.29119041562080383, "step": 1958 }, { "epoch": 0.4021761445288442, "grad_norm": 1.3473443833351388, "kd_ratio": 0.5, "learning_rate": 1.3576642096652322e-05, "loss": 0.9809629917144775, "loss/kd": 1.573154091835022, "loss/lm": 0.38877183198928833, "step": 1959 }, { "epoch": 0.40238144118250874, "grad_norm": 1.379132756863021, "kd_ratio": 0.5, "learning_rate": 1.3570430939723763e-05, "loss": 0.9512546062469482, "loss/kd": 1.523328423500061, "loss/lm": 0.37918081879615784, "step": 1960 }, { "epoch": 0.40258673783617327, "grad_norm": 2.3280404374054835, "kd_ratio": 0.5, "learning_rate": 1.356421820372809e-05, "loss": 1.0733377933502197, "loss/kd": 1.7505615949630737, "loss/lm": 0.3961141109466553, "step": 1961 }, { "epoch": 0.4027920344898378, "grad_norm": 1.683470552787156, "kd_ratio": 0.5, "learning_rate": 1.3558003891412964e-05, "loss": 1.1424816846847534, "loss/kd": 1.8929693698883057, "loss/lm": 0.3919939398765564, "step": 1962 }, { "epoch": 0.40299733114350234, "grad_norm": 1.4880946865136198, "kd_ratio": 0.5, "learning_rate": 1.3551788005526738e-05, "loss": 1.042050838470459, "loss/kd": 1.696468710899353, "loss/lm": 0.3876330256462097, "step": 1963 }, { "epoch": 0.4032026277971669, "grad_norm": 2.037204125619181, "kd_ratio": 0.5, "learning_rate": 1.3545570548818467e-05, "loss": 0.934647798538208, "loss/kd": 1.5110949277877808, "loss/lm": 0.3582006096839905, "step": 1964 }, { "epoch": 0.40340792445083146, "grad_norm": 1.3193613417256236, "kd_ratio": 0.5, "learning_rate": 1.3539351524037896e-05, "loss": 1.0682950019836426, "loss/kd": 1.767809510231018, "loss/lm": 0.3687804043292999, "step": 1965 }, { "epoch": 0.403613221104496, "grad_norm": 1.3311934849842944, "kd_ratio": 0.5, "learning_rate": 1.3533130933935468e-05, "loss": 1.107012152671814, "loss/kd": 1.8762325048446655, "loss/lm": 0.33779188990592957, "step": 1966 }, { "epoch": 0.4038185177581605, "grad_norm": 1.188443363884379, "kd_ratio": 0.5, "learning_rate": 1.3526908781262314e-05, "loss": 1.2052432298660278, "loss/kd": 2.062192440032959, "loss/lm": 0.3482939302921295, "step": 1967 }, { "epoch": 0.4040238144118251, "grad_norm": 1.450017254535658, "kd_ratio": 0.5, "learning_rate": 1.352068506877026e-05, "loss": 1.0212293863296509, "loss/kd": 1.6921929121017456, "loss/lm": 0.3502658009529114, "step": 1968 }, { "epoch": 0.40422911106548964, "grad_norm": 1.2182656655821902, "kd_ratio": 0.5, "learning_rate": 1.3514459799211814e-05, "loss": 1.4511744976043701, "loss/kd": 2.481140613555908, "loss/lm": 0.42120832204818726, "step": 1969 }, { "epoch": 0.4044344077191542, "grad_norm": 1.0806782619177515, "kd_ratio": 0.5, "learning_rate": 1.3508232975340183e-05, "loss": 0.9559611082077026, "loss/kd": 1.5355364084243774, "loss/lm": 0.37638577818870544, "step": 1970 }, { "epoch": 0.4046397043728187, "grad_norm": 1.0819383273011152, "kd_ratio": 0.5, "learning_rate": 1.3502004599909255e-05, "loss": 1.2499696016311646, "loss/kd": 2.168957471847534, "loss/lm": 0.3309817910194397, "step": 1971 }, { "epoch": 0.40484500102648324, "grad_norm": 1.1268365164691585, "kd_ratio": 0.5, "learning_rate": 1.3495774675673606e-05, "loss": 0.9832741022109985, "loss/kd": 1.6206146478652954, "loss/lm": 0.34593358635902405, "step": 1972 }, { "epoch": 0.40505029768014783, "grad_norm": 1.3635539850162994, "kd_ratio": 0.5, "learning_rate": 1.3489543205388498e-05, "loss": 1.0810801982879639, "loss/kd": 1.7834644317626953, "loss/lm": 0.3786960542201996, "step": 1973 }, { "epoch": 0.40525559433381236, "grad_norm": 1.4813666462995676, "kd_ratio": 0.5, "learning_rate": 1.3483310191809875e-05, "loss": 0.9872205257415771, "loss/kd": 1.5983704328536987, "loss/lm": 0.3760705888271332, "step": 1974 }, { "epoch": 0.4054608909874769, "grad_norm": 1.3509355224253836, "kd_ratio": 0.5, "learning_rate": 1.3477075637694362e-05, "loss": 1.015455961227417, "loss/kd": 1.695481300354004, "loss/lm": 0.3354306221008301, "step": 1975 }, { "epoch": 0.40566618764114143, "grad_norm": 1.3709031734682446, "kd_ratio": 0.5, "learning_rate": 1.3470839545799271e-05, "loss": 1.053525447845459, "loss/kd": 1.7026094198226929, "loss/lm": 0.40444159507751465, "step": 1976 }, { "epoch": 0.405871484294806, "grad_norm": 0.9436501609400766, "kd_ratio": 0.5, "learning_rate": 1.3464601918882589e-05, "loss": 1.0477818250656128, "loss/kd": 1.7162541151046753, "loss/lm": 0.37930944561958313, "step": 1977 }, { "epoch": 0.40607678094847055, "grad_norm": 1.3537639959304515, "kd_ratio": 0.5, "learning_rate": 1.345836275970298e-05, "loss": 0.9418107867240906, "loss/kd": 1.4995588064193726, "loss/lm": 0.3840627372264862, "step": 1978 }, { "epoch": 0.4062820776021351, "grad_norm": 1.4531258754070397, "kd_ratio": 0.5, "learning_rate": 1.3452122071019797e-05, "loss": 0.8146686553955078, "loss/kd": 1.2318506240844727, "loss/lm": 0.3974866569042206, "step": 1979 }, { "epoch": 0.4064873742557996, "grad_norm": 1.0935406263299885, "kd_ratio": 0.5, "learning_rate": 1.3445879855593054e-05, "loss": 1.1251882314682007, "loss/kd": 1.853095531463623, "loss/lm": 0.39728087186813354, "step": 1980 }, { "epoch": 0.40669267090946415, "grad_norm": 1.4769594198284297, "kd_ratio": 0.5, "learning_rate": 1.343963611618345e-05, "loss": 1.3298192024230957, "loss/kd": 2.229613780975342, "loss/lm": 0.43002450466156006, "step": 1981 }, { "epoch": 0.40689796756312874, "grad_norm": 1.4484197399229874, "kd_ratio": 0.5, "learning_rate": 1.3433390855552357e-05, "loss": 1.0511200428009033, "loss/kd": 1.7374416589736938, "loss/lm": 0.3647983968257904, "step": 1982 }, { "epoch": 0.40710326421679327, "grad_norm": 1.090210578168171, "kd_ratio": 0.5, "learning_rate": 1.3427144076461818e-05, "loss": 1.329717755317688, "loss/kd": 2.245361804962158, "loss/lm": 0.414073646068573, "step": 1983 }, { "epoch": 0.4073085608704578, "grad_norm": 1.992796060326194, "kd_ratio": 0.5, "learning_rate": 1.3420895781674547e-05, "loss": 1.381848931312561, "loss/kd": 2.329385280609131, "loss/lm": 0.4343125522136688, "step": 1984 }, { "epoch": 0.40751385752412234, "grad_norm": 1.3169351857371423, "kd_ratio": 0.5, "learning_rate": 1.3414645973953931e-05, "loss": 2.0425167083740234, "loss/kd": 3.802570343017578, "loss/lm": 0.2824629545211792, "step": 1985 }, { "epoch": 0.4077191541777869, "grad_norm": 1.2068313664356884, "kd_ratio": 0.5, "learning_rate": 1.3408394656064025e-05, "loss": 1.1556980609893799, "loss/kd": 1.9776434898376465, "loss/lm": 0.33375275135040283, "step": 1986 }, { "epoch": 0.40792445083145146, "grad_norm": 1.0537148958953608, "kd_ratio": 0.5, "learning_rate": 1.3402141830769551e-05, "loss": 0.9517239928245544, "loss/kd": 1.53981614112854, "loss/lm": 0.36363184452056885, "step": 1987 }, { "epoch": 0.408129747485116, "grad_norm": 1.2401457348984484, "kd_ratio": 0.5, "learning_rate": 1.3395887500835894e-05, "loss": 1.1892555952072144, "loss/kd": 1.9837496280670166, "loss/lm": 0.39476150274276733, "step": 1988 }, { "epoch": 0.4083350441387805, "grad_norm": 1.0165955347139868, "kd_ratio": 0.5, "learning_rate": 1.3389631669029115e-05, "loss": 0.9917846918106079, "loss/kd": 1.581905722618103, "loss/lm": 0.401663601398468, "step": 1989 }, { "epoch": 0.40854034079244506, "grad_norm": 0.9460707902154638, "kd_ratio": 0.5, "learning_rate": 1.3383374338115929e-05, "loss": 1.3078092336654663, "loss/kd": 2.2320570945739746, "loss/lm": 0.383561372756958, "step": 1990 }, { "epoch": 0.40874563744610964, "grad_norm": 0.973452014312779, "kd_ratio": 0.5, "learning_rate": 1.3377115510863716e-05, "loss": 0.9092784523963928, "loss/kd": 1.399102807044983, "loss/lm": 0.41945409774780273, "step": 1991 }, { "epoch": 0.4089509340997742, "grad_norm": 1.0588978977351524, "kd_ratio": 0.5, "learning_rate": 1.3370855190040523e-05, "loss": 1.072082757949829, "loss/kd": 1.755592942237854, "loss/lm": 0.3885725736618042, "step": 1992 }, { "epoch": 0.4091562307534387, "grad_norm": 1.1810673372890883, "kd_ratio": 0.5, "learning_rate": 1.3364593378415054e-05, "loss": 1.0776140689849854, "loss/kd": 1.7545474767684937, "loss/lm": 0.4006807506084442, "step": 1993 }, { "epoch": 0.40936152740710324, "grad_norm": 1.1028215648363549, "kd_ratio": 0.5, "learning_rate": 1.3358330078756672e-05, "loss": 0.9719545841217041, "loss/kd": 1.592355489730835, "loss/lm": 0.35155367851257324, "step": 1994 }, { "epoch": 0.40956682406076783, "grad_norm": 1.1937281839073302, "kd_ratio": 0.5, "learning_rate": 1.3352065293835399e-05, "loss": 1.244107723236084, "loss/kd": 2.0124969482421875, "loss/lm": 0.47571861743927, "step": 1995 }, { "epoch": 0.40977212071443236, "grad_norm": 1.18807356488102, "kd_ratio": 0.5, "learning_rate": 1.3345799026421911e-05, "loss": 1.0440744161605835, "loss/kd": 1.7332144975662231, "loss/lm": 0.3549342453479767, "step": 1996 }, { "epoch": 0.4099774173680969, "grad_norm": 1.3509360434825652, "kd_ratio": 0.5, "learning_rate": 1.3339531279287544e-05, "loss": 0.9244966506958008, "loss/kd": 1.5120995044708252, "loss/lm": 0.3368937373161316, "step": 1997 }, { "epoch": 0.41018271402176143, "grad_norm": 1.3453231541701136, "kd_ratio": 0.5, "learning_rate": 1.3333262055204284e-05, "loss": 1.1960830688476562, "loss/kd": 2.0053353309631348, "loss/lm": 0.38683074712753296, "step": 1998 }, { "epoch": 0.410388010675426, "grad_norm": 1.7423078615303451, "kd_ratio": 0.5, "learning_rate": 1.3326991356944776e-05, "loss": 1.5111945867538452, "loss/kd": 2.6129190921783447, "loss/lm": 0.40947017073631287, "step": 1999 }, { "epoch": 0.41059330732909055, "grad_norm": 1.3778037715254605, "kd_ratio": 0.5, "learning_rate": 1.3320719187282311e-05, "loss": 1.1873844861984253, "loss/kd": 1.9999701976776123, "loss/lm": 0.3747987151145935, "step": 2000 }, { "epoch": 0.4107986039827551, "grad_norm": 1.557436602860571, "kd_ratio": 0.5, "learning_rate": 1.3314445548990836e-05, "loss": 1.328439712524414, "loss/kd": 2.196864128112793, "loss/lm": 0.46001535654067993, "step": 2001 }, { "epoch": 0.4110039006364196, "grad_norm": 2.2295549828776737, "kd_ratio": 0.5, "learning_rate": 1.3308170444844941e-05, "loss": 1.1758474111557007, "loss/kd": 2.0107579231262207, "loss/lm": 0.3409368693828583, "step": 2002 }, { "epoch": 0.41120919729008415, "grad_norm": 2.023604600443183, "kd_ratio": 0.5, "learning_rate": 1.3301893877619874e-05, "loss": 1.0061438083648682, "loss/kd": 1.6063601970672607, "loss/lm": 0.4059273302555084, "step": 2003 }, { "epoch": 0.41141449394374874, "grad_norm": 1.6205190925693567, "kd_ratio": 0.5, "learning_rate": 1.3295615850091518e-05, "loss": 1.043236255645752, "loss/kd": 1.6641974449157715, "loss/lm": 0.4222750663757324, "step": 2004 }, { "epoch": 0.41161979059741327, "grad_norm": 1.03424678582595, "kd_ratio": 0.5, "learning_rate": 1.3289336365036414e-05, "loss": 0.9855688810348511, "loss/kd": 1.554457426071167, "loss/lm": 0.41668030619621277, "step": 2005 }, { "epoch": 0.4118250872510778, "grad_norm": 1.6657307282042944, "kd_ratio": 0.5, "learning_rate": 1.3283055425231738e-05, "loss": 0.9400963187217712, "loss/kd": 1.5567772388458252, "loss/lm": 0.3234154284000397, "step": 2006 }, { "epoch": 0.41203038390474234, "grad_norm": 1.4747217830128918, "kd_ratio": 0.5, "learning_rate": 1.3276773033455312e-05, "loss": 1.1687650680541992, "loss/kd": 1.9663618803024292, "loss/lm": 0.37116825580596924, "step": 2007 }, { "epoch": 0.4122356805584069, "grad_norm": 1.7077449707150403, "kd_ratio": 0.5, "learning_rate": 1.3270489192485606e-05, "loss": 1.29544997215271, "loss/kd": 2.217047929763794, "loss/lm": 0.373852014541626, "step": 2008 }, { "epoch": 0.41244097721207146, "grad_norm": 1.1529404614623608, "kd_ratio": 0.5, "learning_rate": 1.326420390510172e-05, "loss": 1.132161259651184, "loss/kd": 1.849666714668274, "loss/lm": 0.41465580463409424, "step": 2009 }, { "epoch": 0.412646273865736, "grad_norm": 1.9834430541747612, "kd_ratio": 0.5, "learning_rate": 1.3257917174083403e-05, "loss": 0.912807047367096, "loss/kd": 1.3768131732940674, "loss/lm": 0.4488008916378021, "step": 2010 }, { "epoch": 0.4128515705194005, "grad_norm": 1.7315700855709997, "kd_ratio": 0.5, "learning_rate": 1.3251629002211042e-05, "loss": 0.9634726047515869, "loss/kd": 1.585092306137085, "loss/lm": 0.3418528735637665, "step": 2011 }, { "epoch": 0.41305686717306506, "grad_norm": 1.6410387375772744, "kd_ratio": 0.5, "learning_rate": 1.3245339392265656e-05, "loss": 1.343221664428711, "loss/kd": 2.2531232833862305, "loss/lm": 0.43331992626190186, "step": 2012 }, { "epoch": 0.41326216382672964, "grad_norm": 1.2537956326764739, "kd_ratio": 0.5, "learning_rate": 1.32390483470289e-05, "loss": 1.0238404273986816, "loss/kd": 1.6589624881744385, "loss/lm": 0.38871824741363525, "step": 2013 }, { "epoch": 0.4134674604803942, "grad_norm": 0.9823317812862126, "kd_ratio": 0.5, "learning_rate": 1.323275586928307e-05, "loss": 2.033574104309082, "loss/kd": 3.7834854125976562, "loss/lm": 0.28366267681121826, "step": 2014 }, { "epoch": 0.4136727571340587, "grad_norm": 1.1384839986103548, "kd_ratio": 0.5, "learning_rate": 1.322646196181109e-05, "loss": 1.123503565788269, "loss/kd": 1.893166422843933, "loss/lm": 0.35384076833724976, "step": 2015 }, { "epoch": 0.41387805378772324, "grad_norm": 1.3792039734803647, "kd_ratio": 0.5, "learning_rate": 1.3220166627396523e-05, "loss": 1.2118899822235107, "loss/kd": 2.0304131507873535, "loss/lm": 0.39336681365966797, "step": 2016 }, { "epoch": 0.41408335044138783, "grad_norm": 1.2299584353464816, "kd_ratio": 0.5, "learning_rate": 1.3213869868823548e-05, "loss": 1.3017656803131104, "loss/kd": 2.1993613243103027, "loss/lm": 0.4041700065135956, "step": 2017 }, { "epoch": 0.41428864709505236, "grad_norm": 1.022749893812818, "kd_ratio": 0.5, "learning_rate": 1.3207571688876994e-05, "loss": 1.2792640924453735, "loss/kd": 2.1388237476348877, "loss/lm": 0.41970449686050415, "step": 2018 }, { "epoch": 0.4144939437487169, "grad_norm": 1.6308632933055496, "kd_ratio": 0.5, "learning_rate": 1.3201272090342303e-05, "loss": 1.2621610164642334, "loss/kd": 2.1437456607818604, "loss/lm": 0.38057637214660645, "step": 2019 }, { "epoch": 0.41469924040238143, "grad_norm": 2.313346839699488, "kd_ratio": 0.5, "learning_rate": 1.3194971076005554e-05, "loss": 1.0784393548965454, "loss/kd": 1.8161569833755493, "loss/lm": 0.3407216966152191, "step": 2020 }, { "epoch": 0.41490453705604596, "grad_norm": 1.885078140041208, "kd_ratio": 0.5, "learning_rate": 1.3188668648653447e-05, "loss": 1.3146857023239136, "loss/kd": 2.1925628185272217, "loss/lm": 0.43680864572525024, "step": 2021 }, { "epoch": 0.41510983370971055, "grad_norm": 1.1282144881906981, "kd_ratio": 0.5, "learning_rate": 1.3182364811073304e-05, "loss": 0.9783961772918701, "loss/kd": 1.6093013286590576, "loss/lm": 0.34749096632003784, "step": 2022 }, { "epoch": 0.4153151303633751, "grad_norm": 2.7319137291502433, "kd_ratio": 0.5, "learning_rate": 1.3176059566053083e-05, "loss": 1.1779826879501343, "loss/kd": 2.0089120864868164, "loss/lm": 0.34705328941345215, "step": 2023 }, { "epoch": 0.4155204270170396, "grad_norm": 2.4687381157528683, "kd_ratio": 0.5, "learning_rate": 1.3169752916381348e-05, "loss": 1.102912187576294, "loss/kd": 1.8783416748046875, "loss/lm": 0.32748278975486755, "step": 2024 }, { "epoch": 0.41572572367070415, "grad_norm": 1.1690991828861856, "kd_ratio": 0.5, "learning_rate": 1.31634448648473e-05, "loss": 1.0494683980941772, "loss/kd": 1.7698689699172974, "loss/lm": 0.3290678560733795, "step": 2025 }, { "epoch": 0.41593102032436874, "grad_norm": 1.7245718042076112, "kd_ratio": 0.5, "learning_rate": 1.3157135414240752e-05, "loss": 1.2129452228546143, "loss/kd": 2.094468593597412, "loss/lm": 0.33142173290252686, "step": 2026 }, { "epoch": 0.41613631697803327, "grad_norm": 1.8753957148622198, "kd_ratio": 0.5, "learning_rate": 1.3150824567352128e-05, "loss": 1.159938097000122, "loss/kd": 1.9104633331298828, "loss/lm": 0.40941280126571655, "step": 2027 }, { "epoch": 0.4163416136316978, "grad_norm": 1.6777488549493096, "kd_ratio": 0.5, "learning_rate": 1.3144512326972485e-05, "loss": 1.220562219619751, "loss/kd": 2.054725408554077, "loss/lm": 0.3863990902900696, "step": 2028 }, { "epoch": 0.41654691028536234, "grad_norm": 1.5062109964859474, "kd_ratio": 0.5, "learning_rate": 1.3138198695893487e-05, "loss": 1.0361719131469727, "loss/kd": 1.739203691482544, "loss/lm": 0.33314016461372375, "step": 2029 }, { "epoch": 0.41675220693902687, "grad_norm": 1.5810907175853541, "kd_ratio": 0.5, "learning_rate": 1.3131883676907415e-05, "loss": 1.0187597274780273, "loss/kd": 1.7219743728637695, "loss/lm": 0.3155449628829956, "step": 2030 }, { "epoch": 0.41695750359269146, "grad_norm": 0.9718171157711157, "kd_ratio": 0.5, "learning_rate": 1.3125567272807167e-05, "loss": 1.0556613206863403, "loss/kd": 1.7215436697006226, "loss/lm": 0.3897789716720581, "step": 2031 }, { "epoch": 0.417162800246356, "grad_norm": 1.1592454720902288, "kd_ratio": 0.5, "learning_rate": 1.3119249486386246e-05, "loss": 0.994284987449646, "loss/kd": 1.617177963256836, "loss/lm": 0.37139201164245605, "step": 2032 }, { "epoch": 0.4173680969000205, "grad_norm": 1.1096287035009915, "kd_ratio": 0.5, "learning_rate": 1.3112930320438774e-05, "loss": 1.0751423835754395, "loss/kd": 1.8062081336975098, "loss/lm": 0.3440766930580139, "step": 2033 }, { "epoch": 0.41757339355368506, "grad_norm": 1.0293810403260815, "kd_ratio": 0.5, "learning_rate": 1.3106609777759483e-05, "loss": 1.3780854940414429, "loss/kd": 2.356045961380005, "loss/lm": 0.40012499690055847, "step": 2034 }, { "epoch": 0.41777869020734965, "grad_norm": 1.072517094302559, "kd_ratio": 0.5, "learning_rate": 1.3100287861143703e-05, "loss": 1.1678296327590942, "loss/kd": 1.915187120437622, "loss/lm": 0.42047205567359924, "step": 2035 }, { "epoch": 0.4179839868610142, "grad_norm": 1.0658686639827866, "kd_ratio": 0.5, "learning_rate": 1.3093964573387385e-05, "loss": 0.8411891460418701, "loss/kd": 1.3223732709884644, "loss/lm": 0.3600049912929535, "step": 2036 }, { "epoch": 0.4181892835146787, "grad_norm": 1.190382835136822, "kd_ratio": 0.5, "learning_rate": 1.3087639917287081e-05, "loss": 1.2970020771026611, "loss/kd": 2.229020595550537, "loss/lm": 0.3649836480617523, "step": 2037 }, { "epoch": 0.41839458016834324, "grad_norm": 1.146087659562635, "kd_ratio": 0.5, "learning_rate": 1.3081313895639945e-05, "loss": 0.9091181755065918, "loss/kd": 1.5308011770248413, "loss/lm": 0.2874351143836975, "step": 2038 }, { "epoch": 0.4185998768220078, "grad_norm": 0.9803324913570766, "kd_ratio": 0.5, "learning_rate": 1.3074986511243741e-05, "loss": 1.0114864110946655, "loss/kd": 1.6599993705749512, "loss/lm": 0.3629733622074127, "step": 2039 }, { "epoch": 0.41880517347567237, "grad_norm": 1.1511989914691974, "kd_ratio": 0.5, "learning_rate": 1.306865776689683e-05, "loss": 1.0245389938354492, "loss/kd": 1.65047025680542, "loss/lm": 0.39860785007476807, "step": 2040 }, { "epoch": 0.4190104701293369, "grad_norm": 1.0887418149109713, "kd_ratio": 0.5, "learning_rate": 1.3062327665398184e-05, "loss": 0.9518993496894836, "loss/kd": 1.5522493124008179, "loss/lm": 0.351549357175827, "step": 2041 }, { "epoch": 0.41921576678300143, "grad_norm": 1.3165638166613332, "kd_ratio": 0.5, "learning_rate": 1.3055996209547361e-05, "loss": 1.2144628763198853, "loss/kd": 1.997302770614624, "loss/lm": 0.43162307143211365, "step": 2042 }, { "epoch": 0.41942106343666596, "grad_norm": 1.1532024247094446, "kd_ratio": 0.5, "learning_rate": 1.3049663402144528e-05, "loss": 1.0112040042877197, "loss/kd": 1.6629165410995483, "loss/lm": 0.3594915270805359, "step": 2043 }, { "epoch": 0.41962636009033055, "grad_norm": 1.202484344536874, "kd_ratio": 0.5, "learning_rate": 1.3043329245990449e-05, "loss": 1.0962438583374023, "loss/kd": 1.8634169101715088, "loss/lm": 0.3290708065032959, "step": 2044 }, { "epoch": 0.4198316567439951, "grad_norm": 1.1428071549724463, "kd_ratio": 0.5, "learning_rate": 1.3036993743886477e-05, "loss": 1.1283572912216187, "loss/kd": 1.7872624397277832, "loss/lm": 0.4694521129131317, "step": 2045 }, { "epoch": 0.4200369533976596, "grad_norm": 1.2294098476593454, "kd_ratio": 0.5, "learning_rate": 1.3030656898634572e-05, "loss": 1.2304143905639648, "loss/kd": 2.0940871238708496, "loss/lm": 0.3667415976524353, "step": 2046 }, { "epoch": 0.42024225005132415, "grad_norm": 1.1515958012503271, "kd_ratio": 0.5, "learning_rate": 1.302431871303728e-05, "loss": 0.9172031879425049, "loss/kd": 1.4803447723388672, "loss/lm": 0.3540616035461426, "step": 2047 }, { "epoch": 0.4204475467049887, "grad_norm": 1.3925945365001344, "kd_ratio": 0.5, "learning_rate": 1.3017979189897738e-05, "loss": 1.064571738243103, "loss/kd": 1.8009005784988403, "loss/lm": 0.3282429575920105, "step": 2048 }, { "epoch": 0.4206528433586533, "grad_norm": 1.2260316221450578, "kd_ratio": 0.5, "learning_rate": 1.3011638332019687e-05, "loss": 1.2073173522949219, "loss/kd": 1.9904260635375977, "loss/lm": 0.4242086410522461, "step": 2049 }, { "epoch": 0.4208581400123178, "grad_norm": 1.0743103980542046, "kd_ratio": 0.5, "learning_rate": 1.300529614220744e-05, "loss": 1.0607296228408813, "loss/kd": 1.6907432079315186, "loss/lm": 0.43071606755256653, "step": 2050 }, { "epoch": 0.42106343666598234, "grad_norm": 1.0657285249304982, "kd_ratio": 0.5, "learning_rate": 1.2998952623265917e-05, "loss": 1.363602638244629, "loss/kd": 2.3225855827331543, "loss/lm": 0.40461963415145874, "step": 2051 }, { "epoch": 0.42126873331964687, "grad_norm": 1.1676327293368616, "kd_ratio": 0.5, "learning_rate": 1.2992607778000612e-05, "loss": 1.0481475591659546, "loss/kd": 1.732688546180725, "loss/lm": 0.36360663175582886, "step": 2052 }, { "epoch": 0.42147402997331146, "grad_norm": 1.2130684642914855, "kd_ratio": 0.5, "learning_rate": 1.2986261609217612e-05, "loss": 1.1991183757781982, "loss/kd": 2.0069680213928223, "loss/lm": 0.39126867055892944, "step": 2053 }, { "epoch": 0.421679326626976, "grad_norm": 1.0845637204895073, "kd_ratio": 0.5, "learning_rate": 1.2979914119723588e-05, "loss": 0.9946930408477783, "loss/kd": 1.6549221277236938, "loss/lm": 0.334463894367218, "step": 2054 }, { "epoch": 0.4218846232806405, "grad_norm": 1.2905750829036577, "kd_ratio": 0.5, "learning_rate": 1.2973565312325798e-05, "loss": 1.1577348709106445, "loss/kd": 1.9212650060653687, "loss/lm": 0.39420464634895325, "step": 2055 }, { "epoch": 0.42208991993430506, "grad_norm": 1.4443945195757404, "kd_ratio": 0.5, "learning_rate": 1.2967215189832079e-05, "loss": 0.9233458042144775, "loss/kd": 1.5112521648406982, "loss/lm": 0.3354395031929016, "step": 2056 }, { "epoch": 0.4222952165879696, "grad_norm": 1.2049360232735906, "kd_ratio": 0.5, "learning_rate": 1.296086375505085e-05, "loss": 1.0622814893722534, "loss/kd": 1.7630195617675781, "loss/lm": 0.3615434765815735, "step": 2057 }, { "epoch": 0.4225005132416342, "grad_norm": 1.115126014741522, "kd_ratio": 0.5, "learning_rate": 1.2954511010791111e-05, "loss": 0.9676231145858765, "loss/kd": 1.5635873079299927, "loss/lm": 0.37165892124176025, "step": 2058 }, { "epoch": 0.4227058098952987, "grad_norm": 1.2595571658300584, "kd_ratio": 0.5, "learning_rate": 1.2948156959862446e-05, "loss": 0.978260338306427, "loss/kd": 1.6255552768707275, "loss/lm": 0.33096542954444885, "step": 2059 }, { "epoch": 0.42291110654896324, "grad_norm": 1.0401902230052276, "kd_ratio": 0.5, "learning_rate": 1.2941801605075009e-05, "loss": 1.1445434093475342, "loss/kd": 1.9370204210281372, "loss/lm": 0.3520664572715759, "step": 2060 }, { "epoch": 0.4231164032026278, "grad_norm": 1.0785093839268989, "kd_ratio": 0.5, "learning_rate": 1.2935444949239531e-05, "loss": 1.148500919342041, "loss/kd": 1.8985360860824585, "loss/lm": 0.39846566319465637, "step": 2061 }, { "epoch": 0.42332169985629237, "grad_norm": 1.308094035279418, "kd_ratio": 0.5, "learning_rate": 1.2929086995167325e-05, "loss": 1.1261341571807861, "loss/kd": 1.8173481225967407, "loss/lm": 0.4349202811717987, "step": 2062 }, { "epoch": 0.4235269965099569, "grad_norm": 1.6078034558109393, "kd_ratio": 0.5, "learning_rate": 1.2922727745670276e-05, "loss": 1.0473954677581787, "loss/kd": 1.754008173942566, "loss/lm": 0.34078264236450195, "step": 2063 }, { "epoch": 0.42373229316362143, "grad_norm": 2.3467597030412257, "kd_ratio": 0.5, "learning_rate": 1.2916367203560839e-05, "loss": 0.9295429587364197, "loss/kd": 1.4824773073196411, "loss/lm": 0.37660858035087585, "step": 2064 }, { "epoch": 0.42393758981728596, "grad_norm": 1.160005969802341, "kd_ratio": 0.5, "learning_rate": 1.2910005371652039e-05, "loss": 1.06219482421875, "loss/kd": 1.7971853017807007, "loss/lm": 0.3272044062614441, "step": 2065 }, { "epoch": 0.4241428864709505, "grad_norm": 1.208644527613537, "kd_ratio": 0.5, "learning_rate": 1.2903642252757479e-05, "loss": 0.9968885779380798, "loss/kd": 1.6193933486938477, "loss/lm": 0.374383807182312, "step": 2066 }, { "epoch": 0.4243481831246151, "grad_norm": 1.6445154867022904, "kd_ratio": 0.5, "learning_rate": 1.2897277849691326e-05, "loss": 1.3547565937042236, "loss/kd": 2.367526054382324, "loss/lm": 0.34198707342147827, "step": 2067 }, { "epoch": 0.4245534797782796, "grad_norm": 1.1332166503631187, "kd_ratio": 0.5, "learning_rate": 1.2890912165268315e-05, "loss": 1.9683923721313477, "loss/kd": 3.6599326133728027, "loss/lm": 0.276852011680603, "step": 2068 }, { "epoch": 0.42475877643194415, "grad_norm": 1.2072450007426732, "kd_ratio": 0.5, "learning_rate": 1.2884545202303746e-05, "loss": 1.0784416198730469, "loss/kd": 1.765232801437378, "loss/lm": 0.39165040850639343, "step": 2069 }, { "epoch": 0.4249640730856087, "grad_norm": 1.6434528264257344, "kd_ratio": 0.5, "learning_rate": 1.2878176963613491e-05, "loss": 1.1102616786956787, "loss/kd": 1.8220664262771606, "loss/lm": 0.3984568119049072, "step": 2070 }, { "epoch": 0.4251693697392733, "grad_norm": 1.3912148701073919, "kd_ratio": 0.5, "learning_rate": 1.2871807452013977e-05, "loss": 1.0935227870941162, "loss/kd": 1.8250709772109985, "loss/lm": 0.3619745969772339, "step": 2071 }, { "epoch": 0.4253746663929378, "grad_norm": 0.9864617685537515, "kd_ratio": 0.5, "learning_rate": 1.2865436670322209e-05, "loss": 1.1877135038375854, "loss/kd": 1.8297584056854248, "loss/lm": 0.5456686615943909, "step": 2072 }, { "epoch": 0.42557996304660234, "grad_norm": 1.9982831964073766, "kd_ratio": 0.5, "learning_rate": 1.2859064621355735e-05, "loss": 1.0002268552780151, "loss/kd": 1.6612204313278198, "loss/lm": 0.33923327922821045, "step": 2073 }, { "epoch": 0.42578525970026687, "grad_norm": 2.2227643381361077, "kd_ratio": 0.5, "learning_rate": 1.2852691307932672e-05, "loss": 1.1213815212249756, "loss/kd": 1.7600685358047485, "loss/lm": 0.48269447684288025, "step": 2074 }, { "epoch": 0.4259905563539314, "grad_norm": 1.3237253394698094, "kd_ratio": 0.5, "learning_rate": 1.28463167328717e-05, "loss": 1.1967480182647705, "loss/kd": 1.9979908466339111, "loss/lm": 0.3955051302909851, "step": 2075 }, { "epoch": 0.426195853007596, "grad_norm": 1.0134365496788713, "kd_ratio": 0.5, "learning_rate": 1.2839940898992049e-05, "loss": 1.0969709157943726, "loss/kd": 1.850041151046753, "loss/lm": 0.3439006805419922, "step": 2076 }, { "epoch": 0.4264011496612605, "grad_norm": 1.391458764474531, "kd_ratio": 0.5, "learning_rate": 1.2833563809113519e-05, "loss": 1.5317742824554443, "loss/kd": 2.592085123062134, "loss/lm": 0.47146332263946533, "step": 2077 }, { "epoch": 0.42660644631492506, "grad_norm": 1.3447387535223019, "kd_ratio": 0.5, "learning_rate": 1.282718546605645e-05, "loss": 0.9935741424560547, "loss/kd": 1.5559664964675903, "loss/lm": 0.43118178844451904, "step": 2078 }, { "epoch": 0.4268117429685896, "grad_norm": 1.2444042267054078, "kd_ratio": 0.5, "learning_rate": 1.2820805872641745e-05, "loss": 1.1373792886734009, "loss/kd": 1.8603062629699707, "loss/lm": 0.41445231437683105, "step": 2079 }, { "epoch": 0.4270170396222542, "grad_norm": 1.161700301713162, "kd_ratio": 0.5, "learning_rate": 1.2814425031690854e-05, "loss": 0.9860473871231079, "loss/kd": 1.5894973278045654, "loss/lm": 0.3825974762439728, "step": 2080 }, { "epoch": 0.4272223362759187, "grad_norm": 1.873504014300367, "kd_ratio": 0.5, "learning_rate": 1.2808042946025788e-05, "loss": 1.106848120689392, "loss/kd": 1.8544337749481201, "loss/lm": 0.3592623770236969, "step": 2081 }, { "epoch": 0.42742763292958325, "grad_norm": 1.972871614319895, "kd_ratio": 0.5, "learning_rate": 1.2801659618469098e-05, "loss": 1.0426567792892456, "loss/kd": 1.7419037818908691, "loss/lm": 0.3434097170829773, "step": 2082 }, { "epoch": 0.4276329295832478, "grad_norm": 0.971007283498822, "kd_ratio": 0.5, "learning_rate": 1.2795275051843893e-05, "loss": 1.0942473411560059, "loss/kd": 1.7935168743133545, "loss/lm": 0.3949778378009796, "step": 2083 }, { "epoch": 0.4278382262369123, "grad_norm": 1.863714158079372, "kd_ratio": 0.5, "learning_rate": 1.2788889248973826e-05, "loss": 0.9594180583953857, "loss/kd": 1.5298629999160767, "loss/lm": 0.38897308707237244, "step": 2084 }, { "epoch": 0.4280435228905769, "grad_norm": 1.149439678157921, "kd_ratio": 0.5, "learning_rate": 1.2782502212683092e-05, "loss": 1.036558985710144, "loss/kd": 1.7042217254638672, "loss/lm": 0.3688963055610657, "step": 2085 }, { "epoch": 0.42824881954424143, "grad_norm": 1.6642018189465064, "kd_ratio": 0.5, "learning_rate": 1.2776113945796447e-05, "loss": 1.4829185009002686, "loss/kd": 2.6342906951904297, "loss/lm": 0.3315463364124298, "step": 2086 }, { "epoch": 0.42845411619790597, "grad_norm": 1.5808095676980929, "kd_ratio": 0.5, "learning_rate": 1.276972445113917e-05, "loss": 0.947279691696167, "loss/kd": 1.5771355628967285, "loss/lm": 0.31742382049560547, "step": 2087 }, { "epoch": 0.4286594128515705, "grad_norm": 1.2799169044849605, "kd_ratio": 0.5, "learning_rate": 1.2763333731537102e-05, "loss": 1.2332147359848022, "loss/kd": 2.105501413345337, "loss/lm": 0.36092814803123474, "step": 2088 }, { "epoch": 0.4288647095052351, "grad_norm": 1.2062921393257988, "kd_ratio": 0.5, "learning_rate": 1.275694178981661e-05, "loss": 1.1840410232543945, "loss/kd": 1.905364751815796, "loss/lm": 0.4627172648906708, "step": 2089 }, { "epoch": 0.4290700061588996, "grad_norm": 1.1167748548460419, "kd_ratio": 0.5, "learning_rate": 1.2750548628804618e-05, "loss": 0.8600082993507385, "loss/kd": 1.3456765413284302, "loss/lm": 0.37434008717536926, "step": 2090 }, { "epoch": 0.42927530281256415, "grad_norm": 1.2740269859200932, "kd_ratio": 0.5, "learning_rate": 1.2744154251328573e-05, "loss": 1.0770026445388794, "loss/kd": 1.7253988981246948, "loss/lm": 0.42860645055770874, "step": 2091 }, { "epoch": 0.4294805994662287, "grad_norm": 1.229455411364298, "kd_ratio": 0.5, "learning_rate": 1.2737758660216468e-05, "loss": 1.062385082244873, "loss/kd": 1.7583637237548828, "loss/lm": 0.3664064109325409, "step": 2092 }, { "epoch": 0.4296858961198932, "grad_norm": 1.417980622500529, "kd_ratio": 0.5, "learning_rate": 1.2731361858296833e-05, "loss": 1.078694462776184, "loss/kd": 1.7901021242141724, "loss/lm": 0.367286741733551, "step": 2093 }, { "epoch": 0.4298911927735578, "grad_norm": 1.1563194822874605, "kd_ratio": 0.5, "learning_rate": 1.2724963848398737e-05, "loss": 0.9533175230026245, "loss/kd": 1.5114883184432983, "loss/lm": 0.3951466679573059, "step": 2094 }, { "epoch": 0.43009648942722234, "grad_norm": 0.9482443476114926, "kd_ratio": 0.5, "learning_rate": 1.2718564633351773e-05, "loss": 1.2405683994293213, "loss/kd": 2.029965400695801, "loss/lm": 0.45117127895355225, "step": 2095 }, { "epoch": 0.43030178608088687, "grad_norm": 1.466164022541625, "kd_ratio": 0.5, "learning_rate": 1.2712164215986076e-05, "loss": 0.8988956212997437, "loss/kd": 1.5050859451293945, "loss/lm": 0.29270532727241516, "step": 2096 }, { "epoch": 0.4305070827345514, "grad_norm": 1.5365769233313913, "kd_ratio": 0.5, "learning_rate": 1.2705762599132308e-05, "loss": 1.2029327154159546, "loss/kd": 2.015357255935669, "loss/lm": 0.390508234500885, "step": 2097 }, { "epoch": 0.430712379388216, "grad_norm": 1.2217706554520933, "kd_ratio": 0.5, "learning_rate": 1.2699359785621663e-05, "loss": 0.9681616425514221, "loss/kd": 1.5755565166473389, "loss/lm": 0.36076679825782776, "step": 2098 }, { "epoch": 0.4309176760418805, "grad_norm": 1.133279528426544, "kd_ratio": 0.5, "learning_rate": 1.2692955778285865e-05, "loss": 1.0372438430786133, "loss/kd": 1.7048031091690063, "loss/lm": 0.3696846663951874, "step": 2099 }, { "epoch": 0.43112297269554506, "grad_norm": 1.4859691967215392, "kd_ratio": 0.5, "learning_rate": 1.2686550579957162e-05, "loss": 0.9655536413192749, "loss/kd": 1.5006704330444336, "loss/lm": 0.4304368793964386, "step": 2100 }, { "epoch": 0.4313282693492096, "grad_norm": 1.5064712209423976, "kd_ratio": 0.5, "learning_rate": 1.2680144193468339e-05, "loss": 1.3247753381729126, "loss/kd": 2.2351038455963135, "loss/lm": 0.4144468307495117, "step": 2101 }, { "epoch": 0.4315335660028742, "grad_norm": 1.1282920807743189, "kd_ratio": 0.5, "learning_rate": 1.267373662165269e-05, "loss": 1.0503828525543213, "loss/kd": 1.7426908016204834, "loss/lm": 0.35807478427886963, "step": 2102 }, { "epoch": 0.4317388626565387, "grad_norm": 1.3900332786310767, "kd_ratio": 0.5, "learning_rate": 1.266732786734405e-05, "loss": 1.0094772577285767, "loss/kd": 1.6337175369262695, "loss/lm": 0.38523703813552856, "step": 2103 }, { "epoch": 0.43194415931020325, "grad_norm": 1.6777475237727573, "kd_ratio": 0.5, "learning_rate": 1.2660917933376768e-05, "loss": 1.202444076538086, "loss/kd": 2.0045225620269775, "loss/lm": 0.4003655016422272, "step": 2104 }, { "epoch": 0.4321494559638678, "grad_norm": 1.0564940577348583, "kd_ratio": 0.5, "learning_rate": 1.2654506822585715e-05, "loss": 1.102285623550415, "loss/kd": 1.8052359819412231, "loss/lm": 0.3993353545665741, "step": 2105 }, { "epoch": 0.4323547526175323, "grad_norm": 1.3342296980983435, "kd_ratio": 0.5, "learning_rate": 1.2648094537806287e-05, "loss": 1.0199403762817383, "loss/kd": 1.6436805725097656, "loss/lm": 0.3962000906467438, "step": 2106 }, { "epoch": 0.4325600492711969, "grad_norm": 1.2377226649010444, "kd_ratio": 0.5, "learning_rate": 1.2641681081874394e-05, "loss": 1.1528916358947754, "loss/kd": 1.9636162519454956, "loss/lm": 0.3421669602394104, "step": 2107 }, { "epoch": 0.43276534592486143, "grad_norm": 1.2775726422458566, "kd_ratio": 0.5, "learning_rate": 1.2635266457626461e-05, "loss": 1.2089729309082031, "loss/kd": 2.020158052444458, "loss/lm": 0.397787868976593, "step": 2108 }, { "epoch": 0.43297064257852597, "grad_norm": 1.22290016220977, "kd_ratio": 0.5, "learning_rate": 1.2628850667899442e-05, "loss": 1.0173462629318237, "loss/kd": 1.6341149806976318, "loss/lm": 0.4005776047706604, "step": 2109 }, { "epoch": 0.4331759392321905, "grad_norm": 1.5013293207302207, "kd_ratio": 0.5, "learning_rate": 1.2622433715530796e-05, "loss": 1.232511281967163, "loss/kd": 2.102505922317505, "loss/lm": 0.36251652240753174, "step": 2110 }, { "epoch": 0.4333812358858551, "grad_norm": 1.4481340152145972, "kd_ratio": 0.5, "learning_rate": 1.2616015603358497e-05, "loss": 0.9433349370956421, "loss/kd": 1.541725516319275, "loss/lm": 0.34494441747665405, "step": 2111 }, { "epoch": 0.4335865325395196, "grad_norm": 1.322131116863534, "kd_ratio": 0.5, "learning_rate": 1.2609596334221042e-05, "loss": 1.0053448677062988, "loss/kd": 1.6403402090072632, "loss/lm": 0.37034961581230164, "step": 2112 }, { "epoch": 0.43379182919318415, "grad_norm": 0.9848423588562896, "kd_ratio": 0.5, "learning_rate": 1.260317591095742e-05, "loss": 1.0068254470825195, "loss/kd": 1.7445240020751953, "loss/lm": 0.2691269814968109, "step": 2113 }, { "epoch": 0.4339971258468487, "grad_norm": 1.2076778164413537, "kd_ratio": 0.5, "learning_rate": 1.2596754336407155e-05, "loss": 1.4322153329849243, "loss/kd": 2.4623360633850098, "loss/lm": 0.40209460258483887, "step": 2114 }, { "epoch": 0.4342024225005132, "grad_norm": 1.382663791281604, "kd_ratio": 0.5, "learning_rate": 1.2590331613410261e-05, "loss": 1.274362325668335, "loss/kd": 2.116265296936035, "loss/lm": 0.43245944380760193, "step": 2115 }, { "epoch": 0.4344077191541778, "grad_norm": 1.413195981362352, "kd_ratio": 0.5, "learning_rate": 1.2583907744807267e-05, "loss": 1.04854416847229, "loss/kd": 1.7106719017028809, "loss/lm": 0.38641637563705444, "step": 2116 }, { "epoch": 0.43461301580784234, "grad_norm": 1.3135812562522784, "kd_ratio": 0.5, "learning_rate": 1.2577482733439209e-05, "loss": 1.2530817985534668, "loss/kd": 2.1292898654937744, "loss/lm": 0.3768736720085144, "step": 2117 }, { "epoch": 0.4348183124615069, "grad_norm": 1.0559278605810354, "kd_ratio": 0.5, "learning_rate": 1.2571056582147625e-05, "loss": 0.9847383499145508, "loss/kd": 1.6263219118118286, "loss/lm": 0.34315481781959534, "step": 2118 }, { "epoch": 0.4350236091151714, "grad_norm": 1.4064390282881956, "kd_ratio": 0.5, "learning_rate": 1.2564629293774561e-05, "loss": 1.9880473613739014, "loss/kd": 3.680229425430298, "loss/lm": 0.29586538672447205, "step": 2119 }, { "epoch": 0.435228905768836, "grad_norm": 1.6110435239553336, "kd_ratio": 0.5, "learning_rate": 1.2558200871162567e-05, "loss": 1.0716379880905151, "loss/kd": 1.710018277168274, "loss/lm": 0.43325766921043396, "step": 2120 }, { "epoch": 0.4354342024225005, "grad_norm": 1.4265220916961445, "kd_ratio": 0.5, "learning_rate": 1.2551771317154687e-05, "loss": 1.1318721771240234, "loss/kd": 1.860924482345581, "loss/lm": 0.40281981229782104, "step": 2121 }, { "epoch": 0.43563949907616506, "grad_norm": 1.2172969907467184, "kd_ratio": 0.5, "learning_rate": 1.2545340634594476e-05, "loss": 1.3248767852783203, "loss/kd": 2.156763792037964, "loss/lm": 0.4929896593093872, "step": 2122 }, { "epoch": 0.4358447957298296, "grad_norm": 2.1208445411029087, "kd_ratio": 0.5, "learning_rate": 1.253890882632598e-05, "loss": 1.0159426927566528, "loss/kd": 1.737317681312561, "loss/lm": 0.29456761479377747, "step": 2123 }, { "epoch": 0.4360500923834941, "grad_norm": 2.458556174553139, "kd_ratio": 0.5, "learning_rate": 1.2532475895193747e-05, "loss": 1.1011713743209839, "loss/kd": 1.8372317552566528, "loss/lm": 0.3651110529899597, "step": 2124 }, { "epoch": 0.4362553890371587, "grad_norm": 1.1065955040024305, "kd_ratio": 0.5, "learning_rate": 1.252604184404282e-05, "loss": 1.0246312618255615, "loss/kd": 1.6930906772613525, "loss/lm": 0.35617172718048096, "step": 2125 }, { "epoch": 0.43646068569082325, "grad_norm": 2.1943915714106823, "kd_ratio": 0.5, "learning_rate": 1.2519606675718734e-05, "loss": 1.0513638257980347, "loss/kd": 1.7502167224884033, "loss/lm": 0.35251089930534363, "step": 2126 }, { "epoch": 0.4366659823444878, "grad_norm": 2.8832381161986014, "kd_ratio": 0.5, "learning_rate": 1.2513170393067527e-05, "loss": 1.2156010866165161, "loss/kd": 2.095808267593384, "loss/lm": 0.33539384603500366, "step": 2127 }, { "epoch": 0.4368712789981523, "grad_norm": 1.7740287769617786, "kd_ratio": 0.5, "learning_rate": 1.2506732998935717e-05, "loss": 1.1376397609710693, "loss/kd": 1.9512304067611694, "loss/lm": 0.32404911518096924, "step": 2128 }, { "epoch": 0.4370765756518169, "grad_norm": 1.0540242485884743, "kd_ratio": 0.5, "learning_rate": 1.2500294496170328e-05, "loss": 1.069462776184082, "loss/kd": 1.7320287227630615, "loss/lm": 0.40689677000045776, "step": 2129 }, { "epoch": 0.43728187230548143, "grad_norm": 2.045665360425285, "kd_ratio": 0.5, "learning_rate": 1.2493854887618863e-05, "loss": 0.9920138120651245, "loss/kd": 1.61289644241333, "loss/lm": 0.37113121151924133, "step": 2130 }, { "epoch": 0.43748716895914597, "grad_norm": 1.5100813539322409, "kd_ratio": 0.5, "learning_rate": 1.2487414176129322e-05, "loss": 0.9425071477890015, "loss/kd": 1.6134990453720093, "loss/lm": 0.27151528000831604, "step": 2131 }, { "epoch": 0.4376924656128105, "grad_norm": 1.24682700333469, "kd_ratio": 0.5, "learning_rate": 1.2480972364550188e-05, "loss": 1.16386878490448, "loss/kd": 1.921887993812561, "loss/lm": 0.4058496654033661, "step": 2132 }, { "epoch": 0.43789776226647503, "grad_norm": 1.705561555467902, "kd_ratio": 0.5, "learning_rate": 1.2474529455730429e-05, "loss": 1.1280263662338257, "loss/kd": 1.9305109977722168, "loss/lm": 0.32554182410240173, "step": 2133 }, { "epoch": 0.4381030589201396, "grad_norm": 1.355750111744708, "kd_ratio": 0.5, "learning_rate": 1.2468085452519503e-05, "loss": 1.3548938035964966, "loss/kd": 2.3088979721069336, "loss/lm": 0.4008895456790924, "step": 2134 }, { "epoch": 0.43830835557380415, "grad_norm": 1.4145973351201033, "kd_ratio": 0.5, "learning_rate": 1.246164035776735e-05, "loss": 1.1453579664230347, "loss/kd": 1.9715187549591064, "loss/lm": 0.3191971480846405, "step": 2135 }, { "epoch": 0.4385136522274687, "grad_norm": 1.99149830722168, "kd_ratio": 0.5, "learning_rate": 1.2455194174324387e-05, "loss": 1.2171225547790527, "loss/kd": 1.9967540502548218, "loss/lm": 0.43749117851257324, "step": 2136 }, { "epoch": 0.4387189488811332, "grad_norm": 1.2212292207143478, "kd_ratio": 0.5, "learning_rate": 1.2448746905041528e-05, "loss": 1.717644214630127, "loss/kd": 3.0391058921813965, "loss/lm": 0.39618244767189026, "step": 2137 }, { "epoch": 0.4389242455347978, "grad_norm": 1.4557413904301317, "kd_ratio": 0.5, "learning_rate": 1.2442298552770151e-05, "loss": 2.005681037902832, "loss/kd": 3.7083444595336914, "loss/lm": 0.30301737785339355, "step": 2138 }, { "epoch": 0.43912954218846234, "grad_norm": 1.851200594678172, "kd_ratio": 0.5, "learning_rate": 1.2435849120362123e-05, "loss": 1.3195042610168457, "loss/kd": 2.1567206382751465, "loss/lm": 0.4822879433631897, "step": 2139 }, { "epoch": 0.4393348388421269, "grad_norm": 1.69302327295321, "kd_ratio": 0.5, "learning_rate": 1.242939861066978e-05, "loss": 0.9426560997962952, "loss/kd": 1.412546157836914, "loss/lm": 0.4727660119533539, "step": 2140 }, { "epoch": 0.4395401354957914, "grad_norm": 1.2348542366345432, "kd_ratio": 0.5, "learning_rate": 1.2422947026545943e-05, "loss": 1.062029242515564, "loss/kd": 1.6531306505203247, "loss/lm": 0.47092780470848083, "step": 2141 }, { "epoch": 0.43974543214945594, "grad_norm": 2.583785938006833, "kd_ratio": 0.5, "learning_rate": 1.2416494370843905e-05, "loss": 0.7683510184288025, "loss/kd": 1.1785547733306885, "loss/lm": 0.3581472635269165, "step": 2142 }, { "epoch": 0.4399507288031205, "grad_norm": 1.484959207101084, "kd_ratio": 0.5, "learning_rate": 1.2410040646417431e-05, "loss": 0.913027822971344, "loss/kd": 1.4699339866638184, "loss/lm": 0.35612165927886963, "step": 2143 }, { "epoch": 0.44015602545678506, "grad_norm": 2.1283960131866433, "kd_ratio": 0.5, "learning_rate": 1.2403585856120762e-05, "loss": 1.2545816898345947, "loss/kd": 2.164613962173462, "loss/lm": 0.3445494472980499, "step": 2144 }, { "epoch": 0.4403613221104496, "grad_norm": 2.0449222257681403, "kd_ratio": 0.5, "learning_rate": 1.2397130002808604e-05, "loss": 0.9994280338287354, "loss/kd": 1.724866509437561, "loss/lm": 0.27398958802223206, "step": 2145 }, { "epoch": 0.4405666187641141, "grad_norm": 1.0218130539657606, "kd_ratio": 0.5, "learning_rate": 1.2390673089336144e-05, "loss": 1.2609915733337402, "loss/kd": 2.15828013420105, "loss/lm": 0.3637028932571411, "step": 2146 }, { "epoch": 0.4407719154177787, "grad_norm": 1.3300058130848333, "kd_ratio": 0.5, "learning_rate": 1.2384215118559027e-05, "loss": 1.1298812627792358, "loss/kd": 1.7999589443206787, "loss/lm": 0.4598035514354706, "step": 2147 }, { "epoch": 0.44097721207144325, "grad_norm": 1.4791669501497713, "kd_ratio": 0.5, "learning_rate": 1.2377756093333371e-05, "loss": 1.0505882501602173, "loss/kd": 1.768620491027832, "loss/lm": 0.3325560688972473, "step": 2148 }, { "epoch": 0.4411825087251078, "grad_norm": 1.2971302270322225, "kd_ratio": 0.5, "learning_rate": 1.2371296016515762e-05, "loss": 0.9756743907928467, "loss/kd": 1.5924124717712402, "loss/lm": 0.35893628001213074, "step": 2149 }, { "epoch": 0.4413878053787723, "grad_norm": 1.1727656901196921, "kd_ratio": 0.5, "learning_rate": 1.2364834890963246e-05, "loss": 1.0252615213394165, "loss/kd": 1.7386642694473267, "loss/lm": 0.3118587136268616, "step": 2150 }, { "epoch": 0.44159310203243685, "grad_norm": 1.6713110740021042, "kd_ratio": 0.5, "learning_rate": 1.235837271953334e-05, "loss": 1.0168030261993408, "loss/kd": 1.7204060554504395, "loss/lm": 0.313199907541275, "step": 2151 }, { "epoch": 0.44179839868610143, "grad_norm": 1.0545992459418838, "kd_ratio": 0.5, "learning_rate": 1.2351909505084012e-05, "loss": 1.1560311317443848, "loss/kd": 1.924570918083191, "loss/lm": 0.387491375207901, "step": 2152 }, { "epoch": 0.44200369533976597, "grad_norm": 1.210018817675039, "kd_ratio": 0.5, "learning_rate": 1.2345445250473702e-05, "loss": 1.0050898790359497, "loss/kd": 1.639898657798767, "loss/lm": 0.3702811896800995, "step": 2153 }, { "epoch": 0.4422089919934305, "grad_norm": 0.9066851754860317, "kd_ratio": 0.5, "learning_rate": 1.2338979958561305e-05, "loss": 1.3125064373016357, "loss/kd": 2.2434844970703125, "loss/lm": 0.38152843713760376, "step": 2154 }, { "epoch": 0.44241428864709503, "grad_norm": 1.0867373175012052, "kd_ratio": 0.5, "learning_rate": 1.2332513632206183e-05, "loss": 1.1347719430923462, "loss/kd": 1.9452245235443115, "loss/lm": 0.32431939244270325, "step": 2155 }, { "epoch": 0.4426195853007596, "grad_norm": 1.2577000103059128, "kd_ratio": 0.5, "learning_rate": 1.2326046274268136e-05, "loss": 1.126254677772522, "loss/kd": 1.8995226621627808, "loss/lm": 0.35298675298690796, "step": 2156 }, { "epoch": 0.44282488195442415, "grad_norm": 1.5487636726736536, "kd_ratio": 0.5, "learning_rate": 1.2319577887607443e-05, "loss": 1.0746433734893799, "loss/kd": 1.819970726966858, "loss/lm": 0.32931607961654663, "step": 2157 }, { "epoch": 0.4430301786080887, "grad_norm": 1.036410737779043, "kd_ratio": 0.5, "learning_rate": 1.2313108475084823e-05, "loss": 0.9492475986480713, "loss/kd": 1.6065059900283813, "loss/lm": 0.291989266872406, "step": 2158 }, { "epoch": 0.4432354752617532, "grad_norm": 1.2373241285551255, "kd_ratio": 0.5, "learning_rate": 1.2306638039561455e-05, "loss": 1.1688014268875122, "loss/kd": 1.9283218383789062, "loss/lm": 0.40928101539611816, "step": 2159 }, { "epoch": 0.44344077191541775, "grad_norm": 1.238849039577139, "kd_ratio": 0.5, "learning_rate": 1.2300166583898969e-05, "loss": 0.9745953679084778, "loss/kd": 1.5938539505004883, "loss/lm": 0.3553367853164673, "step": 2160 }, { "epoch": 0.44364606856908234, "grad_norm": 1.1024326777775952, "kd_ratio": 0.5, "learning_rate": 1.2293694110959448e-05, "loss": 1.1629379987716675, "loss/kd": 1.9789036512374878, "loss/lm": 0.34697240591049194, "step": 2161 }, { "epoch": 0.4438513652227469, "grad_norm": 1.523421775275653, "kd_ratio": 0.5, "learning_rate": 1.2287220623605418e-05, "loss": 1.034757375717163, "loss/kd": 1.7342971563339233, "loss/lm": 0.3352174758911133, "step": 2162 }, { "epoch": 0.4440566618764114, "grad_norm": 1.1344769602792102, "kd_ratio": 0.5, "learning_rate": 1.2280746124699864e-05, "loss": 0.9175363779067993, "loss/kd": 1.4608116149902344, "loss/lm": 0.37426120042800903, "step": 2163 }, { "epoch": 0.44426195853007594, "grad_norm": 1.297758941179333, "kd_ratio": 0.5, "learning_rate": 1.2274270617106208e-05, "loss": 1.2381550073623657, "loss/kd": 2.111656427383423, "loss/lm": 0.36465349793434143, "step": 2164 }, { "epoch": 0.44446725518374053, "grad_norm": 1.9722141452388955, "kd_ratio": 0.5, "learning_rate": 1.2267794103688327e-05, "loss": 1.1677485704421997, "loss/kd": 1.8987184762954712, "loss/lm": 0.43677860498428345, "step": 2165 }, { "epoch": 0.44467255183740506, "grad_norm": 1.5179984312351296, "kd_ratio": 0.5, "learning_rate": 1.2261316587310534e-05, "loss": 1.0830068588256836, "loss/kd": 1.7864731550216675, "loss/lm": 0.37954047322273254, "step": 2166 }, { "epoch": 0.4448778484910696, "grad_norm": 1.014110939974573, "kd_ratio": 0.5, "learning_rate": 1.2254838070837596e-05, "loss": 1.2452988624572754, "loss/kd": 2.135880947113037, "loss/lm": 0.35471680760383606, "step": 2167 }, { "epoch": 0.4450831451447341, "grad_norm": 1.3921226727057756, "kd_ratio": 0.5, "learning_rate": 1.2248358557134714e-05, "loss": 1.2116303443908691, "loss/kd": 2.0691769123077393, "loss/lm": 0.3540838658809662, "step": 2168 }, { "epoch": 0.44528844179839866, "grad_norm": 1.3136019600651967, "kd_ratio": 0.5, "learning_rate": 1.2241878049067536e-05, "loss": 1.2602347135543823, "loss/kd": 2.1560938358306885, "loss/lm": 0.364375501871109, "step": 2169 }, { "epoch": 0.44549373845206325, "grad_norm": 0.9828066730889824, "kd_ratio": 0.5, "learning_rate": 1.2235396549502144e-05, "loss": 1.1034860610961914, "loss/kd": 1.774312138557434, "loss/lm": 0.43265995383262634, "step": 2170 }, { "epoch": 0.4456990351057278, "grad_norm": 1.5023639507269007, "kd_ratio": 0.5, "learning_rate": 1.2228914061305059e-05, "loss": 1.0096772909164429, "loss/kd": 1.713653326034546, "loss/lm": 0.30570122599601746, "step": 2171 }, { "epoch": 0.4459043317593923, "grad_norm": 1.4081325813168715, "kd_ratio": 0.5, "learning_rate": 1.2222430587343246e-05, "loss": 1.0559089183807373, "loss/kd": 1.6628555059432983, "loss/lm": 0.4489624500274658, "step": 2172 }, { "epoch": 0.44610962841305685, "grad_norm": 0.9580715626978111, "kd_ratio": 0.5, "learning_rate": 1.2215946130484096e-05, "loss": 1.0654706954956055, "loss/kd": 1.7801402807235718, "loss/lm": 0.3508012294769287, "step": 2173 }, { "epoch": 0.44631492506672144, "grad_norm": 1.0645867744892845, "kd_ratio": 0.5, "learning_rate": 1.2209460693595446e-05, "loss": 0.9481526017189026, "loss/kd": 1.560001015663147, "loss/lm": 0.3363042175769806, "step": 2174 }, { "epoch": 0.44652022172038597, "grad_norm": 1.0195007248010166, "kd_ratio": 0.5, "learning_rate": 1.2202974279545554e-05, "loss": 0.8933717012405396, "loss/kd": 1.4748775959014893, "loss/lm": 0.31186577677726746, "step": 2175 }, { "epoch": 0.4467255183740505, "grad_norm": 1.1318205423142205, "kd_ratio": 0.5, "learning_rate": 1.2196486891203121e-05, "loss": 1.0829212665557861, "loss/kd": 1.8076013326644897, "loss/lm": 0.35824131965637207, "step": 2176 }, { "epoch": 0.44693081502771503, "grad_norm": 0.9685399624502522, "kd_ratio": 0.5, "learning_rate": 1.2189998531437275e-05, "loss": 0.9426478147506714, "loss/kd": 1.5259085893630981, "loss/lm": 0.35938701033592224, "step": 2177 }, { "epoch": 0.44713611168137957, "grad_norm": 1.3527096283704256, "kd_ratio": 0.5, "learning_rate": 1.218350920311757e-05, "loss": 0.9388275146484375, "loss/kd": 1.5025526285171509, "loss/lm": 0.3751024007797241, "step": 2178 }, { "epoch": 0.44734140833504416, "grad_norm": 1.4544965433770656, "kd_ratio": 0.5, "learning_rate": 1.2177018909113994e-05, "loss": 1.185093641281128, "loss/kd": 1.961659550666809, "loss/lm": 0.40852782130241394, "step": 2179 }, { "epoch": 0.4475467049887087, "grad_norm": 1.3071735453469044, "kd_ratio": 0.5, "learning_rate": 1.2170527652296958e-05, "loss": 0.9422147870063782, "loss/kd": 1.544661283493042, "loss/lm": 0.33976829051971436, "step": 2180 }, { "epoch": 0.4477520016423732, "grad_norm": 1.357455979682598, "kd_ratio": 0.5, "learning_rate": 1.2164035435537302e-05, "loss": 1.3727303743362427, "loss/kd": 2.4270236492156982, "loss/lm": 0.3184371590614319, "step": 2181 }, { "epoch": 0.44795729829603775, "grad_norm": 1.7737059599525378, "kd_ratio": 0.5, "learning_rate": 1.2157542261706287e-05, "loss": 0.9613368511199951, "loss/kd": 1.4982906579971313, "loss/lm": 0.4243830442428589, "step": 2182 }, { "epoch": 0.44816259494970234, "grad_norm": 1.003334604501382, "kd_ratio": 0.5, "learning_rate": 1.21510481336756e-05, "loss": 1.3305736780166626, "loss/kd": 2.314244270324707, "loss/lm": 0.3469031751155853, "step": 2183 }, { "epoch": 0.4483678916033669, "grad_norm": 1.696009150974152, "kd_ratio": 0.5, "learning_rate": 1.214455305431735e-05, "loss": 1.3088006973266602, "loss/kd": 2.216066360473633, "loss/lm": 0.4015349745750427, "step": 2184 }, { "epoch": 0.4485731882570314, "grad_norm": 2.1118095527164837, "kd_ratio": 0.5, "learning_rate": 1.2138057026504065e-05, "loss": 1.029090404510498, "loss/kd": 1.6777558326721191, "loss/lm": 0.38042500615119934, "step": 2185 }, { "epoch": 0.44877848491069594, "grad_norm": 1.3493607631079048, "kd_ratio": 0.5, "learning_rate": 1.2131560053108693e-05, "loss": 1.9401159286499023, "loss/kd": 3.5743751525878906, "loss/lm": 0.30585670471191406, "step": 2186 }, { "epoch": 0.4489837815643605, "grad_norm": 1.255443846563794, "kd_ratio": 0.5, "learning_rate": 1.2125062137004602e-05, "loss": 1.0019490718841553, "loss/kd": 1.6484816074371338, "loss/lm": 0.3554166257381439, "step": 2187 }, { "epoch": 0.44918907821802506, "grad_norm": 1.7537389836918276, "kd_ratio": 0.5, "learning_rate": 1.2118563281065574e-05, "loss": 1.183159589767456, "loss/kd": 1.9967308044433594, "loss/lm": 0.36958834528923035, "step": 2188 }, { "epoch": 0.4493943748716896, "grad_norm": 1.2078534571401822, "kd_ratio": 0.5, "learning_rate": 1.211206348816581e-05, "loss": 1.2117817401885986, "loss/kd": 1.934091567993164, "loss/lm": 0.48947179317474365, "step": 2189 }, { "epoch": 0.4495996715253541, "grad_norm": 1.2730671835618408, "kd_ratio": 0.5, "learning_rate": 1.2105562761179923e-05, "loss": 0.888611912727356, "loss/kd": 1.4349690675735474, "loss/lm": 0.3422548174858093, "step": 2190 }, { "epoch": 0.44980496817901866, "grad_norm": 1.7150293424945668, "kd_ratio": 0.5, "learning_rate": 1.2099061102982939e-05, "loss": 1.138570785522461, "loss/kd": 1.8844540119171143, "loss/lm": 0.39268752932548523, "step": 2191 }, { "epoch": 0.45001026483268325, "grad_norm": 1.292009739517745, "kd_ratio": 0.5, "learning_rate": 1.2092558516450296e-05, "loss": 1.0407038927078247, "loss/kd": 1.6887913942337036, "loss/lm": 0.39261630177497864, "step": 2192 }, { "epoch": 0.4502155614863478, "grad_norm": 1.959320715560075, "kd_ratio": 0.5, "learning_rate": 1.2086055004457844e-05, "loss": 0.9306381344795227, "loss/kd": 1.5651090145111084, "loss/lm": 0.296167254447937, "step": 2193 }, { "epoch": 0.4504208581400123, "grad_norm": 2.0852718464501647, "kd_ratio": 0.5, "learning_rate": 1.207955056988184e-05, "loss": 1.1785333156585693, "loss/kd": 2.0122809410095215, "loss/lm": 0.3447856307029724, "step": 2194 }, { "epoch": 0.45062615479367685, "grad_norm": 1.0840205070446758, "kd_ratio": 0.5, "learning_rate": 1.2073045215598953e-05, "loss": 1.0802947282791138, "loss/kd": 1.829918622970581, "loss/lm": 0.33067086338996887, "step": 2195 }, { "epoch": 0.45083145144734144, "grad_norm": 1.1874562504093293, "kd_ratio": 0.5, "learning_rate": 1.2066538944486254e-05, "loss": 1.0712553262710571, "loss/kd": 1.8852336406707764, "loss/lm": 0.2572769224643707, "step": 2196 }, { "epoch": 0.45103674810100597, "grad_norm": 1.3737066452498734, "kd_ratio": 0.5, "learning_rate": 1.2060031759421224e-05, "loss": 0.9559294581413269, "loss/kd": 1.5496928691864014, "loss/lm": 0.36216607689857483, "step": 2197 }, { "epoch": 0.4512420447546705, "grad_norm": 1.1978674654769847, "kd_ratio": 0.5, "learning_rate": 1.2053523663281745e-05, "loss": 0.9921332001686096, "loss/kd": 1.5917185544967651, "loss/lm": 0.3925478160381317, "step": 2198 }, { "epoch": 0.45144734140833503, "grad_norm": 0.9964946352297545, "kd_ratio": 0.5, "learning_rate": 1.20470146589461e-05, "loss": 1.0726184844970703, "loss/kd": 1.7031666040420532, "loss/lm": 0.44207027554512024, "step": 2199 }, { "epoch": 0.45165263806199957, "grad_norm": 1.0637198868736797, "kd_ratio": 0.5, "learning_rate": 1.2040504749292979e-05, "loss": 1.0431544780731201, "loss/kd": 1.6853046417236328, "loss/lm": 0.40100419521331787, "step": 2200 }, { "epoch": 0.45185793471566416, "grad_norm": 1.178352971188724, "kd_ratio": 0.5, "learning_rate": 1.2033993937201469e-05, "loss": 0.9009753465652466, "loss/kd": 1.502729058265686, "loss/lm": 0.29922157526016235, "step": 2201 }, { "epoch": 0.4520632313693287, "grad_norm": 1.0065621379416279, "kd_ratio": 0.5, "learning_rate": 1.2027482225551057e-05, "loss": 1.3824996948242188, "loss/kd": 2.3960137367248535, "loss/lm": 0.36898568272590637, "step": 2202 }, { "epoch": 0.4522685280229932, "grad_norm": 1.42062492531656, "kd_ratio": 0.5, "learning_rate": 1.2020969617221627e-05, "loss": 1.0592200756072998, "loss/kd": 1.7584892511367798, "loss/lm": 0.3599509298801422, "step": 2203 }, { "epoch": 0.45247382467665775, "grad_norm": 1.3421929135812403, "kd_ratio": 0.5, "learning_rate": 1.2014456115093463e-05, "loss": 0.9496930241584778, "loss/kd": 1.5355058908462524, "loss/lm": 0.36388012766838074, "step": 2204 }, { "epoch": 0.45267912133032234, "grad_norm": 1.5299447393041863, "kd_ratio": 0.5, "learning_rate": 1.200794172204724e-05, "loss": 1.059234857559204, "loss/kd": 1.7523199319839478, "loss/lm": 0.36614978313446045, "step": 2205 }, { "epoch": 0.4528844179839869, "grad_norm": 1.5858696110601684, "kd_ratio": 0.5, "learning_rate": 1.2001426440964025e-05, "loss": 0.9512544870376587, "loss/kd": 1.5716326236724854, "loss/lm": 0.33087635040283203, "step": 2206 }, { "epoch": 0.4530897146376514, "grad_norm": 1.1616945772521654, "kd_ratio": 0.5, "learning_rate": 1.199491027472529e-05, "loss": 1.2106776237487793, "loss/kd": 1.995119333267212, "loss/lm": 0.42623594403266907, "step": 2207 }, { "epoch": 0.45329501129131594, "grad_norm": 1.221264019961467, "kd_ratio": 0.5, "learning_rate": 1.1988393226212884e-05, "loss": 1.3236806392669678, "loss/kd": 2.333239793777466, "loss/lm": 0.31412139534950256, "step": 2208 }, { "epoch": 0.4535003079449805, "grad_norm": 1.0829270941158133, "kd_ratio": 0.5, "learning_rate": 1.1981875298309054e-05, "loss": 1.033829689025879, "loss/kd": 1.6968119144439697, "loss/lm": 0.37084758281707764, "step": 2209 }, { "epoch": 0.45370560459864506, "grad_norm": 1.1045143214671662, "kd_ratio": 0.5, "learning_rate": 1.1975356493896431e-05, "loss": 1.2400009632110596, "loss/kd": 2.0876548290252686, "loss/lm": 0.3923470973968506, "step": 2210 }, { "epoch": 0.4539109012523096, "grad_norm": 0.966825197806791, "kd_ratio": 0.5, "learning_rate": 1.1968836815858038e-05, "loss": 1.0523184537887573, "loss/kd": 1.6830400228500366, "loss/lm": 0.42159682512283325, "step": 2211 }, { "epoch": 0.45411619790597413, "grad_norm": 1.3849841765468, "kd_ratio": 0.5, "learning_rate": 1.1962316267077284e-05, "loss": 0.8877248764038086, "loss/kd": 1.409704327583313, "loss/lm": 0.3657453954219818, "step": 2212 }, { "epoch": 0.45432149455963866, "grad_norm": 1.7737101389459247, "kd_ratio": 0.5, "learning_rate": 1.1955794850437962e-05, "loss": 1.2953970432281494, "loss/kd": 2.250189781188965, "loss/lm": 0.34060418605804443, "step": 2213 }, { "epoch": 0.45452679121330325, "grad_norm": 1.2243535640209018, "kd_ratio": 0.5, "learning_rate": 1.1949272568824248e-05, "loss": 1.0671348571777344, "loss/kd": 1.7393789291381836, "loss/lm": 0.39489084482192993, "step": 2214 }, { "epoch": 0.4547320878669678, "grad_norm": 1.6794913663312014, "kd_ratio": 0.5, "learning_rate": 1.1942749425120704e-05, "loss": 1.4565755128860474, "loss/kd": 2.487168550491333, "loss/lm": 0.4259824752807617, "step": 2215 }, { "epoch": 0.4549373845206323, "grad_norm": 1.5612696577677916, "kd_ratio": 0.5, "learning_rate": 1.193622542221227e-05, "loss": 0.8519620895385742, "loss/kd": 1.4124767780303955, "loss/lm": 0.29144737124443054, "step": 2216 }, { "epoch": 0.45514268117429685, "grad_norm": 1.1381399618007548, "kd_ratio": 0.5, "learning_rate": 1.192970056298426e-05, "loss": 1.1535098552703857, "loss/kd": 1.9057310819625854, "loss/lm": 0.40128862857818604, "step": 2217 }, { "epoch": 0.4553479778279614, "grad_norm": 1.6520563662427277, "kd_ratio": 0.5, "learning_rate": 1.1923174850322385e-05, "loss": 1.1602693796157837, "loss/kd": 1.9363434314727783, "loss/lm": 0.3841952681541443, "step": 2218 }, { "epoch": 0.45555327448162597, "grad_norm": 1.7171150023493196, "kd_ratio": 0.5, "learning_rate": 1.1916648287112714e-05, "loss": 1.3028327226638794, "loss/kd": 2.2205159664154053, "loss/lm": 0.38514938950538635, "step": 2219 }, { "epoch": 0.4557585711352905, "grad_norm": 1.5396350637189993, "kd_ratio": 0.5, "learning_rate": 1.1910120876241702e-05, "loss": 1.035522222518921, "loss/kd": 1.7204737663269043, "loss/lm": 0.35057079792022705, "step": 2220 }, { "epoch": 0.45596386778895504, "grad_norm": 1.2446106175868765, "kd_ratio": 0.5, "learning_rate": 1.1903592620596175e-05, "loss": 1.1733628511428833, "loss/kd": 1.9457024335861206, "loss/lm": 0.4010232388973236, "step": 2221 }, { "epoch": 0.45616916444261957, "grad_norm": 1.464293292416387, "kd_ratio": 0.5, "learning_rate": 1.1897063523063338e-05, "loss": 1.092370629310608, "loss/kd": 1.8368078470230103, "loss/lm": 0.3479333221912384, "step": 2222 }, { "epoch": 0.45637446109628416, "grad_norm": 1.2569122559980381, "kd_ratio": 0.5, "learning_rate": 1.1890533586530766e-05, "loss": 0.6878312826156616, "loss/kd": 1.0729855298995972, "loss/lm": 0.3026770353317261, "step": 2223 }, { "epoch": 0.4565797577499487, "grad_norm": 1.637891194570208, "kd_ratio": 0.5, "learning_rate": 1.1884002813886399e-05, "loss": 1.250030755996704, "loss/kd": 2.173192024230957, "loss/lm": 0.3268694579601288, "step": 2224 }, { "epoch": 0.4567850544036132, "grad_norm": 1.4424732091514207, "kd_ratio": 0.5, "learning_rate": 1.1877471208018554e-05, "loss": 0.9857938885688782, "loss/kd": 1.5735527276992798, "loss/lm": 0.39803504943847656, "step": 2225 }, { "epoch": 0.45699035105727775, "grad_norm": 2.0064713362069178, "kd_ratio": 0.5, "learning_rate": 1.1870938771815916e-05, "loss": 1.185537576675415, "loss/kd": 1.9865531921386719, "loss/lm": 0.38452187180519104, "step": 2226 }, { "epoch": 0.4571956477109423, "grad_norm": 1.070831323205445, "kd_ratio": 0.5, "learning_rate": 1.1864405508167532e-05, "loss": 1.0633572340011597, "loss/kd": 1.7462221384048462, "loss/lm": 0.38049226999282837, "step": 2227 }, { "epoch": 0.4574009443646069, "grad_norm": 2.540851041935948, "kd_ratio": 0.5, "learning_rate": 1.1857871419962823e-05, "loss": 0.9612817764282227, "loss/kd": 1.6126724481582642, "loss/lm": 0.3098911643028259, "step": 2228 }, { "epoch": 0.4576062410182714, "grad_norm": 1.7426799861979487, "kd_ratio": 0.5, "learning_rate": 1.1851336510091567e-05, "loss": 1.0115742683410645, "loss/kd": 1.7378923892974854, "loss/lm": 0.28525617718696594, "step": 2229 }, { "epoch": 0.45781153767193594, "grad_norm": 1.5798631251636763, "kd_ratio": 0.5, "learning_rate": 1.1844800781443905e-05, "loss": 1.0709545612335205, "loss/kd": 1.827181339263916, "loss/lm": 0.3147278428077698, "step": 2230 }, { "epoch": 0.4580168343256005, "grad_norm": 2.831653104070473, "kd_ratio": 0.5, "learning_rate": 1.1838264236910348e-05, "loss": 1.0370259284973145, "loss/kd": 1.7450025081634521, "loss/lm": 0.32904940843582153, "step": 2231 }, { "epoch": 0.45822213097926506, "grad_norm": 1.731638476167777, "kd_ratio": 0.5, "learning_rate": 1.1831726879381764e-05, "loss": 1.189191222190857, "loss/kd": 2.039618968963623, "loss/lm": 0.3387635350227356, "step": 2232 }, { "epoch": 0.4584274276329296, "grad_norm": 1.9826916831160672, "kd_ratio": 0.5, "learning_rate": 1.182518871174938e-05, "loss": 0.9407301545143127, "loss/kd": 1.5264419317245483, "loss/lm": 0.35501837730407715, "step": 2233 }, { "epoch": 0.45863272428659413, "grad_norm": 2.359295891462886, "kd_ratio": 0.5, "learning_rate": 1.1818649736904781e-05, "loss": 1.223013162612915, "loss/kd": 2.083129405975342, "loss/lm": 0.36289697885513306, "step": 2234 }, { "epoch": 0.45883802094025866, "grad_norm": 1.0818902835036601, "kd_ratio": 0.5, "learning_rate": 1.1812109957739907e-05, "loss": 0.9342263340950012, "loss/kd": 1.5546454191207886, "loss/lm": 0.31380724906921387, "step": 2235 }, { "epoch": 0.4590433175939232, "grad_norm": 2.3474861523935466, "kd_ratio": 0.5, "learning_rate": 1.1805569377147059e-05, "loss": 1.0828715562820435, "loss/kd": 1.810290813446045, "loss/lm": 0.3554523289203644, "step": 2236 }, { "epoch": 0.4592486142475878, "grad_norm": 1.359171538764875, "kd_ratio": 0.5, "learning_rate": 1.1799027998018888e-05, "loss": 1.023677110671997, "loss/kd": 1.6728919744491577, "loss/lm": 0.3744623363018036, "step": 2237 }, { "epoch": 0.4594539109012523, "grad_norm": 1.6710750688808447, "kd_ratio": 0.5, "learning_rate": 1.1792485823248396e-05, "loss": 1.279673457145691, "loss/kd": 2.1743712425231934, "loss/lm": 0.38497570157051086, "step": 2238 }, { "epoch": 0.45965920755491685, "grad_norm": 1.1513510111241543, "kd_ratio": 0.5, "learning_rate": 1.1785942855728945e-05, "loss": 0.9881015419960022, "loss/kd": 1.6000460386276245, "loss/lm": 0.3761570453643799, "step": 2239 }, { "epoch": 0.4598645042085814, "grad_norm": 1.538236406063261, "kd_ratio": 0.5, "learning_rate": 1.1779399098354242e-05, "loss": 0.9795700311660767, "loss/kd": 1.6105279922485352, "loss/lm": 0.3486120104789734, "step": 2240 }, { "epoch": 0.46006980086224597, "grad_norm": 1.1666679944981693, "kd_ratio": 0.5, "learning_rate": 1.1772854554018345e-05, "loss": 1.2135461568832397, "loss/kd": 2.041532516479492, "loss/lm": 0.3855597674846649, "step": 2241 }, { "epoch": 0.4602750975159105, "grad_norm": 1.2706853304728256, "kd_ratio": 0.5, "learning_rate": 1.176630922561566e-05, "loss": 1.0825421810150146, "loss/kd": 1.7341831922531128, "loss/lm": 0.43090111017227173, "step": 2242 }, { "epoch": 0.46048039416957504, "grad_norm": 1.033074338588794, "kd_ratio": 0.5, "learning_rate": 1.1759763116040936e-05, "loss": 0.936396062374115, "loss/kd": 1.5197665691375732, "loss/lm": 0.3530255854129791, "step": 2243 }, { "epoch": 0.46068569082323957, "grad_norm": 1.5628210437753702, "kd_ratio": 0.5, "learning_rate": 1.1753216228189275e-05, "loss": 0.8859143853187561, "loss/kd": 1.448062539100647, "loss/lm": 0.32376620173454285, "step": 2244 }, { "epoch": 0.4608909874769041, "grad_norm": 1.0064507529848739, "kd_ratio": 0.5, "learning_rate": 1.1746668564956113e-05, "loss": 1.2394602298736572, "loss/kd": 2.1378819942474365, "loss/lm": 0.3410383462905884, "step": 2245 }, { "epoch": 0.4610962841305687, "grad_norm": 1.4950041903797475, "kd_ratio": 0.5, "learning_rate": 1.1740120129237242e-05, "loss": 1.933413028717041, "loss/kd": 3.62151837348938, "loss/lm": 0.2453075647354126, "step": 2246 }, { "epoch": 0.4613015807842332, "grad_norm": 1.5766674744082694, "kd_ratio": 0.5, "learning_rate": 1.1733570923928785e-05, "loss": 1.1240618228912354, "loss/kd": 1.8695008754730225, "loss/lm": 0.37862271070480347, "step": 2247 }, { "epoch": 0.46150687743789776, "grad_norm": 1.4895642843113097, "kd_ratio": 0.5, "learning_rate": 1.1727020951927206e-05, "loss": 0.999017596244812, "loss/kd": 1.5996280908584595, "loss/lm": 0.3984071612358093, "step": 2248 }, { "epoch": 0.4617121740915623, "grad_norm": 1.0577474397217275, "kd_ratio": 0.5, "learning_rate": 1.1720470216129312e-05, "loss": 1.166785478591919, "loss/kd": 1.9319065809249878, "loss/lm": 0.4016643166542053, "step": 2249 }, { "epoch": 0.4619174707452269, "grad_norm": 1.1187358744107379, "kd_ratio": 0.5, "learning_rate": 1.171391871943225e-05, "loss": 1.0886353254318237, "loss/kd": 1.7949016094207764, "loss/lm": 0.3823690414428711, "step": 2250 }, { "epoch": 0.4621227673988914, "grad_norm": 1.251792800331188, "kd_ratio": 0.5, "learning_rate": 1.1707366464733501e-05, "loss": 1.188420295715332, "loss/kd": 1.9563510417938232, "loss/lm": 0.42048946022987366, "step": 2251 }, { "epoch": 0.46232806405255594, "grad_norm": 1.5001409318828252, "kd_ratio": 0.5, "learning_rate": 1.1700813454930875e-05, "loss": 1.1016379594802856, "loss/kd": 1.7477896213531494, "loss/lm": 0.4554862380027771, "step": 2252 }, { "epoch": 0.4625333607062205, "grad_norm": 1.0379501761362422, "kd_ratio": 0.5, "learning_rate": 1.1694259692922525e-05, "loss": 1.0498281717300415, "loss/kd": 1.7442562580108643, "loss/lm": 0.3554001748561859, "step": 2253 }, { "epoch": 0.462738657359885, "grad_norm": 2.4531766352379663, "kd_ratio": 0.5, "learning_rate": 1.1687705181606932e-05, "loss": 1.3005911111831665, "loss/kd": 2.1480484008789062, "loss/lm": 0.45313379168510437, "step": 2254 }, { "epoch": 0.4629439540135496, "grad_norm": 1.8733080514093186, "kd_ratio": 0.5, "learning_rate": 1.1681149923882913e-05, "loss": 1.1451656818389893, "loss/kd": 1.9345463514328003, "loss/lm": 0.3557850122451782, "step": 2255 }, { "epoch": 0.46314925066721413, "grad_norm": 1.3913420770208123, "kd_ratio": 0.5, "learning_rate": 1.1674593922649604e-05, "loss": 1.3935075998306274, "loss/kd": 2.3304271697998047, "loss/lm": 0.45658811926841736, "step": 2256 }, { "epoch": 0.46335454732087866, "grad_norm": 1.3693926313664342, "kd_ratio": 0.5, "learning_rate": 1.1668037180806483e-05, "loss": 0.9467576742172241, "loss/kd": 1.5650947093963623, "loss/lm": 0.32842060923576355, "step": 2257 }, { "epoch": 0.4635598439745432, "grad_norm": 1.9460840191765485, "kd_ratio": 0.5, "learning_rate": 1.1661479701253348e-05, "loss": 1.181595802307129, "loss/kd": 2.0620388984680176, "loss/lm": 0.3011527359485626, "step": 2258 }, { "epoch": 0.4637651406282078, "grad_norm": 1.0979037326622962, "kd_ratio": 0.5, "learning_rate": 1.1654921486890327e-05, "loss": 0.9925858378410339, "loss/kd": 1.6441476345062256, "loss/lm": 0.3410240709781647, "step": 2259 }, { "epoch": 0.4639704372818723, "grad_norm": 1.3675794165567448, "kd_ratio": 0.5, "learning_rate": 1.164836254061787e-05, "loss": 1.2336301803588867, "loss/kd": 2.0334079265594482, "loss/lm": 0.43385252356529236, "step": 2260 }, { "epoch": 0.46417573393553685, "grad_norm": 1.4714113573245933, "kd_ratio": 0.5, "learning_rate": 1.1641802865336751e-05, "loss": 0.8162227869033813, "loss/kd": 1.342444658279419, "loss/lm": 0.29000094532966614, "step": 2261 }, { "epoch": 0.4643810305892014, "grad_norm": 1.4094147425227765, "kd_ratio": 0.5, "learning_rate": 1.1635242463948072e-05, "loss": 1.0915268659591675, "loss/kd": 1.7843559980392456, "loss/lm": 0.3986978232860565, "step": 2262 }, { "epoch": 0.4645863272428659, "grad_norm": 1.3443258143301806, "kd_ratio": 0.5, "learning_rate": 1.1628681339353244e-05, "loss": 1.2185704708099365, "loss/kd": 2.0341129302978516, "loss/lm": 0.4030279815196991, "step": 2263 }, { "epoch": 0.4647916238965305, "grad_norm": 2.0582275379459483, "kd_ratio": 0.5, "learning_rate": 1.162211949445401e-05, "loss": 1.190749168395996, "loss/kd": 2.057044267654419, "loss/lm": 0.3244541883468628, "step": 2264 }, { "epoch": 0.46499692055019504, "grad_norm": 1.3167520840995293, "kd_ratio": 0.5, "learning_rate": 1.1615556932152426e-05, "loss": 1.0710490942001343, "loss/kd": 1.7853695154190063, "loss/lm": 0.3567286431789398, "step": 2265 }, { "epoch": 0.46520221720385957, "grad_norm": 1.176126689659076, "kd_ratio": 0.5, "learning_rate": 1.1608993655350864e-05, "loss": 1.0134588479995728, "loss/kd": 1.6850056648254395, "loss/lm": 0.3419121205806732, "step": 2266 }, { "epoch": 0.4654075138575241, "grad_norm": 1.6681403770275753, "kd_ratio": 0.5, "learning_rate": 1.1602429666952015e-05, "loss": 1.227617621421814, "loss/kd": 2.1110780239105225, "loss/lm": 0.34415727853775024, "step": 2267 }, { "epoch": 0.4656128105111887, "grad_norm": 1.1568135054537236, "kd_ratio": 0.5, "learning_rate": 1.1595864969858888e-05, "loss": 0.992792010307312, "loss/kd": 1.5941565036773682, "loss/lm": 0.39142751693725586, "step": 2268 }, { "epoch": 0.4658181071648532, "grad_norm": 1.375016026318986, "kd_ratio": 0.5, "learning_rate": 1.1589299566974796e-05, "loss": 1.042142629623413, "loss/kd": 1.7105717658996582, "loss/lm": 0.3737134039402008, "step": 2269 }, { "epoch": 0.46602340381851776, "grad_norm": 2.0771494650953883, "kd_ratio": 0.5, "learning_rate": 1.1582733461203372e-05, "loss": 1.0370261669158936, "loss/kd": 1.7110062837600708, "loss/lm": 0.36304596066474915, "step": 2270 }, { "epoch": 0.4662287004721823, "grad_norm": 1.2309489332513872, "kd_ratio": 0.5, "learning_rate": 1.1576166655448558e-05, "loss": 0.9127094745635986, "loss/kd": 1.4659039974212646, "loss/lm": 0.359514981508255, "step": 2271 }, { "epoch": 0.4664339971258468, "grad_norm": 1.5401124827275066, "kd_ratio": 0.5, "learning_rate": 1.1569599152614604e-05, "loss": 1.0370402336120605, "loss/kd": 1.7133032083511353, "loss/lm": 0.3607771694660187, "step": 2272 }, { "epoch": 0.4666392937795114, "grad_norm": 2.6007569980924625, "kd_ratio": 0.5, "learning_rate": 1.1563030955606067e-05, "loss": 1.099336862564087, "loss/kd": 1.820572853088379, "loss/lm": 0.3781009316444397, "step": 2273 }, { "epoch": 0.46684459043317594, "grad_norm": 1.7443329820039046, "kd_ratio": 0.5, "learning_rate": 1.1556462067327814e-05, "loss": 0.9724366068840027, "loss/kd": 1.6448922157287598, "loss/lm": 0.2999809682369232, "step": 2274 }, { "epoch": 0.4670498870868405, "grad_norm": 1.352865166087562, "kd_ratio": 0.5, "learning_rate": 1.1549892490685018e-05, "loss": 1.194039225578308, "loss/kd": 1.996213674545288, "loss/lm": 0.3918648362159729, "step": 2275 }, { "epoch": 0.467255183740505, "grad_norm": 2.148924155738226, "kd_ratio": 0.5, "learning_rate": 1.1543322228583154e-05, "loss": 0.8930197954177856, "loss/kd": 1.3732483386993408, "loss/lm": 0.4127911925315857, "step": 2276 }, { "epoch": 0.4674604803941696, "grad_norm": 1.6381767643324072, "kd_ratio": 0.5, "learning_rate": 1.1536751283928002e-05, "loss": 1.0701134204864502, "loss/kd": 1.7858794927597046, "loss/lm": 0.3543473780155182, "step": 2277 }, { "epoch": 0.46766577704783413, "grad_norm": 1.5290733793718476, "kd_ratio": 0.5, "learning_rate": 1.1530179659625647e-05, "loss": 1.046492576599121, "loss/kd": 1.6628689765930176, "loss/lm": 0.430116206407547, "step": 2278 }, { "epoch": 0.46787107370149866, "grad_norm": 1.6917988756392015, "kd_ratio": 0.5, "learning_rate": 1.1523607358582462e-05, "loss": 1.1288198232650757, "loss/kd": 1.8587478399276733, "loss/lm": 0.39889174699783325, "step": 2279 }, { "epoch": 0.4680763703551632, "grad_norm": 1.2351218899484142, "kd_ratio": 0.5, "learning_rate": 1.1517034383705139e-05, "loss": 0.9015835523605347, "loss/kd": 1.4564179182052612, "loss/lm": 0.3467491567134857, "step": 2280 }, { "epoch": 0.46828166700882773, "grad_norm": 1.8333220615244057, "kd_ratio": 0.5, "learning_rate": 1.1510460737900653e-05, "loss": 0.9898371696472168, "loss/kd": 1.6602481603622437, "loss/lm": 0.31942617893218994, "step": 2281 }, { "epoch": 0.4684869636624923, "grad_norm": 1.796292726791867, "kd_ratio": 0.5, "learning_rate": 1.1503886424076274e-05, "loss": 0.879555344581604, "loss/kd": 1.3933736085891724, "loss/lm": 0.36573708057403564, "step": 2282 }, { "epoch": 0.46869226031615685, "grad_norm": 1.2957655184687313, "kd_ratio": 0.5, "learning_rate": 1.149731144513958e-05, "loss": 1.0393341779708862, "loss/kd": 1.78079354763031, "loss/lm": 0.29787489771842957, "step": 2283 }, { "epoch": 0.4688975569698214, "grad_norm": 2.4520857863849197, "kd_ratio": 0.5, "learning_rate": 1.1490735803998434e-05, "loss": 1.057354211807251, "loss/kd": 1.7707335948944092, "loss/lm": 0.3439747393131256, "step": 2284 }, { "epoch": 0.4691028536234859, "grad_norm": 1.5462151409430425, "kd_ratio": 0.5, "learning_rate": 1.1484159503560994e-05, "loss": 0.865021824836731, "loss/kd": 1.3979793787002563, "loss/lm": 0.33206433057785034, "step": 2285 }, { "epoch": 0.4693081502771505, "grad_norm": 1.1739414731959403, "kd_ratio": 0.5, "learning_rate": 1.1477582546735713e-05, "loss": 0.9492342472076416, "loss/kd": 1.500017523765564, "loss/lm": 0.39845097064971924, "step": 2286 }, { "epoch": 0.46951344693081504, "grad_norm": 1.3936674763378254, "kd_ratio": 0.5, "learning_rate": 1.1471004936431327e-05, "loss": 0.876689076423645, "loss/kd": 1.4375327825546265, "loss/lm": 0.31584540009498596, "step": 2287 }, { "epoch": 0.46971874358447957, "grad_norm": 1.1994216875487516, "kd_ratio": 0.5, "learning_rate": 1.1464426675556873e-05, "loss": 1.1496824026107788, "loss/kd": 1.9700571298599243, "loss/lm": 0.32930776476860046, "step": 2288 }, { "epoch": 0.4699240402381441, "grad_norm": 1.635477948016293, "kd_ratio": 0.5, "learning_rate": 1.1457847767021659e-05, "loss": 0.9747592210769653, "loss/kd": 1.6057815551757812, "loss/lm": 0.3437369167804718, "step": 2289 }, { "epoch": 0.4701293368918087, "grad_norm": 1.3263816209252297, "kd_ratio": 0.5, "learning_rate": 1.1451268213735292e-05, "loss": 1.156141996383667, "loss/kd": 1.9112679958343506, "loss/lm": 0.4010159373283386, "step": 2290 }, { "epoch": 0.4703346335454732, "grad_norm": 1.0976926349667036, "kd_ratio": 0.5, "learning_rate": 1.144468801860766e-05, "loss": 1.0442255735397339, "loss/kd": 1.707169532775879, "loss/lm": 0.3812815248966217, "step": 2291 }, { "epoch": 0.47053993019913776, "grad_norm": 1.2965774935335785, "kd_ratio": 0.5, "learning_rate": 1.1438107184548934e-05, "loss": 1.138658046722412, "loss/kd": 1.9804285764694214, "loss/lm": 0.2968875467777252, "step": 2292 }, { "epoch": 0.4707452268528023, "grad_norm": 1.2527191308067365, "kd_ratio": 0.5, "learning_rate": 1.1431525714469576e-05, "loss": 1.0309326648712158, "loss/kd": 1.6548510789871216, "loss/lm": 0.4070141911506653, "step": 2293 }, { "epoch": 0.4709505235064668, "grad_norm": 1.4879084431036453, "kd_ratio": 0.5, "learning_rate": 1.1424943611280312e-05, "loss": 0.9191633462905884, "loss/kd": 1.5598219633102417, "loss/lm": 0.2785046696662903, "step": 2294 }, { "epoch": 0.4711558201601314, "grad_norm": 1.6724184697803925, "kd_ratio": 0.5, "learning_rate": 1.1418360877892165e-05, "loss": 1.0670509338378906, "loss/kd": 1.6235514879226685, "loss/lm": 0.5105504393577576, "step": 2295 }, { "epoch": 0.47136111681379594, "grad_norm": 1.1334960615334402, "kd_ratio": 0.5, "learning_rate": 1.141177751721643e-05, "loss": 1.1804025173187256, "loss/kd": 1.9755905866622925, "loss/lm": 0.3852144777774811, "step": 2296 }, { "epoch": 0.4715664134674605, "grad_norm": 1.0881223990427027, "kd_ratio": 0.5, "learning_rate": 1.140519353216468e-05, "loss": 1.1952364444732666, "loss/kd": 2.0835893154144287, "loss/lm": 0.30688363313674927, "step": 2297 }, { "epoch": 0.471771710121125, "grad_norm": 1.3492790089507183, "kd_ratio": 0.5, "learning_rate": 1.139860892564876e-05, "loss": 1.1688764095306396, "loss/kd": 1.8854998350143433, "loss/lm": 0.4522530138492584, "step": 2298 }, { "epoch": 0.4719770067747896, "grad_norm": 1.1860078480025449, "kd_ratio": 0.5, "learning_rate": 1.1392023700580796e-05, "loss": 1.0156757831573486, "loss/kd": 1.7307426929473877, "loss/lm": 0.30060893297195435, "step": 2299 }, { "epoch": 0.47218230342845413, "grad_norm": 1.9206407994176662, "kd_ratio": 0.5, "learning_rate": 1.1385437859873183e-05, "loss": 0.9466528296470642, "loss/kd": 1.4705758094787598, "loss/lm": 0.42272982001304626, "step": 2300 }, { "epoch": 0.47238760008211866, "grad_norm": 1.3279612574087245, "kd_ratio": 0.5, "learning_rate": 1.1378851406438592e-05, "loss": 1.0786709785461426, "loss/kd": 1.8407137393951416, "loss/lm": 0.31662818789482117, "step": 2301 }, { "epoch": 0.4725928967357832, "grad_norm": 1.3548529603680755, "kd_ratio": 0.5, "learning_rate": 1.1372264343189962e-05, "loss": 1.194883942604065, "loss/kd": 2.040396213531494, "loss/lm": 0.34937170147895813, "step": 2302 }, { "epoch": 0.47279819338944773, "grad_norm": 1.5450857462937762, "kd_ratio": 0.5, "learning_rate": 1.1365676673040502e-05, "loss": 0.9801619648933411, "loss/kd": 1.5606244802474976, "loss/lm": 0.39969947934150696, "step": 2303 }, { "epoch": 0.4730034900431123, "grad_norm": 0.9676708647297926, "kd_ratio": 0.5, "learning_rate": 1.1359088398903693e-05, "loss": 1.2523008584976196, "loss/kd": 2.0983939170837402, "loss/lm": 0.40620774030685425, "step": 2304 }, { "epoch": 0.47320878669677685, "grad_norm": 1.0024130220077931, "kd_ratio": 0.5, "learning_rate": 1.1352499523693279e-05, "loss": 1.0114305019378662, "loss/kd": 1.6222443580627441, "loss/lm": 0.40061676502227783, "step": 2305 }, { "epoch": 0.4734140833504414, "grad_norm": 0.990873189364609, "kd_ratio": 0.5, "learning_rate": 1.1345910050323273e-05, "loss": 1.0645321607589722, "loss/kd": 1.8069859743118286, "loss/lm": 0.3220783472061157, "step": 2306 }, { "epoch": 0.4736193800041059, "grad_norm": 0.9678077874036848, "kd_ratio": 0.5, "learning_rate": 1.133931998170795e-05, "loss": 1.2790406942367554, "loss/kd": 2.2383456230163574, "loss/lm": 0.3197357654571533, "step": 2307 }, { "epoch": 0.4738246766577705, "grad_norm": 1.0096751159604167, "kd_ratio": 0.5, "learning_rate": 1.1332729320761846e-05, "loss": 0.920153021812439, "loss/kd": 1.496962547302246, "loss/lm": 0.34334346652030945, "step": 2308 }, { "epoch": 0.47402997331143504, "grad_norm": 1.057598292906987, "kd_ratio": 0.5, "learning_rate": 1.1326138070399768e-05, "loss": 1.8958288431167603, "loss/kd": 3.495807409286499, "loss/lm": 0.2958502471446991, "step": 2309 }, { "epoch": 0.47423526996509957, "grad_norm": 1.0806534486668162, "kd_ratio": 0.5, "learning_rate": 1.1319546233536772e-05, "loss": 1.1432386636734009, "loss/kd": 1.94009530544281, "loss/lm": 0.3463820517063141, "step": 2310 }, { "epoch": 0.4744405666187641, "grad_norm": 1.2344708000651725, "kd_ratio": 0.5, "learning_rate": 1.1312953813088183e-05, "loss": 1.2396812438964844, "loss/kd": 1.9923052787780762, "loss/lm": 0.4870571792125702, "step": 2311 }, { "epoch": 0.47464586327242864, "grad_norm": 1.2175835977379124, "kd_ratio": 0.5, "learning_rate": 1.130636081196958e-05, "loss": 1.1286792755126953, "loss/kd": 1.8644753694534302, "loss/lm": 0.39288318157196045, "step": 2312 }, { "epoch": 0.4748511599260932, "grad_norm": 1.337680538062082, "kd_ratio": 0.5, "learning_rate": 1.1299767233096794e-05, "loss": 1.0838333368301392, "loss/kd": 1.8387666940689087, "loss/lm": 0.328900009393692, "step": 2313 }, { "epoch": 0.47505645657975776, "grad_norm": 1.0929634746575725, "kd_ratio": 0.5, "learning_rate": 1.1293173079385923e-05, "loss": 1.0584776401519775, "loss/kd": 1.7713232040405273, "loss/lm": 0.3456319570541382, "step": 2314 }, { "epoch": 0.4752617532334223, "grad_norm": 1.5653911320577754, "kd_ratio": 0.5, "learning_rate": 1.1286578353753313e-05, "loss": 1.162198543548584, "loss/kd": 1.935605764389038, "loss/lm": 0.3887912333011627, "step": 2315 }, { "epoch": 0.4754670498870868, "grad_norm": 0.9272228172128965, "kd_ratio": 0.5, "learning_rate": 1.1279983059115557e-05, "loss": 1.0252634286880493, "loss/kd": 1.7279285192489624, "loss/lm": 0.3225984275341034, "step": 2316 }, { "epoch": 0.4756723465407514, "grad_norm": 1.5304537183097122, "kd_ratio": 0.5, "learning_rate": 1.127338719838951e-05, "loss": 0.8573063015937805, "loss/kd": 1.3947490453720093, "loss/lm": 0.31986358761787415, "step": 2317 }, { "epoch": 0.47587764319441594, "grad_norm": 1.526397991562598, "kd_ratio": 0.5, "learning_rate": 1.126679077449227e-05, "loss": 0.940436065196991, "loss/kd": 1.5012660026550293, "loss/lm": 0.37960612773895264, "step": 2318 }, { "epoch": 0.4760829398480805, "grad_norm": 1.1635961921685172, "kd_ratio": 0.5, "learning_rate": 1.1260193790341186e-05, "loss": 0.9154407978057861, "loss/kd": 1.4844298362731934, "loss/lm": 0.3464517891407013, "step": 2319 }, { "epoch": 0.476288236501745, "grad_norm": 1.1370198518397632, "kd_ratio": 0.5, "learning_rate": 1.1253596248853864e-05, "loss": 1.0549269914627075, "loss/kd": 1.7332870960235596, "loss/lm": 0.37656697630882263, "step": 2320 }, { "epoch": 0.47649353315540954, "grad_norm": 1.4932988855220097, "kd_ratio": 0.5, "learning_rate": 1.1246998152948137e-05, "loss": 1.182377815246582, "loss/kd": 2.068077564239502, "loss/lm": 0.2966780662536621, "step": 2321 }, { "epoch": 0.47669882980907413, "grad_norm": 1.058634587645864, "kd_ratio": 0.5, "learning_rate": 1.1240399505542096e-05, "loss": 1.1445468664169312, "loss/kd": 1.9438108205795288, "loss/lm": 0.3452829420566559, "step": 2322 }, { "epoch": 0.47690412646273866, "grad_norm": 1.1101049508671321, "kd_ratio": 0.5, "learning_rate": 1.1233800309554083e-05, "loss": 1.401603102684021, "loss/kd": 2.5059001445770264, "loss/lm": 0.2973061203956604, "step": 2323 }, { "epoch": 0.4771094231164032, "grad_norm": 1.3244900031362636, "kd_ratio": 0.5, "learning_rate": 1.1227200567902665e-05, "loss": 1.138568639755249, "loss/kd": 1.908963918685913, "loss/lm": 0.3681732714176178, "step": 2324 }, { "epoch": 0.47731471977006773, "grad_norm": 1.0143115942619385, "kd_ratio": 0.5, "learning_rate": 1.1220600283506665e-05, "loss": 1.088181972503662, "loss/kd": 1.807000756263733, "loss/lm": 0.3693631887435913, "step": 2325 }, { "epoch": 0.4775200164237323, "grad_norm": 1.1446177570165537, "kd_ratio": 0.5, "learning_rate": 1.1213999459285132e-05, "loss": 1.1992087364196777, "loss/kd": 1.9700677394866943, "loss/lm": 0.42834967374801636, "step": 2326 }, { "epoch": 0.47772531307739685, "grad_norm": 1.0470769978619432, "kd_ratio": 0.5, "learning_rate": 1.1207398098157371e-05, "loss": 1.8461682796478271, "loss/kd": 3.3676021099090576, "loss/lm": 0.3247343897819519, "step": 2327 }, { "epoch": 0.4779306097310614, "grad_norm": 0.9714064533036869, "kd_ratio": 0.5, "learning_rate": 1.1200796203042912e-05, "loss": 1.2189539670944214, "loss/kd": 1.995185136795044, "loss/lm": 0.44272273778915405, "step": 2328 }, { "epoch": 0.4781359063847259, "grad_norm": 1.1714023723800266, "kd_ratio": 0.5, "learning_rate": 1.1194193776861522e-05, "loss": 1.0758357048034668, "loss/kd": 1.7375609874725342, "loss/lm": 0.4141104519367218, "step": 2329 }, { "epoch": 0.47834120303839045, "grad_norm": 1.169114417314411, "kd_ratio": 0.5, "learning_rate": 1.1187590822533209e-05, "loss": 1.100884199142456, "loss/kd": 1.7278367280960083, "loss/lm": 0.47393175959587097, "step": 2330 }, { "epoch": 0.47854649969205504, "grad_norm": 1.0424093479130783, "kd_ratio": 0.5, "learning_rate": 1.1180987342978209e-05, "loss": 1.2187639474868774, "loss/kd": 2.0118165016174316, "loss/lm": 0.425711452960968, "step": 2331 }, { "epoch": 0.47875179634571957, "grad_norm": 1.0058551641790519, "kd_ratio": 0.5, "learning_rate": 1.117438334111699e-05, "loss": 1.0781863927841187, "loss/kd": 1.8468374013900757, "loss/lm": 0.309535413980484, "step": 2332 }, { "epoch": 0.4789570929993841, "grad_norm": 1.309590001004909, "kd_ratio": 0.5, "learning_rate": 1.116777881987026e-05, "loss": 1.3259594440460205, "loss/kd": 2.1834137439727783, "loss/lm": 0.46850526332855225, "step": 2333 }, { "epoch": 0.47916238965304864, "grad_norm": 1.607576785749306, "kd_ratio": 0.5, "learning_rate": 1.1161173782158943e-05, "loss": 0.9947628974914551, "loss/kd": 1.6091128587722778, "loss/lm": 0.3804129362106323, "step": 2334 }, { "epoch": 0.4793676863067132, "grad_norm": 1.2589138023096273, "kd_ratio": 0.5, "learning_rate": 1.1154568230904204e-05, "loss": 1.1631031036376953, "loss/kd": 1.9634028673171997, "loss/lm": 0.3628034293651581, "step": 2335 }, { "epoch": 0.47957298296037776, "grad_norm": 1.1519831436443002, "kd_ratio": 0.5, "learning_rate": 1.1147962169027427e-05, "loss": 1.0107680559158325, "loss/kd": 1.677558183670044, "loss/lm": 0.34397783875465393, "step": 2336 }, { "epoch": 0.4797782796140423, "grad_norm": 1.2151346147818445, "kd_ratio": 0.5, "learning_rate": 1.1141355599450221e-05, "loss": 1.058047890663147, "loss/kd": 1.7155719995498657, "loss/lm": 0.40052372217178345, "step": 2337 }, { "epoch": 0.4799835762677068, "grad_norm": 1.077557029924806, "kd_ratio": 0.5, "learning_rate": 1.113474852509443e-05, "loss": 1.298750877380371, "loss/kd": 2.2296621799468994, "loss/lm": 0.3678395748138428, "step": 2338 }, { "epoch": 0.48018887292137136, "grad_norm": 1.1847725507045634, "kd_ratio": 0.5, "learning_rate": 1.1128140948882107e-05, "loss": 1.1142960786819458, "loss/kd": 1.8424264192581177, "loss/lm": 0.3861658275127411, "step": 2339 }, { "epoch": 0.48039416957503595, "grad_norm": 1.2398967644027488, "kd_ratio": 0.5, "learning_rate": 1.1121532873735537e-05, "loss": 1.251777172088623, "loss/kd": 2.1723482608795166, "loss/lm": 0.3312060534954071, "step": 2340 }, { "epoch": 0.4805994662287005, "grad_norm": 1.1778381437747854, "kd_ratio": 0.5, "learning_rate": 1.1114924302577226e-05, "loss": 1.1282806396484375, "loss/kd": 1.9117693901062012, "loss/lm": 0.344791978597641, "step": 2341 }, { "epoch": 0.480804762882365, "grad_norm": 1.5041670020298448, "kd_ratio": 0.5, "learning_rate": 1.110831523832989e-05, "loss": 1.9526838064193726, "loss/kd": 3.546572685241699, "loss/lm": 0.3587949872016907, "step": 2342 }, { "epoch": 0.48101005953602954, "grad_norm": 1.3747067043482066, "kd_ratio": 0.5, "learning_rate": 1.1101705683916473e-05, "loss": 1.0980266332626343, "loss/kd": 1.743147850036621, "loss/lm": 0.45290547609329224, "step": 2343 }, { "epoch": 0.48121535618969413, "grad_norm": 1.2489444019569353, "kd_ratio": 0.5, "learning_rate": 1.1095095642260131e-05, "loss": 0.8874729871749878, "loss/kd": 1.383700966835022, "loss/lm": 0.3912450075149536, "step": 2344 }, { "epoch": 0.48142065284335867, "grad_norm": 2.1423571299439303, "kd_ratio": 0.5, "learning_rate": 1.1088485116284238e-05, "loss": 1.0624613761901855, "loss/kd": 1.7571864128112793, "loss/lm": 0.36773642897605896, "step": 2345 }, { "epoch": 0.4816259494970232, "grad_norm": 1.2306530285108819, "kd_ratio": 0.5, "learning_rate": 1.1081874108912381e-05, "loss": 1.0948433876037598, "loss/kd": 1.8104363679885864, "loss/lm": 0.37925034761428833, "step": 2346 }, { "epoch": 0.48183124615068773, "grad_norm": 1.4623914929056285, "kd_ratio": 0.5, "learning_rate": 1.1075262623068352e-05, "loss": 1.3397668600082397, "loss/kd": 2.2871909141540527, "loss/lm": 0.39234286546707153, "step": 2347 }, { "epoch": 0.48203654280435226, "grad_norm": 1.7831785162410507, "kd_ratio": 0.5, "learning_rate": 1.1068650661676173e-05, "loss": 0.9662396907806396, "loss/kd": 1.5308247804641724, "loss/lm": 0.40165454149246216, "step": 2348 }, { "epoch": 0.48224183945801685, "grad_norm": 1.162471159738106, "kd_ratio": 0.5, "learning_rate": 1.1062038227660056e-05, "loss": 1.0725451707839966, "loss/kd": 1.8220306634902954, "loss/lm": 0.32305967807769775, "step": 2349 }, { "epoch": 0.4824471361116814, "grad_norm": 1.3460362347716015, "kd_ratio": 0.5, "learning_rate": 1.1055425323944438e-05, "loss": 1.0511226654052734, "loss/kd": 1.7327457666397095, "loss/lm": 0.3694995939731598, "step": 2350 }, { "epoch": 0.4826524327653459, "grad_norm": 1.451617877984142, "kd_ratio": 0.5, "learning_rate": 1.1048811953453955e-05, "loss": 1.2148264646530151, "loss/kd": 2.0746450424194336, "loss/lm": 0.35500794649124146, "step": 2351 }, { "epoch": 0.48285772941901045, "grad_norm": 1.179726709345192, "kd_ratio": 0.5, "learning_rate": 1.1042198119113451e-05, "loss": 1.013425588607788, "loss/kd": 1.678725242614746, "loss/lm": 0.34812599420547485, "step": 2352 }, { "epoch": 0.48306302607267504, "grad_norm": 1.088210153637185, "kd_ratio": 0.5, "learning_rate": 1.103558382384798e-05, "loss": 1.3724265098571777, "loss/kd": 2.378481149673462, "loss/lm": 0.3663717806339264, "step": 2353 }, { "epoch": 0.48326832272633957, "grad_norm": 1.8955979398534448, "kd_ratio": 0.5, "learning_rate": 1.1028969070582789e-05, "loss": 1.8933652639389038, "loss/kd": 3.4713637828826904, "loss/lm": 0.3153667151927948, "step": 2354 }, { "epoch": 0.4834736193800041, "grad_norm": 1.636516327070009, "kd_ratio": 0.5, "learning_rate": 1.1022353862243338e-05, "loss": 1.0574922561645508, "loss/kd": 1.7417482137680054, "loss/lm": 0.37323617935180664, "step": 2355 }, { "epoch": 0.48367891603366864, "grad_norm": 1.2664744408539736, "kd_ratio": 0.5, "learning_rate": 1.1015738201755283e-05, "loss": 1.1330947875976562, "loss/kd": 1.8792310953140259, "loss/lm": 0.38695859909057617, "step": 2356 }, { "epoch": 0.48388421268733317, "grad_norm": 1.853978154261468, "kd_ratio": 0.5, "learning_rate": 1.1009122092044485e-05, "loss": 0.8867329359054565, "loss/kd": 1.4072784185409546, "loss/lm": 0.3661873936653137, "step": 2357 }, { "epoch": 0.48408950934099776, "grad_norm": 1.4225966928870328, "kd_ratio": 0.5, "learning_rate": 1.1002505536036997e-05, "loss": 1.0123658180236816, "loss/kd": 1.6102536916732788, "loss/lm": 0.41447803378105164, "step": 2358 }, { "epoch": 0.4842948059946623, "grad_norm": 1.242699385876343, "kd_ratio": 0.5, "learning_rate": 1.0995888536659067e-05, "loss": 1.0325977802276611, "loss/kd": 1.7074867486953735, "loss/lm": 0.3577089309692383, "step": 2359 }, { "epoch": 0.4845001026483268, "grad_norm": 1.6956036300506498, "kd_ratio": 0.5, "learning_rate": 1.0989271096837156e-05, "loss": 1.1697165966033936, "loss/kd": 1.9931193590164185, "loss/lm": 0.34631380438804626, "step": 2360 }, { "epoch": 0.48470539930199136, "grad_norm": 1.352150403768616, "kd_ratio": 0.5, "learning_rate": 1.0982653219497901e-05, "loss": 0.9836455583572388, "loss/kd": 1.5453859567642212, "loss/lm": 0.4219052195549011, "step": 2361 }, { "epoch": 0.48491069595565595, "grad_norm": 1.0928510069168809, "kd_ratio": 0.5, "learning_rate": 1.0976034907568143e-05, "loss": 2.0024609565734863, "loss/kd": 3.763697862625122, "loss/lm": 0.2412240207195282, "step": 2362 }, { "epoch": 0.4851159926093205, "grad_norm": 2.1392082309481304, "kd_ratio": 0.5, "learning_rate": 1.096941616397491e-05, "loss": 0.9720731973648071, "loss/kd": 1.6071873903274536, "loss/lm": 0.33695903420448303, "step": 2363 }, { "epoch": 0.485321289262985, "grad_norm": 1.9044188827297976, "kd_ratio": 0.5, "learning_rate": 1.0962796991645424e-05, "loss": 1.4137412309646606, "loss/kd": 2.4844937324523926, "loss/lm": 0.34298866987228394, "step": 2364 }, { "epoch": 0.48552658591664954, "grad_norm": 1.0319783905017816, "kd_ratio": 0.5, "learning_rate": 1.0956177393507095e-05, "loss": 1.4064797163009644, "loss/kd": 2.445237398147583, "loss/lm": 0.3677219748497009, "step": 2365 }, { "epoch": 0.4857318825703141, "grad_norm": 1.7714552374465744, "kd_ratio": 0.5, "learning_rate": 1.0949557372487523e-05, "loss": 1.9072085618972778, "loss/kd": 3.490307569503784, "loss/lm": 0.3241094648838043, "step": 2366 }, { "epoch": 0.48593717922397867, "grad_norm": 1.4899021723573145, "kd_ratio": 0.5, "learning_rate": 1.0942936931514492e-05, "loss": 1.07761812210083, "loss/kd": 1.8195013999938965, "loss/lm": 0.3357347548007965, "step": 2367 }, { "epoch": 0.4861424758776432, "grad_norm": 1.2971638185869767, "kd_ratio": 0.5, "learning_rate": 1.0936316073515973e-05, "loss": 0.9390382170677185, "loss/kd": 1.5472607612609863, "loss/lm": 0.33081570267677307, "step": 2368 }, { "epoch": 0.48634777253130773, "grad_norm": 1.6827559724544863, "kd_ratio": 0.5, "learning_rate": 1.0929694801420128e-05, "loss": 1.196424961090088, "loss/kd": 2.0780060291290283, "loss/lm": 0.3148438632488251, "step": 2369 }, { "epoch": 0.48655306918497226, "grad_norm": 1.2946087452445694, "kd_ratio": 0.5, "learning_rate": 1.0923073118155287e-05, "loss": 1.020878553390503, "loss/kd": 1.6312204599380493, "loss/lm": 0.4105365574359894, "step": 2370 }, { "epoch": 0.48675836583863685, "grad_norm": 0.920027762856283, "kd_ratio": 0.5, "learning_rate": 1.0916451026649981e-05, "loss": 1.2355432510375977, "loss/kd": 2.0117568969726562, "loss/lm": 0.4593295156955719, "step": 2371 }, { "epoch": 0.4869636624923014, "grad_norm": 1.1562082754382834, "kd_ratio": 0.5, "learning_rate": 1.0909828529832907e-05, "loss": 1.2405900955200195, "loss/kd": 2.0954813957214355, "loss/lm": 0.38569867610931396, "step": 2372 }, { "epoch": 0.4871689591459659, "grad_norm": 1.7478754930276086, "kd_ratio": 0.5, "learning_rate": 1.0903205630632942e-05, "loss": 1.0701205730438232, "loss/kd": 1.7855651378631592, "loss/lm": 0.3546760082244873, "step": 2373 }, { "epoch": 0.48737425579963045, "grad_norm": 0.9553788416347021, "kd_ratio": 0.5, "learning_rate": 1.0896582331979154e-05, "loss": 1.1272265911102295, "loss/kd": 1.8984731435775757, "loss/lm": 0.35597994923591614, "step": 2374 }, { "epoch": 0.487579552453295, "grad_norm": 1.1907012361219045, "kd_ratio": 0.5, "learning_rate": 1.088995863680077e-05, "loss": 1.1637433767318726, "loss/kd": 1.9997235536575317, "loss/lm": 0.32776322960853577, "step": 2375 }, { "epoch": 0.4877848491069596, "grad_norm": 1.4175040528302723, "kd_ratio": 0.5, "learning_rate": 1.0883334548027202e-05, "loss": 1.0661625862121582, "loss/kd": 1.7776490449905396, "loss/lm": 0.3546760380268097, "step": 2376 }, { "epoch": 0.4879901457606241, "grad_norm": 1.0470103403238997, "kd_ratio": 0.5, "learning_rate": 1.0876710068588038e-05, "loss": 1.011617660522461, "loss/kd": 1.67777681350708, "loss/lm": 0.34545841813087463, "step": 2377 }, { "epoch": 0.48819544241428864, "grad_norm": 0.9602051332866584, "kd_ratio": 0.5, "learning_rate": 1.0870085201413034e-05, "loss": 0.8675553798675537, "loss/kd": 1.3958768844604492, "loss/lm": 0.3392339050769806, "step": 2378 }, { "epoch": 0.48840073906795317, "grad_norm": 1.2605669027287285, "kd_ratio": 0.5, "learning_rate": 1.0863459949432122e-05, "loss": 0.9583179950714111, "loss/kd": 1.5882444381713867, "loss/lm": 0.32839149236679077, "step": 2379 }, { "epoch": 0.48860603572161776, "grad_norm": 1.2784188494507682, "kd_ratio": 0.5, "learning_rate": 1.08568343155754e-05, "loss": 1.0592087507247925, "loss/kd": 1.78415048122406, "loss/lm": 0.33426710963249207, "step": 2380 }, { "epoch": 0.4888113323752823, "grad_norm": 1.1161160379150408, "kd_ratio": 0.5, "learning_rate": 1.0850208302773133e-05, "loss": 0.8617123365402222, "loss/kd": 1.3761823177337646, "loss/lm": 0.34724241495132446, "step": 2381 }, { "epoch": 0.4890166290289468, "grad_norm": 1.2358805112744566, "kd_ratio": 0.5, "learning_rate": 1.084358191395576e-05, "loss": 1.2062829732894897, "loss/kd": 2.039640426635742, "loss/lm": 0.37292543053627014, "step": 2382 }, { "epoch": 0.48922192568261136, "grad_norm": 1.5540602145527795, "kd_ratio": 0.5, "learning_rate": 1.0836955152053883e-05, "loss": 1.013187050819397, "loss/kd": 1.7129565477371216, "loss/lm": 0.31341761350631714, "step": 2383 }, { "epoch": 0.4894272223362759, "grad_norm": 0.981534241146532, "kd_ratio": 0.5, "learning_rate": 1.0830328019998269e-05, "loss": 1.0811253786087036, "loss/kd": 1.8503258228302002, "loss/lm": 0.31192493438720703, "step": 2384 }, { "epoch": 0.4896325189899405, "grad_norm": 1.0783234711297527, "kd_ratio": 0.5, "learning_rate": 1.0823700520719846e-05, "loss": 0.9911071062088013, "loss/kd": 1.532706379890442, "loss/lm": 0.44950786232948303, "step": 2385 }, { "epoch": 0.489837815643605, "grad_norm": 1.0453953789160986, "kd_ratio": 0.5, "learning_rate": 1.0817072657149713e-05, "loss": 0.8677278757095337, "loss/kd": 1.3936798572540283, "loss/lm": 0.34177595376968384, "step": 2386 }, { "epoch": 0.49004311229726955, "grad_norm": 1.0421011462783738, "kd_ratio": 0.5, "learning_rate": 1.081044443221912e-05, "loss": 0.9504553079605103, "loss/kd": 1.5139143466949463, "loss/lm": 0.3869962692260742, "step": 2387 }, { "epoch": 0.4902484089509341, "grad_norm": 0.9561978872958088, "kd_ratio": 0.5, "learning_rate": 1.0803815848859485e-05, "loss": 1.1323966979980469, "loss/kd": 1.9116185903549194, "loss/lm": 0.35317492485046387, "step": 2388 }, { "epoch": 0.49045370560459867, "grad_norm": 0.9200730115203718, "kd_ratio": 0.5, "learning_rate": 1.0797186910002375e-05, "loss": 1.0422210693359375, "loss/kd": 1.718150019645691, "loss/lm": 0.36629214882850647, "step": 2389 }, { "epoch": 0.4906590022582632, "grad_norm": 1.3852140482112534, "kd_ratio": 0.5, "learning_rate": 1.0790557618579527e-05, "loss": 1.1217401027679443, "loss/kd": 1.911017894744873, "loss/lm": 0.3324623107910156, "step": 2390 }, { "epoch": 0.49086429891192773, "grad_norm": 1.1487162005086327, "kd_ratio": 0.5, "learning_rate": 1.0783927977522819e-05, "loss": 0.8223556876182556, "loss/kd": 1.3108819723129272, "loss/lm": 0.3338293731212616, "step": 2391 }, { "epoch": 0.49106959556559227, "grad_norm": 1.3571435850213076, "kd_ratio": 0.5, "learning_rate": 1.0777297989764297e-05, "loss": 1.0233551263809204, "loss/kd": 1.7694504261016846, "loss/lm": 0.2772597670555115, "step": 2392 }, { "epoch": 0.49127489221925685, "grad_norm": 0.9502979898848307, "kd_ratio": 0.5, "learning_rate": 1.0770667658236156e-05, "loss": 1.0940625667572021, "loss/kd": 1.853334903717041, "loss/lm": 0.3347901999950409, "step": 2393 }, { "epoch": 0.4914801888729214, "grad_norm": 1.4837803245249295, "kd_ratio": 0.5, "learning_rate": 1.0764036985870733e-05, "loss": 0.8673577904701233, "loss/kd": 1.3902522325515747, "loss/lm": 0.3444633185863495, "step": 2394 }, { "epoch": 0.4916854855265859, "grad_norm": 1.1072800079156242, "kd_ratio": 0.5, "learning_rate": 1.0757405975600534e-05, "loss": 1.0507627725601196, "loss/kd": 1.689640998840332, "loss/lm": 0.4118845462799072, "step": 2395 }, { "epoch": 0.49189078218025045, "grad_norm": 1.0298602829277603, "kd_ratio": 0.5, "learning_rate": 1.0750774630358204e-05, "loss": 1.126991629600525, "loss/kd": 1.830863118171692, "loss/lm": 0.42312005162239075, "step": 2396 }, { "epoch": 0.492096078833915, "grad_norm": 1.162946814415971, "kd_ratio": 0.5, "learning_rate": 1.0744142953076535e-05, "loss": 1.107401967048645, "loss/kd": 1.9162542819976807, "loss/lm": 0.2985496520996094, "step": 2397 }, { "epoch": 0.4923013754875796, "grad_norm": 1.0378495791220252, "kd_ratio": 0.5, "learning_rate": 1.0737510946688468e-05, "loss": 0.9901957511901855, "loss/kd": 1.6256452798843384, "loss/lm": 0.3547461926937103, "step": 2398 }, { "epoch": 0.4925066721412441, "grad_norm": 1.121963898543215, "kd_ratio": 0.5, "learning_rate": 1.0730878614127087e-05, "loss": 0.9061889052391052, "loss/kd": 1.4530943632125854, "loss/lm": 0.3592834174633026, "step": 2399 }, { "epoch": 0.49271196879490864, "grad_norm": 1.1845656620203702, "kd_ratio": 0.5, "learning_rate": 1.0724245958325629e-05, "loss": 1.098128318786621, "loss/kd": 1.8280003070831299, "loss/lm": 0.36825644969940186, "step": 2400 }, { "epoch": 0.49291726544857317, "grad_norm": 2.0263352215456605, "kd_ratio": 0.5, "learning_rate": 1.0717612982217464e-05, "loss": 0.8696232438087463, "loss/kd": 1.3506345748901367, "loss/lm": 0.38861194252967834, "step": 2401 }, { "epoch": 0.49312256210223776, "grad_norm": 1.4815479027590197, "kd_ratio": 0.5, "learning_rate": 1.0710979688736107e-05, "loss": 0.8677806854248047, "loss/kd": 1.4387396574020386, "loss/lm": 0.2968217730522156, "step": 2402 }, { "epoch": 0.4933278587559023, "grad_norm": 1.2612346560089542, "kd_ratio": 0.5, "learning_rate": 1.0704346080815218e-05, "loss": 1.4113786220550537, "loss/kd": 2.4919023513793945, "loss/lm": 0.3308548033237457, "step": 2403 }, { "epoch": 0.4935331554095668, "grad_norm": 1.5714380016004326, "kd_ratio": 0.5, "learning_rate": 1.0697712161388584e-05, "loss": 1.0270339250564575, "loss/kd": 1.7180651426315308, "loss/lm": 0.3360027074813843, "step": 2404 }, { "epoch": 0.49373845206323136, "grad_norm": 1.1192164267682163, "kd_ratio": 0.5, "learning_rate": 1.0691077933390145e-05, "loss": 1.9599660634994507, "loss/kd": 3.490459442138672, "loss/lm": 0.4294726252555847, "step": 2405 }, { "epoch": 0.4939437487168959, "grad_norm": 1.5774303786455102, "kd_ratio": 0.5, "learning_rate": 1.0684443399753967e-05, "loss": 1.1327115297317505, "loss/kd": 1.9177666902542114, "loss/lm": 0.3476564586162567, "step": 2406 }, { "epoch": 0.4941490453705605, "grad_norm": 1.053401318387066, "kd_ratio": 0.5, "learning_rate": 1.0677808563414256e-05, "loss": 0.8989375233650208, "loss/kd": 1.41989266872406, "loss/lm": 0.37798240780830383, "step": 2407 }, { "epoch": 0.494354342024225, "grad_norm": 1.2697534058156055, "kd_ratio": 0.5, "learning_rate": 1.067117342730535e-05, "loss": 1.2428094148635864, "loss/kd": 2.154162645339966, "loss/lm": 0.3314562141895294, "step": 2408 }, { "epoch": 0.49455963867788955, "grad_norm": 1.3430673215416178, "kd_ratio": 0.5, "learning_rate": 1.0664537994361717e-05, "loss": 1.1026332378387451, "loss/kd": 1.8339259624481201, "loss/lm": 0.37134063243865967, "step": 2409 }, { "epoch": 0.4947649353315541, "grad_norm": 1.0694480128731179, "kd_ratio": 0.5, "learning_rate": 1.065790226751796e-05, "loss": 1.0387760400772095, "loss/kd": 1.7400455474853516, "loss/lm": 0.33750662207603455, "step": 2410 }, { "epoch": 0.49497023198521867, "grad_norm": 1.2358401116435502, "kd_ratio": 0.5, "learning_rate": 1.0651266249708816e-05, "loss": 1.0542148351669312, "loss/kd": 1.703340768814087, "loss/lm": 0.405088871717453, "step": 2411 }, { "epoch": 0.4951755286388832, "grad_norm": 1.2311432938715683, "kd_ratio": 0.5, "learning_rate": 1.0644629943869138e-05, "loss": 0.8459126949310303, "loss/kd": 1.3816317319869995, "loss/lm": 0.31019365787506104, "step": 2412 }, { "epoch": 0.49538082529254773, "grad_norm": 1.070471919489994, "kd_ratio": 0.5, "learning_rate": 1.0637993352933917e-05, "loss": 0.9372707009315491, "loss/kd": 1.547501564025879, "loss/lm": 0.32703983783721924, "step": 2413 }, { "epoch": 0.49558612194621227, "grad_norm": 1.1265472159439336, "kd_ratio": 0.5, "learning_rate": 1.0631356479838274e-05, "loss": 1.0458106994628906, "loss/kd": 1.7464865446090698, "loss/lm": 0.3451347351074219, "step": 2414 }, { "epoch": 0.4957914185998768, "grad_norm": 1.1118480208579928, "kd_ratio": 0.5, "learning_rate": 1.0624719327517434e-05, "loss": 1.0935784578323364, "loss/kd": 1.818910002708435, "loss/lm": 0.36824697256088257, "step": 2415 }, { "epoch": 0.4959967152535414, "grad_norm": 1.3717538935867668, "kd_ratio": 0.5, "learning_rate": 1.0618081898906774e-05, "loss": 1.0374308824539185, "loss/kd": 1.6911289691925049, "loss/lm": 0.38373276591300964, "step": 2416 }, { "epoch": 0.4962020119072059, "grad_norm": 1.3205751695875942, "kd_ratio": 0.5, "learning_rate": 1.0611444196941769e-05, "loss": 0.8439232707023621, "loss/kd": 1.3568586111068726, "loss/lm": 0.33098796010017395, "step": 2417 }, { "epoch": 0.49640730856087045, "grad_norm": 1.1154448355128208, "kd_ratio": 0.5, "learning_rate": 1.0604806224558028e-05, "loss": 1.0059821605682373, "loss/kd": 1.6389459371566772, "loss/lm": 0.37301844358444214, "step": 2418 }, { "epoch": 0.496612605214535, "grad_norm": 1.2906901779655116, "kd_ratio": 0.5, "learning_rate": 1.0598167984691276e-05, "loss": 1.9472781419754028, "loss/kd": 3.525367021560669, "loss/lm": 0.3691893219947815, "step": 2419 }, { "epoch": 0.4968179018681996, "grad_norm": 1.0661748646212106, "kd_ratio": 0.5, "learning_rate": 1.0591529480277355e-05, "loss": 1.16817045211792, "loss/kd": 1.8785136938095093, "loss/lm": 0.45782724022865295, "step": 2420 }, { "epoch": 0.4970231985218641, "grad_norm": 0.9728366419464375, "kd_ratio": 0.5, "learning_rate": 1.0584890714252229e-05, "loss": 0.9951532483100891, "loss/kd": 1.6341211795806885, "loss/lm": 0.35618531703948975, "step": 2421 }, { "epoch": 0.49722849517552864, "grad_norm": 1.0355381320181916, "kd_ratio": 0.5, "learning_rate": 1.0578251689551971e-05, "loss": 1.2268905639648438, "loss/kd": 2.072427749633789, "loss/lm": 0.3813534080982208, "step": 2422 }, { "epoch": 0.4974337918291932, "grad_norm": 1.2383780087267457, "kd_ratio": 0.5, "learning_rate": 1.057161240911277e-05, "loss": 0.91402667760849, "loss/kd": 1.5184537172317505, "loss/lm": 0.3095996081829071, "step": 2423 }, { "epoch": 0.4976390884828577, "grad_norm": 1.5753240305379057, "kd_ratio": 0.5, "learning_rate": 1.0564972875870936e-05, "loss": 1.1118165254592896, "loss/kd": 1.891213059425354, "loss/lm": 0.3324200510978699, "step": 2424 }, { "epoch": 0.4978443851365223, "grad_norm": 1.0296152214910375, "kd_ratio": 0.5, "learning_rate": 1.0558333092762876e-05, "loss": 1.0182393789291382, "loss/kd": 1.6731771230697632, "loss/lm": 0.3633016347885132, "step": 2425 }, { "epoch": 0.4980496817901868, "grad_norm": 0.9941373349266474, "kd_ratio": 0.5, "learning_rate": 1.0551693062725124e-05, "loss": 0.8758721351623535, "loss/kd": 1.4209121465682983, "loss/lm": 0.3308320641517639, "step": 2426 }, { "epoch": 0.49825497844385136, "grad_norm": 1.0953451903425655, "kd_ratio": 0.5, "learning_rate": 1.0545052788694312e-05, "loss": 1.0733193159103394, "loss/kd": 1.7899179458618164, "loss/lm": 0.35672059655189514, "step": 2427 }, { "epoch": 0.4984602750975159, "grad_norm": 1.0125611538735797, "kd_ratio": 0.5, "learning_rate": 1.053841227360718e-05, "loss": 1.36002779006958, "loss/kd": 2.3905365467071533, "loss/lm": 0.32951903343200684, "step": 2428 }, { "epoch": 0.4986655717511805, "grad_norm": 1.2027282263677372, "kd_ratio": 0.5, "learning_rate": 1.0531771520400582e-05, "loss": 1.0310883522033691, "loss/kd": 1.7177515029907227, "loss/lm": 0.344425231218338, "step": 2429 }, { "epoch": 0.498870868404845, "grad_norm": 1.2204502802962798, "kd_ratio": 0.5, "learning_rate": 1.052513053201147e-05, "loss": 0.8293322324752808, "loss/kd": 1.3194594383239746, "loss/lm": 0.3392050862312317, "step": 2430 }, { "epoch": 0.49907616505850955, "grad_norm": 1.138552020756066, "kd_ratio": 0.5, "learning_rate": 1.0518489311376905e-05, "loss": 1.1480462551116943, "loss/kd": 1.9826669692993164, "loss/lm": 0.31342557072639465, "step": 2431 }, { "epoch": 0.4992814617121741, "grad_norm": 0.9835035209423423, "kd_ratio": 0.5, "learning_rate": 1.0511847861434046e-05, "loss": 1.1623220443725586, "loss/kd": 1.9219352006912231, "loss/lm": 0.40270885825157166, "step": 2432 }, { "epoch": 0.4994867583658386, "grad_norm": 1.1099574739619595, "kd_ratio": 0.5, "learning_rate": 1.050520618512016e-05, "loss": 1.13551664352417, "loss/kd": 1.9793540239334106, "loss/lm": 0.2916792333126068, "step": 2433 }, { "epoch": 0.4996920550195032, "grad_norm": 1.040249982914626, "kd_ratio": 0.5, "learning_rate": 1.049856428537261e-05, "loss": 0.9480573534965515, "loss/kd": 1.558562159538269, "loss/lm": 0.3375525176525116, "step": 2434 }, { "epoch": 0.49989735167316773, "grad_norm": 0.9754525307688592, "kd_ratio": 0.5, "learning_rate": 1.0491922165128853e-05, "loss": 0.8712668418884277, "loss/kd": 1.472071647644043, "loss/lm": 0.2704620957374573, "step": 2435 }, { "epoch": 0.5001026483268323, "grad_norm": 1.0327955840802612, "kd_ratio": 0.5, "learning_rate": 1.0485279827326455e-05, "loss": 0.9530760645866394, "loss/kd": 1.5694513320922852, "loss/lm": 0.33670076727867126, "step": 2436 }, { "epoch": 0.5003079449804968, "grad_norm": 1.3997282686696577, "kd_ratio": 0.5, "learning_rate": 1.047863727490307e-05, "loss": 1.081862449645996, "loss/kd": 1.7789579629898071, "loss/lm": 0.3847668468952179, "step": 2437 }, { "epoch": 0.5005132416341613, "grad_norm": 1.261684350182048, "kd_ratio": 0.5, "learning_rate": 1.0471994510796444e-05, "loss": 1.2754034996032715, "loss/kd": 2.24345064163208, "loss/lm": 0.30735623836517334, "step": 2438 }, { "epoch": 0.5007185382878259, "grad_norm": 0.9357333789124244, "kd_ratio": 0.5, "learning_rate": 1.0465351537944429e-05, "loss": 1.000211477279663, "loss/kd": 1.677407145500183, "loss/lm": 0.32301583886146545, "step": 2439 }, { "epoch": 0.5009238349414905, "grad_norm": 1.2261499629114119, "kd_ratio": 0.5, "learning_rate": 1.0458708359284954e-05, "loss": 1.077184796333313, "loss/kd": 1.7801986932754517, "loss/lm": 0.37417086958885193, "step": 2440 }, { "epoch": 0.501129131595155, "grad_norm": 1.4679462773482157, "kd_ratio": 0.5, "learning_rate": 1.0452064977756051e-05, "loss": 0.9250313639640808, "loss/kd": 1.472009539604187, "loss/lm": 0.3780531585216522, "step": 2441 }, { "epoch": 0.5013344282488196, "grad_norm": 1.4191299089553946, "kd_ratio": 0.5, "learning_rate": 1.0445421396295839e-05, "loss": 1.9536999464035034, "loss/kd": 3.57867169380188, "loss/lm": 0.3287281394004822, "step": 2442 }, { "epoch": 0.5015397249024841, "grad_norm": 1.2139636206808613, "kd_ratio": 0.5, "learning_rate": 1.043877761784252e-05, "loss": 0.952400803565979, "loss/kd": 1.634237289428711, "loss/lm": 0.2705642580986023, "step": 2443 }, { "epoch": 0.5017450215561486, "grad_norm": 1.4116620738634271, "kd_ratio": 0.5, "learning_rate": 1.0432133645334389e-05, "loss": 0.9958764314651489, "loss/kd": 1.6618787050247192, "loss/lm": 0.329874187707901, "step": 2444 }, { "epoch": 0.5019503182098132, "grad_norm": 2.0164653363890697, "kd_ratio": 0.5, "learning_rate": 1.0425489481709824e-05, "loss": 1.005001425743103, "loss/kd": 1.6035350561141968, "loss/lm": 0.4064677953720093, "step": 2445 }, { "epoch": 0.5021556148634777, "grad_norm": 1.7750421971682622, "kd_ratio": 0.5, "learning_rate": 1.0418845129907289e-05, "loss": 1.8277467489242554, "loss/kd": 3.319056272506714, "loss/lm": 0.33643725514411926, "step": 2446 }, { "epoch": 0.5023609115171422, "grad_norm": 1.2846499126949367, "kd_ratio": 0.5, "learning_rate": 1.0412200592865331e-05, "loss": 1.0353755950927734, "loss/kd": 1.6930242776870728, "loss/lm": 0.3777269423007965, "step": 2447 }, { "epoch": 0.5025662081708068, "grad_norm": 2.51953711247945, "kd_ratio": 0.5, "learning_rate": 1.0405555873522576e-05, "loss": 1.0741641521453857, "loss/kd": 1.7385685443878174, "loss/lm": 0.4097597301006317, "step": 2448 }, { "epoch": 0.5027715048244714, "grad_norm": 1.5813799709738492, "kd_ratio": 0.5, "learning_rate": 1.0398910974817736e-05, "loss": 0.829226553440094, "loss/kd": 1.3460100889205933, "loss/lm": 0.31244298815727234, "step": 2449 }, { "epoch": 0.502976801478136, "grad_norm": 1.3478742399780475, "kd_ratio": 0.5, "learning_rate": 1.0392265899689595e-05, "loss": 1.107816219329834, "loss/kd": 1.8442896604537964, "loss/lm": 0.3713426887989044, "step": 2450 }, { "epoch": 0.5031820981318005, "grad_norm": 2.3942597189954506, "kd_ratio": 0.5, "learning_rate": 1.0385620651077024e-05, "loss": 1.2936617136001587, "loss/kd": 2.159398317337036, "loss/lm": 0.4279250204563141, "step": 2451 }, { "epoch": 0.503387394785465, "grad_norm": 2.221808672175423, "kd_ratio": 0.5, "learning_rate": 1.0378975231918966e-05, "loss": 1.1997840404510498, "loss/kd": 2.0345640182495117, "loss/lm": 0.3650040626525879, "step": 2452 }, { "epoch": 0.5035926914391295, "grad_norm": 1.075122394079473, "kd_ratio": 0.5, "learning_rate": 1.037232964515444e-05, "loss": 0.8935390710830688, "loss/kd": 1.4878829717636108, "loss/lm": 0.2991951107978821, "step": 2453 }, { "epoch": 0.5037979880927941, "grad_norm": 2.150522661618345, "kd_ratio": 0.5, "learning_rate": 1.0365683893722531e-05, "loss": 0.9608685374259949, "loss/kd": 1.5720813274383545, "loss/lm": 0.34965571761131287, "step": 2454 }, { "epoch": 0.5040032847464586, "grad_norm": 2.050669830770304, "kd_ratio": 0.5, "learning_rate": 1.0359037980562416e-05, "loss": 1.6741387844085693, "loss/kd": 3.009998083114624, "loss/lm": 0.3382793664932251, "step": 2455 }, { "epoch": 0.5042085814001231, "grad_norm": 1.1163869302251506, "kd_ratio": 0.5, "learning_rate": 1.0352391908613325e-05, "loss": 1.140249490737915, "loss/kd": 1.9208394289016724, "loss/lm": 0.3596595525741577, "step": 2456 }, { "epoch": 0.5044138780537877, "grad_norm": 2.0700210872963796, "kd_ratio": 0.5, "learning_rate": 1.0345745680814566e-05, "loss": 1.954715609550476, "loss/kd": 3.639153003692627, "loss/lm": 0.27027812600135803, "step": 2457 }, { "epoch": 0.5046191747074523, "grad_norm": 1.4875120325454663, "kd_ratio": 0.5, "learning_rate": 1.0339099300105513e-05, "loss": 1.1679660081863403, "loss/kd": 1.979980707168579, "loss/lm": 0.35595133900642395, "step": 2458 }, { "epoch": 0.5048244713611169, "grad_norm": 1.8526263285567326, "kd_ratio": 0.5, "learning_rate": 1.0332452769425619e-05, "loss": 1.0096086263656616, "loss/kd": 1.6591448783874512, "loss/lm": 0.36007240414619446, "step": 2459 }, { "epoch": 0.5050297680147814, "grad_norm": 1.9555850755582957, "kd_ratio": 0.5, "learning_rate": 1.0325806091714381e-05, "loss": 1.2148449420928955, "loss/kd": 2.0021586418151855, "loss/lm": 0.42753127217292786, "step": 2460 }, { "epoch": 0.5052350646684459, "grad_norm": 1.1687595481005768, "kd_ratio": 0.5, "learning_rate": 1.0319159269911384e-05, "loss": 0.8720617890357971, "loss/kd": 1.3830909729003906, "loss/lm": 0.3610325753688812, "step": 2461 }, { "epoch": 0.5054403613221105, "grad_norm": 1.67892563404984, "kd_ratio": 0.5, "learning_rate": 1.0312512306956266e-05, "loss": 1.130298376083374, "loss/kd": 1.9290120601654053, "loss/lm": 0.331584632396698, "step": 2462 }, { "epoch": 0.505645657975775, "grad_norm": 1.665250431584723, "kd_ratio": 0.5, "learning_rate": 1.0305865205788728e-05, "loss": 1.1253259181976318, "loss/kd": 1.8161042928695679, "loss/lm": 0.4345476031303406, "step": 2463 }, { "epoch": 0.5058509546294395, "grad_norm": 1.0567083414212228, "kd_ratio": 0.5, "learning_rate": 1.0299217969348524e-05, "loss": 1.1168667078018188, "loss/kd": 1.7521824836730957, "loss/lm": 0.4815508723258972, "step": 2464 }, { "epoch": 0.506056251283104, "grad_norm": 1.7302305833207994, "kd_ratio": 0.5, "learning_rate": 1.0292570600575489e-05, "loss": 1.0203213691711426, "loss/kd": 1.6906102895736694, "loss/lm": 0.35003238916397095, "step": 2465 }, { "epoch": 0.5062615479367686, "grad_norm": 1.7794564763967686, "kd_ratio": 0.5, "learning_rate": 1.0285923102409499e-05, "loss": 1.0519938468933105, "loss/kd": 1.6580294370651245, "loss/lm": 0.4459581971168518, "step": 2466 }, { "epoch": 0.5064668445904332, "grad_norm": 1.476651775094816, "kd_ratio": 0.5, "learning_rate": 1.0279275477790487e-05, "loss": 1.192511796951294, "loss/kd": 2.062288761138916, "loss/lm": 0.3227347731590271, "step": 2467 }, { "epoch": 0.5066721412440978, "grad_norm": 2.271661964941823, "kd_ratio": 0.5, "learning_rate": 1.027262772965845e-05, "loss": 1.0164480209350586, "loss/kd": 1.6882128715515137, "loss/lm": 0.34468305110931396, "step": 2468 }, { "epoch": 0.5068774378977623, "grad_norm": 1.5313272075735642, "kd_ratio": 0.5, "learning_rate": 1.026597986095344e-05, "loss": 1.0916860103607178, "loss/kd": 1.8091974258422852, "loss/lm": 0.3741745054721832, "step": 2469 }, { "epoch": 0.5070827345514268, "grad_norm": 1.7747703990605308, "kd_ratio": 0.5, "learning_rate": 1.0259331874615555e-05, "loss": 1.2031495571136475, "loss/kd": 2.0293166637420654, "loss/lm": 0.3769824504852295, "step": 2470 }, { "epoch": 0.5072880312050914, "grad_norm": 1.5902913857461678, "kd_ratio": 0.5, "learning_rate": 1.0252683773584953e-05, "loss": 0.8552711009979248, "loss/kd": 1.3662747144699097, "loss/lm": 0.34426742792129517, "step": 2471 }, { "epoch": 0.5074933278587559, "grad_norm": 1.3428550784174063, "kd_ratio": 0.5, "learning_rate": 1.0246035560801833e-05, "loss": 1.064404010772705, "loss/kd": 1.7437572479248047, "loss/lm": 0.3850506544113159, "step": 2472 }, { "epoch": 0.5076986245124204, "grad_norm": 1.8500809351137146, "kd_ratio": 0.5, "learning_rate": 1.0239387239206455e-05, "loss": 1.3487765789031982, "loss/kd": 2.241142511367798, "loss/lm": 0.45641055703163147, "step": 2473 }, { "epoch": 0.507903921166085, "grad_norm": 1.1320171535100492, "kd_ratio": 0.5, "learning_rate": 1.0232738811739116e-05, "loss": 0.9603466391563416, "loss/kd": 1.5767349004745483, "loss/lm": 0.34395837783813477, "step": 2474 }, { "epoch": 0.5081092178197495, "grad_norm": 1.6866343063988543, "kd_ratio": 0.5, "learning_rate": 1.0226090281340168e-05, "loss": 1.1093374490737915, "loss/kd": 1.8452075719833374, "loss/lm": 0.37346726655960083, "step": 2475 }, { "epoch": 0.5083145144734141, "grad_norm": 1.1704202048513155, "kd_ratio": 0.5, "learning_rate": 1.0219441650950007e-05, "loss": 1.0652588605880737, "loss/kd": 1.8091859817504883, "loss/lm": 0.32133179903030396, "step": 2476 }, { "epoch": 0.5085198111270787, "grad_norm": 1.4219992898185292, "kd_ratio": 0.5, "learning_rate": 1.0212792923509067e-05, "loss": 1.1463299989700317, "loss/kd": 1.922549843788147, "loss/lm": 0.3701101243495941, "step": 2477 }, { "epoch": 0.5087251077807432, "grad_norm": 1.4450853225395495, "kd_ratio": 0.5, "learning_rate": 1.0206144101957831e-05, "loss": 0.9367961883544922, "loss/kd": 1.489380121231079, "loss/lm": 0.38421228528022766, "step": 2478 }, { "epoch": 0.5089304044344077, "grad_norm": 0.937801133541415, "kd_ratio": 0.5, "learning_rate": 1.0199495189236828e-05, "loss": 1.119422197341919, "loss/kd": 1.9362469911575317, "loss/lm": 0.3025974929332733, "step": 2479 }, { "epoch": 0.5091357010880723, "grad_norm": 1.3167145563232372, "kd_ratio": 0.5, "learning_rate": 1.0192846188286614e-05, "loss": 1.609654426574707, "loss/kd": 2.854022264480591, "loss/lm": 0.3652867078781128, "step": 2480 }, { "epoch": 0.5093409977417368, "grad_norm": 1.6847448800068474, "kd_ratio": 0.5, "learning_rate": 1.01861971020478e-05, "loss": 1.5043656826019287, "loss/kd": 2.6019012928009033, "loss/lm": 0.4068301022052765, "step": 2481 }, { "epoch": 0.5095462943954013, "grad_norm": 1.7040976563037127, "kd_ratio": 0.5, "learning_rate": 1.0179547933461016e-05, "loss": 0.9701509475708008, "loss/kd": 1.6356111764907837, "loss/lm": 0.30469071865081787, "step": 2482 }, { "epoch": 0.5097515910490659, "grad_norm": 1.740507267810578, "kd_ratio": 0.5, "learning_rate": 1.0172898685466947e-05, "loss": 0.8891834020614624, "loss/kd": 1.4265440702438354, "loss/lm": 0.35182279348373413, "step": 2483 }, { "epoch": 0.5099568877027304, "grad_norm": 1.536717049448316, "kd_ratio": 0.5, "learning_rate": 1.0166249361006301e-05, "loss": 1.0102813243865967, "loss/kd": 1.7013378143310547, "loss/lm": 0.3192247152328491, "step": 2484 }, { "epoch": 0.510162184356395, "grad_norm": 1.7322814532125033, "kd_ratio": 0.5, "learning_rate": 1.0159599963019826e-05, "loss": 1.4189687967300415, "loss/kd": 2.4425902366638184, "loss/lm": 0.3953474462032318, "step": 2485 }, { "epoch": 0.5103674810100596, "grad_norm": 1.9566394589098965, "kd_ratio": 0.5, "learning_rate": 1.0152950494448298e-05, "loss": 1.1854562759399414, "loss/kd": 1.9893954992294312, "loss/lm": 0.38151708245277405, "step": 2486 }, { "epoch": 0.5105727776637241, "grad_norm": 1.1403333544843535, "kd_ratio": 0.5, "learning_rate": 1.0146300958232528e-05, "loss": 1.0362697839736938, "loss/kd": 1.695982575416565, "loss/lm": 0.3765570819377899, "step": 2487 }, { "epoch": 0.5107780743173886, "grad_norm": 2.1237941981736035, "kd_ratio": 0.5, "learning_rate": 1.0139651357313354e-05, "loss": 0.997217059135437, "loss/kd": 1.6655172109603882, "loss/lm": 0.32891684770584106, "step": 2488 }, { "epoch": 0.5109833709710532, "grad_norm": 1.1675411016945783, "kd_ratio": 0.5, "learning_rate": 1.0133001694631648e-05, "loss": 1.0380213260650635, "loss/kd": 1.539991855621338, "loss/lm": 0.5360506772994995, "step": 2489 }, { "epoch": 0.5111886676247177, "grad_norm": 1.60900857419827, "kd_ratio": 0.5, "learning_rate": 1.0126351973128299e-05, "loss": 1.047020673751831, "loss/kd": 1.7100704908370972, "loss/lm": 0.3839707374572754, "step": 2490 }, { "epoch": 0.5113939642783822, "grad_norm": 1.4694465759322308, "kd_ratio": 0.5, "learning_rate": 1.0119702195744236e-05, "loss": 0.9691872596740723, "loss/kd": 1.5732831954956055, "loss/lm": 0.36509132385253906, "step": 2491 }, { "epoch": 0.5115992609320468, "grad_norm": 1.1841845945237197, "kd_ratio": 0.5, "learning_rate": 1.01130523654204e-05, "loss": 0.9888238310813904, "loss/kd": 1.5540647506713867, "loss/lm": 0.42358294129371643, "step": 2492 }, { "epoch": 0.5118045575857113, "grad_norm": 1.3995587296449907, "kd_ratio": 0.5, "learning_rate": 1.0106402485097757e-05, "loss": 1.3243882656097412, "loss/kd": 2.269620656967163, "loss/lm": 0.3791559934616089, "step": 2493 }, { "epoch": 0.512009854239376, "grad_norm": 1.467807420401733, "kd_ratio": 0.5, "learning_rate": 1.0099752557717306e-05, "loss": 1.1163444519042969, "loss/kd": 1.9118421077728271, "loss/lm": 0.3208467662334442, "step": 2494 }, { "epoch": 0.5122151508930405, "grad_norm": 0.9784519939589909, "kd_ratio": 0.5, "learning_rate": 1.0093102586220056e-05, "loss": 1.02672278881073, "loss/kd": 1.6672645807266235, "loss/lm": 0.3861810863018036, "step": 2495 }, { "epoch": 0.512420447546705, "grad_norm": 1.7564323981432337, "kd_ratio": 0.5, "learning_rate": 1.0086452573547036e-05, "loss": 1.219229817390442, "loss/kd": 2.0684311389923096, "loss/lm": 0.3700285255908966, "step": 2496 }, { "epoch": 0.5126257442003695, "grad_norm": 1.6267169362354685, "kd_ratio": 0.5, "learning_rate": 1.0079802522639303e-05, "loss": 1.1330417394638062, "loss/kd": 1.9000732898712158, "loss/lm": 0.36601021885871887, "step": 2497 }, { "epoch": 0.5128310408540341, "grad_norm": 1.0400216467969974, "kd_ratio": 0.5, "learning_rate": 1.0073152436437918e-05, "loss": 1.1046350002288818, "loss/kd": 1.9083290100097656, "loss/lm": 0.30094102025032043, "step": 2498 }, { "epoch": 0.5130363375076986, "grad_norm": 1.9318762318449056, "kd_ratio": 0.5, "learning_rate": 1.0066502317883969e-05, "loss": 1.0415107011795044, "loss/kd": 1.7388572692871094, "loss/lm": 0.344164103269577, "step": 2499 }, { "epoch": 0.5132416341613631, "grad_norm": 1.408024229964708, "kd_ratio": 0.5, "learning_rate": 1.0059852169918547e-05, "loss": 1.0393997430801392, "loss/kd": 1.6844813823699951, "loss/lm": 0.3943180739879608, "step": 2500 }, { "epoch": 0.5134469308150277, "grad_norm": 1.0914296939915327, "kd_ratio": 0.5, "learning_rate": 1.0053201995482766e-05, "loss": 1.4056730270385742, "loss/kd": 2.498542308807373, "loss/lm": 0.3128036856651306, "step": 2501 }, { "epoch": 0.5136522274686922, "grad_norm": 1.8635202195853504, "kd_ratio": 0.5, "learning_rate": 1.0046551797517748e-05, "loss": 1.169528603553772, "loss/kd": 1.9564173221588135, "loss/lm": 0.3826397955417633, "step": 2502 }, { "epoch": 0.5138575241223569, "grad_norm": 1.2882893302356433, "kd_ratio": 0.5, "learning_rate": 1.0039901578964619e-05, "loss": 0.9810352325439453, "loss/kd": 1.6280105113983154, "loss/lm": 0.3340598940849304, "step": 2503 }, { "epoch": 0.5140628207760214, "grad_norm": 1.3355149893883476, "kd_ratio": 0.5, "learning_rate": 1.0033251342764527e-05, "loss": 1.3891960382461548, "loss/kd": 2.3924341201782227, "loss/lm": 0.3859579861164093, "step": 2504 }, { "epoch": 0.5142681174296859, "grad_norm": 1.69281850614461, "kd_ratio": 0.5, "learning_rate": 1.0026601091858618e-05, "loss": 0.9542440176010132, "loss/kd": 1.5622860193252563, "loss/lm": 0.3462020456790924, "step": 2505 }, { "epoch": 0.5144734140833505, "grad_norm": 1.3704615807310598, "kd_ratio": 0.5, "learning_rate": 1.0019950829188047e-05, "loss": 0.8381393551826477, "loss/kd": 1.3517858982086182, "loss/lm": 0.32449281215667725, "step": 2506 }, { "epoch": 0.514678710737015, "grad_norm": 1.6479021181203881, "kd_ratio": 0.5, "learning_rate": 1.0013300557693981e-05, "loss": 0.9087698459625244, "loss/kd": 1.5272530317306519, "loss/lm": 0.290286660194397, "step": 2507 }, { "epoch": 0.5148840073906795, "grad_norm": 1.900406286750179, "kd_ratio": 0.5, "learning_rate": 1.0006650280317573e-05, "loss": 1.0292242765426636, "loss/kd": 1.7110549211502075, "loss/lm": 0.3473937213420868, "step": 2508 }, { "epoch": 0.515089304044344, "grad_norm": 2.29826416282712, "kd_ratio": 0.5, "learning_rate": 1e-05, "loss": 1.1061400175094604, "loss/kd": 1.9060977697372437, "loss/lm": 0.30618226528167725, "step": 2509 }, { "epoch": 0.5152946006980086, "grad_norm": 1.5850102062315847, "kd_ratio": 0.5, "learning_rate": 9.993349719682428e-06, "loss": 1.001692771911621, "loss/kd": 1.6452276706695557, "loss/lm": 0.35815781354904175, "step": 2510 }, { "epoch": 0.5154998973516731, "grad_norm": 2.2460565938183827, "kd_ratio": 0.5, "learning_rate": 9.986699442306025e-06, "loss": 0.9831064939498901, "loss/kd": 1.5868504047393799, "loss/lm": 0.37936264276504517, "step": 2511 }, { "epoch": 0.5157051940053378, "grad_norm": 1.8044324053813414, "kd_ratio": 0.5, "learning_rate": 9.980049170811951e-06, "loss": 1.076865315437317, "loss/kd": 1.845861554145813, "loss/lm": 0.3078691065311432, "step": 2512 }, { "epoch": 0.5159104906590023, "grad_norm": 1.5028645477391638, "kd_ratio": 0.5, "learning_rate": 9.973398908141383e-06, "loss": 0.9058680534362793, "loss/kd": 1.4687604904174805, "loss/lm": 0.3429756462574005, "step": 2513 }, { "epoch": 0.5161157873126668, "grad_norm": 1.4551476299152855, "kd_ratio": 0.5, "learning_rate": 9.966748657235475e-06, "loss": 0.9736350178718567, "loss/kd": 1.5499950647354126, "loss/lm": 0.3972749412059784, "step": 2514 }, { "epoch": 0.5163210839663314, "grad_norm": 2.024249613204227, "kd_ratio": 0.5, "learning_rate": 9.960098421035383e-06, "loss": 1.0770331621170044, "loss/kd": 1.8335398435592651, "loss/lm": 0.3205265402793884, "step": 2515 }, { "epoch": 0.5165263806199959, "grad_norm": 1.783346785074354, "kd_ratio": 0.5, "learning_rate": 9.953448202482259e-06, "loss": 1.2068425416946411, "loss/kd": 2.0645205974578857, "loss/lm": 0.34916451573371887, "step": 2516 }, { "epoch": 0.5167316772736604, "grad_norm": 1.2824558768788132, "kd_ratio": 0.5, "learning_rate": 9.946798004517236e-06, "loss": 0.927076518535614, "loss/kd": 1.4530048370361328, "loss/lm": 0.4011482000350952, "step": 2517 }, { "epoch": 0.516936973927325, "grad_norm": 1.2843670829432556, "kd_ratio": 0.5, "learning_rate": 9.940147830081455e-06, "loss": 1.1584784984588623, "loss/kd": 1.9532978534698486, "loss/lm": 0.3636591136455536, "step": 2518 }, { "epoch": 0.5171422705809895, "grad_norm": 1.5038379785630276, "kd_ratio": 0.5, "learning_rate": 9.933497682116035e-06, "loss": 1.1723109483718872, "loss/kd": 2.010828971862793, "loss/lm": 0.33379295468330383, "step": 2519 }, { "epoch": 0.517347567234654, "grad_norm": 1.0704603231143222, "kd_ratio": 0.5, "learning_rate": 9.926847563562084e-06, "loss": 1.1713554859161377, "loss/kd": 1.9398950338363647, "loss/lm": 0.4028159976005554, "step": 2520 }, { "epoch": 0.5175528638883187, "grad_norm": 1.6265603826338384, "kd_ratio": 0.5, "learning_rate": 9.920197477360702e-06, "loss": 1.4148250818252563, "loss/kd": 2.3993897438049316, "loss/lm": 0.4302605092525482, "step": 2521 }, { "epoch": 0.5177581605419832, "grad_norm": 0.9756865274187645, "kd_ratio": 0.5, "learning_rate": 9.913547426452967e-06, "loss": 1.1098928451538086, "loss/kd": 1.824950098991394, "loss/lm": 0.39483562111854553, "step": 2522 }, { "epoch": 0.5179634571956477, "grad_norm": 1.4792411862418016, "kd_ratio": 0.5, "learning_rate": 9.906897413779949e-06, "loss": 1.1318577527999878, "loss/kd": 1.848807454109192, "loss/lm": 0.4149080514907837, "step": 2523 }, { "epoch": 0.5181687538493123, "grad_norm": 1.0461896988489578, "kd_ratio": 0.5, "learning_rate": 9.900247442282696e-06, "loss": 0.940582811832428, "loss/kd": 1.4877870082855225, "loss/lm": 0.3933786451816559, "step": 2524 }, { "epoch": 0.5183740505029768, "grad_norm": 1.3350323084974187, "kd_ratio": 0.5, "learning_rate": 9.893597514902245e-06, "loss": 1.1146314144134521, "loss/kd": 1.8602441549301147, "loss/lm": 0.3690187931060791, "step": 2525 }, { "epoch": 0.5185793471566413, "grad_norm": 1.1158815662389012, "kd_ratio": 0.5, "learning_rate": 9.886947634579607e-06, "loss": 1.0381381511688232, "loss/kd": 1.7150174379348755, "loss/lm": 0.36125895380973816, "step": 2526 }, { "epoch": 0.5187846438103059, "grad_norm": 1.5072432804843419, "kd_ratio": 0.5, "learning_rate": 9.88029780425577e-06, "loss": 1.1243778467178345, "loss/kd": 1.9213087558746338, "loss/lm": 0.327446848154068, "step": 2527 }, { "epoch": 0.5189899404639704, "grad_norm": 0.9879164701812834, "kd_ratio": 0.5, "learning_rate": 9.873648026871701e-06, "loss": 1.000504970550537, "loss/kd": 1.68239426612854, "loss/lm": 0.3186156749725342, "step": 2528 }, { "epoch": 0.5191952371176349, "grad_norm": 1.2036835956627145, "kd_ratio": 0.5, "learning_rate": 9.866998305368354e-06, "loss": 0.9482721090316772, "loss/kd": 1.5324938297271729, "loss/lm": 0.36405035853385925, "step": 2529 }, { "epoch": 0.5194005337712996, "grad_norm": 1.3912556831845297, "kd_ratio": 0.5, "learning_rate": 9.860348642686648e-06, "loss": 1.0536531209945679, "loss/kd": 1.7541069984436035, "loss/lm": 0.353199303150177, "step": 2530 }, { "epoch": 0.5196058304249641, "grad_norm": 1.2501312814097443, "kd_ratio": 0.5, "learning_rate": 9.853699041767473e-06, "loss": 1.0940406322479248, "loss/kd": 1.828805923461914, "loss/lm": 0.3592754602432251, "step": 2531 }, { "epoch": 0.5198111270786286, "grad_norm": 1.4100344295654437, "kd_ratio": 0.5, "learning_rate": 9.847049505551706e-06, "loss": 0.8563008308410645, "loss/kd": 1.4060981273651123, "loss/lm": 0.306503564119339, "step": 2532 }, { "epoch": 0.5200164237322932, "grad_norm": 1.0340291405374282, "kd_ratio": 0.5, "learning_rate": 9.840400036980176e-06, "loss": 1.2223589420318604, "loss/kd": 2.1151282787323, "loss/lm": 0.3295896351337433, "step": 2533 }, { "epoch": 0.5202217203859577, "grad_norm": 1.3781374335511198, "kd_ratio": 0.5, "learning_rate": 9.8337506389937e-06, "loss": 0.950482189655304, "loss/kd": 1.5223846435546875, "loss/lm": 0.378579705953598, "step": 2534 }, { "epoch": 0.5204270170396222, "grad_norm": 1.3097755238892301, "kd_ratio": 0.5, "learning_rate": 9.827101314533056e-06, "loss": 1.3742527961730957, "loss/kd": 2.417416572570801, "loss/lm": 0.33108896017074585, "step": 2535 }, { "epoch": 0.5206323136932868, "grad_norm": 1.6300424344477944, "kd_ratio": 0.5, "learning_rate": 9.820452066538987e-06, "loss": 1.137245774269104, "loss/kd": 1.831578016281128, "loss/lm": 0.4429135024547577, "step": 2536 }, { "epoch": 0.5208376103469513, "grad_norm": 1.233633313894879, "kd_ratio": 0.5, "learning_rate": 9.813802897952207e-06, "loss": 1.1279405355453491, "loss/kd": 1.9378873109817505, "loss/lm": 0.31799378991127014, "step": 2537 }, { "epoch": 0.5210429070006158, "grad_norm": 1.1120287897310779, "kd_ratio": 0.5, "learning_rate": 9.807153811713386e-06, "loss": 0.9941641092300415, "loss/kd": 1.6624964475631714, "loss/lm": 0.32583171129226685, "step": 2538 }, { "epoch": 0.5212482036542805, "grad_norm": 1.2102362586064428, "kd_ratio": 0.5, "learning_rate": 9.800504810763176e-06, "loss": 1.0300264358520508, "loss/kd": 1.6988786458969116, "loss/lm": 0.36117416620254517, "step": 2539 }, { "epoch": 0.521453500307945, "grad_norm": 1.2333901100828997, "kd_ratio": 0.5, "learning_rate": 9.79385589804217e-06, "loss": 0.9123122692108154, "loss/kd": 1.5365948677062988, "loss/lm": 0.2880297303199768, "step": 2540 }, { "epoch": 0.5216587969616095, "grad_norm": 1.1778483670203117, "kd_ratio": 0.5, "learning_rate": 9.787207076490937e-06, "loss": 1.4015206098556519, "loss/kd": 2.4470157623291016, "loss/lm": 0.3560255169868469, "step": 2541 }, { "epoch": 0.5218640936152741, "grad_norm": 1.118147964920372, "kd_ratio": 0.5, "learning_rate": 9.780558349049998e-06, "loss": 0.9757221937179565, "loss/kd": 1.5334678888320923, "loss/lm": 0.417976438999176, "step": 2542 }, { "epoch": 0.5220693902689386, "grad_norm": 1.3473361023578354, "kd_ratio": 0.5, "learning_rate": 9.773909718659831e-06, "loss": 0.8266373872756958, "loss/kd": 1.3658313751220703, "loss/lm": 0.2874433696269989, "step": 2543 }, { "epoch": 0.5222746869226031, "grad_norm": 1.6185027619422145, "kd_ratio": 0.5, "learning_rate": 9.767261188260886e-06, "loss": 1.2399133443832397, "loss/kd": 2.050198554992676, "loss/lm": 0.4296281635761261, "step": 2544 }, { "epoch": 0.5224799835762677, "grad_norm": 1.1018832662761302, "kd_ratio": 0.5, "learning_rate": 9.760612760793549e-06, "loss": 1.0892506837844849, "loss/kd": 1.7850937843322754, "loss/lm": 0.3934076726436615, "step": 2545 }, { "epoch": 0.5226852802299322, "grad_norm": 1.5466775252186418, "kd_ratio": 0.5, "learning_rate": 9.75396443919817e-06, "loss": 0.9263688325881958, "loss/kd": 1.4660696983337402, "loss/lm": 0.38666802644729614, "step": 2546 }, { "epoch": 0.5228905768835967, "grad_norm": 1.2431702979064148, "kd_ratio": 0.5, "learning_rate": 9.747316226415052e-06, "loss": 1.1133958101272583, "loss/kd": 1.8637871742248535, "loss/lm": 0.36300453543663025, "step": 2547 }, { "epoch": 0.5230958735372614, "grad_norm": 1.4083956841716838, "kd_ratio": 0.5, "learning_rate": 9.74066812538445e-06, "loss": 1.412737488746643, "loss/kd": 2.378788471221924, "loss/lm": 0.44668641686439514, "step": 2548 }, { "epoch": 0.5233011701909259, "grad_norm": 0.9865123721146015, "kd_ratio": 0.5, "learning_rate": 9.734020139046562e-06, "loss": 1.309840202331543, "loss/kd": 2.2920544147491455, "loss/lm": 0.3276260495185852, "step": 2549 }, { "epoch": 0.5235064668445905, "grad_norm": 1.2280295787462268, "kd_ratio": 0.5, "learning_rate": 9.727372270341552e-06, "loss": 1.1100313663482666, "loss/kd": 1.8715178966522217, "loss/lm": 0.348544716835022, "step": 2550 }, { "epoch": 0.523711763498255, "grad_norm": 1.2398016069868139, "kd_ratio": 0.5, "learning_rate": 9.720724522209518e-06, "loss": 1.0307427644729614, "loss/kd": 1.7177282571792603, "loss/lm": 0.34375736117362976, "step": 2551 }, { "epoch": 0.5239170601519195, "grad_norm": 1.5032925232003604, "kd_ratio": 0.5, "learning_rate": 9.714076897590508e-06, "loss": 1.096985101699829, "loss/kd": 1.83559250831604, "loss/lm": 0.3583777844905853, "step": 2552 }, { "epoch": 0.524122356805584, "grad_norm": 1.2783311532422603, "kd_ratio": 0.5, "learning_rate": 9.707429399424514e-06, "loss": 1.1474196910858154, "loss/kd": 1.9499198198318481, "loss/lm": 0.34491968154907227, "step": 2553 }, { "epoch": 0.5243276534592486, "grad_norm": 1.0442712562853211, "kd_ratio": 0.5, "learning_rate": 9.700782030651476e-06, "loss": 1.0226739645004272, "loss/kd": 1.6425281763076782, "loss/lm": 0.4028196632862091, "step": 2554 }, { "epoch": 0.5245329501129131, "grad_norm": 1.8428969662283958, "kd_ratio": 0.5, "learning_rate": 9.694134794211277e-06, "loss": 0.9076322913169861, "loss/kd": 1.490510106086731, "loss/lm": 0.3247544467449188, "step": 2555 }, { "epoch": 0.5247382467665777, "grad_norm": 1.2432123257234047, "kd_ratio": 0.5, "learning_rate": 9.687487693043738e-06, "loss": 0.9715996980667114, "loss/kd": 1.5663727521896362, "loss/lm": 0.376826673746109, "step": 2556 }, { "epoch": 0.5249435434202423, "grad_norm": 1.8332262210881844, "kd_ratio": 0.5, "learning_rate": 9.680840730088619e-06, "loss": 1.1214720010757446, "loss/kd": 1.869840145111084, "loss/lm": 0.3731037974357605, "step": 2557 }, { "epoch": 0.5251488400739068, "grad_norm": 1.0256475362410942, "kd_ratio": 0.5, "learning_rate": 9.67419390828562e-06, "loss": 0.9813084602355957, "loss/kd": 1.617400884628296, "loss/lm": 0.3452160358428955, "step": 2558 }, { "epoch": 0.5253541367275714, "grad_norm": 2.3943067285022566, "kd_ratio": 0.5, "learning_rate": 9.667547230574386e-06, "loss": 0.9854109883308411, "loss/kd": 1.5616618394851685, "loss/lm": 0.4091601073741913, "step": 2559 }, { "epoch": 0.5255594333812359, "grad_norm": 1.2315382644326238, "kd_ratio": 0.5, "learning_rate": 9.660900699894489e-06, "loss": 1.2199945449829102, "loss/kd": 2.0719573497772217, "loss/lm": 0.3680317997932434, "step": 2560 }, { "epoch": 0.5257647300349004, "grad_norm": 1.916277135682965, "kd_ratio": 0.5, "learning_rate": 9.654254319185438e-06, "loss": 1.0383474826812744, "loss/kd": 1.7220901250839233, "loss/lm": 0.3546048700809479, "step": 2561 }, { "epoch": 0.525970026688565, "grad_norm": 1.8136056526640267, "kd_ratio": 0.5, "learning_rate": 9.64760809138668e-06, "loss": 0.9459949135780334, "loss/kd": 1.5544981956481934, "loss/lm": 0.33749163150787354, "step": 2562 }, { "epoch": 0.5261753233422295, "grad_norm": 1.4932270423712242, "kd_ratio": 0.5, "learning_rate": 9.64096201943759e-06, "loss": 1.2643898725509644, "loss/kd": 2.132953405380249, "loss/lm": 0.3958262503147125, "step": 2563 }, { "epoch": 0.526380619995894, "grad_norm": 0.9821941173954989, "kd_ratio": 0.5, "learning_rate": 9.634316106277469e-06, "loss": 1.0652847290039062, "loss/kd": 1.7645652294158936, "loss/lm": 0.36600425839424133, "step": 2564 }, { "epoch": 0.5265859166495586, "grad_norm": 1.4141012151597923, "kd_ratio": 0.5, "learning_rate": 9.627670354845563e-06, "loss": 0.9966681599617004, "loss/kd": 1.6166681051254272, "loss/lm": 0.376668244600296, "step": 2565 }, { "epoch": 0.5267912133032232, "grad_norm": 1.039555505929648, "kd_ratio": 0.5, "learning_rate": 9.621024768081037e-06, "loss": 0.996431827545166, "loss/kd": 1.648290991783142, "loss/lm": 0.34457260370254517, "step": 2566 }, { "epoch": 0.5269965099568877, "grad_norm": 1.3094543864318489, "kd_ratio": 0.5, "learning_rate": 9.61437934892298e-06, "loss": 1.1759637594223022, "loss/kd": 1.985298991203308, "loss/lm": 0.3666285276412964, "step": 2567 }, { "epoch": 0.5272018066105523, "grad_norm": 1.2932414666902392, "kd_ratio": 0.5, "learning_rate": 9.607734100310408e-06, "loss": 1.076897144317627, "loss/kd": 1.812996506690979, "loss/lm": 0.3407977223396301, "step": 2568 }, { "epoch": 0.5274071032642168, "grad_norm": 1.3009055556626372, "kd_ratio": 0.5, "learning_rate": 9.601089025182268e-06, "loss": 1.0597524642944336, "loss/kd": 1.717260718345642, "loss/lm": 0.4022441804409027, "step": 2569 }, { "epoch": 0.5276123999178813, "grad_norm": 1.3084711944338223, "kd_ratio": 0.5, "learning_rate": 9.594444126477426e-06, "loss": 1.3020062446594238, "loss/kd": 2.220402479171753, "loss/lm": 0.3836100995540619, "step": 2570 }, { "epoch": 0.5278176965715459, "grad_norm": 1.5613938647148364, "kd_ratio": 0.5, "learning_rate": 9.587799407134672e-06, "loss": 1.1571166515350342, "loss/kd": 1.9860265254974365, "loss/lm": 0.328206866979599, "step": 2571 }, { "epoch": 0.5280229932252104, "grad_norm": 1.0244177259445242, "kd_ratio": 0.5, "learning_rate": 9.581154870092714e-06, "loss": 1.0682061910629272, "loss/kd": 1.7373675107955933, "loss/lm": 0.39904487133026123, "step": 2572 }, { "epoch": 0.5282282898788749, "grad_norm": 1.0094427769531895, "kd_ratio": 0.5, "learning_rate": 9.57451051829018e-06, "loss": 0.9861953258514404, "loss/kd": 1.6167316436767578, "loss/lm": 0.35565900802612305, "step": 2573 }, { "epoch": 0.5284335865325395, "grad_norm": 1.131645060482016, "kd_ratio": 0.5, "learning_rate": 9.567866354665616e-06, "loss": 0.993285596370697, "loss/kd": 1.602845549583435, "loss/lm": 0.3837256133556366, "step": 2574 }, { "epoch": 0.5286388831862041, "grad_norm": 1.0212906317441979, "kd_ratio": 0.5, "learning_rate": 9.56122238215748e-06, "loss": 0.975839376449585, "loss/kd": 1.5827572345733643, "loss/lm": 0.36892154812812805, "step": 2575 }, { "epoch": 0.5288441798398686, "grad_norm": 1.1662072277910107, "kd_ratio": 0.5, "learning_rate": 9.554578603704164e-06, "loss": 0.9644429087638855, "loss/kd": 1.4739255905151367, "loss/lm": 0.4549602270126343, "step": 2576 }, { "epoch": 0.5290494764935332, "grad_norm": 1.0813540292151371, "kd_ratio": 0.5, "learning_rate": 9.547935022243952e-06, "loss": 1.1137337684631348, "loss/kd": 1.759324550628662, "loss/lm": 0.4681430459022522, "step": 2577 }, { "epoch": 0.5292547731471977, "grad_norm": 1.1472798560997095, "kd_ratio": 0.5, "learning_rate": 9.541291640715047e-06, "loss": 1.0535732507705688, "loss/kd": 1.8502217531204224, "loss/lm": 0.2569248080253601, "step": 2578 }, { "epoch": 0.5294600698008622, "grad_norm": 0.9703973298426886, "kd_ratio": 0.5, "learning_rate": 9.534648462055576e-06, "loss": 1.9238982200622559, "loss/kd": 3.5625996589660645, "loss/lm": 0.28519681096076965, "step": 2579 }, { "epoch": 0.5296653664545268, "grad_norm": 0.9928771267338715, "kd_ratio": 0.5, "learning_rate": 9.528005489203556e-06, "loss": 1.0566843748092651, "loss/kd": 1.7848155498504639, "loss/lm": 0.3285532593727112, "step": 2580 }, { "epoch": 0.5298706631081913, "grad_norm": 0.9044621233587306, "kd_ratio": 0.5, "learning_rate": 9.521362725096933e-06, "loss": 1.1677069664001465, "loss/kd": 1.8758751153945923, "loss/lm": 0.4595387876033783, "step": 2581 }, { "epoch": 0.5300759597618558, "grad_norm": 1.0386742119965147, "kd_ratio": 0.5, "learning_rate": 9.514720172673547e-06, "loss": 1.0347193479537964, "loss/kd": 1.7033119201660156, "loss/lm": 0.36612674593925476, "step": 2582 }, { "epoch": 0.5302812564155204, "grad_norm": 0.9881716578689064, "kd_ratio": 0.5, "learning_rate": 9.50807783487115e-06, "loss": 1.020485758781433, "loss/kd": 1.5775586366653442, "loss/lm": 0.4634128212928772, "step": 2583 }, { "epoch": 0.530486553069185, "grad_norm": 0.9643828628206553, "kd_ratio": 0.5, "learning_rate": 9.501435714627396e-06, "loss": 1.0931426286697388, "loss/kd": 1.7814711332321167, "loss/lm": 0.40481412410736084, "step": 2584 }, { "epoch": 0.5306918497228496, "grad_norm": 1.0108840781609223, "kd_ratio": 0.5, "learning_rate": 9.494793814879841e-06, "loss": 0.9961068630218506, "loss/kd": 1.6209238767623901, "loss/lm": 0.3712899088859558, "step": 2585 }, { "epoch": 0.5308971463765141, "grad_norm": 0.9827667168864564, "kd_ratio": 0.5, "learning_rate": 9.488152138565956e-06, "loss": 0.9885130524635315, "loss/kd": 1.6906578540802002, "loss/lm": 0.2863682508468628, "step": 2586 }, { "epoch": 0.5311024430301786, "grad_norm": 1.005279982241533, "kd_ratio": 0.5, "learning_rate": 9.481510688623098e-06, "loss": 1.2157963514328003, "loss/kd": 2.042961359024048, "loss/lm": 0.38863131403923035, "step": 2587 }, { "epoch": 0.5313077396838431, "grad_norm": 0.91254148006228, "kd_ratio": 0.5, "learning_rate": 9.474869467988534e-06, "loss": 1.510157823562622, "loss/kd": 2.5771212577819824, "loss/lm": 0.44319435954093933, "step": 2588 }, { "epoch": 0.5315130363375077, "grad_norm": 0.9661810478701881, "kd_ratio": 0.5, "learning_rate": 9.468228479599423e-06, "loss": 1.4243748188018799, "loss/kd": 2.4536185264587402, "loss/lm": 0.3951311707496643, "step": 2589 }, { "epoch": 0.5317183329911722, "grad_norm": 1.0681509406956158, "kd_ratio": 0.5, "learning_rate": 9.461587726392822e-06, "loss": 0.9875261187553406, "loss/kd": 1.6199557781219482, "loss/lm": 0.3550964891910553, "step": 2590 }, { "epoch": 0.5319236296448367, "grad_norm": 1.0481959521757924, "kd_ratio": 0.5, "learning_rate": 9.454947211305691e-06, "loss": 0.9314371347427368, "loss/kd": 1.4861366748809814, "loss/lm": 0.3767375648021698, "step": 2591 }, { "epoch": 0.5321289262985013, "grad_norm": 1.0595438791990859, "kd_ratio": 0.5, "learning_rate": 9.448306937274879e-06, "loss": 1.142055630683899, "loss/kd": 1.9141496419906616, "loss/lm": 0.3699616491794586, "step": 2592 }, { "epoch": 0.5323342229521659, "grad_norm": 1.2216633706847138, "kd_ratio": 0.5, "learning_rate": 9.441666907237127e-06, "loss": 1.1006150245666504, "loss/kd": 1.7838447093963623, "loss/lm": 0.4173853099346161, "step": 2593 }, { "epoch": 0.5325395196058305, "grad_norm": 1.017671265120061, "kd_ratio": 0.5, "learning_rate": 9.435027124129071e-06, "loss": 1.0938591957092285, "loss/kd": 1.8207345008850098, "loss/lm": 0.36698397994041443, "step": 2594 }, { "epoch": 0.532744816259495, "grad_norm": 1.0248753962086845, "kd_ratio": 0.5, "learning_rate": 9.42838759088723e-06, "loss": 1.068932056427002, "loss/kd": 1.7659780979156494, "loss/lm": 0.3718859851360321, "step": 2595 }, { "epoch": 0.5329501129131595, "grad_norm": 1.3939419374043633, "kd_ratio": 0.5, "learning_rate": 9.421748310448032e-06, "loss": 1.0831665992736816, "loss/kd": 1.773625135421753, "loss/lm": 0.39270809292793274, "step": 2596 }, { "epoch": 0.5331554095668241, "grad_norm": 1.0685733710834688, "kd_ratio": 0.5, "learning_rate": 9.415109285747773e-06, "loss": 1.1429203748703003, "loss/kd": 1.9442492723464966, "loss/lm": 0.34159141778945923, "step": 2597 }, { "epoch": 0.5333607062204886, "grad_norm": 1.2609577608765885, "kd_ratio": 0.5, "learning_rate": 9.408470519722646e-06, "loss": 0.9451083540916443, "loss/kd": 1.524559736251831, "loss/lm": 0.36565694212913513, "step": 2598 }, { "epoch": 0.5335660028741531, "grad_norm": 1.4036764921341747, "kd_ratio": 0.5, "learning_rate": 9.401832015308728e-06, "loss": 1.0065085887908936, "loss/kd": 1.6158161163330078, "loss/lm": 0.3972010016441345, "step": 2599 }, { "epoch": 0.5337712995278177, "grad_norm": 1.1247471643484552, "kd_ratio": 0.5, "learning_rate": 9.395193775441977e-06, "loss": 1.174215316772461, "loss/kd": 2.022704601287842, "loss/lm": 0.3257259130477905, "step": 2600 }, { "epoch": 0.5339765961814822, "grad_norm": 1.1174129005061986, "kd_ratio": 0.5, "learning_rate": 9.388555803058233e-06, "loss": 0.9908872246742249, "loss/kd": 1.6158111095428467, "loss/lm": 0.36596331000328064, "step": 2601 }, { "epoch": 0.5341818928351468, "grad_norm": 1.2619773980779287, "kd_ratio": 0.5, "learning_rate": 9.38191810109323e-06, "loss": 0.97498619556427, "loss/kd": 1.6514723300933838, "loss/lm": 0.2985000014305115, "step": 2602 }, { "epoch": 0.5343871894888114, "grad_norm": 1.501434460054872, "kd_ratio": 0.5, "learning_rate": 9.375280672482567e-06, "loss": 1.194597840309143, "loss/kd": 2.0070552825927734, "loss/lm": 0.38214030861854553, "step": 2603 }, { "epoch": 0.5345924861424759, "grad_norm": 1.6171983289930314, "kd_ratio": 0.5, "learning_rate": 9.368643520161731e-06, "loss": 1.0225168466567993, "loss/kd": 1.6430491209030151, "loss/lm": 0.40198463201522827, "step": 2604 }, { "epoch": 0.5347977827961404, "grad_norm": 2.477563270125658, "kd_ratio": 0.5, "learning_rate": 9.362006647066084e-06, "loss": 0.8799464106559753, "loss/kd": 1.3703010082244873, "loss/lm": 0.389591783285141, "step": 2605 }, { "epoch": 0.535003079449805, "grad_norm": 1.3721298694749273, "kd_ratio": 0.5, "learning_rate": 9.355370056130864e-06, "loss": 1.0307307243347168, "loss/kd": 1.754411220550537, "loss/lm": 0.30705010890960693, "step": 2606 }, { "epoch": 0.5352083761034695, "grad_norm": 1.7292607319778288, "kd_ratio": 0.5, "learning_rate": 9.348733750291186e-06, "loss": 1.1230907440185547, "loss/kd": 1.9053518772125244, "loss/lm": 0.3408295214176178, "step": 2607 }, { "epoch": 0.535413672757134, "grad_norm": 1.9372291618880322, "kd_ratio": 0.5, "learning_rate": 9.342097732482041e-06, "loss": 0.970242440700531, "loss/kd": 1.6211979389190674, "loss/lm": 0.31928691267967224, "step": 2608 }, { "epoch": 0.5356189694107986, "grad_norm": 1.3142119609270104, "kd_ratio": 0.5, "learning_rate": 9.335462005638287e-06, "loss": 0.9702719449996948, "loss/kd": 1.6838812828063965, "loss/lm": 0.25666266679763794, "step": 2609 }, { "epoch": 0.5358242660644631, "grad_norm": 1.2193708462252273, "kd_ratio": 0.5, "learning_rate": 9.328826572694657e-06, "loss": 1.0386381149291992, "loss/kd": 1.738994836807251, "loss/lm": 0.3382814824581146, "step": 2610 }, { "epoch": 0.5360295627181277, "grad_norm": 1.6523168344339474, "kd_ratio": 0.5, "learning_rate": 9.322191436585745e-06, "loss": 0.8105379939079285, "loss/kd": 1.268072485923767, "loss/lm": 0.35300353169441223, "step": 2611 }, { "epoch": 0.5362348593717923, "grad_norm": 1.0207551765077865, "kd_ratio": 0.5, "learning_rate": 9.315556600246035e-06, "loss": 1.0423917770385742, "loss/kd": 1.7093030214309692, "loss/lm": 0.37548065185546875, "step": 2612 }, { "epoch": 0.5364401560254568, "grad_norm": 1.4798851238794095, "kd_ratio": 0.5, "learning_rate": 9.308922066609858e-06, "loss": 1.5262948274612427, "loss/kd": 2.7360453605651855, "loss/lm": 0.3165443241596222, "step": 2613 }, { "epoch": 0.5366454526791213, "grad_norm": 1.5003229512468108, "kd_ratio": 0.5, "learning_rate": 9.302287838611417e-06, "loss": 1.140547513961792, "loss/kd": 1.95033860206604, "loss/lm": 0.33075645565986633, "step": 2614 }, { "epoch": 0.5368507493327859, "grad_norm": 1.9782424008664068, "kd_ratio": 0.5, "learning_rate": 9.295653919184787e-06, "loss": 1.0428236722946167, "loss/kd": 1.7063730955123901, "loss/lm": 0.37927424907684326, "step": 2615 }, { "epoch": 0.5370560459864504, "grad_norm": 1.0003864623627692, "kd_ratio": 0.5, "learning_rate": 9.289020311263891e-06, "loss": 0.8103258013725281, "loss/kd": 1.317079782485962, "loss/lm": 0.3035718500614166, "step": 2616 }, { "epoch": 0.5372613426401149, "grad_norm": 2.190124972378107, "kd_ratio": 0.5, "learning_rate": 9.282387017782537e-06, "loss": 0.9110122323036194, "loss/kd": 1.5381160974502563, "loss/lm": 0.2839083969593048, "step": 2617 }, { "epoch": 0.5374666392937795, "grad_norm": 1.5925399165062977, "kd_ratio": 0.5, "learning_rate": 9.275754041674373e-06, "loss": 1.0211734771728516, "loss/kd": 1.6524091958999634, "loss/lm": 0.38993775844573975, "step": 2618 }, { "epoch": 0.537671935947444, "grad_norm": 1.5664363018978706, "kd_ratio": 0.5, "learning_rate": 9.269121385872915e-06, "loss": 1.0018808841705322, "loss/kd": 1.6787129640579224, "loss/lm": 0.32504868507385254, "step": 2619 }, { "epoch": 0.5378772326011086, "grad_norm": 2.139466318682377, "kd_ratio": 0.5, "learning_rate": 9.262489053311537e-06, "loss": 1.0332332849502563, "loss/kd": 1.7492491006851196, "loss/lm": 0.3172173798084259, "step": 2620 }, { "epoch": 0.5380825292547732, "grad_norm": 1.6942735615248348, "kd_ratio": 0.5, "learning_rate": 9.255857046923467e-06, "loss": 1.1945489645004272, "loss/kd": 2.027905225753784, "loss/lm": 0.3611927628517151, "step": 2621 }, { "epoch": 0.5382878259084377, "grad_norm": 1.2683157939713225, "kd_ratio": 0.5, "learning_rate": 9.249225369641798e-06, "loss": 1.1399048566818237, "loss/kd": 1.9187463521957397, "loss/lm": 0.3610633611679077, "step": 2622 }, { "epoch": 0.5384931225621022, "grad_norm": 2.431191119281866, "kd_ratio": 0.5, "learning_rate": 9.242594024399467e-06, "loss": 0.9046696424484253, "loss/kd": 1.4913020133972168, "loss/lm": 0.318037211894989, "step": 2623 }, { "epoch": 0.5386984192157668, "grad_norm": 1.772655913053726, "kd_ratio": 0.5, "learning_rate": 9.235963014129268e-06, "loss": 1.0388894081115723, "loss/kd": 1.704427719116211, "loss/lm": 0.37335118651390076, "step": 2624 }, { "epoch": 0.5389037158694313, "grad_norm": 1.134247411625286, "kd_ratio": 0.5, "learning_rate": 9.229332341763851e-06, "loss": 1.0963550806045532, "loss/kd": 1.8095651865005493, "loss/lm": 0.38314494490623474, "step": 2625 }, { "epoch": 0.5391090125230958, "grad_norm": 2.149599170414898, "kd_ratio": 0.5, "learning_rate": 9.222702010235708e-06, "loss": 0.8505418300628662, "loss/kd": 1.4794104099273682, "loss/lm": 0.22167319059371948, "step": 2626 }, { "epoch": 0.5393143091767604, "grad_norm": 1.164781331519933, "kd_ratio": 0.5, "learning_rate": 9.216072022477183e-06, "loss": 1.0913395881652832, "loss/kd": 1.7062122821807861, "loss/lm": 0.47646698355674744, "step": 2627 }, { "epoch": 0.5395196058304249, "grad_norm": 1.575016671769598, "kd_ratio": 0.5, "learning_rate": 9.209442381420476e-06, "loss": 1.0094999074935913, "loss/kd": 1.6752665042877197, "loss/lm": 0.3437333106994629, "step": 2628 }, { "epoch": 0.5397249024840896, "grad_norm": 1.447330941710306, "kd_ratio": 0.5, "learning_rate": 9.202813089997628e-06, "loss": 0.924630880355835, "loss/kd": 1.4730913639068604, "loss/lm": 0.37617039680480957, "step": 2629 }, { "epoch": 0.5399301991377541, "grad_norm": 1.2484215603788422, "kd_ratio": 0.5, "learning_rate": 9.19618415114052e-06, "loss": 1.123787522315979, "loss/kd": 1.9207652807235718, "loss/lm": 0.32680973410606384, "step": 2630 }, { "epoch": 0.5401354957914186, "grad_norm": 1.419715101281116, "kd_ratio": 0.5, "learning_rate": 9.189555567780882e-06, "loss": 1.2865545749664307, "loss/kd": 2.217820167541504, "loss/lm": 0.35528892278671265, "step": 2631 }, { "epoch": 0.5403407924450832, "grad_norm": 1.091816033372735, "kd_ratio": 0.5, "learning_rate": 9.182927342850287e-06, "loss": 1.0563573837280273, "loss/kd": 1.751289963722229, "loss/lm": 0.36142483353614807, "step": 2632 }, { "epoch": 0.5405460890987477, "grad_norm": 1.69340953756522, "kd_ratio": 0.5, "learning_rate": 9.176299479280155e-06, "loss": 0.9112897515296936, "loss/kd": 1.4977768659591675, "loss/lm": 0.3248026669025421, "step": 2633 }, { "epoch": 0.5407513857524122, "grad_norm": 1.218794319104797, "kd_ratio": 0.5, "learning_rate": 9.169671980001733e-06, "loss": 1.1267868280410767, "loss/kd": 1.904744029045105, "loss/lm": 0.34882956743240356, "step": 2634 }, { "epoch": 0.5409566824060767, "grad_norm": 1.0018237235598966, "kd_ratio": 0.5, "learning_rate": 9.16304484794612e-06, "loss": 1.3323601484298706, "loss/kd": 2.2425408363342285, "loss/lm": 0.42217937111854553, "step": 2635 }, { "epoch": 0.5411619790597413, "grad_norm": 1.3279235533732388, "kd_ratio": 0.5, "learning_rate": 9.156418086044245e-06, "loss": 1.1682217121124268, "loss/kd": 1.946226716041565, "loss/lm": 0.3902166187763214, "step": 2636 }, { "epoch": 0.5413672757134059, "grad_norm": 1.279731195652969, "kd_ratio": 0.5, "learning_rate": 9.149791697226868e-06, "loss": 1.0958685874938965, "loss/kd": 1.80595862865448, "loss/lm": 0.3857784867286682, "step": 2637 }, { "epoch": 0.5415725723670705, "grad_norm": 0.894428791982181, "kd_ratio": 0.5, "learning_rate": 9.143165684424604e-06, "loss": 1.063522219657898, "loss/kd": 1.74123215675354, "loss/lm": 0.38581228256225586, "step": 2638 }, { "epoch": 0.541777869020735, "grad_norm": 1.0840973072323028, "kd_ratio": 0.5, "learning_rate": 9.13654005056788e-06, "loss": 0.9256612658500671, "loss/kd": 1.520139217376709, "loss/lm": 0.3311833143234253, "step": 2639 }, { "epoch": 0.5419831656743995, "grad_norm": 0.9984746367686905, "kd_ratio": 0.5, "learning_rate": 9.129914798586968e-06, "loss": 1.072173833847046, "loss/kd": 1.7895736694335938, "loss/lm": 0.3547738790512085, "step": 2640 }, { "epoch": 0.5421884623280641, "grad_norm": 1.0421006069698835, "kd_ratio": 0.5, "learning_rate": 9.123289931411965e-06, "loss": 1.0957914590835571, "loss/kd": 1.8963087797164917, "loss/lm": 0.29527413845062256, "step": 2641 }, { "epoch": 0.5423937589817286, "grad_norm": 1.5613283425695024, "kd_ratio": 0.5, "learning_rate": 9.116665451972801e-06, "loss": 1.0163289308547974, "loss/kd": 1.703818678855896, "loss/lm": 0.3288392126560211, "step": 2642 }, { "epoch": 0.5425990556353931, "grad_norm": 1.2593949390902424, "kd_ratio": 0.5, "learning_rate": 9.110041363199233e-06, "loss": 1.0455752611160278, "loss/kd": 1.7861377000808716, "loss/lm": 0.30501288175582886, "step": 2643 }, { "epoch": 0.5428043522890577, "grad_norm": 1.0941986777734514, "kd_ratio": 0.5, "learning_rate": 9.10341766802085e-06, "loss": 0.9289969801902771, "loss/kd": 1.464720368385315, "loss/lm": 0.39327359199523926, "step": 2644 }, { "epoch": 0.5430096489427222, "grad_norm": 1.3659835375998193, "kd_ratio": 0.5, "learning_rate": 9.096794369367061e-06, "loss": 0.9007705450057983, "loss/kd": 1.4392672777175903, "loss/lm": 0.36227381229400635, "step": 2645 }, { "epoch": 0.5432149455963868, "grad_norm": 1.0799021508037552, "kd_ratio": 0.5, "learning_rate": 9.090171470167097e-06, "loss": 1.173860788345337, "loss/kd": 1.9416686296463013, "loss/lm": 0.40605300664901733, "step": 2646 }, { "epoch": 0.5434202422500514, "grad_norm": 1.0139790328870204, "kd_ratio": 0.5, "learning_rate": 9.083548973350019e-06, "loss": 1.0097919702529907, "loss/kd": 1.7428984642028809, "loss/lm": 0.27668553590774536, "step": 2647 }, { "epoch": 0.5436255389037159, "grad_norm": 1.610777486938102, "kd_ratio": 0.5, "learning_rate": 9.076926881844713e-06, "loss": 0.9219072461128235, "loss/kd": 1.4913535118103027, "loss/lm": 0.35246098041534424, "step": 2648 }, { "epoch": 0.5438308355573804, "grad_norm": 1.0447096117970567, "kd_ratio": 0.5, "learning_rate": 9.070305198579877e-06, "loss": 1.2956589460372925, "loss/kd": 2.2711596488952637, "loss/lm": 0.32015830278396606, "step": 2649 }, { "epoch": 0.544036132211045, "grad_norm": 0.9361782686569391, "kd_ratio": 0.5, "learning_rate": 9.06368392648403e-06, "loss": 1.093928575515747, "loss/kd": 1.8050925731658936, "loss/lm": 0.38276466727256775, "step": 2650 }, { "epoch": 0.5442414288647095, "grad_norm": 1.1427488988511652, "kd_ratio": 0.5, "learning_rate": 9.057063068485513e-06, "loss": 0.9022786617279053, "loss/kd": 1.4394938945770264, "loss/lm": 0.3650633692741394, "step": 2651 }, { "epoch": 0.544446725518374, "grad_norm": 0.9161356892580566, "kd_ratio": 0.5, "learning_rate": 9.050442627512482e-06, "loss": 1.101839303970337, "loss/kd": 1.8261611461639404, "loss/lm": 0.3775175213813782, "step": 2652 }, { "epoch": 0.5446520221720386, "grad_norm": 1.1631520846730152, "kd_ratio": 0.5, "learning_rate": 9.043822606492907e-06, "loss": 1.0811690092086792, "loss/kd": 1.7838530540466309, "loss/lm": 0.37848490476608276, "step": 2653 }, { "epoch": 0.5448573188257031, "grad_norm": 1.0570503217273015, "kd_ratio": 0.5, "learning_rate": 9.037203008354578e-06, "loss": 0.8201994895935059, "loss/kd": 1.2752128839492798, "loss/lm": 0.36518603563308716, "step": 2654 }, { "epoch": 0.5450626154793677, "grad_norm": 4.164111256617794, "kd_ratio": 0.5, "learning_rate": 9.030583836025093e-06, "loss": 0.8599696159362793, "loss/kd": 1.390228033065796, "loss/lm": 0.3297111392021179, "step": 2655 }, { "epoch": 0.5452679121330323, "grad_norm": 0.9491477772716427, "kd_ratio": 0.5, "learning_rate": 9.023965092431862e-06, "loss": 1.1522250175476074, "loss/kd": 1.9468626976013184, "loss/lm": 0.35758742690086365, "step": 2656 }, { "epoch": 0.5454732087866968, "grad_norm": 0.9527788814488273, "kd_ratio": 0.5, "learning_rate": 9.017346780502102e-06, "loss": 1.164338231086731, "loss/kd": 1.9531123638153076, "loss/lm": 0.3755640685558319, "step": 2657 }, { "epoch": 0.5456785054403613, "grad_norm": 1.2141692685072374, "kd_ratio": 0.5, "learning_rate": 9.010728903162846e-06, "loss": 1.866448163986206, "loss/kd": 3.3728396892547607, "loss/lm": 0.36005663871765137, "step": 2658 }, { "epoch": 0.5458838020940259, "grad_norm": 1.1073051052184826, "kd_ratio": 0.5, "learning_rate": 9.004111463340935e-06, "loss": 1.1273937225341797, "loss/kd": 1.9287376403808594, "loss/lm": 0.3260498642921448, "step": 2659 }, { "epoch": 0.5460890987476904, "grad_norm": 1.2173206442939348, "kd_ratio": 0.5, "learning_rate": 8.997494463963008e-06, "loss": 1.1335889101028442, "loss/kd": 1.877267837524414, "loss/lm": 0.3899099826812744, "step": 2660 }, { "epoch": 0.5462943954013549, "grad_norm": 1.2491360729327785, "kd_ratio": 0.5, "learning_rate": 8.99087790795552e-06, "loss": 1.0004887580871582, "loss/kd": 1.6710916757583618, "loss/lm": 0.3298857808113098, "step": 2661 }, { "epoch": 0.5464996920550195, "grad_norm": 0.9688838975724632, "kd_ratio": 0.5, "learning_rate": 8.98426179824472e-06, "loss": 0.971409022808075, "loss/kd": 1.548125147819519, "loss/lm": 0.39469292759895325, "step": 2662 }, { "epoch": 0.546704988708684, "grad_norm": 1.6012541690248971, "kd_ratio": 0.5, "learning_rate": 8.977646137756662e-06, "loss": 0.9939379692077637, "loss/kd": 1.664777159690857, "loss/lm": 0.3230988383293152, "step": 2663 }, { "epoch": 0.5469102853623486, "grad_norm": 1.3174214579554726, "kd_ratio": 0.5, "learning_rate": 8.971030929417213e-06, "loss": 1.0145636796951294, "loss/kd": 1.6128865480422974, "loss/lm": 0.4162408709526062, "step": 2664 }, { "epoch": 0.5471155820160132, "grad_norm": 1.1890708659811071, "kd_ratio": 0.5, "learning_rate": 8.964416176152023e-06, "loss": 1.1221064329147339, "loss/kd": 1.8074760437011719, "loss/lm": 0.4367368519306183, "step": 2665 }, { "epoch": 0.5473208786696777, "grad_norm": 1.2645052561966257, "kd_ratio": 0.5, "learning_rate": 8.95780188088655e-06, "loss": 1.0358914136886597, "loss/kd": 1.7167212963104248, "loss/lm": 0.3550616204738617, "step": 2666 }, { "epoch": 0.5475261753233422, "grad_norm": 1.3832311753914286, "kd_ratio": 0.5, "learning_rate": 8.951188046546048e-06, "loss": 0.9808585047721863, "loss/kd": 1.5549914836883545, "loss/lm": 0.40672555565834045, "step": 2667 }, { "epoch": 0.5477314719770068, "grad_norm": 1.9171790765496832, "kd_ratio": 0.5, "learning_rate": 8.944574676055564e-06, "loss": 1.988749384880066, "loss/kd": 3.6057310104370117, "loss/lm": 0.3717678487300873, "step": 2668 }, { "epoch": 0.5479367686306713, "grad_norm": 1.6232595103755425, "kd_ratio": 0.5, "learning_rate": 8.937961772339945e-06, "loss": 1.3076950311660767, "loss/kd": 2.270051956176758, "loss/lm": 0.34533804655075073, "step": 2669 }, { "epoch": 0.5481420652843358, "grad_norm": 1.4894439338807355, "kd_ratio": 0.5, "learning_rate": 8.93134933832383e-06, "loss": 1.0245457887649536, "loss/kd": 1.675261378288269, "loss/lm": 0.3738301396369934, "step": 2670 }, { "epoch": 0.5483473619380004, "grad_norm": 1.4483567615015354, "kd_ratio": 0.5, "learning_rate": 8.924737376931651e-06, "loss": 1.3020751476287842, "loss/kd": 2.2933084964752197, "loss/lm": 0.31084179878234863, "step": 2671 }, { "epoch": 0.5485526585916649, "grad_norm": 1.0766319116543814, "kd_ratio": 0.5, "learning_rate": 8.918125891087626e-06, "loss": 1.0267839431762695, "loss/kd": 1.7124186754226685, "loss/lm": 0.34114933013916016, "step": 2672 }, { "epoch": 0.5487579552453296, "grad_norm": 2.2900409677521676, "kd_ratio": 0.5, "learning_rate": 8.911514883715763e-06, "loss": 1.0571755170822144, "loss/kd": 1.7195433378219604, "loss/lm": 0.3948076665401459, "step": 2673 }, { "epoch": 0.5489632518989941, "grad_norm": 1.6198252187264026, "kd_ratio": 0.5, "learning_rate": 8.90490435773987e-06, "loss": 1.9559708833694458, "loss/kd": 3.5550801753997803, "loss/lm": 0.35686156153678894, "step": 2674 }, { "epoch": 0.5491685485526586, "grad_norm": 2.2612054843267013, "kd_ratio": 0.5, "learning_rate": 8.898294316083529e-06, "loss": 0.8951770067214966, "loss/kd": 1.4173330068588257, "loss/lm": 0.37302106618881226, "step": 2675 }, { "epoch": 0.5493738452063232, "grad_norm": 2.297117789333914, "kd_ratio": 0.5, "learning_rate": 8.891684761670115e-06, "loss": 1.1790528297424316, "loss/kd": 1.9903430938720703, "loss/lm": 0.3677625060081482, "step": 2676 }, { "epoch": 0.5495791418599877, "grad_norm": 2.3565677537330876, "kd_ratio": 0.5, "learning_rate": 8.88507569742278e-06, "loss": 1.1148619651794434, "loss/kd": 1.9235697984695435, "loss/lm": 0.3061540722846985, "step": 2677 }, { "epoch": 0.5497844385136522, "grad_norm": 1.358840505878176, "kd_ratio": 0.5, "learning_rate": 8.878467126264467e-06, "loss": 0.8590760827064514, "loss/kd": 1.3843116760253906, "loss/lm": 0.3338404893875122, "step": 2678 }, { "epoch": 0.5499897351673168, "grad_norm": 2.28306648986816, "kd_ratio": 0.5, "learning_rate": 8.871859051117896e-06, "loss": 1.060842514038086, "loss/kd": 1.774964451789856, "loss/lm": 0.34672069549560547, "step": 2679 }, { "epoch": 0.5501950318209813, "grad_norm": 1.0121531380199578, "kd_ratio": 0.5, "learning_rate": 8.865251474905574e-06, "loss": 0.9844450950622559, "loss/kd": 1.6155686378479004, "loss/lm": 0.35332149267196655, "step": 2680 }, { "epoch": 0.5504003284746458, "grad_norm": 1.7359752363017193, "kd_ratio": 0.5, "learning_rate": 8.85864440054978e-06, "loss": 1.03469717502594, "loss/kd": 1.6901766061782837, "loss/lm": 0.3792177438735962, "step": 2681 }, { "epoch": 0.5506056251283105, "grad_norm": 1.2463922221391652, "kd_ratio": 0.5, "learning_rate": 8.852037830972578e-06, "loss": 0.992121696472168, "loss/kd": 1.6291688680648804, "loss/lm": 0.35507452487945557, "step": 2682 }, { "epoch": 0.550810921781975, "grad_norm": 1.0755845898629968, "kd_ratio": 0.5, "learning_rate": 8.8454317690958e-06, "loss": 1.0669043064117432, "loss/kd": 1.7860463857650757, "loss/lm": 0.3477623462677002, "step": 2683 }, { "epoch": 0.5510162184356395, "grad_norm": 1.2738206762593958, "kd_ratio": 0.5, "learning_rate": 8.838826217841057e-06, "loss": 0.94260174036026, "loss/kd": 1.5512744188308716, "loss/lm": 0.3339290916919708, "step": 2684 }, { "epoch": 0.5512215150893041, "grad_norm": 1.194772415918906, "kd_ratio": 0.5, "learning_rate": 8.832221180129743e-06, "loss": 1.1930184364318848, "loss/kd": 2.026646137237549, "loss/lm": 0.3593907356262207, "step": 2685 }, { "epoch": 0.5514268117429686, "grad_norm": 1.109611383472955, "kd_ratio": 0.5, "learning_rate": 8.825616658883012e-06, "loss": 1.0203627347946167, "loss/kd": 1.7490417957305908, "loss/lm": 0.2916835844516754, "step": 2686 }, { "epoch": 0.5516321083966331, "grad_norm": 1.0627970895704917, "kd_ratio": 0.5, "learning_rate": 8.819012657021794e-06, "loss": 1.2302933931350708, "loss/kd": 2.1293139457702637, "loss/lm": 0.33127278089523315, "step": 2687 }, { "epoch": 0.5518374050502977, "grad_norm": 1.0869473635322027, "kd_ratio": 0.5, "learning_rate": 8.812409177466796e-06, "loss": 0.9596243500709534, "loss/kd": 1.6029444932937622, "loss/lm": 0.31630420684814453, "step": 2688 }, { "epoch": 0.5520427017039622, "grad_norm": 0.9238858998797952, "kd_ratio": 0.5, "learning_rate": 8.80580622313848e-06, "loss": 0.8375353217124939, "loss/kd": 1.3892872333526611, "loss/lm": 0.28578343987464905, "step": 2689 }, { "epoch": 0.5522479983576267, "grad_norm": 1.2351346666964769, "kd_ratio": 0.5, "learning_rate": 8.79920379695709e-06, "loss": 1.0691845417022705, "loss/kd": 1.7152526378631592, "loss/lm": 0.42311638593673706, "step": 2690 }, { "epoch": 0.5524532950112914, "grad_norm": 1.0708150691626679, "kd_ratio": 0.5, "learning_rate": 8.79260190184263e-06, "loss": 1.0631836652755737, "loss/kd": 1.7190792560577393, "loss/lm": 0.407288134098053, "step": 2691 }, { "epoch": 0.5526585916649559, "grad_norm": 0.9617869828324008, "kd_ratio": 0.5, "learning_rate": 8.786000540714871e-06, "loss": 1.1124958992004395, "loss/kd": 1.7738404273986816, "loss/lm": 0.4511514902114868, "step": 2692 }, { "epoch": 0.5528638883186204, "grad_norm": 0.9739213890793365, "kd_ratio": 0.5, "learning_rate": 8.779399716493342e-06, "loss": 0.8681412935256958, "loss/kd": 1.3714288473129272, "loss/lm": 0.36485376954078674, "step": 2693 }, { "epoch": 0.553069184972285, "grad_norm": 0.9576124912762499, "kd_ratio": 0.5, "learning_rate": 8.772799432097337e-06, "loss": 1.0970039367675781, "loss/kd": 1.8753632307052612, "loss/lm": 0.31864461302757263, "step": 2694 }, { "epoch": 0.5532744816259495, "grad_norm": 0.9671114191797175, "kd_ratio": 0.5, "learning_rate": 8.76619969044592e-06, "loss": 1.1311932802200317, "loss/kd": 1.9205279350280762, "loss/lm": 0.3418586552143097, "step": 2695 }, { "epoch": 0.553479778279614, "grad_norm": 0.9891003179941433, "kd_ratio": 0.5, "learning_rate": 8.759600494457905e-06, "loss": 1.0606824159622192, "loss/kd": 1.7721554040908813, "loss/lm": 0.3492094874382019, "step": 2696 }, { "epoch": 0.5536850749332786, "grad_norm": 1.0600550882406583, "kd_ratio": 0.5, "learning_rate": 8.753001847051867e-06, "loss": 1.0124708414077759, "loss/kd": 1.6187338829040527, "loss/lm": 0.4062078595161438, "step": 2697 }, { "epoch": 0.5538903715869431, "grad_norm": 0.9683952092455442, "kd_ratio": 0.5, "learning_rate": 8.746403751146142e-06, "loss": 1.211683988571167, "loss/kd": 2.052468776702881, "loss/lm": 0.3708992004394531, "step": 2698 }, { "epoch": 0.5540956682406076, "grad_norm": 0.9968433324359339, "kd_ratio": 0.5, "learning_rate": 8.739806209658812e-06, "loss": 1.0507936477661133, "loss/kd": 1.7355462312698364, "loss/lm": 0.3660410940647125, "step": 2699 }, { "epoch": 0.5543009648942723, "grad_norm": 1.075070431040895, "kd_ratio": 0.5, "learning_rate": 8.733209225507732e-06, "loss": 0.931450366973877, "loss/kd": 1.4647483825683594, "loss/lm": 0.39815229177474976, "step": 2700 }, { "epoch": 0.5545062615479368, "grad_norm": 0.9455475035895862, "kd_ratio": 0.5, "learning_rate": 8.726612801610494e-06, "loss": 0.9338679313659668, "loss/kd": 1.5562564134597778, "loss/lm": 0.3114794194698334, "step": 2701 }, { "epoch": 0.5547115582016013, "grad_norm": 1.151059346569803, "kd_ratio": 0.5, "learning_rate": 8.720016940884445e-06, "loss": 0.967024564743042, "loss/kd": 1.5549384355545044, "loss/lm": 0.379110723733902, "step": 2702 }, { "epoch": 0.5549168548552659, "grad_norm": 1.1489403997089411, "kd_ratio": 0.5, "learning_rate": 8.713421646246692e-06, "loss": 1.101523756980896, "loss/kd": 1.8651163578033447, "loss/lm": 0.3379310965538025, "step": 2703 }, { "epoch": 0.5551221515089304, "grad_norm": 1.3148019200119643, "kd_ratio": 0.5, "learning_rate": 8.706826920614078e-06, "loss": 1.0388648509979248, "loss/kd": 1.7147670984268188, "loss/lm": 0.3629626929759979, "step": 2704 }, { "epoch": 0.5553274481625949, "grad_norm": 1.193974601483224, "kd_ratio": 0.5, "learning_rate": 8.700232766903206e-06, "loss": 1.0569069385528564, "loss/kd": 1.738855004310608, "loss/lm": 0.37495896220207214, "step": 2705 }, { "epoch": 0.5555327448162595, "grad_norm": 1.5137852116566983, "kd_ratio": 0.5, "learning_rate": 8.693639188030424e-06, "loss": 0.8742015361785889, "loss/kd": 1.457184076309204, "loss/lm": 0.29121893644332886, "step": 2706 }, { "epoch": 0.555738041469924, "grad_norm": 1.1410926621523834, "kd_ratio": 0.5, "learning_rate": 8.687046186911819e-06, "loss": 1.1302354335784912, "loss/kd": 1.961695909500122, "loss/lm": 0.2987748384475708, "step": 2707 }, { "epoch": 0.5559433381235885, "grad_norm": 1.4824224582870675, "kd_ratio": 0.5, "learning_rate": 8.68045376646323e-06, "loss": 1.0510810613632202, "loss/kd": 1.7454031705856323, "loss/lm": 0.35675886273384094, "step": 2708 }, { "epoch": 0.5561486347772532, "grad_norm": 1.4030403731901158, "kd_ratio": 0.5, "learning_rate": 8.673861929600237e-06, "loss": 1.1123830080032349, "loss/kd": 1.8575000762939453, "loss/lm": 0.36726585030555725, "step": 2709 }, { "epoch": 0.5563539314309177, "grad_norm": 1.4775066132439099, "kd_ratio": 0.5, "learning_rate": 8.667270679238154e-06, "loss": 1.0111362934112549, "loss/kd": 1.6812286376953125, "loss/lm": 0.3410438597202301, "step": 2710 }, { "epoch": 0.5565592280845822, "grad_norm": 1.220597557722204, "kd_ratio": 0.5, "learning_rate": 8.660680018292053e-06, "loss": 1.1711446046829224, "loss/kd": 1.9484161138534546, "loss/lm": 0.3938731551170349, "step": 2711 }, { "epoch": 0.5567645247382468, "grad_norm": 1.1352865035605988, "kd_ratio": 0.5, "learning_rate": 8.654089949676729e-06, "loss": 1.2169899940490723, "loss/kd": 2.0224945545196533, "loss/lm": 0.411485493183136, "step": 2712 }, { "epoch": 0.5569698213919113, "grad_norm": 1.0960138357280451, "kd_ratio": 0.5, "learning_rate": 8.647500476306724e-06, "loss": 1.0326664447784424, "loss/kd": 1.7514959573745728, "loss/lm": 0.3138369619846344, "step": 2713 }, { "epoch": 0.5571751180455758, "grad_norm": 1.3714933270169756, "kd_ratio": 0.5, "learning_rate": 8.64091160109631e-06, "loss": 1.0220046043395996, "loss/kd": 1.701638102531433, "loss/lm": 0.34237104654312134, "step": 2714 }, { "epoch": 0.5573804146992404, "grad_norm": 1.1773669508036382, "kd_ratio": 0.5, "learning_rate": 8.634323326959501e-06, "loss": 1.2239606380462646, "loss/kd": 2.1355886459350586, "loss/lm": 0.31233274936676025, "step": 2715 }, { "epoch": 0.5575857113529049, "grad_norm": 1.0470940002553086, "kd_ratio": 0.5, "learning_rate": 8.627735656810041e-06, "loss": 1.117711067199707, "loss/kd": 1.8789305686950684, "loss/lm": 0.35649168491363525, "step": 2716 }, { "epoch": 0.5577910080065694, "grad_norm": 1.8608418589089866, "kd_ratio": 0.5, "learning_rate": 8.621148593561412e-06, "loss": 1.2197898626327515, "loss/kd": 2.1307904720306396, "loss/lm": 0.30878928303718567, "step": 2717 }, { "epoch": 0.5579963046602341, "grad_norm": 1.0250863085109272, "kd_ratio": 0.5, "learning_rate": 8.61456214012682e-06, "loss": 1.0192415714263916, "loss/kd": 1.6085824966430664, "loss/lm": 0.42990076541900635, "step": 2718 }, { "epoch": 0.5582016013138986, "grad_norm": 1.762834894111461, "kd_ratio": 0.5, "learning_rate": 8.60797629941921e-06, "loss": 1.0528123378753662, "loss/kd": 1.6671545505523682, "loss/lm": 0.4384700059890747, "step": 2719 }, { "epoch": 0.5584068979675632, "grad_norm": 1.6171385262343758, "kd_ratio": 0.5, "learning_rate": 8.60139107435124e-06, "loss": 0.9448257684707642, "loss/kd": 1.5184897184371948, "loss/lm": 0.3711617887020111, "step": 2720 }, { "epoch": 0.5586121946212277, "grad_norm": 1.4492580116947356, "kd_ratio": 0.5, "learning_rate": 8.594806467835322e-06, "loss": 1.0771104097366333, "loss/kd": 1.8443818092346191, "loss/lm": 0.3098389804363251, "step": 2721 }, { "epoch": 0.5588174912748922, "grad_norm": 1.4141603035259067, "kd_ratio": 0.5, "learning_rate": 8.588222482783571e-06, "loss": 0.9062590599060059, "loss/kd": 1.4618607759475708, "loss/lm": 0.3506573736667633, "step": 2722 }, { "epoch": 0.5590227879285568, "grad_norm": 1.7885008626885326, "kd_ratio": 0.5, "learning_rate": 8.581639122107837e-06, "loss": 1.1781941652297974, "loss/kd": 2.0553109645843506, "loss/lm": 0.3010774254798889, "step": 2723 }, { "epoch": 0.5592280845822213, "grad_norm": 1.2792645076718514, "kd_ratio": 0.5, "learning_rate": 8.57505638871969e-06, "loss": 1.465294599533081, "loss/kd": 2.5837244987487793, "loss/lm": 0.34686464071273804, "step": 2724 }, { "epoch": 0.5594333812358858, "grad_norm": 1.1876162340822736, "kd_ratio": 0.5, "learning_rate": 8.568474285530427e-06, "loss": 1.0843164920806885, "loss/kd": 1.8142443895339966, "loss/lm": 0.35438865423202515, "step": 2725 }, { "epoch": 0.5596386778895504, "grad_norm": 1.8677166218752257, "kd_ratio": 0.5, "learning_rate": 8.561892815451065e-06, "loss": 1.1375290155410767, "loss/kd": 1.833337426185608, "loss/lm": 0.4417206048965454, "step": 2726 }, { "epoch": 0.559843974543215, "grad_norm": 1.0055414132399056, "kd_ratio": 0.5, "learning_rate": 8.555311981392342e-06, "loss": 1.0150525569915771, "loss/kd": 1.7084877490997314, "loss/lm": 0.3216172754764557, "step": 2727 }, { "epoch": 0.5600492711968795, "grad_norm": 2.0734791576489315, "kd_ratio": 0.5, "learning_rate": 8.548731786264713e-06, "loss": 1.0333552360534668, "loss/kd": 1.7291748523712158, "loss/lm": 0.3375355899333954, "step": 2728 }, { "epoch": 0.5602545678505441, "grad_norm": 1.1944830407466773, "kd_ratio": 0.5, "learning_rate": 8.542152232978346e-06, "loss": 1.1705807447433472, "loss/kd": 1.8840988874435425, "loss/lm": 0.4570625126361847, "step": 2729 }, { "epoch": 0.5604598645042086, "grad_norm": 1.3392658172434404, "kd_ratio": 0.5, "learning_rate": 8.535573324443134e-06, "loss": 1.6256269216537476, "loss/kd": 2.9512457847595215, "loss/lm": 0.30000796914100647, "step": 2730 }, { "epoch": 0.5606651611578731, "grad_norm": 1.5474912842323971, "kd_ratio": 0.5, "learning_rate": 8.528995063568673e-06, "loss": 0.9469254016876221, "loss/kd": 1.5402003526687622, "loss/lm": 0.35365039110183716, "step": 2731 }, { "epoch": 0.5608704578115377, "grad_norm": 1.2126238313521742, "kd_ratio": 0.5, "learning_rate": 8.522417453264289e-06, "loss": 0.9287277460098267, "loss/kd": 1.4955650568008423, "loss/lm": 0.3618904948234558, "step": 2732 }, { "epoch": 0.5610757544652022, "grad_norm": 1.4384859045906486, "kd_ratio": 0.5, "learning_rate": 8.515840496439009e-06, "loss": 1.0936332941055298, "loss/kd": 1.8750330209732056, "loss/lm": 0.3122336268424988, "step": 2733 }, { "epoch": 0.5612810511188667, "grad_norm": 1.3466236346990752, "kd_ratio": 0.5, "learning_rate": 8.50926419600157e-06, "loss": 1.2459027767181396, "loss/kd": 2.1761302947998047, "loss/lm": 0.315675288438797, "step": 2734 }, { "epoch": 0.5614863477725313, "grad_norm": 1.319485778773627, "kd_ratio": 0.5, "learning_rate": 8.502688554860426e-06, "loss": 1.3648260831832886, "loss/kd": 2.266408920288086, "loss/lm": 0.4632433354854584, "step": 2735 }, { "epoch": 0.5616916444261959, "grad_norm": 1.568482131665022, "kd_ratio": 0.5, "learning_rate": 8.496113575923727e-06, "loss": 0.9924178719520569, "loss/kd": 1.7015241384506226, "loss/lm": 0.2833116054534912, "step": 2736 }, { "epoch": 0.5618969410798604, "grad_norm": 1.0878093257004284, "kd_ratio": 0.5, "learning_rate": 8.48953926209935e-06, "loss": 1.1461104154586792, "loss/kd": 1.952906608581543, "loss/lm": 0.3393142521381378, "step": 2737 }, { "epoch": 0.562102237733525, "grad_norm": 1.5456097646017934, "kd_ratio": 0.5, "learning_rate": 8.482965616294863e-06, "loss": 1.0198439359664917, "loss/kd": 1.7262015342712402, "loss/lm": 0.31348633766174316, "step": 2738 }, { "epoch": 0.5623075343871895, "grad_norm": 1.899843212645109, "kd_ratio": 0.5, "learning_rate": 8.47639264141754e-06, "loss": 1.174804449081421, "loss/kd": 2.0246312618255615, "loss/lm": 0.3249775469303131, "step": 2739 }, { "epoch": 0.562512831040854, "grad_norm": 0.9868868932748304, "kd_ratio": 0.5, "learning_rate": 8.469820340374358e-06, "loss": 0.9713268280029297, "loss/kd": 1.5586681365966797, "loss/lm": 0.38398557901382446, "step": 2740 }, { "epoch": 0.5627181276945186, "grad_norm": 1.5861985800616383, "kd_ratio": 0.5, "learning_rate": 8.463248716072e-06, "loss": 0.9393947124481201, "loss/kd": 1.6039570569992065, "loss/lm": 0.2748323082923889, "step": 2741 }, { "epoch": 0.5629234243481831, "grad_norm": 1.3043849277724162, "kd_ratio": 0.5, "learning_rate": 8.456677771416848e-06, "loss": 1.0548335313796997, "loss/kd": 1.7133058309555054, "loss/lm": 0.39636123180389404, "step": 2742 }, { "epoch": 0.5631287210018476, "grad_norm": 1.3390843608099672, "kd_ratio": 0.5, "learning_rate": 8.450107509314983e-06, "loss": 0.8746188879013062, "loss/kd": 1.4112296104431152, "loss/lm": 0.3380081355571747, "step": 2743 }, { "epoch": 0.5633340176555122, "grad_norm": 1.4961373349756204, "kd_ratio": 0.5, "learning_rate": 8.44353793267219e-06, "loss": 1.1452420949935913, "loss/kd": 1.954263687133789, "loss/lm": 0.33622056245803833, "step": 2744 }, { "epoch": 0.5635393143091768, "grad_norm": 0.9066054536626537, "kd_ratio": 0.5, "learning_rate": 8.43696904439394e-06, "loss": 0.9376078248023987, "loss/kd": 1.4583669900894165, "loss/lm": 0.41684865951538086, "step": 2745 }, { "epoch": 0.5637446109628413, "grad_norm": 1.7147352745856068, "kd_ratio": 0.5, "learning_rate": 8.430400847385397e-06, "loss": 1.0009576082229614, "loss/kd": 1.632126808166504, "loss/lm": 0.36978837847709656, "step": 2746 }, { "epoch": 0.5639499076165059, "grad_norm": 1.422330147537802, "kd_ratio": 0.5, "learning_rate": 8.423833344551443e-06, "loss": 1.0231178998947144, "loss/kd": 1.7538119554519653, "loss/lm": 0.2924237847328186, "step": 2747 }, { "epoch": 0.5641552042701704, "grad_norm": 1.0626401656984334, "kd_ratio": 0.5, "learning_rate": 8.41726653879663e-06, "loss": 1.1420097351074219, "loss/kd": 1.9084447622299194, "loss/lm": 0.3755747973918915, "step": 2748 }, { "epoch": 0.5643605009238349, "grad_norm": 1.2561497593254212, "kd_ratio": 0.5, "learning_rate": 8.410700433025207e-06, "loss": 1.0734330415725708, "loss/kd": 1.840062141418457, "loss/lm": 0.30680403113365173, "step": 2749 }, { "epoch": 0.5645657975774995, "grad_norm": 0.9442974421351298, "kd_ratio": 0.5, "learning_rate": 8.404135030141116e-06, "loss": 0.8905307054519653, "loss/kd": 1.4737845659255981, "loss/lm": 0.30727681517601013, "step": 2750 }, { "epoch": 0.564771094231164, "grad_norm": 1.120416841454342, "kd_ratio": 0.5, "learning_rate": 8.397570333047985e-06, "loss": 0.9295818209648132, "loss/kd": 1.5651214122772217, "loss/lm": 0.2940421998500824, "step": 2751 }, { "epoch": 0.5649763908848285, "grad_norm": 0.892148726459336, "kd_ratio": 0.5, "learning_rate": 8.391006344649139e-06, "loss": 0.9282779097557068, "loss/kd": 1.5285056829452515, "loss/lm": 0.3280501067638397, "step": 2752 }, { "epoch": 0.5651816875384931, "grad_norm": 1.09839850417901, "kd_ratio": 0.5, "learning_rate": 8.384443067847578e-06, "loss": 0.9231257438659668, "loss/kd": 1.4985328912734985, "loss/lm": 0.34771865606307983, "step": 2753 }, { "epoch": 0.5653869841921577, "grad_norm": 1.2076165877155058, "kd_ratio": 0.5, "learning_rate": 8.377880505545994e-06, "loss": 1.0839723348617554, "loss/kd": 1.7941975593566895, "loss/lm": 0.3737470507621765, "step": 2754 }, { "epoch": 0.5655922808458222, "grad_norm": 1.0817838738569083, "kd_ratio": 0.5, "learning_rate": 8.37131866064676e-06, "loss": 1.2805804014205933, "loss/kd": 2.144339084625244, "loss/lm": 0.4168216586112976, "step": 2755 }, { "epoch": 0.5657975774994868, "grad_norm": 1.292518765986864, "kd_ratio": 0.5, "learning_rate": 8.364757536051935e-06, "loss": 0.9075435400009155, "loss/kd": 1.5049924850463867, "loss/lm": 0.3100946545600891, "step": 2756 }, { "epoch": 0.5660028741531513, "grad_norm": 1.22173191445444, "kd_ratio": 0.5, "learning_rate": 8.35819713466325e-06, "loss": 0.9925839304924011, "loss/kd": 1.6055220365524292, "loss/lm": 0.37964579463005066, "step": 2757 }, { "epoch": 0.5662081708068158, "grad_norm": 1.274243071338215, "kd_ratio": 0.5, "learning_rate": 8.351637459382133e-06, "loss": 1.0470588207244873, "loss/kd": 1.7831093072891235, "loss/lm": 0.3110082745552063, "step": 2758 }, { "epoch": 0.5664134674604804, "grad_norm": 1.6070231058332034, "kd_ratio": 0.5, "learning_rate": 8.345078513109677e-06, "loss": 1.1686737537384033, "loss/kd": 2.0129380226135254, "loss/lm": 0.3244093656539917, "step": 2759 }, { "epoch": 0.5666187641141449, "grad_norm": 0.8990717354465355, "kd_ratio": 0.5, "learning_rate": 8.338520298746653e-06, "loss": 0.8472064137458801, "loss/kd": 1.3650318384170532, "loss/lm": 0.32938098907470703, "step": 2760 }, { "epoch": 0.5668240607678094, "grad_norm": 1.4324491194718436, "kd_ratio": 0.5, "learning_rate": 8.33196281919352e-06, "loss": 0.9637899994850159, "loss/kd": 1.6403990983963013, "loss/lm": 0.28718093037605286, "step": 2761 }, { "epoch": 0.567029357421474, "grad_norm": 1.284045998308809, "kd_ratio": 0.5, "learning_rate": 8.3254060773504e-06, "loss": 1.2588894367218018, "loss/kd": 2.1450557708740234, "loss/lm": 0.3727231025695801, "step": 2762 }, { "epoch": 0.5672346540751386, "grad_norm": 1.1207394198761125, "kd_ratio": 0.5, "learning_rate": 8.31885007611709e-06, "loss": 1.0771700143814087, "loss/kd": 1.8234928846359253, "loss/lm": 0.33084723353385925, "step": 2763 }, { "epoch": 0.5674399507288032, "grad_norm": 1.1664767429996477, "kd_ratio": 0.5, "learning_rate": 8.31229481839307e-06, "loss": 1.0321171283721924, "loss/kd": 1.7288872003555298, "loss/lm": 0.3353469967842102, "step": 2764 }, { "epoch": 0.5676452473824677, "grad_norm": 0.9684277673322281, "kd_ratio": 0.5, "learning_rate": 8.305740307077479e-06, "loss": 0.8536576628684998, "loss/kd": 1.3535734415054321, "loss/lm": 0.353741854429245, "step": 2765 }, { "epoch": 0.5678505440361322, "grad_norm": 1.0422079298918456, "kd_ratio": 0.5, "learning_rate": 8.299186545069129e-06, "loss": 1.1486022472381592, "loss/kd": 1.8799411058425903, "loss/lm": 0.41726329922676086, "step": 2766 }, { "epoch": 0.5680558406897968, "grad_norm": 1.0197262711755388, "kd_ratio": 0.5, "learning_rate": 8.2926335352665e-06, "loss": 1.0903658866882324, "loss/kd": 1.9350225925445557, "loss/lm": 0.2457091212272644, "step": 2767 }, { "epoch": 0.5682611373434613, "grad_norm": 1.2133622709797904, "kd_ratio": 0.5, "learning_rate": 8.286081280567751e-06, "loss": 1.1827086210250854, "loss/kd": 2.0409460067749023, "loss/lm": 0.32447126507759094, "step": 2768 }, { "epoch": 0.5684664339971258, "grad_norm": 0.9207847920474959, "kd_ratio": 0.5, "learning_rate": 8.27952978387069e-06, "loss": 0.8114833235740662, "loss/kd": 1.2852391004562378, "loss/lm": 0.33772754669189453, "step": 2769 }, { "epoch": 0.5686717306507904, "grad_norm": 1.497422333106505, "kd_ratio": 0.5, "learning_rate": 8.272979048072797e-06, "loss": 1.066228985786438, "loss/kd": 1.7854690551757812, "loss/lm": 0.3469889760017395, "step": 2770 }, { "epoch": 0.5688770273044549, "grad_norm": 1.1398455927703663, "kd_ratio": 0.5, "learning_rate": 8.266429076071221e-06, "loss": 0.9170776605606079, "loss/kd": 1.4759165048599243, "loss/lm": 0.3582388758659363, "step": 2771 }, { "epoch": 0.5690823239581195, "grad_norm": 1.066895184257067, "kd_ratio": 0.5, "learning_rate": 8.25987987076276e-06, "loss": 1.0372183322906494, "loss/kd": 1.7258096933364868, "loss/lm": 0.3486270010471344, "step": 2772 }, { "epoch": 0.5692876206117841, "grad_norm": 1.4044563068133826, "kd_ratio": 0.5, "learning_rate": 8.253331435043888e-06, "loss": 0.9404542446136475, "loss/kd": 1.5663447380065918, "loss/lm": 0.3145638108253479, "step": 2773 }, { "epoch": 0.5694929172654486, "grad_norm": 1.3736938009691875, "kd_ratio": 0.5, "learning_rate": 8.246783771810728e-06, "loss": 1.0221763849258423, "loss/kd": 1.6879987716674805, "loss/lm": 0.3563540577888489, "step": 2774 }, { "epoch": 0.5696982139191131, "grad_norm": 1.5051644902314933, "kd_ratio": 0.5, "learning_rate": 8.240236883959067e-06, "loss": 1.656666874885559, "loss/kd": 2.910504102706909, "loss/lm": 0.40282970666885376, "step": 2775 }, { "epoch": 0.5699035105727777, "grad_norm": 1.2009896308863741, "kd_ratio": 0.5, "learning_rate": 8.233690774384344e-06, "loss": 1.3718390464782715, "loss/kd": 2.3225009441375732, "loss/lm": 0.42117711901664734, "step": 2776 }, { "epoch": 0.5701088072264422, "grad_norm": 1.201941359288425, "kd_ratio": 0.5, "learning_rate": 8.227145445981655e-06, "loss": 1.1061718463897705, "loss/kd": 1.9165709018707275, "loss/lm": 0.2957727015018463, "step": 2777 }, { "epoch": 0.5703141038801067, "grad_norm": 1.5976164353211568, "kd_ratio": 0.5, "learning_rate": 8.22060090164576e-06, "loss": 1.0191936492919922, "loss/kd": 1.5941039323806763, "loss/lm": 0.44428324699401855, "step": 2778 }, { "epoch": 0.5705194005337713, "grad_norm": 1.0986434760294443, "kd_ratio": 0.5, "learning_rate": 8.214057144271058e-06, "loss": 1.0604047775268555, "loss/kd": 1.781665325164795, "loss/lm": 0.3391442894935608, "step": 2779 }, { "epoch": 0.5707246971874358, "grad_norm": 1.5788021669421215, "kd_ratio": 0.5, "learning_rate": 8.207514176751606e-06, "loss": 1.026010513305664, "loss/kd": 1.6557666063308716, "loss/lm": 0.39625445008277893, "step": 2780 }, { "epoch": 0.5709299938411004, "grad_norm": 0.9575603196130101, "kd_ratio": 0.5, "learning_rate": 8.200972001981119e-06, "loss": 1.0600589513778687, "loss/kd": 1.850730299949646, "loss/lm": 0.2693875730037689, "step": 2781 }, { "epoch": 0.571135290494765, "grad_norm": 1.476829923234302, "kd_ratio": 0.5, "learning_rate": 8.194430622852946e-06, "loss": 0.9673147201538086, "loss/kd": 1.5648657083511353, "loss/lm": 0.36976373195648193, "step": 2782 }, { "epoch": 0.5713405871484295, "grad_norm": 1.0950006008784923, "kd_ratio": 0.5, "learning_rate": 8.187890042260094e-06, "loss": 0.9846938848495483, "loss/kd": 1.6356532573699951, "loss/lm": 0.3337344527244568, "step": 2783 }, { "epoch": 0.571545883802094, "grad_norm": 1.3058634628472496, "kd_ratio": 0.5, "learning_rate": 8.18135026309522e-06, "loss": 1.173170566558838, "loss/kd": 1.9103856086730957, "loss/lm": 0.4359554350376129, "step": 2784 }, { "epoch": 0.5717511804557586, "grad_norm": 0.9457492898893834, "kd_ratio": 0.5, "learning_rate": 8.174811288250621e-06, "loss": 1.106018304824829, "loss/kd": 1.8675434589385986, "loss/lm": 0.34449321031570435, "step": 2785 }, { "epoch": 0.5719564771094231, "grad_norm": 1.3987511315112404, "kd_ratio": 0.5, "learning_rate": 8.168273120618238e-06, "loss": 0.8583462238311768, "loss/kd": 1.3973897695541382, "loss/lm": 0.31930267810821533, "step": 2786 }, { "epoch": 0.5721617737630876, "grad_norm": 0.9478267499637751, "kd_ratio": 0.5, "learning_rate": 8.161735763089654e-06, "loss": 0.9827986359596252, "loss/kd": 1.6058387756347656, "loss/lm": 0.3597584664821625, "step": 2787 }, { "epoch": 0.5723670704167522, "grad_norm": 1.1362561257452681, "kd_ratio": 0.5, "learning_rate": 8.155199218556098e-06, "loss": 1.0327614545822144, "loss/kd": 1.7069873809814453, "loss/lm": 0.3585355281829834, "step": 2788 }, { "epoch": 0.5725723670704167, "grad_norm": 1.0247374166286787, "kd_ratio": 0.5, "learning_rate": 8.148663489908437e-06, "loss": 1.0501208305358887, "loss/kd": 1.6958123445510864, "loss/lm": 0.40442943572998047, "step": 2789 }, { "epoch": 0.5727776637240813, "grad_norm": 1.0477437773961487, "kd_ratio": 0.5, "learning_rate": 8.14212858003718e-06, "loss": 1.3378655910491943, "loss/kd": 2.3413257598876953, "loss/lm": 0.3344053328037262, "step": 2790 }, { "epoch": 0.5729829603777459, "grad_norm": 1.1645207242595954, "kd_ratio": 0.5, "learning_rate": 8.13559449183247e-06, "loss": 1.139762043952942, "loss/kd": 1.9565459489822388, "loss/lm": 0.3229782283306122, "step": 2791 }, { "epoch": 0.5731882570314104, "grad_norm": 1.1343492609192445, "kd_ratio": 0.5, "learning_rate": 8.12906122818409e-06, "loss": 1.0162878036499023, "loss/kd": 1.716552495956421, "loss/lm": 0.3160230219364166, "step": 2792 }, { "epoch": 0.5733935536850749, "grad_norm": 0.9521008900941228, "kd_ratio": 0.5, "learning_rate": 8.122528791981447e-06, "loss": 2.0650901794433594, "loss/kd": 3.7948052883148193, "loss/lm": 0.3353751301765442, "step": 2793 }, { "epoch": 0.5735988503387395, "grad_norm": 1.0653207733936925, "kd_ratio": 0.5, "learning_rate": 8.115997186113603e-06, "loss": 1.0680484771728516, "loss/kd": 1.7814582586288452, "loss/lm": 0.3546387851238251, "step": 2794 }, { "epoch": 0.573804146992404, "grad_norm": 0.9833387067909976, "kd_ratio": 0.5, "learning_rate": 8.109466413469238e-06, "loss": 0.9927239418029785, "loss/kd": 1.6476614475250244, "loss/lm": 0.33778640627861023, "step": 2795 }, { "epoch": 0.5740094436460685, "grad_norm": 0.9770876541781355, "kd_ratio": 0.5, "learning_rate": 8.102936476936665e-06, "loss": 0.9947199821472168, "loss/kd": 1.6340479850769043, "loss/lm": 0.3553919494152069, "step": 2796 }, { "epoch": 0.5742147402997331, "grad_norm": 1.0813023550900132, "kd_ratio": 0.5, "learning_rate": 8.096407379403826e-06, "loss": 0.9043975472450256, "loss/kd": 1.4582736492156982, "loss/lm": 0.35052141547203064, "step": 2797 }, { "epoch": 0.5744200369533976, "grad_norm": 1.7170601893634765, "kd_ratio": 0.5, "learning_rate": 8.089879123758301e-06, "loss": 0.9987371563911438, "loss/kd": 1.6683732271194458, "loss/lm": 0.3291010558605194, "step": 2798 }, { "epoch": 0.5746253336070622, "grad_norm": 1.0453803914377444, "kd_ratio": 0.5, "learning_rate": 8.083351712887288e-06, "loss": 1.1762382984161377, "loss/kd": 1.9520989656448364, "loss/lm": 0.4003777503967285, "step": 2799 }, { "epoch": 0.5748306302607268, "grad_norm": 1.2880375107973747, "kd_ratio": 0.5, "learning_rate": 8.076825149677617e-06, "loss": 0.9818542003631592, "loss/kd": 1.6382120847702026, "loss/lm": 0.32549628615379333, "step": 2800 }, { "epoch": 0.5750359269143913, "grad_norm": 1.0395353427965952, "kd_ratio": 0.5, "learning_rate": 8.070299437015741e-06, "loss": 1.2743134498596191, "loss/kd": 2.1677916049957275, "loss/lm": 0.3808353543281555, "step": 2801 }, { "epoch": 0.5752412235680558, "grad_norm": 1.1675225159318812, "kd_ratio": 0.5, "learning_rate": 8.063774577787736e-06, "loss": 0.9680136442184448, "loss/kd": 1.6392024755477905, "loss/lm": 0.29682478308677673, "step": 2802 }, { "epoch": 0.5754465202217204, "grad_norm": 1.0946320280397133, "kd_ratio": 0.5, "learning_rate": 8.057250574879296e-06, "loss": 1.1170037984848022, "loss/kd": 1.8481431007385254, "loss/lm": 0.3858644962310791, "step": 2803 }, { "epoch": 0.5756518168753849, "grad_norm": 1.0544070383200508, "kd_ratio": 0.5, "learning_rate": 8.050727431175752e-06, "loss": 0.8941067457199097, "loss/kd": 1.4909459352493286, "loss/lm": 0.2972675859928131, "step": 2804 }, { "epoch": 0.5758571135290494, "grad_norm": 1.1773201111346805, "kd_ratio": 0.5, "learning_rate": 8.044205149562041e-06, "loss": 1.0061978101730347, "loss/kd": 1.6540793180465698, "loss/lm": 0.3583163619041443, "step": 2805 }, { "epoch": 0.576062410182714, "grad_norm": 0.9745042121899851, "kd_ratio": 0.5, "learning_rate": 8.03768373292272e-06, "loss": 1.0223469734191895, "loss/kd": 1.6787759065628052, "loss/lm": 0.36591798067092896, "step": 2806 }, { "epoch": 0.5762677068363785, "grad_norm": 1.0369438792959325, "kd_ratio": 0.5, "learning_rate": 8.031163184141965e-06, "loss": 1.0937819480895996, "loss/kd": 1.843841791152954, "loss/lm": 0.34372207522392273, "step": 2807 }, { "epoch": 0.5764730034900432, "grad_norm": 1.1807607696149889, "kd_ratio": 0.5, "learning_rate": 8.024643506103574e-06, "loss": 1.0591273307800293, "loss/kd": 1.7898606061935425, "loss/lm": 0.3283940255641937, "step": 2808 }, { "epoch": 0.5766783001437077, "grad_norm": 1.0607001163129284, "kd_ratio": 0.5, "learning_rate": 8.018124701690948e-06, "loss": 0.9174329042434692, "loss/kd": 1.478150725364685, "loss/lm": 0.3567150831222534, "step": 2809 }, { "epoch": 0.5768835967973722, "grad_norm": 0.9254307515451127, "kd_ratio": 0.5, "learning_rate": 8.011606773787118e-06, "loss": 1.2427339553833008, "loss/kd": 2.180178165435791, "loss/lm": 0.30528974533081055, "step": 2810 }, { "epoch": 0.5770888934510368, "grad_norm": 1.2019566630451635, "kd_ratio": 0.5, "learning_rate": 8.005089725274711e-06, "loss": 1.3873162269592285, "loss/kd": 2.4371347427368164, "loss/lm": 0.33749762177467346, "step": 2811 }, { "epoch": 0.5772941901047013, "grad_norm": 0.9747379300141892, "kd_ratio": 0.5, "learning_rate": 7.998573559035976e-06, "loss": 1.1120132207870483, "loss/kd": 1.8316152095794678, "loss/lm": 0.3924112319946289, "step": 2812 }, { "epoch": 0.5774994867583658, "grad_norm": 1.11905095169701, "kd_ratio": 0.5, "learning_rate": 7.992058277952765e-06, "loss": 1.069166660308838, "loss/kd": 1.7101322412490845, "loss/lm": 0.42820096015930176, "step": 2813 }, { "epoch": 0.5777047834120304, "grad_norm": 0.9678058279083405, "kd_ratio": 0.5, "learning_rate": 7.985543884906539e-06, "loss": 1.1487109661102295, "loss/kd": 1.9342191219329834, "loss/lm": 0.3632028102874756, "step": 2814 }, { "epoch": 0.5779100800656949, "grad_norm": 1.037419431225701, "kd_ratio": 0.5, "learning_rate": 7.979030382778376e-06, "loss": 1.068595051765442, "loss/kd": 1.8204947710037231, "loss/lm": 0.31669536232948303, "step": 2815 }, { "epoch": 0.5781153767193594, "grad_norm": 0.9932017130896507, "kd_ratio": 0.5, "learning_rate": 7.972517774448946e-06, "loss": 0.9407647252082825, "loss/kd": 1.5265849828720093, "loss/lm": 0.35494449734687805, "step": 2816 }, { "epoch": 0.5783206733730241, "grad_norm": 0.9611219433916328, "kd_ratio": 0.5, "learning_rate": 7.966006062798534e-06, "loss": 1.085211992263794, "loss/kd": 1.7566618919372559, "loss/lm": 0.4137621819972992, "step": 2817 }, { "epoch": 0.5785259700266886, "grad_norm": 1.1796025459862405, "kd_ratio": 0.5, "learning_rate": 7.959495250707026e-06, "loss": 1.330389380455017, "loss/kd": 2.205249309539795, "loss/lm": 0.4555293619632721, "step": 2818 }, { "epoch": 0.5787312666803531, "grad_norm": 1.0000152592728415, "kd_ratio": 0.5, "learning_rate": 7.952985341053902e-06, "loss": 1.0868000984191895, "loss/kd": 1.8450851440429688, "loss/lm": 0.32851511240005493, "step": 2819 }, { "epoch": 0.5789365633340177, "grad_norm": 1.2161514774849982, "kd_ratio": 0.5, "learning_rate": 7.946476336718258e-06, "loss": 0.9136779308319092, "loss/kd": 1.4830166101455688, "loss/lm": 0.3443392217159271, "step": 2820 }, { "epoch": 0.5791418599876822, "grad_norm": 1.0851813981669745, "kd_ratio": 0.5, "learning_rate": 7.939968240578778e-06, "loss": 0.9271063208580017, "loss/kd": 1.4067845344543457, "loss/lm": 0.4474281072616577, "step": 2821 }, { "epoch": 0.5793471566413467, "grad_norm": 1.0934251316700314, "kd_ratio": 0.5, "learning_rate": 7.93346105551375e-06, "loss": 1.2299561500549316, "loss/kd": 2.0576515197753906, "loss/lm": 0.4022608995437622, "step": 2822 }, { "epoch": 0.5795524532950113, "grad_norm": 0.9503683019188118, "kd_ratio": 0.5, "learning_rate": 7.92695478440105e-06, "loss": 1.1416783332824707, "loss/kd": 1.8584057092666626, "loss/lm": 0.4249509572982788, "step": 2823 }, { "epoch": 0.5797577499486758, "grad_norm": 1.0867838469389155, "kd_ratio": 0.5, "learning_rate": 7.92044943011816e-06, "loss": 1.2970845699310303, "loss/kd": 2.3057868480682373, "loss/lm": 0.28838223218917847, "step": 2824 }, { "epoch": 0.5799630466023403, "grad_norm": 0.9766057170664746, "kd_ratio": 0.5, "learning_rate": 7.913944995542159e-06, "loss": 1.458254098892212, "loss/kd": 2.6462302207946777, "loss/lm": 0.2702779769897461, "step": 2825 }, { "epoch": 0.580168343256005, "grad_norm": 1.04372173372621, "kd_ratio": 0.5, "learning_rate": 7.907441483549705e-06, "loss": 0.9619735479354858, "loss/kd": 1.6303993463516235, "loss/lm": 0.2935478091239929, "step": 2826 }, { "epoch": 0.5803736399096695, "grad_norm": 2.329089345982785, "kd_ratio": 0.5, "learning_rate": 7.900938897017064e-06, "loss": 1.088217854499817, "loss/kd": 1.8737170696258545, "loss/lm": 0.3027186691761017, "step": 2827 }, { "epoch": 0.580578936563334, "grad_norm": 1.0158684307857264, "kd_ratio": 0.5, "learning_rate": 7.89443723882008e-06, "loss": 0.9291251301765442, "loss/kd": 1.54386568069458, "loss/lm": 0.3143845498561859, "step": 2828 }, { "epoch": 0.5807842332169986, "grad_norm": 1.2171978556761025, "kd_ratio": 0.5, "learning_rate": 7.88793651183419e-06, "loss": 1.1763250827789307, "loss/kd": 1.960370421409607, "loss/lm": 0.39227962493896484, "step": 2829 }, { "epoch": 0.5809895298706631, "grad_norm": 1.3826304729323686, "kd_ratio": 0.5, "learning_rate": 7.881436718934427e-06, "loss": 1.3956196308135986, "loss/kd": 2.3935532569885254, "loss/lm": 0.3976859748363495, "step": 2830 }, { "epoch": 0.5811948265243276, "grad_norm": 1.235058171388672, "kd_ratio": 0.5, "learning_rate": 7.874937862995401e-06, "loss": 1.0099574327468872, "loss/kd": 1.6735273599624634, "loss/lm": 0.3463875651359558, "step": 2831 }, { "epoch": 0.5814001231779922, "grad_norm": 1.363840775548946, "kd_ratio": 0.5, "learning_rate": 7.86843994689131e-06, "loss": 1.2995326519012451, "loss/kd": 2.251615524291992, "loss/lm": 0.34744980931282043, "step": 2832 }, { "epoch": 0.5816054198316567, "grad_norm": 1.2313722518965275, "kd_ratio": 0.5, "learning_rate": 7.861942973495939e-06, "loss": 1.138418436050415, "loss/kd": 1.9689058065414429, "loss/lm": 0.3079310357570648, "step": 2833 }, { "epoch": 0.5818107164853213, "grad_norm": 1.1751166631458394, "kd_ratio": 0.5, "learning_rate": 7.855446945682655e-06, "loss": 1.1242402791976929, "loss/kd": 1.8660588264465332, "loss/lm": 0.3824218213558197, "step": 2834 }, { "epoch": 0.5820160131389859, "grad_norm": 1.0251054083697066, "kd_ratio": 0.5, "learning_rate": 7.848951866324402e-06, "loss": 1.1686314344406128, "loss/kd": 1.9787043333053589, "loss/lm": 0.35855844616889954, "step": 2835 }, { "epoch": 0.5822213097926504, "grad_norm": 1.3120319624508614, "kd_ratio": 0.5, "learning_rate": 7.842457738293714e-06, "loss": 0.8610324859619141, "loss/kd": 1.4311802387237549, "loss/lm": 0.290884792804718, "step": 2836 }, { "epoch": 0.5824266064463149, "grad_norm": 0.9233479533681423, "kd_ratio": 0.5, "learning_rate": 7.835964564462701e-06, "loss": 1.203900694847107, "loss/kd": 2.0981662273406982, "loss/lm": 0.309635192155838, "step": 2837 }, { "epoch": 0.5826319030999795, "grad_norm": 1.7048655898788243, "kd_ratio": 0.5, "learning_rate": 7.829472347703046e-06, "loss": 1.1341303586959839, "loss/kd": 1.8405147790908813, "loss/lm": 0.4277459383010864, "step": 2838 }, { "epoch": 0.582837199753644, "grad_norm": 1.115063797409578, "kd_ratio": 0.5, "learning_rate": 7.822981090886011e-06, "loss": 1.0730090141296387, "loss/kd": 1.7460777759552002, "loss/lm": 0.39994022250175476, "step": 2839 }, { "epoch": 0.5830424964073085, "grad_norm": 1.4121075292847234, "kd_ratio": 0.5, "learning_rate": 7.816490796882432e-06, "loss": 1.0621201992034912, "loss/kd": 1.710478663444519, "loss/lm": 0.41376185417175293, "step": 2840 }, { "epoch": 0.5832477930609731, "grad_norm": 1.3214549288031747, "kd_ratio": 0.5, "learning_rate": 7.810001468562728e-06, "loss": 1.2355059385299683, "loss/kd": 2.0523250102996826, "loss/lm": 0.41868680715560913, "step": 2841 }, { "epoch": 0.5834530897146376, "grad_norm": 1.2458323596312046, "kd_ratio": 0.5, "learning_rate": 7.803513108796882e-06, "loss": 0.9353852868080139, "loss/kd": 1.5836070775985718, "loss/lm": 0.28716346621513367, "step": 2842 }, { "epoch": 0.5836583863683023, "grad_norm": 1.2033491715076141, "kd_ratio": 0.5, "learning_rate": 7.79702572045445e-06, "loss": 2.028874158859253, "loss/kd": 3.772639274597168, "loss/lm": 0.2851088345050812, "step": 2843 }, { "epoch": 0.5838636830219668, "grad_norm": 1.358399040995502, "kd_ratio": 0.5, "learning_rate": 7.79053930640456e-06, "loss": 0.9962665438652039, "loss/kd": 1.6710262298583984, "loss/lm": 0.32150688767433167, "step": 2844 }, { "epoch": 0.5840689796756313, "grad_norm": 1.0753827586831861, "kd_ratio": 0.5, "learning_rate": 7.784053869515904e-06, "loss": 1.1110427379608154, "loss/kd": 1.8385802507400513, "loss/lm": 0.3835051953792572, "step": 2845 }, { "epoch": 0.5842742763292958, "grad_norm": 1.5807585906062986, "kd_ratio": 0.5, "learning_rate": 7.777569412656757e-06, "loss": 1.2136200666427612, "loss/kd": 2.0691280364990234, "loss/lm": 0.3581121563911438, "step": 2846 }, { "epoch": 0.5844795729829604, "grad_norm": 1.8009984967911283, "kd_ratio": 0.5, "learning_rate": 7.771085938694943e-06, "loss": 0.8780449628829956, "loss/kd": 1.398668646812439, "loss/lm": 0.35742121934890747, "step": 2847 }, { "epoch": 0.5846848696366249, "grad_norm": 1.116056893128097, "kd_ratio": 0.5, "learning_rate": 7.764603450497861e-06, "loss": 1.3772052526474, "loss/kd": 2.38496994972229, "loss/lm": 0.36944058537483215, "step": 2848 }, { "epoch": 0.5848901662902894, "grad_norm": 1.7069443451467878, "kd_ratio": 0.5, "learning_rate": 7.758121950932469e-06, "loss": 1.006434440612793, "loss/kd": 1.6757655143737793, "loss/lm": 0.33710330724716187, "step": 2849 }, { "epoch": 0.585095462943954, "grad_norm": 1.0095364405969274, "kd_ratio": 0.5, "learning_rate": 7.751641442865285e-06, "loss": 1.141458511352539, "loss/kd": 1.9680821895599365, "loss/lm": 0.314834862947464, "step": 2850 }, { "epoch": 0.5853007595976185, "grad_norm": 1.680667704271087, "kd_ratio": 0.5, "learning_rate": 7.745161929162405e-06, "loss": 0.7862467765808105, "loss/kd": 1.2218736410140991, "loss/lm": 0.35061994194984436, "step": 2851 }, { "epoch": 0.5855060562512832, "grad_norm": 1.3076670057628867, "kd_ratio": 0.5, "learning_rate": 7.73868341268947e-06, "loss": 1.2767271995544434, "loss/kd": 2.1969070434570312, "loss/lm": 0.3565472364425659, "step": 2852 }, { "epoch": 0.5857113529049477, "grad_norm": 1.0787606853082918, "kd_ratio": 0.5, "learning_rate": 7.732205896311678e-06, "loss": 0.9770397543907166, "loss/kd": 1.6071248054504395, "loss/lm": 0.34695467352867126, "step": 2853 }, { "epoch": 0.5859166495586122, "grad_norm": 1.1090569915314419, "kd_ratio": 0.5, "learning_rate": 7.725729382893796e-06, "loss": 1.1704202890396118, "loss/kd": 2.0191004276275635, "loss/lm": 0.32174018025398254, "step": 2854 }, { "epoch": 0.5861219462122768, "grad_norm": 1.1211113751057076, "kd_ratio": 0.5, "learning_rate": 7.719253875300138e-06, "loss": 1.0600694417953491, "loss/kd": 1.8766213655471802, "loss/lm": 0.24351747334003448, "step": 2855 }, { "epoch": 0.5863272428659413, "grad_norm": 0.9165285939278927, "kd_ratio": 0.5, "learning_rate": 7.712779376394582e-06, "loss": 0.7956986427307129, "loss/kd": 1.2384179830551147, "loss/lm": 0.3529793620109558, "step": 2856 }, { "epoch": 0.5865325395196058, "grad_norm": 1.0489985691026438, "kd_ratio": 0.5, "learning_rate": 7.706305889040555e-06, "loss": 1.073938012123108, "loss/kd": 1.7631332874298096, "loss/lm": 0.384742796421051, "step": 2857 }, { "epoch": 0.5867378361732704, "grad_norm": 0.9521903092905075, "kd_ratio": 0.5, "learning_rate": 7.699833416101033e-06, "loss": 1.374786376953125, "loss/kd": 2.337484121322632, "loss/lm": 0.4120885729789734, "step": 2858 }, { "epoch": 0.5869431328269349, "grad_norm": 1.0501993208683202, "kd_ratio": 0.5, "learning_rate": 7.693361960438548e-06, "loss": 0.9980581402778625, "loss/kd": 1.653694748878479, "loss/lm": 0.3424215316772461, "step": 2859 }, { "epoch": 0.5871484294805994, "grad_norm": 1.2396471494771761, "kd_ratio": 0.5, "learning_rate": 7.68689152491518e-06, "loss": 1.1004561185836792, "loss/kd": 1.7749830484390259, "loss/lm": 0.42592909932136536, "step": 2860 }, { "epoch": 0.5873537261342641, "grad_norm": 0.9745116823243251, "kd_ratio": 0.5, "learning_rate": 7.68042211239256e-06, "loss": 1.1565394401550293, "loss/kd": 1.977034091949463, "loss/lm": 0.3360448181629181, "step": 2861 }, { "epoch": 0.5875590227879286, "grad_norm": 1.0892406726104962, "kd_ratio": 0.5, "learning_rate": 7.673953725731866e-06, "loss": 1.1760339736938477, "loss/kd": 2.024932384490967, "loss/lm": 0.3271355628967285, "step": 2862 }, { "epoch": 0.5877643194415931, "grad_norm": 1.0830817817034513, "kd_ratio": 0.5, "learning_rate": 7.667486367793822e-06, "loss": 1.114310622215271, "loss/kd": 1.8679640293121338, "loss/lm": 0.3606571853160858, "step": 2863 }, { "epoch": 0.5879696160952577, "grad_norm": 0.9953968930362012, "kd_ratio": 0.5, "learning_rate": 7.661020041438696e-06, "loss": 1.3173065185546875, "loss/kd": 2.2068285942077637, "loss/lm": 0.42778438329696655, "step": 2864 }, { "epoch": 0.5881749127489222, "grad_norm": 1.2173552141998678, "kd_ratio": 0.5, "learning_rate": 7.654554749526301e-06, "loss": 1.165615200996399, "loss/kd": 1.947813868522644, "loss/lm": 0.3834165334701538, "step": 2865 }, { "epoch": 0.5883802094025867, "grad_norm": 0.9583887704063019, "kd_ratio": 0.5, "learning_rate": 7.64809049491599e-06, "loss": 1.106345534324646, "loss/kd": 1.8122599124908447, "loss/lm": 0.40043115615844727, "step": 2866 }, { "epoch": 0.5885855060562513, "grad_norm": 1.0074692267294578, "kd_ratio": 0.5, "learning_rate": 7.641627280466663e-06, "loss": 0.868965208530426, "loss/kd": 1.396325707435608, "loss/lm": 0.34160470962524414, "step": 2867 }, { "epoch": 0.5887908027099158, "grad_norm": 1.0095666231936573, "kd_ratio": 0.5, "learning_rate": 7.635165109036756e-06, "loss": 1.0092052221298218, "loss/kd": 1.665519118309021, "loss/lm": 0.3528914153575897, "step": 2868 }, { "epoch": 0.5889960993635803, "grad_norm": 1.0191021160088722, "kd_ratio": 0.5, "learning_rate": 7.628703983484241e-06, "loss": 1.1001821756362915, "loss/kd": 1.8484821319580078, "loss/lm": 0.3518822193145752, "step": 2869 }, { "epoch": 0.589201396017245, "grad_norm": 1.15869193123442, "kd_ratio": 0.5, "learning_rate": 7.622243906666631e-06, "loss": 0.8454615473747253, "loss/kd": 1.3523054122924805, "loss/lm": 0.3386176824569702, "step": 2870 }, { "epoch": 0.5894066926709095, "grad_norm": 1.0568071908413648, "kd_ratio": 0.5, "learning_rate": 7.615784881440975e-06, "loss": 1.066171407699585, "loss/kd": 1.752052664756775, "loss/lm": 0.38029006123542786, "step": 2871 }, { "epoch": 0.589611989324574, "grad_norm": 1.0384243644313544, "kd_ratio": 0.5, "learning_rate": 7.6093269106638576e-06, "loss": 1.0087841749191284, "loss/kd": 1.5844614505767822, "loss/lm": 0.4331068694591522, "step": 2872 }, { "epoch": 0.5898172859782386, "grad_norm": 1.1524629094822623, "kd_ratio": 0.5, "learning_rate": 7.602869997191398e-06, "loss": 0.9294511079788208, "loss/kd": 1.5733280181884766, "loss/lm": 0.2855742275714874, "step": 2873 }, { "epoch": 0.5900225826319031, "grad_norm": 1.0385768276214997, "kd_ratio": 0.5, "learning_rate": 7.596414143879243e-06, "loss": 1.0477643013000488, "loss/kd": 1.7786738872528076, "loss/lm": 0.31685471534729004, "step": 2874 }, { "epoch": 0.5902278792855676, "grad_norm": 1.100413372133579, "kd_ratio": 0.5, "learning_rate": 7.589959353582574e-06, "loss": 1.0426533222198486, "loss/kd": 1.7907747030258179, "loss/lm": 0.29453203082084656, "step": 2875 }, { "epoch": 0.5904331759392322, "grad_norm": 1.0834406907386231, "kd_ratio": 0.5, "learning_rate": 7.583505629156096e-06, "loss": 1.0447255373001099, "loss/kd": 1.6552661657333374, "loss/lm": 0.4341849684715271, "step": 2876 }, { "epoch": 0.5906384725928967, "grad_norm": 1.465014058695486, "kd_ratio": 0.5, "learning_rate": 7.577052973454057e-06, "loss": 2.0321450233459473, "loss/kd": 3.7584381103515625, "loss/lm": 0.30585184693336487, "step": 2877 }, { "epoch": 0.5908437692465612, "grad_norm": 1.0045156497970966, "kd_ratio": 0.5, "learning_rate": 7.570601389330222e-06, "loss": 1.0225673913955688, "loss/kd": 1.6714870929718018, "loss/lm": 0.37364766001701355, "step": 2878 }, { "epoch": 0.5910490659002259, "grad_norm": 1.1875247122500614, "kd_ratio": 0.5, "learning_rate": 7.564150879637882e-06, "loss": 1.140101671218872, "loss/kd": 1.9557583332061768, "loss/lm": 0.324444979429245, "step": 2879 }, { "epoch": 0.5912543625538904, "grad_norm": 1.0633844263890586, "kd_ratio": 0.5, "learning_rate": 7.55770144722985e-06, "loss": 1.477823257446289, "loss/kd": 2.5946247577667236, "loss/lm": 0.3610217273235321, "step": 2880 }, { "epoch": 0.5914596592075549, "grad_norm": 0.9654379779186794, "kd_ratio": 0.5, "learning_rate": 7.551253094958473e-06, "loss": 1.5077593326568604, "loss/kd": 2.6467363834381104, "loss/lm": 0.3687823414802551, "step": 2881 }, { "epoch": 0.5916649558612195, "grad_norm": 0.9132396254507194, "kd_ratio": 0.5, "learning_rate": 7.544805825675612e-06, "loss": 1.3265087604522705, "loss/kd": 2.282074213027954, "loss/lm": 0.3709433376789093, "step": 2882 }, { "epoch": 0.591870252514884, "grad_norm": 1.04518396762722, "kd_ratio": 0.5, "learning_rate": 7.538359642232654e-06, "loss": 1.9370665550231934, "loss/kd": 3.6147689819335938, "loss/lm": 0.2593640089035034, "step": 2883 }, { "epoch": 0.5920755491685485, "grad_norm": 1.1546209861145629, "kd_ratio": 0.5, "learning_rate": 7.531914547480501e-06, "loss": 1.1672492027282715, "loss/kd": 1.9444084167480469, "loss/lm": 0.3900899589061737, "step": 2884 }, { "epoch": 0.5922808458222131, "grad_norm": 0.9344807965158465, "kd_ratio": 0.5, "learning_rate": 7.525470544269575e-06, "loss": 1.271063208580017, "loss/kd": 2.1724722385406494, "loss/lm": 0.36965411901474, "step": 2885 }, { "epoch": 0.5924861424758776, "grad_norm": 1.0504642006291853, "kd_ratio": 0.5, "learning_rate": 7.5190276354498184e-06, "loss": 1.057557463645935, "loss/kd": 1.774078607559204, "loss/lm": 0.341036319732666, "step": 2886 }, { "epoch": 0.5926914391295421, "grad_norm": 1.0846783080230151, "kd_ratio": 0.5, "learning_rate": 7.5125858238706785e-06, "loss": 0.9542309045791626, "loss/kd": 1.5470246076583862, "loss/lm": 0.3614371418952942, "step": 2887 }, { "epoch": 0.5928967357832068, "grad_norm": 1.1317109100422402, "kd_ratio": 0.5, "learning_rate": 7.506145112381138e-06, "loss": 0.9255214333534241, "loss/kd": 1.519194483757019, "loss/lm": 0.3318484127521515, "step": 2888 }, { "epoch": 0.5931020324368713, "grad_norm": 1.56578655148672, "kd_ratio": 0.5, "learning_rate": 7.499705503829673e-06, "loss": 1.1676418781280518, "loss/kd": 1.9315950870513916, "loss/lm": 0.4036886692047119, "step": 2889 }, { "epoch": 0.5933073290905359, "grad_norm": 1.759970462321766, "kd_ratio": 0.5, "learning_rate": 7.493267001064286e-06, "loss": 1.0026034116744995, "loss/kd": 1.6543575525283813, "loss/lm": 0.35084930062294006, "step": 2890 }, { "epoch": 0.5935126257442004, "grad_norm": 1.0514506781513282, "kd_ratio": 0.5, "learning_rate": 7.486829606932478e-06, "loss": 1.0724736452102661, "loss/kd": 1.7407617568969727, "loss/lm": 0.40418553352355957, "step": 2891 }, { "epoch": 0.5937179223978649, "grad_norm": 1.52637821350059, "kd_ratio": 0.5, "learning_rate": 7.480393324281267e-06, "loss": 1.001913070678711, "loss/kd": 1.6712794303894043, "loss/lm": 0.332546591758728, "step": 2892 }, { "epoch": 0.5939232190515294, "grad_norm": 1.2705058749800944, "kd_ratio": 0.5, "learning_rate": 7.473958155957182e-06, "loss": 1.0069295167922974, "loss/kd": 1.726540207862854, "loss/lm": 0.2873188257217407, "step": 2893 }, { "epoch": 0.594128515705194, "grad_norm": 1.1690679021781623, "kd_ratio": 0.5, "learning_rate": 7.4675241048062545e-06, "loss": 0.8365532755851746, "loss/kd": 1.3722560405731201, "loss/lm": 0.3008505403995514, "step": 2894 }, { "epoch": 0.5943338123588585, "grad_norm": 1.2956638166100478, "kd_ratio": 0.5, "learning_rate": 7.461091173674022e-06, "loss": 1.0693808794021606, "loss/kd": 1.7573738098144531, "loss/lm": 0.38138794898986816, "step": 2895 }, { "epoch": 0.594539109012523, "grad_norm": 0.9522868549421087, "kd_ratio": 0.5, "learning_rate": 7.454659365405528e-06, "loss": 0.8435838222503662, "loss/kd": 1.3436470031738281, "loss/lm": 0.3435206711292267, "step": 2896 }, { "epoch": 0.5947444056661877, "grad_norm": 1.247541624741334, "kd_ratio": 0.5, "learning_rate": 7.448228682845313e-06, "loss": 1.1095099449157715, "loss/kd": 1.8878980875015259, "loss/lm": 0.33112189173698425, "step": 2897 }, { "epoch": 0.5949497023198522, "grad_norm": 1.072445754386357, "kd_ratio": 0.5, "learning_rate": 7.441799128837437e-06, "loss": 0.8214402794837952, "loss/kd": 1.3203719854354858, "loss/lm": 0.3225085735321045, "step": 2898 }, { "epoch": 0.5951549989735168, "grad_norm": 1.439666267494756, "kd_ratio": 0.5, "learning_rate": 7.43537070622544e-06, "loss": 1.0956790447235107, "loss/kd": 1.8464020490646362, "loss/lm": 0.3449561297893524, "step": 2899 }, { "epoch": 0.5953602956271813, "grad_norm": 1.5674838281910017, "kd_ratio": 0.5, "learning_rate": 7.428943417852379e-06, "loss": 0.8605512380599976, "loss/kd": 1.4849517345428467, "loss/lm": 0.23615078628063202, "step": 2900 }, { "epoch": 0.5955655922808458, "grad_norm": 1.1626609075225152, "kd_ratio": 0.5, "learning_rate": 7.422517266560796e-06, "loss": 1.2004605531692505, "loss/kd": 2.033277988433838, "loss/lm": 0.3676430284976959, "step": 2901 }, { "epoch": 0.5957708889345104, "grad_norm": 1.8470835493842694, "kd_ratio": 0.5, "learning_rate": 7.416092255192734e-06, "loss": 1.02800452709198, "loss/kd": 1.7536187171936035, "loss/lm": 0.30239027738571167, "step": 2902 }, { "epoch": 0.5959761855881749, "grad_norm": 0.9388570729220261, "kd_ratio": 0.5, "learning_rate": 7.40966838658974e-06, "loss": 1.1158338785171509, "loss/kd": 1.9376811981201172, "loss/lm": 0.2939865291118622, "step": 2903 }, { "epoch": 0.5961814822418394, "grad_norm": 1.9838245570484567, "kd_ratio": 0.5, "learning_rate": 7.403245663592846e-06, "loss": 1.2302461862564087, "loss/kd": 2.0586414337158203, "loss/lm": 0.4018508493900299, "step": 2904 }, { "epoch": 0.596386778895504, "grad_norm": 1.2343306390617075, "kd_ratio": 0.5, "learning_rate": 7.39682408904258e-06, "loss": 1.1612098217010498, "loss/kd": 1.9695161581039429, "loss/lm": 0.3529033958911896, "step": 2905 }, { "epoch": 0.5965920755491686, "grad_norm": 1.1958350636188624, "kd_ratio": 0.5, "learning_rate": 7.390403665778965e-06, "loss": 1.015007734298706, "loss/kd": 1.6799044609069824, "loss/lm": 0.35011109709739685, "step": 2906 }, { "epoch": 0.5967973722028331, "grad_norm": 1.3664023846735367, "kd_ratio": 0.5, "learning_rate": 7.383984396641506e-06, "loss": 1.0418217182159424, "loss/kd": 1.7146961688995361, "loss/lm": 0.368947297334671, "step": 2907 }, { "epoch": 0.5970026688564977, "grad_norm": 1.0364174181445067, "kd_ratio": 0.5, "learning_rate": 7.3775662844692075e-06, "loss": 1.0411748886108398, "loss/kd": 1.7208093404769897, "loss/lm": 0.3615405559539795, "step": 2908 }, { "epoch": 0.5972079655101622, "grad_norm": 1.350737781155697, "kd_ratio": 0.5, "learning_rate": 7.37114933210056e-06, "loss": 0.992093563079834, "loss/kd": 1.6694564819335938, "loss/lm": 0.314730703830719, "step": 2909 }, { "epoch": 0.5974132621638267, "grad_norm": 1.5124204907335377, "kd_ratio": 0.5, "learning_rate": 7.364733542373542e-06, "loss": 1.239683747291565, "loss/kd": 2.1206018924713135, "loss/lm": 0.3587656319141388, "step": 2910 }, { "epoch": 0.5976185588174913, "grad_norm": 0.9440410005424726, "kd_ratio": 0.5, "learning_rate": 7.358318918125613e-06, "loss": 1.044998049736023, "loss/kd": 1.7304115295410156, "loss/lm": 0.3595845103263855, "step": 2911 }, { "epoch": 0.5978238554711558, "grad_norm": 1.3656460774048482, "kd_ratio": 0.5, "learning_rate": 7.351905462193718e-06, "loss": 1.2293542623519897, "loss/kd": 2.0448007583618164, "loss/lm": 0.4139077365398407, "step": 2912 }, { "epoch": 0.5980291521248203, "grad_norm": 1.1958100180807993, "kd_ratio": 0.5, "learning_rate": 7.345493177414284e-06, "loss": 1.0191296339035034, "loss/kd": 1.7063733339309692, "loss/lm": 0.33188596367836, "step": 2913 }, { "epoch": 0.5982344487784849, "grad_norm": 0.9795588909346142, "kd_ratio": 0.5, "learning_rate": 7.339082066623233e-06, "loss": 1.0848069190979004, "loss/kd": 1.8211108446121216, "loss/lm": 0.3485029339790344, "step": 2914 }, { "epoch": 0.5984397454321495, "grad_norm": 1.2002373203350891, "kd_ratio": 0.5, "learning_rate": 7.332672132655953e-06, "loss": 1.0440346002578735, "loss/kd": 1.717150330543518, "loss/lm": 0.3709188997745514, "step": 2915 }, { "epoch": 0.598645042085814, "grad_norm": 1.2565948110836673, "kd_ratio": 0.5, "learning_rate": 7.326263378347311e-06, "loss": 1.3644986152648926, "loss/kd": 2.449692964553833, "loss/lm": 0.27930429577827454, "step": 2916 }, { "epoch": 0.5988503387394786, "grad_norm": 1.0543377689835405, "kd_ratio": 0.5, "learning_rate": 7.3198558065316665e-06, "loss": 1.0030263662338257, "loss/kd": 1.6249805688858032, "loss/lm": 0.38107210397720337, "step": 2917 }, { "epoch": 0.5990556353931431, "grad_norm": 0.9889326064878136, "kd_ratio": 0.5, "learning_rate": 7.313449420042837e-06, "loss": 1.1107736825942993, "loss/kd": 1.8204387426376343, "loss/lm": 0.4011085629463196, "step": 2918 }, { "epoch": 0.5992609320468076, "grad_norm": 1.3303350386807786, "kd_ratio": 0.5, "learning_rate": 7.307044221714139e-06, "loss": 0.9781762361526489, "loss/kd": 1.576810598373413, "loss/lm": 0.37954193353652954, "step": 2919 }, { "epoch": 0.5994662287004722, "grad_norm": 1.3337465898675174, "kd_ratio": 0.5, "learning_rate": 7.30064021437834e-06, "loss": 1.0224108695983887, "loss/kd": 1.7170268297195435, "loss/lm": 0.3277948796749115, "step": 2920 }, { "epoch": 0.5996715253541367, "grad_norm": 0.8507527659064585, "kd_ratio": 0.5, "learning_rate": 7.294237400867696e-06, "loss": 1.2472175359725952, "loss/kd": 2.080962657928467, "loss/lm": 0.413472443819046, "step": 2921 }, { "epoch": 0.5998768220078012, "grad_norm": 1.434979608417555, "kd_ratio": 0.5, "learning_rate": 7.287835784013928e-06, "loss": 1.0172537565231323, "loss/kd": 1.6646878719329834, "loss/lm": 0.36981967091560364, "step": 2922 }, { "epoch": 0.6000821186614658, "grad_norm": 1.0954985568727904, "kd_ratio": 0.5, "learning_rate": 7.2814353666482276e-06, "loss": 1.1453396081924438, "loss/kd": 1.955032229423523, "loss/lm": 0.3356470465660095, "step": 2923 }, { "epoch": 0.6002874153151304, "grad_norm": 0.9463055617861779, "kd_ratio": 0.5, "learning_rate": 7.275036151601265e-06, "loss": 1.036821722984314, "loss/kd": 1.6464952230453491, "loss/lm": 0.4271482825279236, "step": 2924 }, { "epoch": 0.600492711968795, "grad_norm": 1.295750711909542, "kd_ratio": 0.5, "learning_rate": 7.268638141703168e-06, "loss": 0.9478375315666199, "loss/kd": 1.6227525472640991, "loss/lm": 0.272922545671463, "step": 2925 }, { "epoch": 0.6006980086224595, "grad_norm": 1.108566070615049, "kd_ratio": 0.5, "learning_rate": 7.262241339783534e-06, "loss": 1.3645715713500977, "loss/kd": 2.3895890712738037, "loss/lm": 0.3395540714263916, "step": 2926 }, { "epoch": 0.600903305276124, "grad_norm": 1.4588915350829905, "kd_ratio": 0.5, "learning_rate": 7.2558457486714316e-06, "loss": 0.9141712188720703, "loss/kd": 1.4603134393692017, "loss/lm": 0.36802899837493896, "step": 2927 }, { "epoch": 0.6011086019297885, "grad_norm": 1.003483963535552, "kd_ratio": 0.5, "learning_rate": 7.249451371195384e-06, "loss": 0.9702686667442322, "loss/kd": 1.567928671836853, "loss/lm": 0.37260866165161133, "step": 2928 }, { "epoch": 0.6013138985834531, "grad_norm": 1.4473161475481802, "kd_ratio": 0.5, "learning_rate": 7.24305821018339e-06, "loss": 1.1381391286849976, "loss/kd": 1.9517561197280884, "loss/lm": 0.32452210783958435, "step": 2929 }, { "epoch": 0.6015191952371176, "grad_norm": 1.2993773028740119, "kd_ratio": 0.5, "learning_rate": 7.2366662684629015e-06, "loss": 1.2841750383377075, "loss/kd": 2.17183780670166, "loss/lm": 0.3965122103691101, "step": 2930 }, { "epoch": 0.6017244918907821, "grad_norm": 1.0711315750350996, "kd_ratio": 0.5, "learning_rate": 7.230275548860833e-06, "loss": 1.3542712926864624, "loss/kd": 2.3419132232666016, "loss/lm": 0.3666292726993561, "step": 2931 }, { "epoch": 0.6019297885444467, "grad_norm": 1.3366724953279654, "kd_ratio": 0.5, "learning_rate": 7.223886054203559e-06, "loss": 1.3181688785552979, "loss/kd": 2.3257808685302734, "loss/lm": 0.31055691838264465, "step": 2932 }, { "epoch": 0.6021350851981113, "grad_norm": 1.0420129744389441, "kd_ratio": 0.5, "learning_rate": 7.217497787316909e-06, "loss": 0.8844603896141052, "loss/kd": 1.467146396636963, "loss/lm": 0.30177438259124756, "step": 2933 }, { "epoch": 0.6023403818517759, "grad_norm": 1.2562293753419278, "kd_ratio": 0.5, "learning_rate": 7.2111107510261776e-06, "loss": 0.8897281885147095, "loss/kd": 1.5126831531524658, "loss/lm": 0.26677319407463074, "step": 2934 }, { "epoch": 0.6025456785054404, "grad_norm": 1.3612650782556288, "kd_ratio": 0.5, "learning_rate": 7.2047249481561125e-06, "loss": 1.280792474746704, "loss/kd": 2.1548774242401123, "loss/lm": 0.40670761466026306, "step": 2935 }, { "epoch": 0.6027509751591049, "grad_norm": 1.0313263418832739, "kd_ratio": 0.5, "learning_rate": 7.198340381530906e-06, "loss": 1.0192257165908813, "loss/kd": 1.6748592853546143, "loss/lm": 0.3635922372341156, "step": 2936 }, { "epoch": 0.6029562718127695, "grad_norm": 1.7001736473944018, "kd_ratio": 0.5, "learning_rate": 7.1919570539742165e-06, "loss": 1.1905970573425293, "loss/kd": 2.059293508529663, "loss/lm": 0.3219006657600403, "step": 2937 }, { "epoch": 0.603161568466434, "grad_norm": 1.0080220154656017, "kd_ratio": 0.5, "learning_rate": 7.18557496830915e-06, "loss": 1.0218801498413086, "loss/kd": 1.6114249229431152, "loss/lm": 0.4323354959487915, "step": 2938 }, { "epoch": 0.6033668651200985, "grad_norm": 1.66958218986938, "kd_ratio": 0.5, "learning_rate": 7.179194127358258e-06, "loss": 0.9020969271659851, "loss/kd": 1.3981438875198364, "loss/lm": 0.4060499668121338, "step": 2939 }, { "epoch": 0.603572161773763, "grad_norm": 1.4506481061140217, "kd_ratio": 0.5, "learning_rate": 7.172814533943552e-06, "loss": 1.0118639469146729, "loss/kd": 1.6939818859100342, "loss/lm": 0.32974594831466675, "step": 2940 }, { "epoch": 0.6037774584274276, "grad_norm": 1.1177879436619513, "kd_ratio": 0.5, "learning_rate": 7.166436190886483e-06, "loss": 1.066813588142395, "loss/kd": 1.7325986623764038, "loss/lm": 0.40102842450141907, "step": 2941 }, { "epoch": 0.6039827550810922, "grad_norm": 1.852892708022727, "kd_ratio": 0.5, "learning_rate": 7.160059101007952e-06, "loss": 1.062315821647644, "loss/kd": 1.822582483291626, "loss/lm": 0.30204907059669495, "step": 2942 }, { "epoch": 0.6041880517347568, "grad_norm": 1.094265086408641, "kd_ratio": 0.5, "learning_rate": 7.153683267128304e-06, "loss": 0.9639092683792114, "loss/kd": 1.5631871223449707, "loss/lm": 0.3646313548088074, "step": 2943 }, { "epoch": 0.6043933483884213, "grad_norm": 1.1450785602642566, "kd_ratio": 0.5, "learning_rate": 7.147308692067331e-06, "loss": 1.089898943901062, "loss/kd": 1.8064773082733154, "loss/lm": 0.373320609331131, "step": 2944 }, { "epoch": 0.6045986450420858, "grad_norm": 1.1323131513268008, "kd_ratio": 0.5, "learning_rate": 7.140935378644268e-06, "loss": 0.9865920543670654, "loss/kd": 1.6372382640838623, "loss/lm": 0.33594590425491333, "step": 2945 }, { "epoch": 0.6048039416957504, "grad_norm": 1.1001858162371734, "kd_ratio": 0.5, "learning_rate": 7.134563329677794e-06, "loss": 1.285947561264038, "loss/kd": 2.2074618339538574, "loss/lm": 0.36443328857421875, "step": 2946 }, { "epoch": 0.6050092383494149, "grad_norm": 1.3082225852857132, "kd_ratio": 0.5, "learning_rate": 7.128192547986023e-06, "loss": 1.1393505334854126, "loss/kd": 1.9350838661193848, "loss/lm": 0.3436172902584076, "step": 2947 }, { "epoch": 0.6052145350030794, "grad_norm": 0.9160299931427777, "kd_ratio": 0.5, "learning_rate": 7.121823036386514e-06, "loss": 0.9262303113937378, "loss/kd": 1.464234471321106, "loss/lm": 0.388226181268692, "step": 2948 }, { "epoch": 0.605419831656744, "grad_norm": 1.6188554424370212, "kd_ratio": 0.5, "learning_rate": 7.115454797696255e-06, "loss": 1.072917103767395, "loss/kd": 1.8149218559265137, "loss/lm": 0.3309122920036316, "step": 2949 }, { "epoch": 0.6056251283104085, "grad_norm": 1.0214225066900824, "kd_ratio": 0.5, "learning_rate": 7.109087834731689e-06, "loss": 0.9656181335449219, "loss/kd": 1.543606162071228, "loss/lm": 0.3876301646232605, "step": 2950 }, { "epoch": 0.6058304249640731, "grad_norm": 1.0566680870723113, "kd_ratio": 0.5, "learning_rate": 7.102722150308678e-06, "loss": 1.077956199645996, "loss/kd": 1.756615400314331, "loss/lm": 0.3992971181869507, "step": 2951 }, { "epoch": 0.6060357216177377, "grad_norm": 1.1830219103963906, "kd_ratio": 0.5, "learning_rate": 7.096357747242526e-06, "loss": 2.0456318855285645, "loss/kd": 3.7836520671844482, "loss/lm": 0.30761173367500305, "step": 2952 }, { "epoch": 0.6062410182714022, "grad_norm": 0.9440044887010174, "kd_ratio": 0.5, "learning_rate": 7.089994628347965e-06, "loss": 0.9899669885635376, "loss/kd": 1.652921199798584, "loss/lm": 0.3270127475261688, "step": 2953 }, { "epoch": 0.6064463149250667, "grad_norm": 1.1132943232229344, "kd_ratio": 0.5, "learning_rate": 7.083632796439164e-06, "loss": 1.0593900680541992, "loss/kd": 1.7241886854171753, "loss/lm": 0.39459148049354553, "step": 2954 }, { "epoch": 0.6066516115787313, "grad_norm": 0.9327787324931749, "kd_ratio": 0.5, "learning_rate": 7.077272254329726e-06, "loss": 0.9581760764122009, "loss/kd": 1.5355172157287598, "loss/lm": 0.3808349668979645, "step": 2955 }, { "epoch": 0.6068569082323958, "grad_norm": 0.9507873510955642, "kd_ratio": 0.5, "learning_rate": 7.070913004832675e-06, "loss": 1.244406819343567, "loss/kd": 2.143639326095581, "loss/lm": 0.34517428278923035, "step": 2956 }, { "epoch": 0.6070622048860603, "grad_norm": 0.9934148089379402, "kd_ratio": 0.5, "learning_rate": 7.064555050760472e-06, "loss": 1.0553749799728394, "loss/kd": 1.7623182535171509, "loss/lm": 0.34843161702156067, "step": 2957 }, { "epoch": 0.6072675015397249, "grad_norm": 1.0027305129868003, "kd_ratio": 0.5, "learning_rate": 7.058198394924997e-06, "loss": 1.0931676626205444, "loss/kd": 1.8616259098052979, "loss/lm": 0.3247095048427582, "step": 2958 }, { "epoch": 0.6074727981933894, "grad_norm": 1.104039146937297, "kd_ratio": 0.5, "learning_rate": 7.051843040137558e-06, "loss": 1.074076533317566, "loss/kd": 1.7578282356262207, "loss/lm": 0.3903248608112335, "step": 2959 }, { "epoch": 0.607678094847054, "grad_norm": 0.8877112630709457, "kd_ratio": 0.5, "learning_rate": 7.045488989208889e-06, "loss": 0.9064149260520935, "loss/kd": 1.5291141271591187, "loss/lm": 0.28371572494506836, "step": 2960 }, { "epoch": 0.6078833915007186, "grad_norm": 1.1856928290798312, "kd_ratio": 0.5, "learning_rate": 7.039136244949153e-06, "loss": 1.0548279285430908, "loss/kd": 1.7712219953536987, "loss/lm": 0.33843374252319336, "step": 2961 }, { "epoch": 0.6080886881543831, "grad_norm": 0.9435334754274667, "kd_ratio": 0.5, "learning_rate": 7.032784810167923e-06, "loss": 1.014493465423584, "loss/kd": 1.6260854005813599, "loss/lm": 0.40290164947509766, "step": 2962 }, { "epoch": 0.6082939848080476, "grad_norm": 1.1611802853668292, "kd_ratio": 0.5, "learning_rate": 7.026434687674204e-06, "loss": 0.9226101636886597, "loss/kd": 1.5068013668060303, "loss/lm": 0.3384189009666443, "step": 2963 }, { "epoch": 0.6084992814617122, "grad_norm": 0.9035212786482201, "kd_ratio": 0.5, "learning_rate": 7.020085880276415e-06, "loss": 1.125415563583374, "loss/kd": 1.8267236948013306, "loss/lm": 0.42410752177238464, "step": 2964 }, { "epoch": 0.6087045781153767, "grad_norm": 1.090675781363257, "kd_ratio": 0.5, "learning_rate": 7.01373839078239e-06, "loss": 1.0359236001968384, "loss/kd": 1.7673641443252563, "loss/lm": 0.3044830858707428, "step": 2965 }, { "epoch": 0.6089098747690412, "grad_norm": 1.0223655012148622, "kd_ratio": 0.5, "learning_rate": 7.007392221999391e-06, "loss": 0.9933993816375732, "loss/kd": 1.697087287902832, "loss/lm": 0.28971150517463684, "step": 2966 }, { "epoch": 0.6091151714227058, "grad_norm": 1.0679945379406126, "kd_ratio": 0.5, "learning_rate": 7.001047376734087e-06, "loss": 0.9309388995170593, "loss/kd": 1.5410734415054321, "loss/lm": 0.32080432772636414, "step": 2967 }, { "epoch": 0.6093204680763703, "grad_norm": 1.2414539208030797, "kd_ratio": 0.5, "learning_rate": 6.994703857792562e-06, "loss": 0.9513878226280212, "loss/kd": 1.5295439958572388, "loss/lm": 0.3732316493988037, "step": 2968 }, { "epoch": 0.609525764730035, "grad_norm": 0.9970907156573653, "kd_ratio": 0.5, "learning_rate": 6.988361667980319e-06, "loss": 1.512908935546875, "loss/kd": 2.6757233142852783, "loss/lm": 0.35009443759918213, "step": 2969 }, { "epoch": 0.6097310613836995, "grad_norm": 1.106963992566946, "kd_ratio": 0.5, "learning_rate": 6.982020810102262e-06, "loss": 1.0558903217315674, "loss/kd": 1.7529700994491577, "loss/lm": 0.3588104248046875, "step": 2970 }, { "epoch": 0.609936358037364, "grad_norm": 1.195730610413327, "kd_ratio": 0.5, "learning_rate": 6.975681286962724e-06, "loss": 1.079241156578064, "loss/kd": 1.8095002174377441, "loss/lm": 0.3489820957183838, "step": 2971 }, { "epoch": 0.6101416546910285, "grad_norm": 1.1459751712644715, "kd_ratio": 0.5, "learning_rate": 6.969343101365431e-06, "loss": 0.88456130027771, "loss/kd": 1.4425435066223145, "loss/lm": 0.3265790641307831, "step": 2972 }, { "epoch": 0.6103469513446931, "grad_norm": 1.2658772542257346, "kd_ratio": 0.5, "learning_rate": 6.963006256113527e-06, "loss": 1.0265114307403564, "loss/kd": 1.642665147781372, "loss/lm": 0.41035762429237366, "step": 2973 }, { "epoch": 0.6105522479983576, "grad_norm": 1.2078349232278902, "kd_ratio": 0.5, "learning_rate": 6.956670754009558e-06, "loss": 1.2674518823623657, "loss/kd": 2.143780469894409, "loss/lm": 0.3911232650279999, "step": 2974 }, { "epoch": 0.6107575446520221, "grad_norm": 1.333976960958085, "kd_ratio": 0.5, "learning_rate": 6.9503365978554735e-06, "loss": 0.9714632630348206, "loss/kd": 1.614109754562378, "loss/lm": 0.32881680130958557, "step": 2975 }, { "epoch": 0.6109628413056867, "grad_norm": 1.364294347770456, "kd_ratio": 0.5, "learning_rate": 6.944003790452641e-06, "loss": 1.0202641487121582, "loss/kd": 1.7458698749542236, "loss/lm": 0.2946583926677704, "step": 2976 }, { "epoch": 0.6111681379593512, "grad_norm": 1.1678923337139968, "kd_ratio": 0.5, "learning_rate": 6.937672334601819e-06, "loss": 1.0276463031768799, "loss/kd": 1.7540580034255981, "loss/lm": 0.30123451352119446, "step": 2977 }, { "epoch": 0.6113734346130159, "grad_norm": 1.5825495532399896, "kd_ratio": 0.5, "learning_rate": 6.931342233103171e-06, "loss": 1.0983421802520752, "loss/kd": 1.8327369689941406, "loss/lm": 0.36394745111465454, "step": 2978 }, { "epoch": 0.6115787312666804, "grad_norm": 1.3821246866425987, "kd_ratio": 0.5, "learning_rate": 6.925013488756264e-06, "loss": 1.0154407024383545, "loss/kd": 1.6412206888198853, "loss/lm": 0.38966068625450134, "step": 2979 }, { "epoch": 0.6117840279203449, "grad_norm": 1.1574334675344833, "kd_ratio": 0.5, "learning_rate": 6.9186861043600575e-06, "loss": 1.1646941900253296, "loss/kd": 2.01444673538208, "loss/lm": 0.3149417042732239, "step": 2980 }, { "epoch": 0.6119893245740095, "grad_norm": 1.5051258701506807, "kd_ratio": 0.5, "learning_rate": 6.912360082712923e-06, "loss": 0.9854162931442261, "loss/kd": 1.5938745737075806, "loss/lm": 0.37695807218551636, "step": 2981 }, { "epoch": 0.612194621227674, "grad_norm": 1.526259526548499, "kd_ratio": 0.5, "learning_rate": 6.906035426612617e-06, "loss": 0.9319056272506714, "loss/kd": 1.4843848943710327, "loss/lm": 0.37942636013031006, "step": 2982 }, { "epoch": 0.6123999178813385, "grad_norm": 1.5426310483668517, "kd_ratio": 0.5, "learning_rate": 6.8997121388563e-06, "loss": 0.9252052903175354, "loss/kd": 1.4961211681365967, "loss/lm": 0.3542894124984741, "step": 2983 }, { "epoch": 0.612605214535003, "grad_norm": 1.6527500957399766, "kd_ratio": 0.5, "learning_rate": 6.893390222240522e-06, "loss": 0.9259132146835327, "loss/kd": 1.5610774755477905, "loss/lm": 0.2907489240169525, "step": 2984 }, { "epoch": 0.6128105111886676, "grad_norm": 2.104746129831937, "kd_ratio": 0.5, "learning_rate": 6.887069679561229e-06, "loss": 0.9841333627700806, "loss/kd": 1.5568300485610962, "loss/lm": 0.41143670678138733, "step": 2985 }, { "epoch": 0.6130158078423321, "grad_norm": 1.9884199089587185, "kd_ratio": 0.5, "learning_rate": 6.880750513613754e-06, "loss": 1.171069860458374, "loss/kd": 1.9268022775650024, "loss/lm": 0.41533735394477844, "step": 2986 }, { "epoch": 0.6132211044959968, "grad_norm": 1.538079135958607, "kd_ratio": 0.5, "learning_rate": 6.874432727192837e-06, "loss": 0.993901789188385, "loss/kd": 1.673071026802063, "loss/lm": 0.31473255157470703, "step": 2987 }, { "epoch": 0.6134264011496613, "grad_norm": 2.018734861863792, "kd_ratio": 0.5, "learning_rate": 6.868116323092589e-06, "loss": 1.0735499858856201, "loss/kd": 1.6633914709091187, "loss/lm": 0.4837084710597992, "step": 2988 }, { "epoch": 0.6136316978033258, "grad_norm": 1.239169395042328, "kd_ratio": 0.5, "learning_rate": 6.861801304106517e-06, "loss": 1.059387445449829, "loss/kd": 1.7188715934753418, "loss/lm": 0.399903267621994, "step": 2989 }, { "epoch": 0.6138369944569904, "grad_norm": 2.1177550023697886, "kd_ratio": 0.5, "learning_rate": 6.855487673027519e-06, "loss": 1.1054526567459106, "loss/kd": 1.9174699783325195, "loss/lm": 0.29343539476394653, "step": 2990 }, { "epoch": 0.6140422911106549, "grad_norm": 1.2282365888516067, "kd_ratio": 0.5, "learning_rate": 6.849175432647875e-06, "loss": 1.0744093656539917, "loss/kd": 1.774906873703003, "loss/lm": 0.37391191720962524, "step": 2991 }, { "epoch": 0.6142475877643194, "grad_norm": 1.7189719337138942, "kd_ratio": 0.5, "learning_rate": 6.842864585759252e-06, "loss": 0.8222957253456116, "loss/kd": 1.2999553680419922, "loss/lm": 0.34463608264923096, "step": 2992 }, { "epoch": 0.614452884417984, "grad_norm": 1.8742931213971994, "kd_ratio": 0.5, "learning_rate": 6.8365551351527e-06, "loss": 1.012608289718628, "loss/kd": 1.6681005954742432, "loss/lm": 0.35711604356765747, "step": 2993 }, { "epoch": 0.6146581810716485, "grad_norm": 1.0080131639875152, "kd_ratio": 0.5, "learning_rate": 6.830247083618653e-06, "loss": 2.0634992122650146, "loss/kd": 3.869211435317993, "loss/lm": 0.25778698921203613, "step": 2994 }, { "epoch": 0.614863477725313, "grad_norm": 1.83911650349346, "kd_ratio": 0.5, "learning_rate": 6.823940433946921e-06, "loss": 1.1245520114898682, "loss/kd": 1.8725800514221191, "loss/lm": 0.3765239417552948, "step": 2995 }, { "epoch": 0.6150687743789777, "grad_norm": 1.0202798940375508, "kd_ratio": 0.5, "learning_rate": 6.8176351889266955e-06, "loss": 1.0781100988388062, "loss/kd": 1.8238437175750732, "loss/lm": 0.33237650990486145, "step": 2996 }, { "epoch": 0.6152740710326422, "grad_norm": 2.342527644659263, "kd_ratio": 0.5, "learning_rate": 6.811331351346556e-06, "loss": 1.0620779991149902, "loss/kd": 1.7178748846054077, "loss/lm": 0.406281054019928, "step": 2997 }, { "epoch": 0.6154793676863067, "grad_norm": 1.5400703483826037, "kd_ratio": 0.5, "learning_rate": 6.80502892399445e-06, "loss": 1.058282732963562, "loss/kd": 1.7032973766326904, "loss/lm": 0.4132680296897888, "step": 2998 }, { "epoch": 0.6156846643399713, "grad_norm": 2.109766203253428, "kd_ratio": 0.5, "learning_rate": 6.798727909657698e-06, "loss": 0.9652786254882812, "loss/kd": 1.538801670074463, "loss/lm": 0.3917556405067444, "step": 2999 }, { "epoch": 0.6158899609936358, "grad_norm": 1.433909420974654, "kd_ratio": 0.5, "learning_rate": 6.79242831112301e-06, "loss": 0.9147164821624756, "loss/kd": 1.5230190753936768, "loss/lm": 0.3064139187335968, "step": 3000 }, { "epoch": 0.6160952576473003, "grad_norm": 1.7915571931771155, "kd_ratio": 0.5, "learning_rate": 6.786130131176452e-06, "loss": 0.916709840297699, "loss/kd": 1.5015387535095215, "loss/lm": 0.33188092708587646, "step": 3001 }, { "epoch": 0.6163005543009649, "grad_norm": 1.960202944520138, "kd_ratio": 0.5, "learning_rate": 6.7798333726034816e-06, "loss": 1.0170339345932007, "loss/kd": 1.6910730600357056, "loss/lm": 0.34299489855766296, "step": 3002 }, { "epoch": 0.6165058509546294, "grad_norm": 1.1756403604327028, "kd_ratio": 0.5, "learning_rate": 6.773538038188912e-06, "loss": 1.1456551551818848, "loss/kd": 1.8975437879562378, "loss/lm": 0.3937665820121765, "step": 3003 }, { "epoch": 0.6167111476082939, "grad_norm": 2.2788859255945924, "kd_ratio": 0.5, "learning_rate": 6.767244130716934e-06, "loss": 1.317294955253601, "loss/kd": 2.254187822341919, "loss/lm": 0.38040217757225037, "step": 3004 }, { "epoch": 0.6169164442619586, "grad_norm": 0.8986568401361279, "kd_ratio": 0.5, "learning_rate": 6.7609516529711035e-06, "loss": 0.9862563610076904, "loss/kd": 1.647123098373413, "loss/lm": 0.32538968324661255, "step": 3005 }, { "epoch": 0.6171217409156231, "grad_norm": 2.1674523281659726, "kd_ratio": 0.5, "learning_rate": 6.754660607734347e-06, "loss": 0.8398056030273438, "loss/kd": 1.3738844394683838, "loss/lm": 0.3057267963886261, "step": 3006 }, { "epoch": 0.6173270375692876, "grad_norm": 1.2898484464894637, "kd_ratio": 0.5, "learning_rate": 6.74837099778896e-06, "loss": 0.9570367932319641, "loss/kd": 1.551700234413147, "loss/lm": 0.36237338185310364, "step": 3007 }, { "epoch": 0.6175323342229522, "grad_norm": 1.5574694022937172, "kd_ratio": 0.5, "learning_rate": 6.742082825916599e-06, "loss": 0.9149680137634277, "loss/kd": 1.4750885963439941, "loss/lm": 0.35484740138053894, "step": 3008 }, { "epoch": 0.6177376308766167, "grad_norm": 1.855099568033288, "kd_ratio": 0.5, "learning_rate": 6.735796094898282e-06, "loss": 1.9823076725006104, "loss/kd": 3.643401861190796, "loss/lm": 0.32121360301971436, "step": 3009 }, { "epoch": 0.6179429275302812, "grad_norm": 1.1597542408523842, "kd_ratio": 0.5, "learning_rate": 6.7295108075143985e-06, "loss": 1.061306118965149, "loss/kd": 1.768818736076355, "loss/lm": 0.35379353165626526, "step": 3010 }, { "epoch": 0.6181482241839458, "grad_norm": 1.215389741278427, "kd_ratio": 0.5, "learning_rate": 6.723226966544691e-06, "loss": 1.0482176542282104, "loss/kd": 1.7914857864379883, "loss/lm": 0.30494943261146545, "step": 3011 }, { "epoch": 0.6183535208376103, "grad_norm": 1.090105537210132, "kd_ratio": 0.5, "learning_rate": 6.716944574768264e-06, "loss": 1.175477147102356, "loss/kd": 1.9527467489242554, "loss/lm": 0.39820748567581177, "step": 3012 }, { "epoch": 0.6185588174912748, "grad_norm": 1.71998511171229, "kd_ratio": 0.5, "learning_rate": 6.710663634963588e-06, "loss": 1.2393274307250977, "loss/kd": 2.1904022693634033, "loss/lm": 0.28825271129608154, "step": 3013 }, { "epoch": 0.6187641141449395, "grad_norm": 1.0957097386594679, "kd_ratio": 0.5, "learning_rate": 6.704384149908484e-06, "loss": 1.120989203453064, "loss/kd": 1.900822401046753, "loss/lm": 0.3411559760570526, "step": 3014 }, { "epoch": 0.618969410798604, "grad_norm": 1.548869865062486, "kd_ratio": 0.5, "learning_rate": 6.69810612238013e-06, "loss": 0.9210492372512817, "loss/kd": 1.5154584646224976, "loss/lm": 0.3266400098800659, "step": 3015 }, { "epoch": 0.6191747074522685, "grad_norm": 1.090304427531517, "kd_ratio": 0.5, "learning_rate": 6.69182955515506e-06, "loss": 1.8887659311294556, "loss/kd": 3.519365072250366, "loss/lm": 0.25816673040390015, "step": 3016 }, { "epoch": 0.6193800041059331, "grad_norm": 1.3815400928898378, "kd_ratio": 0.5, "learning_rate": 6.685554451009167e-06, "loss": 0.8971444964408875, "loss/kd": 1.4978854656219482, "loss/lm": 0.2964034974575043, "step": 3017 }, { "epoch": 0.6195853007595976, "grad_norm": 1.2850145279114709, "kd_ratio": 0.5, "learning_rate": 6.67928081271769e-06, "loss": 0.9460369348526001, "loss/kd": 1.520902395248413, "loss/lm": 0.3711714446544647, "step": 3018 }, { "epoch": 0.6197905974132621, "grad_norm": 1.0938069759363038, "kd_ratio": 0.5, "learning_rate": 6.673008643055228e-06, "loss": 1.050049066543579, "loss/kd": 1.7100268602371216, "loss/lm": 0.3900712728500366, "step": 3019 }, { "epoch": 0.6199958940669267, "grad_norm": 1.534999577553302, "kd_ratio": 0.5, "learning_rate": 6.66673794479572e-06, "loss": 1.8908536434173584, "loss/kd": 3.49299955368042, "loss/lm": 0.2887076139450073, "step": 3020 }, { "epoch": 0.6202011907205912, "grad_norm": 0.986493304730834, "kd_ratio": 0.5, "learning_rate": 6.660468720712463e-06, "loss": 1.0200693607330322, "loss/kd": 1.7316375970840454, "loss/lm": 0.30850106477737427, "step": 3021 }, { "epoch": 0.6204064873742559, "grad_norm": 1.5527400095402677, "kd_ratio": 0.5, "learning_rate": 6.654200973578091e-06, "loss": 0.9932048916816711, "loss/kd": 1.6150078773498535, "loss/lm": 0.37140193581581116, "step": 3022 }, { "epoch": 0.6206117840279204, "grad_norm": 1.3013637577434727, "kd_ratio": 0.5, "learning_rate": 6.6479347061646046e-06, "loss": 0.8557647466659546, "loss/kd": 1.3613190650939941, "loss/lm": 0.35021042823791504, "step": 3023 }, { "epoch": 0.6208170806815849, "grad_norm": 1.0260753981896875, "kd_ratio": 0.5, "learning_rate": 6.64166992124333e-06, "loss": 1.2059820890426636, "loss/kd": 2.085554599761963, "loss/lm": 0.3264094889163971, "step": 3024 }, { "epoch": 0.6210223773352495, "grad_norm": 1.2078325075975302, "kd_ratio": 0.5, "learning_rate": 6.63540662158495e-06, "loss": 1.887533187866211, "loss/kd": 3.478484869003296, "loss/lm": 0.29658159613609314, "step": 3025 }, { "epoch": 0.621227673988914, "grad_norm": 0.8713127147576283, "kd_ratio": 0.5, "learning_rate": 6.629144809959479e-06, "loss": 1.043654441833496, "loss/kd": 1.7043912410736084, "loss/lm": 0.382917582988739, "step": 3026 }, { "epoch": 0.6214329706425785, "grad_norm": 1.295045150339674, "kd_ratio": 0.5, "learning_rate": 6.622884489136286e-06, "loss": 0.9603475332260132, "loss/kd": 1.5847316980361938, "loss/lm": 0.3359633982181549, "step": 3027 }, { "epoch": 0.621638267296243, "grad_norm": 0.8726678386133543, "kd_ratio": 0.5, "learning_rate": 6.616625661884073e-06, "loss": 1.1856359243392944, "loss/kd": 2.0438034534454346, "loss/lm": 0.3274684548377991, "step": 3028 }, { "epoch": 0.6218435639499076, "grad_norm": 1.20632596700647, "kd_ratio": 0.5, "learning_rate": 6.610368330970889e-06, "loss": 1.0586163997650146, "loss/kd": 1.7672972679138184, "loss/lm": 0.3499356210231781, "step": 3029 }, { "epoch": 0.6220488606035721, "grad_norm": 1.1738106703065054, "kd_ratio": 0.5, "learning_rate": 6.604112499164108e-06, "loss": 1.09989595413208, "loss/kd": 1.8244097232818604, "loss/lm": 0.37538227438926697, "step": 3030 }, { "epoch": 0.6222541572572368, "grad_norm": 1.0458000638558511, "kd_ratio": 0.5, "learning_rate": 6.597858169230454e-06, "loss": 1.1342494487762451, "loss/kd": 1.9681763648986816, "loss/lm": 0.3003224730491638, "step": 3031 }, { "epoch": 0.6224594539109013, "grad_norm": 0.975080250164222, "kd_ratio": 0.5, "learning_rate": 6.591605343935976e-06, "loss": 1.1854465007781982, "loss/kd": 2.0245516300201416, "loss/lm": 0.3463413715362549, "step": 3032 }, { "epoch": 0.6226647505645658, "grad_norm": 1.0503512316268293, "kd_ratio": 0.5, "learning_rate": 6.585354026046069e-06, "loss": 1.0243076086044312, "loss/kd": 1.6798962354660034, "loss/lm": 0.36871907114982605, "step": 3033 }, { "epoch": 0.6228700472182304, "grad_norm": 1.1032126775042344, "kd_ratio": 0.5, "learning_rate": 6.579104218325455e-06, "loss": 1.1291354894638062, "loss/kd": 1.865339756011963, "loss/lm": 0.39293113350868225, "step": 3034 }, { "epoch": 0.6230753438718949, "grad_norm": 0.9632354887699224, "kd_ratio": 0.5, "learning_rate": 6.572855923538186e-06, "loss": 1.2446553707122803, "loss/kd": 2.1193504333496094, "loss/lm": 0.3699601888656616, "step": 3035 }, { "epoch": 0.6232806405255594, "grad_norm": 1.1633877664012318, "kd_ratio": 0.5, "learning_rate": 6.566609144447647e-06, "loss": 1.1144626140594482, "loss/kd": 1.9598612785339355, "loss/lm": 0.2690640687942505, "step": 3036 }, { "epoch": 0.623485937179224, "grad_norm": 1.4264445431421386, "kd_ratio": 0.5, "learning_rate": 6.560363883816554e-06, "loss": 1.109889030456543, "loss/kd": 1.811347484588623, "loss/lm": 0.4084305465221405, "step": 3037 }, { "epoch": 0.6236912338328885, "grad_norm": 1.3188100622929186, "kd_ratio": 0.5, "learning_rate": 6.554120144406948e-06, "loss": 0.8205528855323792, "loss/kd": 1.2865506410598755, "loss/lm": 0.3545551002025604, "step": 3038 }, { "epoch": 0.623896530486553, "grad_norm": 1.0250527888209593, "kd_ratio": 0.5, "learning_rate": 6.547877928980206e-06, "loss": 1.3284192085266113, "loss/kd": 2.256803035736084, "loss/lm": 0.4000355005264282, "step": 3039 }, { "epoch": 0.6241018271402177, "grad_norm": 1.9140222383730388, "kd_ratio": 0.5, "learning_rate": 6.541637240297022e-06, "loss": 1.1898415088653564, "loss/kd": 2.064934015274048, "loss/lm": 0.3147491216659546, "step": 3040 }, { "epoch": 0.6243071237938822, "grad_norm": 1.0046544506817345, "kd_ratio": 0.5, "learning_rate": 6.535398081117417e-06, "loss": 1.1471081972122192, "loss/kd": 1.9008193016052246, "loss/lm": 0.3933970630168915, "step": 3041 }, { "epoch": 0.6245124204475467, "grad_norm": 1.8419321143633922, "kd_ratio": 0.5, "learning_rate": 6.529160454200733e-06, "loss": 0.963049054145813, "loss/kd": 1.5763128995895386, "loss/lm": 0.3497852385044098, "step": 3042 }, { "epoch": 0.6247177171012113, "grad_norm": 1.0751530288927318, "kd_ratio": 0.5, "learning_rate": 6.522924362305639e-06, "loss": 1.1428004503250122, "loss/kd": 1.9174675941467285, "loss/lm": 0.3681332767009735, "step": 3043 }, { "epoch": 0.6249230137548758, "grad_norm": 1.4532311580263997, "kd_ratio": 0.5, "learning_rate": 6.5166898081901285e-06, "loss": 0.8683341145515442, "loss/kd": 1.4285893440246582, "loss/lm": 0.3080788552761078, "step": 3044 }, { "epoch": 0.6251283104085403, "grad_norm": 1.3496262552243952, "kd_ratio": 0.5, "learning_rate": 6.510456794611504e-06, "loss": 1.020559549331665, "loss/kd": 1.7073973417282104, "loss/lm": 0.3337218761444092, "step": 3045 }, { "epoch": 0.6253336070622049, "grad_norm": 0.9864208504944222, "kd_ratio": 0.5, "learning_rate": 6.504225324326397e-06, "loss": 0.9098052978515625, "loss/kd": 1.4405035972595215, "loss/lm": 0.3791069984436035, "step": 3046 }, { "epoch": 0.6255389037158694, "grad_norm": 1.3175395128367784, "kd_ratio": 0.5, "learning_rate": 6.497995400090748e-06, "loss": 1.082705020904541, "loss/kd": 1.7778966426849365, "loss/lm": 0.38751348853111267, "step": 3047 }, { "epoch": 0.6257442003695339, "grad_norm": 1.0878073456664505, "kd_ratio": 0.5, "learning_rate": 6.491767024659818e-06, "loss": 1.9405593872070312, "loss/kd": 3.5856871604919434, "loss/lm": 0.2954317033290863, "step": 3048 }, { "epoch": 0.6259494970231986, "grad_norm": 1.1406507265093648, "kd_ratio": 0.5, "learning_rate": 6.485540200788189e-06, "loss": 0.9886111617088318, "loss/kd": 1.701406478881836, "loss/lm": 0.27581581473350525, "step": 3049 }, { "epoch": 0.6261547936768631, "grad_norm": 1.3632419461090337, "kd_ratio": 0.5, "learning_rate": 6.479314931229746e-06, "loss": 1.1057937145233154, "loss/kd": 1.8720616102218628, "loss/lm": 0.3395257592201233, "step": 3050 }, { "epoch": 0.6263600903305276, "grad_norm": 1.0239544270560126, "kd_ratio": 0.5, "learning_rate": 6.4730912187376895e-06, "loss": 1.0670874118804932, "loss/kd": 1.8150447607040405, "loss/lm": 0.31913015246391296, "step": 3051 }, { "epoch": 0.6265653869841922, "grad_norm": 1.332922192848504, "kd_ratio": 0.5, "learning_rate": 6.466869066064536e-06, "loss": 0.9542100429534912, "loss/kd": 1.582831621170044, "loss/lm": 0.32558852434158325, "step": 3052 }, { "epoch": 0.6267706836378567, "grad_norm": 1.1394369052319115, "kd_ratio": 0.5, "learning_rate": 6.460648475962104e-06, "loss": 0.9262582659721375, "loss/kd": 1.5460693836212158, "loss/lm": 0.3064471483230591, "step": 3053 }, { "epoch": 0.6269759802915212, "grad_norm": 1.6525530178102497, "kd_ratio": 0.5, "learning_rate": 6.454429451181537e-06, "loss": 0.9489462971687317, "loss/kd": 1.552484154701233, "loss/lm": 0.34540843963623047, "step": 3054 }, { "epoch": 0.6271812769451858, "grad_norm": 1.0889908457798414, "kd_ratio": 0.5, "learning_rate": 6.448211994473263e-06, "loss": 1.065934658050537, "loss/kd": 1.7924429178237915, "loss/lm": 0.33942651748657227, "step": 3055 }, { "epoch": 0.6273865735988503, "grad_norm": 1.115395403788123, "kd_ratio": 0.5, "learning_rate": 6.44199610858704e-06, "loss": 1.1705671548843384, "loss/kd": 1.995280385017395, "loss/lm": 0.3458540141582489, "step": 3056 }, { "epoch": 0.6275918702525148, "grad_norm": 1.3075699918391372, "kd_ratio": 0.5, "learning_rate": 6.4357817962719136e-06, "loss": 1.135810136795044, "loss/kd": 1.8636186122894287, "loss/lm": 0.408001571893692, "step": 3057 }, { "epoch": 0.6277971669061795, "grad_norm": 0.9487755995914363, "kd_ratio": 0.5, "learning_rate": 6.429569060276237e-06, "loss": 0.9569905400276184, "loss/kd": 1.5345724821090698, "loss/lm": 0.379408597946167, "step": 3058 }, { "epoch": 0.628002463559844, "grad_norm": 1.0323999280721203, "kd_ratio": 0.5, "learning_rate": 6.42335790334768e-06, "loss": 0.8318237066268921, "loss/kd": 1.3297626972198486, "loss/lm": 0.33388468623161316, "step": 3059 }, { "epoch": 0.6282077602135085, "grad_norm": 1.0506343543133536, "kd_ratio": 0.5, "learning_rate": 6.417148328233195e-06, "loss": 1.1216145753860474, "loss/kd": 1.9232441186904907, "loss/lm": 0.319985032081604, "step": 3060 }, { "epoch": 0.6284130568671731, "grad_norm": 0.9016682583756167, "kd_ratio": 0.5, "learning_rate": 6.410940337679046e-06, "loss": 1.014968752861023, "loss/kd": 1.7107837200164795, "loss/lm": 0.31915369629859924, "step": 3061 }, { "epoch": 0.6286183535208376, "grad_norm": 0.9333819868636427, "kd_ratio": 0.5, "learning_rate": 6.404733934430791e-06, "loss": 1.8191442489624023, "loss/kd": 3.3124051094055176, "loss/lm": 0.3258834183216095, "step": 3062 }, { "epoch": 0.6288236501745021, "grad_norm": 1.265599012485599, "kd_ratio": 0.5, "learning_rate": 6.398529121233291e-06, "loss": 1.1429089307785034, "loss/kd": 1.9350574016571045, "loss/lm": 0.35076043009757996, "step": 3063 }, { "epoch": 0.6290289468281667, "grad_norm": 0.8971974892546803, "kd_ratio": 0.5, "learning_rate": 6.392325900830698e-06, "loss": 1.0631744861602783, "loss/kd": 1.7704617977142334, "loss/lm": 0.355887234210968, "step": 3064 }, { "epoch": 0.6292342434818312, "grad_norm": 1.0460689409997241, "kd_ratio": 0.5, "learning_rate": 6.386124275966467e-06, "loss": 0.8774707913398743, "loss/kd": 1.4651191234588623, "loss/lm": 0.28982242941856384, "step": 3065 }, { "epoch": 0.6294395401354957, "grad_norm": 0.960761947540745, "kd_ratio": 0.5, "learning_rate": 6.3799242493833416e-06, "loss": 1.0512807369232178, "loss/kd": 1.7767789363861084, "loss/lm": 0.3257824182510376, "step": 3066 }, { "epoch": 0.6296448367891604, "grad_norm": 0.850291980735089, "kd_ratio": 0.5, "learning_rate": 6.373725823823359e-06, "loss": 1.221897840499878, "loss/kd": 2.109715700149536, "loss/lm": 0.3340800404548645, "step": 3067 }, { "epoch": 0.6298501334428249, "grad_norm": 0.9464364880703031, "kd_ratio": 0.5, "learning_rate": 6.36752900202785e-06, "loss": 1.8712916374206543, "loss/kd": 3.3878276348114014, "loss/lm": 0.35475561022758484, "step": 3068 }, { "epoch": 0.6300554300964895, "grad_norm": 0.9473654625982345, "kd_ratio": 0.5, "learning_rate": 6.361333786737431e-06, "loss": 0.959246039390564, "loss/kd": 1.5550023317337036, "loss/lm": 0.3634898066520691, "step": 3069 }, { "epoch": 0.630260726750154, "grad_norm": 0.8669914797810649, "kd_ratio": 0.5, "learning_rate": 6.355140180692022e-06, "loss": 1.2669063806533813, "loss/kd": 2.1457295417785645, "loss/lm": 0.38808321952819824, "step": 3070 }, { "epoch": 0.6304660234038185, "grad_norm": 1.0090738688976366, "kd_ratio": 0.5, "learning_rate": 6.348948186630815e-06, "loss": 1.0720425844192505, "loss/kd": 1.787227749824524, "loss/lm": 0.3568574786186218, "step": 3071 }, { "epoch": 0.630671320057483, "grad_norm": 0.9064968736407503, "kd_ratio": 0.5, "learning_rate": 6.3427578072922946e-06, "loss": 1.1347918510437012, "loss/kd": 1.9559050798416138, "loss/lm": 0.31367865204811096, "step": 3072 }, { "epoch": 0.6308766167111476, "grad_norm": 1.0522609058309025, "kd_ratio": 0.5, "learning_rate": 6.336569045414238e-06, "loss": 0.9802418947219849, "loss/kd": 1.6466410160064697, "loss/lm": 0.3138427436351776, "step": 3073 }, { "epoch": 0.6310819133648121, "grad_norm": 0.9058635899838806, "kd_ratio": 0.5, "learning_rate": 6.330381903733694e-06, "loss": 1.0044225454330444, "loss/kd": 1.643584966659546, "loss/lm": 0.36526015400886536, "step": 3074 }, { "epoch": 0.6312872100184767, "grad_norm": 0.9849780445434401, "kd_ratio": 0.5, "learning_rate": 6.324196384987009e-06, "loss": 1.0176713466644287, "loss/kd": 1.7010170221328735, "loss/lm": 0.3343256115913391, "step": 3075 }, { "epoch": 0.6314925066721413, "grad_norm": 0.8917781124517975, "kd_ratio": 0.5, "learning_rate": 6.318012491909804e-06, "loss": 1.024160385131836, "loss/kd": 1.6963562965393066, "loss/lm": 0.35196444392204285, "step": 3076 }, { "epoch": 0.6316978033258058, "grad_norm": 1.0151408890530436, "kd_ratio": 0.5, "learning_rate": 6.31183022723698e-06, "loss": 0.8046847581863403, "loss/kd": 1.269161581993103, "loss/lm": 0.3402079641819, "step": 3077 }, { "epoch": 0.6319030999794704, "grad_norm": 0.9614819538499201, "kd_ratio": 0.5, "learning_rate": 6.305649593702721e-06, "loss": 1.121521234512329, "loss/kd": 1.867368221282959, "loss/lm": 0.3756742775440216, "step": 3078 }, { "epoch": 0.6321083966331349, "grad_norm": 1.0843188834402633, "kd_ratio": 0.5, "learning_rate": 6.2994705940404825e-06, "loss": 0.8890889883041382, "loss/kd": 1.5111132860183716, "loss/lm": 0.2670647203922272, "step": 3079 }, { "epoch": 0.6323136932867994, "grad_norm": 1.0260803480980438, "kd_ratio": 0.5, "learning_rate": 6.293293230983013e-06, "loss": 0.9930910468101501, "loss/kd": 1.671043872833252, "loss/lm": 0.31513822078704834, "step": 3080 }, { "epoch": 0.632518989940464, "grad_norm": 1.0200548675516532, "kd_ratio": 0.5, "learning_rate": 6.2871175072623214e-06, "loss": 1.0417819023132324, "loss/kd": 1.7654838562011719, "loss/lm": 0.31808000802993774, "step": 3081 }, { "epoch": 0.6327242865941285, "grad_norm": 0.9697854918153914, "kd_ratio": 0.5, "learning_rate": 6.280943425609699e-06, "loss": 1.0703157186508179, "loss/kd": 1.8015861511230469, "loss/lm": 0.3390452265739441, "step": 3082 }, { "epoch": 0.632929583247793, "grad_norm": 1.0709176438488566, "kd_ratio": 0.5, "learning_rate": 6.274770988755712e-06, "loss": 1.222063660621643, "loss/kd": 2.092524528503418, "loss/lm": 0.3516027629375458, "step": 3083 }, { "epoch": 0.6331348799014576, "grad_norm": 1.0792464716441943, "kd_ratio": 0.5, "learning_rate": 6.2686001994301885e-06, "loss": 0.938029408454895, "loss/kd": 1.5757185220718384, "loss/lm": 0.30034035444259644, "step": 3084 }, { "epoch": 0.6333401765551222, "grad_norm": 0.9687768410239245, "kd_ratio": 0.5, "learning_rate": 6.262431060362249e-06, "loss": 1.1720011234283447, "loss/kd": 1.965328335762024, "loss/lm": 0.378673791885376, "step": 3085 }, { "epoch": 0.6335454732087867, "grad_norm": 1.2170437653207664, "kd_ratio": 0.5, "learning_rate": 6.256263574280261e-06, "loss": 0.94969242811203, "loss/kd": 1.5724881887435913, "loss/lm": 0.32689666748046875, "step": 3086 }, { "epoch": 0.6337507698624513, "grad_norm": 0.9528205500207918, "kd_ratio": 0.5, "learning_rate": 6.250097743911877e-06, "loss": 0.7973403930664062, "loss/kd": 1.3106003999710083, "loss/lm": 0.2840804159641266, "step": 3087 }, { "epoch": 0.6339560665161158, "grad_norm": 0.9151145666351487, "kd_ratio": 0.5, "learning_rate": 6.243933571984009e-06, "loss": 1.0771732330322266, "loss/kd": 1.813757061958313, "loss/lm": 0.340589314699173, "step": 3088 }, { "epoch": 0.6341613631697803, "grad_norm": 1.0847326297046833, "kd_ratio": 0.5, "learning_rate": 6.237771061222836e-06, "loss": 1.1248424053192139, "loss/kd": 1.8512470722198486, "loss/lm": 0.39843764901161194, "step": 3089 }, { "epoch": 0.6343666598234449, "grad_norm": 1.1374356013942029, "kd_ratio": 0.5, "learning_rate": 6.231610214353807e-06, "loss": 1.17271888256073, "loss/kd": 1.9990787506103516, "loss/lm": 0.3463590443134308, "step": 3090 }, { "epoch": 0.6345719564771094, "grad_norm": 0.9454706265946505, "kd_ratio": 0.5, "learning_rate": 6.225451034101631e-06, "loss": 0.9229815006256104, "loss/kd": 1.528483271598816, "loss/lm": 0.31747978925704956, "step": 3091 }, { "epoch": 0.6347772531307739, "grad_norm": 0.9976109800813318, "kd_ratio": 0.5, "learning_rate": 6.219293523190286e-06, "loss": 0.992824137210846, "loss/kd": 1.625625729560852, "loss/lm": 0.36002257466316223, "step": 3092 }, { "epoch": 0.6349825497844385, "grad_norm": 0.9739761362026143, "kd_ratio": 0.5, "learning_rate": 6.213137684343002e-06, "loss": 1.1316039562225342, "loss/kd": 1.9388532638549805, "loss/lm": 0.3243545591831207, "step": 3093 }, { "epoch": 0.6351878464381031, "grad_norm": 1.1430713782632977, "kd_ratio": 0.5, "learning_rate": 6.206983520282279e-06, "loss": 1.0652518272399902, "loss/kd": 1.826919436454773, "loss/lm": 0.30358418822288513, "step": 3094 }, { "epoch": 0.6353931430917676, "grad_norm": 1.1258037763077746, "kd_ratio": 0.5, "learning_rate": 6.200831033729864e-06, "loss": 1.0907684564590454, "loss/kd": 1.8384943008422852, "loss/lm": 0.3430425524711609, "step": 3095 }, { "epoch": 0.6355984397454322, "grad_norm": 1.0601472738797695, "kd_ratio": 0.5, "learning_rate": 6.19468022740678e-06, "loss": 0.8737485408782959, "loss/kd": 1.3916547298431396, "loss/lm": 0.35584235191345215, "step": 3096 }, { "epoch": 0.6358037363990967, "grad_norm": 0.9894240926674333, "kd_ratio": 0.5, "learning_rate": 6.18853110403329e-06, "loss": 1.09228515625, "loss/kd": 1.736069917678833, "loss/lm": 0.448500394821167, "step": 3097 }, { "epoch": 0.6360090330527612, "grad_norm": 0.9416232020291991, "kd_ratio": 0.5, "learning_rate": 6.182383666328925e-06, "loss": 0.9442470073699951, "loss/kd": 1.5515261888504028, "loss/lm": 0.3369678854942322, "step": 3098 }, { "epoch": 0.6362143297064258, "grad_norm": 0.9316673803252115, "kd_ratio": 0.5, "learning_rate": 6.176237917012459e-06, "loss": 1.1762140989303589, "loss/kd": 2.0246875286102295, "loss/lm": 0.3277406394481659, "step": 3099 }, { "epoch": 0.6364196263600903, "grad_norm": 1.1223749060459356, "kd_ratio": 0.5, "learning_rate": 6.170093858801928e-06, "loss": 1.1705255508422852, "loss/kd": 1.9464730024337769, "loss/lm": 0.39457812905311584, "step": 3100 }, { "epoch": 0.6366249230137548, "grad_norm": 0.9585653868395226, "kd_ratio": 0.5, "learning_rate": 6.163951494414619e-06, "loss": 0.9382580518722534, "loss/kd": 1.5548402070999146, "loss/lm": 0.3216758370399475, "step": 3101 }, { "epoch": 0.6368302196674194, "grad_norm": 1.055286306438476, "kd_ratio": 0.5, "learning_rate": 6.157810826567068e-06, "loss": 0.9154030680656433, "loss/kd": 1.5108599662780762, "loss/lm": 0.31994616985321045, "step": 3102 }, { "epoch": 0.637035516321084, "grad_norm": 0.9675270569529189, "kd_ratio": 0.5, "learning_rate": 6.151671857975061e-06, "loss": 0.973220944404602, "loss/kd": 1.6460787057876587, "loss/lm": 0.3003632128238678, "step": 3103 }, { "epoch": 0.6372408129747485, "grad_norm": 1.063469396176767, "kd_ratio": 0.5, "learning_rate": 6.1455345913536325e-06, "loss": 1.1238789558410645, "loss/kd": 1.974044680595398, "loss/lm": 0.27371329069137573, "step": 3104 }, { "epoch": 0.6374461096284131, "grad_norm": 1.1003132169197376, "kd_ratio": 0.5, "learning_rate": 6.139399029417061e-06, "loss": 1.1436635255813599, "loss/kd": 1.8989653587341309, "loss/lm": 0.38836175203323364, "step": 3105 }, { "epoch": 0.6376514062820776, "grad_norm": 1.2026539699932803, "kd_ratio": 0.5, "learning_rate": 6.13326517487888e-06, "loss": 0.8874175548553467, "loss/kd": 1.4965784549713135, "loss/lm": 0.27825671434402466, "step": 3106 }, { "epoch": 0.6378567029357421, "grad_norm": 1.224742935257408, "kd_ratio": 0.5, "learning_rate": 6.12713303045186e-06, "loss": 1.1391115188598633, "loss/kd": 1.9424996376037598, "loss/lm": 0.33572348952293396, "step": 3107 }, { "epoch": 0.6380619995894067, "grad_norm": 0.9216446435049743, "kd_ratio": 0.5, "learning_rate": 6.121002598848017e-06, "loss": 0.9765897393226624, "loss/kd": 1.598811149597168, "loss/lm": 0.35436829924583435, "step": 3108 }, { "epoch": 0.6382672962430712, "grad_norm": 1.070586147606561, "kd_ratio": 0.5, "learning_rate": 6.114873882778609e-06, "loss": 0.9106866717338562, "loss/kd": 1.546579360961914, "loss/lm": 0.2747940123081207, "step": 3109 }, { "epoch": 0.6384725928967357, "grad_norm": 0.9903527206580087, "kd_ratio": 0.5, "learning_rate": 6.108746884954135e-06, "loss": 1.1477947235107422, "loss/kd": 2.0156657695770264, "loss/lm": 0.279923677444458, "step": 3110 }, { "epoch": 0.6386778895504003, "grad_norm": 1.1179879640706063, "kd_ratio": 0.5, "learning_rate": 6.10262160808434e-06, "loss": 0.9346492886543274, "loss/kd": 1.5487934350967407, "loss/lm": 0.3205051124095917, "step": 3111 }, { "epoch": 0.6388831862040649, "grad_norm": 1.1397174062131679, "kd_ratio": 0.5, "learning_rate": 6.096498054878204e-06, "loss": 1.1438305377960205, "loss/kd": 1.9622063636779785, "loss/lm": 0.3254547417163849, "step": 3112 }, { "epoch": 0.6390884828577295, "grad_norm": 0.9711315629722996, "kd_ratio": 0.5, "learning_rate": 6.090376228043938e-06, "loss": 1.0207246541976929, "loss/kd": 1.669317364692688, "loss/lm": 0.37213191390037537, "step": 3113 }, { "epoch": 0.639293779511394, "grad_norm": 0.9557566810478293, "kd_ratio": 0.5, "learning_rate": 6.084256130289e-06, "loss": 1.0314445495605469, "loss/kd": 1.710241675376892, "loss/lm": 0.3526473641395569, "step": 3114 }, { "epoch": 0.6394990761650585, "grad_norm": 0.9405662169911472, "kd_ratio": 0.5, "learning_rate": 6.0781377643200765e-06, "loss": 0.904083251953125, "loss/kd": 1.462707757949829, "loss/lm": 0.3454587757587433, "step": 3115 }, { "epoch": 0.639704372818723, "grad_norm": 0.8938858872082354, "kd_ratio": 0.5, "learning_rate": 6.072021132843087e-06, "loss": 1.2684619426727295, "loss/kd": 2.2100026607513428, "loss/lm": 0.3269211947917938, "step": 3116 }, { "epoch": 0.6399096694723876, "grad_norm": 0.9093180481344622, "kd_ratio": 0.5, "learning_rate": 6.065906238563195e-06, "loss": 1.281582236289978, "loss/kd": 2.154900074005127, "loss/lm": 0.4082643389701843, "step": 3117 }, { "epoch": 0.6401149661260521, "grad_norm": 0.9933208077986562, "kd_ratio": 0.5, "learning_rate": 6.059793084184782e-06, "loss": 1.136928915977478, "loss/kd": 1.8891081809997559, "loss/lm": 0.3847495913505554, "step": 3118 }, { "epoch": 0.6403202627797167, "grad_norm": 1.118215580715179, "kd_ratio": 0.5, "learning_rate": 6.053681672411471e-06, "loss": 0.8709299564361572, "loss/kd": 1.3323233127593994, "loss/lm": 0.4095366597175598, "step": 3119 }, { "epoch": 0.6405255594333812, "grad_norm": 1.2647911378388288, "kd_ratio": 0.5, "learning_rate": 6.047572005946105e-06, "loss": 1.0408282279968262, "loss/kd": 1.7361479997634888, "loss/lm": 0.3455085754394531, "step": 3120 }, { "epoch": 0.6407308560870458, "grad_norm": 0.9995852870652153, "kd_ratio": 0.5, "learning_rate": 6.041464087490756e-06, "loss": 0.9261789917945862, "loss/kd": 1.5115435123443604, "loss/lm": 0.3408145010471344, "step": 3121 }, { "epoch": 0.6409361527407104, "grad_norm": 1.2754178243278806, "kd_ratio": 0.5, "learning_rate": 6.035357919746735e-06, "loss": 1.1487905979156494, "loss/kd": 1.902260422706604, "loss/lm": 0.3953207731246948, "step": 3122 }, { "epoch": 0.6411414493943749, "grad_norm": 1.0313416536856268, "kd_ratio": 0.5, "learning_rate": 6.029253505414565e-06, "loss": 1.1333550214767456, "loss/kd": 1.8404713869094849, "loss/lm": 0.4262387156486511, "step": 3123 }, { "epoch": 0.6413467460480394, "grad_norm": 0.9303740523151115, "kd_ratio": 0.5, "learning_rate": 6.0231508471940005e-06, "loss": 0.9735040664672852, "loss/kd": 1.5980069637298584, "loss/lm": 0.3490011394023895, "step": 3124 }, { "epoch": 0.641552042701704, "grad_norm": 1.0152816943558787, "kd_ratio": 0.5, "learning_rate": 6.017049947784015e-06, "loss": 1.2983789443969727, "loss/kd": 2.267634868621826, "loss/lm": 0.32912302017211914, "step": 3125 }, { "epoch": 0.6417573393553685, "grad_norm": 0.9742687820371334, "kd_ratio": 0.5, "learning_rate": 6.010950809882804e-06, "loss": 0.9600133895874023, "loss/kd": 1.6730170249938965, "loss/lm": 0.24700972437858582, "step": 3126 }, { "epoch": 0.641962636009033, "grad_norm": 1.0122667690849987, "kd_ratio": 0.5, "learning_rate": 6.004853436187794e-06, "loss": 1.0885798931121826, "loss/kd": 1.8451696634292603, "loss/lm": 0.331990122795105, "step": 3127 }, { "epoch": 0.6421679326626976, "grad_norm": 0.9261212982007598, "kd_ratio": 0.5, "learning_rate": 5.998757829395617e-06, "loss": 1.2506158351898193, "loss/kd": 2.171142578125, "loss/lm": 0.3300889730453491, "step": 3128 }, { "epoch": 0.6423732293163621, "grad_norm": 0.8529171598636907, "kd_ratio": 0.5, "learning_rate": 5.992663992202136e-06, "loss": 1.099610447883606, "loss/kd": 1.8518911600112915, "loss/lm": 0.34732967615127563, "step": 3129 }, { "epoch": 0.6425785259700267, "grad_norm": 1.0117056764594539, "kd_ratio": 0.5, "learning_rate": 5.986571927302423e-06, "loss": 0.8796042799949646, "loss/kd": 1.3826043605804443, "loss/lm": 0.37660422921180725, "step": 3130 }, { "epoch": 0.6427838226236913, "grad_norm": 0.9570948543486435, "kd_ratio": 0.5, "learning_rate": 5.9804816373907625e-06, "loss": 0.9620095491409302, "loss/kd": 1.6187528371810913, "loss/lm": 0.30526620149612427, "step": 3131 }, { "epoch": 0.6429891192773558, "grad_norm": 1.0673418818052176, "kd_ratio": 0.5, "learning_rate": 5.9743931251606714e-06, "loss": 1.027605414390564, "loss/kd": 1.7222334146499634, "loss/lm": 0.3329773247241974, "step": 3132 }, { "epoch": 0.6431944159310203, "grad_norm": 0.9414634392983272, "kd_ratio": 0.5, "learning_rate": 5.968306393304863e-06, "loss": 1.231980323791504, "loss/kd": 2.1022839546203613, "loss/lm": 0.36167675256729126, "step": 3133 }, { "epoch": 0.6433997125846849, "grad_norm": 0.958668304237227, "kd_ratio": 0.5, "learning_rate": 5.962221444515273e-06, "loss": 0.9458860158920288, "loss/kd": 1.5671886205673218, "loss/lm": 0.32458335161209106, "step": 3134 }, { "epoch": 0.6436050092383494, "grad_norm": 1.0288440347748031, "kd_ratio": 0.5, "learning_rate": 5.956138281483039e-06, "loss": 1.0824049711227417, "loss/kd": 1.842939019203186, "loss/lm": 0.32187098264694214, "step": 3135 }, { "epoch": 0.6438103058920139, "grad_norm": 1.1283856043766654, "kd_ratio": 0.5, "learning_rate": 5.950056906898518e-06, "loss": 0.9195363521575928, "loss/kd": 1.5379143953323364, "loss/lm": 0.30115824937820435, "step": 3136 }, { "epoch": 0.6440156025456785, "grad_norm": 0.8924808561337233, "kd_ratio": 0.5, "learning_rate": 5.943977323451277e-06, "loss": 0.957156777381897, "loss/kd": 1.6782163381576538, "loss/lm": 0.23609723150730133, "step": 3137 }, { "epoch": 0.644220899199343, "grad_norm": 1.1237197604223377, "kd_ratio": 0.5, "learning_rate": 5.9378995338300815e-06, "loss": 1.080394983291626, "loss/kd": 1.8159558773040771, "loss/lm": 0.34483397006988525, "step": 3138 }, { "epoch": 0.6444261958530076, "grad_norm": 0.8748861916855641, "kd_ratio": 0.5, "learning_rate": 5.931823540722912e-06, "loss": 0.9058727622032166, "loss/kd": 1.4646137952804565, "loss/lm": 0.34713172912597656, "step": 3139 }, { "epoch": 0.6446314925066722, "grad_norm": 1.1414379482614698, "kd_ratio": 0.5, "learning_rate": 5.925749346816949e-06, "loss": 1.0402641296386719, "loss/kd": 1.7610169649124146, "loss/lm": 0.3195113241672516, "step": 3140 }, { "epoch": 0.6448367891603367, "grad_norm": 1.173887169615642, "kd_ratio": 0.5, "learning_rate": 5.919676954798583e-06, "loss": 1.0844941139221191, "loss/kd": 1.792664885520935, "loss/lm": 0.3763233423233032, "step": 3141 }, { "epoch": 0.6450420858140012, "grad_norm": 0.8714899739474965, "kd_ratio": 0.5, "learning_rate": 5.913606367353396e-06, "loss": 0.9238755106925964, "loss/kd": 1.5303075313568115, "loss/lm": 0.31744349002838135, "step": 3142 }, { "epoch": 0.6452473824676658, "grad_norm": 1.0782715679403152, "kd_ratio": 0.5, "learning_rate": 5.907537587166191e-06, "loss": 1.1383013725280762, "loss/kd": 1.8625705242156982, "loss/lm": 0.41403213143348694, "step": 3143 }, { "epoch": 0.6454526791213303, "grad_norm": 0.8713063700073674, "kd_ratio": 0.5, "learning_rate": 5.901470616920957e-06, "loss": 0.9462263584136963, "loss/kd": 1.5715628862380981, "loss/lm": 0.32088980078697205, "step": 3144 }, { "epoch": 0.6456579757749948, "grad_norm": 1.0510675817906197, "kd_ratio": 0.5, "learning_rate": 5.895405459300881e-06, "loss": 1.806731939315796, "loss/kd": 3.2758989334106445, "loss/lm": 0.3375648260116577, "step": 3145 }, { "epoch": 0.6458632724286594, "grad_norm": 0.9118311313046401, "kd_ratio": 0.5, "learning_rate": 5.889342116988362e-06, "loss": 1.1751248836517334, "loss/kd": 1.924588918685913, "loss/lm": 0.42566096782684326, "step": 3146 }, { "epoch": 0.6460685690823239, "grad_norm": 1.1982528732998587, "kd_ratio": 0.5, "learning_rate": 5.883280592664979e-06, "loss": 1.8652620315551758, "loss/kd": 3.422145128250122, "loss/lm": 0.3083789348602295, "step": 3147 }, { "epoch": 0.6462738657359886, "grad_norm": 0.9024146294568216, "kd_ratio": 0.5, "learning_rate": 5.877220889011526e-06, "loss": 0.8799943923950195, "loss/kd": 1.3872134685516357, "loss/lm": 0.37277528643608093, "step": 3148 }, { "epoch": 0.6464791623896531, "grad_norm": 1.02729477167352, "kd_ratio": 0.5, "learning_rate": 5.871163008707977e-06, "loss": 1.0467374324798584, "loss/kd": 1.77339506149292, "loss/lm": 0.3200797438621521, "step": 3149 }, { "epoch": 0.6466844590433176, "grad_norm": 0.9094945639414905, "kd_ratio": 0.5, "learning_rate": 5.865106954433502e-06, "loss": 1.2307066917419434, "loss/kd": 2.027778387069702, "loss/lm": 0.433634877204895, "step": 3150 }, { "epoch": 0.6468897556969821, "grad_norm": 1.1457719143469183, "kd_ratio": 0.5, "learning_rate": 5.859052728866468e-06, "loss": 1.0768259763717651, "loss/kd": 1.726150631904602, "loss/lm": 0.427501380443573, "step": 3151 }, { "epoch": 0.6470950523506467, "grad_norm": 1.212532689932059, "kd_ratio": 0.5, "learning_rate": 5.853000334684427e-06, "loss": 1.251126766204834, "loss/kd": 2.0928447246551514, "loss/lm": 0.409408837556839, "step": 3152 }, { "epoch": 0.6473003490043112, "grad_norm": 1.1440405591402636, "kd_ratio": 0.5, "learning_rate": 5.846949774564133e-06, "loss": 0.9528024196624756, "loss/kd": 1.5169849395751953, "loss/lm": 0.38861995935440063, "step": 3153 }, { "epoch": 0.6475056456579757, "grad_norm": 0.9519264619597014, "kd_ratio": 0.5, "learning_rate": 5.840901051181516e-06, "loss": 1.1872397661209106, "loss/kd": 2.001882314682007, "loss/lm": 0.3725971281528473, "step": 3154 }, { "epoch": 0.6477109423116403, "grad_norm": 1.051776164109432, "kd_ratio": 0.5, "learning_rate": 5.834854167211699e-06, "loss": 1.1665153503417969, "loss/kd": 1.9986534118652344, "loss/lm": 0.3343771696090698, "step": 3155 }, { "epoch": 0.6479162389653048, "grad_norm": 1.2770443628308432, "kd_ratio": 0.5, "learning_rate": 5.828809125328987e-06, "loss": 1.0371313095092773, "loss/kd": 1.760584831237793, "loss/lm": 0.3136777877807617, "step": 3156 }, { "epoch": 0.6481215356189695, "grad_norm": 1.0965165497486875, "kd_ratio": 0.5, "learning_rate": 5.822765928206881e-06, "loss": 1.5164926052093506, "loss/kd": 2.7079849243164062, "loss/lm": 0.32500022649765015, "step": 3157 }, { "epoch": 0.648326832272634, "grad_norm": 1.2055392832589586, "kd_ratio": 0.5, "learning_rate": 5.8167245785180535e-06, "loss": 1.2670998573303223, "loss/kd": 2.154104232788086, "loss/lm": 0.3800954520702362, "step": 3158 }, { "epoch": 0.6485321289262985, "grad_norm": 0.918015442157592, "kd_ratio": 0.5, "learning_rate": 5.810685078934375e-06, "loss": 0.9759163856506348, "loss/kd": 1.5529569387435913, "loss/lm": 0.39887580275535583, "step": 3159 }, { "epoch": 0.6487374255799631, "grad_norm": 1.0028646106637318, "kd_ratio": 0.5, "learning_rate": 5.804647432126878e-06, "loss": 1.0920454263687134, "loss/kd": 1.7562267780303955, "loss/lm": 0.4278640151023865, "step": 3160 }, { "epoch": 0.6489427222336276, "grad_norm": 0.936945291271859, "kd_ratio": 0.5, "learning_rate": 5.798611640765792e-06, "loss": 1.1084867715835571, "loss/kd": 1.8755850791931152, "loss/lm": 0.3413884937763214, "step": 3161 }, { "epoch": 0.6491480188872921, "grad_norm": 0.9859980931269279, "kd_ratio": 0.5, "learning_rate": 5.792577707520518e-06, "loss": 1.2014901638031006, "loss/kd": 2.0502562522888184, "loss/lm": 0.3527241051197052, "step": 3162 }, { "epoch": 0.6493533155409567, "grad_norm": 1.0330334070657567, "kd_ratio": 0.5, "learning_rate": 5.78654563505964e-06, "loss": 0.9544640779495239, "loss/kd": 1.6232327222824097, "loss/lm": 0.2856954336166382, "step": 3163 }, { "epoch": 0.6495586121946212, "grad_norm": 0.9239523928020731, "kd_ratio": 0.5, "learning_rate": 5.780515426050917e-06, "loss": 1.888742446899414, "loss/kd": 3.455742835998535, "loss/lm": 0.3217420279979706, "step": 3164 }, { "epoch": 0.6497639088482857, "grad_norm": 1.2264528448236196, "kd_ratio": 0.5, "learning_rate": 5.774487083161278e-06, "loss": 1.0196584463119507, "loss/kd": 1.6276453733444214, "loss/lm": 0.41167154908180237, "step": 3165 }, { "epoch": 0.6499692055019504, "grad_norm": 1.0262662991412559, "kd_ratio": 0.5, "learning_rate": 5.768460609056838e-06, "loss": 0.8319952487945557, "loss/kd": 1.373347282409668, "loss/lm": 0.29064327478408813, "step": 3166 }, { "epoch": 0.6501745021556149, "grad_norm": 1.072050177630072, "kd_ratio": 0.5, "learning_rate": 5.762436006402874e-06, "loss": 1.515251636505127, "loss/kd": 2.6102662086486816, "loss/lm": 0.42023715376853943, "step": 3167 }, { "epoch": 0.6503797988092794, "grad_norm": 1.039988351911028, "kd_ratio": 0.5, "learning_rate": 5.756413277863843e-06, "loss": 1.9418864250183105, "loss/kd": 3.6287944316864014, "loss/lm": 0.2549784481525421, "step": 3168 }, { "epoch": 0.650585095462944, "grad_norm": 1.1878732724424157, "kd_ratio": 0.5, "learning_rate": 5.7503924261033705e-06, "loss": 1.100531816482544, "loss/kd": 1.8285645246505737, "loss/lm": 0.3724989891052246, "step": 3169 }, { "epoch": 0.6507903921166085, "grad_norm": 1.257607189039203, "kd_ratio": 0.5, "learning_rate": 5.744373453784256e-06, "loss": 1.2092328071594238, "loss/kd": 2.067204236984253, "loss/lm": 0.35126128792762756, "step": 3170 }, { "epoch": 0.650995688770273, "grad_norm": 1.0251002839645396, "kd_ratio": 0.5, "learning_rate": 5.738356363568463e-06, "loss": 0.8869470357894897, "loss/kd": 1.4593135118484497, "loss/lm": 0.3145805895328522, "step": 3171 }, { "epoch": 0.6512009854239376, "grad_norm": 1.137034355827606, "kd_ratio": 0.5, "learning_rate": 5.732341158117118e-06, "loss": 0.937475323677063, "loss/kd": 1.5426881313323975, "loss/lm": 0.3322625458240509, "step": 3172 }, { "epoch": 0.6514062820776021, "grad_norm": 1.1914401704470319, "kd_ratio": 0.5, "learning_rate": 5.7263278400905265e-06, "loss": 1.1817474365234375, "loss/kd": 2.0296714305877686, "loss/lm": 0.33382338285446167, "step": 3173 }, { "epoch": 0.6516115787312666, "grad_norm": 1.1999943813414609, "kd_ratio": 0.5, "learning_rate": 5.72031641214815e-06, "loss": 1.0663228034973145, "loss/kd": 1.7855719327926636, "loss/lm": 0.34707367420196533, "step": 3174 }, { "epoch": 0.6518168753849313, "grad_norm": 2.0661393144685425, "kd_ratio": 0.5, "learning_rate": 5.714306876948621e-06, "loss": 1.0860154628753662, "loss/kd": 1.7454382181167603, "loss/lm": 0.42659273743629456, "step": 3175 }, { "epoch": 0.6520221720385958, "grad_norm": 1.0499458901866248, "kd_ratio": 0.5, "learning_rate": 5.708299237149721e-06, "loss": 1.1185941696166992, "loss/kd": 1.9137935638427734, "loss/lm": 0.3233947455883026, "step": 3176 }, { "epoch": 0.6522274686922603, "grad_norm": 1.4023984423296403, "kd_ratio": 0.5, "learning_rate": 5.70229349540841e-06, "loss": 0.8848651647567749, "loss/kd": 1.4645438194274902, "loss/lm": 0.30518656969070435, "step": 3177 }, { "epoch": 0.6524327653459249, "grad_norm": 0.9913588590150884, "kd_ratio": 0.5, "learning_rate": 5.696289654380798e-06, "loss": 1.0637431144714355, "loss/kd": 1.8317927122116089, "loss/lm": 0.2956934869289398, "step": 3178 }, { "epoch": 0.6526380619995894, "grad_norm": 1.5294044638117383, "kd_ratio": 0.5, "learning_rate": 5.69028771672216e-06, "loss": 0.9548894762992859, "loss/kd": 1.623335838317871, "loss/lm": 0.2864430844783783, "step": 3179 }, { "epoch": 0.6528433586532539, "grad_norm": 1.141655229798286, "kd_ratio": 0.5, "learning_rate": 5.684287685086931e-06, "loss": 1.1197068691253662, "loss/kd": 1.9219039678573608, "loss/lm": 0.31750982999801636, "step": 3180 }, { "epoch": 0.6530486553069185, "grad_norm": 1.2391184053052324, "kd_ratio": 0.5, "learning_rate": 5.67828956212869e-06, "loss": 1.0912121534347534, "loss/kd": 1.774867057800293, "loss/lm": 0.4075571894645691, "step": 3181 }, { "epoch": 0.653253951960583, "grad_norm": 1.351350644436342, "kd_ratio": 0.5, "learning_rate": 5.672293350500189e-06, "loss": 0.9015899896621704, "loss/kd": 1.4689900875091553, "loss/lm": 0.33418989181518555, "step": 3182 }, { "epoch": 0.6534592486142475, "grad_norm": 1.0494308313555214, "kd_ratio": 0.5, "learning_rate": 5.666299052853314e-06, "loss": 1.1351557970046997, "loss/kd": 1.975303053855896, "loss/lm": 0.2950086295604706, "step": 3183 }, { "epoch": 0.6536645452679122, "grad_norm": 1.0326118713287935, "kd_ratio": 0.5, "learning_rate": 5.660306671839135e-06, "loss": 1.0901943445205688, "loss/kd": 1.8388992547988892, "loss/lm": 0.34148943424224854, "step": 3184 }, { "epoch": 0.6538698419215767, "grad_norm": 1.0435203874139956, "kd_ratio": 0.5, "learning_rate": 5.654316210107843e-06, "loss": 0.8212348818778992, "loss/kd": 1.2836840152740479, "loss/lm": 0.3587857186794281, "step": 3185 }, { "epoch": 0.6540751385752412, "grad_norm": 0.9522943488473059, "kd_ratio": 0.5, "learning_rate": 5.648327670308797e-06, "loss": 0.9201993942260742, "loss/kd": 1.5707449913024902, "loss/lm": 0.2696538269519806, "step": 3186 }, { "epoch": 0.6542804352289058, "grad_norm": 1.0961026914813803, "kd_ratio": 0.5, "learning_rate": 5.642341055090508e-06, "loss": 1.198970913887024, "loss/kd": 2.0306737422943115, "loss/lm": 0.3672681152820587, "step": 3187 }, { "epoch": 0.6544857318825703, "grad_norm": 1.1409574062621979, "kd_ratio": 0.5, "learning_rate": 5.636356367100621e-06, "loss": 1.2093210220336914, "loss/kd": 2.0469861030578613, "loss/lm": 0.3716558516025543, "step": 3188 }, { "epoch": 0.6546910285362348, "grad_norm": 0.9668439550956411, "kd_ratio": 0.5, "learning_rate": 5.630373608985944e-06, "loss": 0.8738187551498413, "loss/kd": 1.3945029973983765, "loss/lm": 0.35313448309898376, "step": 3189 }, { "epoch": 0.6548963251898994, "grad_norm": 1.1834567139233856, "kd_ratio": 0.5, "learning_rate": 5.624392783392422e-06, "loss": 1.0739723443984985, "loss/kd": 1.8045425415039062, "loss/lm": 0.34340208768844604, "step": 3190 }, { "epoch": 0.6551016218435639, "grad_norm": 0.8886617379935955, "kd_ratio": 0.5, "learning_rate": 5.618413892965158e-06, "loss": 1.9782850742340088, "loss/kd": 3.5562829971313477, "loss/lm": 0.40028712153434753, "step": 3191 }, { "epoch": 0.6553069184972284, "grad_norm": 1.415207230190337, "kd_ratio": 0.5, "learning_rate": 5.612436940348377e-06, "loss": 0.8080931901931763, "loss/kd": 1.3712481260299683, "loss/lm": 0.24493829905986786, "step": 3192 }, { "epoch": 0.6555122151508931, "grad_norm": 1.046516797589833, "kd_ratio": 0.5, "learning_rate": 5.606461928185472e-06, "loss": 1.0517520904541016, "loss/kd": 1.6828927993774414, "loss/lm": 0.4206114113330841, "step": 3193 }, { "epoch": 0.6557175118045576, "grad_norm": 1.3739187984204129, "kd_ratio": 0.5, "learning_rate": 5.60048885911895e-06, "loss": 0.9484047889709473, "loss/kd": 1.580103874206543, "loss/lm": 0.31670576333999634, "step": 3194 }, { "epoch": 0.6559228084582222, "grad_norm": 1.397092425547371, "kd_ratio": 0.5, "learning_rate": 5.5945177357904935e-06, "loss": 1.0104426145553589, "loss/kd": 1.7019201517105103, "loss/lm": 0.3189651072025299, "step": 3195 }, { "epoch": 0.6561281051118867, "grad_norm": 0.9467576900582597, "kd_ratio": 0.5, "learning_rate": 5.588548560840892e-06, "loss": 0.9162116050720215, "loss/kd": 1.4922994375228882, "loss/lm": 0.34012383222579956, "step": 3196 }, { "epoch": 0.6563334017655512, "grad_norm": 1.1943456390105707, "kd_ratio": 0.5, "learning_rate": 5.582581336910093e-06, "loss": 1.0314668416976929, "loss/kd": 1.7752615213394165, "loss/lm": 0.2876720726490021, "step": 3197 }, { "epoch": 0.6565386984192157, "grad_norm": 1.1347245388010836, "kd_ratio": 0.5, "learning_rate": 5.576616066637175e-06, "loss": 1.0376633405685425, "loss/kd": 1.7368639707565308, "loss/lm": 0.33846279978752136, "step": 3198 }, { "epoch": 0.6567439950728803, "grad_norm": 1.1410435022417436, "kd_ratio": 0.5, "learning_rate": 5.570652752660343e-06, "loss": 1.0245000123977661, "loss/kd": 1.7357563972473145, "loss/lm": 0.31324365735054016, "step": 3199 }, { "epoch": 0.6569492917265448, "grad_norm": 1.2545548714025, "kd_ratio": 0.5, "learning_rate": 5.564691397616961e-06, "loss": 1.0229735374450684, "loss/kd": 1.7608911991119385, "loss/lm": 0.2850559949874878, "step": 3200 }, { "epoch": 0.6571545883802093, "grad_norm": 1.032436473295135, "kd_ratio": 0.5, "learning_rate": 5.558732004143501e-06, "loss": 1.3715312480926514, "loss/kd": 2.3767807483673096, "loss/lm": 0.36628177762031555, "step": 3201 }, { "epoch": 0.657359885033874, "grad_norm": 1.341502512042662, "kd_ratio": 0.5, "learning_rate": 5.5527745748755855e-06, "loss": 1.0765811204910278, "loss/kd": 1.769538402557373, "loss/lm": 0.383623868227005, "step": 3202 }, { "epoch": 0.6575651816875385, "grad_norm": 0.9906225501938855, "kd_ratio": 0.5, "learning_rate": 5.546819112447952e-06, "loss": 1.0375477075576782, "loss/kd": 1.7755740880966187, "loss/lm": 0.2995212972164154, "step": 3203 }, { "epoch": 0.6577704783412031, "grad_norm": 1.4579646176362324, "kd_ratio": 0.5, "learning_rate": 5.540865619494484e-06, "loss": 0.9696465730667114, "loss/kd": 1.5959569215774536, "loss/lm": 0.34333622455596924, "step": 3204 }, { "epoch": 0.6579757749948676, "grad_norm": 0.9604037278861678, "kd_ratio": 0.5, "learning_rate": 5.534914098648185e-06, "loss": 1.064228892326355, "loss/kd": 1.756664514541626, "loss/lm": 0.37179329991340637, "step": 3205 }, { "epoch": 0.6581810716485321, "grad_norm": 1.0913819953349646, "kd_ratio": 0.5, "learning_rate": 5.528964552541187e-06, "loss": 1.1177772283554077, "loss/kd": 1.880749225616455, "loss/lm": 0.3548052906990051, "step": 3206 }, { "epoch": 0.6583863683021967, "grad_norm": 1.0803608201947466, "kd_ratio": 0.5, "learning_rate": 5.523016983804759e-06, "loss": 1.0804165601730347, "loss/kd": 1.8458495140075684, "loss/lm": 0.314983606338501, "step": 3207 }, { "epoch": 0.6585916649558612, "grad_norm": 1.1287039715192697, "kd_ratio": 0.5, "learning_rate": 5.5170713950692735e-06, "loss": 1.12368905544281, "loss/kd": 1.9087800979614258, "loss/lm": 0.3385980427265167, "step": 3208 }, { "epoch": 0.6587969616095257, "grad_norm": 1.0699185891511303, "kd_ratio": 0.5, "learning_rate": 5.511127788964248e-06, "loss": 0.9341123104095459, "loss/kd": 1.5021326541900635, "loss/lm": 0.3660919666290283, "step": 3209 }, { "epoch": 0.6590022582631903, "grad_norm": 1.0567032840739257, "kd_ratio": 0.5, "learning_rate": 5.505186168118314e-06, "loss": 1.132968783378601, "loss/kd": 1.9028048515319824, "loss/lm": 0.36313262581825256, "step": 3210 }, { "epoch": 0.6592075549168549, "grad_norm": 1.0741823261872139, "kd_ratio": 0.5, "learning_rate": 5.499246535159231e-06, "loss": 1.2021214962005615, "loss/kd": 2.0719873905181885, "loss/lm": 0.33225569128990173, "step": 3211 }, { "epoch": 0.6594128515705194, "grad_norm": 1.3063479797361384, "kd_ratio": 0.5, "learning_rate": 5.493308892713866e-06, "loss": 0.9091973900794983, "loss/kd": 1.5735236406326294, "loss/lm": 0.2448711395263672, "step": 3212 }, { "epoch": 0.659618148224184, "grad_norm": 0.9128336877789951, "kd_ratio": 0.5, "learning_rate": 5.48737324340822e-06, "loss": 0.9300894141197205, "loss/kd": 1.4549190998077393, "loss/lm": 0.4052596986293793, "step": 3213 }, { "epoch": 0.6598234448778485, "grad_norm": 1.624812265176206, "kd_ratio": 0.5, "learning_rate": 5.481439589867406e-06, "loss": 0.9784060120582581, "loss/kd": 1.6072545051574707, "loss/lm": 0.3495575189590454, "step": 3214 }, { "epoch": 0.660028741531513, "grad_norm": 0.9364799920487734, "kd_ratio": 0.5, "learning_rate": 5.47550793471566e-06, "loss": 1.1538574695587158, "loss/kd": 1.9510083198547363, "loss/lm": 0.3567065894603729, "step": 3215 }, { "epoch": 0.6602340381851776, "grad_norm": 1.2220704054109137, "kd_ratio": 0.5, "learning_rate": 5.469578280576322e-06, "loss": 0.9863395690917969, "loss/kd": 1.6579153537750244, "loss/lm": 0.31476372480392456, "step": 3216 }, { "epoch": 0.6604393348388421, "grad_norm": 1.003497133328598, "kd_ratio": 0.5, "learning_rate": 5.463650630071857e-06, "loss": 0.873982310295105, "loss/kd": 1.3839924335479736, "loss/lm": 0.3639722168445587, "step": 3217 }, { "epoch": 0.6606446314925066, "grad_norm": 1.0518717702148646, "kd_ratio": 0.5, "learning_rate": 5.457724985823849e-06, "loss": 0.9696213006973267, "loss/kd": 1.5565651655197144, "loss/lm": 0.38267746567726135, "step": 3218 }, { "epoch": 0.6608499281461713, "grad_norm": 1.248462816748159, "kd_ratio": 0.5, "learning_rate": 5.451801350452975e-06, "loss": 1.1096457242965698, "loss/kd": 1.9130163192749023, "loss/lm": 0.30627503991127014, "step": 3219 }, { "epoch": 0.6610552247998358, "grad_norm": 1.0237932348406227, "kd_ratio": 0.5, "learning_rate": 5.445879726579042e-06, "loss": 0.9971054792404175, "loss/kd": 1.6608580350875854, "loss/lm": 0.3333529233932495, "step": 3220 }, { "epoch": 0.6612605214535003, "grad_norm": 1.103574647882922, "kd_ratio": 0.5, "learning_rate": 5.4399601168209614e-06, "loss": 0.921177089214325, "loss/kd": 1.47593355178833, "loss/lm": 0.3664206564426422, "step": 3221 }, { "epoch": 0.6614658181071649, "grad_norm": 1.1059622907408355, "kd_ratio": 0.5, "learning_rate": 5.434042523796758e-06, "loss": 1.0547584295272827, "loss/kd": 1.8038976192474365, "loss/lm": 0.3056192100048065, "step": 3222 }, { "epoch": 0.6616711147608294, "grad_norm": 0.9714484288425312, "kd_ratio": 0.5, "learning_rate": 5.428126950123551e-06, "loss": 0.8198895454406738, "loss/kd": 1.2538923025131226, "loss/lm": 0.3858867883682251, "step": 3223 }, { "epoch": 0.6618764114144939, "grad_norm": 0.9602875151229715, "kd_ratio": 0.5, "learning_rate": 5.42221339841758e-06, "loss": 1.1528295278549194, "loss/kd": 1.99365234375, "loss/lm": 0.3120066225528717, "step": 3224 }, { "epoch": 0.6620817080681585, "grad_norm": 1.0154702215096219, "kd_ratio": 0.5, "learning_rate": 5.416301871294186e-06, "loss": 0.9229000806808472, "loss/kd": 1.5622822046279907, "loss/lm": 0.2835179269313812, "step": 3225 }, { "epoch": 0.662287004721823, "grad_norm": 0.963201265960058, "kd_ratio": 0.5, "learning_rate": 5.410392371367817e-06, "loss": 1.2371184825897217, "loss/kd": 2.1228795051574707, "loss/lm": 0.3513573706150055, "step": 3226 }, { "epoch": 0.6624923013754875, "grad_norm": 0.9116156068426299, "kd_ratio": 0.5, "learning_rate": 5.404484901252023e-06, "loss": 1.1482871770858765, "loss/kd": 1.9722230434417725, "loss/lm": 0.32435137033462524, "step": 3227 }, { "epoch": 0.6626975980291522, "grad_norm": 1.1904183670046824, "kd_ratio": 0.5, "learning_rate": 5.398579463559448e-06, "loss": 1.121363639831543, "loss/kd": 1.961437702178955, "loss/lm": 0.2812895178794861, "step": 3228 }, { "epoch": 0.6629028946828167, "grad_norm": 0.9425109684584325, "kd_ratio": 0.5, "learning_rate": 5.3926760609018555e-06, "loss": 1.355313777923584, "loss/kd": 2.367886781692505, "loss/lm": 0.34274089336395264, "step": 3229 }, { "epoch": 0.6631081913364812, "grad_norm": 1.0047867043245495, "kd_ratio": 0.5, "learning_rate": 5.386774695890083e-06, "loss": 1.16245436668396, "loss/kd": 2.0183820724487305, "loss/lm": 0.30652666091918945, "step": 3230 }, { "epoch": 0.6633134879901458, "grad_norm": 1.0070018988976952, "kd_ratio": 0.5, "learning_rate": 5.3808753711341e-06, "loss": 0.9600775837898254, "loss/kd": 1.584077000617981, "loss/lm": 0.3360781669616699, "step": 3231 }, { "epoch": 0.6635187846438103, "grad_norm": 1.0195780303142163, "kd_ratio": 0.5, "learning_rate": 5.3749780892429416e-06, "loss": 1.0463981628417969, "loss/kd": 1.7672001123428345, "loss/lm": 0.3255960941314697, "step": 3232 }, { "epoch": 0.6637240812974748, "grad_norm": 1.054736144220107, "kd_ratio": 0.5, "learning_rate": 5.369082852824761e-06, "loss": 0.9510741233825684, "loss/kd": 1.5853776931762695, "loss/lm": 0.3167704939842224, "step": 3233 }, { "epoch": 0.6639293779511394, "grad_norm": 1.0520550429670754, "kd_ratio": 0.5, "learning_rate": 5.363189664486799e-06, "loss": 1.0301204919815063, "loss/kd": 1.6707143783569336, "loss/lm": 0.3895266056060791, "step": 3234 }, { "epoch": 0.6641346746048039, "grad_norm": 0.9365735962291597, "kd_ratio": 0.5, "learning_rate": 5.357298526835381e-06, "loss": 1.2189669609069824, "loss/kd": 2.1339430809020996, "loss/lm": 0.30399078130722046, "step": 3235 }, { "epoch": 0.6643399712584684, "grad_norm": 0.9722298097455464, "kd_ratio": 0.5, "learning_rate": 5.35140944247595e-06, "loss": 0.9062308073043823, "loss/kd": 1.500241756439209, "loss/lm": 0.31221991777420044, "step": 3236 }, { "epoch": 0.6645452679121331, "grad_norm": 0.9616279354109928, "kd_ratio": 0.5, "learning_rate": 5.345522414013016e-06, "loss": 1.1722992658615112, "loss/kd": 1.9730757474899292, "loss/lm": 0.37152284383773804, "step": 3237 }, { "epoch": 0.6647505645657976, "grad_norm": 0.9973400336788532, "kd_ratio": 0.5, "learning_rate": 5.339637444050197e-06, "loss": 1.1341255903244019, "loss/kd": 1.864091157913208, "loss/lm": 0.4041600525379181, "step": 3238 }, { "epoch": 0.6649558612194622, "grad_norm": 0.9702138859681457, "kd_ratio": 0.5, "learning_rate": 5.333754535190186e-06, "loss": 0.882305383682251, "loss/kd": 1.4522771835327148, "loss/lm": 0.3123335838317871, "step": 3239 }, { "epoch": 0.6651611578731267, "grad_norm": 1.151483284036558, "kd_ratio": 0.5, "learning_rate": 5.327873690034775e-06, "loss": 0.9091542363166809, "loss/kd": 1.5142905712127686, "loss/lm": 0.30401790142059326, "step": 3240 }, { "epoch": 0.6653664545267912, "grad_norm": 1.0128806128012362, "kd_ratio": 0.5, "learning_rate": 5.321994911184842e-06, "loss": 1.1031107902526855, "loss/kd": 1.8778154850006104, "loss/lm": 0.3284062147140503, "step": 3241 }, { "epoch": 0.6655717511804558, "grad_norm": 1.03033144348461, "kd_ratio": 0.5, "learning_rate": 5.316118201240348e-06, "loss": 1.0052300691604614, "loss/kd": 1.7075275182724, "loss/lm": 0.3029325306415558, "step": 3242 }, { "epoch": 0.6657770478341203, "grad_norm": 1.0189041873748095, "kd_ratio": 0.5, "learning_rate": 5.3102435628003435e-06, "loss": 0.9831084609031677, "loss/kd": 1.6623365879058838, "loss/lm": 0.30388033390045166, "step": 3243 }, { "epoch": 0.6659823444877848, "grad_norm": 1.0419686930896552, "kd_ratio": 0.5, "learning_rate": 5.304370998462953e-06, "loss": 1.0477335453033447, "loss/kd": 1.744408369064331, "loss/lm": 0.35105881094932556, "step": 3244 }, { "epoch": 0.6661876411414493, "grad_norm": 1.2242760554866967, "kd_ratio": 0.5, "learning_rate": 5.298500510825399e-06, "loss": 1.1322423219680786, "loss/kd": 1.940048336982727, "loss/lm": 0.3244362771511078, "step": 3245 }, { "epoch": 0.666392937795114, "grad_norm": 0.9620847231434622, "kd_ratio": 0.5, "learning_rate": 5.2926321024839655e-06, "loss": 1.1103267669677734, "loss/kd": 1.9277117252349854, "loss/lm": 0.2929419279098511, "step": 3246 }, { "epoch": 0.6665982344487785, "grad_norm": 0.98983954843553, "kd_ratio": 0.5, "learning_rate": 5.286765776034044e-06, "loss": 0.9526453614234924, "loss/kd": 1.6002497673034668, "loss/lm": 0.30504095554351807, "step": 3247 }, { "epoch": 0.6668035311024431, "grad_norm": 0.8671248895378842, "kd_ratio": 0.5, "learning_rate": 5.2809015340700754e-06, "loss": 1.0177252292633057, "loss/kd": 1.7069848775863647, "loss/lm": 0.3284655511379242, "step": 3248 }, { "epoch": 0.6670088277561076, "grad_norm": 0.8974880277593537, "kd_ratio": 0.5, "learning_rate": 5.2750393791856045e-06, "loss": 1.1734061241149902, "loss/kd": 1.9316508769989014, "loss/lm": 0.41516149044036865, "step": 3249 }, { "epoch": 0.6672141244097721, "grad_norm": 0.9528521702088585, "kd_ratio": 0.5, "learning_rate": 5.269179313973232e-06, "loss": 0.9398765563964844, "loss/kd": 1.4752895832061768, "loss/lm": 0.4044635593891144, "step": 3250 }, { "epoch": 0.6674194210634367, "grad_norm": 0.9185506935884566, "kd_ratio": 0.5, "learning_rate": 5.263321341024646e-06, "loss": 1.0319416522979736, "loss/kd": 1.680342674255371, "loss/lm": 0.38354066014289856, "step": 3251 }, { "epoch": 0.6676247177171012, "grad_norm": 0.92392327748703, "kd_ratio": 0.5, "learning_rate": 5.257465462930611e-06, "loss": 1.133186936378479, "loss/kd": 1.94575834274292, "loss/lm": 0.3206155598163605, "step": 3252 }, { "epoch": 0.6678300143707657, "grad_norm": 0.9047780672096307, "kd_ratio": 0.5, "learning_rate": 5.25161168228096e-06, "loss": 0.875493049621582, "loss/kd": 1.3877513408660889, "loss/lm": 0.3632347583770752, "step": 3253 }, { "epoch": 0.6680353110244303, "grad_norm": 0.9231278865027485, "kd_ratio": 0.5, "learning_rate": 5.245760001664601e-06, "loss": 0.9121657013893127, "loss/kd": 1.5185184478759766, "loss/lm": 0.3058129549026489, "step": 3254 }, { "epoch": 0.6682406076780949, "grad_norm": 0.8631384029141762, "kd_ratio": 0.5, "learning_rate": 5.239910423669509e-06, "loss": 0.8479198813438416, "loss/kd": 1.392735242843628, "loss/lm": 0.30310454964637756, "step": 3255 }, { "epoch": 0.6684459043317594, "grad_norm": 1.135179088203671, "kd_ratio": 0.5, "learning_rate": 5.234062950882732e-06, "loss": 0.8753632307052612, "loss/kd": 1.3681221008300781, "loss/lm": 0.3826044201850891, "step": 3256 }, { "epoch": 0.668651200985424, "grad_norm": 0.9306274006575112, "kd_ratio": 0.5, "learning_rate": 5.228217585890387e-06, "loss": 1.0977083444595337, "loss/kd": 1.749680995941162, "loss/lm": 0.44573578238487244, "step": 3257 }, { "epoch": 0.6688564976390885, "grad_norm": 1.0544464272751348, "kd_ratio": 0.5, "learning_rate": 5.222374331277665e-06, "loss": 1.3061938285827637, "loss/kd": 2.3143343925476074, "loss/lm": 0.29805323481559753, "step": 3258 }, { "epoch": 0.669061794292753, "grad_norm": 1.0922924858515444, "kd_ratio": 0.5, "learning_rate": 5.216533189628808e-06, "loss": 0.9298745393753052, "loss/kd": 1.5906707048416138, "loss/lm": 0.2690783143043518, "step": 3259 }, { "epoch": 0.6692670909464176, "grad_norm": 1.073697586225043, "kd_ratio": 0.5, "learning_rate": 5.210694163527138e-06, "loss": 1.1502916812896729, "loss/kd": 1.9490793943405151, "loss/lm": 0.3515038788318634, "step": 3260 }, { "epoch": 0.6694723876000821, "grad_norm": 1.018198570726521, "kd_ratio": 0.5, "learning_rate": 5.204857255555036e-06, "loss": 1.3708022832870483, "loss/kd": 2.4535958766937256, "loss/lm": 0.2880087196826935, "step": 3261 }, { "epoch": 0.6696776842537466, "grad_norm": 1.0055629857681874, "kd_ratio": 0.5, "learning_rate": 5.199022468293944e-06, "loss": 1.170241117477417, "loss/kd": 2.028502941131592, "loss/lm": 0.311979204416275, "step": 3262 }, { "epoch": 0.6698829809074112, "grad_norm": 1.324591953167947, "kd_ratio": 0.5, "learning_rate": 5.193189804324376e-06, "loss": 1.0970803499221802, "loss/kd": 1.7356014251708984, "loss/lm": 0.4585592448711395, "step": 3263 }, { "epoch": 0.6700882775610758, "grad_norm": 0.9636988580900563, "kd_ratio": 0.5, "learning_rate": 5.18735926622589e-06, "loss": 1.1162413358688354, "loss/kd": 1.9381747245788574, "loss/lm": 0.2943079471588135, "step": 3264 }, { "epoch": 0.6702935742147403, "grad_norm": 1.0633312266343182, "kd_ratio": 0.5, "learning_rate": 5.181530856577121e-06, "loss": 1.1622390747070312, "loss/kd": 1.9694485664367676, "loss/lm": 0.3550296127796173, "step": 3265 }, { "epoch": 0.6704988708684049, "grad_norm": 1.2738590926382163, "kd_ratio": 0.5, "learning_rate": 5.175704577955745e-06, "loss": 1.3123104572296143, "loss/kd": 2.179752826690674, "loss/lm": 0.44486817717552185, "step": 3266 }, { "epoch": 0.6707041675220694, "grad_norm": 1.0781314391744907, "kd_ratio": 0.5, "learning_rate": 5.169880432938519e-06, "loss": 0.9220666289329529, "loss/kd": 1.4750373363494873, "loss/lm": 0.36909595131874084, "step": 3267 }, { "epoch": 0.6709094641757339, "grad_norm": 1.3423510602602673, "kd_ratio": 0.5, "learning_rate": 5.1640584241012325e-06, "loss": 0.9621885418891907, "loss/kd": 1.5371421575546265, "loss/lm": 0.3872348964214325, "step": 3268 }, { "epoch": 0.6711147608293985, "grad_norm": 1.1383979829011563, "kd_ratio": 0.5, "learning_rate": 5.158238554018744e-06, "loss": 0.9131829142570496, "loss/kd": 1.5160962343215942, "loss/lm": 0.3102695941925049, "step": 3269 }, { "epoch": 0.671320057483063, "grad_norm": 1.0849055841097952, "kd_ratio": 0.5, "learning_rate": 5.152420825264968e-06, "loss": 1.2158572673797607, "loss/kd": 2.0667734146118164, "loss/lm": 0.36494123935699463, "step": 3270 }, { "epoch": 0.6715253541367275, "grad_norm": 1.3757127423595388, "kd_ratio": 0.5, "learning_rate": 5.146605240412859e-06, "loss": 1.0028631687164307, "loss/kd": 1.726810097694397, "loss/lm": 0.27891629934310913, "step": 3271 }, { "epoch": 0.6717306507903921, "grad_norm": 0.8764276573741407, "kd_ratio": 0.5, "learning_rate": 5.140791802034432e-06, "loss": 0.9302917122840881, "loss/kd": 1.486762285232544, "loss/lm": 0.3738211691379547, "step": 3272 }, { "epoch": 0.6719359474440567, "grad_norm": 1.339434354677518, "kd_ratio": 0.5, "learning_rate": 5.134980512700755e-06, "loss": 0.9032169580459595, "loss/kd": 1.4948090314865112, "loss/lm": 0.3116249144077301, "step": 3273 }, { "epoch": 0.6721412440977212, "grad_norm": 1.2401702072931857, "kd_ratio": 0.5, "learning_rate": 5.129171374981945e-06, "loss": 1.311875820159912, "loss/kd": 2.255513906478882, "loss/lm": 0.36823761463165283, "step": 3274 }, { "epoch": 0.6723465407513858, "grad_norm": 1.197059225678652, "kd_ratio": 0.5, "learning_rate": 5.123364391447156e-06, "loss": 1.0440233945846558, "loss/kd": 1.7656400203704834, "loss/lm": 0.3224067687988281, "step": 3275 }, { "epoch": 0.6725518374050503, "grad_norm": 1.330564071042137, "kd_ratio": 0.5, "learning_rate": 5.117559564664609e-06, "loss": 1.1337828636169434, "loss/kd": 1.89018714427948, "loss/lm": 0.3773787021636963, "step": 3276 }, { "epoch": 0.6727571340587148, "grad_norm": 0.9350227571467179, "kd_ratio": 0.5, "learning_rate": 5.111756897201546e-06, "loss": 0.8995060920715332, "loss/kd": 1.453909993171692, "loss/lm": 0.34510213136672974, "step": 3277 }, { "epoch": 0.6729624307123794, "grad_norm": 1.454470555825721, "kd_ratio": 0.5, "learning_rate": 5.105956391624285e-06, "loss": 1.1709798574447632, "loss/kd": 2.000850200653076, "loss/lm": 0.3411095142364502, "step": 3278 }, { "epoch": 0.6731677273660439, "grad_norm": 1.3011852400906705, "kd_ratio": 0.5, "learning_rate": 5.100158050498159e-06, "loss": 1.0639561414718628, "loss/kd": 1.7651045322418213, "loss/lm": 0.3628077507019043, "step": 3279 }, { "epoch": 0.6733730240197084, "grad_norm": 1.203061347807897, "kd_ratio": 0.5, "learning_rate": 5.094361876387557e-06, "loss": 0.9260357022285461, "loss/kd": 1.4679135084152222, "loss/lm": 0.38415786623954773, "step": 3280 }, { "epoch": 0.673578320673373, "grad_norm": 1.4484646821391975, "kd_ratio": 0.5, "learning_rate": 5.088567871855917e-06, "loss": 1.091670036315918, "loss/kd": 1.8409191370010376, "loss/lm": 0.3424208164215088, "step": 3281 }, { "epoch": 0.6737836173270376, "grad_norm": 0.928039551731727, "kd_ratio": 0.5, "learning_rate": 5.082776039465695e-06, "loss": 1.595166563987732, "loss/kd": 2.778568744659424, "loss/lm": 0.41176438331604004, "step": 3282 }, { "epoch": 0.6739889139807022, "grad_norm": 1.1540333179540423, "kd_ratio": 0.5, "learning_rate": 5.076986381778417e-06, "loss": 1.1663295030593872, "loss/kd": 1.9414478540420532, "loss/lm": 0.39121106266975403, "step": 3283 }, { "epoch": 0.6741942106343667, "grad_norm": 1.4352027894404795, "kd_ratio": 0.5, "learning_rate": 5.071198901354617e-06, "loss": 1.1198524236679077, "loss/kd": 1.864275574684143, "loss/lm": 0.37542930245399475, "step": 3284 }, { "epoch": 0.6743995072880312, "grad_norm": 0.9594454932668552, "kd_ratio": 0.5, "learning_rate": 5.065413600753888e-06, "loss": 0.9206074476242065, "loss/kd": 1.4818651676177979, "loss/lm": 0.35934966802597046, "step": 3285 }, { "epoch": 0.6746048039416958, "grad_norm": 1.4280266470691991, "kd_ratio": 0.5, "learning_rate": 5.0596304825348455e-06, "loss": 0.8166590332984924, "loss/kd": 1.310450553894043, "loss/lm": 0.3228675127029419, "step": 3286 }, { "epoch": 0.6748101005953603, "grad_norm": 2.409886029361374, "kd_ratio": 0.5, "learning_rate": 5.053849549255143e-06, "loss": 0.9068461656570435, "loss/kd": 1.5059159994125366, "loss/lm": 0.3077763617038727, "step": 3287 }, { "epoch": 0.6750153972490248, "grad_norm": 1.102515156844487, "kd_ratio": 0.5, "learning_rate": 5.048070803471474e-06, "loss": 1.1177098751068115, "loss/kd": 1.861793041229248, "loss/lm": 0.37362661957740784, "step": 3288 }, { "epoch": 0.6752206939026894, "grad_norm": 1.413263190973675, "kd_ratio": 0.5, "learning_rate": 5.0422942477395605e-06, "loss": 1.1686128377914429, "loss/kd": 2.0245866775512695, "loss/lm": 0.3126390874385834, "step": 3289 }, { "epoch": 0.6754259905563539, "grad_norm": 1.2075605528050506, "kd_ratio": 0.5, "learning_rate": 5.036519884614157e-06, "loss": 0.9794427752494812, "loss/kd": 1.6043753623962402, "loss/lm": 0.35451021790504456, "step": 3290 }, { "epoch": 0.6756312872100185, "grad_norm": 1.2863496326253798, "kd_ratio": 0.5, "learning_rate": 5.03074771664904e-06, "loss": 0.8376264572143555, "loss/kd": 1.324096918106079, "loss/lm": 0.3511560261249542, "step": 3291 }, { "epoch": 0.6758365838636831, "grad_norm": 1.778917707868459, "kd_ratio": 0.5, "learning_rate": 5.024977746397025e-06, "loss": 1.1386810541152954, "loss/kd": 1.8695405721664429, "loss/lm": 0.40782150626182556, "step": 3292 }, { "epoch": 0.6760418805173476, "grad_norm": 1.18057347427962, "kd_ratio": 0.5, "learning_rate": 5.019209976409954e-06, "loss": 1.291438102722168, "loss/kd": 2.23152232170105, "loss/lm": 0.3513539731502533, "step": 3293 }, { "epoch": 0.6762471771710121, "grad_norm": 1.3960107165221514, "kd_ratio": 0.5, "learning_rate": 5.013444409238697e-06, "loss": 1.236228108406067, "loss/kd": 2.069175958633423, "loss/lm": 0.40328022837638855, "step": 3294 }, { "epoch": 0.6764524738246767, "grad_norm": 1.003239707113612, "kd_ratio": 0.5, "learning_rate": 5.0076810474331395e-06, "loss": 1.0322073698043823, "loss/kd": 1.7133485078811646, "loss/lm": 0.3510661721229553, "step": 3295 }, { "epoch": 0.6766577704783412, "grad_norm": 1.079938359073247, "kd_ratio": 0.5, "learning_rate": 5.001919893542204e-06, "loss": 0.8846153020858765, "loss/kd": 1.422608494758606, "loss/lm": 0.34662213921546936, "step": 3296 }, { "epoch": 0.6768630671320057, "grad_norm": 1.2247326536613605, "kd_ratio": 0.5, "learning_rate": 4.996160950113837e-06, "loss": 0.9339575171470642, "loss/kd": 1.5636093616485596, "loss/lm": 0.30430567264556885, "step": 3297 }, { "epoch": 0.6770683637856703, "grad_norm": 0.9691953864811249, "kd_ratio": 0.5, "learning_rate": 4.990404219694988e-06, "loss": 2.0071568489074707, "loss/kd": 3.7357571125030518, "loss/lm": 0.27855637669563293, "step": 3298 }, { "epoch": 0.6772736604393348, "grad_norm": 1.1785144912774925, "kd_ratio": 0.5, "learning_rate": 4.984649704831658e-06, "loss": 1.1103729009628296, "loss/kd": 1.9041081666946411, "loss/lm": 0.3166375458240509, "step": 3299 }, { "epoch": 0.6774789570929994, "grad_norm": 0.9722374258222903, "kd_ratio": 0.5, "learning_rate": 4.9788974080688416e-06, "loss": 1.0749754905700684, "loss/kd": 1.7843176126480103, "loss/lm": 0.36563342809677124, "step": 3300 }, { "epoch": 0.677684253746664, "grad_norm": 1.2824984971158369, "kd_ratio": 0.5, "learning_rate": 4.97314733195057e-06, "loss": 1.0123381614685059, "loss/kd": 1.6627318859100342, "loss/lm": 0.3619445562362671, "step": 3301 }, { "epoch": 0.6778895504003285, "grad_norm": 1.0449093899393294, "kd_ratio": 0.5, "learning_rate": 4.967399479019877e-06, "loss": 0.9493141174316406, "loss/kd": 1.5494314432144165, "loss/lm": 0.34919679164886475, "step": 3302 }, { "epoch": 0.678094847053993, "grad_norm": 1.0467565485579926, "kd_ratio": 0.5, "learning_rate": 4.961653851818827e-06, "loss": 1.2279504537582397, "loss/kd": 2.1374189853668213, "loss/lm": 0.31848201155662537, "step": 3303 }, { "epoch": 0.6783001437076576, "grad_norm": 1.0028363250185321, "kd_ratio": 0.5, "learning_rate": 4.955910452888492e-06, "loss": 0.870050847530365, "loss/kd": 1.4551212787628174, "loss/lm": 0.2849803864955902, "step": 3304 }, { "epoch": 0.6785054403613221, "grad_norm": 1.1455466725049788, "kd_ratio": 0.5, "learning_rate": 4.950169284768968e-06, "loss": 0.9036670923233032, "loss/kd": 1.4388749599456787, "loss/lm": 0.3684592843055725, "step": 3305 }, { "epoch": 0.6787107370149866, "grad_norm": 0.9223188360476927, "kd_ratio": 0.5, "learning_rate": 4.9444303499993484e-06, "loss": 2.030270576477051, "loss/kd": 3.757498264312744, "loss/lm": 0.3030427098274231, "step": 3306 }, { "epoch": 0.6789160336686512, "grad_norm": 0.9226687983104214, "kd_ratio": 0.5, "learning_rate": 4.938693651117751e-06, "loss": 0.9638400673866272, "loss/kd": 1.6261125802993774, "loss/lm": 0.30156758427619934, "step": 3307 }, { "epoch": 0.6791213303223157, "grad_norm": 1.3608498978616188, "kd_ratio": 0.5, "learning_rate": 4.9329591906613e-06, "loss": 1.9897974729537964, "loss/kd": 3.6990537643432617, "loss/lm": 0.28054124116897583, "step": 3308 }, { "epoch": 0.6793266269759803, "grad_norm": 1.0138291916865236, "kd_ratio": 0.5, "learning_rate": 4.927226971166136e-06, "loss": 1.235595464706421, "loss/kd": 2.076756000518799, "loss/lm": 0.3944348692893982, "step": 3309 }, { "epoch": 0.6795319236296449, "grad_norm": 1.035213936903595, "kd_ratio": 0.5, "learning_rate": 4.921496995167404e-06, "loss": 1.0499416589736938, "loss/kd": 1.6716550588607788, "loss/lm": 0.4282282292842865, "step": 3310 }, { "epoch": 0.6797372202833094, "grad_norm": 1.2650895490109455, "kd_ratio": 0.5, "learning_rate": 4.9157692651992495e-06, "loss": 0.9685144424438477, "loss/kd": 1.664170742034912, "loss/lm": 0.2728581130504608, "step": 3311 }, { "epoch": 0.6799425169369739, "grad_norm": 1.0994210791839805, "kd_ratio": 0.5, "learning_rate": 4.91004378379484e-06, "loss": 1.0840198993682861, "loss/kd": 1.8001388311386108, "loss/lm": 0.3679008483886719, "step": 3312 }, { "epoch": 0.6801478135906385, "grad_norm": 1.186640130323757, "kd_ratio": 0.5, "learning_rate": 4.904320553486328e-06, "loss": 1.06923508644104, "loss/kd": 1.8291947841644287, "loss/lm": 0.30927544832229614, "step": 3313 }, { "epoch": 0.680353110244303, "grad_norm": 1.0519032459241353, "kd_ratio": 0.5, "learning_rate": 4.898599576804899e-06, "loss": 1.2463659048080444, "loss/kd": 2.061052083969116, "loss/lm": 0.4316796660423279, "step": 3314 }, { "epoch": 0.6805584068979675, "grad_norm": 0.9212768258596652, "kd_ratio": 0.5, "learning_rate": 4.892880856280713e-06, "loss": 0.9575737118721008, "loss/kd": 1.5951318740844727, "loss/lm": 0.3200155794620514, "step": 3315 }, { "epoch": 0.6807637035516321, "grad_norm": 1.2583443817040636, "kd_ratio": 0.5, "learning_rate": 4.887164394442948e-06, "loss": 0.8866745233535767, "loss/kd": 1.4303234815597534, "loss/lm": 0.3430256247520447, "step": 3316 }, { "epoch": 0.6809690002052966, "grad_norm": 1.0276188881770234, "kd_ratio": 0.5, "learning_rate": 4.881450193819785e-06, "loss": 0.9735850691795349, "loss/kd": 1.6312274932861328, "loss/lm": 0.3159426748752594, "step": 3317 }, { "epoch": 0.6811742968589612, "grad_norm": 0.9481535181985589, "kd_ratio": 0.5, "learning_rate": 4.875738256938393e-06, "loss": 0.9497963786125183, "loss/kd": 1.613720417022705, "loss/lm": 0.28587237000465393, "step": 3318 }, { "epoch": 0.6813795935126258, "grad_norm": 1.4112144502811284, "kd_ratio": 0.5, "learning_rate": 4.870028586324947e-06, "loss": 1.1029068231582642, "loss/kd": 1.9156256914138794, "loss/lm": 0.2901880145072937, "step": 3319 }, { "epoch": 0.6815848901662903, "grad_norm": 1.08456161149245, "kd_ratio": 0.5, "learning_rate": 4.864321184504622e-06, "loss": 0.8918163776397705, "loss/kd": 1.397818684577942, "loss/lm": 0.3858141005039215, "step": 3320 }, { "epoch": 0.6817901868199548, "grad_norm": 1.107787070838129, "kd_ratio": 0.5, "learning_rate": 4.85861605400159e-06, "loss": 1.0219297409057617, "loss/kd": 1.6874864101409912, "loss/lm": 0.3563731014728546, "step": 3321 }, { "epoch": 0.6819954834736194, "grad_norm": 1.289510909282049, "kd_ratio": 0.5, "learning_rate": 4.852913197339007e-06, "loss": 1.073337197303772, "loss/kd": 1.8002550601959229, "loss/lm": 0.34641942381858826, "step": 3322 }, { "epoch": 0.6822007801272839, "grad_norm": 1.2514852408767465, "kd_ratio": 0.5, "learning_rate": 4.847212617039037e-06, "loss": 1.2083302736282349, "loss/kd": 2.000128984451294, "loss/lm": 0.416531503200531, "step": 3323 }, { "epoch": 0.6824060767809484, "grad_norm": 1.0278180295500705, "kd_ratio": 0.5, "learning_rate": 4.84151431562283e-06, "loss": 0.9622462391853333, "loss/kd": 1.6195074319839478, "loss/lm": 0.30498501658439636, "step": 3324 }, { "epoch": 0.682611373434613, "grad_norm": 0.9741231513248833, "kd_ratio": 0.5, "learning_rate": 4.835818295610531e-06, "loss": 0.9537878632545471, "loss/kd": 1.5650321245193481, "loss/lm": 0.3425436317920685, "step": 3325 }, { "epoch": 0.6828166700882775, "grad_norm": 0.9921540583600299, "kd_ratio": 0.5, "learning_rate": 4.830124559521277e-06, "loss": 1.094774842262268, "loss/kd": 1.867164134979248, "loss/lm": 0.32238560914993286, "step": 3326 }, { "epoch": 0.6830219667419422, "grad_norm": 1.0018408031352846, "kd_ratio": 0.5, "learning_rate": 4.82443310987319e-06, "loss": 1.438235878944397, "loss/kd": 2.4740262031555176, "loss/lm": 0.40244555473327637, "step": 3327 }, { "epoch": 0.6832272633956067, "grad_norm": 0.9518211089158938, "kd_ratio": 0.5, "learning_rate": 4.818743949183388e-06, "loss": 0.8429446816444397, "loss/kd": 1.3999274969100952, "loss/lm": 0.2859618365764618, "step": 3328 }, { "epoch": 0.6834325600492712, "grad_norm": 0.9291283976398509, "kd_ratio": 0.5, "learning_rate": 4.813057079967961e-06, "loss": 1.1329317092895508, "loss/kd": 1.9631282091140747, "loss/lm": 0.3027351498603821, "step": 3329 }, { "epoch": 0.6836378567029358, "grad_norm": 1.0243958112623723, "kd_ratio": 0.5, "learning_rate": 4.807372504742013e-06, "loss": 0.8646199107170105, "loss/kd": 1.3733980655670166, "loss/lm": 0.3558417856693268, "step": 3330 }, { "epoch": 0.6838431533566003, "grad_norm": 1.0012158356954746, "kd_ratio": 0.5, "learning_rate": 4.801690226019606e-06, "loss": 1.110560655593872, "loss/kd": 1.861190915107727, "loss/lm": 0.3599303364753723, "step": 3331 }, { "epoch": 0.6840484500102648, "grad_norm": 0.9990716744606576, "kd_ratio": 0.5, "learning_rate": 4.796010246313806e-06, "loss": 1.1750141382217407, "loss/kd": 2.0077433586120605, "loss/lm": 0.3422848880290985, "step": 3332 }, { "epoch": 0.6842537466639294, "grad_norm": 1.0214029795809498, "kd_ratio": 0.5, "learning_rate": 4.790332568136645e-06, "loss": 0.9725781679153442, "loss/kd": 1.558007001876831, "loss/lm": 0.38714927434921265, "step": 3333 }, { "epoch": 0.6844590433175939, "grad_norm": 0.8889258506698725, "kd_ratio": 0.5, "learning_rate": 4.784657193999153e-06, "loss": 1.0120859146118164, "loss/kd": 1.6538081169128418, "loss/lm": 0.3703638017177582, "step": 3334 }, { "epoch": 0.6846643399712584, "grad_norm": 0.92611815847813, "kd_ratio": 0.5, "learning_rate": 4.77898412641133e-06, "loss": 1.1002200841903687, "loss/kd": 1.9064247608184814, "loss/lm": 0.29401537775993347, "step": 3335 }, { "epoch": 0.6848696366249231, "grad_norm": 0.9611163049255566, "kd_ratio": 0.5, "learning_rate": 4.773313367882164e-06, "loss": 0.9646521806716919, "loss/kd": 1.5871524810791016, "loss/lm": 0.342151939868927, "step": 3336 }, { "epoch": 0.6850749332785876, "grad_norm": 1.2105534054696314, "kd_ratio": 0.5, "learning_rate": 4.767644920919619e-06, "loss": 1.0532336235046387, "loss/kd": 1.763656497001648, "loss/lm": 0.342810720205307, "step": 3337 }, { "epoch": 0.6852802299322521, "grad_norm": 0.8873018686783979, "kd_ratio": 0.5, "learning_rate": 4.761978788030629e-06, "loss": 1.1384941339492798, "loss/kd": 1.9535354375839233, "loss/lm": 0.32345283031463623, "step": 3338 }, { "epoch": 0.6854855265859167, "grad_norm": 0.8549843648481895, "kd_ratio": 0.5, "learning_rate": 4.756314971721115e-06, "loss": 0.8633198142051697, "loss/kd": 1.4324815273284912, "loss/lm": 0.29415813088417053, "step": 3339 }, { "epoch": 0.6856908232395812, "grad_norm": 1.2231558727918532, "kd_ratio": 0.5, "learning_rate": 4.750653474495969e-06, "loss": 0.8250389099121094, "loss/kd": 1.3345731496810913, "loss/lm": 0.31550464034080505, "step": 3340 }, { "epoch": 0.6858961198932457, "grad_norm": 0.9723915581272801, "kd_ratio": 0.5, "learning_rate": 4.744994298859062e-06, "loss": 0.9814029932022095, "loss/kd": 1.6291126012802124, "loss/lm": 0.33369341492652893, "step": 3341 }, { "epoch": 0.6861014165469103, "grad_norm": 0.9717205847442497, "kd_ratio": 0.5, "learning_rate": 4.739337447313228e-06, "loss": 0.9265340566635132, "loss/kd": 1.5515986680984497, "loss/lm": 0.3014693856239319, "step": 3342 }, { "epoch": 0.6863067132005748, "grad_norm": 1.3999189392221258, "kd_ratio": 0.5, "learning_rate": 4.733682922360282e-06, "loss": 1.2194617986679077, "loss/kd": 2.0239479541778564, "loss/lm": 0.41497567296028137, "step": 3343 }, { "epoch": 0.6865120098542393, "grad_norm": 1.124008596268987, "kd_ratio": 0.5, "learning_rate": 4.728030726501011e-06, "loss": 1.0484062433242798, "loss/kd": 1.779525637626648, "loss/lm": 0.31728681921958923, "step": 3344 }, { "epoch": 0.686717306507904, "grad_norm": 1.096781406053321, "kd_ratio": 0.5, "learning_rate": 4.722380862235156e-06, "loss": 0.9647749662399292, "loss/kd": 1.6410517692565918, "loss/lm": 0.2884982228279114, "step": 3345 }, { "epoch": 0.6869226031615685, "grad_norm": 1.2427317789581431, "kd_ratio": 0.5, "learning_rate": 4.716733332061456e-06, "loss": 1.1840900182724, "loss/kd": 2.0286009311676025, "loss/lm": 0.33957913517951965, "step": 3346 }, { "epoch": 0.687127899815233, "grad_norm": 0.8786632362499142, "kd_ratio": 0.5, "learning_rate": 4.71108813847759e-06, "loss": 1.176687240600586, "loss/kd": 2.0548324584960938, "loss/lm": 0.29854193329811096, "step": 3347 }, { "epoch": 0.6873331964688976, "grad_norm": 1.2126614963027071, "kd_ratio": 0.5, "learning_rate": 4.70544528398022e-06, "loss": 1.1970971822738647, "loss/kd": 2.116122245788574, "loss/lm": 0.2780720889568329, "step": 3348 }, { "epoch": 0.6875384931225621, "grad_norm": 0.980408506269209, "kd_ratio": 0.5, "learning_rate": 4.699804771064961e-06, "loss": 0.8116880655288696, "loss/kd": 1.258449673652649, "loss/lm": 0.36492639780044556, "step": 3349 }, { "epoch": 0.6877437897762266, "grad_norm": 1.0477422879135658, "kd_ratio": 0.5, "learning_rate": 4.694166602226404e-06, "loss": 1.3081367015838623, "loss/kd": 2.25188946723938, "loss/lm": 0.3643839359283447, "step": 3350 }, { "epoch": 0.6879490864298912, "grad_norm": 1.1442508725685785, "kd_ratio": 0.5, "learning_rate": 4.688530779958099e-06, "loss": 1.0381356477737427, "loss/kd": 1.6954257488250732, "loss/lm": 0.3808456063270569, "step": 3351 }, { "epoch": 0.6881543830835557, "grad_norm": 0.8443740648624198, "kd_ratio": 0.5, "learning_rate": 4.6828973067525564e-06, "loss": 1.1338551044464111, "loss/kd": 1.9099583625793457, "loss/lm": 0.3577517569065094, "step": 3352 }, { "epoch": 0.6883596797372202, "grad_norm": 1.6557701321987537, "kd_ratio": 0.5, "learning_rate": 4.677266185101253e-06, "loss": 0.9423750042915344, "loss/kd": 1.5817508697509766, "loss/lm": 0.3029991090297699, "step": 3353 }, { "epoch": 0.6885649763908849, "grad_norm": 0.9568515115507785, "kd_ratio": 0.5, "learning_rate": 4.671637417494616e-06, "loss": 1.044832468032837, "loss/kd": 1.7523212432861328, "loss/lm": 0.3373437225818634, "step": 3354 }, { "epoch": 0.6887702730445494, "grad_norm": 1.2167679243767133, "kd_ratio": 0.5, "learning_rate": 4.666011006422041e-06, "loss": 1.0234136581420898, "loss/kd": 1.7165982723236084, "loss/lm": 0.33022892475128174, "step": 3355 }, { "epoch": 0.6889755696982139, "grad_norm": 1.2482860228491663, "kd_ratio": 0.5, "learning_rate": 4.6603869543718765e-06, "loss": 1.8806538581848145, "loss/kd": 3.419929265975952, "loss/lm": 0.3413783311843872, "step": 3356 }, { "epoch": 0.6891808663518785, "grad_norm": 1.0726555188079687, "kd_ratio": 0.5, "learning_rate": 4.654765263831431e-06, "loss": 1.0227361917495728, "loss/kd": 1.7165802717208862, "loss/lm": 0.3288920223712921, "step": 3357 }, { "epoch": 0.689386163005543, "grad_norm": 1.2157998061307658, "kd_ratio": 0.5, "learning_rate": 4.649145937286963e-06, "loss": 0.9962256550788879, "loss/kd": 1.6710398197174072, "loss/lm": 0.32141146063804626, "step": 3358 }, { "epoch": 0.6895914596592075, "grad_norm": 1.037617410786875, "kd_ratio": 0.5, "learning_rate": 4.643528977223689e-06, "loss": 0.9654778242111206, "loss/kd": 1.583728313446045, "loss/lm": 0.3472273647785187, "step": 3359 }, { "epoch": 0.6897967563128721, "grad_norm": 0.9287501974359947, "kd_ratio": 0.5, "learning_rate": 4.637914386125781e-06, "loss": 1.1174914836883545, "loss/kd": 1.8707749843597412, "loss/lm": 0.36420807242393494, "step": 3360 }, { "epoch": 0.6900020529665366, "grad_norm": 1.0518497429125786, "kd_ratio": 0.5, "learning_rate": 4.632302166476365e-06, "loss": 1.1158323287963867, "loss/kd": 1.9114875793457031, "loss/lm": 0.3201770782470703, "step": 3361 }, { "epoch": 0.6902073496202011, "grad_norm": 1.2143434397081514, "kd_ratio": 0.5, "learning_rate": 4.626692320757505e-06, "loss": 0.8482159376144409, "loss/kd": 1.331746220588684, "loss/lm": 0.36468562483787537, "step": 3362 }, { "epoch": 0.6904126462738658, "grad_norm": 1.205097214537245, "kd_ratio": 0.5, "learning_rate": 4.621084851450229e-06, "loss": 1.1074433326721191, "loss/kd": 1.8982946872711182, "loss/lm": 0.3165920674800873, "step": 3363 }, { "epoch": 0.6906179429275303, "grad_norm": 0.9739016083650609, "kd_ratio": 0.5, "learning_rate": 4.615479761034515e-06, "loss": 1.1447774171829224, "loss/kd": 1.9541256427764893, "loss/lm": 0.3354291617870331, "step": 3364 }, { "epoch": 0.6908232395811948, "grad_norm": 1.120653572720439, "kd_ratio": 0.5, "learning_rate": 4.60987705198927e-06, "loss": 1.2027463912963867, "loss/kd": 2.050137758255005, "loss/lm": 0.3553549349308014, "step": 3365 }, { "epoch": 0.6910285362348594, "grad_norm": 1.2058666551407942, "kd_ratio": 0.5, "learning_rate": 4.604276726792377e-06, "loss": 0.9992370009422302, "loss/kd": 1.627001404762268, "loss/lm": 0.37147256731987, "step": 3366 }, { "epoch": 0.6912338328885239, "grad_norm": 1.1419076111931297, "kd_ratio": 0.5, "learning_rate": 4.5986787879206375e-06, "loss": 1.0831903219223022, "loss/kd": 1.8489181995391846, "loss/lm": 0.3174625337123871, "step": 3367 }, { "epoch": 0.6914391295421884, "grad_norm": 1.1246612371085176, "kd_ratio": 0.5, "learning_rate": 4.593083237849816e-06, "loss": 1.1237155199050903, "loss/kd": 1.7719554901123047, "loss/lm": 0.4754754602909088, "step": 3368 }, { "epoch": 0.691644426195853, "grad_norm": 1.0691967390060164, "kd_ratio": 0.5, "learning_rate": 4.587490079054607e-06, "loss": 1.2660634517669678, "loss/kd": 2.238367795944214, "loss/lm": 0.2937591075897217, "step": 3369 }, { "epoch": 0.6918497228495175, "grad_norm": 1.0221498009875731, "kd_ratio": 0.5, "learning_rate": 4.581899314008657e-06, "loss": 1.2452257871627808, "loss/kd": 2.1114485263824463, "loss/lm": 0.3790031373500824, "step": 3370 }, { "epoch": 0.692055019503182, "grad_norm": 1.0022392071433697, "kd_ratio": 0.5, "learning_rate": 4.5763109451845515e-06, "loss": 0.9941228032112122, "loss/kd": 1.6790558099746704, "loss/lm": 0.3091897964477539, "step": 3371 }, { "epoch": 0.6922603161568467, "grad_norm": 1.0946905374295468, "kd_ratio": 0.5, "learning_rate": 4.570724975053814e-06, "loss": 1.1877264976501465, "loss/kd": 2.024357795715332, "loss/lm": 0.3510951101779938, "step": 3372 }, { "epoch": 0.6924656128105112, "grad_norm": 1.347494280402027, "kd_ratio": 0.5, "learning_rate": 4.565141406086913e-06, "loss": 1.0788545608520508, "loss/kd": 1.8117997646331787, "loss/lm": 0.3459094762802124, "step": 3373 }, { "epoch": 0.6926709094641758, "grad_norm": 0.99077350251656, "kd_ratio": 0.5, "learning_rate": 4.559560240753245e-06, "loss": 1.0346320867538452, "loss/kd": 1.7427327632904053, "loss/lm": 0.32653146982192993, "step": 3374 }, { "epoch": 0.6928762061178403, "grad_norm": 0.9390431466573086, "kd_ratio": 0.5, "learning_rate": 4.553981481521156e-06, "loss": 1.0722862482070923, "loss/kd": 1.7529630661010742, "loss/lm": 0.3916095197200775, "step": 3375 }, { "epoch": 0.6930815027715048, "grad_norm": 0.9282999132807159, "kd_ratio": 0.5, "learning_rate": 4.548405130857907e-06, "loss": 1.0404609441757202, "loss/kd": 1.770289421081543, "loss/lm": 0.3106323778629303, "step": 3376 }, { "epoch": 0.6932867994251694, "grad_norm": 0.9163817663250164, "kd_ratio": 0.5, "learning_rate": 4.5428311912297265e-06, "loss": 0.9887503385543823, "loss/kd": 1.6225236654281616, "loss/lm": 0.354977011680603, "step": 3377 }, { "epoch": 0.6934920960788339, "grad_norm": 1.1039198548600677, "kd_ratio": 0.5, "learning_rate": 4.537259665101745e-06, "loss": 1.0264286994934082, "loss/kd": 1.7391088008880615, "loss/lm": 0.3137485086917877, "step": 3378 }, { "epoch": 0.6936973927324984, "grad_norm": 0.9373582265652356, "kd_ratio": 0.5, "learning_rate": 4.531690554938043e-06, "loss": 0.9990160465240479, "loss/kd": 1.6675677299499512, "loss/lm": 0.33046433329582214, "step": 3379 }, { "epoch": 0.693902689386163, "grad_norm": 0.9473237289732513, "kd_ratio": 0.5, "learning_rate": 4.52612386320163e-06, "loss": 1.2006078958511353, "loss/kd": 2.055432081222534, "loss/lm": 0.34578371047973633, "step": 3380 }, { "epoch": 0.6941079860398276, "grad_norm": 1.005684459137226, "kd_ratio": 0.5, "learning_rate": 4.520559592354436e-06, "loss": 1.011519432067871, "loss/kd": 1.7389216423034668, "loss/lm": 0.28411710262298584, "step": 3381 }, { "epoch": 0.6943132826934921, "grad_norm": 1.077840524504025, "kd_ratio": 0.5, "learning_rate": 4.514997744857342e-06, "loss": 1.0832734107971191, "loss/kd": 1.8350163698196411, "loss/lm": 0.33153054118156433, "step": 3382 }, { "epoch": 0.6945185793471567, "grad_norm": 1.4646997271490412, "kd_ratio": 0.5, "learning_rate": 4.509438323170131e-06, "loss": 0.8995620608329773, "loss/kd": 1.4704270362854004, "loss/lm": 0.3286970853805542, "step": 3383 }, { "epoch": 0.6947238760008212, "grad_norm": 0.9526080660013652, "kd_ratio": 0.5, "learning_rate": 4.503881329751535e-06, "loss": 0.9053094387054443, "loss/kd": 1.4295748472213745, "loss/lm": 0.38104405999183655, "step": 3384 }, { "epoch": 0.6949291726544857, "grad_norm": 1.1361666523759175, "kd_ratio": 0.5, "learning_rate": 4.498326767059196e-06, "loss": 1.168617606163025, "loss/kd": 1.9666322469711304, "loss/lm": 0.37060290575027466, "step": 3385 }, { "epoch": 0.6951344693081503, "grad_norm": 1.456760540137674, "kd_ratio": 0.5, "learning_rate": 4.4927746375496894e-06, "loss": 1.160016417503357, "loss/kd": 1.9500359296798706, "loss/lm": 0.3699968159198761, "step": 3386 }, { "epoch": 0.6953397659618148, "grad_norm": 1.4546861562591227, "kd_ratio": 0.5, "learning_rate": 4.487224943678513e-06, "loss": 1.004359483718872, "loss/kd": 1.6909270286560059, "loss/lm": 0.31779202818870544, "step": 3387 }, { "epoch": 0.6955450626154793, "grad_norm": 1.3742782212598783, "kd_ratio": 0.5, "learning_rate": 4.4816776879000935e-06, "loss": 1.0788681507110596, "loss/kd": 1.837598443031311, "loss/lm": 0.3201378583908081, "step": 3388 }, { "epoch": 0.6957503592691439, "grad_norm": 0.9741872521762254, "kd_ratio": 0.5, "learning_rate": 4.476132872667764e-06, "loss": 1.002856969833374, "loss/kd": 1.663605809211731, "loss/lm": 0.3421080410480499, "step": 3389 }, { "epoch": 0.6959556559228085, "grad_norm": 1.3240827719715136, "kd_ratio": 0.5, "learning_rate": 4.4705905004337925e-06, "loss": 1.1095633506774902, "loss/kd": 1.8617850542068481, "loss/lm": 0.3573417365550995, "step": 3390 }, { "epoch": 0.696160952576473, "grad_norm": 1.2352946592955287, "kd_ratio": 0.5, "learning_rate": 4.465050573649359e-06, "loss": 0.9789988994598389, "loss/kd": 1.6390715837478638, "loss/lm": 0.3189261853694916, "step": 3391 }, { "epoch": 0.6963662492301376, "grad_norm": 1.085895079216854, "kd_ratio": 0.5, "learning_rate": 4.459513094764566e-06, "loss": 1.1640958786010742, "loss/kd": 1.9789239168167114, "loss/lm": 0.349267840385437, "step": 3392 }, { "epoch": 0.6965715458838021, "grad_norm": 1.1749808047060526, "kd_ratio": 0.5, "learning_rate": 4.453978066228436e-06, "loss": 1.2675107717514038, "loss/kd": 2.1834936141967773, "loss/lm": 0.35152801871299744, "step": 3393 }, { "epoch": 0.6967768425374666, "grad_norm": 0.8779545402972577, "kd_ratio": 0.5, "learning_rate": 4.448445490488895e-06, "loss": 1.3502552509307861, "loss/kd": 2.307480573654175, "loss/lm": 0.3930300176143646, "step": 3394 }, { "epoch": 0.6969821391911312, "grad_norm": 1.0610827544779324, "kd_ratio": 0.5, "learning_rate": 4.442915369992802e-06, "loss": 1.0600882768630981, "loss/kd": 1.7976644039154053, "loss/lm": 0.32251206040382385, "step": 3395 }, { "epoch": 0.6971874358447957, "grad_norm": 1.0319821900580908, "kd_ratio": 0.5, "learning_rate": 4.437387707185911e-06, "loss": 1.0454422235488892, "loss/kd": 1.7734496593475342, "loss/lm": 0.317434698343277, "step": 3396 }, { "epoch": 0.6973927324984602, "grad_norm": 1.1195389124018307, "kd_ratio": 0.5, "learning_rate": 4.431862504512905e-06, "loss": 0.9360262155532837, "loss/kd": 1.5628931522369385, "loss/lm": 0.3091592788696289, "step": 3397 }, { "epoch": 0.6975980291521248, "grad_norm": 0.9628874946761385, "kd_ratio": 0.5, "learning_rate": 4.426339764417372e-06, "loss": 0.9982554316520691, "loss/kd": 1.6918096542358398, "loss/lm": 0.30470117926597595, "step": 3398 }, { "epoch": 0.6978033258057894, "grad_norm": 0.8592677252207246, "kd_ratio": 0.5, "learning_rate": 4.4208194893418125e-06, "loss": 1.193709135055542, "loss/kd": 2.0440328121185303, "loss/lm": 0.3433855473995209, "step": 3399 }, { "epoch": 0.6980086224594539, "grad_norm": 1.013662684683141, "kd_ratio": 0.5, "learning_rate": 4.415301681727638e-06, "loss": 1.1151397228240967, "loss/kd": 1.913087010383606, "loss/lm": 0.3171924650669098, "step": 3400 }, { "epoch": 0.6982139191131185, "grad_norm": 1.109172176234114, "kd_ratio": 0.5, "learning_rate": 4.409786344015164e-06, "loss": 0.948620080947876, "loss/kd": 1.5321998596191406, "loss/lm": 0.3650403320789337, "step": 3401 }, { "epoch": 0.698419215766783, "grad_norm": 1.0797297831918233, "kd_ratio": 0.5, "learning_rate": 4.404273478643615e-06, "loss": 0.9178197383880615, "loss/kd": 1.472845196723938, "loss/lm": 0.36279433965682983, "step": 3402 }, { "epoch": 0.6986245124204475, "grad_norm": 1.02891850463215, "kd_ratio": 0.5, "learning_rate": 4.398763088051127e-06, "loss": 1.0743244886398315, "loss/kd": 1.7931246757507324, "loss/lm": 0.3555243909358978, "step": 3403 }, { "epoch": 0.6988298090741121, "grad_norm": 1.2514358283629559, "kd_ratio": 0.5, "learning_rate": 4.393255174674741e-06, "loss": 1.3907642364501953, "loss/kd": 2.457477331161499, "loss/lm": 0.32405105233192444, "step": 3404 }, { "epoch": 0.6990351057277766, "grad_norm": 1.0722766736269496, "kd_ratio": 0.5, "learning_rate": 4.387749740950392e-06, "loss": 1.2075936794281006, "loss/kd": 2.033496379852295, "loss/lm": 0.3816908597946167, "step": 3405 }, { "epoch": 0.6992404023814411, "grad_norm": 1.079240775190394, "kd_ratio": 0.5, "learning_rate": 4.382246789312931e-06, "loss": 2.024954080581665, "loss/kd": 3.7358405590057373, "loss/lm": 0.31406763195991516, "step": 3406 }, { "epoch": 0.6994456990351057, "grad_norm": 1.0741009289089412, "kd_ratio": 0.5, "learning_rate": 4.3767463221961034e-06, "loss": 1.1699261665344238, "loss/kd": 1.8933396339416504, "loss/lm": 0.4465126395225525, "step": 3407 }, { "epoch": 0.6996509956887703, "grad_norm": 1.2968687688429084, "kd_ratio": 0.5, "learning_rate": 4.371248342032562e-06, "loss": 0.8578058481216431, "loss/kd": 1.3834826946258545, "loss/lm": 0.33212897181510925, "step": 3408 }, { "epoch": 0.6998562923424348, "grad_norm": 1.2902079980881294, "kd_ratio": 0.5, "learning_rate": 4.365752851253856e-06, "loss": 1.1504091024398804, "loss/kd": 1.9923146963119507, "loss/lm": 0.3085035979747772, "step": 3409 }, { "epoch": 0.7000615889960994, "grad_norm": 1.1427223265249589, "kd_ratio": 0.5, "learning_rate": 4.360259852290431e-06, "loss": 0.9770845174789429, "loss/kd": 1.6293872594833374, "loss/lm": 0.3247818350791931, "step": 3410 }, { "epoch": 0.7002668856497639, "grad_norm": 1.2497344863544424, "kd_ratio": 0.5, "learning_rate": 4.354769347571638e-06, "loss": 1.7750146389007568, "loss/kd": 3.3296279907226562, "loss/lm": 0.220401331782341, "step": 3411 }, { "epoch": 0.7004721823034284, "grad_norm": 1.3376725469148023, "kd_ratio": 0.5, "learning_rate": 4.34928133952571e-06, "loss": 0.9259635210037231, "loss/kd": 1.4931321144104004, "loss/lm": 0.3587948679924011, "step": 3412 }, { "epoch": 0.700677478957093, "grad_norm": 1.4801602011648924, "kd_ratio": 0.5, "learning_rate": 4.343795830579803e-06, "loss": 1.1105844974517822, "loss/kd": 1.9363964796066284, "loss/lm": 0.28477251529693604, "step": 3413 }, { "epoch": 0.7008827756107575, "grad_norm": 1.2294322512548177, "kd_ratio": 0.5, "learning_rate": 4.338312823159936e-06, "loss": 1.0900285243988037, "loss/kd": 1.8617573976516724, "loss/lm": 0.31829965114593506, "step": 3414 }, { "epoch": 0.701088072264422, "grad_norm": 1.5626765815636061, "kd_ratio": 0.5, "learning_rate": 4.332832319691044e-06, "loss": 0.8432543277740479, "loss/kd": 1.3387171030044556, "loss/lm": 0.34779155254364014, "step": 3415 }, { "epoch": 0.7012933689180867, "grad_norm": 1.0275105499486452, "kd_ratio": 0.5, "learning_rate": 4.3273543225969485e-06, "loss": 1.048919916152954, "loss/kd": 1.7696044445037842, "loss/lm": 0.3282354772090912, "step": 3416 }, { "epoch": 0.7014986655717512, "grad_norm": 1.5863319649265006, "kd_ratio": 0.5, "learning_rate": 4.321878834300355e-06, "loss": 1.16678786277771, "loss/kd": 1.9887491464614868, "loss/lm": 0.3448266386985779, "step": 3417 }, { "epoch": 0.7017039622254158, "grad_norm": 1.1082231029770415, "kd_ratio": 0.5, "learning_rate": 4.316405857222868e-06, "loss": 1.0377970933914185, "loss/kd": 1.7136528491973877, "loss/lm": 0.36194130778312683, "step": 3418 }, { "epoch": 0.7019092588790803, "grad_norm": 1.107693878344019, "kd_ratio": 0.5, "learning_rate": 4.3109353937849815e-06, "loss": 1.1594903469085693, "loss/kd": 1.9868208169937134, "loss/lm": 0.33215975761413574, "step": 3419 }, { "epoch": 0.7021145555327448, "grad_norm": 1.246126638804458, "kd_ratio": 0.5, "learning_rate": 4.305467446406077e-06, "loss": 1.3131191730499268, "loss/kd": 2.224369525909424, "loss/lm": 0.4018688499927521, "step": 3420 }, { "epoch": 0.7023198521864094, "grad_norm": 1.4447638971122863, "kd_ratio": 0.5, "learning_rate": 4.300002017504417e-06, "loss": 1.2162426710128784, "loss/kd": 2.009152889251709, "loss/lm": 0.4233323633670807, "step": 3421 }, { "epoch": 0.7025251488400739, "grad_norm": 1.4748542360162578, "kd_ratio": 0.5, "learning_rate": 4.294539109497159e-06, "loss": 1.0627131462097168, "loss/kd": 1.7415201663970947, "loss/lm": 0.38390615582466125, "step": 3422 }, { "epoch": 0.7027304454937384, "grad_norm": 1.127637462713945, "kd_ratio": 0.5, "learning_rate": 4.289078724800331e-06, "loss": 1.119686484336853, "loss/kd": 1.9294841289520264, "loss/lm": 0.30988889932632446, "step": 3423 }, { "epoch": 0.702935742147403, "grad_norm": 0.9967451322160586, "kd_ratio": 0.5, "learning_rate": 4.283620865828874e-06, "loss": 1.0777015686035156, "loss/kd": 1.7558871507644653, "loss/lm": 0.3995159864425659, "step": 3424 }, { "epoch": 0.7031410388010676, "grad_norm": 1.0224728864818535, "kd_ratio": 0.5, "learning_rate": 4.278165534996577e-06, "loss": 0.9667403697967529, "loss/kd": 1.6368741989135742, "loss/lm": 0.29660651087760925, "step": 3425 }, { "epoch": 0.7033463354547321, "grad_norm": 1.4280520525094884, "kd_ratio": 0.5, "learning_rate": 4.272712734716134e-06, "loss": 1.0523725748062134, "loss/kd": 1.7699964046478271, "loss/lm": 0.3347487151622772, "step": 3426 }, { "epoch": 0.7035516321083967, "grad_norm": 0.9812384439441388, "kd_ratio": 0.5, "learning_rate": 4.267262467399114e-06, "loss": 0.9383787512779236, "loss/kd": 1.5735657215118408, "loss/lm": 0.30319175124168396, "step": 3427 }, { "epoch": 0.7037569287620612, "grad_norm": 0.894000017411495, "kd_ratio": 0.5, "learning_rate": 4.261814735455959e-06, "loss": 0.9992566704750061, "loss/kd": 1.6833199262619019, "loss/lm": 0.31519338488578796, "step": 3428 }, { "epoch": 0.7039622254157257, "grad_norm": 0.9618330728286966, "kd_ratio": 0.5, "learning_rate": 4.2563695412960075e-06, "loss": 1.1265828609466553, "loss/kd": 1.9467618465423584, "loss/lm": 0.306403785943985, "step": 3429 }, { "epoch": 0.7041675220693903, "grad_norm": 1.068784321962688, "kd_ratio": 0.5, "learning_rate": 4.250926887327451e-06, "loss": 0.9095795154571533, "loss/kd": 1.5148389339447021, "loss/lm": 0.3043201267719269, "step": 3430 }, { "epoch": 0.7043728187230548, "grad_norm": 1.177730762539792, "kd_ratio": 0.5, "learning_rate": 4.24548677595738e-06, "loss": 0.9768959879875183, "loss/kd": 1.6419872045516968, "loss/lm": 0.31180474162101746, "step": 3431 }, { "epoch": 0.7045781153767193, "grad_norm": 1.0319722318688993, "kd_ratio": 0.5, "learning_rate": 4.2400492095917425e-06, "loss": 1.7114739418029785, "loss/kd": 3.0176165103912354, "loss/lm": 0.40533146262168884, "step": 3432 }, { "epoch": 0.7047834120303839, "grad_norm": 1.1572692926851897, "kd_ratio": 0.5, "learning_rate": 4.234614190635375e-06, "loss": 0.8571578860282898, "loss/kd": 1.3396856784820557, "loss/lm": 0.3746300935745239, "step": 3433 }, { "epoch": 0.7049887086840485, "grad_norm": 0.931984444546425, "kd_ratio": 0.5, "learning_rate": 4.229181721491977e-06, "loss": 1.1346805095672607, "loss/kd": 1.942334532737732, "loss/lm": 0.3270265758037567, "step": 3434 }, { "epoch": 0.705194005337713, "grad_norm": 1.1748558513994731, "kd_ratio": 0.5, "learning_rate": 4.22375180456413e-06, "loss": 1.000878095626831, "loss/kd": 1.6830137968063354, "loss/lm": 0.3187423348426819, "step": 3435 }, { "epoch": 0.7053993019913776, "grad_norm": 0.9388873005354244, "kd_ratio": 0.5, "learning_rate": 4.2183244422532835e-06, "loss": 1.1306246519088745, "loss/kd": 1.9596859216690063, "loss/lm": 0.3015632927417755, "step": 3436 }, { "epoch": 0.7056045986450421, "grad_norm": 1.012923587634699, "kd_ratio": 0.5, "learning_rate": 4.2128996369597475e-06, "loss": 0.8796584606170654, "loss/kd": 1.4677282571792603, "loss/lm": 0.2915886342525482, "step": 3437 }, { "epoch": 0.7058098952987066, "grad_norm": 0.9684657323628121, "kd_ratio": 0.5, "learning_rate": 4.2074773910827125e-06, "loss": 1.2211194038391113, "loss/kd": 2.0596425533294678, "loss/lm": 0.38259637355804443, "step": 3438 }, { "epoch": 0.7060151919523712, "grad_norm": 1.033322062451025, "kd_ratio": 0.5, "learning_rate": 4.202057707020235e-06, "loss": 1.104427456855774, "loss/kd": 1.8329195976257324, "loss/lm": 0.37593525648117065, "step": 3439 }, { "epoch": 0.7062204886060357, "grad_norm": 0.9720850933020367, "kd_ratio": 0.5, "learning_rate": 4.1966405871692394e-06, "loss": 1.1258875131607056, "loss/kd": 1.8741588592529297, "loss/lm": 0.37761616706848145, "step": 3440 }, { "epoch": 0.7064257852597002, "grad_norm": 1.2838199185068895, "kd_ratio": 0.5, "learning_rate": 4.1912260339255085e-06, "loss": 1.0077136754989624, "loss/kd": 1.6535027027130127, "loss/lm": 0.36192455887794495, "step": 3441 }, { "epoch": 0.7066310819133648, "grad_norm": 0.8846884655670707, "kd_ratio": 0.5, "learning_rate": 4.1858140496836965e-06, "loss": 0.9295529723167419, "loss/kd": 1.5192101001739502, "loss/lm": 0.3398958742618561, "step": 3442 }, { "epoch": 0.7068363785670294, "grad_norm": 1.2218808936877672, "kd_ratio": 0.5, "learning_rate": 4.180404636837321e-06, "loss": 1.2094447612762451, "loss/kd": 2.065744400024414, "loss/lm": 0.3531450629234314, "step": 3443 }, { "epoch": 0.7070416752206939, "grad_norm": 1.0515731876436283, "kd_ratio": 0.5, "learning_rate": 4.1749977977787615e-06, "loss": 1.1298713684082031, "loss/kd": 1.9297412633895874, "loss/lm": 0.33000150322914124, "step": 3444 }, { "epoch": 0.7072469718743585, "grad_norm": 0.9127286782669551, "kd_ratio": 0.5, "learning_rate": 4.169593534899262e-06, "loss": 0.9477939605712891, "loss/kd": 1.5658149719238281, "loss/lm": 0.3297730088233948, "step": 3445 }, { "epoch": 0.707452268528023, "grad_norm": 1.0796751638399258, "kd_ratio": 0.5, "learning_rate": 4.16419185058892e-06, "loss": 1.9291151762008667, "loss/kd": 3.5765981674194336, "loss/lm": 0.2816321849822998, "step": 3446 }, { "epoch": 0.7076575651816875, "grad_norm": 0.9502428034377504, "kd_ratio": 0.5, "learning_rate": 4.158792747236702e-06, "loss": 0.9380060434341431, "loss/kd": 1.5977572202682495, "loss/lm": 0.2782548666000366, "step": 3447 }, { "epoch": 0.7078628618353521, "grad_norm": 0.8965936072522565, "kd_ratio": 0.5, "learning_rate": 4.153396227230421e-06, "loss": 1.5460050106048584, "loss/kd": 2.7622902393341064, "loss/lm": 0.3297198712825775, "step": 3448 }, { "epoch": 0.7080681584890166, "grad_norm": 0.9513295216878952, "kd_ratio": 0.5, "learning_rate": 4.148002292956759e-06, "loss": 0.9798411130905151, "loss/kd": 1.6036535501480103, "loss/lm": 0.35602864623069763, "step": 3449 }, { "epoch": 0.7082734551426811, "grad_norm": 1.0384066530951515, "kd_ratio": 0.5, "learning_rate": 4.142610946801248e-06, "loss": 1.197934865951538, "loss/kd": 2.1231000423431396, "loss/lm": 0.2727696895599365, "step": 3450 }, { "epoch": 0.7084787517963457, "grad_norm": 0.9556037019236184, "kd_ratio": 0.5, "learning_rate": 4.137222191148282e-06, "loss": 0.9073339104652405, "loss/kd": 1.5206329822540283, "loss/lm": 0.294034868478775, "step": 3451 }, { "epoch": 0.7086840484500103, "grad_norm": 1.2200838160624152, "kd_ratio": 0.5, "learning_rate": 4.1318360283811e-06, "loss": 0.9363788366317749, "loss/kd": 1.5386940240859985, "loss/lm": 0.3340636193752289, "step": 3452 }, { "epoch": 0.7088893451036749, "grad_norm": 0.9165327516084475, "kd_ratio": 0.5, "learning_rate": 4.126452460881798e-06, "loss": 1.0217480659484863, "loss/kd": 1.6926496028900146, "loss/lm": 0.35084643959999084, "step": 3453 }, { "epoch": 0.7090946417573394, "grad_norm": 0.9894759803586715, "kd_ratio": 0.5, "learning_rate": 4.121071491031329e-06, "loss": 1.9844744205474854, "loss/kd": 3.689302682876587, "loss/lm": 0.27964603900909424, "step": 3454 }, { "epoch": 0.7092999384110039, "grad_norm": 0.9274718466987166, "kd_ratio": 0.5, "learning_rate": 4.11569312120949e-06, "loss": 1.1693086624145508, "loss/kd": 1.970474123954773, "loss/lm": 0.36814308166503906, "step": 3455 }, { "epoch": 0.7095052350646684, "grad_norm": 1.1394051296898882, "kd_ratio": 0.5, "learning_rate": 4.110317353794937e-06, "loss": 0.9987761974334717, "loss/kd": 1.6359070539474487, "loss/lm": 0.361645370721817, "step": 3456 }, { "epoch": 0.709710531718333, "grad_norm": 1.1414580705577784, "kd_ratio": 0.5, "learning_rate": 4.104944191165162e-06, "loss": 1.0475571155548096, "loss/kd": 1.7138978242874146, "loss/lm": 0.38121628761291504, "step": 3457 }, { "epoch": 0.7099158283719975, "grad_norm": 1.271736763538756, "kd_ratio": 0.5, "learning_rate": 4.09957363569652e-06, "loss": 1.1811822652816772, "loss/kd": 2.0011909008026123, "loss/lm": 0.361173540353775, "step": 3458 }, { "epoch": 0.710121125025662, "grad_norm": 1.6493691960767585, "kd_ratio": 0.5, "learning_rate": 4.0942056897641934e-06, "loss": 1.2240917682647705, "loss/kd": 2.065781354904175, "loss/lm": 0.38240206241607666, "step": 3459 }, { "epoch": 0.7103264216793266, "grad_norm": 1.358605148850613, "kd_ratio": 0.5, "learning_rate": 4.088840355742238e-06, "loss": 1.1258412599563599, "loss/kd": 1.9116523265838623, "loss/lm": 0.3400302231311798, "step": 3460 }, { "epoch": 0.7105317183329912, "grad_norm": 0.9637630928892684, "kd_ratio": 0.5, "learning_rate": 4.083477636003527e-06, "loss": 0.9919087290763855, "loss/kd": 1.620322585105896, "loss/lm": 0.3634949028491974, "step": 3461 }, { "epoch": 0.7107370149866558, "grad_norm": 1.1669107821888345, "kd_ratio": 0.5, "learning_rate": 4.078117532919794e-06, "loss": 2.003225326538086, "loss/kd": 3.768786907196045, "loss/lm": 0.23766380548477173, "step": 3462 }, { "epoch": 0.7109423116403203, "grad_norm": 1.3011740170243689, "kd_ratio": 0.5, "learning_rate": 4.072760048861614e-06, "loss": 1.1578280925750732, "loss/kd": 1.9849114418029785, "loss/lm": 0.3307446241378784, "step": 3463 }, { "epoch": 0.7111476082939848, "grad_norm": 1.3431679642123542, "kd_ratio": 0.5, "learning_rate": 4.067405186198389e-06, "loss": 0.9135359525680542, "loss/kd": 1.4748315811157227, "loss/lm": 0.35224029421806335, "step": 3464 }, { "epoch": 0.7113529049476494, "grad_norm": 1.596738962126751, "kd_ratio": 0.5, "learning_rate": 4.062052947298387e-06, "loss": 1.0067754983901978, "loss/kd": 1.6208796501159668, "loss/lm": 0.39267125725746155, "step": 3465 }, { "epoch": 0.7115582016013139, "grad_norm": 1.3284722789608772, "kd_ratio": 0.5, "learning_rate": 4.0567033345286935e-06, "loss": 1.020986557006836, "loss/kd": 1.7126713991165161, "loss/lm": 0.32930171489715576, "step": 3466 }, { "epoch": 0.7117634982549784, "grad_norm": 1.1904109507609104, "kd_ratio": 0.5, "learning_rate": 4.051356350255246e-06, "loss": 1.1333630084991455, "loss/kd": 1.9334944486618042, "loss/lm": 0.3332315981388092, "step": 3467 }, { "epoch": 0.711968794908643, "grad_norm": 1.3369232903932173, "kd_ratio": 0.5, "learning_rate": 4.046011996842806e-06, "loss": 1.0229547023773193, "loss/kd": 1.5977121591567993, "loss/lm": 0.44819730520248413, "step": 3468 }, { "epoch": 0.7121740915623075, "grad_norm": 1.1462718424415186, "kd_ratio": 0.5, "learning_rate": 4.0406702766549855e-06, "loss": 0.9551370143890381, "loss/kd": 1.5003297328948975, "loss/lm": 0.40994423627853394, "step": 3469 }, { "epoch": 0.7123793882159721, "grad_norm": 1.4125156247465431, "kd_ratio": 0.5, "learning_rate": 4.035331192054225e-06, "loss": 0.9869644641876221, "loss/kd": 1.6773947477340698, "loss/lm": 0.29653412103652954, "step": 3470 }, { "epoch": 0.7125846848696367, "grad_norm": 0.9170883469898845, "kd_ratio": 0.5, "learning_rate": 4.0299947454018e-06, "loss": 0.9303733706474304, "loss/kd": 1.4941859245300293, "loss/lm": 0.36656081676483154, "step": 3471 }, { "epoch": 0.7127899815233012, "grad_norm": 1.0431511958645399, "kd_ratio": 0.5, "learning_rate": 4.024660939057824e-06, "loss": 1.3310108184814453, "loss/kd": 2.3149211406707764, "loss/lm": 0.34710046648979187, "step": 3472 }, { "epoch": 0.7129952781769657, "grad_norm": 1.3349037821415044, "kd_ratio": 0.5, "learning_rate": 4.0193297753812325e-06, "loss": 0.8765712976455688, "loss/kd": 1.4182283878326416, "loss/lm": 0.3349141776561737, "step": 3473 }, { "epoch": 0.7132005748306303, "grad_norm": 1.2742264834191068, "kd_ratio": 0.5, "learning_rate": 4.014001256729804e-06, "loss": 1.159564733505249, "loss/kd": 1.9974952936172485, "loss/lm": 0.3216341733932495, "step": 3474 }, { "epoch": 0.7134058714842948, "grad_norm": 1.0066015071523957, "kd_ratio": 0.5, "learning_rate": 4.008675385460131e-06, "loss": 1.0051742792129517, "loss/kd": 1.6487964391708374, "loss/lm": 0.36155205965042114, "step": 3475 }, { "epoch": 0.7136111681379593, "grad_norm": 1.317245849105356, "kd_ratio": 0.5, "learning_rate": 4.003352163927662e-06, "loss": 1.1526938676834106, "loss/kd": 2.0333943367004395, "loss/lm": 0.27199336886405945, "step": 3476 }, { "epoch": 0.7138164647916239, "grad_norm": 0.9006384589215902, "kd_ratio": 0.5, "learning_rate": 3.998031594486645e-06, "loss": 1.005021095275879, "loss/kd": 1.6887331008911133, "loss/lm": 0.32130905985832214, "step": 3477 }, { "epoch": 0.7140217614452884, "grad_norm": 1.412957907605752, "kd_ratio": 0.5, "learning_rate": 3.992713679490172e-06, "loss": 0.9900896549224854, "loss/kd": 1.6793797016143799, "loss/lm": 0.3007996380329132, "step": 3478 }, { "epoch": 0.714227058098953, "grad_norm": 0.9926544186978297, "kd_ratio": 0.5, "learning_rate": 3.987398421290155e-06, "loss": 0.8989591598510742, "loss/kd": 1.4955095052719116, "loss/lm": 0.3024088442325592, "step": 3479 }, { "epoch": 0.7144323547526176, "grad_norm": 1.0562136043476977, "kd_ratio": 0.5, "learning_rate": 3.982085822237332e-06, "loss": 0.943535327911377, "loss/kd": 1.5435117483139038, "loss/lm": 0.3435588479042053, "step": 3480 }, { "epoch": 0.7146376514062821, "grad_norm": 1.042069166572449, "kd_ratio": 0.5, "learning_rate": 3.976775884681267e-06, "loss": 0.8943309187889099, "loss/kd": 1.5062695741653442, "loss/lm": 0.282392293214798, "step": 3481 }, { "epoch": 0.7148429480599466, "grad_norm": 1.0604443525744536, "kd_ratio": 0.5, "learning_rate": 3.9714686109703434e-06, "loss": 1.223035216331482, "loss/kd": 2.1523401737213135, "loss/lm": 0.2937302887439728, "step": 3482 }, { "epoch": 0.7150482447136112, "grad_norm": 1.0605334915866587, "kd_ratio": 0.5, "learning_rate": 3.966164003451775e-06, "loss": 1.0165265798568726, "loss/kd": 1.735067367553711, "loss/lm": 0.2979857325553894, "step": 3483 }, { "epoch": 0.7152535413672757, "grad_norm": 1.0123993866648195, "kd_ratio": 0.5, "learning_rate": 3.9608620644715814e-06, "loss": 0.9832313656806946, "loss/kd": 1.647923231124878, "loss/lm": 0.31853947043418884, "step": 3484 }, { "epoch": 0.7154588380209402, "grad_norm": 0.9366699742143534, "kd_ratio": 0.5, "learning_rate": 3.955562796374614e-06, "loss": 0.9417792558670044, "loss/kd": 1.4991346597671509, "loss/lm": 0.38442379236221313, "step": 3485 }, { "epoch": 0.7156641346746048, "grad_norm": 1.0231617693925499, "kd_ratio": 0.5, "learning_rate": 3.9502662015045375e-06, "loss": 1.2109827995300293, "loss/kd": 2.055154800415039, "loss/lm": 0.3668108582496643, "step": 3486 }, { "epoch": 0.7158694313282693, "grad_norm": 1.1298126613238406, "kd_ratio": 0.5, "learning_rate": 3.944972282203844e-06, "loss": 0.8949203491210938, "loss/kd": 1.5137865543365479, "loss/lm": 0.27605414390563965, "step": 3487 }, { "epoch": 0.716074727981934, "grad_norm": 1.0552881609364617, "kd_ratio": 0.5, "learning_rate": 3.9396810408138254e-06, "loss": 1.122450590133667, "loss/kd": 1.912811279296875, "loss/lm": 0.3320898413658142, "step": 3488 }, { "epoch": 0.7162800246355985, "grad_norm": 0.9272135501610305, "kd_ratio": 0.5, "learning_rate": 3.9343924796746e-06, "loss": 0.9838384389877319, "loss/kd": 1.637312412261963, "loss/lm": 0.3303644061088562, "step": 3489 }, { "epoch": 0.716485321289263, "grad_norm": 0.9816231788076556, "kd_ratio": 0.5, "learning_rate": 3.9291066011251024e-06, "loss": 1.0046522617340088, "loss/kd": 1.713609218597412, "loss/lm": 0.29569533467292786, "step": 3490 }, { "epoch": 0.7166906179429275, "grad_norm": 1.2288106158160041, "kd_ratio": 0.5, "learning_rate": 3.923823407503076e-06, "loss": 1.01389479637146, "loss/kd": 1.6723462343215942, "loss/lm": 0.3554433584213257, "step": 3491 }, { "epoch": 0.7168959145965921, "grad_norm": 1.1097657735959683, "kd_ratio": 0.5, "learning_rate": 3.918542901145083e-06, "loss": 0.8442142605781555, "loss/kd": 1.3845568895339966, "loss/lm": 0.30387160181999207, "step": 3492 }, { "epoch": 0.7171012112502566, "grad_norm": 0.8993689804513283, "kd_ratio": 0.5, "learning_rate": 3.913265084386483e-06, "loss": 0.9635441303253174, "loss/kd": 1.6066105365753174, "loss/lm": 0.32047775387763977, "step": 3493 }, { "epoch": 0.7173065079039211, "grad_norm": 0.8432928194464651, "kd_ratio": 0.5, "learning_rate": 3.9079899595614645e-06, "loss": 1.177022933959961, "loss/kd": 1.9422870874404907, "loss/lm": 0.411758691072464, "step": 3494 }, { "epoch": 0.7175118045575857, "grad_norm": 1.0720476295570687, "kd_ratio": 0.5, "learning_rate": 3.902717529003005e-06, "loss": 1.073866605758667, "loss/kd": 1.777835726737976, "loss/lm": 0.3698975443840027, "step": 3495 }, { "epoch": 0.7177171012112502, "grad_norm": 0.8525024047811984, "kd_ratio": 0.5, "learning_rate": 3.897447795042918e-06, "loss": 1.775607943534851, "loss/kd": 3.2419421672821045, "loss/lm": 0.3092738091945648, "step": 3496 }, { "epoch": 0.7179223978649149, "grad_norm": 1.0582715407035956, "kd_ratio": 0.5, "learning_rate": 3.892180760011795e-06, "loss": 0.7980867624282837, "loss/kd": 1.285588026046753, "loss/lm": 0.31058546900749207, "step": 3497 }, { "epoch": 0.7181276945185794, "grad_norm": 1.1412621575236734, "kd_ratio": 0.5, "learning_rate": 3.886916426239052e-06, "loss": 1.086414098739624, "loss/kd": 1.8067697286605835, "loss/lm": 0.366058349609375, "step": 3498 }, { "epoch": 0.7183329911722439, "grad_norm": 0.9311545736449263, "kd_ratio": 0.5, "learning_rate": 3.88165479605291e-06, "loss": 0.9812174439430237, "loss/kd": 1.548443078994751, "loss/lm": 0.413991779088974, "step": 3499 }, { "epoch": 0.7185382878259085, "grad_norm": 1.140491274931275, "kd_ratio": 0.5, "learning_rate": 3.876395871780381e-06, "loss": 1.1974936723709106, "loss/kd": 1.9973934888839722, "loss/lm": 0.39759376645088196, "step": 3500 }, { "epoch": 0.718743584479573, "grad_norm": 1.077565248805297, "kd_ratio": 0.5, "learning_rate": 3.8711396557472944e-06, "loss": 1.0268352031707764, "loss/kd": 1.7239830493927002, "loss/lm": 0.3296872675418854, "step": 3501 }, { "epoch": 0.7189488811332375, "grad_norm": 1.0438629525324485, "kd_ratio": 0.5, "learning_rate": 3.865886150278276e-06, "loss": 0.9249237775802612, "loss/kd": 1.577148675918579, "loss/lm": 0.27269890904426575, "step": 3502 }, { "epoch": 0.719154177786902, "grad_norm": 1.1162742056201929, "kd_ratio": 0.5, "learning_rate": 3.860635357696756e-06, "loss": 1.0567675828933716, "loss/kd": 1.7886892557144165, "loss/lm": 0.3248459994792938, "step": 3503 }, { "epoch": 0.7193594744405666, "grad_norm": 0.9346312989344543, "kd_ratio": 0.5, "learning_rate": 3.855387280324956e-06, "loss": 0.8566914796829224, "loss/kd": 1.3795676231384277, "loss/lm": 0.3338152766227722, "step": 3504 }, { "epoch": 0.7195647710942311, "grad_norm": 0.9992064583975334, "kd_ratio": 0.5, "learning_rate": 3.8501419204839085e-06, "loss": 1.2881211042404175, "loss/kd": 2.2370545864105225, "loss/lm": 0.3391876518726349, "step": 3505 }, { "epoch": 0.7197700677478958, "grad_norm": 1.0534762284141566, "kd_ratio": 0.5, "learning_rate": 3.844899280493431e-06, "loss": 0.9136031866073608, "loss/kd": 1.508193016052246, "loss/lm": 0.319013386964798, "step": 3506 }, { "epoch": 0.7199753644015603, "grad_norm": 1.0980816597180874, "kd_ratio": 0.5, "learning_rate": 3.839659362672156e-06, "loss": 1.1304943561553955, "loss/kd": 1.9143640995025635, "loss/lm": 0.34662458300590515, "step": 3507 }, { "epoch": 0.7201806610552248, "grad_norm": 1.0081608826459552, "kd_ratio": 0.5, "learning_rate": 3.834422169337494e-06, "loss": 0.8817760944366455, "loss/kd": 1.4205312728881836, "loss/lm": 0.3430209755897522, "step": 3508 }, { "epoch": 0.7203859577088894, "grad_norm": 1.041837156347851, "kd_ratio": 0.5, "learning_rate": 3.8291877028056615e-06, "loss": 0.7899836301803589, "loss/kd": 1.2919843196868896, "loss/lm": 0.2879829704761505, "step": 3509 }, { "epoch": 0.7205912543625539, "grad_norm": 0.9189364420916697, "kd_ratio": 0.5, "learning_rate": 3.8239559653916684e-06, "loss": 1.0526007413864136, "loss/kd": 1.6634585857391357, "loss/lm": 0.44174298644065857, "step": 3510 }, { "epoch": 0.7207965510162184, "grad_norm": 1.024912666845964, "kd_ratio": 0.5, "learning_rate": 3.818726959409305e-06, "loss": 0.9277409315109253, "loss/kd": 1.5223363637924194, "loss/lm": 0.3331454396247864, "step": 3511 }, { "epoch": 0.721001847669883, "grad_norm": 1.0805969548112262, "kd_ratio": 0.5, "learning_rate": 3.8135006871711786e-06, "loss": 1.150198221206665, "loss/kd": 1.8910592794418335, "loss/lm": 0.4093371629714966, "step": 3512 }, { "epoch": 0.7212071443235475, "grad_norm": 1.0437880406695454, "kd_ratio": 0.5, "learning_rate": 3.8082771509886606e-06, "loss": 1.0341987609863281, "loss/kd": 1.7424635887145996, "loss/lm": 0.32593390345573425, "step": 3513 }, { "epoch": 0.721412440977212, "grad_norm": 1.0198288320920168, "kd_ratio": 0.5, "learning_rate": 3.803056353171932e-06, "loss": 1.0499393939971924, "loss/kd": 1.766864538192749, "loss/lm": 0.3330143690109253, "step": 3514 }, { "epoch": 0.7216177376308767, "grad_norm": 1.0920284464148968, "kd_ratio": 0.5, "learning_rate": 3.7978382960299476e-06, "loss": 1.1091477870941162, "loss/kd": 1.8258792161941528, "loss/lm": 0.392416387796402, "step": 3515 }, { "epoch": 0.7218230342845412, "grad_norm": 0.8755743128639268, "kd_ratio": 0.5, "learning_rate": 3.792622981870462e-06, "loss": 1.3407915830612183, "loss/kd": 2.37414288520813, "loss/lm": 0.30744028091430664, "step": 3516 }, { "epoch": 0.7220283309382057, "grad_norm": 1.1006609665445113, "kd_ratio": 0.5, "learning_rate": 3.787410413000011e-06, "loss": 1.2545950412750244, "loss/kd": 2.1686317920684814, "loss/lm": 0.340558260679245, "step": 3517 }, { "epoch": 0.7222336275918703, "grad_norm": 1.010332319868683, "kd_ratio": 0.5, "learning_rate": 3.782200591723919e-06, "loss": 1.0160045623779297, "loss/kd": 1.7192189693450928, "loss/lm": 0.3127902150154114, "step": 3518 }, { "epoch": 0.7224389242455348, "grad_norm": 1.1161433375000867, "kd_ratio": 0.5, "learning_rate": 3.776993520346295e-06, "loss": 1.1713290214538574, "loss/kd": 2.0471811294555664, "loss/lm": 0.29547691345214844, "step": 3519 }, { "epoch": 0.7226442208991993, "grad_norm": 0.8921719644545205, "kd_ratio": 0.5, "learning_rate": 3.771789201170025e-06, "loss": 1.1010360717773438, "loss/kd": 1.8554350137710571, "loss/lm": 0.3466370105743408, "step": 3520 }, { "epoch": 0.7228495175528639, "grad_norm": 1.1145211192021098, "kd_ratio": 0.5, "learning_rate": 3.7665876364967866e-06, "loss": 0.958464503288269, "loss/kd": 1.535508632659912, "loss/lm": 0.3814203441143036, "step": 3521 }, { "epoch": 0.7230548142065284, "grad_norm": 1.2070770451446498, "kd_ratio": 0.5, "learning_rate": 3.761388828627037e-06, "loss": 1.1649713516235352, "loss/kd": 2.0203351974487305, "loss/lm": 0.30960744619369507, "step": 3522 }, { "epoch": 0.7232601108601929, "grad_norm": 1.1013558307083475, "kd_ratio": 0.5, "learning_rate": 3.756192779860014e-06, "loss": 1.234999179840088, "loss/kd": 2.0883593559265137, "loss/lm": 0.3816390037536621, "step": 3523 }, { "epoch": 0.7234654075138576, "grad_norm": 0.9521120155922375, "kd_ratio": 0.5, "learning_rate": 3.75099949249373e-06, "loss": 1.8968415260314941, "loss/kd": 3.540100574493408, "loss/lm": 0.2535824477672577, "step": 3524 }, { "epoch": 0.7236707041675221, "grad_norm": 1.0703304332387507, "kd_ratio": 0.5, "learning_rate": 3.7458089688249823e-06, "loss": 1.1711554527282715, "loss/kd": 1.9867972135543823, "loss/lm": 0.3555137813091278, "step": 3525 }, { "epoch": 0.7238760008211866, "grad_norm": 0.9986617653487341, "kd_ratio": 0.5, "learning_rate": 3.740621211149347e-06, "loss": 1.0828497409820557, "loss/kd": 1.830789566040039, "loss/lm": 0.33491000533103943, "step": 3526 }, { "epoch": 0.7240812974748512, "grad_norm": 1.138339590474335, "kd_ratio": 0.5, "learning_rate": 3.7354362217611652e-06, "loss": 1.0029484033584595, "loss/kd": 1.6265437602996826, "loss/lm": 0.37935304641723633, "step": 3527 }, { "epoch": 0.7242865941285157, "grad_norm": 1.0570304434637596, "kd_ratio": 0.5, "learning_rate": 3.730254002953575e-06, "loss": 1.0196068286895752, "loss/kd": 1.722010850906372, "loss/lm": 0.3172028362751007, "step": 3528 }, { "epoch": 0.7244918907821802, "grad_norm": 1.2209732758991045, "kd_ratio": 0.5, "learning_rate": 3.7250745570184666e-06, "loss": 1.1734610795974731, "loss/kd": 2.0240960121154785, "loss/lm": 0.3228260576725006, "step": 3529 }, { "epoch": 0.7246971874358448, "grad_norm": 0.9209667269047591, "kd_ratio": 0.5, "learning_rate": 3.719897886246521e-06, "loss": 0.9664089679718018, "loss/kd": 1.623769998550415, "loss/lm": 0.3090479075908661, "step": 3530 }, { "epoch": 0.7249024840895093, "grad_norm": 1.0503446337479814, "kd_ratio": 0.5, "learning_rate": 3.714723992927177e-06, "loss": 0.7857714891433716, "loss/kd": 1.2873107194900513, "loss/lm": 0.28423231840133667, "step": 3531 }, { "epoch": 0.7251077807431738, "grad_norm": 0.9034557847208686, "kd_ratio": 0.5, "learning_rate": 3.709552879348657e-06, "loss": 1.0422574281692505, "loss/kd": 1.6860764026641846, "loss/lm": 0.3984384834766388, "step": 3532 }, { "epoch": 0.7253130773968385, "grad_norm": 2.8359381155573273, "kd_ratio": 0.5, "learning_rate": 3.704384547797949e-06, "loss": 1.0408592224121094, "loss/kd": 1.7883270978927612, "loss/lm": 0.2933914065361023, "step": 3533 }, { "epoch": 0.725518374050503, "grad_norm": 0.8670544127909295, "kd_ratio": 0.5, "learning_rate": 3.6992190005608154e-06, "loss": 1.2456142902374268, "loss/kd": 2.1132047176361084, "loss/lm": 0.37802380323410034, "step": 3534 }, { "epoch": 0.7257236707041675, "grad_norm": 1.0629143809195543, "kd_ratio": 0.5, "learning_rate": 3.694056239921776e-06, "loss": 1.1057542562484741, "loss/kd": 1.8113535642623901, "loss/lm": 0.40015488862991333, "step": 3535 }, { "epoch": 0.7259289673578321, "grad_norm": 0.9881964483981492, "kd_ratio": 0.5, "learning_rate": 3.6888962681641295e-06, "loss": 1.1109514236450195, "loss/kd": 1.8842287063598633, "loss/lm": 0.33767426013946533, "step": 3536 }, { "epoch": 0.7261342640114966, "grad_norm": 1.065273351490397, "kd_ratio": 0.5, "learning_rate": 3.6837390875699354e-06, "loss": 1.8959087133407593, "loss/kd": 3.5425477027893066, "loss/lm": 0.24926966428756714, "step": 3537 }, { "epoch": 0.7263395606651611, "grad_norm": 0.8969777609295477, "kd_ratio": 0.5, "learning_rate": 3.6785847004200227e-06, "loss": 0.8663195371627808, "loss/kd": 1.4170842170715332, "loss/lm": 0.31555482745170593, "step": 3538 }, { "epoch": 0.7265448573188257, "grad_norm": 1.150430618946623, "kd_ratio": 0.5, "learning_rate": 3.6734331089939835e-06, "loss": 1.2002958059310913, "loss/kd": 2.0712890625, "loss/lm": 0.3293026387691498, "step": 3539 }, { "epoch": 0.7267501539724902, "grad_norm": 1.0768296956398502, "kd_ratio": 0.5, "learning_rate": 3.6682843155701684e-06, "loss": 1.0000436305999756, "loss/kd": 1.5942591428756714, "loss/lm": 0.4058280885219574, "step": 3540 }, { "epoch": 0.7269554506261547, "grad_norm": 1.0012256743879806, "kd_ratio": 0.5, "learning_rate": 3.6631383224257e-06, "loss": 1.0911659002304077, "loss/kd": 1.842511534690857, "loss/lm": 0.33982032537460327, "step": 3541 }, { "epoch": 0.7271607472798194, "grad_norm": 1.1113366753916676, "kd_ratio": 0.5, "learning_rate": 3.657995131836447e-06, "loss": 0.9646444320678711, "loss/kd": 1.6133050918579102, "loss/lm": 0.31598374247550964, "step": 3542 }, { "epoch": 0.7273660439334839, "grad_norm": 1.068774485721189, "kd_ratio": 0.5, "learning_rate": 3.6528547460770636e-06, "loss": 1.1270519495010376, "loss/kd": 1.8920263051986694, "loss/lm": 0.36207765340805054, "step": 3543 }, { "epoch": 0.7275713405871485, "grad_norm": 0.9168557004266493, "kd_ratio": 0.5, "learning_rate": 3.6477171674209368e-06, "loss": 0.828969419002533, "loss/kd": 1.342875599861145, "loss/lm": 0.3150632083415985, "step": 3544 }, { "epoch": 0.727776637240813, "grad_norm": 0.9574278755301162, "kd_ratio": 0.5, "learning_rate": 3.6425823981402297e-06, "loss": 1.1376692056655884, "loss/kd": 1.925142765045166, "loss/lm": 0.3501957058906555, "step": 3545 }, { "epoch": 0.7279819338944775, "grad_norm": 0.9967183410795345, "kd_ratio": 0.5, "learning_rate": 3.6374504405058588e-06, "loss": 1.0824851989746094, "loss/kd": 1.8127270936965942, "loss/lm": 0.3522433638572693, "step": 3546 }, { "epoch": 0.728187230548142, "grad_norm": 0.9772014419368037, "kd_ratio": 0.5, "learning_rate": 3.6323212967874866e-06, "loss": 1.2814390659332275, "loss/kd": 2.1662650108337402, "loss/lm": 0.39661306142807007, "step": 3547 }, { "epoch": 0.7283925272018066, "grad_norm": 0.9733475698242902, "kd_ratio": 0.5, "learning_rate": 3.6271949692535526e-06, "loss": 1.273864984512329, "loss/kd": 2.127476215362549, "loss/lm": 0.42025381326675415, "step": 3548 }, { "epoch": 0.7285978238554711, "grad_norm": 0.9501752880870873, "kd_ratio": 0.5, "learning_rate": 3.6220714601712305e-06, "loss": 1.0415682792663574, "loss/kd": 1.7327818870544434, "loss/lm": 0.35035473108291626, "step": 3549 }, { "epoch": 0.7288031205091356, "grad_norm": 0.900938168214072, "kd_ratio": 0.5, "learning_rate": 3.616950771806459e-06, "loss": 1.3701595067977905, "loss/kd": 2.411616802215576, "loss/lm": 0.32870224118232727, "step": 3550 }, { "epoch": 0.7290084171628003, "grad_norm": 1.1772259237334843, "kd_ratio": 0.5, "learning_rate": 3.6118329064239222e-06, "loss": 0.9563851356506348, "loss/kd": 1.500397801399231, "loss/lm": 0.4123724102973938, "step": 3551 }, { "epoch": 0.7292137138164648, "grad_norm": 0.8803525124684539, "kd_ratio": 0.5, "learning_rate": 3.6067178662870594e-06, "loss": 0.9162558317184448, "loss/kd": 1.4841163158416748, "loss/lm": 0.34839528799057007, "step": 3552 }, { "epoch": 0.7294190104701294, "grad_norm": 1.1348960248957085, "kd_ratio": 0.5, "learning_rate": 3.6016056536580625e-06, "loss": 1.094894289970398, "loss/kd": 1.8930606842041016, "loss/lm": 0.2967279255390167, "step": 3553 }, { "epoch": 0.7296243071237939, "grad_norm": 0.9244799843889309, "kd_ratio": 0.5, "learning_rate": 3.59649627079787e-06, "loss": 1.090879201889038, "loss/kd": 1.8494855165481567, "loss/lm": 0.3322729170322418, "step": 3554 }, { "epoch": 0.7298296037774584, "grad_norm": 0.9637225796336655, "kd_ratio": 0.5, "learning_rate": 3.5913897199661716e-06, "loss": 1.0378726720809937, "loss/kd": 1.8400646448135376, "loss/lm": 0.2356807142496109, "step": 3555 }, { "epoch": 0.730034900431123, "grad_norm": 0.9710733347264717, "kd_ratio": 0.5, "learning_rate": 3.5862860034213976e-06, "loss": 1.0823020935058594, "loss/kd": 1.822316288948059, "loss/lm": 0.34228789806365967, "step": 3556 }, { "epoch": 0.7302401970847875, "grad_norm": 1.0422581856792772, "kd_ratio": 0.5, "learning_rate": 3.5811851234207353e-06, "loss": 1.0640956163406372, "loss/kd": 1.8194316625595093, "loss/lm": 0.30875954031944275, "step": 3557 }, { "epoch": 0.730445493738452, "grad_norm": 0.8578449384016522, "kd_ratio": 0.5, "learning_rate": 3.576087082220101e-06, "loss": 1.461605429649353, "loss/kd": 2.5804319381713867, "loss/lm": 0.34277886152267456, "step": 3558 }, { "epoch": 0.7306507903921166, "grad_norm": 0.8514253415706436, "kd_ratio": 0.5, "learning_rate": 3.5709918820741816e-06, "loss": 1.1136796474456787, "loss/kd": 1.8712663650512695, "loss/lm": 0.35609281063079834, "step": 3559 }, { "epoch": 0.7308560870457812, "grad_norm": 0.9909404661419722, "kd_ratio": 0.5, "learning_rate": 3.5658995252363805e-06, "loss": 1.0139994621276855, "loss/kd": 1.6219322681427002, "loss/lm": 0.4060665965080261, "step": 3560 }, { "epoch": 0.7310613836994457, "grad_norm": 1.040795982849614, "kd_ratio": 0.5, "learning_rate": 3.560810013958863e-06, "loss": 0.8532668948173523, "loss/kd": 1.388201117515564, "loss/lm": 0.3183326721191406, "step": 3561 }, { "epoch": 0.7312666803531103, "grad_norm": 1.1334853163595011, "kd_ratio": 0.5, "learning_rate": 3.5557233504925203e-06, "loss": 1.0097029209136963, "loss/kd": 1.6541105508804321, "loss/lm": 0.3652953803539276, "step": 3562 }, { "epoch": 0.7314719770067748, "grad_norm": 0.9646386282377125, "kd_ratio": 0.5, "learning_rate": 3.5506395370869963e-06, "loss": 1.0718399286270142, "loss/kd": 1.8205857276916504, "loss/lm": 0.32309409976005554, "step": 3563 }, { "epoch": 0.7316772736604393, "grad_norm": 1.2235379174242618, "kd_ratio": 0.5, "learning_rate": 3.5455585759906696e-06, "loss": 0.9034857153892517, "loss/kd": 1.5529468059539795, "loss/lm": 0.2540246546268463, "step": 3564 }, { "epoch": 0.7318825703141039, "grad_norm": 0.9135149855581524, "kd_ratio": 0.5, "learning_rate": 3.540480469450659e-06, "loss": 0.875070333480835, "loss/kd": 1.3933573961257935, "loss/lm": 0.35678333044052124, "step": 3565 }, { "epoch": 0.7320878669677684, "grad_norm": 1.3912092274336576, "kd_ratio": 0.5, "learning_rate": 3.535405219712823e-06, "loss": 1.0470714569091797, "loss/kd": 1.7042187452316284, "loss/lm": 0.38992413878440857, "step": 3566 }, { "epoch": 0.7322931636214329, "grad_norm": 0.8771115924095833, "kd_ratio": 0.5, "learning_rate": 3.5303328290217453e-06, "loss": 1.2361390590667725, "loss/kd": 2.0291335582733154, "loss/lm": 0.44314461946487427, "step": 3567 }, { "epoch": 0.7324984602750975, "grad_norm": 0.9395307093374503, "kd_ratio": 0.5, "learning_rate": 3.525263299620758e-06, "loss": 1.272533655166626, "loss/kd": 2.1030352115631104, "loss/lm": 0.4420320987701416, "step": 3568 }, { "epoch": 0.7327037569287621, "grad_norm": 1.398566802396031, "kd_ratio": 0.5, "learning_rate": 3.520196633751921e-06, "loss": 0.8874301314353943, "loss/kd": 1.4237663745880127, "loss/lm": 0.35109391808509827, "step": 3569 }, { "epoch": 0.7329090535824266, "grad_norm": 0.9648565573698503, "kd_ratio": 0.5, "learning_rate": 3.5151328336560363e-06, "loss": 1.205086588859558, "loss/kd": 2.059049129486084, "loss/lm": 0.35112401843070984, "step": 3570 }, { "epoch": 0.7331143502360912, "grad_norm": 0.9691293179810301, "kd_ratio": 0.5, "learning_rate": 3.5100719015726228e-06, "loss": 1.0418572425842285, "loss/kd": 1.733321189880371, "loss/lm": 0.35039323568344116, "step": 3571 }, { "epoch": 0.7333196468897557, "grad_norm": 0.8956524326606504, "kd_ratio": 0.5, "learning_rate": 3.5050138397399437e-06, "loss": 0.8180710673332214, "loss/kd": 1.3329812288284302, "loss/lm": 0.3031609356403351, "step": 3572 }, { "epoch": 0.7335249435434202, "grad_norm": 1.174451012637149, "kd_ratio": 0.5, "learning_rate": 3.49995865039499e-06, "loss": 1.1027445793151855, "loss/kd": 1.8448913097381592, "loss/lm": 0.36059778928756714, "step": 3573 }, { "epoch": 0.7337302401970848, "grad_norm": 0.942139349740292, "kd_ratio": 0.5, "learning_rate": 3.4949063357734804e-06, "loss": 1.2914694547653198, "loss/kd": 2.198683977127075, "loss/lm": 0.3842548429965973, "step": 3574 }, { "epoch": 0.7339355368507493, "grad_norm": 1.0026878804723613, "kd_ratio": 0.5, "learning_rate": 3.4898568981098678e-06, "loss": 1.2651152610778809, "loss/kd": 2.197765827178955, "loss/lm": 0.3324647843837738, "step": 3575 }, { "epoch": 0.7341408335044138, "grad_norm": 0.9536316376349778, "kd_ratio": 0.5, "learning_rate": 3.4848103396373224e-06, "loss": 1.301911473274231, "loss/kd": 2.262674331665039, "loss/lm": 0.34114858508110046, "step": 3576 }, { "epoch": 0.7343461301580784, "grad_norm": 0.9574605734939365, "kd_ratio": 0.5, "learning_rate": 3.479766662587751e-06, "loss": 1.0020912885665894, "loss/kd": 1.6630096435546875, "loss/lm": 0.3411729633808136, "step": 3577 }, { "epoch": 0.734551426811743, "grad_norm": 0.9545774400563216, "kd_ratio": 0.5, "learning_rate": 3.4747258691917772e-06, "loss": 1.0298659801483154, "loss/kd": 1.7373931407928467, "loss/lm": 0.3223387897014618, "step": 3578 }, { "epoch": 0.7347567234654075, "grad_norm": 0.9863962083730917, "kd_ratio": 0.5, "learning_rate": 3.469687961678757e-06, "loss": 0.9947553873062134, "loss/kd": 1.6955277919769287, "loss/lm": 0.29398298263549805, "step": 3579 }, { "epoch": 0.7349620201190721, "grad_norm": 0.9300263563125888, "kd_ratio": 0.5, "learning_rate": 3.464652942276767e-06, "loss": 1.0023211240768433, "loss/kd": 1.7230232954025269, "loss/lm": 0.28161895275115967, "step": 3580 }, { "epoch": 0.7351673167727366, "grad_norm": 1.0442615167994336, "kd_ratio": 0.5, "learning_rate": 3.4596208132126073e-06, "loss": 1.046291470527649, "loss/kd": 1.7181174755096436, "loss/lm": 0.3744654953479767, "step": 3581 }, { "epoch": 0.7353726134264011, "grad_norm": 0.9300080516552616, "kd_ratio": 0.5, "learning_rate": 3.454591576711801e-06, "loss": 0.9673498272895813, "loss/kd": 1.6038564443588257, "loss/lm": 0.3308431804180145, "step": 3582 }, { "epoch": 0.7355779100800657, "grad_norm": 1.054592013252736, "kd_ratio": 0.5, "learning_rate": 3.4495652349985844e-06, "loss": 0.9522068500518799, "loss/kd": 1.6044728755950928, "loss/lm": 0.29994088411331177, "step": 3583 }, { "epoch": 0.7357832067337302, "grad_norm": 1.015071276553754, "kd_ratio": 0.5, "learning_rate": 3.4445417902959224e-06, "loss": 1.1249773502349854, "loss/kd": 1.9240442514419556, "loss/lm": 0.32591041922569275, "step": 3584 }, { "epoch": 0.7359885033873947, "grad_norm": 0.9485300934595582, "kd_ratio": 0.5, "learning_rate": 3.4395212448254944e-06, "loss": 0.8789641261100769, "loss/kd": 1.4635268449783325, "loss/lm": 0.2944014370441437, "step": 3585 }, { "epoch": 0.7361938000410593, "grad_norm": 0.9378393675442178, "kd_ratio": 0.5, "learning_rate": 3.4345036008077025e-06, "loss": 1.1770501136779785, "loss/kd": 2.0006237030029297, "loss/lm": 0.3534764349460602, "step": 3586 }, { "epoch": 0.7363990966947239, "grad_norm": 1.1504299498060373, "kd_ratio": 0.5, "learning_rate": 3.429488860461655e-06, "loss": 1.0323526859283447, "loss/kd": 1.68083655834198, "loss/lm": 0.3838688135147095, "step": 3587 }, { "epoch": 0.7366043933483885, "grad_norm": 1.0839039022055441, "kd_ratio": 0.5, "learning_rate": 3.4244770260051863e-06, "loss": 0.7536389231681824, "loss/kd": 1.2061349153518677, "loss/lm": 0.3011429011821747, "step": 3588 }, { "epoch": 0.736809690002053, "grad_norm": 0.9093442400608839, "kd_ratio": 0.5, "learning_rate": 3.4194680996548412e-06, "loss": 1.0337520837783813, "loss/kd": 1.7449333667755127, "loss/lm": 0.3225707709789276, "step": 3589 }, { "epoch": 0.7370149866557175, "grad_norm": 1.2118729637012908, "kd_ratio": 0.5, "learning_rate": 3.4144620836258835e-06, "loss": 1.030508279800415, "loss/kd": 1.7388004064559937, "loss/lm": 0.32221609354019165, "step": 3590 }, { "epoch": 0.737220283309382, "grad_norm": 1.007634637829861, "kd_ratio": 0.5, "learning_rate": 3.4094589801322773e-06, "loss": 0.8443793058395386, "loss/kd": 1.361575961112976, "loss/lm": 0.3271826505661011, "step": 3591 }, { "epoch": 0.7374255799630466, "grad_norm": 1.5057782175670495, "kd_ratio": 0.5, "learning_rate": 3.4044587913867124e-06, "loss": 0.9316969513893127, "loss/kd": 1.5376646518707275, "loss/lm": 0.32572925090789795, "step": 3592 }, { "epoch": 0.7376308766167111, "grad_norm": 1.0972638486016348, "kd_ratio": 0.5, "learning_rate": 3.3994615196005854e-06, "loss": 0.8031418919563293, "loss/kd": 1.2948346138000488, "loss/lm": 0.3114491403102875, "step": 3593 }, { "epoch": 0.7378361732703757, "grad_norm": 0.9775351816938013, "kd_ratio": 0.5, "learning_rate": 3.3944671669839922e-06, "loss": 1.0051543712615967, "loss/kd": 1.6728464365005493, "loss/lm": 0.33746233582496643, "step": 3594 }, { "epoch": 0.7380414699240402, "grad_norm": 0.9142861104945323, "kd_ratio": 0.5, "learning_rate": 3.389475735745761e-06, "loss": 1.019815444946289, "loss/kd": 1.7231857776641846, "loss/lm": 0.31644514203071594, "step": 3595 }, { "epoch": 0.7382467665777048, "grad_norm": 1.062909045358159, "kd_ratio": 0.5, "learning_rate": 3.3844872280934017e-06, "loss": 1.0167819261550903, "loss/kd": 1.6893354654312134, "loss/lm": 0.3442283868789673, "step": 3596 }, { "epoch": 0.7384520632313694, "grad_norm": 1.125474785023881, "kd_ratio": 0.5, "learning_rate": 3.3795016462331507e-06, "loss": 1.1344908475875854, "loss/kd": 1.9363054037094116, "loss/lm": 0.3326762914657593, "step": 3597 }, { "epoch": 0.7386573598850339, "grad_norm": 1.0449527610161626, "kd_ratio": 0.5, "learning_rate": 3.3745189923699373e-06, "loss": 1.0380277633666992, "loss/kd": 1.7403645515441895, "loss/lm": 0.33569106459617615, "step": 3598 }, { "epoch": 0.7388626565386984, "grad_norm": 0.9853314438751938, "kd_ratio": 0.5, "learning_rate": 3.3695392687074045e-06, "loss": 1.0818842649459839, "loss/kd": 1.860654592514038, "loss/lm": 0.30311399698257446, "step": 3599 }, { "epoch": 0.739067953192363, "grad_norm": 1.4264946793422628, "kd_ratio": 0.5, "learning_rate": 3.3645624774478967e-06, "loss": 0.9320726990699768, "loss/kd": 1.4250123500823975, "loss/lm": 0.43913304805755615, "step": 3600 }, { "epoch": 0.7392732498460275, "grad_norm": 1.1845898218575144, "kd_ratio": 0.5, "learning_rate": 3.359588620792461e-06, "loss": 0.9935042858123779, "loss/kd": 1.6607221364974976, "loss/lm": 0.3262864947319031, "step": 3601 }, { "epoch": 0.739478546499692, "grad_norm": 0.8711284282079478, "kd_ratio": 0.5, "learning_rate": 3.3546177009408498e-06, "loss": 1.1373584270477295, "loss/kd": 1.94084632396698, "loss/lm": 0.3338705003261566, "step": 3602 }, { "epoch": 0.7396838431533566, "grad_norm": 1.1632039560059049, "kd_ratio": 0.5, "learning_rate": 3.3496497200915067e-06, "loss": 1.0067410469055176, "loss/kd": 1.6987042427062988, "loss/lm": 0.3147779405117035, "step": 3603 }, { "epoch": 0.7398891398070211, "grad_norm": 1.0659782054788596, "kd_ratio": 0.5, "learning_rate": 3.344684680441591e-06, "loss": 0.8981661796569824, "loss/kd": 1.5033862590789795, "loss/lm": 0.2929461598396301, "step": 3604 }, { "epoch": 0.7400944364606857, "grad_norm": 0.9204917233774969, "kd_ratio": 0.5, "learning_rate": 3.3397225841869408e-06, "loss": 1.1071432828903198, "loss/kd": 1.8858921527862549, "loss/lm": 0.32839444279670715, "step": 3605 }, { "epoch": 0.7402997331143503, "grad_norm": 1.004730675048693, "kd_ratio": 0.5, "learning_rate": 3.334763433522119e-06, "loss": 0.9626813530921936, "loss/kd": 1.577642560005188, "loss/lm": 0.3477201759815216, "step": 3606 }, { "epoch": 0.7405050297680148, "grad_norm": 0.8816040247620244, "kd_ratio": 0.5, "learning_rate": 3.3298072306403595e-06, "loss": 1.1159507036209106, "loss/kd": 1.8462002277374268, "loss/lm": 0.38570117950439453, "step": 3607 }, { "epoch": 0.7407103264216793, "grad_norm": 1.0007458309973791, "kd_ratio": 0.5, "learning_rate": 3.3248539777336097e-06, "loss": 0.9911630153656006, "loss/kd": 1.6344614028930664, "loss/lm": 0.34786462783813477, "step": 3608 }, { "epoch": 0.7409156230753439, "grad_norm": 1.0245321172396762, "kd_ratio": 0.5, "learning_rate": 3.3199036769925087e-06, "loss": 0.99058598279953, "loss/kd": 1.5999635457992554, "loss/lm": 0.3812084197998047, "step": 3609 }, { "epoch": 0.7411209197290084, "grad_norm": 0.8749288497120761, "kd_ratio": 0.5, "learning_rate": 3.314956330606378e-06, "loss": 1.099098801612854, "loss/kd": 1.7904629707336426, "loss/lm": 0.4077346622943878, "step": 3610 }, { "epoch": 0.7413262163826729, "grad_norm": 1.0756501217422578, "kd_ratio": 0.5, "learning_rate": 3.3100119407632556e-06, "loss": 0.9032546281814575, "loss/kd": 1.4298757314682007, "loss/lm": 0.3766334652900696, "step": 3611 }, { "epoch": 0.7415315130363375, "grad_norm": 1.0894540210210866, "kd_ratio": 0.5, "learning_rate": 3.3050705096498504e-06, "loss": 0.9914214015007019, "loss/kd": 1.6449064016342163, "loss/lm": 0.3379363715648651, "step": 3612 }, { "epoch": 0.7417368096900021, "grad_norm": 0.9350961031432135, "kd_ratio": 0.5, "learning_rate": 3.300132039451577e-06, "loss": 1.1793572902679443, "loss/kd": 2.015957832336426, "loss/lm": 0.3427567780017853, "step": 3613 }, { "epoch": 0.7419421063436666, "grad_norm": 0.9663042231424371, "kd_ratio": 0.5, "learning_rate": 3.2951965323525283e-06, "loss": 1.0755577087402344, "loss/kd": 1.8824771642684937, "loss/lm": 0.26863816380500793, "step": 3614 }, { "epoch": 0.7421474029973312, "grad_norm": 1.1114600956144622, "kd_ratio": 0.5, "learning_rate": 3.2902639905354948e-06, "loss": 1.016404390335083, "loss/kd": 1.7280466556549072, "loss/lm": 0.3047620356082916, "step": 3615 }, { "epoch": 0.7423526996509957, "grad_norm": 0.9566378799410062, "kd_ratio": 0.5, "learning_rate": 3.2853344161819567e-06, "loss": 0.8871070146560669, "loss/kd": 1.4552204608917236, "loss/lm": 0.31899353861808777, "step": 3616 }, { "epoch": 0.7425579963046602, "grad_norm": 1.1246717166332119, "kd_ratio": 0.5, "learning_rate": 3.280407811472078e-06, "loss": 0.9920879602432251, "loss/kd": 1.6893802881240845, "loss/lm": 0.2947956323623657, "step": 3617 }, { "epoch": 0.7427632929583248, "grad_norm": 0.9964339546383922, "kd_ratio": 0.5, "learning_rate": 3.2754841785847126e-06, "loss": 1.1026701927185059, "loss/kd": 1.9042831659317017, "loss/lm": 0.3010573387145996, "step": 3618 }, { "epoch": 0.7429685896119893, "grad_norm": 1.066624738066865, "kd_ratio": 0.5, "learning_rate": 3.2705635196973927e-06, "loss": 1.0685632228851318, "loss/kd": 1.7880069017410278, "loss/lm": 0.3491195738315582, "step": 3619 }, { "epoch": 0.7431738862656538, "grad_norm": 0.9656624530617467, "kd_ratio": 0.5, "learning_rate": 3.265645836986343e-06, "loss": 0.8638062477111816, "loss/kd": 1.348699927330017, "loss/lm": 0.3789125978946686, "step": 3620 }, { "epoch": 0.7433791829193184, "grad_norm": 1.0276564404445674, "kd_ratio": 0.5, "learning_rate": 3.260731132626469e-06, "loss": 0.9656682014465332, "loss/kd": 1.6680700778961182, "loss/lm": 0.26326635479927063, "step": 3621 }, { "epoch": 0.743584479572983, "grad_norm": 0.8446363101877543, "kd_ratio": 0.5, "learning_rate": 3.255819408791364e-06, "loss": 1.0222750902175903, "loss/kd": 1.655885100364685, "loss/lm": 0.38866499066352844, "step": 3622 }, { "epoch": 0.7437897762266475, "grad_norm": 0.9460281771372926, "kd_ratio": 0.5, "learning_rate": 3.2509106676532897e-06, "loss": 1.1663334369659424, "loss/kd": 2.072899580001831, "loss/lm": 0.25976723432540894, "step": 3623 }, { "epoch": 0.7439950728803121, "grad_norm": 1.0675222449732338, "kd_ratio": 0.5, "learning_rate": 3.246004911383206e-06, "loss": 1.0547633171081543, "loss/kd": 1.7663439512252808, "loss/lm": 0.343182772397995, "step": 3624 }, { "epoch": 0.7442003695339766, "grad_norm": 1.1861739937896119, "kd_ratio": 0.5, "learning_rate": 3.241102142150734e-06, "loss": 0.9689839482307434, "loss/kd": 1.642043948173523, "loss/lm": 0.29592394828796387, "step": 3625 }, { "epoch": 0.7444056661876411, "grad_norm": 0.9394095173463417, "kd_ratio": 0.5, "learning_rate": 3.2362023621241944e-06, "loss": 1.0958755016326904, "loss/kd": 1.8960018157958984, "loss/lm": 0.2957491874694824, "step": 3626 }, { "epoch": 0.7446109628413057, "grad_norm": 1.0600012565296468, "kd_ratio": 0.5, "learning_rate": 3.231305573470569e-06, "loss": 1.135803461074829, "loss/kd": 1.8711986541748047, "loss/lm": 0.4004083275794983, "step": 3627 }, { "epoch": 0.7448162594949702, "grad_norm": 1.1642066324002225, "kd_ratio": 0.5, "learning_rate": 3.226411778355525e-06, "loss": 1.0043185949325562, "loss/kd": 1.6985937356948853, "loss/lm": 0.31004345417022705, "step": 3628 }, { "epoch": 0.7450215561486347, "grad_norm": 0.9456863808840764, "kd_ratio": 0.5, "learning_rate": 3.2215209789434055e-06, "loss": 1.0612456798553467, "loss/kd": 1.7131034135818481, "loss/lm": 0.4093879759311676, "step": 3629 }, { "epoch": 0.7452268528022993, "grad_norm": 1.123693830562398, "kd_ratio": 0.5, "learning_rate": 3.2166331773972227e-06, "loss": 0.8804754018783569, "loss/kd": 1.4881560802459717, "loss/lm": 0.2727947235107422, "step": 3630 }, { "epoch": 0.7454321494559639, "grad_norm": 1.0761835580960657, "kd_ratio": 0.5, "learning_rate": 3.2117483758786683e-06, "loss": 1.1539338827133179, "loss/kd": 1.9470257759094238, "loss/lm": 0.36084192991256714, "step": 3631 }, { "epoch": 0.7456374461096285, "grad_norm": 1.1029160569920349, "kd_ratio": 0.5, "learning_rate": 3.2068665765481066e-06, "loss": 0.9319852590560913, "loss/kd": 1.5343302488327026, "loss/lm": 0.3296402394771576, "step": 3632 }, { "epoch": 0.745842742763293, "grad_norm": 0.9558229410470698, "kd_ratio": 0.5, "learning_rate": 3.201987781564577e-06, "loss": 0.8115559816360474, "loss/kd": 1.3188483715057373, "loss/lm": 0.30426353216171265, "step": 3633 }, { "epoch": 0.7460480394169575, "grad_norm": 1.2682974930893542, "kd_ratio": 0.5, "learning_rate": 3.1971119930857808e-06, "loss": 1.0070112943649292, "loss/kd": 1.5977075099945068, "loss/lm": 0.4163150191307068, "step": 3634 }, { "epoch": 0.746253336070622, "grad_norm": 1.2403818799286035, "kd_ratio": 0.5, "learning_rate": 3.192239213268099e-06, "loss": 1.003110647201538, "loss/kd": 1.660921335220337, "loss/lm": 0.34530001878738403, "step": 3635 }, { "epoch": 0.7464586327242866, "grad_norm": 0.9363709974313921, "kd_ratio": 0.5, "learning_rate": 3.187369444266577e-06, "loss": 1.0877653360366821, "loss/kd": 1.749343991279602, "loss/lm": 0.4261866807937622, "step": 3636 }, { "epoch": 0.7466639293779511, "grad_norm": 1.5135413477818984, "kd_ratio": 0.5, "learning_rate": 3.182502688234931e-06, "loss": 1.6450999975204468, "loss/kd": 2.9740257263183594, "loss/lm": 0.31617429852485657, "step": 3637 }, { "epoch": 0.7468692260316157, "grad_norm": 0.9467658455159973, "kd_ratio": 0.5, "learning_rate": 3.1776389473255476e-06, "loss": 1.0132956504821777, "loss/kd": 1.7734721899032593, "loss/lm": 0.25311917066574097, "step": 3638 }, { "epoch": 0.7470745226852802, "grad_norm": 0.875436657131449, "kd_ratio": 0.5, "learning_rate": 3.17277822368947e-06, "loss": 1.0013923645019531, "loss/kd": 1.6661697626113892, "loss/lm": 0.33661484718322754, "step": 3639 }, { "epoch": 0.7472798193389448, "grad_norm": 0.9832645295353782, "kd_ratio": 0.5, "learning_rate": 3.1679205194764173e-06, "loss": 1.2520537376403809, "loss/kd": 2.163196563720703, "loss/lm": 0.34091079235076904, "step": 3640 }, { "epoch": 0.7474851159926094, "grad_norm": 1.0892614505989089, "kd_ratio": 0.5, "learning_rate": 3.1630658368347624e-06, "loss": 1.1508643627166748, "loss/kd": 2.003340005874634, "loss/lm": 0.298388808965683, "step": 3641 }, { "epoch": 0.7476904126462739, "grad_norm": 0.93520263680642, "kd_ratio": 0.5, "learning_rate": 3.1582141779115605e-06, "loss": 0.8931909799575806, "loss/kd": 1.4588485956192017, "loss/lm": 0.32753342390060425, "step": 3642 }, { "epoch": 0.7478957092999384, "grad_norm": 0.9574006962473873, "kd_ratio": 0.5, "learning_rate": 3.1533655448525057e-06, "loss": 0.9842535257339478, "loss/kd": 1.5995652675628662, "loss/lm": 0.3689417839050293, "step": 3643 }, { "epoch": 0.748101005953603, "grad_norm": 1.065068245699932, "kd_ratio": 0.5, "learning_rate": 3.148519939801973e-06, "loss": 1.0955095291137695, "loss/kd": 1.8122320175170898, "loss/lm": 0.37878695130348206, "step": 3644 }, { "epoch": 0.7483063026072675, "grad_norm": 0.8778917307034486, "kd_ratio": 0.5, "learning_rate": 3.1436773649029906e-06, "loss": 1.1975677013397217, "loss/kd": 2.0022025108337402, "loss/lm": 0.3929329216480255, "step": 3645 }, { "epoch": 0.748511599260932, "grad_norm": 1.1485265399742548, "kd_ratio": 0.5, "learning_rate": 3.1388378222972426e-06, "loss": 0.9838769435882568, "loss/kd": 1.6114197969436646, "loss/lm": 0.3563341200351715, "step": 3646 }, { "epoch": 0.7487168959145966, "grad_norm": 1.0615801350365832, "kd_ratio": 0.5, "learning_rate": 3.134001314125079e-06, "loss": 0.7488669157028198, "loss/kd": 1.235087513923645, "loss/lm": 0.262646347284317, "step": 3647 }, { "epoch": 0.7489221925682611, "grad_norm": 0.8591842434606892, "kd_ratio": 0.5, "learning_rate": 3.1291678425255044e-06, "loss": 0.8263142704963684, "loss/kd": 1.3873016834259033, "loss/lm": 0.2653268575668335, "step": 3648 }, { "epoch": 0.7491274892219257, "grad_norm": 1.0141113548463792, "kd_ratio": 0.5, "learning_rate": 3.1243374096361865e-06, "loss": 1.1841521263122559, "loss/kd": 2.0742452144622803, "loss/lm": 0.29405903816223145, "step": 3649 }, { "epoch": 0.7493327858755903, "grad_norm": 1.0653002508037526, "kd_ratio": 0.5, "learning_rate": 3.1195100175934357e-06, "loss": 1.0323972702026367, "loss/kd": 1.6969596147537231, "loss/lm": 0.36783480644226074, "step": 3650 }, { "epoch": 0.7495380825292548, "grad_norm": 0.9993085373532548, "kd_ratio": 0.5, "learning_rate": 3.114685668532229e-06, "loss": 0.8893418908119202, "loss/kd": 1.4865851402282715, "loss/lm": 0.29209867119789124, "step": 3651 }, { "epoch": 0.7497433791829193, "grad_norm": 1.1886135501327941, "kd_ratio": 0.5, "learning_rate": 3.1098643645861946e-06, "loss": 0.8994776606559753, "loss/kd": 1.5286141633987427, "loss/lm": 0.2703411877155304, "step": 3652 }, { "epoch": 0.7499486758365839, "grad_norm": 1.0225461595387646, "kd_ratio": 0.5, "learning_rate": 3.105046107887616e-06, "loss": 1.0393171310424805, "loss/kd": 1.7487856149673462, "loss/lm": 0.32984861731529236, "step": 3653 }, { "epoch": 0.7501539724902484, "grad_norm": 0.8862301958448021, "kd_ratio": 0.5, "learning_rate": 3.1002309005674226e-06, "loss": 0.9876970052719116, "loss/kd": 1.6441640853881836, "loss/lm": 0.33122992515563965, "step": 3654 }, { "epoch": 0.7503592691439129, "grad_norm": 0.8683781180353963, "kd_ratio": 0.5, "learning_rate": 3.0954187447551996e-06, "loss": 0.9841888546943665, "loss/kd": 1.6203763484954834, "loss/lm": 0.3480013608932495, "step": 3655 }, { "epoch": 0.7505645657975775, "grad_norm": 0.8568021098194989, "kd_ratio": 0.5, "learning_rate": 3.0906096425791877e-06, "loss": 1.2614080905914307, "loss/kd": 2.1180315017700195, "loss/lm": 0.404784619808197, "step": 3656 }, { "epoch": 0.750769862451242, "grad_norm": 1.010975476963925, "kd_ratio": 0.5, "learning_rate": 3.0858035961662604e-06, "loss": 0.9894196391105652, "loss/kd": 1.7083581686019897, "loss/lm": 0.2704811096191406, "step": 3657 }, { "epoch": 0.7509751591049066, "grad_norm": 1.0602917447601556, "kd_ratio": 0.5, "learning_rate": 3.081000607641964e-06, "loss": 1.0404934883117676, "loss/kd": 1.7533248662948608, "loss/lm": 0.32766205072402954, "step": 3658 }, { "epoch": 0.7511804557585712, "grad_norm": 0.9841469813271931, "kd_ratio": 0.5, "learning_rate": 3.076200679130471e-06, "loss": 1.2640764713287354, "loss/kd": 2.2073049545288086, "loss/lm": 0.32084786891937256, "step": 3659 }, { "epoch": 0.7513857524122357, "grad_norm": 0.8961384453017699, "kd_ratio": 0.5, "learning_rate": 3.0714038127546142e-06, "loss": 1.222764492034912, "loss/kd": 2.139437198638916, "loss/lm": 0.3060917556285858, "step": 3660 }, { "epoch": 0.7515910490659002, "grad_norm": 0.9523779874103592, "kd_ratio": 0.5, "learning_rate": 3.066610010635861e-06, "loss": 1.1012003421783447, "loss/kd": 1.8475033044815063, "loss/lm": 0.35489746928215027, "step": 3661 }, { "epoch": 0.7517963457195648, "grad_norm": 0.9046308316843661, "kd_ratio": 0.5, "learning_rate": 3.0618192748943333e-06, "loss": 1.3883363008499146, "loss/kd": 2.4081168174743652, "loss/lm": 0.3685557544231415, "step": 3662 }, { "epoch": 0.7520016423732293, "grad_norm": 1.11365570983653, "kd_ratio": 0.5, "learning_rate": 3.0570316076487918e-06, "loss": 0.9938387870788574, "loss/kd": 1.6696373224258423, "loss/lm": 0.31804022192955017, "step": 3663 }, { "epoch": 0.7522069390268938, "grad_norm": 0.9620188316583741, "kd_ratio": 0.5, "learning_rate": 3.0522470110166414e-06, "loss": 1.0302419662475586, "loss/kd": 1.7741289138793945, "loss/lm": 0.28635501861572266, "step": 3664 }, { "epoch": 0.7524122356805584, "grad_norm": 1.2213570125088145, "kd_ratio": 0.5, "learning_rate": 3.047465487113933e-06, "loss": 0.891419529914856, "loss/kd": 1.4793317317962646, "loss/lm": 0.30350732803344727, "step": 3665 }, { "epoch": 0.7526175323342229, "grad_norm": 0.8693896796826123, "kd_ratio": 0.5, "learning_rate": 3.0426870380553484e-06, "loss": 1.001890778541565, "loss/kd": 1.7096306085586548, "loss/lm": 0.2941510081291199, "step": 3666 }, { "epoch": 0.7528228289878875, "grad_norm": 1.0968329716918515, "kd_ratio": 0.5, "learning_rate": 3.0379116659542186e-06, "loss": 1.085676670074463, "loss/kd": 1.8269420862197876, "loss/lm": 0.3444112539291382, "step": 3667 }, { "epoch": 0.7530281256415521, "grad_norm": 0.9850084550116087, "kd_ratio": 0.5, "learning_rate": 3.033139372922509e-06, "loss": 1.1037232875823975, "loss/kd": 1.8900715112686157, "loss/lm": 0.317375123500824, "step": 3668 }, { "epoch": 0.7532334222952166, "grad_norm": 0.8769688265647333, "kd_ratio": 0.5, "learning_rate": 3.0283701610708303e-06, "loss": 0.9524631500244141, "loss/kd": 1.5920547246932983, "loss/lm": 0.3128715455532074, "step": 3669 }, { "epoch": 0.7534387189488811, "grad_norm": 1.06318467135977, "kd_ratio": 0.5, "learning_rate": 3.023604032508419e-06, "loss": 0.9269897937774658, "loss/kd": 1.556767225265503, "loss/lm": 0.29721230268478394, "step": 3670 }, { "epoch": 0.7536440156025457, "grad_norm": 0.9958033539775885, "kd_ratio": 0.5, "learning_rate": 3.0188409893431556e-06, "loss": 1.095647931098938, "loss/kd": 1.836527705192566, "loss/lm": 0.35476821660995483, "step": 3671 }, { "epoch": 0.7538493122562102, "grad_norm": 0.9386141761348967, "kd_ratio": 0.5, "learning_rate": 3.0140810336815572e-06, "loss": 1.0835614204406738, "loss/kd": 1.858237862586975, "loss/lm": 0.30888503789901733, "step": 3672 }, { "epoch": 0.7540546089098747, "grad_norm": 0.9005151700518648, "kd_ratio": 0.5, "learning_rate": 3.0093241676287697e-06, "loss": 0.9110293388366699, "loss/kd": 1.538665771484375, "loss/lm": 0.2833929657936096, "step": 3673 }, { "epoch": 0.7542599055635393, "grad_norm": 0.9130182970894345, "kd_ratio": 0.5, "learning_rate": 3.004570393288583e-06, "loss": 1.047255277633667, "loss/kd": 1.6895346641540527, "loss/lm": 0.4049758315086365, "step": 3674 }, { "epoch": 0.7544652022172038, "grad_norm": 0.8415635800223149, "kd_ratio": 0.5, "learning_rate": 2.999819712763402e-06, "loss": 0.8309738636016846, "loss/kd": 1.3847113847732544, "loss/lm": 0.27723637223243713, "step": 3675 }, { "epoch": 0.7546704988708685, "grad_norm": 0.9496305531222015, "kd_ratio": 0.5, "learning_rate": 2.9950721281542816e-06, "loss": 1.1008379459381104, "loss/kd": 1.9037327766418457, "loss/lm": 0.29794323444366455, "step": 3676 }, { "epoch": 0.754875795524533, "grad_norm": 0.870416897895354, "kd_ratio": 0.5, "learning_rate": 2.990327641560892e-06, "loss": 0.8942261934280396, "loss/kd": 1.469331979751587, "loss/lm": 0.3191204071044922, "step": 3677 }, { "epoch": 0.7550810921781975, "grad_norm": 1.0050353953883688, "kd_ratio": 0.5, "learning_rate": 2.9855862550815507e-06, "loss": 0.857661247253418, "loss/kd": 1.3463640213012695, "loss/lm": 0.3689585328102112, "step": 3678 }, { "epoch": 0.755286388831862, "grad_norm": 0.9279316960793922, "kd_ratio": 0.5, "learning_rate": 2.9808479708131864e-06, "loss": 0.8144927024841309, "loss/kd": 1.2934153079986572, "loss/lm": 0.3355700671672821, "step": 3679 }, { "epoch": 0.7554916854855266, "grad_norm": 0.9628091555858128, "kd_ratio": 0.5, "learning_rate": 2.97611279085137e-06, "loss": 1.0017285346984863, "loss/kd": 1.6842395067214966, "loss/lm": 0.3192175328731537, "step": 3680 }, { "epoch": 0.7556969821391911, "grad_norm": 1.056274197468985, "kd_ratio": 0.5, "learning_rate": 2.9713807172902865e-06, "loss": 1.0050163269042969, "loss/kd": 1.650888442993164, "loss/lm": 0.3591442108154297, "step": 3681 }, { "epoch": 0.7559022787928557, "grad_norm": 0.8858145263684793, "kd_ratio": 0.5, "learning_rate": 2.9666517522227576e-06, "loss": 1.1622759103775024, "loss/kd": 1.95838463306427, "loss/lm": 0.36616724729537964, "step": 3682 }, { "epoch": 0.7561075754465202, "grad_norm": 0.9698612734682454, "kd_ratio": 0.5, "learning_rate": 2.9619258977402253e-06, "loss": 1.1272751092910767, "loss/kd": 1.949976921081543, "loss/lm": 0.30457329750061035, "step": 3683 }, { "epoch": 0.7563128721001847, "grad_norm": 1.0605931552880012, "kd_ratio": 0.5, "learning_rate": 2.9572031559327586e-06, "loss": 1.176505208015442, "loss/kd": 2.020413398742676, "loss/lm": 0.332597017288208, "step": 3684 }, { "epoch": 0.7565181687538494, "grad_norm": 0.9425951249008311, "kd_ratio": 0.5, "learning_rate": 2.9524835288890497e-06, "loss": 0.7524663209915161, "loss/kd": 1.2314796447753906, "loss/lm": 0.2734529376029968, "step": 3685 }, { "epoch": 0.7567234654075139, "grad_norm": 0.9237552797061653, "kd_ratio": 0.5, "learning_rate": 2.9477670186964067e-06, "loss": 1.061911702156067, "loss/kd": 1.7157765626907349, "loss/lm": 0.40804681181907654, "step": 3686 }, { "epoch": 0.7569287620611784, "grad_norm": 0.95226131789806, "kd_ratio": 0.5, "learning_rate": 2.943053627440771e-06, "loss": 1.002558946609497, "loss/kd": 1.7062058448791504, "loss/lm": 0.2989121675491333, "step": 3687 }, { "epoch": 0.757134058714843, "grad_norm": 0.8455810490018819, "kd_ratio": 0.5, "learning_rate": 2.9383433572066866e-06, "loss": 1.0727605819702148, "loss/kd": 1.806028127670288, "loss/lm": 0.33949291706085205, "step": 3688 }, { "epoch": 0.7573393553685075, "grad_norm": 0.9414985057707901, "kd_ratio": 0.5, "learning_rate": 2.9336362100773443e-06, "loss": 1.1026281118392944, "loss/kd": 1.8399229049682617, "loss/lm": 0.3653333783149719, "step": 3689 }, { "epoch": 0.757544652022172, "grad_norm": 1.1134317563404117, "kd_ratio": 0.5, "learning_rate": 2.9289321881345257e-06, "loss": 0.8942023515701294, "loss/kd": 1.4978662729263306, "loss/lm": 0.29053840041160583, "step": 3690 }, { "epoch": 0.7577499486758366, "grad_norm": 0.9102035495972006, "kd_ratio": 0.5, "learning_rate": 2.924231293458647e-06, "loss": 1.1382728815078735, "loss/kd": 1.8731871843338013, "loss/lm": 0.4033585786819458, "step": 3691 }, { "epoch": 0.7579552453295011, "grad_norm": 0.9257911283076711, "kd_ratio": 0.5, "learning_rate": 2.9195335281287395e-06, "loss": 0.9298462867736816, "loss/kd": 1.533832311630249, "loss/lm": 0.32586032152175903, "step": 3692 }, { "epoch": 0.7581605419831656, "grad_norm": 0.8847244358427985, "kd_ratio": 0.5, "learning_rate": 2.914838894222439e-06, "loss": 0.8735436201095581, "loss/kd": 1.4589954614639282, "loss/lm": 0.2880917489528656, "step": 3693 }, { "epoch": 0.7583658386368303, "grad_norm": 1.1265657366795152, "kd_ratio": 0.5, "learning_rate": 2.9101473938160173e-06, "loss": 1.127799153327942, "loss/kd": 1.9039405584335327, "loss/lm": 0.3516576588153839, "step": 3694 }, { "epoch": 0.7585711352904948, "grad_norm": 0.9534734709093418, "kd_ratio": 0.5, "learning_rate": 2.90545902898434e-06, "loss": 1.072106957435608, "loss/kd": 1.8282427787780762, "loss/lm": 0.3159710764884949, "step": 3695 }, { "epoch": 0.7587764319441593, "grad_norm": 0.9405395595574528, "kd_ratio": 0.5, "learning_rate": 2.900773801800898e-06, "loss": 1.0678484439849854, "loss/kd": 1.7896174192428589, "loss/lm": 0.346079558134079, "step": 3696 }, { "epoch": 0.7589817285978239, "grad_norm": 1.0338864622881152, "kd_ratio": 0.5, "learning_rate": 2.8960917143377865e-06, "loss": 1.0778295993804932, "loss/kd": 1.8549927473068237, "loss/lm": 0.3006663918495178, "step": 3697 }, { "epoch": 0.7591870252514884, "grad_norm": 0.8743228842412865, "kd_ratio": 0.5, "learning_rate": 2.89141276866572e-06, "loss": 0.946922779083252, "loss/kd": 1.564697504043579, "loss/lm": 0.3291480243206024, "step": 3698 }, { "epoch": 0.7593923219051529, "grad_norm": 1.0555894017680962, "kd_ratio": 0.5, "learning_rate": 2.886736966854018e-06, "loss": 0.9060190320014954, "loss/kd": 1.4899992942810059, "loss/lm": 0.32203879952430725, "step": 3699 }, { "epoch": 0.7595976185588175, "grad_norm": 0.8067902352721883, "kd_ratio": 0.5, "learning_rate": 2.8820643109706136e-06, "loss": 0.9220213890075684, "loss/kd": 1.6058886051177979, "loss/lm": 0.23815415799617767, "step": 3700 }, { "epoch": 0.759802915212482, "grad_norm": 1.3055267789804212, "kd_ratio": 0.5, "learning_rate": 2.8773948030820488e-06, "loss": 1.2252845764160156, "loss/kd": 2.162173271179199, "loss/lm": 0.2883959710597992, "step": 3701 }, { "epoch": 0.7600082118661465, "grad_norm": 0.8672908066268281, "kd_ratio": 0.5, "learning_rate": 2.8727284452534634e-06, "loss": 0.9654828310012817, "loss/kd": 1.6077769994735718, "loss/lm": 0.3231886327266693, "step": 3702 }, { "epoch": 0.7602135085198112, "grad_norm": 1.3092140553455252, "kd_ratio": 0.5, "learning_rate": 2.8680652395486198e-06, "loss": 1.1678067445755005, "loss/kd": 2.001307725906372, "loss/lm": 0.3343057632446289, "step": 3703 }, { "epoch": 0.7604188051734757, "grad_norm": 1.0677849991964, "kd_ratio": 0.5, "learning_rate": 2.8634051880298687e-06, "loss": 0.9306001663208008, "loss/kd": 1.5163229703903198, "loss/lm": 0.34487733244895935, "step": 3704 }, { "epoch": 0.7606241018271402, "grad_norm": 1.0979094672687235, "kd_ratio": 0.5, "learning_rate": 2.8587482927581865e-06, "loss": 0.872737467288971, "loss/kd": 1.3620103597640991, "loss/lm": 0.3834645748138428, "step": 3705 }, { "epoch": 0.7608293984808048, "grad_norm": 1.3135855593010564, "kd_ratio": 0.5, "learning_rate": 2.8540945557931356e-06, "loss": 1.8942021131515503, "loss/kd": 3.5567269325256348, "loss/lm": 0.23167739808559418, "step": 3706 }, { "epoch": 0.7610346951344693, "grad_norm": 0.8304796121348267, "kd_ratio": 0.5, "learning_rate": 2.849443979192892e-06, "loss": 0.8887394666671753, "loss/kd": 1.5009907484054565, "loss/lm": 0.27648818492889404, "step": 3707 }, { "epoch": 0.7612399917881338, "grad_norm": 1.1905121912362338, "kd_ratio": 0.5, "learning_rate": 2.844796565014225e-06, "loss": 1.0074074268341064, "loss/kd": 1.6722782850265503, "loss/lm": 0.34253644943237305, "step": 3708 }, { "epoch": 0.7614452884417984, "grad_norm": 1.0110229518104303, "kd_ratio": 0.5, "learning_rate": 2.8401523153125156e-06, "loss": 0.9615508317947388, "loss/kd": 1.5538336038589478, "loss/lm": 0.3692680299282074, "step": 3709 }, { "epoch": 0.7616505850954629, "grad_norm": 1.1897515510866181, "kd_ratio": 0.5, "learning_rate": 2.835511232141739e-06, "loss": 1.1400549411773682, "loss/kd": 1.945533037185669, "loss/lm": 0.33457672595977783, "step": 3710 }, { "epoch": 0.7618558817491274, "grad_norm": 1.0125393038246842, "kd_ratio": 0.5, "learning_rate": 2.8308733175544724e-06, "loss": 1.0028610229492188, "loss/kd": 1.7276805639266968, "loss/lm": 0.27804145216941833, "step": 3711 }, { "epoch": 0.7620611784027921, "grad_norm": 1.5474013826227244, "kd_ratio": 0.5, "learning_rate": 2.8262385736018925e-06, "loss": 0.9925382137298584, "loss/kd": 1.6963015794754028, "loss/lm": 0.28877490758895874, "step": 3712 }, { "epoch": 0.7622664750564566, "grad_norm": 1.0510988286961434, "kd_ratio": 0.5, "learning_rate": 2.821607002333767e-06, "loss": 0.8450779318809509, "loss/kd": 1.3941152095794678, "loss/lm": 0.2960406243801117, "step": 3713 }, { "epoch": 0.7624717717101211, "grad_norm": 0.959097863819182, "kd_ratio": 0.5, "learning_rate": 2.8169786057984684e-06, "loss": 1.1269657611846924, "loss/kd": 1.9378962516784668, "loss/lm": 0.31603533029556274, "step": 3714 }, { "epoch": 0.7626770683637857, "grad_norm": 1.098420071743959, "kd_ratio": 0.5, "learning_rate": 2.812353386042962e-06, "loss": 1.085129976272583, "loss/kd": 1.7666149139404297, "loss/lm": 0.4036451280117035, "step": 3715 }, { "epoch": 0.7628823650174502, "grad_norm": 0.8663545512661405, "kd_ratio": 0.5, "learning_rate": 2.8077313451128107e-06, "loss": 0.8392120599746704, "loss/kd": 1.3353990316390991, "loss/lm": 0.3430250585079193, "step": 3716 }, { "epoch": 0.7630876616711147, "grad_norm": 1.0479301278864979, "kd_ratio": 0.5, "learning_rate": 2.803112485052163e-06, "loss": 1.3451241254806519, "loss/kd": 2.346123695373535, "loss/lm": 0.34412461519241333, "step": 3717 }, { "epoch": 0.7632929583247793, "grad_norm": 0.910454905957368, "kd_ratio": 0.5, "learning_rate": 2.798496807903771e-06, "loss": 1.063336730003357, "loss/kd": 1.7532646656036377, "loss/lm": 0.37340885400772095, "step": 3718 }, { "epoch": 0.7634982549784438, "grad_norm": 0.9584221634531337, "kd_ratio": 0.5, "learning_rate": 2.7938843157089734e-06, "loss": 0.9862115383148193, "loss/kd": 1.6568995714187622, "loss/lm": 0.3155234456062317, "step": 3719 }, { "epoch": 0.7637035516321083, "grad_norm": 1.0732713185375784, "kd_ratio": 0.5, "learning_rate": 2.789275010507704e-06, "loss": 0.898134708404541, "loss/kd": 1.4024604558944702, "loss/lm": 0.3938089609146118, "step": 3720 }, { "epoch": 0.763908848285773, "grad_norm": 0.9147251503763096, "kd_ratio": 0.5, "learning_rate": 2.7846688943384848e-06, "loss": 1.9040307998657227, "loss/kd": 3.604720115661621, "loss/lm": 0.20334146916866302, "step": 3721 }, { "epoch": 0.7641141449394375, "grad_norm": 0.9306941317971078, "kd_ratio": 0.5, "learning_rate": 2.7800659692384237e-06, "loss": 1.496875286102295, "loss/kd": 2.6176207065582275, "loss/lm": 0.3761298954486847, "step": 3722 }, { "epoch": 0.7643194415931021, "grad_norm": 0.9693585616661733, "kd_ratio": 0.5, "learning_rate": 2.775466237243226e-06, "loss": 1.352406620979309, "loss/kd": 2.453001022338867, "loss/lm": 0.251812219619751, "step": 3723 }, { "epoch": 0.7645247382467666, "grad_norm": 0.9171047783182003, "kd_ratio": 0.5, "learning_rate": 2.770869700387171e-06, "loss": 1.0655635595321655, "loss/kd": 1.8287310600280762, "loss/lm": 0.3023959994316101, "step": 3724 }, { "epoch": 0.7647300349004311, "grad_norm": 0.878317352252021, "kd_ratio": 0.5, "learning_rate": 2.766276360703146e-06, "loss": 0.9600582122802734, "loss/kd": 1.6131970882415771, "loss/lm": 0.3069193661212921, "step": 3725 }, { "epoch": 0.7649353315540957, "grad_norm": 1.097650518475582, "kd_ratio": 0.5, "learning_rate": 2.7616862202226047e-06, "loss": 0.9745483994483948, "loss/kd": 1.5286089181900024, "loss/lm": 0.4204879105091095, "step": 3726 }, { "epoch": 0.7651406282077602, "grad_norm": 0.958112645691759, "kd_ratio": 0.5, "learning_rate": 2.7570992809755937e-06, "loss": 1.104069471359253, "loss/kd": 1.9110482931137085, "loss/lm": 0.2970907390117645, "step": 3727 }, { "epoch": 0.7653459248614247, "grad_norm": 0.9360365121329105, "kd_ratio": 0.5, "learning_rate": 2.752515544990747e-06, "loss": 0.9777593612670898, "loss/kd": 1.6624170541763306, "loss/lm": 0.29310163855552673, "step": 3728 }, { "epoch": 0.7655512215150893, "grad_norm": 0.9430820496907723, "kd_ratio": 0.5, "learning_rate": 2.7479350142952733e-06, "loss": 0.8889173865318298, "loss/kd": 1.4782683849334717, "loss/lm": 0.299566388130188, "step": 3729 }, { "epoch": 0.7657565181687539, "grad_norm": 0.9211067923788113, "kd_ratio": 0.5, "learning_rate": 2.7433576909149717e-06, "loss": 0.8842874765396118, "loss/kd": 1.4529701471328735, "loss/lm": 0.3156047761440277, "step": 3730 }, { "epoch": 0.7659618148224184, "grad_norm": 0.8620659030400823, "kd_ratio": 0.5, "learning_rate": 2.73878357687422e-06, "loss": 1.0664916038513184, "loss/kd": 1.7427979707717896, "loss/lm": 0.39018532633781433, "step": 3731 }, { "epoch": 0.766167111476083, "grad_norm": 0.9157980037644956, "kd_ratio": 0.5, "learning_rate": 2.7342126741959785e-06, "loss": 1.1421245336532593, "loss/kd": 1.9153743982315063, "loss/lm": 0.368874728679657, "step": 3732 }, { "epoch": 0.7663724081297475, "grad_norm": 0.8952012351311562, "kd_ratio": 0.5, "learning_rate": 2.7296449849017803e-06, "loss": 1.084720492362976, "loss/kd": 1.878988265991211, "loss/lm": 0.29045265913009644, "step": 3733 }, { "epoch": 0.766577704783412, "grad_norm": 0.9594354361810096, "kd_ratio": 0.5, "learning_rate": 2.725080511011745e-06, "loss": 0.9105197191238403, "loss/kd": 1.4576215744018555, "loss/lm": 0.3634178340435028, "step": 3734 }, { "epoch": 0.7667830014370766, "grad_norm": 0.8948284503272712, "kd_ratio": 0.5, "learning_rate": 2.720519254544568e-06, "loss": 1.03215491771698, "loss/kd": 1.7138729095458984, "loss/lm": 0.35043683648109436, "step": 3735 }, { "epoch": 0.7669882980907411, "grad_norm": 0.9631493624378149, "kd_ratio": 0.5, "learning_rate": 2.7159612175175233e-06, "loss": 0.8550668954849243, "loss/kd": 1.4260926246643066, "loss/lm": 0.2840411067008972, "step": 3736 }, { "epoch": 0.7671935947444056, "grad_norm": 0.8796971466845203, "kd_ratio": 0.5, "learning_rate": 2.7114064019464524e-06, "loss": 1.202161192893982, "loss/kd": 2.054591178894043, "loss/lm": 0.3497312068939209, "step": 3737 }, { "epoch": 0.7673988913980702, "grad_norm": 1.0052084478752288, "kd_ratio": 0.5, "learning_rate": 2.7068548098457837e-06, "loss": 1.3467037677764893, "loss/kd": 2.3858022689819336, "loss/lm": 0.3076052963733673, "step": 3738 }, { "epoch": 0.7676041880517348, "grad_norm": 0.9660579896489113, "kd_ratio": 0.5, "learning_rate": 2.702306443228516e-06, "loss": 1.053141474723816, "loss/kd": 1.8223541975021362, "loss/lm": 0.28392884135246277, "step": 3739 }, { "epoch": 0.7678094847053993, "grad_norm": 1.182982490460989, "kd_ratio": 0.5, "learning_rate": 2.6977613041062133e-06, "loss": 1.245005488395691, "loss/kd": 2.1259264945983887, "loss/lm": 0.36408454179763794, "step": 3740 }, { "epoch": 0.7680147813590639, "grad_norm": 1.2760339446069449, "kd_ratio": 0.5, "learning_rate": 2.6932193944890295e-06, "loss": 1.0524485111236572, "loss/kd": 1.7162871360778809, "loss/lm": 0.38861000537872314, "step": 3741 }, { "epoch": 0.7682200780127284, "grad_norm": 0.9366035788797029, "kd_ratio": 0.5, "learning_rate": 2.6886807163856733e-06, "loss": 1.0823702812194824, "loss/kd": 1.8568638563156128, "loss/lm": 0.3078768253326416, "step": 3742 }, { "epoch": 0.7684253746663929, "grad_norm": 0.8441046147912447, "kd_ratio": 0.5, "learning_rate": 2.6841452718034343e-06, "loss": 0.9466087222099304, "loss/kd": 1.610278606414795, "loss/lm": 0.28293880820274353, "step": 3743 }, { "epoch": 0.7686306713200575, "grad_norm": 1.0418311699946627, "kd_ratio": 0.5, "learning_rate": 2.6796130627481663e-06, "loss": 0.9277374744415283, "loss/kd": 1.5084396600723267, "loss/lm": 0.34703528881073, "step": 3744 }, { "epoch": 0.768835967973722, "grad_norm": 0.8901978453042988, "kd_ratio": 0.5, "learning_rate": 2.6750840912242948e-06, "loss": 1.010207176208496, "loss/kd": 1.6837387084960938, "loss/lm": 0.3366757035255432, "step": 3745 }, { "epoch": 0.7690412646273865, "grad_norm": 1.0355136602024702, "kd_ratio": 0.5, "learning_rate": 2.6705583592348138e-06, "loss": 1.1138007640838623, "loss/kd": 1.9230151176452637, "loss/lm": 0.3045863211154938, "step": 3746 }, { "epoch": 0.7692465612810511, "grad_norm": 0.9167518731315532, "kd_ratio": 0.5, "learning_rate": 2.666035868781285e-06, "loss": 0.9967338442802429, "loss/kd": 1.5690690279006958, "loss/lm": 0.42439866065979004, "step": 3747 }, { "epoch": 0.7694518579347157, "grad_norm": 1.0630086336819973, "kd_ratio": 0.5, "learning_rate": 2.6615166218638375e-06, "loss": 0.974546492099762, "loss/kd": 1.5575250387191772, "loss/lm": 0.39156797528266907, "step": 3748 }, { "epoch": 0.7696571545883802, "grad_norm": 0.9076420703388761, "kd_ratio": 0.5, "learning_rate": 2.6570006204811595e-06, "loss": 0.9422533512115479, "loss/kd": 1.5514719486236572, "loss/lm": 0.33303478360176086, "step": 3749 }, { "epoch": 0.7698624512420448, "grad_norm": 0.9622555748135214, "kd_ratio": 0.5, "learning_rate": 2.652487866630511e-06, "loss": 1.0715633630752563, "loss/kd": 1.811891794204712, "loss/lm": 0.33123502135276794, "step": 3750 }, { "epoch": 0.7700677478957093, "grad_norm": 0.9399617881271609, "kd_ratio": 0.5, "learning_rate": 2.6479783623077105e-06, "loss": 1.1912726163864136, "loss/kd": 2.0901153087615967, "loss/lm": 0.2924298346042633, "step": 3751 }, { "epoch": 0.7702730445493738, "grad_norm": 0.9994866229664471, "kd_ratio": 0.5, "learning_rate": 2.643472109507148e-06, "loss": 1.049210548400879, "loss/kd": 1.7544569969177246, "loss/lm": 0.34396421909332275, "step": 3752 }, { "epoch": 0.7704783412030384, "grad_norm": 0.983200806942247, "kd_ratio": 0.5, "learning_rate": 2.6389691102217617e-06, "loss": 0.8805719017982483, "loss/kd": 1.4090155363082886, "loss/lm": 0.3521282374858856, "step": 3753 }, { "epoch": 0.7706836378567029, "grad_norm": 1.010061139287169, "kd_ratio": 0.5, "learning_rate": 2.634469366443063e-06, "loss": 0.9067527055740356, "loss/kd": 1.4949148893356323, "loss/lm": 0.3185904622077942, "step": 3754 }, { "epoch": 0.7708889345103674, "grad_norm": 1.0736465058754878, "kd_ratio": 0.5, "learning_rate": 2.62997288016112e-06, "loss": 0.8692988157272339, "loss/kd": 1.328741192817688, "loss/lm": 0.409856379032135, "step": 3755 }, { "epoch": 0.771094231164032, "grad_norm": 1.020639155555313, "kd_ratio": 0.5, "learning_rate": 2.6254796533645523e-06, "loss": 1.1149810552597046, "loss/kd": 1.8518931865692139, "loss/lm": 0.3780690133571625, "step": 3756 }, { "epoch": 0.7712995278176966, "grad_norm": 1.0139712769390603, "kd_ratio": 0.5, "learning_rate": 2.620989688040556e-06, "loss": 0.9737240076065063, "loss/kd": 1.6165238618850708, "loss/lm": 0.3309241831302643, "step": 3757 }, { "epoch": 0.7715048244713612, "grad_norm": 0.9987023147148775, "kd_ratio": 0.5, "learning_rate": 2.6165029861748635e-06, "loss": 0.9866715669631958, "loss/kd": 1.6768299341201782, "loss/lm": 0.29651325941085815, "step": 3758 }, { "epoch": 0.7717101211250257, "grad_norm": 0.9396944865668311, "kd_ratio": 0.5, "learning_rate": 2.6120195497517818e-06, "loss": 1.0125062465667725, "loss/kd": 1.7202961444854736, "loss/lm": 0.30471646785736084, "step": 3759 }, { "epoch": 0.7719154177786902, "grad_norm": 1.0156394335769876, "kd_ratio": 0.5, "learning_rate": 2.6075393807541593e-06, "loss": 0.9438452124595642, "loss/kd": 1.5888760089874268, "loss/lm": 0.29881441593170166, "step": 3760 }, { "epoch": 0.7721207144323547, "grad_norm": 0.9150847854001358, "kd_ratio": 0.5, "learning_rate": 2.6030624811634086e-06, "loss": 1.1286046504974365, "loss/kd": 1.9537492990493774, "loss/lm": 0.30345988273620605, "step": 3761 }, { "epoch": 0.7723260110860193, "grad_norm": 1.0052943173044182, "kd_ratio": 0.5, "learning_rate": 2.5985888529594937e-06, "loss": 1.0818297863006592, "loss/kd": 1.8118482828140259, "loss/lm": 0.3518112003803253, "step": 3762 }, { "epoch": 0.7725313077396838, "grad_norm": 0.8965156107618112, "kd_ratio": 0.5, "learning_rate": 2.5941184981209354e-06, "loss": 1.0241222381591797, "loss/kd": 1.7546875476837158, "loss/lm": 0.29355692863464355, "step": 3763 }, { "epoch": 0.7727366043933483, "grad_norm": 0.9818454603708848, "kd_ratio": 0.5, "learning_rate": 2.589651418624798e-06, "loss": 1.0700424909591675, "loss/kd": 1.799342155456543, "loss/lm": 0.3407428562641144, "step": 3764 }, { "epoch": 0.7729419010470129, "grad_norm": 1.0231651454649593, "kd_ratio": 0.5, "learning_rate": 2.585187616446704e-06, "loss": 1.312927007675171, "loss/kd": 2.183458089828491, "loss/lm": 0.44239580631256104, "step": 3765 }, { "epoch": 0.7731471977006775, "grad_norm": 1.2058120321111154, "kd_ratio": 0.5, "learning_rate": 2.5807270935608244e-06, "loss": 0.9917817115783691, "loss/kd": 1.6495431661605835, "loss/lm": 0.3340202867984772, "step": 3766 }, { "epoch": 0.7733524943543421, "grad_norm": 0.9881237352715564, "kd_ratio": 0.5, "learning_rate": 2.5762698519398832e-06, "loss": 1.1755356788635254, "loss/kd": 2.0332956314086914, "loss/lm": 0.317775696516037, "step": 3767 }, { "epoch": 0.7735577910080066, "grad_norm": 1.0790223518294535, "kd_ratio": 0.5, "learning_rate": 2.5718158935551505e-06, "loss": 0.962177038192749, "loss/kd": 1.5694092512130737, "loss/lm": 0.3549448251724243, "step": 3768 }, { "epoch": 0.7737630876616711, "grad_norm": 1.0066178486444135, "kd_ratio": 0.5, "learning_rate": 2.567365220376441e-06, "loss": 0.9431855082511902, "loss/kd": 1.5720534324645996, "loss/lm": 0.31431761384010315, "step": 3769 }, { "epoch": 0.7739683843153357, "grad_norm": 1.0844437684215325, "kd_ratio": 0.5, "learning_rate": 2.562917834372123e-06, "loss": 1.1637506484985352, "loss/kd": 2.0239064693450928, "loss/lm": 0.3035949468612671, "step": 3770 }, { "epoch": 0.7741736809690002, "grad_norm": 0.9526540397853261, "kd_ratio": 0.5, "learning_rate": 2.5584737375091016e-06, "loss": 1.2833929061889648, "loss/kd": 2.2417681217193604, "loss/lm": 0.32501769065856934, "step": 3771 }, { "epoch": 0.7743789776226647, "grad_norm": 0.9643646664697005, "kd_ratio": 0.5, "learning_rate": 2.5540329317528435e-06, "loss": 1.108905553817749, "loss/kd": 1.8721370697021484, "loss/lm": 0.34567391872406006, "step": 3772 }, { "epoch": 0.7745842742763293, "grad_norm": 1.0382151295867503, "kd_ratio": 0.5, "learning_rate": 2.549595419067341e-06, "loss": 1.151583194732666, "loss/kd": 1.953810214996338, "loss/lm": 0.3493562340736389, "step": 3773 }, { "epoch": 0.7747895709299938, "grad_norm": 0.9814769101381241, "kd_ratio": 0.5, "learning_rate": 2.5451612014151427e-06, "loss": 1.003096103668213, "loss/kd": 1.6716389656066895, "loss/lm": 0.3345533609390259, "step": 3774 }, { "epoch": 0.7749948675836584, "grad_norm": 0.9537209082446501, "kd_ratio": 0.5, "learning_rate": 2.5407302807573387e-06, "loss": 0.9893261790275574, "loss/kd": 1.6750903129577637, "loss/lm": 0.3035620152950287, "step": 3775 }, { "epoch": 0.775200164237323, "grad_norm": 0.9596456088544665, "kd_ratio": 0.5, "learning_rate": 2.5363026590535488e-06, "loss": 1.0350733995437622, "loss/kd": 1.7238065004348755, "loss/lm": 0.3463403880596161, "step": 3776 }, { "epoch": 0.7754054608909875, "grad_norm": 0.9559088726480518, "kd_ratio": 0.5, "learning_rate": 2.531878338261956e-06, "loss": 0.9627191424369812, "loss/kd": 1.5401849746704102, "loss/lm": 0.38525334000587463, "step": 3777 }, { "epoch": 0.775610757544652, "grad_norm": 1.0475392141988062, "kd_ratio": 0.5, "learning_rate": 2.527457320339262e-06, "loss": 0.942284882068634, "loss/kd": 1.6107079982757568, "loss/lm": 0.27386173605918884, "step": 3778 }, { "epoch": 0.7758160541983166, "grad_norm": 1.099639256150612, "kd_ratio": 0.5, "learning_rate": 2.5230396072407204e-06, "loss": 0.9363612532615662, "loss/kd": 1.5395876169204712, "loss/lm": 0.3331349194049835, "step": 3779 }, { "epoch": 0.7760213508519811, "grad_norm": 0.8831303361744732, "kd_ratio": 0.5, "learning_rate": 2.5186252009201152e-06, "loss": 0.7532070875167847, "loss/kd": 1.2379670143127441, "loss/lm": 0.2684471011161804, "step": 3780 }, { "epoch": 0.7762266475056456, "grad_norm": 0.8717159623135441, "kd_ratio": 0.5, "learning_rate": 2.514214103329775e-06, "loss": 0.9849082827568054, "loss/kd": 1.6461328268051147, "loss/lm": 0.3236837089061737, "step": 3781 }, { "epoch": 0.7764319441593102, "grad_norm": 1.0077832074591766, "kd_ratio": 0.5, "learning_rate": 2.509806316420562e-06, "loss": 1.1988115310668945, "loss/kd": 2.054368734359741, "loss/lm": 0.3432544469833374, "step": 3782 }, { "epoch": 0.7766372408129747, "grad_norm": 1.1835400592239023, "kd_ratio": 0.5, "learning_rate": 2.5054018421418737e-06, "loss": 1.0024747848510742, "loss/kd": 1.6446961164474487, "loss/lm": 0.36025339365005493, "step": 3783 }, { "epoch": 0.7768425374666393, "grad_norm": 0.9868046066173136, "kd_ratio": 0.5, "learning_rate": 2.501000682441647e-06, "loss": 0.8809870481491089, "loss/kd": 1.446029782295227, "loss/lm": 0.3159443140029907, "step": 3784 }, { "epoch": 0.7770478341203039, "grad_norm": 0.9872611814509819, "kd_ratio": 0.5, "learning_rate": 2.496602839266342e-06, "loss": 1.060300350189209, "loss/kd": 1.8133103847503662, "loss/lm": 0.30729028582572937, "step": 3785 }, { "epoch": 0.7772531307739684, "grad_norm": 0.9443180849208591, "kd_ratio": 0.5, "learning_rate": 2.4922083145609653e-06, "loss": 0.9958336353302002, "loss/kd": 1.6382637023925781, "loss/lm": 0.35340356826782227, "step": 3786 }, { "epoch": 0.7774584274276329, "grad_norm": 0.8479273074717402, "kd_ratio": 0.5, "learning_rate": 2.487817110269042e-06, "loss": 1.2364073991775513, "loss/kd": 2.06670880317688, "loss/lm": 0.4061059355735779, "step": 3787 }, { "epoch": 0.7776637240812975, "grad_norm": 0.9815747862331213, "kd_ratio": 0.5, "learning_rate": 2.4834292283326467e-06, "loss": 0.7687749266624451, "loss/kd": 1.2529672384262085, "loss/lm": 0.28458261489868164, "step": 3788 }, { "epoch": 0.777869020734962, "grad_norm": 0.9445655206802754, "kd_ratio": 0.5, "learning_rate": 2.4790446706923664e-06, "loss": 1.093273401260376, "loss/kd": 1.86375892162323, "loss/lm": 0.32278794050216675, "step": 3789 }, { "epoch": 0.7780743173886265, "grad_norm": 1.0461874774625828, "kd_ratio": 0.5, "learning_rate": 2.4746634392873282e-06, "loss": 0.9462755918502808, "loss/kd": 1.541595220565796, "loss/lm": 0.350955992937088, "step": 3790 }, { "epoch": 0.7782796140422911, "grad_norm": 0.8203051124382353, "kd_ratio": 0.5, "learning_rate": 2.470285536055188e-06, "loss": 1.7879921197891235, "loss/kd": 3.2754781246185303, "loss/lm": 0.300506055355072, "step": 3791 }, { "epoch": 0.7784849106959556, "grad_norm": 1.1104442853610004, "kd_ratio": 0.5, "learning_rate": 2.465910962932123e-06, "loss": 1.0638946294784546, "loss/kd": 1.7908939123153687, "loss/lm": 0.3368953764438629, "step": 3792 }, { "epoch": 0.7786902073496202, "grad_norm": 0.8984744348053701, "kd_ratio": 0.5, "learning_rate": 2.461539721852845e-06, "loss": 1.0001239776611328, "loss/kd": 1.7311640977859497, "loss/lm": 0.26908397674560547, "step": 3793 }, { "epoch": 0.7788955040032848, "grad_norm": 0.9407800982037365, "kd_ratio": 0.5, "learning_rate": 2.4571718147505872e-06, "loss": 0.9358602166175842, "loss/kd": 1.518141746520996, "loss/lm": 0.35357871651649475, "step": 3794 }, { "epoch": 0.7791008006569493, "grad_norm": 0.9218833137340715, "kd_ratio": 0.5, "learning_rate": 2.4528072435571158e-06, "loss": 0.9042482972145081, "loss/kd": 1.432594656944275, "loss/lm": 0.3759019076824188, "step": 3795 }, { "epoch": 0.7793060973106138, "grad_norm": 1.0011179761049058, "kd_ratio": 0.5, "learning_rate": 2.4484460102027098e-06, "loss": 0.989369809627533, "loss/kd": 1.677995204925537, "loss/lm": 0.3007444441318512, "step": 3796 }, { "epoch": 0.7795113939642784, "grad_norm": 0.996075560519428, "kd_ratio": 0.5, "learning_rate": 2.44408811661618e-06, "loss": 0.9220794439315796, "loss/kd": 1.6031219959259033, "loss/lm": 0.24103695154190063, "step": 3797 }, { "epoch": 0.7797166906179429, "grad_norm": 0.828052481369783, "kd_ratio": 0.5, "learning_rate": 2.4397335647248598e-06, "loss": 0.8619629740715027, "loss/kd": 1.440755009651184, "loss/lm": 0.2831709384918213, "step": 3798 }, { "epoch": 0.7799219872716074, "grad_norm": 1.0336571185916998, "kd_ratio": 0.5, "learning_rate": 2.4353823564546064e-06, "loss": 1.010250210762024, "loss/kd": 1.6809357404708862, "loss/lm": 0.3395647704601288, "step": 3799 }, { "epoch": 0.780127283925272, "grad_norm": 0.9841889669405438, "kd_ratio": 0.5, "learning_rate": 2.4310344937297894e-06, "loss": 0.8704145550727844, "loss/kd": 1.454591155052185, "loss/lm": 0.2862379848957062, "step": 3800 }, { "epoch": 0.7803325805789366, "grad_norm": 0.963068268324721, "kd_ratio": 0.5, "learning_rate": 2.426689978473308e-06, "loss": 1.3003989458084106, "loss/kd": 2.193152666091919, "loss/lm": 0.4076453149318695, "step": 3801 }, { "epoch": 0.7805378772326012, "grad_norm": 0.9246057913284282, "kd_ratio": 0.5, "learning_rate": 2.4223488126065752e-06, "loss": 1.129142165184021, "loss/kd": 1.9279741048812866, "loss/lm": 0.33031031489372253, "step": 3802 }, { "epoch": 0.7807431738862657, "grad_norm": 0.8420808174202724, "kd_ratio": 0.5, "learning_rate": 2.4180109980495293e-06, "loss": 1.0620741844177246, "loss/kd": 1.7013766765594482, "loss/lm": 0.4227718114852905, "step": 3803 }, { "epoch": 0.7809484705399302, "grad_norm": 0.9011199216591826, "kd_ratio": 0.5, "learning_rate": 2.4136765367206216e-06, "loss": 1.0056793689727783, "loss/kd": 1.6713957786560059, "loss/lm": 0.33996284008026123, "step": 3804 }, { "epoch": 0.7811537671935948, "grad_norm": 0.9748664422223, "kd_ratio": 0.5, "learning_rate": 2.409345430536817e-06, "loss": 1.0830081701278687, "loss/kd": 1.7833174467086792, "loss/lm": 0.3826988935470581, "step": 3805 }, { "epoch": 0.7813590638472593, "grad_norm": 0.8413565218699662, "kd_ratio": 0.5, "learning_rate": 2.405017681413605e-06, "loss": 0.81944739818573, "loss/kd": 1.3761979341506958, "loss/lm": 0.2626968324184418, "step": 3806 }, { "epoch": 0.7815643605009238, "grad_norm": 0.8955367215074956, "kd_ratio": 0.5, "learning_rate": 2.4006932912649816e-06, "loss": 1.0656895637512207, "loss/kd": 1.8204114437103271, "loss/lm": 0.3109678030014038, "step": 3807 }, { "epoch": 0.7817696571545883, "grad_norm": 1.0151322194860204, "kd_ratio": 0.5, "learning_rate": 2.396372262003465e-06, "loss": 0.9584739804267883, "loss/kd": 1.5785483121871948, "loss/lm": 0.3383996784687042, "step": 3808 }, { "epoch": 0.7819749538082529, "grad_norm": 0.9342873283399425, "kd_ratio": 0.5, "learning_rate": 2.392054595540083e-06, "loss": 1.0833951234817505, "loss/kd": 1.774629831314087, "loss/lm": 0.39216047525405884, "step": 3809 }, { "epoch": 0.7821802504619175, "grad_norm": 0.8651576467849567, "kd_ratio": 0.5, "learning_rate": 2.3877402937843753e-06, "loss": 1.455018162727356, "loss/kd": 2.5435686111450195, "loss/lm": 0.3664677143096924, "step": 3810 }, { "epoch": 0.7823855471155821, "grad_norm": 0.8777899975263808, "kd_ratio": 0.5, "learning_rate": 2.3834293586444e-06, "loss": 0.8405467867851257, "loss/kd": 1.3738958835601807, "loss/lm": 0.3071977198123932, "step": 3811 }, { "epoch": 0.7825908437692466, "grad_norm": 1.2677956766661733, "kd_ratio": 0.5, "learning_rate": 2.3791217920267163e-06, "loss": 0.912255585193634, "loss/kd": 1.48911452293396, "loss/lm": 0.3353966772556305, "step": 3812 }, { "epoch": 0.7827961404229111, "grad_norm": 0.8505824030188247, "kd_ratio": 0.5, "learning_rate": 2.3748175958363985e-06, "loss": 1.194890022277832, "loss/kd": 2.084016799926758, "loss/lm": 0.3057633638381958, "step": 3813 }, { "epoch": 0.7830014370765757, "grad_norm": 0.8866317720637421, "kd_ratio": 0.5, "learning_rate": 2.370516771977033e-06, "loss": 0.8649992942810059, "loss/kd": 1.3842023611068726, "loss/lm": 0.34579628705978394, "step": 3814 }, { "epoch": 0.7832067337302402, "grad_norm": 0.9639770219685172, "kd_ratio": 0.5, "learning_rate": 2.3662193223507135e-06, "loss": 0.8385199904441833, "loss/kd": 1.3290225267410278, "loss/lm": 0.34801745414733887, "step": 3815 }, { "epoch": 0.7834120303839047, "grad_norm": 0.8073931082950416, "kd_ratio": 0.5, "learning_rate": 2.3619252488580345e-06, "loss": 0.9039512276649475, "loss/kd": 1.5580037832260132, "loss/lm": 0.24989870190620422, "step": 3816 }, { "epoch": 0.7836173270375693, "grad_norm": 0.8437457637116801, "kd_ratio": 0.5, "learning_rate": 2.357634553398108e-06, "loss": 1.1433874368667603, "loss/kd": 1.8959660530090332, "loss/lm": 0.3908088505268097, "step": 3817 }, { "epoch": 0.7838226236912338, "grad_norm": 0.8948239824264836, "kd_ratio": 0.5, "learning_rate": 2.353347237868544e-06, "loss": 1.0783143043518066, "loss/kd": 1.7574174404144287, "loss/lm": 0.3992110788822174, "step": 3818 }, { "epoch": 0.7840279203448984, "grad_norm": 0.9251253145501583, "kd_ratio": 0.5, "learning_rate": 2.349063304165462e-06, "loss": 1.297791600227356, "loss/kd": 2.3047049045562744, "loss/lm": 0.2908783257007599, "step": 3819 }, { "epoch": 0.784233216998563, "grad_norm": 0.9635995172733264, "kd_ratio": 0.5, "learning_rate": 2.344782754183488e-06, "loss": 0.9989006519317627, "loss/kd": 1.6861152648925781, "loss/lm": 0.31168603897094727, "step": 3820 }, { "epoch": 0.7844385136522275, "grad_norm": 1.012407175674435, "kd_ratio": 0.5, "learning_rate": 2.3405055898157416e-06, "loss": 1.1407866477966309, "loss/kd": 1.8670451641082764, "loss/lm": 0.4145281910896301, "step": 3821 }, { "epoch": 0.784643810305892, "grad_norm": 1.0735423862612161, "kd_ratio": 0.5, "learning_rate": 2.3362318129538573e-06, "loss": 1.0710309743881226, "loss/kd": 1.8262816667556763, "loss/lm": 0.31578028202056885, "step": 3822 }, { "epoch": 0.7848491069595566, "grad_norm": 1.0763053852291735, "kd_ratio": 0.5, "learning_rate": 2.331961425487956e-06, "loss": 1.180593729019165, "loss/kd": 2.074904203414917, "loss/lm": 0.28628334403038025, "step": 3823 }, { "epoch": 0.7850544036132211, "grad_norm": 0.9119325012973196, "kd_ratio": 0.5, "learning_rate": 2.327694429306683e-06, "loss": 0.8634946346282959, "loss/kd": 1.4392472505569458, "loss/lm": 0.2877420485019684, "step": 3824 }, { "epoch": 0.7852597002668856, "grad_norm": 0.9478101276378347, "kd_ratio": 0.5, "learning_rate": 2.3234308262971594e-06, "loss": 0.9891906976699829, "loss/kd": 1.6264450550079346, "loss/lm": 0.35193637013435364, "step": 3825 }, { "epoch": 0.7854649969205502, "grad_norm": 0.9534950505039951, "kd_ratio": 0.5, "learning_rate": 2.3191706183450225e-06, "loss": 1.3889341354370117, "loss/kd": 2.473090410232544, "loss/lm": 0.30477777123451233, "step": 3826 }, { "epoch": 0.7856702935742147, "grad_norm": 1.0160194950996546, "kd_ratio": 0.5, "learning_rate": 2.3149138073343958e-06, "loss": 1.0157873630523682, "loss/kd": 1.6384400129318237, "loss/lm": 0.393134742975235, "step": 3827 }, { "epoch": 0.7858755902278793, "grad_norm": 0.8492919820215226, "kd_ratio": 0.5, "learning_rate": 2.31066039514791e-06, "loss": 0.9448274970054626, "loss/kd": 1.558674693107605, "loss/lm": 0.3309803009033203, "step": 3828 }, { "epoch": 0.7860808868815439, "grad_norm": 0.961931018889164, "kd_ratio": 0.5, "learning_rate": 2.3064103836666896e-06, "loss": 1.1587016582489014, "loss/kd": 1.9313617944717407, "loss/lm": 0.3860415816307068, "step": 3829 }, { "epoch": 0.7862861835352084, "grad_norm": 1.0661418594402494, "kd_ratio": 0.5, "learning_rate": 2.302163774770353e-06, "loss": 0.9311894774436951, "loss/kd": 1.5411268472671509, "loss/lm": 0.32125207781791687, "step": 3830 }, { "epoch": 0.7864914801888729, "grad_norm": 2.036921287959589, "kd_ratio": 0.5, "learning_rate": 2.297920570337019e-06, "loss": 0.8980609774589539, "loss/kd": 1.5033998489379883, "loss/lm": 0.29272210597991943, "step": 3831 }, { "epoch": 0.7866967768425375, "grad_norm": 0.8987765455347948, "kd_ratio": 0.5, "learning_rate": 2.293680772243292e-06, "loss": 1.0239795446395874, "loss/kd": 1.7101243734359741, "loss/lm": 0.3378346562385559, "step": 3832 }, { "epoch": 0.786902073496202, "grad_norm": 1.0265629859156447, "kd_ratio": 0.5, "learning_rate": 2.2894443823642787e-06, "loss": 1.061751127243042, "loss/kd": 1.7340549230575562, "loss/lm": 0.3894473910331726, "step": 3833 }, { "epoch": 0.7871073701498665, "grad_norm": 1.2845424929631282, "kd_ratio": 0.5, "learning_rate": 2.285211402573568e-06, "loss": 0.9512664079666138, "loss/kd": 1.5251840353012085, "loss/lm": 0.37734872102737427, "step": 3834 }, { "epoch": 0.7873126668035311, "grad_norm": 0.8381610261586049, "kd_ratio": 0.5, "learning_rate": 2.2809818347432598e-06, "loss": 1.2168464660644531, "loss/kd": 2.072154998779297, "loss/lm": 0.36153796315193176, "step": 3835 }, { "epoch": 0.7875179634571956, "grad_norm": 0.9181037262550197, "kd_ratio": 0.5, "learning_rate": 2.2767556807439216e-06, "loss": 1.0523595809936523, "loss/kd": 1.7520933151245117, "loss/lm": 0.35262590646743774, "step": 3836 }, { "epoch": 0.7877232601108602, "grad_norm": 1.367730042227559, "kd_ratio": 0.5, "learning_rate": 2.2725329424446263e-06, "loss": 1.3505922555923462, "loss/kd": 2.386608839035034, "loss/lm": 0.3145756125450134, "step": 3837 }, { "epoch": 0.7879285567645248, "grad_norm": 1.0822178459225373, "kd_ratio": 0.5, "learning_rate": 2.268313621712934e-06, "loss": 1.2523283958435059, "loss/kd": 2.2138328552246094, "loss/lm": 0.29082396626472473, "step": 3838 }, { "epoch": 0.7881338534181893, "grad_norm": 0.9088892771245437, "kd_ratio": 0.5, "learning_rate": 2.2640977204148838e-06, "loss": 0.9518059492111206, "loss/kd": 1.5821114778518677, "loss/lm": 0.32150036096572876, "step": 3839 }, { "epoch": 0.7883391500718538, "grad_norm": 0.945766152022627, "kd_ratio": 0.5, "learning_rate": 2.2598852404150207e-06, "loss": 1.3829200267791748, "loss/kd": 2.42067551612854, "loss/lm": 0.34516441822052, "step": 3840 }, { "epoch": 0.7885444467255184, "grad_norm": 1.2123591886868368, "kd_ratio": 0.5, "learning_rate": 2.2556761835763576e-06, "loss": 0.8233312964439392, "loss/kd": 1.333724021911621, "loss/lm": 0.3129386007785797, "step": 3841 }, { "epoch": 0.7887497433791829, "grad_norm": 0.928518119753925, "kd_ratio": 0.5, "learning_rate": 2.2514705517604073e-06, "loss": 1.1246535778045654, "loss/kd": 1.8671395778656006, "loss/lm": 0.3821676969528198, "step": 3842 }, { "epoch": 0.7889550400328474, "grad_norm": 0.9110955103947467, "kd_ratio": 0.5, "learning_rate": 2.2472683468271584e-06, "loss": 0.9496622085571289, "loss/kd": 1.535451054573059, "loss/lm": 0.3638733923435211, "step": 3843 }, { "epoch": 0.789160336686512, "grad_norm": 0.9904048647400742, "kd_ratio": 0.5, "learning_rate": 2.243069570635088e-06, "loss": 0.9885167479515076, "loss/kd": 1.623958945274353, "loss/lm": 0.3530745208263397, "step": 3844 }, { "epoch": 0.7893656333401765, "grad_norm": 1.1127567774673985, "kd_ratio": 0.5, "learning_rate": 2.2388742250411576e-06, "loss": 1.0566787719726562, "loss/kd": 1.7641633749008179, "loss/lm": 0.34919407963752747, "step": 3845 }, { "epoch": 0.7895709299938412, "grad_norm": 0.9219110386114022, "kd_ratio": 0.5, "learning_rate": 2.234682311900812e-06, "loss": 0.8242051005363464, "loss/kd": 1.2021242380142212, "loss/lm": 0.4462859630584717, "step": 3846 }, { "epoch": 0.7897762266475057, "grad_norm": 1.0401160683533663, "kd_ratio": 0.5, "learning_rate": 2.230493833067977e-06, "loss": 0.9929932355880737, "loss/kd": 1.686708688735962, "loss/lm": 0.2992778420448303, "step": 3847 }, { "epoch": 0.7899815233011702, "grad_norm": 1.267901377648149, "kd_ratio": 0.5, "learning_rate": 2.226308790395054e-06, "loss": 0.9154247045516968, "loss/kd": 1.5427117347717285, "loss/lm": 0.2881377041339874, "step": 3848 }, { "epoch": 0.7901868199548348, "grad_norm": 0.9283318702921989, "kd_ratio": 0.5, "learning_rate": 2.222127185732934e-06, "loss": 1.0082036256790161, "loss/kd": 1.733919620513916, "loss/lm": 0.2824876010417938, "step": 3849 }, { "epoch": 0.7903921166084993, "grad_norm": 1.3390011646421063, "kd_ratio": 0.5, "learning_rate": 2.2179490209309827e-06, "loss": 1.1630768775939941, "loss/kd": 2.0396294593811035, "loss/lm": 0.2865242660045624, "step": 3850 }, { "epoch": 0.7905974132621638, "grad_norm": 0.9264893513021872, "kd_ratio": 0.5, "learning_rate": 2.213774297837047e-06, "loss": 0.8649316430091858, "loss/kd": 1.429194450378418, "loss/lm": 0.3006688058376312, "step": 3851 }, { "epoch": 0.7908027099158284, "grad_norm": 0.9686614556497859, "kd_ratio": 0.5, "learning_rate": 2.209603018297444e-06, "loss": 0.8229360580444336, "loss/kd": 1.2939294576644897, "loss/lm": 0.35194268822669983, "step": 3852 }, { "epoch": 0.7910080065694929, "grad_norm": 1.3292225916017013, "kd_ratio": 0.5, "learning_rate": 2.2054351841569787e-06, "loss": 1.0691465139389038, "loss/kd": 1.816994071006775, "loss/lm": 0.3212989568710327, "step": 3853 }, { "epoch": 0.7912133032231574, "grad_norm": 0.858946568141523, "kd_ratio": 0.5, "learning_rate": 2.201270797258921e-06, "loss": 0.995486319065094, "loss/kd": 1.67929208278656, "loss/lm": 0.31168052554130554, "step": 3854 }, { "epoch": 0.7914185998768221, "grad_norm": 1.0923923036821692, "kd_ratio": 0.5, "learning_rate": 2.1971098594450315e-06, "loss": 0.790503740310669, "loss/kd": 1.3016695976257324, "loss/lm": 0.2793378233909607, "step": 3855 }, { "epoch": 0.7916238965304866, "grad_norm": 0.8586763202971991, "kd_ratio": 0.5, "learning_rate": 2.192952372555528e-06, "loss": 1.5110564231872559, "loss/kd": 2.767084836959839, "loss/lm": 0.25502797961235046, "step": 3856 }, { "epoch": 0.7918291931841511, "grad_norm": 0.9057972693047188, "kd_ratio": 0.5, "learning_rate": 2.1887983384291143e-06, "loss": 0.9646825194358826, "loss/kd": 1.6163671016693115, "loss/lm": 0.3129979074001312, "step": 3857 }, { "epoch": 0.7920344898378157, "grad_norm": 1.0314766275135903, "kd_ratio": 0.5, "learning_rate": 2.184647758902966e-06, "loss": 1.0415548086166382, "loss/kd": 1.7126460075378418, "loss/lm": 0.3704635798931122, "step": 3858 }, { "epoch": 0.7922397864914802, "grad_norm": 0.8819751571730715, "kd_ratio": 0.5, "learning_rate": 2.1805006358127213e-06, "loss": 0.9917873740196228, "loss/kd": 1.6621134281158447, "loss/lm": 0.3214612901210785, "step": 3859 }, { "epoch": 0.7924450831451447, "grad_norm": 0.8739156819644541, "kd_ratio": 0.5, "learning_rate": 2.176356970992499e-06, "loss": 0.9034912586212158, "loss/kd": 1.4274578094482422, "loss/lm": 0.37952473759651184, "step": 3860 }, { "epoch": 0.7926503797988093, "grad_norm": 0.9540950056230055, "kd_ratio": 0.5, "learning_rate": 2.1722167662748874e-06, "loss": 0.9285184741020203, "loss/kd": 1.4740660190582275, "loss/lm": 0.382970929145813, "step": 3861 }, { "epoch": 0.7928556764524738, "grad_norm": 0.9282418529481883, "kd_ratio": 0.5, "learning_rate": 2.168080023490946e-06, "loss": 1.05182683467865, "loss/kd": 1.830735683441162, "loss/lm": 0.27291789650917053, "step": 3862 }, { "epoch": 0.7930609731061383, "grad_norm": 1.4542104645269218, "kd_ratio": 0.5, "learning_rate": 2.1639467444701934e-06, "loss": 1.083946704864502, "loss/kd": 1.848448634147644, "loss/lm": 0.3194446563720703, "step": 3863 }, { "epoch": 0.793266269759803, "grad_norm": 0.9877969188551593, "kd_ratio": 0.5, "learning_rate": 2.159816931040627e-06, "loss": 0.986527144908905, "loss/kd": 1.658751130104065, "loss/lm": 0.3143031597137451, "step": 3864 }, { "epoch": 0.7934715664134675, "grad_norm": 1.0209153837224236, "kd_ratio": 0.5, "learning_rate": 2.155690585028707e-06, "loss": 1.090468168258667, "loss/kd": 1.8414417505264282, "loss/lm": 0.3394946753978729, "step": 3865 }, { "epoch": 0.793676863067132, "grad_norm": 1.1007800077134486, "kd_ratio": 0.5, "learning_rate": 2.151567708259361e-06, "loss": 1.0815486907958984, "loss/kd": 1.8349238634109497, "loss/lm": 0.3281736373901367, "step": 3866 }, { "epoch": 0.7938821597207966, "grad_norm": 0.9150845708129975, "kd_ratio": 0.5, "learning_rate": 2.1474483025559857e-06, "loss": 0.9001024961471558, "loss/kd": 1.4324058294296265, "loss/lm": 0.36779913306236267, "step": 3867 }, { "epoch": 0.7940874563744611, "grad_norm": 1.064566946672108, "kd_ratio": 0.5, "learning_rate": 2.1433323697404316e-06, "loss": 1.0053069591522217, "loss/kd": 1.7000881433486938, "loss/lm": 0.31052571535110474, "step": 3868 }, { "epoch": 0.7942927530281256, "grad_norm": 1.057382299585766, "kd_ratio": 0.5, "learning_rate": 2.1392199116330293e-06, "loss": 0.9968411326408386, "loss/kd": 1.640885591506958, "loss/lm": 0.35279664397239685, "step": 3869 }, { "epoch": 0.7944980496817902, "grad_norm": 0.8986158273884003, "kd_ratio": 0.5, "learning_rate": 2.135110930052553e-06, "loss": 0.9364175200462341, "loss/kd": 1.568838357925415, "loss/lm": 0.3039967119693756, "step": 3870 }, { "epoch": 0.7947033463354547, "grad_norm": 0.952642014483168, "kd_ratio": 0.5, "learning_rate": 2.1310054268162628e-06, "loss": 0.8013639450073242, "loss/kd": 1.3213047981262207, "loss/lm": 0.28142309188842773, "step": 3871 }, { "epoch": 0.7949086429891192, "grad_norm": 0.8903931410965977, "kd_ratio": 0.5, "learning_rate": 2.1269034037398596e-06, "loss": 0.8904743790626526, "loss/kd": 1.4532026052474976, "loss/lm": 0.32774612307548523, "step": 3872 }, { "epoch": 0.7951139396427839, "grad_norm": 1.0169199990996767, "kd_ratio": 0.5, "learning_rate": 2.1228048626375163e-06, "loss": 1.0720908641815186, "loss/kd": 1.778904914855957, "loss/lm": 0.36527693271636963, "step": 3873 }, { "epoch": 0.7953192362964484, "grad_norm": 0.9730849570655004, "kd_ratio": 0.5, "learning_rate": 2.118709805321867e-06, "loss": 0.9201753735542297, "loss/kd": 1.543081521987915, "loss/lm": 0.29726919531822205, "step": 3874 }, { "epoch": 0.7955245329501129, "grad_norm": 1.5284144746597716, "kd_ratio": 0.5, "learning_rate": 2.114618233603992e-06, "loss": 1.0138814449310303, "loss/kd": 1.7052650451660156, "loss/lm": 0.32249781489372253, "step": 3875 }, { "epoch": 0.7957298296037775, "grad_norm": 0.9610338449005954, "kd_ratio": 0.5, "learning_rate": 2.1105301492934503e-06, "loss": 0.8057247996330261, "loss/kd": 1.3441754579544067, "loss/lm": 0.2672741711139679, "step": 3876 }, { "epoch": 0.795935126257442, "grad_norm": 0.975538079293864, "kd_ratio": 0.5, "learning_rate": 2.1064455541982397e-06, "loss": 1.8725907802581787, "loss/kd": 3.4360878467559814, "loss/lm": 0.309093713760376, "step": 3877 }, { "epoch": 0.7961404229111065, "grad_norm": 0.8650389772842633, "kd_ratio": 0.5, "learning_rate": 2.1023644501248285e-06, "loss": 0.9936087131500244, "loss/kd": 1.6756144762039185, "loss/lm": 0.311602920293808, "step": 3878 }, { "epoch": 0.7963457195647711, "grad_norm": 0.9413257452383091, "kd_ratio": 0.5, "learning_rate": 2.0982868388781286e-06, "loss": 0.9991393089294434, "loss/kd": 1.7091535329818726, "loss/lm": 0.2891250550746918, "step": 3879 }, { "epoch": 0.7965510162184356, "grad_norm": 0.9907498484528208, "kd_ratio": 0.5, "learning_rate": 2.0942127222615184e-06, "loss": 0.9699832201004028, "loss/kd": 1.6221662759780884, "loss/lm": 0.3178001940250397, "step": 3880 }, { "epoch": 0.7967563128721001, "grad_norm": 1.0697494404003003, "kd_ratio": 0.5, "learning_rate": 2.090142102076825e-06, "loss": 0.8470137715339661, "loss/kd": 1.366396427154541, "loss/lm": 0.3276311159133911, "step": 3881 }, { "epoch": 0.7969616095257648, "grad_norm": 0.8182417041158947, "kd_ratio": 0.5, "learning_rate": 2.086074980124334e-06, "loss": 1.13111412525177, "loss/kd": 1.8723020553588867, "loss/lm": 0.3899262249469757, "step": 3882 }, { "epoch": 0.7971669061794293, "grad_norm": 0.8634075012775386, "kd_ratio": 0.5, "learning_rate": 2.0820113582027734e-06, "loss": 1.5915155410766602, "loss/kd": 2.8052287101745605, "loss/lm": 0.377802312374115, "step": 3883 }, { "epoch": 0.7973722028330938, "grad_norm": 0.898668214205961, "kd_ratio": 0.5, "learning_rate": 2.0779512381093348e-06, "loss": 1.0917201042175293, "loss/kd": 1.8218432664871216, "loss/lm": 0.361596941947937, "step": 3884 }, { "epoch": 0.7975774994867584, "grad_norm": 0.945877350628883, "kd_ratio": 0.5, "learning_rate": 2.0738946216396573e-06, "loss": 1.1337648630142212, "loss/kd": 1.927751898765564, "loss/lm": 0.3397778272628784, "step": 3885 }, { "epoch": 0.7977827961404229, "grad_norm": 0.9363973109869209, "kd_ratio": 0.5, "learning_rate": 2.069841510587821e-06, "loss": 0.9238082766532898, "loss/kd": 1.5100661516189575, "loss/lm": 0.33755043148994446, "step": 3886 }, { "epoch": 0.7979880927940874, "grad_norm": 0.9534656255471404, "kd_ratio": 0.5, "learning_rate": 2.0657919067463773e-06, "loss": 1.086132526397705, "loss/kd": 1.8065029382705688, "loss/lm": 0.36576199531555176, "step": 3887 }, { "epoch": 0.798193389447752, "grad_norm": 0.9134520294844704, "kd_ratio": 0.5, "learning_rate": 2.061745811906304e-06, "loss": 0.9062770009040833, "loss/kd": 1.4746158123016357, "loss/lm": 0.3379381597042084, "step": 3888 }, { "epoch": 0.7983986861014165, "grad_norm": 0.8217086064293555, "kd_ratio": 0.5, "learning_rate": 2.057703227857042e-06, "loss": 1.1023426055908203, "loss/kd": 1.8878527879714966, "loss/lm": 0.3168325424194336, "step": 3889 }, { "epoch": 0.798603982755081, "grad_norm": 0.9693633813183226, "kd_ratio": 0.5, "learning_rate": 2.053664156386469e-06, "loss": 1.0333079099655151, "loss/kd": 1.7174228429794312, "loss/lm": 0.3491930365562439, "step": 3890 }, { "epoch": 0.7988092794087457, "grad_norm": 0.8622613902798618, "kd_ratio": 0.5, "learning_rate": 2.0496285992809163e-06, "loss": 1.0545581579208374, "loss/kd": 1.7848780155181885, "loss/lm": 0.32423830032348633, "step": 3891 }, { "epoch": 0.7990145760624102, "grad_norm": 0.840286067304297, "kd_ratio": 0.5, "learning_rate": 2.045596558325158e-06, "loss": 1.0765039920806885, "loss/kd": 1.8042891025543213, "loss/lm": 0.3487188518047333, "step": 3892 }, { "epoch": 0.7992198727160748, "grad_norm": 0.8643580128088059, "kd_ratio": 0.5, "learning_rate": 2.041568035302416e-06, "loss": 0.9145472645759583, "loss/kd": 1.5461783409118652, "loss/lm": 0.2829161584377289, "step": 3893 }, { "epoch": 0.7994251693697393, "grad_norm": 0.8664247045671862, "kd_ratio": 0.5, "learning_rate": 2.0375430319943547e-06, "loss": 1.0696982145309448, "loss/kd": 1.8592474460601807, "loss/lm": 0.28014907240867615, "step": 3894 }, { "epoch": 0.7996304660234038, "grad_norm": 0.9075290480608426, "kd_ratio": 0.5, "learning_rate": 2.033521550181078e-06, "loss": 0.9460549354553223, "loss/kd": 1.4976917505264282, "loss/lm": 0.3944181799888611, "step": 3895 }, { "epoch": 0.7998357626770684, "grad_norm": 0.9116760921673683, "kd_ratio": 0.5, "learning_rate": 2.0295035916411377e-06, "loss": 1.8656232357025146, "loss/kd": 3.4624416828155518, "loss/lm": 0.2688049077987671, "step": 3896 }, { "epoch": 0.8000410593307329, "grad_norm": 0.8740715034686207, "kd_ratio": 0.5, "learning_rate": 2.0254891581515255e-06, "loss": 1.0756373405456543, "loss/kd": 1.839897632598877, "loss/lm": 0.31137707829475403, "step": 3897 }, { "epoch": 0.8002463559843974, "grad_norm": 0.880214153974129, "kd_ratio": 0.5, "learning_rate": 2.0214782514876763e-06, "loss": 0.8599808216094971, "loss/kd": 1.3975374698638916, "loss/lm": 0.32242417335510254, "step": 3898 }, { "epoch": 0.800451652638062, "grad_norm": 1.2391292866201398, "kd_ratio": 0.5, "learning_rate": 2.0174708734234596e-06, "loss": 1.041208267211914, "loss/kd": 1.7474241256713867, "loss/lm": 0.33499234914779663, "step": 3899 }, { "epoch": 0.8006569492917266, "grad_norm": 0.8669133760630143, "kd_ratio": 0.5, "learning_rate": 2.0134670257311872e-06, "loss": 1.6953312158584595, "loss/kd": 3.0599560737609863, "loss/lm": 0.33070632815361023, "step": 3900 }, { "epoch": 0.8008622459453911, "grad_norm": 1.0192309493301925, "kd_ratio": 0.5, "learning_rate": 2.0094667101816133e-06, "loss": 1.1128544807434082, "loss/kd": 1.8775213956832886, "loss/lm": 0.3481875956058502, "step": 3901 }, { "epoch": 0.8010675425990557, "grad_norm": 0.8758267387312746, "kd_ratio": 0.5, "learning_rate": 2.005469928543925e-06, "loss": 1.0897430181503296, "loss/kd": 1.7824965715408325, "loss/lm": 0.3969893753528595, "step": 3902 }, { "epoch": 0.8012728392527202, "grad_norm": 0.8677387153537168, "kd_ratio": 0.5, "learning_rate": 2.0014766825857514e-06, "loss": 1.1000213623046875, "loss/kd": 1.8380388021469116, "loss/lm": 0.3620039224624634, "step": 3903 }, { "epoch": 0.8014781359063847, "grad_norm": 1.0058429125182273, "kd_ratio": 0.5, "learning_rate": 1.9974869740731497e-06, "loss": 0.7838987708091736, "loss/kd": 1.19625985622406, "loss/lm": 0.3715376555919647, "step": 3904 }, { "epoch": 0.8016834325600493, "grad_norm": 0.8537835065859507, "kd_ratio": 0.5, "learning_rate": 1.9935008047706217e-06, "loss": 1.100368857383728, "loss/kd": 1.8391289710998535, "loss/lm": 0.36160871386528015, "step": 3905 }, { "epoch": 0.8018887292137138, "grad_norm": 0.8916478378334229, "kd_ratio": 0.5, "learning_rate": 1.989518176441094e-06, "loss": 1.0892661809921265, "loss/kd": 1.8390253782272339, "loss/lm": 0.33950701355934143, "step": 3906 }, { "epoch": 0.8020940258673783, "grad_norm": 0.890767797260584, "kd_ratio": 0.5, "learning_rate": 1.985539090845943e-06, "loss": 1.2874681949615479, "loss/kd": 2.141671657562256, "loss/lm": 0.4332647919654846, "step": 3907 }, { "epoch": 0.8022993225210429, "grad_norm": 0.9480099546110771, "kd_ratio": 0.5, "learning_rate": 1.981563549744959e-06, "loss": 1.105804204940796, "loss/kd": 1.8381307125091553, "loss/lm": 0.37347766757011414, "step": 3908 }, { "epoch": 0.8025046191747075, "grad_norm": 1.0530228743748602, "kd_ratio": 0.5, "learning_rate": 1.977591554896382e-06, "loss": 1.2268898487091064, "loss/kd": 2.1455769538879395, "loss/lm": 0.3082028031349182, "step": 3909 }, { "epoch": 0.802709915828372, "grad_norm": 1.2375103604793405, "kd_ratio": 0.5, "learning_rate": 1.973623108056868e-06, "loss": 0.8522177934646606, "loss/kd": 1.4670491218566895, "loss/lm": 0.23738649487495422, "step": 3910 }, { "epoch": 0.8029152124820366, "grad_norm": 0.9089098523958626, "kd_ratio": 0.5, "learning_rate": 1.9696582109815145e-06, "loss": 1.0428988933563232, "loss/kd": 1.7410212755203247, "loss/lm": 0.34477663040161133, "step": 3911 }, { "epoch": 0.8031205091357011, "grad_norm": 0.9583867011197147, "kd_ratio": 0.5, "learning_rate": 1.9656968654238473e-06, "loss": 0.9856245517730713, "loss/kd": 1.6483389139175415, "loss/lm": 0.3229101300239563, "step": 3912 }, { "epoch": 0.8033258057893656, "grad_norm": 1.004107150147735, "kd_ratio": 0.5, "learning_rate": 1.96173907313582e-06, "loss": 0.9642714262008667, "loss/kd": 1.616600513458252, "loss/lm": 0.31194233894348145, "step": 3913 }, { "epoch": 0.8035311024430302, "grad_norm": 1.2000675028451626, "kd_ratio": 0.5, "learning_rate": 1.9577848358678177e-06, "loss": 1.0974937677383423, "loss/kd": 1.8023653030395508, "loss/lm": 0.39262232184410095, "step": 3914 }, { "epoch": 0.8037363990966947, "grad_norm": 1.0217329394507555, "kd_ratio": 0.5, "learning_rate": 1.9538341553686446e-06, "loss": 1.0758321285247803, "loss/kd": 1.8079407215118408, "loss/lm": 0.3437236249446869, "step": 3915 }, { "epoch": 0.8039416957503592, "grad_norm": 0.9822048218018422, "kd_ratio": 0.5, "learning_rate": 1.9498870333855436e-06, "loss": 0.964187741279602, "loss/kd": 1.5657185316085815, "loss/lm": 0.36265698075294495, "step": 3916 }, { "epoch": 0.8041469924040238, "grad_norm": 0.9611196278674445, "kd_ratio": 0.5, "learning_rate": 1.94594347166417e-06, "loss": 0.9096519351005554, "loss/kd": 1.5441112518310547, "loss/lm": 0.27519264817237854, "step": 3917 }, { "epoch": 0.8043522890576884, "grad_norm": 1.002384197124437, "kd_ratio": 0.5, "learning_rate": 1.9420034719486246e-06, "loss": 0.9547131061553955, "loss/kd": 1.6245510578155518, "loss/lm": 0.28487512469291687, "step": 3918 }, { "epoch": 0.8045575857113529, "grad_norm": 0.868612153037383, "kd_ratio": 0.5, "learning_rate": 1.93806703598141e-06, "loss": 1.140770435333252, "loss/kd": 1.9643231630325317, "loss/lm": 0.3172176778316498, "step": 3919 }, { "epoch": 0.8047628823650175, "grad_norm": 0.7817545749853944, "kd_ratio": 0.5, "learning_rate": 1.9341341655034683e-06, "loss": 1.0956960916519165, "loss/kd": 1.8996367454528809, "loss/lm": 0.29175546765327454, "step": 3920 }, { "epoch": 0.804968179018682, "grad_norm": 1.0831583123208441, "kd_ratio": 0.5, "learning_rate": 1.9302048622541635e-06, "loss": 1.2470228672027588, "loss/kd": 2.0652878284454346, "loss/lm": 0.428757905960083, "step": 3921 }, { "epoch": 0.8051734756723465, "grad_norm": 0.9040447399341432, "kd_ratio": 0.5, "learning_rate": 1.926279127971268e-06, "loss": 1.1019896268844604, "loss/kd": 1.8818107843399048, "loss/lm": 0.3221685290336609, "step": 3922 }, { "epoch": 0.8053787723260111, "grad_norm": 0.8647740328674255, "kd_ratio": 0.5, "learning_rate": 1.9223569643909978e-06, "loss": 0.7359133958816528, "loss/kd": 1.1612802743911743, "loss/lm": 0.31054648756980896, "step": 3923 }, { "epoch": 0.8055840689796756, "grad_norm": 0.9194122938511078, "kd_ratio": 0.5, "learning_rate": 1.9184383732479705e-06, "loss": 1.1306874752044678, "loss/kd": 1.935586929321289, "loss/lm": 0.32578811049461365, "step": 3924 }, { "epoch": 0.8057893656333401, "grad_norm": 1.088479276305864, "kd_ratio": 0.5, "learning_rate": 1.9145233562752362e-06, "loss": 0.9891894459724426, "loss/kd": 1.6046797037124634, "loss/lm": 0.37369921803474426, "step": 3925 }, { "epoch": 0.8059946622870047, "grad_norm": 2.9540772634163166, "kd_ratio": 0.5, "learning_rate": 1.9106119152042545e-06, "loss": 1.0513631105422974, "loss/kd": 1.7644107341766357, "loss/lm": 0.33831557631492615, "step": 3926 }, { "epoch": 0.8061999589406693, "grad_norm": 0.9865758046945691, "kd_ratio": 0.5, "learning_rate": 1.9067040517649115e-06, "loss": 1.0057417154312134, "loss/kd": 1.681372880935669, "loss/lm": 0.33011046051979065, "step": 3927 }, { "epoch": 0.8064052555943338, "grad_norm": 1.0279279298121444, "kd_ratio": 0.5, "learning_rate": 1.9027997676855081e-06, "loss": 1.0097023248672485, "loss/kd": 1.6780205965042114, "loss/lm": 0.34138402342796326, "step": 3928 }, { "epoch": 0.8066105522479984, "grad_norm": 0.8480928806023399, "kd_ratio": 0.5, "learning_rate": 1.8988990646927606e-06, "loss": 0.8655840158462524, "loss/kd": 1.3914778232574463, "loss/lm": 0.339690238237381, "step": 3929 }, { "epoch": 0.8068158489016629, "grad_norm": 0.8879673424107309, "kd_ratio": 0.5, "learning_rate": 1.8950019445118062e-06, "loss": 1.189847707748413, "loss/kd": 2.039283037185669, "loss/lm": 0.34041228890419006, "step": 3930 }, { "epoch": 0.8070211455553274, "grad_norm": 0.8829981208875384, "kd_ratio": 0.5, "learning_rate": 1.8911084088661903e-06, "loss": 1.0391484498977661, "loss/kd": 1.729074239730835, "loss/lm": 0.34922271966934204, "step": 3931 }, { "epoch": 0.807226442208992, "grad_norm": 1.0023928621450988, "kd_ratio": 0.5, "learning_rate": 1.8872184594778776e-06, "loss": 1.0213029384613037, "loss/kd": 1.6870508193969727, "loss/lm": 0.35555511713027954, "step": 3932 }, { "epoch": 0.8074317388626565, "grad_norm": 0.8338730621510096, "kd_ratio": 0.5, "learning_rate": 1.8833320980672487e-06, "loss": 1.268405795097351, "loss/kd": 2.1551249027252197, "loss/lm": 0.38168659806251526, "step": 3933 }, { "epoch": 0.807637035516321, "grad_norm": 1.0003889657890452, "kd_ratio": 0.5, "learning_rate": 1.8794493263530956e-06, "loss": 0.8877272009849548, "loss/kd": 1.485120415687561, "loss/lm": 0.290334016084671, "step": 3934 }, { "epoch": 0.8078423321699856, "grad_norm": 0.9984395949394337, "kd_ratio": 0.5, "learning_rate": 1.8755701460526166e-06, "loss": 0.7957024574279785, "loss/kd": 1.2678008079528809, "loss/lm": 0.32360410690307617, "step": 3935 }, { "epoch": 0.8080476288236502, "grad_norm": 1.293915333627061, "kd_ratio": 0.5, "learning_rate": 1.8716945588814339e-06, "loss": 1.1731693744659424, "loss/kd": 2.0129857063293457, "loss/lm": 0.3333531320095062, "step": 3936 }, { "epoch": 0.8082529254773148, "grad_norm": 0.9055823015032585, "kd_ratio": 0.5, "learning_rate": 1.8678225665535666e-06, "loss": 0.9971002340316772, "loss/kd": 1.575797438621521, "loss/lm": 0.4184029698371887, "step": 3937 }, { "epoch": 0.8084582221309793, "grad_norm": 0.8970451323624528, "kd_ratio": 0.5, "learning_rate": 1.8639541707814557e-06, "loss": 1.5114034414291382, "loss/kd": 2.612553834915161, "loss/lm": 0.41025295853614807, "step": 3938 }, { "epoch": 0.8086635187846438, "grad_norm": 1.1461188365607242, "kd_ratio": 0.5, "learning_rate": 1.860089373275945e-06, "loss": 1.0301785469055176, "loss/kd": 1.7764183282852173, "loss/lm": 0.28393879532814026, "step": 3939 }, { "epoch": 0.8088688154383084, "grad_norm": 1.0351532803565786, "kd_ratio": 0.5, "learning_rate": 1.8562281757462908e-06, "loss": 1.0595542192459106, "loss/kd": 1.7782706022262573, "loss/lm": 0.34083789587020874, "step": 3940 }, { "epoch": 0.8090741120919729, "grad_norm": 1.0187095274271813, "kd_ratio": 0.5, "learning_rate": 1.8523705799001556e-06, "loss": 1.0181723833084106, "loss/kd": 1.707736611366272, "loss/lm": 0.3286081552505493, "step": 3941 }, { "epoch": 0.8092794087456374, "grad_norm": 1.1220647004769841, "kd_ratio": 0.5, "learning_rate": 1.848516587443605e-06, "loss": 1.1674113273620605, "loss/kd": 2.051137685775757, "loss/lm": 0.28368493914604187, "step": 3942 }, { "epoch": 0.809484705399302, "grad_norm": 0.9883634439279964, "kd_ratio": 0.5, "learning_rate": 1.8446662000811177e-06, "loss": 0.947435200214386, "loss/kd": 1.5566596984863281, "loss/lm": 0.33821070194244385, "step": 3943 }, { "epoch": 0.8096900020529665, "grad_norm": 1.053039369597487, "kd_ratio": 0.5, "learning_rate": 1.8408194195155738e-06, "loss": 0.960668683052063, "loss/kd": 1.5451736450195312, "loss/lm": 0.37616369128227234, "step": 3944 }, { "epoch": 0.8098952987066311, "grad_norm": 1.0885040783441342, "kd_ratio": 0.5, "learning_rate": 1.8369762474482632e-06, "loss": 0.912487268447876, "loss/kd": 1.4810845851898193, "loss/lm": 0.34388989210128784, "step": 3945 }, { "epoch": 0.8101005953602957, "grad_norm": 1.0388311740392522, "kd_ratio": 0.5, "learning_rate": 1.8331366855788702e-06, "loss": 1.0495128631591797, "loss/kd": 1.7307173013687134, "loss/lm": 0.36830830574035645, "step": 3946 }, { "epoch": 0.8103058920139602, "grad_norm": 0.9054821418538009, "kd_ratio": 0.5, "learning_rate": 1.8293007356054903e-06, "loss": 1.1281280517578125, "loss/kd": 1.9158101081848145, "loss/lm": 0.34044599533081055, "step": 3947 }, { "epoch": 0.8105111886676247, "grad_norm": 0.975820215451742, "kd_ratio": 0.5, "learning_rate": 1.8254683992246215e-06, "loss": 0.9661465883255005, "loss/kd": 1.571831464767456, "loss/lm": 0.36046168208122253, "step": 3948 }, { "epoch": 0.8107164853212893, "grad_norm": 0.852967184032789, "kd_ratio": 0.5, "learning_rate": 1.8216396781311607e-06, "loss": 1.0005666017532349, "loss/kd": 1.5858678817749023, "loss/lm": 0.415265291929245, "step": 3949 }, { "epoch": 0.8109217819749538, "grad_norm": 1.2495954253831496, "kd_ratio": 0.5, "learning_rate": 1.8178145740184095e-06, "loss": 0.9145776033401489, "loss/kd": 1.5068409442901611, "loss/lm": 0.32231420278549194, "step": 3950 }, { "epoch": 0.8111270786286183, "grad_norm": 0.9431595294361572, "kd_ratio": 0.5, "learning_rate": 1.8139930885780621e-06, "loss": 1.0158449411392212, "loss/kd": 1.7368354797363281, "loss/lm": 0.2948543131351471, "step": 3951 }, { "epoch": 0.8113323752822829, "grad_norm": 0.9200247468805695, "kd_ratio": 0.5, "learning_rate": 1.8101752235002234e-06, "loss": 1.1819157600402832, "loss/kd": 1.9212185144424438, "loss/lm": 0.44261297583580017, "step": 3952 }, { "epoch": 0.8115376719359474, "grad_norm": 0.8721449391184969, "kd_ratio": 0.5, "learning_rate": 1.8063609804733829e-06, "loss": 0.8732962012290955, "loss/kd": 1.482489824295044, "loss/lm": 0.264102578163147, "step": 3953 }, { "epoch": 0.811742968589612, "grad_norm": 0.8285846676857701, "kd_ratio": 0.5, "learning_rate": 1.8025503611844475e-06, "loss": 1.0275791883468628, "loss/kd": 1.6806950569152832, "loss/lm": 0.37446337938308716, "step": 3954 }, { "epoch": 0.8119482652432766, "grad_norm": 0.9219149293297598, "kd_ratio": 0.5, "learning_rate": 1.7987433673187026e-06, "loss": 1.1738557815551758, "loss/kd": 1.9248216152191162, "loss/lm": 0.4228900074958801, "step": 3955 }, { "epoch": 0.8121535618969411, "grad_norm": 1.0269304515306474, "kd_ratio": 0.5, "learning_rate": 1.7949400005598416e-06, "loss": 1.0675512552261353, "loss/kd": 1.813439130783081, "loss/lm": 0.32166340947151184, "step": 3956 }, { "epoch": 0.8123588585506056, "grad_norm": 1.0151130469328826, "kd_ratio": 0.5, "learning_rate": 1.791140262589951e-06, "loss": 1.0308700799942017, "loss/kd": 1.7561548948287964, "loss/lm": 0.30558520555496216, "step": 3957 }, { "epoch": 0.8125641552042702, "grad_norm": 0.9432568054666044, "kd_ratio": 0.5, "learning_rate": 1.7873441550895053e-06, "loss": 0.799724280834198, "loss/kd": 1.2596391439437866, "loss/lm": 0.339809387922287, "step": 3958 }, { "epoch": 0.8127694518579347, "grad_norm": 0.9758521736157018, "kd_ratio": 0.5, "learning_rate": 1.7835516797373908e-06, "loss": 1.0770928859710693, "loss/kd": 1.746755599975586, "loss/lm": 0.4074302911758423, "step": 3959 }, { "epoch": 0.8129747485115992, "grad_norm": 0.945097279599429, "kd_ratio": 0.5, "learning_rate": 1.7797628382108667e-06, "loss": 1.0637718439102173, "loss/kd": 1.7860044240951538, "loss/lm": 0.3415392339229584, "step": 3960 }, { "epoch": 0.8131800451652638, "grad_norm": 0.8860147496773949, "kd_ratio": 0.5, "learning_rate": 1.7759776321856014e-06, "loss": 1.119065284729004, "loss/kd": 1.8858438730239868, "loss/lm": 0.3522867262363434, "step": 3961 }, { "epoch": 0.8133853418189283, "grad_norm": 1.4454260748867023, "kd_ratio": 0.5, "learning_rate": 1.772196063335645e-06, "loss": 0.9519715309143066, "loss/kd": 1.6033215522766113, "loss/lm": 0.30062153935432434, "step": 3962 }, { "epoch": 0.8135906384725929, "grad_norm": 1.0255576601355578, "kd_ratio": 0.5, "learning_rate": 1.7684181333334437e-06, "loss": 0.8610314726829529, "loss/kd": 1.438347339630127, "loss/lm": 0.2837156057357788, "step": 3963 }, { "epoch": 0.8137959351262575, "grad_norm": 1.1514470506718832, "kd_ratio": 0.5, "learning_rate": 1.7646438438498336e-06, "loss": 0.9037567377090454, "loss/kd": 1.4629405736923218, "loss/lm": 0.34457284212112427, "step": 3964 }, { "epoch": 0.814001231779922, "grad_norm": 1.017796346241972, "kd_ratio": 0.5, "learning_rate": 1.7608731965540437e-06, "loss": 0.949312686920166, "loss/kd": 1.551357626914978, "loss/lm": 0.3472677171230316, "step": 3965 }, { "epoch": 0.8142065284335865, "grad_norm": 1.1753600778889042, "kd_ratio": 0.5, "learning_rate": 1.7571061931136845e-06, "loss": 0.9050864577293396, "loss/kd": 1.3993338346481323, "loss/lm": 0.4108390510082245, "step": 3966 }, { "epoch": 0.8144118250872511, "grad_norm": 1.1859738316402268, "kd_ratio": 0.5, "learning_rate": 1.7533428351947634e-06, "loss": 0.8812662959098816, "loss/kd": 1.4171535968780518, "loss/lm": 0.3453789949417114, "step": 3967 }, { "epoch": 0.8146171217409156, "grad_norm": 1.276196692784802, "kd_ratio": 0.5, "learning_rate": 1.7495831244616712e-06, "loss": 0.9996973276138306, "loss/kd": 1.6649023294448853, "loss/lm": 0.33449235558509827, "step": 3968 }, { "epoch": 0.8148224183945801, "grad_norm": 1.0024110996466906, "kd_ratio": 0.5, "learning_rate": 1.7458270625771822e-06, "loss": 0.9805462956428528, "loss/kd": 1.6399413347244263, "loss/lm": 0.3211512267589569, "step": 3969 }, { "epoch": 0.8150277150482447, "grad_norm": 1.058046129130604, "kd_ratio": 0.5, "learning_rate": 1.742074651202469e-06, "loss": 0.9171098470687866, "loss/kd": 1.5186399221420288, "loss/lm": 0.31557971239089966, "step": 3970 }, { "epoch": 0.8152330117019092, "grad_norm": 1.0601096949575062, "kd_ratio": 0.5, "learning_rate": 1.7383258919970746e-06, "loss": 1.0246297121047974, "loss/kd": 1.72991144657135, "loss/lm": 0.319348007440567, "step": 3971 }, { "epoch": 0.8154383083555738, "grad_norm": 1.0584323023670803, "kd_ratio": 0.5, "learning_rate": 1.7345807866189402e-06, "loss": 0.9372292757034302, "loss/kd": 1.5520795583724976, "loss/lm": 0.3223790228366852, "step": 3972 }, { "epoch": 0.8156436050092384, "grad_norm": 0.8983944096257602, "kd_ratio": 0.5, "learning_rate": 1.7308393367243781e-06, "loss": 0.9231149554252625, "loss/kd": 1.5267095565795898, "loss/lm": 0.31952038407325745, "step": 3973 }, { "epoch": 0.8158489016629029, "grad_norm": 0.8811604374063114, "kd_ratio": 0.5, "learning_rate": 1.7271015439680938e-06, "loss": 0.8897585868835449, "loss/kd": 1.4465246200561523, "loss/lm": 0.3329925537109375, "step": 3974 }, { "epoch": 0.8160541983165674, "grad_norm": 0.9531927338593505, "kd_ratio": 0.5, "learning_rate": 1.7233674100031728e-06, "loss": 0.8708105683326721, "loss/kd": 1.4227855205535889, "loss/lm": 0.31883564591407776, "step": 3975 }, { "epoch": 0.816259494970232, "grad_norm": 0.8709929496217702, "kd_ratio": 0.5, "learning_rate": 1.7196369364810816e-06, "loss": 1.0901687145233154, "loss/kd": 1.7882390022277832, "loss/lm": 0.39209839701652527, "step": 3976 }, { "epoch": 0.8164647916238965, "grad_norm": 0.9067309404948448, "kd_ratio": 0.5, "learning_rate": 1.7159101250516709e-06, "loss": 1.131217360496521, "loss/kd": 1.893402338027954, "loss/lm": 0.36903244256973267, "step": 3977 }, { "epoch": 0.816670088277561, "grad_norm": 0.9219740761487532, "kd_ratio": 0.5, "learning_rate": 1.7121869773631628e-06, "loss": 1.0848567485809326, "loss/kd": 1.8508566617965698, "loss/lm": 0.31885677576065063, "step": 3978 }, { "epoch": 0.8168753849312256, "grad_norm": 1.066767459343969, "kd_ratio": 0.5, "learning_rate": 1.7084674950621694e-06, "loss": 0.7166940569877625, "loss/kd": 1.171492099761963, "loss/lm": 0.2618959844112396, "step": 3979 }, { "epoch": 0.8170806815848901, "grad_norm": 0.919438311602685, "kd_ratio": 0.5, "learning_rate": 1.704751679793676e-06, "loss": 1.1657298803329468, "loss/kd": 1.9810693264007568, "loss/lm": 0.3503904938697815, "step": 3980 }, { "epoch": 0.8172859782385548, "grad_norm": 0.8467867598827017, "kd_ratio": 0.5, "learning_rate": 1.701039533201052e-06, "loss": 1.1194957494735718, "loss/kd": 1.8746517896652222, "loss/lm": 0.3643396496772766, "step": 3981 }, { "epoch": 0.8174912748922193, "grad_norm": 0.887176304839315, "kd_ratio": 0.5, "learning_rate": 1.6973310569260336e-06, "loss": 1.2189191579818726, "loss/kd": 2.1006903648376465, "loss/lm": 0.33714795112609863, "step": 3982 }, { "epoch": 0.8176965715458838, "grad_norm": 1.0034098913558598, "kd_ratio": 0.5, "learning_rate": 1.6936262526087432e-06, "loss": 1.111717939376831, "loss/kd": 1.8315318822860718, "loss/lm": 0.3919041156768799, "step": 3983 }, { "epoch": 0.8179018681995484, "grad_norm": 1.0206544783597489, "kd_ratio": 0.5, "learning_rate": 1.6899251218876756e-06, "loss": 0.953170895576477, "loss/kd": 1.529105544090271, "loss/lm": 0.3772362470626831, "step": 3984 }, { "epoch": 0.8181071648532129, "grad_norm": 0.8648787064628063, "kd_ratio": 0.5, "learning_rate": 1.6862276663997024e-06, "loss": 1.3571721315383911, "loss/kd": 2.4346790313720703, "loss/lm": 0.2796652317047119, "step": 3985 }, { "epoch": 0.8183124615068774, "grad_norm": 0.9496919805376048, "kd_ratio": 0.5, "learning_rate": 1.6825338877800712e-06, "loss": 0.9281817674636841, "loss/kd": 1.5934569835662842, "loss/lm": 0.2629065215587616, "step": 3986 }, { "epoch": 0.818517758160542, "grad_norm": 0.9378474357116434, "kd_ratio": 0.5, "learning_rate": 1.6788437876623963e-06, "loss": 1.064528226852417, "loss/kd": 1.773820400238037, "loss/lm": 0.3552359342575073, "step": 3987 }, { "epoch": 0.8187230548142065, "grad_norm": 1.1039044355255705, "kd_ratio": 0.5, "learning_rate": 1.6751573676786736e-06, "loss": 1.1278955936431885, "loss/kd": 1.8958334922790527, "loss/lm": 0.35995757579803467, "step": 3988 }, { "epoch": 0.818928351467871, "grad_norm": 0.9002292884071966, "kd_ratio": 0.5, "learning_rate": 1.6714746294592655e-06, "loss": 0.9246576428413391, "loss/kd": 1.5122449398040771, "loss/lm": 0.33707037568092346, "step": 3989 }, { "epoch": 0.8191336481215357, "grad_norm": 0.9913940841126297, "kd_ratio": 0.5, "learning_rate": 1.6677955746329077e-06, "loss": 1.3977417945861816, "loss/kd": 2.4499783515930176, "loss/lm": 0.3455052971839905, "step": 3990 }, { "epoch": 0.8193389447752002, "grad_norm": 0.8881469492936933, "kd_ratio": 0.5, "learning_rate": 1.6641202048267102e-06, "loss": 1.1472570896148682, "loss/kd": 1.921783208847046, "loss/lm": 0.3727308511734009, "step": 3991 }, { "epoch": 0.8195442414288647, "grad_norm": 0.8754062542262131, "kd_ratio": 0.5, "learning_rate": 1.6604485216661492e-06, "loss": 0.8620587587356567, "loss/kd": 1.423728108406067, "loss/lm": 0.3003893494606018, "step": 3992 }, { "epoch": 0.8197495380825293, "grad_norm": 1.0125888921210426, "kd_ratio": 0.5, "learning_rate": 1.6567805267750735e-06, "loss": 0.9995372295379639, "loss/kd": 1.6722770929336548, "loss/lm": 0.32679733633995056, "step": 3993 }, { "epoch": 0.8199548347361938, "grad_norm": 0.9634985497888877, "kd_ratio": 0.5, "learning_rate": 1.6531162217756969e-06, "loss": 0.9428852200508118, "loss/kd": 1.596421241760254, "loss/lm": 0.28934916853904724, "step": 3994 }, { "epoch": 0.8201601313898583, "grad_norm": 1.0032017473687436, "kd_ratio": 0.5, "learning_rate": 1.6494556082886038e-06, "loss": 1.0924134254455566, "loss/kd": 1.8358900547027588, "loss/lm": 0.34893670678138733, "step": 3995 }, { "epoch": 0.8203654280435229, "grad_norm": 0.9870396322640688, "kd_ratio": 0.5, "learning_rate": 1.6457986879327459e-06, "loss": 0.9414725303649902, "loss/kd": 1.614599585533142, "loss/lm": 0.2683454155921936, "step": 3996 }, { "epoch": 0.8205707246971874, "grad_norm": 1.2023244807911677, "kd_ratio": 0.5, "learning_rate": 1.6421454623254441e-06, "loss": 1.0290285348892212, "loss/kd": 1.7692222595214844, "loss/lm": 0.2888348400592804, "step": 3997 }, { "epoch": 0.820776021350852, "grad_norm": 1.231484450279291, "kd_ratio": 0.5, "learning_rate": 1.638495933082378e-06, "loss": 0.9956375360488892, "loss/kd": 1.692076325416565, "loss/lm": 0.299198716878891, "step": 3998 }, { "epoch": 0.8209813180045166, "grad_norm": 0.9262244335722088, "kd_ratio": 0.5, "learning_rate": 1.634850101817601e-06, "loss": 0.8001655340194702, "loss/kd": 1.316453456878662, "loss/lm": 0.2838776409626007, "step": 3999 }, { "epoch": 0.8211866146581811, "grad_norm": 1.1664287525894173, "kd_ratio": 0.5, "learning_rate": 1.6312079701435203e-06, "loss": 1.5545049905776978, "loss/kd": 2.7926812171936035, "loss/lm": 0.31632882356643677, "step": 4000 }, { "epoch": 0.8213919113118456, "grad_norm": 2.1937589805323086, "kd_ratio": 0.5, "learning_rate": 1.6275695396709223e-06, "loss": 1.0056028366088867, "loss/kd": 1.6730552911758423, "loss/lm": 0.33815038204193115, "step": 4001 }, { "epoch": 0.8215972079655102, "grad_norm": 0.9408974101144971, "kd_ratio": 0.5, "learning_rate": 1.6239348120089416e-06, "loss": 1.1787174940109253, "loss/kd": 2.022066831588745, "loss/lm": 0.3353680968284607, "step": 4002 }, { "epoch": 0.8218025046191747, "grad_norm": 1.033079736074829, "kd_ratio": 0.5, "learning_rate": 1.6203037887650842e-06, "loss": 1.1150535345077515, "loss/kd": 1.8036003112792969, "loss/lm": 0.4265066981315613, "step": 4003 }, { "epoch": 0.8220078012728392, "grad_norm": 1.170076540465972, "kd_ratio": 0.5, "learning_rate": 1.6166764715452154e-06, "loss": 0.9856274127960205, "loss/kd": 1.6428309679031372, "loss/lm": 0.3284238874912262, "step": 4004 }, { "epoch": 0.8222130979265038, "grad_norm": 1.0286129416061407, "kd_ratio": 0.5, "learning_rate": 1.6130528619535546e-06, "loss": 1.1434859037399292, "loss/kd": 1.961896300315857, "loss/lm": 0.32507553696632385, "step": 4005 }, { "epoch": 0.8224183945801683, "grad_norm": 1.21879423269592, "kd_ratio": 0.5, "learning_rate": 1.6094329615926974e-06, "loss": 1.073608636856079, "loss/kd": 1.8414517641067505, "loss/lm": 0.3057655096054077, "step": 4006 }, { "epoch": 0.8226236912338329, "grad_norm": 0.9321381970529334, "kd_ratio": 0.5, "learning_rate": 1.6058167720635832e-06, "loss": 0.9511713981628418, "loss/kd": 1.6325583457946777, "loss/lm": 0.26978442072868347, "step": 4007 }, { "epoch": 0.8228289878874975, "grad_norm": 1.120010964833255, "kd_ratio": 0.5, "learning_rate": 1.6022042949655193e-06, "loss": 1.1591874361038208, "loss/kd": 1.990281581878662, "loss/lm": 0.3280932903289795, "step": 4008 }, { "epoch": 0.823034284541162, "grad_norm": 0.8454211764394188, "kd_ratio": 0.5, "learning_rate": 1.5985955318961655e-06, "loss": 0.8669221997261047, "loss/kd": 1.3968474864959717, "loss/lm": 0.3369968831539154, "step": 4009 }, { "epoch": 0.8232395811948265, "grad_norm": 0.8735938996200122, "kd_ratio": 0.5, "learning_rate": 1.5949904844515429e-06, "loss": 1.1682956218719482, "loss/kd": 2.0675458908081055, "loss/lm": 0.2690454423427582, "step": 4010 }, { "epoch": 0.8234448778484911, "grad_norm": 0.83070729886252, "kd_ratio": 0.5, "learning_rate": 1.5913891542260284e-06, "loss": 1.2143946886062622, "loss/kd": 2.065342664718628, "loss/lm": 0.36344674229621887, "step": 4011 }, { "epoch": 0.8236501745021556, "grad_norm": 1.2226745825473977, "kd_ratio": 0.5, "learning_rate": 1.5877915428123547e-06, "loss": 1.0956453084945679, "loss/kd": 1.843779444694519, "loss/lm": 0.3475111126899719, "step": 4012 }, { "epoch": 0.8238554711558201, "grad_norm": 1.0014461080784516, "kd_ratio": 0.5, "learning_rate": 1.5841976518016121e-06, "loss": 1.2969449758529663, "loss/kd": 2.2730398178100586, "loss/lm": 0.32085004448890686, "step": 4013 }, { "epoch": 0.8240607678094847, "grad_norm": 0.8542534307537966, "kd_ratio": 0.5, "learning_rate": 1.5806074827832384e-06, "loss": 1.535372018814087, "loss/kd": 2.7137460708618164, "loss/lm": 0.35699784755706787, "step": 4014 }, { "epoch": 0.8242660644631492, "grad_norm": 0.8208854812088213, "kd_ratio": 0.5, "learning_rate": 1.5770210373450356e-06, "loss": 1.1634377241134644, "loss/kd": 2.002660036087036, "loss/lm": 0.32421544194221497, "step": 4015 }, { "epoch": 0.8244713611168139, "grad_norm": 0.8832216708792059, "kd_ratio": 0.5, "learning_rate": 1.573438317073146e-06, "loss": 1.1465963125228882, "loss/kd": 1.980360984802246, "loss/lm": 0.3128315806388855, "step": 4016 }, { "epoch": 0.8246766577704784, "grad_norm": 0.8422876273808185, "kd_ratio": 0.5, "learning_rate": 1.569859323552081e-06, "loss": 1.1561315059661865, "loss/kd": 1.9877809286117554, "loss/lm": 0.3244820535182953, "step": 4017 }, { "epoch": 0.8248819544241429, "grad_norm": 0.8803895771060865, "kd_ratio": 0.5, "learning_rate": 1.5662840583646876e-06, "loss": 1.0450869798660278, "loss/kd": 1.7678109407424927, "loss/lm": 0.32236307859420776, "step": 4018 }, { "epoch": 0.8250872510778074, "grad_norm": 0.9212054444485178, "kd_ratio": 0.5, "learning_rate": 1.5627125230921725e-06, "loss": 0.9631307125091553, "loss/kd": 1.6236203908920288, "loss/lm": 0.3026410937309265, "step": 4019 }, { "epoch": 0.825292547731472, "grad_norm": 0.8441815585798397, "kd_ratio": 0.5, "learning_rate": 1.5591447193140952e-06, "loss": 1.6003749370574951, "loss/kd": 2.8461086750030518, "loss/lm": 0.3546411693096161, "step": 4020 }, { "epoch": 0.8254978443851365, "grad_norm": 0.8757189165203594, "kd_ratio": 0.5, "learning_rate": 1.5555806486083559e-06, "loss": 1.9089009761810303, "loss/kd": 3.5145516395568848, "loss/lm": 0.30325037240982056, "step": 4021 }, { "epoch": 0.825703141038801, "grad_norm": 0.8687479411320734, "kd_ratio": 0.5, "learning_rate": 1.5520203125512102e-06, "loss": 0.8547663688659668, "loss/kd": 1.3202919960021973, "loss/lm": 0.38924074172973633, "step": 4022 }, { "epoch": 0.8259084376924656, "grad_norm": 0.8049181215260437, "kd_ratio": 0.5, "learning_rate": 1.5484637127172609e-06, "loss": 1.2888035774230957, "loss/kd": 2.275435209274292, "loss/lm": 0.30217188596725464, "step": 4023 }, { "epoch": 0.8261137343461301, "grad_norm": 0.8832045661845602, "kd_ratio": 0.5, "learning_rate": 1.5449108506794608e-06, "loss": 1.0355511903762817, "loss/kd": 1.7573072910308838, "loss/lm": 0.3137950301170349, "step": 4024 }, { "epoch": 0.8263190309997948, "grad_norm": 0.9901549505373762, "kd_ratio": 0.5, "learning_rate": 1.541361728009102e-06, "loss": 1.0022740364074707, "loss/kd": 1.6480530500411987, "loss/lm": 0.35649505257606506, "step": 4025 }, { "epoch": 0.8265243276534593, "grad_norm": 0.937443085585915, "kd_ratio": 0.5, "learning_rate": 1.53781634627583e-06, "loss": 0.9557576179504395, "loss/kd": 1.5629655122756958, "loss/lm": 0.34854966402053833, "step": 4026 }, { "epoch": 0.8267296243071238, "grad_norm": 0.8677892832445393, "kd_ratio": 0.5, "learning_rate": 1.5342747070476339e-06, "loss": 0.8433910012245178, "loss/kd": 1.3514827489852905, "loss/lm": 0.3352992534637451, "step": 4027 }, { "epoch": 0.8269349209607884, "grad_norm": 0.9416676102121148, "kd_ratio": 0.5, "learning_rate": 1.5307368118908506e-06, "loss": 1.0320806503295898, "loss/kd": 1.6791659593582153, "loss/lm": 0.3849954307079315, "step": 4028 }, { "epoch": 0.8271402176144529, "grad_norm": 0.8857785497065507, "kd_ratio": 0.5, "learning_rate": 1.5272026623701507e-06, "loss": 1.130202054977417, "loss/kd": 1.8632681369781494, "loss/lm": 0.397135853767395, "step": 4029 }, { "epoch": 0.8273455142681174, "grad_norm": 0.9910189868250466, "kd_ratio": 0.5, "learning_rate": 1.523672260048561e-06, "loss": 0.922659158706665, "loss/kd": 1.551559567451477, "loss/lm": 0.293758749961853, "step": 4030 }, { "epoch": 0.827550810921782, "grad_norm": 0.9095997284122321, "kd_ratio": 0.5, "learning_rate": 1.520145606487442e-06, "loss": 1.0375454425811768, "loss/kd": 1.7394371032714844, "loss/lm": 0.3356538712978363, "step": 4031 }, { "epoch": 0.8277561075754465, "grad_norm": 0.9029422230572027, "kd_ratio": 0.5, "learning_rate": 1.516622703246502e-06, "loss": 1.1770949363708496, "loss/kd": 2.007551431655884, "loss/lm": 0.3466383218765259, "step": 4032 }, { "epoch": 0.827961404229111, "grad_norm": 0.9128869932989147, "kd_ratio": 0.5, "learning_rate": 1.513103551883791e-06, "loss": 1.021174430847168, "loss/kd": 1.695123553276062, "loss/lm": 0.34722521901130676, "step": 4033 }, { "epoch": 0.8281667008827757, "grad_norm": 0.9044738366841163, "kd_ratio": 0.5, "learning_rate": 1.5095881539556912e-06, "loss": 0.9559409022331238, "loss/kd": 1.530829906463623, "loss/lm": 0.3810518682003021, "step": 4034 }, { "epoch": 0.8283719975364402, "grad_norm": 0.8883168397091744, "kd_ratio": 0.5, "learning_rate": 1.506076511016935e-06, "loss": 0.966775119304657, "loss/kd": 1.554870843887329, "loss/lm": 0.37867939472198486, "step": 4035 }, { "epoch": 0.8285772941901047, "grad_norm": 0.901126107037846, "kd_ratio": 0.5, "learning_rate": 1.502568624620584e-06, "loss": 0.9485009908676147, "loss/kd": 1.6153241395950317, "loss/lm": 0.28167781233787537, "step": 4036 }, { "epoch": 0.8287825908437693, "grad_norm": 0.9970172419277555, "kd_ratio": 0.5, "learning_rate": 1.4990644963180534e-06, "loss": 1.1974472999572754, "loss/kd": 2.0695865154266357, "loss/lm": 0.3253081738948822, "step": 4037 }, { "epoch": 0.8289878874974338, "grad_norm": 0.8673607166466462, "kd_ratio": 0.5, "learning_rate": 1.4955641276590794e-06, "loss": 1.0372965335845947, "loss/kd": 1.7379533052444458, "loss/lm": 0.3366397023200989, "step": 4038 }, { "epoch": 0.8291931841510983, "grad_norm": 0.8404418951535859, "kd_ratio": 0.5, "learning_rate": 1.4920675201917467e-06, "loss": 1.1539709568023682, "loss/kd": 1.887253999710083, "loss/lm": 0.42068788409233093, "step": 4039 }, { "epoch": 0.8293984808047629, "grad_norm": 0.91784525418507, "kd_ratio": 0.5, "learning_rate": 1.4885746754624742e-06, "loss": 0.9541777968406677, "loss/kd": 1.5589858293533325, "loss/lm": 0.34936973452568054, "step": 4040 }, { "epoch": 0.8296037774584274, "grad_norm": 0.9831843076193031, "kd_ratio": 0.5, "learning_rate": 1.4850855950160103e-06, "loss": 0.869737446308136, "loss/kd": 1.4688212871551514, "loss/lm": 0.2706535756587982, "step": 4041 }, { "epoch": 0.8298090741120919, "grad_norm": 0.8657519853933141, "kd_ratio": 0.5, "learning_rate": 1.4816002803954489e-06, "loss": 1.043296456336975, "loss/kd": 1.8028929233551025, "loss/lm": 0.28369995951652527, "step": 4042 }, { "epoch": 0.8300143707657566, "grad_norm": 0.9608017016876785, "kd_ratio": 0.5, "learning_rate": 1.4781187331422109e-06, "loss": 1.094010829925537, "loss/kd": 1.8709810972213745, "loss/lm": 0.3170406222343445, "step": 4043 }, { "epoch": 0.8302196674194211, "grad_norm": 0.8414399937861827, "kd_ratio": 0.5, "learning_rate": 1.4746409547960572e-06, "loss": 1.9712027311325073, "loss/kd": 3.587200403213501, "loss/lm": 0.35520508885383606, "step": 4044 }, { "epoch": 0.8304249640730856, "grad_norm": 0.8895684100431757, "kd_ratio": 0.5, "learning_rate": 1.4711669468950728e-06, "loss": 1.1713886260986328, "loss/kd": 2.0066609382629395, "loss/lm": 0.33611631393432617, "step": 4045 }, { "epoch": 0.8306302607267502, "grad_norm": 0.8956354769425406, "kd_ratio": 0.5, "learning_rate": 1.4676967109756823e-06, "loss": 0.8554182052612305, "loss/kd": 1.399296522140503, "loss/lm": 0.3115399181842804, "step": 4046 }, { "epoch": 0.8308355573804147, "grad_norm": 0.9172051578245946, "kd_ratio": 0.5, "learning_rate": 1.4642302485726423e-06, "loss": 0.8175455927848816, "loss/kd": 1.293912410736084, "loss/lm": 0.3411787450313568, "step": 4047 }, { "epoch": 0.8310408540340792, "grad_norm": 0.847725282182255, "kd_ratio": 0.5, "learning_rate": 1.4607675612190364e-06, "loss": 0.9821202754974365, "loss/kd": 1.6292262077331543, "loss/lm": 0.33501437306404114, "step": 4048 }, { "epoch": 0.8312461506877438, "grad_norm": 0.877336884901291, "kd_ratio": 0.5, "learning_rate": 1.4573086504462841e-06, "loss": 0.8600708246231079, "loss/kd": 1.4039204120635986, "loss/lm": 0.3162212669849396, "step": 4049 }, { "epoch": 0.8314514473414083, "grad_norm": 0.8749278239143248, "kd_ratio": 0.5, "learning_rate": 1.4538535177841272e-06, "loss": 1.2366654872894287, "loss/kd": 2.1276464462280273, "loss/lm": 0.3456844389438629, "step": 4050 }, { "epoch": 0.8316567439950728, "grad_norm": 1.011711207084492, "kd_ratio": 0.5, "learning_rate": 1.4504021647606448e-06, "loss": 1.105070948600769, "loss/kd": 1.7910224199295044, "loss/lm": 0.4191195070743561, "step": 4051 }, { "epoch": 0.8318620406487375, "grad_norm": 0.9438091895489328, "kd_ratio": 0.5, "learning_rate": 1.446954592902232e-06, "loss": 1.0937269926071167, "loss/kd": 1.8424557447433472, "loss/lm": 0.34499818086624146, "step": 4052 }, { "epoch": 0.832067337302402, "grad_norm": 0.9240481347604517, "kd_ratio": 0.5, "learning_rate": 1.4435108037336321e-06, "loss": 1.1023895740509033, "loss/kd": 1.875698447227478, "loss/lm": 0.32908058166503906, "step": 4053 }, { "epoch": 0.8322726339560665, "grad_norm": 0.9810644184096315, "kd_ratio": 0.5, "learning_rate": 1.440070798777895e-06, "loss": 0.9420039653778076, "loss/kd": 1.635879397392273, "loss/lm": 0.24812857806682587, "step": 4054 }, { "epoch": 0.8324779306097311, "grad_norm": 1.2121720230392983, "kd_ratio": 0.5, "learning_rate": 1.4366345795564084e-06, "loss": 0.9938257932662964, "loss/kd": 1.701615571975708, "loss/lm": 0.28603607416152954, "step": 4055 }, { "epoch": 0.8326832272633956, "grad_norm": 0.8866349171174661, "kd_ratio": 0.5, "learning_rate": 1.4332021475888801e-06, "loss": 1.078829050064087, "loss/kd": 1.794234037399292, "loss/lm": 0.363424152135849, "step": 4056 }, { "epoch": 0.8328885239170601, "grad_norm": 0.9594781374140738, "kd_ratio": 0.5, "learning_rate": 1.429773504393348e-06, "loss": 0.8763617873191833, "loss/kd": 1.4245426654815674, "loss/lm": 0.32818087935447693, "step": 4057 }, { "epoch": 0.8330938205707247, "grad_norm": 1.1156634970038106, "kd_ratio": 0.5, "learning_rate": 1.4263486514861702e-06, "loss": 1.0257939100265503, "loss/kd": 1.6876641511917114, "loss/lm": 0.36392369866371155, "step": 4058 }, { "epoch": 0.8332991172243892, "grad_norm": 0.9058077276112981, "kd_ratio": 0.5, "learning_rate": 1.4229275903820306e-06, "loss": 0.9461438655853271, "loss/kd": 1.630747675895691, "loss/lm": 0.261540025472641, "step": 4059 }, { "epoch": 0.8335044138780537, "grad_norm": 0.8543266904267811, "kd_ratio": 0.5, "learning_rate": 1.4195103225939387e-06, "loss": 1.1626155376434326, "loss/kd": 1.9428727626800537, "loss/lm": 0.3823583424091339, "step": 4060 }, { "epoch": 0.8337097105317184, "grad_norm": 1.0278057809739831, "kd_ratio": 0.5, "learning_rate": 1.4160968496332183e-06, "loss": 1.0300062894821167, "loss/kd": 1.6754746437072754, "loss/lm": 0.3845379054546356, "step": 4061 }, { "epoch": 0.8339150071853829, "grad_norm": 1.0061328249927723, "kd_ratio": 0.5, "learning_rate": 1.4126871730095215e-06, "loss": 1.0750457048416138, "loss/kd": 1.7784204483032227, "loss/lm": 0.37167105078697205, "step": 4062 }, { "epoch": 0.8341203038390475, "grad_norm": 0.9916207722005519, "kd_ratio": 0.5, "learning_rate": 1.409281294230821e-06, "loss": 1.208317756652832, "loss/kd": 2.0289242267608643, "loss/lm": 0.38771137595176697, "step": 4063 }, { "epoch": 0.834325600492712, "grad_norm": 1.0169670554997983, "kd_ratio": 0.5, "learning_rate": 1.4058792148034107e-06, "loss": 1.1970264911651611, "loss/kd": 2.0543980598449707, "loss/lm": 0.3396550416946411, "step": 4064 }, { "epoch": 0.8345308971463765, "grad_norm": 0.9066849377967182, "kd_ratio": 0.5, "learning_rate": 1.4024809362318969e-06, "loss": 1.1261268854141235, "loss/kd": 1.9211146831512451, "loss/lm": 0.3311391770839691, "step": 4065 }, { "epoch": 0.834736193800041, "grad_norm": 1.06336214196909, "kd_ratio": 0.5, "learning_rate": 1.3990864600192133e-06, "loss": 1.3104467391967773, "loss/kd": 2.3032851219177246, "loss/lm": 0.31760847568511963, "step": 4066 }, { "epoch": 0.8349414904537056, "grad_norm": 0.8553191453213027, "kd_ratio": 0.5, "learning_rate": 1.395695787666611e-06, "loss": 1.065274715423584, "loss/kd": 1.7987931966781616, "loss/lm": 0.3317561745643616, "step": 4067 }, { "epoch": 0.8351467871073701, "grad_norm": 0.8468724329917489, "kd_ratio": 0.5, "learning_rate": 1.3923089206736495e-06, "loss": 1.2869993448257446, "loss/kd": 2.2033677101135254, "loss/lm": 0.37063097953796387, "step": 4068 }, { "epoch": 0.8353520837610346, "grad_norm": 0.9577369284794598, "kd_ratio": 0.5, "learning_rate": 1.3889258605382216e-06, "loss": 1.1528897285461426, "loss/kd": 1.931405782699585, "loss/lm": 0.3743736445903778, "step": 4069 }, { "epoch": 0.8355573804146993, "grad_norm": 0.8549213155380876, "kd_ratio": 0.5, "learning_rate": 1.3855466087565218e-06, "loss": 0.9562588334083557, "loss/kd": 1.5581127405166626, "loss/lm": 0.35440492630004883, "step": 4070 }, { "epoch": 0.8357626770683638, "grad_norm": 1.2566143386535908, "kd_ratio": 0.5, "learning_rate": 1.3821711668230675e-06, "loss": 1.0230127573013306, "loss/kd": 1.7522861957550049, "loss/lm": 0.2937392294406891, "step": 4071 }, { "epoch": 0.8359679737220284, "grad_norm": 0.9043535601378545, "kd_ratio": 0.5, "learning_rate": 1.3787995362306882e-06, "loss": 1.0109002590179443, "loss/kd": 1.7186663150787354, "loss/lm": 0.30313411355018616, "step": 4072 }, { "epoch": 0.8361732703756929, "grad_norm": 1.0545156835198124, "kd_ratio": 0.5, "learning_rate": 1.375431718470529e-06, "loss": 1.064466118812561, "loss/kd": 1.7859147787094116, "loss/lm": 0.3430175483226776, "step": 4073 }, { "epoch": 0.8363785670293574, "grad_norm": 0.8962885730334732, "kd_ratio": 0.5, "learning_rate": 1.3720677150320505e-06, "loss": 0.774483323097229, "loss/kd": 1.2077105045318604, "loss/lm": 0.34125614166259766, "step": 4074 }, { "epoch": 0.836583863683022, "grad_norm": 0.8244958856682942, "kd_ratio": 0.5, "learning_rate": 1.3687075274030238e-06, "loss": 0.7771075367927551, "loss/kd": 1.245922565460205, "loss/lm": 0.3082925081253052, "step": 4075 }, { "epoch": 0.8367891603366865, "grad_norm": 0.9907534571889582, "kd_ratio": 0.5, "learning_rate": 1.3653511570695355e-06, "loss": 1.095265507698059, "loss/kd": 1.8926163911819458, "loss/lm": 0.2979145646095276, "step": 4076 }, { "epoch": 0.836994456990351, "grad_norm": 0.9841984412724661, "kd_ratio": 0.5, "learning_rate": 1.3619986055159783e-06, "loss": 0.9304841756820679, "loss/kd": 1.4906052350997925, "loss/lm": 0.37036314606666565, "step": 4077 }, { "epoch": 0.8371997536440156, "grad_norm": 0.9182044524927531, "kd_ratio": 0.5, "learning_rate": 1.3586498742250597e-06, "loss": 1.0815398693084717, "loss/kd": 1.8156483173370361, "loss/lm": 0.34743139147758484, "step": 4078 }, { "epoch": 0.8374050502976802, "grad_norm": 0.9483176336192163, "kd_ratio": 0.5, "learning_rate": 1.3553049646777993e-06, "loss": 0.9996926188468933, "loss/kd": 1.72185218334198, "loss/lm": 0.27753302454948425, "step": 4079 }, { "epoch": 0.8376103469513447, "grad_norm": 0.8683262420686602, "kd_ratio": 0.5, "learning_rate": 1.3519638783535272e-06, "loss": 1.2399424314498901, "loss/kd": 2.1367361545562744, "loss/lm": 0.3431486189365387, "step": 4080 }, { "epoch": 0.8378156436050093, "grad_norm": 0.9378184992043579, "kd_ratio": 0.5, "learning_rate": 1.3486266167298733e-06, "loss": 1.1012108325958252, "loss/kd": 1.855357050895691, "loss/lm": 0.3470645546913147, "step": 4081 }, { "epoch": 0.8380209402586738, "grad_norm": 1.0316341840606151, "kd_ratio": 0.5, "learning_rate": 1.3452931812827885e-06, "loss": 1.0035375356674194, "loss/kd": 1.6278495788574219, "loss/lm": 0.37922540307044983, "step": 4082 }, { "epoch": 0.8382262369123383, "grad_norm": 1.0004859057697477, "kd_ratio": 0.5, "learning_rate": 1.34196357348652e-06, "loss": 0.9323023557662964, "loss/kd": 1.5521388053894043, "loss/lm": 0.3124658763408661, "step": 4083 }, { "epoch": 0.8384315335660029, "grad_norm": 1.0803742367798288, "kd_ratio": 0.5, "learning_rate": 1.3386377948136354e-06, "loss": 0.8920775651931763, "loss/kd": 1.4227702617645264, "loss/lm": 0.36138489842414856, "step": 4084 }, { "epoch": 0.8386368302196674, "grad_norm": 0.884430139590356, "kd_ratio": 0.5, "learning_rate": 1.3353158467349947e-06, "loss": 1.0888382196426392, "loss/kd": 1.8275227546691895, "loss/lm": 0.35015374422073364, "step": 4085 }, { "epoch": 0.8388421268733319, "grad_norm": 0.8919848755225325, "kd_ratio": 0.5, "learning_rate": 1.331997730719773e-06, "loss": 2.0625827312469482, "loss/kd": 3.8849940299987793, "loss/lm": 0.24017152190208435, "step": 4086 }, { "epoch": 0.8390474235269965, "grad_norm": 0.9463040777774626, "kd_ratio": 0.5, "learning_rate": 1.3286834482354506e-06, "loss": 0.7912922501564026, "loss/kd": 1.268957257270813, "loss/lm": 0.3136272728443146, "step": 4087 }, { "epoch": 0.8392527201806611, "grad_norm": 0.9955398738952627, "kd_ratio": 0.5, "learning_rate": 1.3253730007478038e-06, "loss": 1.1351877450942993, "loss/kd": 1.9319349527359009, "loss/lm": 0.3384404480457306, "step": 4088 }, { "epoch": 0.8394580168343256, "grad_norm": 1.0040344015949776, "kd_ratio": 0.5, "learning_rate": 1.322066389720924e-06, "loss": 0.9027442932128906, "loss/kd": 1.4720509052276611, "loss/lm": 0.3334377110004425, "step": 4089 }, { "epoch": 0.8396633134879902, "grad_norm": 0.8172236472879402, "kd_ratio": 0.5, "learning_rate": 1.3187636166171979e-06, "loss": 0.8965057730674744, "loss/kd": 1.44002103805542, "loss/lm": 0.3529905378818512, "step": 4090 }, { "epoch": 0.8398686101416547, "grad_norm": 0.8434424915416163, "kd_ratio": 0.5, "learning_rate": 1.3154646828973217e-06, "loss": 0.9457160234451294, "loss/kd": 1.6244381666183472, "loss/lm": 0.2669938802719116, "step": 4091 }, { "epoch": 0.8400739067953192, "grad_norm": 0.8594756680095135, "kd_ratio": 0.5, "learning_rate": 1.3121695900202836e-06, "loss": 0.8089052438735962, "loss/kd": 1.3875192403793335, "loss/lm": 0.2302912324666977, "step": 4092 }, { "epoch": 0.8402792034489838, "grad_norm": 0.9856837049686331, "kd_ratio": 0.5, "learning_rate": 1.3088783394433813e-06, "loss": 0.8782973289489746, "loss/kd": 1.4384015798568726, "loss/lm": 0.31819313764572144, "step": 4093 }, { "epoch": 0.8404845001026483, "grad_norm": 1.3065605082081597, "kd_ratio": 0.5, "learning_rate": 1.3055909326222128e-06, "loss": 1.0169367790222168, "loss/kd": 1.7471169233322144, "loss/lm": 0.28675660490989685, "step": 4094 }, { "epoch": 0.8406897967563128, "grad_norm": 0.9970238909542861, "kd_ratio": 0.5, "learning_rate": 1.3023073710106726e-06, "loss": 1.0314364433288574, "loss/kd": 1.7306182384490967, "loss/lm": 0.33225470781326294, "step": 4095 }, { "epoch": 0.8408950934099774, "grad_norm": 0.885482434294359, "kd_ratio": 0.5, "learning_rate": 1.2990276560609594e-06, "loss": 1.098897933959961, "loss/kd": 1.8580471277236938, "loss/lm": 0.33974871039390564, "step": 4096 }, { "epoch": 0.841100390063642, "grad_norm": 0.9171916033923675, "kd_ratio": 0.5, "learning_rate": 1.2957517892235628e-06, "loss": 0.9935944080352783, "loss/kd": 1.7264009714126587, "loss/lm": 0.26078787446022034, "step": 4097 }, { "epoch": 0.8413056867173065, "grad_norm": 0.8683234536381286, "kd_ratio": 0.5, "learning_rate": 1.2924797719472792e-06, "loss": 1.1685560941696167, "loss/kd": 1.986458659172058, "loss/lm": 0.3506534993648529, "step": 4098 }, { "epoch": 0.8415109833709711, "grad_norm": 0.9304471275909608, "kd_ratio": 0.5, "learning_rate": 1.2892116056791927e-06, "loss": 0.9570643901824951, "loss/kd": 1.5183368921279907, "loss/lm": 0.39579182863235474, "step": 4099 }, { "epoch": 0.8417162800246356, "grad_norm": 0.8942179890571529, "kd_ratio": 0.5, "learning_rate": 1.2859472918646986e-06, "loss": 0.946135401725769, "loss/kd": 1.5592190027236938, "loss/lm": 0.333051860332489, "step": 4100 }, { "epoch": 0.8419215766783001, "grad_norm": 0.8253438907578782, "kd_ratio": 0.5, "learning_rate": 1.282686831947474e-06, "loss": 1.1136060953140259, "loss/kd": 1.8995283842086792, "loss/lm": 0.3276837170124054, "step": 4101 }, { "epoch": 0.8421268733319647, "grad_norm": 1.0332301648635571, "kd_ratio": 0.5, "learning_rate": 1.2794302273694991e-06, "loss": 1.0304694175720215, "loss/kd": 1.741388201713562, "loss/lm": 0.31955063343048096, "step": 4102 }, { "epoch": 0.8423321699856292, "grad_norm": 1.0591737829208756, "kd_ratio": 0.5, "learning_rate": 1.2761774795710502e-06, "loss": 1.0983701944351196, "loss/kd": 1.8917856216430664, "loss/lm": 0.30495473742485046, "step": 4103 }, { "epoch": 0.8425374666392937, "grad_norm": 0.9170677616144685, "kd_ratio": 0.5, "learning_rate": 1.2729285899906874e-06, "loss": 0.9531487822532654, "loss/kd": 1.5742403268814087, "loss/lm": 0.3320572078227997, "step": 4104 }, { "epoch": 0.8427427632929583, "grad_norm": 0.9591052999187818, "kd_ratio": 0.5, "learning_rate": 1.2696835600652812e-06, "loss": 1.9776034355163574, "loss/kd": 3.649536609649658, "loss/lm": 0.30567023158073425, "step": 4105 }, { "epoch": 0.8429480599466229, "grad_norm": 0.9495930590383312, "kd_ratio": 0.5, "learning_rate": 1.2664423912299807e-06, "loss": 0.794963002204895, "loss/kd": 1.2771084308624268, "loss/lm": 0.31281760334968567, "step": 4106 }, { "epoch": 0.8431533566002875, "grad_norm": 0.9405350074734663, "kd_ratio": 0.5, "learning_rate": 1.2632050849182365e-06, "loss": 1.0261611938476562, "loss/kd": 1.7812215089797974, "loss/lm": 0.27110081911087036, "step": 4107 }, { "epoch": 0.843358653253952, "grad_norm": 0.8746213040764874, "kd_ratio": 0.5, "learning_rate": 1.2599716425617813e-06, "loss": 1.08540678024292, "loss/kd": 1.9071717262268066, "loss/lm": 0.2636418640613556, "step": 4108 }, { "epoch": 0.8435639499076165, "grad_norm": 0.9942601627478582, "kd_ratio": 0.5, "learning_rate": 1.2567420655906505e-06, "loss": 1.0557568073272705, "loss/kd": 1.7616761922836304, "loss/lm": 0.34983736276626587, "step": 4109 }, { "epoch": 0.843769246561281, "grad_norm": 0.825331252943047, "kd_ratio": 0.5, "learning_rate": 1.2535163554331609e-06, "loss": 0.8902208209037781, "loss/kd": 1.491346001625061, "loss/lm": 0.28909561038017273, "step": 4110 }, { "epoch": 0.8439745432149456, "grad_norm": 0.7853947853493706, "kd_ratio": 0.5, "learning_rate": 1.2502945135159272e-06, "loss": 0.985910952091217, "loss/kd": 1.6686955690383911, "loss/lm": 0.30312636494636536, "step": 4111 }, { "epoch": 0.8441798398686101, "grad_norm": 0.796808519135623, "kd_ratio": 0.5, "learning_rate": 1.247076541263843e-06, "loss": 1.2562044858932495, "loss/kd": 2.1253013610839844, "loss/lm": 0.38710758090019226, "step": 4112 }, { "epoch": 0.8443851365222746, "grad_norm": 0.8908421305906873, "kd_ratio": 0.5, "learning_rate": 1.2438624401000987e-06, "loss": 1.1924225091934204, "loss/kd": 1.9955921173095703, "loss/lm": 0.38925299048423767, "step": 4113 }, { "epoch": 0.8445904331759392, "grad_norm": 0.9555822888551074, "kd_ratio": 0.5, "learning_rate": 1.2406522114461727e-06, "loss": 0.9041255712509155, "loss/kd": 1.504353404045105, "loss/lm": 0.30389779806137085, "step": 4114 }, { "epoch": 0.8447957298296038, "grad_norm": 0.859871966639485, "kd_ratio": 0.5, "learning_rate": 1.2374458567218217e-06, "loss": 1.839195966720581, "loss/kd": 3.2893781661987305, "loss/lm": 0.3890136778354645, "step": 4115 }, { "epoch": 0.8450010264832684, "grad_norm": 0.907009795969975, "kd_ratio": 0.5, "learning_rate": 1.2342433773451036e-06, "loss": 0.9463727474212646, "loss/kd": 1.5645520687103271, "loss/lm": 0.3281934857368469, "step": 4116 }, { "epoch": 0.8452063231369329, "grad_norm": 1.0382857876032017, "kd_ratio": 0.5, "learning_rate": 1.2310447747323484e-06, "loss": 0.9700291156768799, "loss/kd": 1.6198222637176514, "loss/lm": 0.320235937833786, "step": 4117 }, { "epoch": 0.8454116197905974, "grad_norm": 0.8949453623248098, "kd_ratio": 0.5, "learning_rate": 1.2278500502981816e-06, "loss": 1.085081696510315, "loss/kd": 1.8730311393737793, "loss/lm": 0.2971321940422058, "step": 4118 }, { "epoch": 0.845616916444262, "grad_norm": 0.8135285115942577, "kd_ratio": 0.5, "learning_rate": 1.2246592054555062e-06, "loss": 1.0507266521453857, "loss/kd": 1.7301181554794312, "loss/lm": 0.37133511900901794, "step": 4119 }, { "epoch": 0.8458222130979265, "grad_norm": 0.9330159920696615, "kd_ratio": 0.5, "learning_rate": 1.2214722416155133e-06, "loss": 1.3508613109588623, "loss/kd": 2.3243203163146973, "loss/lm": 0.37740230560302734, "step": 4120 }, { "epoch": 0.846027509751591, "grad_norm": 0.9321915648643994, "kd_ratio": 0.5, "learning_rate": 1.2182891601876778e-06, "loss": 1.1165169477462769, "loss/kd": 1.8655205965042114, "loss/lm": 0.36751335859298706, "step": 4121 }, { "epoch": 0.8462328064052556, "grad_norm": 0.8517203731204535, "kd_ratio": 0.5, "learning_rate": 1.215109962579757e-06, "loss": 1.228761076927185, "loss/kd": 2.064887523651123, "loss/lm": 0.39263471961021423, "step": 4122 }, { "epoch": 0.8464381030589201, "grad_norm": 0.9783204955060429, "kd_ratio": 0.5, "learning_rate": 1.2119346501977914e-06, "loss": 1.2874677181243896, "loss/kd": 2.296037197113037, "loss/lm": 0.27889835834503174, "step": 4123 }, { "epoch": 0.8466433997125847, "grad_norm": 0.8335079775121415, "kd_ratio": 0.5, "learning_rate": 1.208763224446098e-06, "loss": 0.9461137056350708, "loss/kd": 1.577215552330017, "loss/lm": 0.3150118589401245, "step": 4124 }, { "epoch": 0.8468486963662493, "grad_norm": 0.9301813008965181, "kd_ratio": 0.5, "learning_rate": 1.2055956867272821e-06, "loss": 1.110609769821167, "loss/kd": 1.8821138143539429, "loss/lm": 0.33910566568374634, "step": 4125 }, { "epoch": 0.8470539930199138, "grad_norm": 0.8518524147143673, "kd_ratio": 0.5, "learning_rate": 1.202432038442226e-06, "loss": 0.9162706136703491, "loss/kd": 1.5270335674285889, "loss/lm": 0.30550771951675415, "step": 4126 }, { "epoch": 0.8472592896735783, "grad_norm": 0.9467910537576864, "kd_ratio": 0.5, "learning_rate": 1.1992722809900925e-06, "loss": 1.1348650455474854, "loss/kd": 1.9854902029037476, "loss/lm": 0.2842397689819336, "step": 4127 }, { "epoch": 0.8474645863272429, "grad_norm": 0.8338292132538696, "kd_ratio": 0.5, "learning_rate": 1.1961164157683224e-06, "loss": 0.9877482652664185, "loss/kd": 1.6733165979385376, "loss/lm": 0.3021799623966217, "step": 4128 }, { "epoch": 0.8476698829809074, "grad_norm": 0.8299289036785661, "kd_ratio": 0.5, "learning_rate": 1.192964444172635e-06, "loss": 1.128278374671936, "loss/kd": 1.9847923517227173, "loss/lm": 0.27176445722579956, "step": 4129 }, { "epoch": 0.8478751796345719, "grad_norm": 0.9617611770133355, "kd_ratio": 0.5, "learning_rate": 1.1898163675970297e-06, "loss": 0.8908609747886658, "loss/kd": 1.4147164821624756, "loss/lm": 0.36700546741485596, "step": 4130 }, { "epoch": 0.8480804762882365, "grad_norm": 0.9223289001532774, "kd_ratio": 0.5, "learning_rate": 1.1866721874337827e-06, "loss": 0.9747894406318665, "loss/kd": 1.5881716012954712, "loss/lm": 0.3614072799682617, "step": 4131 }, { "epoch": 0.848285772941901, "grad_norm": 1.1228783461224776, "kd_ratio": 0.5, "learning_rate": 1.1835319050734483e-06, "loss": 1.1374690532684326, "loss/kd": 1.9102602005004883, "loss/lm": 0.3646778464317322, "step": 4132 }, { "epoch": 0.8484910695955656, "grad_norm": 0.9324491646976028, "kd_ratio": 0.5, "learning_rate": 1.1803955219048513e-06, "loss": 1.11087167263031, "loss/kd": 1.8821505308151245, "loss/lm": 0.33959272503852844, "step": 4133 }, { "epoch": 0.8486963662492302, "grad_norm": 0.8159896762136744, "kd_ratio": 0.5, "learning_rate": 1.1772630393150986e-06, "loss": 1.0598952770233154, "loss/kd": 1.7852264642715454, "loss/lm": 0.3345641493797302, "step": 4134 }, { "epoch": 0.8489016629028947, "grad_norm": 0.9118414743564018, "kd_ratio": 0.5, "learning_rate": 1.1741344586895642e-06, "loss": 1.8721776008605957, "loss/kd": 3.489253520965576, "loss/lm": 0.2551017105579376, "step": 4135 }, { "epoch": 0.8491069595565592, "grad_norm": 0.9941291809189178, "kd_ratio": 0.5, "learning_rate": 1.1710097814119093e-06, "loss": 1.2662301063537598, "loss/kd": 2.139772415161133, "loss/lm": 0.3926878869533539, "step": 4136 }, { "epoch": 0.8493122562102238, "grad_norm": 0.8566112786559588, "kd_ratio": 0.5, "learning_rate": 1.167889008864055e-06, "loss": 1.0153592824935913, "loss/kd": 1.7316288948059082, "loss/lm": 0.2990897297859192, "step": 4137 }, { "epoch": 0.8495175528638883, "grad_norm": 0.9443748216118609, "kd_ratio": 0.5, "learning_rate": 1.164772142426206e-06, "loss": 0.9729987382888794, "loss/kd": 1.635071873664856, "loss/lm": 0.3109256327152252, "step": 4138 }, { "epoch": 0.8497228495175528, "grad_norm": 1.0035624257769395, "kd_ratio": 0.5, "learning_rate": 1.1616591834768299e-06, "loss": 0.8928732872009277, "loss/kd": 1.4049869775772095, "loss/lm": 0.3807596266269684, "step": 4139 }, { "epoch": 0.8499281461712174, "grad_norm": 0.9703434061682972, "kd_ratio": 0.5, "learning_rate": 1.158550133392673e-06, "loss": 1.181897759437561, "loss/kd": 2.049515962600708, "loss/lm": 0.31427961587905884, "step": 4140 }, { "epoch": 0.8501334428248819, "grad_norm": 0.8316086657142305, "kd_ratio": 0.5, "learning_rate": 1.1554449935487533e-06, "loss": 1.0684208869934082, "loss/kd": 1.8232700824737549, "loss/lm": 0.3135717511177063, "step": 4141 }, { "epoch": 0.8503387394785465, "grad_norm": 0.8873764531160061, "kd_ratio": 0.5, "learning_rate": 1.1523437653183544e-06, "loss": 1.1452707052230835, "loss/kd": 1.9040786027908325, "loss/lm": 0.3864627778530121, "step": 4142 }, { "epoch": 0.8505440361322111, "grad_norm": 0.8767044915806806, "kd_ratio": 0.5, "learning_rate": 1.149246450073036e-06, "loss": 0.9069713950157166, "loss/kd": 1.46275794506073, "loss/lm": 0.35118481516838074, "step": 4143 }, { "epoch": 0.8507493327858756, "grad_norm": 0.8616686799069377, "kd_ratio": 0.5, "learning_rate": 1.14615304918262e-06, "loss": 0.8567259311676025, "loss/kd": 1.3341734409332275, "loss/lm": 0.37927839159965515, "step": 4144 }, { "epoch": 0.8509546294395401, "grad_norm": 0.9599483556072226, "kd_ratio": 0.5, "learning_rate": 1.1430635640152043e-06, "loss": 0.9185819029808044, "loss/kd": 1.5334876775741577, "loss/lm": 0.3036760985851288, "step": 4145 }, { "epoch": 0.8511599260932047, "grad_norm": 0.8225229856182072, "kd_ratio": 0.5, "learning_rate": 1.139977995937147e-06, "loss": 1.027200698852539, "loss/kd": 1.6390526294708252, "loss/lm": 0.4153487980365753, "step": 4146 }, { "epoch": 0.8513652227468692, "grad_norm": 0.9092942621843338, "kd_ratio": 0.5, "learning_rate": 1.1368963463130866e-06, "loss": 1.0703116655349731, "loss/kd": 1.7442289590835571, "loss/lm": 0.396394282579422, "step": 4147 }, { "epoch": 0.8515705194005337, "grad_norm": 0.8416072516694314, "kd_ratio": 0.5, "learning_rate": 1.133818616505914e-06, "loss": 1.3763364553451538, "loss/kd": 2.4783315658569336, "loss/lm": 0.2743414342403412, "step": 4148 }, { "epoch": 0.8517758160541983, "grad_norm": 0.9409298891330575, "kd_ratio": 0.5, "learning_rate": 1.1307448078767946e-06, "loss": 0.973568320274353, "loss/kd": 1.5565125942230225, "loss/lm": 0.3906239867210388, "step": 4149 }, { "epoch": 0.8519811127078628, "grad_norm": 0.9741827539941706, "kd_ratio": 0.5, "learning_rate": 1.1276749217851612e-06, "loss": 1.157996416091919, "loss/kd": 1.9070810079574585, "loss/lm": 0.4089118242263794, "step": 4150 }, { "epoch": 0.8521864093615275, "grad_norm": 0.9107697756847275, "kd_ratio": 0.5, "learning_rate": 1.1246089595887023e-06, "loss": 0.9109479188919067, "loss/kd": 1.4876185655593872, "loss/lm": 0.3342772424221039, "step": 4151 }, { "epoch": 0.852391706015192, "grad_norm": 1.2127452429039505, "kd_ratio": 0.5, "learning_rate": 1.1215469226433863e-06, "loss": 0.863634467124939, "loss/kd": 1.4788423776626587, "loss/lm": 0.24842651188373566, "step": 4152 }, { "epoch": 0.8525970026688565, "grad_norm": 0.9330861756826545, "kd_ratio": 0.5, "learning_rate": 1.11848881230343e-06, "loss": 0.965157151222229, "loss/kd": 1.6345192193984985, "loss/lm": 0.2957950532436371, "step": 4153 }, { "epoch": 0.852802299322521, "grad_norm": 0.860853869233848, "kd_ratio": 0.5, "learning_rate": 1.1154346299213238e-06, "loss": 1.018428087234497, "loss/kd": 1.7477794885635376, "loss/lm": 0.2890765964984894, "step": 4154 }, { "epoch": 0.8530075959761856, "grad_norm": 0.9254951724534951, "kd_ratio": 0.5, "learning_rate": 1.1123843768478148e-06, "loss": 1.1014139652252197, "loss/kd": 1.8746880292892456, "loss/lm": 0.3281398415565491, "step": 4155 }, { "epoch": 0.8532128926298501, "grad_norm": 1.0833728076733948, "kd_ratio": 0.5, "learning_rate": 1.1093380544319166e-06, "loss": 1.0522359609603882, "loss/kd": 1.8464024066925049, "loss/lm": 0.25806954503059387, "step": 4156 }, { "epoch": 0.8534181892835146, "grad_norm": 1.1347706514051792, "kd_ratio": 0.5, "learning_rate": 1.1062956640209033e-06, "loss": 0.9149767160415649, "loss/kd": 1.4740630388259888, "loss/lm": 0.3558904528617859, "step": 4157 }, { "epoch": 0.8536234859371792, "grad_norm": 0.9800855141957291, "kd_ratio": 0.5, "learning_rate": 1.1032572069603076e-06, "loss": 1.178593397140503, "loss/kd": 1.9439349174499512, "loss/lm": 0.41325196623802185, "step": 4158 }, { "epoch": 0.8538287825908437, "grad_norm": 0.823952723187615, "kd_ratio": 0.5, "learning_rate": 1.100222684593929e-06, "loss": 1.3396075963974, "loss/kd": 2.282310962677002, "loss/lm": 0.3969041705131531, "step": 4159 }, { "epoch": 0.8540340792445084, "grad_norm": 0.8665188762791792, "kd_ratio": 0.5, "learning_rate": 1.0971920982638173e-06, "loss": 0.9017848372459412, "loss/kd": 1.4319603443145752, "loss/lm": 0.37160930037498474, "step": 4160 }, { "epoch": 0.8542393758981729, "grad_norm": 0.9297107528008297, "kd_ratio": 0.5, "learning_rate": 1.0941654493102893e-06, "loss": 1.13615882396698, "loss/kd": 1.9483641386032104, "loss/lm": 0.32395341992378235, "step": 4161 }, { "epoch": 0.8544446725518374, "grad_norm": 0.8812160135285001, "kd_ratio": 0.5, "learning_rate": 1.0911427390719175e-06, "loss": 0.958389401435852, "loss/kd": 1.610262155532837, "loss/lm": 0.3065166771411896, "step": 4162 }, { "epoch": 0.854649969205502, "grad_norm": 0.9480716034717935, "kd_ratio": 0.5, "learning_rate": 1.088123968885534e-06, "loss": 0.8119311332702637, "loss/kd": 1.3356081247329712, "loss/lm": 0.2882540822029114, "step": 4163 }, { "epoch": 0.8548552658591665, "grad_norm": 0.8465961403265534, "kd_ratio": 0.5, "learning_rate": 1.0851091400862247e-06, "loss": 0.9173563718795776, "loss/kd": 1.4349206686019897, "loss/lm": 0.3997921049594879, "step": 4164 }, { "epoch": 0.855060562512831, "grad_norm": 0.8840601558597349, "kd_ratio": 0.5, "learning_rate": 1.0820982540073345e-06, "loss": 1.3932760953903198, "loss/kd": 2.4128122329711914, "loss/lm": 0.373740017414093, "step": 4165 }, { "epoch": 0.8552658591664956, "grad_norm": 0.8529391431662507, "kd_ratio": 0.5, "learning_rate": 1.0790913119804691e-06, "loss": 0.9915405511856079, "loss/kd": 1.6270660161972046, "loss/lm": 0.356015145778656, "step": 4166 }, { "epoch": 0.8554711558201601, "grad_norm": 0.8872183435029276, "kd_ratio": 0.5, "learning_rate": 1.0760883153354818e-06, "loss": 0.9873039126396179, "loss/kd": 1.681032657623291, "loss/lm": 0.2935751676559448, "step": 4167 }, { "epoch": 0.8556764524738246, "grad_norm": 0.8830450328285603, "kd_ratio": 0.5, "learning_rate": 1.0730892654004842e-06, "loss": 0.8828042149543762, "loss/kd": 1.436514973640442, "loss/lm": 0.32909345626831055, "step": 4168 }, { "epoch": 0.8558817491274893, "grad_norm": 0.8791369511188106, "kd_ratio": 0.5, "learning_rate": 1.070094163501847e-06, "loss": 1.140683889389038, "loss/kd": 1.9473317861557007, "loss/lm": 0.33403587341308594, "step": 4169 }, { "epoch": 0.8560870457811538, "grad_norm": 0.8380173859199381, "kd_ratio": 0.5, "learning_rate": 1.0671030109641889e-06, "loss": 0.8784466981887817, "loss/kd": 1.4400248527526855, "loss/lm": 0.31686854362487793, "step": 4170 }, { "epoch": 0.8562923424348183, "grad_norm": 0.8709519155032128, "kd_ratio": 0.5, "learning_rate": 1.0641158091103832e-06, "loss": 0.9810592532157898, "loss/kd": 1.6096597909927368, "loss/lm": 0.35245874524116516, "step": 4171 }, { "epoch": 0.8564976390884829, "grad_norm": 0.8586146207627617, "kd_ratio": 0.5, "learning_rate": 1.0611325592615574e-06, "loss": 0.9245083332061768, "loss/kd": 1.5271427631378174, "loss/lm": 0.32187390327453613, "step": 4172 }, { "epoch": 0.8567029357421474, "grad_norm": 0.8987189687761308, "kd_ratio": 0.5, "learning_rate": 1.0581532627370906e-06, "loss": 1.1284486055374146, "loss/kd": 1.9276556968688965, "loss/lm": 0.3292416036128998, "step": 4173 }, { "epoch": 0.8569082323958119, "grad_norm": 0.8646444570685223, "kd_ratio": 0.5, "learning_rate": 1.0551779208546154e-06, "loss": 1.0009639263153076, "loss/kd": 1.6674649715423584, "loss/lm": 0.334462970495224, "step": 4174 }, { "epoch": 0.8571135290494765, "grad_norm": 0.8724958453319627, "kd_ratio": 0.5, "learning_rate": 1.0522065349300103e-06, "loss": 1.108886957168579, "loss/kd": 1.9106132984161377, "loss/lm": 0.30716052651405334, "step": 4175 }, { "epoch": 0.857318825703141, "grad_norm": 0.9205244487705917, "kd_ratio": 0.5, "learning_rate": 1.0492391062774076e-06, "loss": 0.9899306893348694, "loss/kd": 1.6437530517578125, "loss/lm": 0.33610835671424866, "step": 4176 }, { "epoch": 0.8575241223568055, "grad_norm": 0.9698906955405037, "kd_ratio": 0.5, "learning_rate": 1.0462756362091908e-06, "loss": 1.0647494792938232, "loss/kd": 1.8484750986099243, "loss/lm": 0.281023770570755, "step": 4177 }, { "epoch": 0.8577294190104702, "grad_norm": 0.8468392190460722, "kd_ratio": 0.5, "learning_rate": 1.0433161260359903e-06, "loss": 0.9493147134780884, "loss/kd": 1.5915522575378418, "loss/lm": 0.30707719922065735, "step": 4178 }, { "epoch": 0.8579347156641347, "grad_norm": 0.9353884911647325, "kd_ratio": 0.5, "learning_rate": 1.040360577066688e-06, "loss": 1.0525709390640259, "loss/kd": 1.7057511806488037, "loss/lm": 0.39939069747924805, "step": 4179 }, { "epoch": 0.8581400123177992, "grad_norm": 1.0014367730812368, "kd_ratio": 0.5, "learning_rate": 1.0374089906084083e-06, "loss": 0.89007568359375, "loss/kd": 1.5225681066513062, "loss/lm": 0.2575833201408386, "step": 4180 }, { "epoch": 0.8583453089714638, "grad_norm": 0.8713847202302509, "kd_ratio": 0.5, "learning_rate": 1.0344613679665306e-06, "loss": 0.846182107925415, "loss/kd": 1.4179871082305908, "loss/lm": 0.27437707781791687, "step": 4181 }, { "epoch": 0.8585506056251283, "grad_norm": 1.0405527199021307, "kd_ratio": 0.5, "learning_rate": 1.0315177104446706e-06, "loss": 0.9239651560783386, "loss/kd": 1.4846506118774414, "loss/lm": 0.36327970027923584, "step": 4182 }, { "epoch": 0.8587559022787928, "grad_norm": 0.9510959083101337, "kd_ratio": 0.5, "learning_rate": 1.028578019344706e-06, "loss": 1.4189165830612183, "loss/kd": 2.436340570449829, "loss/lm": 0.40149256587028503, "step": 4183 }, { "epoch": 0.8589611989324574, "grad_norm": 0.9079130047437057, "kd_ratio": 0.5, "learning_rate": 1.0256422959667456e-06, "loss": 1.1163660287857056, "loss/kd": 1.8964682817459106, "loss/lm": 0.3362637460231781, "step": 4184 }, { "epoch": 0.8591664955861219, "grad_norm": 1.014839743777738, "kd_ratio": 0.5, "learning_rate": 1.022710541609151e-06, "loss": 1.2634806632995605, "loss/kd": 2.1122703552246094, "loss/lm": 0.4146910607814789, "step": 4185 }, { "epoch": 0.8593717922397864, "grad_norm": 0.987035022013986, "kd_ratio": 0.5, "learning_rate": 1.019782757568528e-06, "loss": 1.4714938402175903, "loss/kd": 2.54309344291687, "loss/lm": 0.3998943269252777, "step": 4186 }, { "epoch": 0.8595770888934511, "grad_norm": 0.8088440369002112, "kd_ratio": 0.5, "learning_rate": 1.0168589451397204e-06, "loss": 0.9468905925750732, "loss/kd": 1.6349046230316162, "loss/lm": 0.2588765621185303, "step": 4187 }, { "epoch": 0.8597823855471156, "grad_norm": 0.9232104942956219, "kd_ratio": 0.5, "learning_rate": 1.0139391056158277e-06, "loss": 1.2175978422164917, "loss/kd": 2.1001532077789307, "loss/lm": 0.3350425660610199, "step": 4188 }, { "epoch": 0.8599876822007801, "grad_norm": 0.8788399373224862, "kd_ratio": 0.5, "learning_rate": 1.0110232402881792e-06, "loss": 1.1175975799560547, "loss/kd": 1.9602909088134766, "loss/lm": 0.2749042809009552, "step": 4189 }, { "epoch": 0.8601929788544447, "grad_norm": 1.0521271624186972, "kd_ratio": 0.5, "learning_rate": 1.0081113504463546e-06, "loss": 1.1531230211257935, "loss/kd": 2.068140983581543, "loss/lm": 0.23810508847236633, "step": 4190 }, { "epoch": 0.8603982755081092, "grad_norm": 0.8946146165978353, "kd_ratio": 0.5, "learning_rate": 1.0052034373781716e-06, "loss": 1.079567313194275, "loss/kd": 1.7941852807998657, "loss/lm": 0.36494943499565125, "step": 4191 }, { "epoch": 0.8606035721617737, "grad_norm": 1.006814882756838, "kd_ratio": 0.5, "learning_rate": 1.0022995023696924e-06, "loss": 1.0222930908203125, "loss/kd": 1.6971614360809326, "loss/lm": 0.34742486476898193, "step": 4192 }, { "epoch": 0.8608088688154383, "grad_norm": 0.8810223918936824, "kd_ratio": 0.5, "learning_rate": 9.993995467052165e-07, "loss": 0.7372082471847534, "loss/kd": 1.1849024295806885, "loss/lm": 0.28951412439346313, "step": 4193 }, { "epoch": 0.8610141654691028, "grad_norm": 0.8585196479594354, "kd_ratio": 0.5, "learning_rate": 9.965035716672867e-07, "loss": 0.9976017475128174, "loss/kd": 1.6355669498443604, "loss/lm": 0.359636515378952, "step": 4194 }, { "epoch": 0.8612194621227675, "grad_norm": 0.9765265179974943, "kd_ratio": 0.5, "learning_rate": 9.936115785366817e-07, "loss": 1.5107494592666626, "loss/kd": 2.7287380695343018, "loss/lm": 0.2927609086036682, "step": 4195 }, { "epoch": 0.861424758776432, "grad_norm": 0.9468151156646766, "kd_ratio": 0.5, "learning_rate": 9.90723568592422e-07, "loss": 1.0629570484161377, "loss/kd": 1.8042867183685303, "loss/lm": 0.32162725925445557, "step": 4196 }, { "epoch": 0.8616300554300965, "grad_norm": 0.8875835269586129, "kd_ratio": 0.5, "learning_rate": 9.878395431117682e-07, "loss": 0.9690362215042114, "loss/kd": 1.611944317817688, "loss/lm": 0.3261280953884125, "step": 4197 }, { "epoch": 0.861835352083761, "grad_norm": 0.888713316525127, "kd_ratio": 0.5, "learning_rate": 9.849595033702098e-07, "loss": 1.0216498374938965, "loss/kd": 1.7086095809936523, "loss/lm": 0.3346901834011078, "step": 4198 }, { "epoch": 0.8620406487374256, "grad_norm": 0.8865619087604942, "kd_ratio": 0.5, "learning_rate": 9.820834506414866e-07, "loss": 1.0463812351226807, "loss/kd": 1.7843410968780518, "loss/lm": 0.30842137336730957, "step": 4199 }, { "epoch": 0.8622459453910901, "grad_norm": 1.122045276488375, "kd_ratio": 0.5, "learning_rate": 9.79211386197565e-07, "loss": 0.8291699290275574, "loss/kd": 1.345844030380249, "loss/lm": 0.31249579787254333, "step": 4200 }, { "epoch": 0.8624512420447547, "grad_norm": 0.8538499321068616, "kd_ratio": 0.5, "learning_rate": 9.763433113086528e-07, "loss": 1.004154920578003, "loss/kd": 1.6761647462844849, "loss/lm": 0.33214521408081055, "step": 4201 }, { "epoch": 0.8626565386984192, "grad_norm": 0.9068721638425222, "kd_ratio": 0.5, "learning_rate": 9.73479227243188e-07, "loss": 0.9372686743736267, "loss/kd": 1.5980799198150635, "loss/lm": 0.27645745873451233, "step": 4202 }, { "epoch": 0.8628618353520837, "grad_norm": 0.8839324350664034, "kd_ratio": 0.5, "learning_rate": 9.706191352678495e-07, "loss": 1.0241695642471313, "loss/kd": 1.6902109384536743, "loss/lm": 0.358128160238266, "step": 4203 }, { "epoch": 0.8630671320057484, "grad_norm": 1.066040573234614, "kd_ratio": 0.5, "learning_rate": 9.677630366475489e-07, "loss": 1.060074806213379, "loss/kd": 1.7932302951812744, "loss/lm": 0.3269193768501282, "step": 4204 }, { "epoch": 0.8632724286594129, "grad_norm": 0.9542037213741729, "kd_ratio": 0.5, "learning_rate": 9.649109326454286e-07, "loss": 1.2968037128448486, "loss/kd": 2.1988565921783447, "loss/lm": 0.39475083351135254, "step": 4205 }, { "epoch": 0.8634777253130774, "grad_norm": 0.8856910526243761, "kd_ratio": 0.5, "learning_rate": 9.620628245228714e-07, "loss": 0.8614667057991028, "loss/kd": 1.4634677171707153, "loss/lm": 0.25946566462516785, "step": 4206 }, { "epoch": 0.863683021966742, "grad_norm": 0.7838852339419954, "kd_ratio": 0.5, "learning_rate": 9.592187135394826e-07, "loss": 1.0244430303573608, "loss/kd": 1.6858704090118408, "loss/lm": 0.36301568150520325, "step": 4207 }, { "epoch": 0.8638883186204065, "grad_norm": 0.9086795895805084, "kd_ratio": 0.5, "learning_rate": 9.563786009531085e-07, "loss": 0.7198053598403931, "loss/kd": 1.137162446975708, "loss/lm": 0.3024483323097229, "step": 4208 }, { "epoch": 0.864093615274071, "grad_norm": 1.0828459955596172, "kd_ratio": 0.5, "learning_rate": 9.535424880198241e-07, "loss": 1.0909528732299805, "loss/kd": 1.9057444334030151, "loss/lm": 0.27616143226623535, "step": 4209 }, { "epoch": 0.8642989119277356, "grad_norm": 0.8406444261666222, "kd_ratio": 0.5, "learning_rate": 9.507103759939351e-07, "loss": 1.1851081848144531, "loss/kd": 2.035982608795166, "loss/lm": 0.3342337906360626, "step": 4210 }, { "epoch": 0.8645042085814001, "grad_norm": 0.8519159783109791, "kd_ratio": 0.5, "learning_rate": 9.478822661279763e-07, "loss": 1.181026577949524, "loss/kd": 2.0056967735290527, "loss/lm": 0.3563564419746399, "step": 4211 }, { "epoch": 0.8647095052350646, "grad_norm": 1.0024417577193085, "kd_ratio": 0.5, "learning_rate": 9.450581596727148e-07, "loss": 1.0365175008773804, "loss/kd": 1.7592130899429321, "loss/lm": 0.31382182240486145, "step": 4212 }, { "epoch": 0.8649148018887293, "grad_norm": 0.928327504296048, "kd_ratio": 0.5, "learning_rate": 9.422380578771472e-07, "loss": 0.9802467823028564, "loss/kd": 1.6456711292266846, "loss/lm": 0.31482240557670593, "step": 4213 }, { "epoch": 0.8651200985423938, "grad_norm": 0.9514172443409584, "kd_ratio": 0.5, "learning_rate": 9.394219619884981e-07, "loss": 1.144218921661377, "loss/kd": 2.0007736682891846, "loss/lm": 0.28766414523124695, "step": 4214 }, { "epoch": 0.8653253951960583, "grad_norm": 0.8436371781523632, "kd_ratio": 0.5, "learning_rate": 9.366098732522233e-07, "loss": 0.9305495023727417, "loss/kd": 1.6013191938400269, "loss/lm": 0.25977978110313416, "step": 4215 }, { "epoch": 0.8655306918497229, "grad_norm": 0.9080073404984021, "kd_ratio": 0.5, "learning_rate": 9.33801792911998e-07, "loss": 0.9471414089202881, "loss/kd": 1.6238130331039429, "loss/lm": 0.2704698145389557, "step": 4216 }, { "epoch": 0.8657359885033874, "grad_norm": 1.0559482899122894, "kd_ratio": 0.5, "learning_rate": 9.309977222097355e-07, "loss": 1.2897241115570068, "loss/kd": 2.2598977088928223, "loss/lm": 0.3195505142211914, "step": 4217 }, { "epoch": 0.8659412851570519, "grad_norm": 0.917791789188467, "kd_ratio": 0.5, "learning_rate": 9.281976623855671e-07, "loss": 0.838444173336029, "loss/kd": 1.3562400341033936, "loss/lm": 0.32064831256866455, "step": 4218 }, { "epoch": 0.8661465818107165, "grad_norm": 0.8536526429939362, "kd_ratio": 0.5, "learning_rate": 9.254016146778555e-07, "loss": 0.9760279655456543, "loss/kd": 1.6154870986938477, "loss/lm": 0.33656880259513855, "step": 4219 }, { "epoch": 0.866351878464381, "grad_norm": 0.8574816127189837, "kd_ratio": 0.5, "learning_rate": 9.226095803231871e-07, "loss": 1.0655605792999268, "loss/kd": 1.770016074180603, "loss/lm": 0.36110520362854004, "step": 4220 }, { "epoch": 0.8665571751180455, "grad_norm": 0.8698269803793021, "kd_ratio": 0.5, "learning_rate": 9.198215605563732e-07, "loss": 1.0234150886535645, "loss/kd": 1.6160368919372559, "loss/lm": 0.4307931959629059, "step": 4221 }, { "epoch": 0.8667624717717102, "grad_norm": 1.0943996700917622, "kd_ratio": 0.5, "learning_rate": 9.170375566104517e-07, "loss": 1.146033525466919, "loss/kd": 1.9859306812286377, "loss/lm": 0.30613645911216736, "step": 4222 }, { "epoch": 0.8669677684253747, "grad_norm": 0.9383394272508937, "kd_ratio": 0.5, "learning_rate": 9.1425756971668e-07, "loss": 1.1812620162963867, "loss/kd": 2.0504398345947266, "loss/lm": 0.3120840787887573, "step": 4223 }, { "epoch": 0.8671730650790392, "grad_norm": 1.0187983298468697, "kd_ratio": 0.5, "learning_rate": 9.114816011045424e-07, "loss": 1.1553568840026855, "loss/kd": 1.9613503217697144, "loss/lm": 0.3493634760379791, "step": 4224 }, { "epoch": 0.8673783617327038, "grad_norm": 0.9566422890467315, "kd_ratio": 0.5, "learning_rate": 9.087096520017458e-07, "loss": 0.8938310146331787, "loss/kd": 1.478707194328308, "loss/lm": 0.3089548349380493, "step": 4225 }, { "epoch": 0.8675836583863683, "grad_norm": 0.8727840120839689, "kd_ratio": 0.5, "learning_rate": 9.059417236342194e-07, "loss": 0.9968939423561096, "loss/kd": 1.682312250137329, "loss/lm": 0.31147563457489014, "step": 4226 }, { "epoch": 0.8677889550400328, "grad_norm": 0.8352406126311451, "kd_ratio": 0.5, "learning_rate": 9.03177817226113e-07, "loss": 0.9901167154312134, "loss/kd": 1.675947666168213, "loss/lm": 0.30428576469421387, "step": 4227 }, { "epoch": 0.8679942516936974, "grad_norm": 0.9831937424543937, "kd_ratio": 0.5, "learning_rate": 9.004179339997988e-07, "loss": 1.2666118144989014, "loss/kd": 2.145512342453003, "loss/lm": 0.38771122694015503, "step": 4228 }, { "epoch": 0.8681995483473619, "grad_norm": 0.9370592983789093, "kd_ratio": 0.5, "learning_rate": 8.976620751758646e-07, "loss": 1.0079915523529053, "loss/kd": 1.6334002017974854, "loss/lm": 0.38258281350135803, "step": 4229 }, { "epoch": 0.8684048450010264, "grad_norm": 1.0250451764747301, "kd_ratio": 0.5, "learning_rate": 8.949102419731315e-07, "loss": 1.42741858959198, "loss/kd": 2.5444095134735107, "loss/lm": 0.3104276955127716, "step": 4230 }, { "epoch": 0.8686101416546911, "grad_norm": 0.9604806507661419, "kd_ratio": 0.5, "learning_rate": 8.921624356086256e-07, "loss": 1.1179360151290894, "loss/kd": 1.9138561487197876, "loss/lm": 0.32201582193374634, "step": 4231 }, { "epoch": 0.8688154383083556, "grad_norm": 1.1713004849070132, "kd_ratio": 0.5, "learning_rate": 8.894186572975993e-07, "loss": 1.0420377254486084, "loss/kd": 1.7245687246322632, "loss/lm": 0.35950666666030884, "step": 4232 }, { "epoch": 0.8690207349620201, "grad_norm": 0.9562904954906635, "kd_ratio": 0.5, "learning_rate": 8.866789082535243e-07, "loss": 0.8776776790618896, "loss/kd": 1.417819857597351, "loss/lm": 0.3375355303287506, "step": 4233 }, { "epoch": 0.8692260316156847, "grad_norm": 0.9265254538131221, "kd_ratio": 0.5, "learning_rate": 8.839431896880835e-07, "loss": 0.8989689350128174, "loss/kd": 1.4388858079910278, "loss/lm": 0.35905200242996216, "step": 4234 }, { "epoch": 0.8694313282693492, "grad_norm": 0.9251817788944736, "kd_ratio": 0.5, "learning_rate": 8.81211502811189e-07, "loss": 0.9103348851203918, "loss/kd": 1.4703649282455444, "loss/lm": 0.35030484199523926, "step": 4235 }, { "epoch": 0.8696366249230137, "grad_norm": 1.0457032273559443, "kd_ratio": 0.5, "learning_rate": 8.784838488309577e-07, "loss": 0.8653278350830078, "loss/kd": 1.4456372261047363, "loss/lm": 0.2850185036659241, "step": 4236 }, { "epoch": 0.8698419215766783, "grad_norm": 0.8860503704474869, "kd_ratio": 0.5, "learning_rate": 8.757602289537326e-07, "loss": 1.1546247005462646, "loss/kd": 1.9333388805389404, "loss/lm": 0.3759104609489441, "step": 4237 }, { "epoch": 0.8700472182303428, "grad_norm": 1.2220702295591412, "kd_ratio": 0.5, "learning_rate": 8.730406443840634e-07, "loss": 0.9212595224380493, "loss/kd": 1.4956858158111572, "loss/lm": 0.3468332290649414, "step": 4238 }, { "epoch": 0.8702525148840073, "grad_norm": 1.6370652083314174, "kd_ratio": 0.5, "learning_rate": 8.703250963247223e-07, "loss": 0.8829562664031982, "loss/kd": 1.4840137958526611, "loss/lm": 0.2818986773490906, "step": 4239 }, { "epoch": 0.870457811537672, "grad_norm": 0.8080666025387692, "kd_ratio": 0.5, "learning_rate": 8.676135859766932e-07, "loss": 1.1441946029663086, "loss/kd": 1.993737816810608, "loss/lm": 0.2946513295173645, "step": 4240 }, { "epoch": 0.8706631081913365, "grad_norm": 1.0386084301998988, "kd_ratio": 0.5, "learning_rate": 8.649061145391758e-07, "loss": 1.0709781646728516, "loss/kd": 1.7936252355575562, "loss/lm": 0.348331093788147, "step": 4241 }, { "epoch": 0.870868404845001, "grad_norm": 0.8804253194111202, "kd_ratio": 0.5, "learning_rate": 8.622026832095854e-07, "loss": 1.1393990516662598, "loss/kd": 1.8868290185928345, "loss/lm": 0.39196914434432983, "step": 4242 }, { "epoch": 0.8710737014986656, "grad_norm": 0.9504615292390975, "kd_ratio": 0.5, "learning_rate": 8.595032931835423e-07, "loss": 1.1052942276000977, "loss/kd": 1.9115537405014038, "loss/lm": 0.29903465509414673, "step": 4243 }, { "epoch": 0.8712789981523301, "grad_norm": 0.89187352175617, "kd_ratio": 0.5, "learning_rate": 8.568079456548895e-07, "loss": 1.2717491388320923, "loss/kd": 2.2181854248046875, "loss/lm": 0.32531288266181946, "step": 4244 }, { "epoch": 0.8714842948059947, "grad_norm": 0.9335773539379968, "kd_ratio": 0.5, "learning_rate": 8.541166418156721e-07, "loss": 0.9697749018669128, "loss/kd": 1.5923906564712524, "loss/lm": 0.34715914726257324, "step": 4245 }, { "epoch": 0.8716895914596592, "grad_norm": 0.9748648222041594, "kd_ratio": 0.5, "learning_rate": 8.514293828561593e-07, "loss": 1.077614188194275, "loss/kd": 1.831221580505371, "loss/lm": 0.32400673627853394, "step": 4246 }, { "epoch": 0.8718948881133237, "grad_norm": 1.0033823770426027, "kd_ratio": 0.5, "learning_rate": 8.487461699648203e-07, "loss": 1.3332409858703613, "loss/kd": 2.275799512863159, "loss/lm": 0.39068248867988586, "step": 4247 }, { "epoch": 0.8721001847669883, "grad_norm": 0.9975813139055715, "kd_ratio": 0.5, "learning_rate": 8.460670043283403e-07, "loss": 1.9511871337890625, "loss/kd": 3.632159948348999, "loss/lm": 0.270214319229126, "step": 4248 }, { "epoch": 0.8723054814206529, "grad_norm": 0.9243173863938654, "kd_ratio": 0.5, "learning_rate": 8.433918871316149e-07, "loss": 1.1801681518554688, "loss/kd": 2.009976863861084, "loss/lm": 0.35035932064056396, "step": 4249 }, { "epoch": 0.8725107780743174, "grad_norm": 1.0212299490430907, "kd_ratio": 0.5, "learning_rate": 8.407208195577421e-07, "loss": 0.980671226978302, "loss/kd": 1.6596031188964844, "loss/lm": 0.30173933506011963, "step": 4250 }, { "epoch": 0.872716074727982, "grad_norm": 1.0102018252553717, "kd_ratio": 0.5, "learning_rate": 8.380538027880425e-07, "loss": 0.9648871421813965, "loss/kd": 1.5743736028671265, "loss/lm": 0.3554007411003113, "step": 4251 }, { "epoch": 0.8729213713816465, "grad_norm": 0.9568319861304714, "kd_ratio": 0.5, "learning_rate": 8.353908380020303e-07, "loss": 1.1634654998779297, "loss/kd": 1.9968851804733276, "loss/lm": 0.3300458490848541, "step": 4252 }, { "epoch": 0.873126668035311, "grad_norm": 0.9230164241296394, "kd_ratio": 0.5, "learning_rate": 8.327319263774403e-07, "loss": 1.19691801071167, "loss/kd": 2.0764074325561523, "loss/lm": 0.3174285590648651, "step": 4253 }, { "epoch": 0.8733319646889756, "grad_norm": 0.8134791307747362, "kd_ratio": 0.5, "learning_rate": 8.300770690902027e-07, "loss": 1.08411705493927, "loss/kd": 1.8149974346160889, "loss/lm": 0.3532366156578064, "step": 4254 }, { "epoch": 0.8735372613426401, "grad_norm": 1.0198734888462806, "kd_ratio": 0.5, "learning_rate": 8.274262673144651e-07, "loss": 1.4053289890289307, "loss/kd": 2.3914523124694824, "loss/lm": 0.4192057251930237, "step": 4255 }, { "epoch": 0.8737425579963046, "grad_norm": 0.9021056438859709, "kd_ratio": 0.5, "learning_rate": 8.247795222225763e-07, "loss": 0.9568327069282532, "loss/kd": 1.545265555381775, "loss/lm": 0.36839982867240906, "step": 4256 }, { "epoch": 0.8739478546499692, "grad_norm": 0.9150989718381167, "kd_ratio": 0.5, "learning_rate": 8.221368349850933e-07, "loss": 1.1066068410873413, "loss/kd": 1.74663245677948, "loss/lm": 0.46658119559288025, "step": 4257 }, { "epoch": 0.8741531513036338, "grad_norm": 0.9238402525254084, "kd_ratio": 0.5, "learning_rate": 8.194982067707735e-07, "loss": 1.8492364883422852, "loss/kd": 3.4534084796905518, "loss/lm": 0.24506454169750214, "step": 4258 }, { "epoch": 0.8743584479572983, "grad_norm": 0.9428589274494218, "kd_ratio": 0.5, "learning_rate": 8.168636387465856e-07, "loss": 0.9645904302597046, "loss/kd": 1.5754097700119019, "loss/lm": 0.35377106070518494, "step": 4259 }, { "epoch": 0.8745637446109629, "grad_norm": 0.9136225665616042, "kd_ratio": 0.5, "learning_rate": 8.142331320776986e-07, "loss": 0.9897071123123169, "loss/kd": 1.619023323059082, "loss/lm": 0.36039096117019653, "step": 4260 }, { "epoch": 0.8747690412646274, "grad_norm": 1.1919308142364082, "kd_ratio": 0.5, "learning_rate": 8.116066879274875e-07, "loss": 0.8704187870025635, "loss/kd": 1.412089467048645, "loss/lm": 0.32874810695648193, "step": 4261 }, { "epoch": 0.8749743379182919, "grad_norm": 0.948854224550023, "kd_ratio": 0.5, "learning_rate": 8.08984307457531e-07, "loss": 1.1344434022903442, "loss/kd": 1.867192029953003, "loss/lm": 0.40169480443000793, "step": 4262 }, { "epoch": 0.8751796345719565, "grad_norm": 1.0099638150803145, "kd_ratio": 0.5, "learning_rate": 8.063659918276056e-07, "loss": 1.0149670839309692, "loss/kd": 1.7411795854568481, "loss/lm": 0.28875449299812317, "step": 4263 }, { "epoch": 0.875384931225621, "grad_norm": 0.941732125179236, "kd_ratio": 0.5, "learning_rate": 8.037517421956975e-07, "loss": 0.8742177486419678, "loss/kd": 1.4659620523452759, "loss/lm": 0.28247344493865967, "step": 4264 }, { "epoch": 0.8755902278792855, "grad_norm": 0.8764010364144126, "kd_ratio": 0.5, "learning_rate": 8.01141559717985e-07, "loss": 1.3883715867996216, "loss/kd": 2.4386043548583984, "loss/lm": 0.3381389081478119, "step": 4265 }, { "epoch": 0.8757955245329501, "grad_norm": 1.023609643790077, "kd_ratio": 0.5, "learning_rate": 7.985354455488615e-07, "loss": 0.9648119211196899, "loss/kd": 1.5904037952423096, "loss/lm": 0.3392200469970703, "step": 4266 }, { "epoch": 0.8760008211866147, "grad_norm": 0.9763928296114032, "kd_ratio": 0.5, "learning_rate": 7.95933400840907e-07, "loss": 0.7344580888748169, "loss/kd": 1.1854974031448364, "loss/lm": 0.2834187150001526, "step": 4267 }, { "epoch": 0.8762061178402792, "grad_norm": 0.921507420510133, "kd_ratio": 0.5, "learning_rate": 7.933354267449089e-07, "loss": 1.0607186555862427, "loss/kd": 1.7739282846450806, "loss/lm": 0.3475090265274048, "step": 4268 }, { "epoch": 0.8764114144939438, "grad_norm": 0.8425857578873512, "kd_ratio": 0.5, "learning_rate": 7.907415244098571e-07, "loss": 0.9511082172393799, "loss/kd": 1.4973382949829102, "loss/lm": 0.404878169298172, "step": 4269 }, { "epoch": 0.8766167111476083, "grad_norm": 1.0893766126016344, "kd_ratio": 0.5, "learning_rate": 7.881516949829316e-07, "loss": 0.8869448900222778, "loss/kd": 1.4507452249526978, "loss/lm": 0.3231445848941803, "step": 4270 }, { "epoch": 0.8768220078012728, "grad_norm": 0.9410515296385543, "kd_ratio": 0.5, "learning_rate": 7.855659396095183e-07, "loss": 1.145078182220459, "loss/kd": 1.9695988893508911, "loss/lm": 0.32055747509002686, "step": 4271 }, { "epoch": 0.8770273044549374, "grad_norm": 0.9626317909383401, "kd_ratio": 0.5, "learning_rate": 7.829842594332004e-07, "loss": 0.9441805481910706, "loss/kd": 1.56990647315979, "loss/lm": 0.3184545934200287, "step": 4272 }, { "epoch": 0.8772326011086019, "grad_norm": 0.8151220399731421, "kd_ratio": 0.5, "learning_rate": 7.80406655595759e-07, "loss": 1.009857177734375, "loss/kd": 1.6573147773742676, "loss/lm": 0.3623996675014496, "step": 4273 }, { "epoch": 0.8774378977622664, "grad_norm": 0.8834460803194321, "kd_ratio": 0.5, "learning_rate": 7.778331292371677e-07, "loss": 0.8569350838661194, "loss/kd": 1.3912158012390137, "loss/lm": 0.3226543664932251, "step": 4274 }, { "epoch": 0.877643194415931, "grad_norm": 1.0184034705800518, "kd_ratio": 0.5, "learning_rate": 7.752636814956027e-07, "loss": 1.1303625106811523, "loss/kd": 1.9062479734420776, "loss/lm": 0.3544769883155823, "step": 4275 }, { "epoch": 0.8778484910695956, "grad_norm": 2.627795538089203, "kd_ratio": 0.5, "learning_rate": 7.726983135074328e-07, "loss": 0.9885771870613098, "loss/kd": 1.6831070184707642, "loss/lm": 0.29404738545417786, "step": 4276 }, { "epoch": 0.8780537877232601, "grad_norm": 1.1285645170827476, "kd_ratio": 0.5, "learning_rate": 7.701370264072239e-07, "loss": 0.8773676753044128, "loss/kd": 1.3831243515014648, "loss/lm": 0.3716110289096832, "step": 4277 }, { "epoch": 0.8782590843769247, "grad_norm": 0.9061736907497033, "kd_ratio": 0.5, "learning_rate": 7.675798213277386e-07, "loss": 0.9174439907073975, "loss/kd": 1.5637388229370117, "loss/lm": 0.2711491584777832, "step": 4278 }, { "epoch": 0.8784643810305892, "grad_norm": 0.87544909215127, "kd_ratio": 0.5, "learning_rate": 7.6502669939993e-07, "loss": 1.1021592617034912, "loss/kd": 1.7891687154769897, "loss/lm": 0.41514989733695984, "step": 4279 }, { "epoch": 0.8786696776842537, "grad_norm": 0.9222396516895185, "kd_ratio": 0.5, "learning_rate": 7.624776617529495e-07, "loss": 1.347022533416748, "loss/kd": 2.3389744758605957, "loss/lm": 0.355070561170578, "step": 4280 }, { "epoch": 0.8788749743379183, "grad_norm": 0.9355993438042374, "kd_ratio": 0.5, "learning_rate": 7.599327095141363e-07, "loss": 1.1278352737426758, "loss/kd": 1.868312954902649, "loss/lm": 0.38735753297805786, "step": 4281 }, { "epoch": 0.8790802709915828, "grad_norm": 1.015668960616292, "kd_ratio": 0.5, "learning_rate": 7.573918438090334e-07, "loss": 1.0167732238769531, "loss/kd": 1.7258256673812866, "loss/lm": 0.30772072076797485, "step": 4282 }, { "epoch": 0.8792855676452473, "grad_norm": 0.9450310493795105, "kd_ratio": 0.5, "learning_rate": 7.548550657613651e-07, "loss": 0.8447628021240234, "loss/kd": 1.3813025951385498, "loss/lm": 0.30822306871414185, "step": 4283 }, { "epoch": 0.8794908642989119, "grad_norm": 0.944519850969958, "kd_ratio": 0.5, "learning_rate": 7.523223764930554e-07, "loss": 1.2153023481369019, "loss/kd": 2.0856521129608154, "loss/lm": 0.3449525833129883, "step": 4284 }, { "epoch": 0.8796961609525765, "grad_norm": 0.9349637729928708, "kd_ratio": 0.5, "learning_rate": 7.497937771242159e-07, "loss": 0.9629557132720947, "loss/kd": 1.6071211099624634, "loss/lm": 0.3187903165817261, "step": 4285 }, { "epoch": 0.879901457606241, "grad_norm": 1.122777301015643, "kd_ratio": 0.5, "learning_rate": 7.472692687731498e-07, "loss": 1.0810163021087646, "loss/kd": 1.7861416339874268, "loss/lm": 0.3758910298347473, "step": 4286 }, { "epoch": 0.8801067542599056, "grad_norm": 0.9213356443016887, "kd_ratio": 0.5, "learning_rate": 7.447488525563551e-07, "loss": 1.025536060333252, "loss/kd": 1.7401286363601685, "loss/lm": 0.31094351410865784, "step": 4287 }, { "epoch": 0.8803120509135701, "grad_norm": 0.9210783696551156, "kd_ratio": 0.5, "learning_rate": 7.422325295885147e-07, "loss": 0.8788971900939941, "loss/kd": 1.429042100906372, "loss/lm": 0.3287522494792938, "step": 4288 }, { "epoch": 0.8805173475672347, "grad_norm": 1.0043537319360694, "kd_ratio": 0.5, "learning_rate": 7.397203009825061e-07, "loss": 1.2969942092895508, "loss/kd": 2.207268238067627, "loss/lm": 0.3867202401161194, "step": 4289 }, { "epoch": 0.8807226442208992, "grad_norm": 0.8834524396315646, "kd_ratio": 0.5, "learning_rate": 7.372121678493893e-07, "loss": 1.2119393348693848, "loss/kd": 2.119781017303467, "loss/lm": 0.30409759283065796, "step": 4290 }, { "epoch": 0.8809279408745637, "grad_norm": 0.9883824058394368, "kd_ratio": 0.5, "learning_rate": 7.347081312984194e-07, "loss": 1.068069577217102, "loss/kd": 1.7403545379638672, "loss/lm": 0.3957845866680145, "step": 4291 }, { "epoch": 0.8811332375282283, "grad_norm": 0.8770404264109177, "kd_ratio": 0.5, "learning_rate": 7.322081924370373e-07, "loss": 1.0175657272338867, "loss/kd": 1.7058799266815186, "loss/lm": 0.32925161719322205, "step": 4292 }, { "epoch": 0.8813385341818928, "grad_norm": 0.8383215608776586, "kd_ratio": 0.5, "learning_rate": 7.297123523708726e-07, "loss": 0.9937549233436584, "loss/kd": 1.6707522869110107, "loss/lm": 0.31675755977630615, "step": 4293 }, { "epoch": 0.8815438308355574, "grad_norm": 0.939423579457002, "kd_ratio": 0.5, "learning_rate": 7.272206122037407e-07, "loss": 0.8639847636222839, "loss/kd": 1.37171471118927, "loss/lm": 0.35625478625297546, "step": 4294 }, { "epoch": 0.881749127489222, "grad_norm": 0.9168498433603481, "kd_ratio": 0.5, "learning_rate": 7.247329730376429e-07, "loss": 1.0864344835281372, "loss/kd": 1.8604544401168823, "loss/lm": 0.3124144673347473, "step": 4295 }, { "epoch": 0.8819544241428865, "grad_norm": 0.9337298803362354, "kd_ratio": 0.5, "learning_rate": 7.222494359727716e-07, "loss": 1.088208794593811, "loss/kd": 1.8555903434753418, "loss/lm": 0.3208272159099579, "step": 4296 }, { "epoch": 0.882159720796551, "grad_norm": 1.005835270525725, "kd_ratio": 0.5, "learning_rate": 7.19770002107496e-07, "loss": 1.0053740739822388, "loss/kd": 1.6723312139511108, "loss/lm": 0.33841702342033386, "step": 4297 }, { "epoch": 0.8823650174502156, "grad_norm": 0.8425038009862391, "kd_ratio": 0.5, "learning_rate": 7.172946725383845e-07, "loss": 1.0889867544174194, "loss/kd": 1.893341064453125, "loss/lm": 0.28463247418403625, "step": 4298 }, { "epoch": 0.8825703141038801, "grad_norm": 0.8151215131248152, "kd_ratio": 0.5, "learning_rate": 7.148234483601746e-07, "loss": 1.0874685049057007, "loss/kd": 1.8268219232559204, "loss/lm": 0.34811508655548096, "step": 4299 }, { "epoch": 0.8827756107575446, "grad_norm": 0.8951901148895474, "kd_ratio": 0.5, "learning_rate": 7.123563306658021e-07, "loss": 1.062537670135498, "loss/kd": 1.72468900680542, "loss/lm": 0.40038633346557617, "step": 4300 }, { "epoch": 0.8829809074112092, "grad_norm": 0.8513264627925409, "kd_ratio": 0.5, "learning_rate": 7.098933205463742e-07, "loss": 1.0049306154251099, "loss/kd": 1.6791598796844482, "loss/lm": 0.33070144057273865, "step": 4301 }, { "epoch": 0.8831862040648737, "grad_norm": 1.016284204399165, "kd_ratio": 0.5, "learning_rate": 7.074344190911897e-07, "loss": 1.18617844581604, "loss/kd": 1.9609122276306152, "loss/lm": 0.4114445447921753, "step": 4302 }, { "epoch": 0.8833915007185383, "grad_norm": 0.9846546284682114, "kd_ratio": 0.5, "learning_rate": 7.049796273877297e-07, "loss": 0.8683002591133118, "loss/kd": 1.3711014986038208, "loss/lm": 0.3654990494251251, "step": 4303 }, { "epoch": 0.8835967973722029, "grad_norm": 0.8827207403219506, "kd_ratio": 0.5, "learning_rate": 7.025289465216534e-07, "loss": 1.019794225692749, "loss/kd": 1.6342358589172363, "loss/lm": 0.40535247325897217, "step": 4304 }, { "epoch": 0.8838020940258674, "grad_norm": 0.9795329998626092, "kd_ratio": 0.5, "learning_rate": 7.000823775768095e-07, "loss": 1.1177066564559937, "loss/kd": 1.9063866138458252, "loss/lm": 0.32902660965919495, "step": 4305 }, { "epoch": 0.8840073906795319, "grad_norm": 1.0008871377916337, "kd_ratio": 0.5, "learning_rate": 6.97639921635217e-07, "loss": 1.0556941032409668, "loss/kd": 1.7804795503616333, "loss/lm": 0.3309085965156555, "step": 4306 }, { "epoch": 0.8842126873331965, "grad_norm": 0.8462509990641638, "kd_ratio": 0.5, "learning_rate": 6.952015797770862e-07, "loss": 1.095921516418457, "loss/kd": 1.8689650297164917, "loss/lm": 0.32287803292274475, "step": 4307 }, { "epoch": 0.884417983986861, "grad_norm": 0.9351999136977673, "kd_ratio": 0.5, "learning_rate": 6.927673530808021e-07, "loss": 1.1434299945831299, "loss/kd": 1.9010318517684937, "loss/lm": 0.38582801818847656, "step": 4308 }, { "epoch": 0.8846232806405255, "grad_norm": 0.9566690014507438, "kd_ratio": 0.5, "learning_rate": 6.903372426229337e-07, "loss": 1.0018445253372192, "loss/kd": 1.6486700773239136, "loss/lm": 0.35501888394355774, "step": 4309 }, { "epoch": 0.8848285772941901, "grad_norm": 0.8705291615619707, "kd_ratio": 0.5, "learning_rate": 6.879112494782247e-07, "loss": 1.1796209812164307, "loss/kd": 2.057766914367676, "loss/lm": 0.30147504806518555, "step": 4310 }, { "epoch": 0.8850338739478546, "grad_norm": 0.8628489886963906, "kd_ratio": 0.5, "learning_rate": 6.854893747196034e-07, "loss": 1.2996894121170044, "loss/kd": 2.261272668838501, "loss/lm": 0.33810609579086304, "step": 4311 }, { "epoch": 0.8852391706015192, "grad_norm": 1.024898364203047, "kd_ratio": 0.5, "learning_rate": 6.830716194181675e-07, "loss": 1.3134400844573975, "loss/kd": 2.3226754665374756, "loss/lm": 0.30420467257499695, "step": 4312 }, { "epoch": 0.8854444672551838, "grad_norm": 0.8723874342416327, "kd_ratio": 0.5, "learning_rate": 6.806579846432082e-07, "loss": 0.9265754222869873, "loss/kd": 1.5047608613967896, "loss/lm": 0.34839004278182983, "step": 4313 }, { "epoch": 0.8856497639088483, "grad_norm": 0.8706567287019322, "kd_ratio": 0.5, "learning_rate": 6.782484714621784e-07, "loss": 0.7813515067100525, "loss/kd": 1.242663025856018, "loss/lm": 0.3200400173664093, "step": 4314 }, { "epoch": 0.8858550605625128, "grad_norm": 0.8222081894551366, "kd_ratio": 0.5, "learning_rate": 6.758430809407169e-07, "loss": 1.060579538345337, "loss/kd": 1.6434611082077026, "loss/lm": 0.47769784927368164, "step": 4315 }, { "epoch": 0.8860603572161774, "grad_norm": 0.8603015500537927, "kd_ratio": 0.5, "learning_rate": 6.734418141426391e-07, "loss": 1.0442261695861816, "loss/kd": 1.7245346307754517, "loss/lm": 0.36391758918762207, "step": 4316 }, { "epoch": 0.8862656538698419, "grad_norm": 0.8953310036867809, "kd_ratio": 0.5, "learning_rate": 6.710446721299313e-07, "loss": 1.1176618337631226, "loss/kd": 1.923265814781189, "loss/lm": 0.31205785274505615, "step": 4317 }, { "epoch": 0.8864709505235064, "grad_norm": 0.8480020254453784, "kd_ratio": 0.5, "learning_rate": 6.686516559627632e-07, "loss": 1.0117664337158203, "loss/kd": 1.7122366428375244, "loss/lm": 0.3112962543964386, "step": 4318 }, { "epoch": 0.886676247177171, "grad_norm": 0.9970871789101651, "kd_ratio": 0.5, "learning_rate": 6.662627666994725e-07, "loss": 0.9024921655654907, "loss/kd": 1.483841061592102, "loss/lm": 0.3211432695388794, "step": 4319 }, { "epoch": 0.8868815438308355, "grad_norm": 0.8462443023032475, "kd_ratio": 0.5, "learning_rate": 6.638780053965776e-07, "loss": 0.9050639867782593, "loss/kd": 1.4510445594787598, "loss/lm": 0.35908347368240356, "step": 4320 }, { "epoch": 0.8870868404845001, "grad_norm": 0.8641807230962706, "kd_ratio": 0.5, "learning_rate": 6.61497373108766e-07, "loss": 1.0754904747009277, "loss/kd": 1.8354710340499878, "loss/lm": 0.3155098259449005, "step": 4321 }, { "epoch": 0.8872921371381647, "grad_norm": 0.8592758323019078, "kd_ratio": 0.5, "learning_rate": 6.59120870888903e-07, "loss": 1.0310081243515015, "loss/kd": 1.813039779663086, "loss/lm": 0.24897651374340057, "step": 4322 }, { "epoch": 0.8874974337918292, "grad_norm": 0.8967156066747843, "kd_ratio": 0.5, "learning_rate": 6.567484997880247e-07, "loss": 1.0184112787246704, "loss/kd": 1.7168089151382446, "loss/lm": 0.3200136423110962, "step": 4323 }, { "epoch": 0.8877027304454937, "grad_norm": 0.8771285136605331, "kd_ratio": 0.5, "learning_rate": 6.54380260855344e-07, "loss": 1.0345486402511597, "loss/kd": 1.7452939748764038, "loss/lm": 0.32380321621894836, "step": 4324 }, { "epoch": 0.8879080270991583, "grad_norm": 1.139473509688524, "kd_ratio": 0.5, "learning_rate": 6.520161551382431e-07, "loss": 0.9479995369911194, "loss/kd": 1.6101375818252563, "loss/lm": 0.2858614921569824, "step": 4325 }, { "epoch": 0.8881133237528228, "grad_norm": 0.8830024062363748, "kd_ratio": 0.5, "learning_rate": 6.496561836822745e-07, "loss": 1.125962734222412, "loss/kd": 1.9320489168167114, "loss/lm": 0.31987643241882324, "step": 4326 }, { "epoch": 0.8883186204064873, "grad_norm": 1.583953763736469, "kd_ratio": 0.5, "learning_rate": 6.47300347531169e-07, "loss": 1.474070429801941, "loss/kd": 2.6866142749786377, "loss/lm": 0.26152655482292175, "step": 4327 }, { "epoch": 0.8885239170601519, "grad_norm": 0.8434431320676695, "kd_ratio": 0.5, "learning_rate": 6.449486477268174e-07, "loss": 0.8564447164535522, "loss/kd": 1.3623276948928833, "loss/lm": 0.3505616784095764, "step": 4328 }, { "epoch": 0.8887292137138164, "grad_norm": 0.941701553386589, "kd_ratio": 0.5, "learning_rate": 6.426010853092957e-07, "loss": 1.9140669107437134, "loss/kd": 3.569955825805664, "loss/lm": 0.25817805528640747, "step": 4329 }, { "epoch": 0.8889345103674811, "grad_norm": 0.8314657342169833, "kd_ratio": 0.5, "learning_rate": 6.402576613168366e-07, "loss": 1.3190958499908447, "loss/kd": 2.3098549842834473, "loss/lm": 0.32833659648895264, "step": 4330 }, { "epoch": 0.8891398070211456, "grad_norm": 0.8511214713773817, "kd_ratio": 0.5, "learning_rate": 6.37918376785851e-07, "loss": 1.124899983406067, "loss/kd": 1.9353160858154297, "loss/lm": 0.3144839107990265, "step": 4331 }, { "epoch": 0.8893451036748101, "grad_norm": 0.7826432023049903, "kd_ratio": 0.5, "learning_rate": 6.35583232750917e-07, "loss": 1.0208678245544434, "loss/kd": 1.7456411123275757, "loss/lm": 0.2960945963859558, "step": 4332 }, { "epoch": 0.8895504003284747, "grad_norm": 0.8458371496949535, "kd_ratio": 0.5, "learning_rate": 6.332522302447775e-07, "loss": 1.1695603132247925, "loss/kd": 2.0078909397125244, "loss/lm": 0.33122962713241577, "step": 4333 }, { "epoch": 0.8897556969821392, "grad_norm": 0.8466550912604126, "kd_ratio": 0.5, "learning_rate": 6.309253702983509e-07, "loss": 0.8592625260353088, "loss/kd": 1.4598890542984009, "loss/lm": 0.2586360275745392, "step": 4334 }, { "epoch": 0.8899609936358037, "grad_norm": 0.8492368973503329, "kd_ratio": 0.5, "learning_rate": 6.286026539407164e-07, "loss": 0.9428911209106445, "loss/kd": 1.560908555984497, "loss/lm": 0.324873685836792, "step": 4335 }, { "epoch": 0.8901662902894683, "grad_norm": 0.7616958664495437, "kd_ratio": 0.5, "learning_rate": 6.262840821991278e-07, "loss": 0.9202433228492737, "loss/kd": 1.5444259643554688, "loss/lm": 0.2960606813430786, "step": 4336 }, { "epoch": 0.8903715869431328, "grad_norm": 0.9030151621327956, "kd_ratio": 0.5, "learning_rate": 6.239696560989983e-07, "loss": 1.1682953834533691, "loss/kd": 1.9649765491485596, "loss/lm": 0.3716142773628235, "step": 4337 }, { "epoch": 0.8905768835967973, "grad_norm": 0.8844742029676127, "kd_ratio": 0.5, "learning_rate": 6.216593766639123e-07, "loss": 0.9312054514884949, "loss/kd": 1.5818405151367188, "loss/lm": 0.280570387840271, "step": 4338 }, { "epoch": 0.890782180250462, "grad_norm": 0.8444263036189084, "kd_ratio": 0.5, "learning_rate": 6.193532449156203e-07, "loss": 0.957482099533081, "loss/kd": 1.619769811630249, "loss/lm": 0.2951943874359131, "step": 4339 }, { "epoch": 0.8909874769041265, "grad_norm": 0.9020242446578632, "kd_ratio": 0.5, "learning_rate": 6.170512618740387e-07, "loss": 0.9011563062667847, "loss/kd": 1.5267695188522339, "loss/lm": 0.2755431532859802, "step": 4340 }, { "epoch": 0.891192773557791, "grad_norm": 0.8236515106411035, "kd_ratio": 0.5, "learning_rate": 6.147534285572443e-07, "loss": 1.020158290863037, "loss/kd": 1.689591884613037, "loss/lm": 0.35072463750839233, "step": 4341 }, { "epoch": 0.8913980702114556, "grad_norm": 0.8423480563047645, "kd_ratio": 0.5, "learning_rate": 6.124597459814852e-07, "loss": 1.123726487159729, "loss/kd": 1.8954877853393555, "loss/lm": 0.3519652783870697, "step": 4342 }, { "epoch": 0.8916033668651201, "grad_norm": 1.0224456896391676, "kd_ratio": 0.5, "learning_rate": 6.101702151611688e-07, "loss": 0.9551455974578857, "loss/kd": 1.5772595405578613, "loss/lm": 0.33303171396255493, "step": 4343 }, { "epoch": 0.8918086635187846, "grad_norm": 0.8545060742656397, "kd_ratio": 0.5, "learning_rate": 6.078848371088708e-07, "loss": 0.98462975025177, "loss/kd": 1.6077613830566406, "loss/lm": 0.36149805784225464, "step": 4344 }, { "epoch": 0.8920139601724492, "grad_norm": 0.8389888137284842, "kd_ratio": 0.5, "learning_rate": 6.056036128353259e-07, "loss": 1.0184745788574219, "loss/kd": 1.6913923025131226, "loss/lm": 0.3455568552017212, "step": 4345 }, { "epoch": 0.8922192568261137, "grad_norm": 0.8828210644484698, "kd_ratio": 0.5, "learning_rate": 6.03326543349434e-07, "loss": 0.9880515336990356, "loss/kd": 1.6584535837173462, "loss/lm": 0.3176494240760803, "step": 4346 }, { "epoch": 0.8924245534797782, "grad_norm": 0.8020888158522647, "kd_ratio": 0.5, "learning_rate": 6.010536296582592e-07, "loss": 0.815899133682251, "loss/kd": 1.3412612676620483, "loss/lm": 0.2905369997024536, "step": 4347 }, { "epoch": 0.8926298501334429, "grad_norm": 0.9696769589621215, "kd_ratio": 0.5, "learning_rate": 5.987848727670221e-07, "loss": 1.1877609491348267, "loss/kd": 2.038654327392578, "loss/lm": 0.33686748147010803, "step": 4348 }, { "epoch": 0.8928351467871074, "grad_norm": 0.8196393354275672, "kd_ratio": 0.5, "learning_rate": 5.965202736791086e-07, "loss": 1.1321940422058105, "loss/kd": 1.8835439682006836, "loss/lm": 0.3808441162109375, "step": 4349 }, { "epoch": 0.8930404434407719, "grad_norm": 0.8601857368230562, "kd_ratio": 0.5, "learning_rate": 5.942598333960692e-07, "loss": 0.9483259320259094, "loss/kd": 1.5582239627838135, "loss/lm": 0.33842790126800537, "step": 4350 }, { "epoch": 0.8932457400944365, "grad_norm": 0.8209050283739746, "kd_ratio": 0.5, "learning_rate": 5.920035529176082e-07, "loss": 0.9165034294128418, "loss/kd": 1.5023553371429443, "loss/lm": 0.33065149188041687, "step": 4351 }, { "epoch": 0.893451036748101, "grad_norm": 0.9124765295581958, "kd_ratio": 0.5, "learning_rate": 5.897514332415955e-07, "loss": 1.1799917221069336, "loss/kd": 2.0847201347351074, "loss/lm": 0.27526330947875977, "step": 4352 }, { "epoch": 0.8936563334017655, "grad_norm": 0.8440915787321083, "kd_ratio": 0.5, "learning_rate": 5.875034753640574e-07, "loss": 1.1236002445220947, "loss/kd": 1.865234136581421, "loss/lm": 0.3819664418697357, "step": 4353 }, { "epoch": 0.8938616300554301, "grad_norm": 0.9326779758900967, "kd_ratio": 0.5, "learning_rate": 5.85259680279181e-07, "loss": 1.1317486763000488, "loss/kd": 1.8931100368499756, "loss/lm": 0.3703872561454773, "step": 4354 }, { "epoch": 0.8940669267090946, "grad_norm": 0.8055920591010195, "kd_ratio": 0.5, "learning_rate": 5.830200489793136e-07, "loss": 1.0976321697235107, "loss/kd": 1.8316606283187866, "loss/lm": 0.3636036813259125, "step": 4355 }, { "epoch": 0.8942722233627591, "grad_norm": 0.8561632753466774, "kd_ratio": 0.5, "learning_rate": 5.807845824549596e-07, "loss": 1.1005563735961914, "loss/kd": 1.8022767305374146, "loss/lm": 0.3988361060619354, "step": 4356 }, { "epoch": 0.8944775200164238, "grad_norm": 0.8750040360285347, "kd_ratio": 0.5, "learning_rate": 5.785532816947792e-07, "loss": 1.3995797634124756, "loss/kd": 2.4822068214416504, "loss/lm": 0.3169527053833008, "step": 4357 }, { "epoch": 0.8946828166700883, "grad_norm": 1.393812316331736, "kd_ratio": 0.5, "learning_rate": 5.763261476855952e-07, "loss": 1.093157172203064, "loss/kd": 1.8131884336471558, "loss/lm": 0.373125821352005, "step": 4358 }, { "epoch": 0.8948881133237528, "grad_norm": 0.8928932978607853, "kd_ratio": 0.5, "learning_rate": 5.741031814123843e-07, "loss": 1.2585679292678833, "loss/kd": 2.12882661819458, "loss/lm": 0.3883092403411865, "step": 4359 }, { "epoch": 0.8950934099774174, "grad_norm": 1.0356014802877382, "kd_ratio": 0.5, "learning_rate": 5.718843838582811e-07, "loss": 1.029139518737793, "loss/kd": 1.758013367652893, "loss/lm": 0.3002657890319824, "step": 4360 }, { "epoch": 0.8952987066310819, "grad_norm": 0.8560660235141194, "kd_ratio": 0.5, "learning_rate": 5.696697560045772e-07, "loss": 0.9423871040344238, "loss/kd": 1.5661715269088745, "loss/lm": 0.3186027407646179, "step": 4361 }, { "epoch": 0.8955040032847464, "grad_norm": 0.9058550591827755, "kd_ratio": 0.5, "learning_rate": 5.67459298830717e-07, "loss": 0.9690015912055969, "loss/kd": 1.6642009019851685, "loss/lm": 0.2738023102283478, "step": 4362 }, { "epoch": 0.895709299938411, "grad_norm": 0.91471860788125, "kd_ratio": 0.5, "learning_rate": 5.652530133143042e-07, "loss": 1.1589547395706177, "loss/kd": 1.9453731775283813, "loss/lm": 0.3725363314151764, "step": 4363 }, { "epoch": 0.8959145965920755, "grad_norm": 0.9872014956601187, "kd_ratio": 0.5, "learning_rate": 5.630509004310913e-07, "loss": 0.9443386197090149, "loss/kd": 1.6040457487106323, "loss/lm": 0.28463149070739746, "step": 4364 }, { "epoch": 0.89611989324574, "grad_norm": 0.8521010951484537, "kd_ratio": 0.5, "learning_rate": 5.608529611549973e-07, "loss": 0.997789204120636, "loss/kd": 1.7161451578140259, "loss/lm": 0.2794332802295685, "step": 4365 }, { "epoch": 0.8963251898994047, "grad_norm": 0.8780152188130992, "kd_ratio": 0.5, "learning_rate": 5.586591964580812e-07, "loss": 0.8689946532249451, "loss/kd": 1.440956950187683, "loss/lm": 0.29703235626220703, "step": 4366 }, { "epoch": 0.8965304865530692, "grad_norm": 0.8503252888838813, "kd_ratio": 0.5, "learning_rate": 5.564696073105669e-07, "loss": 1.0813522338867188, "loss/kd": 1.8625208139419556, "loss/lm": 0.3001837432384491, "step": 4367 }, { "epoch": 0.8967357832067337, "grad_norm": 0.8512440909281712, "kd_ratio": 0.5, "learning_rate": 5.542841946808264e-07, "loss": 0.9768938422203064, "loss/kd": 1.6958564519882202, "loss/lm": 0.25793126225471497, "step": 4368 }, { "epoch": 0.8969410798603983, "grad_norm": 0.9751793617530349, "kd_ratio": 0.5, "learning_rate": 5.521029595353833e-07, "loss": 1.1755532026290894, "loss/kd": 2.0286622047424316, "loss/lm": 0.32244426012039185, "step": 4369 }, { "epoch": 0.8971463765140628, "grad_norm": 0.9125978110591244, "kd_ratio": 0.5, "learning_rate": 5.49925902838917e-07, "loss": 1.031624674797058, "loss/kd": 1.7230496406555176, "loss/lm": 0.340199738740921, "step": 4370 }, { "epoch": 0.8973516731677273, "grad_norm": 0.933946447233334, "kd_ratio": 0.5, "learning_rate": 5.477530255542573e-07, "loss": 0.8748562335968018, "loss/kd": 1.451534390449524, "loss/lm": 0.2981780767440796, "step": 4371 }, { "epoch": 0.8975569698213919, "grad_norm": 0.9114489511990937, "kd_ratio": 0.5, "learning_rate": 5.455843286423879e-07, "loss": 1.0739161968231201, "loss/kd": 1.8046643733978271, "loss/lm": 0.3431679606437683, "step": 4372 }, { "epoch": 0.8977622664750564, "grad_norm": 0.8768786569902703, "kd_ratio": 0.5, "learning_rate": 5.434198130624401e-07, "loss": 1.125407338142395, "loss/kd": 1.8944042921066284, "loss/lm": 0.3564104735851288, "step": 4373 }, { "epoch": 0.897967563128721, "grad_norm": 0.8984459459752977, "kd_ratio": 0.5, "learning_rate": 5.412594797716975e-07, "loss": 0.8809802532196045, "loss/kd": 1.42875075340271, "loss/lm": 0.33320969343185425, "step": 4374 }, { "epoch": 0.8981728597823856, "grad_norm": 0.9292738959120392, "kd_ratio": 0.5, "learning_rate": 5.391033297255932e-07, "loss": 0.940462052822113, "loss/kd": 1.571534514427185, "loss/lm": 0.309389591217041, "step": 4375 }, { "epoch": 0.8983781564360501, "grad_norm": 0.910410810004404, "kd_ratio": 0.5, "learning_rate": 5.369513638777147e-07, "loss": 1.045981764793396, "loss/kd": 1.7983227968215942, "loss/lm": 0.2936406433582306, "step": 4376 }, { "epoch": 0.8985834530897147, "grad_norm": 0.9754439558771207, "kd_ratio": 0.5, "learning_rate": 5.348035831797904e-07, "loss": 0.9631845355033875, "loss/kd": 1.5997132062911987, "loss/lm": 0.3266558349132538, "step": 4377 }, { "epoch": 0.8987887497433792, "grad_norm": 1.0370889425842251, "kd_ratio": 0.5, "learning_rate": 5.326599885817074e-07, "loss": 1.1160626411437988, "loss/kd": 1.9500353336334229, "loss/lm": 0.28208982944488525, "step": 4378 }, { "epoch": 0.8989940463970437, "grad_norm": 1.0014952348517323, "kd_ratio": 0.5, "learning_rate": 5.305205810314951e-07, "loss": 1.1330217123031616, "loss/kd": 1.8714537620544434, "loss/lm": 0.3945896029472351, "step": 4379 }, { "epoch": 0.8991993430507083, "grad_norm": 0.8712238032415273, "kd_ratio": 0.5, "learning_rate": 5.283853614753298e-07, "loss": 1.9466947317123413, "loss/kd": 3.6474881172180176, "loss/lm": 0.24590125679969788, "step": 4380 }, { "epoch": 0.8994046397043728, "grad_norm": 0.9436504282371002, "kd_ratio": 0.5, "learning_rate": 5.262543308575451e-07, "loss": 1.5094345808029175, "loss/kd": 2.6658525466918945, "loss/lm": 0.35301655530929565, "step": 4381 }, { "epoch": 0.8996099363580373, "grad_norm": 0.9408403195040373, "kd_ratio": 0.5, "learning_rate": 5.241274901206106e-07, "loss": 0.893557071685791, "loss/kd": 1.4226963520050049, "loss/lm": 0.36441776156425476, "step": 4382 }, { "epoch": 0.8998152330117019, "grad_norm": 0.8081581787241925, "kd_ratio": 0.5, "learning_rate": 5.220048402051503e-07, "loss": 0.8938416242599487, "loss/kd": 1.4516053199768066, "loss/lm": 0.33607786893844604, "step": 4383 }, { "epoch": 0.9000205296653665, "grad_norm": 0.8271524879548463, "kd_ratio": 0.5, "learning_rate": 5.1988638204993e-07, "loss": 1.1018643379211426, "loss/kd": 1.8694127798080444, "loss/lm": 0.3343159258365631, "step": 4384 }, { "epoch": 0.900225826319031, "grad_norm": 1.044493794624341, "kd_ratio": 0.5, "learning_rate": 5.177721165918659e-07, "loss": 0.8788061141967773, "loss/kd": 1.4758204221725464, "loss/lm": 0.2817918360233307, "step": 4385 }, { "epoch": 0.9004311229726956, "grad_norm": 0.8650928886305357, "kd_ratio": 0.5, "learning_rate": 5.156620447660165e-07, "loss": 0.9593719840049744, "loss/kd": 1.648829460144043, "loss/lm": 0.26991450786590576, "step": 4386 }, { "epoch": 0.9006364196263601, "grad_norm": 0.9568492733584096, "kd_ratio": 0.5, "learning_rate": 5.135561675055889e-07, "loss": 1.3073718547821045, "loss/kd": 2.2759318351745605, "loss/lm": 0.3388119637966156, "step": 4387 }, { "epoch": 0.9008417162800246, "grad_norm": 0.7959701716290104, "kd_ratio": 0.5, "learning_rate": 5.114544857419335e-07, "loss": 0.9513496160507202, "loss/kd": 1.622267723083496, "loss/lm": 0.28043147921562195, "step": 4388 }, { "epoch": 0.9010470129336892, "grad_norm": 1.0271341149682236, "kd_ratio": 0.5, "learning_rate": 5.093570004045412e-07, "loss": 0.9727975130081177, "loss/kd": 1.6261720657348633, "loss/lm": 0.3194229304790497, "step": 4389 }, { "epoch": 0.9012523095873537, "grad_norm": 0.8595349457982323, "kd_ratio": 0.5, "learning_rate": 5.072637124210544e-07, "loss": 0.7946915626525879, "loss/kd": 1.3014377355575562, "loss/lm": 0.28794533014297485, "step": 4390 }, { "epoch": 0.9014576062410182, "grad_norm": 1.0070277199059035, "kd_ratio": 0.5, "learning_rate": 5.051746227172538e-07, "loss": 0.9963011741638184, "loss/kd": 1.6946089267730713, "loss/lm": 0.2979934811592102, "step": 4391 }, { "epoch": 0.9016629028946829, "grad_norm": 0.8588294525301198, "kd_ratio": 0.5, "learning_rate": 5.03089732217068e-07, "loss": 1.0232517719268799, "loss/kd": 1.7191483974456787, "loss/lm": 0.32735511660575867, "step": 4392 }, { "epoch": 0.9018681995483474, "grad_norm": 0.9710222543296608, "kd_ratio": 0.5, "learning_rate": 5.010090418425617e-07, "loss": 0.8820693492889404, "loss/kd": 1.4285472631454468, "loss/lm": 0.3355914354324341, "step": 4393 }, { "epoch": 0.9020734962020119, "grad_norm": 0.8178580450557291, "kd_ratio": 0.5, "learning_rate": 4.989325525139466e-07, "loss": 0.9680095911026001, "loss/kd": 1.6247518062591553, "loss/lm": 0.3112673759460449, "step": 4394 }, { "epoch": 0.9022787928556765, "grad_norm": 0.9528618758700166, "kd_ratio": 0.5, "learning_rate": 4.96860265149578e-07, "loss": 1.3644468784332275, "loss/kd": 2.2730000019073486, "loss/lm": 0.45589369535446167, "step": 4395 }, { "epoch": 0.902484089509341, "grad_norm": 0.8327950154867537, "kd_ratio": 0.5, "learning_rate": 4.947921806659495e-07, "loss": 1.2870389223098755, "loss/kd": 2.28124737739563, "loss/lm": 0.2928304672241211, "step": 4396 }, { "epoch": 0.9026893861630055, "grad_norm": 0.7901450865329273, "kd_ratio": 0.5, "learning_rate": 4.92728299977695e-07, "loss": 0.9925291538238525, "loss/kd": 1.629788875579834, "loss/lm": 0.3552693724632263, "step": 4397 }, { "epoch": 0.9028946828166701, "grad_norm": 0.814317533807458, "kd_ratio": 0.5, "learning_rate": 4.906686239975933e-07, "loss": 0.9630909562110901, "loss/kd": 1.6542558670043945, "loss/lm": 0.27192604541778564, "step": 4398 }, { "epoch": 0.9030999794703346, "grad_norm": 0.9644831807948118, "kd_ratio": 0.5, "learning_rate": 4.886131536365623e-07, "loss": 1.0082406997680664, "loss/kd": 1.6467865705490112, "loss/lm": 0.36969491839408875, "step": 4399 }, { "epoch": 0.9033052761239991, "grad_norm": 0.9461016974286798, "kd_ratio": 0.5, "learning_rate": 4.865618898036561e-07, "loss": 1.1571680307388306, "loss/kd": 1.9399131536483765, "loss/lm": 0.37442290782928467, "step": 4400 }, { "epoch": 0.9035105727776638, "grad_norm": 0.8071812522153932, "kd_ratio": 0.5, "learning_rate": 4.845148334060734e-07, "loss": 0.8964888453483582, "loss/kd": 1.458545207977295, "loss/lm": 0.334432452917099, "step": 4401 }, { "epoch": 0.9037158694313283, "grad_norm": 0.9118043374459098, "kd_ratio": 0.5, "learning_rate": 4.8247198534915e-07, "loss": 1.0217242240905762, "loss/kd": 1.7462098598480225, "loss/lm": 0.29723846912384033, "step": 4402 }, { "epoch": 0.9039211660849928, "grad_norm": 0.9868670192115949, "kd_ratio": 0.5, "learning_rate": 4.804333465363609e-07, "loss": 1.00881826877594, "loss/kd": 1.6097025871276855, "loss/lm": 0.4079340398311615, "step": 4403 }, { "epoch": 0.9041264627386574, "grad_norm": 0.9328086878894666, "kd_ratio": 0.5, "learning_rate": 4.783989178693182e-07, "loss": 0.9923574924468994, "loss/kd": 1.640974998474121, "loss/lm": 0.34373995661735535, "step": 4404 }, { "epoch": 0.9043317593923219, "grad_norm": 0.8829002814295146, "kd_ratio": 0.5, "learning_rate": 4.7636870024777457e-07, "loss": 1.010085105895996, "loss/kd": 1.6911348104476929, "loss/lm": 0.3290354013442993, "step": 4405 }, { "epoch": 0.9045370560459864, "grad_norm": 0.8953632408213728, "kd_ratio": 0.5, "learning_rate": 4.7434269456961725e-07, "loss": 0.941391110420227, "loss/kd": 1.5489314794540405, "loss/lm": 0.33385077118873596, "step": 4406 }, { "epoch": 0.904742352699651, "grad_norm": 0.9408594971752635, "kd_ratio": 0.5, "learning_rate": 4.723209017308727e-07, "loss": 1.248917579650879, "loss/kd": 2.165067434310913, "loss/lm": 0.3327677845954895, "step": 4407 }, { "epoch": 0.9049476493533155, "grad_norm": 0.8094043496471728, "kd_ratio": 0.5, "learning_rate": 4.703033226257048e-07, "loss": 1.9213483333587646, "loss/kd": 3.625295400619507, "loss/lm": 0.21740137040615082, "step": 4408 }, { "epoch": 0.90515294600698, "grad_norm": 0.8036095649877021, "kd_ratio": 0.5, "learning_rate": 4.6828995814641174e-07, "loss": 1.083103060722351, "loss/kd": 1.8336623907089233, "loss/lm": 0.33254364132881165, "step": 4409 }, { "epoch": 0.9053582426606447, "grad_norm": 0.8227417434195046, "kd_ratio": 0.5, "learning_rate": 4.662808091834292e-07, "loss": 0.8612561821937561, "loss/kd": 1.4430323839187622, "loss/lm": 0.2794800102710724, "step": 4410 }, { "epoch": 0.9055635393143092, "grad_norm": 0.8106145417532784, "kd_ratio": 0.5, "learning_rate": 4.6427587662532636e-07, "loss": 1.1493573188781738, "loss/kd": 1.9545814990997314, "loss/lm": 0.34413301944732666, "step": 4411 }, { "epoch": 0.9057688359679738, "grad_norm": 0.8983097955373183, "kd_ratio": 0.5, "learning_rate": 4.622751613588128e-07, "loss": 0.9410111904144287, "loss/kd": 1.6122736930847168, "loss/lm": 0.269748717546463, "step": 4412 }, { "epoch": 0.9059741326216383, "grad_norm": 1.273551105601848, "kd_ratio": 0.5, "learning_rate": 4.6027866426872625e-07, "loss": 1.1212263107299805, "loss/kd": 1.8789029121398926, "loss/lm": 0.3635497987270355, "step": 4413 }, { "epoch": 0.9061794292753028, "grad_norm": 0.8632136975989161, "kd_ratio": 0.5, "learning_rate": 4.5828638623804376e-07, "loss": 1.0380429029464722, "loss/kd": 1.7240849733352661, "loss/lm": 0.352000892162323, "step": 4414 }, { "epoch": 0.9063847259289673, "grad_norm": 0.8531273233108203, "kd_ratio": 0.5, "learning_rate": 4.562983281478761e-07, "loss": 1.3427627086639404, "loss/kd": 2.2904884815216064, "loss/lm": 0.3950369656085968, "step": 4415 }, { "epoch": 0.9065900225826319, "grad_norm": 0.8853547949773375, "kd_ratio": 0.5, "learning_rate": 4.5431449087746216e-07, "loss": 0.8676525354385376, "loss/kd": 1.3958752155303955, "loss/lm": 0.3394298255443573, "step": 4416 }, { "epoch": 0.9067953192362964, "grad_norm": 0.8390532658786076, "kd_ratio": 0.5, "learning_rate": 4.5233487530418343e-07, "loss": 0.8824364542961121, "loss/kd": 1.4327161312103271, "loss/lm": 0.332156777381897, "step": 4417 }, { "epoch": 0.907000615889961, "grad_norm": 0.871781771770501, "kd_ratio": 0.5, "learning_rate": 4.503594823035462e-07, "loss": 0.9019199013710022, "loss/kd": 1.462804913520813, "loss/lm": 0.3410348892211914, "step": 4418 }, { "epoch": 0.9072059125436256, "grad_norm": 0.8454983297408784, "kd_ratio": 0.5, "learning_rate": 4.4838831274919505e-07, "loss": 1.1600264310836792, "loss/kd": 1.941843032836914, "loss/lm": 0.37820982933044434, "step": 4419 }, { "epoch": 0.9074112091972901, "grad_norm": 0.8754602945176084, "kd_ratio": 0.5, "learning_rate": 4.4642136751290035e-07, "loss": 1.0232421159744263, "loss/kd": 1.6914949417114258, "loss/lm": 0.35498929023742676, "step": 4420 }, { "epoch": 0.9076165058509547, "grad_norm": 0.9904291991797921, "kd_ratio": 0.5, "learning_rate": 4.444586474645707e-07, "loss": 1.092725157737732, "loss/kd": 1.8621469736099243, "loss/lm": 0.32330337166786194, "step": 4421 }, { "epoch": 0.9078218025046192, "grad_norm": 0.8139197558841172, "kd_ratio": 0.5, "learning_rate": 4.4250015347224175e-07, "loss": 0.8635267615318298, "loss/kd": 1.4111042022705078, "loss/lm": 0.31594932079315186, "step": 4422 }, { "epoch": 0.9080270991582837, "grad_norm": 0.7924043384076981, "kd_ratio": 0.5, "learning_rate": 4.4054588640208285e-07, "loss": 1.1350444555282593, "loss/kd": 1.8649450540542603, "loss/lm": 0.4051438271999359, "step": 4423 }, { "epoch": 0.9082323958119483, "grad_norm": 0.9807138173129057, "kd_ratio": 0.5, "learning_rate": 4.3859584711839265e-07, "loss": 0.8717827796936035, "loss/kd": 1.4480977058410645, "loss/lm": 0.29546791315078735, "step": 4424 }, { "epoch": 0.9084376924656128, "grad_norm": 0.8811208478105865, "kd_ratio": 0.5, "learning_rate": 4.3665003648359907e-07, "loss": 0.9089971780776978, "loss/kd": 1.4522192478179932, "loss/lm": 0.36577513813972473, "step": 4425 }, { "epoch": 0.9086429891192773, "grad_norm": 0.8676479867201405, "kd_ratio": 0.5, "learning_rate": 4.3470845535826255e-07, "loss": 1.0739848613739014, "loss/kd": 1.8058571815490723, "loss/lm": 0.34211266040802, "step": 4426 }, { "epoch": 0.9088482857729419, "grad_norm": 0.8793174956709621, "kd_ratio": 0.5, "learning_rate": 4.327711046010663e-07, "loss": 1.1206175088882446, "loss/kd": 1.8461151123046875, "loss/lm": 0.3951198160648346, "step": 4427 }, { "epoch": 0.9090535824266065, "grad_norm": 0.8784824486588168, "kd_ratio": 0.5, "learning_rate": 4.308379850688349e-07, "loss": 1.1265510320663452, "loss/kd": 1.903049111366272, "loss/lm": 0.3500530421733856, "step": 4428 }, { "epoch": 0.909258879080271, "grad_norm": 0.9784130940527284, "kd_ratio": 0.5, "learning_rate": 4.2890909761651e-07, "loss": 1.8412421941757202, "loss/kd": 3.462392807006836, "loss/lm": 0.22009152173995972, "step": 4429 }, { "epoch": 0.9094641757339356, "grad_norm": 0.8987539065435546, "kd_ratio": 0.5, "learning_rate": 4.26984443097167e-07, "loss": 1.0263053178787231, "loss/kd": 1.6930181980133057, "loss/lm": 0.3595925271511078, "step": 4430 }, { "epoch": 0.9096694723876001, "grad_norm": 0.9845451299610816, "kd_ratio": 0.5, "learning_rate": 4.2506402236200616e-07, "loss": 1.0298129320144653, "loss/kd": 1.6704825162887573, "loss/lm": 0.38914334774017334, "step": 4431 }, { "epoch": 0.9098747690412646, "grad_norm": 0.8332322408015699, "kd_ratio": 0.5, "learning_rate": 4.2314783626036026e-07, "loss": 1.0923612117767334, "loss/kd": 1.8620489835739136, "loss/lm": 0.32267338037490845, "step": 4432 }, { "epoch": 0.9100800656949292, "grad_norm": 0.8298480936962409, "kd_ratio": 0.5, "learning_rate": 4.2123588563968366e-07, "loss": 0.8106838464736938, "loss/kd": 1.3188408613204956, "loss/lm": 0.3025267720222473, "step": 4433 }, { "epoch": 0.9102853623485937, "grad_norm": 0.8398858086828653, "kd_ratio": 0.5, "learning_rate": 4.1932817134556103e-07, "loss": 1.0812430381774902, "loss/kd": 1.9029109477996826, "loss/lm": 0.2595751881599426, "step": 4434 }, { "epoch": 0.9104906590022582, "grad_norm": 0.9786499223146696, "kd_ratio": 0.5, "learning_rate": 4.1742469422170417e-07, "loss": 1.1534514427185059, "loss/kd": 2.033978223800659, "loss/lm": 0.27292463183403015, "step": 4435 }, { "epoch": 0.9106959556559228, "grad_norm": 0.8761316089404995, "kd_ratio": 0.5, "learning_rate": 4.1552545510994746e-07, "loss": 1.3681833744049072, "loss/kd": 2.355475425720215, "loss/lm": 0.380891352891922, "step": 4436 }, { "epoch": 0.9109012523095874, "grad_norm": 0.8483673349417865, "kd_ratio": 0.5, "learning_rate": 4.1363045485025235e-07, "loss": 1.0017982721328735, "loss/kd": 1.6788063049316406, "loss/lm": 0.3247901499271393, "step": 4437 }, { "epoch": 0.9111065489632519, "grad_norm": 0.8347247041047151, "kd_ratio": 0.5, "learning_rate": 4.1173969428070726e-07, "loss": 1.137861728668213, "loss/kd": 1.955670952796936, "loss/lm": 0.32005250453948975, "step": 4438 }, { "epoch": 0.9113118456169165, "grad_norm": 0.8380015904933201, "kd_ratio": 0.5, "learning_rate": 4.0985317423752557e-07, "loss": 0.8996337056159973, "loss/kd": 1.5088691711425781, "loss/lm": 0.2903982102870941, "step": 4439 }, { "epoch": 0.911517142270581, "grad_norm": 0.8136090495957774, "kd_ratio": 0.5, "learning_rate": 4.07970895555041e-07, "loss": 1.1064504384994507, "loss/kd": 1.803670883178711, "loss/lm": 0.4092300832271576, "step": 4440 }, { "epoch": 0.9117224389242455, "grad_norm": 0.9896284860422725, "kd_ratio": 0.5, "learning_rate": 4.0609285906571536e-07, "loss": 1.3061859607696533, "loss/kd": 2.2305684089660645, "loss/lm": 0.38180357217788696, "step": 4441 }, { "epoch": 0.9119277355779101, "grad_norm": 0.8110217989122659, "kd_ratio": 0.5, "learning_rate": 4.0421906560013433e-07, "loss": 1.096858263015747, "loss/kd": 1.8704578876495361, "loss/lm": 0.32325872778892517, "step": 4442 }, { "epoch": 0.9121330322315746, "grad_norm": 0.9650610524199799, "kd_ratio": 0.5, "learning_rate": 4.0234951598700725e-07, "loss": 0.9963582754135132, "loss/kd": 1.6932462453842163, "loss/lm": 0.29947027564048767, "step": 4443 }, { "epoch": 0.9123383288852391, "grad_norm": 0.9869320353310362, "kd_ratio": 0.5, "learning_rate": 4.0048421105316373e-07, "loss": 1.060287356376648, "loss/kd": 1.7410120964050293, "loss/lm": 0.3795626759529114, "step": 4444 }, { "epoch": 0.9125436255389037, "grad_norm": 0.9365928820231475, "kd_ratio": 0.5, "learning_rate": 3.9862315162355834e-07, "loss": 0.9538525342941284, "loss/kd": 1.6067293882369995, "loss/lm": 0.3009757101535797, "step": 4445 }, { "epoch": 0.9127489221925683, "grad_norm": 0.8932879087368243, "kd_ratio": 0.5, "learning_rate": 3.9676633852126834e-07, "loss": 0.885832667350769, "loss/kd": 1.3530445098876953, "loss/lm": 0.41862088441848755, "step": 4446 }, { "epoch": 0.9129542188462328, "grad_norm": 0.8476289238405812, "kd_ratio": 0.5, "learning_rate": 3.94913772567489e-07, "loss": 0.8668577671051025, "loss/kd": 1.433011531829834, "loss/lm": 0.3007039725780487, "step": 4447 }, { "epoch": 0.9131595154998974, "grad_norm": 0.8468269369773619, "kd_ratio": 0.5, "learning_rate": 3.9306545458154485e-07, "loss": 1.0541027784347534, "loss/kd": 1.7814381122589111, "loss/lm": 0.3267675042152405, "step": 4448 }, { "epoch": 0.9133648121535619, "grad_norm": 0.9460008623739118, "kd_ratio": 0.5, "learning_rate": 3.912213853808755e-07, "loss": 0.9861681461334229, "loss/kd": 1.5909366607666016, "loss/lm": 0.38139966130256653, "step": 4449 }, { "epoch": 0.9135701088072264, "grad_norm": 0.8359803900211978, "kd_ratio": 0.5, "learning_rate": 3.893815657810418e-07, "loss": 1.1161967515945435, "loss/kd": 1.9463732242584229, "loss/lm": 0.28602027893066406, "step": 4450 }, { "epoch": 0.913775405460891, "grad_norm": 0.9715010941979032, "kd_ratio": 0.5, "learning_rate": 3.875459965957307e-07, "loss": 0.9353426694869995, "loss/kd": 1.5720467567443848, "loss/lm": 0.29863861203193665, "step": 4451 }, { "epoch": 0.9139807021145555, "grad_norm": 0.8943794078782605, "kd_ratio": 0.5, "learning_rate": 3.857146786367405e-07, "loss": 0.7963340878486633, "loss/kd": 1.2877475023269653, "loss/lm": 0.3049207031726837, "step": 4452 }, { "epoch": 0.91418599876822, "grad_norm": 0.8838558072362179, "kd_ratio": 0.5, "learning_rate": 3.838876127139957e-07, "loss": 1.0145717859268188, "loss/kd": 1.6913665533065796, "loss/lm": 0.3377769887447357, "step": 4453 }, { "epoch": 0.9143912954218846, "grad_norm": 1.0235310684837136, "kd_ratio": 0.5, "learning_rate": 3.820647996355398e-07, "loss": 1.0809459686279297, "loss/kd": 1.8629423379898071, "loss/lm": 0.29894953966140747, "step": 4454 }, { "epoch": 0.9145965920755492, "grad_norm": 0.8259087847071822, "kd_ratio": 0.5, "learning_rate": 3.802462402075358e-07, "loss": 1.9120820760726929, "loss/kd": 3.599719762802124, "loss/lm": 0.2244444489479065, "step": 4455 }, { "epoch": 0.9148018887292138, "grad_norm": 0.9249741370011104, "kd_ratio": 0.5, "learning_rate": 3.7843193523426026e-07, "loss": 0.9758378267288208, "loss/kd": 1.6300513744354248, "loss/lm": 0.3216243386268616, "step": 4456 }, { "epoch": 0.9150071853828783, "grad_norm": 0.8312364722733742, "kd_ratio": 0.5, "learning_rate": 3.7662188551811476e-07, "loss": 1.0928767919540405, "loss/kd": 1.8173072338104248, "loss/lm": 0.368446409702301, "step": 4457 }, { "epoch": 0.9152124820365428, "grad_norm": 0.8198051482962656, "kd_ratio": 0.5, "learning_rate": 3.748160918596133e-07, "loss": 1.0565619468688965, "loss/kd": 1.772181749343872, "loss/lm": 0.3409421145915985, "step": 4458 }, { "epoch": 0.9154177786902074, "grad_norm": 0.8030603943497973, "kd_ratio": 0.5, "learning_rate": 3.7301455505739494e-07, "loss": 1.051547646522522, "loss/kd": 1.791595458984375, "loss/lm": 0.3114998936653137, "step": 4459 }, { "epoch": 0.9156230753438719, "grad_norm": 0.8925022828983761, "kd_ratio": 0.5, "learning_rate": 3.7121727590820665e-07, "loss": 1.0830010175704956, "loss/kd": 1.8283240795135498, "loss/lm": 0.33767804503440857, "step": 4460 }, { "epoch": 0.9158283719975364, "grad_norm": 0.8363195561348093, "kd_ratio": 0.5, "learning_rate": 3.6942425520692047e-07, "loss": 1.1392695903778076, "loss/kd": 1.9439822435379028, "loss/lm": 0.3345569670200348, "step": 4461 }, { "epoch": 0.916033668651201, "grad_norm": 0.9239730848155893, "kd_ratio": 0.5, "learning_rate": 3.67635493746521e-07, "loss": 1.1891109943389893, "loss/kd": 2.0182602405548096, "loss/lm": 0.3599618375301361, "step": 4462 }, { "epoch": 0.9162389653048655, "grad_norm": 0.8473623370237671, "kd_ratio": 0.5, "learning_rate": 3.6585099231810863e-07, "loss": 1.246805191040039, "loss/kd": 2.1555380821228027, "loss/lm": 0.338072270154953, "step": 4463 }, { "epoch": 0.9164442619585301, "grad_norm": 0.9108440415705029, "kd_ratio": 0.5, "learning_rate": 3.6407075171090435e-07, "loss": 0.9355383515357971, "loss/kd": 1.5790594816207886, "loss/lm": 0.29201725125312805, "step": 4464 }, { "epoch": 0.9166495586121947, "grad_norm": 0.858331524840362, "kd_ratio": 0.5, "learning_rate": 3.6229477271223834e-07, "loss": 1.0384929180145264, "loss/kd": 1.8509386777877808, "loss/lm": 0.226047083735466, "step": 4465 }, { "epoch": 0.9168548552658592, "grad_norm": 0.8672696079837245, "kd_ratio": 0.5, "learning_rate": 3.6052305610756235e-07, "loss": 0.9607564210891724, "loss/kd": 1.5962448120117188, "loss/lm": 0.3252680003643036, "step": 4466 }, { "epoch": 0.9170601519195237, "grad_norm": 0.8827621319676636, "kd_ratio": 0.5, "learning_rate": 3.587556026804362e-07, "loss": 0.913858950138092, "loss/kd": 1.4594988822937012, "loss/lm": 0.3682190179824829, "step": 4467 }, { "epoch": 0.9172654485731883, "grad_norm": 1.0176498517253418, "kd_ratio": 0.5, "learning_rate": 3.569924132125402e-07, "loss": 0.8512469530105591, "loss/kd": 1.4174158573150635, "loss/lm": 0.2850780189037323, "step": 4468 }, { "epoch": 0.9174707452268528, "grad_norm": 0.8125656303962571, "kd_ratio": 0.5, "learning_rate": 3.552334884836661e-07, "loss": 1.0830752849578857, "loss/kd": 1.8424395322799683, "loss/lm": 0.32371091842651367, "step": 4469 }, { "epoch": 0.9176760418805173, "grad_norm": 0.890875486519807, "kd_ratio": 0.5, "learning_rate": 3.534788292717206e-07, "loss": 1.085325002670288, "loss/kd": 1.7917993068695068, "loss/lm": 0.3788508176803589, "step": 4470 }, { "epoch": 0.9178813385341819, "grad_norm": 0.9377989884775724, "kd_ratio": 0.5, "learning_rate": 3.5172843635272403e-07, "loss": 0.9295488595962524, "loss/kd": 1.5773245096206665, "loss/lm": 0.2817732095718384, "step": 4471 }, { "epoch": 0.9180866351878464, "grad_norm": 0.8750750797014072, "kd_ratio": 0.5, "learning_rate": 3.499823105008071e-07, "loss": 1.1423224210739136, "loss/kd": 1.8937757015228271, "loss/lm": 0.39086923003196716, "step": 4472 }, { "epoch": 0.918291931841511, "grad_norm": 0.8234057342662516, "kd_ratio": 0.5, "learning_rate": 3.482404524882155e-07, "loss": 0.9823048114776611, "loss/kd": 1.631657600402832, "loss/lm": 0.3329520523548126, "step": 4473 }, { "epoch": 0.9184972284951756, "grad_norm": 0.8682639848101985, "kd_ratio": 0.5, "learning_rate": 3.4650286308530955e-07, "loss": 1.023957371711731, "loss/kd": 1.7283145189285278, "loss/lm": 0.3196001946926117, "step": 4474 }, { "epoch": 0.9187025251488401, "grad_norm": 0.8368748069326349, "kd_ratio": 0.5, "learning_rate": 3.4476954306056023e-07, "loss": 1.0118768215179443, "loss/kd": 1.7276310920715332, "loss/lm": 0.2961224913597107, "step": 4475 }, { "epoch": 0.9189078218025046, "grad_norm": 0.941980739336393, "kd_ratio": 0.5, "learning_rate": 3.430404931805464e-07, "loss": 1.2248674631118774, "loss/kd": 2.132620096206665, "loss/lm": 0.31711477041244507, "step": 4476 }, { "epoch": 0.9191131184561692, "grad_norm": 0.8780057807947078, "kd_ratio": 0.5, "learning_rate": 3.413157142099632e-07, "loss": 0.8530879616737366, "loss/kd": 1.4118636846542358, "loss/lm": 0.2943122088909149, "step": 4477 }, { "epoch": 0.9193184151098337, "grad_norm": 0.8093786741755626, "kd_ratio": 0.5, "learning_rate": 3.3959520691161595e-07, "loss": 1.2573744058609009, "loss/kd": 2.089992046356201, "loss/lm": 0.4247567355632782, "step": 4478 }, { "epoch": 0.9195237117634982, "grad_norm": 0.82465242835862, "kd_ratio": 0.5, "learning_rate": 3.378789720464193e-07, "loss": 1.2661014795303345, "loss/kd": 2.187185287475586, "loss/lm": 0.34501758217811584, "step": 4479 }, { "epoch": 0.9197290084171628, "grad_norm": 0.8279837663289756, "kd_ratio": 0.5, "learning_rate": 3.361670103734005e-07, "loss": 1.2801337242126465, "loss/kd": 2.23586106300354, "loss/lm": 0.3244062662124634, "step": 4480 }, { "epoch": 0.9199343050708273, "grad_norm": 0.8803126052505742, "kd_ratio": 0.5, "learning_rate": 3.3445932264969504e-07, "loss": 0.9888229966163635, "loss/kd": 1.6306180953979492, "loss/lm": 0.34702786803245544, "step": 4481 }, { "epoch": 0.9201396017244919, "grad_norm": 0.8671221173394196, "kd_ratio": 0.5, "learning_rate": 3.327559096305488e-07, "loss": 0.866639256477356, "loss/kd": 1.4437439441680908, "loss/lm": 0.28953462839126587, "step": 4482 }, { "epoch": 0.9203448983781565, "grad_norm": 0.8578916183935256, "kd_ratio": 0.5, "learning_rate": 3.31056772069317e-07, "loss": 2.0051591396331787, "loss/kd": 3.7656612396240234, "loss/lm": 0.24465718865394592, "step": 4483 }, { "epoch": 0.920550195031821, "grad_norm": 1.071327904734252, "kd_ratio": 0.5, "learning_rate": 3.2936191071746393e-07, "loss": 1.343562364578247, "loss/kd": 2.36264705657959, "loss/lm": 0.32447776198387146, "step": 4484 }, { "epoch": 0.9207554916854855, "grad_norm": 0.8262881342836312, "kd_ratio": 0.5, "learning_rate": 3.2767132632456345e-07, "loss": 1.0888105630874634, "loss/kd": 1.849838137626648, "loss/lm": 0.32778292894363403, "step": 4485 }, { "epoch": 0.9209607883391501, "grad_norm": 0.9360464641851306, "kd_ratio": 0.5, "learning_rate": 3.259850196382985e-07, "loss": 0.940017580986023, "loss/kd": 1.595471739768982, "loss/lm": 0.28456345200538635, "step": 4486 }, { "epoch": 0.9211660849928146, "grad_norm": 0.8501148911092136, "kd_ratio": 0.5, "learning_rate": 3.2430299140445597e-07, "loss": 0.9505870342254639, "loss/kd": 1.611824631690979, "loss/lm": 0.28934943675994873, "step": 4487 }, { "epoch": 0.9213713816464791, "grad_norm": 0.8108094913785012, "kd_ratio": 0.5, "learning_rate": 3.226252423669363e-07, "loss": 0.9915019869804382, "loss/kd": 1.6875792741775513, "loss/lm": 0.2954246997833252, "step": 4488 }, { "epoch": 0.9215766783001437, "grad_norm": 0.9463278313351907, "kd_ratio": 0.5, "learning_rate": 3.209517732677436e-07, "loss": 0.9591171145439148, "loss/kd": 1.601360559463501, "loss/lm": 0.316873699426651, "step": 4489 }, { "epoch": 0.9217819749538082, "grad_norm": 0.7964751204910147, "kd_ratio": 0.5, "learning_rate": 3.192825848469905e-07, "loss": 1.066490650177002, "loss/kd": 1.817173719406128, "loss/lm": 0.3158074915409088, "step": 4490 }, { "epoch": 0.9219872716074728, "grad_norm": 0.7947319256064507, "kd_ratio": 0.5, "learning_rate": 3.176176778428974e-07, "loss": 0.9150733947753906, "loss/kd": 1.4887714385986328, "loss/lm": 0.3413753807544708, "step": 4491 }, { "epoch": 0.9221925682611374, "grad_norm": 0.8287112315867584, "kd_ratio": 0.5, "learning_rate": 3.159570529917877e-07, "loss": 1.2221171855926514, "loss/kd": 2.0497384071350098, "loss/lm": 0.39449605345726013, "step": 4492 }, { "epoch": 0.9223978649148019, "grad_norm": 0.8028235205176502, "kd_ratio": 0.5, "learning_rate": 3.143007110280949e-07, "loss": 0.9229716658592224, "loss/kd": 1.5061801671981812, "loss/lm": 0.33976319432258606, "step": 4493 }, { "epoch": 0.9226031615684664, "grad_norm": 0.9958888797261561, "kd_ratio": 0.5, "learning_rate": 3.126486526843542e-07, "loss": 1.0433810949325562, "loss/kd": 1.834787368774414, "loss/lm": 0.25197482109069824, "step": 4494 }, { "epoch": 0.922808458222131, "grad_norm": 0.8998903569600677, "kd_ratio": 0.5, "learning_rate": 3.110008786912122e-07, "loss": 1.121860146522522, "loss/kd": 1.8829213380813599, "loss/lm": 0.3607989549636841, "step": 4495 }, { "epoch": 0.9230137548757955, "grad_norm": 0.9370473055878372, "kd_ratio": 0.5, "learning_rate": 3.093573897774149e-07, "loss": 0.9200000762939453, "loss/kd": 1.5112838745117188, "loss/lm": 0.32871633768081665, "step": 4496 }, { "epoch": 0.92321905152946, "grad_norm": 0.9734449414429059, "kd_ratio": 0.5, "learning_rate": 3.0771818666981533e-07, "loss": 1.079492211341858, "loss/kd": 1.8071075677871704, "loss/lm": 0.35187679529190063, "step": 4497 }, { "epoch": 0.9234243481831246, "grad_norm": 0.8205229676844512, "kd_ratio": 0.5, "learning_rate": 3.060832700933736e-07, "loss": 1.1188938617706299, "loss/kd": 1.8811792135238647, "loss/lm": 0.35660839080810547, "step": 4498 }, { "epoch": 0.9236296448367891, "grad_norm": 0.8288246269908248, "kd_ratio": 0.5, "learning_rate": 3.044526407711501e-07, "loss": 1.0457085371017456, "loss/kd": 1.6948221921920776, "loss/lm": 0.39659491181373596, "step": 4499 }, { "epoch": 0.9238349414904538, "grad_norm": 0.8349085451495333, "kd_ratio": 0.5, "learning_rate": 3.028262994243103e-07, "loss": 1.0881280899047852, "loss/kd": 1.8678560256958008, "loss/lm": 0.30840015411376953, "step": 4500 }, { "epoch": 0.9240402381441183, "grad_norm": 0.96448238771576, "kd_ratio": 0.5, "learning_rate": 3.0120424677212434e-07, "loss": 1.1846356391906738, "loss/kd": 1.9878557920455933, "loss/lm": 0.381415456533432, "step": 4501 }, { "epoch": 0.9242455347977828, "grad_norm": 0.897382949292895, "kd_ratio": 0.5, "learning_rate": 2.995864835319662e-07, "loss": 0.8921793103218079, "loss/kd": 1.4758695363998413, "loss/lm": 0.3084890842437744, "step": 4502 }, { "epoch": 0.9244508314514474, "grad_norm": 1.1092418560186554, "kd_ratio": 0.5, "learning_rate": 2.97973010419309e-07, "loss": 1.0834423303604126, "loss/kd": 1.7862436771392822, "loss/lm": 0.38064107298851013, "step": 4503 }, { "epoch": 0.9246561281051119, "grad_norm": 0.8853745580211266, "kd_ratio": 0.5, "learning_rate": 2.9636382814773324e-07, "loss": 0.796837568283081, "loss/kd": 1.1997226476669312, "loss/lm": 0.39395248889923096, "step": 4504 }, { "epoch": 0.9248614247587764, "grad_norm": 0.8613614425466887, "kd_ratio": 0.5, "learning_rate": 2.9475893742891836e-07, "loss": 1.3682801723480225, "loss/kd": 2.407430648803711, "loss/lm": 0.3291296660900116, "step": 4505 }, { "epoch": 0.925066721412441, "grad_norm": 0.8891152899111366, "kd_ratio": 0.5, "learning_rate": 2.931583389726478e-07, "loss": 0.8479666709899902, "loss/kd": 1.4039578437805176, "loss/lm": 0.2919755280017853, "step": 4506 }, { "epoch": 0.9252720180661055, "grad_norm": 0.8289967332684788, "kd_ratio": 0.5, "learning_rate": 2.915620334868074e-07, "loss": 1.043942928314209, "loss/kd": 1.7285040616989136, "loss/lm": 0.359381765127182, "step": 4507 }, { "epoch": 0.92547731471977, "grad_norm": 0.9174424043058681, "kd_ratio": 0.5, "learning_rate": 2.899700216773793e-07, "loss": 1.0730150938034058, "loss/kd": 1.8447455167770386, "loss/lm": 0.30128467082977295, "step": 4508 }, { "epoch": 0.9256826113734347, "grad_norm": 0.901542085214729, "kd_ratio": 0.5, "learning_rate": 2.883823042484546e-07, "loss": 0.9652668237686157, "loss/kd": 1.6034321784973145, "loss/lm": 0.3271014094352722, "step": 4509 }, { "epoch": 0.9258879080270992, "grad_norm": 0.9412253662312514, "kd_ratio": 0.5, "learning_rate": 2.8679888190221605e-07, "loss": 1.1491460800170898, "loss/kd": 1.9535658359527588, "loss/lm": 0.34472641348838806, "step": 4510 }, { "epoch": 0.9260932046807637, "grad_norm": 0.8086582650739815, "kd_ratio": 0.5, "learning_rate": 2.852197553389568e-07, "loss": 1.001043677330017, "loss/kd": 1.7182278633117676, "loss/lm": 0.28385940194129944, "step": 4511 }, { "epoch": 0.9262985013344283, "grad_norm": 0.8833595324982535, "kd_ratio": 0.5, "learning_rate": 2.836449252570617e-07, "loss": 0.9949588775634766, "loss/kd": 1.7356458902359009, "loss/lm": 0.25427180528640747, "step": 4512 }, { "epoch": 0.9265037979880928, "grad_norm": 0.8137002076239714, "kd_ratio": 0.5, "learning_rate": 2.8207439235302113e-07, "loss": 0.9572979211807251, "loss/kd": 1.538684606552124, "loss/lm": 0.37591129541397095, "step": 4513 }, { "epoch": 0.9267090946417573, "grad_norm": 0.8332335688443229, "kd_ratio": 0.5, "learning_rate": 2.805081573214197e-07, "loss": 1.0361180305480957, "loss/kd": 1.7476121187210083, "loss/lm": 0.32462388277053833, "step": 4514 }, { "epoch": 0.9269143912954219, "grad_norm": 0.8171087562217222, "kd_ratio": 0.5, "learning_rate": 2.789462208549454e-07, "loss": 1.301283597946167, "loss/kd": 2.3024282455444336, "loss/lm": 0.30013906955718994, "step": 4515 }, { "epoch": 0.9271196879490864, "grad_norm": 0.8985183416940392, "kd_ratio": 0.5, "learning_rate": 2.7738858364438457e-07, "loss": 0.977851152420044, "loss/kd": 1.6315021514892578, "loss/lm": 0.32420018315315247, "step": 4516 }, { "epoch": 0.9273249846027509, "grad_norm": 0.9453493258976317, "kd_ratio": 0.5, "learning_rate": 2.758352463786207e-07, "loss": 0.9429181218147278, "loss/kd": 1.5701167583465576, "loss/lm": 0.31571951508522034, "step": 4517 }, { "epoch": 0.9275302812564156, "grad_norm": 0.9891480295245456, "kd_ratio": 0.5, "learning_rate": 2.742862097446386e-07, "loss": 1.0875389575958252, "loss/kd": 1.902214527130127, "loss/lm": 0.27286338806152344, "step": 4518 }, { "epoch": 0.9277355779100801, "grad_norm": 0.8596955026534672, "kd_ratio": 0.5, "learning_rate": 2.727414744275147e-07, "loss": 1.0538150072097778, "loss/kd": 1.7760391235351562, "loss/lm": 0.3315909504890442, "step": 4519 }, { "epoch": 0.9279408745637446, "grad_norm": 0.8526668155005216, "kd_ratio": 0.5, "learning_rate": 2.7120104111043045e-07, "loss": 1.384526252746582, "loss/kd": 2.477234125137329, "loss/lm": 0.29181838035583496, "step": 4520 }, { "epoch": 0.9281461712174092, "grad_norm": 0.8963941254794336, "kd_ratio": 0.5, "learning_rate": 2.696649104746607e-07, "loss": 1.1440396308898926, "loss/kd": 2.0278167724609375, "loss/lm": 0.26026254892349243, "step": 4521 }, { "epoch": 0.9283514678710737, "grad_norm": 0.8766574377594852, "kd_ratio": 0.5, "learning_rate": 2.6813308319957877e-07, "loss": 0.9673501253128052, "loss/kd": 1.6054798364639282, "loss/lm": 0.3292204737663269, "step": 4522 }, { "epoch": 0.9285567645247382, "grad_norm": 0.7906110273832536, "kd_ratio": 0.5, "learning_rate": 2.666055599626527e-07, "loss": 0.9056060314178467, "loss/kd": 1.4947259426116943, "loss/lm": 0.3164861500263214, "step": 4523 }, { "epoch": 0.9287620611784028, "grad_norm": 0.794629808048382, "kd_ratio": 0.5, "learning_rate": 2.6508234143944875e-07, "loss": 1.0958043336868286, "loss/kd": 1.9357948303222656, "loss/lm": 0.2558138966560364, "step": 4524 }, { "epoch": 0.9289673578320673, "grad_norm": 0.8794719873716722, "kd_ratio": 0.5, "learning_rate": 2.6356342830363035e-07, "loss": 0.8607664704322815, "loss/kd": 1.3897509574890137, "loss/lm": 0.3317819833755493, "step": 4525 }, { "epoch": 0.9291726544857318, "grad_norm": 0.7512353473643023, "kd_ratio": 0.5, "learning_rate": 2.6204882122695343e-07, "loss": 0.953652024269104, "loss/kd": 1.5569854974746704, "loss/lm": 0.3503184914588928, "step": 4526 }, { "epoch": 0.9293779511393965, "grad_norm": 1.3327465430898884, "kd_ratio": 0.5, "learning_rate": 2.6053852087927436e-07, "loss": 1.1016809940338135, "loss/kd": 1.8770767450332642, "loss/lm": 0.32628530263900757, "step": 4527 }, { "epoch": 0.929583247793061, "grad_norm": 0.9810426999385705, "kd_ratio": 0.5, "learning_rate": 2.5903252792853993e-07, "loss": 1.249832272529602, "loss/kd": 2.203662157058716, "loss/lm": 0.2960023880004883, "step": 4528 }, { "epoch": 0.9297885444467255, "grad_norm": 0.8700682010409658, "kd_ratio": 0.5, "learning_rate": 2.5753084304079633e-07, "loss": 1.0458868741989136, "loss/kd": 1.718295693397522, "loss/lm": 0.3734780251979828, "step": 4529 }, { "epoch": 0.9299938411003901, "grad_norm": 0.8218582150352768, "kd_ratio": 0.5, "learning_rate": 2.5603346688018007e-07, "loss": 0.9368101358413696, "loss/kd": 1.584082841873169, "loss/lm": 0.2895374000072479, "step": 4530 }, { "epoch": 0.9301991377540546, "grad_norm": 0.9383936716233338, "kd_ratio": 0.5, "learning_rate": 2.5454040010892354e-07, "loss": 1.0729286670684814, "loss/kd": 1.8723704814910889, "loss/lm": 0.27348676323890686, "step": 4531 }, { "epoch": 0.9304044344077191, "grad_norm": 1.0525867969326026, "kd_ratio": 0.5, "learning_rate": 2.5305164338735646e-07, "loss": 0.9290511608123779, "loss/kd": 1.5677604675292969, "loss/lm": 0.2903417944908142, "step": 4532 }, { "epoch": 0.9306097310613837, "grad_norm": 0.950572156256868, "kd_ratio": 0.5, "learning_rate": 2.5156719737389757e-07, "loss": 1.046358585357666, "loss/kd": 1.7422806024551392, "loss/lm": 0.3504365384578705, "step": 4533 }, { "epoch": 0.9308150277150482, "grad_norm": 0.9192246661012977, "kd_ratio": 0.5, "learning_rate": 2.500870627250651e-07, "loss": 0.9744871854782104, "loss/kd": 1.6265108585357666, "loss/lm": 0.3224635720252991, "step": 4534 }, { "epoch": 0.9310203243687127, "grad_norm": 1.2814596975395802, "kd_ratio": 0.5, "learning_rate": 2.486112400954621e-07, "loss": 1.437691330909729, "loss/kd": 2.5721213817596436, "loss/lm": 0.3032613694667816, "step": 4535 }, { "epoch": 0.9312256210223774, "grad_norm": 0.905194047215764, "kd_ratio": 0.5, "learning_rate": 2.4713973013779204e-07, "loss": 1.0766016244888306, "loss/kd": 1.857506275177002, "loss/lm": 0.2956969738006592, "step": 4536 }, { "epoch": 0.9314309176760419, "grad_norm": 0.8495248929662148, "kd_ratio": 0.5, "learning_rate": 2.456725335028476e-07, "loss": 1.0221471786499023, "loss/kd": 1.7420315742492676, "loss/lm": 0.3022627830505371, "step": 4537 }, { "epoch": 0.9316362143297064, "grad_norm": 0.9141183900654931, "kd_ratio": 0.5, "learning_rate": 2.442096508395153e-07, "loss": 0.9881833791732788, "loss/kd": 1.5801016092300415, "loss/lm": 0.39626508951187134, "step": 4538 }, { "epoch": 0.931841510983371, "grad_norm": 0.9648580934494292, "kd_ratio": 0.5, "learning_rate": 2.427510827947721e-07, "loss": 1.0854508876800537, "loss/kd": 1.9024691581726074, "loss/lm": 0.2684325575828552, "step": 4539 }, { "epoch": 0.9320468076370355, "grad_norm": 0.836697529413312, "kd_ratio": 0.5, "learning_rate": 2.4129683001368865e-07, "loss": 1.058563470840454, "loss/kd": 1.8351671695709229, "loss/lm": 0.28195977210998535, "step": 4540 }, { "epoch": 0.9322521042907, "grad_norm": 0.8998357193342973, "kd_ratio": 0.5, "learning_rate": 2.398468931394249e-07, "loss": 0.9626671671867371, "loss/kd": 1.5752278566360474, "loss/lm": 0.35010647773742676, "step": 4541 }, { "epoch": 0.9324574009443646, "grad_norm": 0.8651315240761345, "kd_ratio": 0.5, "learning_rate": 2.3840127281323567e-07, "loss": 1.1092325448989868, "loss/kd": 1.8547406196594238, "loss/lm": 0.3637245297431946, "step": 4542 }, { "epoch": 0.9326626975980291, "grad_norm": 0.9166523006916413, "kd_ratio": 0.5, "learning_rate": 2.3695996967446178e-07, "loss": 1.1093075275421143, "loss/kd": 1.8942900896072388, "loss/lm": 0.3243248462677002, "step": 4543 }, { "epoch": 0.9328679942516936, "grad_norm": 0.8632266964913754, "kd_ratio": 0.5, "learning_rate": 2.355229843605389e-07, "loss": 0.9076722860336304, "loss/kd": 1.5015795230865479, "loss/lm": 0.31376510858535767, "step": 4544 }, { "epoch": 0.9330732909053583, "grad_norm": 1.1979111952522843, "kd_ratio": 0.5, "learning_rate": 2.3409031750699084e-07, "loss": 1.1385893821716309, "loss/kd": 1.9505012035369873, "loss/lm": 0.326677531003952, "step": 4545 }, { "epoch": 0.9332785875590228, "grad_norm": 0.88642707024919, "kd_ratio": 0.5, "learning_rate": 2.3266196974743084e-07, "loss": 1.089603304862976, "loss/kd": 1.8814101219177246, "loss/lm": 0.29779645800590515, "step": 4546 }, { "epoch": 0.9334838842126874, "grad_norm": 1.007075204528099, "kd_ratio": 0.5, "learning_rate": 2.3123794171356683e-07, "loss": 0.8884746432304382, "loss/kd": 1.4792910814285278, "loss/lm": 0.29765817523002625, "step": 4547 }, { "epoch": 0.9336891808663519, "grad_norm": 0.8782264140998502, "kd_ratio": 0.5, "learning_rate": 2.2981823403518954e-07, "loss": 1.6171090602874756, "loss/kd": 2.930215835571289, "loss/lm": 0.30400219559669495, "step": 4548 }, { "epoch": 0.9338944775200164, "grad_norm": 0.8388004738611028, "kd_ratio": 0.5, "learning_rate": 2.2840284734018337e-07, "loss": 1.006357192993164, "loss/kd": 1.7508641481399536, "loss/lm": 0.2618502974510193, "step": 4549 }, { "epoch": 0.934099774173681, "grad_norm": 0.9500185190807998, "kd_ratio": 0.5, "learning_rate": 2.269917822545209e-07, "loss": 1.173520565032959, "loss/kd": 2.0310006141662598, "loss/lm": 0.3160404562950134, "step": 4550 }, { "epoch": 0.9343050708273455, "grad_norm": 0.8258553436997352, "kd_ratio": 0.5, "learning_rate": 2.2558503940226296e-07, "loss": 0.9323108196258545, "loss/kd": 1.5747121572494507, "loss/lm": 0.2899094820022583, "step": 4551 }, { "epoch": 0.93451036748101, "grad_norm": 0.8825016926212974, "kd_ratio": 0.5, "learning_rate": 2.241826194055574e-07, "loss": 1.0617111921310425, "loss/kd": 1.808341383934021, "loss/lm": 0.3150809407234192, "step": 4552 }, { "epoch": 0.9347156641346746, "grad_norm": 0.8299207545056964, "kd_ratio": 0.5, "learning_rate": 2.227845228846437e-07, "loss": 0.9459413290023804, "loss/kd": 1.5958900451660156, "loss/lm": 0.29599255323410034, "step": 4553 }, { "epoch": 0.9349209607883392, "grad_norm": 0.9097572300858559, "kd_ratio": 0.5, "learning_rate": 2.213907504578472e-07, "loss": 1.1766760349273682, "loss/kd": 2.0537819862365723, "loss/lm": 0.2995702028274536, "step": 4554 }, { "epoch": 0.9351262574420037, "grad_norm": 0.8979825226961498, "kd_ratio": 0.5, "learning_rate": 2.2000130274158039e-07, "loss": 1.108177661895752, "loss/kd": 1.8501428365707397, "loss/lm": 0.3662124276161194, "step": 4555 }, { "epoch": 0.9353315540956683, "grad_norm": 0.8482865937512631, "kd_ratio": 0.5, "learning_rate": 2.1861618035034394e-07, "loss": 0.9260146617889404, "loss/kd": 1.4093384742736816, "loss/lm": 0.44269078969955444, "step": 4556 }, { "epoch": 0.9355368507493328, "grad_norm": 0.8785780883882875, "kd_ratio": 0.5, "learning_rate": 2.1723538389672338e-07, "loss": 1.2771844863891602, "loss/kd": 2.2473840713500977, "loss/lm": 0.30698490142822266, "step": 4557 }, { "epoch": 0.9357421474029973, "grad_norm": 0.8393847352339677, "kd_ratio": 0.5, "learning_rate": 2.158589139913958e-07, "loss": 0.7990262508392334, "loss/kd": 1.3283106088638306, "loss/lm": 0.269741952419281, "step": 4558 }, { "epoch": 0.9359474440566619, "grad_norm": 0.9647595679221048, "kd_ratio": 0.5, "learning_rate": 2.144867712431198e-07, "loss": 1.0007373094558716, "loss/kd": 1.6263374090194702, "loss/lm": 0.3751372694969177, "step": 4559 }, { "epoch": 0.9361527407103264, "grad_norm": 0.9656203485757346, "kd_ratio": 0.5, "learning_rate": 2.1311895625874434e-07, "loss": 0.9972741007804871, "loss/kd": 1.6206798553466797, "loss/lm": 0.37386831641197205, "step": 4560 }, { "epoch": 0.9363580373639909, "grad_norm": 0.845377774910992, "kd_ratio": 0.5, "learning_rate": 2.1175546964320226e-07, "loss": 1.0286076068878174, "loss/kd": 1.7215389013290405, "loss/lm": 0.3356763422489166, "step": 4561 }, { "epoch": 0.9365633340176555, "grad_norm": 1.130862071867197, "kd_ratio": 0.5, "learning_rate": 2.1039631199950895e-07, "loss": 1.162135124206543, "loss/kd": 1.954763412475586, "loss/lm": 0.3695068359375, "step": 4562 }, { "epoch": 0.9367686306713201, "grad_norm": 0.8397434055043143, "kd_ratio": 0.5, "learning_rate": 2.0904148392877354e-07, "loss": 1.1324470043182373, "loss/kd": 1.9914591312408447, "loss/lm": 0.27343490719795227, "step": 4563 }, { "epoch": 0.9369739273249846, "grad_norm": 0.8858597845747547, "kd_ratio": 0.5, "learning_rate": 2.0769098603018233e-07, "loss": 0.8543609380722046, "loss/kd": 1.3502799272537231, "loss/lm": 0.3584419786930084, "step": 4564 }, { "epoch": 0.9371792239786492, "grad_norm": 0.8968833988443958, "kd_ratio": 0.5, "learning_rate": 2.06344818901012e-07, "loss": 0.916885256767273, "loss/kd": 1.5005059242248535, "loss/lm": 0.33326461911201477, "step": 4565 }, { "epoch": 0.9373845206323137, "grad_norm": 0.7946167001584025, "kd_ratio": 0.5, "learning_rate": 2.050029831366185e-07, "loss": 1.0961506366729736, "loss/kd": 1.8003009557724, "loss/lm": 0.392000287771225, "step": 4566 }, { "epoch": 0.9375898172859782, "grad_norm": 0.8827725359548203, "kd_ratio": 0.5, "learning_rate": 2.0366547933044712e-07, "loss": 0.953208863735199, "loss/kd": 1.5579636096954346, "loss/lm": 0.34845414757728577, "step": 4567 }, { "epoch": 0.9377951139396428, "grad_norm": 0.9147093685354369, "kd_ratio": 0.5, "learning_rate": 2.0233230807402472e-07, "loss": 1.0754456520080566, "loss/kd": 1.7669662237167358, "loss/lm": 0.38392511010169983, "step": 4568 }, { "epoch": 0.9380004105933073, "grad_norm": 0.8334160784383047, "kd_ratio": 0.5, "learning_rate": 2.0100346995696408e-07, "loss": 0.9129058718681335, "loss/kd": 1.4507691860198975, "loss/lm": 0.37504252791404724, "step": 4569 }, { "epoch": 0.9382057072469718, "grad_norm": 0.9396968677062729, "kd_ratio": 0.5, "learning_rate": 1.9967896556695844e-07, "loss": 1.1200146675109863, "loss/kd": 1.8908324241638184, "loss/lm": 0.34919703006744385, "step": 4570 }, { "epoch": 0.9384110039006364, "grad_norm": 0.9195135347019104, "kd_ratio": 0.5, "learning_rate": 1.983587954897881e-07, "loss": 2.0437681674957275, "loss/kd": 3.7468669414520264, "loss/lm": 0.3406693935394287, "step": 4571 }, { "epoch": 0.938616300554301, "grad_norm": 0.8914706229340243, "kd_ratio": 0.5, "learning_rate": 1.970429603093127e-07, "loss": 1.2540026903152466, "loss/kd": 2.12088680267334, "loss/lm": 0.3871186077594757, "step": 4572 }, { "epoch": 0.9388215972079655, "grad_norm": 0.8550056539167582, "kd_ratio": 0.5, "learning_rate": 1.9573146060747673e-07, "loss": 1.0270546674728394, "loss/kd": 1.7339318990707397, "loss/lm": 0.32017749547958374, "step": 4573 }, { "epoch": 0.9390268938616301, "grad_norm": 0.8598816069732658, "kd_ratio": 0.5, "learning_rate": 1.9442429696430954e-07, "loss": 1.1072636842727661, "loss/kd": 1.8873283863067627, "loss/lm": 0.32719898223876953, "step": 4574 }, { "epoch": 0.9392321905152946, "grad_norm": 0.8363128651441478, "kd_ratio": 0.5, "learning_rate": 1.931214699579176e-07, "loss": 0.8845222592353821, "loss/kd": 1.3925330638885498, "loss/lm": 0.37651148438453674, "step": 4575 }, { "epoch": 0.9394374871689591, "grad_norm": 0.9187800413584145, "kd_ratio": 0.5, "learning_rate": 1.918229801644944e-07, "loss": 1.1101897954940796, "loss/kd": 1.8275591135025024, "loss/lm": 0.3928203880786896, "step": 4576 }, { "epoch": 0.9396427838226237, "grad_norm": 1.1238568066449721, "kd_ratio": 0.5, "learning_rate": 1.9052882815831065e-07, "loss": 0.9733911156654358, "loss/kd": 1.652686357498169, "loss/lm": 0.29409587383270264, "step": 4577 }, { "epoch": 0.9398480804762882, "grad_norm": 0.8559237004769602, "kd_ratio": 0.5, "learning_rate": 1.8923901451172287e-07, "loss": 1.098906397819519, "loss/kd": 1.8051174879074097, "loss/lm": 0.3926953673362732, "step": 4578 }, { "epoch": 0.9400533771299527, "grad_norm": 0.8290597554477264, "kd_ratio": 0.5, "learning_rate": 1.8795353979516596e-07, "loss": 1.236088514328003, "loss/kd": 2.102865219116211, "loss/lm": 0.36931174993515015, "step": 4579 }, { "epoch": 0.9402586737836174, "grad_norm": 0.8802445664824756, "kd_ratio": 0.5, "learning_rate": 1.866724045771573e-07, "loss": 0.9334231019020081, "loss/kd": 1.5838348865509033, "loss/lm": 0.2830113172531128, "step": 4580 }, { "epoch": 0.9404639704372819, "grad_norm": 0.8411461580128452, "kd_ratio": 0.5, "learning_rate": 1.8539560942429592e-07, "loss": 0.8998733758926392, "loss/kd": 1.455819845199585, "loss/lm": 0.34392696619033813, "step": 4581 }, { "epoch": 0.9406692670909464, "grad_norm": 0.9299863858776629, "kd_ratio": 0.5, "learning_rate": 1.8412315490125787e-07, "loss": 0.9691877365112305, "loss/kd": 1.6126347780227661, "loss/lm": 0.32574063539505005, "step": 4582 }, { "epoch": 0.940874563744611, "grad_norm": 0.8186573157176229, "kd_ratio": 0.5, "learning_rate": 1.8285504157080414e-07, "loss": 0.9051321148872375, "loss/kd": 1.4853764772415161, "loss/lm": 0.32488778233528137, "step": 4583 }, { "epoch": 0.9410798603982755, "grad_norm": 0.8542573140153575, "kd_ratio": 0.5, "learning_rate": 1.8159126999377164e-07, "loss": 1.0104507207870483, "loss/kd": 1.736593246459961, "loss/lm": 0.2843081057071686, "step": 4584 }, { "epoch": 0.94128515705194, "grad_norm": 0.850868929169657, "kd_ratio": 0.5, "learning_rate": 1.8033184072908105e-07, "loss": 1.0545637607574463, "loss/kd": 1.8681244850158691, "loss/lm": 0.24100299179553986, "step": 4585 }, { "epoch": 0.9414904537056046, "grad_norm": 0.8846585171666637, "kd_ratio": 0.5, "learning_rate": 1.7907675433372907e-07, "loss": 0.9399608373641968, "loss/kd": 1.5554102659225464, "loss/lm": 0.3245113492012024, "step": 4586 }, { "epoch": 0.9416957503592691, "grad_norm": 0.9209981648072117, "kd_ratio": 0.5, "learning_rate": 1.7782601136279277e-07, "loss": 1.0463838577270508, "loss/kd": 1.7444958686828613, "loss/lm": 0.34827178716659546, "step": 4587 }, { "epoch": 0.9419010470129336, "grad_norm": 0.8296452269674889, "kd_ratio": 0.5, "learning_rate": 1.7657961236942856e-07, "loss": 1.1834437847137451, "loss/kd": 1.9404778480529785, "loss/lm": 0.42640984058380127, "step": 4588 }, { "epoch": 0.9421063436665983, "grad_norm": 0.7737882626029715, "kd_ratio": 0.5, "learning_rate": 1.7533755790487327e-07, "loss": 0.9925815463066101, "loss/kd": 1.5981032848358154, "loss/lm": 0.3870598077774048, "step": 4589 }, { "epoch": 0.9423116403202628, "grad_norm": 0.8834622849547138, "kd_ratio": 0.5, "learning_rate": 1.7409984851844087e-07, "loss": 0.9295114278793335, "loss/kd": 1.4913219213485718, "loss/lm": 0.36770087480545044, "step": 4590 }, { "epoch": 0.9425169369739274, "grad_norm": 0.9971670568233703, "kd_ratio": 0.5, "learning_rate": 1.7286648475752122e-07, "loss": 1.1726781129837036, "loss/kd": 1.9053329229354858, "loss/lm": 0.44002339243888855, "step": 4591 }, { "epoch": 0.9427222336275919, "grad_norm": 0.863048850428601, "kd_ratio": 0.5, "learning_rate": 1.71637467167588e-07, "loss": 1.0781078338623047, "loss/kd": 1.8149114847183228, "loss/lm": 0.3413042426109314, "step": 4592 }, { "epoch": 0.9429275302812564, "grad_norm": 0.8325311770914056, "kd_ratio": 0.5, "learning_rate": 1.7041279629218532e-07, "loss": 0.9357602596282959, "loss/kd": 1.6080520153045654, "loss/lm": 0.26346853375434875, "step": 4593 }, { "epoch": 0.943132826934921, "grad_norm": 0.8972876628939382, "kd_ratio": 0.5, "learning_rate": 1.6919247267294215e-07, "loss": 1.113572359085083, "loss/kd": 1.9404724836349487, "loss/lm": 0.2866722643375397, "step": 4594 }, { "epoch": 0.9433381235885855, "grad_norm": 0.8626848162111228, "kd_ratio": 0.5, "learning_rate": 1.6797649684956118e-07, "loss": 1.0058826208114624, "loss/kd": 1.707162618637085, "loss/lm": 0.3046026825904846, "step": 4595 }, { "epoch": 0.94354342024225, "grad_norm": 0.825597547426107, "kd_ratio": 0.5, "learning_rate": 1.6676486935982116e-07, "loss": 1.093637466430664, "loss/kd": 1.8434998989105225, "loss/lm": 0.34377506375312805, "step": 4596 }, { "epoch": 0.9437487168959146, "grad_norm": 0.8720595404018563, "kd_ratio": 0.5, "learning_rate": 1.655575907395812e-07, "loss": 1.9960153102874756, "loss/kd": 3.719458818435669, "loss/lm": 0.2725719213485718, "step": 4597 }, { "epoch": 0.9439540135495792, "grad_norm": 1.0168529339038885, "kd_ratio": 0.5, "learning_rate": 1.6435466152277424e-07, "loss": 1.708992838859558, "loss/kd": 3.1804165840148926, "loss/lm": 0.23756910860538483, "step": 4598 }, { "epoch": 0.9441593102032437, "grad_norm": 0.9729999201612701, "kd_ratio": 0.5, "learning_rate": 1.6315608224141023e-07, "loss": 0.7585740685462952, "loss/kd": 1.1918013095855713, "loss/lm": 0.32534685730934143, "step": 4599 }, { "epoch": 0.9443646068569083, "grad_norm": 0.81183487889954, "kd_ratio": 0.5, "learning_rate": 1.6196185342557625e-07, "loss": 0.8675329685211182, "loss/kd": 1.395761251449585, "loss/lm": 0.3393046259880066, "step": 4600 }, { "epoch": 0.9445699035105728, "grad_norm": 0.8737141887409577, "kd_ratio": 0.5, "learning_rate": 1.6077197560343537e-07, "loss": 1.0184197425842285, "loss/kd": 1.6765779256820679, "loss/lm": 0.36026161909103394, "step": 4601 }, { "epoch": 0.9447752001642373, "grad_norm": 0.7672009142334426, "kd_ratio": 0.5, "learning_rate": 1.5958644930122336e-07, "loss": 0.982642650604248, "loss/kd": 1.6770696640014648, "loss/lm": 0.2882155776023865, "step": 4602 }, { "epoch": 0.9449804968179019, "grad_norm": 0.8157893024051995, "kd_ratio": 0.5, "learning_rate": 1.5840527504325632e-07, "loss": 1.0750259160995483, "loss/kd": 1.828619360923767, "loss/lm": 0.32143256068229675, "step": 4603 }, { "epoch": 0.9451857934715664, "grad_norm": 0.9530304036189378, "kd_ratio": 0.5, "learning_rate": 1.5722845335192084e-07, "loss": 1.0611670017242432, "loss/kd": 1.8549671173095703, "loss/lm": 0.26736685633659363, "step": 4604 }, { "epoch": 0.9453910901252309, "grad_norm": 1.3036555891920263, "kd_ratio": 0.5, "learning_rate": 1.560559847476839e-07, "loss": 0.9212679862976074, "loss/kd": 1.5338761806488037, "loss/lm": 0.3086598515510559, "step": 4605 }, { "epoch": 0.9455963867788955, "grad_norm": 0.9115191868762565, "kd_ratio": 0.5, "learning_rate": 1.5488786974908188e-07, "loss": 1.0137381553649902, "loss/kd": 1.707727313041687, "loss/lm": 0.3197490870952606, "step": 4606 }, { "epoch": 0.9458016834325601, "grad_norm": 0.8399828223930802, "kd_ratio": 0.5, "learning_rate": 1.5372410887272814e-07, "loss": 0.8921291828155518, "loss/kd": 1.4562164545059204, "loss/lm": 0.3280419707298279, "step": 4607 }, { "epoch": 0.9460069800862246, "grad_norm": 0.990019764578937, "kd_ratio": 0.5, "learning_rate": 1.5256470263331213e-07, "loss": 1.0200644731521606, "loss/kd": 1.7396862506866455, "loss/lm": 0.300442636013031, "step": 4608 }, { "epoch": 0.9462122767398892, "grad_norm": 0.834370829857823, "kd_ratio": 0.5, "learning_rate": 1.5140965154359145e-07, "loss": 1.1053353548049927, "loss/kd": 1.8601281642913818, "loss/lm": 0.3505425751209259, "step": 4609 }, { "epoch": 0.9464175733935537, "grad_norm": 0.8710786288637031, "kd_ratio": 0.5, "learning_rate": 1.5025895611440744e-07, "loss": 1.2075459957122803, "loss/kd": 2.1336300373077393, "loss/lm": 0.28146207332611084, "step": 4610 }, { "epoch": 0.9466228700472182, "grad_norm": 0.9058116716739993, "kd_ratio": 0.5, "learning_rate": 1.4911261685466416e-07, "loss": 1.1561588048934937, "loss/kd": 1.9486479759216309, "loss/lm": 0.3636695444583893, "step": 4611 }, { "epoch": 0.9468281667008828, "grad_norm": 0.8277114200123182, "kd_ratio": 0.5, "learning_rate": 1.4797063427134827e-07, "loss": 0.8751447796821594, "loss/kd": 1.433225154876709, "loss/lm": 0.31706440448760986, "step": 4612 }, { "epoch": 0.9470334633545473, "grad_norm": 0.8361908004863502, "kd_ratio": 0.5, "learning_rate": 1.4683300886951247e-07, "loss": 1.1362370252609253, "loss/kd": 1.96653413772583, "loss/lm": 0.3059399425983429, "step": 4613 }, { "epoch": 0.9472387600082118, "grad_norm": 0.862357072997884, "kd_ratio": 0.5, "learning_rate": 1.4569974115228758e-07, "loss": 1.1900526285171509, "loss/kd": 2.018155574798584, "loss/lm": 0.36194977164268494, "step": 4614 }, { "epoch": 0.9474440566618764, "grad_norm": 0.8629154408724248, "kd_ratio": 0.5, "learning_rate": 1.4457083162087383e-07, "loss": 1.1284862756729126, "loss/kd": 1.8313223123550415, "loss/lm": 0.4256502687931061, "step": 4615 }, { "epoch": 0.947649353315541, "grad_norm": 0.8467375602530702, "kd_ratio": 0.5, "learning_rate": 1.4344628077454626e-07, "loss": 1.2723597288131714, "loss/kd": 2.1854710578918457, "loss/lm": 0.3592483103275299, "step": 4616 }, { "epoch": 0.9478546499692055, "grad_norm": 1.0215100020301229, "kd_ratio": 0.5, "learning_rate": 1.423260891106526e-07, "loss": 0.8755481243133545, "loss/kd": 1.4057198762893677, "loss/lm": 0.34537631273269653, "step": 4617 }, { "epoch": 0.9480599466228701, "grad_norm": 0.7887106034075526, "kd_ratio": 0.5, "learning_rate": 1.4121025712460877e-07, "loss": 0.8476952314376831, "loss/kd": 1.3448225259780884, "loss/lm": 0.3505679666996002, "step": 4618 }, { "epoch": 0.9482652432765346, "grad_norm": 1.1637913346575293, "kd_ratio": 0.5, "learning_rate": 1.4009878530990784e-07, "loss": 1.7230006456375122, "loss/kd": 3.1535778045654297, "loss/lm": 0.29242339730262756, "step": 4619 }, { "epoch": 0.9484705399301991, "grad_norm": 0.9193557110748276, "kd_ratio": 0.5, "learning_rate": 1.3899167415810877e-07, "loss": 0.9788960218429565, "loss/kd": 1.7033947706222534, "loss/lm": 0.25439730286598206, "step": 4620 }, { "epoch": 0.9486758365838637, "grad_norm": 0.8765035238851913, "kd_ratio": 0.5, "learning_rate": 1.3788892415884881e-07, "loss": 0.9016762971878052, "loss/kd": 1.518180012702942, "loss/lm": 0.28517261147499084, "step": 4621 }, { "epoch": 0.9488811332375282, "grad_norm": 0.9066816412873375, "kd_ratio": 0.5, "learning_rate": 1.3679053579983003e-07, "loss": 0.8820006847381592, "loss/kd": 1.4964244365692139, "loss/lm": 0.2675769031047821, "step": 4622 }, { "epoch": 0.9490864298911927, "grad_norm": 0.9188590550093158, "kd_ratio": 0.5, "learning_rate": 1.3569650956682944e-07, "loss": 0.9333517551422119, "loss/kd": 1.5799345970153809, "loss/lm": 0.2867688536643982, "step": 4623 }, { "epoch": 0.9492917265448573, "grad_norm": 0.7942155689661268, "kd_ratio": 0.5, "learning_rate": 1.3460684594369333e-07, "loss": 0.8066707849502563, "loss/kd": 1.351272702217102, "loss/lm": 0.26206880807876587, "step": 4624 }, { "epoch": 0.9494970231985219, "grad_norm": 0.8753032987214107, "kd_ratio": 0.5, "learning_rate": 1.335215454123384e-07, "loss": 0.9862992167472839, "loss/kd": 1.6320738792419434, "loss/lm": 0.3405245542526245, "step": 4625 }, { "epoch": 0.9497023198521864, "grad_norm": 0.8751935948078775, "kd_ratio": 0.5, "learning_rate": 1.3244060845275298e-07, "loss": 0.9050616025924683, "loss/kd": 1.5000457763671875, "loss/lm": 0.3100774884223938, "step": 4626 }, { "epoch": 0.949907616505851, "grad_norm": 0.8381471410299154, "kd_ratio": 0.5, "learning_rate": 1.313640355429946e-07, "loss": 0.9670677781105042, "loss/kd": 1.52022385597229, "loss/lm": 0.4139116704463959, "step": 4627 }, { "epoch": 0.9501129131595155, "grad_norm": 0.9779429802505542, "kd_ratio": 0.5, "learning_rate": 1.3029182715919131e-07, "loss": 1.1142419576644897, "loss/kd": 1.8849350214004517, "loss/lm": 0.34354880452156067, "step": 4628 }, { "epoch": 0.95031820981318, "grad_norm": 0.8160481156438452, "kd_ratio": 0.5, "learning_rate": 1.2922398377553824e-07, "loss": 1.0079466104507446, "loss/kd": 1.652642011642456, "loss/lm": 0.3632511794567108, "step": 4629 }, { "epoch": 0.9505235064668446, "grad_norm": 1.0492295149046194, "kd_ratio": 0.5, "learning_rate": 1.281605058643054e-07, "loss": 1.0477784872055054, "loss/kd": 1.7825908660888672, "loss/lm": 0.3129661977291107, "step": 4630 }, { "epoch": 0.9507288031205091, "grad_norm": 0.8073890756911337, "kd_ratio": 0.5, "learning_rate": 1.2710139389582654e-07, "loss": 0.9261465072631836, "loss/kd": 1.535828948020935, "loss/lm": 0.3164641261100769, "step": 4631 }, { "epoch": 0.9509340997741736, "grad_norm": 0.8717950046716071, "kd_ratio": 0.5, "learning_rate": 1.260466483385092e-07, "loss": 0.8086106777191162, "loss/kd": 1.3131518363952637, "loss/lm": 0.3040695786476135, "step": 4632 }, { "epoch": 0.9511393964278382, "grad_norm": 0.9481791828773684, "kd_ratio": 0.5, "learning_rate": 1.2499626965882584e-07, "loss": 0.8803728818893433, "loss/kd": 1.4388978481292725, "loss/lm": 0.3218478560447693, "step": 4633 }, { "epoch": 0.9513446930815028, "grad_norm": 0.8591705674224054, "kd_ratio": 0.5, "learning_rate": 1.2395025832132034e-07, "loss": 1.0493361949920654, "loss/kd": 1.7698677778244019, "loss/lm": 0.32880449295043945, "step": 4634 }, { "epoch": 0.9515499897351674, "grad_norm": 0.8509642139566954, "kd_ratio": 0.5, "learning_rate": 1.2290861478860272e-07, "loss": 1.0069615840911865, "loss/kd": 1.64003324508667, "loss/lm": 0.3738898038864136, "step": 4635 }, { "epoch": 0.9517552863888319, "grad_norm": 0.8456169104091462, "kd_ratio": 0.5, "learning_rate": 1.2187133952135445e-07, "loss": 1.1458137035369873, "loss/kd": 1.964208960533142, "loss/lm": 0.3274185359477997, "step": 4636 }, { "epoch": 0.9519605830424964, "grad_norm": 0.9779395620754828, "kd_ratio": 0.5, "learning_rate": 1.20838432978323e-07, "loss": 1.175539255142212, "loss/kd": 2.0779001712799072, "loss/lm": 0.27317842841148376, "step": 4637 }, { "epoch": 0.952165879696161, "grad_norm": 0.8567743016995396, "kd_ratio": 0.5, "learning_rate": 1.1980989561632296e-07, "loss": 0.8637166023254395, "loss/kd": 1.4701858758926392, "loss/lm": 0.25724726915359497, "step": 4638 }, { "epoch": 0.9523711763498255, "grad_norm": 0.829184596593578, "kd_ratio": 0.5, "learning_rate": 1.1878572789023935e-07, "loss": 1.2063548564910889, "loss/kd": 2.02396559715271, "loss/lm": 0.3887440264225006, "step": 4639 }, { "epoch": 0.95257647300349, "grad_norm": 0.8219976167434369, "kd_ratio": 0.5, "learning_rate": 1.1776593025302097e-07, "loss": 1.2469384670257568, "loss/kd": 2.123272657394409, "loss/lm": 0.37060436606407166, "step": 4640 }, { "epoch": 0.9527817696571546, "grad_norm": 0.873698758707496, "kd_ratio": 0.5, "learning_rate": 1.1675050315568703e-07, "loss": 1.2197527885437012, "loss/kd": 2.0585813522338867, "loss/lm": 0.38092419505119324, "step": 4641 }, { "epoch": 0.9529870663108191, "grad_norm": 0.8150875969227018, "kd_ratio": 0.5, "learning_rate": 1.1573944704732276e-07, "loss": 1.1975407600402832, "loss/kd": 2.020318031311035, "loss/lm": 0.37476351857185364, "step": 4642 }, { "epoch": 0.9531923629644837, "grad_norm": 0.8573144458739062, "kd_ratio": 0.5, "learning_rate": 1.1473276237507935e-07, "loss": 0.8668638467788696, "loss/kd": 1.4120569229125977, "loss/lm": 0.3216708302497864, "step": 4643 }, { "epoch": 0.9533976596181483, "grad_norm": 0.936377117664016, "kd_ratio": 0.5, "learning_rate": 1.1373044958417734e-07, "loss": 1.1031770706176758, "loss/kd": 1.8778128623962402, "loss/lm": 0.32854124903678894, "step": 4644 }, { "epoch": 0.9536029562718128, "grad_norm": 0.8572740015573878, "kd_ratio": 0.5, "learning_rate": 1.127325091178988e-07, "loss": 1.0755552053451538, "loss/kd": 1.8310952186584473, "loss/lm": 0.32001522183418274, "step": 4645 }, { "epoch": 0.9538082529254773, "grad_norm": 0.9099225130148336, "kd_ratio": 0.5, "learning_rate": 1.1173894141759955e-07, "loss": 0.756913423538208, "loss/kd": 1.2268669605255127, "loss/lm": 0.2869598865509033, "step": 4646 }, { "epoch": 0.9540135495791419, "grad_norm": 0.8334665906172519, "kd_ratio": 0.5, "learning_rate": 1.1074974692269258e-07, "loss": 1.4244697093963623, "loss/kd": 2.444953441619873, "loss/lm": 0.40398597717285156, "step": 4647 }, { "epoch": 0.9542188462328064, "grad_norm": 0.8240123215373962, "kd_ratio": 0.5, "learning_rate": 1.0976492607066458e-07, "loss": 1.0637917518615723, "loss/kd": 1.8385597467422485, "loss/lm": 0.28902381658554077, "step": 4648 }, { "epoch": 0.9544241428864709, "grad_norm": 0.8576077107445442, "kd_ratio": 0.5, "learning_rate": 1.0878447929706382e-07, "loss": 1.0789519548416138, "loss/kd": 1.8663794994354248, "loss/lm": 0.29152435064315796, "step": 4649 }, { "epoch": 0.9546294395401355, "grad_norm": 0.8527789719466725, "kd_ratio": 0.5, "learning_rate": 1.0780840703550455e-07, "loss": 1.0354318618774414, "loss/kd": 1.6903126239776611, "loss/lm": 0.3805510401725769, "step": 4650 }, { "epoch": 0.9548347361938, "grad_norm": 0.8837808123805446, "kd_ratio": 0.5, "learning_rate": 1.068367097176659e-07, "loss": 1.1502338647842407, "loss/kd": 1.9418036937713623, "loss/lm": 0.35866400599479675, "step": 4651 }, { "epoch": 0.9550400328474646, "grad_norm": 0.8672515598941569, "kd_ratio": 0.5, "learning_rate": 1.0586938777329526e-07, "loss": 0.8462129831314087, "loss/kd": 1.4127928018569946, "loss/lm": 0.27963319420814514, "step": 4652 }, { "epoch": 0.9552453295011292, "grad_norm": 0.8860733231775894, "kd_ratio": 0.5, "learning_rate": 1.0490644163020147e-07, "loss": 1.1472140550613403, "loss/kd": 1.9711161851882935, "loss/lm": 0.3233119249343872, "step": 4653 }, { "epoch": 0.9554506261547937, "grad_norm": 0.8921225117006689, "kd_ratio": 0.5, "learning_rate": 1.0394787171425947e-07, "loss": 0.913460910320282, "loss/kd": 1.5189553499221802, "loss/lm": 0.3079664707183838, "step": 4654 }, { "epoch": 0.9556559228084582, "grad_norm": 1.059188290589224, "kd_ratio": 0.5, "learning_rate": 1.02993678449409e-07, "loss": 0.8172434568405151, "loss/kd": 1.2881042957305908, "loss/lm": 0.34638258814811707, "step": 4655 }, { "epoch": 0.9558612194621228, "grad_norm": 0.8927989632364145, "kd_ratio": 0.5, "learning_rate": 1.020438622576514e-07, "loss": 1.1833548545837402, "loss/kd": 2.0508158206939697, "loss/lm": 0.31589382886886597, "step": 4656 }, { "epoch": 0.9560665161157873, "grad_norm": 0.867212029305446, "kd_ratio": 0.5, "learning_rate": 1.0109842355905842e-07, "loss": 1.0884677171707153, "loss/kd": 1.8614100217819214, "loss/lm": 0.31552547216415405, "step": 4657 }, { "epoch": 0.9562718127694518, "grad_norm": 0.8951804002943984, "kd_ratio": 0.5, "learning_rate": 1.0015736277175892e-07, "loss": 1.0833401679992676, "loss/kd": 1.8841711282730103, "loss/lm": 0.2825091481208801, "step": 4658 }, { "epoch": 0.9564771094231164, "grad_norm": 0.9437388300540033, "kd_ratio": 0.5, "learning_rate": 9.92206803119511e-08, "loss": 1.0665092468261719, "loss/kd": 1.7202123403549194, "loss/lm": 0.41280603408813477, "step": 4659 }, { "epoch": 0.9566824060767809, "grad_norm": 0.8439679770087823, "kd_ratio": 0.5, "learning_rate": 9.828837659389245e-08, "loss": 1.8767002820968628, "loss/kd": 3.522695302963257, "loss/lm": 0.23070518672466278, "step": 4660 }, { "epoch": 0.9568877027304455, "grad_norm": 0.9004799393762105, "kd_ratio": 0.5, "learning_rate": 9.736045202990651e-08, "loss": 1.2608598470687866, "loss/kd": 2.1068100929260254, "loss/lm": 0.41490960121154785, "step": 4661 }, { "epoch": 0.9570929993841101, "grad_norm": 0.8121688054738382, "kd_ratio": 0.5, "learning_rate": 9.643690703037833e-08, "loss": 1.135223627090454, "loss/kd": 1.914743185043335, "loss/lm": 0.355704128742218, "step": 4662 }, { "epoch": 0.9572982960377746, "grad_norm": 0.8126944682680373, "kd_ratio": 0.5, "learning_rate": 9.551774200375896e-08, "loss": 1.0344078540802002, "loss/kd": 1.7546414136886597, "loss/lm": 0.31417420506477356, "step": 4663 }, { "epoch": 0.9575035926914391, "grad_norm": 0.9267514276323434, "kd_ratio": 0.5, "learning_rate": 9.460295735655878e-08, "loss": 0.8704898953437805, "loss/kd": 1.4973666667938232, "loss/lm": 0.24361316859722137, "step": 4664 }, { "epoch": 0.9577088893451037, "grad_norm": 0.8839261694217949, "kd_ratio": 0.5, "learning_rate": 9.369255349335415e-08, "loss": 1.0089645385742188, "loss/kd": 1.7001862525939941, "loss/lm": 0.3177429437637329, "step": 4665 }, { "epoch": 0.9579141859987682, "grad_norm": 0.8526005357798698, "kd_ratio": 0.5, "learning_rate": 9.278653081678079e-08, "loss": 1.0705828666687012, "loss/kd": 1.83988618850708, "loss/lm": 0.3012794852256775, "step": 4666 }, { "epoch": 0.9581194826524327, "grad_norm": 0.785774984672442, "kd_ratio": 0.5, "learning_rate": 9.188488972753928e-08, "loss": 1.042614459991455, "loss/kd": 1.7285300493240356, "loss/lm": 0.3566989302635193, "step": 4667 }, { "epoch": 0.9583247793060973, "grad_norm": 0.8744014140425436, "kd_ratio": 0.5, "learning_rate": 9.098763062439175e-08, "loss": 1.0258688926696777, "loss/kd": 1.7487218379974365, "loss/lm": 0.3030160665512085, "step": 4668 }, { "epoch": 0.9585300759597618, "grad_norm": 0.973883840101488, "kd_ratio": 0.5, "learning_rate": 9.00947539041619e-08, "loss": 1.1610013246536255, "loss/kd": 1.9717624187469482, "loss/lm": 0.35024020075798035, "step": 4669 }, { "epoch": 0.9587353726134265, "grad_norm": 0.9089319895548004, "kd_ratio": 0.5, "learning_rate": 8.920625996173493e-08, "loss": 0.9650033712387085, "loss/kd": 1.6310516595840454, "loss/lm": 0.2989550232887268, "step": 4670 }, { "epoch": 0.958940669267091, "grad_norm": 0.8977853005258429, "kd_ratio": 0.5, "learning_rate": 8.832214919005877e-08, "loss": 0.9166781306266785, "loss/kd": 1.5336781740188599, "loss/lm": 0.29967811703681946, "step": 4671 }, { "epoch": 0.9591459659207555, "grad_norm": 0.8204858514892759, "kd_ratio": 0.5, "learning_rate": 8.744242198014174e-08, "loss": 1.0623654127120972, "loss/kd": 1.8196676969528198, "loss/lm": 0.3050631284713745, "step": 4672 }, { "epoch": 0.95935126257442, "grad_norm": 0.8781698031804873, "kd_ratio": 0.5, "learning_rate": 8.656707872105485e-08, "loss": 1.094201683998108, "loss/kd": 1.8782799243927002, "loss/lm": 0.310123473405838, "step": 4673 }, { "epoch": 0.9595565592280846, "grad_norm": 0.7951950488123803, "kd_ratio": 0.5, "learning_rate": 8.569611979992953e-08, "loss": 1.0135254859924316, "loss/kd": 1.6967862844467163, "loss/lm": 0.3302646577358246, "step": 4674 }, { "epoch": 0.9597618558817491, "grad_norm": 0.8112373256632865, "kd_ratio": 0.5, "learning_rate": 8.482954560195655e-08, "loss": 1.0076733827590942, "loss/kd": 1.6966547966003418, "loss/lm": 0.3186918795108795, "step": 4675 }, { "epoch": 0.9599671525354136, "grad_norm": 0.8279337608888773, "kd_ratio": 0.5, "learning_rate": 8.396735651039046e-08, "loss": 0.8777816295623779, "loss/kd": 1.416145920753479, "loss/lm": 0.33941736817359924, "step": 4676 }, { "epoch": 0.9601724491890782, "grad_norm": 0.8000245041338994, "kd_ratio": 0.5, "learning_rate": 8.31095529065451e-08, "loss": 0.9939006567001343, "loss/kd": 1.6580209732055664, "loss/lm": 0.3297803997993469, "step": 4677 }, { "epoch": 0.9603777458427427, "grad_norm": 0.8642970561908287, "kd_ratio": 0.5, "learning_rate": 8.22561351697948e-08, "loss": 1.9751044511795044, "loss/kd": 3.6924080848693848, "loss/lm": 0.25780078768730164, "step": 4678 }, { "epoch": 0.9605830424964074, "grad_norm": 0.8565449144287559, "kd_ratio": 0.5, "learning_rate": 8.14071036775721e-08, "loss": 1.2406765222549438, "loss/kd": 2.1482841968536377, "loss/lm": 0.33306875824928284, "step": 4679 }, { "epoch": 0.9607883391500719, "grad_norm": 0.8899607147260278, "kd_ratio": 0.5, "learning_rate": 8.056245880537438e-08, "loss": 0.8470966815948486, "loss/kd": 1.4209967851638794, "loss/lm": 0.27319663763046265, "step": 4680 }, { "epoch": 0.9609936358037364, "grad_norm": 0.8952377859011735, "kd_ratio": 0.5, "learning_rate": 7.97222009267551e-08, "loss": 0.970430850982666, "loss/kd": 1.5556621551513672, "loss/lm": 0.38519951701164246, "step": 4681 }, { "epoch": 0.961198932457401, "grad_norm": 0.841577527074385, "kd_ratio": 0.5, "learning_rate": 7.888633041332805e-08, "loss": 1.1819733381271362, "loss/kd": 2.0535967350006104, "loss/lm": 0.3103500306606293, "step": 4682 }, { "epoch": 0.9614042291110655, "grad_norm": 0.8657865044194858, "kd_ratio": 0.5, "learning_rate": 7.805484763476756e-08, "loss": 0.8499885201454163, "loss/kd": 1.4295213222503662, "loss/lm": 0.2704556882381439, "step": 4683 }, { "epoch": 0.96160952576473, "grad_norm": 0.8431756494504781, "kd_ratio": 0.5, "learning_rate": 7.722775295880724e-08, "loss": 1.0896738767623901, "loss/kd": 1.858198881149292, "loss/lm": 0.3211487829685211, "step": 4684 }, { "epoch": 0.9618148224183946, "grad_norm": 0.8732714151674537, "kd_ratio": 0.5, "learning_rate": 7.640504675124006e-08, "loss": 1.064705729484558, "loss/kd": 1.77238929271698, "loss/lm": 0.357022225856781, "step": 4685 }, { "epoch": 0.9620201190720591, "grad_norm": 1.0217251827341083, "kd_ratio": 0.5, "learning_rate": 7.558672937591937e-08, "loss": 1.0763829946517944, "loss/kd": 1.841856837272644, "loss/lm": 0.31090906262397766, "step": 4686 }, { "epoch": 0.9622254157257236, "grad_norm": 0.8890343026294201, "kd_ratio": 0.5, "learning_rate": 7.477280119475239e-08, "loss": 0.8665851354598999, "loss/kd": 1.4483956098556519, "loss/lm": 0.28477466106414795, "step": 4687 }, { "epoch": 0.9624307123793883, "grad_norm": 0.8595011942793209, "kd_ratio": 0.5, "learning_rate": 7.396326256771336e-08, "loss": 1.0317950248718262, "loss/kd": 1.663485050201416, "loss/lm": 0.40010493993759155, "step": 4688 }, { "epoch": 0.9626360090330528, "grad_norm": 0.9776677753954285, "kd_ratio": 0.5, "learning_rate": 7.315811385282701e-08, "loss": 0.9741209149360657, "loss/kd": 1.5981254577636719, "loss/lm": 0.3501163423061371, "step": 4689 }, { "epoch": 0.9628413056867173, "grad_norm": 0.9166558275092129, "kd_ratio": 0.5, "learning_rate": 7.235735540618183e-08, "loss": 1.234483003616333, "loss/kd": 2.147953510284424, "loss/lm": 0.3210124671459198, "step": 4690 }, { "epoch": 0.9630466023403819, "grad_norm": 0.9147541088256319, "kd_ratio": 0.5, "learning_rate": 7.156098758192453e-08, "loss": 1.7716103792190552, "loss/kd": 3.318875312805176, "loss/lm": 0.22434547543525696, "step": 4691 }, { "epoch": 0.9632518989940464, "grad_norm": 0.9599722058873393, "kd_ratio": 0.5, "learning_rate": 7.076901073225562e-08, "loss": 0.9495531320571899, "loss/kd": 1.5361357927322388, "loss/lm": 0.3629705309867859, "step": 4692 }, { "epoch": 0.9634571956477109, "grad_norm": 0.9131899342353156, "kd_ratio": 0.5, "learning_rate": 6.998142520743934e-08, "loss": 1.3860292434692383, "loss/kd": 2.390209436416626, "loss/lm": 0.381849080324173, "step": 4693 }, { "epoch": 0.9636624923013755, "grad_norm": 0.8668206492098366, "kd_ratio": 0.5, "learning_rate": 6.919823135579373e-08, "loss": 1.0808188915252686, "loss/kd": 1.789373517036438, "loss/lm": 0.37226420640945435, "step": 4694 }, { "epoch": 0.96386778895504, "grad_norm": 1.124427464611739, "kd_ratio": 0.5, "learning_rate": 6.841942952369618e-08, "loss": 0.8848384618759155, "loss/kd": 1.4403996467590332, "loss/lm": 0.3292772173881531, "step": 4695 }, { "epoch": 0.9640730856087045, "grad_norm": 0.8133724442341665, "kd_ratio": 0.5, "learning_rate": 6.764502005558115e-08, "loss": 0.9192090630531311, "loss/kd": 1.4699974060058594, "loss/lm": 0.3684207499027252, "step": 4696 }, { "epoch": 0.9642783822623692, "grad_norm": 0.8400458615435592, "kd_ratio": 0.5, "learning_rate": 6.687500329394136e-08, "loss": 0.8365118503570557, "loss/kd": 1.4232293367385864, "loss/lm": 0.24979430437088013, "step": 4697 }, { "epoch": 0.9644836789160337, "grad_norm": 0.8425083669715048, "kd_ratio": 0.5, "learning_rate": 6.610937957932551e-08, "loss": 0.9618858098983765, "loss/kd": 1.614942193031311, "loss/lm": 0.3088294267654419, "step": 4698 }, { "epoch": 0.9646889755696982, "grad_norm": 0.780683336500211, "kd_ratio": 0.5, "learning_rate": 6.53481492503405e-08, "loss": 1.1985055208206177, "loss/kd": 2.0551767349243164, "loss/lm": 0.34183430671691895, "step": 4699 }, { "epoch": 0.9648942722233628, "grad_norm": 0.8111845577606448, "kd_ratio": 0.5, "learning_rate": 6.459131264365038e-08, "loss": 1.0186364650726318, "loss/kd": 1.6603596210479736, "loss/lm": 0.3769134283065796, "step": 4700 }, { "epoch": 0.9650995688770273, "grad_norm": 0.8499749376936383, "kd_ratio": 0.5, "learning_rate": 6.383887009397515e-08, "loss": 0.9979477524757385, "loss/kd": 1.6488465070724487, "loss/lm": 0.3470490276813507, "step": 4701 }, { "epoch": 0.9653048655306918, "grad_norm": 0.8580034706479388, "kd_ratio": 0.5, "learning_rate": 6.309082193409088e-08, "loss": 0.9000455737113953, "loss/kd": 1.536768913269043, "loss/lm": 0.26332226395606995, "step": 4702 }, { "epoch": 0.9655101621843564, "grad_norm": 0.8092026203602729, "kd_ratio": 0.5, "learning_rate": 6.23471684948318e-08, "loss": 0.8849546909332275, "loss/kd": 1.3501403331756592, "loss/lm": 0.4197691082954407, "step": 4703 }, { "epoch": 0.9657154588380209, "grad_norm": 0.8916404360980829, "kd_ratio": 0.5, "learning_rate": 6.160791010508816e-08, "loss": 0.8810591697692871, "loss/kd": 1.4773889780044556, "loss/lm": 0.2847294211387634, "step": 4704 }, { "epoch": 0.9659207554916854, "grad_norm": 0.8138935306397783, "kd_ratio": 0.5, "learning_rate": 6.087304709180509e-08, "loss": 1.1771938800811768, "loss/kd": 1.9797402620315552, "loss/lm": 0.37464749813079834, "step": 4705 }, { "epoch": 0.9661260521453501, "grad_norm": 0.7970201024633033, "kd_ratio": 0.5, "learning_rate": 6.014257977998594e-08, "loss": 0.9172576069831848, "loss/kd": 1.5248481035232544, "loss/lm": 0.30966711044311523, "step": 4706 }, { "epoch": 0.9663313487990146, "grad_norm": 0.8856282444262716, "kd_ratio": 0.5, "learning_rate": 5.9416508492688986e-08, "loss": 0.9025336503982544, "loss/kd": 1.4330106973648071, "loss/lm": 0.3720565736293793, "step": 4707 }, { "epoch": 0.9665366454526791, "grad_norm": 0.8416035266272482, "kd_ratio": 0.5, "learning_rate": 5.869483355102623e-08, "loss": 1.0677330493927002, "loss/kd": 1.70961594581604, "loss/lm": 0.4258502125740051, "step": 4708 }, { "epoch": 0.9667419421063437, "grad_norm": 0.8748113953915598, "kd_ratio": 0.5, "learning_rate": 5.797755527417015e-08, "loss": 0.7716647386550903, "loss/kd": 1.2169095277786255, "loss/lm": 0.3264199495315552, "step": 4709 }, { "epoch": 0.9669472387600082, "grad_norm": 0.9313685283658344, "kd_ratio": 0.5, "learning_rate": 5.726467397934365e-08, "loss": 1.1387357711791992, "loss/kd": 1.8543237447738647, "loss/lm": 0.42314791679382324, "step": 4710 }, { "epoch": 0.9671525354136727, "grad_norm": 0.8811136456192891, "kd_ratio": 0.5, "learning_rate": 5.655618998182899e-08, "loss": 1.1160130500793457, "loss/kd": 1.9021575450897217, "loss/lm": 0.3298686146736145, "step": 4711 }, { "epoch": 0.9673578320673373, "grad_norm": 0.9205498650350126, "kd_ratio": 0.5, "learning_rate": 5.585210359495996e-08, "loss": 0.9999439120292664, "loss/kd": 1.7121325731277466, "loss/lm": 0.2877552807331085, "step": 4712 }, { "epoch": 0.9675631287210018, "grad_norm": 0.8444215605342337, "kd_ratio": 0.5, "learning_rate": 5.51524151301297e-08, "loss": 1.0001527070999146, "loss/kd": 1.7117935419082642, "loss/lm": 0.28851181268692017, "step": 4713 }, { "epoch": 0.9677684253746663, "grad_norm": 0.9162465633728957, "kd_ratio": 0.5, "learning_rate": 5.445712489678179e-08, "loss": 0.9528664946556091, "loss/kd": 1.535420536994934, "loss/lm": 0.3703124225139618, "step": 4714 }, { "epoch": 0.967973722028331, "grad_norm": 0.8171441712138424, "kd_ratio": 0.5, "learning_rate": 5.376623320241914e-08, "loss": 0.9962038993835449, "loss/kd": 1.5605765581130981, "loss/lm": 0.4318312704563141, "step": 4715 }, { "epoch": 0.9681790186819955, "grad_norm": 0.8750817371967583, "kd_ratio": 0.5, "learning_rate": 5.307974035259511e-08, "loss": 0.9758539199829102, "loss/kd": 1.605928897857666, "loss/lm": 0.3457788825035095, "step": 4716 }, { "epoch": 0.96838431533566, "grad_norm": 0.832784535984127, "kd_ratio": 0.5, "learning_rate": 5.2397646650921285e-08, "loss": 1.102033257484436, "loss/kd": 1.8846968412399292, "loss/lm": 0.31936970353126526, "step": 4717 }, { "epoch": 0.9685896119893246, "grad_norm": 0.8048706778908309, "kd_ratio": 0.5, "learning_rate": 5.17199523990608e-08, "loss": 1.1103315353393555, "loss/kd": 1.906145453453064, "loss/lm": 0.3145175874233246, "step": 4718 }, { "epoch": 0.9687949086429891, "grad_norm": 0.838602542902813, "kd_ratio": 0.5, "learning_rate": 5.10466578967328e-08, "loss": 1.087479591369629, "loss/kd": 1.8762450218200684, "loss/lm": 0.29871419072151184, "step": 4719 }, { "epoch": 0.9690002052966536, "grad_norm": 0.9036938109760335, "kd_ratio": 0.5, "learning_rate": 5.0377763441710195e-08, "loss": 1.1088017225265503, "loss/kd": 1.9283928871154785, "loss/lm": 0.28921058773994446, "step": 4720 }, { "epoch": 0.9692055019503182, "grad_norm": 1.1856398889167161, "kd_ratio": 0.5, "learning_rate": 4.971326932981968e-08, "loss": 0.9957433938980103, "loss/kd": 1.6590825319290161, "loss/lm": 0.3324042558670044, "step": 4721 }, { "epoch": 0.9694107986039827, "grad_norm": 0.8256792074596894, "kd_ratio": 0.5, "learning_rate": 4.905317585494285e-08, "loss": 0.8731489181518555, "loss/kd": 1.4497205018997192, "loss/lm": 0.2965773642063141, "step": 4722 }, { "epoch": 0.9696160952576472, "grad_norm": 0.7976894854764441, "kd_ratio": 0.5, "learning_rate": 4.8397483309011726e-08, "loss": 1.25389564037323, "loss/kd": 2.2074813842773438, "loss/lm": 0.3003098666667938, "step": 4723 }, { "epoch": 0.9698213919113119, "grad_norm": 0.818954163290232, "kd_ratio": 0.5, "learning_rate": 4.7746191982016575e-08, "loss": 1.0628284215927124, "loss/kd": 1.838167428970337, "loss/lm": 0.2874893844127655, "step": 4724 }, { "epoch": 0.9700266885649764, "grad_norm": 0.9436755580125256, "kd_ratio": 0.5, "learning_rate": 4.709930216199921e-08, "loss": 1.35239577293396, "loss/kd": 2.3692803382873535, "loss/lm": 0.33551132678985596, "step": 4725 }, { "epoch": 0.970231985218641, "grad_norm": 0.8566316601089603, "kd_ratio": 0.5, "learning_rate": 4.645681413505299e-08, "loss": 0.9593340754508972, "loss/kd": 1.5859813690185547, "loss/lm": 0.33268675208091736, "step": 4726 }, { "epoch": 0.9704372818723055, "grad_norm": 0.8877390502118495, "kd_ratio": 0.5, "learning_rate": 4.581872818532729e-08, "loss": 1.1390279531478882, "loss/kd": 1.9319820404052734, "loss/lm": 0.3460739552974701, "step": 4727 }, { "epoch": 0.97064257852597, "grad_norm": 0.7985999971138105, "kd_ratio": 0.5, "learning_rate": 4.518504459502304e-08, "loss": 1.2010101079940796, "loss/kd": 1.9683107137680054, "loss/lm": 0.4337095320224762, "step": 4728 }, { "epoch": 0.9708478751796346, "grad_norm": 0.8380380478064164, "kd_ratio": 0.5, "learning_rate": 4.455576364439496e-08, "loss": 1.003348708152771, "loss/kd": 1.7169952392578125, "loss/lm": 0.28970226645469666, "step": 4729 }, { "epoch": 0.9710531718332991, "grad_norm": 0.8458176860941747, "kd_ratio": 0.5, "learning_rate": 4.3930885611750406e-08, "loss": 1.0678138732910156, "loss/kd": 1.7800660133361816, "loss/lm": 0.35556185245513916, "step": 4730 }, { "epoch": 0.9712584684869636, "grad_norm": 0.9567309980291283, "kd_ratio": 0.5, "learning_rate": 4.331041077344944e-08, "loss": 0.9207139015197754, "loss/kd": 1.4915512800216675, "loss/lm": 0.3498764634132385, "step": 4731 }, { "epoch": 0.9714637651406282, "grad_norm": 0.8949164352440412, "kd_ratio": 0.5, "learning_rate": 4.269433940390366e-08, "loss": 0.9449392557144165, "loss/kd": 1.6224550008773804, "loss/lm": 0.267423540353775, "step": 4732 }, { "epoch": 0.9716690617942928, "grad_norm": 0.9924077717430064, "kd_ratio": 0.5, "learning_rate": 4.208267177557846e-08, "loss": 0.9939721822738647, "loss/kd": 1.6658213138580322, "loss/lm": 0.32212311029434204, "step": 4733 }, { "epoch": 0.9718743584479573, "grad_norm": 1.2472790925183752, "kd_ratio": 0.5, "learning_rate": 4.147540815899298e-08, "loss": 0.9763468503952026, "loss/kd": 1.5622239112854004, "loss/lm": 0.3904697299003601, "step": 4734 }, { "epoch": 0.9720796551016219, "grad_norm": 0.8064495223966714, "kd_ratio": 0.5, "learning_rate": 4.087254882271574e-08, "loss": 0.9541147947311401, "loss/kd": 1.6026015281677246, "loss/lm": 0.30562809109687805, "step": 4735 }, { "epoch": 0.9722849517552864, "grad_norm": 0.8691345651222125, "kd_ratio": 0.5, "learning_rate": 4.027409403336901e-08, "loss": 0.988244891166687, "loss/kd": 1.6543724536895752, "loss/lm": 0.3221173584461212, "step": 4736 }, { "epoch": 0.9724902484089509, "grad_norm": 0.8399099174957873, "kd_ratio": 0.5, "learning_rate": 3.968004405562664e-08, "loss": 1.020691990852356, "loss/kd": 1.6617348194122314, "loss/lm": 0.3796491026878357, "step": 4737 }, { "epoch": 0.9726955450626155, "grad_norm": 1.0999932025804702, "kd_ratio": 0.5, "learning_rate": 3.909039915221513e-08, "loss": 1.2088876962661743, "loss/kd": 2.0902247428894043, "loss/lm": 0.3275506794452667, "step": 4738 }, { "epoch": 0.97290084171628, "grad_norm": 0.8323543281651946, "kd_ratio": 0.5, "learning_rate": 3.8505159583911434e-08, "loss": 0.9285510778427124, "loss/kd": 1.534136176109314, "loss/lm": 0.3229660391807556, "step": 4739 }, { "epoch": 0.9731061383699445, "grad_norm": 0.8101288410962545, "kd_ratio": 0.5, "learning_rate": 3.7924325609545175e-08, "loss": 1.0637840032577515, "loss/kd": 1.8440450429916382, "loss/lm": 0.2835228741168976, "step": 4740 }, { "epoch": 0.9733114350236091, "grad_norm": 1.3024188536094994, "kd_ratio": 0.5, "learning_rate": 3.734789748599754e-08, "loss": 1.0343303680419922, "loss/kd": 1.7384822368621826, "loss/lm": 0.3301784098148346, "step": 4741 }, { "epoch": 0.9735167316772737, "grad_norm": 1.0349917905876334, "kd_ratio": 0.5, "learning_rate": 3.6775875468201274e-08, "loss": 0.9503213167190552, "loss/kd": 1.6126741170883179, "loss/lm": 0.2879684865474701, "step": 4742 }, { "epoch": 0.9737220283309382, "grad_norm": 0.884498315697748, "kd_ratio": 0.5, "learning_rate": 3.6208259809139554e-08, "loss": 1.0395563840866089, "loss/kd": 1.7387874126434326, "loss/lm": 0.34032532572746277, "step": 4743 }, { "epoch": 0.9739273249846028, "grad_norm": 0.8289884495753632, "kd_ratio": 0.5, "learning_rate": 3.5645050759847146e-08, "loss": 1.2015252113342285, "loss/kd": 2.1163763999938965, "loss/lm": 0.28667405247688293, "step": 4744 }, { "epoch": 0.9741326216382673, "grad_norm": 0.8371862188727054, "kd_ratio": 0.5, "learning_rate": 3.508624856941034e-08, "loss": 1.0804417133331299, "loss/kd": 1.7912973165512085, "loss/lm": 0.3695860207080841, "step": 4745 }, { "epoch": 0.9743379182919318, "grad_norm": 0.9253894141650701, "kd_ratio": 0.5, "learning_rate": 3.453185348496702e-08, "loss": 1.21138334274292, "loss/kd": 2.115152597427368, "loss/lm": 0.3076140582561493, "step": 4746 }, { "epoch": 0.9745432149455964, "grad_norm": 0.9058938768148165, "kd_ratio": 0.5, "learning_rate": 3.3981865751705477e-08, "loss": 0.9861871004104614, "loss/kd": 1.6433292627334595, "loss/lm": 0.3290448784828186, "step": 4747 }, { "epoch": 0.9747485115992609, "grad_norm": 0.8530237340639617, "kd_ratio": 0.5, "learning_rate": 3.343628561286449e-08, "loss": 1.1848324537277222, "loss/kd": 2.035858392715454, "loss/lm": 0.33380651473999023, "step": 4748 }, { "epoch": 0.9749538082529254, "grad_norm": 0.8043848227226474, "kd_ratio": 0.5, "learning_rate": 3.2895113309732166e-08, "loss": 1.0717159509658813, "loss/kd": 1.7537556886672974, "loss/lm": 0.3896762728691101, "step": 4749 }, { "epoch": 0.97515910490659, "grad_norm": 1.089384871949707, "kd_ratio": 0.5, "learning_rate": 3.235834908164925e-08, "loss": 1.077366828918457, "loss/kd": 1.8516772985458374, "loss/lm": 0.30305641889572144, "step": 4750 }, { "epoch": 0.9753644015602546, "grad_norm": 0.8515790456064634, "kd_ratio": 0.5, "learning_rate": 3.182599316600699e-08, "loss": 1.1838557720184326, "loss/kd": 2.054643154144287, "loss/lm": 0.3130685091018677, "step": 4751 }, { "epoch": 0.9755696982139191, "grad_norm": 0.8342326023059199, "kd_ratio": 0.5, "learning_rate": 3.129804579824591e-08, "loss": 0.930014431476593, "loss/kd": 1.533948302268982, "loss/lm": 0.3260805606842041, "step": 4752 }, { "epoch": 0.9757749948675837, "grad_norm": 0.9509446694075038, "kd_ratio": 0.5, "learning_rate": 3.077450721185704e-08, "loss": 1.041284441947937, "loss/kd": 1.755308747291565, "loss/lm": 0.3272600769996643, "step": 4753 }, { "epoch": 0.9759802915212482, "grad_norm": 0.9490024386631766, "kd_ratio": 0.5, "learning_rate": 3.025537763838293e-08, "loss": 0.8350244760513306, "loss/kd": 1.3428527116775513, "loss/lm": 0.32719630002975464, "step": 4754 }, { "epoch": 0.9761855881749127, "grad_norm": 0.8359790814706013, "kd_ratio": 0.5, "learning_rate": 2.974065730741327e-08, "loss": 0.984859824180603, "loss/kd": 1.6429493427276611, "loss/lm": 0.32677027583122253, "step": 4755 }, { "epoch": 0.9763908848285773, "grad_norm": 0.803558069181935, "kd_ratio": 0.5, "learning_rate": 2.9230346446591506e-08, "loss": 0.8129640817642212, "loss/kd": 1.3405053615570068, "loss/lm": 0.28542277216911316, "step": 4756 }, { "epoch": 0.9765961814822418, "grad_norm": 0.9636430468177679, "kd_ratio": 0.5, "learning_rate": 2.8724445281607117e-08, "loss": 1.091847538948059, "loss/kd": 1.8590823411941528, "loss/lm": 0.32461270689964294, "step": 4757 }, { "epoch": 0.9768014781359063, "grad_norm": 0.7987096323645345, "kd_ratio": 0.5, "learning_rate": 2.8222954036201122e-08, "loss": 0.9813281297683716, "loss/kd": 1.679232120513916, "loss/lm": 0.2834241986274719, "step": 4758 }, { "epoch": 0.9770067747895709, "grad_norm": 0.8171459609616865, "kd_ratio": 0.5, "learning_rate": 2.772587293216611e-08, "loss": 1.044784426689148, "loss/kd": 1.7993810176849365, "loss/lm": 0.29018792510032654, "step": 4759 }, { "epoch": 0.9772120714432355, "grad_norm": 0.8834598329195913, "kd_ratio": 0.5, "learning_rate": 2.7233202189340667e-08, "loss": 0.9800726175308228, "loss/kd": 1.6303623914718628, "loss/lm": 0.3297829031944275, "step": 4760 }, { "epoch": 0.9774173680969, "grad_norm": 0.8537023099662464, "kd_ratio": 0.5, "learning_rate": 2.674494202561384e-08, "loss": 1.0675806999206543, "loss/kd": 1.8008689880371094, "loss/lm": 0.3342924416065216, "step": 4761 }, { "epoch": 0.9776226647505646, "grad_norm": 0.8754696639781716, "kd_ratio": 0.5, "learning_rate": 2.6261092656927333e-08, "loss": 1.3589403629302979, "loss/kd": 2.441321611404419, "loss/lm": 0.2765590250492096, "step": 4762 }, { "epoch": 0.9778279614042291, "grad_norm": 0.8796875244701066, "kd_ratio": 0.5, "learning_rate": 2.5781654297267756e-08, "loss": 0.9533047676086426, "loss/kd": 1.6010749340057373, "loss/lm": 0.30553460121154785, "step": 4763 }, { "epoch": 0.9780332580578937, "grad_norm": 0.8165704690081833, "kd_ratio": 0.5, "learning_rate": 2.530662715867216e-08, "loss": 1.1747709512710571, "loss/kd": 1.9917514324188232, "loss/lm": 0.3577905595302582, "step": 4764 }, { "epoch": 0.9782385547115582, "grad_norm": 0.8292781416701966, "kd_ratio": 0.5, "learning_rate": 2.4836011451228048e-08, "loss": 0.9110015034675598, "loss/kd": 1.5004971027374268, "loss/lm": 0.32150590419769287, "step": 4765 }, { "epoch": 0.9784438513652227, "grad_norm": 0.799604251699435, "kd_ratio": 0.5, "learning_rate": 2.4369807383071154e-08, "loss": 1.0095995664596558, "loss/kd": 1.6843024492263794, "loss/lm": 0.3348967730998993, "step": 4766 }, { "epoch": 0.9786491480188872, "grad_norm": 0.8136894877397892, "kd_ratio": 0.5, "learning_rate": 2.3908015160385433e-08, "loss": 1.083507776260376, "loss/kd": 1.8726478815078735, "loss/lm": 0.29436758160591125, "step": 4767 }, { "epoch": 0.9788544446725518, "grad_norm": 0.7966289071894987, "kd_ratio": 0.5, "learning_rate": 2.3450634987404185e-08, "loss": 0.7721943259239197, "loss/kd": 1.30548095703125, "loss/lm": 0.23890773952007294, "step": 4768 }, { "epoch": 0.9790597413262164, "grad_norm": 0.8341783601785258, "kd_ratio": 0.5, "learning_rate": 2.2997667066410045e-08, "loss": 0.9342132806777954, "loss/kd": 1.5365902185440063, "loss/lm": 0.3318363130092621, "step": 4769 }, { "epoch": 0.979265037979881, "grad_norm": 0.8041693531332242, "kd_ratio": 0.5, "learning_rate": 2.2549111597732766e-08, "loss": 1.0006085634231567, "loss/kd": 1.6911312341690063, "loss/lm": 0.3100859522819519, "step": 4770 }, { "epoch": 0.9794703346335455, "grad_norm": 0.8427743736338955, "kd_ratio": 0.5, "learning_rate": 2.2104968779752546e-08, "loss": 1.0602704286575317, "loss/kd": 1.7525928020477295, "loss/lm": 0.3679479956626892, "step": 4771 }, { "epoch": 0.97967563128721, "grad_norm": 0.8129911290699253, "kd_ratio": 0.5, "learning_rate": 2.1665238808895595e-08, "loss": 1.0601104497909546, "loss/kd": 1.7882606983184814, "loss/lm": 0.3319602608680725, "step": 4772 }, { "epoch": 0.9798809279408746, "grad_norm": 1.1144990078705321, "kd_ratio": 0.5, "learning_rate": 2.1229921879639683e-08, "loss": 0.9327167868614197, "loss/kd": 1.538469910621643, "loss/lm": 0.3269636631011963, "step": 4773 }, { "epoch": 0.9800862245945391, "grad_norm": 0.8285168886556916, "kd_ratio": 0.5, "learning_rate": 2.079901818450858e-08, "loss": 1.0240298509597778, "loss/kd": 1.7058154344558716, "loss/lm": 0.34224429726600647, "step": 4774 }, { "epoch": 0.9802915212482036, "grad_norm": 0.7999528687339577, "kd_ratio": 0.5, "learning_rate": 2.037252791407318e-08, "loss": 0.8431475162506104, "loss/kd": 1.3749366998672485, "loss/lm": 0.3113582730293274, "step": 4775 }, { "epoch": 0.9804968179018682, "grad_norm": 0.8212778005148911, "kd_ratio": 0.5, "learning_rate": 1.9950451256957048e-08, "loss": 1.0479207038879395, "loss/kd": 1.7937580347061157, "loss/lm": 0.30208343267440796, "step": 4776 }, { "epoch": 0.9807021145555328, "grad_norm": 0.8235627558273638, "kd_ratio": 0.5, "learning_rate": 1.953278839982531e-08, "loss": 1.2249191999435425, "loss/kd": 2.130354881286621, "loss/lm": 0.31948351860046387, "step": 4777 }, { "epoch": 0.9809074112091973, "grad_norm": 0.9304665590424442, "kd_ratio": 0.5, "learning_rate": 1.911953952739687e-08, "loss": 1.0791832208633423, "loss/kd": 1.8714646100997925, "loss/lm": 0.28690189123153687, "step": 4778 }, { "epoch": 0.9811127078628619, "grad_norm": 0.8799240440971003, "kd_ratio": 0.5, "learning_rate": 1.8710704822435534e-08, "loss": 1.1064345836639404, "loss/kd": 1.8725978136062622, "loss/lm": 0.3402712941169739, "step": 4779 }, { "epoch": 0.9813180045165264, "grad_norm": 0.8052452270349413, "kd_ratio": 0.5, "learning_rate": 1.830628446575333e-08, "loss": 0.857038140296936, "loss/kd": 1.4508662223815918, "loss/lm": 0.26321008801460266, "step": 4780 }, { "epoch": 0.9815233011701909, "grad_norm": 0.926433445710725, "kd_ratio": 0.5, "learning_rate": 1.7906278636210527e-08, "loss": 0.9431248307228088, "loss/kd": 1.5430244207382202, "loss/lm": 0.34322524070739746, "step": 4781 }, { "epoch": 0.9817285978238555, "grad_norm": 0.8567024414566577, "kd_ratio": 0.5, "learning_rate": 1.7510687510714495e-08, "loss": 0.880621075630188, "loss/kd": 1.3995612859725952, "loss/lm": 0.361680805683136, "step": 4782 }, { "epoch": 0.98193389447752, "grad_norm": 0.9177794806477642, "kd_ratio": 0.5, "learning_rate": 1.7119511264220844e-08, "loss": 1.0671944618225098, "loss/kd": 1.7337009906768799, "loss/lm": 0.40068790316581726, "step": 4783 }, { "epoch": 0.9821391911311845, "grad_norm": 0.8931868923144834, "kd_ratio": 0.5, "learning_rate": 1.6732750069730076e-08, "loss": 0.9074883460998535, "loss/kd": 1.5197091102600098, "loss/lm": 0.29526761174201965, "step": 4784 }, { "epoch": 0.9823444877848491, "grad_norm": 0.9270882888529768, "kd_ratio": 0.5, "learning_rate": 1.6350404098294247e-08, "loss": 0.9435710906982422, "loss/kd": 1.613830804824829, "loss/lm": 0.2733113467693329, "step": 4785 }, { "epoch": 0.9825497844385137, "grad_norm": 0.9292182566951995, "kd_ratio": 0.5, "learning_rate": 1.5972473519009212e-08, "loss": 0.9435736536979675, "loss/kd": 1.6163361072540283, "loss/lm": 0.2708112299442291, "step": 4786 }, { "epoch": 0.9827550810921782, "grad_norm": 0.7960369444691455, "kd_ratio": 0.5, "learning_rate": 1.5598958499021265e-08, "loss": 1.315967321395874, "loss/kd": 2.3046762943267822, "loss/lm": 0.3272583484649658, "step": 4787 }, { "epoch": 0.9829603777458428, "grad_norm": 0.8375907510795, "kd_ratio": 0.5, "learning_rate": 1.5229859203519382e-08, "loss": 0.9696639776229858, "loss/kd": 1.5739431381225586, "loss/lm": 0.3653848171234131, "step": 4788 }, { "epoch": 0.9831656743995073, "grad_norm": 0.8703917271356703, "kd_ratio": 0.5, "learning_rate": 1.486517579574409e-08, "loss": 0.9674273729324341, "loss/kd": 1.6057331562042236, "loss/lm": 0.32912153005599976, "step": 4789 }, { "epoch": 0.9833709710531718, "grad_norm": 0.8265239436840197, "kd_ratio": 0.5, "learning_rate": 1.4504908436980825e-08, "loss": 1.0165339708328247, "loss/kd": 1.7357748746871948, "loss/lm": 0.2972930073738098, "step": 4790 }, { "epoch": 0.9835762677068364, "grad_norm": 0.7793614369695124, "kd_ratio": 0.5, "learning_rate": 1.4149057286562128e-08, "loss": 0.8672415018081665, "loss/kd": 1.4633594751358032, "loss/lm": 0.2711235284805298, "step": 4791 }, { "epoch": 0.9837815643605009, "grad_norm": 0.8418520939446099, "kd_ratio": 0.5, "learning_rate": 1.3797622501867669e-08, "loss": 1.0647809505462646, "loss/kd": 1.780631422996521, "loss/lm": 0.3489305377006531, "step": 4792 }, { "epoch": 0.9839868610141654, "grad_norm": 0.8475279090222979, "kd_ratio": 0.5, "learning_rate": 1.3450604238323118e-08, "loss": 1.049854040145874, "loss/kd": 1.7578980922698975, "loss/lm": 0.34181007742881775, "step": 4793 }, { "epoch": 0.98419215766783, "grad_norm": 0.8587115041637259, "kd_ratio": 0.5, "learning_rate": 1.3108002649403484e-08, "loss": 1.008234977722168, "loss/kd": 1.7090754508972168, "loss/lm": 0.30739453434944153, "step": 4794 }, { "epoch": 0.9843974543214946, "grad_norm": 0.8155790362793895, "kd_ratio": 0.5, "learning_rate": 1.2769817886626456e-08, "loss": 0.8566333651542664, "loss/kd": 1.4238382577896118, "loss/lm": 0.2894284427165985, "step": 4795 }, { "epoch": 0.9846027509751591, "grad_norm": 1.012605889980009, "kd_ratio": 0.5, "learning_rate": 1.243605009955906e-08, "loss": 1.044975996017456, "loss/kd": 1.7857781648635864, "loss/lm": 0.3041738271713257, "step": 4796 }, { "epoch": 0.9848080476288237, "grad_norm": 0.8568341624846356, "kd_ratio": 0.5, "learning_rate": 1.2106699435814329e-08, "loss": 0.9842793941497803, "loss/kd": 1.6078704595565796, "loss/lm": 0.36068832874298096, "step": 4797 }, { "epoch": 0.9850133442824882, "grad_norm": 1.153860556595216, "kd_ratio": 0.5, "learning_rate": 1.1781766041052412e-08, "loss": 1.1718478202819824, "loss/kd": 1.87826406955719, "loss/lm": 0.46543145179748535, "step": 4798 }, { "epoch": 0.9852186409361527, "grad_norm": 0.9200311904271632, "kd_ratio": 0.5, "learning_rate": 1.1461250058977248e-08, "loss": 0.8847458362579346, "loss/kd": 1.4793623685836792, "loss/lm": 0.2901293635368347, "step": 4799 }, { "epoch": 0.9854239375898173, "grad_norm": 0.8740064549546654, "kd_ratio": 0.5, "learning_rate": 1.114515163134322e-08, "loss": 1.0064494609832764, "loss/kd": 1.7252087593078613, "loss/lm": 0.28769025206565857, "step": 4800 }, { "epoch": 0.9856292342434818, "grad_norm": 0.8253614320340373, "kd_ratio": 0.5, "learning_rate": 1.0833470897947396e-08, "loss": 0.8789533972740173, "loss/kd": 1.4801932573318481, "loss/lm": 0.2777135372161865, "step": 4801 }, { "epoch": 0.9858345308971463, "grad_norm": 0.8559176479978193, "kd_ratio": 0.5, "learning_rate": 1.0526207996633952e-08, "loss": 0.9291260242462158, "loss/kd": 1.4733198881149292, "loss/lm": 0.38493216037750244, "step": 4802 }, { "epoch": 0.9860398275508109, "grad_norm": 0.8374786832142898, "kd_ratio": 0.5, "learning_rate": 1.02233630632953e-08, "loss": 1.0641664266586304, "loss/kd": 1.802074909210205, "loss/lm": 0.32625794410705566, "step": 4803 }, { "epoch": 0.9862451242044755, "grad_norm": 0.8285675700950148, "kd_ratio": 0.5, "learning_rate": 9.924936231866522e-09, "loss": 1.023504614830017, "loss/kd": 1.6764771938323975, "loss/lm": 0.37053197622299194, "step": 4804 }, { "epoch": 0.98645042085814, "grad_norm": 0.8885352193300062, "kd_ratio": 0.5, "learning_rate": 9.630927634333154e-09, "loss": 0.8845916390419006, "loss/kd": 1.3986464738845825, "loss/lm": 0.37053683400154114, "step": 4805 }, { "epoch": 0.9866557175118046, "grad_norm": 0.9274191405675425, "kd_ratio": 0.5, "learning_rate": 9.341337400721185e-09, "loss": 1.0152709484100342, "loss/kd": 1.7207192182540894, "loss/lm": 0.30982279777526855, "step": 4806 }, { "epoch": 0.9868610141654691, "grad_norm": 0.9128908770647158, "kd_ratio": 0.5, "learning_rate": 9.056165659107053e-09, "loss": 1.154263973236084, "loss/kd": 1.9933830499649048, "loss/lm": 0.3151448667049408, "step": 4807 }, { "epoch": 0.9870663108191337, "grad_norm": 0.7825640542515756, "kd_ratio": 0.5, "learning_rate": 8.775412535610984e-09, "loss": 1.0737138986587524, "loss/kd": 1.849158525466919, "loss/lm": 0.2982691824436188, "step": 4808 }, { "epoch": 0.9872716074727982, "grad_norm": 0.829169702423084, "kd_ratio": 0.5, "learning_rate": 8.49907815440032e-09, "loss": 1.034995436668396, "loss/kd": 1.7925220727920532, "loss/lm": 0.27746880054473877, "step": 4809 }, { "epoch": 0.9874769041264627, "grad_norm": 0.9445370679997463, "kd_ratio": 0.5, "learning_rate": 8.227162637686192e-09, "loss": 1.04630708694458, "loss/kd": 1.8072835206985474, "loss/lm": 0.28533077239990234, "step": 4810 }, { "epoch": 0.9876822007801273, "grad_norm": 0.8708226724774795, "kd_ratio": 0.5, "learning_rate": 7.959666105727959e-09, "loss": 1.0693788528442383, "loss/kd": 1.8084145784378052, "loss/lm": 0.3303430378437042, "step": 4811 }, { "epoch": 0.9878874974337918, "grad_norm": 0.8131555589871513, "kd_ratio": 0.5, "learning_rate": 7.696588676827654e-09, "loss": 1.3125994205474854, "loss/kd": 2.2585489749908447, "loss/lm": 0.3666499853134155, "step": 4812 }, { "epoch": 0.9880927940874564, "grad_norm": 0.8555694835520113, "kd_ratio": 0.5, "learning_rate": 7.437930467335541e-09, "loss": 0.9512534141540527, "loss/kd": 1.5700302124023438, "loss/lm": 0.3324766755104065, "step": 4813 }, { "epoch": 0.988298090741121, "grad_norm": 0.88942284930564, "kd_ratio": 0.5, "learning_rate": 7.1836915916456695e-09, "loss": 0.9587860703468323, "loss/kd": 1.6106491088867188, "loss/lm": 0.3069230020046234, "step": 4814 }, { "epoch": 0.9885033873947855, "grad_norm": 0.8157712757226925, "kd_ratio": 0.5, "learning_rate": 6.933872162199207e-09, "loss": 0.9706693887710571, "loss/kd": 1.6252715587615967, "loss/lm": 0.31606727838516235, "step": 4815 }, { "epoch": 0.98870868404845, "grad_norm": 0.8345059271105753, "kd_ratio": 0.5, "learning_rate": 6.6884722894822174e-09, "loss": 0.9286835193634033, "loss/kd": 1.5895135402679443, "loss/lm": 0.26785343885421753, "step": 4816 }, { "epoch": 0.9889139807021146, "grad_norm": 0.9290620360614518, "kd_ratio": 0.5, "learning_rate": 6.447492082024553e-09, "loss": 1.2695156335830688, "loss/kd": 2.232842445373535, "loss/lm": 0.30618882179260254, "step": 4817 }, { "epoch": 0.9891192773557791, "grad_norm": 0.9475975851641751, "kd_ratio": 0.5, "learning_rate": 6.2109316464031845e-09, "loss": 1.2218068838119507, "loss/kd": 2.1057915687561035, "loss/lm": 0.33782219886779785, "step": 4818 }, { "epoch": 0.9893245740094436, "grad_norm": 0.840317686509917, "kd_ratio": 0.5, "learning_rate": 5.978791087239977e-09, "loss": 0.9322194457054138, "loss/kd": 1.497365951538086, "loss/lm": 0.3670729696750641, "step": 4819 }, { "epoch": 0.9895298706631082, "grad_norm": 0.9535028384328361, "kd_ratio": 0.5, "learning_rate": 5.751070507201695e-09, "loss": 0.961465060710907, "loss/kd": 1.6125025749206543, "loss/lm": 0.3104275166988373, "step": 4820 }, { "epoch": 0.9897351673167727, "grad_norm": 0.8727921410571554, "kd_ratio": 0.5, "learning_rate": 5.52777000700111e-09, "loss": 1.3022593259811401, "loss/kd": 2.2653050422668457, "loss/lm": 0.33921360969543457, "step": 4821 }, { "epoch": 0.9899404639704373, "grad_norm": 0.8409385688801928, "kd_ratio": 0.5, "learning_rate": 5.308889685394781e-09, "loss": 1.1750850677490234, "loss/kd": 1.948216199874878, "loss/lm": 0.4019539952278137, "step": 4822 }, { "epoch": 0.9901457606241019, "grad_norm": 1.0479787963629184, "kd_ratio": 0.5, "learning_rate": 5.0944296391863825e-09, "loss": 0.9803471565246582, "loss/kd": 1.692975401878357, "loss/lm": 0.26771897077560425, "step": 4823 }, { "epoch": 0.9903510572777664, "grad_norm": 0.9243044269669306, "kd_ratio": 0.5, "learning_rate": 4.884389963222269e-09, "loss": 1.069244623184204, "loss/kd": 1.8716527223587036, "loss/lm": 0.2668364346027374, "step": 4824 }, { "epoch": 0.9905563539314309, "grad_norm": 0.815752367800357, "kd_ratio": 0.5, "learning_rate": 4.678770750395912e-09, "loss": 0.7942721843719482, "loss/kd": 1.234994888305664, "loss/lm": 0.35354942083358765, "step": 4825 }, { "epoch": 0.9907616505850955, "grad_norm": 0.9674719166148722, "kd_ratio": 0.5, "learning_rate": 4.4775720916445665e-09, "loss": 1.2812036275863647, "loss/kd": 2.2197391986846924, "loss/lm": 0.3426680266857147, "step": 4826 }, { "epoch": 0.99096694723876, "grad_norm": 0.7746749055649507, "kd_ratio": 0.5, "learning_rate": 4.2807940759515e-09, "loss": 1.944779872894287, "loss/kd": 3.5830252170562744, "loss/lm": 0.30653461813926697, "step": 4827 }, { "epoch": 0.9911722438924245, "grad_norm": 0.7604471495206546, "kd_ratio": 0.5, "learning_rate": 4.088436790342653e-09, "loss": 1.211806297302246, "loss/kd": 2.1047568321228027, "loss/lm": 0.31885573267936707, "step": 4828 }, { "epoch": 0.9913775405460891, "grad_norm": 0.8031530820172704, "kd_ratio": 0.5, "learning_rate": 3.900500319892197e-09, "loss": 0.9910537600517273, "loss/kd": 1.715509057044983, "loss/lm": 0.2665984332561493, "step": 4829 }, { "epoch": 0.9915828371997536, "grad_norm": 0.8482982447100125, "kd_ratio": 0.5, "learning_rate": 3.7169847477169786e-09, "loss": 1.0476124286651611, "loss/kd": 1.770291805267334, "loss/lm": 0.32493311166763306, "step": 4830 }, { "epoch": 0.9917881338534182, "grad_norm": 0.877382641543625, "kd_ratio": 0.5, "learning_rate": 3.53789015497763e-09, "loss": 0.9270931482315063, "loss/kd": 1.4868313074111938, "loss/lm": 0.36735498905181885, "step": 4831 }, { "epoch": 0.9919934305070828, "grad_norm": 0.8639716676041356, "kd_ratio": 0.5, "learning_rate": 3.363216620883014e-09, "loss": 1.3883774280548096, "loss/kd": 2.412484884262085, "loss/lm": 0.36427000164985657, "step": 4832 }, { "epoch": 0.9921987271607473, "grad_norm": 0.7635550782031414, "kd_ratio": 0.5, "learning_rate": 3.1929642226824486e-09, "loss": 1.0293041467666626, "loss/kd": 1.711250901222229, "loss/lm": 0.34735745191574097, "step": 4833 }, { "epoch": 0.9924040238144118, "grad_norm": 0.9814000935861532, "kd_ratio": 0.5, "learning_rate": 3.02713303567459e-09, "loss": 0.9582639932632446, "loss/kd": 1.6247502565383911, "loss/lm": 0.29177775979042053, "step": 4834 }, { "epoch": 0.9926093204680764, "grad_norm": 0.9841910326941479, "kd_ratio": 0.5, "learning_rate": 2.8657231331985503e-09, "loss": 1.162184476852417, "loss/kd": 2.0154597759246826, "loss/lm": 0.308909147977829, "step": 4835 }, { "epoch": 0.9928146171217409, "grad_norm": 0.8961744791109567, "kd_ratio": 0.5, "learning_rate": 2.7087345866394497e-09, "loss": 0.863104522228241, "loss/kd": 1.4195096492767334, "loss/lm": 0.30669939517974854, "step": 4836 }, { "epoch": 0.9930199137754054, "grad_norm": 0.9254408230613614, "kd_ratio": 0.5, "learning_rate": 2.5561674654295264e-09, "loss": 1.0318922996520996, "loss/kd": 1.7167433500289917, "loss/lm": 0.34704113006591797, "step": 4837 }, { "epoch": 0.99322521042907, "grad_norm": 0.78938584305107, "kd_ratio": 0.5, "learning_rate": 2.4080218370414744e-09, "loss": 1.3456652164459229, "loss/kd": 2.3097805976867676, "loss/lm": 0.38154980540275574, "step": 4838 }, { "epoch": 0.9934305070827345, "grad_norm": 0.8787188930521339, "kd_ratio": 0.5, "learning_rate": 2.264297766995105e-09, "loss": 1.3834589719772339, "loss/kd": 2.463092565536499, "loss/lm": 0.30382540822029114, "step": 4839 }, { "epoch": 0.9936358037363991, "grad_norm": 0.8311539373784937, "kd_ratio": 0.5, "learning_rate": 2.1249953188551277e-09, "loss": 0.8990634083747864, "loss/kd": 1.5061266422271729, "loss/lm": 0.2920002043247223, "step": 4840 }, { "epoch": 0.9938411003900637, "grad_norm": 0.8389043896422805, "kd_ratio": 0.5, "learning_rate": 1.990114554228928e-09, "loss": 1.4044767618179321, "loss/kd": 2.438465118408203, "loss/lm": 0.37048834562301636, "step": 4841 }, { "epoch": 0.9940463970437282, "grad_norm": 0.9609735497031923, "kd_ratio": 0.5, "learning_rate": 1.8596555327687893e-09, "loss": 0.9023053646087646, "loss/kd": 1.3767313957214355, "loss/lm": 0.42787933349609375, "step": 4842 }, { "epoch": 0.9942516936973927, "grad_norm": 0.8465696418058982, "kd_ratio": 0.5, "learning_rate": 1.7336183121730022e-09, "loss": 0.9726619124412537, "loss/kd": 1.660654902458191, "loss/lm": 0.284668892621994, "step": 4843 }, { "epoch": 0.9944569903510573, "grad_norm": 0.8538618373450244, "kd_ratio": 0.5, "learning_rate": 1.6120029481814237e-09, "loss": 0.7908816933631897, "loss/kd": 1.2699445486068726, "loss/lm": 0.3118188679218292, "step": 4844 }, { "epoch": 0.9946622870047218, "grad_norm": 0.9283889245771673, "kd_ratio": 0.5, "learning_rate": 1.4948094945810287e-09, "loss": 1.0078264474868774, "loss/kd": 1.7319577932357788, "loss/lm": 0.2836950421333313, "step": 4845 }, { "epoch": 0.9948675836583863, "grad_norm": 0.8888444486375334, "kd_ratio": 0.5, "learning_rate": 1.3820380032025794e-09, "loss": 1.1162264347076416, "loss/kd": 1.8655527830123901, "loss/lm": 0.36690017580986023, "step": 4846 }, { "epoch": 0.9950728803120509, "grad_norm": 0.7980966332561684, "kd_ratio": 0.5, "learning_rate": 1.273688523919514e-09, "loss": 0.9835305213928223, "loss/kd": 1.663395881652832, "loss/lm": 0.3036652207374573, "step": 4847 }, { "epoch": 0.9952781769657154, "grad_norm": 0.8346966159116285, "kd_ratio": 0.5, "learning_rate": 1.169761104651279e-09, "loss": 0.9658477902412415, "loss/kd": 1.6049017906188965, "loss/lm": 0.3267937898635864, "step": 4848 }, { "epoch": 0.99548347361938, "grad_norm": 0.8367998885896603, "kd_ratio": 0.5, "learning_rate": 1.0702557913611078e-09, "loss": 1.3260319232940674, "loss/kd": 2.337968587875366, "loss/lm": 0.3140953481197357, "step": 4849 }, { "epoch": 0.9956887702730446, "grad_norm": 0.8223477388844098, "kd_ratio": 0.5, "learning_rate": 9.751726280560203e-10, "loss": 0.9877786636352539, "loss/kd": 1.6602106094360352, "loss/lm": 0.31534671783447266, "step": 4850 }, { "epoch": 0.9958940669267091, "grad_norm": 0.9116361515217702, "kd_ratio": 0.5, "learning_rate": 8.845116567879342e-10, "loss": 1.4944334030151367, "loss/kd": 2.6593430042266846, "loss/lm": 0.32952389121055603, "step": 4851 }, { "epoch": 0.9960993635803737, "grad_norm": 0.97230730154053, "kd_ratio": 0.5, "learning_rate": 7.982729176536641e-10, "loss": 0.890538215637207, "loss/kd": 1.4753236770629883, "loss/lm": 0.3057527542114258, "step": 4852 }, { "epoch": 0.9963046602340382, "grad_norm": 0.846826056701838, "kd_ratio": 0.5, "learning_rate": 7.164564487915915e-10, "loss": 0.9906265735626221, "loss/kd": 1.593322515487671, "loss/lm": 0.38793066143989563, "step": 4853 }, { "epoch": 0.9965099568877027, "grad_norm": 0.8222703588974725, "kd_ratio": 0.5, "learning_rate": 6.390622863872154e-10, "loss": 1.114810824394226, "loss/kd": 1.915810465812683, "loss/lm": 0.3138112723827362, "step": 4854 }, { "epoch": 0.9967152535413673, "grad_norm": 1.0377442053480923, "kd_ratio": 0.5, "learning_rate": 5.660904646698217e-10, "loss": 0.9216887950897217, "loss/kd": 1.5075750350952148, "loss/lm": 0.3358026146888733, "step": 4855 }, { "epoch": 0.9969205501950318, "grad_norm": 0.8015609067553932, "kd_ratio": 0.5, "learning_rate": 4.975410159102634e-10, "loss": 1.122183918952942, "loss/kd": 1.9124828577041626, "loss/lm": 0.3318849205970764, "step": 4856 }, { "epoch": 0.9971258468486963, "grad_norm": 0.7904517985201446, "kd_ratio": 0.5, "learning_rate": 4.3341397042651057e-10, "loss": 1.124496579170227, "loss/kd": 1.8730432987213135, "loss/lm": 0.37594982981681824, "step": 4857 }, { "epoch": 0.997331143502361, "grad_norm": 0.9422683849488938, "kd_ratio": 0.5, "learning_rate": 3.737093565792105e-10, "loss": 0.8774573802947998, "loss/kd": 1.4287983179092407, "loss/lm": 0.3261164128780365, "step": 4858 }, { "epoch": 0.9975364401560255, "grad_norm": 0.8126049882396389, "kd_ratio": 0.5, "learning_rate": 3.1842720077390754e-10, "loss": 1.2113357782363892, "loss/kd": 2.076427936553955, "loss/lm": 0.34624359011650085, "step": 4859 }, { "epoch": 0.99774173680969, "grad_norm": 0.8545645090686639, "kd_ratio": 0.5, "learning_rate": 2.67567527458823e-10, "loss": 0.9876433610916138, "loss/kd": 1.6146323680877686, "loss/lm": 0.36065438389778137, "step": 4860 }, { "epoch": 0.9979470334633546, "grad_norm": 0.7847125547420395, "kd_ratio": 0.5, "learning_rate": 2.211303591292957e-10, "loss": 0.9957967400550842, "loss/kd": 1.6637402772903442, "loss/lm": 0.3278532326221466, "step": 4861 }, { "epoch": 0.9981523301170191, "grad_norm": 0.8299930866372985, "kd_ratio": 0.5, "learning_rate": 1.791157163200108e-10, "loss": 0.9456013441085815, "loss/kd": 1.6183699369430542, "loss/lm": 0.27283281087875366, "step": 4862 }, { "epoch": 0.9983576267706836, "grad_norm": 0.8309477124115721, "kd_ratio": 0.5, "learning_rate": 1.4152361761388122e-10, "loss": 0.9713614583015442, "loss/kd": 1.5314888954162598, "loss/lm": 0.4112339913845062, "step": 4863 }, { "epoch": 0.9985629234243482, "grad_norm": 0.8416183417106421, "kd_ratio": 0.5, "learning_rate": 1.0835407963649681e-10, "loss": 1.0734905004501343, "loss/kd": 1.7627755403518677, "loss/lm": 0.38420549035072327, "step": 4864 }, { "epoch": 0.9987682200780127, "grad_norm": 0.7824828881886656, "kd_ratio": 0.5, "learning_rate": 7.960711705834456e-11, "loss": 1.1541101932525635, "loss/kd": 2.010463237762451, "loss/lm": 0.29775723814964294, "step": 4865 }, { "epoch": 0.9989735167316772, "grad_norm": 1.0073215631770187, "kd_ratio": 0.5, "learning_rate": 5.528274259147814e-11, "loss": 1.1369210481643677, "loss/kd": 1.9036270380020142, "loss/lm": 0.3702150285243988, "step": 4866 }, { "epoch": 0.9991788133853419, "grad_norm": 0.8465146961298529, "kd_ratio": 0.5, "learning_rate": 3.5380966993958655e-11, "loss": 1.0115811824798584, "loss/kd": 1.7033002376556396, "loss/lm": 0.3198622465133667, "step": 4867 }, { "epoch": 0.9993841100390064, "grad_norm": 0.8694326288548495, "kd_ratio": 0.5, "learning_rate": 1.9901799068744452e-11, "loss": 0.8838280439376831, "loss/kd": 1.4543876647949219, "loss/lm": 0.3132684528827667, "step": 4868 }, { "epoch": 0.9995894066926709, "grad_norm": 0.8239830591046758, "kd_ratio": 0.5, "learning_rate": 8.845245660360491e-12, "loss": 1.07481050491333, "loss/kd": 1.8585987091064453, "loss/lm": 0.29102227091789246, "step": 4869 }, { "epoch": 0.9997947033463355, "grad_norm": 0.9802643463380759, "kd_ratio": 0.5, "learning_rate": 2.211311659339188e-12, "loss": 1.1414085626602173, "loss/kd": 1.9430110454559326, "loss/lm": 0.33980605006217957, "step": 4870 }, { "epoch": 1.0, "grad_norm": 0.7945095039692668, "kd_ratio": 0.5, "learning_rate": 0.0, "loss": 1.0025914907455444, "loss/kd": 1.7298988103866577, "loss/lm": 0.27528420090675354, "step": 4871 }, { "epoch": 1.0, "step": 4871, "total_flos": 1045604814274560.0, "train_loss": 1.2044322484818555, "train_runtime": 17133.8275, "train_samples_per_second": 72.774, "train_steps_per_second": 0.284 } ], "logging_steps": 1.0, "max_steps": 4871, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 2000, "total_flos": 1045604814274560.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }