{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.018465515649524512, "eval_steps": 10000, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00036931031299049027, "grad_norm": 0.05093964371235935, "learning_rate": 0.0, "loss": 0.0176, "reward/mean": 0.4305254817008972, "reward/std": 0.023368891328573227, "rewards/correct_answer_reward_func/mean": 0.8723958134651184, "rewards/correct_answer_reward_func/std": 0.3338659703731537, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.939539909362793, "rewards/correct_extract_func/std": 0.2306855320930481, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4062052965164185, "rewards/format_reward_func/std": 0.023711344227194786, "step": 1 }, { "epoch": 0.0007386206259809805, "grad_norm": 0.05093885614710434, "learning_rate": 1e-07, "loss": 0.0176, "step": 2 }, { "epoch": 0.0011079309389714707, "grad_norm": 0.049709599363636135, "learning_rate": 2e-07, "loss": 0.0177, "reward/mean": 0.43362969160079956, "reward/std": 0.024930372834205627, "rewards/correct_answer_reward_func/mean": 0.8841145634651184, "rewards/correct_answer_reward_func/std": 0.3202960789203644, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9216580390930176, "rewards/correct_extract_func/std": 0.25809445977211, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4074559211730957, "rewards/format_reward_func/std": 0.018833689391613007, "step": 3 }, { "epoch": 0.001477241251961961, "grad_norm": 0.04919887971444478, "learning_rate": 3e-07, "loss": 0.0177, "step": 4 }, { "epoch": 0.0018465515649524512, "grad_norm": 0.04609056285217934, "learning_rate": 4e-07, "loss": 0.017, "reward/mean": 0.43184012174606323, "reward/std": 0.016295205801725388, "rewards/correct_answer_reward_func/mean": 0.87890625, "rewards/correct_answer_reward_func/std": 0.32644879817962646, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9188368320465088, "rewards/correct_extract_func/std": 0.2631855309009552, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4060311317443848, "rewards/format_reward_func/std": 0.019542310386896133, "step": 5 }, { "epoch": 0.0022158618779429414, "grad_norm": 0.04790138249587581, "learning_rate": 5e-07, "loss": 0.017, "step": 6 }, { "epoch": 0.002585172190933432, "grad_norm": 0.07345664511629879, "learning_rate": 6e-07, "loss": 0.0171, "reward/mean": 0.4144955277442932, "reward/std": 0.025130389258265495, "rewards/correct_answer_reward_func/mean": 0.8255208134651184, "rewards/correct_answer_reward_func/std": 0.37976834177970886, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.880946159362793, "rewards/correct_extract_func/std": 0.31111887097358704, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4041085243225098, "rewards/format_reward_func/std": 0.03701591119170189, "step": 7 }, { "epoch": 0.002954482503923922, "grad_norm": 0.07610320387658168, "learning_rate": 7e-07, "loss": 0.0171, "step": 8 }, { "epoch": 0.0033237928169144123, "grad_norm": 0.05112137950522487, "learning_rate": 8e-07, "loss": 0.0181, "reward/mean": 0.4387373626232147, "reward/std": 0.019346633926033974, "rewards/correct_answer_reward_func/mean": 0.9036458134651184, "rewards/correct_answer_reward_func/std": 0.2952686548233032, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9055989384651184, "rewards/correct_extract_func/std": 0.28282052278518677, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4047561883926392, "rewards/format_reward_func/std": 0.06315362453460693, "step": 9 }, { "epoch": 0.0036931031299049025, "grad_norm": 0.05094809364589051, "learning_rate": 9e-07, "loss": 0.0181, "step": 10 }, { "epoch": 0.004062413442895393, "grad_norm": 0.05275996751985643, "learning_rate": 1e-06, "loss": 0.0175, "reward/mean": 0.43490076065063477, "reward/std": 0.02377907559275627, "rewards/correct_answer_reward_func/mean": 0.8893229365348816, "rewards/correct_answer_reward_func/std": 0.3139362931251526, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9134114384651184, "rewards/correct_extract_func/std": 0.2700866460800171, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.407015323638916, "rewards/format_reward_func/std": 0.01986781507730484, "step": 11 }, { "epoch": 0.004431723755885883, "grad_norm": 0.051170250158850315, "learning_rate": 1e-06, "loss": 0.0175, "step": 12 }, { "epoch": 0.004801034068876373, "grad_norm": 0.05767247415587884, "learning_rate": 1e-06, "loss": 0.0181, "reward/mean": 0.4389788508415222, "reward/std": 0.02013307623565197, "rewards/correct_answer_reward_func/mean": 0.9049479365348816, "rewards/correct_answer_reward_func/std": 0.29347798228263855, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.8903212547302246, "rewards/correct_extract_func/std": 0.2998242974281311, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4078478813171387, "rewards/format_reward_func/std": 0.014290675520896912, "step": 13 }, { "epoch": 0.005170344381866864, "grad_norm": 0.05451251011427531, "learning_rate": 1e-06, "loss": 0.0181, "step": 14 }, { "epoch": 0.005539654694857354, "grad_norm": 0.0548131395951059, "learning_rate": 1e-06, "loss": 0.0181, "reward/mean": 0.4339887797832489, "reward/std": 0.018782436847686768, "rewards/correct_answer_reward_func/mean": 0.88671875, "rewards/correct_answer_reward_func/std": 0.3171428442001343, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9032118320465088, "rewards/correct_extract_func/std": 0.2824605405330658, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4087677001953125, "rewards/format_reward_func/std": 0.012980460189282894, "step": 15 }, { "epoch": 0.005908965007847844, "grad_norm": 0.05240983246508732, "learning_rate": 1e-06, "loss": 0.0181, "step": 16 }, { "epoch": 0.006278275320838334, "grad_norm": 0.05134459409849994, "learning_rate": 1e-06, "loss": 0.0182, "reward/mean": 0.43516525626182556, "reward/std": 0.02198929898440838, "rewards/correct_answer_reward_func/mean": 0.89453125, "rewards/correct_answer_reward_func/std": 0.3073566257953644, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.8657768368721008, "rewards/correct_extract_func/std": 0.3262862265110016, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4083256721496582, "rewards/format_reward_func/std": 0.02241017296910286, "step": 17 }, { "epoch": 0.006647585633828825, "grad_norm": 0.050341338341445184, "learning_rate": 1e-06, "loss": 0.0182, "step": 18 }, { "epoch": 0.007016895946819315, "grad_norm": 0.0432942729059209, "learning_rate": 1e-06, "loss": 0.0176, "reward/mean": 0.43305787444114685, "reward/std": 0.017742186784744263, "rewards/correct_answer_reward_func/mean": 0.8854166865348816, "rewards/correct_answer_reward_func/std": 0.3187260329723358, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.8930990099906921, "rewards/correct_extract_func/std": 0.29494285583496094, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4063993692398071, "rewards/format_reward_func/std": 0.03480615094304085, "step": 19 }, { "epoch": 0.007386206259809805, "grad_norm": 0.043736628776102855, "learning_rate": 1e-06, "loss": 0.0176, "step": 20 }, { "epoch": 0.0077555165728002955, "grad_norm": 0.05359119220833686, "learning_rate": 1e-06, "loss": 0.0171, "reward/mean": 0.4271976351737976, "reward/std": 0.02278582751750946, "rewards/correct_answer_reward_func/mean": 0.8619791865348816, "rewards/correct_answer_reward_func/std": 0.34514662623405457, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9271919131278992, "rewards/correct_extract_func/std": 0.2506465017795563, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4078807830810547, "rewards/format_reward_func/std": 0.011555412784218788, "step": 21 }, { "epoch": 0.008124826885790786, "grad_norm": 0.053384876112853016, "learning_rate": 1e-06, "loss": 0.0171, "step": 22 }, { "epoch": 0.008494137198781277, "grad_norm": 0.05457105190222447, "learning_rate": 1e-06, "loss": 0.0176, "reward/mean": 0.4344092011451721, "reward/std": 0.018806444481015205, "rewards/correct_answer_reward_func/mean": 0.8893229365348816, "rewards/correct_answer_reward_func/std": 0.3139362931251526, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.8963108062744141, "rewards/correct_extract_func/std": 0.29719239473342896, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4072299003601074, "rewards/format_reward_func/std": 0.013067901134490967, "step": 23 }, { "epoch": 0.008863447511771766, "grad_norm": 0.051790764460388904, "learning_rate": 1e-06, "loss": 0.0176, "step": 24 }, { "epoch": 0.009232757824762256, "grad_norm": 0.06455557806308003, "learning_rate": 1e-06, "loss": 0.0177, "reward/mean": 0.44281578063964844, "reward/std": 0.016280503943562508, "rewards/correct_answer_reward_func/mean": 0.9153645634651184, "rewards/correct_answer_reward_func/std": 0.27851977944374084, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9203993678092957, "rewards/correct_extract_func/std": 0.25632622838020325, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4059442281723022, "rewards/format_reward_func/std": 0.02205752208828926, "step": 25 }, { "epoch": 0.009602068137752747, "grad_norm": 0.054460571323261056, "learning_rate": 1e-06, "loss": 0.0177, "step": 26 }, { "epoch": 0.009971378450743237, "grad_norm": 0.045474497731843734, "learning_rate": 1e-06, "loss": 0.0175, "reward/mean": 0.4445436894893646, "reward/std": 0.014738515019416809, "rewards/correct_answer_reward_func/mean": 0.9192708134651184, "rewards/correct_answer_reward_func/std": 0.27259624004364014, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9342448115348816, "rewards/correct_extract_func/std": 0.23349910974502563, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4073508977890015, "rewards/format_reward_func/std": 0.014469039626419544, "step": 27 }, { "epoch": 0.010340688763733728, "grad_norm": 0.04518446989067636, "learning_rate": 1e-06, "loss": 0.0175, "step": 28 }, { "epoch": 0.010709999076724217, "grad_norm": 0.05758722247738054, "learning_rate": 1e-06, "loss": 0.0183, "reward/mean": 0.44419676065444946, "reward/std": 0.02353046089410782, "rewards/correct_answer_reward_func/mean": 0.9192708134651184, "rewards/correct_answer_reward_func/std": 0.27259624004364014, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.923828125, "rewards/correct_extract_func/std": 0.25748127698898315, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4070063829421997, "rewards/format_reward_func/std": 0.013799347914755344, "step": 29 }, { "epoch": 0.011079309389714707, "grad_norm": 0.04951275155135316, "learning_rate": 1e-06, "loss": 0.0183, "step": 30 }, { "epoch": 0.011448619702705198, "grad_norm": 0.047494845238651454, "learning_rate": 1e-06, "loss": 0.0173, "reward/mean": 0.44647669792175293, "reward/std": 0.014447808265686035, "rewards/correct_answer_reward_func/mean": 0.92578125, "rewards/correct_answer_reward_func/std": 0.2622973620891571, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9365885257720947, "rewards/correct_extract_func/std": 0.24006153643131256, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4064464569091797, "rewards/format_reward_func/std": 0.02209184132516384, "step": 31 }, { "epoch": 0.011817930015695689, "grad_norm": 0.046822948788372606, "learning_rate": 1e-06, "loss": 0.0173, "step": 32 }, { "epoch": 0.01218724032868618, "grad_norm": 0.04380397393536513, "learning_rate": 1e-06, "loss": 0.0174, "reward/mean": 0.4473347067832947, "reward/std": 0.01410503126680851, "rewards/correct_answer_reward_func/mean": 0.9309895634651184, "rewards/correct_answer_reward_func/std": 0.25363701581954956, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.911241352558136, "rewards/correct_extract_func/std": 0.28002622723579407, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4070053100585938, "rewards/format_reward_func/std": 0.007169181946665049, "step": 33 }, { "epoch": 0.012556550641676668, "grad_norm": 0.04264955092023057, "learning_rate": 1e-06, "loss": 0.0174, "step": 34 }, { "epoch": 0.012925860954667159, "grad_norm": 0.04313995996563885, "learning_rate": 1e-06, "loss": 0.0172, "reward/mean": 0.4411402642726898, "reward/std": 0.00797030795365572, "rewards/correct_answer_reward_func/mean": 0.9088541865348816, "rewards/correct_answer_reward_func/std": 0.28800395131111145, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9279513359069824, "rewards/correct_extract_func/std": 0.25393322110176086, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4064545631408691, "rewards/format_reward_func/std": 0.017624543979763985, "step": 35 }, { "epoch": 0.01329517126765765, "grad_norm": 0.04127535963471486, "learning_rate": 1e-06, "loss": 0.0172, "step": 36 }, { "epoch": 0.01366448158064814, "grad_norm": 0.060380213829128886, "learning_rate": 1e-06, "loss": 0.0176, "reward/mean": 0.4418516159057617, "reward/std": 0.016873031854629517, "rewards/correct_answer_reward_func/mean": 0.9075520634651184, "rewards/correct_answer_reward_func/std": 0.2898460030555725, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.961718738079071, "rewards/correct_extract_func/std": 0.18086452782154083, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4073443412780762, "rewards/format_reward_func/std": 0.013318442739546299, "step": 37 }, { "epoch": 0.01403379189363863, "grad_norm": 0.05602980229868504, "learning_rate": 1e-06, "loss": 0.0176, "step": 38 }, { "epoch": 0.01440310220662912, "grad_norm": 0.05184766134610364, "learning_rate": 1e-06, "loss": 0.0177, "reward/mean": 0.43869584798812866, "reward/std": 0.019008934497833252, "rewards/correct_answer_reward_func/mean": 0.9036458134651184, "rewards/correct_answer_reward_func/std": 0.2952686548233032, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9003472328186035, "rewards/correct_extract_func/std": 0.2904185354709625, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.405916690826416, "rewards/format_reward_func/std": 0.0255013108253479, "step": 39 }, { "epoch": 0.01477241251961961, "grad_norm": 0.05148310080801445, "learning_rate": 1e-06, "loss": 0.0176, "step": 40 }, { "epoch": 0.0151417228326101, "grad_norm": 0.05581834264976948, "learning_rate": 1e-06, "loss": 0.0175, "reward/mean": 0.4319329261779785, "reward/std": 0.020871102809906006, "rewards/correct_answer_reward_func/mean": 0.8802083134651184, "rewards/correct_answer_reward_func/std": 0.3249293863773346, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.907118022441864, "rewards/correct_extract_func/std": 0.2817213237285614, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.406569004058838, "rewards/format_reward_func/std": 0.02022281102836132, "step": 41 }, { "epoch": 0.015511033145600591, "grad_norm": 0.05268407384500521, "learning_rate": 1e-06, "loss": 0.0175, "step": 42 }, { "epoch": 0.01588034345859108, "grad_norm": 0.09367675201948025, "learning_rate": 1e-06, "loss": 0.0176, "reward/mean": 0.4355073869228363, "reward/std": 0.01918705925345421, "rewards/correct_answer_reward_func/mean": 0.8919270634651184, "rewards/correct_answer_reward_func/std": 0.3106748163700104, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9130207896232605, "rewards/correct_extract_func/std": 0.27445390820503235, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.405385971069336, "rewards/format_reward_func/std": 0.026785731315612793, "step": 43 }, { "epoch": 0.016249653771581572, "grad_norm": 0.05227366508331771, "learning_rate": 1e-06, "loss": 0.0176, "step": 44 }, { "epoch": 0.01661896408457206, "grad_norm": 0.04497808527453425, "learning_rate": 1e-06, "loss": 0.0179, "reward/mean": 0.4444352984428406, "reward/std": 0.01481956522911787, "rewards/correct_answer_reward_func/mean": 0.9192708134651184, "rewards/correct_answer_reward_func/std": 0.27259624004364014, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9329426884651184, "rewards/correct_extract_func/std": 0.24013008177280426, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.406657338142395, "rewards/format_reward_func/std": 0.015218171291053295, "step": 45 }, { "epoch": 0.016988274397562553, "grad_norm": 0.055837167119088496, "learning_rate": 1e-06, "loss": 0.0179, "step": 46 }, { "epoch": 0.017357584710553042, "grad_norm": 0.05873478813480533, "learning_rate": 1e-06, "loss": 0.0178, "reward/mean": 0.44062528014183044, "reward/std": 0.01832752674818039, "rewards/correct_answer_reward_func/mean": 0.9088541865348816, "rewards/correct_answer_reward_func/std": 0.28800395131111145, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9090712070465088, "rewards/correct_extract_func/std": 0.27737653255462646, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.4069687128067017, "rewards/format_reward_func/std": 0.02851445972919464, "step": 47 }, { "epoch": 0.01772689502354353, "grad_norm": 0.05518986438955003, "learning_rate": 1e-06, "loss": 0.0178, "step": 48 }, { "epoch": 0.018096205336534023, "grad_norm": 0.04474697485013119, "learning_rate": 1e-06, "loss": 0.0182, "reward/mean": 0.44659337401390076, "reward/std": 0.013036997988820076, "rewards/correct_answer_reward_func/mean": 0.9296875, "rewards/correct_answer_reward_func/std": 0.2558395564556122, "rewards/correct_crop_func/mean": 0.0, "rewards/correct_crop_func/std": 0.0, "rewards/correct_extract_func/mean": 0.9007161259651184, "rewards/correct_extract_func/std": 0.2947409451007843, "rewards/correct_find_color/mean": 0.0, "rewards/correct_find_color/std": 0.0, "rewards/format_reward_func/mean": 1.406656265258789, "rewards/format_reward_func/std": 0.00936658214777708, "step": 49 }, { "epoch": 0.018465515649524512, "grad_norm": 0.043438923657574534, "learning_rate": 1e-06, "loss": 0.0181, "step": 50 } ], "logging_steps": 1, "max_steps": 2708, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }