{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9981333333333333, "eval_steps": 100, "global_step": 394, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 644.0193140933388, "epoch": 0.012666666666666666, "grad_norm": 0.16860076785087585, "kl": 0.000124662016567431, "learning_rate": 3.75e-07, "loss": 0.0, "reward": 0.605263171384209, "reward_std": 0.40058013012534693, "rewards/accuracy_reward": 0.605263171384209, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 601.6614225688734, "epoch": 0.025333333333333333, "grad_norm": 0.14201320707798004, "kl": 0.00020345888639751233, "learning_rate": 7.5e-07, "loss": 0.0, "reward": 0.673684226839166, "reward_std": 0.39511305846666034, "rewards/accuracy_reward": 0.673684226839166, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 579.6684371145149, "epoch": 0.038, "grad_norm": 0.3944288194179535, "kl": 0.00023700312564247533, "learning_rate": 1.125e-06, "loss": 0.0, "reward": 0.6631579123045269, "reward_std": 0.3630171424464176, "rewards/accuracy_reward": 0.6631579123045269, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 615.7789647152549, "epoch": 0.050666666666666665, "grad_norm": 0.1742940992116928, "kl": 0.0005895815397563733, "learning_rate": 1.5e-06, "loss": 0.0, "reward": 0.6175438761711121, "reward_std": 0.40319750308990476, "rewards/accuracy_reward": 0.6175438761711121, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 653.6140499717311, "epoch": 0.06333333333333334, "grad_norm": 0.213926300406456, "kl": 0.0014986289174933182, "learning_rate": 1.875e-06, "loss": 0.0, "reward": 0.6561403632164001, "reward_std": 0.3638197688679946, "rewards/accuracy_reward": 0.6561403632164001, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 624.1438784950658, "epoch": 0.076, "grad_norm": 0.16140718758106232, "kl": 0.0023109034488075658, "learning_rate": 2.25e-06, "loss": 0.0, "reward": 0.6947368581044047, "reward_std": 0.32959623650500647, "rewards/accuracy_reward": 0.6947368581044047, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 649.8438781738281, "epoch": 0.08866666666666667, "grad_norm": 0.16571274399757385, "kl": 0.009154390033922698, "learning_rate": 2.6250000000000003e-06, "loss": 0.0, "reward": 0.6894737009939395, "reward_std": 0.35860375699244046, "rewards/accuracy_reward": 0.687719314819888, "rewards/format_reward": 0.0017543860171970569, "step": 35 }, { "completion_length": 629.4052797016344, "epoch": 0.10133333333333333, "grad_norm": 0.27635857462882996, "kl": 0.0027755938078227794, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.7035087861512836, "reward_std": 0.34189380752412896, "rewards/accuracy_reward": 0.7035087861512836, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 652.133349609375, "epoch": 0.114, "grad_norm": 0.0473485067486763, "kl": 0.026468778911389802, "learning_rate": 2.998523534736735e-06, "loss": 0.0, "reward": 0.6842105420012223, "reward_std": 0.3472982666994396, "rewards/accuracy_reward": 0.6842105420012223, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 628.0421226099918, "epoch": 0.12666666666666668, "grad_norm": 0.11586566269397736, "kl": 0.004013121755499589, "learning_rate": 2.994097045546504e-06, "loss": 0.0, "reward": 0.7263158050022627, "reward_std": 0.3308417533573351, "rewards/accuracy_reward": 0.7263158050022627, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 634.6701881810238, "epoch": 0.13933333333333334, "grad_norm": 0.09287750720977783, "kl": 0.0033212159809313324, "learning_rate": 2.986729246506011e-06, "loss": 0.0, "reward": 0.6912280863837192, "reward_std": 0.35596638729697777, "rewards/accuracy_reward": 0.6912280863837192, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 607.8982656378495, "epoch": 0.152, "grad_norm": 0.1364029198884964, "kl": 0.003622757761101974, "learning_rate": 2.976434642014389e-06, "loss": 0.0, "reward": 0.7561403630595458, "reward_std": 0.2851124694472865, "rewards/accuracy_reward": 0.7543859771992031, "rewards/format_reward": 0.0017543860171970569, "step": 60 }, { "completion_length": 605.7438781738281, "epoch": 0.16466666666666666, "grad_norm": 0.0922616571187973, "kl": 0.007114209626850329, "learning_rate": 2.9632334982395456e-06, "loss": 0.0, "reward": 0.7578947509589948, "reward_std": 0.29374928286201074, "rewards/accuracy_reward": 0.7578947509589948, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 597.1175623843544, "epoch": 0.17733333333333334, "grad_norm": 0.0, "kl": 0.003088178132709704, "learning_rate": 2.947151803221774e-06, "loss": 0.0, "reward": 0.7596491361919202, "reward_std": 0.2632309951280293, "rewards/accuracy_reward": 0.7596491361919202, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 599.9701933208265, "epoch": 0.19, "grad_norm": 0.10758549720048904, "kl": 0.003361149838096217, "learning_rate": 2.928221215713164e-06, "loss": 0.0, "reward": 0.7666666791627281, "reward_std": 0.2686667611724452, "rewards/accuracy_reward": 0.7666666791627281, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 597.1456305252879, "epoch": 0.20266666666666666, "grad_norm": 0.06053958460688591, "kl": 0.0037446674547697368, "learning_rate": 2.906479002853542e-06, "loss": 0.0, "reward": 0.7315789610147476, "reward_std": 0.30245108447576824, "rewards/accuracy_reward": 0.7315789610147476, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 577.8438778525905, "epoch": 0.21533333333333332, "grad_norm": 0.0840262621641159, "kl": 0.004712596692537006, "learning_rate": 2.8819679668056195e-06, "loss": 0.0, "reward": 0.7982456276291295, "reward_std": 0.26917701614530465, "rewards/accuracy_reward": 0.7982456276291295, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 612.2982611405222, "epoch": 0.228, "grad_norm": 0.0, "kl": 0.02382647865696957, "learning_rate": 2.8547363604937856e-06, "loss": 0.0, "reward": 0.7456140491523241, "reward_std": 0.2956205848016237, "rewards/accuracy_reward": 0.7456140491523241, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 598.1403680098684, "epoch": 0.24066666666666667, "grad_norm": 0.0, "kl": 0.005134462055407072, "learning_rate": 2.824837792612416e-06, "loss": 0.0, "reward": 0.7403508889047723, "reward_std": 0.2658375608293634, "rewards/accuracy_reward": 0.7403508889047723, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 582.6561587685032, "epoch": 0.25333333333333335, "grad_norm": 0.11761737614870071, "kl": 0.00578155517578125, "learning_rate": 2.792331122090709e-06, "loss": 0.0, "reward": 0.7614035217385543, "reward_std": 0.25812496166480214, "rewards/accuracy_reward": 0.7614035217385543, "rewards/format_reward": 0.0, "step": 100 }, { "epoch": 0.25333333333333335, "eval_completion_length": 576.5513494110107, "eval_kl": 0.10147758865356446, "eval_loss": 3.062895004291022e-08, "eval_reward": 0.6709333465278149, "eval_reward_std": 0.3338339452087879, "eval_rewards/accuracy_reward": 0.6709333465278149, "eval_rewards/format_reward": 0.0, "eval_runtime": 79047.156, "eval_samples_per_second": 0.063, "eval_steps_per_second": 0.011, "step": 100 }, { "completion_length": 549.310542056435, "epoch": 0.266, "grad_norm": 0.08108708262443542, "kl": 0.07660410027754934, "learning_rate": 2.7572803422217976e-06, "loss": 0.0, "reward": 0.8140351000585054, "reward_std": 0.2548168144728008, "rewards/accuracy_reward": 0.8140351000585054, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 596.6351042094983, "epoch": 0.2786666666666667, "grad_norm": 0.06687445938587189, "kl": 0.18131464907997533, "learning_rate": 2.71975445468425e-06, "loss": 0.0, "reward": 0.7543859803362897, "reward_std": 0.32687413692474365, "rewards/accuracy_reward": 0.7543859803362897, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 547.8263289602179, "epoch": 0.29133333333333333, "grad_norm": 0.08061777800321579, "kl": 4.457168900339227, "learning_rate": 2.679827333703964e-06, "loss": -0.0, "reward": 0.7894736992685418, "reward_std": 0.2753143636803878, "rewards/accuracy_reward": 0.7894736992685418, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 533.67895989669, "epoch": 0.304, "grad_norm": 0.15190833806991577, "kl": 0.005019980982730263, "learning_rate": 2.637577580623858e-06, "loss": 0.0, "reward": 0.771929837528028, "reward_std": 0.2711241148020092, "rewards/accuracy_reward": 0.771929837528028, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 535.5947535464638, "epoch": 0.31666666666666665, "grad_norm": 0.24402964115142822, "kl": 0.005212723581414474, "learning_rate": 2.593088369167671e-06, "loss": 0.0, "reward": 0.8035087809750908, "reward_std": 0.2135340957265151, "rewards/accuracy_reward": 0.8035087809750908, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 532.9789643940172, "epoch": 0.3293333333333333, "grad_norm": 0.28009745478630066, "kl": 0.005336239463404605, "learning_rate": 2.5464472817024772e-06, "loss": 0.0, "reward": 0.8052631699725201, "reward_std": 0.24214739736757782, "rewards/accuracy_reward": 0.8052631699725201, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 561.8772093120374, "epoch": 0.342, "grad_norm": 0.18070097267627716, "kl": 0.004754076505962171, "learning_rate": 2.497746136822254e-06, "loss": 0.0, "reward": 0.7771929958933278, "reward_std": 0.27938908150321556, "rewards/accuracy_reward": 0.7771929958933278, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 578.6684380782278, "epoch": 0.3546666666666667, "grad_norm": 0.10906434804201126, "kl": 0.005059493215460526, "learning_rate": 2.4470808085919304e-06, "loss": 0.0, "reward": 0.7631579088537317, "reward_std": 0.3001742513556229, "rewards/accuracy_reward": 0.7631579088537317, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 602.6772094726563, "epoch": 0.36733333333333335, "grad_norm": 0.05584894120693207, "kl": 0.004565590306332237, "learning_rate": 2.3945510378077523e-06, "loss": 0.0, "reward": 0.7842105371387381, "reward_std": 0.2481692125922755, "rewards/accuracy_reward": 0.7842105371387381, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 580.9210699784128, "epoch": 0.38, "grad_norm": 0.0609310120344162, "kl": 0.005594434236225329, "learning_rate": 2.340260235645519e-06, "loss": 0.0, "reward": 0.7368421157723979, "reward_std": 0.26106126277070296, "rewards/accuracy_reward": 0.7368421157723979, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 525.4000133313631, "epoch": 0.39266666666666666, "grad_norm": 0.06953240185976028, "kl": 0.007059599223889802, "learning_rate": 2.2843152800832416e-06, "loss": 0.0, "reward": 0.8035087816025082, "reward_std": 0.2169914556177039, "rewards/accuracy_reward": 0.8035087816025082, "rewards/format_reward": 0.0, "step": 155 }, { "completion_length": 559.4386140522204, "epoch": 0.4053333333333333, "grad_norm": 0.08336005359888077, "kl": 0.007145851536800986, "learning_rate": 2.2268263054989753e-06, "loss": 0.0, "reward": 0.7824561511215411, "reward_std": 0.23924477821902224, "rewards/accuracy_reward": 0.7824561511215411, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 546.5772112394634, "epoch": 0.418, "grad_norm": 0.0791773647069931, "kl": 0.0061695299650493425, "learning_rate": 2.167906485858047e-06, "loss": 0.0, "reward": 0.803508786308138, "reward_std": 0.25981574560466564, "rewards/accuracy_reward": 0.803508786308138, "rewards/format_reward": 0.0, "step": 165 }, { "completion_length": 568.9280845240543, "epoch": 0.43066666666666664, "grad_norm": 0.11959535628557205, "kl": 0.007060161389802632, "learning_rate": 2.1076718119164804e-06, "loss": 0.0, "reward": 0.7736842216629731, "reward_std": 0.2649867666395087, "rewards/accuracy_reward": 0.7736842216629731, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 550.3982609799034, "epoch": 0.44333333333333336, "grad_norm": 0.0, "kl": 0.007817157946134868, "learning_rate": 2.0462408628792335e-06, "loss": 0.0, "reward": 0.7824561511215411, "reward_std": 0.253275244800668, "rewards/accuracy_reward": 0.7824561511215411, "rewards/format_reward": 0.0, "step": 175 }, { "completion_length": 586.7596660413241, "epoch": 0.456, "grad_norm": 0.06831427663564682, "kl": 0.0069172106291118425, "learning_rate": 1.9837345729627633e-06, "loss": 0.0, "reward": 0.7438596612528751, "reward_std": 0.2811869652647721, "rewards/accuracy_reward": 0.7438596612528751, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 583.1228234542043, "epoch": 0.4686666666666667, "grad_norm": 0.10550453513860703, "kl": 0.006921226099917763, "learning_rate": 1.9202759933214665e-06, "loss": 0.0, "reward": 0.7631579093242946, "reward_std": 0.29657848289138394, "rewards/accuracy_reward": 0.7631579093242946, "rewards/format_reward": 0.0, "step": 185 }, { "completion_length": 528.0912424187911, "epoch": 0.48133333333333334, "grad_norm": 0.0677071213722229, "kl": 0.009208438270970395, "learning_rate": 1.8559900498066726e-06, "loss": 0.0, "reward": 0.8140350991173795, "reward_std": 0.22778719381282203, "rewards/accuracy_reward": 0.8140350991173795, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 550.7807165848582, "epoch": 0.494, "grad_norm": 0.18718236684799194, "kl": 0.008406307822779605, "learning_rate": 1.7910032970350677e-06, "loss": 0.0, "reward": 0.7789473815968162, "reward_std": 0.2719327967417868, "rewards/accuracy_reward": 0.7789473815968162, "rewards/format_reward": 0.0, "step": 195 }, { "completion_length": 551.1315929212068, "epoch": 0.5066666666666667, "grad_norm": 0.08260176330804825, "kl": 0.00812426115337171, "learning_rate": 1.7254436692507058e-06, "loss": 0.0, "reward": 0.8070175547348826, "reward_std": 0.23249007557567797, "rewards/accuracy_reward": 0.8070175547348826, "rewards/format_reward": 0.0, "step": 200 }, { "epoch": 0.5066666666666667, "eval_completion_length": 553.8582824371338, "eval_kl": 0.02361789245605469, "eval_loss": -4.30000654887408e-05, "eval_reward": 0.6822333461046219, "eval_reward_std": 0.32350232841968535, "eval_rewards/accuracy_reward": 0.6822333461046219, "eval_rewards/format_reward": 0.0, "eval_runtime": 76484.7991, "eval_samples_per_second": 0.065, "eval_steps_per_second": 0.011, "step": 200 }, { "completion_length": 570.7842259457237, "epoch": 0.5193333333333333, "grad_norm": 0.14592121541500092, "kl": 0.024847893965871712, "learning_rate": 1.6594402284710481e-06, "loss": 0.0, "reward": 0.7561403608635853, "reward_std": 0.24963736345893459, "rewards/accuracy_reward": 0.7561403608635853, "rewards/format_reward": 0.0, "step": 205 }, { "completion_length": 556.5035239771793, "epoch": 0.532, "grad_norm": 0.06877300888299942, "kl": 0.011327321905838815, "learning_rate": 1.593122910412851e-06, "loss": 0.0, "reward": 0.8210526436567307, "reward_std": 0.22859587669372558, "rewards/accuracy_reward": 0.8210526436567307, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 579.1263334575452, "epoch": 0.5446666666666666, "grad_norm": 0.0, "kl": 0.007683683696546053, "learning_rate": 1.5266222686980693e-06, "loss": 0.0, "reward": 0.7561403643143805, "reward_std": 0.2832074833543677, "rewards/accuracy_reward": 0.7561403643143805, "rewards/format_reward": 0.0, "step": 215 }, { "completion_length": 588.4649293598376, "epoch": 0.5573333333333333, "grad_norm": 0.07735468447208405, "kl": 0.007682238127055921, "learning_rate": 1.460069217843338e-06, "loss": 0.0, "reward": 0.7333333465613817, "reward_std": 0.31152473405787817, "rewards/accuracy_reward": 0.7333333465613817, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 582.8508938438014, "epoch": 0.57, "grad_norm": 0.04747384414076805, "kl": 0.008666349712171053, "learning_rate": 1.3935947755389924e-06, "loss": 0.0, "reward": 0.7456140472700722, "reward_std": 0.27623014795152767, "rewards/accuracy_reward": 0.7456140472700722, "rewards/format_reward": 0.0, "step": 225 }, { "completion_length": 541.6929962158204, "epoch": 0.5826666666666667, "grad_norm": 0.0, "kl": 0.009119616056743421, "learning_rate": 1.3273298047249756e-06, "loss": 0.0, "reward": 0.7964912389454089, "reward_std": 0.2623067799367403, "rewards/accuracy_reward": 0.7964912389454089, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 555.8666826750102, "epoch": 0.5953333333333334, "grad_norm": 0.1013207659125328, "kl": 0.008411447625411184, "learning_rate": 1.2614047559713923e-06, "loss": 0.0, "reward": 0.8263158042179911, "reward_std": 0.25948601145493355, "rewards/accuracy_reward": 0.8263158042179911, "rewards/format_reward": 0.0, "step": 235 }, { "completion_length": 564.3929960552015, "epoch": 0.608, "grad_norm": 0.08883073180913925, "kl": 0.008549579821134868, "learning_rate": 1.1959494106708598e-06, "loss": 0.0, "reward": 0.792982468950121, "reward_std": 0.25643417960719056, "rewards/accuracy_reward": 0.792982468950121, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 536.9596663625617, "epoch": 0.6206666666666667, "grad_norm": 0.04451132193207741, "kl": 0.009548789576480263, "learning_rate": 1.1310926255482204e-06, "loss": 0.0, "reward": 0.8052631693451028, "reward_std": 0.24199818247242977, "rewards/accuracy_reward": 0.8052631693451028, "rewards/format_reward": 0.0, "step": 245 }, { "completion_length": 544.7193134508635, "epoch": 0.6333333333333333, "grad_norm": 0.0, "kl": 0.009332596628289473, "learning_rate": 1.0669620789905688e-06, "loss": 0.0, "reward": 0.800000010038677, "reward_std": 0.23219164547167326, "rewards/accuracy_reward": 0.800000010038677, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 552.8649261474609, "epoch": 0.646, "grad_norm": 0.06963980942964554, "kl": 0.009455148797286185, "learning_rate": 1.0036840196969795e-06, "loss": 0.0, "reward": 0.807017553009485, "reward_std": 0.21750171090427198, "rewards/accuracy_reward": 0.807017553009485, "rewards/format_reward": 0.0, "step": 255 }, { "completion_length": 560.9298404091282, "epoch": 0.6586666666666666, "grad_norm": 0.13915280997753143, "kl": 0.008615754780016447, "learning_rate": 9.413830181427508e-07, "loss": 0.0, "reward": 0.7894736975431442, "reward_std": 0.26591097994854573, "rewards/accuracy_reward": 0.7894736975431442, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 583.3000167043585, "epoch": 0.6713333333333333, "grad_norm": 0.0, "kl": 0.008441322728207237, "learning_rate": 8.801817213474331e-07, "loss": 0.0, "reward": 0.7508772039099744, "reward_std": 0.2809643321915677, "rewards/accuracy_reward": 0.7508772039099744, "rewards/format_reward": 0.0, "step": 265 }, { "completion_length": 531.2859802246094, "epoch": 0.684, "grad_norm": 0.05701286345720291, "kl": 0.00885218570106908, "learning_rate": 8.202006114294044e-07, "loss": 0.0, "reward": 0.7877193099574039, "reward_std": 0.2481692119648582, "rewards/accuracy_reward": 0.7877193099574039, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 531.2403676886307, "epoch": 0.6966666666666667, "grad_norm": 0.05670388042926788, "kl": 0.009579307154605263, "learning_rate": 7.615577684223272e-07, "loss": 0.0, "reward": 0.8175438716223365, "reward_std": 0.23557321397881759, "rewards/accuracy_reward": 0.8175438716223365, "rewards/format_reward": 0.0, "step": 275 }, { "completion_length": 544.3842264275802, "epoch": 0.7093333333333334, "grad_norm": 0.0, "kl": 0.009712299547697369, "learning_rate": 7.043686378203864e-07, "loss": 0.0, "reward": 0.7701754490011616, "reward_std": 0.25272287663660553, "rewards/accuracy_reward": 0.7701754490011616, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 571.7438746402138, "epoch": 0.722, "grad_norm": 0.08127343654632568, "kl": 0.008696064196134869, "learning_rate": 6.487458033099425e-07, "loss": 0.0, "reward": 0.7701754521382482, "reward_std": 0.2902606151605907, "rewards/accuracy_reward": 0.7701754521382482, "rewards/format_reward": 0.0, "step": 285 }, { "completion_length": 532.1596609015214, "epoch": 0.7346666666666667, "grad_norm": 0.072137750685215, "kl": 0.008551828484786184, "learning_rate": 5.947987651349942e-07, "loss": 0.0, "reward": 0.8070175544211738, "reward_std": 0.21213936178307785, "rewards/accuracy_reward": 0.8070175544211738, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 535.1210667660362, "epoch": 0.7473333333333333, "grad_norm": 0.07434926927089691, "kl": 0.008226896587171052, "learning_rate": 5.426337245327703e-07, "loss": 0.0, "reward": 0.8157894832523246, "reward_std": 0.22407589335190622, "rewards/accuracy_reward": 0.8157894832523246, "rewards/format_reward": 0.0, "step": 295 }, { "completion_length": 536.6631729929071, "epoch": 0.76, "grad_norm": 0.08616889268159866, "kl": 0.010335339997944079, "learning_rate": 4.923533746638108e-07, "loss": 0.0, "reward": 0.8105263270829853, "reward_std": 0.24915841535518043, "rewards/accuracy_reward": 0.8105263270829853, "rewards/format_reward": 0.0, "step": 300 }, { "epoch": 0.76, "eval_completion_length": 545.0558823242187, "eval_kl": 0.6695549743652344, "eval_loss": 3.003481197083602e-08, "eval_reward": 0.6868333460628986, "eval_reward_std": 0.3212484232842922, "eval_rewards/accuracy_reward": 0.6868333460628986, "eval_rewards/format_reward": 0.0, "eval_runtime": 75622.6169, "eval_samples_per_second": 0.066, "eval_steps_per_second": 0.011, "step": 300 }, { "completion_length": 574.1017703407689, "epoch": 0.7726666666666666, "grad_norm": 0.0810866504907608, "kl": 0.02553052400287829, "learning_rate": 4.440566984481256e-07, "loss": 0.0, "reward": 0.7666666799469998, "reward_std": 0.27946487539692927, "rewards/accuracy_reward": 0.7666666799469998, "rewards/format_reward": 0.0, "step": 305 }, { "completion_length": 550.2579098350124, "epoch": 0.7853333333333333, "grad_norm": 0.09656031429767609, "kl": 0.009140657123766447, "learning_rate": 3.978387737053994e-07, "loss": 0.0, "reward": 0.7824561517489584, "reward_std": 0.23484032687387968, "rewards/accuracy_reward": 0.7824561517489584, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 569.4912441855982, "epoch": 0.798, "grad_norm": 0.094791479408741, "kl": 0.009346088610197369, "learning_rate": 3.5379058598286167e-07, "loss": 0.0, "reward": 0.7666666777510392, "reward_std": 0.25904074530852467, "rewards/accuracy_reward": 0.7666666777510392, "rewards/format_reward": 0.0, "step": 315 }, { "completion_length": 579.2614203202097, "epoch": 0.8106666666666666, "grad_norm": 0.10730049759149551, "kl": 4.134726434004934, "learning_rate": 3.119988494392894e-07, "loss": 0.0, "reward": 0.7508772053216634, "reward_std": 0.2928671808619248, "rewards/accuracy_reward": 0.7508772053216634, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 556.2263304057874, "epoch": 0.8233333333333334, "grad_norm": 0.07251843065023422, "kl": 0.009259033203125, "learning_rate": 2.725458361377465e-07, "loss": 0.0, "reward": 0.7736842222903905, "reward_std": 0.24773237705230713, "rewards/accuracy_reward": 0.7736842222903905, "rewards/format_reward": 0.0, "step": 325 }, { "completion_length": 537.1491384405839, "epoch": 0.836, "grad_norm": 0.0, "kl": 0.009849789268092105, "learning_rate": 2.3550921408312737e-07, "loss": 0.0, "reward": 0.8140350983331078, "reward_std": 0.23072349554614016, "rewards/accuracy_reward": 0.8140350983331078, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 547.5140491686369, "epoch": 0.8486666666666667, "grad_norm": 0.08143898099660873, "kl": 0.010343531558388158, "learning_rate": 2.0096189432334195e-07, "loss": 0.0, "reward": 0.7824561493961435, "reward_std": 0.23674531171196386, "rewards/accuracy_reward": 0.7824561493961435, "rewards/format_reward": 0.0, "step": 335 }, { "completion_length": 541.5175582082649, "epoch": 0.8613333333333333, "grad_norm": 0.15802732110023499, "kl": 0.009666683799342105, "learning_rate": 1.6897188741514286e-07, "loss": 0.0, "reward": 0.7666666757119329, "reward_std": 0.24941472881718685, "rewards/accuracy_reward": 0.7666666757119329, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 566.2824713456004, "epoch": 0.874, "grad_norm": 0.08206634968519211, "kl": 0.009324886924342106, "learning_rate": 1.396021695371582e-07, "loss": 0.0, "reward": 0.7543859756306598, "reward_std": 0.23858530960584942, "rewards/accuracy_reward": 0.7543859756306598, "rewards/format_reward": 0.0, "step": 345 }, { "completion_length": 567.8473815917969, "epoch": 0.8866666666666667, "grad_norm": 0.06388476490974426, "kl": 0.010038034539473684, "learning_rate": 1.1291055851370623e-07, "loss": 0.0, "reward": 0.7491228202455922, "reward_std": 0.2948876986378118, "rewards/accuracy_reward": 0.7491228202455922, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 576.67545663934, "epoch": 0.8993333333333333, "grad_norm": 0.058260105550289154, "kl": 0.008216456363075659, "learning_rate": 8.894959999345015e-08, "loss": 0.0, "reward": 0.8017543960558741, "reward_std": 0.23333006344343488, "rewards/accuracy_reward": 0.8017543960558741, "rewards/format_reward": 0.0, "step": 355 }, { "completion_length": 549.091246434262, "epoch": 0.912, "grad_norm": 0.10874364525079727, "kl": 0.00869959781044408, "learning_rate": 6.776646400696212e-08, "loss": 0.0, "reward": 0.796491244749019, "reward_std": 0.3010250455454776, "rewards/accuracy_reward": 0.796491244749019, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 560.8719421386719, "epoch": 0.9246666666666666, "grad_norm": 0.10691142827272415, "kl": 0.009839509662828948, "learning_rate": 4.940285210684375e-08, "loss": 0.0, "reward": 0.7964912431804757, "reward_std": 0.28669615042837043, "rewards/accuracy_reward": 0.7964912431804757, "rewards/format_reward": 0.0, "step": 365 }, { "completion_length": 570.056158126028, "epoch": 0.9373333333333334, "grad_norm": 0.14333242177963257, "kl": 0.008819580078125, "learning_rate": 3.389491527319999e-08, "loss": 0.0, "reward": 0.7631579105791293, "reward_std": 0.3113442129210422, "rewards/accuracy_reward": 0.7631579105791293, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 581.7122971384149, "epoch": 0.95, "grad_norm": 0.12897399067878723, "kl": 0.009299830386513159, "learning_rate": 2.127318274608381e-08, "loss": 0.0, "reward": 0.759649133211688, "reward_std": 0.24736052971137198, "rewards/accuracy_reward": 0.759649133211688, "rewards/format_reward": 0.0, "step": 375 }, { "completion_length": 519.6631728322883, "epoch": 0.9626666666666667, "grad_norm": 0.046151068061590195, "kl": 0.018807340923108554, "learning_rate": 1.1562501925013125e-08, "loss": 0.0, "reward": 0.8105263286515286, "reward_std": 0.24725342863484434, "rewards/accuracy_reward": 0.8105263286515286, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 522.570188984118, "epoch": 0.9753333333333334, "grad_norm": 0.10620765388011932, "kl": 0.00986279939350329, "learning_rate": 4.781989453874814e-09, "loss": 0.0, "reward": 0.7964912391022632, "reward_std": 0.2455626440675635, "rewards/accuracy_reward": 0.7964912391022632, "rewards/format_reward": 0.0, "step": 385 }, { "completion_length": 536.8315922787315, "epoch": 0.988, "grad_norm": 0.09291122853755951, "kl": 0.010211502878289473, "learning_rate": 9.44993587509657e-10, "loss": 0.0, "reward": 0.8070175572445518, "reward_std": 0.2619433641433716, "rewards/accuracy_reward": 0.8070175572445518, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 567.2982639513517, "epoch": 0.9981333333333333, "kl": 0.00917715775339227, "reward": 0.7828947472337046, "reward_std": 0.2575972625299504, "rewards/accuracy_reward": 0.7828947472337046, "rewards/format_reward": 0.0, "step": 394, "total_flos": 0.0, "train_loss": 0.0008256121124141636, "train_runtime": 419098.9196, "train_samples_per_second": 0.018, "train_steps_per_second": 0.001 } ], "logging_steps": 5, "max_steps": 394, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }