{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9987438399845395, "eval_steps": 100, "global_step": 646, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 890.5919452667237, "epoch": 0.015460430959512996, "grad_norm": 0.0048114829798690025, "kl": 0.0004779815673828125, "learning_rate": 3.0769230769230774e-06, "loss": 0.0, "reward": 0.2079081600648351, "reward_std": 0.18389849485829474, "rewards/accuracy_reward": 0.2079081600648351, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 815.0035549163819, "epoch": 0.03092086191902599, "grad_norm": 0.0037700873930097754, "kl": 0.005376839637756347, "learning_rate": 6.153846153846155e-06, "loss": 0.0002, "reward": 0.3485969334375113, "reward_std": 0.20335620292462409, "rewards/accuracy_reward": 0.3485969334375113, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 783.1976894378662, "epoch": 0.04638129287853899, "grad_norm": 0.003633377980625143, "kl": 0.008260059356689452, "learning_rate": 9.230769230769232e-06, "loss": 0.0003, "reward": 0.41823978765169156, "reward_std": 0.214983982546255, "rewards/accuracy_reward": 0.41823978765169156, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 796.1503639221191, "epoch": 0.06184172383805198, "grad_norm": 0.004207792051316878, "kl": 0.013314247131347656, "learning_rate": 1.230769230769231e-05, "loss": 0.0005, "reward": 0.45267856353893876, "reward_std": 0.22209063512273133, "rewards/accuracy_reward": 0.45267856353893876, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 707.4483289718628, "epoch": 0.07730215479756498, "grad_norm": 0.0039377851262065435, "kl": 0.023092269897460938, "learning_rate": 1.5384615384615387e-05, "loss": 0.0009, "reward": 0.5378826450556516, "reward_std": 0.2228247532621026, "rewards/accuracy_reward": 0.5378826450556516, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 738.098327255249, "epoch": 0.09276258575707798, "grad_norm": 0.004922000110022058, "kl": 0.03480720520019531, "learning_rate": 1.8461538461538465e-05, "loss": 0.0014, "reward": 0.5255101950955577, "reward_std": 0.21384936766698956, "rewards/accuracy_reward": 0.5255101950955577, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 714.9747295379639, "epoch": 0.10822301671659097, "grad_norm": 0.025813508862428863, "kl": 0.11458740234375, "learning_rate": 1.999634547413886e-05, "loss": 0.0046, "reward": 0.5369897866621614, "reward_std": 0.24942327085882426, "rewards/accuracy_reward": 0.5369897866621614, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 729.6705226898193, "epoch": 0.12368344767610397, "grad_norm": 0.004934077059395142, "kl": 0.1241455078125, "learning_rate": 1.9967125291968495e-05, "loss": 0.005, "reward": 0.5182397849857807, "reward_std": 0.22350065293721855, "rewards/accuracy_reward": 0.5182397849857807, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 752.231746673584, "epoch": 0.13914387863561697, "grad_norm": 0.004490175226607394, "kl": 0.1960784912109375, "learning_rate": 1.990877034074683e-05, "loss": 0.0078, "reward": 0.47576529716607185, "reward_std": 0.21953069823794066, "rewards/accuracy_reward": 0.47576529716607185, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 735.3539382934571, "epoch": 0.15460430959512997, "grad_norm": 0.11798471666893984, "kl": 0.30410003662109375, "learning_rate": 1.9821451197042028e-05, "loss": 0.0122, "reward": 0.4843112153466791, "reward_std": 0.22822934831492603, "rewards/accuracy_reward": 0.4843112153466791, "rewards/format_reward": 0.0, "step": 100 }, { "epoch": 0.15460430959512997, "eval_completion_length": 649.0448852539063, "eval_kl": 0.27952473958333335, "eval_loss": 0.009933823719620705, "eval_reward": 0.45850338935852053, "eval_reward_std": 0.26114755471547446, "eval_rewards/accuracy_reward": 0.45850338935852053, "eval_rewards/format_reward": 0.0, "eval_runtime": 88.3141, "eval_samples_per_second": 1.121, "eval_steps_per_second": 0.17, "step": 100 }, { "completion_length": 732.8517738342285, "epoch": 0.17006474055464296, "grad_norm": 0.02615303400330413, "kl": 1.07412109375, "learning_rate": 1.9705423102261324e-05, "loss": 0.043, "reward": 0.3991071363911033, "reward_std": 0.2730441292747855, "rewards/accuracy_reward": 0.3991071363911033, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 792.610315322876, "epoch": 0.18552517151415596, "grad_norm": 0.0287118341101585, "kl": 0.542120361328125, "learning_rate": 1.956102521655831e-05, "loss": 0.0217, "reward": 0.3727040741709061, "reward_std": 0.26482684616930785, "rewards/accuracy_reward": 0.3727040741709061, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 742.854447555542, "epoch": 0.20098560247366895, "grad_norm": 0.026940670011693296, "kl": 0.4710235595703125, "learning_rate": 1.9388679627438486e-05, "loss": 0.0188, "reward": 0.46033162334933875, "reward_std": 0.24005382088944316, "rewards/accuracy_reward": 0.46033162334933875, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 747.0299587249756, "epoch": 0.21644603343318194, "grad_norm": 0.044131709279297326, "kl": 0.3888519287109375, "learning_rate": 1.9188890115960967e-05, "loss": 0.0156, "reward": 0.4734693782404065, "reward_std": 0.2484902088996023, "rewards/accuracy_reward": 0.4734693782404065, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 719.1231998443603, "epoch": 0.23190646439269494, "grad_norm": 0.04685629239096149, "kl": 0.6114959716796875, "learning_rate": 1.8962240684142923e-05, "loss": 0.0245, "reward": 0.4772959094494581, "reward_std": 0.26747574456967416, "rewards/accuracy_reward": 0.4772959094494581, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 752.0598068237305, "epoch": 0.24736689535220793, "grad_norm": 0.00786304561850755, "kl": 0.27758941650390623, "learning_rate": 1.8709393847871146e-05, "loss": 0.0111, "reward": 0.4918367238715291, "reward_std": 0.24101508525200188, "rewards/accuracy_reward": 0.4918367238715291, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 713.9837879180908, "epoch": 0.26282732631172095, "grad_norm": 0.005284057529020878, "kl": 0.22295989990234374, "learning_rate": 1.8431088700310846e-05, "loss": 0.0089, "reward": 0.5107142773456872, "reward_std": 0.2484118543099612, "rewards/accuracy_reward": 0.5107142773456872, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 721.1976871490479, "epoch": 0.27828775727123395, "grad_norm": 0.0059488370043247725, "kl": 0.27528076171875, "learning_rate": 1.8128138751472432e-05, "loss": 0.011, "reward": 0.4794642778113484, "reward_std": 0.28188897627405823, "rewards/accuracy_reward": 0.4794642778113484, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 685.0909267425537, "epoch": 0.29374818823074694, "grad_norm": 0.024776439473654245, "kl": 0.3252960205078125, "learning_rate": 1.780142955025139e-05, "loss": 0.013, "reward": 0.49030611482448877, "reward_std": 0.29441971494816244, "rewards/accuracy_reward": 0.49030611482448877, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 688.9476896286011, "epoch": 0.30920861919025994, "grad_norm": 0.018511152998280257, "kl": 0.407830810546875, "learning_rate": 1.745191609589231e-05, "loss": 0.0163, "reward": 0.4463010121136904, "reward_std": 0.29204082568176093, "rewards/accuracy_reward": 0.4463010121136904, "rewards/format_reward": 0.0, "step": 200 }, { "epoch": 0.30920861919025994, "eval_completion_length": 684.5197184244792, "eval_kl": 0.31907552083333335, "eval_loss": 0.013225244358181953, "eval_reward": 0.5034013529618581, "eval_reward_std": 0.26736749211947125, "eval_rewards/accuracy_reward": 0.5034013529618581, "eval_rewards/format_reward": 0.0, "eval_runtime": 88.6591, "eval_samples_per_second": 1.117, "eval_steps_per_second": 0.169, "step": 200 }, { "completion_length": 688.203684425354, "epoch": 0.32466905014977293, "grad_norm": 0.04166474923994624, "kl": 0.4008056640625, "learning_rate": 1.7080620046443503e-05, "loss": 0.016, "reward": 0.45382652347907426, "reward_std": 0.27682951451279225, "rewards/accuracy_reward": 0.45382652347907426, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 671.052919960022, "epoch": 0.3401294811092859, "grad_norm": 0.03352718681096581, "kl": 0.3749359130859375, "learning_rate": 1.6688626732362192e-05, "loss": 0.015, "reward": 0.4804846870712936, "reward_std": 0.2816257219295949, "rewards/accuracy_reward": 0.4804846870712936, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 643.8374883651734, "epoch": 0.3555899120687989, "grad_norm": 0.020351629255596172, "kl": 0.41683349609375, "learning_rate": 1.6277081983999742e-05, "loss": 0.0167, "reward": 0.4418367271311581, "reward_std": 0.28084711018018427, "rewards/accuracy_reward": 0.4418367271311581, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 676.2223068237305, "epoch": 0.3710503430283119, "grad_norm": 0.05256794474201724, "kl": 0.4444091796875, "learning_rate": 1.5847188782240473e-05, "loss": 0.0178, "reward": 0.4517857059370726, "reward_std": 0.29922230960801244, "rewards/accuracy_reward": 0.4517857059370726, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 668.4691181182861, "epoch": 0.3865107739878249, "grad_norm": 0.03902146766301656, "kl": 0.5255126953125, "learning_rate": 1.5400203742084508e-05, "loss": 0.021, "reward": 0.4383928496390581, "reward_std": 0.2969975466839969, "rewards/accuracy_reward": 0.4383928496390581, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 603.6825130462646, "epoch": 0.4019712049473379, "grad_norm": 0.02207794461900282, "kl": 0.4320556640625, "learning_rate": 1.4937433439453465e-05, "loss": 0.0173, "reward": 0.45331631754525004, "reward_std": 0.29558597495779393, "rewards/accuracy_reward": 0.45331631754525004, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 640.7654218673706, "epoch": 0.4174316359068509, "grad_norm": 0.02435264101678905, "kl": 0.510498046875, "learning_rate": 1.4460230591956097e-05, "loss": 0.0204, "reward": 0.4378826460801065, "reward_std": 0.2932774598710239, "rewards/accuracy_reward": 0.4378826460801065, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 645.673583984375, "epoch": 0.4328920668663639, "grad_norm": 0.08816250567178784, "kl": 0.658056640625, "learning_rate": 1.3969990104777712e-05, "loss": 0.0263, "reward": 0.42423468651250007, "reward_std": 0.2902136994060129, "rewards/accuracy_reward": 0.42423468651250007, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 620.3529205322266, "epoch": 0.4483524978258769, "grad_norm": 0.03880404727089487, "kl": 0.92225341796875, "learning_rate": 1.3468144993251735e-05, "loss": 0.0369, "reward": 0.45433672657236457, "reward_std": 0.29660415309481325, "rewards/accuracy_reward": 0.45433672657236457, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 662.0209051132202, "epoch": 0.4638129287853899, "grad_norm": 0.0580905052695571, "kl": 1.16871337890625, "learning_rate": 1.295616219403197e-05, "loss": 0.0468, "reward": 0.4096938706934452, "reward_std": 0.3028755730483681, "rewards/accuracy_reward": 0.4096938706934452, "rewards/format_reward": 0.0, "step": 300 }, { "epoch": 0.4638129287853899, "eval_completion_length": 604.8503275553386, "eval_kl": 1.15458984375, "eval_loss": 0.04879453405737877, "eval_reward": 0.4707482943932215, "eval_reward_std": 0.27726571063200633, "eval_rewards/accuracy_reward": 0.4707482943932215, "eval_rewards/format_reward": 0.0, "eval_runtime": 85.9413, "eval_samples_per_second": 1.152, "eval_steps_per_second": 0.175, "step": 300 }, { "completion_length": 618.5189908981323, "epoch": 0.47927335974490287, "grad_norm": 0.04382094795523676, "kl": 0.9635986328125, "learning_rate": 1.2435538277109919e-05, "loss": 0.0385, "reward": 0.44183672638610005, "reward_std": 0.28991906996816397, "rewards/accuracy_reward": 0.44183672638610005, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 576.9373609542847, "epoch": 0.49473379070441587, "grad_norm": 0.03238899986450347, "kl": 0.90596923828125, "learning_rate": 1.19077950712113e-05, "loss": 0.0362, "reward": 0.4463010119274259, "reward_std": 0.27278245403431356, "rewards/accuracy_reward": 0.4463010119274259, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 535.9521591186524, "epoch": 0.5101942216639289, "grad_norm": 0.018431343598236234, "kl": 0.5312744140625, "learning_rate": 1.137447521535908e-05, "loss": 0.0213, "reward": 0.5080357047729194, "reward_std": 0.2620480744168162, "rewards/accuracy_reward": 0.5080357047729194, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 646.3938673019409, "epoch": 0.5256546526234419, "grad_norm": 0.02317552951279719, "kl": 0.513262939453125, "learning_rate": 1.0837137649606241e-05, "loss": 0.0205, "reward": 0.44579080818220973, "reward_std": 0.29469514368101957, "rewards/accuracy_reward": 0.44579080818220973, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 586.0761362075806, "epoch": 0.5411150835829549, "grad_norm": 0.016226017261457597, "kl": 0.3769073486328125, "learning_rate": 1.0297353058119209e-05, "loss": 0.0151, "reward": 0.4859693799167871, "reward_std": 0.2713043099734932, "rewards/accuracy_reward": 0.4859693799167871, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 571.9863378524781, "epoch": 0.5565755145424679, "grad_norm": 0.0504216391704931, "kl": 0.599774169921875, "learning_rate": 9.756699277932196e-06, "loss": 0.024, "reward": 0.48443876539822667, "reward_std": 0.28054420156404375, "rewards/accuracy_reward": 0.48443876539822667, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 563.5765197753906, "epoch": 0.5720359455019809, "grad_norm": 0.13990571845181696, "kl": 0.52132568359375, "learning_rate": 9.216756686793163e-06, "loss": 0.0209, "reward": 0.45790815523359923, "reward_std": 0.26586609268561007, "rewards/accuracy_reward": 0.45790815523359923, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 575.829453086853, "epoch": 0.5874963764614939, "grad_norm": 0.013870992065908727, "kl": 0.330126953125, "learning_rate": 8.67910358358298e-06, "loss": 0.0132, "reward": 0.4734693790320307, "reward_std": 0.27416237886063755, "rewards/accuracy_reward": 0.4734693790320307, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 568.1160606384277, "epoch": 0.6029568074210069, "grad_norm": 0.010163464330427247, "kl": 0.2426544189453125, "learning_rate": 8.145311574811325e-06, "loss": 0.0097, "reward": 0.4954081534408033, "reward_std": 0.26710083298385146, "rewards/accuracy_reward": 0.4954081534408033, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 665.6186065673828, "epoch": 0.6184172383805199, "grad_norm": 0.013029334976500356, "kl": 0.321246337890625, "learning_rate": 7.616940980675004e-06, "loss": 0.0128, "reward": 0.4474489719606936, "reward_std": 0.27854115669615565, "rewards/accuracy_reward": 0.4474489719606936, "rewards/format_reward": 0.0, "step": 400 }, { "epoch": 0.6184172383805199, "eval_completion_length": 627.0203979492187, "eval_kl": 0.2732747395833333, "eval_loss": 0.011420400813221931, "eval_reward": 0.5047619010011355, "eval_reward_std": 0.2536137938499451, "eval_rewards/accuracy_reward": 0.5047619010011355, "eval_rewards/format_reward": 0.0, "eval_runtime": 87.816, "eval_samples_per_second": 1.127, "eval_steps_per_second": 0.171, "step": 400 }, { "completion_length": 646.4711584091186, "epoch": 0.6338776693400329, "grad_norm": 0.023099135498393764, "kl": 0.343701171875, "learning_rate": 7.095536274107046e-06, "loss": 0.0137, "reward": 0.4645408088341355, "reward_std": 0.2891133207827806, "rewards/accuracy_reward": 0.4645408088341355, "rewards/format_reward": 0.0, "step": 410 }, { "completion_length": 587.3729467391968, "epoch": 0.6493381002995459, "grad_norm": 0.013467292425479267, "kl": 0.2858642578125, "learning_rate": 6.58262156614881e-06, "loss": 0.0114, "reward": 0.503188765514642, "reward_std": 0.2590713477227837, "rewards/accuracy_reward": 0.503188765514642, "rewards/format_reward": 0.0, "step": 420 }, { "completion_length": 594.2052188873291, "epoch": 0.6647985312590589, "grad_norm": 0.013766538000227693, "kl": 0.285015869140625, "learning_rate": 6.079696150841634e-06, "loss": 0.0114, "reward": 0.4913265212439001, "reward_std": 0.2684762907214463, "rewards/accuracy_reward": 0.4913265212439001, "rewards/format_reward": 0.0, "step": 430 }, { "completion_length": 631.7939949035645, "epoch": 0.6802589622185718, "grad_norm": 0.07419237005783043, "kl": 0.346197509765625, "learning_rate": 5.588230122660672e-06, "loss": 0.0138, "reward": 0.4860969296656549, "reward_std": 0.2773646651767194, "rewards/accuracy_reward": 0.4860969296656549, "rewards/format_reward": 0.0, "step": 440 }, { "completion_length": 586.2428462982177, "epoch": 0.6957193931780848, "grad_norm": 0.015937060852966797, "kl": 0.2478759765625, "learning_rate": 5.109660079301668e-06, "loss": 0.0099, "reward": 0.5220663199201226, "reward_std": 0.2621162030380219, "rewards/accuracy_reward": 0.5220663199201226, "rewards/format_reward": 0.0, "step": 450 }, { "completion_length": 641.8402923583984, "epoch": 0.7111798241375978, "grad_norm": 0.04445799644422213, "kl": 0.3908416748046875, "learning_rate": 4.64538492238166e-06, "loss": 0.0156, "reward": 0.4795918288640678, "reward_std": 0.28201754316687583, "rewards/accuracy_reward": 0.4795918288640678, "rewards/format_reward": 0.0, "step": 460 }, { "completion_length": 648.5068748474121, "epoch": 0.7266402550971108, "grad_norm": 0.018272003157409882, "kl": 0.44942626953125, "learning_rate": 4.196761768328599e-06, "loss": 0.018, "reward": 0.45650509353727103, "reward_std": 0.2825955556239933, "rewards/accuracy_reward": 0.45650509353727103, "rewards/format_reward": 0.0, "step": 470 }, { "completion_length": 632.1244766235352, "epoch": 0.7421006860566238, "grad_norm": 0.03550172747608362, "kl": 0.54378662109375, "learning_rate": 3.7651019814126656e-06, "loss": 0.0218, "reward": 0.4636479509063065, "reward_std": 0.2770009428262711, "rewards/accuracy_reward": 0.4636479509063065, "rewards/format_reward": 0.0, "step": 480 }, { "completion_length": 632.8059818267823, "epoch": 0.7575611170161368, "grad_norm": 0.9437401498024955, "kl": 1.1802734375, "learning_rate": 3.3516673405151546e-06, "loss": 0.0472, "reward": 0.45255101229995487, "reward_std": 0.2746642493642867, "rewards/accuracy_reward": 0.45255101229995487, "rewards/format_reward": 0.0, "step": 490 }, { "completion_length": 586.2049638748169, "epoch": 0.7730215479756498, "grad_norm": 0.11836217149566043, "kl": 0.545013427734375, "learning_rate": 2.957666350839663e-06, "loss": 0.0218, "reward": 0.48941325647756456, "reward_std": 0.263414288777858, "rewards/accuracy_reward": 0.48941325647756456, "rewards/format_reward": 0.0, "step": 500 }, { "epoch": 0.7730215479756498, "eval_completion_length": 551.6952229817708, "eval_kl": 0.28020833333333334, "eval_loss": 0.011639236472547054, "eval_reward": 0.5387754996617635, "eval_reward_std": 0.2534260580937068, "eval_rewards/accuracy_reward": 0.5387754996617635, "eval_rewards/format_reward": 0.0, "eval_runtime": 85.6752, "eval_samples_per_second": 1.156, "eval_steps_per_second": 0.175, "step": 500 }, { "completion_length": 598.6526651382446, "epoch": 0.7884819789351628, "grad_norm": 0.014324651971585192, "kl": 0.284661865234375, "learning_rate": 2.5842507113469307e-06, "loss": 0.0114, "reward": 0.49579080580733714, "reward_std": 0.2849952794611454, "rewards/accuracy_reward": 0.49579080580733714, "rewards/format_reward": 0.0, "step": 510 }, { "completion_length": 602.975754737854, "epoch": 0.8039424098946758, "grad_norm": 0.013302847380980731, "kl": 0.287591552734375, "learning_rate": 2.2325119482391466e-06, "loss": 0.0115, "reward": 0.4869897892698646, "reward_std": 0.28300182512030003, "rewards/accuracy_reward": 0.4869897892698646, "rewards/format_reward": 0.0, "step": 520 }, { "completion_length": 617.0672046661377, "epoch": 0.8194028408541888, "grad_norm": 0.011506624405655665, "kl": 0.316864013671875, "learning_rate": 1.9034782243345074e-06, "loss": 0.0127, "reward": 0.49005101155489683, "reward_std": 0.283530889172107, "rewards/accuracy_reward": 0.49005101155489683, "rewards/format_reward": 0.0, "step": 530 }, { "completion_length": 608.665803527832, "epoch": 0.8348632718137018, "grad_norm": 0.016337273376864313, "kl": 0.557275390625, "learning_rate": 1.5981113336584041e-06, "loss": 0.0223, "reward": 0.49119897168129684, "reward_std": 0.28141105216927825, "rewards/accuracy_reward": 0.49119897168129684, "rewards/format_reward": 0.0, "step": 540 }, { "completion_length": 625.1159269332886, "epoch": 0.8503237027732148, "grad_norm": 0.057861981590189904, "kl": 0.260540771484375, "learning_rate": 1.3173038900362977e-06, "loss": 0.0104, "reward": 0.4499999931082129, "reward_std": 0.27894868138246237, "rewards/accuracy_reward": 0.4499999931082129, "rewards/format_reward": 0.0, "step": 550 }, { "completion_length": 632.5747304916382, "epoch": 0.8657841337327278, "grad_norm": 0.030466073357798586, "kl": 0.293267822265625, "learning_rate": 1.0618767179063416e-06, "loss": 0.0117, "reward": 0.4618622355163097, "reward_std": 0.2925746965222061, "rewards/accuracy_reward": 0.4618622355163097, "rewards/format_reward": 0.0, "step": 560 }, { "completion_length": 641.7084041595459, "epoch": 0.8812445646922408, "grad_norm": 0.017331692484703218, "kl": 0.316583251953125, "learning_rate": 8.325764529785851e-07, "loss": 0.0127, "reward": 0.4483418272808194, "reward_std": 0.28196476846933366, "rewards/accuracy_reward": 0.4483418272808194, "rewards/format_reward": 0.0, "step": 570 }, { "completion_length": 646.2618488311767, "epoch": 0.8967049956517538, "grad_norm": 0.009327065043797485, "kl": 0.303216552734375, "learning_rate": 6.300733597542086e-07, "loss": 0.0121, "reward": 0.44196427753195167, "reward_std": 0.28285656329244374, "rewards/accuracy_reward": 0.44196427753195167, "rewards/format_reward": 0.0, "step": 580 }, { "completion_length": 626.1570028305053, "epoch": 0.9121654266112668, "grad_norm": 0.010427073776751795, "kl": 0.31864013671875, "learning_rate": 4.549593722844492e-07, "loss": 0.0127, "reward": 0.46224488839507105, "reward_std": 0.28828581105917694, "rewards/accuracy_reward": 0.46224488839507105, "rewards/format_reward": 0.0, "step": 590 }, { "completion_length": 628.7367259979249, "epoch": 0.9276258575707798, "grad_norm": 0.01713962537975438, "kl": 0.3144775390625, "learning_rate": 3.0774636389618196e-07, "loss": 0.0126, "reward": 0.4502550953067839, "reward_std": 0.2758318264503032, "rewards/accuracy_reward": 0.4502550953067839, "rewards/format_reward": 0.0, "step": 600 }, { "epoch": 0.9276258575707798, "eval_completion_length": 607.1224405924479, "eval_kl": 0.2647786458333333, "eval_loss": 0.011027935892343521, "eval_reward": 0.4816326439380646, "eval_reward_std": 0.25309162139892577, "eval_rewards/accuracy_reward": 0.4816326439380646, "eval_rewards/format_reward": 0.0, "eval_runtime": 87.0986, "eval_samples_per_second": 1.137, "eval_steps_per_second": 0.172, "step": 600 }, { "completion_length": 633.0672080993652, "epoch": 0.9430862885302927, "grad_norm": 0.022562920761009898, "kl": 0.302935791015625, "learning_rate": 1.8886465094192895e-07, "loss": 0.0121, "reward": 0.46020407443866135, "reward_std": 0.2828258784487844, "rewards/accuracy_reward": 0.46020407443866135, "rewards/format_reward": 0.0, "step": 610 }, { "completion_length": 626.6780488967895, "epoch": 0.9585467194898057, "grad_norm": 0.012428624166763307, "kl": 0.3028564453125, "learning_rate": 9.866173494794462e-08, "loss": 0.0121, "reward": 0.447959177242592, "reward_std": 0.27789597446098924, "rewards/accuracy_reward": 0.447959177242592, "rewards/format_reward": 0.0, "step": 620 }, { "completion_length": 630.8656784057617, "epoch": 0.9740071504493187, "grad_norm": 0.009372661260439265, "kl": 0.305780029296875, "learning_rate": 3.7401286837214224e-08, "loss": 0.0122, "reward": 0.4577806035755202, "reward_std": 0.27292990586720406, "rewards/accuracy_reward": 0.4577806035755202, "rewards/format_reward": 0.0, "step": 630 }, { "completion_length": 624.4659309387207, "epoch": 0.9894675814088317, "grad_norm": 0.012393498840602491, "kl": 0.301348876953125, "learning_rate": 5.262376196544239e-09, "loss": 0.0121, "reward": 0.4693877460435033, "reward_std": 0.27738194689154627, "rewards/accuracy_reward": 0.4693877460435033, "rewards/format_reward": 0.0, "step": 640 }, { "completion_length": 626.0862976710001, "epoch": 0.9987438399845395, "kl": 0.31524658203125, "reward": 0.46088434507449466, "reward_std": 0.28149245004169643, "rewards/accuracy_reward": 0.46088434507449466, "rewards/format_reward": 0.0, "step": 646, "total_flos": 0.0, "train_loss": 0.01602011715643949, "train_runtime": 67938.2282, "train_samples_per_second": 1.066, "train_steps_per_second": 0.01 } ], "logging_steps": 10, "max_steps": 646, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }