{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9987438399845395,
  "eval_steps": 100,
  "global_step": 646,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 890.5919452667237,
      "epoch": 0.015460430959512996,
      "grad_norm": 0.0048114829798690025,
      "kl": 0.0004779815673828125,
      "learning_rate": 3.0769230769230774e-06,
      "loss": 0.0,
      "reward": 0.2079081600648351,
      "reward_std": 0.18389849485829474,
      "rewards/accuracy_reward": 0.2079081600648351,
      "rewards/format_reward": 0.0,
      "step": 10
    },
    {
      "completion_length": 815.0035549163819,
      "epoch": 0.03092086191902599,
      "grad_norm": 0.0037700873930097754,
      "kl": 0.005376839637756347,
      "learning_rate": 6.153846153846155e-06,
      "loss": 0.0002,
      "reward": 0.3485969334375113,
      "reward_std": 0.20335620292462409,
      "rewards/accuracy_reward": 0.3485969334375113,
      "rewards/format_reward": 0.0,
      "step": 20
    },
    {
      "completion_length": 783.1976894378662,
      "epoch": 0.04638129287853899,
      "grad_norm": 0.003633377980625143,
      "kl": 0.008260059356689452,
      "learning_rate": 9.230769230769232e-06,
      "loss": 0.0003,
      "reward": 0.41823978765169156,
      "reward_std": 0.214983982546255,
      "rewards/accuracy_reward": 0.41823978765169156,
      "rewards/format_reward": 0.0,
      "step": 30
    },
    {
      "completion_length": 796.1503639221191,
      "epoch": 0.06184172383805198,
      "grad_norm": 0.004207792051316878,
      "kl": 0.013314247131347656,
      "learning_rate": 1.230769230769231e-05,
      "loss": 0.0005,
      "reward": 0.45267856353893876,
      "reward_std": 0.22209063512273133,
      "rewards/accuracy_reward": 0.45267856353893876,
      "rewards/format_reward": 0.0,
      "step": 40
    },
    {
      "completion_length": 707.4483289718628,
      "epoch": 0.07730215479756498,
      "grad_norm": 0.0039377851262065435,
      "kl": 0.023092269897460938,
      "learning_rate": 1.5384615384615387e-05,
      "loss": 0.0009,
      "reward": 0.5378826450556516,
      "reward_std": 0.2228247532621026,
      "rewards/accuracy_reward": 0.5378826450556516,
      "rewards/format_reward": 0.0,
      "step": 50
    },
    {
      "completion_length": 738.098327255249,
      "epoch": 0.09276258575707798,
      "grad_norm": 0.004922000110022058,
      "kl": 0.03480720520019531,
      "learning_rate": 1.8461538461538465e-05,
      "loss": 0.0014,
      "reward": 0.5255101950955577,
      "reward_std": 0.21384936766698956,
      "rewards/accuracy_reward": 0.5255101950955577,
      "rewards/format_reward": 0.0,
      "step": 60
    },
    {
      "completion_length": 714.9747295379639,
      "epoch": 0.10822301671659097,
      "grad_norm": 0.025813508862428863,
      "kl": 0.11458740234375,
      "learning_rate": 1.999634547413886e-05,
      "loss": 0.0046,
      "reward": 0.5369897866621614,
      "reward_std": 0.24942327085882426,
      "rewards/accuracy_reward": 0.5369897866621614,
      "rewards/format_reward": 0.0,
      "step": 70
    },
    {
      "completion_length": 729.6705226898193,
      "epoch": 0.12368344767610397,
      "grad_norm": 0.004934077059395142,
      "kl": 0.1241455078125,
      "learning_rate": 1.9967125291968495e-05,
      "loss": 0.005,
      "reward": 0.5182397849857807,
      "reward_std": 0.22350065293721855,
      "rewards/accuracy_reward": 0.5182397849857807,
      "rewards/format_reward": 0.0,
      "step": 80
    },
    {
      "completion_length": 752.231746673584,
      "epoch": 0.13914387863561697,
      "grad_norm": 0.004490175226607394,
      "kl": 0.1960784912109375,
      "learning_rate": 1.990877034074683e-05,
      "loss": 0.0078,
      "reward": 0.47576529716607185,
      "reward_std": 0.21953069823794066,
      "rewards/accuracy_reward": 0.47576529716607185,
      "rewards/format_reward": 0.0,
      "step": 90
    },
    {
      "completion_length": 735.3539382934571,
      "epoch": 0.15460430959512997,
      "grad_norm": 0.11798471666893984,
      "kl": 0.30410003662109375,
      "learning_rate": 1.9821451197042028e-05,
      "loss": 0.0122,
      "reward": 0.4843112153466791,
      "reward_std": 0.22822934831492603,
      "rewards/accuracy_reward": 0.4843112153466791,
      "rewards/format_reward": 0.0,
      "step": 100
    },
    {
      "epoch": 0.15460430959512997,
      "eval_completion_length": 649.0448852539063,
      "eval_kl": 0.27952473958333335,
      "eval_loss": 0.009933823719620705,
      "eval_reward": 0.45850338935852053,
      "eval_reward_std": 0.26114755471547446,
      "eval_rewards/accuracy_reward": 0.45850338935852053,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 88.3141,
      "eval_samples_per_second": 1.121,
      "eval_steps_per_second": 0.17,
      "step": 100
    },
    {
      "completion_length": 732.8517738342285,
      "epoch": 0.17006474055464296,
      "grad_norm": 0.02615303400330413,
      "kl": 1.07412109375,
      "learning_rate": 1.9705423102261324e-05,
      "loss": 0.043,
      "reward": 0.3991071363911033,
      "reward_std": 0.2730441292747855,
      "rewards/accuracy_reward": 0.3991071363911033,
      "rewards/format_reward": 0.0,
      "step": 110
    },
    {
      "completion_length": 792.610315322876,
      "epoch": 0.18552517151415596,
      "grad_norm": 0.0287118341101585,
      "kl": 0.542120361328125,
      "learning_rate": 1.956102521655831e-05,
      "loss": 0.0217,
      "reward": 0.3727040741709061,
      "reward_std": 0.26482684616930785,
      "rewards/accuracy_reward": 0.3727040741709061,
      "rewards/format_reward": 0.0,
      "step": 120
    },
    {
      "completion_length": 742.854447555542,
      "epoch": 0.20098560247366895,
      "grad_norm": 0.026940670011693296,
      "kl": 0.4710235595703125,
      "learning_rate": 1.9388679627438486e-05,
      "loss": 0.0188,
      "reward": 0.46033162334933875,
      "reward_std": 0.24005382088944316,
      "rewards/accuracy_reward": 0.46033162334933875,
      "rewards/format_reward": 0.0,
      "step": 130
    },
    {
      "completion_length": 747.0299587249756,
      "epoch": 0.21644603343318194,
      "grad_norm": 0.044131709279297326,
      "kl": 0.3888519287109375,
      "learning_rate": 1.9188890115960967e-05,
      "loss": 0.0156,
      "reward": 0.4734693782404065,
      "reward_std": 0.2484902088996023,
      "rewards/accuracy_reward": 0.4734693782404065,
      "rewards/format_reward": 0.0,
      "step": 140
    },
    {
      "completion_length": 719.1231998443603,
      "epoch": 0.23190646439269494,
      "grad_norm": 0.04685629239096149,
      "kl": 0.6114959716796875,
      "learning_rate": 1.8962240684142923e-05,
      "loss": 0.0245,
      "reward": 0.4772959094494581,
      "reward_std": 0.26747574456967416,
      "rewards/accuracy_reward": 0.4772959094494581,
      "rewards/format_reward": 0.0,
      "step": 150
    },
    {
      "completion_length": 752.0598068237305,
      "epoch": 0.24736689535220793,
      "grad_norm": 0.00786304561850755,
      "kl": 0.27758941650390623,
      "learning_rate": 1.8709393847871146e-05,
      "loss": 0.0111,
      "reward": 0.4918367238715291,
      "reward_std": 0.24101508525200188,
      "rewards/accuracy_reward": 0.4918367238715291,
      "rewards/format_reward": 0.0,
      "step": 160
    },
    {
      "completion_length": 713.9837879180908,
      "epoch": 0.26282732631172095,
      "grad_norm": 0.005284057529020878,
      "kl": 0.22295989990234374,
      "learning_rate": 1.8431088700310846e-05,
      "loss": 0.0089,
      "reward": 0.5107142773456872,
      "reward_std": 0.2484118543099612,
      "rewards/accuracy_reward": 0.5107142773456872,
      "rewards/format_reward": 0.0,
      "step": 170
    },
    {
      "completion_length": 721.1976871490479,
      "epoch": 0.27828775727123395,
      "grad_norm": 0.0059488370043247725,
      "kl": 0.27528076171875,
      "learning_rate": 1.8128138751472432e-05,
      "loss": 0.011,
      "reward": 0.4794642778113484,
      "reward_std": 0.28188897627405823,
      "rewards/accuracy_reward": 0.4794642778113484,
      "rewards/format_reward": 0.0,
      "step": 180
    },
    {
      "completion_length": 685.0909267425537,
      "epoch": 0.29374818823074694,
      "grad_norm": 0.024776439473654245,
      "kl": 0.3252960205078125,
      "learning_rate": 1.780142955025139e-05,
      "loss": 0.013,
      "reward": 0.49030611482448877,
      "reward_std": 0.29441971494816244,
      "rewards/accuracy_reward": 0.49030611482448877,
      "rewards/format_reward": 0.0,
      "step": 190
    },
    {
      "completion_length": 688.9476896286011,
      "epoch": 0.30920861919025994,
      "grad_norm": 0.018511152998280257,
      "kl": 0.407830810546875,
      "learning_rate": 1.745191609589231e-05,
      "loss": 0.0163,
      "reward": 0.4463010121136904,
      "reward_std": 0.29204082568176093,
      "rewards/accuracy_reward": 0.4463010121136904,
      "rewards/format_reward": 0.0,
      "step": 200
    },
    {
      "epoch": 0.30920861919025994,
      "eval_completion_length": 684.5197184244792,
      "eval_kl": 0.31907552083333335,
      "eval_loss": 0.013225244358181953,
      "eval_reward": 0.5034013529618581,
      "eval_reward_std": 0.26736749211947125,
      "eval_rewards/accuracy_reward": 0.5034013529618581,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 88.6591,
      "eval_samples_per_second": 1.117,
      "eval_steps_per_second": 0.169,
      "step": 200
    },
    {
      "completion_length": 688.203684425354,
      "epoch": 0.32466905014977293,
      "grad_norm": 0.04166474923994624,
      "kl": 0.4008056640625,
      "learning_rate": 1.7080620046443503e-05,
      "loss": 0.016,
      "reward": 0.45382652347907426,
      "reward_std": 0.27682951451279225,
      "rewards/accuracy_reward": 0.45382652347907426,
      "rewards/format_reward": 0.0,
      "step": 210
    },
    {
      "completion_length": 671.052919960022,
      "epoch": 0.3401294811092859,
      "grad_norm": 0.03352718681096581,
      "kl": 0.3749359130859375,
      "learning_rate": 1.6688626732362192e-05,
      "loss": 0.015,
      "reward": 0.4804846870712936,
      "reward_std": 0.2816257219295949,
      "rewards/accuracy_reward": 0.4804846870712936,
      "rewards/format_reward": 0.0,
      "step": 220
    },
    {
      "completion_length": 643.8374883651734,
      "epoch": 0.3555899120687989,
      "grad_norm": 0.020351629255596172,
      "kl": 0.41683349609375,
      "learning_rate": 1.6277081983999742e-05,
      "loss": 0.0167,
      "reward": 0.4418367271311581,
      "reward_std": 0.28084711018018427,
      "rewards/accuracy_reward": 0.4418367271311581,
      "rewards/format_reward": 0.0,
      "step": 230
    },
    {
      "completion_length": 676.2223068237305,
      "epoch": 0.3710503430283119,
      "grad_norm": 0.05256794474201724,
      "kl": 0.4444091796875,
      "learning_rate": 1.5847188782240473e-05,
      "loss": 0.0178,
      "reward": 0.4517857059370726,
      "reward_std": 0.29922230960801244,
      "rewards/accuracy_reward": 0.4517857059370726,
      "rewards/format_reward": 0.0,
      "step": 240
    },
    {
      "completion_length": 668.4691181182861,
      "epoch": 0.3865107739878249,
      "grad_norm": 0.03902146766301656,
      "kl": 0.5255126953125,
      "learning_rate": 1.5400203742084508e-05,
      "loss": 0.021,
      "reward": 0.4383928496390581,
      "reward_std": 0.2969975466839969,
      "rewards/accuracy_reward": 0.4383928496390581,
      "rewards/format_reward": 0.0,
      "step": 250
    },
    {
      "completion_length": 603.6825130462646,
      "epoch": 0.4019712049473379,
      "grad_norm": 0.02207794461900282,
      "kl": 0.4320556640625,
      "learning_rate": 1.4937433439453465e-05,
      "loss": 0.0173,
      "reward": 0.45331631754525004,
      "reward_std": 0.29558597495779393,
      "rewards/accuracy_reward": 0.45331631754525004,
      "rewards/format_reward": 0.0,
      "step": 260
    },
    {
      "completion_length": 640.7654218673706,
      "epoch": 0.4174316359068509,
      "grad_norm": 0.02435264101678905,
      "kl": 0.510498046875,
      "learning_rate": 1.4460230591956097e-05,
      "loss": 0.0204,
      "reward": 0.4378826460801065,
      "reward_std": 0.2932774598710239,
      "rewards/accuracy_reward": 0.4378826460801065,
      "rewards/format_reward": 0.0,
      "step": 270
    },
    {
      "completion_length": 645.673583984375,
      "epoch": 0.4328920668663639,
      "grad_norm": 0.08816250567178784,
      "kl": 0.658056640625,
      "learning_rate": 1.3969990104777712e-05,
      "loss": 0.0263,
      "reward": 0.42423468651250007,
      "reward_std": 0.2902136994060129,
      "rewards/accuracy_reward": 0.42423468651250007,
      "rewards/format_reward": 0.0,
      "step": 280
    },
    {
      "completion_length": 620.3529205322266,
      "epoch": 0.4483524978258769,
      "grad_norm": 0.03880404727089487,
      "kl": 0.92225341796875,
      "learning_rate": 1.3468144993251735e-05,
      "loss": 0.0369,
      "reward": 0.45433672657236457,
      "reward_std": 0.29660415309481325,
      "rewards/accuracy_reward": 0.45433672657236457,
      "rewards/format_reward": 0.0,
      "step": 290
    },
    {
      "completion_length": 662.0209051132202,
      "epoch": 0.4638129287853899,
      "grad_norm": 0.0580905052695571,
      "kl": 1.16871337890625,
      "learning_rate": 1.295616219403197e-05,
      "loss": 0.0468,
      "reward": 0.4096938706934452,
      "reward_std": 0.3028755730483681,
      "rewards/accuracy_reward": 0.4096938706934452,
      "rewards/format_reward": 0.0,
      "step": 300
    },
    {
      "epoch": 0.4638129287853899,
      "eval_completion_length": 604.8503275553386,
      "eval_kl": 1.15458984375,
      "eval_loss": 0.04879453405737877,
      "eval_reward": 0.4707482943932215,
      "eval_reward_std": 0.27726571063200633,
      "eval_rewards/accuracy_reward": 0.4707482943932215,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 85.9413,
      "eval_samples_per_second": 1.152,
      "eval_steps_per_second": 0.175,
      "step": 300
    },
    {
      "completion_length": 618.5189908981323,
      "epoch": 0.47927335974490287,
      "grad_norm": 0.04382094795523676,
      "kl": 0.9635986328125,
      "learning_rate": 1.2435538277109919e-05,
      "loss": 0.0385,
      "reward": 0.44183672638610005,
      "reward_std": 0.28991906996816397,
      "rewards/accuracy_reward": 0.44183672638610005,
      "rewards/format_reward": 0.0,
      "step": 310
    },
    {
      "completion_length": 576.9373609542847,
      "epoch": 0.49473379070441587,
      "grad_norm": 0.03238899986450347,
      "kl": 0.90596923828125,
      "learning_rate": 1.19077950712113e-05,
      "loss": 0.0362,
      "reward": 0.4463010119274259,
      "reward_std": 0.27278245403431356,
      "rewards/accuracy_reward": 0.4463010119274259,
      "rewards/format_reward": 0.0,
      "step": 320
    },
    {
      "completion_length": 535.9521591186524,
      "epoch": 0.5101942216639289,
      "grad_norm": 0.018431343598236234,
      "kl": 0.5312744140625,
      "learning_rate": 1.137447521535908e-05,
      "loss": 0.0213,
      "reward": 0.5080357047729194,
      "reward_std": 0.2620480744168162,
      "rewards/accuracy_reward": 0.5080357047729194,
      "rewards/format_reward": 0.0,
      "step": 330
    },
    {
      "completion_length": 646.3938673019409,
      "epoch": 0.5256546526234419,
      "grad_norm": 0.02317552951279719,
      "kl": 0.513262939453125,
      "learning_rate": 1.0837137649606241e-05,
      "loss": 0.0205,
      "reward": 0.44579080818220973,
      "reward_std": 0.29469514368101957,
      "rewards/accuracy_reward": 0.44579080818220973,
      "rewards/format_reward": 0.0,
      "step": 340
    },
    {
      "completion_length": 586.0761362075806,
      "epoch": 0.5411150835829549,
      "grad_norm": 0.016226017261457597,
      "kl": 0.3769073486328125,
      "learning_rate": 1.0297353058119209e-05,
      "loss": 0.0151,
      "reward": 0.4859693799167871,
      "reward_std": 0.2713043099734932,
      "rewards/accuracy_reward": 0.4859693799167871,
      "rewards/format_reward": 0.0,
      "step": 350
    },
    {
      "completion_length": 571.9863378524781,
      "epoch": 0.5565755145424679,
      "grad_norm": 0.0504216391704931,
      "kl": 0.599774169921875,
      "learning_rate": 9.756699277932196e-06,
      "loss": 0.024,
      "reward": 0.48443876539822667,
      "reward_std": 0.28054420156404375,
      "rewards/accuracy_reward": 0.48443876539822667,
      "rewards/format_reward": 0.0,
      "step": 360
    },
    {
      "completion_length": 563.5765197753906,
      "epoch": 0.5720359455019809,
      "grad_norm": 0.13990571845181696,
      "kl": 0.52132568359375,
      "learning_rate": 9.216756686793163e-06,
      "loss": 0.0209,
      "reward": 0.45790815523359923,
      "reward_std": 0.26586609268561007,
      "rewards/accuracy_reward": 0.45790815523359923,
      "rewards/format_reward": 0.0,
      "step": 370
    },
    {
      "completion_length": 575.829453086853,
      "epoch": 0.5874963764614939,
      "grad_norm": 0.013870992065908727,
      "kl": 0.330126953125,
      "learning_rate": 8.67910358358298e-06,
      "loss": 0.0132,
      "reward": 0.4734693790320307,
      "reward_std": 0.27416237886063755,
      "rewards/accuracy_reward": 0.4734693790320307,
      "rewards/format_reward": 0.0,
      "step": 380
    },
    {
      "completion_length": 568.1160606384277,
      "epoch": 0.6029568074210069,
      "grad_norm": 0.010163464330427247,
      "kl": 0.2426544189453125,
      "learning_rate": 8.145311574811325e-06,
      "loss": 0.0097,
      "reward": 0.4954081534408033,
      "reward_std": 0.26710083298385146,
      "rewards/accuracy_reward": 0.4954081534408033,
      "rewards/format_reward": 0.0,
      "step": 390
    },
    {
      "completion_length": 665.6186065673828,
      "epoch": 0.6184172383805199,
      "grad_norm": 0.013029334976500356,
      "kl": 0.321246337890625,
      "learning_rate": 7.616940980675004e-06,
      "loss": 0.0128,
      "reward": 0.4474489719606936,
      "reward_std": 0.27854115669615565,
      "rewards/accuracy_reward": 0.4474489719606936,
      "rewards/format_reward": 0.0,
      "step": 400
    },
    {
      "epoch": 0.6184172383805199,
      "eval_completion_length": 627.0203979492187,
      "eval_kl": 0.2732747395833333,
      "eval_loss": 0.011420400813221931,
      "eval_reward": 0.5047619010011355,
      "eval_reward_std": 0.2536137938499451,
      "eval_rewards/accuracy_reward": 0.5047619010011355,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 87.816,
      "eval_samples_per_second": 1.127,
      "eval_steps_per_second": 0.171,
      "step": 400
    },
    {
      "completion_length": 646.4711584091186,
      "epoch": 0.6338776693400329,
      "grad_norm": 0.023099135498393764,
      "kl": 0.343701171875,
      "learning_rate": 7.095536274107046e-06,
      "loss": 0.0137,
      "reward": 0.4645408088341355,
      "reward_std": 0.2891133207827806,
      "rewards/accuracy_reward": 0.4645408088341355,
      "rewards/format_reward": 0.0,
      "step": 410
    },
    {
      "completion_length": 587.3729467391968,
      "epoch": 0.6493381002995459,
      "grad_norm": 0.013467292425479267,
      "kl": 0.2858642578125,
      "learning_rate": 6.58262156614881e-06,
      "loss": 0.0114,
      "reward": 0.503188765514642,
      "reward_std": 0.2590713477227837,
      "rewards/accuracy_reward": 0.503188765514642,
      "rewards/format_reward": 0.0,
      "step": 420
    },
    {
      "completion_length": 594.2052188873291,
      "epoch": 0.6647985312590589,
      "grad_norm": 0.013766538000227693,
      "kl": 0.285015869140625,
      "learning_rate": 6.079696150841634e-06,
      "loss": 0.0114,
      "reward": 0.4913265212439001,
      "reward_std": 0.2684762907214463,
      "rewards/accuracy_reward": 0.4913265212439001,
      "rewards/format_reward": 0.0,
      "step": 430
    },
    {
      "completion_length": 631.7939949035645,
      "epoch": 0.6802589622185718,
      "grad_norm": 0.07419237005783043,
      "kl": 0.346197509765625,
      "learning_rate": 5.588230122660672e-06,
      "loss": 0.0138,
      "reward": 0.4860969296656549,
      "reward_std": 0.2773646651767194,
      "rewards/accuracy_reward": 0.4860969296656549,
      "rewards/format_reward": 0.0,
      "step": 440
    },
    {
      "completion_length": 586.2428462982177,
      "epoch": 0.6957193931780848,
      "grad_norm": 0.015937060852966797,
      "kl": 0.2478759765625,
      "learning_rate": 5.109660079301668e-06,
      "loss": 0.0099,
      "reward": 0.5220663199201226,
      "reward_std": 0.2621162030380219,
      "rewards/accuracy_reward": 0.5220663199201226,
      "rewards/format_reward": 0.0,
      "step": 450
    },
    {
      "completion_length": 641.8402923583984,
      "epoch": 0.7111798241375978,
      "grad_norm": 0.04445799644422213,
      "kl": 0.3908416748046875,
      "learning_rate": 4.64538492238166e-06,
      "loss": 0.0156,
      "reward": 0.4795918288640678,
      "reward_std": 0.28201754316687583,
      "rewards/accuracy_reward": 0.4795918288640678,
      "rewards/format_reward": 0.0,
      "step": 460
    },
    {
      "completion_length": 648.5068748474121,
      "epoch": 0.7266402550971108,
      "grad_norm": 0.018272003157409882,
      "kl": 0.44942626953125,
      "learning_rate": 4.196761768328599e-06,
      "loss": 0.018,
      "reward": 0.45650509353727103,
      "reward_std": 0.2825955556239933,
      "rewards/accuracy_reward": 0.45650509353727103,
      "rewards/format_reward": 0.0,
      "step": 470
    },
    {
      "completion_length": 632.1244766235352,
      "epoch": 0.7421006860566238,
      "grad_norm": 0.03550172747608362,
      "kl": 0.54378662109375,
      "learning_rate": 3.7651019814126656e-06,
      "loss": 0.0218,
      "reward": 0.4636479509063065,
      "reward_std": 0.2770009428262711,
      "rewards/accuracy_reward": 0.4636479509063065,
      "rewards/format_reward": 0.0,
      "step": 480
    },
    {
      "completion_length": 632.8059818267823,
      "epoch": 0.7575611170161368,
      "grad_norm": 0.9437401498024955,
      "kl": 1.1802734375,
      "learning_rate": 3.3516673405151546e-06,
      "loss": 0.0472,
      "reward": 0.45255101229995487,
      "reward_std": 0.2746642493642867,
      "rewards/accuracy_reward": 0.45255101229995487,
      "rewards/format_reward": 0.0,
      "step": 490
    },
    {
      "completion_length": 586.2049638748169,
      "epoch": 0.7730215479756498,
      "grad_norm": 0.11836217149566043,
      "kl": 0.545013427734375,
      "learning_rate": 2.957666350839663e-06,
      "loss": 0.0218,
      "reward": 0.48941325647756456,
      "reward_std": 0.263414288777858,
      "rewards/accuracy_reward": 0.48941325647756456,
      "rewards/format_reward": 0.0,
      "step": 500
    },
    {
      "epoch": 0.7730215479756498,
      "eval_completion_length": 551.6952229817708,
      "eval_kl": 0.28020833333333334,
      "eval_loss": 0.011639236472547054,
      "eval_reward": 0.5387754996617635,
      "eval_reward_std": 0.2534260580937068,
      "eval_rewards/accuracy_reward": 0.5387754996617635,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 85.6752,
      "eval_samples_per_second": 1.156,
      "eval_steps_per_second": 0.175,
      "step": 500
    },
    {
      "completion_length": 598.6526651382446,
      "epoch": 0.7884819789351628,
      "grad_norm": 0.014324651971585192,
      "kl": 0.284661865234375,
      "learning_rate": 2.5842507113469307e-06,
      "loss": 0.0114,
      "reward": 0.49579080580733714,
      "reward_std": 0.2849952794611454,
      "rewards/accuracy_reward": 0.49579080580733714,
      "rewards/format_reward": 0.0,
      "step": 510
    },
    {
      "completion_length": 602.975754737854,
      "epoch": 0.8039424098946758,
      "grad_norm": 0.013302847380980731,
      "kl": 0.287591552734375,
      "learning_rate": 2.2325119482391466e-06,
      "loss": 0.0115,
      "reward": 0.4869897892698646,
      "reward_std": 0.28300182512030003,
      "rewards/accuracy_reward": 0.4869897892698646,
      "rewards/format_reward": 0.0,
      "step": 520
    },
    {
      "completion_length": 617.0672046661377,
      "epoch": 0.8194028408541888,
      "grad_norm": 0.011506624405655665,
      "kl": 0.316864013671875,
      "learning_rate": 1.9034782243345074e-06,
      "loss": 0.0127,
      "reward": 0.49005101155489683,
      "reward_std": 0.283530889172107,
      "rewards/accuracy_reward": 0.49005101155489683,
      "rewards/format_reward": 0.0,
      "step": 530
    },
    {
      "completion_length": 608.665803527832,
      "epoch": 0.8348632718137018,
      "grad_norm": 0.016337273376864313,
      "kl": 0.557275390625,
      "learning_rate": 1.5981113336584041e-06,
      "loss": 0.0223,
      "reward": 0.49119897168129684,
      "reward_std": 0.28141105216927825,
      "rewards/accuracy_reward": 0.49119897168129684,
      "rewards/format_reward": 0.0,
      "step": 540
    },
    {
      "completion_length": 625.1159269332886,
      "epoch": 0.8503237027732148,
      "grad_norm": 0.057861981590189904,
      "kl": 0.260540771484375,
      "learning_rate": 1.3173038900362977e-06,
      "loss": 0.0104,
      "reward": 0.4499999931082129,
      "reward_std": 0.27894868138246237,
      "rewards/accuracy_reward": 0.4499999931082129,
      "rewards/format_reward": 0.0,
      "step": 550
    },
    {
      "completion_length": 632.5747304916382,
      "epoch": 0.8657841337327278,
      "grad_norm": 0.030466073357798586,
      "kl": 0.293267822265625,
      "learning_rate": 1.0618767179063416e-06,
      "loss": 0.0117,
      "reward": 0.4618622355163097,
      "reward_std": 0.2925746965222061,
      "rewards/accuracy_reward": 0.4618622355163097,
      "rewards/format_reward": 0.0,
      "step": 560
    },
    {
      "completion_length": 641.7084041595459,
      "epoch": 0.8812445646922408,
      "grad_norm": 0.017331692484703218,
      "kl": 0.316583251953125,
      "learning_rate": 8.325764529785851e-07,
      "loss": 0.0127,
      "reward": 0.4483418272808194,
      "reward_std": 0.28196476846933366,
      "rewards/accuracy_reward": 0.4483418272808194,
      "rewards/format_reward": 0.0,
      "step": 570
    },
    {
      "completion_length": 646.2618488311767,
      "epoch": 0.8967049956517538,
      "grad_norm": 0.009327065043797485,
      "kl": 0.303216552734375,
      "learning_rate": 6.300733597542086e-07,
      "loss": 0.0121,
      "reward": 0.44196427753195167,
      "reward_std": 0.28285656329244374,
      "rewards/accuracy_reward": 0.44196427753195167,
      "rewards/format_reward": 0.0,
      "step": 580
    },
    {
      "completion_length": 626.1570028305053,
      "epoch": 0.9121654266112668,
      "grad_norm": 0.010427073776751795,
      "kl": 0.31864013671875,
      "learning_rate": 4.549593722844492e-07,
      "loss": 0.0127,
      "reward": 0.46224488839507105,
      "reward_std": 0.28828581105917694,
      "rewards/accuracy_reward": 0.46224488839507105,
      "rewards/format_reward": 0.0,
      "step": 590
    },
    {
      "completion_length": 628.7367259979249,
      "epoch": 0.9276258575707798,
      "grad_norm": 0.01713962537975438,
      "kl": 0.3144775390625,
      "learning_rate": 3.0774636389618196e-07,
      "loss": 0.0126,
      "reward": 0.4502550953067839,
      "reward_std": 0.2758318264503032,
      "rewards/accuracy_reward": 0.4502550953067839,
      "rewards/format_reward": 0.0,
      "step": 600
    },
    {
      "epoch": 0.9276258575707798,
      "eval_completion_length": 607.1224405924479,
      "eval_kl": 0.2647786458333333,
      "eval_loss": 0.011027935892343521,
      "eval_reward": 0.4816326439380646,
      "eval_reward_std": 0.25309162139892577,
      "eval_rewards/accuracy_reward": 0.4816326439380646,
      "eval_rewards/format_reward": 0.0,
      "eval_runtime": 87.0986,
      "eval_samples_per_second": 1.137,
      "eval_steps_per_second": 0.172,
      "step": 600
    },
    {
      "completion_length": 633.0672080993652,
      "epoch": 0.9430862885302927,
      "grad_norm": 0.022562920761009898,
      "kl": 0.302935791015625,
      "learning_rate": 1.8886465094192895e-07,
      "loss": 0.0121,
      "reward": 0.46020407443866135,
      "reward_std": 0.2828258784487844,
      "rewards/accuracy_reward": 0.46020407443866135,
      "rewards/format_reward": 0.0,
      "step": 610
    },
    {
      "completion_length": 626.6780488967895,
      "epoch": 0.9585467194898057,
      "grad_norm": 0.012428624166763307,
      "kl": 0.3028564453125,
      "learning_rate": 9.866173494794462e-08,
      "loss": 0.0121,
      "reward": 0.447959177242592,
      "reward_std": 0.27789597446098924,
      "rewards/accuracy_reward": 0.447959177242592,
      "rewards/format_reward": 0.0,
      "step": 620
    },
    {
      "completion_length": 630.8656784057617,
      "epoch": 0.9740071504493187,
      "grad_norm": 0.009372661260439265,
      "kl": 0.305780029296875,
      "learning_rate": 3.7401286837214224e-08,
      "loss": 0.0122,
      "reward": 0.4577806035755202,
      "reward_std": 0.27292990586720406,
      "rewards/accuracy_reward": 0.4577806035755202,
      "rewards/format_reward": 0.0,
      "step": 630
    },
    {
      "completion_length": 624.4659309387207,
      "epoch": 0.9894675814088317,
      "grad_norm": 0.012393498840602491,
      "kl": 0.301348876953125,
      "learning_rate": 5.262376196544239e-09,
      "loss": 0.0121,
      "reward": 0.4693877460435033,
      "reward_std": 0.27738194689154627,
      "rewards/accuracy_reward": 0.4693877460435033,
      "rewards/format_reward": 0.0,
      "step": 640
    },
    {
      "completion_length": 626.0862976710001,
      "epoch": 0.9987438399845395,
      "kl": 0.31524658203125,
      "reward": 0.46088434507449466,
      "reward_std": 0.28149245004169643,
      "rewards/accuracy_reward": 0.46088434507449466,
      "rewards/format_reward": 0.0,
      "step": 646,
      "total_flos": 0.0,
      "train_loss": 0.01602011715643949,
      "train_runtime": 67938.2282,
      "train_samples_per_second": 1.066,
      "train_steps_per_second": 0.01
    }
  ],
  "logging_steps": 10,
  "max_steps": 646,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
|