|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9984, |
|
"eval_steps": 100, |
|
"global_step": 468, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 633.2446681976319, |
|
"epoch": 0.010666666666666666, |
|
"grad_norm": 2.2443933486938477, |
|
"kl": 0.00011417865753173828, |
|
"learning_rate": 3.1914893617021275e-07, |
|
"loss": 0.0, |
|
"reward": 1.138736367225647, |
|
"reward_std": 0.8278621450066567, |
|
"rewards/accuracy_reward": 0.5946428831666708, |
|
"rewards/cosine_scaled_reward": 0.2899268216686323, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.25416668243706225, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 600.8857383728027, |
|
"epoch": 0.021333333333333333, |
|
"grad_norm": 5.001251220703125, |
|
"kl": 0.00020779371261596679, |
|
"learning_rate": 6.382978723404255e-07, |
|
"loss": 0.0, |
|
"reward": 1.2528822764754295, |
|
"reward_std": 0.8592379853129387, |
|
"rewards/accuracy_reward": 0.6553571775555611, |
|
"rewards/cosine_scaled_reward": 0.34097747248015364, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.25654763616621495, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 601.8518112182617, |
|
"epoch": 0.032, |
|
"grad_norm": 3.453845500946045, |
|
"kl": 0.00034580230712890627, |
|
"learning_rate": 9.574468085106384e-07, |
|
"loss": 0.0, |
|
"reward": 1.2825960636138916, |
|
"reward_std": 0.7762525148689747, |
|
"rewards/accuracy_reward": 0.6642857484519482, |
|
"rewards/cosine_scaled_reward": 0.3486674582702108, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.26964287189766767, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 620.7839553833007, |
|
"epoch": 0.042666666666666665, |
|
"grad_norm": 63.01131057739258, |
|
"kl": 0.001246500015258789, |
|
"learning_rate": 1.276595744680851e-06, |
|
"loss": 0.0001, |
|
"reward": 1.2914750523865224, |
|
"reward_std": 0.7945833645761013, |
|
"rewards/accuracy_reward": 0.6571428865194321, |
|
"rewards/cosine_scaled_reward": 0.3593321413063677, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.2750000203028321, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 639.3946762084961, |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 1.1951252222061157, |
|
"kl": 0.001938199996948242, |
|
"learning_rate": 1.5957446808510639e-06, |
|
"loss": 0.0001, |
|
"reward": 1.2197763450443744, |
|
"reward_std": 0.7964548453688621, |
|
"rewards/accuracy_reward": 0.6285714630037547, |
|
"rewards/cosine_scaled_reward": 0.323942980915308, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.2672619212418795, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 645.9482414245606, |
|
"epoch": 0.064, |
|
"grad_norm": 0.5322187542915344, |
|
"kl": 0.0028698921203613283, |
|
"learning_rate": 1.9148936170212767e-06, |
|
"loss": 0.0001, |
|
"reward": 1.34233574308455, |
|
"reward_std": 0.7051636058837175, |
|
"rewards/accuracy_reward": 0.6821428902447224, |
|
"rewards/cosine_scaled_reward": 0.38400235488079487, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.2761904950253665, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 630.1071678161621, |
|
"epoch": 0.07466666666666667, |
|
"grad_norm": 0.686019241809845, |
|
"kl": 0.00424489974975586, |
|
"learning_rate": 2.2340425531914894e-06, |
|
"loss": 0.0002, |
|
"reward": 1.2706220560474322, |
|
"reward_std": 0.7081292014569044, |
|
"rewards/accuracy_reward": 0.6839286010712385, |
|
"rewards/cosine_scaled_reward": 0.34145535016432405, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.2452381114475429, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 663.8464553833007, |
|
"epoch": 0.08533333333333333, |
|
"grad_norm": 10619385856.0, |
|
"kl": 11324620.806011772, |
|
"learning_rate": 2.553191489361702e-06, |
|
"loss": 453134.65, |
|
"reward": 1.4818414891138674, |
|
"reward_std": 0.724718413501978, |
|
"rewards/accuracy_reward": 0.7196428954601288, |
|
"rewards/cosine_scaled_reward": 0.43124618427827954, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.3309524044394493, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 636.5178840637207, |
|
"epoch": 0.096, |
|
"grad_norm": 0.4083445370197296, |
|
"kl": 0.1388763427734375, |
|
"learning_rate": 2.872340425531915e-06, |
|
"loss": 0.0055, |
|
"reward": 1.5206772923469543, |
|
"reward_std": 0.6890950493514538, |
|
"rewards/accuracy_reward": 0.7428571715950966, |
|
"rewards/cosine_scaled_reward": 0.4444867596961558, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.3333333550952375, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 624.0178833007812, |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.6491600275039673, |
|
"kl": 0.014713478088378907, |
|
"learning_rate": 2.9996241442585123e-06, |
|
"loss": 0.0006, |
|
"reward": 1.5073627218604089, |
|
"reward_std": 0.7132997542619706, |
|
"rewards/accuracy_reward": 0.712500025331974, |
|
"rewards/cosine_scaled_reward": 0.41093407664448023, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.38392860516905786, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 631.5339569091797, |
|
"epoch": 0.11733333333333333, |
|
"grad_norm": 0.7147920727729797, |
|
"kl": 0.007195663452148437, |
|
"learning_rate": 2.9973279301399446e-06, |
|
"loss": 0.0003, |
|
"reward": 1.5377919152379036, |
|
"reward_std": 0.76092077344656, |
|
"rewards/accuracy_reward": 0.7232143200933934, |
|
"rewards/cosine_scaled_reward": 0.4282680474221706, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.386309552192688, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 627.9214561462402, |
|
"epoch": 0.128, |
|
"grad_norm": 0.8942143321037292, |
|
"kl": 0.008642578125, |
|
"learning_rate": 2.992947502998804e-06, |
|
"loss": 0.0003, |
|
"reward": 1.6543699458241463, |
|
"reward_std": 0.7264986954629421, |
|
"rewards/accuracy_reward": 0.7214285999536514, |
|
"rewards/cosine_scaled_reward": 0.40972703909501434, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.5232143249362707, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 633.0232421875, |
|
"epoch": 0.13866666666666666, |
|
"grad_norm": 6.921348571777344, |
|
"kl": 0.01439208984375, |
|
"learning_rate": 2.9864889601923268e-06, |
|
"loss": 0.0006, |
|
"reward": 1.7206872910261155, |
|
"reward_std": 0.7344334974884987, |
|
"rewards/accuracy_reward": 0.725000036507845, |
|
"rewards/cosine_scaled_reward": 0.43497296012938025, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.5607143200933933, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 656.7178894042969, |
|
"epoch": 0.14933333333333335, |
|
"grad_norm": 0.6442045569419861, |
|
"kl": 0.01673736572265625, |
|
"learning_rate": 2.977961291721137e-06, |
|
"loss": 0.0007, |
|
"reward": 1.8801582887768746, |
|
"reward_std": 0.7263622097671032, |
|
"rewards/accuracy_reward": 0.7571428894996644, |
|
"rewards/cosine_scaled_reward": 0.47301534870639445, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.6500000521540642, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 619.4536033630371, |
|
"epoch": 0.16, |
|
"grad_norm": 1.7239394187927246, |
|
"kl": 0.026496124267578126, |
|
"learning_rate": 2.9673763677155655e-06, |
|
"loss": 0.0011, |
|
"reward": 1.8051109313964844, |
|
"reward_std": 0.7346500240266323, |
|
"rewards/accuracy_reward": 0.7160714596509934, |
|
"rewards/cosine_scaled_reward": 0.39439656864851713, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.6946429140865803, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 623.1785926818848, |
|
"epoch": 0.17066666666666666, |
|
"grad_norm": 0.6716666221618652, |
|
"kl": 0.018997955322265624, |
|
"learning_rate": 2.9547489219129666e-06, |
|
"loss": 0.0008, |
|
"reward": 1.9212585434317588, |
|
"reward_std": 0.634969700500369, |
|
"rewards/accuracy_reward": 0.7785714574158191, |
|
"rewards/cosine_scaled_reward": 0.4653060721466318, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.6773809991776943, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 690.1518196105957, |
|
"epoch": 0.18133333333333335, |
|
"grad_norm": 1.1456305980682373, |
|
"kl": 0.02204437255859375, |
|
"learning_rate": 2.9400965311490175e-06, |
|
"loss": 0.0009, |
|
"reward": 1.9084690719842912, |
|
"reward_std": 0.7263222638517618, |
|
"rewards/accuracy_reward": 0.7303571783006191, |
|
"rewards/cosine_scaled_reward": 0.4507309086387977, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.7273810178041458, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 650.4768188476562, |
|
"epoch": 0.192, |
|
"grad_norm": 29.814361572265625, |
|
"kl": 0.078216552734375, |
|
"learning_rate": 2.9234395908915565e-06, |
|
"loss": 0.0031, |
|
"reward": 1.8972563683986663, |
|
"reward_std": 0.7165740359574556, |
|
"rewards/accuracy_reward": 0.6875000324100256, |
|
"rewards/cosine_scaled_reward": 0.4055896209087223, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8041667267680168, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 668.3339584350585, |
|
"epoch": 0.20266666666666666, |
|
"grad_norm": 0.48750847578048706, |
|
"kl": 0.02767181396484375, |
|
"learning_rate": 2.904801286851009e-06, |
|
"loss": 0.0011, |
|
"reward": 1.9524270623922348, |
|
"reward_std": 0.6363851364701987, |
|
"rewards/accuracy_reward": 0.7035714564844966, |
|
"rewards/cosine_scaled_reward": 0.42206980669870975, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.826785783469677, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 645.9428840637207, |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.8315287232398987, |
|
"kl": 0.02986602783203125, |
|
"learning_rate": 2.884207562706925e-06, |
|
"loss": 0.0012, |
|
"reward": 2.0384097367525102, |
|
"reward_std": 0.6786769151687622, |
|
"rewards/accuracy_reward": 0.7517857387661934, |
|
"rewards/cosine_scaled_reward": 0.4657905898289755, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.820833396166563, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"eval_completion_length": 688.0076597412109, |
|
"eval_kl": 0.0332870361328125, |
|
"eval_loss": 0.0013802805915474892, |
|
"eval_reward": 1.86520801551342, |
|
"eval_reward_std": 0.7114028903335333, |
|
"eval_rewards/accuracy_reward": 0.650542886838317, |
|
"eval_rewards/cosine_scaled_reward": 0.3737031816519331, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_rewards/reasoning_steps_reward": 0.8409619681358338, |
|
"eval_runtime": 32350.4437, |
|
"eval_samples_per_second": 0.155, |
|
"eval_steps_per_second": 0.011, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 717.150033569336, |
|
"epoch": 0.224, |
|
"grad_norm": 1.5486549139022827, |
|
"kl": 0.03196563720703125, |
|
"learning_rate": 2.8616870839955444e-06, |
|
"loss": 0.0013, |
|
"reward": 2.0346583992242815, |
|
"reward_std": 0.7014419212937355, |
|
"rewards/accuracy_reward": 0.7232143215835094, |
|
"rewards/cosine_scaled_reward": 0.457277343980968, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8541667237877846, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 708.8571708679199, |
|
"epoch": 0.23466666666666666, |
|
"grad_norm": 0.5981384515762329, |
|
"kl": 0.02979583740234375, |
|
"learning_rate": 2.837271198208662e-06, |
|
"loss": 0.0012, |
|
"reward": 2.0179374665021896, |
|
"reward_std": 0.6652137346565723, |
|
"rewards/accuracy_reward": 0.7250000320374965, |
|
"rewards/cosine_scaled_reward": 0.47091358043253423, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8220238700509072, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 632.7732406616211, |
|
"epoch": 0.24533333333333332, |
|
"grad_norm": 0.7111315131187439, |
|
"kl": 0.02539825439453125, |
|
"learning_rate": 2.8109938911593322e-06, |
|
"loss": 0.001, |
|
"reward": 2.0148118153214454, |
|
"reward_std": 0.6429756574332715, |
|
"rewards/accuracy_reward": 0.728571455553174, |
|
"rewards/cosine_scaled_reward": 0.44754982106387614, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8386905357241631, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 655.8321723937988, |
|
"epoch": 0.256, |
|
"grad_norm": 0.5316483974456787, |
|
"kl": 0.02179107666015625, |
|
"learning_rate": 2.7828917396751474e-06, |
|
"loss": 0.0009, |
|
"reward": 1.9900789648294448, |
|
"reward_std": 0.6477071691304446, |
|
"rewards/accuracy_reward": 0.7160714656114578, |
|
"rewards/cosine_scaled_reward": 0.43412648113444446, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8398810118436814, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 644.7321693420411, |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.4458823800086975, |
|
"kl": 0.025299072265625, |
|
"learning_rate": 2.753003860684943e-06, |
|
"loss": 0.001, |
|
"reward": 2.1427780210971834, |
|
"reward_std": 0.6711063630878925, |
|
"rewards/accuracy_reward": 0.7750000268220901, |
|
"rewards/cosine_scaled_reward": 0.5183731818571686, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8494048312306404, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 684.2911033630371, |
|
"epoch": 0.2773333333333333, |
|
"grad_norm": 0.7146270871162415, |
|
"kl": 0.034222412109375, |
|
"learning_rate": 2.721371856769793e-06, |
|
"loss": 0.0014, |
|
"reward": 1.9814838409423827, |
|
"reward_std": 0.7353869907557964, |
|
"rewards/accuracy_reward": 0.6625000331550837, |
|
"rewards/cosine_scaled_reward": 0.3981504186260281, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9208333924412727, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 650.483960723877, |
|
"epoch": 0.288, |
|
"grad_norm": 0.8331003189086914, |
|
"kl": 0.046978759765625, |
|
"learning_rate": 2.688039758254093e-06, |
|
"loss": 0.0019, |
|
"reward": 2.223627084493637, |
|
"reward_std": 0.6465678755193949, |
|
"rewards/accuracy_reward": 0.7732143219560385, |
|
"rewards/cosine_scaled_reward": 0.506960358901415, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.94345243871212, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 702.9536026000976, |
|
"epoch": 0.2986666666666667, |
|
"grad_norm": 1.9107334613800049, |
|
"kl": 0.0536590576171875, |
|
"learning_rate": 2.65305396191733e-06, |
|
"loss": 0.0021, |
|
"reward": 2.1239778250455856, |
|
"reward_std": 0.6765143848955631, |
|
"rewards/accuracy_reward": 0.7071428891271353, |
|
"rewards/cosine_scaled_reward": 0.4555253505706787, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9613095715641975, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 733.6089630126953, |
|
"epoch": 0.30933333333333335, |
|
"grad_norm": 0.5300867557525635, |
|
"kl": 0.05316162109375, |
|
"learning_rate": 2.61646316641186e-06, |
|
"loss": 0.0021, |
|
"reward": 2.1554796636104583, |
|
"reward_std": 0.6578622825443745, |
|
"rewards/accuracy_reward": 0.7303571704775095, |
|
"rewards/cosine_scaled_reward": 0.47036054339259864, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9547619551420212, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 713.221459197998, |
|
"epoch": 0.32, |
|
"grad_norm": 0.6026062369346619, |
|
"kl": 0.0533843994140625, |
|
"learning_rate": 2.5783183044765715e-06, |
|
"loss": 0.0021, |
|
"reward": 2.1126459658145906, |
|
"reward_std": 0.5920085646212101, |
|
"rewards/accuracy_reward": 0.7089285995811224, |
|
"rewards/cosine_scaled_reward": 0.4566935421898961, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9470238655805587, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 678.6428886413574, |
|
"epoch": 0.33066666666666666, |
|
"grad_norm": 0.6598377227783203, |
|
"kl": 0.049908447265625, |
|
"learning_rate": 2.5386724720408135e-06, |
|
"loss": 0.002, |
|
"reward": 2.243595580756664, |
|
"reward_std": 0.6088640403002501, |
|
"rewards/accuracy_reward": 0.7767857441678643, |
|
"rewards/cosine_scaled_reward": 0.5435954930260778, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9232143476605416, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 683.9268142700196, |
|
"epoch": 0.3413333333333333, |
|
"grad_norm": 0.6654959321022034, |
|
"kl": 0.0447540283203125, |
|
"learning_rate": 2.49758085431725e-06, |
|
"loss": 0.0018, |
|
"reward": 2.0952899247407912, |
|
"reward_std": 0.6968366518616677, |
|
"rewards/accuracy_reward": 0.7232143208384514, |
|
"rewards/cosine_scaled_reward": 0.4637422326952219, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9083333939313889, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 691.3464614868165, |
|
"epoch": 0.352, |
|
"grad_norm": 0.689552903175354, |
|
"kl": 0.0448211669921875, |
|
"learning_rate": 2.455100648986533e-06, |
|
"loss": 0.0018, |
|
"reward": 2.0519487097859384, |
|
"reward_std": 0.7221721112728119, |
|
"rewards/accuracy_reward": 0.6964286031201482, |
|
"rewards/cosine_scaled_reward": 0.4602819522842765, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8952381581068038, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 696.5268180847168, |
|
"epoch": 0.3626666666666667, |
|
"grad_norm": 1.0024878978729248, |
|
"kl": 0.065167236328125, |
|
"learning_rate": 2.4112909865807053e-06, |
|
"loss": 0.0026, |
|
"reward": 1.7887505039572715, |
|
"reward_std": 0.7482936225831509, |
|
"rewards/accuracy_reward": 0.571428600884974, |
|
"rewards/cosine_scaled_reward": 0.3333932981360704, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8839286401867866, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 703.2714614868164, |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.5711168050765991, |
|
"kl": 0.093731689453125, |
|
"learning_rate": 2.366212848176164e-06, |
|
"loss": 0.0037, |
|
"reward": 1.9069189459085465, |
|
"reward_std": 0.8069212771952152, |
|
"rewards/accuracy_reward": 0.6500000327825546, |
|
"rewards/cosine_scaled_reward": 0.42358550764620306, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8333333879709244, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 714.2536003112793, |
|
"epoch": 0.384, |
|
"grad_norm": 3.1069464683532715, |
|
"kl": 0.1747802734375, |
|
"learning_rate": 2.319928980510752e-06, |
|
"loss": 0.007, |
|
"reward": 1.6917703241109847, |
|
"reward_std": 0.8836216881871224, |
|
"rewards/accuracy_reward": 0.6089285977184773, |
|
"rewards/cosine_scaled_reward": 0.35307975246978457, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.7297619581222534, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 727.7018188476562, |
|
"epoch": 0.39466666666666667, |
|
"grad_norm": 1.1932159662246704, |
|
"kl": 0.193988037109375, |
|
"learning_rate": 2.272503808643123e-06, |
|
"loss": 0.0078, |
|
"reward": 1.7027929693460464, |
|
"reward_std": 0.7921728197485208, |
|
"rewards/accuracy_reward": 0.6267857421189547, |
|
"rewards/cosine_scaled_reward": 0.3605310095474124, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.7154762461781502, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 677.6518127441407, |
|
"epoch": 0.4053333333333333, |
|
"grad_norm": 0.6525413393974304, |
|
"kl": 0.1227813720703125, |
|
"learning_rate": 2.2240033462759628e-06, |
|
"loss": 0.0049, |
|
"reward": 2.055608908832073, |
|
"reward_std": 0.6409808352589608, |
|
"rewards/accuracy_reward": 0.7428571667522192, |
|
"rewards/cosine_scaled_reward": 0.4907278836122714, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8220238700509072, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 729.3125358581543, |
|
"epoch": 0.416, |
|
"grad_norm": 0.470821738243103, |
|
"kl": 0.1053009033203125, |
|
"learning_rate": 2.1744951038678905e-06, |
|
"loss": 0.0042, |
|
"reward": 2.1352262631058694, |
|
"reward_std": 0.6541992913931608, |
|
"rewards/accuracy_reward": 0.7446428880095481, |
|
"rewards/cosine_scaled_reward": 0.5340357202105224, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8565476804971695, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 736.6607482910156, |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 0.3663829267024994, |
|
"kl": 0.145220947265625, |
|
"learning_rate": 2.124047994661941e-06, |
|
"loss": 0.0058, |
|
"reward": 2.0683016672730448, |
|
"reward_std": 0.6785697277635336, |
|
"rewards/accuracy_reward": 0.7107143150642514, |
|
"rewards/cosine_scaled_reward": 0.4861587251536548, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8714286342263222, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"eval_completion_length": 743.3604330322265, |
|
"eval_kl": 0.1699279296875, |
|
"eval_loss": 0.006734147202223539, |
|
"eval_reward": 1.8947704853653908, |
|
"eval_reward_std": 0.7092250557422638, |
|
"eval_rewards/accuracy_reward": 0.6307143133163452, |
|
"eval_rewards/cosine_scaled_reward": 0.39257041423644407, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_rewards/reasoning_steps_reward": 0.871485775399208, |
|
"eval_runtime": 32670.592, |
|
"eval_samples_per_second": 0.153, |
|
"eval_steps_per_second": 0.011, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 752.7053955078125, |
|
"epoch": 0.43733333333333335, |
|
"grad_norm": 0.5299625396728516, |
|
"kl": 0.1930633544921875, |
|
"learning_rate": 2.072732238761434e-06, |
|
"loss": 0.0077, |
|
"reward": 1.8860187515616418, |
|
"reward_std": 0.7606242794543505, |
|
"rewards/accuracy_reward": 0.6446428863331676, |
|
"rewards/cosine_scaled_reward": 0.40447108587541153, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8369048193097115, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 733.603606414795, |
|
"epoch": 0.448, |
|
"grad_norm": 1.6152819395065308, |
|
"kl": 0.219268798828125, |
|
"learning_rate": 2.0206192653867536e-06, |
|
"loss": 0.0088, |
|
"reward": 1.997245892137289, |
|
"reward_std": 0.7402419943362475, |
|
"rewards/accuracy_reward": 0.7017857382073999, |
|
"rewards/cosine_scaled_reward": 0.47284105569124224, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8226191058754921, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 844.0661102294922, |
|
"epoch": 0.45866666666666667, |
|
"grad_norm": 7.516280651092529, |
|
"kl": 0.27982177734375, |
|
"learning_rate": 1.967781613449095e-06, |
|
"loss": 0.0112, |
|
"reward": 1.5464881896972655, |
|
"reward_std": 0.8091491930186748, |
|
"rewards/accuracy_reward": 0.49107144959270954, |
|
"rewards/cosine_scaled_reward": 0.21672622584737838, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8386905357241631, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 814.1696807861329, |
|
"epoch": 0.4693333333333333, |
|
"grad_norm": 0.4684678018093109, |
|
"kl": 0.194140625, |
|
"learning_rate": 1.9142928305795637e-06, |
|
"loss": 0.0078, |
|
"reward": 1.8477135568857193, |
|
"reward_std": 0.7414120733737946, |
|
"rewards/accuracy_reward": 0.6178571652621031, |
|
"rewards/cosine_scaled_reward": 0.3584277655696496, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8714286401867867, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 754.1857452392578, |
|
"epoch": 0.48, |
|
"grad_norm": 0.4328997731208801, |
|
"kl": 0.12838134765625, |
|
"learning_rate": 1.8602273707541886e-06, |
|
"loss": 0.0051, |
|
"reward": 2.1135876968503, |
|
"reward_std": 0.6965163860470056, |
|
"rewards/accuracy_reward": 0.742857176810503, |
|
"rewards/cosine_scaled_reward": 0.5159685641527176, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8547619715332985, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 742.7750381469726, |
|
"epoch": 0.49066666666666664, |
|
"grad_norm": 0.4649052619934082, |
|
"kl": 0.1558837890625, |
|
"learning_rate": 1.8056604906573418e-06, |
|
"loss": 0.0062, |
|
"reward": 2.0384344711899756, |
|
"reward_std": 0.6620127268135547, |
|
"rewards/accuracy_reward": 0.7035714626312256, |
|
"rewards/cosine_scaled_reward": 0.483077246020548, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8517857760190963, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 739.6268203735351, |
|
"epoch": 0.5013333333333333, |
|
"grad_norm": 1.5264660120010376, |
|
"kl": 0.145806884765625, |
|
"learning_rate": 1.7506681449278226e-06, |
|
"loss": 0.0058, |
|
"reward": 1.999456986784935, |
|
"reward_std": 0.7032103724777699, |
|
"rewards/accuracy_reward": 0.6785714574158191, |
|
"rewards/cosine_scaled_reward": 0.45302835907787087, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8678572103381157, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 725.905387878418, |
|
"epoch": 0.512, |
|
"grad_norm": 13.703657150268555, |
|
"kl": 0.354132080078125, |
|
"learning_rate": 1.6953268804334257e-06, |
|
"loss": 0.0142, |
|
"reward": 2.012031316757202, |
|
"reward_std": 0.6349152896553278, |
|
"rewards/accuracy_reward": 0.6660714553669095, |
|
"rewards/cosine_scaled_reward": 0.46024551438167693, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8857143551111222, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 711.9410980224609, |
|
"epoch": 0.5226666666666666, |
|
"grad_norm": 42.922752380371094, |
|
"kl": 0.81356201171875, |
|
"learning_rate": 1.6397137297211436e-06, |
|
"loss": 0.0325, |
|
"reward": 2.129089578986168, |
|
"reward_std": 0.699107101932168, |
|
"rewards/accuracy_reward": 0.7160714577883482, |
|
"rewards/cosine_scaled_reward": 0.5064704709046055, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9065476730465889, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 738.9821746826171, |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 212.6622314453125, |
|
"kl": 1.157550048828125, |
|
"learning_rate": 1.5839061037913395e-06, |
|
"loss": 0.0463, |
|
"reward": 2.1009622782468798, |
|
"reward_std": 0.7158728931099176, |
|
"rewards/accuracy_reward": 0.7000000283122063, |
|
"rewards/cosine_scaled_reward": 0.5027479250915349, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8982143506407738, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 760.2428916931152, |
|
"epoch": 0.544, |
|
"grad_norm": 10.118670463562012, |
|
"kl": 0.637158203125, |
|
"learning_rate": 1.527981684345115e-06, |
|
"loss": 0.0255, |
|
"reward": 1.9621681660413741, |
|
"reward_std": 0.67494813259691, |
|
"rewards/accuracy_reward": 0.639285740070045, |
|
"rewards/cosine_scaled_reward": 0.4276442806003615, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8952381491661072, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 754.6803894042969, |
|
"epoch": 0.5546666666666666, |
|
"grad_norm": 7.878048419952393, |
|
"kl": 0.972845458984375, |
|
"learning_rate": 1.4720183156548855e-06, |
|
"loss": 0.0389, |
|
"reward": 1.9780788227915764, |
|
"reward_std": 0.6262619759887457, |
|
"rewards/accuracy_reward": 0.6339285982772708, |
|
"rewards/cosine_scaled_reward": 0.4304597085807472, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9136905416846275, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 751.5857498168946, |
|
"epoch": 0.5653333333333334, |
|
"grad_norm": 12.42583179473877, |
|
"kl": 3.09744873046875, |
|
"learning_rate": 1.4160938962086612e-06, |
|
"loss": 0.1241, |
|
"reward": 2.0433208346366882, |
|
"reward_std": 0.661328698694706, |
|
"rewards/accuracy_reward": 0.676785740442574, |
|
"rewards/cosine_scaled_reward": 0.44689220561413096, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9196429163217544, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 729.028604888916, |
|
"epoch": 0.576, |
|
"grad_norm": 7.453009605407715, |
|
"kl": 2.2955322265625, |
|
"learning_rate": 1.3602862702788567e-06, |
|
"loss": 0.0917, |
|
"reward": 2.094664843380451, |
|
"reward_std": 0.6356621380895376, |
|
"rewards/accuracy_reward": 0.7000000346451998, |
|
"rewards/cosine_scaled_reward": 0.46371242445893585, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9309524431824684, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 730.825032043457, |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 7.0367817878723145, |
|
"kl": 0.6509521484375, |
|
"learning_rate": 1.3046731195665748e-06, |
|
"loss": 0.0261, |
|
"reward": 2.083331751823425, |
|
"reward_std": 0.6676435235887765, |
|
"rewards/accuracy_reward": 0.6821428818628192, |
|
"rewards/cosine_scaled_reward": 0.45714118536561726, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.944047674536705, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 742.180387878418, |
|
"epoch": 0.5973333333333334, |
|
"grad_norm": 1.3236949443817139, |
|
"kl": 4.09298095703125, |
|
"learning_rate": 1.2493318550721775e-06, |
|
"loss": 0.1637, |
|
"reward": 2.075996032357216, |
|
"reward_std": 0.6379393456503749, |
|
"rewards/accuracy_reward": 0.6857143174856901, |
|
"rewards/cosine_scaled_reward": 0.4563530746847391, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9339286297559738, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 708.1018157958985, |
|
"epoch": 0.608, |
|
"grad_norm": 5.264936447143555, |
|
"kl": 0.21192626953125, |
|
"learning_rate": 1.1943395093426585e-06, |
|
"loss": 0.0085, |
|
"reward": 2.1390477627515794, |
|
"reward_std": 0.600306774303317, |
|
"rewards/accuracy_reward": 0.7196428820490837, |
|
"rewards/cosine_scaled_reward": 0.49619057439267633, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9232143506407737, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 715.4125289916992, |
|
"epoch": 0.6186666666666667, |
|
"grad_norm": 2.6887574195861816, |
|
"kl": 2.8669677734375, |
|
"learning_rate": 1.1397726292458115e-06, |
|
"loss": 0.1151, |
|
"reward": 2.1179503470659258, |
|
"reward_std": 0.5490788316354156, |
|
"rewards/accuracy_reward": 0.7053571708500386, |
|
"rewards/cosine_scaled_reward": 0.4905693273060024, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9220238789916039, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 742.6803916931152, |
|
"epoch": 0.6293333333333333, |
|
"grad_norm": 6.9418721199035645, |
|
"kl": 0.39151611328125, |
|
"learning_rate": 1.085707169420437e-06, |
|
"loss": 0.0157, |
|
"reward": 1.8962592497467994, |
|
"reward_std": 0.6060247957706452, |
|
"rewards/accuracy_reward": 0.5964285938069225, |
|
"rewards/cosine_scaled_reward": 0.3754258565604687, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.924404813349247, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 716.3464584350586, |
|
"epoch": 0.64, |
|
"grad_norm": 4.2906060218811035, |
|
"kl": 0.57667236328125, |
|
"learning_rate": 1.0322183865509054e-06, |
|
"loss": 0.0231, |
|
"reward": 2.1815308302640917, |
|
"reward_std": 0.6235232371836901, |
|
"rewards/accuracy_reward": 0.7428571732714773, |
|
"rewards/cosine_scaled_reward": 0.5255783690838143, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.913095298409462, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_completion_length": 728.9849459716797, |
|
"eval_kl": 22.31169453125, |
|
"eval_loss": 0.8926114439964294, |
|
"eval_reward": 1.9843467233777046, |
|
"eval_reward_std": 0.6538388645738363, |
|
"eval_rewards/accuracy_reward": 0.6382285982251167, |
|
"eval_rewards/cosine_scaled_reward": 0.41530855364510644, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_rewards/reasoning_steps_reward": 0.9308095807313919, |
|
"eval_runtime": 32207.7986, |
|
"eval_samples_per_second": 0.155, |
|
"eval_steps_per_second": 0.011, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 723.2625328063965, |
|
"epoch": 0.6506666666666666, |
|
"grad_norm": 79.97950744628906, |
|
"kl": 487.1179443359375, |
|
"learning_rate": 9.793807346132464e-07, |
|
"loss": 19.4474, |
|
"reward": 2.162437987327576, |
|
"reward_std": 0.6324797321110964, |
|
"rewards/accuracy_reward": 0.7267857410013676, |
|
"rewards/cosine_scaled_reward": 0.5112474345514784, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9244048178195954, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 739.6375335693359, |
|
"epoch": 0.6613333333333333, |
|
"grad_norm": 9.395992279052734, |
|
"kl": 0.60579833984375, |
|
"learning_rate": 9.272677612385667e-07, |
|
"loss": 0.0242, |
|
"reward": 2.004467612504959, |
|
"reward_std": 0.6282935816794634, |
|
"rewards/accuracy_reward": 0.6607143184170127, |
|
"rewards/cosine_scaled_reward": 0.42589613443706187, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9178571999073029, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 735.6286071777344, |
|
"epoch": 0.672, |
|
"grad_norm": 12.830111503601074, |
|
"kl": 0.9565673828125, |
|
"learning_rate": 8.759520053380591e-07, |
|
"loss": 0.0383, |
|
"reward": 1.9197196617722512, |
|
"reward_std": 0.6299623921513557, |
|
"rewards/accuracy_reward": 0.6035714576020836, |
|
"rewards/cosine_scaled_reward": 0.39055290608666837, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9255953043699264, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 718.0571731567383, |
|
"epoch": 0.6826666666666666, |
|
"grad_norm": 176.6972198486328, |
|
"kl": 1.54287109375, |
|
"learning_rate": 8.255048961321088e-07, |
|
"loss": 0.0618, |
|
"reward": 2.1281729131937026, |
|
"reward_std": 0.6808584026992321, |
|
"rewards/accuracy_reward": 0.714285746216774, |
|
"rewards/cosine_scaled_reward": 0.4888871216215193, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9250000536441803, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 721.4732475280762, |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 6.025720119476318, |
|
"kl": 0.98104248046875, |
|
"learning_rate": 7.759966537240373e-07, |
|
"loss": 0.0392, |
|
"reward": 2.054315000772476, |
|
"reward_std": 0.6834255807101727, |
|
"rewards/accuracy_reward": 0.6714285992085933, |
|
"rewards/cosine_scaled_reward": 0.45312447142787277, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9297619640827179, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 729.3982498168946, |
|
"epoch": 0.704, |
|
"grad_norm": 6.682721138000488, |
|
"kl": 2.40982666015625, |
|
"learning_rate": 7.274961913568773e-07, |
|
"loss": 0.0964, |
|
"reward": 2.0376005843281746, |
|
"reward_std": 0.7055317234247923, |
|
"rewards/accuracy_reward": 0.6660714562982321, |
|
"rewards/cosine_scaled_reward": 0.4655766852200031, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9059524461627007, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 737.005387878418, |
|
"epoch": 0.7146666666666667, |
|
"grad_norm": 21.818754196166992, |
|
"kl": 0.653094482421875, |
|
"learning_rate": 6.800710194892484e-07, |
|
"loss": 0.0261, |
|
"reward": 2.056803268194199, |
|
"reward_std": 0.7108213260769844, |
|
"rewards/accuracy_reward": 0.6660714574158192, |
|
"rewards/cosine_scaled_reward": 0.45680318772792816, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9339286327362061, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 729.6393203735352, |
|
"epoch": 0.7253333333333334, |
|
"grad_norm": 4.025352954864502, |
|
"kl": 0.63848876953125, |
|
"learning_rate": 6.33787151823836e-07, |
|
"loss": 0.0256, |
|
"reward": 1.9720933943986894, |
|
"reward_std": 0.6898978160694241, |
|
"rewards/accuracy_reward": 0.6250000264495611, |
|
"rewards/cosine_scaled_reward": 0.42685523356776683, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9202381581068039, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 699.1571701049804, |
|
"epoch": 0.736, |
|
"grad_norm": 5.142830848693848, |
|
"kl": 0.65721435546875, |
|
"learning_rate": 5.887090134192947e-07, |
|
"loss": 0.0263, |
|
"reward": 2.100009024143219, |
|
"reward_std": 0.6496724892407656, |
|
"rewards/accuracy_reward": 0.6910714615136385, |
|
"rewards/cosine_scaled_reward": 0.4851280112750828, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9238095805048943, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 723.3910995483399, |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 4.602946758270264, |
|
"kl": 0.394140625, |
|
"learning_rate": 5.448993510134669e-07, |
|
"loss": 0.0158, |
|
"reward": 2.0926264360547067, |
|
"reward_std": 0.6916316740214825, |
|
"rewards/accuracy_reward": 0.6857143180444837, |
|
"rewards/cosine_scaled_reward": 0.4831025514518842, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9238095790147781, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 722.5375305175781, |
|
"epoch": 0.7573333333333333, |
|
"grad_norm": 6.0756731033325195, |
|
"kl": 1.08592529296875, |
|
"learning_rate": 5.024191456827498e-07, |
|
"loss": 0.0435, |
|
"reward": 2.0994770556688307, |
|
"reward_std": 0.666194306127727, |
|
"rewards/accuracy_reward": 0.6982143167406321, |
|
"rewards/cosine_scaled_reward": 0.4917388891801238, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9095238700509072, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 713.5250350952149, |
|
"epoch": 0.768, |
|
"grad_norm": 7.16264533996582, |
|
"kl": 26.27894287109375, |
|
"learning_rate": 4.6132752795918667e-07, |
|
"loss": 1.0497, |
|
"reward": 2.055359125137329, |
|
"reward_std": 0.7066416556015611, |
|
"rewards/accuracy_reward": 0.6678571753203869, |
|
"rewards/cosine_scaled_reward": 0.4732161985710263, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9142857760190963, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 751.5964584350586, |
|
"epoch": 0.7786666666666666, |
|
"grad_norm": 3.023808002471924, |
|
"kl": 1.154327392578125, |
|
"learning_rate": 4.2168169552342905e-07, |
|
"loss": 0.0462, |
|
"reward": 1.9766315311193465, |
|
"reward_std": 0.7433438140898943, |
|
"rewards/accuracy_reward": 0.6339286021888256, |
|
"rewards/cosine_scaled_reward": 0.42544099894585086, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9172619596123696, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 704.278596496582, |
|
"epoch": 0.7893333333333333, |
|
"grad_norm": 1.0741926431655884, |
|
"kl": 0.53934326171875, |
|
"learning_rate": 3.8353683358814046e-07, |
|
"loss": 0.0216, |
|
"reward": 2.0491741001605988, |
|
"reward_std": 0.587555892020464, |
|
"rewards/accuracy_reward": 0.6678571693599225, |
|
"rewards/cosine_scaled_reward": 0.46226926781237127, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9190476790070534, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 738.875033569336, |
|
"epoch": 0.8, |
|
"grad_norm": 41.52888870239258, |
|
"kl": 0.6643310546875, |
|
"learning_rate": 3.469460380826697e-07, |
|
"loss": 0.0265, |
|
"reward": 2.0449665546417237, |
|
"reward_std": 0.6989724855870009, |
|
"rewards/accuracy_reward": 0.6625000312924385, |
|
"rewards/cosine_scaled_reward": 0.4574664521496743, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9250000640749931, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 724.0678855895997, |
|
"epoch": 0.8106666666666666, |
|
"grad_norm": 4.322193145751953, |
|
"kl": 0.7086669921875, |
|
"learning_rate": 3.119602417459075e-07, |
|
"loss": 0.0284, |
|
"reward": 2.055614770948887, |
|
"reward_std": 0.6039443843066692, |
|
"rewards/accuracy_reward": 0.667857171408832, |
|
"rewards/cosine_scaled_reward": 0.46275755076203495, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9250000655651093, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 739.5125350952148, |
|
"epoch": 0.8213333333333334, |
|
"grad_norm": 4.056361198425293, |
|
"kl": 0.7447509765625, |
|
"learning_rate": 2.786281432302071e-07, |
|
"loss": 0.0298, |
|
"reward": 2.0523035705089567, |
|
"reward_std": 0.6267267379909753, |
|
"rewards/accuracy_reward": 0.6750000279396773, |
|
"rewards/cosine_scaled_reward": 0.4463511134439614, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9309524476528168, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 722.548243713379, |
|
"epoch": 0.832, |
|
"grad_norm": 1.378568410873413, |
|
"kl": 0.501007080078125, |
|
"learning_rate": 2.46996139315057e-07, |
|
"loss": 0.02, |
|
"reward": 2.0793206453323365, |
|
"reward_std": 0.6533296214416623, |
|
"rewards/accuracy_reward": 0.6875000290572644, |
|
"rewards/cosine_scaled_reward": 0.4781300783797633, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9136905401945115, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 732.4714630126953, |
|
"epoch": 0.8426666666666667, |
|
"grad_norm": 2.4824626445770264, |
|
"kl": 0.71015625, |
|
"learning_rate": 2.1710826032485286e-07, |
|
"loss": 0.0284, |
|
"reward": 2.1464335188269614, |
|
"reward_std": 0.6267410140484572, |
|
"rewards/accuracy_reward": 0.7071428874507546, |
|
"rewards/cosine_scaled_reward": 0.5136953465640545, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9255953013896943, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 769.8607528686523, |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 5.1279401779174805, |
|
"kl": 0.787158203125, |
|
"learning_rate": 1.8900610884066817e-07, |
|
"loss": 0.0315, |
|
"reward": 1.9811220198869706, |
|
"reward_std": 0.6900037627667188, |
|
"rewards/accuracy_reward": 0.6357143126428128, |
|
"rewards/cosine_scaled_reward": 0.4329076783033088, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.912500062584877, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"eval_completion_length": 738.6478045166016, |
|
"eval_kl": 0.67065634765625, |
|
"eval_loss": 0.026821324601769447, |
|
"eval_reward": 1.9358687758922577, |
|
"eval_reward_std": 0.681571420711279, |
|
"eval_rewards/accuracy_reward": 0.6160857413113118, |
|
"eval_rewards/cosine_scaled_reward": 0.4032782297934056, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_rewards/reasoning_steps_reward": 0.9165048221349716, |
|
"eval_runtime": 32285.4404, |
|
"eval_samples_per_second": 0.155, |
|
"eval_steps_per_second": 0.011, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 763.4339599609375, |
|
"epoch": 0.864, |
|
"grad_norm": 4.143102169036865, |
|
"kl": 0.609136962890625, |
|
"learning_rate": 1.627288017913383e-07, |
|
"loss": 0.0244, |
|
"reward": 1.9788720414042473, |
|
"reward_std": 0.6925495602190495, |
|
"rewards/accuracy_reward": 0.6375000275671482, |
|
"rewards/cosine_scaled_reward": 0.42411007191985844, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9172619670629502, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 754.2500312805175, |
|
"epoch": 0.8746666666666667, |
|
"grad_norm": 4.33268928527832, |
|
"kl": 0.9586181640625, |
|
"learning_rate": 1.3831291600445573e-07, |
|
"loss": 0.0383, |
|
"reward": 1.9650759071111679, |
|
"reward_std": 0.6423604141920805, |
|
"rewards/accuracy_reward": 0.6303571704775095, |
|
"rewards/cosine_scaled_reward": 0.4222186904400587, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.912500062584877, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 751.8071762084961, |
|
"epoch": 0.8853333333333333, |
|
"grad_norm": 7.097233295440674, |
|
"kl": 0.8556884765625, |
|
"learning_rate": 1.1579243729307487e-07, |
|
"loss": 0.0342, |
|
"reward": 1.9338065341114998, |
|
"reward_std": 0.7414230849593878, |
|
"rewards/accuracy_reward": 0.6321428898721934, |
|
"rewards/cosine_scaled_reward": 0.41178265907801687, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8898810192942619, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 752.8571723937988, |
|
"epoch": 0.896, |
|
"grad_norm": 3.0274124145507812, |
|
"kl": 0.67294921875, |
|
"learning_rate": 9.519871314899092e-08, |
|
"loss": 0.0269, |
|
"reward": 1.9913182631134987, |
|
"reward_std": 0.7086525153368711, |
|
"rewards/accuracy_reward": 0.6571428876370191, |
|
"rewards/cosine_scaled_reward": 0.4359610580140725, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8982143491506577, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 751.7411056518555, |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 1.3194289207458496, |
|
"kl": 0.722802734375, |
|
"learning_rate": 7.656040910844358e-08, |
|
"loss": 0.0289, |
|
"reward": 2.0188252568244933, |
|
"reward_std": 0.7707155652344226, |
|
"rewards/accuracy_reward": 0.644642885029316, |
|
"rewards/cosine_scaled_reward": 0.44144419142976404, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9327381521463394, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 755.0464630126953, |
|
"epoch": 0.9173333333333333, |
|
"grad_norm": 4.276956081390381, |
|
"kl": 0.9569580078125, |
|
"learning_rate": 5.990346885098235e-08, |
|
"loss": 0.0383, |
|
"reward": 2.000167742371559, |
|
"reward_std": 0.7376608021557332, |
|
"rewards/accuracy_reward": 0.6589285988360644, |
|
"rewards/cosine_scaled_reward": 0.45314384531229734, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8880952954292297, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 727.2536087036133, |
|
"epoch": 0.928, |
|
"grad_norm": 19.139204025268555, |
|
"kl": 1.32947998046875, |
|
"learning_rate": 4.5251078087033493e-08, |
|
"loss": 0.0532, |
|
"reward": 2.039694218337536, |
|
"reward_std": 0.6533694989979267, |
|
"rewards/accuracy_reward": 0.6732143165543676, |
|
"rewards/cosine_scaled_reward": 0.4462417368311435, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9202381521463394, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 734.5536064147949, |
|
"epoch": 0.9386666666666666, |
|
"grad_norm": 9.922527313232422, |
|
"kl": 1.4177001953125, |
|
"learning_rate": 3.262363228443427e-08, |
|
"loss": 0.0567, |
|
"reward": 1.9774114236235618, |
|
"reward_std": 0.7198221303522587, |
|
"rewards/accuracy_reward": 0.6571428865194321, |
|
"rewards/cosine_scaled_reward": 0.4309827778954059, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.8892857789993286, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 755.5053962707519, |
|
"epoch": 0.9493333333333334, |
|
"grad_norm": 3.058717727661133, |
|
"kl": 1.02747802734375, |
|
"learning_rate": 2.2038708278862952e-08, |
|
"loss": 0.0411, |
|
"reward": 1.9413904681801797, |
|
"reward_std": 0.6192027345299721, |
|
"rewards/accuracy_reward": 0.6214285951107741, |
|
"rewards/cosine_scaled_reward": 0.41579519272781906, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9041667267680168, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 723.6143173217773, |
|
"epoch": 0.96, |
|
"grad_norm": 2.64345383644104, |
|
"kl": 0.74544677734375, |
|
"learning_rate": 1.3511039807673209e-08, |
|
"loss": 0.0298, |
|
"reward": 2.1570381984114646, |
|
"reward_std": 0.6153812855482101, |
|
"rewards/accuracy_reward": 0.7089286003261804, |
|
"rewards/cosine_scaled_reward": 0.5165619559586048, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9315476790070534, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 728.894679260254, |
|
"epoch": 0.9706666666666667, |
|
"grad_norm": 2.217505693435669, |
|
"kl": 0.678607177734375, |
|
"learning_rate": 7.0524970011963675e-09, |
|
"loss": 0.0272, |
|
"reward": 2.2157696574926375, |
|
"reward_std": 0.6317826233804226, |
|
"rewards/accuracy_reward": 0.7500000305473804, |
|
"rewards/cosine_scaled_reward": 0.5425553207285703, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9232143491506577, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 722.3839637756348, |
|
"epoch": 0.9813333333333333, |
|
"grad_norm": 3.196773052215576, |
|
"kl": 0.709228515625, |
|
"learning_rate": 2.6720698600553595e-09, |
|
"loss": 0.0284, |
|
"reward": 2.122882993519306, |
|
"reward_std": 0.599827627837658, |
|
"rewards/accuracy_reward": 0.7017857432365417, |
|
"rewards/cosine_scaled_reward": 0.5175257750786841, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9035714983940124, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 754.775032043457, |
|
"epoch": 0.992, |
|
"grad_norm": 8.455827713012695, |
|
"kl": 0.835205078125, |
|
"learning_rate": 3.7585574148779613e-10, |
|
"loss": 0.0334, |
|
"reward": 1.9985675051808358, |
|
"reward_std": 0.7642196819186211, |
|
"rewards/accuracy_reward": 0.6500000316649676, |
|
"rewards/cosine_scaled_reward": 0.4402340850589098, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.9083333894610405, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 746.1607462565104, |
|
"epoch": 0.9984, |
|
"kl": 0.8069661458333334, |
|
"reward": 2.0161508160332837, |
|
"reward_std": 0.7148686709503332, |
|
"rewards/accuracy_reward": 0.6636905111372471, |
|
"rewards/cosine_scaled_reward": 0.4536507367156446, |
|
"rewards/format_reward": 0.0, |
|
"rewards/reasoning_steps_reward": 0.898809589445591, |
|
"step": 468, |
|
"total_flos": 0.0, |
|
"train_loss": 4841.422249500714, |
|
"train_runtime": 180396.3107, |
|
"train_samples_per_second": 0.042, |
|
"train_steps_per_second": 0.003 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 468, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|