|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 100, |
|
"global_step": 469, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 614.4910888671875, |
|
"epoch": 0.0021321961620469083, |
|
"grad_norm": 0.653412401676178, |
|
"grpo_loss": -0.05488306283950806, |
|
"kl": 0.0, |
|
"learning_rate": 6.382978723404255e-08, |
|
"loss": 0.0, |
|
"reward": 0.3571428656578064, |
|
"reward_std": 0.3737320303916931, |
|
"rewards/accuracy_reward": 0.3571428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 649.857177734375, |
|
"epoch": 0.0042643923240938165, |
|
"grad_norm": 0.5258876085281372, |
|
"grpo_loss": 0.047233134508132935, |
|
"kl": 0.0, |
|
"learning_rate": 1.276595744680851e-07, |
|
"loss": 0.0, |
|
"reward": 0.3571428656578064, |
|
"reward_std": 0.42088788747787476, |
|
"rewards/accuracy_reward": 0.3571428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 606.1875, |
|
"epoch": 0.006396588486140725, |
|
"grad_norm": 7.712527751922607, |
|
"grpo_loss": 0.016701802611351013, |
|
"kl": 6.341934204101562e-05, |
|
"learning_rate": 1.9148936170212765e-07, |
|
"loss": 0.0, |
|
"reward": 0.3035714328289032, |
|
"reward_std": 0.3098275065422058, |
|
"rewards/accuracy_reward": 0.3035714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 624.0625, |
|
"epoch": 0.008528784648187633, |
|
"grad_norm": 0.40128573775291443, |
|
"grpo_loss": 2.2761523723602295e-06, |
|
"kl": 4.601478576660156e-05, |
|
"learning_rate": 2.553191489361702e-07, |
|
"loss": 0.0, |
|
"reward": 0.4285714626312256, |
|
"reward_std": 0.42494088411331177, |
|
"rewards/accuracy_reward": 0.4285714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 535.2232666015625, |
|
"epoch": 0.010660980810234541, |
|
"grad_norm": 1.0285664796829224, |
|
"grpo_loss": -0.01669881120324135, |
|
"kl": 6.628036499023438e-05, |
|
"learning_rate": 3.1914893617021275e-07, |
|
"loss": 0.0, |
|
"reward": 0.330357164144516, |
|
"reward_std": 0.3667682409286499, |
|
"rewards/accuracy_reward": 0.330357164144516, |
|
"rewards/format_reward": 0.0, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 565.2142944335938, |
|
"epoch": 0.01279317697228145, |
|
"grad_norm": 1.2028206586837769, |
|
"grpo_loss": -0.05488109588623047, |
|
"kl": 5.555152893066406e-05, |
|
"learning_rate": 3.829787234042553e-07, |
|
"loss": 0.0, |
|
"reward": 0.4285714626312256, |
|
"reward_std": 0.42670944333076477, |
|
"rewards/accuracy_reward": 0.4285714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 547.4107666015625, |
|
"epoch": 0.014925373134328358, |
|
"grad_norm": 1.7286690473556519, |
|
"grpo_loss": -0.18294166028499603, |
|
"kl": 7.390975952148438e-05, |
|
"learning_rate": 4.468085106382979e-07, |
|
"loss": 0.0, |
|
"reward": 0.4375000298023224, |
|
"reward_std": 0.33336058259010315, |
|
"rewards/accuracy_reward": 0.4285714626312256, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 589.6160888671875, |
|
"epoch": 0.017057569296375266, |
|
"grad_norm": 0.6688471436500549, |
|
"grpo_loss": 1.0952353477478027e-06, |
|
"kl": 5.459785461425781e-05, |
|
"learning_rate": 5.106382978723404e-07, |
|
"loss": 0.0, |
|
"reward": 0.392857164144516, |
|
"reward_std": 0.42565688490867615, |
|
"rewards/accuracy_reward": 0.3839285969734192, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 583.9017944335938, |
|
"epoch": 0.019189765458422176, |
|
"grad_norm": 0.6561264991760254, |
|
"grpo_loss": 0.10020516812801361, |
|
"kl": 5.5789947509765625e-05, |
|
"learning_rate": 5.74468085106383e-07, |
|
"loss": 0.0, |
|
"reward": 0.5089285969734192, |
|
"reward_std": 0.40017586946487427, |
|
"rewards/accuracy_reward": 0.5089285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 564.0625, |
|
"epoch": 0.021321961620469083, |
|
"grad_norm": 2.1838438510894775, |
|
"grpo_loss": 0.04723589867353439, |
|
"kl": 7.343292236328125e-05, |
|
"learning_rate": 6.382978723404255e-07, |
|
"loss": 0.0, |
|
"reward": 0.3035714328289032, |
|
"reward_std": 0.3835168778896332, |
|
"rewards/accuracy_reward": 0.2946428656578064, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 601.5982666015625, |
|
"epoch": 0.023454157782515993, |
|
"grad_norm": 0.7729598879814148, |
|
"grpo_loss": 1.9669532775878906e-06, |
|
"kl": 7.82012939453125e-05, |
|
"learning_rate": 7.021276595744681e-07, |
|
"loss": 0.0, |
|
"reward": 0.3392857313156128, |
|
"reward_std": 0.38351690769195557, |
|
"rewards/accuracy_reward": 0.3392857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 557.0535888671875, |
|
"epoch": 0.0255863539445629, |
|
"grad_norm": 0.7991155385971069, |
|
"grpo_loss": -0.11808028072118759, |
|
"kl": 6.031990051269531e-05, |
|
"learning_rate": 7.659574468085106e-07, |
|
"loss": 0.0, |
|
"reward": 0.4821428656578064, |
|
"reward_std": 0.4198353588581085, |
|
"rewards/accuracy_reward": 0.4821428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 557.8035888671875, |
|
"epoch": 0.02771855010660981, |
|
"grad_norm": 0.7683190703392029, |
|
"grpo_loss": 9.611248970031738e-07, |
|
"kl": 5.555152893066406e-05, |
|
"learning_rate": 8.297872340425533e-07, |
|
"loss": 0.0, |
|
"reward": 0.455357164144516, |
|
"reward_std": 0.3334502577781677, |
|
"rewards/accuracy_reward": 0.455357164144516, |
|
"rewards/format_reward": 0.0, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 591.169677734375, |
|
"epoch": 0.029850746268656716, |
|
"grad_norm": 0.54457026720047, |
|
"grpo_loss": 0.04723565652966499, |
|
"kl": 0.00010728836059570312, |
|
"learning_rate": 8.936170212765958e-07, |
|
"loss": 0.0, |
|
"reward": 0.4017857313156128, |
|
"reward_std": 0.35989412665367126, |
|
"rewards/accuracy_reward": 0.392857164144516, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 607.357177734375, |
|
"epoch": 0.031982942430703626, |
|
"grad_norm": 0.44422805309295654, |
|
"grpo_loss": 0.11808501183986664, |
|
"kl": 6.389617919921875e-05, |
|
"learning_rate": 9.574468085106384e-07, |
|
"loss": 0.0, |
|
"reward": 0.3750000298023224, |
|
"reward_std": 0.2989003658294678, |
|
"rewards/accuracy_reward": 0.3750000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 602.857177734375, |
|
"epoch": 0.03411513859275053, |
|
"grad_norm": 0.5179570913314819, |
|
"grpo_loss": -0.054881345480680466, |
|
"kl": 6.580352783203125e-05, |
|
"learning_rate": 1.0212765957446809e-06, |
|
"loss": 0.0, |
|
"reward": 0.3660714328289032, |
|
"reward_std": 0.357073038816452, |
|
"rewards/accuracy_reward": 0.3660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 634.8035888671875, |
|
"epoch": 0.03624733475479744, |
|
"grad_norm": 0.45830434560775757, |
|
"grpo_loss": -0.07317615300416946, |
|
"kl": 5.650520324707031e-05, |
|
"learning_rate": 1.0851063829787236e-06, |
|
"loss": 0.0, |
|
"reward": 0.3839285969734192, |
|
"reward_std": 0.4347257912158966, |
|
"rewards/accuracy_reward": 0.3839285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 576.4732666015625, |
|
"epoch": 0.03837953091684435, |
|
"grad_norm": 0.5573252439498901, |
|
"grpo_loss": -0.07317613065242767, |
|
"kl": 6.246566772460938e-05, |
|
"learning_rate": 1.148936170212766e-06, |
|
"loss": 0.0, |
|
"reward": 0.6339285969734192, |
|
"reward_std": 0.4847923815250397, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 632.6964721679688, |
|
"epoch": 0.04051172707889126, |
|
"grad_norm": 0.6752464771270752, |
|
"grpo_loss": 2.041459083557129e-06, |
|
"kl": 5.7697296142578125e-05, |
|
"learning_rate": 1.2127659574468085e-06, |
|
"loss": 0.0, |
|
"reward": 0.330357164144516, |
|
"reward_std": 0.3501092493534088, |
|
"rewards/accuracy_reward": 0.3214285969734192, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 641.8125, |
|
"epoch": 0.042643923240938165, |
|
"grad_norm": 1.7546889781951904, |
|
"grpo_loss": -0.054879434406757355, |
|
"kl": 0.00014495849609375, |
|
"learning_rate": 1.276595744680851e-06, |
|
"loss": 0.0, |
|
"reward": 0.2946428656578064, |
|
"reward_std": 0.4111029803752899, |
|
"rewards/accuracy_reward": 0.2857142984867096, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 589.169677734375, |
|
"epoch": 0.04477611940298507, |
|
"grad_norm": 0.4196793735027313, |
|
"grpo_loss": 0.054885976016521454, |
|
"kl": 7.677078247070312e-05, |
|
"learning_rate": 1.3404255319148935e-06, |
|
"loss": 0.0, |
|
"reward": 0.3839285969734192, |
|
"reward_std": 0.42379865050315857, |
|
"rewards/accuracy_reward": 0.3839285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 655.5714721679688, |
|
"epoch": 0.046908315565031986, |
|
"grad_norm": 0.41036805510520935, |
|
"grpo_loss": 2.6362554308434483e-06, |
|
"kl": 8.821487426757812e-05, |
|
"learning_rate": 1.4042553191489362e-06, |
|
"loss": 0.0, |
|
"reward": 0.3392857313156128, |
|
"reward_std": 0.3501092493534088, |
|
"rewards/accuracy_reward": 0.3392857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 641.6875, |
|
"epoch": 0.04904051172707889, |
|
"grad_norm": 3.576270580291748, |
|
"grpo_loss": 1.2814998626708984e-05, |
|
"kl": 0.0001163482666015625, |
|
"learning_rate": 1.4680851063829787e-06, |
|
"loss": 0.0, |
|
"reward": 0.3750000298023224, |
|
"reward_std": 0.3450036942958832, |
|
"rewards/accuracy_reward": 0.3750000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 653.0267944335938, |
|
"epoch": 0.0511727078891258, |
|
"grad_norm": 1.1303625106811523, |
|
"grpo_loss": 1.1533498764038086e-05, |
|
"kl": 0.0001163482666015625, |
|
"learning_rate": 1.5319148936170212e-06, |
|
"loss": 0.0, |
|
"reward": 0.3660714328289032, |
|
"reward_std": 0.42785170674324036, |
|
"rewards/accuracy_reward": 0.3571428656578064, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 640.3928833007812, |
|
"epoch": 0.053304904051172705, |
|
"grad_norm": 5.7301344871521, |
|
"grpo_loss": 4.673004150390625e-05, |
|
"kl": 0.0005645751953125, |
|
"learning_rate": 1.5957446808510639e-06, |
|
"loss": 0.0, |
|
"reward": 0.3482142984867096, |
|
"reward_std": 0.33627134561538696, |
|
"rewards/accuracy_reward": 0.3392857313156128, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 709.2500610351562, |
|
"epoch": 0.05543710021321962, |
|
"grad_norm": 0.3460543155670166, |
|
"grpo_loss": 0.016702793538570404, |
|
"kl": 0.00013637542724609375, |
|
"learning_rate": 1.6595744680851066e-06, |
|
"loss": 0.0, |
|
"reward": 0.3035714328289032, |
|
"reward_std": 0.29598960280418396, |
|
"rewards/accuracy_reward": 0.3035714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 467.8660888671875, |
|
"epoch": 0.057569296375266525, |
|
"grad_norm": 0.8011674880981445, |
|
"grpo_loss": -0.07316015660762787, |
|
"kl": 0.0003757476806640625, |
|
"learning_rate": 1.723404255319149e-06, |
|
"loss": 0.0, |
|
"reward": 0.5089285969734192, |
|
"reward_std": 0.393301784992218, |
|
"rewards/accuracy_reward": 0.5089285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 659.2767944335938, |
|
"epoch": 0.05970149253731343, |
|
"grad_norm": 0.3461753726005554, |
|
"grpo_loss": -0.1336020827293396, |
|
"kl": 0.0002613067626953125, |
|
"learning_rate": 1.7872340425531915e-06, |
|
"loss": 0.0, |
|
"reward": 0.3035714328289032, |
|
"reward_std": 0.3432351350784302, |
|
"rewards/accuracy_reward": 0.3035714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 722.732177734375, |
|
"epoch": 0.06183368869936034, |
|
"grad_norm": 0.4500427842140198, |
|
"grpo_loss": -0.05487678945064545, |
|
"kl": 0.000286102294921875, |
|
"learning_rate": 1.851063829787234e-06, |
|
"loss": 0.0, |
|
"reward": 0.3660714328289032, |
|
"reward_std": 0.3737320005893707, |
|
"rewards/accuracy_reward": 0.3660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 598.294677734375, |
|
"epoch": 0.06396588486140725, |
|
"grad_norm": 1.4335441589355469, |
|
"grpo_loss": -0.01668836548924446, |
|
"kl": 0.0007171630859375, |
|
"learning_rate": 1.9148936170212767e-06, |
|
"loss": 0.0, |
|
"reward": 0.3035714328289032, |
|
"reward_std": 0.37258976697921753, |
|
"rewards/accuracy_reward": 0.3035714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 576.669677734375, |
|
"epoch": 0.06609808102345416, |
|
"grad_norm": 0.637279748916626, |
|
"grpo_loss": -0.100145623087883, |
|
"kl": 0.000812530517578125, |
|
"learning_rate": 1.978723404255319e-06, |
|
"loss": 0.0, |
|
"reward": 0.330357164144516, |
|
"reward_std": 0.3501092493534088, |
|
"rewards/accuracy_reward": 0.330357164144516, |
|
"rewards/format_reward": 0.0, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 590.0357666015625, |
|
"epoch": 0.06823027718550106, |
|
"grad_norm": 1.1126184463500977, |
|
"grpo_loss": -0.01668006181716919, |
|
"kl": 0.00135040283203125, |
|
"learning_rate": 2.0425531914893617e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3660714328289032, |
|
"reward_std": 0.33336055278778076, |
|
"rewards/accuracy_reward": 0.3660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 550.5892944335938, |
|
"epoch": 0.07036247334754797, |
|
"grad_norm": 0.5474586486816406, |
|
"grpo_loss": 6.0905444115633145e-05, |
|
"kl": 0.00099945068359375, |
|
"learning_rate": 2.1063829787234044e-06, |
|
"loss": 0.0, |
|
"reward": 0.3214285969734192, |
|
"reward_std": 0.3206649422645569, |
|
"rewards/accuracy_reward": 0.3214285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 610.5892944335938, |
|
"epoch": 0.07249466950959488, |
|
"grad_norm": 0.6375254988670349, |
|
"grpo_loss": 0.10025076568126678, |
|
"kl": 0.008544921875, |
|
"learning_rate": 2.170212765957447e-06, |
|
"loss": 0.0003, |
|
"reward": 0.3750000298023224, |
|
"reward_std": 0.3903909921646118, |
|
"rewards/accuracy_reward": 0.3750000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 633.2589721679688, |
|
"epoch": 0.07462686567164178, |
|
"grad_norm": 0.5401435494422913, |
|
"grpo_loss": -0.05477993190288544, |
|
"kl": 0.002655029296875, |
|
"learning_rate": 2.2340425531914894e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3392857313156128, |
|
"reward_std": 0.35407260060310364, |
|
"rewards/accuracy_reward": 0.3392857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 586.9732666015625, |
|
"epoch": 0.0767590618336887, |
|
"grad_norm": 0.5796173810958862, |
|
"grpo_loss": 0.05496525764465332, |
|
"kl": 0.00262451171875, |
|
"learning_rate": 2.297872340425532e-06, |
|
"loss": 0.0001, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.4081922173500061, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 654.9017944335938, |
|
"epoch": 0.07889125799573561, |
|
"grad_norm": 0.5337525010108948, |
|
"grpo_loss": -0.0982007384300232, |
|
"kl": 0.0093994140625, |
|
"learning_rate": 2.3617021276595748e-06, |
|
"loss": 0.0004, |
|
"reward": 0.4107142984867096, |
|
"reward_std": 0.41005048155784607, |
|
"rewards/accuracy_reward": 0.4107142984867096, |
|
"rewards/format_reward": 0.0, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 615.4285888671875, |
|
"epoch": 0.08102345415778252, |
|
"grad_norm": 0.40850356221199036, |
|
"grpo_loss": -0.07306132465600967, |
|
"kl": 0.002777099609375, |
|
"learning_rate": 2.425531914893617e-06, |
|
"loss": 0.0001, |
|
"reward": 0.3482142984867096, |
|
"reward_std": 0.38642770051956177, |
|
"rewards/accuracy_reward": 0.3482142984867096, |
|
"rewards/format_reward": 0.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 645.3928833007812, |
|
"epoch": 0.08315565031982942, |
|
"grad_norm": 2.3292202949523926, |
|
"grpo_loss": 4.441291093826294e-05, |
|
"kl": 0.010009765625, |
|
"learning_rate": 2.4893617021276598e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5, |
|
"reward_std": 0.4099607765674591, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 647.3303833007812, |
|
"epoch": 0.08528784648187633, |
|
"grad_norm": 0.3452330231666565, |
|
"grpo_loss": -0.016505777835845947, |
|
"kl": 0.0033111572265625, |
|
"learning_rate": 2.553191489361702e-06, |
|
"loss": 0.0001, |
|
"reward": 0.535714328289032, |
|
"reward_std": 0.3580358922481537, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 680.4107666015625, |
|
"epoch": 0.08742004264392324, |
|
"grad_norm": 0.4223065972328186, |
|
"grpo_loss": 0.0549992173910141, |
|
"kl": 0.003387451171875, |
|
"learning_rate": 2.6170212765957447e-06, |
|
"loss": 0.0001, |
|
"reward": 0.4017857313156128, |
|
"reward_std": 0.3679104745388031, |
|
"rewards/accuracy_reward": 0.4017857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 524.732177734375, |
|
"epoch": 0.08955223880597014, |
|
"grad_norm": 0.6519092321395874, |
|
"grpo_loss": -0.07294157892465591, |
|
"kl": 0.01025390625, |
|
"learning_rate": 2.680851063829787e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5, |
|
"reward_std": 0.44742143154144287, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 653.4910888671875, |
|
"epoch": 0.09168443496801706, |
|
"grad_norm": 0.4341220557689667, |
|
"grpo_loss": 0.01703798398375511, |
|
"kl": 0.00616455078125, |
|
"learning_rate": 2.7446808510638297e-06, |
|
"loss": 0.0002, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.30004259943962097, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 603.5535888671875, |
|
"epoch": 0.09381663113006397, |
|
"grad_norm": 1.1075998544692993, |
|
"grpo_loss": -0.05463188886642456, |
|
"kl": 0.010498046875, |
|
"learning_rate": 2.8085106382978724e-06, |
|
"loss": 0.0004, |
|
"reward": 0.625, |
|
"reward_std": 0.38351690769195557, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 577.9017944335938, |
|
"epoch": 0.09594882729211088, |
|
"grad_norm": 0.45368558168411255, |
|
"grpo_loss": 0.000235043466091156, |
|
"kl": 0.007110595703125, |
|
"learning_rate": 2.872340425531915e-06, |
|
"loss": 0.0003, |
|
"reward": 0.4821428656578064, |
|
"reward_std": 0.3490566909313202, |
|
"rewards/accuracy_reward": 0.4732142984867096, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 617.7410888671875, |
|
"epoch": 0.09808102345415778, |
|
"grad_norm": 0.4727841019630432, |
|
"grpo_loss": 0.07361437380313873, |
|
"kl": 0.0216064453125, |
|
"learning_rate": 2.9361702127659574e-06, |
|
"loss": 0.0009, |
|
"reward": 0.4375000298023224, |
|
"reward_std": 0.3638574182987213, |
|
"rewards/accuracy_reward": 0.4375000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 638.0892944335938, |
|
"epoch": 0.10021321961620469, |
|
"grad_norm": 0.23905974626541138, |
|
"grpo_loss": 0.017111822962760925, |
|
"kl": 0.00933837890625, |
|
"learning_rate": 3e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5714285969734192, |
|
"reward_std": 0.31264859437942505, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 640.9732666015625, |
|
"epoch": 0.1023454157782516, |
|
"grad_norm": 1.3742345571517944, |
|
"grpo_loss": 0.0002704933285713196, |
|
"kl": 0.029296875, |
|
"learning_rate": 2.9999584343954855e-06, |
|
"loss": 0.0012, |
|
"reward": 0.5, |
|
"reward_std": 0.3501092493534088, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 633.0892944335938, |
|
"epoch": 0.1044776119402985, |
|
"grad_norm": 0.33841472864151, |
|
"grpo_loss": -0.11769130825996399, |
|
"kl": 0.01080322265625, |
|
"learning_rate": 2.999833739885541e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.4169245660305023, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 516.9642944335938, |
|
"epoch": 0.10660980810234541, |
|
"grad_norm": 0.32168567180633545, |
|
"grpo_loss": 0.10090474039316177, |
|
"kl": 0.0263671875, |
|
"learning_rate": 2.9996259233808368e-06, |
|
"loss": 0.0011, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.38351690769195557, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 669.3839721679688, |
|
"epoch": 0.10874200426439233, |
|
"grad_norm": 0.7801251411437988, |
|
"grpo_loss": -0.07295064628124237, |
|
"kl": 0.044677734375, |
|
"learning_rate": 2.9993349963987307e-06, |
|
"loss": 0.0018, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.31379079818725586, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 662.794677734375, |
|
"epoch": 0.11087420042643924, |
|
"grad_norm": 0.7691836953163147, |
|
"grpo_loss": -0.1333162784576416, |
|
"kl": 0.03076171875, |
|
"learning_rate": 2.9989609750626313e-06, |
|
"loss": 0.0012, |
|
"reward": 0.455357164144516, |
|
"reward_std": 0.3195226788520813, |
|
"rewards/accuracy_reward": 0.455357164144516, |
|
"rewards/format_reward": 0.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 632.3928833007812, |
|
"epoch": 0.11300639658848614, |
|
"grad_norm": 5.331604957580566, |
|
"grpo_loss": 0.10157421231269836, |
|
"kl": 0.0255126953125, |
|
"learning_rate": 2.998503880101102e-06, |
|
"loss": 0.001, |
|
"reward": 0.5267857313156128, |
|
"reward_std": 0.3679104745388031, |
|
"rewards/accuracy_reward": 0.5267857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 635.169677734375, |
|
"epoch": 0.11513859275053305, |
|
"grad_norm": 0.25747939944267273, |
|
"grpo_loss": 0.0004050950810778886, |
|
"kl": 0.006378173828125, |
|
"learning_rate": 2.9979637368467146e-06, |
|
"loss": 0.0003, |
|
"reward": 0.625, |
|
"reward_std": 0.3028636872768402, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 655.8035888671875, |
|
"epoch": 0.11727078891257996, |
|
"grad_norm": 26.88686752319336, |
|
"grpo_loss": 0.0002674385905265808, |
|
"kl": 0.59765625, |
|
"learning_rate": 2.9973405752346425e-06, |
|
"loss": 0.0237, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.37664279341697693, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 681.5178833007812, |
|
"epoch": 0.11940298507462686, |
|
"grad_norm": 0.22809027135372162, |
|
"grpo_loss": 0.00032356235897168517, |
|
"kl": 0.00927734375, |
|
"learning_rate": 2.9966344298010057e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.29016801714897156, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 597.0357666015625, |
|
"epoch": 0.12153518123667377, |
|
"grad_norm": 0.9132996797561646, |
|
"grpo_loss": 0.047636114060878754, |
|
"kl": 0.032470703125, |
|
"learning_rate": 2.9958453396809527e-06, |
|
"loss": 0.0013, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.2931685149669647, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 621.4107666015625, |
|
"epoch": 0.12366737739872068, |
|
"grad_norm": 0.29949259757995605, |
|
"grpo_loss": -0.015969865024089813, |
|
"kl": 0.01531982421875, |
|
"learning_rate": 2.994973348606494e-06, |
|
"loss": 0.0006, |
|
"reward": 0.535714328289032, |
|
"reward_std": 0.316791296005249, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 587.0178833007812, |
|
"epoch": 0.1257995735607676, |
|
"grad_norm": 0.3156592547893524, |
|
"grpo_loss": 0.0001606196165084839, |
|
"kl": 0.0283203125, |
|
"learning_rate": 2.994018504904078e-06, |
|
"loss": 0.0011, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.26945602893829346, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 538.6517944335938, |
|
"epoch": 0.1279317697228145, |
|
"grad_norm": 0.2868943214416504, |
|
"grpo_loss": 0.00023730844259262085, |
|
"kl": 0.0250244140625, |
|
"learning_rate": 2.9929808614919113e-06, |
|
"loss": 0.001, |
|
"reward": 0.7053571939468384, |
|
"reward_std": 0.25976085662841797, |
|
"rewards/accuracy_reward": 0.7053571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 679.1875, |
|
"epoch": 0.1300639658848614, |
|
"grad_norm": 0.15767447650432587, |
|
"grpo_loss": 8.637533755972981e-05, |
|
"kl": 0.004791259765625, |
|
"learning_rate": 2.9918604758770298e-06, |
|
"loss": 0.0002, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.24874404072761536, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 593.4464721679688, |
|
"epoch": 0.13219616204690832, |
|
"grad_norm": 0.20824034512043, |
|
"grpo_loss": 0.016763929277658463, |
|
"kl": 0.006134033203125, |
|
"learning_rate": 2.9906574101521067e-06, |
|
"loss": 0.0002, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.3431454598903656, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 582.4107666015625, |
|
"epoch": 0.13432835820895522, |
|
"grad_norm": 0.32729294896125793, |
|
"grpo_loss": -0.09963612258434296, |
|
"kl": 0.017333984375, |
|
"learning_rate": 2.9893717309920135e-06, |
|
"loss": 0.0007, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.26267164945602417, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 706.3303833007812, |
|
"epoch": 0.13646055437100213, |
|
"grad_norm": 0.17250199615955353, |
|
"grpo_loss": -0.10007388144731522, |
|
"kl": 0.00469970703125, |
|
"learning_rate": 2.9880035096501265e-06, |
|
"loss": 0.0002, |
|
"reward": 0.4107142984867096, |
|
"reward_std": 0.3334502577781677, |
|
"rewards/accuracy_reward": 0.4107142984867096, |
|
"rewards/format_reward": 0.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 508.1875305175781, |
|
"epoch": 0.13859275053304904, |
|
"grad_norm": 0.18875280022621155, |
|
"grpo_loss": 0.10045573860406876, |
|
"kl": 0.01495361328125, |
|
"learning_rate": 2.9865528219543747e-06, |
|
"loss": 0.0006, |
|
"reward": 0.7589285969734192, |
|
"reward_std": 0.3254339098930359, |
|
"rewards/accuracy_reward": 0.7589285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 589.0625, |
|
"epoch": 0.14072494669509594, |
|
"grad_norm": 0.2253061980009079, |
|
"grpo_loss": 0.0006140768527984619, |
|
"kl": 0.024658203125, |
|
"learning_rate": 2.9850197483030394e-06, |
|
"loss": 0.001, |
|
"reward": 0.6875000596046448, |
|
"reward_std": 0.2695457339286804, |
|
"rewards/accuracy_reward": 0.6875000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 553.5714721679688, |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 0.26960617303848267, |
|
"grpo_loss": 0.0003652747254818678, |
|
"kl": 0.020263671875, |
|
"learning_rate": 2.9834043736602984e-06, |
|
"loss": 0.0008, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.2792409062385559, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 590.5357666015625, |
|
"epoch": 0.14498933901918976, |
|
"grad_norm": 0.18081212043762207, |
|
"grpo_loss": 0.055027492344379425, |
|
"kl": 0.01373291015625, |
|
"learning_rate": 2.9817067875515166e-06, |
|
"loss": 0.0005, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.351251482963562, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 573.75, |
|
"epoch": 0.14712153518123666, |
|
"grad_norm": 0.14580072462558746, |
|
"grpo_loss": 9.892135858535767e-05, |
|
"kl": 0.010009765625, |
|
"learning_rate": 2.9799270840582835e-06, |
|
"loss": 0.0004, |
|
"reward": 0.7321428656578064, |
|
"reward_std": 0.24001170694828033, |
|
"rewards/accuracy_reward": 0.7321428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 549.3125, |
|
"epoch": 0.14925373134328357, |
|
"grad_norm": 0.16547168791294098, |
|
"grpo_loss": 0.0005362048395909369, |
|
"kl": 0.0089111328125, |
|
"learning_rate": 2.978065361813203e-06, |
|
"loss": 0.0004, |
|
"reward": 0.8660714626312256, |
|
"reward_std": 0.15152154862880707, |
|
"rewards/accuracy_reward": 0.8660714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 627.2410888671875, |
|
"epoch": 0.1513859275053305, |
|
"grad_norm": 0.2011519968509674, |
|
"grpo_loss": -0.04696042090654373, |
|
"kl": 0.00927734375, |
|
"learning_rate": 2.97612172399442e-06, |
|
"loss": 0.0004, |
|
"reward": 0.4821428656578064, |
|
"reward_std": 0.31961238384246826, |
|
"rewards/accuracy_reward": 0.4821428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 707.5267944335938, |
|
"epoch": 0.1535181236673774, |
|
"grad_norm": 0.5303527116775513, |
|
"grpo_loss": 0.047381460666656494, |
|
"kl": 0.0166015625, |
|
"learning_rate": 2.9740962783199107e-06, |
|
"loss": 0.0007, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.3098274767398834, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 553.2678833007812, |
|
"epoch": 0.15565031982942432, |
|
"grad_norm": 0.19617433845996857, |
|
"grpo_loss": 0.0549967885017395, |
|
"kl": 0.00518798828125, |
|
"learning_rate": 2.971989137041507e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.3748742640018463, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 639.7410888671875, |
|
"epoch": 0.15778251599147122, |
|
"grad_norm": 0.18848314881324768, |
|
"grpo_loss": -0.0726790651679039, |
|
"kl": 0.00927734375, |
|
"learning_rate": 2.969800416938676e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5892857313156128, |
|
"reward_std": 0.25570783019065857, |
|
"rewards/accuracy_reward": 0.5892857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 618.9285888671875, |
|
"epoch": 0.15991471215351813, |
|
"grad_norm": 0.15183982253074646, |
|
"grpo_loss": 0.0001978054642677307, |
|
"kl": 0.006591796875, |
|
"learning_rate": 2.967530239312051e-06, |
|
"loss": 0.0003, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.2832939326763153, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 628.6428833007812, |
|
"epoch": 0.16204690831556504, |
|
"grad_norm": 0.17083314061164856, |
|
"grpo_loss": 0.0003498228034004569, |
|
"kl": 0.0093994140625, |
|
"learning_rate": 2.9651787299767046e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5089285969734192, |
|
"reward_std": 0.3029533922672272, |
|
"rewards/accuracy_reward": 0.5089285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 645.6964721679688, |
|
"epoch": 0.16417910447761194, |
|
"grad_norm": 0.1758926510810852, |
|
"grpo_loss": 0.04745073243975639, |
|
"kl": 0.01544189453125, |
|
"learning_rate": 2.9627460192551807e-06, |
|
"loss": 0.0006, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.2027304321527481, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 548.2857666015625, |
|
"epoch": 0.16631130063965885, |
|
"grad_norm": 0.35794636607170105, |
|
"grpo_loss": 0.00010507553815841675, |
|
"kl": 0.0294189453125, |
|
"learning_rate": 2.960232241970268e-06, |
|
"loss": 0.0012, |
|
"reward": 0.7321428656578064, |
|
"reward_std": 0.3235756754875183, |
|
"rewards/accuracy_reward": 0.7321428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 602.4107666015625, |
|
"epoch": 0.16844349680170576, |
|
"grad_norm": 0.40275028347969055, |
|
"grpo_loss": 0.00044640994747169316, |
|
"kl": 0.018310546875, |
|
"learning_rate": 2.957637537437529e-06, |
|
"loss": 0.0007, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.20846228301525116, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 645.7589721679688, |
|
"epoch": 0.17057569296375266, |
|
"grad_norm": 0.15431800484657288, |
|
"grpo_loss": 0.0002495869994163513, |
|
"kl": 0.00836181640625, |
|
"learning_rate": 2.9549620494575816e-06, |
|
"loss": 0.0003, |
|
"reward": 0.5892857313156128, |
|
"reward_std": 0.2430122047662735, |
|
"rewards/accuracy_reward": 0.5892857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 612.6339721679688, |
|
"epoch": 0.17270788912579957, |
|
"grad_norm": 0.1417791098356247, |
|
"grpo_loss": 0.0002077631652355194, |
|
"kl": 0.00592041015625, |
|
"learning_rate": 2.952205926308125e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6339285969734192, |
|
"reward_std": 0.2832939326763153, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 476.14288330078125, |
|
"epoch": 0.17484008528784648, |
|
"grad_norm": 0.16035664081573486, |
|
"grpo_loss": 0.00023966887965798378, |
|
"kl": 0.0093994140625, |
|
"learning_rate": 2.9493693207357266e-06, |
|
"loss": 0.0004, |
|
"reward": 0.7767857313156128, |
|
"reward_std": 0.22626349329948425, |
|
"rewards/accuracy_reward": 0.7767857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 633.6785888671875, |
|
"epoch": 0.17697228144989338, |
|
"grad_norm": 0.22332347929477692, |
|
"grpo_loss": 0.05593012273311615, |
|
"kl": 0.01171875, |
|
"learning_rate": 2.946452389947353e-06, |
|
"loss": 0.0005, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.33741360902786255, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 681.1428833007812, |
|
"epoch": 0.1791044776119403, |
|
"grad_norm": 0.16065801680088043, |
|
"grpo_loss": 0.01748570427298546, |
|
"kl": 0.00799560546875, |
|
"learning_rate": 2.9434552956016594e-06, |
|
"loss": 0.0003, |
|
"reward": 0.625, |
|
"reward_std": 0.33336058259010315, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 675.5357666015625, |
|
"epoch": 0.1812366737739872, |
|
"grad_norm": 0.17096219956874847, |
|
"grpo_loss": 0.04753614217042923, |
|
"kl": 0.009521484375, |
|
"learning_rate": 2.9403782038000303e-06, |
|
"loss": 0.0004, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.35989415645599365, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 626.3482666015625, |
|
"epoch": 0.18336886993603413, |
|
"grad_norm": 0.16287542879581451, |
|
"grpo_loss": -0.07303746789693832, |
|
"kl": 0.006500244140625, |
|
"learning_rate": 2.9372212850773744e-06, |
|
"loss": 0.0003, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.25279706716537476, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 617.419677734375, |
|
"epoch": 0.18550106609808104, |
|
"grad_norm": 0.2341645210981369, |
|
"grpo_loss": -0.11798445880413055, |
|
"kl": 0.006378173828125, |
|
"learning_rate": 2.9339847143926705e-06, |
|
"loss": 0.0003, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.28338363766670227, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 664.5, |
|
"epoch": 0.18763326226012794, |
|
"grad_norm": 0.14599959552288055, |
|
"grpo_loss": 0.10042649507522583, |
|
"kl": 0.004547119140625, |
|
"learning_rate": 2.9306686711192755e-06, |
|
"loss": 0.0002, |
|
"reward": 0.535714328289032, |
|
"reward_std": 0.27933061122894287, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 574.3928833007812, |
|
"epoch": 0.18976545842217485, |
|
"grad_norm": 0.19283778965473175, |
|
"grpo_loss": 0.1338942050933838, |
|
"kl": 0.008056640625, |
|
"learning_rate": 2.92727333903498e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7946428656578064, |
|
"reward_std": 0.15839563310146332, |
|
"rewards/accuracy_reward": 0.7946428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 562.8125, |
|
"epoch": 0.19189765458422176, |
|
"grad_norm": 0.21768853068351746, |
|
"grpo_loss": 0.05516081675887108, |
|
"kl": 0.006927490234375, |
|
"learning_rate": 2.923798906311825e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7767857313156128, |
|
"reward_std": 0.30004262924194336, |
|
"rewards/accuracy_reward": 0.7767857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 590.607177734375, |
|
"epoch": 0.19402985074626866, |
|
"grad_norm": 0.18265320360660553, |
|
"grpo_loss": 0.11892025172710419, |
|
"kl": 0.00775146484375, |
|
"learning_rate": 2.920245565505673e-06, |
|
"loss": 0.0003, |
|
"reward": 0.455357164144516, |
|
"reward_std": 0.2832939326763153, |
|
"rewards/accuracy_reward": 0.455357164144516, |
|
"rewards/format_reward": 0.0, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 556.732177734375, |
|
"epoch": 0.19616204690831557, |
|
"grad_norm": 0.21578750014305115, |
|
"grpo_loss": -0.04708147421479225, |
|
"kl": 0.00531005859375, |
|
"learning_rate": 2.916613513545535e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6875000596046448, |
|
"reward_std": 0.3069167137145996, |
|
"rewards/accuracy_reward": 0.6875000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 562.2232666015625, |
|
"epoch": 0.19829424307036247, |
|
"grad_norm": 1.9854023456573486, |
|
"grpo_loss": -0.07264155149459839, |
|
"kl": 0.07373046875, |
|
"learning_rate": 2.912902951722658e-06, |
|
"loss": 0.003, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.29307880997657776, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 563.5535888671875, |
|
"epoch": 0.20042643923240938, |
|
"grad_norm": 0.19654212892055511, |
|
"grpo_loss": 0.0001397952437400818, |
|
"kl": 0.00885009765625, |
|
"learning_rate": 2.909114085679369e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5625, |
|
"reward_std": 0.31961238384246826, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 564.732177734375, |
|
"epoch": 0.2025586353944563, |
|
"grad_norm": 0.306581050157547, |
|
"grpo_loss": -0.07296334207057953, |
|
"kl": 0.0213623046875, |
|
"learning_rate": 2.9052471253976785e-06, |
|
"loss": 0.0009, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.26654526591300964, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 484.5000305175781, |
|
"epoch": 0.2046908315565032, |
|
"grad_norm": 0.0992506816983223, |
|
"grpo_loss": -0.0729239359498024, |
|
"kl": 0.00653076171875, |
|
"learning_rate": 2.9013022851876413e-06, |
|
"loss": 0.0003, |
|
"reward": 0.8928571939468384, |
|
"reward_std": 0.1514318436384201, |
|
"rewards/accuracy_reward": 0.8928571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 566.6875, |
|
"epoch": 0.2068230277185501, |
|
"grad_norm": 0.16707083582878113, |
|
"grpo_loss": 6.328150629997253e-05, |
|
"kl": 0.006195068359375, |
|
"learning_rate": 2.897279783675483e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7410714626312256, |
|
"reward_std": 0.25279706716537476, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 658.6964721679688, |
|
"epoch": 0.208955223880597, |
|
"grad_norm": 0.13153815269470215, |
|
"grpo_loss": 0.00013254210352897644, |
|
"kl": 0.006195068359375, |
|
"learning_rate": 2.893179843791478e-06, |
|
"loss": 0.0002, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.2418699413537979, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 604.9553833007812, |
|
"epoch": 0.21108742004264391, |
|
"grad_norm": 0.12218047678470612, |
|
"grpo_loss": -0.047130413353443146, |
|
"kl": 0.005096435546875, |
|
"learning_rate": 2.8890026927576e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.19471408426761627, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 474.1160888671875, |
|
"epoch": 0.21321961620469082, |
|
"grad_norm": 0.25340017676353455, |
|
"grpo_loss": 0.10033953934907913, |
|
"kl": 0.011962890625, |
|
"learning_rate": 2.8847485620749263e-06, |
|
"loss": 0.0005, |
|
"reward": 0.7321428656578064, |
|
"reward_std": 0.2724565267562866, |
|
"rewards/accuracy_reward": 0.7321428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21321961620469082, |
|
"eval_completion_length": 609.0062072558906, |
|
"eval_grpo_loss": 0.00038164247871754457, |
|
"eval_kl": 0.014738820231379792, |
|
"eval_loss": 0.0005614032270386815, |
|
"eval_reward": 0.5379963731422973, |
|
"eval_reward_std": 0.29159833647953437, |
|
"eval_rewards/accuracy_reward": 0.5379393215948781, |
|
"eval_rewards/format_reward": 5.7051577173863736e-05, |
|
"eval_runtime": 4026.4829, |
|
"eval_samples_per_second": 1.242, |
|
"eval_steps_per_second": 0.011, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 607.4464721679688, |
|
"epoch": 0.21535181236673773, |
|
"grad_norm": 0.10197024047374725, |
|
"grpo_loss": 0.00038136146031320095, |
|
"kl": 0.00634765625, |
|
"learning_rate": 2.8804176875108078e-06, |
|
"loss": 0.0003, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.14164696633815765, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 553.2142944335938, |
|
"epoch": 0.21748400852878466, |
|
"grad_norm": 0.15817181766033173, |
|
"grpo_loss": 0.0006425325991585851, |
|
"kl": 0.01019287109375, |
|
"learning_rate": 2.876010309085804e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6875000596046448, |
|
"reward_std": 0.1928558498620987, |
|
"rewards/accuracy_reward": 0.6875000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 554.8214721679688, |
|
"epoch": 0.21961620469083157, |
|
"grad_norm": 0.3014904260635376, |
|
"grpo_loss": 0.000414554524468258, |
|
"kl": 0.01361083984375, |
|
"learning_rate": 2.8715266710603797e-06, |
|
"loss": 0.0005, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.2625819444656372, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 684.0089721679688, |
|
"epoch": 0.22174840085287847, |
|
"grad_norm": 0.1303500086069107, |
|
"grpo_loss": 0.04732581973075867, |
|
"kl": 0.00628662109375, |
|
"learning_rate": 2.8669670219213677e-06, |
|
"loss": 0.0003, |
|
"reward": 0.4821428656578064, |
|
"reward_std": 0.216478630900383, |
|
"rewards/accuracy_reward": 0.4821428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 553.3660888671875, |
|
"epoch": 0.22388059701492538, |
|
"grad_norm": 0.24533118307590485, |
|
"grpo_loss": 0.047773294150829315, |
|
"kl": 0.00714111328125, |
|
"learning_rate": 2.862331614368199e-06, |
|
"loss": 0.0003, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.27933061122894287, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 601.3125, |
|
"epoch": 0.2260127931769723, |
|
"grad_norm": 0.17875026166439056, |
|
"grpo_loss": 0.0001519266952527687, |
|
"kl": 0.00836181640625, |
|
"learning_rate": 2.857620705298896e-06, |
|
"loss": 0.0003, |
|
"reward": 0.6875000596046448, |
|
"reward_std": 0.26654526591300964, |
|
"rewards/accuracy_reward": 0.6875000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 592.0625, |
|
"epoch": 0.2281449893390192, |
|
"grad_norm": 0.19455592334270477, |
|
"grpo_loss": 9.074594709090889e-05, |
|
"kl": 0.00518798828125, |
|
"learning_rate": 2.8528345557958363e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.222300186753273, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 607.6339721679688, |
|
"epoch": 0.2302771855010661, |
|
"grad_norm": 0.15129579603672028, |
|
"grpo_loss": 0.00017056881915777922, |
|
"kl": 0.00799560546875, |
|
"learning_rate": 2.847973431111284e-06, |
|
"loss": 0.0003, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.22626349329948425, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 575.4107666015625, |
|
"epoch": 0.232409381663113, |
|
"grad_norm": 0.23053210973739624, |
|
"grpo_loss": 9.338387462776154e-05, |
|
"kl": 0.0098876953125, |
|
"learning_rate": 2.843037600652686e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.21542608737945557, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 634.8392944335938, |
|
"epoch": 0.2345415778251599, |
|
"grad_norm": 0.1780773103237152, |
|
"grpo_loss": 0.0002103373408317566, |
|
"kl": 0.0050048828125, |
|
"learning_rate": 2.8380273379677463e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.29598960280418396, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 601.3035888671875, |
|
"epoch": 0.23667377398720682, |
|
"grad_norm": 0.22908470034599304, |
|
"grpo_loss": -0.01645032875239849, |
|
"kl": 0.0106201171875, |
|
"learning_rate": 2.8329429207292592e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.31970205903053284, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 700.0714721679688, |
|
"epoch": 0.23880597014925373, |
|
"grad_norm": 0.1368950456380844, |
|
"grpo_loss": 0.0002479627728462219, |
|
"kl": 0.00531005859375, |
|
"learning_rate": 2.827784630719728e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.26945602893829346, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 594.7053833007812, |
|
"epoch": 0.24093816631130063, |
|
"grad_norm": 0.11902152001857758, |
|
"grpo_loss": 0.00034128251718357205, |
|
"kl": 0.005340576171875, |
|
"learning_rate": 2.8225527538157413e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.24697549641132355, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 556.8035888671875, |
|
"epoch": 0.24307036247334754, |
|
"grad_norm": 2.0720760822296143, |
|
"grpo_loss": 0.1181984543800354, |
|
"kl": 0.05810546875, |
|
"learning_rate": 2.8172475799721353e-06, |
|
"loss": 0.0023, |
|
"reward": 0.7678571939468384, |
|
"reward_std": 0.3040059506893158, |
|
"rewards/accuracy_reward": 0.7678571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 609.9375, |
|
"epoch": 0.24520255863539445, |
|
"grad_norm": 0.45657312870025635, |
|
"grpo_loss": 0.00024568289518356323, |
|
"kl": 0.00994873046875, |
|
"learning_rate": 2.8118694032059203e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5714285969734192, |
|
"reward_std": 0.29016801714897156, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 606.669677734375, |
|
"epoch": 0.24733475479744135, |
|
"grad_norm": 0.14336203038692474, |
|
"grpo_loss": 0.00017342311912216246, |
|
"kl": 0.00799560546875, |
|
"learning_rate": 2.806418521579987e-06, |
|
"loss": 0.0003, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.2193893939256668, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 561.6607666015625, |
|
"epoch": 0.24946695095948826, |
|
"grad_norm": 0.21323221921920776, |
|
"grpo_loss": 0.00015168637037277222, |
|
"kl": 0.00628662109375, |
|
"learning_rate": 2.8008952371865888e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.3795536160469055, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 570.419677734375, |
|
"epoch": 0.2515991471215352, |
|
"grad_norm": 0.1808198243379593, |
|
"grpo_loss": 0.00019172427710145712, |
|
"kl": 0.00872802734375, |
|
"learning_rate": 2.795299856130599e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.3069167137145996, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 625.3660888671875, |
|
"epoch": 0.2537313432835821, |
|
"grad_norm": 8.979061126708984, |
|
"grpo_loss": 0.10089191794395447, |
|
"kl": 0.181640625, |
|
"learning_rate": 2.789632688512545e-06, |
|
"loss": 0.0073, |
|
"reward": 0.5, |
|
"reward_std": 0.2625819444656372, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 719.3482666015625, |
|
"epoch": 0.255863539445629, |
|
"grad_norm": 0.17377711832523346, |
|
"grpo_loss": 0.016832787543535233, |
|
"kl": 0.008544921875, |
|
"learning_rate": 2.783894048411425e-06, |
|
"loss": 0.0003, |
|
"reward": 0.4642857313156128, |
|
"reward_std": 0.22917428612709045, |
|
"rewards/accuracy_reward": 0.4642857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 520.7857666015625, |
|
"epoch": 0.2579957356076759, |
|
"grad_norm": 0.1450602114200592, |
|
"grpo_loss": 0.047628480941057205, |
|
"kl": 0.00958251953125, |
|
"learning_rate": 2.7780842538672983e-06, |
|
"loss": 0.0004, |
|
"reward": 0.7410714626312256, |
|
"reward_std": 0.2499759942293167, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 588.1339721679688, |
|
"epoch": 0.2601279317697228, |
|
"grad_norm": 0.24374298751354218, |
|
"grpo_loss": 0.00011707842350006104, |
|
"kl": 0.0140380859375, |
|
"learning_rate": 2.772203626863661e-06, |
|
"loss": 0.0006, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.31670159101486206, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 539.0982666015625, |
|
"epoch": 0.2622601279317697, |
|
"grad_norm": 0.6004351377487183, |
|
"grpo_loss": 0.00021716207265853882, |
|
"kl": 0.01055908203125, |
|
"learning_rate": 2.766252493309603e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6875000596046448, |
|
"reward_std": 0.2822414040565491, |
|
"rewards/accuracy_reward": 0.6875000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 640.2232666015625, |
|
"epoch": 0.26439232409381663, |
|
"grad_norm": 0.15684480965137482, |
|
"grpo_loss": 0.0734117180109024, |
|
"kl": 0.004180908203125, |
|
"learning_rate": 2.7602311830217408e-06, |
|
"loss": 0.0002, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.2692219913005829, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.008928571827709675, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 594.2767944335938, |
|
"epoch": 0.26652452025586354, |
|
"grad_norm": 0.15290771424770355, |
|
"grpo_loss": -0.046935826539993286, |
|
"kl": 0.00457763671875, |
|
"learning_rate": 2.754140029705945e-06, |
|
"loss": 0.0002, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.28725728392601013, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 613.8839721679688, |
|
"epoch": 0.26865671641791045, |
|
"grad_norm": 0.2813848555088043, |
|
"grpo_loss": -0.016470883041620255, |
|
"kl": 0.00555419921875, |
|
"learning_rate": 2.7479793709388413e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7053571939468384, |
|
"reward_std": 0.22521094977855682, |
|
"rewards/accuracy_reward": 0.7053571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 580.6339721679688, |
|
"epoch": 0.27078891257995735, |
|
"grad_norm": 0.2711651027202606, |
|
"grpo_loss": 0.11836685985326767, |
|
"kl": 0.009033203125, |
|
"learning_rate": 2.741749548149105e-06, |
|
"loss": 0.0004, |
|
"reward": 0.8303571939468384, |
|
"reward_std": 0.2960793077945709, |
|
"rewards/accuracy_reward": 0.8303571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 619.5357666015625, |
|
"epoch": 0.27292110874200426, |
|
"grad_norm": 0.18601906299591064, |
|
"grpo_loss": 0.04743209481239319, |
|
"kl": 0.006134033203125, |
|
"learning_rate": 2.735450906598535e-06, |
|
"loss": 0.0002, |
|
"reward": 0.7053571939468384, |
|
"reward_std": 0.3206648826599121, |
|
"rewards/accuracy_reward": 0.7053571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 593.7142944335938, |
|
"epoch": 0.27505330490405117, |
|
"grad_norm": 0.30626460909843445, |
|
"grpo_loss": 0.000193767249584198, |
|
"kl": 0.00970458984375, |
|
"learning_rate": 2.7290837953629244e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.3795536160469055, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 705.8928833007812, |
|
"epoch": 0.2771855010660981, |
|
"grad_norm": 0.17629599571228027, |
|
"grpo_loss": 0.00018097274005413055, |
|
"kl": 0.006988525390625, |
|
"learning_rate": 2.722648567312709e-06, |
|
"loss": 0.0003, |
|
"reward": 0.5, |
|
"reward_std": 0.2625819444656372, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 622.5982666015625, |
|
"epoch": 0.279317697228145, |
|
"grad_norm": 0.22440823912620544, |
|
"grpo_loss": 0.11841889470815659, |
|
"kl": 0.00799560546875, |
|
"learning_rate": 2.716145579093415e-06, |
|
"loss": 0.0003, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.30004259943962097, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 673.0625, |
|
"epoch": 0.2814498933901919, |
|
"grad_norm": 0.17421214282512665, |
|
"grpo_loss": 0.055158667266368866, |
|
"kl": 0.00592041015625, |
|
"learning_rate": 2.70957519110589e-06, |
|
"loss": 0.0002, |
|
"reward": 0.455357164144516, |
|
"reward_std": 0.33336055278778076, |
|
"rewards/accuracy_reward": 0.455357164144516, |
|
"rewards/format_reward": 0.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 579.982177734375, |
|
"epoch": 0.2835820895522388, |
|
"grad_norm": 0.21082566678524017, |
|
"grpo_loss": 0.04748784005641937, |
|
"kl": 0.007293701171875, |
|
"learning_rate": 2.702937767486333e-06, |
|
"loss": 0.0003, |
|
"reward": 0.535714328289032, |
|
"reward_std": 0.34323516488075256, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 648.9107666015625, |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.16841772198677063, |
|
"grpo_loss": 0.017035316675901413, |
|
"kl": 0.00640869140625, |
|
"learning_rate": 2.6962336760861105e-06, |
|
"loss": 0.0003, |
|
"reward": 0.6339285969734192, |
|
"reward_std": 0.2891155183315277, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 579.5, |
|
"epoch": 0.2878464818763326, |
|
"grad_norm": 0.15011237561702728, |
|
"grpo_loss": 0.00023609399795532227, |
|
"kl": 0.004119873046875, |
|
"learning_rate": 2.689463288451372e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.2695457339286804, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 542.25, |
|
"epoch": 0.2899786780383795, |
|
"grad_norm": 0.23496368527412415, |
|
"grpo_loss": 0.0005470961332321167, |
|
"kl": 0.00994873046875, |
|
"learning_rate": 2.6826269798024567e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.27933061122894287, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 660.232177734375, |
|
"epoch": 0.2921108742004264, |
|
"grad_norm": 0.1643698662519455, |
|
"grpo_loss": 0.05503631383180618, |
|
"kl": 0.0052490234375, |
|
"learning_rate": 2.6757251290131003e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.33450281620025635, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 597.4375, |
|
"epoch": 0.2942430703624733, |
|
"grad_norm": 0.3305177092552185, |
|
"grpo_loss": -0.07286404073238373, |
|
"kl": 0.0123291015625, |
|
"learning_rate": 2.6687581185894363e-06, |
|
"loss": 0.0005, |
|
"reward": 0.6339285969734192, |
|
"reward_std": 0.46417009830474854, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 606.5803833007812, |
|
"epoch": 0.29637526652452023, |
|
"grad_norm": 0.15607373416423798, |
|
"grpo_loss": -0.09995691478252411, |
|
"kl": 0.0059814453125, |
|
"learning_rate": 2.6617263346487987e-06, |
|
"loss": 0.0002, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.22812172770500183, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 636.4017944335938, |
|
"epoch": 0.29850746268656714, |
|
"grad_norm": 0.4233054518699646, |
|
"grpo_loss": -0.04692365229129791, |
|
"kl": 0.007720947265625, |
|
"learning_rate": 2.6546301668983207e-06, |
|
"loss": 0.0003, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.36280491948127747, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 654.5625, |
|
"epoch": 0.3006396588486141, |
|
"grad_norm": 0.30437514185905457, |
|
"grpo_loss": -0.04689915478229523, |
|
"kl": 0.006378173828125, |
|
"learning_rate": 2.6474700086133384e-06, |
|
"loss": 0.0003, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.3363610506057739, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 645.0625, |
|
"epoch": 0.302771855010661, |
|
"grad_norm": 0.16769744455814362, |
|
"grpo_loss": 0.00032076984643936157, |
|
"kl": 0.0069580078125, |
|
"learning_rate": 2.640246256615596e-06, |
|
"loss": 0.0003, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.2803831696510315, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 600.5892944335938, |
|
"epoch": 0.3049040511727079, |
|
"grad_norm": 0.185498908162117, |
|
"grpo_loss": 0.0009974241256713867, |
|
"kl": 0.0123291015625, |
|
"learning_rate": 2.632959311251251e-06, |
|
"loss": 0.0005, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.27641981840133667, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 667.2678833007812, |
|
"epoch": 0.3070362473347548, |
|
"grad_norm": 0.18703410029411316, |
|
"grpo_loss": 7.99037516117096e-05, |
|
"kl": 0.00555419921875, |
|
"learning_rate": 2.625609576368689e-06, |
|
"loss": 0.0002, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.33354002237319946, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 594.232177734375, |
|
"epoch": 0.3091684434968017, |
|
"grad_norm": 0.17264287173748016, |
|
"grpo_loss": 0.0005372203886508942, |
|
"kl": 0.0086669921875, |
|
"learning_rate": 2.6181974592961413e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.23613807559013367, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 636.4910888671875, |
|
"epoch": 0.31130063965884863, |
|
"grad_norm": 0.34566304087638855, |
|
"grpo_loss": -0.047039397060871124, |
|
"kl": 0.0096435546875, |
|
"learning_rate": 2.610723370819111e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.29713183641433716, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 693.5000610351562, |
|
"epoch": 0.31343283582089554, |
|
"grad_norm": 0.1667724996805191, |
|
"grpo_loss": -0.09995532780885696, |
|
"kl": 0.005706787109375, |
|
"learning_rate": 2.6031877251576055e-06, |
|
"loss": 0.0002, |
|
"reward": 0.5892857313156128, |
|
"reward_std": 0.24195964634418488, |
|
"rewards/accuracy_reward": 0.5892857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 625.357177734375, |
|
"epoch": 0.31556503198294245, |
|
"grad_norm": 0.2609766125679016, |
|
"grpo_loss": 0.0003863796591758728, |
|
"kl": 0.006622314453125, |
|
"learning_rate": 2.5955909399431797e-06, |
|
"loss": 0.0003, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.21824714541435242, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 539.3482666015625, |
|
"epoch": 0.31769722814498935, |
|
"grad_norm": 0.20627616345882416, |
|
"grpo_loss": 0.04802427440881729, |
|
"kl": 0.0081787109375, |
|
"learning_rate": 2.5879334361957955e-06, |
|
"loss": 0.0003, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.32366544008255005, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 622.8214721679688, |
|
"epoch": 0.31982942430703626, |
|
"grad_norm": 0.1990203708410263, |
|
"grpo_loss": 0.00011615455150604248, |
|
"kl": 0.01055908203125, |
|
"learning_rate": 2.5802156383004816e-06, |
|
"loss": 0.0004, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.24997597932815552, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 603.875, |
|
"epoch": 0.32196162046908317, |
|
"grad_norm": 0.2065436840057373, |
|
"grpo_loss": -0.047058749943971634, |
|
"kl": 0.01055908203125, |
|
"learning_rate": 2.572437973983818e-06, |
|
"loss": 0.0004, |
|
"reward": 0.625, |
|
"reward_std": 0.33159202337265015, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 576.2232666015625, |
|
"epoch": 0.32409381663113007, |
|
"grad_norm": 0.2540753185749054, |
|
"grpo_loss": -0.04715569317340851, |
|
"kl": 0.007293701171875, |
|
"learning_rate": 2.5646008742902307e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7321428656578064, |
|
"reward_std": 0.23208504915237427, |
|
"rewards/accuracy_reward": 0.7321428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 596.4910888671875, |
|
"epoch": 0.326226012793177, |
|
"grad_norm": 0.31783199310302734, |
|
"grpo_loss": 0.00040250414167530835, |
|
"kl": 0.0098876953125, |
|
"learning_rate": 2.556704773558101e-06, |
|
"loss": 0.0004, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.2625819444656372, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 665.375, |
|
"epoch": 0.3283582089552239, |
|
"grad_norm": 0.19316673278808594, |
|
"grpo_loss": -0.0164567232131958, |
|
"kl": 0.0067138671875, |
|
"learning_rate": 2.5487501093956955e-06, |
|
"loss": 0.0003, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.4376365542411804, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 576.1517944335938, |
|
"epoch": 0.3304904051172708, |
|
"grad_norm": 0.2708187997341156, |
|
"grpo_loss": -0.046256136149168015, |
|
"kl": 0.0133056640625, |
|
"learning_rate": 2.540737322656915e-06, |
|
"loss": 0.0005, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.38756993412971497, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 585.6160888671875, |
|
"epoch": 0.3326226012793177, |
|
"grad_norm": 0.3584955930709839, |
|
"grpo_loss": 0.00033188471570611, |
|
"kl": 0.0206298828125, |
|
"learning_rate": 2.532666857416858e-06, |
|
"loss": 0.0008, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.2586185932159424, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 408.6875305175781, |
|
"epoch": 0.3347547974413646, |
|
"grad_norm": 0.290502667427063, |
|
"grpo_loss": 0.0005320041673257947, |
|
"kl": 0.0177001953125, |
|
"learning_rate": 2.524539160947213e-06, |
|
"loss": 0.0007, |
|
"reward": 0.8303571939468384, |
|
"reward_std": 0.2637241780757904, |
|
"rewards/accuracy_reward": 0.8303571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 634.544677734375, |
|
"epoch": 0.3368869936034115, |
|
"grad_norm": 0.30897942185401917, |
|
"grpo_loss": 0.0007045269012451172, |
|
"kl": 0.0135498046875, |
|
"learning_rate": 2.5163546836914705e-06, |
|
"loss": 0.0005, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.2832939326763153, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 519.1160888671875, |
|
"epoch": 0.3390191897654584, |
|
"grad_norm": 0.17009268701076508, |
|
"grpo_loss": 0.00013849325478076935, |
|
"kl": 0.00860595703125, |
|
"learning_rate": 2.5081138792399558e-06, |
|
"loss": 0.0003, |
|
"reward": 0.8392857313156128, |
|
"reward_std": 0.20564121007919312, |
|
"rewards/accuracy_reward": 0.8392857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 649.6785888671875, |
|
"epoch": 0.3411513859275053, |
|
"grad_norm": 0.20049817860126495, |
|
"grpo_loss": 0.00011303433711873367, |
|
"kl": 0.0062255859375, |
|
"learning_rate": 2.499817204304694e-06, |
|
"loss": 0.0002, |
|
"reward": 0.392857164144516, |
|
"reward_std": 0.2153363674879074, |
|
"rewards/accuracy_reward": 0.392857164144516, |
|
"rewards/format_reward": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 584.1607666015625, |
|
"epoch": 0.34328358208955223, |
|
"grad_norm": 0.32430610060691833, |
|
"grpo_loss": 0.0003687143325805664, |
|
"kl": 0.01513671875, |
|
"learning_rate": 2.491465118694097e-06, |
|
"loss": 0.0006, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.2695457339286804, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 661.607177734375, |
|
"epoch": 0.34541577825159914, |
|
"grad_norm": 0.11526723206043243, |
|
"grpo_loss": 0.00017088651657104492, |
|
"kl": 0.0135498046875, |
|
"learning_rate": 2.483058085287483e-06, |
|
"loss": 0.0005, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.1928558498620987, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 644.357177734375, |
|
"epoch": 0.34754797441364604, |
|
"grad_norm": 0.16817770898342133, |
|
"grpo_loss": 0.055027663707733154, |
|
"kl": 0.00567626953125, |
|
"learning_rate": 2.474596570009417e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.23904886841773987, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 600.8928833007812, |
|
"epoch": 0.34968017057569295, |
|
"grad_norm": 0.2603262662887573, |
|
"grpo_loss": -0.04644265025854111, |
|
"kl": 0.021728515625, |
|
"learning_rate": 2.4660810418038984e-06, |
|
"loss": 0.0009, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.2862047255039215, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 667.419677734375, |
|
"epoch": 0.35181236673773986, |
|
"grad_norm": 0.1652158796787262, |
|
"grpo_loss": 7.62939453125e-05, |
|
"kl": 0.0089111328125, |
|
"learning_rate": 2.4575119726083624e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6339285969734192, |
|
"reward_std": 0.2389591485261917, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 694.2142944335938, |
|
"epoch": 0.35394456289978676, |
|
"grad_norm": 0.16855111718177795, |
|
"grpo_loss": 0.00020201876759529114, |
|
"kl": 0.01031494140625, |
|
"learning_rate": 2.448889837327531e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.35187777876853943, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 568.6160888671875, |
|
"epoch": 0.35607675906183367, |
|
"grad_norm": 0.21794408559799194, |
|
"grpo_loss": 0.01685473695397377, |
|
"kl": 0.006439208984375, |
|
"learning_rate": 2.440215113807091e-06, |
|
"loss": 0.0003, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.40203410387039185, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 555.6160888671875, |
|
"epoch": 0.3582089552238806, |
|
"grad_norm": 0.2721169888973236, |
|
"grpo_loss": -0.0713144987821579, |
|
"kl": 0.01556396484375, |
|
"learning_rate": 2.4314882828072126e-06, |
|
"loss": 0.0006, |
|
"reward": 0.723214328289032, |
|
"reward_std": 0.21251529455184937, |
|
"rewards/accuracy_reward": 0.723214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 576.1339721679688, |
|
"epoch": 0.3603411513859275, |
|
"grad_norm": 0.22736425697803497, |
|
"grpo_loss": 0.05517146736383438, |
|
"kl": 0.0185546875, |
|
"learning_rate": 2.422709827975903e-06, |
|
"loss": 0.0007, |
|
"reward": 0.7678571939468384, |
|
"reward_std": 0.27350908517837524, |
|
"rewards/accuracy_reward": 0.7678571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 586.4910888671875, |
|
"epoch": 0.3624733475479744, |
|
"grad_norm": 0.21218353509902954, |
|
"grpo_loss": 0.18360236287117004, |
|
"kl": 0.01043701171875, |
|
"learning_rate": 2.413880235822205e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.31961238384246826, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 623.1339721679688, |
|
"epoch": 0.3646055437100213, |
|
"grad_norm": 0.19818057119846344, |
|
"grpo_loss": -0.1175161674618721, |
|
"kl": 0.01446533203125, |
|
"learning_rate": 2.4049999956892328e-06, |
|
"loss": 0.0006, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.3069167137145996, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 565.6875, |
|
"epoch": 0.36673773987206826, |
|
"grad_norm": 0.18358924984931946, |
|
"grpo_loss": 0.055470846593379974, |
|
"kl": 0.01068115234375, |
|
"learning_rate": 2.396069599727051e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.23322731256484985, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 644.7410888671875, |
|
"epoch": 0.36886993603411516, |
|
"grad_norm": 0.09578749537467957, |
|
"grpo_loss": 5.925947334617376e-05, |
|
"kl": 0.00555419921875, |
|
"learning_rate": 2.3870895428654033e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.1986774057149887, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 561.4553833007812, |
|
"epoch": 0.37100213219616207, |
|
"grad_norm": 0.1927262395620346, |
|
"grpo_loss": 0.05529294162988663, |
|
"kl": 0.0096435546875, |
|
"learning_rate": 2.3780603227862782e-06, |
|
"loss": 0.0004, |
|
"reward": 0.7053571939468384, |
|
"reward_std": 0.3276287019252777, |
|
"rewards/accuracy_reward": 0.7053571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 581.4642944335938, |
|
"epoch": 0.373134328358209, |
|
"grad_norm": 0.2750888466835022, |
|
"grpo_loss": -0.047094203531742096, |
|
"kl": 0.014404296875, |
|
"learning_rate": 2.3689824398963307e-06, |
|
"loss": 0.0006, |
|
"reward": 0.723214328289032, |
|
"reward_std": 0.29713186621665955, |
|
"rewards/accuracy_reward": 0.723214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 658.9910888671875, |
|
"epoch": 0.3752665245202559, |
|
"grad_norm": 0.1884259432554245, |
|
"grpo_loss": 0.0011673122644424438, |
|
"kl": 0.01190185546875, |
|
"learning_rate": 2.3598563972991476e-06, |
|
"loss": 0.0005, |
|
"reward": 0.4732142984867096, |
|
"reward_std": 0.21251530945301056, |
|
"rewards/accuracy_reward": 0.4732142984867096, |
|
"rewards/format_reward": 0.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 614.5089721679688, |
|
"epoch": 0.3773987206823028, |
|
"grad_norm": 0.1662159264087677, |
|
"grpo_loss": 0.0007153626647777855, |
|
"kl": 0.0152587890625, |
|
"learning_rate": 2.350682700767365e-06, |
|
"loss": 0.0006, |
|
"reward": 0.7321428656578064, |
|
"reward_std": 0.16130642592906952, |
|
"rewards/accuracy_reward": 0.7321428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 615.1875, |
|
"epoch": 0.3795309168443497, |
|
"grad_norm": 0.2590639293193817, |
|
"grpo_loss": 0.00103016197681427, |
|
"kl": 0.0233154296875, |
|
"learning_rate": 2.3414618587146394e-06, |
|
"loss": 0.0009, |
|
"reward": 0.5625, |
|
"reward_std": 0.3599838316440582, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.0, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 653.2142944335938, |
|
"epoch": 0.3816631130063966, |
|
"grad_norm": 0.1929996758699417, |
|
"grpo_loss": -0.0464971587061882, |
|
"kl": 0.0118408203125, |
|
"learning_rate": 2.3321943821674686e-06, |
|
"loss": 0.0005, |
|
"reward": 0.5625, |
|
"reward_std": 0.2832939326763153, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.0, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 566.169677734375, |
|
"epoch": 0.3837953091684435, |
|
"grad_norm": 0.17937371134757996, |
|
"grpo_loss": 0.0013021007180213928, |
|
"kl": 0.01483154296875, |
|
"learning_rate": 2.3228807847368733e-06, |
|
"loss": 0.0006, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.34323516488075256, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 613.7410888671875, |
|
"epoch": 0.3859275053304904, |
|
"grad_norm": 0.08274456858634949, |
|
"grpo_loss": 0.01679709553718567, |
|
"kl": 0.0079345703125, |
|
"learning_rate": 2.3135215825899298e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7321428656578064, |
|
"reward_std": 0.20846226811408997, |
|
"rewards/accuracy_reward": 0.7321428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 590.2232666015625, |
|
"epoch": 0.3880597014925373, |
|
"grad_norm": 0.22545363008975983, |
|
"grpo_loss": 0.0002574026584625244, |
|
"kl": 0.00885009765625, |
|
"learning_rate": 2.304117294421165e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.25976085662841797, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 572.4464721679688, |
|
"epoch": 0.39019189765458423, |
|
"grad_norm": 0.2379264086484909, |
|
"grpo_loss": 0.0008170232176780701, |
|
"kl": 0.01513671875, |
|
"learning_rate": 2.294668441423809e-06, |
|
"loss": 0.0006, |
|
"reward": 0.625, |
|
"reward_std": 0.27236682176589966, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 597.5267944335938, |
|
"epoch": 0.39232409381663114, |
|
"grad_norm": 0.19797514379024506, |
|
"grpo_loss": 0.00013130396837368608, |
|
"kl": 0.00946044921875, |
|
"learning_rate": 2.2851755472609112e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.19180330634117126, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 635.2142944335938, |
|
"epoch": 0.39445628997867804, |
|
"grad_norm": 0.15277421474456787, |
|
"grpo_loss": 0.04737038165330887, |
|
"kl": 0.006622314453125, |
|
"learning_rate": 2.2756391380363176e-06, |
|
"loss": 0.0003, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.29016801714897156, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 602.2767944335938, |
|
"epoch": 0.39658848614072495, |
|
"grad_norm": 0.8596810698509216, |
|
"grpo_loss": 0.11840670555830002, |
|
"kl": 0.0244140625, |
|
"learning_rate": 2.2660597422655136e-06, |
|
"loss": 0.001, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.17223355174064636, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 629.0714721679688, |
|
"epoch": 0.39872068230277186, |
|
"grad_norm": 0.23284487426280975, |
|
"grpo_loss": -0.014619708061218262, |
|
"kl": 0.0194091796875, |
|
"learning_rate": 2.2564378908463343e-06, |
|
"loss": 0.0008, |
|
"reward": 0.455357164144516, |
|
"reward_std": 0.27933061122894287, |
|
"rewards/accuracy_reward": 0.455357164144516, |
|
"rewards/format_reward": 0.0, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 644.25, |
|
"epoch": 0.40085287846481876, |
|
"grad_norm": 0.16274768114089966, |
|
"grpo_loss": 0.0004352938267402351, |
|
"kl": 0.01171875, |
|
"learning_rate": 2.24677411702954e-06, |
|
"loss": 0.0005, |
|
"reward": 0.6339285969734192, |
|
"reward_std": 0.27933061122894287, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 567.6875, |
|
"epoch": 0.40298507462686567, |
|
"grad_norm": 0.3285521864891052, |
|
"grpo_loss": 0.07440680265426636, |
|
"kl": 0.031494140625, |
|
"learning_rate": 2.237068956389266e-06, |
|
"loss": 0.0013, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.3069167137145996, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 634.5625, |
|
"epoch": 0.4051172707889126, |
|
"grad_norm": 0.2191726267337799, |
|
"grpo_loss": 0.00019488348334562033, |
|
"kl": 0.0205078125, |
|
"learning_rate": 2.227322946793337e-06, |
|
"loss": 0.0008, |
|
"reward": 0.6339285969734192, |
|
"reward_std": 0.29307880997657776, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 517.9910888671875, |
|
"epoch": 0.4072494669509595, |
|
"grad_norm": 0.2723895311355591, |
|
"grpo_loss": 0.05497168004512787, |
|
"kl": 0.0152587890625, |
|
"learning_rate": 2.21753662837346e-06, |
|
"loss": 0.0006, |
|
"reward": 0.7500000596046448, |
|
"reward_std": 0.4014078378677368, |
|
"rewards/accuracy_reward": 0.7500000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 660.4107666015625, |
|
"epoch": 0.4093816631130064, |
|
"grad_norm": 0.343417763710022, |
|
"grpo_loss": -0.015305643901228905, |
|
"kl": 0.035888671875, |
|
"learning_rate": 2.20771054349529e-06, |
|
"loss": 0.0014, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.3334502577781677, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 505.982177734375, |
|
"epoch": 0.4115138592750533, |
|
"grad_norm": 0.2592337131500244, |
|
"grpo_loss": -0.01446167379617691, |
|
"kl": 0.0235595703125, |
|
"learning_rate": 2.197845236728371e-06, |
|
"loss": 0.0009, |
|
"reward": 0.7678571939468384, |
|
"reward_std": 0.305774450302124, |
|
"rewards/accuracy_reward": 0.7678571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 547.2589721679688, |
|
"epoch": 0.4136460554371002, |
|
"grad_norm": 0.1847042590379715, |
|
"grpo_loss": 0.00015539950982201844, |
|
"kl": 0.0157470703125, |
|
"learning_rate": 2.187941254815956e-06, |
|
"loss": 0.0006, |
|
"reward": 0.7589285969734192, |
|
"reward_std": 0.216478630900383, |
|
"rewards/accuracy_reward": 0.7589285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 629.1428833007812, |
|
"epoch": 0.4157782515991471, |
|
"grad_norm": 0.22321003675460815, |
|
"grpo_loss": 0.055582527071237564, |
|
"kl": 0.0302734375, |
|
"learning_rate": 2.1779991466447045e-06, |
|
"loss": 0.0012, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.3471984565258026, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 696.7142944335938, |
|
"epoch": 0.417910447761194, |
|
"grad_norm": 0.1626308560371399, |
|
"grpo_loss": 0.047444313764572144, |
|
"kl": 0.01507568359375, |
|
"learning_rate": 2.168019463214266e-06, |
|
"loss": 0.0006, |
|
"reward": 0.4017857313156128, |
|
"reward_std": 0.3294869661331177, |
|
"rewards/accuracy_reward": 0.4017857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 620.294677734375, |
|
"epoch": 0.4200426439232409, |
|
"grad_norm": 0.17187148332595825, |
|
"grpo_loss": 0.0013135150074958801, |
|
"kl": 0.020751953125, |
|
"learning_rate": 2.1580027576067387e-06, |
|
"loss": 0.0008, |
|
"reward": 0.4821428656578064, |
|
"reward_std": 0.3264864683151245, |
|
"rewards/accuracy_reward": 0.4821428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 561.7767944335938, |
|
"epoch": 0.42217484008528783, |
|
"grad_norm": 0.1164230927824974, |
|
"grpo_loss": 0.0005880985409021378, |
|
"kl": 0.010986328125, |
|
"learning_rate": 2.1479495849560225e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.1779654175043106, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 601.8125, |
|
"epoch": 0.42430703624733473, |
|
"grad_norm": 0.27141571044921875, |
|
"grpo_loss": 0.10048054158687592, |
|
"kl": 0.019287109375, |
|
"learning_rate": 2.1378605024170477e-06, |
|
"loss": 0.0008, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.2862944006919861, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 759.8839721679688, |
|
"epoch": 0.42643923240938164, |
|
"grad_norm": 0.13523803651332855, |
|
"grpo_loss": 0.001226574881002307, |
|
"kl": 0.01348876953125, |
|
"learning_rate": 2.127736069134901e-06, |
|
"loss": 0.0005, |
|
"reward": 0.4375000298023224, |
|
"reward_std": 0.3460562527179718, |
|
"rewards/accuracy_reward": 0.4375000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.42643923240938164, |
|
"eval_completion_length": 611.8816388894955, |
|
"eval_grpo_loss": -0.0015939308651721107, |
|
"eval_kl": 0.022449334970297524, |
|
"eval_loss": 0.0009027738706208766, |
|
"eval_reward": 0.5542560714883165, |
|
"eval_reward_std": 0.28316991823835497, |
|
"eval_rewards/accuracy_reward": 0.5542560714883165, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 4037.199, |
|
"eval_samples_per_second": 1.238, |
|
"eval_steps_per_second": 0.011, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 531.4375, |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 0.21513928472995758, |
|
"grpo_loss": 0.11847112327814102, |
|
"kl": 0.015869140625, |
|
"learning_rate": 2.117576846213835e-06, |
|
"loss": 0.0006, |
|
"reward": 0.723214328289032, |
|
"reward_std": 0.3363610804080963, |
|
"rewards/accuracy_reward": 0.723214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 666.0892944335938, |
|
"epoch": 0.43070362473347545, |
|
"grad_norm": 0.18399223685264587, |
|
"grpo_loss": 0.00018279292271472514, |
|
"kl": 0.02001953125, |
|
"learning_rate": 2.107383396686173e-06, |
|
"loss": 0.0008, |
|
"reward": 0.5, |
|
"reward_std": 0.23499584197998047, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 628.7767944335938, |
|
"epoch": 0.43283582089552236, |
|
"grad_norm": 0.18898162245750427, |
|
"grpo_loss": -0.0157584547996521, |
|
"kl": 0.0218505859375, |
|
"learning_rate": 2.0971562854811047e-06, |
|
"loss": 0.0009, |
|
"reward": 0.4196428656578064, |
|
"reward_std": 0.265492707490921, |
|
"rewards/accuracy_reward": 0.4196428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 642.169677734375, |
|
"epoch": 0.4349680170575693, |
|
"grad_norm": 0.14317277073860168, |
|
"grpo_loss": -0.09992219507694244, |
|
"kl": 0.01165771484375, |
|
"learning_rate": 2.0868960793933745e-06, |
|
"loss": 0.0005, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.34032437205314636, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 529.7589721679688, |
|
"epoch": 0.43710021321961623, |
|
"grad_norm": 0.1738615781068802, |
|
"grpo_loss": 0.00030606670770794153, |
|
"kl": 0.01385498046875, |
|
"learning_rate": 2.076603347051875e-06, |
|
"loss": 0.0006, |
|
"reward": 0.8214285969734192, |
|
"reward_std": 0.25279706716537476, |
|
"rewards/accuracy_reward": 0.8214285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 556.75, |
|
"epoch": 0.43923240938166314, |
|
"grad_norm": 0.19676941633224487, |
|
"grpo_loss": -0.047070156782865524, |
|
"kl": 0.015869140625, |
|
"learning_rate": 2.0662786588881275e-06, |
|
"loss": 0.0006, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.27641984820365906, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 650.7232666015625, |
|
"epoch": 0.44136460554371004, |
|
"grad_norm": 0.1258775293827057, |
|
"grpo_loss": 6.173212022986263e-05, |
|
"kl": 0.00579833984375, |
|
"learning_rate": 2.0559225871046738e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.1986774057149887, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 639.232177734375, |
|
"epoch": 0.44349680170575695, |
|
"grad_norm": 0.18094466626644135, |
|
"grpo_loss": -0.046934500336647034, |
|
"kl": 0.014892578125, |
|
"learning_rate": 2.045535705643358e-06, |
|
"loss": 0.0006, |
|
"reward": 0.625, |
|
"reward_std": 0.2832939326763153, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 609.0892944335938, |
|
"epoch": 0.44562899786780386, |
|
"grad_norm": 0.17628911137580872, |
|
"grpo_loss": -0.04628950357437134, |
|
"kl": 0.0152587890625, |
|
"learning_rate": 2.0351185901535227e-06, |
|
"loss": 0.0006, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.30004262924194336, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 580.0982666015625, |
|
"epoch": 0.44776119402985076, |
|
"grad_norm": 0.17487889528274536, |
|
"grpo_loss": -0.09924271702766418, |
|
"kl": 0.0172119140625, |
|
"learning_rate": 2.0246718179601055e-06, |
|
"loss": 0.0007, |
|
"reward": 0.723214328289032, |
|
"reward_std": 0.21137306094169617, |
|
"rewards/accuracy_reward": 0.723214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 667.8214721679688, |
|
"epoch": 0.44989339019189767, |
|
"grad_norm": 0.11795714497566223, |
|
"grpo_loss": 0.00025861337780952454, |
|
"kl": 0.01043701171875, |
|
"learning_rate": 2.014195968031641e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.2902577519416809, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 549.5714721679688, |
|
"epoch": 0.4520255863539446, |
|
"grad_norm": 0.15185706317424774, |
|
"grpo_loss": 0.11831355839967728, |
|
"kl": 0.00970458984375, |
|
"learning_rate": 2.003691620948176e-06, |
|
"loss": 0.0004, |
|
"reward": 0.7678571939468384, |
|
"reward_std": 0.1791076362133026, |
|
"rewards/accuracy_reward": 0.7678571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 685.2142944335938, |
|
"epoch": 0.4541577825159915, |
|
"grad_norm": 0.2295156866312027, |
|
"grpo_loss": 0.04919052869081497, |
|
"kl": 0.037353515625, |
|
"learning_rate": 1.993159358869093e-06, |
|
"loss": 0.0015, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.27933061122894287, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 569.1160888671875, |
|
"epoch": 0.4562899786780384, |
|
"grad_norm": 0.17347709834575653, |
|
"grpo_loss": 0.01694546267390251, |
|
"kl": 0.0196533203125, |
|
"learning_rate": 1.9825997655008457e-06, |
|
"loss": 0.0008, |
|
"reward": 0.6875000596046448, |
|
"reward_std": 0.24874402582645416, |
|
"rewards/accuracy_reward": 0.6875000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 482.8035888671875, |
|
"epoch": 0.4584221748400853, |
|
"grad_norm": 0.2306608110666275, |
|
"grpo_loss": 0.048458606004714966, |
|
"kl": 0.015869140625, |
|
"learning_rate": 1.9720134260646093e-06, |
|
"loss": 0.0006, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.24883373081684113, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 658.9107666015625, |
|
"epoch": 0.4605543710021322, |
|
"grad_norm": 0.1076991856098175, |
|
"grpo_loss": 0.0008707195520401001, |
|
"kl": 0.01287841796875, |
|
"learning_rate": 1.9614009272638483e-06, |
|
"loss": 0.0005, |
|
"reward": 0.5089285969734192, |
|
"reward_std": 0.16818054020404816, |
|
"rewards/accuracy_reward": 0.5089285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 619.625, |
|
"epoch": 0.4626865671641791, |
|
"grad_norm": 0.16215959191322327, |
|
"grpo_loss": 0.000715792179107666, |
|
"kl": 0.01007080078125, |
|
"learning_rate": 1.9507628572518003e-06, |
|
"loss": 0.0004, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.27350908517837524, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 536.8125, |
|
"epoch": 0.464818763326226, |
|
"grad_norm": 0.2339649200439453, |
|
"grpo_loss": 0.0006662532687187195, |
|
"kl": 0.025634765625, |
|
"learning_rate": 1.9400998055988797e-06, |
|
"loss": 0.001, |
|
"reward": 0.7678571939468384, |
|
"reward_std": 0.29713183641433716, |
|
"rewards/accuracy_reward": 0.7678571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 617.169677734375, |
|
"epoch": 0.4669509594882729, |
|
"grad_norm": 0.16902314126491547, |
|
"grpo_loss": 0.07331579923629761, |
|
"kl": 0.01611328125, |
|
"learning_rate": 1.9294123632600037e-06, |
|
"loss": 0.0006, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.31088003516197205, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 625.9642944335938, |
|
"epoch": 0.4690831556503198, |
|
"grad_norm": 0.21332789957523346, |
|
"grpo_loss": -0.046973273158073425, |
|
"kl": 0.0225830078125, |
|
"learning_rate": 1.9187011225418415e-06, |
|
"loss": 0.0009, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.2862047255039215, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 558.9732666015625, |
|
"epoch": 0.47121535181236673, |
|
"grad_norm": 0.205609530210495, |
|
"grpo_loss": -0.046633653342723846, |
|
"kl": 0.01953125, |
|
"learning_rate": 1.907966677069986e-06, |
|
"loss": 0.0008, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.24883373081684113, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 585.7589721679688, |
|
"epoch": 0.47334754797441364, |
|
"grad_norm": 0.21356123685836792, |
|
"grpo_loss": -0.07238468527793884, |
|
"kl": 0.031494140625, |
|
"learning_rate": 1.8972096217560587e-06, |
|
"loss": 0.0013, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.27059829235076904, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 660.482177734375, |
|
"epoch": 0.47547974413646055, |
|
"grad_norm": 0.13936030864715576, |
|
"grpo_loss": -0.04697471857070923, |
|
"kl": 0.0184326171875, |
|
"learning_rate": 1.886430552764735e-06, |
|
"loss": 0.0007, |
|
"reward": 0.5892857313156128, |
|
"reward_std": 0.1986774057149887, |
|
"rewards/accuracy_reward": 0.5892857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 591.4375, |
|
"epoch": 0.47761194029850745, |
|
"grad_norm": 0.13793712854385376, |
|
"grpo_loss": 0.01739496923983097, |
|
"kl": 0.01116943359375, |
|
"learning_rate": 1.8756300674807064e-06, |
|
"loss": 0.0004, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.20158818364143372, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 647.232177734375, |
|
"epoch": 0.47974413646055436, |
|
"grad_norm": 0.12633898854255676, |
|
"grpo_loss": 0.00012505054473876953, |
|
"kl": 0.005950927734375, |
|
"learning_rate": 1.8648087644755727e-06, |
|
"loss": 0.0002, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.24874402582645416, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 603.3392944335938, |
|
"epoch": 0.48187633262260127, |
|
"grad_norm": 0.25130102038383484, |
|
"grpo_loss": 0.00014068186283111572, |
|
"kl": 0.01904296875, |
|
"learning_rate": 1.8539672434746695e-06, |
|
"loss": 0.0008, |
|
"reward": 0.5625, |
|
"reward_std": 0.3098274767398834, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.0, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 631.232177734375, |
|
"epoch": 0.4840085287846482, |
|
"grad_norm": 0.18931794166564941, |
|
"grpo_loss": 0.00030638277530670166, |
|
"kl": 0.025390625, |
|
"learning_rate": 1.8431061053238275e-06, |
|
"loss": 0.001, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.3098274767398834, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 606.982177734375, |
|
"epoch": 0.4861407249466951, |
|
"grad_norm": 0.13589723408222198, |
|
"grpo_loss": -0.04714548587799072, |
|
"kl": 0.00933837890625, |
|
"learning_rate": 1.832225951956079e-06, |
|
"loss": 0.0004, |
|
"reward": 0.785714328289032, |
|
"reward_std": 0.20564121007919312, |
|
"rewards/accuracy_reward": 0.785714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 618.125, |
|
"epoch": 0.488272921108742, |
|
"grad_norm": 0.16579659283161163, |
|
"grpo_loss": 0.017814993858337402, |
|
"kl": 0.0208740234375, |
|
"learning_rate": 1.8213273863582918e-06, |
|
"loss": 0.0008, |
|
"reward": 0.5714285969734192, |
|
"reward_std": 0.23208506405353546, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 622.4910888671875, |
|
"epoch": 0.4904051172707889, |
|
"grad_norm": 0.1541467159986496, |
|
"grpo_loss": 0.00014001131057739258, |
|
"kl": 0.01806640625, |
|
"learning_rate": 1.8104110125377569e-06, |
|
"loss": 0.0007, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.2223002016544342, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 669.0267944335938, |
|
"epoch": 0.4925373134328358, |
|
"grad_norm": 0.3476913571357727, |
|
"grpo_loss": 0.04737507551908493, |
|
"kl": 0.031494140625, |
|
"learning_rate": 1.7994774354887095e-06, |
|
"loss": 0.0013, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.20960453152656555, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 584.1607666015625, |
|
"epoch": 0.4946695095948827, |
|
"grad_norm": 0.103955939412117, |
|
"grpo_loss": 0.0015392377972602844, |
|
"kl": 0.0244140625, |
|
"learning_rate": 1.7885272611588032e-06, |
|
"loss": 0.001, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.16818051040172577, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 626.6964721679688, |
|
"epoch": 0.4968017057569296, |
|
"grad_norm": 0.12988092005252838, |
|
"grpo_loss": 0.0003137141466140747, |
|
"kl": 0.007598876953125, |
|
"learning_rate": 1.7775610964155246e-06, |
|
"loss": 0.0003, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.2862047255039215, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 644.7857666015625, |
|
"epoch": 0.4989339019189765, |
|
"grad_norm": 0.1723473221063614, |
|
"grpo_loss": 0.00012041255831718445, |
|
"kl": 0.018310546875, |
|
"learning_rate": 1.7665795490125628e-06, |
|
"loss": 0.0007, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.2459229677915573, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 565.25, |
|
"epoch": 0.5010660980810234, |
|
"grad_norm": 0.19746161997318268, |
|
"grpo_loss": -0.046281278133392334, |
|
"kl": 0.0291748046875, |
|
"learning_rate": 1.7555832275561267e-06, |
|
"loss": 0.0012, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.31670159101486206, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 683.0535888671875, |
|
"epoch": 0.5031982942430704, |
|
"grad_norm": 0.10217569023370743, |
|
"grpo_loss": 0.00012935325503349304, |
|
"kl": 0.00860595703125, |
|
"learning_rate": 1.7445727414712143e-06, |
|
"loss": 0.0003, |
|
"reward": 0.4196428656578064, |
|
"reward_std": 0.21251529455184937, |
|
"rewards/accuracy_reward": 0.4196428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 677.669677734375, |
|
"epoch": 0.5053304904051172, |
|
"grad_norm": 0.1365184187889099, |
|
"grpo_loss": -0.04525519907474518, |
|
"kl": 0.0155029296875, |
|
"learning_rate": 1.7335487009678392e-06, |
|
"loss": 0.0006, |
|
"reward": 0.535714328289032, |
|
"reward_std": 0.2625819146633148, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 601.4732666015625, |
|
"epoch": 0.5074626865671642, |
|
"grad_norm": 0.20106498897075653, |
|
"grpo_loss": 7.900974014773965e-05, |
|
"kl": 0.0167236328125, |
|
"learning_rate": 1.7225117170072113e-06, |
|
"loss": 0.0007, |
|
"reward": 0.625, |
|
"reward_std": 0.222300186753273, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 712.4375610351562, |
|
"epoch": 0.509594882729211, |
|
"grad_norm": 0.17602279782295227, |
|
"grpo_loss": -0.046808093786239624, |
|
"kl": 0.0203857421875, |
|
"learning_rate": 1.7114624012678788e-06, |
|
"loss": 0.0008, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.2862046957015991, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 604.25, |
|
"epoch": 0.511727078891258, |
|
"grad_norm": 0.15483473241329193, |
|
"grpo_loss": 0.05522661283612251, |
|
"kl": 0.018798828125, |
|
"learning_rate": 1.7004013661118244e-06, |
|
"loss": 0.0008, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.25279706716537476, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 617.6339721679688, |
|
"epoch": 0.5138592750533049, |
|
"grad_norm": 0.1640341877937317, |
|
"grpo_loss": 0.0003432449884712696, |
|
"kl": 0.01220703125, |
|
"learning_rate": 1.689329224550533e-06, |
|
"loss": 0.0005, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.19471408426761627, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 643.857177734375, |
|
"epoch": 0.5159914712153518, |
|
"grad_norm": 0.16077256202697754, |
|
"grpo_loss": -0.04704210162162781, |
|
"kl": 0.0341796875, |
|
"learning_rate": 1.6782465902110138e-06, |
|
"loss": 0.0014, |
|
"reward": 0.4821428656578064, |
|
"reward_std": 0.21251529455184937, |
|
"rewards/accuracy_reward": 0.4821428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 558.3125, |
|
"epoch": 0.5181236673773987, |
|
"grad_norm": 0.22869129478931427, |
|
"grpo_loss": -0.09804785996675491, |
|
"kl": 0.0262451171875, |
|
"learning_rate": 1.6671540773017952e-06, |
|
"loss": 0.0011, |
|
"reward": 0.7410714626312256, |
|
"reward_std": 0.25570783019065857, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 624.982177734375, |
|
"epoch": 0.5202558635394456, |
|
"grad_norm": 0.1790386587381363, |
|
"grpo_loss": 0.11985684931278229, |
|
"kl": 0.02099609375, |
|
"learning_rate": 1.6560523005788832e-06, |
|
"loss": 0.0008, |
|
"reward": 0.625, |
|
"reward_std": 0.21542608737945557, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 654.6517944335938, |
|
"epoch": 0.5223880597014925, |
|
"grad_norm": 0.16443993151187897, |
|
"grpo_loss": -0.04706743732094765, |
|
"kl": 0.0181884765625, |
|
"learning_rate": 1.6449418753116917e-06, |
|
"loss": 0.0007, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.35698333382606506, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 542.6785888671875, |
|
"epoch": 0.5245202558635395, |
|
"grad_norm": 0.1444680094718933, |
|
"grpo_loss": -0.016090432181954384, |
|
"kl": 0.012939453125, |
|
"learning_rate": 1.6338234172489443e-06, |
|
"loss": 0.0005, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.1583956480026245, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 595.5625, |
|
"epoch": 0.5266524520255863, |
|
"grad_norm": 0.26646825671195984, |
|
"grpo_loss": 0.000985555350780487, |
|
"kl": 0.019775390625, |
|
"learning_rate": 1.6226975425845488e-06, |
|
"loss": 0.0008, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.2695457637310028, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 574.3928833007812, |
|
"epoch": 0.5287846481876333, |
|
"grad_norm": 0.19544407725334167, |
|
"grpo_loss": -0.04526256024837494, |
|
"kl": 0.026611328125, |
|
"learning_rate": 1.6115648679234468e-06, |
|
"loss": 0.0011, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.2666349411010742, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 597.419677734375, |
|
"epoch": 0.5309168443496801, |
|
"grad_norm": 0.19683720171451569, |
|
"grpo_loss": -0.07291500270366669, |
|
"kl": 0.0235595703125, |
|
"learning_rate": 1.6004260102474408e-06, |
|
"loss": 0.0009, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.2774723768234253, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 636.5535888671875, |
|
"epoch": 0.5330490405117271, |
|
"grad_norm": 0.20401231944561005, |
|
"grpo_loss": -0.11792036145925522, |
|
"kl": 0.0220947265625, |
|
"learning_rate": 1.5892815868810014e-06, |
|
"loss": 0.0009, |
|
"reward": 0.5892857313156128, |
|
"reward_std": 0.32075461745262146, |
|
"rewards/accuracy_reward": 0.5892857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 670.4017944335938, |
|
"epoch": 0.535181236673774, |
|
"grad_norm": 0.24213992059230804, |
|
"grpo_loss": 0.00031362520530819893, |
|
"kl": 0.01458740234375, |
|
"learning_rate": 1.5781322154570548e-06, |
|
"loss": 0.0006, |
|
"reward": 0.535714328289032, |
|
"reward_std": 0.27933061122894287, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 579.4732666015625, |
|
"epoch": 0.5373134328358209, |
|
"grad_norm": 0.1810131072998047, |
|
"grpo_loss": 0.0024060215801000595, |
|
"kl": 0.02685546875, |
|
"learning_rate": 1.5669785138827513e-06, |
|
"loss": 0.0011, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.24883373081684113, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 581.419677734375, |
|
"epoch": 0.5394456289978679, |
|
"grad_norm": 0.5175444483757019, |
|
"grpo_loss": 0.00015514915867242962, |
|
"kl": 0.01904296875, |
|
"learning_rate": 1.5558211003052223e-06, |
|
"loss": 0.0008, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.16526976227760315, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 624.4553833007812, |
|
"epoch": 0.5415778251599147, |
|
"grad_norm": 0.21199892461299896, |
|
"grpo_loss": -0.047045424580574036, |
|
"kl": 0.0361328125, |
|
"learning_rate": 1.5446605930773215e-06, |
|
"loss": 0.0014, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.30004262924194336, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 590.6607666015625, |
|
"epoch": 0.5437100213219617, |
|
"grad_norm": 0.24763309955596924, |
|
"grpo_loss": 0.0017136521637439728, |
|
"kl": 0.032958984375, |
|
"learning_rate": 1.5334976107233556e-06, |
|
"loss": 0.0013, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.29598960280418396, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 547.5535888671875, |
|
"epoch": 0.5458422174840085, |
|
"grad_norm": 0.2570716440677643, |
|
"grpo_loss": -0.046061620116233826, |
|
"kl": 0.052490234375, |
|
"learning_rate": 1.5223327719048027e-06, |
|
"loss": 0.0021, |
|
"reward": 0.7053571939468384, |
|
"reward_std": 0.40317636728286743, |
|
"rewards/accuracy_reward": 0.7053571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 700.4910888671875, |
|
"epoch": 0.5479744136460555, |
|
"grad_norm": 0.1590230017900467, |
|
"grpo_loss": 0.0008169859647750854, |
|
"kl": 0.042724609375, |
|
"learning_rate": 1.511166695386031e-06, |
|
"loss": 0.0017, |
|
"reward": 0.4107142984867096, |
|
"reward_std": 0.2891155183315277, |
|
"rewards/accuracy_reward": 0.4107142984867096, |
|
"rewards/format_reward": 0.0, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 563.8035888671875, |
|
"epoch": 0.5501066098081023, |
|
"grad_norm": 0.22808882594108582, |
|
"grpo_loss": -0.1334923654794693, |
|
"kl": 0.047119140625, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.0019, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.3363610506057739, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 588.4464721679688, |
|
"epoch": 0.5522388059701493, |
|
"grad_norm": 0.1522502601146698, |
|
"grpo_loss": -0.053403325378894806, |
|
"kl": 0.0177001953125, |
|
"learning_rate": 1.4888333046139694e-06, |
|
"loss": 0.0007, |
|
"reward": 0.535714328289032, |
|
"reward_std": 0.219389408826828, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 598.5714721679688, |
|
"epoch": 0.5543710021321961, |
|
"grad_norm": 0.14512988924980164, |
|
"grpo_loss": 0.0016735941171646118, |
|
"kl": 0.02001953125, |
|
"learning_rate": 1.477667228095197e-06, |
|
"loss": 0.0008, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.24592295289039612, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 675.044677734375, |
|
"epoch": 0.5565031982942431, |
|
"grad_norm": 0.23608967661857605, |
|
"grpo_loss": 0.057517096400260925, |
|
"kl": 0.052001953125, |
|
"learning_rate": 1.4665023892766447e-06, |
|
"loss": 0.0021, |
|
"reward": 0.4375000298023224, |
|
"reward_std": 0.3206648826599121, |
|
"rewards/accuracy_reward": 0.4375000298023224, |
|
"rewards/format_reward": 0.0, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 610.3839721679688, |
|
"epoch": 0.55863539445629, |
|
"grad_norm": 0.17539887130260468, |
|
"grpo_loss": -0.06992875784635544, |
|
"kl": 0.03564453125, |
|
"learning_rate": 1.4553394069226786e-06, |
|
"loss": 0.0014, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.24478070437908173, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 561.732177734375, |
|
"epoch": 0.5607675906183369, |
|
"grad_norm": 0.18649928271770477, |
|
"grpo_loss": 0.13406673073768616, |
|
"kl": 0.021728515625, |
|
"learning_rate": 1.444178899694778e-06, |
|
"loss": 0.0009, |
|
"reward": 0.7500000596046448, |
|
"reward_std": 0.25279706716537476, |
|
"rewards/accuracy_reward": 0.7500000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 585.482177734375, |
|
"epoch": 0.5628997867803838, |
|
"grad_norm": 0.1891089528799057, |
|
"grpo_loss": -0.1000761166214943, |
|
"kl": 0.03466796875, |
|
"learning_rate": 1.4330214861172487e-06, |
|
"loss": 0.0014, |
|
"reward": 0.7589285969734192, |
|
"reward_std": 0.29899007081985474, |
|
"rewards/accuracy_reward": 0.7589285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 666.8660888671875, |
|
"epoch": 0.5650319829424307, |
|
"grad_norm": 0.37016505002975464, |
|
"grpo_loss": -0.014646738767623901, |
|
"kl": 0.047607421875, |
|
"learning_rate": 1.4218677845429455e-06, |
|
"loss": 0.0019, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.2695457339286804, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 611.1339721679688, |
|
"epoch": 0.5671641791044776, |
|
"grad_norm": 0.15229058265686035, |
|
"grpo_loss": 0.05506877228617668, |
|
"kl": 0.01202392578125, |
|
"learning_rate": 1.410718413118999e-06, |
|
"loss": 0.0005, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.2724565267562866, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 507.8035888671875, |
|
"epoch": 0.5692963752665245, |
|
"grad_norm": 0.20434416830539703, |
|
"grpo_loss": 0.0012201443314552307, |
|
"kl": 0.0206298828125, |
|
"learning_rate": 1.3995739897525592e-06, |
|
"loss": 0.0008, |
|
"reward": 0.8392857313156128, |
|
"reward_std": 0.27059829235076904, |
|
"rewards/accuracy_reward": 0.8392857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 576.5267944335938, |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.19015087187290192, |
|
"grpo_loss": -0.047088075429201126, |
|
"kl": 0.0302734375, |
|
"learning_rate": 1.3884351320765534e-06, |
|
"loss": 0.0012, |
|
"reward": 0.7053571939468384, |
|
"reward_std": 0.2803831696510315, |
|
"rewards/accuracy_reward": 0.7053571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 554.9107666015625, |
|
"epoch": 0.5735607675906184, |
|
"grad_norm": 0.19130241870880127, |
|
"grpo_loss": 0.0005413442850112915, |
|
"kl": 0.0400390625, |
|
"learning_rate": 1.3773024574154515e-06, |
|
"loss": 0.0016, |
|
"reward": 0.5892857313156128, |
|
"reward_std": 0.27236682176589966, |
|
"rewards/accuracy_reward": 0.5892857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 586.3928833007812, |
|
"epoch": 0.5756929637526652, |
|
"grad_norm": 0.1519208699464798, |
|
"grpo_loss": -0.04699365422129631, |
|
"kl": 0.0186767578125, |
|
"learning_rate": 1.3661765827510562e-06, |
|
"loss": 0.0007, |
|
"reward": 0.535714328289032, |
|
"reward_std": 0.219389408826828, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 733.2410888671875, |
|
"epoch": 0.5778251599147122, |
|
"grad_norm": 0.12186726182699203, |
|
"grpo_loss": 0.00036180391907691956, |
|
"kl": 0.0245361328125, |
|
"learning_rate": 1.3550581246883084e-06, |
|
"loss": 0.001, |
|
"reward": 0.3660714328289032, |
|
"reward_std": 0.2124256044626236, |
|
"rewards/accuracy_reward": 0.3660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 600.6428833007812, |
|
"epoch": 0.579957356076759, |
|
"grad_norm": 0.15703478455543518, |
|
"grpo_loss": 0.00036340951919555664, |
|
"kl": 0.033203125, |
|
"learning_rate": 1.3439476994211171e-06, |
|
"loss": 0.0013, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.2459229677915573, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 608.2232666015625, |
|
"epoch": 0.582089552238806, |
|
"grad_norm": 0.17682023346424103, |
|
"grpo_loss": 0.017652347683906555, |
|
"kl": 0.026123046875, |
|
"learning_rate": 1.3328459226982051e-06, |
|
"loss": 0.001, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.36280491948127747, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 669.1875, |
|
"epoch": 0.5842217484008528, |
|
"grad_norm": 0.13847747445106506, |
|
"grpo_loss": 0.00015268649440258741, |
|
"kl": 0.0142822265625, |
|
"learning_rate": 1.3217534097889865e-06, |
|
"loss": 0.0006, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.31670159101486206, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 672.4285888671875, |
|
"epoch": 0.5863539445628998, |
|
"grad_norm": 0.15076103806495667, |
|
"grpo_loss": 0.0006213184678927064, |
|
"kl": 0.0167236328125, |
|
"learning_rate": 1.310670775449467e-06, |
|
"loss": 0.0007, |
|
"reward": 0.535714328289032, |
|
"reward_std": 0.31961238384246826, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 762.8839721679688, |
|
"epoch": 0.5884861407249466, |
|
"grad_norm": 0.12425374239683151, |
|
"grpo_loss": 0.0010431259870529175, |
|
"kl": 0.014892578125, |
|
"learning_rate": 1.2995986338881757e-06, |
|
"loss": 0.0006, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.2430122047662735, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 478.9285888671875, |
|
"epoch": 0.5906183368869936, |
|
"grad_norm": 0.15039986371994019, |
|
"grpo_loss": -0.04706709831953049, |
|
"kl": 0.0186767578125, |
|
"learning_rate": 1.2885375987321222e-06, |
|
"loss": 0.0007, |
|
"reward": 0.8392857313156128, |
|
"reward_std": 0.1928558498620987, |
|
"rewards/accuracy_reward": 0.8392857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 575.3660888671875, |
|
"epoch": 0.5927505330490405, |
|
"grad_norm": 0.14196723699569702, |
|
"grpo_loss": 0.05593723803758621, |
|
"kl": 0.015869140625, |
|
"learning_rate": 1.2774882829927886e-06, |
|
"loss": 0.0006, |
|
"reward": 0.6875000596046448, |
|
"reward_std": 0.20960454642772675, |
|
"rewards/accuracy_reward": 0.6875000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 699.4910888671875, |
|
"epoch": 0.5948827292110874, |
|
"grad_norm": 0.18591664731502533, |
|
"grpo_loss": 0.05759921297430992, |
|
"kl": 0.037841796875, |
|
"learning_rate": 1.2664512990321611e-06, |
|
"loss": 0.0015, |
|
"reward": 0.5625, |
|
"reward_std": 0.35407260060310364, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.0, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 637.1785888671875, |
|
"epoch": 0.5970149253731343, |
|
"grad_norm": 0.16850171983242035, |
|
"grpo_loss": 0.00017352678696624935, |
|
"kl": 0.020263671875, |
|
"learning_rate": 1.2554272585287862e-06, |
|
"loss": 0.0008, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.23208506405353546, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 593.5, |
|
"epoch": 0.5991471215351812, |
|
"grad_norm": 0.1354869157075882, |
|
"grpo_loss": 0.0011160299181938171, |
|
"kl": 0.0133056640625, |
|
"learning_rate": 1.244416772443874e-06, |
|
"loss": 0.0005, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.2832939326763153, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 542.4910888671875, |
|
"epoch": 0.6012793176972282, |
|
"grad_norm": 0.19948160648345947, |
|
"grpo_loss": 0.0181841142475605, |
|
"kl": 0.038818359375, |
|
"learning_rate": 1.2334204509874373e-06, |
|
"loss": 0.0016, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.305864155292511, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 567.794677734375, |
|
"epoch": 0.603411513859275, |
|
"grad_norm": 0.14735370874404907, |
|
"grpo_loss": 0.0002326183021068573, |
|
"kl": 0.018798828125, |
|
"learning_rate": 1.2224389035844757e-06, |
|
"loss": 0.0008, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.22812172770500183, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 283 |
|
}, |
|
{ |
|
"completion_length": 590.7678833007812, |
|
"epoch": 0.605543710021322, |
|
"grad_norm": 0.18942324817180634, |
|
"grpo_loss": -0.10003073513507843, |
|
"kl": 0.0299072265625, |
|
"learning_rate": 1.2114727388411973e-06, |
|
"loss": 0.0012, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.3235756754875183, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 284 |
|
}, |
|
{ |
|
"completion_length": 562.1607666015625, |
|
"epoch": 0.6076759061833689, |
|
"grad_norm": 0.23257029056549072, |
|
"grpo_loss": -0.05442488566040993, |
|
"kl": 0.050537109375, |
|
"learning_rate": 1.2005225645112906e-06, |
|
"loss": 0.002, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.26654526591300964, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 285 |
|
}, |
|
{ |
|
"completion_length": 545.0089721679688, |
|
"epoch": 0.6098081023454158, |
|
"grad_norm": 0.16995713114738464, |
|
"grpo_loss": 0.0012850586790591478, |
|
"kl": 0.0242919921875, |
|
"learning_rate": 1.1895889874622432e-06, |
|
"loss": 0.001, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.27933061122894287, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 286 |
|
}, |
|
{ |
|
"completion_length": 630.0357666015625, |
|
"epoch": 0.6119402985074627, |
|
"grad_norm": 0.15201345086097717, |
|
"grpo_loss": 0.04734975844621658, |
|
"kl": 0.02099609375, |
|
"learning_rate": 1.1786726136417083e-06, |
|
"loss": 0.0008, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.20855198800563812, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 287 |
|
}, |
|
{ |
|
"completion_length": 562.3928833007812, |
|
"epoch": 0.6140724946695096, |
|
"grad_norm": 0.14891578257083893, |
|
"grpo_loss": -0.04348957538604736, |
|
"kl": 0.024169921875, |
|
"learning_rate": 1.1677740480439218e-06, |
|
"loss": 0.001, |
|
"reward": 0.8214285969734192, |
|
"reward_std": 0.24697549641132355, |
|
"rewards/accuracy_reward": 0.8214285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 288 |
|
}, |
|
{ |
|
"completion_length": 716.5000610351562, |
|
"epoch": 0.6162046908315565, |
|
"grad_norm": 0.14191721379756927, |
|
"grpo_loss": 0.0008554987725801766, |
|
"kl": 0.030029296875, |
|
"learning_rate": 1.1568938946761726e-06, |
|
"loss": 0.0012, |
|
"reward": 0.4821428656578064, |
|
"reward_std": 0.18889252841472626, |
|
"rewards/accuracy_reward": 0.4821428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 289 |
|
}, |
|
{ |
|
"completion_length": 646.1875, |
|
"epoch": 0.6183368869936035, |
|
"grad_norm": 0.18612991273403168, |
|
"grpo_loss": -0.047120094299316406, |
|
"kl": 0.05126953125, |
|
"learning_rate": 1.1460327565253308e-06, |
|
"loss": 0.0021, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.27641981840133667, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 290 |
|
}, |
|
{ |
|
"completion_length": 532.75, |
|
"epoch": 0.6204690831556503, |
|
"grad_norm": 0.15706709027290344, |
|
"grpo_loss": -0.046887509524822235, |
|
"kl": 0.019287109375, |
|
"learning_rate": 1.1351912355244273e-06, |
|
"loss": 0.0008, |
|
"reward": 0.8125000596046448, |
|
"reward_std": 0.23613810539245605, |
|
"rewards/accuracy_reward": 0.8125000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 291 |
|
}, |
|
{ |
|
"completion_length": 528.8125, |
|
"epoch": 0.6226012793176973, |
|
"grad_norm": 0.26372042298316956, |
|
"grpo_loss": -0.05313573777675629, |
|
"kl": 0.046875, |
|
"learning_rate": 1.1243699325192936e-06, |
|
"loss": 0.0019, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.3501092493534088, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 292 |
|
}, |
|
{ |
|
"completion_length": 582.669677734375, |
|
"epoch": 0.6247334754797441, |
|
"grad_norm": 0.14917142689228058, |
|
"grpo_loss": 0.0013749670470133424, |
|
"kl": 0.0247802734375, |
|
"learning_rate": 1.1135694472352654e-06, |
|
"loss": 0.001, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.1986774057149887, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 293 |
|
}, |
|
{ |
|
"completion_length": 600.6428833007812, |
|
"epoch": 0.6268656716417911, |
|
"grad_norm": 0.24822071194648743, |
|
"grpo_loss": 0.0018174201250076294, |
|
"kl": 0.0390625, |
|
"learning_rate": 1.1027903782439413e-06, |
|
"loss": 0.0016, |
|
"reward": 0.6875000596046448, |
|
"reward_std": 0.2989003360271454, |
|
"rewards/accuracy_reward": 0.6875000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 294 |
|
}, |
|
{ |
|
"completion_length": 689.8482666015625, |
|
"epoch": 0.6289978678038379, |
|
"grad_norm": 0.13699181377887726, |
|
"grpo_loss": 0.0017366781830787659, |
|
"kl": 0.01446533203125, |
|
"learning_rate": 1.092033322930014e-06, |
|
"loss": 0.0006, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.2223002016544342, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 295 |
|
}, |
|
{ |
|
"completion_length": 597.2232666015625, |
|
"epoch": 0.6311300639658849, |
|
"grad_norm": 0.1713307648897171, |
|
"grpo_loss": -0.07294347882270813, |
|
"kl": 0.0286865234375, |
|
"learning_rate": 1.0812988774581588e-06, |
|
"loss": 0.0012, |
|
"reward": 0.723214328289032, |
|
"reward_std": 0.3264864683151245, |
|
"rewards/accuracy_reward": 0.723214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 296 |
|
}, |
|
{ |
|
"completion_length": 509.5625305175781, |
|
"epoch": 0.6332622601279317, |
|
"grad_norm": 0.22946512699127197, |
|
"grpo_loss": 0.11910638958215714, |
|
"kl": 0.02734375, |
|
"learning_rate": 1.0705876367399966e-06, |
|
"loss": 0.0011, |
|
"reward": 0.6875000596046448, |
|
"reward_std": 0.2666349411010742, |
|
"rewards/accuracy_reward": 0.6875000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 297 |
|
}, |
|
{ |
|
"completion_length": 631.6964721679688, |
|
"epoch": 0.6353944562899787, |
|
"grad_norm": 0.29484447836875916, |
|
"grpo_loss": 0.05576039105653763, |
|
"kl": 0.07080078125, |
|
"learning_rate": 1.0599001944011204e-06, |
|
"loss": 0.0028, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.2538496255874634, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 298 |
|
}, |
|
{ |
|
"completion_length": 634.0714721679688, |
|
"epoch": 0.6375266524520256, |
|
"grad_norm": 0.5733016729354858, |
|
"grpo_loss": 0.0023387782275676727, |
|
"kl": 0.0213623046875, |
|
"learning_rate": 1.0492371427482002e-06, |
|
"loss": 0.0009, |
|
"reward": 0.7767857313156128, |
|
"reward_std": 0.16526976227760315, |
|
"rewards/accuracy_reward": 0.7767857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 299 |
|
}, |
|
{ |
|
"completion_length": 587.0982666015625, |
|
"epoch": 0.6396588486140725, |
|
"grad_norm": 0.15578092634677887, |
|
"grpo_loss": 0.0022323429584503174, |
|
"kl": 0.0264892578125, |
|
"learning_rate": 1.0385990727361518e-06, |
|
"loss": 0.0011, |
|
"reward": 0.7053571939468384, |
|
"reward_std": 0.3029533922672272, |
|
"rewards/accuracy_reward": 0.7053571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6396588486140725, |
|
"eval_completion_length": 607.3742406680561, |
|
"eval_grpo_loss": -0.0026763521267401317, |
|
"eval_kl": 0.03561235616763179, |
|
"eval_loss": 0.0014403663808479905, |
|
"eval_reward": 0.5617298292466246, |
|
"eval_reward_std": 0.2739596153362491, |
|
"eval_rewards/accuracy_reward": 0.5617298292466246, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 4024.0485, |
|
"eval_samples_per_second": 1.243, |
|
"eval_steps_per_second": 0.011, |
|
"step": 300 |
|
}, |
|
{ |
|
"completion_length": 598.4375, |
|
"epoch": 0.6417910447761194, |
|
"grad_norm": 0.13117839395999908, |
|
"grpo_loss": -0.04589998349547386, |
|
"kl": 0.018310546875, |
|
"learning_rate": 1.027986573935391e-06, |
|
"loss": 0.0007, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.23208506405353546, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 301 |
|
}, |
|
{ |
|
"completion_length": 584.4642944335938, |
|
"epoch": 0.6439232409381663, |
|
"grad_norm": 0.17518894374370575, |
|
"grpo_loss": 0.04966043680906296, |
|
"kl": 0.04052734375, |
|
"learning_rate": 1.0174002344991544e-06, |
|
"loss": 0.0016, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.14861077070236206, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 302 |
|
}, |
|
{ |
|
"completion_length": 561.0892944335938, |
|
"epoch": 0.6460554371002132, |
|
"grad_norm": 0.15005171298980713, |
|
"grpo_loss": 0.00026489607989788055, |
|
"kl": 0.0213623046875, |
|
"learning_rate": 1.0068406411309068e-06, |
|
"loss": 0.0009, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.19585634768009186, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 303 |
|
}, |
|
{ |
|
"completion_length": 598.232177734375, |
|
"epoch": 0.6481876332622601, |
|
"grad_norm": 0.19444920122623444, |
|
"grpo_loss": 0.00011016204371117055, |
|
"kl": 0.03564453125, |
|
"learning_rate": 9.96308379051824e-07, |
|
"loss": 0.0014, |
|
"reward": 0.5267857313156128, |
|
"reward_std": 0.30409565567970276, |
|
"rewards/accuracy_reward": 0.5267857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 304 |
|
}, |
|
{ |
|
"completion_length": 541.9464721679688, |
|
"epoch": 0.650319829424307, |
|
"grad_norm": 0.19593097269535065, |
|
"grpo_loss": -0.07303665578365326, |
|
"kl": 0.040771484375, |
|
"learning_rate": 9.858040319683595e-07, |
|
"loss": 0.0016, |
|
"reward": 0.6339285969734192, |
|
"reward_std": 0.2989003658294678, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 305 |
|
}, |
|
{ |
|
"completion_length": 662.419677734375, |
|
"epoch": 0.652452025586354, |
|
"grad_norm": 0.156465083360672, |
|
"grpo_loss": 0.0031750944908708334, |
|
"kl": 0.04833984375, |
|
"learning_rate": 9.753281820398952e-07, |
|
"loss": 0.0019, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.25570783019065857, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 306 |
|
}, |
|
{ |
|
"completion_length": 623.2232666015625, |
|
"epoch": 0.6545842217484008, |
|
"grad_norm": 0.16462744772434235, |
|
"grpo_loss": 0.04866427183151245, |
|
"kl": 0.03271484375, |
|
"learning_rate": 9.648814098464774e-07, |
|
"loss": 0.0013, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.29713183641433716, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 307 |
|
}, |
|
{ |
|
"completion_length": 568.5535888671875, |
|
"epoch": 0.6567164179104478, |
|
"grad_norm": 0.20251184701919556, |
|
"grpo_loss": 0.0002517774701118469, |
|
"kl": 0.0267333984375, |
|
"learning_rate": 9.544642943566426e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.30004259943962097, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 308 |
|
}, |
|
{ |
|
"completion_length": 512.169677734375, |
|
"epoch": 0.6588486140724946, |
|
"grad_norm": 0.20472615957260132, |
|
"grpo_loss": 0.00059002626221627, |
|
"kl": 0.04638671875, |
|
"learning_rate": 9.440774128953266e-07, |
|
"loss": 0.0018, |
|
"reward": 0.723214328289032, |
|
"reward_std": 0.21251530945301056, |
|
"rewards/accuracy_reward": 0.723214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 309 |
|
}, |
|
{ |
|
"completion_length": 630.8839721679688, |
|
"epoch": 0.6609808102345416, |
|
"grad_norm": 0.15194885432720184, |
|
"grpo_loss": 0.00018625587108545005, |
|
"kl": 0.032958984375, |
|
"learning_rate": 9.337213411118722e-07, |
|
"loss": 0.0013, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.23904886841773987, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 310 |
|
}, |
|
{ |
|
"completion_length": 678.7142944335938, |
|
"epoch": 0.6631130063965884, |
|
"grad_norm": 0.14824587106704712, |
|
"grpo_loss": -0.014349933713674545, |
|
"kl": 0.0247802734375, |
|
"learning_rate": 9.233966529481255e-07, |
|
"loss": 0.001, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.2459229677915573, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 311 |
|
}, |
|
{ |
|
"completion_length": 564.0267944335938, |
|
"epoch": 0.6652452025586354, |
|
"grad_norm": 0.16684581339359283, |
|
"grpo_loss": 0.0020749643445014954, |
|
"kl": 0.0279541015625, |
|
"learning_rate": 9.131039206066261e-07, |
|
"loss": 0.0011, |
|
"reward": 0.7500000596046448, |
|
"reward_std": 0.2724565267562866, |
|
"rewards/accuracy_reward": 0.7500000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 312 |
|
}, |
|
{ |
|
"completion_length": 641.0089721679688, |
|
"epoch": 0.6673773987206824, |
|
"grad_norm": 0.17203587293624878, |
|
"grpo_loss": 0.001197556615807116, |
|
"kl": 0.033447265625, |
|
"learning_rate": 9.028437145188962e-07, |
|
"loss": 0.0013, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.3391821086406708, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 313 |
|
}, |
|
{ |
|
"completion_length": 550.5178833007812, |
|
"epoch": 0.6695095948827292, |
|
"grad_norm": 0.16829533874988556, |
|
"grpo_loss": 0.00012534562847577035, |
|
"kl": 0.031982421875, |
|
"learning_rate": 8.926166033138266e-07, |
|
"loss": 0.0013, |
|
"reward": 0.5892857313156128, |
|
"reward_std": 0.21251530945301056, |
|
"rewards/accuracy_reward": 0.5892857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 314 |
|
}, |
|
{ |
|
"completion_length": 680.6785888671875, |
|
"epoch": 0.6716417910447762, |
|
"grad_norm": 0.36079153418540955, |
|
"grpo_loss": 0.0016356073319911957, |
|
"kl": 0.031005859375, |
|
"learning_rate": 8.82423153786165e-07, |
|
"loss": 0.0012, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.29016801714897156, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 315 |
|
}, |
|
{ |
|
"completion_length": 610.3482666015625, |
|
"epoch": 0.673773987206823, |
|
"grad_norm": 0.1447429209947586, |
|
"grpo_loss": 0.001389749813824892, |
|
"kl": 0.033447265625, |
|
"learning_rate": 8.72263930865099e-07, |
|
"loss": 0.0013, |
|
"reward": 0.5714285969734192, |
|
"reward_std": 0.2124256193637848, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 316 |
|
}, |
|
{ |
|
"completion_length": 716.5982666015625, |
|
"epoch": 0.67590618336887, |
|
"grad_norm": 0.12955766916275024, |
|
"grpo_loss": 0.0010962896049022675, |
|
"kl": 0.010009765625, |
|
"learning_rate": 8.62139497582953e-07, |
|
"loss": 0.0004, |
|
"reward": 0.5892857313156128, |
|
"reward_std": 0.29598960280418396, |
|
"rewards/accuracy_reward": 0.5892857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 317 |
|
}, |
|
{ |
|
"completion_length": 766.857177734375, |
|
"epoch": 0.6780383795309168, |
|
"grad_norm": 0.14128656685352325, |
|
"grpo_loss": 0.0012699184007942677, |
|
"kl": 0.02978515625, |
|
"learning_rate": 8.520504150439773e-07, |
|
"loss": 0.0012, |
|
"reward": 0.3571428656578064, |
|
"reward_std": 0.3028636872768402, |
|
"rewards/accuracy_reward": 0.3571428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 318 |
|
}, |
|
{ |
|
"completion_length": 644.1428833007812, |
|
"epoch": 0.6801705756929638, |
|
"grad_norm": 0.15934458374977112, |
|
"grpo_loss": 0.0012877359986305237, |
|
"kl": 0.04541015625, |
|
"learning_rate": 8.419972423932613e-07, |
|
"loss": 0.0018, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.29307880997657776, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 319 |
|
}, |
|
{ |
|
"completion_length": 571.5803833007812, |
|
"epoch": 0.6823027718550106, |
|
"grad_norm": 0.1932043880224228, |
|
"grpo_loss": 0.00025262683629989624, |
|
"kl": 0.0269775390625, |
|
"learning_rate": 8.319805367857344e-07, |
|
"loss": 0.0011, |
|
"reward": 0.6875000596046448, |
|
"reward_std": 0.2832939326763153, |
|
"rewards/accuracy_reward": 0.6875000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 320 |
|
}, |
|
{ |
|
"completion_length": 548.4553833007812, |
|
"epoch": 0.6844349680170576, |
|
"grad_norm": 0.13966359198093414, |
|
"grpo_loss": 0.07430977374315262, |
|
"kl": 0.0159912109375, |
|
"learning_rate": 8.220008533552953e-07, |
|
"loss": 0.0006, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.2625819444656372, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 321 |
|
}, |
|
{ |
|
"completion_length": 656.044677734375, |
|
"epoch": 0.6865671641791045, |
|
"grad_norm": 0.25401002168655396, |
|
"grpo_loss": 0.00247298926115036, |
|
"kl": 0.043701171875, |
|
"learning_rate": 8.120587451840439e-07, |
|
"loss": 0.0017, |
|
"reward": 0.625, |
|
"reward_std": 0.2891154885292053, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 322 |
|
}, |
|
{ |
|
"completion_length": 615.544677734375, |
|
"epoch": 0.6886993603411514, |
|
"grad_norm": 0.16577310860157013, |
|
"grpo_loss": -0.05358615145087242, |
|
"kl": 0.00927734375, |
|
"learning_rate": 8.021547632716291e-07, |
|
"loss": 0.0004, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.29713183641433716, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 323 |
|
}, |
|
{ |
|
"completion_length": 623.0982666015625, |
|
"epoch": 0.6908315565031983, |
|
"grad_norm": 0.12740829586982727, |
|
"grpo_loss": 0.05615002661943436, |
|
"kl": 0.0255126953125, |
|
"learning_rate": 7.9228945650471e-07, |
|
"loss": 0.001, |
|
"reward": 0.5714285969734192, |
|
"reward_std": 0.2389591485261917, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 324 |
|
}, |
|
{ |
|
"completion_length": 638.9375, |
|
"epoch": 0.6929637526652452, |
|
"grad_norm": 0.6456794738769531, |
|
"grpo_loss": -0.045602887868881226, |
|
"kl": 0.0302734375, |
|
"learning_rate": 7.824633716265399e-07, |
|
"loss": 0.0012, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.1820184290409088, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 325 |
|
}, |
|
{ |
|
"completion_length": 610.3035888671875, |
|
"epoch": 0.6950959488272921, |
|
"grad_norm": 0.12833137810230255, |
|
"grpo_loss": 0.00032634875969961286, |
|
"kl": 0.0096435546875, |
|
"learning_rate": 7.72677053206663e-07, |
|
"loss": 0.0004, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.2418699413537979, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 326 |
|
}, |
|
{ |
|
"completion_length": 561.544677734375, |
|
"epoch": 0.697228144989339, |
|
"grad_norm": 0.19920487701892853, |
|
"grpo_loss": -0.04694785922765732, |
|
"kl": 0.0240478515625, |
|
"learning_rate": 7.629310436107342e-07, |
|
"loss": 0.001, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.3472881615161896, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 327 |
|
}, |
|
{ |
|
"completion_length": 600.1517944335938, |
|
"epoch": 0.6993603411513859, |
|
"grad_norm": 0.13510799407958984, |
|
"grpo_loss": 0.0018375907093286514, |
|
"kl": 0.0257568359375, |
|
"learning_rate": 7.5322588297046e-07, |
|
"loss": 0.001, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.21251529455184937, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 328 |
|
}, |
|
{ |
|
"completion_length": 540.9285888671875, |
|
"epoch": 0.7014925373134329, |
|
"grad_norm": 0.17828156054019928, |
|
"grpo_loss": 0.00013902665523346514, |
|
"kl": 0.02587890625, |
|
"learning_rate": 7.43562109153666e-07, |
|
"loss": 0.001, |
|
"reward": 0.8125000596046448, |
|
"reward_std": 0.14746850728988647, |
|
"rewards/accuracy_reward": 0.8125000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 329 |
|
}, |
|
{ |
|
"completion_length": 651.9910888671875, |
|
"epoch": 0.7036247334754797, |
|
"grad_norm": 0.1514289230108261, |
|
"grpo_loss": 0.05768703296780586, |
|
"kl": 0.028564453125, |
|
"learning_rate": 7.339402577344863e-07, |
|
"loss": 0.0011, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.2360483705997467, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 330 |
|
}, |
|
{ |
|
"completion_length": 638.544677734375, |
|
"epoch": 0.7057569296375267, |
|
"grad_norm": 0.16231295466423035, |
|
"grpo_loss": 0.05624303221702576, |
|
"kl": 0.044921875, |
|
"learning_rate": 7.243608619636828e-07, |
|
"loss": 0.0018, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.29307880997657776, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 331 |
|
}, |
|
{ |
|
"completion_length": 593.3303833007812, |
|
"epoch": 0.7078891257995735, |
|
"grad_norm": 0.1985163390636444, |
|
"grpo_loss": 0.001253962516784668, |
|
"kl": 0.03271484375, |
|
"learning_rate": 7.14824452739089e-07, |
|
"loss": 0.0013, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.3433248698711395, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 332 |
|
}, |
|
{ |
|
"completion_length": 563.4642944335938, |
|
"epoch": 0.7100213219616205, |
|
"grad_norm": 0.1781807243824005, |
|
"grpo_loss": -0.04701165482401848, |
|
"kl": 0.0274658203125, |
|
"learning_rate": 7.053315585761911e-07, |
|
"loss": 0.0011, |
|
"reward": 0.723214328289032, |
|
"reward_std": 0.2794203460216522, |
|
"rewards/accuracy_reward": 0.723214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 333 |
|
}, |
|
{ |
|
"completion_length": 621.7142944335938, |
|
"epoch": 0.7121535181236673, |
|
"grad_norm": 0.16407182812690735, |
|
"grpo_loss": 0.00021706148982048035, |
|
"kl": 0.0255126953125, |
|
"learning_rate": 6.958827055788351e-07, |
|
"loss": 0.001, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.3127382695674896, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 334 |
|
}, |
|
{ |
|
"completion_length": 651.419677734375, |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.1288588047027588, |
|
"grpo_loss": 0.0014822334051132202, |
|
"kl": 0.044921875, |
|
"learning_rate": 6.864784174100703e-07, |
|
"loss": 0.0018, |
|
"reward": 0.5089285969734192, |
|
"reward_std": 0.1820184290409088, |
|
"rewards/accuracy_reward": 0.5089285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 335 |
|
}, |
|
{ |
|
"completion_length": 654.6964721679688, |
|
"epoch": 0.7164179104477612, |
|
"grad_norm": 0.14913132786750793, |
|
"grpo_loss": 0.00022607296705245972, |
|
"kl": 0.0203857421875, |
|
"learning_rate": 6.771192152631274e-07, |
|
"loss": 0.0008, |
|
"reward": 0.5714285969734192, |
|
"reward_std": 0.21251530945301056, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 336 |
|
}, |
|
{ |
|
"completion_length": 635.4910888671875, |
|
"epoch": 0.7185501066098081, |
|
"grad_norm": 0.20662419497966766, |
|
"grpo_loss": -0.04704027250409126, |
|
"kl": 0.02880859375, |
|
"learning_rate": 6.678056178325319e-07, |
|
"loss": 0.0012, |
|
"reward": 0.535714328289032, |
|
"reward_std": 0.30004262924194336, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 337 |
|
}, |
|
{ |
|
"completion_length": 680.0, |
|
"epoch": 0.720682302771855, |
|
"grad_norm": 0.1413746476173401, |
|
"grpo_loss": -0.11560186743736267, |
|
"kl": 0.0296630859375, |
|
"learning_rate": 6.585381412853612e-07, |
|
"loss": 0.0012, |
|
"reward": 0.5892857313156128, |
|
"reward_std": 0.3177541494369507, |
|
"rewards/accuracy_reward": 0.5892857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 338 |
|
}, |
|
{ |
|
"completion_length": 607.1339721679688, |
|
"epoch": 0.7228144989339019, |
|
"grad_norm": 0.21040762960910797, |
|
"grpo_loss": 0.0747302919626236, |
|
"kl": 0.0380859375, |
|
"learning_rate": 6.49317299232635e-07, |
|
"loss": 0.0015, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.3954068720340729, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 339 |
|
}, |
|
{ |
|
"completion_length": 647.3214721679688, |
|
"epoch": 0.7249466950959488, |
|
"grad_norm": 0.1362115442752838, |
|
"grpo_loss": -0.01653050072491169, |
|
"kl": 0.0269775390625, |
|
"learning_rate": 6.401436027008529e-07, |
|
"loss": 0.0011, |
|
"reward": 0.7321428656578064, |
|
"reward_std": 0.2684035003185272, |
|
"rewards/accuracy_reward": 0.7321428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 340 |
|
}, |
|
{ |
|
"completion_length": 732.0089721679688, |
|
"epoch": 0.7270788912579957, |
|
"grad_norm": 0.13052456080913544, |
|
"grpo_loss": 0.0003261137753725052, |
|
"kl": 0.018798828125, |
|
"learning_rate": 6.310175601036695e-07, |
|
"loss": 0.0007, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.23613807559013367, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 341 |
|
}, |
|
{ |
|
"completion_length": 587.8839721679688, |
|
"epoch": 0.7292110874200426, |
|
"grad_norm": 0.20251357555389404, |
|
"grpo_loss": 0.001199830323457718, |
|
"kl": 0.047607421875, |
|
"learning_rate": 6.219396772137218e-07, |
|
"loss": 0.0019, |
|
"reward": 0.6339285969734192, |
|
"reward_std": 0.2833836078643799, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 342 |
|
}, |
|
{ |
|
"completion_length": 658.625, |
|
"epoch": 0.7313432835820896, |
|
"grad_norm": 0.19635452330112457, |
|
"grpo_loss": 9.168311953544617e-05, |
|
"kl": 0.0289306640625, |
|
"learning_rate": 6.129104571345968e-07, |
|
"loss": 0.0012, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.1986774057149887, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 343 |
|
}, |
|
{ |
|
"completion_length": 741.2767944335938, |
|
"epoch": 0.7334754797441365, |
|
"grad_norm": 0.11911910772323608, |
|
"grpo_loss": -0.01411936804652214, |
|
"kl": 0.021240234375, |
|
"learning_rate": 6.039304002729494e-07, |
|
"loss": 0.0009, |
|
"reward": 0.4464285969734192, |
|
"reward_std": 0.1750546246767044, |
|
"rewards/accuracy_reward": 0.4464285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 344 |
|
}, |
|
{ |
|
"completion_length": 647.1517944335938, |
|
"epoch": 0.7356076759061834, |
|
"grad_norm": 0.15977312624454498, |
|
"grpo_loss": 0.005686178803443909, |
|
"kl": 0.058349609375, |
|
"learning_rate": 5.950000043107681e-07, |
|
"loss": 0.0023, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.259671151638031, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 345 |
|
}, |
|
{ |
|
"completion_length": 570.1339721679688, |
|
"epoch": 0.7377398720682303, |
|
"grad_norm": 0.1959923654794693, |
|
"grpo_loss": 0.002989797620102763, |
|
"kl": 0.052490234375, |
|
"learning_rate": 5.861197641777953e-07, |
|
"loss": 0.0021, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.22626349329948425, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 346 |
|
}, |
|
{ |
|
"completion_length": 546.8482666015625, |
|
"epoch": 0.7398720682302772, |
|
"grad_norm": 0.15611131489276886, |
|
"grpo_loss": 0.13409610092639923, |
|
"kl": 0.035400390625, |
|
"learning_rate": 5.772901720240971e-07, |
|
"loss": 0.0014, |
|
"reward": 0.7767857313156128, |
|
"reward_std": 0.2389591485261917, |
|
"rewards/accuracy_reward": 0.7767857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 347 |
|
}, |
|
{ |
|
"completion_length": 677.794677734375, |
|
"epoch": 0.7420042643923241, |
|
"grad_norm": 0.11709578335285187, |
|
"grpo_loss": 0.1003992035984993, |
|
"kl": 0.0177001953125, |
|
"learning_rate": 5.685117171927879e-07, |
|
"loss": 0.0007, |
|
"reward": 0.5, |
|
"reward_std": 0.20555149018764496, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"step": 348 |
|
}, |
|
{ |
|
"completion_length": 671.2142944335938, |
|
"epoch": 0.744136460554371, |
|
"grad_norm": 0.14730335772037506, |
|
"grpo_loss": 0.0018871352076530457, |
|
"kl": 0.044677734375, |
|
"learning_rate": 5.597848861929091e-07, |
|
"loss": 0.0018, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.21542608737945557, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 349 |
|
}, |
|
{ |
|
"completion_length": 588.6785888671875, |
|
"epoch": 0.746268656716418, |
|
"grad_norm": 0.4064622223377228, |
|
"grpo_loss": 0.00033608078956604004, |
|
"kl": 0.044189453125, |
|
"learning_rate": 5.511101626724693e-07, |
|
"loss": 0.0018, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.3206648826599121, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 350 |
|
}, |
|
{ |
|
"completion_length": 570.4375, |
|
"epoch": 0.7484008528784648, |
|
"grad_norm": 0.1400219202041626, |
|
"grpo_loss": 0.05647098273038864, |
|
"kl": 0.02197265625, |
|
"learning_rate": 5.42488027391638e-07, |
|
"loss": 0.0009, |
|
"reward": 0.6339285969734192, |
|
"reward_std": 0.2331376075744629, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 351 |
|
}, |
|
{ |
|
"completion_length": 632.4732666015625, |
|
"epoch": 0.7505330490405118, |
|
"grad_norm": 0.19489286839962006, |
|
"grpo_loss": -0.046175189316272736, |
|
"kl": 0.05419921875, |
|
"learning_rate": 5.339189581961024e-07, |
|
"loss": 0.0022, |
|
"reward": 0.6875000596046448, |
|
"reward_std": 0.3264864683151245, |
|
"rewards/accuracy_reward": 0.6875000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 352 |
|
}, |
|
{ |
|
"completion_length": 650.6160888671875, |
|
"epoch": 0.7526652452025586, |
|
"grad_norm": 0.1552044153213501, |
|
"grpo_loss": 0.12257284671068192, |
|
"kl": 0.0279541015625, |
|
"learning_rate": 5.254034299905825e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.24883373081684113, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 353 |
|
}, |
|
{ |
|
"completion_length": 590.6428833007812, |
|
"epoch": 0.7547974413646056, |
|
"grad_norm": 0.1753339320421219, |
|
"grpo_loss": 0.11830225586891174, |
|
"kl": 0.03076171875, |
|
"learning_rate": 5.169419147125176e-07, |
|
"loss": 0.0012, |
|
"reward": 0.7946428656578064, |
|
"reward_std": 0.23613807559013367, |
|
"rewards/accuracy_reward": 0.7946428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 354 |
|
}, |
|
{ |
|
"completion_length": 533.4553833007812, |
|
"epoch": 0.7569296375266524, |
|
"grad_norm": 0.23187799751758575, |
|
"grpo_loss": 0.0014726445078849792, |
|
"kl": 0.04736328125, |
|
"learning_rate": 5.085348813059028e-07, |
|
"loss": 0.0019, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.25279709696769714, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 355 |
|
}, |
|
{ |
|
"completion_length": 637.8392944335938, |
|
"epoch": 0.7590618336886994, |
|
"grad_norm": 0.1692523956298828, |
|
"grpo_loss": 0.11937975883483887, |
|
"kl": 0.01556396484375, |
|
"learning_rate": 5.001827956953066e-07, |
|
"loss": 0.0006, |
|
"reward": 0.535714328289032, |
|
"reward_std": 0.30004262924194336, |
|
"rewards/accuracy_reward": 0.535714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 356 |
|
}, |
|
{ |
|
"completion_length": 628.4642944335938, |
|
"epoch": 0.7611940298507462, |
|
"grad_norm": 0.15542620420455933, |
|
"grpo_loss": 0.0015178961912170053, |
|
"kl": 0.05419921875, |
|
"learning_rate": 4.918861207600445e-07, |
|
"loss": 0.0022, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.20564121007919312, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 357 |
|
}, |
|
{ |
|
"completion_length": 535.4642944335938, |
|
"epoch": 0.7633262260127932, |
|
"grad_norm": 0.1944689154624939, |
|
"grpo_loss": 0.1363382488489151, |
|
"kl": 0.054443359375, |
|
"learning_rate": 4.8364531630853e-07, |
|
"loss": 0.0022, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.31670159101486206, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 358 |
|
}, |
|
{ |
|
"completion_length": 563.6607666015625, |
|
"epoch": 0.7654584221748401, |
|
"grad_norm": 0.17013201117515564, |
|
"grpo_loss": 0.0014447793364524841, |
|
"kl": 0.041015625, |
|
"learning_rate": 4.754608390527869e-07, |
|
"loss": 0.0016, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.16130642592906952, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 359 |
|
}, |
|
{ |
|
"completion_length": 588.7142944335938, |
|
"epoch": 0.767590618336887, |
|
"grad_norm": 0.26594096422195435, |
|
"grpo_loss": 0.01679377630352974, |
|
"kl": 0.035888671875, |
|
"learning_rate": 4.6733314258314206e-07, |
|
"loss": 0.0014, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.31379079818725586, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 360 |
|
}, |
|
{ |
|
"completion_length": 599.4464721679688, |
|
"epoch": 0.7697228144989339, |
|
"grad_norm": 0.7181060314178467, |
|
"grpo_loss": 0.00020268559455871582, |
|
"kl": 0.0322265625, |
|
"learning_rate": 4.59262677343085e-07, |
|
"loss": 0.0013, |
|
"reward": 0.5625, |
|
"reward_std": 0.25976085662841797, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.0, |
|
"step": 361 |
|
}, |
|
{ |
|
"completion_length": 578.7678833007812, |
|
"epoch": 0.7718550106609808, |
|
"grad_norm": 0.1754969209432602, |
|
"grpo_loss": 0.05639899522066116, |
|
"kl": 0.053955078125, |
|
"learning_rate": 4.512498906043046e-07, |
|
"loss": 0.0022, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.2902577519416809, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 362 |
|
}, |
|
{ |
|
"completion_length": 676.5178833007812, |
|
"epoch": 0.7739872068230277, |
|
"grad_norm": 0.1559028923511505, |
|
"grpo_loss": -0.05473152920603752, |
|
"kl": 0.04443359375, |
|
"learning_rate": 4.432952264418995e-07, |
|
"loss": 0.0018, |
|
"reward": 0.5, |
|
"reward_std": 0.2193893939256668, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"step": 363 |
|
}, |
|
{ |
|
"completion_length": 670.5267944335938, |
|
"epoch": 0.7761194029850746, |
|
"grad_norm": 0.14586284756660461, |
|
"grpo_loss": 0.1182800680398941, |
|
"kl": 0.0235595703125, |
|
"learning_rate": 4.3539912570976967e-07, |
|
"loss": 0.0009, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.3098275065422058, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 364 |
|
}, |
|
{ |
|
"completion_length": 514.3125, |
|
"epoch": 0.7782515991471215, |
|
"grad_norm": 0.15664242208003998, |
|
"grpo_loss": 0.0003577885509002954, |
|
"kl": 0.03173828125, |
|
"learning_rate": 4.275620260161819e-07, |
|
"loss": 0.0013, |
|
"reward": 0.7589285969734192, |
|
"reward_std": 0.2891154885292053, |
|
"rewards/accuracy_reward": 0.7589285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 365 |
|
}, |
|
{ |
|
"completion_length": 647.982177734375, |
|
"epoch": 0.7803837953091685, |
|
"grad_norm": 0.15858404338359833, |
|
"grpo_loss": 0.05947595462203026, |
|
"kl": 0.052734375, |
|
"learning_rate": 4.1978436169951883e-07, |
|
"loss": 0.0021, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.16235896944999695, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 366 |
|
}, |
|
{ |
|
"completion_length": 689.1875610351562, |
|
"epoch": 0.7825159914712153, |
|
"grad_norm": 0.1707778126001358, |
|
"grpo_loss": -0.11788696050643921, |
|
"kl": 0.02734375, |
|
"learning_rate": 4.1206656380420467e-07, |
|
"loss": 0.0011, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.2902577519416809, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 367 |
|
}, |
|
{ |
|
"completion_length": 612.2053833007812, |
|
"epoch": 0.7846481876332623, |
|
"grad_norm": 0.1905473917722702, |
|
"grpo_loss": 0.05517677217721939, |
|
"kl": 0.038330078125, |
|
"learning_rate": 4.044090600568202e-07, |
|
"loss": 0.0015, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.33450281620025635, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 368 |
|
}, |
|
{ |
|
"completion_length": 597.8125, |
|
"epoch": 0.7867803837953091, |
|
"grad_norm": 0.2364969551563263, |
|
"grpo_loss": 0.00435774028301239, |
|
"kl": 0.054443359375, |
|
"learning_rate": 3.9681227484239497e-07, |
|
"loss": 0.0022, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.3264864683151245, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 369 |
|
}, |
|
{ |
|
"completion_length": 680.4910888671875, |
|
"epoch": 0.7889125799573561, |
|
"grad_norm": 0.11851251870393753, |
|
"grpo_loss": -0.05272572487592697, |
|
"kl": 0.0164794921875, |
|
"learning_rate": 3.892766291808893e-07, |
|
"loss": 0.0007, |
|
"reward": 0.5714285969734192, |
|
"reward_std": 0.26363450288772583, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 370 |
|
}, |
|
{ |
|
"completion_length": 576.919677734375, |
|
"epoch": 0.7910447761194029, |
|
"grad_norm": 0.14334093034267426, |
|
"grpo_loss": 0.04737274721264839, |
|
"kl": 0.031494140625, |
|
"learning_rate": 3.818025407038582e-07, |
|
"loss": 0.0013, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.3069167137145996, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 371 |
|
}, |
|
{ |
|
"completion_length": 675.6160888671875, |
|
"epoch": 0.7931769722814499, |
|
"grad_norm": 0.16912008821964264, |
|
"grpo_loss": -0.04576794058084488, |
|
"kl": 0.03173828125, |
|
"learning_rate": 3.74390423631311e-07, |
|
"loss": 0.0013, |
|
"reward": 0.4732142984867096, |
|
"reward_std": 0.27641984820365906, |
|
"rewards/accuracy_reward": 0.4732142984867096, |
|
"rewards/format_reward": 0.0, |
|
"step": 372 |
|
}, |
|
{ |
|
"completion_length": 625.6607666015625, |
|
"epoch": 0.7953091684434968, |
|
"grad_norm": 0.20823583006858826, |
|
"grpo_loss": -0.09997741132974625, |
|
"kl": 0.05322265625, |
|
"learning_rate": 3.6704068874874916e-07, |
|
"loss": 0.0021, |
|
"reward": 0.5625, |
|
"reward_std": 0.27641984820365906, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.0, |
|
"step": 373 |
|
}, |
|
{ |
|
"completion_length": 630.6517944335938, |
|
"epoch": 0.7974413646055437, |
|
"grad_norm": 0.28135645389556885, |
|
"grpo_loss": 0.004614211618900299, |
|
"kl": 0.0693359375, |
|
"learning_rate": 3.597537433844046e-07, |
|
"loss": 0.0028, |
|
"reward": 0.5267857313156128, |
|
"reward_std": 0.3029533922672272, |
|
"rewards/accuracy_reward": 0.5267857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 374 |
|
}, |
|
{ |
|
"completion_length": 537.8928833007812, |
|
"epoch": 0.7995735607675906, |
|
"grad_norm": 0.1784980595111847, |
|
"grpo_loss": 0.0028876028954982758, |
|
"kl": 0.03759765625, |
|
"learning_rate": 3.525299913866615e-07, |
|
"loss": 0.0015, |
|
"reward": 0.7500000596046448, |
|
"reward_std": 0.1554848700761795, |
|
"rewards/accuracy_reward": 0.7500000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 375 |
|
}, |
|
{ |
|
"completion_length": 590.2142944335938, |
|
"epoch": 0.8017057569296375, |
|
"grad_norm": 0.24804876744747162, |
|
"grpo_loss": 0.12203040719032288, |
|
"kl": 0.04443359375, |
|
"learning_rate": 3.4536983310167975e-07, |
|
"loss": 0.0018, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.357073038816452, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 376 |
|
}, |
|
{ |
|
"completion_length": 588.1785888671875, |
|
"epoch": 0.8038379530916845, |
|
"grad_norm": 0.23009361326694489, |
|
"grpo_loss": -0.07306977361440659, |
|
"kl": 0.04443359375, |
|
"learning_rate": 3.382736653512016e-07, |
|
"loss": 0.0018, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.26267164945602417, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 377 |
|
}, |
|
{ |
|
"completion_length": 628.3125, |
|
"epoch": 0.8059701492537313, |
|
"grad_norm": 0.13790591061115265, |
|
"grpo_loss": 0.0014116030652076006, |
|
"kl": 0.0240478515625, |
|
"learning_rate": 3.312418814105638e-07, |
|
"loss": 0.001, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.1820184290409088, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 378 |
|
}, |
|
{ |
|
"completion_length": 552.4553833007812, |
|
"epoch": 0.8081023454157783, |
|
"grad_norm": 0.18349257111549377, |
|
"grpo_loss": -0.09748789668083191, |
|
"kl": 0.04052734375, |
|
"learning_rate": 3.242748709868999e-07, |
|
"loss": 0.0016, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.2822414040565491, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 379 |
|
}, |
|
{ |
|
"completion_length": 673.5535888671875, |
|
"epoch": 0.8102345415778252, |
|
"grad_norm": 0.17732153832912445, |
|
"grpo_loss": 0.0002803046954795718, |
|
"kl": 0.0341796875, |
|
"learning_rate": 3.173730201975439e-07, |
|
"loss": 0.0014, |
|
"reward": 0.3839285969734192, |
|
"reward_std": 0.27641984820365906, |
|
"rewards/accuracy_reward": 0.3839285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 380 |
|
}, |
|
{ |
|
"completion_length": 615.857177734375, |
|
"epoch": 0.8123667377398721, |
|
"grad_norm": 0.22036485373973846, |
|
"grpo_loss": 0.07462641596794128, |
|
"kl": 0.032958984375, |
|
"learning_rate": 3.1053671154862833e-07, |
|
"loss": 0.0013, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.2873469889163971, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 381 |
|
}, |
|
{ |
|
"completion_length": 657.3392944335938, |
|
"epoch": 0.814498933901919, |
|
"grad_norm": 0.14535248279571533, |
|
"grpo_loss": -0.10004384815692902, |
|
"kl": 0.021728515625, |
|
"learning_rate": 3.037663239138895e-07, |
|
"loss": 0.0009, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.2792409062385559, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 382 |
|
}, |
|
{ |
|
"completion_length": 566.0803833007812, |
|
"epoch": 0.8166311300639659, |
|
"grad_norm": 0.2126779854297638, |
|
"grpo_loss": 0.002174288034439087, |
|
"kl": 0.05517578125, |
|
"learning_rate": 2.970622325136669e-07, |
|
"loss": 0.0022, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.22626349329948425, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 383 |
|
}, |
|
{ |
|
"completion_length": 602.5178833007812, |
|
"epoch": 0.8187633262260128, |
|
"grad_norm": 0.3173239827156067, |
|
"grpo_loss": -0.13343751430511475, |
|
"kl": 0.03759765625, |
|
"learning_rate": 2.904248088941102e-07, |
|
"loss": 0.0015, |
|
"reward": 0.7500000596046448, |
|
"reward_std": 0.2862047255039215, |
|
"rewards/accuracy_reward": 0.7500000596046448, |
|
"rewards/format_reward": 0.0, |
|
"step": 384 |
|
}, |
|
{ |
|
"completion_length": 625.3035888671875, |
|
"epoch": 0.8208955223880597, |
|
"grad_norm": 0.13514848053455353, |
|
"grpo_loss": 0.0001852833229349926, |
|
"kl": 0.0230712890625, |
|
"learning_rate": 2.8385442090658554e-07, |
|
"loss": 0.0009, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.1692330688238144, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 385 |
|
}, |
|
{ |
|
"completion_length": 539.857177734375, |
|
"epoch": 0.8230277185501066, |
|
"grad_norm": 0.21709002554416656, |
|
"grpo_loss": -0.04592399671673775, |
|
"kl": 0.04248046875, |
|
"learning_rate": 2.773514326872911e-07, |
|
"loss": 0.0017, |
|
"reward": 0.7589285969734192, |
|
"reward_std": 0.22926399111747742, |
|
"rewards/accuracy_reward": 0.7589285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 386 |
|
}, |
|
{ |
|
"completion_length": 547.044677734375, |
|
"epoch": 0.8251599147121536, |
|
"grad_norm": 0.20077502727508545, |
|
"grpo_loss": 0.0027071647346019745, |
|
"kl": 0.036865234375, |
|
"learning_rate": 2.7091620463707565e-07, |
|
"loss": 0.0015, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.27527761459350586, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 387 |
|
}, |
|
{ |
|
"completion_length": 675.4732666015625, |
|
"epoch": 0.8272921108742004, |
|
"grad_norm": 0.23532140254974365, |
|
"grpo_loss": -0.047013476490974426, |
|
"kl": 0.042724609375, |
|
"learning_rate": 2.6454909340146526e-07, |
|
"loss": 0.0017, |
|
"reward": 0.6339285969734192, |
|
"reward_std": 0.27641984820365906, |
|
"rewards/accuracy_reward": 0.6339285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 388 |
|
}, |
|
{ |
|
"completion_length": 579.1339721679688, |
|
"epoch": 0.8294243070362474, |
|
"grad_norm": 0.17333650588989258, |
|
"grpo_loss": 0.0011966601014137268, |
|
"kl": 0.02001953125, |
|
"learning_rate": 2.5825045185089576e-07, |
|
"loss": 0.0008, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.27236682176589966, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 389 |
|
}, |
|
{ |
|
"completion_length": 663.6428833007812, |
|
"epoch": 0.8315565031982942, |
|
"grad_norm": 0.15946130454540253, |
|
"grpo_loss": 0.11816628277301788, |
|
"kl": 0.0296630859375, |
|
"learning_rate": 2.5202062906115886e-07, |
|
"loss": 0.0012, |
|
"reward": 0.5625, |
|
"reward_std": 0.3236653804779053, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.0, |
|
"step": 390 |
|
}, |
|
{ |
|
"completion_length": 727.0982666015625, |
|
"epoch": 0.8336886993603412, |
|
"grad_norm": 0.1320721060037613, |
|
"grpo_loss": -0.04703760892152786, |
|
"kl": 0.0279541015625, |
|
"learning_rate": 2.458599702940551e-07, |
|
"loss": 0.0011, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.3098274767398834, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 391 |
|
}, |
|
{ |
|
"completion_length": 661.625, |
|
"epoch": 0.835820895522388, |
|
"grad_norm": 0.21743802726268768, |
|
"grpo_loss": -0.0698922798037529, |
|
"kl": 0.039794921875, |
|
"learning_rate": 2.3976881697825945e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.25685009360313416, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 392 |
|
}, |
|
{ |
|
"completion_length": 641.2053833007812, |
|
"epoch": 0.837953091684435, |
|
"grad_norm": 3.930511236190796, |
|
"grpo_loss": -0.07228344678878784, |
|
"kl": 0.030517578125, |
|
"learning_rate": 2.337475066903973e-07, |
|
"loss": 0.0012, |
|
"reward": 0.625, |
|
"reward_std": 0.3098274767398834, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 393 |
|
}, |
|
{ |
|
"completion_length": 551.8303833007812, |
|
"epoch": 0.8400852878464818, |
|
"grad_norm": 0.19481515884399414, |
|
"grpo_loss": 0.00013812871475238353, |
|
"kl": 0.019287109375, |
|
"learning_rate": 2.2779637313633867e-07, |
|
"loss": 0.0008, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.21251529455184937, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 394 |
|
}, |
|
{ |
|
"completion_length": 632.1964721679688, |
|
"epoch": 0.8422174840085288, |
|
"grad_norm": 0.28915032744407654, |
|
"grpo_loss": -0.04566855728626251, |
|
"kl": 0.04443359375, |
|
"learning_rate": 2.2191574613270176e-07, |
|
"loss": 0.0018, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.27933061122894287, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 395 |
|
}, |
|
{ |
|
"completion_length": 733.4732666015625, |
|
"epoch": 0.8443496801705757, |
|
"grad_norm": 0.258179247379303, |
|
"grpo_loss": -0.11782819032669067, |
|
"kl": 0.0400390625, |
|
"learning_rate": 2.16105951588575e-07, |
|
"loss": 0.0016, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.316791296005249, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 396 |
|
}, |
|
{ |
|
"completion_length": 607.3839721679688, |
|
"epoch": 0.8464818763326226, |
|
"grad_norm": 0.3129705488681793, |
|
"grpo_loss": -0.045982833951711655, |
|
"kl": 0.052001953125, |
|
"learning_rate": 2.103673114874552e-07, |
|
"loss": 0.0021, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.27536728978157043, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 397 |
|
}, |
|
{ |
|
"completion_length": 571.7589721679688, |
|
"epoch": 0.8486140724946695, |
|
"grad_norm": 0.6058499217033386, |
|
"grpo_loss": 0.05869365483522415, |
|
"kl": 0.046142578125, |
|
"learning_rate": 2.047001438694015e-07, |
|
"loss": 0.0018, |
|
"reward": 0.7678571939468384, |
|
"reward_std": 0.36782076954841614, |
|
"rewards/accuracy_reward": 0.7678571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 398 |
|
}, |
|
{ |
|
"completion_length": 669.75, |
|
"epoch": 0.8507462686567164, |
|
"grad_norm": 0.19755227863788605, |
|
"grpo_loss": -0.1000501960515976, |
|
"kl": 0.031005859375, |
|
"learning_rate": 1.9910476281341155e-07, |
|
"loss": 0.0012, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.24988628923892975, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 399 |
|
}, |
|
{ |
|
"completion_length": 650.6964721679688, |
|
"epoch": 0.8528784648187633, |
|
"grad_norm": 0.24487632513046265, |
|
"grpo_loss": 0.0018350593745708466, |
|
"kl": 0.0625, |
|
"learning_rate": 1.9358147842001328e-07, |
|
"loss": 0.0025, |
|
"reward": 0.5178571939468384, |
|
"reward_std": 0.1514318436384201, |
|
"rewards/accuracy_reward": 0.5178571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8528784648187633, |
|
"eval_completion_length": 611.5137771752695, |
|
"eval_grpo_loss": 0.002802959766321497, |
|
"eval_kl": 0.049074215629992014, |
|
"eval_loss": 0.0019418156007304788, |
|
"eval_reward": 0.5530294624570841, |
|
"eval_reward_std": 0.28017472720945985, |
|
"eval_rewards/accuracy_reward": 0.5530294624570841, |
|
"eval_rewards/format_reward": 0.0, |
|
"eval_runtime": 4037.4288, |
|
"eval_samples_per_second": 1.238, |
|
"eval_steps_per_second": 0.011, |
|
"step": 400 |
|
}, |
|
{ |
|
"completion_length": 519.6964721679688, |
|
"epoch": 0.8550106609808102, |
|
"grad_norm": 0.25118526816368103, |
|
"grpo_loss": -0.04701920598745346, |
|
"kl": 0.04052734375, |
|
"learning_rate": 1.8813059679408012e-07, |
|
"loss": 0.0016, |
|
"reward": 0.785714328289032, |
|
"reward_std": 0.24592295289039612, |
|
"rewards/accuracy_reward": 0.785714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 401 |
|
}, |
|
{ |
|
"completion_length": 615.7232666015625, |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 0.2910488545894623, |
|
"grpo_loss": 0.00017983283032663167, |
|
"kl": 0.0478515625, |
|
"learning_rate": 1.827524200278648e-07, |
|
"loss": 0.0019, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.18889254331588745, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 402 |
|
}, |
|
{ |
|
"completion_length": 614.8035888671875, |
|
"epoch": 0.8592750533049041, |
|
"grad_norm": 0.31290826201438904, |
|
"grpo_loss": -0.045705024152994156, |
|
"kl": 0.02880859375, |
|
"learning_rate": 1.7744724618425868e-07, |
|
"loss": 0.0012, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.3277184069156647, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 403 |
|
}, |
|
{ |
|
"completion_length": 629.3928833007812, |
|
"epoch": 0.8614072494669509, |
|
"grad_norm": 0.35251688957214355, |
|
"grpo_loss": -0.11518432199954987, |
|
"kl": 0.04443359375, |
|
"learning_rate": 1.7221536928027232e-07, |
|
"loss": 0.0018, |
|
"reward": 0.625, |
|
"reward_std": 0.3334502577781677, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 404 |
|
}, |
|
{ |
|
"completion_length": 609.9375, |
|
"epoch": 0.8635394456289979, |
|
"grad_norm": 0.4341069757938385, |
|
"grpo_loss": -0.046183593571186066, |
|
"kl": 0.03466796875, |
|
"learning_rate": 1.6705707927074104e-07, |
|
"loss": 0.0014, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.3264864683151245, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 405 |
|
}, |
|
{ |
|
"completion_length": 611.6517944335938, |
|
"epoch": 0.8656716417910447, |
|
"grad_norm": 0.4119378626346588, |
|
"grpo_loss": -0.013815686106681824, |
|
"kl": 0.03759765625, |
|
"learning_rate": 1.6197266203225441e-07, |
|
"loss": 0.0015, |
|
"reward": 0.5625, |
|
"reward_std": 0.3127382695674896, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.0, |
|
"step": 406 |
|
}, |
|
{ |
|
"completion_length": 590.1875, |
|
"epoch": 0.8678038379530917, |
|
"grad_norm": 0.3130354583263397, |
|
"grpo_loss": 0.0482063889503479, |
|
"kl": 0.041748046875, |
|
"learning_rate": 1.5696239934731394e-07, |
|
"loss": 0.0017, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.3235756754875183, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 407 |
|
}, |
|
{ |
|
"completion_length": 489.95538330078125, |
|
"epoch": 0.8699360341151386, |
|
"grad_norm": 1.4019256830215454, |
|
"grpo_loss": 0.0030681490898132324, |
|
"kl": 0.07373046875, |
|
"learning_rate": 1.520265688887162e-07, |
|
"loss": 0.0029, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.3737320303916931, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 408 |
|
}, |
|
{ |
|
"completion_length": 634.6785888671875, |
|
"epoch": 0.8720682302771855, |
|
"grad_norm": 0.7867122888565063, |
|
"grpo_loss": 0.002787616103887558, |
|
"kl": 0.049560546875, |
|
"learning_rate": 1.4716544420416366e-07, |
|
"loss": 0.002, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.3265761733055115, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 409 |
|
}, |
|
{ |
|
"completion_length": 695.1964721679688, |
|
"epoch": 0.8742004264392325, |
|
"grad_norm": 0.35332122445106506, |
|
"grpo_loss": 0.0004137594369240105, |
|
"kl": 0.032470703125, |
|
"learning_rate": 1.4237929470110406e-07, |
|
"loss": 0.0013, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.26267164945602417, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 410 |
|
}, |
|
{ |
|
"completion_length": 672.4285888671875, |
|
"epoch": 0.8763326226012793, |
|
"grad_norm": 0.5385404229164124, |
|
"grpo_loss": 0.07336309552192688, |
|
"kl": 0.042236328125, |
|
"learning_rate": 1.3766838563180117e-07, |
|
"loss": 0.0017, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.31379079818725586, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 411 |
|
}, |
|
{ |
|
"completion_length": 591.0267944335938, |
|
"epoch": 0.8784648187633263, |
|
"grad_norm": 1.787886619567871, |
|
"grpo_loss": -0.053161606192588806, |
|
"kl": 0.2265625, |
|
"learning_rate": 1.330329780786324e-07, |
|
"loss": 0.0091, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.3177541494369507, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 412 |
|
}, |
|
{ |
|
"completion_length": 596.6339721679688, |
|
"epoch": 0.8805970149253731, |
|
"grad_norm": 0.8436120748519897, |
|
"grpo_loss": 0.01977485604584217, |
|
"kl": 0.045654296875, |
|
"learning_rate": 1.2847332893962078e-07, |
|
"loss": 0.0018, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.2862047255039215, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 413 |
|
}, |
|
{ |
|
"completion_length": 618.9464721679688, |
|
"epoch": 0.8827292110874201, |
|
"grad_norm": 0.6383178234100342, |
|
"grpo_loss": 0.0014204904437065125, |
|
"kl": 0.054931640625, |
|
"learning_rate": 1.23989690914196e-07, |
|
"loss": 0.0022, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.2724565267562866, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 414 |
|
}, |
|
{ |
|
"completion_length": 668.4732666015625, |
|
"epoch": 0.8848614072494669, |
|
"grad_norm": 0.4944930076599121, |
|
"grpo_loss": 0.0009383819997310638, |
|
"kl": 0.04052734375, |
|
"learning_rate": 1.1958231248919232e-07, |
|
"loss": 0.0016, |
|
"reward": 0.5446428656578064, |
|
"reward_std": 0.3519674837589264, |
|
"rewards/accuracy_reward": 0.5446428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 415 |
|
}, |
|
{ |
|
"completion_length": 652.8035888671875, |
|
"epoch": 0.8869936034115139, |
|
"grad_norm": 0.739689826965332, |
|
"grpo_loss": 0.001788435154594481, |
|
"kl": 0.05615234375, |
|
"learning_rate": 1.1525143792507397e-07, |
|
"loss": 0.0022, |
|
"reward": 0.625, |
|
"reward_std": 0.24988627433776855, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 416 |
|
}, |
|
{ |
|
"completion_length": 621.5357666015625, |
|
"epoch": 0.8891257995735607, |
|
"grad_norm": 1.2304091453552246, |
|
"grpo_loss": 0.0009033530950546265, |
|
"kl": 0.1025390625, |
|
"learning_rate": 1.1099730724240037e-07, |
|
"loss": 0.0041, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.2695457339286804, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 417 |
|
}, |
|
{ |
|
"completion_length": 579.1964721679688, |
|
"epoch": 0.8912579957356077, |
|
"grad_norm": 1.6807209253311157, |
|
"grpo_loss": 0.0036621836479753256, |
|
"kl": 0.07275390625, |
|
"learning_rate": 1.0682015620852214e-07, |
|
"loss": 0.0029, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.2666349411010742, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 418 |
|
}, |
|
{ |
|
"completion_length": 513.3928833007812, |
|
"epoch": 0.8933901918976546, |
|
"grad_norm": 7.738749027252197, |
|
"grpo_loss": 0.12100201845169067, |
|
"kl": 0.470703125, |
|
"learning_rate": 1.0272021632451745e-07, |
|
"loss": 0.0188, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.30004262924194336, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 419 |
|
}, |
|
{ |
|
"completion_length": 633.0892944335938, |
|
"epoch": 0.8955223880597015, |
|
"grad_norm": 0.5201921463012695, |
|
"grpo_loss": 0.12260356545448303, |
|
"kl": 0.046142578125, |
|
"learning_rate": 9.869771481235851e-08, |
|
"loss": 0.0019, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.24883373081684113, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 420 |
|
}, |
|
{ |
|
"completion_length": 627.919677734375, |
|
"epoch": 0.8976545842217484, |
|
"grad_norm": 1.1365468502044678, |
|
"grpo_loss": 0.12431085109710693, |
|
"kl": 0.054443359375, |
|
"learning_rate": 9.475287460232174e-08, |
|
"loss": 0.0022, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.2832939326763153, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 421 |
|
}, |
|
{ |
|
"completion_length": 687.1428833007812, |
|
"epoch": 0.8997867803837953, |
|
"grad_norm": 0.8904653191566467, |
|
"grpo_loss": 0.0030726641416549683, |
|
"kl": 0.07666015625, |
|
"learning_rate": 9.088591432063109e-08, |
|
"loss": 0.0031, |
|
"reward": 0.4464285969734192, |
|
"reward_std": 0.24883374571800232, |
|
"rewards/accuracy_reward": 0.4464285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 422 |
|
}, |
|
{ |
|
"completion_length": 702.982177734375, |
|
"epoch": 0.9019189765458422, |
|
"grad_norm": 0.5287430882453918, |
|
"grpo_loss": 0.0010234508663415909, |
|
"kl": 0.03564453125, |
|
"learning_rate": 8.709704827734244e-08, |
|
"loss": 0.0014, |
|
"reward": 0.4732142984867096, |
|
"reward_std": 0.21251530945301056, |
|
"rewards/accuracy_reward": 0.4732142984867096, |
|
"rewards/format_reward": 0.0, |
|
"step": 423 |
|
}, |
|
{ |
|
"completion_length": 582.6607666015625, |
|
"epoch": 0.9040511727078892, |
|
"grad_norm": 1.1305034160614014, |
|
"grpo_loss": 0.0023384217638522387, |
|
"kl": 0.0654296875, |
|
"learning_rate": 8.33864864544654e-08, |
|
"loss": 0.0026, |
|
"reward": 0.660714328289032, |
|
"reward_std": 0.2695457339286804, |
|
"rewards/accuracy_reward": 0.660714328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 424 |
|
}, |
|
{ |
|
"completion_length": 522.7589721679688, |
|
"epoch": 0.906183368869936, |
|
"grad_norm": 1.2338292598724365, |
|
"grpo_loss": 0.0016110788565129042, |
|
"kl": 0.08740234375, |
|
"learning_rate": 7.975443449432712e-08, |
|
"loss": 0.0035, |
|
"reward": 0.7410714626312256, |
|
"reward_std": 0.23904886841773987, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 425 |
|
}, |
|
{ |
|
"completion_length": 662.3392944335938, |
|
"epoch": 0.908315565031983, |
|
"grad_norm": 1.3673124313354492, |
|
"grpo_loss": 0.004744287580251694, |
|
"kl": 0.06884765625, |
|
"learning_rate": 7.620109368817479e-08, |
|
"loss": 0.0028, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.3069167137145996, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 426 |
|
}, |
|
{ |
|
"completion_length": 576.375, |
|
"epoch": 0.9104477611940298, |
|
"grad_norm": 1.6284408569335938, |
|
"grpo_loss": -0.09632082283496857, |
|
"kl": 0.0859375, |
|
"learning_rate": 7.272666096502017e-08, |
|
"loss": 0.0034, |
|
"reward": 0.5625, |
|
"reward_std": 0.35116177797317505, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.0, |
|
"step": 427 |
|
}, |
|
{ |
|
"completion_length": 515.625, |
|
"epoch": 0.9125799573560768, |
|
"grad_norm": 1.218694806098938, |
|
"grpo_loss": 0.0009350869804620743, |
|
"kl": 0.0625, |
|
"learning_rate": 6.933132888072452e-08, |
|
"loss": 0.0025, |
|
"reward": 0.7321428656578064, |
|
"reward_std": 0.29713186621665955, |
|
"rewards/accuracy_reward": 0.7321428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 428 |
|
}, |
|
{ |
|
"completion_length": 610.9642944335938, |
|
"epoch": 0.9147121535181236, |
|
"grad_norm": 1.6818090677261353, |
|
"grpo_loss": 0.002560252556577325, |
|
"kl": 0.0849609375, |
|
"learning_rate": 6.601528560732978e-08, |
|
"loss": 0.0034, |
|
"reward": 0.7142857313156128, |
|
"reward_std": 0.3040059804916382, |
|
"rewards/accuracy_reward": 0.7142857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 429 |
|
}, |
|
{ |
|
"completion_length": 561.5714721679688, |
|
"epoch": 0.9168443496801706, |
|
"grad_norm": 1.855384349822998, |
|
"grpo_loss": 0.005668744444847107, |
|
"kl": 0.10302734375, |
|
"learning_rate": 6.277871492262593e-08, |
|
"loss": 0.0041, |
|
"reward": 0.7410714626312256, |
|
"reward_std": 0.36685794591903687, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 430 |
|
}, |
|
{ |
|
"completion_length": 650.4285888671875, |
|
"epoch": 0.9189765458422174, |
|
"grad_norm": 1.2501542568206787, |
|
"grpo_loss": -0.07116611301898956, |
|
"kl": 0.049560546875, |
|
"learning_rate": 5.962179619996966e-08, |
|
"loss": 0.002, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.2862047255039215, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 431 |
|
}, |
|
{ |
|
"completion_length": 606.2410888671875, |
|
"epoch": 0.9211087420042644, |
|
"grad_norm": 1.4896941184997559, |
|
"grpo_loss": -0.014261804521083832, |
|
"kl": 0.1328125, |
|
"learning_rate": 5.654470439834058e-08, |
|
"loss": 0.0053, |
|
"reward": 0.6696428656578064, |
|
"reward_std": 0.29016801714897156, |
|
"rewards/accuracy_reward": 0.6696428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 432 |
|
}, |
|
{ |
|
"completion_length": 596.0625, |
|
"epoch": 0.9232409381663113, |
|
"grad_norm": 1.6670781373977661, |
|
"grpo_loss": -0.0711505264043808, |
|
"kl": 0.126953125, |
|
"learning_rate": 5.3547610052647246e-08, |
|
"loss": 0.0051, |
|
"reward": 0.7321428656578064, |
|
"reward_std": 0.2695457339286804, |
|
"rewards/accuracy_reward": 0.7321428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 433 |
|
}, |
|
{ |
|
"completion_length": 752.544677734375, |
|
"epoch": 0.9253731343283582, |
|
"grad_norm": 1.245611310005188, |
|
"grpo_loss": 0.007793694734573364, |
|
"kl": 0.0771484375, |
|
"learning_rate": 5.063067926427345e-08, |
|
"loss": 0.0031, |
|
"reward": 0.4732142984867096, |
|
"reward_std": 0.3363610506057739, |
|
"rewards/accuracy_reward": 0.4732142984867096, |
|
"rewards/format_reward": 0.0, |
|
"step": 434 |
|
}, |
|
{ |
|
"completion_length": 634.5535888671875, |
|
"epoch": 0.9275053304904051, |
|
"grad_norm": 1.4781724214553833, |
|
"grpo_loss": 0.0024833083152770996, |
|
"kl": 0.07666015625, |
|
"learning_rate": 4.7794073691875e-08, |
|
"loss": 0.0031, |
|
"reward": 0.5714285969734192, |
|
"reward_std": 0.3098274767398834, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 435 |
|
}, |
|
{ |
|
"completion_length": 675.8392944335938, |
|
"epoch": 0.929637526652452, |
|
"grad_norm": 1.7764922380447388, |
|
"grpo_loss": 0.10629189014434814, |
|
"kl": 0.130859375, |
|
"learning_rate": 4.5037950542418327e-08, |
|
"loss": 0.0052, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.23208504915237427, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 436 |
|
}, |
|
{ |
|
"completion_length": 595.0267944335938, |
|
"epoch": 0.9317697228144989, |
|
"grad_norm": 0.9373055696487427, |
|
"grpo_loss": 0.0014045275747776031, |
|
"kl": 0.0751953125, |
|
"learning_rate": 4.236246256247084e-08, |
|
"loss": 0.003, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.30004262924194336, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 437 |
|
}, |
|
{ |
|
"completion_length": 622.2053833007812, |
|
"epoch": 0.9339019189765458, |
|
"grad_norm": 1.3948901891708374, |
|
"grpo_loss": 0.003673933446407318, |
|
"kl": 0.09716796875, |
|
"learning_rate": 3.976775802973232e-08, |
|
"loss": 0.0039, |
|
"reward": 0.6785714626312256, |
|
"reward_std": 0.3206649422645569, |
|
"rewards/accuracy_reward": 0.6785714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 438 |
|
}, |
|
{ |
|
"completion_length": 641.232177734375, |
|
"epoch": 0.9360341151385928, |
|
"grad_norm": 1.9123766422271729, |
|
"grpo_loss": -0.036110423505306244, |
|
"kl": 0.16015625, |
|
"learning_rate": 3.7253980744819335e-08, |
|
"loss": 0.0064, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.3265761733055115, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 439 |
|
}, |
|
{ |
|
"completion_length": 649.419677734375, |
|
"epoch": 0.9381663113006397, |
|
"grad_norm": 1.4534177780151367, |
|
"grpo_loss": 0.003873981535434723, |
|
"kl": 0.162109375, |
|
"learning_rate": 3.482127002329532e-08, |
|
"loss": 0.0065, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.27933064103126526, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 440 |
|
}, |
|
{ |
|
"completion_length": 701.0089721679688, |
|
"epoch": 0.9402985074626866, |
|
"grad_norm": 3.5775556564331055, |
|
"grpo_loss": 0.12740851938724518, |
|
"kl": 0.13671875, |
|
"learning_rate": 3.246976068794938e-08, |
|
"loss": 0.0055, |
|
"reward": 0.5089285969734192, |
|
"reward_std": 0.28338366746902466, |
|
"rewards/accuracy_reward": 0.5089285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 441 |
|
}, |
|
{ |
|
"completion_length": 630.2410888671875, |
|
"epoch": 0.9424307036247335, |
|
"grad_norm": 2.2547216415405273, |
|
"grpo_loss": 0.0050017498433589935, |
|
"kl": 0.2080078125, |
|
"learning_rate": 3.019958306132409e-08, |
|
"loss": 0.0083, |
|
"reward": 0.5, |
|
"reward_std": 0.40713968873023987, |
|
"rewards/accuracy_reward": 0.5, |
|
"rewards/format_reward": 0.0, |
|
"step": 442 |
|
}, |
|
{ |
|
"completion_length": 545.8839721679688, |
|
"epoch": 0.9445628997867804, |
|
"grad_norm": 0.666123628616333, |
|
"grpo_loss": 0.010975733399391174, |
|
"kl": 0.09765625, |
|
"learning_rate": 2.8010862958493143e-08, |
|
"loss": 0.0039, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.2153363674879074, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 443 |
|
}, |
|
{ |
|
"completion_length": 654.375, |
|
"epoch": 0.9466950959488273, |
|
"grad_norm": 4.539513111114502, |
|
"grpo_loss": 0.07739832252264023, |
|
"kl": 0.28125, |
|
"learning_rate": 2.5903721680089088e-08, |
|
"loss": 0.0112, |
|
"reward": 0.5535714626312256, |
|
"reward_std": 0.27933061122894287, |
|
"rewards/accuracy_reward": 0.5535714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 444 |
|
}, |
|
{ |
|
"completion_length": 606.5892944335938, |
|
"epoch": 0.9488272921108742, |
|
"grad_norm": 1.9015904664993286, |
|
"grpo_loss": -0.04408063739538193, |
|
"kl": 0.09375, |
|
"learning_rate": 2.3878276005580014e-08, |
|
"loss": 0.0037, |
|
"reward": 0.7767857313156128, |
|
"reward_std": 0.30004262924194336, |
|
"rewards/accuracy_reward": 0.7767857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 445 |
|
}, |
|
{ |
|
"completion_length": 636.732177734375, |
|
"epoch": 0.9509594882729211, |
|
"grad_norm": 0.7139691710472107, |
|
"grpo_loss": -0.09958329796791077, |
|
"kl": 0.0654296875, |
|
"learning_rate": 2.1934638186797418e-08, |
|
"loss": 0.0026, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.31670159101486206, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 446 |
|
}, |
|
{ |
|
"completion_length": 550.25, |
|
"epoch": 0.9530916844349681, |
|
"grad_norm": 0.9299170970916748, |
|
"grpo_loss": -0.0699908658862114, |
|
"kl": 0.1328125, |
|
"learning_rate": 2.0072915941716176e-08, |
|
"loss": 0.0053, |
|
"reward": 0.5714285969734192, |
|
"reward_std": 0.3235757052898407, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 447 |
|
}, |
|
{ |
|
"completion_length": 547.2857666015625, |
|
"epoch": 0.9552238805970149, |
|
"grad_norm": 1.3427575826644897, |
|
"grpo_loss": 0.0044424869120121, |
|
"kl": 0.0791015625, |
|
"learning_rate": 1.8293212448483476e-08, |
|
"loss": 0.0032, |
|
"reward": 0.6964285969734192, |
|
"reward_std": 0.24301216006278992, |
|
"rewards/accuracy_reward": 0.6964285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 448 |
|
}, |
|
{ |
|
"completion_length": 607.3392944335938, |
|
"epoch": 0.9573560767590619, |
|
"grad_norm": 1933856.75, |
|
"grpo_loss": 0.0023131519556045532, |
|
"kl": 27648.0, |
|
"learning_rate": 1.6595626339701407e-08, |
|
"loss": 1106.2889, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.27641984820365906, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 449 |
|
}, |
|
{ |
|
"completion_length": 599.5803833007812, |
|
"epoch": 0.9594882729211087, |
|
"grad_norm": 0.7657116055488586, |
|
"grpo_loss": 0.11946123838424683, |
|
"kl": 0.1064453125, |
|
"learning_rate": 1.498025169696049e-08, |
|
"loss": 0.0043, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.3098275065422058, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 450 |
|
}, |
|
{ |
|
"completion_length": 558.044677734375, |
|
"epoch": 0.9616204690831557, |
|
"grad_norm": 1.7871955633163452, |
|
"grpo_loss": 0.0009799189865589142, |
|
"kl": 0.10888671875, |
|
"learning_rate": 1.344717804562534e-08, |
|
"loss": 0.0044, |
|
"reward": 0.7053571939468384, |
|
"reward_std": 0.23904886841773987, |
|
"rewards/accuracy_reward": 0.7053571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 451 |
|
}, |
|
{ |
|
"completion_length": 568.357177734375, |
|
"epoch": 0.9637526652452025, |
|
"grad_norm": 93.76356506347656, |
|
"grpo_loss": 0.013948827981948853, |
|
"kl": 1.28125, |
|
"learning_rate": 1.1996490349873657e-08, |
|
"loss": 0.0513, |
|
"reward": 0.6428571939468384, |
|
"reward_std": 0.2695457339286804, |
|
"rewards/accuracy_reward": 0.6428571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 452 |
|
}, |
|
{ |
|
"completion_length": 672.2678833007812, |
|
"epoch": 0.9658848614072495, |
|
"grad_norm": 2.1468069553375244, |
|
"grpo_loss": -0.04484168440103531, |
|
"kl": 0.1865234375, |
|
"learning_rate": 1.062826900798647e-08, |
|
"loss": 0.0075, |
|
"reward": 0.5803571939468384, |
|
"reward_std": 0.3708212673664093, |
|
"rewards/accuracy_reward": 0.5803571939468384, |
|
"rewards/format_reward": 0.0, |
|
"step": 453 |
|
}, |
|
{ |
|
"completion_length": 598.8303833007812, |
|
"epoch": 0.9680170575692963, |
|
"grad_norm": 1.5067918300628662, |
|
"grpo_loss": -0.00438106432557106, |
|
"kl": 0.16796875, |
|
"learning_rate": 9.34258984789338e-09, |
|
"loss": 0.0067, |
|
"reward": 0.6160714626312256, |
|
"reward_std": 0.3363610506057739, |
|
"rewards/accuracy_reward": 0.6160714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 454 |
|
}, |
|
{ |
|
"completion_length": 666.1964721679688, |
|
"epoch": 0.9701492537313433, |
|
"grad_norm": 1.465742588043213, |
|
"grpo_loss": 0.009304620325565338, |
|
"kl": 0.1376953125, |
|
"learning_rate": 8.139524122970254e-09, |
|
"loss": 0.0055, |
|
"reward": 0.4285714626312256, |
|
"reward_std": 0.20264071226119995, |
|
"rewards/accuracy_reward": 0.4285714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 455 |
|
}, |
|
{ |
|
"completion_length": 611.8839721679688, |
|
"epoch": 0.9722814498933902, |
|
"grad_norm": 0.6134836077690125, |
|
"grpo_loss": -0.06908067315816879, |
|
"kl": 0.072265625, |
|
"learning_rate": 7.019138508088552e-09, |
|
"loss": 0.0029, |
|
"reward": 0.6517857313156128, |
|
"reward_std": 0.27641984820365906, |
|
"rewards/accuracy_reward": 0.6517857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 456 |
|
}, |
|
{ |
|
"completion_length": 609.9285888671875, |
|
"epoch": 0.9744136460554371, |
|
"grad_norm": 2.488274097442627, |
|
"grpo_loss": -0.09877821803092957, |
|
"kl": 0.125, |
|
"learning_rate": 5.98149509592244e-09, |
|
"loss": 0.005, |
|
"reward": 0.5267857313156128, |
|
"reward_std": 0.26945602893829346, |
|
"rewards/accuracy_reward": 0.5267857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 457 |
|
}, |
|
{ |
|
"completion_length": 549.5535888671875, |
|
"epoch": 0.976545842217484, |
|
"grad_norm": 26.11075210571289, |
|
"grpo_loss": 0.21839597821235657, |
|
"kl": 0.8359375, |
|
"learning_rate": 5.026651393506387e-09, |
|
"loss": 0.0335, |
|
"reward": 0.7410714626312256, |
|
"reward_std": 0.316791296005249, |
|
"rewards/accuracy_reward": 0.7410714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 458 |
|
}, |
|
{ |
|
"completion_length": 562.25, |
|
"epoch": 0.9786780383795309, |
|
"grad_norm": 5.394749164581299, |
|
"grpo_loss": 0.0021801290567964315, |
|
"kl": 0.3359375, |
|
"learning_rate": 4.154660319047543e-09, |
|
"loss": 0.0134, |
|
"reward": 0.723214328289032, |
|
"reward_std": 0.22926399111747742, |
|
"rewards/accuracy_reward": 0.723214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 459 |
|
}, |
|
{ |
|
"completion_length": 687.1517944335938, |
|
"epoch": 0.9808102345415778, |
|
"grad_norm": 5.675504207611084, |
|
"grpo_loss": 0.06382696330547333, |
|
"kl": 0.341796875, |
|
"learning_rate": 3.3655701989944164e-09, |
|
"loss": 0.0137, |
|
"reward": 0.4821428656578064, |
|
"reward_std": 0.26363447308540344, |
|
"rewards/accuracy_reward": 0.4821428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 460 |
|
}, |
|
{ |
|
"completion_length": 585.3125, |
|
"epoch": 0.9829424307036247, |
|
"grad_norm": 2.350153684616089, |
|
"grpo_loss": 0.0019125863909721375, |
|
"kl": 0.1630859375, |
|
"learning_rate": 2.659424765357521e-09, |
|
"loss": 0.0065, |
|
"reward": 0.598214328289032, |
|
"reward_std": 0.2331376075744629, |
|
"rewards/accuracy_reward": 0.598214328289032, |
|
"rewards/format_reward": 0.0, |
|
"step": 461 |
|
}, |
|
{ |
|
"completion_length": 660.732177734375, |
|
"epoch": 0.9850746268656716, |
|
"grad_norm": 5.494519233703613, |
|
"grpo_loss": -0.0505463182926178, |
|
"kl": 0.2412109375, |
|
"learning_rate": 2.0362631532856445e-09, |
|
"loss": 0.0096, |
|
"reward": 0.4910714626312256, |
|
"reward_std": 0.3402346670627594, |
|
"rewards/accuracy_reward": 0.4910714626312256, |
|
"rewards/format_reward": 0.0, |
|
"step": 462 |
|
}, |
|
{ |
|
"completion_length": 677.8928833007812, |
|
"epoch": 0.9872068230277186, |
|
"grad_norm": 1.2300056219100952, |
|
"grpo_loss": -0.049303088337183, |
|
"kl": 0.1044921875, |
|
"learning_rate": 1.4961198988979185e-09, |
|
"loss": 0.0042, |
|
"reward": 0.5892857313156128, |
|
"reward_std": 0.3501092493534088, |
|
"rewards/accuracy_reward": 0.5892857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 463 |
|
}, |
|
{ |
|
"completion_length": 562.107177734375, |
|
"epoch": 0.9893390191897654, |
|
"grad_norm": 1.0299087762832642, |
|
"grpo_loss": -0.09899073094129562, |
|
"kl": 0.12890625, |
|
"learning_rate": 1.0390249373686823e-09, |
|
"loss": 0.0052, |
|
"reward": 0.7589285969734192, |
|
"reward_std": 0.26945602893829346, |
|
"rewards/accuracy_reward": 0.7589285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 464 |
|
}, |
|
{ |
|
"completion_length": 614.982177734375, |
|
"epoch": 0.9914712153518124, |
|
"grad_norm": 1.0833293199539185, |
|
"grpo_loss": 0.006685294210910797, |
|
"kl": 0.123046875, |
|
"learning_rate": 6.650036012693095e-10, |
|
"loss": 0.0049, |
|
"reward": 0.5625, |
|
"reward_std": 0.2360483705997467, |
|
"rewards/accuracy_reward": 0.5625, |
|
"rewards/format_reward": 0.0, |
|
"step": 465 |
|
}, |
|
{ |
|
"completion_length": 575.0089721679688, |
|
"epoch": 0.9936034115138592, |
|
"grad_norm": 0.6512173414230347, |
|
"grpo_loss": 0.07838420569896698, |
|
"kl": 0.10595703125, |
|
"learning_rate": 3.7407661916349924e-10, |
|
"loss": 0.0042, |
|
"reward": 0.6071428656578064, |
|
"reward_std": 0.346056193113327, |
|
"rewards/accuracy_reward": 0.6071428656578064, |
|
"rewards/format_reward": 0.0, |
|
"step": 466 |
|
}, |
|
{ |
|
"completion_length": 612.419677734375, |
|
"epoch": 0.9957356076759062, |
|
"grad_norm": 3.9021387100219727, |
|
"grpo_loss": 0.020383043214678764, |
|
"kl": 0.228515625, |
|
"learning_rate": 1.6626011445919353e-10, |
|
"loss": 0.0091, |
|
"reward": 0.5892857313156128, |
|
"reward_std": 0.22917428612709045, |
|
"rewards/accuracy_reward": 0.5892857313156128, |
|
"rewards/format_reward": 0.0, |
|
"step": 467 |
|
}, |
|
{ |
|
"completion_length": 642.1160888671875, |
|
"epoch": 0.997867803837953, |
|
"grad_norm": 0.9386311769485474, |
|
"grpo_loss": 0.009107634425163269, |
|
"kl": 0.1259765625, |
|
"learning_rate": 4.156560451462621e-11, |
|
"loss": 0.005, |
|
"reward": 0.5714285969734192, |
|
"reward_std": 0.18483950197696686, |
|
"rewards/accuracy_reward": 0.5714285969734192, |
|
"rewards/format_reward": 0.0, |
|
"step": 468 |
|
}, |
|
{ |
|
"completion_length": 644.1759033203125, |
|
"epoch": 1.0, |
|
"grad_norm": 1.1123640537261963, |
|
"grpo_loss": 0.005602791905403137, |
|
"kl": 0.10107421875, |
|
"learning_rate": 0.0, |
|
"loss": 0.004, |
|
"reward": 0.625, |
|
"reward_std": 0.2999529242515564, |
|
"rewards/accuracy_reward": 0.625, |
|
"rewards/format_reward": 0.0, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 469, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 1.7067, |
|
"train_samples_per_second": 4394.419, |
|
"train_steps_per_second": 274.798 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 469, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|