Qwen2.5-1.5B-Open-R1-GRPO / trainer_state.json
saemin21's picture
Model save
9f4be26 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 283,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 387.71745681762695,
"epoch": 0.0035335689045936395,
"grad_norm": 0.7826879124381357,
"kl": 0.0,
"learning_rate": 6.896551724137931e-07,
"loss": 0.0,
"reward": 0.6354166809469461,
"reward_std": 0.4388374499976635,
"rewards/accuracy_reward": 0.16276042070239782,
"rewards/format_reward": 0.4726562611758709,
"step": 1
},
{
"completion_length": 405.3405055999756,
"epoch": 0.007067137809187279,
"grad_norm": 1.1927588816666794,
"kl": 0.0,
"learning_rate": 1.3793103448275862e-06,
"loss": 0.0,
"reward": 0.6035156473517418,
"reward_std": 0.4155316762626171,
"rewards/accuracy_reward": 0.1523437537252903,
"rewards/format_reward": 0.4511718861758709,
"step": 2
},
{
"completion_length": 409.28972244262695,
"epoch": 0.01060070671378092,
"grad_norm": 0.6905561253532118,
"kl": 0.00020521879196166992,
"learning_rate": 2.0689655172413796e-06,
"loss": 0.0,
"reward": 0.5996093954890966,
"reward_std": 0.4422223027795553,
"rewards/accuracy_reward": 0.15234375465661287,
"rewards/format_reward": 0.4472656361758709,
"step": 3
},
{
"completion_length": 410.4349060058594,
"epoch": 0.014134275618374558,
"grad_norm": 0.7884781255286112,
"kl": 0.00036644935607910156,
"learning_rate": 2.7586206896551725e-06,
"loss": 0.0,
"reward": 0.6054687723517418,
"reward_std": 0.4260980412364006,
"rewards/accuracy_reward": 0.14192708721384406,
"rewards/format_reward": 0.46354168467223644,
"step": 4
},
{
"completion_length": 346.99024391174316,
"epoch": 0.0176678445229682,
"grad_norm": 0.5273525890826313,
"kl": 0.0029783248901367188,
"learning_rate": 3.448275862068966e-06,
"loss": 0.0001,
"reward": 0.7408854402601719,
"reward_std": 0.40636622719466686,
"rewards/accuracy_reward": 0.11979167081881315,
"rewards/format_reward": 0.6210937686264515,
"step": 5
},
{
"completion_length": 325.3359498977661,
"epoch": 0.02120141342756184,
"grad_norm": 0.46274180489878297,
"kl": 0.01139068603515625,
"learning_rate": 4.137931034482759e-06,
"loss": 0.0005,
"reward": 0.7929687649011612,
"reward_std": 0.4467017278075218,
"rewards/accuracy_reward": 0.1158854195382446,
"rewards/format_reward": 0.6770833507180214,
"step": 6
},
{
"completion_length": 227.93359851837158,
"epoch": 0.024734982332155476,
"grad_norm": 53.42434025598496,
"kl": 1.0528564453125,
"learning_rate": 4.8275862068965525e-06,
"loss": 0.042,
"reward": 0.9694010689854622,
"reward_std": 0.24079907592386007,
"rewards/accuracy_reward": 0.06445312686264515,
"rewards/format_reward": 0.9049479439854622,
"step": 7
},
{
"completion_length": 202.09896278381348,
"epoch": 0.028268551236749116,
"grad_norm": 19.059740265959093,
"kl": 0.4888916015625,
"learning_rate": 5.517241379310345e-06,
"loss": 0.0195,
"reward": 0.9843750298023224,
"reward_std": 0.24569615349173546,
"rewards/accuracy_reward": 0.06380208441987634,
"rewards/format_reward": 0.9205729328095913,
"step": 8
},
{
"completion_length": 183.13867664337158,
"epoch": 0.03180212014134275,
"grad_norm": 1.481224853303734,
"kl": 0.0942840576171875,
"learning_rate": 6.206896551724138e-06,
"loss": 0.0038,
"reward": 1.023437526077032,
"reward_std": 0.236824631690979,
"rewards/accuracy_reward": 0.09309896151535213,
"rewards/format_reward": 0.9303385615348816,
"step": 9
},
{
"completion_length": 175.65234756469727,
"epoch": 0.0353356890459364,
"grad_norm": 0.5566746911844828,
"kl": 0.0570068359375,
"learning_rate": 6.896551724137932e-06,
"loss": 0.0023,
"reward": 1.0058594197034836,
"reward_std": 0.20571813452988863,
"rewards/accuracy_reward": 0.0716145855258219,
"rewards/format_reward": 0.934244804084301,
"step": 10
},
{
"completion_length": 173.4589900970459,
"epoch": 0.038869257950530034,
"grad_norm": 0.3466466544445655,
"kl": 0.0336151123046875,
"learning_rate": 7.586206896551724e-06,
"loss": 0.0013,
"reward": 1.0175781473517418,
"reward_std": 0.21056926436722279,
"rewards/accuracy_reward": 0.07291666802484542,
"rewards/format_reward": 0.9446614757180214,
"step": 11
},
{
"completion_length": 167.7506561279297,
"epoch": 0.04240282685512368,
"grad_norm": 0.31983345914206196,
"kl": 0.042755126953125,
"learning_rate": 8.275862068965518e-06,
"loss": 0.0017,
"reward": 1.0455729477107525,
"reward_std": 0.20765206310898066,
"rewards/accuracy_reward": 0.09375000174622983,
"rewards/format_reward": 0.9518229328095913,
"step": 12
},
{
"completion_length": 170.16797399520874,
"epoch": 0.045936395759717315,
"grad_norm": 0.36163723632628936,
"kl": 0.0521392822265625,
"learning_rate": 8.965517241379312e-06,
"loss": 0.0021,
"reward": 1.0390625335276127,
"reward_std": 0.22777050640434027,
"rewards/accuracy_reward": 0.09244792052777484,
"rewards/format_reward": 0.9466146044433117,
"step": 13
},
{
"completion_length": 158.57943153381348,
"epoch": 0.04946996466431095,
"grad_norm": 0.3520770926699356,
"kl": 0.0601348876953125,
"learning_rate": 9.655172413793105e-06,
"loss": 0.0024,
"reward": 1.0709635838866234,
"reward_std": 0.25874905101954937,
"rewards/accuracy_reward": 0.13020833674818277,
"rewards/format_reward": 0.9407552257180214,
"step": 14
},
{
"completion_length": 118.45377922058105,
"epoch": 0.053003533568904596,
"grad_norm": 0.35628917266654875,
"kl": 0.10369873046875,
"learning_rate": 1.0344827586206898e-05,
"loss": 0.0041,
"reward": 1.0696614794433117,
"reward_std": 0.21860592905431986,
"rewards/accuracy_reward": 0.11132812919095159,
"rewards/format_reward": 0.958333358168602,
"step": 15
},
{
"completion_length": 81.11849284172058,
"epoch": 0.05653710247349823,
"grad_norm": 1.1321068685550928,
"kl": 0.23651123046875,
"learning_rate": 1.103448275862069e-05,
"loss": 0.0095,
"reward": 1.0891927629709244,
"reward_std": 0.16596419550478458,
"rewards/accuracy_reward": 0.10677083791233599,
"rewards/format_reward": 0.9824218899011612,
"step": 16
},
{
"completion_length": 116.46028995513916,
"epoch": 0.06007067137809187,
"grad_norm": 0.4979444532314162,
"kl": 0.111053466796875,
"learning_rate": 1.1724137931034483e-05,
"loss": 0.0044,
"reward": 1.091796912252903,
"reward_std": 0.20691483141854405,
"rewards/accuracy_reward": 0.12630208651535213,
"rewards/format_reward": 0.9654948078095913,
"step": 17
},
{
"completion_length": 106.39062833786011,
"epoch": 0.0636042402826855,
"grad_norm": 0.39934788591997045,
"kl": 0.13848876953125,
"learning_rate": 1.2413793103448277e-05,
"loss": 0.0055,
"reward": 1.1243490055203438,
"reward_std": 0.23089734092354774,
"rewards/accuracy_reward": 0.15299479360692203,
"rewards/format_reward": 0.9713541902601719,
"step": 18
},
{
"completion_length": 116.17318058013916,
"epoch": 0.06713780918727916,
"grad_norm": 0.40747455110093217,
"kl": 0.15606689453125,
"learning_rate": 1.310344827586207e-05,
"loss": 0.0062,
"reward": 1.1321614980697632,
"reward_std": 0.23220197623595595,
"rewards/accuracy_reward": 0.15625000465661287,
"rewards/format_reward": 0.9759114794433117,
"step": 19
},
{
"completion_length": 109.37304925918579,
"epoch": 0.0706713780918728,
"grad_norm": 0.38360455679162203,
"kl": 0.2166748046875,
"learning_rate": 1.3793103448275863e-05,
"loss": 0.0087,
"reward": 1.1816406697034836,
"reward_std": 0.2594331307336688,
"rewards/accuracy_reward": 0.20182292396202683,
"rewards/format_reward": 0.9798177219927311,
"step": 20
},
{
"completion_length": 106.66992425918579,
"epoch": 0.07420494699646643,
"grad_norm": 0.4520249358225062,
"kl": 0.266357421875,
"learning_rate": 1.4482758620689657e-05,
"loss": 0.0107,
"reward": 1.1595052555203438,
"reward_std": 0.24440898094326258,
"rewards/accuracy_reward": 0.17773437732830644,
"rewards/format_reward": 0.9817708544433117,
"step": 21
},
{
"completion_length": 140.5937557220459,
"epoch": 0.07773851590106007,
"grad_norm": 1.4718976020696355,
"kl": 0.232421875,
"learning_rate": 1.5172413793103448e-05,
"loss": 0.0093,
"reward": 1.199869841337204,
"reward_std": 0.2511229431256652,
"rewards/accuracy_reward": 0.2174479211680591,
"rewards/format_reward": 0.9824219010770321,
"step": 22
},
{
"completion_length": 186.76562976837158,
"epoch": 0.0812720848056537,
"grad_norm": 0.9576903316083716,
"kl": 0.173370361328125,
"learning_rate": 1.586206896551724e-05,
"loss": 0.0069,
"reward": 1.2298177480697632,
"reward_std": 0.26364279724657536,
"rewards/accuracy_reward": 0.24869792629033327,
"rewards/format_reward": 0.9811198152601719,
"step": 23
},
{
"completion_length": 189.61133193969727,
"epoch": 0.08480565371024736,
"grad_norm": 0.3064098864277374,
"kl": 0.15606689453125,
"learning_rate": 1.6551724137931037e-05,
"loss": 0.0062,
"reward": 1.2526042088866234,
"reward_std": 0.27888874523341656,
"rewards/accuracy_reward": 0.28125000931322575,
"rewards/format_reward": 0.9713541828095913,
"step": 24
},
{
"completion_length": 179.1979217529297,
"epoch": 0.08833922261484099,
"grad_norm": 0.4424785145099648,
"kl": 0.168243408203125,
"learning_rate": 1.7241379310344828e-05,
"loss": 0.0067,
"reward": 1.189453162252903,
"reward_std": 0.2696635592728853,
"rewards/accuracy_reward": 0.22526042349636555,
"rewards/format_reward": 0.9641927294433117,
"step": 25
},
{
"completion_length": 220.5813865661621,
"epoch": 0.09187279151943463,
"grad_norm": 0.2927466463526939,
"kl": 0.117034912109375,
"learning_rate": 1.7931034482758623e-05,
"loss": 0.0047,
"reward": 1.221354216337204,
"reward_std": 0.28838597796857357,
"rewards/accuracy_reward": 0.25520834047347307,
"rewards/format_reward": 0.9661458507180214,
"step": 26
},
{
"completion_length": 218.9602918624878,
"epoch": 0.09540636042402827,
"grad_norm": 0.3147261189039069,
"kl": 0.1197509765625,
"learning_rate": 1.8620689655172415e-05,
"loss": 0.0048,
"reward": 1.2246094197034836,
"reward_std": 0.2890056548640132,
"rewards/accuracy_reward": 0.26236980129033327,
"rewards/format_reward": 0.962239608168602,
"step": 27
},
{
"completion_length": 192.85547256469727,
"epoch": 0.0989399293286219,
"grad_norm": 0.2815314878632684,
"kl": 0.147918701171875,
"learning_rate": 1.931034482758621e-05,
"loss": 0.0059,
"reward": 1.2167969048023224,
"reward_std": 0.29042986780405045,
"rewards/accuracy_reward": 0.2584635470993817,
"rewards/format_reward": 0.9583333544433117,
"step": 28
},
{
"completion_length": 178.49284267425537,
"epoch": 0.10247349823321555,
"grad_norm": 0.32426980772846276,
"kl": 0.1710205078125,
"learning_rate": 2e-05,
"loss": 0.0068,
"reward": 1.2174479514360428,
"reward_std": 0.28664571419358253,
"rewards/accuracy_reward": 0.26627604849636555,
"rewards/format_reward": 0.9511718936264515,
"step": 29
},
{
"completion_length": 132.87891054153442,
"epoch": 0.10600706713780919,
"grad_norm": 0.3462539007364067,
"kl": 0.24053955078125,
"learning_rate": 1.999923511388017e-05,
"loss": 0.0096,
"reward": 1.2102864906191826,
"reward_std": 0.2935393461957574,
"rewards/accuracy_reward": 0.2519531361758709,
"rewards/format_reward": 0.9583333544433117,
"step": 30
},
{
"completion_length": 104.59570646286011,
"epoch": 0.10954063604240283,
"grad_norm": 0.31757187279941806,
"kl": 0.29840087890625,
"learning_rate": 1.999694057253083e-05,
"loss": 0.0119,
"reward": 1.2070312798023224,
"reward_std": 0.2599523845128715,
"rewards/accuracy_reward": 0.24153646733611822,
"rewards/format_reward": 0.9654948115348816,
"step": 31
},
{
"completion_length": 122.79622745513916,
"epoch": 0.11307420494699646,
"grad_norm": 0.29516858001336377,
"kl": 0.289306640625,
"learning_rate": 1.9993116726964554e-05,
"loss": 0.0116,
"reward": 1.1855469048023224,
"reward_std": 0.2488250662572682,
"rewards/accuracy_reward": 0.2220052140764892,
"rewards/format_reward": 0.9635416865348816,
"step": 32
},
{
"completion_length": 169.53385829925537,
"epoch": 0.1166077738515901,
"grad_norm": 0.2854397155723977,
"kl": 0.218505859375,
"learning_rate": 1.9987764162142615e-05,
"loss": 0.0087,
"reward": 1.2135417088866234,
"reward_std": 0.27537838369607925,
"rewards/accuracy_reward": 0.25130209513008595,
"rewards/format_reward": 0.9622396044433117,
"step": 33
},
{
"completion_length": 219.41602230072021,
"epoch": 0.12014134275618374,
"grad_norm": 0.22474385235022273,
"kl": 0.1595458984375,
"learning_rate": 1.998088369688552e-05,
"loss": 0.0064,
"reward": 1.214843787252903,
"reward_std": 0.31507682241499424,
"rewards/accuracy_reward": 0.2766927136108279,
"rewards/format_reward": 0.9381510615348816,
"step": 34
},
{
"completion_length": 274.76563453674316,
"epoch": 0.12367491166077739,
"grad_norm": 1452525.1386107916,
"kl": 21248.131378173828,
"learning_rate": 1.9972476383747748e-05,
"loss": 851.4882,
"reward": 1.1894531697034836,
"reward_std": 0.3239498296752572,
"rewards/accuracy_reward": 0.2558593829162419,
"rewards/format_reward": 0.9335937686264515,
"step": 35
},
{
"completion_length": 315.4687614440918,
"epoch": 0.127208480565371,
"grad_norm": 4.918800245024653,
"kl": 0.221282958984375,
"learning_rate": 1.9962543508856722e-05,
"loss": 0.0088,
"reward": 1.212890662252903,
"reward_std": 0.3367620576173067,
"rewards/accuracy_reward": 0.29817709140479565,
"rewards/format_reward": 0.9147135615348816,
"step": 36
},
{
"completion_length": 308.2695417404175,
"epoch": 0.13074204946996468,
"grad_norm": 0.2301169380333895,
"kl": 0.126373291015625,
"learning_rate": 1.995108659171607e-05,
"loss": 0.0051,
"reward": 1.2324219271540642,
"reward_std": 0.31519000325351954,
"rewards/accuracy_reward": 0.28971354849636555,
"rewards/format_reward": 0.9427083544433117,
"step": 37
},
{
"completion_length": 287.9537887573242,
"epoch": 0.13427561837455831,
"grad_norm": 0.20139063038206165,
"kl": 0.11163330078125,
"learning_rate": 1.9938107384973165e-05,
"loss": 0.0045,
"reward": 1.2350260764360428,
"reward_std": 0.2919177133589983,
"rewards/accuracy_reward": 0.26888021547347307,
"rewards/format_reward": 0.966145858168602,
"step": 38
},
{
"completion_length": 288.69141578674316,
"epoch": 0.13780918727915195,
"grad_norm": 0.18498422915016208,
"kl": 0.1121826171875,
"learning_rate": 1.992360787415103e-05,
"loss": 0.0045,
"reward": 1.2382812947034836,
"reward_std": 0.2696660226210952,
"rewards/accuracy_reward": 0.270182297565043,
"rewards/format_reward": 0.9680989757180214,
"step": 39
},
{
"completion_length": 285.7558660507202,
"epoch": 0.1413427561837456,
"grad_norm": 0.18156169712553022,
"kl": 0.112762451171875,
"learning_rate": 1.9907590277344582e-05,
"loss": 0.0045,
"reward": 1.2604167014360428,
"reward_std": 0.23935140296816826,
"rewards/accuracy_reward": 0.2799479244276881,
"rewards/format_reward": 0.9804687649011612,
"step": 40
},
{
"completion_length": 274.0761842727661,
"epoch": 0.14487632508833923,
"grad_norm": 0.2080024626531214,
"kl": 0.1217041015625,
"learning_rate": 1.9890057044881308e-05,
"loss": 0.0049,
"reward": 1.2936198338866234,
"reward_std": 0.2932386351749301,
"rewards/accuracy_reward": 0.32031251210719347,
"rewards/format_reward": 0.9733073152601719,
"step": 41
},
{
"completion_length": 282.8776149749756,
"epoch": 0.14840989399293286,
"grad_norm": 0.23856185666588572,
"kl": 0.125213623046875,
"learning_rate": 1.9871010858946443e-05,
"loss": 0.005,
"reward": 1.2753906697034836,
"reward_std": 0.2589658652432263,
"rewards/accuracy_reward": 0.3033854253590107,
"rewards/format_reward": 0.972005233168602,
"step": 42
},
{
"completion_length": 296.82422733306885,
"epoch": 0.1519434628975265,
"grad_norm": 0.2858858078510366,
"kl": 0.140716552734375,
"learning_rate": 1.9850454633172632e-05,
"loss": 0.0056,
"reward": 1.265625037252903,
"reward_std": 0.3042640471830964,
"rewards/accuracy_reward": 0.3138020960614085,
"rewards/format_reward": 0.951822929084301,
"step": 43
},
{
"completion_length": 312.989595413208,
"epoch": 0.15547703180212014,
"grad_norm": 75.43924110056017,
"kl": 1.633636474609375,
"learning_rate": 1.982839151219424e-05,
"loss": 0.0654,
"reward": 1.2662760615348816,
"reward_std": 0.3203592775389552,
"rewards/accuracy_reward": 0.32617188431322575,
"rewards/format_reward": 0.9401041865348816,
"step": 44
},
{
"completion_length": 318.19271659851074,
"epoch": 0.15901060070671377,
"grad_norm": 4.839307387093032,
"kl": 0.45843505859375,
"learning_rate": 1.9804824871166254e-05,
"loss": 0.0183,
"reward": 1.2480469271540642,
"reward_std": 0.37619344517588615,
"rewards/accuracy_reward": 0.3489583432674408,
"rewards/format_reward": 0.8990885615348816,
"step": 45
},
{
"completion_length": 360.181001663208,
"epoch": 0.1625441696113074,
"grad_norm": 1.6956048738668972,
"kl": 0.27557373046875,
"learning_rate": 1.9779758315248006e-05,
"loss": 0.011,
"reward": 1.2226562947034836,
"reward_std": 0.3949108961969614,
"rewards/accuracy_reward": 0.3483073003590107,
"rewards/format_reward": 0.874348983168602,
"step": 46
},
{
"completion_length": 365.8092555999756,
"epoch": 0.16607773851590105,
"grad_norm": 201.83615153266913,
"kl": 12.43408203125,
"learning_rate": 1.975319567905163e-05,
"loss": 0.4973,
"reward": 1.1972656659781933,
"reward_std": 0.4045752976089716,
"rewards/accuracy_reward": 0.345703131519258,
"rewards/format_reward": 0.8515625186264515,
"step": 47
},
{
"completion_length": 342.6002674102783,
"epoch": 0.1696113074204947,
"grad_norm": 5.398450236245697,
"kl": 0.656494140625,
"learning_rate": 1.9725141026055473e-05,
"loss": 0.0263,
"reward": 1.2246094085276127,
"reward_std": 0.4386922810226679,
"rewards/accuracy_reward": 0.37304688431322575,
"rewards/format_reward": 0.8515625186264515,
"step": 48
},
{
"completion_length": 354.3912830352783,
"epoch": 0.17314487632508835,
"grad_norm": 6.492613210824359,
"kl": 0.2779541015625,
"learning_rate": 1.9695598647982467e-05,
"loss": 0.0111,
"reward": 1.1972656697034836,
"reward_std": 0.44870651699602604,
"rewards/accuracy_reward": 0.3587239682674408,
"rewards/format_reward": 0.8385416902601719,
"step": 49
},
{
"completion_length": 317.2200622558594,
"epoch": 0.17667844522968199,
"grad_norm": 6.9054290512934475,
"kl": 0.31793212890625,
"learning_rate": 1.9664573064143604e-05,
"loss": 0.0127,
"reward": 1.148437537252903,
"reward_std": 0.4271644949913025,
"rewards/accuracy_reward": 0.3033854244276881,
"rewards/format_reward": 0.8450521044433117,
"step": 50
},
{
"completion_length": 291.3737087249756,
"epoch": 0.18021201413427562,
"grad_norm": 1.295034459732409,
"kl": 0.384765625,
"learning_rate": 1.9632069020746574e-05,
"loss": 0.0154,
"reward": 1.1966146305203438,
"reward_std": 0.41263195499777794,
"rewards/accuracy_reward": 0.34179688338190317,
"rewards/format_reward": 0.8548177257180214,
"step": 51
},
{
"completion_length": 242.330735206604,
"epoch": 0.18374558303886926,
"grad_norm": 44.75372246484856,
"kl": 3.65264892578125,
"learning_rate": 1.9598091490169696e-05,
"loss": 0.1463,
"reward": 1.1809896305203438,
"reward_std": 0.3946582209318876,
"rewards/accuracy_reward": 0.31445313477888703,
"rewards/format_reward": 0.8665364757180214,
"step": 52
},
{
"completion_length": 196.8216199874878,
"epoch": 0.1872791519434629,
"grad_norm": 4.161042131383654,
"kl": 0.65478515625,
"learning_rate": 1.9562645670201278e-05,
"loss": 0.0262,
"reward": 1.2526041865348816,
"reward_std": 0.3336602235212922,
"rewards/accuracy_reward": 0.3313802145421505,
"rewards/format_reward": 0.9212239682674408,
"step": 53
},
{
"completion_length": 176.54818201065063,
"epoch": 0.19081272084805653,
"grad_norm": 0.2979202087235891,
"kl": 0.368896484375,
"learning_rate": 1.9525736983244458e-05,
"loss": 0.0148,
"reward": 1.2096354514360428,
"reward_std": 0.32884097658097744,
"rewards/accuracy_reward": 0.29687500884756446,
"rewards/format_reward": 0.912760429084301,
"step": 54
},
{
"completion_length": 166.145188331604,
"epoch": 0.19434628975265017,
"grad_norm": 1.099328042290411,
"kl": 0.488525390625,
"learning_rate": 1.948737107548771e-05,
"loss": 0.0195,
"reward": 1.263671912252903,
"reward_std": 0.3232028791680932,
"rewards/accuracy_reward": 0.34700521919876337,
"rewards/format_reward": 0.9166666865348816,
"step": 55
},
{
"completion_length": 201.4856834411621,
"epoch": 0.1978798586572438,
"grad_norm": 0.5237539826150962,
"kl": 0.37255859375,
"learning_rate": 1.94475538160411e-05,
"loss": 0.0149,
"reward": 1.2526042014360428,
"reward_std": 0.32452640403062105,
"rewards/accuracy_reward": 0.3457031324505806,
"rewards/format_reward": 0.906901054084301,
"step": 56
},
{
"completion_length": 223.5299530029297,
"epoch": 0.20141342756183744,
"grad_norm": 1.0789636117575958,
"kl": 0.4588623046875,
"learning_rate": 1.940629129603844e-05,
"loss": 0.0183,
"reward": 1.2089843973517418,
"reward_std": 0.367857669480145,
"rewards/accuracy_reward": 0.3307291744276881,
"rewards/format_reward": 0.8782552294433117,
"step": 57
},
{
"completion_length": 236.89389038085938,
"epoch": 0.2049469964664311,
"grad_norm": 0.9522159379621601,
"kl": 0.476318359375,
"learning_rate": 1.9363589827705494e-05,
"loss": 0.0191,
"reward": 1.227213591337204,
"reward_std": 0.39456650614738464,
"rewards/accuracy_reward": 0.34049480501562357,
"rewards/format_reward": 0.8867187611758709,
"step": 58
},
{
"completion_length": 218.1360740661621,
"epoch": 0.20848056537102475,
"grad_norm": 1.264236430534227,
"kl": 0.8280029296875,
"learning_rate": 1.9319455943394347e-05,
"loss": 0.0331,
"reward": 1.2571614980697632,
"reward_std": 0.3343982622027397,
"rewards/accuracy_reward": 0.35286459513008595,
"rewards/format_reward": 0.9042969010770321,
"step": 59
},
{
"completion_length": 185.6367244720459,
"epoch": 0.21201413427561838,
"grad_norm": 111.38995544077012,
"kl": 7.3544921875,
"learning_rate": 1.9273896394584103e-05,
"loss": 0.2946,
"reward": 1.272786483168602,
"reward_std": 0.3291959064081311,
"rewards/accuracy_reward": 0.35221355222165585,
"rewards/format_reward": 0.9205729365348816,
"step": 60
},
{
"completion_length": 197.7513074874878,
"epoch": 0.21554770318021202,
"grad_norm": 1.9921896449669312,
"kl": 0.7696533203125,
"learning_rate": 1.9226918150848067e-05,
"loss": 0.0308,
"reward": 1.2643229588866234,
"reward_std": 0.32842374220490456,
"rewards/accuracy_reward": 0.3457031324505806,
"rewards/format_reward": 0.9186198115348816,
"step": 61
},
{
"completion_length": 255.86784744262695,
"epoch": 0.21908127208480566,
"grad_norm": 2.1340639274317144,
"kl": 0.924560546875,
"learning_rate": 1.9178528398787553e-05,
"loss": 0.037,
"reward": 1.1725260838866234,
"reward_std": 0.4286086466163397,
"rewards/accuracy_reward": 0.31575521547347307,
"rewards/format_reward": 0.8567708544433117,
"step": 62
},
{
"completion_length": 175.7168025970459,
"epoch": 0.2226148409893993,
"grad_norm": 7.908467450030422,
"kl": 1.04931640625,
"learning_rate": 1.9128734540932494e-05,
"loss": 0.042,
"reward": 1.2838541865348816,
"reward_std": 0.31748174503445625,
"rewards/accuracy_reward": 0.34765625838190317,
"rewards/format_reward": 0.9361979365348816,
"step": 63
},
{
"completion_length": 145.36784315109253,
"epoch": 0.22614840989399293,
"grad_norm": 5066.629420414826,
"kl": 549.625,
"learning_rate": 1.907754419460904e-05,
"loss": 22.0063,
"reward": 1.3138021156191826,
"reward_std": 0.25937482714653015,
"rewards/accuracy_reward": 0.3567708423361182,
"rewards/format_reward": 0.9570312723517418,
"step": 64
},
{
"completion_length": 134.21419763565063,
"epoch": 0.22968197879858657,
"grad_norm": 73.69279102025739,
"kl": 7.2021484375,
"learning_rate": 1.9024965190774262e-05,
"loss": 0.2879,
"reward": 1.2675781548023224,
"reward_std": 0.2847044528461993,
"rewards/accuracy_reward": 0.32226563058793545,
"rewards/format_reward": 0.9453125260770321,
"step": 65
},
{
"completion_length": 295.8131628036499,
"epoch": 0.2332155477031802,
"grad_norm": 15.351766639697455,
"kl": 1.186279296875,
"learning_rate": 1.8971005572818213e-05,
"loss": 0.0474,
"reward": 1.1197917014360428,
"reward_std": 0.4257570914924145,
"rewards/accuracy_reward": 0.285807297565043,
"rewards/format_reward": 0.8339843899011612,
"step": 66
},
{
"completion_length": 483.8196773529053,
"epoch": 0.23674911660777384,
"grad_norm": 595.1397215741732,
"kl": 64.0,
"learning_rate": 1.8915673595333443e-05,
"loss": 2.5615,
"reward": 0.9531250223517418,
"reward_std": 0.5964761041104794,
"rewards/accuracy_reward": 0.30403646640479565,
"rewards/format_reward": 0.649088554084301,
"step": 67
},
{
"completion_length": 788.6002807617188,
"epoch": 0.24028268551236748,
"grad_norm": 50.44103352403159,
"kl": 3.42041015625,
"learning_rate": 1.8858977722852273e-05,
"loss": 0.1367,
"reward": 0.4303385578095913,
"reward_std": 0.5175576768815517,
"rewards/accuracy_reward": 0.17187500512227416,
"rewards/format_reward": 0.2584635503590107,
"step": 68
},
{
"completion_length": 929.8216323852539,
"epoch": 0.24381625441696114,
"grad_norm": 734.3438170967142,
"kl": 85.0,
"learning_rate": 1.8800926628551884e-05,
"loss": 3.3995,
"reward": 0.21744792256504297,
"reward_std": 0.338073399849236,
"rewards/accuracy_reward": 0.11653646267950535,
"rewards/format_reward": 0.1009114624466747,
"step": 69
},
{
"completion_length": 997.1341323852539,
"epoch": 0.24734982332155478,
"grad_norm": 30.37607254327561,
"kl": 5.88671875,
"learning_rate": 1.8741529192927528e-05,
"loss": 0.2356,
"reward": 0.125000003259629,
"reward_std": 0.22288852790370584,
"rewards/accuracy_reward": 0.09440104337409139,
"rewards/format_reward": 0.03059895901242271,
"step": 70
},
{
"completion_length": 1017.4121170043945,
"epoch": 0.2508833922261484,
"grad_norm": 27.33797696554464,
"kl": 1.1336669921875,
"learning_rate": 1.8680794502434018e-05,
"loss": 0.0453,
"reward": 0.09960937825962901,
"reward_std": 0.15996943740174174,
"rewards/accuracy_reward": 0.09895833628252149,
"rewards/format_reward": 0.0006510416860692203,
"step": 71
},
{
"completion_length": 1011.7773590087891,
"epoch": 0.254416961130742,
"grad_norm": 51.58164513519689,
"kl": 1.179931640625,
"learning_rate": 1.8618731848095706e-05,
"loss": 0.0472,
"reward": 0.12369791930541396,
"reward_std": 0.19708295073360205,
"rewards/accuracy_reward": 0.1223958358168602,
"rewards/format_reward": 0.0013020833721384406,
"step": 72
},
{
"completion_length": 1023.6119804382324,
"epoch": 0.2579505300353357,
"grad_norm": 0.3170159566701739,
"kl": 0.142608642578125,
"learning_rate": 1.855535072408516e-05,
"loss": 0.0057,
"reward": 0.11263021267950535,
"reward_std": 0.1862776312045753,
"rewards/accuracy_reward": 0.11197917093522847,
"rewards/format_reward": 0.0006510416860692203,
"step": 73
},
{
"completion_length": 1024.0,
"epoch": 0.26148409893992935,
"grad_norm": 0.4703496805090262,
"kl": 0.159820556640625,
"learning_rate": 1.849066082627079e-05,
"loss": 0.0064,
"reward": 0.10807291930541396,
"reward_std": 0.15622267639264464,
"rewards/accuracy_reward": 0.10807291930541396,
"rewards/format_reward": 0.0,
"step": 74
},
{
"completion_length": 1024.0,
"epoch": 0.26501766784452296,
"grad_norm": 0.17854669938905443,
"kl": 0.15557861328125,
"learning_rate": 1.8424672050733577e-05,
"loss": 0.0062,
"reward": 0.16731771267950535,
"reward_std": 0.2150044571608305,
"rewards/accuracy_reward": 0.16731771267950535,
"rewards/format_reward": 0.0,
"step": 75
},
{
"completion_length": 1024.0,
"epoch": 0.26855123674911663,
"grad_norm": 0.9729182998733601,
"kl": 0.2708740234375,
"learning_rate": 1.8357394492253216e-05,
"loss": 0.0108,
"reward": 0.20703125465661287,
"reward_std": 0.24680148623883724,
"rewards/accuracy_reward": 0.20703125465661287,
"rewards/format_reward": 0.0,
"step": 76
},
{
"completion_length": 1024.0,
"epoch": 0.27208480565371024,
"grad_norm": 0.10194934645252694,
"kl": 0.17828369140625,
"learning_rate": 1.8288838442763838e-05,
"loss": 0.0071,
"reward": 0.2675781324505806,
"reward_std": 0.27682292833924294,
"rewards/accuracy_reward": 0.2662760494276881,
"rewards/format_reward": 0.0013020833721384406,
"step": 77
},
{
"completion_length": 1024.0,
"epoch": 0.2756183745583039,
"grad_norm": 0.10819020058919213,
"kl": 0.18212890625,
"learning_rate": 1.8219014389779586e-05,
"loss": 0.0073,
"reward": 0.26497396547347307,
"reward_std": 0.25599073618650436,
"rewards/accuracy_reward": 0.2597656324505806,
"rewards/format_reward": 0.0052083334885537624,
"step": 78
},
{
"completion_length": 1024.0,
"epoch": 0.2791519434628975,
"grad_norm": 0.25970666338072285,
"kl": 0.196533203125,
"learning_rate": 1.8147933014790245e-05,
"loss": 0.0079,
"reward": 0.2988281287252903,
"reward_std": 0.28412702213972807,
"rewards/accuracy_reward": 0.2760416744276881,
"rewards/format_reward": 0.02278645895421505,
"step": 79
},
{
"completion_length": 1024.0,
"epoch": 0.2826855123674912,
"grad_norm": 0.7315926003388828,
"kl": 0.2489013671875,
"learning_rate": 1.8075605191627242e-05,
"loss": 0.01,
"reward": 0.401041679084301,
"reward_std": 0.3803202658891678,
"rewards/accuracy_reward": 0.2910156296566129,
"rewards/format_reward": 0.11002604523673654,
"step": 80
},
{
"completion_length": 1024.0,
"epoch": 0.2862190812720848,
"grad_norm": 10.638076717998826,
"kl": 1.59765625,
"learning_rate": 1.8002041984800173e-05,
"loss": 0.064,
"reward": 0.8561198078095913,
"reward_std": 0.5722472295165062,
"rewards/accuracy_reward": 0.29622396547347307,
"rewards/format_reward": 0.5598958469927311,
"step": 81
},
{
"completion_length": 1024.0,
"epoch": 0.28975265017667845,
"grad_norm": 1.8766038235299047,
"kl": 0.3572998046875,
"learning_rate": 1.792725464780421e-05,
"loss": 0.0143,
"reward": 0.9231771044433117,
"reward_std": 0.5282374154776335,
"rewards/accuracy_reward": 0.2630208423361182,
"rewards/format_reward": 0.6601562723517418,
"step": 82
},
{
"completion_length": 1024.0,
"epoch": 0.29328621908127206,
"grad_norm": 2.8486965993956006,
"kl": 0.9862060546875,
"learning_rate": 1.785125462139855e-05,
"loss": 0.0394,
"reward": 1.1699219197034836,
"reward_std": 0.3966878689825535,
"rewards/accuracy_reward": 0.2936198003590107,
"rewards/format_reward": 0.876302108168602,
"step": 83
},
{
"completion_length": 1024.0,
"epoch": 0.2968197879858657,
"grad_norm": 1.3612075349379993,
"kl": 0.6107177734375,
"learning_rate": 1.7774053531856258e-05,
"loss": 0.0244,
"reward": 1.2044271007180214,
"reward_std": 0.4200763385742903,
"rewards/accuracy_reward": 0.30989584047347307,
"rewards/format_reward": 0.8945312723517418,
"step": 84
},
{
"completion_length": 1024.0,
"epoch": 0.3003533568904594,
"grad_norm": 8.701071380759565,
"kl": 2.2080078125,
"learning_rate": 1.7695663189185703e-05,
"loss": 0.0883,
"reward": 1.208984412252903,
"reward_std": 0.36989905312657356,
"rewards/accuracy_reward": 0.2812500102445483,
"rewards/format_reward": 0.9277343936264515,
"step": 85
},
{
"completion_length": 1024.0,
"epoch": 0.303886925795053,
"grad_norm": 1.8019930642199862,
"kl": 1.42626953125,
"learning_rate": 1.7616095585323882e-05,
"loss": 0.0571,
"reward": 1.156901091337204,
"reward_std": 0.4002630840986967,
"rewards/accuracy_reward": 0.26041667349636555,
"rewards/format_reward": 0.8964843973517418,
"step": 86
},
{
"completion_length": 1024.0,
"epoch": 0.30742049469964666,
"grad_norm": 4.127300637009884,
"kl": 0.299072265625,
"learning_rate": 1.7535362892301953e-05,
"loss": 0.012,
"reward": 1.0872396118938923,
"reward_std": 0.4400251917541027,
"rewards/accuracy_reward": 0.23958334233611822,
"rewards/format_reward": 0.8476562760770321,
"step": 87
},
{
"completion_length": 1024.0,
"epoch": 0.31095406360424027,
"grad_norm": 4.667835148606083,
"kl": 0.3037109375,
"learning_rate": 1.745347746038319e-05,
"loss": 0.0122,
"reward": 1.0917969234287739,
"reward_std": 0.4730749297887087,
"rewards/accuracy_reward": 0.2747395886108279,
"rewards/format_reward": 0.8170573078095913,
"step": 88
},
{
"completion_length": 1024.0,
"epoch": 0.31448763250883394,
"grad_norm": 2.7630279757978706,
"kl": 0.873291015625,
"learning_rate": 1.737045181617364e-05,
"loss": 0.035,
"reward": 1.0364583693444729,
"reward_std": 0.5082622393965721,
"rewards/accuracy_reward": 0.2389322966337204,
"rewards/format_reward": 0.7975260578095913,
"step": 89
},
{
"completion_length": 1024.0,
"epoch": 0.31802120141342755,
"grad_norm": 27.960489910885407,
"kl": 5.583984375,
"learning_rate": 1.7286298660705877e-05,
"loss": 0.2233,
"reward": 1.0781250298023224,
"reward_std": 0.47590856440365314,
"rewards/accuracy_reward": 0.25000000838190317,
"rewards/format_reward": 0.8281250223517418,
"step": 90
},
{
"completion_length": 1024.0,
"epoch": 0.3215547703180212,
"grad_norm": 46.613552133923285,
"kl": 8.90234375,
"learning_rate": 1.7201030867496005e-05,
"loss": 0.3559,
"reward": 1.0358073264360428,
"reward_std": 0.4818594641983509,
"rewards/accuracy_reward": 0.2369791753590107,
"rewards/format_reward": 0.7988281399011612,
"step": 91
},
{
"completion_length": 1024.0,
"epoch": 0.3250883392226148,
"grad_norm": 28.615484872216385,
"kl": 5.740234375,
"learning_rate": 1.711466148057433e-05,
"loss": 0.2297,
"reward": 0.9694010727107525,
"reward_std": 0.5088351331651211,
"rewards/accuracy_reward": 0.22916667582467198,
"rewards/format_reward": 0.7402343861758709,
"step": 92
},
{
"completion_length": 1024.0,
"epoch": 0.3286219081272085,
"grad_norm": 5.615041558235327,
"kl": 2.17333984375,
"learning_rate": 1.7027203712489902e-05,
"loss": 0.0869,
"reward": 0.8919270969927311,
"reward_std": 0.5635814908891916,
"rewards/accuracy_reward": 0.26757813431322575,
"rewards/format_reward": 0.6243489757180214,
"step": 93
},
{
"completion_length": 1024.0,
"epoch": 0.3321554770318021,
"grad_norm": 8.244116658103383,
"kl": 0.3486328125,
"learning_rate": 1.6938670942289292e-05,
"loss": 0.0139,
"reward": 0.6302083544433117,
"reward_std": 0.5751823391765356,
"rewards/accuracy_reward": 0.22395833861082792,
"rewards/format_reward": 0.40625000931322575,
"step": 94
},
{
"completion_length": 1024.0,
"epoch": 0.33568904593639576,
"grad_norm": 6.524355268359978,
"kl": 0.24981689453125,
"learning_rate": 1.6849076713469914e-05,
"loss": 0.01,
"reward": 0.5638021007180214,
"reward_std": 0.5523576978594065,
"rewards/accuracy_reward": 0.23893229756504297,
"rewards/format_reward": 0.32486980222165585,
"step": 95
},
{
"completion_length": 1024.0,
"epoch": 0.3392226148409894,
"grad_norm": 7.953861555674408,
"kl": 0.3388671875,
"learning_rate": 1.6758434731908178e-05,
"loss": 0.0136,
"reward": 0.6627604402601719,
"reward_std": 0.5646504014730453,
"rewards/accuracy_reward": 0.27343750838190317,
"rewards/format_reward": 0.3893229253590107,
"step": 96
},
{
"completion_length": 1024.0,
"epoch": 0.34275618374558303,
"grad_norm": 9.117171774796663,
"kl": 0.66259765625,
"learning_rate": 1.6666758863762796e-05,
"loss": 0.0265,
"reward": 0.8255208507180214,
"reward_std": 0.5879920609295368,
"rewards/accuracy_reward": 0.281250006519258,
"rewards/format_reward": 0.5442708488553762,
"step": 97
},
{
"completion_length": 1024.0,
"epoch": 0.3462897526501767,
"grad_norm": 3.2064911207772133,
"kl": 0.98388671875,
"learning_rate": 1.657406313335358e-05,
"loss": 0.0394,
"reward": 0.9733073189854622,
"reward_std": 0.5437621138989925,
"rewards/accuracy_reward": 0.27604167629033327,
"rewards/format_reward": 0.6972656436264515,
"step": 98
},
{
"completion_length": 1024.0,
"epoch": 0.3498233215547703,
"grad_norm": 3.916153998441947,
"kl": 0.6441650390625,
"learning_rate": 1.6480361721016053e-05,
"loss": 0.0258,
"reward": 1.0598958618938923,
"reward_std": 0.46388070471584797,
"rewards/accuracy_reward": 0.2578125074505806,
"rewards/format_reward": 0.8020833507180214,
"step": 99
},
{
"completion_length": 1024.0,
"epoch": 0.35335689045936397,
"grad_norm": 1.527146735006838,
"kl": 0.5404052734375,
"learning_rate": 1.6385668960932143e-05,
"loss": 0.0216,
"reward": 1.104166690260172,
"reward_std": 0.4302889872342348,
"rewards/accuracy_reward": 0.2565104253590107,
"rewards/format_reward": 0.8476562686264515,
"step": 100
},
{
"completion_length": 1024.0,
"epoch": 0.3568904593639576,
"grad_norm": 0.650990035380561,
"kl": 0.4241943359375,
"learning_rate": 1.6289999338937427e-05,
"loss": 0.017,
"reward": 1.2311198338866234,
"reward_std": 0.36215772293508053,
"rewards/accuracy_reward": 0.3190104244276881,
"rewards/format_reward": 0.9121093973517418,
"step": 101
},
{
"completion_length": 1024.0,
"epoch": 0.36042402826855124,
"grad_norm": 2.309888696921956,
"kl": 0.6422119140625,
"learning_rate": 1.619336749030509e-05,
"loss": 0.0257,
"reward": 1.1972656697034836,
"reward_std": 0.33416645554825664,
"rewards/accuracy_reward": 0.27018230129033327,
"rewards/format_reward": 0.9270833507180214,
"step": 102
},
{
"completion_length": 1024.0,
"epoch": 0.36395759717314485,
"grad_norm": 4.444041311869189,
"kl": 0.896728515625,
"learning_rate": 1.609578819750708e-05,
"loss": 0.0359,
"reward": 1.0091146156191826,
"reward_std": 0.4832933880388737,
"rewards/accuracy_reward": 0.2298177145421505,
"rewards/format_reward": 0.7792968936264515,
"step": 103
},
{
"completion_length": 1024.0,
"epoch": 0.3674911660777385,
"grad_norm": 11.19029279075626,
"kl": 0.737548828125,
"learning_rate": 1.5997276387952733e-05,
"loss": 0.0295,
"reward": 0.9277344010770321,
"reward_std": 0.5210180301219225,
"rewards/accuracy_reward": 0.2356770890764892,
"rewards/format_reward": 0.6920573078095913,
"step": 104
},
{
"completion_length": 1024.0,
"epoch": 0.3710247349823322,
"grad_norm": 15.55734153622524,
"kl": 1.072021484375,
"learning_rate": 1.5897847131705194e-05,
"loss": 0.0429,
"reward": 0.9375000223517418,
"reward_std": 0.5307967625558376,
"rewards/accuracy_reward": 0.2506510494276881,
"rewards/format_reward": 0.6868489757180214,
"step": 105
},
{
"completion_length": 1024.0,
"epoch": 0.3745583038869258,
"grad_norm": 20.781177075399597,
"kl": 1.77783203125,
"learning_rate": 1.5797515639176077e-05,
"loss": 0.0711,
"reward": 0.9694010652601719,
"reward_std": 0.5016757268458605,
"rewards/accuracy_reward": 0.2220052145421505,
"rewards/format_reward": 0.7473958507180214,
"step": 106
},
{
"completion_length": 1024.0,
"epoch": 0.37809187279151946,
"grad_norm": 119.3128363246367,
"kl": 6.580078125,
"learning_rate": 1.5696297258798573e-05,
"loss": 0.2632,
"reward": 1.1093750447034836,
"reward_std": 0.44075607880949974,
"rewards/accuracy_reward": 0.25455730129033327,
"rewards/format_reward": 0.8548177294433117,
"step": 107
},
{
"completion_length": 1024.0,
"epoch": 0.38162544169611307,
"grad_norm": 6.7022300054671895,
"kl": 2.0478515625,
"learning_rate": 1.5594207474679533e-05,
"loss": 0.082,
"reward": 1.0462239943444729,
"reward_std": 0.45448117703199387,
"rewards/accuracy_reward": 0.22916667442768812,
"rewards/format_reward": 0.8170573115348816,
"step": 108
},
{
"completion_length": 1024.0,
"epoch": 0.38515901060070673,
"grad_norm": 62.28140494172677,
"kl": 3.994140625,
"learning_rate": 1.549126190423073e-05,
"loss": 0.1599,
"reward": 1.1243490017950535,
"reward_std": 0.39074354991316795,
"rewards/accuracy_reward": 0.2441406287252903,
"rewards/format_reward": 0.8802083507180214,
"step": 109
},
{
"completion_length": 1024.0,
"epoch": 0.38869257950530034,
"grad_norm": 4.965909615838145,
"kl": 0.9990234375,
"learning_rate": 1.5387476295779737e-05,
"loss": 0.04,
"reward": 1.065104205161333,
"reward_std": 0.423415495082736,
"rewards/accuracy_reward": 0.21809896687045693,
"rewards/format_reward": 0.8470052182674408,
"step": 110
},
{
"completion_length": 1024.0,
"epoch": 0.392226148409894,
"grad_norm": 3.528015251218169,
"kl": 0.8387451171875,
"learning_rate": 1.5282866526160837e-05,
"loss": 0.0335,
"reward": 1.0117187798023224,
"reward_std": 0.41450436040759087,
"rewards/accuracy_reward": 0.16471354756504297,
"rewards/format_reward": 0.847005233168602,
"step": 111
},
{
"completion_length": 1024.0,
"epoch": 0.3957597173144876,
"grad_norm": 1.3076932365383318,
"kl": 0.799560546875,
"learning_rate": 1.5177448598286182e-05,
"loss": 0.032,
"reward": 1.0097656548023224,
"reward_std": 0.3976512663066387,
"rewards/accuracy_reward": 0.1523437537252903,
"rewards/format_reward": 0.8574218936264515,
"step": 112
},
{
"completion_length": 1024.0,
"epoch": 0.3992932862190813,
"grad_norm": 3.6570709102528838,
"kl": 0.9739990234375,
"learning_rate": 1.5071238638697731e-05,
"loss": 0.0389,
"reward": 1.0397135764360428,
"reward_std": 0.38695196248590946,
"rewards/accuracy_reward": 0.18229167023673654,
"rewards/format_reward": 0.8574218899011612,
"step": 113
},
{
"completion_length": 1024.0,
"epoch": 0.4028268551236749,
"grad_norm": 303.94124396845274,
"kl": 42.03125,
"learning_rate": 1.4964252895100265e-05,
"loss": 1.6829,
"reward": 1.1425781548023224,
"reward_std": 0.3955598259344697,
"rewards/accuracy_reward": 0.24218750558793545,
"rewards/format_reward": 0.9003906473517418,
"step": 114
},
{
"completion_length": 1024.0,
"epoch": 0.40636042402826855,
"grad_norm": 68.76205878652742,
"kl": 10.7421875,
"learning_rate": 1.4856507733875837e-05,
"loss": 0.4297,
"reward": 1.1054687798023224,
"reward_std": 0.36262817680835724,
"rewards/accuracy_reward": 0.20052083814516664,
"rewards/format_reward": 0.9049479365348816,
"step": 115
},
{
"completion_length": 1024.0,
"epoch": 0.4098939929328622,
"grad_norm": 14.914598534678452,
"kl": 2.833984375,
"learning_rate": 1.4748019637580116e-05,
"loss": 0.1134,
"reward": 1.0625000447034836,
"reward_std": 0.4123513549566269,
"rewards/accuracy_reward": 0.19531250838190317,
"rewards/format_reward": 0.8671875149011612,
"step": 116
},
{
"completion_length": 1024.0,
"epoch": 0.4134275618374558,
"grad_norm": 5.452147867911957,
"kl": 1.337890625,
"learning_rate": 1.4638805202420896e-05,
"loss": 0.0535,
"reward": 0.9967448301613331,
"reward_std": 0.46121339313685894,
"rewards/accuracy_reward": 0.2018229216337204,
"rewards/format_reward": 0.7949219010770321,
"step": 117
},
{
"completion_length": 1024.0,
"epoch": 0.4169611307420495,
"grad_norm": 5.07384920327551,
"kl": 1.025634765625,
"learning_rate": 1.452888113571929e-05,
"loss": 0.041,
"reward": 0.9401041865348816,
"reward_std": 0.5634889136999846,
"rewards/accuracy_reward": 0.24739584233611822,
"rewards/format_reward": 0.6927083544433117,
"step": 118
},
{
"completion_length": 1024.0,
"epoch": 0.4204946996466431,
"grad_norm": 7.69200350154935,
"kl": 0.906005859375,
"learning_rate": 1.4418264253353869e-05,
"loss": 0.0362,
"reward": 0.821614608168602,
"reward_std": 0.5736533179879189,
"rewards/accuracy_reward": 0.2070312537252903,
"rewards/format_reward": 0.6145833544433117,
"step": 119
},
{
"completion_length": 1024.0,
"epoch": 0.42402826855123676,
"grad_norm": 20.808857934877054,
"kl": 1.56982421875,
"learning_rate": 1.4306971477188223e-05,
"loss": 0.0627,
"reward": 0.7656250223517418,
"reward_std": 0.5566702261567116,
"rewards/accuracy_reward": 0.17838542046956718,
"rewards/format_reward": 0.587239608168602,
"step": 120
},
{
"completion_length": 1024.0,
"epoch": 0.4275618374558304,
"grad_norm": 26.447237281301867,
"kl": 2.738037109375,
"learning_rate": 1.419501983248229e-05,
"loss": 0.1095,
"reward": 0.7792969048023224,
"reward_std": 0.5849619917571545,
"rewards/accuracy_reward": 0.1907552140764892,
"rewards/format_reward": 0.5885416846722364,
"step": 121
},
{
"completion_length": 1024.0,
"epoch": 0.43109540636042404,
"grad_norm": 22.987609528434852,
"kl": 1.94189453125,
"learning_rate": 1.4082426445287904e-05,
"loss": 0.0775,
"reward": 0.8574218973517418,
"reward_std": 0.5550148580223322,
"rewards/accuracy_reward": 0.19661458861082792,
"rewards/format_reward": 0.660807304084301,
"step": 122
},
{
"completion_length": 1024.0,
"epoch": 0.43462897526501765,
"grad_norm": 7.848483482793478,
"kl": 1.8486328125,
"learning_rate": 1.3969208539828873e-05,
"loss": 0.074,
"reward": 0.8059896044433117,
"reward_std": 0.5727136358618736,
"rewards/accuracy_reward": 0.1894531319849193,
"rewards/format_reward": 0.6165364794433117,
"step": 123
},
{
"completion_length": 1024.0,
"epoch": 0.4381625441696113,
"grad_norm": 7.568264702961115,
"kl": 3.517578125,
"learning_rate": 1.3855383435866076e-05,
"loss": 0.1407,
"reward": 0.7825521007180214,
"reward_std": 0.5777693595737219,
"rewards/accuracy_reward": 0.17643229733221233,
"rewards/format_reward": 0.6061198096722364,
"step": 124
},
{
"completion_length": 1024.0,
"epoch": 0.4416961130742049,
"grad_norm": 1142.9874069005743,
"kl": 47.984375,
"learning_rate": 1.3740968546047935e-05,
"loss": 1.9207,
"reward": 0.7265625223517418,
"reward_std": 0.5636367797851562,
"rewards/accuracy_reward": 0.14778646221384406,
"rewards/format_reward": 0.5787760615348816,
"step": 125
},
{
"completion_length": 1024.0,
"epoch": 0.4452296819787986,
"grad_norm": 27.152545132029452,
"kl": 5.333984375,
"learning_rate": 1.362598137324667e-05,
"loss": 0.2134,
"reward": 0.7291666902601719,
"reward_std": 0.5741878617554903,
"rewards/accuracy_reward": 0.18489583674818277,
"rewards/format_reward": 0.5442708525806665,
"step": 126
},
{
"completion_length": 1024.0,
"epoch": 0.44876325088339225,
"grad_norm": 15.772408193991923,
"kl": 6.375,
"learning_rate": 1.3510439507880778e-05,
"loss": 0.255,
"reward": 0.6901041865348816,
"reward_std": 0.554063655436039,
"rewards/accuracy_reward": 0.17122396267950535,
"rewards/format_reward": 0.5188802238553762,
"step": 127
},
{
"completion_length": 1024.0,
"epoch": 0.45229681978798586,
"grad_norm": 14.045655101866169,
"kl": 4.6875,
"learning_rate": 1.3394360625224067e-05,
"loss": 0.1874,
"reward": 0.6523437760770321,
"reward_std": 0.5496281944215298,
"rewards/accuracy_reward": 0.15950521756894886,
"rewards/format_reward": 0.49283855594694614,
"step": 128
},
{
"completion_length": 1024.0,
"epoch": 0.4558303886925795,
"grad_norm": 6.019406500188639,
"kl": 4.501953125,
"learning_rate": 1.3277762482701769e-05,
"loss": 0.1801,
"reward": 0.6139323115348816,
"reward_std": 0.5297244675457478,
"rewards/accuracy_reward": 0.14778646267950535,
"rewards/format_reward": 0.46614584885537624,
"step": 129
},
{
"completion_length": 1024.0,
"epoch": 0.45936395759717313,
"grad_norm": 8.557172478534012,
"kl": 2.5107421875,
"learning_rate": 1.3160662917174045e-05,
"loss": 0.1005,
"reward": 0.5481771044433117,
"reward_std": 0.5158671271055937,
"rewards/accuracy_reward": 0.132812503259629,
"rewards/format_reward": 0.4153645932674408,
"step": 130
},
{
"completion_length": 1024.0,
"epoch": 0.4628975265017668,
"grad_norm": 9.680783748799318,
"kl": 2.01806640625,
"learning_rate": 1.3043079842207363e-05,
"loss": 0.0807,
"reward": 0.5436198115348816,
"reward_std": 0.525303166359663,
"rewards/accuracy_reward": 0.13411458616610616,
"rewards/format_reward": 0.40950521640479565,
"step": 131
},
{
"completion_length": 1024.0,
"epoch": 0.4664310954063604,
"grad_norm": 11.919372203113637,
"kl": 3.8720703125,
"learning_rate": 1.2925031245334112e-05,
"loss": 0.1549,
"reward": 0.5683593936264515,
"reward_std": 0.5238314680755138,
"rewards/accuracy_reward": 0.15104167046956718,
"rewards/format_reward": 0.41731772013008595,
"step": 132
},
{
"completion_length": 1024.0,
"epoch": 0.46996466431095407,
"grad_norm": 2.91738147399812,
"kl": 3.1455078125,
"learning_rate": 1.2806535185300931e-05,
"loss": 0.1258,
"reward": 0.5989583507180214,
"reward_std": 0.5382577646523714,
"rewards/accuracy_reward": 0.14388021198101342,
"rewards/format_reward": 0.45507813803851604,
"step": 133
},
{
"completion_length": 1024.0,
"epoch": 0.4734982332155477,
"grad_norm": 9.809127835476216,
"kl": 4.83984375,
"learning_rate": 1.2687609789306144e-05,
"loss": 0.1935,
"reward": 0.5527343917638063,
"reward_std": 0.5343853384256363,
"rewards/accuracy_reward": 0.12760417070239782,
"rewards/format_reward": 0.42513022013008595,
"step": 134
},
{
"completion_length": 1024.0,
"epoch": 0.47703180212014135,
"grad_norm": 17.04549908577345,
"kl": 7.4140625,
"learning_rate": 1.2568273250226681e-05,
"loss": 0.2967,
"reward": 0.6009114738553762,
"reward_std": 0.5301515571773052,
"rewards/accuracy_reward": 0.13606771151535213,
"rewards/format_reward": 0.46484375931322575,
"step": 135
},
{
"completion_length": 1024.0,
"epoch": 0.48056537102473496,
"grad_norm": 8.477206528274886,
"kl": 5.021484375,
"learning_rate": 1.2448543823835016e-05,
"loss": 0.201,
"reward": 0.6223958544433117,
"reward_std": 0.5402844380587339,
"rewards/accuracy_reward": 0.13411458721384406,
"rewards/format_reward": 0.4882812611758709,
"step": 136
},
{
"completion_length": 1024.0,
"epoch": 0.4840989399293286,
"grad_norm": 3.8500660741333816,
"kl": 3.294921875,
"learning_rate": 1.2328439826006415e-05,
"loss": 0.1319,
"reward": 0.655598983168602,
"reward_std": 0.5419861897826195,
"rewards/accuracy_reward": 0.15364583767950535,
"rewards/format_reward": 0.5019531399011612,
"step": 137
},
{
"completion_length": 1024.0,
"epoch": 0.4876325088339223,
"grad_norm": 4.591683686353463,
"kl": 2.4130859375,
"learning_rate": 1.2207979629917061e-05,
"loss": 0.0966,
"reward": 0.6510416865348816,
"reward_std": 0.5556948073208332,
"rewards/accuracy_reward": 0.12630208616610616,
"rewards/format_reward": 0.5247395969927311,
"step": 138
},
{
"completion_length": 1024.0,
"epoch": 0.4911660777385159,
"grad_norm": 26.38361978331117,
"kl": 5.92236328125,
"learning_rate": 1.2087181663233354e-05,
"loss": 0.2373,
"reward": 0.6998698189854622,
"reward_std": 0.5648407433182001,
"rewards/accuracy_reward": 0.13541667070239782,
"rewards/format_reward": 0.5644531436264515,
"step": 139
},
{
"completion_length": 1024.0,
"epoch": 0.49469964664310956,
"grad_norm": 4.793844402068783,
"kl": 2.24755859375,
"learning_rate": 1.1966064405292887e-05,
"loss": 0.0899,
"reward": 0.7552083618938923,
"reward_std": 0.5623416192829609,
"rewards/accuracy_reward": 0.17057292209938169,
"rewards/format_reward": 0.5846354328095913,
"step": 140
},
{
"completion_length": 1024.0,
"epoch": 0.49823321554770317,
"grad_norm": 1.6059354727609763,
"kl": 3.4482421875,
"learning_rate": 1.184464638427756e-05,
"loss": 0.1379,
"reward": 0.7838541828095913,
"reward_std": 0.5696821231395006,
"rewards/accuracy_reward": 0.18815104570239782,
"rewards/format_reward": 0.5957031436264515,
"step": 141
},
{
"completion_length": 1024.0,
"epoch": 0.5017667844522968,
"grad_norm": 14.85453956454356,
"kl": 7.228515625,
"learning_rate": 1.1722946174379168e-05,
"loss": 0.2892,
"reward": 0.7656250186264515,
"reward_std": 0.5824484005570412,
"rewards/accuracy_reward": 0.17317708721384406,
"rewards/format_reward": 0.5924479383975267,
"step": 142
},
{
"completion_length": 1024.0,
"epoch": 0.5053003533568905,
"grad_norm": 21.201841225460676,
"kl": 9.515625,
"learning_rate": 1.1600982392957978e-05,
"loss": 0.3802,
"reward": 0.7910156510770321,
"reward_std": 0.5755977407097816,
"rewards/accuracy_reward": 0.16341146174818277,
"rewards/format_reward": 0.6276041846722364,
"step": 143
},
{
"completion_length": 1024.0,
"epoch": 0.508833922261484,
"grad_norm": 79.28650798092659,
"kl": 7.9423828125,
"learning_rate": 1.1478773697694691e-05,
"loss": 0.318,
"reward": 0.9016927294433117,
"reward_std": 0.5674024932086468,
"rewards/accuracy_reward": 0.2285156287252903,
"rewards/format_reward": 0.6731771044433117,
"step": 144
},
{
"completion_length": 1024.0,
"epoch": 0.5123674911660777,
"grad_norm": 7.988007399943678,
"kl": 4.2353515625,
"learning_rate": 1.1356338783736256e-05,
"loss": 0.1695,
"reward": 0.8457031473517418,
"reward_std": 0.5507397223263979,
"rewards/accuracy_reward": 0.18294271500781178,
"rewards/format_reward": 0.6627604365348816,
"step": 145
},
{
"completion_length": 1024.0,
"epoch": 0.5159010600706714,
"grad_norm": 6.6368383146740335,
"kl": 2.5166015625,
"learning_rate": 1.123369638083593e-05,
"loss": 0.1007,
"reward": 0.8782552406191826,
"reward_std": 0.5208123382180929,
"rewards/accuracy_reward": 0.1855468787252903,
"rewards/format_reward": 0.6927083507180214,
"step": 146
},
{
"completion_length": 1024.0,
"epoch": 0.519434628975265,
"grad_norm": 5.018034104235896,
"kl": 2.7021484375,
"learning_rate": 1.1110865250488047e-05,
"loss": 0.1081,
"reward": 0.884114608168602,
"reward_std": 0.530108455568552,
"rewards/accuracy_reward": 0.16601562919095159,
"rewards/format_reward": 0.7180989719927311,
"step": 147
},
{
"completion_length": 1024.0,
"epoch": 0.5229681978798587,
"grad_norm": 4.782732693496574,
"kl": 4.1826171875,
"learning_rate": 1.0987864183057943e-05,
"loss": 0.1672,
"reward": 0.9218750223517418,
"reward_std": 0.5164159703999758,
"rewards/accuracy_reward": 0.18229167396202683,
"rewards/format_reward": 0.7395833544433117,
"step": 148
},
{
"completion_length": 1024.0,
"epoch": 0.5265017667844523,
"grad_norm": 16.810647138161976,
"kl": 6.6533203125,
"learning_rate": 1.0864711994907457e-05,
"loss": 0.2665,
"reward": 0.9960937686264515,
"reward_std": 0.48126981779932976,
"rewards/accuracy_reward": 0.22786459187045693,
"rewards/format_reward": 0.7682291828095913,
"step": 149
},
{
"completion_length": 1024.0,
"epoch": 0.5300353356890459,
"grad_norm": 10.7748697556108,
"kl": 5.4052734375,
"learning_rate": 1.0741427525516463e-05,
"loss": 0.2162,
"reward": 0.9928385689854622,
"reward_std": 0.5116328075528145,
"rewards/accuracy_reward": 0.22395833989139646,
"rewards/format_reward": 0.7688802257180214,
"step": 150
},
{
"completion_length": 1024.0,
"epoch": 0.5335689045936396,
"grad_norm": 2.3886371142629574,
"kl": 2.9921875,
"learning_rate": 1.0618029634600843e-05,
"loss": 0.1197,
"reward": 0.9869791977107525,
"reward_std": 0.48671552538871765,
"rewards/accuracy_reward": 0.20572917303070426,
"rewards/format_reward": 0.7812500186264515,
"step": 151
},
{
"completion_length": 1024.0,
"epoch": 0.5371024734982333,
"grad_norm": 1.7440867401519615,
"kl": 3.501953125,
"learning_rate": 1.0494537199227393e-05,
"loss": 0.1401,
"reward": 0.9563802443444729,
"reward_std": 0.4778597932308912,
"rewards/accuracy_reward": 0.1940104216337204,
"rewards/format_reward": 0.7623698152601719,
"step": 152
},
{
"completion_length": 1024.0,
"epoch": 0.5406360424028268,
"grad_norm": 2.4296240779116753,
"kl": 3.357421875,
"learning_rate": 1.0370969110926052e-05,
"loss": 0.1343,
"reward": 0.9921875298023224,
"reward_std": 0.4780040867626667,
"rewards/accuracy_reward": 0.21679688291624188,
"rewards/format_reward": 0.7753906473517418,
"step": 153
},
{
"completion_length": 1024.0,
"epoch": 0.5441696113074205,
"grad_norm": 4.117699458280056,
"kl": 3.2236328125,
"learning_rate": 1.024734427279995e-05,
"loss": 0.129,
"reward": 1.0136718973517418,
"reward_std": 0.49156964384019375,
"rewards/accuracy_reward": 0.23502604896202683,
"rewards/format_reward": 0.7786458469927311,
"step": 154
},
{
"completion_length": 1024.0,
"epoch": 0.5477031802120141,
"grad_norm": 9.030839517376265,
"kl": 5.662109375,
"learning_rate": 1.012368159663363e-05,
"loss": 0.2264,
"reward": 1.0794271267950535,
"reward_std": 0.45495169796049595,
"rewards/accuracy_reward": 0.2740885503590107,
"rewards/format_reward": 0.8053385578095913,
"step": 155
},
{
"completion_length": 1024.0,
"epoch": 0.5512367491166078,
"grad_norm": 3.2489067144831174,
"kl": 3.689453125,
"learning_rate": 1e-05,
"loss": 0.1475,
"reward": 1.100260455161333,
"reward_std": 0.4964223224669695,
"rewards/accuracy_reward": 0.2955729253590107,
"rewards/format_reward": 0.8046875223517418,
"step": 156
},
{
"completion_length": 1024.0,
"epoch": 0.5547703180212014,
"grad_norm": 5.586842094139113,
"kl": 1.96337890625,
"learning_rate": 9.876318403366371e-06,
"loss": 0.0785,
"reward": 1.0937500335276127,
"reward_std": 0.4783368781208992,
"rewards/accuracy_reward": 0.2884114645421505,
"rewards/format_reward": 0.8053385578095913,
"step": 157
},
{
"completion_length": 1024.0,
"epoch": 0.558303886925795,
"grad_norm": 6.4948169343262245,
"kl": 1.94287109375,
"learning_rate": 9.752655727200051e-06,
"loss": 0.0777,
"reward": 1.040364608168602,
"reward_std": 0.49424389004707336,
"rewards/accuracy_reward": 0.25390625838190317,
"rewards/format_reward": 0.7864583507180214,
"step": 158
},
{
"completion_length": 1024.0,
"epoch": 0.5618374558303887,
"grad_norm": 5.326307273819102,
"kl": 2.5322265625,
"learning_rate": 9.62903088907395e-06,
"loss": 0.1013,
"reward": 1.0703125298023224,
"reward_std": 0.486019866541028,
"rewards/accuracy_reward": 0.27669271547347307,
"rewards/format_reward": 0.7936198078095913,
"step": 159
},
{
"completion_length": 1024.0,
"epoch": 0.5653710247349824,
"grad_norm": 5.794058798518192,
"kl": 4.05859375,
"learning_rate": 9.505462800772612e-06,
"loss": 0.1624,
"reward": 1.1087239980697632,
"reward_std": 0.48937827721238136,
"rewards/accuracy_reward": 0.3196614682674408,
"rewards/format_reward": 0.7890625186264515,
"step": 160
},
{
"completion_length": 1024.0,
"epoch": 0.568904593639576,
"grad_norm": 9.402836751388962,
"kl": 5.33984375,
"learning_rate": 9.381970365399162e-06,
"loss": 0.2135,
"reward": 0.9941406510770321,
"reward_std": 0.522324126213789,
"rewards/accuracy_reward": 0.263671881519258,
"rewards/format_reward": 0.7304687723517418,
"step": 161
},
{
"completion_length": 1024.0,
"epoch": 0.5724381625441696,
"grad_norm": 4.195689922174244,
"kl": 4.86328125,
"learning_rate": 9.25857247448354e-06,
"loss": 0.1944,
"reward": 0.960937537252903,
"reward_std": 0.5458177234977484,
"rewards/accuracy_reward": 0.24739584140479565,
"rewards/format_reward": 0.7135416939854622,
"step": 162
},
{
"completion_length": 1024.0,
"epoch": 0.5759717314487632,
"grad_norm": 2.8989498058377228,
"kl": 4.87109375,
"learning_rate": 9.135288005092546e-06,
"loss": 0.1949,
"reward": 1.0930989868938923,
"reward_std": 0.5132731832563877,
"rewards/accuracy_reward": 0.29947918001562357,
"rewards/format_reward": 0.7936198115348816,
"step": 163
},
{
"completion_length": 1024.0,
"epoch": 0.5795053003533569,
"grad_norm": 3.0006286232984234,
"kl": 5.009765625,
"learning_rate": 9.012135816942058e-06,
"loss": 0.2003,
"reward": 1.0377604514360428,
"reward_std": 0.47819276340305805,
"rewards/accuracy_reward": 0.24283854709938169,
"rewards/format_reward": 0.7949218899011612,
"step": 164
},
{
"completion_length": 1024.0,
"epoch": 0.5830388692579506,
"grad_norm": 4.6993358160287615,
"kl": 5.35546875,
"learning_rate": 8.889134749511956e-06,
"loss": 0.2143,
"reward": 1.0800781473517418,
"reward_std": 0.5242850538343191,
"rewards/accuracy_reward": 0.281901054084301,
"rewards/format_reward": 0.798177108168602,
"step": 165
},
{
"completion_length": 1024.0,
"epoch": 0.5865724381625441,
"grad_norm": 6.4731877967945355,
"kl": 6.76953125,
"learning_rate": 8.76630361916407e-06,
"loss": 0.2708,
"reward": 1.0436198264360428,
"reward_std": 0.4580067917704582,
"rewards/accuracy_reward": 0.24804688291624188,
"rewards/format_reward": 0.7955729365348816,
"step": 166
},
{
"completion_length": 1024.0,
"epoch": 0.5901060070671378,
"grad_norm": 9.638355564108144,
"kl": 8.08203125,
"learning_rate": 8.643661216263744e-06,
"loss": 0.3235,
"reward": 1.0696614980697632,
"reward_std": 0.47720608301460743,
"rewards/accuracy_reward": 0.254557297565043,
"rewards/format_reward": 0.8151041939854622,
"step": 167
},
{
"completion_length": 1024.0,
"epoch": 0.5936395759717314,
"grad_norm": 5.425360658521879,
"kl": 7.126953125,
"learning_rate": 8.52122630230531e-06,
"loss": 0.2852,
"reward": 1.0677083730697632,
"reward_std": 0.4944526255130768,
"rewards/accuracy_reward": 0.2669270886108279,
"rewards/format_reward": 0.8007812649011612,
"step": 168
},
{
"completion_length": 1024.0,
"epoch": 0.5971731448763251,
"grad_norm": 2.6727398917140324,
"kl": 5.6484375,
"learning_rate": 8.399017607042025e-06,
"loss": 0.2259,
"reward": 1.0963541865348816,
"reward_std": 0.4828463848680258,
"rewards/accuracy_reward": 0.28645834140479565,
"rewards/format_reward": 0.8098958544433117,
"step": 169
},
{
"completion_length": 1024.0,
"epoch": 0.6007067137809188,
"grad_norm": 3.0934284767601503,
"kl": 4.96484375,
"learning_rate": 8.277053825620836e-06,
"loss": 0.1987,
"reward": 1.0039062909781933,
"reward_std": 0.47822842188179493,
"rewards/accuracy_reward": 0.22005208861082792,
"rewards/format_reward": 0.7838541865348816,
"step": 170
},
{
"completion_length": 1024.0,
"epoch": 0.6042402826855123,
"grad_norm": 1.415026805369631,
"kl": 4.71875,
"learning_rate": 8.155353615722442e-06,
"loss": 0.1887,
"reward": 1.0240885764360428,
"reward_std": 0.5142606012523174,
"rewards/accuracy_reward": 0.24283855129033327,
"rewards/format_reward": 0.7812500149011612,
"step": 171
},
{
"completion_length": 1024.0,
"epoch": 0.607773851590106,
"grad_norm": 3.671045541857197,
"kl": 4.7958984375,
"learning_rate": 8.033935594707116e-06,
"loss": 0.1918,
"reward": 0.9915364906191826,
"reward_std": 0.5378254223614931,
"rewards/accuracy_reward": 0.2363281308207661,
"rewards/format_reward": 0.7552083544433117,
"step": 172
},
{
"completion_length": 1024.0,
"epoch": 0.6113074204946997,
"grad_norm": 4.829747105605246,
"kl": 4.9267578125,
"learning_rate": 7.91281833676665e-06,
"loss": 0.1968,
"reward": 1.0175781548023224,
"reward_std": 0.533087344840169,
"rewards/accuracy_reward": 0.262369797565043,
"rewards/format_reward": 0.7552083544433117,
"step": 173
},
{
"completion_length": 1024.0,
"epoch": 0.6148409893992933,
"grad_norm": 4.133358091951167,
"kl": 5.173828125,
"learning_rate": 7.79202037008294e-06,
"loss": 0.2072,
"reward": 0.9824219010770321,
"reward_std": 0.5031039249151945,
"rewards/accuracy_reward": 0.23111979756504297,
"rewards/format_reward": 0.7513021044433117,
"step": 174
},
{
"completion_length": 1024.0,
"epoch": 0.6183745583038869,
"grad_norm": 1.7387249238999762,
"kl": 5.0078125,
"learning_rate": 7.671560173993588e-06,
"loss": 0.2003,
"reward": 0.956380233168602,
"reward_std": 0.5033866986632347,
"rewards/accuracy_reward": 0.22135417256504297,
"rewards/format_reward": 0.7350260652601719,
"step": 175
},
{
"completion_length": 1024.0,
"epoch": 0.6219081272084805,
"grad_norm": 2.172215905238513,
"kl": 4.857421875,
"learning_rate": 7.551456176164989e-06,
"loss": 0.1943,
"reward": 0.9941406548023224,
"reward_std": 0.526677755638957,
"rewards/accuracy_reward": 0.24414063151925802,
"rewards/format_reward": 0.7500000223517418,
"step": 176
},
{
"completion_length": 1024.0,
"epoch": 0.6254416961130742,
"grad_norm": 0.8842413004407766,
"kl": 5.447265625,
"learning_rate": 7.431726749773322e-06,
"loss": 0.2178,
"reward": 0.9596354402601719,
"reward_std": 0.5413137227296829,
"rewards/accuracy_reward": 0.2278645895421505,
"rewards/format_reward": 0.7317708507180214,
"step": 177
},
{
"completion_length": 1024.0,
"epoch": 0.6289752650176679,
"grad_norm": 0.7402618851617847,
"kl": 5.388671875,
"learning_rate": 7.312390210693863e-06,
"loss": 0.2156,
"reward": 0.9811198189854622,
"reward_std": 0.5133567694574594,
"rewards/accuracy_reward": 0.23242188384756446,
"rewards/format_reward": 0.7486979402601719,
"step": 178
},
{
"completion_length": 1024.0,
"epoch": 0.6325088339222615,
"grad_norm": 3.560567058348375,
"kl": 5.2578125,
"learning_rate": 7.193464814699073e-06,
"loss": 0.2104,
"reward": 0.9238281324505806,
"reward_std": 0.5353942643851042,
"rewards/accuracy_reward": 0.20507813151925802,
"rewards/format_reward": 0.7187500223517418,
"step": 179
},
{
"completion_length": 1024.0,
"epoch": 0.6360424028268551,
"grad_norm": 2.753064271775204,
"kl": 5.181640625,
"learning_rate": 7.07496875466589e-06,
"loss": 0.2074,
"reward": 0.8417968861758709,
"reward_std": 0.5037534404546022,
"rewards/accuracy_reward": 0.1321614630287513,
"rewards/format_reward": 0.7096354328095913,
"step": 180
},
{
"completion_length": 1024.0,
"epoch": 0.6395759717314488,
"grad_norm": 3.87667763131927,
"kl": 6.107421875,
"learning_rate": 6.9569201577926395e-06,
"loss": 0.2442,
"reward": 0.8977864868938923,
"reward_std": 0.5129530522972345,
"rewards/accuracy_reward": 0.18424479803070426,
"rewards/format_reward": 0.7135416939854622,
"step": 181
},
{
"completion_length": 1024.0,
"epoch": 0.6431095406360424,
"grad_norm": 3.8854100556266813,
"kl": 6.060546875,
"learning_rate": 6.839337082825954e-06,
"loss": 0.2426,
"reward": 0.8893229365348816,
"reward_std": 0.4821504820138216,
"rewards/accuracy_reward": 0.15755208861082792,
"rewards/format_reward": 0.7317708469927311,
"step": 182
},
{
"completion_length": 1024.0,
"epoch": 0.6466431095406361,
"grad_norm": 1.2484444905977732,
"kl": 5.591796875,
"learning_rate": 6.722237517298232e-06,
"loss": 0.2238,
"reward": 0.958984412252903,
"reward_std": 0.48374492302536964,
"rewards/accuracy_reward": 0.1861979211680591,
"rewards/format_reward": 0.7727864794433117,
"step": 183
},
{
"completion_length": 1024.0,
"epoch": 0.6501766784452296,
"grad_norm": 2.6893650368389546,
"kl": 5.880859375,
"learning_rate": 6.605639374775934e-06,
"loss": 0.2352,
"reward": 0.9785156510770321,
"reward_std": 0.4921079948544502,
"rewards/accuracy_reward": 0.2057291716337204,
"rewards/format_reward": 0.7727864757180214,
"step": 184
},
{
"completion_length": 1024.0,
"epoch": 0.6537102473498233,
"grad_norm": 2.3442839100460233,
"kl": 6.041015625,
"learning_rate": 6.489560492119225e-06,
"loss": 0.2416,
"reward": 0.9583333693444729,
"reward_std": 0.4440547488629818,
"rewards/accuracy_reward": 0.1692708395421505,
"rewards/format_reward": 0.7890625223517418,
"step": 185
},
{
"completion_length": 1024.0,
"epoch": 0.657243816254417,
"grad_norm": 2.1253880553627047,
"kl": 6.69921875,
"learning_rate": 6.374018626753331e-06,
"loss": 0.268,
"reward": 0.9765625223517418,
"reward_std": 0.4347160626202822,
"rewards/accuracy_reward": 0.17838542256504297,
"rewards/format_reward": 0.7981770969927311,
"step": 186
},
{
"completion_length": 1024.0,
"epoch": 0.6607773851590106,
"grad_norm": 1.0194540597887043,
"kl": 6.712890625,
"learning_rate": 6.2590314539520695e-06,
"loss": 0.2686,
"reward": 0.9921875335276127,
"reward_std": 0.44406831078231335,
"rewards/accuracy_reward": 0.2005208395421505,
"rewards/format_reward": 0.7916666828095913,
"step": 187
},
{
"completion_length": 1024.0,
"epoch": 0.6643109540636042,
"grad_norm": 0.5545671762601952,
"kl": 6.857421875,
"learning_rate": 6.144616564133927e-06,
"loss": 0.2743,
"reward": 1.023437526077032,
"reward_std": 0.4340708777308464,
"rewards/accuracy_reward": 0.19401042442768812,
"rewards/format_reward": 0.829427108168602,
"step": 188
},
{
"completion_length": 1024.0,
"epoch": 0.6678445229681979,
"grad_norm": 0.3583901424418257,
"kl": 6.94140625,
"learning_rate": 6.03079146017113e-06,
"loss": 0.2778,
"reward": 1.0410156548023224,
"reward_std": 0.4304163958877325,
"rewards/accuracy_reward": 0.21354167070239782,
"rewards/format_reward": 0.8274739794433117,
"step": 189
},
{
"completion_length": 1024.0,
"epoch": 0.6713780918727915,
"grad_norm": 0.17755970310205513,
"kl": 7.12890625,
"learning_rate": 5.9175735547120975e-06,
"loss": 0.2852,
"reward": 1.0572916865348816,
"reward_std": 0.4284838940948248,
"rewards/accuracy_reward": 0.20963542349636555,
"rewards/format_reward": 0.8476562723517418,
"step": 190
},
{
"completion_length": 1024.0,
"epoch": 0.6749116607773852,
"grad_norm": 0.16047119425020795,
"kl": 7.486328125,
"learning_rate": 5.804980167517712e-06,
"loss": 0.2995,
"reward": 1.0820313021540642,
"reward_std": 0.3956974996253848,
"rewards/accuracy_reward": 0.22395833861082792,
"rewards/format_reward": 0.8580729365348816,
"step": 191
},
{
"completion_length": 1024.0,
"epoch": 0.6784452296819788,
"grad_norm": 0.18607515894747625,
"kl": 7.34765625,
"learning_rate": 5.693028522811783e-06,
"loss": 0.2937,
"reward": 1.1145833805203438,
"reward_std": 0.42582329362630844,
"rewards/accuracy_reward": 0.25911459047347307,
"rewards/format_reward": 0.8554687798023224,
"step": 192
},
{
"completion_length": 1024.0,
"epoch": 0.6819787985865724,
"grad_norm": 0.12713756611493196,
"kl": 7.560546875,
"learning_rate": 5.581735746646134e-06,
"loss": 0.3023,
"reward": 1.1165364943444729,
"reward_std": 0.4042051937431097,
"rewards/accuracy_reward": 0.24609375651925802,
"rewards/format_reward": 0.8704427368938923,
"step": 193
},
{
"completion_length": 1006.2291679382324,
"epoch": 0.6855123674911661,
"grad_norm": 0.17261260884206758,
"kl": 7.578125,
"learning_rate": 5.471118864280716e-06,
"loss": 0.3027,
"reward": 1.1165365055203438,
"reward_std": 0.4118177331984043,
"rewards/accuracy_reward": 0.2513020895421505,
"rewards/format_reward": 0.8652343973517418,
"step": 194
},
{
"completion_length": 1009.5208358764648,
"epoch": 0.6890459363957597,
"grad_norm": 0.12753618037588546,
"kl": 7.6796875,
"learning_rate": 5.361194797579108e-06,
"loss": 0.3073,
"reward": 1.0963542014360428,
"reward_std": 0.38153328374028206,
"rewards/accuracy_reward": 0.22330729849636555,
"rewards/format_reward": 0.8730468936264515,
"step": 195
},
{
"completion_length": 992.2708358764648,
"epoch": 0.6925795053003534,
"grad_norm": 0.1939287572168999,
"kl": 7.9375,
"learning_rate": 5.2519803624198865e-06,
"loss": 0.3175,
"reward": 1.1438802480697632,
"reward_std": 0.36042055673897266,
"rewards/accuracy_reward": 0.2545572994276881,
"rewards/format_reward": 0.8893229365348816,
"step": 196
},
{
"completion_length": 1005.8333358764648,
"epoch": 0.696113074204947,
"grad_norm": 1.6055164563240731,
"kl": 7.822265625,
"learning_rate": 5.143492266124164e-06,
"loss": 0.313,
"reward": 1.0944010764360428,
"reward_std": 0.3512597717344761,
"rewards/accuracy_reward": 0.21289063151925802,
"rewards/format_reward": 0.8815104328095913,
"step": 197
},
{
"completion_length": 976.1875038146973,
"epoch": 0.6996466431095406,
"grad_norm": 0.15206731503411422,
"kl": 7.78125,
"learning_rate": 5.035747104899738e-06,
"loss": 0.3114,
"reward": 1.0540364906191826,
"reward_std": 0.35300159733742476,
"rewards/accuracy_reward": 0.18880208674818277,
"rewards/format_reward": 0.8652343899011612,
"step": 198
},
{
"completion_length": 983.6250038146973,
"epoch": 0.7031802120141343,
"grad_norm": 0.18996755943687274,
"kl": 7.857421875,
"learning_rate": 4.928761361302269e-06,
"loss": 0.3144,
"reward": 1.1308594197034836,
"reward_std": 0.37638773024082184,
"rewards/accuracy_reward": 0.24869792396202683,
"rewards/format_reward": 0.8821614794433117,
"step": 199
},
{
"completion_length": 987.5208358764648,
"epoch": 0.7067137809187279,
"grad_norm": 0.19405956495375348,
"kl": 7.912109375,
"learning_rate": 4.8225514017138205e-06,
"loss": 0.3164,
"reward": 1.096354190260172,
"reward_std": 0.34191144444048405,
"rewards/accuracy_reward": 0.20638021361082792,
"rewards/format_reward": 0.8899739757180214,
"step": 200
},
{
"completion_length": 1008.8958358764648,
"epoch": 0.7102473498233216,
"grad_norm": 0.1978005231935579,
"kl": 7.767578125,
"learning_rate": 4.717133473839163e-06,
"loss": 0.3108,
"reward": 1.1432292237877846,
"reward_std": 0.3862752038985491,
"rewards/accuracy_reward": 0.2701823003590107,
"rewards/format_reward": 0.8730468973517418,
"step": 201
},
{
"completion_length": 1009.25,
"epoch": 0.7137809187279152,
"grad_norm": 0.1284484089519714,
"kl": 7.6015625,
"learning_rate": 4.612523704220264e-06,
"loss": 0.3041,
"reward": 1.115234412252903,
"reward_std": 0.42590648494660854,
"rewards/accuracy_reward": 0.26562500838190317,
"rewards/format_reward": 0.8496093973517418,
"step": 202
},
{
"completion_length": 1024.0,
"epoch": 0.7173144876325088,
"grad_norm": 0.14352585595949552,
"kl": 7.435546875,
"learning_rate": 4.508738095769278e-06,
"loss": 0.2974,
"reward": 1.061848983168602,
"reward_std": 0.41079509258270264,
"rewards/accuracy_reward": 0.2265625074505806,
"rewards/format_reward": 0.8352864757180214,
"step": 203
},
{
"completion_length": 1024.0,
"epoch": 0.7208480565371025,
"grad_norm": 0.1521889190457422,
"kl": 7.193359375,
"learning_rate": 4.405792525320469e-06,
"loss": 0.2877,
"reward": 1.0279948264360428,
"reward_std": 0.424892058596015,
"rewards/accuracy_reward": 0.2037760482635349,
"rewards/format_reward": 0.8242187686264515,
"step": 204
},
{
"completion_length": 1024.0,
"epoch": 0.7243816254416962,
"grad_norm": 0.3506188463952097,
"kl": 7.2890625,
"learning_rate": 4.303702741201431e-06,
"loss": 0.2915,
"reward": 1.059895858168602,
"reward_std": 0.42388765700161457,
"rewards/accuracy_reward": 0.2473958432674408,
"rewards/format_reward": 0.8125000186264515,
"step": 205
},
{
"completion_length": 1024.0,
"epoch": 0.7279151943462897,
"grad_norm": 0.1677757492998703,
"kl": 6.9921875,
"learning_rate": 4.202484360823926e-06,
"loss": 0.2797,
"reward": 1.0266927368938923,
"reward_std": 0.45705331675708294,
"rewards/accuracy_reward": 0.23502604942768812,
"rewards/format_reward": 0.791666679084301,
"step": 206
},
{
"completion_length": 1011.9791679382324,
"epoch": 0.7314487632508834,
"grad_norm": 0.1517309736158255,
"kl": 6.935546875,
"learning_rate": 4.1021528682948064e-06,
"loss": 0.2774,
"reward": 1.049479216337204,
"reward_std": 0.46575335413217545,
"rewards/accuracy_reward": 0.2526041716337204,
"rewards/format_reward": 0.7968750149011612,
"step": 207
},
{
"completion_length": 1024.0,
"epoch": 0.734982332155477,
"grad_norm": 0.16127641070568163,
"kl": 7.111328125,
"learning_rate": 4.002723612047272e-06,
"loss": 0.2847,
"reward": 1.085286483168602,
"reward_std": 0.44572209380567074,
"rewards/accuracy_reward": 0.26302084419876337,
"rewards/format_reward": 0.8222656436264515,
"step": 208
},
{
"completion_length": 1009.3125,
"epoch": 0.7385159010600707,
"grad_norm": 0.13871720330015053,
"kl": 7.23828125,
"learning_rate": 3.904211802492922e-06,
"loss": 0.2897,
"reward": 1.100260455161333,
"reward_std": 0.4249584712088108,
"rewards/accuracy_reward": 0.26953125558793545,
"rewards/format_reward": 0.830729179084301,
"step": 209
},
{
"completion_length": 1014.8125,
"epoch": 0.7420494699646644,
"grad_norm": 0.13865406061603536,
"kl": 7.302734375,
"learning_rate": 3.8066325096949153e-06,
"loss": 0.2922,
"reward": 1.1406250447034836,
"reward_std": 0.41488252952694893,
"rewards/accuracy_reward": 0.2812500074505806,
"rewards/format_reward": 0.8593750223517418,
"step": 210
},
{
"completion_length": 1024.0,
"epoch": 0.7455830388692579,
"grad_norm": 0.12592871455753224,
"kl": 7.369140625,
"learning_rate": 3.710000661062578e-06,
"loss": 0.2948,
"reward": 1.1796875484287739,
"reward_std": 0.41464217752218246,
"rewards/accuracy_reward": 0.313151047565043,
"rewards/format_reward": 0.8665364794433117,
"step": 211
},
{
"completion_length": 995.2291679382324,
"epoch": 0.7491166077738516,
"grad_norm": 0.1393901308152561,
"kl": 7.501953125,
"learning_rate": 3.6143310390678544e-06,
"loss": 0.3001,
"reward": 1.1647135838866234,
"reward_std": 0.3797377645969391,
"rewards/accuracy_reward": 0.2838541753590107,
"rewards/format_reward": 0.8808593899011612,
"step": 212
},
{
"completion_length": 1010.125,
"epoch": 0.7526501766784452,
"grad_norm": 0.14034512809285338,
"kl": 7.31640625,
"learning_rate": 3.5196382789839477e-06,
"loss": 0.2926,
"reward": 1.1549479514360428,
"reward_std": 0.39015408605337143,
"rewards/accuracy_reward": 0.27994792629033327,
"rewards/format_reward": 0.8750000223517418,
"step": 213
},
{
"completion_length": 1011.875,
"epoch": 0.7561837455830389,
"grad_norm": 0.14989504739682083,
"kl": 7.28125,
"learning_rate": 3.425936866646419e-06,
"loss": 0.2911,
"reward": 1.1614583656191826,
"reward_std": 0.39541246369481087,
"rewards/accuracy_reward": 0.292968756519258,
"rewards/format_reward": 0.8684896007180214,
"step": 214
},
{
"completion_length": 1024.0,
"epoch": 0.7597173144876325,
"grad_norm": 0.13609166349614024,
"kl": 6.982421875,
"learning_rate": 3.3332411362372063e-06,
"loss": 0.2793,
"reward": 1.1263021379709244,
"reward_std": 0.44077819399535656,
"rewards/accuracy_reward": 0.29101563384756446,
"rewards/format_reward": 0.8352864794433117,
"step": 215
},
{
"completion_length": 1024.0,
"epoch": 0.7632508833922261,
"grad_norm": 0.15918888254694777,
"kl": 7.1640625,
"learning_rate": 3.2415652680918262e-06,
"loss": 0.2865,
"reward": 1.1223958879709244,
"reward_std": 0.41186920180916786,
"rewards/accuracy_reward": 0.28190105222165585,
"rewards/format_reward": 0.8404948152601719,
"step": 216
},
{
"completion_length": 1008.0416679382324,
"epoch": 0.7667844522968198,
"grad_norm": 0.12370384010350725,
"kl": 7.244140625,
"learning_rate": 3.1509232865300886e-06,
"loss": 0.2899,
"reward": 1.1601563058793545,
"reward_std": 0.38652253709733486,
"rewards/accuracy_reward": 0.3059895886108279,
"rewards/format_reward": 0.8541666939854622,
"step": 217
},
{
"completion_length": 1024.0,
"epoch": 0.7703180212014135,
"grad_norm": 0.20461489596469734,
"kl": 7.2421875,
"learning_rate": 3.061329057710711e-06,
"loss": 0.2898,
"reward": 1.072916705161333,
"reward_std": 0.4447946548461914,
"rewards/accuracy_reward": 0.2571614645421505,
"rewards/format_reward": 0.8157552294433117,
"step": 218
},
{
"completion_length": 1024.0,
"epoch": 0.773851590106007,
"grad_norm": 0.1804867199582671,
"kl": 7.388671875,
"learning_rate": 2.9727962875101e-06,
"loss": 0.2956,
"reward": 1.1334635727107525,
"reward_std": 0.39954448491334915,
"rewards/accuracy_reward": 0.27929688338190317,
"rewards/format_reward": 0.8541666828095913,
"step": 219
},
{
"completion_length": 1024.0,
"epoch": 0.7773851590106007,
"grad_norm": 0.16932493071108376,
"kl": 7.197265625,
"learning_rate": 2.8853385194256677e-06,
"loss": 0.2879,
"reward": 1.1041667014360428,
"reward_std": 0.4760838821530342,
"rewards/accuracy_reward": 0.28125000884756446,
"rewards/format_reward": 0.8229166865348816,
"step": 220
},
{
"completion_length": 1024.0,
"epoch": 0.7809187279151943,
"grad_norm": 0.15687360626369415,
"kl": 7.0703125,
"learning_rate": 2.798969132503997e-06,
"loss": 0.283,
"reward": 1.0423177294433117,
"reward_std": 0.44260338321328163,
"rewards/accuracy_reward": 0.2304687574505806,
"rewards/format_reward": 0.8118489794433117,
"step": 221
},
{
"completion_length": 1024.0,
"epoch": 0.784452296819788,
"grad_norm": 0.17190503748389882,
"kl": 7.171875,
"learning_rate": 2.713701339294129e-06,
"loss": 0.2869,
"reward": 1.1419271193444729,
"reward_std": 0.4103549234569073,
"rewards/accuracy_reward": 0.301432297565043,
"rewards/format_reward": 0.8404948115348816,
"step": 222
},
{
"completion_length": 1024.0,
"epoch": 0.7879858657243817,
"grad_norm": 0.14054774738185286,
"kl": 7.33203125,
"learning_rate": 2.6295481838263628e-06,
"loss": 0.2932,
"reward": 1.0891927443444729,
"reward_std": 0.415512815117836,
"rewards/accuracy_reward": 0.24479167256504297,
"rewards/format_reward": 0.844401054084301,
"step": 223
},
{
"completion_length": 1024.0,
"epoch": 0.7915194346289752,
"grad_norm": 0.1087364190858895,
"kl": 7.130859375,
"learning_rate": 2.5465225396168134e-06,
"loss": 0.2853,
"reward": 1.1236979588866234,
"reward_std": 0.44147299975156784,
"rewards/accuracy_reward": 0.285807297565043,
"rewards/format_reward": 0.8378906473517418,
"step": 224
},
{
"completion_length": 1008.2916679382324,
"epoch": 0.7950530035335689,
"grad_norm": 0.11246121980520436,
"kl": 7.4765625,
"learning_rate": 2.464637107698046e-06,
"loss": 0.2994,
"reward": 1.1829427480697632,
"reward_std": 0.37431807816028595,
"rewards/accuracy_reward": 0.3046875074505806,
"rewards/format_reward": 0.8782552294433117,
"step": 225
},
{
"completion_length": 1024.0,
"epoch": 0.7985865724381626,
"grad_norm": 0.1112131457977264,
"kl": 7.265625,
"learning_rate": 2.3839044146761227e-06,
"loss": 0.2907,
"reward": 1.164713591337204,
"reward_std": 0.3692896058782935,
"rewards/accuracy_reward": 0.2890625037252903,
"rewards/format_reward": 0.8756510689854622,
"step": 226
},
{
"completion_length": 1024.0,
"epoch": 0.8021201413427562,
"grad_norm": 0.21800229184914455,
"kl": 7.462890625,
"learning_rate": 2.304336810814305e-06,
"loss": 0.2983,
"reward": 1.1809896230697632,
"reward_std": 0.3522001476958394,
"rewards/accuracy_reward": 0.29427084140479565,
"rewards/format_reward": 0.8867187723517418,
"step": 227
},
{
"completion_length": 1007.1458358764648,
"epoch": 0.8056537102473498,
"grad_norm": 0.1666037699534655,
"kl": 7.740234375,
"learning_rate": 2.2259464681437404e-06,
"loss": 0.3096,
"reward": 1.2037760689854622,
"reward_std": 0.3384133204817772,
"rewards/accuracy_reward": 0.30338542349636555,
"rewards/format_reward": 0.9003906436264515,
"step": 228
},
{
"completion_length": 1024.0,
"epoch": 0.8091872791519434,
"grad_norm": 0.17750163436679095,
"kl": 7.51953125,
"learning_rate": 2.1487453786014513e-06,
"loss": 0.301,
"reward": 1.1744792126119137,
"reward_std": 0.4062032885849476,
"rewards/accuracy_reward": 0.3001302173361182,
"rewards/format_reward": 0.8743489794433117,
"step": 229
},
{
"completion_length": 1008.1875,
"epoch": 0.8127208480565371,
"grad_norm": 0.15247459978463207,
"kl": 7.580078125,
"learning_rate": 2.072745352195794e-06,
"loss": 0.303,
"reward": 1.1803385764360428,
"reward_std": 0.37856387067586184,
"rewards/accuracy_reward": 0.29361979849636555,
"rewards/format_reward": 0.8867187686264515,
"step": 230
},
{
"completion_length": 1011.2916679382324,
"epoch": 0.8162544169611308,
"grad_norm": 0.18019373306879702,
"kl": 7.603515625,
"learning_rate": 1.997958015199829e-06,
"loss": 0.304,
"reward": 1.1835937798023224,
"reward_std": 0.37871948070824146,
"rewards/accuracy_reward": 0.293619797565043,
"rewards/format_reward": 0.8899739719927311,
"step": 231
},
{
"completion_length": 1024.0,
"epoch": 0.8197879858657244,
"grad_norm": 0.22583268385399752,
"kl": 7.529296875,
"learning_rate": 1.9243948083727626e-06,
"loss": 0.3012,
"reward": 1.118489608168602,
"reward_std": 0.37204239144921303,
"rewards/accuracy_reward": 0.25000000512227416,
"rewards/format_reward": 0.8684896044433117,
"step": 232
},
{
"completion_length": 1024.0,
"epoch": 0.823321554770318,
"grad_norm": 0.3443852485581638,
"kl": 7.34375,
"learning_rate": 1.8520669852097573e-06,
"loss": 0.2938,
"reward": 1.1399739980697632,
"reward_std": 0.40698738768696785,
"rewards/accuracy_reward": 0.2864583395421505,
"rewards/format_reward": 0.8535156473517418,
"step": 233
},
{
"completion_length": 1024.0,
"epoch": 0.8268551236749117,
"grad_norm": 0.2918520900684292,
"kl": 7.419921875,
"learning_rate": 1.7809856102204148e-06,
"loss": 0.2967,
"reward": 1.1516927555203438,
"reward_std": 0.3947129677981138,
"rewards/accuracy_reward": 0.30664063431322575,
"rewards/format_reward": 0.8450520969927311,
"step": 234
},
{
"completion_length": 993.1250038146973,
"epoch": 0.8303886925795053,
"grad_norm": 0.2326957931992313,
"kl": 7.501953125,
"learning_rate": 1.7111615572361628e-06,
"loss": 0.3001,
"reward": 1.1100260615348816,
"reward_std": 0.4124826304614544,
"rewards/accuracy_reward": 0.2643229244276881,
"rewards/format_reward": 0.8457031473517418,
"step": 235
},
{
"completion_length": 1008.4791679382324,
"epoch": 0.833922261484099,
"grad_norm": 0.20186154524969585,
"kl": 7.302734375,
"learning_rate": 1.642605507746786e-06,
"loss": 0.2922,
"reward": 1.0924479439854622,
"reward_std": 0.4381315726786852,
"rewards/accuracy_reward": 0.2623697970993817,
"rewards/format_reward": 0.8300781436264515,
"step": 236
},
{
"completion_length": 1012.5625,
"epoch": 0.8374558303886925,
"grad_norm": 0.12307486347767096,
"kl": 7.609375,
"learning_rate": 1.5753279492664264e-06,
"loss": 0.3044,
"reward": 1.0917969197034836,
"reward_std": 0.37845473177731037,
"rewards/accuracy_reward": 0.23046875465661287,
"rewards/format_reward": 0.8613281436264515,
"step": 237
},
{
"completion_length": 996.0000038146973,
"epoch": 0.8409893992932862,
"grad_norm": 0.12490510412366805,
"kl": 7.5703125,
"learning_rate": 1.509339173729214e-06,
"loss": 0.3028,
"reward": 1.1119791939854622,
"reward_std": 0.37066210247576237,
"rewards/accuracy_reward": 0.25195313338190317,
"rewards/format_reward": 0.8600260652601719,
"step": 238
},
{
"completion_length": 1013.7708358764648,
"epoch": 0.8445229681978799,
"grad_norm": 0.13493904378820848,
"kl": 7.599609375,
"learning_rate": 1.4446492759148411e-06,
"loss": 0.3039,
"reward": 1.1028646193444729,
"reward_std": 0.36637131590396166,
"rewards/accuracy_reward": 0.24934896640479565,
"rewards/format_reward": 0.8535156436264515,
"step": 239
},
{
"completion_length": 1024.0,
"epoch": 0.8480565371024735,
"grad_norm": 0.15035989381249287,
"kl": 7.650390625,
"learning_rate": 1.381268151904298e-06,
"loss": 0.3059,
"reward": 1.1451823264360428,
"reward_std": 0.3553981352597475,
"rewards/accuracy_reward": 0.25781250931322575,
"rewards/format_reward": 0.8873698078095913,
"step": 240
},
{
"completion_length": 1024.0,
"epoch": 0.8515901060070671,
"grad_norm": 0.22950348264138257,
"kl": 7.703125,
"learning_rate": 1.319205497565983e-06,
"loss": 0.3085,
"reward": 1.177083384245634,
"reward_std": 0.36925146263092756,
"rewards/accuracy_reward": 0.2929687611758709,
"rewards/format_reward": 0.884114608168602,
"step": 241
},
{
"completion_length": 1010.7916679382324,
"epoch": 0.8551236749116607,
"grad_norm": 0.16497755116206372,
"kl": 7.603515625,
"learning_rate": 1.2584708070724738e-06,
"loss": 0.3041,
"reward": 1.1829427406191826,
"reward_std": 0.3738958667963743,
"rewards/accuracy_reward": 0.2897135494276881,
"rewards/format_reward": 0.8932291865348816,
"step": 242
},
{
"completion_length": 1008.9583358764648,
"epoch": 0.8586572438162544,
"grad_norm": 0.17585743920433772,
"kl": 7.76171875,
"learning_rate": 1.1990733714481185e-06,
"loss": 0.3107,
"reward": 1.1471354588866234,
"reward_std": 0.3857318237423897,
"rewards/accuracy_reward": 0.26888021547347307,
"rewards/format_reward": 0.8782552294433117,
"step": 243
},
{
"completion_length": 1009.5833358764648,
"epoch": 0.8621908127208481,
"grad_norm": 0.09937123864278254,
"kl": 7.640625,
"learning_rate": 1.1410222771477276e-06,
"loss": 0.3056,
"reward": 1.1360677555203438,
"reward_std": 0.3525569401681423,
"rewards/accuracy_reward": 0.24934896640479565,
"rewards/format_reward": 0.8867187686264515,
"step": 244
},
{
"completion_length": 1024.0,
"epoch": 0.8657243816254417,
"grad_norm": 0.23903667479086305,
"kl": 7.603515625,
"learning_rate": 1.0843264046665558e-06,
"loss": 0.304,
"reward": 1.1881510764360428,
"reward_std": 0.3764577666297555,
"rewards/accuracy_reward": 0.3027343852445483,
"rewards/format_reward": 0.8854166865348816,
"step": 245
},
{
"completion_length": 1024.0,
"epoch": 0.8692579505300353,
"grad_norm": 0.1464770828710901,
"kl": 7.6328125,
"learning_rate": 1.0289944271817898e-06,
"loss": 0.3051,
"reward": 1.1829427406191826,
"reward_std": 0.36000128649175167,
"rewards/accuracy_reward": 0.2910156324505806,
"rewards/format_reward": 0.891927108168602,
"step": 246
},
{
"completion_length": 1011.8333358764648,
"epoch": 0.872791519434629,
"grad_norm": 0.12575527161822952,
"kl": 7.70703125,
"learning_rate": 9.750348092257368e-07,
"loss": 0.3083,
"reward": 1.2018229737877846,
"reward_std": 0.33693454321473837,
"rewards/accuracy_reward": 0.296875006519258,
"rewards/format_reward": 0.9049479402601719,
"step": 247
},
{
"completion_length": 1007.6041679382324,
"epoch": 0.8763250883392226,
"grad_norm": 0.2181922287172582,
"kl": 7.560546875,
"learning_rate": 9.224558053909615e-07,
"loss": 0.3026,
"reward": 1.1432291939854622,
"reward_std": 0.3927531726658344,
"rewards/accuracy_reward": 0.26562500931322575,
"rewards/format_reward": 0.8776041902601719,
"step": 248
},
{
"completion_length": 1003.625,
"epoch": 0.8798586572438163,
"grad_norm": 0.1951601816286408,
"kl": 7.806640625,
"learning_rate": 8.712654590675085e-07,
"loss": 0.3126,
"reward": 1.1347656659781933,
"reward_std": 0.3628583550453186,
"rewards/accuracy_reward": 0.2552083423361182,
"rewards/format_reward": 0.8795573152601719,
"step": 249
},
{
"completion_length": 1017.8541679382324,
"epoch": 0.8833922261484098,
"grad_norm": 0.13270611252972092,
"kl": 7.900390625,
"learning_rate": 8.214716012124491e-07,
"loss": 0.3162,
"reward": 1.2278646156191826,
"reward_std": 0.33511496149003506,
"rewards/accuracy_reward": 0.3190104244276881,
"rewards/format_reward": 0.9088541902601719,
"step": 250
},
{
"completion_length": 1024.0,
"epoch": 0.8869257950530035,
"grad_norm": 0.16899318018961745,
"kl": 7.703125,
"learning_rate": 7.730818491519343e-07,
"loss": 0.3083,
"reward": 1.1673177480697632,
"reward_std": 0.35251003317534924,
"rewards/accuracy_reward": 0.28515625838190317,
"rewards/format_reward": 0.8821614794433117,
"step": 251
},
{
"completion_length": 1024.0,
"epoch": 0.8904593639575972,
"grad_norm": 0.19761065312810003,
"kl": 7.53125,
"learning_rate": 7.261036054158965e-07,
"loss": 0.3013,
"reward": 1.1621094308793545,
"reward_std": 0.361306588165462,
"rewards/accuracy_reward": 0.28515625558793545,
"rewards/format_reward": 0.8769531473517418,
"step": 252
},
{
"completion_length": 980.1666679382324,
"epoch": 0.8939929328621908,
"grad_norm": 0.12059278219677871,
"kl": 7.86328125,
"learning_rate": 6.805440566056554e-07,
"loss": 0.3147,
"reward": 1.1438802555203438,
"reward_std": 0.3613898027688265,
"rewards/accuracy_reward": 0.2636718824505806,
"rewards/format_reward": 0.8802083507180214,
"step": 253
},
{
"completion_length": 1011.1041679382324,
"epoch": 0.8975265017667845,
"grad_norm": 0.21668223679230778,
"kl": 7.732421875,
"learning_rate": 6.364101722945082e-07,
"loss": 0.309,
"reward": 1.144531287252903,
"reward_std": 0.3460462633520365,
"rewards/accuracy_reward": 0.2636718829162419,
"rewards/format_reward": 0.8808594010770321,
"step": 254
},
{
"completion_length": 1024.0,
"epoch": 0.901060070671378,
"grad_norm": 0.2561339153931696,
"kl": 7.5546875,
"learning_rate": 5.937087039615619e-07,
"loss": 0.3021,
"reward": 1.1035156734287739,
"reward_std": 0.3481142967939377,
"rewards/accuracy_reward": 0.23372396640479565,
"rewards/format_reward": 0.8697916902601719,
"step": 255
},
{
"completion_length": 1024.0,
"epoch": 0.9045936395759717,
"grad_norm": 0.1552920007884265,
"kl": 7.521484375,
"learning_rate": 5.524461839589012e-07,
"loss": 0.3008,
"reward": 1.1230469271540642,
"reward_std": 0.36596967838704586,
"rewards/accuracy_reward": 0.2526041688397527,
"rewards/format_reward": 0.870442733168602,
"step": 256
},
{
"completion_length": 1008.7708358764648,
"epoch": 0.9081272084805654,
"grad_norm": 0.183290959725611,
"kl": 7.634765625,
"learning_rate": 5.126289245122906e-07,
"loss": 0.3054,
"reward": 1.1549479588866234,
"reward_std": 0.3895051181316376,
"rewards/accuracy_reward": 0.27929688058793545,
"rewards/format_reward": 0.8756510578095913,
"step": 257
},
{
"completion_length": 1014.6875,
"epoch": 0.911660777385159,
"grad_norm": 0.14993091508284867,
"kl": 7.6953125,
"learning_rate": 4.7426301675554285e-07,
"loss": 0.3077,
"reward": 1.1523437835276127,
"reward_std": 0.343139311298728,
"rewards/accuracy_reward": 0.261718756519258,
"rewards/format_reward": 0.8906250186264515,
"step": 258
},
{
"completion_length": 1007.9791679382324,
"epoch": 0.9151943462897526,
"grad_norm": 0.1374641960302751,
"kl": 7.583984375,
"learning_rate": 4.3735432979872593e-07,
"loss": 0.3032,
"reward": 1.1640625447034836,
"reward_std": 0.37700783647596836,
"rewards/accuracy_reward": 0.27799479849636555,
"rewards/format_reward": 0.8860677294433117,
"step": 259
},
{
"completion_length": 1006.0,
"epoch": 0.9187279151943463,
"grad_norm": 0.17289466011678567,
"kl": 7.734375,
"learning_rate": 4.019085098303077e-07,
"loss": 0.3092,
"reward": 1.1796875298023224,
"reward_std": 0.3705375073477626,
"rewards/accuracy_reward": 0.3046875074505806,
"rewards/format_reward": 0.8750000111758709,
"step": 260
},
{
"completion_length": 1020.5625,
"epoch": 0.9222614840989399,
"grad_norm": 0.2504365893065849,
"kl": 7.763671875,
"learning_rate": 3.679309792534291e-07,
"loss": 0.3107,
"reward": 1.1927083730697632,
"reward_std": 0.36883932538330555,
"rewards/accuracy_reward": 0.30338542722165585,
"rewards/format_reward": 0.8893229402601719,
"step": 261
},
{
"completion_length": 1024.0,
"epoch": 0.9257950530035336,
"grad_norm": 0.15980023469329024,
"kl": 7.708984375,
"learning_rate": 3.354269358563966e-07,
"loss": 0.3083,
"reward": 1.1666666939854622,
"reward_std": 0.3717129658907652,
"rewards/accuracy_reward": 0.28125000838190317,
"rewards/format_reward": 0.8854166828095913,
"step": 262
},
{
"completion_length": 1024.0,
"epoch": 0.9293286219081273,
"grad_norm": 0.14673263922185112,
"kl": 7.51171875,
"learning_rate": 3.044013520175337e-07,
"loss": 0.3004,
"reward": 1.1158854588866234,
"reward_std": 0.39931169617921114,
"rewards/accuracy_reward": 0.2441406324505806,
"rewards/format_reward": 0.8717448115348816,
"step": 263
},
{
"completion_length": 1024.0,
"epoch": 0.9328621908127208,
"grad_norm": 0.21979625485038595,
"kl": 7.701171875,
"learning_rate": 2.7485897394453067e-07,
"loss": 0.308,
"reward": 1.1269531771540642,
"reward_std": 0.3809357853606343,
"rewards/accuracy_reward": 0.23763021733611822,
"rewards/format_reward": 0.8893229328095913,
"step": 264
},
{
"completion_length": 999.4583358764648,
"epoch": 0.9363957597173145,
"grad_norm": 0.1678262537397904,
"kl": 7.7421875,
"learning_rate": 2.4680432094837394e-07,
"loss": 0.3099,
"reward": 1.1510417014360428,
"reward_std": 0.3365847198292613,
"rewards/accuracy_reward": 0.2623697994276881,
"rewards/format_reward": 0.8886718973517418,
"step": 265
},
{
"completion_length": 1024.0,
"epoch": 0.9399293286219081,
"grad_norm": 0.17657969817520183,
"kl": 7.6640625,
"learning_rate": 2.2024168475199615e-07,
"loss": 0.3068,
"reward": 1.125000037252903,
"reward_std": 0.3468599859625101,
"rewards/accuracy_reward": 0.25195313058793545,
"rewards/format_reward": 0.8730468899011612,
"step": 266
},
{
"completion_length": 1024.0,
"epoch": 0.9434628975265018,
"grad_norm": 0.16463993549487405,
"kl": 7.640625,
"learning_rate": 1.9517512883374667e-07,
"loss": 0.3057,
"reward": 1.1464844048023224,
"reward_std": 0.38465187326073647,
"rewards/accuracy_reward": 0.26106771687045693,
"rewards/format_reward": 0.8854166828095913,
"step": 267
},
{
"completion_length": 1024.0,
"epoch": 0.9469964664310954,
"grad_norm": 0.1836439792441492,
"kl": 7.57421875,
"learning_rate": 1.7160848780576334e-07,
"loss": 0.303,
"reward": 1.1764323264360428,
"reward_std": 0.39003968983888626,
"rewards/accuracy_reward": 0.29296875838190317,
"rewards/format_reward": 0.8834635689854622,
"step": 268
},
{
"completion_length": 1003.2916679382324,
"epoch": 0.950530035335689,
"grad_norm": 0.20059017619611783,
"kl": 7.5078125,
"learning_rate": 1.495453668273672e-07,
"loss": 0.3002,
"reward": 1.149088565260172,
"reward_std": 0.3729328028857708,
"rewards/accuracy_reward": 0.27408855129033327,
"rewards/format_reward": 0.8750000260770321,
"step": 269
},
{
"completion_length": 1002.9375038146973,
"epoch": 0.9540636042402827,
"grad_norm": 0.1818614489805749,
"kl": 7.669921875,
"learning_rate": 1.289891410535593e-07,
"loss": 0.3069,
"reward": 1.1751302555203438,
"reward_std": 0.3694411441683769,
"rewards/accuracy_reward": 0.28190105129033327,
"rewards/format_reward": 0.8932291865348816,
"step": 270
},
{
"completion_length": 1024.0,
"epoch": 0.9575971731448764,
"grad_norm": 0.1417185938823282,
"kl": 7.783203125,
"learning_rate": 1.0994295511869257e-07,
"loss": 0.3114,
"reward": 1.164713580161333,
"reward_std": 0.3704876583069563,
"rewards/accuracy_reward": 0.28385417629033327,
"rewards/format_reward": 0.8808593973517418,
"step": 271
},
{
"completion_length": 1007.5833358764648,
"epoch": 0.9611307420494699,
"grad_norm": 0.1335404235268344,
"kl": 7.65234375,
"learning_rate": 9.240972265541992e-08,
"loss": 0.3061,
"reward": 1.136718787252903,
"reward_std": 0.33945256378501654,
"rewards/accuracy_reward": 0.25325521687045693,
"rewards/format_reward": 0.8834635652601719,
"step": 272
},
{
"completion_length": 995.3958358764648,
"epoch": 0.9646643109540636,
"grad_norm": 0.16627237105129372,
"kl": 7.748046875,
"learning_rate": 7.639212584897082e-08,
"loss": 0.3097,
"reward": 1.1458333767950535,
"reward_std": 0.36959288641810417,
"rewards/accuracy_reward": 0.2591145895421505,
"rewards/format_reward": 0.8867187611758709,
"step": 273
},
{
"completion_length": 977.8541717529297,
"epoch": 0.9681978798586572,
"grad_norm": 0.1442104843092151,
"kl": 7.8046875,
"learning_rate": 6.189261502683619e-08,
"loss": 0.3121,
"reward": 1.1608073338866234,
"reward_std": 0.33501617051661015,
"rewards/accuracy_reward": 0.26302084047347307,
"rewards/format_reward": 0.897786483168602,
"step": 274
},
{
"completion_length": 1005.8333358764648,
"epoch": 0.9717314487632509,
"grad_norm": 0.1335599188895197,
"kl": 7.734375,
"learning_rate": 4.8913408283934874e-08,
"loss": 0.3094,
"reward": 1.1829427480697632,
"reward_std": 0.3483387678861618,
"rewards/accuracy_reward": 0.2923177173361182,
"rewards/format_reward": 0.8906250149011612,
"step": 275
},
{
"completion_length": 986.5833358764648,
"epoch": 0.9752650176678446,
"grad_norm": 0.1566671460322668,
"kl": 7.9140625,
"learning_rate": 3.745649114328065e-08,
"loss": 0.3167,
"reward": 1.2102865017950535,
"reward_std": 0.35581814870238304,
"rewards/accuracy_reward": 0.305338547565043,
"rewards/format_reward": 0.9049479402601719,
"step": 276
},
{
"completion_length": 1024.0,
"epoch": 0.9787985865724381,
"grad_norm": 0.18267938124865138,
"kl": 7.66796875,
"learning_rate": 2.7523616252252972e-08,
"loss": 0.3068,
"reward": 1.1458333730697632,
"reward_std": 0.372573995962739,
"rewards/accuracy_reward": 0.2617187579162419,
"rewards/format_reward": 0.8841146044433117,
"step": 277
},
{
"completion_length": 990.7500038146973,
"epoch": 0.9823321554770318,
"grad_norm": 0.21259523984138215,
"kl": 7.6875,
"learning_rate": 1.9116303114480316e-08,
"loss": 0.3076,
"reward": 1.1731771230697632,
"reward_std": 0.3659754488617182,
"rewards/accuracy_reward": 0.28971354803070426,
"rewards/format_reward": 0.8834635689854622,
"step": 278
},
{
"completion_length": 1012.2291679382324,
"epoch": 0.9858657243816255,
"grad_norm": 0.16162769674837876,
"kl": 7.580078125,
"learning_rate": 1.2235837857387246e-08,
"loss": 0.3032,
"reward": 1.130859412252903,
"reward_std": 0.3709502723067999,
"rewards/accuracy_reward": 0.2500000074505806,
"rewards/format_reward": 0.8808593973517418,
"step": 279
},
{
"completion_length": 1024.0,
"epoch": 0.9893992932862191,
"grad_norm": 0.1297894260306723,
"kl": 7.84375,
"learning_rate": 6.883273035447335e-09,
"loss": 0.3136,
"reward": 1.1634114906191826,
"reward_std": 0.3336097300052643,
"rewards/accuracy_reward": 0.27539063431322575,
"rewards/format_reward": 0.888020858168602,
"step": 280
},
{
"completion_length": 1024.0,
"epoch": 0.9929328621908127,
"grad_norm": 0.1521970740888536,
"kl": 7.67578125,
"learning_rate": 3.0594274691686522e-09,
"loss": 0.3069,
"reward": 1.1373698264360428,
"reward_std": 0.35927175264805555,
"rewards/accuracy_reward": 0.258463550824672,
"rewards/format_reward": 0.8789062723517418,
"step": 281
},
{
"completion_length": 992.9791679382324,
"epoch": 0.9964664310954063,
"grad_norm": 0.19083083078751598,
"kl": 7.509765625,
"learning_rate": 7.648861198306101e-10,
"loss": 0.3004,
"reward": 1.164713580161333,
"reward_std": 0.38056557066738605,
"rewards/accuracy_reward": 0.299479179084301,
"rewards/format_reward": 0.8652343973517418,
"step": 282
},
{
"completion_length": 1004.0,
"epoch": 1.0,
"grad_norm": 0.22334310417649858,
"kl": 7.740234375,
"learning_rate": 0.0,
"loss": 0.3095,
"reward": 1.1516927778720856,
"reward_std": 0.36419933661818504,
"rewards/accuracy_reward": 0.26171875558793545,
"rewards/format_reward": 0.889973983168602,
"step": 283
},
{
"epoch": 1.0,
"step": 283,
"total_flos": 0.0,
"train_loss": 3.28601383127165,
"train_runtime": 57853.0435,
"train_samples_per_second": 1.252,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1,
"max_steps": 283,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}