{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 283, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 387.71745681762695, "epoch": 0.0035335689045936395, "grad_norm": 0.7826879124381357, "kl": 0.0, "learning_rate": 6.896551724137931e-07, "loss": 0.0, "reward": 0.6354166809469461, "reward_std": 0.4388374499976635, "rewards/accuracy_reward": 0.16276042070239782, "rewards/format_reward": 0.4726562611758709, "step": 1 }, { "completion_length": 405.3405055999756, "epoch": 0.007067137809187279, "grad_norm": 1.1927588816666794, "kl": 0.0, "learning_rate": 1.3793103448275862e-06, "loss": 0.0, "reward": 0.6035156473517418, "reward_std": 0.4155316762626171, "rewards/accuracy_reward": 0.1523437537252903, "rewards/format_reward": 0.4511718861758709, "step": 2 }, { "completion_length": 409.28972244262695, "epoch": 0.01060070671378092, "grad_norm": 0.6905561253532118, "kl": 0.00020521879196166992, "learning_rate": 2.0689655172413796e-06, "loss": 0.0, "reward": 0.5996093954890966, "reward_std": 0.4422223027795553, "rewards/accuracy_reward": 0.15234375465661287, "rewards/format_reward": 0.4472656361758709, "step": 3 }, { "completion_length": 410.4349060058594, "epoch": 0.014134275618374558, "grad_norm": 0.7884781255286112, "kl": 0.00036644935607910156, "learning_rate": 2.7586206896551725e-06, "loss": 0.0, "reward": 0.6054687723517418, "reward_std": 0.4260980412364006, "rewards/accuracy_reward": 0.14192708721384406, "rewards/format_reward": 0.46354168467223644, "step": 4 }, { "completion_length": 346.99024391174316, "epoch": 0.0176678445229682, "grad_norm": 0.5273525890826313, "kl": 0.0029783248901367188, "learning_rate": 3.448275862068966e-06, "loss": 0.0001, "reward": 0.7408854402601719, "reward_std": 0.40636622719466686, "rewards/accuracy_reward": 0.11979167081881315, "rewards/format_reward": 0.6210937686264515, "step": 5 }, { "completion_length": 325.3359498977661, "epoch": 0.02120141342756184, "grad_norm": 0.46274180489878297, "kl": 0.01139068603515625, "learning_rate": 4.137931034482759e-06, "loss": 0.0005, "reward": 0.7929687649011612, "reward_std": 0.4467017278075218, "rewards/accuracy_reward": 0.1158854195382446, "rewards/format_reward": 0.6770833507180214, "step": 6 }, { "completion_length": 227.93359851837158, "epoch": 0.024734982332155476, "grad_norm": 53.42434025598496, "kl": 1.0528564453125, "learning_rate": 4.8275862068965525e-06, "loss": 0.042, "reward": 0.9694010689854622, "reward_std": 0.24079907592386007, "rewards/accuracy_reward": 0.06445312686264515, "rewards/format_reward": 0.9049479439854622, "step": 7 }, { "completion_length": 202.09896278381348, "epoch": 0.028268551236749116, "grad_norm": 19.059740265959093, "kl": 0.4888916015625, "learning_rate": 5.517241379310345e-06, "loss": 0.0195, "reward": 0.9843750298023224, "reward_std": 0.24569615349173546, "rewards/accuracy_reward": 0.06380208441987634, "rewards/format_reward": 0.9205729328095913, "step": 8 }, { "completion_length": 183.13867664337158, "epoch": 0.03180212014134275, "grad_norm": 1.481224853303734, "kl": 0.0942840576171875, "learning_rate": 6.206896551724138e-06, "loss": 0.0038, "reward": 1.023437526077032, "reward_std": 0.236824631690979, "rewards/accuracy_reward": 0.09309896151535213, "rewards/format_reward": 0.9303385615348816, "step": 9 }, { "completion_length": 175.65234756469727, "epoch": 0.0353356890459364, "grad_norm": 0.5566746911844828, "kl": 0.0570068359375, "learning_rate": 6.896551724137932e-06, "loss": 0.0023, "reward": 1.0058594197034836, "reward_std": 0.20571813452988863, "rewards/accuracy_reward": 0.0716145855258219, "rewards/format_reward": 0.934244804084301, "step": 10 }, { "completion_length": 173.4589900970459, "epoch": 0.038869257950530034, "grad_norm": 0.3466466544445655, "kl": 0.0336151123046875, "learning_rate": 7.586206896551724e-06, "loss": 0.0013, "reward": 1.0175781473517418, "reward_std": 0.21056926436722279, "rewards/accuracy_reward": 0.07291666802484542, "rewards/format_reward": 0.9446614757180214, "step": 11 }, { "completion_length": 167.7506561279297, "epoch": 0.04240282685512368, "grad_norm": 0.31983345914206196, "kl": 0.042755126953125, "learning_rate": 8.275862068965518e-06, "loss": 0.0017, "reward": 1.0455729477107525, "reward_std": 0.20765206310898066, "rewards/accuracy_reward": 0.09375000174622983, "rewards/format_reward": 0.9518229328095913, "step": 12 }, { "completion_length": 170.16797399520874, "epoch": 0.045936395759717315, "grad_norm": 0.36163723632628936, "kl": 0.0521392822265625, "learning_rate": 8.965517241379312e-06, "loss": 0.0021, "reward": 1.0390625335276127, "reward_std": 0.22777050640434027, "rewards/accuracy_reward": 0.09244792052777484, "rewards/format_reward": 0.9466146044433117, "step": 13 }, { "completion_length": 158.57943153381348, "epoch": 0.04946996466431095, "grad_norm": 0.3520770926699356, "kl": 0.0601348876953125, "learning_rate": 9.655172413793105e-06, "loss": 0.0024, "reward": 1.0709635838866234, "reward_std": 0.25874905101954937, "rewards/accuracy_reward": 0.13020833674818277, "rewards/format_reward": 0.9407552257180214, "step": 14 }, { "completion_length": 118.45377922058105, "epoch": 0.053003533568904596, "grad_norm": 0.35628917266654875, "kl": 0.10369873046875, "learning_rate": 1.0344827586206898e-05, "loss": 0.0041, "reward": 1.0696614794433117, "reward_std": 0.21860592905431986, "rewards/accuracy_reward": 0.11132812919095159, "rewards/format_reward": 0.958333358168602, "step": 15 }, { "completion_length": 81.11849284172058, "epoch": 0.05653710247349823, "grad_norm": 1.1321068685550928, "kl": 0.23651123046875, "learning_rate": 1.103448275862069e-05, "loss": 0.0095, "reward": 1.0891927629709244, "reward_std": 0.16596419550478458, "rewards/accuracy_reward": 0.10677083791233599, "rewards/format_reward": 0.9824218899011612, "step": 16 }, { "completion_length": 116.46028995513916, "epoch": 0.06007067137809187, "grad_norm": 0.4979444532314162, "kl": 0.111053466796875, "learning_rate": 1.1724137931034483e-05, "loss": 0.0044, "reward": 1.091796912252903, "reward_std": 0.20691483141854405, "rewards/accuracy_reward": 0.12630208651535213, "rewards/format_reward": 0.9654948078095913, "step": 17 }, { "completion_length": 106.39062833786011, "epoch": 0.0636042402826855, "grad_norm": 0.39934788591997045, "kl": 0.13848876953125, "learning_rate": 1.2413793103448277e-05, "loss": 0.0055, "reward": 1.1243490055203438, "reward_std": 0.23089734092354774, "rewards/accuracy_reward": 0.15299479360692203, "rewards/format_reward": 0.9713541902601719, "step": 18 }, { "completion_length": 116.17318058013916, "epoch": 0.06713780918727916, "grad_norm": 0.40747455110093217, "kl": 0.15606689453125, "learning_rate": 1.310344827586207e-05, "loss": 0.0062, "reward": 1.1321614980697632, "reward_std": 0.23220197623595595, "rewards/accuracy_reward": 0.15625000465661287, "rewards/format_reward": 0.9759114794433117, "step": 19 }, { "completion_length": 109.37304925918579, "epoch": 0.0706713780918728, "grad_norm": 0.38360455679162203, "kl": 0.2166748046875, "learning_rate": 1.3793103448275863e-05, "loss": 0.0087, "reward": 1.1816406697034836, "reward_std": 0.2594331307336688, "rewards/accuracy_reward": 0.20182292396202683, "rewards/format_reward": 0.9798177219927311, "step": 20 }, { "completion_length": 106.66992425918579, "epoch": 0.07420494699646643, "grad_norm": 0.4520249358225062, "kl": 0.266357421875, "learning_rate": 1.4482758620689657e-05, "loss": 0.0107, "reward": 1.1595052555203438, "reward_std": 0.24440898094326258, "rewards/accuracy_reward": 0.17773437732830644, "rewards/format_reward": 0.9817708544433117, "step": 21 }, { "completion_length": 140.5937557220459, "epoch": 0.07773851590106007, "grad_norm": 1.4718976020696355, "kl": 0.232421875, "learning_rate": 1.5172413793103448e-05, "loss": 0.0093, "reward": 1.199869841337204, "reward_std": 0.2511229431256652, "rewards/accuracy_reward": 0.2174479211680591, "rewards/format_reward": 0.9824219010770321, "step": 22 }, { "completion_length": 186.76562976837158, "epoch": 0.0812720848056537, "grad_norm": 0.9576903316083716, "kl": 0.173370361328125, "learning_rate": 1.586206896551724e-05, "loss": 0.0069, "reward": 1.2298177480697632, "reward_std": 0.26364279724657536, "rewards/accuracy_reward": 0.24869792629033327, "rewards/format_reward": 0.9811198152601719, "step": 23 }, { "completion_length": 189.61133193969727, "epoch": 0.08480565371024736, "grad_norm": 0.3064098864277374, "kl": 0.15606689453125, "learning_rate": 1.6551724137931037e-05, "loss": 0.0062, "reward": 1.2526042088866234, "reward_std": 0.27888874523341656, "rewards/accuracy_reward": 0.28125000931322575, "rewards/format_reward": 0.9713541828095913, "step": 24 }, { "completion_length": 179.1979217529297, "epoch": 0.08833922261484099, "grad_norm": 0.4424785145099648, "kl": 0.168243408203125, "learning_rate": 1.7241379310344828e-05, "loss": 0.0067, "reward": 1.189453162252903, "reward_std": 0.2696635592728853, "rewards/accuracy_reward": 0.22526042349636555, "rewards/format_reward": 0.9641927294433117, "step": 25 }, { "completion_length": 220.5813865661621, "epoch": 0.09187279151943463, "grad_norm": 0.2927466463526939, "kl": 0.117034912109375, "learning_rate": 1.7931034482758623e-05, "loss": 0.0047, "reward": 1.221354216337204, "reward_std": 0.28838597796857357, "rewards/accuracy_reward": 0.25520834047347307, "rewards/format_reward": 0.9661458507180214, "step": 26 }, { "completion_length": 218.9602918624878, "epoch": 0.09540636042402827, "grad_norm": 0.3147261189039069, "kl": 0.1197509765625, "learning_rate": 1.8620689655172415e-05, "loss": 0.0048, "reward": 1.2246094197034836, "reward_std": 0.2890056548640132, "rewards/accuracy_reward": 0.26236980129033327, "rewards/format_reward": 0.962239608168602, "step": 27 }, { "completion_length": 192.85547256469727, "epoch": 0.0989399293286219, "grad_norm": 0.2815314878632684, "kl": 0.147918701171875, "learning_rate": 1.931034482758621e-05, "loss": 0.0059, "reward": 1.2167969048023224, "reward_std": 0.29042986780405045, "rewards/accuracy_reward": 0.2584635470993817, "rewards/format_reward": 0.9583333544433117, "step": 28 }, { "completion_length": 178.49284267425537, "epoch": 0.10247349823321555, "grad_norm": 0.32426980772846276, "kl": 0.1710205078125, "learning_rate": 2e-05, "loss": 0.0068, "reward": 1.2174479514360428, "reward_std": 0.28664571419358253, "rewards/accuracy_reward": 0.26627604849636555, "rewards/format_reward": 0.9511718936264515, "step": 29 }, { "completion_length": 132.87891054153442, "epoch": 0.10600706713780919, "grad_norm": 0.3462539007364067, "kl": 0.24053955078125, "learning_rate": 1.999923511388017e-05, "loss": 0.0096, "reward": 1.2102864906191826, "reward_std": 0.2935393461957574, "rewards/accuracy_reward": 0.2519531361758709, "rewards/format_reward": 0.9583333544433117, "step": 30 }, { "completion_length": 104.59570646286011, "epoch": 0.10954063604240283, "grad_norm": 0.31757187279941806, "kl": 0.29840087890625, "learning_rate": 1.999694057253083e-05, "loss": 0.0119, "reward": 1.2070312798023224, "reward_std": 0.2599523845128715, "rewards/accuracy_reward": 0.24153646733611822, "rewards/format_reward": 0.9654948115348816, "step": 31 }, { "completion_length": 122.79622745513916, "epoch": 0.11307420494699646, "grad_norm": 0.29516858001336377, "kl": 0.289306640625, "learning_rate": 1.9993116726964554e-05, "loss": 0.0116, "reward": 1.1855469048023224, "reward_std": 0.2488250662572682, "rewards/accuracy_reward": 0.2220052140764892, "rewards/format_reward": 0.9635416865348816, "step": 32 }, { "completion_length": 169.53385829925537, "epoch": 0.1166077738515901, "grad_norm": 0.2854397155723977, "kl": 0.218505859375, "learning_rate": 1.9987764162142615e-05, "loss": 0.0087, "reward": 1.2135417088866234, "reward_std": 0.27537838369607925, "rewards/accuracy_reward": 0.25130209513008595, "rewards/format_reward": 0.9622396044433117, "step": 33 }, { "completion_length": 219.41602230072021, "epoch": 0.12014134275618374, "grad_norm": 0.22474385235022273, "kl": 0.1595458984375, "learning_rate": 1.998088369688552e-05, "loss": 0.0064, "reward": 1.214843787252903, "reward_std": 0.31507682241499424, "rewards/accuracy_reward": 0.2766927136108279, "rewards/format_reward": 0.9381510615348816, "step": 34 }, { "completion_length": 274.76563453674316, "epoch": 0.12367491166077739, "grad_norm": 1452525.1386107916, "kl": 21248.131378173828, "learning_rate": 1.9972476383747748e-05, "loss": 851.4882, "reward": 1.1894531697034836, "reward_std": 0.3239498296752572, "rewards/accuracy_reward": 0.2558593829162419, "rewards/format_reward": 0.9335937686264515, "step": 35 }, { "completion_length": 315.4687614440918, "epoch": 0.127208480565371, "grad_norm": 4.918800245024653, "kl": 0.221282958984375, "learning_rate": 1.9962543508856722e-05, "loss": 0.0088, "reward": 1.212890662252903, "reward_std": 0.3367620576173067, "rewards/accuracy_reward": 0.29817709140479565, "rewards/format_reward": 0.9147135615348816, "step": 36 }, { "completion_length": 308.2695417404175, "epoch": 0.13074204946996468, "grad_norm": 0.2301169380333895, "kl": 0.126373291015625, "learning_rate": 1.995108659171607e-05, "loss": 0.0051, "reward": 1.2324219271540642, "reward_std": 0.31519000325351954, "rewards/accuracy_reward": 0.28971354849636555, "rewards/format_reward": 0.9427083544433117, "step": 37 }, { "completion_length": 287.9537887573242, "epoch": 0.13427561837455831, "grad_norm": 0.20139063038206165, "kl": 0.11163330078125, "learning_rate": 1.9938107384973165e-05, "loss": 0.0045, "reward": 1.2350260764360428, "reward_std": 0.2919177133589983, "rewards/accuracy_reward": 0.26888021547347307, "rewards/format_reward": 0.966145858168602, "step": 38 }, { "completion_length": 288.69141578674316, "epoch": 0.13780918727915195, "grad_norm": 0.18498422915016208, "kl": 0.1121826171875, "learning_rate": 1.992360787415103e-05, "loss": 0.0045, "reward": 1.2382812947034836, "reward_std": 0.2696660226210952, "rewards/accuracy_reward": 0.270182297565043, "rewards/format_reward": 0.9680989757180214, "step": 39 }, { "completion_length": 285.7558660507202, "epoch": 0.1413427561837456, "grad_norm": 0.18156169712553022, "kl": 0.112762451171875, "learning_rate": 1.9907590277344582e-05, "loss": 0.0045, "reward": 1.2604167014360428, "reward_std": 0.23935140296816826, "rewards/accuracy_reward": 0.2799479244276881, "rewards/format_reward": 0.9804687649011612, "step": 40 }, { "completion_length": 274.0761842727661, "epoch": 0.14487632508833923, "grad_norm": 0.2080024626531214, "kl": 0.1217041015625, "learning_rate": 1.9890057044881308e-05, "loss": 0.0049, "reward": 1.2936198338866234, "reward_std": 0.2932386351749301, "rewards/accuracy_reward": 0.32031251210719347, "rewards/format_reward": 0.9733073152601719, "step": 41 }, { "completion_length": 282.8776149749756, "epoch": 0.14840989399293286, "grad_norm": 0.23856185666588572, "kl": 0.125213623046875, "learning_rate": 1.9871010858946443e-05, "loss": 0.005, "reward": 1.2753906697034836, "reward_std": 0.2589658652432263, "rewards/accuracy_reward": 0.3033854253590107, "rewards/format_reward": 0.972005233168602, "step": 42 }, { "completion_length": 296.82422733306885, "epoch": 0.1519434628975265, "grad_norm": 0.2858858078510366, "kl": 0.140716552734375, "learning_rate": 1.9850454633172632e-05, "loss": 0.0056, "reward": 1.265625037252903, "reward_std": 0.3042640471830964, "rewards/accuracy_reward": 0.3138020960614085, "rewards/format_reward": 0.951822929084301, "step": 43 }, { "completion_length": 312.989595413208, "epoch": 0.15547703180212014, "grad_norm": 75.43924110056017, "kl": 1.633636474609375, "learning_rate": 1.982839151219424e-05, "loss": 0.0654, "reward": 1.2662760615348816, "reward_std": 0.3203592775389552, "rewards/accuracy_reward": 0.32617188431322575, "rewards/format_reward": 0.9401041865348816, "step": 44 }, { "completion_length": 318.19271659851074, "epoch": 0.15901060070671377, "grad_norm": 4.839307387093032, "kl": 0.45843505859375, "learning_rate": 1.9804824871166254e-05, "loss": 0.0183, "reward": 1.2480469271540642, "reward_std": 0.37619344517588615, "rewards/accuracy_reward": 0.3489583432674408, "rewards/format_reward": 0.8990885615348816, "step": 45 }, { "completion_length": 360.181001663208, "epoch": 0.1625441696113074, "grad_norm": 1.6956048738668972, "kl": 0.27557373046875, "learning_rate": 1.9779758315248006e-05, "loss": 0.011, "reward": 1.2226562947034836, "reward_std": 0.3949108961969614, "rewards/accuracy_reward": 0.3483073003590107, "rewards/format_reward": 0.874348983168602, "step": 46 }, { "completion_length": 365.8092555999756, "epoch": 0.16607773851590105, "grad_norm": 201.83615153266913, "kl": 12.43408203125, "learning_rate": 1.975319567905163e-05, "loss": 0.4973, "reward": 1.1972656659781933, "reward_std": 0.4045752976089716, "rewards/accuracy_reward": 0.345703131519258, "rewards/format_reward": 0.8515625186264515, "step": 47 }, { "completion_length": 342.6002674102783, "epoch": 0.1696113074204947, "grad_norm": 5.398450236245697, "kl": 0.656494140625, "learning_rate": 1.9725141026055473e-05, "loss": 0.0263, "reward": 1.2246094085276127, "reward_std": 0.4386922810226679, "rewards/accuracy_reward": 0.37304688431322575, "rewards/format_reward": 0.8515625186264515, "step": 48 }, { "completion_length": 354.3912830352783, "epoch": 0.17314487632508835, "grad_norm": 6.492613210824359, "kl": 0.2779541015625, "learning_rate": 1.9695598647982467e-05, "loss": 0.0111, "reward": 1.1972656697034836, "reward_std": 0.44870651699602604, "rewards/accuracy_reward": 0.3587239682674408, "rewards/format_reward": 0.8385416902601719, "step": 49 }, { "completion_length": 317.2200622558594, "epoch": 0.17667844522968199, "grad_norm": 6.9054290512934475, "kl": 0.31793212890625, "learning_rate": 1.9664573064143604e-05, "loss": 0.0127, "reward": 1.148437537252903, "reward_std": 0.4271644949913025, "rewards/accuracy_reward": 0.3033854244276881, "rewards/format_reward": 0.8450521044433117, "step": 50 }, { "completion_length": 291.3737087249756, "epoch": 0.18021201413427562, "grad_norm": 1.295034459732409, "kl": 0.384765625, "learning_rate": 1.9632069020746574e-05, "loss": 0.0154, "reward": 1.1966146305203438, "reward_std": 0.41263195499777794, "rewards/accuracy_reward": 0.34179688338190317, "rewards/format_reward": 0.8548177257180214, "step": 51 }, { "completion_length": 242.330735206604, "epoch": 0.18374558303886926, "grad_norm": 44.75372246484856, "kl": 3.65264892578125, "learning_rate": 1.9598091490169696e-05, "loss": 0.1463, "reward": 1.1809896305203438, "reward_std": 0.3946582209318876, "rewards/accuracy_reward": 0.31445313477888703, "rewards/format_reward": 0.8665364757180214, "step": 52 }, { "completion_length": 196.8216199874878, "epoch": 0.1872791519434629, "grad_norm": 4.161042131383654, "kl": 0.65478515625, "learning_rate": 1.9562645670201278e-05, "loss": 0.0262, "reward": 1.2526041865348816, "reward_std": 0.3336602235212922, "rewards/accuracy_reward": 0.3313802145421505, "rewards/format_reward": 0.9212239682674408, "step": 53 }, { "completion_length": 176.54818201065063, "epoch": 0.19081272084805653, "grad_norm": 0.2979202087235891, "kl": 0.368896484375, "learning_rate": 1.9525736983244458e-05, "loss": 0.0148, "reward": 1.2096354514360428, "reward_std": 0.32884097658097744, "rewards/accuracy_reward": 0.29687500884756446, "rewards/format_reward": 0.912760429084301, "step": 54 }, { "completion_length": 166.145188331604, "epoch": 0.19434628975265017, "grad_norm": 1.099328042290411, "kl": 0.488525390625, "learning_rate": 1.948737107548771e-05, "loss": 0.0195, "reward": 1.263671912252903, "reward_std": 0.3232028791680932, "rewards/accuracy_reward": 0.34700521919876337, "rewards/format_reward": 0.9166666865348816, "step": 55 }, { "completion_length": 201.4856834411621, "epoch": 0.1978798586572438, "grad_norm": 0.5237539826150962, "kl": 0.37255859375, "learning_rate": 1.94475538160411e-05, "loss": 0.0149, "reward": 1.2526042014360428, "reward_std": 0.32452640403062105, "rewards/accuracy_reward": 0.3457031324505806, "rewards/format_reward": 0.906901054084301, "step": 56 }, { "completion_length": 223.5299530029297, "epoch": 0.20141342756183744, "grad_norm": 1.0789636117575958, "kl": 0.4588623046875, "learning_rate": 1.940629129603844e-05, "loss": 0.0183, "reward": 1.2089843973517418, "reward_std": 0.367857669480145, "rewards/accuracy_reward": 0.3307291744276881, "rewards/format_reward": 0.8782552294433117, "step": 57 }, { "completion_length": 236.89389038085938, "epoch": 0.2049469964664311, "grad_norm": 0.9522159379621601, "kl": 0.476318359375, "learning_rate": 1.9363589827705494e-05, "loss": 0.0191, "reward": 1.227213591337204, "reward_std": 0.39456650614738464, "rewards/accuracy_reward": 0.34049480501562357, "rewards/format_reward": 0.8867187611758709, "step": 58 }, { "completion_length": 218.1360740661621, "epoch": 0.20848056537102475, "grad_norm": 1.264236430534227, "kl": 0.8280029296875, "learning_rate": 1.9319455943394347e-05, "loss": 0.0331, "reward": 1.2571614980697632, "reward_std": 0.3343982622027397, "rewards/accuracy_reward": 0.35286459513008595, "rewards/format_reward": 0.9042969010770321, "step": 59 }, { "completion_length": 185.6367244720459, "epoch": 0.21201413427561838, "grad_norm": 111.38995544077012, "kl": 7.3544921875, "learning_rate": 1.9273896394584103e-05, "loss": 0.2946, "reward": 1.272786483168602, "reward_std": 0.3291959064081311, "rewards/accuracy_reward": 0.35221355222165585, "rewards/format_reward": 0.9205729365348816, "step": 60 }, { "completion_length": 197.7513074874878, "epoch": 0.21554770318021202, "grad_norm": 1.9921896449669312, "kl": 0.7696533203125, "learning_rate": 1.9226918150848067e-05, "loss": 0.0308, "reward": 1.2643229588866234, "reward_std": 0.32842374220490456, "rewards/accuracy_reward": 0.3457031324505806, "rewards/format_reward": 0.9186198115348816, "step": 61 }, { "completion_length": 255.86784744262695, "epoch": 0.21908127208480566, "grad_norm": 2.1340639274317144, "kl": 0.924560546875, "learning_rate": 1.9178528398787553e-05, "loss": 0.037, "reward": 1.1725260838866234, "reward_std": 0.4286086466163397, "rewards/accuracy_reward": 0.31575521547347307, "rewards/format_reward": 0.8567708544433117, "step": 62 }, { "completion_length": 175.7168025970459, "epoch": 0.2226148409893993, "grad_norm": 7.908467450030422, "kl": 1.04931640625, "learning_rate": 1.9128734540932494e-05, "loss": 0.042, "reward": 1.2838541865348816, "reward_std": 0.31748174503445625, "rewards/accuracy_reward": 0.34765625838190317, "rewards/format_reward": 0.9361979365348816, "step": 63 }, { "completion_length": 145.36784315109253, "epoch": 0.22614840989399293, "grad_norm": 5066.629420414826, "kl": 549.625, "learning_rate": 1.907754419460904e-05, "loss": 22.0063, "reward": 1.3138021156191826, "reward_std": 0.25937482714653015, "rewards/accuracy_reward": 0.3567708423361182, "rewards/format_reward": 0.9570312723517418, "step": 64 }, { "completion_length": 134.21419763565063, "epoch": 0.22968197879858657, "grad_norm": 73.69279102025739, "kl": 7.2021484375, "learning_rate": 1.9024965190774262e-05, "loss": 0.2879, "reward": 1.2675781548023224, "reward_std": 0.2847044528461993, "rewards/accuracy_reward": 0.32226563058793545, "rewards/format_reward": 0.9453125260770321, "step": 65 }, { "completion_length": 295.8131628036499, "epoch": 0.2332155477031802, "grad_norm": 15.351766639697455, "kl": 1.186279296875, "learning_rate": 1.8971005572818213e-05, "loss": 0.0474, "reward": 1.1197917014360428, "reward_std": 0.4257570914924145, "rewards/accuracy_reward": 0.285807297565043, "rewards/format_reward": 0.8339843899011612, "step": 66 }, { "completion_length": 483.8196773529053, "epoch": 0.23674911660777384, "grad_norm": 595.1397215741732, "kl": 64.0, "learning_rate": 1.8915673595333443e-05, "loss": 2.5615, "reward": 0.9531250223517418, "reward_std": 0.5964761041104794, "rewards/accuracy_reward": 0.30403646640479565, "rewards/format_reward": 0.649088554084301, "step": 67 }, { "completion_length": 788.6002807617188, "epoch": 0.24028268551236748, "grad_norm": 50.44103352403159, "kl": 3.42041015625, "learning_rate": 1.8858977722852273e-05, "loss": 0.1367, "reward": 0.4303385578095913, "reward_std": 0.5175576768815517, "rewards/accuracy_reward": 0.17187500512227416, "rewards/format_reward": 0.2584635503590107, "step": 68 }, { "completion_length": 929.8216323852539, "epoch": 0.24381625441696114, "grad_norm": 734.3438170967142, "kl": 85.0, "learning_rate": 1.8800926628551884e-05, "loss": 3.3995, "reward": 0.21744792256504297, "reward_std": 0.338073399849236, "rewards/accuracy_reward": 0.11653646267950535, "rewards/format_reward": 0.1009114624466747, "step": 69 }, { "completion_length": 997.1341323852539, "epoch": 0.24734982332155478, "grad_norm": 30.37607254327561, "kl": 5.88671875, "learning_rate": 1.8741529192927528e-05, "loss": 0.2356, "reward": 0.125000003259629, "reward_std": 0.22288852790370584, "rewards/accuracy_reward": 0.09440104337409139, "rewards/format_reward": 0.03059895901242271, "step": 70 }, { "completion_length": 1017.4121170043945, "epoch": 0.2508833922261484, "grad_norm": 27.33797696554464, "kl": 1.1336669921875, "learning_rate": 1.8680794502434018e-05, "loss": 0.0453, "reward": 0.09960937825962901, "reward_std": 0.15996943740174174, "rewards/accuracy_reward": 0.09895833628252149, "rewards/format_reward": 0.0006510416860692203, "step": 71 }, { "completion_length": 1011.7773590087891, "epoch": 0.254416961130742, "grad_norm": 51.58164513519689, "kl": 1.179931640625, "learning_rate": 1.8618731848095706e-05, "loss": 0.0472, "reward": 0.12369791930541396, "reward_std": 0.19708295073360205, "rewards/accuracy_reward": 0.1223958358168602, "rewards/format_reward": 0.0013020833721384406, "step": 72 }, { "completion_length": 1023.6119804382324, "epoch": 0.2579505300353357, "grad_norm": 0.3170159566701739, "kl": 0.142608642578125, "learning_rate": 1.855535072408516e-05, "loss": 0.0057, "reward": 0.11263021267950535, "reward_std": 0.1862776312045753, "rewards/accuracy_reward": 0.11197917093522847, "rewards/format_reward": 0.0006510416860692203, "step": 73 }, { "completion_length": 1024.0, "epoch": 0.26148409893992935, "grad_norm": 0.4703496805090262, "kl": 0.159820556640625, "learning_rate": 1.849066082627079e-05, "loss": 0.0064, "reward": 0.10807291930541396, "reward_std": 0.15622267639264464, "rewards/accuracy_reward": 0.10807291930541396, "rewards/format_reward": 0.0, "step": 74 }, { "completion_length": 1024.0, "epoch": 0.26501766784452296, "grad_norm": 0.17854669938905443, "kl": 0.15557861328125, "learning_rate": 1.8424672050733577e-05, "loss": 0.0062, "reward": 0.16731771267950535, "reward_std": 0.2150044571608305, "rewards/accuracy_reward": 0.16731771267950535, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 1024.0, "epoch": 0.26855123674911663, "grad_norm": 0.9729182998733601, "kl": 0.2708740234375, "learning_rate": 1.8357394492253216e-05, "loss": 0.0108, "reward": 0.20703125465661287, "reward_std": 0.24680148623883724, "rewards/accuracy_reward": 0.20703125465661287, "rewards/format_reward": 0.0, "step": 76 }, { "completion_length": 1024.0, "epoch": 0.27208480565371024, "grad_norm": 0.10194934645252694, "kl": 0.17828369140625, "learning_rate": 1.8288838442763838e-05, "loss": 0.0071, "reward": 0.2675781324505806, "reward_std": 0.27682292833924294, "rewards/accuracy_reward": 0.2662760494276881, "rewards/format_reward": 0.0013020833721384406, "step": 77 }, { "completion_length": 1024.0, "epoch": 0.2756183745583039, "grad_norm": 0.10819020058919213, "kl": 0.18212890625, "learning_rate": 1.8219014389779586e-05, "loss": 0.0073, "reward": 0.26497396547347307, "reward_std": 0.25599073618650436, "rewards/accuracy_reward": 0.2597656324505806, "rewards/format_reward": 0.0052083334885537624, "step": 78 }, { "completion_length": 1024.0, "epoch": 0.2791519434628975, "grad_norm": 0.25970666338072285, "kl": 0.196533203125, "learning_rate": 1.8147933014790245e-05, "loss": 0.0079, "reward": 0.2988281287252903, "reward_std": 0.28412702213972807, "rewards/accuracy_reward": 0.2760416744276881, "rewards/format_reward": 0.02278645895421505, "step": 79 }, { "completion_length": 1024.0, "epoch": 0.2826855123674912, "grad_norm": 0.7315926003388828, "kl": 0.2489013671875, "learning_rate": 1.8075605191627242e-05, "loss": 0.01, "reward": 0.401041679084301, "reward_std": 0.3803202658891678, "rewards/accuracy_reward": 0.2910156296566129, "rewards/format_reward": 0.11002604523673654, "step": 80 }, { "completion_length": 1024.0, "epoch": 0.2862190812720848, "grad_norm": 10.638076717998826, "kl": 1.59765625, "learning_rate": 1.8002041984800173e-05, "loss": 0.064, "reward": 0.8561198078095913, "reward_std": 0.5722472295165062, "rewards/accuracy_reward": 0.29622396547347307, "rewards/format_reward": 0.5598958469927311, "step": 81 }, { "completion_length": 1024.0, "epoch": 0.28975265017667845, "grad_norm": 1.8766038235299047, "kl": 0.3572998046875, "learning_rate": 1.792725464780421e-05, "loss": 0.0143, "reward": 0.9231771044433117, "reward_std": 0.5282374154776335, "rewards/accuracy_reward": 0.2630208423361182, "rewards/format_reward": 0.6601562723517418, "step": 82 }, { "completion_length": 1024.0, "epoch": 0.29328621908127206, "grad_norm": 2.8486965993956006, "kl": 0.9862060546875, "learning_rate": 1.785125462139855e-05, "loss": 0.0394, "reward": 1.1699219197034836, "reward_std": 0.3966878689825535, "rewards/accuracy_reward": 0.2936198003590107, "rewards/format_reward": 0.876302108168602, "step": 83 }, { "completion_length": 1024.0, "epoch": 0.2968197879858657, "grad_norm": 1.3612075349379993, "kl": 0.6107177734375, "learning_rate": 1.7774053531856258e-05, "loss": 0.0244, "reward": 1.2044271007180214, "reward_std": 0.4200763385742903, "rewards/accuracy_reward": 0.30989584047347307, "rewards/format_reward": 0.8945312723517418, "step": 84 }, { "completion_length": 1024.0, "epoch": 0.3003533568904594, "grad_norm": 8.701071380759565, "kl": 2.2080078125, "learning_rate": 1.7695663189185703e-05, "loss": 0.0883, "reward": 1.208984412252903, "reward_std": 0.36989905312657356, "rewards/accuracy_reward": 0.2812500102445483, "rewards/format_reward": 0.9277343936264515, "step": 85 }, { "completion_length": 1024.0, "epoch": 0.303886925795053, "grad_norm": 1.8019930642199862, "kl": 1.42626953125, "learning_rate": 1.7616095585323882e-05, "loss": 0.0571, "reward": 1.156901091337204, "reward_std": 0.4002630840986967, "rewards/accuracy_reward": 0.26041667349636555, "rewards/format_reward": 0.8964843973517418, "step": 86 }, { "completion_length": 1024.0, "epoch": 0.30742049469964666, "grad_norm": 4.127300637009884, "kl": 0.299072265625, "learning_rate": 1.7535362892301953e-05, "loss": 0.012, "reward": 1.0872396118938923, "reward_std": 0.4400251917541027, "rewards/accuracy_reward": 0.23958334233611822, "rewards/format_reward": 0.8476562760770321, "step": 87 }, { "completion_length": 1024.0, "epoch": 0.31095406360424027, "grad_norm": 4.667835148606083, "kl": 0.3037109375, "learning_rate": 1.745347746038319e-05, "loss": 0.0122, "reward": 1.0917969234287739, "reward_std": 0.4730749297887087, "rewards/accuracy_reward": 0.2747395886108279, "rewards/format_reward": 0.8170573078095913, "step": 88 }, { "completion_length": 1024.0, "epoch": 0.31448763250883394, "grad_norm": 2.7630279757978706, "kl": 0.873291015625, "learning_rate": 1.737045181617364e-05, "loss": 0.035, "reward": 1.0364583693444729, "reward_std": 0.5082622393965721, "rewards/accuracy_reward": 0.2389322966337204, "rewards/format_reward": 0.7975260578095913, "step": 89 }, { "completion_length": 1024.0, "epoch": 0.31802120141342755, "grad_norm": 27.960489910885407, "kl": 5.583984375, "learning_rate": 1.7286298660705877e-05, "loss": 0.2233, "reward": 1.0781250298023224, "reward_std": 0.47590856440365314, "rewards/accuracy_reward": 0.25000000838190317, "rewards/format_reward": 0.8281250223517418, "step": 90 }, { "completion_length": 1024.0, "epoch": 0.3215547703180212, "grad_norm": 46.613552133923285, "kl": 8.90234375, "learning_rate": 1.7201030867496005e-05, "loss": 0.3559, "reward": 1.0358073264360428, "reward_std": 0.4818594641983509, "rewards/accuracy_reward": 0.2369791753590107, "rewards/format_reward": 0.7988281399011612, "step": 91 }, { "completion_length": 1024.0, "epoch": 0.3250883392226148, "grad_norm": 28.615484872216385, "kl": 5.740234375, "learning_rate": 1.711466148057433e-05, "loss": 0.2297, "reward": 0.9694010727107525, "reward_std": 0.5088351331651211, "rewards/accuracy_reward": 0.22916667582467198, "rewards/format_reward": 0.7402343861758709, "step": 92 }, { "completion_length": 1024.0, "epoch": 0.3286219081272085, "grad_norm": 5.615041558235327, "kl": 2.17333984375, "learning_rate": 1.7027203712489902e-05, "loss": 0.0869, "reward": 0.8919270969927311, "reward_std": 0.5635814908891916, "rewards/accuracy_reward": 0.26757813431322575, "rewards/format_reward": 0.6243489757180214, "step": 93 }, { "completion_length": 1024.0, "epoch": 0.3321554770318021, "grad_norm": 8.244116658103383, "kl": 0.3486328125, "learning_rate": 1.6938670942289292e-05, "loss": 0.0139, "reward": 0.6302083544433117, "reward_std": 0.5751823391765356, "rewards/accuracy_reward": 0.22395833861082792, "rewards/format_reward": 0.40625000931322575, "step": 94 }, { "completion_length": 1024.0, "epoch": 0.33568904593639576, "grad_norm": 6.524355268359978, "kl": 0.24981689453125, "learning_rate": 1.6849076713469914e-05, "loss": 0.01, "reward": 0.5638021007180214, "reward_std": 0.5523576978594065, "rewards/accuracy_reward": 0.23893229756504297, "rewards/format_reward": 0.32486980222165585, "step": 95 }, { "completion_length": 1024.0, "epoch": 0.3392226148409894, "grad_norm": 7.953861555674408, "kl": 0.3388671875, "learning_rate": 1.6758434731908178e-05, "loss": 0.0136, "reward": 0.6627604402601719, "reward_std": 0.5646504014730453, "rewards/accuracy_reward": 0.27343750838190317, "rewards/format_reward": 0.3893229253590107, "step": 96 }, { "completion_length": 1024.0, "epoch": 0.34275618374558303, "grad_norm": 9.117171774796663, "kl": 0.66259765625, "learning_rate": 1.6666758863762796e-05, "loss": 0.0265, "reward": 0.8255208507180214, "reward_std": 0.5879920609295368, "rewards/accuracy_reward": 0.281250006519258, "rewards/format_reward": 0.5442708488553762, "step": 97 }, { "completion_length": 1024.0, "epoch": 0.3462897526501767, "grad_norm": 3.2064911207772133, "kl": 0.98388671875, "learning_rate": 1.657406313335358e-05, "loss": 0.0394, "reward": 0.9733073189854622, "reward_std": 0.5437621138989925, "rewards/accuracy_reward": 0.27604167629033327, "rewards/format_reward": 0.6972656436264515, "step": 98 }, { "completion_length": 1024.0, "epoch": 0.3498233215547703, "grad_norm": 3.916153998441947, "kl": 0.6441650390625, "learning_rate": 1.6480361721016053e-05, "loss": 0.0258, "reward": 1.0598958618938923, "reward_std": 0.46388070471584797, "rewards/accuracy_reward": 0.2578125074505806, "rewards/format_reward": 0.8020833507180214, "step": 99 }, { "completion_length": 1024.0, "epoch": 0.35335689045936397, "grad_norm": 1.527146735006838, "kl": 0.5404052734375, "learning_rate": 1.6385668960932143e-05, "loss": 0.0216, "reward": 1.104166690260172, "reward_std": 0.4302889872342348, "rewards/accuracy_reward": 0.2565104253590107, "rewards/format_reward": 0.8476562686264515, "step": 100 }, { "completion_length": 1024.0, "epoch": 0.3568904593639576, "grad_norm": 0.650990035380561, "kl": 0.4241943359375, "learning_rate": 1.6289999338937427e-05, "loss": 0.017, "reward": 1.2311198338866234, "reward_std": 0.36215772293508053, "rewards/accuracy_reward": 0.3190104244276881, "rewards/format_reward": 0.9121093973517418, "step": 101 }, { "completion_length": 1024.0, "epoch": 0.36042402826855124, "grad_norm": 2.309888696921956, "kl": 0.6422119140625, "learning_rate": 1.619336749030509e-05, "loss": 0.0257, "reward": 1.1972656697034836, "reward_std": 0.33416645554825664, "rewards/accuracy_reward": 0.27018230129033327, "rewards/format_reward": 0.9270833507180214, "step": 102 }, { "completion_length": 1024.0, "epoch": 0.36395759717314485, "grad_norm": 4.444041311869189, "kl": 0.896728515625, "learning_rate": 1.609578819750708e-05, "loss": 0.0359, "reward": 1.0091146156191826, "reward_std": 0.4832933880388737, "rewards/accuracy_reward": 0.2298177145421505, "rewards/format_reward": 0.7792968936264515, "step": 103 }, { "completion_length": 1024.0, "epoch": 0.3674911660777385, "grad_norm": 11.19029279075626, "kl": 0.737548828125, "learning_rate": 1.5997276387952733e-05, "loss": 0.0295, "reward": 0.9277344010770321, "reward_std": 0.5210180301219225, "rewards/accuracy_reward": 0.2356770890764892, "rewards/format_reward": 0.6920573078095913, "step": 104 }, { "completion_length": 1024.0, "epoch": 0.3710247349823322, "grad_norm": 15.55734153622524, "kl": 1.072021484375, "learning_rate": 1.5897847131705194e-05, "loss": 0.0429, "reward": 0.9375000223517418, "reward_std": 0.5307967625558376, "rewards/accuracy_reward": 0.2506510494276881, "rewards/format_reward": 0.6868489757180214, "step": 105 }, { "completion_length": 1024.0, "epoch": 0.3745583038869258, "grad_norm": 20.781177075399597, "kl": 1.77783203125, "learning_rate": 1.5797515639176077e-05, "loss": 0.0711, "reward": 0.9694010652601719, "reward_std": 0.5016757268458605, "rewards/accuracy_reward": 0.2220052145421505, "rewards/format_reward": 0.7473958507180214, "step": 106 }, { "completion_length": 1024.0, "epoch": 0.37809187279151946, "grad_norm": 119.3128363246367, "kl": 6.580078125, "learning_rate": 1.5696297258798573e-05, "loss": 0.2632, "reward": 1.1093750447034836, "reward_std": 0.44075607880949974, "rewards/accuracy_reward": 0.25455730129033327, "rewards/format_reward": 0.8548177294433117, "step": 107 }, { "completion_length": 1024.0, "epoch": 0.38162544169611307, "grad_norm": 6.7022300054671895, "kl": 2.0478515625, "learning_rate": 1.5594207474679533e-05, "loss": 0.082, "reward": 1.0462239943444729, "reward_std": 0.45448117703199387, "rewards/accuracy_reward": 0.22916667442768812, "rewards/format_reward": 0.8170573115348816, "step": 108 }, { "completion_length": 1024.0, "epoch": 0.38515901060070673, "grad_norm": 62.28140494172677, "kl": 3.994140625, "learning_rate": 1.549126190423073e-05, "loss": 0.1599, "reward": 1.1243490017950535, "reward_std": 0.39074354991316795, "rewards/accuracy_reward": 0.2441406287252903, "rewards/format_reward": 0.8802083507180214, "step": 109 }, { "completion_length": 1024.0, "epoch": 0.38869257950530034, "grad_norm": 4.965909615838145, "kl": 0.9990234375, "learning_rate": 1.5387476295779737e-05, "loss": 0.04, "reward": 1.065104205161333, "reward_std": 0.423415495082736, "rewards/accuracy_reward": 0.21809896687045693, "rewards/format_reward": 0.8470052182674408, "step": 110 }, { "completion_length": 1024.0, "epoch": 0.392226148409894, "grad_norm": 3.528015251218169, "kl": 0.8387451171875, "learning_rate": 1.5282866526160837e-05, "loss": 0.0335, "reward": 1.0117187798023224, "reward_std": 0.41450436040759087, "rewards/accuracy_reward": 0.16471354756504297, "rewards/format_reward": 0.847005233168602, "step": 111 }, { "completion_length": 1024.0, "epoch": 0.3957597173144876, "grad_norm": 1.3076932365383318, "kl": 0.799560546875, "learning_rate": 1.5177448598286182e-05, "loss": 0.032, "reward": 1.0097656548023224, "reward_std": 0.3976512663066387, "rewards/accuracy_reward": 0.1523437537252903, "rewards/format_reward": 0.8574218936264515, "step": 112 }, { "completion_length": 1024.0, "epoch": 0.3992932862190813, "grad_norm": 3.6570709102528838, "kl": 0.9739990234375, "learning_rate": 1.5071238638697731e-05, "loss": 0.0389, "reward": 1.0397135764360428, "reward_std": 0.38695196248590946, "rewards/accuracy_reward": 0.18229167023673654, "rewards/format_reward": 0.8574218899011612, "step": 113 }, { "completion_length": 1024.0, "epoch": 0.4028268551236749, "grad_norm": 303.94124396845274, "kl": 42.03125, "learning_rate": 1.4964252895100265e-05, "loss": 1.6829, "reward": 1.1425781548023224, "reward_std": 0.3955598259344697, "rewards/accuracy_reward": 0.24218750558793545, "rewards/format_reward": 0.9003906473517418, "step": 114 }, { "completion_length": 1024.0, "epoch": 0.40636042402826855, "grad_norm": 68.76205878652742, "kl": 10.7421875, "learning_rate": 1.4856507733875837e-05, "loss": 0.4297, "reward": 1.1054687798023224, "reward_std": 0.36262817680835724, "rewards/accuracy_reward": 0.20052083814516664, "rewards/format_reward": 0.9049479365348816, "step": 115 }, { "completion_length": 1024.0, "epoch": 0.4098939929328622, "grad_norm": 14.914598534678452, "kl": 2.833984375, "learning_rate": 1.4748019637580116e-05, "loss": 0.1134, "reward": 1.0625000447034836, "reward_std": 0.4123513549566269, "rewards/accuracy_reward": 0.19531250838190317, "rewards/format_reward": 0.8671875149011612, "step": 116 }, { "completion_length": 1024.0, "epoch": 0.4134275618374558, "grad_norm": 5.452147867911957, "kl": 1.337890625, "learning_rate": 1.4638805202420896e-05, "loss": 0.0535, "reward": 0.9967448301613331, "reward_std": 0.46121339313685894, "rewards/accuracy_reward": 0.2018229216337204, "rewards/format_reward": 0.7949219010770321, "step": 117 }, { "completion_length": 1024.0, "epoch": 0.4169611307420495, "grad_norm": 5.07384920327551, "kl": 1.025634765625, "learning_rate": 1.452888113571929e-05, "loss": 0.041, "reward": 0.9401041865348816, "reward_std": 0.5634889136999846, "rewards/accuracy_reward": 0.24739584233611822, "rewards/format_reward": 0.6927083544433117, "step": 118 }, { "completion_length": 1024.0, "epoch": 0.4204946996466431, "grad_norm": 7.69200350154935, "kl": 0.906005859375, "learning_rate": 1.4418264253353869e-05, "loss": 0.0362, "reward": 0.821614608168602, "reward_std": 0.5736533179879189, "rewards/accuracy_reward": 0.2070312537252903, "rewards/format_reward": 0.6145833544433117, "step": 119 }, { "completion_length": 1024.0, "epoch": 0.42402826855123676, "grad_norm": 20.808857934877054, "kl": 1.56982421875, "learning_rate": 1.4306971477188223e-05, "loss": 0.0627, "reward": 0.7656250223517418, "reward_std": 0.5566702261567116, "rewards/accuracy_reward": 0.17838542046956718, "rewards/format_reward": 0.587239608168602, "step": 120 }, { "completion_length": 1024.0, "epoch": 0.4275618374558304, "grad_norm": 26.447237281301867, "kl": 2.738037109375, "learning_rate": 1.419501983248229e-05, "loss": 0.1095, "reward": 0.7792969048023224, "reward_std": 0.5849619917571545, "rewards/accuracy_reward": 0.1907552140764892, "rewards/format_reward": 0.5885416846722364, "step": 121 }, { "completion_length": 1024.0, "epoch": 0.43109540636042404, "grad_norm": 22.987609528434852, "kl": 1.94189453125, "learning_rate": 1.4082426445287904e-05, "loss": 0.0775, "reward": 0.8574218973517418, "reward_std": 0.5550148580223322, "rewards/accuracy_reward": 0.19661458861082792, "rewards/format_reward": 0.660807304084301, "step": 122 }, { "completion_length": 1024.0, "epoch": 0.43462897526501765, "grad_norm": 7.848483482793478, "kl": 1.8486328125, "learning_rate": 1.3969208539828873e-05, "loss": 0.074, "reward": 0.8059896044433117, "reward_std": 0.5727136358618736, "rewards/accuracy_reward": 0.1894531319849193, "rewards/format_reward": 0.6165364794433117, "step": 123 }, { "completion_length": 1024.0, "epoch": 0.4381625441696113, "grad_norm": 7.568264702961115, "kl": 3.517578125, "learning_rate": 1.3855383435866076e-05, "loss": 0.1407, "reward": 0.7825521007180214, "reward_std": 0.5777693595737219, "rewards/accuracy_reward": 0.17643229733221233, "rewards/format_reward": 0.6061198096722364, "step": 124 }, { "completion_length": 1024.0, "epoch": 0.4416961130742049, "grad_norm": 1142.9874069005743, "kl": 47.984375, "learning_rate": 1.3740968546047935e-05, "loss": 1.9207, "reward": 0.7265625223517418, "reward_std": 0.5636367797851562, "rewards/accuracy_reward": 0.14778646221384406, "rewards/format_reward": 0.5787760615348816, "step": 125 }, { "completion_length": 1024.0, "epoch": 0.4452296819787986, "grad_norm": 27.152545132029452, "kl": 5.333984375, "learning_rate": 1.362598137324667e-05, "loss": 0.2134, "reward": 0.7291666902601719, "reward_std": 0.5741878617554903, "rewards/accuracy_reward": 0.18489583674818277, "rewards/format_reward": 0.5442708525806665, "step": 126 }, { "completion_length": 1024.0, "epoch": 0.44876325088339225, "grad_norm": 15.772408193991923, "kl": 6.375, "learning_rate": 1.3510439507880778e-05, "loss": 0.255, "reward": 0.6901041865348816, "reward_std": 0.554063655436039, "rewards/accuracy_reward": 0.17122396267950535, "rewards/format_reward": 0.5188802238553762, "step": 127 }, { "completion_length": 1024.0, "epoch": 0.45229681978798586, "grad_norm": 14.045655101866169, "kl": 4.6875, "learning_rate": 1.3394360625224067e-05, "loss": 0.1874, "reward": 0.6523437760770321, "reward_std": 0.5496281944215298, "rewards/accuracy_reward": 0.15950521756894886, "rewards/format_reward": 0.49283855594694614, "step": 128 }, { "completion_length": 1024.0, "epoch": 0.4558303886925795, "grad_norm": 6.019406500188639, "kl": 4.501953125, "learning_rate": 1.3277762482701769e-05, "loss": 0.1801, "reward": 0.6139323115348816, "reward_std": 0.5297244675457478, "rewards/accuracy_reward": 0.14778646267950535, "rewards/format_reward": 0.46614584885537624, "step": 129 }, { "completion_length": 1024.0, "epoch": 0.45936395759717313, "grad_norm": 8.557172478534012, "kl": 2.5107421875, "learning_rate": 1.3160662917174045e-05, "loss": 0.1005, "reward": 0.5481771044433117, "reward_std": 0.5158671271055937, "rewards/accuracy_reward": 0.132812503259629, "rewards/format_reward": 0.4153645932674408, "step": 130 }, { "completion_length": 1024.0, "epoch": 0.4628975265017668, "grad_norm": 9.680783748799318, "kl": 2.01806640625, "learning_rate": 1.3043079842207363e-05, "loss": 0.0807, "reward": 0.5436198115348816, "reward_std": 0.525303166359663, "rewards/accuracy_reward": 0.13411458616610616, "rewards/format_reward": 0.40950521640479565, "step": 131 }, { "completion_length": 1024.0, "epoch": 0.4664310954063604, "grad_norm": 11.919372203113637, "kl": 3.8720703125, "learning_rate": 1.2925031245334112e-05, "loss": 0.1549, "reward": 0.5683593936264515, "reward_std": 0.5238314680755138, "rewards/accuracy_reward": 0.15104167046956718, "rewards/format_reward": 0.41731772013008595, "step": 132 }, { "completion_length": 1024.0, "epoch": 0.46996466431095407, "grad_norm": 2.91738147399812, "kl": 3.1455078125, "learning_rate": 1.2806535185300931e-05, "loss": 0.1258, "reward": 0.5989583507180214, "reward_std": 0.5382577646523714, "rewards/accuracy_reward": 0.14388021198101342, "rewards/format_reward": 0.45507813803851604, "step": 133 }, { "completion_length": 1024.0, "epoch": 0.4734982332155477, "grad_norm": 9.809127835476216, "kl": 4.83984375, "learning_rate": 1.2687609789306144e-05, "loss": 0.1935, "reward": 0.5527343917638063, "reward_std": 0.5343853384256363, "rewards/accuracy_reward": 0.12760417070239782, "rewards/format_reward": 0.42513022013008595, "step": 134 }, { "completion_length": 1024.0, "epoch": 0.47703180212014135, "grad_norm": 17.04549908577345, "kl": 7.4140625, "learning_rate": 1.2568273250226681e-05, "loss": 0.2967, "reward": 0.6009114738553762, "reward_std": 0.5301515571773052, "rewards/accuracy_reward": 0.13606771151535213, "rewards/format_reward": 0.46484375931322575, "step": 135 }, { "completion_length": 1024.0, "epoch": 0.48056537102473496, "grad_norm": 8.477206528274886, "kl": 5.021484375, "learning_rate": 1.2448543823835016e-05, "loss": 0.201, "reward": 0.6223958544433117, "reward_std": 0.5402844380587339, "rewards/accuracy_reward": 0.13411458721384406, "rewards/format_reward": 0.4882812611758709, "step": 136 }, { "completion_length": 1024.0, "epoch": 0.4840989399293286, "grad_norm": 3.8500660741333816, "kl": 3.294921875, "learning_rate": 1.2328439826006415e-05, "loss": 0.1319, "reward": 0.655598983168602, "reward_std": 0.5419861897826195, "rewards/accuracy_reward": 0.15364583767950535, "rewards/format_reward": 0.5019531399011612, "step": 137 }, { "completion_length": 1024.0, "epoch": 0.4876325088339223, "grad_norm": 4.591683686353463, "kl": 2.4130859375, "learning_rate": 1.2207979629917061e-05, "loss": 0.0966, "reward": 0.6510416865348816, "reward_std": 0.5556948073208332, "rewards/accuracy_reward": 0.12630208616610616, "rewards/format_reward": 0.5247395969927311, "step": 138 }, { "completion_length": 1024.0, "epoch": 0.4911660777385159, "grad_norm": 26.38361978331117, "kl": 5.92236328125, "learning_rate": 1.2087181663233354e-05, "loss": 0.2373, "reward": 0.6998698189854622, "reward_std": 0.5648407433182001, "rewards/accuracy_reward": 0.13541667070239782, "rewards/format_reward": 0.5644531436264515, "step": 139 }, { "completion_length": 1024.0, "epoch": 0.49469964664310956, "grad_norm": 4.793844402068783, "kl": 2.24755859375, "learning_rate": 1.1966064405292887e-05, "loss": 0.0899, "reward": 0.7552083618938923, "reward_std": 0.5623416192829609, "rewards/accuracy_reward": 0.17057292209938169, "rewards/format_reward": 0.5846354328095913, "step": 140 }, { "completion_length": 1024.0, "epoch": 0.49823321554770317, "grad_norm": 1.6059354727609763, "kl": 3.4482421875, "learning_rate": 1.184464638427756e-05, "loss": 0.1379, "reward": 0.7838541828095913, "reward_std": 0.5696821231395006, "rewards/accuracy_reward": 0.18815104570239782, "rewards/format_reward": 0.5957031436264515, "step": 141 }, { "completion_length": 1024.0, "epoch": 0.5017667844522968, "grad_norm": 14.85453956454356, "kl": 7.228515625, "learning_rate": 1.1722946174379168e-05, "loss": 0.2892, "reward": 0.7656250186264515, "reward_std": 0.5824484005570412, "rewards/accuracy_reward": 0.17317708721384406, "rewards/format_reward": 0.5924479383975267, "step": 142 }, { "completion_length": 1024.0, "epoch": 0.5053003533568905, "grad_norm": 21.201841225460676, "kl": 9.515625, "learning_rate": 1.1600982392957978e-05, "loss": 0.3802, "reward": 0.7910156510770321, "reward_std": 0.5755977407097816, "rewards/accuracy_reward": 0.16341146174818277, "rewards/format_reward": 0.6276041846722364, "step": 143 }, { "completion_length": 1024.0, "epoch": 0.508833922261484, "grad_norm": 79.28650798092659, "kl": 7.9423828125, "learning_rate": 1.1478773697694691e-05, "loss": 0.318, "reward": 0.9016927294433117, "reward_std": 0.5674024932086468, "rewards/accuracy_reward": 0.2285156287252903, "rewards/format_reward": 0.6731771044433117, "step": 144 }, { "completion_length": 1024.0, "epoch": 0.5123674911660777, "grad_norm": 7.988007399943678, "kl": 4.2353515625, "learning_rate": 1.1356338783736256e-05, "loss": 0.1695, "reward": 0.8457031473517418, "reward_std": 0.5507397223263979, "rewards/accuracy_reward": 0.18294271500781178, "rewards/format_reward": 0.6627604365348816, "step": 145 }, { "completion_length": 1024.0, "epoch": 0.5159010600706714, "grad_norm": 6.6368383146740335, "kl": 2.5166015625, "learning_rate": 1.123369638083593e-05, "loss": 0.1007, "reward": 0.8782552406191826, "reward_std": 0.5208123382180929, "rewards/accuracy_reward": 0.1855468787252903, "rewards/format_reward": 0.6927083507180214, "step": 146 }, { "completion_length": 1024.0, "epoch": 0.519434628975265, "grad_norm": 5.018034104235896, "kl": 2.7021484375, "learning_rate": 1.1110865250488047e-05, "loss": 0.1081, "reward": 0.884114608168602, "reward_std": 0.530108455568552, "rewards/accuracy_reward": 0.16601562919095159, "rewards/format_reward": 0.7180989719927311, "step": 147 }, { "completion_length": 1024.0, "epoch": 0.5229681978798587, "grad_norm": 4.782732693496574, "kl": 4.1826171875, "learning_rate": 1.0987864183057943e-05, "loss": 0.1672, "reward": 0.9218750223517418, "reward_std": 0.5164159703999758, "rewards/accuracy_reward": 0.18229167396202683, "rewards/format_reward": 0.7395833544433117, "step": 148 }, { "completion_length": 1024.0, "epoch": 0.5265017667844523, "grad_norm": 16.810647138161976, "kl": 6.6533203125, "learning_rate": 1.0864711994907457e-05, "loss": 0.2665, "reward": 0.9960937686264515, "reward_std": 0.48126981779932976, "rewards/accuracy_reward": 0.22786459187045693, "rewards/format_reward": 0.7682291828095913, "step": 149 }, { "completion_length": 1024.0, "epoch": 0.5300353356890459, "grad_norm": 10.7748697556108, "kl": 5.4052734375, "learning_rate": 1.0741427525516463e-05, "loss": 0.2162, "reward": 0.9928385689854622, "reward_std": 0.5116328075528145, "rewards/accuracy_reward": 0.22395833989139646, "rewards/format_reward": 0.7688802257180214, "step": 150 }, { "completion_length": 1024.0, "epoch": 0.5335689045936396, "grad_norm": 2.3886371142629574, "kl": 2.9921875, "learning_rate": 1.0618029634600843e-05, "loss": 0.1197, "reward": 0.9869791977107525, "reward_std": 0.48671552538871765, "rewards/accuracy_reward": 0.20572917303070426, "rewards/format_reward": 0.7812500186264515, "step": 151 }, { "completion_length": 1024.0, "epoch": 0.5371024734982333, "grad_norm": 1.7440867401519615, "kl": 3.501953125, "learning_rate": 1.0494537199227393e-05, "loss": 0.1401, "reward": 0.9563802443444729, "reward_std": 0.4778597932308912, "rewards/accuracy_reward": 0.1940104216337204, "rewards/format_reward": 0.7623698152601719, "step": 152 }, { "completion_length": 1024.0, "epoch": 0.5406360424028268, "grad_norm": 2.4296240779116753, "kl": 3.357421875, "learning_rate": 1.0370969110926052e-05, "loss": 0.1343, "reward": 0.9921875298023224, "reward_std": 0.4780040867626667, "rewards/accuracy_reward": 0.21679688291624188, "rewards/format_reward": 0.7753906473517418, "step": 153 }, { "completion_length": 1024.0, "epoch": 0.5441696113074205, "grad_norm": 4.117699458280056, "kl": 3.2236328125, "learning_rate": 1.024734427279995e-05, "loss": 0.129, "reward": 1.0136718973517418, "reward_std": 0.49156964384019375, "rewards/accuracy_reward": 0.23502604896202683, "rewards/format_reward": 0.7786458469927311, "step": 154 }, { "completion_length": 1024.0, "epoch": 0.5477031802120141, "grad_norm": 9.030839517376265, "kl": 5.662109375, "learning_rate": 1.012368159663363e-05, "loss": 0.2264, "reward": 1.0794271267950535, "reward_std": 0.45495169796049595, "rewards/accuracy_reward": 0.2740885503590107, "rewards/format_reward": 0.8053385578095913, "step": 155 }, { "completion_length": 1024.0, "epoch": 0.5512367491166078, "grad_norm": 3.2489067144831174, "kl": 3.689453125, "learning_rate": 1e-05, "loss": 0.1475, "reward": 1.100260455161333, "reward_std": 0.4964223224669695, "rewards/accuracy_reward": 0.2955729253590107, "rewards/format_reward": 0.8046875223517418, "step": 156 }, { "completion_length": 1024.0, "epoch": 0.5547703180212014, "grad_norm": 5.586842094139113, "kl": 1.96337890625, "learning_rate": 9.876318403366371e-06, "loss": 0.0785, "reward": 1.0937500335276127, "reward_std": 0.4783368781208992, "rewards/accuracy_reward": 0.2884114645421505, "rewards/format_reward": 0.8053385578095913, "step": 157 }, { "completion_length": 1024.0, "epoch": 0.558303886925795, "grad_norm": 6.4948169343262245, "kl": 1.94287109375, "learning_rate": 9.752655727200051e-06, "loss": 0.0777, "reward": 1.040364608168602, "reward_std": 0.49424389004707336, "rewards/accuracy_reward": 0.25390625838190317, "rewards/format_reward": 0.7864583507180214, "step": 158 }, { "completion_length": 1024.0, "epoch": 0.5618374558303887, "grad_norm": 5.326307273819102, "kl": 2.5322265625, "learning_rate": 9.62903088907395e-06, "loss": 0.1013, "reward": 1.0703125298023224, "reward_std": 0.486019866541028, "rewards/accuracy_reward": 0.27669271547347307, "rewards/format_reward": 0.7936198078095913, "step": 159 }, { "completion_length": 1024.0, "epoch": 0.5653710247349824, "grad_norm": 5.794058798518192, "kl": 4.05859375, "learning_rate": 9.505462800772612e-06, "loss": 0.1624, "reward": 1.1087239980697632, "reward_std": 0.48937827721238136, "rewards/accuracy_reward": 0.3196614682674408, "rewards/format_reward": 0.7890625186264515, "step": 160 }, { "completion_length": 1024.0, "epoch": 0.568904593639576, "grad_norm": 9.402836751388962, "kl": 5.33984375, "learning_rate": 9.381970365399162e-06, "loss": 0.2135, "reward": 0.9941406510770321, "reward_std": 0.522324126213789, "rewards/accuracy_reward": 0.263671881519258, "rewards/format_reward": 0.7304687723517418, "step": 161 }, { "completion_length": 1024.0, "epoch": 0.5724381625441696, "grad_norm": 4.195689922174244, "kl": 4.86328125, "learning_rate": 9.25857247448354e-06, "loss": 0.1944, "reward": 0.960937537252903, "reward_std": 0.5458177234977484, "rewards/accuracy_reward": 0.24739584140479565, "rewards/format_reward": 0.7135416939854622, "step": 162 }, { "completion_length": 1024.0, "epoch": 0.5759717314487632, "grad_norm": 2.8989498058377228, "kl": 4.87109375, "learning_rate": 9.135288005092546e-06, "loss": 0.1949, "reward": 1.0930989868938923, "reward_std": 0.5132731832563877, "rewards/accuracy_reward": 0.29947918001562357, "rewards/format_reward": 0.7936198115348816, "step": 163 }, { "completion_length": 1024.0, "epoch": 0.5795053003533569, "grad_norm": 3.0006286232984234, "kl": 5.009765625, "learning_rate": 9.012135816942058e-06, "loss": 0.2003, "reward": 1.0377604514360428, "reward_std": 0.47819276340305805, "rewards/accuracy_reward": 0.24283854709938169, "rewards/format_reward": 0.7949218899011612, "step": 164 }, { "completion_length": 1024.0, "epoch": 0.5830388692579506, "grad_norm": 4.6993358160287615, "kl": 5.35546875, "learning_rate": 8.889134749511956e-06, "loss": 0.2143, "reward": 1.0800781473517418, "reward_std": 0.5242850538343191, "rewards/accuracy_reward": 0.281901054084301, "rewards/format_reward": 0.798177108168602, "step": 165 }, { "completion_length": 1024.0, "epoch": 0.5865724381625441, "grad_norm": 6.4731877967945355, "kl": 6.76953125, "learning_rate": 8.76630361916407e-06, "loss": 0.2708, "reward": 1.0436198264360428, "reward_std": 0.4580067917704582, "rewards/accuracy_reward": 0.24804688291624188, "rewards/format_reward": 0.7955729365348816, "step": 166 }, { "completion_length": 1024.0, "epoch": 0.5901060070671378, "grad_norm": 9.638355564108144, "kl": 8.08203125, "learning_rate": 8.643661216263744e-06, "loss": 0.3235, "reward": 1.0696614980697632, "reward_std": 0.47720608301460743, "rewards/accuracy_reward": 0.254557297565043, "rewards/format_reward": 0.8151041939854622, "step": 167 }, { "completion_length": 1024.0, "epoch": 0.5936395759717314, "grad_norm": 5.425360658521879, "kl": 7.126953125, "learning_rate": 8.52122630230531e-06, "loss": 0.2852, "reward": 1.0677083730697632, "reward_std": 0.4944526255130768, "rewards/accuracy_reward": 0.2669270886108279, "rewards/format_reward": 0.8007812649011612, "step": 168 }, { "completion_length": 1024.0, "epoch": 0.5971731448763251, "grad_norm": 2.6727398917140324, "kl": 5.6484375, "learning_rate": 8.399017607042025e-06, "loss": 0.2259, "reward": 1.0963541865348816, "reward_std": 0.4828463848680258, "rewards/accuracy_reward": 0.28645834140479565, "rewards/format_reward": 0.8098958544433117, "step": 169 }, { "completion_length": 1024.0, "epoch": 0.6007067137809188, "grad_norm": 3.0934284767601503, "kl": 4.96484375, "learning_rate": 8.277053825620836e-06, "loss": 0.1987, "reward": 1.0039062909781933, "reward_std": 0.47822842188179493, "rewards/accuracy_reward": 0.22005208861082792, "rewards/format_reward": 0.7838541865348816, "step": 170 }, { "completion_length": 1024.0, "epoch": 0.6042402826855123, "grad_norm": 1.415026805369631, "kl": 4.71875, "learning_rate": 8.155353615722442e-06, "loss": 0.1887, "reward": 1.0240885764360428, "reward_std": 0.5142606012523174, "rewards/accuracy_reward": 0.24283855129033327, "rewards/format_reward": 0.7812500149011612, "step": 171 }, { "completion_length": 1024.0, "epoch": 0.607773851590106, "grad_norm": 3.671045541857197, "kl": 4.7958984375, "learning_rate": 8.033935594707116e-06, "loss": 0.1918, "reward": 0.9915364906191826, "reward_std": 0.5378254223614931, "rewards/accuracy_reward": 0.2363281308207661, "rewards/format_reward": 0.7552083544433117, "step": 172 }, { "completion_length": 1024.0, "epoch": 0.6113074204946997, "grad_norm": 4.829747105605246, "kl": 4.9267578125, "learning_rate": 7.91281833676665e-06, "loss": 0.1968, "reward": 1.0175781548023224, "reward_std": 0.533087344840169, "rewards/accuracy_reward": 0.262369797565043, "rewards/format_reward": 0.7552083544433117, "step": 173 }, { "completion_length": 1024.0, "epoch": 0.6148409893992933, "grad_norm": 4.133358091951167, "kl": 5.173828125, "learning_rate": 7.79202037008294e-06, "loss": 0.2072, "reward": 0.9824219010770321, "reward_std": 0.5031039249151945, "rewards/accuracy_reward": 0.23111979756504297, "rewards/format_reward": 0.7513021044433117, "step": 174 }, { "completion_length": 1024.0, "epoch": 0.6183745583038869, "grad_norm": 1.7387249238999762, "kl": 5.0078125, "learning_rate": 7.671560173993588e-06, "loss": 0.2003, "reward": 0.956380233168602, "reward_std": 0.5033866986632347, "rewards/accuracy_reward": 0.22135417256504297, "rewards/format_reward": 0.7350260652601719, "step": 175 }, { "completion_length": 1024.0, "epoch": 0.6219081272084805, "grad_norm": 2.172215905238513, "kl": 4.857421875, "learning_rate": 7.551456176164989e-06, "loss": 0.1943, "reward": 0.9941406548023224, "reward_std": 0.526677755638957, "rewards/accuracy_reward": 0.24414063151925802, "rewards/format_reward": 0.7500000223517418, "step": 176 }, { "completion_length": 1024.0, "epoch": 0.6254416961130742, "grad_norm": 0.8842413004407766, "kl": 5.447265625, "learning_rate": 7.431726749773322e-06, "loss": 0.2178, "reward": 0.9596354402601719, "reward_std": 0.5413137227296829, "rewards/accuracy_reward": 0.2278645895421505, "rewards/format_reward": 0.7317708507180214, "step": 177 }, { "completion_length": 1024.0, "epoch": 0.6289752650176679, "grad_norm": 0.7402618851617847, "kl": 5.388671875, "learning_rate": 7.312390210693863e-06, "loss": 0.2156, "reward": 0.9811198189854622, "reward_std": 0.5133567694574594, "rewards/accuracy_reward": 0.23242188384756446, "rewards/format_reward": 0.7486979402601719, "step": 178 }, { "completion_length": 1024.0, "epoch": 0.6325088339222615, "grad_norm": 3.560567058348375, "kl": 5.2578125, "learning_rate": 7.193464814699073e-06, "loss": 0.2104, "reward": 0.9238281324505806, "reward_std": 0.5353942643851042, "rewards/accuracy_reward": 0.20507813151925802, "rewards/format_reward": 0.7187500223517418, "step": 179 }, { "completion_length": 1024.0, "epoch": 0.6360424028268551, "grad_norm": 2.753064271775204, "kl": 5.181640625, "learning_rate": 7.07496875466589e-06, "loss": 0.2074, "reward": 0.8417968861758709, "reward_std": 0.5037534404546022, "rewards/accuracy_reward": 0.1321614630287513, "rewards/format_reward": 0.7096354328095913, "step": 180 }, { "completion_length": 1024.0, "epoch": 0.6395759717314488, "grad_norm": 3.87667763131927, "kl": 6.107421875, "learning_rate": 6.9569201577926395e-06, "loss": 0.2442, "reward": 0.8977864868938923, "reward_std": 0.5129530522972345, "rewards/accuracy_reward": 0.18424479803070426, "rewards/format_reward": 0.7135416939854622, "step": 181 }, { "completion_length": 1024.0, "epoch": 0.6431095406360424, "grad_norm": 3.8854100556266813, "kl": 6.060546875, "learning_rate": 6.839337082825954e-06, "loss": 0.2426, "reward": 0.8893229365348816, "reward_std": 0.4821504820138216, "rewards/accuracy_reward": 0.15755208861082792, "rewards/format_reward": 0.7317708469927311, "step": 182 }, { "completion_length": 1024.0, "epoch": 0.6466431095406361, "grad_norm": 1.2484444905977732, "kl": 5.591796875, "learning_rate": 6.722237517298232e-06, "loss": 0.2238, "reward": 0.958984412252903, "reward_std": 0.48374492302536964, "rewards/accuracy_reward": 0.1861979211680591, "rewards/format_reward": 0.7727864794433117, "step": 183 }, { "completion_length": 1024.0, "epoch": 0.6501766784452296, "grad_norm": 2.6893650368389546, "kl": 5.880859375, "learning_rate": 6.605639374775934e-06, "loss": 0.2352, "reward": 0.9785156510770321, "reward_std": 0.4921079948544502, "rewards/accuracy_reward": 0.2057291716337204, "rewards/format_reward": 0.7727864757180214, "step": 184 }, { "completion_length": 1024.0, "epoch": 0.6537102473498233, "grad_norm": 2.3442839100460233, "kl": 6.041015625, "learning_rate": 6.489560492119225e-06, "loss": 0.2416, "reward": 0.9583333693444729, "reward_std": 0.4440547488629818, "rewards/accuracy_reward": 0.1692708395421505, "rewards/format_reward": 0.7890625223517418, "step": 185 }, { "completion_length": 1024.0, "epoch": 0.657243816254417, "grad_norm": 2.1253880553627047, "kl": 6.69921875, "learning_rate": 6.374018626753331e-06, "loss": 0.268, "reward": 0.9765625223517418, "reward_std": 0.4347160626202822, "rewards/accuracy_reward": 0.17838542256504297, "rewards/format_reward": 0.7981770969927311, "step": 186 }, { "completion_length": 1024.0, "epoch": 0.6607773851590106, "grad_norm": 1.0194540597887043, "kl": 6.712890625, "learning_rate": 6.2590314539520695e-06, "loss": 0.2686, "reward": 0.9921875335276127, "reward_std": 0.44406831078231335, "rewards/accuracy_reward": 0.2005208395421505, "rewards/format_reward": 0.7916666828095913, "step": 187 }, { "completion_length": 1024.0, "epoch": 0.6643109540636042, "grad_norm": 0.5545671762601952, "kl": 6.857421875, "learning_rate": 6.144616564133927e-06, "loss": 0.2743, "reward": 1.023437526077032, "reward_std": 0.4340708777308464, "rewards/accuracy_reward": 0.19401042442768812, "rewards/format_reward": 0.829427108168602, "step": 188 }, { "completion_length": 1024.0, "epoch": 0.6678445229681979, "grad_norm": 0.3583901424418257, "kl": 6.94140625, "learning_rate": 6.03079146017113e-06, "loss": 0.2778, "reward": 1.0410156548023224, "reward_std": 0.4304163958877325, "rewards/accuracy_reward": 0.21354167070239782, "rewards/format_reward": 0.8274739794433117, "step": 189 }, { "completion_length": 1024.0, "epoch": 0.6713780918727915, "grad_norm": 0.17755970310205513, "kl": 7.12890625, "learning_rate": 5.9175735547120975e-06, "loss": 0.2852, "reward": 1.0572916865348816, "reward_std": 0.4284838940948248, "rewards/accuracy_reward": 0.20963542349636555, "rewards/format_reward": 0.8476562723517418, "step": 190 }, { "completion_length": 1024.0, "epoch": 0.6749116607773852, "grad_norm": 0.16047119425020795, "kl": 7.486328125, "learning_rate": 5.804980167517712e-06, "loss": 0.2995, "reward": 1.0820313021540642, "reward_std": 0.3956974996253848, "rewards/accuracy_reward": 0.22395833861082792, "rewards/format_reward": 0.8580729365348816, "step": 191 }, { "completion_length": 1024.0, "epoch": 0.6784452296819788, "grad_norm": 0.18607515894747625, "kl": 7.34765625, "learning_rate": 5.693028522811783e-06, "loss": 0.2937, "reward": 1.1145833805203438, "reward_std": 0.42582329362630844, "rewards/accuracy_reward": 0.25911459047347307, "rewards/format_reward": 0.8554687798023224, "step": 192 }, { "completion_length": 1024.0, "epoch": 0.6819787985865724, "grad_norm": 0.12713756611493196, "kl": 7.560546875, "learning_rate": 5.581735746646134e-06, "loss": 0.3023, "reward": 1.1165364943444729, "reward_std": 0.4042051937431097, "rewards/accuracy_reward": 0.24609375651925802, "rewards/format_reward": 0.8704427368938923, "step": 193 }, { "completion_length": 1006.2291679382324, "epoch": 0.6855123674911661, "grad_norm": 0.17261260884206758, "kl": 7.578125, "learning_rate": 5.471118864280716e-06, "loss": 0.3027, "reward": 1.1165365055203438, "reward_std": 0.4118177331984043, "rewards/accuracy_reward": 0.2513020895421505, "rewards/format_reward": 0.8652343973517418, "step": 194 }, { "completion_length": 1009.5208358764648, "epoch": 0.6890459363957597, "grad_norm": 0.12753618037588546, "kl": 7.6796875, "learning_rate": 5.361194797579108e-06, "loss": 0.3073, "reward": 1.0963542014360428, "reward_std": 0.38153328374028206, "rewards/accuracy_reward": 0.22330729849636555, "rewards/format_reward": 0.8730468936264515, "step": 195 }, { "completion_length": 992.2708358764648, "epoch": 0.6925795053003534, "grad_norm": 0.1939287572168999, "kl": 7.9375, "learning_rate": 5.2519803624198865e-06, "loss": 0.3175, "reward": 1.1438802480697632, "reward_std": 0.36042055673897266, "rewards/accuracy_reward": 0.2545572994276881, "rewards/format_reward": 0.8893229365348816, "step": 196 }, { "completion_length": 1005.8333358764648, "epoch": 0.696113074204947, "grad_norm": 1.6055164563240731, "kl": 7.822265625, "learning_rate": 5.143492266124164e-06, "loss": 0.313, "reward": 1.0944010764360428, "reward_std": 0.3512597717344761, "rewards/accuracy_reward": 0.21289063151925802, "rewards/format_reward": 0.8815104328095913, "step": 197 }, { "completion_length": 976.1875038146973, "epoch": 0.6996466431095406, "grad_norm": 0.15206731503411422, "kl": 7.78125, "learning_rate": 5.035747104899738e-06, "loss": 0.3114, "reward": 1.0540364906191826, "reward_std": 0.35300159733742476, "rewards/accuracy_reward": 0.18880208674818277, "rewards/format_reward": 0.8652343899011612, "step": 198 }, { "completion_length": 983.6250038146973, "epoch": 0.7031802120141343, "grad_norm": 0.18996755943687274, "kl": 7.857421875, "learning_rate": 4.928761361302269e-06, "loss": 0.3144, "reward": 1.1308594197034836, "reward_std": 0.37638773024082184, "rewards/accuracy_reward": 0.24869792396202683, "rewards/format_reward": 0.8821614794433117, "step": 199 }, { "completion_length": 987.5208358764648, "epoch": 0.7067137809187279, "grad_norm": 0.19405956495375348, "kl": 7.912109375, "learning_rate": 4.8225514017138205e-06, "loss": 0.3164, "reward": 1.096354190260172, "reward_std": 0.34191144444048405, "rewards/accuracy_reward": 0.20638021361082792, "rewards/format_reward": 0.8899739757180214, "step": 200 }, { "completion_length": 1008.8958358764648, "epoch": 0.7102473498233216, "grad_norm": 0.1978005231935579, "kl": 7.767578125, "learning_rate": 4.717133473839163e-06, "loss": 0.3108, "reward": 1.1432292237877846, "reward_std": 0.3862752038985491, "rewards/accuracy_reward": 0.2701823003590107, "rewards/format_reward": 0.8730468973517418, "step": 201 }, { "completion_length": 1009.25, "epoch": 0.7137809187279152, "grad_norm": 0.1284484089519714, "kl": 7.6015625, "learning_rate": 4.612523704220264e-06, "loss": 0.3041, "reward": 1.115234412252903, "reward_std": 0.42590648494660854, "rewards/accuracy_reward": 0.26562500838190317, "rewards/format_reward": 0.8496093973517418, "step": 202 }, { "completion_length": 1024.0, "epoch": 0.7173144876325088, "grad_norm": 0.14352585595949552, "kl": 7.435546875, "learning_rate": 4.508738095769278e-06, "loss": 0.2974, "reward": 1.061848983168602, "reward_std": 0.41079509258270264, "rewards/accuracy_reward": 0.2265625074505806, "rewards/format_reward": 0.8352864757180214, "step": 203 }, { "completion_length": 1024.0, "epoch": 0.7208480565371025, "grad_norm": 0.1521889190457422, "kl": 7.193359375, "learning_rate": 4.405792525320469e-06, "loss": 0.2877, "reward": 1.0279948264360428, "reward_std": 0.424892058596015, "rewards/accuracy_reward": 0.2037760482635349, "rewards/format_reward": 0.8242187686264515, "step": 204 }, { "completion_length": 1024.0, "epoch": 0.7243816254416962, "grad_norm": 0.3506188463952097, "kl": 7.2890625, "learning_rate": 4.303702741201431e-06, "loss": 0.2915, "reward": 1.059895858168602, "reward_std": 0.42388765700161457, "rewards/accuracy_reward": 0.2473958432674408, "rewards/format_reward": 0.8125000186264515, "step": 205 }, { "completion_length": 1024.0, "epoch": 0.7279151943462897, "grad_norm": 0.1677757492998703, "kl": 6.9921875, "learning_rate": 4.202484360823926e-06, "loss": 0.2797, "reward": 1.0266927368938923, "reward_std": 0.45705331675708294, "rewards/accuracy_reward": 0.23502604942768812, "rewards/format_reward": 0.791666679084301, "step": 206 }, { "completion_length": 1011.9791679382324, "epoch": 0.7314487632508834, "grad_norm": 0.1517309736158255, "kl": 6.935546875, "learning_rate": 4.1021528682948064e-06, "loss": 0.2774, "reward": 1.049479216337204, "reward_std": 0.46575335413217545, "rewards/accuracy_reward": 0.2526041716337204, "rewards/format_reward": 0.7968750149011612, "step": 207 }, { "completion_length": 1024.0, "epoch": 0.734982332155477, "grad_norm": 0.16127641070568163, "kl": 7.111328125, "learning_rate": 4.002723612047272e-06, "loss": 0.2847, "reward": 1.085286483168602, "reward_std": 0.44572209380567074, "rewards/accuracy_reward": 0.26302084419876337, "rewards/format_reward": 0.8222656436264515, "step": 208 }, { "completion_length": 1009.3125, "epoch": 0.7385159010600707, "grad_norm": 0.13871720330015053, "kl": 7.23828125, "learning_rate": 3.904211802492922e-06, "loss": 0.2897, "reward": 1.100260455161333, "reward_std": 0.4249584712088108, "rewards/accuracy_reward": 0.26953125558793545, "rewards/format_reward": 0.830729179084301, "step": 209 }, { "completion_length": 1014.8125, "epoch": 0.7420494699646644, "grad_norm": 0.13865406061603536, "kl": 7.302734375, "learning_rate": 3.8066325096949153e-06, "loss": 0.2922, "reward": 1.1406250447034836, "reward_std": 0.41488252952694893, "rewards/accuracy_reward": 0.2812500074505806, "rewards/format_reward": 0.8593750223517418, "step": 210 }, { "completion_length": 1024.0, "epoch": 0.7455830388692579, "grad_norm": 0.12592871455753224, "kl": 7.369140625, "learning_rate": 3.710000661062578e-06, "loss": 0.2948, "reward": 1.1796875484287739, "reward_std": 0.41464217752218246, "rewards/accuracy_reward": 0.313151047565043, "rewards/format_reward": 0.8665364794433117, "step": 211 }, { "completion_length": 995.2291679382324, "epoch": 0.7491166077738516, "grad_norm": 0.1393901308152561, "kl": 7.501953125, "learning_rate": 3.6143310390678544e-06, "loss": 0.3001, "reward": 1.1647135838866234, "reward_std": 0.3797377645969391, "rewards/accuracy_reward": 0.2838541753590107, "rewards/format_reward": 0.8808593899011612, "step": 212 }, { "completion_length": 1010.125, "epoch": 0.7526501766784452, "grad_norm": 0.14034512809285338, "kl": 7.31640625, "learning_rate": 3.5196382789839477e-06, "loss": 0.2926, "reward": 1.1549479514360428, "reward_std": 0.39015408605337143, "rewards/accuracy_reward": 0.27994792629033327, "rewards/format_reward": 0.8750000223517418, "step": 213 }, { "completion_length": 1011.875, "epoch": 0.7561837455830389, "grad_norm": 0.14989504739682083, "kl": 7.28125, "learning_rate": 3.425936866646419e-06, "loss": 0.2911, "reward": 1.1614583656191826, "reward_std": 0.39541246369481087, "rewards/accuracy_reward": 0.292968756519258, "rewards/format_reward": 0.8684896007180214, "step": 214 }, { "completion_length": 1024.0, "epoch": 0.7597173144876325, "grad_norm": 0.13609166349614024, "kl": 6.982421875, "learning_rate": 3.3332411362372063e-06, "loss": 0.2793, "reward": 1.1263021379709244, "reward_std": 0.44077819399535656, "rewards/accuracy_reward": 0.29101563384756446, "rewards/format_reward": 0.8352864794433117, "step": 215 }, { "completion_length": 1024.0, "epoch": 0.7632508833922261, "grad_norm": 0.15918888254694777, "kl": 7.1640625, "learning_rate": 3.2415652680918262e-06, "loss": 0.2865, "reward": 1.1223958879709244, "reward_std": 0.41186920180916786, "rewards/accuracy_reward": 0.28190105222165585, "rewards/format_reward": 0.8404948152601719, "step": 216 }, { "completion_length": 1008.0416679382324, "epoch": 0.7667844522968198, "grad_norm": 0.12370384010350725, "kl": 7.244140625, "learning_rate": 3.1509232865300886e-06, "loss": 0.2899, "reward": 1.1601563058793545, "reward_std": 0.38652253709733486, "rewards/accuracy_reward": 0.3059895886108279, "rewards/format_reward": 0.8541666939854622, "step": 217 }, { "completion_length": 1024.0, "epoch": 0.7703180212014135, "grad_norm": 0.20461489596469734, "kl": 7.2421875, "learning_rate": 3.061329057710711e-06, "loss": 0.2898, "reward": 1.072916705161333, "reward_std": 0.4447946548461914, "rewards/accuracy_reward": 0.2571614645421505, "rewards/format_reward": 0.8157552294433117, "step": 218 }, { "completion_length": 1024.0, "epoch": 0.773851590106007, "grad_norm": 0.1804867199582671, "kl": 7.388671875, "learning_rate": 2.9727962875101e-06, "loss": 0.2956, "reward": 1.1334635727107525, "reward_std": 0.39954448491334915, "rewards/accuracy_reward": 0.27929688338190317, "rewards/format_reward": 0.8541666828095913, "step": 219 }, { "completion_length": 1024.0, "epoch": 0.7773851590106007, "grad_norm": 0.16932493071108376, "kl": 7.197265625, "learning_rate": 2.8853385194256677e-06, "loss": 0.2879, "reward": 1.1041667014360428, "reward_std": 0.4760838821530342, "rewards/accuracy_reward": 0.28125000884756446, "rewards/format_reward": 0.8229166865348816, "step": 220 }, { "completion_length": 1024.0, "epoch": 0.7809187279151943, "grad_norm": 0.15687360626369415, "kl": 7.0703125, "learning_rate": 2.798969132503997e-06, "loss": 0.283, "reward": 1.0423177294433117, "reward_std": 0.44260338321328163, "rewards/accuracy_reward": 0.2304687574505806, "rewards/format_reward": 0.8118489794433117, "step": 221 }, { "completion_length": 1024.0, "epoch": 0.784452296819788, "grad_norm": 0.17190503748389882, "kl": 7.171875, "learning_rate": 2.713701339294129e-06, "loss": 0.2869, "reward": 1.1419271193444729, "reward_std": 0.4103549234569073, "rewards/accuracy_reward": 0.301432297565043, "rewards/format_reward": 0.8404948115348816, "step": 222 }, { "completion_length": 1024.0, "epoch": 0.7879858657243817, "grad_norm": 0.14054774738185286, "kl": 7.33203125, "learning_rate": 2.6295481838263628e-06, "loss": 0.2932, "reward": 1.0891927443444729, "reward_std": 0.415512815117836, "rewards/accuracy_reward": 0.24479167256504297, "rewards/format_reward": 0.844401054084301, "step": 223 }, { "completion_length": 1024.0, "epoch": 0.7915194346289752, "grad_norm": 0.1087364190858895, "kl": 7.130859375, "learning_rate": 2.5465225396168134e-06, "loss": 0.2853, "reward": 1.1236979588866234, "reward_std": 0.44147299975156784, "rewards/accuracy_reward": 0.285807297565043, "rewards/format_reward": 0.8378906473517418, "step": 224 }, { "completion_length": 1008.2916679382324, "epoch": 0.7950530035335689, "grad_norm": 0.11246121980520436, "kl": 7.4765625, "learning_rate": 2.464637107698046e-06, "loss": 0.2994, "reward": 1.1829427480697632, "reward_std": 0.37431807816028595, "rewards/accuracy_reward": 0.3046875074505806, "rewards/format_reward": 0.8782552294433117, "step": 225 }, { "completion_length": 1024.0, "epoch": 0.7985865724381626, "grad_norm": 0.1112131457977264, "kl": 7.265625, "learning_rate": 2.3839044146761227e-06, "loss": 0.2907, "reward": 1.164713591337204, "reward_std": 0.3692896058782935, "rewards/accuracy_reward": 0.2890625037252903, "rewards/format_reward": 0.8756510689854622, "step": 226 }, { "completion_length": 1024.0, "epoch": 0.8021201413427562, "grad_norm": 0.21800229184914455, "kl": 7.462890625, "learning_rate": 2.304336810814305e-06, "loss": 0.2983, "reward": 1.1809896230697632, "reward_std": 0.3522001476958394, "rewards/accuracy_reward": 0.29427084140479565, "rewards/format_reward": 0.8867187723517418, "step": 227 }, { "completion_length": 1007.1458358764648, "epoch": 0.8056537102473498, "grad_norm": 0.1666037699534655, "kl": 7.740234375, "learning_rate": 2.2259464681437404e-06, "loss": 0.3096, "reward": 1.2037760689854622, "reward_std": 0.3384133204817772, "rewards/accuracy_reward": 0.30338542349636555, "rewards/format_reward": 0.9003906436264515, "step": 228 }, { "completion_length": 1024.0, "epoch": 0.8091872791519434, "grad_norm": 0.17750163436679095, "kl": 7.51953125, "learning_rate": 2.1487453786014513e-06, "loss": 0.301, "reward": 1.1744792126119137, "reward_std": 0.4062032885849476, "rewards/accuracy_reward": 0.3001302173361182, "rewards/format_reward": 0.8743489794433117, "step": 229 }, { "completion_length": 1008.1875, "epoch": 0.8127208480565371, "grad_norm": 0.15247459978463207, "kl": 7.580078125, "learning_rate": 2.072745352195794e-06, "loss": 0.303, "reward": 1.1803385764360428, "reward_std": 0.37856387067586184, "rewards/accuracy_reward": 0.29361979849636555, "rewards/format_reward": 0.8867187686264515, "step": 230 }, { "completion_length": 1011.2916679382324, "epoch": 0.8162544169611308, "grad_norm": 0.18019373306879702, "kl": 7.603515625, "learning_rate": 1.997958015199829e-06, "loss": 0.304, "reward": 1.1835937798023224, "reward_std": 0.37871948070824146, "rewards/accuracy_reward": 0.293619797565043, "rewards/format_reward": 0.8899739719927311, "step": 231 }, { "completion_length": 1024.0, "epoch": 0.8197879858657244, "grad_norm": 0.22583268385399752, "kl": 7.529296875, "learning_rate": 1.9243948083727626e-06, "loss": 0.3012, "reward": 1.118489608168602, "reward_std": 0.37204239144921303, "rewards/accuracy_reward": 0.25000000512227416, "rewards/format_reward": 0.8684896044433117, "step": 232 }, { "completion_length": 1024.0, "epoch": 0.823321554770318, "grad_norm": 0.3443852485581638, "kl": 7.34375, "learning_rate": 1.8520669852097573e-06, "loss": 0.2938, "reward": 1.1399739980697632, "reward_std": 0.40698738768696785, "rewards/accuracy_reward": 0.2864583395421505, "rewards/format_reward": 0.8535156473517418, "step": 233 }, { "completion_length": 1024.0, "epoch": 0.8268551236749117, "grad_norm": 0.2918520900684292, "kl": 7.419921875, "learning_rate": 1.7809856102204148e-06, "loss": 0.2967, "reward": 1.1516927555203438, "reward_std": 0.3947129677981138, "rewards/accuracy_reward": 0.30664063431322575, "rewards/format_reward": 0.8450520969927311, "step": 234 }, { "completion_length": 993.1250038146973, "epoch": 0.8303886925795053, "grad_norm": 0.2326957931992313, "kl": 7.501953125, "learning_rate": 1.7111615572361628e-06, "loss": 0.3001, "reward": 1.1100260615348816, "reward_std": 0.4124826304614544, "rewards/accuracy_reward": 0.2643229244276881, "rewards/format_reward": 0.8457031473517418, "step": 235 }, { "completion_length": 1008.4791679382324, "epoch": 0.833922261484099, "grad_norm": 0.20186154524969585, "kl": 7.302734375, "learning_rate": 1.642605507746786e-06, "loss": 0.2922, "reward": 1.0924479439854622, "reward_std": 0.4381315726786852, "rewards/accuracy_reward": 0.2623697970993817, "rewards/format_reward": 0.8300781436264515, "step": 236 }, { "completion_length": 1012.5625, "epoch": 0.8374558303886925, "grad_norm": 0.12307486347767096, "kl": 7.609375, "learning_rate": 1.5753279492664264e-06, "loss": 0.3044, "reward": 1.0917969197034836, "reward_std": 0.37845473177731037, "rewards/accuracy_reward": 0.23046875465661287, "rewards/format_reward": 0.8613281436264515, "step": 237 }, { "completion_length": 996.0000038146973, "epoch": 0.8409893992932862, "grad_norm": 0.12490510412366805, "kl": 7.5703125, "learning_rate": 1.509339173729214e-06, "loss": 0.3028, "reward": 1.1119791939854622, "reward_std": 0.37066210247576237, "rewards/accuracy_reward": 0.25195313338190317, "rewards/format_reward": 0.8600260652601719, "step": 238 }, { "completion_length": 1013.7708358764648, "epoch": 0.8445229681978799, "grad_norm": 0.13493904378820848, "kl": 7.599609375, "learning_rate": 1.4446492759148411e-06, "loss": 0.3039, "reward": 1.1028646193444729, "reward_std": 0.36637131590396166, "rewards/accuracy_reward": 0.24934896640479565, "rewards/format_reward": 0.8535156436264515, "step": 239 }, { "completion_length": 1024.0, "epoch": 0.8480565371024735, "grad_norm": 0.15035989381249287, "kl": 7.650390625, "learning_rate": 1.381268151904298e-06, "loss": 0.3059, "reward": 1.1451823264360428, "reward_std": 0.3553981352597475, "rewards/accuracy_reward": 0.25781250931322575, "rewards/format_reward": 0.8873698078095913, "step": 240 }, { "completion_length": 1024.0, "epoch": 0.8515901060070671, "grad_norm": 0.22950348264138257, "kl": 7.703125, "learning_rate": 1.319205497565983e-06, "loss": 0.3085, "reward": 1.177083384245634, "reward_std": 0.36925146263092756, "rewards/accuracy_reward": 0.2929687611758709, "rewards/format_reward": 0.884114608168602, "step": 241 }, { "completion_length": 1010.7916679382324, "epoch": 0.8551236749116607, "grad_norm": 0.16497755116206372, "kl": 7.603515625, "learning_rate": 1.2584708070724738e-06, "loss": 0.3041, "reward": 1.1829427406191826, "reward_std": 0.3738958667963743, "rewards/accuracy_reward": 0.2897135494276881, "rewards/format_reward": 0.8932291865348816, "step": 242 }, { "completion_length": 1008.9583358764648, "epoch": 0.8586572438162544, "grad_norm": 0.17585743920433772, "kl": 7.76171875, "learning_rate": 1.1990733714481185e-06, "loss": 0.3107, "reward": 1.1471354588866234, "reward_std": 0.3857318237423897, "rewards/accuracy_reward": 0.26888021547347307, "rewards/format_reward": 0.8782552294433117, "step": 243 }, { "completion_length": 1009.5833358764648, "epoch": 0.8621908127208481, "grad_norm": 0.09937123864278254, "kl": 7.640625, "learning_rate": 1.1410222771477276e-06, "loss": 0.3056, "reward": 1.1360677555203438, "reward_std": 0.3525569401681423, "rewards/accuracy_reward": 0.24934896640479565, "rewards/format_reward": 0.8867187686264515, "step": 244 }, { "completion_length": 1024.0, "epoch": 0.8657243816254417, "grad_norm": 0.23903667479086305, "kl": 7.603515625, "learning_rate": 1.0843264046665558e-06, "loss": 0.304, "reward": 1.1881510764360428, "reward_std": 0.3764577666297555, "rewards/accuracy_reward": 0.3027343852445483, "rewards/format_reward": 0.8854166865348816, "step": 245 }, { "completion_length": 1024.0, "epoch": 0.8692579505300353, "grad_norm": 0.1464770828710901, "kl": 7.6328125, "learning_rate": 1.0289944271817898e-06, "loss": 0.3051, "reward": 1.1829427406191826, "reward_std": 0.36000128649175167, "rewards/accuracy_reward": 0.2910156324505806, "rewards/format_reward": 0.891927108168602, "step": 246 }, { "completion_length": 1011.8333358764648, "epoch": 0.872791519434629, "grad_norm": 0.12575527161822952, "kl": 7.70703125, "learning_rate": 9.750348092257368e-07, "loss": 0.3083, "reward": 1.2018229737877846, "reward_std": 0.33693454321473837, "rewards/accuracy_reward": 0.296875006519258, "rewards/format_reward": 0.9049479402601719, "step": 247 }, { "completion_length": 1007.6041679382324, "epoch": 0.8763250883392226, "grad_norm": 0.2181922287172582, "kl": 7.560546875, "learning_rate": 9.224558053909615e-07, "loss": 0.3026, "reward": 1.1432291939854622, "reward_std": 0.3927531726658344, "rewards/accuracy_reward": 0.26562500931322575, "rewards/format_reward": 0.8776041902601719, "step": 248 }, { "completion_length": 1003.625, "epoch": 0.8798586572438163, "grad_norm": 0.1951601816286408, "kl": 7.806640625, "learning_rate": 8.712654590675085e-07, "loss": 0.3126, "reward": 1.1347656659781933, "reward_std": 0.3628583550453186, "rewards/accuracy_reward": 0.2552083423361182, "rewards/format_reward": 0.8795573152601719, "step": 249 }, { "completion_length": 1017.8541679382324, "epoch": 0.8833922261484098, "grad_norm": 0.13270611252972092, "kl": 7.900390625, "learning_rate": 8.214716012124491e-07, "loss": 0.3162, "reward": 1.2278646156191826, "reward_std": 0.33511496149003506, "rewards/accuracy_reward": 0.3190104244276881, "rewards/format_reward": 0.9088541902601719, "step": 250 }, { "completion_length": 1024.0, "epoch": 0.8869257950530035, "grad_norm": 0.16899318018961745, "kl": 7.703125, "learning_rate": 7.730818491519343e-07, "loss": 0.3083, "reward": 1.1673177480697632, "reward_std": 0.35251003317534924, "rewards/accuracy_reward": 0.28515625838190317, "rewards/format_reward": 0.8821614794433117, "step": 251 }, { "completion_length": 1024.0, "epoch": 0.8904593639575972, "grad_norm": 0.19761065312810003, "kl": 7.53125, "learning_rate": 7.261036054158965e-07, "loss": 0.3013, "reward": 1.1621094308793545, "reward_std": 0.361306588165462, "rewards/accuracy_reward": 0.28515625558793545, "rewards/format_reward": 0.8769531473517418, "step": 252 }, { "completion_length": 980.1666679382324, "epoch": 0.8939929328621908, "grad_norm": 0.12059278219677871, "kl": 7.86328125, "learning_rate": 6.805440566056554e-07, "loss": 0.3147, "reward": 1.1438802555203438, "reward_std": 0.3613898027688265, "rewards/accuracy_reward": 0.2636718824505806, "rewards/format_reward": 0.8802083507180214, "step": 253 }, { "completion_length": 1011.1041679382324, "epoch": 0.8975265017667845, "grad_norm": 0.21668223679230778, "kl": 7.732421875, "learning_rate": 6.364101722945082e-07, "loss": 0.309, "reward": 1.144531287252903, "reward_std": 0.3460462633520365, "rewards/accuracy_reward": 0.2636718829162419, "rewards/format_reward": 0.8808594010770321, "step": 254 }, { "completion_length": 1024.0, "epoch": 0.901060070671378, "grad_norm": 0.2561339153931696, "kl": 7.5546875, "learning_rate": 5.937087039615619e-07, "loss": 0.3021, "reward": 1.1035156734287739, "reward_std": 0.3481142967939377, "rewards/accuracy_reward": 0.23372396640479565, "rewards/format_reward": 0.8697916902601719, "step": 255 }, { "completion_length": 1024.0, "epoch": 0.9045936395759717, "grad_norm": 0.1552920007884265, "kl": 7.521484375, "learning_rate": 5.524461839589012e-07, "loss": 0.3008, "reward": 1.1230469271540642, "reward_std": 0.36596967838704586, "rewards/accuracy_reward": 0.2526041688397527, "rewards/format_reward": 0.870442733168602, "step": 256 }, { "completion_length": 1008.7708358764648, "epoch": 0.9081272084805654, "grad_norm": 0.183290959725611, "kl": 7.634765625, "learning_rate": 5.126289245122906e-07, "loss": 0.3054, "reward": 1.1549479588866234, "reward_std": 0.3895051181316376, "rewards/accuracy_reward": 0.27929688058793545, "rewards/format_reward": 0.8756510578095913, "step": 257 }, { "completion_length": 1014.6875, "epoch": 0.911660777385159, "grad_norm": 0.14993091508284867, "kl": 7.6953125, "learning_rate": 4.7426301675554285e-07, "loss": 0.3077, "reward": 1.1523437835276127, "reward_std": 0.343139311298728, "rewards/accuracy_reward": 0.261718756519258, "rewards/format_reward": 0.8906250186264515, "step": 258 }, { "completion_length": 1007.9791679382324, "epoch": 0.9151943462897526, "grad_norm": 0.1374641960302751, "kl": 7.583984375, "learning_rate": 4.3735432979872593e-07, "loss": 0.3032, "reward": 1.1640625447034836, "reward_std": 0.37700783647596836, "rewards/accuracy_reward": 0.27799479849636555, "rewards/format_reward": 0.8860677294433117, "step": 259 }, { "completion_length": 1006.0, "epoch": 0.9187279151943463, "grad_norm": 0.17289466011678567, "kl": 7.734375, "learning_rate": 4.019085098303077e-07, "loss": 0.3092, "reward": 1.1796875298023224, "reward_std": 0.3705375073477626, "rewards/accuracy_reward": 0.3046875074505806, "rewards/format_reward": 0.8750000111758709, "step": 260 }, { "completion_length": 1020.5625, "epoch": 0.9222614840989399, "grad_norm": 0.2504365893065849, "kl": 7.763671875, "learning_rate": 3.679309792534291e-07, "loss": 0.3107, "reward": 1.1927083730697632, "reward_std": 0.36883932538330555, "rewards/accuracy_reward": 0.30338542722165585, "rewards/format_reward": 0.8893229402601719, "step": 261 }, { "completion_length": 1024.0, "epoch": 0.9257950530035336, "grad_norm": 0.15980023469329024, "kl": 7.708984375, "learning_rate": 3.354269358563966e-07, "loss": 0.3083, "reward": 1.1666666939854622, "reward_std": 0.3717129658907652, "rewards/accuracy_reward": 0.28125000838190317, "rewards/format_reward": 0.8854166828095913, "step": 262 }, { "completion_length": 1024.0, "epoch": 0.9293286219081273, "grad_norm": 0.14673263922185112, "kl": 7.51171875, "learning_rate": 3.044013520175337e-07, "loss": 0.3004, "reward": 1.1158854588866234, "reward_std": 0.39931169617921114, "rewards/accuracy_reward": 0.2441406324505806, "rewards/format_reward": 0.8717448115348816, "step": 263 }, { "completion_length": 1024.0, "epoch": 0.9328621908127208, "grad_norm": 0.21979625485038595, "kl": 7.701171875, "learning_rate": 2.7485897394453067e-07, "loss": 0.308, "reward": 1.1269531771540642, "reward_std": 0.3809357853606343, "rewards/accuracy_reward": 0.23763021733611822, "rewards/format_reward": 0.8893229328095913, "step": 264 }, { "completion_length": 999.4583358764648, "epoch": 0.9363957597173145, "grad_norm": 0.1678262537397904, "kl": 7.7421875, "learning_rate": 2.4680432094837394e-07, "loss": 0.3099, "reward": 1.1510417014360428, "reward_std": 0.3365847198292613, "rewards/accuracy_reward": 0.2623697994276881, "rewards/format_reward": 0.8886718973517418, "step": 265 }, { "completion_length": 1024.0, "epoch": 0.9399293286219081, "grad_norm": 0.17657969817520183, "kl": 7.6640625, "learning_rate": 2.2024168475199615e-07, "loss": 0.3068, "reward": 1.125000037252903, "reward_std": 0.3468599859625101, "rewards/accuracy_reward": 0.25195313058793545, "rewards/format_reward": 0.8730468899011612, "step": 266 }, { "completion_length": 1024.0, "epoch": 0.9434628975265018, "grad_norm": 0.16463993549487405, "kl": 7.640625, "learning_rate": 1.9517512883374667e-07, "loss": 0.3057, "reward": 1.1464844048023224, "reward_std": 0.38465187326073647, "rewards/accuracy_reward": 0.26106771687045693, "rewards/format_reward": 0.8854166828095913, "step": 267 }, { "completion_length": 1024.0, "epoch": 0.9469964664310954, "grad_norm": 0.1836439792441492, "kl": 7.57421875, "learning_rate": 1.7160848780576334e-07, "loss": 0.303, "reward": 1.1764323264360428, "reward_std": 0.39003968983888626, "rewards/accuracy_reward": 0.29296875838190317, "rewards/format_reward": 0.8834635689854622, "step": 268 }, { "completion_length": 1003.2916679382324, "epoch": 0.950530035335689, "grad_norm": 0.20059017619611783, "kl": 7.5078125, "learning_rate": 1.495453668273672e-07, "loss": 0.3002, "reward": 1.149088565260172, "reward_std": 0.3729328028857708, "rewards/accuracy_reward": 0.27408855129033327, "rewards/format_reward": 0.8750000260770321, "step": 269 }, { "completion_length": 1002.9375038146973, "epoch": 0.9540636042402827, "grad_norm": 0.1818614489805749, "kl": 7.669921875, "learning_rate": 1.289891410535593e-07, "loss": 0.3069, "reward": 1.1751302555203438, "reward_std": 0.3694411441683769, "rewards/accuracy_reward": 0.28190105129033327, "rewards/format_reward": 0.8932291865348816, "step": 270 }, { "completion_length": 1024.0, "epoch": 0.9575971731448764, "grad_norm": 0.1417185938823282, "kl": 7.783203125, "learning_rate": 1.0994295511869257e-07, "loss": 0.3114, "reward": 1.164713580161333, "reward_std": 0.3704876583069563, "rewards/accuracy_reward": 0.28385417629033327, "rewards/format_reward": 0.8808593973517418, "step": 271 }, { "completion_length": 1007.5833358764648, "epoch": 0.9611307420494699, "grad_norm": 0.1335404235268344, "kl": 7.65234375, "learning_rate": 9.240972265541992e-08, "loss": 0.3061, "reward": 1.136718787252903, "reward_std": 0.33945256378501654, "rewards/accuracy_reward": 0.25325521687045693, "rewards/format_reward": 0.8834635652601719, "step": 272 }, { "completion_length": 995.3958358764648, "epoch": 0.9646643109540636, "grad_norm": 0.16627237105129372, "kl": 7.748046875, "learning_rate": 7.639212584897082e-08, "loss": 0.3097, "reward": 1.1458333767950535, "reward_std": 0.36959288641810417, "rewards/accuracy_reward": 0.2591145895421505, "rewards/format_reward": 0.8867187611758709, "step": 273 }, { "completion_length": 977.8541717529297, "epoch": 0.9681978798586572, "grad_norm": 0.1442104843092151, "kl": 7.8046875, "learning_rate": 6.189261502683619e-08, "loss": 0.3121, "reward": 1.1608073338866234, "reward_std": 0.33501617051661015, "rewards/accuracy_reward": 0.26302084047347307, "rewards/format_reward": 0.897786483168602, "step": 274 }, { "completion_length": 1005.8333358764648, "epoch": 0.9717314487632509, "grad_norm": 0.1335599188895197, "kl": 7.734375, "learning_rate": 4.8913408283934874e-08, "loss": 0.3094, "reward": 1.1829427480697632, "reward_std": 0.3483387678861618, "rewards/accuracy_reward": 0.2923177173361182, "rewards/format_reward": 0.8906250149011612, "step": 275 }, { "completion_length": 986.5833358764648, "epoch": 0.9752650176678446, "grad_norm": 0.1566671460322668, "kl": 7.9140625, "learning_rate": 3.745649114328065e-08, "loss": 0.3167, "reward": 1.2102865017950535, "reward_std": 0.35581814870238304, "rewards/accuracy_reward": 0.305338547565043, "rewards/format_reward": 0.9049479402601719, "step": 276 }, { "completion_length": 1024.0, "epoch": 0.9787985865724381, "grad_norm": 0.18267938124865138, "kl": 7.66796875, "learning_rate": 2.7523616252252972e-08, "loss": 0.3068, "reward": 1.1458333730697632, "reward_std": 0.372573995962739, "rewards/accuracy_reward": 0.2617187579162419, "rewards/format_reward": 0.8841146044433117, "step": 277 }, { "completion_length": 990.7500038146973, "epoch": 0.9823321554770318, "grad_norm": 0.21259523984138215, "kl": 7.6875, "learning_rate": 1.9116303114480316e-08, "loss": 0.3076, "reward": 1.1731771230697632, "reward_std": 0.3659754488617182, "rewards/accuracy_reward": 0.28971354803070426, "rewards/format_reward": 0.8834635689854622, "step": 278 }, { "completion_length": 1012.2291679382324, "epoch": 0.9858657243816255, "grad_norm": 0.16162769674837876, "kl": 7.580078125, "learning_rate": 1.2235837857387246e-08, "loss": 0.3032, "reward": 1.130859412252903, "reward_std": 0.3709502723067999, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.8808593973517418, "step": 279 }, { "completion_length": 1024.0, "epoch": 0.9893992932862191, "grad_norm": 0.1297894260306723, "kl": 7.84375, "learning_rate": 6.883273035447335e-09, "loss": 0.3136, "reward": 1.1634114906191826, "reward_std": 0.3336097300052643, "rewards/accuracy_reward": 0.27539063431322575, "rewards/format_reward": 0.888020858168602, "step": 280 }, { "completion_length": 1024.0, "epoch": 0.9929328621908127, "grad_norm": 0.1521970740888536, "kl": 7.67578125, "learning_rate": 3.0594274691686522e-09, "loss": 0.3069, "reward": 1.1373698264360428, "reward_std": 0.35927175264805555, "rewards/accuracy_reward": 0.258463550824672, "rewards/format_reward": 0.8789062723517418, "step": 281 }, { "completion_length": 992.9791679382324, "epoch": 0.9964664310954063, "grad_norm": 0.19083083078751598, "kl": 7.509765625, "learning_rate": 7.648861198306101e-10, "loss": 0.3004, "reward": 1.164713580161333, "reward_std": 0.38056557066738605, "rewards/accuracy_reward": 0.299479179084301, "rewards/format_reward": 0.8652343973517418, "step": 282 }, { "completion_length": 1004.0, "epoch": 1.0, "grad_norm": 0.22334310417649858, "kl": 7.740234375, "learning_rate": 0.0, "loss": 0.3095, "reward": 1.1516927778720856, "reward_std": 0.36419933661818504, "rewards/accuracy_reward": 0.26171875558793545, "rewards/format_reward": 0.889973983168602, "step": 283 }, { "epoch": 1.0, "step": 283, "total_flos": 0.0, "train_loss": 3.28601383127165, "train_runtime": 57853.0435, "train_samples_per_second": 1.252, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 283, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }