|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 500, |
|
"global_step": 283, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 387.71745681762695, |
|
"epoch": 0.0035335689045936395, |
|
"grad_norm": 0.7826879124381357, |
|
"kl": 0.0, |
|
"learning_rate": 6.896551724137931e-07, |
|
"loss": 0.0, |
|
"reward": 0.6354166809469461, |
|
"reward_std": 0.4388374499976635, |
|
"rewards/accuracy_reward": 0.16276042070239782, |
|
"rewards/format_reward": 0.4726562611758709, |
|
"step": 1 |
|
}, |
|
{ |
|
"completion_length": 405.3405055999756, |
|
"epoch": 0.007067137809187279, |
|
"grad_norm": 1.1927588816666794, |
|
"kl": 0.0, |
|
"learning_rate": 1.3793103448275862e-06, |
|
"loss": 0.0, |
|
"reward": 0.6035156473517418, |
|
"reward_std": 0.4155316762626171, |
|
"rewards/accuracy_reward": 0.1523437537252903, |
|
"rewards/format_reward": 0.4511718861758709, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 409.28972244262695, |
|
"epoch": 0.01060070671378092, |
|
"grad_norm": 0.6905561253532118, |
|
"kl": 0.00020521879196166992, |
|
"learning_rate": 2.0689655172413796e-06, |
|
"loss": 0.0, |
|
"reward": 0.5996093954890966, |
|
"reward_std": 0.4422223027795553, |
|
"rewards/accuracy_reward": 0.15234375465661287, |
|
"rewards/format_reward": 0.4472656361758709, |
|
"step": 3 |
|
}, |
|
{ |
|
"completion_length": 410.4349060058594, |
|
"epoch": 0.014134275618374558, |
|
"grad_norm": 0.7884781255286112, |
|
"kl": 0.00036644935607910156, |
|
"learning_rate": 2.7586206896551725e-06, |
|
"loss": 0.0, |
|
"reward": 0.6054687723517418, |
|
"reward_std": 0.4260980412364006, |
|
"rewards/accuracy_reward": 0.14192708721384406, |
|
"rewards/format_reward": 0.46354168467223644, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 346.99024391174316, |
|
"epoch": 0.0176678445229682, |
|
"grad_norm": 0.5273525890826313, |
|
"kl": 0.0029783248901367188, |
|
"learning_rate": 3.448275862068966e-06, |
|
"loss": 0.0001, |
|
"reward": 0.7408854402601719, |
|
"reward_std": 0.40636622719466686, |
|
"rewards/accuracy_reward": 0.11979167081881315, |
|
"rewards/format_reward": 0.6210937686264515, |
|
"step": 5 |
|
}, |
|
{ |
|
"completion_length": 325.3359498977661, |
|
"epoch": 0.02120141342756184, |
|
"grad_norm": 0.46274180489878297, |
|
"kl": 0.01139068603515625, |
|
"learning_rate": 4.137931034482759e-06, |
|
"loss": 0.0005, |
|
"reward": 0.7929687649011612, |
|
"reward_std": 0.4467017278075218, |
|
"rewards/accuracy_reward": 0.1158854195382446, |
|
"rewards/format_reward": 0.6770833507180214, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 227.93359851837158, |
|
"epoch": 0.024734982332155476, |
|
"grad_norm": 53.42434025598496, |
|
"kl": 1.0528564453125, |
|
"learning_rate": 4.8275862068965525e-06, |
|
"loss": 0.042, |
|
"reward": 0.9694010689854622, |
|
"reward_std": 0.24079907592386007, |
|
"rewards/accuracy_reward": 0.06445312686264515, |
|
"rewards/format_reward": 0.9049479439854622, |
|
"step": 7 |
|
}, |
|
{ |
|
"completion_length": 202.09896278381348, |
|
"epoch": 0.028268551236749116, |
|
"grad_norm": 19.059740265959093, |
|
"kl": 0.4888916015625, |
|
"learning_rate": 5.517241379310345e-06, |
|
"loss": 0.0195, |
|
"reward": 0.9843750298023224, |
|
"reward_std": 0.24569615349173546, |
|
"rewards/accuracy_reward": 0.06380208441987634, |
|
"rewards/format_reward": 0.9205729328095913, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 183.13867664337158, |
|
"epoch": 0.03180212014134275, |
|
"grad_norm": 1.481224853303734, |
|
"kl": 0.0942840576171875, |
|
"learning_rate": 6.206896551724138e-06, |
|
"loss": 0.0038, |
|
"reward": 1.023437526077032, |
|
"reward_std": 0.236824631690979, |
|
"rewards/accuracy_reward": 0.09309896151535213, |
|
"rewards/format_reward": 0.9303385615348816, |
|
"step": 9 |
|
}, |
|
{ |
|
"completion_length": 175.65234756469727, |
|
"epoch": 0.0353356890459364, |
|
"grad_norm": 0.5566746911844828, |
|
"kl": 0.0570068359375, |
|
"learning_rate": 6.896551724137932e-06, |
|
"loss": 0.0023, |
|
"reward": 1.0058594197034836, |
|
"reward_std": 0.20571813452988863, |
|
"rewards/accuracy_reward": 0.0716145855258219, |
|
"rewards/format_reward": 0.934244804084301, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 173.4589900970459, |
|
"epoch": 0.038869257950530034, |
|
"grad_norm": 0.3466466544445655, |
|
"kl": 0.0336151123046875, |
|
"learning_rate": 7.586206896551724e-06, |
|
"loss": 0.0013, |
|
"reward": 1.0175781473517418, |
|
"reward_std": 0.21056926436722279, |
|
"rewards/accuracy_reward": 0.07291666802484542, |
|
"rewards/format_reward": 0.9446614757180214, |
|
"step": 11 |
|
}, |
|
{ |
|
"completion_length": 167.7506561279297, |
|
"epoch": 0.04240282685512368, |
|
"grad_norm": 0.31983345914206196, |
|
"kl": 0.042755126953125, |
|
"learning_rate": 8.275862068965518e-06, |
|
"loss": 0.0017, |
|
"reward": 1.0455729477107525, |
|
"reward_std": 0.20765206310898066, |
|
"rewards/accuracy_reward": 0.09375000174622983, |
|
"rewards/format_reward": 0.9518229328095913, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 170.16797399520874, |
|
"epoch": 0.045936395759717315, |
|
"grad_norm": 0.36163723632628936, |
|
"kl": 0.0521392822265625, |
|
"learning_rate": 8.965517241379312e-06, |
|
"loss": 0.0021, |
|
"reward": 1.0390625335276127, |
|
"reward_std": 0.22777050640434027, |
|
"rewards/accuracy_reward": 0.09244792052777484, |
|
"rewards/format_reward": 0.9466146044433117, |
|
"step": 13 |
|
}, |
|
{ |
|
"completion_length": 158.57943153381348, |
|
"epoch": 0.04946996466431095, |
|
"grad_norm": 0.3520770926699356, |
|
"kl": 0.0601348876953125, |
|
"learning_rate": 9.655172413793105e-06, |
|
"loss": 0.0024, |
|
"reward": 1.0709635838866234, |
|
"reward_std": 0.25874905101954937, |
|
"rewards/accuracy_reward": 0.13020833674818277, |
|
"rewards/format_reward": 0.9407552257180214, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 118.45377922058105, |
|
"epoch": 0.053003533568904596, |
|
"grad_norm": 0.35628917266654875, |
|
"kl": 0.10369873046875, |
|
"learning_rate": 1.0344827586206898e-05, |
|
"loss": 0.0041, |
|
"reward": 1.0696614794433117, |
|
"reward_std": 0.21860592905431986, |
|
"rewards/accuracy_reward": 0.11132812919095159, |
|
"rewards/format_reward": 0.958333358168602, |
|
"step": 15 |
|
}, |
|
{ |
|
"completion_length": 81.11849284172058, |
|
"epoch": 0.05653710247349823, |
|
"grad_norm": 1.1321068685550928, |
|
"kl": 0.23651123046875, |
|
"learning_rate": 1.103448275862069e-05, |
|
"loss": 0.0095, |
|
"reward": 1.0891927629709244, |
|
"reward_std": 0.16596419550478458, |
|
"rewards/accuracy_reward": 0.10677083791233599, |
|
"rewards/format_reward": 0.9824218899011612, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 116.46028995513916, |
|
"epoch": 0.06007067137809187, |
|
"grad_norm": 0.4979444532314162, |
|
"kl": 0.111053466796875, |
|
"learning_rate": 1.1724137931034483e-05, |
|
"loss": 0.0044, |
|
"reward": 1.091796912252903, |
|
"reward_std": 0.20691483141854405, |
|
"rewards/accuracy_reward": 0.12630208651535213, |
|
"rewards/format_reward": 0.9654948078095913, |
|
"step": 17 |
|
}, |
|
{ |
|
"completion_length": 106.39062833786011, |
|
"epoch": 0.0636042402826855, |
|
"grad_norm": 0.39934788591997045, |
|
"kl": 0.13848876953125, |
|
"learning_rate": 1.2413793103448277e-05, |
|
"loss": 0.0055, |
|
"reward": 1.1243490055203438, |
|
"reward_std": 0.23089734092354774, |
|
"rewards/accuracy_reward": 0.15299479360692203, |
|
"rewards/format_reward": 0.9713541902601719, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 116.17318058013916, |
|
"epoch": 0.06713780918727916, |
|
"grad_norm": 0.40747455110093217, |
|
"kl": 0.15606689453125, |
|
"learning_rate": 1.310344827586207e-05, |
|
"loss": 0.0062, |
|
"reward": 1.1321614980697632, |
|
"reward_std": 0.23220197623595595, |
|
"rewards/accuracy_reward": 0.15625000465661287, |
|
"rewards/format_reward": 0.9759114794433117, |
|
"step": 19 |
|
}, |
|
{ |
|
"completion_length": 109.37304925918579, |
|
"epoch": 0.0706713780918728, |
|
"grad_norm": 0.38360455679162203, |
|
"kl": 0.2166748046875, |
|
"learning_rate": 1.3793103448275863e-05, |
|
"loss": 0.0087, |
|
"reward": 1.1816406697034836, |
|
"reward_std": 0.2594331307336688, |
|
"rewards/accuracy_reward": 0.20182292396202683, |
|
"rewards/format_reward": 0.9798177219927311, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 106.66992425918579, |
|
"epoch": 0.07420494699646643, |
|
"grad_norm": 0.4520249358225062, |
|
"kl": 0.266357421875, |
|
"learning_rate": 1.4482758620689657e-05, |
|
"loss": 0.0107, |
|
"reward": 1.1595052555203438, |
|
"reward_std": 0.24440898094326258, |
|
"rewards/accuracy_reward": 0.17773437732830644, |
|
"rewards/format_reward": 0.9817708544433117, |
|
"step": 21 |
|
}, |
|
{ |
|
"completion_length": 140.5937557220459, |
|
"epoch": 0.07773851590106007, |
|
"grad_norm": 1.4718976020696355, |
|
"kl": 0.232421875, |
|
"learning_rate": 1.5172413793103448e-05, |
|
"loss": 0.0093, |
|
"reward": 1.199869841337204, |
|
"reward_std": 0.2511229431256652, |
|
"rewards/accuracy_reward": 0.2174479211680591, |
|
"rewards/format_reward": 0.9824219010770321, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 186.76562976837158, |
|
"epoch": 0.0812720848056537, |
|
"grad_norm": 0.9576903316083716, |
|
"kl": 0.173370361328125, |
|
"learning_rate": 1.586206896551724e-05, |
|
"loss": 0.0069, |
|
"reward": 1.2298177480697632, |
|
"reward_std": 0.26364279724657536, |
|
"rewards/accuracy_reward": 0.24869792629033327, |
|
"rewards/format_reward": 0.9811198152601719, |
|
"step": 23 |
|
}, |
|
{ |
|
"completion_length": 189.61133193969727, |
|
"epoch": 0.08480565371024736, |
|
"grad_norm": 0.3064098864277374, |
|
"kl": 0.15606689453125, |
|
"learning_rate": 1.6551724137931037e-05, |
|
"loss": 0.0062, |
|
"reward": 1.2526042088866234, |
|
"reward_std": 0.27888874523341656, |
|
"rewards/accuracy_reward": 0.28125000931322575, |
|
"rewards/format_reward": 0.9713541828095913, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 179.1979217529297, |
|
"epoch": 0.08833922261484099, |
|
"grad_norm": 0.4424785145099648, |
|
"kl": 0.168243408203125, |
|
"learning_rate": 1.7241379310344828e-05, |
|
"loss": 0.0067, |
|
"reward": 1.189453162252903, |
|
"reward_std": 0.2696635592728853, |
|
"rewards/accuracy_reward": 0.22526042349636555, |
|
"rewards/format_reward": 0.9641927294433117, |
|
"step": 25 |
|
}, |
|
{ |
|
"completion_length": 220.5813865661621, |
|
"epoch": 0.09187279151943463, |
|
"grad_norm": 0.2927466463526939, |
|
"kl": 0.117034912109375, |
|
"learning_rate": 1.7931034482758623e-05, |
|
"loss": 0.0047, |
|
"reward": 1.221354216337204, |
|
"reward_std": 0.28838597796857357, |
|
"rewards/accuracy_reward": 0.25520834047347307, |
|
"rewards/format_reward": 0.9661458507180214, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 218.9602918624878, |
|
"epoch": 0.09540636042402827, |
|
"grad_norm": 0.3147261189039069, |
|
"kl": 0.1197509765625, |
|
"learning_rate": 1.8620689655172415e-05, |
|
"loss": 0.0048, |
|
"reward": 1.2246094197034836, |
|
"reward_std": 0.2890056548640132, |
|
"rewards/accuracy_reward": 0.26236980129033327, |
|
"rewards/format_reward": 0.962239608168602, |
|
"step": 27 |
|
}, |
|
{ |
|
"completion_length": 192.85547256469727, |
|
"epoch": 0.0989399293286219, |
|
"grad_norm": 0.2815314878632684, |
|
"kl": 0.147918701171875, |
|
"learning_rate": 1.931034482758621e-05, |
|
"loss": 0.0059, |
|
"reward": 1.2167969048023224, |
|
"reward_std": 0.29042986780405045, |
|
"rewards/accuracy_reward": 0.2584635470993817, |
|
"rewards/format_reward": 0.9583333544433117, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 178.49284267425537, |
|
"epoch": 0.10247349823321555, |
|
"grad_norm": 0.32426980772846276, |
|
"kl": 0.1710205078125, |
|
"learning_rate": 2e-05, |
|
"loss": 0.0068, |
|
"reward": 1.2174479514360428, |
|
"reward_std": 0.28664571419358253, |
|
"rewards/accuracy_reward": 0.26627604849636555, |
|
"rewards/format_reward": 0.9511718936264515, |
|
"step": 29 |
|
}, |
|
{ |
|
"completion_length": 132.87891054153442, |
|
"epoch": 0.10600706713780919, |
|
"grad_norm": 0.3462539007364067, |
|
"kl": 0.24053955078125, |
|
"learning_rate": 1.999923511388017e-05, |
|
"loss": 0.0096, |
|
"reward": 1.2102864906191826, |
|
"reward_std": 0.2935393461957574, |
|
"rewards/accuracy_reward": 0.2519531361758709, |
|
"rewards/format_reward": 0.9583333544433117, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 104.59570646286011, |
|
"epoch": 0.10954063604240283, |
|
"grad_norm": 0.31757187279941806, |
|
"kl": 0.29840087890625, |
|
"learning_rate": 1.999694057253083e-05, |
|
"loss": 0.0119, |
|
"reward": 1.2070312798023224, |
|
"reward_std": 0.2599523845128715, |
|
"rewards/accuracy_reward": 0.24153646733611822, |
|
"rewards/format_reward": 0.9654948115348816, |
|
"step": 31 |
|
}, |
|
{ |
|
"completion_length": 122.79622745513916, |
|
"epoch": 0.11307420494699646, |
|
"grad_norm": 0.29516858001336377, |
|
"kl": 0.289306640625, |
|
"learning_rate": 1.9993116726964554e-05, |
|
"loss": 0.0116, |
|
"reward": 1.1855469048023224, |
|
"reward_std": 0.2488250662572682, |
|
"rewards/accuracy_reward": 0.2220052140764892, |
|
"rewards/format_reward": 0.9635416865348816, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 169.53385829925537, |
|
"epoch": 0.1166077738515901, |
|
"grad_norm": 0.2854397155723977, |
|
"kl": 0.218505859375, |
|
"learning_rate": 1.9987764162142615e-05, |
|
"loss": 0.0087, |
|
"reward": 1.2135417088866234, |
|
"reward_std": 0.27537838369607925, |
|
"rewards/accuracy_reward": 0.25130209513008595, |
|
"rewards/format_reward": 0.9622396044433117, |
|
"step": 33 |
|
}, |
|
{ |
|
"completion_length": 219.41602230072021, |
|
"epoch": 0.12014134275618374, |
|
"grad_norm": 0.22474385235022273, |
|
"kl": 0.1595458984375, |
|
"learning_rate": 1.998088369688552e-05, |
|
"loss": 0.0064, |
|
"reward": 1.214843787252903, |
|
"reward_std": 0.31507682241499424, |
|
"rewards/accuracy_reward": 0.2766927136108279, |
|
"rewards/format_reward": 0.9381510615348816, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 274.76563453674316, |
|
"epoch": 0.12367491166077739, |
|
"grad_norm": 1452525.1386107916, |
|
"kl": 21248.131378173828, |
|
"learning_rate": 1.9972476383747748e-05, |
|
"loss": 851.4882, |
|
"reward": 1.1894531697034836, |
|
"reward_std": 0.3239498296752572, |
|
"rewards/accuracy_reward": 0.2558593829162419, |
|
"rewards/format_reward": 0.9335937686264515, |
|
"step": 35 |
|
}, |
|
{ |
|
"completion_length": 315.4687614440918, |
|
"epoch": 0.127208480565371, |
|
"grad_norm": 4.918800245024653, |
|
"kl": 0.221282958984375, |
|
"learning_rate": 1.9962543508856722e-05, |
|
"loss": 0.0088, |
|
"reward": 1.212890662252903, |
|
"reward_std": 0.3367620576173067, |
|
"rewards/accuracy_reward": 0.29817709140479565, |
|
"rewards/format_reward": 0.9147135615348816, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 308.2695417404175, |
|
"epoch": 0.13074204946996468, |
|
"grad_norm": 0.2301169380333895, |
|
"kl": 0.126373291015625, |
|
"learning_rate": 1.995108659171607e-05, |
|
"loss": 0.0051, |
|
"reward": 1.2324219271540642, |
|
"reward_std": 0.31519000325351954, |
|
"rewards/accuracy_reward": 0.28971354849636555, |
|
"rewards/format_reward": 0.9427083544433117, |
|
"step": 37 |
|
}, |
|
{ |
|
"completion_length": 287.9537887573242, |
|
"epoch": 0.13427561837455831, |
|
"grad_norm": 0.20139063038206165, |
|
"kl": 0.11163330078125, |
|
"learning_rate": 1.9938107384973165e-05, |
|
"loss": 0.0045, |
|
"reward": 1.2350260764360428, |
|
"reward_std": 0.2919177133589983, |
|
"rewards/accuracy_reward": 0.26888021547347307, |
|
"rewards/format_reward": 0.966145858168602, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 288.69141578674316, |
|
"epoch": 0.13780918727915195, |
|
"grad_norm": 0.18498422915016208, |
|
"kl": 0.1121826171875, |
|
"learning_rate": 1.992360787415103e-05, |
|
"loss": 0.0045, |
|
"reward": 1.2382812947034836, |
|
"reward_std": 0.2696660226210952, |
|
"rewards/accuracy_reward": 0.270182297565043, |
|
"rewards/format_reward": 0.9680989757180214, |
|
"step": 39 |
|
}, |
|
{ |
|
"completion_length": 285.7558660507202, |
|
"epoch": 0.1413427561837456, |
|
"grad_norm": 0.18156169712553022, |
|
"kl": 0.112762451171875, |
|
"learning_rate": 1.9907590277344582e-05, |
|
"loss": 0.0045, |
|
"reward": 1.2604167014360428, |
|
"reward_std": 0.23935140296816826, |
|
"rewards/accuracy_reward": 0.2799479244276881, |
|
"rewards/format_reward": 0.9804687649011612, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 274.0761842727661, |
|
"epoch": 0.14487632508833923, |
|
"grad_norm": 0.2080024626531214, |
|
"kl": 0.1217041015625, |
|
"learning_rate": 1.9890057044881308e-05, |
|
"loss": 0.0049, |
|
"reward": 1.2936198338866234, |
|
"reward_std": 0.2932386351749301, |
|
"rewards/accuracy_reward": 0.32031251210719347, |
|
"rewards/format_reward": 0.9733073152601719, |
|
"step": 41 |
|
}, |
|
{ |
|
"completion_length": 282.8776149749756, |
|
"epoch": 0.14840989399293286, |
|
"grad_norm": 0.23856185666588572, |
|
"kl": 0.125213623046875, |
|
"learning_rate": 1.9871010858946443e-05, |
|
"loss": 0.005, |
|
"reward": 1.2753906697034836, |
|
"reward_std": 0.2589658652432263, |
|
"rewards/accuracy_reward": 0.3033854253590107, |
|
"rewards/format_reward": 0.972005233168602, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 296.82422733306885, |
|
"epoch": 0.1519434628975265, |
|
"grad_norm": 0.2858858078510366, |
|
"kl": 0.140716552734375, |
|
"learning_rate": 1.9850454633172632e-05, |
|
"loss": 0.0056, |
|
"reward": 1.265625037252903, |
|
"reward_std": 0.3042640471830964, |
|
"rewards/accuracy_reward": 0.3138020960614085, |
|
"rewards/format_reward": 0.951822929084301, |
|
"step": 43 |
|
}, |
|
{ |
|
"completion_length": 312.989595413208, |
|
"epoch": 0.15547703180212014, |
|
"grad_norm": 75.43924110056017, |
|
"kl": 1.633636474609375, |
|
"learning_rate": 1.982839151219424e-05, |
|
"loss": 0.0654, |
|
"reward": 1.2662760615348816, |
|
"reward_std": 0.3203592775389552, |
|
"rewards/accuracy_reward": 0.32617188431322575, |
|
"rewards/format_reward": 0.9401041865348816, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 318.19271659851074, |
|
"epoch": 0.15901060070671377, |
|
"grad_norm": 4.839307387093032, |
|
"kl": 0.45843505859375, |
|
"learning_rate": 1.9804824871166254e-05, |
|
"loss": 0.0183, |
|
"reward": 1.2480469271540642, |
|
"reward_std": 0.37619344517588615, |
|
"rewards/accuracy_reward": 0.3489583432674408, |
|
"rewards/format_reward": 0.8990885615348816, |
|
"step": 45 |
|
}, |
|
{ |
|
"completion_length": 360.181001663208, |
|
"epoch": 0.1625441696113074, |
|
"grad_norm": 1.6956048738668972, |
|
"kl": 0.27557373046875, |
|
"learning_rate": 1.9779758315248006e-05, |
|
"loss": 0.011, |
|
"reward": 1.2226562947034836, |
|
"reward_std": 0.3949108961969614, |
|
"rewards/accuracy_reward": 0.3483073003590107, |
|
"rewards/format_reward": 0.874348983168602, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 365.8092555999756, |
|
"epoch": 0.16607773851590105, |
|
"grad_norm": 201.83615153266913, |
|
"kl": 12.43408203125, |
|
"learning_rate": 1.975319567905163e-05, |
|
"loss": 0.4973, |
|
"reward": 1.1972656659781933, |
|
"reward_std": 0.4045752976089716, |
|
"rewards/accuracy_reward": 0.345703131519258, |
|
"rewards/format_reward": 0.8515625186264515, |
|
"step": 47 |
|
}, |
|
{ |
|
"completion_length": 342.6002674102783, |
|
"epoch": 0.1696113074204947, |
|
"grad_norm": 5.398450236245697, |
|
"kl": 0.656494140625, |
|
"learning_rate": 1.9725141026055473e-05, |
|
"loss": 0.0263, |
|
"reward": 1.2246094085276127, |
|
"reward_std": 0.4386922810226679, |
|
"rewards/accuracy_reward": 0.37304688431322575, |
|
"rewards/format_reward": 0.8515625186264515, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 354.3912830352783, |
|
"epoch": 0.17314487632508835, |
|
"grad_norm": 6.492613210824359, |
|
"kl": 0.2779541015625, |
|
"learning_rate": 1.9695598647982467e-05, |
|
"loss": 0.0111, |
|
"reward": 1.1972656697034836, |
|
"reward_std": 0.44870651699602604, |
|
"rewards/accuracy_reward": 0.3587239682674408, |
|
"rewards/format_reward": 0.8385416902601719, |
|
"step": 49 |
|
}, |
|
{ |
|
"completion_length": 317.2200622558594, |
|
"epoch": 0.17667844522968199, |
|
"grad_norm": 6.9054290512934475, |
|
"kl": 0.31793212890625, |
|
"learning_rate": 1.9664573064143604e-05, |
|
"loss": 0.0127, |
|
"reward": 1.148437537252903, |
|
"reward_std": 0.4271644949913025, |
|
"rewards/accuracy_reward": 0.3033854244276881, |
|
"rewards/format_reward": 0.8450521044433117, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 291.3737087249756, |
|
"epoch": 0.18021201413427562, |
|
"grad_norm": 1.295034459732409, |
|
"kl": 0.384765625, |
|
"learning_rate": 1.9632069020746574e-05, |
|
"loss": 0.0154, |
|
"reward": 1.1966146305203438, |
|
"reward_std": 0.41263195499777794, |
|
"rewards/accuracy_reward": 0.34179688338190317, |
|
"rewards/format_reward": 0.8548177257180214, |
|
"step": 51 |
|
}, |
|
{ |
|
"completion_length": 242.330735206604, |
|
"epoch": 0.18374558303886926, |
|
"grad_norm": 44.75372246484856, |
|
"kl": 3.65264892578125, |
|
"learning_rate": 1.9598091490169696e-05, |
|
"loss": 0.1463, |
|
"reward": 1.1809896305203438, |
|
"reward_std": 0.3946582209318876, |
|
"rewards/accuracy_reward": 0.31445313477888703, |
|
"rewards/format_reward": 0.8665364757180214, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 196.8216199874878, |
|
"epoch": 0.1872791519434629, |
|
"grad_norm": 4.161042131383654, |
|
"kl": 0.65478515625, |
|
"learning_rate": 1.9562645670201278e-05, |
|
"loss": 0.0262, |
|
"reward": 1.2526041865348816, |
|
"reward_std": 0.3336602235212922, |
|
"rewards/accuracy_reward": 0.3313802145421505, |
|
"rewards/format_reward": 0.9212239682674408, |
|
"step": 53 |
|
}, |
|
{ |
|
"completion_length": 176.54818201065063, |
|
"epoch": 0.19081272084805653, |
|
"grad_norm": 0.2979202087235891, |
|
"kl": 0.368896484375, |
|
"learning_rate": 1.9525736983244458e-05, |
|
"loss": 0.0148, |
|
"reward": 1.2096354514360428, |
|
"reward_std": 0.32884097658097744, |
|
"rewards/accuracy_reward": 0.29687500884756446, |
|
"rewards/format_reward": 0.912760429084301, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 166.145188331604, |
|
"epoch": 0.19434628975265017, |
|
"grad_norm": 1.099328042290411, |
|
"kl": 0.488525390625, |
|
"learning_rate": 1.948737107548771e-05, |
|
"loss": 0.0195, |
|
"reward": 1.263671912252903, |
|
"reward_std": 0.3232028791680932, |
|
"rewards/accuracy_reward": 0.34700521919876337, |
|
"rewards/format_reward": 0.9166666865348816, |
|
"step": 55 |
|
}, |
|
{ |
|
"completion_length": 201.4856834411621, |
|
"epoch": 0.1978798586572438, |
|
"grad_norm": 0.5237539826150962, |
|
"kl": 0.37255859375, |
|
"learning_rate": 1.94475538160411e-05, |
|
"loss": 0.0149, |
|
"reward": 1.2526042014360428, |
|
"reward_std": 0.32452640403062105, |
|
"rewards/accuracy_reward": 0.3457031324505806, |
|
"rewards/format_reward": 0.906901054084301, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 223.5299530029297, |
|
"epoch": 0.20141342756183744, |
|
"grad_norm": 1.0789636117575958, |
|
"kl": 0.4588623046875, |
|
"learning_rate": 1.940629129603844e-05, |
|
"loss": 0.0183, |
|
"reward": 1.2089843973517418, |
|
"reward_std": 0.367857669480145, |
|
"rewards/accuracy_reward": 0.3307291744276881, |
|
"rewards/format_reward": 0.8782552294433117, |
|
"step": 57 |
|
}, |
|
{ |
|
"completion_length": 236.89389038085938, |
|
"epoch": 0.2049469964664311, |
|
"grad_norm": 0.9522159379621601, |
|
"kl": 0.476318359375, |
|
"learning_rate": 1.9363589827705494e-05, |
|
"loss": 0.0191, |
|
"reward": 1.227213591337204, |
|
"reward_std": 0.39456650614738464, |
|
"rewards/accuracy_reward": 0.34049480501562357, |
|
"rewards/format_reward": 0.8867187611758709, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 218.1360740661621, |
|
"epoch": 0.20848056537102475, |
|
"grad_norm": 1.264236430534227, |
|
"kl": 0.8280029296875, |
|
"learning_rate": 1.9319455943394347e-05, |
|
"loss": 0.0331, |
|
"reward": 1.2571614980697632, |
|
"reward_std": 0.3343982622027397, |
|
"rewards/accuracy_reward": 0.35286459513008595, |
|
"rewards/format_reward": 0.9042969010770321, |
|
"step": 59 |
|
}, |
|
{ |
|
"completion_length": 185.6367244720459, |
|
"epoch": 0.21201413427561838, |
|
"grad_norm": 111.38995544077012, |
|
"kl": 7.3544921875, |
|
"learning_rate": 1.9273896394584103e-05, |
|
"loss": 0.2946, |
|
"reward": 1.272786483168602, |
|
"reward_std": 0.3291959064081311, |
|
"rewards/accuracy_reward": 0.35221355222165585, |
|
"rewards/format_reward": 0.9205729365348816, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 197.7513074874878, |
|
"epoch": 0.21554770318021202, |
|
"grad_norm": 1.9921896449669312, |
|
"kl": 0.7696533203125, |
|
"learning_rate": 1.9226918150848067e-05, |
|
"loss": 0.0308, |
|
"reward": 1.2643229588866234, |
|
"reward_std": 0.32842374220490456, |
|
"rewards/accuracy_reward": 0.3457031324505806, |
|
"rewards/format_reward": 0.9186198115348816, |
|
"step": 61 |
|
}, |
|
{ |
|
"completion_length": 255.86784744262695, |
|
"epoch": 0.21908127208480566, |
|
"grad_norm": 2.1340639274317144, |
|
"kl": 0.924560546875, |
|
"learning_rate": 1.9178528398787553e-05, |
|
"loss": 0.037, |
|
"reward": 1.1725260838866234, |
|
"reward_std": 0.4286086466163397, |
|
"rewards/accuracy_reward": 0.31575521547347307, |
|
"rewards/format_reward": 0.8567708544433117, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 175.7168025970459, |
|
"epoch": 0.2226148409893993, |
|
"grad_norm": 7.908467450030422, |
|
"kl": 1.04931640625, |
|
"learning_rate": 1.9128734540932494e-05, |
|
"loss": 0.042, |
|
"reward": 1.2838541865348816, |
|
"reward_std": 0.31748174503445625, |
|
"rewards/accuracy_reward": 0.34765625838190317, |
|
"rewards/format_reward": 0.9361979365348816, |
|
"step": 63 |
|
}, |
|
{ |
|
"completion_length": 145.36784315109253, |
|
"epoch": 0.22614840989399293, |
|
"grad_norm": 5066.629420414826, |
|
"kl": 549.625, |
|
"learning_rate": 1.907754419460904e-05, |
|
"loss": 22.0063, |
|
"reward": 1.3138021156191826, |
|
"reward_std": 0.25937482714653015, |
|
"rewards/accuracy_reward": 0.3567708423361182, |
|
"rewards/format_reward": 0.9570312723517418, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 134.21419763565063, |
|
"epoch": 0.22968197879858657, |
|
"grad_norm": 73.69279102025739, |
|
"kl": 7.2021484375, |
|
"learning_rate": 1.9024965190774262e-05, |
|
"loss": 0.2879, |
|
"reward": 1.2675781548023224, |
|
"reward_std": 0.2847044528461993, |
|
"rewards/accuracy_reward": 0.32226563058793545, |
|
"rewards/format_reward": 0.9453125260770321, |
|
"step": 65 |
|
}, |
|
{ |
|
"completion_length": 295.8131628036499, |
|
"epoch": 0.2332155477031802, |
|
"grad_norm": 15.351766639697455, |
|
"kl": 1.186279296875, |
|
"learning_rate": 1.8971005572818213e-05, |
|
"loss": 0.0474, |
|
"reward": 1.1197917014360428, |
|
"reward_std": 0.4257570914924145, |
|
"rewards/accuracy_reward": 0.285807297565043, |
|
"rewards/format_reward": 0.8339843899011612, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 483.8196773529053, |
|
"epoch": 0.23674911660777384, |
|
"grad_norm": 595.1397215741732, |
|
"kl": 64.0, |
|
"learning_rate": 1.8915673595333443e-05, |
|
"loss": 2.5615, |
|
"reward": 0.9531250223517418, |
|
"reward_std": 0.5964761041104794, |
|
"rewards/accuracy_reward": 0.30403646640479565, |
|
"rewards/format_reward": 0.649088554084301, |
|
"step": 67 |
|
}, |
|
{ |
|
"completion_length": 788.6002807617188, |
|
"epoch": 0.24028268551236748, |
|
"grad_norm": 50.44103352403159, |
|
"kl": 3.42041015625, |
|
"learning_rate": 1.8858977722852273e-05, |
|
"loss": 0.1367, |
|
"reward": 0.4303385578095913, |
|
"reward_std": 0.5175576768815517, |
|
"rewards/accuracy_reward": 0.17187500512227416, |
|
"rewards/format_reward": 0.2584635503590107, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 929.8216323852539, |
|
"epoch": 0.24381625441696114, |
|
"grad_norm": 734.3438170967142, |
|
"kl": 85.0, |
|
"learning_rate": 1.8800926628551884e-05, |
|
"loss": 3.3995, |
|
"reward": 0.21744792256504297, |
|
"reward_std": 0.338073399849236, |
|
"rewards/accuracy_reward": 0.11653646267950535, |
|
"rewards/format_reward": 0.1009114624466747, |
|
"step": 69 |
|
}, |
|
{ |
|
"completion_length": 997.1341323852539, |
|
"epoch": 0.24734982332155478, |
|
"grad_norm": 30.37607254327561, |
|
"kl": 5.88671875, |
|
"learning_rate": 1.8741529192927528e-05, |
|
"loss": 0.2356, |
|
"reward": 0.125000003259629, |
|
"reward_std": 0.22288852790370584, |
|
"rewards/accuracy_reward": 0.09440104337409139, |
|
"rewards/format_reward": 0.03059895901242271, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 1017.4121170043945, |
|
"epoch": 0.2508833922261484, |
|
"grad_norm": 27.33797696554464, |
|
"kl": 1.1336669921875, |
|
"learning_rate": 1.8680794502434018e-05, |
|
"loss": 0.0453, |
|
"reward": 0.09960937825962901, |
|
"reward_std": 0.15996943740174174, |
|
"rewards/accuracy_reward": 0.09895833628252149, |
|
"rewards/format_reward": 0.0006510416860692203, |
|
"step": 71 |
|
}, |
|
{ |
|
"completion_length": 1011.7773590087891, |
|
"epoch": 0.254416961130742, |
|
"grad_norm": 51.58164513519689, |
|
"kl": 1.179931640625, |
|
"learning_rate": 1.8618731848095706e-05, |
|
"loss": 0.0472, |
|
"reward": 0.12369791930541396, |
|
"reward_std": 0.19708295073360205, |
|
"rewards/accuracy_reward": 0.1223958358168602, |
|
"rewards/format_reward": 0.0013020833721384406, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 1023.6119804382324, |
|
"epoch": 0.2579505300353357, |
|
"grad_norm": 0.3170159566701739, |
|
"kl": 0.142608642578125, |
|
"learning_rate": 1.855535072408516e-05, |
|
"loss": 0.0057, |
|
"reward": 0.11263021267950535, |
|
"reward_std": 0.1862776312045753, |
|
"rewards/accuracy_reward": 0.11197917093522847, |
|
"rewards/format_reward": 0.0006510416860692203, |
|
"step": 73 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.26148409893992935, |
|
"grad_norm": 0.4703496805090262, |
|
"kl": 0.159820556640625, |
|
"learning_rate": 1.849066082627079e-05, |
|
"loss": 0.0064, |
|
"reward": 0.10807291930541396, |
|
"reward_std": 0.15622267639264464, |
|
"rewards/accuracy_reward": 0.10807291930541396, |
|
"rewards/format_reward": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.26501766784452296, |
|
"grad_norm": 0.17854669938905443, |
|
"kl": 0.15557861328125, |
|
"learning_rate": 1.8424672050733577e-05, |
|
"loss": 0.0062, |
|
"reward": 0.16731771267950535, |
|
"reward_std": 0.2150044571608305, |
|
"rewards/accuracy_reward": 0.16731771267950535, |
|
"rewards/format_reward": 0.0, |
|
"step": 75 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.26855123674911663, |
|
"grad_norm": 0.9729182998733601, |
|
"kl": 0.2708740234375, |
|
"learning_rate": 1.8357394492253216e-05, |
|
"loss": 0.0108, |
|
"reward": 0.20703125465661287, |
|
"reward_std": 0.24680148623883724, |
|
"rewards/accuracy_reward": 0.20703125465661287, |
|
"rewards/format_reward": 0.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.27208480565371024, |
|
"grad_norm": 0.10194934645252694, |
|
"kl": 0.17828369140625, |
|
"learning_rate": 1.8288838442763838e-05, |
|
"loss": 0.0071, |
|
"reward": 0.2675781324505806, |
|
"reward_std": 0.27682292833924294, |
|
"rewards/accuracy_reward": 0.2662760494276881, |
|
"rewards/format_reward": 0.0013020833721384406, |
|
"step": 77 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.2756183745583039, |
|
"grad_norm": 0.10819020058919213, |
|
"kl": 0.18212890625, |
|
"learning_rate": 1.8219014389779586e-05, |
|
"loss": 0.0073, |
|
"reward": 0.26497396547347307, |
|
"reward_std": 0.25599073618650436, |
|
"rewards/accuracy_reward": 0.2597656324505806, |
|
"rewards/format_reward": 0.0052083334885537624, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.2791519434628975, |
|
"grad_norm": 0.25970666338072285, |
|
"kl": 0.196533203125, |
|
"learning_rate": 1.8147933014790245e-05, |
|
"loss": 0.0079, |
|
"reward": 0.2988281287252903, |
|
"reward_std": 0.28412702213972807, |
|
"rewards/accuracy_reward": 0.2760416744276881, |
|
"rewards/format_reward": 0.02278645895421505, |
|
"step": 79 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.2826855123674912, |
|
"grad_norm": 0.7315926003388828, |
|
"kl": 0.2489013671875, |
|
"learning_rate": 1.8075605191627242e-05, |
|
"loss": 0.01, |
|
"reward": 0.401041679084301, |
|
"reward_std": 0.3803202658891678, |
|
"rewards/accuracy_reward": 0.2910156296566129, |
|
"rewards/format_reward": 0.11002604523673654, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.2862190812720848, |
|
"grad_norm": 10.638076717998826, |
|
"kl": 1.59765625, |
|
"learning_rate": 1.8002041984800173e-05, |
|
"loss": 0.064, |
|
"reward": 0.8561198078095913, |
|
"reward_std": 0.5722472295165062, |
|
"rewards/accuracy_reward": 0.29622396547347307, |
|
"rewards/format_reward": 0.5598958469927311, |
|
"step": 81 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.28975265017667845, |
|
"grad_norm": 1.8766038235299047, |
|
"kl": 0.3572998046875, |
|
"learning_rate": 1.792725464780421e-05, |
|
"loss": 0.0143, |
|
"reward": 0.9231771044433117, |
|
"reward_std": 0.5282374154776335, |
|
"rewards/accuracy_reward": 0.2630208423361182, |
|
"rewards/format_reward": 0.6601562723517418, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.29328621908127206, |
|
"grad_norm": 2.8486965993956006, |
|
"kl": 0.9862060546875, |
|
"learning_rate": 1.785125462139855e-05, |
|
"loss": 0.0394, |
|
"reward": 1.1699219197034836, |
|
"reward_std": 0.3966878689825535, |
|
"rewards/accuracy_reward": 0.2936198003590107, |
|
"rewards/format_reward": 0.876302108168602, |
|
"step": 83 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.2968197879858657, |
|
"grad_norm": 1.3612075349379993, |
|
"kl": 0.6107177734375, |
|
"learning_rate": 1.7774053531856258e-05, |
|
"loss": 0.0244, |
|
"reward": 1.2044271007180214, |
|
"reward_std": 0.4200763385742903, |
|
"rewards/accuracy_reward": 0.30989584047347307, |
|
"rewards/format_reward": 0.8945312723517418, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3003533568904594, |
|
"grad_norm": 8.701071380759565, |
|
"kl": 2.2080078125, |
|
"learning_rate": 1.7695663189185703e-05, |
|
"loss": 0.0883, |
|
"reward": 1.208984412252903, |
|
"reward_std": 0.36989905312657356, |
|
"rewards/accuracy_reward": 0.2812500102445483, |
|
"rewards/format_reward": 0.9277343936264515, |
|
"step": 85 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.303886925795053, |
|
"grad_norm": 1.8019930642199862, |
|
"kl": 1.42626953125, |
|
"learning_rate": 1.7616095585323882e-05, |
|
"loss": 0.0571, |
|
"reward": 1.156901091337204, |
|
"reward_std": 0.4002630840986967, |
|
"rewards/accuracy_reward": 0.26041667349636555, |
|
"rewards/format_reward": 0.8964843973517418, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.30742049469964666, |
|
"grad_norm": 4.127300637009884, |
|
"kl": 0.299072265625, |
|
"learning_rate": 1.7535362892301953e-05, |
|
"loss": 0.012, |
|
"reward": 1.0872396118938923, |
|
"reward_std": 0.4400251917541027, |
|
"rewards/accuracy_reward": 0.23958334233611822, |
|
"rewards/format_reward": 0.8476562760770321, |
|
"step": 87 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.31095406360424027, |
|
"grad_norm": 4.667835148606083, |
|
"kl": 0.3037109375, |
|
"learning_rate": 1.745347746038319e-05, |
|
"loss": 0.0122, |
|
"reward": 1.0917969234287739, |
|
"reward_std": 0.4730749297887087, |
|
"rewards/accuracy_reward": 0.2747395886108279, |
|
"rewards/format_reward": 0.8170573078095913, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.31448763250883394, |
|
"grad_norm": 2.7630279757978706, |
|
"kl": 0.873291015625, |
|
"learning_rate": 1.737045181617364e-05, |
|
"loss": 0.035, |
|
"reward": 1.0364583693444729, |
|
"reward_std": 0.5082622393965721, |
|
"rewards/accuracy_reward": 0.2389322966337204, |
|
"rewards/format_reward": 0.7975260578095913, |
|
"step": 89 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.31802120141342755, |
|
"grad_norm": 27.960489910885407, |
|
"kl": 5.583984375, |
|
"learning_rate": 1.7286298660705877e-05, |
|
"loss": 0.2233, |
|
"reward": 1.0781250298023224, |
|
"reward_std": 0.47590856440365314, |
|
"rewards/accuracy_reward": 0.25000000838190317, |
|
"rewards/format_reward": 0.8281250223517418, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3215547703180212, |
|
"grad_norm": 46.613552133923285, |
|
"kl": 8.90234375, |
|
"learning_rate": 1.7201030867496005e-05, |
|
"loss": 0.3559, |
|
"reward": 1.0358073264360428, |
|
"reward_std": 0.4818594641983509, |
|
"rewards/accuracy_reward": 0.2369791753590107, |
|
"rewards/format_reward": 0.7988281399011612, |
|
"step": 91 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3250883392226148, |
|
"grad_norm": 28.615484872216385, |
|
"kl": 5.740234375, |
|
"learning_rate": 1.711466148057433e-05, |
|
"loss": 0.2297, |
|
"reward": 0.9694010727107525, |
|
"reward_std": 0.5088351331651211, |
|
"rewards/accuracy_reward": 0.22916667582467198, |
|
"rewards/format_reward": 0.7402343861758709, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3286219081272085, |
|
"grad_norm": 5.615041558235327, |
|
"kl": 2.17333984375, |
|
"learning_rate": 1.7027203712489902e-05, |
|
"loss": 0.0869, |
|
"reward": 0.8919270969927311, |
|
"reward_std": 0.5635814908891916, |
|
"rewards/accuracy_reward": 0.26757813431322575, |
|
"rewards/format_reward": 0.6243489757180214, |
|
"step": 93 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3321554770318021, |
|
"grad_norm": 8.244116658103383, |
|
"kl": 0.3486328125, |
|
"learning_rate": 1.6938670942289292e-05, |
|
"loss": 0.0139, |
|
"reward": 0.6302083544433117, |
|
"reward_std": 0.5751823391765356, |
|
"rewards/accuracy_reward": 0.22395833861082792, |
|
"rewards/format_reward": 0.40625000931322575, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.33568904593639576, |
|
"grad_norm": 6.524355268359978, |
|
"kl": 0.24981689453125, |
|
"learning_rate": 1.6849076713469914e-05, |
|
"loss": 0.01, |
|
"reward": 0.5638021007180214, |
|
"reward_std": 0.5523576978594065, |
|
"rewards/accuracy_reward": 0.23893229756504297, |
|
"rewards/format_reward": 0.32486980222165585, |
|
"step": 95 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3392226148409894, |
|
"grad_norm": 7.953861555674408, |
|
"kl": 0.3388671875, |
|
"learning_rate": 1.6758434731908178e-05, |
|
"loss": 0.0136, |
|
"reward": 0.6627604402601719, |
|
"reward_std": 0.5646504014730453, |
|
"rewards/accuracy_reward": 0.27343750838190317, |
|
"rewards/format_reward": 0.3893229253590107, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.34275618374558303, |
|
"grad_norm": 9.117171774796663, |
|
"kl": 0.66259765625, |
|
"learning_rate": 1.6666758863762796e-05, |
|
"loss": 0.0265, |
|
"reward": 0.8255208507180214, |
|
"reward_std": 0.5879920609295368, |
|
"rewards/accuracy_reward": 0.281250006519258, |
|
"rewards/format_reward": 0.5442708488553762, |
|
"step": 97 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3462897526501767, |
|
"grad_norm": 3.2064911207772133, |
|
"kl": 0.98388671875, |
|
"learning_rate": 1.657406313335358e-05, |
|
"loss": 0.0394, |
|
"reward": 0.9733073189854622, |
|
"reward_std": 0.5437621138989925, |
|
"rewards/accuracy_reward": 0.27604167629033327, |
|
"rewards/format_reward": 0.6972656436264515, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3498233215547703, |
|
"grad_norm": 3.916153998441947, |
|
"kl": 0.6441650390625, |
|
"learning_rate": 1.6480361721016053e-05, |
|
"loss": 0.0258, |
|
"reward": 1.0598958618938923, |
|
"reward_std": 0.46388070471584797, |
|
"rewards/accuracy_reward": 0.2578125074505806, |
|
"rewards/format_reward": 0.8020833507180214, |
|
"step": 99 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.35335689045936397, |
|
"grad_norm": 1.527146735006838, |
|
"kl": 0.5404052734375, |
|
"learning_rate": 1.6385668960932143e-05, |
|
"loss": 0.0216, |
|
"reward": 1.104166690260172, |
|
"reward_std": 0.4302889872342348, |
|
"rewards/accuracy_reward": 0.2565104253590107, |
|
"rewards/format_reward": 0.8476562686264515, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3568904593639576, |
|
"grad_norm": 0.650990035380561, |
|
"kl": 0.4241943359375, |
|
"learning_rate": 1.6289999338937427e-05, |
|
"loss": 0.017, |
|
"reward": 1.2311198338866234, |
|
"reward_std": 0.36215772293508053, |
|
"rewards/accuracy_reward": 0.3190104244276881, |
|
"rewards/format_reward": 0.9121093973517418, |
|
"step": 101 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.36042402826855124, |
|
"grad_norm": 2.309888696921956, |
|
"kl": 0.6422119140625, |
|
"learning_rate": 1.619336749030509e-05, |
|
"loss": 0.0257, |
|
"reward": 1.1972656697034836, |
|
"reward_std": 0.33416645554825664, |
|
"rewards/accuracy_reward": 0.27018230129033327, |
|
"rewards/format_reward": 0.9270833507180214, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.36395759717314485, |
|
"grad_norm": 4.444041311869189, |
|
"kl": 0.896728515625, |
|
"learning_rate": 1.609578819750708e-05, |
|
"loss": 0.0359, |
|
"reward": 1.0091146156191826, |
|
"reward_std": 0.4832933880388737, |
|
"rewards/accuracy_reward": 0.2298177145421505, |
|
"rewards/format_reward": 0.7792968936264515, |
|
"step": 103 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3674911660777385, |
|
"grad_norm": 11.19029279075626, |
|
"kl": 0.737548828125, |
|
"learning_rate": 1.5997276387952733e-05, |
|
"loss": 0.0295, |
|
"reward": 0.9277344010770321, |
|
"reward_std": 0.5210180301219225, |
|
"rewards/accuracy_reward": 0.2356770890764892, |
|
"rewards/format_reward": 0.6920573078095913, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3710247349823322, |
|
"grad_norm": 15.55734153622524, |
|
"kl": 1.072021484375, |
|
"learning_rate": 1.5897847131705194e-05, |
|
"loss": 0.0429, |
|
"reward": 0.9375000223517418, |
|
"reward_std": 0.5307967625558376, |
|
"rewards/accuracy_reward": 0.2506510494276881, |
|
"rewards/format_reward": 0.6868489757180214, |
|
"step": 105 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3745583038869258, |
|
"grad_norm": 20.781177075399597, |
|
"kl": 1.77783203125, |
|
"learning_rate": 1.5797515639176077e-05, |
|
"loss": 0.0711, |
|
"reward": 0.9694010652601719, |
|
"reward_std": 0.5016757268458605, |
|
"rewards/accuracy_reward": 0.2220052145421505, |
|
"rewards/format_reward": 0.7473958507180214, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.37809187279151946, |
|
"grad_norm": 119.3128363246367, |
|
"kl": 6.580078125, |
|
"learning_rate": 1.5696297258798573e-05, |
|
"loss": 0.2632, |
|
"reward": 1.1093750447034836, |
|
"reward_std": 0.44075607880949974, |
|
"rewards/accuracy_reward": 0.25455730129033327, |
|
"rewards/format_reward": 0.8548177294433117, |
|
"step": 107 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.38162544169611307, |
|
"grad_norm": 6.7022300054671895, |
|
"kl": 2.0478515625, |
|
"learning_rate": 1.5594207474679533e-05, |
|
"loss": 0.082, |
|
"reward": 1.0462239943444729, |
|
"reward_std": 0.45448117703199387, |
|
"rewards/accuracy_reward": 0.22916667442768812, |
|
"rewards/format_reward": 0.8170573115348816, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.38515901060070673, |
|
"grad_norm": 62.28140494172677, |
|
"kl": 3.994140625, |
|
"learning_rate": 1.549126190423073e-05, |
|
"loss": 0.1599, |
|
"reward": 1.1243490017950535, |
|
"reward_std": 0.39074354991316795, |
|
"rewards/accuracy_reward": 0.2441406287252903, |
|
"rewards/format_reward": 0.8802083507180214, |
|
"step": 109 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.38869257950530034, |
|
"grad_norm": 4.965909615838145, |
|
"kl": 0.9990234375, |
|
"learning_rate": 1.5387476295779737e-05, |
|
"loss": 0.04, |
|
"reward": 1.065104205161333, |
|
"reward_std": 0.423415495082736, |
|
"rewards/accuracy_reward": 0.21809896687045693, |
|
"rewards/format_reward": 0.8470052182674408, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.392226148409894, |
|
"grad_norm": 3.528015251218169, |
|
"kl": 0.8387451171875, |
|
"learning_rate": 1.5282866526160837e-05, |
|
"loss": 0.0335, |
|
"reward": 1.0117187798023224, |
|
"reward_std": 0.41450436040759087, |
|
"rewards/accuracy_reward": 0.16471354756504297, |
|
"rewards/format_reward": 0.847005233168602, |
|
"step": 111 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3957597173144876, |
|
"grad_norm": 1.3076932365383318, |
|
"kl": 0.799560546875, |
|
"learning_rate": 1.5177448598286182e-05, |
|
"loss": 0.032, |
|
"reward": 1.0097656548023224, |
|
"reward_std": 0.3976512663066387, |
|
"rewards/accuracy_reward": 0.1523437537252903, |
|
"rewards/format_reward": 0.8574218936264515, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.3992932862190813, |
|
"grad_norm": 3.6570709102528838, |
|
"kl": 0.9739990234375, |
|
"learning_rate": 1.5071238638697731e-05, |
|
"loss": 0.0389, |
|
"reward": 1.0397135764360428, |
|
"reward_std": 0.38695196248590946, |
|
"rewards/accuracy_reward": 0.18229167023673654, |
|
"rewards/format_reward": 0.8574218899011612, |
|
"step": 113 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4028268551236749, |
|
"grad_norm": 303.94124396845274, |
|
"kl": 42.03125, |
|
"learning_rate": 1.4964252895100265e-05, |
|
"loss": 1.6829, |
|
"reward": 1.1425781548023224, |
|
"reward_std": 0.3955598259344697, |
|
"rewards/accuracy_reward": 0.24218750558793545, |
|
"rewards/format_reward": 0.9003906473517418, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.40636042402826855, |
|
"grad_norm": 68.76205878652742, |
|
"kl": 10.7421875, |
|
"learning_rate": 1.4856507733875837e-05, |
|
"loss": 0.4297, |
|
"reward": 1.1054687798023224, |
|
"reward_std": 0.36262817680835724, |
|
"rewards/accuracy_reward": 0.20052083814516664, |
|
"rewards/format_reward": 0.9049479365348816, |
|
"step": 115 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4098939929328622, |
|
"grad_norm": 14.914598534678452, |
|
"kl": 2.833984375, |
|
"learning_rate": 1.4748019637580116e-05, |
|
"loss": 0.1134, |
|
"reward": 1.0625000447034836, |
|
"reward_std": 0.4123513549566269, |
|
"rewards/accuracy_reward": 0.19531250838190317, |
|
"rewards/format_reward": 0.8671875149011612, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4134275618374558, |
|
"grad_norm": 5.452147867911957, |
|
"kl": 1.337890625, |
|
"learning_rate": 1.4638805202420896e-05, |
|
"loss": 0.0535, |
|
"reward": 0.9967448301613331, |
|
"reward_std": 0.46121339313685894, |
|
"rewards/accuracy_reward": 0.2018229216337204, |
|
"rewards/format_reward": 0.7949219010770321, |
|
"step": 117 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4169611307420495, |
|
"grad_norm": 5.07384920327551, |
|
"kl": 1.025634765625, |
|
"learning_rate": 1.452888113571929e-05, |
|
"loss": 0.041, |
|
"reward": 0.9401041865348816, |
|
"reward_std": 0.5634889136999846, |
|
"rewards/accuracy_reward": 0.24739584233611822, |
|
"rewards/format_reward": 0.6927083544433117, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4204946996466431, |
|
"grad_norm": 7.69200350154935, |
|
"kl": 0.906005859375, |
|
"learning_rate": 1.4418264253353869e-05, |
|
"loss": 0.0362, |
|
"reward": 0.821614608168602, |
|
"reward_std": 0.5736533179879189, |
|
"rewards/accuracy_reward": 0.2070312537252903, |
|
"rewards/format_reward": 0.6145833544433117, |
|
"step": 119 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.42402826855123676, |
|
"grad_norm": 20.808857934877054, |
|
"kl": 1.56982421875, |
|
"learning_rate": 1.4306971477188223e-05, |
|
"loss": 0.0627, |
|
"reward": 0.7656250223517418, |
|
"reward_std": 0.5566702261567116, |
|
"rewards/accuracy_reward": 0.17838542046956718, |
|
"rewards/format_reward": 0.587239608168602, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4275618374558304, |
|
"grad_norm": 26.447237281301867, |
|
"kl": 2.738037109375, |
|
"learning_rate": 1.419501983248229e-05, |
|
"loss": 0.1095, |
|
"reward": 0.7792969048023224, |
|
"reward_std": 0.5849619917571545, |
|
"rewards/accuracy_reward": 0.1907552140764892, |
|
"rewards/format_reward": 0.5885416846722364, |
|
"step": 121 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.43109540636042404, |
|
"grad_norm": 22.987609528434852, |
|
"kl": 1.94189453125, |
|
"learning_rate": 1.4082426445287904e-05, |
|
"loss": 0.0775, |
|
"reward": 0.8574218973517418, |
|
"reward_std": 0.5550148580223322, |
|
"rewards/accuracy_reward": 0.19661458861082792, |
|
"rewards/format_reward": 0.660807304084301, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.43462897526501765, |
|
"grad_norm": 7.848483482793478, |
|
"kl": 1.8486328125, |
|
"learning_rate": 1.3969208539828873e-05, |
|
"loss": 0.074, |
|
"reward": 0.8059896044433117, |
|
"reward_std": 0.5727136358618736, |
|
"rewards/accuracy_reward": 0.1894531319849193, |
|
"rewards/format_reward": 0.6165364794433117, |
|
"step": 123 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4381625441696113, |
|
"grad_norm": 7.568264702961115, |
|
"kl": 3.517578125, |
|
"learning_rate": 1.3855383435866076e-05, |
|
"loss": 0.1407, |
|
"reward": 0.7825521007180214, |
|
"reward_std": 0.5777693595737219, |
|
"rewards/accuracy_reward": 0.17643229733221233, |
|
"rewards/format_reward": 0.6061198096722364, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4416961130742049, |
|
"grad_norm": 1142.9874069005743, |
|
"kl": 47.984375, |
|
"learning_rate": 1.3740968546047935e-05, |
|
"loss": 1.9207, |
|
"reward": 0.7265625223517418, |
|
"reward_std": 0.5636367797851562, |
|
"rewards/accuracy_reward": 0.14778646221384406, |
|
"rewards/format_reward": 0.5787760615348816, |
|
"step": 125 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4452296819787986, |
|
"grad_norm": 27.152545132029452, |
|
"kl": 5.333984375, |
|
"learning_rate": 1.362598137324667e-05, |
|
"loss": 0.2134, |
|
"reward": 0.7291666902601719, |
|
"reward_std": 0.5741878617554903, |
|
"rewards/accuracy_reward": 0.18489583674818277, |
|
"rewards/format_reward": 0.5442708525806665, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.44876325088339225, |
|
"grad_norm": 15.772408193991923, |
|
"kl": 6.375, |
|
"learning_rate": 1.3510439507880778e-05, |
|
"loss": 0.255, |
|
"reward": 0.6901041865348816, |
|
"reward_std": 0.554063655436039, |
|
"rewards/accuracy_reward": 0.17122396267950535, |
|
"rewards/format_reward": 0.5188802238553762, |
|
"step": 127 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.45229681978798586, |
|
"grad_norm": 14.045655101866169, |
|
"kl": 4.6875, |
|
"learning_rate": 1.3394360625224067e-05, |
|
"loss": 0.1874, |
|
"reward": 0.6523437760770321, |
|
"reward_std": 0.5496281944215298, |
|
"rewards/accuracy_reward": 0.15950521756894886, |
|
"rewards/format_reward": 0.49283855594694614, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4558303886925795, |
|
"grad_norm": 6.019406500188639, |
|
"kl": 4.501953125, |
|
"learning_rate": 1.3277762482701769e-05, |
|
"loss": 0.1801, |
|
"reward": 0.6139323115348816, |
|
"reward_std": 0.5297244675457478, |
|
"rewards/accuracy_reward": 0.14778646267950535, |
|
"rewards/format_reward": 0.46614584885537624, |
|
"step": 129 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.45936395759717313, |
|
"grad_norm": 8.557172478534012, |
|
"kl": 2.5107421875, |
|
"learning_rate": 1.3160662917174045e-05, |
|
"loss": 0.1005, |
|
"reward": 0.5481771044433117, |
|
"reward_std": 0.5158671271055937, |
|
"rewards/accuracy_reward": 0.132812503259629, |
|
"rewards/format_reward": 0.4153645932674408, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4628975265017668, |
|
"grad_norm": 9.680783748799318, |
|
"kl": 2.01806640625, |
|
"learning_rate": 1.3043079842207363e-05, |
|
"loss": 0.0807, |
|
"reward": 0.5436198115348816, |
|
"reward_std": 0.525303166359663, |
|
"rewards/accuracy_reward": 0.13411458616610616, |
|
"rewards/format_reward": 0.40950521640479565, |
|
"step": 131 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4664310954063604, |
|
"grad_norm": 11.919372203113637, |
|
"kl": 3.8720703125, |
|
"learning_rate": 1.2925031245334112e-05, |
|
"loss": 0.1549, |
|
"reward": 0.5683593936264515, |
|
"reward_std": 0.5238314680755138, |
|
"rewards/accuracy_reward": 0.15104167046956718, |
|
"rewards/format_reward": 0.41731772013008595, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.46996466431095407, |
|
"grad_norm": 2.91738147399812, |
|
"kl": 3.1455078125, |
|
"learning_rate": 1.2806535185300931e-05, |
|
"loss": 0.1258, |
|
"reward": 0.5989583507180214, |
|
"reward_std": 0.5382577646523714, |
|
"rewards/accuracy_reward": 0.14388021198101342, |
|
"rewards/format_reward": 0.45507813803851604, |
|
"step": 133 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4734982332155477, |
|
"grad_norm": 9.809127835476216, |
|
"kl": 4.83984375, |
|
"learning_rate": 1.2687609789306144e-05, |
|
"loss": 0.1935, |
|
"reward": 0.5527343917638063, |
|
"reward_std": 0.5343853384256363, |
|
"rewards/accuracy_reward": 0.12760417070239782, |
|
"rewards/format_reward": 0.42513022013008595, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.47703180212014135, |
|
"grad_norm": 17.04549908577345, |
|
"kl": 7.4140625, |
|
"learning_rate": 1.2568273250226681e-05, |
|
"loss": 0.2967, |
|
"reward": 0.6009114738553762, |
|
"reward_std": 0.5301515571773052, |
|
"rewards/accuracy_reward": 0.13606771151535213, |
|
"rewards/format_reward": 0.46484375931322575, |
|
"step": 135 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.48056537102473496, |
|
"grad_norm": 8.477206528274886, |
|
"kl": 5.021484375, |
|
"learning_rate": 1.2448543823835016e-05, |
|
"loss": 0.201, |
|
"reward": 0.6223958544433117, |
|
"reward_std": 0.5402844380587339, |
|
"rewards/accuracy_reward": 0.13411458721384406, |
|
"rewards/format_reward": 0.4882812611758709, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4840989399293286, |
|
"grad_norm": 3.8500660741333816, |
|
"kl": 3.294921875, |
|
"learning_rate": 1.2328439826006415e-05, |
|
"loss": 0.1319, |
|
"reward": 0.655598983168602, |
|
"reward_std": 0.5419861897826195, |
|
"rewards/accuracy_reward": 0.15364583767950535, |
|
"rewards/format_reward": 0.5019531399011612, |
|
"step": 137 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4876325088339223, |
|
"grad_norm": 4.591683686353463, |
|
"kl": 2.4130859375, |
|
"learning_rate": 1.2207979629917061e-05, |
|
"loss": 0.0966, |
|
"reward": 0.6510416865348816, |
|
"reward_std": 0.5556948073208332, |
|
"rewards/accuracy_reward": 0.12630208616610616, |
|
"rewards/format_reward": 0.5247395969927311, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.4911660777385159, |
|
"grad_norm": 26.38361978331117, |
|
"kl": 5.92236328125, |
|
"learning_rate": 1.2087181663233354e-05, |
|
"loss": 0.2373, |
|
"reward": 0.6998698189854622, |
|
"reward_std": 0.5648407433182001, |
|
"rewards/accuracy_reward": 0.13541667070239782, |
|
"rewards/format_reward": 0.5644531436264515, |
|
"step": 139 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.49469964664310956, |
|
"grad_norm": 4.793844402068783, |
|
"kl": 2.24755859375, |
|
"learning_rate": 1.1966064405292887e-05, |
|
"loss": 0.0899, |
|
"reward": 0.7552083618938923, |
|
"reward_std": 0.5623416192829609, |
|
"rewards/accuracy_reward": 0.17057292209938169, |
|
"rewards/format_reward": 0.5846354328095913, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.49823321554770317, |
|
"grad_norm": 1.6059354727609763, |
|
"kl": 3.4482421875, |
|
"learning_rate": 1.184464638427756e-05, |
|
"loss": 0.1379, |
|
"reward": 0.7838541828095913, |
|
"reward_std": 0.5696821231395006, |
|
"rewards/accuracy_reward": 0.18815104570239782, |
|
"rewards/format_reward": 0.5957031436264515, |
|
"step": 141 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5017667844522968, |
|
"grad_norm": 14.85453956454356, |
|
"kl": 7.228515625, |
|
"learning_rate": 1.1722946174379168e-05, |
|
"loss": 0.2892, |
|
"reward": 0.7656250186264515, |
|
"reward_std": 0.5824484005570412, |
|
"rewards/accuracy_reward": 0.17317708721384406, |
|
"rewards/format_reward": 0.5924479383975267, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5053003533568905, |
|
"grad_norm": 21.201841225460676, |
|
"kl": 9.515625, |
|
"learning_rate": 1.1600982392957978e-05, |
|
"loss": 0.3802, |
|
"reward": 0.7910156510770321, |
|
"reward_std": 0.5755977407097816, |
|
"rewards/accuracy_reward": 0.16341146174818277, |
|
"rewards/format_reward": 0.6276041846722364, |
|
"step": 143 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.508833922261484, |
|
"grad_norm": 79.28650798092659, |
|
"kl": 7.9423828125, |
|
"learning_rate": 1.1478773697694691e-05, |
|
"loss": 0.318, |
|
"reward": 0.9016927294433117, |
|
"reward_std": 0.5674024932086468, |
|
"rewards/accuracy_reward": 0.2285156287252903, |
|
"rewards/format_reward": 0.6731771044433117, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5123674911660777, |
|
"grad_norm": 7.988007399943678, |
|
"kl": 4.2353515625, |
|
"learning_rate": 1.1356338783736256e-05, |
|
"loss": 0.1695, |
|
"reward": 0.8457031473517418, |
|
"reward_std": 0.5507397223263979, |
|
"rewards/accuracy_reward": 0.18294271500781178, |
|
"rewards/format_reward": 0.6627604365348816, |
|
"step": 145 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5159010600706714, |
|
"grad_norm": 6.6368383146740335, |
|
"kl": 2.5166015625, |
|
"learning_rate": 1.123369638083593e-05, |
|
"loss": 0.1007, |
|
"reward": 0.8782552406191826, |
|
"reward_std": 0.5208123382180929, |
|
"rewards/accuracy_reward": 0.1855468787252903, |
|
"rewards/format_reward": 0.6927083507180214, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.519434628975265, |
|
"grad_norm": 5.018034104235896, |
|
"kl": 2.7021484375, |
|
"learning_rate": 1.1110865250488047e-05, |
|
"loss": 0.1081, |
|
"reward": 0.884114608168602, |
|
"reward_std": 0.530108455568552, |
|
"rewards/accuracy_reward": 0.16601562919095159, |
|
"rewards/format_reward": 0.7180989719927311, |
|
"step": 147 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5229681978798587, |
|
"grad_norm": 4.782732693496574, |
|
"kl": 4.1826171875, |
|
"learning_rate": 1.0987864183057943e-05, |
|
"loss": 0.1672, |
|
"reward": 0.9218750223517418, |
|
"reward_std": 0.5164159703999758, |
|
"rewards/accuracy_reward": 0.18229167396202683, |
|
"rewards/format_reward": 0.7395833544433117, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5265017667844523, |
|
"grad_norm": 16.810647138161976, |
|
"kl": 6.6533203125, |
|
"learning_rate": 1.0864711994907457e-05, |
|
"loss": 0.2665, |
|
"reward": 0.9960937686264515, |
|
"reward_std": 0.48126981779932976, |
|
"rewards/accuracy_reward": 0.22786459187045693, |
|
"rewards/format_reward": 0.7682291828095913, |
|
"step": 149 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5300353356890459, |
|
"grad_norm": 10.7748697556108, |
|
"kl": 5.4052734375, |
|
"learning_rate": 1.0741427525516463e-05, |
|
"loss": 0.2162, |
|
"reward": 0.9928385689854622, |
|
"reward_std": 0.5116328075528145, |
|
"rewards/accuracy_reward": 0.22395833989139646, |
|
"rewards/format_reward": 0.7688802257180214, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5335689045936396, |
|
"grad_norm": 2.3886371142629574, |
|
"kl": 2.9921875, |
|
"learning_rate": 1.0618029634600843e-05, |
|
"loss": 0.1197, |
|
"reward": 0.9869791977107525, |
|
"reward_std": 0.48671552538871765, |
|
"rewards/accuracy_reward": 0.20572917303070426, |
|
"rewards/format_reward": 0.7812500186264515, |
|
"step": 151 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5371024734982333, |
|
"grad_norm": 1.7440867401519615, |
|
"kl": 3.501953125, |
|
"learning_rate": 1.0494537199227393e-05, |
|
"loss": 0.1401, |
|
"reward": 0.9563802443444729, |
|
"reward_std": 0.4778597932308912, |
|
"rewards/accuracy_reward": 0.1940104216337204, |
|
"rewards/format_reward": 0.7623698152601719, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5406360424028268, |
|
"grad_norm": 2.4296240779116753, |
|
"kl": 3.357421875, |
|
"learning_rate": 1.0370969110926052e-05, |
|
"loss": 0.1343, |
|
"reward": 0.9921875298023224, |
|
"reward_std": 0.4780040867626667, |
|
"rewards/accuracy_reward": 0.21679688291624188, |
|
"rewards/format_reward": 0.7753906473517418, |
|
"step": 153 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5441696113074205, |
|
"grad_norm": 4.117699458280056, |
|
"kl": 3.2236328125, |
|
"learning_rate": 1.024734427279995e-05, |
|
"loss": 0.129, |
|
"reward": 1.0136718973517418, |
|
"reward_std": 0.49156964384019375, |
|
"rewards/accuracy_reward": 0.23502604896202683, |
|
"rewards/format_reward": 0.7786458469927311, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5477031802120141, |
|
"grad_norm": 9.030839517376265, |
|
"kl": 5.662109375, |
|
"learning_rate": 1.012368159663363e-05, |
|
"loss": 0.2264, |
|
"reward": 1.0794271267950535, |
|
"reward_std": 0.45495169796049595, |
|
"rewards/accuracy_reward": 0.2740885503590107, |
|
"rewards/format_reward": 0.8053385578095913, |
|
"step": 155 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5512367491166078, |
|
"grad_norm": 3.2489067144831174, |
|
"kl": 3.689453125, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1475, |
|
"reward": 1.100260455161333, |
|
"reward_std": 0.4964223224669695, |
|
"rewards/accuracy_reward": 0.2955729253590107, |
|
"rewards/format_reward": 0.8046875223517418, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5547703180212014, |
|
"grad_norm": 5.586842094139113, |
|
"kl": 1.96337890625, |
|
"learning_rate": 9.876318403366371e-06, |
|
"loss": 0.0785, |
|
"reward": 1.0937500335276127, |
|
"reward_std": 0.4783368781208992, |
|
"rewards/accuracy_reward": 0.2884114645421505, |
|
"rewards/format_reward": 0.8053385578095913, |
|
"step": 157 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.558303886925795, |
|
"grad_norm": 6.4948169343262245, |
|
"kl": 1.94287109375, |
|
"learning_rate": 9.752655727200051e-06, |
|
"loss": 0.0777, |
|
"reward": 1.040364608168602, |
|
"reward_std": 0.49424389004707336, |
|
"rewards/accuracy_reward": 0.25390625838190317, |
|
"rewards/format_reward": 0.7864583507180214, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5618374558303887, |
|
"grad_norm": 5.326307273819102, |
|
"kl": 2.5322265625, |
|
"learning_rate": 9.62903088907395e-06, |
|
"loss": 0.1013, |
|
"reward": 1.0703125298023224, |
|
"reward_std": 0.486019866541028, |
|
"rewards/accuracy_reward": 0.27669271547347307, |
|
"rewards/format_reward": 0.7936198078095913, |
|
"step": 159 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5653710247349824, |
|
"grad_norm": 5.794058798518192, |
|
"kl": 4.05859375, |
|
"learning_rate": 9.505462800772612e-06, |
|
"loss": 0.1624, |
|
"reward": 1.1087239980697632, |
|
"reward_std": 0.48937827721238136, |
|
"rewards/accuracy_reward": 0.3196614682674408, |
|
"rewards/format_reward": 0.7890625186264515, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.568904593639576, |
|
"grad_norm": 9.402836751388962, |
|
"kl": 5.33984375, |
|
"learning_rate": 9.381970365399162e-06, |
|
"loss": 0.2135, |
|
"reward": 0.9941406510770321, |
|
"reward_std": 0.522324126213789, |
|
"rewards/accuracy_reward": 0.263671881519258, |
|
"rewards/format_reward": 0.7304687723517418, |
|
"step": 161 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5724381625441696, |
|
"grad_norm": 4.195689922174244, |
|
"kl": 4.86328125, |
|
"learning_rate": 9.25857247448354e-06, |
|
"loss": 0.1944, |
|
"reward": 0.960937537252903, |
|
"reward_std": 0.5458177234977484, |
|
"rewards/accuracy_reward": 0.24739584140479565, |
|
"rewards/format_reward": 0.7135416939854622, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5759717314487632, |
|
"grad_norm": 2.8989498058377228, |
|
"kl": 4.87109375, |
|
"learning_rate": 9.135288005092546e-06, |
|
"loss": 0.1949, |
|
"reward": 1.0930989868938923, |
|
"reward_std": 0.5132731832563877, |
|
"rewards/accuracy_reward": 0.29947918001562357, |
|
"rewards/format_reward": 0.7936198115348816, |
|
"step": 163 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5795053003533569, |
|
"grad_norm": 3.0006286232984234, |
|
"kl": 5.009765625, |
|
"learning_rate": 9.012135816942058e-06, |
|
"loss": 0.2003, |
|
"reward": 1.0377604514360428, |
|
"reward_std": 0.47819276340305805, |
|
"rewards/accuracy_reward": 0.24283854709938169, |
|
"rewards/format_reward": 0.7949218899011612, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5830388692579506, |
|
"grad_norm": 4.6993358160287615, |
|
"kl": 5.35546875, |
|
"learning_rate": 8.889134749511956e-06, |
|
"loss": 0.2143, |
|
"reward": 1.0800781473517418, |
|
"reward_std": 0.5242850538343191, |
|
"rewards/accuracy_reward": 0.281901054084301, |
|
"rewards/format_reward": 0.798177108168602, |
|
"step": 165 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5865724381625441, |
|
"grad_norm": 6.4731877967945355, |
|
"kl": 6.76953125, |
|
"learning_rate": 8.76630361916407e-06, |
|
"loss": 0.2708, |
|
"reward": 1.0436198264360428, |
|
"reward_std": 0.4580067917704582, |
|
"rewards/accuracy_reward": 0.24804688291624188, |
|
"rewards/format_reward": 0.7955729365348816, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5901060070671378, |
|
"grad_norm": 9.638355564108144, |
|
"kl": 8.08203125, |
|
"learning_rate": 8.643661216263744e-06, |
|
"loss": 0.3235, |
|
"reward": 1.0696614980697632, |
|
"reward_std": 0.47720608301460743, |
|
"rewards/accuracy_reward": 0.254557297565043, |
|
"rewards/format_reward": 0.8151041939854622, |
|
"step": 167 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5936395759717314, |
|
"grad_norm": 5.425360658521879, |
|
"kl": 7.126953125, |
|
"learning_rate": 8.52122630230531e-06, |
|
"loss": 0.2852, |
|
"reward": 1.0677083730697632, |
|
"reward_std": 0.4944526255130768, |
|
"rewards/accuracy_reward": 0.2669270886108279, |
|
"rewards/format_reward": 0.8007812649011612, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.5971731448763251, |
|
"grad_norm": 2.6727398917140324, |
|
"kl": 5.6484375, |
|
"learning_rate": 8.399017607042025e-06, |
|
"loss": 0.2259, |
|
"reward": 1.0963541865348816, |
|
"reward_std": 0.4828463848680258, |
|
"rewards/accuracy_reward": 0.28645834140479565, |
|
"rewards/format_reward": 0.8098958544433117, |
|
"step": 169 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6007067137809188, |
|
"grad_norm": 3.0934284767601503, |
|
"kl": 4.96484375, |
|
"learning_rate": 8.277053825620836e-06, |
|
"loss": 0.1987, |
|
"reward": 1.0039062909781933, |
|
"reward_std": 0.47822842188179493, |
|
"rewards/accuracy_reward": 0.22005208861082792, |
|
"rewards/format_reward": 0.7838541865348816, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6042402826855123, |
|
"grad_norm": 1.415026805369631, |
|
"kl": 4.71875, |
|
"learning_rate": 8.155353615722442e-06, |
|
"loss": 0.1887, |
|
"reward": 1.0240885764360428, |
|
"reward_std": 0.5142606012523174, |
|
"rewards/accuracy_reward": 0.24283855129033327, |
|
"rewards/format_reward": 0.7812500149011612, |
|
"step": 171 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.607773851590106, |
|
"grad_norm": 3.671045541857197, |
|
"kl": 4.7958984375, |
|
"learning_rate": 8.033935594707116e-06, |
|
"loss": 0.1918, |
|
"reward": 0.9915364906191826, |
|
"reward_std": 0.5378254223614931, |
|
"rewards/accuracy_reward": 0.2363281308207661, |
|
"rewards/format_reward": 0.7552083544433117, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6113074204946997, |
|
"grad_norm": 4.829747105605246, |
|
"kl": 4.9267578125, |
|
"learning_rate": 7.91281833676665e-06, |
|
"loss": 0.1968, |
|
"reward": 1.0175781548023224, |
|
"reward_std": 0.533087344840169, |
|
"rewards/accuracy_reward": 0.262369797565043, |
|
"rewards/format_reward": 0.7552083544433117, |
|
"step": 173 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6148409893992933, |
|
"grad_norm": 4.133358091951167, |
|
"kl": 5.173828125, |
|
"learning_rate": 7.79202037008294e-06, |
|
"loss": 0.2072, |
|
"reward": 0.9824219010770321, |
|
"reward_std": 0.5031039249151945, |
|
"rewards/accuracy_reward": 0.23111979756504297, |
|
"rewards/format_reward": 0.7513021044433117, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6183745583038869, |
|
"grad_norm": 1.7387249238999762, |
|
"kl": 5.0078125, |
|
"learning_rate": 7.671560173993588e-06, |
|
"loss": 0.2003, |
|
"reward": 0.956380233168602, |
|
"reward_std": 0.5033866986632347, |
|
"rewards/accuracy_reward": 0.22135417256504297, |
|
"rewards/format_reward": 0.7350260652601719, |
|
"step": 175 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6219081272084805, |
|
"grad_norm": 2.172215905238513, |
|
"kl": 4.857421875, |
|
"learning_rate": 7.551456176164989e-06, |
|
"loss": 0.1943, |
|
"reward": 0.9941406548023224, |
|
"reward_std": 0.526677755638957, |
|
"rewards/accuracy_reward": 0.24414063151925802, |
|
"rewards/format_reward": 0.7500000223517418, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6254416961130742, |
|
"grad_norm": 0.8842413004407766, |
|
"kl": 5.447265625, |
|
"learning_rate": 7.431726749773322e-06, |
|
"loss": 0.2178, |
|
"reward": 0.9596354402601719, |
|
"reward_std": 0.5413137227296829, |
|
"rewards/accuracy_reward": 0.2278645895421505, |
|
"rewards/format_reward": 0.7317708507180214, |
|
"step": 177 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6289752650176679, |
|
"grad_norm": 0.7402618851617847, |
|
"kl": 5.388671875, |
|
"learning_rate": 7.312390210693863e-06, |
|
"loss": 0.2156, |
|
"reward": 0.9811198189854622, |
|
"reward_std": 0.5133567694574594, |
|
"rewards/accuracy_reward": 0.23242188384756446, |
|
"rewards/format_reward": 0.7486979402601719, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6325088339222615, |
|
"grad_norm": 3.560567058348375, |
|
"kl": 5.2578125, |
|
"learning_rate": 7.193464814699073e-06, |
|
"loss": 0.2104, |
|
"reward": 0.9238281324505806, |
|
"reward_std": 0.5353942643851042, |
|
"rewards/accuracy_reward": 0.20507813151925802, |
|
"rewards/format_reward": 0.7187500223517418, |
|
"step": 179 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6360424028268551, |
|
"grad_norm": 2.753064271775204, |
|
"kl": 5.181640625, |
|
"learning_rate": 7.07496875466589e-06, |
|
"loss": 0.2074, |
|
"reward": 0.8417968861758709, |
|
"reward_std": 0.5037534404546022, |
|
"rewards/accuracy_reward": 0.1321614630287513, |
|
"rewards/format_reward": 0.7096354328095913, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6395759717314488, |
|
"grad_norm": 3.87667763131927, |
|
"kl": 6.107421875, |
|
"learning_rate": 6.9569201577926395e-06, |
|
"loss": 0.2442, |
|
"reward": 0.8977864868938923, |
|
"reward_std": 0.5129530522972345, |
|
"rewards/accuracy_reward": 0.18424479803070426, |
|
"rewards/format_reward": 0.7135416939854622, |
|
"step": 181 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6431095406360424, |
|
"grad_norm": 3.8854100556266813, |
|
"kl": 6.060546875, |
|
"learning_rate": 6.839337082825954e-06, |
|
"loss": 0.2426, |
|
"reward": 0.8893229365348816, |
|
"reward_std": 0.4821504820138216, |
|
"rewards/accuracy_reward": 0.15755208861082792, |
|
"rewards/format_reward": 0.7317708469927311, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6466431095406361, |
|
"grad_norm": 1.2484444905977732, |
|
"kl": 5.591796875, |
|
"learning_rate": 6.722237517298232e-06, |
|
"loss": 0.2238, |
|
"reward": 0.958984412252903, |
|
"reward_std": 0.48374492302536964, |
|
"rewards/accuracy_reward": 0.1861979211680591, |
|
"rewards/format_reward": 0.7727864794433117, |
|
"step": 183 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6501766784452296, |
|
"grad_norm": 2.6893650368389546, |
|
"kl": 5.880859375, |
|
"learning_rate": 6.605639374775934e-06, |
|
"loss": 0.2352, |
|
"reward": 0.9785156510770321, |
|
"reward_std": 0.4921079948544502, |
|
"rewards/accuracy_reward": 0.2057291716337204, |
|
"rewards/format_reward": 0.7727864757180214, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6537102473498233, |
|
"grad_norm": 2.3442839100460233, |
|
"kl": 6.041015625, |
|
"learning_rate": 6.489560492119225e-06, |
|
"loss": 0.2416, |
|
"reward": 0.9583333693444729, |
|
"reward_std": 0.4440547488629818, |
|
"rewards/accuracy_reward": 0.1692708395421505, |
|
"rewards/format_reward": 0.7890625223517418, |
|
"step": 185 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.657243816254417, |
|
"grad_norm": 2.1253880553627047, |
|
"kl": 6.69921875, |
|
"learning_rate": 6.374018626753331e-06, |
|
"loss": 0.268, |
|
"reward": 0.9765625223517418, |
|
"reward_std": 0.4347160626202822, |
|
"rewards/accuracy_reward": 0.17838542256504297, |
|
"rewards/format_reward": 0.7981770969927311, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6607773851590106, |
|
"grad_norm": 1.0194540597887043, |
|
"kl": 6.712890625, |
|
"learning_rate": 6.2590314539520695e-06, |
|
"loss": 0.2686, |
|
"reward": 0.9921875335276127, |
|
"reward_std": 0.44406831078231335, |
|
"rewards/accuracy_reward": 0.2005208395421505, |
|
"rewards/format_reward": 0.7916666828095913, |
|
"step": 187 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6643109540636042, |
|
"grad_norm": 0.5545671762601952, |
|
"kl": 6.857421875, |
|
"learning_rate": 6.144616564133927e-06, |
|
"loss": 0.2743, |
|
"reward": 1.023437526077032, |
|
"reward_std": 0.4340708777308464, |
|
"rewards/accuracy_reward": 0.19401042442768812, |
|
"rewards/format_reward": 0.829427108168602, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6678445229681979, |
|
"grad_norm": 0.3583901424418257, |
|
"kl": 6.94140625, |
|
"learning_rate": 6.03079146017113e-06, |
|
"loss": 0.2778, |
|
"reward": 1.0410156548023224, |
|
"reward_std": 0.4304163958877325, |
|
"rewards/accuracy_reward": 0.21354167070239782, |
|
"rewards/format_reward": 0.8274739794433117, |
|
"step": 189 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6713780918727915, |
|
"grad_norm": 0.17755970310205513, |
|
"kl": 7.12890625, |
|
"learning_rate": 5.9175735547120975e-06, |
|
"loss": 0.2852, |
|
"reward": 1.0572916865348816, |
|
"reward_std": 0.4284838940948248, |
|
"rewards/accuracy_reward": 0.20963542349636555, |
|
"rewards/format_reward": 0.8476562723517418, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6749116607773852, |
|
"grad_norm": 0.16047119425020795, |
|
"kl": 7.486328125, |
|
"learning_rate": 5.804980167517712e-06, |
|
"loss": 0.2995, |
|
"reward": 1.0820313021540642, |
|
"reward_std": 0.3956974996253848, |
|
"rewards/accuracy_reward": 0.22395833861082792, |
|
"rewards/format_reward": 0.8580729365348816, |
|
"step": 191 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6784452296819788, |
|
"grad_norm": 0.18607515894747625, |
|
"kl": 7.34765625, |
|
"learning_rate": 5.693028522811783e-06, |
|
"loss": 0.2937, |
|
"reward": 1.1145833805203438, |
|
"reward_std": 0.42582329362630844, |
|
"rewards/accuracy_reward": 0.25911459047347307, |
|
"rewards/format_reward": 0.8554687798023224, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.6819787985865724, |
|
"grad_norm": 0.12713756611493196, |
|
"kl": 7.560546875, |
|
"learning_rate": 5.581735746646134e-06, |
|
"loss": 0.3023, |
|
"reward": 1.1165364943444729, |
|
"reward_std": 0.4042051937431097, |
|
"rewards/accuracy_reward": 0.24609375651925802, |
|
"rewards/format_reward": 0.8704427368938923, |
|
"step": 193 |
|
}, |
|
{ |
|
"completion_length": 1006.2291679382324, |
|
"epoch": 0.6855123674911661, |
|
"grad_norm": 0.17261260884206758, |
|
"kl": 7.578125, |
|
"learning_rate": 5.471118864280716e-06, |
|
"loss": 0.3027, |
|
"reward": 1.1165365055203438, |
|
"reward_std": 0.4118177331984043, |
|
"rewards/accuracy_reward": 0.2513020895421505, |
|
"rewards/format_reward": 0.8652343973517418, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 1009.5208358764648, |
|
"epoch": 0.6890459363957597, |
|
"grad_norm": 0.12753618037588546, |
|
"kl": 7.6796875, |
|
"learning_rate": 5.361194797579108e-06, |
|
"loss": 0.3073, |
|
"reward": 1.0963542014360428, |
|
"reward_std": 0.38153328374028206, |
|
"rewards/accuracy_reward": 0.22330729849636555, |
|
"rewards/format_reward": 0.8730468936264515, |
|
"step": 195 |
|
}, |
|
{ |
|
"completion_length": 992.2708358764648, |
|
"epoch": 0.6925795053003534, |
|
"grad_norm": 0.1939287572168999, |
|
"kl": 7.9375, |
|
"learning_rate": 5.2519803624198865e-06, |
|
"loss": 0.3175, |
|
"reward": 1.1438802480697632, |
|
"reward_std": 0.36042055673897266, |
|
"rewards/accuracy_reward": 0.2545572994276881, |
|
"rewards/format_reward": 0.8893229365348816, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 1005.8333358764648, |
|
"epoch": 0.696113074204947, |
|
"grad_norm": 1.6055164563240731, |
|
"kl": 7.822265625, |
|
"learning_rate": 5.143492266124164e-06, |
|
"loss": 0.313, |
|
"reward": 1.0944010764360428, |
|
"reward_std": 0.3512597717344761, |
|
"rewards/accuracy_reward": 0.21289063151925802, |
|
"rewards/format_reward": 0.8815104328095913, |
|
"step": 197 |
|
}, |
|
{ |
|
"completion_length": 976.1875038146973, |
|
"epoch": 0.6996466431095406, |
|
"grad_norm": 0.15206731503411422, |
|
"kl": 7.78125, |
|
"learning_rate": 5.035747104899738e-06, |
|
"loss": 0.3114, |
|
"reward": 1.0540364906191826, |
|
"reward_std": 0.35300159733742476, |
|
"rewards/accuracy_reward": 0.18880208674818277, |
|
"rewards/format_reward": 0.8652343899011612, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 983.6250038146973, |
|
"epoch": 0.7031802120141343, |
|
"grad_norm": 0.18996755943687274, |
|
"kl": 7.857421875, |
|
"learning_rate": 4.928761361302269e-06, |
|
"loss": 0.3144, |
|
"reward": 1.1308594197034836, |
|
"reward_std": 0.37638773024082184, |
|
"rewards/accuracy_reward": 0.24869792396202683, |
|
"rewards/format_reward": 0.8821614794433117, |
|
"step": 199 |
|
}, |
|
{ |
|
"completion_length": 987.5208358764648, |
|
"epoch": 0.7067137809187279, |
|
"grad_norm": 0.19405956495375348, |
|
"kl": 7.912109375, |
|
"learning_rate": 4.8225514017138205e-06, |
|
"loss": 0.3164, |
|
"reward": 1.096354190260172, |
|
"reward_std": 0.34191144444048405, |
|
"rewards/accuracy_reward": 0.20638021361082792, |
|
"rewards/format_reward": 0.8899739757180214, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 1008.8958358764648, |
|
"epoch": 0.7102473498233216, |
|
"grad_norm": 0.1978005231935579, |
|
"kl": 7.767578125, |
|
"learning_rate": 4.717133473839163e-06, |
|
"loss": 0.3108, |
|
"reward": 1.1432292237877846, |
|
"reward_std": 0.3862752038985491, |
|
"rewards/accuracy_reward": 0.2701823003590107, |
|
"rewards/format_reward": 0.8730468973517418, |
|
"step": 201 |
|
}, |
|
{ |
|
"completion_length": 1009.25, |
|
"epoch": 0.7137809187279152, |
|
"grad_norm": 0.1284484089519714, |
|
"kl": 7.6015625, |
|
"learning_rate": 4.612523704220264e-06, |
|
"loss": 0.3041, |
|
"reward": 1.115234412252903, |
|
"reward_std": 0.42590648494660854, |
|
"rewards/accuracy_reward": 0.26562500838190317, |
|
"rewards/format_reward": 0.8496093973517418, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7173144876325088, |
|
"grad_norm": 0.14352585595949552, |
|
"kl": 7.435546875, |
|
"learning_rate": 4.508738095769278e-06, |
|
"loss": 0.2974, |
|
"reward": 1.061848983168602, |
|
"reward_std": 0.41079509258270264, |
|
"rewards/accuracy_reward": 0.2265625074505806, |
|
"rewards/format_reward": 0.8352864757180214, |
|
"step": 203 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7208480565371025, |
|
"grad_norm": 0.1521889190457422, |
|
"kl": 7.193359375, |
|
"learning_rate": 4.405792525320469e-06, |
|
"loss": 0.2877, |
|
"reward": 1.0279948264360428, |
|
"reward_std": 0.424892058596015, |
|
"rewards/accuracy_reward": 0.2037760482635349, |
|
"rewards/format_reward": 0.8242187686264515, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7243816254416962, |
|
"grad_norm": 0.3506188463952097, |
|
"kl": 7.2890625, |
|
"learning_rate": 4.303702741201431e-06, |
|
"loss": 0.2915, |
|
"reward": 1.059895858168602, |
|
"reward_std": 0.42388765700161457, |
|
"rewards/accuracy_reward": 0.2473958432674408, |
|
"rewards/format_reward": 0.8125000186264515, |
|
"step": 205 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7279151943462897, |
|
"grad_norm": 0.1677757492998703, |
|
"kl": 6.9921875, |
|
"learning_rate": 4.202484360823926e-06, |
|
"loss": 0.2797, |
|
"reward": 1.0266927368938923, |
|
"reward_std": 0.45705331675708294, |
|
"rewards/accuracy_reward": 0.23502604942768812, |
|
"rewards/format_reward": 0.791666679084301, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 1011.9791679382324, |
|
"epoch": 0.7314487632508834, |
|
"grad_norm": 0.1517309736158255, |
|
"kl": 6.935546875, |
|
"learning_rate": 4.1021528682948064e-06, |
|
"loss": 0.2774, |
|
"reward": 1.049479216337204, |
|
"reward_std": 0.46575335413217545, |
|
"rewards/accuracy_reward": 0.2526041716337204, |
|
"rewards/format_reward": 0.7968750149011612, |
|
"step": 207 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.734982332155477, |
|
"grad_norm": 0.16127641070568163, |
|
"kl": 7.111328125, |
|
"learning_rate": 4.002723612047272e-06, |
|
"loss": 0.2847, |
|
"reward": 1.085286483168602, |
|
"reward_std": 0.44572209380567074, |
|
"rewards/accuracy_reward": 0.26302084419876337, |
|
"rewards/format_reward": 0.8222656436264515, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 1009.3125, |
|
"epoch": 0.7385159010600707, |
|
"grad_norm": 0.13871720330015053, |
|
"kl": 7.23828125, |
|
"learning_rate": 3.904211802492922e-06, |
|
"loss": 0.2897, |
|
"reward": 1.100260455161333, |
|
"reward_std": 0.4249584712088108, |
|
"rewards/accuracy_reward": 0.26953125558793545, |
|
"rewards/format_reward": 0.830729179084301, |
|
"step": 209 |
|
}, |
|
{ |
|
"completion_length": 1014.8125, |
|
"epoch": 0.7420494699646644, |
|
"grad_norm": 0.13865406061603536, |
|
"kl": 7.302734375, |
|
"learning_rate": 3.8066325096949153e-06, |
|
"loss": 0.2922, |
|
"reward": 1.1406250447034836, |
|
"reward_std": 0.41488252952694893, |
|
"rewards/accuracy_reward": 0.2812500074505806, |
|
"rewards/format_reward": 0.8593750223517418, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7455830388692579, |
|
"grad_norm": 0.12592871455753224, |
|
"kl": 7.369140625, |
|
"learning_rate": 3.710000661062578e-06, |
|
"loss": 0.2948, |
|
"reward": 1.1796875484287739, |
|
"reward_std": 0.41464217752218246, |
|
"rewards/accuracy_reward": 0.313151047565043, |
|
"rewards/format_reward": 0.8665364794433117, |
|
"step": 211 |
|
}, |
|
{ |
|
"completion_length": 995.2291679382324, |
|
"epoch": 0.7491166077738516, |
|
"grad_norm": 0.1393901308152561, |
|
"kl": 7.501953125, |
|
"learning_rate": 3.6143310390678544e-06, |
|
"loss": 0.3001, |
|
"reward": 1.1647135838866234, |
|
"reward_std": 0.3797377645969391, |
|
"rewards/accuracy_reward": 0.2838541753590107, |
|
"rewards/format_reward": 0.8808593899011612, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 1010.125, |
|
"epoch": 0.7526501766784452, |
|
"grad_norm": 0.14034512809285338, |
|
"kl": 7.31640625, |
|
"learning_rate": 3.5196382789839477e-06, |
|
"loss": 0.2926, |
|
"reward": 1.1549479514360428, |
|
"reward_std": 0.39015408605337143, |
|
"rewards/accuracy_reward": 0.27994792629033327, |
|
"rewards/format_reward": 0.8750000223517418, |
|
"step": 213 |
|
}, |
|
{ |
|
"completion_length": 1011.875, |
|
"epoch": 0.7561837455830389, |
|
"grad_norm": 0.14989504739682083, |
|
"kl": 7.28125, |
|
"learning_rate": 3.425936866646419e-06, |
|
"loss": 0.2911, |
|
"reward": 1.1614583656191826, |
|
"reward_std": 0.39541246369481087, |
|
"rewards/accuracy_reward": 0.292968756519258, |
|
"rewards/format_reward": 0.8684896007180214, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7597173144876325, |
|
"grad_norm": 0.13609166349614024, |
|
"kl": 6.982421875, |
|
"learning_rate": 3.3332411362372063e-06, |
|
"loss": 0.2793, |
|
"reward": 1.1263021379709244, |
|
"reward_std": 0.44077819399535656, |
|
"rewards/accuracy_reward": 0.29101563384756446, |
|
"rewards/format_reward": 0.8352864794433117, |
|
"step": 215 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7632508833922261, |
|
"grad_norm": 0.15918888254694777, |
|
"kl": 7.1640625, |
|
"learning_rate": 3.2415652680918262e-06, |
|
"loss": 0.2865, |
|
"reward": 1.1223958879709244, |
|
"reward_std": 0.41186920180916786, |
|
"rewards/accuracy_reward": 0.28190105222165585, |
|
"rewards/format_reward": 0.8404948152601719, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 1008.0416679382324, |
|
"epoch": 0.7667844522968198, |
|
"grad_norm": 0.12370384010350725, |
|
"kl": 7.244140625, |
|
"learning_rate": 3.1509232865300886e-06, |
|
"loss": 0.2899, |
|
"reward": 1.1601563058793545, |
|
"reward_std": 0.38652253709733486, |
|
"rewards/accuracy_reward": 0.3059895886108279, |
|
"rewards/format_reward": 0.8541666939854622, |
|
"step": 217 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7703180212014135, |
|
"grad_norm": 0.20461489596469734, |
|
"kl": 7.2421875, |
|
"learning_rate": 3.061329057710711e-06, |
|
"loss": 0.2898, |
|
"reward": 1.072916705161333, |
|
"reward_std": 0.4447946548461914, |
|
"rewards/accuracy_reward": 0.2571614645421505, |
|
"rewards/format_reward": 0.8157552294433117, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.773851590106007, |
|
"grad_norm": 0.1804867199582671, |
|
"kl": 7.388671875, |
|
"learning_rate": 2.9727962875101e-06, |
|
"loss": 0.2956, |
|
"reward": 1.1334635727107525, |
|
"reward_std": 0.39954448491334915, |
|
"rewards/accuracy_reward": 0.27929688338190317, |
|
"rewards/format_reward": 0.8541666828095913, |
|
"step": 219 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7773851590106007, |
|
"grad_norm": 0.16932493071108376, |
|
"kl": 7.197265625, |
|
"learning_rate": 2.8853385194256677e-06, |
|
"loss": 0.2879, |
|
"reward": 1.1041667014360428, |
|
"reward_std": 0.4760838821530342, |
|
"rewards/accuracy_reward": 0.28125000884756446, |
|
"rewards/format_reward": 0.8229166865348816, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7809187279151943, |
|
"grad_norm": 0.15687360626369415, |
|
"kl": 7.0703125, |
|
"learning_rate": 2.798969132503997e-06, |
|
"loss": 0.283, |
|
"reward": 1.0423177294433117, |
|
"reward_std": 0.44260338321328163, |
|
"rewards/accuracy_reward": 0.2304687574505806, |
|
"rewards/format_reward": 0.8118489794433117, |
|
"step": 221 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.784452296819788, |
|
"grad_norm": 0.17190503748389882, |
|
"kl": 7.171875, |
|
"learning_rate": 2.713701339294129e-06, |
|
"loss": 0.2869, |
|
"reward": 1.1419271193444729, |
|
"reward_std": 0.4103549234569073, |
|
"rewards/accuracy_reward": 0.301432297565043, |
|
"rewards/format_reward": 0.8404948115348816, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7879858657243817, |
|
"grad_norm": 0.14054774738185286, |
|
"kl": 7.33203125, |
|
"learning_rate": 2.6295481838263628e-06, |
|
"loss": 0.2932, |
|
"reward": 1.0891927443444729, |
|
"reward_std": 0.415512815117836, |
|
"rewards/accuracy_reward": 0.24479167256504297, |
|
"rewards/format_reward": 0.844401054084301, |
|
"step": 223 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7915194346289752, |
|
"grad_norm": 0.1087364190858895, |
|
"kl": 7.130859375, |
|
"learning_rate": 2.5465225396168134e-06, |
|
"loss": 0.2853, |
|
"reward": 1.1236979588866234, |
|
"reward_std": 0.44147299975156784, |
|
"rewards/accuracy_reward": 0.285807297565043, |
|
"rewards/format_reward": 0.8378906473517418, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 1008.2916679382324, |
|
"epoch": 0.7950530035335689, |
|
"grad_norm": 0.11246121980520436, |
|
"kl": 7.4765625, |
|
"learning_rate": 2.464637107698046e-06, |
|
"loss": 0.2994, |
|
"reward": 1.1829427480697632, |
|
"reward_std": 0.37431807816028595, |
|
"rewards/accuracy_reward": 0.3046875074505806, |
|
"rewards/format_reward": 0.8782552294433117, |
|
"step": 225 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.7985865724381626, |
|
"grad_norm": 0.1112131457977264, |
|
"kl": 7.265625, |
|
"learning_rate": 2.3839044146761227e-06, |
|
"loss": 0.2907, |
|
"reward": 1.164713591337204, |
|
"reward_std": 0.3692896058782935, |
|
"rewards/accuracy_reward": 0.2890625037252903, |
|
"rewards/format_reward": 0.8756510689854622, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.8021201413427562, |
|
"grad_norm": 0.21800229184914455, |
|
"kl": 7.462890625, |
|
"learning_rate": 2.304336810814305e-06, |
|
"loss": 0.2983, |
|
"reward": 1.1809896230697632, |
|
"reward_std": 0.3522001476958394, |
|
"rewards/accuracy_reward": 0.29427084140479565, |
|
"rewards/format_reward": 0.8867187723517418, |
|
"step": 227 |
|
}, |
|
{ |
|
"completion_length": 1007.1458358764648, |
|
"epoch": 0.8056537102473498, |
|
"grad_norm": 0.1666037699534655, |
|
"kl": 7.740234375, |
|
"learning_rate": 2.2259464681437404e-06, |
|
"loss": 0.3096, |
|
"reward": 1.2037760689854622, |
|
"reward_std": 0.3384133204817772, |
|
"rewards/accuracy_reward": 0.30338542349636555, |
|
"rewards/format_reward": 0.9003906436264515, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.8091872791519434, |
|
"grad_norm": 0.17750163436679095, |
|
"kl": 7.51953125, |
|
"learning_rate": 2.1487453786014513e-06, |
|
"loss": 0.301, |
|
"reward": 1.1744792126119137, |
|
"reward_std": 0.4062032885849476, |
|
"rewards/accuracy_reward": 0.3001302173361182, |
|
"rewards/format_reward": 0.8743489794433117, |
|
"step": 229 |
|
}, |
|
{ |
|
"completion_length": 1008.1875, |
|
"epoch": 0.8127208480565371, |
|
"grad_norm": 0.15247459978463207, |
|
"kl": 7.580078125, |
|
"learning_rate": 2.072745352195794e-06, |
|
"loss": 0.303, |
|
"reward": 1.1803385764360428, |
|
"reward_std": 0.37856387067586184, |
|
"rewards/accuracy_reward": 0.29361979849636555, |
|
"rewards/format_reward": 0.8867187686264515, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 1011.2916679382324, |
|
"epoch": 0.8162544169611308, |
|
"grad_norm": 0.18019373306879702, |
|
"kl": 7.603515625, |
|
"learning_rate": 1.997958015199829e-06, |
|
"loss": 0.304, |
|
"reward": 1.1835937798023224, |
|
"reward_std": 0.37871948070824146, |
|
"rewards/accuracy_reward": 0.293619797565043, |
|
"rewards/format_reward": 0.8899739719927311, |
|
"step": 231 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.8197879858657244, |
|
"grad_norm": 0.22583268385399752, |
|
"kl": 7.529296875, |
|
"learning_rate": 1.9243948083727626e-06, |
|
"loss": 0.3012, |
|
"reward": 1.118489608168602, |
|
"reward_std": 0.37204239144921303, |
|
"rewards/accuracy_reward": 0.25000000512227416, |
|
"rewards/format_reward": 0.8684896044433117, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.823321554770318, |
|
"grad_norm": 0.3443852485581638, |
|
"kl": 7.34375, |
|
"learning_rate": 1.8520669852097573e-06, |
|
"loss": 0.2938, |
|
"reward": 1.1399739980697632, |
|
"reward_std": 0.40698738768696785, |
|
"rewards/accuracy_reward": 0.2864583395421505, |
|
"rewards/format_reward": 0.8535156473517418, |
|
"step": 233 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.8268551236749117, |
|
"grad_norm": 0.2918520900684292, |
|
"kl": 7.419921875, |
|
"learning_rate": 1.7809856102204148e-06, |
|
"loss": 0.2967, |
|
"reward": 1.1516927555203438, |
|
"reward_std": 0.3947129677981138, |
|
"rewards/accuracy_reward": 0.30664063431322575, |
|
"rewards/format_reward": 0.8450520969927311, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 993.1250038146973, |
|
"epoch": 0.8303886925795053, |
|
"grad_norm": 0.2326957931992313, |
|
"kl": 7.501953125, |
|
"learning_rate": 1.7111615572361628e-06, |
|
"loss": 0.3001, |
|
"reward": 1.1100260615348816, |
|
"reward_std": 0.4124826304614544, |
|
"rewards/accuracy_reward": 0.2643229244276881, |
|
"rewards/format_reward": 0.8457031473517418, |
|
"step": 235 |
|
}, |
|
{ |
|
"completion_length": 1008.4791679382324, |
|
"epoch": 0.833922261484099, |
|
"grad_norm": 0.20186154524969585, |
|
"kl": 7.302734375, |
|
"learning_rate": 1.642605507746786e-06, |
|
"loss": 0.2922, |
|
"reward": 1.0924479439854622, |
|
"reward_std": 0.4381315726786852, |
|
"rewards/accuracy_reward": 0.2623697970993817, |
|
"rewards/format_reward": 0.8300781436264515, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 1012.5625, |
|
"epoch": 0.8374558303886925, |
|
"grad_norm": 0.12307486347767096, |
|
"kl": 7.609375, |
|
"learning_rate": 1.5753279492664264e-06, |
|
"loss": 0.3044, |
|
"reward": 1.0917969197034836, |
|
"reward_std": 0.37845473177731037, |
|
"rewards/accuracy_reward": 0.23046875465661287, |
|
"rewards/format_reward": 0.8613281436264515, |
|
"step": 237 |
|
}, |
|
{ |
|
"completion_length": 996.0000038146973, |
|
"epoch": 0.8409893992932862, |
|
"grad_norm": 0.12490510412366805, |
|
"kl": 7.5703125, |
|
"learning_rate": 1.509339173729214e-06, |
|
"loss": 0.3028, |
|
"reward": 1.1119791939854622, |
|
"reward_std": 0.37066210247576237, |
|
"rewards/accuracy_reward": 0.25195313338190317, |
|
"rewards/format_reward": 0.8600260652601719, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 1013.7708358764648, |
|
"epoch": 0.8445229681978799, |
|
"grad_norm": 0.13493904378820848, |
|
"kl": 7.599609375, |
|
"learning_rate": 1.4446492759148411e-06, |
|
"loss": 0.3039, |
|
"reward": 1.1028646193444729, |
|
"reward_std": 0.36637131590396166, |
|
"rewards/accuracy_reward": 0.24934896640479565, |
|
"rewards/format_reward": 0.8535156436264515, |
|
"step": 239 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.8480565371024735, |
|
"grad_norm": 0.15035989381249287, |
|
"kl": 7.650390625, |
|
"learning_rate": 1.381268151904298e-06, |
|
"loss": 0.3059, |
|
"reward": 1.1451823264360428, |
|
"reward_std": 0.3553981352597475, |
|
"rewards/accuracy_reward": 0.25781250931322575, |
|
"rewards/format_reward": 0.8873698078095913, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.8515901060070671, |
|
"grad_norm": 0.22950348264138257, |
|
"kl": 7.703125, |
|
"learning_rate": 1.319205497565983e-06, |
|
"loss": 0.3085, |
|
"reward": 1.177083384245634, |
|
"reward_std": 0.36925146263092756, |
|
"rewards/accuracy_reward": 0.2929687611758709, |
|
"rewards/format_reward": 0.884114608168602, |
|
"step": 241 |
|
}, |
|
{ |
|
"completion_length": 1010.7916679382324, |
|
"epoch": 0.8551236749116607, |
|
"grad_norm": 0.16497755116206372, |
|
"kl": 7.603515625, |
|
"learning_rate": 1.2584708070724738e-06, |
|
"loss": 0.3041, |
|
"reward": 1.1829427406191826, |
|
"reward_std": 0.3738958667963743, |
|
"rewards/accuracy_reward": 0.2897135494276881, |
|
"rewards/format_reward": 0.8932291865348816, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 1008.9583358764648, |
|
"epoch": 0.8586572438162544, |
|
"grad_norm": 0.17585743920433772, |
|
"kl": 7.76171875, |
|
"learning_rate": 1.1990733714481185e-06, |
|
"loss": 0.3107, |
|
"reward": 1.1471354588866234, |
|
"reward_std": 0.3857318237423897, |
|
"rewards/accuracy_reward": 0.26888021547347307, |
|
"rewards/format_reward": 0.8782552294433117, |
|
"step": 243 |
|
}, |
|
{ |
|
"completion_length": 1009.5833358764648, |
|
"epoch": 0.8621908127208481, |
|
"grad_norm": 0.09937123864278254, |
|
"kl": 7.640625, |
|
"learning_rate": 1.1410222771477276e-06, |
|
"loss": 0.3056, |
|
"reward": 1.1360677555203438, |
|
"reward_std": 0.3525569401681423, |
|
"rewards/accuracy_reward": 0.24934896640479565, |
|
"rewards/format_reward": 0.8867187686264515, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.8657243816254417, |
|
"grad_norm": 0.23903667479086305, |
|
"kl": 7.603515625, |
|
"learning_rate": 1.0843264046665558e-06, |
|
"loss": 0.304, |
|
"reward": 1.1881510764360428, |
|
"reward_std": 0.3764577666297555, |
|
"rewards/accuracy_reward": 0.3027343852445483, |
|
"rewards/format_reward": 0.8854166865348816, |
|
"step": 245 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.8692579505300353, |
|
"grad_norm": 0.1464770828710901, |
|
"kl": 7.6328125, |
|
"learning_rate": 1.0289944271817898e-06, |
|
"loss": 0.3051, |
|
"reward": 1.1829427406191826, |
|
"reward_std": 0.36000128649175167, |
|
"rewards/accuracy_reward": 0.2910156324505806, |
|
"rewards/format_reward": 0.891927108168602, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 1011.8333358764648, |
|
"epoch": 0.872791519434629, |
|
"grad_norm": 0.12575527161822952, |
|
"kl": 7.70703125, |
|
"learning_rate": 9.750348092257368e-07, |
|
"loss": 0.3083, |
|
"reward": 1.2018229737877846, |
|
"reward_std": 0.33693454321473837, |
|
"rewards/accuracy_reward": 0.296875006519258, |
|
"rewards/format_reward": 0.9049479402601719, |
|
"step": 247 |
|
}, |
|
{ |
|
"completion_length": 1007.6041679382324, |
|
"epoch": 0.8763250883392226, |
|
"grad_norm": 0.2181922287172582, |
|
"kl": 7.560546875, |
|
"learning_rate": 9.224558053909615e-07, |
|
"loss": 0.3026, |
|
"reward": 1.1432291939854622, |
|
"reward_std": 0.3927531726658344, |
|
"rewards/accuracy_reward": 0.26562500931322575, |
|
"rewards/format_reward": 0.8776041902601719, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 1003.625, |
|
"epoch": 0.8798586572438163, |
|
"grad_norm": 0.1951601816286408, |
|
"kl": 7.806640625, |
|
"learning_rate": 8.712654590675085e-07, |
|
"loss": 0.3126, |
|
"reward": 1.1347656659781933, |
|
"reward_std": 0.3628583550453186, |
|
"rewards/accuracy_reward": 0.2552083423361182, |
|
"rewards/format_reward": 0.8795573152601719, |
|
"step": 249 |
|
}, |
|
{ |
|
"completion_length": 1017.8541679382324, |
|
"epoch": 0.8833922261484098, |
|
"grad_norm": 0.13270611252972092, |
|
"kl": 7.900390625, |
|
"learning_rate": 8.214716012124491e-07, |
|
"loss": 0.3162, |
|
"reward": 1.2278646156191826, |
|
"reward_std": 0.33511496149003506, |
|
"rewards/accuracy_reward": 0.3190104244276881, |
|
"rewards/format_reward": 0.9088541902601719, |
|
"step": 250 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.8869257950530035, |
|
"grad_norm": 0.16899318018961745, |
|
"kl": 7.703125, |
|
"learning_rate": 7.730818491519343e-07, |
|
"loss": 0.3083, |
|
"reward": 1.1673177480697632, |
|
"reward_std": 0.35251003317534924, |
|
"rewards/accuracy_reward": 0.28515625838190317, |
|
"rewards/format_reward": 0.8821614794433117, |
|
"step": 251 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.8904593639575972, |
|
"grad_norm": 0.19761065312810003, |
|
"kl": 7.53125, |
|
"learning_rate": 7.261036054158965e-07, |
|
"loss": 0.3013, |
|
"reward": 1.1621094308793545, |
|
"reward_std": 0.361306588165462, |
|
"rewards/accuracy_reward": 0.28515625558793545, |
|
"rewards/format_reward": 0.8769531473517418, |
|
"step": 252 |
|
}, |
|
{ |
|
"completion_length": 980.1666679382324, |
|
"epoch": 0.8939929328621908, |
|
"grad_norm": 0.12059278219677871, |
|
"kl": 7.86328125, |
|
"learning_rate": 6.805440566056554e-07, |
|
"loss": 0.3147, |
|
"reward": 1.1438802555203438, |
|
"reward_std": 0.3613898027688265, |
|
"rewards/accuracy_reward": 0.2636718824505806, |
|
"rewards/format_reward": 0.8802083507180214, |
|
"step": 253 |
|
}, |
|
{ |
|
"completion_length": 1011.1041679382324, |
|
"epoch": 0.8975265017667845, |
|
"grad_norm": 0.21668223679230778, |
|
"kl": 7.732421875, |
|
"learning_rate": 6.364101722945082e-07, |
|
"loss": 0.309, |
|
"reward": 1.144531287252903, |
|
"reward_std": 0.3460462633520365, |
|
"rewards/accuracy_reward": 0.2636718829162419, |
|
"rewards/format_reward": 0.8808594010770321, |
|
"step": 254 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.901060070671378, |
|
"grad_norm": 0.2561339153931696, |
|
"kl": 7.5546875, |
|
"learning_rate": 5.937087039615619e-07, |
|
"loss": 0.3021, |
|
"reward": 1.1035156734287739, |
|
"reward_std": 0.3481142967939377, |
|
"rewards/accuracy_reward": 0.23372396640479565, |
|
"rewards/format_reward": 0.8697916902601719, |
|
"step": 255 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.9045936395759717, |
|
"grad_norm": 0.1552920007884265, |
|
"kl": 7.521484375, |
|
"learning_rate": 5.524461839589012e-07, |
|
"loss": 0.3008, |
|
"reward": 1.1230469271540642, |
|
"reward_std": 0.36596967838704586, |
|
"rewards/accuracy_reward": 0.2526041688397527, |
|
"rewards/format_reward": 0.870442733168602, |
|
"step": 256 |
|
}, |
|
{ |
|
"completion_length": 1008.7708358764648, |
|
"epoch": 0.9081272084805654, |
|
"grad_norm": 0.183290959725611, |
|
"kl": 7.634765625, |
|
"learning_rate": 5.126289245122906e-07, |
|
"loss": 0.3054, |
|
"reward": 1.1549479588866234, |
|
"reward_std": 0.3895051181316376, |
|
"rewards/accuracy_reward": 0.27929688058793545, |
|
"rewards/format_reward": 0.8756510578095913, |
|
"step": 257 |
|
}, |
|
{ |
|
"completion_length": 1014.6875, |
|
"epoch": 0.911660777385159, |
|
"grad_norm": 0.14993091508284867, |
|
"kl": 7.6953125, |
|
"learning_rate": 4.7426301675554285e-07, |
|
"loss": 0.3077, |
|
"reward": 1.1523437835276127, |
|
"reward_std": 0.343139311298728, |
|
"rewards/accuracy_reward": 0.261718756519258, |
|
"rewards/format_reward": 0.8906250186264515, |
|
"step": 258 |
|
}, |
|
{ |
|
"completion_length": 1007.9791679382324, |
|
"epoch": 0.9151943462897526, |
|
"grad_norm": 0.1374641960302751, |
|
"kl": 7.583984375, |
|
"learning_rate": 4.3735432979872593e-07, |
|
"loss": 0.3032, |
|
"reward": 1.1640625447034836, |
|
"reward_std": 0.37700783647596836, |
|
"rewards/accuracy_reward": 0.27799479849636555, |
|
"rewards/format_reward": 0.8860677294433117, |
|
"step": 259 |
|
}, |
|
{ |
|
"completion_length": 1006.0, |
|
"epoch": 0.9187279151943463, |
|
"grad_norm": 0.17289466011678567, |
|
"kl": 7.734375, |
|
"learning_rate": 4.019085098303077e-07, |
|
"loss": 0.3092, |
|
"reward": 1.1796875298023224, |
|
"reward_std": 0.3705375073477626, |
|
"rewards/accuracy_reward": 0.3046875074505806, |
|
"rewards/format_reward": 0.8750000111758709, |
|
"step": 260 |
|
}, |
|
{ |
|
"completion_length": 1020.5625, |
|
"epoch": 0.9222614840989399, |
|
"grad_norm": 0.2504365893065849, |
|
"kl": 7.763671875, |
|
"learning_rate": 3.679309792534291e-07, |
|
"loss": 0.3107, |
|
"reward": 1.1927083730697632, |
|
"reward_std": 0.36883932538330555, |
|
"rewards/accuracy_reward": 0.30338542722165585, |
|
"rewards/format_reward": 0.8893229402601719, |
|
"step": 261 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.9257950530035336, |
|
"grad_norm": 0.15980023469329024, |
|
"kl": 7.708984375, |
|
"learning_rate": 3.354269358563966e-07, |
|
"loss": 0.3083, |
|
"reward": 1.1666666939854622, |
|
"reward_std": 0.3717129658907652, |
|
"rewards/accuracy_reward": 0.28125000838190317, |
|
"rewards/format_reward": 0.8854166828095913, |
|
"step": 262 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.9293286219081273, |
|
"grad_norm": 0.14673263922185112, |
|
"kl": 7.51171875, |
|
"learning_rate": 3.044013520175337e-07, |
|
"loss": 0.3004, |
|
"reward": 1.1158854588866234, |
|
"reward_std": 0.39931169617921114, |
|
"rewards/accuracy_reward": 0.2441406324505806, |
|
"rewards/format_reward": 0.8717448115348816, |
|
"step": 263 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.9328621908127208, |
|
"grad_norm": 0.21979625485038595, |
|
"kl": 7.701171875, |
|
"learning_rate": 2.7485897394453067e-07, |
|
"loss": 0.308, |
|
"reward": 1.1269531771540642, |
|
"reward_std": 0.3809357853606343, |
|
"rewards/accuracy_reward": 0.23763021733611822, |
|
"rewards/format_reward": 0.8893229328095913, |
|
"step": 264 |
|
}, |
|
{ |
|
"completion_length": 999.4583358764648, |
|
"epoch": 0.9363957597173145, |
|
"grad_norm": 0.1678262537397904, |
|
"kl": 7.7421875, |
|
"learning_rate": 2.4680432094837394e-07, |
|
"loss": 0.3099, |
|
"reward": 1.1510417014360428, |
|
"reward_std": 0.3365847198292613, |
|
"rewards/accuracy_reward": 0.2623697994276881, |
|
"rewards/format_reward": 0.8886718973517418, |
|
"step": 265 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.9399293286219081, |
|
"grad_norm": 0.17657969817520183, |
|
"kl": 7.6640625, |
|
"learning_rate": 2.2024168475199615e-07, |
|
"loss": 0.3068, |
|
"reward": 1.125000037252903, |
|
"reward_std": 0.3468599859625101, |
|
"rewards/accuracy_reward": 0.25195313058793545, |
|
"rewards/format_reward": 0.8730468899011612, |
|
"step": 266 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.9434628975265018, |
|
"grad_norm": 0.16463993549487405, |
|
"kl": 7.640625, |
|
"learning_rate": 1.9517512883374667e-07, |
|
"loss": 0.3057, |
|
"reward": 1.1464844048023224, |
|
"reward_std": 0.38465187326073647, |
|
"rewards/accuracy_reward": 0.26106771687045693, |
|
"rewards/format_reward": 0.8854166828095913, |
|
"step": 267 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.9469964664310954, |
|
"grad_norm": 0.1836439792441492, |
|
"kl": 7.57421875, |
|
"learning_rate": 1.7160848780576334e-07, |
|
"loss": 0.303, |
|
"reward": 1.1764323264360428, |
|
"reward_std": 0.39003968983888626, |
|
"rewards/accuracy_reward": 0.29296875838190317, |
|
"rewards/format_reward": 0.8834635689854622, |
|
"step": 268 |
|
}, |
|
{ |
|
"completion_length": 1003.2916679382324, |
|
"epoch": 0.950530035335689, |
|
"grad_norm": 0.20059017619611783, |
|
"kl": 7.5078125, |
|
"learning_rate": 1.495453668273672e-07, |
|
"loss": 0.3002, |
|
"reward": 1.149088565260172, |
|
"reward_std": 0.3729328028857708, |
|
"rewards/accuracy_reward": 0.27408855129033327, |
|
"rewards/format_reward": 0.8750000260770321, |
|
"step": 269 |
|
}, |
|
{ |
|
"completion_length": 1002.9375038146973, |
|
"epoch": 0.9540636042402827, |
|
"grad_norm": 0.1818614489805749, |
|
"kl": 7.669921875, |
|
"learning_rate": 1.289891410535593e-07, |
|
"loss": 0.3069, |
|
"reward": 1.1751302555203438, |
|
"reward_std": 0.3694411441683769, |
|
"rewards/accuracy_reward": 0.28190105129033327, |
|
"rewards/format_reward": 0.8932291865348816, |
|
"step": 270 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.9575971731448764, |
|
"grad_norm": 0.1417185938823282, |
|
"kl": 7.783203125, |
|
"learning_rate": 1.0994295511869257e-07, |
|
"loss": 0.3114, |
|
"reward": 1.164713580161333, |
|
"reward_std": 0.3704876583069563, |
|
"rewards/accuracy_reward": 0.28385417629033327, |
|
"rewards/format_reward": 0.8808593973517418, |
|
"step": 271 |
|
}, |
|
{ |
|
"completion_length": 1007.5833358764648, |
|
"epoch": 0.9611307420494699, |
|
"grad_norm": 0.1335404235268344, |
|
"kl": 7.65234375, |
|
"learning_rate": 9.240972265541992e-08, |
|
"loss": 0.3061, |
|
"reward": 1.136718787252903, |
|
"reward_std": 0.33945256378501654, |
|
"rewards/accuracy_reward": 0.25325521687045693, |
|
"rewards/format_reward": 0.8834635652601719, |
|
"step": 272 |
|
}, |
|
{ |
|
"completion_length": 995.3958358764648, |
|
"epoch": 0.9646643109540636, |
|
"grad_norm": 0.16627237105129372, |
|
"kl": 7.748046875, |
|
"learning_rate": 7.639212584897082e-08, |
|
"loss": 0.3097, |
|
"reward": 1.1458333767950535, |
|
"reward_std": 0.36959288641810417, |
|
"rewards/accuracy_reward": 0.2591145895421505, |
|
"rewards/format_reward": 0.8867187611758709, |
|
"step": 273 |
|
}, |
|
{ |
|
"completion_length": 977.8541717529297, |
|
"epoch": 0.9681978798586572, |
|
"grad_norm": 0.1442104843092151, |
|
"kl": 7.8046875, |
|
"learning_rate": 6.189261502683619e-08, |
|
"loss": 0.3121, |
|
"reward": 1.1608073338866234, |
|
"reward_std": 0.33501617051661015, |
|
"rewards/accuracy_reward": 0.26302084047347307, |
|
"rewards/format_reward": 0.897786483168602, |
|
"step": 274 |
|
}, |
|
{ |
|
"completion_length": 1005.8333358764648, |
|
"epoch": 0.9717314487632509, |
|
"grad_norm": 0.1335599188895197, |
|
"kl": 7.734375, |
|
"learning_rate": 4.8913408283934874e-08, |
|
"loss": 0.3094, |
|
"reward": 1.1829427480697632, |
|
"reward_std": 0.3483387678861618, |
|
"rewards/accuracy_reward": 0.2923177173361182, |
|
"rewards/format_reward": 0.8906250149011612, |
|
"step": 275 |
|
}, |
|
{ |
|
"completion_length": 986.5833358764648, |
|
"epoch": 0.9752650176678446, |
|
"grad_norm": 0.1566671460322668, |
|
"kl": 7.9140625, |
|
"learning_rate": 3.745649114328065e-08, |
|
"loss": 0.3167, |
|
"reward": 1.2102865017950535, |
|
"reward_std": 0.35581814870238304, |
|
"rewards/accuracy_reward": 0.305338547565043, |
|
"rewards/format_reward": 0.9049479402601719, |
|
"step": 276 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.9787985865724381, |
|
"grad_norm": 0.18267938124865138, |
|
"kl": 7.66796875, |
|
"learning_rate": 2.7523616252252972e-08, |
|
"loss": 0.3068, |
|
"reward": 1.1458333730697632, |
|
"reward_std": 0.372573995962739, |
|
"rewards/accuracy_reward": 0.2617187579162419, |
|
"rewards/format_reward": 0.8841146044433117, |
|
"step": 277 |
|
}, |
|
{ |
|
"completion_length": 990.7500038146973, |
|
"epoch": 0.9823321554770318, |
|
"grad_norm": 0.21259523984138215, |
|
"kl": 7.6875, |
|
"learning_rate": 1.9116303114480316e-08, |
|
"loss": 0.3076, |
|
"reward": 1.1731771230697632, |
|
"reward_std": 0.3659754488617182, |
|
"rewards/accuracy_reward": 0.28971354803070426, |
|
"rewards/format_reward": 0.8834635689854622, |
|
"step": 278 |
|
}, |
|
{ |
|
"completion_length": 1012.2291679382324, |
|
"epoch": 0.9858657243816255, |
|
"grad_norm": 0.16162769674837876, |
|
"kl": 7.580078125, |
|
"learning_rate": 1.2235837857387246e-08, |
|
"loss": 0.3032, |
|
"reward": 1.130859412252903, |
|
"reward_std": 0.3709502723067999, |
|
"rewards/accuracy_reward": 0.2500000074505806, |
|
"rewards/format_reward": 0.8808593973517418, |
|
"step": 279 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.9893992932862191, |
|
"grad_norm": 0.1297894260306723, |
|
"kl": 7.84375, |
|
"learning_rate": 6.883273035447335e-09, |
|
"loss": 0.3136, |
|
"reward": 1.1634114906191826, |
|
"reward_std": 0.3336097300052643, |
|
"rewards/accuracy_reward": 0.27539063431322575, |
|
"rewards/format_reward": 0.888020858168602, |
|
"step": 280 |
|
}, |
|
{ |
|
"completion_length": 1024.0, |
|
"epoch": 0.9929328621908127, |
|
"grad_norm": 0.1521970740888536, |
|
"kl": 7.67578125, |
|
"learning_rate": 3.0594274691686522e-09, |
|
"loss": 0.3069, |
|
"reward": 1.1373698264360428, |
|
"reward_std": 0.35927175264805555, |
|
"rewards/accuracy_reward": 0.258463550824672, |
|
"rewards/format_reward": 0.8789062723517418, |
|
"step": 281 |
|
}, |
|
{ |
|
"completion_length": 992.9791679382324, |
|
"epoch": 0.9964664310954063, |
|
"grad_norm": 0.19083083078751598, |
|
"kl": 7.509765625, |
|
"learning_rate": 7.648861198306101e-10, |
|
"loss": 0.3004, |
|
"reward": 1.164713580161333, |
|
"reward_std": 0.38056557066738605, |
|
"rewards/accuracy_reward": 0.299479179084301, |
|
"rewards/format_reward": 0.8652343973517418, |
|
"step": 282 |
|
}, |
|
{ |
|
"completion_length": 1004.0, |
|
"epoch": 1.0, |
|
"grad_norm": 0.22334310417649858, |
|
"kl": 7.740234375, |
|
"learning_rate": 0.0, |
|
"loss": 0.3095, |
|
"reward": 1.1516927778720856, |
|
"reward_std": 0.36419933661818504, |
|
"rewards/accuracy_reward": 0.26171875558793545, |
|
"rewards/format_reward": 0.889973983168602, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"step": 283, |
|
"total_flos": 0.0, |
|
"train_loss": 3.28601383127165, |
|
"train_runtime": 57853.0435, |
|
"train_samples_per_second": 1.252, |
|
"train_steps_per_second": 0.005 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 283, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 60, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|