{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.944, "eval_steps": 100, "global_step": 248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 288.16796875, "epoch": 0.016, "grad_norm": 0.9921875, "kl": 0.0, "learning_rate": 2.0000000000000002e-07, "loss": -0.0, "reward": 2.1448024585843086, "reward_std": 0.6503619067370892, "rewards/accuracy_reward": 0.064453125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.576262284691135, "rewards/reasoning_steps_reward": 0.35156250186264515, "step": 1 }, { "completion_length": 280.5390625, "epoch": 0.032, "grad_norm": 2.453125, "kl": 0.0, "learning_rate": 4.0000000000000003e-07, "loss": -0.0, "reward": 2.9461557120084763, "reward_std": 0.7598665952682495, "rewards/accuracy_reward": 0.017578125, "rewards/format_reward": 0.001953125, "rewards/novelty_reward_func_explore_exploit": 0.8809234369546175, "rewards/reasoning_steps_reward": 0.2838541753590107, "step": 2 }, { "completion_length": 282.580078125, "epoch": 0.048, "grad_norm": 1.609375, "kl": 0.0010201742788922274, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 2.4310644939541817, "reward_std": 0.7171800062060356, "rewards/accuracy_reward": 0.099609375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.68535483473291, "rewards/reasoning_steps_reward": 0.27539063314907253, "step": 3 }, { "completion_length": 281.859375, "epoch": 0.064, "grad_norm": 1.6640625, "kl": 0.0006131621394160902, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": 2.392010949552059, "reward_std": 0.7056797686964273, "rewards/accuracy_reward": 0.123046875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6480314154177904, "rewards/reasoning_steps_reward": 0.3248697896488011, "step": 4 }, { "completion_length": 276.5234375, "epoch": 0.08, "grad_norm": 1.140625, "kl": 0.0008916492552089039, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": 2.185966059565544, "reward_std": 0.7970924656838179, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6036553451170524, "rewards/reasoning_steps_reward": 0.2578125027939677, "step": 5 }, { "completion_length": 290.345703125, "epoch": 0.096, "grad_norm": 0.98828125, "kl": 0.0007805953682691325, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 2.586206890642643, "reward_std": 0.7317942306399345, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7433623660666248, "rewards/reasoning_steps_reward": 0.29361979849636555, "step": 6 }, { "completion_length": 288.357421875, "epoch": 0.112, "grad_norm": 3.3125, "kl": 0.0006862173449917464, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 2.9549497589468956, "reward_std": 0.7832636646926403, "rewards/accuracy_reward": 0.060546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8552089141060909, "rewards/reasoning_steps_reward": 0.3287760401144624, "step": 7 }, { "completion_length": 291.0859375, "epoch": 0.128, "grad_norm": 1.34375, "kl": 0.0007065349800541298, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "reward": 2.769632026553154, "reward_std": 0.6810889039188623, "rewards/accuracy_reward": 0.02734375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8127506040036678, "rewards/reasoning_steps_reward": 0.3040364640764892, "step": 8 }, { "completion_length": 285.240234375, "epoch": 0.144, "grad_norm": 1.4921875, "kl": 0.000676694346111617, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "reward": 2.951853834092617, "reward_std": 0.833003468811512, "rewards/accuracy_reward": 0.02734375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8860780304918686, "rewards/reasoning_steps_reward": 0.2662760391831398, "step": 9 }, { "completion_length": 273.15625, "epoch": 0.16, "grad_norm": 2.40625, "kl": 0.0007681718943786109, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "reward": 2.4880168437957764, "reward_std": 0.7941582556813955, "rewards/accuracy_reward": 0.083984375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6932712296644846, "rewards/reasoning_steps_reward": 0.3242187509313226, "step": 10 }, { "completion_length": 263.72265625, "epoch": 0.176, "grad_norm": 1.8125, "kl": 0.0007444877319358056, "learning_rate": 2.2e-06, "loss": 0.0, "reward": 2.060094438493252, "reward_std": 0.7453512959182262, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.540864814693729, "rewards/reasoning_steps_reward": 0.28125000139698386, "step": 11 }, { "completion_length": 287.52734375, "epoch": 0.192, "grad_norm": 0.84765625, "kl": 0.0006410041951312451, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "reward": 2.6830679774284363, "reward_std": 0.7234712429344654, "rewards/accuracy_reward": 0.07421875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.775215346676608, "rewards/reasoning_steps_reward": 0.28320312732830644, "step": 12 }, { "completion_length": 271.40625, "epoch": 0.208, "grad_norm": 1.6875, "kl": 0.0006587781517737312, "learning_rate": 2.6e-06, "loss": 0.0, "reward": 2.297848492860794, "reward_std": 0.7962923254817724, "rewards/accuracy_reward": 0.07421875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.654187332217892, "rewards/reasoning_steps_reward": 0.2610677082557231, "step": 13 }, { "completion_length": 282.728515625, "epoch": 0.224, "grad_norm": 1.1015625, "kl": 0.0009098516529775225, "learning_rate": 2.8000000000000003e-06, "loss": 0.0, "reward": 2.512993238866329, "reward_std": 0.7325041498988867, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.698124471741418, "rewards/reasoning_steps_reward": 0.30533855268731713, "step": 14 }, { "completion_length": 286.90234375, "epoch": 0.24, "grad_norm": 1.7890625, "kl": 0.0009447168922633864, "learning_rate": 3e-06, "loss": 0.0, "reward": 2.572930172085762, "reward_std": 0.7189842760562897, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7254819249113401, "rewards/reasoning_steps_reward": 0.34960938710719347, "step": 15 }, { "completion_length": 282.0234375, "epoch": 0.256, "grad_norm": 1.9453125, "kl": 0.0007176450344559271, "learning_rate": 3.2000000000000003e-06, "loss": 0.0, "reward": 2.3780763298273087, "reward_std": 0.7437136992812157, "rewards/accuracy_reward": 0.044921875, "rewards/format_reward": 0.001953125, "rewards/novelty_reward_func_explore_exploit": 0.668343149125576, "rewards/reasoning_steps_reward": 0.326171881519258, "step": 16 }, { "completion_length": 277.181640625, "epoch": 0.272, "grad_norm": 1.84375, "kl": 0.0009444843672099523, "learning_rate": 3.4000000000000005e-06, "loss": 0.0, "reward": 2.6828741505742073, "reward_std": 0.7672664560377598, "rewards/accuracy_reward": 0.068359375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7499771338577071, "rewards/reasoning_steps_reward": 0.3645833469927311, "step": 17 }, { "completion_length": 286.3671875, "epoch": 0.288, "grad_norm": 1.28125, "kl": 0.0009558251094858861, "learning_rate": 3.6000000000000003e-06, "loss": 0.0, "reward": 2.608707718551159, "reward_std": 0.7508547510951757, "rewards/accuracy_reward": 0.056640625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7523817578330636, "rewards/reasoning_steps_reward": 0.29492188477888703, "step": 18 }, { "completion_length": 288.197265625, "epoch": 0.304, "grad_norm": 1.3203125, "kl": 0.0009933830478985328, "learning_rate": 3.8000000000000005e-06, "loss": 0.0, "reward": 3.152822159230709, "reward_std": 0.7633876148611307, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.9196473040307561, "rewards/reasoning_steps_reward": 0.3626302182674408, "step": 19 }, { "completion_length": 291.763671875, "epoch": 0.32, "grad_norm": 0.95703125, "kl": 0.0010635810940584633, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "reward": 2.500213325023651, "reward_std": 0.7017618604004383, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6949495617300272, "rewards/reasoning_steps_reward": 0.32942708022892475, "step": 20 }, { "completion_length": 281.04296875, "epoch": 0.336, "grad_norm": 3872.0, "kl": 38.398836399162974, "learning_rate": 4.2000000000000004e-06, "loss": 1.536, "reward": 2.2889985144138336, "reward_std": 0.7787356674671173, "rewards/accuracy_reward": 0.068359375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6505863160515825, "rewards/reasoning_steps_reward": 0.2688802117481828, "step": 21 }, { "completion_length": 288.47265625, "epoch": 0.352, "grad_norm": 1.0859375, "kl": 0.0012669887473748531, "learning_rate": 4.4e-06, "loss": 0.0001, "reward": 2.6376563012599945, "reward_std": 0.6690970882773399, "rewards/accuracy_reward": 0.087890625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7099479290967187, "rewards/reasoning_steps_reward": 0.41992187313735485, "step": 22 }, { "completion_length": 286.966796875, "epoch": 0.368, "grad_norm": 1.2734375, "kl": 0.0015128458107938059, "learning_rate": 4.600000000000001e-06, "loss": 0.0001, "reward": 2.5921228751540184, "reward_std": 0.6939626764506102, "rewards/accuracy_reward": 0.0859375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7249350572625796, "rewards/reasoning_steps_reward": 0.33138021221384406, "step": 23 }, { "completion_length": 280.384765625, "epoch": 0.384, "grad_norm": 0.9375, "kl": 0.0015014593445812352, "learning_rate": 4.800000000000001e-06, "loss": 0.0001, "reward": 2.704825095832348, "reward_std": 0.8105385769158602, "rewards/accuracy_reward": 0.056640625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7590302489697933, "rewards/reasoning_steps_reward": 0.3710937546566129, "step": 24 }, { "completion_length": 290.41796875, "epoch": 0.4, "grad_norm": 0.9140625, "kl": 0.001665601652348414, "learning_rate": 5e-06, "loss": 0.0001, "reward": 2.357452914118767, "reward_std": 0.7088302746415138, "rewards/accuracy_reward": 0.064453125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6477967969452342, "rewards/reasoning_steps_reward": 0.349609381519258, "step": 25 }, { "completion_length": 291.509765625, "epoch": 0.416, "grad_norm": 0.9609375, "kl": 0.001865061596618034, "learning_rate": 4.999751919373782e-06, "loss": 0.0001, "reward": 2.281202170997858, "reward_std": 0.6989514082670212, "rewards/accuracy_reward": 0.08984375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.5948191303759813, "rewards/reasoning_steps_reward": 0.40690105222165585, "step": 26 }, { "completion_length": 287.421875, "epoch": 0.432, "grad_norm": 0.9140625, "kl": 0.002278451618622057, "learning_rate": 4.9990077267303256e-06, "loss": 0.0001, "reward": 2.39421396702528, "reward_std": 0.7001004256308079, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6474636799345413, "rewards/reasoning_steps_reward": 0.3346354281529784, "step": 27 }, { "completion_length": 289.5859375, "epoch": 0.448, "grad_norm": 1.0625, "kl": 0.0022466240770881996, "learning_rate": 4.997767569765452e-06, "loss": 0.0001, "reward": 2.673185594379902, "reward_std": 0.7204618379473686, "rewards/accuracy_reward": 0.060546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7612875507523617, "rewards/reasoning_steps_reward": 0.32877604896202683, "step": 28 }, { "completion_length": 290.353515625, "epoch": 0.464, "grad_norm": 0.94921875, "kl": 0.002421206998405978, "learning_rate": 4.996031694606294e-06, "loss": 0.0001, "reward": 2.2081645615398884, "reward_std": 0.7610204052180052, "rewards/accuracy_reward": 0.095703125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.5865322849713266, "rewards/reasoning_steps_reward": 0.3528645895421505, "step": 29 }, { "completion_length": 284.9765625, "epoch": 0.48, "grad_norm": 0.95703125, "kl": 0.0032389966800110415, "learning_rate": 4.993800445762451e-06, "loss": 0.0001, "reward": 2.4655835777521133, "reward_std": 0.8400940801948309, "rewards/accuracy_reward": 0.087890625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6395695237442851, "rewards/reasoning_steps_reward": 0.45898438431322575, "step": 30 }, { "completion_length": 288.93359375, "epoch": 0.496, "grad_norm": 1.0, "kl": 0.0028132440565968864, "learning_rate": 4.991074266057609e-06, "loss": 0.0001, "reward": 2.666738063097, "reward_std": 0.6789926886558533, "rewards/accuracy_reward": 0.087890625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7322286684066057, "rewards/reasoning_steps_reward": 0.3821614580228925, "step": 31 }, { "completion_length": 292.3671875, "epoch": 0.512, "grad_norm": 1.0078125, "kl": 0.004060989667777903, "learning_rate": 4.987853696541664e-06, "loss": 0.0002, "reward": 2.5818087458610535, "reward_std": 0.6875880807638168, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7134674986203512, "rewards/reasoning_steps_reward": 0.3789062611758709, "step": 32 }, { "completion_length": 286.158203125, "epoch": 0.528, "grad_norm": 1.140625, "kl": 0.005552116854232736, "learning_rate": 4.984139376383337e-06, "loss": 0.0002, "reward": 2.8399546705186367, "reward_std": 0.750790286809206, "rewards/accuracy_reward": 0.138671875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7791168199231228, "rewards/reasoning_steps_reward": 0.3639322938397527, "step": 33 }, { "completion_length": 287.48828125, "epoch": 0.544, "grad_norm": 3.171875, "kl": 0.00440279851318337, "learning_rate": 4.979932042743324e-06, "loss": 0.0002, "reward": 3.1019199565052986, "reward_std": 0.8068479858338833, "rewards/accuracy_reward": 0.06640625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.877940321341157, "rewards/reasoning_steps_reward": 0.4016927145421505, "step": 34 }, { "completion_length": 291.712890625, "epoch": 0.56, "grad_norm": 0.83984375, "kl": 0.003549927467247471, "learning_rate": 4.975232530627998e-06, "loss": 0.0001, "reward": 2.758346803486347, "reward_std": 0.73613665625453, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7701433822512627, "rewards/reasoning_steps_reward": 0.3971354244276881, "step": 35 }, { "completion_length": 280.013671875, "epoch": 0.576, "grad_norm": 0.94921875, "kl": 0.004770460931467824, "learning_rate": 4.970041772723685e-06, "loss": 0.0002, "reward": 2.5518586486577988, "reward_std": 0.752994803711772, "rewards/accuracy_reward": 0.185546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6477115458498398, "rewards/reasoning_steps_reward": 0.4231770895421505, "step": 36 }, { "completion_length": 294.7265625, "epoch": 0.592, "grad_norm": 0.88671875, "kl": 0.004318368082749657, "learning_rate": 4.964360799211563e-06, "loss": 0.0002, "reward": 2.9847040474414825, "reward_std": 0.7252895850688219, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8412555102258921, "rewards/reasoning_steps_reward": 0.42578124813735485, "step": 37 }, { "completion_length": 287.59765625, "epoch": 0.608, "grad_norm": 0.92578125, "kl": 0.005480331514263526, "learning_rate": 4.958190737563203e-06, "loss": 0.0002, "reward": 2.4749373346567154, "reward_std": 0.7473156917840242, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6489808422823747, "rewards/reasoning_steps_reward": 0.41471355129033327, "step": 38 }, { "completion_length": 295.189453125, "epoch": 0.624, "grad_norm": 0.8828125, "kl": 0.005519463520613499, "learning_rate": 4.951532812316814e-06, "loss": 0.0002, "reward": 2.7017148807644844, "reward_std": 0.713581632822752, "rewards/accuracy_reward": 0.03515625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7169778756797314, "rewards/reasoning_steps_reward": 0.5156250111758709, "step": 39 }, { "completion_length": 289.95703125, "epoch": 0.64, "grad_norm": 0.8828125, "kl": 0.005352065360057168, "learning_rate": 4.944388344834205e-06, "loss": 0.0002, "reward": 2.7851984202861786, "reward_std": 0.658753015100956, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7454567709937692, "rewards/reasoning_steps_reward": 0.43945313338190317, "step": 40 }, { "completion_length": 290.80078125, "epoch": 0.656, "grad_norm": 0.8515625, "kl": 0.00584478146629408, "learning_rate": 4.936758753038551e-06, "loss": 0.0002, "reward": 2.83456464856863, "reward_std": 0.6670792158693075, "rewards/accuracy_reward": 0.056640625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7892559381822745, "rewards/reasoning_steps_reward": 0.41015625838190317, "step": 41 }, { "completion_length": 286.533203125, "epoch": 0.672, "grad_norm": 1.09375, "kl": 0.009831015078816563, "learning_rate": 4.92864555113298e-06, "loss": 0.0004, "reward": 3.0447439029812813, "reward_std": 0.6739194095134735, "rewards/accuracy_reward": 0.16015625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8074493408203125, "rewards/reasoning_steps_reward": 0.4622395820915699, "step": 42 }, { "completion_length": 295.37109375, "epoch": 0.688, "grad_norm": 0.95703125, "kl": 0.0045513896038755774, "learning_rate": 4.92005034930006e-06, "loss": 0.0002, "reward": 2.367835894227028, "reward_std": 0.6928801033645868, "rewards/accuracy_reward": 0.05859375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6254331463327011, "rewards/reasoning_steps_reward": 0.4329427080228925, "step": 43 }, { "completion_length": 291.12890625, "epoch": 0.704, "grad_norm": 0.89453125, "kl": 0.007478385392460041, "learning_rate": 4.9109748533822315e-06, "loss": 0.0003, "reward": 3.1017851755023003, "reward_std": 0.7546116765588522, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8752912487834692, "rewards/reasoning_steps_reward": 0.42513021221384406, "step": 44 }, { "completion_length": 286.34375, "epoch": 0.72, "grad_norm": 0.90234375, "kl": 0.007521548090153374, "learning_rate": 4.901420864543265e-06, "loss": 0.0003, "reward": 2.608507961034775, "reward_std": 0.6855722554028034, "rewards/accuracy_reward": 0.119140625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.674624165520072, "rewards/reasoning_steps_reward": 0.4654947901144624, "step": 45 }, { "completion_length": 287.42578125, "epoch": 0.736, "grad_norm": 0.984375, "kl": 0.006817970628617331, "learning_rate": 4.891390278910788e-06, "loss": 0.0003, "reward": 2.673181392252445, "reward_std": 0.7831938974559307, "rewards/accuracy_reward": 0.095703125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7087687657525142, "rewards/reasoning_steps_reward": 0.4511718712747097, "step": 46 }, { "completion_length": 285.908203125, "epoch": 0.752, "grad_norm": 1.2890625, "kl": 0.007845322310458869, "learning_rate": 4.880885087199972e-06, "loss": 0.0003, "reward": 2.7633985728025436, "reward_std": 0.7148055490106344, "rewards/accuracy_reward": 0.099609375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.750776955857873, "rewards/reasoning_steps_reward": 0.41145834047347307, "step": 47 }, { "completion_length": 289.779296875, "epoch": 0.768, "grad_norm": 1.15625, "kl": 0.008501806572894566, "learning_rate": 4.869907374318446e-06, "loss": 0.0003, "reward": 3.029990702867508, "reward_std": 0.7890328913927078, "rewards/accuracy_reward": 0.08203125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8099100968490044, "rewards/reasoning_steps_reward": 0.5182291734963655, "step": 48 }, { "completion_length": 288.1328125, "epoch": 0.784, "grad_norm": 1.5390625, "kl": 0.008691710012499243, "learning_rate": 4.858459318952521e-06, "loss": 0.0003, "reward": 2.929666645824909, "reward_std": 0.7696562893688679, "rewards/accuracy_reward": 0.072265625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7836301922798157, "rewards/reasoning_steps_reward": 0.5065104309469461, "step": 49 }, { "completion_length": 290.06640625, "epoch": 0.8, "grad_norm": 0.95703125, "kl": 0.007455944927642122, "learning_rate": 4.8465431931347904e-06, "loss": 0.0003, "reward": 2.573406994342804, "reward_std": 0.7570146657526493, "rewards/accuracy_reward": 0.103515625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6466478169895709, "rewards/reasoning_steps_reward": 0.529947929084301, "step": 50 }, { "completion_length": 291.49609375, "epoch": 0.816, "grad_norm": 0.98046875, "kl": 0.01057859291904606, "learning_rate": 4.83416136179322e-06, "loss": 0.0004, "reward": 2.6446976363658905, "reward_std": 0.6847481243312359, "rewards/accuracy_reward": 0.025390625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7094738796974221, "rewards/reasoning_steps_reward": 0.49088541977107525, "step": 51 }, { "completion_length": 282.392578125, "epoch": 0.832, "grad_norm": 2.03125, "kl": 0.01016361394431442, "learning_rate": 4.821316282281788e-06, "loss": 0.0004, "reward": 2.766519770026207, "reward_std": 0.7637902311980724, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6958277653902769, "rewards/reasoning_steps_reward": 0.5462239757180214, "step": 52 }, { "completion_length": 287.380859375, "epoch": 0.848, "grad_norm": 1.5, "kl": 0.00917052014847286, "learning_rate": 4.808010503892788e-06, "loss": 0.0004, "reward": 2.570107080042362, "reward_std": 0.7531391996890306, "rewards/accuracy_reward": 0.10546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6537943718334039, "rewards/reasoning_steps_reward": 0.5032552108168602, "step": 53 }, { "completion_length": 291.640625, "epoch": 0.864, "grad_norm": 0.89453125, "kl": 0.009139836591202766, "learning_rate": 4.794246667350889e-06, "loss": 0.0004, "reward": 2.8398406505584717, "reward_std": 0.7224904727190733, "rewards/accuracy_reward": 0.04296875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.770832309499383, "rewards/reasoning_steps_reward": 0.484375006519258, "step": 54 }, { "completion_length": 273.5, "epoch": 0.88, "grad_norm": 1.2734375, "kl": 0.009875323972664773, "learning_rate": 4.780027504289043e-06, "loss": 0.0004, "reward": 2.9461885392665863, "reward_std": 0.7379185315221548, "rewards/accuracy_reward": 0.193359375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7422624975442886, "rewards/reasoning_steps_reward": 0.5260416679084301, "step": 55 }, { "completion_length": 291.078125, "epoch": 0.896, "grad_norm": 0.8828125, "kl": 0.00826937542296946, "learning_rate": 4.765355836706349e-06, "loss": 0.0003, "reward": 2.880779907107353, "reward_std": 0.711861016228795, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7781853148092827, "rewards/reasoning_steps_reward": 0.5149739719927311, "step": 56 }, { "completion_length": 288.72265625, "epoch": 0.912, "grad_norm": 0.9140625, "kl": 0.008809896156890318, "learning_rate": 4.750234576407994e-06, "loss": 0.0004, "reward": 2.6955473721027374, "reward_std": 0.8277835454791784, "rewards/accuracy_reward": 0.087890625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6945227358179787, "rewards/reasoning_steps_reward": 0.5240885429084301, "step": 57 }, { "completion_length": 285.9921875, "epoch": 0.928, "grad_norm": 0.90234375, "kl": 0.008899325417587534, "learning_rate": 4.734666724427357e-06, "loss": 0.0004, "reward": 3.041392058134079, "reward_std": 0.6156999934464693, "rewards/accuracy_reward": 0.115234375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8032938965285817, "rewards/reasoning_steps_reward": 0.516276054084301, "step": 58 }, { "completion_length": 290.314453125, "epoch": 0.944, "grad_norm": 4.21875, "kl": 0.011989369959337637, "learning_rate": 4.718655370430411e-06, "loss": 0.0005, "reward": 2.9865424036979675, "reward_std": 0.8030446134507656, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7860957235097885, "rewards/reasoning_steps_reward": 0.5501302164047956, "step": 59 }, { "completion_length": 282.091796875, "epoch": 0.96, "grad_norm": 1.8125, "kl": 0.012043666996760294, "learning_rate": 4.702203692102539e-06, "loss": 0.0005, "reward": 3.1328602582216263, "reward_std": 0.6528369020670652, "rewards/accuracy_reward": 0.111328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8283579163253307, "rewards/reasoning_steps_reward": 0.5364583358168602, "step": 60 }, { "completion_length": 288.666015625, "epoch": 0.976, "grad_norm": 0.76953125, "kl": 0.009388500155182555, "learning_rate": 4.68531495451787e-06, "loss": 0.0004, "reward": 2.58310616761446, "reward_std": 0.6356705613434315, "rewards/accuracy_reward": 0.126953125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6383791274080673, "rewards/reasoning_steps_reward": 0.5410156436264515, "step": 61 }, { "completion_length": 288.513671875, "epoch": 0.992, "grad_norm": 1.203125, "kl": 0.010823950171470642, "learning_rate": 4.66799250949128e-06, "loss": 0.0004, "reward": 3.1646435484290123, "reward_std": 0.7192362230271101, "rewards/accuracy_reward": 0.095703125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8294037394225597, "rewards/reasoning_steps_reward": 0.5807291716337204, "step": 62 }, { "completion_length": 283.671875, "epoch": 1.0, "grad_norm": 0.65234375, "kl": 0.011424218711908907, "learning_rate": 4.650239794913177e-06, "loss": 0.0002, "reward": 2.6004482805728912, "reward_std": 0.775815561413765, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6493681768576304, "rewards/reasoning_steps_reward": 0.5039062574505806, "step": 63 }, { "completion_length": 293.845703125, "epoch": 1.016, "grad_norm": 1.921875, "kl": 0.011366115068085492, "learning_rate": 4.632060334067202e-06, "loss": 0.0005, "reward": 2.7260814532637596, "reward_std": 0.6874045897275209, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6918969408919414, "rewards/reasoning_steps_reward": 0.5722656305879354, "step": 64 }, { "completion_length": 294.06640625, "epoch": 1.032, "grad_norm": 2.171875, "kl": 0.012063174799550325, "learning_rate": 4.613457734930978e-06, "loss": 0.0005, "reward": 2.6708649322390556, "reward_std": 0.6978613398969173, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6702362283443412, "rewards/reasoning_steps_reward": 0.5507812555879354, "step": 65 }, { "completion_length": 293.265625, "epoch": 1.048, "grad_norm": 0.91015625, "kl": 0.010817280679475516, "learning_rate": 4.5944356894600615e-06, "loss": 0.0004, "reward": 2.96081106364727, "reward_std": 0.7362911906093359, "rewards/accuracy_reward": 0.052734375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7738293862591187, "rewards/reasoning_steps_reward": 0.5865885466337204, "step": 66 }, { "completion_length": 278.333984375, "epoch": 1.064, "grad_norm": 0.82421875, "kl": 0.010780130076454952, "learning_rate": 4.574997972855212e-06, "loss": 0.0004, "reward": 2.909902695566416, "reward_std": 0.6607580352574587, "rewards/accuracy_reward": 0.228515625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.708031815631936, "rewards/reasoning_steps_reward": 0.5572916734963655, "step": 67 }, { "completion_length": 283.216796875, "epoch": 1.08, "grad_norm": 0.890625, "kl": 0.012393000011797994, "learning_rate": 4.5551484428131575e-06, "loss": 0.0005, "reward": 2.827034629881382, "reward_std": 0.6700945645570755, "rewards/accuracy_reward": 0.130859375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7125271611536542, "rewards/reasoning_steps_reward": 0.5585937593132257, "step": 68 }, { "completion_length": 288.322265625, "epoch": 1.096, "grad_norm": 1.21875, "kl": 0.013504860282409936, "learning_rate": 4.534891038760971e-06, "loss": 0.0005, "reward": 3.1474373564124107, "reward_std": 0.7250996101647615, "rewards/accuracy_reward": 0.07421875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.818677028020223, "rewards/reasoning_steps_reward": 0.6171875037252903, "step": 69 }, { "completion_length": 282.470703125, "epoch": 1.112, "grad_norm": 1.7421875, "kl": 0.010700971761252731, "learning_rate": 4.514229781074239e-06, "loss": 0.0004, "reward": 2.8449594378471375, "reward_std": 0.7744644097983837, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.70309411889563, "rewards/reasoning_steps_reward": 0.5872395783662796, "step": 70 }, { "completion_length": 290.74609375, "epoch": 1.1280000000000001, "grad_norm": 0.98828125, "kl": 0.012390443938784301, "learning_rate": 4.49316877027916e-06, "loss": 0.0005, "reward": 2.777492232620716, "reward_std": 0.6991744674742222, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6892856055249771, "rewards/reasoning_steps_reward": 0.6002604309469461, "step": 71 }, { "completion_length": 286.109375, "epoch": 1.144, "grad_norm": 0.921875, "kl": 0.012579885253217071, "learning_rate": 4.471712186238728e-06, "loss": 0.0005, "reward": 2.548068232834339, "reward_std": 0.6026105545461178, "rewards/accuracy_reward": 0.15234375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6145470403134823, "rewards/reasoning_steps_reward": 0.5520833395421505, "step": 72 }, { "completion_length": 283.822265625, "epoch": 1.16, "grad_norm": 0.80859375, "kl": 0.01136038324330002, "learning_rate": 4.449864287323188e-06, "loss": 0.0005, "reward": 2.7529877200722694, "reward_std": 0.575440164655447, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6726538874208927, "rewards/reasoning_steps_reward": 0.633463554084301, "step": 73 }, { "completion_length": 287.66015625, "epoch": 1.176, "grad_norm": 0.99609375, "kl": 0.013483586255460978, "learning_rate": 4.427629409564898e-06, "loss": 0.0005, "reward": 2.6529831513762474, "reward_std": 0.7726290188729763, "rewards/accuracy_reward": 0.025390625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6770794546852509, "rewards/reasoning_steps_reward": 0.5963541734963655, "step": 74 }, { "completion_length": 287.505859375, "epoch": 1.192, "grad_norm": 0.8828125, "kl": 0.010077012644615024, "learning_rate": 4.405011965797775e-06, "loss": 0.0004, "reward": 2.944363258779049, "reward_std": 0.6908796802163124, "rewards/accuracy_reward": 0.111328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7551089283078909, "rewards/reasoning_steps_reward": 0.567708333954215, "step": 75 }, { "completion_length": 286.248046875, "epoch": 1.208, "grad_norm": 1.109375, "kl": 0.014365001203259453, "learning_rate": 4.382016444781509e-06, "loss": 0.0006, "reward": 2.8981464356184006, "reward_std": 0.7666896525770426, "rewards/accuracy_reward": 0.09765625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7290696154038111, "rewards/reasoning_steps_reward": 0.6132812537252903, "step": 76 }, { "completion_length": 296.5078125, "epoch": 1.224, "grad_norm": 0.91796875, "kl": 0.011508767551276833, "learning_rate": 4.3586474103107034e-06, "loss": 0.0005, "reward": 3.2085797861218452, "reward_std": 0.7307887077331543, "rewards/accuracy_reward": 0.0078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8637974336743355, "rewards/reasoning_steps_reward": 0.6093750055879354, "step": 77 }, { "completion_length": 288.017578125, "epoch": 1.24, "grad_norm": 1.7890625, "kl": 0.01725275401258841, "learning_rate": 4.334909500309124e-06, "loss": 0.0007, "reward": 2.819778010249138, "reward_std": 0.680737467482686, "rewards/accuracy_reward": 0.087890625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7289884922405084, "rewards/reasoning_steps_reward": 0.5449218675494194, "step": 78 }, { "completion_length": 289.611328125, "epoch": 1.256, "grad_norm": 0.91015625, "kl": 0.012111473915865645, "learning_rate": 4.310807425909231e-06, "loss": 0.0005, "reward": 2.8959785476326942, "reward_std": 0.7515880167484283, "rewards/accuracy_reward": 0.091796875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7370275671904286, "rewards/reasoning_steps_reward": 0.5930989608168602, "step": 79 }, { "completion_length": 289.447265625, "epoch": 1.272, "grad_norm": 1.0625, "kl": 0.01488638247246854, "learning_rate": 4.286345970517195e-06, "loss": 0.0006, "reward": 3.0342861488461494, "reward_std": 0.7542771827429533, "rewards/accuracy_reward": 0.08203125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7896405051772794, "rewards/reasoning_steps_reward": 0.5833333432674408, "step": 80 }, { "completion_length": 291.05078125, "epoch": 1.288, "grad_norm": 2.203125, "kl": 0.018765830318443477, "learning_rate": 4.261529988863552e-06, "loss": 0.0008, "reward": 2.6918394044041634, "reward_std": 0.5996266044676304, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6713683251291513, "rewards/reasoning_steps_reward": 0.6230468954890966, "step": 81 }, { "completion_length": 286.84765625, "epoch": 1.304, "grad_norm": 1.1796875, "kl": 0.014768981840461493, "learning_rate": 4.236364406039718e-06, "loss": 0.0006, "reward": 2.7222700491547585, "reward_std": 0.7165388073772192, "rewards/accuracy_reward": 0.166015625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6546021662652493, "rewards/reasoning_steps_reward": 0.5924479365348816, "step": 82 }, { "completion_length": 286.40625, "epoch": 1.32, "grad_norm": 1.1875, "kl": 0.013985031400807202, "learning_rate": 4.210854216520529e-06, "loss": 0.0006, "reward": 2.992369443178177, "reward_std": 0.704998791217804, "rewards/accuracy_reward": 0.140625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7387759207437435, "rewards/reasoning_steps_reward": 0.6354166734963655, "step": 83 }, { "completion_length": 285.0546875, "epoch": 1.336, "grad_norm": 0.98828125, "kl": 0.015817424457054585, "learning_rate": 4.185004483173018e-06, "loss": 0.0006, "reward": 2.6470197066664696, "reward_std": 0.6006427239626646, "rewards/accuracy_reward": 0.091796875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6638069117131332, "rewards/reasoning_steps_reward": 0.5638020895421505, "step": 84 }, { "completion_length": 289.240234375, "epoch": 1.3519999999999999, "grad_norm": 0.7734375, "kl": 0.012478121934691444, "learning_rate": 4.158820336251615e-06, "loss": 0.0005, "reward": 2.86134272813797, "reward_std": 0.6924843583256006, "rewards/accuracy_reward": 0.103515625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7050829759488503, "rewards/reasoning_steps_reward": 0.6425781324505806, "step": 85 }, { "completion_length": 290.703125, "epoch": 1.3679999999999999, "grad_norm": 1.0, "kl": 0.01463651837548241, "learning_rate": 4.132306972379971e-06, "loss": 0.0006, "reward": 2.752312555909157, "reward_std": 0.6686646416783333, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6626632142191132, "rewards/reasoning_steps_reward": 0.6705729197710752, "step": 86 }, { "completion_length": 295.359375, "epoch": 1.384, "grad_norm": 6.46875, "kl": 0.051578508340753615, "learning_rate": 4.105469653519617e-06, "loss": 0.0021, "reward": 2.62810418009758, "reward_std": 0.7081009931862354, "rewards/accuracy_reward": 0.08203125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.630809023976326, "rewards/reasoning_steps_reward": 0.6536458320915699, "step": 87 }, { "completion_length": 274.9765625, "epoch": 1.4, "grad_norm": 0.9921875, "kl": 0.01592816604534164, "learning_rate": 4.078313705925647e-06, "loss": 0.0006, "reward": 2.9463500678539276, "reward_std": 0.6255538109689951, "rewards/accuracy_reward": 0.171875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.735371884269019, "rewards/reasoning_steps_reward": 0.5683593694120646, "step": 88 }, { "completion_length": 285.123046875, "epoch": 1.416, "grad_norm": 0.9375, "kl": 0.016973954916466027, "learning_rate": 4.0508445190896505e-06, "loss": 0.0007, "reward": 2.821994110941887, "reward_std": 0.6989093981683254, "rewards/accuracy_reward": 0.123046875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7006473361204067, "rewards/reasoning_steps_reward": 0.5970052275806665, "step": 89 }, { "completion_length": 289.72265625, "epoch": 1.432, "grad_norm": 1.03125, "kl": 0.013915765506681055, "learning_rate": 4.023067544670082e-06, "loss": 0.0006, "reward": 2.775428354740143, "reward_std": 0.6686036083847284, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7163754136612018, "rewards/reasoning_steps_reward": 0.5755208488553762, "step": 90 }, { "completion_length": 285.888671875, "epoch": 1.448, "grad_norm": 0.84375, "kl": 0.014326595468446612, "learning_rate": 3.9949882954103115e-06, "loss": 0.0006, "reward": 2.778537377715111, "reward_std": 0.6794710606336594, "rewards/accuracy_reward": 0.10546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6774811816091338, "rewards/reasoning_steps_reward": 0.6406249962747097, "step": 91 }, { "completion_length": 288.5703125, "epoch": 1.464, "grad_norm": 1.0078125, "kl": 0.0172699682880193, "learning_rate": 3.9666123440445295e-06, "loss": 0.0007, "reward": 3.1450441628694534, "reward_std": 0.6363171022385359, "rewards/accuracy_reward": 0.08203125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8013862585648894, "rewards/reasoning_steps_reward": 0.6588541604578495, "step": 92 }, { "completion_length": 290.484375, "epoch": 1.48, "grad_norm": 1.0703125, "kl": 0.01583321939688176, "learning_rate": 3.937945322191763e-06, "loss": 0.0006, "reward": 2.80034501850605, "reward_std": 0.6433209720999002, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6977712381631136, "rewards/reasoning_steps_reward": 0.6601562574505806, "step": 93 }, { "completion_length": 288.1953125, "epoch": 1.496, "grad_norm": 0.8515625, "kl": 0.014628544799052179, "learning_rate": 3.9089929192382e-06, "loss": 0.0006, "reward": 2.8053995221853256, "reward_std": 0.6814130581915379, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6920776072268685, "rewards/reasoning_steps_reward": 0.627604166045785, "step": 94 }, { "completion_length": 281.654296875, "epoch": 1.512, "grad_norm": 1.7578125, "kl": 0.018830388551577926, "learning_rate": 3.879760881208043e-06, "loss": 0.0008, "reward": 3.1405431628227234, "reward_std": 0.6778986994177103, "rewards/accuracy_reward": 0.126953125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7955456127723058, "rewards/reasoning_steps_reward": 0.6269531361758709, "step": 95 }, { "completion_length": 288.43359375, "epoch": 1.528, "grad_norm": 0.97265625, "kl": 0.015546579379588366, "learning_rate": 3.8502550096231325e-06, "loss": 0.0006, "reward": 2.9025785624980927, "reward_std": 0.6252446379512548, "rewards/accuracy_reward": 0.14453125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6997310233612856, "rewards/reasoning_steps_reward": 0.6588541865348816, "step": 96 }, { "completion_length": 287.794921875, "epoch": 1.544, "grad_norm": 1.90625, "kl": 0.01677697291597724, "learning_rate": 3.82048116035155e-06, "loss": 0.0007, "reward": 2.9905193150043488, "reward_std": 0.6695100143551826, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7668050316472849, "rewards/reasoning_steps_reward": 0.5807291697710752, "step": 97 }, { "completion_length": 290.875, "epoch": 1.56, "grad_norm": 0.93359375, "kl": 0.017896617820952088, "learning_rate": 3.790445242445432e-06, "loss": 0.0007, "reward": 3.0564729273319244, "reward_std": 0.7583746667951345, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.79595104791224, "rewards/reasoning_steps_reward": 0.6061197966337204, "step": 98 }, { "completion_length": 290.935546875, "epoch": 1.576, "grad_norm": 0.87890625, "kl": 0.01600857445737347, "learning_rate": 3.7601532169682363e-06, "loss": 0.0006, "reward": 3.207048572599888, "reward_std": 0.7255587056279182, "rewards/accuracy_reward": 0.10546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8183651498208443, "rewards/reasoning_steps_reward": 0.6464843899011612, "step": 99 }, { "completion_length": 293.923828125, "epoch": 1.592, "grad_norm": 0.94921875, "kl": 0.016008648555725813, "learning_rate": 3.7296110958116845e-06, "loss": 0.0006, "reward": 3.213783323764801, "reward_std": 0.7248476631939411, "rewards/accuracy_reward": 0.05859375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8229972099264463, "rewards/reasoning_steps_reward": 0.6861979197710752, "step": 100 }, { "completion_length": 293.623046875, "epoch": 1.608, "grad_norm": 1.890625, "kl": 0.018201105995103717, "learning_rate": 3.69882494050261e-06, "loss": 0.0007, "reward": 3.1282228976488113, "reward_std": 0.7127013597637415, "rewards/accuracy_reward": 0.072265625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.780154167364041, "rewards/reasoning_steps_reward": 0.7154948115348816, "step": 101 }, { "completion_length": 282.21875, "epoch": 1.624, "grad_norm": 1.0078125, "kl": 0.01832750393077731, "learning_rate": 3.6678008609999618e-06, "loss": 0.0007, "reward": 2.694710373878479, "reward_std": 0.6631567031145096, "rewards/accuracy_reward": 0.142578125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.630658664740622, "rewards/reasoning_steps_reward": 0.6601562537252903, "step": 102 }, { "completion_length": 286.57421875, "epoch": 1.6400000000000001, "grad_norm": 0.8359375, "kl": 0.01992178033106029, "learning_rate": 3.636545014482198e-06, "loss": 0.0008, "reward": 2.5292934477329254, "reward_std": 0.6474510300904512, "rewards/accuracy_reward": 0.123046875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.5965700279921293, "rewards/reasoning_steps_reward": 0.6165364626795053, "step": 103 }, { "completion_length": 292.59375, "epoch": 1.6560000000000001, "grad_norm": 0.9140625, "kl": 0.01710453676059842, "learning_rate": 3.6050636041252996e-06, "loss": 0.0007, "reward": 2.915451444685459, "reward_std": 0.7090357206761837, "rewards/accuracy_reward": 0.072265625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7202980586638054, "rewards/reasoning_steps_reward": 0.6822916753590107, "step": 104 }, { "completion_length": 289.3359375, "epoch": 1.6720000000000002, "grad_norm": 1.0859375, "kl": 0.017357071512378752, "learning_rate": 3.5733628778716645e-06, "loss": 0.0007, "reward": 3.073413372039795, "reward_std": 0.6876837071031332, "rewards/accuracy_reward": 0.119140625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7621013515939316, "rewards/reasoning_steps_reward": 0.6679687574505806, "step": 105 }, { "completion_length": 291.736328125, "epoch": 1.688, "grad_norm": 0.96875, "kl": 0.02157578180776909, "learning_rate": 3.5414491271901073e-06, "loss": 0.0009, "reward": 2.819728344678879, "reward_std": 0.5820730160921812, "rewards/accuracy_reward": 0.123046875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6790587411572536, "rewards/reasoning_steps_reward": 0.6595052182674408, "step": 106 }, { "completion_length": 289.84375, "epoch": 1.704, "grad_norm": 0.90234375, "kl": 0.0179019469069317, "learning_rate": 3.5093286858272325e-06, "loss": 0.0007, "reward": 3.1114601120352745, "reward_std": 0.6653738301247358, "rewards/accuracy_reward": 0.08203125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7914936700835824, "rewards/reasoning_steps_reward": 0.6549479365348816, "step": 107 }, { "completion_length": 288.5234375, "epoch": 1.72, "grad_norm": 0.99609375, "kl": 0.01925749407382682, "learning_rate": 3.4770079285504053e-06, "loss": 0.0008, "reward": 2.79416061937809, "reward_std": 0.7290520258247852, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6850760908176502, "rewards/reasoning_steps_reward": 0.6608072835952044, "step": 108 }, { "completion_length": 290.9921875, "epoch": 1.736, "grad_norm": 0.98046875, "kl": 0.017092529160436243, "learning_rate": 3.4444932698825904e-06, "loss": 0.0007, "reward": 3.1415600925683975, "reward_std": 0.7225816715508699, "rewards/accuracy_reward": 0.064453125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7919783468047777, "rewards/reasoning_steps_reward": 0.7011718712747097, "step": 109 }, { "completion_length": 293.560546875, "epoch": 1.752, "grad_norm": 0.88671875, "kl": 0.016746411798521876, "learning_rate": 3.4117911628292944e-06, "loss": 0.0007, "reward": 2.7672165408730507, "reward_std": 0.6844876762479544, "rewards/accuracy_reward": 0.041015625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.680652025466164, "rewards/reasoning_steps_reward": 0.6842447929084301, "step": 110 }, { "completion_length": 285.9921875, "epoch": 1.768, "grad_norm": 0.98828125, "kl": 0.018680680135730654, "learning_rate": 3.378908097597875e-06, "loss": 0.0007, "reward": 2.875435918569565, "reward_std": 0.6584971006959677, "rewards/accuracy_reward": 0.119140625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.688730369011561, "rewards/reasoning_steps_reward": 0.6901041753590107, "step": 111 }, { "completion_length": 284.619140625, "epoch": 1.784, "grad_norm": 1.1796875, "kl": 0.01885543600656092, "learning_rate": 3.3458506003094626e-06, "loss": 0.0008, "reward": 3.2833499684929848, "reward_std": 0.630975978448987, "rewards/accuracy_reward": 0.15234375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8016982388993105, "rewards/reasoning_steps_reward": 0.7259114682674408, "step": 112 }, { "completion_length": 294.26953125, "epoch": 1.8, "grad_norm": 0.859375, "kl": 0.01702951017068699, "learning_rate": 3.3126252317037616e-06, "loss": 0.0007, "reward": 3.0866554528474808, "reward_std": 0.7168434467166662, "rewards/accuracy_reward": 0.046875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7908209301531315, "rewards/reasoning_steps_reward": 0.6673177108168602, "step": 113 }, { "completion_length": 288.5625, "epoch": 1.8159999999999998, "grad_norm": 31.875, "kl": 0.15217732661403716, "learning_rate": 3.2792385858369706e-06, "loss": 0.0061, "reward": 2.8756242617964745, "reward_std": 0.6984493192285299, "rewards/accuracy_reward": 0.056640625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7315449056526026, "rewards/reasoning_steps_reward": 0.6243489552289248, "step": 114 }, { "completion_length": 283.037109375, "epoch": 1.8319999999999999, "grad_norm": 0.7578125, "kl": 0.016328598430845886, "learning_rate": 3.245697288773102e-06, "loss": 0.0007, "reward": 2.902892917394638, "reward_std": 0.6528493817895651, "rewards/accuracy_reward": 0.158203125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6931084062283238, "rewards/reasoning_steps_reward": 0.6653645895421505, "step": 115 }, { "completion_length": 292.095703125, "epoch": 1.8479999999999999, "grad_norm": 0.9140625, "kl": 0.018676706589758396, "learning_rate": 3.2120079972689385e-06, "loss": 0.0007, "reward": 2.9004068598151207, "reward_std": 0.7504412587732077, "rewards/accuracy_reward": 0.087890625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7139811124652624, "rewards/reasoning_steps_reward": 0.6705729253590107, "step": 116 }, { "completion_length": 288.615234375, "epoch": 1.8639999999999999, "grad_norm": 1.1171875, "kl": 0.020321853808127344, "learning_rate": 3.1781773974529072e-06, "loss": 0.0008, "reward": 2.7037860229611397, "reward_std": 0.6163357421755791, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6365050650201738, "rewards/reasoning_steps_reward": 0.7005208358168602, "step": 117 }, { "completion_length": 290.62109375, "epoch": 1.88, "grad_norm": 1.09375, "kl": 0.019285056594526395, "learning_rate": 3.1442122034981187e-06, "loss": 0.0008, "reward": 2.6533412411808968, "reward_std": 0.6223033964633942, "rewards/accuracy_reward": 0.10546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6294557445993026, "rewards/reasoning_steps_reward": 0.6595052145421505, "step": 118 }, { "completion_length": 290.82421875, "epoch": 1.896, "grad_norm": 1.1953125, "kl": 0.017033788317348808, "learning_rate": 3.110119156289841e-06, "loss": 0.0007, "reward": 3.352183550596237, "reward_std": 0.6941560469567776, "rewards/accuracy_reward": 0.083984375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8489483365168174, "rewards/reasoning_steps_reward": 0.7213541753590107, "step": 119 }, { "completion_length": 283.248046875, "epoch": 1.912, "grad_norm": 2.015625, "kl": 0.024006142339203507, "learning_rate": 3.075905022087675e-06, "loss": 0.001, "reward": 2.9336234778165817, "reward_std": 0.649795226752758, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6996626891195774, "rewards/reasoning_steps_reward": 0.6783854253590107, "step": 120 }, { "completion_length": 282.849609375, "epoch": 1.928, "grad_norm": 1.109375, "kl": 0.02002483472460881, "learning_rate": 3.0415765911826916e-06, "loss": 0.0008, "reward": 2.675464451313019, "reward_std": 0.6967358216643333, "rewards/accuracy_reward": 0.119140625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6331409340103468, "rewards/reasoning_steps_reward": 0.6569010391831398, "step": 121 }, { "completion_length": 290.783203125, "epoch": 1.944, "grad_norm": 1.1171875, "kl": 0.019037541293073446, "learning_rate": 3.0071406765498003e-06, "loss": 0.0008, "reward": 3.0036216378211975, "reward_std": 0.6973935160785913, "rewards/accuracy_reward": 0.080078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7529433167849978, "rewards/reasoning_steps_reward": 0.6647135354578495, "step": 122 }, { "completion_length": 283.216796875, "epoch": 1.96, "grad_norm": 0.859375, "kl": 0.017990577791351825, "learning_rate": 2.9726041124956128e-06, "loss": 0.0007, "reward": 2.773143321275711, "reward_std": 0.714199235662818, "rewards/accuracy_reward": 0.111328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6735130585730076, "rewards/reasoning_steps_reward": 0.6412760522216558, "step": 123 }, { "completion_length": 287.9765625, "epoch": 1.976, "grad_norm": 0.859375, "kl": 0.017044205858837813, "learning_rate": 2.9379737533020812e-06, "loss": 0.0007, "reward": 3.244216948747635, "reward_std": 0.6938743200153112, "rewards/accuracy_reward": 0.08984375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8157806489616632, "rewards/reasoning_steps_reward": 0.7070312425494194, "step": 124 }, { "completion_length": 290.36328125, "epoch": 1.992, "grad_norm": 0.9765625, "kl": 0.017912040289957076, "learning_rate": 2.9032564718661606e-06, "loss": 0.0007, "reward": 2.990898907184601, "reward_std": 0.6811724901199341, "rewards/accuracy_reward": 0.05859375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7619402616595229, "rewards/reasoning_steps_reward": 0.6464843824505806, "step": 125 }, { "completion_length": 292.08203125, "epoch": 2.0, "grad_norm": 0.609375, "kl": 0.017485147807747126, "learning_rate": 2.8684591583357863e-06, "loss": 0.0003, "reward": 3.365106463432312, "reward_std": 0.690997276455164, "rewards/accuracy_reward": 0.08203125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8508687826494375, "rewards/reasoning_steps_reward": 0.7304687425494194, "step": 126 }, { "completion_length": 274.076171875, "epoch": 2.016, "grad_norm": 0.9453125, "kl": 0.020696480583865196, "learning_rate": 2.8335887187424225e-06, "loss": 0.0008, "reward": 3.0040955543518066, "reward_std": 0.6572606600821018, "rewards/accuracy_reward": 0.248046875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7068773318702976, "rewards/reasoning_steps_reward": 0.6354166716337204, "step": 127 }, { "completion_length": 291.470703125, "epoch": 2.032, "grad_norm": 0.89453125, "kl": 0.018271160661242902, "learning_rate": 2.7986520736304632e-06, "loss": 0.0007, "reward": 2.8309315219521523, "reward_std": 0.6785434670746326, "rewards/accuracy_reward": 0.072265625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.688218497360746, "rewards/reasoning_steps_reward": 0.694010429084301, "step": 128 }, { "completion_length": 295.984375, "epoch": 2.048, "grad_norm": 1.03125, "kl": 0.018810921494150534, "learning_rate": 2.7636561566837463e-06, "loss": 0.0008, "reward": 3.07464836537838, "reward_std": 0.7456005457788706, "rewards/accuracy_reward": 0.037109375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7963671404868364, "rewards/reasoning_steps_reward": 0.6484375149011612, "step": 129 }, { "completion_length": 281.150390625, "epoch": 2.064, "grad_norm": 0.98046875, "kl": 0.020406899857334793, "learning_rate": 2.728607913349464e-06, "loss": 0.0008, "reward": 2.931031860411167, "reward_std": 0.6928635407239199, "rewards/accuracy_reward": 0.15234375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6862120016788443, "rewards/reasoning_steps_reward": 0.7200520895421505, "step": 130 }, { "completion_length": 290.275390625, "epoch": 2.08, "grad_norm": 0.8203125, "kl": 0.018310176266822964, "learning_rate": 2.6935142994597407e-06, "loss": 0.0007, "reward": 3.099424757063389, "reward_std": 0.6812999919056892, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7933412299801906, "rewards/reasoning_steps_reward": 0.6412760503590107, "step": 131 }, { "completion_length": 291.0234375, "epoch": 2.096, "grad_norm": 0.96484375, "kl": 0.01775828906102106, "learning_rate": 2.6583822798511428e-06, "loss": 0.0007, "reward": 3.313634306192398, "reward_std": 0.6808726880699396, "rewards/accuracy_reward": 0.083984375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8779822603488961, "rewards/reasoning_steps_reward": 0.5957031287252903, "step": 132 }, { "completion_length": 285.263671875, "epoch": 2.112, "grad_norm": 0.84375, "kl": 0.018688918324187398, "learning_rate": 2.623218826982411e-06, "loss": 0.0007, "reward": 2.7654543220996857, "reward_std": 0.6947140172123909, "rewards/accuracy_reward": 0.15234375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.640568091844519, "rewards/reasoning_steps_reward": 0.69140625, "step": 133 }, { "completion_length": 282.830078125, "epoch": 2.128, "grad_norm": 0.94140625, "kl": 0.021032241464126855, "learning_rate": 2.5880309195506714e-06, "loss": 0.0008, "reward": 2.8315402641892433, "reward_std": 0.6945422478020191, "rewards/accuracy_reward": 0.158203125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.641546401505669, "rewards/reasoning_steps_reward": 0.748697929084301, "step": 134 }, { "completion_length": 292.361328125, "epoch": 2.144, "grad_norm": 0.890625, "kl": 0.018107893760316074, "learning_rate": 2.552825541106414e-06, "loss": 0.0007, "reward": 3.0376425981521606, "reward_std": 0.7193902563303709, "rewards/accuracy_reward": 0.029296875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7757853666941324, "rewards/reasoning_steps_reward": 0.6809895876795053, "step": 135 }, { "completion_length": 287.232421875, "epoch": 2.16, "grad_norm": 0.85546875, "kl": 0.018850211054086685, "learning_rate": 2.517609678667501e-06, "loss": 0.0008, "reward": 2.687412917613983, "reward_std": 0.6682394985109568, "rewards/accuracy_reward": 0.08984375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6455872931207219, "rewards/reasoning_steps_reward": 0.6608072966337204, "step": 136 }, { "completion_length": 290.71484375, "epoch": 2.176, "grad_norm": 0.859375, "kl": 0.017231477366294712, "learning_rate": 2.4823903213324995e-06, "loss": 0.0007, "reward": 3.0338680148124695, "reward_std": 0.6302597746253014, "rewards/accuracy_reward": 0.09765625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7382858718434969, "rewards/reasoning_steps_reward": 0.7213541753590107, "step": 137 }, { "completion_length": 289.5703125, "epoch": 2.192, "grad_norm": 0.796875, "kl": 0.01626200118334964, "learning_rate": 2.447174458893587e-06, "loss": 0.0007, "reward": 2.984310381114483, "reward_std": 0.6622797809541225, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7369576270381609, "rewards/reasoning_steps_reward": 0.6718750167638063, "step": 138 }, { "completion_length": 287.76953125, "epoch": 2.208, "grad_norm": 0.75390625, "kl": 0.01649257366079837, "learning_rate": 2.4119690804493285e-06, "loss": 0.0007, "reward": 3.0554041862487793, "reward_std": 0.7084929272532463, "rewards/accuracy_reward": 0.107421875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7320097194363674, "rewards/reasoning_steps_reward": 0.7519531361758709, "step": 139 }, { "completion_length": 294.828125, "epoch": 2.224, "grad_norm": 0.984375, "kl": 0.018743149645160884, "learning_rate": 2.376781173017589e-06, "loss": 0.0007, "reward": 2.9738914221525192, "reward_std": 0.6525749433785677, "rewards/accuracy_reward": 0.041015625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7449863621344169, "rewards/reasoning_steps_reward": 0.6979166753590107, "step": 140 }, { "completion_length": 289.109375, "epoch": 2.24, "grad_norm": 0.98046875, "kl": 0.022565504419617355, "learning_rate": 2.3416177201488585e-06, "loss": 0.0009, "reward": 3.2985419929027557, "reward_std": 0.6833065822720528, "rewards/accuracy_reward": 0.099609375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8430035648246607, "rewards/reasoning_steps_reward": 0.6699218768626451, "step": 141 }, { "completion_length": 288.521484375, "epoch": 2.2560000000000002, "grad_norm": 1.015625, "kl": 0.020633232838008553, "learning_rate": 2.3064857005402606e-06, "loss": 0.0008, "reward": 3.1613398045301437, "reward_std": 0.7222296446561813, "rewards/accuracy_reward": 0.095703125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7796913882096609, "rewards/reasoning_steps_reward": 0.7265624962747097, "step": 142 }, { "completion_length": 279.908203125, "epoch": 2.2720000000000002, "grad_norm": 1.078125, "kl": 0.02265268244082108, "learning_rate": 2.2713920866505364e-06, "loss": 0.0009, "reward": 2.9546066522598267, "reward_std": 0.681933119893074, "rewards/accuracy_reward": 0.193359375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7051379711677631, "rewards/reasoning_steps_reward": 0.6458333414047956, "step": 143 }, { "completion_length": 288.0, "epoch": 2.288, "grad_norm": 0.875, "kl": 0.01793542131781578, "learning_rate": 2.236343843316254e-06, "loss": 0.0007, "reward": 2.790590211749077, "reward_std": 0.651448430493474, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6706481222063303, "rewards/reasoning_steps_reward": 0.7278645765036345, "step": 144 }, { "completion_length": 285.646484375, "epoch": 2.304, "grad_norm": 0.9609375, "kl": 0.018404830596409738, "learning_rate": 2.201347926369537e-06, "loss": 0.0007, "reward": 2.710278756916523, "reward_std": 0.6365776527673006, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6241293720280131, "rewards/reasoning_steps_reward": 0.7128906175494194, "step": 145 }, { "completion_length": 295.73046875, "epoch": 2.32, "grad_norm": 0.9921875, "kl": 0.021149699110537767, "learning_rate": 2.166411281257578e-06, "loss": 0.0008, "reward": 3.2047041803598404, "reward_std": 0.7344950754195452, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8334256897990903, "rewards/reasoning_steps_reward": 0.6731770820915699, "step": 146 }, { "completion_length": 288.49609375, "epoch": 2.336, "grad_norm": 1.921875, "kl": 0.019101842306554317, "learning_rate": 2.1315408416642145e-06, "loss": 0.0008, "reward": 2.9557630866765976, "reward_std": 0.6881984118372202, "rewards/accuracy_reward": 0.111328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7087786557773749, "rewards/reasoning_steps_reward": 0.7180989496409893, "step": 147 }, { "completion_length": 283.1796875, "epoch": 2.352, "grad_norm": 0.82421875, "kl": 0.01961760746780783, "learning_rate": 2.09674352813384e-06, "loss": 0.0008, "reward": 3.1119301542639732, "reward_std": 0.5922442562878132, "rewards/accuracy_reward": 0.150390625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7777614261334141, "rewards/reasoning_steps_reward": 0.6282552145421505, "step": 148 }, { "completion_length": 284.3828125, "epoch": 2.368, "grad_norm": 0.8671875, "kl": 0.022024919569958, "learning_rate": 2.062026246697919e-06, "loss": 0.0009, "reward": 3.0898532271385193, "reward_std": 0.6860612127929926, "rewards/accuracy_reward": 0.146484375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7363313144693772, "rewards/reasoning_steps_reward": 0.7343750149011612, "step": 149 }, { "completion_length": 286.181640625, "epoch": 2.384, "grad_norm": 1.1484375, "kl": 0.01775654760422185, "learning_rate": 2.0273958875043877e-06, "loss": 0.0007, "reward": 2.974420055747032, "reward_std": 0.6679348535835743, "rewards/accuracy_reward": 0.123046875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7212910537297527, "rewards/reasoning_steps_reward": 0.6875000037252903, "step": 150 }, { "completion_length": 276.9296875, "epoch": 2.4, "grad_norm": 1.03125, "kl": 0.02118692739168182, "learning_rate": 1.992859323450201e-06, "loss": 0.0008, "reward": 2.724317155778408, "reward_std": 0.6507551912218332, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6227324477707347, "rewards/reasoning_steps_reward": 0.6686198022216558, "step": 151 }, { "completion_length": 285.744140625, "epoch": 2.416, "grad_norm": 4.90625, "kl": 0.02042768005048856, "learning_rate": 1.958423408817309e-06, "loss": 0.0008, "reward": 3.1025044322013855, "reward_std": 0.6402757167816162, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7509650103747845, "rewards/reasoning_steps_reward": 0.7011718768626451, "step": 152 }, { "completion_length": 286.40234375, "epoch": 2.432, "grad_norm": 1.046875, "kl": 0.022553854738362134, "learning_rate": 1.924094977912326e-06, "loss": 0.0009, "reward": 2.981735587120056, "reward_std": 0.7370939962565899, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7332781835769614, "rewards/reasoning_steps_reward": 0.6686198078095913, "step": 153 }, { "completion_length": 288.896484375, "epoch": 2.448, "grad_norm": 0.83203125, "kl": 0.019592860713601112, "learning_rate": 1.8898808437101598e-06, "loss": 0.0008, "reward": 2.95571531355381, "reward_std": 0.7355391271412373, "rewards/accuracy_reward": 0.068359375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7332853010545174, "rewards/reasoning_steps_reward": 0.6875000037252903, "step": 154 }, { "completion_length": 294.271484375, "epoch": 2.464, "grad_norm": 0.94140625, "kl": 0.019800655485596508, "learning_rate": 1.8557877965018817e-06, "loss": 0.0008, "reward": 3.0556194335222244, "reward_std": 0.7033564373850822, "rewards/accuracy_reward": 0.044921875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7715779819215337, "rewards/reasoning_steps_reward": 0.6959635429084301, "step": 155 }, { "completion_length": 294.31640625, "epoch": 2.48, "grad_norm": 0.859375, "kl": 0.018254225375130773, "learning_rate": 1.8218226025470934e-06, "loss": 0.0007, "reward": 3.604881629347801, "reward_std": 0.715133111923933, "rewards/accuracy_reward": 0.052734375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.9479379473874966, "rewards/reasoning_steps_reward": 0.7083333265036345, "step": 156 }, { "completion_length": 289.25390625, "epoch": 2.496, "grad_norm": 0.83984375, "kl": 0.017004019115120173, "learning_rate": 1.7879920027310621e-06, "loss": 0.0007, "reward": 3.051852695643902, "reward_std": 0.7096979664638638, "rewards/accuracy_reward": 0.07421875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7540463662395874, "rewards/reasoning_steps_reward": 0.7154948078095913, "step": 157 }, { "completion_length": 290.005859375, "epoch": 2.512, "grad_norm": 0.9765625, "kl": 0.019147633225657046, "learning_rate": 1.7543027112268994e-06, "loss": 0.0008, "reward": 2.991758108139038, "reward_std": 0.684099368751049, "rewards/accuracy_reward": 0.103515625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7318446912492315, "rewards/reasoning_steps_reward": 0.6927083320915699, "step": 158 }, { "completion_length": 280.697265625, "epoch": 2.528, "grad_norm": 1.1171875, "kl": 0.020928668964188546, "learning_rate": 1.7207614141630304e-06, "loss": 0.0008, "reward": 2.596983939409256, "reward_std": 0.6806027349084616, "rewards/accuracy_reward": 0.12890625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6024234692255656, "rewards/reasoning_steps_reward": 0.6608072984963655, "step": 159 }, { "completion_length": 285.546875, "epoch": 2.544, "grad_norm": 1.09375, "kl": 0.022152581717818975, "learning_rate": 1.6873747682962393e-06, "loss": 0.0009, "reward": 2.8694690242409706, "reward_std": 0.6588537991046906, "rewards/accuracy_reward": 0.126953125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6882605010954043, "rewards/reasoning_steps_reward": 0.6777343805879354, "step": 160 }, { "completion_length": 283.427734375, "epoch": 2.56, "grad_norm": 0.94140625, "kl": 0.020902132673654705, "learning_rate": 1.6541493996905378e-06, "loss": 0.0008, "reward": 3.1272382587194443, "reward_std": 0.6674788426607847, "rewards/accuracy_reward": 0.12890625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7581245402495066, "rewards/reasoning_steps_reward": 0.7239583395421505, "step": 161 }, { "completion_length": 286.408203125, "epoch": 2.576, "grad_norm": 0.953125, "kl": 0.022079574409872293, "learning_rate": 1.6210919024021258e-06, "loss": 0.0009, "reward": 2.9151505902409554, "reward_std": 0.7153513710945845, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7115172014261285, "rewards/reasoning_steps_reward": 0.6673177219927311, "step": 162 }, { "completion_length": 291.72265625, "epoch": 2.592, "grad_norm": 0.875, "kl": 0.017877445730846375, "learning_rate": 1.588208837170706e-06, "loss": 0.0007, "reward": 2.937485493719578, "reward_std": 0.7016174159944057, "rewards/accuracy_reward": 0.056640625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.734153147165974, "rewards/reasoning_steps_reward": 0.6783854216337204, "step": 163 }, { "completion_length": 289.837890625, "epoch": 2.608, "grad_norm": 1.015625, "kl": 0.023135888564866036, "learning_rate": 1.55550673011741e-06, "loss": 0.0009, "reward": 3.3134661614894867, "reward_std": 0.6674238592386246, "rewards/accuracy_reward": 0.09765625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.824106777086854, "rewards/reasoning_steps_reward": 0.7434895969927311, "step": 164 }, { "completion_length": 285.73828125, "epoch": 2.624, "grad_norm": 0.8828125, "kl": 0.017427237355150282, "learning_rate": 1.522992071449595e-06, "loss": 0.0007, "reward": 2.761025607585907, "reward_std": 0.5951798930764198, "rewards/accuracy_reward": 0.115234375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6484234320620695, "rewards/reasoning_steps_reward": 0.7005208227783442, "step": 165 }, { "completion_length": 292.478515625, "epoch": 2.64, "grad_norm": 0.90625, "kl": 0.02133324311580509, "learning_rate": 1.4906713141727677e-06, "loss": 0.0009, "reward": 2.930042363703251, "reward_std": 0.6626697592437267, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7227745447307825, "rewards/reasoning_steps_reward": 0.6992187462747097, "step": 166 }, { "completion_length": 279.865234375, "epoch": 2.656, "grad_norm": 1.09375, "kl": 0.024935539229772985, "learning_rate": 1.4585508728098935e-06, "loss": 0.001, "reward": 2.825145460665226, "reward_std": 0.7417711336165667, "rewards/accuracy_reward": 0.181640625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6630693133920431, "rewards/reasoning_steps_reward": 0.6542968759313226, "step": 167 }, { "completion_length": 293.625, "epoch": 2.672, "grad_norm": 1.0234375, "kl": 0.018147769616916776, "learning_rate": 1.4266371221283367e-06, "loss": 0.0007, "reward": 2.7056074738502502, "reward_std": 0.6061353217810392, "rewards/accuracy_reward": 0.02734375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6455757624159256, "rewards/reasoning_steps_reward": 0.7415364719927311, "step": 168 }, { "completion_length": 293.119140625, "epoch": 2.6879999999999997, "grad_norm": 1.046875, "kl": 0.019442370510660112, "learning_rate": 1.3949363958747004e-06, "loss": 0.0008, "reward": 3.226225107908249, "reward_std": 0.7255453541874886, "rewards/accuracy_reward": 0.01953125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8371271109208465, "rewards/reasoning_steps_reward": 0.6953125037252903, "step": 169 }, { "completion_length": 291.013671875, "epoch": 2.7039999999999997, "grad_norm": 0.8359375, "kl": 0.01949766167672351, "learning_rate": 1.363454985517803e-06, "loss": 0.0008, "reward": 2.700456887483597, "reward_std": 0.7572273463010788, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6375654794586202, "rewards/reasoning_steps_reward": 0.6744791809469461, "step": 170 }, { "completion_length": 292.544921875, "epoch": 2.7199999999999998, "grad_norm": 0.9453125, "kl": 0.020666938507929444, "learning_rate": 1.3321991390000382e-06, "loss": 0.0008, "reward": 2.9996937662363052, "reward_std": 0.653770299628377, "rewards/accuracy_reward": 0.044921875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7529360945336521, "rewards/reasoning_steps_reward": 0.6959635615348816, "step": 171 }, { "completion_length": 289.091796875, "epoch": 2.7359999999999998, "grad_norm": 0.98828125, "kl": 0.021086076740175486, "learning_rate": 1.301175059497391e-06, "loss": 0.0008, "reward": 2.95357333868742, "reward_std": 0.6372328288853168, "rewards/accuracy_reward": 0.13671875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.701104310962061, "rewards/reasoning_steps_reward": 0.7135416828095913, "step": 172 }, { "completion_length": 281.466796875, "epoch": 2.752, "grad_norm": 0.94921875, "kl": 0.02050035016145557, "learning_rate": 1.270388904188316e-06, "loss": 0.0008, "reward": 2.7741658687591553, "reward_std": 0.7323318216949701, "rewards/accuracy_reward": 0.15625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6441229923317829, "rewards/reasoning_steps_reward": 0.6855468563735485, "step": 173 }, { "completion_length": 288.654296875, "epoch": 2.768, "grad_norm": 0.953125, "kl": 0.018003857927396894, "learning_rate": 1.2398467830317635e-06, "loss": 0.0007, "reward": 2.823809191584587, "reward_std": 0.6888855472207069, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6687002858767906, "rewards/reasoning_steps_reward": 0.7083333376795053, "step": 174 }, { "completion_length": 295.314453125, "epoch": 2.784, "grad_norm": 0.86328125, "kl": 0.018295871559530497, "learning_rate": 1.2095547575545685e-06, "loss": 0.0007, "reward": 3.150137387216091, "reward_std": 0.6382329538464546, "rewards/accuracy_reward": 0.041015625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7963565507282814, "rewards/reasoning_steps_reward": 0.7200520895421505, "step": 175 }, { "completion_length": 288.701171875, "epoch": 2.8, "grad_norm": 1.0234375, "kl": 0.020645066746510565, "learning_rate": 1.1795188396484505e-06, "loss": 0.0008, "reward": 2.7497966438531876, "reward_std": 0.696668054908514, "rewards/accuracy_reward": 0.103515625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6516249105334282, "rewards/reasoning_steps_reward": 0.6914062425494194, "step": 176 }, { "completion_length": 285.642578125, "epoch": 2.816, "grad_norm": 1.1484375, "kl": 0.01807958845165558, "learning_rate": 1.149744990376868e-06, "loss": 0.0007, "reward": 2.925790064036846, "reward_std": 0.6442860681563616, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6968345294396082, "rewards/reasoning_steps_reward": 0.7102864719927311, "step": 177 }, { "completion_length": 294.732421875, "epoch": 2.832, "grad_norm": 2.765625, "kl": 0.020244878192897886, "learning_rate": 1.1202391187919575e-06, "loss": 0.0008, "reward": 3.2739059031009674, "reward_std": 0.6519978456199169, "rewards/accuracy_reward": 0.0546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8315363439420859, "rewards/reasoning_steps_reward": 0.7246093861758709, "step": 178 }, { "completion_length": 287.73828125, "epoch": 2.848, "grad_norm": 1.09375, "kl": 0.021688284177798778, "learning_rate": 1.0910070807618012e-06, "loss": 0.0009, "reward": 2.786106266081333, "reward_std": 0.676231924444437, "rewards/accuracy_reward": 0.103515625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6424607516576847, "rewards/reasoning_steps_reward": 0.7552083432674408, "step": 179 }, { "completion_length": 286.52734375, "epoch": 2.864, "grad_norm": 1.0546875, "kl": 0.02280406339559704, "learning_rate": 1.062054677808238e-06, "loss": 0.0009, "reward": 3.1157227605581284, "reward_std": 0.6361609604209661, "rewards/accuracy_reward": 0.099609375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.764702707529068, "rewards/reasoning_steps_reward": 0.7220052257180214, "step": 180 }, { "completion_length": 290.2109375, "epoch": 2.88, "grad_norm": 1.1328125, "kl": 0.028993367042858154, "learning_rate": 1.033387655955471e-06, "loss": 0.0012, "reward": 3.221103757619858, "reward_std": 0.5982426293194294, "rewards/accuracy_reward": 0.07421875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8156717581053575, "rewards/reasoning_steps_reward": 0.6998698078095913, "step": 181 }, { "completion_length": 289.28125, "epoch": 2.896, "grad_norm": 1.0390625, "kl": 0.02006814256310463, "learning_rate": 1.0050117045896889e-06, "loss": 0.0008, "reward": 2.751469612121582, "reward_std": 0.7198650874197483, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6480593029409647, "rewards/reasoning_steps_reward": 0.6940104179084301, "step": 182 }, { "completion_length": 292.52734375, "epoch": 2.912, "grad_norm": 1.4140625, "kl": 0.022621202806476504, "learning_rate": 9.769324553299174e-07, "loss": 0.0009, "reward": 3.1886699497699738, "reward_std": 0.7633016854524612, "rewards/accuracy_reward": 0.12890625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7927076746709645, "rewards/reasoning_steps_reward": 0.6816406436264515, "step": 183 }, { "completion_length": 288.79296875, "epoch": 2.928, "grad_norm": 0.98046875, "kl": 0.021405818057246506, "learning_rate": 9.491554809103509e-07, "loss": 0.0009, "reward": 2.6857599690556526, "reward_std": 0.6840799152851105, "rewards/accuracy_reward": 0.083984375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6376578211784363, "rewards/reasoning_steps_reward": 0.6888020988553762, "step": 184 }, { "completion_length": 290.58203125, "epoch": 2.944, "grad_norm": 0.99609375, "kl": 0.020275956427212805, "learning_rate": 9.216862940743529e-07, "loss": 0.0008, "reward": 2.757513716816902, "reward_std": 0.602615574374795, "rewards/accuracy_reward": 0.115234375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.636836152523756, "rewards/reasoning_steps_reward": 0.7317708358168602, "step": 185 }, { "completion_length": 286.619140625, "epoch": 2.96, "grad_norm": 0.984375, "kl": 0.019758898008149117, "learning_rate": 8.945303464803833e-07, "loss": 0.0008, "reward": 3.0790238082408905, "reward_std": 0.5770421475172043, "rewards/accuracy_reward": 0.12109375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7574610461791357, "rewards/reasoning_steps_reward": 0.685546888038516, "step": 186 }, { "completion_length": 286.671875, "epoch": 2.976, "grad_norm": 0.9765625, "kl": 0.02084403787739575, "learning_rate": 8.676930276200294e-07, "loss": 0.0008, "reward": 3.0736390501260757, "reward_std": 0.6433412320911884, "rewards/accuracy_reward": 0.072265625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7658657878637314, "rewards/reasoning_steps_reward": 0.7037760429084301, "step": 187 }, { "completion_length": 284.01171875, "epoch": 2.992, "grad_norm": 1.0, "kl": 0.019843781657982618, "learning_rate": 8.411796637483852e-07, "loss": 0.0008, "reward": 2.9655564725399017, "reward_std": 0.6882054135203362, "rewards/accuracy_reward": 0.107421875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7163833907494942, "rewards/reasoning_steps_reward": 0.7089843675494194, "step": 188 }, { "completion_length": 290.765625, "epoch": 3.0, "grad_norm": 0.69921875, "kl": 0.017977926647290587, "learning_rate": 8.149955168269822e-07, "loss": 0.0004, "reward": 2.5494449138641357, "reward_std": 0.6191319935023785, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.5538079980760813, "rewards/reasoning_steps_reward": 0.7864583507180214, "step": 189 }, { "completion_length": 289.603515625, "epoch": 3.016, "grad_norm": 0.87890625, "kl": 0.019213943742215633, "learning_rate": 7.891457834794711e-07, "loss": 0.0008, "reward": 3.084651954472065, "reward_std": 0.6362812034785748, "rewards/accuracy_reward": 0.09375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7643284083654484, "rewards/reasoning_steps_reward": 0.6979166697710752, "step": 190 }, { "completion_length": 285.248046875, "epoch": 3.032, "grad_norm": 1.03125, "kl": 0.020919292815960944, "learning_rate": 7.636355939602824e-07, "loss": 0.0008, "reward": 2.85429210960865, "reward_std": 0.6567655950784683, "rewards/accuracy_reward": 0.078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6931841528664032, "rewards/reasoning_steps_reward": 0.6966145969927311, "step": 191 }, { "completion_length": 290.787109375, "epoch": 3.048, "grad_norm": 0.87890625, "kl": 0.016602561168838292, "learning_rate": 7.384700111364487e-07, "loss": 0.0007, "reward": 2.8143509328365326, "reward_std": 0.6266643963754177, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6690197528029481, "rewards/reasoning_steps_reward": 0.6940104104578495, "step": 192 }, { "completion_length": 282.99609375, "epoch": 3.064, "grad_norm": 0.96875, "kl": 0.02081725694006309, "learning_rate": 7.136540294828062e-07, "loss": 0.0008, "reward": 2.8774597868323326, "reward_std": 0.7187161836773157, "rewards/accuracy_reward": 0.083984375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6952643600913385, "rewards/reasoning_steps_reward": 0.7076822966337204, "step": 193 }, { "completion_length": 294.306640625, "epoch": 3.08, "grad_norm": 0.9296875, "kl": 0.02071163459913805, "learning_rate": 6.891925740907701e-07, "loss": 0.0008, "reward": 2.8051391541957855, "reward_std": 0.6224446576088667, "rewards/accuracy_reward": 0.021484375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.695680051886787, "rewards/reasoning_steps_reward": 0.6966145820915699, "step": 194 }, { "completion_length": 286.19140625, "epoch": 3.096, "grad_norm": 0.84765625, "kl": 0.018767547328025103, "learning_rate": 6.650904996908772e-07, "loss": 0.0008, "reward": 3.3200203105807304, "reward_std": 0.7382683884352446, "rewards/accuracy_reward": 0.1484375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8095814054831862, "rewards/reasoning_steps_reward": 0.7428385503590107, "step": 195 }, { "completion_length": 285.822265625, "epoch": 3.112, "grad_norm": 1.078125, "kl": 0.02104048355249688, "learning_rate": 6.413525896892972e-07, "loss": 0.0008, "reward": 2.955541580915451, "reward_std": 0.6638543289154768, "rewards/accuracy_reward": 0.103515625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7093558870255947, "rewards/reasoning_steps_reward": 0.7239583395421505, "step": 196 }, { "completion_length": 288.87890625, "epoch": 3.128, "grad_norm": 0.92578125, "kl": 0.02037365094292909, "learning_rate": 6.179835552184924e-07, "loss": 0.0008, "reward": 2.7349835634231567, "reward_std": 0.6583398748189211, "rewards/accuracy_reward": 0.07421875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.644300079283615, "rewards/reasoning_steps_reward": 0.7278645858168602, "step": 197 }, { "completion_length": 288.52734375, "epoch": 3.144, "grad_norm": 0.9453125, "kl": 0.02122843312099576, "learning_rate": 5.949880342022258e-07, "loss": 0.0008, "reward": 3.1269255951046944, "reward_std": 0.7235856931656599, "rewards/accuracy_reward": 0.068359375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7810237923016151, "rewards/reasoning_steps_reward": 0.7154947929084301, "step": 198 }, { "completion_length": 285.734375, "epoch": 3.16, "grad_norm": 0.9765625, "kl": 0.02136942616198212, "learning_rate": 5.723705904351027e-07, "loss": 0.0009, "reward": 2.681896522641182, "reward_std": 0.6634827610105276, "rewards/accuracy_reward": 0.109375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6170557765290141, "rewards/reasoning_steps_reward": 0.7213541716337204, "step": 199 }, { "completion_length": 286.953125, "epoch": 3.176, "grad_norm": 0.91796875, "kl": 0.019215874548535794, "learning_rate": 5.501357126768117e-07, "loss": 0.0008, "reward": 2.6373501121997833, "reward_std": 0.7020009346306324, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.5913562929878632, "rewards/reasoning_steps_reward": 0.7617187462747097, "step": 200 }, { "completion_length": 286.212890625, "epoch": 3.192, "grad_norm": 0.875, "kl": 0.020688754506409168, "learning_rate": 5.282878137612738e-07, "loss": 0.0008, "reward": 3.007347419857979, "reward_std": 0.6104327123612165, "rewards/accuracy_reward": 0.1328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7257564406221112, "rewards/reasoning_steps_reward": 0.6972656305879354, "step": 201 }, { "completion_length": 286.62109375, "epoch": 3.208, "grad_norm": 0.8828125, "kl": 0.02169125445652753, "learning_rate": 5.068312297208414e-07, "loss": 0.0009, "reward": 3.0148477032780647, "reward_std": 0.679189708083868, "rewards/accuracy_reward": 0.08203125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7471367201457421, "rewards/reasoning_steps_reward": 0.69140625, "step": 202 }, { "completion_length": 293.78125, "epoch": 3.224, "grad_norm": 0.85546875, "kl": 0.02024375193286687, "learning_rate": 4.857702189257613e-07, "loss": 0.0008, "reward": 3.007346175611019, "reward_std": 0.6605745330452919, "rewards/accuracy_reward": 0.06640625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7448532494405905, "rewards/reasoning_steps_reward": 0.7063802145421505, "step": 203 }, { "completion_length": 296.005859375, "epoch": 3.24, "grad_norm": 0.875, "kl": 0.020721249806229025, "learning_rate": 4.6510896123903027e-07, "loss": 0.0008, "reward": 3.162186399102211, "reward_std": 0.668110404163599, "rewards/accuracy_reward": 0.03125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8146958220750093, "rewards/reasoning_steps_reward": 0.6868489757180214, "step": 204 }, { "completion_length": 284.65234375, "epoch": 3.2560000000000002, "grad_norm": 1.109375, "kl": 0.023086362169124186, "learning_rate": 4.4485155718684334e-07, "loss": 0.0009, "reward": 2.8323604688048363, "reward_std": 0.7187584564089775, "rewards/accuracy_reward": 0.1171875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6745888954028487, "rewards/reasoning_steps_reward": 0.6914062649011612, "step": 205 }, { "completion_length": 295.3203125, "epoch": 3.2720000000000002, "grad_norm": 0.890625, "kl": 0.019205813470762223, "learning_rate": 4.2500202714478853e-07, "loss": 0.0008, "reward": 3.3045015186071396, "reward_std": 0.7383539900183678, "rewards/accuracy_reward": 0.048828125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8480282643189033, "rewards/reasoning_steps_reward": 0.7115885354578495, "step": 206 }, { "completion_length": 276.3671875, "epoch": 3.288, "grad_norm": 0.9296875, "kl": 0.019852709374390543, "learning_rate": 4.05564310539939e-07, "loss": 0.0008, "reward": 3.327822983264923, "reward_std": 0.7267354801297188, "rewards/accuracy_reward": 0.21875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8076250155766805, "rewards/reasoning_steps_reward": 0.686197929084301, "step": 207 }, { "completion_length": 295.84375, "epoch": 3.304, "grad_norm": 0.79296875, "kl": 0.017830375931225717, "learning_rate": 3.8654226506902204e-07, "loss": 0.0007, "reward": 2.7935037687420845, "reward_std": 0.7168517392128706, "rewards/accuracy_reward": 0.05078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6909335435678562, "rewards/reasoning_steps_reward": 0.6699218731373549, "step": 208 }, { "completion_length": 289.603515625, "epoch": 3.32, "grad_norm": 1.1875, "kl": 0.019766899524256587, "learning_rate": 3.679396659327986e-07, "loss": 0.0008, "reward": 3.100301645696163, "reward_std": 0.7392721492797136, "rewards/accuracy_reward": 0.10546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.789510258163015, "rewards/reasoning_steps_reward": 0.6263020895421505, "step": 209 }, { "completion_length": 278.326171875, "epoch": 3.336, "grad_norm": 0.921875, "kl": 0.020691857673227787, "learning_rate": 3.4976020508682345e-07, "loss": 0.0008, "reward": 3.0393467769026756, "reward_std": 0.6087249293923378, "rewards/accuracy_reward": 0.16796875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7225339828679959, "rewards/reasoning_steps_reward": 0.703776054084301, "step": 210 }, { "completion_length": 292.322265625, "epoch": 3.352, "grad_norm": 1.15625, "kl": 0.020608096150681376, "learning_rate": 3.320074905087212e-07, "loss": 0.0008, "reward": 2.8478069826960564, "reward_std": 0.6319366451352835, "rewards/accuracy_reward": 0.087890625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6738783651962876, "rewards/reasoning_steps_reward": 0.7382812425494194, "step": 211 }, { "completion_length": 283.3984375, "epoch": 3.368, "grad_norm": 0.8671875, "kl": 0.023123053135350347, "learning_rate": 3.14685045482131e-07, "loss": 0.0009, "reward": 2.7474499940872192, "reward_std": 0.6886056587100029, "rewards/accuracy_reward": 0.095703125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6692888826752702, "rewards/reasoning_steps_reward": 0.6438802052289248, "step": 212 }, { "completion_length": 295.224609375, "epoch": 3.384, "grad_norm": 0.9140625, "kl": 0.023218440066557378, "learning_rate": 2.977963078974616e-07, "loss": 0.0009, "reward": 2.9267039820551872, "reward_std": 0.6514626033604145, "rewards/accuracy_reward": 0.060546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7105940394103527, "rewards/reasoning_steps_reward": 0.7343750055879354, "step": 213 }, { "completion_length": 287.263671875, "epoch": 3.4, "grad_norm": 0.98046875, "kl": 0.021008892101235688, "learning_rate": 2.813446295695893e-07, "loss": 0.0008, "reward": 3.1436211466789246, "reward_std": 0.6997925061732531, "rewards/accuracy_reward": 0.076171875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7876740458110968, "rewards/reasoning_steps_reward": 0.7044270932674408, "step": 214 }, { "completion_length": 285.443359375, "epoch": 3.416, "grad_norm": 1.0625, "kl": 0.02351184340659529, "learning_rate": 2.65333275572644e-07, "loss": 0.0009, "reward": 2.9087352752685547, "reward_std": 0.6324813142418861, "rewards/accuracy_reward": 0.11328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.707642661097149, "rewards/reasoning_steps_reward": 0.6725260391831398, "step": 215 }, { "completion_length": 292.380859375, "epoch": 3.432, "grad_norm": 0.88671875, "kl": 0.023277590342331678, "learning_rate": 2.4976542359200664e-07, "loss": 0.0009, "reward": 2.6246762797236443, "reward_std": 0.691521966829896, "rewards/accuracy_reward": 0.072265625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.610135139276584, "rewards/reasoning_steps_reward": 0.7220052182674408, "step": 216 }, { "completion_length": 280.158203125, "epoch": 3.448, "grad_norm": 1.1484375, "kl": 0.02378622384276241, "learning_rate": 2.3464416329365137e-07, "loss": 0.001, "reward": 2.8623234406113625, "reward_std": 0.6014144476503134, "rewards/accuracy_reward": 0.13671875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6630921829491854, "rewards/reasoning_steps_reward": 0.7363281361758709, "step": 217 }, { "completion_length": 295.375, "epoch": 3.464, "grad_norm": 0.8203125, "kl": 0.016009816259611398, "learning_rate": 2.1997249571095835e-07, "loss": 0.0006, "reward": 3.290237843990326, "reward_std": 0.6886514872312546, "rewards/accuracy_reward": 0.04296875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8556435058514277, "rewards/reasoning_steps_reward": 0.6803385578095913, "step": 218 }, { "completion_length": 289.02734375, "epoch": 3.48, "grad_norm": 0.859375, "kl": 0.02076311851851642, "learning_rate": 2.0575333264911125e-07, "loss": 0.0008, "reward": 2.800406724214554, "reward_std": 0.6951953694224358, "rewards/accuracy_reward": 0.10546875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6608994637305537, "rewards/reasoning_steps_reward": 0.7122395895421505, "step": 219 }, { "completion_length": 279.2421875, "epoch": 3.496, "grad_norm": 0.9921875, "kl": 0.01861161779379472, "learning_rate": 1.9198949610721273e-07, "loss": 0.0007, "reward": 2.7829076945781708, "reward_std": 0.5952301491051912, "rewards/accuracy_reward": 0.19140625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6188251003623009, "rewards/reasoning_steps_reward": 0.735026054084301, "step": 220 }, { "completion_length": 281.380859375, "epoch": 3.512, "grad_norm": 0.95703125, "kl": 0.018613723281305283, "learning_rate": 1.786837177182127e-07, "loss": 0.0007, "reward": 2.8807911574840546, "reward_std": 0.6898195426911116, "rewards/accuracy_reward": 0.15234375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6807498056441545, "rewards/reasoning_steps_reward": 0.686197929084301, "step": 221 }, { "completion_length": 287.46484375, "epoch": 3.528, "grad_norm": 0.859375, "kl": 0.01997726986883208, "learning_rate": 1.6583863820678032e-07, "loss": 0.0008, "reward": 2.8661443442106247, "reward_std": 0.6607285998761654, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6637147692963481, "rewards/reasoning_steps_reward": 0.7500000074505806, "step": 222 }, { "completion_length": 287.591796875, "epoch": 3.544, "grad_norm": 0.91015625, "kl": 0.020406617608387023, "learning_rate": 1.534568068652101e-07, "loss": 0.0008, "reward": 2.8175922632217407, "reward_std": 0.772568928077817, "rewards/accuracy_reward": 0.080078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6876783277839422, "rewards/reasoning_steps_reward": 0.6744791697710752, "step": 223 }, { "completion_length": 281.27734375, "epoch": 3.56, "grad_norm": 0.921875, "kl": 0.021257835964206606, "learning_rate": 1.4154068104747981e-07, "loss": 0.0009, "reward": 3.101296618580818, "reward_std": 0.7108908668160439, "rewards/accuracy_reward": 0.1640625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7581579011554519, "rewards/reasoning_steps_reward": 0.662760416045785, "step": 224 }, { "completion_length": 279.06640625, "epoch": 3.576, "grad_norm": 1.375, "kl": 0.022064094548113644, "learning_rate": 1.3009262568155462e-07, "loss": 0.0009, "reward": 2.9315654188394547, "reward_std": 0.705444760620594, "rewards/accuracy_reward": 0.130859375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6976745830227932, "rewards/reasoning_steps_reward": 0.707682304084301, "step": 225 }, { "completion_length": 291.966796875, "epoch": 3.592, "grad_norm": 2.09375, "kl": 0.0230710570467636, "learning_rate": 1.1911491280002907e-07, "loss": 0.0009, "reward": 3.4133089035749435, "reward_std": 0.7498599980026484, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8753998465836048, "rewards/reasoning_steps_reward": 0.7246093787252903, "step": 226 }, { "completion_length": 277.298828125, "epoch": 3.608, "grad_norm": 0.83984375, "kl": 0.019825019757263362, "learning_rate": 1.0860972108921258e-07, "loss": 0.0008, "reward": 2.766988158226013, "reward_std": 0.6906307358294725, "rewards/accuracy_reward": 0.138671875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6410793742785851, "rewards/reasoning_steps_reward": 0.7050781287252903, "step": 227 }, { "completion_length": 291.142578125, "epoch": 3.624, "grad_norm": 0.91015625, "kl": 0.019932835886720568, "learning_rate": 9.857913545673503e-08, "loss": 0.0008, "reward": 3.3143957555294037, "reward_std": 0.6608162298798561, "rewards/accuracy_reward": 0.080078125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.836786450818181, "rewards/reasoning_steps_reward": 0.7239583283662796, "step": 228 }, { "completion_length": 288.453125, "epoch": 3.64, "grad_norm": 0.8984375, "kl": 0.018874026485718787, "learning_rate": 8.902514661776885e-08, "loss": 0.0008, "reward": 3.2070699259638786, "reward_std": 0.7432738393545151, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8040493360410134, "rewards/reasoning_steps_reward": 0.6699218824505806, "step": 229 }, { "completion_length": 293.146484375, "epoch": 3.656, "grad_norm": 0.80078125, "kl": 0.016978327243123204, "learning_rate": 7.994965069994143e-08, "loss": 0.0007, "reward": 3.143362358212471, "reward_std": 0.6415095869451761, "rewards/accuracy_reward": 0.072265625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7838985491544008, "rewards/reasoning_steps_reward": 0.7194010280072689, "step": 230 }, { "completion_length": 288.62890625, "epoch": 3.672, "grad_norm": 1.1171875, "kl": 0.021244205767288804, "learning_rate": 7.135444886702064e-08, "loss": 0.0008, "reward": 2.9098562449216843, "reward_std": 0.7155030779540539, "rewards/accuracy_reward": 0.119140625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7021569274365902, "rewards/reasoning_steps_reward": 0.684244804084301, "step": 231 }, { "completion_length": 285.921875, "epoch": 3.6879999999999997, "grad_norm": 0.81640625, "kl": 0.01845627831062302, "learning_rate": 6.324124696144962e-08, "loss": 0.0007, "reward": 2.8958379551768303, "reward_std": 0.6293431017547846, "rewards/accuracy_reward": 0.111328125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7039945904786388, "rewards/reasoning_steps_reward": 0.6725260466337204, "step": 232 }, { "completion_length": 290.05859375, "epoch": 3.7039999999999997, "grad_norm": 0.91015625, "kl": 0.017910517868585885, "learning_rate": 5.5611655165795365e-08, "loss": 0.0007, "reward": 2.9697776436805725, "reward_std": 0.6638195030391216, "rewards/accuracy_reward": 0.08203125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7292921990156174, "rewards/reasoning_steps_reward": 0.699869804084301, "step": 233 }, { "completion_length": 287.302734375, "epoch": 3.7199999999999998, "grad_norm": 0.859375, "kl": 0.01873377658193931, "learning_rate": 4.846718768318659e-08, "loss": 0.0007, "reward": 3.1371295899152756, "reward_std": 0.6027075219899416, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.762940771256884, "rewards/reasoning_steps_reward": 0.7233072891831398, "step": 234 }, { "completion_length": 293.28125, "epoch": 3.7359999999999998, "grad_norm": 0.90625, "kl": 0.019052452000323683, "learning_rate": 4.1809262436796896e-08, "loss": 0.0008, "reward": 3.043783374130726, "reward_std": 0.6604214962571859, "rewards/accuracy_reward": 0.06640625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7457142351195216, "rewards/reasoning_steps_reward": 0.7402343787252903, "step": 235 }, { "completion_length": 292.73828125, "epoch": 3.752, "grad_norm": 0.828125, "kl": 0.019031181174796075, "learning_rate": 3.563920078843791e-08, "loss": 0.0008, "reward": 3.077702447772026, "reward_std": 0.6683868058025837, "rewards/accuracy_reward": 0.107421875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7663521977762381, "rewards/reasoning_steps_reward": 0.6712239496409893, "step": 236 }, { "completion_length": 281.478515625, "epoch": 3.768, "grad_norm": 1.21875, "kl": 0.022773202043026686, "learning_rate": 2.99582272763152e-08, "loss": 0.0009, "reward": 2.9361980706453323, "reward_std": 0.667768020182848, "rewards/accuracy_reward": 0.17578125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6935764315227667, "rewards/reasoning_steps_reward": 0.6796875037252903, "step": 237 }, { "completion_length": 286.15234375, "epoch": 3.784, "grad_norm": 0.953125, "kl": 0.020423304580617696, "learning_rate": 2.4767469372002362e-08, "loss": 0.0008, "reward": 2.6705066189169884, "reward_std": 0.6245338693261147, "rewards/accuracy_reward": 0.162109375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.606531698256731, "rewards/reasoning_steps_reward": 0.6888020746409893, "step": 238 }, { "completion_length": 291.84765625, "epoch": 3.8, "grad_norm": 0.84765625, "kl": 0.017052936542313546, "learning_rate": 2.0067957256676428e-08, "loss": 0.0007, "reward": 3.033589616417885, "reward_std": 0.6652188412845135, "rewards/accuracy_reward": 0.08203125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7442694374670585, "rewards/reasoning_steps_reward": 0.7187499944120646, "step": 239 }, { "completion_length": 290.75, "epoch": 3.816, "grad_norm": 0.796875, "kl": 0.021085154090542346, "learning_rate": 1.5860623616664183e-08, "loss": 0.0008, "reward": 2.713896244764328, "reward_std": 0.6678700372576714, "rewards/accuracy_reward": 0.083984375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6320626304174463, "rewards/reasoning_steps_reward": 0.733723958954215, "step": 240 }, { "completion_length": 286.052734375, "epoch": 3.832, "grad_norm": 0.93359375, "kl": 0.02104910637717694, "learning_rate": 1.2146303458337172e-08, "loss": 0.0008, "reward": 3.3307963609695435, "reward_std": 0.6925474908202887, "rewards/accuracy_reward": 0.123046875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8309685722924769, "rewards/reasoning_steps_reward": 0.7148437593132257, "step": 241 }, { "completion_length": 286.875, "epoch": 3.848, "grad_norm": 2.375, "kl": 0.023578285879921168, "learning_rate": 8.92573394239149e-09, "loss": 0.0009, "reward": 2.9508322179317474, "reward_std": 0.6057112123817205, "rewards/accuracy_reward": 0.107421875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7184197666744391, "rewards/reasoning_steps_reward": 0.6881510633975267, "step": 242 }, { "completion_length": 296.923828125, "epoch": 3.864, "grad_norm": 1.0390625, "kl": 0.019981018383987248, "learning_rate": 6.1995542375495325e-09, "loss": 0.0008, "reward": 3.1651005297899246, "reward_std": 0.6937647629529238, "rewards/accuracy_reward": 0.064453125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.8048164764574418, "rewards/reasoning_steps_reward": 0.6861979216337204, "step": 243 }, { "completion_length": 289.396484375, "epoch": 3.88, "grad_norm": 0.82421875, "kl": 0.01859537634300068, "learning_rate": 3.96830539370563e-09, "loss": 0.0007, "reward": 3.588533952832222, "reward_std": 0.7532828189432621, "rewards/accuracy_reward": 0.087890625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.9253446195895473, "rewards/reasoning_steps_reward": 0.724609375, "step": 244 }, { "completion_length": 294.28515625, "epoch": 3.896, "grad_norm": 0.94140625, "kl": 0.01655962661607191, "learning_rate": 2.2324302345483327e-09, "loss": 0.0007, "reward": 3.025103345513344, "reward_std": 0.6890733204782009, "rewards/accuracy_reward": 0.064453125, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7340622267996272, "rewards/reasoning_steps_reward": 0.7584635354578495, "step": 245 }, { "completion_length": 288.9453125, "epoch": 3.912, "grad_norm": 0.97265625, "kl": 0.020295250928029418, "learning_rate": 9.922732696748816e-10, "loss": 0.0008, "reward": 2.733549617230892, "reward_std": 0.6962179783731699, "rewards/accuracy_reward": 0.07421875, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6568429181352258, "rewards/reasoning_steps_reward": 0.6888020858168602, "step": 246 }, { "completion_length": 285.66796875, "epoch": 3.928, "grad_norm": 1.0546875, "kl": 0.017885809938888997, "learning_rate": 2.480806262181168e-10, "loss": 0.0007, "reward": 2.9583439081907272, "reward_std": 0.6105441423133016, "rewards/accuracy_reward": 0.115234375, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.7220087309057514, "rewards/reasoning_steps_reward": 0.6770833320915699, "step": 247 }, { "completion_length": 288.5234375, "epoch": 3.944, "grad_norm": 26.625, "kl": 0.03823809011373669, "learning_rate": 0.0, "loss": 0.0015, "reward": 2.7989018857479095, "reward_std": 0.6642574854195118, "rewards/accuracy_reward": 0.1015625, "rewards/format_reward": 0.0, "rewards/novelty_reward_func_explore_exploit": 0.6736357094099125, "rewards/reasoning_steps_reward": 0.676432304084301, "step": 248 }, { "epoch": 3.944, "step": 248, "total_flos": 0.0, "train_loss": 0.006824205948613517, "train_runtime": 18399.2985, "train_samples_per_second": 0.435, "train_steps_per_second": 0.013 } ], "logging_steps": 1, "max_steps": 248, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }