{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1669170422300117, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 256.0, "epoch": 0.00016691704223001168, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 5.0000000000000004e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 1 }, { "completion_length": 256.0, "epoch": 0.00033383408446002337, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.0000000000000001e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 2 }, { "completion_length": 256.0, "epoch": 0.000500751126690035, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1.5000000000000002e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 3 }, { "completion_length": 153.5, "epoch": 0.0006676681689200467, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 4 }, { "completion_length": 256.0, "epoch": 0.0008345852111500584, "grad_norm": 0.4355925917625427, "kl": 0.0, "learning_rate": 2.5000000000000004e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 5 }, { "completion_length": 256.0, "epoch": 0.00100150225338007, "grad_norm": 0.0023074704222381115, "kl": 0.0006924427580088377, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 6 }, { "completion_length": 221.0, "epoch": 0.0011684192956100817, "grad_norm": 0.002545094583183527, "kl": 0.0008617227431386709, "learning_rate": 3.5000000000000004e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 7 }, { "completion_length": 256.0, "epoch": 0.0013353363378400935, "grad_norm": 0.0018900125287473202, "kl": 0.0006088059162721038, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 8 }, { "completion_length": 256.0, "epoch": 0.0015022533800701052, "grad_norm": 0.42436301708221436, "kl": 0.0006020100554451346, "learning_rate": 4.5000000000000003e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 9 }, { "completion_length": 256.0, "epoch": 0.0016691704223001167, "grad_norm": 0.4781365692615509, "kl": 0.0008082038257271051, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 10 }, { "completion_length": 256.0, "epoch": 0.0018360874645301285, "grad_norm": 0.0023456409107893705, "kl": 0.0007865740917623043, "learning_rate": 5.5e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 11 }, { "completion_length": 233.0, "epoch": 0.00200300450676014, "grad_norm": 0.5862958431243896, "kl": 0.0007361505413427949, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 12 }, { "completion_length": 252.5, "epoch": 0.002169921548990152, "grad_norm": 0.48376238346099854, "kl": 0.0005487503949552774, "learning_rate": 6.5e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 13 }, { "completion_length": 256.0, "epoch": 0.0023368385912201635, "grad_norm": 0.002120411954820156, "kl": 0.000743017066270113, "learning_rate": 7.000000000000001e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 14 }, { "completion_length": 175.0, "epoch": 0.0025037556334501754, "grad_norm": 0.0026380703784525394, "kl": 0.0007172441110014915, "learning_rate": 7.5e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 15 }, { "completion_length": 110.0, "epoch": 0.002670672675680187, "grad_norm": 0.0029629545751959085, "kl": 0.0007808679947629571, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 16 }, { "completion_length": 105.5, "epoch": 0.0028375897179101985, "grad_norm": 0.0037870861124247313, "kl": 0.0007570089073851705, "learning_rate": 8.500000000000001e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 17 }, { "completion_length": 96.0, "epoch": 0.0030045067601402104, "grad_norm": 0.00275947037152946, "kl": 0.0004887838149443269, "learning_rate": 9.000000000000001e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 18 }, { "completion_length": 256.0, "epoch": 0.003171423802370222, "grad_norm": 0.0016414800193160772, "kl": 0.0005866154097020626, "learning_rate": 9.500000000000001e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 19 }, { "completion_length": 171.0, "epoch": 0.0033383408446002335, "grad_norm": 0.00516565190628171, "kl": 0.0011643741745501757, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 20 }, { "completion_length": 256.0, "epoch": 0.0035052578868302454, "grad_norm": 0.0018838247051462531, "kl": 0.000623451080173254, "learning_rate": 1.0500000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 21 }, { "completion_length": 256.0, "epoch": 0.003672174929060257, "grad_norm": 0.0017560900887474418, "kl": 0.0007140530506148934, "learning_rate": 1.1e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 22 }, { "completion_length": 256.0, "epoch": 0.003839091971290269, "grad_norm": 0.0031194656621664762, "kl": 0.0008934169309213758, "learning_rate": 1.1500000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 23 }, { "completion_length": 210.0, "epoch": 0.00400600901352028, "grad_norm": 0.4673703908920288, "kl": 0.0005224989145062864, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 24 }, { "completion_length": 256.0, "epoch": 0.004172926055750292, "grad_norm": 0.0021465765312314034, "kl": 0.0007942361989989877, "learning_rate": 1.25e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 25 }, { "completion_length": 256.0, "epoch": 0.004339843097980304, "grad_norm": 0.0019226443255320191, "kl": 0.0007524782558903098, "learning_rate": 1.3e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 26 }, { "completion_length": 256.0, "epoch": 0.004506760140210316, "grad_norm": 0.0022293697111308575, "kl": 0.0008433335460722446, "learning_rate": 1.3500000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 27 }, { "completion_length": 171.5, "epoch": 0.004673677182440327, "grad_norm": 0.0030323590617626905, "kl": 0.0008573549566790462, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 28 }, { "completion_length": 96.0, "epoch": 0.004840594224670339, "grad_norm": 0.0032209851779043674, "kl": 0.0008154669776558876, "learning_rate": 1.45e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 29 }, { "completion_length": 248.5, "epoch": 0.005007511266900351, "grad_norm": 0.3956551253795624, "kl": 0.0007140351226553321, "learning_rate": 1.5e-06, "loss": 0.0, "reward": -0.3425000011920929, "reward_std": 0.6611448526382446, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3425000011920929, "step": 30 }, { "completion_length": 256.0, "epoch": 0.005174428309130362, "grad_norm": 0.0017684625927358866, "kl": 0.0007516610203310847, "learning_rate": 1.5500000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 31 }, { "completion_length": 77.0, "epoch": 0.005341345351360374, "grad_norm": 0.00509336031973362, "kl": 0.0007999609224498272, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 32 }, { "completion_length": 256.0, "epoch": 0.005508262393590386, "grad_norm": 0.003236782504245639, "kl": 0.0009384113363921642, "learning_rate": 1.6500000000000003e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 33 }, { "completion_length": 256.0, "epoch": 0.005675179435820397, "grad_norm": 0.4282550513744354, "kl": 0.0005632344400510192, "learning_rate": 1.7000000000000002e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 34 }, { "completion_length": 237.0, "epoch": 0.005842096478050409, "grad_norm": 0.4902305006980896, "kl": 0.00067540054442361, "learning_rate": 1.75e-06, "loss": 0.0, "reward": -0.3009999990463257, "reward_std": 0.7792316675186157, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3009999990463257, "step": 35 }, { "completion_length": 221.5, "epoch": 0.006009013520280421, "grad_norm": 0.5777540802955627, "kl": 0.000863127235788852, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 36 }, { "completion_length": 256.0, "epoch": 0.006175930562510432, "grad_norm": 0.002305163536220789, "kl": 0.0007089747814461589, "learning_rate": 1.85e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 37 }, { "completion_length": 202.5, "epoch": 0.006342847604740444, "grad_norm": 0.002381665166467428, "kl": 0.0007373938569799066, "learning_rate": 1.9000000000000002e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 38 }, { "completion_length": 256.0, "epoch": 0.006509764646970456, "grad_norm": 0.00199413625523448, "kl": 0.0007965309778228402, "learning_rate": 1.9500000000000004e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 39 }, { "completion_length": 256.0, "epoch": 0.006676681689200467, "grad_norm": 0.45001348853111267, "kl": 0.0007809748640283942, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 40 }, { "completion_length": 256.0, "epoch": 0.006843598731430479, "grad_norm": 0.002464474178850651, "kl": 0.0007592637557536364, "learning_rate": 2.05e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 41 }, { "completion_length": 256.0, "epoch": 0.007010515773660491, "grad_norm": 0.0018699566135182977, "kl": 0.0006333279889076948, "learning_rate": 2.1000000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 42 }, { "completion_length": 256.0, "epoch": 0.007177432815890503, "grad_norm": 0.0017541371053084731, "kl": 0.0007327662315219641, "learning_rate": 2.15e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 43 }, { "completion_length": 136.0, "epoch": 0.007344349858120514, "grad_norm": 0.003846902633085847, "kl": 0.0007626832812093198, "learning_rate": 2.2e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 44 }, { "completion_length": 256.0, "epoch": 0.007511266900350526, "grad_norm": 0.001827204949222505, "kl": 0.0006649623392149806, "learning_rate": 2.25e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 45 }, { "completion_length": 225.5, "epoch": 0.007678183942580538, "grad_norm": 0.003200680483132601, "kl": 0.0008845384581945837, "learning_rate": 2.3000000000000004e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 46 }, { "completion_length": 142.0, "epoch": 0.00784510098481055, "grad_norm": 0.0024895467795431614, "kl": 0.0006290185265243053, "learning_rate": 2.35e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 47 }, { "completion_length": 152.0, "epoch": 0.00801201802704056, "grad_norm": 0.8204300999641418, "kl": 0.0005838776705786586, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 48 }, { "completion_length": 78.5, "epoch": 0.008178935069270572, "grad_norm": 0.005335088353604078, "kl": 0.0007884252700023353, "learning_rate": 2.4500000000000003e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 49 }, { "completion_length": 202.0, "epoch": 0.008345852111500584, "grad_norm": 0.5542981624603271, "kl": 0.0007119841175153852, "learning_rate": 2.5e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 50 }, { "completion_length": 244.0, "epoch": 0.008512769153730596, "grad_norm": 0.0027991230599582195, "kl": 0.000727020320482552, "learning_rate": 2.55e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 51 }, { "completion_length": 256.0, "epoch": 0.008679686195960608, "grad_norm": 0.0028236291836947203, "kl": 0.0008543728617951274, "learning_rate": 2.6e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 52 }, { "completion_length": 228.5, "epoch": 0.00884660323819062, "grad_norm": 0.00168053328525275, "kl": 0.0005044269491918385, "learning_rate": 2.6500000000000005e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 53 }, { "completion_length": 195.0, "epoch": 0.009013520280420632, "grad_norm": 0.0025725034065544605, "kl": 0.0007799636223353446, "learning_rate": 2.7000000000000004e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 54 }, { "completion_length": 256.0, "epoch": 0.009180437322650642, "grad_norm": 0.49676376581192017, "kl": 0.0007005356019362807, "learning_rate": 2.7500000000000004e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 55 }, { "completion_length": 256.0, "epoch": 0.009347354364880654, "grad_norm": 0.0016647669253870845, "kl": 0.0005751823773607612, "learning_rate": 2.8000000000000003e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 56 }, { "completion_length": 256.0, "epoch": 0.009514271407110666, "grad_norm": 0.0019769608043134212, "kl": 0.000668263528496027, "learning_rate": 2.85e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 57 }, { "completion_length": 156.5, "epoch": 0.009681188449340678, "grad_norm": 1.061002254486084, "kl": 0.0009229230345226824, "learning_rate": 2.9e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 58 }, { "completion_length": 159.5, "epoch": 0.00984810549157069, "grad_norm": 0.8685297966003418, "kl": 0.0006633138982579112, "learning_rate": 2.95e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 59 }, { "completion_length": 256.0, "epoch": 0.010015022533800702, "grad_norm": 0.0013861958868801594, "kl": 0.0005295326700434089, "learning_rate": 3e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 60 }, { "completion_length": 104.5, "epoch": 0.010181939576030712, "grad_norm": 0.0031733207870274782, "kl": 0.000785140146035701, "learning_rate": 3.05e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 61 }, { "completion_length": 256.0, "epoch": 0.010348856618260724, "grad_norm": 0.41396209597587585, "kl": 0.0006947015644982457, "learning_rate": 3.1000000000000004e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 62 }, { "completion_length": 69.0, "epoch": 0.010515773660490736, "grad_norm": 0.008254519663751125, "kl": 0.0012825066223740578, "learning_rate": 3.1500000000000003e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 63 }, { "completion_length": 157.0, "epoch": 0.010682690702720748, "grad_norm": 0.0024698572233319283, "kl": 0.0005963145522400737, "learning_rate": 3.2000000000000003e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 64 }, { "completion_length": 256.0, "epoch": 0.01084960774495076, "grad_norm": 0.0024705580435693264, "kl": 0.0008366592228412628, "learning_rate": 3.2500000000000002e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 65 }, { "completion_length": 256.0, "epoch": 0.011016524787180772, "grad_norm": 0.0016799605218693614, "kl": 0.0005449674790725112, "learning_rate": 3.3000000000000006e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 66 }, { "completion_length": 217.5, "epoch": 0.011183441829410784, "grad_norm": 0.4892553985118866, "kl": 0.0006908230716362596, "learning_rate": 3.3500000000000005e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 67 }, { "completion_length": 38.5, "epoch": 0.011350358871640794, "grad_norm": 0.006257229018956423, "kl": 0.00020465116540435702, "learning_rate": 3.4000000000000005e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 68 }, { "completion_length": 256.0, "epoch": 0.011517275913870806, "grad_norm": 0.0017687833169475198, "kl": 0.0006731381872668862, "learning_rate": 3.45e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 69 }, { "completion_length": 215.5, "epoch": 0.011684192956100818, "grad_norm": 0.4772097170352936, "kl": 0.0006857815315015614, "learning_rate": 3.5e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 70 }, { "completion_length": 256.0, "epoch": 0.01185110999833083, "grad_norm": 0.41286587715148926, "kl": 0.0006363401189446449, "learning_rate": 3.5500000000000003e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 71 }, { "completion_length": 96.0, "epoch": 0.012018027040560842, "grad_norm": 0.0036073343362659216, "kl": 0.0007669712067581713, "learning_rate": 3.6000000000000003e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 72 }, { "completion_length": 107.0, "epoch": 0.012184944082790854, "grad_norm": 0.014187167398631573, "kl": 0.0017385465325787663, "learning_rate": 3.65e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 73 }, { "completion_length": 139.0, "epoch": 0.012351861125020864, "grad_norm": 0.006237726658582687, "kl": 0.0005240284954197705, "learning_rate": 3.7e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 74 }, { "completion_length": 159.0, "epoch": 0.012518778167250876, "grad_norm": 0.8166159987449646, "kl": 0.0010699054691940546, "learning_rate": 3.7500000000000005e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 75 }, { "completion_length": 256.0, "epoch": 0.012685695209480888, "grad_norm": 0.4218797981739044, "kl": 0.0006836229003965855, "learning_rate": 3.8000000000000005e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 76 }, { "completion_length": 256.0, "epoch": 0.0128526122517109, "grad_norm": 0.0022117469925433397, "kl": 0.0006208907580003142, "learning_rate": 3.85e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 77 }, { "completion_length": 179.5, "epoch": 0.013019529293940912, "grad_norm": 0.6410713195800781, "kl": 0.0008844473049975932, "learning_rate": 3.900000000000001e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 78 }, { "completion_length": 166.0, "epoch": 0.013186446336170924, "grad_norm": 0.0031605232506990433, "kl": 0.000757245346903801, "learning_rate": 3.95e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 79 }, { "completion_length": 102.5, "epoch": 0.013353363378400934, "grad_norm": 0.7943593263626099, "kl": 0.0008970008930191398, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "reward": -0.02449999749660492, "reward_std": 0.38820162415504456, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.02449999749660492, "step": 80 }, { "completion_length": 256.0, "epoch": 0.013520280420630946, "grad_norm": 0.0017543798312544823, "kl": 0.0005730526754632592, "learning_rate": 4.05e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 81 }, { "completion_length": 256.0, "epoch": 0.013687197462860958, "grad_norm": 0.002782423747703433, "kl": 0.0010118272621184587, "learning_rate": 4.1e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 82 }, { "completion_length": 158.0, "epoch": 0.01385411450509097, "grad_norm": 1.0117136240005493, "kl": 0.0019336793338879943, "learning_rate": 4.15e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 83 }, { "completion_length": 256.0, "epoch": 0.014021031547320982, "grad_norm": 0.0037892700638622046, "kl": 0.0010702450526878238, "learning_rate": 4.2000000000000004e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 84 }, { "completion_length": 256.0, "epoch": 0.014187948589550994, "grad_norm": 0.0019413195550441742, "kl": 0.0006971035618335009, "learning_rate": 4.25e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 85 }, { "completion_length": 256.0, "epoch": 0.014354865631781006, "grad_norm": 0.509328305721283, "kl": 0.0012641346547752619, "learning_rate": 4.3e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 86 }, { "completion_length": 176.0, "epoch": 0.014521782674011016, "grad_norm": 0.5929401516914368, "kl": 0.0011120084673166275, "learning_rate": 4.350000000000001e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 87 }, { "completion_length": 208.0, "epoch": 0.014688699716241028, "grad_norm": 0.5447751879692078, "kl": 0.001371556194499135, "learning_rate": 4.4e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 88 }, { "completion_length": 255.0, "epoch": 0.01485561675847104, "grad_norm": 0.0034100220073014498, "kl": 0.0010690975468605757, "learning_rate": 4.450000000000001e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 89 }, { "completion_length": 256.0, "epoch": 0.015022533800701052, "grad_norm": 0.002973145106807351, "kl": 0.0009098384762182832, "learning_rate": 4.5e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 90 }, { "completion_length": 39.5, "epoch": 0.015189450842931064, "grad_norm": 0.03341549634933472, "kl": 0.008237367495894432, "learning_rate": 4.5500000000000005e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 91 }, { "completion_length": 223.5, "epoch": 0.015356367885161076, "grad_norm": 0.48820972442626953, "kl": 0.0009756239014677703, "learning_rate": 4.600000000000001e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 92 }, { "completion_length": 89.5, "epoch": 0.015523284927391086, "grad_norm": 0.02054094895720482, "kl": 0.004568272735923529, "learning_rate": 4.65e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 93 }, { "completion_length": 256.0, "epoch": 0.0156902019696211, "grad_norm": 0.002148204715922475, "kl": 0.0006858772831037641, "learning_rate": 4.7e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 94 }, { "completion_length": 134.0, "epoch": 0.01585711901185111, "grad_norm": 0.006535620894283056, "kl": 0.0012684892863035202, "learning_rate": 4.75e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 95 }, { "completion_length": 53.0, "epoch": 0.01602403605408112, "grad_norm": 0.008152181282639503, "kl": 0.0028163655661046505, "learning_rate": 4.800000000000001e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 96 }, { "completion_length": 159.0, "epoch": 0.016190953096311134, "grad_norm": 0.8876697421073914, "kl": 0.0020389193668961525, "learning_rate": 4.85e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 97 }, { "completion_length": 256.0, "epoch": 0.016357870138541144, "grad_norm": 0.002760911826044321, "kl": 0.0008105032611638308, "learning_rate": 4.9000000000000005e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 98 }, { "completion_length": 242.0, "epoch": 0.016524787180771158, "grad_norm": 0.002614391967654228, "kl": 0.001181511557660997, "learning_rate": 4.95e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 99 }, { "completion_length": 188.5, "epoch": 0.016691704223001168, "grad_norm": 0.00833022128790617, "kl": 0.0020136404782533646, "learning_rate": 5e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 100 }, { "completion_length": 171.0, "epoch": 0.01685862126523118, "grad_norm": 0.730735182762146, "kl": 0.0035489422734826803, "learning_rate": 4.999984769144476e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 101 }, { "completion_length": 73.0, "epoch": 0.01702553830746119, "grad_norm": 0.016109637916088104, "kl": 0.005685964599251747, "learning_rate": 4.999939076763487e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 102 }, { "completion_length": 136.0, "epoch": 0.017192455349691202, "grad_norm": 0.007926014252007008, "kl": 0.002989206463098526, "learning_rate": 4.999862923413781e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 103 }, { "completion_length": 205.5, "epoch": 0.017359372391921216, "grad_norm": 0.0018580041360110044, "kl": 0.0006508008809760213, "learning_rate": 4.999756310023261e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 104 }, { "completion_length": 173.0, "epoch": 0.017526289434151226, "grad_norm": 0.004838588647544384, "kl": 0.0022138473577797413, "learning_rate": 4.9996192378909785e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 105 }, { "completion_length": 103.0, "epoch": 0.01769320647638124, "grad_norm": 0.0037362566217780113, "kl": 0.0014561975840479136, "learning_rate": 4.999451708687114e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 106 }, { "completion_length": 256.0, "epoch": 0.01786012351861125, "grad_norm": 0.45699435472488403, "kl": 0.0017439527437090874, "learning_rate": 4.9992537244529585e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 107 }, { "completion_length": 204.5, "epoch": 0.018027040560841263, "grad_norm": 0.006480565760284662, "kl": 0.00335732102394104, "learning_rate": 4.999025287600886e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 108 }, { "completion_length": 256.0, "epoch": 0.018193957603071274, "grad_norm": 0.007707216776907444, "kl": 0.001968174707144499, "learning_rate": 4.998766400914329e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 109 }, { "completion_length": 251.5, "epoch": 0.018360874645301284, "grad_norm": 0.006300975102931261, "kl": 0.002719060517847538, "learning_rate": 4.99847706754774e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 110 }, { "completion_length": 184.0, "epoch": 0.018527791687531298, "grad_norm": 0.01569710113108158, "kl": 0.0064395577646791935, "learning_rate": 4.998157291026553e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 111 }, { "completion_length": 130.0, "epoch": 0.018694708729761308, "grad_norm": 0.0071138604544103146, "kl": 0.0030896691605448723, "learning_rate": 4.997807075247147e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 112 }, { "completion_length": 178.5, "epoch": 0.01886162577199132, "grad_norm": 0.7200388312339783, "kl": 0.004501280374825001, "learning_rate": 4.997426424476787e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 113 }, { "completion_length": 256.0, "epoch": 0.01902854281422133, "grad_norm": 0.4705701768398285, "kl": 0.0007659038528800011, "learning_rate": 4.9970153433535855e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 114 }, { "completion_length": 115.5, "epoch": 0.019195459856451345, "grad_norm": 0.00373659934848547, "kl": 0.0010495111346244812, "learning_rate": 4.9965738368864345e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 115 }, { "completion_length": 242.0, "epoch": 0.019362376898681356, "grad_norm": 0.002459950977936387, "kl": 0.00098153215367347, "learning_rate": 4.996101910454953e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 116 }, { "completion_length": 253.5, "epoch": 0.019529293940911366, "grad_norm": 0.0021585533395409584, "kl": 0.0009589203400537372, "learning_rate": 4.995599569809414e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 117 }, { "completion_length": 156.0, "epoch": 0.01969621098314138, "grad_norm": 0.012676632031798363, "kl": 0.004054306074976921, "learning_rate": 4.9950668210706795e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 118 }, { "completion_length": 256.0, "epoch": 0.01986312802537139, "grad_norm": 0.0028360753785818815, "kl": 0.0011460497044026852, "learning_rate": 4.994503670730126e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 119 }, { "completion_length": 256.0, "epoch": 0.020030045067601403, "grad_norm": 0.0020838764030486345, "kl": 0.0008744155056774616, "learning_rate": 4.993910125649561e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 120 }, { "completion_length": 153.0, "epoch": 0.020196962109831414, "grad_norm": 1.0788556337356567, "kl": 0.004594118800014257, "learning_rate": 4.993286193061145e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 121 }, { "completion_length": 209.5, "epoch": 0.020363879152061424, "grad_norm": 0.5527423620223999, "kl": 0.0026736934669315815, "learning_rate": 4.992631880567301e-06, "loss": 0.0001, "reward": -0.11249999701976776, "reward_std": 0.5126524567604065, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11249999701976776, "step": 122 }, { "completion_length": 171.0, "epoch": 0.020530796194291438, "grad_norm": 0.02060548961162567, "kl": 0.007148497737944126, "learning_rate": 4.991947196140619e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 123 }, { "completion_length": 256.0, "epoch": 0.020697713236521448, "grad_norm": 0.004743944387882948, "kl": 0.0008479248499497771, "learning_rate": 4.9912321481237616e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 124 }, { "completion_length": 97.0, "epoch": 0.02086463027875146, "grad_norm": 0.0058629862032830715, "kl": 0.002442185301333666, "learning_rate": 4.990486745229364e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 125 }, { "completion_length": 237.0, "epoch": 0.02103154732098147, "grad_norm": 0.004354474134743214, "kl": 0.002903785789385438, "learning_rate": 4.989710996539926e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 126 }, { "completion_length": 256.0, "epoch": 0.021198464363211485, "grad_norm": 0.0021160589531064034, "kl": 0.0009215105092152953, "learning_rate": 4.9889049115077e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 127 }, { "completion_length": 256.0, "epoch": 0.021365381405441496, "grad_norm": 0.008305010385811329, "kl": 0.0033679387997835875, "learning_rate": 4.988068499954578e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 128 }, { "completion_length": 256.0, "epoch": 0.021532298447671506, "grad_norm": 0.001991692930459976, "kl": 0.0006306689465418458, "learning_rate": 4.987201772071971e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 129 }, { "completion_length": 256.0, "epoch": 0.02169921548990152, "grad_norm": 0.47524091601371765, "kl": 0.004586502909660339, "learning_rate": 4.986304738420684e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 130 }, { "completion_length": 230.0, "epoch": 0.02186613253213153, "grad_norm": 0.003476138226687908, "kl": 0.002353046787902713, "learning_rate": 4.985377409930789e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 131 }, { "completion_length": 108.5, "epoch": 0.022033049574361543, "grad_norm": 0.016695160418748856, "kl": 0.008191770873963833, "learning_rate": 4.984419797901491e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 132 }, { "completion_length": 256.0, "epoch": 0.022199966616591554, "grad_norm": 0.0023697568103671074, "kl": 0.0014776038005948067, "learning_rate": 4.983431914000991e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 133 }, { "completion_length": 256.0, "epoch": 0.022366883658821567, "grad_norm": 0.004167099948972464, "kl": 0.0011776989558711648, "learning_rate": 4.9824137702663424e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 134 }, { "completion_length": 162.0, "epoch": 0.022533800701051578, "grad_norm": 0.563796877861023, "kl": 0.005261815153062344, "learning_rate": 4.981365379103306e-06, "loss": 0.0002, "reward": 0.31299999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31299999356269836, "step": 135 }, { "completion_length": 236.0, "epoch": 0.022700717743281588, "grad_norm": 0.003136793617159128, "kl": 0.001523593906313181, "learning_rate": 4.980286753286196e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 136 }, { "completion_length": 256.0, "epoch": 0.0228676347855116, "grad_norm": 0.0017039531376212835, "kl": 0.0006258584326133132, "learning_rate": 4.979177905957726e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 137 }, { "completion_length": 256.0, "epoch": 0.02303455182774161, "grad_norm": 0.470661461353302, "kl": 0.002396379131823778, "learning_rate": 4.978038850628855e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 138 }, { "completion_length": 57.0, "epoch": 0.023201468869971625, "grad_norm": 0.007195879705250263, "kl": 0.0040284707210958, "learning_rate": 4.9768696011786095e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 139 }, { "completion_length": 255.0, "epoch": 0.023368385912201636, "grad_norm": 0.004115454852581024, "kl": 0.002767103724181652, "learning_rate": 4.975670171853926e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 140 }, { "completion_length": 256.0, "epoch": 0.023535302954431646, "grad_norm": 0.004490667954087257, "kl": 0.002605405170470476, "learning_rate": 4.974440577269473e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 141 }, { "completion_length": 157.0, "epoch": 0.02370221999666166, "grad_norm": 0.783027708530426, "kl": 0.0030547939240932465, "learning_rate": 4.973180832407471e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 142 }, { "completion_length": 206.0, "epoch": 0.02386913703889167, "grad_norm": 0.0045102620497345924, "kl": 0.0013860156759619713, "learning_rate": 4.971890952617515e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 143 }, { "completion_length": 256.0, "epoch": 0.024036054081121683, "grad_norm": 0.0025455558206886053, "kl": 0.0007086819969117641, "learning_rate": 4.970570953616383e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 144 }, { "completion_length": 256.0, "epoch": 0.024202971123351694, "grad_norm": 0.49667930603027344, "kl": 0.0025217789225280285, "learning_rate": 4.9692208514878445e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 145 }, { "completion_length": 106.5, "epoch": 0.024369888165581707, "grad_norm": 0.7649089694023132, "kl": 0.0005496042431332171, "learning_rate": 4.96784066268247e-06, "loss": 0.0, "reward": 0.026500001549720764, "reward_std": 0.31607672572135925, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.026500001549720764, "step": 146 }, { "completion_length": 45.0, "epoch": 0.024536805207811718, "grad_norm": 0.04059964045882225, "kl": 0.024512799456715584, "learning_rate": 4.966430404017424e-06, "loss": 0.001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 147 }, { "completion_length": 125.5, "epoch": 0.024703722250041728, "grad_norm": 0.0032058542128652334, "kl": 0.0013864783104509115, "learning_rate": 4.964990092676263e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 148 }, { "completion_length": 37.5, "epoch": 0.02487063929227174, "grad_norm": 0.03411239758133888, "kl": 0.015727538615465164, "learning_rate": 4.963519746208726e-06, "loss": 0.0006, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 149 }, { "completion_length": 69.0, "epoch": 0.02503755633450175, "grad_norm": 0.006437742151319981, "kl": 0.0008675693534314632, "learning_rate": 4.962019382530521e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 150 }, { "completion_length": 256.0, "epoch": 0.025204473376731765, "grad_norm": 0.551456868648529, "kl": 0.002061716513708234, "learning_rate": 4.960489019923105e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 151 }, { "completion_length": 133.0, "epoch": 0.025371390418961776, "grad_norm": 0.0028516994789242744, "kl": 0.0008921318221837282, "learning_rate": 4.958928677033465e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 152 }, { "completion_length": 177.0, "epoch": 0.02553830746119179, "grad_norm": 0.7803808450698853, "kl": 0.0027085822075605392, "learning_rate": 4.957338372873886e-06, "loss": 0.0001, "reward": 0.10799999535083771, "reward_std": 0.20081833004951477, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.10799999535083771, "step": 153 }, { "completion_length": 256.0, "epoch": 0.0257052245034218, "grad_norm": 0.5542619824409485, "kl": 0.0016000234754756093, "learning_rate": 4.9557181268217225e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 154 }, { "completion_length": 256.0, "epoch": 0.02587214154565181, "grad_norm": 0.002420460805296898, "kl": 0.0006344997091218829, "learning_rate": 4.9540679586191605e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 155 }, { "completion_length": 89.0, "epoch": 0.026039058587881823, "grad_norm": 0.006856379099190235, "kl": 0.003562736324965954, "learning_rate": 4.9523878883729794e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 156 }, { "completion_length": 92.5, "epoch": 0.026205975630111834, "grad_norm": 0.004289491567760706, "kl": 0.001653953455388546, "learning_rate": 4.9506779365543054e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 157 }, { "completion_length": 221.0, "epoch": 0.026372892672341847, "grad_norm": 0.003742998233065009, "kl": 0.0018485569162294269, "learning_rate": 4.94893812399836e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 158 }, { "completion_length": 256.0, "epoch": 0.026539809714571858, "grad_norm": 0.5246918201446533, "kl": 0.0012630211422219872, "learning_rate": 4.947168471904213e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 159 }, { "completion_length": 191.0, "epoch": 0.026706726756801868, "grad_norm": 0.0018313068430870771, "kl": 0.0008364144014194608, "learning_rate": 4.9453690018345144e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 160 }, { "completion_length": 165.0, "epoch": 0.02687364379903188, "grad_norm": 0.6951025128364563, "kl": 0.004667551256716251, "learning_rate": 4.9435397357152406e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 161 }, { "completion_length": 73.5, "epoch": 0.02704056084126189, "grad_norm": 0.003615825902670622, "kl": 0.0007906734244897962, "learning_rate": 4.9416806958354206e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 162 }, { "completion_length": 99.0, "epoch": 0.027207477883491905, "grad_norm": 0.743748664855957, "kl": 0.004089992493391037, "learning_rate": 4.939791904846869e-06, "loss": 0.0002, "reward": 0.016999997198581696, "reward_std": 0.3295117914676666, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016999997198581696, "step": 163 }, { "completion_length": 202.0, "epoch": 0.027374394925721916, "grad_norm": 0.010561556555330753, "kl": 0.004778451751917601, "learning_rate": 4.937873385763909e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 164 }, { "completion_length": 93.5, "epoch": 0.02754131196795193, "grad_norm": 0.02456500567495823, "kl": 0.010260429233312607, "learning_rate": 4.935925161963089e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 165 }, { "completion_length": 226.5, "epoch": 0.02770822901018194, "grad_norm": 0.0023516525980085135, "kl": 0.0010813018307089806, "learning_rate": 4.933947257182901e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 166 }, { "completion_length": 189.5, "epoch": 0.02787514605241195, "grad_norm": 0.0029474033508449793, "kl": 0.001338199945166707, "learning_rate": 4.9319396955234925e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 167 }, { "completion_length": 208.0, "epoch": 0.028042063094641963, "grad_norm": 0.007299169898033142, "kl": 0.002620633225888014, "learning_rate": 4.9299025014463665e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 168 }, { "completion_length": 247.0, "epoch": 0.028208980136871974, "grad_norm": 0.007038203068077564, "kl": 0.003361462615430355, "learning_rate": 4.92783569977409e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 169 }, { "completion_length": 253.0, "epoch": 0.028375897179101987, "grad_norm": 0.003194525372236967, "kl": 0.0014862786047160625, "learning_rate": 4.925739315689991e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 170 }, { "completion_length": 256.0, "epoch": 0.028542814221331998, "grad_norm": 0.5311148762702942, "kl": 0.0022094352170825005, "learning_rate": 4.923613374737848e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 171 }, { "completion_length": 56.0, "epoch": 0.02870973126356201, "grad_norm": 0.0028501120395958424, "kl": 0.0012579626636579633, "learning_rate": 4.921457902821578e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 172 }, { "completion_length": 132.5, "epoch": 0.02887664830579202, "grad_norm": 0.005550900008529425, "kl": 0.002430474851280451, "learning_rate": 4.9192729262049285e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 173 }, { "completion_length": 113.0, "epoch": 0.02904356534802203, "grad_norm": 0.004829529672861099, "kl": 0.0024908287450671196, "learning_rate": 4.917058471511149e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 174 }, { "completion_length": 193.5, "epoch": 0.029210482390252045, "grad_norm": 0.008240972645580769, "kl": 0.002698900643736124, "learning_rate": 4.914814565722671e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 175 }, { "completion_length": 256.0, "epoch": 0.029377399432482056, "grad_norm": 0.002342839725315571, "kl": 0.0006892987294122577, "learning_rate": 4.912541236180779e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 176 }, { "completion_length": 256.0, "epoch": 0.02954431647471207, "grad_norm": 0.0020155266392976046, "kl": 0.0008545288583263755, "learning_rate": 4.910238510585275e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 177 }, { "completion_length": 242.0, "epoch": 0.02971123351694208, "grad_norm": 0.004538952372968197, "kl": 0.0022097649052739143, "learning_rate": 4.907906416994146e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 178 }, { "completion_length": 256.0, "epoch": 0.029878150559172093, "grad_norm": 0.5544212460517883, "kl": 0.0022937313187867403, "learning_rate": 4.905544983823214e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 179 }, { "completion_length": 75.0, "epoch": 0.030045067601402103, "grad_norm": 0.009744975715875626, "kl": 0.003921708557754755, "learning_rate": 4.903154239845798e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 180 }, { "completion_length": 151.5, "epoch": 0.030211984643632114, "grad_norm": 0.014394350349903107, "kl": 0.006705806124955416, "learning_rate": 4.900734214192358e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 181 }, { "completion_length": 256.0, "epoch": 0.030378901685862127, "grad_norm": 0.0025003571063280106, "kl": 0.0007842597551643848, "learning_rate": 4.898284936350144e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 182 }, { "completion_length": 175.5, "epoch": 0.030545818728092138, "grad_norm": 0.0033461935818195343, "kl": 0.0014864741824567318, "learning_rate": 4.8958064361628334e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 183 }, { "completion_length": 256.0, "epoch": 0.03071273577032215, "grad_norm": 0.00289236381649971, "kl": 0.0008769807172939181, "learning_rate": 4.893298743830168e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 184 }, { "completion_length": 256.0, "epoch": 0.03087965281255216, "grad_norm": 0.45990633964538574, "kl": 0.001738408813253045, "learning_rate": 4.890761889907589e-06, "loss": 0.0001, "reward": -0.48900002241134644, "reward_std": 0.8683272004127502, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.48900002241134644, "step": 185 }, { "completion_length": 66.0, "epoch": 0.03104656985478217, "grad_norm": 0.008808651007711887, "kl": 0.0032781457994133234, "learning_rate": 4.888195905305859e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 186 }, { "completion_length": 256.0, "epoch": 0.031213486897012185, "grad_norm": 0.46073803305625916, "kl": 0.0015886081382632256, "learning_rate": 4.885600821290692e-06, "loss": 0.0001, "reward": -0.34950000047683716, "reward_std": 0.8478209972381592, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34950000047683716, "step": 187 }, { "completion_length": 256.0, "epoch": 0.0313804039392422, "grad_norm": 0.002850939519703388, "kl": 0.000604389701038599, "learning_rate": 4.882976669482368e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 188 }, { "completion_length": 247.0, "epoch": 0.031547320981472206, "grad_norm": 0.004800661467015743, "kl": 0.0021335515193641186, "learning_rate": 4.880323481855347e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 189 }, { "completion_length": 158.5, "epoch": 0.03171423802370222, "grad_norm": 0.005557985045015812, "kl": 0.002252049744129181, "learning_rate": 4.8776412907378845e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 190 }, { "completion_length": 256.0, "epoch": 0.03188115506593223, "grad_norm": 0.0025908444076776505, "kl": 0.0009160751942545176, "learning_rate": 4.874930128811631e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 191 }, { "completion_length": 70.0, "epoch": 0.03204807210816224, "grad_norm": 0.014216624200344086, "kl": 0.0060432180762290955, "learning_rate": 4.8721900291112415e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 192 }, { "completion_length": 199.5, "epoch": 0.032214989150392254, "grad_norm": 0.011198217049241066, "kl": 0.003438035026192665, "learning_rate": 4.869421025023965e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 193 }, { "completion_length": 79.0, "epoch": 0.03238190619262227, "grad_norm": 0.8080710172653198, "kl": 0.002157788258045912, "learning_rate": 4.866623150289241e-06, "loss": 0.0001, "reward": 0.31299999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31299999356269836, "step": 194 }, { "completion_length": 106.0, "epoch": 0.03254882323485228, "grad_norm": 0.020709408447146416, "kl": 0.006670342292636633, "learning_rate": 4.863796438998293e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 195 }, { "completion_length": 226.5, "epoch": 0.03271574027708229, "grad_norm": 0.5578277111053467, "kl": 0.002193016931414604, "learning_rate": 4.860940925593703e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 196 }, { "completion_length": 256.0, "epoch": 0.0328826573193123, "grad_norm": 0.003257354721426964, "kl": 0.0011970436898991466, "learning_rate": 4.858056644869002e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 197 }, { "completion_length": 256.0, "epoch": 0.033049574361542315, "grad_norm": 0.0018079557921737432, "kl": 0.0006898987339809537, "learning_rate": 4.855143631968242e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 198 }, { "completion_length": 79.5, "epoch": 0.03321649140377232, "grad_norm": 0.013823915272951126, "kl": 0.0033698189072310925, "learning_rate": 4.852201922385564e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 199 }, { "completion_length": 186.0, "epoch": 0.033383408446002336, "grad_norm": 0.7572923302650452, "kl": 0.0006614526500925422, "learning_rate": 4.849231551964771e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 200 }, { "completion_length": 256.0, "epoch": 0.03355032548823235, "grad_norm": 0.4194643199443817, "kl": 0.0007158059161156416, "learning_rate": 4.84623255689889e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 201 }, { "completion_length": 125.5, "epoch": 0.03371724253046236, "grad_norm": 0.008742684498429298, "kl": 0.0026122299022972584, "learning_rate": 4.84320497372973e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 202 }, { "completion_length": 256.0, "epoch": 0.03388415957269237, "grad_norm": 0.006188174244016409, "kl": 0.002113808412104845, "learning_rate": 4.840148839347434e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 203 }, { "completion_length": 256.0, "epoch": 0.03405107661492238, "grad_norm": 0.00213708751834929, "kl": 0.0008161086589097977, "learning_rate": 4.837064190990036e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 204 }, { "completion_length": 256.0, "epoch": 0.0342179936571524, "grad_norm": 0.003033376531675458, "kl": 0.0008216318674385548, "learning_rate": 4.833951066243004e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 205 }, { "completion_length": 230.5, "epoch": 0.034384910699382404, "grad_norm": 0.0020868410356342793, "kl": 0.0007428427925333381, "learning_rate": 4.830809503038781e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 206 }, { "completion_length": 199.0, "epoch": 0.03455182774161242, "grad_norm": 0.5663337111473083, "kl": 0.0014018776128068566, "learning_rate": 4.8276395396563215e-06, "loss": 0.0001, "reward": -0.08100000023841858, "reward_std": 0.29132798314094543, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08100000023841858, "step": 207 }, { "completion_length": 159.0, "epoch": 0.03471874478384243, "grad_norm": 0.8271621465682983, "kl": 0.001577866030856967, "learning_rate": 4.824441214720629e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 208 }, { "completion_length": 175.5, "epoch": 0.034885661826072445, "grad_norm": 0.6131613254547119, "kl": 0.0015017276164144278, "learning_rate": 4.821214567202284e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 209 }, { "completion_length": 256.0, "epoch": 0.03505257886830245, "grad_norm": 0.0026052636094391346, "kl": 0.0007710367208346725, "learning_rate": 4.817959636416969e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 210 }, { "completion_length": 211.0, "epoch": 0.035219495910532465, "grad_norm": 0.004183937329798937, "kl": 0.0015982184559106827, "learning_rate": 4.814676462024988e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 211 }, { "completion_length": 160.0, "epoch": 0.03538641295276248, "grad_norm": 0.009894547052681446, "kl": 0.0030887513421475887, "learning_rate": 4.811365084030784e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 212 }, { "completion_length": 165.5, "epoch": 0.035553329994992486, "grad_norm": 0.6228646039962769, "kl": 0.0024271938018500805, "learning_rate": 4.808025542782453e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 213 }, { "completion_length": 144.0, "epoch": 0.0357202470372225, "grad_norm": 0.0069277845323085785, "kl": 0.0021884976886212826, "learning_rate": 4.804657878971252e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 214 }, { "completion_length": 256.0, "epoch": 0.03588716407945251, "grad_norm": 0.4761464595794678, "kl": 0.0012601235648617148, "learning_rate": 4.801262133631101e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 215 }, { "completion_length": 126.0, "epoch": 0.03605408112168253, "grad_norm": 0.008177109062671661, "kl": 0.0019751747604459524, "learning_rate": 4.7978383481380865e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 216 }, { "completion_length": 131.0, "epoch": 0.036220998163912534, "grad_norm": 0.00613454869017005, "kl": 0.0018755451310425997, "learning_rate": 4.794386564209953e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 217 }, { "completion_length": 223.5, "epoch": 0.03638791520614255, "grad_norm": 0.44634270668029785, "kl": 0.0014435371849685907, "learning_rate": 4.790906823905599e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 218 }, { "completion_length": 91.0, "epoch": 0.03655483224837256, "grad_norm": 0.7354722619056702, "kl": 0.0022487337701022625, "learning_rate": 4.787399169624562e-06, "loss": 0.0001, "reward": -0.0065000057220458984, "reward_std": 0.3627457916736603, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0065000057220458984, "step": 219 }, { "completion_length": 256.0, "epoch": 0.03672174929060257, "grad_norm": 0.0037936577573418617, "kl": 0.0009520038729533553, "learning_rate": 4.783863644106502e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 220 }, { "completion_length": 256.0, "epoch": 0.03688866633283258, "grad_norm": 0.002587083959951997, "kl": 0.0009270606096833944, "learning_rate": 4.780300290430683e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 221 }, { "completion_length": 205.0, "epoch": 0.037055583375062595, "grad_norm": 0.003639143193140626, "kl": 0.0020004468970000744, "learning_rate": 4.776709152015443e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 222 }, { "completion_length": 147.0, "epoch": 0.03722250041729261, "grad_norm": 0.008083125576376915, "kl": 0.0027345772832632065, "learning_rate": 4.773090272617672e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 223 }, { "completion_length": 66.5, "epoch": 0.037389417459522616, "grad_norm": 0.04055819287896156, "kl": 0.010171202942728996, "learning_rate": 4.769443696332272e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 224 }, { "completion_length": 189.5, "epoch": 0.03755633450175263, "grad_norm": 0.7105002403259277, "kl": 0.0019799629226326942, "learning_rate": 4.765769467591626e-06, "loss": 0.0001, "reward": 0.009499996900558472, "reward_std": 0.3401183485984802, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.009499996900558472, "step": 225 }, { "completion_length": 159.0, "epoch": 0.03772325154398264, "grad_norm": 0.9629635810852051, "kl": 0.004896222613751888, "learning_rate": 4.762067631165049e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 226 }, { "completion_length": 196.5, "epoch": 0.03789016858621265, "grad_norm": 0.0023067721631377935, "kl": 0.0006931637763045728, "learning_rate": 4.7583382321582525e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 227 }, { "completion_length": 253.5, "epoch": 0.03805708562844266, "grad_norm": 0.6065590381622314, "kl": 0.0016378038562834263, "learning_rate": 4.754581316012785e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 228 }, { "completion_length": 256.0, "epoch": 0.03822400267067268, "grad_norm": 0.4182409942150116, "kl": 0.0006535338470712304, "learning_rate": 4.750796928505484e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 229 }, { "completion_length": 62.0, "epoch": 0.03839091971290269, "grad_norm": 0.0074080415070056915, "kl": 0.0021970951929688454, "learning_rate": 4.746985115747918e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 230 }, { "completion_length": 256.0, "epoch": 0.0385578367551327, "grad_norm": 0.002946356078609824, "kl": 0.0010866527445614338, "learning_rate": 4.743145924185821e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 231 }, { "completion_length": 182.0, "epoch": 0.03872475379736271, "grad_norm": 0.8010044097900391, "kl": 0.0018577806185930967, "learning_rate": 4.7392794005985324e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 232 }, { "completion_length": 153.5, "epoch": 0.038891670839592725, "grad_norm": 0.029739540070295334, "kl": 0.005337054841220379, "learning_rate": 4.735385592098421e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 233 }, { "completion_length": 110.0, "epoch": 0.03905858788182273, "grad_norm": 0.012487626634538174, "kl": 0.0035706604830920696, "learning_rate": 4.731464546130315e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 234 }, { "completion_length": 229.5, "epoch": 0.039225504924052745, "grad_norm": 0.41128891706466675, "kl": 0.0006509397644549608, "learning_rate": 4.72751631047092e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 235 }, { "completion_length": 256.0, "epoch": 0.03939242196628276, "grad_norm": 0.0018210287671536207, "kl": 0.0006590959383174777, "learning_rate": 4.723540933228245e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 236 }, { "completion_length": 148.5, "epoch": 0.039559339008512766, "grad_norm": 0.008435525000095367, "kl": 0.00294931186363101, "learning_rate": 4.719538462841003e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 237 }, { "completion_length": 62.5, "epoch": 0.03972625605074278, "grad_norm": 0.014434335753321648, "kl": 0.005917271599173546, "learning_rate": 4.715508948078037e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 238 }, { "completion_length": 256.0, "epoch": 0.03989317309297279, "grad_norm": 0.004059739410877228, "kl": 0.001634508720599115, "learning_rate": 4.71145243803771e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 239 }, { "completion_length": 256.0, "epoch": 0.04006009013520281, "grad_norm": 0.46403589844703674, "kl": 0.000833989935927093, "learning_rate": 4.707368982147318e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 240 }, { "completion_length": 99.0, "epoch": 0.040227007177432814, "grad_norm": 0.007492441218346357, "kl": 0.0029247449710965157, "learning_rate": 4.703258630162481e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 241 }, { "completion_length": 172.0, "epoch": 0.04039392421966283, "grad_norm": 0.7002574801445007, "kl": 0.0040121604688465595, "learning_rate": 4.699121432166542e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 242 }, { "completion_length": 256.0, "epoch": 0.04056084126189284, "grad_norm": 0.5193341970443726, "kl": 0.0014945006696507335, "learning_rate": 4.6949574385699514e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 243 }, { "completion_length": 152.0, "epoch": 0.04072775830412285, "grad_norm": 0.0032617978285998106, "kl": 0.0014553058426827192, "learning_rate": 4.690766700109659e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 244 }, { "completion_length": 168.0, "epoch": 0.04089467534635286, "grad_norm": 0.6632115840911865, "kl": 0.0026855652686208487, "learning_rate": 4.68654926784849e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 245 }, { "completion_length": 132.5, "epoch": 0.041061592388582875, "grad_norm": 0.011687880381941795, "kl": 0.005734768696129322, "learning_rate": 4.682305193174524e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 246 }, { "completion_length": 207.5, "epoch": 0.04122850943081289, "grad_norm": 0.5963542461395264, "kl": 0.0018796151271089911, "learning_rate": 4.6780345278004744e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 247 }, { "completion_length": 256.0, "epoch": 0.041395426473042896, "grad_norm": 0.0023439242504537106, "kl": 0.0006716196658089757, "learning_rate": 4.673737323763048e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 248 }, { "completion_length": 214.5, "epoch": 0.04156234351527291, "grad_norm": 0.006127151660621166, "kl": 0.002559205750003457, "learning_rate": 4.669413633422322e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 249 }, { "completion_length": 256.0, "epoch": 0.04172926055750292, "grad_norm": 0.00755158718675375, "kl": 0.001906523248180747, "learning_rate": 4.665063509461098e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 250 }, { "completion_length": 164.0, "epoch": 0.04189617759973293, "grad_norm": 0.8039184212684631, "kl": 0.0033837109804153442, "learning_rate": 4.6606870048842626e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 251 }, { "completion_length": 215.5, "epoch": 0.04206309464196294, "grad_norm": 0.002458620583638549, "kl": 0.0011240073945373297, "learning_rate": 4.656284173018144e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 252 }, { "completion_length": 256.0, "epoch": 0.04223001168419296, "grad_norm": 0.5395125150680542, "kl": 0.0027889427728950977, "learning_rate": 4.65185506750986e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 253 }, { "completion_length": 218.0, "epoch": 0.04239692872642297, "grad_norm": 0.010818185284733772, "kl": 0.004072962794452906, "learning_rate": 4.6473997423266615e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 254 }, { "completion_length": 124.0, "epoch": 0.04256384576865298, "grad_norm": 0.00726601667702198, "kl": 0.002971404232084751, "learning_rate": 4.642918251755281e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 255 }, { "completion_length": 224.0, "epoch": 0.04273076281088299, "grad_norm": 0.007763705216348171, "kl": 0.00342262489721179, "learning_rate": 4.638410650401267e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 256 }, { "completion_length": 80.5, "epoch": 0.042897679853113005, "grad_norm": 0.005960578098893166, "kl": 0.002792040351778269, "learning_rate": 4.633876993188319e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 257 }, { "completion_length": 256.0, "epoch": 0.04306459689534301, "grad_norm": 0.0025253419298678637, "kl": 0.000980317359790206, "learning_rate": 4.62931733535762e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 258 }, { "completion_length": 98.5, "epoch": 0.043231513937573025, "grad_norm": 0.014364824630320072, "kl": 0.006811304017901421, "learning_rate": 4.62473173246716e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 259 }, { "completion_length": 247.0, "epoch": 0.04339843097980304, "grad_norm": 0.0035524980630725622, "kl": 0.0018612551502883434, "learning_rate": 4.620120240391065e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 260 }, { "completion_length": 176.5, "epoch": 0.04356534802203305, "grad_norm": 0.6896646022796631, "kl": 0.004097536206245422, "learning_rate": 4.6154829153189105e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 261 }, { "completion_length": 166.5, "epoch": 0.04373226506426306, "grad_norm": 0.8282080292701721, "kl": 0.006228203885257244, "learning_rate": 4.610819813755038e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 262 }, { "completion_length": 186.0, "epoch": 0.04389918210649307, "grad_norm": 0.002550558652728796, "kl": 0.001502474769949913, "learning_rate": 4.60613099251787e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 263 }, { "completion_length": 256.0, "epoch": 0.04406609914872309, "grad_norm": 0.392997145652771, "kl": 0.0026185126043856144, "learning_rate": 4.601416508739211e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 264 }, { "completion_length": 169.5, "epoch": 0.044233016190953094, "grad_norm": 0.6534034609794617, "kl": 0.004573761485517025, "learning_rate": 4.596676419863561e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 265 }, { "completion_length": 155.0, "epoch": 0.04439993323318311, "grad_norm": 0.009656507521867752, "kl": 0.006164904218167067, "learning_rate": 4.591910783647405e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 266 }, { "completion_length": 256.0, "epoch": 0.04456685027541312, "grad_norm": 0.5320852398872375, "kl": 0.0027643111534416676, "learning_rate": 4.587119658158517e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 267 }, { "completion_length": 256.0, "epoch": 0.044733767317643135, "grad_norm": 0.0024587693624198437, "kl": 0.000822383095510304, "learning_rate": 4.582303101775249e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 268 }, { "completion_length": 188.0, "epoch": 0.04490068435987314, "grad_norm": 0.004378703888505697, "kl": 0.003045399207621813, "learning_rate": 4.577461173185821e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 269 }, { "completion_length": 256.0, "epoch": 0.045067601402103155, "grad_norm": 0.47458064556121826, "kl": 0.003992489073425531, "learning_rate": 4.572593931387604e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 270 }, { "completion_length": 105.0, "epoch": 0.04523451844433317, "grad_norm": 0.013432961888611317, "kl": 0.010349882766604424, "learning_rate": 4.567701435686405e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 271 }, { "completion_length": 256.0, "epoch": 0.045401435486563176, "grad_norm": 0.42819666862487793, "kl": 0.0028620134107768536, "learning_rate": 4.562783745695738e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 272 }, { "completion_length": 105.0, "epoch": 0.04556835252879319, "grad_norm": 0.610860288143158, "kl": 0.0021511802915483713, "learning_rate": 4.5578409213361055e-06, "loss": 0.0001, "reward": 0.0429999977350235, "reward_std": 0.29274222254753113, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0429999977350235, "step": 273 }, { "completion_length": 192.5, "epoch": 0.0457352695710232, "grad_norm": 0.0033041678834706545, "kl": 0.0024997168220579624, "learning_rate": 4.55287302283426e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 274 }, { "completion_length": 232.0, "epoch": 0.04590218661325321, "grad_norm": 0.003982267342507839, "kl": 0.004451976157724857, "learning_rate": 4.54788011072248e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 275 }, { "completion_length": 157.5, "epoch": 0.04606910365548322, "grad_norm": 0.00776221277192235, "kl": 0.007638050243258476, "learning_rate": 4.542862245837821e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 276 }, { "completion_length": 174.0, "epoch": 0.04623602069771324, "grad_norm": 0.014356784522533417, "kl": 0.012734368443489075, "learning_rate": 4.537819489321385e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 277 }, { "completion_length": 249.0, "epoch": 0.04640293773994325, "grad_norm": 0.004380898084491491, "kl": 0.004263445734977722, "learning_rate": 4.5327519026175694e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 278 }, { "completion_length": 256.0, "epoch": 0.04656985478217326, "grad_norm": 0.014358595944941044, "kl": 0.006899717263877392, "learning_rate": 4.527659547473317e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 279 }, { "completion_length": 133.0, "epoch": 0.04673677182440327, "grad_norm": 0.826359212398529, "kl": 0.014278488233685493, "learning_rate": 4.522542485937369e-06, "loss": 0.0006, "reward": 0.31299999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31299999356269836, "step": 280 }, { "completion_length": 43.0, "epoch": 0.046903688866633285, "grad_norm": 0.02362542226910591, "kl": 0.030153125524520874, "learning_rate": 4.517400780359505e-06, "loss": 0.0012, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 281 }, { "completion_length": 68.0, "epoch": 0.04707060590886329, "grad_norm": 0.013137142173945904, "kl": 0.008583633229136467, "learning_rate": 4.512234493389785e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 282 }, { "completion_length": 227.5, "epoch": 0.047237522951093305, "grad_norm": 0.0046507674269378185, "kl": 0.003629918210208416, "learning_rate": 4.507043687977787e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 283 }, { "completion_length": 256.0, "epoch": 0.04740443999332332, "grad_norm": 0.42082175612449646, "kl": 0.002722459379583597, "learning_rate": 4.501828427371834e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 284 }, { "completion_length": 235.0, "epoch": 0.04757135703555333, "grad_norm": 0.5911824107170105, "kl": 0.003979473374783993, "learning_rate": 4.496588775118232e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 285 }, { "completion_length": 149.0, "epoch": 0.04773827407778334, "grad_norm": 0.0048468331806361675, "kl": 0.004797625355422497, "learning_rate": 4.491324795060491e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 286 }, { "completion_length": 250.0, "epoch": 0.04790519112001335, "grad_norm": 0.0017999842530116439, "kl": 0.0006170593551360071, "learning_rate": 4.4860365513385456e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 287 }, { "completion_length": 256.0, "epoch": 0.04807210816224337, "grad_norm": 0.0029833903536200523, "kl": 0.003325324971228838, "learning_rate": 4.4807241083879774e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 288 }, { "completion_length": 139.5, "epoch": 0.048239025204473374, "grad_norm": 1.1344103813171387, "kl": 0.005017009563744068, "learning_rate": 4.475387530939226e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 289 }, { "completion_length": 55.5, "epoch": 0.04840594224670339, "grad_norm": 0.006019354797899723, "kl": 0.004322670865803957, "learning_rate": 4.470026884016805e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 290 }, { "completion_length": 94.5, "epoch": 0.0485728592889334, "grad_norm": 0.7935484051704407, "kl": 0.004219060763716698, "learning_rate": 4.464642232938505e-06, "loss": 0.0002, "reward": 0.11500000208616257, "reward_std": 0.19091883301734924, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.11500000208616257, "step": 291 }, { "completion_length": 174.0, "epoch": 0.048739776331163415, "grad_norm": 0.0022192213218659163, "kl": 0.0007574663031846285, "learning_rate": 4.4592336433146e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 292 }, { "completion_length": 209.0, "epoch": 0.04890669337339342, "grad_norm": 0.005186779424548149, "kl": 0.004556355997920036, "learning_rate": 4.453801181047047e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 293 }, { "completion_length": 228.0, "epoch": 0.049073610415623435, "grad_norm": 0.006424285005778074, "kl": 0.0038462267257273197, "learning_rate": 4.448344912328686e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 294 }, { "completion_length": 256.0, "epoch": 0.04924052745785345, "grad_norm": 0.4358481466770172, "kl": 0.004346412606537342, "learning_rate": 4.442864903642428e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 295 }, { "completion_length": 175.5, "epoch": 0.049407444500083456, "grad_norm": 0.008684922009706497, "kl": 0.007895098999142647, "learning_rate": 4.437361221760449e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 296 }, { "completion_length": 256.0, "epoch": 0.04957436154231347, "grad_norm": 0.0022305110469460487, "kl": 0.0006597781321033835, "learning_rate": 4.431833933743378e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 297 }, { "completion_length": 256.0, "epoch": 0.04974127858454348, "grad_norm": 0.005712293088436127, "kl": 0.005445517599582672, "learning_rate": 4.426283106939474e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 298 }, { "completion_length": 256.0, "epoch": 0.0499081956267735, "grad_norm": 0.003091034246608615, "kl": 0.0014638254651799798, "learning_rate": 4.420708808983809e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 299 }, { "completion_length": 74.5, "epoch": 0.0500751126690035, "grad_norm": 0.021067529916763306, "kl": 0.02039944753050804, "learning_rate": 4.415111107797445e-06, "loss": 0.0008, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 300 }, { "completion_length": 202.5, "epoch": 0.05024202971123352, "grad_norm": 0.006776970811188221, "kl": 0.007739311549812555, "learning_rate": 4.409490071586606e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 301 }, { "completion_length": 256.0, "epoch": 0.05040894675346353, "grad_norm": 0.0018317042849957943, "kl": 0.0007313088281080127, "learning_rate": 4.403845768841842e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 302 }, { "completion_length": 201.5, "epoch": 0.05057586379569354, "grad_norm": 0.005300500895828009, "kl": 0.0014829222345724702, "learning_rate": 4.398178268337202e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 303 }, { "completion_length": 161.0, "epoch": 0.05074278083792355, "grad_norm": 0.005339760798960924, "kl": 0.005453492980450392, "learning_rate": 4.3924876391293915e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 304 }, { "completion_length": 256.0, "epoch": 0.050909697880153565, "grad_norm": 0.004233522340655327, "kl": 0.00088059704285115, "learning_rate": 4.386773950556931e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 305 }, { "completion_length": 256.0, "epoch": 0.05107661492238358, "grad_norm": 0.004186778329312801, "kl": 0.004309483338147402, "learning_rate": 4.381037272239311e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 306 }, { "completion_length": 222.5, "epoch": 0.051243531964613585, "grad_norm": 0.5819360613822937, "kl": 0.004527037963271141, "learning_rate": 4.3752776740761495e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 307 }, { "completion_length": 256.0, "epoch": 0.0514104490068436, "grad_norm": 0.4481275677680969, "kl": 0.002896673046052456, "learning_rate": 4.36949522624633e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 308 }, { "completion_length": 238.5, "epoch": 0.05157736604907361, "grad_norm": 0.004218791611492634, "kl": 0.004020088817924261, "learning_rate": 4.3636899992071555e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 309 }, { "completion_length": 102.5, "epoch": 0.05174428309130362, "grad_norm": 0.00786628108471632, "kl": 0.008536279201507568, "learning_rate": 4.357862063693486e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 310 }, { "completion_length": 72.5, "epoch": 0.05191120013353363, "grad_norm": 0.004252138547599316, "kl": 0.003163908841088414, "learning_rate": 4.352011490716875e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 311 }, { "completion_length": 220.0, "epoch": 0.05207811717576365, "grad_norm": 0.00810963474214077, "kl": 0.007133485749363899, "learning_rate": 4.346138351564711e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 312 }, { "completion_length": 76.5, "epoch": 0.05224503421799366, "grad_norm": 0.004160956013947725, "kl": 0.00112797855399549, "learning_rate": 4.340242717799337e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 313 }, { "completion_length": 206.5, "epoch": 0.05241195126022367, "grad_norm": 0.005039164796471596, "kl": 0.005804196931421757, "learning_rate": 4.334324661257191e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 314 }, { "completion_length": 241.5, "epoch": 0.05257886830245368, "grad_norm": 0.0036636528093367815, "kl": 0.0026740310713648796, "learning_rate": 4.328384254047927e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 315 }, { "completion_length": 210.0, "epoch": 0.052745785344683695, "grad_norm": 0.0030909108463674784, "kl": 0.0012426788453012705, "learning_rate": 4.322421568553529e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 316 }, { "completion_length": 159.0, "epoch": 0.0529127023869137, "grad_norm": 0.9456648230552673, "kl": 0.014048774726688862, "learning_rate": 4.316436677427441e-06, "loss": 0.0006, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 317 }, { "completion_length": 256.0, "epoch": 0.053079619429143715, "grad_norm": 0.47334498167037964, "kl": 0.001022295211441815, "learning_rate": 4.3104296535936695e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 318 }, { "completion_length": 111.5, "epoch": 0.05324653647137373, "grad_norm": 0.7564533948898315, "kl": 0.006583939306437969, "learning_rate": 4.3044005702459055e-06, "loss": 0.0003, "reward": -0.031000003218650818, "reward_std": 0.3973940312862396, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.031000003218650818, "step": 319 }, { "completion_length": 256.0, "epoch": 0.053413453513603736, "grad_norm": 0.0028573765885084867, "kl": 0.0018674938473850489, "learning_rate": 4.2983495008466285e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 320 }, { "completion_length": 236.5, "epoch": 0.05358037055583375, "grad_norm": 0.010966010391712189, "kl": 0.008310123346745968, "learning_rate": 4.2922765191262075e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 321 }, { "completion_length": 256.0, "epoch": 0.05374728759806376, "grad_norm": 0.47935691475868225, "kl": 0.00490741990506649, "learning_rate": 4.286181699082008e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 322 }, { "completion_length": 256.0, "epoch": 0.05391420464029378, "grad_norm": 0.5778952240943909, "kl": 0.004295697435736656, "learning_rate": 4.280065114977492e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 323 }, { "completion_length": 256.0, "epoch": 0.05408112168252378, "grad_norm": 0.001877494272775948, "kl": 0.0007368560181930661, "learning_rate": 4.273926841341303e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 324 }, { "completion_length": 256.0, "epoch": 0.0542480387247538, "grad_norm": 0.3824060261249542, "kl": 0.0006189742125570774, "learning_rate": 4.267766952966369e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 325 }, { "completion_length": 179.0, "epoch": 0.05441495576698381, "grad_norm": 0.003620666917413473, "kl": 0.003809203626587987, "learning_rate": 4.261585524908987e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 326 }, { "completion_length": 230.0, "epoch": 0.05458187280921382, "grad_norm": 0.002703301142901182, "kl": 0.002424915786832571, "learning_rate": 4.255382632487907e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 327 }, { "completion_length": 157.0, "epoch": 0.05474878985144383, "grad_norm": 0.004891808144748211, "kl": 0.0036584073677659035, "learning_rate": 4.249158351283414e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 328 }, { "completion_length": 172.0, "epoch": 0.054915706893673845, "grad_norm": 0.005536617245525122, "kl": 0.006536508910357952, "learning_rate": 4.242912757136412e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 329 }, { "completion_length": 156.0, "epoch": 0.05508262393590386, "grad_norm": 0.004169250372797251, "kl": 0.008032742887735367, "learning_rate": 4.236645926147493e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 330 }, { "completion_length": 145.5, "epoch": 0.055249540978133865, "grad_norm": 0.0034251853358000517, "kl": 0.0036275191232562065, "learning_rate": 4.230357934676017e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 331 }, { "completion_length": 143.0, "epoch": 0.05541645802036388, "grad_norm": 0.0033021210692822933, "kl": 0.0028168954886496067, "learning_rate": 4.224048859339175e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 332 }, { "completion_length": 123.5, "epoch": 0.05558337506259389, "grad_norm": 0.024828089401125908, "kl": 0.018611863255500793, "learning_rate": 4.217718777011058e-06, "loss": 0.0007, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 333 }, { "completion_length": 256.0, "epoch": 0.0557502921048239, "grad_norm": 0.47796720266342163, "kl": 0.0038352536503225565, "learning_rate": 4.211367764821722e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 334 }, { "completion_length": 90.5, "epoch": 0.05591720914705391, "grad_norm": 0.003393622813746333, "kl": 0.002635728335008025, "learning_rate": 4.204995900156247e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 335 }, { "completion_length": 256.0, "epoch": 0.05608412618928393, "grad_norm": 0.506782054901123, "kl": 0.003959814086556435, "learning_rate": 4.198603260653792e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 336 }, { "completion_length": 231.0, "epoch": 0.05625104323151394, "grad_norm": 0.5356849431991577, "kl": 0.0070463502779603004, "learning_rate": 4.192189924206652e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 337 }, { "completion_length": 92.5, "epoch": 0.05641796027374395, "grad_norm": 0.005931315012276173, "kl": 0.003802720457315445, "learning_rate": 4.185755968959308e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 338 }, { "completion_length": 70.5, "epoch": 0.05658487731597396, "grad_norm": 0.008107639849185944, "kl": 0.01177256740629673, "learning_rate": 4.179301473307476e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 339 }, { "completion_length": 256.0, "epoch": 0.056751794358203975, "grad_norm": 0.5622881650924683, "kl": 0.005616649053990841, "learning_rate": 4.172826515897146e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 340 }, { "completion_length": 253.0, "epoch": 0.05691871140043398, "grad_norm": 0.002877097809687257, "kl": 0.0022432636469602585, "learning_rate": 4.166331175623631e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 341 }, { "completion_length": 160.0, "epoch": 0.057085628442663995, "grad_norm": 0.0025175949558615685, "kl": 0.0021109329536557198, "learning_rate": 4.159815531630604e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 342 }, { "completion_length": 110.5, "epoch": 0.05725254548489401, "grad_norm": 0.00542190158739686, "kl": 0.002032297197729349, "learning_rate": 4.15327966330913e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 343 }, { "completion_length": 218.0, "epoch": 0.05741946252712402, "grad_norm": 0.003057193011045456, "kl": 0.0037488488014787436, "learning_rate": 4.146723650296701e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 344 }, { "completion_length": 256.0, "epoch": 0.05758637956935403, "grad_norm": 0.0015634832670912147, "kl": 0.0005131916841492057, "learning_rate": 4.140147572476269e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 345 }, { "completion_length": 116.5, "epoch": 0.05775329661158404, "grad_norm": 0.00788839627057314, "kl": 0.009475115686655045, "learning_rate": 4.133551509975264e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 346 }, { "completion_length": 189.5, "epoch": 0.05792021365381406, "grad_norm": 0.527866780757904, "kl": 0.007002187427133322, "learning_rate": 4.126935543164628e-06, "loss": 0.0003, "reward": -0.16699999570846558, "reward_std": 0.5897270441055298, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16699999570846558, "step": 347 }, { "completion_length": 256.0, "epoch": 0.05808713069604406, "grad_norm": 0.0020951395854353905, "kl": 0.0015595173463225365, "learning_rate": 4.120299752657828e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 348 }, { "completion_length": 118.5, "epoch": 0.05825404773827408, "grad_norm": 0.6061116456985474, "kl": 0.005168904084712267, "learning_rate": 4.113644219309877e-06, "loss": 0.0002, "reward": 0.010499998927116394, "reward_std": 0.3387041389942169, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.010499998927116394, "step": 349 }, { "completion_length": 175.0, "epoch": 0.05842096478050409, "grad_norm": 0.004706359468400478, "kl": 0.005897793918848038, "learning_rate": 4.106969024216348e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 350 }, { "completion_length": 256.0, "epoch": 0.058587881822734104, "grad_norm": 0.0027129158843308687, "kl": 0.0017536095110699534, "learning_rate": 4.1002742487123896e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 351 }, { "completion_length": 98.0, "epoch": 0.05875479886496411, "grad_norm": 0.004158460535109043, "kl": 0.004504214972257614, "learning_rate": 4.093559974371725e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 352 }, { "completion_length": 256.0, "epoch": 0.058921715907194125, "grad_norm": 0.5336728096008301, "kl": 0.0062333084642887115, "learning_rate": 4.086826283005669e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 353 }, { "completion_length": 256.0, "epoch": 0.05908863294942414, "grad_norm": 0.003140585497021675, "kl": 0.003534170798957348, "learning_rate": 4.080073256662128e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 354 }, { "completion_length": 208.5, "epoch": 0.059255549991654145, "grad_norm": 0.009265796281397343, "kl": 0.009437667205929756, "learning_rate": 4.073300977624594e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 355 }, { "completion_length": 255.5, "epoch": 0.05942246703388416, "grad_norm": 0.4773617386817932, "kl": 0.006052106618881226, "learning_rate": 4.066509528411151e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 356 }, { "completion_length": 256.0, "epoch": 0.05958938407611417, "grad_norm": 0.0030096934642642736, "kl": 0.007244409993290901, "learning_rate": 4.059698991773466e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 357 }, { "completion_length": 256.0, "epoch": 0.059756301118344186, "grad_norm": 0.002705556806176901, "kl": 0.0047636074014008045, "learning_rate": 4.052869450695776e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 358 }, { "completion_length": 61.0, "epoch": 0.05992321816057419, "grad_norm": 0.009141207672655582, "kl": 0.022383172065019608, "learning_rate": 4.046020988393886e-06, "loss": 0.0009, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 359 }, { "completion_length": 256.0, "epoch": 0.06009013520280421, "grad_norm": 0.0029306025244295597, "kl": 0.00490144919604063, "learning_rate": 4.039153688314146e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 360 }, { "completion_length": 239.5, "epoch": 0.06025705224503422, "grad_norm": 0.0025303824804723263, "kl": 0.0018043061718344688, "learning_rate": 4.032267634132442e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 361 }, { "completion_length": 167.5, "epoch": 0.06042396928726423, "grad_norm": 0.006648820359259844, "kl": 0.013031981885433197, "learning_rate": 4.02536290975317e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 362 }, { "completion_length": 85.5, "epoch": 0.06059088632949424, "grad_norm": 0.0039238655008375645, "kl": 0.01414913684129715, "learning_rate": 4.018439599308217e-06, "loss": 0.0006, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 363 }, { "completion_length": 93.0, "epoch": 0.060757803371724255, "grad_norm": 0.0033583033364266157, "kl": 0.0012269504368305206, "learning_rate": 4.011497787155938e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 364 }, { "completion_length": 256.0, "epoch": 0.06092472041395426, "grad_norm": 0.48644334077835083, "kl": 0.006793852895498276, "learning_rate": 4.0045375578801216e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 365 }, { "completion_length": 76.5, "epoch": 0.061091637456184275, "grad_norm": 0.008035773411393166, "kl": 0.0044710165821015835, "learning_rate": 3.997558996288965e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 366 }, { "completion_length": 206.5, "epoch": 0.06125855449841429, "grad_norm": 0.0031689624302089214, "kl": 0.006396918557584286, "learning_rate": 3.9905621874140396e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 367 }, { "completion_length": 256.0, "epoch": 0.0614254715406443, "grad_norm": 0.00592978298664093, "kl": 0.008718734607100487, "learning_rate": 3.983547216509254e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 368 }, { "completion_length": 188.0, "epoch": 0.06159238858287431, "grad_norm": 0.003845668863505125, "kl": 0.0024752216413617134, "learning_rate": 3.976514169049814e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 369 }, { "completion_length": 104.5, "epoch": 0.06175930562510432, "grad_norm": 0.007666219025850296, "kl": 0.008180764503777027, "learning_rate": 3.969463130731183e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 370 }, { "completion_length": 253.5, "epoch": 0.06192622266733434, "grad_norm": 0.002188122132793069, "kl": 0.0015635810559615493, "learning_rate": 3.96239418746804e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 371 }, { "completion_length": 123.0, "epoch": 0.06209313970956434, "grad_norm": 0.004745712969452143, "kl": 0.00777268223464489, "learning_rate": 3.955307425393224e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 372 }, { "completion_length": 234.0, "epoch": 0.06226005675179436, "grad_norm": 0.001988980919122696, "kl": 0.0014141000574454665, "learning_rate": 3.948202930856697e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 373 }, { "completion_length": 109.5, "epoch": 0.06242697379402437, "grad_norm": 0.0067640491761267185, "kl": 0.01475146971642971, "learning_rate": 3.941080790424483e-06, "loss": 0.0006, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 374 }, { "completion_length": 157.5, "epoch": 0.06259389083625438, "grad_norm": 0.05447385460138321, "kl": 0.020113306120038033, "learning_rate": 3.933941090877615e-06, "loss": 0.0008, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 375 }, { "completion_length": 157.5, "epoch": 0.0627608078784844, "grad_norm": 0.0034481449984014034, "kl": 0.0008895009523257613, "learning_rate": 3.92678391921108e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 376 }, { "completion_length": 133.5, "epoch": 0.06292772492071441, "grad_norm": 0.004916926380246878, "kl": 0.009922463446855545, "learning_rate": 3.9196093626327535e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 377 }, { "completion_length": 37.5, "epoch": 0.06309464196294441, "grad_norm": 0.006436909083276987, "kl": 0.025706635788083076, "learning_rate": 3.912417508562345e-06, "loss": 0.001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 378 }, { "completion_length": 241.0, "epoch": 0.06326155900517443, "grad_norm": 0.003208921756595373, "kl": 0.0062410952523350716, "learning_rate": 3.905208444630326e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 379 }, { "completion_length": 117.5, "epoch": 0.06342847604740444, "grad_norm": 0.007015647832304239, "kl": 0.008801888674497604, "learning_rate": 3.897982258676867e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 380 }, { "completion_length": 247.0, "epoch": 0.06359539308963445, "grad_norm": 0.4503168761730194, "kl": 0.002253452781587839, "learning_rate": 3.890739038750763e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 381 }, { "completion_length": 89.5, "epoch": 0.06376231013186447, "grad_norm": 0.005086138378828764, "kl": 0.001769829774275422, "learning_rate": 3.88347887310836e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 382 }, { "completion_length": 149.0, "epoch": 0.06392922717409448, "grad_norm": 0.007208625786006451, "kl": 0.011640203185379505, "learning_rate": 3.876201850212489e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 383 }, { "completion_length": 161.0, "epoch": 0.06409614421632448, "grad_norm": 0.8889212608337402, "kl": 0.01527651771903038, "learning_rate": 3.868908058731376e-06, "loss": 0.0006, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 384 }, { "completion_length": 213.0, "epoch": 0.0642630612585545, "grad_norm": 0.5385991930961609, "kl": 0.006658099126070738, "learning_rate": 3.861597587537568e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 385 }, { "completion_length": 256.0, "epoch": 0.06442997830078451, "grad_norm": 0.45821189880371094, "kl": 0.004864447750151157, "learning_rate": 3.85427052570685e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 386 }, { "completion_length": 197.5, "epoch": 0.06459689534301452, "grad_norm": 0.5216923952102661, "kl": 0.0066991569474339485, "learning_rate": 3.846926962517158e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 387 }, { "completion_length": 223.0, "epoch": 0.06476381238524453, "grad_norm": 0.0034660983365029097, "kl": 0.0013932466972619295, "learning_rate": 3.839566987447492e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 388 }, { "completion_length": 232.0, "epoch": 0.06493072942747455, "grad_norm": 0.0025906520895659924, "kl": 0.0014679576270282269, "learning_rate": 3.832190690176825e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 389 }, { "completion_length": 146.5, "epoch": 0.06509764646970456, "grad_norm": 0.6655511856079102, "kl": 0.007280864752829075, "learning_rate": 3.824798160583012e-06, "loss": 0.0003, "reward": -0.16200000047683716, "reward_std": 0.5826559662818909, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16200000047683716, "step": 390 }, { "completion_length": 152.0, "epoch": 0.06526456351193456, "grad_norm": 0.006808367092162371, "kl": 0.012140478938817978, "learning_rate": 3.817389488741694e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 391 }, { "completion_length": 205.0, "epoch": 0.06543148055416458, "grad_norm": 0.002851013094186783, "kl": 0.0020360774360597134, "learning_rate": 3.8099647649251984e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 392 }, { "completion_length": 256.0, "epoch": 0.06559839759639459, "grad_norm": 0.002986885141581297, "kl": 0.0034806416369974613, "learning_rate": 3.802524079601442e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 393 }, { "completion_length": 134.5, "epoch": 0.0657653146386246, "grad_norm": 0.004309982992708683, "kl": 0.009969418868422508, "learning_rate": 3.795067523432826e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 394 }, { "completion_length": 255.0, "epoch": 0.06593223168085462, "grad_norm": 0.0028138558845967054, "kl": 0.0056825182400643826, "learning_rate": 3.787595187275136e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 395 }, { "completion_length": 58.5, "epoch": 0.06609914872308463, "grad_norm": 0.008521797135472298, "kl": 0.0068394895642995834, "learning_rate": 3.780107162176429e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 396 }, { "completion_length": 179.5, "epoch": 0.06626606576531464, "grad_norm": 0.005005613900721073, "kl": 0.00859293807297945, "learning_rate": 3.772603539375929e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 397 }, { "completion_length": 253.5, "epoch": 0.06643298280754464, "grad_norm": 0.0024087412748485804, "kl": 0.0014911536127328873, "learning_rate": 3.7650844103029093e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 398 }, { "completion_length": 89.5, "epoch": 0.06659989984977466, "grad_norm": 0.009002960287034512, "kl": 0.020543519407510757, "learning_rate": 3.7575498665755884e-06, "loss": 0.0008, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 399 }, { "completion_length": 79.0, "epoch": 0.06676681689200467, "grad_norm": 0.005351222585886717, "kl": 0.006398818455636501, "learning_rate": 3.7500000000000005e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 400 }, { "completion_length": 162.5, "epoch": 0.06693373393423468, "grad_norm": 0.0037720431573688984, "kl": 0.004047206602990627, "learning_rate": 3.742434902568889e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 401 }, { "completion_length": 165.5, "epoch": 0.0671006509764647, "grad_norm": 0.004343738779425621, "kl": 0.0045508770272135735, "learning_rate": 3.7348546664605777e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 402 }, { "completion_length": 256.0, "epoch": 0.06726756801869471, "grad_norm": 0.002527782227844, "kl": 0.0033916987013071775, "learning_rate": 3.7272593840378526e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 403 }, { "completion_length": 63.0, "epoch": 0.06743448506092473, "grad_norm": 0.007267891429364681, "kl": 0.00811045803129673, "learning_rate": 3.7196491478468322e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 404 }, { "completion_length": 256.0, "epoch": 0.06760140210315473, "grad_norm": 0.0068841855973005295, "kl": 0.00899864174425602, "learning_rate": 3.7120240506158433e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 405 }, { "completion_length": 256.0, "epoch": 0.06776831914538474, "grad_norm": 0.00266668270342052, "kl": 0.005115564446896315, "learning_rate": 3.7043841852542884e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 406 }, { "completion_length": 256.0, "epoch": 0.06793523618761475, "grad_norm": 0.0038791887927800417, "kl": 0.0041780476458370686, "learning_rate": 3.6967296448515176e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 407 }, { "completion_length": 245.0, "epoch": 0.06810215322984477, "grad_norm": 0.5836929082870483, "kl": 0.0022355513647198677, "learning_rate": 3.689060522675689e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 408 }, { "completion_length": 182.0, "epoch": 0.06826907027207478, "grad_norm": 0.005977727472782135, "kl": 0.010167822241783142, "learning_rate": 3.6813769121726356e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 409 }, { "completion_length": 129.5, "epoch": 0.0684359873143048, "grad_norm": 0.0038507983554154634, "kl": 0.002051785122603178, "learning_rate": 3.6736789069647273e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 410 }, { "completion_length": 183.0, "epoch": 0.06860290435653481, "grad_norm": 0.0029404833912849426, "kl": 0.003210225608199835, "learning_rate": 3.6659666008497287e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 411 }, { "completion_length": 94.0, "epoch": 0.06876982139876481, "grad_norm": 0.0043430631048977375, "kl": 0.01032315194606781, "learning_rate": 3.658240087799655e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 412 }, { "completion_length": 96.5, "epoch": 0.06893673844099482, "grad_norm": 0.0057783955708146095, "kl": 0.01503991149365902, "learning_rate": 3.6504994619596295e-06, "loss": 0.0006, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 413 }, { "completion_length": 205.5, "epoch": 0.06910365548322484, "grad_norm": 0.46108299493789673, "kl": 0.0033161365427076817, "learning_rate": 3.642744817646736e-06, "loss": 0.0001, "reward": -0.132999986410141, "reward_std": 0.5416437387466431, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.132999986410141, "step": 414 }, { "completion_length": 237.5, "epoch": 0.06927057252545485, "grad_norm": 0.5405523180961609, "kl": 0.01245264895260334, "learning_rate": 3.634976249348867e-06, "loss": 0.0005, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 415 }, { "completion_length": 161.5, "epoch": 0.06943748956768486, "grad_norm": 0.01344638504087925, "kl": 0.020867502316832542, "learning_rate": 3.627193851723577e-06, "loss": 0.0008, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 416 }, { "completion_length": 174.0, "epoch": 0.06960440660991488, "grad_norm": 0.6875776052474976, "kl": 0.005284446757286787, "learning_rate": 3.6193977195969243e-06, "loss": 0.0002, "reward": 0.0005000010132789612, "reward_std": 0.17606958746910095, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0005000010132789612, "step": 417 }, { "completion_length": 256.0, "epoch": 0.06977132365214489, "grad_norm": 0.004543864168226719, "kl": 0.0015383971622213721, "learning_rate": 3.611587947962319e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 418 }, { "completion_length": 138.0, "epoch": 0.06993824069437489, "grad_norm": 1.3993828296661377, "kl": 0.035674117505550385, "learning_rate": 3.6037646319793635e-06, "loss": 0.0014, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 419 }, { "completion_length": 103.0, "epoch": 0.0701051577366049, "grad_norm": 0.6647042036056519, "kl": 0.008622417226433754, "learning_rate": 3.595927866972694e-06, "loss": 0.0003, "reward": 0.31299999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31299999356269836, "step": 420 }, { "completion_length": 256.0, "epoch": 0.07027207477883492, "grad_norm": 0.002625485649332404, "kl": 0.000989075400866568, "learning_rate": 3.5880777484308193e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 421 }, { "completion_length": 199.5, "epoch": 0.07043899182106493, "grad_norm": 0.003418461186811328, "kl": 0.002218523994088173, "learning_rate": 3.5802143720049565e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 422 }, { "completion_length": 256.0, "epoch": 0.07060590886329494, "grad_norm": 0.0028757541440427303, "kl": 0.003217359073460102, "learning_rate": 3.5723378335078653e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 423 }, { "completion_length": 256.0, "epoch": 0.07077282590552496, "grad_norm": 0.5442543625831604, "kl": 0.0075112697668373585, "learning_rate": 3.564448228912682e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 424 }, { "completion_length": 256.0, "epoch": 0.07093974294775497, "grad_norm": 0.008780797012150288, "kl": 0.008342335000634193, "learning_rate": 3.556545654351749e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 425 }, { "completion_length": 93.0, "epoch": 0.07110665998998497, "grad_norm": 0.004694369155913591, "kl": 0.006499157287180424, "learning_rate": 3.5486302061154433e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 426 }, { "completion_length": 63.5, "epoch": 0.07127357703221499, "grad_norm": 0.004946056287735701, "kl": 0.012154093012213707, "learning_rate": 3.5407019806510035e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 427 }, { "completion_length": 256.0, "epoch": 0.071440494074445, "grad_norm": 0.003223523497581482, "kl": 0.003974433988332748, "learning_rate": 3.532761074561355e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 428 }, { "completion_length": 247.5, "epoch": 0.07160741111667501, "grad_norm": 0.0030117055866867304, "kl": 0.004316904116421938, "learning_rate": 3.524807584603932e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 429 }, { "completion_length": 219.0, "epoch": 0.07177432815890503, "grad_norm": 0.0038340440951287746, "kl": 0.007941951975226402, "learning_rate": 3.516841607689501e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 430 }, { "completion_length": 256.0, "epoch": 0.07194124520113504, "grad_norm": 0.004773168358951807, "kl": 0.006558659020811319, "learning_rate": 3.5088632408809757e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 431 }, { "completion_length": 236.0, "epoch": 0.07210816224336505, "grad_norm": 0.004651021212339401, "kl": 0.008961416780948639, "learning_rate": 3.5008725813922383e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 432 }, { "completion_length": 149.0, "epoch": 0.07227507928559505, "grad_norm": 0.004056077916175127, "kl": 0.0032332688570022583, "learning_rate": 3.4928697265869516e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 433 }, { "completion_length": 171.5, "epoch": 0.07244199632782507, "grad_norm": 0.0033490906935185194, "kl": 0.0012542055919766426, "learning_rate": 3.4848547739773782e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 434 }, { "completion_length": 236.5, "epoch": 0.07260891337005508, "grad_norm": 0.0026693614199757576, "kl": 0.00452476367354393, "learning_rate": 3.476827821223184e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 435 }, { "completion_length": 164.5, "epoch": 0.0727758304122851, "grad_norm": 0.004354130942374468, "kl": 0.0028194452170282602, "learning_rate": 3.4687889661302577e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 436 }, { "completion_length": 256.0, "epoch": 0.07294274745451511, "grad_norm": 0.4462840259075165, "kl": 0.008473025634884834, "learning_rate": 3.460738306649509e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 437 }, { "completion_length": 242.5, "epoch": 0.07310966449674512, "grad_norm": 0.0030239997431635857, "kl": 0.0024750949814915657, "learning_rate": 3.452675940875686e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 438 }, { "completion_length": 195.0, "epoch": 0.07327658153897514, "grad_norm": 0.5708684325218201, "kl": 0.007269646972417831, "learning_rate": 3.4446019670461684e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 439 }, { "completion_length": 256.0, "epoch": 0.07344349858120514, "grad_norm": 0.0031012417748570442, "kl": 0.0010529134888201952, "learning_rate": 3.436516483539781e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 440 }, { "completion_length": 155.0, "epoch": 0.07361041562343515, "grad_norm": 0.0031471047550439835, "kl": 0.0010129029396921396, "learning_rate": 3.4284195888755877e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 441 }, { "completion_length": 57.0, "epoch": 0.07377733266566516, "grad_norm": 0.004951135255396366, "kl": 0.006312179379165173, "learning_rate": 3.4203113817116955e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 442 }, { "completion_length": 254.0, "epoch": 0.07394424970789518, "grad_norm": 0.0030335085466504097, "kl": 0.006209371145814657, "learning_rate": 3.412191960844049e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 443 }, { "completion_length": 151.5, "epoch": 0.07411116675012519, "grad_norm": 0.00420411815866828, "kl": 0.0037854118272662163, "learning_rate": 3.4040614252052305e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 444 }, { "completion_length": 173.0, "epoch": 0.0742780837923552, "grad_norm": 0.005495717283338308, "kl": 0.0023110085166990757, "learning_rate": 3.39591987386325e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 445 }, { "completion_length": 155.5, "epoch": 0.07444500083458522, "grad_norm": 0.004874846898019314, "kl": 0.010738937184214592, "learning_rate": 3.387767406020343e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 446 }, { "completion_length": 256.0, "epoch": 0.07461191787681522, "grad_norm": 0.004281467292457819, "kl": 0.007576585281640291, "learning_rate": 3.3796041210117545e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 447 }, { "completion_length": 204.0, "epoch": 0.07477883491904523, "grad_norm": 0.0032856480684131384, "kl": 0.003489980474114418, "learning_rate": 3.3714301183045382e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 448 }, { "completion_length": 255.5, "epoch": 0.07494575196127524, "grad_norm": 0.6564204692840576, "kl": 0.0057500554248690605, "learning_rate": 3.3632454974963368e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 449 }, { "completion_length": 256.0, "epoch": 0.07511266900350526, "grad_norm": 0.001835031434893608, "kl": 0.0005812081508338451, "learning_rate": 3.3550503583141726e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 450 }, { "completion_length": 176.0, "epoch": 0.07527958604573527, "grad_norm": 0.7408838868141174, "kl": 0.014697302132844925, "learning_rate": 3.346844800613229e-06, "loss": 0.0006, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 451 }, { "completion_length": 256.0, "epoch": 0.07544650308796529, "grad_norm": 0.002940148115158081, "kl": 0.006415344309061766, "learning_rate": 3.338628924375638e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 452 }, { "completion_length": 158.5, "epoch": 0.0756134201301953, "grad_norm": 0.003837752854451537, "kl": 0.0091791283339262, "learning_rate": 3.3304028297092583e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 453 }, { "completion_length": 249.5, "epoch": 0.0757803371724253, "grad_norm": 0.0024029077030718327, "kl": 0.0009061304735951126, "learning_rate": 3.3221666168464584e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 454 }, { "completion_length": 256.0, "epoch": 0.07594725421465531, "grad_norm": 0.0021282413508743048, "kl": 0.0007478459738194942, "learning_rate": 3.313920386142892e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 455 }, { "completion_length": 126.0, "epoch": 0.07611417125688533, "grad_norm": 0.0037257184740155935, "kl": 0.001630447688512504, "learning_rate": 3.3056642380762783e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 456 }, { "completion_length": 164.5, "epoch": 0.07628108829911534, "grad_norm": 0.005106627009809017, "kl": 0.008369042538106441, "learning_rate": 3.2973982732451753e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 457 }, { "completion_length": 115.0, "epoch": 0.07644800534134535, "grad_norm": 0.00416791345924139, "kl": 0.0030869594775140285, "learning_rate": 3.2891225923677565e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 458 }, { "completion_length": 206.5, "epoch": 0.07661492238357537, "grad_norm": 0.008169974200427532, "kl": 0.013879917562007904, "learning_rate": 3.280837296280582e-06, "loss": 0.0006, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 459 }, { "completion_length": 205.0, "epoch": 0.07678183942580538, "grad_norm": 0.0033063869923353195, "kl": 0.00390999810770154, "learning_rate": 3.272542485937369e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 460 }, { "completion_length": 48.5, "epoch": 0.07694875646803538, "grad_norm": 0.03453482687473297, "kl": 0.051553212106227875, "learning_rate": 3.2642382624077647e-06, "loss": 0.0021, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 461 }, { "completion_length": 167.0, "epoch": 0.0771156735102654, "grad_norm": 0.005188738461583853, "kl": 0.0036034665536135435, "learning_rate": 3.2559247268761117e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 462 }, { "completion_length": 97.5, "epoch": 0.07728259055249541, "grad_norm": 0.6110023856163025, "kl": 0.003773051779717207, "learning_rate": 3.247601980640217e-06, "loss": 0.0002, "reward": 0.12800000607967377, "reward_std": 0.1725340634584427, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12800000607967377, "step": 463 }, { "completion_length": 186.0, "epoch": 0.07744950759472542, "grad_norm": 0.003268215572461486, "kl": 0.0022569475695490837, "learning_rate": 3.2392701251101172e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 464 }, { "completion_length": 93.0, "epoch": 0.07761642463695544, "grad_norm": 0.003732310840860009, "kl": 0.007325306534767151, "learning_rate": 3.230929261806842e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 465 }, { "completion_length": 144.0, "epoch": 0.07778334167918545, "grad_norm": 0.005549812689423561, "kl": 0.012345299124717712, "learning_rate": 3.222579492361179e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 466 }, { "completion_length": 174.5, "epoch": 0.07795025872141545, "grad_norm": 0.6120846271514893, "kl": 0.011977024376392365, "learning_rate": 3.214220918512434e-06, "loss": 0.0005, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 467 }, { "completion_length": 256.0, "epoch": 0.07811717576364546, "grad_norm": 0.0023493580520153046, "kl": 0.0008547472534701228, "learning_rate": 3.205853642107192e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 468 }, { "completion_length": 256.0, "epoch": 0.07828409280587548, "grad_norm": 0.0016203024424612522, "kl": 0.0005688403034582734, "learning_rate": 3.1974777650980737e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 469 }, { "completion_length": 256.0, "epoch": 0.07845100984810549, "grad_norm": 0.0030670296400785446, "kl": 0.005791377276182175, "learning_rate": 3.189093389542498e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 470 }, { "completion_length": 256.0, "epoch": 0.0786179268903355, "grad_norm": 0.0027270414866507053, "kl": 0.005058842711150646, "learning_rate": 3.180700617601436e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 471 }, { "completion_length": 252.5, "epoch": 0.07878484393256552, "grad_norm": 0.004971963819116354, "kl": 0.010813718661665916, "learning_rate": 3.1722995515381644e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 472 }, { "completion_length": 212.5, "epoch": 0.07895176097479553, "grad_norm": 0.008081819862127304, "kl": 0.009239361621439457, "learning_rate": 3.1638902937170224e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 473 }, { "completion_length": 256.0, "epoch": 0.07911867801702553, "grad_norm": 0.0021279898937791586, "kl": 0.0015924839535728097, "learning_rate": 3.155472946602162e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 474 }, { "completion_length": 182.0, "epoch": 0.07928559505925555, "grad_norm": 0.7744367122650146, "kl": 0.011277498677372932, "learning_rate": 3.147047612756302e-06, "loss": 0.0005, "reward": -0.0040000006556510925, "reward_std": 0.18243356049060822, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0040000006556510925, "step": 475 }, { "completion_length": 256.0, "epoch": 0.07945251210148556, "grad_norm": 0.003445917973294854, "kl": 0.001701794215478003, "learning_rate": 3.1386143948394764e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 476 }, { "completion_length": 180.0, "epoch": 0.07961942914371557, "grad_norm": 0.0027461673598736525, "kl": 0.0015370894689112902, "learning_rate": 3.130173395607785e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 477 }, { "completion_length": 227.5, "epoch": 0.07978634618594559, "grad_norm": 0.0024700446520000696, "kl": 0.003734992817044258, "learning_rate": 3.121724717912138e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 478 }, { "completion_length": 83.0, "epoch": 0.0799532632281756, "grad_norm": 0.0034396243281662464, "kl": 0.00546608678996563, "learning_rate": 3.1132684646970068e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 479 }, { "completion_length": 256.0, "epoch": 0.08012018027040561, "grad_norm": 0.002586987102404237, "kl": 0.00100737065076828, "learning_rate": 3.1048047389991693e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 480 }, { "completion_length": 109.0, "epoch": 0.08028709731263561, "grad_norm": 0.0034532060381025076, "kl": 0.0010555561166256666, "learning_rate": 3.0963336439464527e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 481 }, { "completion_length": 159.0, "epoch": 0.08045401435486563, "grad_norm": 0.011007145047187805, "kl": 0.017930401489138603, "learning_rate": 3.087855282756475e-06, "loss": 0.0007, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 482 }, { "completion_length": 82.5, "epoch": 0.08062093139709564, "grad_norm": 0.0050530945882201195, "kl": 0.0012187592219561338, "learning_rate": 3.079369758735393e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 483 }, { "completion_length": 256.0, "epoch": 0.08078784843932565, "grad_norm": 0.004584734793752432, "kl": 0.004924206528812647, "learning_rate": 3.0708771752766397e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 484 }, { "completion_length": 53.0, "epoch": 0.08095476548155567, "grad_norm": 0.006897653918713331, "kl": 0.01568857952952385, "learning_rate": 3.062377635859663e-06, "loss": 0.0006, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 485 }, { "completion_length": 244.0, "epoch": 0.08112168252378568, "grad_norm": 0.003658297238871455, "kl": 0.001440483029000461, "learning_rate": 3.053871244048669e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 486 }, { "completion_length": 216.0, "epoch": 0.0812885995660157, "grad_norm": 0.004017638973891735, "kl": 0.005744835361838341, "learning_rate": 3.045358103491357e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 487 }, { "completion_length": 184.5, "epoch": 0.0814555166082457, "grad_norm": 0.0029865752439945936, "kl": 0.00407864386215806, "learning_rate": 3.0368383179176584e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 488 }, { "completion_length": 256.0, "epoch": 0.08162243365047571, "grad_norm": 0.0022525957319885492, "kl": 0.0007382537005469203, "learning_rate": 3.0283119911384724e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 489 }, { "completion_length": 256.0, "epoch": 0.08178935069270572, "grad_norm": 0.48885685205459595, "kl": 0.002361223567277193, "learning_rate": 3.019779227044398e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 490 }, { "completion_length": 256.0, "epoch": 0.08195626773493574, "grad_norm": 0.003155249636620283, "kl": 0.0032425224781036377, "learning_rate": 3.0112401296044756e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 491 }, { "completion_length": 153.5, "epoch": 0.08212318477716575, "grad_norm": 0.8349368572235107, "kl": 0.007628859952092171, "learning_rate": 3.002694802864912e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 492 }, { "completion_length": 200.5, "epoch": 0.08229010181939576, "grad_norm": 0.003111280035227537, "kl": 0.00525133591145277, "learning_rate": 2.9941433509478157e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 493 }, { "completion_length": 136.5, "epoch": 0.08245701886162578, "grad_norm": 0.01320971455425024, "kl": 0.017018627375364304, "learning_rate": 2.98558587804993e-06, "loss": 0.0007, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 494 }, { "completion_length": 256.0, "epoch": 0.08262393590385578, "grad_norm": 0.002314383629709482, "kl": 0.0008120458805933595, "learning_rate": 2.9770224884413625e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 495 }, { "completion_length": 224.0, "epoch": 0.08279085294608579, "grad_norm": 0.49683764576911926, "kl": 0.006487313657999039, "learning_rate": 2.9684532864643123e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 496 }, { "completion_length": 79.5, "epoch": 0.0829577699883158, "grad_norm": 0.978888988494873, "kl": 0.017479021102190018, "learning_rate": 2.9598783765318005e-06, "loss": 0.0007, "reward": 0.12049999833106995, "reward_std": 0.18314066529273987, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.12049999833106995, "step": 497 }, { "completion_length": 207.5, "epoch": 0.08312468703054582, "grad_norm": 0.5755419135093689, "kl": 0.006486868020147085, "learning_rate": 2.9512978631264006e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 498 }, { "completion_length": 217.5, "epoch": 0.08329160407277583, "grad_norm": 0.7735682129859924, "kl": 0.00708380714058876, "learning_rate": 2.942711850798959e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 499 }, { "completion_length": 217.0, "epoch": 0.08345852111500585, "grad_norm": 0.009821887128055096, "kl": 0.009430669248104095, "learning_rate": 2.9341204441673267e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 500 }, { "completion_length": 81.0, "epoch": 0.08362543815723586, "grad_norm": 0.0055695087648928165, "kl": 0.00434044748544693, "learning_rate": 2.9255237479150815e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 501 }, { "completion_length": 104.5, "epoch": 0.08379235519946586, "grad_norm": 0.013159430585801601, "kl": 0.01600869931280613, "learning_rate": 2.9169218667902562e-06, "loss": 0.0006, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 502 }, { "completion_length": 221.0, "epoch": 0.08395927224169587, "grad_norm": 0.004928835667669773, "kl": 0.007577899843454361, "learning_rate": 2.908314905604056e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 503 }, { "completion_length": 166.5, "epoch": 0.08412618928392589, "grad_norm": 0.0032661915756762028, "kl": 0.0033788690343499184, "learning_rate": 2.8997029692295875e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 504 }, { "completion_length": 145.5, "epoch": 0.0842931063261559, "grad_norm": 0.6954944729804993, "kl": 0.00877952016890049, "learning_rate": 2.8910861626005774e-06, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 505 }, { "completion_length": 220.5, "epoch": 0.08446002336838591, "grad_norm": 0.5234226584434509, "kl": 0.008442236110568047, "learning_rate": 2.8824645907100957e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 506 }, { "completion_length": 115.0, "epoch": 0.08462694041061593, "grad_norm": 0.006071748211979866, "kl": 0.005778242833912373, "learning_rate": 2.8738383586092745e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 507 }, { "completion_length": 256.0, "epoch": 0.08479385745284594, "grad_norm": 0.00351152615621686, "kl": 0.006955133285373449, "learning_rate": 2.8652075714060296e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 508 }, { "completion_length": 180.0, "epoch": 0.08496077449507594, "grad_norm": 0.6544327139854431, "kl": 0.010870318859815598, "learning_rate": 2.8565723342637797e-06, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 509 }, { "completion_length": 256.0, "epoch": 0.08512769153730596, "grad_norm": 0.005275247152894735, "kl": 0.007907304912805557, "learning_rate": 2.847932752400164e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 510 }, { "completion_length": 179.5, "epoch": 0.08529460857953597, "grad_norm": 0.005482695996761322, "kl": 0.011352375149726868, "learning_rate": 2.8392889310857615e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 511 }, { "completion_length": 240.0, "epoch": 0.08546152562176598, "grad_norm": 0.0035331028047949076, "kl": 0.005988858174532652, "learning_rate": 2.8306409756428067e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 512 }, { "completion_length": 123.5, "epoch": 0.085628442663996, "grad_norm": 0.0038066289853304625, "kl": 0.0055668228305876255, "learning_rate": 2.8219889914439073e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 513 }, { "completion_length": 193.0, "epoch": 0.08579535970622601, "grad_norm": 0.002871275180950761, "kl": 0.005563478916883469, "learning_rate": 2.813333083910761e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 514 }, { "completion_length": 205.0, "epoch": 0.08596227674845602, "grad_norm": 0.0029429288115352392, "kl": 0.001173964235931635, "learning_rate": 2.804673358512869e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 515 }, { "completion_length": 86.5, "epoch": 0.08612919379068602, "grad_norm": 0.00788928847759962, "kl": 0.0037452911492437124, "learning_rate": 2.7960099207662535e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 516 }, { "completion_length": 256.0, "epoch": 0.08629611083291604, "grad_norm": 0.0021909947972744703, "kl": 0.0007361274911090732, "learning_rate": 2.7873428762321667e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 517 }, { "completion_length": 172.5, "epoch": 0.08646302787514605, "grad_norm": 0.004803095478564501, "kl": 0.0030090666841715574, "learning_rate": 2.778672330515814e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 518 }, { "completion_length": 119.5, "epoch": 0.08662994491737606, "grad_norm": 0.006903729401528835, "kl": 0.006199518218636513, "learning_rate": 2.769998389265057e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 519 }, { "completion_length": 256.0, "epoch": 0.08679686195960608, "grad_norm": 0.4873637557029724, "kl": 0.0044858711771667, "learning_rate": 2.761321158169134e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 520 }, { "completion_length": 176.0, "epoch": 0.08696377900183609, "grad_norm": 0.6946375966072083, "kl": 0.00945674255490303, "learning_rate": 2.752640742957366e-06, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 521 }, { "completion_length": 80.5, "epoch": 0.0871306960440661, "grad_norm": 0.003866190556436777, "kl": 0.0009174761362373829, "learning_rate": 2.743957249397874e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 522 }, { "completion_length": 136.0, "epoch": 0.0872976130862961, "grad_norm": 0.0030815822537988424, "kl": 0.004138518590480089, "learning_rate": 2.7352707832962865e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 523 }, { "completion_length": 256.0, "epoch": 0.08746453012852612, "grad_norm": 0.0025072998832911253, "kl": 0.0008064579451456666, "learning_rate": 2.726581450494451e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 524 }, { "completion_length": 256.0, "epoch": 0.08763144717075613, "grad_norm": 0.0036690179258584976, "kl": 0.006884913891553879, "learning_rate": 2.717889356869146e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 525 }, { "completion_length": 97.5, "epoch": 0.08779836421298615, "grad_norm": 0.00998605601489544, "kl": 0.021080952137708664, "learning_rate": 2.70919460833079e-06, "loss": 0.0008, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 526 }, { "completion_length": 254.0, "epoch": 0.08796528125521616, "grad_norm": 0.002963931765407324, "kl": 0.0013856296427547932, "learning_rate": 2.700497310822147e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 527 }, { "completion_length": 256.0, "epoch": 0.08813219829744617, "grad_norm": 0.0037498208694159985, "kl": 0.005031573586165905, "learning_rate": 2.6917975703170466e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 528 }, { "completion_length": 232.5, "epoch": 0.08829911533967619, "grad_norm": 0.003729135263711214, "kl": 0.005832407623529434, "learning_rate": 2.6830954928190795e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 529 }, { "completion_length": 239.0, "epoch": 0.08846603238190619, "grad_norm": 0.003873983398079872, "kl": 0.0039303917437791824, "learning_rate": 2.6743911843603134e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 530 }, { "completion_length": 224.5, "epoch": 0.0886329494241362, "grad_norm": 0.002249809680506587, "kl": 0.002112780464813113, "learning_rate": 2.6656847510000013e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 531 }, { "completion_length": 182.5, "epoch": 0.08879986646636621, "grad_norm": 0.02570975571870804, "kl": 0.011340406723320484, "learning_rate": 2.6569762988232838e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 532 }, { "completion_length": 53.5, "epoch": 0.08896678350859623, "grad_norm": 0.02127251960337162, "kl": 0.04005546122789383, "learning_rate": 2.6482659339399047e-06, "loss": 0.0016, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 533 }, { "completion_length": 249.5, "epoch": 0.08913370055082624, "grad_norm": 0.010571831837296486, "kl": 0.012272438034415245, "learning_rate": 2.63955376248291e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 534 }, { "completion_length": 236.5, "epoch": 0.08930061759305626, "grad_norm": 0.45433419942855835, "kl": 0.007260519079864025, "learning_rate": 2.6308398906073603e-06, "loss": 0.0003, "reward": -0.40450000762939453, "reward_std": 0.9256027936935425, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.40450000762939453, "step": 535 }, { "completion_length": 256.0, "epoch": 0.08946753463528627, "grad_norm": 0.0031795583199709654, "kl": 0.0030776518397033215, "learning_rate": 2.6221244244890336e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 536 }, { "completion_length": 256.0, "epoch": 0.08963445167751627, "grad_norm": 0.005588881205767393, "kl": 0.004720202647149563, "learning_rate": 2.613407470323134e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 537 }, { "completion_length": 220.5, "epoch": 0.08980136871974628, "grad_norm": 0.0037963271606713533, "kl": 0.004590251483023167, "learning_rate": 2.604689134322999e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 538 }, { "completion_length": 185.0, "epoch": 0.0899682857619763, "grad_norm": 0.00245186360552907, "kl": 0.0020502228289842606, "learning_rate": 2.5959695227188e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 539 }, { "completion_length": 240.0, "epoch": 0.09013520280420631, "grad_norm": 0.0028924455400556326, "kl": 0.005683612544089556, "learning_rate": 2.587248741756253e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 540 }, { "completion_length": 256.0, "epoch": 0.09030211984643632, "grad_norm": 0.00761244585737586, "kl": 0.002345051383599639, "learning_rate": 2.578526897695321e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 541 }, { "completion_length": 247.0, "epoch": 0.09046903688866634, "grad_norm": 0.006318111438304186, "kl": 0.007166513241827488, "learning_rate": 2.569804096808923e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 542 }, { "completion_length": 75.5, "epoch": 0.09063595393089635, "grad_norm": 0.010733108036220074, "kl": 0.017714977264404297, "learning_rate": 2.5610804453816333e-06, "loss": 0.0007, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 543 }, { "completion_length": 256.0, "epoch": 0.09080287097312635, "grad_norm": 0.5314186811447144, "kl": 0.008886504918336868, "learning_rate": 2.5523560497083927e-06, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 544 }, { "completion_length": 256.0, "epoch": 0.09096978801535636, "grad_norm": 0.5008943676948547, "kl": 0.006337753497064114, "learning_rate": 2.543631016093209e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 545 }, { "completion_length": 237.0, "epoch": 0.09113670505758638, "grad_norm": 0.5126908421516418, "kl": 0.0012720136437565088, "learning_rate": 2.5349054508478636e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 546 }, { "completion_length": 129.0, "epoch": 0.09130362209981639, "grad_norm": 0.0046554431319236755, "kl": 0.0035154586657881737, "learning_rate": 2.526179460290615e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 547 }, { "completion_length": 54.5, "epoch": 0.0914705391420464, "grad_norm": 0.036062028259038925, "kl": 0.038960050791502, "learning_rate": 2.517453150744904e-06, "loss": 0.0016, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 548 }, { "completion_length": 64.5, "epoch": 0.09163745618427642, "grad_norm": 0.00512732332572341, "kl": 0.005088353529572487, "learning_rate": 2.5087266285380597e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 549 }, { "completion_length": 256.0, "epoch": 0.09180437322650642, "grad_norm": 0.0018525129416957498, "kl": 0.0011197177227586508, "learning_rate": 2.5e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 550 }, { "completion_length": 240.0, "epoch": 0.09197129026873643, "grad_norm": 0.002617382910102606, "kl": 0.001130789634771645, "learning_rate": 2.4912733714619415e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 551 }, { "completion_length": 80.0, "epoch": 0.09213820731096645, "grad_norm": 0.016234418377280235, "kl": 0.016744405031204224, "learning_rate": 2.482546849255096e-06, "loss": 0.0007, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 552 }, { "completion_length": 121.5, "epoch": 0.09230512435319646, "grad_norm": 0.007699879817664623, "kl": 0.0075293974950909615, "learning_rate": 2.4738205397093863e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 553 }, { "completion_length": 63.5, "epoch": 0.09247204139542647, "grad_norm": 0.004952226299792528, "kl": 0.0030680138152092695, "learning_rate": 2.4650945491521372e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 554 }, { "completion_length": 256.0, "epoch": 0.09263895843765649, "grad_norm": 0.007318846415728331, "kl": 0.0043776617385447025, "learning_rate": 2.4563689839067913e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 555 }, { "completion_length": 132.0, "epoch": 0.0928058754798865, "grad_norm": 0.6986380219459534, "kl": 0.015310941264033318, "learning_rate": 2.447643950291608e-06, "loss": 0.0006, "reward": -0.015000000596046448, "reward_std": 0.37476658821105957, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.015000000596046448, "step": 556 }, { "completion_length": 77.0, "epoch": 0.0929727925221165, "grad_norm": 0.009616225026547909, "kl": 0.009371273219585419, "learning_rate": 2.4389195546183676e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 557 }, { "completion_length": 75.0, "epoch": 0.09313970956434652, "grad_norm": 0.007143979426473379, "kl": 0.005017437972128391, "learning_rate": 2.4301959031910785e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 558 }, { "completion_length": 221.5, "epoch": 0.09330662660657653, "grad_norm": 0.0035380718763917685, "kl": 0.002410850254818797, "learning_rate": 2.4214731023046795e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 559 }, { "completion_length": 178.5, "epoch": 0.09347354364880654, "grad_norm": 0.0035938003566116095, "kl": 0.0015970750246196985, "learning_rate": 2.4127512582437486e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 560 }, { "completion_length": 207.0, "epoch": 0.09364046069103656, "grad_norm": 0.009205947630107403, "kl": 0.008856229484081268, "learning_rate": 2.4040304772812002e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 561 }, { "completion_length": 173.0, "epoch": 0.09380737773326657, "grad_norm": 0.010820462368428707, "kl": 0.010479423217475414, "learning_rate": 2.3953108656770018e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 562 }, { "completion_length": 139.0, "epoch": 0.09397429477549658, "grad_norm": 0.006724325940012932, "kl": 0.004297596402466297, "learning_rate": 2.3865925296768658e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 563 }, { "completion_length": 148.5, "epoch": 0.09414121181772658, "grad_norm": 0.017499007284641266, "kl": 0.012587500736117363, "learning_rate": 2.377875575510967e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 564 }, { "completion_length": 256.0, "epoch": 0.0943081288599566, "grad_norm": 0.0035812382120639086, "kl": 0.0021713194437325, "learning_rate": 2.3691601093926406e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 565 }, { "completion_length": 62.5, "epoch": 0.09447504590218661, "grad_norm": 0.0037167605478316545, "kl": 0.001043609227053821, "learning_rate": 2.3604462375170905e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 566 }, { "completion_length": 50.5, "epoch": 0.09464196294441662, "grad_norm": 0.007324697449803352, "kl": 0.0028083904180675745, "learning_rate": 2.3517340660600965e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 567 }, { "completion_length": 194.0, "epoch": 0.09480887998664664, "grad_norm": 0.005572296213358641, "kl": 0.005086231976747513, "learning_rate": 2.3430237011767166e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 568 }, { "completion_length": 154.0, "epoch": 0.09497579702887665, "grad_norm": 0.003216053592041135, "kl": 0.00128525763284415, "learning_rate": 2.3343152490000004e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 569 }, { "completion_length": 210.5, "epoch": 0.09514271407110667, "grad_norm": 0.012434786185622215, "kl": 0.00913763977587223, "learning_rate": 2.325608815639687e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 570 }, { "completion_length": 149.0, "epoch": 0.09530963111333667, "grad_norm": 0.615765392780304, "kl": 0.013096854090690613, "learning_rate": 2.3169045071809217e-06, "loss": 0.0005, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 571 }, { "completion_length": 256.0, "epoch": 0.09547654815556668, "grad_norm": 0.007719189859926701, "kl": 0.006099322345107794, "learning_rate": 2.3082024296829538e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 572 }, { "completion_length": 256.0, "epoch": 0.09564346519779669, "grad_norm": 0.0026712212711572647, "kl": 0.0029691383242607117, "learning_rate": 2.2995026891778533e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 573 }, { "completion_length": 174.5, "epoch": 0.0958103822400267, "grad_norm": 0.8240460157394409, "kl": 0.006153644993901253, "learning_rate": 2.290805391669212e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 574 }, { "completion_length": 256.0, "epoch": 0.09597729928225672, "grad_norm": 0.004137665499001741, "kl": 0.003442507702857256, "learning_rate": 2.2821106431308546e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 575 }, { "completion_length": 200.0, "epoch": 0.09614421632448673, "grad_norm": 0.526257336139679, "kl": 0.005499016959220171, "learning_rate": 2.2734185495055503e-06, "loss": 0.0002, "reward": -0.1274999976158142, "reward_std": 0.35708892345428467, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1274999976158142, "step": 576 }, { "completion_length": 229.5, "epoch": 0.09631113336671675, "grad_norm": 0.0023302400950342417, "kl": 0.0013272331561893225, "learning_rate": 2.2647292167037143e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 577 }, { "completion_length": 72.0, "epoch": 0.09647805040894675, "grad_norm": 0.006156591698527336, "kl": 0.005364671349525452, "learning_rate": 2.256042750602127e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 578 }, { "completion_length": 236.5, "epoch": 0.09664496745117676, "grad_norm": 0.0024851798079907894, "kl": 0.0010842899791896343, "learning_rate": 2.2473592570426343e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 579 }, { "completion_length": 141.0, "epoch": 0.09681188449340677, "grad_norm": 0.010501858778297901, "kl": 0.007221114821732044, "learning_rate": 2.238678841830867e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 580 }, { "completion_length": 244.0, "epoch": 0.09697880153563679, "grad_norm": 0.5442523956298828, "kl": 0.004301885608583689, "learning_rate": 2.230001610734943e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 581 }, { "completion_length": 64.0, "epoch": 0.0971457185778668, "grad_norm": 0.012467192485928535, "kl": 0.009203070774674416, "learning_rate": 2.2213276694841866e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 582 }, { "completion_length": 144.0, "epoch": 0.09731263562009682, "grad_norm": 0.00739732850342989, "kl": 0.005040724296122789, "learning_rate": 2.212657123767834e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 583 }, { "completion_length": 248.5, "epoch": 0.09747955266232683, "grad_norm": 0.45888715982437134, "kl": 0.0016770644579082727, "learning_rate": 2.2039900792337477e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 584 }, { "completion_length": 216.5, "epoch": 0.09764646970455683, "grad_norm": 0.00274821394123137, "kl": 0.0018149616662412882, "learning_rate": 2.195326641487132e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 585 }, { "completion_length": 251.0, "epoch": 0.09781338674678684, "grad_norm": 0.5467800498008728, "kl": 0.0021324437111616135, "learning_rate": 2.186666916089239e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 586 }, { "completion_length": 256.0, "epoch": 0.09798030378901686, "grad_norm": 0.47040319442749023, "kl": 0.004431235138326883, "learning_rate": 2.1780110085560935e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 587 }, { "completion_length": 164.5, "epoch": 0.09814722083124687, "grad_norm": 0.003456822829321027, "kl": 0.0018927573692053556, "learning_rate": 2.1693590243571937e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 588 }, { "completion_length": 256.0, "epoch": 0.09831413787347688, "grad_norm": 0.002554119797423482, "kl": 0.0009059360018000007, "learning_rate": 2.1607110689142393e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 589 }, { "completion_length": 233.0, "epoch": 0.0984810549157069, "grad_norm": 0.4831177294254303, "kl": 0.0038947355933487415, "learning_rate": 2.1520672475998374e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 590 }, { "completion_length": 256.0, "epoch": 0.09864797195793691, "grad_norm": 0.0024663670919835567, "kl": 0.0007825277280062437, "learning_rate": 2.143427665736221e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 591 }, { "completion_length": 198.0, "epoch": 0.09881488900016691, "grad_norm": 0.010011746548116207, "kl": 0.007285246625542641, "learning_rate": 2.134792428593971e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 592 }, { "completion_length": 256.0, "epoch": 0.09898180604239692, "grad_norm": 0.5267623066902161, "kl": 0.0036683431826531887, "learning_rate": 2.1261616413907267e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 593 }, { "completion_length": 256.0, "epoch": 0.09914872308462694, "grad_norm": 0.002190379658713937, "kl": 0.0009642443619668484, "learning_rate": 2.117535409289905e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 594 }, { "completion_length": 254.0, "epoch": 0.09931564012685695, "grad_norm": 0.0027535122353583574, "kl": 0.002447437262162566, "learning_rate": 2.1089138373994226e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 595 }, { "completion_length": 59.5, "epoch": 0.09948255716908697, "grad_norm": 0.010300359688699245, "kl": 0.006278687156736851, "learning_rate": 2.1002970307704134e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 596 }, { "completion_length": 153.5, "epoch": 0.09964947421131698, "grad_norm": 0.03334111347794533, "kl": 0.008050693199038506, "learning_rate": 2.0916850943959453e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 597 }, { "completion_length": 155.0, "epoch": 0.099816391253547, "grad_norm": 0.009442522190511227, "kl": 0.008748309686779976, "learning_rate": 2.0830781332097446e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 598 }, { "completion_length": 247.0, "epoch": 0.099983308295777, "grad_norm": 0.002567583229392767, "kl": 0.0012941848253831267, "learning_rate": 2.0744762520849193e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 599 }, { "completion_length": 145.5, "epoch": 0.100150225338007, "grad_norm": 0.012639117427170277, "kl": 0.010388681665062904, "learning_rate": 2.0658795558326745e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 600 }, { "completion_length": 256.0, "epoch": 0.10031714238023702, "grad_norm": 0.0031263360287994146, "kl": 0.0014575449749827385, "learning_rate": 2.0572881492010423e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 601 }, { "completion_length": 119.0, "epoch": 0.10048405942246703, "grad_norm": 0.7916089296340942, "kl": 0.0011365371756255627, "learning_rate": 2.0487021368736002e-06, "loss": 0.0, "reward": -0.018000006675720215, "reward_std": 0.3790092468261719, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.018000006675720215, "step": 602 }, { "completion_length": 256.0, "epoch": 0.10065097646469705, "grad_norm": 0.0068686604499816895, "kl": 0.00514450017362833, "learning_rate": 2.0401216234682e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 603 }, { "completion_length": 160.0, "epoch": 0.10081789350692706, "grad_norm": 0.9224010109901428, "kl": 0.012890800833702087, "learning_rate": 2.031546713535688e-06, "loss": 0.0005, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 604 }, { "completion_length": 226.5, "epoch": 0.10098481054915708, "grad_norm": 0.011330134235322475, "kl": 0.009687060490250587, "learning_rate": 2.022977511558638e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 605 }, { "completion_length": 256.0, "epoch": 0.10115172759138708, "grad_norm": 0.0037978102918714285, "kl": 0.0009386714082211256, "learning_rate": 2.0144141219500707e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 606 }, { "completion_length": 186.0, "epoch": 0.10131864463361709, "grad_norm": 0.6875645518302917, "kl": 0.006696147844195366, "learning_rate": 2.0058566490521848e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 607 }, { "completion_length": 256.0, "epoch": 0.1014855616758471, "grad_norm": 0.002334353979676962, "kl": 0.0011450935853645205, "learning_rate": 1.997305197135089e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 608 }, { "completion_length": 163.0, "epoch": 0.10165247871807712, "grad_norm": 1.1933141946792603, "kl": 0.004068414680659771, "learning_rate": 1.9887598703955244e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 609 }, { "completion_length": 256.0, "epoch": 0.10181939576030713, "grad_norm": 0.015876002609729767, "kl": 0.004238633438944817, "learning_rate": 1.9802207729556023e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 610 }, { "completion_length": 204.0, "epoch": 0.10198631280253714, "grad_norm": 0.0035863034427165985, "kl": 0.002124765422195196, "learning_rate": 1.971688008861529e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 611 }, { "completion_length": 256.0, "epoch": 0.10215322984476716, "grad_norm": 0.005595522932708263, "kl": 0.005296463146805763, "learning_rate": 1.963161682082342e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 612 }, { "completion_length": 165.5, "epoch": 0.10232014688699716, "grad_norm": 0.005693406797945499, "kl": 0.001465098699554801, "learning_rate": 1.9546418965086444e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 613 }, { "completion_length": 105.5, "epoch": 0.10248706392922717, "grad_norm": 0.00595076521858573, "kl": 0.00452285073697567, "learning_rate": 1.946128755951332e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 614 }, { "completion_length": 79.0, "epoch": 0.10265398097145718, "grad_norm": 0.0054558319970965385, "kl": 0.005237277131527662, "learning_rate": 1.937622364140338e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 615 }, { "completion_length": 256.0, "epoch": 0.1028208980136872, "grad_norm": 0.007923711091279984, "kl": 0.0023628040216863155, "learning_rate": 1.9291228247233607e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 616 }, { "completion_length": 256.0, "epoch": 0.10298781505591721, "grad_norm": 0.0028428547084331512, "kl": 0.0013459092006087303, "learning_rate": 1.9206302412646074e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 617 }, { "completion_length": 254.0, "epoch": 0.10315473209814723, "grad_norm": 0.5219612121582031, "kl": 0.0029801949858665466, "learning_rate": 1.912144717243525e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 618 }, { "completion_length": 79.0, "epoch": 0.10332164914037724, "grad_norm": 0.011791742406785488, "kl": 0.01171540655195713, "learning_rate": 1.9036663560535484e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 619 }, { "completion_length": 256.0, "epoch": 0.10348856618260724, "grad_norm": 0.0020486272405833006, "kl": 0.0007671897765249014, "learning_rate": 1.895195261000831e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 620 }, { "completion_length": 256.0, "epoch": 0.10365548322483725, "grad_norm": 0.0044900826178491116, "kl": 0.0015889524947851896, "learning_rate": 1.8867315353029937e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 621 }, { "completion_length": 155.0, "epoch": 0.10382240026706727, "grad_norm": 0.0046933614648878574, "kl": 0.001559242606163025, "learning_rate": 1.8782752820878636e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 622 }, { "completion_length": 221.0, "epoch": 0.10398931730929728, "grad_norm": 0.003020944306626916, "kl": 0.002960260957479477, "learning_rate": 1.8698266043922159e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 623 }, { "completion_length": 167.5, "epoch": 0.1041562343515273, "grad_norm": 0.006338793784379959, "kl": 0.003395651001483202, "learning_rate": 1.8613856051605242e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 624 }, { "completion_length": 151.0, "epoch": 0.10432315139375731, "grad_norm": 0.005361701361835003, "kl": 0.005949013866484165, "learning_rate": 1.852952387243698e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 625 }, { "completion_length": 240.0, "epoch": 0.10449006843598732, "grad_norm": 0.007017835974693298, "kl": 0.005688042379915714, "learning_rate": 1.8445270533978387e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 626 }, { "completion_length": 256.0, "epoch": 0.10465698547821732, "grad_norm": 0.009663361124694347, "kl": 0.007955463603138924, "learning_rate": 1.836109706282978e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 627 }, { "completion_length": 256.0, "epoch": 0.10482390252044733, "grad_norm": 0.00313054071739316, "kl": 0.0031066411174833775, "learning_rate": 1.827700448461836e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 628 }, { "completion_length": 86.0, "epoch": 0.10499081956267735, "grad_norm": 0.9127371311187744, "kl": 0.009920144453644753, "learning_rate": 1.8192993823985643e-06, "loss": 0.0004, "reward": -0.017499998211860657, "reward_std": 0.37830209732055664, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.017499998211860657, "step": 629 }, { "completion_length": 214.0, "epoch": 0.10515773660490736, "grad_norm": 0.005062410142272711, "kl": 0.004193211439996958, "learning_rate": 1.8109066104575023e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 630 }, { "completion_length": 256.0, "epoch": 0.10532465364713738, "grad_norm": 0.48623329401016235, "kl": 0.0021127895452082157, "learning_rate": 1.8025222349019273e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 631 }, { "completion_length": 256.0, "epoch": 0.10549157068936739, "grad_norm": 0.002360970014706254, "kl": 0.0006812467472627759, "learning_rate": 1.7941463578928088e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 632 }, { "completion_length": 88.5, "epoch": 0.10565848773159739, "grad_norm": 0.007747290655970573, "kl": 0.002134050242602825, "learning_rate": 1.7857790814875665e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 633 }, { "completion_length": 256.0, "epoch": 0.1058254047738274, "grad_norm": 0.0019858612213283777, "kl": 0.0007175060454756021, "learning_rate": 1.7774205076388207e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 634 }, { "completion_length": 236.0, "epoch": 0.10599232181605742, "grad_norm": 0.011990373022854328, "kl": 0.006520405411720276, "learning_rate": 1.7690707381931585e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 635 }, { "completion_length": 256.0, "epoch": 0.10615923885828743, "grad_norm": 0.43469539284706116, "kl": 0.003659032518044114, "learning_rate": 1.7607298748898844e-06, "loss": 0.0001, "reward": -0.5120000243186951, "reward_std": 0.9008540511131287, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5120000243186951, "step": 636 }, { "completion_length": 254.0, "epoch": 0.10632615590051744, "grad_norm": 0.0035142526030540466, "kl": 0.0031231101602315903, "learning_rate": 1.7523980193597837e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 637 }, { "completion_length": 155.0, "epoch": 0.10649307294274746, "grad_norm": 0.010990633629262447, "kl": 0.005851843860000372, "learning_rate": 1.744075273123889e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 638 }, { "completion_length": 88.5, "epoch": 0.10665998998497747, "grad_norm": 0.0068098995834589005, "kl": 0.00253370963037014, "learning_rate": 1.735761737592236e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 639 }, { "completion_length": 180.0, "epoch": 0.10682690702720747, "grad_norm": 0.7031934857368469, "kl": 0.005007128231227398, "learning_rate": 1.7274575140626318e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 640 }, { "completion_length": 160.5, "epoch": 0.10699382406943748, "grad_norm": 0.011134757660329342, "kl": 0.00784469023346901, "learning_rate": 1.7191627037194187e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 641 }, { "completion_length": 172.5, "epoch": 0.1071607411116675, "grad_norm": 0.003215308766812086, "kl": 0.00280794152058661, "learning_rate": 1.7108774076322443e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 642 }, { "completion_length": 256.0, "epoch": 0.10732765815389751, "grad_norm": 0.45838433504104614, "kl": 0.0021866250317543745, "learning_rate": 1.702601726754825e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 643 }, { "completion_length": 78.5, "epoch": 0.10749457519612753, "grad_norm": 0.012380188331007957, "kl": 0.008310909382998943, "learning_rate": 1.6943357619237227e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 644 }, { "completion_length": 107.5, "epoch": 0.10766149223835754, "grad_norm": 0.005393755156546831, "kl": 0.001239210949279368, "learning_rate": 1.686079613857109e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 645 }, { "completion_length": 155.0, "epoch": 0.10782840928058755, "grad_norm": 0.8328249454498291, "kl": 0.00908622331917286, "learning_rate": 1.677833383153542e-06, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 646 }, { "completion_length": 157.0, "epoch": 0.10799532632281755, "grad_norm": 0.005370954517275095, "kl": 0.0016931245336309075, "learning_rate": 1.6695971702907425e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 647 }, { "completion_length": 193.5, "epoch": 0.10816224336504757, "grad_norm": 0.004229923710227013, "kl": 0.004577081650495529, "learning_rate": 1.661371075624363e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 648 }, { "completion_length": 256.0, "epoch": 0.10832916040727758, "grad_norm": 0.0028755555395036936, "kl": 0.0015142381889745593, "learning_rate": 1.6531551993867717e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 649 }, { "completion_length": 256.0, "epoch": 0.1084960774495076, "grad_norm": 0.004993704613298178, "kl": 0.005255586933344603, "learning_rate": 1.6449496416858285e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 650 }, { "completion_length": 180.5, "epoch": 0.10866299449173761, "grad_norm": 0.010454480536282063, "kl": 0.006091472692787647, "learning_rate": 1.6367545025036634e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 651 }, { "completion_length": 256.0, "epoch": 0.10882991153396762, "grad_norm": 0.49596691131591797, "kl": 0.003517314326018095, "learning_rate": 1.6285698816954626e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 652 }, { "completion_length": 236.5, "epoch": 0.10899682857619764, "grad_norm": 0.002970484085381031, "kl": 0.0012486765626817942, "learning_rate": 1.6203958789882457e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 653 }, { "completion_length": 199.0, "epoch": 0.10916374561842764, "grad_norm": 0.5206317901611328, "kl": 0.005238628946244717, "learning_rate": 1.612232593979658e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 654 }, { "completion_length": 242.0, "epoch": 0.10933066266065765, "grad_norm": 0.0037179423961788416, "kl": 0.003951991908252239, "learning_rate": 1.6040801261367494e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 655 }, { "completion_length": 117.5, "epoch": 0.10949757970288766, "grad_norm": 0.007327806670218706, "kl": 0.004378964193165302, "learning_rate": 1.5959385747947697e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 656 }, { "completion_length": 211.0, "epoch": 0.10966449674511768, "grad_norm": 0.0036351201124489307, "kl": 0.003093164414167404, "learning_rate": 1.5878080391559507e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 657 }, { "completion_length": 239.5, "epoch": 0.10983141378734769, "grad_norm": 0.002534073544666171, "kl": 0.0011297967284917831, "learning_rate": 1.5796886182883053e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 658 }, { "completion_length": 54.5, "epoch": 0.1099983308295777, "grad_norm": 0.020572390407323837, "kl": 0.012646788731217384, "learning_rate": 1.5715804111244138e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 659 }, { "completion_length": 71.0, "epoch": 0.11016524787180772, "grad_norm": 0.015084728598594666, "kl": 0.010100253857672215, "learning_rate": 1.56348351646022e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 660 }, { "completion_length": 65.0, "epoch": 0.11033216491403772, "grad_norm": 0.005622664000838995, "kl": 0.0013524906244128942, "learning_rate": 1.5553980329538326e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 661 }, { "completion_length": 129.5, "epoch": 0.11049908195626773, "grad_norm": 0.008980745449662209, "kl": 0.008137463591992855, "learning_rate": 1.547324059124315e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 662 }, { "completion_length": 162.5, "epoch": 0.11066599899849774, "grad_norm": 0.011171391233801842, "kl": 0.0066465954296290874, "learning_rate": 1.539261693350491e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 663 }, { "completion_length": 57.5, "epoch": 0.11083291604072776, "grad_norm": 0.013423729687929153, "kl": 0.01011478528380394, "learning_rate": 1.5312110338697427e-06, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 664 }, { "completion_length": 253.0, "epoch": 0.11099983308295777, "grad_norm": 0.5158193707466125, "kl": 0.003248332068324089, "learning_rate": 1.5231721787768162e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 665 }, { "completion_length": 71.0, "epoch": 0.11116675012518779, "grad_norm": 0.008422031998634338, "kl": 0.0073223356157541275, "learning_rate": 1.5151452260226224e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 666 }, { "completion_length": 164.5, "epoch": 0.1113336671674178, "grad_norm": 0.009458163753151894, "kl": 0.0034558558836579323, "learning_rate": 1.5071302734130488e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 667 }, { "completion_length": 224.5, "epoch": 0.1115005842096478, "grad_norm": 0.0047956365160644054, "kl": 0.0027931369841098785, "learning_rate": 1.4991274186077632e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 668 }, { "completion_length": 212.0, "epoch": 0.11166750125187781, "grad_norm": 0.706501841545105, "kl": 0.0056234439834952354, "learning_rate": 1.491136759119025e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 669 }, { "completion_length": 256.0, "epoch": 0.11183441829410783, "grad_norm": 0.42869821190834045, "kl": 0.0026654996909201145, "learning_rate": 1.4831583923105e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 670 }, { "completion_length": 249.0, "epoch": 0.11200133533633784, "grad_norm": 0.46702301502227783, "kl": 0.000965918879956007, "learning_rate": 1.4751924153960681e-06, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 671 }, { "completion_length": 182.5, "epoch": 0.11216825237856785, "grad_norm": 0.004120787139981985, "kl": 0.003206958994269371, "learning_rate": 1.467238925438646e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 672 }, { "completion_length": 145.0, "epoch": 0.11233516942079787, "grad_norm": 0.008434955030679703, "kl": 0.0036655825097113848, "learning_rate": 1.4592980193489975e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 673 }, { "completion_length": 199.0, "epoch": 0.11250208646302788, "grad_norm": 0.0038621495477855206, "kl": 0.0016037032473832369, "learning_rate": 1.4513697938845571e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 674 }, { "completion_length": 256.0, "epoch": 0.11266900350525788, "grad_norm": 0.5179656147956848, "kl": 0.004215891472995281, "learning_rate": 1.443454345648252e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 675 }, { "completion_length": 80.5, "epoch": 0.1128359205474879, "grad_norm": 0.004269412253051996, "kl": 0.003054692642763257, "learning_rate": 1.4355517710873184e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 676 }, { "completion_length": 239.0, "epoch": 0.11300283758971791, "grad_norm": 0.006975049618631601, "kl": 0.005563998594880104, "learning_rate": 1.4276621664921358e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 677 }, { "completion_length": 216.0, "epoch": 0.11316975463194792, "grad_norm": 0.005669728387147188, "kl": 0.0026360589545220137, "learning_rate": 1.419785627995044e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 678 }, { "completion_length": 244.0, "epoch": 0.11333667167417794, "grad_norm": 0.5407212376594543, "kl": 0.0017977873794734478, "learning_rate": 1.4119222515691817e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 679 }, { "completion_length": 161.0, "epoch": 0.11350358871640795, "grad_norm": 0.004036806058138609, "kl": 0.0022941159550100565, "learning_rate": 1.4040721330273063e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 680 }, { "completion_length": 156.0, "epoch": 0.11367050575863796, "grad_norm": 0.011179282329976559, "kl": 0.0065450300462543964, "learning_rate": 1.3962353680206372e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 681 }, { "completion_length": 75.5, "epoch": 0.11383742280086796, "grad_norm": 0.005359592381864786, "kl": 0.003024233039468527, "learning_rate": 1.388412052037682e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 682 }, { "completion_length": 143.5, "epoch": 0.11400433984309798, "grad_norm": 0.004219674505293369, "kl": 0.00199957937002182, "learning_rate": 1.380602280403076e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 683 }, { "completion_length": 251.0, "epoch": 0.11417125688532799, "grad_norm": 0.009329888969659805, "kl": 0.00699983537197113, "learning_rate": 1.3728061482764238e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 684 }, { "completion_length": 256.0, "epoch": 0.114338173927558, "grad_norm": 0.007103400304913521, "kl": 0.004981080535799265, "learning_rate": 1.3650237506511333e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 685 }, { "completion_length": 249.0, "epoch": 0.11450509096978802, "grad_norm": 0.002061739331111312, "kl": 0.0010626704897731543, "learning_rate": 1.3572551823532654e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 686 }, { "completion_length": 70.5, "epoch": 0.11467200801201803, "grad_norm": 0.019414808601140976, "kl": 0.011976665817201138, "learning_rate": 1.349500538040371e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 687 }, { "completion_length": 256.0, "epoch": 0.11483892505424804, "grad_norm": 0.003375643864274025, "kl": 0.0013525803806260228, "learning_rate": 1.3417599122003464e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 688 }, { "completion_length": 146.5, "epoch": 0.11500584209647804, "grad_norm": 0.006899573840200901, "kl": 0.003690815530717373, "learning_rate": 1.3340333991502723e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 689 }, { "completion_length": 148.5, "epoch": 0.11517275913870806, "grad_norm": 0.0037705407012254, "kl": 0.0012647663243114948, "learning_rate": 1.3263210930352737e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 690 }, { "completion_length": 217.0, "epoch": 0.11533967618093807, "grad_norm": 0.0030316279735416174, "kl": 0.0019777275156229734, "learning_rate": 1.3186230878273654e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 691 }, { "completion_length": 162.5, "epoch": 0.11550659322316809, "grad_norm": 0.006936139427125454, "kl": 0.005391708109527826, "learning_rate": 1.3109394773243117e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 692 }, { "completion_length": 175.0, "epoch": 0.1156735102653981, "grad_norm": 0.6813364028930664, "kl": 0.008214378729462624, "learning_rate": 1.3032703551484832e-06, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 693 }, { "completion_length": 98.5, "epoch": 0.11584042730762811, "grad_norm": 0.008320584893226624, "kl": 0.005631724372506142, "learning_rate": 1.2956158147457116e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 694 }, { "completion_length": 89.0, "epoch": 0.11600734434985813, "grad_norm": 0.8635313510894775, "kl": 0.006169388070702553, "learning_rate": 1.2879759493841577e-06, "loss": 0.0002, "reward": 0.16899999976158142, "reward_std": 0.11455129832029343, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16899999976158142, "step": 695 }, { "completion_length": 231.0, "epoch": 0.11617426139208813, "grad_norm": 0.005272208712995052, "kl": 0.0035810263361781836, "learning_rate": 1.280350852153168e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 696 }, { "completion_length": 256.0, "epoch": 0.11634117843431814, "grad_norm": 0.0025159569922834635, "kl": 0.0007863318314775825, "learning_rate": 1.272740615962148e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 697 }, { "completion_length": 169.5, "epoch": 0.11650809547654815, "grad_norm": 0.0074836439453065395, "kl": 0.00409605260938406, "learning_rate": 1.2651453335394232e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 698 }, { "completion_length": 253.5, "epoch": 0.11667501251877817, "grad_norm": 0.4980584383010864, "kl": 0.003737745340913534, "learning_rate": 1.2575650974311118e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 699 }, { "completion_length": 103.5, "epoch": 0.11684192956100818, "grad_norm": 0.022754136472940445, "kl": 0.012202652171254158, "learning_rate": 1.2500000000000007e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 700 }, { "completion_length": 256.0, "epoch": 0.1170088466032382, "grad_norm": 0.0020106318406760693, "kl": 0.0007439733017235994, "learning_rate": 1.2424501334244124e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 701 }, { "completion_length": 119.5, "epoch": 0.11717576364546821, "grad_norm": 0.008001664653420448, "kl": 0.006625610403716564, "learning_rate": 1.234915589697091e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 702 }, { "completion_length": 122.5, "epoch": 0.11734268068769821, "grad_norm": 0.010243231430649757, "kl": 0.008414046838879585, "learning_rate": 1.2273964606240718e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 703 }, { "completion_length": 254.5, "epoch": 0.11750959772992822, "grad_norm": 0.010029933415353298, "kl": 0.00722455233335495, "learning_rate": 1.2198928378235717e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 704 }, { "completion_length": 256.0, "epoch": 0.11767651477215824, "grad_norm": 0.005438199266791344, "kl": 0.0008080101106315851, "learning_rate": 1.2124048127248644e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 705 }, { "completion_length": 194.0, "epoch": 0.11784343181438825, "grad_norm": 0.8630590438842773, "kl": 0.0025648274458944798, "learning_rate": 1.204932476567175e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 706 }, { "completion_length": 163.0, "epoch": 0.11801034885661826, "grad_norm": 0.005165139678865671, "kl": 0.002841666340827942, "learning_rate": 1.19747592039856e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 707 }, { "completion_length": 140.5, "epoch": 0.11817726589884828, "grad_norm": 0.0038985908031463623, "kl": 0.001925965305417776, "learning_rate": 1.1900352350748026e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 708 }, { "completion_length": 256.0, "epoch": 0.11834418294107829, "grad_norm": 0.001849853666499257, "kl": 0.0008236380526795983, "learning_rate": 1.1826105112583061e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 709 }, { "completion_length": 256.0, "epoch": 0.11851109998330829, "grad_norm": 0.0019781796727329493, "kl": 0.0008586732437834144, "learning_rate": 1.1752018394169882e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 710 }, { "completion_length": 222.0, "epoch": 0.1186780170255383, "grad_norm": 0.5734697580337524, "kl": 0.003300589043647051, "learning_rate": 1.1678093098231748e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 711 }, { "completion_length": 256.0, "epoch": 0.11884493406776832, "grad_norm": 0.0038314491976052523, "kl": 0.0022698375396430492, "learning_rate": 1.160433012552508e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 712 }, { "completion_length": 102.0, "epoch": 0.11901185110999833, "grad_norm": 0.008070508949458599, "kl": 0.005472409073263407, "learning_rate": 1.1530730374828422e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 713 }, { "completion_length": 256.0, "epoch": 0.11917876815222835, "grad_norm": 0.002034439705312252, "kl": 0.0006539764581248164, "learning_rate": 1.1457294742931508e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 714 }, { "completion_length": 212.5, "epoch": 0.11934568519445836, "grad_norm": 0.6229653358459473, "kl": 0.003596169874072075, "learning_rate": 1.1384024124624324e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 715 }, { "completion_length": 219.5, "epoch": 0.11951260223668837, "grad_norm": 0.002904074266552925, "kl": 0.0007492978475056589, "learning_rate": 1.1310919412686248e-06, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 716 }, { "completion_length": 256.0, "epoch": 0.11967951927891837, "grad_norm": 0.005333792883902788, "kl": 0.0035447266418486834, "learning_rate": 1.1237981497875112e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 717 }, { "completion_length": 256.0, "epoch": 0.11984643632114839, "grad_norm": 0.0022055739536881447, "kl": 0.001662406837567687, "learning_rate": 1.11652112689164e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 718 }, { "completion_length": 178.5, "epoch": 0.1200133533633784, "grad_norm": 0.8318312764167786, "kl": 0.003096943022683263, "learning_rate": 1.109260961249238e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 719 }, { "completion_length": 165.5, "epoch": 0.12018027040560841, "grad_norm": 0.8927222490310669, "kl": 0.005077797919511795, "learning_rate": 1.1020177413231334e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 720 }, { "completion_length": 256.0, "epoch": 0.12034718744783843, "grad_norm": 0.013147182762622833, "kl": 0.00402587465941906, "learning_rate": 1.0947915553696742e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 721 }, { "completion_length": 256.0, "epoch": 0.12051410449006844, "grad_norm": 0.0018908609636127949, "kl": 0.0005860370583832264, "learning_rate": 1.0875824914376555e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 722 }, { "completion_length": 82.5, "epoch": 0.12068102153229844, "grad_norm": 0.005685973446816206, "kl": 0.002979591488838196, "learning_rate": 1.0803906373672477e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 723 }, { "completion_length": 158.0, "epoch": 0.12084793857452845, "grad_norm": 0.8811036348342896, "kl": 0.008721512742340565, "learning_rate": 1.073216080788921e-06, "loss": 0.0003, "reward": -0.0010000020265579224, "reward_std": 0.3549675941467285, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0010000020265579224, "step": 724 }, { "completion_length": 193.5, "epoch": 0.12101485561675847, "grad_norm": 0.0030168057419359684, "kl": 0.001498053316026926, "learning_rate": 1.0660589091223854e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 725 }, { "completion_length": 256.0, "epoch": 0.12118177265898848, "grad_norm": 0.4766358435153961, "kl": 0.001396383624523878, "learning_rate": 1.0589192095755172e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 726 }, { "completion_length": 228.0, "epoch": 0.1213486897012185, "grad_norm": 0.002483733231201768, "kl": 0.0012550248065963387, "learning_rate": 1.0517970691433035e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 727 }, { "completion_length": 127.5, "epoch": 0.12151560674344851, "grad_norm": 0.004861411172896624, "kl": 0.0033973355311900377, "learning_rate": 1.0446925746067768e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 728 }, { "completion_length": 256.0, "epoch": 0.12168252378567852, "grad_norm": 0.4948384165763855, "kl": 0.0029284926131367683, "learning_rate": 1.0376058125319614e-06, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 729 }, { "completion_length": 255.0, "epoch": 0.12184944082790852, "grad_norm": 0.0060725430957973, "kl": 0.004152575973421335, "learning_rate": 1.0305368692688175e-06, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 730 }, { "completion_length": 256.0, "epoch": 0.12201635787013854, "grad_norm": 0.003351796418428421, "kl": 0.002844823757186532, "learning_rate": 1.0234858309501864e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 731 }, { "completion_length": 208.5, "epoch": 0.12218327491236855, "grad_norm": 0.005679297726601362, "kl": 0.0034756078384816647, "learning_rate": 1.0164527834907468e-06, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 732 }, { "completion_length": 170.0, "epoch": 0.12235019195459856, "grad_norm": 0.008278874680399895, "kl": 0.006335783749818802, "learning_rate": 1.0094378125859602e-06, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 733 }, { "completion_length": 206.5, "epoch": 0.12251710899682858, "grad_norm": 0.6127408742904663, "kl": 0.003953251987695694, "learning_rate": 1.0024410037110358e-06, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 734 }, { "completion_length": 239.0, "epoch": 0.12268402603905859, "grad_norm": 0.0028590294532477856, "kl": 0.0013268778566271067, "learning_rate": 9.95462442119879e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 735 }, { "completion_length": 169.5, "epoch": 0.1228509430812886, "grad_norm": 0.005638209171593189, "kl": 0.0018434133380651474, "learning_rate": 9.88502212844063e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 736 }, { "completion_length": 80.0, "epoch": 0.1230178601235186, "grad_norm": 0.01114263292402029, "kl": 0.009264732711017132, "learning_rate": 9.815604006917839e-07, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 737 }, { "completion_length": 135.5, "epoch": 0.12318477716574862, "grad_norm": 0.00732225738465786, "kl": 0.0038388939574360847, "learning_rate": 9.746370902468311e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 738 }, { "completion_length": 152.0, "epoch": 0.12335169420797863, "grad_norm": 0.005148455500602722, "kl": 0.003323192708194256, "learning_rate": 9.677323658675594e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 739 }, { "completion_length": 256.0, "epoch": 0.12351861125020865, "grad_norm": 0.007627917919307947, "kl": 0.004518797155469656, "learning_rate": 9.608463116858544e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 740 }, { "completion_length": 159.5, "epoch": 0.12368552829243866, "grad_norm": 0.007910925894975662, "kl": 0.0029804075602442026, "learning_rate": 9.53979011606115e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 741 }, { "completion_length": 256.0, "epoch": 0.12385244533466867, "grad_norm": 0.0037019262090325356, "kl": 0.0011272076517343521, "learning_rate": 9.471305493042243e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 742 }, { "completion_length": 256.0, "epoch": 0.12401936237689869, "grad_norm": 0.0017964406870305538, "kl": 0.0006631310097873211, "learning_rate": 9.403010082265351e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 743 }, { "completion_length": 210.5, "epoch": 0.12418627941912869, "grad_norm": 0.005085213575512171, "kl": 0.003912879154086113, "learning_rate": 9.334904715888496e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 744 }, { "completion_length": 256.0, "epoch": 0.1243531964613587, "grad_norm": 0.004010829143226147, "kl": 0.003270295914262533, "learning_rate": 9.266990223754069e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 745 }, { "completion_length": 242.0, "epoch": 0.12452011350358871, "grad_norm": 0.0022703963331878185, "kl": 0.0008300324552692473, "learning_rate": 9.199267433378728e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 746 }, { "completion_length": 237.5, "epoch": 0.12468703054581873, "grad_norm": 0.010111304000020027, "kl": 0.0015885557513684034, "learning_rate": 9.131737169943314e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 747 }, { "completion_length": 136.0, "epoch": 0.12485394758804874, "grad_norm": 0.003882358316332102, "kl": 0.0009629528503865004, "learning_rate": 9.064400256282757e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 748 }, { "completion_length": 59.5, "epoch": 0.12502086463027876, "grad_norm": 0.011124087497591972, "kl": 0.00722032506018877, "learning_rate": 8.99725751287611e-07, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 749 }, { "completion_length": 68.0, "epoch": 0.12518778167250877, "grad_norm": 0.012933854013681412, "kl": 0.007505254354327917, "learning_rate": 8.930309757836517e-07, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 750 }, { "completion_length": 256.0, "epoch": 0.12535469871473878, "grad_norm": 0.003265213221311569, "kl": 0.0025998991914093494, "learning_rate": 8.863557806901233e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 751 }, { "completion_length": 256.0, "epoch": 0.1255216157569688, "grad_norm": 0.5089396238327026, "kl": 0.003808401059359312, "learning_rate": 8.797002473421729e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 752 }, { "completion_length": 77.5, "epoch": 0.1256885327991988, "grad_norm": 0.009932399727404118, "kl": 0.004392838105559349, "learning_rate": 8.73064456835373e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 753 }, { "completion_length": 256.0, "epoch": 0.12585544984142882, "grad_norm": 0.004371561575680971, "kl": 0.0022678403183817863, "learning_rate": 8.664484900247363e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 754 }, { "completion_length": 256.0, "epoch": 0.1260223668836588, "grad_norm": 0.0023078746162354946, "kl": 0.0009486688068136573, "learning_rate": 8.598524275237321e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 755 }, { "completion_length": 256.0, "epoch": 0.12618928392588882, "grad_norm": 0.003615024732425809, "kl": 0.0010449396213516593, "learning_rate": 8.532763497032987e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 756 }, { "completion_length": 238.0, "epoch": 0.12635620096811884, "grad_norm": 0.009149770252406597, "kl": 0.005729164928197861, "learning_rate": 8.467203366908708e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 757 }, { "completion_length": 172.5, "epoch": 0.12652311801034885, "grad_norm": 0.003835267387330532, "kl": 0.0012998022139072418, "learning_rate": 8.40184468369396e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 758 }, { "completion_length": 142.0, "epoch": 0.12669003505257886, "grad_norm": 0.004842799622565508, "kl": 0.0016193941701203585, "learning_rate": 8.336688243763691e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 759 }, { "completion_length": 173.0, "epoch": 0.12685695209480888, "grad_norm": 0.0080416863784194, "kl": 0.006362335290759802, "learning_rate": 8.271734841028553e-07, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 760 }, { "completion_length": 256.0, "epoch": 0.1270238691370389, "grad_norm": 0.524850606918335, "kl": 0.0031557315960526466, "learning_rate": 8.206985266925249e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 761 }, { "completion_length": 238.0, "epoch": 0.1271907861792689, "grad_norm": 0.0029861507937312126, "kl": 0.001328176585957408, "learning_rate": 8.142440310406923e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 762 }, { "completion_length": 256.0, "epoch": 0.12735770322149892, "grad_norm": 0.004247487522661686, "kl": 0.00415077293291688, "learning_rate": 8.078100757933486e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 763 }, { "completion_length": 238.0, "epoch": 0.12752462026372893, "grad_norm": 0.5173860192298889, "kl": 0.002977007534354925, "learning_rate": 8.013967393462094e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 764 }, { "completion_length": 68.5, "epoch": 0.12769153730595895, "grad_norm": 0.0048787533305585384, "kl": 0.0015520062297582626, "learning_rate": 7.950040998437541e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 765 }, { "completion_length": 172.0, "epoch": 0.12785845434818896, "grad_norm": 0.5477895140647888, "kl": 0.0033054936211556196, "learning_rate": 7.886322351782782e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 766 }, { "completion_length": 256.0, "epoch": 0.12802537139041897, "grad_norm": 0.5342759490013123, "kl": 0.003088635392487049, "learning_rate": 7.822812229889429e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 767 }, { "completion_length": 122.0, "epoch": 0.12819228843264896, "grad_norm": 0.013621831312775612, "kl": 0.010760795325040817, "learning_rate": 7.759511406608255e-07, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 768 }, { "completion_length": 240.0, "epoch": 0.12835920547487897, "grad_norm": 0.4534660577774048, "kl": 0.0034502018243074417, "learning_rate": 7.696420653239834e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 769 }, { "completion_length": 256.0, "epoch": 0.128526122517109, "grad_norm": 0.002532788086682558, "kl": 0.0009099298622459173, "learning_rate": 7.633540738525066e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 770 }, { "completion_length": 73.5, "epoch": 0.128693039559339, "grad_norm": 0.005694825667887926, "kl": 0.0011848880676552653, "learning_rate": 7.57087242863589e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 771 }, { "completion_length": 44.5, "epoch": 0.12885995660156901, "grad_norm": 0.01967308297753334, "kl": 0.015691326931118965, "learning_rate": 7.508416487165862e-07, "loss": 0.0006, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 772 }, { "completion_length": 256.0, "epoch": 0.12902687364379903, "grad_norm": 0.47101891040802, "kl": 0.003394038649275899, "learning_rate": 7.44617367512094e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 773 }, { "completion_length": 149.5, "epoch": 0.12919379068602904, "grad_norm": 0.7993741035461426, "kl": 0.009296840988099575, "learning_rate": 7.384144750910133e-07, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 774 }, { "completion_length": 112.5, "epoch": 0.12936070772825906, "grad_norm": 0.008208347484469414, "kl": 0.004461268894374371, "learning_rate": 7.322330470336314e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 775 }, { "completion_length": 256.0, "epoch": 0.12952762477048907, "grad_norm": 0.0054216464050114155, "kl": 0.004019685555249453, "learning_rate": 7.260731586586983e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 776 }, { "completion_length": 213.0, "epoch": 0.12969454181271908, "grad_norm": 0.5638223886489868, "kl": 0.002086436375975609, "learning_rate": 7.199348850225091e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 777 }, { "completion_length": 157.5, "epoch": 0.1298614588549491, "grad_norm": 0.8067062497138977, "kl": 0.007936405017971992, "learning_rate": 7.138183009179922e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 778 }, { "completion_length": 162.0, "epoch": 0.1300283758971791, "grad_norm": 0.890365481376648, "kl": 0.0036694372538477182, "learning_rate": 7.077234808737932e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 779 }, { "completion_length": 127.5, "epoch": 0.13019529293940912, "grad_norm": 0.003538926597684622, "kl": 0.0008457253570668399, "learning_rate": 7.016504991533727e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 780 }, { "completion_length": 256.0, "epoch": 0.13036220998163914, "grad_norm": 0.5976073741912842, "kl": 0.002260797191411257, "learning_rate": 6.955994297540947e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 781 }, { "completion_length": 65.0, "epoch": 0.13052912702386912, "grad_norm": 0.018074361607432365, "kl": 0.013518733903765678, "learning_rate": 6.895703464063319e-07, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 782 }, { "completion_length": 173.5, "epoch": 0.13069604406609914, "grad_norm": 0.0031067761592566967, "kl": 0.002237193053588271, "learning_rate": 6.835633225725604e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 783 }, { "completion_length": 84.5, "epoch": 0.13086296110832915, "grad_norm": 0.009631584398448467, "kl": 0.007298514246940613, "learning_rate": 6.775784314464717e-07, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 784 }, { "completion_length": 194.5, "epoch": 0.13102987815055916, "grad_norm": 0.004095294047147036, "kl": 0.003293570363894105, "learning_rate": 6.716157459520739e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 785 }, { "completion_length": 66.0, "epoch": 0.13119679519278918, "grad_norm": 0.006123723462224007, "kl": 0.0027884431183338165, "learning_rate": 6.656753387428089e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 786 }, { "completion_length": 256.0, "epoch": 0.1313637122350192, "grad_norm": 0.007620881777256727, "kl": 0.006329049821943045, "learning_rate": 6.597572822006643e-07, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 787 }, { "completion_length": 256.0, "epoch": 0.1315306292772492, "grad_norm": 0.0050136251375079155, "kl": 0.005434454418718815, "learning_rate": 6.538616484352902e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 788 }, { "completion_length": 232.5, "epoch": 0.13169754631947922, "grad_norm": 0.0046582818031311035, "kl": 0.004846640862524509, "learning_rate": 6.479885092831251e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 789 }, { "completion_length": 238.0, "epoch": 0.13186446336170923, "grad_norm": 0.0038322065956890583, "kl": 0.0017344653606414795, "learning_rate": 6.421379363065142e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 790 }, { "completion_length": 256.0, "epoch": 0.13203138040393925, "grad_norm": 0.49257346987724304, "kl": 0.0044576553627848625, "learning_rate": 6.363100007928447e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 791 }, { "completion_length": 70.0, "epoch": 0.13219829744616926, "grad_norm": 0.014173240400850773, "kl": 0.013517082668840885, "learning_rate": 6.305047737536707e-07, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 792 }, { "completion_length": 196.5, "epoch": 0.13236521448839927, "grad_norm": 0.73614102602005, "kl": 0.00628274492919445, "learning_rate": 6.247223259238511e-07, "loss": 0.0003, "reward": -0.06050000339746475, "reward_std": 0.2623366117477417, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06050000339746475, "step": 793 }, { "completion_length": 256.0, "epoch": 0.1325321315306293, "grad_norm": 0.4722929298877716, "kl": 0.0038292179815471172, "learning_rate": 6.189627277606894e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 794 }, { "completion_length": 239.0, "epoch": 0.1326990485728593, "grad_norm": 0.004288392141461372, "kl": 0.0019985917024314404, "learning_rate": 6.1322604944307e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 795 }, { "completion_length": 49.0, "epoch": 0.1328659656150893, "grad_norm": 0.01650991663336754, "kl": 0.013808228075504303, "learning_rate": 6.075123608706093e-07, "loss": 0.0006, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 796 }, { "completion_length": 256.0, "epoch": 0.1330328826573193, "grad_norm": 0.0023252167738974094, "kl": 0.0009933353867381811, "learning_rate": 6.01821731662798e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 797 }, { "completion_length": 219.5, "epoch": 0.13319979969954931, "grad_norm": 0.4729636013507843, "kl": 0.0031482495833188295, "learning_rate": 5.961542311581586e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 798 }, { "completion_length": 256.0, "epoch": 0.13336671674177933, "grad_norm": 0.0018909811042249203, "kl": 0.0006723840488120914, "learning_rate": 5.905099284133953e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 799 }, { "completion_length": 249.0, "epoch": 0.13353363378400934, "grad_norm": 0.45728832483291626, "kl": 0.0017008164431899786, "learning_rate": 5.848888922025553e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 800 }, { "completion_length": 54.5, "epoch": 0.13370055082623936, "grad_norm": 0.020091548562049866, "kl": 0.01764632575213909, "learning_rate": 5.792911910161922e-07, "loss": 0.0007, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 801 }, { "completion_length": 242.0, "epoch": 0.13386746786846937, "grad_norm": 0.008378352969884872, "kl": 0.0031477343291044235, "learning_rate": 5.737168930605272e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 802 }, { "completion_length": 62.5, "epoch": 0.13403438491069938, "grad_norm": 0.012264654971659184, "kl": 0.005765328649431467, "learning_rate": 5.681660662566225e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 803 }, { "completion_length": 203.5, "epoch": 0.1342013019529294, "grad_norm": 0.6984907388687134, "kl": 0.005404851399362087, "learning_rate": 5.626387782395512e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 804 }, { "completion_length": 256.0, "epoch": 0.1343682189951594, "grad_norm": 0.0019785251934081316, "kl": 0.0007348760263994336, "learning_rate": 5.571350963575728e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 805 }, { "completion_length": 247.5, "epoch": 0.13453513603738942, "grad_norm": 0.5536428689956665, "kl": 0.001849822117947042, "learning_rate": 5.516550876713142e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 806 }, { "completion_length": 250.5, "epoch": 0.13470205307961944, "grad_norm": 0.0030756425112485886, "kl": 0.002019682200625539, "learning_rate": 5.461988189529529e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 807 }, { "completion_length": 52.0, "epoch": 0.13486897012184945, "grad_norm": 0.014867255464196205, "kl": 0.009393827989697456, "learning_rate": 5.407663566854008e-07, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 808 }, { "completion_length": 256.0, "epoch": 0.13503588716407947, "grad_norm": 0.005959495902061462, "kl": 0.0012761109974235296, "learning_rate": 5.353577670614951e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 809 }, { "completion_length": 256.0, "epoch": 0.13520280420630945, "grad_norm": 0.5573244094848633, "kl": 0.004010498523712158, "learning_rate": 5.299731159831953e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 810 }, { "completion_length": 256.0, "epoch": 0.13536972124853947, "grad_norm": 0.0029934882186353207, "kl": 0.0016786068445071578, "learning_rate": 5.24612469060774e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 811 }, { "completion_length": 256.0, "epoch": 0.13553663829076948, "grad_norm": 0.4866054654121399, "kl": 0.004436603747308254, "learning_rate": 5.192758916120236e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 812 }, { "completion_length": 256.0, "epoch": 0.1357035553329995, "grad_norm": 0.52545166015625, "kl": 0.002430671826004982, "learning_rate": 5.139634486614544e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 813 }, { "completion_length": 256.0, "epoch": 0.1358704723752295, "grad_norm": 0.5245779752731323, "kl": 0.0018838656833395362, "learning_rate": 5.086752049395094e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 814 }, { "completion_length": 256.0, "epoch": 0.13603738941745952, "grad_norm": 0.004886572249233723, "kl": 0.005426435731351376, "learning_rate": 5.034112248817685e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 815 }, { "completion_length": 256.0, "epoch": 0.13620430645968953, "grad_norm": 0.5134595632553101, "kl": 0.0026297911535948515, "learning_rate": 4.981715726281666e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 816 }, { "completion_length": 134.5, "epoch": 0.13637122350191955, "grad_norm": 0.006052852608263493, "kl": 0.00300239329226315, "learning_rate": 4.929563120222142e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 817 }, { "completion_length": 256.0, "epoch": 0.13653814054414956, "grad_norm": 0.0032966092694550753, "kl": 0.0011440865928307176, "learning_rate": 4.87765506610215e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 818 }, { "completion_length": 179.5, "epoch": 0.13670505758637957, "grad_norm": 0.6741744875907898, "kl": 0.0036334411706775427, "learning_rate": 4.825992196404958e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 819 }, { "completion_length": 256.0, "epoch": 0.1368719746286096, "grad_norm": 0.5539278984069824, "kl": 0.0032550443429499865, "learning_rate": 4.774575140626317e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 820 }, { "completion_length": 256.0, "epoch": 0.1370388916708396, "grad_norm": 0.001875555724836886, "kl": 0.0009470806689932942, "learning_rate": 4.7234045252668393e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 821 }, { "completion_length": 82.5, "epoch": 0.13720580871306962, "grad_norm": 0.005445122718811035, "kl": 0.004179139621555805, "learning_rate": 4.672480973824312e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 822 }, { "completion_length": 160.5, "epoch": 0.13737272575529963, "grad_norm": 0.9762455224990845, "kl": 0.01037596259266138, "learning_rate": 4.6218051067861423e-07, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 823 }, { "completion_length": 59.0, "epoch": 0.13753964279752962, "grad_norm": 0.02112569659948349, "kl": 0.019140899181365967, "learning_rate": 4.5713775416217884e-07, "loss": 0.0008, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 824 }, { "completion_length": 157.5, "epoch": 0.13770655983975963, "grad_norm": 0.004465037491172552, "kl": 0.0035725748166441917, "learning_rate": 4.5211988927752026e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 825 }, { "completion_length": 256.0, "epoch": 0.13787347688198964, "grad_norm": 0.0025144016835838556, "kl": 0.0011006388813257217, "learning_rate": 4.4712697716573994e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 826 }, { "completion_length": 256.0, "epoch": 0.13804039392421966, "grad_norm": 0.0030888847541064024, "kl": 0.0014094084035605192, "learning_rate": 4.421590786638952e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 827 }, { "completion_length": 106.0, "epoch": 0.13820731096644967, "grad_norm": 0.01150460820645094, "kl": 0.011161336675286293, "learning_rate": 4.372162543042624e-07, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 828 }, { "completion_length": 166.0, "epoch": 0.13837422800867968, "grad_norm": 0.7316986322402954, "kl": 0.002473530126735568, "learning_rate": 4.3229856431359516e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 829 }, { "completion_length": 153.5, "epoch": 0.1385411450509097, "grad_norm": 1.0918455123901367, "kl": 0.003575547132641077, "learning_rate": 4.27406068612396e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 830 }, { "completion_length": 88.5, "epoch": 0.1387080620931397, "grad_norm": 0.007397250272333622, "kl": 0.00782083161175251, "learning_rate": 4.225388268141797e-07, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 831 }, { "completion_length": 219.0, "epoch": 0.13887497913536972, "grad_norm": 0.002631208160892129, "kl": 0.0017421485390514135, "learning_rate": 4.1769689822475147e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 832 }, { "completion_length": 63.5, "epoch": 0.13904189617759974, "grad_norm": 0.004690243862569332, "kl": 0.0030801701359450817, "learning_rate": 4.12880341841484e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 833 }, { "completion_length": 86.0, "epoch": 0.13920881321982975, "grad_norm": 0.012623929418623447, "kl": 0.009898202493786812, "learning_rate": 4.0808921635259595e-07, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 834 }, { "completion_length": 256.0, "epoch": 0.13937573026205977, "grad_norm": 0.0020263209007680416, "kl": 0.0007481900975108147, "learning_rate": 4.033235801364402e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 835 }, { "completion_length": 94.0, "epoch": 0.13954264730428978, "grad_norm": 0.00880457740277052, "kl": 0.005432521924376488, "learning_rate": 3.9858349126078945e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 836 }, { "completion_length": 256.0, "epoch": 0.1397095643465198, "grad_norm": 0.0029188196640461683, "kl": 0.001883528078906238, "learning_rate": 3.938690074821314e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 837 }, { "completion_length": 220.0, "epoch": 0.13987648138874978, "grad_norm": 0.5914039611816406, "kl": 0.0026893140748143196, "learning_rate": 3.891801862449629e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 838 }, { "completion_length": 159.0, "epoch": 0.1400433984309798, "grad_norm": 0.8290711045265198, "kl": 0.01050684042274952, "learning_rate": 3.8451708468109026e-07, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 839 }, { "completion_length": 94.0, "epoch": 0.1402103154732098, "grad_norm": 0.006229136139154434, "kl": 0.004554025363177061, "learning_rate": 3.798797596089351e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 840 }, { "completion_length": 256.0, "epoch": 0.14037723251543982, "grad_norm": 0.4648675322532654, "kl": 0.001236902317032218, "learning_rate": 3.7526826753284065e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 841 }, { "completion_length": 219.0, "epoch": 0.14054414955766983, "grad_norm": 0.6128706932067871, "kl": 0.002601848915219307, "learning_rate": 3.7068266464238085e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 842 }, { "completion_length": 126.5, "epoch": 0.14071106659989985, "grad_norm": 0.005596857517957687, "kl": 0.004144805949181318, "learning_rate": 3.661230068116811e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 843 }, { "completion_length": 256.0, "epoch": 0.14087798364212986, "grad_norm": 0.0025527647230774164, "kl": 0.0008959530387073755, "learning_rate": 3.615893495987335e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 844 }, { "completion_length": 230.0, "epoch": 0.14104490068435988, "grad_norm": 0.004377175122499466, "kl": 0.005035638343542814, "learning_rate": 3.5708174824471947e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 845 }, { "completion_length": 175.0, "epoch": 0.1412118177265899, "grad_norm": 0.7435800433158875, "kl": 0.00854445155709982, "learning_rate": 3.5260025767333894e-07, "loss": 0.0003, "reward": -0.3005000054836273, "reward_std": 0.7785245776176453, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3005000054836273, "step": 846 }, { "completion_length": 244.0, "epoch": 0.1413787347688199, "grad_norm": 0.635755181312561, "kl": 0.004419215489178896, "learning_rate": 3.481449324901412e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 847 }, { "completion_length": 149.0, "epoch": 0.14154565181104992, "grad_norm": 0.6101305484771729, "kl": 0.004588788375258446, "learning_rate": 3.4371582698185636e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 848 }, { "completion_length": 256.0, "epoch": 0.14171256885327993, "grad_norm": 0.5154933333396912, "kl": 0.0033976086415350437, "learning_rate": 3.393129951157384e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 849 }, { "completion_length": 197.5, "epoch": 0.14187948589550994, "grad_norm": 0.7184182405471802, "kl": 0.008417181670665741, "learning_rate": 3.3493649053890325e-07, "loss": 0.0003, "reward": 0.31299999356269836, "reward_std": 0.08909544348716736, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.31299999356269836, "step": 850 }, { "completion_length": 256.0, "epoch": 0.14204640293773993, "grad_norm": 0.003443158231675625, "kl": 0.0010476569877937436, "learning_rate": 3.3058636657767927e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 851 }, { "completion_length": 159.5, "epoch": 0.14221331997996994, "grad_norm": 0.930862307548523, "kl": 0.015550239011645317, "learning_rate": 3.262626762369525e-07, "loss": 0.0006, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 852 }, { "completion_length": 234.0, "epoch": 0.14238023702219996, "grad_norm": 0.002579766558483243, "kl": 0.0013492496218532324, "learning_rate": 3.219654721995266e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 853 }, { "completion_length": 256.0, "epoch": 0.14254715406442997, "grad_norm": 0.4313600957393646, "kl": 0.0026642968878149986, "learning_rate": 3.176948068254762e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 854 }, { "completion_length": 234.5, "epoch": 0.14271407110665998, "grad_norm": 0.00427050469443202, "kl": 0.0034891399554908276, "learning_rate": 3.134507321515107e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 855 }, { "completion_length": 234.0, "epoch": 0.14288098814889, "grad_norm": 0.010417378507554531, "kl": 0.008459441363811493, "learning_rate": 3.092332998903416e-07, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 856 }, { "completion_length": 164.0, "epoch": 0.14304790519112, "grad_norm": 0.005296144634485245, "kl": 0.003608694998547435, "learning_rate": 3.050425614300487e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 857 }, { "completion_length": 256.0, "epoch": 0.14321482223335003, "grad_norm": 0.45642179250717163, "kl": 0.0007742423331364989, "learning_rate": 3.0087856783345916e-07, "loss": 0.0, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 858 }, { "completion_length": 256.0, "epoch": 0.14338173927558004, "grad_norm": 0.003481900319457054, "kl": 0.002061380771920085, "learning_rate": 2.967413698375196e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 859 }, { "completion_length": 45.5, "epoch": 0.14354865631781005, "grad_norm": 0.01556455623358488, "kl": 0.005926279816776514, "learning_rate": 2.9263101785268253e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 860 }, { "completion_length": 256.0, "epoch": 0.14371557336004007, "grad_norm": 0.5251971483230591, "kl": 0.005259896628558636, "learning_rate": 2.8854756196229017e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 861 }, { "completion_length": 166.5, "epoch": 0.14388249040227008, "grad_norm": 0.7052189111709595, "kl": 0.007968240417540073, "learning_rate": 2.844910519219632e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 862 }, { "completion_length": 256.0, "epoch": 0.1440494074445001, "grad_norm": 0.005507852416485548, "kl": 0.003954583313316107, "learning_rate": 2.8046153715899695e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 863 }, { "completion_length": 256.0, "epoch": 0.1442163244867301, "grad_norm": 0.002067790599539876, "kl": 0.0009433812228962779, "learning_rate": 2.764590667717562e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 864 }, { "completion_length": 256.0, "epoch": 0.1443832415289601, "grad_norm": 0.0027790102176368237, "kl": 0.0018618814647197723, "learning_rate": 2.7248368952908055e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 865 }, { "completion_length": 42.0, "epoch": 0.1445501585711901, "grad_norm": 0.0065199388191103935, "kl": 0.0021575845312327147, "learning_rate": 2.6853545386968607e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 866 }, { "completion_length": 224.0, "epoch": 0.14471707561342012, "grad_norm": 0.5402498841285706, "kl": 0.004701968748122454, "learning_rate": 2.6461440790157974e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 867 }, { "completion_length": 256.0, "epoch": 0.14488399265565013, "grad_norm": 0.4622535705566406, "kl": 0.0036258602049201727, "learning_rate": 2.6072059940146775e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 868 }, { "completion_length": 133.5, "epoch": 0.14505090969788015, "grad_norm": 0.0036056244280189276, "kl": 0.0009431007783859968, "learning_rate": 2.568540758141791e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 869 }, { "completion_length": 171.0, "epoch": 0.14521782674011016, "grad_norm": 0.9153416752815247, "kl": 0.007942994125187397, "learning_rate": 2.53014884252083e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 870 }, { "completion_length": 256.0, "epoch": 0.14538474378234018, "grad_norm": 0.0030074124224483967, "kl": 0.0025512904394418, "learning_rate": 2.492030714945162e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 871 }, { "completion_length": 213.0, "epoch": 0.1455516608245702, "grad_norm": 0.4808412492275238, "kl": 0.003594657639041543, "learning_rate": 2.454186839872158e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 872 }, { "completion_length": 156.5, "epoch": 0.1457185778668002, "grad_norm": 0.00340995192527771, "kl": 0.003926926292479038, "learning_rate": 2.4166176784174795e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 873 }, { "completion_length": 256.0, "epoch": 0.14588549490903022, "grad_norm": 0.0061739301308989525, "kl": 0.0020808004774153233, "learning_rate": 2.3793236883495164e-07, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 874 }, { "completion_length": 167.0, "epoch": 0.14605241195126023, "grad_norm": 0.0033972396049648523, "kl": 0.0014102263376116753, "learning_rate": 2.3423053240837518e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 875 }, { "completion_length": 256.0, "epoch": 0.14621932899349024, "grad_norm": 0.0031935828737914562, "kl": 0.0017577905673533678, "learning_rate": 2.3055630366772857e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 876 }, { "completion_length": 256.0, "epoch": 0.14638624603572026, "grad_norm": 0.005095973145216703, "kl": 0.0038753552362322807, "learning_rate": 2.269097273823287e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 877 }, { "completion_length": 172.0, "epoch": 0.14655316307795027, "grad_norm": 0.005949539598077536, "kl": 0.002605489920824766, "learning_rate": 2.2329084798455747e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 878 }, { "completion_length": 256.0, "epoch": 0.14672008012018026, "grad_norm": 0.0026737491134554148, "kl": 0.0008208452491089702, "learning_rate": 2.1969970956931762e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 879 }, { "completion_length": 228.5, "epoch": 0.14688699716241027, "grad_norm": 0.5437566041946411, "kl": 0.0044526029378175735, "learning_rate": 2.1613635589349756e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 880 }, { "completion_length": 251.0, "epoch": 0.14705391420464028, "grad_norm": 0.00730206910520792, "kl": 0.004896904341876507, "learning_rate": 2.1260083037543817e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 881 }, { "completion_length": 226.0, "epoch": 0.1472208312468703, "grad_norm": 0.002611221745610237, "kl": 0.0011845962144434452, "learning_rate": 2.0909317609440093e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 882 }, { "completion_length": 179.0, "epoch": 0.1473877482891003, "grad_norm": 0.003357395762577653, "kl": 0.0030630819965153933, "learning_rate": 2.0561343579004716e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 883 }, { "completion_length": 135.0, "epoch": 0.14755466533133033, "grad_norm": 0.004378518555313349, "kl": 0.002720799297094345, "learning_rate": 2.0216165186191406e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 884 }, { "completion_length": 219.0, "epoch": 0.14772158237356034, "grad_norm": 0.003807013388723135, "kl": 0.002014197874814272, "learning_rate": 1.9873786636889908e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 885 }, { "completion_length": 191.0, "epoch": 0.14788849941579035, "grad_norm": 0.5960204601287842, "kl": 0.006442911922931671, "learning_rate": 1.95342121028749e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 886 }, { "completion_length": 193.5, "epoch": 0.14805541645802037, "grad_norm": 0.6074711084365845, "kl": 0.007331774570047855, "learning_rate": 1.9197445721754777e-07, "loss": 0.0003, "reward": -0.06849999725818634, "reward_std": 0.27365031838417053, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06849999725818634, "step": 887 }, { "completion_length": 40.5, "epoch": 0.14822233350025038, "grad_norm": 0.010207987390458584, "kl": 0.003927064593881369, "learning_rate": 1.8863491596921745e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 888 }, { "completion_length": 198.0, "epoch": 0.1483892505424804, "grad_norm": 0.0047151437029242516, "kl": 0.00224324525333941, "learning_rate": 1.8532353797501318e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 889 }, { "completion_length": 195.0, "epoch": 0.1485561675847104, "grad_norm": 0.6288222074508667, "kl": 0.005672920029610395, "learning_rate": 1.8204036358303173e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 890 }, { "completion_length": 59.0, "epoch": 0.14872308462694042, "grad_norm": 0.010948281735181808, "kl": 0.007617480121552944, "learning_rate": 1.787854327977162e-07, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 891 }, { "completion_length": 217.0, "epoch": 0.14889000166917044, "grad_norm": 0.5721127390861511, "kl": 0.005048188380897045, "learning_rate": 1.7555878527937164e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 892 }, { "completion_length": 165.0, "epoch": 0.14905691871140042, "grad_norm": 0.006274485494941473, "kl": 0.004529902711510658, "learning_rate": 1.7236046034367959e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 893 }, { "completion_length": 68.5, "epoch": 0.14922383575363043, "grad_norm": 0.004330054856836796, "kl": 0.001196682220324874, "learning_rate": 1.6919049696121957e-07, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 894 }, { "completion_length": 71.0, "epoch": 0.14939075279586045, "grad_norm": 0.008376315236091614, "kl": 0.005932566709816456, "learning_rate": 1.6604893375699594e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 895 }, { "completion_length": 162.5, "epoch": 0.14955766983809046, "grad_norm": 0.8711625337600708, "kl": 0.00764960702508688, "learning_rate": 1.629358090099639e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 896 }, { "completion_length": 256.0, "epoch": 0.14972458688032048, "grad_norm": 0.0033741092775017023, "kl": 0.0012689315481111407, "learning_rate": 1.5985116065256683e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 897 }, { "completion_length": 182.5, "epoch": 0.1498915039225505, "grad_norm": 0.7120598554611206, "kl": 0.006283145397901535, "learning_rate": 1.567950262702714e-07, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 898 }, { "completion_length": 53.5, "epoch": 0.1500584209647805, "grad_norm": 0.009901667945086956, "kl": 0.009561508893966675, "learning_rate": 1.5376744310111019e-07, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 899 }, { "completion_length": 166.0, "epoch": 0.15022533800701052, "grad_norm": 0.8032528758049011, "kl": 0.009224602952599525, "learning_rate": 1.507684480352292e-07, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 900 }, { "completion_length": 189.5, "epoch": 0.15039225504924053, "grad_norm": 0.005856986157596111, "kl": 0.00782160833477974, "learning_rate": 1.4779807761443638e-07, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 901 }, { "completion_length": 250.5, "epoch": 0.15055917209147054, "grad_norm": 0.5660610198974609, "kl": 0.004799860529601574, "learning_rate": 1.4485636803175828e-07, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 902 }, { "completion_length": 102.5, "epoch": 0.15072608913370056, "grad_norm": 0.004288618452847004, "kl": 0.0016143240500241518, "learning_rate": 1.419433551309976e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 903 }, { "completion_length": 248.0, "epoch": 0.15089300617593057, "grad_norm": 0.002689730143174529, "kl": 0.0020439382642507553, "learning_rate": 1.3905907440629752e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 904 }, { "completion_length": 256.0, "epoch": 0.15105992321816059, "grad_norm": 0.002285890281200409, "kl": 0.0015803974820300937, "learning_rate": 1.362035610017079e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 905 }, { "completion_length": 252.0, "epoch": 0.1512268402603906, "grad_norm": 0.5096849203109741, "kl": 0.0035494393669068813, "learning_rate": 1.3337684971075932e-07, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 906 }, { "completion_length": 62.5, "epoch": 0.15139375730262059, "grad_norm": 0.014397697523236275, "kl": 0.011696840636432171, "learning_rate": 1.305789749760361e-07, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 907 }, { "completion_length": 256.0, "epoch": 0.1515606743448506, "grad_norm": 0.007007586769759655, "kl": 0.004515727981925011, "learning_rate": 1.278099708887587e-07, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 908 }, { "completion_length": 119.0, "epoch": 0.1517275913870806, "grad_norm": 0.0062120831571519375, "kl": 0.005740188993513584, "learning_rate": 1.2506987118836912e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 909 }, { "completion_length": 256.0, "epoch": 0.15189450842931063, "grad_norm": 0.0024361603427678347, "kl": 0.0009502327302470803, "learning_rate": 1.223587092621162e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 910 }, { "completion_length": 89.0, "epoch": 0.15206142547154064, "grad_norm": 0.006180557422339916, "kl": 0.0030573175754398108, "learning_rate": 1.1967651814465353e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 911 }, { "completion_length": 101.0, "epoch": 0.15222834251377065, "grad_norm": 0.005860715173184872, "kl": 0.004987116903066635, "learning_rate": 1.1702333051763271e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 912 }, { "completion_length": 171.5, "epoch": 0.15239525955600067, "grad_norm": 0.016579054296016693, "kl": 0.012085339054465294, "learning_rate": 1.1439917870930795e-07, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 913 }, { "completion_length": 256.0, "epoch": 0.15256217659823068, "grad_norm": 0.002616903278976679, "kl": 0.001278625917620957, "learning_rate": 1.1180409469414094e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 914 }, { "completion_length": 78.0, "epoch": 0.1527290936404607, "grad_norm": 0.012577167712152004, "kl": 0.011343262158334255, "learning_rate": 1.0923811009241142e-07, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 915 }, { "completion_length": 209.5, "epoch": 0.1528960106826907, "grad_norm": 0.007407314609736204, "kl": 0.00446341373026371, "learning_rate": 1.067012561698319e-07, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 916 }, { "completion_length": 59.0, "epoch": 0.15306292772492072, "grad_norm": 0.005476160906255245, "kl": 0.0025178331416100264, "learning_rate": 1.041935638371669e-07, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 917 }, { "completion_length": 57.5, "epoch": 0.15322984476715074, "grad_norm": 0.41750121116638184, "kl": 0.03501199558377266, "learning_rate": 1.0171506364985622e-07, "loss": 0.0014, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 918 }, { "completion_length": 165.0, "epoch": 0.15339676180938075, "grad_norm": 0.013334715738892555, "kl": 0.010705744847655296, "learning_rate": 9.926578580764234e-08, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 919 }, { "completion_length": 78.5, "epoch": 0.15356367885161076, "grad_norm": 0.010402748361229897, "kl": 0.006247858516871929, "learning_rate": 9.684576015420277e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 920 }, { "completion_length": 214.5, "epoch": 0.15373059589384075, "grad_norm": 0.6227617263793945, "kl": 0.0018448762129992247, "learning_rate": 9.445501617678654e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 921 }, { "completion_length": 256.0, "epoch": 0.15389751293607076, "grad_norm": 0.0019218004308640957, "kl": 0.0006806186866015196, "learning_rate": 9.209358300585474e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 922 }, { "completion_length": 118.0, "epoch": 0.15406442997830078, "grad_norm": 0.009312363341450691, "kl": 0.009238087572157383, "learning_rate": 8.9761489414725e-08, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 923 }, { "completion_length": 256.0, "epoch": 0.1542313470205308, "grad_norm": 0.5169605016708374, "kl": 0.004755707923322916, "learning_rate": 8.745876381922147e-08, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 924 }, { "completion_length": 221.5, "epoch": 0.1543982640627608, "grad_norm": 0.002694807481020689, "kl": 0.0009950060630217195, "learning_rate": 8.518543427732951e-08, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 925 }, { "completion_length": 256.0, "epoch": 0.15456518110499082, "grad_norm": 0.4558197855949402, "kl": 0.004176877439022064, "learning_rate": 8.294152848885156e-08, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 926 }, { "completion_length": 256.0, "epoch": 0.15473209814722083, "grad_norm": 0.0026055637281388044, "kl": 0.002046598820015788, "learning_rate": 8.072707379507217e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 927 }, { "completion_length": 204.0, "epoch": 0.15489901518945084, "grad_norm": 0.006314119789749384, "kl": 0.00278397835791111, "learning_rate": 7.854209717842231e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 928 }, { "completion_length": 136.5, "epoch": 0.15506593223168086, "grad_norm": 0.0042397934012115, "kl": 0.0034691779874265194, "learning_rate": 7.638662526215284e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 929 }, { "completion_length": 186.5, "epoch": 0.15523284927391087, "grad_norm": 0.4873674213886261, "kl": 0.0014185598120093346, "learning_rate": 7.426068431000883e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 930 }, { "completion_length": 231.0, "epoch": 0.15539976631614089, "grad_norm": 0.4493325352668762, "kl": 0.005158409476280212, "learning_rate": 7.216430022591009e-08, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 931 }, { "completion_length": 256.0, "epoch": 0.1555666833583709, "grad_norm": 0.004120826721191406, "kl": 0.0015174639411270618, "learning_rate": 7.009749855363457e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 932 }, { "completion_length": 180.5, "epoch": 0.1557336004006009, "grad_norm": 0.5835127234458923, "kl": 0.004220470320433378, "learning_rate": 6.806030447650879e-08, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 933 }, { "completion_length": 256.0, "epoch": 0.1559005174428309, "grad_norm": 0.0035010778810828924, "kl": 0.001267659361474216, "learning_rate": 6.605274281709929e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 934 }, { "completion_length": 256.0, "epoch": 0.1560674344850609, "grad_norm": 0.003016152884811163, "kl": 0.002599175553768873, "learning_rate": 6.407483803691216e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 935 }, { "completion_length": 256.0, "epoch": 0.15623435152729093, "grad_norm": 0.4573005437850952, "kl": 0.0030494886450469494, "learning_rate": 6.212661423609184e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 936 }, { "completion_length": 56.5, "epoch": 0.15640126856952094, "grad_norm": 0.019429225474596024, "kl": 0.0026581143029034138, "learning_rate": 6.020809515313141e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 937 }, { "completion_length": 69.5, "epoch": 0.15656818561175095, "grad_norm": 0.0044250194914639, "kl": 0.0011532610515132546, "learning_rate": 5.83193041645802e-08, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 938 }, { "completion_length": 90.0, "epoch": 0.15673510265398097, "grad_norm": 0.006088037975132465, "kl": 0.0013648029416799545, "learning_rate": 5.6460264284760316e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 939 }, { "completion_length": 256.0, "epoch": 0.15690201969621098, "grad_norm": 0.002076206961646676, "kl": 0.0008977483958005905, "learning_rate": 5.463099816548578e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 940 }, { "completion_length": 256.0, "epoch": 0.157068936738441, "grad_norm": 0.4976649284362793, "kl": 0.006934022530913353, "learning_rate": 5.283152809578751e-08, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 941 }, { "completion_length": 90.0, "epoch": 0.157235853780671, "grad_norm": 0.007505532819777727, "kl": 0.004286427050828934, "learning_rate": 5.106187600163987e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 942 }, { "completion_length": 207.0, "epoch": 0.15740277082290102, "grad_norm": 0.0026391535066068172, "kl": 0.0012901662848889828, "learning_rate": 4.932206344569562e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 943 }, { "completion_length": 171.0, "epoch": 0.15756968786513104, "grad_norm": 0.004229592625051737, "kl": 0.0031001782044768333, "learning_rate": 4.761211162702117e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 944 }, { "completion_length": 92.5, "epoch": 0.15773660490736105, "grad_norm": 0.010589242912828922, "kl": 0.006883353926241398, "learning_rate": 4.593204138084006e-08, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 945 }, { "completion_length": 149.0, "epoch": 0.15790352194959106, "grad_norm": 0.6355993151664734, "kl": 0.01794269308447838, "learning_rate": 4.428187317827848e-08, "loss": 0.0007, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 946 }, { "completion_length": 80.0, "epoch": 0.15807043899182108, "grad_norm": 0.01792294718325138, "kl": 0.013799919746816158, "learning_rate": 4.26616271261146e-08, "loss": 0.0006, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 947 }, { "completion_length": 256.0, "epoch": 0.15823735603405106, "grad_norm": 0.003109060227870941, "kl": 0.002601678017526865, "learning_rate": 4.1071322966535487e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 948 }, { "completion_length": 167.5, "epoch": 0.15840427307628108, "grad_norm": 0.9559182524681091, "kl": 0.013255510479211807, "learning_rate": 3.95109800768953e-08, "loss": 0.0005, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 949 }, { "completion_length": 224.5, "epoch": 0.1585711901185111, "grad_norm": 0.00369926355779171, "kl": 0.0026472900062799454, "learning_rate": 3.798061746947995e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 950 }, { "completion_length": 51.5, "epoch": 0.1587381071607411, "grad_norm": 0.007497505750507116, "kl": 0.012396030128002167, "learning_rate": 3.648025379127479e-08, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 951 }, { "completion_length": 181.0, "epoch": 0.15890502420297112, "grad_norm": 0.004028906114399433, "kl": 0.0019237271044403315, "learning_rate": 3.5009907323737826e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 952 }, { "completion_length": 161.5, "epoch": 0.15907194124520113, "grad_norm": 0.014116289094090462, "kl": 0.007392593659460545, "learning_rate": 3.3569595982576584e-08, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 953 }, { "completion_length": 256.0, "epoch": 0.15923885828743115, "grad_norm": 0.003214628668501973, "kl": 0.0013086342951282859, "learning_rate": 3.2159337317530234e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 954 }, { "completion_length": 162.0, "epoch": 0.15940577532966116, "grad_norm": 0.8468399047851562, "kl": 0.01043287105858326, "learning_rate": 3.077914851215585e-08, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 955 }, { "completion_length": 223.0, "epoch": 0.15957269237189117, "grad_norm": 0.00315938051789999, "kl": 0.0021718821953982115, "learning_rate": 2.9429046383618042e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 956 }, { "completion_length": 183.5, "epoch": 0.1597396094141212, "grad_norm": 0.7200230956077576, "kl": 0.007411726284772158, "learning_rate": 2.810904738248549e-08, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 957 }, { "completion_length": 75.0, "epoch": 0.1599065264563512, "grad_norm": 0.005140088498592377, "kl": 0.0027749394066631794, "learning_rate": 2.681916759252917e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 958 }, { "completion_length": 141.0, "epoch": 0.1600734434985812, "grad_norm": 0.0062827239744365215, "kl": 0.004846738651394844, "learning_rate": 2.555942273052753e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 959 }, { "completion_length": 194.5, "epoch": 0.16024036054081123, "grad_norm": 0.5293864607810974, "kl": 0.004886666312813759, "learning_rate": 2.4329828146074096e-08, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 960 }, { "completion_length": 194.0, "epoch": 0.16040727758304124, "grad_norm": 0.004061645828187466, "kl": 0.002863900735974312, "learning_rate": 2.313039882139101e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 961 }, { "completion_length": 66.0, "epoch": 0.16057419462527123, "grad_norm": 0.013109861873090267, "kl": 0.0038286433555185795, "learning_rate": 2.1961149371145795e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 962 }, { "completion_length": 256.0, "epoch": 0.16074111166750124, "grad_norm": 0.0026470397133380175, "kl": 0.0009522895561531186, "learning_rate": 2.082209404227403e-08, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 963 }, { "completion_length": 255.5, "epoch": 0.16090802870973125, "grad_norm": 0.5309305787086487, "kl": 0.0026301685720682144, "learning_rate": 1.9713246713805588e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 964 }, { "completion_length": 166.0, "epoch": 0.16107494575196127, "grad_norm": 0.700700581073761, "kl": 0.005935050547122955, "learning_rate": 1.8634620896695044e-08, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 965 }, { "completion_length": 256.0, "epoch": 0.16124186279419128, "grad_norm": 0.009644546546041965, "kl": 0.0034475401043891907, "learning_rate": 1.7586229733657646e-08, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 966 }, { "completion_length": 242.0, "epoch": 0.1614087798364213, "grad_norm": 0.00596182607114315, "kl": 0.007220048923045397, "learning_rate": 1.6568085999008886e-08, "loss": 0.0003, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 967 }, { "completion_length": 234.5, "epoch": 0.1615756968786513, "grad_norm": 0.531011700630188, "kl": 0.0032313859555870295, "learning_rate": 1.5580202098509078e-08, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 968 }, { "completion_length": 256.0, "epoch": 0.16174261392088132, "grad_norm": 0.00563878333196044, "kl": 0.003927887417376041, "learning_rate": 1.4622590069211517e-08, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 969 }, { "completion_length": 256.0, "epoch": 0.16190953096311134, "grad_norm": 0.002992888679727912, "kl": 0.0008594400715082884, "learning_rate": 1.3695261579316776e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 970 }, { "completion_length": 202.5, "epoch": 0.16207644800534135, "grad_norm": 0.0030997898429632187, "kl": 0.0015460936119779944, "learning_rate": 1.2798227928029483e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 971 }, { "completion_length": 256.0, "epoch": 0.16224336504757136, "grad_norm": 0.0025505644734948874, "kl": 0.001887658960185945, "learning_rate": 1.193150004542204e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 972 }, { "completion_length": 234.0, "epoch": 0.16241028208980138, "grad_norm": 0.0035507562570273876, "kl": 0.001689232885837555, "learning_rate": 1.109508849230001e-08, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 973 }, { "completion_length": 195.0, "epoch": 0.1625771991320314, "grad_norm": 0.0041445959359407425, "kl": 0.004734058864414692, "learning_rate": 1.0289003460074165e-08, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 974 }, { "completion_length": 256.0, "epoch": 0.1627441161742614, "grad_norm": 0.005934651009738445, "kl": 0.0033802478574216366, "learning_rate": 9.513254770636138e-09, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 975 }, { "completion_length": 207.0, "epoch": 0.1629110332164914, "grad_norm": 0.5480122566223145, "kl": 0.004450419917702675, "learning_rate": 8.767851876239075e-09, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 976 }, { "completion_length": 222.0, "epoch": 0.1630779502587214, "grad_norm": 0.5008136034011841, "kl": 0.003711721859872341, "learning_rate": 8.052803859382174e-09, "loss": 0.0001, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 977 }, { "completion_length": 149.0, "epoch": 0.16324486730095142, "grad_norm": 1.4795379638671875, "kl": 0.004235450178384781, "learning_rate": 7.368119432699383e-09, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 978 }, { "completion_length": 195.0, "epoch": 0.16341178434318143, "grad_norm": 0.009295170195400715, "kl": 0.009135499596595764, "learning_rate": 6.7138069388547614e-09, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 979 }, { "completion_length": 76.0, "epoch": 0.16357870138541145, "grad_norm": 0.014273913577198982, "kl": 0.009966228157281876, "learning_rate": 6.089874350439507e-09, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 980 }, { "completion_length": 251.5, "epoch": 0.16374561842764146, "grad_norm": 0.5024834275245667, "kl": 0.004165974911302328, "learning_rate": 5.4963292698750896e-09, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 981 }, { "completion_length": 232.0, "epoch": 0.16391253546987147, "grad_norm": 0.003592568449676037, "kl": 0.002369510242715478, "learning_rate": 4.933178929321103e-09, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 982 }, { "completion_length": 99.5, "epoch": 0.1640794525121015, "grad_norm": 0.004043578170239925, "kl": 0.005188662093132734, "learning_rate": 4.400430190586724e-09, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 983 }, { "completion_length": 164.5, "epoch": 0.1642463695543315, "grad_norm": 0.8307241201400757, "kl": 0.006935087963938713, "learning_rate": 3.8980895450474455e-09, "loss": 0.0003, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 984 }, { "completion_length": 256.0, "epoch": 0.16441328659656151, "grad_norm": 0.003296250244602561, "kl": 0.0034450627863407135, "learning_rate": 3.4261631135654174e-09, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 985 }, { "completion_length": 256.0, "epoch": 0.16458020363879153, "grad_norm": 0.0031280957628041506, "kl": 0.0026572002097964287, "learning_rate": 2.984656646415063e-09, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 986 }, { "completion_length": 166.5, "epoch": 0.16474712068102154, "grad_norm": 0.002538673346862197, "kl": 0.0011523328721523285, "learning_rate": 2.573575523213412e-09, "loss": 0.0, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 987 }, { "completion_length": 64.5, "epoch": 0.16491403772325156, "grad_norm": 0.012686343863606453, "kl": 0.010003810748457909, "learning_rate": 2.192924752854042e-09, "loss": 0.0004, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 988 }, { "completion_length": 256.0, "epoch": 0.16508095476548157, "grad_norm": 0.5312541127204895, "kl": 0.005845358595252037, "learning_rate": 1.842708973447127e-09, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 989 }, { "completion_length": 220.5, "epoch": 0.16524787180771155, "grad_norm": 0.006919648963958025, "kl": 0.0037374142557382584, "learning_rate": 1.5229324522605949e-09, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 990 }, { "completion_length": 256.0, "epoch": 0.16541478884994157, "grad_norm": 0.003939046524465084, "kl": 0.004600961692631245, "learning_rate": 1.2335990856710001e-09, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 991 }, { "completion_length": 252.0, "epoch": 0.16558170589217158, "grad_norm": 0.003164430381730199, "kl": 0.003024528967216611, "learning_rate": 9.747123991141193e-10, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 992 }, { "completion_length": 232.0, "epoch": 0.1657486229344016, "grad_norm": 0.5451778173446655, "kl": 0.005355913657695055, "learning_rate": 7.462755470422078e-10, "loss": 0.0002, "reward": 0.1875, "reward_std": 0.0883883461356163, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1875, "step": 993 }, { "completion_length": 221.0, "epoch": 0.1659155399766316, "grad_norm": 0.0025971492286771536, "kl": 0.0012575022410601377, "learning_rate": 5.48291312886251e-10, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 994 }, { "completion_length": 81.5, "epoch": 0.16608245701886162, "grad_norm": 0.014851098880171776, "kl": 0.012549110688269138, "learning_rate": 3.8076210902182607e-10, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 995 }, { "completion_length": 187.5, "epoch": 0.16624937406109164, "grad_norm": 0.0035904680844396353, "kl": 0.00317639228887856, "learning_rate": 2.43689976739403e-10, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 996 }, { "completion_length": 160.0, "epoch": 0.16641629110332165, "grad_norm": 0.004597581923007965, "kl": 0.0028870534151792526, "learning_rate": 1.3707658621964216e-10, "loss": 0.0001, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 997 }, { "completion_length": 55.0, "epoch": 0.16658320814555166, "grad_norm": 0.010526842437684536, "kl": 0.011365446262061596, "learning_rate": 6.092323651313293e-11, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 998 }, { "completion_length": 132.5, "epoch": 0.16675012518778168, "grad_norm": 0.006425110623240471, "kl": 0.004926392808556557, "learning_rate": 1.5230855524017708e-11, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 999 }, { "completion_length": 250.0, "epoch": 0.1669170422300117, "grad_norm": 0.003824537852779031, "kl": 0.004088413901627064, "learning_rate": 0.0, "loss": 0.0002, "reward": 0.25, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.25, "step": 1000 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }