{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982449982449982, "eval_steps": 500, "global_step": 474, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 142.80611928304037, "epoch": 0.002106002106002106, "grad_norm": 1.0551681518554688, "kl": 0.0, "learning_rate": 2e-08, "loss": 0.0002, "reward": 2.131795952717463, "reward_std": 6.255450010299683, "rewards/citation_reward_func": 3.8265305360158286, "rewards/correctness_reward_func": 1.3775509695212047, "rewards/formatting_reward_func": 0.4863945543766022, "rewards/length_reward_func": -3.1122448245684304, "rewards/penalize_wrong_passages_reward_func": -0.9319727768500646, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4855373551448186, "step": 1 }, { "completion_length": 142.05781936645508, "epoch": 0.004212004212004212, "grad_norm": 0.9199629426002502, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.0004, "reward": 2.249255125721296, "reward_std": 5.873880942662557, "rewards/citation_reward_func": 3.8095237016677856, "rewards/correctness_reward_func": 1.8027210533618927, "rewards/formatting_reward_func": 0.47789114713668823, "rewards/length_reward_func": -3.3673468430836997, "rewards/penalize_wrong_passages_reward_func": -0.952380950252215, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.47884689768155414, "step": 2 }, { "completion_length": 147.4081598917643, "epoch": 0.006318006318006318, "grad_norm": 0.9852209687232971, "kl": 0.001129150390625, "learning_rate": 6e-08, "loss": 0.0, "reward": 3.1659286667903266, "reward_std": 5.810667594273885, "rewards/citation_reward_func": 3.784013589223226, "rewards/correctness_reward_func": 2.0068026781082153, "rewards/formatting_reward_func": 0.4812925159931183, "rewards/length_reward_func": -2.602040797472, "rewards/penalize_wrong_passages_reward_func": -0.9863945345083872, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4822550565004349, "step": 3 }, { "completion_length": 149.8843510945638, "epoch": 0.008424008424008424, "grad_norm": 1.000421404838562, "kl": 0.0008897781372070312, "learning_rate": 8e-08, "loss": 0.0, "reward": 1.7928027113278706, "reward_std": 6.236873547236125, "rewards/citation_reward_func": 3.647959073384603, "rewards/correctness_reward_func": 1.7517006198565166, "rewards/formatting_reward_func": 0.48299319048722583, "rewards/length_reward_func": -3.163265268007914, "rewards/penalize_wrong_passages_reward_func": -1.2380952139695485, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.48157819112141925, "step": 4 }, { "completion_length": 142.11224365234375, "epoch": 0.01053001053001053, "grad_norm": 0.9926426410675049, "kl": 0.0012919108072916667, "learning_rate": 1e-07, "loss": 0.0005, "reward": 2.159136086702347, "reward_std": 5.8599865436553955, "rewards/citation_reward_func": 3.562925100326538, "rewards/correctness_reward_func": 2.142857084671656, "rewards/formatting_reward_func": 0.4863945593436559, "rewards/length_reward_func": -3.6224488814671836, "rewards/penalize_wrong_passages_reward_func": -0.8979591826597849, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48736729224522907, "step": 5 }, { "completion_length": 141.88434982299805, "epoch": 0.012636012636012635, "grad_norm": 0.8212317228317261, "kl": 0.001293182373046875, "learning_rate": 1.2e-07, "loss": 0.0007, "reward": 3.040091894567013, "reward_std": 5.166776895523071, "rewards/citation_reward_func": 3.4778910080591836, "rewards/correctness_reward_func": 1.9557822744051616, "rewards/formatting_reward_func": 0.4880952388048172, "rewards/length_reward_func": -2.704081575075785, "rewards/penalize_wrong_passages_reward_func": -0.6666666567325592, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4890713890393575, "step": 6 }, { "completion_length": 144.5646209716797, "epoch": 0.014742014742014743, "grad_norm": 1.0746428966522217, "kl": 0.0010700225830078125, "learning_rate": 1.4e-07, "loss": 0.0004, "reward": 2.7565646121899285, "reward_std": 5.139555593331655, "rewards/citation_reward_func": 3.7499999602635703, "rewards/correctness_reward_func": 2.0068026781082153, "rewards/formatting_reward_func": 0.4880952338377635, "rewards/length_reward_func": -2.908163234591484, "rewards/penalize_wrong_passages_reward_func": -1.068027178446452, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4878570834795634, "step": 7 }, { "completion_length": 145.99659474690756, "epoch": 0.016848016848016848, "grad_norm": 0.9713934659957886, "kl": 0.0009145736694335938, "learning_rate": 1.6e-07, "loss": 0.0003, "reward": 3.9408061106999717, "reward_std": 5.972306688626607, "rewards/citation_reward_func": 3.741496523221334, "rewards/correctness_reward_func": 2.0068026781082153, "rewards/formatting_reward_func": 0.4863945543766022, "rewards/length_reward_func": -2.19387752811114, "rewards/penalize_wrong_passages_reward_func": -0.5850339954098066, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48502375682195026, "step": 8 }, { "completion_length": 158.15305836995444, "epoch": 0.018954018954018954, "grad_norm": 1.0545966625213623, "kl": 0.0009892781575520833, "learning_rate": 1.8e-07, "loss": 0.0, "reward": 1.793391227722168, "reward_std": 7.030755837758382, "rewards/citation_reward_func": 3.9115644693374634, "rewards/correctness_reward_func": 1.4795918067296345, "rewards/formatting_reward_func": 0.4829931954542796, "rewards/length_reward_func": -3.2142856319745383, "rewards/penalize_wrong_passages_reward_func": -1.1768707136313121, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.48046593368053436, "step": 9 }, { "completion_length": 141.08503214518228, "epoch": 0.02106002106002106, "grad_norm": 0.8638601899147034, "kl": 0.0014311472574869792, "learning_rate": 2e-07, "loss": 0.0005, "reward": 0.5391122450431188, "reward_std": 6.993181943893433, "rewards/citation_reward_func": 3.545918345451355, "rewards/correctness_reward_func": 1.343537410100301, "rewards/formatting_reward_func": 0.46598638594150543, "rewards/length_reward_func": -4.336734573046367, "rewards/penalize_wrong_passages_reward_func": -0.9455782324075699, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.46598294874032337, "step": 10 }, { "completion_length": 150.2993189493815, "epoch": 0.023166023166023165, "grad_norm": 0.9541848301887512, "kl": 0.0010693868001302083, "learning_rate": 2.1999999999999998e-07, "loss": 0.0004, "reward": 0.8768775562445322, "reward_std": 7.34400741259257, "rewards/citation_reward_func": 3.860544204711914, "rewards/correctness_reward_func": 1.7006802260875702, "rewards/formatting_reward_func": 0.47278910875320435, "rewards/length_reward_func": -4.438775380452474, "rewards/penalize_wrong_passages_reward_func": -1.19047615925471, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4721156011025111, "step": 11 }, { "completion_length": 146.27210235595703, "epoch": 0.02527202527202527, "grad_norm": 1.0617852210998535, "kl": 0.0011603037516276042, "learning_rate": 2.4e-07, "loss": 0.0001, "reward": 2.064258575439453, "reward_std": 5.884495735168457, "rewards/citation_reward_func": 3.8435373306274414, "rewards/correctness_reward_func": 1.8197278082370758, "rewards/formatting_reward_func": 0.4812925159931183, "rewards/length_reward_func": -3.5204080740610757, "rewards/penalize_wrong_passages_reward_func": -1.0408163170019786, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48092512786388397, "step": 12 }, { "completion_length": 142.45578002929688, "epoch": 0.02737802737802738, "grad_norm": 1.0271327495574951, "kl": 0.0016148885091145833, "learning_rate": 2.6e-07, "loss": 0.0009, "reward": 3.8155850768089294, "reward_std": 5.828161716461182, "rewards/citation_reward_func": 3.6819727023442588, "rewards/correctness_reward_func": 2.6360543171564736, "rewards/formatting_reward_func": 0.47959182659784955, "rewards/length_reward_func": -2.7551019390424094, "rewards/penalize_wrong_passages_reward_func": -0.7074829836686453, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48055097957452136, "step": 13 }, { "completion_length": 151.4897918701172, "epoch": 0.029484029484029485, "grad_norm": 0.853244960308075, "kl": 0.0016167958577473958, "learning_rate": 2.8e-07, "loss": 0.0014, "reward": 3.293789138396581, "reward_std": 5.249532063802083, "rewards/citation_reward_func": 3.7499999602635703, "rewards/correctness_reward_func": 2.1598638892173767, "rewards/formatting_reward_func": 0.48979591329892475, "rewards/length_reward_func": -2.2959182957808175, "rewards/penalize_wrong_passages_reward_func": -1.2993196845054626, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4893672863642375, "step": 14 }, { "completion_length": 137.7653045654297, "epoch": 0.03159003159003159, "grad_norm": 1.0360064506530762, "kl": 0.0015691121419270833, "learning_rate": 3e-07, "loss": 0.0002, "reward": 0.6607789664218823, "reward_std": 7.108782927195231, "rewards/citation_reward_func": 3.4523808558781943, "rewards/correctness_reward_func": 1.2925169517596562, "rewards/formatting_reward_func": 0.4574829836686452, "rewards/length_reward_func": -3.979591647783915, "rewards/penalize_wrong_passages_reward_func": -1.0204081336657207, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4583979199330012, "step": 15 }, { "completion_length": 146.68026733398438, "epoch": 0.033696033696033696, "grad_norm": 1.042144775390625, "kl": 0.0016771952311197917, "learning_rate": 3.2e-07, "loss": 0.0008, "reward": 1.779846937706073, "reward_std": 6.210790753364563, "rewards/citation_reward_func": 3.809523661931356, "rewards/correctness_reward_func": 1.530612200498581, "rewards/formatting_reward_func": 0.46768706540266675, "rewards/length_reward_func": -3.775510162115097, "rewards/penalize_wrong_passages_reward_func": -0.7210884292920431, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.468622421224912, "step": 16 }, { "completion_length": 155.00679779052734, "epoch": 0.0358020358020358, "grad_norm": 0.8416888117790222, "kl": 0.001552581787109375, "learning_rate": 3.4000000000000003e-07, "loss": 0.0007, "reward": 1.9689354697863262, "reward_std": 5.606051802635193, "rewards/citation_reward_func": 3.5884352922439575, "rewards/correctness_reward_func": 0.9693877349297205, "rewards/formatting_reward_func": 0.4812925110260646, "rewards/length_reward_func": -2.9591835737228394, "rewards/penalize_wrong_passages_reward_func": -0.591836716979742, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4808400919040044, "step": 17 }, { "completion_length": 138.35713831583658, "epoch": 0.03790803790803791, "grad_norm": 1.098335862159729, "kl": 0.002567291259765625, "learning_rate": 3.6e-07, "loss": 0.0, "reward": 4.020343641440074, "reward_std": 5.701698939005534, "rewards/citation_reward_func": 3.6989795764287314, "rewards/correctness_reward_func": 2.4659862915674844, "rewards/formatting_reward_func": 0.47789115210374195, "rewards/length_reward_func": -2.4999999403953552, "rewards/penalize_wrong_passages_reward_func": -0.5986394435167313, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4761258115371068, "step": 18 }, { "completion_length": 134.54761505126953, "epoch": 0.04001404001404001, "grad_norm": 1.1855769157409668, "kl": 0.0048675537109375, "learning_rate": 3.7999999999999996e-07, "loss": 0.0007, "reward": 3.8262110551198325, "reward_std": 4.684337218602498, "rewards/citation_reward_func": 3.656462550163269, "rewards/correctness_reward_func": 2.057823101679484, "rewards/formatting_reward_func": 0.4812925159931183, "rewards/length_reward_func": -2.3469387193520865, "rewards/penalize_wrong_passages_reward_func": -0.5034013539552689, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4809727370738983, "step": 19 }, { "completion_length": 138.53060658772787, "epoch": 0.04212004212004212, "grad_norm": 1.0796778202056885, "kl": 0.005633036295572917, "learning_rate": 4e-07, "loss": 0.0003, "reward": 1.8895781971514225, "reward_std": 5.371582110722859, "rewards/citation_reward_func": 3.4608842929204306, "rewards/correctness_reward_func": 1.0374149332443874, "rewards/formatting_reward_func": 0.4855442096789678, "rewards/length_reward_func": -2.90816322962443, "rewards/penalize_wrong_passages_reward_func": -0.6734693745772043, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4873672773440679, "step": 20 }, { "completion_length": 151.2653020222982, "epoch": 0.044226044226044224, "grad_norm": 1.172623634338379, "kl": 0.008516947428385416, "learning_rate": 4.1999999999999995e-07, "loss": 0.0004, "reward": 2.291476254661878, "reward_std": 6.13926366964976, "rewards/citation_reward_func": 3.579931934674581, "rewards/correctness_reward_func": 1.4965985814730327, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -2.959183613459269, "rewards/penalize_wrong_passages_reward_func": -0.8163265287876129, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.493857075770696, "step": 21 }, { "completion_length": 144.25850041707358, "epoch": 0.04633204633204633, "grad_norm": 0.9470953345298767, "kl": 0.007146199544270833, "learning_rate": 4.3999999999999997e-07, "loss": 0.0009, "reward": 3.1404490868250527, "reward_std": 4.314921895662944, "rewards/citation_reward_func": 3.9795918067296348, "rewards/correctness_reward_func": 0.9863945146401724, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -2.4999999503294625, "rewards/penalize_wrong_passages_reward_func": -0.3197278883308172, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 22 }, { "completion_length": 135.66666158040366, "epoch": 0.048438048438048435, "grad_norm": 0.7583827376365662, "kl": 0.007680257161458333, "learning_rate": 4.6e-07, "loss": 0.001, "reward": 2.0998741885026297, "reward_std": 4.641193389892578, "rewards/citation_reward_func": 3.6394556760787964, "rewards/correctness_reward_func": 1.3775509943564732, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -2.8061223874489465, "rewards/penalize_wrong_passages_reward_func": -1.102040817340215, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4944319079319636, "step": 23 }, { "completion_length": 151.2006746927897, "epoch": 0.05054405054405054, "grad_norm": 0.83041912317276, "kl": 0.008819580078125, "learning_rate": 4.8e-07, "loss": 0.0005, "reward": 2.3360307614008584, "reward_std": 4.612976272900899, "rewards/citation_reward_func": 3.7329931259155273, "rewards/correctness_reward_func": 1.4625850121180217, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -2.8571428110202155, "rewards/penalize_wrong_passages_reward_func": -0.8299319495757421, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 24 }, { "completion_length": 143.89795684814453, "epoch": 0.05265005265005265, "grad_norm": 0.8499120473861694, "kl": 0.0132293701171875, "learning_rate": 5e-07, "loss": 0.0005, "reward": 3.760387728611628, "reward_std": 4.180753588676453, "rewards/citation_reward_func": 3.767006754875183, "rewards/correctness_reward_func": 2.1768707036972046, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -2.4999999701976776, "rewards/penalize_wrong_passages_reward_func": -0.6802720973889033, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49848292271296185, "step": 25 }, { "completion_length": 146.84693400065103, "epoch": 0.05475605475605476, "grad_norm": 0.649495542049408, "kl": 0.008579254150390625, "learning_rate": 5.2e-07, "loss": 0.0012, "reward": 3.507796049118042, "reward_std": 4.758268475532532, "rewards/citation_reward_func": 3.741496443748474, "rewards/correctness_reward_func": 1.8707482516765594, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -2.295918345451355, "rewards/penalize_wrong_passages_reward_func": -0.8027210781971613, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 26 }, { "completion_length": 145.1700668334961, "epoch": 0.056862056862056864, "grad_norm": 0.8985055088996887, "kl": 0.01629638671875, "learning_rate": 5.4e-07, "loss": 0.0008, "reward": 2.723782400290171, "reward_std": 4.800549666086833, "rewards/citation_reward_func": 3.4778910875320435, "rewards/correctness_reward_func": 1.8707482516765594, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -3.061224400997162, "rewards/penalize_wrong_passages_reward_func": -0.5578231147180001, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 27 }, { "completion_length": 148.224484761556, "epoch": 0.05896805896805897, "grad_norm": 1.046953797340393, "kl": 0.02532958984375, "learning_rate": 5.6e-07, "loss": 0.0005, "reward": 3.3295952950914702, "reward_std": 5.556566874186198, "rewards/citation_reward_func": 3.579931855201721, "rewards/correctness_reward_func": 2.4149659176667533, "rewards/formatting_reward_func": 0.4948979616165161, "rewards/length_reward_func": -2.551020344098409, "rewards/penalize_wrong_passages_reward_func": -0.9319727768500646, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4928604910771052, "step": 28 }, { "completion_length": 161.9489771525065, "epoch": 0.061074061074061076, "grad_norm": 0.9288030862808228, "kl": 0.057291666666666664, "learning_rate": 5.8e-07, "loss": 0.0007, "reward": 1.887748343249162, "reward_std": 5.772057731946309, "rewards/citation_reward_func": 3.4523808558781943, "rewards/correctness_reward_func": 1.5986394186814625, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -3.0612244407335916, "rewards/penalize_wrong_passages_reward_func": -0.9251700465877851, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4965917766094208, "step": 29 }, { "completion_length": 141.9795913696289, "epoch": 0.06318006318006318, "grad_norm": 0.994685709476471, "kl": 0.0557861328125, "learning_rate": 6e-07, "loss": 0.0018, "reward": 4.592092037200928, "reward_std": 4.368195136388143, "rewards/citation_reward_func": 3.945578098297119, "rewards/correctness_reward_func": 2.414965867996216, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -2.1428571144739785, "rewards/penalize_wrong_passages_reward_func": -0.619047611951828, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.493452325463295, "step": 30 }, { "completion_length": 149.51360066731772, "epoch": 0.06528606528606529, "grad_norm": 0.8815411329269409, "kl": 0.058308919270833336, "learning_rate": 6.2e-07, "loss": 0.0009, "reward": 4.303721110026042, "reward_std": 5.176619450251262, "rewards/citation_reward_func": 4.2176869710286455, "rewards/correctness_reward_func": 1.9727890690167744, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.989795873562495, "rewards/penalize_wrong_passages_reward_func": -0.8979591627915701, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 31 }, { "completion_length": 153.89795684814453, "epoch": 0.06739206739206739, "grad_norm": 1.1185945272445679, "kl": 0.0631103515625, "learning_rate": 6.4e-07, "loss": 0.0007, "reward": 2.2272075613339744, "reward_std": 5.821100076039632, "rewards/citation_reward_func": 3.945578138033549, "rewards/correctness_reward_func": 2.023809482653936, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -3.0612244606018066, "rewards/penalize_wrong_passages_reward_func": -1.6802720924218495, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4993162552515666, "step": 32 }, { "completion_length": 144.96258163452148, "epoch": 0.0694980694980695, "grad_norm": 0.9808516502380371, "kl": 0.0740966796875, "learning_rate": 6.6e-07, "loss": 0.0014, "reward": 5.15746267636617, "reward_std": 4.294728080431621, "rewards/citation_reward_func": 3.622448960940043, "rewards/correctness_reward_func": 2.772108813126882, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.632653037707011, "rewards/penalize_wrong_passages_reward_func": -0.6054421712954839, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 33 }, { "completion_length": 145.58162943522134, "epoch": 0.0716040716040716, "grad_norm": 0.9893280863761902, "kl": 0.102783203125, "learning_rate": 6.800000000000001e-07, "loss": 0.0017, "reward": 6.129275401433309, "reward_std": 3.9419746001561484, "rewards/citation_reward_func": 4.022108713785808, "rewards/correctness_reward_func": 2.9081631700197854, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -1.3265306105216343, "rewards/penalize_wrong_passages_reward_func": -0.4693877398967743, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4966223786274592, "step": 34 }, { "completion_length": 144.02720896402994, "epoch": 0.07371007371007371, "grad_norm": 1.1553752422332764, "kl": 0.09427897135416667, "learning_rate": 7e-07, "loss": 0.0012, "reward": 5.2912074228127794, "reward_std": 5.189612110455831, "rewards/citation_reward_func": 4.090135892232259, "rewards/correctness_reward_func": 2.568027118841807, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.7346938451131184, "rewards/penalize_wrong_passages_reward_func": -0.6326530476411184, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5003910859425863, "step": 35 }, { "completion_length": 138.0680249532064, "epoch": 0.07581607581607581, "grad_norm": 1.5305249691009521, "kl": 0.14192708333333334, "learning_rate": 7.2e-07, "loss": 0.0018, "reward": 3.28841503461202, "reward_std": 4.651200453440349, "rewards/citation_reward_func": 3.6139455238978067, "rewards/correctness_reward_func": 1.7517006198565166, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.7857142686843872, "rewards/penalize_wrong_passages_reward_func": -1.2925169666608174, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 36 }, { "completion_length": 143.6292495727539, "epoch": 0.07792207792207792, "grad_norm": 0.9717904925346375, "kl": 0.159912109375, "learning_rate": 7.4e-07, "loss": 0.002, "reward": 4.3089354038238525, "reward_std": 3.8151880502700806, "rewards/citation_reward_func": 3.5969387690226235, "rewards/correctness_reward_func": 1.870748261610667, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.530612200498581, "rewards/penalize_wrong_passages_reward_func": -0.6190476020177206, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4909080962340037, "step": 37 }, { "completion_length": 145.60544077555338, "epoch": 0.08002808002808003, "grad_norm": 1.1155370473861694, "kl": 0.19954427083333334, "learning_rate": 7.599999999999999e-07, "loss": 0.0022, "reward": 4.726190567016602, "reward_std": 3.97695521513621, "rewards/citation_reward_func": 4.03911550839742, "rewards/correctness_reward_func": 2.2619047264258065, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.3775510092576344, "rewards/penalize_wrong_passages_reward_func": -1.183673453827699, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4863944947719574, "step": 38 }, { "completion_length": 143.42516581217447, "epoch": 0.08213408213408213, "grad_norm": 0.9343476891517639, "kl": 0.14713541666666666, "learning_rate": 7.799999999999999e-07, "loss": 0.0019, "reward": 4.451527198155721, "reward_std": 3.3758476177851358, "rewards/citation_reward_func": 3.9795917669932046, "rewards/correctness_reward_func": 1.1564625600973766, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.224489798148473, "rewards/penalize_wrong_passages_reward_func": -0.45578230917453766, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49574483434359234, "step": 39 }, { "completion_length": 139.33672841389975, "epoch": 0.08424008424008424, "grad_norm": 0.9100993275642395, "kl": 0.197265625, "learning_rate": 8e-07, "loss": 0.0031, "reward": 5.527904828389485, "reward_std": 3.306452294190725, "rewards/citation_reward_func": 3.784013509750366, "rewards/correctness_reward_func": 2.1938775181770325, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.9693877448638281, "rewards/penalize_wrong_passages_reward_func": -0.4761904676755269, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49559177954991657, "step": 40 }, { "completion_length": 125.27891031901042, "epoch": 0.08634608634608634, "grad_norm": 1.0793579816818237, "kl": 0.2594401041666667, "learning_rate": 8.199999999999999e-07, "loss": 0.0033, "reward": 5.9822925726572675, "reward_std": 3.2602258125940957, "rewards/citation_reward_func": 3.954081575075785, "rewards/correctness_reward_func": 2.1938775181770325, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.8673469324906667, "rewards/penalize_wrong_passages_reward_func": -0.29931971554954845, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 41 }, { "completion_length": 140.08503341674805, "epoch": 0.08845208845208845, "grad_norm": 1.0207041501998901, "kl": 0.3108723958333333, "learning_rate": 8.399999999999999e-07, "loss": 0.0033, "reward": 4.319027250011762, "reward_std": 3.1209686001141868, "rewards/citation_reward_func": 3.818027059237162, "rewards/correctness_reward_func": 1.4625849823156993, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.3775510042905807, "rewards/penalize_wrong_passages_reward_func": -0.5850340078274409, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 42 }, { "completion_length": 154.65986124674478, "epoch": 0.09055809055809055, "grad_norm": 0.9811875820159912, "kl": 0.3186848958333333, "learning_rate": 8.599999999999999e-07, "loss": 0.0035, "reward": 4.860387921333313, "reward_std": 4.377778013547261, "rewards/citation_reward_func": 4.030612150828044, "rewards/correctness_reward_func": 1.8537414173285167, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.4795918017625809, "rewards/penalize_wrong_passages_reward_func": -0.5442176821331183, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49984346826871234, "step": 43 }, { "completion_length": 161.1530558268229, "epoch": 0.09266409266409266, "grad_norm": 1.072112798690796, "kl": 0.3014322916666667, "learning_rate": 8.799999999999999e-07, "loss": 0.0035, "reward": 3.8310410181681314, "reward_std": 5.302052021026611, "rewards/citation_reward_func": 4.005101879437764, "rewards/correctness_reward_func": 1.4455781976381938, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.9387754797935486, "rewards/penalize_wrong_passages_reward_func": -0.6802720949053764, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49940809110800427, "step": 44 }, { "completion_length": 157.3537394205729, "epoch": 0.09477009477009476, "grad_norm": 1.0070186853408813, "kl": 0.31298828125, "learning_rate": 9e-07, "loss": 0.004, "reward": 6.792323112487793, "reward_std": 2.9815571308135986, "rewards/citation_reward_func": 3.8605441649754844, "rewards/correctness_reward_func": 3.010204037030538, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.6122448891401291, "rewards/penalize_wrong_passages_reward_func": -0.46258503446976346, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49640469749768573, "step": 45 }, { "completion_length": 148.18026987711588, "epoch": 0.09687609687609687, "grad_norm": 0.8120829463005066, "kl": 248.32552083333334, "learning_rate": 9.2e-07, "loss": 2.4913, "reward": 7.790044228235881, "reward_std": 2.8049110968907676, "rewards/citation_reward_func": 4.15816315015157, "rewards/correctness_reward_func": 3.5714284578959146, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.5102040817340215, "rewards/penalize_wrong_passages_reward_func": -0.42176870505015057, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49412578841050464, "step": 46 }, { "completion_length": 140.71428298950195, "epoch": 0.09898209898209898, "grad_norm": 1.2571049928665161, "kl": 0.498046875, "learning_rate": 9.399999999999999e-07, "loss": 0.0054, "reward": 5.644897858301799, "reward_std": 2.7742998798688254, "rewards/citation_reward_func": 3.74999992052714, "rewards/correctness_reward_func": 2.414965877930323, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.8673469225565592, "rewards/penalize_wrong_passages_reward_func": -0.6462584932645162, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4952380359172821, "step": 47 }, { "completion_length": 142.21088155110678, "epoch": 0.10108810108810108, "grad_norm": 1.004626750946045, "kl": 0.3631184895833333, "learning_rate": 9.6e-07, "loss": 0.004, "reward": 3.835642953713735, "reward_std": 3.7980151573816934, "rewards/citation_reward_func": 3.869047522544861, "rewards/correctness_reward_func": 1.1224489609400432, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.275510201851527, "rewards/penalize_wrong_passages_reward_func": -0.870748296380043, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4904047002394994, "step": 48 }, { "completion_length": 133.32992808024088, "epoch": 0.10319410319410319, "grad_norm": 1.8091953992843628, "kl": 0.5882161458333334, "learning_rate": 9.8e-07, "loss": 0.006, "reward": 4.456782499949138, "reward_std": 3.1944571336110434, "rewards/citation_reward_func": 3.6564625104268393, "rewards/correctness_reward_func": 1.9897959033648174, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.8163265387217203, "rewards/penalize_wrong_passages_reward_func": -1.3741496553023655, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 49 }, { "completion_length": 128.5748291015625, "epoch": 0.1053001053001053, "grad_norm": 16.53020668029785, "kl": 1.4166666666666667, "learning_rate": 1e-06, "loss": 0.0148, "reward": 7.325829982757568, "reward_std": 2.0804774363835654, "rewards/citation_reward_func": 3.6139454444249473, "rewards/correctness_reward_func": 2.976190368334452, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.1632653040190538, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 50 }, { "completion_length": 142.61224365234375, "epoch": 0.10740610740610741, "grad_norm": 2.8453311920166016, "kl": 2.1321614583333335, "learning_rate": 1e-06, "loss": 0.022, "reward": 5.023935596148173, "reward_std": 2.8823713461558023, "rewards/citation_reward_func": 3.911564509073893, "rewards/correctness_reward_func": 1.5476190348466237, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.0204081684350967, "rewards/penalize_wrong_passages_reward_func": -0.4149659772713979, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5001257807016373, "step": 51 }, { "completion_length": 171.63264973958334, "epoch": 0.10951210951210952, "grad_norm": 22.674123764038086, "kl": 1.5296223958333333, "learning_rate": 1e-06, "loss": 0.0155, "reward": 0.20642862717310587, "reward_std": 5.004753351211548, "rewards/citation_reward_func": 3.9965985218683877, "rewards/correctness_reward_func": 0.34013604124387103, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -4.5408161878585815, "rewards/penalize_wrong_passages_reward_func": -0.5850340028603872, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49554415543874103, "step": 52 }, { "completion_length": 151.25509897867838, "epoch": 0.11161811161811162, "grad_norm": 0.8755993247032166, "kl": 0.4990234375, "learning_rate": 1e-06, "loss": 0.0053, "reward": 5.218874295552571, "reward_std": 3.1620943943659463, "rewards/citation_reward_func": 3.8860543966293335, "rewards/correctness_reward_func": 1.7346938451131184, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.8163265337546667, "rewards/penalize_wrong_passages_reward_func": -0.585034000997742, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4994863271713257, "step": 53 }, { "completion_length": 149.2585016886393, "epoch": 0.11372411372411373, "grad_norm": 0.8691079616546631, "kl": 0.3743489583333333, "learning_rate": 1e-06, "loss": 0.0044, "reward": 5.637054522832234, "reward_std": 3.094425678253174, "rewards/citation_reward_func": 4.013605356216431, "rewards/correctness_reward_func": 1.7517006198565166, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.6122448941071829, "rewards/penalize_wrong_passages_reward_func": -0.5170068045457205, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 54 }, { "completion_length": 161.76190185546875, "epoch": 0.11583011583011583, "grad_norm": 1.1524864435195923, "kl": 0.4420572916666667, "learning_rate": 1e-06, "loss": 0.0045, "reward": 1.8751495977242787, "reward_std": 3.3608768383661904, "rewards/citation_reward_func": 3.996598561604818, "rewards/correctness_reward_func": 1.581632599234581, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -3.8265305956204734, "rewards/penalize_wrong_passages_reward_func": -0.8775509844223658, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 55 }, { "completion_length": 149.0646209716797, "epoch": 0.11793611793611794, "grad_norm": 1.3282649517059326, "kl": 0.5231119791666666, "learning_rate": 1e-06, "loss": 0.0053, "reward": 3.513479550679525, "reward_std": 3.8186222910881042, "rewards/citation_reward_func": 4.081632653872172, "rewards/correctness_reward_func": 1.2244897584120433, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -2.193877483407656, "rewards/penalize_wrong_passages_reward_func": -0.4285714191695054, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4998740802208583, "step": 56 }, { "completion_length": 158.3197250366211, "epoch": 0.12004212004212005, "grad_norm": 2.27993106842041, "kl": 0.6731770833333334, "learning_rate": 1e-06, "loss": 0.0069, "reward": 6.510074933369954, "reward_std": 2.7599647641181946, "rewards/citation_reward_func": 3.9115644693374634, "rewards/correctness_reward_func": 2.346938689549764, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -0.4353741407394409, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49306795994440716, "step": 57 }, { "completion_length": 155.25509897867838, "epoch": 0.12214812214812215, "grad_norm": 1.1884284019470215, "kl": 0.6569010416666666, "learning_rate": 1e-06, "loss": 0.0069, "reward": 6.011476318041484, "reward_std": 3.121876875559489, "rewards/citation_reward_func": 3.89455775419871, "rewards/correctness_reward_func": 2.1938775181770325, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.5102040767669678, "rewards/penalize_wrong_passages_reward_func": -0.5646258319417635, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49787067870299023, "step": 58 }, { "completion_length": 151.30951690673828, "epoch": 0.12425412425412426, "grad_norm": 1.67170250415802, "kl": 0.6803385416666666, "learning_rate": 1e-06, "loss": 0.0076, "reward": 6.520530700683594, "reward_std": 2.5361270904541016, "rewards/citation_reward_func": 4.0646257400512695, "rewards/correctness_reward_func": 2.3129251301288605, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.510204071799914, "rewards/penalize_wrong_passages_reward_func": -0.3401360474526882, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4950203349192937, "step": 59 }, { "completion_length": 128.4727872212728, "epoch": 0.12636012636012636, "grad_norm": 1.07207190990448, "kl": 0.6061197916666666, "learning_rate": 1e-06, "loss": 0.007, "reward": 8.203622579574585, "reward_std": 2.1818835139274597, "rewards/citation_reward_func": 4.022108793258667, "rewards/correctness_reward_func": 3.2993196646372476, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.11564625551303227, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49784007171789807, "step": 60 }, { "completion_length": 137.25169881184897, "epoch": 0.12846612846612845, "grad_norm": 1.870184063911438, "kl": 0.572265625, "learning_rate": 1e-06, "loss": 0.0066, "reward": 7.795095205307007, "reward_std": 2.1725188493728638, "rewards/citation_reward_func": 3.8435372511545816, "rewards/correctness_reward_func": 3.3843536376953125, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.3809523756305377, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4991768052180608, "step": 61 }, { "completion_length": 146.1836675008138, "epoch": 0.13057213057213057, "grad_norm": 0.9689912796020508, "kl": 0.541015625, "learning_rate": 1e-06, "loss": 0.0064, "reward": 6.3955680926640825, "reward_std": 2.069407343864441, "rewards/citation_reward_func": 3.9710882902145386, "rewards/correctness_reward_func": 2.04081629216671, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.4625850295027097, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4993094503879547, "step": 62 }, { "completion_length": 158.43877156575522, "epoch": 0.13267813267813267, "grad_norm": 1.0672656297683716, "kl": 0.5579427083333334, "learning_rate": 1e-06, "loss": 0.0065, "reward": 6.065329949061076, "reward_std": 2.4219020207722983, "rewards/citation_reward_func": 4.217687010765076, "rewards/correctness_reward_func": 1.632652997970581, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -0.4285714291036129, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5007040103276571, "step": 63 }, { "completion_length": 147.59183502197266, "epoch": 0.13478413478413478, "grad_norm": 0.9976738095283508, "kl": 0.6643880208333334, "learning_rate": 1e-06, "loss": 0.0075, "reward": 5.30202051003774, "reward_std": 2.7674037416776023, "rewards/citation_reward_func": 4.2261903285980225, "rewards/correctness_reward_func": 1.5306122203667958, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.5102040767669678, "rewards/penalize_wrong_passages_reward_func": -0.945578183978796, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 64 }, { "completion_length": 141.9965960184733, "epoch": 0.13689013689013688, "grad_norm": 0.8568897247314453, "kl": 1.1334635416666667, "learning_rate": 1e-06, "loss": 0.0132, "reward": 7.853527307510376, "reward_std": 1.9899038672447205, "rewards/citation_reward_func": 4.2261903285980225, "rewards/correctness_reward_func": 3.418367326259613, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.6870748152335485, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4980849673350652, "step": 65 }, { "completion_length": 139.36054356892905, "epoch": 0.138996138996139, "grad_norm": 7.162007808685303, "kl": 1.1429036458333333, "learning_rate": 1e-06, "loss": 0.0127, "reward": 4.970387836297353, "reward_std": 2.34166028102239, "rewards/citation_reward_func": 4.421768585840861, "rewards/correctness_reward_func": 0.6802720973889033, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.7653061151504517, "rewards/penalize_wrong_passages_reward_func": -0.36734693869948387, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 66 }, { "completion_length": 161.24829610188803, "epoch": 0.14110214110214112, "grad_norm": 2.989043951034546, "kl": 0.794921875, "learning_rate": 1e-06, "loss": 0.0098, "reward": 6.70848286151886, "reward_std": 2.340596000353495, "rewards/citation_reward_func": 4.149659752845764, "rewards/correctness_reward_func": 3.299319624900818, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.9183673063913981, "rewards/penalize_wrong_passages_reward_func": -0.8231292217969894, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 67 }, { "completion_length": 151.68707275390625, "epoch": 0.1432081432081432, "grad_norm": 2.169832706451416, "kl": 0.8333333333333334, "learning_rate": 1e-06, "loss": 0.0093, "reward": 6.5593845049540205, "reward_std": 2.7000568310419717, "rewards/citation_reward_func": 4.192176739374797, "rewards/correctness_reward_func": 2.2959183057149253, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020383834839, "rewards/penalize_wrong_passages_reward_func": -0.6734693696101507, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4998604704936345, "step": 68 }, { "completion_length": 148.39115142822266, "epoch": 0.14531414531414533, "grad_norm": 0.9319802522659302, "kl": 0.5231119791666666, "learning_rate": 1e-06, "loss": 0.0073, "reward": 7.101214488347371, "reward_std": 2.3352424701054892, "rewards/citation_reward_func": 4.336734652519226, "rewards/correctness_reward_func": 2.7380951245625815, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.768707479039828, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49917339781920117, "step": 69 }, { "completion_length": 157.34693400065103, "epoch": 0.14742014742014742, "grad_norm": 0.8337175250053406, "kl": 0.6435546875, "learning_rate": 1e-06, "loss": 0.0072, "reward": 6.779911518096924, "reward_std": 2.5660134156545005, "rewards/citation_reward_func": 4.234693686167399, "rewards/correctness_reward_func": 2.29591832558314, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020383834839, "rewards/penalize_wrong_passages_reward_func": -0.49659863611062366, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 70 }, { "completion_length": 147.37755076090494, "epoch": 0.14952614952614954, "grad_norm": 1.07455575466156, "kl": 0.5833333333333334, "learning_rate": 1e-06, "loss": 0.0064, "reward": 6.333510239919026, "reward_std": 2.8906733194986978, "rewards/citation_reward_func": 4.18367342154185, "rewards/correctness_reward_func": 2.1598638792832694, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -0.6530612111091614, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5001767973105112, "step": 71 }, { "completion_length": 133.7721061706543, "epoch": 0.15163215163215163, "grad_norm": 0.8392318487167358, "kl": 0.6985677083333334, "learning_rate": 1e-06, "loss": 0.0094, "reward": 7.531945625940959, "reward_std": 1.5154228607813518, "rewards/citation_reward_func": 4.685374021530151, "rewards/correctness_reward_func": 2.58503391345342, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.7346938649813334, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4962312380472819, "step": 72 }, { "completion_length": 140.7721061706543, "epoch": 0.15373815373815375, "grad_norm": 0.9759712219238281, "kl": 0.6940104166666666, "learning_rate": 1e-06, "loss": 0.0081, "reward": 5.715149720509847, "reward_std": 2.087883015473684, "rewards/citation_reward_func": 3.9795918464660645, "rewards/correctness_reward_func": 2.1258502999941506, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -1.2857142488161724, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49746252099672955, "step": 73 }, { "completion_length": 152.61224619547525, "epoch": 0.15584415584415584, "grad_norm": 0.6399812698364258, "kl": 0.5185546875, "learning_rate": 1e-06, "loss": 0.006, "reward": 4.938377459843953, "reward_std": 1.8990906874338787, "rewards/citation_reward_func": 3.962584932645162, "rewards/correctness_reward_func": 1.39455779393514, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -1.061224450667699, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49960197508335114, "step": 74 }, { "completion_length": 144.59183756510416, "epoch": 0.15795015795015796, "grad_norm": 5.202109336853027, "kl": 0.9856770833333334, "learning_rate": 1e-06, "loss": 0.0104, "reward": 5.507989803949992, "reward_std": 2.5388510624567666, "rewards/citation_reward_func": 4.090135931968689, "rewards/correctness_reward_func": 2.3299319048722587, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -1.700680245955785, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4926836242278417, "step": 75 }, { "completion_length": 133.173464457194, "epoch": 0.16005616005616005, "grad_norm": 1.2735567092895508, "kl": 0.7428385416666666, "learning_rate": 1e-06, "loss": 0.009, "reward": 6.55202039082845, "reward_std": 1.8033390442530315, "rewards/citation_reward_func": 4.489795843760173, "rewards/correctness_reward_func": 3.316326459248861, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -2.1020407577355704, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 76 }, { "completion_length": 150.09183502197266, "epoch": 0.16216216216216217, "grad_norm": 0.912190318107605, "kl": 0.71875, "learning_rate": 1e-06, "loss": 0.009, "reward": 6.876408179601033, "reward_std": 2.352629860242208, "rewards/citation_reward_func": 4.379251639048259, "rewards/correctness_reward_func": 2.227891117334366, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.35374149307608604, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49715640644232434, "step": 77 }, { "completion_length": 149.22108205159506, "epoch": 0.16426816426816426, "grad_norm": 0.8740860223770142, "kl": 0.5198567708333334, "learning_rate": 1e-06, "loss": 0.007, "reward": 6.6944014231363935, "reward_std": 2.1339696844418845, "rewards/citation_reward_func": 4.124149521191915, "rewards/correctness_reward_func": 2.636054356892904, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -0.700680265824, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49372102816899616, "step": 78 }, { "completion_length": 176.8197224934896, "epoch": 0.16637416637416638, "grad_norm": 0.850678026676178, "kl": 0.5071614583333334, "learning_rate": 1e-06, "loss": 0.0057, "reward": 5.613517125447591, "reward_std": 3.3167173663775125, "rewards/citation_reward_func": 4.498299201329549, "rewards/correctness_reward_func": 1.173469344774882, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -0.4625850319862366, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4825645883878072, "step": 79 }, { "completion_length": 150.61564127604166, "epoch": 0.16848016848016847, "grad_norm": 0.6305944919586182, "kl": 0.5807291666666666, "learning_rate": 1e-06, "loss": 0.0078, "reward": 7.111183563868205, "reward_std": 2.422848731279373, "rewards/citation_reward_func": 4.685374021530151, "rewards/correctness_reward_func": 2.0068026781082153, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.2040816309551398, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49723803003629047, "step": 80 }, { "completion_length": 140.08843231201172, "epoch": 0.1705861705861706, "grad_norm": 0.7477515935897827, "kl": 0.6803385416666666, "learning_rate": 1e-06, "loss": 0.0093, "reward": 7.217928409576416, "reward_std": 2.468548610806465, "rewards/citation_reward_func": 4.6513603528340655, "rewards/correctness_reward_func": 2.9081632097562156, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020383834839, "rewards/penalize_wrong_passages_reward_func": -1.0816326302786667, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49513939023017883, "step": 81 }, { "completion_length": 136.18026987711588, "epoch": 0.17269217269217269, "grad_norm": 0.8937349319458008, "kl": 0.6920572916666666, "learning_rate": 1e-06, "loss": 0.0086, "reward": 7.065135955810547, "reward_std": 2.2013906836509705, "rewards/citation_reward_func": 4.251700599988301, "rewards/correctness_reward_func": 2.5170067250728607, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408162971337637, "rewards/penalize_wrong_passages_reward_func": -0.48979589839776355, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4903060595194499, "step": 82 }, { "completion_length": 142.0033976236979, "epoch": 0.1747981747981748, "grad_norm": 0.8927910327911377, "kl": 0.658203125, "learning_rate": 1e-06, "loss": 0.0075, "reward": 5.907251795132955, "reward_std": 2.337292790412903, "rewards/citation_reward_func": 3.962584892908732, "rewards/correctness_reward_func": 1.7687074542045593, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.6666666517655054, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4956870178381602, "step": 83 }, { "completion_length": 143.99319458007812, "epoch": 0.1769041769041769, "grad_norm": 1.0283397436141968, "kl": 0.7311197916666666, "learning_rate": 1e-06, "loss": 0.0094, "reward": 6.188986460367839, "reward_std": 2.1218987504641214, "rewards/citation_reward_func": 4.583333253860474, "rewards/correctness_reward_func": 3.265306015809377, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.30612244705359143, "rewards/penalize_wrong_passages_reward_func": -2.353741466999054, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5002108216285706, "step": 84 }, { "completion_length": 143.93197123209634, "epoch": 0.17901017901017902, "grad_norm": 1.2225751876831055, "kl": 1.29296875, "learning_rate": 1e-06, "loss": 0.0138, "reward": 5.960183660189311, "reward_std": 2.4073960979779563, "rewards/citation_reward_func": 4.319727897644043, "rewards/correctness_reward_func": 1.9047618508338928, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.40816325942675274, "rewards/penalize_wrong_passages_reward_func": -0.8571428507566452, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 85 }, { "completion_length": 140.00680033365884, "epoch": 0.1811161811161811, "grad_norm": 1.0447474718093872, "kl": 0.662109375, "learning_rate": 1e-06, "loss": 0.0081, "reward": 7.196578184763591, "reward_std": 1.5716410279273987, "rewards/citation_reward_func": 4.3622448444366455, "rewards/correctness_reward_func": 2.329931862652302, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.49659861996769905, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 86 }, { "completion_length": 154.99319712320963, "epoch": 0.18322218322218323, "grad_norm": 0.9008639454841614, "kl": 0.599609375, "learning_rate": 1e-06, "loss": 0.0078, "reward": 6.203136126200358, "reward_std": 1.8692207137743633, "rewards/citation_reward_func": 4.345238010088603, "rewards/correctness_reward_func": 2.2278911074002585, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -1.265306081622839, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4973536729812622, "step": 87 }, { "completion_length": 138.05101521809897, "epoch": 0.18532818532818532, "grad_norm": 1.0423378944396973, "kl": 0.7408854166666666, "learning_rate": 1e-06, "loss": 0.01, "reward": 7.49479603767395, "reward_std": 1.503724937637647, "rewards/citation_reward_func": 4.625850280125936, "rewards/correctness_reward_func": 4.0136053164800005, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -2.040816222627958, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4981972227493922, "step": 88 }, { "completion_length": 148.11224238077799, "epoch": 0.18743418743418744, "grad_norm": 0.9338856935501099, "kl": 0.6295572916666666, "learning_rate": 1e-06, "loss": 0.0073, "reward": 5.587258418401082, "reward_std": 2.0752708315849304, "rewards/citation_reward_func": 4.685373942057292, "rewards/correctness_reward_func": 1.615646208326022, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.6598639239867528, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497122382124265, "step": 89 }, { "completion_length": 145.36054229736328, "epoch": 0.18954018954018953, "grad_norm": 0.9711011648178101, "kl": 0.6637369791666666, "learning_rate": 1e-06, "loss": 0.0074, "reward": 4.11749655008316, "reward_std": 1.244092543919881, "rewards/citation_reward_func": 4.566326379776001, "rewards/correctness_reward_func": 1.7687074492375057, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -3.217687033737699, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500149592757225, "step": 90 }, { "completion_length": 132.89795684814453, "epoch": 0.19164619164619165, "grad_norm": 0.8163198828697205, "kl": 0.97265625, "learning_rate": 1e-06, "loss": 0.0131, "reward": 8.493574778238932, "reward_std": 1.6131847749153774, "rewards/citation_reward_func": 4.651360511779785, "rewards/correctness_reward_func": 3.9795917669932046, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.408163254459699, "rewards/penalize_wrong_passages_reward_func": -0.7278911570707957, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4986768066883087, "step": 91 }, { "completion_length": 140.93537139892578, "epoch": 0.19375219375219374, "grad_norm": 2.3272147178649902, "kl": 1.0787760416666667, "learning_rate": 1e-06, "loss": 0.012, "reward": 6.0862347682317095, "reward_std": 2.626093844572703, "rewards/citation_reward_func": 4.44727885723114, "rewards/correctness_reward_func": 2.9761903484662375, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -2.231292466322581, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4960985829432805, "step": 92 }, { "completion_length": 166.67346700032553, "epoch": 0.19585819585819586, "grad_norm": 2.9835522174835205, "kl": 1.3411458333333333, "learning_rate": 1e-06, "loss": 0.0145, "reward": 6.7499386469523115, "reward_std": 2.8229238192240396, "rewards/citation_reward_func": 4.18367338180542, "rewards/correctness_reward_func": 2.8911563555399575, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -1.2176870703697205, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4948366681734721, "step": 93 }, { "completion_length": 154.5986353556315, "epoch": 0.19796419796419795, "grad_norm": 1.780610203742981, "kl": 1.1497395833333333, "learning_rate": 1e-06, "loss": 0.0134, "reward": 7.306034167607625, "reward_std": 2.362267851829529, "rewards/citation_reward_func": 4.56632645924886, "rewards/correctness_reward_func": 2.7380951642990112, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.843537408237656, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4982108175754547, "step": 94 }, { "completion_length": 155.43536885579428, "epoch": 0.20007020007020007, "grad_norm": 0.7836578488349915, "kl": 0.6829427083333334, "learning_rate": 1e-06, "loss": 0.0089, "reward": 7.553721110026042, "reward_std": 2.3052654465039573, "rewards/citation_reward_func": 4.532312790552775, "rewards/correctness_reward_func": 2.7040815949440002, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.5306122352679571, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 95 }, { "completion_length": 158.023806254069, "epoch": 0.20217620217620216, "grad_norm": 0.9116750955581665, "kl": 1.1731770833333333, "learning_rate": 1e-06, "loss": 0.0137, "reward": 6.622407933076222, "reward_std": 2.6178742349147797, "rewards/citation_reward_func": 4.574829896291097, "rewards/correctness_reward_func": 2.5510203341643014, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408162971337637, "rewards/penalize_wrong_passages_reward_func": -1.129251668850581, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4999591161807378, "step": 96 }, { "completion_length": 149.49319203694662, "epoch": 0.20428220428220428, "grad_norm": 2.335477590560913, "kl": 0.8600260416666666, "learning_rate": 1e-06, "loss": 0.012, "reward": 7.119860688845317, "reward_std": 1.709948976834615, "rewards/citation_reward_func": 4.676870743433635, "rewards/correctness_reward_func": 3.1292516191800437, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -1.5782312601804733, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49401014546553296, "step": 97 }, { "completion_length": 123.80611928304036, "epoch": 0.20638820638820637, "grad_norm": 0.8113567233085632, "kl": 0.6715494791666666, "learning_rate": 1e-06, "loss": 0.0122, "reward": 9.800319592157999, "reward_std": 0.7947500944137573, "rewards/citation_reward_func": 4.948979536692302, "rewards/correctness_reward_func": 4.455782175064087, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.6054421526690325, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 98 }, { "completion_length": 131.07822799682617, "epoch": 0.2084942084942085, "grad_norm": 0.6562955975532532, "kl": 0.7076822916666666, "learning_rate": 1e-06, "loss": 0.0102, "reward": 6.293738126754761, "reward_std": 1.256690410276254, "rewards/citation_reward_func": 4.838435252507527, "rewards/correctness_reward_func": 1.122448980808258, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408162971337637, "rewards/penalize_wrong_passages_reward_func": -0.4625850170850754, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4995203415552775, "step": 99 }, { "completion_length": 145.0952377319336, "epoch": 0.2106002106002106, "grad_norm": 0.9318126440048218, "kl": 0.6321614583333334, "learning_rate": 1e-06, "loss": 0.007, "reward": 7.648156483968099, "reward_std": 2.704567869504293, "rewards/citation_reward_func": 4.396258354187012, "rewards/correctness_reward_func": 2.9421768188476562, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.537414958079656, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5001972069342931, "step": 100 }, { "completion_length": 158.9659856160482, "epoch": 0.2127062127062127, "grad_norm": 1.0392431020736694, "kl": 0.6048177083333334, "learning_rate": 1e-06, "loss": 0.007, "reward": 6.060523907343547, "reward_std": 2.274144728978475, "rewards/citation_reward_func": 3.9200679461161294, "rewards/correctness_reward_func": 2.1938774983088174, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.9523809204498926, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 101 }, { "completion_length": 128.78570938110352, "epoch": 0.21481221481221482, "grad_norm": 0.9947078227996826, "kl": 0.814453125, "learning_rate": 1e-06, "loss": 0.0089, "reward": 4.414262016614278, "reward_std": 1.7643556793530781, "rewards/citation_reward_func": 3.869047482808431, "rewards/correctness_reward_func": 1.445578212539355, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -1.7959183355172474, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 102 }, { "completion_length": 142.8333307902018, "epoch": 0.21691821691821692, "grad_norm": 0.915675699710846, "kl": 0.7298177083333334, "learning_rate": 1e-06, "loss": 0.0089, "reward": 6.6523605187733965, "reward_std": 2.4479230642318726, "rewards/citation_reward_func": 4.583333253860474, "rewards/correctness_reward_func": 2.142857069770495, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.9727890839179357, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 103 }, { "completion_length": 155.09863789876303, "epoch": 0.21902421902421904, "grad_norm": 0.8193255066871643, "kl": 0.740234375, "learning_rate": 1e-06, "loss": 0.0085, "reward": 6.158255100250244, "reward_std": 2.398227870464325, "rewards/citation_reward_func": 4.285714268684387, "rewards/correctness_reward_func": 1.5816325942675273, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.6054421663284302, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49839108685652417, "step": 104 }, { "completion_length": 171.91156260172525, "epoch": 0.22113022113022113, "grad_norm": 1.1751590967178345, "kl": 0.8697916666666666, "learning_rate": 1e-06, "loss": 0.0091, "reward": 5.9842549959818525, "reward_std": 3.8744190533955893, "rewards/citation_reward_func": 4.0731290976206465, "rewards/correctness_reward_func": 2.840135931968689, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -1.6190475821495056, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4961598018805186, "step": 105 }, { "completion_length": 161.8333307902018, "epoch": 0.22323622323622325, "grad_norm": 0.9442234635353088, "kl": 0.6337890625, "learning_rate": 1e-06, "loss": 0.0079, "reward": 8.165966113408407, "reward_std": 2.3428567250569663, "rewards/citation_reward_func": 4.396258473396301, "rewards/correctness_reward_func": 3.418367306391398, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.5986394534508387, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 106 }, { "completion_length": 147.87074534098306, "epoch": 0.22534222534222534, "grad_norm": 0.8142126202583313, "kl": 0.828125, "learning_rate": 1e-06, "loss": 0.0118, "reward": 8.370047489802042, "reward_std": 1.7788663109143574, "rewards/citation_reward_func": 4.498299201329549, "rewards/correctness_reward_func": 3.7925169467926025, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.870748296380043, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 107 }, { "completion_length": 135.7074826558431, "epoch": 0.22744822744822746, "grad_norm": 0.8248280882835388, "kl": 0.8001302083333334, "learning_rate": 1e-06, "loss": 0.0104, "reward": 5.128768642743428, "reward_std": 2.9163127541542053, "rewards/citation_reward_func": 4.608843406041463, "rewards/correctness_reward_func": 3.1462583939234414, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -3.251700679461161, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49951693415641785, "step": 108 }, { "completion_length": 134.26870473225912, "epoch": 0.22955422955422955, "grad_norm": 1.0382417440414429, "kl": 0.8639322916666666, "learning_rate": 1e-06, "loss": 0.0106, "reward": 6.94229261080424, "reward_std": 2.0236403942108154, "rewards/citation_reward_func": 4.59183669090271, "rewards/correctness_reward_func": 2.482993165651957, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -0.8775510191917419, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5001155783732733, "step": 109 }, { "completion_length": 139.26529947916666, "epoch": 0.23166023166023167, "grad_norm": 1.1215533018112183, "kl": 0.7395833333333334, "learning_rate": 1e-06, "loss": 0.0102, "reward": 7.058823188145955, "reward_std": 1.4295719663302104, "rewards/citation_reward_func": 4.540816307067871, "rewards/correctness_reward_func": 2.6360543767611184, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.068027191484968, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 110 }, { "completion_length": 154.25169626871744, "epoch": 0.23376623376623376, "grad_norm": 0.8044309020042419, "kl": 0.7018229166666666, "learning_rate": 1e-06, "loss": 0.0101, "reward": 6.965285857518514, "reward_std": 1.674074004093806, "rewards/citation_reward_func": 4.4982991218566895, "rewards/correctness_reward_func": 2.1938775181770325, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.625850323587656, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 111 }, { "completion_length": 134.3401336669922, "epoch": 0.23587223587223588, "grad_norm": 0.9982149600982666, "kl": 1.177734375, "learning_rate": 1e-06, "loss": 0.0141, "reward": 8.769040743509928, "reward_std": 1.9494565327962239, "rewards/citation_reward_func": 4.6343536376953125, "rewards/correctness_reward_func": 3.6054420471191406, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.4693877423803012, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49863258997599286, "step": 112 }, { "completion_length": 160.38775380452475, "epoch": 0.23797823797823797, "grad_norm": 1.1707923412322998, "kl": 0.7259114583333334, "learning_rate": 1e-06, "loss": 0.008, "reward": 6.240132649739583, "reward_std": 2.118043899536133, "rewards/citation_reward_func": 4.489795843760173, "rewards/correctness_reward_func": 2.363945484161377, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -1.6122448245684307, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49863598744074505, "step": 113 }, { "completion_length": 148.93197123209634, "epoch": 0.2400842400842401, "grad_norm": 17.552936553955078, "kl": 1.8580729166666667, "learning_rate": 1e-06, "loss": 0.0197, "reward": 7.550319671630859, "reward_std": 1.7853082915147145, "rewards/citation_reward_func": 4.668367147445679, "rewards/correctness_reward_func": 2.1938775231440864, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.31292515868941945, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 114 }, { "completion_length": 124.07482655843098, "epoch": 0.24219024219024218, "grad_norm": 1.9848018884658813, "kl": 1.2858072916666667, "learning_rate": 1e-06, "loss": 0.0149, "reward": 8.497598648071289, "reward_std": 1.8612036903699238, "rewards/citation_reward_func": 4.727891047795613, "rewards/correctness_reward_func": 3.061224420865377, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.29251699708402157, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 115 }, { "completion_length": 138.43196868896484, "epoch": 0.2442962442962443, "grad_norm": 1.1814403533935547, "kl": 2.2493489583333335, "learning_rate": 1e-06, "loss": 0.0241, "reward": 4.523796101411183, "reward_std": 1.643856147925059, "rewards/citation_reward_func": 4.73639444510142, "rewards/correctness_reward_func": 3.4523807565371194, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -4.503401279449463, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4914829283952713, "step": 116 }, { "completion_length": 133.05441919962564, "epoch": 0.2464022464022464, "grad_norm": 11.506072998046875, "kl": 1.5598958333333333, "learning_rate": 1e-06, "loss": 0.0176, "reward": 8.436374107996622, "reward_std": 1.970664918422699, "rewards/citation_reward_func": 4.489795843760173, "rewards/correctness_reward_func": 3.9285713036855063, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.9319727718830109, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 117 }, { "completion_length": 136.46938451131186, "epoch": 0.2485082485082485, "grad_norm": 1.1682838201522827, "kl": 0.8463541666666666, "learning_rate": 1e-06, "loss": 0.0107, "reward": 9.16936723391215, "reward_std": 1.515159587065379, "rewards/citation_reward_func": 4.685374021530151, "rewards/correctness_reward_func": 4.1156461636225385, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.6326530451575915, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 118 }, { "completion_length": 162.7789077758789, "epoch": 0.25061425061425063, "grad_norm": 15.175686836242676, "kl": 2.0638020833333335, "learning_rate": 1e-06, "loss": 0.0226, "reward": 6.813918272654216, "reward_std": 1.9256925384203594, "rewards/citation_reward_func": 4.625850280125936, "rewards/correctness_reward_func": 2.7380951841672263, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -1.442176838715871, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 119 }, { "completion_length": 130.61224365234375, "epoch": 0.2527202527202527, "grad_norm": 0.8811450004577637, "kl": 0.7669270833333334, "learning_rate": 1e-06, "loss": 0.0138, "reward": 9.869921366373697, "reward_std": 1.0036840935548146, "rewards/citation_reward_func": 4.727891127268474, "rewards/correctness_reward_func": 4.506802638371785, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.3129251648982366, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49917339781920117, "step": 120 }, { "completion_length": 143.23129018147787, "epoch": 0.2548262548262548, "grad_norm": 1.374078631401062, "kl": 0.7858072916666666, "learning_rate": 1e-06, "loss": 0.011, "reward": 8.281694094340006, "reward_std": 1.8786971072355907, "rewards/citation_reward_func": 4.625850240389506, "rewards/correctness_reward_func": 3.0442176262537637, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.3333333258827527, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4976802070935567, "step": 121 }, { "completion_length": 166.99319458007812, "epoch": 0.2569322569322569, "grad_norm": 2.271318197250366, "kl": 1.203125, "learning_rate": 1e-06, "loss": 0.0131, "reward": 7.029516935348511, "reward_std": 2.833619177341461, "rewards/citation_reward_func": 4.336734612782796, "rewards/correctness_reward_func": 2.6020407478014627, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122104326883, "rewards/penalize_wrong_passages_reward_func": -0.7551020185152689, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49890469014644623, "step": 122 }, { "completion_length": 164.01700592041016, "epoch": 0.25903825903825906, "grad_norm": 2.9062628746032715, "kl": 2.017578125, "learning_rate": 1e-06, "loss": 0.0208, "reward": 5.047608931859334, "reward_std": 2.909602721532186, "rewards/citation_reward_func": 3.5714284578959146, "rewards/correctness_reward_func": 1.8707482020060222, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -1.238095223903656, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4965883692105611, "step": 123 }, { "completion_length": 166.54761759440103, "epoch": 0.26114426114426115, "grad_norm": 0.6600808501243591, "kl": 0.7701822916666666, "learning_rate": 1e-06, "loss": 0.0102, "reward": 5.248527089754741, "reward_std": 1.834931919972102, "rewards/citation_reward_func": 4.013605276743571, "rewards/correctness_reward_func": 2.602040727933248, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -2.3129250705242157, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4968264698982239, "step": 124 }, { "completion_length": 140.8945515950521, "epoch": 0.26325026325026324, "grad_norm": 23.109085083007812, "kl": 0.9609375, "learning_rate": 1e-06, "loss": 0.0129, "reward": 8.27480951944987, "reward_std": 1.7910826206207275, "rewards/citation_reward_func": 4.49829916159312, "rewards/correctness_reward_func": 3.299319644769033, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.42176870070397854, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 125 }, { "completion_length": 168.1904729207357, "epoch": 0.26535626535626533, "grad_norm": 0.9592843055725098, "kl": 1.1588541666666667, "learning_rate": 1e-06, "loss": 0.0141, "reward": 7.51559845606486, "reward_std": 2.5106443961461387, "rewards/citation_reward_func": 4.532312790552775, "rewards/correctness_reward_func": 3.945578098297119, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -1.6530611912409465, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4968910962343216, "step": 126 }, { "completion_length": 168.1394526163737, "epoch": 0.2674622674622675, "grad_norm": 0.9258236885070801, "kl": 1.9557291666666667, "learning_rate": 1e-06, "loss": 0.022, "reward": 6.473989725112915, "reward_std": 2.391785681247711, "rewards/citation_reward_func": 4.234693805376689, "rewards/correctness_reward_func": 2.653061161438624, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020383834839, "rewards/penalize_wrong_passages_reward_func": -1.1564625725150108, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49779925247033435, "step": 127 }, { "completion_length": 166.78230794270834, "epoch": 0.26956826956826957, "grad_norm": 0.8637123703956604, "kl": 0.7350260416666666, "learning_rate": 1e-06, "loss": 0.0085, "reward": 7.016306241353353, "reward_std": 2.6598212321599326, "rewards/citation_reward_func": 4.345238010088603, "rewards/correctness_reward_func": 2.551020304361979, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.8299319446086884, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 128 }, { "completion_length": 146.89795430501303, "epoch": 0.27167427167427166, "grad_norm": 1.0868693590164185, "kl": 0.8072916666666666, "learning_rate": 1e-06, "loss": 0.0113, "reward": 8.904061237970987, "reward_std": 1.806675414244334, "rewards/citation_reward_func": 4.702380816141765, "rewards/correctness_reward_func": 3.537414868672689, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.28571428172290325, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 129 }, { "completion_length": 131.71428553263345, "epoch": 0.27378027378027375, "grad_norm": 0.6997035145759583, "kl": 0.8606770833333334, "learning_rate": 1e-06, "loss": 0.0136, "reward": 8.30031935373942, "reward_std": 1.2167588621377945, "rewards/citation_reward_func": 4.761904716491699, "rewards/correctness_reward_func": 3.1972788075606027, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.4897959188868602, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 130 }, { "completion_length": 173.03401056925455, "epoch": 0.2758862758862759, "grad_norm": 2.1735517978668213, "kl": 0.9928385416666666, "learning_rate": 1e-06, "loss": 0.0111, "reward": 5.81254776318868, "reward_std": 2.344173808892568, "rewards/citation_reward_func": 4.47278904914856, "rewards/correctness_reward_func": 1.6666665971279144, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -1.1700680057207744, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49622102578481037, "step": 131 }, { "completion_length": 183.0442148844401, "epoch": 0.277992277992278, "grad_norm": 0.7319064140319824, "kl": 0.7174479166666666, "learning_rate": 1e-06, "loss": 0.0099, "reward": 7.494925260543823, "reward_std": 2.043013642231623, "rewards/citation_reward_func": 4.693877458572388, "rewards/correctness_reward_func": 2.9251699844996133, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408162971337637, "rewards/penalize_wrong_passages_reward_func": -0.9183673312266668, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49832646548748016, "step": 132 }, { "completion_length": 169.08843485514322, "epoch": 0.2800982800982801, "grad_norm": 9.555499076843262, "kl": 1.4869791666666667, "learning_rate": 1e-06, "loss": 0.0184, "reward": 8.021408240000406, "reward_std": 1.6322008272012074, "rewards/citation_reward_func": 4.625850280125936, "rewards/correctness_reward_func": 3.3503400087356567, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.9047619005044302, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 133 }, { "completion_length": 181.9047597249349, "epoch": 0.28220428220428223, "grad_norm": 6.901526927947998, "kl": 0.9251302083333334, "learning_rate": 1e-06, "loss": 0.0095, "reward": 2.1488198041915894, "reward_std": 2.6775263945261636, "rewards/citation_reward_func": 4.277210791905721, "rewards/correctness_reward_func": 2.1768707036972046, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -5.251700500647227, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4974591185649236, "step": 134 }, { "completion_length": 144.98299153645834, "epoch": 0.2843102843102843, "grad_norm": 0.97262042760849, "kl": 0.7350260416666666, "learning_rate": 1e-06, "loss": 0.0096, "reward": 6.09113605817159, "reward_std": 0.9187750220298767, "rewards/citation_reward_func": 4.804421742757161, "rewards/correctness_reward_func": 0.40816325942675274, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.12244897956649463, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 135 }, { "completion_length": 148.2789077758789, "epoch": 0.2864162864162864, "grad_norm": 0.9249170422554016, "kl": 1.5032552083333333, "learning_rate": 1e-06, "loss": 0.0164, "reward": 7.3938571612040205, "reward_std": 1.8515236973762512, "rewards/citation_reward_func": 4.583333253860474, "rewards/correctness_reward_func": 2.1258502875765166, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.2653061170130968, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 136 }, { "completion_length": 179.51360066731772, "epoch": 0.2885222885222885, "grad_norm": 1.422672986984253, "kl": 0.94921875, "learning_rate": 1e-06, "loss": 0.0102, "reward": 2.7681804299354553, "reward_std": 2.8826077977816262, "rewards/citation_reward_func": 3.4778911074002585, "rewards/correctness_reward_func": 2.2619047264258065, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.3571428507566452, "rewards/penalize_wrong_passages_reward_func": -3.428571422894796, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4858672966559728, "step": 137 }, { "completion_length": 181.78911336263022, "epoch": 0.29062829062829065, "grad_norm": 0.8672325611114502, "kl": 1.7734375, "learning_rate": 1e-06, "loss": 0.0195, "reward": 1.4140309020876884, "reward_std": 1.965984453757604, "rewards/citation_reward_func": 3.5884352922439575, "rewards/correctness_reward_func": 2.1258503049612045, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.30612244705359143, "rewards/penalize_wrong_passages_reward_func": -4.965986222028732, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.47185367345809937, "step": 138 }, { "completion_length": 174.54421361287436, "epoch": 0.29273429273429274, "grad_norm": 103.85633850097656, "kl": 8.597005208333334, "learning_rate": 1e-06, "loss": 0.0873, "reward": 6.572428623835246, "reward_std": 1.6893009940783184, "rewards/citation_reward_func": 4.676870663960774, "rewards/correctness_reward_func": 3.7585033178329468, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -2.8639455369363227, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 139 }, { "completion_length": 142.275510152181, "epoch": 0.29484029484029484, "grad_norm": 0.7605285048484802, "kl": 0.7317708333333334, "learning_rate": 1e-06, "loss": 0.0092, "reward": 8.055418491363525, "reward_std": 1.8336223363876343, "rewards/citation_reward_func": 4.795918226242065, "rewards/correctness_reward_func": 3.333333214124044, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.0204081137975056, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 140 }, { "completion_length": 167.7551015218099, "epoch": 0.29694629694629693, "grad_norm": 0.8190628290176392, "kl": 0.7721354166666666, "learning_rate": 1e-06, "loss": 0.0094, "reward": 5.228380918502808, "reward_std": 2.086130917072296, "rewards/citation_reward_func": 4.489795843760173, "rewards/correctness_reward_func": 2.1088434855143228, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.3571428606907527, "rewards/penalize_wrong_passages_reward_func": -1.9999999403953552, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4902856449286143, "step": 141 }, { "completion_length": 152.649658203125, "epoch": 0.2990522990522991, "grad_norm": 0.787834882736206, "kl": 0.6796875, "learning_rate": 1e-06, "loss": 0.0086, "reward": 6.84282660484314, "reward_std": 2.0990612308184304, "rewards/citation_reward_func": 4.6343536376953125, "rewards/correctness_reward_func": 1.9557822346687317, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.2551020383834839, "rewards/penalize_wrong_passages_reward_func": -0.48299319048722583, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4958876967430115, "step": 142 }, { "completion_length": 142.88095092773438, "epoch": 0.30115830115830117, "grad_norm": 0.9918654561042786, "kl": 0.6673177083333334, "learning_rate": 1e-06, "loss": 0.0074, "reward": 7.225874503453572, "reward_std": 2.072286307811737, "rewards/citation_reward_func": 4.642857074737549, "rewards/correctness_reward_func": 2.074829876422882, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.48979591329892475, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4979829291502635, "step": 143 }, { "completion_length": 149.2278849283854, "epoch": 0.30326430326430326, "grad_norm": 91.97071838378906, "kl": 4.718098958333333, "learning_rate": 1e-06, "loss": 0.0481, "reward": 5.090360681215922, "reward_std": 2.203482369581858, "rewards/citation_reward_func": 4.03911554813385, "rewards/correctness_reward_func": 1.0884353493650754, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.459183673063914, "rewards/penalize_wrong_passages_reward_func": -0.5782312800486883, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5002244164546331, "step": 144 }, { "completion_length": 147.08503214518228, "epoch": 0.30537030537030535, "grad_norm": 2.862992525100708, "kl": 3.07421875, "learning_rate": 1e-06, "loss": 0.0321, "reward": 2.5679593483606973, "reward_std": 2.9004951119422913, "rewards/citation_reward_func": 4.217686931292216, "rewards/correctness_reward_func": 0.5272108738621076, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -2.2448979218800864, "rewards/penalize_wrong_passages_reward_func": -0.9319727768500646, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4999319016933441, "step": 145 }, { "completion_length": 135.28570938110352, "epoch": 0.3074763074763075, "grad_norm": 0.7194350361824036, "kl": 0.7213541666666666, "learning_rate": 1e-06, "loss": 0.0112, "reward": 8.593265215555826, "reward_std": 1.1670270164807637, "rewards/citation_reward_func": 4.804421663284302, "rewards/correctness_reward_func": 3.673469305038452, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.8843536997834841, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4997278203566869, "step": 146 }, { "completion_length": 153.82992808024088, "epoch": 0.3095823095823096, "grad_norm": 0.6770069003105164, "kl": 0.6171875, "learning_rate": 1e-06, "loss": 0.0086, "reward": 6.921067953109741, "reward_std": 1.4704915285110474, "rewards/citation_reward_func": 4.753401279449463, "rewards/correctness_reward_func": 2.2619046941399574, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -1.0952380833526452, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 147 }, { "completion_length": 158.23129018147787, "epoch": 0.3116883116883117, "grad_norm": 1.632519006729126, "kl": 0.8046875, "learning_rate": 1e-06, "loss": 0.0091, "reward": 6.650659879048665, "reward_std": 2.724838455518087, "rewards/citation_reward_func": 4.6598639488220215, "rewards/correctness_reward_func": 3.707482894261678, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -2.462584992249807, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 148 }, { "completion_length": 159.6292495727539, "epoch": 0.3137943137943138, "grad_norm": 0.969252347946167, "kl": 0.7467447916666666, "learning_rate": 1e-06, "loss": 0.0113, "reward": 8.891149520874023, "reward_std": 1.7988839149475098, "rewards/citation_reward_func": 4.5153060754140215, "rewards/correctness_reward_func": 3.8605440855026245, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.48299319793780643, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4982924511035283, "step": 149 }, { "completion_length": 156.11564127604166, "epoch": 0.3159003159003159, "grad_norm": 1.4659888744354248, "kl": 1.2708333333333333, "learning_rate": 1e-06, "loss": 0.0148, "reward": 6.316040953000386, "reward_std": 3.3565998673439026, "rewards/citation_reward_func": 4.098639369010925, "rewards/correctness_reward_func": 2.7040815552075705, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -0.8299319446086884, "rewards/unicode_reward_func": -0.3401360511779785, "rewards/xmlcount_reward_func": 0.48951015373071033, "step": 150 }, { "completion_length": 139.925168355306, "epoch": 0.318006318006318, "grad_norm": 10.20754337310791, "kl": 1.380859375, "learning_rate": 1e-06, "loss": 0.0168, "reward": 8.006098747253418, "reward_std": 1.9967832962671916, "rewards/citation_reward_func": 4.668367226918538, "rewards/correctness_reward_func": 3.3333332737286887, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.7891156375408173, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 151 }, { "completion_length": 150.03061040242514, "epoch": 0.3201123201123201, "grad_norm": 0.7859293818473816, "kl": 0.6477864583333334, "learning_rate": 1e-06, "loss": 0.0091, "reward": 6.6795713901519775, "reward_std": 1.4083468516667683, "rewards/citation_reward_func": 4.702380895614624, "rewards/correctness_reward_func": 2.9761904080708823, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -1.99999996026357, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 152 }, { "completion_length": 139.53741073608398, "epoch": 0.3222183222183222, "grad_norm": 1.0564855337142944, "kl": 0.6477864583333334, "learning_rate": 1e-06, "loss": 0.0074, "reward": 7.244768857955933, "reward_std": 2.8033066193262735, "rewards/citation_reward_func": 4.7534011999766035, "rewards/correctness_reward_func": 2.4319727420806885, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.8367346984644731, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49816999832789105, "step": 153 }, { "completion_length": 130.1088409423828, "epoch": 0.32432432432432434, "grad_norm": 0.7908992767333984, "kl": 0.8040364583333334, "learning_rate": 1e-06, "loss": 0.0104, "reward": 6.212700764338176, "reward_std": 1.2905257244904835, "rewards/citation_reward_func": 4.566326379776001, "rewards/correctness_reward_func": 3.214285676678022, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -2.517006744941076, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5001155783732733, "step": 154 }, { "completion_length": 134.5067990620931, "epoch": 0.32643032643032643, "grad_norm": 1.2375706434249878, "kl": 0.9283854166666666, "learning_rate": 1e-06, "loss": 0.0102, "reward": 5.958527167638143, "reward_std": 2.26580548286438, "rewards/citation_reward_func": 4.0391156276067095, "rewards/correctness_reward_func": 3.5714284976323447, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -2.6462584137916565, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4942414363225301, "step": 155 }, { "completion_length": 129.54421361287436, "epoch": 0.3285363285363285, "grad_norm": 0.7165351510047913, "kl": 0.9837239583333334, "learning_rate": 1e-06, "loss": 0.0114, "reward": 6.3339049021403, "reward_std": 1.5690179268519084, "rewards/citation_reward_func": 4.175169944763184, "rewards/correctness_reward_func": 3.89455775419871, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -2.7346938153107962, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49887068569660187, "step": 156 }, { "completion_length": 141.46258544921875, "epoch": 0.3306423306423306, "grad_norm": 0.8587003946304321, "kl": 0.8118489583333334, "learning_rate": 1e-06, "loss": 0.0113, "reward": 8.448506752649942, "reward_std": 1.4765484134356182, "rewards/citation_reward_func": 4.659863789876302, "rewards/correctness_reward_func": 3.622448901335398, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.7823129097620646, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49952714145183563, "step": 157 }, { "completion_length": 130.8979581197103, "epoch": 0.33274833274833276, "grad_norm": 0.7178787589073181, "kl": 0.8391927083333334, "learning_rate": 1e-06, "loss": 0.0123, "reward": 7.7033811410268145, "reward_std": 0.9533064067363739, "rewards/citation_reward_func": 4.889455636342366, "rewards/correctness_reward_func": 2.5680271685123444, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.7551020495593548, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 158 }, { "completion_length": 160.54081217447916, "epoch": 0.33485433485433486, "grad_norm": 0.7822726964950562, "kl": 0.6728515625, "learning_rate": 1e-06, "loss": 0.008, "reward": 5.943180481592814, "reward_std": 2.2619903087615967, "rewards/citation_reward_func": 4.4897957642873125, "rewards/correctness_reward_func": 3.061224381128947, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -2.605442168811957, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4976019710302353, "step": 159 }, { "completion_length": 168.82312520345053, "epoch": 0.33696033696033695, "grad_norm": 1.056015968322754, "kl": 0.6139322916666666, "learning_rate": 1e-06, "loss": 0.0065, "reward": 2.2140272160371146, "reward_std": 2.2160128553708396, "rewards/citation_reward_func": 4.540816148122151, "rewards/correctness_reward_func": 2.7380951642990112, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -6.061224301656087, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.496340071161588, "step": 160 }, { "completion_length": 139.75170135498047, "epoch": 0.33906633906633904, "grad_norm": 1.41465163230896, "kl": 0.9140625, "learning_rate": 1e-06, "loss": 0.0111, "reward": 7.655176599820455, "reward_std": 1.8789227455854416, "rewards/citation_reward_func": 4.685374021530151, "rewards/correctness_reward_func": 3.435374101003011, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -1.3605441848436992, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4970135341087977, "step": 161 }, { "completion_length": 142.07142639160156, "epoch": 0.3411723411723412, "grad_norm": 0.6721370220184326, "kl": 0.8606770833333334, "learning_rate": 1e-06, "loss": 0.0116, "reward": 6.773833195368449, "reward_std": 1.1285482396682103, "rewards/citation_reward_func": 4.7704081535339355, "rewards/correctness_reward_func": 3.639455646276474, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -2.632652991140882, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4966223786274592, "step": 162 }, { "completion_length": 163.81631978352866, "epoch": 0.3432783432783433, "grad_norm": 11.225006103515625, "kl": 1.923828125, "learning_rate": 1e-06, "loss": 0.0198, "reward": 5.977557897567749, "reward_std": 2.643218537171682, "rewards/citation_reward_func": 4.404761910438538, "rewards/correctness_reward_func": 1.3095237612724304, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -0.42857142103215057, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4979659269253413, "step": 163 }, { "completion_length": 178.6190414428711, "epoch": 0.34538434538434537, "grad_norm": 1.1446784734725952, "kl": 0.9283854166666666, "learning_rate": 1e-06, "loss": 0.0095, "reward": 5.919367551803589, "reward_std": 2.9474711418151855, "rewards/citation_reward_func": 4.183673461278279, "rewards/correctness_reward_func": 1.5986394087473552, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.6598639339208603, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 164 }, { "completion_length": 167.16326395670572, "epoch": 0.3474903474903475, "grad_norm": 1.335066318511963, "kl": 0.7757161458333334, "learning_rate": 1e-06, "loss": 0.0083, "reward": 6.390279054641724, "reward_std": 2.944413344065348, "rewards/citation_reward_func": 4.642856995264689, "rewards/correctness_reward_func": 1.666666607062022, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.7142857015132904, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49912238121032715, "step": 165 }, { "completion_length": 183.30271657307944, "epoch": 0.3495963495963496, "grad_norm": 2.683114767074585, "kl": 1.072265625, "learning_rate": 1e-06, "loss": 0.012, "reward": 4.426462570826213, "reward_std": 2.4770036339759827, "rewards/citation_reward_func": 4.285714228947957, "rewards/correctness_reward_func": 1.8367346326510112, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.40816326439380646, "rewards/penalize_wrong_passages_reward_func": -2.2857142289479575, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49789108832677204, "step": 166 }, { "completion_length": 171.02040354410806, "epoch": 0.3517023517023517, "grad_norm": 1.2051540613174438, "kl": 1.9908854166666667, "learning_rate": 1e-06, "loss": 0.0204, "reward": 3.414265056451162, "reward_std": 2.233205517133077, "rewards/citation_reward_func": 4.311224381128947, "rewards/correctness_reward_func": 0.7653060927987099, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.45918366809686023, "rewards/penalize_wrong_passages_reward_func": -2.204081585009893, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 167 }, { "completion_length": 179.71087900797525, "epoch": 0.3538083538083538, "grad_norm": 1.0430907011032104, "kl": 0.7047526041666666, "learning_rate": 1e-06, "loss": 0.0077, "reward": 3.5896018147468567, "reward_std": 2.7991716265678406, "rewards/citation_reward_func": 4.192176858584086, "rewards/correctness_reward_func": 0.8163264989852905, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.3571428507566452, "rewards/penalize_wrong_passages_reward_func": -1.721088429292043, "rewards/unicode_reward_func": -0.3401360511779785, "rewards/xmlcount_reward_func": 0.4994659175475438, "step": 168 }, { "completion_length": 180.67686462402344, "epoch": 0.35591435591435594, "grad_norm": 0.9326726198196411, "kl": 0.4820963541666667, "learning_rate": 1e-06, "loss": 0.006, "reward": 2.6223334272702536, "reward_std": 3.0509551763534546, "rewards/citation_reward_func": 4.370748281478882, "rewards/correctness_reward_func": 0.2551020284493764, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.6632653027772903, "rewards/penalize_wrong_passages_reward_func": -2.170067938665549, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4998842825492223, "step": 169 }, { "completion_length": 153.03741200764975, "epoch": 0.35802035802035803, "grad_norm": 5.393792629241943, "kl": 1.2434895833333333, "learning_rate": 1e-06, "loss": 0.0135, "reward": 6.667666673660278, "reward_std": 2.0974193811416626, "rewards/citation_reward_func": 4.557823101679484, "rewards/correctness_reward_func": 1.9217686504125595, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122104326883, "rewards/penalize_wrong_passages_reward_func": -0.6598639289538065, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 170 }, { "completion_length": 155.34353510538736, "epoch": 0.3601263601263601, "grad_norm": 1.1496800184249878, "kl": 0.7903645833333334, "learning_rate": 1e-06, "loss": 0.0093, "reward": 5.24929936726888, "reward_std": 1.6736542582511902, "rewards/citation_reward_func": 4.608843485514323, "rewards/correctness_reward_func": 2.534013517200947, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -2.8435374101003013, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 171 }, { "completion_length": 138.33332951863608, "epoch": 0.3622323622323622, "grad_norm": 1.926382303237915, "kl": 1.1145833333333333, "learning_rate": 1e-06, "loss": 0.0128, "reward": 7.129585027694702, "reward_std": 1.6755299766858418, "rewards/citation_reward_func": 4.506802598635356, "rewards/correctness_reward_func": 2.499999930461248, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.8775510142246882, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5003332595030466, "step": 172 }, { "completion_length": 168.19727579752603, "epoch": 0.36433836433836436, "grad_norm": 2.8209266662597656, "kl": 0.8522135416666666, "learning_rate": 1e-06, "loss": 0.0092, "reward": 6.785959243774414, "reward_std": 2.833313842614492, "rewards/citation_reward_func": 4.30272098382314, "rewards/correctness_reward_func": 2.2278911074002585, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.6394557654857635, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49854415158430737, "step": 173 }, { "completion_length": 174.9455769856771, "epoch": 0.36644436644436645, "grad_norm": 1.1766891479492188, "kl": 0.8248697916666666, "learning_rate": 1e-06, "loss": 0.0087, "reward": 3.729578137397766, "reward_std": 2.6636635859807334, "rewards/citation_reward_func": 3.988095204035441, "rewards/correctness_reward_func": 2.176870663960775, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -3.278911530971527, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4965849667787552, "step": 174 }, { "completion_length": 185.70067342122397, "epoch": 0.36855036855036855, "grad_norm": 0.9315419793128967, "kl": 0.59375, "learning_rate": 1e-06, "loss": 0.0069, "reward": 2.453159729639689, "reward_std": 1.8253816564877827, "rewards/citation_reward_func": 4.498299280802409, "rewards/correctness_reward_func": 1.1564625352621078, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -4.081632614135742, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48207135995229083, "step": 175 }, { "completion_length": 195.724484761556, "epoch": 0.37065637065637064, "grad_norm": 1.8000379800796509, "kl": 0.7600911458333334, "learning_rate": 1e-06, "loss": 0.0083, "reward": 1.4989728132883708, "reward_std": 3.75508181254069, "rewards/citation_reward_func": 3.7329931259155273, "rewards/correctness_reward_func": 1.9387754102547963, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.30612244705359143, "rewards/penalize_wrong_passages_reward_func": -4.659863829612732, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4666598041852315, "step": 176 }, { "completion_length": 153.51360066731772, "epoch": 0.3727623727623728, "grad_norm": 1.1415152549743652, "kl": 0.765625, "learning_rate": 1e-06, "loss": 0.0092, "reward": 5.929571429888408, "reward_std": 2.2657308280467987, "rewards/citation_reward_func": 4.59183669090271, "rewards/correctness_reward_func": 2.772108773390452, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -2.3333332737286887, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 177 }, { "completion_length": 155.71088155110678, "epoch": 0.3748683748683749, "grad_norm": 0.7995659708976746, "kl": 0.7845052083333334, "learning_rate": 1e-06, "loss": 0.0102, "reward": 4.895190397898356, "reward_std": 1.4263835549354553, "rewards/citation_reward_func": 4.583333253860474, "rewards/correctness_reward_func": 0.5782312626640002, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -1.1564625749985378, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49553055067857105, "step": 178 }, { "completion_length": 193.20067596435547, "epoch": 0.37697437697437697, "grad_norm": 0.8942872881889343, "kl": 0.611328125, "learning_rate": 1e-06, "loss": 0.0072, "reward": 3.979758540789286, "reward_std": 2.5980204939842224, "rewards/citation_reward_func": 3.9540814558664956, "rewards/correctness_reward_func": 0.5612244804700216, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -1.1700680057207744, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4950645665327708, "step": 179 }, { "completion_length": 183.89115142822266, "epoch": 0.37908037908037906, "grad_norm": 0.9261636734008789, "kl": 0.5338541666666666, "learning_rate": 1e-06, "loss": 0.006, "reward": 5.109836935997009, "reward_std": 2.6890705625216165, "rewards/citation_reward_func": 4.285714149475098, "rewards/correctness_reward_func": 0.7823129172126452, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.40816326439380646, "rewards/penalize_wrong_passages_reward_func": -0.5442176734407743, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 180 }, { "completion_length": 175.2687021891276, "epoch": 0.3811863811863812, "grad_norm": 3.168766736984253, "kl": 1.28515625, "learning_rate": 1e-06, "loss": 0.0135, "reward": 3.4397687713305154, "reward_std": 5.042555729548137, "rewards/citation_reward_func": 4.336734612782796, "rewards/correctness_reward_func": 0.9863945295413336, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -2.1428570598363876, "rewards/penalize_wrong_passages_reward_func": -0.7346938649813334, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 181 }, { "completion_length": 131.99999618530273, "epoch": 0.3832923832923833, "grad_norm": 0.970337986946106, "kl": 0.8287760416666666, "learning_rate": 1e-06, "loss": 0.0125, "reward": 9.27480936050415, "reward_std": 1.602648675441742, "rewards/citation_reward_func": 4.821428537368774, "rewards/correctness_reward_func": 3.8775508801142373, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.27210883299509686, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 182 }, { "completion_length": 145.39455540974936, "epoch": 0.3853983853983854, "grad_norm": 0.8163847923278809, "kl": 0.6829427083333334, "learning_rate": 1e-06, "loss": 0.0089, "reward": 6.255153139432271, "reward_std": 1.4988686243693035, "rewards/citation_reward_func": 4.319727738698323, "rewards/correctness_reward_func": 2.091836671034495, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -1.1564625625809033, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5000509520371755, "step": 183 }, { "completion_length": 154.55782063802084, "epoch": 0.3875043875043875, "grad_norm": 0.6219596862792969, "kl": 0.6966145833333334, "learning_rate": 1e-06, "loss": 0.0103, "reward": 7.526914755503337, "reward_std": 1.4331722855567932, "rewards/citation_reward_func": 4.370748162269592, "rewards/correctness_reward_func": 2.7551020085811615, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.5442176690946022, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4980033338069916, "step": 184 }, { "completion_length": 168.3945566813151, "epoch": 0.38961038961038963, "grad_norm": 0.7838864922523499, "kl": 0.6295572916666666, "learning_rate": 1e-06, "loss": 0.0084, "reward": 6.258248249689738, "reward_std": 2.3601081371307373, "rewards/citation_reward_func": 4.421768665313721, "rewards/correctness_reward_func": 1.8877550562222798, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.30612244705359143, "rewards/penalize_wrong_passages_reward_func": -0.7414965877930323, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4980441480875015, "step": 185 }, { "completion_length": 186.4591827392578, "epoch": 0.3917163917163917, "grad_norm": 0.7081553339958191, "kl": 0.5416666666666666, "learning_rate": 1e-06, "loss": 0.0066, "reward": 6.1795713901519775, "reward_std": 2.27645073334376, "rewards/citation_reward_func": 4.362244725227356, "rewards/correctness_reward_func": 1.5986394087473552, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.6802720955262581, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 186 }, { "completion_length": 186.2891108194987, "epoch": 0.3938223938223938, "grad_norm": 0.9178943037986755, "kl": 0.6373697916666666, "learning_rate": 1e-06, "loss": 0.0077, "reward": 6.135353724161784, "reward_std": 1.6183502872784932, "rewards/citation_reward_func": 4.430272022883098, "rewards/correctness_reward_func": 1.3435373802979786, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.6394557605187098, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 187 }, { "completion_length": 157.90816497802734, "epoch": 0.3959283959283959, "grad_norm": 0.9123345613479614, "kl": 0.740234375, "learning_rate": 1e-06, "loss": 0.0087, "reward": 4.997598648071289, "reward_std": 1.726643015940984, "rewards/citation_reward_func": 4.268707315127055, "rewards/correctness_reward_func": 3.6394556363423667, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -3.8095237016677856, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 188 }, { "completion_length": 184.80271657307944, "epoch": 0.39803439803439805, "grad_norm": 0.759057343006134, "kl": 0.6204427083333334, "learning_rate": 1e-06, "loss": 0.0076, "reward": 4.643901387850444, "reward_std": 2.2857230405012765, "rewards/citation_reward_func": 3.9710883696873984, "rewards/correctness_reward_func": 0.7312925085425377, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.7278911471366882, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4959421157836914, "step": 189 }, { "completion_length": 177.35033671061197, "epoch": 0.40014040014040014, "grad_norm": 1.190181016921997, "kl": 0.6712239583333334, "learning_rate": 1e-06, "loss": 0.0086, "reward": 4.23569397131602, "reward_std": 1.947903613249461, "rewards/citation_reward_func": 4.234693805376689, "rewards/correctness_reward_func": 0.6292516887187958, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.8673469324906667, "rewards/penalize_wrong_passages_reward_func": -0.7619047512610754, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 190 }, { "completion_length": 186.51020050048828, "epoch": 0.40224640224640223, "grad_norm": 0.8846628069877625, "kl": 0.6009114583333334, "learning_rate": 1e-06, "loss": 0.0069, "reward": 3.0384148756663003, "reward_std": 3.5275836189587912, "rewards/citation_reward_func": 3.724489768346151, "rewards/correctness_reward_func": 1.6666666318972905, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -2.0408162673314414, "rewards/penalize_wrong_passages_reward_func": -1.3129251599311829, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 191 }, { "completion_length": 162.79251352945963, "epoch": 0.4043524043524043, "grad_norm": 0.596415638923645, "kl": 1.59765625, "learning_rate": 1e-06, "loss": 0.0191, "reward": 8.544118881225586, "reward_std": 1.6781288782755535, "rewards/citation_reward_func": 4.642857074737549, "rewards/correctness_reward_func": 3.452380895614624, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.49659862741827965, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4964999357859294, "step": 192 }, { "completion_length": 150.53061167399088, "epoch": 0.4064584064584065, "grad_norm": 0.6676158308982849, "kl": 0.748046875, "learning_rate": 1e-06, "loss": 0.0103, "reward": 8.596237738927206, "reward_std": 1.2864967584609985, "rewards/citation_reward_func": 4.9319727420806885, "rewards/correctness_reward_func": 3.0782312154769897, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.41496598223845166, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 193 }, { "completion_length": 166.77210489908853, "epoch": 0.40856440856440857, "grad_norm": 0.808198094367981, "kl": 0.8684895833333334, "learning_rate": 1e-06, "loss": 0.0097, "reward": 5.753748019536336, "reward_std": 2.120794693628947, "rewards/citation_reward_func": 4.2772107521692915, "rewards/correctness_reward_func": 1.275510181983312, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.6462584882974625, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5003468642632166, "step": 194 }, { "completion_length": 165.4149627685547, "epoch": 0.41067041067041066, "grad_norm": 0.9030170440673828, "kl": 4.500651041666667, "learning_rate": 1e-06, "loss": 0.0464, "reward": 5.399826486905416, "reward_std": 2.171577036380768, "rewards/citation_reward_func": 4.387755036354065, "rewards/correctness_reward_func": 0.7142856965462366, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -0.44217686417202157, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49676524599393207, "step": 195 }, { "completion_length": 186.23129018147787, "epoch": 0.41277641277641275, "grad_norm": 0.6888033151626587, "kl": 0.78515625, "learning_rate": 1e-06, "loss": 0.0109, "reward": 7.719346761703491, "reward_std": 1.9923105090856552, "rewards/citation_reward_func": 4.32823113600413, "rewards/correctness_reward_func": 3.163265287876129, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.6122449040412903, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4965577671925227, "step": 196 }, { "completion_length": 175.53741200764975, "epoch": 0.4148824148824149, "grad_norm": 0.8608536124229431, "kl": 1.0234375, "learning_rate": 1e-06, "loss": 0.0122, "reward": 6.424663424491882, "reward_std": 2.1536823511123657, "rewards/citation_reward_func": 4.727891127268474, "rewards/correctness_reward_func": 1.751700686911742, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.7278911527246237, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49779245257377625, "step": 197 }, { "completion_length": 162.51700337727866, "epoch": 0.416988416988417, "grad_norm": 0.6589003205299377, "kl": 0.8854166666666666, "learning_rate": 1e-06, "loss": 0.0109, "reward": 7.978091875712077, "reward_std": 1.7386022607485454, "rewards/citation_reward_func": 4.455782214800517, "rewards/correctness_reward_func": 3.1292516787846885, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.6054421712954839, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4984999398390452, "step": 198 }, { "completion_length": 149.35033416748047, "epoch": 0.4190944190944191, "grad_norm": 0.5710175633430481, "kl": 0.8072916666666666, "learning_rate": 1e-06, "loss": 0.0111, "reward": 6.779911359151204, "reward_std": 0.9170621037483215, "rewards/citation_reward_func": 4.76190463701884, "rewards/correctness_reward_func": 1.751700649658839, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.7346938600142797, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 199 }, { "completion_length": 156.374148050944, "epoch": 0.4212004212004212, "grad_norm": 0.8978866338729858, "kl": 0.7766927083333334, "learning_rate": 1e-06, "loss": 0.0128, "reward": 8.68200675646464, "reward_std": 1.6707564791043599, "rewards/citation_reward_func": 4.413265228271484, "rewards/correctness_reward_func": 4.217687010765076, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.8979591851433119, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5000339448451996, "step": 200 }, { "completion_length": 178.60203552246094, "epoch": 0.4233064233064233, "grad_norm": 0.7047922015190125, "kl": 0.83984375, "learning_rate": 1e-06, "loss": 0.0105, "reward": 7.912619272867839, "reward_std": 2.2664144337177277, "rewards/citation_reward_func": 4.515305995941162, "rewards/correctness_reward_func": 3.333333214124044, "rewards/formatting_reward_func": 0.4948979616165161, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.7210884193579356, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49425163865089417, "step": 201 }, { "completion_length": 169.9829889933268, "epoch": 0.4254124254124254, "grad_norm": 2.9857473373413086, "kl": 1.0475260416666667, "learning_rate": 1e-06, "loss": 0.0138, "reward": 7.905761957168579, "reward_std": 1.7536719938119252, "rewards/citation_reward_func": 4.557822942733765, "rewards/correctness_reward_func": 2.78911554813385, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.3401360462109248, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 202 }, { "completion_length": 146.27210489908853, "epoch": 0.4275184275184275, "grad_norm": 0.7594918012619019, "kl": 0.828125, "learning_rate": 1e-06, "loss": 0.0108, "reward": 8.805421829223633, "reward_std": 1.6902470886707306, "rewards/citation_reward_func": 4.753401279449463, "rewards/correctness_reward_func": 3.809523661931356, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.7074829911192259, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 203 }, { "completion_length": 145.32652537027994, "epoch": 0.42962442962442965, "grad_norm": 0.7676737904548645, "kl": 0.9563802083333334, "learning_rate": 1e-06, "loss": 0.0137, "reward": 7.953380982081096, "reward_std": 1.4818553825219472, "rewards/citation_reward_func": 4.370748241742452, "rewards/correctness_reward_func": 3.214285651842753, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.5306122352679571, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 204 }, { "completion_length": 134.48639297485352, "epoch": 0.43173043173043174, "grad_norm": 0.49755561351776123, "kl": 0.9361979166666666, "learning_rate": 1e-06, "loss": 0.0145, "reward": 8.082629362742106, "reward_std": 0.6665602227052053, "rewards/citation_reward_func": 4.872448841730754, "rewards/correctness_reward_func": 2.5170067151387534, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.3061224507788817, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 205 }, { "completion_length": 140.3401336669922, "epoch": 0.43383643383643383, "grad_norm": 1.6603000164031982, "kl": 1.0774739583333333, "learning_rate": 1e-06, "loss": 0.0137, "reward": 7.779064496358235, "reward_std": 2.068660780787468, "rewards/citation_reward_func": 4.625850280125936, "rewards/correctness_reward_func": 2.99319718281428, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.15306122104326883, "rewards/penalize_wrong_passages_reward_func": -0.5102040773878495, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4967516412337621, "step": 206 }, { "completion_length": 164.9047597249349, "epoch": 0.4359424359424359, "grad_norm": 1.059527039527893, "kl": 1.0234375, "learning_rate": 1e-06, "loss": 0.011, "reward": 8.205078125, "reward_std": 2.2538377245267234, "rewards/citation_reward_func": 4.396258393923442, "rewards/correctness_reward_func": 3.3673468232154846, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.5578231140971184, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 207 }, { "completion_length": 136.49319458007812, "epoch": 0.43804843804843807, "grad_norm": 1.1280230283737183, "kl": 0.8450520833333334, "learning_rate": 1e-06, "loss": 0.0115, "reward": 8.873448689778646, "reward_std": 1.6345154742399852, "rewards/citation_reward_func": 4.838435252507527, "rewards/correctness_reward_func": 3.5034013191858926, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.2993197242418925, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 208 }, { "completion_length": 137.9795888264974, "epoch": 0.44015444015444016, "grad_norm": 1.0411254167556763, "kl": 0.7864583333333334, "learning_rate": 1e-06, "loss": 0.0105, "reward": 8.269707520802816, "reward_std": 1.9344552357991536, "rewards/citation_reward_func": 4.710884253184001, "rewards/correctness_reward_func": 3.0952380498250327, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.36734693062802154, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 209 }, { "completion_length": 142.3197224934896, "epoch": 0.44226044226044225, "grad_norm": 1.1389507055282593, "kl": 0.8151041666666666, "learning_rate": 1e-06, "loss": 0.0101, "reward": 8.856441656748453, "reward_std": 1.6541103919347127, "rewards/citation_reward_func": 4.685374101003011, "rewards/correctness_reward_func": 3.8435373306274414, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.6734693869948387, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 210 }, { "completion_length": 168.58163197835287, "epoch": 0.44436644436644435, "grad_norm": 1.2596967220306396, "kl": 0.8326822916666666, "learning_rate": 1e-06, "loss": 0.0084, "reward": 4.704074879487355, "reward_std": 2.8726566632588706, "rewards/citation_reward_func": 4.455782175064087, "rewards/correctness_reward_func": 1.4795917769273121, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -2.061224510272344, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49999313056468964, "step": 211 }, { "completion_length": 161.63945515950522, "epoch": 0.4464724464724465, "grad_norm": 1.7201244831085205, "kl": 0.8919270833333334, "learning_rate": 1e-06, "loss": 0.0102, "reward": 6.776510198911031, "reward_std": 2.4815656542778015, "rewards/citation_reward_func": 4.387754996617635, "rewards/correctness_reward_func": 2.244897877176603, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.6870748282720646, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 212 }, { "completion_length": 158.12584686279297, "epoch": 0.4485784485784486, "grad_norm": 1.5194833278656006, "kl": 1.234375, "learning_rate": 1e-06, "loss": 0.014, "reward": 5.601340293884277, "reward_std": 2.280164202054342, "rewards/citation_reward_func": 4.141156315803528, "rewards/correctness_reward_func": 1.819727823138237, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -1.054421752691269, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 213 }, { "completion_length": 134.87414677937826, "epoch": 0.4506844506844507, "grad_norm": 0.47165757417678833, "kl": 0.8072916666666666, "learning_rate": 1e-06, "loss": 0.0135, "reward": 9.836033980051676, "reward_std": 1.0725230872631073, "rewards/citation_reward_func": 4.7874148686726885, "rewards/correctness_reward_func": 4.642857074737549, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.5442176771660646, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 214 }, { "completion_length": 157.78911336263022, "epoch": 0.45279045279045277, "grad_norm": 1.5893391370773315, "kl": 0.89453125, "learning_rate": 1e-06, "loss": 0.0097, "reward": 6.057105541229248, "reward_std": 2.7033910751342773, "rewards/citation_reward_func": 4.260203997294108, "rewards/correctness_reward_func": 1.6326530079046886, "rewards/formatting_reward_func": 0.4914965977271398, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -0.4625850220521291, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4924795279900233, "step": 215 }, { "completion_length": 195.2993138631185, "epoch": 0.4548964548964549, "grad_norm": 1.0235472917556763, "kl": 0.7076822916666666, "learning_rate": 1e-06, "loss": 0.0075, "reward": 4.3785510659217834, "reward_std": 1.9232422510782878, "rewards/citation_reward_func": 4.370748162269592, "rewards/correctness_reward_func": 1.9387754499912262, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -2.9319726886848607, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 216 }, { "completion_length": 153.5544179280599, "epoch": 0.457002457002457, "grad_norm": 1.342038631439209, "kl": 0.9921875, "learning_rate": 1e-06, "loss": 0.011, "reward": 7.819027264912923, "reward_std": 2.2121856609980264, "rewards/citation_reward_func": 4.532312790552775, "rewards/correctness_reward_func": 3.061224400997162, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.6734693758189678, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 217 }, { "completion_length": 149.75170135498047, "epoch": 0.4591084591084591, "grad_norm": 1.5824962854385376, "kl": 0.9694010416666666, "learning_rate": 1e-06, "loss": 0.0102, "reward": 6.435758511225383, "reward_std": 2.4908171892166138, "rewards/citation_reward_func": 4.285714228947957, "rewards/correctness_reward_func": 1.9897958636283875, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.6870748202006022, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5003842860460281, "step": 218 }, { "completion_length": 169.7448959350586, "epoch": 0.4612144612144612, "grad_norm": 1.3664193153381348, "kl": 0.845703125, "learning_rate": 1e-06, "loss": 0.0095, "reward": 6.609415014584859, "reward_std": 2.7110483845074973, "rewards/citation_reward_func": 4.319727818171184, "rewards/correctness_reward_func": 2.9251699844996133, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -1.4285713980595272, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49887068569660187, "step": 219 }, { "completion_length": 174.56122334798178, "epoch": 0.46332046332046334, "grad_norm": 1.1141589879989624, "kl": 2.353515625, "learning_rate": 1e-06, "loss": 0.0244, "reward": 7.031609058380127, "reward_std": 2.6527690092722573, "rewards/citation_reward_func": 4.200680096944173, "rewards/correctness_reward_func": 3.1462584336598716, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -0.9047618905703226, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 220 }, { "completion_length": 184.224484761556, "epoch": 0.46542646542646543, "grad_norm": 0.9297390580177307, "kl": 0.7239583333333334, "learning_rate": 1e-06, "loss": 0.008, "reward": 6.469874382019043, "reward_std": 3.017313222090403, "rewards/citation_reward_func": 4.064625700314839, "rewards/correctness_reward_func": 2.8741495509942374, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.8163265188535055, "rewards/penalize_wrong_passages_reward_func": -0.6530612135926882, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5004863242308298, "step": 221 }, { "completion_length": 191.47618611653647, "epoch": 0.4675324675324675, "grad_norm": 0.9865539073944092, "kl": 0.7630208333333334, "learning_rate": 1e-06, "loss": 0.0085, "reward": 4.598503390947978, "reward_std": 3.093896965185801, "rewards/citation_reward_func": 4.166666626930237, "rewards/correctness_reward_func": 1.0884353617827098, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.5102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.1292516452570756, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48965980112552643, "step": 222 }, { "completion_length": 175.85033671061197, "epoch": 0.4696384696384696, "grad_norm": 1.1267229318618774, "kl": 0.7220052083333334, "learning_rate": 1e-06, "loss": 0.0102, "reward": 8.249298870563507, "reward_std": 2.0633734663327536, "rewards/citation_reward_func": 4.438775459925334, "rewards/correctness_reward_func": 3.5714283883571625, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.6598639513055483, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 223 }, { "completion_length": 174.49319458007812, "epoch": 0.47174447174447176, "grad_norm": 0.8411754369735718, "kl": 0.740234375, "learning_rate": 1e-06, "loss": 0.0093, "reward": 7.572517077128093, "reward_std": 2.8577981988588967, "rewards/citation_reward_func": 4.515306035677592, "rewards/correctness_reward_func": 3.1462584336598716, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.816326508919398, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4993876814842224, "step": 224 }, { "completion_length": 146.80951944986978, "epoch": 0.47385047385047385, "grad_norm": 0.7524572610855103, "kl": 0.8352864583333334, "learning_rate": 1e-06, "loss": 0.0123, "reward": 8.59113605817159, "reward_std": 1.6423706163962681, "rewards/citation_reward_func": 4.3112244208653765, "rewards/correctness_reward_func": 4.523809472719829, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -1.0748299211263657, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 225 }, { "completion_length": 162.69387563069662, "epoch": 0.47595647595647594, "grad_norm": 0.6513383388519287, "kl": 0.72265625, "learning_rate": 1e-06, "loss": 0.0096, "reward": 5.912564555803935, "reward_std": 1.7237219214439392, "rewards/citation_reward_func": 4.642857074737549, "rewards/correctness_reward_func": 3.962584972381592, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -3.591836671034495, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 226 }, { "completion_length": 157.63264973958334, "epoch": 0.47806247806247804, "grad_norm": 0.9033396244049072, "kl": 0.7278645833333334, "learning_rate": 1e-06, "loss": 0.0091, "reward": 5.257574955622355, "reward_std": 1.365848998228709, "rewards/citation_reward_func": 4.778911431630452, "rewards/correctness_reward_func": 3.4863944053649902, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -4.006802638371785, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4990713646014531, "step": 227 }, { "completion_length": 157.925168355306, "epoch": 0.4801684801684802, "grad_norm": 1.0495508909225464, "kl": 0.8313802083333334, "learning_rate": 1e-06, "loss": 0.0111, "reward": 5.719282348950704, "reward_std": 1.4672572215398152, "rewards/citation_reward_func": 4.5408161878585815, "rewards/correctness_reward_func": 3.2142856121063232, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -2.9319727222124734, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4981938103834788, "step": 228 }, { "completion_length": 150.01360066731772, "epoch": 0.4822744822744823, "grad_norm": 1.5118231773376465, "kl": 0.7799479166666666, "learning_rate": 1e-06, "loss": 0.0087, "reward": 6.944877624511719, "reward_std": 1.9879997571309407, "rewards/citation_reward_func": 4.260203997294108, "rewards/correctness_reward_func": 2.312925140062968, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.5782312800486883, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 229 }, { "completion_length": 171.23128763834634, "epoch": 0.48438048438048437, "grad_norm": 0.9057350158691406, "kl": 0.7044270833333334, "learning_rate": 1e-06, "loss": 0.0075, "reward": 5.530394633611043, "reward_std": 1.9266496499379475, "rewards/citation_reward_func": 3.928571343421936, "rewards/correctness_reward_func": 1.3605441649754841, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.7006802558898926, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4929795265197754, "step": 230 }, { "completion_length": 170.30271657307944, "epoch": 0.4864864864864865, "grad_norm": 0.9910404086112976, "kl": 0.6901041666666666, "learning_rate": 1e-06, "loss": 0.0075, "reward": 7.714860757191976, "reward_std": 2.9793601632118225, "rewards/citation_reward_func": 4.659863869349162, "rewards/correctness_reward_func": 2.7040815552075705, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.42857141109804314, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.5005747626225153, "step": 231 }, { "completion_length": 154.95237731933594, "epoch": 0.4885924885924886, "grad_norm": 38.09885787963867, "kl": 1.8313802083333333, "learning_rate": 1e-06, "loss": 0.0206, "reward": 8.529652913411459, "reward_std": 1.7911757131417592, "rewards/citation_reward_func": 4.719387690226237, "rewards/correctness_reward_func": 3.180272022883097, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.3673469287653764, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49734007318814594, "step": 232 }, { "completion_length": 132.86394500732422, "epoch": 0.4906984906984907, "grad_norm": 1.356911540031433, "kl": 0.939453125, "learning_rate": 1e-06, "loss": 0.0138, "reward": 8.633653004964193, "reward_std": 1.5540131802360218, "rewards/citation_reward_func": 4.574829816818237, "rewards/correctness_reward_func": 3.673469305038452, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.4625850251565377, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 233 }, { "completion_length": 136.83332951863608, "epoch": 0.4928044928044928, "grad_norm": 1.2966426610946655, "kl": 0.86328125, "learning_rate": 1e-06, "loss": 0.0132, "reward": 9.37684996922811, "reward_std": 1.3524375955263774, "rewards/citation_reward_func": 4.719387610753377, "rewards/correctness_reward_func": 4.166666587193807, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.4081632619102796, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 234 }, { "completion_length": 169.86394246419272, "epoch": 0.49491049491049494, "grad_norm": 0.8496416807174683, "kl": 0.7350260416666666, "learning_rate": 1e-06, "loss": 0.0092, "reward": 7.440455913543701, "reward_std": 2.1057456930478415, "rewards/citation_reward_func": 4.5153060754140215, "rewards/correctness_reward_func": 2.7210883696873984, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.639455775419871, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49827884634335834, "step": 235 }, { "completion_length": 161.74829864501953, "epoch": 0.497016497016497, "grad_norm": 0.8334783315658569, "kl": 0.86328125, "learning_rate": 1e-06, "loss": 0.0139, "reward": 8.172744790712992, "reward_std": 1.832821786403656, "rewards/citation_reward_func": 4.651360511779785, "rewards/correctness_reward_func": 3.8265304962793985, "rewards/formatting_reward_func": 0.4880952338377635, "rewards/length_reward_func": -0.4591836631298065, "rewards/penalize_wrong_passages_reward_func": -0.8231292168299357, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4890713741381963, "step": 236 }, { "completion_length": 189.775510152181, "epoch": 0.4991224991224991, "grad_norm": 2.218723773956299, "kl": 1.2096354166666667, "learning_rate": 1e-06, "loss": 0.016, "reward": 7.1474082469940186, "reward_std": 2.675163338581721, "rewards/citation_reward_func": 4.549319664637248, "rewards/correctness_reward_func": 2.9081631898880005, "rewards/formatting_reward_func": 0.4880952338377635, "rewards/length_reward_func": -0.5612244804700216, "rewards/penalize_wrong_passages_reward_func": -0.7210884143908819, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48414280513922375, "step": 237 }, { "completion_length": 198.9999974568685, "epoch": 0.5012285012285013, "grad_norm": 0.7778282761573792, "kl": 0.65625, "learning_rate": 1e-06, "loss": 0.008, "reward": 4.987530579169591, "reward_std": 3.4593644042809806, "rewards/citation_reward_func": 3.784013549486796, "rewards/correctness_reward_func": 2.142857094605764, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.40816326439380646, "rewards/penalize_wrong_passages_reward_func": -1.353741466999054, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4960339516401291, "step": 238 }, { "completion_length": 135.46938196818033, "epoch": 0.5033345033345034, "grad_norm": 0.6986418962478638, "kl": 0.884765625, "learning_rate": 1e-06, "loss": 0.0134, "reward": 9.60133981704712, "reward_std": 1.030010461807251, "rewards/citation_reward_func": 4.8894557158152265, "rewards/correctness_reward_func": 3.962584932645162, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.25170067449410755, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 239 }, { "completion_length": 125.5646235148112, "epoch": 0.5054405054405054, "grad_norm": 2.5262744426727295, "kl": 1.1393229166666667, "learning_rate": 1e-06, "loss": 0.0155, "reward": 9.468686580657959, "reward_std": 1.2599839965502422, "rewards/citation_reward_func": 4.821428457895915, "rewards/correctness_reward_func": 3.945578138033549, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.29931971927483875, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 240 }, { "completion_length": 125.99659729003906, "epoch": 0.5075465075465075, "grad_norm": 1.0987335443496704, "kl": 0.9466145833333334, "learning_rate": 1e-06, "loss": 0.0131, "reward": 9.814948876698812, "reward_std": 1.6750460763772328, "rewards/citation_reward_func": 4.974489688873291, "rewards/correctness_reward_func": 4.047618945439656, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.15646257748206457, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5003230571746826, "step": 241 }, { "completion_length": 153.58162689208984, "epoch": 0.5096525096525096, "grad_norm": 0.7817453145980835, "kl": 0.775390625, "learning_rate": 1e-06, "loss": 0.01, "reward": 8.463584661483765, "reward_std": 1.5922152797381084, "rewards/citation_reward_func": 4.778911431630452, "rewards/correctness_reward_func": 3.6904760599136353, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.8027210558454195, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 242 }, { "completion_length": 174.13945515950522, "epoch": 0.5117585117585117, "grad_norm": 1.1635452508926392, "kl": 0.8131510416666666, "learning_rate": 1e-06, "loss": 0.011, "reward": 7.308819691340129, "reward_std": 2.3860236207644143, "rewards/citation_reward_func": 4.702380895614624, "rewards/correctness_reward_func": 3.3333332538604736, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.503401351471742, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 243 }, { "completion_length": 185.59523264567056, "epoch": 0.5138645138645138, "grad_norm": 0.7059080004692078, "kl": 0.9095052083333334, "learning_rate": 1e-06, "loss": 0.0124, "reward": 7.917653242746989, "reward_std": 2.116127530733744, "rewards/citation_reward_func": 4.413265148798625, "rewards/correctness_reward_func": 3.8435372511545816, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.20408162971337637, "rewards/penalize_wrong_passages_reward_func": -1.1224489708741505, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49418361485004425, "step": 244 }, { "completion_length": 177.6292495727539, "epoch": 0.515970515970516, "grad_norm": 0.7793982028961182, "kl": 0.7532552083333334, "learning_rate": 1e-06, "loss": 0.0117, "reward": 8.637050946553549, "reward_std": 2.5918923815091452, "rewards/citation_reward_func": 4.540816227595012, "rewards/correctness_reward_func": 4.081632574399312, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.15306122104326883, "rewards/penalize_wrong_passages_reward_func": -0.82993194522957, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 245 }, { "completion_length": 151.10203806559244, "epoch": 0.5180765180765181, "grad_norm": 0.7599817514419556, "kl": 0.73828125, "learning_rate": 1e-06, "loss": 0.0102, "reward": 6.0459217230478925, "reward_std": 1.4534762352705002, "rewards/citation_reward_func": 4.421768585840861, "rewards/correctness_reward_func": 1.2414965331554413, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.5646258524308602, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4983026534318924, "step": 246 }, { "completion_length": 149.7789077758789, "epoch": 0.5201825201825202, "grad_norm": 1.0434558391571045, "kl": 0.8001302083333334, "learning_rate": 1e-06, "loss": 0.0108, "reward": 7.849003394444783, "reward_std": 1.688551406065623, "rewards/citation_reward_func": 4.710884253184001, "rewards/correctness_reward_func": 2.636054356892904, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.39455781939129037, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49866320192813873, "step": 247 }, { "completion_length": 192.9591801961263, "epoch": 0.5222885222885223, "grad_norm": 0.767522931098938, "kl": 0.9921875, "learning_rate": 1e-06, "loss": 0.0118, "reward": 5.846408238013585, "reward_std": 2.7072153290112815, "rewards/citation_reward_func": 3.9540814956029258, "rewards/correctness_reward_func": 2.755101998647054, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -1.4829931687563658, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4824625253677368, "step": 248 }, { "completion_length": 132.92516454060873, "epoch": 0.5243945243945244, "grad_norm": 4.862525463104248, "kl": 1.4322916666666667, "learning_rate": 1e-06, "loss": 0.0167, "reward": 9.254401048024496, "reward_std": 1.774699608484904, "rewards/citation_reward_func": 4.770408074061076, "rewards/correctness_reward_func": 3.8435372908910117, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.36054420471191406, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 249 }, { "completion_length": 140.15986124674478, "epoch": 0.5265005265005265, "grad_norm": 1.2383294105529785, "kl": 1.0032552083333333, "learning_rate": 1e-06, "loss": 0.0122, "reward": 7.85304053624471, "reward_std": 1.551555981238683, "rewards/citation_reward_func": 4.753401279449463, "rewards/correctness_reward_func": 2.4489794969558716, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.2993197174121936, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 250 }, { "completion_length": 143.27550760904947, "epoch": 0.5286065286065286, "grad_norm": 0.9058665633201599, "kl": 0.8587239583333334, "learning_rate": 1e-06, "loss": 0.01, "reward": 6.524013678232829, "reward_std": 2.4335868755976358, "rewards/citation_reward_func": 4.387755115826924, "rewards/correctness_reward_func": 2.0408162673314414, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.7006802608569463, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5002040167649587, "step": 251 }, { "completion_length": 162.08162943522134, "epoch": 0.5307125307125307, "grad_norm": 1.0201135873794556, "kl": 0.8170572916666666, "learning_rate": 1e-06, "loss": 0.0091, "reward": 6.7847583293914795, "reward_std": 1.9675355752309163, "rewards/citation_reward_func": 4.5068027178446455, "rewards/correctness_reward_func": 2.1938775380452475, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.8095238109429678, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49734347065289813, "step": 252 }, { "completion_length": 171.5544179280599, "epoch": 0.5328185328185329, "grad_norm": 1.0924259424209595, "kl": 0.6809895833333334, "learning_rate": 1e-06, "loss": 0.0076, "reward": 5.832513689994812, "reward_std": 2.4871179461479187, "rewards/citation_reward_func": 4.455782254536946, "rewards/correctness_reward_func": 1.4795918265978496, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -0.836734672387441, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4940781792004903, "step": 253 }, { "completion_length": 160.2789077758789, "epoch": 0.534924534924535, "grad_norm": 0.6430193781852722, "kl": 0.8372395833333334, "learning_rate": 1e-06, "loss": 0.0097, "reward": 7.685734748840332, "reward_std": 1.3944465617338817, "rewards/citation_reward_func": 4.022108832995097, "rewards/correctness_reward_func": 3.418367306391398, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.7551020216196775, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5003604739904404, "step": 254 }, { "completion_length": 163.47618611653647, "epoch": 0.537030537030537, "grad_norm": 0.6005195379257202, "kl": 0.7591145833333334, "learning_rate": 1e-06, "loss": 0.011, "reward": 5.973788936932881, "reward_std": 1.2611841062704723, "rewards/citation_reward_func": 4.71088433265686, "rewards/correctness_reward_func": 1.2585033824046452, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.8435373933364948, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 255 }, { "completion_length": 131.9693832397461, "epoch": 0.5391365391365391, "grad_norm": 0.5962944030761719, "kl": 0.96484375, "learning_rate": 1e-06, "loss": 0.0157, "reward": 9.269704023996988, "reward_std": 0.8620906795064608, "rewards/citation_reward_func": 4.863945484161377, "rewards/correctness_reward_func": 4.200680216153462, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.7414965877930323, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 256 }, { "completion_length": 138.84353383382162, "epoch": 0.5412425412425412, "grad_norm": 0.6719671487808228, "kl": 0.89453125, "learning_rate": 1e-06, "loss": 0.0128, "reward": 7.557119131088257, "reward_std": 1.2554789185523987, "rewards/citation_reward_func": 4.566326379776001, "rewards/correctness_reward_func": 2.7040815353393555, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.6598639351626238, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 257 }, { "completion_length": 137.53401311238608, "epoch": 0.5433485433485433, "grad_norm": 0.8949998021125793, "kl": 1.06640625, "learning_rate": 1e-06, "loss": 0.0133, "reward": 8.044159889221191, "reward_std": 2.113215277592341, "rewards/citation_reward_func": 4.693877379099528, "rewards/correctness_reward_func": 3.0442176262537637, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408162971337637, "rewards/penalize_wrong_passages_reward_func": -0.4897959033648173, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4999421040217082, "step": 258 }, { "completion_length": 137.28911336263022, "epoch": 0.5454545454545454, "grad_norm": 11.216543197631836, "kl": 1.9811197916666667, "learning_rate": 1e-06, "loss": 0.0219, "reward": 7.045217672983806, "reward_std": 2.517554461956024, "rewards/citation_reward_func": 4.710884173711141, "rewards/correctness_reward_func": 2.09183669090271, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.43537414570649463, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 259 }, { "completion_length": 147.71428426106772, "epoch": 0.5475605475605475, "grad_norm": 1.2328022718429565, "kl": 0.7734375, "learning_rate": 1e-06, "loss": 0.0082, "reward": 6.194877703984578, "reward_std": 2.832310895125071, "rewards/citation_reward_func": 4.659863789876302, "rewards/correctness_reward_func": 1.4115646084149678, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.6734693770607313, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 260 }, { "completion_length": 138.08503087361655, "epoch": 0.5496665496665497, "grad_norm": 0.8712778091430664, "kl": 2.0670572916666665, "learning_rate": 1e-06, "loss": 0.023, "reward": 6.020068089167277, "reward_std": 1.27622323234876, "rewards/citation_reward_func": 4.464285612106323, "rewards/correctness_reward_func": 1.275510162115097, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.6666666567325592, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49795911212762195, "step": 261 }, { "completion_length": 136.39795430501303, "epoch": 0.5517725517725518, "grad_norm": 0.970741868019104, "kl": 0.8522135416666666, "learning_rate": 1e-06, "loss": 0.0107, "reward": 7.543517192204793, "reward_std": 1.5009947220484416, "rewards/citation_reward_func": 4.719387610753377, "rewards/correctness_reward_func": 3.520408014456431, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.6462584833304088, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 262 }, { "completion_length": 148.2448933919271, "epoch": 0.5538785538785539, "grad_norm": 0.8934192061424255, "kl": 0.71484375, "learning_rate": 1e-06, "loss": 0.0091, "reward": 6.40746267636617, "reward_std": 1.2594018677870433, "rewards/citation_reward_func": 4.872448921203613, "rewards/correctness_reward_func": 2.3639455238978067, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -1.8299319446086884, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 263 }, { "completion_length": 155.17686971028647, "epoch": 0.555984555984556, "grad_norm": 0.6256468892097473, "kl": 0.7734375, "learning_rate": 1e-06, "loss": 0.0108, "reward": 7.1438571612040205, "reward_std": 1.2106933891773224, "rewards/citation_reward_func": 4.846938769022624, "rewards/correctness_reward_func": 2.789115568002065, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.4421768623093765, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 264 }, { "completion_length": 153.8197250366211, "epoch": 0.5580905580905581, "grad_norm": 0.9129427075386047, "kl": 0.728515625, "learning_rate": 1e-06, "loss": 0.0105, "reward": 8.354840358098349, "reward_std": 1.5037815868854523, "rewards/citation_reward_func": 4.73639456431071, "rewards/correctness_reward_func": 2.976190427939097, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.3061224476744731, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4993978887796402, "step": 265 }, { "completion_length": 145.45237731933594, "epoch": 0.5601965601965602, "grad_norm": 0.5100239515304565, "kl": 0.787109375, "learning_rate": 1e-06, "loss": 0.0112, "reward": 7.560523907343547, "reward_std": 1.0577657222747803, "rewards/citation_reward_func": 4.7874148686726885, "rewards/correctness_reward_func": 2.4999999006589255, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.7278911328564087, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 266 }, { "completion_length": 171.1496556599935, "epoch": 0.5623025623025623, "grad_norm": 0.934743344783783, "kl": 0.66796875, "learning_rate": 1e-06, "loss": 0.0084, "reward": 7.694870392481486, "reward_std": 2.9706706007321677, "rewards/citation_reward_func": 4.226190368334453, "rewards/correctness_reward_func": 3.571428418159485, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -0.8435373902320862, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 267 }, { "completion_length": 159.51700337727866, "epoch": 0.5644085644085645, "grad_norm": 2.148893356323242, "kl": 0.7467447916666666, "learning_rate": 1e-06, "loss": 0.0084, "reward": 6.452187379201253, "reward_std": 2.600463350613912, "rewards/citation_reward_func": 4.515305956204732, "rewards/correctness_reward_func": 2.1598638792832694, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020383834839, "rewards/penalize_wrong_passages_reward_func": -0.9659863697985808, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49810537695884705, "step": 268 }, { "completion_length": 142.34353892008463, "epoch": 0.5665145665145666, "grad_norm": 1.0294554233551025, "kl": 0.7233072916666666, "learning_rate": 1e-06, "loss": 0.008, "reward": 6.883653163909912, "reward_std": 2.1682270169258118, "rewards/citation_reward_func": 4.7704081535339355, "rewards/correctness_reward_func": 2.4659863313039145, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -1.3537414769331615, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 269 }, { "completion_length": 142.6394526163737, "epoch": 0.5686205686205686, "grad_norm": 0.7771506309509277, "kl": 0.8059895833333334, "learning_rate": 1e-06, "loss": 0.0106, "reward": 7.640455881754558, "reward_std": 1.9366973439852397, "rewards/citation_reward_func": 4.795918305714925, "rewards/correctness_reward_func": 3.231292406717936, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -1.3877550611893337, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 270 }, { "completion_length": 163.00679779052734, "epoch": 0.5707265707265707, "grad_norm": 0.785870373249054, "kl": 0.9095052083333334, "learning_rate": 1e-06, "loss": 0.0117, "reward": 8.092830101648966, "reward_std": 2.4080686370531716, "rewards/citation_reward_func": 4.353741367657979, "rewards/correctness_reward_func": 3.537414828936259, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.6394557779033979, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 271 }, { "completion_length": 161.8945515950521, "epoch": 0.5728325728325728, "grad_norm": 0.848395049571991, "kl": 0.798828125, "learning_rate": 1e-06, "loss": 0.011, "reward": 8.57502031326294, "reward_std": 1.9230751991271973, "rewards/citation_reward_func": 4.625850200653076, "rewards/correctness_reward_func": 3.3673469026883445, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.3673469324906667, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5001904020706812, "step": 272 }, { "completion_length": 166.25169372558594, "epoch": 0.5749385749385749, "grad_norm": 0.69926917552948, "kl": 0.7936197916666666, "learning_rate": 1e-06, "loss": 0.0118, "reward": 8.480765422185263, "reward_std": 1.7872259020805359, "rewards/citation_reward_func": 4.464285572369893, "rewards/correctness_reward_func": 3.5544217427571616, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.42857142724096775, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49607136845588684, "step": 273 }, { "completion_length": 146.66326141357422, "epoch": 0.577044577044577, "grad_norm": 0.987402617931366, "kl": 1.2063802083333333, "learning_rate": 1e-06, "loss": 0.014, "reward": 8.949979543685913, "reward_std": 2.1892781058947244, "rewards/citation_reward_func": 4.676870663960774, "rewards/correctness_reward_func": 3.6564625104268393, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.333333329608043, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 274 }, { "completion_length": 142.36394246419272, "epoch": 0.5791505791505791, "grad_norm": 1.0434736013412476, "kl": 1.03515625, "learning_rate": 1e-06, "loss": 0.0133, "reward": 8.55031975110372, "reward_std": 1.6762052774429321, "rewards/citation_reward_func": 4.8894557158152265, "rewards/correctness_reward_func": 2.8741495609283447, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.1632653015355269, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 275 }, { "completion_length": 144.30611928304037, "epoch": 0.5812565812565813, "grad_norm": 2.7037227153778076, "kl": 1.1516927083333333, "learning_rate": 1e-06, "loss": 0.0133, "reward": 7.040115435918172, "reward_std": 1.6779302060604095, "rewards/citation_reward_func": 4.9234693845113116, "rewards/correctness_reward_func": 2.4999999403953552, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.3333333035310109, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 276 }, { "completion_length": 138.149658203125, "epoch": 0.5833625833625834, "grad_norm": 0.9938499927520752, "kl": 1.4986979166666667, "learning_rate": 1e-06, "loss": 0.0181, "reward": 6.847938934961955, "reward_std": 1.6733147700627644, "rewards/citation_reward_func": 4.795918305714925, "rewards/correctness_reward_func": 2.857142766316732, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.7551019787788391, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 277 }, { "completion_length": 141.54081217447916, "epoch": 0.5854685854685855, "grad_norm": 0.6015098690986633, "kl": 0.982421875, "learning_rate": 1e-06, "loss": 0.013, "reward": 6.965959231058757, "reward_std": 1.5529690434535344, "rewards/citation_reward_func": 4.778911431630452, "rewards/correctness_reward_func": 3.5884352922439575, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.20408162971337637, "rewards/penalize_wrong_passages_reward_func": -2.1904762083043656, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4965713669856389, "step": 278 }, { "completion_length": 154.874148050944, "epoch": 0.5875745875745876, "grad_norm": 0.8088123798370361, "kl": 0.90234375, "learning_rate": 1e-06, "loss": 0.0112, "reward": 7.800319751103719, "reward_std": 1.5292981366316478, "rewards/citation_reward_func": 4.829931894938151, "rewards/correctness_reward_func": 2.1938774784406028, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.22448979442318281, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 279 }, { "completion_length": 147.12244415283203, "epoch": 0.5896805896805897, "grad_norm": 1.0946311950683594, "kl": 0.9694010416666666, "learning_rate": 1e-06, "loss": 0.011, "reward": 7.138755162556966, "reward_std": 1.8616977731386821, "rewards/citation_reward_func": 4.787414789199829, "rewards/correctness_reward_func": 1.632652997970581, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.23129250730077425, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 280 }, { "completion_length": 147.74829864501953, "epoch": 0.5917865917865918, "grad_norm": 0.99028480052948, "kl": 1.228515625, "learning_rate": 1e-06, "loss": 0.0143, "reward": 6.8122890790303545, "reward_std": 1.8934701879819233, "rewards/citation_reward_func": 4.846938689549764, "rewards/correctness_reward_func": 1.4285713632901509, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.23809523383776346, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4976631999015808, "step": 281 }, { "completion_length": 159.523806254069, "epoch": 0.5938925938925939, "grad_norm": 0.7205305099487305, "kl": 0.779296875, "learning_rate": 1e-06, "loss": 0.0102, "reward": 5.7441972096761065, "reward_std": 1.631241689125697, "rewards/citation_reward_func": 4.209183533986409, "rewards/correctness_reward_func": 1.3435373703638713, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.8095237972835699, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 282 }, { "completion_length": 159.98299153645834, "epoch": 0.595998595998596, "grad_norm": 1.254953145980835, "kl": 0.9583333333333334, "learning_rate": 1e-06, "loss": 0.0115, "reward": 5.963574965794881, "reward_std": 2.36332497994105, "rewards/citation_reward_func": 4.166666547457377, "rewards/correctness_reward_func": 2.0918366809686026, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -0.9795918166637421, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4958876967430115, "step": 283 }, { "completion_length": 156.17686716715494, "epoch": 0.5981045981045982, "grad_norm": 0.9321157932281494, "kl": 0.8502604166666666, "learning_rate": 1e-06, "loss": 0.011, "reward": 7.640455881754558, "reward_std": 1.9198250969250996, "rewards/citation_reward_func": 4.455782175064087, "rewards/correctness_reward_func": 2.9251699844996133, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.5374149531126022, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 284 }, { "completion_length": 165.04761505126953, "epoch": 0.6002106002106002, "grad_norm": 1.290971040725708, "kl": 8.250651041666666, "learning_rate": 1e-06, "loss": 0.0833, "reward": 6.30113951365153, "reward_std": 3.333476463953654, "rewards/citation_reward_func": 3.852040688196818, "rewards/correctness_reward_func": 2.9421767791112265, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -1.2380952437718709, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5001189808050791, "step": 285 }, { "completion_length": 167.17346700032553, "epoch": 0.6023166023166023, "grad_norm": 1.112428069114685, "kl": 0.8079427083333334, "learning_rate": 1e-06, "loss": 0.0093, "reward": 6.459765354792277, "reward_std": 2.901971995830536, "rewards/citation_reward_func": 3.8945577144622803, "rewards/correctness_reward_func": 2.9251699646313987, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020383834839, "rewards/penalize_wrong_passages_reward_func": -1.1020407974720001, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49718021353085834, "step": 286 }, { "completion_length": 162.51360321044922, "epoch": 0.6044226044226044, "grad_norm": 1.2326608896255493, "kl": 1.4615885416666667, "learning_rate": 1e-06, "loss": 0.0159, "reward": 5.752693970998128, "reward_std": 2.563380718231201, "rewards/citation_reward_func": 3.945578098297119, "rewards/correctness_reward_func": 1.870748261610667, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -0.8027210732301077, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 287 }, { "completion_length": 175.84353637695312, "epoch": 0.6065286065286065, "grad_norm": 1.1159588098526, "kl": 0.9329427083333334, "learning_rate": 1e-06, "loss": 0.0105, "reward": 6.295217712720235, "reward_std": 2.303104837735494, "rewards/citation_reward_func": 4.175170024236043, "rewards/correctness_reward_func": 2.040816302100817, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.7687074740727743, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 288 }, { "completion_length": 155.27210489908853, "epoch": 0.6086346086346086, "grad_norm": 0.7354316115379333, "kl": 0.734375, "learning_rate": 1e-06, "loss": 0.0131, "reward": 9.381948471069336, "reward_std": 1.341506339609623, "rewards/citation_reward_func": 4.76190463701884, "rewards/correctness_reward_func": 4.149659872055054, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.37414966337382793, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 289 }, { "completion_length": 144.54081217447916, "epoch": 0.6107406107406107, "grad_norm": 0.8398804664611816, "kl": 0.7884114583333334, "learning_rate": 1e-06, "loss": 0.0122, "reward": 9.019707520802816, "reward_std": 1.1839089542627335, "rewards/citation_reward_func": 4.719387690226237, "rewards/correctness_reward_func": 4.149659752845764, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.8503401384999355, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 290 }, { "completion_length": 142.43877410888672, "epoch": 0.6128466128466128, "grad_norm": 1.53350031375885, "kl": 0.890625, "learning_rate": 1e-06, "loss": 0.011, "reward": 7.9516801834106445, "reward_std": 1.5891411205132802, "rewards/citation_reward_func": 4.719387690226237, "rewards/correctness_reward_func": 2.653061166405678, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.42176870505015057, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 291 }, { "completion_length": 145.73809305826822, "epoch": 0.614952614952615, "grad_norm": 1.1438617706298828, "kl": 0.7376302083333334, "learning_rate": 1e-06, "loss": 0.0104, "reward": 9.490796089172363, "reward_std": 1.4907788634300232, "rewards/citation_reward_func": 4.88095235824585, "rewards/correctness_reward_func": 3.860544125239054, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.251700675735871, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 292 }, { "completion_length": 178.14285278320312, "epoch": 0.6170586170586171, "grad_norm": 1.3113124370574951, "kl": 0.87890625, "learning_rate": 1e-06, "loss": 0.0106, "reward": 6.215275506178538, "reward_std": 2.398002475500107, "rewards/citation_reward_func": 4.506802638371785, "rewards/correctness_reward_func": 3.571428418159485, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.2551020383834839, "rewards/penalize_wrong_passages_reward_func": -2.5986393888791404, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.495887686808904, "step": 293 }, { "completion_length": 166.6972745259603, "epoch": 0.6191646191646192, "grad_norm": 1.2684246301651, "kl": 1.12890625, "learning_rate": 1e-06, "loss": 0.0119, "reward": 5.689976135889689, "reward_std": 3.035001496473948, "rewards/citation_reward_func": 4.4557822942733765, "rewards/correctness_reward_func": 2.346938749154409, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.8843536774317424, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4943978985150655, "step": 294 }, { "completion_length": 169.28911336263022, "epoch": 0.6212706212706213, "grad_norm": 0.9547519683837891, "kl": 1.728515625, "learning_rate": 1e-06, "loss": 0.0178, "reward": 3.982292652130127, "reward_std": 4.037615021069844, "rewards/citation_reward_func": 3.9030612309773765, "rewards/correctness_reward_func": 1.5816325893004735, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.30612244705359143, "rewards/penalize_wrong_passages_reward_func": -1.857142796119054, "rewards/unicode_reward_func": -0.3401360511779785, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 295 }, { "completion_length": 157.51020050048828, "epoch": 0.6233766233766234, "grad_norm": 0.9751655459403992, "kl": 0.9680989583333334, "learning_rate": 1e-06, "loss": 0.012, "reward": 6.598163406054179, "reward_std": 2.1323233445485434, "rewards/citation_reward_func": 4.498299201329549, "rewards/correctness_reward_func": 2.176870713631312, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -0.8163264989852905, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49612238506476086, "step": 296 }, { "completion_length": 144.2040761311849, "epoch": 0.6254826254826255, "grad_norm": 0.8613839745521545, "kl": 0.865234375, "learning_rate": 1e-06, "loss": 0.0124, "reward": 7.519839843114217, "reward_std": 1.2146833290656407, "rewards/citation_reward_func": 4.438775459925334, "rewards/correctness_reward_func": 2.6190475821495056, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.5374149655302366, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49943191309769946, "step": 297 }, { "completion_length": 137.93196868896484, "epoch": 0.6275886275886275, "grad_norm": 0.8827674388885498, "kl": 0.9602864583333334, "learning_rate": 1e-06, "loss": 0.0128, "reward": 6.497445583343506, "reward_std": 1.1725004116694133, "rewards/citation_reward_func": 4.532312790552775, "rewards/correctness_reward_func": 1.3095237612724304, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.3401360536615054, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4957448293765386, "step": 298 }, { "completion_length": 140.56462605794272, "epoch": 0.6296946296946297, "grad_norm": 0.6594263315200806, "kl": 0.8899739583333334, "learning_rate": 1e-06, "loss": 0.0121, "reward": 6.2714048226674395, "reward_std": 0.911820242802302, "rewards/citation_reward_func": 4.7874148686726885, "rewards/correctness_reward_func": 4.625850319862366, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -4.0884352922439575, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 299 }, { "completion_length": 149.50339889526367, "epoch": 0.6318006318006318, "grad_norm": 1.0458486080169678, "kl": 0.986328125, "learning_rate": 1e-06, "loss": 0.0135, "reward": 8.150659720102945, "reward_std": 1.8430875837802887, "rewards/citation_reward_func": 4.234693845113118, "rewards/correctness_reward_func": 3.707482933998108, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.7414965803424517, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 300 }, { "completion_length": 166.51360321044922, "epoch": 0.6339066339066339, "grad_norm": 0.6753512620925903, "kl": 0.845703125, "learning_rate": 1e-06, "loss": 0.013, "reward": 8.259503444035849, "reward_std": 2.7765374779701233, "rewards/citation_reward_func": 4.540816227595012, "rewards/correctness_reward_func": 3.8775508801142373, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -0.7346938600142797, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 301 }, { "completion_length": 183.44217427571616, "epoch": 0.636012636012636, "grad_norm": 0.9966219067573547, "kl": 1.2005208333333333, "learning_rate": 1e-06, "loss": 0.014, "reward": 4.120401422182719, "reward_std": 3.795739163955053, "rewards/citation_reward_func": 4.073129216829936, "rewards/correctness_reward_func": 3.656462470690409, "rewards/formatting_reward_func": 0.49234693745772046, "rewards/length_reward_func": -1.7346938749154408, "rewards/penalize_wrong_passages_reward_func": -2.857142766316732, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4902992645899455, "step": 302 }, { "completion_length": 193.60203552246094, "epoch": 0.6381186381186381, "grad_norm": 1.1840745210647583, "kl": 1.0247395833333333, "learning_rate": 1e-06, "loss": 0.0109, "reward": 5.088537494341533, "reward_std": 5.065946817398071, "rewards/citation_reward_func": 3.945578098297119, "rewards/correctness_reward_func": 3.027210831642151, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -1.5306122104326885, "rewards/penalize_wrong_passages_reward_func": -1.3401360313097637, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49159857630729675, "step": 303 }, { "completion_length": 168.074826558431, "epoch": 0.6402246402246402, "grad_norm": 0.9291290044784546, "kl": 0.8951822916666666, "learning_rate": 1e-06, "loss": 0.0107, "reward": 5.667663335800171, "reward_std": 4.012887159983317, "rewards/citation_reward_func": 4.676870584487915, "rewards/correctness_reward_func": 2.4149659276008606, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -1.9387754599253337, "rewards/penalize_wrong_passages_reward_func": -0.4829931954542796, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 304 }, { "completion_length": 148.09523264567056, "epoch": 0.6423306423306423, "grad_norm": 1.2598353624343872, "kl": 1.2278645833333333, "learning_rate": 1e-06, "loss": 0.0156, "reward": 6.578816254933675, "reward_std": 1.0740979760885239, "rewards/citation_reward_func": 4.396258354187012, "rewards/correctness_reward_func": 2.0748298863569894, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.8911564573645592, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49888428548971814, "step": 305 }, { "completion_length": 134.02720896402994, "epoch": 0.6444366444366444, "grad_norm": 0.8883174657821655, "kl": 1.0091145833333333, "learning_rate": 1e-06, "loss": 0.0146, "reward": 9.13562266031901, "reward_std": 1.1919774264097214, "rewards/citation_reward_func": 4.710884173711141, "rewards/correctness_reward_func": 3.7925169865290322, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.2653061201175054, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4995679606993993, "step": 306 }, { "completion_length": 140.46598052978516, "epoch": 0.6465426465426466, "grad_norm": 0.7637023329734802, "kl": 0.9381510416666666, "learning_rate": 1e-06, "loss": 0.0129, "reward": 6.968687216440837, "reward_std": 1.1493754784266155, "rewards/citation_reward_func": 4.3622448444366455, "rewards/correctness_reward_func": 2.295918265978495, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.6394557605187098, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 307 }, { "completion_length": 131.6020393371582, "epoch": 0.6486486486486487, "grad_norm": 0.6228148341178894, "kl": 1.03125, "learning_rate": 1e-06, "loss": 0.016, "reward": 8.279911756515503, "reward_std": 0.8148330748081207, "rewards/citation_reward_func": 4.795918226242065, "rewards/correctness_reward_func": 3.2142856121063232, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.6802721035977205, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 308 }, { "completion_length": 139.99659474690756, "epoch": 0.6507546507546508, "grad_norm": 19.074426651000977, "kl": 1.279296875, "learning_rate": 1e-06, "loss": 0.0158, "reward": 5.977187156677246, "reward_std": 1.4246279398600261, "rewards/citation_reward_func": 4.472789009412129, "rewards/correctness_reward_func": 4.608843445777893, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -3.897959033648173, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 309 }, { "completion_length": 158.9897944132487, "epoch": 0.6528606528606529, "grad_norm": 1.0727325677871704, "kl": 0.873046875, "learning_rate": 1e-06, "loss": 0.0104, "reward": 7.802013556162517, "reward_std": 2.125181575616201, "rewards/citation_reward_func": 4.676870743433635, "rewards/correctness_reward_func": 2.704081575075785, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.42176869759957, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 310 }, { "completion_length": 164.09863789876303, "epoch": 0.654966654966655, "grad_norm": 0.8916863799095154, "kl": 0.9446614583333334, "learning_rate": 1e-06, "loss": 0.0102, "reward": 7.430758555730184, "reward_std": 2.2719212770462036, "rewards/citation_reward_func": 4.472789009412129, "rewards/correctness_reward_func": 2.6360543767611184, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.6258503198623657, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4987856447696686, "step": 311 }, { "completion_length": 136.65646107991537, "epoch": 0.657072657072657, "grad_norm": 47.32864761352539, "kl": 2.8509114583333335, "learning_rate": 1e-06, "loss": 0.0309, "reward": 6.757802804311116, "reward_std": 1.4053757439057033, "rewards/citation_reward_func": 4.668367306391398, "rewards/correctness_reward_func": 3.5204081535339355, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -2.3809523483117423, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 312 }, { "completion_length": 152.82312774658203, "epoch": 0.6591786591786591, "grad_norm": 1.4776674509048462, "kl": 1.0787760416666667, "learning_rate": 1e-06, "loss": 0.0113, "reward": 6.497523943583171, "reward_std": 2.623288551966349, "rewards/citation_reward_func": 4.6343536376953125, "rewards/correctness_reward_func": 1.9557822744051616, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.040816307067871, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4992244243621826, "step": 313 }, { "completion_length": 161.29251352945963, "epoch": 0.6612846612846612, "grad_norm": 1.09335196018219, "kl": 1.1256510416666667, "learning_rate": 1e-06, "loss": 0.0119, "reward": 6.773108879725139, "reward_std": 3.392216761906942, "rewards/citation_reward_func": 4.540816307067871, "rewards/correctness_reward_func": 2.363945464293162, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020383834839, "rewards/penalize_wrong_passages_reward_func": -0.8775509893894196, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 314 }, { "completion_length": 164.6700642903646, "epoch": 0.6633906633906634, "grad_norm": 13.127339363098145, "kl": 2.5885416666666665, "learning_rate": 1e-06, "loss": 0.027, "reward": 6.473789056142171, "reward_std": 3.381900449593862, "rewards/citation_reward_func": 4.574829816818237, "rewards/correctness_reward_func": 2.057823042074839, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.561224490404129, "rewards/penalize_wrong_passages_reward_func": -0.5986394435167313, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 315 }, { "completion_length": 166.86394246419272, "epoch": 0.6654966654966655, "grad_norm": 2.0115606784820557, "kl": 1.81640625, "learning_rate": 1e-06, "loss": 0.0196, "reward": 5.744387944539388, "reward_std": 3.269197463989258, "rewards/citation_reward_func": 4.294217665990193, "rewards/correctness_reward_func": 2.0068026582400003, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.6632653077443441, "rewards/penalize_wrong_passages_reward_func": -0.8843537370363871, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4943876961867015, "step": 316 }, { "completion_length": 148.34353383382162, "epoch": 0.6676026676026676, "grad_norm": 17.85995864868164, "kl": 3.5807291666666665, "learning_rate": 1e-06, "loss": 0.0383, "reward": 7.790108839670817, "reward_std": 2.6309839884440103, "rewards/citation_reward_func": 4.625850200653076, "rewards/correctness_reward_func": 3.061224381128947, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -0.4829931929707527, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 317 }, { "completion_length": 140.26870473225912, "epoch": 0.6697086697086697, "grad_norm": 1.0049431324005127, "kl": 1.2799479166666667, "learning_rate": 1e-06, "loss": 0.016, "reward": 7.075745026270549, "reward_std": 2.417468766371409, "rewards/citation_reward_func": 4.396258433659871, "rewards/correctness_reward_func": 2.8061224222183228, "rewards/formatting_reward_func": 0.4948979616165161, "rewards/length_reward_func": -0.40816326439380646, "rewards/penalize_wrong_passages_reward_func": -0.7074829836686453, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49411219358444214, "step": 318 }, { "completion_length": 137.91836420694986, "epoch": 0.6718146718146718, "grad_norm": 0.9339328408241272, "kl": 1.0768229166666667, "learning_rate": 1e-06, "loss": 0.0155, "reward": 9.93605089187622, "reward_std": 1.3493054906527202, "rewards/citation_reward_func": 4.727891127268474, "rewards/correctness_reward_func": 4.455782135327657, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.19727890690167746, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.5006767958402634, "step": 319 }, { "completion_length": 150.0952377319336, "epoch": 0.6739206739206739, "grad_norm": 1.0160009860992432, "kl": 1.0696614583333333, "learning_rate": 1e-06, "loss": 0.0142, "reward": 9.079231341679892, "reward_std": 1.7906232078870137, "rewards/citation_reward_func": 4.76190463701884, "rewards/correctness_reward_func": 3.7244897286097207, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.20408163219690323, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 320 }, { "completion_length": 163.38094838460287, "epoch": 0.676026676026676, "grad_norm": 0.927356481552124, "kl": 0.9557291666666666, "learning_rate": 1e-06, "loss": 0.0115, "reward": 7.07500680287679, "reward_std": 2.7190446058909097, "rewards/citation_reward_func": 4.294217745463054, "rewards/correctness_reward_func": 2.7210883696873984, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -0.5782312750816345, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49677544832229614, "step": 321 }, { "completion_length": 135.02040354410806, "epoch": 0.6781326781326781, "grad_norm": 0.7541379928588867, "kl": 1.3046875, "learning_rate": 1e-06, "loss": 0.0186, "reward": 9.808823108673096, "reward_std": 1.0998690476020176, "rewards/citation_reward_func": 4.821428537368774, "rewards/correctness_reward_func": 4.336734612782796, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.29931971927483875, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 322 }, { "completion_length": 151.50340016682944, "epoch": 0.6802386802386803, "grad_norm": 1.2360163927078247, "kl": 1.322265625, "learning_rate": 1e-06, "loss": 0.0162, "reward": 7.388632615407308, "reward_std": 1.9448821544647217, "rewards/citation_reward_func": 4.3792515595753985, "rewards/correctness_reward_func": 2.7721087535222373, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.557823121547699, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49917681018511456, "step": 323 }, { "completion_length": 153.70748138427734, "epoch": 0.6823446823446824, "grad_norm": 1.0694539546966553, "kl": 1.0631510416666667, "learning_rate": 1e-06, "loss": 0.0137, "reward": 6.286017080148061, "reward_std": 3.029209574063619, "rewards/citation_reward_func": 4.625850200653076, "rewards/correctness_reward_func": 2.3129251102606454, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.9183673113584518, "rewards/penalize_wrong_passages_reward_func": -0.5578231240312258, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4934999297062556, "step": 324 }, { "completion_length": 138.7653020222982, "epoch": 0.6844506844506845, "grad_norm": 2.301809549331665, "kl": 1.3697916666666667, "learning_rate": 1e-06, "loss": 0.0154, "reward": 8.145108779271444, "reward_std": 2.592639982700348, "rewards/citation_reward_func": 4.88095235824585, "rewards/correctness_reward_func": 3.1462584336598716, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -0.6258503198623657, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4988502711057663, "step": 325 }, { "completion_length": 150.5816281636556, "epoch": 0.6865566865566866, "grad_norm": 1.4067254066467285, "kl": 1.146484375, "learning_rate": 1e-06, "loss": 0.015, "reward": 6.889067967732747, "reward_std": 3.112168868382772, "rewards/citation_reward_func": 4.685374021530151, "rewards/correctness_reward_func": 3.1802720030148826, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -1.275510181983312, "rewards/penalize_wrong_passages_reward_func": -0.6870748164753119, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.491108775138855, "step": 326 }, { "completion_length": 146.06802368164062, "epoch": 0.6886626886626886, "grad_norm": 1.3995894193649292, "kl": 1.0501302083333333, "learning_rate": 1e-06, "loss": 0.0143, "reward": 8.407578070958456, "reward_std": 1.973113218943278, "rewards/citation_reward_func": 4.812925020853679, "rewards/correctness_reward_func": 3.452380895614624, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.459183673063914, "rewards/penalize_wrong_passages_reward_func": -0.3945578138033549, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49771422147750854, "step": 327 }, { "completion_length": 165.10543823242188, "epoch": 0.6907686907686907, "grad_norm": 1.1846466064453125, "kl": 1.28515625, "learning_rate": 1e-06, "loss": 0.0138, "reward": 5.958414912223816, "reward_std": 3.437282919883728, "rewards/citation_reward_func": 4.20918349424998, "rewards/correctness_reward_func": 2.1938775181770325, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.40816326439380646, "rewards/penalize_wrong_passages_reward_func": -0.8571428457895914, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4924285064140956, "step": 328 }, { "completion_length": 147.21768442789713, "epoch": 0.6928746928746928, "grad_norm": 1.1162633895874023, "kl": 1.1796875, "learning_rate": 1e-06, "loss": 0.0138, "reward": 7.48524824778239, "reward_std": 1.8048174877961476, "rewards/citation_reward_func": 4.515305995941162, "rewards/correctness_reward_func": 2.9081631700197854, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.3571428507566452, "rewards/penalize_wrong_passages_reward_func": -0.5782312899827957, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49715299407641095, "step": 329 }, { "completion_length": 148.9251708984375, "epoch": 0.694980694980695, "grad_norm": 0.6264429092407227, "kl": 0.92578125, "learning_rate": 1e-06, "loss": 0.013, "reward": 9.310523192087809, "reward_std": 1.478491594394048, "rewards/citation_reward_func": 4.812925100326538, "rewards/correctness_reward_func": 3.7244897286097207, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.17687074281275272, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 330 }, { "completion_length": 164.04080963134766, "epoch": 0.6970866970866971, "grad_norm": 1.2959609031677246, "kl": 1.0891927083333333, "learning_rate": 1e-06, "loss": 0.0142, "reward": 8.000187873840332, "reward_std": 2.20181867480278, "rewards/citation_reward_func": 4.390589475631714, "rewards/correctness_reward_func": 3.503401279449463, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -0.6326530426740646, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49565299848715466, "step": 331 }, { "completion_length": 225.14625549316406, "epoch": 0.6991926991926992, "grad_norm": 3.462467670440674, "kl": 0.9622395833333334, "learning_rate": 1e-06, "loss": 0.0096, "reward": 2.013375143210093, "reward_std": 3.3742276430130005, "rewards/citation_reward_func": 2.8684807221094766, "rewards/correctness_reward_func": 0.6972788994510969, "rewards/formatting_reward_func": 0.4931972821553548, "rewards/length_reward_func": -0.7142857064803442, "rewards/penalize_wrong_passages_reward_func": -1.8231291969617207, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49183327456315357, "step": 332 }, { "completion_length": 226.3333282470703, "epoch": 0.7012987012987013, "grad_norm": 1.2166374921798706, "kl": 1.123046875, "learning_rate": 1e-06, "loss": 0.0112, "reward": 2.068095083038012, "reward_std": 3.714888095855713, "rewards/citation_reward_func": 3.129251480102539, "rewards/correctness_reward_func": 0.9013605018456777, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.6122448990742365, "rewards/penalize_wrong_passages_reward_func": -2.1700679858525596, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4932652513186137, "step": 333 }, { "completion_length": 205.5408172607422, "epoch": 0.7034047034047034, "grad_norm": 1.0852947235107422, "kl": 2.6360677083333335, "learning_rate": 1e-06, "loss": 0.0264, "reward": 2.4366893072923026, "reward_std": 4.2191972732543945, "rewards/citation_reward_func": 3.3219953378041587, "rewards/correctness_reward_func": 1.2244897385438283, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -1.836734692255656, "rewards/penalize_wrong_passages_reward_func": -1.272108832995097, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49904755254586536, "step": 334 }, { "completion_length": 173.92516581217447, "epoch": 0.7055107055107055, "grad_norm": 0.9657922983169556, "kl": 0.8541666666666666, "learning_rate": 1e-06, "loss": 0.0085, "reward": 2.525939663251241, "reward_std": 2.653134822845459, "rewards/citation_reward_func": 3.7018140157063804, "rewards/correctness_reward_func": 1.037414940694968, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -2.836734632651011, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 335 }, { "completion_length": 211.1088409423828, "epoch": 0.7076167076167076, "grad_norm": 0.9506802558898926, "kl": 0.7936197916666666, "learning_rate": 1e-06, "loss": 0.0079, "reward": 3.7129976749420166, "reward_std": 3.488153060277303, "rewards/citation_reward_func": 3.191610018412272, "rewards/correctness_reward_func": 1.258503367503484, "rewards/formatting_reward_func": 0.4897959182659785, "rewards/length_reward_func": -0.6632653127113978, "rewards/penalize_wrong_passages_reward_func": -1.0544217626253765, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4907754609982173, "step": 336 }, { "completion_length": 233.43536631266275, "epoch": 0.7097227097227097, "grad_norm": 1.0683774948120117, "kl": 0.7545572916666666, "learning_rate": 1e-06, "loss": 0.0075, "reward": 4.680926203727722, "reward_std": 3.037352502346039, "rewards/citation_reward_func": 3.0215419133504233, "rewards/correctness_reward_func": 2.1938775132099786, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.5102040767669678, "rewards/penalize_wrong_passages_reward_func": -1.013605425755183, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49441831310590106, "step": 337 }, { "completion_length": 199.06122080485025, "epoch": 0.7118287118287119, "grad_norm": 1.2184964418411255, "kl": 0.80078125, "learning_rate": 1e-06, "loss": 0.008, "reward": 0.370614451666673, "reward_std": 3.0889535943667092, "rewards/citation_reward_func": 3.202947735786438, "rewards/correctness_reward_func": 2.8231292019287744, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -6.333333174387614, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.500999927520752, "step": 338 }, { "completion_length": 206.4489771525065, "epoch": 0.713934713934714, "grad_norm": 0.8987395167350769, "kl": 0.8020833333333334, "learning_rate": 1e-06, "loss": 0.008, "reward": 2.939191480477651, "reward_std": 3.4129937092463174, "rewards/citation_reward_func": 3.497732241948446, "rewards/correctness_reward_func": 3.1632652282714844, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -4.299319674571355, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4924795428911845, "step": 339 }, { "completion_length": 183.36734517415366, "epoch": 0.7160407160407161, "grad_norm": 2.6049094200134277, "kl": 1.048828125, "learning_rate": 1e-06, "loss": 0.0105, "reward": 6.931339343388875, "reward_std": 2.379570245742798, "rewards/citation_reward_func": 3.713151772816976, "rewards/correctness_reward_func": 2.9421768337488174, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.3571428606907527, "rewards/penalize_wrong_passages_reward_func": -0.36054421216249466, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4953978955745697, "step": 340 }, { "completion_length": 205.87074279785156, "epoch": 0.7181467181467182, "grad_norm": 0.9560754895210266, "kl": 0.865234375, "learning_rate": 1e-06, "loss": 0.0087, "reward": 4.302693744500478, "reward_std": 3.5764169692993164, "rewards/citation_reward_func": 3.928571343421936, "rewards/correctness_reward_func": 0.9523809428016344, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.459183673063914, "rewards/penalize_wrong_passages_reward_func": -0.7755101751536131, "rewards/unicode_reward_func": -0.3401360511779785, "rewards/xmlcount_reward_func": 0.4982720414797465, "step": 341 }, { "completion_length": 174.1190439860026, "epoch": 0.7202527202527202, "grad_norm": 1.4415292739868164, "kl": 0.87890625, "learning_rate": 1e-06, "loss": 0.009, "reward": 1.8660724461078644, "reward_std": 2.3170458575089774, "rewards/citation_reward_func": 4.229024728139241, "rewards/correctness_reward_func": 1.139455755551656, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -4.2925169467926025, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 342 }, { "completion_length": 235.49319203694662, "epoch": 0.7223587223587223, "grad_norm": 1.1462864875793457, "kl": 0.6555989583333334, "learning_rate": 1e-06, "loss": 0.0066, "reward": 1.1734318683544795, "reward_std": 3.075716018676758, "rewards/citation_reward_func": 2.431972603003184, "rewards/correctness_reward_func": 1.1394557654857635, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -3.0340135296185813, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4965611646572749, "step": 343 }, { "completion_length": 223.60543823242188, "epoch": 0.7244647244647244, "grad_norm": 1.000259518623352, "kl": 1.1751302083333333, "learning_rate": 1e-06, "loss": 0.0119, "reward": 4.428008993466695, "reward_std": 2.8102714816729226, "rewards/citation_reward_func": 3.4070293505986533, "rewards/correctness_reward_func": 0.9693877349297205, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.734693855047226, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49206797281901044, "step": 344 }, { "completion_length": 223.97958628336588, "epoch": 0.7265707265707265, "grad_norm": 0.7306500673294067, "kl": 1.046875, "learning_rate": 1e-06, "loss": 0.0105, "reward": 3.538080414136251, "reward_std": 2.771793325742086, "rewards/citation_reward_func": 3.253968079884847, "rewards/correctness_reward_func": 1.037414958079656, "rewards/formatting_reward_func": 0.49149659276008606, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -1.4285713980595272, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48989449938138324, "step": 345 }, { "completion_length": 184.20747884114584, "epoch": 0.7286767286767287, "grad_norm": 0.9912636876106262, "kl": 0.8352864583333334, "learning_rate": 1e-06, "loss": 0.0084, "reward": 3.882302482922872, "reward_std": 2.212656001249949, "rewards/citation_reward_func": 3.027210831642151, "rewards/correctness_reward_func": 0.8673469126224518, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.8571428457895914, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4979489247004191, "step": 346 }, { "completion_length": 189.8333282470703, "epoch": 0.7307827307827308, "grad_norm": 0.8739500641822815, "kl": 0.9264322916666666, "learning_rate": 1e-06, "loss": 0.0092, "reward": 4.077586094538371, "reward_std": 2.7235729893048606, "rewards/citation_reward_func": 3.3446712493896484, "rewards/correctness_reward_func": 1.0884353518486023, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.40816325942675274, "rewards/penalize_wrong_passages_reward_func": -0.9387754847606024, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49311897655328113, "step": 347 }, { "completion_length": 161.45237731933594, "epoch": 0.7328887328887329, "grad_norm": 1.0412046909332275, "kl": 1.0709635416666667, "learning_rate": 1e-06, "loss": 0.0107, "reward": 3.8484999338785806, "reward_std": 1.791780486702919, "rewards/citation_reward_func": 3.4523807366689048, "rewards/correctness_reward_func": 1.2244897882143657, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.05102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.6054421067237854, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49815979103247326, "step": 348 }, { "completion_length": 206.01020050048828, "epoch": 0.734994734994735, "grad_norm": 1.1982636451721191, "kl": 0.875, "learning_rate": 1e-06, "loss": 0.0087, "reward": 4.812347888946533, "reward_std": 3.0765510201454163, "rewards/citation_reward_func": 3.327664375305176, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/formatting_reward_func": 0.49149659276008606, "rewards/length_reward_func": -0.3571428507566452, "rewards/penalize_wrong_passages_reward_func": -0.8027210583289465, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4863843123118083, "step": 349 }, { "completion_length": 197.74149068196616, "epoch": 0.7371007371007371, "grad_norm": 5.780428886413574, "kl": 1.513671875, "learning_rate": 1e-06, "loss": 0.0153, "reward": 2.651944398880005, "reward_std": 4.0778325001398725, "rewards/citation_reward_func": 3.0498865842819214, "rewards/correctness_reward_func": 0.7482993106047312, "rewards/formatting_reward_func": 0.4931972821553548, "rewards/length_reward_func": -0.6122448941071829, "rewards/penalize_wrong_passages_reward_func": -1.3401360313097637, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4830101529757182, "step": 350 }, { "completion_length": 200.83673350016275, "epoch": 0.7392067392067392, "grad_norm": 19.824453353881836, "kl": 8.28125, "learning_rate": 1e-06, "loss": 0.0824, "reward": 1.8109624261657398, "reward_std": 3.1978684663772583, "rewards/citation_reward_func": 3.6904759804407754, "rewards/correctness_reward_func": 1.224489763379097, "rewards/formatting_reward_func": 0.49574829638004303, "rewards/length_reward_func": -0.6122448990742365, "rewards/penalize_wrong_passages_reward_func": -3.4829931457837424, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4954863339662552, "step": 351 }, { "completion_length": 184.20747884114584, "epoch": 0.7413127413127413, "grad_norm": 0.9565224647521973, "kl": 0.8385416666666666, "learning_rate": 1e-06, "loss": 0.0084, "reward": -0.03650915250182152, "reward_std": 2.751798431078593, "rewards/citation_reward_func": 2.851473848025004, "rewards/correctness_reward_func": 2.363945504029592, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.30612244705359143, "rewards/penalize_wrong_passages_reward_func": -5.931972761948903, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4912686546643575, "step": 352 }, { "completion_length": 198.51360066731772, "epoch": 0.7434187434187434, "grad_norm": 1.1145179271697998, "kl": 0.775390625, "learning_rate": 1e-06, "loss": 0.0082, "reward": 3.776633802180489, "reward_std": 2.4671822786331177, "rewards/citation_reward_func": 3.4297050635019937, "rewards/correctness_reward_func": 2.0918366561333337, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -2.333333301047484, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4982890437046687, "step": 353 }, { "completion_length": 240.19046783447266, "epoch": 0.7455247455247456, "grad_norm": 0.906201958656311, "kl": 0.72265625, "learning_rate": 1e-06, "loss": 0.0072, "reward": 2.3721699317296348, "reward_std": 3.7090295950571694, "rewards/citation_reward_func": 2.976190368334452, "rewards/correctness_reward_func": 0.9183673212925593, "rewards/formatting_reward_func": 0.4863945593436559, "rewards/length_reward_func": -0.8163265238205591, "rewards/penalize_wrong_passages_reward_func": -1.673469364643097, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.481013556321462, "step": 354 }, { "completion_length": 189.56122080485025, "epoch": 0.7476307476307477, "grad_norm": 32.47963333129883, "kl": 1.783203125, "learning_rate": 1e-06, "loss": 0.0178, "reward": 2.647131323814392, "reward_std": 3.3386093378067017, "rewards/citation_reward_func": 3.5260767141977944, "rewards/correctness_reward_func": 1.9217686653137207, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.40816326439380646, "rewards/penalize_wrong_passages_reward_func": -3.204081575075785, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.48499993483225506, "step": 355 }, { "completion_length": 194.7448959350586, "epoch": 0.7497367497367498, "grad_norm": 1.9522308111190796, "kl": 0.873046875, "learning_rate": 1e-06, "loss": 0.0086, "reward": 0.12059734265009563, "reward_std": 2.213576853275299, "rewards/citation_reward_func": 3.2879815896352134, "rewards/correctness_reward_func": 1.734693835179011, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -5.530612190564473, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49247953792413074, "step": 356 }, { "completion_length": 212.523806254069, "epoch": 0.7518427518427518, "grad_norm": 0.8430114984512329, "kl": 0.9361979166666666, "learning_rate": 1e-06, "loss": 0.0098, "reward": 4.969536264737447, "reward_std": 3.3358057339986167, "rewards/citation_reward_func": 3.3049886226654053, "rewards/correctness_reward_func": 2.3469387094179788, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.40816326439380646, "rewards/penalize_wrong_passages_reward_func": -1.0884353419144948, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4910781780878703, "step": 357 }, { "completion_length": 193.30611928304037, "epoch": 0.7539487539487539, "grad_norm": 0.9406774640083313, "kl": 0.8352864583333334, "learning_rate": 1e-06, "loss": 0.0084, "reward": 3.408174604177475, "reward_std": 2.1897811690966287, "rewards/citation_reward_func": 3.3106573820114136, "rewards/correctness_reward_func": 1.1054421464602153, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -1.5986394279946883, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49887748062610626, "step": 358 }, { "completion_length": 221.0544204711914, "epoch": 0.756054756054756, "grad_norm": 0.7500853538513184, "kl": 0.6780598958333334, "learning_rate": 1e-06, "loss": 0.0072, "reward": 6.953930695851644, "reward_std": 3.0307931105295816, "rewards/citation_reward_func": 3.5941041310628257, "rewards/correctness_reward_func": 3.8435373107592263, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.5102040867010752, "rewards/penalize_wrong_passages_reward_func": -0.9591836531956991, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.492479532957077, "step": 359 }, { "completion_length": 185.12244669596353, "epoch": 0.7581607581607581, "grad_norm": 0.8867349624633789, "kl": 0.6803385416666666, "learning_rate": 1e-06, "loss": 0.0069, "reward": 5.780478318532308, "reward_std": 2.767763157685598, "rewards/citation_reward_func": 3.4920632441838584, "rewards/correctness_reward_func": 2.9761904080708823, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408162971337637, "rewards/penalize_wrong_passages_reward_func": -1.4829931457837422, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929924805959064, "step": 360 }, { "completion_length": 187.57142639160156, "epoch": 0.7602667602667603, "grad_norm": 0.9376119375228882, "kl": 0.8313802083333334, "learning_rate": 1e-06, "loss": 0.0084, "reward": 3.219564517339071, "reward_std": 2.914902945359548, "rewards/citation_reward_func": 3.5204078753789267, "rewards/correctness_reward_func": 1.3775509893894196, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -2.244897892077764, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4950747738281886, "step": 361 }, { "completion_length": 211.78230794270834, "epoch": 0.7623727623727624, "grad_norm": 0.9319391250610352, "kl": 0.71484375, "learning_rate": 1e-06, "loss": 0.0071, "reward": 4.074728965759277, "reward_std": 3.0338656107584634, "rewards/citation_reward_func": 3.038548747698466, "rewards/correctness_reward_func": 1.54761899014314, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.5102040767669678, "rewards/penalize_wrong_passages_reward_func": -0.9863945345083872, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49026183784008026, "step": 362 }, { "completion_length": 213.2789103190104, "epoch": 0.7644787644787645, "grad_norm": 0.9279811978340149, "kl": 0.755859375, "learning_rate": 1e-06, "loss": 0.0076, "reward": 4.575824022293091, "reward_std": 3.2141083478927612, "rewards/citation_reward_func": 2.7154194513956704, "rewards/correctness_reward_func": 2.755101978778839, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.45918366809686023, "rewards/penalize_wrong_passages_reward_func": -1.4285714030265808, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49645912647247314, "step": 363 }, { "completion_length": 193.82312774658203, "epoch": 0.7665847665847666, "grad_norm": 7.441447734832764, "kl": 1.0807291666666667, "learning_rate": 1e-06, "loss": 0.0109, "reward": 5.427708625793457, "reward_std": 2.763730764389038, "rewards/citation_reward_func": 3.191609819730123, "rewards/correctness_reward_func": 2.6700679659843445, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.30612244705359143, "rewards/penalize_wrong_passages_reward_func": -1.1224489510059357, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49630266427993774, "step": 364 }, { "completion_length": 184.27210744222006, "epoch": 0.7686907686907687, "grad_norm": 1.2132267951965332, "kl": 0.900390625, "learning_rate": 1e-06, "loss": 0.009, "reward": 6.065507729848226, "reward_std": 2.7998267809549966, "rewards/citation_reward_func": 3.2426302433013916, "rewards/correctness_reward_func": 2.9931972324848175, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.4591836780309677, "rewards/penalize_wrong_passages_reward_func": -0.7074829886356989, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4980475604534149, "step": 365 }, { "completion_length": 215.0782267252604, "epoch": 0.7707967707967708, "grad_norm": 0.9835836887359619, "kl": 0.7194010416666666, "learning_rate": 1e-06, "loss": 0.0072, "reward": 3.548775384823481, "reward_std": 3.152905980745951, "rewards/citation_reward_func": 3.4013604720433555, "rewards/correctness_reward_func": 1.1734693795442581, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.40816325942675274, "rewards/penalize_wrong_passages_reward_func": -1.612244874238968, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4960543711980184, "step": 366 }, { "completion_length": 188.61224110921225, "epoch": 0.7729027729027729, "grad_norm": 0.9326059818267822, "kl": 0.8587239583333334, "learning_rate": 1e-06, "loss": 0.0086, "reward": 4.506099636356036, "reward_std": 3.020311951637268, "rewards/citation_reward_func": 3.1065758069356284, "rewards/correctness_reward_func": 2.3469387690226235, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -1.6734693696101506, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4981631984313329, "step": 367 }, { "completion_length": 208.7823053995768, "epoch": 0.775008775008775, "grad_norm": 0.9097429513931274, "kl": 0.783203125, "learning_rate": 1e-06, "loss": 0.0078, "reward": 4.044582664966583, "reward_std": 3.683608889579773, "rewards/citation_reward_func": 3.0215417941411338, "rewards/correctness_reward_func": 1.8537414570649464, "rewards/formatting_reward_func": 0.49149659276008606, "rewards/length_reward_func": -0.40816326439380646, "rewards/penalize_wrong_passages_reward_func": -1.4013605117797852, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4873264779647191, "step": 368 }, { "completion_length": 165.74829483032227, "epoch": 0.7771147771147772, "grad_norm": 1.2218385934829712, "kl": 0.998046875, "learning_rate": 1e-06, "loss": 0.0102, "reward": 4.376276512940724, "reward_std": 2.677362342675527, "rewards/citation_reward_func": 3.684807022412618, "rewards/correctness_reward_func": 2.1768707036972046, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -2.2244898260881505, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.497591773668925, "step": 369 }, { "completion_length": 223.14625295003256, "epoch": 0.7792207792207793, "grad_norm": 0.9640951156616211, "kl": 0.7174479166666666, "learning_rate": 1e-06, "loss": 0.0072, "reward": 2.730006674925486, "reward_std": 3.398833155632019, "rewards/citation_reward_func": 3.5714284578959146, "rewards/correctness_reward_func": 2.040816237529119, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.5102040867010752, "rewards/penalize_wrong_passages_reward_func": -3.3605441451072693, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49191151559352875, "step": 370 }, { "completion_length": 179.43537139892578, "epoch": 0.7813267813267813, "grad_norm": 2.156036138534546, "kl": 1.0071614583333333, "learning_rate": 1e-06, "loss": 0.0101, "reward": 5.707685788472493, "reward_std": 1.7288099726041157, "rewards/citation_reward_func": 3.1689340273539224, "rewards/correctness_reward_func": 2.0918366784850755, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -0.5510204037030538, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4979353149731954, "step": 371 }, { "completion_length": 194.06802368164062, "epoch": 0.7834327834327834, "grad_norm": 1.0008419752120972, "kl": 1.0358072916666667, "learning_rate": 1e-06, "loss": 0.0104, "reward": 3.9335667292277017, "reward_std": 2.542555034160614, "rewards/citation_reward_func": 2.9138320287068686, "rewards/correctness_reward_func": 1.4455781827370326, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -1.2176870505015056, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49762578308582306, "step": 372 }, { "completion_length": 169.0952351888021, "epoch": 0.7855387855387855, "grad_norm": 0.9800570607185364, "kl": 1.0807291666666667, "learning_rate": 1e-06, "loss": 0.0109, "reward": 4.42509392897288, "reward_std": 2.331450124581655, "rewards/citation_reward_func": 3.2199544509251914, "rewards/correctness_reward_func": 1.7176870008309681, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -1.2517006695270538, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49425504604975384, "step": 373 }, { "completion_length": 196.67686716715494, "epoch": 0.7876447876447876, "grad_norm": 0.8940144181251526, "kl": 0.865234375, "learning_rate": 1e-06, "loss": 0.0087, "reward": 5.650235652923584, "reward_std": 2.8446309566497803, "rewards/citation_reward_func": 3.1916098992029824, "rewards/correctness_reward_func": 2.8911564151446023, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -1.319727857907613, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4909387181202571, "step": 374 }, { "completion_length": 211.76190185546875, "epoch": 0.7897507897507897, "grad_norm": 0.8513844013214111, "kl": 0.8001302083333334, "learning_rate": 1e-06, "loss": 0.008, "reward": 3.264024794101715, "reward_std": 3.1702443758646646, "rewards/citation_reward_func": 2.494330883026123, "rewards/correctness_reward_func": 1.6836734414100647, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.3571428606907527, "rewards/penalize_wrong_passages_reward_func": -1.5442176659901936, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49418361981709796, "step": 375 }, { "completion_length": 204.04761505126953, "epoch": 0.7918567918567918, "grad_norm": 1.1359713077545166, "kl": 0.865234375, "learning_rate": 1e-06, "loss": 0.0086, "reward": 3.6691199938456216, "reward_std": 3.034039258956909, "rewards/citation_reward_func": 3.1575962702433267, "rewards/correctness_reward_func": 1.2414965629577637, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.5102040817340215, "rewards/penalize_wrong_passages_reward_func": -1.2040815949440002, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49111558496952057, "step": 376 }, { "completion_length": 187.55101776123047, "epoch": 0.793962793962794, "grad_norm": 0.8377850651741028, "kl": 1.251953125, "learning_rate": 1e-06, "loss": 0.0129, "reward": 3.168164153893789, "reward_std": 2.8421239455540976, "rewards/citation_reward_func": 3.3446709712346396, "rewards/correctness_reward_func": 1.751700629790624, "rewards/formatting_reward_func": 0.4948979616165161, "rewards/length_reward_func": -0.510204071799914, "rewards/penalize_wrong_passages_reward_func": -2.4013604819774628, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48845912516117096, "step": 377 }, { "completion_length": 180.3537394205729, "epoch": 0.7960687960687961, "grad_norm": 1.4555526971817017, "kl": 1.087890625, "learning_rate": 1e-06, "loss": 0.0111, "reward": 5.376014610131581, "reward_std": 3.2300790747006736, "rewards/citation_reward_func": 3.6678003072738647, "rewards/correctness_reward_func": 2.2789114912350974, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.40816326439380646, "rewards/penalize_wrong_passages_reward_func": -0.9863945270578066, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4956291963656743, "step": 378 }, { "completion_length": 228.18026733398438, "epoch": 0.7981747981747982, "grad_norm": 14.021940231323242, "kl": 2.5520833333333335, "learning_rate": 1e-06, "loss": 0.0255, "reward": 3.4462526788314185, "reward_std": 3.789495587348938, "rewards/citation_reward_func": 3.2426303227742515, "rewards/correctness_reward_func": 2.1768706937630973, "rewards/formatting_reward_func": 0.4829931954542796, "rewards/length_reward_func": -0.9693877498308817, "rewards/penalize_wrong_passages_reward_func": -1.9659863710403442, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.479132612546285, "step": 379 }, { "completion_length": 184.55441538492838, "epoch": 0.8002808002808003, "grad_norm": 2.2284538745880127, "kl": 1.2708333333333333, "learning_rate": 1e-06, "loss": 0.0127, "reward": 4.8683435916900635, "reward_std": 2.3631349007288613, "rewards/citation_reward_func": 3.5884350538253784, "rewards/correctness_reward_func": 1.5986394186814625, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -1.0612244804700215, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49929585059483844, "step": 380 }, { "completion_length": 191.6088409423828, "epoch": 0.8023868023868024, "grad_norm": 1.0186786651611328, "kl": 1.0078125, "learning_rate": 1e-06, "loss": 0.0101, "reward": 4.85123352209727, "reward_std": 1.894408146540324, "rewards/citation_reward_func": 3.424036184946696, "rewards/correctness_reward_func": 1.1054421464602153, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.4693877498308818, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49692511558532715, "step": 381 }, { "completion_length": 179.3469327290853, "epoch": 0.8044928044928045, "grad_norm": 1.5976791381835938, "kl": 1.2115885416666667, "learning_rate": 1e-06, "loss": 0.0126, "reward": 5.690453330675761, "reward_std": 2.155216157436371, "rewards/citation_reward_func": 3.888888637224833, "rewards/correctness_reward_func": 1.7687074343363445, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -0.5510203925271829, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49374144275983173, "step": 382 }, { "completion_length": 166.1700642903646, "epoch": 0.8065988065988066, "grad_norm": 1.177154541015625, "kl": 1.2799479166666667, "learning_rate": 1e-06, "loss": 0.0128, "reward": 3.771610975265503, "reward_std": 2.488835245370865, "rewards/citation_reward_func": 3.543083747227987, "rewards/correctness_reward_func": 0.6802720998724302, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -1.0612244606018066, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.48362920184930164, "step": 383 }, { "completion_length": 202.77210744222006, "epoch": 0.8087048087048087, "grad_norm": 1.2002400159835815, "kl": 0.9446614583333334, "learning_rate": 1e-06, "loss": 0.0094, "reward": 3.114571273326874, "reward_std": 2.7324097553888955, "rewards/citation_reward_func": 3.282312830289205, "rewards/correctness_reward_func": 1.3265305906534195, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.3571428606907527, "rewards/penalize_wrong_passages_reward_func": -2.1224488814671836, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49042171239852905, "step": 384 }, { "completion_length": 186.28911336263022, "epoch": 0.8108108108108109, "grad_norm": 1.269479513168335, "kl": 1.3678385416666667, "learning_rate": 1e-06, "loss": 0.0139, "reward": 5.498821934064229, "reward_std": 2.9702013532320657, "rewards/citation_reward_func": 3.628117561340332, "rewards/correctness_reward_func": 3.010203997294108, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -1.7619047115246456, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49825504422187805, "step": 385 }, { "completion_length": 204.0544179280599, "epoch": 0.812916812916813, "grad_norm": 1.0071189403533936, "kl": 0.77734375, "learning_rate": 1e-06, "loss": 0.008, "reward": 5.695616881052653, "reward_std": 3.5208513736724854, "rewards/citation_reward_func": 3.395691474278768, "rewards/correctness_reward_func": 2.551020324230194, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -0.8775510092576345, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48870062331358594, "step": 386 }, { "completion_length": 208.5952351888021, "epoch": 0.815022815022815, "grad_norm": 0.9028397798538208, "kl": 0.9401041666666666, "learning_rate": 1e-06, "loss": 0.0099, "reward": 5.026899019877116, "reward_std": 2.7452229261398315, "rewards/citation_reward_func": 3.2426302433013916, "rewards/correctness_reward_func": 2.0578230718771615, "rewards/formatting_reward_func": 0.4931972821553548, "rewards/length_reward_func": -0.3571428606907527, "rewards/penalize_wrong_passages_reward_func": -0.8979591429233551, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4883502970139186, "step": 387 }, { "completion_length": 241.89115142822266, "epoch": 0.8171288171288171, "grad_norm": 2.2114672660827637, "kl": 1.0813802083333333, "learning_rate": 1e-06, "loss": 0.0108, "reward": 3.6548535426457724, "reward_std": 4.8498828411102295, "rewards/citation_reward_func": 3.180272022883097, "rewards/correctness_reward_func": 1.6156462331612904, "rewards/formatting_reward_func": 0.4880952388048172, "rewards/length_reward_func": -0.8673469424247742, "rewards/penalize_wrong_passages_reward_func": -1.231292486190796, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4694795409838359, "step": 388 }, { "completion_length": 178.46258036295572, "epoch": 0.8192348192348192, "grad_norm": 0.9208208322525024, "kl": 0.9479166666666666, "learning_rate": 1e-06, "loss": 0.0099, "reward": 5.69873583316803, "reward_std": 2.291304608186086, "rewards/citation_reward_func": 3.4807255268096924, "rewards/correctness_reward_func": 2.108843465646108, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -0.7823129296302795, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49522102375825244, "step": 389 }, { "completion_length": 221.82652537027994, "epoch": 0.8213408213408213, "grad_norm": 1.2106159925460815, "kl": 0.7799479166666666, "learning_rate": 1e-06, "loss": 0.0078, "reward": 2.308008924126625, "reward_std": 3.1861923933029175, "rewards/citation_reward_func": 3.4410430192947388, "rewards/correctness_reward_func": 1.8027210632960002, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.45918366809686023, "rewards/penalize_wrong_passages_reward_func": -3.4557822545369468, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48261219759782154, "step": 390 }, { "completion_length": 183.39115142822266, "epoch": 0.8234468234468234, "grad_norm": 0.8995904326438904, "kl": 0.9108072916666666, "learning_rate": 1e-06, "loss": 0.0091, "reward": 4.622468153635661, "reward_std": 3.1639206409454346, "rewards/citation_reward_func": 3.1009069283803306, "rewards/correctness_reward_func": 2.0408162077267966, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -1.1564625600973766, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49435028930505115, "step": 391 }, { "completion_length": 168.17686716715494, "epoch": 0.8255528255528255, "grad_norm": 1.2364623546600342, "kl": 1.05078125, "learning_rate": 1e-06, "loss": 0.0107, "reward": 5.528898000717163, "reward_std": 2.4589553276697793, "rewards/citation_reward_func": 3.282312790552775, "rewards/correctness_reward_func": 2.6020407676696777, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -1.251700629790624, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4982856512069702, "step": 392 }, { "completion_length": 186.74829610188803, "epoch": 0.8276588276588277, "grad_norm": 1.070202350616455, "kl": 1.0299479166666667, "learning_rate": 1e-06, "loss": 0.0103, "reward": 3.7258060773213706, "reward_std": 3.1855319341023765, "rewards/citation_reward_func": 3.0782312949498496, "rewards/correctness_reward_func": 1.3265305558840434, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.5102040817340215, "rewards/penalize_wrong_passages_reward_func": -1.1632652878761292, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49621422092119855, "step": 393 }, { "completion_length": 185.4455769856771, "epoch": 0.8297648297648298, "grad_norm": 4.5259528160095215, "kl": 1.0944010416666667, "learning_rate": 1e-06, "loss": 0.0111, "reward": 3.85529363155365, "reward_std": 3.284583489100138, "rewards/citation_reward_func": 3.701813896497091, "rewards/correctness_reward_func": 1.1394557654857635, "rewards/formatting_reward_func": 0.4948979616165161, "rewards/length_reward_func": -0.8163265238205591, "rewards/penalize_wrong_passages_reward_func": -1.156462570031484, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49191490809122723, "step": 394 }, { "completion_length": 207.25169372558594, "epoch": 0.8318708318708319, "grad_norm": 1.1044845581054688, "kl": 0.9192708333333334, "learning_rate": 1e-06, "loss": 0.0093, "reward": 4.692568937937419, "reward_std": 2.8676576217015586, "rewards/citation_reward_func": 3.8038547039031982, "rewards/correctness_reward_func": 1.7687074492375057, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.4591836780309677, "rewards/penalize_wrong_passages_reward_func": -1.3945577840010326, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48055095970630646, "step": 395 }, { "completion_length": 195.43536885579428, "epoch": 0.833976833976834, "grad_norm": 12.616693496704102, "kl": 1.59765625, "learning_rate": 1e-06, "loss": 0.016, "reward": 4.594591736793518, "reward_std": 3.1584444443384805, "rewards/citation_reward_func": 3.3333332935969033, "rewards/correctness_reward_func": 1.8877550462881725, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.3571428606907527, "rewards/penalize_wrong_passages_reward_func": -1.2585033774375916, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48914961020151776, "step": 396 }, { "completion_length": 213.34353383382162, "epoch": 0.8360828360828361, "grad_norm": 0.9019458293914795, "kl": 0.8990885416666666, "learning_rate": 1e-06, "loss": 0.009, "reward": 3.3717334220806756, "reward_std": 2.71410596370697, "rewards/citation_reward_func": 2.7947846253712973, "rewards/correctness_reward_func": 1.836734652519226, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.8163265337546667, "rewards/penalize_wrong_passages_reward_func": -1.435374101003011, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4936155875523885, "step": 397 }, { "completion_length": 216.26870473225912, "epoch": 0.8381888381888382, "grad_norm": 1.1500489711761475, "kl": 0.837890625, "learning_rate": 1e-06, "loss": 0.0084, "reward": 4.473621169726054, "reward_std": 3.942271669705709, "rewards/citation_reward_func": 3.3049885034561157, "rewards/correctness_reward_func": 2.2278910676638284, "rewards/formatting_reward_func": 0.4931972821553548, "rewards/length_reward_func": -0.6632653027772903, "rewards/penalize_wrong_passages_reward_func": -1.2108843227227528, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49176184336344403, "step": 398 }, { "completion_length": 202.49659474690756, "epoch": 0.8402948402948403, "grad_norm": 0.8359233140945435, "kl": 1.3411458333333333, "learning_rate": 1e-06, "loss": 0.0134, "reward": 6.837395668029785, "reward_std": 2.400991588830948, "rewards/citation_reward_func": 3.667800267537435, "rewards/correctness_reward_func": 2.925170044104258, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.5918367306391398, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4944251130024592, "step": 399 }, { "completion_length": 211.3197224934896, "epoch": 0.8424008424008425, "grad_norm": 0.8955492377281189, "kl": 0.9088541666666666, "learning_rate": 1e-06, "loss": 0.0091, "reward": 2.618852473795414, "reward_std": 3.1600440740585327, "rewards/citation_reward_func": 2.743764122327169, "rewards/correctness_reward_func": 0.8843537171681722, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.40816326439380646, "rewards/penalize_wrong_passages_reward_func": -1.5918367107709248, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49243531624476117, "step": 400 }, { "completion_length": 226.51700592041016, "epoch": 0.8445068445068445, "grad_norm": 1.0206358432769775, "kl": 0.9166666666666666, "learning_rate": 1e-06, "loss": 0.0093, "reward": 3.305037297308445, "reward_std": 4.15559458732605, "rewards/citation_reward_func": 3.3503399888674417, "rewards/correctness_reward_func": 1.4795918265978496, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.5612244953711828, "rewards/penalize_wrong_passages_reward_func": -1.7755101919174194, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4836087723573049, "step": 401 }, { "completion_length": 203.78570810953775, "epoch": 0.8466128466128466, "grad_norm": 0.9299895763397217, "kl": 0.9049479166666666, "learning_rate": 1e-06, "loss": 0.0096, "reward": 5.001368463039398, "reward_std": 2.560189406077067, "rewards/citation_reward_func": 3.7358275651931763, "rewards/correctness_reward_func": 2.0238094528516135, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -1.4489795466264088, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49683327476183575, "step": 402 }, { "completion_length": 245.96598307291666, "epoch": 0.8487188487188487, "grad_norm": 0.9358532428741455, "kl": 1.001953125, "learning_rate": 1e-06, "loss": 0.01, "reward": 2.2839873830477395, "reward_std": 3.627606511116028, "rewards/citation_reward_func": 3.565759460131327, "rewards/correctness_reward_func": 1.2925169964631398, "rewards/formatting_reward_func": 0.48894557853539783, "rewards/length_reward_func": -0.5612244854370753, "rewards/penalize_wrong_passages_reward_func": -2.9795917669932046, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.47758158047993976, "step": 403 }, { "completion_length": 272.39795176188153, "epoch": 0.8508248508248508, "grad_norm": 2.81453537940979, "kl": 0.892578125, "learning_rate": 1e-06, "loss": 0.0089, "reward": 1.596479393541813, "reward_std": 5.596612215042114, "rewards/citation_reward_func": 2.482993245124817, "rewards/correctness_reward_func": 1.7857142488161724, "rewards/formatting_reward_func": 0.47448978821436566, "rewards/length_reward_func": -1.4285714079936345, "rewards/penalize_wrong_passages_reward_func": -2.1700679659843445, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4519217262665431, "step": 404 }, { "completion_length": 266.0340118408203, "epoch": 0.8529308529308529, "grad_norm": 0.8979971408843994, "kl": 0.7799479166666666, "learning_rate": 1e-06, "loss": 0.0078, "reward": 2.1668489103515944, "reward_std": 3.853674292564392, "rewards/citation_reward_func": 2.9308391014734902, "rewards/correctness_reward_func": 1.5646257996559143, "rewards/formatting_reward_func": 0.48979591329892475, "rewards/length_reward_func": -0.6122448941071829, "rewards/penalize_wrong_passages_reward_func": -2.5238095124562583, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4877108285824458, "step": 405 }, { "completion_length": 244.4931894938151, "epoch": 0.855036855036855, "grad_norm": 0.9777901768684387, "kl": 0.7272135416666666, "learning_rate": 1e-06, "loss": 0.0073, "reward": 1.3467469811439514, "reward_std": 3.340526362260183, "rewards/citation_reward_func": 2.7947843869527182, "rewards/correctness_reward_func": 1.054421752691269, "rewards/formatting_reward_func": 0.4880952338377635, "rewards/length_reward_func": -0.5102040767669678, "rewards/penalize_wrong_passages_reward_func": -2.9659863909085593, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4856359859307607, "step": 406 }, { "completion_length": 220.93196868896484, "epoch": 0.8571428571428571, "grad_norm": 2.2201004028320312, "kl": 1.3938802083333333, "learning_rate": 1e-06, "loss": 0.0143, "reward": 2.413368208023409, "reward_std": 3.0608163078626, "rewards/citation_reward_func": 3.3276642163594565, "rewards/correctness_reward_func": 1.819727857907613, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.459183673063914, "rewards/penalize_wrong_passages_reward_func": -3.2585032880306244, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48876525461673737, "step": 407 }, { "completion_length": 210.1394526163737, "epoch": 0.8592488592488593, "grad_norm": 1.1213274002075195, "kl": 0.9225260416666666, "learning_rate": 1e-06, "loss": 0.0092, "reward": 3.7354998191197715, "reward_std": 3.206026335557302, "rewards/citation_reward_func": 3.0612245003382363, "rewards/correctness_reward_func": 1.564625807106495, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.6632653127113978, "rewards/penalize_wrong_passages_reward_func": -1.2176870654026668, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4940033406019211, "step": 408 }, { "completion_length": 221.2993138631185, "epoch": 0.8613548613548614, "grad_norm": 1.0960359573364258, "kl": 0.884765625, "learning_rate": 1e-06, "loss": 0.0089, "reward": 4.068404674530029, "reward_std": 3.2786128918329873, "rewards/citation_reward_func": 3.112244804700216, "rewards/correctness_reward_func": 1.39455779393514, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.6122448990742365, "rewards/penalize_wrong_passages_reward_func": -0.8095237910747528, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4901734193166097, "step": 409 }, { "completion_length": 222.9897893269857, "epoch": 0.8634608634608635, "grad_norm": 1.0138585567474365, "kl": 0.9602864583333334, "learning_rate": 1e-06, "loss": 0.0096, "reward": 3.9506948391596475, "reward_std": 3.6894630193710327, "rewards/citation_reward_func": 2.647392193476359, "rewards/correctness_reward_func": 2.789115528265635, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.459183673063914, "rewards/penalize_wrong_passages_reward_func": -2.0068026582400003, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48697614173094433, "step": 410 }, { "completion_length": 196.2789052327474, "epoch": 0.8655668655668656, "grad_norm": 0.9708966016769409, "kl": 0.8671875, "learning_rate": 1e-06, "loss": 0.0092, "reward": 4.430862545967102, "reward_std": 2.7824105819066367, "rewards/citation_reward_func": 3.815192619959513, "rewards/correctness_reward_func": 2.9421767791112265, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -3.006802717844645, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4881189912557602, "step": 411 }, { "completion_length": 187.4183603922526, "epoch": 0.8676728676728677, "grad_norm": 0.6679948568344116, "kl": 0.9576822916666666, "learning_rate": 1e-06, "loss": 0.0097, "reward": 5.555980682373047, "reward_std": 2.303885757923126, "rewards/citation_reward_func": 3.157596230506897, "rewards/correctness_reward_func": 3.6054420471191406, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.2551020383834839, "rewards/penalize_wrong_passages_reward_func": -1.9455782175064087, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4936223824818929, "step": 412 }, { "completion_length": 187.95237731933594, "epoch": 0.8697788697788698, "grad_norm": 1.1267014741897583, "kl": 0.9283854166666666, "learning_rate": 1e-06, "loss": 0.0093, "reward": 4.6450633605321245, "reward_std": 2.902350743611654, "rewards/citation_reward_func": 3.0498865048090615, "rewards/correctness_reward_func": 2.2448979020118713, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -1.2857142488161724, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49653735756874084, "step": 413 }, { "completion_length": 224.27210489908853, "epoch": 0.8718848718848718, "grad_norm": 0.8470773696899414, "kl": 0.8430989583333334, "learning_rate": 1e-06, "loss": 0.0084, "reward": 5.8639976978302, "reward_std": 2.6660279631614685, "rewards/citation_reward_func": 3.2936505873998008, "rewards/correctness_reward_func": 2.4999999602635703, "rewards/formatting_reward_func": 0.48979591329892475, "rewards/length_reward_func": -0.3571428606907527, "rewards/penalize_wrong_passages_reward_func": -0.551020403082172, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4887142429749171, "step": 414 }, { "completion_length": 228.59863789876303, "epoch": 0.8739908739908739, "grad_norm": 0.8768046498298645, "kl": 1.158203125, "learning_rate": 1e-06, "loss": 0.0119, "reward": 4.329038341840108, "reward_std": 3.2816514571507773, "rewards/citation_reward_func": 3.4126983086268106, "rewards/correctness_reward_func": 2.346938669681549, "rewards/formatting_reward_func": 0.48639454940954846, "rewards/length_reward_func": -0.5102040867010752, "rewards/penalize_wrong_passages_reward_func": -1.8843537171681721, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.47756456832091015, "step": 415 }, { "completion_length": 229.88774871826172, "epoch": 0.8760968760968761, "grad_norm": 3.260483503341675, "kl": 1.0201822916666667, "learning_rate": 1e-06, "loss": 0.0102, "reward": 2.8331868648529053, "reward_std": 3.004487911860148, "rewards/citation_reward_func": 2.9761902888615928, "rewards/correctness_reward_func": 1.1224489609400432, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.3571428606907527, "rewards/penalize_wrong_passages_reward_func": -1.8979591329892476, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4930509676535924, "step": 416 }, { "completion_length": 201.8197250366211, "epoch": 0.8782028782028782, "grad_norm": 1.051156997680664, "kl": 1024.71875, "learning_rate": 1e-06, "loss": 10.2348, "reward": 4.993551929791768, "reward_std": 2.8192378679911294, "rewards/citation_reward_func": 3.8038545846939087, "rewards/correctness_reward_func": 1.3265305906534195, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.459183673063914, "rewards/penalize_wrong_passages_reward_func": -0.6598639413714409, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48901695013046265, "step": 417 }, { "completion_length": 246.04421742757162, "epoch": 0.8803088803088803, "grad_norm": 1.0293517112731934, "kl": 0.8938802083333334, "learning_rate": 1e-06, "loss": 0.009, "reward": 4.082921763261159, "reward_std": 4.129499514897664, "rewards/citation_reward_func": 3.3333330949147544, "rewards/correctness_reward_func": 1.8027210632960002, "rewards/formatting_reward_func": 0.47789115210374195, "rewards/length_reward_func": -1.0204081684350967, "rewards/penalize_wrong_passages_reward_func": -0.9795917967955271, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.468976154923439, "step": 418 }, { "completion_length": 227.7686996459961, "epoch": 0.8824148824148824, "grad_norm": 0.9488903284072876, "kl": 0.853515625, "learning_rate": 1e-06, "loss": 0.0087, "reward": 3.356776495774587, "reward_std": 4.17160701751709, "rewards/citation_reward_func": 3.1916098594665527, "rewards/correctness_reward_func": 1.2074829737345378, "rewards/formatting_reward_func": 0.48639454940954846, "rewards/length_reward_func": -0.5102040817340215, "rewards/penalize_wrong_passages_reward_func": -1.156462550163269, "rewards/unicode_reward_func": -0.3401360511779785, "rewards/xmlcount_reward_func": 0.4780917863051097, "step": 419 }, { "completion_length": 232.66666412353516, "epoch": 0.8845208845208845, "grad_norm": 1.2417980432510376, "kl": 0.9186197916666666, "learning_rate": 1e-06, "loss": 0.0092, "reward": 2.7340973714987435, "reward_std": 3.529271046320597, "rewards/citation_reward_func": 3.0328797499338784, "rewards/correctness_reward_func": 0.8843537221352259, "rewards/formatting_reward_func": 0.48469386994838715, "rewards/length_reward_func": -0.8163265188535055, "rewards/penalize_wrong_passages_reward_func": -1.3265305856863658, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4750271489222844, "step": 420 }, { "completion_length": 212.38095092773438, "epoch": 0.8866268866268866, "grad_norm": 1.6659380197525024, "kl": 1.17578125, "learning_rate": 1e-06, "loss": 0.0117, "reward": 4.945539553960164, "reward_std": 2.7063895066579184, "rewards/citation_reward_func": 3.0498866637547812, "rewards/correctness_reward_func": 2.040816237529119, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -0.8231292466322581, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4874897499879201, "step": 421 }, { "completion_length": 231.6088383992513, "epoch": 0.8887328887328887, "grad_norm": 9.702512741088867, "kl": 1.6959635416666667, "learning_rate": 1e-06, "loss": 0.017, "reward": 6.399842421213786, "reward_std": 3.2888264854749045, "rewards/citation_reward_func": 3.475056529045105, "rewards/correctness_reward_func": 2.9421767791112265, "rewards/formatting_reward_func": 0.49064625799655914, "rewards/length_reward_func": -0.45918366809686023, "rewards/penalize_wrong_passages_reward_func": -0.537414958079656, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48856116831302643, "step": 422 }, { "completion_length": 240.5578180948893, "epoch": 0.8908388908388908, "grad_norm": 1.0227175951004028, "kl": 1.32421875, "learning_rate": 1e-06, "loss": 0.0132, "reward": 4.322190443674724, "reward_std": 3.828075965245565, "rewards/citation_reward_func": 3.4183671474456787, "rewards/correctness_reward_func": 2.1088434855143228, "rewards/formatting_reward_func": 0.4897959182659785, "rewards/length_reward_func": -0.561224490404129, "rewards/penalize_wrong_passages_reward_func": -1.612244854370753, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.478653018673261, "step": 423 }, { "completion_length": 243.05782063802084, "epoch": 0.892944892944893, "grad_norm": 1.1528595685958862, "kl": 0.951171875, "learning_rate": 1e-06, "loss": 0.0095, "reward": 3.417047699292501, "reward_std": 4.412300546964009, "rewards/citation_reward_func": 3.282312790552775, "rewards/correctness_reward_func": 1.5306121756633122, "rewards/formatting_reward_func": 0.48469386994838715, "rewards/length_reward_func": -0.7653061201175054, "rewards/penalize_wrong_passages_reward_func": -1.4217686653137207, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.47657138605912525, "step": 424 }, { "completion_length": 238.9285685221354, "epoch": 0.8950508950508951, "grad_norm": 1.091051697731018, "kl": 1.4557291666666667, "learning_rate": 1e-06, "loss": 0.0146, "reward": 3.0012970169385276, "reward_std": 4.6244891087214155, "rewards/citation_reward_func": 3.0555554231007895, "rewards/correctness_reward_func": 1.3095237811406453, "rewards/formatting_reward_func": 0.48639454940954846, "rewards/length_reward_func": -1.4285714129606883, "rewards/penalize_wrong_passages_reward_func": -0.9047618905703226, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48315641780694324, "step": 425 }, { "completion_length": 216.97618865966797, "epoch": 0.8971568971568972, "grad_norm": 1.0596503019332886, "kl": 1.3919270833333333, "learning_rate": 1e-06, "loss": 0.0139, "reward": 6.610499779383342, "reward_std": 3.202061096827189, "rewards/citation_reward_func": 3.36734676361084, "rewards/correctness_reward_func": 3.3163264195124307, "rewards/formatting_reward_func": 0.4897959182659785, "rewards/length_reward_func": -0.5612244804700216, "rewards/penalize_wrong_passages_reward_func": -0.4829931855201721, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4812482496102651, "step": 426 }, { "completion_length": 188.94897969563803, "epoch": 0.8992628992628993, "grad_norm": 2.3366739749908447, "kl": 1.3515625, "learning_rate": 1e-06, "loss": 0.0135, "reward": 6.2090316613515215, "reward_std": 1.9871355493863423, "rewards/citation_reward_func": 3.3276642163594565, "rewards/correctness_reward_func": 3.1802720626195273, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -1.1360543767611186, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4936121851205826, "step": 427 }, { "completion_length": 221.59183502197266, "epoch": 0.9013689013689014, "grad_norm": 1.1556252241134644, "kl": 1.22265625, "learning_rate": 1e-06, "loss": 0.0122, "reward": 2.2772664166986942, "reward_std": 3.3601556619008384, "rewards/citation_reward_func": 2.664398948351542, "rewards/correctness_reward_func": 1.0544217303395271, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.6122448941071829, "rewards/penalize_wrong_passages_reward_func": -1.8231291969617207, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49552034835020703, "step": 428 }, { "completion_length": 203.52040354410806, "epoch": 0.9034749034749034, "grad_norm": 3.2301814556121826, "kl": 1.4303385416666667, "learning_rate": 1e-06, "loss": 0.0143, "reward": 4.25222647190094, "reward_std": 3.0159141023953757, "rewards/citation_reward_func": 3.066893458366394, "rewards/correctness_reward_func": 1.9897958834966023, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.3571428606907527, "rewards/penalize_wrong_passages_reward_func": -1.4353741109371185, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49145572384198505, "step": 429 }, { "completion_length": 210.39795430501303, "epoch": 0.9055809055809055, "grad_norm": 5.057128429412842, "kl": 1.2421875, "learning_rate": 1e-06, "loss": 0.0124, "reward": 4.724735697110494, "reward_std": 3.067882537841797, "rewards/citation_reward_func": 3.1916099786758423, "rewards/correctness_reward_func": 2.0408162847161293, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -1.1292516787846882, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49741150438785553, "step": 430 }, { "completion_length": 233.9387690226237, "epoch": 0.9076869076869077, "grad_norm": 8.886016845703125, "kl": 1.5846354166666667, "learning_rate": 1e-06, "loss": 0.0158, "reward": 3.3445235888163247, "reward_std": 3.751445452372233, "rewards/citation_reward_func": 2.806122342745463, "rewards/correctness_reward_func": 1.649659812450409, "rewards/formatting_reward_func": 0.47789114713668823, "rewards/length_reward_func": -0.7653061101833979, "rewards/penalize_wrong_passages_reward_func": -1.2925169865290325, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4686734229326248, "step": 431 }, { "completion_length": 226.38434855143228, "epoch": 0.9097929097929098, "grad_norm": 1.0712940692901611, "kl": 0.8515625, "learning_rate": 1e-06, "loss": 0.0085, "reward": 4.341326395670573, "reward_std": 4.08898941675822, "rewards/citation_reward_func": 2.857142686843872, "rewards/correctness_reward_func": 2.6020407478014627, "rewards/formatting_reward_func": 0.4931972821553548, "rewards/length_reward_func": -0.459183673063914, "rewards/penalize_wrong_passages_reward_func": -1.4693877498308818, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.48758498827616376, "step": 432 }, { "completion_length": 250.47958374023438, "epoch": 0.9118989118989119, "grad_norm": 0.754627525806427, "kl": 0.6861979166666666, "learning_rate": 1e-06, "loss": 0.0071, "reward": 3.53940478960673, "reward_std": 3.8470928072929382, "rewards/citation_reward_func": 2.3937074740727744, "rewards/correctness_reward_func": 3.027210851510366, "rewards/formatting_reward_func": 0.48979591329892475, "rewards/length_reward_func": -1.020408148566882, "rewards/penalize_wrong_passages_reward_func": -1.836734652519226, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.485833282272021, "step": 433 }, { "completion_length": 277.05781809488934, "epoch": 0.914004914004914, "grad_norm": 1.4854369163513184, "kl": 0.84375, "learning_rate": 1e-06, "loss": 0.0084, "reward": 2.3832449913024902, "reward_std": 4.8437340259552, "rewards/citation_reward_func": 2.682823061943054, "rewards/correctness_reward_func": 1.6666666269302368, "rewards/formatting_reward_func": 0.4693877498308818, "rewards/length_reward_func": -1.377550999323527, "rewards/penalize_wrong_passages_reward_func": -1.3469387392203014, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.45892512798309326, "step": 434 }, { "completion_length": 245.16326141357422, "epoch": 0.9161109161109161, "grad_norm": 0.9480783939361572, "kl": 0.7565104166666666, "learning_rate": 1e-06, "loss": 0.0078, "reward": 4.360251814126968, "reward_std": 3.766782840092977, "rewards/citation_reward_func": 2.478741387526194, "rewards/correctness_reward_func": 2.7380951642990112, "rewards/formatting_reward_func": 0.4948979616165161, "rewards/length_reward_func": -0.5102040817340215, "rewards/penalize_wrong_passages_reward_func": -1.1632652878761292, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49205436805884045, "step": 435 }, { "completion_length": 255.5033925374349, "epoch": 0.9182169182169182, "grad_norm": 0.7878819108009338, "kl": 0.708984375, "learning_rate": 1e-06, "loss": 0.0073, "reward": 2.683357129494349, "reward_std": 3.135557929674784, "rewards/citation_reward_func": 2.827380895614624, "rewards/correctness_reward_func": 1.2925169716278713, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.6632653027772903, "rewards/penalize_wrong_passages_reward_func": -1.7551020284493764, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4886292020479838, "step": 436 }, { "completion_length": 257.1292495727539, "epoch": 0.9203229203229203, "grad_norm": 0.8499004244804382, "kl": 0.7428385416666666, "learning_rate": 1e-06, "loss": 0.0074, "reward": 3.225904862085978, "reward_std": 3.394826134045919, "rewards/citation_reward_func": 2.848639408747355, "rewards/correctness_reward_func": 1.9217686752478282, "rewards/formatting_reward_func": 0.49149659276008606, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -2.163265277942022, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48440809547901154, "step": 437 }, { "completion_length": 266.0101954142253, "epoch": 0.9224289224289224, "grad_norm": 0.8942264318466187, "kl": 0.71875, "learning_rate": 1e-06, "loss": 0.0072, "reward": 2.4781905015309653, "reward_std": 4.1750030517578125, "rewards/citation_reward_func": 2.699829896291097, "rewards/correctness_reward_func": 1.3095237811406453, "rewards/formatting_reward_func": 0.48469387491544086, "rewards/length_reward_func": -1.020408163468043, "rewards/penalize_wrong_passages_reward_func": -1.476190447807312, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4807414561510086, "step": 438 }, { "completion_length": 254.13264973958334, "epoch": 0.9245349245349246, "grad_norm": 2.025777578353882, "kl": 1.359375, "learning_rate": 1e-06, "loss": 0.0137, "reward": 3.723285893599192, "reward_std": 2.8511158426602683, "rewards/citation_reward_func": 2.5977890888849893, "rewards/correctness_reward_func": 1.9387754499912262, "rewards/formatting_reward_func": 0.4931972821553548, "rewards/length_reward_func": -0.3571428557236989, "rewards/penalize_wrong_passages_reward_func": -1.428571383158366, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.47923805316289264, "step": 439 }, { "completion_length": 241.6836700439453, "epoch": 0.9266409266409267, "grad_norm": 0.8116592764854431, "kl": 1.0397135416666667, "learning_rate": 1e-06, "loss": 0.0104, "reward": 1.0626495877901714, "reward_std": 3.930151581764221, "rewards/citation_reward_func": 2.551020304361979, "rewards/correctness_reward_func": 0.6122448866566023, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.7142857213815054, "rewards/penalize_wrong_passages_reward_func": -2.0340136090914407, "rewards/unicode_reward_func": -0.3401360511779785, "rewards/xmlcount_reward_func": 0.49292171994845074, "step": 440 }, { "completion_length": 247.81292215983072, "epoch": 0.9287469287469288, "grad_norm": 8.80766773223877, "kl": 1.15625, "learning_rate": 1e-06, "loss": 0.0115, "reward": 2.8308265656232834, "reward_std": 3.2842116355895996, "rewards/citation_reward_func": 2.346938729286194, "rewards/correctness_reward_func": 1.5986394186814625, "rewards/formatting_reward_func": 0.4880952338377635, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -1.6734693745772045, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4787856588761012, "step": 441 }, { "completion_length": 211.26870727539062, "epoch": 0.9308529308529309, "grad_norm": 3.453256130218506, "kl": 0.8912760416666666, "learning_rate": 1e-06, "loss": 0.0096, "reward": 6.2583504517873125, "reward_std": 2.4285461703936257, "rewards/citation_reward_func": 2.844387690226237, "rewards/correctness_reward_func": 2.9761904080708823, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.3469387690226237, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4938944975535075, "step": 442 }, { "completion_length": 240.78571065266928, "epoch": 0.932958932958933, "grad_norm": 0.8041434288024902, "kl": 3.5592447916666665, "learning_rate": 1e-06, "loss": 0.0357, "reward": 3.460792601108551, "reward_std": 3.017461578051249, "rewards/citation_reward_func": 2.993197202682495, "rewards/correctness_reward_func": 1.2414965530236561, "rewards/formatting_reward_func": 0.4914965977271398, "rewards/length_reward_func": -0.40816326439380646, "rewards/penalize_wrong_passages_reward_func": -1.3401360511779785, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48290130992730457, "step": 443 }, { "completion_length": 242.91156005859375, "epoch": 0.935064935064935, "grad_norm": 0.8912382125854492, "kl": 0.7233072916666666, "learning_rate": 1e-06, "loss": 0.0072, "reward": 3.097394548356533, "reward_std": 3.375369350115458, "rewards/citation_reward_func": 2.7465986013412476, "rewards/correctness_reward_func": 1.1904761642217636, "rewards/formatting_reward_func": 0.4948979616165161, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -1.4149659872055054, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48855096598466236, "step": 444 }, { "completion_length": 240.36053975423178, "epoch": 0.9371709371709371, "grad_norm": 0.84913170337677, "kl": 0.8639322916666666, "learning_rate": 1e-06, "loss": 0.0086, "reward": 3.8615239461263022, "reward_std": 3.605143388112386, "rewards/citation_reward_func": 2.6870747804641724, "rewards/correctness_reward_func": 1.9557822247346242, "rewards/formatting_reward_func": 0.4948979616165161, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -1.2925169865290325, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49247613549232483, "step": 445 }, { "completion_length": 231.4829889933268, "epoch": 0.9392769392769392, "grad_norm": 0.9911380410194397, "kl": 0.912109375, "learning_rate": 1e-06, "loss": 0.0091, "reward": 3.4259490370750427, "reward_std": 3.146302322546641, "rewards/citation_reward_func": 3.2270408074061074, "rewards/correctness_reward_func": 1.2585033799211185, "rewards/formatting_reward_func": 0.49659863611062366, "rewards/length_reward_func": -0.561224490404129, "rewards/penalize_wrong_passages_reward_func": -1.482993150750796, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4880237430334091, "step": 446 }, { "completion_length": 274.37414296468097, "epoch": 0.9413829413829414, "grad_norm": 0.7898405194282532, "kl": 0.7565104166666666, "learning_rate": 1e-06, "loss": 0.0076, "reward": 4.6013062198956804, "reward_std": 3.6253005266189575, "rewards/citation_reward_func": 2.351190427939097, "rewards/correctness_reward_func": 3.4353740215301514, "rewards/formatting_reward_func": 0.4812925159931183, "rewards/length_reward_func": -0.6122448841730753, "rewards/penalize_wrong_passages_reward_func": -1.5306121905644734, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4763060708840688, "step": 447 }, { "completion_length": 252.36053466796875, "epoch": 0.9434889434889435, "grad_norm": 1.2493234872817993, "kl": 0.8743489583333334, "learning_rate": 1e-06, "loss": 0.0087, "reward": 4.8478163580099745, "reward_std": 2.818999171257019, "rewards/citation_reward_func": 2.8741496006647744, "rewards/correctness_reward_func": 2.534013569355011, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -1.238095184167226, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4889727383852005, "step": 448 }, { "completion_length": 263.86734263102215, "epoch": 0.9455949455949456, "grad_norm": 5.652228355407715, "kl": 1.0559895833333333, "learning_rate": 1e-06, "loss": 0.0106, "reward": 2.1674863497416177, "reward_std": 3.2532392740249634, "rewards/citation_reward_func": 2.2916666070620217, "rewards/correctness_reward_func": 1.2244897832473118, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -1.911564588546753, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4778604904810588, "step": 449 }, { "completion_length": 227.81292215983072, "epoch": 0.9477009477009477, "grad_norm": 0.9847736954689026, "kl": 0.9290364583333334, "learning_rate": 1e-06, "loss": 0.0093, "reward": 0.2178944672147433, "reward_std": 3.902046004931132, "rewards/citation_reward_func": 2.933673401673635, "rewards/correctness_reward_func": 1.9557822222510974, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.9693877448638281, "rewards/penalize_wrong_passages_reward_func": -4.5238093336423235, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49340470135211945, "step": 450 }, { "completion_length": 223.03740692138672, "epoch": 0.9498069498069498, "grad_norm": 0.9363918900489807, "kl": 0.8463541666666666, "learning_rate": 1e-06, "loss": 0.0085, "reward": 3.2303468783696494, "reward_std": 2.5281269550323486, "rewards/citation_reward_func": 3.2525509198506675, "rewards/correctness_reward_func": 1.5646257797876995, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": 0.0, "rewards/penalize_wrong_passages_reward_func": -2.58503395318985, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4982040176788966, "step": 451 }, { "completion_length": 239.9897918701172, "epoch": 0.9519129519129519, "grad_norm": 1.1018811464309692, "kl": 0.7884114583333334, "learning_rate": 1e-06, "loss": 0.0079, "reward": 3.2081496125708022, "reward_std": 3.1324497063954673, "rewards/citation_reward_func": 3.120748241742452, "rewards/correctness_reward_func": 3.639455715815226, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -4.340135971705119, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4955645700295766, "step": 452 }, { "completion_length": 234.323122660319, "epoch": 0.954018954018954, "grad_norm": 0.9514901041984558, "kl": 1.3157552083333333, "learning_rate": 1e-06, "loss": 0.0132, "reward": 1.9489863812923431, "reward_std": 3.943875233332316, "rewards/citation_reward_func": 2.78911558787028, "rewards/correctness_reward_func": 0.8333332935969034, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.4591836780309677, "rewards/penalize_wrong_passages_reward_func": -1.8571427861849468, "rewards/unicode_reward_func": -0.3401360511779785, "rewards/xmlcount_reward_func": 0.4881019840637843, "step": 453 }, { "completion_length": 255.76189931233725, "epoch": 0.9561249561249561, "grad_norm": 3.1003830432891846, "kl": 0.9947916666666666, "learning_rate": 1e-06, "loss": 0.01, "reward": 1.5095713399350643, "reward_std": 3.59877481063207, "rewards/citation_reward_func": 2.402210851510366, "rewards/correctness_reward_func": 0.6462584833304087, "rewards/formatting_reward_func": 0.4948979566494624, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -2.047619044780731, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.49001354972521466, "step": 454 }, { "completion_length": 263.9285634358724, "epoch": 0.9582309582309583, "grad_norm": 12.15134048461914, "kl": 1.44921875, "learning_rate": 1e-06, "loss": 0.0145, "reward": 4.058207631111145, "reward_std": 3.4817044734954834, "rewards/citation_reward_func": 2.8741496006647744, "rewards/correctness_reward_func": 2.4149659077326455, "rewards/formatting_reward_func": 0.4914965977271398, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -1.8027210235595703, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48847954471906024, "step": 455 }, { "completion_length": 272.2584991455078, "epoch": 0.9603369603369604, "grad_norm": 0.920379638671875, "kl": 1.822265625, "learning_rate": 1e-06, "loss": 0.0182, "reward": 1.6935850928227107, "reward_std": 4.720509966214498, "rewards/citation_reward_func": 2.7083332935969033, "rewards/correctness_reward_func": 0.7653061002492905, "rewards/formatting_reward_func": 0.4880952288707097, "rewards/length_reward_func": -0.9693877498308817, "rewards/penalize_wrong_passages_reward_func": -1.59183669090271, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4631428172190984, "step": 456 }, { "completion_length": 216.44557444254556, "epoch": 0.9624429624429625, "grad_norm": 1.5930014848709106, "kl": 0.9583333333333334, "learning_rate": 1e-06, "loss": 0.0096, "reward": 0.889006977279981, "reward_std": 3.1517065366109214, "rewards/citation_reward_func": 3.0357141892115274, "rewards/correctness_reward_func": 2.99319722255071, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.10204081734021504, "rewards/penalize_wrong_passages_reward_func": -6.034013519684474, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49785027901331586, "step": 457 }, { "completion_length": 252.0952377319336, "epoch": 0.9645489645489645, "grad_norm": 0.9933049082756042, "kl": 1.4166666666666667, "learning_rate": 1e-06, "loss": 0.0142, "reward": 1.27609184384346, "reward_std": 3.663653016090393, "rewards/citation_reward_func": 2.589285651842753, "rewards/correctness_reward_func": 1.2755101919174194, "rewards/formatting_reward_func": 0.48979591329892475, "rewards/length_reward_func": -0.561224490404129, "rewards/penalize_wrong_passages_reward_func": -2.829931855201721, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.4827244331439336, "step": 458 }, { "completion_length": 203.9489720662435, "epoch": 0.9666549666549666, "grad_norm": 0.8730067014694214, "kl": 1.1178385416666667, "learning_rate": 1e-06, "loss": 0.0117, "reward": 4.5937449137369795, "reward_std": 3.855661233266195, "rewards/citation_reward_func": 3.592686971028646, "rewards/correctness_reward_func": 1.9387754499912262, "rewards/formatting_reward_func": 0.4855442096789678, "rewards/length_reward_func": -0.7142857114473978, "rewards/penalize_wrong_passages_reward_func": -1.183673471212387, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4746972322463989, "step": 459 }, { "completion_length": 170.48638916015625, "epoch": 0.9687609687609687, "grad_norm": 1.0719090700149536, "kl": 1.2141927083333333, "learning_rate": 1e-06, "loss": 0.0136, "reward": 5.673636317253113, "reward_std": 3.2085538109143577, "rewards/citation_reward_func": 3.945578098297119, "rewards/correctness_reward_func": 2.1938774983088174, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -0.9047618905703226, "rewards/unicode_reward_func": -0.3401360511779785, "rewards/xmlcount_reward_func": 0.4865611692269643, "step": 460 }, { "completion_length": 171.149658203125, "epoch": 0.9708669708669708, "grad_norm": 1.5770235061645508, "kl": 1.212890625, "learning_rate": 1e-06, "loss": 0.0141, "reward": 6.019292672475179, "reward_std": 2.4112029671669006, "rewards/citation_reward_func": 3.962584972381592, "rewards/correctness_reward_func": 2.38095231850942, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -1.115646243095398, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4971836010615031, "step": 461 }, { "completion_length": 178.16666158040366, "epoch": 0.972972972972973, "grad_norm": 1.2439326047897339, "kl": 1.1927083333333333, "learning_rate": 1e-06, "loss": 0.0125, "reward": 5.120292564233144, "reward_std": 2.738259176413218, "rewards/citation_reward_func": 3.8095237016677856, "rewards/correctness_reward_func": 1.6666666169961293, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.15306122104326883, "rewards/penalize_wrong_passages_reward_func": -1.1972788721323013, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49444211026032764, "step": 462 }, { "completion_length": 170.9285659790039, "epoch": 0.9750789750789751, "grad_norm": 2.0321953296661377, "kl": 1.3502604166666667, "learning_rate": 1e-06, "loss": 0.015, "reward": 5.051782409350078, "reward_std": 3.1064772605895996, "rewards/citation_reward_func": 3.7755101521809897, "rewards/correctness_reward_func": 2.0068026383717856, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.40816326936086017, "rewards/penalize_wrong_passages_reward_func": -1.3129251301288605, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.49055776993433636, "step": 463 }, { "completion_length": 171.72108459472656, "epoch": 0.9771849771849772, "grad_norm": 1.0354406833648682, "kl": 1.0266927083333333, "learning_rate": 1e-06, "loss": 0.0115, "reward": 5.325897852579753, "reward_std": 2.9984676440556846, "rewards/citation_reward_func": 3.835033933321635, "rewards/correctness_reward_func": 2.091836671034495, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.20408163468043009, "rewards/penalize_wrong_passages_reward_func": -1.3877550661563873, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4908638844887416, "step": 464 }, { "completion_length": 171.44557444254556, "epoch": 0.9792909792909793, "grad_norm": 0.9120634198188782, "kl": 2.31640625, "learning_rate": 1e-06, "loss": 0.0242, "reward": 5.117595195770264, "reward_std": 3.7810667753219604, "rewards/citation_reward_func": 3.801020383834839, "rewards/correctness_reward_func": 2.0918366511662803, "rewards/formatting_reward_func": 0.49914966026941937, "rewards/length_reward_func": -0.30612244705359143, "rewards/penalize_wrong_passages_reward_func": -1.1156462132930756, "rewards/unicode_reward_func": -0.3401360511779785, "rewards/xmlcount_reward_func": 0.4874931474526723, "step": 465 }, { "completion_length": 166.34693654378256, "epoch": 0.9813969813969814, "grad_norm": 0.8665127754211426, "kl": 1.0703125, "learning_rate": 1e-06, "loss": 0.0117, "reward": 4.781190474828084, "reward_std": 2.7656540870666504, "rewards/citation_reward_func": 3.8265304962793985, "rewards/correctness_reward_func": 1.5986394385496776, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -1.374149630467097, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48867341379324597, "step": 466 }, { "completion_length": 171.7789077758789, "epoch": 0.9835029835029835, "grad_norm": 1.0515021085739136, "kl": 1.138671875, "learning_rate": 1e-06, "loss": 0.0132, "reward": 5.545894384384155, "reward_std": 2.702861169974009, "rewards/citation_reward_func": 4.022108793258667, "rewards/correctness_reward_func": 1.887755036354065, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -1.0408163219690323, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48296932876110077, "step": 467 }, { "completion_length": 161.1292470296224, "epoch": 0.9856089856089856, "grad_norm": 0.6496269702911377, "kl": 1.1419270833333333, "learning_rate": 1e-06, "loss": 0.0138, "reward": 6.711469570795695, "reward_std": 1.9912763635317485, "rewards/citation_reward_func": 4.217686891555786, "rewards/correctness_reward_func": 2.568027138710022, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.20408162971337637, "rewards/penalize_wrong_passages_reward_func": -0.8639455686012903, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4954829414685567, "step": 468 }, { "completion_length": 168.3945515950521, "epoch": 0.9877149877149877, "grad_norm": 1.0953691005706787, "kl": 1.33203125, "learning_rate": 1e-06, "loss": 0.0152, "reward": 6.473023891448975, "reward_std": 2.3332280913988748, "rewards/citation_reward_func": 4.056122342745463, "rewards/correctness_reward_func": 2.6020407676696777, "rewards/formatting_reward_func": 0.4965986410776774, "rewards/length_reward_func": -0.2551020433505376, "rewards/penalize_wrong_passages_reward_func": -0.9183673361937205, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4917312413454056, "step": 469 }, { "completion_length": 166.93536885579428, "epoch": 0.9898209898209899, "grad_norm": 1.272312045097351, "kl": 1.2864583333333333, "learning_rate": 1e-06, "loss": 0.0137, "reward": 6.26665194829305, "reward_std": 2.231449862321218, "rewards/citation_reward_func": 3.7386619249979653, "rewards/correctness_reward_func": 2.4149659276008606, "rewards/formatting_reward_func": 0.49574829638004303, "rewards/length_reward_func": -0.15306122601032257, "rewards/penalize_wrong_passages_reward_func": -0.7142857064803442, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.48462240397930145, "step": 470 }, { "completion_length": 196.43536885579428, "epoch": 0.991926991926992, "grad_norm": 1.019243597984314, "kl": 6.277994791666667, "learning_rate": 1e-06, "loss": 0.0628, "reward": 4.13928226629893, "reward_std": 2.5580894947052, "rewards/citation_reward_func": 3.180271943410238, "rewards/correctness_reward_func": 1.2414965679248173, "rewards/formatting_reward_func": 0.4982993205388387, "rewards/length_reward_func": -0.30612244705359143, "rewards/penalize_wrong_passages_reward_func": -0.9659863710403442, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4913230687379837, "step": 471 }, { "completion_length": 200.1496556599935, "epoch": 0.994032994032994, "grad_norm": 1.7666877508163452, "kl": 1.2421875, "learning_rate": 1e-06, "loss": 0.0124, "reward": 2.1022719343503318, "reward_std": 3.5998586813608804, "rewards/citation_reward_func": 3.2142856121063232, "rewards/correctness_reward_func": 1.5646257797876995, "rewards/formatting_reward_func": 0.5, "rewards/length_reward_func": -0.9183673212925593, "rewards/penalize_wrong_passages_reward_func": -2.5714284976323447, "rewards/unicode_reward_func": -0.17006802558898926, "rewards/xmlcount_reward_func": 0.48322444160779315, "step": 472 }, { "completion_length": 204.49659729003906, "epoch": 0.9961389961389961, "grad_norm": 0.7166609764099121, "kl": 1.1692708333333333, "learning_rate": 1e-06, "loss": 0.0113, "reward": -0.41065768152475357, "reward_std": 3.7700961033503213, "rewards/citation_reward_func": 2.98185924688975, "rewards/correctness_reward_func": 1.2755101794997852, "rewards/formatting_reward_func": 0.4931972771883011, "rewards/length_reward_func": -0.5612244953711828, "rewards/penalize_wrong_passages_reward_func": -4.727891008059184, "rewards/unicode_reward_func": -0.3401360511779785, "rewards/xmlcount_reward_func": 0.46802715957164764, "step": 473 }, { "completion_length": 189.5646209716797, "epoch": 0.9982449982449982, "grad_norm": 1.0915828943252563, "kl": 1.30078125, "learning_rate": 1e-06, "loss": 0.0135, "reward": 3.6559808254241943, "reward_std": 2.721195101737976, "rewards/citation_reward_func": 2.753684719403585, "rewards/correctness_reward_func": 2.142857074737549, "rewards/formatting_reward_func": 0.4948979616165161, "rewards/length_reward_func": -0.30612245202064514, "rewards/penalize_wrong_passages_reward_func": -1.9047618955373764, "rewards/unicode_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.4754251291354497, "step": 474 } ], "logging_steps": 1, "max_steps": 474, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 7, "trial_name": null, "trial_params": null }