diff --git "a/checkpoint-474/trainer_state.json" "b/checkpoint-474/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-474/trainer_state.json" @@ -0,0 +1,8565 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9982449982449982, + "eval_steps": 500, + "global_step": 474, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 142.80611928304037, + "epoch": 0.002106002106002106, + "grad_norm": 1.0551681518554688, + "kl": 0.0, + "learning_rate": 2e-08, + "loss": 0.0002, + "reward": 2.131795952717463, + "reward_std": 6.255450010299683, + "rewards/citation_reward_func": 3.8265305360158286, + "rewards/correctness_reward_func": 1.3775509695212047, + "rewards/formatting_reward_func": 0.4863945543766022, + "rewards/length_reward_func": -3.1122448245684304, + "rewards/penalize_wrong_passages_reward_func": -0.9319727768500646, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4855373551448186, + "step": 1 + }, + { + "completion_length": 142.05781936645508, + "epoch": 0.004212004212004212, + "grad_norm": 0.9199629426002502, + "kl": 0.0, + "learning_rate": 4e-08, + "loss": 0.0004, + "reward": 2.249255125721296, + "reward_std": 5.873880942662557, + "rewards/citation_reward_func": 3.8095237016677856, + "rewards/correctness_reward_func": 1.8027210533618927, + "rewards/formatting_reward_func": 0.47789114713668823, + "rewards/length_reward_func": -3.3673468430836997, + "rewards/penalize_wrong_passages_reward_func": -0.952380950252215, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.47884689768155414, + "step": 2 + }, + { + "completion_length": 147.4081598917643, + "epoch": 0.006318006318006318, + "grad_norm": 0.9852209687232971, + "kl": 0.001129150390625, + "learning_rate": 6e-08, + "loss": 0.0, + "reward": 3.1659286667903266, + "reward_std": 5.810667594273885, + "rewards/citation_reward_func": 3.784013589223226, + "rewards/correctness_reward_func": 2.0068026781082153, + "rewards/formatting_reward_func": 0.4812925159931183, + "rewards/length_reward_func": -2.602040797472, + "rewards/penalize_wrong_passages_reward_func": -0.9863945345083872, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4822550565004349, + "step": 3 + }, + { + "completion_length": 149.8843510945638, + "epoch": 0.008424008424008424, + "grad_norm": 1.000421404838562, + "kl": 0.0008897781372070312, + "learning_rate": 8e-08, + "loss": 0.0, + "reward": 1.7928027113278706, + "reward_std": 6.236873547236125, + "rewards/citation_reward_func": 3.647959073384603, + "rewards/correctness_reward_func": 1.7517006198565166, + "rewards/formatting_reward_func": 0.48299319048722583, + "rewards/length_reward_func": -3.163265268007914, + "rewards/penalize_wrong_passages_reward_func": -1.2380952139695485, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.48157819112141925, + "step": 4 + }, + { + "completion_length": 142.11224365234375, + "epoch": 0.01053001053001053, + "grad_norm": 0.9926426410675049, + "kl": 0.0012919108072916667, + "learning_rate": 1e-07, + "loss": 0.0005, + "reward": 2.159136086702347, + "reward_std": 5.8599865436553955, + "rewards/citation_reward_func": 3.562925100326538, + "rewards/correctness_reward_func": 2.142857084671656, + "rewards/formatting_reward_func": 0.4863945593436559, + "rewards/length_reward_func": -3.6224488814671836, + "rewards/penalize_wrong_passages_reward_func": -0.8979591826597849, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48736729224522907, + "step": 5 + }, + { + "completion_length": 141.88434982299805, + "epoch": 0.012636012636012635, + "grad_norm": 0.8212317228317261, + "kl": 0.001293182373046875, + "learning_rate": 1.2e-07, + "loss": 0.0007, + "reward": 3.040091894567013, + "reward_std": 5.166776895523071, + "rewards/citation_reward_func": 3.4778910080591836, + "rewards/correctness_reward_func": 1.9557822744051616, + "rewards/formatting_reward_func": 0.4880952388048172, + "rewards/length_reward_func": -2.704081575075785, + "rewards/penalize_wrong_passages_reward_func": -0.6666666567325592, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4890713890393575, + "step": 6 + }, + { + "completion_length": 144.5646209716797, + "epoch": 0.014742014742014743, + "grad_norm": 1.0746428966522217, + "kl": 0.0010700225830078125, + "learning_rate": 1.4e-07, + "loss": 0.0004, + "reward": 2.7565646121899285, + "reward_std": 5.139555593331655, + "rewards/citation_reward_func": 3.7499999602635703, + "rewards/correctness_reward_func": 2.0068026781082153, + "rewards/formatting_reward_func": 0.4880952338377635, + "rewards/length_reward_func": -2.908163234591484, + "rewards/penalize_wrong_passages_reward_func": -1.068027178446452, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4878570834795634, + "step": 7 + }, + { + "completion_length": 145.99659474690756, + "epoch": 0.016848016848016848, + "grad_norm": 0.9713934659957886, + "kl": 0.0009145736694335938, + "learning_rate": 1.6e-07, + "loss": 0.0003, + "reward": 3.9408061106999717, + "reward_std": 5.972306688626607, + "rewards/citation_reward_func": 3.741496523221334, + "rewards/correctness_reward_func": 2.0068026781082153, + "rewards/formatting_reward_func": 0.4863945543766022, + "rewards/length_reward_func": -2.19387752811114, + "rewards/penalize_wrong_passages_reward_func": -0.5850339954098066, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48502375682195026, + "step": 8 + }, + { + "completion_length": 158.15305836995444, + "epoch": 0.018954018954018954, + "grad_norm": 1.0545966625213623, + "kl": 0.0009892781575520833, + "learning_rate": 1.8e-07, + "loss": 0.0, + "reward": 1.793391227722168, + "reward_std": 7.030755837758382, + "rewards/citation_reward_func": 3.9115644693374634, + "rewards/correctness_reward_func": 1.4795918067296345, + "rewards/formatting_reward_func": 0.4829931954542796, + "rewards/length_reward_func": -3.2142856319745383, + "rewards/penalize_wrong_passages_reward_func": -1.1768707136313121, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.48046593368053436, + "step": 9 + }, + { + "completion_length": 141.08503214518228, + "epoch": 0.02106002106002106, + "grad_norm": 0.8638601899147034, + "kl": 0.0014311472574869792, + "learning_rate": 2e-07, + "loss": 0.0005, + "reward": 0.5391122450431188, + "reward_std": 6.993181943893433, + "rewards/citation_reward_func": 3.545918345451355, + "rewards/correctness_reward_func": 1.343537410100301, + "rewards/formatting_reward_func": 0.46598638594150543, + "rewards/length_reward_func": -4.336734573046367, + "rewards/penalize_wrong_passages_reward_func": -0.9455782324075699, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.46598294874032337, + "step": 10 + }, + { + "completion_length": 150.2993189493815, + "epoch": 0.023166023166023165, + "grad_norm": 0.9541848301887512, + "kl": 0.0010693868001302083, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0004, + "reward": 0.8768775562445322, + "reward_std": 7.34400741259257, + "rewards/citation_reward_func": 3.860544204711914, + "rewards/correctness_reward_func": 1.7006802260875702, + "rewards/formatting_reward_func": 0.47278910875320435, + "rewards/length_reward_func": -4.438775380452474, + "rewards/penalize_wrong_passages_reward_func": -1.19047615925471, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4721156011025111, + "step": 11 + }, + { + "completion_length": 146.27210235595703, + "epoch": 0.02527202527202527, + "grad_norm": 1.0617852210998535, + "kl": 0.0011603037516276042, + "learning_rate": 2.4e-07, + "loss": 0.0001, + "reward": 2.064258575439453, + "reward_std": 5.884495735168457, + "rewards/citation_reward_func": 3.8435373306274414, + "rewards/correctness_reward_func": 1.8197278082370758, + "rewards/formatting_reward_func": 0.4812925159931183, + "rewards/length_reward_func": -3.5204080740610757, + "rewards/penalize_wrong_passages_reward_func": -1.0408163170019786, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48092512786388397, + "step": 12 + }, + { + "completion_length": 142.45578002929688, + "epoch": 0.02737802737802738, + "grad_norm": 1.0271327495574951, + "kl": 0.0016148885091145833, + "learning_rate": 2.6e-07, + "loss": 0.0009, + "reward": 3.8155850768089294, + "reward_std": 5.828161716461182, + "rewards/citation_reward_func": 3.6819727023442588, + "rewards/correctness_reward_func": 2.6360543171564736, + "rewards/formatting_reward_func": 0.47959182659784955, + "rewards/length_reward_func": -2.7551019390424094, + "rewards/penalize_wrong_passages_reward_func": -0.7074829836686453, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48055097957452136, + "step": 13 + }, + { + "completion_length": 151.4897918701172, + "epoch": 0.029484029484029485, + "grad_norm": 0.853244960308075, + "kl": 0.0016167958577473958, + "learning_rate": 2.8e-07, + "loss": 0.0014, + "reward": 3.293789138396581, + "reward_std": 5.249532063802083, + "rewards/citation_reward_func": 3.7499999602635703, + "rewards/correctness_reward_func": 2.1598638892173767, + "rewards/formatting_reward_func": 0.48979591329892475, + "rewards/length_reward_func": -2.2959182957808175, + "rewards/penalize_wrong_passages_reward_func": -1.2993196845054626, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4893672863642375, + "step": 14 + }, + { + "completion_length": 137.7653045654297, + "epoch": 0.03159003159003159, + "grad_norm": 1.0360064506530762, + "kl": 0.0015691121419270833, + "learning_rate": 3e-07, + "loss": 0.0002, + "reward": 0.6607789664218823, + "reward_std": 7.108782927195231, + "rewards/citation_reward_func": 3.4523808558781943, + "rewards/correctness_reward_func": 1.2925169517596562, + "rewards/formatting_reward_func": 0.4574829836686452, + "rewards/length_reward_func": -3.979591647783915, + "rewards/penalize_wrong_passages_reward_func": -1.0204081336657207, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4583979199330012, + "step": 15 + }, + { + "completion_length": 146.68026733398438, + "epoch": 0.033696033696033696, + "grad_norm": 1.042144775390625, + "kl": 0.0016771952311197917, + "learning_rate": 3.2e-07, + "loss": 0.0008, + "reward": 1.779846937706073, + "reward_std": 6.210790753364563, + "rewards/citation_reward_func": 3.809523661931356, + "rewards/correctness_reward_func": 1.530612200498581, + "rewards/formatting_reward_func": 0.46768706540266675, + "rewards/length_reward_func": -3.775510162115097, + "rewards/penalize_wrong_passages_reward_func": -0.7210884292920431, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.468622421224912, + "step": 16 + }, + { + "completion_length": 155.00679779052734, + "epoch": 0.0358020358020358, + "grad_norm": 0.8416888117790222, + "kl": 0.001552581787109375, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0007, + "reward": 1.9689354697863262, + "reward_std": 5.606051802635193, + "rewards/citation_reward_func": 3.5884352922439575, + "rewards/correctness_reward_func": 0.9693877349297205, + "rewards/formatting_reward_func": 0.4812925110260646, + "rewards/length_reward_func": -2.9591835737228394, + "rewards/penalize_wrong_passages_reward_func": -0.591836716979742, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4808400919040044, + "step": 17 + }, + { + "completion_length": 138.35713831583658, + "epoch": 0.03790803790803791, + "grad_norm": 1.098335862159729, + "kl": 0.002567291259765625, + "learning_rate": 3.6e-07, + "loss": 0.0, + "reward": 4.020343641440074, + "reward_std": 5.701698939005534, + "rewards/citation_reward_func": 3.6989795764287314, + "rewards/correctness_reward_func": 2.4659862915674844, + "rewards/formatting_reward_func": 0.47789115210374195, + "rewards/length_reward_func": -2.4999999403953552, + "rewards/penalize_wrong_passages_reward_func": -0.5986394435167313, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4761258115371068, + "step": 18 + }, + { + "completion_length": 134.54761505126953, + "epoch": 0.04001404001404001, + "grad_norm": 1.1855769157409668, + "kl": 0.0048675537109375, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0007, + "reward": 3.8262110551198325, + "reward_std": 4.684337218602498, + "rewards/citation_reward_func": 3.656462550163269, + "rewards/correctness_reward_func": 2.057823101679484, + "rewards/formatting_reward_func": 0.4812925159931183, + "rewards/length_reward_func": -2.3469387193520865, + "rewards/penalize_wrong_passages_reward_func": -0.5034013539552689, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4809727370738983, + "step": 19 + }, + { + "completion_length": 138.53060658772787, + "epoch": 0.04212004212004212, + "grad_norm": 1.0796778202056885, + "kl": 0.005633036295572917, + "learning_rate": 4e-07, + "loss": 0.0003, + "reward": 1.8895781971514225, + "reward_std": 5.371582110722859, + "rewards/citation_reward_func": 3.4608842929204306, + "rewards/correctness_reward_func": 1.0374149332443874, + "rewards/formatting_reward_func": 0.4855442096789678, + "rewards/length_reward_func": -2.90816322962443, + "rewards/penalize_wrong_passages_reward_func": -0.6734693745772043, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4873672773440679, + "step": 20 + }, + { + "completion_length": 151.2653020222982, + "epoch": 0.044226044226044224, + "grad_norm": 1.172623634338379, + "kl": 0.008516947428385416, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0004, + "reward": 2.291476254661878, + "reward_std": 6.13926366964976, + "rewards/citation_reward_func": 3.579931934674581, + "rewards/correctness_reward_func": 1.4965985814730327, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -2.959183613459269, + "rewards/penalize_wrong_passages_reward_func": -0.8163265287876129, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.493857075770696, + "step": 21 + }, + { + "completion_length": 144.25850041707358, + "epoch": 0.04633204633204633, + "grad_norm": 0.9470953345298767, + "kl": 0.007146199544270833, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0009, + "reward": 3.1404490868250527, + "reward_std": 4.314921895662944, + "rewards/citation_reward_func": 3.9795918067296348, + "rewards/correctness_reward_func": 0.9863945146401724, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -2.4999999503294625, + "rewards/penalize_wrong_passages_reward_func": -0.3197278883308172, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 22 + }, + { + "completion_length": 135.66666158040366, + "epoch": 0.048438048438048435, + "grad_norm": 0.7583827376365662, + "kl": 0.007680257161458333, + "learning_rate": 4.6e-07, + "loss": 0.001, + "reward": 2.0998741885026297, + "reward_std": 4.641193389892578, + "rewards/citation_reward_func": 3.6394556760787964, + "rewards/correctness_reward_func": 1.3775509943564732, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -2.8061223874489465, + "rewards/penalize_wrong_passages_reward_func": -1.102040817340215, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4944319079319636, + "step": 23 + }, + { + "completion_length": 151.2006746927897, + "epoch": 0.05054405054405054, + "grad_norm": 0.83041912317276, + "kl": 0.008819580078125, + "learning_rate": 4.8e-07, + "loss": 0.0005, + "reward": 2.3360307614008584, + "reward_std": 4.612976272900899, + "rewards/citation_reward_func": 3.7329931259155273, + "rewards/correctness_reward_func": 1.4625850121180217, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -2.8571428110202155, + "rewards/penalize_wrong_passages_reward_func": -0.8299319495757421, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 24 + }, + { + "completion_length": 143.89795684814453, + "epoch": 0.05265005265005265, + "grad_norm": 0.8499120473861694, + "kl": 0.0132293701171875, + "learning_rate": 5e-07, + "loss": 0.0005, + "reward": 3.760387728611628, + "reward_std": 4.180753588676453, + "rewards/citation_reward_func": 3.767006754875183, + "rewards/correctness_reward_func": 2.1768707036972046, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -2.4999999701976776, + "rewards/penalize_wrong_passages_reward_func": -0.6802720973889033, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49848292271296185, + "step": 25 + }, + { + "completion_length": 146.84693400065103, + "epoch": 0.05475605475605476, + "grad_norm": 0.649495542049408, + "kl": 0.008579254150390625, + "learning_rate": 5.2e-07, + "loss": 0.0012, + "reward": 3.507796049118042, + "reward_std": 4.758268475532532, + "rewards/citation_reward_func": 3.741496443748474, + "rewards/correctness_reward_func": 1.8707482516765594, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -2.295918345451355, + "rewards/penalize_wrong_passages_reward_func": -0.8027210781971613, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 26 + }, + { + "completion_length": 145.1700668334961, + "epoch": 0.056862056862056864, + "grad_norm": 0.8985055088996887, + "kl": 0.01629638671875, + "learning_rate": 5.4e-07, + "loss": 0.0008, + "reward": 2.723782400290171, + "reward_std": 4.800549666086833, + "rewards/citation_reward_func": 3.4778910875320435, + "rewards/correctness_reward_func": 1.8707482516765594, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -3.061224400997162, + "rewards/penalize_wrong_passages_reward_func": -0.5578231147180001, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 27 + }, + { + "completion_length": 148.224484761556, + "epoch": 0.05896805896805897, + "grad_norm": 1.046953797340393, + "kl": 0.02532958984375, + "learning_rate": 5.6e-07, + "loss": 0.0005, + "reward": 3.3295952950914702, + "reward_std": 5.556566874186198, + "rewards/citation_reward_func": 3.579931855201721, + "rewards/correctness_reward_func": 2.4149659176667533, + "rewards/formatting_reward_func": 0.4948979616165161, + "rewards/length_reward_func": -2.551020344098409, + "rewards/penalize_wrong_passages_reward_func": -0.9319727768500646, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4928604910771052, + "step": 28 + }, + { + "completion_length": 161.9489771525065, + "epoch": 0.061074061074061076, + "grad_norm": 0.9288030862808228, + "kl": 0.057291666666666664, + "learning_rate": 5.8e-07, + "loss": 0.0007, + "reward": 1.887748343249162, + "reward_std": 5.772057731946309, + "rewards/citation_reward_func": 3.4523808558781943, + "rewards/correctness_reward_func": 1.5986394186814625, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -3.0612244407335916, + "rewards/penalize_wrong_passages_reward_func": -0.9251700465877851, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4965917766094208, + "step": 29 + }, + { + "completion_length": 141.9795913696289, + "epoch": 0.06318006318006318, + "grad_norm": 0.994685709476471, + "kl": 0.0557861328125, + "learning_rate": 6e-07, + "loss": 0.0018, + "reward": 4.592092037200928, + "reward_std": 4.368195136388143, + "rewards/citation_reward_func": 3.945578098297119, + "rewards/correctness_reward_func": 2.414965867996216, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -2.1428571144739785, + "rewards/penalize_wrong_passages_reward_func": -0.619047611951828, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.493452325463295, + "step": 30 + }, + { + "completion_length": 149.51360066731772, + "epoch": 0.06528606528606529, + "grad_norm": 0.8815411329269409, + "kl": 0.058308919270833336, + "learning_rate": 6.2e-07, + "loss": 0.0009, + "reward": 4.303721110026042, + "reward_std": 5.176619450251262, + "rewards/citation_reward_func": 4.2176869710286455, + "rewards/correctness_reward_func": 1.9727890690167744, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.989795873562495, + "rewards/penalize_wrong_passages_reward_func": -0.8979591627915701, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 31 + }, + { + "completion_length": 153.89795684814453, + "epoch": 0.06739206739206739, + "grad_norm": 1.1185945272445679, + "kl": 0.0631103515625, + "learning_rate": 6.4e-07, + "loss": 0.0007, + "reward": 2.2272075613339744, + "reward_std": 5.821100076039632, + "rewards/citation_reward_func": 3.945578138033549, + "rewards/correctness_reward_func": 2.023809482653936, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -3.0612244606018066, + "rewards/penalize_wrong_passages_reward_func": -1.6802720924218495, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4993162552515666, + "step": 32 + }, + { + "completion_length": 144.96258163452148, + "epoch": 0.0694980694980695, + "grad_norm": 0.9808516502380371, + "kl": 0.0740966796875, + "learning_rate": 6.6e-07, + "loss": 0.0014, + "reward": 5.15746267636617, + "reward_std": 4.294728080431621, + "rewards/citation_reward_func": 3.622448960940043, + "rewards/correctness_reward_func": 2.772108813126882, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.632653037707011, + "rewards/penalize_wrong_passages_reward_func": -0.6054421712954839, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 33 + }, + { + "completion_length": 145.58162943522134, + "epoch": 0.0716040716040716, + "grad_norm": 0.9893280863761902, + "kl": 0.102783203125, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0017, + "reward": 6.129275401433309, + "reward_std": 3.9419746001561484, + "rewards/citation_reward_func": 4.022108713785808, + "rewards/correctness_reward_func": 2.9081631700197854, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -1.3265306105216343, + "rewards/penalize_wrong_passages_reward_func": -0.4693877398967743, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4966223786274592, + "step": 34 + }, + { + "completion_length": 144.02720896402994, + "epoch": 0.07371007371007371, + "grad_norm": 1.1553752422332764, + "kl": 0.09427897135416667, + "learning_rate": 7e-07, + "loss": 0.0012, + "reward": 5.2912074228127794, + "reward_std": 5.189612110455831, + "rewards/citation_reward_func": 4.090135892232259, + "rewards/correctness_reward_func": 2.568027118841807, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.7346938451131184, + "rewards/penalize_wrong_passages_reward_func": -0.6326530476411184, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5003910859425863, + "step": 35 + }, + { + "completion_length": 138.0680249532064, + "epoch": 0.07581607581607581, + "grad_norm": 1.5305249691009521, + "kl": 0.14192708333333334, + "learning_rate": 7.2e-07, + "loss": 0.0018, + "reward": 3.28841503461202, + "reward_std": 4.651200453440349, + "rewards/citation_reward_func": 3.6139455238978067, + "rewards/correctness_reward_func": 1.7517006198565166, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.7857142686843872, + "rewards/penalize_wrong_passages_reward_func": -1.2925169666608174, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 36 + }, + { + "completion_length": 143.6292495727539, + "epoch": 0.07792207792207792, + "grad_norm": 0.9717904925346375, + "kl": 0.159912109375, + "learning_rate": 7.4e-07, + "loss": 0.002, + "reward": 4.3089354038238525, + "reward_std": 3.8151880502700806, + "rewards/citation_reward_func": 3.5969387690226235, + "rewards/correctness_reward_func": 1.870748261610667, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.530612200498581, + "rewards/penalize_wrong_passages_reward_func": -0.6190476020177206, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4909080962340037, + "step": 37 + }, + { + "completion_length": 145.60544077555338, + "epoch": 0.08002808002808003, + "grad_norm": 1.1155370473861694, + "kl": 0.19954427083333334, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0022, + "reward": 4.726190567016602, + "reward_std": 3.97695521513621, + "rewards/citation_reward_func": 4.03911550839742, + "rewards/correctness_reward_func": 2.2619047264258065, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.3775510092576344, + "rewards/penalize_wrong_passages_reward_func": -1.183673453827699, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4863944947719574, + "step": 38 + }, + { + "completion_length": 143.42516581217447, + "epoch": 0.08213408213408213, + "grad_norm": 0.9343476891517639, + "kl": 0.14713541666666666, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0019, + "reward": 4.451527198155721, + "reward_std": 3.3758476177851358, + "rewards/citation_reward_func": 3.9795917669932046, + "rewards/correctness_reward_func": 1.1564625600973766, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.224489798148473, + "rewards/penalize_wrong_passages_reward_func": -0.45578230917453766, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49574483434359234, + "step": 39 + }, + { + "completion_length": 139.33672841389975, + "epoch": 0.08424008424008424, + "grad_norm": 0.9100993275642395, + "kl": 0.197265625, + "learning_rate": 8e-07, + "loss": 0.0031, + "reward": 5.527904828389485, + "reward_std": 3.306452294190725, + "rewards/citation_reward_func": 3.784013509750366, + "rewards/correctness_reward_func": 2.1938775181770325, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.9693877448638281, + "rewards/penalize_wrong_passages_reward_func": -0.4761904676755269, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49559177954991657, + "step": 40 + }, + { + "completion_length": 125.27891031901042, + "epoch": 0.08634608634608634, + "grad_norm": 1.0793579816818237, + "kl": 0.2594401041666667, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0033, + "reward": 5.9822925726572675, + "reward_std": 3.2602258125940957, + "rewards/citation_reward_func": 3.954081575075785, + "rewards/correctness_reward_func": 2.1938775181770325, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.8673469324906667, + "rewards/penalize_wrong_passages_reward_func": -0.29931971554954845, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 41 + }, + { + "completion_length": 140.08503341674805, + "epoch": 0.08845208845208845, + "grad_norm": 1.0207041501998901, + "kl": 0.3108723958333333, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0033, + "reward": 4.319027250011762, + "reward_std": 3.1209686001141868, + "rewards/citation_reward_func": 3.818027059237162, + "rewards/correctness_reward_func": 1.4625849823156993, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.3775510042905807, + "rewards/penalize_wrong_passages_reward_func": -0.5850340078274409, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 42 + }, + { + "completion_length": 154.65986124674478, + "epoch": 0.09055809055809055, + "grad_norm": 0.9811875820159912, + "kl": 0.3186848958333333, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0035, + "reward": 4.860387921333313, + "reward_std": 4.377778013547261, + "rewards/citation_reward_func": 4.030612150828044, + "rewards/correctness_reward_func": 1.8537414173285167, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.4795918017625809, + "rewards/penalize_wrong_passages_reward_func": -0.5442176821331183, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49984346826871234, + "step": 43 + }, + { + "completion_length": 161.1530558268229, + "epoch": 0.09266409266409266, + "grad_norm": 1.072112798690796, + "kl": 0.3014322916666667, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0035, + "reward": 3.8310410181681314, + "reward_std": 5.302052021026611, + "rewards/citation_reward_func": 4.005101879437764, + "rewards/correctness_reward_func": 1.4455781976381938, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.9387754797935486, + "rewards/penalize_wrong_passages_reward_func": -0.6802720949053764, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49940809110800427, + "step": 44 + }, + { + "completion_length": 157.3537394205729, + "epoch": 0.09477009477009476, + "grad_norm": 1.0070186853408813, + "kl": 0.31298828125, + "learning_rate": 9e-07, + "loss": 0.004, + "reward": 6.792323112487793, + "reward_std": 2.9815571308135986, + "rewards/citation_reward_func": 3.8605441649754844, + "rewards/correctness_reward_func": 3.010204037030538, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.6122448891401291, + "rewards/penalize_wrong_passages_reward_func": -0.46258503446976346, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49640469749768573, + "step": 45 + }, + { + "completion_length": 148.18026987711588, + "epoch": 0.09687609687609687, + "grad_norm": 0.8120829463005066, + "kl": 248.32552083333334, + "learning_rate": 9.2e-07, + "loss": 2.4913, + "reward": 7.790044228235881, + "reward_std": 2.8049110968907676, + "rewards/citation_reward_func": 4.15816315015157, + "rewards/correctness_reward_func": 3.5714284578959146, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.5102040817340215, + "rewards/penalize_wrong_passages_reward_func": -0.42176870505015057, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49412578841050464, + "step": 46 + }, + { + "completion_length": 140.71428298950195, + "epoch": 0.09898209898209898, + "grad_norm": 1.2571049928665161, + "kl": 0.498046875, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0054, + "reward": 5.644897858301799, + "reward_std": 2.7742998798688254, + "rewards/citation_reward_func": 3.74999992052714, + "rewards/correctness_reward_func": 2.414965877930323, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.8673469225565592, + "rewards/penalize_wrong_passages_reward_func": -0.6462584932645162, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4952380359172821, + "step": 47 + }, + { + "completion_length": 142.21088155110678, + "epoch": 0.10108810108810108, + "grad_norm": 1.004626750946045, + "kl": 0.3631184895833333, + "learning_rate": 9.6e-07, + "loss": 0.004, + "reward": 3.835642953713735, + "reward_std": 3.7980151573816934, + "rewards/citation_reward_func": 3.869047522544861, + "rewards/correctness_reward_func": 1.1224489609400432, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.275510201851527, + "rewards/penalize_wrong_passages_reward_func": -0.870748296380043, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4904047002394994, + "step": 48 + }, + { + "completion_length": 133.32992808024088, + "epoch": 0.10319410319410319, + "grad_norm": 1.8091953992843628, + "kl": 0.5882161458333334, + "learning_rate": 9.8e-07, + "loss": 0.006, + "reward": 4.456782499949138, + "reward_std": 3.1944571336110434, + "rewards/citation_reward_func": 3.6564625104268393, + "rewards/correctness_reward_func": 1.9897959033648174, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.8163265387217203, + "rewards/penalize_wrong_passages_reward_func": -1.3741496553023655, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 49 + }, + { + "completion_length": 128.5748291015625, + "epoch": 0.1053001053001053, + "grad_norm": 16.53020668029785, + "kl": 1.4166666666666667, + "learning_rate": 1e-06, + "loss": 0.0148, + "reward": 7.325829982757568, + "reward_std": 2.0804774363835654, + "rewards/citation_reward_func": 3.6139454444249473, + "rewards/correctness_reward_func": 2.976190368334452, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.1632653040190538, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 50 + }, + { + "completion_length": 142.61224365234375, + "epoch": 0.10740610740610741, + "grad_norm": 2.8453311920166016, + "kl": 2.1321614583333335, + "learning_rate": 1e-06, + "loss": 0.022, + "reward": 5.023935596148173, + "reward_std": 2.8823713461558023, + "rewards/citation_reward_func": 3.911564509073893, + "rewards/correctness_reward_func": 1.5476190348466237, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.0204081684350967, + "rewards/penalize_wrong_passages_reward_func": -0.4149659772713979, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5001257807016373, + "step": 51 + }, + { + "completion_length": 171.63264973958334, + "epoch": 0.10951210951210952, + "grad_norm": 22.674123764038086, + "kl": 1.5296223958333333, + "learning_rate": 1e-06, + "loss": 0.0155, + "reward": 0.20642862717310587, + "reward_std": 5.004753351211548, + "rewards/citation_reward_func": 3.9965985218683877, + "rewards/correctness_reward_func": 0.34013604124387103, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -4.5408161878585815, + "rewards/penalize_wrong_passages_reward_func": -0.5850340028603872, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49554415543874103, + "step": 52 + }, + { + "completion_length": 151.25509897867838, + "epoch": 0.11161811161811162, + "grad_norm": 0.8755993247032166, + "kl": 0.4990234375, + "learning_rate": 1e-06, + "loss": 0.0053, + "reward": 5.218874295552571, + "reward_std": 3.1620943943659463, + "rewards/citation_reward_func": 3.8860543966293335, + "rewards/correctness_reward_func": 1.7346938451131184, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.8163265337546667, + "rewards/penalize_wrong_passages_reward_func": -0.585034000997742, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4994863271713257, + "step": 53 + }, + { + "completion_length": 149.2585016886393, + "epoch": 0.11372411372411373, + "grad_norm": 0.8691079616546631, + "kl": 0.3743489583333333, + "learning_rate": 1e-06, + "loss": 0.0044, + "reward": 5.637054522832234, + "reward_std": 3.094425678253174, + "rewards/citation_reward_func": 4.013605356216431, + "rewards/correctness_reward_func": 1.7517006198565166, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.6122448941071829, + "rewards/penalize_wrong_passages_reward_func": -0.5170068045457205, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 54 + }, + { + "completion_length": 161.76190185546875, + "epoch": 0.11583011583011583, + "grad_norm": 1.1524864435195923, + "kl": 0.4420572916666667, + "learning_rate": 1e-06, + "loss": 0.0045, + "reward": 1.8751495977242787, + "reward_std": 3.3608768383661904, + "rewards/citation_reward_func": 3.996598561604818, + "rewards/correctness_reward_func": 1.581632599234581, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -3.8265305956204734, + "rewards/penalize_wrong_passages_reward_func": -0.8775509844223658, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 55 + }, + { + "completion_length": 149.0646209716797, + "epoch": 0.11793611793611794, + "grad_norm": 1.3282649517059326, + "kl": 0.5231119791666666, + "learning_rate": 1e-06, + "loss": 0.0053, + "reward": 3.513479550679525, + "reward_std": 3.8186222910881042, + "rewards/citation_reward_func": 4.081632653872172, + "rewards/correctness_reward_func": 1.2244897584120433, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -2.193877483407656, + "rewards/penalize_wrong_passages_reward_func": -0.4285714191695054, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4998740802208583, + "step": 56 + }, + { + "completion_length": 158.3197250366211, + "epoch": 0.12004212004212005, + "grad_norm": 2.27993106842041, + "kl": 0.6731770833333334, + "learning_rate": 1e-06, + "loss": 0.0069, + "reward": 6.510074933369954, + "reward_std": 2.7599647641181946, + "rewards/citation_reward_func": 3.9115644693374634, + "rewards/correctness_reward_func": 2.346938689549764, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -0.4353741407394409, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49306795994440716, + "step": 57 + }, + { + "completion_length": 155.25509897867838, + "epoch": 0.12214812214812215, + "grad_norm": 1.1884284019470215, + "kl": 0.6569010416666666, + "learning_rate": 1e-06, + "loss": 0.0069, + "reward": 6.011476318041484, + "reward_std": 3.121876875559489, + "rewards/citation_reward_func": 3.89455775419871, + "rewards/correctness_reward_func": 2.1938775181770325, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.5102040767669678, + "rewards/penalize_wrong_passages_reward_func": -0.5646258319417635, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49787067870299023, + "step": 58 + }, + { + "completion_length": 151.30951690673828, + "epoch": 0.12425412425412426, + "grad_norm": 1.67170250415802, + "kl": 0.6803385416666666, + "learning_rate": 1e-06, + "loss": 0.0076, + "reward": 6.520530700683594, + "reward_std": 2.5361270904541016, + "rewards/citation_reward_func": 4.0646257400512695, + "rewards/correctness_reward_func": 2.3129251301288605, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.510204071799914, + "rewards/penalize_wrong_passages_reward_func": -0.3401360474526882, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4950203349192937, + "step": 59 + }, + { + "completion_length": 128.4727872212728, + "epoch": 0.12636012636012636, + "grad_norm": 1.07207190990448, + "kl": 0.6061197916666666, + "learning_rate": 1e-06, + "loss": 0.007, + "reward": 8.203622579574585, + "reward_std": 2.1818835139274597, + "rewards/citation_reward_func": 4.022108793258667, + "rewards/correctness_reward_func": 3.2993196646372476, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.11564625551303227, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49784007171789807, + "step": 60 + }, + { + "completion_length": 137.25169881184897, + "epoch": 0.12846612846612845, + "grad_norm": 1.870184063911438, + "kl": 0.572265625, + "learning_rate": 1e-06, + "loss": 0.0066, + "reward": 7.795095205307007, + "reward_std": 2.1725188493728638, + "rewards/citation_reward_func": 3.8435372511545816, + "rewards/correctness_reward_func": 3.3843536376953125, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.3809523756305377, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4991768052180608, + "step": 61 + }, + { + "completion_length": 146.1836675008138, + "epoch": 0.13057213057213057, + "grad_norm": 0.9689912796020508, + "kl": 0.541015625, + "learning_rate": 1e-06, + "loss": 0.0064, + "reward": 6.3955680926640825, + "reward_std": 2.069407343864441, + "rewards/citation_reward_func": 3.9710882902145386, + "rewards/correctness_reward_func": 2.04081629216671, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.4625850295027097, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4993094503879547, + "step": 62 + }, + { + "completion_length": 158.43877156575522, + "epoch": 0.13267813267813267, + "grad_norm": 1.0672656297683716, + "kl": 0.5579427083333334, + "learning_rate": 1e-06, + "loss": 0.0065, + "reward": 6.065329949061076, + "reward_std": 2.4219020207722983, + "rewards/citation_reward_func": 4.217687010765076, + "rewards/correctness_reward_func": 1.632652997970581, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -0.4285714291036129, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5007040103276571, + "step": 63 + }, + { + "completion_length": 147.59183502197266, + "epoch": 0.13478413478413478, + "grad_norm": 0.9976738095283508, + "kl": 0.6643880208333334, + "learning_rate": 1e-06, + "loss": 0.0075, + "reward": 5.30202051003774, + "reward_std": 2.7674037416776023, + "rewards/citation_reward_func": 4.2261903285980225, + "rewards/correctness_reward_func": 1.5306122203667958, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.5102040767669678, + "rewards/penalize_wrong_passages_reward_func": -0.945578183978796, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 64 + }, + { + "completion_length": 141.9965960184733, + "epoch": 0.13689013689013688, + "grad_norm": 0.8568897247314453, + "kl": 1.1334635416666667, + "learning_rate": 1e-06, + "loss": 0.0132, + "reward": 7.853527307510376, + "reward_std": 1.9899038672447205, + "rewards/citation_reward_func": 4.2261903285980225, + "rewards/correctness_reward_func": 3.418367326259613, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.6870748152335485, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4980849673350652, + "step": 65 + }, + { + "completion_length": 139.36054356892905, + "epoch": 0.138996138996139, + "grad_norm": 7.162007808685303, + "kl": 1.1429036458333333, + "learning_rate": 1e-06, + "loss": 0.0127, + "reward": 4.970387836297353, + "reward_std": 2.34166028102239, + "rewards/citation_reward_func": 4.421768585840861, + "rewards/correctness_reward_func": 0.6802720973889033, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.7653061151504517, + "rewards/penalize_wrong_passages_reward_func": -0.36734693869948387, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 66 + }, + { + "completion_length": 161.24829610188803, + "epoch": 0.14110214110214112, + "grad_norm": 2.989043951034546, + "kl": 0.794921875, + "learning_rate": 1e-06, + "loss": 0.0098, + "reward": 6.70848286151886, + "reward_std": 2.340596000353495, + "rewards/citation_reward_func": 4.149659752845764, + "rewards/correctness_reward_func": 3.299319624900818, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.9183673063913981, + "rewards/penalize_wrong_passages_reward_func": -0.8231292217969894, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 67 + }, + { + "completion_length": 151.68707275390625, + "epoch": 0.1432081432081432, + "grad_norm": 2.169832706451416, + "kl": 0.8333333333333334, + "learning_rate": 1e-06, + "loss": 0.0093, + "reward": 6.5593845049540205, + "reward_std": 2.7000568310419717, + "rewards/citation_reward_func": 4.192176739374797, + "rewards/correctness_reward_func": 2.2959183057149253, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020383834839, + "rewards/penalize_wrong_passages_reward_func": -0.6734693696101507, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4998604704936345, + "step": 68 + }, + { + "completion_length": 148.39115142822266, + "epoch": 0.14531414531414533, + "grad_norm": 0.9319802522659302, + "kl": 0.5231119791666666, + "learning_rate": 1e-06, + "loss": 0.0073, + "reward": 7.101214488347371, + "reward_std": 2.3352424701054892, + "rewards/citation_reward_func": 4.336734652519226, + "rewards/correctness_reward_func": 2.7380951245625815, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.768707479039828, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49917339781920117, + "step": 69 + }, + { + "completion_length": 157.34693400065103, + "epoch": 0.14742014742014742, + "grad_norm": 0.8337175250053406, + "kl": 0.6435546875, + "learning_rate": 1e-06, + "loss": 0.0072, + "reward": 6.779911518096924, + "reward_std": 2.5660134156545005, + "rewards/citation_reward_func": 4.234693686167399, + "rewards/correctness_reward_func": 2.29591832558314, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020383834839, + "rewards/penalize_wrong_passages_reward_func": -0.49659863611062366, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 70 + }, + { + "completion_length": 147.37755076090494, + "epoch": 0.14952614952614954, + "grad_norm": 1.07455575466156, + "kl": 0.5833333333333334, + "learning_rate": 1e-06, + "loss": 0.0064, + "reward": 6.333510239919026, + "reward_std": 2.8906733194986978, + "rewards/citation_reward_func": 4.18367342154185, + "rewards/correctness_reward_func": 2.1598638792832694, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -0.6530612111091614, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5001767973105112, + "step": 71 + }, + { + "completion_length": 133.7721061706543, + "epoch": 0.15163215163215163, + "grad_norm": 0.8392318487167358, + "kl": 0.6985677083333334, + "learning_rate": 1e-06, + "loss": 0.0094, + "reward": 7.531945625940959, + "reward_std": 1.5154228607813518, + "rewards/citation_reward_func": 4.685374021530151, + "rewards/correctness_reward_func": 2.58503391345342, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.7346938649813334, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4962312380472819, + "step": 72 + }, + { + "completion_length": 140.7721061706543, + "epoch": 0.15373815373815375, + "grad_norm": 0.9759712219238281, + "kl": 0.6940104166666666, + "learning_rate": 1e-06, + "loss": 0.0081, + "reward": 5.715149720509847, + "reward_std": 2.087883015473684, + "rewards/citation_reward_func": 3.9795918464660645, + "rewards/correctness_reward_func": 2.1258502999941506, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -1.2857142488161724, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49746252099672955, + "step": 73 + }, + { + "completion_length": 152.61224619547525, + "epoch": 0.15584415584415584, + "grad_norm": 0.6399812698364258, + "kl": 0.5185546875, + "learning_rate": 1e-06, + "loss": 0.006, + "reward": 4.938377459843953, + "reward_std": 1.8990906874338787, + "rewards/citation_reward_func": 3.962584932645162, + "rewards/correctness_reward_func": 1.39455779393514, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -1.061224450667699, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49960197508335114, + "step": 74 + }, + { + "completion_length": 144.59183756510416, + "epoch": 0.15795015795015796, + "grad_norm": 5.202109336853027, + "kl": 0.9856770833333334, + "learning_rate": 1e-06, + "loss": 0.0104, + "reward": 5.507989803949992, + "reward_std": 2.5388510624567666, + "rewards/citation_reward_func": 4.090135931968689, + "rewards/correctness_reward_func": 2.3299319048722587, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -1.700680245955785, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4926836242278417, + "step": 75 + }, + { + "completion_length": 133.173464457194, + "epoch": 0.16005616005616005, + "grad_norm": 1.2735567092895508, + "kl": 0.7428385416666666, + "learning_rate": 1e-06, + "loss": 0.009, + "reward": 6.55202039082845, + "reward_std": 1.8033390442530315, + "rewards/citation_reward_func": 4.489795843760173, + "rewards/correctness_reward_func": 3.316326459248861, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -2.1020407577355704, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 76 + }, + { + "completion_length": 150.09183502197266, + "epoch": 0.16216216216216217, + "grad_norm": 0.912190318107605, + "kl": 0.71875, + "learning_rate": 1e-06, + "loss": 0.009, + "reward": 6.876408179601033, + "reward_std": 2.352629860242208, + "rewards/citation_reward_func": 4.379251639048259, + "rewards/correctness_reward_func": 2.227891117334366, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.35374149307608604, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49715640644232434, + "step": 77 + }, + { + "completion_length": 149.22108205159506, + "epoch": 0.16426816426816426, + "grad_norm": 0.8740860223770142, + "kl": 0.5198567708333334, + "learning_rate": 1e-06, + "loss": 0.007, + "reward": 6.6944014231363935, + "reward_std": 2.1339696844418845, + "rewards/citation_reward_func": 4.124149521191915, + "rewards/correctness_reward_func": 2.636054356892904, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -0.700680265824, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49372102816899616, + "step": 78 + }, + { + "completion_length": 176.8197224934896, + "epoch": 0.16637416637416638, + "grad_norm": 0.850678026676178, + "kl": 0.5071614583333334, + "learning_rate": 1e-06, + "loss": 0.0057, + "reward": 5.613517125447591, + "reward_std": 3.3167173663775125, + "rewards/citation_reward_func": 4.498299201329549, + "rewards/correctness_reward_func": 1.173469344774882, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -0.4625850319862366, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4825645883878072, + "step": 79 + }, + { + "completion_length": 150.61564127604166, + "epoch": 0.16848016848016847, + "grad_norm": 0.6305944919586182, + "kl": 0.5807291666666666, + "learning_rate": 1e-06, + "loss": 0.0078, + "reward": 7.111183563868205, + "reward_std": 2.422848731279373, + "rewards/citation_reward_func": 4.685374021530151, + "rewards/correctness_reward_func": 2.0068026781082153, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.2040816309551398, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49723803003629047, + "step": 80 + }, + { + "completion_length": 140.08843231201172, + "epoch": 0.1705861705861706, + "grad_norm": 0.7477515935897827, + "kl": 0.6803385416666666, + "learning_rate": 1e-06, + "loss": 0.0093, + "reward": 7.217928409576416, + "reward_std": 2.468548610806465, + "rewards/citation_reward_func": 4.6513603528340655, + "rewards/correctness_reward_func": 2.9081632097562156, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020383834839, + "rewards/penalize_wrong_passages_reward_func": -1.0816326302786667, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49513939023017883, + "step": 81 + }, + { + "completion_length": 136.18026987711588, + "epoch": 0.17269217269217269, + "grad_norm": 0.8937349319458008, + "kl": 0.6920572916666666, + "learning_rate": 1e-06, + "loss": 0.0086, + "reward": 7.065135955810547, + "reward_std": 2.2013906836509705, + "rewards/citation_reward_func": 4.251700599988301, + "rewards/correctness_reward_func": 2.5170067250728607, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408162971337637, + "rewards/penalize_wrong_passages_reward_func": -0.48979589839776355, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4903060595194499, + "step": 82 + }, + { + "completion_length": 142.0033976236979, + "epoch": 0.1747981747981748, + "grad_norm": 0.8927910327911377, + "kl": 0.658203125, + "learning_rate": 1e-06, + "loss": 0.0075, + "reward": 5.907251795132955, + "reward_std": 2.337292790412903, + "rewards/citation_reward_func": 3.962584892908732, + "rewards/correctness_reward_func": 1.7687074542045593, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.6666666517655054, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4956870178381602, + "step": 83 + }, + { + "completion_length": 143.99319458007812, + "epoch": 0.1769041769041769, + "grad_norm": 1.0283397436141968, + "kl": 0.7311197916666666, + "learning_rate": 1e-06, + "loss": 0.0094, + "reward": 6.188986460367839, + "reward_std": 2.1218987504641214, + "rewards/citation_reward_func": 4.583333253860474, + "rewards/correctness_reward_func": 3.265306015809377, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.30612244705359143, + "rewards/penalize_wrong_passages_reward_func": -2.353741466999054, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5002108216285706, + "step": 84 + }, + { + "completion_length": 143.93197123209634, + "epoch": 0.17901017901017902, + "grad_norm": 1.2225751876831055, + "kl": 1.29296875, + "learning_rate": 1e-06, + "loss": 0.0138, + "reward": 5.960183660189311, + "reward_std": 2.4073960979779563, + "rewards/citation_reward_func": 4.319727897644043, + "rewards/correctness_reward_func": 1.9047618508338928, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.40816325942675274, + "rewards/penalize_wrong_passages_reward_func": -0.8571428507566452, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 85 + }, + { + "completion_length": 140.00680033365884, + "epoch": 0.1811161811161811, + "grad_norm": 1.0447474718093872, + "kl": 0.662109375, + "learning_rate": 1e-06, + "loss": 0.0081, + "reward": 7.196578184763591, + "reward_std": 1.5716410279273987, + "rewards/citation_reward_func": 4.3622448444366455, + "rewards/correctness_reward_func": 2.329931862652302, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.49659861996769905, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 86 + }, + { + "completion_length": 154.99319712320963, + "epoch": 0.18322218322218323, + "grad_norm": 0.9008639454841614, + "kl": 0.599609375, + "learning_rate": 1e-06, + "loss": 0.0078, + "reward": 6.203136126200358, + "reward_std": 1.8692207137743633, + "rewards/citation_reward_func": 4.345238010088603, + "rewards/correctness_reward_func": 2.2278911074002585, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -1.265306081622839, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4973536729812622, + "step": 87 + }, + { + "completion_length": 138.05101521809897, + "epoch": 0.18532818532818532, + "grad_norm": 1.0423378944396973, + "kl": 0.7408854166666666, + "learning_rate": 1e-06, + "loss": 0.01, + "reward": 7.49479603767395, + "reward_std": 1.503724937637647, + "rewards/citation_reward_func": 4.625850280125936, + "rewards/correctness_reward_func": 4.0136053164800005, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -2.040816222627958, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4981972227493922, + "step": 88 + }, + { + "completion_length": 148.11224238077799, + "epoch": 0.18743418743418744, + "grad_norm": 0.9338856935501099, + "kl": 0.6295572916666666, + "learning_rate": 1e-06, + "loss": 0.0073, + "reward": 5.587258418401082, + "reward_std": 2.0752708315849304, + "rewards/citation_reward_func": 4.685373942057292, + "rewards/correctness_reward_func": 1.615646208326022, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.6598639239867528, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497122382124265, + "step": 89 + }, + { + "completion_length": 145.36054229736328, + "epoch": 0.18954018954018953, + "grad_norm": 0.9711011648178101, + "kl": 0.6637369791666666, + "learning_rate": 1e-06, + "loss": 0.0074, + "reward": 4.11749655008316, + "reward_std": 1.244092543919881, + "rewards/citation_reward_func": 4.566326379776001, + "rewards/correctness_reward_func": 1.7687074492375057, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -3.217687033737699, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500149592757225, + "step": 90 + }, + { + "completion_length": 132.89795684814453, + "epoch": 0.19164619164619165, + "grad_norm": 0.8163198828697205, + "kl": 0.97265625, + "learning_rate": 1e-06, + "loss": 0.0131, + "reward": 8.493574778238932, + "reward_std": 1.6131847749153774, + "rewards/citation_reward_func": 4.651360511779785, + "rewards/correctness_reward_func": 3.9795917669932046, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.408163254459699, + "rewards/penalize_wrong_passages_reward_func": -0.7278911570707957, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4986768066883087, + "step": 91 + }, + { + "completion_length": 140.93537139892578, + "epoch": 0.19375219375219374, + "grad_norm": 2.3272147178649902, + "kl": 1.0787760416666667, + "learning_rate": 1e-06, + "loss": 0.012, + "reward": 6.0862347682317095, + "reward_std": 2.626093844572703, + "rewards/citation_reward_func": 4.44727885723114, + "rewards/correctness_reward_func": 2.9761903484662375, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -2.231292466322581, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4960985829432805, + "step": 92 + }, + { + "completion_length": 166.67346700032553, + "epoch": 0.19585819585819586, + "grad_norm": 2.9835522174835205, + "kl": 1.3411458333333333, + "learning_rate": 1e-06, + "loss": 0.0145, + "reward": 6.7499386469523115, + "reward_std": 2.8229238192240396, + "rewards/citation_reward_func": 4.18367338180542, + "rewards/correctness_reward_func": 2.8911563555399575, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -1.2176870703697205, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4948366681734721, + "step": 93 + }, + { + "completion_length": 154.5986353556315, + "epoch": 0.19796419796419795, + "grad_norm": 1.780610203742981, + "kl": 1.1497395833333333, + "learning_rate": 1e-06, + "loss": 0.0134, + "reward": 7.306034167607625, + "reward_std": 2.362267851829529, + "rewards/citation_reward_func": 4.56632645924886, + "rewards/correctness_reward_func": 2.7380951642990112, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.843537408237656, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4982108175754547, + "step": 94 + }, + { + "completion_length": 155.43536885579428, + "epoch": 0.20007020007020007, + "grad_norm": 0.7836578488349915, + "kl": 0.6829427083333334, + "learning_rate": 1e-06, + "loss": 0.0089, + "reward": 7.553721110026042, + "reward_std": 2.3052654465039573, + "rewards/citation_reward_func": 4.532312790552775, + "rewards/correctness_reward_func": 2.7040815949440002, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.5306122352679571, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 95 + }, + { + "completion_length": 158.023806254069, + "epoch": 0.20217620217620216, + "grad_norm": 0.9116750955581665, + "kl": 1.1731770833333333, + "learning_rate": 1e-06, + "loss": 0.0137, + "reward": 6.622407933076222, + "reward_std": 2.6178742349147797, + "rewards/citation_reward_func": 4.574829896291097, + "rewards/correctness_reward_func": 2.5510203341643014, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408162971337637, + "rewards/penalize_wrong_passages_reward_func": -1.129251668850581, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4999591161807378, + "step": 96 + }, + { + "completion_length": 149.49319203694662, + "epoch": 0.20428220428220428, + "grad_norm": 2.335477590560913, + "kl": 0.8600260416666666, + "learning_rate": 1e-06, + "loss": 0.012, + "reward": 7.119860688845317, + "reward_std": 1.709948976834615, + "rewards/citation_reward_func": 4.676870743433635, + "rewards/correctness_reward_func": 3.1292516191800437, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -1.5782312601804733, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49401014546553296, + "step": 97 + }, + { + "completion_length": 123.80611928304036, + "epoch": 0.20638820638820637, + "grad_norm": 0.8113567233085632, + "kl": 0.6715494791666666, + "learning_rate": 1e-06, + "loss": 0.0122, + "reward": 9.800319592157999, + "reward_std": 0.7947500944137573, + "rewards/citation_reward_func": 4.948979536692302, + "rewards/correctness_reward_func": 4.455782175064087, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.6054421526690325, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 98 + }, + { + "completion_length": 131.07822799682617, + "epoch": 0.2084942084942085, + "grad_norm": 0.6562955975532532, + "kl": 0.7076822916666666, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 6.293738126754761, + "reward_std": 1.256690410276254, + "rewards/citation_reward_func": 4.838435252507527, + "rewards/correctness_reward_func": 1.122448980808258, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408162971337637, + "rewards/penalize_wrong_passages_reward_func": -0.4625850170850754, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4995203415552775, + "step": 99 + }, + { + "completion_length": 145.0952377319336, + "epoch": 0.2106002106002106, + "grad_norm": 0.9318126440048218, + "kl": 0.6321614583333334, + "learning_rate": 1e-06, + "loss": 0.007, + "reward": 7.648156483968099, + "reward_std": 2.704567869504293, + "rewards/citation_reward_func": 4.396258354187012, + "rewards/correctness_reward_func": 2.9421768188476562, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.537414958079656, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5001972069342931, + "step": 100 + }, + { + "completion_length": 158.9659856160482, + "epoch": 0.2127062127062127, + "grad_norm": 1.0392431020736694, + "kl": 0.6048177083333334, + "learning_rate": 1e-06, + "loss": 0.007, + "reward": 6.060523907343547, + "reward_std": 2.274144728978475, + "rewards/citation_reward_func": 3.9200679461161294, + "rewards/correctness_reward_func": 2.1938774983088174, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.9523809204498926, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 101 + }, + { + "completion_length": 128.78570938110352, + "epoch": 0.21481221481221482, + "grad_norm": 0.9947078227996826, + "kl": 0.814453125, + "learning_rate": 1e-06, + "loss": 0.0089, + "reward": 4.414262016614278, + "reward_std": 1.7643556793530781, + "rewards/citation_reward_func": 3.869047482808431, + "rewards/correctness_reward_func": 1.445578212539355, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -1.7959183355172474, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 102 + }, + { + "completion_length": 142.8333307902018, + "epoch": 0.21691821691821692, + "grad_norm": 0.915675699710846, + "kl": 0.7298177083333334, + "learning_rate": 1e-06, + "loss": 0.0089, + "reward": 6.6523605187733965, + "reward_std": 2.4479230642318726, + "rewards/citation_reward_func": 4.583333253860474, + "rewards/correctness_reward_func": 2.142857069770495, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.9727890839179357, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 103 + }, + { + "completion_length": 155.09863789876303, + "epoch": 0.21902421902421904, + "grad_norm": 0.8193255066871643, + "kl": 0.740234375, + "learning_rate": 1e-06, + "loss": 0.0085, + "reward": 6.158255100250244, + "reward_std": 2.398227870464325, + "rewards/citation_reward_func": 4.285714268684387, + "rewards/correctness_reward_func": 1.5816325942675273, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.6054421663284302, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49839108685652417, + "step": 104 + }, + { + "completion_length": 171.91156260172525, + "epoch": 0.22113022113022113, + "grad_norm": 1.1751590967178345, + "kl": 0.8697916666666666, + "learning_rate": 1e-06, + "loss": 0.0091, + "reward": 5.9842549959818525, + "reward_std": 3.8744190533955893, + "rewards/citation_reward_func": 4.0731290976206465, + "rewards/correctness_reward_func": 2.840135931968689, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -1.6190475821495056, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4961598018805186, + "step": 105 + }, + { + "completion_length": 161.8333307902018, + "epoch": 0.22323622323622325, + "grad_norm": 0.9442234635353088, + "kl": 0.6337890625, + "learning_rate": 1e-06, + "loss": 0.0079, + "reward": 8.165966113408407, + "reward_std": 2.3428567250569663, + "rewards/citation_reward_func": 4.396258473396301, + "rewards/correctness_reward_func": 3.418367306391398, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.5986394534508387, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 106 + }, + { + "completion_length": 147.87074534098306, + "epoch": 0.22534222534222534, + "grad_norm": 0.8142126202583313, + "kl": 0.828125, + "learning_rate": 1e-06, + "loss": 0.0118, + "reward": 8.370047489802042, + "reward_std": 1.7788663109143574, + "rewards/citation_reward_func": 4.498299201329549, + "rewards/correctness_reward_func": 3.7925169467926025, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.870748296380043, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 107 + }, + { + "completion_length": 135.7074826558431, + "epoch": 0.22744822744822746, + "grad_norm": 0.8248280882835388, + "kl": 0.8001302083333334, + "learning_rate": 1e-06, + "loss": 0.0104, + "reward": 5.128768642743428, + "reward_std": 2.9163127541542053, + "rewards/citation_reward_func": 4.608843406041463, + "rewards/correctness_reward_func": 3.1462583939234414, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -3.251700679461161, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49951693415641785, + "step": 108 + }, + { + "completion_length": 134.26870473225912, + "epoch": 0.22955422955422955, + "grad_norm": 1.0382417440414429, + "kl": 0.8639322916666666, + "learning_rate": 1e-06, + "loss": 0.0106, + "reward": 6.94229261080424, + "reward_std": 2.0236403942108154, + "rewards/citation_reward_func": 4.59183669090271, + "rewards/correctness_reward_func": 2.482993165651957, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -0.8775510191917419, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5001155783732733, + "step": 109 + }, + { + "completion_length": 139.26529947916666, + "epoch": 0.23166023166023167, + "grad_norm": 1.1215533018112183, + "kl": 0.7395833333333334, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 7.058823188145955, + "reward_std": 1.4295719663302104, + "rewards/citation_reward_func": 4.540816307067871, + "rewards/correctness_reward_func": 2.6360543767611184, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.068027191484968, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 110 + }, + { + "completion_length": 154.25169626871744, + "epoch": 0.23376623376623376, + "grad_norm": 0.8044309020042419, + "kl": 0.7018229166666666, + "learning_rate": 1e-06, + "loss": 0.0101, + "reward": 6.965285857518514, + "reward_std": 1.674074004093806, + "rewards/citation_reward_func": 4.4982991218566895, + "rewards/correctness_reward_func": 2.1938775181770325, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.625850323587656, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 111 + }, + { + "completion_length": 134.3401336669922, + "epoch": 0.23587223587223588, + "grad_norm": 0.9982149600982666, + "kl": 1.177734375, + "learning_rate": 1e-06, + "loss": 0.0141, + "reward": 8.769040743509928, + "reward_std": 1.9494565327962239, + "rewards/citation_reward_func": 4.6343536376953125, + "rewards/correctness_reward_func": 3.6054420471191406, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.4693877423803012, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49863258997599286, + "step": 112 + }, + { + "completion_length": 160.38775380452475, + "epoch": 0.23797823797823797, + "grad_norm": 1.1707923412322998, + "kl": 0.7259114583333334, + "learning_rate": 1e-06, + "loss": 0.008, + "reward": 6.240132649739583, + "reward_std": 2.118043899536133, + "rewards/citation_reward_func": 4.489795843760173, + "rewards/correctness_reward_func": 2.363945484161377, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -1.6122448245684307, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49863598744074505, + "step": 113 + }, + { + "completion_length": 148.93197123209634, + "epoch": 0.2400842400842401, + "grad_norm": 17.552936553955078, + "kl": 1.8580729166666667, + "learning_rate": 1e-06, + "loss": 0.0197, + "reward": 7.550319671630859, + "reward_std": 1.7853082915147145, + "rewards/citation_reward_func": 4.668367147445679, + "rewards/correctness_reward_func": 2.1938775231440864, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.31292515868941945, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 114 + }, + { + "completion_length": 124.07482655843098, + "epoch": 0.24219024219024218, + "grad_norm": 1.9848018884658813, + "kl": 1.2858072916666667, + "learning_rate": 1e-06, + "loss": 0.0149, + "reward": 8.497598648071289, + "reward_std": 1.8612036903699238, + "rewards/citation_reward_func": 4.727891047795613, + "rewards/correctness_reward_func": 3.061224420865377, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.29251699708402157, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 115 + }, + { + "completion_length": 138.43196868896484, + "epoch": 0.2442962442962443, + "grad_norm": 1.1814403533935547, + "kl": 2.2493489583333335, + "learning_rate": 1e-06, + "loss": 0.0241, + "reward": 4.523796101411183, + "reward_std": 1.643856147925059, + "rewards/citation_reward_func": 4.73639444510142, + "rewards/correctness_reward_func": 3.4523807565371194, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -4.503401279449463, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4914829283952713, + "step": 116 + }, + { + "completion_length": 133.05441919962564, + "epoch": 0.2464022464022464, + "grad_norm": 11.506072998046875, + "kl": 1.5598958333333333, + "learning_rate": 1e-06, + "loss": 0.0176, + "reward": 8.436374107996622, + "reward_std": 1.970664918422699, + "rewards/citation_reward_func": 4.489795843760173, + "rewards/correctness_reward_func": 3.9285713036855063, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.9319727718830109, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 117 + }, + { + "completion_length": 136.46938451131186, + "epoch": 0.2485082485082485, + "grad_norm": 1.1682838201522827, + "kl": 0.8463541666666666, + "learning_rate": 1e-06, + "loss": 0.0107, + "reward": 9.16936723391215, + "reward_std": 1.515159587065379, + "rewards/citation_reward_func": 4.685374021530151, + "rewards/correctness_reward_func": 4.1156461636225385, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.6326530451575915, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 118 + }, + { + "completion_length": 162.7789077758789, + "epoch": 0.25061425061425063, + "grad_norm": 15.175686836242676, + "kl": 2.0638020833333335, + "learning_rate": 1e-06, + "loss": 0.0226, + "reward": 6.813918272654216, + "reward_std": 1.9256925384203594, + "rewards/citation_reward_func": 4.625850280125936, + "rewards/correctness_reward_func": 2.7380951841672263, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -1.442176838715871, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 119 + }, + { + "completion_length": 130.61224365234375, + "epoch": 0.2527202527202527, + "grad_norm": 0.8811450004577637, + "kl": 0.7669270833333334, + "learning_rate": 1e-06, + "loss": 0.0138, + "reward": 9.869921366373697, + "reward_std": 1.0036840935548146, + "rewards/citation_reward_func": 4.727891127268474, + "rewards/correctness_reward_func": 4.506802638371785, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.3129251648982366, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49917339781920117, + "step": 120 + }, + { + "completion_length": 143.23129018147787, + "epoch": 0.2548262548262548, + "grad_norm": 1.374078631401062, + "kl": 0.7858072916666666, + "learning_rate": 1e-06, + "loss": 0.011, + "reward": 8.281694094340006, + "reward_std": 1.8786971072355907, + "rewards/citation_reward_func": 4.625850240389506, + "rewards/correctness_reward_func": 3.0442176262537637, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.3333333258827527, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4976802070935567, + "step": 121 + }, + { + "completion_length": 166.99319458007812, + "epoch": 0.2569322569322569, + "grad_norm": 2.271318197250366, + "kl": 1.203125, + "learning_rate": 1e-06, + "loss": 0.0131, + "reward": 7.029516935348511, + "reward_std": 2.833619177341461, + "rewards/citation_reward_func": 4.336734612782796, + "rewards/correctness_reward_func": 2.6020407478014627, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122104326883, + "rewards/penalize_wrong_passages_reward_func": -0.7551020185152689, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49890469014644623, + "step": 122 + }, + { + "completion_length": 164.01700592041016, + "epoch": 0.25903825903825906, + "grad_norm": 2.9062628746032715, + "kl": 2.017578125, + "learning_rate": 1e-06, + "loss": 0.0208, + "reward": 5.047608931859334, + "reward_std": 2.909602721532186, + "rewards/citation_reward_func": 3.5714284578959146, + "rewards/correctness_reward_func": 1.8707482020060222, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -1.238095223903656, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4965883692105611, + "step": 123 + }, + { + "completion_length": 166.54761759440103, + "epoch": 0.26114426114426115, + "grad_norm": 0.6600808501243591, + "kl": 0.7701822916666666, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 5.248527089754741, + "reward_std": 1.834931919972102, + "rewards/citation_reward_func": 4.013605276743571, + "rewards/correctness_reward_func": 2.602040727933248, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -2.3129250705242157, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4968264698982239, + "step": 124 + }, + { + "completion_length": 140.8945515950521, + "epoch": 0.26325026325026324, + "grad_norm": 23.109085083007812, + "kl": 0.9609375, + "learning_rate": 1e-06, + "loss": 0.0129, + "reward": 8.27480951944987, + "reward_std": 1.7910826206207275, + "rewards/citation_reward_func": 4.49829916159312, + "rewards/correctness_reward_func": 3.299319644769033, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.42176870070397854, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 125 + }, + { + "completion_length": 168.1904729207357, + "epoch": 0.26535626535626533, + "grad_norm": 0.9592843055725098, + "kl": 1.1588541666666667, + "learning_rate": 1e-06, + "loss": 0.0141, + "reward": 7.51559845606486, + "reward_std": 2.5106443961461387, + "rewards/citation_reward_func": 4.532312790552775, + "rewards/correctness_reward_func": 3.945578098297119, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -1.6530611912409465, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4968910962343216, + "step": 126 + }, + { + "completion_length": 168.1394526163737, + "epoch": 0.2674622674622675, + "grad_norm": 0.9258236885070801, + "kl": 1.9557291666666667, + "learning_rate": 1e-06, + "loss": 0.022, + "reward": 6.473989725112915, + "reward_std": 2.391785681247711, + "rewards/citation_reward_func": 4.234693805376689, + "rewards/correctness_reward_func": 2.653061161438624, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020383834839, + "rewards/penalize_wrong_passages_reward_func": -1.1564625725150108, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49779925247033435, + "step": 127 + }, + { + "completion_length": 166.78230794270834, + "epoch": 0.26956826956826957, + "grad_norm": 0.8637123703956604, + "kl": 0.7350260416666666, + "learning_rate": 1e-06, + "loss": 0.0085, + "reward": 7.016306241353353, + "reward_std": 2.6598212321599326, + "rewards/citation_reward_func": 4.345238010088603, + "rewards/correctness_reward_func": 2.551020304361979, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.8299319446086884, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 128 + }, + { + "completion_length": 146.89795430501303, + "epoch": 0.27167427167427166, + "grad_norm": 1.0868693590164185, + "kl": 0.8072916666666666, + "learning_rate": 1e-06, + "loss": 0.0113, + "reward": 8.904061237970987, + "reward_std": 1.806675414244334, + "rewards/citation_reward_func": 4.702380816141765, + "rewards/correctness_reward_func": 3.537414868672689, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.28571428172290325, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 129 + }, + { + "completion_length": 131.71428553263345, + "epoch": 0.27378027378027375, + "grad_norm": 0.6997035145759583, + "kl": 0.8606770833333334, + "learning_rate": 1e-06, + "loss": 0.0136, + "reward": 8.30031935373942, + "reward_std": 1.2167588621377945, + "rewards/citation_reward_func": 4.761904716491699, + "rewards/correctness_reward_func": 3.1972788075606027, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.4897959188868602, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 130 + }, + { + "completion_length": 173.03401056925455, + "epoch": 0.2758862758862759, + "grad_norm": 2.1735517978668213, + "kl": 0.9928385416666666, + "learning_rate": 1e-06, + "loss": 0.0111, + "reward": 5.81254776318868, + "reward_std": 2.344173808892568, + "rewards/citation_reward_func": 4.47278904914856, + "rewards/correctness_reward_func": 1.6666665971279144, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -1.1700680057207744, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49622102578481037, + "step": 131 + }, + { + "completion_length": 183.0442148844401, + "epoch": 0.277992277992278, + "grad_norm": 0.7319064140319824, + "kl": 0.7174479166666666, + "learning_rate": 1e-06, + "loss": 0.0099, + "reward": 7.494925260543823, + "reward_std": 2.043013642231623, + "rewards/citation_reward_func": 4.693877458572388, + "rewards/correctness_reward_func": 2.9251699844996133, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408162971337637, + "rewards/penalize_wrong_passages_reward_func": -0.9183673312266668, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49832646548748016, + "step": 132 + }, + { + "completion_length": 169.08843485514322, + "epoch": 0.2800982800982801, + "grad_norm": 9.555499076843262, + "kl": 1.4869791666666667, + "learning_rate": 1e-06, + "loss": 0.0184, + "reward": 8.021408240000406, + "reward_std": 1.6322008272012074, + "rewards/citation_reward_func": 4.625850280125936, + "rewards/correctness_reward_func": 3.3503400087356567, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.9047619005044302, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 133 + }, + { + "completion_length": 181.9047597249349, + "epoch": 0.28220428220428223, + "grad_norm": 6.901526927947998, + "kl": 0.9251302083333334, + "learning_rate": 1e-06, + "loss": 0.0095, + "reward": 2.1488198041915894, + "reward_std": 2.6775263945261636, + "rewards/citation_reward_func": 4.277210791905721, + "rewards/correctness_reward_func": 2.1768707036972046, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -5.251700500647227, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4974591185649236, + "step": 134 + }, + { + "completion_length": 144.98299153645834, + "epoch": 0.2843102843102843, + "grad_norm": 0.97262042760849, + "kl": 0.7350260416666666, + "learning_rate": 1e-06, + "loss": 0.0096, + "reward": 6.09113605817159, + "reward_std": 0.9187750220298767, + "rewards/citation_reward_func": 4.804421742757161, + "rewards/correctness_reward_func": 0.40816325942675274, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.12244897956649463, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 135 + }, + { + "completion_length": 148.2789077758789, + "epoch": 0.2864162864162864, + "grad_norm": 0.9249170422554016, + "kl": 1.5032552083333333, + "learning_rate": 1e-06, + "loss": 0.0164, + "reward": 7.3938571612040205, + "reward_std": 1.8515236973762512, + "rewards/citation_reward_func": 4.583333253860474, + "rewards/correctness_reward_func": 2.1258502875765166, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.2653061170130968, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 136 + }, + { + "completion_length": 179.51360066731772, + "epoch": 0.2885222885222885, + "grad_norm": 1.422672986984253, + "kl": 0.94921875, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 2.7681804299354553, + "reward_std": 2.8826077977816262, + "rewards/citation_reward_func": 3.4778911074002585, + "rewards/correctness_reward_func": 2.2619047264258065, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.3571428507566452, + "rewards/penalize_wrong_passages_reward_func": -3.428571422894796, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4858672966559728, + "step": 137 + }, + { + "completion_length": 181.78911336263022, + "epoch": 0.29062829062829065, + "grad_norm": 0.8672325611114502, + "kl": 1.7734375, + "learning_rate": 1e-06, + "loss": 0.0195, + "reward": 1.4140309020876884, + "reward_std": 1.965984453757604, + "rewards/citation_reward_func": 3.5884352922439575, + "rewards/correctness_reward_func": 2.1258503049612045, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.30612244705359143, + "rewards/penalize_wrong_passages_reward_func": -4.965986222028732, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.47185367345809937, + "step": 138 + }, + { + "completion_length": 174.54421361287436, + "epoch": 0.29273429273429274, + "grad_norm": 103.85633850097656, + "kl": 8.597005208333334, + "learning_rate": 1e-06, + "loss": 0.0873, + "reward": 6.572428623835246, + "reward_std": 1.6893009940783184, + "rewards/citation_reward_func": 4.676870663960774, + "rewards/correctness_reward_func": 3.7585033178329468, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -2.8639455369363227, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 139 + }, + { + "completion_length": 142.275510152181, + "epoch": 0.29484029484029484, + "grad_norm": 0.7605285048484802, + "kl": 0.7317708333333334, + "learning_rate": 1e-06, + "loss": 0.0092, + "reward": 8.055418491363525, + "reward_std": 1.8336223363876343, + "rewards/citation_reward_func": 4.795918226242065, + "rewards/correctness_reward_func": 3.333333214124044, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.0204081137975056, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 140 + }, + { + "completion_length": 167.7551015218099, + "epoch": 0.29694629694629693, + "grad_norm": 0.8190628290176392, + "kl": 0.7721354166666666, + "learning_rate": 1e-06, + "loss": 0.0094, + "reward": 5.228380918502808, + "reward_std": 2.086130917072296, + "rewards/citation_reward_func": 4.489795843760173, + "rewards/correctness_reward_func": 2.1088434855143228, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.3571428606907527, + "rewards/penalize_wrong_passages_reward_func": -1.9999999403953552, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4902856449286143, + "step": 141 + }, + { + "completion_length": 152.649658203125, + "epoch": 0.2990522990522991, + "grad_norm": 0.787834882736206, + "kl": 0.6796875, + "learning_rate": 1e-06, + "loss": 0.0086, + "reward": 6.84282660484314, + "reward_std": 2.0990612308184304, + "rewards/citation_reward_func": 4.6343536376953125, + "rewards/correctness_reward_func": 1.9557822346687317, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.2551020383834839, + "rewards/penalize_wrong_passages_reward_func": -0.48299319048722583, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4958876967430115, + "step": 142 + }, + { + "completion_length": 142.88095092773438, + "epoch": 0.30115830115830117, + "grad_norm": 0.9918654561042786, + "kl": 0.6673177083333334, + "learning_rate": 1e-06, + "loss": 0.0074, + "reward": 7.225874503453572, + "reward_std": 2.072286307811737, + "rewards/citation_reward_func": 4.642857074737549, + "rewards/correctness_reward_func": 2.074829876422882, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.48979591329892475, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4979829291502635, + "step": 143 + }, + { + "completion_length": 149.2278849283854, + "epoch": 0.30326430326430326, + "grad_norm": 91.97071838378906, + "kl": 4.718098958333333, + "learning_rate": 1e-06, + "loss": 0.0481, + "reward": 5.090360681215922, + "reward_std": 2.203482369581858, + "rewards/citation_reward_func": 4.03911554813385, + "rewards/correctness_reward_func": 1.0884353493650754, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.459183673063914, + "rewards/penalize_wrong_passages_reward_func": -0.5782312800486883, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5002244164546331, + "step": 144 + }, + { + "completion_length": 147.08503214518228, + "epoch": 0.30537030537030535, + "grad_norm": 2.862992525100708, + "kl": 3.07421875, + "learning_rate": 1e-06, + "loss": 0.0321, + "reward": 2.5679593483606973, + "reward_std": 2.9004951119422913, + "rewards/citation_reward_func": 4.217686931292216, + "rewards/correctness_reward_func": 0.5272108738621076, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -2.2448979218800864, + "rewards/penalize_wrong_passages_reward_func": -0.9319727768500646, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4999319016933441, + "step": 145 + }, + { + "completion_length": 135.28570938110352, + "epoch": 0.3074763074763075, + "grad_norm": 0.7194350361824036, + "kl": 0.7213541666666666, + "learning_rate": 1e-06, + "loss": 0.0112, + "reward": 8.593265215555826, + "reward_std": 1.1670270164807637, + "rewards/citation_reward_func": 4.804421663284302, + "rewards/correctness_reward_func": 3.673469305038452, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.8843536997834841, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4997278203566869, + "step": 146 + }, + { + "completion_length": 153.82992808024088, + "epoch": 0.3095823095823096, + "grad_norm": 0.6770069003105164, + "kl": 0.6171875, + "learning_rate": 1e-06, + "loss": 0.0086, + "reward": 6.921067953109741, + "reward_std": 1.4704915285110474, + "rewards/citation_reward_func": 4.753401279449463, + "rewards/correctness_reward_func": 2.2619046941399574, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -1.0952380833526452, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 147 + }, + { + "completion_length": 158.23129018147787, + "epoch": 0.3116883116883117, + "grad_norm": 1.632519006729126, + "kl": 0.8046875, + "learning_rate": 1e-06, + "loss": 0.0091, + "reward": 6.650659879048665, + "reward_std": 2.724838455518087, + "rewards/citation_reward_func": 4.6598639488220215, + "rewards/correctness_reward_func": 3.707482894261678, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -2.462584992249807, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 148 + }, + { + "completion_length": 159.6292495727539, + "epoch": 0.3137943137943138, + "grad_norm": 0.969252347946167, + "kl": 0.7467447916666666, + "learning_rate": 1e-06, + "loss": 0.0113, + "reward": 8.891149520874023, + "reward_std": 1.7988839149475098, + "rewards/citation_reward_func": 4.5153060754140215, + "rewards/correctness_reward_func": 3.8605440855026245, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.48299319793780643, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4982924511035283, + "step": 149 + }, + { + "completion_length": 156.11564127604166, + "epoch": 0.3159003159003159, + "grad_norm": 1.4659888744354248, + "kl": 1.2708333333333333, + "learning_rate": 1e-06, + "loss": 0.0148, + "reward": 6.316040953000386, + "reward_std": 3.3565998673439026, + "rewards/citation_reward_func": 4.098639369010925, + "rewards/correctness_reward_func": 2.7040815552075705, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -0.8299319446086884, + "rewards/unicode_reward_func": -0.3401360511779785, + "rewards/xmlcount_reward_func": 0.48951015373071033, + "step": 150 + }, + { + "completion_length": 139.925168355306, + "epoch": 0.318006318006318, + "grad_norm": 10.20754337310791, + "kl": 1.380859375, + "learning_rate": 1e-06, + "loss": 0.0168, + "reward": 8.006098747253418, + "reward_std": 1.9967832962671916, + "rewards/citation_reward_func": 4.668367226918538, + "rewards/correctness_reward_func": 3.3333332737286887, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.7891156375408173, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 151 + }, + { + "completion_length": 150.03061040242514, + "epoch": 0.3201123201123201, + "grad_norm": 0.7859293818473816, + "kl": 0.6477864583333334, + "learning_rate": 1e-06, + "loss": 0.0091, + "reward": 6.6795713901519775, + "reward_std": 1.4083468516667683, + "rewards/citation_reward_func": 4.702380895614624, + "rewards/correctness_reward_func": 2.9761904080708823, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -1.99999996026357, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 152 + }, + { + "completion_length": 139.53741073608398, + "epoch": 0.3222183222183222, + "grad_norm": 1.0564855337142944, + "kl": 0.6477864583333334, + "learning_rate": 1e-06, + "loss": 0.0074, + "reward": 7.244768857955933, + "reward_std": 2.8033066193262735, + "rewards/citation_reward_func": 4.7534011999766035, + "rewards/correctness_reward_func": 2.4319727420806885, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.8367346984644731, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49816999832789105, + "step": 153 + }, + { + "completion_length": 130.1088409423828, + "epoch": 0.32432432432432434, + "grad_norm": 0.7908992767333984, + "kl": 0.8040364583333334, + "learning_rate": 1e-06, + "loss": 0.0104, + "reward": 6.212700764338176, + "reward_std": 1.2905257244904835, + "rewards/citation_reward_func": 4.566326379776001, + "rewards/correctness_reward_func": 3.214285676678022, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -2.517006744941076, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5001155783732733, + "step": 154 + }, + { + "completion_length": 134.5067990620931, + "epoch": 0.32643032643032643, + "grad_norm": 1.2375706434249878, + "kl": 0.9283854166666666, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 5.958527167638143, + "reward_std": 2.26580548286438, + "rewards/citation_reward_func": 4.0391156276067095, + "rewards/correctness_reward_func": 3.5714284976323447, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -2.6462584137916565, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4942414363225301, + "step": 155 + }, + { + "completion_length": 129.54421361287436, + "epoch": 0.3285363285363285, + "grad_norm": 0.7165351510047913, + "kl": 0.9837239583333334, + "learning_rate": 1e-06, + "loss": 0.0114, + "reward": 6.3339049021403, + "reward_std": 1.5690179268519084, + "rewards/citation_reward_func": 4.175169944763184, + "rewards/correctness_reward_func": 3.89455775419871, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -2.7346938153107962, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49887068569660187, + "step": 156 + }, + { + "completion_length": 141.46258544921875, + "epoch": 0.3306423306423306, + "grad_norm": 0.8587003946304321, + "kl": 0.8118489583333334, + "learning_rate": 1e-06, + "loss": 0.0113, + "reward": 8.448506752649942, + "reward_std": 1.4765484134356182, + "rewards/citation_reward_func": 4.659863789876302, + "rewards/correctness_reward_func": 3.622448901335398, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.7823129097620646, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49952714145183563, + "step": 157 + }, + { + "completion_length": 130.8979581197103, + "epoch": 0.33274833274833276, + "grad_norm": 0.7178787589073181, + "kl": 0.8391927083333334, + "learning_rate": 1e-06, + "loss": 0.0123, + "reward": 7.7033811410268145, + "reward_std": 0.9533064067363739, + "rewards/citation_reward_func": 4.889455636342366, + "rewards/correctness_reward_func": 2.5680271685123444, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.7551020495593548, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 158 + }, + { + "completion_length": 160.54081217447916, + "epoch": 0.33485433485433486, + "grad_norm": 0.7822726964950562, + "kl": 0.6728515625, + "learning_rate": 1e-06, + "loss": 0.008, + "reward": 5.943180481592814, + "reward_std": 2.2619903087615967, + "rewards/citation_reward_func": 4.4897957642873125, + "rewards/correctness_reward_func": 3.061224381128947, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -2.605442168811957, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4976019710302353, + "step": 159 + }, + { + "completion_length": 168.82312520345053, + "epoch": 0.33696033696033695, + "grad_norm": 1.056015968322754, + "kl": 0.6139322916666666, + "learning_rate": 1e-06, + "loss": 0.0065, + "reward": 2.2140272160371146, + "reward_std": 2.2160128553708396, + "rewards/citation_reward_func": 4.540816148122151, + "rewards/correctness_reward_func": 2.7380951642990112, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -6.061224301656087, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.496340071161588, + "step": 160 + }, + { + "completion_length": 139.75170135498047, + "epoch": 0.33906633906633904, + "grad_norm": 1.41465163230896, + "kl": 0.9140625, + "learning_rate": 1e-06, + "loss": 0.0111, + "reward": 7.655176599820455, + "reward_std": 1.8789227455854416, + "rewards/citation_reward_func": 4.685374021530151, + "rewards/correctness_reward_func": 3.435374101003011, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -1.3605441848436992, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4970135341087977, + "step": 161 + }, + { + "completion_length": 142.07142639160156, + "epoch": 0.3411723411723412, + "grad_norm": 0.6721370220184326, + "kl": 0.8606770833333334, + "learning_rate": 1e-06, + "loss": 0.0116, + "reward": 6.773833195368449, + "reward_std": 1.1285482396682103, + "rewards/citation_reward_func": 4.7704081535339355, + "rewards/correctness_reward_func": 3.639455646276474, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -2.632652991140882, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4966223786274592, + "step": 162 + }, + { + "completion_length": 163.81631978352866, + "epoch": 0.3432783432783433, + "grad_norm": 11.225006103515625, + "kl": 1.923828125, + "learning_rate": 1e-06, + "loss": 0.0198, + "reward": 5.977557897567749, + "reward_std": 2.643218537171682, + "rewards/citation_reward_func": 4.404761910438538, + "rewards/correctness_reward_func": 1.3095237612724304, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -0.42857142103215057, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4979659269253413, + "step": 163 + }, + { + "completion_length": 178.6190414428711, + "epoch": 0.34538434538434537, + "grad_norm": 1.1446784734725952, + "kl": 0.9283854166666666, + "learning_rate": 1e-06, + "loss": 0.0095, + "reward": 5.919367551803589, + "reward_std": 2.9474711418151855, + "rewards/citation_reward_func": 4.183673461278279, + "rewards/correctness_reward_func": 1.5986394087473552, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.6598639339208603, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 164 + }, + { + "completion_length": 167.16326395670572, + "epoch": 0.3474903474903475, + "grad_norm": 1.335066318511963, + "kl": 0.7757161458333334, + "learning_rate": 1e-06, + "loss": 0.0083, + "reward": 6.390279054641724, + "reward_std": 2.944413344065348, + "rewards/citation_reward_func": 4.642856995264689, + "rewards/correctness_reward_func": 1.666666607062022, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.7142857015132904, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49912238121032715, + "step": 165 + }, + { + "completion_length": 183.30271657307944, + "epoch": 0.3495963495963496, + "grad_norm": 2.683114767074585, + "kl": 1.072265625, + "learning_rate": 1e-06, + "loss": 0.012, + "reward": 4.426462570826213, + "reward_std": 2.4770036339759827, + "rewards/citation_reward_func": 4.285714228947957, + "rewards/correctness_reward_func": 1.8367346326510112, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.40816326439380646, + "rewards/penalize_wrong_passages_reward_func": -2.2857142289479575, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49789108832677204, + "step": 166 + }, + { + "completion_length": 171.02040354410806, + "epoch": 0.3517023517023517, + "grad_norm": 1.2051540613174438, + "kl": 1.9908854166666667, + "learning_rate": 1e-06, + "loss": 0.0204, + "reward": 3.414265056451162, + "reward_std": 2.233205517133077, + "rewards/citation_reward_func": 4.311224381128947, + "rewards/correctness_reward_func": 0.7653060927987099, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.45918366809686023, + "rewards/penalize_wrong_passages_reward_func": -2.204081585009893, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 167 + }, + { + "completion_length": 179.71087900797525, + "epoch": 0.3538083538083538, + "grad_norm": 1.0430907011032104, + "kl": 0.7047526041666666, + "learning_rate": 1e-06, + "loss": 0.0077, + "reward": 3.5896018147468567, + "reward_std": 2.7991716265678406, + "rewards/citation_reward_func": 4.192176858584086, + "rewards/correctness_reward_func": 0.8163264989852905, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.3571428507566452, + "rewards/penalize_wrong_passages_reward_func": -1.721088429292043, + "rewards/unicode_reward_func": -0.3401360511779785, + "rewards/xmlcount_reward_func": 0.4994659175475438, + "step": 168 + }, + { + "completion_length": 180.67686462402344, + "epoch": 0.35591435591435594, + "grad_norm": 0.9326726198196411, + "kl": 0.4820963541666667, + "learning_rate": 1e-06, + "loss": 0.006, + "reward": 2.6223334272702536, + "reward_std": 3.0509551763534546, + "rewards/citation_reward_func": 4.370748281478882, + "rewards/correctness_reward_func": 0.2551020284493764, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.6632653027772903, + "rewards/penalize_wrong_passages_reward_func": -2.170067938665549, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4998842825492223, + "step": 169 + }, + { + "completion_length": 153.03741200764975, + "epoch": 0.35802035802035803, + "grad_norm": 5.393792629241943, + "kl": 1.2434895833333333, + "learning_rate": 1e-06, + "loss": 0.0135, + "reward": 6.667666673660278, + "reward_std": 2.0974193811416626, + "rewards/citation_reward_func": 4.557823101679484, + "rewards/correctness_reward_func": 1.9217686504125595, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122104326883, + "rewards/penalize_wrong_passages_reward_func": -0.6598639289538065, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 170 + }, + { + "completion_length": 155.34353510538736, + "epoch": 0.3601263601263601, + "grad_norm": 1.1496800184249878, + "kl": 0.7903645833333334, + "learning_rate": 1e-06, + "loss": 0.0093, + "reward": 5.24929936726888, + "reward_std": 1.6736542582511902, + "rewards/citation_reward_func": 4.608843485514323, + "rewards/correctness_reward_func": 2.534013517200947, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -2.8435374101003013, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 171 + }, + { + "completion_length": 138.33332951863608, + "epoch": 0.3622323622323622, + "grad_norm": 1.926382303237915, + "kl": 1.1145833333333333, + "learning_rate": 1e-06, + "loss": 0.0128, + "reward": 7.129585027694702, + "reward_std": 1.6755299766858418, + "rewards/citation_reward_func": 4.506802598635356, + "rewards/correctness_reward_func": 2.499999930461248, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.8775510142246882, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5003332595030466, + "step": 172 + }, + { + "completion_length": 168.19727579752603, + "epoch": 0.36433836433836436, + "grad_norm": 2.8209266662597656, + "kl": 0.8522135416666666, + "learning_rate": 1e-06, + "loss": 0.0092, + "reward": 6.785959243774414, + "reward_std": 2.833313842614492, + "rewards/citation_reward_func": 4.30272098382314, + "rewards/correctness_reward_func": 2.2278911074002585, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.6394557654857635, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49854415158430737, + "step": 173 + }, + { + "completion_length": 174.9455769856771, + "epoch": 0.36644436644436645, + "grad_norm": 1.1766891479492188, + "kl": 0.8248697916666666, + "learning_rate": 1e-06, + "loss": 0.0087, + "reward": 3.729578137397766, + "reward_std": 2.6636635859807334, + "rewards/citation_reward_func": 3.988095204035441, + "rewards/correctness_reward_func": 2.176870663960775, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -3.278911530971527, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4965849667787552, + "step": 174 + }, + { + "completion_length": 185.70067342122397, + "epoch": 0.36855036855036855, + "grad_norm": 0.9315419793128967, + "kl": 0.59375, + "learning_rate": 1e-06, + "loss": 0.0069, + "reward": 2.453159729639689, + "reward_std": 1.8253816564877827, + "rewards/citation_reward_func": 4.498299280802409, + "rewards/correctness_reward_func": 1.1564625352621078, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -4.081632614135742, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48207135995229083, + "step": 175 + }, + { + "completion_length": 195.724484761556, + "epoch": 0.37065637065637064, + "grad_norm": 1.8000379800796509, + "kl": 0.7600911458333334, + "learning_rate": 1e-06, + "loss": 0.0083, + "reward": 1.4989728132883708, + "reward_std": 3.75508181254069, + "rewards/citation_reward_func": 3.7329931259155273, + "rewards/correctness_reward_func": 1.9387754102547963, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.30612244705359143, + "rewards/penalize_wrong_passages_reward_func": -4.659863829612732, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4666598041852315, + "step": 176 + }, + { + "completion_length": 153.51360066731772, + "epoch": 0.3727623727623728, + "grad_norm": 1.1415152549743652, + "kl": 0.765625, + "learning_rate": 1e-06, + "loss": 0.0092, + "reward": 5.929571429888408, + "reward_std": 2.2657308280467987, + "rewards/citation_reward_func": 4.59183669090271, + "rewards/correctness_reward_func": 2.772108773390452, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -2.3333332737286887, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 177 + }, + { + "completion_length": 155.71088155110678, + "epoch": 0.3748683748683749, + "grad_norm": 0.7995659708976746, + "kl": 0.7845052083333334, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 4.895190397898356, + "reward_std": 1.4263835549354553, + "rewards/citation_reward_func": 4.583333253860474, + "rewards/correctness_reward_func": 0.5782312626640002, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -1.1564625749985378, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49553055067857105, + "step": 178 + }, + { + "completion_length": 193.20067596435547, + "epoch": 0.37697437697437697, + "grad_norm": 0.8942872881889343, + "kl": 0.611328125, + "learning_rate": 1e-06, + "loss": 0.0072, + "reward": 3.979758540789286, + "reward_std": 2.5980204939842224, + "rewards/citation_reward_func": 3.9540814558664956, + "rewards/correctness_reward_func": 0.5612244804700216, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -1.1700680057207744, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4950645665327708, + "step": 179 + }, + { + "completion_length": 183.89115142822266, + "epoch": 0.37908037908037906, + "grad_norm": 0.9261636734008789, + "kl": 0.5338541666666666, + "learning_rate": 1e-06, + "loss": 0.006, + "reward": 5.109836935997009, + "reward_std": 2.6890705625216165, + "rewards/citation_reward_func": 4.285714149475098, + "rewards/correctness_reward_func": 0.7823129172126452, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.40816326439380646, + "rewards/penalize_wrong_passages_reward_func": -0.5442176734407743, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 180 + }, + { + "completion_length": 175.2687021891276, + "epoch": 0.3811863811863812, + "grad_norm": 3.168766736984253, + "kl": 1.28515625, + "learning_rate": 1e-06, + "loss": 0.0135, + "reward": 3.4397687713305154, + "reward_std": 5.042555729548137, + "rewards/citation_reward_func": 4.336734612782796, + "rewards/correctness_reward_func": 0.9863945295413336, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -2.1428570598363876, + "rewards/penalize_wrong_passages_reward_func": -0.7346938649813334, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 181 + }, + { + "completion_length": 131.99999618530273, + "epoch": 0.3832923832923833, + "grad_norm": 0.970337986946106, + "kl": 0.8287760416666666, + "learning_rate": 1e-06, + "loss": 0.0125, + "reward": 9.27480936050415, + "reward_std": 1.602648675441742, + "rewards/citation_reward_func": 4.821428537368774, + "rewards/correctness_reward_func": 3.8775508801142373, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.27210883299509686, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 182 + }, + { + "completion_length": 145.39455540974936, + "epoch": 0.3853983853983854, + "grad_norm": 0.8163847923278809, + "kl": 0.6829427083333334, + "learning_rate": 1e-06, + "loss": 0.0089, + "reward": 6.255153139432271, + "reward_std": 1.4988686243693035, + "rewards/citation_reward_func": 4.319727738698323, + "rewards/correctness_reward_func": 2.091836671034495, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -1.1564625625809033, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5000509520371755, + "step": 183 + }, + { + "completion_length": 154.55782063802084, + "epoch": 0.3875043875043875, + "grad_norm": 0.6219596862792969, + "kl": 0.6966145833333334, + "learning_rate": 1e-06, + "loss": 0.0103, + "reward": 7.526914755503337, + "reward_std": 1.4331722855567932, + "rewards/citation_reward_func": 4.370748162269592, + "rewards/correctness_reward_func": 2.7551020085811615, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.5442176690946022, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4980033338069916, + "step": 184 + }, + { + "completion_length": 168.3945566813151, + "epoch": 0.38961038961038963, + "grad_norm": 0.7838864922523499, + "kl": 0.6295572916666666, + "learning_rate": 1e-06, + "loss": 0.0084, + "reward": 6.258248249689738, + "reward_std": 2.3601081371307373, + "rewards/citation_reward_func": 4.421768665313721, + "rewards/correctness_reward_func": 1.8877550562222798, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.30612244705359143, + "rewards/penalize_wrong_passages_reward_func": -0.7414965877930323, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4980441480875015, + "step": 185 + }, + { + "completion_length": 186.4591827392578, + "epoch": 0.3917163917163917, + "grad_norm": 0.7081553339958191, + "kl": 0.5416666666666666, + "learning_rate": 1e-06, + "loss": 0.0066, + "reward": 6.1795713901519775, + "reward_std": 2.27645073334376, + "rewards/citation_reward_func": 4.362244725227356, + "rewards/correctness_reward_func": 1.5986394087473552, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.6802720955262581, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 186 + }, + { + "completion_length": 186.2891108194987, + "epoch": 0.3938223938223938, + "grad_norm": 0.9178943037986755, + "kl": 0.6373697916666666, + "learning_rate": 1e-06, + "loss": 0.0077, + "reward": 6.135353724161784, + "reward_std": 1.6183502872784932, + "rewards/citation_reward_func": 4.430272022883098, + "rewards/correctness_reward_func": 1.3435373802979786, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.6394557605187098, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 187 + }, + { + "completion_length": 157.90816497802734, + "epoch": 0.3959283959283959, + "grad_norm": 0.9123345613479614, + "kl": 0.740234375, + "learning_rate": 1e-06, + "loss": 0.0087, + "reward": 4.997598648071289, + "reward_std": 1.726643015940984, + "rewards/citation_reward_func": 4.268707315127055, + "rewards/correctness_reward_func": 3.6394556363423667, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -3.8095237016677856, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 188 + }, + { + "completion_length": 184.80271657307944, + "epoch": 0.39803439803439805, + "grad_norm": 0.759057343006134, + "kl": 0.6204427083333334, + "learning_rate": 1e-06, + "loss": 0.0076, + "reward": 4.643901387850444, + "reward_std": 2.2857230405012765, + "rewards/citation_reward_func": 3.9710883696873984, + "rewards/correctness_reward_func": 0.7312925085425377, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.7278911471366882, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4959421157836914, + "step": 189 + }, + { + "completion_length": 177.35033671061197, + "epoch": 0.40014040014040014, + "grad_norm": 1.190181016921997, + "kl": 0.6712239583333334, + "learning_rate": 1e-06, + "loss": 0.0086, + "reward": 4.23569397131602, + "reward_std": 1.947903613249461, + "rewards/citation_reward_func": 4.234693805376689, + "rewards/correctness_reward_func": 0.6292516887187958, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.8673469324906667, + "rewards/penalize_wrong_passages_reward_func": -0.7619047512610754, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 190 + }, + { + "completion_length": 186.51020050048828, + "epoch": 0.40224640224640223, + "grad_norm": 0.8846628069877625, + "kl": 0.6009114583333334, + "learning_rate": 1e-06, + "loss": 0.0069, + "reward": 3.0384148756663003, + "reward_std": 3.5275836189587912, + "rewards/citation_reward_func": 3.724489768346151, + "rewards/correctness_reward_func": 1.6666666318972905, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -2.0408162673314414, + "rewards/penalize_wrong_passages_reward_func": -1.3129251599311829, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 191 + }, + { + "completion_length": 162.79251352945963, + "epoch": 0.4043524043524043, + "grad_norm": 0.596415638923645, + "kl": 1.59765625, + "learning_rate": 1e-06, + "loss": 0.0191, + "reward": 8.544118881225586, + "reward_std": 1.6781288782755535, + "rewards/citation_reward_func": 4.642857074737549, + "rewards/correctness_reward_func": 3.452380895614624, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.49659862741827965, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4964999357859294, + "step": 192 + }, + { + "completion_length": 150.53061167399088, + "epoch": 0.4064584064584065, + "grad_norm": 0.6676158308982849, + "kl": 0.748046875, + "learning_rate": 1e-06, + "loss": 0.0103, + "reward": 8.596237738927206, + "reward_std": 1.2864967584609985, + "rewards/citation_reward_func": 4.9319727420806885, + "rewards/correctness_reward_func": 3.0782312154769897, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.41496598223845166, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 193 + }, + { + "completion_length": 166.77210489908853, + "epoch": 0.40856440856440857, + "grad_norm": 0.808198094367981, + "kl": 0.8684895833333334, + "learning_rate": 1e-06, + "loss": 0.0097, + "reward": 5.753748019536336, + "reward_std": 2.120794693628947, + "rewards/citation_reward_func": 4.2772107521692915, + "rewards/correctness_reward_func": 1.275510181983312, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.6462584882974625, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5003468642632166, + "step": 194 + }, + { + "completion_length": 165.4149627685547, + "epoch": 0.41067041067041066, + "grad_norm": 0.9030170440673828, + "kl": 4.500651041666667, + "learning_rate": 1e-06, + "loss": 0.0464, + "reward": 5.399826486905416, + "reward_std": 2.171577036380768, + "rewards/citation_reward_func": 4.387755036354065, + "rewards/correctness_reward_func": 0.7142856965462366, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -0.44217686417202157, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49676524599393207, + "step": 195 + }, + { + "completion_length": 186.23129018147787, + "epoch": 0.41277641277641275, + "grad_norm": 0.6888033151626587, + "kl": 0.78515625, + "learning_rate": 1e-06, + "loss": 0.0109, + "reward": 7.719346761703491, + "reward_std": 1.9923105090856552, + "rewards/citation_reward_func": 4.32823113600413, + "rewards/correctness_reward_func": 3.163265287876129, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.6122449040412903, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4965577671925227, + "step": 196 + }, + { + "completion_length": 175.53741200764975, + "epoch": 0.4148824148824149, + "grad_norm": 0.8608536124229431, + "kl": 1.0234375, + "learning_rate": 1e-06, + "loss": 0.0122, + "reward": 6.424663424491882, + "reward_std": 2.1536823511123657, + "rewards/citation_reward_func": 4.727891127268474, + "rewards/correctness_reward_func": 1.751700686911742, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.7278911527246237, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49779245257377625, + "step": 197 + }, + { + "completion_length": 162.51700337727866, + "epoch": 0.416988416988417, + "grad_norm": 0.6589003205299377, + "kl": 0.8854166666666666, + "learning_rate": 1e-06, + "loss": 0.0109, + "reward": 7.978091875712077, + "reward_std": 1.7386022607485454, + "rewards/citation_reward_func": 4.455782214800517, + "rewards/correctness_reward_func": 3.1292516787846885, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.6054421712954839, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4984999398390452, + "step": 198 + }, + { + "completion_length": 149.35033416748047, + "epoch": 0.4190944190944191, + "grad_norm": 0.5710175633430481, + "kl": 0.8072916666666666, + "learning_rate": 1e-06, + "loss": 0.0111, + "reward": 6.779911359151204, + "reward_std": 0.9170621037483215, + "rewards/citation_reward_func": 4.76190463701884, + "rewards/correctness_reward_func": 1.751700649658839, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.7346938600142797, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 199 + }, + { + "completion_length": 156.374148050944, + "epoch": 0.4212004212004212, + "grad_norm": 0.8978866338729858, + "kl": 0.7766927083333334, + "learning_rate": 1e-06, + "loss": 0.0128, + "reward": 8.68200675646464, + "reward_std": 1.6707564791043599, + "rewards/citation_reward_func": 4.413265228271484, + "rewards/correctness_reward_func": 4.217687010765076, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.8979591851433119, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5000339448451996, + "step": 200 + }, + { + "completion_length": 178.60203552246094, + "epoch": 0.4233064233064233, + "grad_norm": 0.7047922015190125, + "kl": 0.83984375, + "learning_rate": 1e-06, + "loss": 0.0105, + "reward": 7.912619272867839, + "reward_std": 2.2664144337177277, + "rewards/citation_reward_func": 4.515305995941162, + "rewards/correctness_reward_func": 3.333333214124044, + "rewards/formatting_reward_func": 0.4948979616165161, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.7210884193579356, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49425163865089417, + "step": 201 + }, + { + "completion_length": 169.9829889933268, + "epoch": 0.4254124254124254, + "grad_norm": 2.9857473373413086, + "kl": 1.0475260416666667, + "learning_rate": 1e-06, + "loss": 0.0138, + "reward": 7.905761957168579, + "reward_std": 1.7536719938119252, + "rewards/citation_reward_func": 4.557822942733765, + "rewards/correctness_reward_func": 2.78911554813385, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.3401360462109248, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 202 + }, + { + "completion_length": 146.27210489908853, + "epoch": 0.4275184275184275, + "grad_norm": 0.7594918012619019, + "kl": 0.828125, + "learning_rate": 1e-06, + "loss": 0.0108, + "reward": 8.805421829223633, + "reward_std": 1.6902470886707306, + "rewards/citation_reward_func": 4.753401279449463, + "rewards/correctness_reward_func": 3.809523661931356, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.7074829911192259, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 203 + }, + { + "completion_length": 145.32652537027994, + "epoch": 0.42962442962442965, + "grad_norm": 0.7676737904548645, + "kl": 0.9563802083333334, + "learning_rate": 1e-06, + "loss": 0.0137, + "reward": 7.953380982081096, + "reward_std": 1.4818553825219472, + "rewards/citation_reward_func": 4.370748241742452, + "rewards/correctness_reward_func": 3.214285651842753, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.5306122352679571, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 204 + }, + { + "completion_length": 134.48639297485352, + "epoch": 0.43173043173043174, + "grad_norm": 0.49755561351776123, + "kl": 0.9361979166666666, + "learning_rate": 1e-06, + "loss": 0.0145, + "reward": 8.082629362742106, + "reward_std": 0.6665602227052053, + "rewards/citation_reward_func": 4.872448841730754, + "rewards/correctness_reward_func": 2.5170067151387534, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.3061224507788817, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 205 + }, + { + "completion_length": 140.3401336669922, + "epoch": 0.43383643383643383, + "grad_norm": 1.6603000164031982, + "kl": 1.0774739583333333, + "learning_rate": 1e-06, + "loss": 0.0137, + "reward": 7.779064496358235, + "reward_std": 2.068660780787468, + "rewards/citation_reward_func": 4.625850280125936, + "rewards/correctness_reward_func": 2.99319718281428, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.15306122104326883, + "rewards/penalize_wrong_passages_reward_func": -0.5102040773878495, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4967516412337621, + "step": 206 + }, + { + "completion_length": 164.9047597249349, + "epoch": 0.4359424359424359, + "grad_norm": 1.059527039527893, + "kl": 1.0234375, + "learning_rate": 1e-06, + "loss": 0.011, + "reward": 8.205078125, + "reward_std": 2.2538377245267234, + "rewards/citation_reward_func": 4.396258393923442, + "rewards/correctness_reward_func": 3.3673468232154846, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.5578231140971184, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 207 + }, + { + "completion_length": 136.49319458007812, + "epoch": 0.43804843804843807, + "grad_norm": 1.1280230283737183, + "kl": 0.8450520833333334, + "learning_rate": 1e-06, + "loss": 0.0115, + "reward": 8.873448689778646, + "reward_std": 1.6345154742399852, + "rewards/citation_reward_func": 4.838435252507527, + "rewards/correctness_reward_func": 3.5034013191858926, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.2993197242418925, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 208 + }, + { + "completion_length": 137.9795888264974, + "epoch": 0.44015444015444016, + "grad_norm": 1.0411254167556763, + "kl": 0.7864583333333334, + "learning_rate": 1e-06, + "loss": 0.0105, + "reward": 8.269707520802816, + "reward_std": 1.9344552357991536, + "rewards/citation_reward_func": 4.710884253184001, + "rewards/correctness_reward_func": 3.0952380498250327, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.36734693062802154, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 209 + }, + { + "completion_length": 142.3197224934896, + "epoch": 0.44226044226044225, + "grad_norm": 1.1389507055282593, + "kl": 0.8151041666666666, + "learning_rate": 1e-06, + "loss": 0.0101, + "reward": 8.856441656748453, + "reward_std": 1.6541103919347127, + "rewards/citation_reward_func": 4.685374101003011, + "rewards/correctness_reward_func": 3.8435373306274414, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.6734693869948387, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 210 + }, + { + "completion_length": 168.58163197835287, + "epoch": 0.44436644436644435, + "grad_norm": 1.2596967220306396, + "kl": 0.8326822916666666, + "learning_rate": 1e-06, + "loss": 0.0084, + "reward": 4.704074879487355, + "reward_std": 2.8726566632588706, + "rewards/citation_reward_func": 4.455782175064087, + "rewards/correctness_reward_func": 1.4795917769273121, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -2.061224510272344, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49999313056468964, + "step": 211 + }, + { + "completion_length": 161.63945515950522, + "epoch": 0.4464724464724465, + "grad_norm": 1.7201244831085205, + "kl": 0.8919270833333334, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 6.776510198911031, + "reward_std": 2.4815656542778015, + "rewards/citation_reward_func": 4.387754996617635, + "rewards/correctness_reward_func": 2.244897877176603, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.6870748282720646, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 212 + }, + { + "completion_length": 158.12584686279297, + "epoch": 0.4485784485784486, + "grad_norm": 1.5194833278656006, + "kl": 1.234375, + "learning_rate": 1e-06, + "loss": 0.014, + "reward": 5.601340293884277, + "reward_std": 2.280164202054342, + "rewards/citation_reward_func": 4.141156315803528, + "rewards/correctness_reward_func": 1.819727823138237, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -1.054421752691269, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 213 + }, + { + "completion_length": 134.87414677937826, + "epoch": 0.4506844506844507, + "grad_norm": 0.47165757417678833, + "kl": 0.8072916666666666, + "learning_rate": 1e-06, + "loss": 0.0135, + "reward": 9.836033980051676, + "reward_std": 1.0725230872631073, + "rewards/citation_reward_func": 4.7874148686726885, + "rewards/correctness_reward_func": 4.642857074737549, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.5442176771660646, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 214 + }, + { + "completion_length": 157.78911336263022, + "epoch": 0.45279045279045277, + "grad_norm": 1.5893391370773315, + "kl": 0.89453125, + "learning_rate": 1e-06, + "loss": 0.0097, + "reward": 6.057105541229248, + "reward_std": 2.7033910751342773, + "rewards/citation_reward_func": 4.260203997294108, + "rewards/correctness_reward_func": 1.6326530079046886, + "rewards/formatting_reward_func": 0.4914965977271398, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -0.4625850220521291, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4924795279900233, + "step": 215 + }, + { + "completion_length": 195.2993138631185, + "epoch": 0.4548964548964549, + "grad_norm": 1.0235472917556763, + "kl": 0.7076822916666666, + "learning_rate": 1e-06, + "loss": 0.0075, + "reward": 4.3785510659217834, + "reward_std": 1.9232422510782878, + "rewards/citation_reward_func": 4.370748162269592, + "rewards/correctness_reward_func": 1.9387754499912262, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -2.9319726886848607, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 216 + }, + { + "completion_length": 153.5544179280599, + "epoch": 0.457002457002457, + "grad_norm": 1.342038631439209, + "kl": 0.9921875, + "learning_rate": 1e-06, + "loss": 0.011, + "reward": 7.819027264912923, + "reward_std": 2.2121856609980264, + "rewards/citation_reward_func": 4.532312790552775, + "rewards/correctness_reward_func": 3.061224400997162, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.6734693758189678, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 217 + }, + { + "completion_length": 149.75170135498047, + "epoch": 0.4591084591084591, + "grad_norm": 1.5824962854385376, + "kl": 0.9694010416666666, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 6.435758511225383, + "reward_std": 2.4908171892166138, + "rewards/citation_reward_func": 4.285714228947957, + "rewards/correctness_reward_func": 1.9897958636283875, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.6870748202006022, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5003842860460281, + "step": 218 + }, + { + "completion_length": 169.7448959350586, + "epoch": 0.4612144612144612, + "grad_norm": 1.3664193153381348, + "kl": 0.845703125, + "learning_rate": 1e-06, + "loss": 0.0095, + "reward": 6.609415014584859, + "reward_std": 2.7110483845074973, + "rewards/citation_reward_func": 4.319727818171184, + "rewards/correctness_reward_func": 2.9251699844996133, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -1.4285713980595272, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49887068569660187, + "step": 219 + }, + { + "completion_length": 174.56122334798178, + "epoch": 0.46332046332046334, + "grad_norm": 1.1141589879989624, + "kl": 2.353515625, + "learning_rate": 1e-06, + "loss": 0.0244, + "reward": 7.031609058380127, + "reward_std": 2.6527690092722573, + "rewards/citation_reward_func": 4.200680096944173, + "rewards/correctness_reward_func": 3.1462584336598716, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -0.9047618905703226, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 220 + }, + { + "completion_length": 184.224484761556, + "epoch": 0.46542646542646543, + "grad_norm": 0.9297390580177307, + "kl": 0.7239583333333334, + "learning_rate": 1e-06, + "loss": 0.008, + "reward": 6.469874382019043, + "reward_std": 3.017313222090403, + "rewards/citation_reward_func": 4.064625700314839, + "rewards/correctness_reward_func": 2.8741495509942374, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.8163265188535055, + "rewards/penalize_wrong_passages_reward_func": -0.6530612135926882, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5004863242308298, + "step": 221 + }, + { + "completion_length": 191.47618611653647, + "epoch": 0.4675324675324675, + "grad_norm": 0.9865539073944092, + "kl": 0.7630208333333334, + "learning_rate": 1e-06, + "loss": 0.0085, + "reward": 4.598503390947978, + "reward_std": 3.093896965185801, + "rewards/citation_reward_func": 4.166666626930237, + "rewards/correctness_reward_func": 1.0884353617827098, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.5102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.1292516452570756, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48965980112552643, + "step": 222 + }, + { + "completion_length": 175.85033671061197, + "epoch": 0.4696384696384696, + "grad_norm": 1.1267229318618774, + "kl": 0.7220052083333334, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 8.249298870563507, + "reward_std": 2.0633734663327536, + "rewards/citation_reward_func": 4.438775459925334, + "rewards/correctness_reward_func": 3.5714283883571625, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.6598639513055483, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 223 + }, + { + "completion_length": 174.49319458007812, + "epoch": 0.47174447174447176, + "grad_norm": 0.8411754369735718, + "kl": 0.740234375, + "learning_rate": 1e-06, + "loss": 0.0093, + "reward": 7.572517077128093, + "reward_std": 2.8577981988588967, + "rewards/citation_reward_func": 4.515306035677592, + "rewards/correctness_reward_func": 3.1462584336598716, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.816326508919398, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4993876814842224, + "step": 224 + }, + { + "completion_length": 146.80951944986978, + "epoch": 0.47385047385047385, + "grad_norm": 0.7524572610855103, + "kl": 0.8352864583333334, + "learning_rate": 1e-06, + "loss": 0.0123, + "reward": 8.59113605817159, + "reward_std": 1.6423706163962681, + "rewards/citation_reward_func": 4.3112244208653765, + "rewards/correctness_reward_func": 4.523809472719829, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -1.0748299211263657, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 225 + }, + { + "completion_length": 162.69387563069662, + "epoch": 0.47595647595647594, + "grad_norm": 0.6513383388519287, + "kl": 0.72265625, + "learning_rate": 1e-06, + "loss": 0.0096, + "reward": 5.912564555803935, + "reward_std": 1.7237219214439392, + "rewards/citation_reward_func": 4.642857074737549, + "rewards/correctness_reward_func": 3.962584972381592, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -3.591836671034495, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 226 + }, + { + "completion_length": 157.63264973958334, + "epoch": 0.47806247806247804, + "grad_norm": 0.9033396244049072, + "kl": 0.7278645833333334, + "learning_rate": 1e-06, + "loss": 0.0091, + "reward": 5.257574955622355, + "reward_std": 1.365848998228709, + "rewards/citation_reward_func": 4.778911431630452, + "rewards/correctness_reward_func": 3.4863944053649902, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -4.006802638371785, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4990713646014531, + "step": 227 + }, + { + "completion_length": 157.925168355306, + "epoch": 0.4801684801684802, + "grad_norm": 1.0495508909225464, + "kl": 0.8313802083333334, + "learning_rate": 1e-06, + "loss": 0.0111, + "reward": 5.719282348950704, + "reward_std": 1.4672572215398152, + "rewards/citation_reward_func": 4.5408161878585815, + "rewards/correctness_reward_func": 3.2142856121063232, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -2.9319727222124734, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4981938103834788, + "step": 228 + }, + { + "completion_length": 150.01360066731772, + "epoch": 0.4822744822744823, + "grad_norm": 1.5118231773376465, + "kl": 0.7799479166666666, + "learning_rate": 1e-06, + "loss": 0.0087, + "reward": 6.944877624511719, + "reward_std": 1.9879997571309407, + "rewards/citation_reward_func": 4.260203997294108, + "rewards/correctness_reward_func": 2.312925140062968, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.5782312800486883, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 229 + }, + { + "completion_length": 171.23128763834634, + "epoch": 0.48438048438048437, + "grad_norm": 0.9057350158691406, + "kl": 0.7044270833333334, + "learning_rate": 1e-06, + "loss": 0.0075, + "reward": 5.530394633611043, + "reward_std": 1.9266496499379475, + "rewards/citation_reward_func": 3.928571343421936, + "rewards/correctness_reward_func": 1.3605441649754841, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.7006802558898926, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4929795265197754, + "step": 230 + }, + { + "completion_length": 170.30271657307944, + "epoch": 0.4864864864864865, + "grad_norm": 0.9910404086112976, + "kl": 0.6901041666666666, + "learning_rate": 1e-06, + "loss": 0.0075, + "reward": 7.714860757191976, + "reward_std": 2.9793601632118225, + "rewards/citation_reward_func": 4.659863869349162, + "rewards/correctness_reward_func": 2.7040815552075705, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.42857141109804314, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.5005747626225153, + "step": 231 + }, + { + "completion_length": 154.95237731933594, + "epoch": 0.4885924885924886, + "grad_norm": 38.09885787963867, + "kl": 1.8313802083333333, + "learning_rate": 1e-06, + "loss": 0.0206, + "reward": 8.529652913411459, + "reward_std": 1.7911757131417592, + "rewards/citation_reward_func": 4.719387690226237, + "rewards/correctness_reward_func": 3.180272022883097, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.3673469287653764, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49734007318814594, + "step": 232 + }, + { + "completion_length": 132.86394500732422, + "epoch": 0.4906984906984907, + "grad_norm": 1.356911540031433, + "kl": 0.939453125, + "learning_rate": 1e-06, + "loss": 0.0138, + "reward": 8.633653004964193, + "reward_std": 1.5540131802360218, + "rewards/citation_reward_func": 4.574829816818237, + "rewards/correctness_reward_func": 3.673469305038452, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.4625850251565377, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 233 + }, + { + "completion_length": 136.83332951863608, + "epoch": 0.4928044928044928, + "grad_norm": 1.2966426610946655, + "kl": 0.86328125, + "learning_rate": 1e-06, + "loss": 0.0132, + "reward": 9.37684996922811, + "reward_std": 1.3524375955263774, + "rewards/citation_reward_func": 4.719387610753377, + "rewards/correctness_reward_func": 4.166666587193807, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.4081632619102796, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 234 + }, + { + "completion_length": 169.86394246419272, + "epoch": 0.49491049491049494, + "grad_norm": 0.8496416807174683, + "kl": 0.7350260416666666, + "learning_rate": 1e-06, + "loss": 0.0092, + "reward": 7.440455913543701, + "reward_std": 2.1057456930478415, + "rewards/citation_reward_func": 4.5153060754140215, + "rewards/correctness_reward_func": 2.7210883696873984, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.639455775419871, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49827884634335834, + "step": 235 + }, + { + "completion_length": 161.74829864501953, + "epoch": 0.497016497016497, + "grad_norm": 0.8334783315658569, + "kl": 0.86328125, + "learning_rate": 1e-06, + "loss": 0.0139, + "reward": 8.172744790712992, + "reward_std": 1.832821786403656, + "rewards/citation_reward_func": 4.651360511779785, + "rewards/correctness_reward_func": 3.8265304962793985, + "rewards/formatting_reward_func": 0.4880952338377635, + "rewards/length_reward_func": -0.4591836631298065, + "rewards/penalize_wrong_passages_reward_func": -0.8231292168299357, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4890713741381963, + "step": 236 + }, + { + "completion_length": 189.775510152181, + "epoch": 0.4991224991224991, + "grad_norm": 2.218723773956299, + "kl": 1.2096354166666667, + "learning_rate": 1e-06, + "loss": 0.016, + "reward": 7.1474082469940186, + "reward_std": 2.675163338581721, + "rewards/citation_reward_func": 4.549319664637248, + "rewards/correctness_reward_func": 2.9081631898880005, + "rewards/formatting_reward_func": 0.4880952338377635, + "rewards/length_reward_func": -0.5612244804700216, + "rewards/penalize_wrong_passages_reward_func": -0.7210884143908819, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48414280513922375, + "step": 237 + }, + { + "completion_length": 198.9999974568685, + "epoch": 0.5012285012285013, + "grad_norm": 0.7778282761573792, + "kl": 0.65625, + "learning_rate": 1e-06, + "loss": 0.008, + "reward": 4.987530579169591, + "reward_std": 3.4593644042809806, + "rewards/citation_reward_func": 3.784013549486796, + "rewards/correctness_reward_func": 2.142857094605764, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.40816326439380646, + "rewards/penalize_wrong_passages_reward_func": -1.353741466999054, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4960339516401291, + "step": 238 + }, + { + "completion_length": 135.46938196818033, + "epoch": 0.5033345033345034, + "grad_norm": 0.6986418962478638, + "kl": 0.884765625, + "learning_rate": 1e-06, + "loss": 0.0134, + "reward": 9.60133981704712, + "reward_std": 1.030010461807251, + "rewards/citation_reward_func": 4.8894557158152265, + "rewards/correctness_reward_func": 3.962584932645162, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.25170067449410755, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 239 + }, + { + "completion_length": 125.5646235148112, + "epoch": 0.5054405054405054, + "grad_norm": 2.5262744426727295, + "kl": 1.1393229166666667, + "learning_rate": 1e-06, + "loss": 0.0155, + "reward": 9.468686580657959, + "reward_std": 1.2599839965502422, + "rewards/citation_reward_func": 4.821428457895915, + "rewards/correctness_reward_func": 3.945578138033549, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.29931971927483875, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 240 + }, + { + "completion_length": 125.99659729003906, + "epoch": 0.5075465075465075, + "grad_norm": 1.0987335443496704, + "kl": 0.9466145833333334, + "learning_rate": 1e-06, + "loss": 0.0131, + "reward": 9.814948876698812, + "reward_std": 1.6750460763772328, + "rewards/citation_reward_func": 4.974489688873291, + "rewards/correctness_reward_func": 4.047618945439656, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.15646257748206457, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5003230571746826, + "step": 241 + }, + { + "completion_length": 153.58162689208984, + "epoch": 0.5096525096525096, + "grad_norm": 0.7817453145980835, + "kl": 0.775390625, + "learning_rate": 1e-06, + "loss": 0.01, + "reward": 8.463584661483765, + "reward_std": 1.5922152797381084, + "rewards/citation_reward_func": 4.778911431630452, + "rewards/correctness_reward_func": 3.6904760599136353, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.8027210558454195, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 242 + }, + { + "completion_length": 174.13945515950522, + "epoch": 0.5117585117585117, + "grad_norm": 1.1635452508926392, + "kl": 0.8131510416666666, + "learning_rate": 1e-06, + "loss": 0.011, + "reward": 7.308819691340129, + "reward_std": 2.3860236207644143, + "rewards/citation_reward_func": 4.702380895614624, + "rewards/correctness_reward_func": 3.3333332538604736, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.503401351471742, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 243 + }, + { + "completion_length": 185.59523264567056, + "epoch": 0.5138645138645138, + "grad_norm": 0.7059080004692078, + "kl": 0.9095052083333334, + "learning_rate": 1e-06, + "loss": 0.0124, + "reward": 7.917653242746989, + "reward_std": 2.116127530733744, + "rewards/citation_reward_func": 4.413265148798625, + "rewards/correctness_reward_func": 3.8435372511545816, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.20408162971337637, + "rewards/penalize_wrong_passages_reward_func": -1.1224489708741505, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49418361485004425, + "step": 244 + }, + { + "completion_length": 177.6292495727539, + "epoch": 0.515970515970516, + "grad_norm": 0.7793982028961182, + "kl": 0.7532552083333334, + "learning_rate": 1e-06, + "loss": 0.0117, + "reward": 8.637050946553549, + "reward_std": 2.5918923815091452, + "rewards/citation_reward_func": 4.540816227595012, + "rewards/correctness_reward_func": 4.081632574399312, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.15306122104326883, + "rewards/penalize_wrong_passages_reward_func": -0.82993194522957, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 245 + }, + { + "completion_length": 151.10203806559244, + "epoch": 0.5180765180765181, + "grad_norm": 0.7599817514419556, + "kl": 0.73828125, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 6.0459217230478925, + "reward_std": 1.4534762352705002, + "rewards/citation_reward_func": 4.421768585840861, + "rewards/correctness_reward_func": 1.2414965331554413, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.5646258524308602, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4983026534318924, + "step": 246 + }, + { + "completion_length": 149.7789077758789, + "epoch": 0.5201825201825202, + "grad_norm": 1.0434558391571045, + "kl": 0.8001302083333334, + "learning_rate": 1e-06, + "loss": 0.0108, + "reward": 7.849003394444783, + "reward_std": 1.688551406065623, + "rewards/citation_reward_func": 4.710884253184001, + "rewards/correctness_reward_func": 2.636054356892904, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.39455781939129037, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49866320192813873, + "step": 247 + }, + { + "completion_length": 192.9591801961263, + "epoch": 0.5222885222885223, + "grad_norm": 0.767522931098938, + "kl": 0.9921875, + "learning_rate": 1e-06, + "loss": 0.0118, + "reward": 5.846408238013585, + "reward_std": 2.7072153290112815, + "rewards/citation_reward_func": 3.9540814956029258, + "rewards/correctness_reward_func": 2.755101998647054, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -1.4829931687563658, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4824625253677368, + "step": 248 + }, + { + "completion_length": 132.92516454060873, + "epoch": 0.5243945243945244, + "grad_norm": 4.862525463104248, + "kl": 1.4322916666666667, + "learning_rate": 1e-06, + "loss": 0.0167, + "reward": 9.254401048024496, + "reward_std": 1.774699608484904, + "rewards/citation_reward_func": 4.770408074061076, + "rewards/correctness_reward_func": 3.8435372908910117, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.36054420471191406, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 249 + }, + { + "completion_length": 140.15986124674478, + "epoch": 0.5265005265005265, + "grad_norm": 1.2383294105529785, + "kl": 1.0032552083333333, + "learning_rate": 1e-06, + "loss": 0.0122, + "reward": 7.85304053624471, + "reward_std": 1.551555981238683, + "rewards/citation_reward_func": 4.753401279449463, + "rewards/correctness_reward_func": 2.4489794969558716, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.2993197174121936, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 250 + }, + { + "completion_length": 143.27550760904947, + "epoch": 0.5286065286065286, + "grad_norm": 0.9058665633201599, + "kl": 0.8587239583333334, + "learning_rate": 1e-06, + "loss": 0.01, + "reward": 6.524013678232829, + "reward_std": 2.4335868755976358, + "rewards/citation_reward_func": 4.387755115826924, + "rewards/correctness_reward_func": 2.0408162673314414, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.7006802608569463, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5002040167649587, + "step": 251 + }, + { + "completion_length": 162.08162943522134, + "epoch": 0.5307125307125307, + "grad_norm": 1.0201135873794556, + "kl": 0.8170572916666666, + "learning_rate": 1e-06, + "loss": 0.0091, + "reward": 6.7847583293914795, + "reward_std": 1.9675355752309163, + "rewards/citation_reward_func": 4.5068027178446455, + "rewards/correctness_reward_func": 2.1938775380452475, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.8095238109429678, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49734347065289813, + "step": 252 + }, + { + "completion_length": 171.5544179280599, + "epoch": 0.5328185328185329, + "grad_norm": 1.0924259424209595, + "kl": 0.6809895833333334, + "learning_rate": 1e-06, + "loss": 0.0076, + "reward": 5.832513689994812, + "reward_std": 2.4871179461479187, + "rewards/citation_reward_func": 4.455782254536946, + "rewards/correctness_reward_func": 1.4795918265978496, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -0.836734672387441, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4940781792004903, + "step": 253 + }, + { + "completion_length": 160.2789077758789, + "epoch": 0.534924534924535, + "grad_norm": 0.6430193781852722, + "kl": 0.8372395833333334, + "learning_rate": 1e-06, + "loss": 0.0097, + "reward": 7.685734748840332, + "reward_std": 1.3944465617338817, + "rewards/citation_reward_func": 4.022108832995097, + "rewards/correctness_reward_func": 3.418367306391398, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.7551020216196775, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5003604739904404, + "step": 254 + }, + { + "completion_length": 163.47618611653647, + "epoch": 0.537030537030537, + "grad_norm": 0.6005195379257202, + "kl": 0.7591145833333334, + "learning_rate": 1e-06, + "loss": 0.011, + "reward": 5.973788936932881, + "reward_std": 1.2611841062704723, + "rewards/citation_reward_func": 4.71088433265686, + "rewards/correctness_reward_func": 1.2585033824046452, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.8435373933364948, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 255 + }, + { + "completion_length": 131.9693832397461, + "epoch": 0.5391365391365391, + "grad_norm": 0.5962944030761719, + "kl": 0.96484375, + "learning_rate": 1e-06, + "loss": 0.0157, + "reward": 9.269704023996988, + "reward_std": 0.8620906795064608, + "rewards/citation_reward_func": 4.863945484161377, + "rewards/correctness_reward_func": 4.200680216153462, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.7414965877930323, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 256 + }, + { + "completion_length": 138.84353383382162, + "epoch": 0.5412425412425412, + "grad_norm": 0.6719671487808228, + "kl": 0.89453125, + "learning_rate": 1e-06, + "loss": 0.0128, + "reward": 7.557119131088257, + "reward_std": 1.2554789185523987, + "rewards/citation_reward_func": 4.566326379776001, + "rewards/correctness_reward_func": 2.7040815353393555, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.6598639351626238, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 257 + }, + { + "completion_length": 137.53401311238608, + "epoch": 0.5433485433485433, + "grad_norm": 0.8949998021125793, + "kl": 1.06640625, + "learning_rate": 1e-06, + "loss": 0.0133, + "reward": 8.044159889221191, + "reward_std": 2.113215277592341, + "rewards/citation_reward_func": 4.693877379099528, + "rewards/correctness_reward_func": 3.0442176262537637, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408162971337637, + "rewards/penalize_wrong_passages_reward_func": -0.4897959033648173, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4999421040217082, + "step": 258 + }, + { + "completion_length": 137.28911336263022, + "epoch": 0.5454545454545454, + "grad_norm": 11.216543197631836, + "kl": 1.9811197916666667, + "learning_rate": 1e-06, + "loss": 0.0219, + "reward": 7.045217672983806, + "reward_std": 2.517554461956024, + "rewards/citation_reward_func": 4.710884173711141, + "rewards/correctness_reward_func": 2.09183669090271, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.43537414570649463, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 259 + }, + { + "completion_length": 147.71428426106772, + "epoch": 0.5475605475605475, + "grad_norm": 1.2328022718429565, + "kl": 0.7734375, + "learning_rate": 1e-06, + "loss": 0.0082, + "reward": 6.194877703984578, + "reward_std": 2.832310895125071, + "rewards/citation_reward_func": 4.659863789876302, + "rewards/correctness_reward_func": 1.4115646084149678, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.6734693770607313, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 260 + }, + { + "completion_length": 138.08503087361655, + "epoch": 0.5496665496665497, + "grad_norm": 0.8712778091430664, + "kl": 2.0670572916666665, + "learning_rate": 1e-06, + "loss": 0.023, + "reward": 6.020068089167277, + "reward_std": 1.27622323234876, + "rewards/citation_reward_func": 4.464285612106323, + "rewards/correctness_reward_func": 1.275510162115097, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.6666666567325592, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49795911212762195, + "step": 261 + }, + { + "completion_length": 136.39795430501303, + "epoch": 0.5517725517725518, + "grad_norm": 0.970741868019104, + "kl": 0.8522135416666666, + "learning_rate": 1e-06, + "loss": 0.0107, + "reward": 7.543517192204793, + "reward_std": 1.5009947220484416, + "rewards/citation_reward_func": 4.719387610753377, + "rewards/correctness_reward_func": 3.520408014456431, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.6462584833304088, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 262 + }, + { + "completion_length": 148.2448933919271, + "epoch": 0.5538785538785539, + "grad_norm": 0.8934192061424255, + "kl": 0.71484375, + "learning_rate": 1e-06, + "loss": 0.0091, + "reward": 6.40746267636617, + "reward_std": 1.2594018677870433, + "rewards/citation_reward_func": 4.872448921203613, + "rewards/correctness_reward_func": 2.3639455238978067, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -1.8299319446086884, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 263 + }, + { + "completion_length": 155.17686971028647, + "epoch": 0.555984555984556, + "grad_norm": 0.6256468892097473, + "kl": 0.7734375, + "learning_rate": 1e-06, + "loss": 0.0108, + "reward": 7.1438571612040205, + "reward_std": 1.2106933891773224, + "rewards/citation_reward_func": 4.846938769022624, + "rewards/correctness_reward_func": 2.789115568002065, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.4421768623093765, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 264 + }, + { + "completion_length": 153.8197250366211, + "epoch": 0.5580905580905581, + "grad_norm": 0.9129427075386047, + "kl": 0.728515625, + "learning_rate": 1e-06, + "loss": 0.0105, + "reward": 8.354840358098349, + "reward_std": 1.5037815868854523, + "rewards/citation_reward_func": 4.73639456431071, + "rewards/correctness_reward_func": 2.976190427939097, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.3061224476744731, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4993978887796402, + "step": 265 + }, + { + "completion_length": 145.45237731933594, + "epoch": 0.5601965601965602, + "grad_norm": 0.5100239515304565, + "kl": 0.787109375, + "learning_rate": 1e-06, + "loss": 0.0112, + "reward": 7.560523907343547, + "reward_std": 1.0577657222747803, + "rewards/citation_reward_func": 4.7874148686726885, + "rewards/correctness_reward_func": 2.4999999006589255, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.7278911328564087, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 266 + }, + { + "completion_length": 171.1496556599935, + "epoch": 0.5623025623025623, + "grad_norm": 0.934743344783783, + "kl": 0.66796875, + "learning_rate": 1e-06, + "loss": 0.0084, + "reward": 7.694870392481486, + "reward_std": 2.9706706007321677, + "rewards/citation_reward_func": 4.226190368334453, + "rewards/correctness_reward_func": 3.571428418159485, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -0.8435373902320862, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 267 + }, + { + "completion_length": 159.51700337727866, + "epoch": 0.5644085644085645, + "grad_norm": 2.148893356323242, + "kl": 0.7467447916666666, + "learning_rate": 1e-06, + "loss": 0.0084, + "reward": 6.452187379201253, + "reward_std": 2.600463350613912, + "rewards/citation_reward_func": 4.515305956204732, + "rewards/correctness_reward_func": 2.1598638792832694, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020383834839, + "rewards/penalize_wrong_passages_reward_func": -0.9659863697985808, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49810537695884705, + "step": 268 + }, + { + "completion_length": 142.34353892008463, + "epoch": 0.5665145665145666, + "grad_norm": 1.0294554233551025, + "kl": 0.7233072916666666, + "learning_rate": 1e-06, + "loss": 0.008, + "reward": 6.883653163909912, + "reward_std": 2.1682270169258118, + "rewards/citation_reward_func": 4.7704081535339355, + "rewards/correctness_reward_func": 2.4659863313039145, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -1.3537414769331615, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 269 + }, + { + "completion_length": 142.6394526163737, + "epoch": 0.5686205686205686, + "grad_norm": 0.7771506309509277, + "kl": 0.8059895833333334, + "learning_rate": 1e-06, + "loss": 0.0106, + "reward": 7.640455881754558, + "reward_std": 1.9366973439852397, + "rewards/citation_reward_func": 4.795918305714925, + "rewards/correctness_reward_func": 3.231292406717936, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -1.3877550611893337, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 270 + }, + { + "completion_length": 163.00679779052734, + "epoch": 0.5707265707265707, + "grad_norm": 0.785870373249054, + "kl": 0.9095052083333334, + "learning_rate": 1e-06, + "loss": 0.0117, + "reward": 8.092830101648966, + "reward_std": 2.4080686370531716, + "rewards/citation_reward_func": 4.353741367657979, + "rewards/correctness_reward_func": 3.537414828936259, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.6394557779033979, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 271 + }, + { + "completion_length": 161.8945515950521, + "epoch": 0.5728325728325728, + "grad_norm": 0.848395049571991, + "kl": 0.798828125, + "learning_rate": 1e-06, + "loss": 0.011, + "reward": 8.57502031326294, + "reward_std": 1.9230751991271973, + "rewards/citation_reward_func": 4.625850200653076, + "rewards/correctness_reward_func": 3.3673469026883445, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.3673469324906667, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5001904020706812, + "step": 272 + }, + { + "completion_length": 166.25169372558594, + "epoch": 0.5749385749385749, + "grad_norm": 0.69926917552948, + "kl": 0.7936197916666666, + "learning_rate": 1e-06, + "loss": 0.0118, + "reward": 8.480765422185263, + "reward_std": 1.7872259020805359, + "rewards/citation_reward_func": 4.464285572369893, + "rewards/correctness_reward_func": 3.5544217427571616, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.42857142724096775, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49607136845588684, + "step": 273 + }, + { + "completion_length": 146.66326141357422, + "epoch": 0.577044577044577, + "grad_norm": 0.987402617931366, + "kl": 1.2063802083333333, + "learning_rate": 1e-06, + "loss": 0.014, + "reward": 8.949979543685913, + "reward_std": 2.1892781058947244, + "rewards/citation_reward_func": 4.676870663960774, + "rewards/correctness_reward_func": 3.6564625104268393, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.333333329608043, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 274 + }, + { + "completion_length": 142.36394246419272, + "epoch": 0.5791505791505791, + "grad_norm": 1.0434736013412476, + "kl": 1.03515625, + "learning_rate": 1e-06, + "loss": 0.0133, + "reward": 8.55031975110372, + "reward_std": 1.6762052774429321, + "rewards/citation_reward_func": 4.8894557158152265, + "rewards/correctness_reward_func": 2.8741495609283447, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.1632653015355269, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 275 + }, + { + "completion_length": 144.30611928304037, + "epoch": 0.5812565812565813, + "grad_norm": 2.7037227153778076, + "kl": 1.1516927083333333, + "learning_rate": 1e-06, + "loss": 0.0133, + "reward": 7.040115435918172, + "reward_std": 1.6779302060604095, + "rewards/citation_reward_func": 4.9234693845113116, + "rewards/correctness_reward_func": 2.4999999403953552, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.3333333035310109, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 276 + }, + { + "completion_length": 138.149658203125, + "epoch": 0.5833625833625834, + "grad_norm": 0.9938499927520752, + "kl": 1.4986979166666667, + "learning_rate": 1e-06, + "loss": 0.0181, + "reward": 6.847938934961955, + "reward_std": 1.6733147700627644, + "rewards/citation_reward_func": 4.795918305714925, + "rewards/correctness_reward_func": 2.857142766316732, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.7551019787788391, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 277 + }, + { + "completion_length": 141.54081217447916, + "epoch": 0.5854685854685855, + "grad_norm": 0.6015098690986633, + "kl": 0.982421875, + "learning_rate": 1e-06, + "loss": 0.013, + "reward": 6.965959231058757, + "reward_std": 1.5529690434535344, + "rewards/citation_reward_func": 4.778911431630452, + "rewards/correctness_reward_func": 3.5884352922439575, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.20408162971337637, + "rewards/penalize_wrong_passages_reward_func": -2.1904762083043656, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4965713669856389, + "step": 278 + }, + { + "completion_length": 154.874148050944, + "epoch": 0.5875745875745876, + "grad_norm": 0.8088123798370361, + "kl": 0.90234375, + "learning_rate": 1e-06, + "loss": 0.0112, + "reward": 7.800319751103719, + "reward_std": 1.5292981366316478, + "rewards/citation_reward_func": 4.829931894938151, + "rewards/correctness_reward_func": 2.1938774784406028, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.22448979442318281, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 279 + }, + { + "completion_length": 147.12244415283203, + "epoch": 0.5896805896805897, + "grad_norm": 1.0946311950683594, + "kl": 0.9694010416666666, + "learning_rate": 1e-06, + "loss": 0.011, + "reward": 7.138755162556966, + "reward_std": 1.8616977731386821, + "rewards/citation_reward_func": 4.787414789199829, + "rewards/correctness_reward_func": 1.632652997970581, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.23129250730077425, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 280 + }, + { + "completion_length": 147.74829864501953, + "epoch": 0.5917865917865918, + "grad_norm": 0.99028480052948, + "kl": 1.228515625, + "learning_rate": 1e-06, + "loss": 0.0143, + "reward": 6.8122890790303545, + "reward_std": 1.8934701879819233, + "rewards/citation_reward_func": 4.846938689549764, + "rewards/correctness_reward_func": 1.4285713632901509, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.23809523383776346, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4976631999015808, + "step": 281 + }, + { + "completion_length": 159.523806254069, + "epoch": 0.5938925938925939, + "grad_norm": 0.7205305099487305, + "kl": 0.779296875, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 5.7441972096761065, + "reward_std": 1.631241689125697, + "rewards/citation_reward_func": 4.209183533986409, + "rewards/correctness_reward_func": 1.3435373703638713, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.8095237972835699, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 282 + }, + { + "completion_length": 159.98299153645834, + "epoch": 0.595998595998596, + "grad_norm": 1.254953145980835, + "kl": 0.9583333333333334, + "learning_rate": 1e-06, + "loss": 0.0115, + "reward": 5.963574965794881, + "reward_std": 2.36332497994105, + "rewards/citation_reward_func": 4.166666547457377, + "rewards/correctness_reward_func": 2.0918366809686026, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -0.9795918166637421, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4958876967430115, + "step": 283 + }, + { + "completion_length": 156.17686716715494, + "epoch": 0.5981045981045982, + "grad_norm": 0.9321157932281494, + "kl": 0.8502604166666666, + "learning_rate": 1e-06, + "loss": 0.011, + "reward": 7.640455881754558, + "reward_std": 1.9198250969250996, + "rewards/citation_reward_func": 4.455782175064087, + "rewards/correctness_reward_func": 2.9251699844996133, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.5374149531126022, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 284 + }, + { + "completion_length": 165.04761505126953, + "epoch": 0.6002106002106002, + "grad_norm": 1.290971040725708, + "kl": 8.250651041666666, + "learning_rate": 1e-06, + "loss": 0.0833, + "reward": 6.30113951365153, + "reward_std": 3.333476463953654, + "rewards/citation_reward_func": 3.852040688196818, + "rewards/correctness_reward_func": 2.9421767791112265, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -1.2380952437718709, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5001189808050791, + "step": 285 + }, + { + "completion_length": 167.17346700032553, + "epoch": 0.6023166023166023, + "grad_norm": 1.112428069114685, + "kl": 0.8079427083333334, + "learning_rate": 1e-06, + "loss": 0.0093, + "reward": 6.459765354792277, + "reward_std": 2.901971995830536, + "rewards/citation_reward_func": 3.8945577144622803, + "rewards/correctness_reward_func": 2.9251699646313987, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020383834839, + "rewards/penalize_wrong_passages_reward_func": -1.1020407974720001, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49718021353085834, + "step": 286 + }, + { + "completion_length": 162.51360321044922, + "epoch": 0.6044226044226044, + "grad_norm": 1.2326608896255493, + "kl": 1.4615885416666667, + "learning_rate": 1e-06, + "loss": 0.0159, + "reward": 5.752693970998128, + "reward_std": 2.563380718231201, + "rewards/citation_reward_func": 3.945578098297119, + "rewards/correctness_reward_func": 1.870748261610667, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -0.8027210732301077, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 287 + }, + { + "completion_length": 175.84353637695312, + "epoch": 0.6065286065286065, + "grad_norm": 1.1159588098526, + "kl": 0.9329427083333334, + "learning_rate": 1e-06, + "loss": 0.0105, + "reward": 6.295217712720235, + "reward_std": 2.303104837735494, + "rewards/citation_reward_func": 4.175170024236043, + "rewards/correctness_reward_func": 2.040816302100817, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.7687074740727743, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 288 + }, + { + "completion_length": 155.27210489908853, + "epoch": 0.6086346086346086, + "grad_norm": 0.7354316115379333, + "kl": 0.734375, + "learning_rate": 1e-06, + "loss": 0.0131, + "reward": 9.381948471069336, + "reward_std": 1.341506339609623, + "rewards/citation_reward_func": 4.76190463701884, + "rewards/correctness_reward_func": 4.149659872055054, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.37414966337382793, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 289 + }, + { + "completion_length": 144.54081217447916, + "epoch": 0.6107406107406107, + "grad_norm": 0.8398804664611816, + "kl": 0.7884114583333334, + "learning_rate": 1e-06, + "loss": 0.0122, + "reward": 9.019707520802816, + "reward_std": 1.1839089542627335, + "rewards/citation_reward_func": 4.719387690226237, + "rewards/correctness_reward_func": 4.149659752845764, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.8503401384999355, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 290 + }, + { + "completion_length": 142.43877410888672, + "epoch": 0.6128466128466128, + "grad_norm": 1.53350031375885, + "kl": 0.890625, + "learning_rate": 1e-06, + "loss": 0.011, + "reward": 7.9516801834106445, + "reward_std": 1.5891411205132802, + "rewards/citation_reward_func": 4.719387690226237, + "rewards/correctness_reward_func": 2.653061166405678, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.42176870505015057, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 291 + }, + { + "completion_length": 145.73809305826822, + "epoch": 0.614952614952615, + "grad_norm": 1.1438617706298828, + "kl": 0.7376302083333334, + "learning_rate": 1e-06, + "loss": 0.0104, + "reward": 9.490796089172363, + "reward_std": 1.4907788634300232, + "rewards/citation_reward_func": 4.88095235824585, + "rewards/correctness_reward_func": 3.860544125239054, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.251700675735871, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 292 + }, + { + "completion_length": 178.14285278320312, + "epoch": 0.6170586170586171, + "grad_norm": 1.3113124370574951, + "kl": 0.87890625, + "learning_rate": 1e-06, + "loss": 0.0106, + "reward": 6.215275506178538, + "reward_std": 2.398002475500107, + "rewards/citation_reward_func": 4.506802638371785, + "rewards/correctness_reward_func": 3.571428418159485, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.2551020383834839, + "rewards/penalize_wrong_passages_reward_func": -2.5986393888791404, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.495887686808904, + "step": 293 + }, + { + "completion_length": 166.6972745259603, + "epoch": 0.6191646191646192, + "grad_norm": 1.2684246301651, + "kl": 1.12890625, + "learning_rate": 1e-06, + "loss": 0.0119, + "reward": 5.689976135889689, + "reward_std": 3.035001496473948, + "rewards/citation_reward_func": 4.4557822942733765, + "rewards/correctness_reward_func": 2.346938749154409, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.8843536774317424, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4943978985150655, + "step": 294 + }, + { + "completion_length": 169.28911336263022, + "epoch": 0.6212706212706213, + "grad_norm": 0.9547519683837891, + "kl": 1.728515625, + "learning_rate": 1e-06, + "loss": 0.0178, + "reward": 3.982292652130127, + "reward_std": 4.037615021069844, + "rewards/citation_reward_func": 3.9030612309773765, + "rewards/correctness_reward_func": 1.5816325893004735, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.30612244705359143, + "rewards/penalize_wrong_passages_reward_func": -1.857142796119054, + "rewards/unicode_reward_func": -0.3401360511779785, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 295 + }, + { + "completion_length": 157.51020050048828, + "epoch": 0.6233766233766234, + "grad_norm": 0.9751655459403992, + "kl": 0.9680989583333334, + "learning_rate": 1e-06, + "loss": 0.012, + "reward": 6.598163406054179, + "reward_std": 2.1323233445485434, + "rewards/citation_reward_func": 4.498299201329549, + "rewards/correctness_reward_func": 2.176870713631312, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -0.8163264989852905, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49612238506476086, + "step": 296 + }, + { + "completion_length": 144.2040761311849, + "epoch": 0.6254826254826255, + "grad_norm": 0.8613839745521545, + "kl": 0.865234375, + "learning_rate": 1e-06, + "loss": 0.0124, + "reward": 7.519839843114217, + "reward_std": 1.2146833290656407, + "rewards/citation_reward_func": 4.438775459925334, + "rewards/correctness_reward_func": 2.6190475821495056, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.5374149655302366, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49943191309769946, + "step": 297 + }, + { + "completion_length": 137.93196868896484, + "epoch": 0.6275886275886275, + "grad_norm": 0.8827674388885498, + "kl": 0.9602864583333334, + "learning_rate": 1e-06, + "loss": 0.0128, + "reward": 6.497445583343506, + "reward_std": 1.1725004116694133, + "rewards/citation_reward_func": 4.532312790552775, + "rewards/correctness_reward_func": 1.3095237612724304, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.3401360536615054, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4957448293765386, + "step": 298 + }, + { + "completion_length": 140.56462605794272, + "epoch": 0.6296946296946297, + "grad_norm": 0.6594263315200806, + "kl": 0.8899739583333334, + "learning_rate": 1e-06, + "loss": 0.0121, + "reward": 6.2714048226674395, + "reward_std": 0.911820242802302, + "rewards/citation_reward_func": 4.7874148686726885, + "rewards/correctness_reward_func": 4.625850319862366, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -4.0884352922439575, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 299 + }, + { + "completion_length": 149.50339889526367, + "epoch": 0.6318006318006318, + "grad_norm": 1.0458486080169678, + "kl": 0.986328125, + "learning_rate": 1e-06, + "loss": 0.0135, + "reward": 8.150659720102945, + "reward_std": 1.8430875837802887, + "rewards/citation_reward_func": 4.234693845113118, + "rewards/correctness_reward_func": 3.707482933998108, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.7414965803424517, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 300 + }, + { + "completion_length": 166.51360321044922, + "epoch": 0.6339066339066339, + "grad_norm": 0.6753512620925903, + "kl": 0.845703125, + "learning_rate": 1e-06, + "loss": 0.013, + "reward": 8.259503444035849, + "reward_std": 2.7765374779701233, + "rewards/citation_reward_func": 4.540816227595012, + "rewards/correctness_reward_func": 3.8775508801142373, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -0.7346938600142797, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 301 + }, + { + "completion_length": 183.44217427571616, + "epoch": 0.636012636012636, + "grad_norm": 0.9966219067573547, + "kl": 1.2005208333333333, + "learning_rate": 1e-06, + "loss": 0.014, + "reward": 4.120401422182719, + "reward_std": 3.795739163955053, + "rewards/citation_reward_func": 4.073129216829936, + "rewards/correctness_reward_func": 3.656462470690409, + "rewards/formatting_reward_func": 0.49234693745772046, + "rewards/length_reward_func": -1.7346938749154408, + "rewards/penalize_wrong_passages_reward_func": -2.857142766316732, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4902992645899455, + "step": 302 + }, + { + "completion_length": 193.60203552246094, + "epoch": 0.6381186381186381, + "grad_norm": 1.1840745210647583, + "kl": 1.0247395833333333, + "learning_rate": 1e-06, + "loss": 0.0109, + "reward": 5.088537494341533, + "reward_std": 5.065946817398071, + "rewards/citation_reward_func": 3.945578098297119, + "rewards/correctness_reward_func": 3.027210831642151, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -1.5306122104326885, + "rewards/penalize_wrong_passages_reward_func": -1.3401360313097637, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49159857630729675, + "step": 303 + }, + { + "completion_length": 168.074826558431, + "epoch": 0.6402246402246402, + "grad_norm": 0.9291290044784546, + "kl": 0.8951822916666666, + "learning_rate": 1e-06, + "loss": 0.0107, + "reward": 5.667663335800171, + "reward_std": 4.012887159983317, + "rewards/citation_reward_func": 4.676870584487915, + "rewards/correctness_reward_func": 2.4149659276008606, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -1.9387754599253337, + "rewards/penalize_wrong_passages_reward_func": -0.4829931954542796, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 304 + }, + { + "completion_length": 148.09523264567056, + "epoch": 0.6423306423306423, + "grad_norm": 1.2598353624343872, + "kl": 1.2278645833333333, + "learning_rate": 1e-06, + "loss": 0.0156, + "reward": 6.578816254933675, + "reward_std": 1.0740979760885239, + "rewards/citation_reward_func": 4.396258354187012, + "rewards/correctness_reward_func": 2.0748298863569894, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.8911564573645592, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49888428548971814, + "step": 305 + }, + { + "completion_length": 134.02720896402994, + "epoch": 0.6444366444366444, + "grad_norm": 0.8883174657821655, + "kl": 1.0091145833333333, + "learning_rate": 1e-06, + "loss": 0.0146, + "reward": 9.13562266031901, + "reward_std": 1.1919774264097214, + "rewards/citation_reward_func": 4.710884173711141, + "rewards/correctness_reward_func": 3.7925169865290322, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.2653061201175054, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4995679606993993, + "step": 306 + }, + { + "completion_length": 140.46598052978516, + "epoch": 0.6465426465426466, + "grad_norm": 0.7637023329734802, + "kl": 0.9381510416666666, + "learning_rate": 1e-06, + "loss": 0.0129, + "reward": 6.968687216440837, + "reward_std": 1.1493754784266155, + "rewards/citation_reward_func": 4.3622448444366455, + "rewards/correctness_reward_func": 2.295918265978495, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.6394557605187098, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 307 + }, + { + "completion_length": 131.6020393371582, + "epoch": 0.6486486486486487, + "grad_norm": 0.6228148341178894, + "kl": 1.03125, + "learning_rate": 1e-06, + "loss": 0.016, + "reward": 8.279911756515503, + "reward_std": 0.8148330748081207, + "rewards/citation_reward_func": 4.795918226242065, + "rewards/correctness_reward_func": 3.2142856121063232, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.6802721035977205, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 308 + }, + { + "completion_length": 139.99659474690756, + "epoch": 0.6507546507546508, + "grad_norm": 19.074426651000977, + "kl": 1.279296875, + "learning_rate": 1e-06, + "loss": 0.0158, + "reward": 5.977187156677246, + "reward_std": 1.4246279398600261, + "rewards/citation_reward_func": 4.472789009412129, + "rewards/correctness_reward_func": 4.608843445777893, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -3.897959033648173, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 309 + }, + { + "completion_length": 158.9897944132487, + "epoch": 0.6528606528606529, + "grad_norm": 1.0727325677871704, + "kl": 0.873046875, + "learning_rate": 1e-06, + "loss": 0.0104, + "reward": 7.802013556162517, + "reward_std": 2.125181575616201, + "rewards/citation_reward_func": 4.676870743433635, + "rewards/correctness_reward_func": 2.704081575075785, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.42176869759957, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 310 + }, + { + "completion_length": 164.09863789876303, + "epoch": 0.654966654966655, + "grad_norm": 0.8916863799095154, + "kl": 0.9446614583333334, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 7.430758555730184, + "reward_std": 2.2719212770462036, + "rewards/citation_reward_func": 4.472789009412129, + "rewards/correctness_reward_func": 2.6360543767611184, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.6258503198623657, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4987856447696686, + "step": 311 + }, + { + "completion_length": 136.65646107991537, + "epoch": 0.657072657072657, + "grad_norm": 47.32864761352539, + "kl": 2.8509114583333335, + "learning_rate": 1e-06, + "loss": 0.0309, + "reward": 6.757802804311116, + "reward_std": 1.4053757439057033, + "rewards/citation_reward_func": 4.668367306391398, + "rewards/correctness_reward_func": 3.5204081535339355, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -2.3809523483117423, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 312 + }, + { + "completion_length": 152.82312774658203, + "epoch": 0.6591786591786591, + "grad_norm": 1.4776674509048462, + "kl": 1.0787760416666667, + "learning_rate": 1e-06, + "loss": 0.0113, + "reward": 6.497523943583171, + "reward_std": 2.623288551966349, + "rewards/citation_reward_func": 4.6343536376953125, + "rewards/correctness_reward_func": 1.9557822744051616, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.040816307067871, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4992244243621826, + "step": 313 + }, + { + "completion_length": 161.29251352945963, + "epoch": 0.6612846612846612, + "grad_norm": 1.09335196018219, + "kl": 1.1256510416666667, + "learning_rate": 1e-06, + "loss": 0.0119, + "reward": 6.773108879725139, + "reward_std": 3.392216761906942, + "rewards/citation_reward_func": 4.540816307067871, + "rewards/correctness_reward_func": 2.363945464293162, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020383834839, + "rewards/penalize_wrong_passages_reward_func": -0.8775509893894196, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 314 + }, + { + "completion_length": 164.6700642903646, + "epoch": 0.6633906633906634, + "grad_norm": 13.127339363098145, + "kl": 2.5885416666666665, + "learning_rate": 1e-06, + "loss": 0.027, + "reward": 6.473789056142171, + "reward_std": 3.381900449593862, + "rewards/citation_reward_func": 4.574829816818237, + "rewards/correctness_reward_func": 2.057823042074839, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.561224490404129, + "rewards/penalize_wrong_passages_reward_func": -0.5986394435167313, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 315 + }, + { + "completion_length": 166.86394246419272, + "epoch": 0.6654966654966655, + "grad_norm": 2.0115606784820557, + "kl": 1.81640625, + "learning_rate": 1e-06, + "loss": 0.0196, + "reward": 5.744387944539388, + "reward_std": 3.269197463989258, + "rewards/citation_reward_func": 4.294217665990193, + "rewards/correctness_reward_func": 2.0068026582400003, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.6632653077443441, + "rewards/penalize_wrong_passages_reward_func": -0.8843537370363871, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4943876961867015, + "step": 316 + }, + { + "completion_length": 148.34353383382162, + "epoch": 0.6676026676026676, + "grad_norm": 17.85995864868164, + "kl": 3.5807291666666665, + "learning_rate": 1e-06, + "loss": 0.0383, + "reward": 7.790108839670817, + "reward_std": 2.6309839884440103, + "rewards/citation_reward_func": 4.625850200653076, + "rewards/correctness_reward_func": 3.061224381128947, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -0.4829931929707527, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 317 + }, + { + "completion_length": 140.26870473225912, + "epoch": 0.6697086697086697, + "grad_norm": 1.0049431324005127, + "kl": 1.2799479166666667, + "learning_rate": 1e-06, + "loss": 0.016, + "reward": 7.075745026270549, + "reward_std": 2.417468766371409, + "rewards/citation_reward_func": 4.396258433659871, + "rewards/correctness_reward_func": 2.8061224222183228, + "rewards/formatting_reward_func": 0.4948979616165161, + "rewards/length_reward_func": -0.40816326439380646, + "rewards/penalize_wrong_passages_reward_func": -0.7074829836686453, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49411219358444214, + "step": 318 + }, + { + "completion_length": 137.91836420694986, + "epoch": 0.6718146718146718, + "grad_norm": 0.9339328408241272, + "kl": 1.0768229166666667, + "learning_rate": 1e-06, + "loss": 0.0155, + "reward": 9.93605089187622, + "reward_std": 1.3493054906527202, + "rewards/citation_reward_func": 4.727891127268474, + "rewards/correctness_reward_func": 4.455782135327657, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.19727890690167746, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.5006767958402634, + "step": 319 + }, + { + "completion_length": 150.0952377319336, + "epoch": 0.6739206739206739, + "grad_norm": 1.0160009860992432, + "kl": 1.0696614583333333, + "learning_rate": 1e-06, + "loss": 0.0142, + "reward": 9.079231341679892, + "reward_std": 1.7906232078870137, + "rewards/citation_reward_func": 4.76190463701884, + "rewards/correctness_reward_func": 3.7244897286097207, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.20408163219690323, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 320 + }, + { + "completion_length": 163.38094838460287, + "epoch": 0.676026676026676, + "grad_norm": 0.927356481552124, + "kl": 0.9557291666666666, + "learning_rate": 1e-06, + "loss": 0.0115, + "reward": 7.07500680287679, + "reward_std": 2.7190446058909097, + "rewards/citation_reward_func": 4.294217745463054, + "rewards/correctness_reward_func": 2.7210883696873984, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -0.5782312750816345, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49677544832229614, + "step": 321 + }, + { + "completion_length": 135.02040354410806, + "epoch": 0.6781326781326781, + "grad_norm": 0.7541379928588867, + "kl": 1.3046875, + "learning_rate": 1e-06, + "loss": 0.0186, + "reward": 9.808823108673096, + "reward_std": 1.0998690476020176, + "rewards/citation_reward_func": 4.821428537368774, + "rewards/correctness_reward_func": 4.336734612782796, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.29931971927483875, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 322 + }, + { + "completion_length": 151.50340016682944, + "epoch": 0.6802386802386803, + "grad_norm": 1.2360163927078247, + "kl": 1.322265625, + "learning_rate": 1e-06, + "loss": 0.0162, + "reward": 7.388632615407308, + "reward_std": 1.9448821544647217, + "rewards/citation_reward_func": 4.3792515595753985, + "rewards/correctness_reward_func": 2.7721087535222373, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.557823121547699, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49917681018511456, + "step": 323 + }, + { + "completion_length": 153.70748138427734, + "epoch": 0.6823446823446824, + "grad_norm": 1.0694539546966553, + "kl": 1.0631510416666667, + "learning_rate": 1e-06, + "loss": 0.0137, + "reward": 6.286017080148061, + "reward_std": 3.029209574063619, + "rewards/citation_reward_func": 4.625850200653076, + "rewards/correctness_reward_func": 2.3129251102606454, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.9183673113584518, + "rewards/penalize_wrong_passages_reward_func": -0.5578231240312258, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4934999297062556, + "step": 324 + }, + { + "completion_length": 138.7653020222982, + "epoch": 0.6844506844506845, + "grad_norm": 2.301809549331665, + "kl": 1.3697916666666667, + "learning_rate": 1e-06, + "loss": 0.0154, + "reward": 8.145108779271444, + "reward_std": 2.592639982700348, + "rewards/citation_reward_func": 4.88095235824585, + "rewards/correctness_reward_func": 3.1462584336598716, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -0.6258503198623657, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4988502711057663, + "step": 325 + }, + { + "completion_length": 150.5816281636556, + "epoch": 0.6865566865566866, + "grad_norm": 1.4067254066467285, + "kl": 1.146484375, + "learning_rate": 1e-06, + "loss": 0.015, + "reward": 6.889067967732747, + "reward_std": 3.112168868382772, + "rewards/citation_reward_func": 4.685374021530151, + "rewards/correctness_reward_func": 3.1802720030148826, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -1.275510181983312, + "rewards/penalize_wrong_passages_reward_func": -0.6870748164753119, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.491108775138855, + "step": 326 + }, + { + "completion_length": 146.06802368164062, + "epoch": 0.6886626886626886, + "grad_norm": 1.3995894193649292, + "kl": 1.0501302083333333, + "learning_rate": 1e-06, + "loss": 0.0143, + "reward": 8.407578070958456, + "reward_std": 1.973113218943278, + "rewards/citation_reward_func": 4.812925020853679, + "rewards/correctness_reward_func": 3.452380895614624, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.459183673063914, + "rewards/penalize_wrong_passages_reward_func": -0.3945578138033549, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49771422147750854, + "step": 327 + }, + { + "completion_length": 165.10543823242188, + "epoch": 0.6907686907686907, + "grad_norm": 1.1846466064453125, + "kl": 1.28515625, + "learning_rate": 1e-06, + "loss": 0.0138, + "reward": 5.958414912223816, + "reward_std": 3.437282919883728, + "rewards/citation_reward_func": 4.20918349424998, + "rewards/correctness_reward_func": 2.1938775181770325, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.40816326439380646, + "rewards/penalize_wrong_passages_reward_func": -0.8571428457895914, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4924285064140956, + "step": 328 + }, + { + "completion_length": 147.21768442789713, + "epoch": 0.6928746928746928, + "grad_norm": 1.1162633895874023, + "kl": 1.1796875, + "learning_rate": 1e-06, + "loss": 0.0138, + "reward": 7.48524824778239, + "reward_std": 1.8048174877961476, + "rewards/citation_reward_func": 4.515305995941162, + "rewards/correctness_reward_func": 2.9081631700197854, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.3571428507566452, + "rewards/penalize_wrong_passages_reward_func": -0.5782312899827957, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49715299407641095, + "step": 329 + }, + { + "completion_length": 148.9251708984375, + "epoch": 0.694980694980695, + "grad_norm": 0.6264429092407227, + "kl": 0.92578125, + "learning_rate": 1e-06, + "loss": 0.013, + "reward": 9.310523192087809, + "reward_std": 1.478491594394048, + "rewards/citation_reward_func": 4.812925100326538, + "rewards/correctness_reward_func": 3.7244897286097207, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.17687074281275272, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 330 + }, + { + "completion_length": 164.04080963134766, + "epoch": 0.6970866970866971, + "grad_norm": 1.2959609031677246, + "kl": 1.0891927083333333, + "learning_rate": 1e-06, + "loss": 0.0142, + "reward": 8.000187873840332, + "reward_std": 2.20181867480278, + "rewards/citation_reward_func": 4.390589475631714, + "rewards/correctness_reward_func": 3.503401279449463, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -0.6326530426740646, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49565299848715466, + "step": 331 + }, + { + "completion_length": 225.14625549316406, + "epoch": 0.6991926991926992, + "grad_norm": 3.462467670440674, + "kl": 0.9622395833333334, + "learning_rate": 1e-06, + "loss": 0.0096, + "reward": 2.013375143210093, + "reward_std": 3.3742276430130005, + "rewards/citation_reward_func": 2.8684807221094766, + "rewards/correctness_reward_func": 0.6972788994510969, + "rewards/formatting_reward_func": 0.4931972821553548, + "rewards/length_reward_func": -0.7142857064803442, + "rewards/penalize_wrong_passages_reward_func": -1.8231291969617207, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49183327456315357, + "step": 332 + }, + { + "completion_length": 226.3333282470703, + "epoch": 0.7012987012987013, + "grad_norm": 1.2166374921798706, + "kl": 1.123046875, + "learning_rate": 1e-06, + "loss": 0.0112, + "reward": 2.068095083038012, + "reward_std": 3.714888095855713, + "rewards/citation_reward_func": 3.129251480102539, + "rewards/correctness_reward_func": 0.9013605018456777, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.6122448990742365, + "rewards/penalize_wrong_passages_reward_func": -2.1700679858525596, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4932652513186137, + "step": 333 + }, + { + "completion_length": 205.5408172607422, + "epoch": 0.7034047034047034, + "grad_norm": 1.0852947235107422, + "kl": 2.6360677083333335, + "learning_rate": 1e-06, + "loss": 0.0264, + "reward": 2.4366893072923026, + "reward_std": 4.2191972732543945, + "rewards/citation_reward_func": 3.3219953378041587, + "rewards/correctness_reward_func": 1.2244897385438283, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -1.836734692255656, + "rewards/penalize_wrong_passages_reward_func": -1.272108832995097, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49904755254586536, + "step": 334 + }, + { + "completion_length": 173.92516581217447, + "epoch": 0.7055107055107055, + "grad_norm": 0.9657922983169556, + "kl": 0.8541666666666666, + "learning_rate": 1e-06, + "loss": 0.0085, + "reward": 2.525939663251241, + "reward_std": 2.653134822845459, + "rewards/citation_reward_func": 3.7018140157063804, + "rewards/correctness_reward_func": 1.037414940694968, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -2.836734632651011, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 335 + }, + { + "completion_length": 211.1088409423828, + "epoch": 0.7076167076167076, + "grad_norm": 0.9506802558898926, + "kl": 0.7936197916666666, + "learning_rate": 1e-06, + "loss": 0.0079, + "reward": 3.7129976749420166, + "reward_std": 3.488153060277303, + "rewards/citation_reward_func": 3.191610018412272, + "rewards/correctness_reward_func": 1.258503367503484, + "rewards/formatting_reward_func": 0.4897959182659785, + "rewards/length_reward_func": -0.6632653127113978, + "rewards/penalize_wrong_passages_reward_func": -1.0544217626253765, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4907754609982173, + "step": 336 + }, + { + "completion_length": 233.43536631266275, + "epoch": 0.7097227097227097, + "grad_norm": 1.0683774948120117, + "kl": 0.7545572916666666, + "learning_rate": 1e-06, + "loss": 0.0075, + "reward": 4.680926203727722, + "reward_std": 3.037352502346039, + "rewards/citation_reward_func": 3.0215419133504233, + "rewards/correctness_reward_func": 2.1938775132099786, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.5102040767669678, + "rewards/penalize_wrong_passages_reward_func": -1.013605425755183, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49441831310590106, + "step": 337 + }, + { + "completion_length": 199.06122080485025, + "epoch": 0.7118287118287119, + "grad_norm": 1.2184964418411255, + "kl": 0.80078125, + "learning_rate": 1e-06, + "loss": 0.008, + "reward": 0.370614451666673, + "reward_std": 3.0889535943667092, + "rewards/citation_reward_func": 3.202947735786438, + "rewards/correctness_reward_func": 2.8231292019287744, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -6.333333174387614, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.500999927520752, + "step": 338 + }, + { + "completion_length": 206.4489771525065, + "epoch": 0.713934713934714, + "grad_norm": 0.8987395167350769, + "kl": 0.8020833333333334, + "learning_rate": 1e-06, + "loss": 0.008, + "reward": 2.939191480477651, + "reward_std": 3.4129937092463174, + "rewards/citation_reward_func": 3.497732241948446, + "rewards/correctness_reward_func": 3.1632652282714844, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -4.299319674571355, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4924795428911845, + "step": 339 + }, + { + "completion_length": 183.36734517415366, + "epoch": 0.7160407160407161, + "grad_norm": 2.6049094200134277, + "kl": 1.048828125, + "learning_rate": 1e-06, + "loss": 0.0105, + "reward": 6.931339343388875, + "reward_std": 2.379570245742798, + "rewards/citation_reward_func": 3.713151772816976, + "rewards/correctness_reward_func": 2.9421768337488174, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.3571428606907527, + "rewards/penalize_wrong_passages_reward_func": -0.36054421216249466, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4953978955745697, + "step": 340 + }, + { + "completion_length": 205.87074279785156, + "epoch": 0.7181467181467182, + "grad_norm": 0.9560754895210266, + "kl": 0.865234375, + "learning_rate": 1e-06, + "loss": 0.0087, + "reward": 4.302693744500478, + "reward_std": 3.5764169692993164, + "rewards/citation_reward_func": 3.928571343421936, + "rewards/correctness_reward_func": 0.9523809428016344, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.459183673063914, + "rewards/penalize_wrong_passages_reward_func": -0.7755101751536131, + "rewards/unicode_reward_func": -0.3401360511779785, + "rewards/xmlcount_reward_func": 0.4982720414797465, + "step": 341 + }, + { + "completion_length": 174.1190439860026, + "epoch": 0.7202527202527202, + "grad_norm": 1.4415292739868164, + "kl": 0.87890625, + "learning_rate": 1e-06, + "loss": 0.009, + "reward": 1.8660724461078644, + "reward_std": 2.3170458575089774, + "rewards/citation_reward_func": 4.229024728139241, + "rewards/correctness_reward_func": 1.139455755551656, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -4.2925169467926025, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 342 + }, + { + "completion_length": 235.49319203694662, + "epoch": 0.7223587223587223, + "grad_norm": 1.1462864875793457, + "kl": 0.6555989583333334, + "learning_rate": 1e-06, + "loss": 0.0066, + "reward": 1.1734318683544795, + "reward_std": 3.075716018676758, + "rewards/citation_reward_func": 2.431972603003184, + "rewards/correctness_reward_func": 1.1394557654857635, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -3.0340135296185813, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4965611646572749, + "step": 343 + }, + { + "completion_length": 223.60543823242188, + "epoch": 0.7244647244647244, + "grad_norm": 1.000259518623352, + "kl": 1.1751302083333333, + "learning_rate": 1e-06, + "loss": 0.0119, + "reward": 4.428008993466695, + "reward_std": 2.8102714816729226, + "rewards/citation_reward_func": 3.4070293505986533, + "rewards/correctness_reward_func": 0.9693877349297205, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.734693855047226, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49206797281901044, + "step": 344 + }, + { + "completion_length": 223.97958628336588, + "epoch": 0.7265707265707265, + "grad_norm": 0.7306500673294067, + "kl": 1.046875, + "learning_rate": 1e-06, + "loss": 0.0105, + "reward": 3.538080414136251, + "reward_std": 2.771793325742086, + "rewards/citation_reward_func": 3.253968079884847, + "rewards/correctness_reward_func": 1.037414958079656, + "rewards/formatting_reward_func": 0.49149659276008606, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -1.4285713980595272, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48989449938138324, + "step": 345 + }, + { + "completion_length": 184.20747884114584, + "epoch": 0.7286767286767287, + "grad_norm": 0.9912636876106262, + "kl": 0.8352864583333334, + "learning_rate": 1e-06, + "loss": 0.0084, + "reward": 3.882302482922872, + "reward_std": 2.212656001249949, + "rewards/citation_reward_func": 3.027210831642151, + "rewards/correctness_reward_func": 0.8673469126224518, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.8571428457895914, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4979489247004191, + "step": 346 + }, + { + "completion_length": 189.8333282470703, + "epoch": 0.7307827307827308, + "grad_norm": 0.8739500641822815, + "kl": 0.9264322916666666, + "learning_rate": 1e-06, + "loss": 0.0092, + "reward": 4.077586094538371, + "reward_std": 2.7235729893048606, + "rewards/citation_reward_func": 3.3446712493896484, + "rewards/correctness_reward_func": 1.0884353518486023, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.40816325942675274, + "rewards/penalize_wrong_passages_reward_func": -0.9387754847606024, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49311897655328113, + "step": 347 + }, + { + "completion_length": 161.45237731933594, + "epoch": 0.7328887328887329, + "grad_norm": 1.0412046909332275, + "kl": 1.0709635416666667, + "learning_rate": 1e-06, + "loss": 0.0107, + "reward": 3.8484999338785806, + "reward_std": 1.791780486702919, + "rewards/citation_reward_func": 3.4523807366689048, + "rewards/correctness_reward_func": 1.2244897882143657, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.05102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.6054421067237854, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49815979103247326, + "step": 348 + }, + { + "completion_length": 206.01020050048828, + "epoch": 0.734994734994735, + "grad_norm": 1.1982636451721191, + "kl": 0.875, + "learning_rate": 1e-06, + "loss": 0.0087, + "reward": 4.812347888946533, + "reward_std": 3.0765510201454163, + "rewards/citation_reward_func": 3.327664375305176, + "rewards/correctness_reward_func": 1.6666666269302368, + "rewards/formatting_reward_func": 0.49149659276008606, + "rewards/length_reward_func": -0.3571428507566452, + "rewards/penalize_wrong_passages_reward_func": -0.8027210583289465, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4863843123118083, + "step": 349 + }, + { + "completion_length": 197.74149068196616, + "epoch": 0.7371007371007371, + "grad_norm": 5.780428886413574, + "kl": 1.513671875, + "learning_rate": 1e-06, + "loss": 0.0153, + "reward": 2.651944398880005, + "reward_std": 4.0778325001398725, + "rewards/citation_reward_func": 3.0498865842819214, + "rewards/correctness_reward_func": 0.7482993106047312, + "rewards/formatting_reward_func": 0.4931972821553548, + "rewards/length_reward_func": -0.6122448941071829, + "rewards/penalize_wrong_passages_reward_func": -1.3401360313097637, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4830101529757182, + "step": 350 + }, + { + "completion_length": 200.83673350016275, + "epoch": 0.7392067392067392, + "grad_norm": 19.824453353881836, + "kl": 8.28125, + "learning_rate": 1e-06, + "loss": 0.0824, + "reward": 1.8109624261657398, + "reward_std": 3.1978684663772583, + "rewards/citation_reward_func": 3.6904759804407754, + "rewards/correctness_reward_func": 1.224489763379097, + "rewards/formatting_reward_func": 0.49574829638004303, + "rewards/length_reward_func": -0.6122448990742365, + "rewards/penalize_wrong_passages_reward_func": -3.4829931457837424, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4954863339662552, + "step": 351 + }, + { + "completion_length": 184.20747884114584, + "epoch": 0.7413127413127413, + "grad_norm": 0.9565224647521973, + "kl": 0.8385416666666666, + "learning_rate": 1e-06, + "loss": 0.0084, + "reward": -0.03650915250182152, + "reward_std": 2.751798431078593, + "rewards/citation_reward_func": 2.851473848025004, + "rewards/correctness_reward_func": 2.363945504029592, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.30612244705359143, + "rewards/penalize_wrong_passages_reward_func": -5.931972761948903, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4912686546643575, + "step": 352 + }, + { + "completion_length": 198.51360066731772, + "epoch": 0.7434187434187434, + "grad_norm": 1.1145179271697998, + "kl": 0.775390625, + "learning_rate": 1e-06, + "loss": 0.0082, + "reward": 3.776633802180489, + "reward_std": 2.4671822786331177, + "rewards/citation_reward_func": 3.4297050635019937, + "rewards/correctness_reward_func": 2.0918366561333337, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -2.333333301047484, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4982890437046687, + "step": 353 + }, + { + "completion_length": 240.19046783447266, + "epoch": 0.7455247455247456, + "grad_norm": 0.906201958656311, + "kl": 0.72265625, + "learning_rate": 1e-06, + "loss": 0.0072, + "reward": 2.3721699317296348, + "reward_std": 3.7090295950571694, + "rewards/citation_reward_func": 2.976190368334452, + "rewards/correctness_reward_func": 0.9183673212925593, + "rewards/formatting_reward_func": 0.4863945593436559, + "rewards/length_reward_func": -0.8163265238205591, + "rewards/penalize_wrong_passages_reward_func": -1.673469364643097, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.481013556321462, + "step": 354 + }, + { + "completion_length": 189.56122080485025, + "epoch": 0.7476307476307477, + "grad_norm": 32.47963333129883, + "kl": 1.783203125, + "learning_rate": 1e-06, + "loss": 0.0178, + "reward": 2.647131323814392, + "reward_std": 3.3386093378067017, + "rewards/citation_reward_func": 3.5260767141977944, + "rewards/correctness_reward_func": 1.9217686653137207, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.40816326439380646, + "rewards/penalize_wrong_passages_reward_func": -3.204081575075785, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.48499993483225506, + "step": 355 + }, + { + "completion_length": 194.7448959350586, + "epoch": 0.7497367497367498, + "grad_norm": 1.9522308111190796, + "kl": 0.873046875, + "learning_rate": 1e-06, + "loss": 0.0086, + "reward": 0.12059734265009563, + "reward_std": 2.213576853275299, + "rewards/citation_reward_func": 3.2879815896352134, + "rewards/correctness_reward_func": 1.734693835179011, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -5.530612190564473, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49247953792413074, + "step": 356 + }, + { + "completion_length": 212.523806254069, + "epoch": 0.7518427518427518, + "grad_norm": 0.8430114984512329, + "kl": 0.9361979166666666, + "learning_rate": 1e-06, + "loss": 0.0098, + "reward": 4.969536264737447, + "reward_std": 3.3358057339986167, + "rewards/citation_reward_func": 3.3049886226654053, + "rewards/correctness_reward_func": 2.3469387094179788, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.40816326439380646, + "rewards/penalize_wrong_passages_reward_func": -1.0884353419144948, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4910781780878703, + "step": 357 + }, + { + "completion_length": 193.30611928304037, + "epoch": 0.7539487539487539, + "grad_norm": 0.9406774640083313, + "kl": 0.8352864583333334, + "learning_rate": 1e-06, + "loss": 0.0084, + "reward": 3.408174604177475, + "reward_std": 2.1897811690966287, + "rewards/citation_reward_func": 3.3106573820114136, + "rewards/correctness_reward_func": 1.1054421464602153, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -1.5986394279946883, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49887748062610626, + "step": 358 + }, + { + "completion_length": 221.0544204711914, + "epoch": 0.756054756054756, + "grad_norm": 0.7500853538513184, + "kl": 0.6780598958333334, + "learning_rate": 1e-06, + "loss": 0.0072, + "reward": 6.953930695851644, + "reward_std": 3.0307931105295816, + "rewards/citation_reward_func": 3.5941041310628257, + "rewards/correctness_reward_func": 3.8435373107592263, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.5102040867010752, + "rewards/penalize_wrong_passages_reward_func": -0.9591836531956991, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.492479532957077, + "step": 359 + }, + { + "completion_length": 185.12244669596353, + "epoch": 0.7581607581607581, + "grad_norm": 0.8867349624633789, + "kl": 0.6803385416666666, + "learning_rate": 1e-06, + "loss": 0.0069, + "reward": 5.780478318532308, + "reward_std": 2.767763157685598, + "rewards/citation_reward_func": 3.4920632441838584, + "rewards/correctness_reward_func": 2.9761904080708823, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408162971337637, + "rewards/penalize_wrong_passages_reward_func": -1.4829931457837422, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929924805959064, + "step": 360 + }, + { + "completion_length": 187.57142639160156, + "epoch": 0.7602667602667603, + "grad_norm": 0.9376119375228882, + "kl": 0.8313802083333334, + "learning_rate": 1e-06, + "loss": 0.0084, + "reward": 3.219564517339071, + "reward_std": 2.914902945359548, + "rewards/citation_reward_func": 3.5204078753789267, + "rewards/correctness_reward_func": 1.3775509893894196, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -2.244897892077764, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4950747738281886, + "step": 361 + }, + { + "completion_length": 211.78230794270834, + "epoch": 0.7623727623727624, + "grad_norm": 0.9319391250610352, + "kl": 0.71484375, + "learning_rate": 1e-06, + "loss": 0.0071, + "reward": 4.074728965759277, + "reward_std": 3.0338656107584634, + "rewards/citation_reward_func": 3.038548747698466, + "rewards/correctness_reward_func": 1.54761899014314, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.5102040767669678, + "rewards/penalize_wrong_passages_reward_func": -0.9863945345083872, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49026183784008026, + "step": 362 + }, + { + "completion_length": 213.2789103190104, + "epoch": 0.7644787644787645, + "grad_norm": 0.9279811978340149, + "kl": 0.755859375, + "learning_rate": 1e-06, + "loss": 0.0076, + "reward": 4.575824022293091, + "reward_std": 3.2141083478927612, + "rewards/citation_reward_func": 2.7154194513956704, + "rewards/correctness_reward_func": 2.755101978778839, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.45918366809686023, + "rewards/penalize_wrong_passages_reward_func": -1.4285714030265808, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49645912647247314, + "step": 363 + }, + { + "completion_length": 193.82312774658203, + "epoch": 0.7665847665847666, + "grad_norm": 7.441447734832764, + "kl": 1.0807291666666667, + "learning_rate": 1e-06, + "loss": 0.0109, + "reward": 5.427708625793457, + "reward_std": 2.763730764389038, + "rewards/citation_reward_func": 3.191609819730123, + "rewards/correctness_reward_func": 2.6700679659843445, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.30612244705359143, + "rewards/penalize_wrong_passages_reward_func": -1.1224489510059357, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49630266427993774, + "step": 364 + }, + { + "completion_length": 184.27210744222006, + "epoch": 0.7686907686907687, + "grad_norm": 1.2132267951965332, + "kl": 0.900390625, + "learning_rate": 1e-06, + "loss": 0.009, + "reward": 6.065507729848226, + "reward_std": 2.7998267809549966, + "rewards/citation_reward_func": 3.2426302433013916, + "rewards/correctness_reward_func": 2.9931972324848175, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.4591836780309677, + "rewards/penalize_wrong_passages_reward_func": -0.7074829886356989, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4980475604534149, + "step": 365 + }, + { + "completion_length": 215.0782267252604, + "epoch": 0.7707967707967708, + "grad_norm": 0.9835836887359619, + "kl": 0.7194010416666666, + "learning_rate": 1e-06, + "loss": 0.0072, + "reward": 3.548775384823481, + "reward_std": 3.152905980745951, + "rewards/citation_reward_func": 3.4013604720433555, + "rewards/correctness_reward_func": 1.1734693795442581, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.40816325942675274, + "rewards/penalize_wrong_passages_reward_func": -1.612244874238968, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4960543711980184, + "step": 366 + }, + { + "completion_length": 188.61224110921225, + "epoch": 0.7729027729027729, + "grad_norm": 0.9326059818267822, + "kl": 0.8587239583333334, + "learning_rate": 1e-06, + "loss": 0.0086, + "reward": 4.506099636356036, + "reward_std": 3.020311951637268, + "rewards/citation_reward_func": 3.1065758069356284, + "rewards/correctness_reward_func": 2.3469387690226235, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -1.6734693696101506, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4981631984313329, + "step": 367 + }, + { + "completion_length": 208.7823053995768, + "epoch": 0.775008775008775, + "grad_norm": 0.9097429513931274, + "kl": 0.783203125, + "learning_rate": 1e-06, + "loss": 0.0078, + "reward": 4.044582664966583, + "reward_std": 3.683608889579773, + "rewards/citation_reward_func": 3.0215417941411338, + "rewards/correctness_reward_func": 1.8537414570649464, + "rewards/formatting_reward_func": 0.49149659276008606, + "rewards/length_reward_func": -0.40816326439380646, + "rewards/penalize_wrong_passages_reward_func": -1.4013605117797852, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4873264779647191, + "step": 368 + }, + { + "completion_length": 165.74829483032227, + "epoch": 0.7771147771147772, + "grad_norm": 1.2218385934829712, + "kl": 0.998046875, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 4.376276512940724, + "reward_std": 2.677362342675527, + "rewards/citation_reward_func": 3.684807022412618, + "rewards/correctness_reward_func": 2.1768707036972046, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -2.2244898260881505, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.497591773668925, + "step": 369 + }, + { + "completion_length": 223.14625295003256, + "epoch": 0.7792207792207793, + "grad_norm": 0.9640951156616211, + "kl": 0.7174479166666666, + "learning_rate": 1e-06, + "loss": 0.0072, + "reward": 2.730006674925486, + "reward_std": 3.398833155632019, + "rewards/citation_reward_func": 3.5714284578959146, + "rewards/correctness_reward_func": 2.040816237529119, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.5102040867010752, + "rewards/penalize_wrong_passages_reward_func": -3.3605441451072693, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49191151559352875, + "step": 370 + }, + { + "completion_length": 179.43537139892578, + "epoch": 0.7813267813267813, + "grad_norm": 2.156036138534546, + "kl": 1.0071614583333333, + "learning_rate": 1e-06, + "loss": 0.0101, + "reward": 5.707685788472493, + "reward_std": 1.7288099726041157, + "rewards/citation_reward_func": 3.1689340273539224, + "rewards/correctness_reward_func": 2.0918366784850755, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -0.5510204037030538, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4979353149731954, + "step": 371 + }, + { + "completion_length": 194.06802368164062, + "epoch": 0.7834327834327834, + "grad_norm": 1.0008419752120972, + "kl": 1.0358072916666667, + "learning_rate": 1e-06, + "loss": 0.0104, + "reward": 3.9335667292277017, + "reward_std": 2.542555034160614, + "rewards/citation_reward_func": 2.9138320287068686, + "rewards/correctness_reward_func": 1.4455781827370326, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -1.2176870505015056, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49762578308582306, + "step": 372 + }, + { + "completion_length": 169.0952351888021, + "epoch": 0.7855387855387855, + "grad_norm": 0.9800570607185364, + "kl": 1.0807291666666667, + "learning_rate": 1e-06, + "loss": 0.0109, + "reward": 4.42509392897288, + "reward_std": 2.331450124581655, + "rewards/citation_reward_func": 3.2199544509251914, + "rewards/correctness_reward_func": 1.7176870008309681, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -1.2517006695270538, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49425504604975384, + "step": 373 + }, + { + "completion_length": 196.67686716715494, + "epoch": 0.7876447876447876, + "grad_norm": 0.8940144181251526, + "kl": 0.865234375, + "learning_rate": 1e-06, + "loss": 0.0087, + "reward": 5.650235652923584, + "reward_std": 2.8446309566497803, + "rewards/citation_reward_func": 3.1916098992029824, + "rewards/correctness_reward_func": 2.8911564151446023, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -1.319727857907613, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4909387181202571, + "step": 374 + }, + { + "completion_length": 211.76190185546875, + "epoch": 0.7897507897507897, + "grad_norm": 0.8513844013214111, + "kl": 0.8001302083333334, + "learning_rate": 1e-06, + "loss": 0.008, + "reward": 3.264024794101715, + "reward_std": 3.1702443758646646, + "rewards/citation_reward_func": 2.494330883026123, + "rewards/correctness_reward_func": 1.6836734414100647, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.3571428606907527, + "rewards/penalize_wrong_passages_reward_func": -1.5442176659901936, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49418361981709796, + "step": 375 + }, + { + "completion_length": 204.04761505126953, + "epoch": 0.7918567918567918, + "grad_norm": 1.1359713077545166, + "kl": 0.865234375, + "learning_rate": 1e-06, + "loss": 0.0086, + "reward": 3.6691199938456216, + "reward_std": 3.034039258956909, + "rewards/citation_reward_func": 3.1575962702433267, + "rewards/correctness_reward_func": 1.2414965629577637, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.5102040817340215, + "rewards/penalize_wrong_passages_reward_func": -1.2040815949440002, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49111558496952057, + "step": 376 + }, + { + "completion_length": 187.55101776123047, + "epoch": 0.793962793962794, + "grad_norm": 0.8377850651741028, + "kl": 1.251953125, + "learning_rate": 1e-06, + "loss": 0.0129, + "reward": 3.168164153893789, + "reward_std": 2.8421239455540976, + "rewards/citation_reward_func": 3.3446709712346396, + "rewards/correctness_reward_func": 1.751700629790624, + "rewards/formatting_reward_func": 0.4948979616165161, + "rewards/length_reward_func": -0.510204071799914, + "rewards/penalize_wrong_passages_reward_func": -2.4013604819774628, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48845912516117096, + "step": 377 + }, + { + "completion_length": 180.3537394205729, + "epoch": 0.7960687960687961, + "grad_norm": 1.4555526971817017, + "kl": 1.087890625, + "learning_rate": 1e-06, + "loss": 0.0111, + "reward": 5.376014610131581, + "reward_std": 3.2300790747006736, + "rewards/citation_reward_func": 3.6678003072738647, + "rewards/correctness_reward_func": 2.2789114912350974, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.40816326439380646, + "rewards/penalize_wrong_passages_reward_func": -0.9863945270578066, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4956291963656743, + "step": 378 + }, + { + "completion_length": 228.18026733398438, + "epoch": 0.7981747981747982, + "grad_norm": 14.021940231323242, + "kl": 2.5520833333333335, + "learning_rate": 1e-06, + "loss": 0.0255, + "reward": 3.4462526788314185, + "reward_std": 3.789495587348938, + "rewards/citation_reward_func": 3.2426303227742515, + "rewards/correctness_reward_func": 2.1768706937630973, + "rewards/formatting_reward_func": 0.4829931954542796, + "rewards/length_reward_func": -0.9693877498308817, + "rewards/penalize_wrong_passages_reward_func": -1.9659863710403442, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.479132612546285, + "step": 379 + }, + { + "completion_length": 184.55441538492838, + "epoch": 0.8002808002808003, + "grad_norm": 2.2284538745880127, + "kl": 1.2708333333333333, + "learning_rate": 1e-06, + "loss": 0.0127, + "reward": 4.8683435916900635, + "reward_std": 2.3631349007288613, + "rewards/citation_reward_func": 3.5884350538253784, + "rewards/correctness_reward_func": 1.5986394186814625, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -1.0612244804700215, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49929585059483844, + "step": 380 + }, + { + "completion_length": 191.6088409423828, + "epoch": 0.8023868023868024, + "grad_norm": 1.0186786651611328, + "kl": 1.0078125, + "learning_rate": 1e-06, + "loss": 0.0101, + "reward": 4.85123352209727, + "reward_std": 1.894408146540324, + "rewards/citation_reward_func": 3.424036184946696, + "rewards/correctness_reward_func": 1.1054421464602153, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.4693877498308818, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49692511558532715, + "step": 381 + }, + { + "completion_length": 179.3469327290853, + "epoch": 0.8044928044928045, + "grad_norm": 1.5976791381835938, + "kl": 1.2115885416666667, + "learning_rate": 1e-06, + "loss": 0.0126, + "reward": 5.690453330675761, + "reward_std": 2.155216157436371, + "rewards/citation_reward_func": 3.888888637224833, + "rewards/correctness_reward_func": 1.7687074343363445, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -0.5510203925271829, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49374144275983173, + "step": 382 + }, + { + "completion_length": 166.1700642903646, + "epoch": 0.8065988065988066, + "grad_norm": 1.177154541015625, + "kl": 1.2799479166666667, + "learning_rate": 1e-06, + "loss": 0.0128, + "reward": 3.771610975265503, + "reward_std": 2.488835245370865, + "rewards/citation_reward_func": 3.543083747227987, + "rewards/correctness_reward_func": 0.6802720998724302, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -1.0612244606018066, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.48362920184930164, + "step": 383 + }, + { + "completion_length": 202.77210744222006, + "epoch": 0.8087048087048087, + "grad_norm": 1.2002400159835815, + "kl": 0.9446614583333334, + "learning_rate": 1e-06, + "loss": 0.0094, + "reward": 3.114571273326874, + "reward_std": 2.7324097553888955, + "rewards/citation_reward_func": 3.282312830289205, + "rewards/correctness_reward_func": 1.3265305906534195, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.3571428606907527, + "rewards/penalize_wrong_passages_reward_func": -2.1224488814671836, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49042171239852905, + "step": 384 + }, + { + "completion_length": 186.28911336263022, + "epoch": 0.8108108108108109, + "grad_norm": 1.269479513168335, + "kl": 1.3678385416666667, + "learning_rate": 1e-06, + "loss": 0.0139, + "reward": 5.498821934064229, + "reward_std": 2.9702013532320657, + "rewards/citation_reward_func": 3.628117561340332, + "rewards/correctness_reward_func": 3.010203997294108, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -1.7619047115246456, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49825504422187805, + "step": 385 + }, + { + "completion_length": 204.0544179280599, + "epoch": 0.812916812916813, + "grad_norm": 1.0071189403533936, + "kl": 0.77734375, + "learning_rate": 1e-06, + "loss": 0.008, + "reward": 5.695616881052653, + "reward_std": 3.5208513736724854, + "rewards/citation_reward_func": 3.395691474278768, + "rewards/correctness_reward_func": 2.551020324230194, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -0.8775510092576345, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48870062331358594, + "step": 386 + }, + { + "completion_length": 208.5952351888021, + "epoch": 0.815022815022815, + "grad_norm": 0.9028397798538208, + "kl": 0.9401041666666666, + "learning_rate": 1e-06, + "loss": 0.0099, + "reward": 5.026899019877116, + "reward_std": 2.7452229261398315, + "rewards/citation_reward_func": 3.2426302433013916, + "rewards/correctness_reward_func": 2.0578230718771615, + "rewards/formatting_reward_func": 0.4931972821553548, + "rewards/length_reward_func": -0.3571428606907527, + "rewards/penalize_wrong_passages_reward_func": -0.8979591429233551, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4883502970139186, + "step": 387 + }, + { + "completion_length": 241.89115142822266, + "epoch": 0.8171288171288171, + "grad_norm": 2.2114672660827637, + "kl": 1.0813802083333333, + "learning_rate": 1e-06, + "loss": 0.0108, + "reward": 3.6548535426457724, + "reward_std": 4.8498828411102295, + "rewards/citation_reward_func": 3.180272022883097, + "rewards/correctness_reward_func": 1.6156462331612904, + "rewards/formatting_reward_func": 0.4880952388048172, + "rewards/length_reward_func": -0.8673469424247742, + "rewards/penalize_wrong_passages_reward_func": -1.231292486190796, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4694795409838359, + "step": 388 + }, + { + "completion_length": 178.46258036295572, + "epoch": 0.8192348192348192, + "grad_norm": 0.9208208322525024, + "kl": 0.9479166666666666, + "learning_rate": 1e-06, + "loss": 0.0099, + "reward": 5.69873583316803, + "reward_std": 2.291304608186086, + "rewards/citation_reward_func": 3.4807255268096924, + "rewards/correctness_reward_func": 2.108843465646108, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -0.7823129296302795, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49522102375825244, + "step": 389 + }, + { + "completion_length": 221.82652537027994, + "epoch": 0.8213408213408213, + "grad_norm": 1.2106159925460815, + "kl": 0.7799479166666666, + "learning_rate": 1e-06, + "loss": 0.0078, + "reward": 2.308008924126625, + "reward_std": 3.1861923933029175, + "rewards/citation_reward_func": 3.4410430192947388, + "rewards/correctness_reward_func": 1.8027210632960002, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.45918366809686023, + "rewards/penalize_wrong_passages_reward_func": -3.4557822545369468, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48261219759782154, + "step": 390 + }, + { + "completion_length": 183.39115142822266, + "epoch": 0.8234468234468234, + "grad_norm": 0.8995904326438904, + "kl": 0.9108072916666666, + "learning_rate": 1e-06, + "loss": 0.0091, + "reward": 4.622468153635661, + "reward_std": 3.1639206409454346, + "rewards/citation_reward_func": 3.1009069283803306, + "rewards/correctness_reward_func": 2.0408162077267966, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -1.1564625600973766, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49435028930505115, + "step": 391 + }, + { + "completion_length": 168.17686716715494, + "epoch": 0.8255528255528255, + "grad_norm": 1.2364623546600342, + "kl": 1.05078125, + "learning_rate": 1e-06, + "loss": 0.0107, + "reward": 5.528898000717163, + "reward_std": 2.4589553276697793, + "rewards/citation_reward_func": 3.282312790552775, + "rewards/correctness_reward_func": 2.6020407676696777, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -1.251700629790624, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4982856512069702, + "step": 392 + }, + { + "completion_length": 186.74829610188803, + "epoch": 0.8276588276588277, + "grad_norm": 1.070202350616455, + "kl": 1.0299479166666667, + "learning_rate": 1e-06, + "loss": 0.0103, + "reward": 3.7258060773213706, + "reward_std": 3.1855319341023765, + "rewards/citation_reward_func": 3.0782312949498496, + "rewards/correctness_reward_func": 1.3265305558840434, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.5102040817340215, + "rewards/penalize_wrong_passages_reward_func": -1.1632652878761292, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49621422092119855, + "step": 393 + }, + { + "completion_length": 185.4455769856771, + "epoch": 0.8297648297648298, + "grad_norm": 4.5259528160095215, + "kl": 1.0944010416666667, + "learning_rate": 1e-06, + "loss": 0.0111, + "reward": 3.85529363155365, + "reward_std": 3.284583489100138, + "rewards/citation_reward_func": 3.701813896497091, + "rewards/correctness_reward_func": 1.1394557654857635, + "rewards/formatting_reward_func": 0.4948979616165161, + "rewards/length_reward_func": -0.8163265238205591, + "rewards/penalize_wrong_passages_reward_func": -1.156462570031484, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49191490809122723, + "step": 394 + }, + { + "completion_length": 207.25169372558594, + "epoch": 0.8318708318708319, + "grad_norm": 1.1044845581054688, + "kl": 0.9192708333333334, + "learning_rate": 1e-06, + "loss": 0.0093, + "reward": 4.692568937937419, + "reward_std": 2.8676576217015586, + "rewards/citation_reward_func": 3.8038547039031982, + "rewards/correctness_reward_func": 1.7687074492375057, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.4591836780309677, + "rewards/penalize_wrong_passages_reward_func": -1.3945577840010326, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48055095970630646, + "step": 395 + }, + { + "completion_length": 195.43536885579428, + "epoch": 0.833976833976834, + "grad_norm": 12.616693496704102, + "kl": 1.59765625, + "learning_rate": 1e-06, + "loss": 0.016, + "reward": 4.594591736793518, + "reward_std": 3.1584444443384805, + "rewards/citation_reward_func": 3.3333332935969033, + "rewards/correctness_reward_func": 1.8877550462881725, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.3571428606907527, + "rewards/penalize_wrong_passages_reward_func": -1.2585033774375916, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48914961020151776, + "step": 396 + }, + { + "completion_length": 213.34353383382162, + "epoch": 0.8360828360828361, + "grad_norm": 0.9019458293914795, + "kl": 0.8990885416666666, + "learning_rate": 1e-06, + "loss": 0.009, + "reward": 3.3717334220806756, + "reward_std": 2.71410596370697, + "rewards/citation_reward_func": 2.7947846253712973, + "rewards/correctness_reward_func": 1.836734652519226, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.8163265337546667, + "rewards/penalize_wrong_passages_reward_func": -1.435374101003011, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4936155875523885, + "step": 397 + }, + { + "completion_length": 216.26870473225912, + "epoch": 0.8381888381888382, + "grad_norm": 1.1500489711761475, + "kl": 0.837890625, + "learning_rate": 1e-06, + "loss": 0.0084, + "reward": 4.473621169726054, + "reward_std": 3.942271669705709, + "rewards/citation_reward_func": 3.3049885034561157, + "rewards/correctness_reward_func": 2.2278910676638284, + "rewards/formatting_reward_func": 0.4931972821553548, + "rewards/length_reward_func": -0.6632653027772903, + "rewards/penalize_wrong_passages_reward_func": -1.2108843227227528, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49176184336344403, + "step": 398 + }, + { + "completion_length": 202.49659474690756, + "epoch": 0.8402948402948403, + "grad_norm": 0.8359233140945435, + "kl": 1.3411458333333333, + "learning_rate": 1e-06, + "loss": 0.0134, + "reward": 6.837395668029785, + "reward_std": 2.400991588830948, + "rewards/citation_reward_func": 3.667800267537435, + "rewards/correctness_reward_func": 2.925170044104258, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.5918367306391398, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4944251130024592, + "step": 399 + }, + { + "completion_length": 211.3197224934896, + "epoch": 0.8424008424008425, + "grad_norm": 0.8955492377281189, + "kl": 0.9088541666666666, + "learning_rate": 1e-06, + "loss": 0.0091, + "reward": 2.618852473795414, + "reward_std": 3.1600440740585327, + "rewards/citation_reward_func": 2.743764122327169, + "rewards/correctness_reward_func": 0.8843537171681722, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.40816326439380646, + "rewards/penalize_wrong_passages_reward_func": -1.5918367107709248, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49243531624476117, + "step": 400 + }, + { + "completion_length": 226.51700592041016, + "epoch": 0.8445068445068445, + "grad_norm": 1.0206358432769775, + "kl": 0.9166666666666666, + "learning_rate": 1e-06, + "loss": 0.0093, + "reward": 3.305037297308445, + "reward_std": 4.15559458732605, + "rewards/citation_reward_func": 3.3503399888674417, + "rewards/correctness_reward_func": 1.4795918265978496, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.5612244953711828, + "rewards/penalize_wrong_passages_reward_func": -1.7755101919174194, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4836087723573049, + "step": 401 + }, + { + "completion_length": 203.78570810953775, + "epoch": 0.8466128466128466, + "grad_norm": 0.9299895763397217, + "kl": 0.9049479166666666, + "learning_rate": 1e-06, + "loss": 0.0096, + "reward": 5.001368463039398, + "reward_std": 2.560189406077067, + "rewards/citation_reward_func": 3.7358275651931763, + "rewards/correctness_reward_func": 2.0238094528516135, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -1.4489795466264088, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49683327476183575, + "step": 402 + }, + { + "completion_length": 245.96598307291666, + "epoch": 0.8487188487188487, + "grad_norm": 0.9358532428741455, + "kl": 1.001953125, + "learning_rate": 1e-06, + "loss": 0.01, + "reward": 2.2839873830477395, + "reward_std": 3.627606511116028, + "rewards/citation_reward_func": 3.565759460131327, + "rewards/correctness_reward_func": 1.2925169964631398, + "rewards/formatting_reward_func": 0.48894557853539783, + "rewards/length_reward_func": -0.5612244854370753, + "rewards/penalize_wrong_passages_reward_func": -2.9795917669932046, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.47758158047993976, + "step": 403 + }, + { + "completion_length": 272.39795176188153, + "epoch": 0.8508248508248508, + "grad_norm": 2.81453537940979, + "kl": 0.892578125, + "learning_rate": 1e-06, + "loss": 0.0089, + "reward": 1.596479393541813, + "reward_std": 5.596612215042114, + "rewards/citation_reward_func": 2.482993245124817, + "rewards/correctness_reward_func": 1.7857142488161724, + "rewards/formatting_reward_func": 0.47448978821436566, + "rewards/length_reward_func": -1.4285714079936345, + "rewards/penalize_wrong_passages_reward_func": -2.1700679659843445, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4519217262665431, + "step": 404 + }, + { + "completion_length": 266.0340118408203, + "epoch": 0.8529308529308529, + "grad_norm": 0.8979971408843994, + "kl": 0.7799479166666666, + "learning_rate": 1e-06, + "loss": 0.0078, + "reward": 2.1668489103515944, + "reward_std": 3.853674292564392, + "rewards/citation_reward_func": 2.9308391014734902, + "rewards/correctness_reward_func": 1.5646257996559143, + "rewards/formatting_reward_func": 0.48979591329892475, + "rewards/length_reward_func": -0.6122448941071829, + "rewards/penalize_wrong_passages_reward_func": -2.5238095124562583, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4877108285824458, + "step": 405 + }, + { + "completion_length": 244.4931894938151, + "epoch": 0.855036855036855, + "grad_norm": 0.9777901768684387, + "kl": 0.7272135416666666, + "learning_rate": 1e-06, + "loss": 0.0073, + "reward": 1.3467469811439514, + "reward_std": 3.340526362260183, + "rewards/citation_reward_func": 2.7947843869527182, + "rewards/correctness_reward_func": 1.054421752691269, + "rewards/formatting_reward_func": 0.4880952338377635, + "rewards/length_reward_func": -0.5102040767669678, + "rewards/penalize_wrong_passages_reward_func": -2.9659863909085593, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4856359859307607, + "step": 406 + }, + { + "completion_length": 220.93196868896484, + "epoch": 0.8571428571428571, + "grad_norm": 2.2201004028320312, + "kl": 1.3938802083333333, + "learning_rate": 1e-06, + "loss": 0.0143, + "reward": 2.413368208023409, + "reward_std": 3.0608163078626, + "rewards/citation_reward_func": 3.3276642163594565, + "rewards/correctness_reward_func": 1.819727857907613, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.459183673063914, + "rewards/penalize_wrong_passages_reward_func": -3.2585032880306244, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48876525461673737, + "step": 407 + }, + { + "completion_length": 210.1394526163737, + "epoch": 0.8592488592488593, + "grad_norm": 1.1213274002075195, + "kl": 0.9225260416666666, + "learning_rate": 1e-06, + "loss": 0.0092, + "reward": 3.7354998191197715, + "reward_std": 3.206026335557302, + "rewards/citation_reward_func": 3.0612245003382363, + "rewards/correctness_reward_func": 1.564625807106495, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.6632653127113978, + "rewards/penalize_wrong_passages_reward_func": -1.2176870654026668, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4940033406019211, + "step": 408 + }, + { + "completion_length": 221.2993138631185, + "epoch": 0.8613548613548614, + "grad_norm": 1.0960359573364258, + "kl": 0.884765625, + "learning_rate": 1e-06, + "loss": 0.0089, + "reward": 4.068404674530029, + "reward_std": 3.2786128918329873, + "rewards/citation_reward_func": 3.112244804700216, + "rewards/correctness_reward_func": 1.39455779393514, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.6122448990742365, + "rewards/penalize_wrong_passages_reward_func": -0.8095237910747528, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4901734193166097, + "step": 409 + }, + { + "completion_length": 222.9897893269857, + "epoch": 0.8634608634608635, + "grad_norm": 1.0138585567474365, + "kl": 0.9602864583333334, + "learning_rate": 1e-06, + "loss": 0.0096, + "reward": 3.9506948391596475, + "reward_std": 3.6894630193710327, + "rewards/citation_reward_func": 2.647392193476359, + "rewards/correctness_reward_func": 2.789115528265635, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.459183673063914, + "rewards/penalize_wrong_passages_reward_func": -2.0068026582400003, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48697614173094433, + "step": 410 + }, + { + "completion_length": 196.2789052327474, + "epoch": 0.8655668655668656, + "grad_norm": 0.9708966016769409, + "kl": 0.8671875, + "learning_rate": 1e-06, + "loss": 0.0092, + "reward": 4.430862545967102, + "reward_std": 2.7824105819066367, + "rewards/citation_reward_func": 3.815192619959513, + "rewards/correctness_reward_func": 2.9421767791112265, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -3.006802717844645, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4881189912557602, + "step": 411 + }, + { + "completion_length": 187.4183603922526, + "epoch": 0.8676728676728677, + "grad_norm": 0.6679948568344116, + "kl": 0.9576822916666666, + "learning_rate": 1e-06, + "loss": 0.0097, + "reward": 5.555980682373047, + "reward_std": 2.303885757923126, + "rewards/citation_reward_func": 3.157596230506897, + "rewards/correctness_reward_func": 3.6054420471191406, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.2551020383834839, + "rewards/penalize_wrong_passages_reward_func": -1.9455782175064087, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4936223824818929, + "step": 412 + }, + { + "completion_length": 187.95237731933594, + "epoch": 0.8697788697788698, + "grad_norm": 1.1267014741897583, + "kl": 0.9283854166666666, + "learning_rate": 1e-06, + "loss": 0.0093, + "reward": 4.6450633605321245, + "reward_std": 2.902350743611654, + "rewards/citation_reward_func": 3.0498865048090615, + "rewards/correctness_reward_func": 2.2448979020118713, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -1.2857142488161724, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49653735756874084, + "step": 413 + }, + { + "completion_length": 224.27210489908853, + "epoch": 0.8718848718848718, + "grad_norm": 0.8470773696899414, + "kl": 0.8430989583333334, + "learning_rate": 1e-06, + "loss": 0.0084, + "reward": 5.8639976978302, + "reward_std": 2.6660279631614685, + "rewards/citation_reward_func": 3.2936505873998008, + "rewards/correctness_reward_func": 2.4999999602635703, + "rewards/formatting_reward_func": 0.48979591329892475, + "rewards/length_reward_func": -0.3571428606907527, + "rewards/penalize_wrong_passages_reward_func": -0.551020403082172, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4887142429749171, + "step": 414 + }, + { + "completion_length": 228.59863789876303, + "epoch": 0.8739908739908739, + "grad_norm": 0.8768046498298645, + "kl": 1.158203125, + "learning_rate": 1e-06, + "loss": 0.0119, + "reward": 4.329038341840108, + "reward_std": 3.2816514571507773, + "rewards/citation_reward_func": 3.4126983086268106, + "rewards/correctness_reward_func": 2.346938669681549, + "rewards/formatting_reward_func": 0.48639454940954846, + "rewards/length_reward_func": -0.5102040867010752, + "rewards/penalize_wrong_passages_reward_func": -1.8843537171681721, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.47756456832091015, + "step": 415 + }, + { + "completion_length": 229.88774871826172, + "epoch": 0.8760968760968761, + "grad_norm": 3.260483503341675, + "kl": 1.0201822916666667, + "learning_rate": 1e-06, + "loss": 0.0102, + "reward": 2.8331868648529053, + "reward_std": 3.004487911860148, + "rewards/citation_reward_func": 2.9761902888615928, + "rewards/correctness_reward_func": 1.1224489609400432, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.3571428606907527, + "rewards/penalize_wrong_passages_reward_func": -1.8979591329892476, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4930509676535924, + "step": 416 + }, + { + "completion_length": 201.8197250366211, + "epoch": 0.8782028782028782, + "grad_norm": 1.051156997680664, + "kl": 1024.71875, + "learning_rate": 1e-06, + "loss": 10.2348, + "reward": 4.993551929791768, + "reward_std": 2.8192378679911294, + "rewards/citation_reward_func": 3.8038545846939087, + "rewards/correctness_reward_func": 1.3265305906534195, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.459183673063914, + "rewards/penalize_wrong_passages_reward_func": -0.6598639413714409, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48901695013046265, + "step": 417 + }, + { + "completion_length": 246.04421742757162, + "epoch": 0.8803088803088803, + "grad_norm": 1.0293517112731934, + "kl": 0.8938802083333334, + "learning_rate": 1e-06, + "loss": 0.009, + "reward": 4.082921763261159, + "reward_std": 4.129499514897664, + "rewards/citation_reward_func": 3.3333330949147544, + "rewards/correctness_reward_func": 1.8027210632960002, + "rewards/formatting_reward_func": 0.47789115210374195, + "rewards/length_reward_func": -1.0204081684350967, + "rewards/penalize_wrong_passages_reward_func": -0.9795917967955271, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.468976154923439, + "step": 418 + }, + { + "completion_length": 227.7686996459961, + "epoch": 0.8824148824148824, + "grad_norm": 0.9488903284072876, + "kl": 0.853515625, + "learning_rate": 1e-06, + "loss": 0.0087, + "reward": 3.356776495774587, + "reward_std": 4.17160701751709, + "rewards/citation_reward_func": 3.1916098594665527, + "rewards/correctness_reward_func": 1.2074829737345378, + "rewards/formatting_reward_func": 0.48639454940954846, + "rewards/length_reward_func": -0.5102040817340215, + "rewards/penalize_wrong_passages_reward_func": -1.156462550163269, + "rewards/unicode_reward_func": -0.3401360511779785, + "rewards/xmlcount_reward_func": 0.4780917863051097, + "step": 419 + }, + { + "completion_length": 232.66666412353516, + "epoch": 0.8845208845208845, + "grad_norm": 1.2417980432510376, + "kl": 0.9186197916666666, + "learning_rate": 1e-06, + "loss": 0.0092, + "reward": 2.7340973714987435, + "reward_std": 3.529271046320597, + "rewards/citation_reward_func": 3.0328797499338784, + "rewards/correctness_reward_func": 0.8843537221352259, + "rewards/formatting_reward_func": 0.48469386994838715, + "rewards/length_reward_func": -0.8163265188535055, + "rewards/penalize_wrong_passages_reward_func": -1.3265305856863658, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4750271489222844, + "step": 420 + }, + { + "completion_length": 212.38095092773438, + "epoch": 0.8866268866268866, + "grad_norm": 1.6659380197525024, + "kl": 1.17578125, + "learning_rate": 1e-06, + "loss": 0.0117, + "reward": 4.945539553960164, + "reward_std": 2.7063895066579184, + "rewards/citation_reward_func": 3.0498866637547812, + "rewards/correctness_reward_func": 2.040816237529119, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -0.8231292466322581, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4874897499879201, + "step": 421 + }, + { + "completion_length": 231.6088383992513, + "epoch": 0.8887328887328887, + "grad_norm": 9.702512741088867, + "kl": 1.6959635416666667, + "learning_rate": 1e-06, + "loss": 0.017, + "reward": 6.399842421213786, + "reward_std": 3.2888264854749045, + "rewards/citation_reward_func": 3.475056529045105, + "rewards/correctness_reward_func": 2.9421767791112265, + "rewards/formatting_reward_func": 0.49064625799655914, + "rewards/length_reward_func": -0.45918366809686023, + "rewards/penalize_wrong_passages_reward_func": -0.537414958079656, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48856116831302643, + "step": 422 + }, + { + "completion_length": 240.5578180948893, + "epoch": 0.8908388908388908, + "grad_norm": 1.0227175951004028, + "kl": 1.32421875, + "learning_rate": 1e-06, + "loss": 0.0132, + "reward": 4.322190443674724, + "reward_std": 3.828075965245565, + "rewards/citation_reward_func": 3.4183671474456787, + "rewards/correctness_reward_func": 2.1088434855143228, + "rewards/formatting_reward_func": 0.4897959182659785, + "rewards/length_reward_func": -0.561224490404129, + "rewards/penalize_wrong_passages_reward_func": -1.612244854370753, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.478653018673261, + "step": 423 + }, + { + "completion_length": 243.05782063802084, + "epoch": 0.892944892944893, + "grad_norm": 1.1528595685958862, + "kl": 0.951171875, + "learning_rate": 1e-06, + "loss": 0.0095, + "reward": 3.417047699292501, + "reward_std": 4.412300546964009, + "rewards/citation_reward_func": 3.282312790552775, + "rewards/correctness_reward_func": 1.5306121756633122, + "rewards/formatting_reward_func": 0.48469386994838715, + "rewards/length_reward_func": -0.7653061201175054, + "rewards/penalize_wrong_passages_reward_func": -1.4217686653137207, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.47657138605912525, + "step": 424 + }, + { + "completion_length": 238.9285685221354, + "epoch": 0.8950508950508951, + "grad_norm": 1.091051697731018, + "kl": 1.4557291666666667, + "learning_rate": 1e-06, + "loss": 0.0146, + "reward": 3.0012970169385276, + "reward_std": 4.6244891087214155, + "rewards/citation_reward_func": 3.0555554231007895, + "rewards/correctness_reward_func": 1.3095237811406453, + "rewards/formatting_reward_func": 0.48639454940954846, + "rewards/length_reward_func": -1.4285714129606883, + "rewards/penalize_wrong_passages_reward_func": -0.9047618905703226, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48315641780694324, + "step": 425 + }, + { + "completion_length": 216.97618865966797, + "epoch": 0.8971568971568972, + "grad_norm": 1.0596503019332886, + "kl": 1.3919270833333333, + "learning_rate": 1e-06, + "loss": 0.0139, + "reward": 6.610499779383342, + "reward_std": 3.202061096827189, + "rewards/citation_reward_func": 3.36734676361084, + "rewards/correctness_reward_func": 3.3163264195124307, + "rewards/formatting_reward_func": 0.4897959182659785, + "rewards/length_reward_func": -0.5612244804700216, + "rewards/penalize_wrong_passages_reward_func": -0.4829931855201721, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4812482496102651, + "step": 426 + }, + { + "completion_length": 188.94897969563803, + "epoch": 0.8992628992628993, + "grad_norm": 2.3366739749908447, + "kl": 1.3515625, + "learning_rate": 1e-06, + "loss": 0.0135, + "reward": 6.2090316613515215, + "reward_std": 1.9871355493863423, + "rewards/citation_reward_func": 3.3276642163594565, + "rewards/correctness_reward_func": 3.1802720626195273, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -1.1360543767611186, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4936121851205826, + "step": 427 + }, + { + "completion_length": 221.59183502197266, + "epoch": 0.9013689013689014, + "grad_norm": 1.1556252241134644, + "kl": 1.22265625, + "learning_rate": 1e-06, + "loss": 0.0122, + "reward": 2.2772664166986942, + "reward_std": 3.3601556619008384, + "rewards/citation_reward_func": 2.664398948351542, + "rewards/correctness_reward_func": 1.0544217303395271, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.6122448941071829, + "rewards/penalize_wrong_passages_reward_func": -1.8231291969617207, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49552034835020703, + "step": 428 + }, + { + "completion_length": 203.52040354410806, + "epoch": 0.9034749034749034, + "grad_norm": 3.2301814556121826, + "kl": 1.4303385416666667, + "learning_rate": 1e-06, + "loss": 0.0143, + "reward": 4.25222647190094, + "reward_std": 3.0159141023953757, + "rewards/citation_reward_func": 3.066893458366394, + "rewards/correctness_reward_func": 1.9897958834966023, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.3571428606907527, + "rewards/penalize_wrong_passages_reward_func": -1.4353741109371185, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49145572384198505, + "step": 429 + }, + { + "completion_length": 210.39795430501303, + "epoch": 0.9055809055809055, + "grad_norm": 5.057128429412842, + "kl": 1.2421875, + "learning_rate": 1e-06, + "loss": 0.0124, + "reward": 4.724735697110494, + "reward_std": 3.067882537841797, + "rewards/citation_reward_func": 3.1916099786758423, + "rewards/correctness_reward_func": 2.0408162847161293, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -1.1292516787846882, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49741150438785553, + "step": 430 + }, + { + "completion_length": 233.9387690226237, + "epoch": 0.9076869076869077, + "grad_norm": 8.886016845703125, + "kl": 1.5846354166666667, + "learning_rate": 1e-06, + "loss": 0.0158, + "reward": 3.3445235888163247, + "reward_std": 3.751445452372233, + "rewards/citation_reward_func": 2.806122342745463, + "rewards/correctness_reward_func": 1.649659812450409, + "rewards/formatting_reward_func": 0.47789114713668823, + "rewards/length_reward_func": -0.7653061101833979, + "rewards/penalize_wrong_passages_reward_func": -1.2925169865290325, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4686734229326248, + "step": 431 + }, + { + "completion_length": 226.38434855143228, + "epoch": 0.9097929097929098, + "grad_norm": 1.0712940692901611, + "kl": 0.8515625, + "learning_rate": 1e-06, + "loss": 0.0085, + "reward": 4.341326395670573, + "reward_std": 4.08898941675822, + "rewards/citation_reward_func": 2.857142686843872, + "rewards/correctness_reward_func": 2.6020407478014627, + "rewards/formatting_reward_func": 0.4931972821553548, + "rewards/length_reward_func": -0.459183673063914, + "rewards/penalize_wrong_passages_reward_func": -1.4693877498308818, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.48758498827616376, + "step": 432 + }, + { + "completion_length": 250.47958374023438, + "epoch": 0.9118989118989119, + "grad_norm": 0.754627525806427, + "kl": 0.6861979166666666, + "learning_rate": 1e-06, + "loss": 0.0071, + "reward": 3.53940478960673, + "reward_std": 3.8470928072929382, + "rewards/citation_reward_func": 2.3937074740727744, + "rewards/correctness_reward_func": 3.027210851510366, + "rewards/formatting_reward_func": 0.48979591329892475, + "rewards/length_reward_func": -1.020408148566882, + "rewards/penalize_wrong_passages_reward_func": -1.836734652519226, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.485833282272021, + "step": 433 + }, + { + "completion_length": 277.05781809488934, + "epoch": 0.914004914004914, + "grad_norm": 1.4854369163513184, + "kl": 0.84375, + "learning_rate": 1e-06, + "loss": 0.0084, + "reward": 2.3832449913024902, + "reward_std": 4.8437340259552, + "rewards/citation_reward_func": 2.682823061943054, + "rewards/correctness_reward_func": 1.6666666269302368, + "rewards/formatting_reward_func": 0.4693877498308818, + "rewards/length_reward_func": -1.377550999323527, + "rewards/penalize_wrong_passages_reward_func": -1.3469387392203014, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.45892512798309326, + "step": 434 + }, + { + "completion_length": 245.16326141357422, + "epoch": 0.9161109161109161, + "grad_norm": 0.9480783939361572, + "kl": 0.7565104166666666, + "learning_rate": 1e-06, + "loss": 0.0078, + "reward": 4.360251814126968, + "reward_std": 3.766782840092977, + "rewards/citation_reward_func": 2.478741387526194, + "rewards/correctness_reward_func": 2.7380951642990112, + "rewards/formatting_reward_func": 0.4948979616165161, + "rewards/length_reward_func": -0.5102040817340215, + "rewards/penalize_wrong_passages_reward_func": -1.1632652878761292, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49205436805884045, + "step": 435 + }, + { + "completion_length": 255.5033925374349, + "epoch": 0.9182169182169182, + "grad_norm": 0.7878819108009338, + "kl": 0.708984375, + "learning_rate": 1e-06, + "loss": 0.0073, + "reward": 2.683357129494349, + "reward_std": 3.135557929674784, + "rewards/citation_reward_func": 2.827380895614624, + "rewards/correctness_reward_func": 1.2925169716278713, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.6632653027772903, + "rewards/penalize_wrong_passages_reward_func": -1.7551020284493764, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4886292020479838, + "step": 436 + }, + { + "completion_length": 257.1292495727539, + "epoch": 0.9203229203229203, + "grad_norm": 0.8499004244804382, + "kl": 0.7428385416666666, + "learning_rate": 1e-06, + "loss": 0.0074, + "reward": 3.225904862085978, + "reward_std": 3.394826134045919, + "rewards/citation_reward_func": 2.848639408747355, + "rewards/correctness_reward_func": 1.9217686752478282, + "rewards/formatting_reward_func": 0.49149659276008606, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -2.163265277942022, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48440809547901154, + "step": 437 + }, + { + "completion_length": 266.0101954142253, + "epoch": 0.9224289224289224, + "grad_norm": 0.8942264318466187, + "kl": 0.71875, + "learning_rate": 1e-06, + "loss": 0.0072, + "reward": 2.4781905015309653, + "reward_std": 4.1750030517578125, + "rewards/citation_reward_func": 2.699829896291097, + "rewards/correctness_reward_func": 1.3095237811406453, + "rewards/formatting_reward_func": 0.48469387491544086, + "rewards/length_reward_func": -1.020408163468043, + "rewards/penalize_wrong_passages_reward_func": -1.476190447807312, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4807414561510086, + "step": 438 + }, + { + "completion_length": 254.13264973958334, + "epoch": 0.9245349245349246, + "grad_norm": 2.025777578353882, + "kl": 1.359375, + "learning_rate": 1e-06, + "loss": 0.0137, + "reward": 3.723285893599192, + "reward_std": 2.8511158426602683, + "rewards/citation_reward_func": 2.5977890888849893, + "rewards/correctness_reward_func": 1.9387754499912262, + "rewards/formatting_reward_func": 0.4931972821553548, + "rewards/length_reward_func": -0.3571428557236989, + "rewards/penalize_wrong_passages_reward_func": -1.428571383158366, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.47923805316289264, + "step": 439 + }, + { + "completion_length": 241.6836700439453, + "epoch": 0.9266409266409267, + "grad_norm": 0.8116592764854431, + "kl": 1.0397135416666667, + "learning_rate": 1e-06, + "loss": 0.0104, + "reward": 1.0626495877901714, + "reward_std": 3.930151581764221, + "rewards/citation_reward_func": 2.551020304361979, + "rewards/correctness_reward_func": 0.6122448866566023, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.7142857213815054, + "rewards/penalize_wrong_passages_reward_func": -2.0340136090914407, + "rewards/unicode_reward_func": -0.3401360511779785, + "rewards/xmlcount_reward_func": 0.49292171994845074, + "step": 440 + }, + { + "completion_length": 247.81292215983072, + "epoch": 0.9287469287469288, + "grad_norm": 8.80766773223877, + "kl": 1.15625, + "learning_rate": 1e-06, + "loss": 0.0115, + "reward": 2.8308265656232834, + "reward_std": 3.2842116355895996, + "rewards/citation_reward_func": 2.346938729286194, + "rewards/correctness_reward_func": 1.5986394186814625, + "rewards/formatting_reward_func": 0.4880952338377635, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -1.6734693745772045, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4787856588761012, + "step": 441 + }, + { + "completion_length": 211.26870727539062, + "epoch": 0.9308529308529309, + "grad_norm": 3.453256130218506, + "kl": 0.8912760416666666, + "learning_rate": 1e-06, + "loss": 0.0096, + "reward": 6.2583504517873125, + "reward_std": 2.4285461703936257, + "rewards/citation_reward_func": 2.844387690226237, + "rewards/correctness_reward_func": 2.9761904080708823, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.3469387690226237, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4938944975535075, + "step": 442 + }, + { + "completion_length": 240.78571065266928, + "epoch": 0.932958932958933, + "grad_norm": 0.8041434288024902, + "kl": 3.5592447916666665, + "learning_rate": 1e-06, + "loss": 0.0357, + "reward": 3.460792601108551, + "reward_std": 3.017461578051249, + "rewards/citation_reward_func": 2.993197202682495, + "rewards/correctness_reward_func": 1.2414965530236561, + "rewards/formatting_reward_func": 0.4914965977271398, + "rewards/length_reward_func": -0.40816326439380646, + "rewards/penalize_wrong_passages_reward_func": -1.3401360511779785, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48290130992730457, + "step": 443 + }, + { + "completion_length": 242.91156005859375, + "epoch": 0.935064935064935, + "grad_norm": 0.8912382125854492, + "kl": 0.7233072916666666, + "learning_rate": 1e-06, + "loss": 0.0072, + "reward": 3.097394548356533, + "reward_std": 3.375369350115458, + "rewards/citation_reward_func": 2.7465986013412476, + "rewards/correctness_reward_func": 1.1904761642217636, + "rewards/formatting_reward_func": 0.4948979616165161, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -1.4149659872055054, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48855096598466236, + "step": 444 + }, + { + "completion_length": 240.36053975423178, + "epoch": 0.9371709371709371, + "grad_norm": 0.84913170337677, + "kl": 0.8639322916666666, + "learning_rate": 1e-06, + "loss": 0.0086, + "reward": 3.8615239461263022, + "reward_std": 3.605143388112386, + "rewards/citation_reward_func": 2.6870747804641724, + "rewards/correctness_reward_func": 1.9557822247346242, + "rewards/formatting_reward_func": 0.4948979616165161, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -1.2925169865290325, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49247613549232483, + "step": 445 + }, + { + "completion_length": 231.4829889933268, + "epoch": 0.9392769392769392, + "grad_norm": 0.9911380410194397, + "kl": 0.912109375, + "learning_rate": 1e-06, + "loss": 0.0091, + "reward": 3.4259490370750427, + "reward_std": 3.146302322546641, + "rewards/citation_reward_func": 3.2270408074061074, + "rewards/correctness_reward_func": 1.2585033799211185, + "rewards/formatting_reward_func": 0.49659863611062366, + "rewards/length_reward_func": -0.561224490404129, + "rewards/penalize_wrong_passages_reward_func": -1.482993150750796, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4880237430334091, + "step": 446 + }, + { + "completion_length": 274.37414296468097, + "epoch": 0.9413829413829414, + "grad_norm": 0.7898405194282532, + "kl": 0.7565104166666666, + "learning_rate": 1e-06, + "loss": 0.0076, + "reward": 4.6013062198956804, + "reward_std": 3.6253005266189575, + "rewards/citation_reward_func": 2.351190427939097, + "rewards/correctness_reward_func": 3.4353740215301514, + "rewards/formatting_reward_func": 0.4812925159931183, + "rewards/length_reward_func": -0.6122448841730753, + "rewards/penalize_wrong_passages_reward_func": -1.5306121905644734, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4763060708840688, + "step": 447 + }, + { + "completion_length": 252.36053466796875, + "epoch": 0.9434889434889435, + "grad_norm": 1.2493234872817993, + "kl": 0.8743489583333334, + "learning_rate": 1e-06, + "loss": 0.0087, + "reward": 4.8478163580099745, + "reward_std": 2.818999171257019, + "rewards/citation_reward_func": 2.8741496006647744, + "rewards/correctness_reward_func": 2.534013569355011, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -1.238095184167226, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4889727383852005, + "step": 448 + }, + { + "completion_length": 263.86734263102215, + "epoch": 0.9455949455949456, + "grad_norm": 5.652228355407715, + "kl": 1.0559895833333333, + "learning_rate": 1e-06, + "loss": 0.0106, + "reward": 2.1674863497416177, + "reward_std": 3.2532392740249634, + "rewards/citation_reward_func": 2.2916666070620217, + "rewards/correctness_reward_func": 1.2244897832473118, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -1.911564588546753, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4778604904810588, + "step": 449 + }, + { + "completion_length": 227.81292215983072, + "epoch": 0.9477009477009477, + "grad_norm": 0.9847736954689026, + "kl": 0.9290364583333334, + "learning_rate": 1e-06, + "loss": 0.0093, + "reward": 0.2178944672147433, + "reward_std": 3.902046004931132, + "rewards/citation_reward_func": 2.933673401673635, + "rewards/correctness_reward_func": 1.9557822222510974, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.9693877448638281, + "rewards/penalize_wrong_passages_reward_func": -4.5238093336423235, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49340470135211945, + "step": 450 + }, + { + "completion_length": 223.03740692138672, + "epoch": 0.9498069498069498, + "grad_norm": 0.9363918900489807, + "kl": 0.8463541666666666, + "learning_rate": 1e-06, + "loss": 0.0085, + "reward": 3.2303468783696494, + "reward_std": 2.5281269550323486, + "rewards/citation_reward_func": 3.2525509198506675, + "rewards/correctness_reward_func": 1.5646257797876995, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": 0.0, + "rewards/penalize_wrong_passages_reward_func": -2.58503395318985, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4982040176788966, + "step": 451 + }, + { + "completion_length": 239.9897918701172, + "epoch": 0.9519129519129519, + "grad_norm": 1.1018811464309692, + "kl": 0.7884114583333334, + "learning_rate": 1e-06, + "loss": 0.0079, + "reward": 3.2081496125708022, + "reward_std": 3.1324497063954673, + "rewards/citation_reward_func": 3.120748241742452, + "rewards/correctness_reward_func": 3.639455715815226, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -4.340135971705119, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4955645700295766, + "step": 452 + }, + { + "completion_length": 234.323122660319, + "epoch": 0.954018954018954, + "grad_norm": 0.9514901041984558, + "kl": 1.3157552083333333, + "learning_rate": 1e-06, + "loss": 0.0132, + "reward": 1.9489863812923431, + "reward_std": 3.943875233332316, + "rewards/citation_reward_func": 2.78911558787028, + "rewards/correctness_reward_func": 0.8333332935969034, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.4591836780309677, + "rewards/penalize_wrong_passages_reward_func": -1.8571427861849468, + "rewards/unicode_reward_func": -0.3401360511779785, + "rewards/xmlcount_reward_func": 0.4881019840637843, + "step": 453 + }, + { + "completion_length": 255.76189931233725, + "epoch": 0.9561249561249561, + "grad_norm": 3.1003830432891846, + "kl": 0.9947916666666666, + "learning_rate": 1e-06, + "loss": 0.01, + "reward": 1.5095713399350643, + "reward_std": 3.59877481063207, + "rewards/citation_reward_func": 2.402210851510366, + "rewards/correctness_reward_func": 0.6462584833304087, + "rewards/formatting_reward_func": 0.4948979566494624, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -2.047619044780731, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.49001354972521466, + "step": 454 + }, + { + "completion_length": 263.9285634358724, + "epoch": 0.9582309582309583, + "grad_norm": 12.15134048461914, + "kl": 1.44921875, + "learning_rate": 1e-06, + "loss": 0.0145, + "reward": 4.058207631111145, + "reward_std": 3.4817044734954834, + "rewards/citation_reward_func": 2.8741496006647744, + "rewards/correctness_reward_func": 2.4149659077326455, + "rewards/formatting_reward_func": 0.4914965977271398, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -1.8027210235595703, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48847954471906024, + "step": 455 + }, + { + "completion_length": 272.2584991455078, + "epoch": 0.9603369603369604, + "grad_norm": 0.920379638671875, + "kl": 1.822265625, + "learning_rate": 1e-06, + "loss": 0.0182, + "reward": 1.6935850928227107, + "reward_std": 4.720509966214498, + "rewards/citation_reward_func": 2.7083332935969033, + "rewards/correctness_reward_func": 0.7653061002492905, + "rewards/formatting_reward_func": 0.4880952288707097, + "rewards/length_reward_func": -0.9693877498308817, + "rewards/penalize_wrong_passages_reward_func": -1.59183669090271, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4631428172190984, + "step": 456 + }, + { + "completion_length": 216.44557444254556, + "epoch": 0.9624429624429625, + "grad_norm": 1.5930014848709106, + "kl": 0.9583333333333334, + "learning_rate": 1e-06, + "loss": 0.0096, + "reward": 0.889006977279981, + "reward_std": 3.1517065366109214, + "rewards/citation_reward_func": 3.0357141892115274, + "rewards/correctness_reward_func": 2.99319722255071, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.10204081734021504, + "rewards/penalize_wrong_passages_reward_func": -6.034013519684474, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49785027901331586, + "step": 457 + }, + { + "completion_length": 252.0952377319336, + "epoch": 0.9645489645489645, + "grad_norm": 0.9933049082756042, + "kl": 1.4166666666666667, + "learning_rate": 1e-06, + "loss": 0.0142, + "reward": 1.27609184384346, + "reward_std": 3.663653016090393, + "rewards/citation_reward_func": 2.589285651842753, + "rewards/correctness_reward_func": 1.2755101919174194, + "rewards/formatting_reward_func": 0.48979591329892475, + "rewards/length_reward_func": -0.561224490404129, + "rewards/penalize_wrong_passages_reward_func": -2.829931855201721, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.4827244331439336, + "step": 458 + }, + { + "completion_length": 203.9489720662435, + "epoch": 0.9666549666549666, + "grad_norm": 0.8730067014694214, + "kl": 1.1178385416666667, + "learning_rate": 1e-06, + "loss": 0.0117, + "reward": 4.5937449137369795, + "reward_std": 3.855661233266195, + "rewards/citation_reward_func": 3.592686971028646, + "rewards/correctness_reward_func": 1.9387754499912262, + "rewards/formatting_reward_func": 0.4855442096789678, + "rewards/length_reward_func": -0.7142857114473978, + "rewards/penalize_wrong_passages_reward_func": -1.183673471212387, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4746972322463989, + "step": 459 + }, + { + "completion_length": 170.48638916015625, + "epoch": 0.9687609687609687, + "grad_norm": 1.0719090700149536, + "kl": 1.2141927083333333, + "learning_rate": 1e-06, + "loss": 0.0136, + "reward": 5.673636317253113, + "reward_std": 3.2085538109143577, + "rewards/citation_reward_func": 3.945578098297119, + "rewards/correctness_reward_func": 2.1938774983088174, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -0.9047618905703226, + "rewards/unicode_reward_func": -0.3401360511779785, + "rewards/xmlcount_reward_func": 0.4865611692269643, + "step": 460 + }, + { + "completion_length": 171.149658203125, + "epoch": 0.9708669708669708, + "grad_norm": 1.5770235061645508, + "kl": 1.212890625, + "learning_rate": 1e-06, + "loss": 0.0141, + "reward": 6.019292672475179, + "reward_std": 2.4112029671669006, + "rewards/citation_reward_func": 3.962584972381592, + "rewards/correctness_reward_func": 2.38095231850942, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -1.115646243095398, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4971836010615031, + "step": 461 + }, + { + "completion_length": 178.16666158040366, + "epoch": 0.972972972972973, + "grad_norm": 1.2439326047897339, + "kl": 1.1927083333333333, + "learning_rate": 1e-06, + "loss": 0.0125, + "reward": 5.120292564233144, + "reward_std": 2.738259176413218, + "rewards/citation_reward_func": 3.8095237016677856, + "rewards/correctness_reward_func": 1.6666666169961293, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.15306122104326883, + "rewards/penalize_wrong_passages_reward_func": -1.1972788721323013, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49444211026032764, + "step": 462 + }, + { + "completion_length": 170.9285659790039, + "epoch": 0.9750789750789751, + "grad_norm": 2.0321953296661377, + "kl": 1.3502604166666667, + "learning_rate": 1e-06, + "loss": 0.015, + "reward": 5.051782409350078, + "reward_std": 3.1064772605895996, + "rewards/citation_reward_func": 3.7755101521809897, + "rewards/correctness_reward_func": 2.0068026383717856, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.40816326936086017, + "rewards/penalize_wrong_passages_reward_func": -1.3129251301288605, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.49055776993433636, + "step": 463 + }, + { + "completion_length": 171.72108459472656, + "epoch": 0.9771849771849772, + "grad_norm": 1.0354406833648682, + "kl": 1.0266927083333333, + "learning_rate": 1e-06, + "loss": 0.0115, + "reward": 5.325897852579753, + "reward_std": 2.9984676440556846, + "rewards/citation_reward_func": 3.835033933321635, + "rewards/correctness_reward_func": 2.091836671034495, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.20408163468043009, + "rewards/penalize_wrong_passages_reward_func": -1.3877550661563873, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4908638844887416, + "step": 464 + }, + { + "completion_length": 171.44557444254556, + "epoch": 0.9792909792909793, + "grad_norm": 0.9120634198188782, + "kl": 2.31640625, + "learning_rate": 1e-06, + "loss": 0.0242, + "reward": 5.117595195770264, + "reward_std": 3.7810667753219604, + "rewards/citation_reward_func": 3.801020383834839, + "rewards/correctness_reward_func": 2.0918366511662803, + "rewards/formatting_reward_func": 0.49914966026941937, + "rewards/length_reward_func": -0.30612244705359143, + "rewards/penalize_wrong_passages_reward_func": -1.1156462132930756, + "rewards/unicode_reward_func": -0.3401360511779785, + "rewards/xmlcount_reward_func": 0.4874931474526723, + "step": 465 + }, + { + "completion_length": 166.34693654378256, + "epoch": 0.9813969813969814, + "grad_norm": 0.8665127754211426, + "kl": 1.0703125, + "learning_rate": 1e-06, + "loss": 0.0117, + "reward": 4.781190474828084, + "reward_std": 2.7656540870666504, + "rewards/citation_reward_func": 3.8265304962793985, + "rewards/correctness_reward_func": 1.5986394385496776, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -1.374149630467097, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48867341379324597, + "step": 466 + }, + { + "completion_length": 171.7789077758789, + "epoch": 0.9835029835029835, + "grad_norm": 1.0515021085739136, + "kl": 1.138671875, + "learning_rate": 1e-06, + "loss": 0.0132, + "reward": 5.545894384384155, + "reward_std": 2.702861169974009, + "rewards/citation_reward_func": 4.022108793258667, + "rewards/correctness_reward_func": 1.887755036354065, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -1.0408163219690323, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48296932876110077, + "step": 467 + }, + { + "completion_length": 161.1292470296224, + "epoch": 0.9856089856089856, + "grad_norm": 0.6496269702911377, + "kl": 1.1419270833333333, + "learning_rate": 1e-06, + "loss": 0.0138, + "reward": 6.711469570795695, + "reward_std": 1.9912763635317485, + "rewards/citation_reward_func": 4.217686891555786, + "rewards/correctness_reward_func": 2.568027138710022, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.20408162971337637, + "rewards/penalize_wrong_passages_reward_func": -0.8639455686012903, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4954829414685567, + "step": 468 + }, + { + "completion_length": 168.3945515950521, + "epoch": 0.9877149877149877, + "grad_norm": 1.0953691005706787, + "kl": 1.33203125, + "learning_rate": 1e-06, + "loss": 0.0152, + "reward": 6.473023891448975, + "reward_std": 2.3332280913988748, + "rewards/citation_reward_func": 4.056122342745463, + "rewards/correctness_reward_func": 2.6020407676696777, + "rewards/formatting_reward_func": 0.4965986410776774, + "rewards/length_reward_func": -0.2551020433505376, + "rewards/penalize_wrong_passages_reward_func": -0.9183673361937205, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4917312413454056, + "step": 469 + }, + { + "completion_length": 166.93536885579428, + "epoch": 0.9898209898209899, + "grad_norm": 1.272312045097351, + "kl": 1.2864583333333333, + "learning_rate": 1e-06, + "loss": 0.0137, + "reward": 6.26665194829305, + "reward_std": 2.231449862321218, + "rewards/citation_reward_func": 3.7386619249979653, + "rewards/correctness_reward_func": 2.4149659276008606, + "rewards/formatting_reward_func": 0.49574829638004303, + "rewards/length_reward_func": -0.15306122601032257, + "rewards/penalize_wrong_passages_reward_func": -0.7142857064803442, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.48462240397930145, + "step": 470 + }, + { + "completion_length": 196.43536885579428, + "epoch": 0.991926991926992, + "grad_norm": 1.019243597984314, + "kl": 6.277994791666667, + "learning_rate": 1e-06, + "loss": 0.0628, + "reward": 4.13928226629893, + "reward_std": 2.5580894947052, + "rewards/citation_reward_func": 3.180271943410238, + "rewards/correctness_reward_func": 1.2414965679248173, + "rewards/formatting_reward_func": 0.4982993205388387, + "rewards/length_reward_func": -0.30612244705359143, + "rewards/penalize_wrong_passages_reward_func": -0.9659863710403442, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4913230687379837, + "step": 471 + }, + { + "completion_length": 200.1496556599935, + "epoch": 0.994032994032994, + "grad_norm": 1.7666877508163452, + "kl": 1.2421875, + "learning_rate": 1e-06, + "loss": 0.0124, + "reward": 2.1022719343503318, + "reward_std": 3.5998586813608804, + "rewards/citation_reward_func": 3.2142856121063232, + "rewards/correctness_reward_func": 1.5646257797876995, + "rewards/formatting_reward_func": 0.5, + "rewards/length_reward_func": -0.9183673212925593, + "rewards/penalize_wrong_passages_reward_func": -2.5714284976323447, + "rewards/unicode_reward_func": -0.17006802558898926, + "rewards/xmlcount_reward_func": 0.48322444160779315, + "step": 472 + }, + { + "completion_length": 204.49659729003906, + "epoch": 0.9961389961389961, + "grad_norm": 0.7166609764099121, + "kl": 1.1692708333333333, + "learning_rate": 1e-06, + "loss": 0.0113, + "reward": -0.41065768152475357, + "reward_std": 3.7700961033503213, + "rewards/citation_reward_func": 2.98185924688975, + "rewards/correctness_reward_func": 1.2755101794997852, + "rewards/formatting_reward_func": 0.4931972771883011, + "rewards/length_reward_func": -0.5612244953711828, + "rewards/penalize_wrong_passages_reward_func": -4.727891008059184, + "rewards/unicode_reward_func": -0.3401360511779785, + "rewards/xmlcount_reward_func": 0.46802715957164764, + "step": 473 + }, + { + "completion_length": 189.5646209716797, + "epoch": 0.9982449982449982, + "grad_norm": 1.0915828943252563, + "kl": 1.30078125, + "learning_rate": 1e-06, + "loss": 0.0135, + "reward": 3.6559808254241943, + "reward_std": 2.721195101737976, + "rewards/citation_reward_func": 2.753684719403585, + "rewards/correctness_reward_func": 2.142857074737549, + "rewards/formatting_reward_func": 0.4948979616165161, + "rewards/length_reward_func": -0.30612245202064514, + "rewards/penalize_wrong_passages_reward_func": -1.9047618955373764, + "rewards/unicode_reward_func": 0.0, + "rewards/xmlcount_reward_func": 0.4754251291354497, + "step": 474 + } + ], + "logging_steps": 1, + "max_steps": 474, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 7, + "trial_name": null, + "trial_params": null +}