{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9988798446717945, "eval_steps": 500, "global_step": 836, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 1960.6987380981445, "epoch": 0.0011948323500858785, "grad_norm": 0.031351201236248016, "kl": 0.0, "learning_rate": 1.1904761904761903e-08, "loss": 0.0045, "reward": 0.05803571757860482, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "step": 1 }, { "completion_length": 1910.7634811401367, "epoch": 0.002389664700171757, "grad_norm": 0.03475209325551987, "kl": 0.0, "learning_rate": 2.3809523809523807e-08, "loss": 0.0107, "reward": 0.06026785960420966, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.06026785960420966, "rewards/format_reward": 0.0, "step": 2 }, { "completion_length": 1951.9018783569336, "epoch": 0.0035844970502576356, "grad_norm": 0.02049637772142887, "kl": 8.744001388549805e-05, "learning_rate": 3.571428571428571e-08, "loss": 0.003, "reward": 0.06250000279396772, "reward_std": 0.008928571827709675, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "step": 3 }, { "completion_length": 1957.2969360351562, "epoch": 0.004779329400343514, "grad_norm": 0.04382353276014328, "kl": 9.191036224365234e-05, "learning_rate": 4.7619047619047613e-08, "loss": 0.0055, "reward": 0.06919643143191934, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, "step": 4 }, { "completion_length": 1935.0648193359375, "epoch": 0.005974161750429393, "grad_norm": 0.038842663168907166, "kl": 8.96751880645752e-05, "learning_rate": 5.9523809523809515e-08, "loss": 0.0054, "reward": 0.09151786263100803, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.09151786263100803, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 1931.8349075317383, "epoch": 0.007168994100515271, "grad_norm": 0.03012772463262081, "kl": 8.606910705566406e-05, "learning_rate": 7.142857142857142e-08, "loss": 0.005, "reward": 0.08035714644938707, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "step": 6 }, { "completion_length": 1961.6875762939453, "epoch": 0.00836382645060115, "grad_norm": 0.03170635923743248, "kl": 0.00012761354446411133, "learning_rate": 8.333333333333333e-08, "loss": 0.0028, "reward": 0.08482143306173384, "reward_std": 0.024393311701714993, "rewards/accuracy_reward": 0.08482143306173384, "rewards/format_reward": 0.0, "step": 7 }, { "completion_length": 1931.6920623779297, "epoch": 0.009558658800687028, "grad_norm": 0.024889996275305748, "kl": 9.66191291809082e-05, "learning_rate": 9.523809523809523e-08, "loss": 0.004, "reward": 0.08258928940631449, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, "step": 8 }, { "completion_length": 1959.3483047485352, "epoch": 0.010753491150772907, "grad_norm": 0.040122345089912415, "kl": 9.369850158691406e-05, "learning_rate": 1.0714285714285713e-07, "loss": 0.0099, "reward": 0.05133928800933063, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, "step": 9 }, { "completion_length": 1968.520164489746, "epoch": 0.011948323500858785, "grad_norm": 0.01981578953564167, "kl": 0.00010353326797485352, "learning_rate": 1.1904761904761903e-07, "loss": 0.001, "reward": 0.044642859138548374, "reward_std": 0.008928571827709675, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 1927.6697235107422, "epoch": 0.013143155850944664, "grad_norm": 0.038495443761348724, "kl": 0.00011032819747924805, "learning_rate": 1.3095238095238095e-07, "loss": 0.0087, "reward": 0.06473214598372579, "reward_std": 0.023702683858573437, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "step": 11 }, { "completion_length": 1948.377311706543, "epoch": 0.014337988201030542, "grad_norm": 0.01566208340227604, "kl": 9.85562801361084e-05, "learning_rate": 1.4285714285714285e-07, "loss": 0.0019, "reward": 0.058035716880112886, "reward_std": 0.008928571827709675, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "step": 12 }, { "completion_length": 1956.3348999023438, "epoch": 0.01553282055111642, "grad_norm": 0.02541220933198929, "kl": 9.056925773620605e-05, "learning_rate": 1.5476190476190475e-07, "loss": 0.0037, "reward": 0.06250000232830644, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "step": 13 }, { "completion_length": 1914.0313262939453, "epoch": 0.0167276529012023, "grad_norm": 0.024918921291828156, "kl": 0.00010663270950317383, "learning_rate": 1.6666666666666665e-07, "loss": 0.0036, "reward": 0.05133928754366934, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.05133928754366934, "rewards/format_reward": 0.0, "step": 14 }, { "completion_length": 1919.707664489746, "epoch": 0.01792248525128818, "grad_norm": 0.02902700938284397, "kl": 8.481740951538086e-05, "learning_rate": 1.7857142857142858e-07, "loss": 0.0063, "reward": 0.058035716181620955, "reward_std": 0.023012056481093168, "rewards/accuracy_reward": 0.058035716181620955, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 1916.4978408813477, "epoch": 0.019117317601374056, "grad_norm": 0.028829045593738556, "kl": 9.545683860778809e-05, "learning_rate": 1.9047619047619045e-07, "loss": 0.0065, "reward": 0.05133928847499192, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.0, "step": 16 }, { "completion_length": 1950.4643859863281, "epoch": 0.020312149951459937, "grad_norm": 0.026649655774235725, "kl": 9.629130363464355e-05, "learning_rate": 2.0238095238095238e-07, "loss": 0.0042, "reward": 0.044642859138548374, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "step": 17 }, { "completion_length": 1948.5804290771484, "epoch": 0.021506982301545814, "grad_norm": 0.025350496172904968, "kl": 9.492039680480957e-05, "learning_rate": 2.1428571428571426e-07, "loss": 0.0034, "reward": 0.0357142873108387, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "step": 18 }, { "completion_length": 1928.464370727539, "epoch": 0.022701814651631694, "grad_norm": 0.03270775452256203, "kl": 9.47117805480957e-05, "learning_rate": 2.2619047619047619e-07, "loss": 0.0038, "reward": 0.060267859837040305, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, "step": 19 }, { "completion_length": 1952.533576965332, "epoch": 0.02389664700171757, "grad_norm": 0.02037668786942959, "kl": 8.732080459594727e-05, "learning_rate": 2.3809523809523806e-07, "loss": 0.0008, "reward": 0.06250000325962901, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 1956.058120727539, "epoch": 0.02509147935180345, "grad_norm": 0.03669093921780586, "kl": 9.500980377197266e-05, "learning_rate": 2.5e-07, "loss": 0.0087, "reward": 0.042410715483129025, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.042410715483129025, "rewards/format_reward": 0.0, "step": 21 }, { "completion_length": 1954.4040908813477, "epoch": 0.026286311701889328, "grad_norm": 0.030187800526618958, "kl": 9.185075759887695e-05, "learning_rate": 2.619047619047619e-07, "loss": 0.0047, "reward": 0.06919643259607255, "reward_std": 0.019929025787860155, "rewards/accuracy_reward": 0.06919643259607255, "rewards/format_reward": 0.0, "step": 22 }, { "completion_length": 1925.7277603149414, "epoch": 0.027481144051975208, "grad_norm": 0.03137194365262985, "kl": 9.304285049438477e-05, "learning_rate": 2.7380952380952385e-07, "loss": 0.0062, "reward": 0.042410716181620955, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, "step": 23 }, { "completion_length": 1951.1384735107422, "epoch": 0.028675976402061085, "grad_norm": 0.03048517554998398, "kl": 0.00013184547424316406, "learning_rate": 2.857142857142857e-07, "loss": 0.0053, "reward": 0.04687500186264515, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, "step": 24 }, { "completion_length": 1937.254539489746, "epoch": 0.029870808752146965, "grad_norm": 0.022243598476052284, "kl": 9.262561798095703e-05, "learning_rate": 2.976190476190476e-07, "loss": 0.0042, "reward": 0.037946430733427405, "reward_std": 0.01477411249652505, "rewards/accuracy_reward": 0.037946430733427405, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 1966.9063339233398, "epoch": 0.03106564110223284, "grad_norm": 0.035921622067689896, "kl": 8.973479270935059e-05, "learning_rate": 3.095238095238095e-07, "loss": 0.0072, "reward": 0.08035714575089514, "reward_std": 0.03194062830880284, "rewards/accuracy_reward": 0.08035714575089514, "rewards/format_reward": 0.0, "step": 26 }, { "completion_length": 1987.0313339233398, "epoch": 0.03226047345231872, "grad_norm": 0.019158925861120224, "kl": 9.378790855407715e-05, "learning_rate": 3.2142857142857145e-07, "loss": 0.0041, "reward": 0.03348214435391128, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.03348214435391128, "rewards/format_reward": 0.0, "step": 27 }, { "completion_length": 1937.551414489746, "epoch": 0.0334553058024046, "grad_norm": 0.030402429401874542, "kl": 8.895993232727051e-05, "learning_rate": 3.333333333333333e-07, "loss": -0.0012, "reward": 0.046875002793967724, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.0, "step": 28 }, { "completion_length": 1946.368392944336, "epoch": 0.034650138152490476, "grad_norm": 0.017964906990528107, "kl": 8.7738037109375e-05, "learning_rate": 3.452380952380952e-07, "loss": -0.0003, "reward": 0.04910714505240321, "reward_std": 0.008928571827709675, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, "step": 29 }, { "completion_length": 1916.5268630981445, "epoch": 0.03584497050257636, "grad_norm": 0.02517622709274292, "kl": 9.006261825561523e-05, "learning_rate": 3.5714285714285716e-07, "loss": 0.0025, "reward": 0.08035714598372579, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 1932.2322235107422, "epoch": 0.037039802852662236, "grad_norm": 0.031563326716423035, "kl": 8.749961853027344e-05, "learning_rate": 3.6904761904761906e-07, "loss": 0.0059, "reward": 0.03794643050059676, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.0, "step": 31 }, { "completion_length": 1937.6518783569336, "epoch": 0.03823463520274811, "grad_norm": 0.016230538487434387, "kl": 8.036196231842041e-05, "learning_rate": 3.809523809523809e-07, "loss": 0.0021, "reward": 0.08705357532016933, "reward_std": 0.009619199205189943, "rewards/accuracy_reward": 0.08705357532016933, "rewards/format_reward": 0.0, "step": 32 }, { "completion_length": 1920.430892944336, "epoch": 0.03942946755283399, "grad_norm": 0.039354369044303894, "kl": 8.529424667358398e-05, "learning_rate": 3.928571428571428e-07, "loss": 0.0064, "reward": 0.06696428800933063, "reward_std": 0.03194062830880284, "rewards/accuracy_reward": 0.06696428800933063, "rewards/format_reward": 0.0, "step": 33 }, { "completion_length": 1954.4130249023438, "epoch": 0.04062429990291987, "grad_norm": 0.021925702691078186, "kl": 7.94827938079834e-05, "learning_rate": 4.0476190476190476e-07, "loss": 0.0023, "reward": 0.06026785960420966, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.06026785960420966, "rewards/format_reward": 0.0, "step": 34 }, { "completion_length": 1957.267936706543, "epoch": 0.04181913225300575, "grad_norm": 0.03491223603487015, "kl": 9.757280349731445e-05, "learning_rate": 4.1666666666666667e-07, "loss": 0.0089, "reward": 0.07142857485450804, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.07142857485450804, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 1939.540267944336, "epoch": 0.04301396460309163, "grad_norm": 0.03514111042022705, "kl": 8.028745651245117e-05, "learning_rate": 4.285714285714285e-07, "loss": 0.0075, "reward": 0.04017857275903225, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, "step": 36 }, { "completion_length": 1928.4308853149414, "epoch": 0.04420879695317751, "grad_norm": 0.024085428565740585, "kl": 7.957220077514648e-05, "learning_rate": 4.4047619047619047e-07, "loss": 0.0085, "reward": 0.04687500186264515, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, "step": 37 }, { "completion_length": 1944.207664489746, "epoch": 0.04540362930326339, "grad_norm": 0.016460195183753967, "kl": 0.00012072920799255371, "learning_rate": 4.5238095238095237e-07, "loss": 0.0002, "reward": 0.04017857299186289, "reward_std": 0.008928571827709675, "rewards/accuracy_reward": 0.04017857299186289, "rewards/format_reward": 0.0, "step": 38 }, { "completion_length": 1993.486671447754, "epoch": 0.046598461653349264, "grad_norm": 0.016457097604870796, "kl": 7.876753807067871e-05, "learning_rate": 4.6428571428571427e-07, "loss": 0.0015, "reward": 0.08705357578583062, "reward_std": 0.009619199205189943, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, "step": 39 }, { "completion_length": 1964.9822235107422, "epoch": 0.04779329400343514, "grad_norm": 0.019358787685632706, "kl": 7.081031799316406e-05, "learning_rate": 4.761904761904761e-07, "loss": 0.0029, "reward": 0.06919643143191934, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 1936.9866943359375, "epoch": 0.048988126353521025, "grad_norm": 0.0339331217110157, "kl": 6.847083568572998e-05, "learning_rate": 4.880952380952381e-07, "loss": 0.0062, "reward": 0.04910714505240321, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, "step": 41 }, { "completion_length": 1904.2746276855469, "epoch": 0.0501829587036069, "grad_norm": 0.03859223797917366, "kl": 7.021427154541016e-05, "learning_rate": 5e-07, "loss": 0.0097, "reward": 0.0647321455180645, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "step": 42 }, { "completion_length": 1912.1005477905273, "epoch": 0.05137779105369278, "grad_norm": 0.03665842488408089, "kl": 7.288157939910889e-05, "learning_rate": 5.119047619047619e-07, "loss": 0.0092, "reward": 0.058035716880112886, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "step": 43 }, { "completion_length": 1935.6741943359375, "epoch": 0.052572623403778655, "grad_norm": 0.011850275099277496, "kl": 0.0001459568738937378, "learning_rate": 5.238095238095238e-07, "loss": 0.001, "reward": 0.03125000186264515, "reward_std": 0.005154913291335106, "rewards/accuracy_reward": 0.03125000186264515, "rewards/format_reward": 0.0, "step": 44 }, { "completion_length": 1920.330436706543, "epoch": 0.05376745575386454, "grad_norm": 0.0406753271818161, "kl": 6.775557994842529e-05, "learning_rate": 5.357142857142857e-07, "loss": 0.0063, "reward": 0.10491071944124997, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.10491071944124997, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 1939.2166061401367, "epoch": 0.054962288103950416, "grad_norm": 0.026179056614637375, "kl": 7.511675357818604e-05, "learning_rate": 5.476190476190477e-07, "loss": 0.0062, "reward": 0.07589286030270159, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.07589286030270159, "rewards/format_reward": 0.0, "step": 46 }, { "completion_length": 1940.5491943359375, "epoch": 0.05615712045403629, "grad_norm": 0.027593238279223442, "kl": 6.988644599914551e-05, "learning_rate": 5.595238095238095e-07, "loss": 0.0011, "reward": 0.08258928940631449, "reward_std": 0.023702684324234724, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, "step": 47 }, { "completion_length": 1937.330451965332, "epoch": 0.05735195280412217, "grad_norm": 0.024891486391425133, "kl": 8.04215669631958e-05, "learning_rate": 5.714285714285714e-07, "loss": 0.0051, "reward": 0.05580357392318547, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, "step": 48 }, { "completion_length": 1949.1317901611328, "epoch": 0.05854678515420805, "grad_norm": 0.027651535347104073, "kl": 7.799267768859863e-05, "learning_rate": 5.833333333333334e-07, "loss": 0.0078, "reward": 0.05803571711294353, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.05803571711294353, "rewards/format_reward": 0.0, "step": 49 }, { "completion_length": 1934.8974075317383, "epoch": 0.05974161750429393, "grad_norm": 0.029048161581158638, "kl": 8.735060691833496e-05, "learning_rate": 5.952380952380952e-07, "loss": 0.0042, "reward": 0.02455357275903225, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.02455357275903225, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 1955.4665908813477, "epoch": 0.06093644985437981, "grad_norm": 0.02348640002310276, "kl": 8.958578109741211e-05, "learning_rate": 6.071428571428571e-07, "loss": 0.0016, "reward": 0.05803571594879031, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.05803571594879031, "rewards/format_reward": 0.0, "step": 51 }, { "completion_length": 1951.839370727539, "epoch": 0.06213128220446568, "grad_norm": 0.030765259638428688, "kl": 0.00011533498764038086, "learning_rate": 6.19047619047619e-07, "loss": 0.0012, "reward": 0.05357143096625805, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "step": 52 }, { "completion_length": 1944.1094665527344, "epoch": 0.06332611455455156, "grad_norm": 0.0356757715344429, "kl": 0.00012364983558654785, "learning_rate": 6.309523809523809e-07, "loss": 0.0077, "reward": 0.07366071664728224, "reward_std": 0.023702684324234724, "rewards/accuracy_reward": 0.07366071664728224, "rewards/format_reward": 0.0, "step": 53 }, { "completion_length": 1900.790267944336, "epoch": 0.06452094690463744, "grad_norm": 0.026726773008704185, "kl": 0.00014388561248779297, "learning_rate": 6.428571428571429e-07, "loss": 0.0052, "reward": 0.058035717345774174, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "step": 54 }, { "completion_length": 1961.9308853149414, "epoch": 0.06571577925472333, "grad_norm": 0.019678525626659393, "kl": 0.0004793703556060791, "learning_rate": 6.547619047619047e-07, "loss": 0.0044, "reward": 0.06026785960420966, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.06026785960420966, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 1927.0157089233398, "epoch": 0.0669106116048092, "grad_norm": 0.025784065946936607, "kl": 0.00016304850578308105, "learning_rate": 6.666666666666666e-07, "loss": 0.003, "reward": 0.07142857485450804, "reward_std": 0.008928571827709675, "rewards/accuracy_reward": 0.07142857485450804, "rewards/format_reward": 0.0, "step": 56 }, { "completion_length": 1925.9197235107422, "epoch": 0.06810544395489508, "grad_norm": 0.03593931347131729, "kl": 0.00017896294593811035, "learning_rate": 6.785714285714286e-07, "loss": 0.0078, "reward": 0.03794643050059676, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.0, "step": 57 }, { "completion_length": 1909.3326721191406, "epoch": 0.06930027630498095, "grad_norm": 0.02181076817214489, "kl": 0.00021398067474365234, "learning_rate": 6.904761904761904e-07, "loss": 0.0037, "reward": 0.05357143119908869, "reward_std": 0.014083484653383493, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, "step": 58 }, { "completion_length": 1899.9353408813477, "epoch": 0.07049510865506683, "grad_norm": 0.02036217227578163, "kl": 0.0005359351634979248, "learning_rate": 7.023809523809523e-07, "loss": 0.0032, "reward": 0.08258928917348385, "reward_std": 0.014774112030863762, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "step": 59 }, { "completion_length": 1913.8840103149414, "epoch": 0.07168994100515272, "grad_norm": 0.025502286851406097, "kl": 0.00028121471405029297, "learning_rate": 7.142857142857143e-07, "loss": 0.0066, "reward": 0.058035717345774174, "reward_std": 0.019238398410379887, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 1950.2121505737305, "epoch": 0.07288477335523859, "grad_norm": 0.03495552018284798, "kl": 0.00028192996978759766, "learning_rate": 7.261904761904761e-07, "loss": 0.0065, "reward": 0.09821429057046771, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.09821429057046771, "rewards/format_reward": 0.0, "step": 61 }, { "completion_length": 1914.5447158813477, "epoch": 0.07407960570532447, "grad_norm": 0.026228569447994232, "kl": 0.0003268122673034668, "learning_rate": 7.380952380952381e-07, "loss": 0.0075, "reward": 0.0758928598370403, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.0758928598370403, "rewards/format_reward": 0.0, "step": 62 }, { "completion_length": 1938.205451965332, "epoch": 0.07527443805541036, "grad_norm": 0.021130835637450218, "kl": 0.0003739595413208008, "learning_rate": 7.5e-07, "loss": 0.0024, "reward": 0.03348214435391128, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.03348214435391128, "rewards/format_reward": 0.0, "step": 63 }, { "completion_length": 1930.9621353149414, "epoch": 0.07646927040549623, "grad_norm": 0.03406514599919319, "kl": 0.00043487548828125, "learning_rate": 7.619047619047618e-07, "loss": 0.0032, "reward": 0.0580357164144516, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "step": 64 }, { "completion_length": 1942.4286575317383, "epoch": 0.07766410275558211, "grad_norm": 0.02805194817483425, "kl": 0.0004545450210571289, "learning_rate": 7.738095238095238e-07, "loss": 0.0056, "reward": 0.05803571664728224, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.05803571664728224, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 1930.2121276855469, "epoch": 0.07885893510566798, "grad_norm": 0.02087632566690445, "kl": 0.0010627508163452148, "learning_rate": 7.857142857142856e-07, "loss": 0.0044, "reward": 0.05357143096625805, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "step": 66 }, { "completion_length": 1915.830451965332, "epoch": 0.08005376745575386, "grad_norm": 0.015341238118708134, "kl": 0.0005818605422973633, "learning_rate": 7.976190476190476e-07, "loss": 0.0008, "reward": 0.029017858440056443, "reward_std": 0.004464285913854837, "rewards/accuracy_reward": 0.029017858440056443, "rewards/format_reward": 0.0, "step": 67 }, { "completion_length": 1903.3259735107422, "epoch": 0.08124859980583975, "grad_norm": 0.02514255978167057, "kl": 0.0006593465805053711, "learning_rate": 8.095238095238095e-07, "loss": 0.0031, "reward": 0.0870535746216774, "reward_std": 0.01854777056723833, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "step": 68 }, { "completion_length": 1913.346061706543, "epoch": 0.08244343215592562, "grad_norm": 0.03169098496437073, "kl": 0.0007158517837524414, "learning_rate": 8.214285714285713e-07, "loss": 0.0058, "reward": 0.07589286146685481, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.07589286146685481, "rewards/format_reward": 0.0, "step": 69 }, { "completion_length": 1893.189811706543, "epoch": 0.0836382645060115, "grad_norm": 0.024002598598599434, "kl": 0.0007786750793457031, "learning_rate": 8.333333333333333e-07, "loss": 0.0042, "reward": 0.03794643026776612, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 1944.5447158813477, "epoch": 0.08483309685609738, "grad_norm": 0.02124917320907116, "kl": 0.0007531642913818359, "learning_rate": 8.452380952380952e-07, "loss": 0.0019, "reward": 0.06919643213041127, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.0, "step": 71 }, { "completion_length": 1916.3817901611328, "epoch": 0.08602792920618325, "grad_norm": 0.028824644163250923, "kl": 0.0008597373962402344, "learning_rate": 8.57142857142857e-07, "loss": 0.0054, "reward": 0.0781250037252903, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "step": 72 }, { "completion_length": 1893.5871353149414, "epoch": 0.08722276155626914, "grad_norm": 0.04633302241563797, "kl": 0.0017404556274414062, "learning_rate": 8.69047619047619e-07, "loss": 0.0119, "reward": 0.08482143236324191, "reward_std": 0.04363171011209488, "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.0, "step": 73 }, { "completion_length": 1949.1451797485352, "epoch": 0.08841759390635502, "grad_norm": 0.02200537919998169, "kl": 0.0020051002502441406, "learning_rate": 8.809523809523809e-07, "loss": 0.0036, "reward": 0.05580357392318547, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, "step": 74 }, { "completion_length": 1905.8393630981445, "epoch": 0.08961242625644089, "grad_norm": 0.025715140625834465, "kl": 0.0011448860168457031, "learning_rate": 8.928571428571428e-07, "loss": 0.0043, "reward": 0.08258928940631449, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 1937.2344665527344, "epoch": 0.09080725860652678, "grad_norm": 0.0302599910646677, "kl": 0.0010852813720703125, "learning_rate": 9.047619047619047e-07, "loss": 0.0049, "reward": 0.02901785890571773, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.02901785890571773, "rewards/format_reward": 0.0, "step": 76 }, { "completion_length": 1912.2389221191406, "epoch": 0.09200209095661264, "grad_norm": 0.03185253590345383, "kl": 0.0023200511932373047, "learning_rate": 9.166666666666665e-07, "loss": 0.0049, "reward": 0.055803573690354824, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, "step": 77 }, { "completion_length": 1899.3884811401367, "epoch": 0.09319692330669853, "grad_norm": 0.025443824008107185, "kl": 0.0013415813446044922, "learning_rate": 9.285714285714285e-07, "loss": 0.0007, "reward": 0.053571430733427405, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.053571430733427405, "rewards/format_reward": 0.0, "step": 78 }, { "completion_length": 1912.1608123779297, "epoch": 0.09439175565678441, "grad_norm": 0.0218362957239151, "kl": 0.001329660415649414, "learning_rate": 9.404761904761904e-07, "loss": 0.0041, "reward": 0.04910714505240321, "reward_std": 0.008928571827709675, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, "step": 79 }, { "completion_length": 1885.0692749023438, "epoch": 0.09558658800687028, "grad_norm": 0.018288061022758484, "kl": 0.0018634796142578125, "learning_rate": 9.523809523809522e-07, "loss": 0.0038, "reward": 0.03348214435391128, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.03348214435391128, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 1913.1473999023438, "epoch": 0.09678142035695617, "grad_norm": 0.023828675970435143, "kl": 0.0014421939849853516, "learning_rate": 9.642857142857142e-07, "loss": 0.0013, "reward": 0.04687500139698386, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.04687500139698386, "rewards/format_reward": 0.0, "step": 81 }, { "completion_length": 1926.1005401611328, "epoch": 0.09797625270704205, "grad_norm": 0.01930251531302929, "kl": 0.0014677047729492188, "learning_rate": 9.761904761904762e-07, "loss": 0.003, "reward": 0.06026786006987095, "reward_std": 0.009619199205189943, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "step": 82 }, { "completion_length": 1896.9509735107422, "epoch": 0.09917108505712792, "grad_norm": 0.03674913942813873, "kl": 0.0015721321105957031, "learning_rate": 9.88095238095238e-07, "loss": 0.009, "reward": 0.07142857415601611, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.07142857415601611, "rewards/format_reward": 0.0, "step": 83 }, { "completion_length": 1890.1697311401367, "epoch": 0.1003659174072138, "grad_norm": 0.036670565605163574, "kl": 0.0016551017761230469, "learning_rate": 1e-06, "loss": 0.0077, "reward": 0.03794643050059676, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.03794643050059676, "rewards/format_reward": 0.0, "step": 84 }, { "completion_length": 1908.8661499023438, "epoch": 0.10156074975729967, "grad_norm": 0.026105426251888275, "kl": 0.005558013916015625, "learning_rate": 9.999960731351846e-07, "loss": 0.0024, "reward": 0.04687500209547579, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 1897.6630477905273, "epoch": 0.10275558210738556, "grad_norm": 0.027114860713481903, "kl": 0.0017933845520019531, "learning_rate": 9.999842926092733e-07, "loss": 0.0036, "reward": 0.07589286053553224, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "step": 86 }, { "completion_length": 1941.3304443359375, "epoch": 0.10395041445747144, "grad_norm": 0.025932157412171364, "kl": 0.0016579627990722656, "learning_rate": 9.99964658627868e-07, "loss": 0.0076, "reward": 0.04241071664728224, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.04241071664728224, "rewards/format_reward": 0.0, "step": 87 }, { "completion_length": 1858.7322158813477, "epoch": 0.10514524680755731, "grad_norm": 0.03559509664773941, "kl": 0.00197601318359375, "learning_rate": 9.999371715336356e-07, "loss": 0.0007, "reward": 0.0892857180442661, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.0892857180442661, "rewards/format_reward": 0.0, "step": 88 }, { "completion_length": 1912.448745727539, "epoch": 0.1063400791576432, "grad_norm": 0.03719576820731163, "kl": 0.004304409027099609, "learning_rate": 9.99901831806301e-07, "loss": 0.0048, "reward": 0.08035714598372579, "reward_std": 0.03332188306376338, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.0, "step": 89 }, { "completion_length": 1885.0157089233398, "epoch": 0.10753491150772908, "grad_norm": 0.03067575953900814, "kl": 0.0075054168701171875, "learning_rate": 9.99858640062639e-07, "loss": 0.008, "reward": 0.03125000116415322, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.03125000116415322, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 1858.2657165527344, "epoch": 0.10872974385781495, "grad_norm": 0.026665568351745605, "kl": 0.006274223327636719, "learning_rate": 9.998075970564635e-07, "loss": 0.003, "reward": 0.0892857180442661, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.0892857180442661, "rewards/format_reward": 0.0, "step": 91 }, { "completion_length": 1904.9576797485352, "epoch": 0.10992457620790083, "grad_norm": 0.034863099455833435, "kl": 0.002331256866455078, "learning_rate": 9.997487036786145e-07, "loss": 0.0023, "reward": 0.06026785937137902, "reward_std": 0.032631255220621824, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, "step": 92 }, { "completion_length": 1916.0938262939453, "epoch": 0.1111194085579867, "grad_norm": 0.04491715133190155, "kl": 0.0023064613342285156, "learning_rate": 9.996819609569422e-07, "loss": 0.0124, "reward": 0.058035717345774174, "reward_std": 0.03709554113447666, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "step": 93 }, { "completion_length": 1914.9532089233398, "epoch": 0.11231424090807259, "grad_norm": 0.04777304455637932, "kl": 0.002444744110107422, "learning_rate": 9.996073700562898e-07, "loss": 0.0098, "reward": 0.08258928940631449, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, "step": 94 }, { "completion_length": 1931.6094665527344, "epoch": 0.11350907325815847, "grad_norm": 0.02972174808382988, "kl": 0.0025696754455566406, "learning_rate": 9.99524932278472e-07, "loss": 0.0036, "reward": 0.05133928777649999, "reward_std": 0.02747634332627058, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 1827.4487533569336, "epoch": 0.11470390560824434, "grad_norm": 0.06368128955364227, "kl": 0.0031843185424804688, "learning_rate": 9.994346490622537e-07, "loss": 0.0193, "reward": 0.10044643259607255, "reward_std": 0.050488398876041174, "rewards/accuracy_reward": 0.10044643259607255, "rewards/format_reward": 0.0, "step": 96 }, { "completion_length": 1892.2701797485352, "epoch": 0.11589873795833022, "grad_norm": 0.056695107370615005, "kl": 0.003215789794921875, "learning_rate": 9.99336521983323e-07, "loss": 0.0109, "reward": 0.10044643189758062, "reward_std": 0.04017857322469354, "rewards/accuracy_reward": 0.10044643189758062, "rewards/format_reward": 0.0, "step": 97 }, { "completion_length": 1889.6697311401367, "epoch": 0.1170935703084161, "grad_norm": 0.04041042551398277, "kl": 0.00557708740234375, "learning_rate": 9.992305527542663e-07, "loss": 0.0087, "reward": 0.09375000465661287, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "step": 98 }, { "completion_length": 1893.7634582519531, "epoch": 0.11828840265850198, "grad_norm": 0.02061825804412365, "kl": 0.00373077392578125, "learning_rate": 9.991167432245357e-07, "loss": 0.003, "reward": 0.04464285960420966, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.04464285960420966, "rewards/format_reward": 0.0, "step": 99 }, { "completion_length": 1922.299201965332, "epoch": 0.11948323500858786, "grad_norm": 0.04469740390777588, "kl": 0.0042095184326171875, "learning_rate": 9.98995095380419e-07, "loss": 0.0093, "reward": 0.08258928917348385, "reward_std": 0.037786169443279505, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 1839.2344589233398, "epoch": 0.12067806735867373, "grad_norm": 0.030348792672157288, "kl": 0.004492759704589844, "learning_rate": 9.988656113450028e-07, "loss": 0.0069, "reward": 0.05580357415601611, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.05580357415601611, "rewards/format_reward": 0.0, "step": 101 }, { "completion_length": 1863.7947387695312, "epoch": 0.12187289970875961, "grad_norm": 0.03222937881946564, "kl": 0.007572174072265625, "learning_rate": 9.98728293378138e-07, "loss": 0.0047, "reward": 0.06919643189758062, "reward_std": 0.019929025787860155, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "step": 102 }, { "completion_length": 1899.1942825317383, "epoch": 0.1230677320588455, "grad_norm": 0.03390585258603096, "kl": 0.0053253173828125, "learning_rate": 9.985831438763979e-07, "loss": 0.0027, "reward": 0.042410716181620955, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, "step": 103 }, { "completion_length": 1826.5223999023438, "epoch": 0.12426256440893137, "grad_norm": 0.043829139322042465, "kl": 0.008695602416992188, "learning_rate": 9.98430165373038e-07, "loss": 0.0096, "reward": 0.09151786146685481, "reward_std": 0.04294108273461461, "rewards/accuracy_reward": 0.09151786146685481, "rewards/format_reward": 0.0, "step": 104 }, { "completion_length": 1871.359474182129, "epoch": 0.12545739675901724, "grad_norm": 0.04022064805030823, "kl": 0.0058002471923828125, "learning_rate": 9.982693605379515e-07, "loss": 0.0097, "reward": 0.07589286030270159, "reward_std": 0.046024112962186337, "rewards/accuracy_reward": 0.07589286030270159, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 1832.285789489746, "epoch": 0.12665222910910312, "grad_norm": 0.04429134353995323, "kl": 0.00606536865234375, "learning_rate": 9.98100732177622e-07, "loss": 0.0134, "reward": 0.06473214575089514, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.06473214575089514, "rewards/format_reward": 0.0, "step": 106 }, { "completion_length": 1846.1183776855469, "epoch": 0.127847061459189, "grad_norm": 0.038170237094163895, "kl": 0.00634002685546875, "learning_rate": 9.979242832350748e-07, "loss": 0.0123, "reward": 0.0781250037252903, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "step": 107 }, { "completion_length": 1816.729995727539, "epoch": 0.1290418938092749, "grad_norm": 0.042868830263614655, "kl": 0.0068683624267578125, "learning_rate": 9.977400167898268e-07, "loss": 0.0108, "reward": 0.09375000465661287, "reward_std": 0.04979777242988348, "rewards/accuracy_reward": 0.09375000465661287, "rewards/format_reward": 0.0, "step": 108 }, { "completion_length": 1800.3348999023438, "epoch": 0.13023672615936077, "grad_norm": 0.04913003742694855, "kl": 0.007541656494140625, "learning_rate": 9.975479360578303e-07, "loss": 0.0071, "reward": 0.05803571757860482, "reward_std": 0.05117902671918273, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "step": 109 }, { "completion_length": 1822.4442825317383, "epoch": 0.13143155850944666, "grad_norm": 0.032876260578632355, "kl": 0.01309967041015625, "learning_rate": 9.973480443914196e-07, "loss": 0.0038, "reward": 0.07589286053553224, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 1800.667495727539, "epoch": 0.1326263908595325, "grad_norm": 0.0231518242508173, "kl": 0.008142471313476562, "learning_rate": 9.9714034527925e-07, "loss": 0.0041, "reward": 0.060267859837040305, "reward_std": 0.009619199205189943, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, "step": 111 }, { "completion_length": 1834.587142944336, "epoch": 0.1338212232096184, "grad_norm": 0.02913672663271427, "kl": 0.0082550048828125, "learning_rate": 9.969248423462383e-07, "loss": 0.0082, "reward": 0.09375000395812094, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.09375000395812094, "rewards/format_reward": 0.0, "step": 112 }, { "completion_length": 1808.8594589233398, "epoch": 0.13501605555970428, "grad_norm": 0.03942106291651726, "kl": 0.008478164672851562, "learning_rate": 9.967015393535002e-07, "loss": 0.0044, "reward": 0.06473214528523386, "reward_std": 0.036404915153980255, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, "step": 113 }, { "completion_length": 1803.6831130981445, "epoch": 0.13621088790979016, "grad_norm": 0.04154250770807266, "kl": 0.013181686401367188, "learning_rate": 9.964704401982828e-07, "loss": 0.0099, "reward": 0.10937500488944352, "reward_std": 0.0491071455180645, "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, "step": 114 }, { "completion_length": 1807.1362380981445, "epoch": 0.13740572025987605, "grad_norm": 0.03864143416285515, "kl": 0.01250457763671875, "learning_rate": 9.96231548913898e-07, "loss": 0.0064, "reward": 0.07589286076836288, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.07589286076836288, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 1782.471061706543, "epoch": 0.1386005526099619, "grad_norm": 0.03343848139047623, "kl": 0.009401321411132812, "learning_rate": 9.95984869669651e-07, "loss": 0.0123, "reward": 0.13839286426082253, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.13839286426082253, "rewards/format_reward": 0.0, "step": 116 }, { "completion_length": 1777.38623046875, "epoch": 0.13979538496004779, "grad_norm": 0.03745860978960991, "kl": 0.028089523315429688, "learning_rate": 9.957304067707693e-07, "loss": 0.0058, "reward": 0.08482143259607255, "reward_std": 0.0295482249930501, "rewards/accuracy_reward": 0.08482143259607255, "rewards/format_reward": 0.0, "step": 117 }, { "completion_length": 1776.8683853149414, "epoch": 0.14099021731013367, "grad_norm": 0.04127826541662216, "kl": 0.015289306640625, "learning_rate": 9.954681646583252e-07, "loss": 0.0044, "reward": 0.08035714598372579, "reward_std": 0.03332188352942467, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.0, "step": 118 }, { "completion_length": 1821.323745727539, "epoch": 0.14218504966021955, "grad_norm": 0.06132641062140465, "kl": 0.0103759765625, "learning_rate": 9.951981479091609e-07, "loss": 0.0217, "reward": 0.04464285960420966, "reward_std": 0.04602411389350891, "rewards/accuracy_reward": 0.04464285960420966, "rewards/format_reward": 0.0, "step": 119 }, { "completion_length": 1750.52685546875, "epoch": 0.14337988201030544, "grad_norm": 0.03916619345545769, "kl": 0.014801025390625, "learning_rate": 9.949203612358058e-07, "loss": 0.0044, "reward": 0.098214291036129, "reward_std": 0.04225045442581177, "rewards/accuracy_reward": 0.098214291036129, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 1791.4732971191406, "epoch": 0.14457471436039132, "grad_norm": 0.04524555802345276, "kl": 0.0239410400390625, "learning_rate": 9.946348094863965e-07, "loss": 0.0123, "reward": 0.1093750037252903, "reward_std": 0.04533348651602864, "rewards/accuracy_reward": 0.1093750037252903, "rewards/format_reward": 0.0, "step": 121 }, { "completion_length": 1799.8014221191406, "epoch": 0.14576954671047718, "grad_norm": 0.030431672930717468, "kl": 0.02434539794921875, "learning_rate": 9.943414976445917e-07, "loss": -0.0, "reward": 0.03794643026776612, "reward_std": 0.004464285913854837, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, "step": 122 }, { "completion_length": 1797.2701568603516, "epoch": 0.14696437906056306, "grad_norm": 0.039115965366363525, "kl": 0.017578125, "learning_rate": 9.940404308294844e-07, "loss": 0.0069, "reward": 0.06696428847499192, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "step": 123 }, { "completion_length": 1783.564826965332, "epoch": 0.14815921141064894, "grad_norm": 0.036285508424043655, "kl": 0.012054443359375, "learning_rate": 9.937316142955129e-07, "loss": 0.01, "reward": 0.08928571944124997, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.08928571944124997, "rewards/format_reward": 0.0, "step": 124 }, { "completion_length": 1724.9822158813477, "epoch": 0.14935404376073483, "grad_norm": 0.03619866445660591, "kl": 0.02353668212890625, "learning_rate": 9.934150534323698e-07, "loss": 0.0064, "reward": 0.07366071827709675, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 1816.2791213989258, "epoch": 0.1505488761108207, "grad_norm": 0.03852919861674309, "kl": 0.012279510498046875, "learning_rate": 9.930907537649068e-07, "loss": 0.0096, "reward": 0.05133928800933063, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, "step": 126 }, { "completion_length": 1775.2523193359375, "epoch": 0.15174370846090657, "grad_norm": 0.03357725217938423, "kl": 0.01306915283203125, "learning_rate": 9.9275872095304e-07, "loss": 0.0116, "reward": 0.10044643329456449, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.10044643329456449, "rewards/format_reward": 0.0, "step": 127 }, { "completion_length": 1818.9442749023438, "epoch": 0.15293854081099245, "grad_norm": 0.03856300190091133, "kl": 0.012660980224609375, "learning_rate": 9.924189607916484e-07, "loss": 0.0098, "reward": 0.10491071850992739, "reward_std": 0.02885759761556983, "rewards/accuracy_reward": 0.10491071850992739, "rewards/format_reward": 0.0, "step": 128 }, { "completion_length": 1727.1072158813477, "epoch": 0.15413337316107834, "grad_norm": 0.05298230051994324, "kl": 0.013278961181640625, "learning_rate": 9.920714792104758e-07, "loss": 0.0104, "reward": 0.04241071664728224, "reward_std": 0.05426205834373832, "rewards/accuracy_reward": 0.04241071664728224, "rewards/format_reward": 0.0, "step": 129 }, { "completion_length": 1767.8415908813477, "epoch": 0.15532820551116422, "grad_norm": 0.08449454605579376, "kl": 0.02042388916015625, "learning_rate": 9.917162822740253e-07, "loss": 0.0214, "reward": 0.10491071920841932, "reward_std": 0.06972679728642106, "rewards/accuracy_reward": 0.10491071920841932, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 1765.314811706543, "epoch": 0.1565230378612501, "grad_norm": 0.037820495665073395, "kl": 0.020782470703125, "learning_rate": 9.913533761814537e-07, "loss": 0.0041, "reward": 0.07589285937137902, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.07589285937137902, "rewards/format_reward": 0.0, "step": 131 }, { "completion_length": 1812.598289489746, "epoch": 0.15771787021133596, "grad_norm": 0.03846178576350212, "kl": 0.0135955810546875, "learning_rate": 9.90982767266464e-07, "loss": 0.0099, "reward": 0.09151786123402417, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.09151786123402417, "rewards/format_reward": 0.0, "step": 132 }, { "completion_length": 1772.1027603149414, "epoch": 0.15891270256142184, "grad_norm": 0.03498194366693497, "kl": 0.0247955322265625, "learning_rate": 9.906044619971946e-07, "loss": 0.0108, "reward": 0.051339288242161274, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "step": 133 }, { "completion_length": 1803.7813186645508, "epoch": 0.16010753491150773, "grad_norm": 0.04519898444414139, "kl": 0.020618438720703125, "learning_rate": 9.902184669761063e-07, "loss": 0.006, "reward": 0.10267857648432255, "reward_std": 0.05256028147414327, "rewards/accuracy_reward": 0.10267857648432255, "rewards/format_reward": 0.0, "step": 134 }, { "completion_length": 1795.707664489746, "epoch": 0.1613023672615936, "grad_norm": 0.042551204562187195, "kl": 0.015712738037109375, "learning_rate": 9.898247889398664e-07, "loss": 0.0091, "reward": 0.08928571874275804, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 1809.7746353149414, "epoch": 0.1624971996116795, "grad_norm": 0.04211409017443657, "kl": 0.01676177978515625, "learning_rate": 9.89423434759233e-07, "loss": 0.0126, "reward": 0.08705357485450804, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.08705357485450804, "rewards/format_reward": 0.0, "step": 136 }, { "completion_length": 1758.6072082519531, "epoch": 0.16369203196176538, "grad_norm": 0.043146513402462006, "kl": 0.017253875732421875, "learning_rate": 9.89014411438933e-07, "loss": 0.0086, "reward": 0.04017857322469354, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "step": 137 }, { "completion_length": 1772.058120727539, "epoch": 0.16488686431185123, "grad_norm": 0.02955915965139866, "kl": 0.017364501953125, "learning_rate": 9.88597726117541e-07, "loss": 0.0078, "reward": 0.08035714598372579, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.0, "step": 138 }, { "completion_length": 1749.5090103149414, "epoch": 0.16608169666193712, "grad_norm": 0.03616601601243019, "kl": 0.01781463623046875, "learning_rate": 9.881733860673543e-07, "loss": 0.0091, "reward": 0.0781250037252903, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "step": 139 }, { "completion_length": 1781.6541061401367, "epoch": 0.167276529012023, "grad_norm": 0.05517016723752022, "kl": 0.016933441162109375, "learning_rate": 9.877413986942667e-07, "loss": 0.022, "reward": 0.04017857299186289, "reward_std": 0.044642859138548374, "rewards/accuracy_reward": 0.04017857299186289, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 1796.6875915527344, "epoch": 0.16847136136210888, "grad_norm": 0.04376187175512314, "kl": 0.017948150634765625, "learning_rate": 9.873017715376379e-07, "loss": 0.0052, "reward": 0.051339288940653205, "reward_std": 0.019929025787860155, "rewards/accuracy_reward": 0.051339288940653205, "rewards/format_reward": 0.0, "step": 141 }, { "completion_length": 1784.0425033569336, "epoch": 0.16966619371219477, "grad_norm": 0.03657816722989082, "kl": 0.01831817626953125, "learning_rate": 9.86854512270163e-07, "loss": 0.008, "reward": 0.05357143096625805, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "step": 142 }, { "completion_length": 1723.9509811401367, "epoch": 0.17086102606228062, "grad_norm": 0.050094712525606155, "kl": 0.0188140869140625, "learning_rate": 9.863996286977389e-07, "loss": 0.0121, "reward": 0.06696428824216127, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "step": 143 }, { "completion_length": 1725.517921447754, "epoch": 0.1720558584123665, "grad_norm": 0.06016527861356735, "kl": 0.0192718505859375, "learning_rate": 9.859371287593262e-07, "loss": 0.0217, "reward": 0.05803571711294353, "reward_std": 0.04979777196422219, "rewards/accuracy_reward": 0.05803571711294353, "rewards/format_reward": 0.0, "step": 144 }, { "completion_length": 1745.6272888183594, "epoch": 0.1732506907624524, "grad_norm": 0.058507274836301804, "kl": 0.0196380615234375, "learning_rate": 9.854670205268133e-07, "loss": 0.0129, "reward": 0.04464285867288709, "reward_std": 0.0549526852555573, "rewards/accuracy_reward": 0.04464285867288709, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 1767.0915985107422, "epoch": 0.17444552311253828, "grad_norm": 0.03160831704735756, "kl": 0.01983642578125, "learning_rate": 9.84989312204873e-07, "loss": 0.0085, "reward": 0.08928571827709675, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "step": 146 }, { "completion_length": 1765.8505325317383, "epoch": 0.17564035546262416, "grad_norm": 0.05937029793858528, "kl": 0.02191925048828125, "learning_rate": 9.845040121308209e-07, "loss": 0.0161, "reward": 0.058035716880112886, "reward_std": 0.056333940010517836, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "step": 147 }, { "completion_length": 1767.44873046875, "epoch": 0.17683518781271004, "grad_norm": 0.04569656401872635, "kl": 0.02190399169921875, "learning_rate": 9.840111287744695e-07, "loss": 0.0036, "reward": 0.03794643026776612, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.03794643026776612, "rewards/format_reward": 0.0, "step": 148 }, { "completion_length": 1778.01123046875, "epoch": 0.1780300201627959, "grad_norm": 0.036317892372608185, "kl": 0.03035736083984375, "learning_rate": 9.8351067073798e-07, "loss": 0.009, "reward": 0.07589286146685481, "reward_std": 0.019238398410379887, "rewards/accuracy_reward": 0.07589286146685481, "rewards/format_reward": 0.0, "step": 149 }, { "completion_length": 1767.192039489746, "epoch": 0.17922485251288178, "grad_norm": 0.03625959903001785, "kl": 0.02275848388671875, "learning_rate": 9.830026467557129e-07, "loss": 0.0075, "reward": 0.06696428870782256, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.06696428870782256, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 1738.8594512939453, "epoch": 0.18041968486296767, "grad_norm": 0.05405912920832634, "kl": 0.022430419921875, "learning_rate": 9.824870656940748e-07, "loss": 0.0139, "reward": 0.06026786030270159, "reward_std": 0.04533348698168993, "rewards/accuracy_reward": 0.06026786030270159, "rewards/format_reward": 0.0, "step": 151 }, { "completion_length": 1737.1407012939453, "epoch": 0.18161451721305355, "grad_norm": 0.04454251006245613, "kl": 0.02196502685546875, "learning_rate": 9.819639365513637e-07, "loss": 0.0067, "reward": 0.06473214505240321, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.0, "step": 152 }, { "completion_length": 1814.0491790771484, "epoch": 0.18280934956313943, "grad_norm": 0.03791825473308563, "kl": 0.0238800048828125, "learning_rate": 9.814332684576132e-07, "loss": 0.01, "reward": 0.0491071455180645, "reward_std": 0.03194062830880284, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "step": 153 }, { "completion_length": 1751.4130249023438, "epoch": 0.1840041819132253, "grad_norm": 0.048251841217279434, "kl": 0.0237274169921875, "learning_rate": 9.808950706744313e-07, "loss": 0.0034, "reward": 0.0491071455180645, "reward_std": 0.019238398410379887, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "step": 154 }, { "completion_length": 1688.8639068603516, "epoch": 0.18519901426331117, "grad_norm": 0.042374223470687866, "kl": 0.03575897216796875, "learning_rate": 9.8034935259484e-07, "loss": 0.0058, "reward": 0.08482143213041127, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, "step": 155 }, { "completion_length": 1717.5581130981445, "epoch": 0.18639384661339706, "grad_norm": 0.04356525465846062, "kl": 0.024169921875, "learning_rate": 9.797961237431104e-07, "loss": 0.0151, "reward": 0.07142857508733869, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, "step": 156 }, { "completion_length": 1769.9286422729492, "epoch": 0.18758867896348294, "grad_norm": 0.052511461079120636, "kl": 0.02364349365234375, "learning_rate": 9.792353937745988e-07, "loss": 0.0158, "reward": 0.05803571711294353, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.05803571711294353, "rewards/format_reward": 0.0, "step": 157 }, { "completion_length": 1768.1228408813477, "epoch": 0.18878351131356882, "grad_norm": 0.04499455913901329, "kl": 0.0249176025390625, "learning_rate": 9.786671724755742e-07, "loss": 0.015, "reward": 0.05580357392318547, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, "step": 158 }, { "completion_length": 1769.348289489746, "epoch": 0.18997834366365468, "grad_norm": 0.04796489328145981, "kl": 0.0238189697265625, "learning_rate": 9.78091469763052e-07, "loss": 0.008, "reward": 0.03125000139698386, "reward_std": 0.03332188306376338, "rewards/accuracy_reward": 0.03125000139698386, "rewards/format_reward": 0.0, "step": 159 }, { "completion_length": 1759.511245727539, "epoch": 0.19117317601374056, "grad_norm": 0.030061161145567894, "kl": 0.02353668212890625, "learning_rate": 9.775082956846168e-07, "loss": 0.0087, "reward": 0.06696428870782256, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.06696428870782256, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 1769.7344665527344, "epoch": 0.19236800836382645, "grad_norm": 0.044831160455942154, "kl": 0.02388763427734375, "learning_rate": 9.7691766041825e-07, "loss": 0.0057, "reward": 0.0870535762514919, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.0870535762514919, "rewards/format_reward": 0.0, "step": 161 }, { "completion_length": 1777.0223999023438, "epoch": 0.19356284071391233, "grad_norm": 0.03385160118341446, "kl": 0.02364349365234375, "learning_rate": 9.763195742721512e-07, "loss": 0.0072, "reward": 0.06919643166474998, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.06919643166474998, "rewards/format_reward": 0.0, "step": 162 }, { "completion_length": 1743.596061706543, "epoch": 0.19475767306399822, "grad_norm": 0.08730708807706833, "kl": 0.02454376220703125, "learning_rate": 9.75714047684558e-07, "loss": 0.0227, "reward": 0.10714286332949996, "reward_std": 0.06010759901255369, "rewards/accuracy_reward": 0.10714286332949996, "rewards/format_reward": 0.0, "step": 163 }, { "completion_length": 1740.2545471191406, "epoch": 0.1959525054140841, "grad_norm": 0.05083740875124931, "kl": 0.0241241455078125, "learning_rate": 9.751010912235634e-07, "loss": 0.0057, "reward": 0.058035717345774174, "reward_std": 0.04979777196422219, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "step": 164 }, { "completion_length": 1756.8214950561523, "epoch": 0.19714733776416996, "grad_norm": 0.03620550036430359, "kl": 0.02423858642578125, "learning_rate": 9.744807155869328e-07, "loss": 0.013, "reward": 0.06250000256113708, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.06250000256113708, "rewards/format_reward": 0.0, "step": 165 }, { "completion_length": 1714.8438186645508, "epoch": 0.19834217011425584, "grad_norm": 0.04348772019147873, "kl": 0.02567291259765625, "learning_rate": 9.738529316019168e-07, "loss": 0.007, "reward": 0.06919643236324191, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.06919643236324191, "rewards/format_reward": 0.0, "step": 166 }, { "completion_length": 1717.3683624267578, "epoch": 0.19953700246434172, "grad_norm": 0.048585906624794006, "kl": 0.0382080078125, "learning_rate": 9.732177502250605e-07, "loss": 0.011, "reward": 0.07812500349245965, "reward_std": 0.037786169443279505, "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, "step": 167 }, { "completion_length": 1699.6094589233398, "epoch": 0.2007318348144276, "grad_norm": 0.06201766058802605, "kl": 0.02635955810546875, "learning_rate": 9.72575182542015e-07, "loss": 0.0172, "reward": 0.11383929057046771, "reward_std": 0.04294108273461461, "rewards/accuracy_reward": 0.11383929057046771, "rewards/format_reward": 0.0, "step": 168 }, { "completion_length": 1615.0692596435547, "epoch": 0.2019266671645135, "grad_norm": 0.0603337287902832, "kl": 0.026214599609375, "learning_rate": 9.719252397673423e-07, "loss": 0.0206, "reward": 0.06026786076836288, "reward_std": 0.051869654096663, "rewards/accuracy_reward": 0.06026786076836288, "rewards/format_reward": 0.0, "step": 169 }, { "completion_length": 1667.0759735107422, "epoch": 0.20312149951459935, "grad_norm": 0.041036318987607956, "kl": 0.02719879150390625, "learning_rate": 9.712679332443194e-07, "loss": 0.0137, "reward": 0.1004464328289032, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 1606.817024230957, "epoch": 0.20431633186468523, "grad_norm": 0.056043580174446106, "kl": 0.061431884765625, "learning_rate": 9.706032744447417e-07, "loss": 0.0141, "reward": 0.08705357508733869, "reward_std": 0.04671474080532789, "rewards/accuracy_reward": 0.08705357508733869, "rewards/format_reward": 0.0, "step": 171 }, { "completion_length": 1647.8036346435547, "epoch": 0.2055111642147711, "grad_norm": 0.10786686837673187, "kl": 0.02806854248046875, "learning_rate": 9.69931274968721e-07, "loss": 0.0279, "reward": 0.07589286100119352, "reward_std": 0.06526251137256622, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "step": 172 }, { "completion_length": 1690.4822158813477, "epoch": 0.206705996564857, "grad_norm": 0.037835970520973206, "kl": 0.02832794189453125, "learning_rate": 9.692519465444848e-07, "loss": 0.01, "reward": 0.07366071734577417, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.07366071734577417, "rewards/format_reward": 0.0, "step": 173 }, { "completion_length": 1637.54248046875, "epoch": 0.20790082891494288, "grad_norm": 0.040888700634241104, "kl": 0.0403900146484375, "learning_rate": 9.685653010281701e-07, "loss": 0.013, "reward": 0.0848214328289032, "reward_std": 0.04086920013651252, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "step": 174 }, { "completion_length": 1581.629539489746, "epoch": 0.20909566126502874, "grad_norm": 0.056287731975317, "kl": 0.03960418701171875, "learning_rate": 9.678713504036177e-07, "loss": -0.0006, "reward": 0.07589286100119352, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "step": 175 }, { "completion_length": 1593.3349151611328, "epoch": 0.21029049361511462, "grad_norm": 0.052414316684007645, "kl": 0.04175567626953125, "learning_rate": 9.671701067821619e-07, "loss": 0.0141, "reward": 0.06919643166474998, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.06919643166474998, "rewards/format_reward": 0.0, "step": 176 }, { "completion_length": 1638.4732971191406, "epoch": 0.2114853259652005, "grad_norm": 0.04180744290351868, "kl": 0.0336761474609375, "learning_rate": 9.664615824024202e-07, "loss": 0.0136, "reward": 0.04017857322469354, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "step": 177 }, { "completion_length": 1621.1407012939453, "epoch": 0.2126801583152864, "grad_norm": 0.049981728196144104, "kl": 0.0360870361328125, "learning_rate": 9.657457896300791e-07, "loss": 0.0112, "reward": 0.09151786169968545, "reward_std": 0.0415598270483315, "rewards/accuracy_reward": 0.09151786169968545, "rewards/format_reward": 0.0, "step": 178 }, { "completion_length": 1653.3125610351562, "epoch": 0.21387499066537227, "grad_norm": 0.046576421707868576, "kl": 0.0510101318359375, "learning_rate": 9.650227409576783e-07, "loss": 0.0093, "reward": 0.08482143376022577, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, "step": 179 }, { "completion_length": 1704.6407089233398, "epoch": 0.21506982301545816, "grad_norm": 0.052126962691545486, "kl": 0.037872314453125, "learning_rate": 9.64292449004393e-07, "loss": 0.0112, "reward": 0.05133928800933063, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 1789.7947235107422, "epoch": 0.216264655365544, "grad_norm": 0.04103609547019005, "kl": 0.0371551513671875, "learning_rate": 9.635549265158132e-07, "loss": 0.0094, "reward": 0.07142857508733869, "reward_std": 0.03194062924012542, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, "step": 181 }, { "completion_length": 1734.5648193359375, "epoch": 0.2174594877156299, "grad_norm": 0.0495380200445652, "kl": 0.0365447998046875, "learning_rate": 9.628101863637217e-07, "loss": 0.0037, "reward": 0.06250000209547579, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.06250000209547579, "rewards/format_reward": 0.0, "step": 182 }, { "completion_length": 1743.8281936645508, "epoch": 0.21865432006571578, "grad_norm": 0.0665469691157341, "kl": 0.03769683837890625, "learning_rate": 9.620582415458692e-07, "loss": 0.0079, "reward": 0.06919643096625805, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "step": 183 }, { "completion_length": 1786.3282012939453, "epoch": 0.21984915241580166, "grad_norm": 0.056005463004112244, "kl": 0.04552459716796875, "learning_rate": 9.612991051857472e-07, "loss": 0.0174, "reward": 0.07366071688011289, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.07366071688011289, "rewards/format_reward": 0.0, "step": 184 }, { "completion_length": 1768.0067825317383, "epoch": 0.22104398476588755, "grad_norm": 0.05328615754842758, "kl": 0.0373992919921875, "learning_rate": 9.605327905323599e-07, "loss": 0.0071, "reward": 0.05133928800933063, "reward_std": 0.023702684324234724, "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, "step": 185 }, { "completion_length": 1812.3616943359375, "epoch": 0.2222388171159734, "grad_norm": 0.042483970522880554, "kl": 0.036956787109375, "learning_rate": 9.597593109599917e-07, "loss": 0.0083, "reward": 0.055803575087338686, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.055803575087338686, "rewards/format_reward": 0.0, "step": 186 }, { "completion_length": 1834.8639373779297, "epoch": 0.2234336494660593, "grad_norm": 0.06950689107179642, "kl": 0.04253387451171875, "learning_rate": 9.58978679967975e-07, "loss": 0.0191, "reward": 0.07366071781143546, "reward_std": 0.05564331216737628, "rewards/accuracy_reward": 0.07366071781143546, "rewards/format_reward": 0.0, "step": 187 }, { "completion_length": 1791.3415985107422, "epoch": 0.22462848181614517, "grad_norm": 0.049344152212142944, "kl": 0.0380096435546875, "learning_rate": 9.581909111804533e-07, "loss": 0.0153, "reward": 0.05357143050059676, "reward_std": 0.04602411389350891, "rewards/accuracy_reward": 0.05357143050059676, "rewards/format_reward": 0.0, "step": 188 }, { "completion_length": 1864.2188415527344, "epoch": 0.22582331416623105, "grad_norm": 0.04422346130013466, "kl": 0.03714752197265625, "learning_rate": 9.573960183461448e-07, "loss": 0.0104, "reward": 0.06473214621655643, "reward_std": 0.02885759761556983, "rewards/accuracy_reward": 0.06473214621655643, "rewards/format_reward": 0.0, "step": 189 }, { "completion_length": 1834.0581130981445, "epoch": 0.22701814651631694, "grad_norm": 0.040585026144981384, "kl": 0.03923797607421875, "learning_rate": 9.565940153381013e-07, "loss": 0.0092, "reward": 0.08705357671715319, "reward_std": 0.028857596684247255, "rewards/accuracy_reward": 0.08705357671715319, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 1800.26123046875, "epoch": 0.22821297886640282, "grad_norm": 0.047462135553359985, "kl": 0.04217529296875, "learning_rate": 9.55784916153467e-07, "loss": 0.0112, "reward": 0.09598214761354029, "reward_std": 0.04155982844531536, "rewards/accuracy_reward": 0.09598214761354029, "rewards/format_reward": 0.0, "step": 191 }, { "completion_length": 1757.6228485107422, "epoch": 0.22940781121648868, "grad_norm": 0.05245211720466614, "kl": 0.0369415283203125, "learning_rate": 9.549687349132335e-07, "loss": 0.0143, "reward": 0.08705357601866126, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.08705357601866126, "rewards/format_reward": 0.0, "step": 192 }, { "completion_length": 1833.9822463989258, "epoch": 0.23060264356657456, "grad_norm": 0.05480256304144859, "kl": 0.0359649658203125, "learning_rate": 9.541454858619935e-07, "loss": 0.0052, "reward": 0.08258928940631449, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, "step": 193 }, { "completion_length": 1782.5826797485352, "epoch": 0.23179747591666044, "grad_norm": 0.03965674340724945, "kl": 0.03722381591796875, "learning_rate": 9.533151833676927e-07, "loss": 0.0096, "reward": 0.07812500395812094, "reward_std": 0.03125000186264515, "rewards/accuracy_reward": 0.07812500395812094, "rewards/format_reward": 0.0, "step": 194 }, { "completion_length": 1788.1451797485352, "epoch": 0.23299230826674633, "grad_norm": 0.06590753048658371, "kl": 0.03614044189453125, "learning_rate": 9.524778419213782e-07, "loss": 0.0141, "reward": 0.06919643189758062, "reward_std": 0.054262058809399605, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "step": 195 }, { "completion_length": 1774.8125762939453, "epoch": 0.2341871406168322, "grad_norm": 0.06757646054029465, "kl": 0.0346527099609375, "learning_rate": 9.516334761369466e-07, "loss": 0.0062, "reward": 0.06919643166474998, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.06919643166474998, "rewards/format_reward": 0.0, "step": 196 }, { "completion_length": 1722.4688339233398, "epoch": 0.23538197296691807, "grad_norm": 0.04876266047358513, "kl": 0.035186767578125, "learning_rate": 9.507821007508878e-07, "loss": 0.0149, "reward": 0.08482143259607255, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.08482143259607255, "rewards/format_reward": 0.0, "step": 197 }, { "completion_length": 1728.2679290771484, "epoch": 0.23657680531700395, "grad_norm": 0.05999374017119408, "kl": 0.0343170166015625, "learning_rate": 9.499237306220287e-07, "loss": 0.0158, "reward": 0.1093750074505806, "reward_std": 0.04947724984958768, "rewards/accuracy_reward": 0.1093750074505806, "rewards/format_reward": 0.0, "step": 198 }, { "completion_length": 1654.4933547973633, "epoch": 0.23777163766708984, "grad_norm": 0.09377201646566391, "kl": 0.0353851318359375, "learning_rate": 9.490583807312737e-07, "loss": 0.0227, "reward": 0.08482143236324191, "reward_std": 0.060107598546892405, "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.0, "step": 199 }, { "completion_length": 1687.660789489746, "epoch": 0.23896647001717572, "grad_norm": 0.03733208402991295, "kl": 0.03704071044921875, "learning_rate": 9.481860661813429e-07, "loss": 0.0114, "reward": 0.053571431431919336, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, "step": 200 }, { "completion_length": 1567.0380096435547, "epoch": 0.2401613023672616, "grad_norm": 0.06525576859712601, "kl": 0.08606719970703125, "learning_rate": 9.473068021965087e-07, "loss": 0.0055, "reward": 0.06696428940631449, "reward_std": 0.06388125754892826, "rewards/accuracy_reward": 0.06696428940631449, "rewards/format_reward": 0.0, "step": 201 }, { "completion_length": 1622.245590209961, "epoch": 0.24135613471734746, "grad_norm": 0.11824239790439606, "kl": 0.070587158203125, "learning_rate": 9.464206041223303e-07, "loss": 0.0267, "reward": 0.09821429033763707, "reward_std": 0.060107598546892405, "rewards/accuracy_reward": 0.09821429033763707, "rewards/format_reward": 0.0, "step": 202 }, { "completion_length": 1623.846061706543, "epoch": 0.24255096706743334, "grad_norm": 0.08403824269771576, "kl": 0.0413360595703125, "learning_rate": 9.45527487425386e-07, "loss": 0.0153, "reward": 0.06919643189758062, "reward_std": 0.04294108273461461, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "step": 203 }, { "completion_length": 1670.7300033569336, "epoch": 0.24374579941751923, "grad_norm": 0.040049828588962555, "kl": 0.0520172119140625, "learning_rate": 9.446274676930022e-07, "loss": 0.0123, "reward": 0.060267859837040305, "reward_std": 0.034012510906904936, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, "step": 204 }, { "completion_length": 1616.1161346435547, "epoch": 0.2449406317676051, "grad_norm": 0.04737696051597595, "kl": 0.041259765625, "learning_rate": 9.437205606329825e-07, "loss": 0.0131, "reward": 0.08258929057046771, "reward_std": 0.055643313098698854, "rewards/accuracy_reward": 0.08258929057046771, "rewards/format_reward": 0.0, "step": 205 }, { "completion_length": 1631.3750686645508, "epoch": 0.246135464117691, "grad_norm": 0.06103721261024475, "kl": 0.0520172119140625, "learning_rate": 9.428067820733334e-07, "loss": 0.013, "reward": 0.07589286146685481, "reward_std": 0.058726344257593155, "rewards/accuracy_reward": 0.07589286146685481, "rewards/format_reward": 0.0, "step": 206 }, { "completion_length": 1677.4420318603516, "epoch": 0.24733029646777688, "grad_norm": 0.06075451895594597, "kl": 0.0617523193359375, "learning_rate": 9.418861479619879e-07, "loss": 0.0058, "reward": 0.06919643306173384, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.06919643306173384, "rewards/format_reward": 0.0, "step": 207 }, { "completion_length": 1646.7366790771484, "epoch": 0.24852512881786273, "grad_norm": 0.08363071829080582, "kl": 0.0706024169921875, "learning_rate": 9.409586743665263e-07, "loss": 0.0041, "reward": 0.06696428754366934, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.06696428754366934, "rewards/format_reward": 0.0, "step": 208 }, { "completion_length": 1709.7790756225586, "epoch": 0.24971996116794862, "grad_norm": 0.06936007738113403, "kl": 0.050506591796875, "learning_rate": 9.400243774738977e-07, "loss": 0.0044, "reward": 0.055803573690354824, "reward_std": 0.04294108226895332, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, "step": 209 }, { "completion_length": 1670.4442672729492, "epoch": 0.2509147935180345, "grad_norm": 0.04232772812247276, "kl": 0.064300537109375, "learning_rate": 9.390832735901357e-07, "loss": 0.0145, "reward": 0.0848214328289032, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 1629.9978485107422, "epoch": 0.25210962586812036, "grad_norm": 0.05225410684943199, "kl": 0.05120849609375, "learning_rate": 9.381353791400747e-07, "loss": 0.0095, "reward": 0.06696428917348385, "reward_std": 0.04225045535713434, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "step": 211 }, { "completion_length": 1737.283576965332, "epoch": 0.25330445821820624, "grad_norm": 0.0638667419552803, "kl": 0.0720977783203125, "learning_rate": 9.371807106670627e-07, "loss": 0.0103, "reward": 0.07589286076836288, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.07589286076836288, "rewards/format_reward": 0.0, "step": 212 }, { "completion_length": 1765.9487380981445, "epoch": 0.2544992905682921, "grad_norm": 0.06025272235274315, "kl": 0.0511016845703125, "learning_rate": 9.362192848326734e-07, "loss": 0.0042, "reward": 0.08035714621655643, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.08035714621655643, "rewards/format_reward": 0.0, "step": 213 }, { "completion_length": 1823.5290908813477, "epoch": 0.255694122918378, "grad_norm": 0.07083398848772049, "kl": 0.0521392822265625, "learning_rate": 9.352511184164149e-07, "loss": 0.017, "reward": 0.06250000325962901, "reward_std": 0.046024113427847624, "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.0, "step": 214 }, { "completion_length": 1815.1920471191406, "epoch": 0.2568889552684639, "grad_norm": 0.06432505697011948, "kl": 0.055389404296875, "learning_rate": 9.342762283154364e-07, "loss": 0.0087, "reward": 0.08035714738070965, "reward_std": 0.04225045535713434, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "step": 215 }, { "completion_length": 1860.8438262939453, "epoch": 0.2580837876185498, "grad_norm": 0.08157993108034134, "kl": 0.052581787109375, "learning_rate": 9.332946315442342e-07, "loss": 0.007, "reward": 0.07589286006987095, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "step": 216 }, { "completion_length": 1815.9866943359375, "epoch": 0.25927861996863566, "grad_norm": 0.07907180488109589, "kl": 0.0516815185546875, "learning_rate": 9.323063452343542e-07, "loss": 0.0189, "reward": 0.08482143259607255, "reward_std": 0.06388125708326697, "rewards/accuracy_reward": 0.08482143259607255, "rewards/format_reward": 0.0, "step": 217 }, { "completion_length": 1818.3058853149414, "epoch": 0.26047345231872154, "grad_norm": 0.05212155357003212, "kl": 0.05987548828125, "learning_rate": 9.313113866340929e-07, "loss": 0.0106, "reward": 0.0580357164144516, "reward_std": 0.019238398410379887, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "step": 218 }, { "completion_length": 1854.759033203125, "epoch": 0.2616682846688074, "grad_norm": 0.06010846793651581, "kl": 0.0521697998046875, "learning_rate": 9.303097731081968e-07, "loss": 0.0048, "reward": 0.058035716880112886, "reward_std": 0.019238398410379887, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "step": 219 }, { "completion_length": 1842.9598922729492, "epoch": 0.2628631170188933, "grad_norm": 0.05745924636721611, "kl": 0.0491943359375, "learning_rate": 9.293015221375586e-07, "loss": 0.0097, "reward": 0.09821428963914514, "reward_std": 0.052560281939804554, "rewards/accuracy_reward": 0.09821428963914514, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 1851.6250915527344, "epoch": 0.26405794936897914, "grad_norm": 0.045208580791950226, "kl": 0.05609130859375, "learning_rate": 9.282866513189129e-07, "loss": 0.0126, "reward": 0.060267860535532236, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "step": 221 }, { "completion_length": 1846.767936706543, "epoch": 0.265252781719065, "grad_norm": 0.061325449496507645, "kl": 0.04986572265625, "learning_rate": 9.272651783645288e-07, "loss": 0.0163, "reward": 0.11830357671715319, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.11830357671715319, "rewards/format_reward": 0.0, "step": 222 }, { "completion_length": 1822.852767944336, "epoch": 0.2664476140691509, "grad_norm": 0.05418906360864639, "kl": 0.04730224609375, "learning_rate": 9.262371211019004e-07, "loss": 0.0111, "reward": 0.08705357601866126, "reward_std": 0.03778616897761822, "rewards/accuracy_reward": 0.08705357601866126, "rewards/format_reward": 0.0, "step": 223 }, { "completion_length": 1842.805892944336, "epoch": 0.2676424464192368, "grad_norm": 0.04060911387205124, "kl": 0.04571533203125, "learning_rate": 9.25202497473436e-07, "loss": 0.0083, "reward": 0.060267859837040305, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, "step": 224 }, { "completion_length": 1787.5581130981445, "epoch": 0.2688372787693227, "grad_norm": 0.0416116788983345, "kl": 0.045928955078125, "learning_rate": 9.241613255361454e-07, "loss": 0.0067, "reward": 0.08258928940631449, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, "step": 225 }, { "completion_length": 1808.8884735107422, "epoch": 0.27003211111940856, "grad_norm": 0.05132749676704407, "kl": 0.0447845458984375, "learning_rate": 9.231136234613233e-07, "loss": 0.0077, "reward": 0.06250000302679837, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, "step": 226 }, { "completion_length": 1779.058120727539, "epoch": 0.27122694346949444, "grad_norm": 0.07144412398338318, "kl": 0.044708251953125, "learning_rate": 9.220594095342344e-07, "loss": 0.0155, "reward": 0.04910714481957257, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.04910714481957257, "rewards/format_reward": 0.0, "step": 227 }, { "completion_length": 1777.3639221191406, "epoch": 0.2724217758195803, "grad_norm": 0.05889216437935829, "kl": 0.04315185546875, "learning_rate": 9.209987021537921e-07, "loss": 0.0151, "reward": 0.06473214598372579, "reward_std": 0.05048839980736375, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "step": 228 }, { "completion_length": 1783.9107818603516, "epoch": 0.2736166081696662, "grad_norm": 0.07380811125040054, "kl": 0.039520263671875, "learning_rate": 9.199315198322385e-07, "loss": 0.0197, "reward": 0.07589286076836288, "reward_std": 0.04979777196422219, "rewards/accuracy_reward": 0.07589286076836288, "rewards/format_reward": 0.0, "step": 229 }, { "completion_length": 1757.2031936645508, "epoch": 0.2748114405197521, "grad_norm": 0.04281708225607872, "kl": 0.0430755615234375, "learning_rate": 9.188578811948214e-07, "loss": 0.0124, "reward": 0.09151786100119352, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.09151786100119352, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 1748.6317749023438, "epoch": 0.276006272869838, "grad_norm": 0.06471376866102219, "kl": 0.0422515869140625, "learning_rate": 9.177778049794686e-07, "loss": 0.0092, "reward": 0.08258928917348385, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "step": 231 }, { "completion_length": 1766.346061706543, "epoch": 0.2772011052199238, "grad_norm": 0.04782126098871231, "kl": 0.0407562255859375, "learning_rate": 9.166913100364615e-07, "loss": 0.0098, "reward": 0.04241071711294353, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.04241071711294353, "rewards/format_reward": 0.0, "step": 232 }, { "completion_length": 1734.0536651611328, "epoch": 0.2783959375700097, "grad_norm": 0.047262661159038544, "kl": 0.04083251953125, "learning_rate": 9.155984153281057e-07, "loss": 0.0041, "reward": 0.06696428940631449, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.06696428940631449, "rewards/format_reward": 0.0, "step": 233 }, { "completion_length": 1707.308120727539, "epoch": 0.27959076992009557, "grad_norm": 0.04049357771873474, "kl": 0.040557861328125, "learning_rate": 9.144991399284e-07, "loss": 0.0088, "reward": 0.08705357555299997, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, "step": 234 }, { "completion_length": 1717.5625686645508, "epoch": 0.28078560227018146, "grad_norm": 0.04044070467352867, "kl": 0.081390380859375, "learning_rate": 9.133935030227042e-07, "loss": 0.0151, "reward": 0.03571428754366934, "reward_std": 0.03194062924012542, "rewards/accuracy_reward": 0.03571428754366934, "rewards/format_reward": 0.0, "step": 235 }, { "completion_length": 1655.3148040771484, "epoch": 0.28198043462026734, "grad_norm": 0.03544924780726433, "kl": 0.049896240234375, "learning_rate": 9.122815239074033e-07, "loss": 0.0085, "reward": 0.04017857345752418, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.04017857345752418, "rewards/format_reward": 0.0, "step": 236 }, { "completion_length": 1652.7902450561523, "epoch": 0.2831752669703532, "grad_norm": 0.03821130469441414, "kl": 0.0517425537109375, "learning_rate": 9.111632219895714e-07, "loss": 0.0103, "reward": 0.04687500139698386, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.04687500139698386, "rewards/format_reward": 0.0, "step": 237 }, { "completion_length": 1727.009017944336, "epoch": 0.2843700993204391, "grad_norm": 0.042800839990377426, "kl": 0.03711700439453125, "learning_rate": 9.100386167866327e-07, "loss": 0.013, "reward": 0.06026786030270159, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.06026786030270159, "rewards/format_reward": 0.0, "step": 238 }, { "completion_length": 1660.9532012939453, "epoch": 0.285564931670525, "grad_norm": 0.06006590276956558, "kl": 0.039093017578125, "learning_rate": 9.089077279260211e-07, "loss": 0.0159, "reward": 0.042410716181620955, "reward_std": 0.037786169443279505, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, "step": 239 }, { "completion_length": 1706.0759658813477, "epoch": 0.2867597640206109, "grad_norm": 0.10355778783559799, "kl": 0.03755950927734375, "learning_rate": 9.077705751448373e-07, "loss": 0.025, "reward": 0.08258928917348385, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 1745.0313339233398, "epoch": 0.28795459637069676, "grad_norm": 0.11789200454950333, "kl": 0.039825439453125, "learning_rate": 9.066271782895047e-07, "loss": 0.0271, "reward": 0.07589286076836288, "reward_std": 0.05117902671918273, "rewards/accuracy_reward": 0.07589286076836288, "rewards/format_reward": 0.0, "step": 241 }, { "completion_length": 1682.6005172729492, "epoch": 0.28914942872078264, "grad_norm": 0.04094213992357254, "kl": 0.039642333984375, "learning_rate": 9.054775573154226e-07, "loss": 0.0091, "reward": 0.044642857974395156, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.044642857974395156, "rewards/format_reward": 0.0, "step": 242 }, { "completion_length": 1675.1607971191406, "epoch": 0.29034426107086847, "grad_norm": 0.04326920956373215, "kl": 0.0663909912109375, "learning_rate": 9.043217322866185e-07, "loss": 0.0138, "reward": 0.07589285913854837, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.07589285913854837, "rewards/format_reward": 0.0, "step": 243 }, { "completion_length": 1683.9755249023438, "epoch": 0.29153909342095435, "grad_norm": 0.0396721176803112, "kl": 0.043792724609375, "learning_rate": 9.031597233753974e-07, "loss": 0.0106, "reward": 0.07366071711294353, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.07366071711294353, "rewards/format_reward": 0.0, "step": 244 }, { "completion_length": 1721.0715103149414, "epoch": 0.29273392577104024, "grad_norm": 0.06731508672237396, "kl": 0.043792724609375, "learning_rate": 9.0199155086199e-07, "loss": 0.0023, "reward": 0.0736607164144516, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "step": 245 }, { "completion_length": 1678.5692825317383, "epoch": 0.2939287581211261, "grad_norm": 0.07014211267232895, "kl": 0.0460357666015625, "learning_rate": 9.008172351341988e-07, "loss": 0.0076, "reward": 0.09375000488944352, "reward_std": 0.023012056481093168, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "step": 246 }, { "completion_length": 1677.8014221191406, "epoch": 0.295123590471212, "grad_norm": 0.05396761745214462, "kl": 0.07578277587890625, "learning_rate": 8.99636796687042e-07, "loss": 0.0107, "reward": 0.08482143259607255, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.08482143259607255, "rewards/format_reward": 0.0, "step": 247 }, { "completion_length": 1693.6384506225586, "epoch": 0.2963184228212979, "grad_norm": 0.05983927473425865, "kl": 0.048583984375, "learning_rate": 8.98450256122396e-07, "loss": 0.0112, "reward": 0.058035716880112886, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "step": 248 }, { "completion_length": 1772.792495727539, "epoch": 0.2975132551713838, "grad_norm": 0.047884415835142136, "kl": 0.045318603515625, "learning_rate": 8.97257634148636e-07, "loss": 0.0116, "reward": 0.08258929033763707, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, "step": 249 }, { "completion_length": 1814.9978408813477, "epoch": 0.29870808752146966, "grad_norm": 0.04011714085936546, "kl": 0.0483856201171875, "learning_rate": 8.960589515802743e-07, "loss": 0.015, "reward": 0.09375000395812094, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.09375000395812094, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 1744.1384735107422, "epoch": 0.29990291987155554, "grad_norm": 0.05059587210416794, "kl": 0.169952392578125, "learning_rate": 8.948542293375971e-07, "loss": 0.0085, "reward": 0.09151786030270159, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.09151786030270159, "rewards/format_reward": 0.0, "step": 251 }, { "completion_length": 1757.1250762939453, "epoch": 0.3010977522216414, "grad_norm": 0.04547136649489403, "kl": 0.0452423095703125, "learning_rate": 8.936434884462994e-07, "loss": 0.01, "reward": 0.07366071781143546, "reward_std": 0.02885759761556983, "rewards/accuracy_reward": 0.07366071781143546, "rewards/format_reward": 0.0, "step": 252 }, { "completion_length": 1814.2411651611328, "epoch": 0.30229258457172725, "grad_norm": 0.04067513346672058, "kl": 0.041595458984375, "learning_rate": 8.924267500371181e-07, "loss": 0.0084, "reward": 0.09151786076836288, "reward_std": 0.04533348651602864, "rewards/accuracy_reward": 0.09151786076836288, "rewards/format_reward": 0.0, "step": 253 }, { "completion_length": 1765.8572158813477, "epoch": 0.30348741692181314, "grad_norm": 0.09482776373624802, "kl": 0.0472412109375, "learning_rate": 8.912040353454634e-07, "loss": 0.0192, "reward": 0.10267857671715319, "reward_std": 0.06010759808123112, "rewards/accuracy_reward": 0.10267857671715319, "rewards/format_reward": 0.0, "step": 254 }, { "completion_length": 1802.1652603149414, "epoch": 0.304682249271899, "grad_norm": 0.04419989883899689, "kl": 0.044830322265625, "learning_rate": 8.899753657110475e-07, "loss": 0.0108, "reward": 0.12500000558793545, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 255 }, { "completion_length": 1758.2679290771484, "epoch": 0.3058770816219849, "grad_norm": 0.058879632502794266, "kl": 0.04559326171875, "learning_rate": 8.887407625775131e-07, "loss": 0.0173, "reward": 0.053571431431919336, "reward_std": 0.04086920013651252, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, "step": 256 }, { "completion_length": 1785.174186706543, "epoch": 0.3070719139720708, "grad_norm": 0.057185444980859756, "kl": 0.0446014404296875, "learning_rate": 8.875002474920582e-07, "loss": 0.0047, "reward": 0.04910714505240321, "reward_std": 0.008928571827709675, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, "step": 257 }, { "completion_length": 1794.3661422729492, "epoch": 0.30826674632215667, "grad_norm": 0.04548226669430733, "kl": 0.043182373046875, "learning_rate": 8.86253842105061e-07, "loss": 0.013, "reward": 0.058035718044266105, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.058035718044266105, "rewards/format_reward": 0.0, "step": 258 }, { "completion_length": 1843.4353408813477, "epoch": 0.30946157867224255, "grad_norm": 0.05014818534255028, "kl": 0.0458526611328125, "learning_rate": 8.850015681697013e-07, "loss": 0.009, "reward": 0.058035717345774174, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "step": 259 }, { "completion_length": 1757.026870727539, "epoch": 0.31065641102232844, "grad_norm": 0.056322745978832245, "kl": 0.0449371337890625, "learning_rate": 8.83743447541581e-07, "loss": 0.0109, "reward": 0.09375000395812094, "reward_std": 0.044642859138548374, "rewards/accuracy_reward": 0.09375000395812094, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 1825.1652603149414, "epoch": 0.3118512433724143, "grad_norm": 0.07245280593633652, "kl": 0.0457611083984375, "learning_rate": 8.824795021783428e-07, "loss": 0.0152, "reward": 0.051339288242161274, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "step": 261 }, { "completion_length": 1823.8973999023438, "epoch": 0.3130460757225002, "grad_norm": 0.051859915256500244, "kl": 0.0439605712890625, "learning_rate": 8.812097541392871e-07, "loss": 0.0149, "reward": 0.06473214621655643, "reward_std": 0.028857597149908543, "rewards/accuracy_reward": 0.06473214621655643, "rewards/format_reward": 0.0, "step": 262 }, { "completion_length": 1830.221061706543, "epoch": 0.3142409080725861, "grad_norm": 0.05041242763400078, "kl": 0.0460662841796875, "learning_rate": 8.799342255849871e-07, "loss": 0.0056, "reward": 0.06696428847499192, "reward_std": 0.03332188352942467, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "step": 263 }, { "completion_length": 1841.0849151611328, "epoch": 0.3154357404226719, "grad_norm": 0.043725624680519104, "kl": 0.0540313720703125, "learning_rate": 8.786529387769012e-07, "loss": 0.0107, "reward": 0.08705357555299997, "reward_std": 0.04294108273461461, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, "step": 264 }, { "completion_length": 1874.9219665527344, "epoch": 0.3166305727727578, "grad_norm": 0.04214875027537346, "kl": 0.043243408203125, "learning_rate": 8.773659160769853e-07, "loss": 0.0088, "reward": 0.055803574388846755, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, "step": 265 }, { "completion_length": 1831.6027603149414, "epoch": 0.3178254051228437, "grad_norm": 0.03985632583498955, "kl": 0.0450439453125, "learning_rate": 8.760731799473024e-07, "loss": 0.0084, "reward": 0.07142857555299997, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.07142857555299997, "rewards/format_reward": 0.0, "step": 266 }, { "completion_length": 1814.2567977905273, "epoch": 0.31902023747292957, "grad_norm": 0.043512940406799316, "kl": 0.042449951171875, "learning_rate": 8.747747529496302e-07, "loss": 0.0075, "reward": 0.08258928940631449, "reward_std": 0.04294108273461461, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, "step": 267 }, { "completion_length": 1813.1183776855469, "epoch": 0.32021506982301545, "grad_norm": 0.0412718839943409, "kl": 0.0396575927734375, "learning_rate": 8.734706577450682e-07, "loss": 0.0085, "reward": 0.10714286123402417, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.10714286123402417, "rewards/format_reward": 0.0, "step": 268 }, { "completion_length": 1816.4085693359375, "epoch": 0.32140990217310134, "grad_norm": 0.03997774422168732, "kl": 0.03961181640625, "learning_rate": 8.721609170936409e-07, "loss": 0.0059, "reward": 0.06919643143191934, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, "step": 269 }, { "completion_length": 1759.7701797485352, "epoch": 0.3226047345231872, "grad_norm": 0.10376866906881332, "kl": 0.04351806640625, "learning_rate": 8.708455538539014e-07, "loss": 0.02, "reward": 0.09598214807920158, "reward_std": 0.060798225458711386, "rewards/accuracy_reward": 0.09598214807920158, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 1742.2009735107422, "epoch": 0.3237995668732731, "grad_norm": 0.05522194504737854, "kl": 0.03875732421875, "learning_rate": 8.695245909825326e-07, "loss": 0.0121, "reward": 0.11160714854486287, "reward_std": 0.05117902718484402, "rewards/accuracy_reward": 0.11160714854486287, "rewards/format_reward": 0.0, "step": 271 }, { "completion_length": 1793.2389221191406, "epoch": 0.324994399223359, "grad_norm": 0.039614755660295486, "kl": 0.04241943359375, "learning_rate": 8.681980515339463e-07, "loss": 0.0137, "reward": 0.07366071874275804, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.07366071874275804, "rewards/format_reward": 0.0, "step": 272 }, { "completion_length": 1816.174186706543, "epoch": 0.32618923157344487, "grad_norm": 0.04751076176762581, "kl": 0.0428314208984375, "learning_rate": 8.668659586598807e-07, "loss": 0.0143, "reward": 0.06696428847499192, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "step": 273 }, { "completion_length": 1719.9688262939453, "epoch": 0.32738406392353075, "grad_norm": 0.04421813413500786, "kl": 0.04296875, "learning_rate": 8.65528335608996e-07, "loss": 0.0147, "reward": 0.09375000442378223, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.09375000442378223, "rewards/format_reward": 0.0, "step": 274 }, { "completion_length": 1744.9889068603516, "epoch": 0.3285788962736166, "grad_norm": 0.09624681621789932, "kl": 0.041839599609375, "learning_rate": 8.641852057264699e-07, "loss": 0.0188, "reward": 0.08928571827709675, "reward_std": 0.042250454891473055, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "step": 275 }, { "completion_length": 1769.7433700561523, "epoch": 0.32977372862370247, "grad_norm": 0.040807534009218216, "kl": 0.041839599609375, "learning_rate": 8.628365924535891e-07, "loss": 0.0102, "reward": 0.06250000325962901, "reward_std": 0.023012056481093168, "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.0, "step": 276 }, { "completion_length": 1770.0804443359375, "epoch": 0.33096856097378835, "grad_norm": 0.03713390231132507, "kl": 0.044036865234375, "learning_rate": 8.614825193273397e-07, "loss": 0.0086, "reward": 0.12500000675208867, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.12500000675208867, "rewards/format_reward": 0.0, "step": 277 }, { "completion_length": 1737.9375762939453, "epoch": 0.33216339332387423, "grad_norm": 0.08166145533323288, "kl": 0.044830322265625, "learning_rate": 8.601230099799983e-07, "loss": 0.0173, "reward": 0.12723214994184673, "reward_std": 0.059416971169412136, "rewards/accuracy_reward": 0.12723214994184673, "rewards/format_reward": 0.0, "step": 278 }, { "completion_length": 1745.642936706543, "epoch": 0.3333582256739601, "grad_norm": 0.04890977591276169, "kl": 0.04410552978515625, "learning_rate": 8.587580881387179e-07, "loss": 0.0106, "reward": 0.09375000256113708, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.09375000256113708, "rewards/format_reward": 0.0, "step": 279 }, { "completion_length": 1719.1027526855469, "epoch": 0.334553058024046, "grad_norm": 0.050948407500982285, "kl": 0.0501708984375, "learning_rate": 8.573877776251139e-07, "loss": 0.0088, "reward": 0.051339288242161274, "reward_std": 0.023702683858573437, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 1711.7634658813477, "epoch": 0.3357478903741319, "grad_norm": 0.052441731095314026, "kl": 0.0516815185546875, "learning_rate": 8.560121023548493e-07, "loss": 0.0131, "reward": 0.06473214505240321, "reward_std": 0.028857597149908543, "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.0, "step": 281 }, { "completion_length": 1719.4643630981445, "epoch": 0.33694272272421777, "grad_norm": 0.05112472549080849, "kl": 0.0706024169921875, "learning_rate": 8.546310863372168e-07, "loss": 0.0155, "reward": 0.05803571711294353, "reward_std": 0.042250454891473055, "rewards/accuracy_reward": 0.05803571711294353, "rewards/format_reward": 0.0, "step": 282 }, { "completion_length": 1731.7054290771484, "epoch": 0.33813755507430365, "grad_norm": 0.10207454115152359, "kl": 0.0526885986328125, "learning_rate": 8.532447536747196e-07, "loss": 0.026, "reward": 0.05803571664728224, "reward_std": 0.046024113427847624, "rewards/accuracy_reward": 0.05803571664728224, "rewards/format_reward": 0.0, "step": 283 }, { "completion_length": 1722.6585693359375, "epoch": 0.33933238742438954, "grad_norm": 0.06487075984477997, "kl": 0.05938720703125, "learning_rate": 8.518531285626506e-07, "loss": 0.0195, "reward": 0.06473214575089514, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.06473214575089514, "rewards/format_reward": 0.0, "step": 284 }, { "completion_length": 1652.32373046875, "epoch": 0.3405272197744754, "grad_norm": 0.0555218942463398, "kl": 0.0539703369140625, "learning_rate": 8.504562352886706e-07, "loss": 0.0116, "reward": 0.055803573690354824, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, "step": 285 }, { "completion_length": 1730.852767944336, "epoch": 0.34172205212456125, "grad_norm": 0.10037334263324738, "kl": 0.0613555908203125, "learning_rate": 8.490540982323844e-07, "loss": 0.02, "reward": 0.04910714575089514, "reward_std": 0.0549526852555573, "rewards/accuracy_reward": 0.04910714575089514, "rewards/format_reward": 0.0, "step": 286 }, { "completion_length": 1699.8907089233398, "epoch": 0.34291688447464713, "grad_norm": 0.073909230530262, "kl": 0.0539703369140625, "learning_rate": 8.476467418649152e-07, "loss": 0.0194, "reward": 0.11383929173462093, "reward_std": 0.04671474127098918, "rewards/accuracy_reward": 0.11383929173462093, "rewards/format_reward": 0.0, "step": 287 }, { "completion_length": 1662.7947235107422, "epoch": 0.344111716824733, "grad_norm": 0.06144828721880913, "kl": 0.0787200927734375, "learning_rate": 8.462341907484767e-07, "loss": 0.0138, "reward": 0.07589286100119352, "reward_std": 0.04225045535713434, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "step": 288 }, { "completion_length": 1701.5781936645508, "epoch": 0.3453065491748189, "grad_norm": 0.0555301271378994, "kl": 0.0649871826171875, "learning_rate": 8.448164695359461e-07, "loss": 0.0129, "reward": 0.10491072037257254, "reward_std": 0.037786169443279505, "rewards/accuracy_reward": 0.10491072037257254, "rewards/format_reward": 0.0, "step": 289 }, { "completion_length": 1637.3683700561523, "epoch": 0.3465013815249048, "grad_norm": 0.05649305507540703, "kl": 0.068267822265625, "learning_rate": 8.433936029704323e-07, "loss": 0.0171, "reward": 0.0714285746216774, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 1691.4532165527344, "epoch": 0.34769621387499067, "grad_norm": 0.06656340509653091, "kl": 0.0740203857421875, "learning_rate": 8.419656158848452e-07, "loss": 0.0198, "reward": 0.08928571827709675, "reward_std": 0.03332188352942467, "rewards/accuracy_reward": 0.08928571827709675, "rewards/format_reward": 0.0, "step": 291 }, { "completion_length": 1646.65185546875, "epoch": 0.34889104622507655, "grad_norm": 0.0833982527256012, "kl": 0.0699615478515625, "learning_rate": 8.40532533201461e-07, "loss": 0.0142, "reward": 0.0825892889406532, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.0825892889406532, "rewards/format_reward": 0.0, "step": 292 }, { "completion_length": 1664.3416061401367, "epoch": 0.35008587857516243, "grad_norm": 0.06360318511724472, "kl": 0.075836181640625, "learning_rate": 8.390943799314888e-07, "loss": 0.0151, "reward": 0.06473214598372579, "reward_std": 0.032631256617605686, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "step": 293 }, { "completion_length": 1732.7344436645508, "epoch": 0.3512807109252483, "grad_norm": 0.11390808969736099, "kl": 0.0800018310546875, "learning_rate": 8.376511811746327e-07, "loss": 0.025, "reward": 0.07812500349245965, "reward_std": 0.05564331263303757, "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, "step": 294 }, { "completion_length": 1651.823745727539, "epoch": 0.3524755432753342, "grad_norm": 0.07286576926708221, "kl": 0.0741729736328125, "learning_rate": 8.362029621186546e-07, "loss": 0.009, "reward": 0.10267857741564512, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.10267857741564512, "rewards/format_reward": 0.0, "step": 295 }, { "completion_length": 1670.0223999023438, "epoch": 0.3536703756254201, "grad_norm": 0.09410213679075241, "kl": 0.0783538818359375, "learning_rate": 8.347497480389343e-07, "loss": 0.0043, "reward": 0.07589286076836288, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.07589286076836288, "rewards/format_reward": 0.0, "step": 296 }, { "completion_length": 1723.3125610351562, "epoch": 0.3548652079755059, "grad_norm": 0.151943176984787, "kl": 0.0821685791015625, "learning_rate": 8.332915642980285e-07, "loss": 0.0059, "reward": 0.011160714784637094, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.011160714784637094, "rewards/format_reward": 0.0, "step": 297 }, { "completion_length": 1751.1786499023438, "epoch": 0.3560600403255918, "grad_norm": 0.05442013964056969, "kl": 0.0791015625, "learning_rate": 8.31828436345228e-07, "loss": 0.017, "reward": 0.09375000419095159, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.0, "step": 298 }, { "completion_length": 1669.8929138183594, "epoch": 0.3572548726756777, "grad_norm": 0.09812013059854507, "kl": 0.067352294921875, "learning_rate": 8.303603897161135e-07, "loss": 0.0205, "reward": 0.1272321492433548, "reward_std": 0.05702456785365939, "rewards/accuracy_reward": 0.1272321492433548, "rewards/format_reward": 0.0, "step": 299 }, { "completion_length": 1710.54248046875, "epoch": 0.35844970502576357, "grad_norm": 0.07327042520046234, "kl": 0.077362060546875, "learning_rate": 8.288874500321101e-07, "loss": 0.0108, "reward": 0.10937500582076609, "reward_std": 0.04533348651602864, "rewards/accuracy_reward": 0.10937500582076609, "rewards/format_reward": 0.0, "step": 300 }, { "completion_length": 1669.3840026855469, "epoch": 0.35964453737584945, "grad_norm": 0.06459075957536697, "kl": 0.0799713134765625, "learning_rate": 8.274096430000403e-07, "loss": 0.0145, "reward": 0.07589286216534674, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.07589286216534674, "rewards/format_reward": 0.0, "step": 301 }, { "completion_length": 1805.9018859863281, "epoch": 0.36083936972593533, "grad_norm": 0.05991438403725624, "kl": 0.0738525390625, "learning_rate": 8.259269944116749e-07, "loss": 0.0112, "reward": 0.04910714505240321, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, "step": 302 }, { "completion_length": 1772.4063262939453, "epoch": 0.3620342020760212, "grad_norm": 0.06683792173862457, "kl": 0.0709991455078125, "learning_rate": 8.244395301432829e-07, "loss": 0.0151, "reward": 0.07812500419095159, "reward_std": 0.04671474080532789, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, "step": 303 }, { "completion_length": 1704.5447158813477, "epoch": 0.3632290344261071, "grad_norm": 0.06785424798727036, "kl": 0.0706024169921875, "learning_rate": 8.229472761551802e-07, "loss": 0.0101, "reward": 0.07366071711294353, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.07366071711294353, "rewards/format_reward": 0.0, "step": 304 }, { "completion_length": 1774.1875686645508, "epoch": 0.364423866776193, "grad_norm": 0.04891849681735039, "kl": 0.0732574462890625, "learning_rate": 8.214502584912772e-07, "loss": 0.0119, "reward": 0.06250000325962901, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.0, "step": 305 }, { "completion_length": 1830.47998046875, "epoch": 0.36561869912627887, "grad_norm": 0.04641446843743324, "kl": 0.07135009765625, "learning_rate": 8.19948503278622e-07, "loss": 0.0131, "reward": 0.05133928847499192, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.05133928847499192, "rewards/format_reward": 0.0, "step": 306 }, { "completion_length": 1799.5982971191406, "epoch": 0.3668135314763647, "grad_norm": 0.04893700033426285, "kl": 0.0741729736328125, "learning_rate": 8.18442036726947e-07, "loss": 0.0105, "reward": 0.0669642889406532, "reward_std": 0.02954822452738881, "rewards/accuracy_reward": 0.0669642889406532, "rewards/format_reward": 0.0, "step": 307 }, { "completion_length": 1816.0402603149414, "epoch": 0.3680083638264506, "grad_norm": 0.10018644481897354, "kl": 0.0693359375, "learning_rate": 8.169308851282098e-07, "loss": 0.0212, "reward": 0.06473214621655643, "reward_std": 0.04294108273461461, "rewards/accuracy_reward": 0.06473214621655643, "rewards/format_reward": 0.0, "step": 308 }, { "completion_length": 1826.0893783569336, "epoch": 0.36920319617653646, "grad_norm": 0.05953473597764969, "kl": 0.0752410888671875, "learning_rate": 8.154150748561353e-07, "loss": 0.0186, "reward": 0.08705357392318547, "reward_std": 0.0415598270483315, "rewards/accuracy_reward": 0.08705357392318547, "rewards/format_reward": 0.0, "step": 309 }, { "completion_length": 1825.9688415527344, "epoch": 0.37039802852662235, "grad_norm": 0.05075536295771599, "kl": 0.07208251953125, "learning_rate": 8.138946323657543e-07, "loss": 0.0112, "reward": 0.10044643143191934, "reward_std": 0.040178573690354824, "rewards/accuracy_reward": 0.10044643143191934, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 1817.1094589233398, "epoch": 0.37159286087670823, "grad_norm": 0.05632238835096359, "kl": 0.065582275390625, "learning_rate": 8.123695841929432e-07, "loss": 0.0075, "reward": 0.031250001629814506, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.031250001629814506, "rewards/format_reward": 0.0, "step": 311 }, { "completion_length": 1838.5089950561523, "epoch": 0.3727876932267941, "grad_norm": 0.09342899173498154, "kl": 0.0681610107421875, "learning_rate": 8.108399569539598e-07, "loss": 0.0167, "reward": 0.07812500419095159, "reward_std": 0.055643313098698854, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, "step": 312 }, { "completion_length": 1772.1049880981445, "epoch": 0.37398252557688, "grad_norm": 0.0448901504278183, "kl": 0.0684814453125, "learning_rate": 8.093057773449791e-07, "loss": 0.0109, "reward": 0.07142857415601611, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.07142857415601611, "rewards/format_reward": 0.0, "step": 313 }, { "completion_length": 1803.0916061401367, "epoch": 0.3751773579269659, "grad_norm": 0.07671260088682175, "kl": 0.0693206787109375, "learning_rate": 8.077670721416274e-07, "loss": 0.009, "reward": 0.05357143119908869, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, "step": 314 }, { "completion_length": 1829.9576568603516, "epoch": 0.37637219027705177, "grad_norm": 0.053671155124902725, "kl": 0.072784423828125, "learning_rate": 8.062238681985151e-07, "loss": 0.0072, "reward": 0.06026785960420966, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.06026785960420966, "rewards/format_reward": 0.0, "step": 315 }, { "completion_length": 1798.3974075317383, "epoch": 0.37756702262713765, "grad_norm": 0.051302239298820496, "kl": 0.068939208984375, "learning_rate": 8.046761924487678e-07, "loss": 0.0116, "reward": 0.08928571734577417, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.08928571734577417, "rewards/format_reward": 0.0, "step": 316 }, { "completion_length": 1825.1206436157227, "epoch": 0.37876185497722353, "grad_norm": 0.04785200208425522, "kl": 0.066162109375, "learning_rate": 8.031240719035564e-07, "loss": 0.0064, "reward": 0.08258929033763707, "reward_std": 0.02885759761556983, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, "step": 317 }, { "completion_length": 1757.314811706543, "epoch": 0.37995668732730936, "grad_norm": 0.10219365358352661, "kl": 0.061981201171875, "learning_rate": 8.015675336516255e-07, "loss": 0.0188, "reward": 0.08035714668221772, "reward_std": 0.060107598546892405, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, "step": 318 }, { "completion_length": 1772.033576965332, "epoch": 0.38115151967739525, "grad_norm": 0.0675572007894516, "kl": 0.067962646484375, "learning_rate": 8.00006604858821e-07, "loss": 0.0172, "reward": 0.09598214691504836, "reward_std": 0.051869654096663, "rewards/accuracy_reward": 0.09598214691504836, "rewards/format_reward": 0.0, "step": 319 }, { "completion_length": 1778.6764297485352, "epoch": 0.38234635202748113, "grad_norm": 0.05040724575519562, "kl": 0.0610809326171875, "learning_rate": 7.984413127676156e-07, "loss": 0.0063, "reward": 0.08482143143191934, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.08482143143191934, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 1792.7679443359375, "epoch": 0.383541184377567, "grad_norm": 0.06314099580049515, "kl": 0.066497802734375, "learning_rate": 7.968716846966332e-07, "loss": 0.0129, "reward": 0.08482143236324191, "reward_std": 0.046024113427847624, "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.0, "step": 321 }, { "completion_length": 1787.6384811401367, "epoch": 0.3847360167276529, "grad_norm": 0.0670035183429718, "kl": 0.068603515625, "learning_rate": 7.952977480401729e-07, "loss": 0.0079, "reward": 0.06026785960420966, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.06026785960420966, "rewards/format_reward": 0.0, "step": 322 }, { "completion_length": 1739.15185546875, "epoch": 0.3859308490777388, "grad_norm": 0.09252247959375381, "kl": 0.0670166015625, "learning_rate": 7.937195302677302e-07, "loss": 0.0201, "reward": 0.058035717345774174, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "step": 323 }, { "completion_length": 1768.3415985107422, "epoch": 0.38712568142782466, "grad_norm": 0.06258081644773483, "kl": 0.0606842041015625, "learning_rate": 7.921370589235177e-07, "loss": 0.0161, "reward": 0.08258928987197578, "reward_std": 0.04294108226895332, "rewards/accuracy_reward": 0.08258928987197578, "rewards/format_reward": 0.0, "step": 324 }, { "completion_length": 1760.6139297485352, "epoch": 0.38832051377791055, "grad_norm": 0.11397919803857803, "kl": 0.066650390625, "learning_rate": 7.905503616259843e-07, "loss": 0.0193, "reward": 0.04910714575089514, "reward_std": 0.06010759901255369, "rewards/accuracy_reward": 0.04910714575089514, "rewards/format_reward": 0.0, "step": 325 }, { "completion_length": 1697.736686706543, "epoch": 0.38951534612799643, "grad_norm": 0.050416626036167145, "kl": 0.0697174072265625, "learning_rate": 7.889594660673337e-07, "loss": 0.0145, "reward": 0.06919643236324191, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.06919643236324191, "rewards/format_reward": 0.0, "step": 326 }, { "completion_length": 1696.142936706543, "epoch": 0.3907101784780823, "grad_norm": 0.05142740160226822, "kl": 0.0737457275390625, "learning_rate": 7.873644000130405e-07, "loss": 0.0193, "reward": 0.08035714691504836, "reward_std": 0.04602411389350891, "rewards/accuracy_reward": 0.08035714691504836, "rewards/format_reward": 0.0, "step": 327 }, { "completion_length": 1690.4219589233398, "epoch": 0.3919050108281682, "grad_norm": 0.11544405668973923, "kl": 0.0735626220703125, "learning_rate": 7.857651913013659e-07, "loss": 0.0297, "reward": 0.07142857485450804, "reward_std": 0.06388125754892826, "rewards/accuracy_reward": 0.07142857485450804, "rewards/format_reward": 0.0, "step": 328 }, { "completion_length": 1619.738899230957, "epoch": 0.393099843178254, "grad_norm": 0.22098380327224731, "kl": 0.0800323486328125, "learning_rate": 7.841618678428718e-07, "loss": 0.0372, "reward": 0.13392857601866126, "reward_std": 0.07557233795523643, "rewards/accuracy_reward": 0.13392857601866126, "rewards/format_reward": 0.0, "step": 329 }, { "completion_length": 1665.7478408813477, "epoch": 0.3942946755283399, "grad_norm": 0.06823365390300751, "kl": 0.0889129638671875, "learning_rate": 7.825544576199335e-07, "loss": 0.0144, "reward": 0.07589286076836288, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.07589286076836288, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 1664.1384811401367, "epoch": 0.3954895078784258, "grad_norm": 0.15817119181156158, "kl": 0.094207763671875, "learning_rate": 7.809429886862518e-07, "loss": 0.0065, "reward": 0.03125000139698386, "reward_std": 0.008928571827709675, "rewards/accuracy_reward": 0.03125000139698386, "rewards/format_reward": 0.0, "step": 331 }, { "completion_length": 1612.8817825317383, "epoch": 0.3966843402285117, "grad_norm": 0.08517898619174957, "kl": 0.09564208984375, "learning_rate": 7.793274891663629e-07, "loss": 0.0114, "reward": 0.11830357648432255, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.11830357648432255, "rewards/format_reward": 0.0, "step": 332 }, { "completion_length": 1673.2165985107422, "epoch": 0.39787917257859756, "grad_norm": 0.07917753607034683, "kl": 0.103057861328125, "learning_rate": 7.777079872551478e-07, "loss": 0.018, "reward": 0.09821429057046771, "reward_std": 0.03571428777649999, "rewards/accuracy_reward": 0.09821429057046771, "rewards/format_reward": 0.0, "step": 333 }, { "completion_length": 1631.4621276855469, "epoch": 0.39907400492868345, "grad_norm": 0.08698119223117828, "kl": 0.1060791015625, "learning_rate": 7.760845112173401e-07, "loss": 0.0234, "reward": 0.11383929173462093, "reward_std": 0.05186965363100171, "rewards/accuracy_reward": 0.11383929173462093, "rewards/format_reward": 0.0, "step": 334 }, { "completion_length": 1653.6652526855469, "epoch": 0.40026883727876933, "grad_norm": 0.07778538763523102, "kl": 0.115478515625, "learning_rate": 7.744570893870329e-07, "loss": 0.0193, "reward": 0.1361607222352177, "reward_std": 0.05564331263303757, "rewards/accuracy_reward": 0.1361607222352177, "rewards/format_reward": 0.0, "step": 335 }, { "completion_length": 1644.9063415527344, "epoch": 0.4014636696288552, "grad_norm": 0.06362377107143402, "kl": 0.119873046875, "learning_rate": 7.728257501671841e-07, "loss": 0.023, "reward": 0.06919643096625805, "reward_std": 0.04294108133763075, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "step": 336 }, { "completion_length": 1637.1228485107422, "epoch": 0.4026585019789411, "grad_norm": 0.11725102365016937, "kl": 0.10687255859375, "learning_rate": 7.711905220291207e-07, "loss": 0.0115, "reward": 0.033482144586741924, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.0, "step": 337 }, { "completion_length": 1674.4197006225586, "epoch": 0.403853334329027, "grad_norm": 0.18356521427631378, "kl": 0.12188720703125, "learning_rate": 7.695514335120422e-07, "loss": 0.0081, "reward": 0.07812500395812094, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.07812500395812094, "rewards/format_reward": 0.0, "step": 338 }, { "completion_length": 1685.1831130981445, "epoch": 0.40504816667911286, "grad_norm": 0.10411418974399567, "kl": 0.11572265625, "learning_rate": 7.679085132225215e-07, "loss": 0.0151, "reward": 0.0825892889406532, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.0825892889406532, "rewards/format_reward": 0.0, "step": 339 }, { "completion_length": 1707.9219512939453, "epoch": 0.4062429990291987, "grad_norm": 0.07847895473241806, "kl": 0.108551025390625, "learning_rate": 7.662617898340077e-07, "loss": 0.0245, "reward": 0.10491071850992739, "reward_std": 0.04533348651602864, "rewards/accuracy_reward": 0.10491071850992739, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 1722.4286499023438, "epoch": 0.4074378313792846, "grad_norm": 0.07311594486236572, "kl": 0.11444091796875, "learning_rate": 7.646112920863232e-07, "loss": 0.0221, "reward": 0.11607143469154835, "reward_std": 0.047405367717146873, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "step": 341 }, { "completion_length": 1751.3728408813477, "epoch": 0.40863266372937046, "grad_norm": 0.0657731145620346, "kl": 0.1024169921875, "learning_rate": 7.629570487851635e-07, "loss": 0.0164, "reward": 0.08035714738070965, "reward_std": 0.04225045535713434, "rewards/accuracy_reward": 0.08035714738070965, "rewards/format_reward": 0.0, "step": 342 }, { "completion_length": 1728.323745727539, "epoch": 0.40982749607945634, "grad_norm": 0.07158273458480835, "kl": 0.0986328125, "learning_rate": 7.612990888015948e-07, "loss": 0.0176, "reward": 0.06250000325962901, "reward_std": 0.030929479748010635, "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.0, "step": 343 }, { "completion_length": 1747.736686706543, "epoch": 0.4110223284295422, "grad_norm": 0.08995690941810608, "kl": 0.096771240234375, "learning_rate": 7.596374410715492e-07, "loss": 0.0151, "reward": 0.0625000037252903, "reward_std": 0.04602411389350891, "rewards/accuracy_reward": 0.0625000037252903, "rewards/format_reward": 0.0, "step": 344 }, { "completion_length": 1703.9621200561523, "epoch": 0.4122171607796281, "grad_norm": 0.15626010298728943, "kl": 0.087738037109375, "learning_rate": 7.579721345953199e-07, "loss": 0.0033, "reward": 0.11160714831203222, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.11160714831203222, "rewards/format_reward": 0.0, "step": 345 }, { "completion_length": 1778.4442749023438, "epoch": 0.413411993129714, "grad_norm": 0.1309770941734314, "kl": 0.0924835205078125, "learning_rate": 7.563031984370553e-07, "loss": 0.0036, "reward": 0.07589286030270159, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.07589286030270159, "rewards/format_reward": 0.0, "step": 346 }, { "completion_length": 1779.8728485107422, "epoch": 0.4146068254797999, "grad_norm": 0.06376446038484573, "kl": 0.080841064453125, "learning_rate": 7.54630661724252e-07, "loss": 0.01, "reward": 0.05357143119908869, "reward_std": 0.03194062830880284, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, "step": 347 }, { "completion_length": 1786.877311706543, "epoch": 0.41580165782988576, "grad_norm": 0.14151634275913239, "kl": 0.077484130859375, "learning_rate": 7.529545536472459e-07, "loss": 0.0175, "reward": 0.053571431897580624, "reward_std": 0.06010759808123112, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, "step": 348 }, { "completion_length": 1863.6808776855469, "epoch": 0.41699649017997165, "grad_norm": 0.04642028361558914, "kl": 0.0702667236328125, "learning_rate": 7.512749034587027e-07, "loss": 0.0101, "reward": 0.049107144586741924, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.049107144586741924, "rewards/format_reward": 0.0, "step": 349 }, { "completion_length": 1796.1541137695312, "epoch": 0.4181913225300575, "grad_norm": 0.0546141192317009, "kl": 0.067535400390625, "learning_rate": 7.495917404731078e-07, "loss": 0.0122, "reward": 0.03125000139698386, "reward_std": 0.03332188352942467, "rewards/accuracy_reward": 0.03125000139698386, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 1787.0157012939453, "epoch": 0.41938615488014336, "grad_norm": 0.10838630050420761, "kl": 0.061798095703125, "learning_rate": 7.479050940662548e-07, "loss": 0.0185, "reward": 0.05357143096625805, "reward_std": 0.042250454891473055, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "step": 351 }, { "completion_length": 1791.301414489746, "epoch": 0.42058098723022924, "grad_norm": 0.11792529374361038, "kl": 0.0639495849609375, "learning_rate": 7.462149936747322e-07, "loss": 0.0272, "reward": 0.07812500419095159, "reward_std": 0.06457188446074724, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, "step": 352 }, { "completion_length": 1856.8907012939453, "epoch": 0.4217758195803151, "grad_norm": 0.06336203962564468, "kl": 0.06085205078125, "learning_rate": 7.4452146879541e-07, "loss": 0.0139, "reward": 0.08258929010480642, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, "step": 353 }, { "completion_length": 1827.040267944336, "epoch": 0.422970651930401, "grad_norm": 0.061344586312770844, "kl": 0.066650390625, "learning_rate": 7.428245489849249e-07, "loss": 0.0102, "reward": 0.05803571757860482, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "step": 354 }, { "completion_length": 1852.5425033569336, "epoch": 0.4241654842804869, "grad_norm": 0.04946181923151016, "kl": 0.0687255859375, "learning_rate": 7.411242638591648e-07, "loss": 0.0112, "reward": 0.07142857601866126, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.07142857601866126, "rewards/format_reward": 0.0, "step": 355 }, { "completion_length": 1834.8951797485352, "epoch": 0.4253603166305728, "grad_norm": 0.05471396818757057, "kl": 0.0636138916015625, "learning_rate": 7.394206430927507e-07, "loss": 0.0153, "reward": 0.06473214575089514, "reward_std": 0.03401251044124365, "rewards/accuracy_reward": 0.06473214575089514, "rewards/format_reward": 0.0, "step": 356 }, { "completion_length": 1852.5201721191406, "epoch": 0.42655514898065866, "grad_norm": 0.08148486167192459, "kl": 0.0594940185546875, "learning_rate": 7.377137164185204e-07, "loss": 0.0152, "reward": 0.08482143329456449, "reward_std": 0.05117902671918273, "rewards/accuracy_reward": 0.08482143329456449, "rewards/format_reward": 0.0, "step": 357 }, { "completion_length": 1814.6139373779297, "epoch": 0.42774998133074454, "grad_norm": 0.08427347242832184, "kl": 0.05621337890625, "learning_rate": 7.360035136270085e-07, "loss": 0.0115, "reward": 0.055803573690354824, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.055803573690354824, "rewards/format_reward": 0.0, "step": 358 }, { "completion_length": 1804.72998046875, "epoch": 0.42894481368083043, "grad_norm": 0.04562581330537796, "kl": 0.0593719482421875, "learning_rate": 7.342900645659269e-07, "loss": 0.0119, "reward": 0.07142857532016933, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.07142857532016933, "rewards/format_reward": 0.0, "step": 359 }, { "completion_length": 1801.7143630981445, "epoch": 0.4301396460309163, "grad_norm": 0.10245370119810104, "kl": 0.062652587890625, "learning_rate": 7.325733991396437e-07, "loss": 0.0028, "reward": 0.06250000279396772, "reward_std": 0.010309826582670212, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 1794.5134735107422, "epoch": 0.43133447838100214, "grad_norm": 0.05346539616584778, "kl": 0.067138671875, "learning_rate": 7.308535473086614e-07, "loss": 0.0078, "reward": 0.03571428661234677, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.03571428661234677, "rewards/format_reward": 0.0, "step": 361 }, { "completion_length": 1799.4889221191406, "epoch": 0.432529310731088, "grad_norm": 0.0623571015894413, "kl": 0.06402587890625, "learning_rate": 7.291305390890939e-07, "loss": 0.0121, "reward": 0.05357143096625805, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "step": 362 }, { "completion_length": 1786.4532165527344, "epoch": 0.4337241430811739, "grad_norm": 0.048688456416130066, "kl": 0.056396484375, "learning_rate": 7.274044045521429e-07, "loss": 0.0117, "reward": 0.1272321487776935, "reward_std": 0.027476342394948006, "rewards/accuracy_reward": 0.1272321487776935, "rewards/format_reward": 0.0, "step": 363 }, { "completion_length": 1799.3125686645508, "epoch": 0.4349189754312598, "grad_norm": 0.060017120093107224, "kl": 0.0555877685546875, "learning_rate": 7.256751738235726e-07, "loss": 0.0121, "reward": 0.035714288242161274, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.035714288242161274, "rewards/format_reward": 0.0, "step": 364 }, { "completion_length": 1802.0112380981445, "epoch": 0.4361138077813457, "grad_norm": 0.07884395867586136, "kl": 0.06060791015625, "learning_rate": 7.239428770831845e-07, "loss": 0.018, "reward": 0.08035714761354029, "reward_std": 0.042250454891473055, "rewards/accuracy_reward": 0.08035714761354029, "rewards/format_reward": 0.0, "step": 365 }, { "completion_length": 1838.752311706543, "epoch": 0.43730864013143156, "grad_norm": 0.051215760409832, "kl": 0.056732177734375, "learning_rate": 7.222075445642904e-07, "loss": 0.0112, "reward": 0.02901785890571773, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.02901785890571773, "rewards/format_reward": 0.0, "step": 366 }, { "completion_length": 1825.8572311401367, "epoch": 0.43850347248151744, "grad_norm": 0.07488728314638138, "kl": 0.0590667724609375, "learning_rate": 7.204692065531843e-07, "loss": 0.0057, "reward": 0.044642859138548374, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.044642859138548374, "rewards/format_reward": 0.0, "step": 367 }, { "completion_length": 1819.9509887695312, "epoch": 0.4396983048316033, "grad_norm": 0.04479961842298508, "kl": 0.0578155517578125, "learning_rate": 7.187278933886145e-07, "loss": 0.0137, "reward": 0.06026785960420966, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.06026785960420966, "rewards/format_reward": 0.0, "step": 368 }, { "completion_length": 1798.274642944336, "epoch": 0.4408931371816892, "grad_norm": 0.1197081059217453, "kl": 0.0579986572265625, "learning_rate": 7.169836354612539e-07, "loss": 0.0204, "reward": 0.08035714621655643, "reward_std": 0.042250454891473055, "rewards/accuracy_reward": 0.08035714621655643, "rewards/format_reward": 0.0, "step": 369 }, { "completion_length": 1813.7277603149414, "epoch": 0.4420879695317751, "grad_norm": 0.04676646739244461, "kl": 0.0638580322265625, "learning_rate": 7.152364632131699e-07, "loss": 0.009, "reward": 0.07142857438884676, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.07142857438884676, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 1746.6072082519531, "epoch": 0.443282801881861, "grad_norm": 0.07443659007549286, "kl": 0.06011962890625, "learning_rate": 7.134864071372919e-07, "loss": 0.0167, "reward": 0.07589286076836288, "reward_std": 0.03332188259810209, "rewards/accuracy_reward": 0.07589286076836288, "rewards/format_reward": 0.0, "step": 371 }, { "completion_length": 1780.8639221191406, "epoch": 0.4444776342319468, "grad_norm": 0.06244147941470146, "kl": 0.0662384033203125, "learning_rate": 7.117334977768806e-07, "loss": 0.0094, "reward": 0.10491071944124997, "reward_std": 0.04533348651602864, "rewards/accuracy_reward": 0.10491071944124997, "rewards/format_reward": 0.0, "step": 372 }, { "completion_length": 1805.3349227905273, "epoch": 0.4456724665820327, "grad_norm": 0.04088316857814789, "kl": 0.06201171875, "learning_rate": 7.099777657249947e-07, "loss": 0.0075, "reward": 0.04017857322469354, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.04017857322469354, "rewards/format_reward": 0.0, "step": 373 }, { "completion_length": 1757.613914489746, "epoch": 0.4468672989321186, "grad_norm": 0.08762752264738083, "kl": 0.0582733154296875, "learning_rate": 7.082192416239553e-07, "loss": 0.0152, "reward": 0.0558035746216774, "reward_std": 0.027476342394948006, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "step": 374 }, { "completion_length": 1741.2121200561523, "epoch": 0.44806213128220446, "grad_norm": 0.06065506860613823, "kl": 0.0612640380859375, "learning_rate": 7.064579561648135e-07, "loss": 0.0093, "reward": 0.04910714505240321, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, "step": 375 }, { "completion_length": 1742.176414489746, "epoch": 0.44925696363229034, "grad_norm": 0.08876502513885498, "kl": 0.0591278076171875, "learning_rate": 7.046939400868135e-07, "loss": 0.0222, "reward": 0.064732147147879, "reward_std": 0.04294108273461461, "rewards/accuracy_reward": 0.064732147147879, "rewards/format_reward": 0.0, "step": 376 }, { "completion_length": 1734.533576965332, "epoch": 0.4504517959823762, "grad_norm": 0.04903697967529297, "kl": 0.066162109375, "learning_rate": 7.029272241768557e-07, "loss": 0.0077, "reward": 0.07366071711294353, "reward_std": 0.023702684324234724, "rewards/accuracy_reward": 0.07366071711294353, "rewards/format_reward": 0.0, "step": 377 }, { "completion_length": 1763.5603256225586, "epoch": 0.4516466283324621, "grad_norm": 0.04450186342000961, "kl": 0.0629119873046875, "learning_rate": 7.011578392689605e-07, "loss": 0.0131, "reward": 0.04687500209547579, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, "step": 378 }, { "completion_length": 1697.5603332519531, "epoch": 0.452841460682548, "grad_norm": 0.06202075630426407, "kl": 0.0622711181640625, "learning_rate": 6.993858162437293e-07, "loss": 0.0183, "reward": 0.12946429126895964, "reward_std": 0.056333940010517836, "rewards/accuracy_reward": 0.12946429126895964, "rewards/format_reward": 0.0, "step": 379 }, { "completion_length": 1702.8996353149414, "epoch": 0.4540362930326339, "grad_norm": 0.06156419217586517, "kl": 0.06640625, "learning_rate": 6.97611186027806e-07, "loss": 0.0077, "reward": 0.06250000186264515, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 1717.2634658813477, "epoch": 0.45523112538271976, "grad_norm": 0.0606830008327961, "kl": 0.0583038330078125, "learning_rate": 6.958339795933372e-07, "loss": 0.0064, "reward": 0.037946430733427405, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.037946430733427405, "rewards/format_reward": 0.0, "step": 381 }, { "completion_length": 1796.3705978393555, "epoch": 0.45642595773280564, "grad_norm": 0.05976226553320885, "kl": 0.0599517822265625, "learning_rate": 6.940542279574314e-07, "loss": 0.0126, "reward": 0.09375000442378223, "reward_std": 0.05117902718484402, "rewards/accuracy_reward": 0.09375000442378223, "rewards/format_reward": 0.0, "step": 382 }, { "completion_length": 1699.9889221191406, "epoch": 0.45762079008289147, "grad_norm": 0.12721261382102966, "kl": 0.0631561279296875, "learning_rate": 6.922719621816176e-07, "loss": 0.0229, "reward": 0.10491071874275804, "reward_std": 0.08896519569680095, "rewards/accuracy_reward": 0.10491071874275804, "rewards/format_reward": 0.0, "step": 383 }, { "completion_length": 1693.3081130981445, "epoch": 0.45881562243297735, "grad_norm": 0.10395307838916779, "kl": 0.06005859375, "learning_rate": 6.904872133713039e-07, "loss": 0.0041, "reward": 0.0669642889406532, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.0669642889406532, "rewards/format_reward": 0.0, "step": 384 }, { "completion_length": 1752.439811706543, "epoch": 0.46001045478306324, "grad_norm": 0.14242814481258392, "kl": 0.0664215087890625, "learning_rate": 6.887000126752338e-07, "loss": 0.0241, "reward": 0.09821429033763707, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.09821429033763707, "rewards/format_reward": 0.0, "step": 385 }, { "completion_length": 1792.3282089233398, "epoch": 0.4612052871331491, "grad_norm": 0.05522778257727623, "kl": 0.0621490478515625, "learning_rate": 6.869103912849429e-07, "loss": 0.0068, "reward": 0.05133928777649999, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, "step": 386 }, { "completion_length": 1756.7567749023438, "epoch": 0.462400119483235, "grad_norm": 0.05421626940369606, "kl": 0.0592193603515625, "learning_rate": 6.851183804342147e-07, "loss": 0.0097, "reward": 0.06250000209547579, "reward_std": 0.03847679682075977, "rewards/accuracy_reward": 0.06250000209547579, "rewards/format_reward": 0.0, "step": 387 }, { "completion_length": 1729.9978485107422, "epoch": 0.4635949518333209, "grad_norm": 0.06419473141431808, "kl": 0.059967041015625, "learning_rate": 6.833240113985353e-07, "loss": 0.0115, "reward": 0.0691964328289032, "reward_std": 0.03778616897761822, "rewards/accuracy_reward": 0.0691964328289032, "rewards/format_reward": 0.0, "step": 388 }, { "completion_length": 1767.8527526855469, "epoch": 0.4647897841834068, "grad_norm": 0.05154287442564964, "kl": 0.0947113037109375, "learning_rate": 6.815273154945474e-07, "loss": 0.016, "reward": 0.07142857369035482, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, "step": 389 }, { "completion_length": 1820.3840026855469, "epoch": 0.46598461653349266, "grad_norm": 0.03844178467988968, "kl": 0.0636749267578125, "learning_rate": 6.797283240795042e-07, "loss": 0.0131, "reward": 0.05357143119908869, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 1809.4777603149414, "epoch": 0.46717944888357854, "grad_norm": 0.08682113885879517, "kl": 0.070098876953125, "learning_rate": 6.779270685507215e-07, "loss": 0.025, "reward": 0.04017857275903225, "reward_std": 0.044642859138548374, "rewards/accuracy_reward": 0.04017857275903225, "rewards/format_reward": 0.0, "step": 391 }, { "completion_length": 1757.636245727539, "epoch": 0.4683742812336644, "grad_norm": 0.06275549530982971, "kl": 0.0738677978515625, "learning_rate": 6.761235803450303e-07, "loss": 0.0148, "reward": 0.10937500442378223, "reward_std": 0.04671474080532789, "rewards/accuracy_reward": 0.10937500442378223, "rewards/format_reward": 0.0, "step": 392 }, { "completion_length": 1765.3527603149414, "epoch": 0.46956911358375025, "grad_norm": 0.05870820954442024, "kl": 0.06982421875, "learning_rate": 6.743178909382276e-07, "loss": 0.0104, "reward": 0.07142857508733869, "reward_std": 0.04086920106783509, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, "step": 393 }, { "completion_length": 1786.6742095947266, "epoch": 0.47076394593383614, "grad_norm": 0.052153825759887695, "kl": 0.070281982421875, "learning_rate": 6.72510031844528e-07, "loss": 0.0089, "reward": 0.07589286053553224, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "step": 394 }, { "completion_length": 1797.6362533569336, "epoch": 0.471958778283922, "grad_norm": 0.12408849596977234, "kl": 0.0732421875, "learning_rate": 6.707000346160126e-07, "loss": 0.0083, "reward": 0.053571431431919336, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, "step": 395 }, { "completion_length": 1750.517936706543, "epoch": 0.4731536106340079, "grad_norm": 0.06921978294849396, "kl": 0.0763702392578125, "learning_rate": 6.688879308420788e-07, "loss": 0.0117, "reward": 0.08258928940631449, "reward_std": 0.02885759761556983, "rewards/accuracy_reward": 0.08258928940631449, "rewards/format_reward": 0.0, "step": 396 }, { "completion_length": 1735.348289489746, "epoch": 0.4743484429840938, "grad_norm": 0.07390104234218597, "kl": 0.065704345703125, "learning_rate": 6.670737521488888e-07, "loss": 0.0193, "reward": 0.10714286123402417, "reward_std": 0.0384767958894372, "rewards/accuracy_reward": 0.10714286123402417, "rewards/format_reward": 0.0, "step": 397 }, { "completion_length": 1753.2210693359375, "epoch": 0.47554327533417967, "grad_norm": 0.09489661455154419, "kl": 0.0729217529296875, "learning_rate": 6.652575301988184e-07, "loss": 0.0051, "reward": 0.09375000395812094, "reward_std": 0.008928571827709675, "rewards/accuracy_reward": 0.09375000395812094, "rewards/format_reward": 0.0, "step": 398 }, { "completion_length": 1791.4398040771484, "epoch": 0.47673810768426556, "grad_norm": 0.07931240648031235, "kl": 0.0683441162109375, "learning_rate": 6.634392966899036e-07, "loss": 0.005, "reward": 0.0669642889406532, "reward_std": 0.023012056481093168, "rewards/accuracy_reward": 0.0669642889406532, "rewards/format_reward": 0.0, "step": 399 }, { "completion_length": 1836.9800033569336, "epoch": 0.47793294003435144, "grad_norm": 0.08115304261445999, "kl": 0.0619659423828125, "learning_rate": 6.616190833552869e-07, "loss": 0.018, "reward": 0.0758928598370403, "reward_std": 0.0549526852555573, "rewards/accuracy_reward": 0.0758928598370403, "rewards/format_reward": 0.0, "step": 400 }, { "completion_length": 1854.9174880981445, "epoch": 0.4791277723844373, "grad_norm": 0.05723116174340248, "kl": 0.0621185302734375, "learning_rate": 6.597969219626653e-07, "loss": 0.0118, "reward": 0.07142857438884676, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.07142857438884676, "rewards/format_reward": 0.0, "step": 401 }, { "completion_length": 1877.2768630981445, "epoch": 0.4803226047345232, "grad_norm": 0.08552679419517517, "kl": 0.0567474365234375, "learning_rate": 6.579728443137341e-07, "loss": 0.0163, "reward": 0.09375000488944352, "reward_std": 0.042250454891473055, "rewards/accuracy_reward": 0.09375000488944352, "rewards/format_reward": 0.0, "step": 402 }, { "completion_length": 1841.8974151611328, "epoch": 0.4815174370846091, "grad_norm": 0.06886173039674759, "kl": 0.067169189453125, "learning_rate": 6.561468822436323e-07, "loss": 0.0206, "reward": 0.07812500279396772, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.0, "step": 403 }, { "completion_length": 1874.2255249023438, "epoch": 0.4827122694346949, "grad_norm": 0.04968215897679329, "kl": 0.0591278076171875, "learning_rate": 6.543190676203877e-07, "loss": 0.008, "reward": 0.042410715483129025, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.042410715483129025, "rewards/format_reward": 0.0, "step": 404 }, { "completion_length": 1855.2969589233398, "epoch": 0.4839071017847808, "grad_norm": 0.1234685406088829, "kl": 0.0589447021484375, "learning_rate": 6.5248943234436e-07, "loss": 0.0186, "reward": 0.11160714784637094, "reward_std": 0.04602411389350891, "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, "step": 405 }, { "completion_length": 1858.9666061401367, "epoch": 0.4851019341348667, "grad_norm": 0.08718442916870117, "kl": 0.0637969970703125, "learning_rate": 6.506580083476842e-07, "loss": 0.0162, "reward": 0.095982147147879, "reward_std": 0.05702456785365939, "rewards/accuracy_reward": 0.095982147147879, "rewards/format_reward": 0.0, "step": 406 }, { "completion_length": 1803.4844665527344, "epoch": 0.48629676648495257, "grad_norm": 0.05425015836954117, "kl": 0.057647705078125, "learning_rate": 6.488248275937134e-07, "loss": 0.0096, "reward": 0.05357143050059676, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.05357143050059676, "rewards/format_reward": 0.0, "step": 407 }, { "completion_length": 1802.0335540771484, "epoch": 0.48749159883503845, "grad_norm": 0.041156619787216187, "kl": 0.065643310546875, "learning_rate": 6.469899220764611e-07, "loss": 0.0081, "reward": 0.06473214598372579, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.06473214598372579, "rewards/format_reward": 0.0, "step": 408 }, { "completion_length": 1791.9197235107422, "epoch": 0.48868643118512434, "grad_norm": 0.054738547652959824, "kl": 0.0669403076171875, "learning_rate": 6.451533238200423e-07, "loss": 0.0181, "reward": 0.07589286100119352, "reward_std": 0.052560281939804554, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "step": 409 }, { "completion_length": 1795.533561706543, "epoch": 0.4898812635352102, "grad_norm": 0.05028402432799339, "kl": 0.069732666015625, "learning_rate": 6.433150648781154e-07, "loss": 0.0085, "reward": 0.05357143096625805, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "step": 410 }, { "completion_length": 1823.2054443359375, "epoch": 0.4910760958852961, "grad_norm": 0.10148292779922485, "kl": 0.0673828125, "learning_rate": 6.414751773333215e-07, "loss": 0.0155, "reward": 0.10044643259607255, "reward_std": 0.04533348651602864, "rewards/accuracy_reward": 0.10044643259607255, "rewards/format_reward": 0.0, "step": 411 }, { "completion_length": 1828.1005401611328, "epoch": 0.492270928235382, "grad_norm": 0.08333028107881546, "kl": 0.070770263671875, "learning_rate": 6.396336932967261e-07, "loss": 0.0116, "reward": 0.1183035762514919, "reward_std": 0.05564331356436014, "rewards/accuracy_reward": 0.1183035762514919, "rewards/format_reward": 0.0, "step": 412 }, { "completion_length": 1827.1228561401367, "epoch": 0.49346576058546787, "grad_norm": 0.11227072775363922, "kl": 0.06951904296875, "learning_rate": 6.377906449072577e-07, "loss": 0.0179, "reward": 0.0714285746216774, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "step": 413 }, { "completion_length": 1791.2991943359375, "epoch": 0.49466059293555376, "grad_norm": 0.07309604436159134, "kl": 0.0726776123046875, "learning_rate": 6.359460643311466e-07, "loss": 0.0128, "reward": 0.058035717345774174, "reward_std": 0.04464285960420966, "rewards/accuracy_reward": 0.058035717345774174, "rewards/format_reward": 0.0, "step": 414 }, { "completion_length": 1774.9978485107422, "epoch": 0.4958554252856396, "grad_norm": 0.06612934917211533, "kl": 0.0726470947265625, "learning_rate": 6.34099983761364e-07, "loss": 0.0174, "reward": 0.07812500395812094, "reward_std": 0.03401251044124365, "rewards/accuracy_reward": 0.07812500395812094, "rewards/format_reward": 0.0, "step": 415 }, { "completion_length": 1808.1719512939453, "epoch": 0.49705025763572547, "grad_norm": 0.06789853423833847, "kl": 0.0803070068359375, "learning_rate": 6.322524354170606e-07, "loss": 0.0206, "reward": 0.051339288242161274, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.051339288242161274, "rewards/format_reward": 0.0, "step": 416 }, { "completion_length": 1758.0782089233398, "epoch": 0.49824508998581135, "grad_norm": 0.12447439134120941, "kl": 0.071563720703125, "learning_rate": 6.304034515430036e-07, "loss": 0.0236, "reward": 0.08035714621655643, "reward_std": 0.05117902718484402, "rewards/accuracy_reward": 0.08035714621655643, "rewards/format_reward": 0.0, "step": 417 }, { "completion_length": 1718.74560546875, "epoch": 0.49943992233589724, "grad_norm": 0.06526026874780655, "kl": 0.0781402587890625, "learning_rate": 6.285530644090135e-07, "loss": 0.0126, "reward": 0.06696428917348385, "reward_std": 0.04979777242988348, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "step": 418 }, { "completion_length": 1742.962142944336, "epoch": 0.5006347546859832, "grad_norm": 0.09488677233457565, "kl": 0.0962066650390625, "learning_rate": 6.267013063094021e-07, "loss": 0.0179, "reward": 0.060267859837040305, "reward_std": 0.04432233748957515, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, "step": 419 }, { "completion_length": 1689.5625839233398, "epoch": 0.501829587036069, "grad_norm": 0.09764929115772247, "kl": 0.084869384765625, "learning_rate": 6.248482095624086e-07, "loss": 0.0112, "reward": 0.06026786006987095, "reward_std": 0.023702684324234724, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "step": 420 }, { "completion_length": 1744.801414489746, "epoch": 0.5030244193861548, "grad_norm": 0.07685290277004242, "kl": 0.09881591796875, "learning_rate": 6.229938065096343e-07, "loss": 0.0155, "reward": 0.0825892889406532, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.0825892889406532, "rewards/format_reward": 0.0, "step": 421 }, { "completion_length": 1701.7634735107422, "epoch": 0.5042192517362407, "grad_norm": 0.07576372474431992, "kl": 0.0885162353515625, "learning_rate": 6.2113812951548e-07, "loss": 0.0141, "reward": 0.05133928800933063, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, "step": 422 }, { "completion_length": 1724.4040908813477, "epoch": 0.5054140840863266, "grad_norm": 0.060388918966054916, "kl": 0.092498779296875, "learning_rate": 6.192812109665801e-07, "loss": 0.0075, "reward": 0.07812500279396772, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.0, "step": 423 }, { "completion_length": 1713.080421447754, "epoch": 0.5066089164364125, "grad_norm": 0.1478889137506485, "kl": 0.115386962890625, "learning_rate": 6.174230832712366e-07, "loss": 0.0088, "reward": 0.07589285960420966, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.07589285960420966, "rewards/format_reward": 0.0, "step": 424 }, { "completion_length": 1679.0335540771484, "epoch": 0.5078037487864984, "grad_norm": 0.19699537754058838, "kl": 0.099365234375, "learning_rate": 6.155637788588559e-07, "loss": 0.0271, "reward": 0.08705357532016933, "reward_std": 0.06079822639003396, "rewards/accuracy_reward": 0.08705357532016933, "rewards/format_reward": 0.0, "step": 425 }, { "completion_length": 1683.1786422729492, "epoch": 0.5089985811365842, "grad_norm": 0.13458943367004395, "kl": 0.1060028076171875, "learning_rate": 6.137033301793801e-07, "loss": 0.0067, "reward": 0.08705357485450804, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.08705357485450804, "rewards/format_reward": 0.0, "step": 426 }, { "completion_length": 1778.0223922729492, "epoch": 0.5101934134866701, "grad_norm": 0.10207806527614594, "kl": 0.11865234375, "learning_rate": 6.118417697027227e-07, "loss": 0.0108, "reward": 0.05133928777649999, "reward_std": 0.023702684324234724, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, "step": 427 }, { "completion_length": 1708.0089950561523, "epoch": 0.511388245836756, "grad_norm": 0.07417236268520355, "kl": 0.11798095703125, "learning_rate": 6.099791299182005e-07, "loss": 0.0265, "reward": 0.11607143352739513, "reward_std": 0.06287010945379734, "rewards/accuracy_reward": 0.11607143352739513, "rewards/format_reward": 0.0, "step": 428 }, { "completion_length": 1749.8081283569336, "epoch": 0.5125830781868419, "grad_norm": 0.10500745475292206, "kl": 0.1107177734375, "learning_rate": 6.081154433339675e-07, "loss": 0.0136, "reward": 0.0825892873108387, "reward_std": 0.06319063063710928, "rewards/accuracy_reward": 0.0825892873108387, "rewards/format_reward": 0.0, "step": 429 }, { "completion_length": 1810.1764297485352, "epoch": 0.5137779105369278, "grad_norm": 0.08082050085067749, "kl": 0.126708984375, "learning_rate": 6.06250742476447e-07, "loss": 0.0129, "reward": 0.0736607180442661, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.0736607180442661, "rewards/format_reward": 0.0, "step": 430 }, { "completion_length": 1827.776870727539, "epoch": 0.5149727428870137, "grad_norm": 0.13827075064182281, "kl": 0.13555908203125, "learning_rate": 6.043850598897647e-07, "loss": 0.0161, "reward": 0.06473214621655643, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.06473214621655643, "rewards/format_reward": 0.0, "step": 431 }, { "completion_length": 1852.2947387695312, "epoch": 0.5161675752370996, "grad_norm": 0.12165528535842896, "kl": 0.118621826171875, "learning_rate": 6.025184281351792e-07, "loss": 0.0078, "reward": 0.06696429033763707, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.06696429033763707, "rewards/format_reward": 0.0, "step": 432 }, { "completion_length": 1803.339370727539, "epoch": 0.5173624075871854, "grad_norm": 0.07116678357124329, "kl": 0.110198974609375, "learning_rate": 6.006508797905157e-07, "loss": 0.0107, "reward": 0.10267857508733869, "reward_std": 0.042250454891473055, "rewards/accuracy_reward": 0.10267857508733869, "rewards/format_reward": 0.0, "step": 433 }, { "completion_length": 1879.4822235107422, "epoch": 0.5185572399372713, "grad_norm": 0.05898585543036461, "kl": 0.105804443359375, "learning_rate": 5.987824474495955e-07, "loss": 0.0119, "reward": 0.08258928963914514, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "step": 434 }, { "completion_length": 1849.1741943359375, "epoch": 0.5197520722873572, "grad_norm": 0.08267360925674438, "kl": 0.0939178466796875, "learning_rate": 5.969131637216687e-07, "loss": 0.0151, "reward": 0.06026786030270159, "reward_std": 0.04533348651602864, "rewards/accuracy_reward": 0.06026786030270159, "rewards/format_reward": 0.0, "step": 435 }, { "completion_length": 1899.8572311401367, "epoch": 0.5209469046374431, "grad_norm": 0.08089195191860199, "kl": 0.09442138671875, "learning_rate": 5.950430612308444e-07, "loss": 0.0099, "reward": 0.049107144586741924, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.049107144586741924, "rewards/format_reward": 0.0, "step": 436 }, { "completion_length": 1868.979995727539, "epoch": 0.522141736987529, "grad_norm": 0.0891386941075325, "kl": 0.0930328369140625, "learning_rate": 5.931721726155206e-07, "loss": 0.0091, "reward": 0.07142857508733869, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.07142857508733869, "rewards/format_reward": 0.0, "step": 437 }, { "completion_length": 1916.4197387695312, "epoch": 0.5233365693376149, "grad_norm": 0.1272231489419937, "kl": 0.080169677734375, "learning_rate": 5.913005305278162e-07, "loss": 0.0048, "reward": 0.04017857345752418, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.04017857345752418, "rewards/format_reward": 0.0, "step": 438 }, { "completion_length": 1871.0715103149414, "epoch": 0.5245314016877007, "grad_norm": 0.05636937916278839, "kl": 0.0720367431640625, "learning_rate": 5.894281676329998e-07, "loss": 0.0081, "reward": 0.04464285890571773, "reward_std": 0.026785715948790312, "rewards/accuracy_reward": 0.04464285890571773, "rewards/format_reward": 0.0, "step": 439 }, { "completion_length": 1853.8170547485352, "epoch": 0.5257262340377866, "grad_norm": 0.11180628091096878, "kl": 0.0885009765625, "learning_rate": 5.875551166089205e-07, "loss": 0.0146, "reward": 0.0758928598370403, "reward_std": 0.042250454891473055, "rewards/accuracy_reward": 0.0758928598370403, "rewards/format_reward": 0.0, "step": 440 }, { "completion_length": 1905.9465103149414, "epoch": 0.5269210663878725, "grad_norm": 0.06430531293153763, "kl": 0.0628662109375, "learning_rate": 5.856814101454363e-07, "loss": 0.0113, "reward": 0.08035714644938707, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.08035714644938707, "rewards/format_reward": 0.0, "step": 441 }, { "completion_length": 1880.774642944336, "epoch": 0.5281158987379583, "grad_norm": 0.06069602072238922, "kl": 0.0552215576171875, "learning_rate": 5.838070809438452e-07, "loss": 0.0054, "reward": 0.05133928800933063, "reward_std": 0.023702684324234724, "rewards/accuracy_reward": 0.05133928800933063, "rewards/format_reward": 0.0, "step": 442 }, { "completion_length": 1868.5335693359375, "epoch": 0.5293107310880442, "grad_norm": 0.042140234261751175, "kl": 0.04996490478515625, "learning_rate": 5.819321617163135e-07, "loss": 0.0046, "reward": 0.08258928987197578, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.08258928987197578, "rewards/format_reward": 0.0, "step": 443 }, { "completion_length": 1863.2233276367188, "epoch": 0.53050556343813, "grad_norm": 0.12926386296749115, "kl": 0.0538787841796875, "learning_rate": 5.800566851853047e-07, "loss": 0.0167, "reward": 0.07589286076836288, "reward_std": 0.047405367717146873, "rewards/accuracy_reward": 0.07589286076836288, "rewards/format_reward": 0.0, "step": 444 }, { "completion_length": 1838.7813339233398, "epoch": 0.5317003957882159, "grad_norm": 0.12238122522830963, "kl": 0.0488128662109375, "learning_rate": 5.781806840830093e-07, "loss": 0.0167, "reward": 0.1004464344587177, "reward_std": 0.06079822592437267, "rewards/accuracy_reward": 0.1004464344587177, "rewards/format_reward": 0.0, "step": 445 }, { "completion_length": 1812.642936706543, "epoch": 0.5328952281383018, "grad_norm": 0.09392524510622025, "kl": 0.0553741455078125, "learning_rate": 5.763041911507723e-07, "loss": 0.0157, "reward": 0.08705357415601611, "reward_std": 0.04533348698168993, "rewards/accuracy_reward": 0.08705357415601611, "rewards/format_reward": 0.0, "step": 446 }, { "completion_length": 1810.1741943359375, "epoch": 0.5340900604883877, "grad_norm": 0.1386694759130478, "kl": 0.059783935546875, "learning_rate": 5.744272391385234e-07, "loss": 0.0149, "reward": 0.12946429126895964, "reward_std": 0.04363170964643359, "rewards/accuracy_reward": 0.12946429126895964, "rewards/format_reward": 0.0, "step": 447 }, { "completion_length": 1819.122817993164, "epoch": 0.5352848928384736, "grad_norm": 0.06648056954145432, "kl": 0.0631103515625, "learning_rate": 5.725498608042036e-07, "loss": 0.0098, "reward": 0.06026786006987095, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.06026786006987095, "rewards/format_reward": 0.0, "step": 448 }, { "completion_length": 1832.3192749023438, "epoch": 0.5364797251885595, "grad_norm": 0.10533496737480164, "kl": 0.0672454833984375, "learning_rate": 5.706720889131952e-07, "loss": 0.0155, "reward": 0.11607143329456449, "reward_std": 0.06526251137256622, "rewards/accuracy_reward": 0.11607143329456449, "rewards/format_reward": 0.0, "step": 449 }, { "completion_length": 1817.9442825317383, "epoch": 0.5376745575386453, "grad_norm": 0.06209107115864754, "kl": 0.074951171875, "learning_rate": 5.687939562377485e-07, "loss": 0.0118, "reward": 0.07366071757860482, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "step": 450 }, { "completion_length": 1841.3081130981445, "epoch": 0.5388693898887312, "grad_norm": 0.056556787341833115, "kl": 0.0746917724609375, "learning_rate": 5.669154955564108e-07, "loss": 0.0114, "reward": 0.06026785937137902, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.06026785937137902, "rewards/format_reward": 0.0, "step": 451 }, { "completion_length": 1755.6072235107422, "epoch": 0.5400642222388171, "grad_norm": 0.12634915113449097, "kl": 0.0769195556640625, "learning_rate": 5.650367396534536e-07, "loss": 0.0138, "reward": 0.06250000279396772, "reward_std": 0.04602411389350891, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "step": 452 }, { "completion_length": 1764.2969360351562, "epoch": 0.541259054588903, "grad_norm": 0.10136446356773376, "kl": 0.0791015625, "learning_rate": 5.631577213183014e-07, "loss": 0.0091, "reward": 0.0357142873108387, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "step": 453 }, { "completion_length": 1745.5871353149414, "epoch": 0.5424538869389889, "grad_norm": 0.06372546404600143, "kl": 0.0893096923828125, "learning_rate": 5.612784733449588e-07, "loss": 0.0161, "reward": 0.06250000256113708, "reward_std": 0.034703138284385204, "rewards/accuracy_reward": 0.06250000256113708, "rewards/format_reward": 0.0, "step": 454 }, { "completion_length": 1741.861671447754, "epoch": 0.5436487192890748, "grad_norm": 0.06530442833900452, "kl": 0.0816650390625, "learning_rate": 5.593990285314375e-07, "loss": 0.0131, "reward": 0.07812500419095159, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, "step": 455 }, { "completion_length": 1760.0067672729492, "epoch": 0.5448435516391607, "grad_norm": 0.06928327679634094, "kl": 0.0963134765625, "learning_rate": 5.575194196791854e-07, "loss": 0.0124, "reward": 0.08928571874275804, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "step": 456 }, { "completion_length": 1823.3527526855469, "epoch": 0.5460383839892465, "grad_norm": 0.12434660643339157, "kl": 0.085906982421875, "learning_rate": 5.556396795925132e-07, "loss": 0.0084, "reward": 0.09375000512227416, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.09375000512227416, "rewards/format_reward": 0.0, "step": 457 }, { "completion_length": 1764.0826797485352, "epoch": 0.5472332163393324, "grad_norm": 0.09554239362478256, "kl": 0.08038330078125, "learning_rate": 5.537598410780213e-07, "loss": 0.0052, "reward": 0.07142857485450804, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.07142857485450804, "rewards/format_reward": 0.0, "step": 458 }, { "completion_length": 1751.2366790771484, "epoch": 0.5484280486894183, "grad_norm": 0.0793769508600235, "kl": 0.0816650390625, "learning_rate": 5.51879936944029e-07, "loss": 0.0108, "reward": 0.08482143376022577, "reward_std": 0.06010759808123112, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, "step": 459 }, { "completion_length": 1846.118392944336, "epoch": 0.5496228810395042, "grad_norm": 0.0933011919260025, "kl": 0.0843048095703125, "learning_rate": 5.5e-07, "loss": 0.0076, "reward": 0.06919643166474998, "reward_std": 0.023702683858573437, "rewards/accuracy_reward": 0.06919643166474998, "rewards/format_reward": 0.0, "step": 460 }, { "completion_length": 1828.0201797485352, "epoch": 0.5508177133895901, "grad_norm": 0.08577533811330795, "kl": 0.0774383544921875, "learning_rate": 5.481200630559711e-07, "loss": 0.0079, "reward": 0.06919643213041127, "reward_std": 0.019929025787860155, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.0, "step": 461 }, { "completion_length": 1865.4197158813477, "epoch": 0.552012545739676, "grad_norm": 0.1377187818288803, "kl": 0.07861328125, "learning_rate": 5.462401589219787e-07, "loss": 0.0166, "reward": 0.07812500442378223, "reward_std": 0.037786169443279505, "rewards/accuracy_reward": 0.07812500442378223, "rewards/format_reward": 0.0, "step": 462 }, { "completion_length": 1874.9755401611328, "epoch": 0.5532073780897617, "grad_norm": 0.04966536536812782, "kl": 0.0663299560546875, "learning_rate": 5.443603204074868e-07, "loss": 0.0102, "reward": 0.06473214575089514, "reward_std": 0.01854777056723833, "rewards/accuracy_reward": 0.06473214575089514, "rewards/format_reward": 0.0, "step": 463 }, { "completion_length": 1864.7009811401367, "epoch": 0.5544022104398476, "grad_norm": 0.05995866656303406, "kl": 0.0783233642578125, "learning_rate": 5.424805803208145e-07, "loss": 0.0172, "reward": 0.08928571757860482, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.08928571757860482, "rewards/format_reward": 0.0, "step": 464 }, { "completion_length": 1824.910789489746, "epoch": 0.5555970427899335, "grad_norm": 0.08523385226726532, "kl": 0.071868896484375, "learning_rate": 5.406009714685625e-07, "loss": 0.0076, "reward": 0.07366071757860482, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.07366071757860482, "rewards/format_reward": 0.0, "step": 465 }, { "completion_length": 1861.7232971191406, "epoch": 0.5567918751400194, "grad_norm": 0.08287857472896576, "kl": 0.0638885498046875, "learning_rate": 5.387215266550414e-07, "loss": 0.004, "reward": 0.06696428870782256, "reward_std": 0.008928571827709675, "rewards/accuracy_reward": 0.06696428870782256, "rewards/format_reward": 0.0, "step": 466 }, { "completion_length": 1859.6942825317383, "epoch": 0.5579867074901053, "grad_norm": 0.04093291983008385, "kl": 0.059112548828125, "learning_rate": 5.368422786816986e-07, "loss": 0.0077, "reward": 0.08258928917348385, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "step": 467 }, { "completion_length": 1896.0224151611328, "epoch": 0.5591815398401911, "grad_norm": 0.05395809933543205, "kl": 0.055389404296875, "learning_rate": 5.349632603465466e-07, "loss": 0.0061, "reward": 0.04910714505240321, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, "step": 468 }, { "completion_length": 1862.2523193359375, "epoch": 0.560376372190277, "grad_norm": 0.060915809124708176, "kl": 0.0486297607421875, "learning_rate": 5.330845044435893e-07, "loss": 0.0043, "reward": 0.08928571897558868, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.08928571897558868, "rewards/format_reward": 0.0, "step": 469 }, { "completion_length": 1846.555892944336, "epoch": 0.5615712045403629, "grad_norm": 0.0804431140422821, "kl": 0.0442962646484375, "learning_rate": 5.312060437622515e-07, "loss": 0.0123, "reward": 0.07366071827709675, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "step": 470 }, { "completion_length": 1912.2188262939453, "epoch": 0.5627660368904488, "grad_norm": 0.08818567544221878, "kl": 0.04418182373046875, "learning_rate": 5.293279110868048e-07, "loss": 0.0148, "reward": 0.0937500037252903, "reward_std": 0.05117902671918273, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "step": 471 }, { "completion_length": 1924.426414489746, "epoch": 0.5639608692405347, "grad_norm": 0.041804175823926926, "kl": 0.04543304443359375, "learning_rate": 5.274501391957964e-07, "loss": 0.0046, "reward": 0.042410716181620955, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, "step": 472 }, { "completion_length": 1936.1965026855469, "epoch": 0.5651557015906206, "grad_norm": 0.07341870665550232, "kl": 0.04166412353515625, "learning_rate": 5.255727608614766e-07, "loss": 0.0068, "reward": 0.09821429080329835, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.09821429080329835, "rewards/format_reward": 0.0, "step": 473 }, { "completion_length": 1831.0558776855469, "epoch": 0.5663505339407064, "grad_norm": 0.15760813653469086, "kl": 0.04241943359375, "learning_rate": 5.236958088492278e-07, "loss": 0.0108, "reward": 0.10267857555299997, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.10267857555299997, "rewards/format_reward": 0.0, "step": 474 }, { "completion_length": 1889.2143630981445, "epoch": 0.5675453662907923, "grad_norm": 0.07539822161197662, "kl": 0.04290771484375, "learning_rate": 5.218193159169908e-07, "loss": 0.0136, "reward": 0.04017857345752418, "reward_std": 0.044642859138548374, "rewards/accuracy_reward": 0.04017857345752418, "rewards/format_reward": 0.0, "step": 475 }, { "completion_length": 1887.2768630981445, "epoch": 0.5687401986408782, "grad_norm": 0.052779633551836014, "kl": 0.04090118408203125, "learning_rate": 5.199433148146953e-07, "loss": 0.0052, "reward": 0.055803574388846755, "reward_std": 0.023702683858573437, "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, "step": 476 }, { "completion_length": 1809.6875839233398, "epoch": 0.5699350309909641, "grad_norm": 0.12113090604543686, "kl": 0.04265594482421875, "learning_rate": 5.180678382836864e-07, "loss": 0.017, "reward": 0.07589286123402417, "reward_std": 0.05117902671918273, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.0, "step": 477 }, { "completion_length": 1837.3884735107422, "epoch": 0.57112986334105, "grad_norm": 0.1282443404197693, "kl": 0.042633056640625, "learning_rate": 5.161929190561548e-07, "loss": 0.0176, "reward": 0.07142857485450804, "reward_std": 0.04086920013651252, "rewards/accuracy_reward": 0.07142857485450804, "rewards/format_reward": 0.0, "step": 478 }, { "completion_length": 1782.1764297485352, "epoch": 0.5723246956911359, "grad_norm": 0.07831906527280807, "kl": 0.0478057861328125, "learning_rate": 5.143185898545638e-07, "loss": 0.0075, "reward": 0.10044643352739513, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.10044643352739513, "rewards/format_reward": 0.0, "step": 479 }, { "completion_length": 1793.1563339233398, "epoch": 0.5735195280412217, "grad_norm": 0.08308440446853638, "kl": 0.0506439208984375, "learning_rate": 5.124448833910796e-07, "loss": 0.0136, "reward": 0.0915178619325161, "reward_std": 0.03778616851195693, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "step": 480 }, { "completion_length": 1767.7612228393555, "epoch": 0.5747143603913076, "grad_norm": 0.08632687479257584, "kl": 0.0569610595703125, "learning_rate": 5.10571832367e-07, "loss": 0.0111, "reward": 0.14062500628642738, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.14062500628642738, "rewards/format_reward": 0.0, "step": 481 }, { "completion_length": 1716.8728485107422, "epoch": 0.5759091927413935, "grad_norm": 0.3481000065803528, "kl": 0.0599365234375, "learning_rate": 5.086994694721838e-07, "loss": 0.038, "reward": 0.08482143236324191, "reward_std": 0.07557233795523643, "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.0, "step": 482 }, { "completion_length": 1722.7411422729492, "epoch": 0.5771040250914794, "grad_norm": 0.05826178193092346, "kl": 0.0570068359375, "learning_rate": 5.068278273844795e-07, "loss": 0.0076, "reward": 0.06919643189758062, "reward_std": 0.0491071455180645, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "step": 483 }, { "completion_length": 1640.2344436645508, "epoch": 0.5782988574415653, "grad_norm": 0.10991848260164261, "kl": 0.0720672607421875, "learning_rate": 5.049569387691557e-07, "loss": 0.0198, "reward": 0.06473214668221772, "reward_std": 0.050488400273025036, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, "step": 484 }, { "completion_length": 1669.8192901611328, "epoch": 0.5794936897916511, "grad_norm": 0.11281648278236389, "kl": 0.08087158203125, "learning_rate": 5.030868362783312e-07, "loss": 0.0181, "reward": 0.0736607180442661, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.0736607180442661, "rewards/format_reward": 0.0, "step": 485 }, { "completion_length": 1629.830421447754, "epoch": 0.5806885221417369, "grad_norm": 0.28225648403167725, "kl": 0.0879974365234375, "learning_rate": 5.012175525504045e-07, "loss": 0.0246, "reward": 0.10714286216534674, "reward_std": 0.086893314961344, "rewards/accuracy_reward": 0.10714286216534674, "rewards/format_reward": 0.0, "step": 486 }, { "completion_length": 1634.645164489746, "epoch": 0.5818833544918228, "grad_norm": 0.09167776256799698, "kl": 0.102569580078125, "learning_rate": 4.993491202094844e-07, "loss": 0.0183, "reward": 0.11160714668221772, "reward_std": 0.07041742699220777, "rewards/accuracy_reward": 0.11160714668221772, "rewards/format_reward": 0.0, "step": 487 }, { "completion_length": 1636.3281936645508, "epoch": 0.5830781868419087, "grad_norm": 0.09853469580411911, "kl": 0.109588623046875, "learning_rate": 4.974815718648206e-07, "loss": 0.0126, "reward": 0.13169643352739513, "reward_std": 0.06457188492640853, "rewards/accuracy_reward": 0.13169643352739513, "rewards/format_reward": 0.0, "step": 488 }, { "completion_length": 1618.8036499023438, "epoch": 0.5842730191919946, "grad_norm": 0.07792109996080399, "kl": 0.11541748046875, "learning_rate": 4.956149401102355e-07, "loss": 0.0136, "reward": 0.0870535762514919, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.0870535762514919, "rewards/format_reward": 0.0, "step": 489 }, { "completion_length": 1666.4465103149414, "epoch": 0.5854678515420805, "grad_norm": 0.12893584370613098, "kl": 0.127960205078125, "learning_rate": 4.93749257523553e-07, "loss": 0.0187, "reward": 0.10491071827709675, "reward_std": 0.05564331263303757, "rewards/accuracy_reward": 0.10491071827709675, "rewards/format_reward": 0.0, "step": 490 }, { "completion_length": 1625.180892944336, "epoch": 0.5866626838921664, "grad_norm": 0.1727665215730667, "kl": 0.14068603515625, "learning_rate": 4.918845566660326e-07, "loss": 0.0108, "reward": 0.0781250037252903, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "step": 491 }, { "completion_length": 1579.238899230957, "epoch": 0.5878575162422522, "grad_norm": 0.09637086093425751, "kl": 0.1319580078125, "learning_rate": 4.900208700817996e-07, "loss": 0.024, "reward": 0.07812500279396772, "reward_std": 0.0621794811449945, "rewards/accuracy_reward": 0.07812500279396772, "rewards/format_reward": 0.0, "step": 492 }, { "completion_length": 1686.7902374267578, "epoch": 0.5890523485923381, "grad_norm": 0.19729827344417572, "kl": 0.1439208984375, "learning_rate": 4.881582302972774e-07, "loss": 0.0165, "reward": 0.11160714877769351, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.11160714877769351, "rewards/format_reward": 0.0, "step": 493 }, { "completion_length": 1686.8683776855469, "epoch": 0.590247180942424, "grad_norm": 0.157417893409729, "kl": 0.15289306640625, "learning_rate": 4.862966698206198e-07, "loss": 0.0206, "reward": 0.12053571967408061, "reward_std": 0.04979777242988348, "rewards/accuracy_reward": 0.12053571967408061, "rewards/format_reward": 0.0, "step": 494 }, { "completion_length": 1673.0268478393555, "epoch": 0.5914420132925099, "grad_norm": 0.10255645215511322, "kl": 0.139984130859375, "learning_rate": 4.84436221141144e-07, "loss": 0.0194, "reward": 0.0781250037252903, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.0781250037252903, "rewards/format_reward": 0.0, "step": 495 }, { "completion_length": 1679.2924880981445, "epoch": 0.5926368456425958, "grad_norm": 0.1100635752081871, "kl": 0.155364990234375, "learning_rate": 4.825769167287633e-07, "loss": 0.0245, "reward": 0.060267860535532236, "reward_std": 0.04671474127098918, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "step": 496 }, { "completion_length": 1740.88623046875, "epoch": 0.5938316779926817, "grad_norm": 0.2121514230966568, "kl": 0.154449462890625, "learning_rate": 4.807187890334201e-07, "loss": 0.0137, "reward": 0.08258929080329835, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.08258929080329835, "rewards/format_reward": 0.0, "step": 497 }, { "completion_length": 1754.2612380981445, "epoch": 0.5950265103427675, "grad_norm": 0.24599485099315643, "kl": 0.1614990234375, "learning_rate": 4.788618704845199e-07, "loss": 0.0131, "reward": 0.0758928598370403, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.0758928598370403, "rewards/format_reward": 0.0, "step": 498 }, { "completion_length": 1768.8103561401367, "epoch": 0.5962213426928534, "grad_norm": 0.24913953244686127, "kl": 0.131500244140625, "learning_rate": 4.770061934903657e-07, "loss": 0.0077, "reward": 0.04687500209547579, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, "step": 499 }, { "completion_length": 1791.0000915527344, "epoch": 0.5974161750429393, "grad_norm": 0.1808418184518814, "kl": 0.123626708984375, "learning_rate": 4.7515179043759146e-07, "loss": 0.0095, "reward": 0.0714285746216774, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "step": 500 }, { "completion_length": 1776.2121276855469, "epoch": 0.5986110073930252, "grad_norm": 0.10543771833181381, "kl": 0.124053955078125, "learning_rate": 4.7329869369059793e-07, "loss": 0.0157, "reward": 0.09151786100119352, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.09151786100119352, "rewards/format_reward": 0.0, "step": 501 }, { "completion_length": 1822.9152603149414, "epoch": 0.5998058397431111, "grad_norm": 0.09516432136297226, "kl": 0.118988037109375, "learning_rate": 4.714469355909867e-07, "loss": 0.0156, "reward": 0.07589286123402417, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.0, "step": 502 }, { "completion_length": 1875.176414489746, "epoch": 0.601000672093197, "grad_norm": 0.07685718685388565, "kl": 0.104034423828125, "learning_rate": 4.695965484569966e-07, "loss": 0.014, "reward": 0.08705357648432255, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.08705357648432255, "rewards/format_reward": 0.0, "step": 503 }, { "completion_length": 1851.573745727539, "epoch": 0.6021955044432828, "grad_norm": 0.07660838216543198, "kl": 0.079925537109375, "learning_rate": 4.677475645829394e-07, "loss": 0.014, "reward": 0.08035714668221772, "reward_std": 0.04602411389350891, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, "step": 504 }, { "completion_length": 1891.0246505737305, "epoch": 0.6033903367933687, "grad_norm": 0.14563655853271484, "kl": 0.07568359375, "learning_rate": 4.6590001623863593e-07, "loss": 0.0168, "reward": 0.08035714691504836, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.08035714691504836, "rewards/format_reward": 0.0, "step": 505 }, { "completion_length": 1849.8460693359375, "epoch": 0.6045851691434545, "grad_norm": 0.05801551043987274, "kl": 0.069061279296875, "learning_rate": 4.6405393566885344e-07, "loss": 0.0072, "reward": 0.06696428917348385, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "step": 506 }, { "completion_length": 1918.4398193359375, "epoch": 0.6057800014935404, "grad_norm": 0.05543803796172142, "kl": 0.0703277587890625, "learning_rate": 4.6220935509274227e-07, "loss": 0.0087, "reward": 0.07366071664728224, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.07366071664728224, "rewards/format_reward": 0.0, "step": 507 }, { "completion_length": 1800.6541061401367, "epoch": 0.6069748338436263, "grad_norm": 0.33682650327682495, "kl": 0.0626983642578125, "learning_rate": 4.603663067032738e-07, "loss": 0.0287, "reward": 0.09375000349245965, "reward_std": 0.05256028147414327, "rewards/accuracy_reward": 0.09375000349245965, "rewards/format_reward": 0.0, "step": 508 }, { "completion_length": 1921.3148193359375, "epoch": 0.6081696661937122, "grad_norm": 0.0664508044719696, "kl": 0.05902099609375, "learning_rate": 4.585248226666785e-07, "loss": 0.0081, "reward": 0.06473214621655643, "reward_std": 0.01477411249652505, "rewards/accuracy_reward": 0.06473214621655643, "rewards/format_reward": 0.0, "step": 509 }, { "completion_length": 1885.8326873779297, "epoch": 0.609364498543798, "grad_norm": 0.14841929078102112, "kl": 0.0582733154296875, "learning_rate": 4.5668493512188454e-07, "loss": 0.0179, "reward": 0.10491071944124997, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.10491071944124997, "rewards/format_reward": 0.0, "step": 510 }, { "completion_length": 1869.2813415527344, "epoch": 0.6105593308938839, "grad_norm": 0.07392990589141846, "kl": 0.05812835693359375, "learning_rate": 4.5484667617995764e-07, "loss": 0.0129, "reward": 0.058035716880112886, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "step": 511 }, { "completion_length": 1877.158576965332, "epoch": 0.6117541632439698, "grad_norm": 0.06480254977941513, "kl": 0.05816650390625, "learning_rate": 4.5301007792353894e-07, "loss": 0.0081, "reward": 0.049107145285233855, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, "step": 512 }, { "completion_length": 1913.9063339233398, "epoch": 0.6129489955940557, "grad_norm": 0.05200779810547829, "kl": 0.06549072265625, "learning_rate": 4.5117517240628656e-07, "loss": 0.0132, "reward": 0.07589286053553224, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.07589286053553224, "rewards/format_reward": 0.0, "step": 513 }, { "completion_length": 1946.4308853149414, "epoch": 0.6141438279441416, "grad_norm": 0.07707808911800385, "kl": 0.059417724609375, "learning_rate": 4.493419916523159e-07, "loss": 0.0036, "reward": 0.05133928754366934, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.05133928754366934, "rewards/format_reward": 0.0, "step": 514 }, { "completion_length": 1873.3103485107422, "epoch": 0.6153386602942275, "grad_norm": 0.1112465038895607, "kl": 0.0721588134765625, "learning_rate": 4.475105676556401e-07, "loss": 0.0144, "reward": 0.08258929033763707, "reward_std": 0.05186965363100171, "rewards/accuracy_reward": 0.08258929033763707, "rewards/format_reward": 0.0, "step": 515 }, { "completion_length": 1904.479995727539, "epoch": 0.6165334926443133, "grad_norm": 0.06764061003923416, "kl": 0.0627899169921875, "learning_rate": 4.4568093237961226e-07, "loss": 0.0144, "reward": 0.06919643306173384, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.06919643306173384, "rewards/format_reward": 0.0, "step": 516 }, { "completion_length": 1905.7322387695312, "epoch": 0.6177283249943992, "grad_norm": 0.06313049793243408, "kl": 0.0632781982421875, "learning_rate": 4.4385311775636757e-07, "loss": 0.0056, "reward": 0.060267859837040305, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, "step": 517 }, { "completion_length": 1814.8951721191406, "epoch": 0.6189231573444851, "grad_norm": 0.09103602916002274, "kl": 0.067230224609375, "learning_rate": 4.4202715568626593e-07, "loss": 0.0161, "reward": 0.1004464328289032, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "step": 518 }, { "completion_length": 1854.2590026855469, "epoch": 0.620117989694571, "grad_norm": 0.06793206930160522, "kl": 0.06207275390625, "learning_rate": 4.402030780373347e-07, "loss": 0.0074, "reward": 0.08035714598372579, "reward_std": 0.044642859138548374, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.0, "step": 519 }, { "completion_length": 1857.8103485107422, "epoch": 0.6213128220446569, "grad_norm": 0.09940844029188156, "kl": 0.0604248046875, "learning_rate": 4.38380916644713e-07, "loss": 0.0136, "reward": 0.05803571664728224, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.05803571664728224, "rewards/format_reward": 0.0, "step": 520 }, { "completion_length": 1904.8594589233398, "epoch": 0.6225076543947428, "grad_norm": 0.10906030237674713, "kl": 0.057342529296875, "learning_rate": 4.3656070331009664e-07, "loss": 0.0139, "reward": 0.09151786123402417, "reward_std": 0.028857597149908543, "rewards/accuracy_reward": 0.09151786123402417, "rewards/format_reward": 0.0, "step": 521 }, { "completion_length": 1857.3661499023438, "epoch": 0.6237024867448286, "grad_norm": 0.11494660377502441, "kl": 0.06472015380859375, "learning_rate": 4.3474246980118146e-07, "loss": 0.0131, "reward": 0.0892857180442661, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.0892857180442661, "rewards/format_reward": 0.0, "step": 522 }, { "completion_length": 1842.986686706543, "epoch": 0.6248973190949145, "grad_norm": 0.05762450397014618, "kl": 0.05792999267578125, "learning_rate": 4.329262478511111e-07, "loss": 0.0119, "reward": 0.06473214528523386, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, "step": 523 }, { "completion_length": 1849.9197311401367, "epoch": 0.6260921514450004, "grad_norm": 0.08118993043899536, "kl": 0.06855010986328125, "learning_rate": 4.311120691579213e-07, "loss": 0.006, "reward": 0.05357143119908869, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, "step": 524 }, { "completion_length": 1833.7835540771484, "epoch": 0.6272869837950863, "grad_norm": 0.06589169055223465, "kl": 0.0644073486328125, "learning_rate": 4.292999653839874e-07, "loss": 0.0075, "reward": 0.07142857532016933, "reward_std": 0.03194062830880284, "rewards/accuracy_reward": 0.07142857532016933, "rewards/format_reward": 0.0, "step": 525 }, { "completion_length": 1869.4911422729492, "epoch": 0.6284818161451722, "grad_norm": 0.10139084607362747, "kl": 0.0660858154296875, "learning_rate": 4.27489968155472e-07, "loss": 0.0062, "reward": 0.055803574388846755, "reward_std": 0.023702683858573437, "rewards/accuracy_reward": 0.055803574388846755, "rewards/format_reward": 0.0, "step": 526 }, { "completion_length": 1833.5313339233398, "epoch": 0.6296766484952581, "grad_norm": 0.09691954404115677, "kl": 0.06451416015625, "learning_rate": 4.256821090617724e-07, "loss": 0.013, "reward": 0.10937500512227416, "reward_std": 0.023702684324234724, "rewards/accuracy_reward": 0.10937500512227416, "rewards/format_reward": 0.0, "step": 527 }, { "completion_length": 1815.7009735107422, "epoch": 0.6308714808453438, "grad_norm": 0.09956888109445572, "kl": 0.06634521484375, "learning_rate": 4.238764196549697e-07, "loss": 0.0161, "reward": 0.05580357392318547, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, "step": 528 }, { "completion_length": 1795.8951797485352, "epoch": 0.6320663131954297, "grad_norm": 0.1377066969871521, "kl": 0.0603790283203125, "learning_rate": 4.2207293144927846e-07, "loss": 0.0121, "reward": 0.10267857671715319, "reward_std": 0.05256028147414327, "rewards/accuracy_reward": 0.10267857671715319, "rewards/format_reward": 0.0, "step": 529 }, { "completion_length": 1861.3594589233398, "epoch": 0.6332611455455156, "grad_norm": 0.10090659558773041, "kl": 0.0570831298828125, "learning_rate": 4.2027167592049583e-07, "loss": 0.0059, "reward": 0.026785715483129025, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.026785715483129025, "rewards/format_reward": 0.0, "step": 530 }, { "completion_length": 1794.4197158813477, "epoch": 0.6344559778956015, "grad_norm": 0.11963106691837311, "kl": 0.0580902099609375, "learning_rate": 4.184726845054526e-07, "loss": 0.0176, "reward": 0.10267857532016933, "reward_std": 0.0549526852555573, "rewards/accuracy_reward": 0.10267857532016933, "rewards/format_reward": 0.0, "step": 531 }, { "completion_length": 1837.9755249023438, "epoch": 0.6356508102456874, "grad_norm": 0.07913490384817123, "kl": 0.05867767333984375, "learning_rate": 4.166759886014648e-07, "loss": 0.0078, "reward": 0.08258929010480642, "reward_std": 0.02885759761556983, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, "step": 532 }, { "completion_length": 1829.3750686645508, "epoch": 0.6368456425957733, "grad_norm": 0.16118530929088593, "kl": 0.0676727294921875, "learning_rate": 4.1488161956578547e-07, "loss": 0.0182, "reward": 0.08482143213041127, "reward_std": 0.056333940010517836, "rewards/accuracy_reward": 0.08482143213041127, "rewards/format_reward": 0.0, "step": 533 }, { "completion_length": 1765.7880249023438, "epoch": 0.6380404749458591, "grad_norm": 0.17572851479053497, "kl": 0.0743408203125, "learning_rate": 4.130896087150572e-07, "loss": 0.0236, "reward": 0.10714286030270159, "reward_std": 0.0549526852555573, "rewards/accuracy_reward": 0.10714286030270159, "rewards/format_reward": 0.0, "step": 534 }, { "completion_length": 1799.6027603149414, "epoch": 0.639235307295945, "grad_norm": 0.1056859940290451, "kl": 0.079681396484375, "learning_rate": 4.1129998732476624e-07, "loss": 0.0137, "reward": 0.07366071594879031, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.07366071594879031, "rewards/format_reward": 0.0, "step": 535 }, { "completion_length": 1731.238899230957, "epoch": 0.6404301396460309, "grad_norm": 0.07803025096654892, "kl": 0.07708740234375, "learning_rate": 4.0951278662869614e-07, "loss": 0.0103, "reward": 0.1049107201397419, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.1049107201397419, "rewards/format_reward": 0.0, "step": 536 }, { "completion_length": 1733.8371276855469, "epoch": 0.6416249719961168, "grad_norm": 0.13762643933296204, "kl": 0.079742431640625, "learning_rate": 4.077280378183825e-07, "loss": 0.008, "reward": 0.05580357415601611, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.05580357415601611, "rewards/format_reward": 0.0, "step": 537 }, { "completion_length": 1747.4286346435547, "epoch": 0.6428198043462027, "grad_norm": 0.13914401829242706, "kl": 0.0885467529296875, "learning_rate": 4.059457720425686e-07, "loss": 0.0242, "reward": 0.0736607164144516, "reward_std": 0.05426205834373832, "rewards/accuracy_reward": 0.0736607164144516, "rewards/format_reward": 0.0, "step": 538 }, { "completion_length": 1704.8125686645508, "epoch": 0.6440146366962886, "grad_norm": 0.10378368943929672, "kl": 0.0957794189453125, "learning_rate": 4.041660204066628e-07, "loss": 0.0222, "reward": 0.09375000442378223, "reward_std": 0.06388125708326697, "rewards/accuracy_reward": 0.09375000442378223, "rewards/format_reward": 0.0, "step": 539 }, { "completion_length": 1718.986686706543, "epoch": 0.6452094690463744, "grad_norm": 0.08586075901985168, "kl": 0.09954833984375, "learning_rate": 4.0238881397219395e-07, "loss": 0.0178, "reward": 0.07366071897558868, "reward_std": 0.023702683858573437, "rewards/accuracy_reward": 0.07366071897558868, "rewards/format_reward": 0.0, "step": 540 }, { "completion_length": 1623.042495727539, "epoch": 0.6464043013964603, "grad_norm": 0.11508069187402725, "kl": 0.1069793701171875, "learning_rate": 4.0061418375627063e-07, "loss": 0.0157, "reward": 0.07589286030270159, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.07589286030270159, "rewards/format_reward": 0.0, "step": 541 }, { "completion_length": 1638.3616943359375, "epoch": 0.6475991337465462, "grad_norm": 0.08514707535505295, "kl": 0.0948944091796875, "learning_rate": 3.988421607310395e-07, "loss": 0.0162, "reward": 0.09375000512227416, "reward_std": 0.04602411389350891, "rewards/accuracy_reward": 0.09375000512227416, "rewards/format_reward": 0.0, "step": 542 }, { "completion_length": 1624.0982971191406, "epoch": 0.6487939660966321, "grad_norm": 0.07648032903671265, "kl": 0.08807373046875, "learning_rate": 3.9707277582314434e-07, "loss": 0.0143, "reward": 0.06473214575089514, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.06473214575089514, "rewards/format_reward": 0.0, "step": 543 }, { "completion_length": 1667.2478408813477, "epoch": 0.649988798446718, "grad_norm": 0.10173901170492172, "kl": 0.10552978515625, "learning_rate": 3.9530605991318665e-07, "loss": 0.014, "reward": 0.05133928777649999, "reward_std": 0.03778616897761822, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, "step": 544 }, { "completion_length": 1689.3907012939453, "epoch": 0.6511836307968039, "grad_norm": 0.08177798986434937, "kl": 0.10467529296875, "learning_rate": 3.935420438351865e-07, "loss": 0.0153, "reward": 0.0558035746216774, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "step": 545 }, { "completion_length": 1686.2500762939453, "epoch": 0.6523784631468897, "grad_norm": 0.12319135665893555, "kl": 0.1057281494140625, "learning_rate": 3.9178075837604465e-07, "loss": 0.0175, "reward": 0.08258928963914514, "reward_std": 0.04294108226895332, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "step": 546 }, { "completion_length": 1697.5536346435547, "epoch": 0.6535732954969756, "grad_norm": 0.09942708164453506, "kl": 0.10235595703125, "learning_rate": 3.9002223427500534e-07, "loss": 0.0154, "reward": 0.08705357438884676, "reward_std": 0.03778616897761822, "rewards/accuracy_reward": 0.08705357438884676, "rewards/format_reward": 0.0, "step": 547 }, { "completion_length": 1737.6339874267578, "epoch": 0.6547681278470615, "grad_norm": 0.13474924862384796, "kl": 0.0972137451171875, "learning_rate": 3.882665022231193e-07, "loss": 0.0065, "reward": 0.05357143119908869, "reward_std": 0.04225045442581177, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, "step": 548 }, { "completion_length": 1745.7076721191406, "epoch": 0.6559629601971473, "grad_norm": 0.16155661642551422, "kl": 0.08868408203125, "learning_rate": 3.8651359286270813e-07, "loss": 0.0239, "reward": 0.11160714807920158, "reward_std": 0.060107598546892405, "rewards/accuracy_reward": 0.11160714807920158, "rewards/format_reward": 0.0, "step": 549 }, { "completion_length": 1757.6942901611328, "epoch": 0.6571577925472332, "grad_norm": 0.19252263009548187, "kl": 0.093505859375, "learning_rate": 3.8476353678683027e-07, "loss": 0.0263, "reward": 0.05357143096625805, "reward_std": 0.0549526852555573, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "step": 550 }, { "completion_length": 1707.49560546875, "epoch": 0.658352624897319, "grad_norm": 0.10461177676916122, "kl": 0.09320068359375, "learning_rate": 3.830163645387462e-07, "loss": 0.008, "reward": 0.07812500349245965, "reward_std": 0.023702684324234724, "rewards/accuracy_reward": 0.07812500349245965, "rewards/format_reward": 0.0, "step": 551 }, { "completion_length": 1747.0000762939453, "epoch": 0.6595474572474049, "grad_norm": 0.13642312586307526, "kl": 0.0976409912109375, "learning_rate": 3.812721066113855e-07, "loss": 0.0265, "reward": 0.03571428777649999, "reward_std": 0.046024112962186337, "rewards/accuracy_reward": 0.03571428777649999, "rewards/format_reward": 0.0, "step": 552 }, { "completion_length": 1787.8036575317383, "epoch": 0.6607422895974908, "grad_norm": 0.09445420652627945, "kl": 0.0895843505859375, "learning_rate": 3.7953079344681573e-07, "loss": 0.0141, "reward": 0.09375000395812094, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.09375000395812094, "rewards/format_reward": 0.0, "step": 553 }, { "completion_length": 1753.3259658813477, "epoch": 0.6619371219475767, "grad_norm": 0.10592974722385406, "kl": 0.1085052490234375, "learning_rate": 3.777924554357096e-07, "loss": 0.0192, "reward": 0.13839286449365318, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.13839286449365318, "rewards/format_reward": 0.0, "step": 554 }, { "completion_length": 1779.0938262939453, "epoch": 0.6631319542976626, "grad_norm": 0.15189969539642334, "kl": 0.1248016357421875, "learning_rate": 3.760571229168155e-07, "loss": 0.0062, "reward": 0.05357143096625805, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "step": 555 }, { "completion_length": 1814.9509811401367, "epoch": 0.6643267866477485, "grad_norm": 0.12868443131446838, "kl": 0.10076904296875, "learning_rate": 3.7432482617642734e-07, "loss": 0.0139, "reward": 0.12500000558793545, "reward_std": 0.05117902625352144, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 556 }, { "completion_length": 1840.073745727539, "epoch": 0.6655216189978344, "grad_norm": 0.13210028409957886, "kl": 0.1137237548828125, "learning_rate": 3.7259559544785713e-07, "loss": 0.0133, "reward": 0.0714285746216774, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "step": 557 }, { "completion_length": 1807.4576721191406, "epoch": 0.6667164513479202, "grad_norm": 0.08642993867397308, "kl": 0.0975494384765625, "learning_rate": 3.708694609109061e-07, "loss": 0.0174, "reward": 0.0870535746216774, "reward_std": 0.04671474080532789, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "step": 558 }, { "completion_length": 1833.1384811401367, "epoch": 0.6679112836980061, "grad_norm": 0.1445986032485962, "kl": 0.0944976806640625, "learning_rate": 3.691464526913387e-07, "loss": 0.0056, "reward": 0.03348214435391128, "reward_std": 0.013392857741564512, "rewards/accuracy_reward": 0.03348214435391128, "rewards/format_reward": 0.0, "step": 559 }, { "completion_length": 1831.1206283569336, "epoch": 0.669106116048092, "grad_norm": 0.0937800407409668, "kl": 0.0798187255859375, "learning_rate": 3.6742660086035636e-07, "loss": 0.0094, "reward": 0.06919643189758062, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "step": 560 }, { "completion_length": 1770.6340026855469, "epoch": 0.6703009483981779, "grad_norm": 0.15422633290290833, "kl": 0.08751678466796875, "learning_rate": 3.6570993543407324e-07, "loss": 0.0178, "reward": 0.11830357578583062, "reward_std": 0.05941697070375085, "rewards/accuracy_reward": 0.11830357578583062, "rewards/format_reward": 0.0, "step": 561 }, { "completion_length": 1825.9175033569336, "epoch": 0.6714957807482638, "grad_norm": 0.09986953437328339, "kl": 0.096160888671875, "learning_rate": 3.6399648637299164e-07, "loss": 0.0174, "reward": 0.11830357694998384, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.11830357694998384, "rewards/format_reward": 0.0, "step": 562 }, { "completion_length": 1798.0313415527344, "epoch": 0.6726906130983497, "grad_norm": 0.20112042129039764, "kl": 0.0879669189453125, "learning_rate": 3.622862835814796e-07, "loss": 0.0199, "reward": 0.06473214668221772, "reward_std": 0.03778616897761822, "rewards/accuracy_reward": 0.06473214668221772, "rewards/format_reward": 0.0, "step": 563 }, { "completion_length": 1789.207664489746, "epoch": 0.6738854454484355, "grad_norm": 0.09522834420204163, "kl": 0.0930023193359375, "learning_rate": 3.6057935690724927e-07, "loss": 0.0166, "reward": 0.0513392873108387, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.0513392873108387, "rewards/format_reward": 0.0, "step": 564 }, { "completion_length": 1831.9755249023438, "epoch": 0.6750802777985214, "grad_norm": 0.09288113564252853, "kl": 0.0861053466796875, "learning_rate": 3.588757361408353e-07, "loss": 0.0079, "reward": 0.06696428847499192, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "step": 565 }, { "completion_length": 1820.752311706543, "epoch": 0.6762751101486073, "grad_norm": 0.08538973331451416, "kl": 0.0875396728515625, "learning_rate": 3.5717545101507485e-07, "loss": 0.0195, "reward": 0.05580357392318547, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, "step": 566 }, { "completion_length": 1817.4219512939453, "epoch": 0.6774699424986932, "grad_norm": 0.1046675518155098, "kl": 0.0906982421875, "learning_rate": 3.554785312045899e-07, "loss": 0.0141, "reward": 0.09375000349245965, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.09375000349245965, "rewards/format_reward": 0.0, "step": 567 }, { "completion_length": 1831.2768630981445, "epoch": 0.6786647748487791, "grad_norm": 0.10678687691688538, "kl": 0.102752685546875, "learning_rate": 3.537850063252679e-07, "loss": 0.0149, "reward": 0.06696428940631449, "reward_std": 0.0460241143591702, "rewards/accuracy_reward": 0.06696428940631449, "rewards/format_reward": 0.0, "step": 568 }, { "completion_length": 1861.9666061401367, "epoch": 0.679859607198865, "grad_norm": 0.13669022917747498, "kl": 0.08380126953125, "learning_rate": 3.520949059337451e-07, "loss": 0.0051, "reward": 0.05133928754366934, "reward_std": 0.01854777056723833, "rewards/accuracy_reward": 0.05133928754366934, "rewards/format_reward": 0.0, "step": 569 }, { "completion_length": 1774.7612533569336, "epoch": 0.6810544395489508, "grad_norm": 0.0826817974448204, "kl": 0.091339111328125, "learning_rate": 3.5040825952689225e-07, "loss": 0.0134, "reward": 0.09375000349245965, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.09375000349245965, "rewards/format_reward": 0.0, "step": 570 }, { "completion_length": 1755.9130325317383, "epoch": 0.6822492718990366, "grad_norm": 0.18007031083106995, "kl": 0.08941650390625, "learning_rate": 3.487250965412975e-07, "loss": 0.0162, "reward": 0.10044643259607255, "reward_std": 0.05048839934170246, "rewards/accuracy_reward": 0.10044643259607255, "rewards/format_reward": 0.0, "step": 571 }, { "completion_length": 1854.8371276855469, "epoch": 0.6834441042491225, "grad_norm": 0.0679960623383522, "kl": 0.0758514404296875, "learning_rate": 3.4704544635275415e-07, "loss": 0.0093, "reward": 0.05580357299186289, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.05580357299186289, "rewards/format_reward": 0.0, "step": 572 }, { "completion_length": 1814.4018630981445, "epoch": 0.6846389365992084, "grad_norm": 0.11033713072538376, "kl": 0.07994842529296875, "learning_rate": 3.45369338275748e-07, "loss": 0.0072, "reward": 0.10044643376022577, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, "step": 573 }, { "completion_length": 1811.2232971191406, "epoch": 0.6858337689492943, "grad_norm": 0.10589395463466644, "kl": 0.0879974365234375, "learning_rate": 3.436968015629447e-07, "loss": 0.0112, "reward": 0.08035714505240321, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.08035714505240321, "rewards/format_reward": 0.0, "step": 574 }, { "completion_length": 1889.3639221191406, "epoch": 0.6870286012993801, "grad_norm": 0.081288181245327, "kl": 0.0794525146484375, "learning_rate": 3.4202786540468014e-07, "loss": 0.0148, "reward": 0.0825892889406532, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.0825892889406532, "rewards/format_reward": 0.0, "step": 575 }, { "completion_length": 1861.7991943359375, "epoch": 0.688223433649466, "grad_norm": 0.10511035472154617, "kl": 0.0851593017578125, "learning_rate": 3.4036255892845084e-07, "loss": 0.007, "reward": 0.04687500186264515, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.04687500186264515, "rewards/format_reward": 0.0, "step": 576 }, { "completion_length": 1822.0358047485352, "epoch": 0.6894182659995519, "grad_norm": 0.09403185546398163, "kl": 0.0936431884765625, "learning_rate": 3.387009111984053e-07, "loss": 0.0083, "reward": 0.0870535762514919, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.0870535762514919, "rewards/format_reward": 0.0, "step": 577 }, { "completion_length": 1884.1161727905273, "epoch": 0.6906130983496378, "grad_norm": 0.14700038731098175, "kl": 0.0776214599609375, "learning_rate": 3.3704295121483664e-07, "loss": 0.0191, "reward": 0.05357143119908869, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.05357143119908869, "rewards/format_reward": 0.0, "step": 578 }, { "completion_length": 1868.808120727539, "epoch": 0.6918079306997237, "grad_norm": 0.07966268062591553, "kl": 0.070343017578125, "learning_rate": 3.353887079136769e-07, "loss": 0.0134, "reward": 0.09151786076836288, "reward_std": 0.03778616897761822, "rewards/accuracy_reward": 0.09151786076836288, "rewards/format_reward": 0.0, "step": 579 }, { "completion_length": 1852.8192825317383, "epoch": 0.6930027630498096, "grad_norm": 0.11561112105846405, "kl": 0.0655517578125, "learning_rate": 3.337382101659923e-07, "loss": 0.0049, "reward": 0.06473214505240321, "reward_std": 0.059416971169412136, "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.0, "step": 580 }, { "completion_length": 1884.6540985107422, "epoch": 0.6941975953998955, "grad_norm": 0.06498418748378754, "kl": 0.0677032470703125, "learning_rate": 3.3209148677747825e-07, "loss": 0.0092, "reward": 0.053571430733427405, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.053571430733427405, "rewards/format_reward": 0.0, "step": 581 }, { "completion_length": 1870.4286651611328, "epoch": 0.6953924277499813, "grad_norm": 0.10556535422801971, "kl": 0.0749969482421875, "learning_rate": 3.304485664879578e-07, "loss": 0.007, "reward": 0.04464285937137902, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, "step": 582 }, { "completion_length": 1847.5045471191406, "epoch": 0.6965872601000672, "grad_norm": 0.07487612217664719, "kl": 0.078277587890625, "learning_rate": 3.288094779708792e-07, "loss": 0.0112, "reward": 0.0892857180442661, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.0892857180442661, "rewards/format_reward": 0.0, "step": 583 }, { "completion_length": 1856.3326721191406, "epoch": 0.6977820924501531, "grad_norm": 0.143977552652359, "kl": 0.07122802734375, "learning_rate": 3.271742498328159e-07, "loss": 0.0125, "reward": 0.0558035746216774, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.0558035746216774, "rewards/format_reward": 0.0, "step": 584 }, { "completion_length": 1891.5045471191406, "epoch": 0.698976924800239, "grad_norm": 0.06661387532949448, "kl": 0.05290985107421875, "learning_rate": 3.2554291061296715e-07, "loss": 0.0099, "reward": 0.04017857345752418, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.04017857345752418, "rewards/format_reward": 0.0, "step": 585 }, { "completion_length": 1846.9375686645508, "epoch": 0.7001717571503249, "grad_norm": 0.06417391449213028, "kl": 0.0593414306640625, "learning_rate": 3.2391548878266e-07, "loss": 0.0092, "reward": 0.08258928917348385, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.08258928917348385, "rewards/format_reward": 0.0, "step": 586 }, { "completion_length": 1912.3282089233398, "epoch": 0.7013665895004108, "grad_norm": 0.12758365273475647, "kl": 0.0611419677734375, "learning_rate": 3.222920127448523e-07, "loss": 0.0127, "reward": 0.06026786100119352, "reward_std": 0.034012510906904936, "rewards/accuracy_reward": 0.06026786100119352, "rewards/format_reward": 0.0, "step": 587 }, { "completion_length": 1864.1340103149414, "epoch": 0.7025614218504966, "grad_norm": 0.1080550104379654, "kl": 0.07251739501953125, "learning_rate": 3.206725108336371e-07, "loss": 0.0114, "reward": 0.08482143189758062, "reward_std": 0.03847679682075977, "rewards/accuracy_reward": 0.08482143189758062, "rewards/format_reward": 0.0, "step": 588 }, { "completion_length": 1842.4264221191406, "epoch": 0.7037562542005825, "grad_norm": 0.17991642653942108, "kl": 0.0657501220703125, "learning_rate": 3.1905701131374816e-07, "loss": 0.0164, "reward": 0.06026786030270159, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.06026786030270159, "rewards/format_reward": 0.0, "step": 589 }, { "completion_length": 1881.2121505737305, "epoch": 0.7049510865506684, "grad_norm": 0.08804582059383392, "kl": 0.0616455078125, "learning_rate": 3.174455423800666e-07, "loss": 0.0061, "reward": 0.09821429057046771, "reward_std": 0.05117902718484402, "rewards/accuracy_reward": 0.09821429057046771, "rewards/format_reward": 0.0, "step": 590 }, { "completion_length": 1916.5916061401367, "epoch": 0.7061459189007543, "grad_norm": 0.07536152005195618, "kl": 0.0522918701171875, "learning_rate": 3.1583813215712827e-07, "loss": 0.0073, "reward": 0.040178573690354824, "reward_std": 0.019238398410379887, "rewards/accuracy_reward": 0.040178573690354824, "rewards/format_reward": 0.0, "step": 591 }, { "completion_length": 1921.3907089233398, "epoch": 0.7073407512508402, "grad_norm": 0.061079394072294235, "kl": 0.0639801025390625, "learning_rate": 3.1423480869863415e-07, "loss": 0.0106, "reward": 0.10937500721774995, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.10937500721774995, "rewards/format_reward": 0.0, "step": 592 }, { "completion_length": 1867.5670547485352, "epoch": 0.7085355836009259, "grad_norm": 0.10595210641622543, "kl": 0.0727691650390625, "learning_rate": 3.1263559998695945e-07, "loss": 0.0144, "reward": 0.06473214644938707, "reward_std": 0.02885759761556983, "rewards/accuracy_reward": 0.06473214644938707, "rewards/format_reward": 0.0, "step": 593 }, { "completion_length": 1887.1652603149414, "epoch": 0.7097304159510118, "grad_norm": 0.07174505293369293, "kl": 0.06748199462890625, "learning_rate": 3.1104053393266627e-07, "loss": 0.0112, "reward": 0.058035716880112886, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "step": 594 }, { "completion_length": 1893.3416213989258, "epoch": 0.7109252483010977, "grad_norm": 0.12920311093330383, "kl": 0.0731353759765625, "learning_rate": 3.094496383740156e-07, "loss": 0.0122, "reward": 0.10044643469154835, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "step": 595 }, { "completion_length": 1903.0424880981445, "epoch": 0.7121200806511836, "grad_norm": 0.07109269499778748, "kl": 0.07269287109375, "learning_rate": 3.0786294107648234e-07, "loss": 0.0106, "reward": 0.03125000209547579, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.03125000209547579, "rewards/format_reward": 0.0, "step": 596 }, { "completion_length": 1891.526870727539, "epoch": 0.7133149130012695, "grad_norm": 0.07242727279663086, "kl": 0.0893402099609375, "learning_rate": 3.062804697322698e-07, "loss": 0.0108, "reward": 0.07142857532016933, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.07142857532016933, "rewards/format_reward": 0.0, "step": 597 }, { "completion_length": 1852.533561706543, "epoch": 0.7145097453513554, "grad_norm": 0.15052804350852966, "kl": 0.080780029296875, "learning_rate": 3.047022519598271e-07, "loss": 0.0152, "reward": 0.1071428635623306, "reward_std": 0.05357143096625805, "rewards/accuracy_reward": 0.1071428635623306, "rewards/format_reward": 0.0, "step": 598 }, { "completion_length": 1886.5023193359375, "epoch": 0.7157045777014412, "grad_norm": 0.07656344026327133, "kl": 0.0841064453125, "learning_rate": 3.0312831530336677e-07, "loss": 0.0088, "reward": 0.05803571757860482, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.05803571757860482, "rewards/format_reward": 0.0, "step": 599 }, { "completion_length": 1867.8348922729492, "epoch": 0.7168994100515271, "grad_norm": 0.1510997712612152, "kl": 0.096649169921875, "learning_rate": 3.0155868723238456e-07, "loss": 0.0081, "reward": 0.09375000419095159, "reward_std": 0.03194062830880284, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.0, "step": 600 }, { "completion_length": 1866.3103408813477, "epoch": 0.718094242401613, "grad_norm": 0.26030343770980835, "kl": 0.0834197998046875, "learning_rate": 2.999933951411791e-07, "loss": 0.0135, "reward": 0.07589286100119352, "reward_std": 0.04225045535713434, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "step": 601 }, { "completion_length": 1864.5871276855469, "epoch": 0.7192890747516989, "grad_norm": 0.12312754988670349, "kl": 0.093353271484375, "learning_rate": 2.984324663483745e-07, "loss": 0.0082, "reward": 0.06919643259607255, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.06919643259607255, "rewards/format_reward": 0.0, "step": 602 }, { "completion_length": 1899.7076721191406, "epoch": 0.7204839071017848, "grad_norm": 0.1215401142835617, "kl": 0.094268798828125, "learning_rate": 2.968759280964437e-07, "loss": 0.01, "reward": 0.04687500209547579, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, "step": 603 }, { "completion_length": 1866.096061706543, "epoch": 0.7216787394518707, "grad_norm": 0.07917793840169907, "kl": 0.0869903564453125, "learning_rate": 2.953238075512321e-07, "loss": 0.0107, "reward": 0.08035714621655643, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.08035714621655643, "rewards/format_reward": 0.0, "step": 604 }, { "completion_length": 1824.5915908813477, "epoch": 0.7228735718019565, "grad_norm": 0.0875629410147667, "kl": 0.08294677734375, "learning_rate": 2.937761318014849e-07, "loss": 0.0152, "reward": 0.12276786309666932, "reward_std": 0.04294108226895332, "rewards/accuracy_reward": 0.12276786309666932, "rewards/format_reward": 0.0, "step": 605 }, { "completion_length": 1897.1094665527344, "epoch": 0.7240684041520424, "grad_norm": 0.10494077205657959, "kl": 0.0833587646484375, "learning_rate": 2.922329278583726e-07, "loss": 0.0088, "reward": 0.05357143026776612, "reward_std": 0.019238398410379887, "rewards/accuracy_reward": 0.05357143026776612, "rewards/format_reward": 0.0, "step": 606 }, { "completion_length": 1871.618392944336, "epoch": 0.7252632365021283, "grad_norm": 0.09592457860708237, "kl": 0.0795135498046875, "learning_rate": 2.90694222655021e-07, "loss": 0.0113, "reward": 0.06026785960420966, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.06026785960420966, "rewards/format_reward": 0.0, "step": 607 }, { "completion_length": 1881.1741943359375, "epoch": 0.7264580688522142, "grad_norm": 0.07680676877498627, "kl": 0.08575439453125, "learning_rate": 2.8916004304604013e-07, "loss": 0.0146, "reward": 0.07589286076836288, "reward_std": 0.04225045535713434, "rewards/accuracy_reward": 0.07589286076836288, "rewards/format_reward": 0.0, "step": 608 }, { "completion_length": 1914.8706130981445, "epoch": 0.7276529012023001, "grad_norm": 0.30729496479034424, "kl": 0.0750579833984375, "learning_rate": 2.8763041580705676e-07, "loss": 0.0144, "reward": 0.06919643143191934, "reward_std": 0.04533348651602864, "rewards/accuracy_reward": 0.06919643143191934, "rewards/format_reward": 0.0, "step": 609 }, { "completion_length": 1869.0067749023438, "epoch": 0.728847733552386, "grad_norm": 0.08962177485227585, "kl": 0.076507568359375, "learning_rate": 2.861053676342456e-07, "loss": 0.0124, "reward": 0.07812500442378223, "reward_std": 0.037786169443279505, "rewards/accuracy_reward": 0.07812500442378223, "rewards/format_reward": 0.0, "step": 610 }, { "completion_length": 1902.6719589233398, "epoch": 0.7300425659024719, "grad_norm": 0.08622395992279053, "kl": 0.080169677734375, "learning_rate": 2.8458492514386473e-07, "loss": 0.0105, "reward": 0.07366071781143546, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.07366071781143546, "rewards/format_reward": 0.0, "step": 611 }, { "completion_length": 1888.8572158813477, "epoch": 0.7312373982525577, "grad_norm": 0.10739275068044662, "kl": 0.0837249755859375, "learning_rate": 2.830691148717902e-07, "loss": 0.0104, "reward": 0.053571430733427405, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.053571430733427405, "rewards/format_reward": 0.0, "step": 612 }, { "completion_length": 1879.9130172729492, "epoch": 0.7324322306026436, "grad_norm": 0.09747501462697983, "kl": 0.0772247314453125, "learning_rate": 2.815579632730531e-07, "loss": 0.011, "reward": 0.09821429080329835, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.09821429080329835, "rewards/format_reward": 0.0, "step": 613 }, { "completion_length": 1886.9487533569336, "epoch": 0.7336270629527294, "grad_norm": 0.15940381586551666, "kl": 0.0799713134765625, "learning_rate": 2.8005149672137815e-07, "loss": 0.0176, "reward": 0.07812500419095159, "reward_std": 0.05186965363100171, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, "step": 614 }, { "completion_length": 1811.9420471191406, "epoch": 0.7348218953028153, "grad_norm": 0.0676736980676651, "kl": 0.0695037841796875, "learning_rate": 2.785497415087229e-07, "loss": 0.016, "reward": 0.0937500037252903, "reward_std": 0.046024113427847624, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "step": 615 }, { "completion_length": 1857.0514297485352, "epoch": 0.7360167276529012, "grad_norm": 0.07386067509651184, "kl": 0.0760650634765625, "learning_rate": 2.770527238448197e-07, "loss": 0.0049, "reward": 0.12053571920841932, "reward_std": 0.03194062924012542, "rewards/accuracy_reward": 0.12053571920841932, "rewards/format_reward": 0.0, "step": 616 }, { "completion_length": 1853.705451965332, "epoch": 0.737211560002987, "grad_norm": 0.1588428020477295, "kl": 0.072479248046875, "learning_rate": 2.755604698567171e-07, "loss": 0.0002, "reward": 0.05357143096625805, "reward_std": 0.01785714365541935, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "step": 617 }, { "completion_length": 1867.2947311401367, "epoch": 0.7384063923530729, "grad_norm": 0.13709181547164917, "kl": 0.074127197265625, "learning_rate": 2.740730055883251e-07, "loss": 0.0135, "reward": 0.13392857741564512, "reward_std": 0.060107598546892405, "rewards/accuracy_reward": 0.13392857741564512, "rewards/format_reward": 0.0, "step": 618 }, { "completion_length": 1897.5157089233398, "epoch": 0.7396012247031588, "grad_norm": 0.052962467074394226, "kl": 0.059295654296875, "learning_rate": 2.7259035699995967e-07, "loss": 0.0086, "reward": 0.07812500442378223, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.07812500442378223, "rewards/format_reward": 0.0, "step": 619 }, { "completion_length": 1888.0558853149414, "epoch": 0.7407960570532447, "grad_norm": 0.11950363218784332, "kl": 0.068634033203125, "learning_rate": 2.711125499678899e-07, "loss": 0.0169, "reward": 0.04464285890571773, "reward_std": 0.042250454891473055, "rewards/accuracy_reward": 0.04464285890571773, "rewards/format_reward": 0.0, "step": 620 }, { "completion_length": 1863.5424880981445, "epoch": 0.7419908894033306, "grad_norm": 0.11961328238248825, "kl": 0.0815277099609375, "learning_rate": 2.6963961028388657e-07, "loss": 0.0124, "reward": 0.1026785762514919, "reward_std": 0.04225045535713434, "rewards/accuracy_reward": 0.1026785762514919, "rewards/format_reward": 0.0, "step": 621 }, { "completion_length": 1933.9175109863281, "epoch": 0.7431857217534165, "grad_norm": 0.07915452867746353, "kl": 0.0635833740234375, "learning_rate": 2.68171563654772e-07, "loss": 0.0078, "reward": 0.0357142873108387, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.0357142873108387, "rewards/format_reward": 0.0, "step": 622 }, { "completion_length": 1882.9889221191406, "epoch": 0.7443805541035023, "grad_norm": 0.1243758350610733, "kl": 0.0749053955078125, "learning_rate": 2.6670843570197154e-07, "loss": 0.007, "reward": 0.09375000419095159, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.0, "step": 623 }, { "completion_length": 1892.9621353149414, "epoch": 0.7455753864535882, "grad_norm": 0.07276484370231628, "kl": 0.059356689453125, "learning_rate": 2.6525025196106564e-07, "loss": 0.0115, "reward": 0.09821429126895964, "reward_std": 0.03709554113447666, "rewards/accuracy_reward": 0.09821429126895964, "rewards/format_reward": 0.0, "step": 624 }, { "completion_length": 1856.964370727539, "epoch": 0.7467702188036741, "grad_norm": 0.24513977766036987, "kl": 0.067901611328125, "learning_rate": 2.637970378813454e-07, "loss": 0.0107, "reward": 0.07366071827709675, "reward_std": 0.0491071455180645, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "step": 625 }, { "completion_length": 1827.2076797485352, "epoch": 0.74796505115376, "grad_norm": 0.16318318247795105, "kl": 0.0675506591796875, "learning_rate": 2.623488188253674e-07, "loss": 0.0198, "reward": 0.0625000016298145, "reward_std": 0.046024113427847624, "rewards/accuracy_reward": 0.0625000016298145, "rewards/format_reward": 0.0, "step": 626 }, { "completion_length": 1838.6741943359375, "epoch": 0.7491598835038459, "grad_norm": 0.12326207756996155, "kl": 0.0706024169921875, "learning_rate": 2.609056200685113e-07, "loss": 0.004, "reward": 0.04017857345752418, "reward_std": 0.01408348511904478, "rewards/accuracy_reward": 0.04017857345752418, "rewards/format_reward": 0.0, "step": 627 }, { "completion_length": 1843.6541061401367, "epoch": 0.7503547158539318, "grad_norm": 0.1119377464056015, "kl": 0.0815887451171875, "learning_rate": 2.5946746679853893e-07, "loss": 0.0124, "reward": 0.07366071781143546, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.07366071781143546, "rewards/format_reward": 0.0, "step": 628 }, { "completion_length": 1867.4108123779297, "epoch": 0.7515495482040176, "grad_norm": 0.10206142067909241, "kl": 0.0781402587890625, "learning_rate": 2.5803438411515483e-07, "loss": 0.0105, "reward": 0.05357143050059676, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.05357143050059676, "rewards/format_reward": 0.0, "step": 629 }, { "completion_length": 1894.6250610351562, "epoch": 0.7527443805541035, "grad_norm": 0.10961124300956726, "kl": 0.0720062255859375, "learning_rate": 2.5660639702956767e-07, "loss": 0.0019, "reward": 0.06473214528523386, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, "step": 630 }, { "completion_length": 1861.2143630981445, "epoch": 0.7539392129041894, "grad_norm": 0.18056420981884003, "kl": 0.064605712890625, "learning_rate": 2.5518353046405386e-07, "loss": 0.0055, "reward": 0.05133928777649999, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, "step": 631 }, { "completion_length": 1844.3973922729492, "epoch": 0.7551340452542753, "grad_norm": 0.07380843162536621, "kl": 0.06591796875, "learning_rate": 2.537658092515233e-07, "loss": 0.0099, "reward": 0.11160714877769351, "reward_std": 0.0295482249930501, "rewards/accuracy_reward": 0.11160714877769351, "rewards/format_reward": 0.0, "step": 632 }, { "completion_length": 1848.2679443359375, "epoch": 0.7563288776043612, "grad_norm": 0.11214245110750198, "kl": 0.065948486328125, "learning_rate": 2.52353258135085e-07, "loss": 0.0066, "reward": 0.04687500232830644, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.04687500232830644, "rewards/format_reward": 0.0, "step": 633 }, { "completion_length": 1886.1072235107422, "epoch": 0.7575237099544471, "grad_norm": 0.10343652218580246, "kl": 0.0801239013671875, "learning_rate": 2.5094590176761547e-07, "loss": 0.0118, "reward": 0.06696428824216127, "reward_std": 0.04225045535713434, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "step": 634 }, { "completion_length": 1885.6072235107422, "epoch": 0.758718542304533, "grad_norm": 0.10567346960306168, "kl": 0.0729217529296875, "learning_rate": 2.4954376471132937e-07, "loss": 0.0107, "reward": 0.07142857415601611, "reward_std": 0.046024113427847624, "rewards/accuracy_reward": 0.07142857415601611, "rewards/format_reward": 0.0, "step": 635 }, { "completion_length": 1880.488914489746, "epoch": 0.7599133746546187, "grad_norm": 0.1105286106467247, "kl": 0.06377410888671875, "learning_rate": 2.4814687143734957e-07, "loss": 0.0144, "reward": 0.0736607180442661, "reward_std": 0.03778616897761822, "rewards/accuracy_reward": 0.0736607180442661, "rewards/format_reward": 0.0, "step": 636 }, { "completion_length": 1843.1675109863281, "epoch": 0.7611082070047046, "grad_norm": 0.14759249985218048, "kl": 0.0691375732421875, "learning_rate": 2.467552463252804e-07, "loss": 0.0135, "reward": 0.11383929080329835, "reward_std": 0.04294108273461461, "rewards/accuracy_reward": 0.11383929080329835, "rewards/format_reward": 0.0, "step": 637 }, { "completion_length": 1831.3103408813477, "epoch": 0.7623030393547905, "grad_norm": 0.1330907791852951, "kl": 0.074310302734375, "learning_rate": 2.4536891366278307e-07, "loss": 0.0062, "reward": 0.053571431897580624, "reward_std": 0.019238398410379887, "rewards/accuracy_reward": 0.053571431897580624, "rewards/format_reward": 0.0, "step": 638 }, { "completion_length": 1923.3192901611328, "epoch": 0.7634978717048764, "grad_norm": 0.07484295219182968, "kl": 0.0805816650390625, "learning_rate": 2.439878976451507e-07, "loss": 0.0097, "reward": 0.06473214575089514, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.06473214575089514, "rewards/format_reward": 0.0, "step": 639 }, { "completion_length": 1891.2612533569336, "epoch": 0.7646927040549623, "grad_norm": 0.13395258784294128, "kl": 0.074188232421875, "learning_rate": 2.4261222237488613e-07, "loss": 0.0072, "reward": 0.0870535746216774, "reward_std": 0.03778616851195693, "rewards/accuracy_reward": 0.0870535746216774, "rewards/format_reward": 0.0, "step": 640 }, { "completion_length": 1912.136245727539, "epoch": 0.7658875364050481, "grad_norm": 0.09453363716602325, "kl": 0.0767822265625, "learning_rate": 2.412419118612822e-07, "loss": 0.0063, "reward": 0.0580357164144516, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.0580357164144516, "rewards/format_reward": 0.0, "step": 641 }, { "completion_length": 1853.1630096435547, "epoch": 0.767082368755134, "grad_norm": 0.09451920539140701, "kl": 0.0663909912109375, "learning_rate": 2.3987699002000167e-07, "loss": 0.0044, "reward": 0.06696428824216127, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.06696428824216127, "rewards/format_reward": 0.0, "step": 642 }, { "completion_length": 1857.446517944336, "epoch": 0.7682772011052199, "grad_norm": 0.34483471512794495, "kl": 0.0574798583984375, "learning_rate": 2.385174806726603e-07, "loss": 0.0179, "reward": 0.09821429010480642, "reward_std": 0.0549526852555573, "rewards/accuracy_reward": 0.09821429010480642, "rewards/format_reward": 0.0, "step": 643 }, { "completion_length": 1882.3215103149414, "epoch": 0.7694720334553058, "grad_norm": 0.14822633564472198, "kl": 0.063629150390625, "learning_rate": 2.3716340754641099e-07, "loss": 0.0042, "reward": 0.06250000256113708, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.06250000256113708, "rewards/format_reward": 0.0, "step": 644 }, { "completion_length": 1900.8103408813477, "epoch": 0.7706668658053917, "grad_norm": 0.0953778401017189, "kl": 0.065704345703125, "learning_rate": 2.3581479427353e-07, "loss": 0.0112, "reward": 0.06473214481957257, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.06473214481957257, "rewards/format_reward": 0.0, "step": 645 }, { "completion_length": 1915.68310546875, "epoch": 0.7718616981554776, "grad_norm": 0.05897451564669609, "kl": 0.0667877197265625, "learning_rate": 2.3447166439100384e-07, "loss": 0.0047, "reward": 0.07142857485450804, "reward_std": 0.023012056481093168, "rewards/accuracy_reward": 0.07142857485450804, "rewards/format_reward": 0.0, "step": 646 }, { "completion_length": 1874.714370727539, "epoch": 0.7730565305055634, "grad_norm": 0.06354082375764847, "kl": 0.08056640625, "learning_rate": 2.3313404134011932e-07, "loss": 0.0091, "reward": 0.0669642889406532, "reward_std": 0.023012056481093168, "rewards/accuracy_reward": 0.0669642889406532, "rewards/format_reward": 0.0, "step": 647 }, { "completion_length": 1893.5023193359375, "epoch": 0.7742513628556493, "grad_norm": 0.17876312136650085, "kl": 0.07623291015625, "learning_rate": 2.3180194846605364e-07, "loss": 0.0035, "reward": 0.06919643189758062, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "step": 648 }, { "completion_length": 1844.8281936645508, "epoch": 0.7754461952057352, "grad_norm": 0.17409731447696686, "kl": 0.0766754150390625, "learning_rate": 2.3047540901746739e-07, "loss": 0.0153, "reward": 0.06919643166474998, "reward_std": 0.05186965363100171, "rewards/accuracy_reward": 0.06919643166474998, "rewards/format_reward": 0.0, "step": 649 }, { "completion_length": 1874.4286499023438, "epoch": 0.7766410275558211, "grad_norm": 0.06004172936081886, "kl": 0.0654296875, "learning_rate": 2.2915444614609875e-07, "loss": 0.0067, "reward": 0.04687500209547579, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.04687500209547579, "rewards/format_reward": 0.0, "step": 650 }, { "completion_length": 1850.332664489746, "epoch": 0.777835859905907, "grad_norm": 0.10386177152395248, "kl": 0.0714874267578125, "learning_rate": 2.278390829063592e-07, "loss": 0.0032, "reward": 0.08482143236324191, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.0, "step": 651 }, { "completion_length": 1889.5848999023438, "epoch": 0.7790306922559929, "grad_norm": 0.14917847514152527, "kl": 0.0664215087890625, "learning_rate": 2.2652934225493188e-07, "loss": 0.0016, "reward": 0.12053571990691125, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.12053571990691125, "rewards/format_reward": 0.0, "step": 652 }, { "completion_length": 1856.955451965332, "epoch": 0.7802255246060787, "grad_norm": 0.09466379135847092, "kl": 0.0666961669921875, "learning_rate": 2.2522524705036964e-07, "loss": 0.0134, "reward": 0.06026786076836288, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.06026786076836288, "rewards/format_reward": 0.0, "step": 653 }, { "completion_length": 1865.196517944336, "epoch": 0.7814203569561646, "grad_norm": 0.09586599469184875, "kl": 0.05628204345703125, "learning_rate": 2.2392682005269765e-07, "loss": 0.0035, "reward": 0.05133928777649999, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.05133928777649999, "rewards/format_reward": 0.0, "step": 654 }, { "completion_length": 1886.2322235107422, "epoch": 0.7826151893062505, "grad_norm": 0.06425314396619797, "kl": 0.06714630126953125, "learning_rate": 2.2263408392301478e-07, "loss": 0.0044, "reward": 0.05357143096625805, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "step": 655 }, { "completion_length": 1820.4531936645508, "epoch": 0.7838100216563364, "grad_norm": 0.07938586175441742, "kl": 0.0651702880859375, "learning_rate": 2.2134706122309888e-07, "loss": 0.0124, "reward": 0.08035714807920158, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.08035714807920158, "rewards/format_reward": 0.0, "step": 656 }, { "completion_length": 1844.5090026855469, "epoch": 0.7850048540064222, "grad_norm": 0.07052425295114517, "kl": 0.0446319580078125, "learning_rate": 2.2006577441501288e-07, "loss": 0.0081, "reward": 0.06696428917348385, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "step": 657 }, { "completion_length": 1929.7210540771484, "epoch": 0.786199686356508, "grad_norm": 0.10864143073558807, "kl": 0.0627593994140625, "learning_rate": 2.1879024586071282e-07, "loss": 0.0128, "reward": 0.05580357485450804, "reward_std": 0.02508393907919526, "rewards/accuracy_reward": 0.05580357485450804, "rewards/format_reward": 0.0, "step": 658 }, { "completion_length": 1862.7567825317383, "epoch": 0.7873945187065939, "grad_norm": 0.10634759068489075, "kl": 0.062164306640625, "learning_rate": 2.1752049782165733e-07, "loss": 0.0134, "reward": 0.0647321455180645, "reward_std": 0.05941697163507342, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "step": 659 }, { "completion_length": 1880.0134735107422, "epoch": 0.7885893510566798, "grad_norm": 0.140049010515213, "kl": 0.05912017822265625, "learning_rate": 2.162565524584191e-07, "loss": 0.0097, "reward": 0.1004464328289032, "reward_std": 0.0491071455180645, "rewards/accuracy_reward": 0.1004464328289032, "rewards/format_reward": 0.0, "step": 660 }, { "completion_length": 1871.0491790771484, "epoch": 0.7897841834067657, "grad_norm": 0.07653600722551346, "kl": 0.0606842041015625, "learning_rate": 2.149984318302988e-07, "loss": 0.0102, "reward": 0.04910714435391128, "reward_std": 0.04464285960420966, "rewards/accuracy_reward": 0.04910714435391128, "rewards/format_reward": 0.0, "step": 661 }, { "completion_length": 1883.5491943359375, "epoch": 0.7909790157568516, "grad_norm": 0.19539529085159302, "kl": 0.063720703125, "learning_rate": 2.13746157894939e-07, "loss": 0.0116, "reward": 0.10267857671715319, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.10267857671715319, "rewards/format_reward": 0.0, "step": 662 }, { "completion_length": 1862.667495727539, "epoch": 0.7921738481069375, "grad_norm": 0.07275313138961792, "kl": 0.06131744384765625, "learning_rate": 2.1249975250794173e-07, "loss": 0.0116, "reward": 0.0937500037252903, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.0937500037252903, "rewards/format_reward": 0.0, "step": 663 }, { "completion_length": 1807.5223999023438, "epoch": 0.7933686804570234, "grad_norm": 0.08854836970567703, "kl": 0.064697265625, "learning_rate": 2.112592374224869e-07, "loss": 0.0079, "reward": 0.03794642956927419, "reward_std": 0.023702684324234724, "rewards/accuracy_reward": 0.03794642956927419, "rewards/format_reward": 0.0, "step": 664 }, { "completion_length": 1851.7857971191406, "epoch": 0.7945635128071092, "grad_norm": 0.11060748249292374, "kl": 0.0783538818359375, "learning_rate": 2.1002463428895248e-07, "loss": 0.0121, "reward": 0.06026786100119352, "reward_std": 0.03401251044124365, "rewards/accuracy_reward": 0.06026786100119352, "rewards/format_reward": 0.0, "step": 665 }, { "completion_length": 1837.0670471191406, "epoch": 0.7957583451571951, "grad_norm": 0.097548708319664, "kl": 0.077972412109375, "learning_rate": 2.0879596465453654e-07, "loss": 0.0105, "reward": 0.07589286030270159, "reward_std": 0.04086920106783509, "rewards/accuracy_reward": 0.07589286030270159, "rewards/format_reward": 0.0, "step": 666 }, { "completion_length": 1839.9353408813477, "epoch": 0.796953177507281, "grad_norm": 0.0729847401380539, "kl": 0.0687408447265625, "learning_rate": 2.0757324996288183e-07, "loss": 0.0069, "reward": 0.06473214505240321, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.06473214505240321, "rewards/format_reward": 0.0, "step": 667 }, { "completion_length": 1852.5826721191406, "epoch": 0.7981480098573669, "grad_norm": 0.07494714856147766, "kl": 0.0789337158203125, "learning_rate": 2.063565115537006e-07, "loss": 0.0106, "reward": 0.05580357345752418, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.05580357345752418, "rewards/format_reward": 0.0, "step": 668 }, { "completion_length": 1878.671974182129, "epoch": 0.7993428422074528, "grad_norm": 0.05142665654420853, "kl": 0.0682525634765625, "learning_rate": 2.0514577066240286e-07, "loss": 0.006, "reward": 0.08482143376022577, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.08482143376022577, "rewards/format_reward": 0.0, "step": 669 }, { "completion_length": 1849.160789489746, "epoch": 0.8005376745575387, "grad_norm": 0.1327022910118103, "kl": 0.07177734375, "learning_rate": 2.0394104841972566e-07, "loss": 0.0092, "reward": 0.06919643166474998, "reward_std": 0.034012510906904936, "rewards/accuracy_reward": 0.06919643166474998, "rewards/format_reward": 0.0, "step": 670 }, { "completion_length": 1843.4465255737305, "epoch": 0.8017325069076245, "grad_norm": 0.08175896108150482, "kl": 0.072021484375, "learning_rate": 2.02742365851364e-07, "loss": 0.0093, "reward": 0.06250000302679837, "reward_std": 0.03332188352942467, "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, "step": 671 }, { "completion_length": 1862.8527526855469, "epoch": 0.8029273392577104, "grad_norm": 0.1511981189250946, "kl": 0.0692901611328125, "learning_rate": 2.01549743877604e-07, "loss": 0.0192, "reward": 0.09821429150179029, "reward_std": 0.04464285960420966, "rewards/accuracy_reward": 0.09821429150179029, "rewards/format_reward": 0.0, "step": 672 }, { "completion_length": 1878.292495727539, "epoch": 0.8041221716077963, "grad_norm": 0.08222120255231857, "kl": 0.074859619140625, "learning_rate": 2.0036320331295798e-07, "loss": 0.0083, "reward": 0.07812500419095159, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, "step": 673 }, { "completion_length": 1851.8081130981445, "epoch": 0.8053170039578822, "grad_norm": 0.06467738747596741, "kl": 0.06224822998046875, "learning_rate": 1.9918276486580117e-07, "loss": 0.0083, "reward": 0.06250000256113708, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.06250000256113708, "rewards/format_reward": 0.0, "step": 674 }, { "completion_length": 1836.102767944336, "epoch": 0.8065118363079681, "grad_norm": 0.13374808430671692, "kl": 0.078369140625, "learning_rate": 1.9800844913800985e-07, "loss": 0.0073, "reward": 0.111607147147879, "reward_std": 0.05633393954485655, "rewards/accuracy_reward": 0.111607147147879, "rewards/format_reward": 0.0, "step": 675 }, { "completion_length": 1862.4554595947266, "epoch": 0.807706668658054, "grad_norm": 0.10425970703363419, "kl": 0.0815277099609375, "learning_rate": 1.9684027662460257e-07, "loss": 0.0131, "reward": 0.09151786169968545, "reward_std": 0.03778616897761822, "rewards/accuracy_reward": 0.09151786169968545, "rewards/format_reward": 0.0, "step": 676 }, { "completion_length": 1839.493392944336, "epoch": 0.8089015010081398, "grad_norm": 0.2592446208000183, "kl": 0.078887939453125, "learning_rate": 1.9567826771338153e-07, "loss": 0.0173, "reward": 0.07812500325962901, "reward_std": 0.05978707689791918, "rewards/accuracy_reward": 0.07812500325962901, "rewards/format_reward": 0.0, "step": 677 }, { "completion_length": 1853.4911499023438, "epoch": 0.8100963333582257, "grad_norm": 0.09625484049320221, "kl": 0.076171875, "learning_rate": 1.9452244268457742e-07, "loss": 0.0072, "reward": 0.03125000186264515, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.03125000186264515, "rewards/format_reward": 0.0, "step": 678 }, { "completion_length": 1823.6295547485352, "epoch": 0.8112911657083115, "grad_norm": 0.07832616567611694, "kl": 0.0686798095703125, "learning_rate": 1.9337282171049542e-07, "loss": 0.0099, "reward": 0.08705357555299997, "reward_std": 0.04533348651602864, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, "step": 679 }, { "completion_length": 1804.4889068603516, "epoch": 0.8124859980583974, "grad_norm": 0.06921615451574326, "kl": 0.083984375, "learning_rate": 1.9222942485516263e-07, "loss": 0.0131, "reward": 0.06919643189758062, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "step": 680 }, { "completion_length": 1801.7143630981445, "epoch": 0.8136808304084833, "grad_norm": 0.07748515903949738, "kl": 0.0786285400390625, "learning_rate": 1.910922720739789e-07, "loss": 0.0145, "reward": 0.10044643236324191, "reward_std": 0.05564331263303757, "rewards/accuracy_reward": 0.10044643236324191, "rewards/format_reward": 0.0, "step": 681 }, { "completion_length": 1882.5715103149414, "epoch": 0.8148756627585692, "grad_norm": 0.06931213289499283, "kl": 0.06304931640625, "learning_rate": 1.899613832133672e-07, "loss": 0.0117, "reward": 0.06696428870782256, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.06696428870782256, "rewards/format_reward": 0.0, "step": 682 }, { "completion_length": 1852.6875762939453, "epoch": 0.816070495108655, "grad_norm": 0.10315227508544922, "kl": 0.06780242919921875, "learning_rate": 1.8883677801042856e-07, "loss": 0.0086, "reward": 0.06250000325962901, "reward_std": 0.042250454891473055, "rewards/accuracy_reward": 0.06250000325962901, "rewards/format_reward": 0.0, "step": 683 }, { "completion_length": 1831.9442977905273, "epoch": 0.8172653274587409, "grad_norm": 0.08957939594984055, "kl": 0.06207275390625, "learning_rate": 1.8771847609259674e-07, "loss": 0.0095, "reward": 0.06473214621655643, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.06473214621655643, "rewards/format_reward": 0.0, "step": 684 }, { "completion_length": 1879.0157165527344, "epoch": 0.8184601598088268, "grad_norm": 0.06882953643798828, "kl": 0.0618896484375, "learning_rate": 1.8660649697729586e-07, "loss": 0.0074, "reward": 0.046875002793967724, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.046875002793967724, "rewards/format_reward": 0.0, "step": 685 }, { "completion_length": 1878.5915985107422, "epoch": 0.8196549921589127, "grad_norm": 0.08852121233940125, "kl": 0.058563232421875, "learning_rate": 1.8550086007160008e-07, "loss": 0.0072, "reward": 0.049107145285233855, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.049107145285233855, "rewards/format_reward": 0.0, "step": 686 }, { "completion_length": 1842.7366943359375, "epoch": 0.8208498245089986, "grad_norm": 0.10435935109853745, "kl": 0.070068359375, "learning_rate": 1.8440158467189437e-07, "loss": 0.0043, "reward": 0.06696429010480642, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.06696429010480642, "rewards/format_reward": 0.0, "step": 687 }, { "completion_length": 1813.8282012939453, "epoch": 0.8220446568590845, "grad_norm": 0.14258939027786255, "kl": 0.0782470703125, "learning_rate": 1.833086899635385e-07, "loss": 0.0133, "reward": 0.035714287078008056, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.035714287078008056, "rewards/format_reward": 0.0, "step": 688 }, { "completion_length": 1878.1139297485352, "epoch": 0.8232394892091703, "grad_norm": 0.09881190955638885, "kl": 0.0817718505859375, "learning_rate": 1.8222219502053133e-07, "loss": 0.0087, "reward": 0.04910714505240321, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, "step": 689 }, { "completion_length": 1803.0179443359375, "epoch": 0.8244343215592562, "grad_norm": 0.11943608522415161, "kl": 0.0895233154296875, "learning_rate": 1.8114211880517853e-07, "loss": 0.0135, "reward": 0.082589291036129, "reward_std": 0.04671474080532789, "rewards/accuracy_reward": 0.082589291036129, "rewards/format_reward": 0.0, "step": 690 }, { "completion_length": 1866.1853561401367, "epoch": 0.8256291539093421, "grad_norm": 0.1004241332411766, "kl": 0.071197509765625, "learning_rate": 1.8006848016776146e-07, "loss": 0.0127, "reward": 0.06026786030270159, "reward_std": 0.02885759761556983, "rewards/accuracy_reward": 0.06026786030270159, "rewards/format_reward": 0.0, "step": 691 }, { "completion_length": 1815.886245727539, "epoch": 0.826823986259428, "grad_norm": 0.09816662222146988, "kl": 0.07293701171875, "learning_rate": 1.7900129784620796e-07, "loss": 0.0091, "reward": 0.04464285937137902, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.04464285937137902, "rewards/format_reward": 0.0, "step": 692 }, { "completion_length": 1838.642951965332, "epoch": 0.8280188186095139, "grad_norm": 0.08755306154489517, "kl": 0.0870361328125, "learning_rate": 1.7794059046576554e-07, "loss": 0.0106, "reward": 0.09375000419095159, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.09375000419095159, "rewards/format_reward": 0.0, "step": 693 }, { "completion_length": 1831.5023345947266, "epoch": 0.8292136509595998, "grad_norm": 0.20488619804382324, "kl": 0.0922698974609375, "learning_rate": 1.768863765386766e-07, "loss": 0.0116, "reward": 0.11160714807920158, "reward_std": 0.07796474266797304, "rewards/accuracy_reward": 0.11160714807920158, "rewards/format_reward": 0.0, "step": 694 }, { "completion_length": 1824.2634735107422, "epoch": 0.8304084833096856, "grad_norm": 0.16928982734680176, "kl": 0.090057373046875, "learning_rate": 1.758386744638546e-07, "loss": 0.0146, "reward": 0.06250000232830644, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.06250000232830644, "rewards/format_reward": 0.0, "step": 695 }, { "completion_length": 1891.723289489746, "epoch": 0.8316033156597715, "grad_norm": 0.07520829141139984, "kl": 0.0862274169921875, "learning_rate": 1.7479750252656385e-07, "loss": 0.0106, "reward": 0.05803571781143546, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.05803571781143546, "rewards/format_reward": 0.0, "step": 696 }, { "completion_length": 1839.7991790771484, "epoch": 0.8327981480098574, "grad_norm": 0.11534183472394943, "kl": 0.0875091552734375, "learning_rate": 1.7376287889809956e-07, "loss": 0.0116, "reward": 0.07812500419095159, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, "step": 697 }, { "completion_length": 1833.957664489746, "epoch": 0.8339929803599433, "grad_norm": 0.14332051575183868, "kl": 0.0828399658203125, "learning_rate": 1.7273482163547104e-07, "loss": 0.0153, "reward": 0.08482143189758062, "reward_std": 0.05633393954485655, "rewards/accuracy_reward": 0.08482143189758062, "rewards/format_reward": 0.0, "step": 698 }, { "completion_length": 1872.354995727539, "epoch": 0.8351878127100292, "grad_norm": 0.14433005452156067, "kl": 0.07476806640625, "learning_rate": 1.7171334868108695e-07, "loss": 0.0133, "reward": 0.08482143259607255, "reward_std": 0.04602411389350891, "rewards/accuracy_reward": 0.08482143259607255, "rewards/format_reward": 0.0, "step": 699 }, { "completion_length": 1813.477767944336, "epoch": 0.836382645060115, "grad_norm": 0.21457916498184204, "kl": 0.0898284912109375, "learning_rate": 1.7069847786244134e-07, "loss": 0.0256, "reward": 0.09375000349245965, "reward_std": 0.06526251137256622, "rewards/accuracy_reward": 0.09375000349245965, "rewards/format_reward": 0.0, "step": 700 }, { "completion_length": 1803.3170547485352, "epoch": 0.8375774774102008, "grad_norm": 0.12451878935098648, "kl": 0.1056365966796875, "learning_rate": 1.6969022689180325e-07, "loss": 0.0105, "reward": 0.13169643422588706, "reward_std": 0.051869654096663, "rewards/accuracy_reward": 0.13169643422588706, "rewards/format_reward": 0.0, "step": 701 }, { "completion_length": 1813.1250915527344, "epoch": 0.8387723097602867, "grad_norm": 0.1907883584499359, "kl": 0.121429443359375, "learning_rate": 1.6868861336590702e-07, "loss": 0.0134, "reward": 0.08705357555299997, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.08705357555299997, "rewards/format_reward": 0.0, "step": 702 }, { "completion_length": 1800.7032089233398, "epoch": 0.8399671421103726, "grad_norm": 0.10976855456829071, "kl": 0.08660888671875, "learning_rate": 1.676936547656458e-07, "loss": 0.0137, "reward": 0.1004464344587177, "reward_std": 0.037786169443279505, "rewards/accuracy_reward": 0.1004464344587177, "rewards/format_reward": 0.0, "step": 703 }, { "completion_length": 1808.9889221191406, "epoch": 0.8411619744604585, "grad_norm": 0.09750930964946747, "kl": 0.1032562255859375, "learning_rate": 1.667053684557657e-07, "loss": 0.0093, "reward": 0.07142857392318547, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, "step": 704 }, { "completion_length": 1833.2121276855469, "epoch": 0.8423568068105444, "grad_norm": 0.08384870737791061, "kl": 0.08144378662109375, "learning_rate": 1.6572377168456353e-07, "loss": 0.0095, "reward": 0.05803571711294353, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.05803571711294353, "rewards/format_reward": 0.0, "step": 705 }, { "completion_length": 1819.0826950073242, "epoch": 0.8435516391606303, "grad_norm": 0.1823926568031311, "kl": 0.0973663330078125, "learning_rate": 1.6474888158358511e-07, "loss": 0.0146, "reward": 0.11830357927829027, "reward_std": 0.06079822592437267, "rewards/accuracy_reward": 0.11830357927829027, "rewards/format_reward": 0.0, "step": 706 }, { "completion_length": 1788.1630477905273, "epoch": 0.8447464715107161, "grad_norm": 0.4205247759819031, "kl": 0.0965423583984375, "learning_rate": 1.6378071516732652e-07, "loss": 0.0313, "reward": 0.12276786030270159, "reward_std": 0.08896519662812352, "rewards/accuracy_reward": 0.12276786030270159, "rewards/format_reward": 0.0, "step": 707 }, { "completion_length": 1791.0045471191406, "epoch": 0.845941303860802, "grad_norm": 0.10521113127470016, "kl": 0.08575439453125, "learning_rate": 1.6281928933293738e-07, "loss": 0.0101, "reward": 0.08928571850992739, "reward_std": 0.019238398410379887, "rewards/accuracy_reward": 0.08928571850992739, "rewards/format_reward": 0.0, "step": 708 }, { "completion_length": 1794.1027603149414, "epoch": 0.8471361362108879, "grad_norm": 0.12744131684303284, "kl": 0.09134674072265625, "learning_rate": 1.618646208599254e-07, "loss": 0.0106, "reward": 0.08035714761354029, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.08035714761354029, "rewards/format_reward": 0.0, "step": 709 }, { "completion_length": 1816.4063415527344, "epoch": 0.8483309685609738, "grad_norm": 0.1666640043258667, "kl": 0.107452392578125, "learning_rate": 1.609167264098643e-07, "loss": 0.0152, "reward": 0.10491071944124997, "reward_std": 0.05941697070375085, "rewards/accuracy_reward": 0.10491071944124997, "rewards/format_reward": 0.0, "step": 710 }, { "completion_length": 1812.1161499023438, "epoch": 0.8495258009110597, "grad_norm": 0.12624943256378174, "kl": 0.1035614013671875, "learning_rate": 1.599756225261022e-07, "loss": 0.015, "reward": 0.1250000053551048, "reward_std": 0.056333940010517836, "rewards/accuracy_reward": 0.1250000053551048, "rewards/format_reward": 0.0, "step": 711 }, { "completion_length": 1791.3661651611328, "epoch": 0.8507206332611456, "grad_norm": 0.23463799059391022, "kl": 0.1070556640625, "learning_rate": 1.590413256334736e-07, "loss": 0.0177, "reward": 0.06919643329456449, "reward_std": 0.07865536911413074, "rewards/accuracy_reward": 0.06919643329456449, "rewards/format_reward": 0.0, "step": 712 }, { "completion_length": 1768.9598922729492, "epoch": 0.8519154656112314, "grad_norm": 0.14523281157016754, "kl": 0.0947418212890625, "learning_rate": 1.5811385203801217e-07, "loss": 0.017, "reward": 0.13392857648432255, "reward_std": 0.05117902671918273, "rewards/accuracy_reward": 0.13392857648432255, "rewards/format_reward": 0.0, "step": 713 }, { "completion_length": 1807.9598999023438, "epoch": 0.8531102979613173, "grad_norm": 0.11095204204320908, "kl": 0.1065826416015625, "learning_rate": 1.5719321792666653e-07, "loss": 0.0188, "reward": 0.095982147147879, "reward_std": 0.04294108273461461, "rewards/accuracy_reward": 0.095982147147879, "rewards/format_reward": 0.0, "step": 714 }, { "completion_length": 1797.21435546875, "epoch": 0.8543051303114032, "grad_norm": 0.1482749730348587, "kl": 0.1114654541015625, "learning_rate": 1.5627943936701752e-07, "loss": 0.01, "reward": 0.06919643236324191, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.06919643236324191, "rewards/format_reward": 0.0, "step": 715 }, { "completion_length": 1820.8594665527344, "epoch": 0.8554999626614891, "grad_norm": 0.1770404577255249, "kl": 0.1291046142578125, "learning_rate": 1.5537253230699782e-07, "loss": 0.0141, "reward": 0.07366071664728224, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.07366071664728224, "rewards/format_reward": 0.0, "step": 716 }, { "completion_length": 1799.9063262939453, "epoch": 0.856694795011575, "grad_norm": 0.1669173240661621, "kl": 0.1253662109375, "learning_rate": 1.5447251257461398e-07, "loss": 0.0142, "reward": 0.09375000395812094, "reward_std": 0.03332188306376338, "rewards/accuracy_reward": 0.09375000395812094, "rewards/format_reward": 0.0, "step": 717 }, { "completion_length": 1747.5357971191406, "epoch": 0.8578896273616609, "grad_norm": 0.09901155531406403, "kl": 0.105712890625, "learning_rate": 1.535793958776695e-07, "loss": 0.0156, "reward": 0.08035714598372579, "reward_std": 0.0652625123038888, "rewards/accuracy_reward": 0.08035714598372579, "rewards/format_reward": 0.0, "step": 718 }, { "completion_length": 1795.82373046875, "epoch": 0.8590844597117467, "grad_norm": 0.22439634799957275, "kl": 0.114013671875, "learning_rate": 1.5269319780349127e-07, "loss": 0.0053, "reward": 0.07142857392318547, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.07142857392318547, "rewards/format_reward": 0.0, "step": 719 }, { "completion_length": 1776.8215255737305, "epoch": 0.8602792920618326, "grad_norm": 0.16996550559997559, "kl": 0.13824462890625, "learning_rate": 1.5181393381865714e-07, "loss": 0.0188, "reward": 0.11383929126895964, "reward_std": 0.046714740339666605, "rewards/accuracy_reward": 0.11383929126895964, "rewards/format_reward": 0.0, "step": 720 }, { "completion_length": 1788.861701965332, "epoch": 0.8614741244119185, "grad_norm": 0.12506355345249176, "kl": 0.1369171142578125, "learning_rate": 1.509416192687264e-07, "loss": 0.018, "reward": 0.05133928870782256, "reward_std": 0.03640491468831897, "rewards/accuracy_reward": 0.05133928870782256, "rewards/format_reward": 0.0, "step": 721 }, { "completion_length": 1718.7857971191406, "epoch": 0.8626689567620043, "grad_norm": 0.1570463627576828, "kl": 0.1433258056640625, "learning_rate": 1.500762693779713e-07, "loss": 0.0125, "reward": 0.08705357578583062, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.08705357578583062, "rewards/format_reward": 0.0, "step": 722 }, { "completion_length": 1778.0380249023438, "epoch": 0.8638637891120902, "grad_norm": 0.20148339867591858, "kl": 0.142486572265625, "learning_rate": 1.4921789924911232e-07, "loss": 0.017, "reward": 0.07589286123402417, "reward_std": 0.05633394047617912, "rewards/accuracy_reward": 0.07589286123402417, "rewards/format_reward": 0.0, "step": 723 }, { "completion_length": 1769.5714874267578, "epoch": 0.865058621462176, "grad_norm": 0.09923796355724335, "kl": 0.11907958984375, "learning_rate": 1.4836652386305347e-07, "loss": 0.0177, "reward": 0.0647321455180645, "reward_std": 0.05048839934170246, "rewards/accuracy_reward": 0.0647321455180645, "rewards/format_reward": 0.0, "step": 724 }, { "completion_length": 1747.0603485107422, "epoch": 0.8662534538122619, "grad_norm": 0.18696816265583038, "kl": 0.129608154296875, "learning_rate": 1.4752215807862174e-07, "loss": 0.0116, "reward": 0.08035714691504836, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.08035714691504836, "rewards/format_reward": 0.0, "step": 725 }, { "completion_length": 1757.442039489746, "epoch": 0.8674482861623478, "grad_norm": 0.1604207456111908, "kl": 0.146820068359375, "learning_rate": 1.466848166323073e-07, "loss": 0.0182, "reward": 0.08705357601866126, "reward_std": 0.05564331263303757, "rewards/accuracy_reward": 0.08705357601866126, "rewards/format_reward": 0.0, "step": 726 }, { "completion_length": 1793.7947158813477, "epoch": 0.8686431185124337, "grad_norm": 0.26080793142318726, "kl": 0.133941650390625, "learning_rate": 1.458545141380065e-07, "loss": -0.0005, "reward": 0.06250000256113708, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.06250000256113708, "rewards/format_reward": 0.0, "step": 727 }, { "completion_length": 1753.495620727539, "epoch": 0.8698379508625196, "grad_norm": 0.3480418026447296, "kl": 0.106689453125, "learning_rate": 1.450312650867665e-07, "loss": 0.0054, "reward": 0.07366071781143546, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.07366071781143546, "rewards/format_reward": 0.0, "step": 728 }, { "completion_length": 1727.4732971191406, "epoch": 0.8710327832126055, "grad_norm": 0.2691614031791687, "kl": 0.10601806640625, "learning_rate": 1.4421508384653297e-07, "loss": 0.0091, "reward": 0.09151786263100803, "reward_std": 0.046714740339666605, "rewards/accuracy_reward": 0.09151786263100803, "rewards/format_reward": 0.0, "step": 729 }, { "completion_length": 1826.1786422729492, "epoch": 0.8722276155626913, "grad_norm": 0.15194325149059296, "kl": 0.1133880615234375, "learning_rate": 1.4340598466189865e-07, "loss": 0.0088, "reward": 0.06919643166474998, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.06919643166474998, "rewards/format_reward": 0.0, "step": 730 }, { "completion_length": 1774.0536499023438, "epoch": 0.8734224479127772, "grad_norm": 0.13060227036476135, "kl": 0.1119537353515625, "learning_rate": 1.4260398165385518e-07, "loss": 0.0067, "reward": 0.07812500512227416, "reward_std": 0.04155982844531536, "rewards/accuracy_reward": 0.07812500512227416, "rewards/format_reward": 0.0, "step": 731 }, { "completion_length": 1733.7947311401367, "epoch": 0.8746172802628631, "grad_norm": 0.11781053990125656, "kl": 0.109588623046875, "learning_rate": 1.4180908881954667e-07, "loss": 0.0126, "reward": 0.07812500302679837, "reward_std": 0.05048839934170246, "rewards/accuracy_reward": 0.07812500302679837, "rewards/format_reward": 0.0, "step": 732 }, { "completion_length": 1737.3817825317383, "epoch": 0.875812112612949, "grad_norm": 0.11595380306243896, "kl": 0.0977325439453125, "learning_rate": 1.4102132003202507e-07, "loss": 0.0167, "reward": 0.09151786146685481, "reward_std": 0.057024567387998104, "rewards/accuracy_reward": 0.09151786146685481, "rewards/format_reward": 0.0, "step": 733 }, { "completion_length": 1789.4822387695312, "epoch": 0.8770069449630349, "grad_norm": 0.25030437111854553, "kl": 0.1036224365234375, "learning_rate": 1.4024068904000817e-07, "loss": 0.0168, "reward": 0.09598214854486287, "reward_std": 0.07865536864846945, "rewards/accuracy_reward": 0.09598214854486287, "rewards/format_reward": 0.0, "step": 734 }, { "completion_length": 1834.7634735107422, "epoch": 0.8782017773131208, "grad_norm": 0.13520805537700653, "kl": 0.08709716796875, "learning_rate": 1.394672094676401e-07, "loss": 0.005, "reward": 0.033482144586741924, "reward_std": 0.009619198739528656, "rewards/accuracy_reward": 0.033482144586741924, "rewards/format_reward": 0.0, "step": 735 }, { "completion_length": 1762.9398193359375, "epoch": 0.8793966096632067, "grad_norm": 0.10127894580364227, "kl": 0.091278076171875, "learning_rate": 1.3870089481425278e-07, "loss": 0.0051, "reward": 0.11383929057046771, "reward_std": 0.045333486050367355, "rewards/accuracy_reward": 0.11383929057046771, "rewards/format_reward": 0.0, "step": 736 }, { "completion_length": 1799.5447311401367, "epoch": 0.8805914420132925, "grad_norm": 0.21716977655887604, "kl": 0.079742431640625, "learning_rate": 1.3794175845413097e-07, "loss": 0.0113, "reward": 0.07142857485450804, "reward_std": 0.04602411389350891, "rewards/accuracy_reward": 0.07142857485450804, "rewards/format_reward": 0.0, "step": 737 }, { "completion_length": 1776.7433853149414, "epoch": 0.8817862743633784, "grad_norm": 0.17076952755451202, "kl": 0.08953094482421875, "learning_rate": 1.3718981363627832e-07, "loss": 0.0136, "reward": 0.08482143166474998, "reward_std": 0.04979777242988348, "rewards/accuracy_reward": 0.08482143166474998, "rewards/format_reward": 0.0, "step": 738 }, { "completion_length": 1785.799186706543, "epoch": 0.8829811067134643, "grad_norm": 0.11073677986860275, "kl": 0.0908050537109375, "learning_rate": 1.3644507348418683e-07, "loss": 0.0111, "reward": 0.06696428847499192, "reward_std": 0.04086920106783509, "rewards/accuracy_reward": 0.06696428847499192, "rewards/format_reward": 0.0, "step": 739 }, { "completion_length": 1826.38623046875, "epoch": 0.8841759390635502, "grad_norm": 0.2149447798728943, "kl": 0.0965118408203125, "learning_rate": 1.35707550995607e-07, "loss": 0.0165, "reward": 0.08258929010480642, "reward_std": 0.06696428870782256, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, "step": 740 }, { "completion_length": 1793.399642944336, "epoch": 0.8853707714136361, "grad_norm": 0.1412253975868225, "kl": 0.0906219482421875, "learning_rate": 1.349772590423217e-07, "loss": 0.0102, "reward": 0.10044643469154835, "reward_std": 0.04017857322469354, "rewards/accuracy_reward": 0.10044643469154835, "rewards/format_reward": 0.0, "step": 741 }, { "completion_length": 1802.7076797485352, "epoch": 0.886565603763722, "grad_norm": 0.16085264086723328, "kl": 0.1032257080078125, "learning_rate": 1.3425421036992097e-07, "loss": 0.0066, "reward": 0.06473214528523386, "reward_std": 0.03401251044124365, "rewards/accuracy_reward": 0.06473214528523386, "rewards/format_reward": 0.0, "step": 742 }, { "completion_length": 1791.6072235107422, "epoch": 0.8877604361138077, "grad_norm": 0.1573924571275711, "kl": 0.0871124267578125, "learning_rate": 1.3353841759757986e-07, "loss": 0.0087, "reward": 0.042410716181620955, "reward_std": 0.009619199205189943, "rewards/accuracy_reward": 0.042410716181620955, "rewards/format_reward": 0.0, "step": 743 }, { "completion_length": 1732.6116790771484, "epoch": 0.8889552684638936, "grad_norm": 0.14577293395996094, "kl": 0.0960693359375, "learning_rate": 1.328298932178382e-07, "loss": 0.0164, "reward": 0.13616071757860482, "reward_std": 0.059416971169412136, "rewards/accuracy_reward": 0.13616071757860482, "rewards/format_reward": 0.0, "step": 744 }, { "completion_length": 1851.634017944336, "epoch": 0.8901501008139795, "grad_norm": 0.26097917556762695, "kl": 0.089569091796875, "learning_rate": 1.3212864959638235e-07, "loss": 0.0165, "reward": 0.10267857811413705, "reward_std": 0.06148885330185294, "rewards/accuracy_reward": 0.10267857811413705, "rewards/format_reward": 0.0, "step": 745 }, { "completion_length": 1825.1541137695312, "epoch": 0.8913449331640654, "grad_norm": 0.14696398377418518, "kl": 0.0927886962890625, "learning_rate": 1.3143469897182983e-07, "loss": 0.0114, "reward": 0.042410716880112886, "reward_std": 0.04017857322469354, "rewards/accuracy_reward": 0.042410716880112886, "rewards/format_reward": 0.0, "step": 746 }, { "completion_length": 1772.080451965332, "epoch": 0.8925397655141513, "grad_norm": 0.10673752427101135, "kl": 0.0867767333984375, "learning_rate": 1.3074805345551506e-07, "loss": 0.0187, "reward": 0.06696428917348385, "reward_std": 0.060107598546892405, "rewards/accuracy_reward": 0.06696428917348385, "rewards/format_reward": 0.0, "step": 747 }, { "completion_length": 1778.7947235107422, "epoch": 0.8937345978642371, "grad_norm": 0.13511167466640472, "kl": 0.0764923095703125, "learning_rate": 1.3006872503127887e-07, "loss": 0.0051, "reward": 0.08035714621655643, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.08035714621655643, "rewards/format_reward": 0.0, "step": 748 }, { "completion_length": 1792.1875839233398, "epoch": 0.894929430214323, "grad_norm": 0.09044623374938965, "kl": 0.09295654296875, "learning_rate": 1.293967255552583e-07, "loss": 0.0147, "reward": 0.04910714575089514, "reward_std": 0.044642859138548374, "rewards/accuracy_reward": 0.04910714575089514, "rewards/format_reward": 0.0, "step": 749 }, { "completion_length": 1798.7813339233398, "epoch": 0.8961242625644089, "grad_norm": 0.09687493741512299, "kl": 0.0916748046875, "learning_rate": 1.2873206675568052e-07, "loss": 0.011, "reward": 0.04687500256113708, "reward_std": 0.028857596684247255, "rewards/accuracy_reward": 0.04687500256113708, "rewards/format_reward": 0.0, "step": 750 }, { "completion_length": 1820.2657165527344, "epoch": 0.8973190949144948, "grad_norm": 0.11097890138626099, "kl": 0.0830230712890625, "learning_rate": 1.2807476023265773e-07, "loss": 0.0082, "reward": 0.06250000349245965, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.06250000349245965, "rewards/format_reward": 0.0, "step": 751 }, { "completion_length": 1768.3103790283203, "epoch": 0.8985139272645807, "grad_norm": 0.11973078548908234, "kl": 0.0858001708984375, "learning_rate": 1.2742481745798493e-07, "loss": 0.0107, "reward": 0.07366071897558868, "reward_std": 0.03640491422265768, "rewards/accuracy_reward": 0.07366071897558868, "rewards/format_reward": 0.0, "step": 752 }, { "completion_length": 1829.7880249023438, "epoch": 0.8997087596146666, "grad_norm": 0.10984499752521515, "kl": 0.08251953125, "learning_rate": 1.2678224977493954e-07, "loss": 0.0069, "reward": 0.058035716880112886, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.058035716880112886, "rewards/format_reward": 0.0, "step": 753 }, { "completion_length": 1805.7121353149414, "epoch": 0.9009035919647524, "grad_norm": 0.16038893163204193, "kl": 0.0825347900390625, "learning_rate": 1.2614706839808328e-07, "loss": 0.013, "reward": 0.10491072037257254, "reward_std": 0.05564331216737628, "rewards/accuracy_reward": 0.10491072037257254, "rewards/format_reward": 0.0, "step": 754 }, { "completion_length": 1795.3371200561523, "epoch": 0.9020984243148383, "grad_norm": 0.11312306672334671, "kl": 0.10943603515625, "learning_rate": 1.2551928441306697e-07, "loss": 0.013, "reward": 0.10937500488944352, "reward_std": 0.05564331263303757, "rewards/accuracy_reward": 0.10937500488944352, "rewards/format_reward": 0.0, "step": 755 }, { "completion_length": 1803.8103485107422, "epoch": 0.9032932566649242, "grad_norm": 0.13881947100162506, "kl": 0.083465576171875, "learning_rate": 1.2489890877643659e-07, "loss": 0.01, "reward": 0.0848214328289032, "reward_std": 0.05633393954485655, "rewards/accuracy_reward": 0.0848214328289032, "rewards/format_reward": 0.0, "step": 756 }, { "completion_length": 1819.3616943359375, "epoch": 0.9044880890150101, "grad_norm": 0.11652617901563644, "kl": 0.0882568359375, "learning_rate": 1.2428595231544202e-07, "loss": 0.0115, "reward": 0.060267859837040305, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.060267859837040305, "rewards/format_reward": 0.0, "step": 757 }, { "completion_length": 1822.964370727539, "epoch": 0.905682921365096, "grad_norm": 0.1365566998720169, "kl": 0.084808349609375, "learning_rate": 1.2368042572784862e-07, "loss": 0.003, "reward": 0.06250000302679837, "reward_std": 0.024393311236053705, "rewards/accuracy_reward": 0.06250000302679837, "rewards/format_reward": 0.0, "step": 758 }, { "completion_length": 1808.2009811401367, "epoch": 0.9068777537151819, "grad_norm": 0.11313049495220184, "kl": 0.0931243896484375, "learning_rate": 1.2308233958174985e-07, "loss": 0.015, "reward": 0.07812500419095159, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, "step": 759 }, { "completion_length": 1790.2947235107422, "epoch": 0.9080725860652678, "grad_norm": 0.1415162980556488, "kl": 0.097869873046875, "learning_rate": 1.2249170431538318e-07, "loss": 0.0122, "reward": 0.09821428987197578, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.09821428987197578, "rewards/format_reward": 0.0, "step": 760 }, { "completion_length": 1810.9286422729492, "epoch": 0.9092674184153536, "grad_norm": 0.11718883365392685, "kl": 0.084564208984375, "learning_rate": 1.2190853023694807e-07, "loss": 0.0134, "reward": 0.09598214668221772, "reward_std": 0.036404915153980255, "rewards/accuracy_reward": 0.09598214668221772, "rewards/format_reward": 0.0, "step": 761 }, { "completion_length": 1876.0603485107422, "epoch": 0.9104622507654395, "grad_norm": 0.09497907757759094, "kl": 0.084564208984375, "learning_rate": 1.2133282752442565e-07, "loss": 0.011, "reward": 0.05133928754366934, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.05133928754366934, "rewards/format_reward": 0.0, "step": 762 }, { "completion_length": 1861.6875762939453, "epoch": 0.9116570831155254, "grad_norm": 0.09009799361228943, "kl": 0.087646484375, "learning_rate": 1.2076460622540127e-07, "loss": 0.0086, "reward": 0.06919643189758062, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "step": 763 }, { "completion_length": 1847.4040908813477, "epoch": 0.9128519154656113, "grad_norm": 0.11033658683300018, "kl": 0.08758544921875, "learning_rate": 1.202038762568894e-07, "loss": 0.0073, "reward": 0.10714286169968545, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.10714286169968545, "rewards/format_reward": 0.0, "step": 764 }, { "completion_length": 1799.7902526855469, "epoch": 0.9140467478156971, "grad_norm": 0.1375969499349594, "kl": 0.076263427734375, "learning_rate": 1.1965064740516017e-07, "loss": 0.0102, "reward": 0.07812500442378223, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.07812500442378223, "rewards/format_reward": 0.0, "step": 765 }, { "completion_length": 1785.1429443359375, "epoch": 0.9152415801657829, "grad_norm": 0.121034175157547, "kl": 0.079833984375, "learning_rate": 1.1910492932556876e-07, "loss": 0.0155, "reward": 0.0491071455180645, "reward_std": 0.042250454891473055, "rewards/accuracy_reward": 0.0491071455180645, "rewards/format_reward": 0.0, "step": 766 }, { "completion_length": 1835.4755249023438, "epoch": 0.9164364125158688, "grad_norm": 0.12526878714561462, "kl": 0.08648681640625, "learning_rate": 1.1856673154238674e-07, "loss": 0.0143, "reward": 0.11607143166474998, "reward_std": 0.05357143096625805, "rewards/accuracy_reward": 0.11607143166474998, "rewards/format_reward": 0.0, "step": 767 }, { "completion_length": 1841.4152755737305, "epoch": 0.9176312448659547, "grad_norm": 0.09840578585863113, "kl": 0.0793914794921875, "learning_rate": 1.1803606344863615e-07, "loss": 0.0095, "reward": 0.06250000256113708, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.06250000256113708, "rewards/format_reward": 0.0, "step": 768 }, { "completion_length": 1847.0759735107422, "epoch": 0.9188260772160406, "grad_norm": 0.17498041689395905, "kl": 0.08109283447265625, "learning_rate": 1.1751293430592523e-07, "loss": 0.0131, "reward": 0.11607143469154835, "reward_std": 0.04979777242988348, "rewards/accuracy_reward": 0.11607143469154835, "rewards/format_reward": 0.0, "step": 769 }, { "completion_length": 1806.118392944336, "epoch": 0.9200209095661265, "grad_norm": 0.2155098170042038, "kl": 0.087005615234375, "learning_rate": 1.1699735324428705e-07, "loss": 0.0191, "reward": 0.10491071897558868, "reward_std": 0.05941697070375085, "rewards/accuracy_reward": 0.10491071897558868, "rewards/format_reward": 0.0, "step": 770 }, { "completion_length": 1872.0313415527344, "epoch": 0.9212157419162124, "grad_norm": 0.2045278549194336, "kl": 0.07497406005859375, "learning_rate": 1.1648932926201996e-07, "loss": 0.0202, "reward": 0.095982147147879, "reward_std": 0.037786169443279505, "rewards/accuracy_reward": 0.095982147147879, "rewards/format_reward": 0.0, "step": 771 }, { "completion_length": 1819.0290985107422, "epoch": 0.9224105742662982, "grad_norm": 0.11777309328317642, "kl": 0.0754547119140625, "learning_rate": 1.159888712255306e-07, "loss": 0.013, "reward": 0.07812500325962901, "reward_std": 0.03125000139698386, "rewards/accuracy_reward": 0.07812500325962901, "rewards/format_reward": 0.0, "step": 772 }, { "completion_length": 1871.4800033569336, "epoch": 0.9236054066163841, "grad_norm": 0.25556230545043945, "kl": 0.0780181884765625, "learning_rate": 1.1549598786917915e-07, "loss": 0.0143, "reward": 0.05580357392318547, "reward_std": 0.05564331263303757, "rewards/accuracy_reward": 0.05580357392318547, "rewards/format_reward": 0.0, "step": 773 }, { "completion_length": 1790.9799880981445, "epoch": 0.92480023896647, "grad_norm": 0.09730090945959091, "kl": 0.08673858642578125, "learning_rate": 1.1501068779512707e-07, "loss": 0.0132, "reward": 0.06919643096625805, "reward_std": 0.027476342860609293, "rewards/accuracy_reward": 0.06919643096625805, "rewards/format_reward": 0.0, "step": 774 }, { "completion_length": 1794.6027755737305, "epoch": 0.9259950713165559, "grad_norm": 0.12952396273612976, "kl": 0.10272216796875, "learning_rate": 1.1453297947318674e-07, "loss": 0.0157, "reward": 0.11607143422588706, "reward_std": 0.04602411389350891, "rewards/accuracy_reward": 0.11607143422588706, "rewards/format_reward": 0.0, "step": 775 }, { "completion_length": 1851.2098922729492, "epoch": 0.9271899036666418, "grad_norm": 0.11911668628454208, "kl": 0.0872344970703125, "learning_rate": 1.140628712406736e-07, "loss": 0.0129, "reward": 0.09375000232830644, "reward_std": 0.04979777242988348, "rewards/accuracy_reward": 0.09375000232830644, "rewards/format_reward": 0.0, "step": 776 }, { "completion_length": 1833.9933853149414, "epoch": 0.9283847360167277, "grad_norm": 0.13663139939308167, "kl": 0.0994720458984375, "learning_rate": 1.1360037130226111e-07, "loss": 0.0051, "reward": 0.05580357345752418, "reward_std": 0.022321429569274187, "rewards/accuracy_reward": 0.05580357345752418, "rewards/format_reward": 0.0, "step": 777 }, { "completion_length": 1819.4509582519531, "epoch": 0.9295795683668135, "grad_norm": 0.12317535281181335, "kl": 0.0873870849609375, "learning_rate": 1.1314548772983685e-07, "loss": 0.0076, "reward": 0.08705357532016933, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.08705357532016933, "rewards/format_reward": 0.0, "step": 778 }, { "completion_length": 1764.58935546875, "epoch": 0.9307744007168994, "grad_norm": 0.11292005330324173, "kl": 0.0887451171875, "learning_rate": 1.1269822846236213e-07, "loss": 0.0081, "reward": 0.11383929080329835, "reward_std": 0.05564331263303757, "rewards/accuracy_reward": 0.11383929080329835, "rewards/format_reward": 0.0, "step": 779 }, { "completion_length": 1821.8973922729492, "epoch": 0.9319692330669853, "grad_norm": 0.13166998326778412, "kl": 0.0939788818359375, "learning_rate": 1.1225860130573332e-07, "loss": 0.0095, "reward": 0.10714286332949996, "reward_std": 0.040869200602173805, "rewards/accuracy_reward": 0.10714286332949996, "rewards/format_reward": 0.0, "step": 780 }, { "completion_length": 1868.5112686157227, "epoch": 0.9331640654170712, "grad_norm": 0.13915444910526276, "kl": 0.100189208984375, "learning_rate": 1.1182661393264556e-07, "loss": 0.0135, "reward": 0.06919643189758062, "reward_std": 0.051869654562324286, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "step": 781 }, { "completion_length": 1827.852767944336, "epoch": 0.9343588977671571, "grad_norm": 0.15526387095451355, "kl": 0.10372161865234375, "learning_rate": 1.1140227388245897e-07, "loss": 0.008, "reward": 0.10491071967408061, "reward_std": 0.018547771032899618, "rewards/accuracy_reward": 0.10491071967408061, "rewards/format_reward": 0.0, "step": 782 }, { "completion_length": 1862.4688339233398, "epoch": 0.935553730117243, "grad_norm": 0.11174941807985306, "kl": 0.089202880859375, "learning_rate": 1.1098558856106691e-07, "loss": 0.0082, "reward": 0.04017857299186289, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.04017857299186289, "rewards/format_reward": 0.0, "step": 783 }, { "completion_length": 1795.8014221191406, "epoch": 0.9367485624673288, "grad_norm": 0.09242259711027145, "kl": 0.108612060546875, "learning_rate": 1.1057656524076689e-07, "loss": 0.0152, "reward": 0.10267857694998384, "reward_std": 0.04225045535713434, "rewards/accuracy_reward": 0.10267857694998384, "rewards/format_reward": 0.0, "step": 784 }, { "completion_length": 1873.2322235107422, "epoch": 0.9379433948174147, "grad_norm": 0.10608790069818497, "kl": 0.0861968994140625, "learning_rate": 1.1017521106013348e-07, "loss": 0.0128, "reward": 0.07142857369035482, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.07142857369035482, "rewards/format_reward": 0.0, "step": 785 }, { "completion_length": 1842.799186706543, "epoch": 0.9391382271675005, "grad_norm": 0.10633320361375809, "kl": 0.08411407470703125, "learning_rate": 1.097815330238938e-07, "loss": 0.0081, "reward": 0.08928571897558868, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.08928571897558868, "rewards/format_reward": 0.0, "step": 786 }, { "completion_length": 1796.5335693359375, "epoch": 0.9403330595175864, "grad_norm": 0.16250939667224884, "kl": 0.089447021484375, "learning_rate": 1.0939553800280531e-07, "loss": 0.0075, "reward": 0.07812500395812094, "reward_std": 0.028857597149908543, "rewards/accuracy_reward": 0.07812500395812094, "rewards/format_reward": 0.0, "step": 787 }, { "completion_length": 1862.6094589233398, "epoch": 0.9415278918676723, "grad_norm": 0.21528133749961853, "kl": 0.08258056640625, "learning_rate": 1.0901723273353597e-07, "loss": 0.0182, "reward": 0.07366071827709675, "reward_std": 0.05564331263303757, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "step": 788 }, { "completion_length": 1898.5313415527344, "epoch": 0.9427227242177582, "grad_norm": 0.08547373861074448, "kl": 0.076690673828125, "learning_rate": 1.086466238185463e-07, "loss": 0.0102, "reward": 0.04464285890571773, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.04464285890571773, "rewards/format_reward": 0.0, "step": 789 }, { "completion_length": 1835.9956283569336, "epoch": 0.943917556567844, "grad_norm": 0.10149899125099182, "kl": 0.080718994140625, "learning_rate": 1.0828371772597467e-07, "loss": 0.0101, "reward": 0.10044643376022577, "reward_std": 0.051869654096663, "rewards/accuracy_reward": 0.10044643376022577, "rewards/format_reward": 0.0, "step": 790 }, { "completion_length": 1861.962142944336, "epoch": 0.9451123889179299, "grad_norm": 0.16612079739570618, "kl": 0.097259521484375, "learning_rate": 1.0792852078952403e-07, "loss": 0.0157, "reward": 0.04910714505240321, "reward_std": 0.05357143096625805, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, "step": 791 }, { "completion_length": 1880.5692825317383, "epoch": 0.9463072212680158, "grad_norm": 0.15846429765224457, "kl": 0.090911865234375, "learning_rate": 1.0758103920835142e-07, "loss": 0.0095, "reward": 0.04241071571595967, "reward_std": 0.04533348651602864, "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.0, "step": 792 }, { "completion_length": 1807.3438415527344, "epoch": 0.9475020536181017, "grad_norm": 0.12543198466300964, "kl": 0.099456787109375, "learning_rate": 1.0724127904696007e-07, "loss": 0.0112, "reward": 0.07366071827709675, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.07366071827709675, "rewards/format_reward": 0.0, "step": 793 }, { "completion_length": 1785.9576797485352, "epoch": 0.9486968859681876, "grad_norm": 0.10621478408575058, "kl": 0.0770111083984375, "learning_rate": 1.0690924623509307e-07, "loss": 0.0117, "reward": 0.07589286146685481, "reward_std": 0.04979777242988348, "rewards/accuracy_reward": 0.07589286146685481, "rewards/format_reward": 0.0, "step": 794 }, { "completion_length": 1822.7790985107422, "epoch": 0.9498917183182735, "grad_norm": 0.1895214170217514, "kl": 0.1090240478515625, "learning_rate": 1.0658494656763034e-07, "loss": 0.007, "reward": 0.0714285746216774, "reward_std": 0.0357142873108387, "rewards/accuracy_reward": 0.0714285746216774, "rewards/format_reward": 0.0, "step": 795 }, { "completion_length": 1838.627296447754, "epoch": 0.9510865506683593, "grad_norm": 0.09814899414777756, "kl": 0.0834808349609375, "learning_rate": 1.0626838570448715e-07, "loss": 0.0134, "reward": 0.11160714784637094, "reward_std": 0.04225045535713434, "rewards/accuracy_reward": 0.11160714784637094, "rewards/format_reward": 0.0, "step": 796 }, { "completion_length": 1761.2255020141602, "epoch": 0.9522813830184452, "grad_norm": 0.12127888202667236, "kl": 0.0948333740234375, "learning_rate": 1.0595956917051569e-07, "loss": 0.0123, "reward": 0.04910714505240321, "reward_std": 0.037095542065799236, "rewards/accuracy_reward": 0.04910714505240321, "rewards/format_reward": 0.0, "step": 797 }, { "completion_length": 1824.6161499023438, "epoch": 0.9534762153685311, "grad_norm": 0.11042162030935287, "kl": 0.0860137939453125, "learning_rate": 1.0565850235540817e-07, "loss": 0.0131, "reward": 0.12053572060540318, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.12053572060540318, "rewards/format_reward": 0.0, "step": 798 }, { "completion_length": 1800.3728485107422, "epoch": 0.954671047718617, "grad_norm": 0.11311227083206177, "kl": 0.090362548828125, "learning_rate": 1.0536519051360337e-07, "loss": 0.0156, "reward": 0.10714286076836288, "reward_std": 0.046024113427847624, "rewards/accuracy_reward": 0.10714286076836288, "rewards/format_reward": 0.0, "step": 799 }, { "completion_length": 1858.406349182129, "epoch": 0.9558658800687029, "grad_norm": 0.08605191111564636, "kl": 0.0832672119140625, "learning_rate": 1.0507963876419423e-07, "loss": 0.0148, "reward": 0.11160714807920158, "reward_std": 0.03709554160013795, "rewards/accuracy_reward": 0.11160714807920158, "rewards/format_reward": 0.0, "step": 800 }, { "completion_length": 1853.9755477905273, "epoch": 0.9570607124187888, "grad_norm": 0.1350506693124771, "kl": 0.0926666259765625, "learning_rate": 1.0480185209083917e-07, "loss": 0.0092, "reward": 0.07142857415601611, "reward_std": 0.038476796355098486, "rewards/accuracy_reward": 0.07142857415601611, "rewards/format_reward": 0.0, "step": 801 }, { "completion_length": 1827.2790985107422, "epoch": 0.9582555447688746, "grad_norm": 0.24265776574611664, "kl": 0.092376708984375, "learning_rate": 1.0453183534167469e-07, "loss": 0.0145, "reward": 0.05357143096625805, "reward_std": 0.04225045582279563, "rewards/accuracy_reward": 0.05357143096625805, "rewards/format_reward": 0.0, "step": 802 }, { "completion_length": 1879.229995727539, "epoch": 0.9594503771189605, "grad_norm": 0.10007164627313614, "kl": 0.090362548828125, "learning_rate": 1.0426959322923078e-07, "loss": 0.01, "reward": 0.09151786076836288, "reward_std": 0.02885759761556983, "rewards/accuracy_reward": 0.09151786076836288, "rewards/format_reward": 0.0, "step": 803 }, { "completion_length": 1872.4911499023438, "epoch": 0.9606452094690464, "grad_norm": 0.08248502761125565, "kl": 0.079345703125, "learning_rate": 1.04015130330349e-07, "loss": 0.0106, "reward": 0.08705357601866126, "reward_std": 0.042941081803292036, "rewards/accuracy_reward": 0.08705357601866126, "rewards/format_reward": 0.0, "step": 804 }, { "completion_length": 1811.283561706543, "epoch": 0.9618400418191323, "grad_norm": 0.13761551678180695, "kl": 0.0919952392578125, "learning_rate": 1.0376845108610213e-07, "loss": 0.0097, "reward": 0.04241071571595967, "reward_std": 0.01854777056723833, "rewards/accuracy_reward": 0.04241071571595967, "rewards/format_reward": 0.0, "step": 805 }, { "completion_length": 1782.04248046875, "epoch": 0.9630348741692182, "grad_norm": 0.13883823156356812, "kl": 0.0999908447265625, "learning_rate": 1.0352955980171716e-07, "loss": 0.0125, "reward": 0.0915178619325161, "reward_std": 0.03401250997558236, "rewards/accuracy_reward": 0.0915178619325161, "rewards/format_reward": 0.0, "step": 806 }, { "completion_length": 1814.5358047485352, "epoch": 0.9642297065193041, "grad_norm": 0.10450931638479233, "kl": 0.09161376953125, "learning_rate": 1.0329846064649978e-07, "loss": 0.0092, "reward": 0.08928571874275804, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.08928571874275804, "rewards/format_reward": 0.0, "step": 807 }, { "completion_length": 1800.9643630981445, "epoch": 0.9654245388693898, "grad_norm": 0.114235520362854, "kl": 0.09136962890625, "learning_rate": 1.0307515765376166e-07, "loss": 0.0125, "reward": 0.06919643119908869, "reward_std": 0.041559827513992786, "rewards/accuracy_reward": 0.06919643119908869, "rewards/format_reward": 0.0, "step": 808 }, { "completion_length": 1838.6295623779297, "epoch": 0.9666193712194757, "grad_norm": 0.14840729534626007, "kl": 0.0862884521484375, "learning_rate": 1.0285965472075022e-07, "loss": 0.0079, "reward": 0.08035714505240321, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.08035714505240321, "rewards/format_reward": 0.0, "step": 809 }, { "completion_length": 1841.1317901611328, "epoch": 0.9678142035695616, "grad_norm": 0.09060106426477432, "kl": 0.0871734619140625, "learning_rate": 1.026519556085805e-07, "loss": 0.0066, "reward": 0.060267860535532236, "reward_std": 0.036404915153980255, "rewards/accuracy_reward": 0.060267860535532236, "rewards/format_reward": 0.0, "step": 810 }, { "completion_length": 1824.3951721191406, "epoch": 0.9690090359196475, "grad_norm": 0.09206762164831161, "kl": 0.08707427978515625, "learning_rate": 1.0245206394216954e-07, "loss": 0.0079, "reward": 0.08258928963914514, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.08258928963914514, "rewards/format_reward": 0.0, "step": 811 }, { "completion_length": 1852.174201965332, "epoch": 0.9702038682697334, "grad_norm": 0.10502824187278748, "kl": 0.077728271484375, "learning_rate": 1.0225998321017314e-07, "loss": 0.006, "reward": 0.05803571664728224, "reward_std": 0.023012056946754456, "rewards/accuracy_reward": 0.05803571664728224, "rewards/format_reward": 0.0, "step": 812 }, { "completion_length": 1860.6407012939453, "epoch": 0.9713987006198193, "grad_norm": 0.16716104745864868, "kl": 0.07513427734375, "learning_rate": 1.0207571676492502e-07, "loss": 0.0013, "reward": 0.06250000279396772, "reward_std": 0.028166969772428274, "rewards/accuracy_reward": 0.06250000279396772, "rewards/format_reward": 0.0, "step": 813 }, { "completion_length": 1779.4755325317383, "epoch": 0.9725935329699051, "grad_norm": 0.1968853771686554, "kl": 0.070556640625, "learning_rate": 1.0189926782237802e-07, "loss": 0.0145, "reward": 0.08258929010480642, "reward_std": 0.05564331356436014, "rewards/accuracy_reward": 0.08258929010480642, "rewards/format_reward": 0.0, "step": 814 }, { "completion_length": 1845.5067977905273, "epoch": 0.973788365319991, "grad_norm": 0.11542942374944687, "kl": 0.087615966796875, "learning_rate": 1.0173063946204841e-07, "loss": 0.0136, "reward": 0.06919643213041127, "reward_std": 0.041559827979654074, "rewards/accuracy_reward": 0.06919643213041127, "rewards/format_reward": 0.0, "step": 815 }, { "completion_length": 1814.464370727539, "epoch": 0.9749831976700769, "grad_norm": 0.10792675614356995, "kl": 0.09576416015625, "learning_rate": 1.0156983462696178e-07, "loss": 0.0109, "reward": 0.06473214575089514, "reward_std": 0.04671474127098918, "rewards/accuracy_reward": 0.06473214575089514, "rewards/format_reward": 0.0, "step": 816 }, { "completion_length": 1850.4375762939453, "epoch": 0.9761780300201628, "grad_norm": 0.11561498045921326, "kl": 0.07128143310546875, "learning_rate": 1.0141685612360205e-07, "loss": 0.0064, "reward": 0.053571431431919336, "reward_std": 0.023012056481093168, "rewards/accuracy_reward": 0.053571431431919336, "rewards/format_reward": 0.0, "step": 817 }, { "completion_length": 1803.9197235107422, "epoch": 0.9773728623702487, "grad_norm": 0.0930875763297081, "kl": 0.074798583984375, "learning_rate": 1.0127170662186201e-07, "loss": 0.0122, "reward": 0.10267857671715319, "reward_std": 0.024393311701714993, "rewards/accuracy_reward": 0.10267857671715319, "rewards/format_reward": 0.0, "step": 818 }, { "completion_length": 1806.3170471191406, "epoch": 0.9785676947203346, "grad_norm": 0.29196375608444214, "kl": 0.0844268798828125, "learning_rate": 1.0113438865499707e-07, "loss": 0.021, "reward": 0.06473214621655643, "reward_std": 0.051869654096663, "rewards/accuracy_reward": 0.06473214621655643, "rewards/format_reward": 0.0, "step": 819 }, { "completion_length": 1821.9041061401367, "epoch": 0.9797625270704204, "grad_norm": 0.10478786379098892, "kl": 0.06589508056640625, "learning_rate": 1.0100490461958109e-07, "loss": 0.0129, "reward": 0.08482143236324191, "reward_std": 0.03332188306376338, "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.0, "step": 820 }, { "completion_length": 1772.8661499023438, "epoch": 0.9809573594205063, "grad_norm": 0.4126225411891937, "kl": 0.08649444580078125, "learning_rate": 1.0088325677546418e-07, "loss": 0.0202, "reward": 0.07589286100119352, "reward_std": 0.06010759808123112, "rewards/accuracy_reward": 0.07589286100119352, "rewards/format_reward": 0.0, "step": 821 }, { "completion_length": 1839.7902374267578, "epoch": 0.9821521917705922, "grad_norm": 0.12006872147321701, "kl": 0.0850830078125, "learning_rate": 1.007694472457337e-07, "loss": 0.0128, "reward": 0.04464285890571773, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.04464285890571773, "rewards/format_reward": 0.0, "step": 822 }, { "completion_length": 1873.392936706543, "epoch": 0.9833470241206781, "grad_norm": 0.11566156893968582, "kl": 0.0991668701171875, "learning_rate": 1.0066347801667693e-07, "loss": 0.0174, "reward": 0.06919643189758062, "reward_std": 0.0326312561519444, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "step": 823 }, { "completion_length": 1827.729995727539, "epoch": 0.984541856470764, "grad_norm": 0.2342265099287033, "kl": 0.07782745361328125, "learning_rate": 1.0056535093774642e-07, "loss": 0.0127, "reward": 0.07589286006987095, "reward_std": 0.04464285960420966, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "step": 824 }, { "completion_length": 1830.3773040771484, "epoch": 0.9857366888208499, "grad_norm": 0.10413384437561035, "kl": 0.0682373046875, "learning_rate": 1.0047506772152785e-07, "loss": 0.0098, "reward": 0.07812500419095159, "reward_std": 0.028857597149908543, "rewards/accuracy_reward": 0.07812500419095159, "rewards/format_reward": 0.0, "step": 825 }, { "completion_length": 1827.533546447754, "epoch": 0.9869315211709357, "grad_norm": 0.1357140839099884, "kl": 0.08184814453125, "learning_rate": 1.0039262994371012e-07, "loss": 0.0136, "reward": 0.07142857438884676, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.07142857438884676, "rewards/format_reward": 0.0, "step": 826 }, { "completion_length": 1841.3996200561523, "epoch": 0.9881263535210216, "grad_norm": 0.1422327309846878, "kl": 0.09521484375, "learning_rate": 1.0031803904305762e-07, "loss": 0.0066, "reward": 0.07589286006987095, "reward_std": 0.03194062877446413, "rewards/accuracy_reward": 0.07589286006987095, "rewards/format_reward": 0.0, "step": 827 }, { "completion_length": 1836.6139297485352, "epoch": 0.9893211858711075, "grad_norm": 0.12234248965978622, "kl": 0.0738067626953125, "learning_rate": 1.0025129632138543e-07, "loss": 0.009, "reward": 0.06696428800933063, "reward_std": 0.04225045535713434, "rewards/accuracy_reward": 0.06696428800933063, "rewards/format_reward": 0.0, "step": 828 }, { "completion_length": 1858.4554290771484, "epoch": 0.9905160182211933, "grad_norm": 0.10416804254055023, "kl": 0.07415771484375, "learning_rate": 1.001924029435364e-07, "loss": 0.0095, "reward": 0.05357142956927419, "reward_std": 0.026785715948790312, "rewards/accuracy_reward": 0.05357142956927419, "rewards/format_reward": 0.0, "step": 829 }, { "completion_length": 1822.8750686645508, "epoch": 0.9917108505712792, "grad_norm": 0.10049787908792496, "kl": 0.0854034423828125, "learning_rate": 1.0014135993736095e-07, "loss": 0.0119, "reward": 0.08482143236324191, "reward_std": 0.02816697023808956, "rewards/accuracy_reward": 0.08482143236324191, "rewards/format_reward": 0.0, "step": 830 }, { "completion_length": 1823.6451797485352, "epoch": 0.992905682921365, "grad_norm": 0.14631982147693634, "kl": 0.08233642578125, "learning_rate": 1.0009816819369891e-07, "loss": 0.0113, "reward": 0.06250000256113708, "reward_std": 0.04979777242988348, "rewards/accuracy_reward": 0.06250000256113708, "rewards/format_reward": 0.0, "step": 831 }, { "completion_length": 1836.6027603149414, "epoch": 0.9941005152714509, "grad_norm": 0.08562933653593063, "kl": 0.0792083740234375, "learning_rate": 1.0006282846636432e-07, "loss": 0.0082, "reward": 0.05133928754366934, "reward_std": 0.04017857322469354, "rewards/accuracy_reward": 0.05133928754366934, "rewards/format_reward": 0.0, "step": 832 }, { "completion_length": 1824.1786499023438, "epoch": 0.9952953476215368, "grad_norm": 0.11289257556200027, "kl": 0.0745086669921875, "learning_rate": 1.0003534137213199e-07, "loss": 0.0125, "reward": 0.06919643189758062, "reward_std": 0.04671474080532789, "rewards/accuracy_reward": 0.06919643189758062, "rewards/format_reward": 0.0, "step": 833 }, { "completion_length": 1852.0715255737305, "epoch": 0.9964901799716227, "grad_norm": 0.15242980420589447, "kl": 0.0762786865234375, "learning_rate": 1.0001570739072679e-07, "loss": 0.013, "reward": 0.08258929080329835, "reward_std": 0.03263125568628311, "rewards/accuracy_reward": 0.08258929080329835, "rewards/format_reward": 0.0, "step": 834 }, { "completion_length": 1798.136245727539, "epoch": 0.9976850123217086, "grad_norm": 0.2872120141983032, "kl": 0.0915679931640625, "learning_rate": 1.0000392686481531e-07, "loss": 0.0176, "reward": 0.08035714668221772, "reward_std": 0.026785715483129025, "rewards/accuracy_reward": 0.08035714668221772, "rewards/format_reward": 0.0, "step": 835 }, { "completion_length": 1833.1250915527344, "epoch": 0.9988798446717945, "grad_norm": 0.11767753213644028, "kl": 0.0992584228515625, "learning_rate": 1e-07, "loss": 0.0114, "reward": 0.07812500325962901, "reward_std": 0.037786169443279505, "rewards/accuracy_reward": 0.07812500325962901, "rewards/format_reward": 0.0, "step": 836 }, { "epoch": 0.9988798446717945, "step": 836, "total_flos": 0.0, "train_loss": 0.011288413916377742, "train_runtime": 235885.794, "train_samples_per_second": 0.397, "train_steps_per_second": 0.004 } ], "logging_steps": 1, "max_steps": 836, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }