diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4086 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 1563, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 51.266319274902344, + "kl": 0.010414814576506615, + "learning_rate": 1.592356687898089e-08, + "logps/chosen": -294.9905700683594, + "logps/rejected": -345.313232421875, + "loss": 0.577, + "rewards/chosen": -0.004230289254337549, + "rewards/margins": -0.0014572818763554096, + "rewards/rejected": -0.0027730073779821396, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 58.25449752807617, + "kl": 0.016649872064590454, + "learning_rate": 3.184713375796178e-08, + "logps/chosen": -294.2644958496094, + "logps/rejected": -347.5428771972656, + "loss": 0.5753, + "rewards/chosen": 0.007328943815082312, + "rewards/margins": 0.012493056245148182, + "rewards/rejected": -0.005164111964404583, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 48.10483169555664, + "kl": 0.2333705872297287, + "learning_rate": 4.777070063694268e-08, + "logps/chosen": -332.3365173339844, + "logps/rejected": -320.77374267578125, + "loss": 0.5743, + "rewards/chosen": 0.0657496452331543, + "rewards/margins": 0.056174833327531815, + "rewards/rejected": 0.009574810042977333, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 53.18347930908203, + "kl": 0.5247635841369629, + "learning_rate": 6.369426751592356e-08, + "logps/chosen": -296.2086486816406, + "logps/rejected": -362.4876708984375, + "loss": 0.5593, + "rewards/chosen": 0.1558167189359665, + "rewards/margins": 0.14346782863140106, + "rewards/rejected": 0.012348905205726624, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 46.746944427490234, + "kl": 0.25808069109916687, + "learning_rate": 7.961783439490445e-08, + "logps/chosen": -297.1956481933594, + "logps/rejected": -363.6723327636719, + "loss": 0.5438, + "rewards/chosen": 0.2308042347431183, + "rewards/margins": 0.2523272931575775, + "rewards/rejected": -0.021523073315620422, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 53.65740966796875, + "kl": 0.051769644021987915, + "learning_rate": 9.554140127388536e-08, + "logps/chosen": -337.7224426269531, + "logps/rejected": -327.62603759765625, + "loss": 0.5058, + "rewards/chosen": 0.3092527985572815, + "rewards/margins": 0.42350277304649353, + "rewards/rejected": -0.11424995958805084, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 50.72907638549805, + "kl": 0.0, + "learning_rate": 1.1146496815286624e-07, + "logps/chosen": -268.0375061035156, + "logps/rejected": -378.62786865234375, + "loss": 0.474, + "rewards/chosen": 0.3441382944583893, + "rewards/margins": 0.7155375480651855, + "rewards/rejected": -0.37139931321144104, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 48.7026481628418, + "kl": 0.0, + "learning_rate": 1.2738853503184713e-07, + "logps/chosen": -308.1761474609375, + "logps/rejected": -326.7469787597656, + "loss": 0.4604, + "rewards/chosen": 0.3228895366191864, + "rewards/margins": 0.9184707403182983, + "rewards/rejected": -0.5955811738967896, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 43.46382141113281, + "kl": 0.0, + "learning_rate": 1.43312101910828e-07, + "logps/chosen": -315.6592712402344, + "logps/rejected": -341.8274841308594, + "loss": 0.4292, + "rewards/chosen": 0.3122224509716034, + "rewards/margins": 1.5055707693099976, + "rewards/rejected": -1.1933482885360718, + "step": 45 + }, + { + "epoch": 0.03, + "grad_norm": 44.770355224609375, + "kl": 0.0, + "learning_rate": 1.592356687898089e-07, + "logps/chosen": -262.85552978515625, + "logps/rejected": -353.92596435546875, + "loss": 0.3882, + "rewards/chosen": 0.3527736961841583, + "rewards/margins": 1.9195839166641235, + "rewards/rejected": -1.5668100118637085, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 36.85205841064453, + "kl": 0.0, + "learning_rate": 1.7515923566878978e-07, + "logps/chosen": -280.04779052734375, + "logps/rejected": -339.02423095703125, + "loss": 0.3544, + "rewards/chosen": 0.3607901334762573, + "rewards/margins": 2.4449303150177, + "rewards/rejected": -2.0841403007507324, + "step": 55 + }, + { + "epoch": 0.04, + "grad_norm": 43.077369689941406, + "kl": 0.0, + "learning_rate": 1.9108280254777072e-07, + "logps/chosen": -315.29046630859375, + "logps/rejected": -367.8749694824219, + "loss": 0.3512, + "rewards/chosen": 0.38210782408714294, + "rewards/margins": 2.8307223320007324, + "rewards/rejected": -2.4486145973205566, + "step": 60 + }, + { + "epoch": 0.04, + "grad_norm": 45.82477951049805, + "kl": 0.0, + "learning_rate": 2.070063694267516e-07, + "logps/chosen": -308.75518798828125, + "logps/rejected": -360.6058349609375, + "loss": 0.3215, + "rewards/chosen": 0.35675397515296936, + "rewards/margins": 3.136427164077759, + "rewards/rejected": -2.7796730995178223, + "step": 65 + }, + { + "epoch": 0.04, + "grad_norm": 38.866817474365234, + "kl": 0.0, + "learning_rate": 2.2292993630573247e-07, + "logps/chosen": -290.3988037109375, + "logps/rejected": -362.57110595703125, + "loss": 0.3413, + "rewards/chosen": 0.41365399956703186, + "rewards/margins": 3.5323078632354736, + "rewards/rejected": -3.1186537742614746, + "step": 70 + }, + { + "epoch": 0.05, + "grad_norm": 48.03970718383789, + "kl": 0.0, + "learning_rate": 2.388535031847134e-07, + "logps/chosen": -304.66754150390625, + "logps/rejected": -380.553955078125, + "loss": 0.3263, + "rewards/chosen": 0.45406588912010193, + "rewards/margins": 4.028099536895752, + "rewards/rejected": -3.574033260345459, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 43.04393005371094, + "kl": 0.0, + "learning_rate": 2.5477707006369425e-07, + "logps/chosen": -312.28076171875, + "logps/rejected": -399.56585693359375, + "loss": 0.3325, + "rewards/chosen": 0.49335041642189026, + "rewards/margins": 3.85711669921875, + "rewards/rejected": -3.3637664318084717, + "step": 80 + }, + { + "epoch": 0.05, + "grad_norm": 59.04008483886719, + "kl": 0.0, + "learning_rate": 2.7070063694267513e-07, + "logps/chosen": -234.32943725585938, + "logps/rejected": -376.53857421875, + "loss": 0.3408, + "rewards/chosen": 0.5379557013511658, + "rewards/margins": 4.110851287841797, + "rewards/rejected": -3.5728955268859863, + "step": 85 + }, + { + "epoch": 0.06, + "grad_norm": 39.72758483886719, + "kl": 0.0, + "learning_rate": 2.86624203821656e-07, + "logps/chosen": -261.9068298339844, + "logps/rejected": -377.6480407714844, + "loss": 0.2993, + "rewards/chosen": 0.5026695132255554, + "rewards/margins": 4.6207404136657715, + "rewards/rejected": -4.118070602416992, + "step": 90 + }, + { + "epoch": 0.06, + "grad_norm": 40.38072967529297, + "kl": 0.0, + "learning_rate": 3.0254777070063694e-07, + "logps/chosen": -281.74041748046875, + "logps/rejected": -362.71697998046875, + "loss": 0.3052, + "rewards/chosen": 0.5910047292709351, + "rewards/margins": 4.633607864379883, + "rewards/rejected": -4.042603492736816, + "step": 95 + }, + { + "epoch": 0.06, + "grad_norm": 40.266170501708984, + "kl": 0.0, + "learning_rate": 3.184713375796178e-07, + "logps/chosen": -304.7105712890625, + "logps/rejected": -359.0072326660156, + "loss": 0.2996, + "rewards/chosen": 0.5609632730484009, + "rewards/margins": 4.776108741760254, + "rewards/rejected": -4.215145111083984, + "step": 100 + }, + { + "epoch": 0.07, + "grad_norm": 45.701961517333984, + "kl": 0.0, + "learning_rate": 3.343949044585987e-07, + "logps/chosen": -255.57705688476562, + "logps/rejected": -379.2811584472656, + "loss": 0.3568, + "rewards/chosen": 0.3781484365463257, + "rewards/margins": 3.8812553882598877, + "rewards/rejected": -3.5031065940856934, + "step": 105 + }, + { + "epoch": 0.07, + "grad_norm": 40.277076721191406, + "kl": 0.0, + "learning_rate": 3.5031847133757957e-07, + "logps/chosen": -302.58612060546875, + "logps/rejected": -359.902099609375, + "loss": 0.3384, + "rewards/chosen": 0.5127553939819336, + "rewards/margins": 3.2677035331726074, + "rewards/rejected": -2.754948139190674, + "step": 110 + }, + { + "epoch": 0.07, + "grad_norm": 41.674957275390625, + "kl": 0.0, + "learning_rate": 3.6624203821656045e-07, + "logps/chosen": -314.4385681152344, + "logps/rejected": -375.05438232421875, + "loss": 0.2909, + "rewards/chosen": 0.5935775637626648, + "rewards/margins": 4.4186811447143555, + "rewards/rejected": -3.825103282928467, + "step": 115 + }, + { + "epoch": 0.08, + "grad_norm": 35.64720153808594, + "kl": 0.0, + "learning_rate": 3.8216560509554143e-07, + "logps/chosen": -320.26318359375, + "logps/rejected": -376.4805908203125, + "loss": 0.2777, + "rewards/chosen": 0.581780195236206, + "rewards/margins": 5.2582688331604, + "rewards/rejected": -4.676488399505615, + "step": 120 + }, + { + "epoch": 0.08, + "grad_norm": 37.95709228515625, + "kl": 0.0, + "learning_rate": 3.980891719745223e-07, + "logps/chosen": -308.4683837890625, + "logps/rejected": -375.2101135253906, + "loss": 0.3019, + "rewards/chosen": 0.7046070694923401, + "rewards/margins": 5.051932334899902, + "rewards/rejected": -4.347325325012207, + "step": 125 + }, + { + "epoch": 0.08, + "grad_norm": 43.29692840576172, + "kl": 0.0, + "learning_rate": 4.140127388535032e-07, + "logps/chosen": -245.1961669921875, + "logps/rejected": -364.4614562988281, + "loss": 0.3119, + "rewards/chosen": 0.5381637215614319, + "rewards/margins": 5.0186543464660645, + "rewards/rejected": -4.480490684509277, + "step": 130 + }, + { + "epoch": 0.09, + "grad_norm": 38.105064392089844, + "kl": 0.0, + "learning_rate": 4.2993630573248406e-07, + "logps/chosen": -312.35406494140625, + "logps/rejected": -380.74542236328125, + "loss": 0.3123, + "rewards/chosen": 0.46837490797042847, + "rewards/margins": 4.953141689300537, + "rewards/rejected": -4.484766960144043, + "step": 135 + }, + { + "epoch": 0.09, + "grad_norm": 47.0628547668457, + "kl": 0.0, + "learning_rate": 4.4585987261146494e-07, + "logps/chosen": -279.7536315917969, + "logps/rejected": -382.98297119140625, + "loss": 0.2795, + "rewards/chosen": 0.635105311870575, + "rewards/margins": 5.238924026489258, + "rewards/rejected": -4.603818416595459, + "step": 140 + }, + { + "epoch": 0.09, + "grad_norm": 37.483062744140625, + "kl": 0.0, + "learning_rate": 4.6178343949044587e-07, + "logps/chosen": -274.5091247558594, + "logps/rejected": -391.46881103515625, + "loss": 0.2843, + "rewards/chosen": 0.5880447626113892, + "rewards/margins": 5.390235424041748, + "rewards/rejected": -4.802191257476807, + "step": 145 + }, + { + "epoch": 0.1, + "grad_norm": 36.84580993652344, + "kl": 0.0, + "learning_rate": 4.777070063694267e-07, + "logps/chosen": -292.7626953125, + "logps/rejected": -387.938720703125, + "loss": 0.2768, + "rewards/chosen": 0.6716945171356201, + "rewards/margins": 5.8321943283081055, + "rewards/rejected": -5.160500526428223, + "step": 150 + }, + { + "epoch": 0.1, + "grad_norm": 31.7560977935791, + "kl": 0.0, + "learning_rate": 4.936305732484076e-07, + "logps/chosen": -215.79736328125, + "logps/rejected": -373.4687805175781, + "loss": 0.2792, + "rewards/chosen": 0.5537887811660767, + "rewards/margins": 5.9485039710998535, + "rewards/rejected": -5.394715309143066, + "step": 155 + }, + { + "epoch": 0.1, + "grad_norm": 33.220497131347656, + "kl": 0.0, + "learning_rate": 4.989331436699858e-07, + "logps/chosen": -311.62884521484375, + "logps/rejected": -401.2769470214844, + "loss": 0.2544, + "rewards/chosen": 0.7818514108657837, + "rewards/margins": 6.500135898590088, + "rewards/rejected": -5.718283653259277, + "step": 160 + }, + { + "epoch": 0.11, + "grad_norm": 32.72365951538086, + "kl": 0.0, + "learning_rate": 4.971550497866287e-07, + "logps/chosen": -258.6673278808594, + "logps/rejected": -436.42742919921875, + "loss": 0.2901, + "rewards/chosen": 0.5816069841384888, + "rewards/margins": 6.598071098327637, + "rewards/rejected": -6.0164642333984375, + "step": 165 + }, + { + "epoch": 0.11, + "grad_norm": 41.379356384277344, + "kl": 0.0, + "learning_rate": 4.953769559032717e-07, + "logps/chosen": -270.22393798828125, + "logps/rejected": -411.33245849609375, + "loss": 0.2819, + "rewards/chosen": 0.6400431394577026, + "rewards/margins": 7.076806545257568, + "rewards/rejected": -6.436762809753418, + "step": 170 + }, + { + "epoch": 0.11, + "grad_norm": 41.31757736206055, + "kl": 0.0, + "learning_rate": 4.935988620199146e-07, + "logps/chosen": -269.7322692871094, + "logps/rejected": -390.0412292480469, + "loss": 0.2645, + "rewards/chosen": 0.5173057317733765, + "rewards/margins": 6.083005905151367, + "rewards/rejected": -5.565700531005859, + "step": 175 + }, + { + "epoch": 0.12, + "grad_norm": 38.93199920654297, + "kl": 0.0, + "learning_rate": 4.918207681365576e-07, + "logps/chosen": -256.9288330078125, + "logps/rejected": -391.0644226074219, + "loss": 0.2613, + "rewards/chosen": 0.681791365146637, + "rewards/margins": 6.400958061218262, + "rewards/rejected": -5.7191667556762695, + "step": 180 + }, + { + "epoch": 0.12, + "grad_norm": 39.77524185180664, + "kl": 0.0, + "learning_rate": 4.900426742532006e-07, + "logps/chosen": -297.71759033203125, + "logps/rejected": -393.12628173828125, + "loss": 0.2873, + "rewards/chosen": 0.5777992606163025, + "rewards/margins": 6.261810779571533, + "rewards/rejected": -5.684011936187744, + "step": 185 + }, + { + "epoch": 0.12, + "grad_norm": 36.284488677978516, + "kl": 0.0, + "learning_rate": 4.882645803698435e-07, + "logps/chosen": -305.66693115234375, + "logps/rejected": -376.7122802734375, + "loss": 0.2585, + "rewards/chosen": 0.7627078294754028, + "rewards/margins": 6.659689903259277, + "rewards/rejected": -5.896982669830322, + "step": 190 + }, + { + "epoch": 0.12, + "grad_norm": 35.469642639160156, + "kl": 0.0, + "learning_rate": 4.864864864864865e-07, + "logps/chosen": -254.2747039794922, + "logps/rejected": -421.32196044921875, + "loss": 0.2475, + "rewards/chosen": 0.716246485710144, + "rewards/margins": 7.351188659667969, + "rewards/rejected": -6.634942531585693, + "step": 195 + }, + { + "epoch": 0.13, + "grad_norm": 38.18144607543945, + "kl": 0.0, + "learning_rate": 4.847083926031294e-07, + "logps/chosen": -274.53326416015625, + "logps/rejected": -432.90911865234375, + "loss": 0.2815, + "rewards/chosen": 0.6570473313331604, + "rewards/margins": 7.282747745513916, + "rewards/rejected": -6.6257004737854, + "step": 200 + }, + { + "epoch": 0.13, + "grad_norm": 33.6481819152832, + "kl": 0.0, + "learning_rate": 4.829302987197724e-07, + "logps/chosen": -306.6775817871094, + "logps/rejected": -427.87615966796875, + "loss": 0.2636, + "rewards/chosen": 0.8706709146499634, + "rewards/margins": 7.621199607849121, + "rewards/rejected": -6.750528812408447, + "step": 205 + }, + { + "epoch": 0.13, + "grad_norm": 35.351131439208984, + "kl": 0.0, + "learning_rate": 4.811522048364154e-07, + "logps/chosen": -301.287109375, + "logps/rejected": -397.7789001464844, + "loss": 0.2904, + "rewards/chosen": 0.6713122129440308, + "rewards/margins": 6.777222633361816, + "rewards/rejected": -6.105909824371338, + "step": 210 + }, + { + "epoch": 0.14, + "grad_norm": 41.3302116394043, + "kl": 0.0, + "learning_rate": 4.793741109530583e-07, + "logps/chosen": -351.16436767578125, + "logps/rejected": -376.658447265625, + "loss": 0.2487, + "rewards/chosen": 0.9130572080612183, + "rewards/margins": 7.177925109863281, + "rewards/rejected": -6.264868259429932, + "step": 215 + }, + { + "epoch": 0.14, + "grad_norm": 40.998443603515625, + "kl": 0.0, + "learning_rate": 4.775960170697012e-07, + "logps/chosen": -275.54840087890625, + "logps/rejected": -400.478271484375, + "loss": 0.2659, + "rewards/chosen": 0.8280189633369446, + "rewards/margins": 7.407543182373047, + "rewards/rejected": -6.579524040222168, + "step": 220 + }, + { + "epoch": 0.14, + "grad_norm": 41.12675857543945, + "kl": 0.0, + "learning_rate": 4.7581792318634425e-07, + "logps/chosen": -263.6675109863281, + "logps/rejected": -419.4658203125, + "loss": 0.2953, + "rewards/chosen": 0.6901615262031555, + "rewards/margins": 7.24615478515625, + "rewards/rejected": -6.555993556976318, + "step": 225 + }, + { + "epoch": 0.15, + "grad_norm": 36.073734283447266, + "kl": 0.0, + "learning_rate": 4.7403982930298717e-07, + "logps/chosen": -280.42840576171875, + "logps/rejected": -428.30029296875, + "loss": 0.2677, + "rewards/chosen": 0.8988403081893921, + "rewards/margins": 7.7359209060668945, + "rewards/rejected": -6.837080478668213, + "step": 230 + }, + { + "epoch": 0.15, + "grad_norm": 31.510356903076172, + "kl": 0.0, + "learning_rate": 4.7226173541963014e-07, + "logps/chosen": -312.79095458984375, + "logps/rejected": -379.7386169433594, + "loss": 0.268, + "rewards/chosen": 0.8146812319755554, + "rewards/margins": 7.17245626449585, + "rewards/rejected": -6.3577752113342285, + "step": 235 + }, + { + "epoch": 0.15, + "grad_norm": 32.352935791015625, + "kl": 0.0, + "learning_rate": 4.7048364153627306e-07, + "logps/chosen": -268.1000061035156, + "logps/rejected": -398.8023376464844, + "loss": 0.2728, + "rewards/chosen": 0.7865932583808899, + "rewards/margins": 7.554474830627441, + "rewards/rejected": -6.767881870269775, + "step": 240 + }, + { + "epoch": 0.16, + "grad_norm": 33.94474792480469, + "kl": 0.0, + "learning_rate": 4.6870554765291604e-07, + "logps/chosen": -304.180419921875, + "logps/rejected": -382.78045654296875, + "loss": 0.2535, + "rewards/chosen": 0.7778481245040894, + "rewards/margins": 8.106607437133789, + "rewards/rejected": -7.328759670257568, + "step": 245 + }, + { + "epoch": 0.16, + "grad_norm": 27.869224548339844, + "kl": 0.0, + "learning_rate": 4.66927453769559e-07, + "logps/chosen": -270.0757141113281, + "logps/rejected": -418.12640380859375, + "loss": 0.245, + "rewards/chosen": 0.7761438488960266, + "rewards/margins": 7.896877288818359, + "rewards/rejected": -7.120733737945557, + "step": 250 + }, + { + "epoch": 0.16, + "grad_norm": 39.59625244140625, + "kl": 0.0, + "learning_rate": 4.65149359886202e-07, + "logps/chosen": -241.5212860107422, + "logps/rejected": -390.13092041015625, + "loss": 0.2604, + "rewards/chosen": 0.8376949429512024, + "rewards/margins": 7.854789733886719, + "rewards/rejected": -7.01709508895874, + "step": 255 + }, + { + "epoch": 0.17, + "grad_norm": 45.24748992919922, + "kl": 0.0, + "learning_rate": 4.633712660028449e-07, + "logps/chosen": -298.83477783203125, + "logps/rejected": -432.9170837402344, + "loss": 0.2885, + "rewards/chosen": 0.8538596034049988, + "rewards/margins": 8.486310958862305, + "rewards/rejected": -7.632451057434082, + "step": 260 + }, + { + "epoch": 0.17, + "grad_norm": 47.65301513671875, + "kl": 0.0, + "learning_rate": 4.615931721194879e-07, + "logps/chosen": -293.2570495605469, + "logps/rejected": -426.631591796875, + "loss": 0.2571, + "rewards/chosen": 1.0033259391784668, + "rewards/margins": 8.315601348876953, + "rewards/rejected": -7.3122758865356445, + "step": 265 + }, + { + "epoch": 0.17, + "grad_norm": 37.398311614990234, + "kl": 0.0, + "learning_rate": 4.5981507823613085e-07, + "logps/chosen": -270.73297119140625, + "logps/rejected": -390.40679931640625, + "loss": 0.2698, + "rewards/chosen": 0.8841345906257629, + "rewards/margins": 7.326432704925537, + "rewards/rejected": -6.442298889160156, + "step": 270 + }, + { + "epoch": 0.18, + "grad_norm": 43.30259704589844, + "kl": 0.0, + "learning_rate": 4.580369843527738e-07, + "logps/chosen": -305.30859375, + "logps/rejected": -456.27410888671875, + "loss": 0.2583, + "rewards/chosen": 1.079576849937439, + "rewards/margins": 9.011800765991211, + "rewards/rejected": -7.932224273681641, + "step": 275 + }, + { + "epoch": 0.18, + "grad_norm": 44.13119125366211, + "kl": 0.0, + "learning_rate": 4.562588904694168e-07, + "logps/chosen": -253.0196990966797, + "logps/rejected": -396.925048828125, + "loss": 0.2702, + "rewards/chosen": 0.8021785616874695, + "rewards/margins": 7.5618791580200195, + "rewards/rejected": -6.759700775146484, + "step": 280 + }, + { + "epoch": 0.18, + "grad_norm": 40.930442810058594, + "kl": 0.0, + "learning_rate": 4.544807965860597e-07, + "logps/chosen": -281.5596618652344, + "logps/rejected": -401.64459228515625, + "loss": 0.2626, + "rewards/chosen": 0.9091368913650513, + "rewards/margins": 8.310415267944336, + "rewards/rejected": -7.401278495788574, + "step": 285 + }, + { + "epoch": 0.19, + "grad_norm": 32.669708251953125, + "kl": 0.0, + "learning_rate": 4.5270270270270264e-07, + "logps/chosen": -256.7948303222656, + "logps/rejected": -414.135009765625, + "loss": 0.2561, + "rewards/chosen": 0.9493656158447266, + "rewards/margins": 8.324688911437988, + "rewards/rejected": -7.375323295593262, + "step": 290 + }, + { + "epoch": 0.19, + "grad_norm": 36.701683044433594, + "kl": 0.0, + "learning_rate": 4.509246088193456e-07, + "logps/chosen": -240.853759765625, + "logps/rejected": -393.74444580078125, + "loss": 0.2503, + "rewards/chosen": 0.7818459868431091, + "rewards/margins": 8.209932327270508, + "rewards/rejected": -7.428086280822754, + "step": 295 + }, + { + "epoch": 0.19, + "grad_norm": 34.59037399291992, + "kl": 0.0, + "learning_rate": 4.491465149359886e-07, + "logps/chosen": -269.46124267578125, + "logps/rejected": -425.86859130859375, + "loss": 0.2675, + "rewards/chosen": 0.9379463195800781, + "rewards/margins": 8.188257217407227, + "rewards/rejected": -7.250311374664307, + "step": 300 + }, + { + "epoch": 0.2, + "grad_norm": 36.5736198425293, + "kl": 0.0, + "learning_rate": 4.4736842105263156e-07, + "logps/chosen": -260.8907775878906, + "logps/rejected": -407.47393798828125, + "loss": 0.2638, + "rewards/chosen": 0.9083566665649414, + "rewards/margins": 8.074954986572266, + "rewards/rejected": -7.166598320007324, + "step": 305 + }, + { + "epoch": 0.2, + "grad_norm": 31.28874969482422, + "kl": 0.0, + "learning_rate": 4.4559032716927454e-07, + "logps/chosen": -239.89321899414062, + "logps/rejected": -407.52899169921875, + "loss": 0.2465, + "rewards/chosen": 0.998735249042511, + "rewards/margins": 7.849595069885254, + "rewards/rejected": -6.850859642028809, + "step": 310 + }, + { + "epoch": 0.2, + "grad_norm": 41.77967071533203, + "kl": 0.0, + "learning_rate": 4.438122332859175e-07, + "logps/chosen": -281.0654602050781, + "logps/rejected": -414.912841796875, + "loss": 0.2557, + "rewards/chosen": 1.065047264099121, + "rewards/margins": 8.236440658569336, + "rewards/rejected": -7.171392917633057, + "step": 315 + }, + { + "epoch": 0.2, + "grad_norm": 40.99684524536133, + "kl": 0.0, + "learning_rate": 4.420341394025605e-07, + "logps/chosen": -287.92938232421875, + "logps/rejected": -420.6644592285156, + "loss": 0.2581, + "rewards/chosen": 1.0740591287612915, + "rewards/margins": 8.318581581115723, + "rewards/rejected": -7.2445220947265625, + "step": 320 + }, + { + "epoch": 0.21, + "grad_norm": 34.950050354003906, + "kl": 0.0, + "learning_rate": 4.4025604551920335e-07, + "logps/chosen": -287.9638671875, + "logps/rejected": -412.91180419921875, + "loss": 0.253, + "rewards/chosen": 1.076217770576477, + "rewards/margins": 8.107794761657715, + "rewards/rejected": -7.031577110290527, + "step": 325 + }, + { + "epoch": 0.21, + "grad_norm": 33.869659423828125, + "kl": 0.0, + "learning_rate": 4.384779516358463e-07, + "logps/chosen": -264.02105712890625, + "logps/rejected": -427.40972900390625, + "loss": 0.2516, + "rewards/chosen": 1.0842151641845703, + "rewards/margins": 8.836532592773438, + "rewards/rejected": -7.752317905426025, + "step": 330 + }, + { + "epoch": 0.21, + "grad_norm": 33.70438766479492, + "kl": 0.0, + "learning_rate": 4.366998577524893e-07, + "logps/chosen": -261.1457824707031, + "logps/rejected": -386.77679443359375, + "loss": 0.272, + "rewards/chosen": 0.9901145696640015, + "rewards/margins": 8.374135971069336, + "rewards/rejected": -7.384020805358887, + "step": 335 + }, + { + "epoch": 0.22, + "grad_norm": 41.73198318481445, + "kl": 0.0, + "learning_rate": 4.3492176386913227e-07, + "logps/chosen": -273.5060119628906, + "logps/rejected": -395.254638671875, + "loss": 0.254, + "rewards/chosen": 1.3327045440673828, + "rewards/margins": 8.71018123626709, + "rewards/rejected": -7.377476692199707, + "step": 340 + }, + { + "epoch": 0.22, + "grad_norm": 38.74711227416992, + "kl": 0.0, + "learning_rate": 4.3314366998577524e-07, + "logps/chosen": -280.02056884765625, + "logps/rejected": -425.0785217285156, + "loss": 0.2242, + "rewards/chosen": 1.2169251441955566, + "rewards/margins": 8.982121467590332, + "rewards/rejected": -7.765196323394775, + "step": 345 + }, + { + "epoch": 0.22, + "grad_norm": 30.853017807006836, + "kl": 0.0, + "learning_rate": 4.313655761024182e-07, + "logps/chosen": -252.4849853515625, + "logps/rejected": -382.7063903808594, + "loss": 0.253, + "rewards/chosen": 1.2118154764175415, + "rewards/margins": 8.987495422363281, + "rewards/rejected": -7.775679588317871, + "step": 350 + }, + { + "epoch": 0.23, + "grad_norm": 37.265647888183594, + "kl": 0.0, + "learning_rate": 4.2958748221906114e-07, + "logps/chosen": -260.780029296875, + "logps/rejected": -381.5146484375, + "loss": 0.2455, + "rewards/chosen": 1.0870484113693237, + "rewards/margins": 8.00279426574707, + "rewards/rejected": -6.915744781494141, + "step": 355 + }, + { + "epoch": 0.23, + "grad_norm": 33.478515625, + "kl": 0.0, + "learning_rate": 4.278093883357041e-07, + "logps/chosen": -284.0282897949219, + "logps/rejected": -435.1630859375, + "loss": 0.2535, + "rewards/chosen": 1.0355793237686157, + "rewards/margins": 8.832125663757324, + "rewards/rejected": -7.79654598236084, + "step": 360 + }, + { + "epoch": 0.23, + "grad_norm": 38.28589630126953, + "kl": 0.0, + "learning_rate": 4.260312944523471e-07, + "logps/chosen": -274.62249755859375, + "logps/rejected": -414.0751953125, + "loss": 0.2505, + "rewards/chosen": 1.04520583152771, + "rewards/margins": 8.583889961242676, + "rewards/rejected": -7.5386834144592285, + "step": 365 + }, + { + "epoch": 0.24, + "grad_norm": 35.55525207519531, + "kl": 0.0, + "learning_rate": 4.2425320056899e-07, + "logps/chosen": -315.9610900878906, + "logps/rejected": -406.73321533203125, + "loss": 0.2546, + "rewards/chosen": 1.0429364442825317, + "rewards/margins": 8.117002487182617, + "rewards/rejected": -7.074066162109375, + "step": 370 + }, + { + "epoch": 0.24, + "grad_norm": 38.92825698852539, + "kl": 0.0, + "learning_rate": 4.22475106685633e-07, + "logps/chosen": -308.98614501953125, + "logps/rejected": -387.21124267578125, + "loss": 0.2592, + "rewards/chosen": 1.0128390789031982, + "rewards/margins": 8.51533031463623, + "rewards/rejected": -7.502490997314453, + "step": 375 + }, + { + "epoch": 0.24, + "grad_norm": 38.473472595214844, + "kl": 0.0, + "learning_rate": 4.2069701280227595e-07, + "logps/chosen": -258.6905212402344, + "logps/rejected": -383.15997314453125, + "loss": 0.2414, + "rewards/chosen": 1.2547948360443115, + "rewards/margins": 8.752579689025879, + "rewards/rejected": -7.497784614562988, + "step": 380 + }, + { + "epoch": 0.25, + "grad_norm": 30.740337371826172, + "kl": 0.0, + "learning_rate": 4.189189189189189e-07, + "logps/chosen": -272.1437683105469, + "logps/rejected": -419.9095764160156, + "loss": 0.2627, + "rewards/chosen": 1.0294225215911865, + "rewards/margins": 8.776593208312988, + "rewards/rejected": -7.747170448303223, + "step": 385 + }, + { + "epoch": 0.25, + "grad_norm": 31.76836395263672, + "kl": 0.0, + "learning_rate": 4.1714082503556185e-07, + "logps/chosen": -265.4826965332031, + "logps/rejected": -433.3349609375, + "loss": 0.2302, + "rewards/chosen": 1.2581713199615479, + "rewards/margins": 9.139310836791992, + "rewards/rejected": -7.881140232086182, + "step": 390 + }, + { + "epoch": 0.25, + "grad_norm": 33.282405853271484, + "kl": 0.0, + "learning_rate": 4.153627311522048e-07, + "logps/chosen": -274.07366943359375, + "logps/rejected": -414.37530517578125, + "loss": 0.2525, + "rewards/chosen": 1.0393680334091187, + "rewards/margins": 8.62564754486084, + "rewards/rejected": -7.586278438568115, + "step": 395 + }, + { + "epoch": 0.26, + "grad_norm": 31.313518524169922, + "kl": 0.0, + "learning_rate": 4.135846372688478e-07, + "logps/chosen": -246.4826202392578, + "logps/rejected": -426.61761474609375, + "loss": 0.2331, + "rewards/chosen": 1.1517924070358276, + "rewards/margins": 8.839007377624512, + "rewards/rejected": -7.687216281890869, + "step": 400 + }, + { + "epoch": 0.26, + "grad_norm": 35.637672424316406, + "kl": 0.0, + "learning_rate": 4.1180654338549077e-07, + "logps/chosen": -311.7854309082031, + "logps/rejected": -413.76788330078125, + "loss": 0.2488, + "rewards/chosen": 1.0609071254730225, + "rewards/margins": 8.962824821472168, + "rewards/rejected": -7.901917934417725, + "step": 405 + }, + { + "epoch": 0.26, + "grad_norm": 35.219844818115234, + "kl": 0.0, + "learning_rate": 4.100284495021337e-07, + "logps/chosen": -251.4270477294922, + "logps/rejected": -416.0596618652344, + "loss": 0.2488, + "rewards/chosen": 1.0597121715545654, + "rewards/margins": 8.908254623413086, + "rewards/rejected": -7.8485426902771, + "step": 410 + }, + { + "epoch": 0.27, + "grad_norm": 30.371381759643555, + "kl": 0.0, + "learning_rate": 4.082503556187766e-07, + "logps/chosen": -309.2123718261719, + "logps/rejected": -410.77972412109375, + "loss": 0.2512, + "rewards/chosen": 1.3098243474960327, + "rewards/margins": 8.587312698364258, + "rewards/rejected": -7.277488708496094, + "step": 415 + }, + { + "epoch": 0.27, + "grad_norm": 32.01753234863281, + "kl": 0.0, + "learning_rate": 4.064722617354196e-07, + "logps/chosen": -235.7932891845703, + "logps/rejected": -408.72283935546875, + "loss": 0.2391, + "rewards/chosen": 1.005859613418579, + "rewards/margins": 8.284139633178711, + "rewards/rejected": -7.278280735015869, + "step": 420 + }, + { + "epoch": 0.27, + "grad_norm": 38.12109375, + "kl": 0.0, + "learning_rate": 4.0469416785206256e-07, + "logps/chosen": -336.7947998046875, + "logps/rejected": -425.9994201660156, + "loss": 0.2431, + "rewards/chosen": 1.259305715560913, + "rewards/margins": 9.334249496459961, + "rewards/rejected": -8.074944496154785, + "step": 425 + }, + { + "epoch": 0.28, + "grad_norm": 41.13262176513672, + "kl": 0.0, + "learning_rate": 4.0291607396870553e-07, + "logps/chosen": -310.271240234375, + "logps/rejected": -399.4604187011719, + "loss": 0.2383, + "rewards/chosen": 1.3284270763397217, + "rewards/margins": 9.02976131439209, + "rewards/rejected": -7.7013349533081055, + "step": 430 + }, + { + "epoch": 0.28, + "grad_norm": 34.537960052490234, + "kl": 0.0, + "learning_rate": 4.011379800853485e-07, + "logps/chosen": -269.629150390625, + "logps/rejected": -415.3108825683594, + "loss": 0.2512, + "rewards/chosen": 1.2032315731048584, + "rewards/margins": 8.693357467651367, + "rewards/rejected": -7.4901251792907715, + "step": 435 + }, + { + "epoch": 0.28, + "grad_norm": 31.5653018951416, + "kl": 0.0, + "learning_rate": 3.993598862019915e-07, + "logps/chosen": -283.61785888671875, + "logps/rejected": -447.841552734375, + "loss": 0.2324, + "rewards/chosen": 1.1091759204864502, + "rewards/margins": 9.330785751342773, + "rewards/rejected": -8.221609115600586, + "step": 440 + }, + { + "epoch": 0.28, + "grad_norm": 37.161312103271484, + "kl": 0.0, + "learning_rate": 3.975817923186344e-07, + "logps/chosen": -268.5045471191406, + "logps/rejected": -409.74615478515625, + "loss": 0.2348, + "rewards/chosen": 1.2139015197753906, + "rewards/margins": 9.01981258392334, + "rewards/rejected": -7.805911064147949, + "step": 445 + }, + { + "epoch": 0.29, + "grad_norm": 49.3568000793457, + "kl": 0.0, + "learning_rate": 3.9580369843527737e-07, + "logps/chosen": -269.85748291015625, + "logps/rejected": -425.7276916503906, + "loss": 0.2633, + "rewards/chosen": 1.1275835037231445, + "rewards/margins": 8.786178588867188, + "rewards/rejected": -7.658595085144043, + "step": 450 + }, + { + "epoch": 0.29, + "grad_norm": 33.5339469909668, + "kl": 0.0, + "learning_rate": 3.940256045519203e-07, + "logps/chosen": -289.10333251953125, + "logps/rejected": -363.444580078125, + "loss": 0.2304, + "rewards/chosen": 1.321057677268982, + "rewards/margins": 8.864150047302246, + "rewards/rejected": -7.543093204498291, + "step": 455 + }, + { + "epoch": 0.29, + "grad_norm": 28.44658660888672, + "kl": 0.0, + "learning_rate": 3.9224751066856327e-07, + "logps/chosen": -275.4959411621094, + "logps/rejected": -464.95037841796875, + "loss": 0.2108, + "rewards/chosen": 1.447545051574707, + "rewards/margins": 9.413272857666016, + "rewards/rejected": -7.965727806091309, + "step": 460 + }, + { + "epoch": 0.3, + "grad_norm": 34.716697692871094, + "kl": 0.0, + "learning_rate": 3.9046941678520624e-07, + "logps/chosen": -294.32208251953125, + "logps/rejected": -413.497802734375, + "loss": 0.2439, + "rewards/chosen": 1.2425860166549683, + "rewards/margins": 8.84013557434082, + "rewards/rejected": -7.5975494384765625, + "step": 465 + }, + { + "epoch": 0.3, + "grad_norm": 39.656314849853516, + "kl": 0.0, + "learning_rate": 3.886913229018492e-07, + "logps/chosen": -271.3268737792969, + "logps/rejected": -388.7061462402344, + "loss": 0.248, + "rewards/chosen": 1.1116678714752197, + "rewards/margins": 8.070991516113281, + "rewards/rejected": -6.959322929382324, + "step": 470 + }, + { + "epoch": 0.3, + "grad_norm": 34.133079528808594, + "kl": 0.0, + "learning_rate": 3.8691322901849213e-07, + "logps/chosen": -287.83502197265625, + "logps/rejected": -384.031005859375, + "loss": 0.2423, + "rewards/chosen": 1.1618165969848633, + "rewards/margins": 7.873547554016113, + "rewards/rejected": -6.71173095703125, + "step": 475 + }, + { + "epoch": 0.31, + "grad_norm": 31.249347686767578, + "kl": 0.0, + "learning_rate": 3.851351351351351e-07, + "logps/chosen": -259.7761535644531, + "logps/rejected": -427.97686767578125, + "loss": 0.2102, + "rewards/chosen": 1.493577241897583, + "rewards/margins": 9.368111610412598, + "rewards/rejected": -7.874534606933594, + "step": 480 + }, + { + "epoch": 0.31, + "grad_norm": 38.84846496582031, + "kl": 0.0, + "learning_rate": 3.833570412517781e-07, + "logps/chosen": -257.1318664550781, + "logps/rejected": -424.8448791503906, + "loss": 0.2341, + "rewards/chosen": 1.1005711555480957, + "rewards/margins": 8.918696403503418, + "rewards/rejected": -7.818125247955322, + "step": 485 + }, + { + "epoch": 0.31, + "grad_norm": 33.03726577758789, + "kl": 0.0, + "learning_rate": 3.8157894736842105e-07, + "logps/chosen": -307.0685729980469, + "logps/rejected": -416.6412048339844, + "loss": 0.2222, + "rewards/chosen": 1.335828423500061, + "rewards/margins": 9.174932479858398, + "rewards/rejected": -7.839103698730469, + "step": 490 + }, + { + "epoch": 0.32, + "grad_norm": 31.244539260864258, + "kl": 0.0, + "learning_rate": 3.7980085348506403e-07, + "logps/chosen": -266.61871337890625, + "logps/rejected": -417.67144775390625, + "loss": 0.2396, + "rewards/chosen": 1.3928061723709106, + "rewards/margins": 9.07042121887207, + "rewards/rejected": -7.677615165710449, + "step": 495 + }, + { + "epoch": 0.32, + "grad_norm": 29.97535514831543, + "kl": 0.0, + "learning_rate": 3.7802275960170695e-07, + "logps/chosen": -241.9157257080078, + "logps/rejected": -433.5245056152344, + "loss": 0.2255, + "rewards/chosen": 1.3554142713546753, + "rewards/margins": 9.060113906860352, + "rewards/rejected": -7.704699516296387, + "step": 500 + }, + { + "epoch": 0.32, + "grad_norm": 29.664825439453125, + "kl": 0.0, + "learning_rate": 3.7624466571834987e-07, + "logps/chosen": -281.2408752441406, + "logps/rejected": -375.9148864746094, + "loss": 0.2331, + "rewards/chosen": 1.4141138792037964, + "rewards/margins": 9.32427978515625, + "rewards/rejected": -7.910165309906006, + "step": 505 + }, + { + "epoch": 0.33, + "grad_norm": 33.88861083984375, + "kl": 0.0, + "learning_rate": 3.7446657183499284e-07, + "logps/chosen": -326.9747314453125, + "logps/rejected": -423.695556640625, + "loss": 0.2435, + "rewards/chosen": 1.3004200458526611, + "rewards/margins": 9.390417098999023, + "rewards/rejected": -8.089996337890625, + "step": 510 + }, + { + "epoch": 0.33, + "grad_norm": 37.29655075073242, + "kl": 0.0, + "learning_rate": 3.726884779516358e-07, + "logps/chosen": -302.214599609375, + "logps/rejected": -448.48651123046875, + "loss": 0.2374, + "rewards/chosen": 1.234261393547058, + "rewards/margins": 9.143485069274902, + "rewards/rejected": -7.9092230796813965, + "step": 515 + }, + { + "epoch": 0.33, + "grad_norm": 34.4422607421875, + "kl": 0.0, + "learning_rate": 3.709103840682788e-07, + "logps/chosen": -280.8515625, + "logps/rejected": -424.0084533691406, + "loss": 0.2239, + "rewards/chosen": 1.3763736486434937, + "rewards/margins": 9.12721061706543, + "rewards/rejected": -7.750836372375488, + "step": 520 + }, + { + "epoch": 0.34, + "grad_norm": 36.52278137207031, + "kl": 0.0, + "learning_rate": 3.6913229018492176e-07, + "logps/chosen": -258.38580322265625, + "logps/rejected": -454.7950134277344, + "loss": 0.2394, + "rewards/chosen": 1.308040976524353, + "rewards/margins": 9.846240043640137, + "rewards/rejected": -8.538199424743652, + "step": 525 + }, + { + "epoch": 0.34, + "grad_norm": 32.81550979614258, + "kl": 0.0, + "learning_rate": 3.6735419630156474e-07, + "logps/chosen": -254.6020965576172, + "logps/rejected": -408.1050720214844, + "loss": 0.2352, + "rewards/chosen": 1.385811448097229, + "rewards/margins": 9.574739456176758, + "rewards/rejected": -8.18892765045166, + "step": 530 + }, + { + "epoch": 0.34, + "grad_norm": 34.74668502807617, + "kl": 0.0, + "learning_rate": 3.655761024182077e-07, + "logps/chosen": -298.1682434082031, + "logps/rejected": -367.279052734375, + "loss": 0.2462, + "rewards/chosen": 1.1377981901168823, + "rewards/margins": 8.490508079528809, + "rewards/rejected": -7.352709770202637, + "step": 535 + }, + { + "epoch": 0.35, + "grad_norm": 31.25318717956543, + "kl": 0.0, + "learning_rate": 3.637980085348506e-07, + "logps/chosen": -255.138671875, + "logps/rejected": -402.1342468261719, + "loss": 0.1937, + "rewards/chosen": 1.6373679637908936, + "rewards/margins": 9.590011596679688, + "rewards/rejected": -7.95264196395874, + "step": 540 + }, + { + "epoch": 0.35, + "grad_norm": 59.81840133666992, + "kl": 0.0, + "learning_rate": 3.6201991465149355e-07, + "logps/chosen": -272.1231384277344, + "logps/rejected": -433.9981384277344, + "loss": 0.2188, + "rewards/chosen": 1.2901055812835693, + "rewards/margins": 9.862019538879395, + "rewards/rejected": -8.57191276550293, + "step": 545 + }, + { + "epoch": 0.35, + "grad_norm": 32.66777801513672, + "kl": 0.0, + "learning_rate": 3.602418207681365e-07, + "logps/chosen": -245.57778930664062, + "logps/rejected": -429.46820068359375, + "loss": 0.2129, + "rewards/chosen": 1.5993635654449463, + "rewards/margins": 10.029012680053711, + "rewards/rejected": -8.429649353027344, + "step": 550 + }, + { + "epoch": 0.36, + "grad_norm": 33.075504302978516, + "kl": 0.0, + "learning_rate": 3.584637268847795e-07, + "logps/chosen": -256.0730285644531, + "logps/rejected": -392.3653259277344, + "loss": 0.2547, + "rewards/chosen": 1.0711311101913452, + "rewards/margins": 7.961878776550293, + "rewards/rejected": -6.890748023986816, + "step": 555 + }, + { + "epoch": 0.36, + "grad_norm": 27.854759216308594, + "kl": 0.0, + "learning_rate": 3.5668563300142247e-07, + "logps/chosen": -274.1041564941406, + "logps/rejected": -443.4473571777344, + "loss": 0.2106, + "rewards/chosen": 1.3565480709075928, + "rewards/margins": 9.13036060333252, + "rewards/rejected": -7.773812770843506, + "step": 560 + }, + { + "epoch": 0.36, + "grad_norm": 35.90435791015625, + "kl": 0.0, + "learning_rate": 3.5490753911806545e-07, + "logps/chosen": -203.62710571289062, + "logps/rejected": -415.73699951171875, + "loss": 0.2318, + "rewards/chosen": 1.3598759174346924, + "rewards/margins": 8.84917163848877, + "rewards/rejected": -7.489295959472656, + "step": 565 + }, + { + "epoch": 0.36, + "grad_norm": 35.37255859375, + "kl": 0.0, + "learning_rate": 3.5312944523470837e-07, + "logps/chosen": -290.1922912597656, + "logps/rejected": -400.71392822265625, + "loss": 0.2473, + "rewards/chosen": 1.21297287940979, + "rewards/margins": 8.549745559692383, + "rewards/rejected": -7.3367719650268555, + "step": 570 + }, + { + "epoch": 0.37, + "grad_norm": 32.8957405090332, + "kl": 0.0, + "learning_rate": 3.5135135135135134e-07, + "logps/chosen": -257.54644775390625, + "logps/rejected": -408.51837158203125, + "loss": 0.2254, + "rewards/chosen": 1.4590113162994385, + "rewards/margins": 9.009567260742188, + "rewards/rejected": -7.550555229187012, + "step": 575 + }, + { + "epoch": 0.37, + "grad_norm": 38.119930267333984, + "kl": 0.0, + "learning_rate": 3.495732574679943e-07, + "logps/chosen": -274.0845642089844, + "logps/rejected": -402.39837646484375, + "loss": 0.2353, + "rewards/chosen": 1.2590384483337402, + "rewards/margins": 9.06662654876709, + "rewards/rejected": -7.80758810043335, + "step": 580 + }, + { + "epoch": 0.37, + "grad_norm": 27.68585205078125, + "kl": 0.0, + "learning_rate": 3.4779516358463724e-07, + "logps/chosen": -308.3606872558594, + "logps/rejected": -440.54278564453125, + "loss": 0.2028, + "rewards/chosen": 1.6195247173309326, + "rewards/margins": 9.370535850524902, + "rewards/rejected": -7.751010894775391, + "step": 585 + }, + { + "epoch": 0.38, + "grad_norm": 33.99714279174805, + "kl": 0.0, + "learning_rate": 3.460170697012802e-07, + "logps/chosen": -242.11557006835938, + "logps/rejected": -413.9893493652344, + "loss": 0.2192, + "rewards/chosen": 1.276958703994751, + "rewards/margins": 9.278302192687988, + "rewards/rejected": -8.0013427734375, + "step": 590 + }, + { + "epoch": 0.38, + "grad_norm": 25.458580017089844, + "kl": 0.0, + "learning_rate": 3.442389758179232e-07, + "logps/chosen": -287.11602783203125, + "logps/rejected": -398.76226806640625, + "loss": 0.2048, + "rewards/chosen": 1.4868042469024658, + "rewards/margins": 9.341771125793457, + "rewards/rejected": -7.854966640472412, + "step": 595 + }, + { + "epoch": 0.38, + "grad_norm": 26.453166961669922, + "kl": 0.0, + "learning_rate": 3.424608819345661e-07, + "logps/chosen": -273.73828125, + "logps/rejected": -429.99725341796875, + "loss": 0.229, + "rewards/chosen": 1.4580796957015991, + "rewards/margins": 9.377992630004883, + "rewards/rejected": -7.919912815093994, + "step": 600 + }, + { + "epoch": 0.39, + "grad_norm": 30.808612823486328, + "kl": 0.0, + "learning_rate": 3.406827880512091e-07, + "logps/chosen": -279.26220703125, + "logps/rejected": -450.73077392578125, + "loss": 0.2029, + "rewards/chosen": 1.528760313987732, + "rewards/margins": 9.553967475891113, + "rewards/rejected": -8.025206565856934, + "step": 605 + }, + { + "epoch": 0.39, + "grad_norm": 33.38883972167969, + "kl": 0.0, + "learning_rate": 3.3890469416785205e-07, + "logps/chosen": -230.0989227294922, + "logps/rejected": -422.76287841796875, + "loss": 0.215, + "rewards/chosen": 1.429572343826294, + "rewards/margins": 9.307806968688965, + "rewards/rejected": -7.87823486328125, + "step": 610 + }, + { + "epoch": 0.39, + "grad_norm": 36.601112365722656, + "kl": 0.0, + "learning_rate": 3.37126600284495e-07, + "logps/chosen": -331.847900390625, + "logps/rejected": -389.3409118652344, + "loss": 0.2412, + "rewards/chosen": 1.426178216934204, + "rewards/margins": 8.995063781738281, + "rewards/rejected": -7.568885803222656, + "step": 615 + }, + { + "epoch": 0.4, + "grad_norm": 34.287841796875, + "kl": 0.0, + "learning_rate": 3.35348506401138e-07, + "logps/chosen": -242.3442840576172, + "logps/rejected": -403.28692626953125, + "loss": 0.2415, + "rewards/chosen": 1.321367859840393, + "rewards/margins": 8.539412498474121, + "rewards/rejected": -7.218045234680176, + "step": 620 + }, + { + "epoch": 0.4, + "grad_norm": 26.556983947753906, + "kl": 0.0, + "learning_rate": 3.335704125177809e-07, + "logps/chosen": -270.54498291015625, + "logps/rejected": -396.00115966796875, + "loss": 0.2024, + "rewards/chosen": 1.631829857826233, + "rewards/margins": 9.575605392456055, + "rewards/rejected": -7.943775177001953, + "step": 625 + }, + { + "epoch": 0.4, + "grad_norm": 32.46802520751953, + "kl": 0.0, + "learning_rate": 3.3179231863442384e-07, + "logps/chosen": -271.47515869140625, + "logps/rejected": -448.47247314453125, + "loss": 0.2425, + "rewards/chosen": 1.432576060295105, + "rewards/margins": 10.182676315307617, + "rewards/rejected": -8.750100135803223, + "step": 630 + }, + { + "epoch": 0.41, + "grad_norm": 33.824581146240234, + "kl": 0.0, + "learning_rate": 3.300142247510668e-07, + "logps/chosen": -262.5051574707031, + "logps/rejected": -421.6875915527344, + "loss": 0.1999, + "rewards/chosen": 1.4772937297821045, + "rewards/margins": 9.918327331542969, + "rewards/rejected": -8.441034317016602, + "step": 635 + }, + { + "epoch": 0.41, + "grad_norm": 30.441814422607422, + "kl": 0.0, + "learning_rate": 3.282361308677098e-07, + "logps/chosen": -262.089111328125, + "logps/rejected": -440.53369140625, + "loss": 0.2073, + "rewards/chosen": 1.5038022994995117, + "rewards/margins": 10.03141975402832, + "rewards/rejected": -8.527616500854492, + "step": 640 + }, + { + "epoch": 0.41, + "grad_norm": 28.910472869873047, + "kl": 0.0, + "learning_rate": 3.2645803698435276e-07, + "logps/chosen": -262.71087646484375, + "logps/rejected": -396.29339599609375, + "loss": 0.2451, + "rewards/chosen": 1.2832496166229248, + "rewards/margins": 8.926631927490234, + "rewards/rejected": -7.6433820724487305, + "step": 645 + }, + { + "epoch": 0.42, + "grad_norm": 21.52187728881836, + "kl": 0.0, + "learning_rate": 3.2467994310099573e-07, + "logps/chosen": -247.0696563720703, + "logps/rejected": -412.88330078125, + "loss": 0.2268, + "rewards/chosen": 1.3726212978363037, + "rewards/margins": 9.311899185180664, + "rewards/rejected": -7.939278602600098, + "step": 650 + }, + { + "epoch": 0.42, + "grad_norm": 31.051502227783203, + "kl": 0.0, + "learning_rate": 3.229018492176387e-07, + "logps/chosen": -254.9578094482422, + "logps/rejected": -414.27490234375, + "loss": 0.2138, + "rewards/chosen": 1.6382958889007568, + "rewards/margins": 9.529626846313477, + "rewards/rejected": -7.891331672668457, + "step": 655 + }, + { + "epoch": 0.42, + "grad_norm": 31.465892791748047, + "kl": 0.0, + "learning_rate": 3.211237553342817e-07, + "logps/chosen": -263.1916198730469, + "logps/rejected": -452.6835021972656, + "loss": 0.2124, + "rewards/chosen": 1.2616233825683594, + "rewards/margins": 10.00756549835205, + "rewards/rejected": -8.745941162109375, + "step": 660 + }, + { + "epoch": 0.43, + "grad_norm": 29.08134651184082, + "kl": 0.0, + "learning_rate": 3.193456614509246e-07, + "logps/chosen": -268.81488037109375, + "logps/rejected": -389.06536865234375, + "loss": 0.211, + "rewards/chosen": 1.7246748208999634, + "rewards/margins": 9.57551097869873, + "rewards/rejected": -7.850836753845215, + "step": 665 + }, + { + "epoch": 0.43, + "grad_norm": 31.119674682617188, + "kl": 0.0, + "learning_rate": 3.175675675675675e-07, + "logps/chosen": -271.3936462402344, + "logps/rejected": -438.13897705078125, + "loss": 0.2077, + "rewards/chosen": 1.4627150297164917, + "rewards/margins": 10.253105163574219, + "rewards/rejected": -8.790389060974121, + "step": 670 + }, + { + "epoch": 0.43, + "grad_norm": 27.952205657958984, + "kl": 0.0, + "learning_rate": 3.157894736842105e-07, + "logps/chosen": -246.884765625, + "logps/rejected": -417.5811462402344, + "loss": 0.2216, + "rewards/chosen": 1.5833324193954468, + "rewards/margins": 10.267784118652344, + "rewards/rejected": -8.684452056884766, + "step": 675 + }, + { + "epoch": 0.44, + "grad_norm": 29.325481414794922, + "kl": 0.0, + "learning_rate": 3.1401137980085347e-07, + "logps/chosen": -276.2046203613281, + "logps/rejected": -433.17401123046875, + "loss": 0.2028, + "rewards/chosen": 1.9258476495742798, + "rewards/margins": 10.283378601074219, + "rewards/rejected": -8.35753059387207, + "step": 680 + }, + { + "epoch": 0.44, + "grad_norm": 29.178499221801758, + "kl": 0.0, + "learning_rate": 3.1223328591749644e-07, + "logps/chosen": -256.66241455078125, + "logps/rejected": -422.0272521972656, + "loss": 0.2028, + "rewards/chosen": 1.725765585899353, + "rewards/margins": 10.144853591918945, + "rewards/rejected": -8.419087409973145, + "step": 685 + }, + { + "epoch": 0.44, + "grad_norm": 36.10505294799805, + "kl": 0.0, + "learning_rate": 3.104551920341394e-07, + "logps/chosen": -258.7532653808594, + "logps/rejected": -423.11517333984375, + "loss": 0.1846, + "rewards/chosen": 1.7036828994750977, + "rewards/margins": 10.446949005126953, + "rewards/rejected": -8.743266105651855, + "step": 690 + }, + { + "epoch": 0.44, + "grad_norm": 29.056480407714844, + "kl": 0.0, + "learning_rate": 3.0867709815078234e-07, + "logps/chosen": -255.25146484375, + "logps/rejected": -418.20904541015625, + "loss": 0.2052, + "rewards/chosen": 1.4203163385391235, + "rewards/margins": 9.596684455871582, + "rewards/rejected": -8.17636775970459, + "step": 695 + }, + { + "epoch": 0.45, + "grad_norm": 28.945829391479492, + "kl": 0.0, + "learning_rate": 3.068990042674253e-07, + "logps/chosen": -278.22515869140625, + "logps/rejected": -429.06988525390625, + "loss": 0.2215, + "rewards/chosen": 1.5914744138717651, + "rewards/margins": 10.057108879089355, + "rewards/rejected": -8.4656343460083, + "step": 700 + }, + { + "epoch": 0.45, + "grad_norm": 38.864200592041016, + "kl": 0.0, + "learning_rate": 3.051209103840683e-07, + "logps/chosen": -298.11981201171875, + "logps/rejected": -401.8730163574219, + "loss": 0.2159, + "rewards/chosen": 1.4088857173919678, + "rewards/margins": 9.174168586730957, + "rewards/rejected": -7.765283107757568, + "step": 705 + }, + { + "epoch": 0.45, + "grad_norm": 30.77165412902832, + "kl": 0.0, + "learning_rate": 3.033428165007112e-07, + "logps/chosen": -294.92120361328125, + "logps/rejected": -458.24859619140625, + "loss": 0.2173, + "rewards/chosen": 1.629966378211975, + "rewards/margins": 10.43989372253418, + "rewards/rejected": -8.809928894042969, + "step": 710 + }, + { + "epoch": 0.46, + "grad_norm": 35.23933410644531, + "kl": 0.0, + "learning_rate": 3.015647226173542e-07, + "logps/chosen": -254.14697265625, + "logps/rejected": -409.28619384765625, + "loss": 0.2258, + "rewards/chosen": 1.2798521518707275, + "rewards/margins": 9.090901374816895, + "rewards/rejected": -7.811048984527588, + "step": 715 + }, + { + "epoch": 0.46, + "grad_norm": 36.05426025390625, + "kl": 0.0, + "learning_rate": 2.9978662873399715e-07, + "logps/chosen": -253.47683715820312, + "logps/rejected": -453.486083984375, + "loss": 0.2104, + "rewards/chosen": 1.5098592042922974, + "rewards/margins": 10.354707717895508, + "rewards/rejected": -8.844846725463867, + "step": 720 + }, + { + "epoch": 0.46, + "grad_norm": 33.010982513427734, + "kl": 0.0, + "learning_rate": 2.9800853485064007e-07, + "logps/chosen": -312.1941833496094, + "logps/rejected": -457.25762939453125, + "loss": 0.205, + "rewards/chosen": 1.6310077905654907, + "rewards/margins": 11.00071907043457, + "rewards/rejected": -9.369711875915527, + "step": 725 + }, + { + "epoch": 0.47, + "grad_norm": 30.9288387298584, + "kl": 0.0, + "learning_rate": 2.9623044096728305e-07, + "logps/chosen": -243.4445037841797, + "logps/rejected": -434.6583557128906, + "loss": 0.1894, + "rewards/chosen": 1.8518720865249634, + "rewards/margins": 11.06387996673584, + "rewards/rejected": -9.212007522583008, + "step": 730 + }, + { + "epoch": 0.47, + "grad_norm": 33.26964569091797, + "kl": 0.0, + "learning_rate": 2.94452347083926e-07, + "logps/chosen": -273.52972412109375, + "logps/rejected": -438.6238708496094, + "loss": 0.1873, + "rewards/chosen": 1.8763511180877686, + "rewards/margins": 10.942605972290039, + "rewards/rejected": -9.066253662109375, + "step": 735 + }, + { + "epoch": 0.47, + "grad_norm": 27.57000732421875, + "kl": 0.0, + "learning_rate": 2.92674253200569e-07, + "logps/chosen": -311.89337158203125, + "logps/rejected": -431.88311767578125, + "loss": 0.1965, + "rewards/chosen": 1.648795485496521, + "rewards/margins": 10.737363815307617, + "rewards/rejected": -9.088567733764648, + "step": 740 + }, + { + "epoch": 0.48, + "grad_norm": 33.400115966796875, + "kl": 0.0, + "learning_rate": 2.9089615931721197e-07, + "logps/chosen": -285.564697265625, + "logps/rejected": -427.61492919921875, + "loss": 0.229, + "rewards/chosen": 1.5363407135009766, + "rewards/margins": 10.41348934173584, + "rewards/rejected": -8.87714958190918, + "step": 745 + }, + { + "epoch": 0.48, + "grad_norm": 38.698974609375, + "kl": 0.0, + "learning_rate": 2.8911806543385494e-07, + "logps/chosen": -323.2370910644531, + "logps/rejected": -412.75714111328125, + "loss": 0.237, + "rewards/chosen": 1.409673810005188, + "rewards/margins": 10.063661575317383, + "rewards/rejected": -8.6539888381958, + "step": 750 + }, + { + "epoch": 0.48, + "grad_norm": 31.202178955078125, + "kl": 0.0, + "learning_rate": 2.873399715504978e-07, + "logps/chosen": -233.3540496826172, + "logps/rejected": -404.90667724609375, + "loss": 0.2033, + "rewards/chosen": 1.9290201663970947, + "rewards/margins": 10.473341941833496, + "rewards/rejected": -8.544321060180664, + "step": 755 + }, + { + "epoch": 0.49, + "grad_norm": 32.82831954956055, + "kl": 0.0, + "learning_rate": 2.855618776671408e-07, + "logps/chosen": -264.25909423828125, + "logps/rejected": -425.609375, + "loss": 0.2372, + "rewards/chosen": 1.3035495281219482, + "rewards/margins": 10.148138046264648, + "rewards/rejected": -8.844587326049805, + "step": 760 + }, + { + "epoch": 0.49, + "grad_norm": 31.877046585083008, + "kl": 0.0, + "learning_rate": 2.8378378378378376e-07, + "logps/chosen": -267.76617431640625, + "logps/rejected": -414.3863830566406, + "loss": 0.2131, + "rewards/chosen": 1.630545973777771, + "rewards/margins": 9.6016845703125, + "rewards/rejected": -7.971138000488281, + "step": 765 + }, + { + "epoch": 0.49, + "grad_norm": 27.605052947998047, + "kl": 0.0, + "learning_rate": 2.8200568990042673e-07, + "logps/chosen": -264.8264465332031, + "logps/rejected": -415.3543395996094, + "loss": 0.1985, + "rewards/chosen": 1.7074753046035767, + "rewards/margins": 10.28056526184082, + "rewards/rejected": -8.573090553283691, + "step": 770 + }, + { + "epoch": 0.5, + "grad_norm": 31.359970092773438, + "kl": 0.0, + "learning_rate": 2.802275960170697e-07, + "logps/chosen": -304.1432189941406, + "logps/rejected": -396.9605407714844, + "loss": 0.1959, + "rewards/chosen": 1.5722863674163818, + "rewards/margins": 9.709434509277344, + "rewards/rejected": -8.1371488571167, + "step": 775 + }, + { + "epoch": 0.5, + "grad_norm": 26.068336486816406, + "kl": 0.0, + "learning_rate": 2.784495021337127e-07, + "logps/chosen": -244.50808715820312, + "logps/rejected": -431.31573486328125, + "loss": 0.2077, + "rewards/chosen": 1.5841686725616455, + "rewards/margins": 10.045574188232422, + "rewards/rejected": -8.461404800415039, + "step": 780 + }, + { + "epoch": 0.5, + "grad_norm": 34.23469924926758, + "kl": 0.0, + "learning_rate": 2.766714082503556e-07, + "logps/chosen": -309.2042236328125, + "logps/rejected": -385.1072692871094, + "loss": 0.2296, + "rewards/chosen": 1.528110146522522, + "rewards/margins": 8.884299278259277, + "rewards/rejected": -7.356189727783203, + "step": 785 + }, + { + "epoch": 0.51, + "grad_norm": 31.527212142944336, + "kl": 0.0, + "learning_rate": 2.7489331436699857e-07, + "logps/chosen": -261.33660888671875, + "logps/rejected": -397.09765625, + "loss": 0.1996, + "rewards/chosen": 1.783190131187439, + "rewards/margins": 9.507209777832031, + "rewards/rejected": -7.724020481109619, + "step": 790 + }, + { + "epoch": 0.51, + "grad_norm": 27.93092918395996, + "kl": 0.0, + "learning_rate": 2.7311522048364154e-07, + "logps/chosen": -281.72003173828125, + "logps/rejected": -442.7430725097656, + "loss": 0.1926, + "rewards/chosen": 1.924103021621704, + "rewards/margins": 9.900445938110352, + "rewards/rejected": -7.97634220123291, + "step": 795 + }, + { + "epoch": 0.51, + "grad_norm": 25.62993621826172, + "kl": 0.0, + "learning_rate": 2.7133712660028446e-07, + "logps/chosen": -294.6544494628906, + "logps/rejected": -436.26611328125, + "loss": 0.1812, + "rewards/chosen": 1.679582953453064, + "rewards/margins": 9.951603889465332, + "rewards/rejected": -8.272021293640137, + "step": 800 + }, + { + "epoch": 0.52, + "grad_norm": 31.204952239990234, + "kl": 0.0, + "learning_rate": 2.6955903271692744e-07, + "logps/chosen": -276.06512451171875, + "logps/rejected": -445.56268310546875, + "loss": 0.1795, + "rewards/chosen": 1.8605276346206665, + "rewards/margins": 10.302029609680176, + "rewards/rejected": -8.44150161743164, + "step": 805 + }, + { + "epoch": 0.52, + "grad_norm": 27.505334854125977, + "kl": 0.0, + "learning_rate": 2.677809388335704e-07, + "logps/chosen": -261.7455139160156, + "logps/rejected": -428.7049255371094, + "loss": 0.2231, + "rewards/chosen": 1.501050591468811, + "rewards/margins": 9.754376411437988, + "rewards/rejected": -8.253324508666992, + "step": 810 + }, + { + "epoch": 0.52, + "grad_norm": 36.04350662231445, + "kl": 0.0, + "learning_rate": 2.6600284495021333e-07, + "logps/chosen": -243.1392059326172, + "logps/rejected": -394.90814208984375, + "loss": 0.1914, + "rewards/chosen": 1.5635383129119873, + "rewards/margins": 9.669111251831055, + "rewards/rejected": -8.105573654174805, + "step": 815 + }, + { + "epoch": 0.52, + "grad_norm": 32.719242095947266, + "kl": 0.0, + "learning_rate": 2.642247510668563e-07, + "logps/chosen": -274.9061584472656, + "logps/rejected": -421.139892578125, + "loss": 0.2005, + "rewards/chosen": 2.0269622802734375, + "rewards/margins": 10.025403022766113, + "rewards/rejected": -7.998439788818359, + "step": 820 + }, + { + "epoch": 0.53, + "grad_norm": 32.51384353637695, + "kl": 0.0, + "learning_rate": 2.624466571834993e-07, + "logps/chosen": -271.8145446777344, + "logps/rejected": -428.3387145996094, + "loss": 0.178, + "rewards/chosen": 1.7497066259384155, + "rewards/margins": 10.334202766418457, + "rewards/rejected": -8.584495544433594, + "step": 825 + }, + { + "epoch": 0.53, + "grad_norm": 32.91698455810547, + "kl": 0.0, + "learning_rate": 2.6066856330014225e-07, + "logps/chosen": -274.6688537597656, + "logps/rejected": -424.2642517089844, + "loss": 0.1726, + "rewards/chosen": 1.8553260564804077, + "rewards/margins": 10.73515510559082, + "rewards/rejected": -8.879829406738281, + "step": 830 + }, + { + "epoch": 0.53, + "grad_norm": 31.989952087402344, + "kl": 0.0, + "learning_rate": 2.5889046941678523e-07, + "logps/chosen": -278.80364990234375, + "logps/rejected": -393.033935546875, + "loss": 0.1894, + "rewards/chosen": 2.037987470626831, + "rewards/margins": 10.227518081665039, + "rewards/rejected": -8.189531326293945, + "step": 835 + }, + { + "epoch": 0.54, + "grad_norm": 32.226680755615234, + "kl": 0.0, + "learning_rate": 2.5711237553342815e-07, + "logps/chosen": -261.1275634765625, + "logps/rejected": -425.26690673828125, + "loss": 0.2044, + "rewards/chosen": 1.545175552368164, + "rewards/margins": 9.810770988464355, + "rewards/rejected": -8.265595436096191, + "step": 840 + }, + { + "epoch": 0.54, + "grad_norm": 33.65993881225586, + "kl": 0.0, + "learning_rate": 2.5533428165007107e-07, + "logps/chosen": -260.96636962890625, + "logps/rejected": -425.2252502441406, + "loss": 0.2119, + "rewards/chosen": 1.733242392539978, + "rewards/margins": 10.170026779174805, + "rewards/rejected": -8.436783790588379, + "step": 845 + }, + { + "epoch": 0.54, + "grad_norm": 26.046958923339844, + "kl": 0.0, + "learning_rate": 2.5355618776671404e-07, + "logps/chosen": -253.7927703857422, + "logps/rejected": -441.16845703125, + "loss": 0.2048, + "rewards/chosen": 1.948599100112915, + "rewards/margins": 10.65324592590332, + "rewards/rejected": -8.704646110534668, + "step": 850 + }, + { + "epoch": 0.55, + "grad_norm": 27.351463317871094, + "kl": 0.0, + "learning_rate": 2.51778093883357e-07, + "logps/chosen": -272.27984619140625, + "logps/rejected": -431.0148010253906, + "loss": 0.1727, + "rewards/chosen": 2.044630527496338, + "rewards/margins": 10.910379409790039, + "rewards/rejected": -8.865748405456543, + "step": 855 + }, + { + "epoch": 0.55, + "grad_norm": 29.846065521240234, + "kl": 0.0, + "learning_rate": 2.5e-07, + "logps/chosen": -303.9323425292969, + "logps/rejected": -408.2849426269531, + "loss": 0.1872, + "rewards/chosen": 2.064297914505005, + "rewards/margins": 9.957128524780273, + "rewards/rejected": -7.892831325531006, + "step": 860 + }, + { + "epoch": 0.55, + "grad_norm": 23.12837791442871, + "kl": 0.0, + "learning_rate": 2.4822190611664296e-07, + "logps/chosen": -256.0515441894531, + "logps/rejected": -444.7254943847656, + "loss": 0.1889, + "rewards/chosen": 1.7166566848754883, + "rewards/margins": 10.624780654907227, + "rewards/rejected": -8.908124923706055, + "step": 865 + }, + { + "epoch": 0.56, + "grad_norm": 35.630897521972656, + "kl": 0.0, + "learning_rate": 2.4644381223328594e-07, + "logps/chosen": -264.610107421875, + "logps/rejected": -418.3433532714844, + "loss": 0.2181, + "rewards/chosen": 1.5745290517807007, + "rewards/margins": 10.60425090789795, + "rewards/rejected": -9.029722213745117, + "step": 870 + }, + { + "epoch": 0.56, + "grad_norm": 28.484664916992188, + "kl": 0.0, + "learning_rate": 2.4466571834992886e-07, + "logps/chosen": -272.72271728515625, + "logps/rejected": -433.21173095703125, + "loss": 0.1801, + "rewards/chosen": 2.46614146232605, + "rewards/margins": 10.789461135864258, + "rewards/rejected": -8.323318481445312, + "step": 875 + }, + { + "epoch": 0.56, + "grad_norm": 24.918109893798828, + "kl": 0.0, + "learning_rate": 2.4288762446657183e-07, + "logps/chosen": -256.21923828125, + "logps/rejected": -436.86376953125, + "loss": 0.1901, + "rewards/chosen": 1.860933542251587, + "rewards/margins": 10.575204849243164, + "rewards/rejected": -8.714271545410156, + "step": 880 + }, + { + "epoch": 0.57, + "grad_norm": 22.929967880249023, + "kl": 0.0, + "learning_rate": 2.411095305832148e-07, + "logps/chosen": -261.22625732421875, + "logps/rejected": -396.7969665527344, + "loss": 0.2018, + "rewards/chosen": 1.8665907382965088, + "rewards/margins": 10.46824836730957, + "rewards/rejected": -8.601656913757324, + "step": 885 + }, + { + "epoch": 0.57, + "grad_norm": 27.053821563720703, + "kl": 0.0, + "learning_rate": 2.393314366998578e-07, + "logps/chosen": -263.8940124511719, + "logps/rejected": -437.24969482421875, + "loss": 0.1789, + "rewards/chosen": 1.9356715679168701, + "rewards/margins": 11.14433765411377, + "rewards/rejected": -9.20866584777832, + "step": 890 + }, + { + "epoch": 0.57, + "grad_norm": 32.19890594482422, + "kl": 0.0, + "learning_rate": 2.375533428165007e-07, + "logps/chosen": -273.77117919921875, + "logps/rejected": -406.4714050292969, + "loss": 0.1788, + "rewards/chosen": 2.0741090774536133, + "rewards/margins": 10.467344284057617, + "rewards/rejected": -8.393235206604004, + "step": 895 + }, + { + "epoch": 0.58, + "grad_norm": 34.50763702392578, + "kl": 0.0, + "learning_rate": 2.3577524893314365e-07, + "logps/chosen": -288.15338134765625, + "logps/rejected": -444.86419677734375, + "loss": 0.1858, + "rewards/chosen": 1.712510108947754, + "rewards/margins": 10.688920974731445, + "rewards/rejected": -8.976411819458008, + "step": 900 + }, + { + "epoch": 0.58, + "grad_norm": 31.742473602294922, + "kl": 0.0, + "learning_rate": 2.3399715504978662e-07, + "logps/chosen": -292.7054748535156, + "logps/rejected": -449.0078125, + "loss": 0.1891, + "rewards/chosen": 2.181274175643921, + "rewards/margins": 10.978649139404297, + "rewards/rejected": -8.797374725341797, + "step": 905 + }, + { + "epoch": 0.58, + "grad_norm": 26.00906753540039, + "kl": 0.0, + "learning_rate": 2.322190611664296e-07, + "logps/chosen": -267.0461730957031, + "logps/rejected": -465.12689208984375, + "loss": 0.1882, + "rewards/chosen": 1.6607955694198608, + "rewards/margins": 11.397066116333008, + "rewards/rejected": -9.7362699508667, + "step": 910 + }, + { + "epoch": 0.59, + "grad_norm": 27.99995231628418, + "kl": 0.0, + "learning_rate": 2.304409672830725e-07, + "logps/chosen": -289.6124267578125, + "logps/rejected": -420.7085876464844, + "loss": 0.1721, + "rewards/chosen": 2.0229907035827637, + "rewards/margins": 11.071403503417969, + "rewards/rejected": -9.048412322998047, + "step": 915 + }, + { + "epoch": 0.59, + "grad_norm": 34.18607711791992, + "kl": 0.0, + "learning_rate": 2.2866287339971549e-07, + "logps/chosen": -295.18035888671875, + "logps/rejected": -436.540283203125, + "loss": 0.1805, + "rewards/chosen": 1.8851134777069092, + "rewards/margins": 10.579368591308594, + "rewards/rejected": -8.694254875183105, + "step": 920 + }, + { + "epoch": 0.59, + "grad_norm": 33.18037033081055, + "kl": 0.0, + "learning_rate": 2.2688477951635846e-07, + "logps/chosen": -291.39752197265625, + "logps/rejected": -442.94781494140625, + "loss": 0.1905, + "rewards/chosen": 1.595261573791504, + "rewards/margins": 10.855417251586914, + "rewards/rejected": -9.26015567779541, + "step": 925 + }, + { + "epoch": 0.6, + "grad_norm": 36.65940475463867, + "kl": 0.0, + "learning_rate": 2.251066856330014e-07, + "logps/chosen": -297.91693115234375, + "logps/rejected": -424.9837951660156, + "loss": 0.1963, + "rewards/chosen": 1.8958244323730469, + "rewards/margins": 10.079604148864746, + "rewards/rejected": -8.1837797164917, + "step": 930 + }, + { + "epoch": 0.6, + "grad_norm": 24.643966674804688, + "kl": 0.0, + "learning_rate": 2.2332859174964438e-07, + "logps/chosen": -288.42578125, + "logps/rejected": -416.49090576171875, + "loss": 0.1792, + "rewards/chosen": 1.7624881267547607, + "rewards/margins": 10.258140563964844, + "rewards/rejected": -8.49565315246582, + "step": 935 + }, + { + "epoch": 0.6, + "grad_norm": 32.83414077758789, + "kl": 0.0, + "learning_rate": 2.2155049786628733e-07, + "logps/chosen": -245.7664794921875, + "logps/rejected": -404.72698974609375, + "loss": 0.1879, + "rewards/chosen": 1.6799345016479492, + "rewards/margins": 10.454665184020996, + "rewards/rejected": -8.774730682373047, + "step": 940 + }, + { + "epoch": 0.6, + "grad_norm": 24.070175170898438, + "kl": 0.0, + "learning_rate": 2.1977240398293027e-07, + "logps/chosen": -261.0927734375, + "logps/rejected": -446.99908447265625, + "loss": 0.1742, + "rewards/chosen": 2.0352275371551514, + "rewards/margins": 10.900886535644531, + "rewards/rejected": -8.865659713745117, + "step": 945 + }, + { + "epoch": 0.61, + "grad_norm": 30.027536392211914, + "kl": 0.0, + "learning_rate": 2.1799431009957325e-07, + "logps/chosen": -270.7321472167969, + "logps/rejected": -454.71087646484375, + "loss": 0.1864, + "rewards/chosen": 1.623396635055542, + "rewards/margins": 10.708158493041992, + "rewards/rejected": -9.084760665893555, + "step": 950 + }, + { + "epoch": 0.61, + "grad_norm": 28.615467071533203, + "kl": 0.0, + "learning_rate": 2.1621621621621622e-07, + "logps/chosen": -307.35498046875, + "logps/rejected": -417.56689453125, + "loss": 0.1691, + "rewards/chosen": 2.1113579273223877, + "rewards/margins": 10.654966354370117, + "rewards/rejected": -8.543607711791992, + "step": 955 + }, + { + "epoch": 0.61, + "grad_norm": 36.107337951660156, + "kl": 0.0, + "learning_rate": 2.1443812233285914e-07, + "logps/chosen": -277.4281005859375, + "logps/rejected": -441.26190185546875, + "loss": 0.1748, + "rewards/chosen": 1.8290290832519531, + "rewards/margins": 11.018919944763184, + "rewards/rejected": -9.189891815185547, + "step": 960 + }, + { + "epoch": 0.62, + "grad_norm": 30.27995491027832, + "kl": 0.0, + "learning_rate": 2.1266002844950212e-07, + "logps/chosen": -252.20654296875, + "logps/rejected": -385.89166259765625, + "loss": 0.2177, + "rewards/chosen": 1.4553921222686768, + "rewards/margins": 10.100979804992676, + "rewards/rejected": -8.645586967468262, + "step": 965 + }, + { + "epoch": 0.62, + "grad_norm": 32.431095123291016, + "kl": 0.0, + "learning_rate": 2.108819345661451e-07, + "logps/chosen": -323.1937561035156, + "logps/rejected": -399.3692626953125, + "loss": 0.169, + "rewards/chosen": 2.363802671432495, + "rewards/margins": 10.098939895629883, + "rewards/rejected": -7.735138893127441, + "step": 970 + }, + { + "epoch": 0.62, + "grad_norm": 32.58518600463867, + "kl": 0.0, + "learning_rate": 2.0910384068278806e-07, + "logps/chosen": -290.06280517578125, + "logps/rejected": -453.532958984375, + "loss": 0.169, + "rewards/chosen": 2.02885365486145, + "rewards/margins": 11.639073371887207, + "rewards/rejected": -9.610219955444336, + "step": 975 + }, + { + "epoch": 0.63, + "grad_norm": 27.531936645507812, + "kl": 0.0, + "learning_rate": 2.0732574679943098e-07, + "logps/chosen": -274.7181091308594, + "logps/rejected": -429.36895751953125, + "loss": 0.1858, + "rewards/chosen": 2.207428455352783, + "rewards/margins": 11.3438081741333, + "rewards/rejected": -9.13637924194336, + "step": 980 + }, + { + "epoch": 0.63, + "grad_norm": 25.77531623840332, + "kl": 0.0, + "learning_rate": 2.0554765291607396e-07, + "logps/chosen": -288.15863037109375, + "logps/rejected": -420.2445373535156, + "loss": 0.1668, + "rewards/chosen": 1.9953186511993408, + "rewards/margins": 11.334251403808594, + "rewards/rejected": -9.338932991027832, + "step": 985 + }, + { + "epoch": 0.63, + "grad_norm": 30.00310707092285, + "kl": 0.0, + "learning_rate": 2.0376955903271693e-07, + "logps/chosen": -281.99505615234375, + "logps/rejected": -426.06622314453125, + "loss": 0.1714, + "rewards/chosen": 2.062351942062378, + "rewards/margins": 11.414217948913574, + "rewards/rejected": -9.351865768432617, + "step": 990 + }, + { + "epoch": 0.64, + "grad_norm": 29.115968704223633, + "kl": 0.0, + "learning_rate": 2.0199146514935988e-07, + "logps/chosen": -270.6589660644531, + "logps/rejected": -453.30914306640625, + "loss": 0.1911, + "rewards/chosen": 1.767245888710022, + "rewards/margins": 11.179068565368652, + "rewards/rejected": -9.411824226379395, + "step": 995 + }, + { + "epoch": 0.64, + "grad_norm": 33.765933990478516, + "kl": 0.0, + "learning_rate": 2.0021337126600283e-07, + "logps/chosen": -275.52850341796875, + "logps/rejected": -411.69647216796875, + "loss": 0.18, + "rewards/chosen": 2.0011379718780518, + "rewards/margins": 10.669230461120605, + "rewards/rejected": -8.668092727661133, + "step": 1000 + }, + { + "epoch": 0.64, + "grad_norm": 40.146724700927734, + "kl": 0.0, + "learning_rate": 1.984352773826458e-07, + "logps/chosen": -286.11456298828125, + "logps/rejected": -421.98956298828125, + "loss": 0.1854, + "rewards/chosen": 1.923966407775879, + "rewards/margins": 11.38940715789795, + "rewards/rejected": -9.46544075012207, + "step": 1005 + }, + { + "epoch": 0.65, + "grad_norm": 30.437040328979492, + "kl": 0.0, + "learning_rate": 1.9665718349928875e-07, + "logps/chosen": -285.3699951171875, + "logps/rejected": -415.08221435546875, + "loss": 0.189, + "rewards/chosen": 1.893922209739685, + "rewards/margins": 10.257635116577148, + "rewards/rejected": -8.363713264465332, + "step": 1010 + }, + { + "epoch": 0.65, + "grad_norm": 44.26390075683594, + "kl": 0.0, + "learning_rate": 1.9487908961593172e-07, + "logps/chosen": -289.23773193359375, + "logps/rejected": -403.2334899902344, + "loss": 0.1903, + "rewards/chosen": 1.8229591846466064, + "rewards/margins": 10.558061599731445, + "rewards/rejected": -8.735103607177734, + "step": 1015 + }, + { + "epoch": 0.65, + "grad_norm": 30.460708618164062, + "kl": 0.0, + "learning_rate": 1.931009957325747e-07, + "logps/chosen": -262.11456298828125, + "logps/rejected": -431.6568298339844, + "loss": 0.1664, + "rewards/chosen": 2.2022650241851807, + "rewards/margins": 11.189310073852539, + "rewards/rejected": -8.987046241760254, + "step": 1020 + }, + { + "epoch": 0.66, + "grad_norm": 26.1412296295166, + "kl": 0.0, + "learning_rate": 1.9132290184921761e-07, + "logps/chosen": -290.6312561035156, + "logps/rejected": -436.25146484375, + "loss": 0.1815, + "rewards/chosen": 2.1080222129821777, + "rewards/margins": 10.946023941040039, + "rewards/rejected": -8.83800220489502, + "step": 1025 + }, + { + "epoch": 0.66, + "grad_norm": 30.030075073242188, + "kl": 0.0, + "learning_rate": 1.895448079658606e-07, + "logps/chosen": -269.8115234375, + "logps/rejected": -406.8975524902344, + "loss": 0.1609, + "rewards/chosen": 2.2383086681365967, + "rewards/margins": 11.194424629211426, + "rewards/rejected": -8.956116676330566, + "step": 1030 + }, + { + "epoch": 0.66, + "grad_norm": 27.133041381835938, + "kl": 0.0, + "learning_rate": 1.8776671408250356e-07, + "logps/chosen": -286.6905517578125, + "logps/rejected": -421.914794921875, + "loss": 0.1612, + "rewards/chosen": 1.9345842599868774, + "rewards/margins": 10.909126281738281, + "rewards/rejected": -8.974542617797852, + "step": 1035 + }, + { + "epoch": 0.67, + "grad_norm": 28.558135986328125, + "kl": 0.0, + "learning_rate": 1.859886201991465e-07, + "logps/chosen": -272.40423583984375, + "logps/rejected": -435.93157958984375, + "loss": 0.1764, + "rewards/chosen": 1.9151103496551514, + "rewards/margins": 11.380414009094238, + "rewards/rejected": -9.465303421020508, + "step": 1040 + }, + { + "epoch": 0.67, + "grad_norm": 25.598812103271484, + "kl": 0.0, + "learning_rate": 1.8421052631578946e-07, + "logps/chosen": -292.720947265625, + "logps/rejected": -427.9752502441406, + "loss": 0.1588, + "rewards/chosen": 2.3256161212921143, + "rewards/margins": 11.020068168640137, + "rewards/rejected": -8.694452285766602, + "step": 1045 + }, + { + "epoch": 0.67, + "grad_norm": 30.871252059936523, + "kl": 0.0, + "learning_rate": 1.8243243243243243e-07, + "logps/chosen": -253.33505249023438, + "logps/rejected": -409.4463806152344, + "loss": 0.1777, + "rewards/chosen": 1.9444713592529297, + "rewards/margins": 10.616201400756836, + "rewards/rejected": -8.67172908782959, + "step": 1050 + }, + { + "epoch": 0.67, + "grad_norm": 28.533897399902344, + "kl": 0.0, + "learning_rate": 1.8065433854907538e-07, + "logps/chosen": -318.8229064941406, + "logps/rejected": -392.5040588378906, + "loss": 0.1836, + "rewards/chosen": 1.9878658056259155, + "rewards/margins": 9.962384223937988, + "rewards/rejected": -7.974517822265625, + "step": 1055 + }, + { + "epoch": 0.68, + "grad_norm": 26.75721549987793, + "kl": 0.0, + "learning_rate": 1.7887624466571835e-07, + "logps/chosen": -257.9976501464844, + "logps/rejected": -451.36871337890625, + "loss": 0.1809, + "rewards/chosen": 1.9583194255828857, + "rewards/margins": 11.265231132507324, + "rewards/rejected": -9.30691146850586, + "step": 1060 + }, + { + "epoch": 0.68, + "grad_norm": 35.624107360839844, + "kl": 0.0, + "learning_rate": 1.770981507823613e-07, + "logps/chosen": -282.64056396484375, + "logps/rejected": -430.7513732910156, + "loss": 0.1525, + "rewards/chosen": 2.1451783180236816, + "rewards/margins": 11.390130996704102, + "rewards/rejected": -9.244951248168945, + "step": 1065 + }, + { + "epoch": 0.68, + "grad_norm": 31.359891891479492, + "kl": 0.0, + "learning_rate": 1.7532005689900424e-07, + "logps/chosen": -266.92822265625, + "logps/rejected": -422.86065673828125, + "loss": 0.1842, + "rewards/chosen": 2.03436279296875, + "rewards/margins": 11.101476669311523, + "rewards/rejected": -9.067112922668457, + "step": 1070 + }, + { + "epoch": 0.69, + "grad_norm": 30.415653228759766, + "kl": 0.0, + "learning_rate": 1.7354196301564722e-07, + "logps/chosen": -304.77734375, + "logps/rejected": -441.39935302734375, + "loss": 0.1701, + "rewards/chosen": 2.063904285430908, + "rewards/margins": 11.589478492736816, + "rewards/rejected": -9.52557373046875, + "step": 1075 + }, + { + "epoch": 0.69, + "grad_norm": 22.34760284423828, + "kl": 0.0, + "learning_rate": 1.717638691322902e-07, + "logps/chosen": -222.26657104492188, + "logps/rejected": -388.18524169921875, + "loss": 0.1699, + "rewards/chosen": 2.138190984725952, + "rewards/margins": 10.250738143920898, + "rewards/rejected": -8.112547874450684, + "step": 1080 + }, + { + "epoch": 0.69, + "grad_norm": 25.913867950439453, + "kl": 0.0, + "learning_rate": 1.6998577524893314e-07, + "logps/chosen": -296.4642028808594, + "logps/rejected": -446.37664794921875, + "loss": 0.1661, + "rewards/chosen": 2.1257483959198, + "rewards/margins": 11.645601272583008, + "rewards/rejected": -9.519853591918945, + "step": 1085 + }, + { + "epoch": 0.7, + "grad_norm": 26.88484764099121, + "kl": 0.0, + "learning_rate": 1.6820768136557609e-07, + "logps/chosen": -265.0510559082031, + "logps/rejected": -439.36785888671875, + "loss": 0.166, + "rewards/chosen": 2.2344844341278076, + "rewards/margins": 11.633499145507812, + "rewards/rejected": -9.399014472961426, + "step": 1090 + }, + { + "epoch": 0.7, + "grad_norm": 33.54571533203125, + "kl": 0.0, + "learning_rate": 1.6642958748221906e-07, + "logps/chosen": -232.16415405273438, + "logps/rejected": -419.260986328125, + "loss": 0.1727, + "rewards/chosen": 2.076587677001953, + "rewards/margins": 10.830089569091797, + "rewards/rejected": -8.753500938415527, + "step": 1095 + }, + { + "epoch": 0.7, + "grad_norm": 27.437702178955078, + "kl": 0.0, + "learning_rate": 1.64651493598862e-07, + "logps/chosen": -285.01983642578125, + "logps/rejected": -433.81915283203125, + "loss": 0.1967, + "rewards/chosen": 2.082289934158325, + "rewards/margins": 10.996795654296875, + "rewards/rejected": -8.914506912231445, + "step": 1100 + }, + { + "epoch": 0.71, + "grad_norm": 31.939613342285156, + "kl": 0.0, + "learning_rate": 1.6287339971550498e-07, + "logps/chosen": -251.11685180664062, + "logps/rejected": -428.45928955078125, + "loss": 0.1885, + "rewards/chosen": 1.553290605545044, + "rewards/margins": 10.479029655456543, + "rewards/rejected": -8.925740242004395, + "step": 1105 + }, + { + "epoch": 0.71, + "grad_norm": 29.216533660888672, + "kl": 0.0, + "learning_rate": 1.6109530583214793e-07, + "logps/chosen": -278.77825927734375, + "logps/rejected": -422.4730529785156, + "loss": 0.155, + "rewards/chosen": 2.2448031902313232, + "rewards/margins": 12.009950637817383, + "rewards/rejected": -9.76514720916748, + "step": 1110 + }, + { + "epoch": 0.71, + "grad_norm": 30.50436782836914, + "kl": 0.0, + "learning_rate": 1.5931721194879087e-07, + "logps/chosen": -273.7040710449219, + "logps/rejected": -427.70062255859375, + "loss": 0.1862, + "rewards/chosen": 1.7097088098526, + "rewards/margins": 10.923454284667969, + "rewards/rejected": -9.2137451171875, + "step": 1115 + }, + { + "epoch": 0.72, + "grad_norm": 25.549413681030273, + "kl": 0.0, + "learning_rate": 1.5753911806543385e-07, + "logps/chosen": -308.25079345703125, + "logps/rejected": -466.87518310546875, + "loss": 0.1519, + "rewards/chosen": 2.1832187175750732, + "rewards/margins": 12.121397018432617, + "rewards/rejected": -9.938179016113281, + "step": 1120 + }, + { + "epoch": 0.72, + "grad_norm": 27.82436180114746, + "kl": 0.0, + "learning_rate": 1.5576102418207682e-07, + "logps/chosen": -271.1653747558594, + "logps/rejected": -416.2850646972656, + "loss": 0.1642, + "rewards/chosen": 2.35219144821167, + "rewards/margins": 11.501470565795898, + "rewards/rejected": -9.14927864074707, + "step": 1125 + }, + { + "epoch": 0.72, + "grad_norm": 21.758255004882812, + "kl": 0.0, + "learning_rate": 1.5398293029871974e-07, + "logps/chosen": -303.6253356933594, + "logps/rejected": -469.1205139160156, + "loss": 0.1652, + "rewards/chosen": 2.1315276622772217, + "rewards/margins": 12.238561630249023, + "rewards/rejected": -10.107034683227539, + "step": 1130 + }, + { + "epoch": 0.73, + "grad_norm": 48.562782287597656, + "kl": 0.0, + "learning_rate": 1.5220483641536272e-07, + "logps/chosen": -263.2828063964844, + "logps/rejected": -456.22100830078125, + "loss": 0.1497, + "rewards/chosen": 2.2434208393096924, + "rewards/margins": 11.659939765930176, + "rewards/rejected": -9.416519165039062, + "step": 1135 + }, + { + "epoch": 0.73, + "grad_norm": 25.21342658996582, + "kl": 0.0, + "learning_rate": 1.504267425320057e-07, + "logps/chosen": -293.3092041015625, + "logps/rejected": -429.980712890625, + "loss": 0.1603, + "rewards/chosen": 2.3119020462036133, + "rewards/margins": 11.362417221069336, + "rewards/rejected": -9.050514221191406, + "step": 1140 + }, + { + "epoch": 0.73, + "grad_norm": 25.476823806762695, + "kl": 0.0, + "learning_rate": 1.4864864864864866e-07, + "logps/chosen": -278.0580749511719, + "logps/rejected": -430.935791015625, + "loss": 0.1517, + "rewards/chosen": 2.4627890586853027, + "rewards/margins": 11.769883155822754, + "rewards/rejected": -9.30709457397461, + "step": 1145 + }, + { + "epoch": 0.74, + "grad_norm": 26.627599716186523, + "kl": 0.0, + "learning_rate": 1.4687055476529158e-07, + "logps/chosen": -278.7668151855469, + "logps/rejected": -440.79840087890625, + "loss": 0.1525, + "rewards/chosen": 2.298746109008789, + "rewards/margins": 11.890974998474121, + "rewards/rejected": -9.592229843139648, + "step": 1150 + }, + { + "epoch": 0.74, + "grad_norm": 33.060585021972656, + "kl": 0.0, + "learning_rate": 1.4509246088193456e-07, + "logps/chosen": -274.751220703125, + "logps/rejected": -425.123779296875, + "loss": 0.1704, + "rewards/chosen": 2.241877317428589, + "rewards/margins": 11.402631759643555, + "rewards/rejected": -9.160754203796387, + "step": 1155 + }, + { + "epoch": 0.74, + "grad_norm": 29.089338302612305, + "kl": 0.0, + "learning_rate": 1.4331436699857753e-07, + "logps/chosen": -238.36630249023438, + "logps/rejected": -464.2720642089844, + "loss": 0.1511, + "rewards/chosen": 2.008114814758301, + "rewards/margins": 11.63502311706543, + "rewards/rejected": -9.626908302307129, + "step": 1160 + }, + { + "epoch": 0.75, + "grad_norm": 27.289066314697266, + "kl": 0.0, + "learning_rate": 1.4153627311522048e-07, + "logps/chosen": -268.7559509277344, + "logps/rejected": -420.65570068359375, + "loss": 0.1646, + "rewards/chosen": 2.3184897899627686, + "rewards/margins": 11.422869682312012, + "rewards/rejected": -9.104379653930664, + "step": 1165 + }, + { + "epoch": 0.75, + "grad_norm": 28.602949142456055, + "kl": 0.0, + "learning_rate": 1.3975817923186345e-07, + "logps/chosen": -302.52545166015625, + "logps/rejected": -445.30657958984375, + "loss": 0.1655, + "rewards/chosen": 2.089745044708252, + "rewards/margins": 11.941364288330078, + "rewards/rejected": -9.851619720458984, + "step": 1170 + }, + { + "epoch": 0.75, + "grad_norm": 23.429819107055664, + "kl": 0.0, + "learning_rate": 1.379800853485064e-07, + "logps/chosen": -231.4444122314453, + "logps/rejected": -430.3841857910156, + "loss": 0.1656, + "rewards/chosen": 2.1710591316223145, + "rewards/margins": 11.049272537231445, + "rewards/rejected": -8.878213882446289, + "step": 1175 + }, + { + "epoch": 0.75, + "grad_norm": 22.412796020507812, + "kl": 0.0, + "learning_rate": 1.3620199146514935e-07, + "logps/chosen": -268.0068664550781, + "logps/rejected": -437.62646484375, + "loss": 0.1599, + "rewards/chosen": 1.9776779413223267, + "rewards/margins": 11.274179458618164, + "rewards/rejected": -9.296501159667969, + "step": 1180 + }, + { + "epoch": 0.76, + "grad_norm": 25.37950325012207, + "kl": 0.0, + "learning_rate": 1.3442389758179232e-07, + "logps/chosen": -255.9052276611328, + "logps/rejected": -444.33013916015625, + "loss": 0.187, + "rewards/chosen": 2.0841331481933594, + "rewards/margins": 11.117557525634766, + "rewards/rejected": -9.033424377441406, + "step": 1185 + }, + { + "epoch": 0.76, + "grad_norm": 30.78813934326172, + "kl": 0.0, + "learning_rate": 1.326458036984353e-07, + "logps/chosen": -286.6682434082031, + "logps/rejected": -441.3355407714844, + "loss": 0.1465, + "rewards/chosen": 2.3942222595214844, + "rewards/margins": 11.716878890991211, + "rewards/rejected": -9.322656631469727, + "step": 1190 + }, + { + "epoch": 0.76, + "grad_norm": 28.977252960205078, + "kl": 0.0, + "learning_rate": 1.3086770981507821e-07, + "logps/chosen": -245.698486328125, + "logps/rejected": -405.50445556640625, + "loss": 0.1805, + "rewards/chosen": 1.9074538946151733, + "rewards/margins": 10.517099380493164, + "rewards/rejected": -8.60964584350586, + "step": 1195 + }, + { + "epoch": 0.77, + "grad_norm": 33.64772033691406, + "kl": 0.0, + "learning_rate": 1.290896159317212e-07, + "logps/chosen": -265.48529052734375, + "logps/rejected": -433.76904296875, + "loss": 0.1613, + "rewards/chosen": 1.7114953994750977, + "rewards/margins": 11.555914878845215, + "rewards/rejected": -9.844419479370117, + "step": 1200 + }, + { + "epoch": 0.77, + "grad_norm": 31.616493225097656, + "kl": 0.0, + "learning_rate": 1.2731152204836416e-07, + "logps/chosen": -248.8570556640625, + "logps/rejected": -434.2515563964844, + "loss": 0.1708, + "rewards/chosen": 2.1189894676208496, + "rewards/margins": 11.114141464233398, + "rewards/rejected": -8.995152473449707, + "step": 1205 + }, + { + "epoch": 0.77, + "grad_norm": 24.81133460998535, + "kl": 0.0, + "learning_rate": 1.255334281650071e-07, + "logps/chosen": -263.2900390625, + "logps/rejected": -444.3939514160156, + "loss": 0.1324, + "rewards/chosen": 2.4141650199890137, + "rewards/margins": 11.944352149963379, + "rewards/rejected": -9.530186653137207, + "step": 1210 + }, + { + "epoch": 0.78, + "grad_norm": 25.361906051635742, + "kl": 0.0, + "learning_rate": 1.2375533428165005e-07, + "logps/chosen": -268.45123291015625, + "logps/rejected": -427.3312072753906, + "loss": 0.1721, + "rewards/chosen": 2.090129852294922, + "rewards/margins": 10.85669994354248, + "rewards/rejected": -8.766571044921875, + "step": 1215 + }, + { + "epoch": 0.78, + "grad_norm": 22.66915512084961, + "kl": 0.0, + "learning_rate": 1.2197724039829303e-07, + "logps/chosen": -267.35003662109375, + "logps/rejected": -463.38250732421875, + "loss": 0.1787, + "rewards/chosen": 2.1323537826538086, + "rewards/margins": 12.177955627441406, + "rewards/rejected": -10.045602798461914, + "step": 1220 + }, + { + "epoch": 0.78, + "grad_norm": 24.390474319458008, + "kl": 0.0, + "learning_rate": 1.2019914651493598e-07, + "logps/chosen": -273.03778076171875, + "logps/rejected": -411.14178466796875, + "loss": 0.158, + "rewards/chosen": 1.9639747142791748, + "rewards/margins": 10.484541893005371, + "rewards/rejected": -8.520566940307617, + "step": 1225 + }, + { + "epoch": 0.79, + "grad_norm": 24.46862030029297, + "kl": 0.0, + "learning_rate": 1.1842105263157894e-07, + "logps/chosen": -288.54296875, + "logps/rejected": -423.35760498046875, + "loss": 0.1574, + "rewards/chosen": 2.0397889614105225, + "rewards/margins": 11.161340713500977, + "rewards/rejected": -9.121551513671875, + "step": 1230 + }, + { + "epoch": 0.79, + "grad_norm": 27.27725601196289, + "kl": 0.0, + "learning_rate": 1.166429587482219e-07, + "logps/chosen": -244.09326171875, + "logps/rejected": -456.2569885253906, + "loss": 0.1511, + "rewards/chosen": 2.042201519012451, + "rewards/margins": 11.101190567016602, + "rewards/rejected": -9.058988571166992, + "step": 1235 + }, + { + "epoch": 0.79, + "grad_norm": 27.39021873474121, + "kl": 0.0, + "learning_rate": 1.1486486486486487e-07, + "logps/chosen": -276.58819580078125, + "logps/rejected": -425.1268005371094, + "loss": 0.1648, + "rewards/chosen": 2.0721068382263184, + "rewards/margins": 11.045255661010742, + "rewards/rejected": -8.973149299621582, + "step": 1240 + }, + { + "epoch": 0.8, + "grad_norm": 24.352102279663086, + "kl": 0.0, + "learning_rate": 1.1308677098150782e-07, + "logps/chosen": -282.8309020996094, + "logps/rejected": -407.66717529296875, + "loss": 0.1542, + "rewards/chosen": 2.180142641067505, + "rewards/margins": 10.879863739013672, + "rewards/rejected": -8.69972038269043, + "step": 1245 + }, + { + "epoch": 0.8, + "grad_norm": 31.191946029663086, + "kl": 0.0, + "learning_rate": 1.1130867709815078e-07, + "logps/chosen": -303.6287536621094, + "logps/rejected": -430.66815185546875, + "loss": 0.154, + "rewards/chosen": 2.3230533599853516, + "rewards/margins": 11.597023010253906, + "rewards/rejected": -9.273969650268555, + "step": 1250 + }, + { + "epoch": 0.8, + "grad_norm": 31.79303550720215, + "kl": 0.0, + "learning_rate": 1.0953058321479374e-07, + "logps/chosen": -321.51959228515625, + "logps/rejected": -430.92974853515625, + "loss": 0.1755, + "rewards/chosen": 2.0515542030334473, + "rewards/margins": 11.131525039672852, + "rewards/rejected": -9.079970359802246, + "step": 1255 + }, + { + "epoch": 0.81, + "grad_norm": 28.54650115966797, + "kl": 0.0, + "learning_rate": 1.077524893314367e-07, + "logps/chosen": -273.19195556640625, + "logps/rejected": -442.4486389160156, + "loss": 0.174, + "rewards/chosen": 2.0409607887268066, + "rewards/margins": 11.32982349395752, + "rewards/rejected": -9.288861274719238, + "step": 1260 + }, + { + "epoch": 0.81, + "grad_norm": 31.283008575439453, + "kl": 0.0, + "learning_rate": 1.0597439544807964e-07, + "logps/chosen": -283.18890380859375, + "logps/rejected": -416.5110778808594, + "loss": 0.167, + "rewards/chosen": 2.393275737762451, + "rewards/margins": 11.679018020629883, + "rewards/rejected": -9.285740852355957, + "step": 1265 + }, + { + "epoch": 0.81, + "grad_norm": 23.58814811706543, + "kl": 0.0, + "learning_rate": 1.0419630156472262e-07, + "logps/chosen": -283.10565185546875, + "logps/rejected": -402.07684326171875, + "loss": 0.1573, + "rewards/chosen": 2.145526885986328, + "rewards/margins": 11.390679359436035, + "rewards/rejected": -9.24515151977539, + "step": 1270 + }, + { + "epoch": 0.82, + "grad_norm": 22.255775451660156, + "kl": 0.0, + "learning_rate": 1.0241820768136557e-07, + "logps/chosen": -293.5381164550781, + "logps/rejected": -409.7227478027344, + "loss": 0.1521, + "rewards/chosen": 2.332836627960205, + "rewards/margins": 11.542078971862793, + "rewards/rejected": -9.209242820739746, + "step": 1275 + }, + { + "epoch": 0.82, + "grad_norm": 24.767562866210938, + "kl": 0.0, + "learning_rate": 1.0064011379800854e-07, + "logps/chosen": -245.151611328125, + "logps/rejected": -469.22509765625, + "loss": 0.1666, + "rewards/chosen": 1.9871269464492798, + "rewards/margins": 11.738969802856445, + "rewards/rejected": -9.751842498779297, + "step": 1280 + }, + { + "epoch": 0.82, + "grad_norm": 27.796966552734375, + "kl": 0.0, + "learning_rate": 9.886201991465149e-08, + "logps/chosen": -250.67642211914062, + "logps/rejected": -387.165771484375, + "loss": 0.167, + "rewards/chosen": 2.231963634490967, + "rewards/margins": 10.673835754394531, + "rewards/rejected": -8.441873550415039, + "step": 1285 + }, + { + "epoch": 0.83, + "grad_norm": 21.186832427978516, + "kl": 0.0, + "learning_rate": 9.708392603129445e-08, + "logps/chosen": -239.9123992919922, + "logps/rejected": -444.4414978027344, + "loss": 0.1593, + "rewards/chosen": 2.1016697883605957, + "rewards/margins": 11.904180526733398, + "rewards/rejected": -9.802510261535645, + "step": 1290 + }, + { + "epoch": 0.83, + "grad_norm": 38.97263717651367, + "kl": 0.0, + "learning_rate": 9.530583214793741e-08, + "logps/chosen": -265.32354736328125, + "logps/rejected": -429.73126220703125, + "loss": 0.1528, + "rewards/chosen": 2.0850026607513428, + "rewards/margins": 10.97376823425293, + "rewards/rejected": -8.888765335083008, + "step": 1295 + }, + { + "epoch": 0.83, + "grad_norm": 25.232553482055664, + "kl": 0.0, + "learning_rate": 9.352773826458037e-08, + "logps/chosen": -271.5675048828125, + "logps/rejected": -407.0108337402344, + "loss": 0.1654, + "rewards/chosen": 2.2973520755767822, + "rewards/margins": 10.86426067352295, + "rewards/rejected": -8.56690788269043, + "step": 1300 + }, + { + "epoch": 0.83, + "grad_norm": 25.400094985961914, + "kl": 0.0, + "learning_rate": 9.174964438122331e-08, + "logps/chosen": -302.88580322265625, + "logps/rejected": -436.335205078125, + "loss": 0.1541, + "rewards/chosen": 2.13349986076355, + "rewards/margins": 11.886751174926758, + "rewards/rejected": -9.753250122070312, + "step": 1305 + }, + { + "epoch": 0.84, + "grad_norm": 29.706186294555664, + "kl": 0.0, + "learning_rate": 8.997155049786629e-08, + "logps/chosen": -242.15316772460938, + "logps/rejected": -383.33984375, + "loss": 0.1632, + "rewards/chosen": 2.095715045928955, + "rewards/margins": 10.616997718811035, + "rewards/rejected": -8.521283149719238, + "step": 1310 + }, + { + "epoch": 0.84, + "grad_norm": 27.454551696777344, + "kl": 0.0, + "learning_rate": 8.819345661450925e-08, + "logps/chosen": -224.43917846679688, + "logps/rejected": -439.21160888671875, + "loss": 0.1756, + "rewards/chosen": 1.6575233936309814, + "rewards/margins": 11.428009033203125, + "rewards/rejected": -9.770486831665039, + "step": 1315 + }, + { + "epoch": 0.84, + "grad_norm": 29.03775978088379, + "kl": 0.0, + "learning_rate": 8.64153627311522e-08, + "logps/chosen": -260.82977294921875, + "logps/rejected": -423.10186767578125, + "loss": 0.1674, + "rewards/chosen": 2.265705108642578, + "rewards/margins": 11.70507526397705, + "rewards/rejected": -9.439369201660156, + "step": 1320 + }, + { + "epoch": 0.85, + "grad_norm": 24.21120262145996, + "kl": 0.0, + "learning_rate": 8.463726884779517e-08, + "logps/chosen": -330.37994384765625, + "logps/rejected": -462.821533203125, + "loss": 0.1688, + "rewards/chosen": 2.0837504863739014, + "rewards/margins": 10.997267723083496, + "rewards/rejected": -8.9135160446167, + "step": 1325 + }, + { + "epoch": 0.85, + "grad_norm": 17.94179344177246, + "kl": 0.0, + "learning_rate": 8.285917496443812e-08, + "logps/chosen": -273.67681884765625, + "logps/rejected": -466.666259765625, + "loss": 0.1512, + "rewards/chosen": 2.4386792182922363, + "rewards/margins": 12.027200698852539, + "rewards/rejected": -9.588521957397461, + "step": 1330 + }, + { + "epoch": 0.85, + "grad_norm": 23.39067268371582, + "kl": 0.0, + "learning_rate": 8.108108108108108e-08, + "logps/chosen": -276.4830627441406, + "logps/rejected": -441.99053955078125, + "loss": 0.1583, + "rewards/chosen": 2.1169440746307373, + "rewards/margins": 11.944867134094238, + "rewards/rejected": -9.827921867370605, + "step": 1335 + }, + { + "epoch": 0.86, + "grad_norm": 24.148284912109375, + "kl": 0.0, + "learning_rate": 7.930298719772404e-08, + "logps/chosen": -290.4081726074219, + "logps/rejected": -424.72296142578125, + "loss": 0.1385, + "rewards/chosen": 2.2973082065582275, + "rewards/margins": 11.383561134338379, + "rewards/rejected": -9.08625316619873, + "step": 1340 + }, + { + "epoch": 0.86, + "grad_norm": 32.26573944091797, + "kl": 0.0, + "learning_rate": 7.7524893314367e-08, + "logps/chosen": -277.98760986328125, + "logps/rejected": -421.136962890625, + "loss": 0.1744, + "rewards/chosen": 2.007927894592285, + "rewards/margins": 11.15161418914795, + "rewards/rejected": -9.143686294555664, + "step": 1345 + }, + { + "epoch": 0.86, + "grad_norm": 24.157018661499023, + "kl": 0.0, + "learning_rate": 7.574679943100994e-08, + "logps/chosen": -270.72259521484375, + "logps/rejected": -414.43511962890625, + "loss": 0.185, + "rewards/chosen": 1.9621318578720093, + "rewards/margins": 11.223711013793945, + "rewards/rejected": -9.261579513549805, + "step": 1350 + }, + { + "epoch": 0.87, + "grad_norm": 30.29186248779297, + "kl": 0.0, + "learning_rate": 7.396870554765292e-08, + "logps/chosen": -312.62603759765625, + "logps/rejected": -437.23052978515625, + "loss": 0.1554, + "rewards/chosen": 2.294711112976074, + "rewards/margins": 11.318277359008789, + "rewards/rejected": -9.023566246032715, + "step": 1355 + }, + { + "epoch": 0.87, + "grad_norm": 21.90921401977539, + "kl": 0.0, + "learning_rate": 7.219061166429587e-08, + "logps/chosen": -245.4251708984375, + "logps/rejected": -436.4671325683594, + "loss": 0.1443, + "rewards/chosen": 2.178391218185425, + "rewards/margins": 11.533452987670898, + "rewards/rejected": -9.355062484741211, + "step": 1360 + }, + { + "epoch": 0.87, + "grad_norm": 30.29448699951172, + "kl": 0.0, + "learning_rate": 7.041251778093883e-08, + "logps/chosen": -262.2453918457031, + "logps/rejected": -430.98565673828125, + "loss": 0.1726, + "rewards/chosen": 2.155410051345825, + "rewards/margins": 11.638139724731445, + "rewards/rejected": -9.4827299118042, + "step": 1365 + }, + { + "epoch": 0.88, + "grad_norm": 26.87623405456543, + "kl": 0.0, + "learning_rate": 6.863442389758179e-08, + "logps/chosen": -282.3881530761719, + "logps/rejected": -422.7132873535156, + "loss": 0.1628, + "rewards/chosen": 2.140310525894165, + "rewards/margins": 11.545256614685059, + "rewards/rejected": -9.404945373535156, + "step": 1370 + }, + { + "epoch": 0.88, + "grad_norm": 31.97431182861328, + "kl": 0.0, + "learning_rate": 6.685633001422475e-08, + "logps/chosen": -300.5008239746094, + "logps/rejected": -429.50579833984375, + "loss": 0.1565, + "rewards/chosen": 2.009218215942383, + "rewards/margins": 11.088391304016113, + "rewards/rejected": -9.079172134399414, + "step": 1375 + }, + { + "epoch": 0.88, + "grad_norm": 27.48915672302246, + "kl": 0.0, + "learning_rate": 6.507823613086771e-08, + "logps/chosen": -288.3194580078125, + "logps/rejected": -424.296142578125, + "loss": 0.1391, + "rewards/chosen": 2.393162250518799, + "rewards/margins": 11.204710006713867, + "rewards/rejected": -8.811546325683594, + "step": 1380 + }, + { + "epoch": 0.89, + "grad_norm": 25.372827529907227, + "kl": 0.0, + "learning_rate": 6.330014224751067e-08, + "logps/chosen": -248.9453887939453, + "logps/rejected": -443.82000732421875, + "loss": 0.1596, + "rewards/chosen": 2.2566885948181152, + "rewards/margins": 11.549893379211426, + "rewards/rejected": -9.293205261230469, + "step": 1385 + }, + { + "epoch": 0.89, + "grad_norm": 28.675615310668945, + "kl": 0.0, + "learning_rate": 6.152204836415363e-08, + "logps/chosen": -284.46307373046875, + "logps/rejected": -435.80859375, + "loss": 0.1463, + "rewards/chosen": 1.9566986560821533, + "rewards/margins": 11.848084449768066, + "rewards/rejected": -9.891386032104492, + "step": 1390 + }, + { + "epoch": 0.89, + "grad_norm": 37.92374801635742, + "kl": 0.0, + "learning_rate": 5.974395448079659e-08, + "logps/chosen": -246.85205078125, + "logps/rejected": -449.06463623046875, + "loss": 0.1525, + "rewards/chosen": 2.1182522773742676, + "rewards/margins": 12.141133308410645, + "rewards/rejected": -10.022881507873535, + "step": 1395 + }, + { + "epoch": 0.9, + "grad_norm": 24.609821319580078, + "kl": 0.0, + "learning_rate": 5.796586059743954e-08, + "logps/chosen": -237.67111206054688, + "logps/rejected": -457.66632080078125, + "loss": 0.1518, + "rewards/chosen": 2.348842144012451, + "rewards/margins": 12.221162796020508, + "rewards/rejected": -9.872321128845215, + "step": 1400 + }, + { + "epoch": 0.9, + "grad_norm": 30.03931427001953, + "kl": 0.0, + "learning_rate": 5.61877667140825e-08, + "logps/chosen": -295.3681640625, + "logps/rejected": -456.4273376464844, + "loss": 0.1597, + "rewards/chosen": 2.270292282104492, + "rewards/margins": 12.329472541809082, + "rewards/rejected": -10.059179306030273, + "step": 1405 + }, + { + "epoch": 0.9, + "grad_norm": 30.871780395507812, + "kl": 0.0, + "learning_rate": 5.4409672830725456e-08, + "logps/chosen": -239.51406860351562, + "logps/rejected": -392.5760803222656, + "loss": 0.1682, + "rewards/chosen": 2.3251259326934814, + "rewards/margins": 11.237902641296387, + "rewards/rejected": -8.9127779006958, + "step": 1410 + }, + { + "epoch": 0.91, + "grad_norm": 24.213848114013672, + "kl": 0.0, + "learning_rate": 5.2631578947368416e-08, + "logps/chosen": -282.6657409667969, + "logps/rejected": -449.935302734375, + "loss": 0.1517, + "rewards/chosen": 2.1117091178894043, + "rewards/margins": 12.3379545211792, + "rewards/rejected": -10.226245880126953, + "step": 1415 + }, + { + "epoch": 0.91, + "grad_norm": 26.362215042114258, + "kl": 0.0, + "learning_rate": 5.0853485064011376e-08, + "logps/chosen": -274.45501708984375, + "logps/rejected": -454.06048583984375, + "loss": 0.1572, + "rewards/chosen": 2.2160325050354004, + "rewards/margins": 12.087754249572754, + "rewards/rejected": -9.871722221374512, + "step": 1420 + }, + { + "epoch": 0.91, + "grad_norm": 17.865629196166992, + "kl": 0.0, + "learning_rate": 4.9075391180654337e-08, + "logps/chosen": -300.43731689453125, + "logps/rejected": -417.7992248535156, + "loss": 0.1209, + "rewards/chosen": 2.6023740768432617, + "rewards/margins": 11.860528945922852, + "rewards/rejected": -9.258153915405273, + "step": 1425 + }, + { + "epoch": 0.91, + "grad_norm": 31.106910705566406, + "kl": 0.0, + "learning_rate": 4.72972972972973e-08, + "logps/chosen": -291.830322265625, + "logps/rejected": -405.03314208984375, + "loss": 0.1713, + "rewards/chosen": 2.3040544986724854, + "rewards/margins": 10.725131034851074, + "rewards/rejected": -8.421076774597168, + "step": 1430 + }, + { + "epoch": 0.92, + "grad_norm": 41.000526428222656, + "kl": 0.0, + "learning_rate": 4.551920341394026e-08, + "logps/chosen": -256.0244445800781, + "logps/rejected": -424.0470275878906, + "loss": 0.1511, + "rewards/chosen": 2.2252817153930664, + "rewards/margins": 11.0009183883667, + "rewards/rejected": -8.775636672973633, + "step": 1435 + }, + { + "epoch": 0.92, + "grad_norm": 25.92328453063965, + "kl": 0.0, + "learning_rate": 4.374110953058322e-08, + "logps/chosen": -275.6056213378906, + "logps/rejected": -412.8865661621094, + "loss": 0.1789, + "rewards/chosen": 2.167658567428589, + "rewards/margins": 11.966622352600098, + "rewards/rejected": -9.798962593078613, + "step": 1440 + }, + { + "epoch": 0.92, + "grad_norm": 29.096887588500977, + "kl": 0.0, + "learning_rate": 4.196301564722617e-08, + "logps/chosen": -269.5967102050781, + "logps/rejected": -467.5411682128906, + "loss": 0.1806, + "rewards/chosen": 2.218864679336548, + "rewards/margins": 12.105215072631836, + "rewards/rejected": -9.886350631713867, + "step": 1445 + }, + { + "epoch": 0.93, + "grad_norm": 26.817333221435547, + "kl": 0.0, + "learning_rate": 4.018492176386913e-08, + "logps/chosen": -283.4931945800781, + "logps/rejected": -479.77227783203125, + "loss": 0.1282, + "rewards/chosen": 2.702843427658081, + "rewards/margins": 12.9722261428833, + "rewards/rejected": -10.269383430480957, + "step": 1450 + }, + { + "epoch": 0.93, + "grad_norm": 30.314491271972656, + "kl": 0.0, + "learning_rate": 3.840682788051209e-08, + "logps/chosen": -268.67218017578125, + "logps/rejected": -430.715576171875, + "loss": 0.1719, + "rewards/chosen": 2.1400671005249023, + "rewards/margins": 11.173896789550781, + "rewards/rejected": -9.033829689025879, + "step": 1455 + }, + { + "epoch": 0.93, + "grad_norm": 23.925148010253906, + "kl": 0.0, + "learning_rate": 3.6628733997155046e-08, + "logps/chosen": -244.7641143798828, + "logps/rejected": -420.8289489746094, + "loss": 0.1616, + "rewards/chosen": 1.9465503692626953, + "rewards/margins": 11.341249465942383, + "rewards/rejected": -9.394698143005371, + "step": 1460 + }, + { + "epoch": 0.94, + "grad_norm": 26.21236228942871, + "kl": 0.0, + "learning_rate": 3.4850640113798006e-08, + "logps/chosen": -253.13265991210938, + "logps/rejected": -395.868408203125, + "loss": 0.1444, + "rewards/chosen": 2.5426197052001953, + "rewards/margins": 12.135528564453125, + "rewards/rejected": -9.59290885925293, + "step": 1465 + }, + { + "epoch": 0.94, + "grad_norm": 23.762733459472656, + "kl": 0.0, + "learning_rate": 3.3072546230440967e-08, + "logps/chosen": -289.7633361816406, + "logps/rejected": -433.18212890625, + "loss": 0.1417, + "rewards/chosen": 2.608954906463623, + "rewards/margins": 12.459098815917969, + "rewards/rejected": -9.850143432617188, + "step": 1470 + }, + { + "epoch": 0.94, + "grad_norm": 17.17668342590332, + "kl": 0.0, + "learning_rate": 3.129445234708392e-08, + "logps/chosen": -244.68838500976562, + "logps/rejected": -435.24737548828125, + "loss": 0.148, + "rewards/chosen": 2.157148838043213, + "rewards/margins": 11.793753623962402, + "rewards/rejected": -9.636604309082031, + "step": 1475 + }, + { + "epoch": 0.95, + "grad_norm": 29.167118072509766, + "kl": 0.0, + "learning_rate": 2.9516358463726884e-08, + "logps/chosen": -256.3995361328125, + "logps/rejected": -412.01495361328125, + "loss": 0.168, + "rewards/chosen": 2.236207962036133, + "rewards/margins": 12.077910423278809, + "rewards/rejected": -9.841702461242676, + "step": 1480 + }, + { + "epoch": 0.95, + "grad_norm": 23.674877166748047, + "kl": 0.0, + "learning_rate": 2.7738264580369844e-08, + "logps/chosen": -243.34078979492188, + "logps/rejected": -438.3180236816406, + "loss": 0.1651, + "rewards/chosen": 2.185333728790283, + "rewards/margins": 12.011825561523438, + "rewards/rejected": -9.826491355895996, + "step": 1485 + }, + { + "epoch": 0.95, + "grad_norm": 29.555051803588867, + "kl": 0.0, + "learning_rate": 2.59601706970128e-08, + "logps/chosen": -302.4588928222656, + "logps/rejected": -430.30682373046875, + "loss": 0.152, + "rewards/chosen": 2.319841146469116, + "rewards/margins": 11.491790771484375, + "rewards/rejected": -9.17194938659668, + "step": 1490 + }, + { + "epoch": 0.96, + "grad_norm": 24.97121238708496, + "kl": 0.0, + "learning_rate": 2.418207681365576e-08, + "logps/chosen": -299.60247802734375, + "logps/rejected": -458.22381591796875, + "loss": 0.1577, + "rewards/chosen": 2.0418765544891357, + "rewards/margins": 11.608416557312012, + "rewards/rejected": -9.566540718078613, + "step": 1495 + }, + { + "epoch": 0.96, + "grad_norm": 18.722761154174805, + "kl": 0.0, + "learning_rate": 2.240398293029872e-08, + "logps/chosen": -263.1552429199219, + "logps/rejected": -391.51300048828125, + "loss": 0.1524, + "rewards/chosen": 2.344292402267456, + "rewards/margins": 11.393754959106445, + "rewards/rejected": -9.049463272094727, + "step": 1500 + }, + { + "epoch": 0.96, + "grad_norm": 23.644838333129883, + "kl": 0.0, + "learning_rate": 2.0625889046941676e-08, + "logps/chosen": -291.31048583984375, + "logps/rejected": -445.3470153808594, + "loss": 0.1441, + "rewards/chosen": 2.071507453918457, + "rewards/margins": 12.295246124267578, + "rewards/rejected": -10.223738670349121, + "step": 1505 + }, + { + "epoch": 0.97, + "grad_norm": 25.264785766601562, + "kl": 0.0, + "learning_rate": 1.8847795163584636e-08, + "logps/chosen": -248.8640594482422, + "logps/rejected": -401.29925537109375, + "loss": 0.1538, + "rewards/chosen": 2.236462116241455, + "rewards/margins": 11.150157928466797, + "rewards/rejected": -8.9136962890625, + "step": 1510 + }, + { + "epoch": 0.97, + "grad_norm": 27.405046463012695, + "kl": 0.0, + "learning_rate": 1.7069701280227596e-08, + "logps/chosen": -270.66522216796875, + "logps/rejected": -422.05914306640625, + "loss": 0.1622, + "rewards/chosen": 2.3051300048828125, + "rewards/margins": 12.115873336791992, + "rewards/rejected": -9.810742378234863, + "step": 1515 + }, + { + "epoch": 0.97, + "grad_norm": 28.10026741027832, + "kl": 0.0, + "learning_rate": 1.5291607396870554e-08, + "logps/chosen": -291.5521545410156, + "logps/rejected": -415.6878967285156, + "loss": 0.163, + "rewards/chosen": 2.0566792488098145, + "rewards/margins": 11.73741340637207, + "rewards/rejected": -9.680734634399414, + "step": 1520 + }, + { + "epoch": 0.98, + "grad_norm": 30.15706443786621, + "kl": 0.0, + "learning_rate": 1.3513513513513514e-08, + "logps/chosen": -303.3666076660156, + "logps/rejected": -436.81689453125, + "loss": 0.1677, + "rewards/chosen": 2.270231008529663, + "rewards/margins": 11.719429016113281, + "rewards/rejected": -9.449198722839355, + "step": 1525 + }, + { + "epoch": 0.98, + "grad_norm": 23.98443603515625, + "kl": 0.0, + "learning_rate": 1.1735419630156473e-08, + "logps/chosen": -290.3255615234375, + "logps/rejected": -448.453125, + "loss": 0.1374, + "rewards/chosen": 2.524505376815796, + "rewards/margins": 12.247071266174316, + "rewards/rejected": -9.722566604614258, + "step": 1530 + }, + { + "epoch": 0.98, + "grad_norm": 27.463891983032227, + "kl": 0.0, + "learning_rate": 9.95732574679943e-09, + "logps/chosen": -264.30474853515625, + "logps/rejected": -450.0399475097656, + "loss": 0.1647, + "rewards/chosen": 2.185473918914795, + "rewards/margins": 11.05543327331543, + "rewards/rejected": -8.869958877563477, + "step": 1535 + }, + { + "epoch": 0.99, + "grad_norm": 22.17925453186035, + "kl": 0.0, + "learning_rate": 8.179231863442388e-09, + "logps/chosen": -274.0540466308594, + "logps/rejected": -421.3465270996094, + "loss": 0.1565, + "rewards/chosen": 2.378261089324951, + "rewards/margins": 11.525609970092773, + "rewards/rejected": -9.14734935760498, + "step": 1540 + }, + { + "epoch": 0.99, + "grad_norm": 24.82571029663086, + "kl": 0.0, + "learning_rate": 6.401137980085348e-09, + "logps/chosen": -210.9959259033203, + "logps/rejected": -412.92303466796875, + "loss": 0.1477, + "rewards/chosen": 2.322688102722168, + "rewards/margins": 11.523736000061035, + "rewards/rejected": -9.201047897338867, + "step": 1545 + }, + { + "epoch": 0.99, + "grad_norm": 23.87914276123047, + "kl": 0.0, + "learning_rate": 4.623044096728307e-09, + "logps/chosen": -279.3990173339844, + "logps/rejected": -447.538818359375, + "loss": 0.1495, + "rewards/chosen": 2.5236871242523193, + "rewards/margins": 12.28012466430664, + "rewards/rejected": -9.756436347961426, + "step": 1550 + }, + { + "epoch": 0.99, + "grad_norm": 22.962329864501953, + "kl": 0.0, + "learning_rate": 2.844950213371266e-09, + "logps/chosen": -291.07733154296875, + "logps/rejected": -454.6717224121094, + "loss": 0.1473, + "rewards/chosen": 2.4102189540863037, + "rewards/margins": 12.55319881439209, + "rewards/rejected": -10.142979621887207, + "step": 1555 + }, + { + "epoch": 1.0, + "grad_norm": 24.947708129882812, + "kl": 0.0, + "learning_rate": 1.0668563300142248e-09, + "logps/chosen": -283.6773376464844, + "logps/rejected": -398.395751953125, + "loss": 0.1471, + "rewards/chosen": 2.680229663848877, + "rewards/margins": 11.58387565612793, + "rewards/rejected": -8.903645515441895, + "step": 1560 + }, + { + "epoch": 1.0, + "step": 1563, + "total_flos": 0.0, + "train_loss": 0.2172593233757727, + "train_runtime": 11063.2306, + "train_samples_per_second": 9.039, + "train_steps_per_second": 0.141 + } + ], + "logging_steps": 5, + "max_steps": 1563, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}