diff --git "a/checkpoint-19402/trainer_state.json" "b/checkpoint-19402/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-19402/trainer_state.json" @@ -0,0 +1,29196 @@ +{ + "best_metric": 0.2106233835220337, + "best_model_checkpoint": "models/qwen2.5-3b-dpo-finegrained-40-vanilla/checkpoint-15000", + "epoch": 0.9999742301249839, + "eval_steps": 5000, + "global_step": 19402, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.1539750032212345e-05, + "grad_norm": 1.8958798019342287, + "learning_rate": 2.575991756826378e-10, + "logits/chosen": -2.28125, + "logits/rejected": -1.78125, + "logps/chosen": -155.0, + "logps/rejected": -182.0, + "loss": 0.6914, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0005153975003221235, + "grad_norm": 1.7550083456255021, + "learning_rate": 2.575991756826378e-09, + "logits/chosen": -2.484375, + "logits/rejected": -2.15625, + "logps/chosen": -179.0, + "logps/rejected": -170.0, + "loss": 0.6922, + "rewards/accuracies": 0.2083333283662796, + "rewards/chosen": -0.00019168853759765625, + "rewards/margins": -0.0002593994140625, + "rewards/rejected": 6.961822509765625e-05, + "step": 10 + }, + { + "epoch": 0.001030795000644247, + "grad_norm": 1.845636439566388, + "learning_rate": 5.151983513652756e-09, + "logits/chosen": -2.328125, + "logits/rejected": -2.140625, + "logps/chosen": -187.0, + "logps/rejected": -161.0, + "loss": 0.6926, + "rewards/accuracies": 0.3499999940395355, + "rewards/chosen": 0.000392913818359375, + "rewards/margins": -3.0517578125e-05, + "rewards/rejected": 0.0004215240478515625, + "step": 20 + }, + { + "epoch": 0.0015461925009663702, + "grad_norm": 1.9263187247781828, + "learning_rate": 7.727975270479133e-09, + "logits/chosen": -2.34375, + "logits/rejected": -2.125, + "logps/chosen": -201.0, + "logps/rejected": -202.0, + "loss": 0.6925, + "rewards/accuracies": 0.26249998807907104, + "rewards/chosen": -0.000438690185546875, + "rewards/margins": -0.0004711151123046875, + "rewards/rejected": 3.0517578125e-05, + "step": 30 + }, + { + "epoch": 0.002061590001288494, + "grad_norm": 1.7249151074711806, + "learning_rate": 1.0303967027305512e-08, + "logits/chosen": -2.453125, + "logits/rejected": -2.203125, + "logps/chosen": -173.0, + "logps/rejected": -153.0, + "loss": 0.6927, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": 0.0003910064697265625, + "rewards/margins": 0.0004711151123046875, + "rewards/rejected": -7.82012939453125e-05, + "step": 40 + }, + { + "epoch": 0.0025769875016106174, + "grad_norm": 1.918742645386906, + "learning_rate": 1.287995878413189e-08, + "logits/chosen": -2.421875, + "logits/rejected": -2.109375, + "logps/chosen": -176.0, + "logps/rejected": -167.0, + "loss": 0.693, + "rewards/accuracies": 0.3062500059604645, + "rewards/chosen": -0.00167083740234375, + "rewards/margins": -0.0012969970703125, + "rewards/rejected": -0.0003757476806640625, + "step": 50 + }, + { + "epoch": 0.0030923850019327404, + "grad_norm": 1.9731661869916353, + "learning_rate": 1.5455950540958267e-08, + "logits/chosen": -2.3125, + "logits/rejected": -2.09375, + "logps/chosen": -184.0, + "logps/rejected": -169.0, + "loss": 0.6927, + "rewards/accuracies": 0.28125, + "rewards/chosen": -0.0028228759765625, + "rewards/margins": -0.0021209716796875, + "rewards/rejected": -0.000705718994140625, + "step": 60 + }, + { + "epoch": 0.003607782502254864, + "grad_norm": 1.7921851949136531, + "learning_rate": 1.8031942297784647e-08, + "logits/chosen": -2.546875, + "logits/rejected": -2.078125, + "logps/chosen": -158.0, + "logps/rejected": -165.0, + "loss": 0.6926, + "rewards/accuracies": 0.39375001192092896, + "rewards/chosen": -0.0011138916015625, + "rewards/margins": 0.00168609619140625, + "rewards/rejected": -0.0027923583984375, + "step": 70 + }, + { + "epoch": 0.004123180002576988, + "grad_norm": 1.9241103057595568, + "learning_rate": 2.0607934054611024e-08, + "logits/chosen": -2.578125, + "logits/rejected": -2.265625, + "logps/chosen": -171.0, + "logps/rejected": -201.0, + "loss": 0.6925, + "rewards/accuracies": 0.34375, + "rewards/chosen": 0.0002346038818359375, + "rewards/margins": 0.000392913818359375, + "rewards/rejected": -0.000156402587890625, + "step": 80 + }, + { + "epoch": 0.004638577502899111, + "grad_norm": 1.8806357663286761, + "learning_rate": 2.3183925811437404e-08, + "logits/chosen": -2.453125, + "logits/rejected": -2.15625, + "logps/chosen": -179.0, + "logps/rejected": -150.0, + "loss": 0.6927, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": -0.0004062652587890625, + "rewards/margins": -0.000827789306640625, + "rewards/rejected": 0.0004215240478515625, + "step": 90 + }, + { + "epoch": 0.005153975003221235, + "grad_norm": 1.8494470653376172, + "learning_rate": 2.575991756826378e-08, + "logits/chosen": -2.40625, + "logits/rejected": -2.203125, + "logps/chosen": -185.0, + "logps/rejected": -170.0, + "loss": 0.6928, + "rewards/accuracies": 0.35624998807907104, + "rewards/chosen": -0.000690460205078125, + "rewards/margins": -0.0005950927734375, + "rewards/rejected": -9.298324584960938e-05, + "step": 100 + }, + { + "epoch": 0.005669372503543358, + "grad_norm": 2.1161648433085425, + "learning_rate": 2.8335909325090157e-08, + "logits/chosen": -2.515625, + "logits/rejected": -2.296875, + "logps/chosen": -189.0, + "logps/rejected": -155.0, + "loss": 0.6925, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": 0.001220703125, + "rewards/margins": 0.0018310546875, + "rewards/rejected": -0.0006103515625, + "step": 110 + }, + { + "epoch": 0.006184770003865481, + "grad_norm": 1.729278221114437, + "learning_rate": 3.0911901081916534e-08, + "logits/chosen": -2.453125, + "logits/rejected": -2.15625, + "logps/chosen": -170.0, + "logps/rejected": -158.0, + "loss": 0.6926, + "rewards/accuracies": 0.3125, + "rewards/chosen": -1.6450881958007812e-05, + "rewards/margins": -0.00028228759765625, + "rewards/rejected": 0.0002651214599609375, + "step": 120 + }, + { + "epoch": 0.006700167504187605, + "grad_norm": 1.8396780395497838, + "learning_rate": 3.348789283874291e-08, + "logits/chosen": -2.328125, + "logits/rejected": -2.140625, + "logps/chosen": -155.0, + "logps/rejected": -162.0, + "loss": 0.6928, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": 5.4836273193359375e-05, + "rewards/margins": 0.0008544921875, + "rewards/rejected": -0.000797271728515625, + "step": 130 + }, + { + "epoch": 0.007215565004509728, + "grad_norm": 1.91500468996866, + "learning_rate": 3.6063884595569294e-08, + "logits/chosen": -2.4375, + "logits/rejected": -2.140625, + "logps/chosen": -168.0, + "logps/rejected": -147.0, + "loss": 0.6925, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": -0.00067138671875, + "rewards/margins": -0.0002651214599609375, + "rewards/rejected": -0.0004062652587890625, + "step": 140 + }, + { + "epoch": 0.007730962504831852, + "grad_norm": 1.9755211253639784, + "learning_rate": 3.863987635239567e-08, + "logits/chosen": -2.46875, + "logits/rejected": -2.296875, + "logps/chosen": -192.0, + "logps/rejected": -164.0, + "loss": 0.6925, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": -0.0003299713134765625, + "rewards/margins": 3.818422555923462e-07, + "rewards/rejected": -0.00032806396484375, + "step": 150 + }, + { + "epoch": 0.008246360005153976, + "grad_norm": 1.8399907871326475, + "learning_rate": 4.121586810922205e-08, + "logits/chosen": -2.40625, + "logits/rejected": -1.921875, + "logps/chosen": -164.0, + "logps/rejected": -185.0, + "loss": 0.6922, + "rewards/accuracies": 0.3375000059604645, + "rewards/chosen": -7.724761962890625e-05, + "rewards/margins": 0.0002689361572265625, + "rewards/rejected": -0.0003452301025390625, + "step": 160 + }, + { + "epoch": 0.008761757505476099, + "grad_norm": 1.964955448665975, + "learning_rate": 4.379185986604843e-08, + "logits/chosen": -2.453125, + "logits/rejected": -2.21875, + "logps/chosen": -168.0, + "logps/rejected": -138.0, + "loss": 0.6925, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": 0.0014190673828125, + "rewards/margins": 0.00032806396484375, + "rewards/rejected": 0.00109100341796875, + "step": 170 + }, + { + "epoch": 0.009277155005798222, + "grad_norm": 1.7587263610000405, + "learning_rate": 4.636785162287481e-08, + "logits/chosen": -2.59375, + "logits/rejected": -2.265625, + "logps/chosen": -177.0, + "logps/rejected": -156.0, + "loss": 0.6923, + "rewards/accuracies": 0.41874998807907104, + "rewards/chosen": 0.00150299072265625, + "rewards/margins": 0.00115966796875, + "rewards/rejected": 0.0003452301025390625, + "step": 180 + }, + { + "epoch": 0.009792552506120345, + "grad_norm": 1.7120885747660863, + "learning_rate": 4.8943843379701184e-08, + "logits/chosen": -2.4375, + "logits/rejected": -2.109375, + "logps/chosen": -172.0, + "logps/rejected": -165.0, + "loss": 0.692, + "rewards/accuracies": 0.35624998807907104, + "rewards/chosen": 0.0012054443359375, + "rewards/margins": 0.00034332275390625, + "rewards/rejected": 0.000858306884765625, + "step": 190 + }, + { + "epoch": 0.01030795000644247, + "grad_norm": 2.050554943576804, + "learning_rate": 5.151983513652756e-08, + "logits/chosen": -2.40625, + "logits/rejected": -2.109375, + "logps/chosen": -157.0, + "logps/rejected": -147.0, + "loss": 0.6923, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.00140380859375, + "rewards/margins": 0.00115966796875, + "rewards/rejected": 0.0002498626708984375, + "step": 200 + }, + { + "epoch": 0.010823347506764593, + "grad_norm": 1.9771258607359028, + "learning_rate": 5.409582689335394e-08, + "logits/chosen": -2.46875, + "logits/rejected": -2.03125, + "logps/chosen": -171.0, + "logps/rejected": -144.0, + "loss": 0.6915, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.0032806396484375, + "rewards/margins": 0.00335693359375, + "rewards/rejected": -7.867813110351562e-05, + "step": 210 + }, + { + "epoch": 0.011338745007086716, + "grad_norm": 1.7203840400145423, + "learning_rate": 5.6671818650180314e-08, + "logits/chosen": -2.421875, + "logits/rejected": -2.1875, + "logps/chosen": -180.0, + "logps/rejected": -154.0, + "loss": 0.6909, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.00518798828125, + "rewards/margins": 0.0027923583984375, + "rewards/rejected": 0.0023956298828125, + "step": 220 + }, + { + "epoch": 0.011854142507408839, + "grad_norm": 1.8951892156339936, + "learning_rate": 5.92478104070067e-08, + "logits/chosen": -2.5, + "logits/rejected": -2.203125, + "logps/chosen": -170.0, + "logps/rejected": -161.0, + "loss": 0.6915, + "rewards/accuracies": 0.40625, + "rewards/chosen": 0.004058837890625, + "rewards/margins": 0.000518798828125, + "rewards/rejected": 0.0035400390625, + "step": 230 + }, + { + "epoch": 0.012369540007730962, + "grad_norm": 2.1125398578032195, + "learning_rate": 6.182380216383307e-08, + "logits/chosen": -2.421875, + "logits/rejected": -2.140625, + "logps/chosen": -178.0, + "logps/rejected": -174.0, + "loss": 0.6915, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.004669189453125, + "rewards/margins": 0.00165557861328125, + "rewards/rejected": 0.003021240234375, + "step": 240 + }, + { + "epoch": 0.012884937508053087, + "grad_norm": 2.1232145558433353, + "learning_rate": 6.439979392065944e-08, + "logits/chosen": -2.4375, + "logits/rejected": -2.234375, + "logps/chosen": -180.0, + "logps/rejected": -200.0, + "loss": 0.6914, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.004608154296875, + "rewards/margins": 0.0023193359375, + "rewards/rejected": 0.0022735595703125, + "step": 250 + }, + { + "epoch": 0.01340033500837521, + "grad_norm": 1.8056563671011512, + "learning_rate": 6.697578567748582e-08, + "logits/chosen": -2.484375, + "logits/rejected": -2.21875, + "logps/chosen": -174.0, + "logps/rejected": -178.0, + "loss": 0.6904, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.006591796875, + "rewards/margins": 0.00506591796875, + "rewards/rejected": 0.00153350830078125, + "step": 260 + }, + { + "epoch": 0.013915732508697333, + "grad_norm": 2.0117917202958133, + "learning_rate": 6.95517774343122e-08, + "logits/chosen": -2.4375, + "logits/rejected": -2.21875, + "logps/chosen": -189.0, + "logps/rejected": -173.0, + "loss": 0.6903, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.00640869140625, + "rewards/margins": 0.004730224609375, + "rewards/rejected": 0.00167083740234375, + "step": 270 + }, + { + "epoch": 0.014431130009019456, + "grad_norm": 2.040838163478164, + "learning_rate": 7.212776919113859e-08, + "logits/chosen": -2.5, + "logits/rejected": -2.375, + "logps/chosen": -186.0, + "logps/rejected": -162.0, + "loss": 0.6902, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0087890625, + "rewards/margins": 0.0052490234375, + "rewards/rejected": 0.0035400390625, + "step": 280 + }, + { + "epoch": 0.01494652750934158, + "grad_norm": 1.8190430538680442, + "learning_rate": 7.470376094796496e-08, + "logits/chosen": -2.4375, + "logits/rejected": -2.203125, + "logps/chosen": -157.0, + "logps/rejected": -158.0, + "loss": 0.6895, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.01385498046875, + "rewards/margins": 0.006591796875, + "rewards/rejected": 0.007293701171875, + "step": 290 + }, + { + "epoch": 0.015461925009663703, + "grad_norm": 1.8975673467815963, + "learning_rate": 7.727975270479134e-08, + "logits/chosen": -2.3125, + "logits/rejected": -2.0625, + "logps/chosen": -175.0, + "logps/rejected": -178.0, + "loss": 0.6892, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0107421875, + "rewards/margins": 0.0078125, + "rewards/rejected": 0.0029449462890625, + "step": 300 + }, + { + "epoch": 0.015977322509985827, + "grad_norm": 1.9189914274781914, + "learning_rate": 7.985574446161772e-08, + "logits/chosen": -2.375, + "logits/rejected": -2.203125, + "logps/chosen": -166.0, + "logps/rejected": -139.0, + "loss": 0.6894, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0125732421875, + "rewards/margins": 0.007476806640625, + "rewards/rejected": 0.005096435546875, + "step": 310 + }, + { + "epoch": 0.01649272001030795, + "grad_norm": 1.9862474119612328, + "learning_rate": 8.24317362184441e-08, + "logits/chosen": -2.515625, + "logits/rejected": -2.234375, + "logps/chosen": -172.0, + "logps/rejected": -160.0, + "loss": 0.6881, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.01263427734375, + "rewards/margins": 0.00830078125, + "rewards/rejected": 0.00433349609375, + "step": 320 + }, + { + "epoch": 0.017008117510630073, + "grad_norm": 1.8857178216322663, + "learning_rate": 8.500772797527047e-08, + "logits/chosen": -2.578125, + "logits/rejected": -2.234375, + "logps/chosen": -183.0, + "logps/rejected": -162.0, + "loss": 0.6881, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0162353515625, + "rewards/margins": 0.0101318359375, + "rewards/rejected": 0.006103515625, + "step": 330 + }, + { + "epoch": 0.017523515010952197, + "grad_norm": 1.9023450570287852, + "learning_rate": 8.758371973209686e-08, + "logits/chosen": -2.5625, + "logits/rejected": -2.25, + "logps/chosen": -193.0, + "logps/rejected": -177.0, + "loss": 0.6872, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.017578125, + "rewards/margins": 0.01080322265625, + "rewards/rejected": 0.0068359375, + "step": 340 + }, + { + "epoch": 0.01803891251127432, + "grad_norm": 1.8762718827076927, + "learning_rate": 9.015971148892324e-08, + "logits/chosen": -2.5625, + "logits/rejected": -2.359375, + "logps/chosen": -183.0, + "logps/rejected": -174.0, + "loss": 0.6876, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.013916015625, + "rewards/margins": 0.011962890625, + "rewards/rejected": 0.001922607421875, + "step": 350 + }, + { + "epoch": 0.018554310011596443, + "grad_norm": 1.8193107885729907, + "learning_rate": 9.273570324574961e-08, + "logits/chosen": -2.53125, + "logits/rejected": -2.28125, + "logps/chosen": -195.0, + "logps/rejected": -180.0, + "loss": 0.6862, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.0162353515625, + "rewards/margins": 0.0123291015625, + "rewards/rejected": 0.00390625, + "step": 360 + }, + { + "epoch": 0.01906970751191857, + "grad_norm": 1.8364732352444786, + "learning_rate": 9.531169500257599e-08, + "logits/chosen": -2.53125, + "logits/rejected": -2.203125, + "logps/chosen": -177.0, + "logps/rejected": -175.0, + "loss": 0.6855, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.0185546875, + "rewards/margins": 0.01904296875, + "rewards/rejected": -0.0004673004150390625, + "step": 370 + }, + { + "epoch": 0.01958510501224069, + "grad_norm": 1.7365184397919573, + "learning_rate": 9.788768675940237e-08, + "logits/chosen": -2.328125, + "logits/rejected": -2.234375, + "logps/chosen": -186.0, + "logps/rejected": -164.0, + "loss": 0.6856, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.01708984375, + "rewards/margins": 0.00970458984375, + "rewards/rejected": 0.00732421875, + "step": 380 + }, + { + "epoch": 0.020100502512562814, + "grad_norm": 1.8501229709188791, + "learning_rate": 1.0046367851622874e-07, + "logits/chosen": -2.484375, + "logits/rejected": -2.21875, + "logps/chosen": -169.0, + "logps/rejected": -143.0, + "loss": 0.6846, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.0240478515625, + "rewards/margins": 0.018310546875, + "rewards/rejected": 0.005706787109375, + "step": 390 + }, + { + "epoch": 0.02061590001288494, + "grad_norm": 1.9497237778181722, + "learning_rate": 1.0303967027305512e-07, + "logits/chosen": -2.625, + "logits/rejected": -2.21875, + "logps/chosen": -168.0, + "logps/rejected": -150.0, + "loss": 0.6829, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0260009765625, + "rewards/margins": 0.0228271484375, + "rewards/rejected": 0.0031890869140625, + "step": 400 + }, + { + "epoch": 0.02113129751320706, + "grad_norm": 1.8273476133809328, + "learning_rate": 1.056156620298815e-07, + "logits/chosen": -2.46875, + "logits/rejected": -2.265625, + "logps/chosen": -176.0, + "logps/rejected": -172.0, + "loss": 0.683, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.017822265625, + "rewards/margins": 0.02490234375, + "rewards/rejected": -0.0072021484375, + "step": 410 + }, + { + "epoch": 0.021646695013529185, + "grad_norm": 2.0246738586450044, + "learning_rate": 1.0819165378670788e-07, + "logits/chosen": -2.59375, + "logits/rejected": -2.28125, + "logps/chosen": -174.0, + "logps/rejected": -178.0, + "loss": 0.6822, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.0157470703125, + "rewards/margins": 0.0216064453125, + "rewards/rejected": -0.005889892578125, + "step": 420 + }, + { + "epoch": 0.022162092513851307, + "grad_norm": 1.6643537574060239, + "learning_rate": 1.1076764554353425e-07, + "logits/chosen": -2.375, + "logits/rejected": -2.140625, + "logps/chosen": -153.0, + "logps/rejected": -150.0, + "loss": 0.6808, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.0191650390625, + "rewards/margins": 0.0235595703125, + "rewards/rejected": -0.004486083984375, + "step": 430 + }, + { + "epoch": 0.02267749001417343, + "grad_norm": 1.9883859818907657, + "learning_rate": 1.1334363730036063e-07, + "logits/chosen": -2.53125, + "logits/rejected": -2.1875, + "logps/chosen": -182.0, + "logps/rejected": -168.0, + "loss": 0.6806, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.0211181640625, + "rewards/margins": 0.034423828125, + "rewards/rejected": -0.01336669921875, + "step": 440 + }, + { + "epoch": 0.023192887514495556, + "grad_norm": 2.2879463071821964, + "learning_rate": 1.15919629057187e-07, + "logits/chosen": -2.5625, + "logits/rejected": -2.15625, + "logps/chosen": -164.0, + "logps/rejected": -153.0, + "loss": 0.679, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.0186767578125, + "rewards/margins": 0.032958984375, + "rewards/rejected": -0.01416015625, + "step": 450 + }, + { + "epoch": 0.023708285014817677, + "grad_norm": 6.276636737033863, + "learning_rate": 1.184956208140134e-07, + "logits/chosen": -2.53125, + "logits/rejected": -2.28125, + "logps/chosen": -183.0, + "logps/rejected": -159.0, + "loss": 0.6782, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.0152587890625, + "rewards/margins": 0.033447265625, + "rewards/rejected": -0.01806640625, + "step": 460 + }, + { + "epoch": 0.024223682515139802, + "grad_norm": 1.660447099806635, + "learning_rate": 1.2107161257083976e-07, + "logits/chosen": -2.609375, + "logits/rejected": -2.328125, + "logps/chosen": -174.0, + "logps/rejected": -159.0, + "loss": 0.6775, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.0157470703125, + "rewards/margins": 0.033203125, + "rewards/rejected": -0.0174560546875, + "step": 470 + }, + { + "epoch": 0.024739080015461924, + "grad_norm": 1.8229996419208616, + "learning_rate": 1.2364760432766614e-07, + "logits/chosen": -2.640625, + "logits/rejected": -2.40625, + "logps/chosen": -192.0, + "logps/rejected": -182.0, + "loss": 0.676, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0030670166015625, + "rewards/margins": 0.026611328125, + "rewards/rejected": -0.0235595703125, + "step": 480 + }, + { + "epoch": 0.02525447751578405, + "grad_norm": 1.8826861421082453, + "learning_rate": 1.262235960844925e-07, + "logits/chosen": -2.515625, + "logits/rejected": -2.15625, + "logps/chosen": -176.0, + "logps/rejected": -165.0, + "loss": 0.6742, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.0120849609375, + "rewards/margins": 0.039794921875, + "rewards/rejected": -0.027587890625, + "step": 490 + }, + { + "epoch": 0.025769875016106173, + "grad_norm": 1.989472780477497, + "learning_rate": 1.287995878413189e-07, + "logits/chosen": -2.53125, + "logits/rejected": -2.265625, + "logps/chosen": -175.0, + "logps/rejected": -148.0, + "loss": 0.6725, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.012451171875, + "rewards/margins": 0.04541015625, + "rewards/rejected": -0.032958984375, + "step": 500 + }, + { + "epoch": 0.026285272516428294, + "grad_norm": 1.9554241473553333, + "learning_rate": 1.3137557959814527e-07, + "logits/chosen": -2.5625, + "logits/rejected": -2.3125, + "logps/chosen": -176.0, + "logps/rejected": -174.0, + "loss": 0.6703, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0036773681640625, + "rewards/margins": 0.0361328125, + "rewards/rejected": -0.039794921875, + "step": 510 + }, + { + "epoch": 0.02680067001675042, + "grad_norm": 2.0299264358447493, + "learning_rate": 1.3395157135497164e-07, + "logits/chosen": -2.5625, + "logits/rejected": -2.375, + "logps/chosen": -174.0, + "logps/rejected": -156.0, + "loss": 0.6698, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00299072265625, + "rewards/margins": 0.03955078125, + "rewards/rejected": -0.04248046875, + "step": 520 + }, + { + "epoch": 0.02731606751707254, + "grad_norm": 1.928939572747326, + "learning_rate": 1.3652756311179802e-07, + "logits/chosen": -2.640625, + "logits/rejected": -2.34375, + "logps/chosen": -184.0, + "logps/rejected": -163.0, + "loss": 0.6674, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.0186767578125, + "rewards/margins": 0.04931640625, + "rewards/rejected": -0.06787109375, + "step": 530 + }, + { + "epoch": 0.027831465017394665, + "grad_norm": 1.8993206430942549, + "learning_rate": 1.391035548686244e-07, + "logits/chosen": -2.625, + "logits/rejected": -2.296875, + "logps/chosen": -162.0, + "logps/rejected": -156.0, + "loss": 0.6643, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0206298828125, + "rewards/margins": 0.048095703125, + "rewards/rejected": -0.06884765625, + "step": 540 + }, + { + "epoch": 0.02834686251771679, + "grad_norm": 1.9623219649831574, + "learning_rate": 1.4167954662545077e-07, + "logits/chosen": -2.5625, + "logits/rejected": -2.203125, + "logps/chosen": -176.0, + "logps/rejected": -173.0, + "loss": 0.6624, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.047119140625, + "rewards/margins": 0.06884765625, + "rewards/rejected": -0.11572265625, + "step": 550 + }, + { + "epoch": 0.02886226001803891, + "grad_norm": 1.9087363641009303, + "learning_rate": 1.4425553838227718e-07, + "logits/chosen": -2.484375, + "logits/rejected": -2.234375, + "logps/chosen": -184.0, + "logps/rejected": -171.0, + "loss": 0.6597, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.030517578125, + "rewards/margins": 0.08251953125, + "rewards/rejected": -0.11328125, + "step": 560 + }, + { + "epoch": 0.029377657518361036, + "grad_norm": 2.424883748793067, + "learning_rate": 1.4683153013910355e-07, + "logits/chosen": -2.71875, + "logits/rejected": -2.28125, + "logps/chosen": -178.0, + "logps/rejected": -216.0, + "loss": 0.6535, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.060302734375, + "rewards/margins": 0.0986328125, + "rewards/rejected": -0.1591796875, + "step": 570 + }, + { + "epoch": 0.02989305501868316, + "grad_norm": 2.042524843810125, + "learning_rate": 1.4940752189592993e-07, + "logits/chosen": -2.671875, + "logits/rejected": -2.40625, + "logps/chosen": -184.0, + "logps/rejected": -181.0, + "loss": 0.6497, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.06591796875, + "rewards/margins": 0.123046875, + "rewards/rejected": -0.189453125, + "step": 580 + }, + { + "epoch": 0.030408452519005282, + "grad_norm": 2.7593811037469917, + "learning_rate": 1.519835136527563e-07, + "logits/chosen": -2.75, + "logits/rejected": -2.375, + "logps/chosen": -169.0, + "logps/rejected": -178.0, + "loss": 0.6459, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.10107421875, + "rewards/margins": 0.09765625, + "rewards/rejected": -0.19921875, + "step": 590 + }, + { + "epoch": 0.030923850019327407, + "grad_norm": 2.5124700773071704, + "learning_rate": 1.5455950540958268e-07, + "logits/chosen": -2.625, + "logits/rejected": -2.40625, + "logps/chosen": -196.0, + "logps/rejected": -209.0, + "loss": 0.6395, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.12451171875, + "rewards/margins": 0.138671875, + "rewards/rejected": -0.263671875, + "step": 600 + }, + { + "epoch": 0.03143924751964953, + "grad_norm": 2.1819639978006644, + "learning_rate": 1.5713549716640906e-07, + "logits/chosen": -2.65625, + "logits/rejected": -2.40625, + "logps/chosen": -184.0, + "logps/rejected": -194.0, + "loss": 0.6395, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.173828125, + "rewards/margins": 0.119140625, + "rewards/rejected": -0.29296875, + "step": 610 + }, + { + "epoch": 0.03195464501997165, + "grad_norm": 2.2212260929957472, + "learning_rate": 1.5971148892323544e-07, + "logits/chosen": -2.640625, + "logits/rejected": -2.296875, + "logps/chosen": -200.0, + "logps/rejected": -195.0, + "loss": 0.635, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.21875, + "rewards/margins": 0.1240234375, + "rewards/rejected": -0.341796875, + "step": 620 + }, + { + "epoch": 0.03247004252029378, + "grad_norm": 2.146404818070363, + "learning_rate": 1.622874806800618e-07, + "logits/chosen": -2.734375, + "logits/rejected": -2.46875, + "logps/chosen": -191.0, + "logps/rejected": -206.0, + "loss": 0.6337, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.240234375, + "rewards/margins": 0.1669921875, + "rewards/rejected": -0.408203125, + "step": 630 + }, + { + "epoch": 0.0329854400206159, + "grad_norm": 2.2000226927411273, + "learning_rate": 1.648634724368882e-07, + "logits/chosen": -2.78125, + "logits/rejected": -2.46875, + "logps/chosen": -239.0, + "logps/rejected": -206.0, + "loss": 0.6229, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.29296875, + "rewards/margins": 0.1396484375, + "rewards/rejected": -0.43359375, + "step": 640 + }, + { + "epoch": 0.03350083752093802, + "grad_norm": 2.683284954193124, + "learning_rate": 1.6743946419371457e-07, + "logits/chosen": -2.75, + "logits/rejected": -2.5, + "logps/chosen": -206.0, + "logps/rejected": -224.0, + "loss": 0.6186, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.31640625, + "rewards/margins": 0.2080078125, + "rewards/rejected": -0.5234375, + "step": 650 + }, + { + "epoch": 0.034016235021260145, + "grad_norm": 2.320514508663569, + "learning_rate": 1.7001545595054094e-07, + "logits/chosen": -2.6875, + "logits/rejected": -2.359375, + "logps/chosen": -205.0, + "logps/rejected": -228.0, + "loss": 0.6098, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.34375, + "rewards/margins": 0.244140625, + "rewards/rejected": -0.58984375, + "step": 660 + }, + { + "epoch": 0.03453163252158227, + "grad_norm": 2.3767942599451537, + "learning_rate": 1.7259144770736732e-07, + "logits/chosen": -2.71875, + "logits/rejected": -2.390625, + "logps/chosen": -186.0, + "logps/rejected": -202.0, + "loss": 0.6009, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.37890625, + "rewards/margins": 0.224609375, + "rewards/rejected": -0.6015625, + "step": 670 + }, + { + "epoch": 0.035047030021904395, + "grad_norm": 2.2390700980996505, + "learning_rate": 1.7516743946419372e-07, + "logits/chosen": -2.703125, + "logits/rejected": -2.421875, + "logps/chosen": -226.0, + "logps/rejected": -218.0, + "loss": 0.6086, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.435546875, + "rewards/margins": 0.24609375, + "rewards/rejected": -0.6796875, + "step": 680 + }, + { + "epoch": 0.03556242752222652, + "grad_norm": 2.2783170620492923, + "learning_rate": 1.777434312210201e-07, + "logits/chosen": -2.859375, + "logits/rejected": -2.46875, + "logps/chosen": -209.0, + "logps/rejected": -244.0, + "loss": 0.6059, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.435546875, + "rewards/margins": 0.298828125, + "rewards/rejected": -0.734375, + "step": 690 + }, + { + "epoch": 0.03607782502254864, + "grad_norm": 2.4117176591449683, + "learning_rate": 1.8031942297784648e-07, + "logits/chosen": -2.765625, + "logits/rejected": -2.40625, + "logps/chosen": -228.0, + "logps/rejected": -229.0, + "loss": 0.5984, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.50390625, + "rewards/margins": 0.2138671875, + "rewards/rejected": -0.71875, + "step": 700 + }, + { + "epoch": 0.03659322252287076, + "grad_norm": 2.65331139123412, + "learning_rate": 1.8289541473467285e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.5, + "logps/chosen": -213.0, + "logps/rejected": -262.0, + "loss": 0.5838, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.490234375, + "rewards/margins": 0.341796875, + "rewards/rejected": -0.83203125, + "step": 710 + }, + { + "epoch": 0.03710862002319289, + "grad_norm": 2.507908249135596, + "learning_rate": 1.8547140649149923e-07, + "logits/chosen": -2.8125, + "logits/rejected": -2.4375, + "logps/chosen": -232.0, + "logps/rejected": -242.0, + "loss": 0.5942, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.54296875, + "rewards/margins": 0.279296875, + "rewards/rejected": -0.8203125, + "step": 720 + }, + { + "epoch": 0.03762401752351501, + "grad_norm": 2.852547925076602, + "learning_rate": 1.880473982483256e-07, + "logits/chosen": -2.90625, + "logits/rejected": -2.625, + "logps/chosen": -235.0, + "logps/rejected": -254.0, + "loss": 0.5889, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.57421875, + "rewards/margins": 0.28515625, + "rewards/rejected": -0.859375, + "step": 730 + }, + { + "epoch": 0.03813941502383714, + "grad_norm": 3.120176660062423, + "learning_rate": 1.9062339000515198e-07, + "logits/chosen": -2.875, + "logits/rejected": -2.578125, + "logps/chosen": -228.0, + "logps/rejected": -266.0, + "loss": 0.5725, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.59765625, + "rewards/margins": 0.400390625, + "rewards/rejected": -0.99609375, + "step": 740 + }, + { + "epoch": 0.038654812524159254, + "grad_norm": 3.353216050340019, + "learning_rate": 1.9319938176197836e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.875, + "logps/chosen": -242.0, + "logps/rejected": -284.0, + "loss": 0.572, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.6640625, + "rewards/margins": 0.439453125, + "rewards/rejected": -1.1015625, + "step": 750 + }, + { + "epoch": 0.03917021002448138, + "grad_norm": 3.460693814550771, + "learning_rate": 1.9577537351880474e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.875, + "logps/chosen": -251.0, + "logps/rejected": -294.0, + "loss": 0.5638, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.734375, + "rewards/margins": 0.4453125, + "rewards/rejected": -1.1796875, + "step": 760 + }, + { + "epoch": 0.039685607524803504, + "grad_norm": 3.8748807156212726, + "learning_rate": 1.983513652756311e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.875, + "logps/chosen": -240.0, + "logps/rejected": -270.0, + "loss": 0.5548, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.73046875, + "rewards/margins": 0.453125, + "rewards/rejected": -1.1875, + "step": 770 + }, + { + "epoch": 0.04020100502512563, + "grad_norm": 3.298838595555662, + "learning_rate": 2.009273570324575e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.90625, + "logps/chosen": -252.0, + "logps/rejected": -294.0, + "loss": 0.5684, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.734375, + "rewards/margins": 0.5234375, + "rewards/rejected": -1.2578125, + "step": 780 + }, + { + "epoch": 0.040716402525447754, + "grad_norm": 3.7711378104164597, + "learning_rate": 2.0350334878928384e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.875, + "logps/chosen": -256.0, + "logps/rejected": -278.0, + "loss": 0.5659, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.79296875, + "rewards/margins": 0.4296875, + "rewards/rejected": -1.2265625, + "step": 790 + }, + { + "epoch": 0.04123180002576988, + "grad_norm": 5.295599581506164, + "learning_rate": 2.0607934054611024e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.0, + "logps/chosen": -240.0, + "logps/rejected": -270.0, + "loss": 0.5515, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7265625, + "rewards/margins": 0.47265625, + "rewards/rejected": -1.203125, + "step": 800 + }, + { + "epoch": 0.041747197526091996, + "grad_norm": 3.851488419693965, + "learning_rate": 2.0865533230293662e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.984375, + "logps/chosen": -258.0, + "logps/rejected": -294.0, + "loss": 0.5373, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.79296875, + "rewards/margins": 0.59765625, + "rewards/rejected": -1.390625, + "step": 810 + }, + { + "epoch": 0.04226259502641412, + "grad_norm": 4.571198767974995, + "learning_rate": 2.11231324059763e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.078125, + "logps/chosen": -278.0, + "logps/rejected": -320.0, + "loss": 0.5526, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.83984375, + "rewards/margins": 0.546875, + "rewards/rejected": -1.390625, + "step": 820 + }, + { + "epoch": 0.042777992526736246, + "grad_norm": 4.989670237666831, + "learning_rate": 2.1380731581658937e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.859375, + "logps/chosen": -272.0, + "logps/rejected": -304.0, + "loss": 0.5361, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.796875, + "rewards/margins": 0.490234375, + "rewards/rejected": -1.2890625, + "step": 830 + }, + { + "epoch": 0.04329339002705837, + "grad_norm": 4.12568366463292, + "learning_rate": 2.1638330757341575e-07, + "logits/chosen": -3.140625, + "logits/rejected": -3.0625, + "logps/chosen": -282.0, + "logps/rejected": -286.0, + "loss": 0.5509, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.9140625, + "rewards/margins": 0.37109375, + "rewards/rejected": -1.2890625, + "step": 840 + }, + { + "epoch": 0.043808787527380495, + "grad_norm": 4.175260872943488, + "learning_rate": 2.1895929933024213e-07, + "logits/chosen": -3.1875, + "logits/rejected": -3.078125, + "logps/chosen": -256.0, + "logps/rejected": -290.0, + "loss": 0.5346, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8515625, + "rewards/margins": 0.470703125, + "rewards/rejected": -1.3203125, + "step": 850 + }, + { + "epoch": 0.04432418502770261, + "grad_norm": 4.227087917418865, + "learning_rate": 2.215352910870685e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.96875, + "logps/chosen": -258.0, + "logps/rejected": -328.0, + "loss": 0.5247, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.84765625, + "rewards/margins": 0.65234375, + "rewards/rejected": -1.5, + "step": 860 + }, + { + "epoch": 0.04483958252802474, + "grad_norm": 4.334496018435708, + "learning_rate": 2.2411128284389488e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.96875, + "logps/chosen": -270.0, + "logps/rejected": -298.0, + "loss": 0.538, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.09375, + "rewards/margins": 0.458984375, + "rewards/rejected": -1.5546875, + "step": 870 + }, + { + "epoch": 0.04535498002834686, + "grad_norm": 4.403588625835884, + "learning_rate": 2.2668727460072126e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.921875, + "logps/chosen": -245.0, + "logps/rejected": -304.0, + "loss": 0.5212, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.94921875, + "rewards/margins": 0.625, + "rewards/rejected": -1.578125, + "step": 880 + }, + { + "epoch": 0.04587037752866899, + "grad_norm": 4.8292547795876715, + "learning_rate": 2.2926326635754763e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.8125, + "logps/chosen": -270.0, + "logps/rejected": -326.0, + "loss": 0.5293, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.90625, + "rewards/margins": 0.578125, + "rewards/rejected": -1.484375, + "step": 890 + }, + { + "epoch": 0.04638577502899111, + "grad_norm": 5.806418620408354, + "learning_rate": 2.31839258114374e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.96875, + "logps/chosen": -290.0, + "logps/rejected": -328.0, + "loss": 0.5281, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.03125, + "rewards/margins": 0.73046875, + "rewards/rejected": -1.765625, + "step": 900 + }, + { + "epoch": 0.04690117252931323, + "grad_norm": 4.1810267176678995, + "learning_rate": 2.344152498712004e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.171875, + "logps/chosen": -304.0, + "logps/rejected": -336.0, + "loss": 0.5053, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.09375, + "rewards/margins": 0.80078125, + "rewards/rejected": -1.890625, + "step": 910 + }, + { + "epoch": 0.047416570029635355, + "grad_norm": 6.111194360481807, + "learning_rate": 2.369912416280268e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.03125, + "logps/chosen": -258.0, + "logps/rejected": -344.0, + "loss": 0.5184, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.99609375, + "rewards/margins": 0.7421875, + "rewards/rejected": -1.7421875, + "step": 920 + }, + { + "epoch": 0.04793196752995748, + "grad_norm": 6.861672598027938, + "learning_rate": 2.3956723338485317e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.984375, + "logps/chosen": -328.0, + "logps/rejected": -376.0, + "loss": 0.515, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.15625, + "rewards/margins": 0.86328125, + "rewards/rejected": -2.03125, + "step": 930 + }, + { + "epoch": 0.048447365030279604, + "grad_norm": 4.781273222413918, + "learning_rate": 2.421432251416795e-07, + "logits/chosen": -3.125, + "logits/rejected": -3.078125, + "logps/chosen": -298.0, + "logps/rejected": -348.0, + "loss": 0.5109, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.078125, + "rewards/margins": 0.79296875, + "rewards/rejected": -1.8671875, + "step": 940 + }, + { + "epoch": 0.04896276253060173, + "grad_norm": 5.225683164592811, + "learning_rate": 2.447192168985059e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.90625, + "logps/chosen": -274.0, + "logps/rejected": -330.0, + "loss": 0.5317, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.0, + "rewards/margins": 0.7734375, + "rewards/rejected": -1.78125, + "step": 950 + }, + { + "epoch": 0.04947816003092385, + "grad_norm": 5.715406183414545, + "learning_rate": 2.4729520865533227e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.890625, + "logps/chosen": -248.0, + "logps/rejected": -296.0, + "loss": 0.533, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0078125, + "rewards/margins": 0.5078125, + "rewards/rejected": -1.515625, + "step": 960 + }, + { + "epoch": 0.04999355753124597, + "grad_norm": 10.391487453656136, + "learning_rate": 2.498712004121587e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.890625, + "logps/chosen": -304.0, + "logps/rejected": -376.0, + "loss": 0.5073, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1953125, + "rewards/margins": 0.77734375, + "rewards/rejected": -1.96875, + "step": 970 + }, + { + "epoch": 0.0505089550315681, + "grad_norm": 5.6824952005863585, + "learning_rate": 2.52447192168985e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.6875, + "logps/chosen": -264.0, + "logps/rejected": -338.0, + "loss": 0.5309, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.140625, + "rewards/margins": 0.80078125, + "rewards/rejected": -1.9375, + "step": 980 + }, + { + "epoch": 0.05102435253189022, + "grad_norm": 5.291202460264466, + "learning_rate": 2.5502318392581143e-07, + "logits/chosen": -3.125, + "logits/rejected": -3.0, + "logps/chosen": -284.0, + "logps/rejected": -354.0, + "loss": 0.5, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.15625, + "rewards/margins": 0.6484375, + "rewards/rejected": -1.8046875, + "step": 990 + }, + { + "epoch": 0.051539750032212346, + "grad_norm": 5.355732269140899, + "learning_rate": 2.575991756826378e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.8125, + "logps/chosen": -312.0, + "logps/rejected": -394.0, + "loss": 0.5185, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.265625, + "rewards/margins": 0.9765625, + "rewards/rejected": -2.25, + "step": 1000 + }, + { + "epoch": 0.052055147532534464, + "grad_norm": 8.751752880308684, + "learning_rate": 2.601751674394642e-07, + "logits/chosen": -3.109375, + "logits/rejected": -3.0, + "logps/chosen": -312.0, + "logps/rejected": -350.0, + "loss": 0.5087, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.28125, + "rewards/margins": 0.578125, + "rewards/rejected": -1.859375, + "step": 1010 + }, + { + "epoch": 0.05257054503285659, + "grad_norm": 6.9734200353175355, + "learning_rate": 2.6275115919629053e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.953125, + "logps/chosen": -300.0, + "logps/rejected": -372.0, + "loss": 0.4925, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.1796875, + "rewards/margins": 0.78515625, + "rewards/rejected": -1.96875, + "step": 1020 + }, + { + "epoch": 0.053085942533178714, + "grad_norm": 5.5312767104309355, + "learning_rate": 2.6532715095311693e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.9375, + "logps/chosen": -306.0, + "logps/rejected": -350.0, + "loss": 0.5059, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1875, + "rewards/margins": 0.73828125, + "rewards/rejected": -1.921875, + "step": 1030 + }, + { + "epoch": 0.05360134003350084, + "grad_norm": 7.312117621372699, + "learning_rate": 2.679031427099433e-07, + "logits/chosen": -3.1875, + "logits/rejected": -3.046875, + "logps/chosen": -310.0, + "logps/rejected": -352.0, + "loss": 0.5021, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1796875, + "rewards/margins": 0.6953125, + "rewards/rejected": -1.875, + "step": 1040 + }, + { + "epoch": 0.05411673753382296, + "grad_norm": 6.716674532997825, + "learning_rate": 2.704791344667697e-07, + "logits/chosen": -3.109375, + "logits/rejected": -3.046875, + "logps/chosen": -310.0, + "logps/rejected": -356.0, + "loss": 0.5133, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.328125, + "rewards/margins": 0.69921875, + "rewards/rejected": -2.03125, + "step": 1050 + }, + { + "epoch": 0.05463213503414508, + "grad_norm": 6.783153840086816, + "learning_rate": 2.7305512622359604e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.84375, + "logps/chosen": -282.0, + "logps/rejected": -346.0, + "loss": 0.4903, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.0859375, + "rewards/margins": 0.79296875, + "rewards/rejected": -1.875, + "step": 1060 + }, + { + "epoch": 0.055147532534467206, + "grad_norm": 8.439783913250835, + "learning_rate": 2.7563111798042244e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.953125, + "logps/chosen": -374.0, + "logps/rejected": -418.0, + "loss": 0.4836, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5, + "rewards/margins": 0.90234375, + "rewards/rejected": -2.40625, + "step": 1070 + }, + { + "epoch": 0.05566293003478933, + "grad_norm": 6.758424218037328, + "learning_rate": 2.782071097372488e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.796875, + "logps/chosen": -322.0, + "logps/rejected": -410.0, + "loss": 0.5138, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.375, + "rewards/margins": 1.0, + "rewards/rejected": -2.375, + "step": 1080 + }, + { + "epoch": 0.056178327535111455, + "grad_norm": 8.125020191754599, + "learning_rate": 2.807831014940752e-07, + "logits/chosen": -2.859375, + "logits/rejected": -2.84375, + "logps/chosen": -306.0, + "logps/rejected": -342.0, + "loss": 0.4982, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.1484375, + "rewards/margins": 0.68359375, + "rewards/rejected": -1.828125, + "step": 1090 + }, + { + "epoch": 0.05669372503543358, + "grad_norm": 7.625360676076644, + "learning_rate": 2.8335909325090154e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.96875, + "logps/chosen": -320.0, + "logps/rejected": -368.0, + "loss": 0.5088, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.5234375, + "rewards/margins": 0.5390625, + "rewards/rejected": -2.0625, + "step": 1100 + }, + { + "epoch": 0.057209122535755705, + "grad_norm": 5.974638598891512, + "learning_rate": 2.8593508500772795e-07, + "logits/chosen": -2.890625, + "logits/rejected": -2.65625, + "logps/chosen": -316.0, + "logps/rejected": -378.0, + "loss": 0.498, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3125, + "rewards/margins": 0.6796875, + "rewards/rejected": -1.9921875, + "step": 1110 + }, + { + "epoch": 0.05772452003607782, + "grad_norm": 15.306896226057646, + "learning_rate": 2.8851107676455435e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.796875, + "logps/chosen": -330.0, + "logps/rejected": -380.0, + "loss": 0.4853, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.578125, + "rewards/margins": 0.8046875, + "rewards/rejected": -2.390625, + "step": 1120 + }, + { + "epoch": 0.05823991753639995, + "grad_norm": 5.926246583614072, + "learning_rate": 2.910870685213807e-07, + "logits/chosen": -2.796875, + "logits/rejected": -2.78125, + "logps/chosen": -336.0, + "logps/rejected": -430.0, + "loss": 0.4875, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.5, + "rewards/margins": 0.97265625, + "rewards/rejected": -2.46875, + "step": 1130 + }, + { + "epoch": 0.05875531503672207, + "grad_norm": 7.013013714165615, + "learning_rate": 2.936630602782071e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.71875, + "logps/chosen": -300.0, + "logps/rejected": -400.0, + "loss": 0.4711, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.40625, + "rewards/margins": 0.83984375, + "rewards/rejected": -2.25, + "step": 1140 + }, + { + "epoch": 0.0592707125370442, + "grad_norm": 6.578634836308115, + "learning_rate": 2.9623905203503345e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.84375, + "logps/chosen": -350.0, + "logps/rejected": -458.0, + "loss": 0.4726, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6640625, + "rewards/margins": 1.1875, + "rewards/rejected": -2.859375, + "step": 1150 + }, + { + "epoch": 0.05978611003736632, + "grad_norm": 7.231527410434733, + "learning_rate": 2.9881504379185986e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.84375, + "logps/chosen": -340.0, + "logps/rejected": -412.0, + "loss": 0.4794, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.578125, + "rewards/margins": 1.046875, + "rewards/rejected": -2.625, + "step": 1160 + }, + { + "epoch": 0.06030150753768844, + "grad_norm": 7.084681185793602, + "learning_rate": 3.013910355486862e-07, + "logits/chosen": -2.828125, + "logits/rejected": -2.734375, + "logps/chosen": -352.0, + "logps/rejected": -406.0, + "loss": 0.4811, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.6640625, + "rewards/margins": 0.68359375, + "rewards/rejected": -2.34375, + "step": 1170 + }, + { + "epoch": 0.060816905038010564, + "grad_norm": 7.655911625118512, + "learning_rate": 3.039670273055126e-07, + "logits/chosen": -2.90625, + "logits/rejected": -2.734375, + "logps/chosen": -340.0, + "logps/rejected": -432.0, + "loss": 0.4712, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.546875, + "rewards/margins": 1.015625, + "rewards/rejected": -2.5625, + "step": 1180 + }, + { + "epoch": 0.06133230253833269, + "grad_norm": 8.102461542194696, + "learning_rate": 3.0654301906233896e-07, + "logits/chosen": -2.90625, + "logits/rejected": -2.828125, + "logps/chosen": -322.0, + "logps/rejected": -402.0, + "loss": 0.4728, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -1.453125, + "rewards/margins": 1.0390625, + "rewards/rejected": -2.484375, + "step": 1190 + }, + { + "epoch": 0.061847700038654814, + "grad_norm": 8.471592501976598, + "learning_rate": 3.0911901081916536e-07, + "logits/chosen": -2.8125, + "logits/rejected": -2.625, + "logps/chosen": -318.0, + "logps/rejected": -414.0, + "loss": 0.4738, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4453125, + "rewards/margins": 1.0078125, + "rewards/rejected": -2.453125, + "step": 1200 + }, + { + "epoch": 0.06236309753897694, + "grad_norm": 9.529488990965092, + "learning_rate": 3.116950025759917e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.78125, + "logps/chosen": -308.0, + "logps/rejected": -396.0, + "loss": 0.4611, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.390625, + "rewards/margins": 1.0859375, + "rewards/rejected": -2.484375, + "step": 1210 + }, + { + "epoch": 0.06287849503929906, + "grad_norm": 11.51659180653601, + "learning_rate": 3.142709943328181e-07, + "logits/chosen": -2.8125, + "logits/rejected": -2.640625, + "logps/chosen": -326.0, + "logps/rejected": -402.0, + "loss": 0.4439, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.4453125, + "rewards/margins": 0.98046875, + "rewards/rejected": -2.421875, + "step": 1220 + }, + { + "epoch": 0.06339389253962119, + "grad_norm": 9.639029104828266, + "learning_rate": 3.168469860896445e-07, + "logits/chosen": -2.859375, + "logits/rejected": -2.6875, + "logps/chosen": -368.0, + "logps/rejected": -488.0, + "loss": 0.4377, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.8671875, + "rewards/margins": 1.1015625, + "rewards/rejected": -2.96875, + "step": 1230 + }, + { + "epoch": 0.0639092900399433, + "grad_norm": 6.516108338788054, + "learning_rate": 3.1942297784647087e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.78125, + "logps/chosen": -368.0, + "logps/rejected": -458.0, + "loss": 0.4665, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.8984375, + "rewards/margins": 1.078125, + "rewards/rejected": -2.984375, + "step": 1240 + }, + { + "epoch": 0.06442468754026542, + "grad_norm": 9.185847430328039, + "learning_rate": 3.219989696032973e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.75, + "logps/chosen": -336.0, + "logps/rejected": -450.0, + "loss": 0.4606, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -1.625, + "rewards/margins": 1.046875, + "rewards/rejected": -2.671875, + "step": 1250 + }, + { + "epoch": 0.06494008504058756, + "grad_norm": 8.265917401103026, + "learning_rate": 3.245749613601236e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.703125, + "logps/chosen": -316.0, + "logps/rejected": -406.0, + "loss": 0.4615, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3359375, + "rewards/margins": 1.0859375, + "rewards/rejected": -2.421875, + "step": 1260 + }, + { + "epoch": 0.06545548254090967, + "grad_norm": 9.756400121946823, + "learning_rate": 3.2715095311695003e-07, + "logits/chosen": -2.875, + "logits/rejected": -2.546875, + "logps/chosen": -326.0, + "logps/rejected": -428.0, + "loss": 0.4426, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.5625, + "rewards/margins": 1.1015625, + "rewards/rejected": -2.671875, + "step": 1270 + }, + { + "epoch": 0.0659708800412318, + "grad_norm": 8.038018636880018, + "learning_rate": 3.297269448737764e-07, + "logits/chosen": -2.765625, + "logits/rejected": -2.703125, + "logps/chosen": -316.0, + "logps/rejected": -382.0, + "loss": 0.4799, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.5390625, + "rewards/margins": 0.82421875, + "rewards/rejected": -2.359375, + "step": 1280 + }, + { + "epoch": 0.06648627754155392, + "grad_norm": 11.415415587170264, + "learning_rate": 3.323029366306028e-07, + "logits/chosen": -2.671875, + "logits/rejected": -2.578125, + "logps/chosen": -342.0, + "logps/rejected": -444.0, + "loss": 0.4415, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.6171875, + "rewards/margins": 1.125, + "rewards/rejected": -2.734375, + "step": 1290 + }, + { + "epoch": 0.06700167504187604, + "grad_norm": 8.000071852589558, + "learning_rate": 3.3487892838742913e-07, + "logits/chosen": -2.8125, + "logits/rejected": -2.703125, + "logps/chosen": -340.0, + "logps/rejected": -406.0, + "loss": 0.4536, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4921875, + "rewards/margins": 0.98828125, + "rewards/rejected": -2.484375, + "step": 1300 + }, + { + "epoch": 0.06751707254219817, + "grad_norm": 8.959283026906567, + "learning_rate": 3.3745492014425554e-07, + "logits/chosen": -2.796875, + "logits/rejected": -2.625, + "logps/chosen": -384.0, + "logps/rejected": -426.0, + "loss": 0.5011, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.171875, + "rewards/margins": 0.65234375, + "rewards/rejected": -2.8125, + "step": 1310 + }, + { + "epoch": 0.06803247004252029, + "grad_norm": 9.639994660512421, + "learning_rate": 3.400309119010819e-07, + "logits/chosen": -2.765625, + "logits/rejected": -2.65625, + "logps/chosen": -322.0, + "logps/rejected": -382.0, + "loss": 0.458, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.4296875, + "rewards/margins": 0.91015625, + "rewards/rejected": -2.34375, + "step": 1320 + }, + { + "epoch": 0.06854786754284242, + "grad_norm": 13.065074546570425, + "learning_rate": 3.426069036579083e-07, + "logits/chosen": -2.890625, + "logits/rejected": -2.765625, + "logps/chosen": -338.0, + "logps/rejected": -442.0, + "loss": 0.443, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.71875, + "rewards/margins": 1.1875, + "rewards/rejected": -2.90625, + "step": 1330 + }, + { + "epoch": 0.06906326504316454, + "grad_norm": 9.097721735200238, + "learning_rate": 3.4518289541473464e-07, + "logits/chosen": -2.78125, + "logits/rejected": -2.796875, + "logps/chosen": -386.0, + "logps/rejected": -448.0, + "loss": 0.4585, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.96875, + "rewards/margins": 0.97265625, + "rewards/rejected": -2.9375, + "step": 1340 + }, + { + "epoch": 0.06957866254348666, + "grad_norm": 8.238936263627584, + "learning_rate": 3.4775888717156104e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.75, + "logps/chosen": -352.0, + "logps/rejected": -472.0, + "loss": 0.44, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.7890625, + "rewards/margins": 1.15625, + "rewards/rejected": -2.953125, + "step": 1350 + }, + { + "epoch": 0.07009406004380879, + "grad_norm": 8.796660056745345, + "learning_rate": 3.5033487892838745e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.640625, + "logps/chosen": -328.0, + "logps/rejected": -462.0, + "loss": 0.4485, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.6875, + "rewards/margins": 1.1484375, + "rewards/rejected": -2.84375, + "step": 1360 + }, + { + "epoch": 0.07060945754413091, + "grad_norm": 11.602275286633564, + "learning_rate": 3.529108706852138e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.859375, + "logps/chosen": -344.0, + "logps/rejected": -436.0, + "loss": 0.4502, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.71875, + "rewards/margins": 1.046875, + "rewards/rejected": -2.75, + "step": 1370 + }, + { + "epoch": 0.07112485504445304, + "grad_norm": 9.979526572079687, + "learning_rate": 3.554868624420402e-07, + "logits/chosen": -2.90625, + "logits/rejected": -2.75, + "logps/chosen": -320.0, + "logps/rejected": -430.0, + "loss": 0.426, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.640625, + "rewards/margins": 1.21875, + "rewards/rejected": -2.859375, + "step": 1380 + }, + { + "epoch": 0.07164025254477516, + "grad_norm": 11.708186167229037, + "learning_rate": 3.5806285419886655e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.765625, + "logps/chosen": -332.0, + "logps/rejected": -468.0, + "loss": 0.4515, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.59375, + "rewards/margins": 1.4921875, + "rewards/rejected": -3.09375, + "step": 1390 + }, + { + "epoch": 0.07215565004509727, + "grad_norm": 9.600673022645905, + "learning_rate": 3.6063884595569295e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.90625, + "logps/chosen": -324.0, + "logps/rejected": -394.0, + "loss": 0.4238, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4609375, + "rewards/margins": 0.9609375, + "rewards/rejected": -2.421875, + "step": 1400 + }, + { + "epoch": 0.0726710475454194, + "grad_norm": 15.088664033372726, + "learning_rate": 3.632148377125193e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.703125, + "logps/chosen": -386.0, + "logps/rejected": -508.0, + "loss": 0.4263, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.015625, + "rewards/margins": 1.5, + "rewards/rejected": -3.5, + "step": 1410 + }, + { + "epoch": 0.07318644504574152, + "grad_norm": 9.39135524220523, + "learning_rate": 3.657908294693457e-07, + "logits/chosen": -2.78125, + "logits/rejected": -2.6875, + "logps/chosen": -400.0, + "logps/rejected": -520.0, + "loss": 0.4354, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -2.265625, + "rewards/margins": 1.2578125, + "rewards/rejected": -3.53125, + "step": 1420 + }, + { + "epoch": 0.07370184254606366, + "grad_norm": 11.91645009363579, + "learning_rate": 3.6836682122617206e-07, + "logits/chosen": -2.75, + "logits/rejected": -2.5625, + "logps/chosen": -366.0, + "logps/rejected": -510.0, + "loss": 0.4419, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.9609375, + "rewards/margins": 1.484375, + "rewards/rejected": -3.4375, + "step": 1430 + }, + { + "epoch": 0.07421724004638577, + "grad_norm": 10.222566963972072, + "learning_rate": 3.7094281298299846e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.84375, + "logps/chosen": -380.0, + "logps/rejected": -492.0, + "loss": 0.4145, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.0625, + "rewards/margins": 1.3828125, + "rewards/rejected": -3.4375, + "step": 1440 + }, + { + "epoch": 0.07473263754670789, + "grad_norm": 9.75011088798615, + "learning_rate": 3.735188047398248e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.75, + "logps/chosen": -360.0, + "logps/rejected": -512.0, + "loss": 0.411, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -1.9609375, + "rewards/margins": 1.6015625, + "rewards/rejected": -3.5625, + "step": 1450 + }, + { + "epoch": 0.07524803504703002, + "grad_norm": 10.90048825603066, + "learning_rate": 3.760947964966512e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.890625, + "logps/chosen": -408.0, + "logps/rejected": -524.0, + "loss": 0.4292, + "rewards/accuracies": 0.78125, + "rewards/chosen": -2.25, + "rewards/margins": 1.265625, + "rewards/rejected": -3.515625, + "step": 1460 + }, + { + "epoch": 0.07576343254735214, + "grad_norm": 38.004660256894745, + "learning_rate": 3.786707882534776e-07, + "logits/chosen": -2.828125, + "logits/rejected": -2.609375, + "logps/chosen": -392.0, + "logps/rejected": -540.0, + "loss": 0.4125, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.390625, + "rewards/margins": 1.2421875, + "rewards/rejected": -3.640625, + "step": 1470 + }, + { + "epoch": 0.07627883004767427, + "grad_norm": 7.106678516180965, + "learning_rate": 3.8124678001030397e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.75, + "logps/chosen": -400.0, + "logps/rejected": -516.0, + "loss": 0.415, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.203125, + "rewards/margins": 1.21875, + "rewards/rejected": -3.421875, + "step": 1480 + }, + { + "epoch": 0.07679422754799639, + "grad_norm": 10.927575012381428, + "learning_rate": 3.8382277176713037e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.828125, + "logps/chosen": -424.0, + "logps/rejected": -552.0, + "loss": 0.4313, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.59375, + "rewards/margins": 1.40625, + "rewards/rejected": -3.984375, + "step": 1490 + }, + { + "epoch": 0.07730962504831851, + "grad_norm": 13.375610309164585, + "learning_rate": 3.863987635239567e-07, + "logits/chosen": -2.90625, + "logits/rejected": -2.734375, + "logps/chosen": -402.0, + "logps/rejected": -498.0, + "loss": 0.4166, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.1875, + "rewards/margins": 1.3046875, + "rewards/rejected": -3.5, + "step": 1500 + }, + { + "epoch": 0.07782502254864064, + "grad_norm": 10.486700936634765, + "learning_rate": 3.889747552807831e-07, + "logits/chosen": -2.8125, + "logits/rejected": -2.609375, + "logps/chosen": -410.0, + "logps/rejected": -540.0, + "loss": 0.4177, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -2.46875, + "rewards/margins": 1.2890625, + "rewards/rejected": -3.765625, + "step": 1510 + }, + { + "epoch": 0.07834042004896276, + "grad_norm": 12.084072484247844, + "learning_rate": 3.9155074703760947e-07, + "logits/chosen": -2.8125, + "logits/rejected": -2.65625, + "logps/chosen": -410.0, + "logps/rejected": -572.0, + "loss": 0.384, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.421875, + "rewards/margins": 1.7578125, + "rewards/rejected": -4.1875, + "step": 1520 + }, + { + "epoch": 0.07885581754928489, + "grad_norm": 8.049799716229789, + "learning_rate": 3.941267387944359e-07, + "logits/chosen": -2.84375, + "logits/rejected": -2.65625, + "logps/chosen": -426.0, + "logps/rejected": -572.0, + "loss": 0.423, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.515625, + "rewards/margins": 1.5078125, + "rewards/rejected": -4.03125, + "step": 1530 + }, + { + "epoch": 0.07937121504960701, + "grad_norm": 12.260306340742877, + "learning_rate": 3.967027305512622e-07, + "logits/chosen": -2.84375, + "logits/rejected": -2.625, + "logps/chosen": -408.0, + "logps/rejected": -564.0, + "loss": 0.4196, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.40625, + "rewards/margins": 1.59375, + "rewards/rejected": -4.0, + "step": 1540 + }, + { + "epoch": 0.07988661254992914, + "grad_norm": 10.946366750806083, + "learning_rate": 3.9927872230808863e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.765625, + "logps/chosen": -424.0, + "logps/rejected": -556.0, + "loss": 0.4139, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.203125, + "rewards/margins": 1.5390625, + "rewards/rejected": -3.75, + "step": 1550 + }, + { + "epoch": 0.08040201005025126, + "grad_norm": 10.399460078127245, + "learning_rate": 4.01854714064915e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.90625, + "logps/chosen": -452.0, + "logps/rejected": -624.0, + "loss": 0.4341, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.625, + "rewards/margins": 1.7421875, + "rewards/rejected": -4.375, + "step": 1560 + }, + { + "epoch": 0.08091740755057338, + "grad_norm": 13.047043538807717, + "learning_rate": 4.0443070582174133e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.765625, + "logps/chosen": -418.0, + "logps/rejected": -532.0, + "loss": 0.4063, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.21875, + "rewards/margins": 1.3984375, + "rewards/rejected": -3.625, + "step": 1570 + }, + { + "epoch": 0.08143280505089551, + "grad_norm": 11.558693317169963, + "learning_rate": 4.070066975785677e-07, + "logits/chosen": -2.78125, + "logits/rejected": -2.609375, + "logps/chosen": -374.0, + "logps/rejected": -504.0, + "loss": 0.4073, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.078125, + "rewards/margins": 1.484375, + "rewards/rejected": -3.546875, + "step": 1580 + }, + { + "epoch": 0.08194820255121762, + "grad_norm": 10.998085051434327, + "learning_rate": 4.095826893353941e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.78125, + "logps/chosen": -370.0, + "logps/rejected": -498.0, + "loss": 0.4149, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.8984375, + "rewards/margins": 1.5390625, + "rewards/rejected": -3.4375, + "step": 1590 + }, + { + "epoch": 0.08246360005153976, + "grad_norm": 9.146948931732108, + "learning_rate": 4.121586810922205e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.75, + "logps/chosen": -394.0, + "logps/rejected": -524.0, + "loss": 0.4108, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.015625, + "rewards/margins": 1.4765625, + "rewards/rejected": -3.484375, + "step": 1600 + }, + { + "epoch": 0.08297899755186187, + "grad_norm": 9.358895618688681, + "learning_rate": 4.1473467284904684e-07, + "logits/chosen": -2.890625, + "logits/rejected": -2.71875, + "logps/chosen": -416.0, + "logps/rejected": -576.0, + "loss": 0.4109, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.546875, + "rewards/margins": 1.5703125, + "rewards/rejected": -4.125, + "step": 1610 + }, + { + "epoch": 0.08349439505218399, + "grad_norm": 13.904331236390052, + "learning_rate": 4.1731066460587324e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.8125, + "logps/chosen": -380.0, + "logps/rejected": -536.0, + "loss": 0.4079, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.125, + "rewards/margins": 1.4296875, + "rewards/rejected": -3.5625, + "step": 1620 + }, + { + "epoch": 0.08400979255250612, + "grad_norm": 8.931819411577086, + "learning_rate": 4.198866563626996e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.828125, + "logps/chosen": -388.0, + "logps/rejected": -516.0, + "loss": 0.3868, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.9765625, + "rewards/margins": 1.5234375, + "rewards/rejected": -3.5, + "step": 1630 + }, + { + "epoch": 0.08452519005282824, + "grad_norm": 13.855178225099491, + "learning_rate": 4.22462648119526e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.71875, + "logps/chosen": -424.0, + "logps/rejected": -548.0, + "loss": 0.407, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.4375, + "rewards/margins": 1.46875, + "rewards/rejected": -3.90625, + "step": 1640 + }, + { + "epoch": 0.08504058755315037, + "grad_norm": 8.314498841212094, + "learning_rate": 4.2503863987635234e-07, + "logits/chosen": -2.84375, + "logits/rejected": -2.71875, + "logps/chosen": -390.0, + "logps/rejected": -494.0, + "loss": 0.3999, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.046875, + "rewards/margins": 1.3125, + "rewards/rejected": -3.375, + "step": 1650 + }, + { + "epoch": 0.08555598505347249, + "grad_norm": 10.037122170800954, + "learning_rate": 4.2761463163317875e-07, + "logits/chosen": -2.734375, + "logits/rejected": -2.5, + "logps/chosen": -378.0, + "logps/rejected": -512.0, + "loss": 0.4076, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.15625, + "rewards/margins": 1.34375, + "rewards/rejected": -3.5, + "step": 1660 + }, + { + "epoch": 0.08607138255379461, + "grad_norm": 9.334870568919381, + "learning_rate": 4.301906233900051e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.71875, + "logps/chosen": -412.0, + "logps/rejected": -592.0, + "loss": 0.3758, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.21875, + "rewards/margins": 1.8125, + "rewards/rejected": -4.03125, + "step": 1670 + }, + { + "epoch": 0.08658678005411674, + "grad_norm": 12.108052916985576, + "learning_rate": 4.327666151468315e-07, + "logits/chosen": -2.890625, + "logits/rejected": -2.65625, + "logps/chosen": -404.0, + "logps/rejected": -532.0, + "loss": 0.3974, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.421875, + "rewards/margins": 1.40625, + "rewards/rejected": -3.828125, + "step": 1680 + }, + { + "epoch": 0.08710217755443886, + "grad_norm": 8.736767487724562, + "learning_rate": 4.3534260690365785e-07, + "logits/chosen": -2.84375, + "logits/rejected": -2.640625, + "logps/chosen": -412.0, + "logps/rejected": -592.0, + "loss": 0.428, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.375, + "rewards/margins": 1.8359375, + "rewards/rejected": -4.21875, + "step": 1690 + }, + { + "epoch": 0.08761757505476099, + "grad_norm": 10.216371355778199, + "learning_rate": 4.3791859866048425e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.78125, + "logps/chosen": -350.0, + "logps/rejected": -492.0, + "loss": 0.397, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.890625, + "rewards/margins": 1.6015625, + "rewards/rejected": -3.5, + "step": 1700 + }, + { + "epoch": 0.08813297255508311, + "grad_norm": 11.954199786172309, + "learning_rate": 4.404945904173106e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.921875, + "logps/chosen": -440.0, + "logps/rejected": -572.0, + "loss": 0.4115, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -2.5, + "rewards/margins": 1.59375, + "rewards/rejected": -4.09375, + "step": 1710 + }, + { + "epoch": 0.08864837005540523, + "grad_norm": 9.658567065815928, + "learning_rate": 4.43070582174137e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.859375, + "logps/chosen": -386.0, + "logps/rejected": -492.0, + "loss": 0.4182, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.109375, + "rewards/margins": 1.234375, + "rewards/rejected": -3.34375, + "step": 1720 + }, + { + "epoch": 0.08916376755572736, + "grad_norm": 8.966708635735491, + "learning_rate": 4.456465739309634e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.734375, + "logps/chosen": -424.0, + "logps/rejected": -560.0, + "loss": 0.4255, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.546875, + "rewards/margins": 1.4921875, + "rewards/rejected": -4.03125, + "step": 1730 + }, + { + "epoch": 0.08967916505604948, + "grad_norm": 11.985988282104614, + "learning_rate": 4.4822256568778976e-07, + "logits/chosen": -2.84375, + "logits/rejected": -2.75, + "logps/chosen": -424.0, + "logps/rejected": -552.0, + "loss": 0.4004, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.40625, + "rewards/margins": 1.484375, + "rewards/rejected": -3.890625, + "step": 1740 + }, + { + "epoch": 0.09019456255637161, + "grad_norm": 9.551229894645127, + "learning_rate": 4.5079855744461616e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.734375, + "logps/chosen": -444.0, + "logps/rejected": -612.0, + "loss": 0.3742, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.609375, + "rewards/margins": 1.7734375, + "rewards/rejected": -4.375, + "step": 1750 + }, + { + "epoch": 0.09070996005669373, + "grad_norm": 12.840500637125716, + "learning_rate": 4.533745492014425e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.890625, + "logps/chosen": -408.0, + "logps/rejected": -540.0, + "loss": 0.3928, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.390625, + "rewards/margins": 1.3828125, + "rewards/rejected": -3.765625, + "step": 1760 + }, + { + "epoch": 0.09122535755701584, + "grad_norm": 7.907641706540803, + "learning_rate": 4.559505409582689e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.875, + "logps/chosen": -434.0, + "logps/rejected": -584.0, + "loss": 0.4111, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -2.625, + "rewards/margins": 1.6328125, + "rewards/rejected": -4.25, + "step": 1770 + }, + { + "epoch": 0.09174075505733797, + "grad_norm": 9.994059148042052, + "learning_rate": 4.5852653271509527e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.828125, + "logps/chosen": -414.0, + "logps/rejected": -612.0, + "loss": 0.3579, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.390625, + "rewards/margins": 2.046875, + "rewards/rejected": -4.4375, + "step": 1780 + }, + { + "epoch": 0.09225615255766009, + "grad_norm": 18.101886405023603, + "learning_rate": 4.6110252447192167e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.71875, + "logps/chosen": -504.0, + "logps/rejected": -672.0, + "loss": 0.4064, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.296875, + "rewards/margins": 1.6875, + "rewards/rejected": -4.96875, + "step": 1790 + }, + { + "epoch": 0.09277155005798222, + "grad_norm": 9.265366564086714, + "learning_rate": 4.63678516228748e-07, + "logits/chosen": -2.859375, + "logits/rejected": -2.578125, + "logps/chosen": -410.0, + "logps/rejected": -556.0, + "loss": 0.3899, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.328125, + "rewards/margins": 1.53125, + "rewards/rejected": -3.859375, + "step": 1800 + }, + { + "epoch": 0.09328694755830434, + "grad_norm": 10.386125277081572, + "learning_rate": 4.662545079855744e-07, + "logits/chosen": -2.734375, + "logits/rejected": -2.625, + "logps/chosen": -438.0, + "logps/rejected": -568.0, + "loss": 0.4157, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -2.6875, + "rewards/margins": 1.359375, + "rewards/rejected": -4.03125, + "step": 1810 + }, + { + "epoch": 0.09380234505862646, + "grad_norm": 8.58666887291785, + "learning_rate": 4.688304997424008e-07, + "logits/chosen": -2.828125, + "logits/rejected": -2.640625, + "logps/chosen": -430.0, + "logps/rejected": -564.0, + "loss": 0.4001, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.703125, + "rewards/margins": 1.390625, + "rewards/rejected": -4.09375, + "step": 1820 + }, + { + "epoch": 0.09431774255894859, + "grad_norm": 9.321954931037936, + "learning_rate": 4.714064914992272e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.65625, + "logps/chosen": -408.0, + "logps/rejected": -560.0, + "loss": 0.3849, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.359375, + "rewards/margins": 1.4765625, + "rewards/rejected": -3.828125, + "step": 1830 + }, + { + "epoch": 0.09483314005927071, + "grad_norm": 11.271517324806592, + "learning_rate": 4.739824832560536e-07, + "logits/chosen": -2.828125, + "logits/rejected": -2.75, + "logps/chosen": -450.0, + "logps/rejected": -588.0, + "loss": 0.4182, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8125, + "rewards/margins": 1.4296875, + "rewards/rejected": -4.25, + "step": 1840 + }, + { + "epoch": 0.09534853755959284, + "grad_norm": 8.782867890951488, + "learning_rate": 4.7655847501287993e-07, + "logits/chosen": -2.828125, + "logits/rejected": -2.671875, + "logps/chosen": -410.0, + "logps/rejected": -520.0, + "loss": 0.3704, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.1875, + "rewards/margins": 1.5546875, + "rewards/rejected": -3.734375, + "step": 1850 + }, + { + "epoch": 0.09586393505991496, + "grad_norm": 10.25036460256607, + "learning_rate": 4.791344667697063e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.78125, + "logps/chosen": -470.0, + "logps/rejected": -684.0, + "loss": 0.3828, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.890625, + "rewards/margins": 2.328125, + "rewards/rejected": -5.21875, + "step": 1860 + }, + { + "epoch": 0.09637933256023708, + "grad_norm": 8.71840900614672, + "learning_rate": 4.817104585265327e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.890625, + "logps/chosen": -424.0, + "logps/rejected": -552.0, + "loss": 0.3821, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.484375, + "rewards/margins": 1.421875, + "rewards/rejected": -3.90625, + "step": 1870 + }, + { + "epoch": 0.09689473006055921, + "grad_norm": 9.971445617522583, + "learning_rate": 4.84286450283359e-07, + "logits/chosen": -2.90625, + "logits/rejected": -2.796875, + "logps/chosen": -452.0, + "logps/rejected": -584.0, + "loss": 0.3803, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.65625, + "rewards/margins": 1.53125, + "rewards/rejected": -4.1875, + "step": 1880 + }, + { + "epoch": 0.09741012756088133, + "grad_norm": 9.322578647285217, + "learning_rate": 4.868624420401854e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.765625, + "logps/chosen": -406.0, + "logps/rejected": -576.0, + "loss": 0.3977, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.46875, + "rewards/margins": 1.65625, + "rewards/rejected": -4.125, + "step": 1890 + }, + { + "epoch": 0.09792552506120346, + "grad_norm": 10.282047263248899, + "learning_rate": 4.894384337970118e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.953125, + "logps/chosen": -430.0, + "logps/rejected": -612.0, + "loss": 0.3742, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.640625, + "rewards/margins": 1.8125, + "rewards/rejected": -4.4375, + "step": 1900 + }, + { + "epoch": 0.09844092256152558, + "grad_norm": 9.571376851769879, + "learning_rate": 4.920144255538382e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.921875, + "logps/chosen": -438.0, + "logps/rejected": -608.0, + "loss": 0.3959, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.6875, + "rewards/margins": 1.6875, + "rewards/rejected": -4.375, + "step": 1910 + }, + { + "epoch": 0.0989563200618477, + "grad_norm": 12.232291034773834, + "learning_rate": 4.945904173106645e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.828125, + "logps/chosen": -424.0, + "logps/rejected": -560.0, + "loss": 0.4133, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.578125, + "rewards/margins": 1.515625, + "rewards/rejected": -4.09375, + "step": 1920 + }, + { + "epoch": 0.09947171756216983, + "grad_norm": 9.031332263642849, + "learning_rate": 4.971664090674909e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.921875, + "logps/chosen": -410.0, + "logps/rejected": -560.0, + "loss": 0.3703, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -2.34375, + "rewards/margins": 1.6640625, + "rewards/rejected": -4.0, + "step": 1930 + }, + { + "epoch": 0.09998711506249194, + "grad_norm": 8.447207765397863, + "learning_rate": 4.997424008243173e-07, + "logits/chosen": -3.125, + "logits/rejected": -3.0, + "logps/chosen": -440.0, + "logps/rejected": -632.0, + "loss": 0.3799, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.703125, + "rewards/margins": 2.03125, + "rewards/rejected": -4.75, + "step": 1940 + }, + { + "epoch": 0.10050251256281408, + "grad_norm": 9.164110913472712, + "learning_rate": 4.999996722396175e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.921875, + "logps/chosen": -440.0, + "logps/rejected": -612.0, + "loss": 0.3671, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -2.625, + "rewards/margins": 1.921875, + "rewards/rejected": -4.5625, + "step": 1950 + }, + { + "epoch": 0.1010179100631362, + "grad_norm": 8.788251236604996, + "learning_rate": 4.99998539241868e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.96875, + "logps/chosen": -452.0, + "logps/rejected": -624.0, + "loss": 0.38, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.9375, + "rewards/margins": 1.734375, + "rewards/rejected": -4.65625, + "step": 1960 + }, + { + "epoch": 0.10153330756345831, + "grad_norm": 14.392784264668732, + "learning_rate": 4.999965969639936e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.90625, + "logps/chosen": -458.0, + "logps/rejected": -648.0, + "loss": 0.3595, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.734375, + "rewards/margins": 2.0625, + "rewards/rejected": -4.78125, + "step": 1970 + }, + { + "epoch": 0.10204870506378044, + "grad_norm": 8.719604452517874, + "learning_rate": 4.999938454122821e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.875, + "logps/chosen": -450.0, + "logps/rejected": -636.0, + "loss": 0.3611, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.75, + "rewards/margins": 1.984375, + "rewards/rejected": -4.75, + "step": 1980 + }, + { + "epoch": 0.10256410256410256, + "grad_norm": 8.054694565018183, + "learning_rate": 4.999902845956402e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.859375, + "logps/chosen": -432.0, + "logps/rejected": -624.0, + "loss": 0.3702, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.53125, + "rewards/margins": 2.203125, + "rewards/rejected": -4.71875, + "step": 1990 + }, + { + "epoch": 0.10307950006442469, + "grad_norm": 23.041514071431806, + "learning_rate": 4.99985914525595e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.84375, + "logps/chosen": -480.0, + "logps/rejected": -656.0, + "loss": 0.3917, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.0625, + "rewards/margins": 1.78125, + "rewards/rejected": -4.84375, + "step": 2000 + }, + { + "epoch": 0.10359489756474681, + "grad_norm": 8.900156111728577, + "learning_rate": 4.99980735216293e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.890625, + "logps/chosen": -422.0, + "logps/rejected": -584.0, + "loss": 0.3793, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.4375, + "rewards/margins": 1.6015625, + "rewards/rejected": -4.03125, + "step": 2010 + }, + { + "epoch": 0.10411029506506893, + "grad_norm": 10.227450951094895, + "learning_rate": 4.999747466845003e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.90625, + "logps/chosen": -460.0, + "logps/rejected": -584.0, + "loss": 0.377, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -2.75, + "rewards/margins": 1.4375, + "rewards/rejected": -4.1875, + "step": 2020 + }, + { + "epoch": 0.10462569256539106, + "grad_norm": 11.580955643551807, + "learning_rate": 4.999679489496028e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.875, + "logps/chosen": -496.0, + "logps/rejected": -644.0, + "loss": 0.3814, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.09375, + "rewards/margins": 1.8125, + "rewards/rejected": -4.90625, + "step": 2030 + }, + { + "epoch": 0.10514109006571318, + "grad_norm": 15.631379419509667, + "learning_rate": 4.999603420336054e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.8125, + "logps/chosen": -474.0, + "logps/rejected": -640.0, + "loss": 0.3892, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.921875, + "rewards/margins": 1.6171875, + "rewards/rejected": -4.53125, + "step": 2040 + }, + { + "epoch": 0.10565648756603531, + "grad_norm": 14.872015340591947, + "learning_rate": 4.99951925961133e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.734375, + "logps/chosen": -516.0, + "logps/rejected": -672.0, + "loss": 0.3935, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.34375, + "rewards/margins": 1.84375, + "rewards/rejected": -5.1875, + "step": 2050 + }, + { + "epoch": 0.10617188506635743, + "grad_norm": 9.79565242275787, + "learning_rate": 4.999427007594295e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.859375, + "logps/chosen": -470.0, + "logps/rejected": -660.0, + "loss": 0.3679, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.09375, + "rewards/margins": 1.7734375, + "rewards/rejected": -4.875, + "step": 2060 + }, + { + "epoch": 0.10668728256667954, + "grad_norm": 8.479750902541308, + "learning_rate": 4.999326664583582e-07, + "logits/chosen": -3.1875, + "logits/rejected": -3.109375, + "logps/chosen": -516.0, + "logps/rejected": -624.0, + "loss": 0.3805, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.234375, + "rewards/margins": 1.375, + "rewards/rejected": -4.625, + "step": 2070 + }, + { + "epoch": 0.10720268006700168, + "grad_norm": 9.67024138029288, + "learning_rate": 4.999218230904015e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.78125, + "logps/chosen": -426.0, + "logps/rejected": -596.0, + "loss": 0.3721, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -2.515625, + "rewards/margins": 1.8984375, + "rewards/rejected": -4.40625, + "step": 2080 + }, + { + "epoch": 0.1077180775673238, + "grad_norm": 12.636155321330268, + "learning_rate": 4.999101706906608e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.9375, + "logps/chosen": -480.0, + "logps/rejected": -668.0, + "loss": 0.3628, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.078125, + "rewards/margins": 2.109375, + "rewards/rejected": -5.1875, + "step": 2090 + }, + { + "epoch": 0.10823347506764593, + "grad_norm": 7.8175680165192425, + "learning_rate": 4.998977092968569e-07, + "logits/chosen": -2.84375, + "logits/rejected": -2.546875, + "logps/chosen": -516.0, + "logps/rejected": -676.0, + "loss": 0.3904, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.15625, + "rewards/margins": 1.8515625, + "rewards/rejected": -5.0, + "step": 2100 + }, + { + "epoch": 0.10874887256796804, + "grad_norm": 10.580767541989763, + "learning_rate": 4.998844389493286e-07, + "logits/chosen": -2.90625, + "logits/rejected": -2.734375, + "logps/chosen": -470.0, + "logps/rejected": -640.0, + "loss": 0.3637, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.140625, + "rewards/margins": 1.7265625, + "rewards/rejected": -4.875, + "step": 2110 + }, + { + "epoch": 0.10926427006829016, + "grad_norm": 11.535820377681725, + "learning_rate": 4.998703596910342e-07, + "logits/chosen": -2.703125, + "logits/rejected": -2.453125, + "logps/chosen": -502.0, + "logps/rejected": -652.0, + "loss": 0.3768, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.40625, + "rewards/margins": 1.6953125, + "rewards/rejected": -5.09375, + "step": 2120 + }, + { + "epoch": 0.1097796675686123, + "grad_norm": 21.89694557462376, + "learning_rate": 4.9985547156755e-07, + "logits/chosen": -2.8125, + "logits/rejected": -2.625, + "logps/chosen": -490.0, + "logps/rejected": -644.0, + "loss": 0.3619, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.140625, + "rewards/margins": 1.6953125, + "rewards/rejected": -4.84375, + "step": 2130 + }, + { + "epoch": 0.11029506506893441, + "grad_norm": 10.970558215192725, + "learning_rate": 4.998397746270711e-07, + "logits/chosen": -2.875, + "logits/rejected": -2.609375, + "logps/chosen": -500.0, + "logps/rejected": -676.0, + "loss": 0.3592, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.296875, + "rewards/margins": 1.921875, + "rewards/rejected": -5.21875, + "step": 2140 + }, + { + "epoch": 0.11081046256925654, + "grad_norm": 10.415887423236942, + "learning_rate": 4.998232689204107e-07, + "logits/chosen": -2.890625, + "logits/rejected": -2.796875, + "logps/chosen": -484.0, + "logps/rejected": -636.0, + "loss": 0.3771, + "rewards/accuracies": 0.78125, + "rewards/chosen": -3.109375, + "rewards/margins": 1.7734375, + "rewards/rejected": -4.875, + "step": 2150 + }, + { + "epoch": 0.11132586006957866, + "grad_norm": 8.75348696639557, + "learning_rate": 4.998059545009998e-07, + "logits/chosen": -2.84375, + "logits/rejected": -2.625, + "logps/chosen": -486.0, + "logps/rejected": -704.0, + "loss": 0.3522, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -3.203125, + "rewards/margins": 2.140625, + "rewards/rejected": -5.34375, + "step": 2160 + }, + { + "epoch": 0.11184125756990079, + "grad_norm": 11.977637947931024, + "learning_rate": 4.99787831424888e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.875, + "logps/chosen": -488.0, + "logps/rejected": -668.0, + "loss": 0.3836, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.03125, + "rewards/margins": 1.9140625, + "rewards/rejected": -4.96875, + "step": 2170 + }, + { + "epoch": 0.11235665507022291, + "grad_norm": 9.421141235715226, + "learning_rate": 4.997688997507418e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.828125, + "logps/chosen": -464.0, + "logps/rejected": -620.0, + "loss": 0.3522, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.984375, + "rewards/margins": 1.65625, + "rewards/rejected": -4.625, + "step": 2180 + }, + { + "epoch": 0.11287205257054503, + "grad_norm": 10.218796479696952, + "learning_rate": 4.99749159539846e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.875, + "logps/chosen": -474.0, + "logps/rejected": -644.0, + "loss": 0.365, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.0, + "rewards/margins": 1.9140625, + "rewards/rejected": -4.90625, + "step": 2190 + }, + { + "epoch": 0.11338745007086716, + "grad_norm": 12.823209887503944, + "learning_rate": 4.997286108561023e-07, + "logits/chosen": -2.859375, + "logits/rejected": -2.703125, + "logps/chosen": -480.0, + "logps/rejected": -628.0, + "loss": 0.3708, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.09375, + "rewards/margins": 1.65625, + "rewards/rejected": -4.75, + "step": 2200 + }, + { + "epoch": 0.11390284757118928, + "grad_norm": 12.7580248567788, + "learning_rate": 4.997072537660296e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.828125, + "logps/chosen": -468.0, + "logps/rejected": -644.0, + "loss": 0.3394, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -2.90625, + "rewards/margins": 1.9140625, + "rewards/rejected": -4.8125, + "step": 2210 + }, + { + "epoch": 0.11441824507151141, + "grad_norm": 11.950064326896262, + "learning_rate": 4.996850883387639e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.859375, + "logps/chosen": -500.0, + "logps/rejected": -728.0, + "loss": 0.3536, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.359375, + "rewards/margins": 2.15625, + "rewards/rejected": -5.5, + "step": 2220 + }, + { + "epoch": 0.11493364257183353, + "grad_norm": 8.092109662170868, + "learning_rate": 4.996621146460576e-07, + "logits/chosen": -2.890625, + "logits/rejected": -2.609375, + "logps/chosen": -458.0, + "logps/rejected": -632.0, + "loss": 0.3712, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9375, + "rewards/margins": 1.890625, + "rewards/rejected": -4.84375, + "step": 2230 + }, + { + "epoch": 0.11544904007215565, + "grad_norm": 14.330971141416589, + "learning_rate": 4.996383327622799e-07, + "logits/chosen": -2.859375, + "logits/rejected": -2.578125, + "logps/chosen": -466.0, + "logps/rejected": -624.0, + "loss": 0.3604, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.046875, + "rewards/margins": 1.765625, + "rewards/rejected": -4.8125, + "step": 2240 + }, + { + "epoch": 0.11596443757247778, + "grad_norm": 11.815565866126455, + "learning_rate": 4.996137427644159e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.890625, + "logps/chosen": -496.0, + "logps/rejected": -660.0, + "loss": 0.3545, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -3.09375, + "rewards/margins": 1.9765625, + "rewards/rejected": -5.0625, + "step": 2250 + }, + { + "epoch": 0.1164798350727999, + "grad_norm": 9.023548448935102, + "learning_rate": 4.995883447320671e-07, + "logits/chosen": -3.0625, + "logits/rejected": -3.03125, + "logps/chosen": -428.0, + "logps/rejected": -568.0, + "loss": 0.3479, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.609375, + "rewards/margins": 1.6640625, + "rewards/rejected": -4.28125, + "step": 2260 + }, + { + "epoch": 0.11699523257312203, + "grad_norm": 13.532195694814154, + "learning_rate": 4.995621387474504e-07, + "logits/chosen": -2.859375, + "logits/rejected": -2.640625, + "logps/chosen": -520.0, + "logps/rejected": -768.0, + "loss": 0.3622, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.359375, + "rewards/margins": 2.515625, + "rewards/rejected": -5.875, + "step": 2270 + }, + { + "epoch": 0.11751063007344414, + "grad_norm": 11.686154309045065, + "learning_rate": 4.995351248953981e-07, + "logits/chosen": -2.703125, + "logits/rejected": -2.546875, + "logps/chosen": -484.0, + "logps/rejected": -684.0, + "loss": 0.3518, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.03125, + "rewards/margins": 2.140625, + "rewards/rejected": -5.1875, + "step": 2280 + }, + { + "epoch": 0.11802602757376626, + "grad_norm": 9.963888427476531, + "learning_rate": 4.995073032633578e-07, + "logits/chosen": -2.828125, + "logits/rejected": -2.65625, + "logps/chosen": -502.0, + "logps/rejected": -668.0, + "loss": 0.3758, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.25, + "rewards/margins": 1.71875, + "rewards/rejected": -4.96875, + "step": 2290 + }, + { + "epoch": 0.1185414250740884, + "grad_norm": 9.658635177798915, + "learning_rate": 4.994786739413922e-07, + "logits/chosen": -2.828125, + "logits/rejected": -2.53125, + "logps/chosen": -536.0, + "logps/rejected": -724.0, + "loss": 0.3697, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.625, + "rewards/margins": 1.96875, + "rewards/rejected": -5.59375, + "step": 2300 + }, + { + "epoch": 0.11905682257441051, + "grad_norm": 8.575154235449407, + "learning_rate": 4.994492370221782e-07, + "logits/chosen": -2.765625, + "logits/rejected": -2.578125, + "logps/chosen": -540.0, + "logps/rejected": -732.0, + "loss": 0.3274, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.578125, + "rewards/margins": 2.046875, + "rewards/rejected": -5.625, + "step": 2310 + }, + { + "epoch": 0.11957222007473264, + "grad_norm": 12.058444943089095, + "learning_rate": 4.994189926010073e-07, + "logits/chosen": -2.765625, + "logits/rejected": -2.71875, + "logps/chosen": -490.0, + "logps/rejected": -644.0, + "loss": 0.3619, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.015625, + "rewards/margins": 1.734375, + "rewards/rejected": -4.75, + "step": 2320 + }, + { + "epoch": 0.12008761757505476, + "grad_norm": 10.534196685913836, + "learning_rate": 4.993879407757849e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.75, + "logps/chosen": -512.0, + "logps/rejected": -680.0, + "loss": 0.3593, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.25, + "rewards/margins": 1.8203125, + "rewards/rejected": -5.0625, + "step": 2330 + }, + { + "epoch": 0.12060301507537688, + "grad_norm": 7.3918651196897525, + "learning_rate": 4.993560816470301e-07, + "logits/chosen": -2.859375, + "logits/rejected": -2.8125, + "logps/chosen": -516.0, + "logps/rejected": -672.0, + "loss": 0.3498, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -3.4375, + "rewards/margins": 1.7890625, + "rewards/rejected": -5.21875, + "step": 2340 + }, + { + "epoch": 0.12111841257569901, + "grad_norm": 9.707850918155094, + "learning_rate": 4.993234153178753e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.875, + "logps/chosen": -464.0, + "logps/rejected": -624.0, + "loss": 0.3734, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -2.96875, + "rewards/margins": 1.515625, + "rewards/rejected": -4.46875, + "step": 2350 + }, + { + "epoch": 0.12163381007602113, + "grad_norm": 7.319608024965551, + "learning_rate": 4.99289941894066e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.875, + "logps/chosen": -512.0, + "logps/rejected": -712.0, + "loss": 0.3461, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.328125, + "rewards/margins": 1.90625, + "rewards/rejected": -5.21875, + "step": 2360 + }, + { + "epoch": 0.12214920757634326, + "grad_norm": 10.45762899715267, + "learning_rate": 4.992556614839603e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.90625, + "logps/chosen": -468.0, + "logps/rejected": -620.0, + "loss": 0.36, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.921875, + "rewards/margins": 1.765625, + "rewards/rejected": -4.6875, + "step": 2370 + }, + { + "epoch": 0.12266460507666538, + "grad_norm": 11.331700383166977, + "learning_rate": 4.992205741985288e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.78125, + "logps/chosen": -516.0, + "logps/rejected": -700.0, + "loss": 0.3439, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.21875, + "rewards/margins": 2.140625, + "rewards/rejected": -5.34375, + "step": 2380 + }, + { + "epoch": 0.1231800025769875, + "grad_norm": 10.525525288538287, + "learning_rate": 4.991846801513538e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.78125, + "logps/chosen": -512.0, + "logps/rejected": -684.0, + "loss": 0.3543, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.046875, + "rewards/margins": 2.171875, + "rewards/rejected": -5.21875, + "step": 2390 + }, + { + "epoch": 0.12369540007730963, + "grad_norm": 9.304376409497051, + "learning_rate": 4.991479794586294e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.6875, + "logps/chosen": -536.0, + "logps/rejected": -728.0, + "loss": 0.3323, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -3.5625, + "rewards/margins": 1.9453125, + "rewards/rejected": -5.5, + "step": 2400 + }, + { + "epoch": 0.12421079757763175, + "grad_norm": 10.837348319831435, + "learning_rate": 4.991104722391608e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.71875, + "logps/chosen": -516.0, + "logps/rejected": -748.0, + "loss": 0.3386, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -3.46875, + "rewards/margins": 2.28125, + "rewards/rejected": -5.75, + "step": 2410 + }, + { + "epoch": 0.12472619507795388, + "grad_norm": 12.3487504100721, + "learning_rate": 4.99072158614364e-07, + "logits/chosen": -2.859375, + "logits/rejected": -2.71875, + "logps/chosen": -520.0, + "logps/rejected": -704.0, + "loss": 0.3321, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.609375, + "rewards/margins": 1.875, + "rewards/rejected": -5.5, + "step": 2420 + }, + { + "epoch": 0.125241592578276, + "grad_norm": 8.443882073288608, + "learning_rate": 4.990330387082659e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.734375, + "logps/chosen": -516.0, + "logps/rejected": -684.0, + "loss": 0.3452, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -3.4375, + "rewards/margins": 1.703125, + "rewards/rejected": -5.125, + "step": 2430 + }, + { + "epoch": 0.1257569900785981, + "grad_norm": 10.263729352767859, + "learning_rate": 4.989931126475027e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.8125, + "logps/chosen": -502.0, + "logps/rejected": -660.0, + "loss": 0.3592, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.140625, + "rewards/margins": 1.75, + "rewards/rejected": -4.875, + "step": 2440 + }, + { + "epoch": 0.12627238757892023, + "grad_norm": 13.955291279427067, + "learning_rate": 4.989523805613209e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.8125, + "logps/chosen": -528.0, + "logps/rejected": -760.0, + "loss": 0.3396, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.421875, + "rewards/margins": 2.328125, + "rewards/rejected": -5.75, + "step": 2450 + }, + { + "epoch": 0.12678778507924238, + "grad_norm": 19.631289080736487, + "learning_rate": 4.989108425815756e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.75, + "logps/chosen": -532.0, + "logps/rejected": -724.0, + "loss": 0.3492, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.546875, + "rewards/margins": 2.0, + "rewards/rejected": -5.5625, + "step": 2460 + }, + { + "epoch": 0.1273031825795645, + "grad_norm": 7.240765945123882, + "learning_rate": 4.988684988427314e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.59375, + "logps/chosen": -506.0, + "logps/rejected": -728.0, + "loss": 0.3212, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -3.265625, + "rewards/margins": 2.3125, + "rewards/rejected": -5.5625, + "step": 2470 + }, + { + "epoch": 0.1278185800798866, + "grad_norm": 12.549303347242173, + "learning_rate": 4.988253494818608e-07, + "logits/chosen": -2.8125, + "logits/rejected": -2.578125, + "logps/chosen": -540.0, + "logps/rejected": -780.0, + "loss": 0.3671, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.78125, + "rewards/margins": 2.34375, + "rewards/rejected": -6.125, + "step": 2480 + }, + { + "epoch": 0.12833397758020873, + "grad_norm": 11.067298545942405, + "learning_rate": 4.987813946386442e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.765625, + "logps/chosen": -476.0, + "logps/rejected": -688.0, + "loss": 0.3523, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.046875, + "rewards/margins": 2.25, + "rewards/rejected": -5.28125, + "step": 2490 + }, + { + "epoch": 0.12884937508053085, + "grad_norm": 10.349867205427714, + "learning_rate": 4.987366344553696e-07, + "logits/chosen": -2.90625, + "logits/rejected": -2.71875, + "logps/chosen": -536.0, + "logps/rejected": -724.0, + "loss": 0.3463, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.734375, + "rewards/margins": 2.15625, + "rewards/rejected": -5.90625, + "step": 2500 + }, + { + "epoch": 0.129364772580853, + "grad_norm": 9.570398709546632, + "learning_rate": 4.986910690769319e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.78125, + "logps/chosen": -520.0, + "logps/rejected": -732.0, + "loss": 0.3238, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -3.546875, + "rewards/margins": 2.171875, + "rewards/rejected": -5.71875, + "step": 2510 + }, + { + "epoch": 0.1298801700811751, + "grad_norm": 8.720346720733145, + "learning_rate": 4.986446986508326e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.875, + "logps/chosen": -520.0, + "logps/rejected": -716.0, + "loss": 0.3518, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -3.53125, + "rewards/margins": 2.109375, + "rewards/rejected": -5.625, + "step": 2520 + }, + { + "epoch": 0.13039556758149723, + "grad_norm": 9.084907437122686, + "learning_rate": 4.985975233271794e-07, + "logits/chosen": -2.859375, + "logits/rejected": -2.765625, + "logps/chosen": -536.0, + "logps/rejected": -764.0, + "loss": 0.3352, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.65625, + "rewards/margins": 2.359375, + "rewards/rejected": -6.03125, + "step": 2530 + }, + { + "epoch": 0.13091096508181935, + "grad_norm": 10.404793489161777, + "learning_rate": 4.985495432586851e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.703125, + "logps/chosen": -512.0, + "logps/rejected": -784.0, + "loss": 0.3168, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.46875, + "rewards/margins": 2.609375, + "rewards/rejected": -6.09375, + "step": 2540 + }, + { + "epoch": 0.13142636258214146, + "grad_norm": 9.002646054280127, + "learning_rate": 4.985007586006682e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.953125, + "logps/chosen": -460.0, + "logps/rejected": -684.0, + "loss": 0.3495, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -2.96875, + "rewards/margins": 2.140625, + "rewards/rejected": -5.125, + "step": 2550 + }, + { + "epoch": 0.1319417600824636, + "grad_norm": 8.6997351280305, + "learning_rate": 4.984511695110512e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.875, + "logps/chosen": -474.0, + "logps/rejected": -712.0, + "loss": 0.3353, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.03125, + "rewards/margins": 2.234375, + "rewards/rejected": -5.25, + "step": 2560 + }, + { + "epoch": 0.13245715758278573, + "grad_norm": 10.25244748698182, + "learning_rate": 4.98400776150361e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.859375, + "logps/chosen": -540.0, + "logps/rejected": -740.0, + "loss": 0.3398, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.546875, + "rewards/margins": 2.1875, + "rewards/rejected": -5.71875, + "step": 2570 + }, + { + "epoch": 0.13297255508310785, + "grad_norm": 9.358787572331396, + "learning_rate": 4.983495786817278e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.78125, + "logps/chosen": -494.0, + "logps/rejected": -680.0, + "loss": 0.3364, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.296875, + "rewards/margins": 1.8359375, + "rewards/rejected": -5.125, + "step": 2580 + }, + { + "epoch": 0.13348795258342996, + "grad_norm": 9.860095720349676, + "learning_rate": 4.982975772708852e-07, + "logits/chosen": -3.1875, + "logits/rejected": -3.0625, + "logps/chosen": -544.0, + "logps/rejected": -716.0, + "loss": 0.3564, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.578125, + "rewards/margins": 1.9375, + "rewards/rejected": -5.5, + "step": 2590 + }, + { + "epoch": 0.13400335008375208, + "grad_norm": 10.392806164168467, + "learning_rate": 4.982447720861688e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.71875, + "logps/chosen": -524.0, + "logps/rejected": -736.0, + "loss": 0.3187, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.53125, + "rewards/margins": 2.359375, + "rewards/rejected": -5.90625, + "step": 2600 + }, + { + "epoch": 0.13451874758407423, + "grad_norm": 8.90680299319631, + "learning_rate": 4.981911632985164e-07, + "logits/chosen": -2.890625, + "logits/rejected": -2.8125, + "logps/chosen": -544.0, + "logps/rejected": -736.0, + "loss": 0.332, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.546875, + "rewards/margins": 2.21875, + "rewards/rejected": -5.75, + "step": 2610 + }, + { + "epoch": 0.13503414508439635, + "grad_norm": 8.450345298124095, + "learning_rate": 4.981367510814672e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.875, + "logps/chosen": -512.0, + "logps/rejected": -676.0, + "loss": 0.3646, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.265625, + "rewards/margins": 1.765625, + "rewards/rejected": -5.03125, + "step": 2620 + }, + { + "epoch": 0.13554954258471846, + "grad_norm": 8.03036745735481, + "learning_rate": 4.980815356111609e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.71875, + "logps/chosen": -490.0, + "logps/rejected": -696.0, + "loss": 0.3528, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.171875, + "rewards/margins": 2.1875, + "rewards/rejected": -5.34375, + "step": 2630 + }, + { + "epoch": 0.13606494008504058, + "grad_norm": 8.264503079429826, + "learning_rate": 4.98025517066338e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.8125, + "logps/chosen": -520.0, + "logps/rejected": -692.0, + "loss": 0.3341, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -3.3125, + "rewards/margins": 1.8515625, + "rewards/rejected": -5.15625, + "step": 2640 + }, + { + "epoch": 0.1365803375853627, + "grad_norm": 20.845322270565035, + "learning_rate": 4.979686956283381e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.828125, + "logps/chosen": -584.0, + "logps/rejected": -772.0, + "loss": 0.3389, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -4.0, + "rewards/margins": 1.984375, + "rewards/rejected": -6.0, + "step": 2650 + }, + { + "epoch": 0.13709573508568484, + "grad_norm": 10.3817871121152, + "learning_rate": 4.979110714811002e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.609375, + "logps/chosen": -512.0, + "logps/rejected": -752.0, + "loss": 0.3328, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.515625, + "rewards/margins": 2.4375, + "rewards/rejected": -5.96875, + "step": 2660 + }, + { + "epoch": 0.13761113258600696, + "grad_norm": 8.823283826491169, + "learning_rate": 4.978526448111616e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.734375, + "logps/chosen": -506.0, + "logps/rejected": -740.0, + "loss": 0.3317, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.21875, + "rewards/margins": 2.515625, + "rewards/rejected": -5.75, + "step": 2670 + }, + { + "epoch": 0.13812653008632908, + "grad_norm": 10.146931992425573, + "learning_rate": 4.977934158076578e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.703125, + "logps/chosen": -528.0, + "logps/rejected": -724.0, + "loss": 0.3136, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -3.46875, + "rewards/margins": 2.25, + "rewards/rejected": -5.71875, + "step": 2680 + }, + { + "epoch": 0.1386419275866512, + "grad_norm": 10.764707708666965, + "learning_rate": 4.977333846623212e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.734375, + "logps/chosen": -564.0, + "logps/rejected": -816.0, + "loss": 0.3621, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.984375, + "rewards/margins": 2.578125, + "rewards/rejected": -6.5625, + "step": 2690 + }, + { + "epoch": 0.13915732508697332, + "grad_norm": 8.535348568103249, + "learning_rate": 4.97672551569481e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.75, + "logps/chosen": -498.0, + "logps/rejected": -720.0, + "loss": 0.3247, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.234375, + "rewards/margins": 2.21875, + "rewards/rejected": -5.46875, + "step": 2700 + }, + { + "epoch": 0.13967272258729546, + "grad_norm": 9.766392274688236, + "learning_rate": 4.976109167260624e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.859375, + "logps/chosen": -528.0, + "logps/rejected": -748.0, + "loss": 0.354, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.46875, + "rewards/margins": 2.3125, + "rewards/rejected": -5.78125, + "step": 2710 + }, + { + "epoch": 0.14018812008761758, + "grad_norm": 9.7312374983141, + "learning_rate": 4.975484803315859e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.859375, + "logps/chosen": -604.0, + "logps/rejected": -788.0, + "loss": 0.3481, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.15625, + "rewards/margins": 1.9921875, + "rewards/rejected": -6.15625, + "step": 2720 + }, + { + "epoch": 0.1407035175879397, + "grad_norm": 10.558656068635253, + "learning_rate": 4.974852425881669e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.9375, + "logps/chosen": -510.0, + "logps/rejected": -768.0, + "loss": 0.3331, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.328125, + "rewards/margins": 2.546875, + "rewards/rejected": -5.875, + "step": 2730 + }, + { + "epoch": 0.14121891508826181, + "grad_norm": 8.445195252334234, + "learning_rate": 4.974212037005149e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.90625, + "logps/chosen": -576.0, + "logps/rejected": -784.0, + "loss": 0.3272, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.84375, + "rewards/margins": 2.3125, + "rewards/rejected": -6.15625, + "step": 2740 + }, + { + "epoch": 0.14173431258858393, + "grad_norm": 8.30772436276008, + "learning_rate": 4.973563638759325e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.953125, + "logps/chosen": -486.0, + "logps/rejected": -684.0, + "loss": 0.3281, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.15625, + "rewards/margins": 2.140625, + "rewards/rejected": -5.28125, + "step": 2750 + }, + { + "epoch": 0.14224971008890608, + "grad_norm": 8.109749766505562, + "learning_rate": 4.972907233243154e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.953125, + "logps/chosen": -552.0, + "logps/rejected": -760.0, + "loss": 0.3314, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -3.625, + "rewards/margins": 2.421875, + "rewards/rejected": -6.0625, + "step": 2760 + }, + { + "epoch": 0.1427651075892282, + "grad_norm": 8.030026459014152, + "learning_rate": 4.97224282258151e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.734375, + "logps/chosen": -592.0, + "logps/rejected": -796.0, + "loss": 0.3303, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.21875, + "rewards/margins": 2.171875, + "rewards/rejected": -6.40625, + "step": 2770 + }, + { + "epoch": 0.14328050508955031, + "grad_norm": 8.831604734359994, + "learning_rate": 4.971570408925186e-07, + "logits/chosen": -2.890625, + "logits/rejected": -2.71875, + "logps/chosen": -528.0, + "logps/rejected": -740.0, + "loss": 0.3111, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.46875, + "rewards/margins": 2.3125, + "rewards/rejected": -5.78125, + "step": 2780 + }, + { + "epoch": 0.14379590258987243, + "grad_norm": 13.474031701339474, + "learning_rate": 4.970889994450877e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.890625, + "logps/chosen": -568.0, + "logps/rejected": -768.0, + "loss": 0.3241, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.0, + "rewards/margins": 2.15625, + "rewards/rejected": -6.15625, + "step": 2790 + }, + { + "epoch": 0.14431130009019455, + "grad_norm": 8.82569264471602, + "learning_rate": 4.97020158136118e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.875, + "logps/chosen": -540.0, + "logps/rejected": -752.0, + "loss": 0.3399, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.734375, + "rewards/margins": 2.125, + "rewards/rejected": -5.84375, + "step": 2800 + }, + { + "epoch": 0.1448266975905167, + "grad_norm": 9.528445723847849, + "learning_rate": 4.969505171884584e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.96875, + "logps/chosen": -536.0, + "logps/rejected": -736.0, + "loss": 0.3254, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.53125, + "rewards/margins": 2.078125, + "rewards/rejected": -5.59375, + "step": 2810 + }, + { + "epoch": 0.1453420950908388, + "grad_norm": 10.647341675837259, + "learning_rate": 4.968800768275463e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.9375, + "logps/chosen": -540.0, + "logps/rejected": -744.0, + "loss": 0.3539, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.609375, + "rewards/margins": 2.125, + "rewards/rejected": -5.71875, + "step": 2820 + }, + { + "epoch": 0.14585749259116093, + "grad_norm": 27.4541369555074, + "learning_rate": 4.968088372814071e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.921875, + "logps/chosen": -584.0, + "logps/rejected": -808.0, + "loss": 0.3219, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.03125, + "rewards/margins": 2.375, + "rewards/rejected": -6.40625, + "step": 2830 + }, + { + "epoch": 0.14637289009148305, + "grad_norm": 9.079742358132215, + "learning_rate": 4.96736798780653e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.859375, + "logps/chosen": -516.0, + "logps/rejected": -724.0, + "loss": 0.3155, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.46875, + "rewards/margins": 2.375, + "rewards/rejected": -5.84375, + "step": 2840 + }, + { + "epoch": 0.14688828759180517, + "grad_norm": 11.068354156823748, + "learning_rate": 4.966639615584828e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.953125, + "logps/chosen": -508.0, + "logps/rejected": -708.0, + "loss": 0.3455, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -3.578125, + "rewards/margins": 1.84375, + "rewards/rejected": -5.40625, + "step": 2850 + }, + { + "epoch": 0.1474036850921273, + "grad_norm": 12.132869070390665, + "learning_rate": 4.965903258506806e-07, + "logits/chosen": -3.09375, + "logits/rejected": -3.03125, + "logps/chosen": -576.0, + "logps/rejected": -764.0, + "loss": 0.3187, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.984375, + "rewards/margins": 2.046875, + "rewards/rejected": -6.03125, + "step": 2860 + }, + { + "epoch": 0.14791908259244943, + "grad_norm": 8.085238974506842, + "learning_rate": 4.965158918956154e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.984375, + "logps/chosen": -592.0, + "logps/rejected": -804.0, + "loss": 0.3074, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.90625, + "rewards/margins": 2.4375, + "rewards/rejected": -6.34375, + "step": 2870 + }, + { + "epoch": 0.14843448009277155, + "grad_norm": 9.628890357192786, + "learning_rate": 4.964406599342406e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.8125, + "logps/chosen": -580.0, + "logps/rejected": -828.0, + "loss": 0.3221, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.984375, + "rewards/margins": 2.625, + "rewards/rejected": -6.59375, + "step": 2880 + }, + { + "epoch": 0.14894987759309367, + "grad_norm": 10.256561521219842, + "learning_rate": 4.963646302100925e-07, + "logits/chosen": -2.859375, + "logits/rejected": -2.71875, + "logps/chosen": -548.0, + "logps/rejected": -776.0, + "loss": 0.3107, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.828125, + "rewards/margins": 2.40625, + "rewards/rejected": -6.25, + "step": 2890 + }, + { + "epoch": 0.14946527509341578, + "grad_norm": 22.679791300576383, + "learning_rate": 4.962878029692895e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.875, + "logps/chosen": -592.0, + "logps/rejected": -776.0, + "loss": 0.3532, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.0, + "rewards/margins": 2.21875, + "rewards/rejected": -6.21875, + "step": 2900 + }, + { + "epoch": 0.14998067259373793, + "grad_norm": 7.139846238434944, + "learning_rate": 4.962101784605325e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.8125, + "logps/chosen": -528.0, + "logps/rejected": -764.0, + "loss": 0.3411, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.65625, + "rewards/margins": 2.34375, + "rewards/rejected": -6.0, + "step": 2910 + }, + { + "epoch": 0.15049607009406005, + "grad_norm": 8.85968148488643, + "learning_rate": 4.961317569351028e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.8125, + "logps/chosen": -524.0, + "logps/rejected": -732.0, + "loss": 0.326, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.578125, + "rewards/margins": 2.109375, + "rewards/rejected": -5.6875, + "step": 2920 + }, + { + "epoch": 0.15101146759438216, + "grad_norm": 12.14435791704894, + "learning_rate": 4.960525386468616e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.875, + "logps/chosen": -580.0, + "logps/rejected": -776.0, + "loss": 0.33, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.890625, + "rewards/margins": 2.140625, + "rewards/rejected": -6.03125, + "step": 2930 + }, + { + "epoch": 0.15152686509470428, + "grad_norm": 7.173269986213736, + "learning_rate": 4.959725238522498e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.84375, + "logps/chosen": -512.0, + "logps/rejected": -760.0, + "loss": 0.3084, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.515625, + "rewards/margins": 2.296875, + "rewards/rejected": -5.8125, + "step": 2940 + }, + { + "epoch": 0.1520422625950264, + "grad_norm": 22.87260930499155, + "learning_rate": 4.958917128102862e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.90625, + "logps/chosen": -604.0, + "logps/rejected": -884.0, + "loss": 0.3226, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.1875, + "rewards/margins": 2.90625, + "rewards/rejected": -7.09375, + "step": 2950 + }, + { + "epoch": 0.15255766009534855, + "grad_norm": 8.21229391499097, + "learning_rate": 4.958101057825675e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.71875, + "logps/chosen": -528.0, + "logps/rejected": -756.0, + "loss": 0.3231, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -3.578125, + "rewards/margins": 2.484375, + "rewards/rejected": -6.0625, + "step": 2960 + }, + { + "epoch": 0.15307305759567066, + "grad_norm": 8.708405912749587, + "learning_rate": 4.957277030332672e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.671875, + "logps/chosen": -540.0, + "logps/rejected": -708.0, + "loss": 0.3452, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.578125, + "rewards/margins": 1.9375, + "rewards/rejected": -5.53125, + "step": 2970 + }, + { + "epoch": 0.15358845509599278, + "grad_norm": 7.293444993903999, + "learning_rate": 4.956445048291343e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.703125, + "logps/chosen": -552.0, + "logps/rejected": -752.0, + "loss": 0.3253, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.796875, + "rewards/margins": 2.046875, + "rewards/rejected": -5.84375, + "step": 2980 + }, + { + "epoch": 0.1541038525963149, + "grad_norm": 8.929252987284961, + "learning_rate": 4.955605114394931e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.75, + "logps/chosen": -502.0, + "logps/rejected": -756.0, + "loss": 0.3129, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -3.28125, + "rewards/margins": 2.625, + "rewards/rejected": -5.90625, + "step": 2990 + }, + { + "epoch": 0.15461925009663702, + "grad_norm": 9.468362755468288, + "learning_rate": 4.954757231362417e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.765625, + "logps/chosen": -488.0, + "logps/rejected": -752.0, + "loss": 0.3174, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.25, + "rewards/margins": 2.71875, + "rewards/rejected": -5.96875, + "step": 3000 + }, + { + "epoch": 0.15513464759695916, + "grad_norm": 11.074221635227174, + "learning_rate": 4.953901401938519e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.953125, + "logps/chosen": -516.0, + "logps/rejected": -744.0, + "loss": 0.328, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -3.4375, + "rewards/margins": 2.453125, + "rewards/rejected": -5.875, + "step": 3010 + }, + { + "epoch": 0.15565004509728128, + "grad_norm": 7.988947349106309, + "learning_rate": 4.953037628893676e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.859375, + "logps/chosen": -552.0, + "logps/rejected": -760.0, + "loss": 0.3112, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.5, + "rewards/margins": 2.40625, + "rewards/rejected": -5.90625, + "step": 3020 + }, + { + "epoch": 0.1561654425976034, + "grad_norm": 10.161410185851524, + "learning_rate": 4.952165915024039e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.625, + "logps/chosen": -588.0, + "logps/rejected": -824.0, + "loss": 0.3276, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.1875, + "rewards/margins": 2.34375, + "rewards/rejected": -6.53125, + "step": 3030 + }, + { + "epoch": 0.15668084009792552, + "grad_norm": 9.622402151118266, + "learning_rate": 4.951286263151471e-07, + "logits/chosen": -2.8125, + "logits/rejected": -2.515625, + "logps/chosen": -524.0, + "logps/rejected": -756.0, + "loss": 0.3191, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.59375, + "rewards/margins": 2.234375, + "rewards/rejected": -5.8125, + "step": 3040 + }, + { + "epoch": 0.15719623759824766, + "grad_norm": 8.760813126987232, + "learning_rate": 4.950398676123525e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.65625, + "logps/chosen": -568.0, + "logps/rejected": -800.0, + "loss": 0.3149, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.890625, + "rewards/margins": 2.28125, + "rewards/rejected": -6.1875, + "step": 3050 + }, + { + "epoch": 0.15771163509856978, + "grad_norm": 8.248470569209255, + "learning_rate": 4.949503156813446e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.875, + "logps/chosen": -592.0, + "logps/rejected": -824.0, + "loss": 0.3169, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.9375, + "rewards/margins": 2.5, + "rewards/rejected": -6.4375, + "step": 3060 + }, + { + "epoch": 0.1582270325988919, + "grad_norm": 7.516571350487094, + "learning_rate": 4.948599708120152e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.78125, + "logps/chosen": -556.0, + "logps/rejected": -744.0, + "loss": 0.336, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.703125, + "rewards/margins": 2.234375, + "rewards/rejected": -5.9375, + "step": 3070 + }, + { + "epoch": 0.15874243009921402, + "grad_norm": 7.925138536600202, + "learning_rate": 4.947688332968237e-07, + "logits/chosen": -2.890625, + "logits/rejected": -2.65625, + "logps/chosen": -592.0, + "logps/rejected": -792.0, + "loss": 0.2971, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.921875, + "rewards/margins": 2.265625, + "rewards/rejected": -6.1875, + "step": 3080 + }, + { + "epoch": 0.15925782759953613, + "grad_norm": 10.797611439612597, + "learning_rate": 4.946769034307945e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.6875, + "logps/chosen": -508.0, + "logps/rejected": -720.0, + "loss": 0.3253, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.375, + "rewards/margins": 2.234375, + "rewards/rejected": -5.59375, + "step": 3090 + }, + { + "epoch": 0.15977322509985828, + "grad_norm": 7.019299159846779, + "learning_rate": 4.945841815115176e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.6875, + "logps/chosen": -540.0, + "logps/rejected": -772.0, + "loss": 0.2985, + "rewards/accuracies": 0.84375, + "rewards/chosen": -3.71875, + "rewards/margins": 2.484375, + "rewards/rejected": -6.21875, + "step": 3100 + }, + { + "epoch": 0.1602886226001804, + "grad_norm": 7.185437282298617, + "learning_rate": 4.944906678391467e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.8125, + "logps/chosen": -532.0, + "logps/rejected": -792.0, + "loss": 0.3193, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -3.75, + "rewards/margins": 2.6875, + "rewards/rejected": -6.4375, + "step": 3110 + }, + { + "epoch": 0.16080402010050251, + "grad_norm": 9.683653854209235, + "learning_rate": 4.943963627163986e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.765625, + "logps/chosen": -568.0, + "logps/rejected": -792.0, + "loss": 0.3029, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.890625, + "rewards/margins": 2.125, + "rewards/rejected": -6.03125, + "step": 3120 + }, + { + "epoch": 0.16131941760082463, + "grad_norm": 12.292902214521924, + "learning_rate": 4.943012664485522e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.75, + "logps/chosen": -556.0, + "logps/rejected": -784.0, + "loss": 0.3191, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.984375, + "rewards/margins": 2.4375, + "rewards/rejected": -6.4375, + "step": 3130 + }, + { + "epoch": 0.16183481510114675, + "grad_norm": 8.545171715870788, + "learning_rate": 4.942053793434473e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.546875, + "logps/chosen": -520.0, + "logps/rejected": -736.0, + "loss": 0.3401, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.5625, + "rewards/margins": 2.203125, + "rewards/rejected": -5.75, + "step": 3140 + }, + { + "epoch": 0.1623502126014689, + "grad_norm": 7.353447487742015, + "learning_rate": 4.941087017114839e-07, + "logits/chosen": -2.90625, + "logits/rejected": -2.640625, + "logps/chosen": -564.0, + "logps/rejected": -760.0, + "loss": 0.3003, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -3.890625, + "rewards/margins": 2.109375, + "rewards/rejected": -6.0, + "step": 3150 + }, + { + "epoch": 0.16286561010179101, + "grad_norm": 9.772478664111473, + "learning_rate": 4.94011233865621e-07, + "logits/chosen": -2.90625, + "logits/rejected": -2.65625, + "logps/chosen": -588.0, + "logps/rejected": -816.0, + "loss": 0.3426, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -4.3125, + "rewards/margins": 2.3125, + "rewards/rejected": -6.625, + "step": 3160 + }, + { + "epoch": 0.16338100760211313, + "grad_norm": 9.37803681224802, + "learning_rate": 4.939129761213755e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.765625, + "logps/chosen": -596.0, + "logps/rejected": -808.0, + "loss": 0.305, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.09375, + "rewards/margins": 2.40625, + "rewards/rejected": -6.46875, + "step": 3170 + }, + { + "epoch": 0.16389640510243525, + "grad_norm": 10.520897204911597, + "learning_rate": 4.938139287968215e-07, + "logits/chosen": -3.078125, + "logits/rejected": -3.03125, + "logps/chosen": -572.0, + "logps/rejected": -792.0, + "loss": 0.3015, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.984375, + "rewards/margins": 2.328125, + "rewards/rejected": -6.3125, + "step": 3180 + }, + { + "epoch": 0.16441180260275737, + "grad_norm": 8.158871569111717, + "learning_rate": 4.937140922125889e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.703125, + "logps/chosen": -564.0, + "logps/rejected": -820.0, + "loss": 0.3276, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.0, + "rewards/margins": 2.53125, + "rewards/rejected": -6.53125, + "step": 3190 + }, + { + "epoch": 0.1649272001030795, + "grad_norm": 13.156465263958202, + "learning_rate": 4.936134666918626e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.828125, + "logps/chosen": -608.0, + "logps/rejected": -848.0, + "loss": 0.3122, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.0625, + "rewards/margins": 2.671875, + "rewards/rejected": -6.75, + "step": 3200 + }, + { + "epoch": 0.16544259760340163, + "grad_norm": 8.113674707133248, + "learning_rate": 4.935120525603818e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.78125, + "logps/chosen": -596.0, + "logps/rejected": -824.0, + "loss": 0.3005, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.3125, + "rewards/margins": 2.34375, + "rewards/rejected": -6.65625, + "step": 3210 + }, + { + "epoch": 0.16595799510372375, + "grad_norm": 9.236625233276555, + "learning_rate": 4.934098501464378e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.625, + "logps/chosen": -560.0, + "logps/rejected": -796.0, + "loss": 0.2995, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -3.921875, + "rewards/margins": 2.515625, + "rewards/rejected": -6.4375, + "step": 3220 + }, + { + "epoch": 0.16647339260404587, + "grad_norm": 8.470183994971467, + "learning_rate": 4.933068597808741e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.828125, + "logps/chosen": -632.0, + "logps/rejected": -872.0, + "loss": 0.3268, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.46875, + "rewards/margins": 2.46875, + "rewards/rejected": -6.9375, + "step": 3230 + }, + { + "epoch": 0.16698879010436798, + "grad_norm": 9.156023271257757, + "learning_rate": 4.932030817970851e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.78125, + "logps/chosen": -584.0, + "logps/rejected": -804.0, + "loss": 0.2898, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.09375, + "rewards/margins": 2.234375, + "rewards/rejected": -6.3125, + "step": 3240 + }, + { + "epoch": 0.16750418760469013, + "grad_norm": 11.957046492094268, + "learning_rate": 4.930985165310143e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.890625, + "logps/chosen": -612.0, + "logps/rejected": -848.0, + "loss": 0.3217, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.28125, + "rewards/margins": 2.578125, + "rewards/rejected": -6.875, + "step": 3250 + }, + { + "epoch": 0.16801958510501225, + "grad_norm": 9.484502049567105, + "learning_rate": 4.929931643211542e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.875, + "logps/chosen": -572.0, + "logps/rejected": -792.0, + "loss": 0.3236, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.1875, + "rewards/margins": 2.125, + "rewards/rejected": -6.3125, + "step": 3260 + }, + { + "epoch": 0.16853498260533437, + "grad_norm": 8.799758376938463, + "learning_rate": 4.928870255085447e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.703125, + "logps/chosen": -612.0, + "logps/rejected": -828.0, + "loss": 0.3366, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.375, + "rewards/margins": 2.3125, + "rewards/rejected": -6.6875, + "step": 3270 + }, + { + "epoch": 0.16905038010565648, + "grad_norm": 8.07795000126954, + "learning_rate": 4.927801004367718e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.734375, + "logps/chosen": -660.0, + "logps/rejected": -844.0, + "loss": 0.3187, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.96875, + "rewards/margins": 2.015625, + "rewards/rejected": -6.96875, + "step": 3280 + }, + { + "epoch": 0.1695657776059786, + "grad_norm": 7.803622380620347, + "learning_rate": 4.926723894519669e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.875, + "logps/chosen": -592.0, + "logps/rejected": -840.0, + "loss": 0.306, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.21875, + "rewards/margins": 2.53125, + "rewards/rejected": -6.71875, + "step": 3290 + }, + { + "epoch": 0.17008117510630075, + "grad_norm": 9.0215492763325, + "learning_rate": 4.925638929028055e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.90625, + "logps/chosen": -628.0, + "logps/rejected": -864.0, + "loss": 0.309, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.5625, + "rewards/margins": 2.515625, + "rewards/rejected": -7.09375, + "step": 3300 + }, + { + "epoch": 0.17059657260662286, + "grad_norm": 7.930746733024545, + "learning_rate": 4.924546111405059e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.828125, + "logps/chosen": -624.0, + "logps/rejected": -864.0, + "loss": 0.3104, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.53125, + "rewards/margins": 2.515625, + "rewards/rejected": -7.03125, + "step": 3310 + }, + { + "epoch": 0.17111197010694498, + "grad_norm": 8.107097251519779, + "learning_rate": 4.923445445188286e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.9375, + "logps/chosen": -656.0, + "logps/rejected": -864.0, + "loss": 0.2805, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.65625, + "rewards/margins": 2.3125, + "rewards/rejected": -6.96875, + "step": 3320 + }, + { + "epoch": 0.1716273676072671, + "grad_norm": 9.979276967339496, + "learning_rate": 4.922336933940745e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.765625, + "logps/chosen": -684.0, + "logps/rejected": -912.0, + "loss": 0.3242, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.0625, + "rewards/margins": 2.59375, + "rewards/rejected": -7.65625, + "step": 3330 + }, + { + "epoch": 0.17214276510758922, + "grad_norm": 7.433788969345211, + "learning_rate": 4.921220581250842e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.984375, + "logps/chosen": -596.0, + "logps/rejected": -808.0, + "loss": 0.3141, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.0625, + "rewards/margins": 2.390625, + "rewards/rejected": -6.46875, + "step": 3340 + }, + { + "epoch": 0.17265816260791136, + "grad_norm": 9.344190897723273, + "learning_rate": 4.920096390732365e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.84375, + "logps/chosen": -604.0, + "logps/rejected": -868.0, + "loss": 0.2916, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.375, + "rewards/margins": 2.703125, + "rewards/rejected": -7.0625, + "step": 3350 + }, + { + "epoch": 0.17317356010823348, + "grad_norm": 8.49006151623474, + "learning_rate": 4.918964366024477e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.84375, + "logps/chosen": -620.0, + "logps/rejected": -856.0, + "loss": 0.3243, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.40625, + "rewards/margins": 2.421875, + "rewards/rejected": -6.84375, + "step": 3360 + }, + { + "epoch": 0.1736889576085556, + "grad_norm": 9.07355816587576, + "learning_rate": 4.917824510791699e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.828125, + "logps/chosen": -624.0, + "logps/rejected": -844.0, + "loss": 0.2944, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.34375, + "rewards/margins": 2.53125, + "rewards/rejected": -6.875, + "step": 3370 + }, + { + "epoch": 0.17420435510887772, + "grad_norm": 11.393308859845522, + "learning_rate": 4.916676828723902e-07, + "logits/chosen": -3.171875, + "logits/rejected": -3.0, + "logps/chosen": -632.0, + "logps/rejected": -848.0, + "loss": 0.3113, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.53125, + "rewards/margins": 2.328125, + "rewards/rejected": -6.875, + "step": 3380 + }, + { + "epoch": 0.17471975260919984, + "grad_norm": 8.308891772991496, + "learning_rate": 4.915521323536292e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.734375, + "logps/chosen": -628.0, + "logps/rejected": -916.0, + "loss": 0.3009, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.625, + "rewards/margins": 3.046875, + "rewards/rejected": -7.65625, + "step": 3390 + }, + { + "epoch": 0.17523515010952198, + "grad_norm": 10.11039004355552, + "learning_rate": 4.914357998969401e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.796875, + "logps/chosen": -620.0, + "logps/rejected": -852.0, + "loss": 0.321, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.5, + "rewards/margins": 2.34375, + "rewards/rejected": -6.84375, + "step": 3400 + }, + { + "epoch": 0.1757505476098441, + "grad_norm": 7.696100609498626, + "learning_rate": 4.913186858789072e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.796875, + "logps/chosen": -644.0, + "logps/rejected": -880.0, + "loss": 0.2931, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.625, + "rewards/margins": 2.484375, + "rewards/rejected": -7.125, + "step": 3410 + }, + { + "epoch": 0.17626594511016622, + "grad_norm": 6.773588742426124, + "learning_rate": 4.91200790678645e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.984375, + "logps/chosen": -596.0, + "logps/rejected": -824.0, + "loss": 0.3118, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.15625, + "rewards/margins": 2.34375, + "rewards/rejected": -6.5, + "step": 3420 + }, + { + "epoch": 0.17678134261048833, + "grad_norm": 8.240926813310582, + "learning_rate": 4.910821146777967e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.03125, + "logps/chosen": -568.0, + "logps/rejected": -808.0, + "loss": 0.3064, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -3.953125, + "rewards/margins": 2.453125, + "rewards/rejected": -6.40625, + "step": 3430 + }, + { + "epoch": 0.17729674011081045, + "grad_norm": 7.958177279871505, + "learning_rate": 4.909626582605328e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.015625, + "logps/chosen": -616.0, + "logps/rejected": -876.0, + "loss": 0.2794, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.34375, + "rewards/margins": 2.8125, + "rewards/rejected": -7.1875, + "step": 3440 + }, + { + "epoch": 0.1778121376111326, + "grad_norm": 7.2270648792800705, + "learning_rate": 4.908424218135509e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.890625, + "logps/chosen": -608.0, + "logps/rejected": -892.0, + "loss": 0.2913, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.28125, + "rewards/margins": 2.734375, + "rewards/rejected": -7.0, + "step": 3450 + }, + { + "epoch": 0.17832753511145472, + "grad_norm": 7.110828443901787, + "learning_rate": 4.907214057260726e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.8125, + "logps/chosen": -584.0, + "logps/rejected": -812.0, + "loss": 0.3099, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.1875, + "rewards/margins": 2.515625, + "rewards/rejected": -6.6875, + "step": 3460 + }, + { + "epoch": 0.17884293261177683, + "grad_norm": 11.280851522272572, + "learning_rate": 4.905996103898441e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.78125, + "logps/chosen": -636.0, + "logps/rejected": -904.0, + "loss": 0.3112, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.71875, + "rewards/margins": 2.640625, + "rewards/rejected": -7.34375, + "step": 3470 + }, + { + "epoch": 0.17935833011209895, + "grad_norm": 7.031013871460757, + "learning_rate": 4.90477036199134e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.828125, + "logps/chosen": -688.0, + "logps/rejected": -884.0, + "loss": 0.3158, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.0, + "rewards/margins": 2.375, + "rewards/rejected": -7.375, + "step": 3480 + }, + { + "epoch": 0.17987372761242107, + "grad_norm": 9.589935420119156, + "learning_rate": 4.903536835507318e-07, + "logits/chosen": -2.765625, + "logits/rejected": -2.625, + "logps/chosen": -612.0, + "logps/rejected": -824.0, + "loss": 0.3291, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.40625, + "rewards/margins": 2.28125, + "rewards/rejected": -6.6875, + "step": 3490 + }, + { + "epoch": 0.18038912511274321, + "grad_norm": 8.984455638149415, + "learning_rate": 4.902295528439473e-07, + "logits/chosen": -2.734375, + "logits/rejected": -2.421875, + "logps/chosen": -656.0, + "logps/rejected": -884.0, + "loss": 0.3225, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.71875, + "rewards/margins": 2.359375, + "rewards/rejected": -7.0625, + "step": 3500 + }, + { + "epoch": 0.18090452261306533, + "grad_norm": 7.685922098656539, + "learning_rate": 4.90104644480609e-07, + "logits/chosen": -2.6875, + "logits/rejected": -2.453125, + "logps/chosen": -644.0, + "logps/rejected": -856.0, + "loss": 0.3058, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.6875, + "rewards/margins": 2.328125, + "rewards/rejected": -7.03125, + "step": 3510 + }, + { + "epoch": 0.18141992011338745, + "grad_norm": 8.35122940661912, + "learning_rate": 4.899789588650625e-07, + "logits/chosen": -2.84375, + "logits/rejected": -2.578125, + "logps/chosen": -672.0, + "logps/rejected": -860.0, + "loss": 0.3226, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -4.75, + "rewards/margins": 2.140625, + "rewards/rejected": -6.875, + "step": 3520 + }, + { + "epoch": 0.18193531761370957, + "grad_norm": 7.316814129593554, + "learning_rate": 4.898524964041699e-07, + "logits/chosen": -2.890625, + "logits/rejected": -2.71875, + "logps/chosen": -584.0, + "logps/rejected": -788.0, + "loss": 0.3104, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.25, + "rewards/margins": 2.109375, + "rewards/rejected": -6.34375, + "step": 3530 + }, + { + "epoch": 0.1824507151140317, + "grad_norm": 7.1786794586292, + "learning_rate": 4.897252575073077e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.75, + "logps/chosen": -608.0, + "logps/rejected": -848.0, + "loss": 0.2957, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.5, + "rewards/margins": 2.4375, + "rewards/rejected": -6.9375, + "step": 3540 + }, + { + "epoch": 0.18296611261435383, + "grad_norm": 8.589316357139246, + "learning_rate": 4.89597242586366e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.921875, + "logps/chosen": -628.0, + "logps/rejected": -884.0, + "loss": 0.3425, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.46875, + "rewards/margins": 2.84375, + "rewards/rejected": -7.3125, + "step": 3550 + }, + { + "epoch": 0.18348151011467595, + "grad_norm": 7.375450426479102, + "learning_rate": 4.894684520557467e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.78125, + "logps/chosen": -636.0, + "logps/rejected": -856.0, + "loss": 0.3331, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.59375, + "rewards/margins": 2.265625, + "rewards/rejected": -6.84375, + "step": 3560 + }, + { + "epoch": 0.18399690761499807, + "grad_norm": 6.521326887590367, + "learning_rate": 4.89338886332363e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.734375, + "logps/chosen": -616.0, + "logps/rejected": -876.0, + "loss": 0.2973, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.65625, + "rewards/margins": 2.546875, + "rewards/rejected": -7.1875, + "step": 3570 + }, + { + "epoch": 0.18451230511532019, + "grad_norm": 10.319893106211808, + "learning_rate": 4.892085458356371e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.84375, + "logps/chosen": -624.0, + "logps/rejected": -852.0, + "loss": 0.3048, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.5, + "rewards/margins": 2.40625, + "rewards/rejected": -6.90625, + "step": 3580 + }, + { + "epoch": 0.1850277026156423, + "grad_norm": 7.343117913436097, + "learning_rate": 4.890774309874994e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.6875, + "logps/chosen": -592.0, + "logps/rejected": -820.0, + "loss": 0.3114, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.21875, + "rewards/margins": 2.53125, + "rewards/rejected": -6.75, + "step": 3590 + }, + { + "epoch": 0.18554310011596445, + "grad_norm": 8.211945393938576, + "learning_rate": 4.889455422123868e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.734375, + "logps/chosen": -604.0, + "logps/rejected": -848.0, + "loss": 0.2697, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.34375, + "rewards/margins": 2.53125, + "rewards/rejected": -6.84375, + "step": 3600 + }, + { + "epoch": 0.18605849761628657, + "grad_norm": 7.955016257917494, + "learning_rate": 4.888128799372418e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.84375, + "logps/chosen": -600.0, + "logps/rejected": -832.0, + "loss": 0.3313, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.34375, + "rewards/margins": 2.359375, + "rewards/rejected": -6.6875, + "step": 3610 + }, + { + "epoch": 0.18657389511660868, + "grad_norm": 7.172347283221737, + "learning_rate": 4.886794445915106e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.96875, + "logps/chosen": -608.0, + "logps/rejected": -868.0, + "loss": 0.3077, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.40625, + "rewards/margins": 2.546875, + "rewards/rejected": -6.9375, + "step": 3620 + }, + { + "epoch": 0.1870892926169308, + "grad_norm": 9.018182797356321, + "learning_rate": 4.885452366071419e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.953125, + "logps/chosen": -672.0, + "logps/rejected": -876.0, + "loss": 0.2965, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.8125, + "rewards/margins": 2.375, + "rewards/rejected": -7.1875, + "step": 3630 + }, + { + "epoch": 0.18760469011725292, + "grad_norm": 7.181577793758476, + "learning_rate": 4.884102564185861e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.9375, + "logps/chosen": -592.0, + "logps/rejected": -856.0, + "loss": 0.3151, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.0625, + "rewards/margins": 2.703125, + "rewards/rejected": -6.75, + "step": 3640 + }, + { + "epoch": 0.18812008761757507, + "grad_norm": 7.92829994322413, + "learning_rate": 4.882745044627924e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.046875, + "logps/chosen": -572.0, + "logps/rejected": -820.0, + "loss": 0.3094, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.03125, + "rewards/margins": 2.5625, + "rewards/rejected": -6.59375, + "step": 3650 + }, + { + "epoch": 0.18863548511789718, + "grad_norm": 10.92906526671337, + "learning_rate": 4.881379811792092e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.75, + "logps/chosen": -668.0, + "logps/rejected": -928.0, + "loss": 0.3164, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.8125, + "rewards/margins": 2.75, + "rewards/rejected": -7.5625, + "step": 3660 + }, + { + "epoch": 0.1891508826182193, + "grad_norm": 6.327075929522721, + "learning_rate": 4.88000687009781e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.9375, + "logps/chosen": -616.0, + "logps/rejected": -832.0, + "loss": 0.3094, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.4375, + "rewards/margins": 2.375, + "rewards/rejected": -6.8125, + "step": 3670 + }, + { + "epoch": 0.18966628011854142, + "grad_norm": 9.347843635742946, + "learning_rate": 4.878626223989485e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.671875, + "logps/chosen": -608.0, + "logps/rejected": -836.0, + "loss": 0.2844, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.46875, + "rewards/margins": 2.265625, + "rewards/rejected": -6.75, + "step": 3680 + }, + { + "epoch": 0.19018167761886354, + "grad_norm": 6.797339225888468, + "learning_rate": 4.877237877936458e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.890625, + "logps/chosen": -628.0, + "logps/rejected": -856.0, + "loss": 0.3015, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.4375, + "rewards/margins": 2.515625, + "rewards/rejected": -6.96875, + "step": 3690 + }, + { + "epoch": 0.19069707511918568, + "grad_norm": 8.194377618916263, + "learning_rate": 4.875841836433001e-07, + "logits/chosen": -3.171875, + "logits/rejected": -3.015625, + "logps/chosen": -620.0, + "logps/rejected": -832.0, + "loss": 0.3199, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.40625, + "rewards/margins": 2.390625, + "rewards/rejected": -6.78125, + "step": 3700 + }, + { + "epoch": 0.1912124726195078, + "grad_norm": 9.191799239103425, + "learning_rate": 4.874438103998295e-07, + "logits/chosen": -3.125, + "logits/rejected": -3.03125, + "logps/chosen": -652.0, + "logps/rejected": -872.0, + "loss": 0.3144, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.53125, + "rewards/margins": 2.421875, + "rewards/rejected": -6.96875, + "step": 3710 + }, + { + "epoch": 0.19172787011982992, + "grad_norm": 7.928953969725331, + "learning_rate": 4.873026685176417e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.8125, + "logps/chosen": -608.0, + "logps/rejected": -832.0, + "loss": 0.3003, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.375, + "rewards/margins": 2.453125, + "rewards/rejected": -6.8125, + "step": 3720 + }, + { + "epoch": 0.19224326762015204, + "grad_norm": 9.00077409674879, + "learning_rate": 4.871607584536325e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.875, + "logps/chosen": -544.0, + "logps/rejected": -808.0, + "loss": 0.3072, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.796875, + "rewards/margins": 2.53125, + "rewards/rejected": -6.34375, + "step": 3730 + }, + { + "epoch": 0.19275866512047415, + "grad_norm": 10.377483990135854, + "learning_rate": 4.870180806671849e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.71875, + "logps/chosen": -600.0, + "logps/rejected": -848.0, + "loss": 0.2986, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.25, + "rewards/margins": 2.578125, + "rewards/rejected": -6.84375, + "step": 3740 + }, + { + "epoch": 0.1932740626207963, + "grad_norm": 6.019505595940947, + "learning_rate": 4.868746356201666e-07, + "logits/chosen": -2.875, + "logits/rejected": -2.796875, + "logps/chosen": -544.0, + "logps/rejected": -796.0, + "loss": 0.2999, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.75, + "rewards/margins": 2.71875, + "rewards/rejected": -6.46875, + "step": 3750 + }, + { + "epoch": 0.19378946012111842, + "grad_norm": 7.919517131667352, + "learning_rate": 4.86730423776929e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.953125, + "logps/chosen": -548.0, + "logps/rejected": -788.0, + "loss": 0.2906, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.640625, + "rewards/margins": 2.578125, + "rewards/rejected": -6.21875, + "step": 3760 + }, + { + "epoch": 0.19430485762144054, + "grad_norm": 5.754834283193178, + "learning_rate": 4.865854456043063e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.875, + "logps/chosen": -652.0, + "logps/rejected": -872.0, + "loss": 0.3196, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.75, + "rewards/margins": 2.3125, + "rewards/rejected": -7.0625, + "step": 3770 + }, + { + "epoch": 0.19482025512176265, + "grad_norm": 9.94335872925226, + "learning_rate": 4.864397015716128e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.0, + "logps/chosen": -572.0, + "logps/rejected": -844.0, + "loss": 0.2765, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.09375, + "rewards/margins": 2.78125, + "rewards/rejected": -6.875, + "step": 3780 + }, + { + "epoch": 0.19533565262208477, + "grad_norm": 11.452818302047211, + "learning_rate": 4.862931921506425e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.75, + "logps/chosen": -580.0, + "logps/rejected": -880.0, + "loss": 0.3045, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.09375, + "rewards/margins": 2.890625, + "rewards/rejected": -7.0, + "step": 3790 + }, + { + "epoch": 0.19585105012240692, + "grad_norm": 8.751375326600819, + "learning_rate": 4.861459178156665e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.625, + "logps/chosen": -604.0, + "logps/rejected": -876.0, + "loss": 0.3198, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.3125, + "rewards/margins": 2.75, + "rewards/rejected": -7.0625, + "step": 3800 + }, + { + "epoch": 0.19636644762272903, + "grad_norm": 9.497660249574864, + "learning_rate": 4.859978790434327e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.859375, + "logps/chosen": -564.0, + "logps/rejected": -800.0, + "loss": 0.3216, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -3.78125, + "rewards/margins": 2.5625, + "rewards/rejected": -6.34375, + "step": 3810 + }, + { + "epoch": 0.19688184512305115, + "grad_norm": 6.752904407638405, + "learning_rate": 4.858490763131629e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.9375, + "logps/chosen": -632.0, + "logps/rejected": -864.0, + "loss": 0.3172, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.6875, + "rewards/margins": 2.375, + "rewards/rejected": -7.0625, + "step": 3820 + }, + { + "epoch": 0.19739724262337327, + "grad_norm": 6.960583183443331, + "learning_rate": 4.856995101065526e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.0625, + "logps/chosen": -648.0, + "logps/rejected": -892.0, + "loss": 0.3342, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.6875, + "rewards/margins": 2.484375, + "rewards/rejected": -7.1875, + "step": 3830 + }, + { + "epoch": 0.1979126401236954, + "grad_norm": 7.815401360065738, + "learning_rate": 4.855491809077682e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.90625, + "logps/chosen": -572.0, + "logps/rejected": -828.0, + "loss": 0.3007, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.03125, + "rewards/margins": 2.71875, + "rewards/rejected": -6.78125, + "step": 3840 + }, + { + "epoch": 0.19842803762401753, + "grad_norm": 9.663304473836881, + "learning_rate": 4.853980892034465e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.046875, + "logps/chosen": -592.0, + "logps/rejected": -812.0, + "loss": 0.3185, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.28125, + "rewards/margins": 2.234375, + "rewards/rejected": -6.5, + "step": 3850 + }, + { + "epoch": 0.19894343512433965, + "grad_norm": 8.803505140713787, + "learning_rate": 4.852462354826922e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.953125, + "logps/chosen": -648.0, + "logps/rejected": -896.0, + "loss": 0.2785, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.59375, + "rewards/margins": 2.484375, + "rewards/rejected": -7.09375, + "step": 3860 + }, + { + "epoch": 0.19945883262466177, + "grad_norm": 7.382946409377529, + "learning_rate": 4.850936202370772e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.796875, + "logps/chosen": -628.0, + "logps/rejected": -852.0, + "loss": 0.327, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.5625, + "rewards/margins": 2.375, + "rewards/rejected": -6.9375, + "step": 3870 + }, + { + "epoch": 0.1999742301249839, + "grad_norm": 7.209008160597982, + "learning_rate": 4.849402439606383e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.78125, + "logps/chosen": -640.0, + "logps/rejected": -852.0, + "loss": 0.3106, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.71875, + "rewards/margins": 2.296875, + "rewards/rejected": -7.03125, + "step": 3880 + }, + { + "epoch": 0.200489627625306, + "grad_norm": 8.6266710186365, + "learning_rate": 4.847861071498756e-07, + "logits/chosen": -2.84375, + "logits/rejected": -2.703125, + "logps/chosen": -600.0, + "logps/rejected": -832.0, + "loss": 0.3216, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.34375, + "rewards/margins": 2.34375, + "rewards/rejected": -6.6875, + "step": 3890 + }, + { + "epoch": 0.20100502512562815, + "grad_norm": 8.739472958342153, + "learning_rate": 4.846312103037516e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.671875, + "logps/chosen": -564.0, + "logps/rejected": -800.0, + "loss": 0.3018, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.71875, + "rewards/margins": 2.65625, + "rewards/rejected": -6.375, + "step": 3900 + }, + { + "epoch": 0.20152042262595027, + "grad_norm": 7.711089745269723, + "learning_rate": 4.844755539236891e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.796875, + "logps/chosen": -632.0, + "logps/rejected": -860.0, + "loss": 0.3079, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.625, + "rewards/margins": 2.34375, + "rewards/rejected": -6.96875, + "step": 3910 + }, + { + "epoch": 0.2020358201262724, + "grad_norm": 8.613305422324098, + "learning_rate": 4.843191385135695e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.875, + "logps/chosen": -628.0, + "logps/rejected": -888.0, + "loss": 0.2949, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.5625, + "rewards/margins": 2.703125, + "rewards/rejected": -7.28125, + "step": 3920 + }, + { + "epoch": 0.2025512176265945, + "grad_norm": 7.584979644877159, + "learning_rate": 4.84161964579731e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.703125, + "logps/chosen": -608.0, + "logps/rejected": -848.0, + "loss": 0.3059, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.46875, + "rewards/margins": 2.296875, + "rewards/rejected": -6.75, + "step": 3930 + }, + { + "epoch": 0.20306661512691662, + "grad_norm": 9.810031473258109, + "learning_rate": 4.840040326309679e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.859375, + "logps/chosen": -600.0, + "logps/rejected": -860.0, + "loss": 0.2758, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.3125, + "rewards/margins": 2.8125, + "rewards/rejected": -7.125, + "step": 3940 + }, + { + "epoch": 0.20358201262723877, + "grad_norm": 9.489791898812014, + "learning_rate": 4.838453431785277e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.84375, + "logps/chosen": -656.0, + "logps/rejected": -944.0, + "loss": 0.2945, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.5625, + "rewards/margins": 3.09375, + "rewards/rejected": -7.6875, + "step": 3950 + }, + { + "epoch": 0.20409741012756089, + "grad_norm": 7.189745408586003, + "learning_rate": 4.836858967361104e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.9375, + "logps/chosen": -576.0, + "logps/rejected": -812.0, + "loss": 0.3151, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.03125, + "rewards/margins": 2.515625, + "rewards/rejected": -6.53125, + "step": 3960 + }, + { + "epoch": 0.204612807627883, + "grad_norm": 11.428361904068005, + "learning_rate": 4.835256938198664e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.796875, + "logps/chosen": -576.0, + "logps/rejected": -820.0, + "loss": 0.2628, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -3.953125, + "rewards/margins": 2.71875, + "rewards/rejected": -6.6875, + "step": 3970 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 9.924538464374693, + "learning_rate": 4.833647349483947e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.703125, + "logps/chosen": -644.0, + "logps/rejected": -876.0, + "loss": 0.2973, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.6875, + "rewards/margins": 2.421875, + "rewards/rejected": -7.09375, + "step": 3980 + }, + { + "epoch": 0.20564360262852724, + "grad_norm": 7.915443909029323, + "learning_rate": 4.832030206427418e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.796875, + "logps/chosen": -648.0, + "logps/rejected": -884.0, + "loss": 0.311, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.59375, + "rewards/margins": 2.734375, + "rewards/rejected": -7.34375, + "step": 3990 + }, + { + "epoch": 0.20615900012884938, + "grad_norm": 8.198959116273715, + "learning_rate": 4.830405514263992e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.796875, + "logps/chosen": -636.0, + "logps/rejected": -884.0, + "loss": 0.2906, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.59375, + "rewards/margins": 2.609375, + "rewards/rejected": -7.21875, + "step": 4000 + }, + { + "epoch": 0.2066743976291715, + "grad_norm": 8.150941524122768, + "learning_rate": 4.828773278253027e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.9375, + "logps/chosen": -592.0, + "logps/rejected": -852.0, + "loss": 0.3081, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.21875, + "rewards/margins": 2.640625, + "rewards/rejected": -6.875, + "step": 4010 + }, + { + "epoch": 0.20718979512949362, + "grad_norm": 7.2412717391054535, + "learning_rate": 4.827133503678298e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.09375, + "logps/chosen": -576.0, + "logps/rejected": -804.0, + "loss": 0.2929, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -3.828125, + "rewards/margins": 2.5, + "rewards/rejected": -6.34375, + "step": 4020 + }, + { + "epoch": 0.20770519262981574, + "grad_norm": 8.105773859800914, + "learning_rate": 4.825486195847981e-07, + "logits/chosen": -3.171875, + "logits/rejected": -3.0, + "logps/chosen": -644.0, + "logps/rejected": -884.0, + "loss": 0.3125, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.5625, + "rewards/margins": 2.78125, + "rewards/rejected": -7.34375, + "step": 4030 + }, + { + "epoch": 0.20822059013013786, + "grad_norm": 10.747835682920662, + "learning_rate": 4.823831360094645e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.03125, + "logps/chosen": -644.0, + "logps/rejected": -868.0, + "loss": 0.2969, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.59375, + "rewards/margins": 2.453125, + "rewards/rejected": -7.03125, + "step": 4040 + }, + { + "epoch": 0.20873598763046, + "grad_norm": 7.075833717865981, + "learning_rate": 4.82216900177522e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.6875, + "logps/chosen": -588.0, + "logps/rejected": -844.0, + "loss": 0.2906, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.3125, + "rewards/margins": 2.515625, + "rewards/rejected": -6.8125, + "step": 4050 + }, + { + "epoch": 0.20925138513078212, + "grad_norm": 7.7010956846513645, + "learning_rate": 4.820499126270996e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.875, + "logps/chosen": -620.0, + "logps/rejected": -868.0, + "loss": 0.2953, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.375, + "rewards/margins": 2.53125, + "rewards/rejected": -6.90625, + "step": 4060 + }, + { + "epoch": 0.20976678263110424, + "grad_norm": 6.984026980233429, + "learning_rate": 4.818821738987589e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.9375, + "logps/chosen": -656.0, + "logps/rejected": -892.0, + "loss": 0.3021, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.65625, + "rewards/margins": 2.4375, + "rewards/rejected": -7.125, + "step": 4070 + }, + { + "epoch": 0.21028218013142636, + "grad_norm": 8.671287931298396, + "learning_rate": 4.817136845354939e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.84375, + "logps/chosen": -600.0, + "logps/rejected": -860.0, + "loss": 0.2746, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.28125, + "rewards/margins": 2.6875, + "rewards/rejected": -6.96875, + "step": 4080 + }, + { + "epoch": 0.21079757763174847, + "grad_norm": 8.58406129177322, + "learning_rate": 4.815444450827278e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.96875, + "logps/chosen": -600.0, + "logps/rejected": -864.0, + "loss": 0.2945, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.28125, + "rewards/margins": 2.71875, + "rewards/rejected": -7.0, + "step": 4090 + }, + { + "epoch": 0.21131297513207062, + "grad_norm": 9.416608561004821, + "learning_rate": 4.813744560883127e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.84375, + "logps/chosen": -608.0, + "logps/rejected": -856.0, + "loss": 0.2923, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.28125, + "rewards/margins": 2.53125, + "rewards/rejected": -6.8125, + "step": 4100 + }, + { + "epoch": 0.21182837263239274, + "grad_norm": 7.627073133576545, + "learning_rate": 4.812037181025262e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.84375, + "logps/chosen": -604.0, + "logps/rejected": -848.0, + "loss": 0.3017, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.4375, + "rewards/margins": 2.390625, + "rewards/rejected": -6.8125, + "step": 4110 + }, + { + "epoch": 0.21234377013271485, + "grad_norm": 7.480277681682737, + "learning_rate": 4.810322316780713e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.765625, + "logps/chosen": -608.0, + "logps/rejected": -872.0, + "loss": 0.2948, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.46875, + "rewards/margins": 2.609375, + "rewards/rejected": -7.0625, + "step": 4120 + }, + { + "epoch": 0.21285916763303697, + "grad_norm": 8.49177890720753, + "learning_rate": 4.808599973700734e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.828125, + "logps/chosen": -580.0, + "logps/rejected": -880.0, + "loss": 0.2995, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.1875, + "rewards/margins": 2.78125, + "rewards/rejected": -6.96875, + "step": 4130 + }, + { + "epoch": 0.2133745651333591, + "grad_norm": 7.360349225438156, + "learning_rate": 4.80687015736079e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.96875, + "logps/chosen": -628.0, + "logps/rejected": -852.0, + "loss": 0.2887, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.46875, + "rewards/margins": 2.46875, + "rewards/rejected": -6.9375, + "step": 4140 + }, + { + "epoch": 0.21388996263368124, + "grad_norm": 8.703175825755736, + "learning_rate": 4.805132873360536e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.9375, + "logps/chosen": -608.0, + "logps/rejected": -900.0, + "loss": 0.299, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.46875, + "rewards/margins": 2.90625, + "rewards/rejected": -7.375, + "step": 4150 + }, + { + "epoch": 0.21440536013400335, + "grad_norm": 9.89709281598567, + "learning_rate": 4.803388127323804e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.984375, + "logps/chosen": -592.0, + "logps/rejected": -844.0, + "loss": 0.2678, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.3125, + "rewards/margins": 2.671875, + "rewards/rejected": -6.96875, + "step": 4160 + }, + { + "epoch": 0.21492075763432547, + "grad_norm": 7.672726879087714, + "learning_rate": 4.80163592489858e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.109375, + "logps/chosen": -568.0, + "logps/rejected": -828.0, + "loss": 0.2958, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -3.96875, + "rewards/margins": 2.625, + "rewards/rejected": -6.59375, + "step": 4170 + }, + { + "epoch": 0.2154361551346476, + "grad_norm": 8.638495066662339, + "learning_rate": 4.799876271756987e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.875, + "logps/chosen": -628.0, + "logps/rejected": -904.0, + "loss": 0.2757, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.4375, + "rewards/margins": 2.984375, + "rewards/rejected": -7.4375, + "step": 4180 + }, + { + "epoch": 0.2159515526349697, + "grad_norm": 7.3827849455147625, + "learning_rate": 4.798109173595268e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.984375, + "logps/chosen": -656.0, + "logps/rejected": -912.0, + "loss": 0.2955, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.71875, + "rewards/margins": 2.78125, + "rewards/rejected": -7.5, + "step": 4190 + }, + { + "epoch": 0.21646695013529185, + "grad_norm": 7.114702076000237, + "learning_rate": 4.796334636133766e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.84375, + "logps/chosen": -612.0, + "logps/rejected": -848.0, + "loss": 0.3216, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.375, + "rewards/margins": 2.515625, + "rewards/rejected": -6.875, + "step": 4200 + }, + { + "epoch": 0.21698234763561397, + "grad_norm": 7.219385642368885, + "learning_rate": 4.794552665116906e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.703125, + "logps/chosen": -612.0, + "logps/rejected": -832.0, + "loss": 0.3264, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.40625, + "rewards/margins": 2.109375, + "rewards/rejected": -6.5, + "step": 4210 + }, + { + "epoch": 0.2174977451359361, + "grad_norm": 7.298116884307016, + "learning_rate": 4.792763266313177e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.875, + "logps/chosen": -628.0, + "logps/rejected": -864.0, + "loss": 0.2892, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.40625, + "rewards/margins": 2.46875, + "rewards/rejected": -6.875, + "step": 4220 + }, + { + "epoch": 0.2180131426362582, + "grad_norm": 10.503480626789262, + "learning_rate": 4.790966445515112e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.046875, + "logps/chosen": -664.0, + "logps/rejected": -908.0, + "loss": 0.319, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.6875, + "rewards/margins": 2.84375, + "rewards/rejected": -7.53125, + "step": 4230 + }, + { + "epoch": 0.21852854013658032, + "grad_norm": 7.345085644707261, + "learning_rate": 4.78916220853927e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.890625, + "logps/chosen": -608.0, + "logps/rejected": -888.0, + "loss": 0.2643, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.46875, + "rewards/margins": 2.84375, + "rewards/rejected": -7.28125, + "step": 4240 + }, + { + "epoch": 0.21904393763690247, + "grad_norm": 9.323726603083948, + "learning_rate": 4.787350561226219e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.828125, + "logps/chosen": -640.0, + "logps/rejected": -896.0, + "loss": 0.2982, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.65625, + "rewards/margins": 2.859375, + "rewards/rejected": -7.5, + "step": 4250 + }, + { + "epoch": 0.2195593351372246, + "grad_norm": 5.977138710147792, + "learning_rate": 4.785531509440512e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.84375, + "logps/chosen": -620.0, + "logps/rejected": -840.0, + "loss": 0.3129, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.46875, + "rewards/margins": 2.484375, + "rewards/rejected": -6.9375, + "step": 4260 + }, + { + "epoch": 0.2200747326375467, + "grad_norm": 9.285309398671876, + "learning_rate": 4.783705059070675e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.859375, + "logps/chosen": -644.0, + "logps/rejected": -892.0, + "loss": 0.3067, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.65625, + "rewards/margins": 2.3125, + "rewards/rejected": -6.96875, + "step": 4270 + }, + { + "epoch": 0.22059013013786882, + "grad_norm": 9.260592677048967, + "learning_rate": 4.781871216029182e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.90625, + "logps/chosen": -696.0, + "logps/rejected": -992.0, + "loss": 0.2894, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.125, + "rewards/margins": 2.953125, + "rewards/rejected": -8.0625, + "step": 4280 + }, + { + "epoch": 0.22110552763819097, + "grad_norm": 7.721924724790773, + "learning_rate": 4.780029986252438e-07, + "logits/chosen": -3.390625, + "logits/rejected": -2.96875, + "logps/chosen": -608.0, + "logps/rejected": -860.0, + "loss": 0.2825, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.5625, + "rewards/margins": 2.5, + "rewards/rejected": -7.0625, + "step": 4290 + }, + { + "epoch": 0.2216209251385131, + "grad_norm": 15.778945902809916, + "learning_rate": 4.778181375700761e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.890625, + "logps/chosen": -620.0, + "logps/rejected": -896.0, + "loss": 0.2768, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.5, + "rewards/margins": 2.859375, + "rewards/rejected": -7.34375, + "step": 4300 + }, + { + "epoch": 0.2221363226388352, + "grad_norm": 6.3836232513156785, + "learning_rate": 4.776325390358361e-07, + "logits/chosen": -3.125, + "logits/rejected": -3.078125, + "logps/chosen": -628.0, + "logps/rejected": -868.0, + "loss": 0.3135, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.40625, + "rewards/margins": 2.578125, + "rewards/rejected": -7.0, + "step": 4310 + }, + { + "epoch": 0.22265172013915732, + "grad_norm": 6.072741359596065, + "learning_rate": 4.774462036233321e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.84375, + "logps/chosen": -580.0, + "logps/rejected": -808.0, + "loss": 0.2858, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.1875, + "rewards/margins": 2.421875, + "rewards/rejected": -6.625, + "step": 4320 + }, + { + "epoch": 0.22316711763947944, + "grad_norm": 9.72474820237613, + "learning_rate": 4.772591319357578e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.90625, + "logps/chosen": -580.0, + "logps/rejected": -828.0, + "loss": 0.2953, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.09375, + "rewards/margins": 2.5, + "rewards/rejected": -6.59375, + "step": 4330 + }, + { + "epoch": 0.22368251513980159, + "grad_norm": 7.9452660285526395, + "learning_rate": 4.770713245786905e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.953125, + "logps/chosen": -612.0, + "logps/rejected": -860.0, + "loss": 0.2868, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.53125, + "rewards/margins": 2.46875, + "rewards/rejected": -7.0, + "step": 4340 + }, + { + "epoch": 0.2241979126401237, + "grad_norm": 6.489070101738693, + "learning_rate": 4.768827821600885e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.03125, + "logps/chosen": -616.0, + "logps/rejected": -856.0, + "loss": 0.2868, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.21875, + "rewards/margins": 2.625, + "rewards/rejected": -6.84375, + "step": 4350 + }, + { + "epoch": 0.22471331014044582, + "grad_norm": 7.689561402416481, + "learning_rate": 4.766935052902902e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.03125, + "logps/chosen": -576.0, + "logps/rejected": -852.0, + "loss": 0.2796, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.03125, + "rewards/margins": 2.875, + "rewards/rejected": -6.90625, + "step": 4360 + }, + { + "epoch": 0.22522870764076794, + "grad_norm": 9.925159538784987, + "learning_rate": 4.76503494582011e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.203125, + "logps/chosen": -632.0, + "logps/rejected": -852.0, + "loss": 0.2971, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.375, + "rewards/margins": 2.34375, + "rewards/rejected": -6.71875, + "step": 4370 + }, + { + "epoch": 0.22574410514109006, + "grad_norm": 7.179319492258064, + "learning_rate": 4.763127506503421e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.953125, + "logps/chosen": -620.0, + "logps/rejected": -876.0, + "loss": 0.2943, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.34375, + "rewards/margins": 2.65625, + "rewards/rejected": -7.0, + "step": 4380 + }, + { + "epoch": 0.2262595026414122, + "grad_norm": 9.73965149625161, + "learning_rate": 4.7612127411274826e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.8125, + "logps/chosen": -636.0, + "logps/rejected": -912.0, + "loss": 0.2856, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.59375, + "rewards/margins": 2.8125, + "rewards/rejected": -7.40625, + "step": 4390 + }, + { + "epoch": 0.22677490014173432, + "grad_norm": 7.904454481974661, + "learning_rate": 4.7592906558906567e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.03125, + "logps/chosen": -648.0, + "logps/rejected": -912.0, + "loss": 0.2861, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.46875, + "rewards/margins": 3.0, + "rewards/rejected": -7.46875, + "step": 4400 + }, + { + "epoch": 0.22729029764205644, + "grad_norm": 9.577584846564035, + "learning_rate": 4.7573612570150016e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.84375, + "logps/chosen": -604.0, + "logps/rejected": -864.0, + "loss": 0.2942, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.375, + "rewards/margins": 2.609375, + "rewards/rejected": -7.0, + "step": 4410 + }, + { + "epoch": 0.22780569514237856, + "grad_norm": 7.9172645404320425, + "learning_rate": 4.75542455074625e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.078125, + "logps/chosen": -624.0, + "logps/rejected": -872.0, + "loss": 0.3097, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.375, + "rewards/margins": 2.53125, + "rewards/rejected": -6.90625, + "step": 4420 + }, + { + "epoch": 0.22832109264270067, + "grad_norm": 6.936164661332623, + "learning_rate": 4.7534805433537906e-07, + "logits/chosen": -3.171875, + "logits/rejected": -3.0625, + "logps/chosen": -612.0, + "logps/rejected": -840.0, + "loss": 0.3294, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.5, + "rewards/margins": 2.34375, + "rewards/rejected": -6.84375, + "step": 4430 + }, + { + "epoch": 0.22883649014302282, + "grad_norm": 7.5428129158512744, + "learning_rate": 4.7515292411306464e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.953125, + "logps/chosen": -616.0, + "logps/rejected": -872.0, + "loss": 0.2984, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.34375, + "rewards/margins": 2.640625, + "rewards/rejected": -6.96875, + "step": 4440 + }, + { + "epoch": 0.22935188764334494, + "grad_norm": 7.313831285801238, + "learning_rate": 4.7495706503934543e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.0625, + "logps/chosen": -660.0, + "logps/rejected": -900.0, + "loss": 0.2976, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.875, + "rewards/margins": 2.40625, + "rewards/rejected": -7.3125, + "step": 4450 + }, + { + "epoch": 0.22986728514366706, + "grad_norm": 7.127131516883687, + "learning_rate": 4.747604777482445e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.03125, + "logps/chosen": -656.0, + "logps/rejected": -892.0, + "loss": 0.2922, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.71875, + "rewards/margins": 2.484375, + "rewards/rejected": -7.21875, + "step": 4460 + }, + { + "epoch": 0.23038268264398917, + "grad_norm": 10.442060052870037, + "learning_rate": 4.745631628761424e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.734375, + "logps/chosen": -652.0, + "logps/rejected": -892.0, + "loss": 0.3063, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.03125, + "rewards/margins": 2.421875, + "rewards/rejected": -7.4375, + "step": 4470 + }, + { + "epoch": 0.2308980801443113, + "grad_norm": 7.526153981768375, + "learning_rate": 4.74365121061775e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.890625, + "logps/chosen": -668.0, + "logps/rejected": -956.0, + "loss": 0.2799, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.8125, + "rewards/margins": 2.96875, + "rewards/rejected": -7.75, + "step": 4480 + }, + { + "epoch": 0.23141347764463344, + "grad_norm": 9.415756040410207, + "learning_rate": 4.7416635294623095e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.96875, + "logps/chosen": -740.0, + "logps/rejected": -952.0, + "loss": 0.2852, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.34375, + "rewards/margins": 2.546875, + "rewards/rejected": -7.875, + "step": 4490 + }, + { + "epoch": 0.23192887514495555, + "grad_norm": 8.031161971725094, + "learning_rate": 4.739668591729506e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.984375, + "logps/chosen": -636.0, + "logps/rejected": -904.0, + "loss": 0.2946, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.6875, + "rewards/margins": 2.703125, + "rewards/rejected": -7.40625, + "step": 4500 + }, + { + "epoch": 0.23244427264527767, + "grad_norm": 10.406186205936345, + "learning_rate": 4.7376664038772306e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.84375, + "logps/chosen": -624.0, + "logps/rejected": -920.0, + "loss": 0.2856, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.71875, + "rewards/margins": 3.0625, + "rewards/rejected": -7.78125, + "step": 4510 + }, + { + "epoch": 0.2329596701455998, + "grad_norm": 7.950402335987625, + "learning_rate": 4.735656972386844e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.703125, + "logps/chosen": -688.0, + "logps/rejected": -956.0, + "loss": 0.297, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.125, + "rewards/margins": 2.71875, + "rewards/rejected": -7.84375, + "step": 4520 + }, + { + "epoch": 0.2334750676459219, + "grad_norm": 7.880736821562359, + "learning_rate": 4.733640303763157e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.921875, + "logps/chosen": -612.0, + "logps/rejected": -872.0, + "loss": 0.2899, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.53125, + "rewards/margins": 2.71875, + "rewards/rejected": -7.21875, + "step": 4530 + }, + { + "epoch": 0.23399046514624405, + "grad_norm": 8.382986844035228, + "learning_rate": 4.731616404534405e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.96875, + "logps/chosen": -664.0, + "logps/rejected": -952.0, + "loss": 0.2957, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.84375, + "rewards/margins": 2.890625, + "rewards/rejected": -7.71875, + "step": 4540 + }, + { + "epoch": 0.23450586264656617, + "grad_norm": 8.280468167478837, + "learning_rate": 4.7295852812522347e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.96875, + "logps/chosen": -644.0, + "logps/rejected": -896.0, + "loss": 0.2964, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.71875, + "rewards/margins": 2.6875, + "rewards/rejected": -7.40625, + "step": 4550 + }, + { + "epoch": 0.2350212601468883, + "grad_norm": 9.102021390494578, + "learning_rate": 4.7275469404916733e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.921875, + "logps/chosen": -632.0, + "logps/rejected": -880.0, + "loss": 0.2899, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.8125, + "rewards/margins": 2.328125, + "rewards/rejected": -7.15625, + "step": 4560 + }, + { + "epoch": 0.2355366576472104, + "grad_norm": 8.734788318658868, + "learning_rate": 4.7255013888511153e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.921875, + "logps/chosen": -676.0, + "logps/rejected": -932.0, + "loss": 0.3001, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.15625, + "rewards/margins": 2.6875, + "rewards/rejected": -7.84375, + "step": 4570 + }, + { + "epoch": 0.23605205514753252, + "grad_norm": 8.30525542268798, + "learning_rate": 4.723448632952296e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.703125, + "logps/chosen": -640.0, + "logps/rejected": -916.0, + "loss": 0.265, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.71875, + "rewards/margins": 2.953125, + "rewards/rejected": -7.65625, + "step": 4580 + }, + { + "epoch": 0.23656745264785467, + "grad_norm": 8.926423557205885, + "learning_rate": 4.721388679440272e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.0625, + "logps/chosen": -604.0, + "logps/rejected": -876.0, + "loss": 0.292, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.375, + "rewards/margins": 2.8125, + "rewards/rejected": -7.1875, + "step": 4590 + }, + { + "epoch": 0.2370828501481768, + "grad_norm": 8.34904649285889, + "learning_rate": 4.7193215349834e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.0625, + "logps/chosen": -624.0, + "logps/rejected": -844.0, + "loss": 0.3089, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.46875, + "rewards/margins": 2.484375, + "rewards/rejected": -6.9375, + "step": 4600 + }, + { + "epoch": 0.2375982476484989, + "grad_norm": 6.735405397012328, + "learning_rate": 4.7172472062733167e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.796875, + "logps/chosen": -636.0, + "logps/rejected": -856.0, + "loss": 0.286, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.625, + "rewards/margins": 2.328125, + "rewards/rejected": -6.96875, + "step": 4610 + }, + { + "epoch": 0.23811364514882102, + "grad_norm": 7.4266966045550245, + "learning_rate": 4.7151657000249105e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.953125, + "logps/chosen": -624.0, + "logps/rejected": -916.0, + "loss": 0.2823, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.375, + "rewards/margins": 3.09375, + "rewards/rejected": -7.5, + "step": 4620 + }, + { + "epoch": 0.23862904264914314, + "grad_norm": 7.744958440198034, + "learning_rate": 4.7130770229763094e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -676.0, + "logps/rejected": -924.0, + "loss": 0.2786, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.90625, + "rewards/margins": 2.671875, + "rewards/rejected": -7.5625, + "step": 4630 + }, + { + "epoch": 0.2391444401494653, + "grad_norm": 9.232194879754882, + "learning_rate": 4.710981181888851e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.015625, + "logps/chosen": -708.0, + "logps/rejected": -1008.0, + "loss": 0.2754, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.28125, + "rewards/margins": 3.109375, + "rewards/rejected": -8.375, + "step": 4640 + }, + { + "epoch": 0.2396598376497874, + "grad_norm": 9.475382053726806, + "learning_rate": 4.7088781835470656e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.0625, + "logps/chosen": -648.0, + "logps/rejected": -872.0, + "loss": 0.2914, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.84375, + "rewards/margins": 2.375, + "rewards/rejected": -7.21875, + "step": 4650 + }, + { + "epoch": 0.24017523515010952, + "grad_norm": 8.805531491537135, + "learning_rate": 4.7067680347586525e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.890625, + "logps/chosen": -616.0, + "logps/rejected": -892.0, + "loss": 0.2605, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.5, + "rewards/margins": 2.796875, + "rewards/rejected": -7.3125, + "step": 4660 + }, + { + "epoch": 0.24069063265043164, + "grad_norm": 9.612262424723838, + "learning_rate": 4.7046507423544565e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.0, + "logps/chosen": -636.0, + "logps/rejected": -912.0, + "loss": 0.285, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.6875, + "rewards/margins": 2.859375, + "rewards/rejected": -7.5625, + "step": 4670 + }, + { + "epoch": 0.24120603015075376, + "grad_norm": 7.865973124256476, + "learning_rate": 4.7025263131884483e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.9375, + "logps/chosen": -648.0, + "logps/rejected": -924.0, + "loss": 0.3005, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.75, + "rewards/margins": 2.78125, + "rewards/rejected": -7.53125, + "step": 4680 + }, + { + "epoch": 0.2417214276510759, + "grad_norm": 7.700215682521424, + "learning_rate": 4.7003947541377024e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.9375, + "logps/chosen": -624.0, + "logps/rejected": -868.0, + "loss": 0.2908, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.34375, + "rewards/margins": 2.71875, + "rewards/rejected": -7.03125, + "step": 4690 + }, + { + "epoch": 0.24223682515139802, + "grad_norm": 7.7113803234004346, + "learning_rate": 4.6982560721023725e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.984375, + "logps/chosen": -668.0, + "logps/rejected": -896.0, + "loss": 0.2826, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.90625, + "rewards/margins": 2.53125, + "rewards/rejected": -7.40625, + "step": 4700 + }, + { + "epoch": 0.24275222265172014, + "grad_norm": 9.222305026646428, + "learning_rate": 4.696110274005669e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.75, + "logps/chosen": -644.0, + "logps/rejected": -904.0, + "loss": 0.294, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.71875, + "rewards/margins": 2.671875, + "rewards/rejected": -7.40625, + "step": 4710 + }, + { + "epoch": 0.24326762015204226, + "grad_norm": 9.086346650860447, + "learning_rate": 4.6939573667938406e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.8125, + "logps/chosen": -604.0, + "logps/rejected": -852.0, + "loss": 0.3056, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.40625, + "rewards/margins": 2.609375, + "rewards/rejected": -7.0, + "step": 4720 + }, + { + "epoch": 0.24378301765236438, + "grad_norm": 7.705845015668752, + "learning_rate": 4.691797357436148e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.78125, + "logps/chosen": -608.0, + "logps/rejected": -852.0, + "loss": 0.2924, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.34375, + "rewards/margins": 2.53125, + "rewards/rejected": -6.875, + "step": 4730 + }, + { + "epoch": 0.24429841515268652, + "grad_norm": 7.814857521379984, + "learning_rate": 4.689630252924843e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.03125, + "logps/chosen": -656.0, + "logps/rejected": -880.0, + "loss": 0.3031, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.8125, + "rewards/margins": 2.3125, + "rewards/rejected": -7.125, + "step": 4740 + }, + { + "epoch": 0.24481381265300864, + "grad_norm": 7.115969300748152, + "learning_rate": 4.6874560602751444e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.90625, + "logps/chosen": -640.0, + "logps/rejected": -928.0, + "loss": 0.2967, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.625, + "rewards/margins": 3.078125, + "rewards/rejected": -7.71875, + "step": 4750 + }, + { + "epoch": 0.24532921015333076, + "grad_norm": 9.523088889546342, + "learning_rate": 4.6852747865252166e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.609375, + "logps/chosen": -648.0, + "logps/rejected": -896.0, + "loss": 0.2743, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.71875, + "rewards/margins": 2.671875, + "rewards/rejected": -7.375, + "step": 4760 + }, + { + "epoch": 0.24584460765365287, + "grad_norm": 6.9933178023203855, + "learning_rate": 4.683086438736148e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.734375, + "logps/chosen": -632.0, + "logps/rejected": -912.0, + "loss": 0.2753, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -4.8125, + "rewards/margins": 2.6875, + "rewards/rejected": -7.5, + "step": 4770 + }, + { + "epoch": 0.246360005153975, + "grad_norm": 9.414145163610204, + "learning_rate": 4.6808910239919247e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.71875, + "logps/chosen": -684.0, + "logps/rejected": -992.0, + "loss": 0.2685, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.125, + "rewards/margins": 3.203125, + "rewards/rejected": -8.3125, + "step": 4780 + }, + { + "epoch": 0.24687540265429714, + "grad_norm": 7.619208086281357, + "learning_rate": 4.6786885493994107e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.796875, + "logps/chosen": -668.0, + "logps/rejected": -940.0, + "loss": 0.303, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -4.8125, + "rewards/margins": 2.765625, + "rewards/rejected": -7.59375, + "step": 4790 + }, + { + "epoch": 0.24739080015461926, + "grad_norm": 7.289769946353819, + "learning_rate": 4.676479022088322e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.984375, + "logps/chosen": -640.0, + "logps/rejected": -916.0, + "loss": 0.3048, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.53125, + "rewards/margins": 3.0625, + "rewards/rejected": -7.59375, + "step": 4800 + }, + { + "epoch": 0.24790619765494137, + "grad_norm": 8.208493627955885, + "learning_rate": 4.674262449211209e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.796875, + "logps/chosen": -640.0, + "logps/rejected": -912.0, + "loss": 0.2627, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.75, + "rewards/margins": 2.875, + "rewards/rejected": -7.625, + "step": 4810 + }, + { + "epoch": 0.2484215951552635, + "grad_norm": 7.439719618530367, + "learning_rate": 4.6720388379434253e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.890625, + "logps/chosen": -624.0, + "logps/rejected": -892.0, + "loss": 0.2629, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.375, + "rewards/margins": 2.75, + "rewards/rejected": -7.15625, + "step": 4820 + }, + { + "epoch": 0.2489369926555856, + "grad_norm": 7.600669518049735, + "learning_rate": 4.66980819548311e-07, + "logits/chosen": -3.390625, + "logits/rejected": -2.984375, + "logps/chosen": -624.0, + "logps/rejected": -912.0, + "loss": 0.2972, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.5, + "rewards/margins": 3.0, + "rewards/rejected": -7.5, + "step": 4830 + }, + { + "epoch": 0.24945239015590775, + "grad_norm": 6.568114640871866, + "learning_rate": 4.6675705290511657e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.9375, + "logps/chosen": -668.0, + "logps/rejected": -916.0, + "loss": 0.2834, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.8125, + "rewards/margins": 2.671875, + "rewards/rejected": -7.5, + "step": 4840 + }, + { + "epoch": 0.24996778765622987, + "grad_norm": 9.88996670035786, + "learning_rate": 4.6653258458912294e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.953125, + "logps/chosen": -648.0, + "logps/rejected": -916.0, + "loss": 0.2805, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.6875, + "rewards/margins": 2.625, + "rewards/rejected": -7.3125, + "step": 4850 + }, + { + "epoch": 0.250483185156552, + "grad_norm": 10.249304573370784, + "learning_rate": 4.663074153269654e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.71875, + "logps/chosen": -660.0, + "logps/rejected": -928.0, + "loss": 0.2488, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.75, + "rewards/margins": 3.015625, + "rewards/rejected": -7.75, + "step": 4860 + }, + { + "epoch": 0.25099858265687414, + "grad_norm": 6.240582106891933, + "learning_rate": 4.660815458475481e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.8125, + "logps/chosen": -664.0, + "logps/rejected": -936.0, + "loss": 0.2864, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.78125, + "rewards/margins": 2.90625, + "rewards/rejected": -7.6875, + "step": 4870 + }, + { + "epoch": 0.2515139801571962, + "grad_norm": 6.386268320491348, + "learning_rate": 4.6585497688204225e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.875, + "logps/chosen": -624.0, + "logps/rejected": -884.0, + "loss": 0.2791, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.5, + "rewards/margins": 2.578125, + "rewards/rejected": -7.0625, + "step": 4880 + }, + { + "epoch": 0.25202937765751837, + "grad_norm": 8.310038645126818, + "learning_rate": 4.6562770916388306e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.765625, + "logps/chosen": -640.0, + "logps/rejected": -920.0, + "loss": 0.2632, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.71875, + "rewards/margins": 2.96875, + "rewards/rejected": -7.6875, + "step": 4890 + }, + { + "epoch": 0.25254477515784046, + "grad_norm": 7.778828768687632, + "learning_rate": 4.6539974342876775e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.0, + "logps/chosen": -616.0, + "logps/rejected": -908.0, + "loss": 0.2558, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.5625, + "rewards/margins": 3.078125, + "rewards/rejected": -7.625, + "step": 4900 + }, + { + "epoch": 0.2530601726581626, + "grad_norm": 7.5899017932354536, + "learning_rate": 4.6517108041465336e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.0, + "logps/chosen": -620.0, + "logps/rejected": -904.0, + "loss": 0.2698, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.53125, + "rewards/margins": 2.96875, + "rewards/rejected": -7.5, + "step": 4910 + }, + { + "epoch": 0.25357557015848475, + "grad_norm": 10.727099881594816, + "learning_rate": 4.6494172086175377e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.984375, + "logps/chosen": -644.0, + "logps/rejected": -928.0, + "loss": 0.2861, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.71875, + "rewards/margins": 2.828125, + "rewards/rejected": -7.53125, + "step": 4920 + }, + { + "epoch": 0.25409096765880684, + "grad_norm": 8.329957079813074, + "learning_rate": 4.6471166551253793e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.15625, + "logps/chosen": -632.0, + "logps/rejected": -888.0, + "loss": 0.3005, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.46875, + "rewards/margins": 2.671875, + "rewards/rejected": -7.15625, + "step": 4930 + }, + { + "epoch": 0.254606365159129, + "grad_norm": 7.963874150376027, + "learning_rate": 4.6448091511172716e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.96875, + "logps/chosen": -596.0, + "logps/rejected": -860.0, + "loss": 0.2851, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.28125, + "rewards/margins": 2.671875, + "rewards/rejected": -6.9375, + "step": 4940 + }, + { + "epoch": 0.2551217626594511, + "grad_norm": 7.310981006824624, + "learning_rate": 4.6424947040629246e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.828125, + "logps/chosen": -632.0, + "logps/rejected": -920.0, + "loss": 0.283, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.6875, + "rewards/margins": 2.8125, + "rewards/rejected": -7.5, + "step": 4950 + }, + { + "epoch": 0.2556371601597732, + "grad_norm": 8.430464205305235, + "learning_rate": 4.6401733214545277e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.828125, + "logps/chosen": -652.0, + "logps/rejected": -908.0, + "loss": 0.2625, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.6875, + "rewards/margins": 2.765625, + "rewards/rejected": -7.4375, + "step": 4960 + }, + { + "epoch": 0.25615255766009537, + "grad_norm": 8.457256499086482, + "learning_rate": 4.637845010806719e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.9375, + "logps/chosen": -684.0, + "logps/rejected": -992.0, + "loss": 0.2887, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.0, + "rewards/margins": 3.28125, + "rewards/rejected": -8.25, + "step": 4970 + }, + { + "epoch": 0.25666795516041746, + "grad_norm": 8.991439424358743, + "learning_rate": 4.6355097796565657e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.84375, + "logps/chosen": -652.0, + "logps/rejected": -928.0, + "loss": 0.3016, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.875, + "rewards/margins": 2.734375, + "rewards/rejected": -7.59375, + "step": 4980 + }, + { + "epoch": 0.2571833526607396, + "grad_norm": 6.2070749915841565, + "learning_rate": 4.6331676355635364e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.84375, + "logps/chosen": -616.0, + "logps/rejected": -916.0, + "loss": 0.2931, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.46875, + "rewards/margins": 2.9375, + "rewards/rejected": -7.40625, + "step": 4990 + }, + { + "epoch": 0.2576987501610617, + "grad_norm": 6.358232686791836, + "learning_rate": 4.630818586109477e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.765625, + "logps/chosen": -680.0, + "logps/rejected": -956.0, + "loss": 0.3106, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.125, + "rewards/margins": 2.953125, + "rewards/rejected": -8.0625, + "step": 5000 + }, + { + "epoch": 0.2576987501610617, + "eval_logits/chosen": -2.765625, + "eval_logits/rejected": -2.46875, + "eval_logps/chosen": -664.0, + "eval_logps/rejected": -976.0, + "eval_loss": 0.23780851066112518, + "eval_rewards/accuracies": 0.8990259766578674, + "eval_rewards/chosen": -4.9375, + "eval_rewards/margins": 3.28125, + "eval_rewards/rejected": -8.1875, + "eval_runtime": 3575.3671, + "eval_samples_per_second": 27.55, + "eval_steps_per_second": 0.431, + "step": 5000 + }, + { + "epoch": 0.25821414766138384, + "grad_norm": 7.696475589664955, + "learning_rate": 4.628462638898589e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.96875, + "logps/chosen": -640.0, + "logps/rejected": -872.0, + "loss": 0.2598, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.65625, + "rewards/margins": 2.53125, + "rewards/rejected": -7.1875, + "step": 5010 + }, + { + "epoch": 0.258729545161706, + "grad_norm": 8.082757072937039, + "learning_rate": 4.6260998015574026e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.0, + "logps/chosen": -668.0, + "logps/rejected": -876.0, + "loss": 0.3006, + "rewards/accuracies": 0.78125, + "rewards/chosen": -4.8125, + "rewards/margins": 2.21875, + "rewards/rejected": -7.0, + "step": 5020 + }, + { + "epoch": 0.2592449426620281, + "grad_norm": 8.330912056778539, + "learning_rate": 4.6237300817347503e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.890625, + "logps/chosen": -648.0, + "logps/rejected": -916.0, + "loss": 0.291, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.78125, + "rewards/margins": 2.84375, + "rewards/rejected": -7.59375, + "step": 5030 + }, + { + "epoch": 0.2597603401623502, + "grad_norm": 9.646784562563218, + "learning_rate": 4.621353487101746e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.890625, + "logps/chosen": -660.0, + "logps/rejected": -912.0, + "loss": 0.2896, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.875, + "rewards/margins": 2.546875, + "rewards/rejected": -7.4375, + "step": 5040 + }, + { + "epoch": 0.2602757376626723, + "grad_norm": 8.956900697514865, + "learning_rate": 4.618970025351758e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.90625, + "logps/chosen": -668.0, + "logps/rejected": -960.0, + "loss": 0.2815, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.875, + "rewards/margins": 2.8125, + "rewards/rejected": -7.6875, + "step": 5050 + }, + { + "epoch": 0.26079113516299446, + "grad_norm": 9.141682742321017, + "learning_rate": 4.616579704200385e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.15625, + "logps/chosen": -660.0, + "logps/rejected": -924.0, + "loss": 0.2972, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.875, + "rewards/margins": 2.75, + "rewards/rejected": -7.65625, + "step": 5060 + }, + { + "epoch": 0.2613065326633166, + "grad_norm": 7.931252428088198, + "learning_rate": 4.6141825313854285e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.96875, + "logps/chosen": -688.0, + "logps/rejected": -976.0, + "loss": 0.2693, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.1875, + "rewards/margins": 2.828125, + "rewards/rejected": -8.0, + "step": 5070 + }, + { + "epoch": 0.2618219301636387, + "grad_norm": 8.975677044349409, + "learning_rate": 4.611778514666872e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.828125, + "logps/chosen": -656.0, + "logps/rejected": -892.0, + "loss": 0.2852, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.75, + "rewards/margins": 2.4375, + "rewards/rejected": -7.1875, + "step": 5080 + }, + { + "epoch": 0.26233732766396084, + "grad_norm": 7.931157622506553, + "learning_rate": 4.6093676618268517e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.828125, + "logps/chosen": -624.0, + "logps/rejected": -912.0, + "loss": 0.2737, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.4375, + "rewards/margins": 3.0, + "rewards/rejected": -7.4375, + "step": 5090 + }, + { + "epoch": 0.26285272516428293, + "grad_norm": 13.295475882205588, + "learning_rate": 4.606949980669636e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.875, + "logps/chosen": -692.0, + "logps/rejected": -940.0, + "loss": 0.2753, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.125, + "rewards/margins": 2.703125, + "rewards/rejected": -7.8125, + "step": 5100 + }, + { + "epoch": 0.2633681226646051, + "grad_norm": 7.093230615949716, + "learning_rate": 4.604525479021595e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.90625, + "logps/chosen": -648.0, + "logps/rejected": -932.0, + "loss": 0.2843, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.8125, + "rewards/margins": 2.765625, + "rewards/rejected": -7.5625, + "step": 5110 + }, + { + "epoch": 0.2638835201649272, + "grad_norm": 7.091287912662896, + "learning_rate": 4.602094164731178e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.75, + "logps/chosen": -612.0, + "logps/rejected": -892.0, + "loss": 0.2968, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.59375, + "rewards/margins": 2.703125, + "rewards/rejected": -7.3125, + "step": 5120 + }, + { + "epoch": 0.2643989176652493, + "grad_norm": 7.932359763648786, + "learning_rate": 4.5996560456688887e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.859375, + "logps/chosen": -668.0, + "logps/rejected": -936.0, + "loss": 0.288, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.8125, + "rewards/margins": 2.796875, + "rewards/rejected": -7.59375, + "step": 5130 + }, + { + "epoch": 0.26491431516557146, + "grad_norm": 8.791142903887126, + "learning_rate": 4.5972111297272586e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.96875, + "logps/chosen": -656.0, + "logps/rejected": -956.0, + "loss": 0.2741, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.8125, + "rewards/margins": 3.046875, + "rewards/rejected": -7.84375, + "step": 5140 + }, + { + "epoch": 0.26542971266589355, + "grad_norm": 9.48377120889135, + "learning_rate": 4.594759424820822e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.921875, + "logps/chosen": -616.0, + "logps/rejected": -880.0, + "loss": 0.2945, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.4375, + "rewards/margins": 2.84375, + "rewards/rejected": -7.28125, + "step": 5150 + }, + { + "epoch": 0.2659451101662157, + "grad_norm": 7.486610797440326, + "learning_rate": 4.592300938886089e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.953125, + "logps/chosen": -672.0, + "logps/rejected": -920.0, + "loss": 0.2792, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.84375, + "rewards/margins": 2.625, + "rewards/rejected": -7.46875, + "step": 5160 + }, + { + "epoch": 0.26646050766653784, + "grad_norm": 8.724849577259555, + "learning_rate": 4.589835679881521e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.796875, + "logps/chosen": -644.0, + "logps/rejected": -944.0, + "loss": 0.2593, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.6875, + "rewards/margins": 3.046875, + "rewards/rejected": -7.75, + "step": 5170 + }, + { + "epoch": 0.26697590516685993, + "grad_norm": 8.603925121536653, + "learning_rate": 4.587363655787504e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.03125, + "logps/chosen": -664.0, + "logps/rejected": -992.0, + "loss": 0.2624, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.9375, + "rewards/margins": 3.328125, + "rewards/rejected": -8.25, + "step": 5180 + }, + { + "epoch": 0.2674913026671821, + "grad_norm": 6.466770324434367, + "learning_rate": 4.5848848746063276e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.96875, + "logps/chosen": -672.0, + "logps/rejected": -956.0, + "loss": 0.2861, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.09375, + "rewards/margins": 2.953125, + "rewards/rejected": -8.0625, + "step": 5190 + }, + { + "epoch": 0.26800670016750416, + "grad_norm": 6.241005342499919, + "learning_rate": 4.5823993443621496e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.96875, + "logps/chosen": -688.0, + "logps/rejected": -948.0, + "loss": 0.2772, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.03125, + "rewards/margins": 2.90625, + "rewards/rejected": -7.9375, + "step": 5200 + }, + { + "epoch": 0.2685220976678263, + "grad_norm": 8.337525065069656, + "learning_rate": 4.5799070731009787e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.921875, + "logps/chosen": -648.0, + "logps/rejected": -948.0, + "loss": 0.2884, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.6875, + "rewards/margins": 3.0625, + "rewards/rejected": -7.75, + "step": 5210 + }, + { + "epoch": 0.26903749516814845, + "grad_norm": 9.657986032284995, + "learning_rate": 4.5774080688906445e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.78125, + "logps/chosen": -660.0, + "logps/rejected": -896.0, + "loss": 0.2997, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.875, + "rewards/margins": 2.53125, + "rewards/rejected": -7.375, + "step": 5220 + }, + { + "epoch": 0.26955289266847055, + "grad_norm": 6.886728590627173, + "learning_rate": 4.574902339820772e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.96875, + "logps/chosen": -684.0, + "logps/rejected": -932.0, + "loss": 0.2695, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.1875, + "rewards/margins": 2.546875, + "rewards/rejected": -7.75, + "step": 5230 + }, + { + "epoch": 0.2700682901687927, + "grad_norm": 9.321002797352381, + "learning_rate": 4.5723898940027555e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.890625, + "logps/chosen": -664.0, + "logps/rejected": -928.0, + "loss": 0.281, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.875, + "rewards/margins": 2.84375, + "rewards/rejected": -7.71875, + "step": 5240 + }, + { + "epoch": 0.2705836876691148, + "grad_norm": 9.645141039557183, + "learning_rate": 4.569870739569732e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.8125, + "logps/chosen": -712.0, + "logps/rejected": -1016.0, + "loss": 0.2739, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.4375, + "rewards/margins": 3.140625, + "rewards/rejected": -8.5625, + "step": 5250 + }, + { + "epoch": 0.2710990851694369, + "grad_norm": 9.555626022823379, + "learning_rate": 4.5673448846765575e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.90625, + "logps/chosen": -708.0, + "logps/rejected": -964.0, + "loss": 0.2858, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.3125, + "rewards/margins": 2.890625, + "rewards/rejected": -8.1875, + "step": 5260 + }, + { + "epoch": 0.27161448266975907, + "grad_norm": 8.328527118573975, + "learning_rate": 4.5648123374997726e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.65625, + "logps/chosen": -668.0, + "logps/rejected": -936.0, + "loss": 0.2846, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.78125, + "rewards/margins": 2.671875, + "rewards/rejected": -7.46875, + "step": 5270 + }, + { + "epoch": 0.27212988017008116, + "grad_norm": 10.01220772035318, + "learning_rate": 4.5622731062375897e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.859375, + "logps/chosen": -664.0, + "logps/rejected": -912.0, + "loss": 0.2752, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.84375, + "rewards/margins": 2.53125, + "rewards/rejected": -7.375, + "step": 5280 + }, + { + "epoch": 0.2726452776704033, + "grad_norm": 11.31412533175184, + "learning_rate": 4.559727199109852e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.078125, + "logps/chosen": -696.0, + "logps/rejected": -928.0, + "loss": 0.2766, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -5.1875, + "rewards/margins": 2.453125, + "rewards/rejected": -7.625, + "step": 5290 + }, + { + "epoch": 0.2731606751707254, + "grad_norm": 8.929967329028223, + "learning_rate": 4.5571746243580153e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.9375, + "logps/chosen": -656.0, + "logps/rejected": -948.0, + "loss": 0.258, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.9375, + "rewards/margins": 2.921875, + "rewards/rejected": -7.84375, + "step": 5300 + }, + { + "epoch": 0.27367607267104754, + "grad_norm": 6.309324286786818, + "learning_rate": 4.5546153902451223e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.0, + "logps/chosen": -628.0, + "logps/rejected": -880.0, + "loss": 0.2724, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.59375, + "rewards/margins": 2.421875, + "rewards/rejected": -7.0, + "step": 5310 + }, + { + "epoch": 0.2741914701713697, + "grad_norm": 7.035305969000009, + "learning_rate": 4.5520495050557693e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.03125, + "logps/chosen": -664.0, + "logps/rejected": -936.0, + "loss": 0.2783, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.84375, + "rewards/margins": 2.8125, + "rewards/rejected": -7.6875, + "step": 5320 + }, + { + "epoch": 0.2747068676716918, + "grad_norm": 6.580764574500793, + "learning_rate": 4.549476977096084e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.921875, + "logps/chosen": -704.0, + "logps/rejected": -956.0, + "loss": 0.2723, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.28125, + "rewards/margins": 2.75, + "rewards/rejected": -8.0, + "step": 5330 + }, + { + "epoch": 0.2752222651720139, + "grad_norm": 6.544935012598318, + "learning_rate": 4.5468978146936986e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.90625, + "logps/chosen": -632.0, + "logps/rejected": -920.0, + "loss": 0.2759, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.625, + "rewards/margins": 2.9375, + "rewards/rejected": -7.5625, + "step": 5340 + }, + { + "epoch": 0.275737662672336, + "grad_norm": 13.921761797887177, + "learning_rate": 4.544312026197722e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.859375, + "logps/chosen": -672.0, + "logps/rejected": -984.0, + "loss": 0.2946, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.125, + "rewards/margins": 3.1875, + "rewards/rejected": -8.3125, + "step": 5350 + }, + { + "epoch": 0.27625306017265816, + "grad_norm": 6.9818541202118345, + "learning_rate": 4.54171961997871e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.84375, + "logps/chosen": -756.0, + "logps/rejected": -1024.0, + "loss": 0.2905, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.8125, + "rewards/margins": 2.828125, + "rewards/rejected": -8.625, + "step": 5360 + }, + { + "epoch": 0.2767684576729803, + "grad_norm": 7.6536865300718775, + "learning_rate": 4.5391206044286453e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.734375, + "logps/chosen": -648.0, + "logps/rejected": -912.0, + "loss": 0.2887, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.96875, + "rewards/margins": 2.671875, + "rewards/rejected": -7.625, + "step": 5370 + }, + { + "epoch": 0.2772838551733024, + "grad_norm": 7.133047620457982, + "learning_rate": 4.5365149879609024e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.84375, + "logps/chosen": -704.0, + "logps/rejected": -932.0, + "loss": 0.2672, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.34375, + "rewards/margins": 2.40625, + "rewards/rejected": -7.75, + "step": 5380 + }, + { + "epoch": 0.27779925267362454, + "grad_norm": 10.383559116238105, + "learning_rate": 4.533902779010224e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.0, + "logps/chosen": -760.0, + "logps/rejected": -1048.0, + "loss": 0.2711, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.75, + "rewards/margins": 3.0, + "rewards/rejected": -8.75, + "step": 5390 + }, + { + "epoch": 0.27831465017394663, + "grad_norm": 8.15038297376314, + "learning_rate": 4.5312839860326967e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.125, + "logps/chosen": -764.0, + "logps/rejected": -1056.0, + "loss": 0.2721, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.8125, + "rewards/margins": 2.984375, + "rewards/rejected": -8.8125, + "step": 5400 + }, + { + "epoch": 0.2788300476742688, + "grad_norm": 7.64584748175223, + "learning_rate": 4.5286586175057175e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.96875, + "logps/chosen": -672.0, + "logps/rejected": -968.0, + "loss": 0.2971, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.15625, + "rewards/margins": 2.90625, + "rewards/rejected": -8.0625, + "step": 5410 + }, + { + "epoch": 0.2793454451745909, + "grad_norm": 7.104353000195838, + "learning_rate": 4.526026681927971e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -696.0, + "logps/rejected": -952.0, + "loss": 0.2485, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.84375, + "rewards/margins": 2.890625, + "rewards/rejected": -7.75, + "step": 5420 + }, + { + "epoch": 0.279860842674913, + "grad_norm": 7.0753582866147795, + "learning_rate": 4.523388187819398e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.0, + "logps/chosen": -684.0, + "logps/rejected": -928.0, + "loss": 0.2788, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.09375, + "rewards/margins": 2.53125, + "rewards/rejected": -7.625, + "step": 5430 + }, + { + "epoch": 0.28037624017523516, + "grad_norm": 6.717328187008004, + "learning_rate": 4.520743143721173e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.984375, + "logps/chosen": -636.0, + "logps/rejected": -916.0, + "loss": 0.2848, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.59375, + "rewards/margins": 3.140625, + "rewards/rejected": -7.75, + "step": 5440 + }, + { + "epoch": 0.28089163767555725, + "grad_norm": 7.811390450853758, + "learning_rate": 4.5180915581956735e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.890625, + "logps/chosen": -720.0, + "logps/rejected": -976.0, + "loss": 0.2768, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.46875, + "rewards/margins": 2.765625, + "rewards/rejected": -8.25, + "step": 5450 + }, + { + "epoch": 0.2814070351758794, + "grad_norm": 6.9400683968941586, + "learning_rate": 4.515433439826449e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.921875, + "logps/chosen": -616.0, + "logps/rejected": -916.0, + "loss": 0.2824, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.5625, + "rewards/margins": 2.9375, + "rewards/rejected": -7.5, + "step": 5460 + }, + { + "epoch": 0.28192243267620154, + "grad_norm": 8.836636939405864, + "learning_rate": 4.5127687972182005e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.0, + "logps/chosen": -648.0, + "logps/rejected": -904.0, + "loss": 0.2698, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -4.5625, + "rewards/margins": 2.640625, + "rewards/rejected": -7.1875, + "step": 5470 + }, + { + "epoch": 0.28243783017652363, + "grad_norm": 8.58464408222779, + "learning_rate": 4.5100976389967483e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.0, + "logps/chosen": -684.0, + "logps/rejected": -988.0, + "loss": 0.2614, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.90625, + "rewards/margins": 3.125, + "rewards/rejected": -8.0, + "step": 5480 + }, + { + "epoch": 0.2829532276768458, + "grad_norm": 8.589681825830802, + "learning_rate": 4.5074199738090036e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.0625, + "logps/chosen": -692.0, + "logps/rejected": -1008.0, + "loss": 0.265, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.125, + "rewards/margins": 3.40625, + "rewards/rejected": -8.5, + "step": 5490 + }, + { + "epoch": 0.28346862517716787, + "grad_norm": 9.96951306342504, + "learning_rate": 4.504735810322941e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.828125, + "logps/chosen": -616.0, + "logps/rejected": -912.0, + "loss": 0.2921, + "rewards/accuracies": 0.84375, + "rewards/chosen": -4.5, + "rewards/margins": 2.953125, + "rewards/rejected": -7.46875, + "step": 5500 + }, + { + "epoch": 0.28398402267749, + "grad_norm": 7.764194736941346, + "learning_rate": 4.502045157227573e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.984375, + "logps/chosen": -668.0, + "logps/rejected": -928.0, + "loss": 0.287, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.90625, + "rewards/margins": 2.625, + "rewards/rejected": -7.53125, + "step": 5510 + }, + { + "epoch": 0.28449942017781216, + "grad_norm": 10.119837551539739, + "learning_rate": 4.499348023232918e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.828125, + "logps/chosen": -704.0, + "logps/rejected": -988.0, + "loss": 0.2555, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.375, + "rewards/margins": 2.90625, + "rewards/rejected": -8.25, + "step": 5520 + }, + { + "epoch": 0.28501481767813425, + "grad_norm": 7.622484496982999, + "learning_rate": 4.4966444170699747e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.796875, + "logps/chosen": -736.0, + "logps/rejected": -1072.0, + "loss": 0.2669, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.5625, + "rewards/margins": 3.453125, + "rewards/rejected": -9.0, + "step": 5530 + }, + { + "epoch": 0.2855302151784564, + "grad_norm": 7.058001909625613, + "learning_rate": 4.4939343474906945e-07, + "logits/chosen": -3.171875, + "logits/rejected": -3.0, + "logps/chosen": -704.0, + "logps/rejected": -956.0, + "loss": 0.2822, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.125, + "rewards/margins": 2.671875, + "rewards/rejected": -7.8125, + "step": 5540 + }, + { + "epoch": 0.2860456126787785, + "grad_norm": 7.937554312014191, + "learning_rate": 4.491217823267949e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.796875, + "logps/chosen": -660.0, + "logps/rejected": -944.0, + "loss": 0.2846, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.8125, + "rewards/margins": 2.890625, + "rewards/rejected": -7.71875, + "step": 5550 + }, + { + "epoch": 0.28656101017910063, + "grad_norm": 10.617673887184113, + "learning_rate": 4.4884948531955063e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.921875, + "logps/chosen": -712.0, + "logps/rejected": -968.0, + "loss": 0.2616, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.4375, + "rewards/margins": 2.65625, + "rewards/rejected": -8.0625, + "step": 5560 + }, + { + "epoch": 0.2870764076794228, + "grad_norm": 8.667980172784015, + "learning_rate": 4.4857654460880003e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.0, + "logps/chosen": -656.0, + "logps/rejected": -1004.0, + "loss": 0.2596, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.0, + "rewards/margins": 3.4375, + "rewards/rejected": -8.4375, + "step": 5570 + }, + { + "epoch": 0.28759180517974486, + "grad_norm": 10.95837034854095, + "learning_rate": 4.483029610780902e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.953125, + "logps/chosen": -664.0, + "logps/rejected": -936.0, + "loss": 0.2929, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.875, + "rewards/margins": 2.890625, + "rewards/rejected": -7.78125, + "step": 5580 + }, + { + "epoch": 0.288107202680067, + "grad_norm": 8.840899876603244, + "learning_rate": 4.480287356130492e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.984375, + "logps/chosen": -696.0, + "logps/rejected": -968.0, + "loss": 0.2856, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.21875, + "rewards/margins": 2.8125, + "rewards/rejected": -8.0, + "step": 5590 + }, + { + "epoch": 0.2886226001803891, + "grad_norm": 8.386571433178093, + "learning_rate": 4.4775386910138315e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.875, + "logps/chosen": -704.0, + "logps/rejected": -964.0, + "loss": 0.273, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.46875, + "rewards/margins": 2.71875, + "rewards/rejected": -8.1875, + "step": 5600 + }, + { + "epoch": 0.28913799768071125, + "grad_norm": 7.741367278166939, + "learning_rate": 4.474783624328731e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.875, + "logps/chosen": -664.0, + "logps/rejected": -936.0, + "loss": 0.2612, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.03125, + "rewards/margins": 2.875, + "rewards/rejected": -7.90625, + "step": 5610 + }, + { + "epoch": 0.2896533951810334, + "grad_norm": 7.412075079522674, + "learning_rate": 4.472022164993727e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.875, + "logps/chosen": -708.0, + "logps/rejected": -976.0, + "loss": 0.2864, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.28125, + "rewards/margins": 2.984375, + "rewards/rejected": -8.25, + "step": 5620 + }, + { + "epoch": 0.2901687926813555, + "grad_norm": 6.6927441644632735, + "learning_rate": 4.4692543219480476e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.953125, + "logps/chosen": -768.0, + "logps/rejected": -1056.0, + "loss": 0.2684, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.8125, + "rewards/margins": 3.09375, + "rewards/rejected": -8.9375, + "step": 5630 + }, + { + "epoch": 0.2906841901816776, + "grad_norm": 7.929265775650355, + "learning_rate": 4.466480104151587e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.71875, + "logps/chosen": -724.0, + "logps/rejected": -1048.0, + "loss": 0.2543, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.46875, + "rewards/margins": 3.359375, + "rewards/rejected": -8.8125, + "step": 5640 + }, + { + "epoch": 0.2911995876819997, + "grad_norm": 7.070113993708752, + "learning_rate": 4.463699520584875e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.84375, + "logps/chosen": -740.0, + "logps/rejected": -1032.0, + "loss": 0.2864, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.6875, + "rewards/margins": 2.734375, + "rewards/rejected": -8.4375, + "step": 5650 + }, + { + "epoch": 0.29171498518232186, + "grad_norm": 7.567827414720724, + "learning_rate": 4.460912580249048e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.59375, + "logps/chosen": -708.0, + "logps/rejected": -1072.0, + "loss": 0.2558, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.40625, + "rewards/margins": 3.484375, + "rewards/rejected": -8.9375, + "step": 5660 + }, + { + "epoch": 0.292230382682644, + "grad_norm": 7.096779042746862, + "learning_rate": 4.4581192921658193e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.765625, + "logps/chosen": -664.0, + "logps/rejected": -960.0, + "loss": 0.2959, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.9375, + "rewards/margins": 3.0, + "rewards/rejected": -7.9375, + "step": 5670 + }, + { + "epoch": 0.2927457801829661, + "grad_norm": 6.045077677065587, + "learning_rate": 4.4553196653774546e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.75, + "logps/chosen": -664.0, + "logps/rejected": -968.0, + "loss": 0.2536, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.03125, + "rewards/margins": 3.125, + "rewards/rejected": -8.125, + "step": 5680 + }, + { + "epoch": 0.29326117768328824, + "grad_norm": 10.105163398786202, + "learning_rate": 4.4525137089467335e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.890625, + "logps/chosen": -732.0, + "logps/rejected": -1024.0, + "loss": 0.2953, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.625, + "rewards/margins": 2.9375, + "rewards/rejected": -8.5625, + "step": 5690 + }, + { + "epoch": 0.29377657518361033, + "grad_norm": 8.121217614102896, + "learning_rate": 4.4497014319569297e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.71875, + "logps/chosen": -756.0, + "logps/rejected": -1024.0, + "loss": 0.2998, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.875, + "rewards/margins": 2.796875, + "rewards/rejected": -8.6875, + "step": 5700 + }, + { + "epoch": 0.2942919726839325, + "grad_norm": 8.943026432823865, + "learning_rate": 4.4468828435117755e-07, + "logits/chosen": -2.90625, + "logits/rejected": -2.515625, + "logps/chosen": -700.0, + "logps/rejected": -1064.0, + "loss": 0.2453, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.1875, + "rewards/margins": 3.546875, + "rewards/rejected": -8.75, + "step": 5710 + }, + { + "epoch": 0.2948073701842546, + "grad_norm": 7.928472272867718, + "learning_rate": 4.4440579527354345e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.625, + "logps/chosen": -644.0, + "logps/rejected": -944.0, + "loss": 0.2473, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.75, + "rewards/margins": 3.28125, + "rewards/rejected": -8.0, + "step": 5720 + }, + { + "epoch": 0.2953227676845767, + "grad_norm": 5.895795537739257, + "learning_rate": 4.4412267687724714e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.96875, + "logps/chosen": -700.0, + "logps/rejected": -960.0, + "loss": 0.272, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.3125, + "rewards/margins": 2.734375, + "rewards/rejected": -8.0625, + "step": 5730 + }, + { + "epoch": 0.29583816518489886, + "grad_norm": 7.578681034366449, + "learning_rate": 4.4383893007878245e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.734375, + "logps/chosen": -736.0, + "logps/rejected": -1016.0, + "loss": 0.2774, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.59375, + "rewards/margins": 2.9375, + "rewards/rejected": -8.5, + "step": 5740 + }, + { + "epoch": 0.29635356268522095, + "grad_norm": 26.212203349312606, + "learning_rate": 4.4355455579667724e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.796875, + "logps/chosen": -708.0, + "logps/rejected": -1020.0, + "loss": 0.2785, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.375, + "rewards/margins": 3.328125, + "rewards/rejected": -8.6875, + "step": 5750 + }, + { + "epoch": 0.2968689601855431, + "grad_norm": 7.512490186761141, + "learning_rate": 4.4326955495149074e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.09375, + "logps/chosen": -688.0, + "logps/rejected": -956.0, + "loss": 0.2809, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.21875, + "rewards/margins": 2.59375, + "rewards/rejected": -7.8125, + "step": 5760 + }, + { + "epoch": 0.29738435768586524, + "grad_norm": 17.925348826294375, + "learning_rate": 4.4298392846581044e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.78125, + "logps/chosen": -672.0, + "logps/rejected": -968.0, + "loss": 0.3006, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.0, + "rewards/margins": 3.015625, + "rewards/rejected": -8.0, + "step": 5770 + }, + { + "epoch": 0.29789975518618733, + "grad_norm": 6.240568848887286, + "learning_rate": 4.4269767726424914e-07, + "logits/chosen": -2.9375, + "logits/rejected": -2.75, + "logps/chosen": -676.0, + "logps/rejected": -956.0, + "loss": 0.2746, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.0625, + "rewards/margins": 2.84375, + "rewards/rejected": -7.90625, + "step": 5780 + }, + { + "epoch": 0.2984151526865095, + "grad_norm": 7.15250648504887, + "learning_rate": 4.424108022734417e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.765625, + "logps/chosen": -676.0, + "logps/rejected": -988.0, + "loss": 0.2785, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.03125, + "rewards/margins": 3.25, + "rewards/rejected": -8.25, + "step": 5790 + }, + { + "epoch": 0.29893055018683157, + "grad_norm": 6.715733081415759, + "learning_rate": 4.4212330442204265e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.78125, + "logps/chosen": -696.0, + "logps/rejected": -956.0, + "loss": 0.266, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.21875, + "rewards/margins": 2.734375, + "rewards/rejected": -7.96875, + "step": 5800 + }, + { + "epoch": 0.2994459476871537, + "grad_norm": 7.0730873207245, + "learning_rate": 4.4183518464072255e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.125, + "logps/chosen": -672.0, + "logps/rejected": -904.0, + "loss": 0.2749, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -4.96875, + "rewards/margins": 2.390625, + "rewards/rejected": -7.375, + "step": 5810 + }, + { + "epoch": 0.29996134518747586, + "grad_norm": 8.296901796153508, + "learning_rate": 4.415464438621653e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.984375, + "logps/chosen": -668.0, + "logps/rejected": -912.0, + "loss": 0.294, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -4.9375, + "rewards/margins": 2.515625, + "rewards/rejected": -7.4375, + "step": 5820 + }, + { + "epoch": 0.30047674268779795, + "grad_norm": 6.798707889428047, + "learning_rate": 4.41257083021065e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.015625, + "logps/chosen": -660.0, + "logps/rejected": -940.0, + "loss": 0.2539, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.875, + "rewards/margins": 2.828125, + "rewards/rejected": -7.71875, + "step": 5830 + }, + { + "epoch": 0.3009921401881201, + "grad_norm": 7.997797897332838, + "learning_rate": 4.409671030541231e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.046875, + "logps/chosen": -640.0, + "logps/rejected": -904.0, + "loss": 0.2756, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.65625, + "rewards/margins": 2.765625, + "rewards/rejected": -7.40625, + "step": 5840 + }, + { + "epoch": 0.3015075376884422, + "grad_norm": 10.642298301949623, + "learning_rate": 4.4067650490004516e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.796875, + "logps/chosen": -704.0, + "logps/rejected": -968.0, + "loss": 0.2899, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.625, + "rewards/margins": 2.578125, + "rewards/rejected": -8.25, + "step": 5850 + }, + { + "epoch": 0.30202293518876433, + "grad_norm": 9.847022214794706, + "learning_rate": 4.4038528949953794e-07, + "logits/chosen": -3.15625, + "logits/rejected": -3.015625, + "logps/chosen": -748.0, + "logps/rejected": -1032.0, + "loss": 0.289, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.59375, + "rewards/margins": 3.0625, + "rewards/rejected": -8.6875, + "step": 5860 + }, + { + "epoch": 0.3025383326890865, + "grad_norm": 8.562557228698294, + "learning_rate": 4.4009345779530625e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.03125, + "logps/chosen": -700.0, + "logps/rejected": -936.0, + "loss": 0.2664, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.34375, + "rewards/margins": 2.40625, + "rewards/rejected": -7.75, + "step": 5870 + }, + { + "epoch": 0.30305373018940857, + "grad_norm": 9.769554389471285, + "learning_rate": 4.398010107320501e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.953125, + "logps/chosen": -712.0, + "logps/rejected": -984.0, + "loss": 0.2853, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.375, + "rewards/margins": 2.84375, + "rewards/rejected": -8.1875, + "step": 5880 + }, + { + "epoch": 0.3035691276897307, + "grad_norm": 15.61824235089561, + "learning_rate": 4.3950794925646127e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.921875, + "logps/chosen": -708.0, + "logps/rejected": -964.0, + "loss": 0.2742, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.21875, + "rewards/margins": 2.75, + "rewards/rejected": -7.96875, + "step": 5890 + }, + { + "epoch": 0.3040845251900528, + "grad_norm": 8.763783959267723, + "learning_rate": 4.392142743172207e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.9375, + "logps/chosen": -644.0, + "logps/rejected": -964.0, + "loss": 0.2615, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.6875, + "rewards/margins": 3.53125, + "rewards/rejected": -8.25, + "step": 5900 + }, + { + "epoch": 0.30459992269037495, + "grad_norm": 8.063021994312658, + "learning_rate": 4.389199868649951e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.984375, + "logps/chosen": -688.0, + "logps/rejected": -952.0, + "loss": 0.2727, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -4.96875, + "rewards/margins": 2.828125, + "rewards/rejected": -7.8125, + "step": 5910 + }, + { + "epoch": 0.3051153201906971, + "grad_norm": 8.369162744368877, + "learning_rate": 4.386250878524339e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.15625, + "logps/chosen": -692.0, + "logps/rejected": -968.0, + "loss": 0.274, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.125, + "rewards/margins": 2.84375, + "rewards/rejected": -7.96875, + "step": 5920 + }, + { + "epoch": 0.3056307176910192, + "grad_norm": 9.051718446960804, + "learning_rate": 4.383295782341664e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.921875, + "logps/chosen": -688.0, + "logps/rejected": -956.0, + "loss": 0.2632, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.1875, + "rewards/margins": 2.9375, + "rewards/rejected": -8.125, + "step": 5930 + }, + { + "epoch": 0.30614611519134133, + "grad_norm": 6.942800357403213, + "learning_rate": 4.3803345896679844e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.875, + "logps/chosen": -720.0, + "logps/rejected": -980.0, + "loss": 0.2679, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.59375, + "rewards/margins": 2.71875, + "rewards/rejected": -8.3125, + "step": 5940 + }, + { + "epoch": 0.3066615126916634, + "grad_norm": 7.236034509411131, + "learning_rate": 4.3773673100890936e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.796875, + "logps/chosen": -652.0, + "logps/rejected": -928.0, + "loss": 0.2784, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.90625, + "rewards/margins": 2.890625, + "rewards/rejected": -7.78125, + "step": 5950 + }, + { + "epoch": 0.30717691019198556, + "grad_norm": 7.577472041742464, + "learning_rate": 4.3743939532104884e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.09375, + "logps/chosen": -656.0, + "logps/rejected": -916.0, + "loss": 0.2803, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.65625, + "rewards/margins": 2.90625, + "rewards/rejected": -7.5625, + "step": 5960 + }, + { + "epoch": 0.3076923076923077, + "grad_norm": 6.917855353522115, + "learning_rate": 4.371414528657341e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.9375, + "logps/chosen": -668.0, + "logps/rejected": -892.0, + "loss": 0.2605, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -4.75, + "rewards/margins": 2.65625, + "rewards/rejected": -7.40625, + "step": 5970 + }, + { + "epoch": 0.3082077051926298, + "grad_norm": 7.506055235069546, + "learning_rate": 4.3684290460744633e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.125, + "logps/chosen": -688.0, + "logps/rejected": -956.0, + "loss": 0.2644, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -4.96875, + "rewards/margins": 2.734375, + "rewards/rejected": -7.71875, + "step": 5980 + }, + { + "epoch": 0.30872310269295195, + "grad_norm": 31.204795580010764, + "learning_rate": 4.365437515126278e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.765625, + "logps/chosen": -744.0, + "logps/rejected": -1064.0, + "loss": 0.2856, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.71875, + "rewards/margins": 3.4375, + "rewards/rejected": -9.125, + "step": 5990 + }, + { + "epoch": 0.30923850019327404, + "grad_norm": 10.831676224089627, + "learning_rate": 4.3624399454967894e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.0625, + "logps/chosen": -704.0, + "logps/rejected": -992.0, + "loss": 0.275, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.25, + "rewards/margins": 3.09375, + "rewards/rejected": -8.375, + "step": 6000 + }, + { + "epoch": 0.3097538976935962, + "grad_norm": 8.370674668820849, + "learning_rate": 4.3594363468895465e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.984375, + "logps/chosen": -692.0, + "logps/rejected": -940.0, + "loss": 0.2833, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.1875, + "rewards/margins": 2.609375, + "rewards/rejected": -7.78125, + "step": 6010 + }, + { + "epoch": 0.3102692951939183, + "grad_norm": 10.45119321771906, + "learning_rate": 4.356426729027618e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.859375, + "logps/chosen": -712.0, + "logps/rejected": -928.0, + "loss": 0.2901, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.21875, + "rewards/margins": 2.46875, + "rewards/rejected": -7.6875, + "step": 6020 + }, + { + "epoch": 0.3107846926942404, + "grad_norm": 8.542234047087366, + "learning_rate": 4.353411101653557e-07, + "logits/chosen": -3.140625, + "logits/rejected": -3.0625, + "logps/chosen": -692.0, + "logps/rejected": -956.0, + "loss": 0.2769, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.21875, + "rewards/margins": 2.796875, + "rewards/rejected": -8.0, + "step": 6030 + }, + { + "epoch": 0.31130009019456256, + "grad_norm": 7.9641330691436485, + "learning_rate": 4.350389474529368e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.828125, + "logps/chosen": -740.0, + "logps/rejected": -988.0, + "loss": 0.2658, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.65625, + "rewards/margins": 2.65625, + "rewards/rejected": -8.3125, + "step": 6040 + }, + { + "epoch": 0.31181548769488465, + "grad_norm": 9.201384879601473, + "learning_rate": 4.34736185743648e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.78125, + "logps/chosen": -720.0, + "logps/rejected": -1048.0, + "loss": 0.2642, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.5, + "rewards/margins": 3.296875, + "rewards/rejected": -8.8125, + "step": 6050 + }, + { + "epoch": 0.3123308851952068, + "grad_norm": 8.296558889581098, + "learning_rate": 4.344328260175712e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.78125, + "logps/chosen": -692.0, + "logps/rejected": -932.0, + "loss": 0.2768, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.15625, + "rewards/margins": 2.625, + "rewards/rejected": -7.78125, + "step": 6060 + }, + { + "epoch": 0.31284628269552894, + "grad_norm": 6.290621936052438, + "learning_rate": 4.341288692567241e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.75, + "logps/chosen": -672.0, + "logps/rejected": -928.0, + "loss": 0.2621, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.0, + "rewards/margins": 2.734375, + "rewards/rejected": -7.75, + "step": 6070 + }, + { + "epoch": 0.31336168019585103, + "grad_norm": 7.59735171271875, + "learning_rate": 4.338243164450571e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.71875, + "logps/chosen": -752.0, + "logps/rejected": -1020.0, + "loss": 0.264, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.6875, + "rewards/margins": 2.921875, + "rewards/rejected": -8.625, + "step": 6080 + }, + { + "epoch": 0.3138770776961732, + "grad_norm": 10.264121771275347, + "learning_rate": 4.335191685684501e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.765625, + "logps/chosen": -728.0, + "logps/rejected": -1080.0, + "loss": 0.2609, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.46875, + "rewards/margins": 3.765625, + "rewards/rejected": -9.25, + "step": 6090 + }, + { + "epoch": 0.3143924751964953, + "grad_norm": 8.548506762924381, + "learning_rate": 4.3321342661470937e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.03125, + "logps/chosen": -736.0, + "logps/rejected": -1024.0, + "loss": 0.2692, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.53125, + "rewards/margins": 2.984375, + "rewards/rejected": -8.5, + "step": 6100 + }, + { + "epoch": 0.3149078726968174, + "grad_norm": 8.3353687289671, + "learning_rate": 4.329070915735642e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.90625, + "logps/chosen": -696.0, + "logps/rejected": -992.0, + "loss": 0.2544, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.28125, + "rewards/margins": 3.125, + "rewards/rejected": -8.4375, + "step": 6110 + }, + { + "epoch": 0.31542327019713956, + "grad_norm": 6.141338271870342, + "learning_rate": 4.3260016443666386e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.859375, + "logps/chosen": -720.0, + "logps/rejected": -972.0, + "loss": 0.2737, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.4375, + "rewards/margins": 2.703125, + "rewards/rejected": -8.125, + "step": 6120 + }, + { + "epoch": 0.31593866769746165, + "grad_norm": 9.74408552657053, + "learning_rate": 4.322926461975742e-07, + "logits/chosen": -2.84375, + "logits/rejected": -2.515625, + "logps/chosen": -736.0, + "logps/rejected": -1020.0, + "loss": 0.2611, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.8125, + "rewards/margins": 2.921875, + "rewards/rejected": -8.75, + "step": 6130 + }, + { + "epoch": 0.3164540651977838, + "grad_norm": 8.5281974794959, + "learning_rate": 4.319845378517748e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.71875, + "logps/chosen": -712.0, + "logps/rejected": -1024.0, + "loss": 0.255, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.5, + "rewards/margins": 3.28125, + "rewards/rejected": -8.75, + "step": 6140 + }, + { + "epoch": 0.31696946269810594, + "grad_norm": 8.86528903743889, + "learning_rate": 4.316758403966551e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.75, + "logps/chosen": -760.0, + "logps/rejected": -1032.0, + "loss": 0.2884, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.8125, + "rewards/margins": 2.78125, + "rewards/rejected": -8.625, + "step": 6150 + }, + { + "epoch": 0.31748486019842803, + "grad_norm": 8.175373461683948, + "learning_rate": 4.3136655483151194e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.765625, + "logps/chosen": -796.0, + "logps/rejected": -1096.0, + "loss": 0.2525, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.21875, + "rewards/margins": 2.96875, + "rewards/rejected": -9.1875, + "step": 6160 + }, + { + "epoch": 0.3180002576987502, + "grad_norm": 8.559101631427732, + "learning_rate": 4.310566821575459e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.921875, + "logps/chosen": -780.0, + "logps/rejected": -1072.0, + "loss": 0.2809, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0, + "rewards/margins": 3.03125, + "rewards/rejected": -9.0, + "step": 6170 + }, + { + "epoch": 0.31851565519907227, + "grad_norm": 5.847780470036177, + "learning_rate": 4.307462233778578e-07, + "logits/chosen": -3.390625, + "logits/rejected": -2.921875, + "logps/chosen": -752.0, + "logps/rejected": -1008.0, + "loss": 0.2755, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.75, + "rewards/margins": 2.796875, + "rewards/rejected": -8.5625, + "step": 6180 + }, + { + "epoch": 0.3190310526993944, + "grad_norm": 9.08494070542008, + "learning_rate": 4.304351794974461e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.9375, + "logps/chosen": -724.0, + "logps/rejected": -1040.0, + "loss": 0.2732, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.65625, + "rewards/margins": 3.234375, + "rewards/rejected": -8.875, + "step": 6190 + }, + { + "epoch": 0.31954645019971656, + "grad_norm": 8.622890398704289, + "learning_rate": 4.3012355152320326e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.21875, + "logps/chosen": -760.0, + "logps/rejected": -1016.0, + "loss": 0.274, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.84375, + "rewards/margins": 2.640625, + "rewards/rejected": -8.5, + "step": 6200 + }, + { + "epoch": 0.32006184770003865, + "grad_norm": 7.565013289589243, + "learning_rate": 4.2981134046391243e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.046875, + "logps/chosen": -744.0, + "logps/rejected": -1040.0, + "loss": 0.2609, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.75, + "rewards/margins": 2.9375, + "rewards/rejected": -8.6875, + "step": 6210 + }, + { + "epoch": 0.3205772452003608, + "grad_norm": 6.995509841024269, + "learning_rate": 4.2949854733024433e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.1875, + "logps/chosen": -712.0, + "logps/rejected": -1012.0, + "loss": 0.252, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.4375, + "rewards/margins": 3.140625, + "rewards/rejected": -8.5625, + "step": 6220 + }, + { + "epoch": 0.3210926427006829, + "grad_norm": 10.816112852913479, + "learning_rate": 4.2918517313475395e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.65625, + "logps/chosen": -760.0, + "logps/rejected": -1088.0, + "loss": 0.2571, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.78125, + "rewards/margins": 3.71875, + "rewards/rejected": -9.5, + "step": 6230 + }, + { + "epoch": 0.32160804020100503, + "grad_norm": 10.590483547827988, + "learning_rate": 4.288712188918773e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.90625, + "logps/chosen": -732.0, + "logps/rejected": -1064.0, + "loss": 0.2566, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.625, + "rewards/margins": 3.453125, + "rewards/rejected": -9.125, + "step": 6240 + }, + { + "epoch": 0.3221234377013272, + "grad_norm": 9.611583105946844, + "learning_rate": 4.285566856179279e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.609375, + "logps/chosen": -684.0, + "logps/rejected": -988.0, + "loss": 0.2748, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.25, + "rewards/margins": 3.109375, + "rewards/rejected": -8.375, + "step": 6250 + }, + { + "epoch": 0.32263883520164927, + "grad_norm": 6.198884291800569, + "learning_rate": 4.2824157433109387e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.734375, + "logps/chosen": -708.0, + "logps/rejected": -972.0, + "loss": 0.2712, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.4375, + "rewards/margins": 2.84375, + "rewards/rejected": -8.3125, + "step": 6260 + }, + { + "epoch": 0.3231542327019714, + "grad_norm": 7.585289684234336, + "learning_rate": 4.2792588605143443e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.8125, + "logps/chosen": -756.0, + "logps/rejected": -1056.0, + "loss": 0.2805, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.9375, + "rewards/margins": 3.046875, + "rewards/rejected": -9.0, + "step": 6270 + }, + { + "epoch": 0.3236696302022935, + "grad_norm": 8.327373055059388, + "learning_rate": 4.2760962180087654e-07, + "logits/chosen": -3.1875, + "logits/rejected": -3.03125, + "logps/chosen": -792.0, + "logps/rejected": -1040.0, + "loss": 0.2788, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.03125, + "rewards/margins": 2.734375, + "rewards/rejected": -8.75, + "step": 6280 + }, + { + "epoch": 0.32418502770261565, + "grad_norm": 8.293050267401599, + "learning_rate": 4.2729278260321164e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.203125, + "logps/chosen": -752.0, + "logps/rejected": -1008.0, + "loss": 0.2458, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.59375, + "rewards/margins": 2.875, + "rewards/rejected": -8.4375, + "step": 6290 + }, + { + "epoch": 0.3247004252029378, + "grad_norm": 6.7534954053550855, + "learning_rate": 4.269753694840924e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.28125, + "logps/chosen": -780.0, + "logps/rejected": -1080.0, + "loss": 0.2519, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.78125, + "rewards/margins": 3.125, + "rewards/rejected": -8.9375, + "step": 6300 + }, + { + "epoch": 0.3252158227032599, + "grad_norm": 16.433235570448254, + "learning_rate": 4.266573834710293e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.09375, + "logps/chosen": -720.0, + "logps/rejected": -1048.0, + "loss": 0.2784, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.625, + "rewards/margins": 3.34375, + "rewards/rejected": -8.9375, + "step": 6310 + }, + { + "epoch": 0.32573122020358203, + "grad_norm": 8.34784531819884, + "learning_rate": 4.263388255933874e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.109375, + "logps/chosen": -736.0, + "logps/rejected": -1012.0, + "loss": 0.2971, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.8125, + "rewards/margins": 2.8125, + "rewards/rejected": -8.625, + "step": 6320 + }, + { + "epoch": 0.3262466177039041, + "grad_norm": 7.755730423680257, + "learning_rate": 4.2601969688238303e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.890625, + "logps/chosen": -704.0, + "logps/rejected": -1032.0, + "loss": 0.2615, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.53125, + "rewards/margins": 3.140625, + "rewards/rejected": -8.6875, + "step": 6330 + }, + { + "epoch": 0.32676201520422626, + "grad_norm": 8.009325822747948, + "learning_rate": 4.256999983710803e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.921875, + "logps/chosen": -708.0, + "logps/rejected": -960.0, + "loss": 0.2631, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.21875, + "rewards/margins": 2.703125, + "rewards/rejected": -7.9375, + "step": 6340 + }, + { + "epoch": 0.3272774127045484, + "grad_norm": 7.421419175428578, + "learning_rate": 4.2537973109438784e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.953125, + "logps/chosen": -692.0, + "logps/rejected": -1000.0, + "loss": 0.2663, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.21875, + "rewards/margins": 3.125, + "rewards/rejected": -8.375, + "step": 6350 + }, + { + "epoch": 0.3277928102048705, + "grad_norm": 10.81594880990431, + "learning_rate": 4.250588960890555e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.84375, + "logps/chosen": -680.0, + "logps/rejected": -960.0, + "loss": 0.2798, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.1875, + "rewards/margins": 2.90625, + "rewards/rejected": -8.125, + "step": 6360 + }, + { + "epoch": 0.32830820770519265, + "grad_norm": 7.826180900347714, + "learning_rate": 4.247374943936708e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.015625, + "logps/chosen": -708.0, + "logps/rejected": -976.0, + "loss": 0.2575, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.25, + "rewards/margins": 2.96875, + "rewards/rejected": -8.25, + "step": 6370 + }, + { + "epoch": 0.32882360520551474, + "grad_norm": 8.36999498951757, + "learning_rate": 4.24415527048656e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.953125, + "logps/chosen": -700.0, + "logps/rejected": -976.0, + "loss": 0.2644, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.25, + "rewards/margins": 2.90625, + "rewards/rejected": -8.125, + "step": 6380 + }, + { + "epoch": 0.3293390027058369, + "grad_norm": 6.53566714464811, + "learning_rate": 4.240929950962642e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.828125, + "logps/chosen": -724.0, + "logps/rejected": -1048.0, + "loss": 0.2516, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.71875, + "rewards/margins": 3.15625, + "rewards/rejected": -8.875, + "step": 6390 + }, + { + "epoch": 0.329854400206159, + "grad_norm": 10.455339794170555, + "learning_rate": 4.237698995805763e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.953125, + "logps/chosen": -692.0, + "logps/rejected": -1020.0, + "loss": 0.2836, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.21875, + "rewards/margins": 3.25, + "rewards/rejected": -8.4375, + "step": 6400 + }, + { + "epoch": 0.3303697977064811, + "grad_norm": 7.4050868245363795, + "learning_rate": 4.2344624154749755e-07, + "logits/chosen": -3.484375, + "logits/rejected": -3.09375, + "logps/chosen": -732.0, + "logps/rejected": -976.0, + "loss": 0.2637, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.40625, + "rewards/margins": 2.671875, + "rewards/rejected": -8.0625, + "step": 6410 + }, + { + "epoch": 0.33088519520680326, + "grad_norm": 7.8663103547944635, + "learning_rate": 4.2312202204475407e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.859375, + "logps/chosen": -696.0, + "logps/rejected": -972.0, + "loss": 0.2925, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.5625, + "rewards/margins": 2.6875, + "rewards/rejected": -8.25, + "step": 6420 + }, + { + "epoch": 0.33140059270712535, + "grad_norm": 7.2248488388975165, + "learning_rate": 4.2279724212188963e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.921875, + "logps/chosen": -704.0, + "logps/rejected": -1008.0, + "loss": 0.271, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.375, + "rewards/margins": 2.953125, + "rewards/rejected": -8.3125, + "step": 6430 + }, + { + "epoch": 0.3319159902074475, + "grad_norm": 7.863171449439744, + "learning_rate": 4.224719028302622e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.203125, + "logps/chosen": -728.0, + "logps/rejected": -1048.0, + "loss": 0.2467, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.40625, + "rewards/margins": 3.3125, + "rewards/rejected": -8.75, + "step": 6440 + }, + { + "epoch": 0.33243138770776964, + "grad_norm": 9.612254287499677, + "learning_rate": 4.2214600522304026e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.859375, + "logps/chosen": -712.0, + "logps/rejected": -1032.0, + "loss": 0.2498, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.46875, + "rewards/margins": 3.359375, + "rewards/rejected": -8.8125, + "step": 6450 + }, + { + "epoch": 0.33294678520809173, + "grad_norm": 10.821610147674072, + "learning_rate": 4.2181955035519994e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.0, + "logps/chosen": -732.0, + "logps/rejected": -1032.0, + "loss": 0.2634, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.625, + "rewards/margins": 3.109375, + "rewards/rejected": -8.75, + "step": 6460 + }, + { + "epoch": 0.3334621827084139, + "grad_norm": 7.802935620735925, + "learning_rate": 4.214925392835211e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.125, + "logps/chosen": -688.0, + "logps/rejected": -984.0, + "loss": 0.2591, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.34375, + "rewards/margins": 2.953125, + "rewards/rejected": -8.3125, + "step": 6470 + }, + { + "epoch": 0.33397758020873597, + "grad_norm": 7.50266766468835, + "learning_rate": 4.211649730665842e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.953125, + "logps/chosen": -696.0, + "logps/rejected": -936.0, + "loss": 0.2734, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.28125, + "rewards/margins": 2.5625, + "rewards/rejected": -7.84375, + "step": 6480 + }, + { + "epoch": 0.3344929777090581, + "grad_norm": 7.9666344822236015, + "learning_rate": 4.2083685276476664e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.96875, + "logps/chosen": -728.0, + "logps/rejected": -984.0, + "loss": 0.2952, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.59375, + "rewards/margins": 2.59375, + "rewards/rejected": -8.1875, + "step": 6490 + }, + { + "epoch": 0.33500837520938026, + "grad_norm": 8.449511213025161, + "learning_rate": 4.2050817944023967e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.984375, + "logps/chosen": -752.0, + "logps/rejected": -984.0, + "loss": 0.2722, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.84375, + "rewards/margins": 2.484375, + "rewards/rejected": -8.3125, + "step": 6500 + }, + { + "epoch": 0.33552377270970235, + "grad_norm": 10.605289490687616, + "learning_rate": 4.201789541569647e-07, + "logits/chosen": -3.46875, + "logits/rejected": -2.859375, + "logps/chosen": -752.0, + "logps/rejected": -1024.0, + "loss": 0.2627, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.6875, + "rewards/margins": 2.953125, + "rewards/rejected": -8.625, + "step": 6510 + }, + { + "epoch": 0.3360391702100245, + "grad_norm": 5.435779899518126, + "learning_rate": 4.1984917798068985e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.1875, + "logps/chosen": -728.0, + "logps/rejected": -1024.0, + "loss": 0.2732, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.40625, + "rewards/margins": 3.1875, + "rewards/rejected": -8.5625, + "step": 6520 + }, + { + "epoch": 0.3365545677103466, + "grad_norm": 9.610286646672934, + "learning_rate": 4.195188519789465e-07, + "logits/chosen": -3.484375, + "logits/rejected": -3.03125, + "logps/chosen": -648.0, + "logps/rejected": -964.0, + "loss": 0.286, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.0625, + "rewards/margins": 3.1875, + "rewards/rejected": -8.25, + "step": 6530 + }, + { + "epoch": 0.33706996521066873, + "grad_norm": 7.3460875956980285, + "learning_rate": 4.191879772210461e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.875, + "logps/chosen": -696.0, + "logps/rejected": -996.0, + "loss": 0.252, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.28125, + "rewards/margins": 3.125, + "rewards/rejected": -8.375, + "step": 6540 + }, + { + "epoch": 0.3375853627109909, + "grad_norm": 7.615758420442853, + "learning_rate": 4.1885655477807623e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.015625, + "logps/chosen": -744.0, + "logps/rejected": -1032.0, + "loss": 0.2617, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.71875, + "rewards/margins": 2.984375, + "rewards/rejected": -8.6875, + "step": 6550 + }, + { + "epoch": 0.33810076021131297, + "grad_norm": 8.65411435937681, + "learning_rate": 4.185245857228976e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.046875, + "logps/chosen": -728.0, + "logps/rejected": -1048.0, + "loss": 0.2447, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.71875, + "rewards/margins": 3.203125, + "rewards/rejected": -8.9375, + "step": 6560 + }, + { + "epoch": 0.3386161577116351, + "grad_norm": 13.583119492557458, + "learning_rate": 4.1819207113014043e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.109375, + "logps/chosen": -752.0, + "logps/rejected": -1012.0, + "loss": 0.2999, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.8125, + "rewards/margins": 2.71875, + "rewards/rejected": -8.5625, + "step": 6570 + }, + { + "epoch": 0.3391315552119572, + "grad_norm": 7.8151813174141385, + "learning_rate": 4.1785901207620065e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.03125, + "logps/chosen": -724.0, + "logps/rejected": -952.0, + "loss": 0.2789, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.625, + "rewards/margins": 2.40625, + "rewards/rejected": -8.0, + "step": 6580 + }, + { + "epoch": 0.33964695271227935, + "grad_norm": 6.745568926569111, + "learning_rate": 4.1752540963923695e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.828125, + "logps/chosen": -748.0, + "logps/rejected": -1004.0, + "loss": 0.2828, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -5.8125, + "rewards/margins": 2.59375, + "rewards/rejected": -8.4375, + "step": 6590 + }, + { + "epoch": 0.3401623502126015, + "grad_norm": 10.254923268629952, + "learning_rate": 4.1719126489916684e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.0625, + "logps/chosen": -756.0, + "logps/rejected": -1024.0, + "loss": 0.2876, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.84375, + "rewards/margins": 2.8125, + "rewards/rejected": -8.625, + "step": 6600 + }, + { + "epoch": 0.3406777477129236, + "grad_norm": 8.196823363674211, + "learning_rate": 4.1685657893766353e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.953125, + "logps/chosen": -764.0, + "logps/rejected": -1012.0, + "loss": 0.2852, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.71875, + "rewards/margins": 2.75, + "rewards/rejected": -8.4375, + "step": 6610 + }, + { + "epoch": 0.34119314521324573, + "grad_norm": 8.380236460099981, + "learning_rate": 4.16521352838152e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.109375, + "logps/chosen": -760.0, + "logps/rejected": -1000.0, + "loss": 0.2703, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.84375, + "rewards/margins": 2.75, + "rewards/rejected": -8.625, + "step": 6620 + }, + { + "epoch": 0.3417085427135678, + "grad_norm": 7.107927959263898, + "learning_rate": 4.161855876858061e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.734375, + "logps/chosen": -764.0, + "logps/rejected": -1020.0, + "loss": 0.3008, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.875, + "rewards/margins": 2.796875, + "rewards/rejected": -8.6875, + "step": 6630 + }, + { + "epoch": 0.34222394021388997, + "grad_norm": 4.794981384983031, + "learning_rate": 4.158492845675442e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.875, + "logps/chosen": -744.0, + "logps/rejected": -1048.0, + "loss": 0.2515, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.625, + "rewards/margins": 2.984375, + "rewards/rejected": -8.625, + "step": 6640 + }, + { + "epoch": 0.3427393377142121, + "grad_norm": 7.260741457086861, + "learning_rate": 4.155124445720265e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.96875, + "logps/chosen": -740.0, + "logps/rejected": -1048.0, + "loss": 0.2568, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.6875, + "rewards/margins": 3.25, + "rewards/rejected": -8.9375, + "step": 6650 + }, + { + "epoch": 0.3432547352145342, + "grad_norm": 10.726346699832709, + "learning_rate": 4.15175068789651e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.96875, + "logps/chosen": -748.0, + "logps/rejected": -1048.0, + "loss": 0.2693, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.78125, + "rewards/margins": 3.125, + "rewards/rejected": -8.9375, + "step": 6660 + }, + { + "epoch": 0.34377013271485635, + "grad_norm": 8.286842353679617, + "learning_rate": 4.1483715831255017e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.125, + "logps/chosen": -744.0, + "logps/rejected": -1064.0, + "loss": 0.2562, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.6875, + "rewards/margins": 3.265625, + "rewards/rejected": -8.9375, + "step": 6670 + }, + { + "epoch": 0.34428553021517844, + "grad_norm": 11.39683717226108, + "learning_rate": 4.1449871423458726e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.984375, + "logps/chosen": -764.0, + "logps/rejected": -1048.0, + "loss": 0.2944, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 2.984375, + "rewards/rejected": -8.8125, + "step": 6680 + }, + { + "epoch": 0.3448009277155006, + "grad_norm": 7.765551872229651, + "learning_rate": 4.1415973765135303e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.71875, + "logps/chosen": -744.0, + "logps/rejected": -1032.0, + "loss": 0.2497, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.6875, + "rewards/margins": 2.875, + "rewards/rejected": -8.5625, + "step": 6690 + }, + { + "epoch": 0.34531632521582273, + "grad_norm": 9.920486062916595, + "learning_rate": 4.1382022966016183e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.0625, + "logps/chosen": -744.0, + "logps/rejected": -1012.0, + "loss": 0.2744, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.78125, + "rewards/margins": 2.8125, + "rewards/rejected": -8.5625, + "step": 6700 + }, + { + "epoch": 0.3458317227161448, + "grad_norm": 8.026722666788869, + "learning_rate": 4.134801913600485e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.90625, + "logps/chosen": -724.0, + "logps/rejected": -1064.0, + "loss": 0.2498, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.3125, + "rewards/margins": 3.390625, + "rewards/rejected": -8.6875, + "step": 6710 + }, + { + "epoch": 0.34634712021646696, + "grad_norm": 7.7649991956524, + "learning_rate": 4.131396238517643e-07, + "logits/chosen": -3.46875, + "logits/rejected": -3.265625, + "logps/chosen": -748.0, + "logps/rejected": -992.0, + "loss": 0.2527, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.71875, + "rewards/margins": 2.65625, + "rewards/rejected": -8.375, + "step": 6720 + }, + { + "epoch": 0.34686251771678905, + "grad_norm": 11.112916098057527, + "learning_rate": 4.1279852823777374e-07, + "logits/chosen": -3.53125, + "logits/rejected": -3.1875, + "logps/chosen": -740.0, + "logps/rejected": -1020.0, + "loss": 0.2716, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.4375, + "rewards/margins": 3.046875, + "rewards/rejected": -8.5, + "step": 6730 + }, + { + "epoch": 0.3473779152171112, + "grad_norm": 6.144078637514483, + "learning_rate": 4.1245690562225097e-07, + "logits/chosen": -3.515625, + "logits/rejected": -3.265625, + "logps/chosen": -724.0, + "logps/rejected": -992.0, + "loss": 0.2913, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.4375, + "rewards/margins": 2.828125, + "rewards/rejected": -8.25, + "step": 6740 + }, + { + "epoch": 0.34789331271743335, + "grad_norm": 7.416777629309593, + "learning_rate": 4.12114757111076e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.046875, + "logps/chosen": -704.0, + "logps/rejected": -988.0, + "loss": 0.2651, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.375, + "rewards/margins": 2.796875, + "rewards/rejected": -8.1875, + "step": 6750 + }, + { + "epoch": 0.34840871021775544, + "grad_norm": 7.979635556314714, + "learning_rate": 4.1177208381183113e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.078125, + "logps/chosen": -684.0, + "logps/rejected": -932.0, + "loss": 0.2745, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.125, + "rewards/margins": 2.71875, + "rewards/rejected": -7.84375, + "step": 6760 + }, + { + "epoch": 0.3489241077180776, + "grad_norm": 7.930257068510449, + "learning_rate": 4.1142888683379775e-07, + "logits/chosen": -3.40625, + "logits/rejected": -2.921875, + "logps/chosen": -708.0, + "logps/rejected": -1032.0, + "loss": 0.2689, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.46875, + "rewards/margins": 3.140625, + "rewards/rejected": -8.5625, + "step": 6770 + }, + { + "epoch": 0.34943950521839967, + "grad_norm": 9.46915707798769, + "learning_rate": 4.110851672879523e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.125, + "logps/chosen": -732.0, + "logps/rejected": -1032.0, + "loss": 0.2928, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.625, + "rewards/margins": 3.1875, + "rewards/rejected": -8.8125, + "step": 6780 + }, + { + "epoch": 0.3499549027187218, + "grad_norm": 7.169773658770997, + "learning_rate": 4.1074092628696275e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.0625, + "logps/chosen": -764.0, + "logps/rejected": -1064.0, + "loss": 0.2563, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.875, + "rewards/margins": 3.03125, + "rewards/rejected": -8.9375, + "step": 6790 + }, + { + "epoch": 0.35047030021904396, + "grad_norm": 7.878219974647186, + "learning_rate": 4.1039616494518536e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.015625, + "logps/chosen": -772.0, + "logps/rejected": -1040.0, + "loss": 0.2616, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.09375, + "rewards/margins": 2.796875, + "rewards/rejected": -8.9375, + "step": 6800 + }, + { + "epoch": 0.35098569771936605, + "grad_norm": 9.354822687176634, + "learning_rate": 4.1005088437866065e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.890625, + "logps/chosen": -740.0, + "logps/rejected": -1072.0, + "loss": 0.2391, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.8125, + "rewards/margins": 3.296875, + "rewards/rejected": -9.125, + "step": 6810 + }, + { + "epoch": 0.3515010952196882, + "grad_norm": 8.806464043103848, + "learning_rate": 4.0970508570510987e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.203125, + "logps/chosen": -764.0, + "logps/rejected": -1080.0, + "loss": 0.2316, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.78125, + "rewards/margins": 3.453125, + "rewards/rejected": -9.25, + "step": 6820 + }, + { + "epoch": 0.3520164927200103, + "grad_norm": 13.42177474059734, + "learning_rate": 4.093587700439316e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.0, + "logps/chosen": -788.0, + "logps/rejected": -1128.0, + "loss": 0.2923, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.0, + "rewards/margins": 3.484375, + "rewards/rejected": -9.4375, + "step": 6830 + }, + { + "epoch": 0.35253189022033243, + "grad_norm": 7.791058562206344, + "learning_rate": 4.090119385161979e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.96875, + "logps/chosen": -796.0, + "logps/rejected": -1088.0, + "loss": 0.2708, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.15625, + "rewards/margins": 3.171875, + "rewards/rejected": -9.3125, + "step": 6840 + }, + { + "epoch": 0.3530472877206546, + "grad_norm": 7.4721748012348455, + "learning_rate": 4.086645922446507e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.96875, + "logps/chosen": -780.0, + "logps/rejected": -1032.0, + "loss": 0.2864, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 2.78125, + "rewards/rejected": -8.8125, + "step": 6850 + }, + { + "epoch": 0.35356268522097667, + "grad_norm": 7.906922171127193, + "learning_rate": 4.0831673235369835e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.765625, + "logps/chosen": -732.0, + "logps/rejected": -1008.0, + "loss": 0.2626, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.65625, + "rewards/margins": 2.796875, + "rewards/rejected": -8.4375, + "step": 6860 + }, + { + "epoch": 0.3540780827212988, + "grad_norm": 8.566677319190653, + "learning_rate": 4.0796835996941195e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.015625, + "logps/chosen": -792.0, + "logps/rejected": -1088.0, + "loss": 0.2519, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.0, + "rewards/margins": 3.15625, + "rewards/rejected": -9.125, + "step": 6870 + }, + { + "epoch": 0.3545934802216209, + "grad_norm": 10.399732734481859, + "learning_rate": 4.076194762195212e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.984375, + "logps/chosen": -764.0, + "logps/rejected": -1048.0, + "loss": 0.2652, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.6875, + "rewards/margins": 3.109375, + "rewards/rejected": -8.8125, + "step": 6880 + }, + { + "epoch": 0.35510887772194305, + "grad_norm": 8.011248485037884, + "learning_rate": 4.0727008223341163e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.046875, + "logps/chosen": -772.0, + "logps/rejected": -1080.0, + "loss": 0.2712, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.8125, + "rewards/margins": 3.15625, + "rewards/rejected": -8.9375, + "step": 6890 + }, + { + "epoch": 0.3556242752222652, + "grad_norm": 9.058669128149397, + "learning_rate": 4.069201791421204e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.96875, + "logps/chosen": -800.0, + "logps/rejected": -1048.0, + "loss": 0.2634, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.15625, + "rewards/margins": 2.5625, + "rewards/rejected": -8.75, + "step": 6900 + }, + { + "epoch": 0.3561396727225873, + "grad_norm": 6.065903931518636, + "learning_rate": 4.065697680783323e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.109375, + "logps/chosen": -716.0, + "logps/rejected": -1016.0, + "loss": 0.2713, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.46875, + "rewards/margins": 3.03125, + "rewards/rejected": -8.5, + "step": 6910 + }, + { + "epoch": 0.35665507022290943, + "grad_norm": 9.99237020024195, + "learning_rate": 4.0621885017637704e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.015625, + "logps/chosen": -760.0, + "logps/rejected": -1004.0, + "loss": 0.2991, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.90625, + "rewards/margins": 2.625, + "rewards/rejected": -8.5625, + "step": 6920 + }, + { + "epoch": 0.3571704677232315, + "grad_norm": 11.755237419557693, + "learning_rate": 4.058674265722246e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.953125, + "logps/chosen": -756.0, + "logps/rejected": -1048.0, + "loss": 0.286, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.84375, + "rewards/margins": 2.9375, + "rewards/rejected": -8.8125, + "step": 6930 + }, + { + "epoch": 0.35768586522355367, + "grad_norm": 8.911786449093844, + "learning_rate": 4.055154984034823e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.796875, + "logps/chosen": -732.0, + "logps/rejected": -1040.0, + "loss": 0.2989, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.71875, + "rewards/margins": 3.0, + "rewards/rejected": -8.6875, + "step": 6940 + }, + { + "epoch": 0.3582012627238758, + "grad_norm": 10.144650392530226, + "learning_rate": 4.051630668093907e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.921875, + "logps/chosen": -752.0, + "logps/rejected": -1048.0, + "loss": 0.2524, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.8125, + "rewards/margins": 2.9375, + "rewards/rejected": -8.75, + "step": 6950 + }, + { + "epoch": 0.3587166602241979, + "grad_norm": 7.951127794866728, + "learning_rate": 4.0481013293082e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.046875, + "logps/chosen": -772.0, + "logps/rejected": -1048.0, + "loss": 0.2795, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.9375, + "rewards/margins": 2.84375, + "rewards/rejected": -8.8125, + "step": 6960 + }, + { + "epoch": 0.35923205772452005, + "grad_norm": 7.578124137287364, + "learning_rate": 4.044566979102663e-07, + "logits/chosen": -3.125, + "logits/rejected": -3.03125, + "logps/chosen": -748.0, + "logps/rejected": -1024.0, + "loss": 0.2665, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.84375, + "rewards/margins": 2.84375, + "rewards/rejected": -8.6875, + "step": 6970 + }, + { + "epoch": 0.35974745522484214, + "grad_norm": 8.516873063414792, + "learning_rate": 4.0410276289184807e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.125, + "logps/chosen": -760.0, + "logps/rejected": -1032.0, + "loss": 0.2692, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.5625, + "rewards/margins": 3.03125, + "rewards/rejected": -8.625, + "step": 6980 + }, + { + "epoch": 0.3602628527251643, + "grad_norm": 6.373098124852575, + "learning_rate": 4.0374832902130227e-07, + "logits/chosen": -3.390625, + "logits/rejected": -2.96875, + "logps/chosen": -728.0, + "logps/rejected": -1024.0, + "loss": 0.2439, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.65625, + "rewards/margins": 3.0625, + "rewards/rejected": -8.6875, + "step": 6990 + }, + { + "epoch": 0.36077825022548643, + "grad_norm": 8.008308529029403, + "learning_rate": 4.0339339744598085e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.125, + "logps/chosen": -772.0, + "logps/rejected": -1064.0, + "loss": 0.2723, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.9375, + "rewards/margins": 3.046875, + "rewards/rejected": -9.0, + "step": 7000 + }, + { + "epoch": 0.3612936477258085, + "grad_norm": 8.331720279750911, + "learning_rate": 4.030379693148467e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.09375, + "logps/chosen": -792.0, + "logps/rejected": -1104.0, + "loss": 0.2604, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.265625, + "rewards/rejected": -9.375, + "step": 7010 + }, + { + "epoch": 0.36180904522613067, + "grad_norm": 7.422664412592785, + "learning_rate": 4.026820457784703e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.984375, + "logps/chosen": -748.0, + "logps/rejected": -1016.0, + "loss": 0.2865, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.65625, + "rewards/margins": 2.8125, + "rewards/rejected": -8.5, + "step": 7020 + }, + { + "epoch": 0.36232444272645276, + "grad_norm": 9.081803840747403, + "learning_rate": 4.023256279890257e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.90625, + "logps/chosen": -712.0, + "logps/rejected": -992.0, + "loss": 0.2737, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.59375, + "rewards/margins": 2.875, + "rewards/rejected": -8.4375, + "step": 7030 + }, + { + "epoch": 0.3628398402267749, + "grad_norm": 7.789561878555876, + "learning_rate": 4.0196871710028695e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.078125, + "logps/chosen": -736.0, + "logps/rejected": -1004.0, + "loss": 0.2586, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.59375, + "rewards/margins": 2.8125, + "rewards/rejected": -8.375, + "step": 7040 + }, + { + "epoch": 0.36335523772709705, + "grad_norm": 5.855648584483455, + "learning_rate": 4.016113142676245e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.84375, + "logps/chosen": -716.0, + "logps/rejected": -1032.0, + "loss": 0.2581, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.4375, + "rewards/margins": 3.34375, + "rewards/rejected": -8.8125, + "step": 7050 + }, + { + "epoch": 0.36387063522741914, + "grad_norm": 5.997327374405138, + "learning_rate": 4.0125342064800105e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.03125, + "logps/chosen": -744.0, + "logps/rejected": -1008.0, + "loss": 0.2594, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.6875, + "rewards/margins": 2.859375, + "rewards/rejected": -8.5625, + "step": 7060 + }, + { + "epoch": 0.3643860327277413, + "grad_norm": 10.002737115455743, + "learning_rate": 4.0089503739996816e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.0, + "logps/chosen": -744.0, + "logps/rejected": -1056.0, + "loss": 0.29, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.71875, + "rewards/margins": 3.109375, + "rewards/rejected": -8.875, + "step": 7070 + }, + { + "epoch": 0.3649014302280634, + "grad_norm": 7.959073217549133, + "learning_rate": 4.0053616568366234e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.78125, + "logps/chosen": -756.0, + "logps/rejected": -1032.0, + "loss": 0.265, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.84375, + "rewards/margins": 2.921875, + "rewards/rejected": -8.75, + "step": 7080 + }, + { + "epoch": 0.3654168277283855, + "grad_norm": 5.965971153487051, + "learning_rate": 4.0017680666080135e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.0625, + "logps/chosen": -716.0, + "logps/rejected": -1004.0, + "loss": 0.2472, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.46875, + "rewards/margins": 2.828125, + "rewards/rejected": -8.3125, + "step": 7090 + }, + { + "epoch": 0.36593222522870766, + "grad_norm": 9.202025422726726, + "learning_rate": 3.9981696149468045e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.859375, + "logps/chosen": -692.0, + "logps/rejected": -976.0, + "loss": 0.2471, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.21875, + "rewards/margins": 3.0, + "rewards/rejected": -8.1875, + "step": 7100 + }, + { + "epoch": 0.36644762272902975, + "grad_norm": 15.15918090604591, + "learning_rate": 3.9945663135016873e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.890625, + "logps/chosen": -752.0, + "logps/rejected": -1032.0, + "loss": 0.2722, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.78125, + "rewards/margins": 3.015625, + "rewards/rejected": -8.8125, + "step": 7110 + }, + { + "epoch": 0.3669630202293519, + "grad_norm": 10.002325364134633, + "learning_rate": 3.9909581739370503e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.875, + "logps/chosen": -736.0, + "logps/rejected": -1072.0, + "loss": 0.269, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.6875, + "rewards/margins": 3.375, + "rewards/rejected": -9.0625, + "step": 7120 + }, + { + "epoch": 0.367478417729674, + "grad_norm": 7.828554952940457, + "learning_rate": 3.987345207932943e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.03125, + "logps/chosen": -764.0, + "logps/rejected": -1032.0, + "loss": 0.2584, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.875, + "rewards/margins": 2.8125, + "rewards/rejected": -8.6875, + "step": 7130 + }, + { + "epoch": 0.36799381522999614, + "grad_norm": 6.164441577392585, + "learning_rate": 3.983727427185042e-07, + "logits/chosen": -3.171875, + "logits/rejected": -3.046875, + "logps/chosen": -764.0, + "logps/rejected": -1032.0, + "loss": 0.2532, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.875, + "rewards/margins": 2.640625, + "rewards/rejected": -8.5, + "step": 7140 + }, + { + "epoch": 0.3685092127303183, + "grad_norm": 7.029993202469793, + "learning_rate": 3.9801048434046067e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.90625, + "logps/chosen": -764.0, + "logps/rejected": -1048.0, + "loss": 0.2615, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.8125, + "rewards/margins": 2.96875, + "rewards/rejected": -8.75, + "step": 7150 + }, + { + "epoch": 0.36902461023064037, + "grad_norm": 7.120708865407629, + "learning_rate": 3.9764774683184463e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.9375, + "logps/chosen": -744.0, + "logps/rejected": -1048.0, + "loss": 0.2586, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.625, + "rewards/margins": 3.015625, + "rewards/rejected": -8.6875, + "step": 7160 + }, + { + "epoch": 0.3695400077309625, + "grad_norm": 5.87995712839481, + "learning_rate": 3.9728453136688797e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.75, + "logps/chosen": -768.0, + "logps/rejected": -1048.0, + "loss": 0.256, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.9375, + "rewards/margins": 2.875, + "rewards/rejected": -8.8125, + "step": 7170 + }, + { + "epoch": 0.3700554052312846, + "grad_norm": 9.676289354799001, + "learning_rate": 3.969208391213699e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.953125, + "logps/chosen": -768.0, + "logps/rejected": -1024.0, + "loss": 0.2761, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.875, + "rewards/margins": 2.6875, + "rewards/rejected": -8.5625, + "step": 7180 + }, + { + "epoch": 0.37057080273160675, + "grad_norm": 6.742461153706223, + "learning_rate": 3.965566712726126e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.953125, + "logps/chosen": -788.0, + "logps/rejected": -1088.0, + "loss": 0.2483, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.09375, + "rewards/margins": 3.203125, + "rewards/rejected": -9.3125, + "step": 7190 + }, + { + "epoch": 0.3710862002319289, + "grad_norm": 5.732880532423514, + "learning_rate": 3.9619202899947844e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.90625, + "logps/chosen": -784.0, + "logps/rejected": -1064.0, + "loss": 0.2526, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.9375, + "rewards/margins": 3.0, + "rewards/rejected": -8.9375, + "step": 7200 + }, + { + "epoch": 0.371601597732251, + "grad_norm": 9.844188745040613, + "learning_rate": 3.9582691348236524e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.765625, + "logps/chosen": -776.0, + "logps/rejected": -1104.0, + "loss": 0.2682, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.21875, + "rewards/rejected": -9.25, + "step": 7210 + }, + { + "epoch": 0.37211699523257313, + "grad_norm": 9.267131746713105, + "learning_rate": 3.954613259032028e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.828125, + "logps/chosen": -744.0, + "logps/rejected": -1072.0, + "loss": 0.2695, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.78125, + "rewards/margins": 3.28125, + "rewards/rejected": -9.0625, + "step": 7220 + }, + { + "epoch": 0.3726323927328952, + "grad_norm": 5.667094688766419, + "learning_rate": 3.9509526744544914e-07, + "logits/chosen": -2.984375, + "logits/rejected": -2.859375, + "logps/chosen": -752.0, + "logps/rejected": -1032.0, + "loss": 0.2561, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.8125, + "rewards/margins": 2.90625, + "rewards/rejected": -8.6875, + "step": 7230 + }, + { + "epoch": 0.37314779023321737, + "grad_norm": 8.501648061428298, + "learning_rate": 3.9472873929408647e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.75, + "logps/chosen": -740.0, + "logps/rejected": -1048.0, + "loss": 0.2628, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.59375, + "rewards/margins": 3.15625, + "rewards/rejected": -8.75, + "step": 7240 + }, + { + "epoch": 0.3736631877335395, + "grad_norm": 8.643300067983844, + "learning_rate": 3.943617426356175e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.75, + "logps/chosen": -784.0, + "logps/rejected": -1080.0, + "loss": 0.2604, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.9375, + "rewards/margins": 3.0, + "rewards/rejected": -8.9375, + "step": 7250 + }, + { + "epoch": 0.3741785852338616, + "grad_norm": 8.76557497308575, + "learning_rate": 3.9399427865806154e-07, + "logits/chosen": -3.1875, + "logits/rejected": -3.0, + "logps/chosen": -784.0, + "logps/rejected": -1056.0, + "loss": 0.2322, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.0, + "rewards/margins": 2.875, + "rewards/rejected": -8.875, + "step": 7260 + }, + { + "epoch": 0.37469398273418375, + "grad_norm": 9.629970552793015, + "learning_rate": 3.9362634855095083e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.828125, + "logps/chosen": -788.0, + "logps/rejected": -1112.0, + "loss": 0.2667, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.234375, + "rewards/rejected": -9.375, + "step": 7270 + }, + { + "epoch": 0.37520938023450584, + "grad_norm": 10.381224358095677, + "learning_rate": 3.932579535053264e-07, + "logits/chosen": -2.96875, + "logits/rejected": -2.671875, + "logps/chosen": -740.0, + "logps/rejected": -1020.0, + "loss": 0.2578, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.625, + "rewards/margins": 2.953125, + "rewards/rejected": -8.5625, + "step": 7280 + }, + { + "epoch": 0.375724777734828, + "grad_norm": 7.783284461680263, + "learning_rate": 3.9288909471373433e-07, + "logits/chosen": -2.75, + "logits/rejected": -2.4375, + "logps/chosen": -760.0, + "logps/rejected": -1024.0, + "loss": 0.2667, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.0, + "rewards/margins": 2.796875, + "rewards/rejected": -8.8125, + "step": 7290 + }, + { + "epoch": 0.37624017523515013, + "grad_norm": 10.824933922525647, + "learning_rate": 3.925197733702219e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.8125, + "logps/chosen": -796.0, + "logps/rejected": -1072.0, + "loss": 0.2514, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.125, + "rewards/margins": 2.859375, + "rewards/rejected": -9.0, + "step": 7300 + }, + { + "epoch": 0.3767555727354722, + "grad_norm": 6.749820528121927, + "learning_rate": 3.9214999067033387e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.78125, + "logps/chosen": -756.0, + "logps/rejected": -1064.0, + "loss": 0.2655, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.84375, + "rewards/margins": 3.15625, + "rewards/rejected": -9.0, + "step": 7310 + }, + { + "epoch": 0.37727097023579437, + "grad_norm": 8.999702883116393, + "learning_rate": 3.9177974781110833e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.90625, + "logps/chosen": -796.0, + "logps/rejected": -1072.0, + "loss": 0.2688, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 2.859375, + "rewards/rejected": -8.9375, + "step": 7320 + }, + { + "epoch": 0.37778636773611646, + "grad_norm": 6.332588548525222, + "learning_rate": 3.9140904599107324e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.78125, + "logps/chosen": -772.0, + "logps/rejected": -1088.0, + "loss": 0.2784, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.03125, + "rewards/margins": 3.140625, + "rewards/rejected": -9.1875, + "step": 7330 + }, + { + "epoch": 0.3783017652364386, + "grad_norm": 8.124870956885319, + "learning_rate": 3.910378864102417e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.03125, + "logps/chosen": -776.0, + "logps/rejected": -1032.0, + "loss": 0.2707, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.875, + "rewards/margins": 2.71875, + "rewards/rejected": -8.5625, + "step": 7340 + }, + { + "epoch": 0.37881716273676075, + "grad_norm": 7.9600630304087945, + "learning_rate": 3.9066627027010934e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.75, + "logps/chosen": -724.0, + "logps/rejected": -1056.0, + "loss": 0.2774, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.5625, + "rewards/margins": 3.46875, + "rewards/rejected": -9.0625, + "step": 7350 + }, + { + "epoch": 0.37933256023708284, + "grad_norm": 9.620577100362055, + "learning_rate": 3.902941987736493e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.96875, + "logps/chosen": -752.0, + "logps/rejected": -1048.0, + "loss": 0.2655, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.71875, + "rewards/margins": 3.078125, + "rewards/rejected": -8.8125, + "step": 7360 + }, + { + "epoch": 0.379847957737405, + "grad_norm": 9.323662191597467, + "learning_rate": 3.899216731253088e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.96875, + "logps/chosen": -724.0, + "logps/rejected": -1000.0, + "loss": 0.2676, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.5, + "rewards/margins": 2.75, + "rewards/rejected": -8.25, + "step": 7370 + }, + { + "epoch": 0.3803633552377271, + "grad_norm": 7.314434264044707, + "learning_rate": 3.8954869453100545e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.8125, + "logps/chosen": -744.0, + "logps/rejected": -1032.0, + "loss": 0.2569, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.59375, + "rewards/margins": 3.140625, + "rewards/rejected": -8.75, + "step": 7380 + }, + { + "epoch": 0.3808787527380492, + "grad_norm": 8.542103199447359, + "learning_rate": 3.891752641981229e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.8125, + "logps/chosen": -736.0, + "logps/rejected": -1040.0, + "loss": 0.2725, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.5625, + "rewards/margins": 3.046875, + "rewards/rejected": -8.625, + "step": 7390 + }, + { + "epoch": 0.38139415023837137, + "grad_norm": 6.159333559826709, + "learning_rate": 3.888013833355072e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.90625, + "logps/chosen": -776.0, + "logps/rejected": -1056.0, + "loss": 0.2539, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.6875, + "rewards/margins": 3.109375, + "rewards/rejected": -8.8125, + "step": 7400 + }, + { + "epoch": 0.38190954773869346, + "grad_norm": 8.797793851436866, + "learning_rate": 3.884270531534627e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.15625, + "logps/chosen": -792.0, + "logps/rejected": -1088.0, + "loss": 0.2637, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.1875, + "rewards/margins": 3.0, + "rewards/rejected": -9.1875, + "step": 7410 + }, + { + "epoch": 0.3824249452390156, + "grad_norm": 10.258740820851347, + "learning_rate": 3.880522748637486e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.0625, + "logps/chosen": -732.0, + "logps/rejected": -1024.0, + "loss": 0.2805, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.6875, + "rewards/margins": 3.140625, + "rewards/rejected": -8.8125, + "step": 7420 + }, + { + "epoch": 0.3829403427393377, + "grad_norm": 8.871306527416587, + "learning_rate": 3.8767704967957435e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.921875, + "logps/chosen": -712.0, + "logps/rejected": -1004.0, + "loss": 0.2618, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.3125, + "rewards/margins": 2.96875, + "rewards/rejected": -8.25, + "step": 7430 + }, + { + "epoch": 0.38345574023965984, + "grad_norm": 8.074362407372668, + "learning_rate": 3.873013788155962e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.96875, + "logps/chosen": -720.0, + "logps/rejected": -1056.0, + "loss": 0.2616, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.46875, + "rewards/margins": 3.375, + "rewards/rejected": -8.875, + "step": 7440 + }, + { + "epoch": 0.383971137739982, + "grad_norm": 6.736509158214131, + "learning_rate": 3.869252634879131e-07, + "logits/chosen": -2.921875, + "logits/rejected": -2.65625, + "logps/chosen": -724.0, + "logps/rejected": -1064.0, + "loss": 0.2704, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.5625, + "rewards/margins": 3.34375, + "rewards/rejected": -8.875, + "step": 7450 + }, + { + "epoch": 0.3844865352403041, + "grad_norm": 7.264595751867449, + "learning_rate": 3.865487049140629e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.84375, + "logps/chosen": -736.0, + "logps/rejected": -988.0, + "loss": 0.2736, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.5625, + "rewards/margins": 2.65625, + "rewards/rejected": -8.1875, + "step": 7460 + }, + { + "epoch": 0.3850019327406262, + "grad_norm": 7.189825678083039, + "learning_rate": 3.86171704313018e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.765625, + "logps/chosen": -720.0, + "logps/rejected": -1000.0, + "loss": 0.2625, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.375, + "rewards/margins": 2.953125, + "rewards/rejected": -8.3125, + "step": 7470 + }, + { + "epoch": 0.3855173302409483, + "grad_norm": 6.1151974086939855, + "learning_rate": 3.8579426290518215e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.8125, + "logps/chosen": -732.0, + "logps/rejected": -988.0, + "loss": 0.2597, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.1875, + "rewards/margins": 2.9375, + "rewards/rejected": -8.125, + "step": 7480 + }, + { + "epoch": 0.38603272774127045, + "grad_norm": 6.799855924719146, + "learning_rate": 3.8541638191238564e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.953125, + "logps/chosen": -740.0, + "logps/rejected": -1000.0, + "loss": 0.2686, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.59375, + "rewards/margins": 2.71875, + "rewards/rejected": -8.3125, + "step": 7490 + }, + { + "epoch": 0.3865481252415926, + "grad_norm": 8.880234213867006, + "learning_rate": 3.850380625578819e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.0, + "logps/chosen": -716.0, + "logps/rejected": -1020.0, + "loss": 0.2491, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.34375, + "rewards/margins": 3.234375, + "rewards/rejected": -8.5625, + "step": 7500 + }, + { + "epoch": 0.3870635227419147, + "grad_norm": 10.867341424121067, + "learning_rate": 3.8465930606634357e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.09375, + "logps/chosen": -724.0, + "logps/rejected": -1056.0, + "loss": 0.2605, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.34375, + "rewards/margins": 3.46875, + "rewards/rejected": -8.8125, + "step": 7510 + }, + { + "epoch": 0.38757892024223684, + "grad_norm": 8.558509108062633, + "learning_rate": 3.8428011366385803e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.84375, + "logps/chosen": -724.0, + "logps/rejected": -1032.0, + "loss": 0.2601, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.5625, + "rewards/margins": 3.15625, + "rewards/rejected": -8.6875, + "step": 7520 + }, + { + "epoch": 0.3880943177425589, + "grad_norm": 5.28863571392748, + "learning_rate": 3.8390048657792423e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.890625, + "logps/chosen": -668.0, + "logps/rejected": -992.0, + "loss": 0.2757, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.9375, + "rewards/margins": 3.234375, + "rewards/rejected": -8.1875, + "step": 7530 + }, + { + "epoch": 0.38860971524288107, + "grad_norm": 8.332399821209892, + "learning_rate": 3.835204260374477e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.953125, + "logps/chosen": -716.0, + "logps/rejected": -1000.0, + "loss": 0.2553, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.59375, + "rewards/margins": 2.90625, + "rewards/rejected": -8.5, + "step": 7540 + }, + { + "epoch": 0.3891251127432032, + "grad_norm": 9.263455193613783, + "learning_rate": 3.831399332727375e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.15625, + "logps/chosen": -708.0, + "logps/rejected": -1012.0, + "loss": 0.2563, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.3125, + "rewards/margins": 3.203125, + "rewards/rejected": -8.5, + "step": 7550 + }, + { + "epoch": 0.3896405102435253, + "grad_norm": 6.7645732215633485, + "learning_rate": 3.827590095155018e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.1875, + "logps/chosen": -716.0, + "logps/rejected": -976.0, + "loss": 0.2432, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.375, + "rewards/margins": 2.765625, + "rewards/rejected": -8.125, + "step": 7560 + }, + { + "epoch": 0.39015590774384745, + "grad_norm": 9.080758424189499, + "learning_rate": 3.8237765599884373e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.625, + "logps/chosen": -656.0, + "logps/rejected": -1020.0, + "loss": 0.2515, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.9375, + "rewards/margins": 3.4375, + "rewards/rejected": -8.375, + "step": 7570 + }, + { + "epoch": 0.39067130524416954, + "grad_norm": 7.944850458912384, + "learning_rate": 3.81995873957258e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.953125, + "logps/chosen": -700.0, + "logps/rejected": -992.0, + "loss": 0.2798, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.21875, + "rewards/margins": 3.046875, + "rewards/rejected": -8.25, + "step": 7580 + }, + { + "epoch": 0.3911867027444917, + "grad_norm": 7.763554653174158, + "learning_rate": 3.8161366462662623e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.0625, + "logps/chosen": -724.0, + "logps/rejected": -1032.0, + "loss": 0.2499, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.46875, + "rewards/margins": 3.09375, + "rewards/rejected": -8.5625, + "step": 7590 + }, + { + "epoch": 0.39170210024481383, + "grad_norm": 7.180915948254688, + "learning_rate": 3.812310292442133e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.96875, + "logps/chosen": -684.0, + "logps/rejected": -996.0, + "loss": 0.2547, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.09375, + "rewards/margins": 3.203125, + "rewards/rejected": -8.3125, + "step": 7600 + }, + { + "epoch": 0.3922174977451359, + "grad_norm": 7.368820196426635, + "learning_rate": 3.8084796904866333e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -744.0, + "logps/rejected": -1088.0, + "loss": 0.2626, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.59375, + "rewards/margins": 3.296875, + "rewards/rejected": -8.875, + "step": 7610 + }, + { + "epoch": 0.39273289524545807, + "grad_norm": 8.50812316894033, + "learning_rate": 3.8046448527999555e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.8125, + "logps/chosen": -704.0, + "logps/rejected": -1048.0, + "loss": 0.2626, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.4375, + "rewards/margins": 3.484375, + "rewards/rejected": -8.9375, + "step": 7620 + }, + { + "epoch": 0.39324829274578016, + "grad_norm": 6.302833047502839, + "learning_rate": 3.800805791796003e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.75, + "logps/chosen": -732.0, + "logps/rejected": -1072.0, + "loss": 0.257, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.59375, + "rewards/margins": 3.453125, + "rewards/rejected": -9.0625, + "step": 7630 + }, + { + "epoch": 0.3937636902461023, + "grad_norm": 7.162323941193423, + "learning_rate": 3.7969625199023525e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.921875, + "logps/chosen": -736.0, + "logps/rejected": -1056.0, + "loss": 0.2559, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.71875, + "rewards/margins": 3.125, + "rewards/rejected": -8.8125, + "step": 7640 + }, + { + "epoch": 0.39427908774642445, + "grad_norm": 9.322144389482885, + "learning_rate": 3.7931150495602097e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.96875, + "logps/chosen": -724.0, + "logps/rejected": -1032.0, + "loss": 0.2454, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.71875, + "rewards/margins": 3.0625, + "rewards/rejected": -8.8125, + "step": 7650 + }, + { + "epoch": 0.39479448524674654, + "grad_norm": 6.557667090288679, + "learning_rate": 3.7892633932243746e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.875, + "logps/chosen": -696.0, + "logps/rejected": -1012.0, + "loss": 0.259, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.21875, + "rewards/margins": 3.234375, + "rewards/rejected": -8.4375, + "step": 7660 + }, + { + "epoch": 0.3953098827470687, + "grad_norm": 5.666767849838314, + "learning_rate": 3.7854075633631945e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.8125, + "logps/chosen": -720.0, + "logps/rejected": -1032.0, + "loss": 0.2504, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.375, + "rewards/margins": 3.265625, + "rewards/rejected": -8.625, + "step": 7670 + }, + { + "epoch": 0.3958252802473908, + "grad_norm": 9.528522978289478, + "learning_rate": 3.781547572458528e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.8125, + "logps/chosen": -756.0, + "logps/rejected": -1120.0, + "loss": 0.2498, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.75, + "rewards/margins": 3.6875, + "rewards/rejected": -9.4375, + "step": 7680 + }, + { + "epoch": 0.3963406777477129, + "grad_norm": 9.844218965123579, + "learning_rate": 3.777683433005704e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.90625, + "logps/chosen": -744.0, + "logps/rejected": -1032.0, + "loss": 0.2607, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.6875, + "rewards/margins": 3.234375, + "rewards/rejected": -8.9375, + "step": 7690 + }, + { + "epoch": 0.39685607524803507, + "grad_norm": 9.842068427371602, + "learning_rate": 3.7738151575134803e-07, + "logits/chosen": -3.0, + "logits/rejected": -2.6875, + "logps/chosen": -780.0, + "logps/rejected": -1048.0, + "loss": 0.2594, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.84375, + "rewards/margins": 2.9375, + "rewards/rejected": -8.75, + "step": 7700 + }, + { + "epoch": 0.39737147274835716, + "grad_norm": 9.178469759666328, + "learning_rate": 3.7699427585040047e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.984375, + "logps/chosen": -724.0, + "logps/rejected": -1000.0, + "loss": 0.2444, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.3125, + "rewards/margins": 2.9375, + "rewards/rejected": -8.25, + "step": 7710 + }, + { + "epoch": 0.3978868702486793, + "grad_norm": 8.04417849301189, + "learning_rate": 3.766066248512772e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.78125, + "logps/chosen": -696.0, + "logps/rejected": -1012.0, + "loss": 0.2465, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.25, + "rewards/margins": 3.296875, + "rewards/rejected": -8.5625, + "step": 7720 + }, + { + "epoch": 0.3984022677490014, + "grad_norm": 5.285916644670392, + "learning_rate": 3.762185640088586e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.921875, + "logps/chosen": -696.0, + "logps/rejected": -1024.0, + "loss": 0.2588, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.4375, + "rewards/margins": 3.09375, + "rewards/rejected": -8.5, + "step": 7730 + }, + { + "epoch": 0.39891766524932354, + "grad_norm": 8.56010094769273, + "learning_rate": 3.7583009457935166e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.0625, + "logps/chosen": -760.0, + "logps/rejected": -988.0, + "loss": 0.2647, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.6875, + "rewards/margins": 2.515625, + "rewards/rejected": -8.1875, + "step": 7740 + }, + { + "epoch": 0.3994330627496457, + "grad_norm": 9.482545731331454, + "learning_rate": 3.7544121782028613e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.9375, + "logps/chosen": -744.0, + "logps/rejected": -1040.0, + "loss": 0.2714, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.71875, + "rewards/margins": 3.03125, + "rewards/rejected": -8.75, + "step": 7750 + }, + { + "epoch": 0.3999484602499678, + "grad_norm": 9.938942289444423, + "learning_rate": 3.750519349905103e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.796875, + "logps/chosen": -708.0, + "logps/rejected": -1024.0, + "loss": 0.2527, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.25, + "rewards/margins": 3.328125, + "rewards/rejected": -8.5625, + "step": 7760 + }, + { + "epoch": 0.4004638577502899, + "grad_norm": 7.81250503320777, + "learning_rate": 3.746622473501869e-07, + "logits/chosen": -3.09375, + "logits/rejected": -3.015625, + "logps/chosen": -736.0, + "logps/rejected": -1024.0, + "loss": 0.2539, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.65625, + "rewards/margins": 2.875, + "rewards/rejected": -8.5, + "step": 7770 + }, + { + "epoch": 0.400979255250612, + "grad_norm": 7.944324131780007, + "learning_rate": 3.742721561607892e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.65625, + "logps/chosen": -728.0, + "logps/rejected": -984.0, + "loss": 0.2652, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.46875, + "rewards/margins": 2.640625, + "rewards/rejected": -8.125, + "step": 7780 + }, + { + "epoch": 0.40149465275093416, + "grad_norm": 7.766239919999441, + "learning_rate": 3.738816626850968e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.828125, + "logps/chosen": -760.0, + "logps/rejected": -1024.0, + "loss": 0.2625, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.71875, + "rewards/margins": 2.78125, + "rewards/rejected": -8.5, + "step": 7790 + }, + { + "epoch": 0.4020100502512563, + "grad_norm": 6.786795258693618, + "learning_rate": 3.734907681871915e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.671875, + "logps/chosen": -760.0, + "logps/rejected": -1080.0, + "loss": 0.2561, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.84375, + "rewards/margins": 3.359375, + "rewards/rejected": -9.25, + "step": 7800 + }, + { + "epoch": 0.4025254477515784, + "grad_norm": 8.40706902117597, + "learning_rate": 3.730994739324532e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.8125, + "logps/chosen": -752.0, + "logps/rejected": -1064.0, + "loss": 0.273, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.90625, + "rewards/margins": 3.265625, + "rewards/rejected": -9.1875, + "step": 7810 + }, + { + "epoch": 0.40304084525190054, + "grad_norm": 7.459383291011195, + "learning_rate": 3.7270778118755597e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.890625, + "logps/chosen": -736.0, + "logps/rejected": -1032.0, + "loss": 0.2249, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.5, + "rewards/margins": 3.171875, + "rewards/rejected": -8.6875, + "step": 7820 + }, + { + "epoch": 0.4035562427522226, + "grad_norm": 6.015878183036743, + "learning_rate": 3.7231569122046396e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.75, + "logps/chosen": -692.0, + "logps/rejected": -972.0, + "loss": 0.249, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.25, + "rewards/margins": 2.9375, + "rewards/rejected": -8.1875, + "step": 7830 + }, + { + "epoch": 0.4040716402525448, + "grad_norm": 6.396823041514454, + "learning_rate": 3.7192320530042696e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.015625, + "logps/chosen": -732.0, + "logps/rejected": -1040.0, + "loss": 0.216, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.5, + "rewards/margins": 3.140625, + "rewards/rejected": -8.625, + "step": 7840 + }, + { + "epoch": 0.4045870377528669, + "grad_norm": 9.795129408620385, + "learning_rate": 3.7153032469797664e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.90625, + "logps/chosen": -768.0, + "logps/rejected": -1112.0, + "loss": 0.2402, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.0, + "rewards/margins": 3.5, + "rewards/rejected": -9.5, + "step": 7850 + }, + { + "epoch": 0.405102435253189, + "grad_norm": 7.5943976401416835, + "learning_rate": 3.7113705068492224e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.890625, + "logps/chosen": -744.0, + "logps/rejected": -1088.0, + "loss": 0.2564, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.8125, + "rewards/margins": 3.46875, + "rewards/rejected": -9.3125, + "step": 7860 + }, + { + "epoch": 0.40561783275351115, + "grad_norm": 6.379315845532914, + "learning_rate": 3.7074338453434654e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.203125, + "logps/chosen": -708.0, + "logps/rejected": -976.0, + "loss": 0.2377, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.40625, + "rewards/margins": 2.859375, + "rewards/rejected": -8.3125, + "step": 7870 + }, + { + "epoch": 0.40613323025383324, + "grad_norm": 9.822512541695474, + "learning_rate": 3.703493275206019e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.8125, + "logps/chosen": -768.0, + "logps/rejected": -1080.0, + "loss": 0.2568, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.9375, + "rewards/margins": 3.03125, + "rewards/rejected": -9.0, + "step": 7880 + }, + { + "epoch": 0.4066486277541554, + "grad_norm": 9.148962298531115, + "learning_rate": 3.6995488091930574e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.125, + "logps/chosen": -792.0, + "logps/rejected": -1104.0, + "loss": 0.2528, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.09375, + "rewards/margins": 3.234375, + "rewards/rejected": -9.3125, + "step": 7890 + }, + { + "epoch": 0.40716402525447754, + "grad_norm": 6.796169640674312, + "learning_rate": 3.6956004600733657e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.828125, + "logps/chosen": -732.0, + "logps/rejected": -1080.0, + "loss": 0.2731, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.75, + "rewards/margins": 3.265625, + "rewards/rejected": -9.0, + "step": 7900 + }, + { + "epoch": 0.4076794227547996, + "grad_norm": 7.188286420714103, + "learning_rate": 3.6916482406283024e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.9375, + "logps/chosen": -720.0, + "logps/rejected": -1012.0, + "loss": 0.2503, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.40625, + "rewards/margins": 3.125, + "rewards/rejected": -8.5, + "step": 7910 + }, + { + "epoch": 0.40819482025512177, + "grad_norm": 9.582681722281448, + "learning_rate": 3.687692163651752e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.859375, + "logps/chosen": -728.0, + "logps/rejected": -1000.0, + "loss": 0.2749, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.53125, + "rewards/margins": 2.96875, + "rewards/rejected": -8.5, + "step": 7920 + }, + { + "epoch": 0.40871021775544386, + "grad_norm": 6.182713513623194, + "learning_rate": 3.6837322419500864e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.875, + "logps/chosen": -732.0, + "logps/rejected": -1008.0, + "loss": 0.2727, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.59375, + "rewards/margins": 2.984375, + "rewards/rejected": -8.5625, + "step": 7930 + }, + { + "epoch": 0.409225615255766, + "grad_norm": 7.040796443369069, + "learning_rate": 3.679768488342125e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.9375, + "logps/chosen": -756.0, + "logps/rejected": -996.0, + "loss": 0.2448, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.65625, + "rewards/margins": 2.8125, + "rewards/rejected": -8.5, + "step": 7940 + }, + { + "epoch": 0.40974101275608815, + "grad_norm": 6.121218664349856, + "learning_rate": 3.675800915659092e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.140625, + "logps/chosen": -732.0, + "logps/rejected": -1016.0, + "loss": 0.2449, + "rewards/accuracies": 0.8125, + "rewards/chosen": -5.53125, + "rewards/margins": 3.0, + "rewards/rejected": -8.5625, + "step": 7950 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 10.198545065162596, + "learning_rate": 3.671829536744572e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.875, + "logps/chosen": -688.0, + "logps/rejected": -1024.0, + "loss": 0.273, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.25, + "rewards/margins": 3.234375, + "rewards/rejected": -8.5, + "step": 7960 + }, + { + "epoch": 0.4107718077567324, + "grad_norm": 7.79541235182568, + "learning_rate": 3.6678543644544714e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.953125, + "logps/chosen": -736.0, + "logps/rejected": -1020.0, + "loss": 0.2523, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.34375, + "rewards/margins": 3.1875, + "rewards/rejected": -8.5625, + "step": 7970 + }, + { + "epoch": 0.4112872052570545, + "grad_norm": 8.416012183973855, + "learning_rate": 3.6638754116569776e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.078125, + "logps/chosen": -680.0, + "logps/rejected": -996.0, + "loss": 0.2546, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.125, + "rewards/margins": 3.203125, + "rewards/rejected": -8.3125, + "step": 7980 + }, + { + "epoch": 0.4118026027573766, + "grad_norm": 11.399294420524981, + "learning_rate": 3.6598926912325177e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.6875, + "logps/chosen": -712.0, + "logps/rejected": -1048.0, + "loss": 0.2677, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.46875, + "rewards/margins": 3.4375, + "rewards/rejected": -8.875, + "step": 7990 + }, + { + "epoch": 0.41231800025769877, + "grad_norm": 7.499252882226975, + "learning_rate": 3.6559062160737097e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.96875, + "logps/chosen": -736.0, + "logps/rejected": -1020.0, + "loss": 0.2629, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.40625, + "rewards/margins": 3.171875, + "rewards/rejected": -8.5625, + "step": 8000 + }, + { + "epoch": 0.41283339775802086, + "grad_norm": 7.984615057479461, + "learning_rate": 3.651915999085332e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.671875, + "logps/chosen": -688.0, + "logps/rejected": -996.0, + "loss": 0.2581, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.3125, + "rewards/margins": 3.296875, + "rewards/rejected": -8.625, + "step": 8010 + }, + { + "epoch": 0.413348795258343, + "grad_norm": 7.111224180159463, + "learning_rate": 3.647922053184272e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.96875, + "logps/chosen": -712.0, + "logps/rejected": -984.0, + "loss": 0.2472, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.28125, + "rewards/margins": 2.96875, + "rewards/rejected": -8.25, + "step": 8020 + }, + { + "epoch": 0.4138641927586651, + "grad_norm": 10.877567176734386, + "learning_rate": 3.643924391299489e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.171875, + "logps/chosen": -720.0, + "logps/rejected": -1048.0, + "loss": 0.2641, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.375, + "rewards/margins": 3.5625, + "rewards/rejected": -8.9375, + "step": 8030 + }, + { + "epoch": 0.41437959025898724, + "grad_norm": 8.366301247089094, + "learning_rate": 3.639923026371973e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.015625, + "logps/chosen": -748.0, + "logps/rejected": -1120.0, + "loss": 0.259, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.5625, + "rewards/margins": 3.859375, + "rewards/rejected": -9.4375, + "step": 8040 + }, + { + "epoch": 0.4148949877593094, + "grad_norm": 6.177475071905952, + "learning_rate": 3.6359179713547e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.203125, + "logps/chosen": -716.0, + "logps/rejected": -1004.0, + "loss": 0.2501, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.40625, + "rewards/margins": 3.046875, + "rewards/rejected": -8.4375, + "step": 8050 + }, + { + "epoch": 0.4154103852596315, + "grad_norm": 10.627745704338746, + "learning_rate": 3.6319092392125905e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.96875, + "logps/chosen": -692.0, + "logps/rejected": -1008.0, + "loss": 0.2345, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.375, + "rewards/margins": 3.0625, + "rewards/rejected": -8.4375, + "step": 8060 + }, + { + "epoch": 0.4159257827599536, + "grad_norm": 10.380527091680515, + "learning_rate": 3.627896842922471e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.859375, + "logps/chosen": -756.0, + "logps/rejected": -1048.0, + "loss": 0.2574, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.96875, + "rewards/margins": 3.015625, + "rewards/rejected": -9.0, + "step": 8070 + }, + { + "epoch": 0.4164411802602757, + "grad_norm": 8.192851482113912, + "learning_rate": 3.6238807954730253e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.0, + "logps/chosen": -716.0, + "logps/rejected": -1056.0, + "loss": 0.2451, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.46875, + "rewards/margins": 3.5, + "rewards/rejected": -8.9375, + "step": 8080 + }, + { + "epoch": 0.41695657776059786, + "grad_norm": 6.553646718257844, + "learning_rate": 3.6198611098647606e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.15625, + "logps/chosen": -720.0, + "logps/rejected": -980.0, + "loss": 0.2618, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.3125, + "rewards/margins": 2.84375, + "rewards/rejected": -8.1875, + "step": 8090 + }, + { + "epoch": 0.41747197526092, + "grad_norm": 8.239748058711568, + "learning_rate": 3.61583779910996e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.984375, + "logps/chosen": -736.0, + "logps/rejected": -1000.0, + "loss": 0.2495, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.5, + "rewards/margins": 2.90625, + "rewards/rejected": -8.375, + "step": 8100 + }, + { + "epoch": 0.4179873727612421, + "grad_norm": 7.6897687364146545, + "learning_rate": 3.611810876232639e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.03125, + "logps/chosen": -732.0, + "logps/rejected": -1040.0, + "loss": 0.2293, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.5625, + "rewards/margins": 3.296875, + "rewards/rejected": -8.875, + "step": 8110 + }, + { + "epoch": 0.41850277026156424, + "grad_norm": 5.64842733460634, + "learning_rate": 3.607780354268511e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.953125, + "logps/chosen": -708.0, + "logps/rejected": -1088.0, + "loss": 0.2405, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.28125, + "rewards/margins": 3.875, + "rewards/rejected": -9.125, + "step": 8120 + }, + { + "epoch": 0.41901816776188633, + "grad_norm": 10.231219509296164, + "learning_rate": 3.603746246264936e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.859375, + "logps/chosen": -728.0, + "logps/rejected": -1024.0, + "loss": 0.2549, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.5625, + "rewards/margins": 3.15625, + "rewards/rejected": -8.75, + "step": 8130 + }, + { + "epoch": 0.4195335652622085, + "grad_norm": 7.191259412069122, + "learning_rate": 3.5997085652808846e-07, + "logits/chosen": -2.953125, + "logits/rejected": -2.65625, + "logps/chosen": -684.0, + "logps/rejected": -1016.0, + "loss": 0.2602, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.25, + "rewards/margins": 3.40625, + "rewards/rejected": -8.6875, + "step": 8140 + }, + { + "epoch": 0.4200489627625306, + "grad_norm": 12.316976341689397, + "learning_rate": 3.5956673243868926e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.828125, + "logps/chosen": -744.0, + "logps/rejected": -1020.0, + "loss": 0.2353, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.59375, + "rewards/margins": 2.953125, + "rewards/rejected": -8.5625, + "step": 8150 + }, + { + "epoch": 0.4205643602628527, + "grad_norm": 7.539355478096153, + "learning_rate": 3.5916225366650197e-07, + "logits/chosen": -3.03125, + "logits/rejected": -2.90625, + "logps/chosen": -732.0, + "logps/rejected": -1024.0, + "loss": 0.2405, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.625, + "rewards/margins": 3.03125, + "rewards/rejected": -8.6875, + "step": 8160 + }, + { + "epoch": 0.42107975776317486, + "grad_norm": 6.770213515394323, + "learning_rate": 3.5875742152088093e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.109375, + "logps/chosen": -772.0, + "logps/rejected": -1040.0, + "loss": 0.2724, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 2.546875, + "rewards/rejected": -8.625, + "step": 8170 + }, + { + "epoch": 0.42159515526349695, + "grad_norm": 6.901700657330805, + "learning_rate": 3.583522373123241e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.671875, + "logps/chosen": -732.0, + "logps/rejected": -1072.0, + "loss": 0.2579, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.53125, + "rewards/margins": 3.484375, + "rewards/rejected": -9.0, + "step": 8180 + }, + { + "epoch": 0.4221105527638191, + "grad_norm": 6.822610096128342, + "learning_rate": 3.5794670235246936e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.0625, + "logps/chosen": -720.0, + "logps/rejected": -1016.0, + "loss": 0.234, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.15625, + "rewards/margins": 3.28125, + "rewards/rejected": -8.4375, + "step": 8190 + }, + { + "epoch": 0.42262595026414124, + "grad_norm": 7.924862052411909, + "learning_rate": 3.5754081795408977e-07, + "logits/chosen": -3.046875, + "logits/rejected": -2.671875, + "logps/chosen": -728.0, + "logps/rejected": -984.0, + "loss": 0.2653, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.5, + "rewards/margins": 2.796875, + "rewards/rejected": -8.3125, + "step": 8200 + }, + { + "epoch": 0.4231413477644633, + "grad_norm": 7.854942785860033, + "learning_rate": 3.571345854310898e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.734375, + "logps/chosen": -716.0, + "logps/rejected": -1008.0, + "loss": 0.2576, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.40625, + "rewards/margins": 2.953125, + "rewards/rejected": -8.375, + "step": 8210 + }, + { + "epoch": 0.4236567452647855, + "grad_norm": 9.436746863947233, + "learning_rate": 3.567280060985008e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.875, + "logps/chosen": -704.0, + "logps/rejected": -980.0, + "loss": 0.2475, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.21875, + "rewards/margins": 3.140625, + "rewards/rejected": -8.375, + "step": 8220 + }, + { + "epoch": 0.42417214276510756, + "grad_norm": 7.445594219608082, + "learning_rate": 3.563210812724767e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.875, + "logps/chosen": -776.0, + "logps/rejected": -1056.0, + "loss": 0.2657, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.90625, + "rewards/margins": 3.046875, + "rewards/rejected": -8.9375, + "step": 8230 + }, + { + "epoch": 0.4246875402654297, + "grad_norm": 7.173636186171042, + "learning_rate": 3.5591381227028983e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.0, + "logps/chosen": -728.0, + "logps/rejected": -1016.0, + "loss": 0.2498, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.5625, + "rewards/margins": 2.734375, + "rewards/rejected": -8.3125, + "step": 8240 + }, + { + "epoch": 0.42520293776575185, + "grad_norm": 9.174402066213547, + "learning_rate": 3.555062004103268e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.921875, + "logps/chosen": -752.0, + "logps/rejected": -1008.0, + "loss": 0.2633, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.625, + "rewards/margins": 2.796875, + "rewards/rejected": -8.375, + "step": 8250 + }, + { + "epoch": 0.42571833526607394, + "grad_norm": 8.296117874736396, + "learning_rate": 3.550982470120841e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.90625, + "logps/chosen": -732.0, + "logps/rejected": -1024.0, + "loss": 0.2795, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.46875, + "rewards/margins": 3.078125, + "rewards/rejected": -8.5625, + "step": 8260 + }, + { + "epoch": 0.4262337327663961, + "grad_norm": 6.288284970812515, + "learning_rate": 3.546899533961636e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.125, + "logps/chosen": -668.0, + "logps/rejected": -952.0, + "loss": 0.2852, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.125, + "rewards/margins": 2.8125, + "rewards/rejected": -7.9375, + "step": 8270 + }, + { + "epoch": 0.4267491302667182, + "grad_norm": 9.294828706142534, + "learning_rate": 3.5428132088426877e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.640625, + "logps/chosen": -620.0, + "logps/rejected": -980.0, + "loss": 0.2374, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.75, + "rewards/margins": 3.53125, + "rewards/rejected": -8.25, + "step": 8280 + }, + { + "epoch": 0.4272645277670403, + "grad_norm": 11.248390964210559, + "learning_rate": 3.5387235079919997e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.703125, + "logps/chosen": -704.0, + "logps/rejected": -976.0, + "loss": 0.2668, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.4375, + "rewards/margins": 2.765625, + "rewards/rejected": -8.1875, + "step": 8290 + }, + { + "epoch": 0.42777992526736247, + "grad_norm": 8.85161049108326, + "learning_rate": 3.534630444648504e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.921875, + "logps/chosen": -732.0, + "logps/rejected": -1056.0, + "loss": 0.2581, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.59375, + "rewards/margins": 3.40625, + "rewards/rejected": -9.0, + "step": 8300 + }, + { + "epoch": 0.42829532276768456, + "grad_norm": 7.263382662769472, + "learning_rate": 3.530534032062017e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.890625, + "logps/chosen": -708.0, + "logps/rejected": -1040.0, + "loss": 0.2302, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.4375, + "rewards/margins": 3.40625, + "rewards/rejected": -8.875, + "step": 8310 + }, + { + "epoch": 0.4288107202680067, + "grad_norm": 9.376879560672354, + "learning_rate": 3.5264342834931984e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.84375, + "logps/chosen": -752.0, + "logps/rejected": -1032.0, + "loss": 0.2636, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.8125, + "rewards/margins": 2.9375, + "rewards/rejected": -8.75, + "step": 8320 + }, + { + "epoch": 0.4293261177683288, + "grad_norm": 9.194502958976207, + "learning_rate": 3.5223312122135037e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -704.0, + "logps/rejected": -1024.0, + "loss": 0.2237, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.3125, + "rewards/margins": 3.4375, + "rewards/rejected": -8.75, + "step": 8330 + }, + { + "epoch": 0.42984151526865094, + "grad_norm": 8.582347233707322, + "learning_rate": 3.518224831505149e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.0625, + "logps/chosen": -708.0, + "logps/rejected": -1032.0, + "loss": 0.2538, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.21875, + "rewards/margins": 3.34375, + "rewards/rejected": -8.5625, + "step": 8340 + }, + { + "epoch": 0.4303569127689731, + "grad_norm": 9.046395575634234, + "learning_rate": 3.514115154661059e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.015625, + "logps/chosen": -748.0, + "logps/rejected": -1040.0, + "loss": 0.235, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.625, + "rewards/margins": 3.140625, + "rewards/rejected": -8.75, + "step": 8350 + }, + { + "epoch": 0.4308723102692952, + "grad_norm": 8.236426822030161, + "learning_rate": 3.5100021949848324e-07, + "logits/chosen": -3.421875, + "logits/rejected": -2.9375, + "logps/chosen": -724.0, + "logps/rejected": -1008.0, + "loss": 0.2438, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.3125, + "rewards/margins": 3.15625, + "rewards/rejected": -8.4375, + "step": 8360 + }, + { + "epoch": 0.4313877077696173, + "grad_norm": 7.695917648798121, + "learning_rate": 3.5058859657906937e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.0, + "logps/chosen": -712.0, + "logps/rejected": -1024.0, + "loss": 0.2539, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.34375, + "rewards/margins": 3.34375, + "rewards/rejected": -8.6875, + "step": 8370 + }, + { + "epoch": 0.4319031052699394, + "grad_norm": 9.29039475597391, + "learning_rate": 3.5017664804034475e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.984375, + "logps/chosen": -708.0, + "logps/rejected": -976.0, + "loss": 0.2792, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.28125, + "rewards/margins": 2.765625, + "rewards/rejected": -8.0625, + "step": 8380 + }, + { + "epoch": 0.43241850277026156, + "grad_norm": 8.728710383083508, + "learning_rate": 3.4976437521584445e-07, + "logits/chosen": -3.171875, + "logits/rejected": -3.0625, + "logps/chosen": -692.0, + "logps/rejected": -960.0, + "loss": 0.2639, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.03125, + "rewards/margins": 2.890625, + "rewards/rejected": -7.90625, + "step": 8390 + }, + { + "epoch": 0.4329339002705837, + "grad_norm": 7.3787067419780055, + "learning_rate": 3.493517794401531e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.15625, + "logps/chosen": -700.0, + "logps/rejected": -980.0, + "loss": 0.2605, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.25, + "rewards/margins": 3.046875, + "rewards/rejected": -8.3125, + "step": 8400 + }, + { + "epoch": 0.4334492977709058, + "grad_norm": 7.149988576524228, + "learning_rate": 3.4893886204890064e-07, + "logits/chosen": -3.140625, + "logits/rejected": -3.015625, + "logps/chosen": -700.0, + "logps/rejected": -1048.0, + "loss": 0.2425, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.34375, + "rewards/margins": 3.390625, + "rewards/rejected": -8.75, + "step": 8410 + }, + { + "epoch": 0.43396469527122794, + "grad_norm": 9.237758917894384, + "learning_rate": 3.4852562437875837e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.015625, + "logps/chosen": -692.0, + "logps/rejected": -1056.0, + "loss": 0.2434, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.25, + "rewards/margins": 3.78125, + "rewards/rejected": -9.0, + "step": 8420 + }, + { + "epoch": 0.43448009277155003, + "grad_norm": 10.977965394422705, + "learning_rate": 3.481120677674341e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.140625, + "logps/chosen": -720.0, + "logps/rejected": -1024.0, + "loss": 0.2786, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.40625, + "rewards/margins": 3.28125, + "rewards/rejected": -8.6875, + "step": 8430 + }, + { + "epoch": 0.4349954902718722, + "grad_norm": 8.688413112143412, + "learning_rate": 3.4769819355366825e-07, + "logits/chosen": -3.1875, + "logits/rejected": -3.015625, + "logps/chosen": -704.0, + "logps/rejected": -1004.0, + "loss": 0.2581, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.53125, + "rewards/margins": 2.984375, + "rewards/rejected": -8.5, + "step": 8440 + }, + { + "epoch": 0.4355108877721943, + "grad_norm": 5.551441695500734, + "learning_rate": 3.472840030772295e-07, + "logits/chosen": -3.515625, + "logits/rejected": -2.875, + "logps/chosen": -732.0, + "logps/rejected": -1004.0, + "loss": 0.2542, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.625, + "rewards/margins": 2.875, + "rewards/rejected": -8.5, + "step": 8450 + }, + { + "epoch": 0.4360262852725164, + "grad_norm": 11.039513214887814, + "learning_rate": 3.4686949767890994e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.171875, + "logps/chosen": -736.0, + "logps/rejected": -1016.0, + "loss": 0.2483, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.5, + "rewards/margins": 2.9375, + "rewards/rejected": -8.4375, + "step": 8460 + }, + { + "epoch": 0.43654168277283856, + "grad_norm": 6.555721173659843, + "learning_rate": 3.4645467870052146e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.96875, + "logps/chosen": -696.0, + "logps/rejected": -1032.0, + "loss": 0.2437, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.28125, + "rewards/margins": 3.484375, + "rewards/rejected": -8.75, + "step": 8470 + }, + { + "epoch": 0.43705708027316065, + "grad_norm": 8.07301871826025, + "learning_rate": 3.4603954748489094e-07, + "logits/chosen": -3.4375, + "logits/rejected": -2.921875, + "logps/chosen": -728.0, + "logps/rejected": -1024.0, + "loss": 0.2509, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.53125, + "rewards/margins": 3.125, + "rewards/rejected": -8.625, + "step": 8480 + }, + { + "epoch": 0.4375724777734828, + "grad_norm": 7.278101026070933, + "learning_rate": 3.4562410537585595e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.140625, + "logps/chosen": -752.0, + "logps/rejected": -1080.0, + "loss": 0.2422, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.8125, + "rewards/margins": 3.21875, + "rewards/rejected": -9.0, + "step": 8490 + }, + { + "epoch": 0.43808787527380494, + "grad_norm": 7.861994839302311, + "learning_rate": 3.4520835371826066e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.953125, + "logps/chosen": -728.0, + "logps/rejected": -1012.0, + "loss": 0.2602, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.75, + "rewards/margins": 3.046875, + "rewards/rejected": -8.8125, + "step": 8500 + }, + { + "epoch": 0.43860327277412703, + "grad_norm": 7.12426875688472, + "learning_rate": 3.447922938579509e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.890625, + "logps/chosen": -728.0, + "logps/rejected": -1064.0, + "loss": 0.2467, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.5625, + "rewards/margins": 3.40625, + "rewards/rejected": -8.9375, + "step": 8510 + }, + { + "epoch": 0.4391186702744492, + "grad_norm": 12.219469562793309, + "learning_rate": 3.443759271417706e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.375, + "logps/chosen": -796.0, + "logps/rejected": -1080.0, + "loss": 0.26, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.046875, + "rewards/rejected": -9.1875, + "step": 8520 + }, + { + "epoch": 0.4396340677747713, + "grad_norm": 8.648682779082753, + "learning_rate": 3.439592549175568e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.265625, + "logps/chosen": -760.0, + "logps/rejected": -1088.0, + "loss": 0.2409, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 3.5, + "rewards/rejected": -9.3125, + "step": 8530 + }, + { + "epoch": 0.4401494652750934, + "grad_norm": 7.4835366247723565, + "learning_rate": 3.435422785341357e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.125, + "logps/chosen": -772.0, + "logps/rejected": -1048.0, + "loss": 0.2639, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.875, + "rewards/margins": 2.96875, + "rewards/rejected": -8.8125, + "step": 8540 + }, + { + "epoch": 0.44066486277541556, + "grad_norm": 9.962018930854752, + "learning_rate": 3.4312499934131785e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.96875, + "logps/chosen": -748.0, + "logps/rejected": -1056.0, + "loss": 0.2675, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.8125, + "rewards/margins": 3.046875, + "rewards/rejected": -8.875, + "step": 8550 + }, + { + "epoch": 0.44118026027573765, + "grad_norm": 6.719924152929047, + "learning_rate": 3.4270741868989423e-07, + "logits/chosen": -3.171875, + "logits/rejected": -3.03125, + "logps/chosen": -712.0, + "logps/rejected": -996.0, + "loss": 0.2547, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.4375, + "rewards/margins": 3.046875, + "rewards/rejected": -8.5, + "step": 8560 + }, + { + "epoch": 0.4416956577760598, + "grad_norm": 6.350641806596293, + "learning_rate": 3.422895379316316e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.140625, + "logps/chosen": -712.0, + "logps/rejected": -980.0, + "loss": 0.2571, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.4375, + "rewards/margins": 2.734375, + "rewards/rejected": -8.1875, + "step": 8570 + }, + { + "epoch": 0.44221105527638194, + "grad_norm": 6.863665694111863, + "learning_rate": 3.4187135841926827e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.828125, + "logps/chosen": -744.0, + "logps/rejected": -1048.0, + "loss": 0.2459, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.71875, + "rewards/margins": 3.171875, + "rewards/rejected": -8.875, + "step": 8580 + }, + { + "epoch": 0.442726452776704, + "grad_norm": 6.273859898605491, + "learning_rate": 3.4145288150650966e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.9375, + "logps/chosen": -736.0, + "logps/rejected": -1088.0, + "loss": 0.2447, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.625, + "rewards/margins": 3.5, + "rewards/rejected": -9.125, + "step": 8590 + }, + { + "epoch": 0.4432418502770262, + "grad_norm": 10.8377671864262, + "learning_rate": 3.410341085480237e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.859375, + "logps/chosen": -716.0, + "logps/rejected": -1064.0, + "loss": 0.264, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.46875, + "rewards/margins": 3.46875, + "rewards/rejected": -8.9375, + "step": 8600 + }, + { + "epoch": 0.44375724777734826, + "grad_norm": 7.771573099475227, + "learning_rate": 3.40615040899437e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.015625, + "logps/chosen": -720.0, + "logps/rejected": -1032.0, + "loss": 0.2331, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.46875, + "rewards/margins": 3.125, + "rewards/rejected": -8.625, + "step": 8610 + }, + { + "epoch": 0.4442726452776704, + "grad_norm": 8.08396036005254, + "learning_rate": 3.4019567991733e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.0625, + "logps/chosen": -720.0, + "logps/rejected": -1004.0, + "loss": 0.2565, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.53125, + "rewards/margins": 2.9375, + "rewards/rejected": -8.4375, + "step": 8620 + }, + { + "epoch": 0.44478804277799255, + "grad_norm": 6.170960207590853, + "learning_rate": 3.3977602695923244e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.90625, + "logps/chosen": -736.0, + "logps/rejected": -1032.0, + "loss": 0.2482, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.78125, + "rewards/margins": 3.03125, + "rewards/rejected": -8.8125, + "step": 8630 + }, + { + "epoch": 0.44530344027831464, + "grad_norm": 5.9003652553891115, + "learning_rate": 3.3935608338361976e-07, + "logits/chosen": -3.4375, + "logits/rejected": -2.875, + "logps/chosen": -696.0, + "logps/rejected": -1048.0, + "loss": 0.2561, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.25, + "rewards/margins": 3.390625, + "rewards/rejected": -8.625, + "step": 8640 + }, + { + "epoch": 0.4458188377786368, + "grad_norm": 6.797128222513764, + "learning_rate": 3.389358505499076e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.046875, + "logps/chosen": -732.0, + "logps/rejected": -1008.0, + "loss": 0.2472, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.5, + "rewards/margins": 2.953125, + "rewards/rejected": -8.4375, + "step": 8650 + }, + { + "epoch": 0.4463342352789589, + "grad_norm": 7.8002849659114055, + "learning_rate": 3.385153298184483e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.984375, + "logps/chosen": -732.0, + "logps/rejected": -1048.0, + "loss": 0.2688, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.71875, + "rewards/margins": 3.171875, + "rewards/rejected": -8.875, + "step": 8660 + }, + { + "epoch": 0.446849632779281, + "grad_norm": 5.88376435339397, + "learning_rate": 3.380945225505262e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.890625, + "logps/chosen": -760.0, + "logps/rejected": -1056.0, + "loss": 0.2438, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.84375, + "rewards/margins": 2.953125, + "rewards/rejected": -8.8125, + "step": 8670 + }, + { + "epoch": 0.44736503027960317, + "grad_norm": 10.748986006687838, + "learning_rate": 3.3767343010835296e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -740.0, + "logps/rejected": -1080.0, + "loss": 0.2475, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.625, + "rewards/margins": 3.53125, + "rewards/rejected": -9.125, + "step": 8680 + }, + { + "epoch": 0.44788042777992526, + "grad_norm": 8.626644757966666, + "learning_rate": 3.3725205385506363e-07, + "logits/chosen": -3.15625, + "logits/rejected": -3.03125, + "logps/chosen": -724.0, + "logps/rejected": -1016.0, + "loss": 0.2512, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.53125, + "rewards/margins": 3.0625, + "rewards/rejected": -8.5625, + "step": 8690 + }, + { + "epoch": 0.4483958252802474, + "grad_norm": 9.421190100230234, + "learning_rate": 3.3683039515471177e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.890625, + "logps/chosen": -716.0, + "logps/rejected": -1032.0, + "loss": 0.2526, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.4375, + "rewards/margins": 3.359375, + "rewards/rejected": -8.8125, + "step": 8700 + }, + { + "epoch": 0.4489112227805695, + "grad_norm": 8.92032413340527, + "learning_rate": 3.364084553722654e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.078125, + "logps/chosen": -732.0, + "logps/rejected": -1012.0, + "loss": 0.2531, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.5, + "rewards/margins": 3.015625, + "rewards/rejected": -8.5, + "step": 8710 + }, + { + "epoch": 0.44942662028089164, + "grad_norm": 7.6646412577450045, + "learning_rate": 3.3598623587360255e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.015625, + "logps/chosen": -736.0, + "logps/rejected": -1032.0, + "loss": 0.2564, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.5625, + "rewards/margins": 3.03125, + "rewards/rejected": -8.625, + "step": 8720 + }, + { + "epoch": 0.4499420177812138, + "grad_norm": 8.08129454768361, + "learning_rate": 3.3556373802550647e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.015625, + "logps/chosen": -736.0, + "logps/rejected": -1040.0, + "loss": 0.2537, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.6875, + "rewards/margins": 3.078125, + "rewards/rejected": -8.75, + "step": 8730 + }, + { + "epoch": 0.4504574152815359, + "grad_norm": 8.41101804819421, + "learning_rate": 3.351409631956616e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.0, + "logps/chosen": -788.0, + "logps/rejected": -1088.0, + "loss": 0.2535, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.21875, + "rewards/rejected": -9.3125, + "step": 8740 + }, + { + "epoch": 0.450972812781858, + "grad_norm": 8.191669587734147, + "learning_rate": 3.3471791275264914e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.09375, + "logps/chosen": -748.0, + "logps/rejected": -1096.0, + "loss": 0.2455, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 3.5625, + "rewards/rejected": -9.375, + "step": 8750 + }, + { + "epoch": 0.4514882102821801, + "grad_norm": 9.517317244809519, + "learning_rate": 3.342945880659422e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.078125, + "logps/chosen": -744.0, + "logps/rejected": -1056.0, + "loss": 0.2414, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.59375, + "rewards/margins": 3.34375, + "rewards/rejected": -8.9375, + "step": 8760 + }, + { + "epoch": 0.45200360778250226, + "grad_norm": 6.459326053941941, + "learning_rate": 3.3387099050590187e-07, + "logits/chosen": -3.5, + "logits/rejected": -3.25, + "logps/chosen": -736.0, + "logps/rejected": -1000.0, + "loss": 0.2555, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.5625, + "rewards/margins": 2.75, + "rewards/rejected": -8.3125, + "step": 8770 + }, + { + "epoch": 0.4525190052828244, + "grad_norm": 8.409353028450841, + "learning_rate": 3.3344712144377247e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.78125, + "logps/chosen": -724.0, + "logps/rejected": -1040.0, + "loss": 0.2466, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.5, + "rewards/margins": 3.1875, + "rewards/rejected": -8.6875, + "step": 8780 + }, + { + "epoch": 0.4530344027831465, + "grad_norm": 6.704939569161189, + "learning_rate": 3.330229822516772e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.921875, + "logps/chosen": -756.0, + "logps/rejected": -1112.0, + "loss": 0.2322, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.78125, + "rewards/margins": 3.578125, + "rewards/rejected": -9.375, + "step": 8790 + }, + { + "epoch": 0.45354980028346864, + "grad_norm": 14.688293962065002, + "learning_rate": 3.325985743026141e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.03125, + "logps/chosen": -856.0, + "logps/rejected": -1112.0, + "loss": 0.278, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.46875, + "rewards/margins": 3.0, + "rewards/rejected": -9.4375, + "step": 8800 + }, + { + "epoch": 0.45406519778379073, + "grad_norm": 7.871172312463691, + "learning_rate": 3.3217389897045035e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.90625, + "logps/chosen": -820.0, + "logps/rejected": -1104.0, + "loss": 0.2664, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.375, + "rewards/margins": 3.03125, + "rewards/rejected": -9.4375, + "step": 8810 + }, + { + "epoch": 0.4545805952841129, + "grad_norm": 7.190288535054146, + "learning_rate": 3.3174895762991947e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.0625, + "logps/chosen": -796.0, + "logps/rejected": -1056.0, + "loss": 0.2413, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.15625, + "rewards/margins": 2.875, + "rewards/rejected": -9.0, + "step": 8820 + }, + { + "epoch": 0.455095992784435, + "grad_norm": 8.32178759875973, + "learning_rate": 3.313237516566159e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.734375, + "logps/chosen": -776.0, + "logps/rejected": -1080.0, + "loss": 0.2468, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.0625, + "rewards/margins": 3.03125, + "rewards/rejected": -9.125, + "step": 8830 + }, + { + "epoch": 0.4556113902847571, + "grad_norm": 9.332011201915297, + "learning_rate": 3.308982824269905e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.953125, + "logps/chosen": -784.0, + "logps/rejected": -1088.0, + "loss": 0.2646, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.234375, + "rewards/rejected": -9.3125, + "step": 8840 + }, + { + "epoch": 0.45612678778507926, + "grad_norm": 10.056553791486492, + "learning_rate": 3.304725513183466e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.984375, + "logps/chosen": -772.0, + "logps/rejected": -1048.0, + "loss": 0.2618, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.75, + "rewards/margins": 2.96875, + "rewards/rejected": -8.6875, + "step": 8850 + }, + { + "epoch": 0.45664218528540135, + "grad_norm": 6.879242117492892, + "learning_rate": 3.300465597088351e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.015625, + "logps/chosen": -736.0, + "logps/rejected": -1032.0, + "loss": 0.2342, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.6875, + "rewards/margins": 2.921875, + "rewards/rejected": -8.625, + "step": 8860 + }, + { + "epoch": 0.4571575827857235, + "grad_norm": 9.547206983991193, + "learning_rate": 3.296203089774502e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.953125, + "logps/chosen": -760.0, + "logps/rejected": -1064.0, + "loss": 0.2563, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 2.84375, + "rewards/rejected": -8.6875, + "step": 8870 + }, + { + "epoch": 0.45767298028604564, + "grad_norm": 8.870663863059468, + "learning_rate": 3.29193800504025e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.921875, + "logps/chosen": -744.0, + "logps/rejected": -1072.0, + "loss": 0.246, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.6875, + "rewards/margins": 3.328125, + "rewards/rejected": -9.0, + "step": 8880 + }, + { + "epoch": 0.45818837778636773, + "grad_norm": 5.871810650112869, + "learning_rate": 3.287670356692268e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.921875, + "logps/chosen": -768.0, + "logps/rejected": -1032.0, + "loss": 0.2466, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.96875, + "rewards/margins": 2.90625, + "rewards/rejected": -8.875, + "step": 8890 + }, + { + "epoch": 0.4587037752866899, + "grad_norm": 6.276020631221267, + "learning_rate": 3.28340015854553e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.84375, + "logps/chosen": -748.0, + "logps/rejected": -1080.0, + "loss": 0.2414, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.84375, + "rewards/margins": 3.625, + "rewards/rejected": -9.5, + "step": 8900 + }, + { + "epoch": 0.45921917278701196, + "grad_norm": 8.235096667759928, + "learning_rate": 3.279127424423262e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.90625, + "logps/chosen": -748.0, + "logps/rejected": -1072.0, + "loss": 0.2287, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.65625, + "rewards/margins": 3.53125, + "rewards/rejected": -9.1875, + "step": 8910 + }, + { + "epoch": 0.4597345702873341, + "grad_norm": 9.027598854324367, + "learning_rate": 3.274852168156899e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.84375, + "logps/chosen": -764.0, + "logps/rejected": -1064.0, + "loss": 0.2371, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.84375, + "rewards/margins": 3.25, + "rewards/rejected": -9.125, + "step": 8920 + }, + { + "epoch": 0.46024996778765626, + "grad_norm": 9.17210617098644, + "learning_rate": 3.2705744035860436e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.953125, + "logps/chosen": -748.0, + "logps/rejected": -1040.0, + "loss": 0.2553, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.71875, + "rewards/margins": 2.875, + "rewards/rejected": -8.5625, + "step": 8930 + }, + { + "epoch": 0.46076536528797835, + "grad_norm": 9.81843589404486, + "learning_rate": 3.2662941445584147e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.078125, + "logps/chosen": -760.0, + "logps/rejected": -1056.0, + "loss": 0.2373, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.8125, + "rewards/margins": 3.109375, + "rewards/rejected": -8.9375, + "step": 8940 + }, + { + "epoch": 0.4612807627883005, + "grad_norm": 7.911591185624961, + "learning_rate": 3.2620114049298075e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.046875, + "logps/chosen": -776.0, + "logps/rejected": -1072.0, + "loss": 0.2381, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.96875, + "rewards/margins": 3.171875, + "rewards/rejected": -9.1875, + "step": 8950 + }, + { + "epoch": 0.4617961602886226, + "grad_norm": 8.229988567980968, + "learning_rate": 3.257726198564049e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.21875, + "logps/chosen": -776.0, + "logps/rejected": -1056.0, + "loss": 0.2751, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.875, + "rewards/margins": 3.046875, + "rewards/rejected": -8.9375, + "step": 8960 + }, + { + "epoch": 0.4623115577889447, + "grad_norm": 7.572903406250695, + "learning_rate": 3.2534385393329465e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.0, + "logps/chosen": -756.0, + "logps/rejected": -1040.0, + "loss": 0.2599, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.8125, + "rewards/margins": 3.015625, + "rewards/rejected": -8.8125, + "step": 8970 + }, + { + "epoch": 0.4628269552892669, + "grad_norm": 7.741056856397316, + "learning_rate": 3.249148441116254e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.921875, + "logps/chosen": -704.0, + "logps/rejected": -1080.0, + "loss": 0.2467, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.5, + "rewards/margins": 3.53125, + "rewards/rejected": -9.0625, + "step": 8980 + }, + { + "epoch": 0.46334235278958896, + "grad_norm": 7.3261728879636365, + "learning_rate": 3.2448559178016156e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.9375, + "logps/chosen": -756.0, + "logps/rejected": -1072.0, + "loss": 0.2434, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.71875, + "rewards/margins": 3.421875, + "rewards/rejected": -9.125, + "step": 8990 + }, + { + "epoch": 0.4638577502899111, + "grad_norm": 8.822748554195476, + "learning_rate": 3.2405609832845273e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.09375, + "logps/chosen": -752.0, + "logps/rejected": -1024.0, + "loss": 0.2711, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.75, + "rewards/margins": 2.9375, + "rewards/rejected": -8.6875, + "step": 9000 + }, + { + "epoch": 0.4643731477902332, + "grad_norm": 6.748427312025938, + "learning_rate": 3.236263651468292e-07, + "logits/chosen": -3.4375, + "logits/rejected": -2.9375, + "logps/chosen": -688.0, + "logps/rejected": -1032.0, + "loss": 0.2372, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.125, + "rewards/margins": 3.453125, + "rewards/rejected": -8.5625, + "step": 9010 + }, + { + "epoch": 0.46488854529055534, + "grad_norm": 7.311778252940069, + "learning_rate": 3.2319639362639714e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.3125, + "logps/chosen": -664.0, + "logps/rejected": -952.0, + "loss": 0.2507, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.0, + "rewards/margins": 2.984375, + "rewards/rejected": -8.0, + "step": 9020 + }, + { + "epoch": 0.4654039427908775, + "grad_norm": 5.437161937314265, + "learning_rate": 3.227661851590344e-07, + "logits/chosen": -3.484375, + "logits/rejected": -3.328125, + "logps/chosen": -728.0, + "logps/rejected": -1024.0, + "loss": 0.2558, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -5.40625, + "rewards/margins": 3.125, + "rewards/rejected": -8.5625, + "step": 9030 + }, + { + "epoch": 0.4659193402911996, + "grad_norm": 9.101450275338841, + "learning_rate": 3.223357411373857e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.0, + "logps/chosen": -708.0, + "logps/rejected": -1016.0, + "loss": 0.2577, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.40625, + "rewards/margins": 3.265625, + "rewards/rejected": -8.6875, + "step": 9040 + }, + { + "epoch": 0.4664347377915217, + "grad_norm": 8.09364858455947, + "learning_rate": 3.219050629548582e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.78125, + "logps/chosen": -756.0, + "logps/rejected": -1088.0, + "loss": 0.2467, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.84375, + "rewards/margins": 3.34375, + "rewards/rejected": -9.1875, + "step": 9050 + }, + { + "epoch": 0.4669501352918438, + "grad_norm": 8.40319121096191, + "learning_rate": 3.2147415200561753e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -748.0, + "logps/rejected": -1048.0, + "loss": 0.2364, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.71875, + "rewards/margins": 3.0625, + "rewards/rejected": -8.75, + "step": 9060 + }, + { + "epoch": 0.46746553279216596, + "grad_norm": 8.907665020341124, + "learning_rate": 3.210430096845822e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.875, + "logps/chosen": -680.0, + "logps/rejected": -1008.0, + "loss": 0.261, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.03125, + "rewards/margins": 3.328125, + "rewards/rejected": -8.375, + "step": 9070 + }, + { + "epoch": 0.4679809302924881, + "grad_norm": 8.561610397267481, + "learning_rate": 3.2061163738742024e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.0, + "logps/chosen": -700.0, + "logps/rejected": -992.0, + "loss": 0.2576, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.28125, + "rewards/margins": 3.109375, + "rewards/rejected": -8.375, + "step": 9080 + }, + { + "epoch": 0.4684963277928102, + "grad_norm": 9.212565299002526, + "learning_rate": 3.201800365105437e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.0, + "logps/chosen": -784.0, + "logps/rejected": -1080.0, + "loss": 0.2517, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.9375, + "rewards/margins": 3.0625, + "rewards/rejected": -9.0, + "step": 9090 + }, + { + "epoch": 0.46901172529313234, + "grad_norm": 10.451731136200195, + "learning_rate": 3.19748208451105e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.9375, + "logps/chosen": -752.0, + "logps/rejected": -1096.0, + "loss": 0.2437, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.9375, + "rewards/margins": 3.46875, + "rewards/rejected": -9.375, + "step": 9100 + }, + { + "epoch": 0.46952712279345443, + "grad_norm": 8.782556146121012, + "learning_rate": 3.193161546069917e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.984375, + "logps/chosen": -776.0, + "logps/rejected": -1056.0, + "loss": 0.2556, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0625, + "rewards/margins": 2.890625, + "rewards/rejected": -8.9375, + "step": 9110 + }, + { + "epoch": 0.4700425202937766, + "grad_norm": 8.891646980169648, + "learning_rate": 3.188838763768223e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.015625, + "logps/chosen": -732.0, + "logps/rejected": -1032.0, + "loss": 0.2337, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.625, + "rewards/margins": 3.1875, + "rewards/rejected": -8.8125, + "step": 9120 + }, + { + "epoch": 0.4705579177940987, + "grad_norm": 7.835101154335234, + "learning_rate": 3.184513751599417e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.953125, + "logps/chosen": -764.0, + "logps/rejected": -1056.0, + "loss": 0.2557, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.90625, + "rewards/margins": 3.125, + "rewards/rejected": -9.0, + "step": 9130 + }, + { + "epoch": 0.4710733152944208, + "grad_norm": 8.118988147551542, + "learning_rate": 3.1801865235641655e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.75, + "logps/chosen": -724.0, + "logps/rejected": -1032.0, + "loss": 0.2563, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.625, + "rewards/margins": 3.0, + "rewards/rejected": -8.625, + "step": 9140 + }, + { + "epoch": 0.47158871279474296, + "grad_norm": 7.738285669699191, + "learning_rate": 3.1758570936703105e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.1875, + "logps/chosen": -740.0, + "logps/rejected": -1040.0, + "loss": 0.2619, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.6875, + "rewards/margins": 3.109375, + "rewards/rejected": -8.75, + "step": 9150 + }, + { + "epoch": 0.47210411029506505, + "grad_norm": 6.76665567639257, + "learning_rate": 3.1715254759328197e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.03125, + "logps/chosen": -728.0, + "logps/rejected": -1004.0, + "loss": 0.2514, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.625, + "rewards/margins": 2.859375, + "rewards/rejected": -8.5, + "step": 9160 + }, + { + "epoch": 0.4726195077953872, + "grad_norm": 8.537943176694204, + "learning_rate": 3.167191684373743e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.90625, + "logps/chosen": -732.0, + "logps/rejected": -1048.0, + "loss": 0.2385, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.53125, + "rewards/margins": 3.328125, + "rewards/rejected": -8.875, + "step": 9170 + }, + { + "epoch": 0.47313490529570934, + "grad_norm": 8.546424521161429, + "learning_rate": 3.1628557330221687e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.84375, + "logps/chosen": -748.0, + "logps/rejected": -1120.0, + "loss": 0.2549, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.71875, + "rewards/margins": 3.734375, + "rewards/rejected": -9.4375, + "step": 9180 + }, + { + "epoch": 0.47365030279603143, + "grad_norm": 5.457415745095585, + "learning_rate": 3.158517635914175e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.890625, + "logps/chosen": -708.0, + "logps/rejected": -1048.0, + "loss": 0.2537, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.34375, + "rewards/margins": 3.3125, + "rewards/rejected": -8.6875, + "step": 9190 + }, + { + "epoch": 0.4741657002963536, + "grad_norm": 8.875559614457645, + "learning_rate": 3.154177407092787e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.890625, + "logps/chosen": -712.0, + "logps/rejected": -1008.0, + "loss": 0.2637, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.46875, + "rewards/margins": 3.046875, + "rewards/rejected": -8.5, + "step": 9200 + }, + { + "epoch": 0.47468109779667567, + "grad_norm": 7.362711827755383, + "learning_rate": 3.1498350606079324e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.8125, + "logps/chosen": -776.0, + "logps/rejected": -1032.0, + "loss": 0.2609, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.65625, + "rewards/margins": 2.921875, + "rewards/rejected": -8.5625, + "step": 9210 + }, + { + "epoch": 0.4751964952969978, + "grad_norm": 7.8252898091638725, + "learning_rate": 3.1454906105163894e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.96875, + "logps/chosen": -736.0, + "logps/rejected": -1040.0, + "loss": 0.2433, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.53125, + "rewards/margins": 3.265625, + "rewards/rejected": -8.8125, + "step": 9220 + }, + { + "epoch": 0.47571189279731996, + "grad_norm": 8.535084498118263, + "learning_rate": 3.14114407088175e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.875, + "logps/chosen": -756.0, + "logps/rejected": -1088.0, + "loss": 0.2549, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.96875, + "rewards/margins": 3.34375, + "rewards/rejected": -9.3125, + "step": 9230 + }, + { + "epoch": 0.47622729029764205, + "grad_norm": 8.321454341494203, + "learning_rate": 3.136795455774369e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.84375, + "logps/chosen": -764.0, + "logps/rejected": -1056.0, + "loss": 0.2534, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.9375, + "rewards/margins": 2.953125, + "rewards/rejected": -8.875, + "step": 9240 + }, + { + "epoch": 0.4767426877979642, + "grad_norm": 7.936491209068606, + "learning_rate": 3.1324447792713194e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.015625, + "logps/chosen": -720.0, + "logps/rejected": -1040.0, + "loss": 0.2369, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.46875, + "rewards/margins": 3.34375, + "rewards/rejected": -8.8125, + "step": 9250 + }, + { + "epoch": 0.4772580852982863, + "grad_norm": 8.915457395722392, + "learning_rate": 3.1280920554563484e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.96875, + "logps/chosen": -748.0, + "logps/rejected": -1080.0, + "loss": 0.2526, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.875, + "rewards/margins": 3.34375, + "rewards/rejected": -9.1875, + "step": 9260 + }, + { + "epoch": 0.47777348279860843, + "grad_norm": 7.184752069656401, + "learning_rate": 3.1237372984198284e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.03125, + "logps/chosen": -768.0, + "logps/rejected": -1064.0, + "loss": 0.2398, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.6875, + "rewards/margins": 3.25, + "rewards/rejected": -8.9375, + "step": 9270 + }, + { + "epoch": 0.4782888802989306, + "grad_norm": 7.359467838958708, + "learning_rate": 3.1193805222587155e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.109375, + "logps/chosen": -744.0, + "logps/rejected": -1032.0, + "loss": 0.2637, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.5, + "rewards/margins": 3.140625, + "rewards/rejected": -8.625, + "step": 9280 + }, + { + "epoch": 0.47880427779925266, + "grad_norm": 7.737050958748853, + "learning_rate": 3.115021741076503e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.125, + "logps/chosen": -760.0, + "logps/rejected": -1012.0, + "loss": 0.2605, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.71875, + "rewards/margins": 2.734375, + "rewards/rejected": -8.4375, + "step": 9290 + }, + { + "epoch": 0.4793196752995748, + "grad_norm": 6.5232994384972, + "learning_rate": 3.1106609689831716e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.0625, + "logps/chosen": -744.0, + "logps/rejected": -1048.0, + "loss": 0.2422, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.625, + "rewards/margins": 3.296875, + "rewards/rejected": -8.875, + "step": 9300 + }, + { + "epoch": 0.4798350727998969, + "grad_norm": 10.271933119633609, + "learning_rate": 3.1062982200951495e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.921875, + "logps/chosen": -760.0, + "logps/rejected": -1048.0, + "loss": 0.2407, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.84375, + "rewards/margins": 3.0, + "rewards/rejected": -8.875, + "step": 9310 + }, + { + "epoch": 0.48035047030021905, + "grad_norm": 7.9565148257124525, + "learning_rate": 3.1019335085352617e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.015625, + "logps/chosen": -760.0, + "logps/rejected": -1064.0, + "loss": 0.25, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.03125, + "rewards/margins": 2.953125, + "rewards/rejected": -9.0, + "step": 9320 + }, + { + "epoch": 0.4808658678005412, + "grad_norm": 7.000524768206913, + "learning_rate": 3.0975668484326887e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.078125, + "logps/chosen": -772.0, + "logps/rejected": -1096.0, + "loss": 0.2426, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.0, + "rewards/margins": 3.28125, + "rewards/rejected": -9.25, + "step": 9330 + }, + { + "epoch": 0.4813812653008633, + "grad_norm": 6.834331714675262, + "learning_rate": 3.09319825392292e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.046875, + "logps/chosen": -824.0, + "logps/rejected": -1168.0, + "loss": 0.2474, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.34375, + "rewards/margins": 3.703125, + "rewards/rejected": -10.0625, + "step": 9340 + }, + { + "epoch": 0.4818966628011854, + "grad_norm": 9.513655834601625, + "learning_rate": 3.0888277391477017e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.9375, + "logps/chosen": -756.0, + "logps/rejected": -1112.0, + "loss": 0.246, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.90625, + "rewards/margins": 3.75, + "rewards/rejected": -9.625, + "step": 9350 + }, + { + "epoch": 0.4824120603015075, + "grad_norm": 7.3788625190022605, + "learning_rate": 3.084455318255002e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.890625, + "logps/chosen": -716.0, + "logps/rejected": -1032.0, + "loss": 0.2592, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.65625, + "rewards/margins": 3.125, + "rewards/rejected": -8.8125, + "step": 9360 + }, + { + "epoch": 0.48292745780182966, + "grad_norm": 7.158613137397024, + "learning_rate": 3.080081005398956e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.984375, + "logps/chosen": -728.0, + "logps/rejected": -1040.0, + "loss": 0.2313, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.71875, + "rewards/margins": 3.1875, + "rewards/rejected": -8.875, + "step": 9370 + }, + { + "epoch": 0.4834428553021518, + "grad_norm": 9.230761241893637, + "learning_rate": 3.0757048147398226e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.15625, + "logps/chosen": -748.0, + "logps/rejected": -1080.0, + "loss": 0.2369, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.75, + "rewards/margins": 3.34375, + "rewards/rejected": -9.125, + "step": 9380 + }, + { + "epoch": 0.4839582528024739, + "grad_norm": 11.685970718720478, + "learning_rate": 3.0713267604439447e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.90625, + "logps/chosen": -788.0, + "logps/rejected": -1128.0, + "loss": 0.2511, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.28125, + "rewards/rejected": -9.4375, + "step": 9390 + }, + { + "epoch": 0.48447365030279604, + "grad_norm": 10.919412460365638, + "learning_rate": 3.0669468566836914e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.15625, + "logps/chosen": -776.0, + "logps/rejected": -1048.0, + "loss": 0.2469, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.0, + "rewards/margins": 2.953125, + "rewards/rejected": -8.9375, + "step": 9400 + }, + { + "epoch": 0.48498904780311813, + "grad_norm": 10.734092647894249, + "learning_rate": 3.0625651176374233e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.875, + "logps/chosen": -752.0, + "logps/rejected": -1088.0, + "loss": 0.2728, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.8125, + "rewards/margins": 3.5, + "rewards/rejected": -9.3125, + "step": 9410 + }, + { + "epoch": 0.4855044453034403, + "grad_norm": 6.038535031260593, + "learning_rate": 3.05818155748944e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.046875, + "logps/chosen": -764.0, + "logps/rejected": -1040.0, + "loss": 0.2533, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.8125, + "rewards/margins": 2.859375, + "rewards/rejected": -8.6875, + "step": 9420 + }, + { + "epoch": 0.4860198428037624, + "grad_norm": 6.90299882303407, + "learning_rate": 3.0537961904299364e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.0, + "logps/chosen": -732.0, + "logps/rejected": -1008.0, + "loss": 0.2385, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.78125, + "rewards/margins": 2.8125, + "rewards/rejected": -8.625, + "step": 9430 + }, + { + "epoch": 0.4865352403040845, + "grad_norm": 6.8746407914801795, + "learning_rate": 3.049409030654958e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.015625, + "logps/chosen": -792.0, + "logps/rejected": -1072.0, + "loss": 0.2394, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.09375, + "rewards/margins": 2.9375, + "rewards/rejected": -9.0, + "step": 9440 + }, + { + "epoch": 0.48705063780440666, + "grad_norm": 8.550276844979006, + "learning_rate": 3.045020092366352e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.1875, + "logps/chosen": -816.0, + "logps/rejected": -1120.0, + "loss": 0.2619, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.453125, + "rewards/rejected": -9.75, + "step": 9450 + }, + { + "epoch": 0.48756603530472875, + "grad_norm": 6.640107036227005, + "learning_rate": 3.040629389771724e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.109375, + "logps/chosen": -804.0, + "logps/rejected": -1104.0, + "loss": 0.2528, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.171875, + "rewards/rejected": -9.3125, + "step": 9460 + }, + { + "epoch": 0.4880814328050509, + "grad_norm": 7.1877702688219385, + "learning_rate": 3.0362369370843895e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.03125, + "logps/chosen": -756.0, + "logps/rejected": -1080.0, + "loss": 0.2406, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.6875, + "rewards/margins": 3.25, + "rewards/rejected": -8.9375, + "step": 9470 + }, + { + "epoch": 0.48859683030537304, + "grad_norm": 7.2832816276711245, + "learning_rate": 3.0318427485233314e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -760.0, + "logps/rejected": -1064.0, + "loss": 0.2205, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.65625, + "rewards/margins": 3.453125, + "rewards/rejected": -9.125, + "step": 9480 + }, + { + "epoch": 0.48911222780569513, + "grad_norm": 8.08820230568925, + "learning_rate": 3.0274468383131494e-07, + "logits/chosen": -3.5, + "logits/rejected": -3.34375, + "logps/chosen": -720.0, + "logps/rejected": -1048.0, + "loss": 0.2543, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.53125, + "rewards/margins": 3.234375, + "rewards/rejected": -8.75, + "step": 9490 + }, + { + "epoch": 0.4896276253060173, + "grad_norm": 7.022900077546328, + "learning_rate": 3.0230492206840195e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.25, + "logps/chosen": -716.0, + "logps/rejected": -996.0, + "loss": 0.2511, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.53125, + "rewards/margins": 2.953125, + "rewards/rejected": -8.5, + "step": 9500 + }, + { + "epoch": 0.49014302280633937, + "grad_norm": 8.896703153424793, + "learning_rate": 3.018649909871641e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.9375, + "logps/chosen": -748.0, + "logps/rejected": -1080.0, + "loss": 0.2394, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.6875, + "rewards/margins": 3.421875, + "rewards/rejected": -9.0625, + "step": 9510 + }, + { + "epoch": 0.4906584203066615, + "grad_norm": 8.936847258468767, + "learning_rate": 3.014248920117198e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.265625, + "logps/chosen": -756.0, + "logps/rejected": -1064.0, + "loss": 0.241, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 3.140625, + "rewards/rejected": -9.0, + "step": 9520 + }, + { + "epoch": 0.49117381780698366, + "grad_norm": 11.78176364459783, + "learning_rate": 3.009846265667306e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.9375, + "logps/chosen": -768.0, + "logps/rejected": -1064.0, + "loss": 0.2483, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.78125, + "rewards/margins": 3.25, + "rewards/rejected": -9.0, + "step": 9530 + }, + { + "epoch": 0.49168921530730575, + "grad_norm": 5.538508809106594, + "learning_rate": 3.005441960773974e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.09375, + "logps/chosen": -772.0, + "logps/rejected": -1064.0, + "loss": 0.2437, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.03125, + "rewards/margins": 3.046875, + "rewards/rejected": -9.0625, + "step": 9540 + }, + { + "epoch": 0.4922046128076279, + "grad_norm": 12.176135096266846, + "learning_rate": 3.001036019694548e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.171875, + "logps/chosen": -772.0, + "logps/rejected": -1072.0, + "loss": 0.2602, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.6875, + "rewards/margins": 3.234375, + "rewards/rejected": -8.9375, + "step": 9550 + }, + { + "epoch": 0.49272001030795, + "grad_norm": 7.660529293807747, + "learning_rate": 2.996628456691676e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.9375, + "logps/chosen": -732.0, + "logps/rejected": -1064.0, + "loss": 0.2502, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.59375, + "rewards/margins": 3.546875, + "rewards/rejected": -9.125, + "step": 9560 + }, + { + "epoch": 0.49323540780827213, + "grad_norm": 7.261037045682567, + "learning_rate": 2.992219286033252e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.09375, + "logps/chosen": -756.0, + "logps/rejected": -1096.0, + "loss": 0.2295, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 3.40625, + "rewards/rejected": -9.25, + "step": 9570 + }, + { + "epoch": 0.4937508053085943, + "grad_norm": 8.397828334240286, + "learning_rate": 2.9878085219923777e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.0625, + "logps/chosen": -776.0, + "logps/rejected": -1072.0, + "loss": 0.2426, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.0, + "rewards/margins": 3.125, + "rewards/rejected": -9.125, + "step": 9580 + }, + { + "epoch": 0.49426620280891637, + "grad_norm": 8.656373620638727, + "learning_rate": 2.9833961788473105e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.0, + "logps/chosen": -760.0, + "logps/rejected": -1104.0, + "loss": 0.2511, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.90625, + "rewards/margins": 3.5625, + "rewards/rejected": -9.5, + "step": 9590 + }, + { + "epoch": 0.4947816003092385, + "grad_norm": 10.028490034899654, + "learning_rate": 2.9789822708814195e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.96875, + "logps/chosen": -736.0, + "logps/rejected": -1064.0, + "loss": 0.244, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.46875, + "rewards/margins": 3.4375, + "rewards/rejected": -8.9375, + "step": 9600 + }, + { + "epoch": 0.4952969978095606, + "grad_norm": 7.240311882649063, + "learning_rate": 2.9745668123831414e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.03125, + "logps/chosen": -744.0, + "logps/rejected": -1056.0, + "loss": 0.2396, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.6875, + "rewards/margins": 3.4375, + "rewards/rejected": -9.125, + "step": 9610 + }, + { + "epoch": 0.49581239530988275, + "grad_norm": 7.166534441783562, + "learning_rate": 2.9701498176459303e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.8125, + "logps/chosen": -728.0, + "logps/rejected": -1032.0, + "loss": 0.2627, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.53125, + "rewards/margins": 3.28125, + "rewards/rejected": -8.8125, + "step": 9620 + }, + { + "epoch": 0.4963277928102049, + "grad_norm": 11.108731930142365, + "learning_rate": 2.965731300968214e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.015625, + "logps/chosen": -744.0, + "logps/rejected": -1064.0, + "loss": 0.2679, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.65625, + "rewards/margins": 3.328125, + "rewards/rejected": -9.0, + "step": 9630 + }, + { + "epoch": 0.496843190310527, + "grad_norm": 6.470791658068663, + "learning_rate": 2.9613112766533475e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.828125, + "logps/chosen": -768.0, + "logps/rejected": -1048.0, + "loss": 0.2476, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.0, + "rewards/margins": 2.9375, + "rewards/rejected": -8.9375, + "step": 9640 + }, + { + "epoch": 0.49735858781084913, + "grad_norm": 8.103257641098397, + "learning_rate": 2.956889759009565e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.859375, + "logps/chosen": -740.0, + "logps/rejected": -1024.0, + "loss": 0.2253, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.625, + "rewards/margins": 3.203125, + "rewards/rejected": -8.8125, + "step": 9650 + }, + { + "epoch": 0.4978739853111712, + "grad_norm": 8.04573779244103, + "learning_rate": 2.9524667623499363e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.9375, + "logps/chosen": -764.0, + "logps/rejected": -1096.0, + "loss": 0.251, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.78125, + "rewards/margins": 3.5625, + "rewards/rejected": -9.3125, + "step": 9660 + }, + { + "epoch": 0.49838938281149336, + "grad_norm": 7.081853432656316, + "learning_rate": 2.948042300992317e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.84375, + "logps/chosen": -744.0, + "logps/rejected": -1112.0, + "loss": 0.2384, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.6875, + "rewards/margins": 3.8125, + "rewards/rejected": -9.5, + "step": 9670 + }, + { + "epoch": 0.4989047803118155, + "grad_norm": 6.128266276771616, + "learning_rate": 2.9436163892593063e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.15625, + "logps/chosen": -776.0, + "logps/rejected": -1088.0, + "loss": 0.2476, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.0, + "rewards/margins": 3.265625, + "rewards/rejected": -9.3125, + "step": 9680 + }, + { + "epoch": 0.4994201778121376, + "grad_norm": 8.953717828043107, + "learning_rate": 2.9391890414781977e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.8125, + "logps/chosen": -804.0, + "logps/rejected": -1152.0, + "loss": 0.2361, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.375, + "rewards/margins": 3.671875, + "rewards/rejected": -10.0625, + "step": 9690 + }, + { + "epoch": 0.49993557531245975, + "grad_norm": 7.743727822315821, + "learning_rate": 2.9347602719809326e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.078125, + "logps/chosen": -788.0, + "logps/rejected": -1112.0, + "loss": 0.2497, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.09375, + "rewards/margins": 3.546875, + "rewards/rejected": -9.625, + "step": 9700 + }, + { + "epoch": 0.5004509728127818, + "grad_norm": 8.677076898040083, + "learning_rate": 2.9303300951040557e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.9375, + "logps/chosen": -780.0, + "logps/rejected": -1064.0, + "loss": 0.261, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.09375, + "rewards/margins": 2.9375, + "rewards/rejected": -9.0625, + "step": 9710 + }, + { + "epoch": 0.500966370313104, + "grad_norm": 7.240542920926351, + "learning_rate": 2.925898525188667e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.90625, + "logps/chosen": -768.0, + "logps/rejected": -1064.0, + "loss": 0.2268, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.9375, + "rewards/margins": 3.125, + "rewards/rejected": -9.0625, + "step": 9720 + }, + { + "epoch": 0.5014817678134261, + "grad_norm": 5.1971418284883315, + "learning_rate": 2.921465576580376e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.9375, + "logps/chosen": -788.0, + "logps/rejected": -1088.0, + "loss": 0.2566, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.03125, + "rewards/rejected": -9.3125, + "step": 9730 + }, + { + "epoch": 0.5019971653137483, + "grad_norm": 10.804273225673018, + "learning_rate": 2.9170312636292557e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.84375, + "logps/chosen": -772.0, + "logps/rejected": -1080.0, + "loss": 0.269, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.03125, + "rewards/margins": 3.125, + "rewards/rejected": -9.1875, + "step": 9740 + }, + { + "epoch": 0.5025125628140703, + "grad_norm": 7.4102825600655375, + "learning_rate": 2.912595600689795e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.796875, + "logps/chosen": -744.0, + "logps/rejected": -1040.0, + "loss": 0.2342, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.75, + "rewards/margins": 3.140625, + "rewards/rejected": -8.875, + "step": 9750 + }, + { + "epoch": 0.5030279603143925, + "grad_norm": 7.827222798751902, + "learning_rate": 2.9081586021208527e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.125, + "logps/chosen": -756.0, + "logps/rejected": -1064.0, + "loss": 0.2745, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.84375, + "rewards/margins": 3.25, + "rewards/rejected": -9.0625, + "step": 9760 + }, + { + "epoch": 0.5035433578147146, + "grad_norm": 10.102412811432991, + "learning_rate": 2.9037202822856133e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.71875, + "logps/chosen": -736.0, + "logps/rejected": -1072.0, + "loss": 0.2091, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.53125, + "rewards/margins": 3.609375, + "rewards/rejected": -9.125, + "step": 9770 + }, + { + "epoch": 0.5040587553150367, + "grad_norm": 7.758895230049078, + "learning_rate": 2.899280655551535e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.953125, + "logps/chosen": -772.0, + "logps/rejected": -1104.0, + "loss": 0.2316, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.96875, + "rewards/margins": 3.640625, + "rewards/rejected": -9.625, + "step": 9780 + }, + { + "epoch": 0.5045741528153589, + "grad_norm": 6.625808889242965, + "learning_rate": 2.894839736290311e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.109375, + "logps/chosen": -812.0, + "logps/rejected": -1168.0, + "loss": 0.2352, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.40625, + "rewards/margins": 3.6875, + "rewards/rejected": -10.0625, + "step": 9790 + }, + { + "epoch": 0.5050895503156809, + "grad_norm": 8.898376443400604, + "learning_rate": 2.890397538877813e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.09375, + "logps/chosen": -780.0, + "logps/rejected": -1080.0, + "loss": 0.2599, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.1875, + "rewards/rejected": -9.1875, + "step": 9800 + }, + { + "epoch": 0.5056049478160031, + "grad_norm": 8.40824443679867, + "learning_rate": 2.8859540776940555e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.140625, + "logps/chosen": -716.0, + "logps/rejected": -1088.0, + "loss": 0.2619, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.40625, + "rewards/margins": 3.671875, + "rewards/rejected": -9.0625, + "step": 9810 + }, + { + "epoch": 0.5061203453163252, + "grad_norm": 6.434630723891529, + "learning_rate": 2.881509367123141e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.0, + "logps/chosen": -736.0, + "logps/rejected": -1024.0, + "loss": 0.2463, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.71875, + "rewards/margins": 3.078125, + "rewards/rejected": -8.8125, + "step": 9820 + }, + { + "epoch": 0.5066357428166474, + "grad_norm": 5.9894057116221795, + "learning_rate": 2.877063421553218e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.875, + "logps/chosen": -724.0, + "logps/rejected": -1056.0, + "loss": 0.2348, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.65625, + "rewards/margins": 3.328125, + "rewards/rejected": -9.0, + "step": 9830 + }, + { + "epoch": 0.5071511403169695, + "grad_norm": 10.132604995552908, + "learning_rate": 2.8726162553764306e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.875, + "logps/chosen": -776.0, + "logps/rejected": -1072.0, + "loss": 0.2363, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.9375, + "rewards/margins": 3.0625, + "rewards/rejected": -9.0, + "step": 9840 + }, + { + "epoch": 0.5076665378172915, + "grad_norm": 7.231958530883204, + "learning_rate": 2.868167882988877e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.96875, + "logps/chosen": -720.0, + "logps/rejected": -996.0, + "loss": 0.2516, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.5625, + "rewards/margins": 2.828125, + "rewards/rejected": -8.375, + "step": 9850 + }, + { + "epoch": 0.5081819353176137, + "grad_norm": 10.670140129583364, + "learning_rate": 2.863718318790559e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.265625, + "logps/chosen": -744.0, + "logps/rejected": -1040.0, + "loss": 0.2346, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.59375, + "rewards/margins": 3.0, + "rewards/rejected": -8.625, + "step": 9860 + }, + { + "epoch": 0.5086973328179358, + "grad_norm": 7.5782241331896305, + "learning_rate": 2.859267577185336e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.265625, + "logps/chosen": -752.0, + "logps/rejected": -1056.0, + "loss": 0.2435, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.6875, + "rewards/margins": 3.1875, + "rewards/rejected": -8.875, + "step": 9870 + }, + { + "epoch": 0.509212730318258, + "grad_norm": 7.9760954464320655, + "learning_rate": 2.854815672580878e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.78125, + "logps/chosen": -728.0, + "logps/rejected": -1020.0, + "loss": 0.25, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.625, + "rewards/margins": 3.125, + "rewards/rejected": -8.75, + "step": 9880 + }, + { + "epoch": 0.5097281278185801, + "grad_norm": 8.142300779054027, + "learning_rate": 2.850362619388622e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.890625, + "logps/chosen": -732.0, + "logps/rejected": -1008.0, + "loss": 0.2701, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.65625, + "rewards/margins": 3.03125, + "rewards/rejected": -8.6875, + "step": 9890 + }, + { + "epoch": 0.5102435253189022, + "grad_norm": 6.584241941711037, + "learning_rate": 2.8459084320237213e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.890625, + "logps/chosen": -724.0, + "logps/rejected": -1064.0, + "loss": 0.2565, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.71875, + "rewards/margins": 3.390625, + "rewards/rejected": -9.0625, + "step": 9900 + }, + { + "epoch": 0.5107589228192243, + "grad_norm": 8.641333858504474, + "learning_rate": 2.841453124905002e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.875, + "logps/chosen": -760.0, + "logps/rejected": -1056.0, + "loss": 0.2407, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.8125, + "rewards/margins": 3.171875, + "rewards/rejected": -9.0, + "step": 9910 + }, + { + "epoch": 0.5112743203195464, + "grad_norm": 6.7600966584937945, + "learning_rate": 2.8369967124549143e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.890625, + "logps/chosen": -752.0, + "logps/rejected": -1048.0, + "loss": 0.2577, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.78125, + "rewards/margins": 3.125, + "rewards/rejected": -8.875, + "step": 9920 + }, + { + "epoch": 0.5117897178198686, + "grad_norm": 6.7931180990963815, + "learning_rate": 2.8325392090994857e-07, + "logits/chosen": -3.015625, + "logits/rejected": -2.875, + "logps/chosen": -748.0, + "logps/rejected": -1040.0, + "loss": 0.244, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.625, + "rewards/margins": 3.3125, + "rewards/rejected": -8.9375, + "step": 9930 + }, + { + "epoch": 0.5123051153201907, + "grad_norm": 5.757620995710984, + "learning_rate": 2.828080629268277e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.015625, + "logps/chosen": -748.0, + "logps/rejected": -1080.0, + "loss": 0.2218, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.71875, + "rewards/margins": 3.453125, + "rewards/rejected": -9.1875, + "step": 9940 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 7.582224112662788, + "learning_rate": 2.8236209873943333e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.90625, + "logps/chosen": -740.0, + "logps/rejected": -1088.0, + "loss": 0.2354, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.53125, + "rewards/margins": 3.671875, + "rewards/rejected": -9.1875, + "step": 9950 + }, + { + "epoch": 0.5133359103208349, + "grad_norm": 5.978696730775908, + "learning_rate": 2.8191602979141356e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.125, + "logps/chosen": -768.0, + "logps/rejected": -1104.0, + "loss": 0.2481, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.84375, + "rewards/margins": 3.640625, + "rewards/rejected": -9.5, + "step": 9960 + }, + { + "epoch": 0.5138513078211571, + "grad_norm": 7.49760769680377, + "learning_rate": 2.8146985752675576e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.046875, + "logps/chosen": -788.0, + "logps/rejected": -1152.0, + "loss": 0.2423, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.03125, + "rewards/margins": 3.796875, + "rewards/rejected": -9.8125, + "step": 9970 + }, + { + "epoch": 0.5143667053214792, + "grad_norm": 7.021551150906601, + "learning_rate": 2.810235833897819e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.90625, + "logps/chosen": -736.0, + "logps/rejected": -1048.0, + "loss": 0.2581, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.625, + "rewards/margins": 3.34375, + "rewards/rejected": -9.0, + "step": 9980 + }, + { + "epoch": 0.5148821028218014, + "grad_norm": 10.18805654913856, + "learning_rate": 2.805772088251434e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.109375, + "logps/chosen": -724.0, + "logps/rejected": -1056.0, + "loss": 0.243, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.5625, + "rewards/margins": 3.296875, + "rewards/rejected": -8.875, + "step": 9990 + }, + { + "epoch": 0.5153975003221234, + "grad_norm": 12.263753742637784, + "learning_rate": 2.8013073527781715e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.03125, + "logps/chosen": -760.0, + "logps/rejected": -1096.0, + "loss": 0.2445, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.90625, + "rewards/margins": 3.28125, + "rewards/rejected": -9.1875, + "step": 10000 + }, + { + "epoch": 0.5153975003221234, + "eval_logits/chosen": -2.875, + "eval_logits/rejected": -2.5, + "eval_logps/chosen": -768.0, + "eval_logps/rejected": -1144.0, + "eval_loss": 0.21323643624782562, + "eval_rewards/accuracies": 0.911282479763031, + "eval_rewards/chosen": -5.96875, + "eval_rewards/margins": 3.90625, + "eval_rewards/rejected": -9.875, + "eval_runtime": 3574.0528, + "eval_samples_per_second": 27.56, + "eval_steps_per_second": 0.431, + "step": 10000 + }, + { + "epoch": 0.5159128978224455, + "grad_norm": 7.1924587656115735, + "learning_rate": 2.7968416419310005e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.984375, + "logps/chosen": -808.0, + "logps/rejected": -1112.0, + "loss": 0.2505, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.25, + "rewards/margins": 3.375, + "rewards/rejected": -9.625, + "step": 10010 + }, + { + "epoch": 0.5164282953227677, + "grad_norm": 9.185097024350892, + "learning_rate": 2.792374970166052e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.0625, + "logps/chosen": -804.0, + "logps/rejected": -1112.0, + "loss": 0.2509, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.375, + "rewards/margins": 3.203125, + "rewards/rejected": -9.5625, + "step": 10020 + }, + { + "epoch": 0.5169436928230898, + "grad_norm": 9.390489245637262, + "learning_rate": 2.7879073519425636e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.859375, + "logps/chosen": -828.0, + "logps/rejected": -1160.0, + "loss": 0.2297, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.3125, + "rewards/margins": 3.390625, + "rewards/rejected": -9.6875, + "step": 10030 + }, + { + "epoch": 0.517459090323412, + "grad_norm": 9.330815457249457, + "learning_rate": 2.7834388017228384e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.765625, + "logps/chosen": -820.0, + "logps/rejected": -1144.0, + "loss": 0.2396, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.28125, + "rewards/margins": 3.34375, + "rewards/rejected": -9.625, + "step": 10040 + }, + { + "epoch": 0.517974487823734, + "grad_norm": 11.032639198456842, + "learning_rate": 2.7789693339721966e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.75, + "logps/chosen": -796.0, + "logps/rejected": -1160.0, + "loss": 0.2246, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.375, + "rewards/margins": 3.59375, + "rewards/rejected": -10.0, + "step": 10050 + }, + { + "epoch": 0.5184898853240562, + "grad_norm": 9.443395043557825, + "learning_rate": 2.774498963158928e-07, + "logits/chosen": -3.0625, + "logits/rejected": -2.890625, + "logps/chosen": -792.0, + "logps/rejected": -1112.0, + "loss": 0.2579, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 3.234375, + "rewards/rejected": -9.5625, + "step": 10060 + }, + { + "epoch": 0.5190052828243783, + "grad_norm": 6.6391061887930105, + "learning_rate": 2.770027703754248e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.921875, + "logps/chosen": -756.0, + "logps/rejected": -1080.0, + "loss": 0.2595, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.875, + "rewards/margins": 3.40625, + "rewards/rejected": -9.3125, + "step": 10070 + }, + { + "epoch": 0.5195206803247004, + "grad_norm": 9.324673674518799, + "learning_rate": 2.765555570232245e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.078125, + "logps/chosen": -792.0, + "logps/rejected": -1056.0, + "loss": 0.26, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.90625, + "rewards/margins": 2.984375, + "rewards/rejected": -8.875, + "step": 10080 + }, + { + "epoch": 0.5200360778250226, + "grad_norm": 8.309511886956113, + "learning_rate": 2.761082577069838e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.921875, + "logps/chosen": -724.0, + "logps/rejected": -1032.0, + "loss": 0.228, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.71875, + "rewards/margins": 3.140625, + "rewards/rejected": -8.875, + "step": 10090 + }, + { + "epoch": 0.5205514753253446, + "grad_norm": 5.933894195181388, + "learning_rate": 2.7566087387467323e-07, + "logits/chosen": -3.515625, + "logits/rejected": -2.96875, + "logps/chosen": -760.0, + "logps/rejected": -1096.0, + "loss": 0.2392, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.875, + "rewards/margins": 3.40625, + "rewards/rejected": -9.3125, + "step": 10100 + }, + { + "epoch": 0.5210668728256668, + "grad_norm": 8.390405076560777, + "learning_rate": 2.7521340697453635e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.953125, + "logps/chosen": -760.0, + "logps/rejected": -1128.0, + "loss": 0.2557, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.9375, + "rewards/margins": 3.6875, + "rewards/rejected": -9.625, + "step": 10110 + }, + { + "epoch": 0.5215822703259889, + "grad_norm": 8.420578830942118, + "learning_rate": 2.74765858455086e-07, + "logits/chosen": -3.375, + "logits/rejected": -2.984375, + "logps/chosen": -748.0, + "logps/rejected": -1088.0, + "loss": 0.2481, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.6875, + "rewards/margins": 3.515625, + "rewards/rejected": -9.1875, + "step": 10120 + }, + { + "epoch": 0.5220976678263111, + "grad_norm": 6.863140462976504, + "learning_rate": 2.7431822976509915e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.140625, + "logps/chosen": -772.0, + "logps/rejected": -1064.0, + "loss": 0.2336, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.96875, + "rewards/margins": 3.09375, + "rewards/rejected": -9.0625, + "step": 10130 + }, + { + "epoch": 0.5226130653266332, + "grad_norm": 7.891318059029568, + "learning_rate": 2.738705223536122e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.0625, + "logps/chosen": -816.0, + "logps/rejected": -1112.0, + "loss": 0.2419, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.21875, + "rewards/margins": 3.328125, + "rewards/rejected": -9.5625, + "step": 10140 + }, + { + "epoch": 0.5231284628269552, + "grad_norm": 7.093077053775605, + "learning_rate": 2.734227376699165e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.71875, + "logps/chosen": -780.0, + "logps/rejected": -1112.0, + "loss": 0.2376, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.0625, + "rewards/margins": 3.5625, + "rewards/rejected": -9.625, + "step": 10150 + }, + { + "epoch": 0.5236438603272774, + "grad_norm": 7.076046048900981, + "learning_rate": 2.729748771635536e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.765625, + "logps/chosen": -764.0, + "logps/rejected": -1120.0, + "loss": 0.238, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.96875, + "rewards/margins": 3.59375, + "rewards/rejected": -9.5625, + "step": 10160 + }, + { + "epoch": 0.5241592578275995, + "grad_norm": 6.84326515800556, + "learning_rate": 2.725269422843102e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.1875, + "logps/chosen": -768.0, + "logps/rejected": -1104.0, + "loss": 0.2571, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.875, + "rewards/margins": 3.40625, + "rewards/rejected": -9.25, + "step": 10170 + }, + { + "epoch": 0.5246746553279217, + "grad_norm": 9.501549753414404, + "learning_rate": 2.7207893448221425e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.015625, + "logps/chosen": -776.0, + "logps/rejected": -1104.0, + "loss": 0.259, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.265625, + "rewards/rejected": -9.375, + "step": 10180 + }, + { + "epoch": 0.5251900528282438, + "grad_norm": 7.483828679274859, + "learning_rate": 2.716308552075291e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.8125, + "logps/chosen": -800.0, + "logps/rejected": -1168.0, + "loss": 0.2406, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.75, + "rewards/rejected": -9.875, + "step": 10190 + }, + { + "epoch": 0.5257054503285659, + "grad_norm": 7.3716911008744335, + "learning_rate": 2.7118270591075004e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.71875, + "logps/chosen": -792.0, + "logps/rejected": -1144.0, + "loss": 0.2342, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.34375, + "rewards/margins": 3.5, + "rewards/rejected": -9.875, + "step": 10200 + }, + { + "epoch": 0.526220847828888, + "grad_norm": 8.909167732336547, + "learning_rate": 2.707344880425988e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.0, + "logps/chosen": -788.0, + "logps/rejected": -1136.0, + "loss": 0.2527, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.484375, + "rewards/rejected": -9.6875, + "step": 10210 + }, + { + "epoch": 0.5267362453292102, + "grad_norm": 12.248266502838, + "learning_rate": 2.7028620305401906e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.90625, + "logps/chosen": -796.0, + "logps/rejected": -1112.0, + "loss": 0.2409, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.90625, + "rewards/margins": 3.265625, + "rewards/rejected": -9.1875, + "step": 10220 + }, + { + "epoch": 0.5272516428295323, + "grad_norm": 9.120377809451485, + "learning_rate": 2.698378523961719e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.015625, + "logps/chosen": -776.0, + "logps/rejected": -1120.0, + "loss": 0.2352, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.5, + "rewards/rejected": -9.5625, + "step": 10230 + }, + { + "epoch": 0.5277670403298544, + "grad_norm": 9.153400504213309, + "learning_rate": 2.693894375204309e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.046875, + "logps/chosen": -756.0, + "logps/rejected": -1128.0, + "loss": 0.2241, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.90625, + "rewards/margins": 3.84375, + "rewards/rejected": -9.75, + "step": 10240 + }, + { + "epoch": 0.5282824378301765, + "grad_norm": 11.1544665493579, + "learning_rate": 2.689409598783774e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.25, + "logps/chosen": -776.0, + "logps/rejected": -1104.0, + "loss": 0.2431, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.6875, + "rewards/margins": 3.546875, + "rewards/rejected": -9.25, + "step": 10250 + }, + { + "epoch": 0.5287978353304986, + "grad_norm": 8.612640458177806, + "learning_rate": 2.684924209217962e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.109375, + "logps/chosen": -756.0, + "logps/rejected": -1048.0, + "loss": 0.2428, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.8125, + "rewards/margins": 3.21875, + "rewards/rejected": -9.0, + "step": 10260 + }, + { + "epoch": 0.5293132328308208, + "grad_norm": 7.6976606634569995, + "learning_rate": 2.680438221026703e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.0, + "logps/chosen": -732.0, + "logps/rejected": -1104.0, + "loss": 0.2359, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.65625, + "rewards/margins": 3.734375, + "rewards/rejected": -9.375, + "step": 10270 + }, + { + "epoch": 0.5298286303311429, + "grad_norm": 10.265290607646943, + "learning_rate": 2.675951648731768e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.015625, + "logps/chosen": -764.0, + "logps/rejected": -1056.0, + "loss": 0.2592, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.90625, + "rewards/margins": 3.109375, + "rewards/rejected": -9.0, + "step": 10280 + }, + { + "epoch": 0.5303440278314651, + "grad_norm": 6.652919989036308, + "learning_rate": 2.6714645068568153e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.796875, + "logps/chosen": -800.0, + "logps/rejected": -1072.0, + "loss": 0.2472, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 2.828125, + "rewards/rejected": -9.0, + "step": 10290 + }, + { + "epoch": 0.5308594253317871, + "grad_norm": 8.344427239774305, + "learning_rate": 2.66697680992735e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.03125, + "logps/chosen": -804.0, + "logps/rejected": -1096.0, + "loss": 0.2702, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.3125, + "rewards/margins": 2.96875, + "rewards/rejected": -9.3125, + "step": 10300 + }, + { + "epoch": 0.5313748228321092, + "grad_norm": 10.67962767620695, + "learning_rate": 2.6624885724706715e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.109375, + "logps/chosen": -768.0, + "logps/rejected": -1080.0, + "loss": 0.235, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.875, + "rewards/margins": 3.34375, + "rewards/rejected": -9.1875, + "step": 10310 + }, + { + "epoch": 0.5318902203324314, + "grad_norm": 7.154323686933887, + "learning_rate": 2.657999809015831e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.09375, + "logps/chosen": -796.0, + "logps/rejected": -1096.0, + "loss": 0.2208, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.90625, + "rewards/margins": 3.328125, + "rewards/rejected": -9.1875, + "step": 10320 + }, + { + "epoch": 0.5324056178327535, + "grad_norm": 7.294921222528877, + "learning_rate": 2.6535105340935814e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.125, + "logps/chosen": -804.0, + "logps/rejected": -1136.0, + "loss": 0.2454, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.25, + "rewards/margins": 3.34375, + "rewards/rejected": -9.625, + "step": 10330 + }, + { + "epoch": 0.5329210153330757, + "grad_norm": 8.024347290244307, + "learning_rate": 2.649020762236331e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.8125, + "logps/chosen": -788.0, + "logps/rejected": -1128.0, + "loss": 0.226, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.125, + "rewards/margins": 3.65625, + "rewards/rejected": -9.8125, + "step": 10340 + }, + { + "epoch": 0.5334364128333977, + "grad_norm": 12.192152530937225, + "learning_rate": 2.644530507978098e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.9375, + "logps/chosen": -756.0, + "logps/rejected": -1088.0, + "loss": 0.227, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.875, + "rewards/margins": 3.421875, + "rewards/rejected": -9.3125, + "step": 10350 + }, + { + "epoch": 0.5339518103337199, + "grad_norm": 12.059105377963906, + "learning_rate": 2.6400397858544616e-07, + "logits/chosen": -3.609375, + "logits/rejected": -3.265625, + "logps/chosen": -792.0, + "logps/rejected": -1088.0, + "loss": 0.2775, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.03125, + "rewards/margins": 3.140625, + "rewards/rejected": -9.1875, + "step": 10360 + }, + { + "epoch": 0.534467207834042, + "grad_norm": 7.725107570212059, + "learning_rate": 2.6355486104025143e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.953125, + "logps/chosen": -744.0, + "logps/rejected": -1056.0, + "loss": 0.2458, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.8125, + "rewards/margins": 3.046875, + "rewards/rejected": -8.875, + "step": 10370 + }, + { + "epoch": 0.5349826053343641, + "grad_norm": 8.914347454299433, + "learning_rate": 2.631056996160818e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -772.0, + "logps/rejected": -1080.0, + "loss": 0.2433, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.0, + "rewards/margins": 3.125, + "rewards/rejected": -9.125, + "step": 10380 + }, + { + "epoch": 0.5354980028346863, + "grad_norm": 8.238431557417739, + "learning_rate": 2.626564957669354e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.03125, + "logps/chosen": -804.0, + "logps/rejected": -1120.0, + "loss": 0.2422, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.375, + "rewards/rejected": -9.5625, + "step": 10390 + }, + { + "epoch": 0.5360134003350083, + "grad_norm": 11.215623342002198, + "learning_rate": 2.6220725094694773e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.015625, + "logps/chosen": -832.0, + "logps/rejected": -1136.0, + "loss": 0.2529, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.34375, + "rewards/margins": 3.359375, + "rewards/rejected": -9.6875, + "step": 10400 + }, + { + "epoch": 0.5365287978353305, + "grad_norm": 8.401879585442614, + "learning_rate": 2.6175796661038683e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.765625, + "logps/chosen": -776.0, + "logps/rejected": -1144.0, + "loss": 0.2323, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.21875, + "rewards/margins": 3.640625, + "rewards/rejected": -9.875, + "step": 10410 + }, + { + "epoch": 0.5370441953356526, + "grad_norm": 8.16086348639431, + "learning_rate": 2.613086442116488e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.984375, + "logps/chosen": -788.0, + "logps/rejected": -1104.0, + "loss": 0.2522, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.03125, + "rewards/margins": 3.421875, + "rewards/rejected": -9.5, + "step": 10420 + }, + { + "epoch": 0.5375595928359748, + "grad_norm": 9.0215240165115, + "learning_rate": 2.6085928520525294e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.890625, + "logps/chosen": -784.0, + "logps/rejected": -1080.0, + "loss": 0.2327, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.140625, + "rewards/rejected": -9.25, + "step": 10430 + }, + { + "epoch": 0.5380749903362969, + "grad_norm": 7.244521595990731, + "learning_rate": 2.604098910458369e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.125, + "logps/chosen": -728.0, + "logps/rejected": -1088.0, + "loss": 0.2428, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.59375, + "rewards/margins": 3.6875, + "rewards/rejected": -9.25, + "step": 10440 + }, + { + "epoch": 0.538590387836619, + "grad_norm": 12.852394184711562, + "learning_rate": 2.599604631881522e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.0625, + "logps/chosen": -736.0, + "logps/rejected": -1040.0, + "loss": 0.24, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.5625, + "rewards/margins": 3.234375, + "rewards/rejected": -8.8125, + "step": 10450 + }, + { + "epoch": 0.5391057853369411, + "grad_norm": 9.284485426456582, + "learning_rate": 2.5951100308705965e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.0, + "logps/chosen": -768.0, + "logps/rejected": -1080.0, + "loss": 0.2492, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.90625, + "rewards/margins": 3.28125, + "rewards/rejected": -9.1875, + "step": 10460 + }, + { + "epoch": 0.5396211828372632, + "grad_norm": 9.869062222463283, + "learning_rate": 2.5906151219752403e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.15625, + "logps/chosen": -792.0, + "logps/rejected": -1112.0, + "loss": 0.2559, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.15625, + "rewards/margins": 3.375, + "rewards/rejected": -9.5, + "step": 10470 + }, + { + "epoch": 0.5401365803375854, + "grad_norm": 7.323305043798904, + "learning_rate": 2.586119919746103e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.828125, + "logps/chosen": -788.0, + "logps/rejected": -1088.0, + "loss": 0.2631, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.375, + "rewards/margins": 3.125, + "rewards/rejected": -9.5, + "step": 10480 + }, + { + "epoch": 0.5406519778379075, + "grad_norm": 10.062402536202727, + "learning_rate": 2.581624438734777e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.0625, + "logps/chosen": -816.0, + "logps/rejected": -1088.0, + "loss": 0.2583, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.40625, + "rewards/margins": 2.609375, + "rewards/rejected": -9.0, + "step": 10490 + }, + { + "epoch": 0.5411673753382296, + "grad_norm": 8.875126766162252, + "learning_rate": 2.577128693493764e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.15625, + "logps/chosen": -788.0, + "logps/rejected": -1144.0, + "loss": 0.2429, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.0, + "rewards/margins": 3.59375, + "rewards/rejected": -9.5625, + "step": 10500 + }, + { + "epoch": 0.5416827728385517, + "grad_norm": 9.299065128023315, + "learning_rate": 2.572632698576418e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.09375, + "logps/chosen": -756.0, + "logps/rejected": -1120.0, + "loss": 0.2432, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.09375, + "rewards/margins": 3.4375, + "rewards/rejected": -9.5625, + "step": 10510 + }, + { + "epoch": 0.5421981703388739, + "grad_norm": 9.908564048746523, + "learning_rate": 2.5681364685369e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.34375, + "logps/chosen": -832.0, + "logps/rejected": -1096.0, + "loss": 0.2433, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.375, + "rewards/margins": 2.953125, + "rewards/rejected": -9.3125, + "step": 10520 + }, + { + "epoch": 0.542713567839196, + "grad_norm": 11.78321709356453, + "learning_rate": 2.5636400179301343e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.203125, + "logps/chosen": -808.0, + "logps/rejected": -1144.0, + "loss": 0.2358, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.484375, + "rewards/rejected": -9.625, + "step": 10530 + }, + { + "epoch": 0.5432289653395181, + "grad_norm": 9.679291224667836, + "learning_rate": 2.5591433613117584e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.03125, + "logps/chosen": -772.0, + "logps/rejected": -1104.0, + "loss": 0.2616, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.9375, + "rewards/margins": 3.40625, + "rewards/rejected": -9.3125, + "step": 10540 + }, + { + "epoch": 0.5437443628398402, + "grad_norm": 8.646923504136428, + "learning_rate": 2.554646513238077e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.890625, + "logps/chosen": -752.0, + "logps/rejected": -1096.0, + "loss": 0.2547, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.875, + "rewards/margins": 3.53125, + "rewards/rejected": -9.375, + "step": 10550 + }, + { + "epoch": 0.5442597603401623, + "grad_norm": 9.34770865080901, + "learning_rate": 2.5501494882660136e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.125, + "logps/chosen": -768.0, + "logps/rejected": -1120.0, + "loss": 0.2399, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.5625, + "rewards/rejected": -9.625, + "step": 10560 + }, + { + "epoch": 0.5447751578404845, + "grad_norm": 13.211015952442356, + "learning_rate": 2.5456523009530644e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.921875, + "logps/chosen": -804.0, + "logps/rejected": -1128.0, + "loss": 0.2576, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.40625, + "rewards/margins": 3.453125, + "rewards/rejected": -9.875, + "step": 10570 + }, + { + "epoch": 0.5452905553408066, + "grad_norm": 6.693032277068501, + "learning_rate": 2.5411549658572526e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.03125, + "logps/chosen": -800.0, + "logps/rejected": -1120.0, + "loss": 0.2442, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.28125, + "rewards/margins": 3.390625, + "rewards/rejected": -9.6875, + "step": 10580 + }, + { + "epoch": 0.5458059528411288, + "grad_norm": 8.398836877742124, + "learning_rate": 2.53665749753708e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.859375, + "logps/chosen": -760.0, + "logps/rejected": -1104.0, + "loss": 0.2423, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.4375, + "rewards/rejected": -9.625, + "step": 10590 + }, + { + "epoch": 0.5463213503414508, + "grad_norm": 7.554045667748188, + "learning_rate": 2.5321599105514757e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.0, + "logps/chosen": -780.0, + "logps/rejected": -1080.0, + "loss": 0.2363, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.09375, + "rewards/margins": 3.25, + "rewards/rejected": -9.3125, + "step": 10600 + }, + { + "epoch": 0.5468367478417729, + "grad_norm": 10.215528937293652, + "learning_rate": 2.527662219459758e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.890625, + "logps/chosen": -768.0, + "logps/rejected": -1096.0, + "loss": 0.2441, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.03125, + "rewards/margins": 3.1875, + "rewards/rejected": -9.25, + "step": 10610 + }, + { + "epoch": 0.5473521453420951, + "grad_norm": 9.092748763979307, + "learning_rate": 2.52316443882158e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.046875, + "logps/chosen": -752.0, + "logps/rejected": -1080.0, + "loss": 0.2413, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.6875, + "rewards/margins": 3.5625, + "rewards/rejected": -9.25, + "step": 10620 + }, + { + "epoch": 0.5478675428424172, + "grad_norm": 6.378609015463464, + "learning_rate": 2.5186665831968833e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.015625, + "logps/chosen": -780.0, + "logps/rejected": -1080.0, + "loss": 0.2341, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.0, + "rewards/margins": 3.140625, + "rewards/rejected": -9.125, + "step": 10630 + }, + { + "epoch": 0.5483829403427394, + "grad_norm": 8.069222046243512, + "learning_rate": 2.514168667145855e-07, + "logits/chosen": -3.515625, + "logits/rejected": -3.296875, + "logps/chosen": -768.0, + "logps/rejected": -1096.0, + "loss": 0.2257, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.90625, + "rewards/margins": 3.21875, + "rewards/rejected": -9.125, + "step": 10640 + }, + { + "epoch": 0.5488983378430614, + "grad_norm": 11.837217783195698, + "learning_rate": 2.509670705228875e-07, + "logits/chosen": -3.46875, + "logits/rejected": -2.984375, + "logps/chosen": -748.0, + "logps/rejected": -1144.0, + "loss": 0.2462, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.875, + "rewards/margins": 3.890625, + "rewards/rejected": -9.75, + "step": 10650 + }, + { + "epoch": 0.5494137353433836, + "grad_norm": 6.596564283688587, + "learning_rate": 2.5051727120064754e-07, + "logits/chosen": -3.5, + "logits/rejected": -3.140625, + "logps/chosen": -772.0, + "logps/rejected": -1088.0, + "loss": 0.2248, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0, + "rewards/margins": 3.390625, + "rewards/rejected": -9.375, + "step": 10660 + }, + { + "epoch": 0.5499291328437057, + "grad_norm": 8.18410714567105, + "learning_rate": 2.5006747020392847e-07, + "logits/chosen": -3.5625, + "logits/rejected": -3.34375, + "logps/chosen": -784.0, + "logps/rejected": -1096.0, + "loss": 0.2479, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.09375, + "rewards/margins": 3.25, + "rewards/rejected": -9.375, + "step": 10670 + }, + { + "epoch": 0.5504445303440278, + "grad_norm": 8.36396486957546, + "learning_rate": 2.4961766898879894e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.96875, + "logps/chosen": -768.0, + "logps/rejected": -1112.0, + "loss": 0.2245, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.96875, + "rewards/margins": 3.4375, + "rewards/rejected": -9.375, + "step": 10680 + }, + { + "epoch": 0.55095992784435, + "grad_norm": 7.4891613238444465, + "learning_rate": 2.4916786901132827e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.984375, + "logps/chosen": -760.0, + "logps/rejected": -1112.0, + "loss": 0.2446, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.90625, + "rewards/margins": 3.546875, + "rewards/rejected": -9.4375, + "step": 10690 + }, + { + "epoch": 0.551475325344672, + "grad_norm": 9.056591495643145, + "learning_rate": 2.487180717275816e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.125, + "logps/chosen": -776.0, + "logps/rejected": -1120.0, + "loss": 0.236, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.46875, + "rewards/rejected": -9.625, + "step": 10700 + }, + { + "epoch": 0.5519907228449942, + "grad_norm": 8.547534584785021, + "learning_rate": 2.4826827859361545e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.03125, + "logps/chosen": -764.0, + "logps/rejected": -1080.0, + "loss": 0.2541, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.875, + "rewards/margins": 3.390625, + "rewards/rejected": -9.25, + "step": 10710 + }, + { + "epoch": 0.5525061203453163, + "grad_norm": 9.030079440492035, + "learning_rate": 2.478184910654729e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.828125, + "logps/chosen": -768.0, + "logps/rejected": -1160.0, + "loss": 0.2506, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.875, + "rewards/margins": 4.0625, + "rewards/rejected": -9.9375, + "step": 10720 + }, + { + "epoch": 0.5530215178456385, + "grad_norm": 8.655116978284024, + "learning_rate": 2.473687105991789e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.953125, + "logps/chosen": -752.0, + "logps/rejected": -1088.0, + "loss": 0.2398, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.84375, + "rewards/margins": 3.484375, + "rewards/rejected": -9.3125, + "step": 10730 + }, + { + "epoch": 0.5535369153459606, + "grad_norm": 8.029030602813451, + "learning_rate": 2.4691893865073555e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.1875, + "logps/chosen": -756.0, + "logps/rejected": -1088.0, + "loss": 0.2519, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.65625, + "rewards/margins": 3.4375, + "rewards/rejected": -9.125, + "step": 10740 + }, + { + "epoch": 0.5540523128462826, + "grad_norm": 7.158124320700343, + "learning_rate": 2.464691766761174e-07, + "logits/chosen": -3.5, + "logits/rejected": -2.875, + "logps/chosen": -760.0, + "logps/rejected": -1104.0, + "loss": 0.2279, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.75, + "rewards/margins": 3.515625, + "rewards/rejected": -9.25, + "step": 10750 + }, + { + "epoch": 0.5545677103466048, + "grad_norm": 9.085665419646137, + "learning_rate": 2.4601942613126645e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.859375, + "logps/chosen": -792.0, + "logps/rejected": -1128.0, + "loss": 0.2439, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.125, + "rewards/margins": 3.5625, + "rewards/rejected": -9.6875, + "step": 10760 + }, + { + "epoch": 0.5550831078469269, + "grad_norm": 7.294822417333419, + "learning_rate": 2.4556968847208807e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.21875, + "logps/chosen": -768.0, + "logps/rejected": -1080.0, + "loss": 0.2366, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.90625, + "rewards/margins": 3.3125, + "rewards/rejected": -9.25, + "step": 10770 + }, + { + "epoch": 0.5555985053472491, + "grad_norm": 7.9607340540719145, + "learning_rate": 2.4511996515444586e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.9375, + "logps/chosen": -772.0, + "logps/rejected": -1160.0, + "loss": 0.1875, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.0625, + "rewards/margins": 3.984375, + "rewards/rejected": -10.0625, + "step": 10780 + }, + { + "epoch": 0.5561139028475712, + "grad_norm": 9.134239570299465, + "learning_rate": 2.446702576341567e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.03125, + "logps/chosen": -784.0, + "logps/rejected": -1120.0, + "loss": 0.2453, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.125, + "rewards/margins": 3.390625, + "rewards/rejected": -9.5625, + "step": 10790 + }, + { + "epoch": 0.5566293003478933, + "grad_norm": 9.587007102221497, + "learning_rate": 2.4422056736698676e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.140625, + "logps/chosen": -772.0, + "logps/rejected": -1088.0, + "loss": 0.2436, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.875, + "rewards/margins": 3.453125, + "rewards/rejected": -9.3125, + "step": 10800 + }, + { + "epoch": 0.5571446978482154, + "grad_norm": 7.225580407091084, + "learning_rate": 2.4377089580864587e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.09375, + "logps/chosen": -788.0, + "logps/rejected": -1080.0, + "loss": 0.2419, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.21875, + "rewards/margins": 3.109375, + "rewards/rejected": -9.3125, + "step": 10810 + }, + { + "epoch": 0.5576600953485376, + "grad_norm": 5.710307493175006, + "learning_rate": 2.433212444147836e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.109375, + "logps/chosen": -788.0, + "logps/rejected": -1144.0, + "loss": 0.2303, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 3.5, + "rewards/rejected": -9.75, + "step": 10820 + }, + { + "epoch": 0.5581754928488597, + "grad_norm": 5.667559675985489, + "learning_rate": 2.428716146409844e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.15625, + "logps/chosen": -756.0, + "logps/rejected": -1104.0, + "loss": 0.2198, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.75, + "rewards/margins": 3.53125, + "rewards/rejected": -9.3125, + "step": 10830 + }, + { + "epoch": 0.5586908903491818, + "grad_norm": 6.2086699216465115, + "learning_rate": 2.4242200794276233e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.046875, + "logps/chosen": -740.0, + "logps/rejected": -1072.0, + "loss": 0.2166, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.625, + "rewards/margins": 3.390625, + "rewards/rejected": -9.0, + "step": 10840 + }, + { + "epoch": 0.5592062878495039, + "grad_norm": 7.715208340819855, + "learning_rate": 2.4197242577555713e-07, + "logits/chosen": -3.421875, + "logits/rejected": -2.96875, + "logps/chosen": -772.0, + "logps/rejected": -1128.0, + "loss": 0.2392, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.9375, + "rewards/margins": 3.671875, + "rewards/rejected": -9.625, + "step": 10850 + }, + { + "epoch": 0.559721685349826, + "grad_norm": 7.586264965374173, + "learning_rate": 2.4152286959472876e-07, + "logits/chosen": -3.53125, + "logits/rejected": -3.203125, + "logps/chosen": -760.0, + "logps/rejected": -1064.0, + "loss": 0.2593, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.78125, + "rewards/margins": 3.109375, + "rewards/rejected": -8.875, + "step": 10860 + }, + { + "epoch": 0.5602370828501482, + "grad_norm": 8.234497754003478, + "learning_rate": 2.410733408555533e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.046875, + "logps/chosen": -764.0, + "logps/rejected": -1056.0, + "loss": 0.2375, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0625, + "rewards/margins": 3.078125, + "rewards/rejected": -9.125, + "step": 10870 + }, + { + "epoch": 0.5607524803504703, + "grad_norm": 8.489217571656669, + "learning_rate": 2.406238410132181e-07, + "logits/chosen": -3.375, + "logits/rejected": -2.984375, + "logps/chosen": -740.0, + "logps/rejected": -1080.0, + "loss": 0.2299, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.6875, + "rewards/margins": 3.5625, + "rewards/rejected": -9.25, + "step": 10880 + }, + { + "epoch": 0.5612678778507925, + "grad_norm": 7.705607301289699, + "learning_rate": 2.401743715228166e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.9375, + "logps/chosen": -752.0, + "logps/rejected": -1112.0, + "loss": 0.2442, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.8125, + "rewards/margins": 3.6875, + "rewards/rejected": -9.5, + "step": 10890 + }, + { + "epoch": 0.5617832753511145, + "grad_norm": 8.91271382778467, + "learning_rate": 2.3972493383934446e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.03125, + "logps/chosen": -752.0, + "logps/rejected": -1088.0, + "loss": 0.2325, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.8125, + "rewards/margins": 3.546875, + "rewards/rejected": -9.375, + "step": 10900 + }, + { + "epoch": 0.5622986728514366, + "grad_norm": 9.822787081630242, + "learning_rate": 2.3927552941769385e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.234375, + "logps/chosen": -760.0, + "logps/rejected": -1096.0, + "loss": 0.2213, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 3.421875, + "rewards/rejected": -9.25, + "step": 10910 + }, + { + "epoch": 0.5628140703517588, + "grad_norm": 9.082301320961083, + "learning_rate": 2.3882615971264965e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.96875, + "logps/chosen": -816.0, + "logps/rejected": -1120.0, + "loss": 0.2416, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.5, + "rewards/margins": 3.171875, + "rewards/rejected": -9.6875, + "step": 10920 + }, + { + "epoch": 0.5633294678520809, + "grad_norm": 8.887482425259037, + "learning_rate": 2.3837682617888422e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.9375, + "logps/chosen": -784.0, + "logps/rejected": -1184.0, + "loss": 0.224, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.21875, + "rewards/margins": 4.0625, + "rewards/rejected": -10.25, + "step": 10930 + }, + { + "epoch": 0.5638448653524031, + "grad_norm": 10.58600833686479, + "learning_rate": 2.3792753027095295e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.296875, + "logps/chosen": -780.0, + "logps/rejected": -1104.0, + "loss": 0.2539, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.34375, + "rewards/rejected": -9.375, + "step": 10940 + }, + { + "epoch": 0.5643602628527251, + "grad_norm": 8.393359400843194, + "learning_rate": 2.374782734432893e-07, + "logits/chosen": -3.15625, + "logits/rejected": -3.171875, + "logps/chosen": -748.0, + "logps/rejected": -1080.0, + "loss": 0.2141, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.78125, + "rewards/margins": 3.328125, + "rewards/rejected": -9.125, + "step": 10950 + }, + { + "epoch": 0.5648756603530473, + "grad_norm": 7.765689586407587, + "learning_rate": 2.370290571502001e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.96875, + "logps/chosen": -724.0, + "logps/rejected": -1064.0, + "loss": 0.2375, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.625, + "rewards/margins": 3.46875, + "rewards/rejected": -9.0625, + "step": 10960 + }, + { + "epoch": 0.5653910578533694, + "grad_norm": 8.322552696813139, + "learning_rate": 2.3657988284586133e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.046875, + "logps/chosen": -708.0, + "logps/rejected": -1040.0, + "loss": 0.222, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.5, + "rewards/margins": 3.390625, + "rewards/rejected": -8.875, + "step": 10970 + }, + { + "epoch": 0.5659064553536916, + "grad_norm": 9.346091285487255, + "learning_rate": 2.3613075198431264e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.25, + "logps/chosen": -776.0, + "logps/rejected": -1064.0, + "loss": 0.2466, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.03125, + "rewards/margins": 3.125, + "rewards/rejected": -9.125, + "step": 10980 + }, + { + "epoch": 0.5664218528540137, + "grad_norm": 8.714317296689817, + "learning_rate": 2.3568166601945342e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.953125, + "logps/chosen": -740.0, + "logps/rejected": -1120.0, + "loss": 0.2179, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.75, + "rewards/margins": 3.875, + "rewards/rejected": -9.625, + "step": 10990 + }, + { + "epoch": 0.5669372503543357, + "grad_norm": 5.579791253046542, + "learning_rate": 2.3523262640503752e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.953125, + "logps/chosen": -728.0, + "logps/rejected": -1112.0, + "loss": 0.2276, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.625, + "rewards/margins": 3.625, + "rewards/rejected": -9.25, + "step": 11000 + }, + { + "epoch": 0.5674526478546579, + "grad_norm": 10.343707930157022, + "learning_rate": 2.347836345946686e-07, + "logits/chosen": -3.390625, + "logits/rejected": -2.890625, + "logps/chosen": -696.0, + "logps/rejected": -1096.0, + "loss": 0.231, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.40625, + "rewards/margins": 4.03125, + "rewards/rejected": -9.4375, + "step": 11010 + }, + { + "epoch": 0.56796804535498, + "grad_norm": 9.036838462292717, + "learning_rate": 2.3433469204179592e-07, + "logits/chosen": -3.46875, + "logits/rejected": -3.125, + "logps/chosen": -740.0, + "logps/rejected": -1104.0, + "loss": 0.2217, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.625, + "rewards/margins": 3.65625, + "rewards/rejected": -9.3125, + "step": 11020 + }, + { + "epoch": 0.5684834428553022, + "grad_norm": 9.553270619351782, + "learning_rate": 2.3388580019970906e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.921875, + "logps/chosen": -724.0, + "logps/rejected": -1048.0, + "loss": 0.2374, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.5625, + "rewards/margins": 3.359375, + "rewards/rejected": -8.9375, + "step": 11030 + }, + { + "epoch": 0.5689988403556243, + "grad_norm": 8.788204834465093, + "learning_rate": 2.3343696052153347e-07, + "logits/chosen": -3.46875, + "logits/rejected": -3.078125, + "logps/chosen": -692.0, + "logps/rejected": -1012.0, + "loss": 0.233, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.25, + "rewards/margins": 3.296875, + "rewards/rejected": -8.5625, + "step": 11040 + }, + { + "epoch": 0.5695142378559463, + "grad_norm": 5.736526905140815, + "learning_rate": 2.329881744602259e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.078125, + "logps/chosen": -768.0, + "logps/rejected": -1072.0, + "loss": 0.2348, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.8125, + "rewards/margins": 3.15625, + "rewards/rejected": -8.9375, + "step": 11050 + }, + { + "epoch": 0.5700296353562685, + "grad_norm": 10.28619628882453, + "learning_rate": 2.3253944346856916e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.109375, + "logps/chosen": -736.0, + "logps/rejected": -1056.0, + "loss": 0.2308, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.59375, + "rewards/margins": 3.28125, + "rewards/rejected": -8.875, + "step": 11060 + }, + { + "epoch": 0.5705450328565906, + "grad_norm": 7.8965864439418345, + "learning_rate": 2.3209076899916813e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.0, + "logps/chosen": -728.0, + "logps/rejected": -1064.0, + "loss": 0.2384, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.59375, + "rewards/margins": 3.3125, + "rewards/rejected": -8.875, + "step": 11070 + }, + { + "epoch": 0.5710604303569128, + "grad_norm": 9.102368136719674, + "learning_rate": 2.3164215250444475e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.0625, + "logps/chosen": -740.0, + "logps/rejected": -1048.0, + "loss": 0.2476, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.6875, + "rewards/margins": 3.171875, + "rewards/rejected": -8.875, + "step": 11080 + }, + { + "epoch": 0.5715758278572349, + "grad_norm": 10.375776502745843, + "learning_rate": 2.3119359543663295e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.96875, + "logps/chosen": -736.0, + "logps/rejected": -1072.0, + "loss": 0.2228, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.6875, + "rewards/margins": 3.453125, + "rewards/rejected": -9.125, + "step": 11090 + }, + { + "epoch": 0.572091225357557, + "grad_norm": 9.484151335882643, + "learning_rate": 2.3074509924777472e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.125, + "logps/chosen": -792.0, + "logps/rejected": -1104.0, + "loss": 0.2307, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.453125, + "rewards/rejected": -9.5625, + "step": 11100 + }, + { + "epoch": 0.5726066228578791, + "grad_norm": 8.942323803195384, + "learning_rate": 2.3029666538971456e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.109375, + "logps/chosen": -788.0, + "logps/rejected": -1184.0, + "loss": 0.2391, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.125, + "rewards/margins": 4.03125, + "rewards/rejected": -10.125, + "step": 11110 + }, + { + "epoch": 0.5731220203582013, + "grad_norm": 8.323990677019962, + "learning_rate": 2.2984829531409543e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.046875, + "logps/chosen": -788.0, + "logps/rejected": -1080.0, + "loss": 0.2359, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.015625, + "rewards/rejected": -9.125, + "step": 11120 + }, + { + "epoch": 0.5736374178585234, + "grad_norm": 5.543042901224358, + "learning_rate": 2.2939999047235382e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.671875, + "logps/chosen": -744.0, + "logps/rejected": -1104.0, + "loss": 0.2333, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.875, + "rewards/margins": 3.53125, + "rewards/rejected": -9.375, + "step": 11130 + }, + { + "epoch": 0.5741528153588455, + "grad_norm": 8.25252555187792, + "learning_rate": 2.289517523157149e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.796875, + "logps/chosen": -756.0, + "logps/rejected": -1072.0, + "loss": 0.2604, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.875, + "rewards/margins": 3.234375, + "rewards/rejected": -9.125, + "step": 11140 + }, + { + "epoch": 0.5746682128591676, + "grad_norm": 6.945452427948089, + "learning_rate": 2.285035822951883e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.921875, + "logps/chosen": -780.0, + "logps/rejected": -1104.0, + "loss": 0.2302, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.9375, + "rewards/margins": 3.46875, + "rewards/rejected": -9.375, + "step": 11150 + }, + { + "epoch": 0.5751836103594897, + "grad_norm": 11.121229164977212, + "learning_rate": 2.2805548186156262e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.15625, + "logps/chosen": -732.0, + "logps/rejected": -1072.0, + "loss": 0.2201, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.84375, + "rewards/margins": 3.28125, + "rewards/rejected": -9.125, + "step": 11160 + }, + { + "epoch": 0.5756990078598119, + "grad_norm": 8.95695296962324, + "learning_rate": 2.2760745246540147e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.96875, + "logps/chosen": -784.0, + "logps/rejected": -1136.0, + "loss": 0.2656, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.90625, + "rewards/margins": 3.84375, + "rewards/rejected": -9.75, + "step": 11170 + }, + { + "epoch": 0.576214405360134, + "grad_norm": 8.892588722270675, + "learning_rate": 2.2715949555703862e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.953125, + "logps/chosen": -760.0, + "logps/rejected": -1088.0, + "loss": 0.2553, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.78125, + "rewards/margins": 3.375, + "rewards/rejected": -9.1875, + "step": 11180 + }, + { + "epoch": 0.5767298028604562, + "grad_norm": 6.901641809567602, + "learning_rate": 2.2671161258657285e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.015625, + "logps/chosen": -728.0, + "logps/rejected": -1048.0, + "loss": 0.2344, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.59375, + "rewards/margins": 3.296875, + "rewards/rejected": -8.875, + "step": 11190 + }, + { + "epoch": 0.5772452003607782, + "grad_norm": 12.89508329527682, + "learning_rate": 2.2626380500386387e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.90625, + "logps/chosen": -772.0, + "logps/rejected": -1104.0, + "loss": 0.2305, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.875, + "rewards/margins": 3.125, + "rewards/rejected": -9.0, + "step": 11200 + }, + { + "epoch": 0.5777605978611003, + "grad_norm": 7.92266378529117, + "learning_rate": 2.2581607425852737e-07, + "logits/chosen": -3.390625, + "logits/rejected": -2.96875, + "logps/chosen": -776.0, + "logps/rejected": -1120.0, + "loss": 0.2265, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.03125, + "rewards/margins": 3.640625, + "rewards/rejected": -9.6875, + "step": 11210 + }, + { + "epoch": 0.5782759953614225, + "grad_norm": 12.63011418257724, + "learning_rate": 2.2536842179992992e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.09375, + "logps/chosen": -776.0, + "logps/rejected": -1144.0, + "loss": 0.2356, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.09375, + "rewards/margins": 3.796875, + "rewards/rejected": -9.875, + "step": 11220 + }, + { + "epoch": 0.5787913928617446, + "grad_norm": 6.720995991227801, + "learning_rate": 2.2492084907718512e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.890625, + "logps/chosen": -808.0, + "logps/rejected": -1120.0, + "loss": 0.251, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.3125, + "rewards/margins": 3.21875, + "rewards/rejected": -9.5625, + "step": 11230 + }, + { + "epoch": 0.5793067903620668, + "grad_norm": 7.364130502009685, + "learning_rate": 2.2447335753914825e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.90625, + "logps/chosen": -800.0, + "logps/rejected": -1144.0, + "loss": 0.2044, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.1875, + "rewards/margins": 3.625, + "rewards/rejected": -9.8125, + "step": 11240 + }, + { + "epoch": 0.5798221878623888, + "grad_norm": 10.304266502102855, + "learning_rate": 2.2402594863441176e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.84375, + "logps/chosen": -780.0, + "logps/rejected": -1168.0, + "loss": 0.2234, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.828125, + "rewards/rejected": -9.875, + "step": 11250 + }, + { + "epoch": 0.580337585362711, + "grad_norm": 11.16678444151158, + "learning_rate": 2.2357862381130076e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.953125, + "logps/chosen": -752.0, + "logps/rejected": -1088.0, + "loss": 0.2535, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.875, + "rewards/margins": 3.421875, + "rewards/rejected": -9.3125, + "step": 11260 + }, + { + "epoch": 0.5808529828630331, + "grad_norm": 8.136066054285187, + "learning_rate": 2.2313138451786798e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.984375, + "logps/chosen": -788.0, + "logps/rejected": -1136.0, + "loss": 0.223, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.625, + "rewards/rejected": -9.6875, + "step": 11270 + }, + { + "epoch": 0.5813683803633553, + "grad_norm": 8.935253941899658, + "learning_rate": 2.226842322018893e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.21875, + "logps/chosen": -780.0, + "logps/rejected": -1136.0, + "loss": 0.2443, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.96875, + "rewards/margins": 3.65625, + "rewards/rejected": -9.625, + "step": 11280 + }, + { + "epoch": 0.5818837778636774, + "grad_norm": 8.093924534895322, + "learning_rate": 2.2223716831085922e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.875, + "logps/chosen": -772.0, + "logps/rejected": -1168.0, + "loss": 0.2196, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 3.890625, + "rewards/rejected": -9.75, + "step": 11290 + }, + { + "epoch": 0.5823991753639994, + "grad_norm": 6.579928356998686, + "learning_rate": 2.217901942919858e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.796875, + "logps/chosen": -716.0, + "logps/rejected": -1096.0, + "loss": 0.2491, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.5, + "rewards/margins": 3.65625, + "rewards/rejected": -9.125, + "step": 11300 + }, + { + "epoch": 0.5829145728643216, + "grad_norm": 7.155127348457726, + "learning_rate": 2.213433115921864e-07, + "logits/chosen": -3.46875, + "logits/rejected": -3.203125, + "logps/chosen": -784.0, + "logps/rejected": -1088.0, + "loss": 0.2497, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.96875, + "rewards/margins": 3.25, + "rewards/rejected": -9.25, + "step": 11310 + }, + { + "epoch": 0.5834299703646437, + "grad_norm": 8.145715220028022, + "learning_rate": 2.2089652165808245e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.203125, + "logps/chosen": -788.0, + "logps/rejected": -1088.0, + "loss": 0.2399, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.046875, + "rewards/rejected": -9.1875, + "step": 11320 + }, + { + "epoch": 0.5839453678649659, + "grad_norm": 9.039925656608881, + "learning_rate": 2.2044982593599525e-07, + "logits/chosen": -3.1875, + "logits/rejected": -3.0, + "logps/chosen": -780.0, + "logps/rejected": -1088.0, + "loss": 0.2388, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.96875, + "rewards/margins": 3.453125, + "rewards/rejected": -9.4375, + "step": 11330 + }, + { + "epoch": 0.584460765365288, + "grad_norm": 8.150521647736129, + "learning_rate": 2.2000322587194128e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.953125, + "logps/chosen": -832.0, + "logps/rejected": -1160.0, + "loss": 0.2461, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.34375, + "rewards/margins": 3.546875, + "rewards/rejected": -9.875, + "step": 11340 + }, + { + "epoch": 0.58497616286561, + "grad_norm": 7.121557534681278, + "learning_rate": 2.1955672291162708e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.921875, + "logps/chosen": -756.0, + "logps/rejected": -1072.0, + "loss": 0.2402, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.65625, + "rewards/margins": 3.515625, + "rewards/rejected": -9.1875, + "step": 11350 + }, + { + "epoch": 0.5854915603659322, + "grad_norm": 6.805838734921921, + "learning_rate": 2.1911031850044511e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.875, + "logps/chosen": -768.0, + "logps/rejected": -1080.0, + "loss": 0.2432, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.84375, + "rewards/margins": 3.265625, + "rewards/rejected": -9.0625, + "step": 11360 + }, + { + "epoch": 0.5860069578662543, + "grad_norm": 9.580201109369671, + "learning_rate": 2.1866401408346854e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.21875, + "logps/chosen": -760.0, + "logps/rejected": -1072.0, + "loss": 0.2316, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.6875, + "rewards/margins": 3.25, + "rewards/rejected": -8.9375, + "step": 11370 + }, + { + "epoch": 0.5865223553665765, + "grad_norm": 9.233625081070358, + "learning_rate": 2.18217811105447e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.875, + "logps/chosen": -720.0, + "logps/rejected": -1104.0, + "loss": 0.2186, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.40625, + "rewards/margins": 4.0625, + "rewards/rejected": -9.4375, + "step": 11380 + }, + { + "epoch": 0.5870377528668986, + "grad_norm": 7.508180809090628, + "learning_rate": 2.1777171101080182e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.75, + "logps/chosen": -720.0, + "logps/rejected": -1064.0, + "loss": 0.228, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.59375, + "rewards/margins": 3.4375, + "rewards/rejected": -9.0, + "step": 11390 + }, + { + "epoch": 0.5875531503672207, + "grad_norm": 7.4838546230386, + "learning_rate": 2.173257152436212e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.8125, + "logps/chosen": -680.0, + "logps/rejected": -1032.0, + "loss": 0.2339, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.1875, + "rewards/margins": 3.53125, + "rewards/rejected": -8.75, + "step": 11400 + }, + { + "epoch": 0.5880685478675428, + "grad_norm": 6.755796037463306, + "learning_rate": 2.1687982524765565e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.0625, + "logps/chosen": -720.0, + "logps/rejected": -1048.0, + "loss": 0.2389, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.3125, + "rewards/margins": 3.484375, + "rewards/rejected": -8.8125, + "step": 11410 + }, + { + "epoch": 0.588583945367865, + "grad_norm": 9.769014904491698, + "learning_rate": 2.1643404246631306e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.96875, + "logps/chosen": -700.0, + "logps/rejected": -1032.0, + "loss": 0.2481, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.375, + "rewards/margins": 3.46875, + "rewards/rejected": -8.875, + "step": 11420 + }, + { + "epoch": 0.5890993428681871, + "grad_norm": 8.21843229839934, + "learning_rate": 2.159883683426546e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.9375, + "logps/chosen": -776.0, + "logps/rejected": -1096.0, + "loss": 0.2353, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.25, + "rewards/rejected": -9.3125, + "step": 11430 + }, + { + "epoch": 0.5896147403685092, + "grad_norm": 7.659561310367181, + "learning_rate": 2.1554280431938949e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.875, + "logps/chosen": -780.0, + "logps/rejected": -1120.0, + "loss": 0.2458, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 3.4375, + "rewards/rejected": -9.625, + "step": 11440 + }, + { + "epoch": 0.5901301378688313, + "grad_norm": 7.760377809791155, + "learning_rate": 2.1509735183887066e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.984375, + "logps/chosen": -808.0, + "logps/rejected": -1112.0, + "loss": 0.2235, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.1875, + "rewards/rejected": -9.5, + "step": 11450 + }, + { + "epoch": 0.5906455353691534, + "grad_norm": 8.240750875148926, + "learning_rate": 2.1465201234308985e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.140625, + "logps/chosen": -740.0, + "logps/rejected": -1096.0, + "loss": 0.2175, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.5625, + "rewards/margins": 3.71875, + "rewards/rejected": -9.25, + "step": 11460 + }, + { + "epoch": 0.5911609328694756, + "grad_norm": 9.128414688459726, + "learning_rate": 2.14206787273673e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.046875, + "logps/chosen": -800.0, + "logps/rejected": -1112.0, + "loss": 0.2337, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.09375, + "rewards/margins": 3.25, + "rewards/rejected": -9.375, + "step": 11470 + }, + { + "epoch": 0.5916763303697977, + "grad_norm": 6.877762935557417, + "learning_rate": 2.1376167807187586e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.859375, + "logps/chosen": -736.0, + "logps/rejected": -1120.0, + "loss": 0.2299, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.71875, + "rewards/margins": 3.96875, + "rewards/rejected": -9.6875, + "step": 11480 + }, + { + "epoch": 0.5921917278701199, + "grad_norm": 12.091510264761336, + "learning_rate": 2.133166861785789e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.09375, + "logps/chosen": -772.0, + "logps/rejected": -1064.0, + "loss": 0.254, + "rewards/accuracies": 0.84375, + "rewards/chosen": -5.84375, + "rewards/margins": 3.296875, + "rewards/rejected": -9.125, + "step": 11490 + }, + { + "epoch": 0.5927071253704419, + "grad_norm": 6.19894964781254, + "learning_rate": 2.1287181303428296e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.921875, + "logps/chosen": -736.0, + "logps/rejected": -1048.0, + "loss": 0.2567, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.5625, + "rewards/margins": 3.265625, + "rewards/rejected": -8.875, + "step": 11500 + }, + { + "epoch": 0.593222522870764, + "grad_norm": 8.699088767051771, + "learning_rate": 2.1242706007910447e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.875, + "logps/chosen": -724.0, + "logps/rejected": -1032.0, + "loss": 0.2423, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.78125, + "rewards/margins": 3.171875, + "rewards/rejected": -8.9375, + "step": 11510 + }, + { + "epoch": 0.5937379203710862, + "grad_norm": 10.082660809065432, + "learning_rate": 2.1198242875277058e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.09375, + "logps/chosen": -768.0, + "logps/rejected": -1128.0, + "loss": 0.2338, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.5, + "rewards/rejected": -9.5625, + "step": 11520 + }, + { + "epoch": 0.5942533178714083, + "grad_norm": 9.354918617314514, + "learning_rate": 2.1153792049461504e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.125, + "logps/chosen": -804.0, + "logps/rejected": -1152.0, + "loss": 0.2385, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.125, + "rewards/margins": 3.703125, + "rewards/rejected": -9.8125, + "step": 11530 + }, + { + "epoch": 0.5947687153717305, + "grad_norm": 8.152772596475852, + "learning_rate": 2.1109353674357296e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.078125, + "logps/chosen": -812.0, + "logps/rejected": -1144.0, + "loss": 0.2618, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.40625, + "rewards/margins": 3.46875, + "rewards/rejected": -9.875, + "step": 11540 + }, + { + "epoch": 0.5952841128720525, + "grad_norm": 6.592630077228735, + "learning_rate": 2.1064927893817655e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.921875, + "logps/chosen": -752.0, + "logps/rejected": -1128.0, + "loss": 0.2389, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.75, + "rewards/margins": 3.796875, + "rewards/rejected": -9.5625, + "step": 11550 + }, + { + "epoch": 0.5957995103723747, + "grad_norm": 8.84655033593983, + "learning_rate": 2.1020514851655037e-07, + "logits/chosen": -3.46875, + "logits/rejected": -3.03125, + "logps/chosen": -768.0, + "logps/rejected": -1112.0, + "loss": 0.2194, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.96875, + "rewards/margins": 3.46875, + "rewards/rejected": -9.4375, + "step": 11560 + }, + { + "epoch": 0.5963149078726968, + "grad_norm": 10.647938586657759, + "learning_rate": 2.0976114691640632e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.9375, + "logps/chosen": -772.0, + "logps/rejected": -1072.0, + "loss": 0.2369, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.109375, + "rewards/rejected": -9.1875, + "step": 11570 + }, + { + "epoch": 0.596830305373019, + "grad_norm": 10.176529544065172, + "learning_rate": 2.0931727557503948e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.9375, + "logps/chosen": -768.0, + "logps/rejected": -1104.0, + "loss": 0.2266, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.71875, + "rewards/margins": 3.65625, + "rewards/rejected": -9.375, + "step": 11580 + }, + { + "epoch": 0.5973457028733411, + "grad_norm": 9.836716284247533, + "learning_rate": 2.0887353592932342e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.78125, + "logps/chosen": -720.0, + "logps/rejected": -1064.0, + "loss": 0.2373, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.53125, + "rewards/margins": 3.4375, + "rewards/rejected": -9.0, + "step": 11590 + }, + { + "epoch": 0.5978611003736631, + "grad_norm": 6.1999168126208115, + "learning_rate": 2.0842992941570508e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.078125, + "logps/chosen": -716.0, + "logps/rejected": -1048.0, + "loss": 0.2324, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.4375, + "rewards/margins": 3.359375, + "rewards/rejected": -8.8125, + "step": 11600 + }, + { + "epoch": 0.5983764978739853, + "grad_norm": 8.96888945931225, + "learning_rate": 2.0798645747020078e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.984375, + "logps/chosen": -724.0, + "logps/rejected": -1048.0, + "loss": 0.2499, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.4375, + "rewards/margins": 3.3125, + "rewards/rejected": -8.75, + "step": 11610 + }, + { + "epoch": 0.5988918953743074, + "grad_norm": 8.50809551405228, + "learning_rate": 2.0754312152839084e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.875, + "logps/chosen": -752.0, + "logps/rejected": -1080.0, + "loss": 0.2287, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.875, + "rewards/margins": 3.484375, + "rewards/rejected": -9.375, + "step": 11620 + }, + { + "epoch": 0.5994072928746296, + "grad_norm": 8.716297261762378, + "learning_rate": 2.0709992302541557e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.953125, + "logps/chosen": -728.0, + "logps/rejected": -1088.0, + "loss": 0.2191, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.78125, + "rewards/margins": 3.421875, + "rewards/rejected": -9.1875, + "step": 11630 + }, + { + "epoch": 0.5999226903749517, + "grad_norm": 11.982447466458218, + "learning_rate": 2.0665686339597035e-07, + "logits/chosen": -3.5, + "logits/rejected": -3.09375, + "logps/chosen": -776.0, + "logps/rejected": -1088.0, + "loss": 0.2623, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.265625, + "rewards/rejected": -9.3125, + "step": 11640 + }, + { + "epoch": 0.6004380878752738, + "grad_norm": 6.079685697431835, + "learning_rate": 2.0621394407430091e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.171875, + "logps/chosen": -776.0, + "logps/rejected": -1096.0, + "loss": 0.2304, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.0, + "rewards/margins": 3.328125, + "rewards/rejected": -9.3125, + "step": 11650 + }, + { + "epoch": 0.6009534853755959, + "grad_norm": 10.661233689962524, + "learning_rate": 2.05771166494199e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.921875, + "logps/chosen": -760.0, + "logps/rejected": -1056.0, + "loss": 0.2472, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.875, + "rewards/margins": 3.25, + "rewards/rejected": -9.125, + "step": 11660 + }, + { + "epoch": 0.601468882875918, + "grad_norm": 7.479902135518757, + "learning_rate": 2.053285320889972e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.953125, + "logps/chosen": -764.0, + "logps/rejected": -1104.0, + "loss": 0.239, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.8125, + "rewards/margins": 3.3125, + "rewards/rejected": -9.125, + "step": 11670 + }, + { + "epoch": 0.6019842803762402, + "grad_norm": 33.10610610790392, + "learning_rate": 2.0488604229156477e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.125, + "logps/chosen": -764.0, + "logps/rejected": -1048.0, + "loss": 0.2589, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.9375, + "rewards/margins": 3.09375, + "rewards/rejected": -9.0, + "step": 11680 + }, + { + "epoch": 0.6024996778765623, + "grad_norm": 8.781437458945861, + "learning_rate": 2.04443698534303e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.171875, + "logps/chosen": -760.0, + "logps/rejected": -1080.0, + "loss": 0.2647, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.9375, + "rewards/margins": 3.21875, + "rewards/rejected": -9.125, + "step": 11690 + }, + { + "epoch": 0.6030150753768844, + "grad_norm": 7.261257752724377, + "learning_rate": 2.0400150224914026e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.78125, + "logps/chosen": -756.0, + "logps/rejected": -1080.0, + "loss": 0.2285, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -5.84375, + "rewards/margins": 3.4375, + "rewards/rejected": -9.25, + "step": 11700 + }, + { + "epoch": 0.6035304728772065, + "grad_norm": 8.20340577464145, + "learning_rate": 2.0355945486752763e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.9375, + "logps/chosen": -760.0, + "logps/rejected": -1056.0, + "loss": 0.2313, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.96875, + "rewards/margins": 3.171875, + "rewards/rejected": -9.125, + "step": 11710 + }, + { + "epoch": 0.6040458703775287, + "grad_norm": 7.252788486335195, + "learning_rate": 2.0311755782043393e-07, + "logits/chosen": -3.46875, + "logits/rejected": -3.25, + "logps/chosen": -756.0, + "logps/rejected": -1048.0, + "loss": 0.2318, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.90625, + "rewards/margins": 3.03125, + "rewards/rejected": -8.9375, + "step": 11720 + }, + { + "epoch": 0.6045612678778508, + "grad_norm": 9.939275583908438, + "learning_rate": 2.026758125383417e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.109375, + "logps/chosen": -740.0, + "logps/rejected": -1072.0, + "loss": 0.2401, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.75, + "rewards/margins": 3.390625, + "rewards/rejected": -9.1875, + "step": 11730 + }, + { + "epoch": 0.605076665378173, + "grad_norm": 10.342698998427435, + "learning_rate": 2.0223422045124186e-07, + "logits/chosen": -3.578125, + "logits/rejected": -3.234375, + "logps/chosen": -784.0, + "logps/rejected": -1096.0, + "loss": 0.234, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0, + "rewards/margins": 3.296875, + "rewards/rejected": -9.3125, + "step": 11740 + }, + { + "epoch": 0.605592062878495, + "grad_norm": 8.693209787895176, + "learning_rate": 2.0179278298862966e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.1875, + "logps/chosen": -788.0, + "logps/rejected": -1112.0, + "loss": 0.2724, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.28125, + "rewards/margins": 3.390625, + "rewards/rejected": -9.6875, + "step": 11750 + }, + { + "epoch": 0.6061074603788171, + "grad_norm": 8.63459768118437, + "learning_rate": 2.0135150157949977e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.0625, + "logps/chosen": -780.0, + "logps/rejected": -1120.0, + "loss": 0.2486, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.1875, + "rewards/margins": 3.390625, + "rewards/rejected": -9.5625, + "step": 11760 + }, + { + "epoch": 0.6066228578791393, + "grad_norm": 6.473687146986929, + "learning_rate": 2.0091037765234143e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.140625, + "logps/chosen": -784.0, + "logps/rejected": -1120.0, + "loss": 0.247, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.359375, + "rewards/rejected": -9.5, + "step": 11770 + }, + { + "epoch": 0.6071382553794614, + "grad_norm": 9.659731743539854, + "learning_rate": 2.0046941263513445e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.9375, + "logps/chosen": -756.0, + "logps/rejected": -1072.0, + "loss": 0.2362, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.875, + "rewards/margins": 3.375, + "rewards/rejected": -9.25, + "step": 11780 + }, + { + "epoch": 0.6076536528797836, + "grad_norm": 7.34226201091204, + "learning_rate": 2.0002860795534392e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.265625, + "logps/chosen": -796.0, + "logps/rejected": -1072.0, + "loss": 0.2497, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.0625, + "rewards/margins": 2.984375, + "rewards/rejected": -9.0625, + "step": 11790 + }, + { + "epoch": 0.6081690503801056, + "grad_norm": 8.040760326624586, + "learning_rate": 1.9958796503991627e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.921875, + "logps/chosen": -764.0, + "logps/rejected": -1112.0, + "loss": 0.2538, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.875, + "rewards/margins": 3.5, + "rewards/rejected": -9.375, + "step": 11800 + }, + { + "epoch": 0.6086844478804277, + "grad_norm": 13.472256744272329, + "learning_rate": 1.991474853152738e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.859375, + "logps/chosen": -752.0, + "logps/rejected": -1096.0, + "loss": 0.2442, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.875, + "rewards/margins": 3.640625, + "rewards/rejected": -9.5, + "step": 11810 + }, + { + "epoch": 0.6091998453807499, + "grad_norm": 7.159425633168075, + "learning_rate": 1.9870717020731104e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.1875, + "logps/chosen": -792.0, + "logps/rejected": -1128.0, + "loss": 0.2241, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.09375, + "rewards/margins": 3.5625, + "rewards/rejected": -9.625, + "step": 11820 + }, + { + "epoch": 0.609715242881072, + "grad_norm": 7.875611011878736, + "learning_rate": 1.9826702114138918e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.90625, + "logps/chosen": -756.0, + "logps/rejected": -1104.0, + "loss": 0.2204, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.0, + "rewards/margins": 3.21875, + "rewards/rejected": -9.1875, + "step": 11830 + }, + { + "epoch": 0.6102306403813942, + "grad_norm": 9.767501641418392, + "learning_rate": 1.9782703954233217e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.09375, + "logps/chosen": -760.0, + "logps/rejected": -1072.0, + "loss": 0.2626, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0, + "rewards/margins": 3.296875, + "rewards/rejected": -9.3125, + "step": 11840 + }, + { + "epoch": 0.6107460378817162, + "grad_norm": 6.924910561623684, + "learning_rate": 1.973872268344217e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.1875, + "logps/chosen": -760.0, + "logps/rejected": -1080.0, + "loss": 0.2212, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.625, + "rewards/margins": 3.328125, + "rewards/rejected": -8.9375, + "step": 11850 + }, + { + "epoch": 0.6112614353820384, + "grad_norm": 9.194375609559987, + "learning_rate": 1.9694758444139305e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.078125, + "logps/chosen": -768.0, + "logps/rejected": -1096.0, + "loss": 0.2203, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.6875, + "rewards/margins": 3.59375, + "rewards/rejected": -9.25, + "step": 11860 + }, + { + "epoch": 0.6117768328823605, + "grad_norm": 11.50365530362939, + "learning_rate": 1.9650811378642985e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.109375, + "logps/chosen": -748.0, + "logps/rejected": -1096.0, + "loss": 0.2144, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.8125, + "rewards/margins": 3.65625, + "rewards/rejected": -9.4375, + "step": 11870 + }, + { + "epoch": 0.6122922303826827, + "grad_norm": 8.714576895443498, + "learning_rate": 1.960688162921597e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.890625, + "logps/chosen": -768.0, + "logps/rejected": -1088.0, + "loss": 0.2275, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.375, + "rewards/rejected": -9.375, + "step": 11880 + }, + { + "epoch": 0.6128076278830048, + "grad_norm": 9.69110181892399, + "learning_rate": 1.9562969338065e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.828125, + "logps/chosen": -756.0, + "logps/rejected": -1104.0, + "loss": 0.2074, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 3.546875, + "rewards/rejected": -9.375, + "step": 11890 + }, + { + "epoch": 0.6133230253833268, + "grad_norm": 9.731634810826836, + "learning_rate": 1.9519074647340277e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.875, + "logps/chosen": -728.0, + "logps/rejected": -1080.0, + "loss": 0.2326, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.75, + "rewards/margins": 3.625, + "rewards/rejected": -9.375, + "step": 11900 + }, + { + "epoch": 0.613838422883649, + "grad_norm": 10.264751121671347, + "learning_rate": 1.9475197699135044e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.0, + "logps/chosen": -784.0, + "logps/rejected": -1136.0, + "loss": 0.2409, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.9375, + "rewards/margins": 3.6875, + "rewards/rejected": -9.625, + "step": 11910 + }, + { + "epoch": 0.6143538203839711, + "grad_norm": 9.521254923403186, + "learning_rate": 1.9431338635485095e-07, + "logits/chosen": -3.171875, + "logits/rejected": -3.015625, + "logps/chosen": -760.0, + "logps/rejected": -1056.0, + "loss": 0.2444, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.0, + "rewards/margins": 3.078125, + "rewards/rejected": -9.0625, + "step": 11920 + }, + { + "epoch": 0.6148692178842933, + "grad_norm": 10.006593009120792, + "learning_rate": 1.9387497598368318e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.046875, + "logps/chosen": -756.0, + "logps/rejected": -1056.0, + "loss": 0.2572, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.8125, + "rewards/margins": 3.1875, + "rewards/rejected": -9.0, + "step": 11930 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 10.651628228999224, + "learning_rate": 1.9343674729704286e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.1875, + "logps/chosen": -728.0, + "logps/rejected": -1072.0, + "loss": 0.2318, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.59375, + "rewards/margins": 3.390625, + "rewards/rejected": -9.0, + "step": 11940 + }, + { + "epoch": 0.6159000128849375, + "grad_norm": 6.626820964024169, + "learning_rate": 1.9299870171353716e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.203125, + "logps/chosen": -780.0, + "logps/rejected": -1144.0, + "loss": 0.2186, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.96875, + "rewards/margins": 3.484375, + "rewards/rejected": -9.4375, + "step": 11950 + }, + { + "epoch": 0.6164154103852596, + "grad_norm": 9.033488581021947, + "learning_rate": 1.9256084065118083e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.96875, + "logps/chosen": -744.0, + "logps/rejected": -1144.0, + "loss": 0.2503, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.75, + "rewards/margins": 3.9375, + "rewards/rejected": -9.6875, + "step": 11960 + }, + { + "epoch": 0.6169308078855817, + "grad_norm": 8.357251469939719, + "learning_rate": 1.921231655273912e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.140625, + "logps/chosen": -776.0, + "logps/rejected": -1096.0, + "loss": 0.2454, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.328125, + "rewards/rejected": -9.5, + "step": 11970 + }, + { + "epoch": 0.6174462053859039, + "grad_norm": 6.8926157725812125, + "learning_rate": 1.9168567775898348e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.03125, + "logps/chosen": -784.0, + "logps/rejected": -1120.0, + "loss": 0.2258, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.15625, + "rewards/margins": 3.28125, + "rewards/rejected": -9.4375, + "step": 11980 + }, + { + "epoch": 0.617961602886226, + "grad_norm": 9.612338285274198, + "learning_rate": 1.9124837876216672e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.09375, + "logps/chosen": -764.0, + "logps/rejected": -1096.0, + "loss": 0.21, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.8125, + "rewards/margins": 3.375, + "rewards/rejected": -9.1875, + "step": 11990 + }, + { + "epoch": 0.6184770003865481, + "grad_norm": 10.346313790093413, + "learning_rate": 1.908112699525386e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.1875, + "logps/chosen": -768.0, + "logps/rejected": -1080.0, + "loss": 0.237, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.9375, + "rewards/margins": 3.21875, + "rewards/rejected": -9.125, + "step": 12000 + }, + { + "epoch": 0.6189923978868702, + "grad_norm": 6.906386113403898, + "learning_rate": 1.903743527450814e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.109375, + "logps/chosen": -780.0, + "logps/rejected": -1152.0, + "loss": 0.2252, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.734375, + "rewards/rejected": -9.8125, + "step": 12010 + }, + { + "epoch": 0.6195077953871924, + "grad_norm": 7.514977608024949, + "learning_rate": 1.8993762855415713e-07, + "logits/chosen": -3.65625, + "logits/rejected": -3.265625, + "logps/chosen": -764.0, + "logps/rejected": -1136.0, + "loss": 0.2373, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.90625, + "rewards/margins": 3.6875, + "rewards/rejected": -9.5625, + "step": 12020 + }, + { + "epoch": 0.6200231928875145, + "grad_norm": 7.444964926376508, + "learning_rate": 1.8950109879350267e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.015625, + "logps/chosen": -748.0, + "logps/rejected": -1048.0, + "loss": 0.2429, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.59375, + "rewards/margins": 3.234375, + "rewards/rejected": -8.8125, + "step": 12030 + }, + { + "epoch": 0.6205385903878367, + "grad_norm": 6.873290360642659, + "learning_rate": 1.8906476487622578e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.15625, + "logps/chosen": -776.0, + "logps/rejected": -1080.0, + "loss": 0.2214, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.125, + "rewards/margins": 3.125, + "rewards/rejected": -9.25, + "step": 12040 + }, + { + "epoch": 0.6210539878881587, + "grad_norm": 9.45206959242052, + "learning_rate": 1.886286282148002e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.109375, + "logps/chosen": -752.0, + "logps/rejected": -1024.0, + "loss": 0.2361, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.84375, + "rewards/margins": 3.015625, + "rewards/rejected": -8.875, + "step": 12050 + }, + { + "epoch": 0.6215693853884808, + "grad_norm": 8.45776840762164, + "learning_rate": 1.881926902210611e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.984375, + "logps/chosen": -728.0, + "logps/rejected": -1048.0, + "loss": 0.228, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.5625, + "rewards/margins": 3.203125, + "rewards/rejected": -8.75, + "step": 12060 + }, + { + "epoch": 0.622084782888803, + "grad_norm": 6.527989827474123, + "learning_rate": 1.8775695230620063e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.0, + "logps/chosen": -768.0, + "logps/rejected": -1072.0, + "loss": 0.2306, + "rewards/accuracies": 0.8187500238418579, + "rewards/chosen": -5.96875, + "rewards/margins": 3.09375, + "rewards/rejected": -9.0625, + "step": 12070 + }, + { + "epoch": 0.6226001803891251, + "grad_norm": 7.020736618714024, + "learning_rate": 1.8732141588076303e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.03125, + "logps/chosen": -772.0, + "logps/rejected": -1096.0, + "loss": 0.2444, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.9375, + "rewards/margins": 3.421875, + "rewards/rejected": -9.375, + "step": 12080 + }, + { + "epoch": 0.6231155778894473, + "grad_norm": 6.852300280350611, + "learning_rate": 1.8688608235464042e-07, + "logits/chosen": -3.078125, + "logits/rejected": -3.03125, + "logps/chosen": -760.0, + "logps/rejected": -1096.0, + "loss": 0.2071, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.71875, + "rewards/margins": 3.453125, + "rewards/rejected": -9.1875, + "step": 12090 + }, + { + "epoch": 0.6236309753897693, + "grad_norm": 7.048647709595883, + "learning_rate": 1.8645095313706827e-07, + "logits/chosen": -3.09375, + "logits/rejected": -2.96875, + "logps/chosen": -768.0, + "logps/rejected": -1080.0, + "loss": 0.252, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.90625, + "rewards/margins": 3.265625, + "rewards/rejected": -9.125, + "step": 12100 + }, + { + "epoch": 0.6241463728900914, + "grad_norm": 7.456730599550208, + "learning_rate": 1.8601602963662034e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.765625, + "logps/chosen": -760.0, + "logps/rejected": -1104.0, + "loss": 0.2141, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.8125, + "rewards/margins": 3.578125, + "rewards/rejected": -9.375, + "step": 12110 + }, + { + "epoch": 0.6246617703904136, + "grad_norm": 7.730980717695669, + "learning_rate": 1.8558131326120484e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.96875, + "logps/chosen": -776.0, + "logps/rejected": -1104.0, + "loss": 0.2486, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.125, + "rewards/margins": 3.4375, + "rewards/rejected": -9.5625, + "step": 12120 + }, + { + "epoch": 0.6251771678907357, + "grad_norm": 4.8893969637526205, + "learning_rate": 1.851468054180591e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.859375, + "logps/chosen": -756.0, + "logps/rejected": -1072.0, + "loss": 0.2271, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.78125, + "rewards/margins": 3.296875, + "rewards/rejected": -9.0625, + "step": 12130 + }, + { + "epoch": 0.6256925653910579, + "grad_norm": 6.9642509403387844, + "learning_rate": 1.8471250751374563e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.0625, + "logps/chosen": -748.0, + "logps/rejected": -1096.0, + "loss": 0.2422, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.6875, + "rewards/margins": 3.40625, + "rewards/rejected": -9.125, + "step": 12140 + }, + { + "epoch": 0.62620796289138, + "grad_norm": 8.484841291430905, + "learning_rate": 1.8427842095414735e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.96875, + "logps/chosen": -776.0, + "logps/rejected": -1088.0, + "loss": 0.2358, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.875, + "rewards/margins": 3.4375, + "rewards/rejected": -9.3125, + "step": 12150 + }, + { + "epoch": 0.6267233603917021, + "grad_norm": 10.151453023531351, + "learning_rate": 1.838445471444629e-07, + "logits/chosen": -3.171875, + "logits/rejected": -3.0, + "logps/chosen": -740.0, + "logps/rejected": -1080.0, + "loss": 0.2384, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.65625, + "rewards/margins": 3.71875, + "rewards/rejected": -9.375, + "step": 12160 + }, + { + "epoch": 0.6272387578920242, + "grad_norm": 7.3934773514356245, + "learning_rate": 1.834108874892024e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.859375, + "logps/chosen": -768.0, + "logps/rejected": -1096.0, + "loss": 0.2336, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.25, + "rewards/rejected": -9.3125, + "step": 12170 + }, + { + "epoch": 0.6277541553923464, + "grad_norm": 8.721065194108665, + "learning_rate": 1.8297744339218245e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.09375, + "logps/chosen": -776.0, + "logps/rejected": -1080.0, + "loss": 0.2445, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.90625, + "rewards/margins": 3.28125, + "rewards/rejected": -9.1875, + "step": 12180 + }, + { + "epoch": 0.6282695528926685, + "grad_norm": 5.790843136417454, + "learning_rate": 1.8254421625652217e-07, + "logits/chosen": -3.1875, + "logits/rejected": -3.0, + "logps/chosen": -768.0, + "logps/rejected": -1112.0, + "loss": 0.2161, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.84375, + "rewards/margins": 3.625, + "rewards/rejected": -9.5, + "step": 12190 + }, + { + "epoch": 0.6287849503929906, + "grad_norm": 6.991830607301393, + "learning_rate": 1.821112074846381e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.046875, + "logps/chosen": -768.0, + "logps/rejected": -1096.0, + "loss": 0.2266, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.875, + "rewards/margins": 3.53125, + "rewards/rejected": -9.375, + "step": 12200 + }, + { + "epoch": 0.6293003478933127, + "grad_norm": 8.317289248843267, + "learning_rate": 1.8167841847824007e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.96875, + "logps/chosen": -728.0, + "logps/rejected": -1088.0, + "loss": 0.2245, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.6875, + "rewards/margins": 3.65625, + "rewards/rejected": -9.3125, + "step": 12210 + }, + { + "epoch": 0.6298157453936348, + "grad_norm": 7.139526992199338, + "learning_rate": 1.8124585063832643e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.984375, + "logps/chosen": -796.0, + "logps/rejected": -1112.0, + "loss": 0.2409, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.34375, + "rewards/rejected": -9.5, + "step": 12220 + }, + { + "epoch": 0.630331142893957, + "grad_norm": 7.3833365871204295, + "learning_rate": 1.8081350536517946e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.84375, + "logps/chosen": -748.0, + "logps/rejected": -1120.0, + "loss": 0.2175, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.78125, + "rewards/margins": 3.90625, + "rewards/rejected": -9.6875, + "step": 12230 + }, + { + "epoch": 0.6308465403942791, + "grad_norm": 8.162022673888849, + "learning_rate": 1.8038138405836123e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.984375, + "logps/chosen": -764.0, + "logps/rejected": -1112.0, + "loss": 0.2422, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.90625, + "rewards/margins": 3.484375, + "rewards/rejected": -9.375, + "step": 12240 + }, + { + "epoch": 0.6313619378946013, + "grad_norm": 9.108699009555393, + "learning_rate": 1.799494881167085e-07, + "logits/chosen": -3.15625, + "logits/rejected": -3.046875, + "logps/chosen": -764.0, + "logps/rejected": -1056.0, + "loss": 0.2463, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.875, + "rewards/margins": 3.15625, + "rewards/rejected": -9.0625, + "step": 12250 + }, + { + "epoch": 0.6318773353949233, + "grad_norm": 9.194923066007812, + "learning_rate": 1.7951781893832887e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.875, + "logps/chosen": -756.0, + "logps/rejected": -1104.0, + "loss": 0.2166, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.875, + "rewards/margins": 3.296875, + "rewards/rejected": -9.1875, + "step": 12260 + }, + { + "epoch": 0.6323927328952454, + "grad_norm": 8.558301404722169, + "learning_rate": 1.7908637792059555e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.84375, + "logps/chosen": -752.0, + "logps/rejected": -1120.0, + "loss": 0.2183, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.65625, + "rewards/margins": 3.71875, + "rewards/rejected": -9.375, + "step": 12270 + }, + { + "epoch": 0.6329081303955676, + "grad_norm": 9.48030800947113, + "learning_rate": 1.786551664601431e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.9375, + "logps/chosen": -748.0, + "logps/rejected": -1056.0, + "loss": 0.235, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.6875, + "rewards/margins": 3.328125, + "rewards/rejected": -9.0, + "step": 12280 + }, + { + "epoch": 0.6334235278958897, + "grad_norm": 6.646441710035768, + "learning_rate": 1.7822418595286337e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.171875, + "logps/chosen": -776.0, + "logps/rejected": -1096.0, + "loss": 0.2128, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.96875, + "rewards/margins": 3.40625, + "rewards/rejected": -9.375, + "step": 12290 + }, + { + "epoch": 0.6339389253962119, + "grad_norm": 6.111173421691858, + "learning_rate": 1.7779343779390022e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.015625, + "logps/chosen": -756.0, + "logps/rejected": -1080.0, + "loss": 0.2348, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.75, + "rewards/margins": 3.5625, + "rewards/rejected": -9.3125, + "step": 12300 + }, + { + "epoch": 0.6344543228965339, + "grad_norm": 9.501246335724023, + "learning_rate": 1.7736292337764547e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.0625, + "logps/chosen": -800.0, + "logps/rejected": -1104.0, + "loss": 0.2385, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.234375, + "rewards/rejected": -9.375, + "step": 12310 + }, + { + "epoch": 0.6349697203968561, + "grad_norm": 6.07676093008413, + "learning_rate": 1.7693264409773455e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.015625, + "logps/chosen": -752.0, + "logps/rejected": -1112.0, + "loss": 0.2288, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.6875, + "rewards/margins": 3.59375, + "rewards/rejected": -9.25, + "step": 12320 + }, + { + "epoch": 0.6354851178971782, + "grad_norm": 9.319755199834496, + "learning_rate": 1.7650260134704108e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.953125, + "logps/chosen": -748.0, + "logps/rejected": -1096.0, + "loss": 0.2527, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.75, + "rewards/margins": 3.46875, + "rewards/rejected": -9.25, + "step": 12330 + }, + { + "epoch": 0.6360005153975004, + "grad_norm": 11.614838675863348, + "learning_rate": 1.7607279651767365e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.75, + "logps/chosen": -756.0, + "logps/rejected": -1128.0, + "loss": 0.2392, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.9375, + "rewards/margins": 3.734375, + "rewards/rejected": -9.625, + "step": 12340 + }, + { + "epoch": 0.6365159128978225, + "grad_norm": 11.551437309922878, + "learning_rate": 1.756432310009704e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.15625, + "logps/chosen": -764.0, + "logps/rejected": -1088.0, + "loss": 0.2262, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.84375, + "rewards/margins": 3.4375, + "rewards/rejected": -9.3125, + "step": 12350 + }, + { + "epoch": 0.6370313103981445, + "grad_norm": 9.089079135028491, + "learning_rate": 1.7521390618749469e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.96875, + "logps/chosen": -752.0, + "logps/rejected": -1072.0, + "loss": 0.2515, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.0, + "rewards/margins": 3.296875, + "rewards/rejected": -9.3125, + "step": 12360 + }, + { + "epoch": 0.6375467078984667, + "grad_norm": 9.31086361832489, + "learning_rate": 1.7478482346703097e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.84375, + "logps/chosen": -784.0, + "logps/rejected": -1120.0, + "loss": 0.2207, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.453125, + "rewards/rejected": -9.5, + "step": 12370 + }, + { + "epoch": 0.6380621053987888, + "grad_norm": 8.182811095559321, + "learning_rate": 1.7435598422857967e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.21875, + "logps/chosen": -780.0, + "logps/rejected": -1064.0, + "loss": 0.2427, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.03125, + "rewards/margins": 3.015625, + "rewards/rejected": -9.0, + "step": 12380 + }, + { + "epoch": 0.638577502899111, + "grad_norm": 6.707092165783837, + "learning_rate": 1.739273898603532e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.03125, + "logps/chosen": -760.0, + "logps/rejected": -1136.0, + "loss": 0.2345, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.75, + "rewards/margins": 3.6875, + "rewards/rejected": -9.4375, + "step": 12390 + }, + { + "epoch": 0.6390929003994331, + "grad_norm": 7.9929585483103835, + "learning_rate": 1.7349904174977136e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.140625, + "logps/chosen": -760.0, + "logps/rejected": -1072.0, + "loss": 0.2365, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.78125, + "rewards/margins": 3.234375, + "rewards/rejected": -9.0, + "step": 12400 + }, + { + "epoch": 0.6396082978997552, + "grad_norm": 9.152245841006645, + "learning_rate": 1.7307094128345664e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.15625, + "logps/chosen": -780.0, + "logps/rejected": -1072.0, + "loss": 0.25, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.0, + "rewards/margins": 3.171875, + "rewards/rejected": -9.1875, + "step": 12410 + }, + { + "epoch": 0.6401236954000773, + "grad_norm": 9.744549906967636, + "learning_rate": 1.7264308984723e-07, + "logits/chosen": -3.140625, + "logits/rejected": -2.859375, + "logps/chosen": -736.0, + "logps/rejected": -1040.0, + "loss": 0.2366, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.625, + "rewards/margins": 3.140625, + "rewards/rejected": -8.8125, + "step": 12420 + }, + { + "epoch": 0.6406390929003994, + "grad_norm": 6.473131126112834, + "learning_rate": 1.722154888261062e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.921875, + "logps/chosen": -712.0, + "logps/rejected": -1032.0, + "loss": 0.2426, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.5, + "rewards/margins": 3.25, + "rewards/rejected": -8.75, + "step": 12430 + }, + { + "epoch": 0.6411544904007216, + "grad_norm": 7.870736723594007, + "learning_rate": 1.717881396042892e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.0625, + "logps/chosen": -780.0, + "logps/rejected": -1096.0, + "loss": 0.2456, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.125, + "rewards/rejected": -9.1875, + "step": 12440 + }, + { + "epoch": 0.6416698879010437, + "grad_norm": 6.714837905897296, + "learning_rate": 1.7136104356516817e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.03125, + "logps/chosen": -744.0, + "logps/rejected": -1120.0, + "loss": 0.2369, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.84375, + "rewards/margins": 3.796875, + "rewards/rejected": -9.625, + "step": 12450 + }, + { + "epoch": 0.6421852854013658, + "grad_norm": 8.44964552627194, + "learning_rate": 1.709342020913124e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.0625, + "logps/chosen": -768.0, + "logps/rejected": -1088.0, + "loss": 0.2514, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.9375, + "rewards/margins": 3.21875, + "rewards/rejected": -9.1875, + "step": 12460 + }, + { + "epoch": 0.6427006829016879, + "grad_norm": 6.321706936332154, + "learning_rate": 1.7050761656446733e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.8125, + "logps/chosen": -748.0, + "logps/rejected": -1088.0, + "loss": 0.2403, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.71875, + "rewards/margins": 3.421875, + "rewards/rejected": -9.125, + "step": 12470 + }, + { + "epoch": 0.6432160804020101, + "grad_norm": 5.695185867063922, + "learning_rate": 1.700812883655499e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.265625, + "logps/chosen": -748.0, + "logps/rejected": -1080.0, + "loss": 0.2103, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.6875, + "rewards/margins": 3.359375, + "rewards/rejected": -9.0625, + "step": 12480 + }, + { + "epoch": 0.6437314779023322, + "grad_norm": 8.90419933393203, + "learning_rate": 1.6965521887464372e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.109375, + "logps/chosen": -748.0, + "logps/rejected": -1072.0, + "loss": 0.2258, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.78125, + "rewards/margins": 3.328125, + "rewards/rejected": -9.125, + "step": 12490 + }, + { + "epoch": 0.6442468754026544, + "grad_norm": 10.870430785756824, + "learning_rate": 1.6922940947099517e-07, + "logits/chosen": -3.375, + "logits/rejected": -2.890625, + "logps/chosen": -772.0, + "logps/rejected": -1120.0, + "loss": 0.2364, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.09375, + "rewards/margins": 3.625, + "rewards/rejected": -9.6875, + "step": 12500 + }, + { + "epoch": 0.6447622729029764, + "grad_norm": 8.576107060831212, + "learning_rate": 1.6880386153300878e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.015625, + "logps/chosen": -720.0, + "logps/rejected": -1120.0, + "loss": 0.2319, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.46875, + "rewards/margins": 4.15625, + "rewards/rejected": -9.625, + "step": 12510 + }, + { + "epoch": 0.6452776704032985, + "grad_norm": 10.80497662498453, + "learning_rate": 1.6837857643824238e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.140625, + "logps/chosen": -752.0, + "logps/rejected": -1088.0, + "loss": 0.2411, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.875, + "rewards/margins": 3.375, + "rewards/rejected": -9.25, + "step": 12520 + }, + { + "epoch": 0.6457930679036207, + "grad_norm": 7.155537051158133, + "learning_rate": 1.6795355556340333e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.046875, + "logps/chosen": -800.0, + "logps/rejected": -1120.0, + "loss": 0.2307, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.125, + "rewards/margins": 3.328125, + "rewards/rejected": -9.4375, + "step": 12530 + }, + { + "epoch": 0.6463084654039428, + "grad_norm": 8.226260079551796, + "learning_rate": 1.6752880028434328e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.0, + "logps/chosen": -760.0, + "logps/rejected": -1112.0, + "loss": 0.2447, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.0, + "rewards/margins": 3.6875, + "rewards/rejected": -9.6875, + "step": 12540 + }, + { + "epoch": 0.646823862904265, + "grad_norm": 8.362689371063025, + "learning_rate": 1.6710431197605423e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.734375, + "logps/chosen": -772.0, + "logps/rejected": -1120.0, + "loss": 0.2467, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.0, + "rewards/margins": 3.640625, + "rewards/rejected": -9.6875, + "step": 12550 + }, + { + "epoch": 0.647339260404587, + "grad_norm": 6.15139714194258, + "learning_rate": 1.6668009201266423e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.921875, + "logps/chosen": -752.0, + "logps/rejected": -1064.0, + "loss": 0.2124, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.875, + "rewards/margins": 3.265625, + "rewards/rejected": -9.125, + "step": 12560 + }, + { + "epoch": 0.6478546579049091, + "grad_norm": 5.604460110147497, + "learning_rate": 1.6625614176743224e-07, + "logits/chosen": -3.40625, + "logits/rejected": -2.9375, + "logps/chosen": -748.0, + "logps/rejected": -1112.0, + "loss": 0.2354, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.875, + "rewards/margins": 3.8125, + "rewards/rejected": -9.6875, + "step": 12570 + }, + { + "epoch": 0.6483700554052313, + "grad_norm": 9.395359525673973, + "learning_rate": 1.6583246261274454e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.03125, + "logps/chosen": -776.0, + "logps/rejected": -1096.0, + "loss": 0.2591, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.9375, + "rewards/margins": 3.375, + "rewards/rejected": -9.3125, + "step": 12580 + }, + { + "epoch": 0.6488854529055534, + "grad_norm": 8.668776249438132, + "learning_rate": 1.654090559201094e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.8125, + "logps/chosen": -748.0, + "logps/rejected": -1112.0, + "loss": 0.2281, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.71875, + "rewards/margins": 3.734375, + "rewards/rejected": -9.5, + "step": 12590 + }, + { + "epoch": 0.6494008504058756, + "grad_norm": 8.119792873880115, + "learning_rate": 1.6498592306015332e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.15625, + "logps/chosen": -764.0, + "logps/rejected": -1056.0, + "loss": 0.261, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.9375, + "rewards/margins": 2.96875, + "rewards/rejected": -8.875, + "step": 12600 + }, + { + "epoch": 0.6499162479061976, + "grad_norm": 5.758446398142689, + "learning_rate": 1.645630654026165e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.96875, + "logps/chosen": -768.0, + "logps/rejected": -1104.0, + "loss": 0.2323, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.96875, + "rewards/margins": 3.390625, + "rewards/rejected": -9.375, + "step": 12610 + }, + { + "epoch": 0.6504316454065198, + "grad_norm": 10.984618562835472, + "learning_rate": 1.6414048431634803e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.09375, + "logps/chosen": -808.0, + "logps/rejected": -1112.0, + "loss": 0.2391, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 3.15625, + "rewards/rejected": -9.375, + "step": 12620 + }, + { + "epoch": 0.6509470429068419, + "grad_norm": 6.610035437378527, + "learning_rate": 1.637181811693019e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.0, + "logps/chosen": -784.0, + "logps/rejected": -1120.0, + "loss": 0.2288, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.03125, + "rewards/margins": 3.453125, + "rewards/rejected": -9.5, + "step": 12630 + }, + { + "epoch": 0.6514624404071641, + "grad_norm": 7.23098117762501, + "learning_rate": 1.6329615732853213e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.0625, + "logps/chosen": -792.0, + "logps/rejected": -1104.0, + "loss": 0.2234, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.3125, + "rewards/rejected": -9.375, + "step": 12640 + }, + { + "epoch": 0.6519778379074862, + "grad_norm": 8.71613279640881, + "learning_rate": 1.6287441416018883e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.09375, + "logps/chosen": -760.0, + "logps/rejected": -1120.0, + "loss": 0.249, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 3.65625, + "rewards/rejected": -9.5, + "step": 12650 + }, + { + "epoch": 0.6524932354078082, + "grad_norm": 7.597383097796411, + "learning_rate": 1.6245295302951335e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.921875, + "logps/chosen": -804.0, + "logps/rejected": -1120.0, + "loss": 0.2222, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.0625, + "rewards/margins": 3.46875, + "rewards/rejected": -9.5, + "step": 12660 + }, + { + "epoch": 0.6530086329081304, + "grad_norm": 8.3065619617507, + "learning_rate": 1.6203177530083422e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.921875, + "logps/chosen": -760.0, + "logps/rejected": -1104.0, + "loss": 0.2463, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.96875, + "rewards/margins": 3.5, + "rewards/rejected": -9.4375, + "step": 12670 + }, + { + "epoch": 0.6535240304084525, + "grad_norm": 9.790200942667955, + "learning_rate": 1.6161088233756248e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.921875, + "logps/chosen": -784.0, + "logps/rejected": -1080.0, + "loss": 0.2412, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.125, + "rewards/margins": 3.078125, + "rewards/rejected": -9.1875, + "step": 12680 + }, + { + "epoch": 0.6540394279087747, + "grad_norm": 9.25892786860473, + "learning_rate": 1.611902755021872e-07, + "logits/chosen": -3.515625, + "logits/rejected": -3.09375, + "logps/chosen": -760.0, + "logps/rejected": -1112.0, + "loss": 0.2413, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.65625, + "rewards/margins": 3.890625, + "rewards/rejected": -9.5625, + "step": 12690 + }, + { + "epoch": 0.6545548254090968, + "grad_norm": 8.967293015545994, + "learning_rate": 1.6076995615627144e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.015625, + "logps/chosen": -784.0, + "logps/rejected": -1120.0, + "loss": 0.2082, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.0625, + "rewards/margins": 3.46875, + "rewards/rejected": -9.5, + "step": 12700 + }, + { + "epoch": 0.6550702229094189, + "grad_norm": 9.40984535815492, + "learning_rate": 1.6034992566044747e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.9375, + "logps/chosen": -768.0, + "logps/rejected": -1120.0, + "loss": 0.2238, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.78125, + "rewards/margins": 3.796875, + "rewards/rejected": -9.5625, + "step": 12710 + }, + { + "epoch": 0.655585620409741, + "grad_norm": 9.03458712470485, + "learning_rate": 1.5993018537441266e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.0625, + "logps/chosen": -776.0, + "logps/rejected": -1112.0, + "loss": 0.2319, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.875, + "rewards/margins": 3.46875, + "rewards/rejected": -9.375, + "step": 12720 + }, + { + "epoch": 0.6561010179100631, + "grad_norm": 10.842383815761101, + "learning_rate": 1.595107366569248e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.046875, + "logps/chosen": -784.0, + "logps/rejected": -1128.0, + "loss": 0.2392, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.90625, + "rewards/margins": 3.484375, + "rewards/rejected": -9.375, + "step": 12730 + }, + { + "epoch": 0.6566164154103853, + "grad_norm": 6.679342324356698, + "learning_rate": 1.590915808657977e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.84375, + "logps/chosen": -752.0, + "logps/rejected": -1096.0, + "loss": 0.2364, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.96875, + "rewards/margins": 3.609375, + "rewards/rejected": -9.5625, + "step": 12740 + }, + { + "epoch": 0.6571318129107074, + "grad_norm": 10.089100882351309, + "learning_rate": 1.5867271935789733e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.90625, + "logps/chosen": -780.0, + "logps/rejected": -1120.0, + "loss": 0.2472, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.0, + "rewards/margins": 3.609375, + "rewards/rejected": -9.625, + "step": 12750 + }, + { + "epoch": 0.6576472104110295, + "grad_norm": 9.541545422368952, + "learning_rate": 1.5825415348913663e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.96875, + "logps/chosen": -764.0, + "logps/rejected": -1072.0, + "loss": 0.2401, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.96875, + "rewards/margins": 3.375, + "rewards/rejected": -9.3125, + "step": 12760 + }, + { + "epoch": 0.6581626079113516, + "grad_norm": 7.235880461924555, + "learning_rate": 1.5783588461447173e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.0, + "logps/chosen": -788.0, + "logps/rejected": -1088.0, + "loss": 0.2549, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.25, + "rewards/rejected": -9.25, + "step": 12770 + }, + { + "epoch": 0.6586780054116738, + "grad_norm": 7.075046618624804, + "learning_rate": 1.5741791408789737e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.921875, + "logps/chosen": -768.0, + "logps/rejected": -1072.0, + "loss": 0.2213, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.875, + "rewards/margins": 3.25, + "rewards/rejected": -9.125, + "step": 12780 + }, + { + "epoch": 0.6591934029119959, + "grad_norm": 8.601368996164789, + "learning_rate": 1.5700024326244222e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.046875, + "logps/chosen": -760.0, + "logps/rejected": -1064.0, + "loss": 0.2329, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 3.203125, + "rewards/rejected": -9.0625, + "step": 12790 + }, + { + "epoch": 0.659708800412318, + "grad_norm": 8.668430196838132, + "learning_rate": 1.5658287349016506e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.046875, + "logps/chosen": -772.0, + "logps/rejected": -1080.0, + "loss": 0.2416, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.90625, + "rewards/margins": 3.359375, + "rewards/rejected": -9.25, + "step": 12800 + }, + { + "epoch": 0.6602241979126401, + "grad_norm": 7.357382393922462, + "learning_rate": 1.561658061221501e-07, + "logits/chosen": -3.390625, + "logits/rejected": -2.921875, + "logps/chosen": -768.0, + "logps/rejected": -1112.0, + "loss": 0.2371, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.03125, + "rewards/margins": 3.3125, + "rewards/rejected": -9.3125, + "step": 12810 + }, + { + "epoch": 0.6607395954129622, + "grad_norm": 8.452334305503593, + "learning_rate": 1.5574904250850246e-07, + "logits/chosen": -3.078125, + "logits/rejected": -2.890625, + "logps/chosen": -768.0, + "logps/rejected": -1120.0, + "loss": 0.2424, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.96875, + "rewards/margins": 3.5625, + "rewards/rejected": -9.5625, + "step": 12820 + }, + { + "epoch": 0.6612549929132844, + "grad_norm": 9.266530040823797, + "learning_rate": 1.5533258399834421e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.875, + "logps/chosen": -740.0, + "logps/rejected": -1104.0, + "loss": 0.2296, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 3.734375, + "rewards/rejected": -9.5625, + "step": 12830 + }, + { + "epoch": 0.6617703904136065, + "grad_norm": 8.202032695126173, + "learning_rate": 1.5491643193980946e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.78125, + "logps/chosen": -776.0, + "logps/rejected": -1160.0, + "loss": 0.2186, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.09375, + "rewards/margins": 3.78125, + "rewards/rejected": -9.875, + "step": 12840 + }, + { + "epoch": 0.6622857879139287, + "grad_norm": 9.303803844255983, + "learning_rate": 1.5450058768004043e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.1875, + "logps/chosen": -780.0, + "logps/rejected": -1096.0, + "loss": 0.2259, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.03125, + "rewards/margins": 3.40625, + "rewards/rejected": -9.4375, + "step": 12850 + }, + { + "epoch": 0.6628011854142507, + "grad_norm": 5.9849314270141, + "learning_rate": 1.5408505256518305e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.90625, + "logps/chosen": -772.0, + "logps/rejected": -1136.0, + "loss": 0.232, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.875, + "rewards/margins": 3.953125, + "rewards/rejected": -9.8125, + "step": 12860 + }, + { + "epoch": 0.6633165829145728, + "grad_norm": 10.837457836954654, + "learning_rate": 1.5366982794038234e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.984375, + "logps/chosen": -772.0, + "logps/rejected": -1072.0, + "loss": 0.2394, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.9375, + "rewards/margins": 3.171875, + "rewards/rejected": -9.125, + "step": 12870 + }, + { + "epoch": 0.663831980414895, + "grad_norm": 10.106445743717117, + "learning_rate": 1.5325491514977844e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.09375, + "logps/chosen": -832.0, + "logps/rejected": -1144.0, + "loss": 0.2312, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 3.375, + "rewards/rejected": -9.6875, + "step": 12880 + }, + { + "epoch": 0.6643473779152171, + "grad_norm": 10.062792420887806, + "learning_rate": 1.5284031553650174e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.078125, + "logps/chosen": -792.0, + "logps/rejected": -1096.0, + "loss": 0.2343, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.1875, + "rewards/margins": 3.125, + "rewards/rejected": -9.3125, + "step": 12890 + }, + { + "epoch": 0.6648627754155393, + "grad_norm": 6.31121920967209, + "learning_rate": 1.5242603044266902e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.0, + "logps/chosen": -788.0, + "logps/rejected": -1112.0, + "loss": 0.2146, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.25, + "rewards/margins": 3.21875, + "rewards/rejected": -9.5, + "step": 12900 + }, + { + "epoch": 0.6653781729158613, + "grad_norm": 7.663325206688367, + "learning_rate": 1.5201206120937896e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.953125, + "logps/chosen": -740.0, + "logps/rejected": -1128.0, + "loss": 0.2174, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.78125, + "rewards/margins": 3.9375, + "rewards/rejected": -9.75, + "step": 12910 + }, + { + "epoch": 0.6658935704161835, + "grad_norm": 5.82631234098123, + "learning_rate": 1.5159840917670757e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.109375, + "logps/chosen": -792.0, + "logps/rejected": -1120.0, + "loss": 0.2088, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.25, + "rewards/margins": 3.328125, + "rewards/rejected": -9.5625, + "step": 12920 + }, + { + "epoch": 0.6664089679165056, + "grad_norm": 8.383255289633494, + "learning_rate": 1.5118507568370434e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.90625, + "logps/chosen": -772.0, + "logps/rejected": -1104.0, + "loss": 0.2255, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.125, + "rewards/margins": 3.359375, + "rewards/rejected": -9.5, + "step": 12930 + }, + { + "epoch": 0.6669243654168278, + "grad_norm": 10.580168771386347, + "learning_rate": 1.5077206206838722e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.953125, + "logps/chosen": -792.0, + "logps/rejected": -1136.0, + "loss": 0.2128, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.03125, + "rewards/margins": 3.734375, + "rewards/rejected": -9.75, + "step": 12940 + }, + { + "epoch": 0.6674397629171499, + "grad_norm": 12.510528006219435, + "learning_rate": 1.5035936966773888e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.078125, + "logps/chosen": -792.0, + "logps/rejected": -1104.0, + "loss": 0.2532, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.1875, + "rewards/margins": 3.125, + "rewards/rejected": -9.3125, + "step": 12950 + }, + { + "epoch": 0.6679551604174719, + "grad_norm": 8.24324410832808, + "learning_rate": 1.499469998177022e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.140625, + "logps/chosen": -796.0, + "logps/rejected": -1120.0, + "loss": 0.2062, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.421875, + "rewards/rejected": -9.5625, + "step": 12960 + }, + { + "epoch": 0.6684705579177941, + "grad_norm": 9.260318999958987, + "learning_rate": 1.4953495385317593e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.921875, + "logps/chosen": -776.0, + "logps/rejected": -1144.0, + "loss": 0.2415, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.1875, + "rewards/margins": 3.6875, + "rewards/rejected": -9.875, + "step": 12970 + }, + { + "epoch": 0.6689859554181162, + "grad_norm": 8.384101413694909, + "learning_rate": 1.491232331080103e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.03125, + "logps/chosen": -796.0, + "logps/rejected": -1168.0, + "loss": 0.2215, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.78125, + "rewards/rejected": -9.875, + "step": 12980 + }, + { + "epoch": 0.6695013529184384, + "grad_norm": 5.516865797796328, + "learning_rate": 1.4871183891500264e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.078125, + "logps/chosen": -768.0, + "logps/rejected": -1088.0, + "loss": 0.2195, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.84375, + "rewards/margins": 3.234375, + "rewards/rejected": -9.0625, + "step": 12990 + }, + { + "epoch": 0.6700167504187605, + "grad_norm": 11.639626047105118, + "learning_rate": 1.483007726058934e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.875, + "logps/chosen": -732.0, + "logps/rejected": -1112.0, + "loss": 0.2138, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.53125, + "rewards/margins": 3.9375, + "rewards/rejected": -9.4375, + "step": 13000 + }, + { + "epoch": 0.6705321479190826, + "grad_norm": 8.93914843717702, + "learning_rate": 1.4789003551136147e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.09375, + "logps/chosen": -784.0, + "logps/rejected": -1152.0, + "loss": 0.2383, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.609375, + "rewards/rejected": -9.8125, + "step": 13010 + }, + { + "epoch": 0.6710475454194047, + "grad_norm": 8.881998183785665, + "learning_rate": 1.4747962896102018e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.140625, + "logps/chosen": -800.0, + "logps/rejected": -1096.0, + "loss": 0.2333, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.078125, + "rewards/rejected": -9.375, + "step": 13020 + }, + { + "epoch": 0.6715629429197268, + "grad_norm": 8.549022885871125, + "learning_rate": 1.4706955428341262e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.09375, + "logps/chosen": -808.0, + "logps/rejected": -1168.0, + "loss": 0.2445, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.828125, + "rewards/rejected": -10.0625, + "step": 13030 + }, + { + "epoch": 0.672078340420049, + "grad_norm": 8.351708988570488, + "learning_rate": 1.4665981280600788e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.84375, + "logps/chosen": -772.0, + "logps/rejected": -1136.0, + "loss": 0.2474, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.21875, + "rewards/margins": 3.640625, + "rewards/rejected": -9.875, + "step": 13040 + }, + { + "epoch": 0.6725937379203711, + "grad_norm": 10.927970640987619, + "learning_rate": 1.462504058551961e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.921875, + "logps/chosen": -780.0, + "logps/rejected": -1112.0, + "loss": 0.2624, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.28125, + "rewards/margins": 3.296875, + "rewards/rejected": -9.5625, + "step": 13050 + }, + { + "epoch": 0.6731091354206932, + "grad_norm": 6.014379193567238, + "learning_rate": 1.4584133475628465e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.078125, + "logps/chosen": -808.0, + "logps/rejected": -1136.0, + "loss": 0.2251, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.15625, + "rewards/margins": 3.515625, + "rewards/rejected": -9.6875, + "step": 13060 + }, + { + "epoch": 0.6736245329210153, + "grad_norm": 8.003883830113416, + "learning_rate": 1.4543260083349374e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.0625, + "logps/chosen": -784.0, + "logps/rejected": -1112.0, + "loss": 0.219, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.96875, + "rewards/margins": 3.421875, + "rewards/rejected": -9.375, + "step": 13070 + }, + { + "epoch": 0.6741399304213375, + "grad_norm": 8.904803750900998, + "learning_rate": 1.4502420540995208e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.125, + "logps/chosen": -828.0, + "logps/rejected": -1152.0, + "loss": 0.2362, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.40625, + "rewards/margins": 3.34375, + "rewards/rejected": -9.75, + "step": 13080 + }, + { + "epoch": 0.6746553279216596, + "grad_norm": 10.99862036790619, + "learning_rate": 1.4461614980769275e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.984375, + "logps/chosen": -808.0, + "logps/rejected": -1144.0, + "loss": 0.2303, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.34375, + "rewards/margins": 3.375, + "rewards/rejected": -9.75, + "step": 13090 + }, + { + "epoch": 0.6751707254219818, + "grad_norm": 12.671353092869754, + "learning_rate": 1.4420843534764827e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.15625, + "logps/chosen": -792.0, + "logps/rejected": -1136.0, + "loss": 0.2469, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.125, + "rewards/margins": 3.484375, + "rewards/rejected": -9.625, + "step": 13100 + }, + { + "epoch": 0.6756861229223038, + "grad_norm": 8.80157445060348, + "learning_rate": 1.438010633496474e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.984375, + "logps/chosen": -768.0, + "logps/rejected": -1096.0, + "loss": 0.249, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.03125, + "rewards/margins": 3.46875, + "rewards/rejected": -9.5, + "step": 13110 + }, + { + "epoch": 0.6762015204226259, + "grad_norm": 7.368335689677706, + "learning_rate": 1.4339403513241003e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.90625, + "logps/chosen": -736.0, + "logps/rejected": -1096.0, + "loss": 0.2183, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.71875, + "rewards/margins": 3.828125, + "rewards/rejected": -9.5625, + "step": 13120 + }, + { + "epoch": 0.6767169179229481, + "grad_norm": 7.600516201173178, + "learning_rate": 1.4298735201354328e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.078125, + "logps/chosen": -800.0, + "logps/rejected": -1096.0, + "loss": 0.2416, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.15625, + "rewards/margins": 3.203125, + "rewards/rejected": -9.375, + "step": 13130 + }, + { + "epoch": 0.6772323154232702, + "grad_norm": 10.474644383353072, + "learning_rate": 1.4258101530953723e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.984375, + "logps/chosen": -784.0, + "logps/rejected": -1088.0, + "loss": 0.2343, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.234375, + "rewards/rejected": -9.4375, + "step": 13140 + }, + { + "epoch": 0.6777477129235924, + "grad_norm": 10.161450877422654, + "learning_rate": 1.421750263357603e-07, + "logits/chosen": -3.21875, + "logits/rejected": -3.0, + "logps/chosen": -784.0, + "logps/rejected": -1112.0, + "loss": 0.2478, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.265625, + "rewards/rejected": -9.4375, + "step": 13150 + }, + { + "epoch": 0.6782631104239144, + "grad_norm": 9.162339066109725, + "learning_rate": 1.417693864064553e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.8125, + "logps/chosen": -796.0, + "logps/rejected": -1120.0, + "loss": 0.2288, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 3.296875, + "rewards/rejected": -9.5, + "step": 13160 + }, + { + "epoch": 0.6787785079242366, + "grad_norm": 6.890733397872839, + "learning_rate": 1.4136409683473532e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.890625, + "logps/chosen": -768.0, + "logps/rejected": -1120.0, + "loss": 0.2477, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.84375, + "rewards/margins": 3.5625, + "rewards/rejected": -9.375, + "step": 13170 + }, + { + "epoch": 0.6792939054245587, + "grad_norm": 5.824207747112544, + "learning_rate": 1.4095915893257928e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.125, + "logps/chosen": -756.0, + "logps/rejected": -1096.0, + "loss": 0.2385, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.8125, + "rewards/margins": 3.53125, + "rewards/rejected": -9.375, + "step": 13180 + }, + { + "epoch": 0.6798093029248808, + "grad_norm": 8.747785025089968, + "learning_rate": 1.4055457401082764e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.171875, + "logps/chosen": -780.0, + "logps/rejected": -1088.0, + "loss": 0.2158, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.125, + "rewards/rejected": -9.1875, + "step": 13190 + }, + { + "epoch": 0.680324700425203, + "grad_norm": 7.522590827129012, + "learning_rate": 1.4015034337917813e-07, + "logits/chosen": -3.5, + "logits/rejected": -3.15625, + "logps/chosen": -788.0, + "logps/rejected": -1120.0, + "loss": 0.246, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.03125, + "rewards/margins": 3.34375, + "rewards/rejected": -9.375, + "step": 13200 + }, + { + "epoch": 0.680840097925525, + "grad_norm": 8.158211581459557, + "learning_rate": 1.397464683461816e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.046875, + "logps/chosen": -776.0, + "logps/rejected": -1112.0, + "loss": 0.2513, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.484375, + "rewards/rejected": -9.625, + "step": 13210 + }, + { + "epoch": 0.6813554954258472, + "grad_norm": 9.285973441375255, + "learning_rate": 1.3934295021923787e-07, + "logits/chosen": -3.546875, + "logits/rejected": -3.171875, + "logps/chosen": -788.0, + "logps/rejected": -1112.0, + "loss": 0.2551, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.84375, + "rewards/margins": 3.546875, + "rewards/rejected": -9.375, + "step": 13220 + }, + { + "epoch": 0.6818708929261693, + "grad_norm": 6.771689585478034, + "learning_rate": 1.3893979030459138e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.078125, + "logps/chosen": -788.0, + "logps/rejected": -1064.0, + "loss": 0.2416, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.9375, + "rewards/margins": 3.15625, + "rewards/rejected": -9.0625, + "step": 13230 + }, + { + "epoch": 0.6823862904264915, + "grad_norm": 8.218704141339925, + "learning_rate": 1.385369899073271e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.140625, + "logps/chosen": -780.0, + "logps/rejected": -1112.0, + "loss": 0.2365, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.96875, + "rewards/margins": 3.375, + "rewards/rejected": -9.3125, + "step": 13240 + }, + { + "epoch": 0.6829016879268136, + "grad_norm": 8.151344622688, + "learning_rate": 1.381345503313659e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.8125, + "logps/chosen": -808.0, + "logps/rejected": -1120.0, + "loss": 0.206, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.25, + "rewards/margins": 3.3125, + "rewards/rejected": -9.5625, + "step": 13250 + }, + { + "epoch": 0.6834170854271356, + "grad_norm": 9.932107389961592, + "learning_rate": 1.3773247287946097e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.03125, + "logps/chosen": -780.0, + "logps/rejected": -1152.0, + "loss": 0.2116, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.15625, + "rewards/margins": 3.828125, + "rewards/rejected": -10.0, + "step": 13260 + }, + { + "epoch": 0.6839324829274578, + "grad_norm": 6.024192581717042, + "learning_rate": 1.3733075885319283e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.09375, + "logps/chosen": -812.0, + "logps/rejected": -1168.0, + "loss": 0.2115, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 3.640625, + "rewards/rejected": -9.9375, + "step": 13270 + }, + { + "epoch": 0.6844478804277799, + "grad_norm": 8.048261092523775, + "learning_rate": 1.3692940955296597e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.015625, + "logps/chosen": -792.0, + "logps/rejected": -1184.0, + "loss": 0.2249, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.125, + "rewards/margins": 4.03125, + "rewards/rejected": -10.1875, + "step": 13280 + }, + { + "epoch": 0.6849632779281021, + "grad_norm": 9.48560085695171, + "learning_rate": 1.3652842627800407e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.015625, + "logps/chosen": -820.0, + "logps/rejected": -1176.0, + "loss": 0.2503, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.34375, + "rewards/margins": 3.65625, + "rewards/rejected": -10.0, + "step": 13290 + }, + { + "epoch": 0.6854786754284242, + "grad_norm": 7.8220493426552595, + "learning_rate": 1.361278103263457e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.078125, + "logps/chosen": -776.0, + "logps/rejected": -1136.0, + "loss": 0.2377, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.0, + "rewards/margins": 3.671875, + "rewards/rejected": -9.6875, + "step": 13300 + }, + { + "epoch": 0.6859940729287463, + "grad_norm": 9.753397844848715, + "learning_rate": 1.357275629948408e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.875, + "logps/chosen": -736.0, + "logps/rejected": -1096.0, + "loss": 0.2302, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.8125, + "rewards/margins": 3.578125, + "rewards/rejected": -9.4375, + "step": 13310 + }, + { + "epoch": 0.6865094704290684, + "grad_norm": 9.836810750435404, + "learning_rate": 1.353276855791455e-07, + "logits/chosen": -3.171875, + "logits/rejected": -3.109375, + "logps/chosen": -788.0, + "logps/rejected": -1120.0, + "loss": 0.2436, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.0, + "rewards/margins": 3.609375, + "rewards/rejected": -9.625, + "step": 13320 + }, + { + "epoch": 0.6870248679293905, + "grad_norm": 7.737654999804018, + "learning_rate": 1.3492817937371887e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.0, + "logps/chosen": -776.0, + "logps/rejected": -1128.0, + "loss": 0.2421, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.609375, + "rewards/rejected": -9.75, + "step": 13330 + }, + { + "epoch": 0.6875402654297127, + "grad_norm": 8.471523027104276, + "learning_rate": 1.3452904567181834e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.921875, + "logps/chosen": -792.0, + "logps/rejected": -1152.0, + "loss": 0.2261, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.125, + "rewards/margins": 3.828125, + "rewards/rejected": -9.9375, + "step": 13340 + }, + { + "epoch": 0.6880556629300348, + "grad_norm": 9.04400548283302, + "learning_rate": 1.3413028576549512e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.75, + "logps/chosen": -784.0, + "logps/rejected": -1128.0, + "loss": 0.2174, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.46875, + "rewards/rejected": -9.5625, + "step": 13350 + }, + { + "epoch": 0.6885710604303569, + "grad_norm": 7.5219963529681335, + "learning_rate": 1.337319009455908e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.90625, + "logps/chosen": -776.0, + "logps/rejected": -1136.0, + "loss": 0.2571, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.96875, + "rewards/margins": 3.609375, + "rewards/rejected": -9.5625, + "step": 13360 + }, + { + "epoch": 0.689086457930679, + "grad_norm": 9.332835223978439, + "learning_rate": 1.3333389250173237e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -800.0, + "logps/rejected": -1184.0, + "loss": 0.2393, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.28125, + "rewards/margins": 3.890625, + "rewards/rejected": -10.125, + "step": 13370 + }, + { + "epoch": 0.6896018554310012, + "grad_norm": 8.782354682695985, + "learning_rate": 1.329362617223288e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -772.0, + "logps/rejected": -1056.0, + "loss": 0.252, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.8125, + "rewards/margins": 3.015625, + "rewards/rejected": -8.8125, + "step": 13380 + }, + { + "epoch": 0.6901172529313233, + "grad_norm": 7.571776234114652, + "learning_rate": 1.3253900989456645e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.03125, + "logps/chosen": -756.0, + "logps/rejected": -1104.0, + "loss": 0.221, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.84375, + "rewards/margins": 3.609375, + "rewards/rejected": -9.4375, + "step": 13390 + }, + { + "epoch": 0.6906326504316455, + "grad_norm": 9.787183031375784, + "learning_rate": 1.3214213830440463e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.109375, + "logps/chosen": -748.0, + "logps/rejected": -1056.0, + "loss": 0.2278, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.875, + "rewards/margins": 3.265625, + "rewards/rejected": -9.1875, + "step": 13400 + }, + { + "epoch": 0.6911480479319675, + "grad_norm": 10.361687338958166, + "learning_rate": 1.3174564823657213e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.90625, + "logps/chosen": -768.0, + "logps/rejected": -1088.0, + "loss": 0.2323, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.03125, + "rewards/margins": 3.28125, + "rewards/rejected": -9.3125, + "step": 13410 + }, + { + "epoch": 0.6916634454322896, + "grad_norm": 6.6909000223696955, + "learning_rate": 1.3134954097456272e-07, + "logits/chosen": -3.46875, + "logits/rejected": -3.171875, + "logps/chosen": -752.0, + "logps/rejected": -1120.0, + "loss": 0.2346, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.75, + "rewards/margins": 3.796875, + "rewards/rejected": -9.5625, + "step": 13420 + }, + { + "epoch": 0.6921788429326118, + "grad_norm": 9.134632130588473, + "learning_rate": 1.3095381780063052e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.0625, + "logps/chosen": -816.0, + "logps/rejected": -1152.0, + "loss": 0.2246, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.21875, + "rewards/margins": 3.515625, + "rewards/rejected": -9.75, + "step": 13430 + }, + { + "epoch": 0.6926942404329339, + "grad_norm": 8.345763937466929, + "learning_rate": 1.3055847999578696e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.09375, + "logps/chosen": -784.0, + "logps/rejected": -1136.0, + "loss": 0.2249, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.625, + "rewards/rejected": -9.8125, + "step": 13440 + }, + { + "epoch": 0.6932096379332561, + "grad_norm": 8.307580944000582, + "learning_rate": 1.3016352883979527e-07, + "logits/chosen": -3.15625, + "logits/rejected": -3.0, + "logps/chosen": -772.0, + "logps/rejected": -1096.0, + "loss": 0.2425, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.3125, + "rewards/rejected": -9.375, + "step": 13450 + }, + { + "epoch": 0.6937250354335781, + "grad_norm": 7.404913056606003, + "learning_rate": 1.2976896561116763e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.046875, + "logps/chosen": -788.0, + "logps/rejected": -1128.0, + "loss": 0.2392, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.09375, + "rewards/margins": 3.53125, + "rewards/rejected": -9.625, + "step": 13460 + }, + { + "epoch": 0.6942404329339003, + "grad_norm": 6.039270308801747, + "learning_rate": 1.2937479158716024e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.109375, + "logps/chosen": -796.0, + "logps/rejected": -1112.0, + "loss": 0.2299, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.03125, + "rewards/margins": 3.5, + "rewards/rejected": -9.5, + "step": 13470 + }, + { + "epoch": 0.6947558304342224, + "grad_norm": 7.091463851910839, + "learning_rate": 1.2898100804376925e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.0, + "logps/chosen": -820.0, + "logps/rejected": -1136.0, + "loss": 0.2377, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.28125, + "rewards/margins": 3.390625, + "rewards/rejected": -9.6875, + "step": 13480 + }, + { + "epoch": 0.6952712279345445, + "grad_norm": 11.31631602768807, + "learning_rate": 1.28587616255727e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.140625, + "logps/chosen": -784.0, + "logps/rejected": -1136.0, + "loss": 0.245, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.03125, + "rewards/margins": 3.796875, + "rewards/rejected": -9.8125, + "step": 13490 + }, + { + "epoch": 0.6957866254348667, + "grad_norm": 8.30344034316391, + "learning_rate": 1.281946174964974e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.046875, + "logps/chosen": -820.0, + "logps/rejected": -1136.0, + "loss": 0.2352, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.375, + "rewards/margins": 3.515625, + "rewards/rejected": -9.875, + "step": 13500 + }, + { + "epoch": 0.6963020229351887, + "grad_norm": 7.002844933427936, + "learning_rate": 1.2780201303827233e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.046875, + "logps/chosen": -784.0, + "logps/rejected": -1128.0, + "loss": 0.2452, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.25, + "rewards/margins": 3.3125, + "rewards/rejected": -9.5625, + "step": 13510 + }, + { + "epoch": 0.6968174204355109, + "grad_norm": 5.824683656236229, + "learning_rate": 1.2740980415196722e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.0625, + "logps/chosen": -800.0, + "logps/rejected": -1120.0, + "loss": 0.2467, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.53125, + "rewards/rejected": -9.6875, + "step": 13520 + }, + { + "epoch": 0.697332817935833, + "grad_norm": 9.270130898689137, + "learning_rate": 1.2701799210721675e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.15625, + "logps/chosen": -812.0, + "logps/rejected": -1120.0, + "loss": 0.2534, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 3.34375, + "rewards/rejected": -9.5625, + "step": 13530 + }, + { + "epoch": 0.6978482154361552, + "grad_norm": 7.457610440945698, + "learning_rate": 1.266265781723713e-07, + "logits/chosen": -3.453125, + "logits/rejected": -3.09375, + "logps/chosen": -784.0, + "logps/rejected": -1136.0, + "loss": 0.2341, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.5, + "rewards/rejected": -9.6875, + "step": 13540 + }, + { + "epoch": 0.6983636129364773, + "grad_norm": 9.699927001881205, + "learning_rate": 1.2623556361449217e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.109375, + "logps/chosen": -808.0, + "logps/rejected": -1136.0, + "loss": 0.2492, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.296875, + "rewards/rejected": -9.375, + "step": 13550 + }, + { + "epoch": 0.6988790104367993, + "grad_norm": 7.123574077818472, + "learning_rate": 1.2584494969934812e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.046875, + "logps/chosen": -756.0, + "logps/rejected": -1096.0, + "loss": 0.2458, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.90625, + "rewards/margins": 3.46875, + "rewards/rejected": -9.375, + "step": 13560 + }, + { + "epoch": 0.6993944079371215, + "grad_norm": 9.014343353909952, + "learning_rate": 1.2545473769141077e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.078125, + "logps/chosen": -784.0, + "logps/rejected": -1112.0, + "loss": 0.2255, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.296875, + "rewards/rejected": -9.375, + "step": 13570 + }, + { + "epoch": 0.6999098054374436, + "grad_norm": 8.768559194363343, + "learning_rate": 1.2506492885385095e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.921875, + "logps/chosen": -776.0, + "logps/rejected": -1104.0, + "loss": 0.2349, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.3125, + "rewards/rejected": -9.375, + "step": 13580 + }, + { + "epoch": 0.7004252029377658, + "grad_norm": 8.999007772279004, + "learning_rate": 1.24675524448534e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.03125, + "logps/chosen": -784.0, + "logps/rejected": -1144.0, + "loss": 0.2186, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.09375, + "rewards/margins": 3.5625, + "rewards/rejected": -9.6875, + "step": 13590 + }, + { + "epoch": 0.7009406004380879, + "grad_norm": 6.959169394267382, + "learning_rate": 1.242865257360165e-07, + "logits/chosen": -3.28125, + "logits/rejected": -3.046875, + "logps/chosen": -760.0, + "logps/rejected": -1096.0, + "loss": 0.2615, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.09375, + "rewards/margins": 3.1875, + "rewards/rejected": -9.3125, + "step": 13600 + }, + { + "epoch": 0.70145599793841, + "grad_norm": 9.40137253210072, + "learning_rate": 1.238979339755413e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.078125, + "logps/chosen": -784.0, + "logps/rejected": -1064.0, + "loss": 0.2405, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.9375, + "rewards/margins": 3.171875, + "rewards/rejected": -9.125, + "step": 13610 + }, + { + "epoch": 0.7019713954387321, + "grad_norm": 8.778429815168483, + "learning_rate": 1.2350975042503414e-07, + "logits/chosen": -3.296875, + "logits/rejected": -3.203125, + "logps/chosen": -808.0, + "logps/rejected": -1080.0, + "loss": 0.2428, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.3125, + "rewards/margins": 2.8125, + "rewards/rejected": -9.125, + "step": 13620 + }, + { + "epoch": 0.7024867929390542, + "grad_norm": 8.40698939977126, + "learning_rate": 1.2312197634109948e-07, + "logits/chosen": -3.328125, + "logits/rejected": -2.953125, + "logps/chosen": -788.0, + "logps/rejected": -1168.0, + "loss": 0.2261, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.8125, + "rewards/rejected": -9.875, + "step": 13630 + }, + { + "epoch": 0.7030021904393764, + "grad_norm": 11.172015094357485, + "learning_rate": 1.2273461297901584e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.96875, + "logps/chosen": -784.0, + "logps/rejected": -1128.0, + "loss": 0.2198, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.25, + "rewards/margins": 3.34375, + "rewards/rejected": -9.625, + "step": 13640 + }, + { + "epoch": 0.7035175879396985, + "grad_norm": 8.377168686626812, + "learning_rate": 1.223476615927326e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.046875, + "logps/chosen": -776.0, + "logps/rejected": -1112.0, + "loss": 0.2128, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.0, + "rewards/margins": 3.65625, + "rewards/rejected": -9.6875, + "step": 13650 + }, + { + "epoch": 0.7040329854400206, + "grad_norm": 15.761408904971146, + "learning_rate": 1.2196112343486522e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.96875, + "logps/chosen": -792.0, + "logps/rejected": -1112.0, + "loss": 0.2557, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.1875, + "rewards/margins": 3.328125, + "rewards/rejected": -9.5, + "step": 13660 + }, + { + "epoch": 0.7045483829403427, + "grad_norm": 10.194644991394387, + "learning_rate": 1.2157499975669167e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.0625, + "logps/chosen": -768.0, + "logps/rejected": -1128.0, + "loss": 0.2366, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.09375, + "rewards/margins": 3.5, + "rewards/rejected": -9.625, + "step": 13670 + }, + { + "epoch": 0.7050637804406649, + "grad_norm": 7.877839382856908, + "learning_rate": 1.2118929180814822e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.0, + "logps/chosen": -792.0, + "logps/rejected": -1152.0, + "loss": 0.2262, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.1875, + "rewards/margins": 3.640625, + "rewards/rejected": -9.875, + "step": 13680 + }, + { + "epoch": 0.705579177940987, + "grad_norm": 8.112324266600748, + "learning_rate": 1.2080400083782506e-07, + "logits/chosen": -3.203125, + "logits/rejected": -2.984375, + "logps/chosen": -796.0, + "logps/rejected": -1120.0, + "loss": 0.228, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.3125, + "rewards/margins": 3.265625, + "rewards/rejected": -9.5625, + "step": 13690 + }, + { + "epoch": 0.7060945754413092, + "grad_norm": 7.679737396804753, + "learning_rate": 1.20419128092963e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.03125, + "logps/chosen": -808.0, + "logps/rejected": -1160.0, + "loss": 0.2332, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.375, + "rewards/margins": 3.59375, + "rewards/rejected": -9.9375, + "step": 13700 + }, + { + "epoch": 0.7066099729416312, + "grad_norm": 9.594605179321693, + "learning_rate": 1.200346748194486e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.9375, + "logps/chosen": -840.0, + "logps/rejected": -1168.0, + "loss": 0.2132, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.5625, + "rewards/margins": 3.53125, + "rewards/rejected": -10.0625, + "step": 13710 + }, + { + "epoch": 0.7071253704419533, + "grad_norm": 6.426235429855221, + "learning_rate": 1.1965064226181073e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.9375, + "logps/chosen": -820.0, + "logps/rejected": -1216.0, + "loss": 0.2271, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.4375, + "rewards/margins": 4.09375, + "rewards/rejected": -10.5625, + "step": 13720 + }, + { + "epoch": 0.7076407679422755, + "grad_norm": 6.823405303271641, + "learning_rate": 1.1926703166321635e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.90625, + "logps/chosen": -792.0, + "logps/rejected": -1120.0, + "loss": 0.2178, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.28125, + "rewards/margins": 3.390625, + "rewards/rejected": -9.6875, + "step": 13730 + }, + { + "epoch": 0.7081561654425976, + "grad_norm": 7.436726181670968, + "learning_rate": 1.1888384426546658e-07, + "logits/chosen": -3.328125, + "logits/rejected": -3.015625, + "logps/chosen": -788.0, + "logps/rejected": -1112.0, + "loss": 0.253, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.375, + "rewards/rejected": -9.375, + "step": 13740 + }, + { + "epoch": 0.7086715629429198, + "grad_norm": 7.234638566319858, + "learning_rate": 1.1850108130899231e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.96875, + "logps/chosen": -780.0, + "logps/rejected": -1112.0, + "loss": 0.2176, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.90625, + "rewards/margins": 3.59375, + "rewards/rejected": -9.5, + "step": 13750 + }, + { + "epoch": 0.7091869604432418, + "grad_norm": 6.713726822558297, + "learning_rate": 1.1811874403285052e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.828125, + "logps/chosen": -776.0, + "logps/rejected": -1136.0, + "loss": 0.222, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.96875, + "rewards/margins": 3.921875, + "rewards/rejected": -9.875, + "step": 13760 + }, + { + "epoch": 0.709702357943564, + "grad_norm": 8.392019736385812, + "learning_rate": 1.1773683367472037e-07, + "logits/chosen": -3.15625, + "logits/rejected": -3.0, + "logps/chosen": -800.0, + "logps/rejected": -1176.0, + "loss": 0.2468, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 3.78125, + "rewards/rejected": -10.0625, + "step": 13770 + }, + { + "epoch": 0.7102177554438861, + "grad_norm": 7.407391817238458, + "learning_rate": 1.173553514708989e-07, + "logits/chosen": -3.15625, + "logits/rejected": -2.859375, + "logps/chosen": -768.0, + "logps/rejected": -1168.0, + "loss": 0.2257, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.125, + "rewards/margins": 3.921875, + "rewards/rejected": -10.0625, + "step": 13780 + }, + { + "epoch": 0.7107331529442082, + "grad_norm": 8.686495530039409, + "learning_rate": 1.1697429865629732e-07, + "logits/chosen": -3.234375, + "logits/rejected": -3.046875, + "logps/chosen": -768.0, + "logps/rejected": -1096.0, + "loss": 0.2066, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.5, + "rewards/rejected": -9.625, + "step": 13790 + }, + { + "epoch": 0.7112485504445304, + "grad_norm": 9.008861425542916, + "learning_rate": 1.165936764644366e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.21875, + "logps/chosen": -808.0, + "logps/rejected": -1128.0, + "loss": 0.2485, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.09375, + "rewards/margins": 3.328125, + "rewards/rejected": -9.4375, + "step": 13800 + }, + { + "epoch": 0.7117639479448524, + "grad_norm": 7.835454977051053, + "learning_rate": 1.1621348612744378e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.1875, + "logps/chosen": -812.0, + "logps/rejected": -1104.0, + "loss": 0.22, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.46875, + "rewards/margins": 3.0, + "rewards/rejected": -9.4375, + "step": 13810 + }, + { + "epoch": 0.7122793454451746, + "grad_norm": 6.018069981805151, + "learning_rate": 1.1583372887604803e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.828125, + "logps/chosen": -780.0, + "logps/rejected": -1168.0, + "loss": 0.2152, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.875, + "rewards/rejected": -10.0, + "step": 13820 + }, + { + "epoch": 0.7127947429454967, + "grad_norm": 11.941846203011728, + "learning_rate": 1.1545440593957653e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.84375, + "logps/chosen": -768.0, + "logps/rejected": -1144.0, + "loss": 0.2476, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.21875, + "rewards/margins": 3.71875, + "rewards/rejected": -9.9375, + "step": 13830 + }, + { + "epoch": 0.7133101404458189, + "grad_norm": 7.350411024942794, + "learning_rate": 1.1507551854595063e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.015625, + "logps/chosen": -764.0, + "logps/rejected": -1144.0, + "loss": 0.2266, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.0, + "rewards/margins": 3.859375, + "rewards/rejected": -9.875, + "step": 13840 + }, + { + "epoch": 0.713825537946141, + "grad_norm": 7.838822022562383, + "learning_rate": 1.1469706792168151e-07, + "logits/chosen": -3.359375, + "logits/rejected": -3.234375, + "logps/chosen": -780.0, + "logps/rejected": -1120.0, + "loss": 0.2329, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.578125, + "rewards/rejected": -9.6875, + "step": 13850 + }, + { + "epoch": 0.714340935446463, + "grad_norm": 7.883417788039725, + "learning_rate": 1.1431905529186656e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.0, + "logps/chosen": -764.0, + "logps/rejected": -1120.0, + "loss": 0.2308, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.875, + "rewards/margins": 3.796875, + "rewards/rejected": -9.6875, + "step": 13860 + }, + { + "epoch": 0.7148563329467852, + "grad_norm": 10.107944532712068, + "learning_rate": 1.139414818801854e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.015625, + "logps/chosen": -792.0, + "logps/rejected": -1128.0, + "loss": 0.2423, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.1875, + "rewards/margins": 3.46875, + "rewards/rejected": -9.625, + "step": 13870 + }, + { + "epoch": 0.7153717304471073, + "grad_norm": 11.116262667890112, + "learning_rate": 1.1356434890889583e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.875, + "logps/chosen": -764.0, + "logps/rejected": -1168.0, + "loss": 0.2462, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.0, + "rewards/margins": 4.03125, + "rewards/rejected": -10.0625, + "step": 13880 + }, + { + "epoch": 0.7158871279474295, + "grad_norm": 10.897990656073144, + "learning_rate": 1.1318765759882987e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.96875, + "logps/chosen": -804.0, + "logps/rejected": -1152.0, + "loss": 0.2319, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.6875, + "rewards/rejected": -9.8125, + "step": 13890 + }, + { + "epoch": 0.7164025254477516, + "grad_norm": 7.424272951025634, + "learning_rate": 1.1281140916938987e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.109375, + "logps/chosen": -812.0, + "logps/rejected": -1136.0, + "loss": 0.2195, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 3.453125, + "rewards/rejected": -9.75, + "step": 13900 + }, + { + "epoch": 0.7169179229480737, + "grad_norm": 7.525026667152879, + "learning_rate": 1.1243560483854436e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.953125, + "logps/chosen": -776.0, + "logps/rejected": -1136.0, + "loss": 0.2511, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.96875, + "rewards/margins": 3.8125, + "rewards/rejected": -9.75, + "step": 13910 + }, + { + "epoch": 0.7174333204483958, + "grad_norm": 10.485804449406425, + "learning_rate": 1.1206024582282428e-07, + "logits/chosen": -3.34375, + "logits/rejected": -3.15625, + "logps/chosen": -800.0, + "logps/rejected": -1152.0, + "loss": 0.2484, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.15625, + "rewards/margins": 3.6875, + "rewards/rejected": -9.875, + "step": 13920 + }, + { + "epoch": 0.717948717948718, + "grad_norm": 7.29570596991905, + "learning_rate": 1.116853333373192e-07, + "logits/chosen": -3.34375, + "logits/rejected": -2.890625, + "logps/chosen": -776.0, + "logps/rejected": -1136.0, + "loss": 0.2137, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.90625, + "rewards/margins": 3.703125, + "rewards/rejected": -9.625, + "step": 13930 + }, + { + "epoch": 0.7184641154490401, + "grad_norm": 9.461573625036378, + "learning_rate": 1.1131086859567304e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.0, + "logps/chosen": -752.0, + "logps/rejected": -1120.0, + "loss": 0.2188, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.9375, + "rewards/margins": 3.796875, + "rewards/rejected": -9.75, + "step": 13940 + }, + { + "epoch": 0.7189795129493622, + "grad_norm": 7.558451765859401, + "learning_rate": 1.1093685281008053e-07, + "logits/chosen": -3.234375, + "logits/rejected": -2.953125, + "logps/chosen": -808.0, + "logps/rejected": -1160.0, + "loss": 0.2123, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.734375, + "rewards/rejected": -9.875, + "step": 13950 + }, + { + "epoch": 0.7194949104496843, + "grad_norm": 9.412259092060458, + "learning_rate": 1.105632871912828e-07, + "logits/chosen": -3.375, + "logits/rejected": -3.078125, + "logps/chosen": -820.0, + "logps/rejected": -1176.0, + "loss": 0.2423, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.46875, + "rewards/margins": 3.625, + "rewards/rejected": -10.0625, + "step": 13960 + }, + { + "epoch": 0.7200103079500064, + "grad_norm": 9.241659069614707, + "learning_rate": 1.1019017294856373e-07, + "logits/chosen": -3.3125, + "logits/rejected": -2.890625, + "logps/chosen": -796.0, + "logps/rejected": -1136.0, + "loss": 0.2504, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.28125, + "rewards/margins": 3.59375, + "rewards/rejected": -9.875, + "step": 13970 + }, + { + "epoch": 0.7205257054503286, + "grad_norm": 7.3529922677085855, + "learning_rate": 1.0981751128974625e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.125, + "logps/chosen": -764.0, + "logps/rejected": -1120.0, + "loss": 0.2358, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.515625, + "rewards/rejected": -9.625, + "step": 13980 + }, + { + "epoch": 0.7210411029506507, + "grad_norm": 7.914447764853694, + "learning_rate": 1.094453034211881e-07, + "logits/chosen": -3.171875, + "logits/rejected": -2.90625, + "logps/chosen": -776.0, + "logps/rejected": -1144.0, + "loss": 0.2341, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.125, + "rewards/margins": 3.625, + "rewards/rejected": -9.75, + "step": 13990 + }, + { + "epoch": 0.7215565004509729, + "grad_norm": 8.67603123662342, + "learning_rate": 1.0907355054777811e-07, + "logits/chosen": -3.484375, + "logits/rejected": -3.3125, + "logps/chosen": -816.0, + "logps/rejected": -1104.0, + "loss": 0.2143, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.140625, + "rewards/rejected": -9.3125, + "step": 14000 + }, + { + "epoch": 0.7220718979512949, + "grad_norm": 6.5636678760171705, + "learning_rate": 1.0870225387293211e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.078125, + "logps/chosen": -816.0, + "logps/rejected": -1144.0, + "loss": 0.2357, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.25, + "rewards/rejected": -9.625, + "step": 14010 + }, + { + "epoch": 0.722587295451617, + "grad_norm": 6.269484587765931, + "learning_rate": 1.0833141459858908e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -788.0, + "logps/rejected": -1144.0, + "loss": 0.2149, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.125, + "rewards/margins": 3.640625, + "rewards/rejected": -9.75, + "step": 14020 + }, + { + "epoch": 0.7231026929519392, + "grad_norm": 7.426207174763315, + "learning_rate": 1.0796103392520756e-07, + "logits/chosen": -3.3125, + "logits/rejected": -3.109375, + "logps/chosen": -776.0, + "logps/rejected": -1120.0, + "loss": 0.23, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.15625, + "rewards/margins": 3.421875, + "rewards/rejected": -9.625, + "step": 14030 + }, + { + "epoch": 0.7236180904522613, + "grad_norm": 10.006823275073105, + "learning_rate": 1.0759111305176142e-07, + "logits/chosen": -3.421875, + "logits/rejected": -2.890625, + "logps/chosen": -796.0, + "logps/rejected": -1176.0, + "loss": 0.2315, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.71875, + "rewards/rejected": -10.0, + "step": 14040 + }, + { + "epoch": 0.7241334879525835, + "grad_norm": 9.344026112386432, + "learning_rate": 1.0722165317573617e-07, + "logits/chosen": -3.421875, + "logits/rejected": -3.171875, + "logps/chosen": -812.0, + "logps/rejected": -1160.0, + "loss": 0.2565, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.34375, + "rewards/margins": 3.46875, + "rewards/rejected": -9.8125, + "step": 14050 + }, + { + "epoch": 0.7246488854529055, + "grad_norm": 11.403483254200156, + "learning_rate": 1.0685265549312477e-07, + "logits/chosen": -3.265625, + "logits/rejected": -2.90625, + "logps/chosen": -772.0, + "logps/rejected": -1152.0, + "loss": 0.2206, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.8125, + "rewards/margins": 3.828125, + "rewards/rejected": -9.625, + "step": 14060 + }, + { + "epoch": 0.7251642829532277, + "grad_norm": 7.919796780905841, + "learning_rate": 1.0648412119842434e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.046875, + "logps/chosen": -796.0, + "logps/rejected": -1128.0, + "loss": 0.2483, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.359375, + "rewards/rejected": -9.5625, + "step": 14070 + }, + { + "epoch": 0.7256796804535498, + "grad_norm": 9.003737845907814, + "learning_rate": 1.0611605148463163e-07, + "logits/chosen": -3.28125, + "logits/rejected": -2.953125, + "logps/chosen": -752.0, + "logps/rejected": -1096.0, + "loss": 0.2335, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -5.90625, + "rewards/margins": 3.328125, + "rewards/rejected": -9.25, + "step": 14080 + }, + { + "epoch": 0.726195077953872, + "grad_norm": 8.163219983673839, + "learning_rate": 1.0574844754323964e-07, + "logits/chosen": -3.265625, + "logits/rejected": -3.03125, + "logps/chosen": -784.0, + "logps/rejected": -1096.0, + "loss": 0.2312, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.9375, + "rewards/margins": 3.328125, + "rewards/rejected": -9.25, + "step": 14090 + }, + { + "epoch": 0.7267104754541941, + "grad_norm": 8.181663073855013, + "learning_rate": 1.0538131056423372e-07, + "logits/chosen": -3.203125, + "logits/rejected": -3.0, + "logps/chosen": -776.0, + "logps/rejected": -1120.0, + "loss": 0.2357, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.90625, + "rewards/margins": 3.59375, + "rewards/rejected": -9.5, + "step": 14100 + }, + { + "epoch": 0.7272258729545161, + "grad_norm": 8.806399142989035, + "learning_rate": 1.0501464173608723e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.859375, + "logps/chosen": -768.0, + "logps/rejected": -1104.0, + "loss": 0.2468, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.0625, + "rewards/margins": 3.421875, + "rewards/rejected": -9.5, + "step": 14110 + }, + { + "epoch": 0.7277412704548383, + "grad_norm": 10.01982518274392, + "learning_rate": 1.046484422457585e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.96875, + "logps/chosen": -776.0, + "logps/rejected": -1152.0, + "loss": 0.2081, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.875, + "rewards/margins": 3.984375, + "rewards/rejected": -9.875, + "step": 14120 + }, + { + "epoch": 0.7282566679551604, + "grad_norm": 9.353722566666715, + "learning_rate": 1.0428271327868612e-07, + "logits/chosen": -3.1875, + "logits/rejected": -2.828125, + "logps/chosen": -792.0, + "logps/rejected": -1160.0, + "loss": 0.2135, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.75, + "rewards/rejected": -9.875, + "step": 14130 + }, + { + "epoch": 0.7287720654554826, + "grad_norm": 12.756337221487437, + "learning_rate": 1.0391745601878591e-07, + "logits/chosen": -3.109375, + "logits/rejected": -2.90625, + "logps/chosen": -780.0, + "logps/rejected": -1152.0, + "loss": 0.2243, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.21875, + "rewards/margins": 3.875, + "rewards/rejected": -10.0625, + "step": 14140 + }, + { + "epoch": 0.7292874629558047, + "grad_norm": 9.2392868232289, + "learning_rate": 1.0355267164844652e-07, + "logits/chosen": -3.1875, + "logits/rejected": -3.046875, + "logps/chosen": -820.0, + "logps/rejected": -1144.0, + "loss": 0.2256, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.3125, + "rewards/margins": 3.5625, + "rewards/rejected": -9.875, + "step": 14150 + }, + { + "epoch": 0.7298028604561267, + "grad_norm": 11.57292543138038, + "learning_rate": 1.0318836134852573e-07, + "logits/chosen": -3.40625, + "logits/rejected": -3.1875, + "logps/chosen": -792.0, + "logps/rejected": -1136.0, + "loss": 0.2434, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.25, + "rewards/margins": 3.5, + "rewards/rejected": -9.75, + "step": 14160 + }, + { + "epoch": 0.7303182579564489, + "grad_norm": 14.899362879662007, + "learning_rate": 1.0282452629834693e-07, + "logits/chosen": -3.21875, + "logits/rejected": -2.90625, + "logps/chosen": -784.0, + "logps/rejected": -1144.0, + "loss": 0.2151, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.15625, + "rewards/margins": 3.71875, + "rewards/rejected": -9.875, + "step": 14170 + }, + { + "epoch": 0.730833655456771, + "grad_norm": 7.128407510332083, + "learning_rate": 1.0246116767569465e-07, + "logits/chosen": -3.25, + "logits/rejected": -2.9375, + "logps/chosen": -840.0, + "logps/rejected": -1192.0, + "loss": 0.214, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.625, + "rewards/margins": 3.609375, + "rewards/rejected": -10.25, + "step": 14180 + }, + { + "epoch": 0.7313490529570932, + "grad_norm": 10.200601252461519, + "learning_rate": 1.020982866568116e-07, + "logits/chosen": -3.390625, + "logits/rejected": -3.21875, + "logps/chosen": -840.0, + "logps/rejected": -1136.0, + "loss": 0.209, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.625, + "rewards/margins": 3.171875, + "rewards/rejected": -9.75, + "step": 14190 + }, + { + "epoch": 0.7318644504574153, + "grad_norm": 8.103859053319628, + "learning_rate": 1.0173588441639417e-07, + "logits/chosen": -3.4375, + "logits/rejected": -3.1875, + "logps/chosen": -832.0, + "logps/rejected": -1152.0, + "loss": 0.2333, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.59375, + "rewards/margins": 3.25, + "rewards/rejected": -9.8125, + "step": 14200 + }, + { + "epoch": 0.7323798479577374, + "grad_norm": 8.101893577681718, + "learning_rate": 1.0137396212758904e-07, + "logits/chosen": -3.359375, + "logits/rejected": -2.921875, + "logps/chosen": -840.0, + "logps/rejected": -1128.0, + "loss": 0.2367, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.40625, + "rewards/margins": 3.25, + "rewards/rejected": -9.6875, + "step": 14210 + }, + { + "epoch": 0.7328952454580595, + "grad_norm": 10.576203259512416, + "learning_rate": 1.010125209619889e-07, + "logits/chosen": -3.125, + "logits/rejected": -2.9375, + "logps/chosen": -768.0, + "logps/rejected": -1136.0, + "loss": 0.2213, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.796875, + "rewards/rejected": -9.875, + "step": 14220 + }, + { + "epoch": 0.7334106429583817, + "grad_norm": 7.85786615261518, + "learning_rate": 1.0065156208962943e-07, + "logits/chosen": -3.25, + "logits/rejected": -3.125, + "logps/chosen": -780.0, + "logps/rejected": -1088.0, + "loss": 0.2532, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.0, + "rewards/margins": 3.4375, + "rewards/rejected": -9.4375, + "step": 14230 + }, + { + "epoch": 0.7339260404587038, + "grad_norm": 9.473311603188352, + "learning_rate": 1.0029108667898462e-07, + "logits/chosen": -3.296875, + "logits/rejected": -2.96875, + "logps/chosen": -776.0, + "logps/rejected": -1104.0, + "loss": 0.2396, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.09375, + "rewards/margins": 3.390625, + "rewards/rejected": -9.5, + "step": 14240 + }, + { + "epoch": 0.734441437959026, + "grad_norm": 8.897950590493522, + "learning_rate": 9.993109589696372e-08, + "logits/chosen": -3.328125, + "logits/rejected": -2.90625, + "logps/chosen": -780.0, + "logps/rejected": -1104.0, + "loss": 0.2246, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.9375, + "rewards/margins": 3.359375, + "rewards/rejected": -9.3125, + "step": 14250 + }, + { + "epoch": 0.734956835459348, + "grad_norm": 12.323623858318392, + "learning_rate": 9.957159090890718e-08, + "logits/chosen": -3.3125, + "logits/rejected": -2.875, + "logps/chosen": -776.0, + "logps/rejected": -1112.0, + "loss": 0.2591, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.28125, + "rewards/margins": 3.375, + "rewards/rejected": -9.625, + "step": 14260 + }, + { + "epoch": 0.7354722329596701, + "grad_norm": 6.753946092218363, + "learning_rate": 9.921257287858257e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.90625, + "logps/chosen": -776.0, + "logps/rejected": -1112.0, + "loss": 0.2387, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.90625, + "rewards/margins": 3.515625, + "rewards/rejected": -9.4375, + "step": 14270 + }, + { + "epoch": 0.7359876304599923, + "grad_norm": 8.776399497970939, + "learning_rate": 9.885404296818145e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.046875, + "logps/chosen": -772.0, + "logps/rejected": -1144.0, + "loss": 0.2344, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.765625, + "rewards/rejected": -9.875, + "step": 14280 + }, + { + "epoch": 0.7365030279603144, + "grad_norm": 10.000218767244135, + "learning_rate": 9.849600233831501e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.203125, + "logps/chosen": -800.0, + "logps/rejected": -1128.0, + "loss": 0.2544, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.1875, + "rewards/margins": 3.34375, + "rewards/rejected": -9.5625, + "step": 14290 + }, + { + "epoch": 0.7370184254606366, + "grad_norm": 10.391152518412708, + "learning_rate": 9.813845214801075e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.875, + "logps/chosen": -768.0, + "logps/rejected": -1168.0, + "loss": 0.2166, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.9375, + "rewards/margins": 4.03125, + "rewards/rejected": -9.9375, + "step": 14300 + }, + { + "epoch": 0.7375338229609586, + "grad_norm": 8.960353166970119, + "learning_rate": 9.778139355470858e-08, + "logits/chosen": -3.25, + "logits/rejected": -3.078125, + "logps/chosen": -788.0, + "logps/rejected": -1104.0, + "loss": 0.2336, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.1875, + "rewards/margins": 3.28125, + "rewards/rejected": -9.5, + "step": 14310 + }, + { + "epoch": 0.7380492204612807, + "grad_norm": 9.421650206637777, + "learning_rate": 9.742482771425676e-08, + "logits/chosen": -3.34375, + "logits/rejected": -2.953125, + "logps/chosen": -792.0, + "logps/rejected": -1128.0, + "loss": 0.2392, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.5625, + "rewards/rejected": -9.875, + "step": 14320 + }, + { + "epoch": 0.7385646179616029, + "grad_norm": 7.76813797077336, + "learning_rate": 9.706875578090878e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.90625, + "logps/chosen": -796.0, + "logps/rejected": -1160.0, + "loss": 0.2327, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.5625, + "rewards/rejected": -9.875, + "step": 14330 + }, + { + "epoch": 0.739080015461925, + "grad_norm": 7.857485268993471, + "learning_rate": 9.6713178907319e-08, + "logits/chosen": -3.5625, + "logits/rejected": -3.328125, + "logps/chosen": -816.0, + "logps/rejected": -1144.0, + "loss": 0.2251, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -6.46875, + "rewards/margins": 3.296875, + "rewards/rejected": -9.75, + "step": 14340 + }, + { + "epoch": 0.7395954129622472, + "grad_norm": 6.658227256681257, + "learning_rate": 9.635809824453938e-08, + "logits/chosen": -3.0625, + "logits/rejected": -2.75, + "logps/chosen": -804.0, + "logps/rejected": -1176.0, + "loss": 0.2294, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.21875, + "rewards/margins": 3.90625, + "rewards/rejected": -10.125, + "step": 14350 + }, + { + "epoch": 0.7401108104625692, + "grad_norm": 9.488655397819045, + "learning_rate": 9.600351494201564e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.09375, + "logps/chosen": -784.0, + "logps/rejected": -1104.0, + "loss": 0.2452, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.125, + "rewards/margins": 3.34375, + "rewards/rejected": -9.4375, + "step": 14360 + }, + { + "epoch": 0.7406262079628914, + "grad_norm": 9.214149400779817, + "learning_rate": 9.564943014758317e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.890625, + "logps/chosen": -764.0, + "logps/rejected": -1104.0, + "loss": 0.2308, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.9375, + "rewards/margins": 3.640625, + "rewards/rejected": -9.5625, + "step": 14370 + }, + { + "epoch": 0.7411416054632135, + "grad_norm": 5.526827097628367, + "learning_rate": 9.529584500746391e-08, + "logits/chosen": -3.34375, + "logits/rejected": -3.125, + "logps/chosen": -796.0, + "logps/rejected": -1120.0, + "loss": 0.2517, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.25, + "rewards/margins": 3.3125, + "rewards/rejected": -9.5625, + "step": 14380 + }, + { + "epoch": 0.7416570029635356, + "grad_norm": 9.43222074514146, + "learning_rate": 9.494276066626236e-08, + "logits/chosen": -3.234375, + "logits/rejected": -2.828125, + "logps/chosen": -788.0, + "logps/rejected": -1144.0, + "loss": 0.2533, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.25, + "rewards/margins": 3.546875, + "rewards/rejected": -9.8125, + "step": 14390 + }, + { + "epoch": 0.7421724004638578, + "grad_norm": 8.951631471524662, + "learning_rate": 9.459017826696156e-08, + "logits/chosen": -3.09375, + "logits/rejected": -2.828125, + "logps/chosen": -796.0, + "logps/rejected": -1152.0, + "loss": 0.2287, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.3125, + "rewards/margins": 3.59375, + "rewards/rejected": -9.875, + "step": 14400 + }, + { + "epoch": 0.7426877979641798, + "grad_norm": 11.915247345385573, + "learning_rate": 9.423809895092005e-08, + "logits/chosen": -3.203125, + "logits/rejected": -2.96875, + "logps/chosen": -796.0, + "logps/rejected": -1152.0, + "loss": 0.2403, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.28125, + "rewards/margins": 3.53125, + "rewards/rejected": -9.8125, + "step": 14410 + }, + { + "epoch": 0.743203195464502, + "grad_norm": 11.98467202020444, + "learning_rate": 9.388652385786752e-08, + "logits/chosen": -3.234375, + "logits/rejected": -3.125, + "logps/chosen": -768.0, + "logps/rejected": -1088.0, + "loss": 0.234, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.03125, + "rewards/margins": 3.484375, + "rewards/rejected": -9.5, + "step": 14420 + }, + { + "epoch": 0.7437185929648241, + "grad_norm": 6.200208044973669, + "learning_rate": 9.353545412590163e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.078125, + "logps/chosen": -812.0, + "logps/rejected": -1152.0, + "loss": 0.2379, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.65625, + "rewards/rejected": -9.875, + "step": 14430 + }, + { + "epoch": 0.7442339904651463, + "grad_norm": 8.555332394548111, + "learning_rate": 9.318489089148415e-08, + "logits/chosen": -3.0, + "logits/rejected": -2.828125, + "logps/chosen": -792.0, + "logps/rejected": -1144.0, + "loss": 0.2204, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.15625, + "rewards/margins": 3.640625, + "rewards/rejected": -9.8125, + "step": 14440 + }, + { + "epoch": 0.7447493879654684, + "grad_norm": 8.877578026735751, + "learning_rate": 9.283483528943695e-08, + "logits/chosen": -3.5625, + "logits/rejected": -3.234375, + "logps/chosen": -800.0, + "logps/rejected": -1192.0, + "loss": 0.2398, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.921875, + "rewards/rejected": -10.125, + "step": 14450 + }, + { + "epoch": 0.7452647854657904, + "grad_norm": 9.223939901942135, + "learning_rate": 9.248528845293898e-08, + "logits/chosen": -3.15625, + "logits/rejected": -3.0, + "logps/chosen": -812.0, + "logps/rejected": -1136.0, + "loss": 0.2286, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.1875, + "rewards/margins": 3.515625, + "rewards/rejected": -9.6875, + "step": 14460 + }, + { + "epoch": 0.7457801829661126, + "grad_norm": 5.738223112258373, + "learning_rate": 9.213625151352194e-08, + "logits/chosen": -3.296875, + "logits/rejected": -3.0625, + "logps/chosen": -796.0, + "logps/rejected": -1104.0, + "loss": 0.2177, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.15625, + "rewards/margins": 3.375, + "rewards/rejected": -9.5, + "step": 14470 + }, + { + "epoch": 0.7462955804664347, + "grad_norm": 10.943201208609143, + "learning_rate": 9.178772560106715e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.953125, + "logps/chosen": -780.0, + "logps/rejected": -1096.0, + "loss": 0.2264, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.25, + "rewards/margins": 3.265625, + "rewards/rejected": -9.5, + "step": 14480 + }, + { + "epoch": 0.7468109779667569, + "grad_norm": 9.508243221240535, + "learning_rate": 9.143971184380156e-08, + "logits/chosen": -3.265625, + "logits/rejected": -3.03125, + "logps/chosen": -820.0, + "logps/rejected": -1136.0, + "loss": 0.2151, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.34375, + "rewards/margins": 3.34375, + "rewards/rejected": -9.6875, + "step": 14490 + }, + { + "epoch": 0.747326375467079, + "grad_norm": 9.70302338736274, + "learning_rate": 9.109221136829428e-08, + "logits/chosen": -3.3125, + "logits/rejected": -2.96875, + "logps/chosen": -796.0, + "logps/rejected": -1208.0, + "loss": 0.2134, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 4.0625, + "rewards/rejected": -10.375, + "step": 14500 + }, + { + "epoch": 0.7478417729674011, + "grad_norm": 9.834137607875082, + "learning_rate": 9.074522529945278e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.09375, + "logps/chosen": -808.0, + "logps/rejected": -1168.0, + "loss": 0.2419, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.34375, + "rewards/margins": 3.6875, + "rewards/rejected": -10.0, + "step": 14510 + }, + { + "epoch": 0.7483571704677232, + "grad_norm": 7.299076374231383, + "learning_rate": 9.03987547605192e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.90625, + "logps/chosen": -792.0, + "logps/rejected": -1184.0, + "loss": 0.2271, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.25, + "rewards/margins": 3.96875, + "rewards/rejected": -10.25, + "step": 14520 + }, + { + "epoch": 0.7488725679680454, + "grad_norm": 9.042709200881538, + "learning_rate": 9.005280087306704e-08, + "logits/chosen": -3.421875, + "logits/rejected": -3.0, + "logps/chosen": -808.0, + "logps/rejected": -1152.0, + "loss": 0.2168, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.390625, + "rewards/rejected": -9.75, + "step": 14530 + }, + { + "epoch": 0.7493879654683675, + "grad_norm": 6.283130533410891, + "learning_rate": 8.970736475699734e-08, + "logits/chosen": -3.421875, + "logits/rejected": -3.1875, + "logps/chosen": -768.0, + "logps/rejected": -1112.0, + "loss": 0.2122, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.90625, + "rewards/margins": 3.53125, + "rewards/rejected": -9.4375, + "step": 14540 + }, + { + "epoch": 0.7499033629686896, + "grad_norm": 7.465763551991171, + "learning_rate": 8.936244753053501e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.140625, + "logps/chosen": -776.0, + "logps/rejected": -1128.0, + "loss": 0.22, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.21875, + "rewards/margins": 3.53125, + "rewards/rejected": -9.75, + "step": 14550 + }, + { + "epoch": 0.7504187604690117, + "grad_norm": 9.362962651933076, + "learning_rate": 8.901805031022511e-08, + "logits/chosen": -3.296875, + "logits/rejected": -2.953125, + "logps/chosen": -784.0, + "logps/rejected": -1192.0, + "loss": 0.2579, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.125, + "rewards/margins": 4.09375, + "rewards/rejected": -10.25, + "step": 14560 + }, + { + "epoch": 0.7509341579693338, + "grad_norm": 9.878777446068936, + "learning_rate": 8.867417421092938e-08, + "logits/chosen": -3.40625, + "logits/rejected": -3.171875, + "logps/chosen": -804.0, + "logps/rejected": -1176.0, + "loss": 0.2274, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.796875, + "rewards/rejected": -10.0, + "step": 14570 + }, + { + "epoch": 0.751449555469656, + "grad_norm": 7.399348599038793, + "learning_rate": 8.833082034582275e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.140625, + "logps/chosen": -796.0, + "logps/rejected": -1096.0, + "loss": 0.2248, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.21875, + "rewards/margins": 2.921875, + "rewards/rejected": -9.125, + "step": 14580 + }, + { + "epoch": 0.7519649529699781, + "grad_norm": 10.593745922418528, + "learning_rate": 8.798798982638961e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.953125, + "logps/chosen": -784.0, + "logps/rejected": -1128.0, + "loss": 0.2327, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.53125, + "rewards/rejected": -9.5625, + "step": 14590 + }, + { + "epoch": 0.7524803504703003, + "grad_norm": 7.84098008730953, + "learning_rate": 8.764568376242016e-08, + "logits/chosen": -3.140625, + "logits/rejected": -2.96875, + "logps/chosen": -828.0, + "logps/rejected": -1136.0, + "loss": 0.2247, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.28125, + "rewards/margins": 3.359375, + "rewards/rejected": -9.625, + "step": 14600 + }, + { + "epoch": 0.7529957479706223, + "grad_norm": 9.47103329791981, + "learning_rate": 8.730390326200682e-08, + "logits/chosen": -3.203125, + "logits/rejected": -3.109375, + "logps/chosen": -812.0, + "logps/rejected": -1120.0, + "loss": 0.2286, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.34375, + "rewards/margins": 3.109375, + "rewards/rejected": -9.4375, + "step": 14610 + }, + { + "epoch": 0.7535111454709444, + "grad_norm": 9.56042637382461, + "learning_rate": 8.696264943154063e-08, + "logits/chosen": -3.1875, + "logits/rejected": -2.84375, + "logps/chosen": -792.0, + "logps/rejected": -1168.0, + "loss": 0.2614, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.25, + "rewards/margins": 3.609375, + "rewards/rejected": -9.875, + "step": 14620 + }, + { + "epoch": 0.7540265429712666, + "grad_norm": 8.167617864146289, + "learning_rate": 8.662192337570786e-08, + "logits/chosen": -3.34375, + "logits/rejected": -2.96875, + "logps/chosen": -820.0, + "logps/rejected": -1128.0, + "loss": 0.2323, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.4375, + "rewards/margins": 3.234375, + "rewards/rejected": -9.6875, + "step": 14630 + }, + { + "epoch": 0.7545419404715887, + "grad_norm": 7.17103810093702, + "learning_rate": 8.62817261974863e-08, + "logits/chosen": -3.28125, + "logits/rejected": -3.125, + "logps/chosen": -784.0, + "logps/rejected": -1120.0, + "loss": 0.2243, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.3125, + "rewards/rejected": -9.4375, + "step": 14640 + }, + { + "epoch": 0.7550573379719109, + "grad_norm": 6.434221696271703, + "learning_rate": 8.594205899814164e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.15625, + "logps/chosen": -804.0, + "logps/rejected": -1136.0, + "loss": 0.2319, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.5, + "rewards/rejected": -9.6875, + "step": 14650 + }, + { + "epoch": 0.7555727354722329, + "grad_norm": 8.907924121587257, + "learning_rate": 8.5602922877224e-08, + "logits/chosen": -3.171875, + "logits/rejected": -2.9375, + "logps/chosen": -744.0, + "logps/rejected": -1144.0, + "loss": 0.2374, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.6875, + "rewards/margins": 4.0, + "rewards/rejected": -9.6875, + "step": 14660 + }, + { + "epoch": 0.7560881329725551, + "grad_norm": 14.768133072640191, + "learning_rate": 8.526431893256422e-08, + "logits/chosen": -3.21875, + "logits/rejected": -3.046875, + "logps/chosen": -796.0, + "logps/rejected": -1168.0, + "loss": 0.245, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.6875, + "rewards/rejected": -10.0, + "step": 14670 + }, + { + "epoch": 0.7566035304728772, + "grad_norm": 8.359042453719084, + "learning_rate": 8.492624826027042e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.15625, + "logps/chosen": -792.0, + "logps/rejected": -1160.0, + "loss": 0.2147, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.703125, + "rewards/rejected": -9.75, + "step": 14680 + }, + { + "epoch": 0.7571189279731994, + "grad_norm": 8.921995293956693, + "learning_rate": 8.458871195472456e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.921875, + "logps/chosen": -748.0, + "logps/rejected": -1096.0, + "loss": 0.2391, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.9375, + "rewards/margins": 3.546875, + "rewards/rejected": -9.5, + "step": 14690 + }, + { + "epoch": 0.7576343254735215, + "grad_norm": 8.926113732772397, + "learning_rate": 8.425171110857871e-08, + "logits/chosen": -3.328125, + "logits/rejected": -2.921875, + "logps/chosen": -788.0, + "logps/rejected": -1144.0, + "loss": 0.2346, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.5, + "rewards/rejected": -9.8125, + "step": 14700 + }, + { + "epoch": 0.7581497229738435, + "grad_norm": 6.681070182413681, + "learning_rate": 8.39152468127517e-08, + "logits/chosen": -3.28125, + "logits/rejected": -3.015625, + "logps/chosen": -780.0, + "logps/rejected": -1144.0, + "loss": 0.2241, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.0, + "rewards/margins": 3.71875, + "rewards/rejected": -9.6875, + "step": 14710 + }, + { + "epoch": 0.7586651204741657, + "grad_norm": 7.1138478282808, + "learning_rate": 8.357932015642525e-08, + "logits/chosen": -3.421875, + "logits/rejected": -3.078125, + "logps/chosen": -800.0, + "logps/rejected": -1120.0, + "loss": 0.2399, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.1875, + "rewards/margins": 3.34375, + "rewards/rejected": -9.5625, + "step": 14720 + }, + { + "epoch": 0.7591805179744878, + "grad_norm": 10.243970914227123, + "learning_rate": 8.32439322270407e-08, + "logits/chosen": -3.203125, + "logits/rejected": -3.078125, + "logps/chosen": -800.0, + "logps/rejected": -1152.0, + "loss": 0.2492, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 3.59375, + "rewards/rejected": -9.8125, + "step": 14730 + }, + { + "epoch": 0.75969591547481, + "grad_norm": 9.70472608394283, + "learning_rate": 8.290908411029565e-08, + "logits/chosen": -3.0625, + "logits/rejected": -2.921875, + "logps/chosen": -800.0, + "logps/rejected": -1088.0, + "loss": 0.2351, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.125, + "rewards/rejected": -9.25, + "step": 14740 + }, + { + "epoch": 0.7602113129751321, + "grad_norm": 6.827355345377845, + "learning_rate": 8.257477689014009e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.0625, + "logps/chosen": -804.0, + "logps/rejected": -1136.0, + "loss": 0.2256, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.15625, + "rewards/margins": 3.59375, + "rewards/rejected": -9.75, + "step": 14750 + }, + { + "epoch": 0.7607267104754541, + "grad_norm": 19.352118042342127, + "learning_rate": 8.224101164877323e-08, + "logits/chosen": -3.296875, + "logits/rejected": -3.015625, + "logps/chosen": -776.0, + "logps/rejected": -1144.0, + "loss": 0.2236, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.625, + "rewards/rejected": -9.6875, + "step": 14760 + }, + { + "epoch": 0.7612421079757763, + "grad_norm": 9.687014636427913, + "learning_rate": 8.190778946663957e-08, + "logits/chosen": -3.1875, + "logits/rejected": -3.078125, + "logps/chosen": -808.0, + "logps/rejected": -1168.0, + "loss": 0.2208, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.125, + "rewards/margins": 3.78125, + "rewards/rejected": -9.875, + "step": 14770 + }, + { + "epoch": 0.7617575054760984, + "grad_norm": 7.75264514527139, + "learning_rate": 8.157511142242567e-08, + "logits/chosen": -3.34375, + "logits/rejected": -2.875, + "logps/chosen": -784.0, + "logps/rejected": -1128.0, + "loss": 0.224, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.03125, + "rewards/margins": 3.421875, + "rewards/rejected": -9.4375, + "step": 14780 + }, + { + "epoch": 0.7622729029764206, + "grad_norm": 10.08604128979808, + "learning_rate": 8.12429785930569e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.90625, + "logps/chosen": -788.0, + "logps/rejected": -1120.0, + "loss": 0.212, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.546875, + "rewards/rejected": -9.625, + "step": 14790 + }, + { + "epoch": 0.7627883004767427, + "grad_norm": 7.826556508264055, + "learning_rate": 8.091139205369344e-08, + "logits/chosen": -3.25, + "logits/rejected": -3.0, + "logps/chosen": -812.0, + "logps/rejected": -1168.0, + "loss": 0.2123, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.375, + "rewards/margins": 3.578125, + "rewards/rejected": -9.9375, + "step": 14800 + }, + { + "epoch": 0.7633036979770648, + "grad_norm": 9.962452981881349, + "learning_rate": 8.058035287772721e-08, + "logits/chosen": -3.265625, + "logits/rejected": -3.03125, + "logps/chosen": -764.0, + "logps/rejected": -1152.0, + "loss": 0.2279, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.78125, + "rewards/margins": 4.21875, + "rewards/rejected": -10.0, + "step": 14810 + }, + { + "epoch": 0.7638190954773869, + "grad_norm": 11.216863867252128, + "learning_rate": 8.024986213677823e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.796875, + "logps/chosen": -776.0, + "logps/rejected": -1120.0, + "loss": 0.2311, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.25, + "rewards/margins": 3.46875, + "rewards/rejected": -9.6875, + "step": 14820 + }, + { + "epoch": 0.7643344929777091, + "grad_norm": 8.646924125654646, + "learning_rate": 7.991992090069102e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.0, + "logps/chosen": -800.0, + "logps/rejected": -1160.0, + "loss": 0.2278, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.734375, + "rewards/rejected": -9.875, + "step": 14830 + }, + { + "epoch": 0.7648498904780312, + "grad_norm": 8.75431829175751, + "learning_rate": 7.95905302375313e-08, + "logits/chosen": -3.25, + "logits/rejected": -3.015625, + "logps/chosen": -824.0, + "logps/rejected": -1168.0, + "loss": 0.2167, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.34375, + "rewards/margins": 3.5, + "rewards/rejected": -9.875, + "step": 14840 + }, + { + "epoch": 0.7653652879783533, + "grad_norm": 10.139663456355926, + "learning_rate": 7.926169121358259e-08, + "logits/chosen": -3.125, + "logits/rejected": -2.9375, + "logps/chosen": -796.0, + "logps/rejected": -1104.0, + "loss": 0.215, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.328125, + "rewards/rejected": -9.5, + "step": 14850 + }, + { + "epoch": 0.7658806854786754, + "grad_norm": 8.548888435835746, + "learning_rate": 7.893340489334275e-08, + "logits/chosen": -3.296875, + "logits/rejected": -3.0625, + "logps/chosen": -780.0, + "logps/rejected": -1120.0, + "loss": 0.2362, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.515625, + "rewards/rejected": -9.6875, + "step": 14860 + }, + { + "epoch": 0.7663960829789975, + "grad_norm": 7.505805868525267, + "learning_rate": 7.860567233952034e-08, + "logits/chosen": -3.484375, + "logits/rejected": -3.046875, + "logps/chosen": -804.0, + "logps/rejected": -1152.0, + "loss": 0.2158, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.34375, + "rewards/margins": 3.65625, + "rewards/rejected": -10.0, + "step": 14870 + }, + { + "epoch": 0.7669114804793197, + "grad_norm": 6.723616645430264, + "learning_rate": 7.827849461303135e-08, + "logits/chosen": -3.28125, + "logits/rejected": -3.046875, + "logps/chosen": -808.0, + "logps/rejected": -1136.0, + "loss": 0.2342, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.375, + "rewards/margins": 3.40625, + "rewards/rejected": -9.75, + "step": 14880 + }, + { + "epoch": 0.7674268779796418, + "grad_norm": 10.226949795154587, + "learning_rate": 7.795187277299553e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.921875, + "logps/chosen": -840.0, + "logps/rejected": -1128.0, + "loss": 0.2436, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.5625, + "rewards/margins": 3.1875, + "rewards/rejected": -9.75, + "step": 14890 + }, + { + "epoch": 0.767942275479964, + "grad_norm": 7.849671993309792, + "learning_rate": 7.762580787673345e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.0, + "logps/chosen": -808.0, + "logps/rejected": -1120.0, + "loss": 0.2305, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.265625, + "rewards/rejected": -9.4375, + "step": 14900 + }, + { + "epoch": 0.768457672980286, + "grad_norm": 12.153924895058918, + "learning_rate": 7.730030097976259e-08, + "logits/chosen": -3.34375, + "logits/rejected": -2.890625, + "logps/chosen": -792.0, + "logps/rejected": -1112.0, + "loss": 0.2544, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.09375, + "rewards/margins": 3.53125, + "rewards/rejected": -9.625, + "step": 14910 + }, + { + "epoch": 0.7689730704806081, + "grad_norm": 12.280742410963889, + "learning_rate": 7.697535313579428e-08, + "logits/chosen": -3.171875, + "logits/rejected": -3.03125, + "logps/chosen": -796.0, + "logps/rejected": -1176.0, + "loss": 0.2318, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.15625, + "rewards/margins": 3.8125, + "rewards/rejected": -9.9375, + "step": 14920 + }, + { + "epoch": 0.7694884679809303, + "grad_norm": 7.23115755163526, + "learning_rate": 7.665096539672989e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.140625, + "logps/chosen": -804.0, + "logps/rejected": -1136.0, + "loss": 0.2215, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.46875, + "rewards/rejected": -9.6875, + "step": 14930 + }, + { + "epoch": 0.7700038654812524, + "grad_norm": 14.327961475227204, + "learning_rate": 7.632713881265768e-08, + "logits/chosen": -3.296875, + "logits/rejected": -2.921875, + "logps/chosen": -820.0, + "logps/rejected": -1120.0, + "loss": 0.2467, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.40625, + "rewards/margins": 3.25, + "rewards/rejected": -9.625, + "step": 14940 + }, + { + "epoch": 0.7705192629815746, + "grad_norm": 12.11606671098618, + "learning_rate": 7.600387443184952e-08, + "logits/chosen": -3.234375, + "logits/rejected": -2.75, + "logps/chosen": -760.0, + "logps/rejected": -1152.0, + "loss": 0.2112, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.84375, + "rewards/margins": 4.1875, + "rewards/rejected": -10.0, + "step": 14950 + }, + { + "epoch": 0.7710346604818966, + "grad_norm": 10.727141642349945, + "learning_rate": 7.568117330075729e-08, + "logits/chosen": -3.34375, + "logits/rejected": -2.984375, + "logps/chosen": -796.0, + "logps/rejected": -1144.0, + "loss": 0.2342, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.421875, + "rewards/rejected": -9.625, + "step": 14960 + }, + { + "epoch": 0.7715500579822188, + "grad_norm": 8.319781189976348, + "learning_rate": 7.535903646400959e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.046875, + "logps/chosen": -816.0, + "logps/rejected": -1176.0, + "loss": 0.2234, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.4375, + "rewards/margins": 3.5, + "rewards/rejected": -9.9375, + "step": 14970 + }, + { + "epoch": 0.7720654554825409, + "grad_norm": 10.537355050242434, + "learning_rate": 7.503746496440816e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.125, + "logps/chosen": -800.0, + "logps/rejected": -1136.0, + "loss": 0.2307, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.28125, + "rewards/margins": 3.46875, + "rewards/rejected": -9.75, + "step": 14980 + }, + { + "epoch": 0.772580852982863, + "grad_norm": 9.228767250091254, + "learning_rate": 7.471645984292493e-08, + "logits/chosen": -3.1875, + "logits/rejected": -2.890625, + "logps/chosen": -776.0, + "logps/rejected": -1136.0, + "loss": 0.216, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.859375, + "rewards/rejected": -9.9375, + "step": 14990 + }, + { + "epoch": 0.7730962504831852, + "grad_norm": 6.743877459114503, + "learning_rate": 7.439602213869803e-08, + "logits/chosen": -3.171875, + "logits/rejected": -2.9375, + "logps/chosen": -764.0, + "logps/rejected": -1136.0, + "loss": 0.2156, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.96875, + "rewards/margins": 3.78125, + "rewards/rejected": -9.75, + "step": 15000 + }, + { + "epoch": 0.7730962504831852, + "eval_logits/chosen": -2.890625, + "eval_logits/rejected": -2.53125, + "eval_logps/chosen": -768.0, + "eval_logps/rejected": -1176.0, + "eval_loss": 0.2106233835220337, + "eval_rewards/accuracies": 0.9127435088157654, + "eval_rewards/chosen": -5.96875, + "eval_rewards/margins": 4.21875, + "eval_rewards/rejected": -10.1875, + "eval_runtime": 3574.8514, + "eval_samples_per_second": 27.554, + "eval_steps_per_second": 0.431, + "step": 15000 + }, + { + "epoch": 0.7736116479835072, + "grad_norm": 9.16648162954116, + "learning_rate": 7.407615288902907e-08, + "logits/chosen": -3.1875, + "logits/rejected": -2.84375, + "logps/chosen": -796.0, + "logps/rejected": -1160.0, + "loss": 0.2073, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.125, + "rewards/margins": 3.796875, + "rewards/rejected": -9.9375, + "step": 15010 + }, + { + "epoch": 0.7741270454838294, + "grad_norm": 10.647496009408849, + "learning_rate": 7.375685312937952e-08, + "logits/chosen": -3.15625, + "logits/rejected": -2.953125, + "logps/chosen": -784.0, + "logps/rejected": -1152.0, + "loss": 0.243, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.25, + "rewards/margins": 3.65625, + "rewards/rejected": -9.9375, + "step": 15020 + }, + { + "epoch": 0.7746424429841515, + "grad_norm": 9.187332188690347, + "learning_rate": 7.343812389336703e-08, + "logits/chosen": -3.421875, + "logits/rejected": -2.953125, + "logps/chosen": -788.0, + "logps/rejected": -1184.0, + "loss": 0.2337, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.3125, + "rewards/margins": 3.90625, + "rewards/rejected": -10.25, + "step": 15030 + }, + { + "epoch": 0.7751578404844737, + "grad_norm": 9.068489850743406, + "learning_rate": 7.311996621276276e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.0625, + "logps/chosen": -800.0, + "logps/rejected": -1152.0, + "loss": 0.2344, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.625, + "rewards/rejected": -9.8125, + "step": 15040 + }, + { + "epoch": 0.7756732379847958, + "grad_norm": 8.573575686532543, + "learning_rate": 7.280238111748727e-08, + "logits/chosen": -3.203125, + "logits/rejected": -2.96875, + "logps/chosen": -804.0, + "logps/rejected": -1152.0, + "loss": 0.2556, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.28125, + "rewards/margins": 3.59375, + "rewards/rejected": -9.875, + "step": 15050 + }, + { + "epoch": 0.7761886354851179, + "grad_norm": 10.070291915559052, + "learning_rate": 7.248536963560792e-08, + "logits/chosen": -3.40625, + "logits/rejected": -2.953125, + "logps/chosen": -832.0, + "logps/rejected": -1136.0, + "loss": 0.2312, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.40625, + "rewards/margins": 3.25, + "rewards/rejected": -9.625, + "step": 15060 + }, + { + "epoch": 0.77670403298544, + "grad_norm": 8.188595083467513, + "learning_rate": 7.21689327933351e-08, + "logits/chosen": -3.203125, + "logits/rejected": -2.90625, + "logps/chosen": -760.0, + "logps/rejected": -1128.0, + "loss": 0.2144, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.0625, + "rewards/margins": 3.5625, + "rewards/rejected": -9.625, + "step": 15070 + }, + { + "epoch": 0.7772194304857621, + "grad_norm": 9.772070171675606, + "learning_rate": 7.185307161501885e-08, + "logits/chosen": -3.1875, + "logits/rejected": -2.90625, + "logps/chosen": -732.0, + "logps/rejected": -1104.0, + "loss": 0.206, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.6875, + "rewards/margins": 3.765625, + "rewards/rejected": -9.4375, + "step": 15080 + }, + { + "epoch": 0.7777348279860843, + "grad_norm": 8.294876122017511, + "learning_rate": 7.153778712314604e-08, + "logits/chosen": -3.234375, + "logits/rejected": -3.0, + "logps/chosen": -768.0, + "logps/rejected": -1104.0, + "loss": 0.2256, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.96875, + "rewards/margins": 3.484375, + "rewards/rejected": -9.4375, + "step": 15090 + }, + { + "epoch": 0.7782502254864064, + "grad_norm": 7.458999655221639, + "learning_rate": 7.12230803383363e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.015625, + "logps/chosen": -760.0, + "logps/rejected": -1136.0, + "loss": 0.2192, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.703125, + "rewards/rejected": -9.75, + "step": 15100 + }, + { + "epoch": 0.7787656229867285, + "grad_norm": 10.262447183297176, + "learning_rate": 7.090895227933946e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.96875, + "logps/chosen": -776.0, + "logps/rejected": -1120.0, + "loss": 0.252, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.515625, + "rewards/rejected": -9.625, + "step": 15110 + }, + { + "epoch": 0.7792810204870506, + "grad_norm": 7.470276929427637, + "learning_rate": 7.059540396303196e-08, + "logits/chosen": -3.125, + "logits/rejected": -2.84375, + "logps/chosen": -784.0, + "logps/rejected": -1120.0, + "loss": 0.229, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.09375, + "rewards/margins": 3.578125, + "rewards/rejected": -9.625, + "step": 15120 + }, + { + "epoch": 0.7797964179873728, + "grad_norm": 8.926745827250253, + "learning_rate": 7.028243640441322e-08, + "logits/chosen": -3.265625, + "logits/rejected": -3.140625, + "logps/chosen": -796.0, + "logps/rejected": -1120.0, + "loss": 0.2494, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.09375, + "rewards/margins": 3.421875, + "rewards/rejected": -9.5, + "step": 15130 + }, + { + "epoch": 0.7803118154876949, + "grad_norm": 11.407496364733372, + "learning_rate": 6.997005061660296e-08, + "logits/chosen": -3.296875, + "logits/rejected": -2.921875, + "logps/chosen": -752.0, + "logps/rejected": -1072.0, + "loss": 0.2218, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.875, + "rewards/margins": 3.390625, + "rewards/rejected": -9.25, + "step": 15140 + }, + { + "epoch": 0.780827212988017, + "grad_norm": 7.634894700897114, + "learning_rate": 6.965824761083758e-08, + "logits/chosen": -3.296875, + "logits/rejected": -2.875, + "logps/chosen": -760.0, + "logps/rejected": -1168.0, + "loss": 0.2211, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.84375, + "rewards/margins": 4.1875, + "rewards/rejected": -10.0, + "step": 15150 + }, + { + "epoch": 0.7813426104883391, + "grad_norm": 8.474552599188147, + "learning_rate": 6.934702839646672e-08, + "logits/chosen": -3.1875, + "logits/rejected": -2.96875, + "logps/chosen": -804.0, + "logps/rejected": -1128.0, + "loss": 0.2131, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.34375, + "rewards/margins": 3.46875, + "rewards/rejected": -9.8125, + "step": 15160 + }, + { + "epoch": 0.7818580079886612, + "grad_norm": 8.397393189520276, + "learning_rate": 6.903639398095057e-08, + "logits/chosen": -3.203125, + "logits/rejected": -2.984375, + "logps/chosen": -816.0, + "logps/rejected": -1120.0, + "loss": 0.2106, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.359375, + "rewards/rejected": -9.6875, + "step": 15170 + }, + { + "epoch": 0.7823734054889834, + "grad_norm": 9.649773900723877, + "learning_rate": 6.872634536985583e-08, + "logits/chosen": -3.3125, + "logits/rejected": -2.90625, + "logps/chosen": -820.0, + "logps/rejected": -1128.0, + "loss": 0.2302, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.171875, + "rewards/rejected": -9.5, + "step": 15180 + }, + { + "epoch": 0.7828888029893055, + "grad_norm": 9.354763553968837, + "learning_rate": 6.841688356685316e-08, + "logits/chosen": -2.984375, + "logits/rejected": -2.734375, + "logps/chosen": -784.0, + "logps/rejected": -1128.0, + "loss": 0.2258, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.21875, + "rewards/margins": 3.484375, + "rewards/rejected": -9.6875, + "step": 15190 + }, + { + "epoch": 0.7834042004896277, + "grad_norm": 7.5332675442490284, + "learning_rate": 6.810800957371368e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.78125, + "logps/chosen": -756.0, + "logps/rejected": -1120.0, + "loss": 0.2157, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.90625, + "rewards/margins": 3.671875, + "rewards/rejected": -9.5625, + "step": 15200 + }, + { + "epoch": 0.7839195979899497, + "grad_norm": 8.873453770561197, + "learning_rate": 6.779972439030537e-08, + "logits/chosen": -3.03125, + "logits/rejected": -2.9375, + "logps/chosen": -752.0, + "logps/rejected": -1104.0, + "loss": 0.2176, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.875, + "rewards/margins": 3.75, + "rewards/rejected": -9.625, + "step": 15210 + }, + { + "epoch": 0.7844349954902718, + "grad_norm": 11.247571520688652, + "learning_rate": 6.749202901459053e-08, + "logits/chosen": -3.34375, + "logits/rejected": -3.15625, + "logps/chosen": -780.0, + "logps/rejected": -1088.0, + "loss": 0.2312, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -5.875, + "rewards/margins": 3.140625, + "rewards/rejected": -9.0, + "step": 15220 + }, + { + "epoch": 0.784950392990594, + "grad_norm": 7.386955813459967, + "learning_rate": 6.718492444262181e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.84375, + "logps/chosen": -724.0, + "logps/rejected": -1104.0, + "loss": 0.2023, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.59375, + "rewards/margins": 3.828125, + "rewards/rejected": -9.4375, + "step": 15230 + }, + { + "epoch": 0.7854657904909161, + "grad_norm": 10.248656197749167, + "learning_rate": 6.687841166853961e-08, + "logits/chosen": -3.109375, + "logits/rejected": -2.578125, + "logps/chosen": -780.0, + "logps/rejected": -1160.0, + "loss": 0.2197, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.953125, + "rewards/rejected": -10.125, + "step": 15240 + }, + { + "epoch": 0.7859811879912383, + "grad_norm": 10.31607854682159, + "learning_rate": 6.65724916845686e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.953125, + "logps/chosen": -808.0, + "logps/rejected": -1144.0, + "loss": 0.2306, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.4375, + "rewards/rejected": -9.8125, + "step": 15250 + }, + { + "epoch": 0.7864965854915603, + "grad_norm": 8.357480624095361, + "learning_rate": 6.626716548101427e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.9375, + "logps/chosen": -780.0, + "logps/rejected": -1112.0, + "loss": 0.2272, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.296875, + "rewards/rejected": -9.375, + "step": 15260 + }, + { + "epoch": 0.7870119829918825, + "grad_norm": 9.627861742248767, + "learning_rate": 6.596243404626023e-08, + "logits/chosen": -3.125, + "logits/rejected": -2.96875, + "logps/chosen": -776.0, + "logps/rejected": -1112.0, + "loss": 0.2393, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.84375, + "rewards/margins": 3.625, + "rewards/rejected": -9.4375, + "step": 15270 + }, + { + "epoch": 0.7875273804922046, + "grad_norm": 6.6176287300218375, + "learning_rate": 6.565829836676449e-08, + "logits/chosen": -3.171875, + "logits/rejected": -2.890625, + "logps/chosen": -768.0, + "logps/rejected": -1072.0, + "loss": 0.2157, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -5.96875, + "rewards/margins": 3.1875, + "rewards/rejected": -9.125, + "step": 15280 + }, + { + "epoch": 0.7880427779925268, + "grad_norm": 9.08670306517256, + "learning_rate": 6.53547594270567e-08, + "logits/chosen": -3.359375, + "logits/rejected": -2.890625, + "logps/chosen": -752.0, + "logps/rejected": -1136.0, + "loss": 0.2316, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.875, + "rewards/margins": 4.03125, + "rewards/rejected": -9.875, + "step": 15290 + }, + { + "epoch": 0.7885581754928489, + "grad_norm": 8.774585995716865, + "learning_rate": 6.505181820973474e-08, + "logits/chosen": -3.421875, + "logits/rejected": -3.234375, + "logps/chosen": -788.0, + "logps/rejected": -1064.0, + "loss": 0.2361, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.8125, + "rewards/margins": 3.15625, + "rewards/rejected": -9.0, + "step": 15300 + }, + { + "epoch": 0.7890735729931709, + "grad_norm": 10.442082383149785, + "learning_rate": 6.474947569546166e-08, + "logits/chosen": -3.40625, + "logits/rejected": -2.953125, + "logps/chosen": -768.0, + "logps/rejected": -1136.0, + "loss": 0.2237, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.875, + "rewards/margins": 3.78125, + "rewards/rejected": -9.625, + "step": 15310 + }, + { + "epoch": 0.7895889704934931, + "grad_norm": 11.287172236609633, + "learning_rate": 6.444773286296221e-08, + "logits/chosen": -3.40625, + "logits/rejected": -3.03125, + "logps/chosen": -768.0, + "logps/rejected": -1104.0, + "loss": 0.2512, + "rewards/accuracies": 0.875, + "rewards/chosen": -5.90625, + "rewards/margins": 3.5, + "rewards/rejected": -9.4375, + "step": 15320 + }, + { + "epoch": 0.7901043679938152, + "grad_norm": 10.68555438267536, + "learning_rate": 6.414659068901998e-08, + "logits/chosen": -3.34375, + "logits/rejected": -3.03125, + "logps/chosen": -756.0, + "logps/rejected": -1072.0, + "loss": 0.2435, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -5.90625, + "rewards/margins": 3.109375, + "rewards/rejected": -9.0, + "step": 15330 + }, + { + "epoch": 0.7906197654941374, + "grad_norm": 10.38658374928412, + "learning_rate": 6.384605014847419e-08, + "logits/chosen": -3.203125, + "logits/rejected": -2.875, + "logps/chosen": -808.0, + "logps/rejected": -1128.0, + "loss": 0.2375, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.1875, + "rewards/margins": 3.21875, + "rewards/rejected": -9.4375, + "step": 15340 + }, + { + "epoch": 0.7911351629944595, + "grad_norm": 9.462552351823811, + "learning_rate": 6.354611221421651e-08, + "logits/chosen": -3.359375, + "logits/rejected": -2.90625, + "logps/chosen": -784.0, + "logps/rejected": -1112.0, + "loss": 0.2225, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.03125, + "rewards/margins": 3.3125, + "rewards/rejected": -9.375, + "step": 15350 + }, + { + "epoch": 0.7916505604947816, + "grad_norm": 8.497438778990562, + "learning_rate": 6.32467778571879e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.9375, + "logps/chosen": -748.0, + "logps/rejected": -1104.0, + "loss": 0.2222, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.96875, + "rewards/margins": 3.546875, + "rewards/rejected": -9.5, + "step": 15360 + }, + { + "epoch": 0.7921659579951037, + "grad_norm": 10.761456622604232, + "learning_rate": 6.294804804637538e-08, + "logits/chosen": -3.140625, + "logits/rejected": -2.9375, + "logps/chosen": -772.0, + "logps/rejected": -1128.0, + "loss": 0.2477, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.8125, + "rewards/margins": 3.828125, + "rewards/rejected": -9.6875, + "step": 15370 + }, + { + "epoch": 0.7926813554954258, + "grad_norm": 9.461364640088263, + "learning_rate": 6.264992374880884e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.03125, + "logps/chosen": -764.0, + "logps/rejected": -1128.0, + "loss": 0.2312, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.09375, + "rewards/margins": 3.671875, + "rewards/rejected": -9.75, + "step": 15380 + }, + { + "epoch": 0.793196752995748, + "grad_norm": 10.279978599966467, + "learning_rate": 6.235240592955835e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.09375, + "logps/chosen": -776.0, + "logps/rejected": -1112.0, + "loss": 0.2299, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.09375, + "rewards/margins": 3.328125, + "rewards/rejected": -9.375, + "step": 15390 + }, + { + "epoch": 0.7937121504960701, + "grad_norm": 9.019728526117046, + "learning_rate": 6.20554955517305e-08, + "logits/chosen": -3.515625, + "logits/rejected": -3.125, + "logps/chosen": -768.0, + "logps/rejected": -1120.0, + "loss": 0.2293, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.96875, + "rewards/margins": 3.53125, + "rewards/rejected": -9.5, + "step": 15400 + }, + { + "epoch": 0.7942275479963922, + "grad_norm": 7.613432780467366, + "learning_rate": 6.175919357646569e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.859375, + "logps/chosen": -776.0, + "logps/rejected": -1128.0, + "loss": 0.2318, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.0625, + "rewards/margins": 3.5625, + "rewards/rejected": -9.625, + "step": 15410 + }, + { + "epoch": 0.7947429454967143, + "grad_norm": 11.213769944587783, + "learning_rate": 6.146350096293457e-08, + "logits/chosen": -3.265625, + "logits/rejected": -3.03125, + "logps/chosen": -768.0, + "logps/rejected": -1128.0, + "loss": 0.2357, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.90625, + "rewards/margins": 3.734375, + "rewards/rejected": -9.625, + "step": 15420 + }, + { + "epoch": 0.7952583429970365, + "grad_norm": 9.636785198392282, + "learning_rate": 6.116841866833547e-08, + "logits/chosen": -3.328125, + "logits/rejected": -2.921875, + "logps/chosen": -784.0, + "logps/rejected": -1120.0, + "loss": 0.2354, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.453125, + "rewards/rejected": -9.5625, + "step": 15430 + }, + { + "epoch": 0.7957737404973586, + "grad_norm": 10.811618210741635, + "learning_rate": 6.087394764789083e-08, + "logits/chosen": -3.296875, + "logits/rejected": -2.953125, + "logps/chosen": -792.0, + "logps/rejected": -1072.0, + "loss": 0.2505, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.15625, + "rewards/margins": 2.875, + "rewards/rejected": -9.0625, + "step": 15440 + }, + { + "epoch": 0.7962891379976808, + "grad_norm": 7.109127532541856, + "learning_rate": 6.058008885484442e-08, + "logits/chosen": -3.40625, + "logits/rejected": -3.09375, + "logps/chosen": -756.0, + "logps/rejected": -1104.0, + "loss": 0.2238, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.8125, + "rewards/margins": 3.734375, + "rewards/rejected": -9.5625, + "step": 15450 + }, + { + "epoch": 0.7968045354980028, + "grad_norm": 9.76149663288524, + "learning_rate": 6.028684324045812e-08, + "logits/chosen": -3.25, + "logits/rejected": -3.0, + "logps/chosen": -776.0, + "logps/rejected": -1144.0, + "loss": 0.2534, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.84375, + "rewards/margins": 3.6875, + "rewards/rejected": -9.5, + "step": 15460 + }, + { + "epoch": 0.7973199329983249, + "grad_norm": 8.546833667092937, + "learning_rate": 5.999421175400898e-08, + "logits/chosen": -3.46875, + "logits/rejected": -3.078125, + "logps/chosen": -800.0, + "logps/rejected": -1112.0, + "loss": 0.2458, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.234375, + "rewards/rejected": -9.4375, + "step": 15470 + }, + { + "epoch": 0.7978353304986471, + "grad_norm": 7.515417483372414, + "learning_rate": 5.970219534278581e-08, + "logits/chosen": -2.984375, + "logits/rejected": -2.78125, + "logps/chosen": -776.0, + "logps/rejected": -1096.0, + "loss": 0.2367, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.1875, + "rewards/rejected": -9.3125, + "step": 15480 + }, + { + "epoch": 0.7983507279989692, + "grad_norm": 6.671415168292261, + "learning_rate": 5.9410794952086396e-08, + "logits/chosen": -3.234375, + "logits/rejected": -3.03125, + "logps/chosen": -780.0, + "logps/rejected": -1088.0, + "loss": 0.2368, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.203125, + "rewards/rejected": -9.375, + "step": 15490 + }, + { + "epoch": 0.7988661254992914, + "grad_norm": 12.610533418152405, + "learning_rate": 5.9120011525214455e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.140625, + "logps/chosen": -796.0, + "logps/rejected": -1104.0, + "loss": 0.2415, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.21875, + "rewards/margins": 3.1875, + "rewards/rejected": -9.4375, + "step": 15500 + }, + { + "epoch": 0.7993815229996134, + "grad_norm": 6.3602451867958685, + "learning_rate": 5.882984600347654e-08, + "logits/chosen": -3.453125, + "logits/rejected": -3.109375, + "logps/chosen": -768.0, + "logps/rejected": -1088.0, + "loss": 0.2332, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.0625, + "rewards/margins": 3.171875, + "rewards/rejected": -9.25, + "step": 15510 + }, + { + "epoch": 0.7998969204999355, + "grad_norm": 8.074768085008438, + "learning_rate": 5.8540299326178934e-08, + "logits/chosen": -3.296875, + "logits/rejected": -3.03125, + "logps/chosen": -832.0, + "logps/rejected": -1144.0, + "loss": 0.2389, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.4375, + "rewards/margins": 3.171875, + "rewards/rejected": -9.625, + "step": 15520 + }, + { + "epoch": 0.8004123180002577, + "grad_norm": 10.974151848599552, + "learning_rate": 5.825137243062453e-08, + "logits/chosen": -3.15625, + "logits/rejected": -2.890625, + "logps/chosen": -796.0, + "logps/rejected": -1120.0, + "loss": 0.2261, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 3.390625, + "rewards/rejected": -9.625, + "step": 15530 + }, + { + "epoch": 0.8009277155005798, + "grad_norm": 8.148667315400091, + "learning_rate": 5.7963066252109896e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.828125, + "logps/chosen": -796.0, + "logps/rejected": -1136.0, + "loss": 0.2104, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.5, + "rewards/rejected": -9.625, + "step": 15540 + }, + { + "epoch": 0.801443113000902, + "grad_norm": 9.303805057436868, + "learning_rate": 5.767538172392242e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.8125, + "logps/chosen": -768.0, + "logps/rejected": -1120.0, + "loss": 0.2535, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.0, + "rewards/margins": 3.765625, + "rewards/rejected": -9.75, + "step": 15550 + }, + { + "epoch": 0.801958510501224, + "grad_norm": 9.202210860267606, + "learning_rate": 5.738831977733699e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.21875, + "logps/chosen": -784.0, + "logps/rejected": -1072.0, + "loss": 0.2345, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.171875, + "rewards/rejected": -9.1875, + "step": 15560 + }, + { + "epoch": 0.8024739080015462, + "grad_norm": 9.940208708221903, + "learning_rate": 5.7101881341613175e-08, + "logits/chosen": -3.171875, + "logits/rejected": -3.015625, + "logps/chosen": -792.0, + "logps/rejected": -1088.0, + "loss": 0.2237, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.25, + "rewards/rejected": -9.375, + "step": 15570 + }, + { + "epoch": 0.8029893055018683, + "grad_norm": 12.32725320071414, + "learning_rate": 5.681606734399227e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.875, + "logps/chosen": -784.0, + "logps/rejected": -1136.0, + "loss": 0.2382, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.65625, + "rewards/rejected": -9.8125, + "step": 15580 + }, + { + "epoch": 0.8035047030021905, + "grad_norm": 7.191590094212042, + "learning_rate": 5.653087870969373e-08, + "logits/chosen": -3.265625, + "logits/rejected": -3.09375, + "logps/chosen": -804.0, + "logps/rejected": -1152.0, + "loss": 0.2213, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.21875, + "rewards/margins": 3.625, + "rewards/rejected": -9.8125, + "step": 15590 + }, + { + "epoch": 0.8040201005025126, + "grad_norm": 7.969178772393179, + "learning_rate": 5.6246316361913046e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.921875, + "logps/chosen": -796.0, + "logps/rejected": -1136.0, + "loss": 0.2181, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.25, + "rewards/margins": 3.578125, + "rewards/rejected": -9.8125, + "step": 15600 + }, + { + "epoch": 0.8045354980028346, + "grad_norm": 11.08174020245813, + "learning_rate": 5.596238122181826e-08, + "logits/chosen": -3.296875, + "logits/rejected": -2.984375, + "logps/chosen": -764.0, + "logps/rejected": -1136.0, + "loss": 0.2543, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.03125, + "rewards/margins": 3.6875, + "rewards/rejected": -9.75, + "step": 15610 + }, + { + "epoch": 0.8050508955031568, + "grad_norm": 10.434528288391459, + "learning_rate": 5.567907420854698e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.0625, + "logps/chosen": -800.0, + "logps/rejected": -1128.0, + "loss": 0.2259, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.4375, + "rewards/rejected": -9.5625, + "step": 15620 + }, + { + "epoch": 0.8055662930034789, + "grad_norm": 8.119247202908316, + "learning_rate": 5.5396396239203556e-08, + "logits/chosen": -3.28125, + "logits/rejected": -3.21875, + "logps/chosen": -804.0, + "logps/rejected": -1096.0, + "loss": 0.2473, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.28125, + "rewards/rejected": -9.3125, + "step": 15630 + }, + { + "epoch": 0.8060816905038011, + "grad_norm": 7.681501860487459, + "learning_rate": 5.5114348228855966e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.953125, + "logps/chosen": -816.0, + "logps/rejected": -1144.0, + "loss": 0.213, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.34375, + "rewards/margins": 3.5, + "rewards/rejected": -9.875, + "step": 15640 + }, + { + "epoch": 0.8065970880041232, + "grad_norm": 6.388441584427517, + "learning_rate": 5.4832931090532833e-08, + "logits/chosen": -3.203125, + "logits/rejected": -2.984375, + "logps/chosen": -792.0, + "logps/rejected": -1120.0, + "loss": 0.2474, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.21875, + "rewards/margins": 3.46875, + "rewards/rejected": -9.6875, + "step": 15650 + }, + { + "epoch": 0.8071124855044453, + "grad_norm": 6.3387004483225855, + "learning_rate": 5.4552145735220715e-08, + "logits/chosen": -3.296875, + "logits/rejected": -2.9375, + "logps/chosen": -776.0, + "logps/rejected": -1144.0, + "loss": 0.2295, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.703125, + "rewards/rejected": -9.875, + "step": 15660 + }, + { + "epoch": 0.8076278830047674, + "grad_norm": 7.114596816053049, + "learning_rate": 5.4271993071861004e-08, + "logits/chosen": -3.203125, + "logits/rejected": -2.953125, + "logps/chosen": -792.0, + "logps/rejected": -1120.0, + "loss": 0.251, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.09375, + "rewards/margins": 3.46875, + "rewards/rejected": -9.5625, + "step": 15670 + }, + { + "epoch": 0.8081432805050895, + "grad_norm": 10.429568710393417, + "learning_rate": 5.399247400734697e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.125, + "logps/chosen": -820.0, + "logps/rejected": -1120.0, + "loss": 0.2444, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.3125, + "rewards/margins": 3.109375, + "rewards/rejected": -9.4375, + "step": 15680 + }, + { + "epoch": 0.8086586780054117, + "grad_norm": 5.493775118153351, + "learning_rate": 5.3713589446520716e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.9375, + "logps/chosen": -784.0, + "logps/rejected": -1128.0, + "loss": 0.235, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.84375, + "rewards/margins": 3.703125, + "rewards/rejected": -9.5625, + "step": 15690 + }, + { + "epoch": 0.8091740755057338, + "grad_norm": 8.292551849298066, + "learning_rate": 5.3435340292170394e-08, + "logits/chosen": -3.28125, + "logits/rejected": -3.015625, + "logps/chosen": -796.0, + "logps/rejected": -1120.0, + "loss": 0.2396, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.21875, + "rewards/margins": 3.25, + "rewards/rejected": -9.4375, + "step": 15700 + }, + { + "epoch": 0.8096894730060559, + "grad_norm": 10.181402336665892, + "learning_rate": 5.315772744502736e-08, + "logits/chosen": -3.328125, + "logits/rejected": -2.96875, + "logps/chosen": -760.0, + "logps/rejected": -1096.0, + "loss": 0.249, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.78125, + "rewards/margins": 3.625, + "rewards/rejected": -9.375, + "step": 15710 + }, + { + "epoch": 0.810204870506378, + "grad_norm": 8.823135254732936, + "learning_rate": 5.288075180376314e-08, + "logits/chosen": -3.328125, + "logits/rejected": -2.953125, + "logps/chosen": -760.0, + "logps/rejected": -1104.0, + "loss": 0.225, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.03125, + "rewards/margins": 3.546875, + "rewards/rejected": -9.5625, + "step": 15720 + }, + { + "epoch": 0.8107202680067002, + "grad_norm": 7.744576161397276, + "learning_rate": 5.260441426498652e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.171875, + "logps/chosen": -792.0, + "logps/rejected": -1104.0, + "loss": 0.2253, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.09375, + "rewards/margins": 3.46875, + "rewards/rejected": -9.5625, + "step": 15730 + }, + { + "epoch": 0.8112356655070223, + "grad_norm": 9.375018707077189, + "learning_rate": 5.232871572324055e-08, + "logits/chosen": -3.21875, + "logits/rejected": -3.015625, + "logps/chosen": -772.0, + "logps/rejected": -1136.0, + "loss": 0.2311, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.96875, + "rewards/margins": 3.796875, + "rewards/rejected": -9.75, + "step": 15740 + }, + { + "epoch": 0.8117510630073445, + "grad_norm": 11.243340043660838, + "learning_rate": 5.205365707099993e-08, + "logits/chosen": -3.484375, + "logits/rejected": -3.203125, + "logps/chosen": -784.0, + "logps/rejected": -1136.0, + "loss": 0.2227, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 3.5, + "rewards/rejected": -9.6875, + "step": 15750 + }, + { + "epoch": 0.8122664605076665, + "grad_norm": 8.968060324400902, + "learning_rate": 5.1779239198667776e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.953125, + "logps/chosen": -756.0, + "logps/rejected": -1112.0, + "loss": 0.2272, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.96875, + "rewards/margins": 3.71875, + "rewards/rejected": -9.6875, + "step": 15760 + }, + { + "epoch": 0.8127818580079886, + "grad_norm": 11.116370296785167, + "learning_rate": 5.1505462994573036e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.984375, + "logps/chosen": -792.0, + "logps/rejected": -1144.0, + "loss": 0.215, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.15625, + "rewards/margins": 3.796875, + "rewards/rejected": -9.9375, + "step": 15770 + }, + { + "epoch": 0.8132972555083108, + "grad_norm": 9.08066029588928, + "learning_rate": 5.1232329344967516e-08, + "logits/chosen": -3.453125, + "logits/rejected": -3.15625, + "logps/chosen": -804.0, + "logps/rejected": -1168.0, + "loss": 0.2334, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.28125, + "rewards/margins": 3.796875, + "rewards/rejected": -10.0625, + "step": 15780 + }, + { + "epoch": 0.8138126530086329, + "grad_norm": 9.128688949761878, + "learning_rate": 5.0959839134022826e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.875, + "logps/chosen": -816.0, + "logps/rejected": -1176.0, + "loss": 0.2193, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.5, + "rewards/margins": 3.640625, + "rewards/rejected": -10.125, + "step": 15790 + }, + { + "epoch": 0.8143280505089551, + "grad_norm": 10.143453967244277, + "learning_rate": 5.068799324382783e-08, + "logits/chosen": -3.109375, + "logits/rejected": -2.75, + "logps/chosen": -764.0, + "logps/rejected": -1144.0, + "loss": 0.2064, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.84375, + "rewards/rejected": -9.875, + "step": 15800 + }, + { + "epoch": 0.8148434480092771, + "grad_norm": 12.486313283899827, + "learning_rate": 5.041679255438549e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.09375, + "logps/chosen": -824.0, + "logps/rejected": -1168.0, + "loss": 0.2344, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.5, + "rewards/margins": 3.640625, + "rewards/rejected": -10.125, + "step": 15810 + }, + { + "epoch": 0.8153588455095993, + "grad_norm": 6.848952307574148, + "learning_rate": 5.014623794361034e-08, + "logits/chosen": -3.328125, + "logits/rejected": -2.953125, + "logps/chosen": -800.0, + "logps/rejected": -1144.0, + "loss": 0.2273, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.21875, + "rewards/margins": 3.625, + "rewards/rejected": -9.8125, + "step": 15820 + }, + { + "epoch": 0.8158742430099214, + "grad_norm": 9.763625917122205, + "learning_rate": 4.9876330287325377e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.921875, + "logps/chosen": -836.0, + "logps/rejected": -1176.0, + "loss": 0.2201, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.40625, + "rewards/margins": 3.578125, + "rewards/rejected": -10.0, + "step": 15830 + }, + { + "epoch": 0.8163896405102435, + "grad_norm": 6.1703073113075915, + "learning_rate": 4.960707045925922e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.984375, + "logps/chosen": -780.0, + "logps/rejected": -1192.0, + "loss": 0.2129, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.25, + "rewards/margins": 4.21875, + "rewards/rejected": -10.5, + "step": 15840 + }, + { + "epoch": 0.8169050380105657, + "grad_norm": 8.409540768111183, + "learning_rate": 4.933845933104358e-08, + "logits/chosen": -3.40625, + "logits/rejected": -3.1875, + "logps/chosen": -824.0, + "logps/rejected": -1176.0, + "loss": 0.2296, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.46875, + "rewards/margins": 3.609375, + "rewards/rejected": -10.0625, + "step": 15850 + }, + { + "epoch": 0.8174204355108877, + "grad_norm": 9.296168165381708, + "learning_rate": 4.9070497772210006e-08, + "logits/chosen": -3.40625, + "logits/rejected": -3.09375, + "logps/chosen": -784.0, + "logps/rejected": -1152.0, + "loss": 0.2415, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.765625, + "rewards/rejected": -9.9375, + "step": 15860 + }, + { + "epoch": 0.8179358330112099, + "grad_norm": 11.231228054782182, + "learning_rate": 4.880318665018748e-08, + "logits/chosen": -3.25, + "logits/rejected": -3.046875, + "logps/chosen": -788.0, + "logps/rejected": -1136.0, + "loss": 0.2216, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.09375, + "rewards/margins": 3.8125, + "rewards/rejected": -9.875, + "step": 15870 + }, + { + "epoch": 0.818451230511532, + "grad_norm": 12.543897033021143, + "learning_rate": 4.853652683029946e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.046875, + "logps/chosen": -776.0, + "logps/rejected": -1136.0, + "loss": 0.2669, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.484375, + "rewards/rejected": -9.5, + "step": 15880 + }, + { + "epoch": 0.8189666280118542, + "grad_norm": 9.08447431645231, + "learning_rate": 4.827051917576083e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.203125, + "logps/chosen": -800.0, + "logps/rejected": -1144.0, + "loss": 0.2257, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.5625, + "rewards/rejected": -9.75, + "step": 15890 + }, + { + "epoch": 0.8194820255121763, + "grad_norm": 8.802117737592544, + "learning_rate": 4.8005164547675474e-08, + "logits/chosen": -3.3125, + "logits/rejected": -2.9375, + "logps/chosen": -820.0, + "logps/rejected": -1152.0, + "loss": 0.2249, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.53125, + "rewards/margins": 3.34375, + "rewards/rejected": -9.875, + "step": 15900 + }, + { + "epoch": 0.8199974230124983, + "grad_norm": 6.225142809229339, + "learning_rate": 4.774046380503341e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.140625, + "logps/chosen": -808.0, + "logps/rejected": -1144.0, + "loss": 0.2294, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.421875, + "rewards/rejected": -9.625, + "step": 15910 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 6.916425086013018, + "learning_rate": 4.7476417804707735e-08, + "logits/chosen": -3.296875, + "logits/rejected": -3.03125, + "logps/chosen": -816.0, + "logps/rejected": -1176.0, + "loss": 0.2096, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.28125, + "rewards/margins": 3.75, + "rewards/rejected": -10.0625, + "step": 15920 + }, + { + "epoch": 0.8210282180131426, + "grad_norm": 7.759004146114974, + "learning_rate": 4.721302740145225e-08, + "logits/chosen": -3.234375, + "logits/rejected": -2.9375, + "logps/chosen": -780.0, + "logps/rejected": -1144.0, + "loss": 0.2258, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.0625, + "rewards/margins": 3.703125, + "rewards/rejected": -9.75, + "step": 15930 + }, + { + "epoch": 0.8215436155134648, + "grad_norm": 13.401181290090618, + "learning_rate": 4.6950293447898335e-08, + "logits/chosen": -3.265625, + "logits/rejected": -3.0, + "logps/chosen": -792.0, + "logps/rejected": -1160.0, + "loss": 0.2405, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.828125, + "rewards/rejected": -9.9375, + "step": 15940 + }, + { + "epoch": 0.8220590130137869, + "grad_norm": 6.340316067528457, + "learning_rate": 4.6688216794552424e-08, + "logits/chosen": -3.171875, + "logits/rejected": -2.828125, + "logps/chosen": -820.0, + "logps/rejected": -1104.0, + "loss": 0.2215, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.453125, + "rewards/rejected": -9.5625, + "step": 15950 + }, + { + "epoch": 0.822574410514109, + "grad_norm": 9.487996931935994, + "learning_rate": 4.642679828979329e-08, + "logits/chosen": -3.171875, + "logits/rejected": -2.84375, + "logps/chosen": -804.0, + "logps/rejected": -1112.0, + "loss": 0.226, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.28125, + "rewards/margins": 3.1875, + "rewards/rejected": -9.4375, + "step": 15960 + }, + { + "epoch": 0.8230898080144311, + "grad_norm": 8.395927868138704, + "learning_rate": 4.616603877986897e-08, + "logits/chosen": -3.265625, + "logits/rejected": -3.015625, + "logps/chosen": -776.0, + "logps/rejected": -1120.0, + "loss": 0.2191, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.0, + "rewards/margins": 3.6875, + "rewards/rejected": -9.6875, + "step": 15970 + }, + { + "epoch": 0.8236052055147532, + "grad_norm": 10.85151697812025, + "learning_rate": 4.5905939108894395e-08, + "logits/chosen": -3.140625, + "logits/rejected": -2.875, + "logps/chosen": -808.0, + "logps/rejected": -1128.0, + "loss": 0.2241, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.25, + "rewards/margins": 3.453125, + "rewards/rejected": -9.6875, + "step": 15980 + }, + { + "epoch": 0.8241206030150754, + "grad_norm": 9.107284934796816, + "learning_rate": 4.5646500118848435e-08, + "logits/chosen": -3.296875, + "logits/rejected": -2.984375, + "logps/chosen": -796.0, + "logps/rejected": -1112.0, + "loss": 0.2406, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.359375, + "rewards/rejected": -9.5, + "step": 15990 + }, + { + "epoch": 0.8246360005153975, + "grad_norm": 8.870117053494976, + "learning_rate": 4.5387722649571277e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.25, + "logps/chosen": -776.0, + "logps/rejected": -1136.0, + "loss": 0.2033, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.0, + "rewards/margins": 3.59375, + "rewards/rejected": -9.5625, + "step": 16000 + }, + { + "epoch": 0.8251513980157196, + "grad_norm": 10.576554708118003, + "learning_rate": 4.512960753876174e-08, + "logits/chosen": -3.484375, + "logits/rejected": -3.046875, + "logps/chosen": -800.0, + "logps/rejected": -1160.0, + "loss": 0.2495, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.125, + "rewards/margins": 3.71875, + "rewards/rejected": -9.8125, + "step": 16010 + }, + { + "epoch": 0.8256667955160417, + "grad_norm": 9.864914715435521, + "learning_rate": 4.48721556219743e-08, + "logits/chosen": -3.125, + "logits/rejected": -2.84375, + "logps/chosen": -768.0, + "logps/rejected": -1120.0, + "loss": 0.2066, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.03125, + "rewards/margins": 3.609375, + "rewards/rejected": -9.625, + "step": 16020 + }, + { + "epoch": 0.8261821930163639, + "grad_norm": 6.78443495684596, + "learning_rate": 4.461536773261676e-08, + "logits/chosen": -3.234375, + "logits/rejected": -2.984375, + "logps/chosen": -796.0, + "logps/rejected": -1096.0, + "loss": 0.2347, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.28125, + "rewards/margins": 3.09375, + "rewards/rejected": -9.375, + "step": 16030 + }, + { + "epoch": 0.826697590516686, + "grad_norm": 9.074638452216224, + "learning_rate": 4.435924470194738e-08, + "logits/chosen": -3.265625, + "logits/rejected": -3.015625, + "logps/chosen": -772.0, + "logps/rejected": -1144.0, + "loss": 0.2159, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.0, + "rewards/margins": 3.75, + "rewards/rejected": -9.75, + "step": 16040 + }, + { + "epoch": 0.8272129880170082, + "grad_norm": 11.009602776499767, + "learning_rate": 4.4103787359072e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.015625, + "logps/chosen": -812.0, + "logps/rejected": -1128.0, + "loss": 0.2264, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.28125, + "rewards/margins": 3.390625, + "rewards/rejected": -9.6875, + "step": 16050 + }, + { + "epoch": 0.8277283855173302, + "grad_norm": 7.194280498487052, + "learning_rate": 4.384899653094171e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.21875, + "logps/chosen": -792.0, + "logps/rejected": -1056.0, + "loss": 0.2137, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 2.890625, + "rewards/rejected": -9.0625, + "step": 16060 + }, + { + "epoch": 0.8282437830176523, + "grad_norm": 8.8197444047488, + "learning_rate": 4.359487304234999e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.125, + "logps/chosen": -792.0, + "logps/rejected": -1136.0, + "loss": 0.2502, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.125, + "rewards/margins": 3.59375, + "rewards/rejected": -9.75, + "step": 16070 + }, + { + "epoch": 0.8287591805179745, + "grad_norm": 8.174229417001893, + "learning_rate": 4.334141771592992e-08, + "logits/chosen": -3.171875, + "logits/rejected": -2.921875, + "logps/chosen": -772.0, + "logps/rejected": -1136.0, + "loss": 0.2289, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.0625, + "rewards/margins": 3.53125, + "rewards/rejected": -9.625, + "step": 16080 + }, + { + "epoch": 0.8292745780182966, + "grad_norm": 8.403312278848915, + "learning_rate": 4.3088631372151795e-08, + "logits/chosen": -3.296875, + "logits/rejected": -2.921875, + "logps/chosen": -796.0, + "logps/rejected": -1160.0, + "loss": 0.2102, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.34375, + "rewards/margins": 3.765625, + "rewards/rejected": -10.125, + "step": 16090 + }, + { + "epoch": 0.8297899755186188, + "grad_norm": 11.52754734063142, + "learning_rate": 4.2836514829320184e-08, + "logits/chosen": -3.28125, + "logits/rejected": -3.078125, + "logps/chosen": -804.0, + "logps/rejected": -1128.0, + "loss": 0.2332, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.96875, + "rewards/margins": 3.53125, + "rewards/rejected": -9.5, + "step": 16100 + }, + { + "epoch": 0.8303053730189408, + "grad_norm": 7.272583611897796, + "learning_rate": 4.25850689035715e-08, + "logits/chosen": -3.34375, + "logits/rejected": -2.96875, + "logps/chosen": -800.0, + "logps/rejected": -1136.0, + "loss": 0.2202, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.5, + "rewards/rejected": -9.6875, + "step": 16110 + }, + { + "epoch": 0.830820770519263, + "grad_norm": 7.408289503634225, + "learning_rate": 4.233429440887135e-08, + "logits/chosen": -3.3125, + "logits/rejected": -2.921875, + "logps/chosen": -792.0, + "logps/rejected": -1144.0, + "loss": 0.2415, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.578125, + "rewards/rejected": -9.6875, + "step": 16120 + }, + { + "epoch": 0.8313361680195851, + "grad_norm": 8.495796582225472, + "learning_rate": 4.208419215701159e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.078125, + "logps/chosen": -808.0, + "logps/rejected": -1120.0, + "loss": 0.2192, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.21875, + "rewards/margins": 3.34375, + "rewards/rejected": -9.5625, + "step": 16130 + }, + { + "epoch": 0.8318515655199072, + "grad_norm": 8.64199626277333, + "learning_rate": 4.183476295760821e-08, + "logits/chosen": -3.34375, + "logits/rejected": -2.984375, + "logps/chosen": -836.0, + "logps/rejected": -1152.0, + "loss": 0.2353, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.46875, + "rewards/margins": 3.40625, + "rewards/rejected": -9.875, + "step": 16140 + }, + { + "epoch": 0.8323669630202294, + "grad_norm": 8.812004385629963, + "learning_rate": 4.158600761809822e-08, + "logits/chosen": -3.34375, + "logits/rejected": -3.03125, + "logps/chosen": -784.0, + "logps/rejected": -1176.0, + "loss": 0.2327, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.125, + "rewards/margins": 3.859375, + "rewards/rejected": -10.0, + "step": 16150 + }, + { + "epoch": 0.8328823605205514, + "grad_norm": 8.89589167624695, + "learning_rate": 4.133792694373731e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.125, + "logps/chosen": -796.0, + "logps/rejected": -1152.0, + "loss": 0.2538, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.25, + "rewards/margins": 3.609375, + "rewards/rejected": -9.875, + "step": 16160 + }, + { + "epoch": 0.8333977580208736, + "grad_norm": 8.900650328165638, + "learning_rate": 4.109052173759733e-08, + "logits/chosen": -3.234375, + "logits/rejected": -2.953125, + "logps/chosen": -760.0, + "logps/rejected": -1112.0, + "loss": 0.2234, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.875, + "rewards/margins": 3.765625, + "rewards/rejected": -9.6875, + "step": 16170 + }, + { + "epoch": 0.8339131555211957, + "grad_norm": 7.79430942301771, + "learning_rate": 4.0843792800563295e-08, + "logits/chosen": -3.390625, + "logits/rejected": -2.921875, + "logps/chosen": -772.0, + "logps/rejected": -1168.0, + "loss": 0.234, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.96875, + "rewards/margins": 4.0625, + "rewards/rejected": -10.0625, + "step": 16180 + }, + { + "epoch": 0.8344285530215179, + "grad_norm": 9.20865362487647, + "learning_rate": 4.059774093133128e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.046875, + "logps/chosen": -772.0, + "logps/rejected": -1136.0, + "loss": 0.2053, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.8125, + "rewards/rejected": -9.875, + "step": 16190 + }, + { + "epoch": 0.83494395052184, + "grad_norm": 12.750610661377273, + "learning_rate": 4.0352366926405394e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.96875, + "logps/chosen": -828.0, + "logps/rejected": -1152.0, + "loss": 0.2219, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.4375, + "rewards/margins": 3.484375, + "rewards/rejected": -9.9375, + "step": 16200 + }, + { + "epoch": 0.835459348022162, + "grad_norm": 8.344152322155873, + "learning_rate": 4.010767158009551e-08, + "logits/chosen": -3.328125, + "logits/rejected": -2.9375, + "logps/chosen": -808.0, + "logps/rejected": -1176.0, + "loss": 0.2136, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.375, + "rewards/margins": 3.59375, + "rewards/rejected": -10.0, + "step": 16210 + }, + { + "epoch": 0.8359747455224842, + "grad_norm": 7.905356979051144, + "learning_rate": 3.986365568451463e-08, + "logits/chosen": -3.234375, + "logits/rejected": -2.859375, + "logps/chosen": -812.0, + "logps/rejected": -1152.0, + "loss": 0.2265, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.75, + "rewards/rejected": -9.875, + "step": 16220 + }, + { + "epoch": 0.8364901430228063, + "grad_norm": 6.405987967245383, + "learning_rate": 3.962032002957624e-08, + "logits/chosen": -3.203125, + "logits/rejected": -3.0, + "logps/chosen": -832.0, + "logps/rejected": -1152.0, + "loss": 0.2272, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.53125, + "rewards/margins": 3.34375, + "rewards/rejected": -9.875, + "step": 16230 + }, + { + "epoch": 0.8370055405231285, + "grad_norm": 9.33439860379506, + "learning_rate": 3.9377665402991786e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.03125, + "logps/chosen": -808.0, + "logps/rejected": -1144.0, + "loss": 0.2241, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.25, + "rewards/margins": 3.53125, + "rewards/rejected": -9.8125, + "step": 16240 + }, + { + "epoch": 0.8375209380234506, + "grad_norm": 6.4907545027458795, + "learning_rate": 3.9135692590268e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.015625, + "logps/chosen": -824.0, + "logps/rejected": -1176.0, + "loss": 0.2117, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.21875, + "rewards/margins": 3.796875, + "rewards/rejected": -10.0, + "step": 16250 + }, + { + "epoch": 0.8380363355237727, + "grad_norm": 12.385812027676515, + "learning_rate": 3.889440237470468e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.234375, + "logps/chosen": -776.0, + "logps/rejected": -1152.0, + "loss": 0.2366, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.03125, + "rewards/margins": 3.875, + "rewards/rejected": -9.875, + "step": 16260 + }, + { + "epoch": 0.8385517330240948, + "grad_norm": 8.474418816671925, + "learning_rate": 3.865379553739193e-08, + "logits/chosen": -3.5, + "logits/rejected": -3.078125, + "logps/chosen": -816.0, + "logps/rejected": -1152.0, + "loss": 0.2285, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.53125, + "rewards/margins": 3.21875, + "rewards/rejected": -9.75, + "step": 16270 + }, + { + "epoch": 0.839067130524417, + "grad_norm": 7.0354550996442455, + "learning_rate": 3.841387285720763e-08, + "logits/chosen": -3.40625, + "logits/rejected": -3.1875, + "logps/chosen": -784.0, + "logps/rejected": -1168.0, + "loss": 0.2336, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.0625, + "rewards/margins": 4.0625, + "rewards/rejected": -10.125, + "step": 16280 + }, + { + "epoch": 0.8395825280247391, + "grad_norm": 8.084293616879037, + "learning_rate": 3.8174635110814954e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.90625, + "logps/chosen": -748.0, + "logps/rejected": -1176.0, + "loss": 0.2381, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.78125, + "rewards/margins": 4.5625, + "rewards/rejected": -10.3125, + "step": 16290 + }, + { + "epoch": 0.8400979255250612, + "grad_norm": 7.684315345993253, + "learning_rate": 3.793608307265972e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.90625, + "logps/chosen": -824.0, + "logps/rejected": -1144.0, + "loss": 0.238, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.375, + "rewards/margins": 3.59375, + "rewards/rejected": -9.9375, + "step": 16300 + }, + { + "epoch": 0.8406133230253833, + "grad_norm": 8.69451572913628, + "learning_rate": 3.769821751496821e-08, + "logits/chosen": -3.453125, + "logits/rejected": -3.078125, + "logps/chosen": -788.0, + "logps/rejected": -1144.0, + "loss": 0.213, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.21875, + "rewards/margins": 3.578125, + "rewards/rejected": -9.8125, + "step": 16310 + }, + { + "epoch": 0.8411287205257054, + "grad_norm": 8.109439746687247, + "learning_rate": 3.74610392077444e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.84375, + "logps/chosen": -756.0, + "logps/rejected": -1144.0, + "loss": 0.2091, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.78125, + "rewards/margins": 4.125, + "rewards/rejected": -9.875, + "step": 16320 + }, + { + "epoch": 0.8416441180260276, + "grad_norm": 9.20089840161592, + "learning_rate": 3.722454891876764e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.03125, + "logps/chosen": -792.0, + "logps/rejected": -1176.0, + "loss": 0.2348, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.640625, + "rewards/rejected": -9.8125, + "step": 16330 + }, + { + "epoch": 0.8421595155263497, + "grad_norm": 9.043134644327987, + "learning_rate": 3.698874741358981e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.90625, + "logps/chosen": -804.0, + "logps/rejected": -1120.0, + "loss": 0.2292, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.375, + "rewards/margins": 3.296875, + "rewards/rejected": -9.6875, + "step": 16340 + }, + { + "epoch": 0.8426749130266719, + "grad_norm": 12.910699091784217, + "learning_rate": 3.6753635455533286e-08, + "logits/chosen": -3.515625, + "logits/rejected": -3.09375, + "logps/chosen": -836.0, + "logps/rejected": -1152.0, + "loss": 0.2395, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.5, + "rewards/margins": 3.234375, + "rewards/rejected": -9.75, + "step": 16350 + }, + { + "epoch": 0.8431903105269939, + "grad_norm": 7.9340299139113935, + "learning_rate": 3.651921380568826e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.890625, + "logps/chosen": -796.0, + "logps/rejected": -1152.0, + "loss": 0.2079, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.21875, + "rewards/margins": 3.71875, + "rewards/rejected": -9.9375, + "step": 16360 + }, + { + "epoch": 0.843705708027316, + "grad_norm": 11.485230269922042, + "learning_rate": 3.6285483222910334e-08, + "logits/chosen": -3.28125, + "logits/rejected": -3.09375, + "logps/chosen": -784.0, + "logps/rejected": -1088.0, + "loss": 0.2408, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.28125, + "rewards/rejected": -9.375, + "step": 16370 + }, + { + "epoch": 0.8442211055276382, + "grad_norm": 7.326780608595959, + "learning_rate": 3.6052444463817944e-08, + "logits/chosen": -3.34375, + "logits/rejected": -3.015625, + "logps/chosen": -800.0, + "logps/rejected": -1128.0, + "loss": 0.2209, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.234375, + "rewards/rejected": -9.4375, + "step": 16380 + }, + { + "epoch": 0.8447365030279603, + "grad_norm": 8.356684524892552, + "learning_rate": 3.5820098282790146e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.984375, + "logps/chosen": -776.0, + "logps/rejected": -1096.0, + "loss": 0.2565, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.421875, + "rewards/rejected": -9.5625, + "step": 16390 + }, + { + "epoch": 0.8452519005282825, + "grad_norm": 8.02534892555995, + "learning_rate": 3.558844543196382e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.84375, + "logps/chosen": -804.0, + "logps/rejected": -1176.0, + "loss": 0.2397, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.40625, + "rewards/margins": 3.765625, + "rewards/rejected": -10.1875, + "step": 16400 + }, + { + "epoch": 0.8457672980286045, + "grad_norm": 7.388665857558497, + "learning_rate": 3.5357486661231505e-08, + "logits/chosen": -3.421875, + "logits/rejected": -3.1875, + "logps/chosen": -788.0, + "logps/rejected": -1152.0, + "loss": 0.2228, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.25, + "rewards/margins": 3.640625, + "rewards/rejected": -9.875, + "step": 16410 + }, + { + "epoch": 0.8462826955289267, + "grad_norm": 7.944632486754261, + "learning_rate": 3.5127222718239006e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.796875, + "logps/chosen": -800.0, + "logps/rejected": -1128.0, + "loss": 0.2439, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.1875, + "rewards/margins": 3.71875, + "rewards/rejected": -9.875, + "step": 16420 + }, + { + "epoch": 0.8467980930292488, + "grad_norm": 7.845292607187066, + "learning_rate": 3.489765434838279e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.875, + "logps/chosen": -788.0, + "logps/rejected": -1144.0, + "loss": 0.2177, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.125, + "rewards/margins": 3.65625, + "rewards/rejected": -9.8125, + "step": 16430 + }, + { + "epoch": 0.847313490529571, + "grad_norm": 8.68594134443245, + "learning_rate": 3.466878229480774e-08, + "logits/chosen": -3.1875, + "logits/rejected": -2.875, + "logps/chosen": -812.0, + "logps/rejected": -1152.0, + "loss": 0.2316, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.3125, + "rewards/margins": 3.578125, + "rewards/rejected": -9.875, + "step": 16440 + }, + { + "epoch": 0.8478288880298931, + "grad_norm": 7.9715579436972455, + "learning_rate": 3.444060729840456e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.984375, + "logps/chosen": -800.0, + "logps/rejected": -1160.0, + "loss": 0.2305, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.890625, + "rewards/rejected": -9.9375, + "step": 16450 + }, + { + "epoch": 0.8483442855302151, + "grad_norm": 7.861535971301532, + "learning_rate": 3.4213130097807436e-08, + "logits/chosen": -3.46875, + "logits/rejected": -3.046875, + "logps/chosen": -812.0, + "logps/rejected": -1144.0, + "loss": 0.2512, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.28125, + "rewards/margins": 3.484375, + "rewards/rejected": -9.75, + "step": 16460 + }, + { + "epoch": 0.8488596830305373, + "grad_norm": 9.48493536782836, + "learning_rate": 3.398635142939185e-08, + "logits/chosen": -3.265625, + "logits/rejected": -3.0, + "logps/chosen": -808.0, + "logps/rejected": -1136.0, + "loss": 0.2465, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.34375, + "rewards/margins": 3.28125, + "rewards/rejected": -9.625, + "step": 16470 + }, + { + "epoch": 0.8493750805308594, + "grad_norm": 10.783288768321244, + "learning_rate": 3.3760272027271966e-08, + "logits/chosen": -3.34375, + "logits/rejected": -2.984375, + "logps/chosen": -804.0, + "logps/rejected": -1136.0, + "loss": 0.2205, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.625, + "rewards/rejected": -9.8125, + "step": 16480 + }, + { + "epoch": 0.8498904780311816, + "grad_norm": 8.57251166364417, + "learning_rate": 3.3534892623298393e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.15625, + "logps/chosen": -816.0, + "logps/rejected": -1144.0, + "loss": 0.2163, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.453125, + "rewards/rejected": -9.8125, + "step": 16490 + }, + { + "epoch": 0.8504058755315037, + "grad_norm": 10.130638199538224, + "learning_rate": 3.331021394705563e-08, + "logits/chosen": -3.46875, + "logits/rejected": -3.015625, + "logps/chosen": -808.0, + "logps/rejected": -1160.0, + "loss": 0.2316, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.5625, + "rewards/rejected": -9.75, + "step": 16500 + }, + { + "epoch": 0.8509212730318257, + "grad_norm": 11.332311799338253, + "learning_rate": 3.308623672585981e-08, + "logits/chosen": -3.234375, + "logits/rejected": -2.921875, + "logps/chosen": -804.0, + "logps/rejected": -1112.0, + "loss": 0.2524, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.28125, + "rewards/margins": 3.265625, + "rewards/rejected": -9.5625, + "step": 16510 + }, + { + "epoch": 0.8514366705321479, + "grad_norm": 10.312295351360987, + "learning_rate": 3.2862961684756495e-08, + "logits/chosen": -3.421875, + "logits/rejected": -2.90625, + "logps/chosen": -796.0, + "logps/rejected": -1112.0, + "loss": 0.2405, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.28125, + "rewards/margins": 3.34375, + "rewards/rejected": -9.625, + "step": 16520 + }, + { + "epoch": 0.85195206803247, + "grad_norm": 7.80888778083033, + "learning_rate": 3.264038954651807e-08, + "logits/chosen": -3.484375, + "logits/rejected": -3.21875, + "logps/chosen": -792.0, + "logps/rejected": -1104.0, + "loss": 0.216, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.0, + "rewards/margins": 3.328125, + "rewards/rejected": -9.375, + "step": 16530 + }, + { + "epoch": 0.8524674655327922, + "grad_norm": 11.090463195946919, + "learning_rate": 3.2418521031641693e-08, + "logits/chosen": -3.21875, + "logits/rejected": -3.125, + "logps/chosen": -800.0, + "logps/rejected": -1128.0, + "loss": 0.2498, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.3125, + "rewards/margins": 3.453125, + "rewards/rejected": -9.75, + "step": 16540 + }, + { + "epoch": 0.8529828630331143, + "grad_norm": 7.176377147676843, + "learning_rate": 3.219735685834654e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.25, + "logps/chosen": -796.0, + "logps/rejected": -1104.0, + "loss": 0.2378, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.1875, + "rewards/margins": 3.40625, + "rewards/rejected": -9.5625, + "step": 16550 + }, + { + "epoch": 0.8534982605334364, + "grad_norm": 11.788207414422361, + "learning_rate": 3.197689774257198e-08, + "logits/chosen": -2.984375, + "logits/rejected": -2.78125, + "logps/chosen": -752.0, + "logps/rejected": -1144.0, + "loss": 0.2075, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.9375, + "rewards/margins": 4.03125, + "rewards/rejected": -10.0, + "step": 16560 + }, + { + "epoch": 0.8540136580337585, + "grad_norm": 6.600539834255068, + "learning_rate": 3.1757144397974775e-08, + "logits/chosen": -3.203125, + "logits/rejected": -2.984375, + "logps/chosen": -828.0, + "logps/rejected": -1168.0, + "loss": 0.2302, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.375, + "rewards/margins": 3.734375, + "rewards/rejected": -10.125, + "step": 16570 + }, + { + "epoch": 0.8545290555340807, + "grad_norm": 6.338033083354845, + "learning_rate": 3.1538097535927264e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.1875, + "logps/chosen": -812.0, + "logps/rejected": -1120.0, + "loss": 0.2315, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.34375, + "rewards/margins": 3.3125, + "rewards/rejected": -9.6875, + "step": 16580 + }, + { + "epoch": 0.8550444530344028, + "grad_norm": 7.894813999596367, + "learning_rate": 3.131975786551461e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.96875, + "logps/chosen": -812.0, + "logps/rejected": -1128.0, + "loss": 0.2389, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.375, + "rewards/margins": 3.296875, + "rewards/rejected": -9.6875, + "step": 16590 + }, + { + "epoch": 0.8555598505347249, + "grad_norm": 7.365756829355162, + "learning_rate": 3.110212609353283e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.890625, + "logps/chosen": -776.0, + "logps/rejected": -1160.0, + "loss": 0.2193, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.875, + "rewards/margins": 3.921875, + "rewards/rejected": -9.8125, + "step": 16600 + }, + { + "epoch": 0.856075248035047, + "grad_norm": 13.104288594298561, + "learning_rate": 3.0885202924486295e-08, + "logits/chosen": -3.296875, + "logits/rejected": -3.0, + "logps/chosen": -796.0, + "logps/rejected": -1152.0, + "loss": 0.2284, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.25, + "rewards/margins": 3.59375, + "rewards/rejected": -9.8125, + "step": 16610 + }, + { + "epoch": 0.8565906455353691, + "grad_norm": 7.786622825282487, + "learning_rate": 3.066898906058546e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.203125, + "logps/chosen": -780.0, + "logps/rejected": -1136.0, + "loss": 0.2412, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.796875, + "rewards/rejected": -9.9375, + "step": 16620 + }, + { + "epoch": 0.8571060430356913, + "grad_norm": 7.848702092552779, + "learning_rate": 3.0453485201744786e-08, + "logits/chosen": -3.234375, + "logits/rejected": -2.9375, + "logps/chosen": -784.0, + "logps/rejected": -1152.0, + "loss": 0.2237, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.09375, + "rewards/margins": 3.796875, + "rewards/rejected": -9.875, + "step": 16630 + }, + { + "epoch": 0.8576214405360134, + "grad_norm": 6.732196405199873, + "learning_rate": 3.0238692045580356e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.109375, + "logps/chosen": -784.0, + "logps/rejected": -1128.0, + "loss": 0.2233, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.03125, + "rewards/margins": 3.59375, + "rewards/rejected": -9.625, + "step": 16640 + }, + { + "epoch": 0.8581368380363356, + "grad_norm": 10.193890188021998, + "learning_rate": 3.002461028740758e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.0625, + "logps/chosen": -816.0, + "logps/rejected": -1152.0, + "loss": 0.231, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.40625, + "rewards/margins": 3.53125, + "rewards/rejected": -9.9375, + "step": 16650 + }, + { + "epoch": 0.8586522355366576, + "grad_norm": 10.382384070214798, + "learning_rate": 2.9811240620238875e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.953125, + "logps/chosen": -780.0, + "logps/rejected": -1112.0, + "loss": 0.2153, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.40625, + "rewards/rejected": -9.4375, + "step": 16660 + }, + { + "epoch": 0.8591676330369797, + "grad_norm": 12.295797638210805, + "learning_rate": 2.959858373478161e-08, + "logits/chosen": -3.234375, + "logits/rejected": -3.0, + "logps/chosen": -800.0, + "logps/rejected": -1112.0, + "loss": 0.2367, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.375, + "rewards/rejected": -9.5625, + "step": 16670 + }, + { + "epoch": 0.8596830305373019, + "grad_norm": 7.412769695070452, + "learning_rate": 2.9386640319435803e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.140625, + "logps/chosen": -788.0, + "logps/rejected": -1160.0, + "loss": 0.2236, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.828125, + "rewards/rejected": -9.9375, + "step": 16680 + }, + { + "epoch": 0.860198428037624, + "grad_norm": 10.983551852519273, + "learning_rate": 2.917541106029181e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.921875, + "logps/chosen": -808.0, + "logps/rejected": -1120.0, + "loss": 0.2175, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.46875, + "rewards/margins": 3.1875, + "rewards/rejected": -9.6875, + "step": 16690 + }, + { + "epoch": 0.8607138255379462, + "grad_norm": 7.601528095728787, + "learning_rate": 2.896489664112825e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.921875, + "logps/chosen": -780.0, + "logps/rejected": -1168.0, + "loss": 0.2206, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.15625, + "rewards/margins": 3.9375, + "rewards/rejected": -10.0625, + "step": 16700 + }, + { + "epoch": 0.8612292230382682, + "grad_norm": 6.5099768578334185, + "learning_rate": 2.875509774340959e-08, + "logits/chosen": -3.25, + "logits/rejected": -3.109375, + "logps/chosen": -832.0, + "logps/rejected": -1152.0, + "loss": 0.2414, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.34375, + "rewards/margins": 3.40625, + "rewards/rejected": -9.75, + "step": 16710 + }, + { + "epoch": 0.8617446205385904, + "grad_norm": 9.727536087242223, + "learning_rate": 2.854601504628415e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.921875, + "logps/chosen": -804.0, + "logps/rejected": -1168.0, + "loss": 0.2183, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.71875, + "rewards/rejected": -10.0625, + "step": 16720 + }, + { + "epoch": 0.8622600180389125, + "grad_norm": 7.0204867276762215, + "learning_rate": 2.8337649226581716e-08, + "logits/chosen": -3.3125, + "logits/rejected": -2.828125, + "logps/chosen": -800.0, + "logps/rejected": -1152.0, + "loss": 0.2224, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.484375, + "rewards/rejected": -9.875, + "step": 16730 + }, + { + "epoch": 0.8627754155392346, + "grad_norm": 9.810040735787954, + "learning_rate": 2.813000095881152e-08, + "logits/chosen": -3.5625, + "logits/rejected": -3.125, + "logps/chosen": -784.0, + "logps/rejected": -1144.0, + "loss": 0.2181, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.375, + "rewards/rejected": -9.5, + "step": 16740 + }, + { + "epoch": 0.8632908130395568, + "grad_norm": 9.147896994272934, + "learning_rate": 2.7923070915159996e-08, + "logits/chosen": -3.203125, + "logits/rejected": -2.984375, + "logps/chosen": -760.0, + "logps/rejected": -1152.0, + "loss": 0.2358, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.9375, + "rewards/margins": 4.03125, + "rewards/rejected": -10.0, + "step": 16750 + }, + { + "epoch": 0.8638062105398788, + "grad_norm": 9.338814820556333, + "learning_rate": 2.771685976548846e-08, + "logits/chosen": -3.34375, + "logits/rejected": -3.171875, + "logps/chosen": -836.0, + "logps/rejected": -1120.0, + "loss": 0.2421, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.5625, + "rewards/margins": 3.046875, + "rewards/rejected": -9.625, + "step": 16760 + }, + { + "epoch": 0.864321608040201, + "grad_norm": 9.496716213086872, + "learning_rate": 2.751136817733124e-08, + "logits/chosen": -3.265625, + "logits/rejected": -3.125, + "logps/chosen": -804.0, + "logps/rejected": -1152.0, + "loss": 0.2187, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.34375, + "rewards/margins": 3.4375, + "rewards/rejected": -9.8125, + "step": 16770 + }, + { + "epoch": 0.8648370055405231, + "grad_norm": 8.482804214091166, + "learning_rate": 2.73065968158932e-08, + "logits/chosen": -3.296875, + "logits/rejected": -3.046875, + "logps/chosen": -820.0, + "logps/rejected": -1208.0, + "loss": 0.208, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.375, + "rewards/margins": 3.828125, + "rewards/rejected": -10.1875, + "step": 16780 + }, + { + "epoch": 0.8653524030408453, + "grad_norm": 7.591865212940023, + "learning_rate": 2.7102546344047765e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.09375, + "logps/chosen": -816.0, + "logps/rejected": -1112.0, + "loss": 0.2423, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.53125, + "rewards/margins": 3.0625, + "rewards/rejected": -9.625, + "step": 16790 + }, + { + "epoch": 0.8658678005411674, + "grad_norm": 9.663625217674067, + "learning_rate": 2.6899217422334864e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.046875, + "logps/chosen": -800.0, + "logps/rejected": -1152.0, + "loss": 0.2117, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.421875, + "rewards/rejected": -9.625, + "step": 16800 + }, + { + "epoch": 0.8663831980414894, + "grad_norm": 7.951406339402954, + "learning_rate": 2.6696610708958456e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.234375, + "logps/chosen": -832.0, + "logps/rejected": -1120.0, + "loss": 0.2429, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.4375, + "rewards/margins": 3.078125, + "rewards/rejected": -9.5, + "step": 16810 + }, + { + "epoch": 0.8668985955418116, + "grad_norm": 8.003332451175897, + "learning_rate": 2.6494726859784795e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.0, + "logps/chosen": -780.0, + "logps/rejected": -1128.0, + "loss": 0.2284, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.03125, + "rewards/margins": 3.703125, + "rewards/rejected": -9.75, + "step": 16820 + }, + { + "epoch": 0.8674139930421337, + "grad_norm": 8.821198337595252, + "learning_rate": 2.6293566528339945e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.78125, + "logps/chosen": -780.0, + "logps/rejected": -1168.0, + "loss": 0.2092, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.9375, + "rewards/rejected": -10.125, + "step": 16830 + }, + { + "epoch": 0.8679293905424559, + "grad_norm": 7.172142315287931, + "learning_rate": 2.6093130365808012e-08, + "logits/chosen": -3.1875, + "logits/rejected": -2.9375, + "logps/chosen": -816.0, + "logps/rejected": -1168.0, + "loss": 0.2049, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.59375, + "rewards/rejected": -9.9375, + "step": 16840 + }, + { + "epoch": 0.868444788042778, + "grad_norm": 6.602242644504269, + "learning_rate": 2.5893419021028844e-08, + "logits/chosen": -3.46875, + "logits/rejected": -3.171875, + "logps/chosen": -836.0, + "logps/rejected": -1232.0, + "loss": 0.2334, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -6.5625, + "rewards/margins": 3.890625, + "rewards/rejected": -10.4375, + "step": 16850 + }, + { + "epoch": 0.8689601855431001, + "grad_norm": 8.143389155844586, + "learning_rate": 2.5694433140495797e-08, + "logits/chosen": -3.578125, + "logits/rejected": -3.203125, + "logps/chosen": -832.0, + "logps/rejected": -1168.0, + "loss": 0.2389, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.375, + "rewards/margins": 3.53125, + "rewards/rejected": -9.9375, + "step": 16860 + }, + { + "epoch": 0.8694755830434222, + "grad_norm": 6.485253907615014, + "learning_rate": 2.5496173368353997e-08, + "logits/chosen": -3.296875, + "logits/rejected": -3.0, + "logps/chosen": -836.0, + "logps/rejected": -1160.0, + "loss": 0.2528, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.375, + "rewards/margins": 3.1875, + "rewards/rejected": -9.5625, + "step": 16870 + }, + { + "epoch": 0.8699909805437444, + "grad_norm": 9.450304366597184, + "learning_rate": 2.5298640346397947e-08, + "logits/chosen": -3.296875, + "logits/rejected": -2.90625, + "logps/chosen": -792.0, + "logps/rejected": -1152.0, + "loss": 0.2438, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.34375, + "rewards/margins": 3.71875, + "rewards/rejected": -10.0625, + "step": 16880 + }, + { + "epoch": 0.8705063780440665, + "grad_norm": 11.018927780023835, + "learning_rate": 2.5101834714069542e-08, + "logits/chosen": -3.234375, + "logits/rejected": -3.09375, + "logps/chosen": -812.0, + "logps/rejected": -1168.0, + "loss": 0.2184, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.75, + "rewards/rejected": -10.0, + "step": 16890 + }, + { + "epoch": 0.8710217755443886, + "grad_norm": 7.223892735588, + "learning_rate": 2.4905757108456104e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.171875, + "logps/chosen": -780.0, + "logps/rejected": -1120.0, + "loss": 0.2106, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.15625, + "rewards/margins": 3.640625, + "rewards/rejected": -9.8125, + "step": 16900 + }, + { + "epoch": 0.8715371730447107, + "grad_norm": 8.0531814156596, + "learning_rate": 2.47104081642881e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.078125, + "logps/chosen": -768.0, + "logps/rejected": -1160.0, + "loss": 0.2233, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.0, + "rewards/margins": 3.859375, + "rewards/rejected": -9.8125, + "step": 16910 + }, + { + "epoch": 0.8720525705450328, + "grad_norm": 7.889101016143461, + "learning_rate": 2.4515788513937374e-08, + "logits/chosen": -3.171875, + "logits/rejected": -2.921875, + "logps/chosen": -800.0, + "logps/rejected": -1128.0, + "loss": 0.2519, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.21875, + "rewards/margins": 3.515625, + "rewards/rejected": -9.75, + "step": 16920 + }, + { + "epoch": 0.872567968045355, + "grad_norm": 6.328461057714138, + "learning_rate": 2.4321898787414862e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.0, + "logps/chosen": -804.0, + "logps/rejected": -1136.0, + "loss": 0.2286, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.09375, + "rewards/margins": 3.578125, + "rewards/rejected": -9.6875, + "step": 16930 + }, + { + "epoch": 0.8730833655456771, + "grad_norm": 8.423039310699336, + "learning_rate": 2.412873961236858e-08, + "logits/chosen": -3.1875, + "logits/rejected": -2.953125, + "logps/chosen": -816.0, + "logps/rejected": -1144.0, + "loss": 0.2291, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 3.546875, + "rewards/rejected": -9.875, + "step": 16940 + }, + { + "epoch": 0.8735987630459993, + "grad_norm": 8.850486744993697, + "learning_rate": 2.393631161408177e-08, + "logits/chosen": -3.28125, + "logits/rejected": -3.0, + "logps/chosen": -760.0, + "logps/rejected": -1120.0, + "loss": 0.228, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.9375, + "rewards/margins": 3.75, + "rewards/rejected": -9.6875, + "step": 16950 + }, + { + "epoch": 0.8741141605463213, + "grad_norm": 10.580933046112367, + "learning_rate": 2.3744615415470625e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.9375, + "logps/chosen": -800.0, + "logps/rejected": -1168.0, + "loss": 0.2392, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.890625, + "rewards/rejected": -10.1875, + "step": 16960 + }, + { + "epoch": 0.8746295580466434, + "grad_norm": 8.697190528300142, + "learning_rate": 2.3553651637082532e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.203125, + "logps/chosen": -792.0, + "logps/rejected": -1112.0, + "loss": 0.2323, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.25, + "rewards/margins": 3.40625, + "rewards/rejected": -9.625, + "step": 16970 + }, + { + "epoch": 0.8751449555469656, + "grad_norm": 8.201571271594329, + "learning_rate": 2.336342089709381e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.96875, + "logps/chosen": -808.0, + "logps/rejected": -1160.0, + "loss": 0.2294, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.734375, + "rewards/rejected": -9.9375, + "step": 16980 + }, + { + "epoch": 0.8756603530472877, + "grad_norm": 8.142947319592166, + "learning_rate": 2.3173923811307948e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.140625, + "logps/chosen": -780.0, + "logps/rejected": -1144.0, + "loss": 0.2185, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.734375, + "rewards/rejected": -9.8125, + "step": 16990 + }, + { + "epoch": 0.8761757505476099, + "grad_norm": 9.397605074695598, + "learning_rate": 2.2985160993153395e-08, + "logits/chosen": -3.15625, + "logits/rejected": -2.921875, + "logps/chosen": -816.0, + "logps/rejected": -1168.0, + "loss": 0.2321, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.40625, + "rewards/margins": 3.703125, + "rewards/rejected": -10.125, + "step": 17000 + }, + { + "epoch": 0.876691148047932, + "grad_norm": 7.898134037613109, + "learning_rate": 2.2797133053681668e-08, + "logits/chosen": -3.234375, + "logits/rejected": -2.953125, + "logps/chosen": -804.0, + "logps/rejected": -1104.0, + "loss": 0.2462, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.40625, + "rewards/margins": 3.125, + "rewards/rejected": -9.5, + "step": 17010 + }, + { + "epoch": 0.8772065455482541, + "grad_norm": 7.374979570915679, + "learning_rate": 2.2609840601565483e-08, + "logits/chosen": -3.265625, + "logits/rejected": -2.953125, + "logps/chosen": -792.0, + "logps/rejected": -1184.0, + "loss": 0.2125, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 4.03125, + "rewards/rejected": -10.0625, + "step": 17020 + }, + { + "epoch": 0.8777219430485762, + "grad_norm": 6.509696458103513, + "learning_rate": 2.2423284243096556e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.140625, + "logps/chosen": -796.0, + "logps/rejected": -1120.0, + "loss": 0.2191, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.640625, + "rewards/rejected": -9.75, + "step": 17030 + }, + { + "epoch": 0.8782373405488983, + "grad_norm": 8.452317182505034, + "learning_rate": 2.223746458218395e-08, + "logits/chosen": -3.203125, + "logits/rejected": -2.859375, + "logps/chosen": -776.0, + "logps/rejected": -1136.0, + "loss": 0.2253, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.28125, + "rewards/margins": 3.53125, + "rewards/rejected": -9.8125, + "step": 17040 + }, + { + "epoch": 0.8787527380492205, + "grad_norm": 7.090511037987655, + "learning_rate": 2.2052382220351717e-08, + "logits/chosen": -3.421875, + "logits/rejected": -2.921875, + "logps/chosen": -800.0, + "logps/rejected": -1176.0, + "loss": 0.2132, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.1875, + "rewards/margins": 3.6875, + "rewards/rejected": -9.875, + "step": 17050 + }, + { + "epoch": 0.8792681355495426, + "grad_norm": 6.407663122611167, + "learning_rate": 2.1868037756737233e-08, + "logits/chosen": -3.234375, + "logits/rejected": -3.046875, + "logps/chosen": -824.0, + "logps/rejected": -1128.0, + "loss": 0.2267, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.3125, + "rewards/margins": 3.28125, + "rewards/rejected": -9.625, + "step": 17060 + }, + { + "epoch": 0.8797835330498647, + "grad_norm": 7.88916134018945, + "learning_rate": 2.1684431788089264e-08, + "logits/chosen": -3.53125, + "logits/rejected": -3.125, + "logps/chosen": -780.0, + "logps/rejected": -1168.0, + "loss": 0.2329, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.9375, + "rewards/margins": 3.984375, + "rewards/rejected": -9.9375, + "step": 17070 + }, + { + "epoch": 0.8802989305501868, + "grad_norm": 7.277672279726182, + "learning_rate": 2.1501564908765952e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.09375, + "logps/chosen": -808.0, + "logps/rejected": -1192.0, + "loss": 0.2551, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.1875, + "rewards/margins": 4.03125, + "rewards/rejected": -10.1875, + "step": 17080 + }, + { + "epoch": 0.880814328050509, + "grad_norm": 9.67634882871027, + "learning_rate": 2.1319437710732913e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.125, + "logps/chosen": -792.0, + "logps/rejected": -1128.0, + "loss": 0.2293, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.21875, + "rewards/margins": 3.5, + "rewards/rejected": -9.75, + "step": 17090 + }, + { + "epoch": 0.8813297255508311, + "grad_norm": 6.623224804465682, + "learning_rate": 2.1138050783561235e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.203125, + "logps/chosen": -796.0, + "logps/rejected": -1120.0, + "loss": 0.2461, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.28125, + "rewards/margins": 3.375, + "rewards/rejected": -9.625, + "step": 17100 + }, + { + "epoch": 0.8818451230511533, + "grad_norm": 10.746900378556882, + "learning_rate": 2.0957404714425615e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.046875, + "logps/chosen": -792.0, + "logps/rejected": -1104.0, + "loss": 0.2198, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.25, + "rewards/margins": 3.21875, + "rewards/rejected": -9.4375, + "step": 17110 + }, + { + "epoch": 0.8823605205514753, + "grad_norm": 7.462188949000572, + "learning_rate": 2.077750008810264e-08, + "logits/chosen": -3.3125, + "logits/rejected": -2.9375, + "logps/chosen": -784.0, + "logps/rejected": -1160.0, + "loss": 0.2148, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.953125, + "rewards/rejected": -10.0, + "step": 17120 + }, + { + "epoch": 0.8828759180517974, + "grad_norm": 8.99620442830125, + "learning_rate": 2.059833748696868e-08, + "logits/chosen": -3.328125, + "logits/rejected": -2.984375, + "logps/chosen": -800.0, + "logps/rejected": -1176.0, + "loss": 0.2353, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.90625, + "rewards/rejected": -10.0, + "step": 17130 + }, + { + "epoch": 0.8833913155521196, + "grad_norm": 10.108969236710653, + "learning_rate": 2.0419917490998e-08, + "logits/chosen": -3.296875, + "logits/rejected": -3.09375, + "logps/chosen": -796.0, + "logps/rejected": -1152.0, + "loss": 0.2173, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.765625, + "rewards/rejected": -9.9375, + "step": 17140 + }, + { + "epoch": 0.8839067130524417, + "grad_norm": 8.281970255319441, + "learning_rate": 2.024224067776109e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.15625, + "logps/chosen": -808.0, + "logps/rejected": -1144.0, + "loss": 0.2118, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.25, + "rewards/margins": 3.609375, + "rewards/rejected": -9.875, + "step": 17150 + }, + { + "epoch": 0.8844221105527639, + "grad_norm": 10.92488632209321, + "learning_rate": 2.0065307622422534e-08, + "logits/chosen": -3.203125, + "logits/rejected": -3.078125, + "logps/chosen": -792.0, + "logps/rejected": -1112.0, + "loss": 0.2464, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.25, + "rewards/margins": 3.421875, + "rewards/rejected": -9.6875, + "step": 17160 + }, + { + "epoch": 0.8849375080530859, + "grad_norm": 7.15429140772424, + "learning_rate": 1.98891188977392e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.09375, + "logps/chosen": -800.0, + "logps/rejected": -1144.0, + "loss": 0.2373, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.3125, + "rewards/margins": 3.609375, + "rewards/rejected": -9.9375, + "step": 17170 + }, + { + "epoch": 0.885452905553408, + "grad_norm": 8.721644367407158, + "learning_rate": 1.9713675074058634e-08, + "logits/chosen": -3.296875, + "logits/rejected": -3.03125, + "logps/chosen": -812.0, + "logps/rejected": -1184.0, + "loss": 0.227, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.953125, + "rewards/rejected": -10.125, + "step": 17180 + }, + { + "epoch": 0.8859683030537302, + "grad_norm": 9.720599721499521, + "learning_rate": 1.9538976719316925e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.875, + "logps/chosen": -800.0, + "logps/rejected": -1232.0, + "loss": 0.2203, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.21875, + "rewards/margins": 4.3125, + "rewards/rejected": -10.5, + "step": 17190 + }, + { + "epoch": 0.8864837005540523, + "grad_norm": 8.180772830616002, + "learning_rate": 1.936502439903709e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.03125, + "logps/chosen": -808.0, + "logps/rejected": -1184.0, + "loss": 0.2068, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.40625, + "rewards/margins": 3.828125, + "rewards/rejected": -10.25, + "step": 17200 + }, + { + "epoch": 0.8869990980543745, + "grad_norm": 11.024529644792437, + "learning_rate": 1.9191818676326965e-08, + "logits/chosen": -3.421875, + "logits/rejected": -3.1875, + "logps/chosen": -792.0, + "logps/rejected": -1160.0, + "loss": 0.2349, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.25, + "rewards/margins": 3.734375, + "rewards/rejected": -10.0, + "step": 17210 + }, + { + "epoch": 0.8875144955546965, + "grad_norm": 10.171420389702266, + "learning_rate": 1.9019360111877603e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.96875, + "logps/chosen": -804.0, + "logps/rejected": -1168.0, + "loss": 0.2251, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.4375, + "rewards/margins": 3.65625, + "rewards/rejected": -10.125, + "step": 17220 + }, + { + "epoch": 0.8880298930550187, + "grad_norm": 11.47962089655575, + "learning_rate": 1.8847649263961428e-08, + "logits/chosen": -3.234375, + "logits/rejected": -3.015625, + "logps/chosen": -780.0, + "logps/rejected": -1128.0, + "loss": 0.2232, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.28125, + "rewards/margins": 3.5, + "rewards/rejected": -9.75, + "step": 17230 + }, + { + "epoch": 0.8885452905553408, + "grad_norm": 10.164820457955848, + "learning_rate": 1.867668668843042e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.0, + "logps/chosen": -816.0, + "logps/rejected": -1160.0, + "loss": 0.2239, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.5625, + "rewards/rejected": -9.75, + "step": 17240 + }, + { + "epoch": 0.889060688055663, + "grad_norm": 6.432066927821508, + "learning_rate": 1.8506472938714297e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.140625, + "logps/chosen": -792.0, + "logps/rejected": -1152.0, + "loss": 0.1976, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.671875, + "rewards/rejected": -9.6875, + "step": 17250 + }, + { + "epoch": 0.8895760855559851, + "grad_norm": 7.857147999133899, + "learning_rate": 1.8337008565818657e-08, + "logits/chosen": -3.25, + "logits/rejected": -3.046875, + "logps/chosen": -768.0, + "logps/rejected": -1112.0, + "loss": 0.2326, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.515625, + "rewards/rejected": -9.6875, + "step": 17260 + }, + { + "epoch": 0.8900914830563071, + "grad_norm": 9.222512214728617, + "learning_rate": 1.8168294118323295e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.046875, + "logps/chosen": -776.0, + "logps/rejected": -1112.0, + "loss": 0.2246, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.4375, + "rewards/rejected": -9.5, + "step": 17270 + }, + { + "epoch": 0.8906068805566293, + "grad_norm": 9.848803478871076, + "learning_rate": 1.8000330142380394e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.09375, + "logps/chosen": -812.0, + "logps/rejected": -1120.0, + "loss": 0.2196, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": -6.4375, + "rewards/margins": 3.25, + "rewards/rejected": -9.6875, + "step": 17280 + }, + { + "epoch": 0.8911222780569514, + "grad_norm": 9.221139653146341, + "learning_rate": 1.7833117181712763e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.984375, + "logps/chosen": -804.0, + "logps/rejected": -1120.0, + "loss": 0.2298, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.21875, + "rewards/margins": 3.4375, + "rewards/rejected": -9.6875, + "step": 17290 + }, + { + "epoch": 0.8916376755572736, + "grad_norm": 10.396894950707363, + "learning_rate": 1.7666655777612087e-08, + "logits/chosen": -3.34375, + "logits/rejected": -2.921875, + "logps/chosen": -796.0, + "logps/rejected": -1176.0, + "loss": 0.2195, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.90625, + "rewards/rejected": -10.0625, + "step": 17300 + }, + { + "epoch": 0.8921530730575957, + "grad_norm": 11.020514831644544, + "learning_rate": 1.7500946468937184e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.03125, + "logps/chosen": -816.0, + "logps/rejected": -1160.0, + "loss": 0.2371, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.4375, + "rewards/margins": 3.59375, + "rewards/rejected": -10.0625, + "step": 17310 + }, + { + "epoch": 0.8926684705579178, + "grad_norm": 9.59675082571496, + "learning_rate": 1.733598979211215e-08, + "logits/chosen": -3.3125, + "logits/rejected": -2.90625, + "logps/chosen": -820.0, + "logps/rejected": -1208.0, + "loss": 0.2111, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.375, + "rewards/margins": 4.03125, + "rewards/rejected": -10.4375, + "step": 17320 + }, + { + "epoch": 0.8931838680582399, + "grad_norm": 6.648917864320013, + "learning_rate": 1.7171786281124674e-08, + "logits/chosen": -3.28125, + "logits/rejected": -3.015625, + "logps/chosen": -752.0, + "logps/rejected": -1152.0, + "loss": 0.2128, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.9375, + "rewards/margins": 4.0, + "rewards/rejected": -9.9375, + "step": 17330 + }, + { + "epoch": 0.893699265558562, + "grad_norm": 7.088361308237925, + "learning_rate": 1.7008336467524474e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.09375, + "logps/chosen": -796.0, + "logps/rejected": -1128.0, + "loss": 0.2178, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.34375, + "rewards/margins": 3.375, + "rewards/rejected": -9.75, + "step": 17340 + }, + { + "epoch": 0.8942146630588842, + "grad_norm": 10.104036471457313, + "learning_rate": 1.684564088042137e-08, + "logits/chosen": -3.265625, + "logits/rejected": -3.046875, + "logps/chosen": -808.0, + "logps/rejected": -1144.0, + "loss": 0.2357, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.28125, + "rewards/margins": 3.609375, + "rewards/rejected": -9.875, + "step": 17350 + }, + { + "epoch": 0.8947300605592063, + "grad_norm": 8.140839309225477, + "learning_rate": 1.6683700046483728e-08, + "logits/chosen": -3.234375, + "logits/rejected": -2.9375, + "logps/chosen": -800.0, + "logps/rejected": -1192.0, + "loss": 0.2267, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.28125, + "rewards/margins": 3.84375, + "rewards/rejected": -10.125, + "step": 17360 + }, + { + "epoch": 0.8952454580595284, + "grad_norm": 8.029354477414085, + "learning_rate": 1.652251448993655e-08, + "logits/chosen": -3.234375, + "logits/rejected": -3.265625, + "logps/chosen": -820.0, + "logps/rejected": -1136.0, + "loss": 0.2201, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.28125, + "rewards/margins": 3.515625, + "rewards/rejected": -9.75, + "step": 17370 + }, + { + "epoch": 0.8957608555598505, + "grad_norm": 6.628675126661086, + "learning_rate": 1.63620847325599e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.1875, + "logps/chosen": -760.0, + "logps/rejected": -1144.0, + "loss": 0.2436, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.03125, + "rewards/margins": 3.859375, + "rewards/rejected": -9.875, + "step": 17380 + }, + { + "epoch": 0.8962762530601727, + "grad_norm": 9.836658195787566, + "learning_rate": 1.6202411293687324e-08, + "logits/chosen": -3.34375, + "logits/rejected": -3.09375, + "logps/chosen": -792.0, + "logps/rejected": -1144.0, + "loss": 0.2425, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.125, + "rewards/margins": 3.609375, + "rewards/rejected": -9.75, + "step": 17390 + }, + { + "epoch": 0.8967916505604948, + "grad_norm": 7.115721878961176, + "learning_rate": 1.6043494690203995e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.0, + "logps/chosen": -772.0, + "logps/rejected": -1160.0, + "loss": 0.2109, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.921875, + "rewards/rejected": -9.9375, + "step": 17400 + }, + { + "epoch": 0.897307048060817, + "grad_norm": 8.570078104347497, + "learning_rate": 1.588533543654516e-08, + "logits/chosen": -3.15625, + "logits/rejected": -2.875, + "logps/chosen": -784.0, + "logps/rejected": -1120.0, + "loss": 0.2288, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.421875, + "rewards/rejected": -9.625, + "step": 17410 + }, + { + "epoch": 0.897822445561139, + "grad_norm": 6.432382539384875, + "learning_rate": 1.572793404469433e-08, + "logits/chosen": -3.546875, + "logits/rejected": -3.15625, + "logps/chosen": -772.0, + "logps/rejected": -1112.0, + "loss": 0.2113, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.0, + "rewards/margins": 3.5625, + "rewards/rejected": -9.5625, + "step": 17420 + }, + { + "epoch": 0.8983378430614611, + "grad_norm": 8.616320600768494, + "learning_rate": 1.5571291024181726e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.078125, + "logps/chosen": -772.0, + "logps/rejected": -1136.0, + "loss": 0.215, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.90625, + "rewards/margins": 3.796875, + "rewards/rejected": -9.6875, + "step": 17430 + }, + { + "epoch": 0.8988532405617833, + "grad_norm": 10.087569049658201, + "learning_rate": 1.5415406882082648e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.171875, + "logps/chosen": -788.0, + "logps/rejected": -1096.0, + "loss": 0.2499, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.09375, + "rewards/margins": 3.375, + "rewards/rejected": -9.4375, + "step": 17440 + }, + { + "epoch": 0.8993686380621054, + "grad_norm": 6.073913459659555, + "learning_rate": 1.5260282123015862e-08, + "logits/chosen": -3.28125, + "logits/rejected": -3.03125, + "logps/chosen": -816.0, + "logps/rejected": -1160.0, + "loss": 0.2435, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.4375, + "rewards/margins": 3.578125, + "rewards/rejected": -10.0, + "step": 17450 + }, + { + "epoch": 0.8998840355624276, + "grad_norm": 6.8507233566241235, + "learning_rate": 1.5105917249141814e-08, + "logits/chosen": -3.453125, + "logits/rejected": -3.15625, + "logps/chosen": -804.0, + "logps/rejected": -1184.0, + "loss": 0.2196, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.0625, + "rewards/margins": 3.890625, + "rewards/rejected": -9.9375, + "step": 17460 + }, + { + "epoch": 0.9003994330627496, + "grad_norm": 8.758402104162537, + "learning_rate": 1.4952312760161067e-08, + "logits/chosen": -3.390625, + "logits/rejected": -2.953125, + "logps/chosen": -808.0, + "logps/rejected": -1152.0, + "loss": 0.2211, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.4375, + "rewards/margins": 3.609375, + "rewards/rejected": -10.0625, + "step": 17470 + }, + { + "epoch": 0.9009148305630718, + "grad_norm": 6.987750914320716, + "learning_rate": 1.4799469153312844e-08, + "logits/chosen": -3.296875, + "logits/rejected": -2.90625, + "logps/chosen": -812.0, + "logps/rejected": -1120.0, + "loss": 0.2297, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.375, + "rewards/margins": 3.15625, + "rewards/rejected": -9.5625, + "step": 17480 + }, + { + "epoch": 0.9014302280633939, + "grad_norm": 8.536280513464822, + "learning_rate": 1.464738692337314e-08, + "logits/chosen": -3.21875, + "logits/rejected": -2.890625, + "logps/chosen": -788.0, + "logps/rejected": -1160.0, + "loss": 0.1992, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.21875, + "rewards/margins": 3.75, + "rewards/rejected": -9.9375, + "step": 17490 + }, + { + "epoch": 0.901945625563716, + "grad_norm": 8.053833994898111, + "learning_rate": 1.449606656265337e-08, + "logits/chosen": -3.28125, + "logits/rejected": -2.859375, + "logps/chosen": -804.0, + "logps/rejected": -1176.0, + "loss": 0.2087, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.6875, + "rewards/rejected": -10.0, + "step": 17500 + }, + { + "epoch": 0.9024610230640382, + "grad_norm": 8.727864705274987, + "learning_rate": 1.4345508560998643e-08, + "logits/chosen": -3.34375, + "logits/rejected": -3.0, + "logps/chosen": -788.0, + "logps/rejected": -1144.0, + "loss": 0.2245, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 3.625, + "rewards/rejected": -9.9375, + "step": 17510 + }, + { + "epoch": 0.9029764205643602, + "grad_norm": 12.9430849734967, + "learning_rate": 1.4195713405786181e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.0625, + "logps/chosen": -772.0, + "logps/rejected": -1128.0, + "loss": 0.2409, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.125, + "rewards/margins": 3.578125, + "rewards/rejected": -9.6875, + "step": 17520 + }, + { + "epoch": 0.9034918180646824, + "grad_norm": 7.7197035006596515, + "learning_rate": 1.4046681581923846e-08, + "logits/chosen": -3.40625, + "logits/rejected": -2.9375, + "logps/chosen": -808.0, + "logps/rejected": -1176.0, + "loss": 0.2345, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.34375, + "rewards/margins": 3.734375, + "rewards/rejected": -10.0625, + "step": 17530 + }, + { + "epoch": 0.9040072155650045, + "grad_norm": 7.326667179308557, + "learning_rate": 1.3898413571848317e-08, + "logits/chosen": -3.46875, + "logits/rejected": -2.90625, + "logps/chosen": -800.0, + "logps/rejected": -1160.0, + "loss": 0.2026, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.578125, + "rewards/rejected": -9.75, + "step": 17540 + }, + { + "epoch": 0.9045226130653267, + "grad_norm": 7.019974347727167, + "learning_rate": 1.375090985552388e-08, + "logits/chosen": -3.546875, + "logits/rejected": -3.203125, + "logps/chosen": -792.0, + "logps/rejected": -1080.0, + "loss": 0.2192, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.125, + "rewards/margins": 3.125, + "rewards/rejected": -9.25, + "step": 17550 + }, + { + "epoch": 0.9050380105656488, + "grad_norm": 7.191643098161281, + "learning_rate": 1.3604170910440666e-08, + "logits/chosen": -3.375, + "logits/rejected": -3.078125, + "logps/chosen": -800.0, + "logps/rejected": -1136.0, + "loss": 0.2469, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.21875, + "rewards/margins": 3.484375, + "rewards/rejected": -9.75, + "step": 17560 + }, + { + "epoch": 0.9055534080659708, + "grad_norm": 7.408593318985825, + "learning_rate": 1.3458197211613009e-08, + "logits/chosen": -3.46875, + "logits/rejected": -3.34375, + "logps/chosen": -776.0, + "logps/rejected": -1104.0, + "loss": 0.2149, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.53125, + "rewards/rejected": -9.625, + "step": 17570 + }, + { + "epoch": 0.906068805566293, + "grad_norm": 8.63236453489365, + "learning_rate": 1.3312989231578225e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.09375, + "logps/chosen": -792.0, + "logps/rejected": -1120.0, + "loss": 0.2165, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.390625, + "rewards/rejected": -9.5625, + "step": 17580 + }, + { + "epoch": 0.9065842030666151, + "grad_norm": 9.909696080185546, + "learning_rate": 1.3168547440394667e-08, + "logits/chosen": -3.46875, + "logits/rejected": -3.203125, + "logps/chosen": -836.0, + "logps/rejected": -1136.0, + "loss": 0.2359, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.46875, + "rewards/margins": 3.21875, + "rewards/rejected": -9.6875, + "step": 17590 + }, + { + "epoch": 0.9070996005669373, + "grad_norm": 6.235621919781736, + "learning_rate": 1.3024872305640644e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.0, + "logps/chosen": -776.0, + "logps/rejected": -1120.0, + "loss": 0.217, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.671875, + "rewards/rejected": -9.75, + "step": 17600 + }, + { + "epoch": 0.9076149980672594, + "grad_norm": 7.274191180993609, + "learning_rate": 1.288196429241259e-08, + "logits/chosen": -3.421875, + "logits/rejected": -3.0625, + "logps/chosen": -784.0, + "logps/rejected": -1144.0, + "loss": 0.2156, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.625, + "rewards/rejected": -9.75, + "step": 17610 + }, + { + "epoch": 0.9081303955675815, + "grad_norm": 9.644566398281546, + "learning_rate": 1.2739823863323618e-08, + "logits/chosen": -3.34375, + "logits/rejected": -2.96875, + "logps/chosen": -792.0, + "logps/rejected": -1136.0, + "loss": 0.2182, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.21875, + "rewards/margins": 3.578125, + "rewards/rejected": -9.8125, + "step": 17620 + }, + { + "epoch": 0.9086457930679036, + "grad_norm": 9.134463515892648, + "learning_rate": 1.2598451478502136e-08, + "logits/chosen": -3.21875, + "logits/rejected": -3.015625, + "logps/chosen": -824.0, + "logps/rejected": -1160.0, + "loss": 0.2298, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.4375, + "rewards/rejected": -9.8125, + "step": 17630 + }, + { + "epoch": 0.9091611905682258, + "grad_norm": 6.809205639646694, + "learning_rate": 1.2457847595590321e-08, + "logits/chosen": -3.25, + "logits/rejected": -3.015625, + "logps/chosen": -792.0, + "logps/rejected": -1152.0, + "loss": 0.215, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.3125, + "rewards/margins": 3.6875, + "rewards/rejected": -10.0, + "step": 17640 + }, + { + "epoch": 0.9096765880685479, + "grad_norm": 9.120901703649679, + "learning_rate": 1.2318012669742445e-08, + "logits/chosen": -3.5, + "logits/rejected": -3.28125, + "logps/chosen": -820.0, + "logps/rejected": -1120.0, + "loss": 0.2366, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.234375, + "rewards/rejected": -9.625, + "step": 17650 + }, + { + "epoch": 0.91019198556887, + "grad_norm": 7.688053655153568, + "learning_rate": 1.2178947153623747e-08, + "logits/chosen": -3.34375, + "logits/rejected": -3.0, + "logps/chosen": -828.0, + "logps/rejected": -1168.0, + "loss": 0.2154, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.34375, + "rewards/margins": 3.5, + "rewards/rejected": -9.8125, + "step": 17660 + }, + { + "epoch": 0.9107073830691921, + "grad_norm": 10.65044145295917, + "learning_rate": 1.2040651497408627e-08, + "logits/chosen": -3.515625, + "logits/rejected": -3.28125, + "logps/chosen": -756.0, + "logps/rejected": -1160.0, + "loss": 0.2363, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.8125, + "rewards/margins": 3.96875, + "rewards/rejected": -9.8125, + "step": 17670 + }, + { + "epoch": 0.9112227805695142, + "grad_norm": 5.803554908969874, + "learning_rate": 1.1903126148779391e-08, + "logits/chosen": -3.15625, + "logits/rejected": -2.640625, + "logps/chosen": -792.0, + "logps/rejected": -1192.0, + "loss": 0.2227, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 4.03125, + "rewards/rejected": -10.25, + "step": 17680 + }, + { + "epoch": 0.9117381780698364, + "grad_norm": 9.50017515742688, + "learning_rate": 1.176637155292487e-08, + "logits/chosen": -3.421875, + "logits/rejected": -3.015625, + "logps/chosen": -840.0, + "logps/rejected": -1160.0, + "loss": 0.2443, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.59375, + "rewards/margins": 3.40625, + "rewards/rejected": -10.0, + "step": 17690 + }, + { + "epoch": 0.9122535755701585, + "grad_norm": 9.25843634667005, + "learning_rate": 1.163038815253864e-08, + "logits/chosen": -3.21875, + "logits/rejected": -3.078125, + "logps/chosen": -788.0, + "logps/rejected": -1120.0, + "loss": 0.2329, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.09375, + "rewards/margins": 3.40625, + "rewards/rejected": -9.5, + "step": 17700 + }, + { + "epoch": 0.9127689730704807, + "grad_norm": 9.380152626503747, + "learning_rate": 1.1495176387817995e-08, + "logits/chosen": -3.4375, + "logits/rejected": -3.1875, + "logps/chosen": -800.0, + "logps/rejected": -1144.0, + "loss": 0.2345, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.28125, + "rewards/margins": 3.578125, + "rewards/rejected": -9.875, + "step": 17710 + }, + { + "epoch": 0.9132843705708027, + "grad_norm": 7.320508327505253, + "learning_rate": 1.1360736696462232e-08, + "logits/chosen": -3.328125, + "logits/rejected": -3.25, + "logps/chosen": -824.0, + "logps/rejected": -1152.0, + "loss": 0.2125, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.25, + "rewards/margins": 3.59375, + "rewards/rejected": -9.8125, + "step": 17720 + }, + { + "epoch": 0.9137997680711248, + "grad_norm": 7.680608112384954, + "learning_rate": 1.122706951367136e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.078125, + "logps/chosen": -808.0, + "logps/rejected": -1152.0, + "loss": 0.2462, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.28125, + "rewards/margins": 3.640625, + "rewards/rejected": -9.875, + "step": 17730 + }, + { + "epoch": 0.914315165571447, + "grad_norm": 9.860655279134788, + "learning_rate": 1.1094175272144762e-08, + "logits/chosen": -3.203125, + "logits/rejected": -2.96875, + "logps/chosen": -800.0, + "logps/rejected": -1128.0, + "loss": 0.2219, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.21875, + "rewards/margins": 3.5, + "rewards/rejected": -9.75, + "step": 17740 + }, + { + "epoch": 0.9148305630717691, + "grad_norm": 8.119604758823328, + "learning_rate": 1.0962054402079562e-08, + "logits/chosen": -3.359375, + "logits/rejected": -3.046875, + "logps/chosen": -788.0, + "logps/rejected": -1152.0, + "loss": 0.2275, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.9375, + "rewards/margins": 3.796875, + "rewards/rejected": -9.75, + "step": 17750 + }, + { + "epoch": 0.9153459605720913, + "grad_norm": 9.962079108694683, + "learning_rate": 1.0830707331169448e-08, + "logits/chosen": -3.3125, + "logits/rejected": -2.96875, + "logps/chosen": -816.0, + "logps/rejected": -1160.0, + "loss": 0.2125, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.4375, + "rewards/margins": 3.578125, + "rewards/rejected": -10.0, + "step": 17760 + }, + { + "epoch": 0.9158613580724133, + "grad_norm": 11.403971366957325, + "learning_rate": 1.070013448460319e-08, + "logits/chosen": -3.234375, + "logits/rejected": -2.953125, + "logps/chosen": -808.0, + "logps/rejected": -1136.0, + "loss": 0.2305, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.34375, + "rewards/margins": 3.59375, + "rewards/rejected": -9.9375, + "step": 17770 + }, + { + "epoch": 0.9163767555727355, + "grad_norm": 9.11553956769839, + "learning_rate": 1.057033628506332e-08, + "logits/chosen": -3.25, + "logits/rejected": -2.953125, + "logps/chosen": -780.0, + "logps/rejected": -1136.0, + "loss": 0.2265, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.28125, + "rewards/margins": 3.59375, + "rewards/rejected": -9.875, + "step": 17780 + }, + { + "epoch": 0.9168921530730576, + "grad_norm": 7.251568073154336, + "learning_rate": 1.0441313152724674e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.015625, + "logps/chosen": -800.0, + "logps/rejected": -1160.0, + "loss": 0.2465, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.984375, + "rewards/rejected": -10.125, + "step": 17790 + }, + { + "epoch": 0.9174075505733797, + "grad_norm": 10.025812270402586, + "learning_rate": 1.0313065505253183e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.15625, + "logps/chosen": -792.0, + "logps/rejected": -1120.0, + "loss": 0.211, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.34375, + "rewards/margins": 3.375, + "rewards/rejected": -9.6875, + "step": 17800 + }, + { + "epoch": 0.9179229480737019, + "grad_norm": 7.907577922723818, + "learning_rate": 1.0185593757804283e-08, + "logits/chosen": -3.3125, + "logits/rejected": -3.015625, + "logps/chosen": -776.0, + "logps/rejected": -1152.0, + "loss": 0.2368, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.9375, + "rewards/margins": 4.0625, + "rewards/rejected": -10.0, + "step": 17810 + }, + { + "epoch": 0.9184383455740239, + "grad_norm": 7.443304356991495, + "learning_rate": 1.0058898323021869e-08, + "logits/chosen": -3.390625, + "logits/rejected": -3.109375, + "logps/chosen": -804.0, + "logps/rejected": -1168.0, + "loss": 0.2253, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.1875, + "rewards/margins": 3.828125, + "rewards/rejected": -10.0, + "step": 17820 + }, + { + "epoch": 0.9189537430743461, + "grad_norm": 9.748743613785974, + "learning_rate": 9.932979611036618e-09, + "logits/chosen": -3.390625, + "logits/rejected": -3.046875, + "logps/chosen": -808.0, + "logps/rejected": -1184.0, + "loss": 0.2157, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.46875, + "rewards/margins": 3.71875, + "rewards/rejected": -10.1875, + "step": 17830 + }, + { + "epoch": 0.9194691405746682, + "grad_norm": 7.6543335837692785, + "learning_rate": 9.807838029465054e-09, + "logits/chosen": -3.328125, + "logits/rejected": -3.0625, + "logps/chosen": -816.0, + "logps/rejected": -1160.0, + "loss": 0.2234, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.375, + "rewards/margins": 3.546875, + "rewards/rejected": -9.9375, + "step": 17840 + }, + { + "epoch": 0.9199845380749904, + "grad_norm": 9.377511963803602, + "learning_rate": 9.683473983407909e-09, + "logits/chosen": -3.53125, + "logits/rejected": -3.28125, + "logps/chosen": -824.0, + "logps/rejected": -1160.0, + "loss": 0.2402, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.4375, + "rewards/margins": 3.4375, + "rewards/rejected": -9.875, + "step": 17850 + }, + { + "epoch": 0.9204999355753125, + "grad_norm": 9.100648345230594, + "learning_rate": 9.559887875448897e-09, + "logits/chosen": -3.28125, + "logits/rejected": -3.125, + "logps/chosen": -796.0, + "logps/rejected": -1144.0, + "loss": 0.2439, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.28125, + "rewards/margins": 3.609375, + "rewards/rejected": -9.875, + "step": 17860 + }, + { + "epoch": 0.9210153330756345, + "grad_norm": 9.49555727564418, + "learning_rate": 9.437080105653527e-09, + "logits/chosen": -3.21875, + "logits/rejected": -2.90625, + "logps/chosen": -780.0, + "logps/rejected": -1152.0, + "loss": 0.2371, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.8125, + "rewards/rejected": -9.9375, + "step": 17870 + }, + { + "epoch": 0.9215307305759567, + "grad_norm": 11.291407795885535, + "learning_rate": 9.315051071567603e-09, + "logits/chosen": -3.171875, + "logits/rejected": -2.953125, + "logps/chosen": -820.0, + "logps/rejected": -1160.0, + "loss": 0.2307, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.375, + "rewards/margins": 3.546875, + "rewards/rejected": -9.9375, + "step": 17880 + }, + { + "epoch": 0.9220461280762788, + "grad_norm": 9.22325688473457, + "learning_rate": 9.193801168216165e-09, + "logits/chosen": -3.28125, + "logits/rejected": -2.90625, + "logps/chosen": -788.0, + "logps/rejected": -1152.0, + "loss": 0.2263, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.125, + "rewards/margins": 3.625, + "rewards/rejected": -9.75, + "step": 17890 + }, + { + "epoch": 0.922561525576601, + "grad_norm": 7.630590907486966, + "learning_rate": 9.073330788102101e-09, + "logits/chosen": -3.375, + "logits/rejected": -2.859375, + "logps/chosen": -800.0, + "logps/rejected": -1176.0, + "loss": 0.2177, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.890625, + "rewards/rejected": -10.0625, + "step": 17900 + }, + { + "epoch": 0.9230769230769231, + "grad_norm": 9.651086433952166, + "learning_rate": 8.953640321204742e-09, + "logits/chosen": -3.328125, + "logits/rejected": -3.046875, + "logps/chosen": -800.0, + "logps/rejected": -1136.0, + "loss": 0.226, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.3125, + "rewards/margins": 3.359375, + "rewards/rejected": -9.6875, + "step": 17910 + }, + { + "epoch": 0.9235923205772452, + "grad_norm": 9.332621251335214, + "learning_rate": 8.834730154978875e-09, + "logits/chosen": -3.171875, + "logits/rejected": -2.90625, + "logps/chosen": -788.0, + "logps/rejected": -1184.0, + "loss": 0.2179, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.15625, + "rewards/margins": 4.09375, + "rewards/rejected": -10.25, + "step": 17920 + }, + { + "epoch": 0.9241077180775673, + "grad_norm": 8.820671677201197, + "learning_rate": 8.71660067435323e-09, + "logits/chosen": -3.359375, + "logits/rejected": -3.09375, + "logps/chosen": -792.0, + "logps/rejected": -1144.0, + "loss": 0.2568, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.59375, + "rewards/rejected": -9.75, + "step": 17930 + }, + { + "epoch": 0.9246231155778895, + "grad_norm": 9.897399197267081, + "learning_rate": 8.599252261729444e-09, + "logits/chosen": -3.375, + "logits/rejected": -3.03125, + "logps/chosen": -836.0, + "logps/rejected": -1144.0, + "loss": 0.2534, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.625, + "rewards/margins": 3.234375, + "rewards/rejected": -9.875, + "step": 17940 + }, + { + "epoch": 0.9251385130782116, + "grad_norm": 7.42664786371187, + "learning_rate": 8.48268529698068e-09, + "logits/chosen": -3.359375, + "logits/rejected": -3.1875, + "logps/chosen": -804.0, + "logps/rejected": -1184.0, + "loss": 0.2204, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 3.578125, + "rewards/rejected": -9.875, + "step": 17950 + }, + { + "epoch": 0.9256539105785337, + "grad_norm": 5.8580145451782615, + "learning_rate": 8.366900157450479e-09, + "logits/chosen": -3.375, + "logits/rejected": -2.953125, + "logps/chosen": -784.0, + "logps/rejected": -1120.0, + "loss": 0.2283, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.6875, + "rewards/rejected": -9.75, + "step": 17960 + }, + { + "epoch": 0.9261693080788558, + "grad_norm": 8.939285555638733, + "learning_rate": 8.25189721795147e-09, + "logits/chosen": -3.21875, + "logits/rejected": -3.171875, + "logps/chosen": -812.0, + "logps/rejected": -1136.0, + "loss": 0.2332, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.515625, + "rewards/rejected": -9.8125, + "step": 17970 + }, + { + "epoch": 0.9266847055791779, + "grad_norm": 5.983964476254887, + "learning_rate": 8.137676850764136e-09, + "logits/chosen": -3.5, + "logits/rejected": -3.03125, + "logps/chosen": -796.0, + "logps/rejected": -1184.0, + "loss": 0.2173, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.515625, + "rewards/rejected": -9.6875, + "step": 17980 + }, + { + "epoch": 0.9272001030795001, + "grad_norm": 9.103348066081745, + "learning_rate": 8.024239425635793e-09, + "logits/chosen": -3.296875, + "logits/rejected": -3.03125, + "logps/chosen": -828.0, + "logps/rejected": -1144.0, + "loss": 0.2354, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.40625, + "rewards/margins": 3.375, + "rewards/rejected": -9.8125, + "step": 17990 + }, + { + "epoch": 0.9277155005798222, + "grad_norm": 9.200084614374513, + "learning_rate": 7.9115853097792e-09, + "logits/chosen": -3.15625, + "logits/rejected": -2.828125, + "logps/chosen": -804.0, + "logps/rejected": -1144.0, + "loss": 0.2283, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.3125, + "rewards/margins": 3.5, + "rewards/rejected": -9.8125, + "step": 18000 + }, + { + "epoch": 0.9282308980801444, + "grad_norm": 11.935279490576931, + "learning_rate": 7.799714867871427e-09, + "logits/chosen": -3.1875, + "logits/rejected": -2.8125, + "logps/chosen": -824.0, + "logps/rejected": -1160.0, + "loss": 0.22, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.28125, + "rewards/margins": 3.796875, + "rewards/rejected": -10.0625, + "step": 18010 + }, + { + "epoch": 0.9287462955804664, + "grad_norm": 6.724671506174106, + "learning_rate": 7.688628462052731e-09, + "logits/chosen": -3.296875, + "logits/rejected": -3.078125, + "logps/chosen": -812.0, + "logps/rejected": -1184.0, + "loss": 0.2214, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.3125, + "rewards/margins": 3.859375, + "rewards/rejected": -10.1875, + "step": 18020 + }, + { + "epoch": 0.9292616930807885, + "grad_norm": 10.692763926076172, + "learning_rate": 7.578326451925188e-09, + "logits/chosen": -3.28125, + "logits/rejected": -3.109375, + "logps/chosen": -840.0, + "logps/rejected": -1120.0, + "loss": 0.2354, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.5, + "rewards/margins": 3.125, + "rewards/rejected": -9.625, + "step": 18030 + }, + { + "epoch": 0.9297770905811107, + "grad_norm": 9.446389864893394, + "learning_rate": 7.468809194551867e-09, + "logits/chosen": -3.3125, + "logits/rejected": -2.9375, + "logps/chosen": -836.0, + "logps/rejected": -1160.0, + "loss": 0.2563, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.5625, + "rewards/margins": 3.40625, + "rewards/rejected": -9.9375, + "step": 18040 + }, + { + "epoch": 0.9302924880814328, + "grad_norm": 8.507201949361965, + "learning_rate": 7.360077044455349e-09, + "logits/chosen": -3.40625, + "logits/rejected": -3.078125, + "logps/chosen": -772.0, + "logps/rejected": -1112.0, + "loss": 0.2297, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.65625, + "rewards/rejected": -9.6875, + "step": 18050 + }, + { + "epoch": 0.930807885581755, + "grad_norm": 6.146763465303966, + "learning_rate": 7.252130353616825e-09, + "logits/chosen": -3.25, + "logits/rejected": -3.171875, + "logps/chosen": -848.0, + "logps/rejected": -1152.0, + "loss": 0.2233, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.4375, + "rewards/margins": 3.296875, + "rewards/rejected": -9.75, + "step": 18060 + }, + { + "epoch": 0.931323283082077, + "grad_norm": 9.111616389607825, + "learning_rate": 7.1449694714747765e-09, + "logits/chosen": -3.328125, + "logits/rejected": -3.046875, + "logps/chosen": -820.0, + "logps/rejected": -1136.0, + "loss": 0.2325, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.4375, + "rewards/margins": 3.359375, + "rewards/rejected": -9.8125, + "step": 18070 + }, + { + "epoch": 0.9318386805823992, + "grad_norm": 9.997950062296903, + "learning_rate": 7.038594744923854e-09, + "logits/chosen": -3.234375, + "logits/rejected": -2.921875, + "logps/chosen": -828.0, + "logps/rejected": -1200.0, + "loss": 0.2215, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.3125, + "rewards/margins": 3.90625, + "rewards/rejected": -10.25, + "step": 18080 + }, + { + "epoch": 0.9323540780827213, + "grad_norm": 9.494852532253427, + "learning_rate": 6.933006518313911e-09, + "logits/chosen": -3.28125, + "logits/rejected": -2.984375, + "logps/chosen": -820.0, + "logps/rejected": -1168.0, + "loss": 0.2294, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.25, + "rewards/margins": 3.625, + "rewards/rejected": -9.875, + "step": 18090 + }, + { + "epoch": 0.9328694755830435, + "grad_norm": 10.686623321911082, + "learning_rate": 6.828205133448755e-09, + "logits/chosen": -3.34375, + "logits/rejected": -2.96875, + "logps/chosen": -804.0, + "logps/rejected": -1160.0, + "loss": 0.2328, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.21875, + "rewards/margins": 3.65625, + "rewards/rejected": -9.875, + "step": 18100 + }, + { + "epoch": 0.9333848730833656, + "grad_norm": 8.090462519721981, + "learning_rate": 6.724190929585094e-09, + "logits/chosen": -3.375, + "logits/rejected": -3.0, + "logps/chosen": -796.0, + "logps/rejected": -1176.0, + "loss": 0.2139, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.84375, + "rewards/rejected": -10.125, + "step": 18110 + }, + { + "epoch": 0.9339002705836876, + "grad_norm": 12.538062573995912, + "learning_rate": 6.62096424343142e-09, + "logits/chosen": -3.21875, + "logits/rejected": -2.953125, + "logps/chosen": -820.0, + "logps/rejected": -1152.0, + "loss": 0.2256, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -6.25, + "rewards/margins": 3.625, + "rewards/rejected": -9.875, + "step": 18120 + }, + { + "epoch": 0.9344156680840098, + "grad_norm": 11.065589379709989, + "learning_rate": 6.51852540914688e-09, + "logits/chosen": -3.125, + "logits/rejected": -3.0, + "logps/chosen": -824.0, + "logps/rejected": -1104.0, + "loss": 0.2274, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.65625, + "rewards/margins": 3.03125, + "rewards/rejected": -9.6875, + "step": 18130 + }, + { + "epoch": 0.9349310655843319, + "grad_norm": 6.454483148471528, + "learning_rate": 6.416874758340241e-09, + "logits/chosen": -3.1875, + "logits/rejected": -3.03125, + "logps/chosen": -816.0, + "logps/rejected": -1160.0, + "loss": 0.2332, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.25, + "rewards/margins": 3.8125, + "rewards/rejected": -10.0625, + "step": 18140 + }, + { + "epoch": 0.9354464630846541, + "grad_norm": 9.565006391256288, + "learning_rate": 6.316012620068867e-09, + "logits/chosen": -3.21875, + "logits/rejected": -2.9375, + "logps/chosen": -792.0, + "logps/rejected": -1136.0, + "loss": 0.2342, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.28125, + "rewards/margins": 3.65625, + "rewards/rejected": -9.9375, + "step": 18150 + }, + { + "epoch": 0.9359618605849762, + "grad_norm": 10.073913635364953, + "learning_rate": 6.2159393208375545e-09, + "logits/chosen": -3.375, + "logits/rejected": -2.953125, + "logps/chosen": -808.0, + "logps/rejected": -1144.0, + "loss": 0.2275, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.5, + "rewards/margins": 3.296875, + "rewards/rejected": -9.8125, + "step": 18160 + }, + { + "epoch": 0.9364772580852982, + "grad_norm": 10.193129767658721, + "learning_rate": 6.116655184597557e-09, + "logits/chosen": -3.296875, + "logits/rejected": -3.078125, + "logps/chosen": -844.0, + "logps/rejected": -1208.0, + "loss": 0.2426, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.5, + "rewards/margins": 3.484375, + "rewards/rejected": -10.0, + "step": 18170 + }, + { + "epoch": 0.9369926555856204, + "grad_norm": 9.450808612952862, + "learning_rate": 6.0181605327453975e-09, + "logits/chosen": -3.25, + "logits/rejected": -3.0, + "logps/chosen": -796.0, + "logps/rejected": -1168.0, + "loss": 0.2342, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.3125, + "rewards/margins": 3.703125, + "rewards/rejected": -10.0, + "step": 18180 + }, + { + "epoch": 0.9375080530859425, + "grad_norm": 8.06383816578868, + "learning_rate": 5.920455684122027e-09, + "logits/chosen": -3.3125, + "logits/rejected": -3.125, + "logps/chosen": -828.0, + "logps/rejected": -1152.0, + "loss": 0.2185, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.46875, + "rewards/margins": 3.296875, + "rewards/rejected": -9.75, + "step": 18190 + }, + { + "epoch": 0.9380234505862647, + "grad_norm": 10.655783728999955, + "learning_rate": 5.823540955011669e-09, + "logits/chosen": -3.53125, + "logits/rejected": -3.1875, + "logps/chosen": -776.0, + "logps/rejected": -1168.0, + "loss": 0.228, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.9375, + "rewards/margins": 4.09375, + "rewards/rejected": -10.0625, + "step": 18200 + }, + { + "epoch": 0.9385388480865868, + "grad_norm": 8.143627496812147, + "learning_rate": 5.727416659140783e-09, + "logits/chosen": -3.421875, + "logits/rejected": -2.96875, + "logps/chosen": -808.0, + "logps/rejected": -1184.0, + "loss": 0.2411, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.34375, + "rewards/margins": 3.71875, + "rewards/rejected": -10.0625, + "step": 18210 + }, + { + "epoch": 0.9390542455869089, + "grad_norm": 10.67094084735988, + "learning_rate": 5.632083107677183e-09, + "logits/chosen": -3.296875, + "logits/rejected": -2.984375, + "logps/chosen": -808.0, + "logps/rejected": -1144.0, + "loss": 0.2465, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.34375, + "rewards/margins": 3.5625, + "rewards/rejected": -9.875, + "step": 18220 + }, + { + "epoch": 0.939569643087231, + "grad_norm": 7.614519012669045, + "learning_rate": 5.537540609228786e-09, + "logits/chosen": -3.140625, + "logits/rejected": -2.96875, + "logps/chosen": -812.0, + "logps/rejected": -1120.0, + "loss": 0.2352, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.40625, + "rewards/margins": 3.21875, + "rewards/rejected": -9.625, + "step": 18230 + }, + { + "epoch": 0.9400850405875532, + "grad_norm": 8.022616360747097, + "learning_rate": 5.443789469842919e-09, + "logits/chosen": -3.109375, + "logits/rejected": -2.90625, + "logps/chosen": -776.0, + "logps/rejected": -1136.0, + "loss": 0.2213, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.03125, + "rewards/margins": 3.546875, + "rewards/rejected": -9.625, + "step": 18240 + }, + { + "epoch": 0.9406004380878753, + "grad_norm": 9.36601638189685, + "learning_rate": 5.350829993005012e-09, + "logits/chosen": -3.265625, + "logits/rejected": -3.015625, + "logps/chosen": -804.0, + "logps/rejected": -1152.0, + "loss": 0.2259, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.25, + "rewards/margins": 3.625, + "rewards/rejected": -9.875, + "step": 18250 + }, + { + "epoch": 0.9411158355881974, + "grad_norm": 7.98782413003141, + "learning_rate": 5.258662479637882e-09, + "logits/chosen": -3.265625, + "logits/rejected": -3.046875, + "logps/chosen": -812.0, + "logps/rejected": -1160.0, + "loss": 0.23, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.46875, + "rewards/margins": 3.5625, + "rewards/rejected": -10.0, + "step": 18260 + }, + { + "epoch": 0.9416312330885195, + "grad_norm": 8.840497918321564, + "learning_rate": 5.167287228100669e-09, + "logits/chosen": -3.203125, + "logits/rejected": -2.890625, + "logps/chosen": -800.0, + "logps/rejected": -1152.0, + "loss": 0.2176, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.4375, + "rewards/margins": 3.5625, + "rewards/rejected": -10.0, + "step": 18270 + }, + { + "epoch": 0.9421466305888416, + "grad_norm": 8.03038577633172, + "learning_rate": 5.076704534187709e-09, + "logits/chosen": -3.375, + "logits/rejected": -3.109375, + "logps/chosen": -804.0, + "logps/rejected": -1144.0, + "loss": 0.224, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.25, + "rewards/margins": 3.40625, + "rewards/rejected": -9.625, + "step": 18280 + }, + { + "epoch": 0.9426620280891638, + "grad_norm": 9.421428039568658, + "learning_rate": 4.986914691127913e-09, + "logits/chosen": -3.21875, + "logits/rejected": -2.984375, + "logps/chosen": -824.0, + "logps/rejected": -1152.0, + "loss": 0.2079, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.40625, + "rewards/margins": 3.640625, + "rewards/rejected": -10.0625, + "step": 18290 + }, + { + "epoch": 0.9431774255894859, + "grad_norm": 8.816984012824447, + "learning_rate": 4.897917989583417e-09, + "logits/chosen": -3.21875, + "logits/rejected": -2.921875, + "logps/chosen": -832.0, + "logps/rejected": -1184.0, + "loss": 0.2219, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.890625, + "rewards/rejected": -10.25, + "step": 18300 + }, + { + "epoch": 0.9436928230898081, + "grad_norm": 8.942780880575647, + "learning_rate": 4.809714717649016e-09, + "logits/chosen": -3.328125, + "logits/rejected": -3.078125, + "logps/chosen": -808.0, + "logps/rejected": -1192.0, + "loss": 0.24, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.4375, + "rewards/margins": 3.84375, + "rewards/rejected": -10.25, + "step": 18310 + }, + { + "epoch": 0.9442082205901301, + "grad_norm": 10.699394528873716, + "learning_rate": 4.7223051608509814e-09, + "logits/chosen": -3.375, + "logits/rejected": -3.125, + "logps/chosen": -804.0, + "logps/rejected": -1152.0, + "loss": 0.2191, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.1875, + "rewards/margins": 3.46875, + "rewards/rejected": -9.625, + "step": 18320 + }, + { + "epoch": 0.9447236180904522, + "grad_norm": 12.189844430784188, + "learning_rate": 4.63568960214622e-09, + "logits/chosen": -3.421875, + "logits/rejected": -2.96875, + "logps/chosen": -800.0, + "logps/rejected": -1144.0, + "loss": 0.2162, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.34375, + "rewards/margins": 3.46875, + "rewards/rejected": -9.8125, + "step": 18330 + }, + { + "epoch": 0.9452390155907744, + "grad_norm": 12.10199859328196, + "learning_rate": 4.549868321921418e-09, + "logits/chosen": -3.234375, + "logits/rejected": -3.0, + "logps/chosen": -800.0, + "logps/rejected": -1144.0, + "loss": 0.231, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.703125, + "rewards/rejected": -9.875, + "step": 18340 + }, + { + "epoch": 0.9457544130910965, + "grad_norm": 10.061444258729104, + "learning_rate": 4.4648415979919865e-09, + "logits/chosen": -3.234375, + "logits/rejected": -3.078125, + "logps/chosen": -804.0, + "logps/rejected": -1144.0, + "loss": 0.2202, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.3125, + "rewards/margins": 3.453125, + "rewards/rejected": -9.75, + "step": 18350 + }, + { + "epoch": 0.9462698105914187, + "grad_norm": 7.742979390197653, + "learning_rate": 4.380609705601279e-09, + "logits/chosen": -3.390625, + "logits/rejected": -3.078125, + "logps/chosen": -748.0, + "logps/rejected": -1168.0, + "loss": 0.206, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.78125, + "rewards/margins": 4.28125, + "rewards/rejected": -10.0625, + "step": 18360 + }, + { + "epoch": 0.9467852080917407, + "grad_norm": 7.3664217734691295, + "learning_rate": 4.2971729174197116e-09, + "logits/chosen": -3.203125, + "logits/rejected": -2.8125, + "logps/chosen": -776.0, + "logps/rejected": -1160.0, + "loss": 0.2254, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.15625, + "rewards/margins": 3.9375, + "rewards/rejected": -10.0625, + "step": 18370 + }, + { + "epoch": 0.9473006055920629, + "grad_norm": 6.741930895293787, + "learning_rate": 4.214531503543728e-09, + "logits/chosen": -3.5, + "logits/rejected": -3.28125, + "logps/chosen": -820.0, + "logps/rejected": -1192.0, + "loss": 0.2435, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.4375, + "rewards/margins": 3.703125, + "rewards/rejected": -10.125, + "step": 18380 + }, + { + "epoch": 0.947816003092385, + "grad_norm": 6.95464241082, + "learning_rate": 4.132685731495194e-09, + "logits/chosen": -3.1875, + "logits/rejected": -2.8125, + "logps/chosen": -812.0, + "logps/rejected": -1096.0, + "loss": 0.2119, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.375, + "rewards/margins": 3.171875, + "rewards/rejected": -9.5, + "step": 18390 + }, + { + "epoch": 0.9483314005927072, + "grad_norm": 10.877730799283848, + "learning_rate": 4.051635866220232e-09, + "logits/chosen": -3.21875, + "logits/rejected": -2.8125, + "logps/chosen": -836.0, + "logps/rejected": -1192.0, + "loss": 0.233, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.46875, + "rewards/margins": 3.5625, + "rewards/rejected": -10.0625, + "step": 18400 + }, + { + "epoch": 0.9488467980930293, + "grad_norm": 9.48507045203186, + "learning_rate": 3.97138217008855e-09, + "logits/chosen": -3.265625, + "logits/rejected": -2.984375, + "logps/chosen": -824.0, + "logps/rejected": -1176.0, + "loss": 0.2236, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.53125, + "rewards/margins": 3.671875, + "rewards/rejected": -10.1875, + "step": 18410 + }, + { + "epoch": 0.9493621955933513, + "grad_norm": 6.571485332688373, + "learning_rate": 3.891924902892585e-09, + "logits/chosen": -3.21875, + "logits/rejected": -3.0625, + "logps/chosen": -776.0, + "logps/rejected": -1136.0, + "loss": 0.2192, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.09375, + "rewards/margins": 3.84375, + "rewards/rejected": -9.9375, + "step": 18420 + }, + { + "epoch": 0.9498775930936735, + "grad_norm": 9.138314392872017, + "learning_rate": 3.81326432184656e-09, + "logits/chosen": -3.109375, + "logits/rejected": -2.953125, + "logps/chosen": -784.0, + "logps/rejected": -1176.0, + "loss": 0.2283, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.90625, + "rewards/rejected": -10.0625, + "step": 18430 + }, + { + "epoch": 0.9503929905939956, + "grad_norm": 8.286413755391731, + "learning_rate": 3.73540068158576e-09, + "logits/chosen": -3.296875, + "logits/rejected": -2.953125, + "logps/chosen": -776.0, + "logps/rejected": -1152.0, + "loss": 0.2355, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.25, + "rewards/margins": 3.59375, + "rewards/rejected": -9.8125, + "step": 18440 + }, + { + "epoch": 0.9509083880943178, + "grad_norm": 10.940750708240778, + "learning_rate": 3.658334234165644e-09, + "logits/chosen": -3.15625, + "logits/rejected": -3.03125, + "logps/chosen": -788.0, + "logps/rejected": -1184.0, + "loss": 0.2399, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.953125, + "rewards/rejected": -10.0, + "step": 18450 + }, + { + "epoch": 0.9514237855946399, + "grad_norm": 9.25311598065719, + "learning_rate": 3.582065229061043e-09, + "logits/chosen": -3.328125, + "logits/rejected": -2.921875, + "logps/chosen": -824.0, + "logps/rejected": -1128.0, + "loss": 0.218, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.34375, + "rewards/margins": 3.140625, + "rewards/rejected": -9.5, + "step": 18460 + }, + { + "epoch": 0.951939183094962, + "grad_norm": 8.218955392546395, + "learning_rate": 3.506593913165323e-09, + "logits/chosen": -3.34375, + "logits/rejected": -3.125, + "logps/chosen": -776.0, + "logps/rejected": -1128.0, + "loss": 0.2405, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.734375, + "rewards/rejected": -9.75, + "step": 18470 + }, + { + "epoch": 0.9524545805952841, + "grad_norm": 7.734453996280383, + "learning_rate": 3.4319205307897225e-09, + "logits/chosen": -3.421875, + "logits/rejected": -3.0625, + "logps/chosen": -776.0, + "logps/rejected": -1144.0, + "loss": 0.2198, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.15625, + "rewards/margins": 3.78125, + "rewards/rejected": -9.9375, + "step": 18480 + }, + { + "epoch": 0.9529699780956062, + "grad_norm": 9.87080665780316, + "learning_rate": 3.3580453236623507e-09, + "logits/chosen": -3.390625, + "logits/rejected": -2.96875, + "logps/chosen": -800.0, + "logps/rejected": -1160.0, + "loss": 0.2365, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.6875, + "rewards/rejected": -10.0625, + "step": 18490 + }, + { + "epoch": 0.9534853755959284, + "grad_norm": 8.85379187277789, + "learning_rate": 3.284968530927551e-09, + "logits/chosen": -3.328125, + "logits/rejected": -2.96875, + "logps/chosen": -804.0, + "logps/rejected": -1160.0, + "loss": 0.2091, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 3.71875, + "rewards/rejected": -10.0625, + "step": 18500 + }, + { + "epoch": 0.9540007730962505, + "grad_norm": 7.646825527475255, + "learning_rate": 3.212690389145095e-09, + "logits/chosen": -3.34375, + "logits/rejected": -2.984375, + "logps/chosen": -812.0, + "logps/rejected": -1152.0, + "loss": 0.2143, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.5, + "rewards/margins": 3.453125, + "rewards/rejected": -9.9375, + "step": 18510 + }, + { + "epoch": 0.9545161705965726, + "grad_norm": 7.5732416575262995, + "learning_rate": 3.1412111322894064e-09, + "logits/chosen": -3.34375, + "logits/rejected": -3.0625, + "logps/chosen": -812.0, + "logps/rejected": -1152.0, + "loss": 0.2104, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.3125, + "rewards/margins": 3.546875, + "rewards/rejected": -9.875, + "step": 18520 + }, + { + "epoch": 0.9550315680968947, + "grad_norm": 7.555149737535175, + "learning_rate": 3.0705309917488098e-09, + "logits/chosen": -3.375, + "logits/rejected": -3.171875, + "logps/chosen": -828.0, + "logps/rejected": -1144.0, + "loss": 0.2311, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.46875, + "rewards/margins": 3.40625, + "rewards/rejected": -9.875, + "step": 18530 + }, + { + "epoch": 0.9555469655972169, + "grad_norm": 8.493376163073588, + "learning_rate": 3.0006501963247555e-09, + "logits/chosen": -3.25, + "logits/rejected": -3.09375, + "logps/chosen": -800.0, + "logps/rejected": -1136.0, + "loss": 0.2218, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.65625, + "rewards/rejected": -9.75, + "step": 18540 + }, + { + "epoch": 0.956062363097539, + "grad_norm": 6.969274897933996, + "learning_rate": 2.9315689722311243e-09, + "logits/chosen": -3.34375, + "logits/rejected": -3.09375, + "logps/chosen": -840.0, + "logps/rejected": -1160.0, + "loss": 0.2179, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.5, + "rewards/margins": 3.46875, + "rewards/rejected": -9.9375, + "step": 18550 + }, + { + "epoch": 0.9565777605978611, + "grad_norm": 9.057520623944004, + "learning_rate": 2.8632875430935067e-09, + "logits/chosen": -3.078125, + "logits/rejected": -2.921875, + "logps/chosen": -840.0, + "logps/rejected": -1136.0, + "loss": 0.2229, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.5, + "rewards/margins": 3.25, + "rewards/rejected": -9.75, + "step": 18560 + }, + { + "epoch": 0.9570931580981832, + "grad_norm": 7.440645924417364, + "learning_rate": 2.7958061299483695e-09, + "logits/chosen": -3.09375, + "logits/rejected": -2.71875, + "logps/chosen": -816.0, + "logps/rejected": -1184.0, + "loss": 0.2312, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.40625, + "rewards/margins": 3.734375, + "rewards/rejected": -10.125, + "step": 18570 + }, + { + "epoch": 0.9576085555985053, + "grad_norm": 10.287841803596326, + "learning_rate": 2.729124951242473e-09, + "logits/chosen": -3.203125, + "logits/rejected": -2.890625, + "logps/chosen": -804.0, + "logps/rejected": -1152.0, + "loss": 0.2395, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.375, + "rewards/margins": 3.53125, + "rewards/rejected": -9.875, + "step": 18580 + }, + { + "epoch": 0.9581239530988275, + "grad_norm": 6.451004716213239, + "learning_rate": 2.6632442228320673e-09, + "logits/chosen": -3.34375, + "logits/rejected": -3.109375, + "logps/chosen": -856.0, + "logps/rejected": -1176.0, + "loss": 0.2267, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.59375, + "rewards/margins": 3.5, + "rewards/rejected": -10.0625, + "step": 18590 + }, + { + "epoch": 0.9586393505991496, + "grad_norm": 7.568514200865403, + "learning_rate": 2.598164157982252e-09, + "logits/chosen": -3.40625, + "logits/rejected": -3.109375, + "logps/chosen": -808.0, + "logps/rejected": -1168.0, + "loss": 0.2208, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.796875, + "rewards/rejected": -9.9375, + "step": 18600 + }, + { + "epoch": 0.9591547480994718, + "grad_norm": 6.9746096800638036, + "learning_rate": 2.53388496736634e-09, + "logits/chosen": -3.40625, + "logits/rejected": -3.109375, + "logps/chosen": -820.0, + "logps/rejected": -1160.0, + "loss": 0.2463, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.453125, + "rewards/rejected": -9.8125, + "step": 18610 + }, + { + "epoch": 0.9596701455997938, + "grad_norm": 7.390430582655483, + "learning_rate": 2.4704068590649385e-09, + "logits/chosen": -3.28125, + "logits/rejected": -2.96875, + "logps/chosen": -816.0, + "logps/rejected": -1168.0, + "loss": 0.2176, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.0625, + "rewards/margins": 3.796875, + "rewards/rejected": -9.875, + "step": 18620 + }, + { + "epoch": 0.960185543100116, + "grad_norm": 8.011973084616239, + "learning_rate": 2.407730038565592e-09, + "logits/chosen": -3.046875, + "logits/rejected": -2.875, + "logps/chosen": -812.0, + "logps/rejected": -1128.0, + "loss": 0.2261, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.4375, + "rewards/margins": 3.09375, + "rewards/rejected": -9.5, + "step": 18630 + }, + { + "epoch": 0.9607009406004381, + "grad_norm": 5.8772580173671765, + "learning_rate": 2.345854708761863e-09, + "logits/chosen": -3.1875, + "logits/rejected": -2.9375, + "logps/chosen": -804.0, + "logps/rejected": -1144.0, + "loss": 0.2413, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.3125, + "rewards/margins": 3.703125, + "rewards/rejected": -10.0625, + "step": 18640 + }, + { + "epoch": 0.9612163381007602, + "grad_norm": 9.22251551840012, + "learning_rate": 2.2847810699528623e-09, + "logits/chosen": -3.1875, + "logits/rejected": -2.984375, + "logps/chosen": -816.0, + "logps/rejected": -1176.0, + "loss": 0.214, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.6875, + "rewards/rejected": -9.8125, + "step": 18650 + }, + { + "epoch": 0.9617317356010824, + "grad_norm": 8.260176272667062, + "learning_rate": 2.2245093198424714e-09, + "logits/chosen": -3.1875, + "logits/rejected": -2.921875, + "logps/chosen": -800.0, + "logps/rejected": -1168.0, + "loss": 0.2156, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.25, + "rewards/margins": 3.8125, + "rewards/rejected": -10.0625, + "step": 18660 + }, + { + "epoch": 0.9622471331014044, + "grad_norm": 9.730497171886208, + "learning_rate": 2.1650396535387305e-09, + "logits/chosen": -3.265625, + "logits/rejected": -3.203125, + "logps/chosen": -824.0, + "logps/rejected": -1176.0, + "loss": 0.2222, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.4375, + "rewards/margins": 3.46875, + "rewards/rejected": -9.875, + "step": 18670 + }, + { + "epoch": 0.9627625306017266, + "grad_norm": 9.404238452409256, + "learning_rate": 2.1063722635532298e-09, + "logits/chosen": -3.40625, + "logits/rejected": -3.140625, + "logps/chosen": -816.0, + "logps/rejected": -1144.0, + "loss": 0.2117, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.53125, + "rewards/rejected": -9.8125, + "step": 18680 + }, + { + "epoch": 0.9632779281020487, + "grad_norm": 8.24702329743092, + "learning_rate": 2.0485073398004982e-09, + "logits/chosen": -3.3125, + "logits/rejected": -2.96875, + "logps/chosen": -792.0, + "logps/rejected": -1152.0, + "loss": 0.2197, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.15625, + "rewards/margins": 3.734375, + "rewards/rejected": -9.875, + "step": 18690 + }, + { + "epoch": 0.9637933256023709, + "grad_norm": 9.35185234076602, + "learning_rate": 1.9914450695973917e-09, + "logits/chosen": -3.359375, + "logits/rejected": -3.046875, + "logps/chosen": -780.0, + "logps/rejected": -1192.0, + "loss": 0.2121, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.125, + "rewards/margins": 4.21875, + "rewards/rejected": -10.375, + "step": 18700 + }, + { + "epoch": 0.964308723102693, + "grad_norm": 11.444089267172744, + "learning_rate": 1.935185637662401e-09, + "logits/chosen": -3.21875, + "logits/rejected": -2.921875, + "logps/chosen": -808.0, + "logps/rejected": -1216.0, + "loss": 0.2153, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 4.15625, + "rewards/rejected": -10.4375, + "step": 18710 + }, + { + "epoch": 0.964824120603015, + "grad_norm": 6.960831389486829, + "learning_rate": 1.879729226115151e-09, + "logits/chosen": -3.21875, + "logits/rejected": -2.953125, + "logps/chosen": -800.0, + "logps/rejected": -1120.0, + "loss": 0.2155, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.34375, + "rewards/margins": 3.359375, + "rewards/rejected": -9.6875, + "step": 18720 + }, + { + "epoch": 0.9653395181033372, + "grad_norm": 7.1690736816081895, + "learning_rate": 1.8250760144757904e-09, + "logits/chosen": -2.953125, + "logits/rejected": -2.921875, + "logps/chosen": -824.0, + "logps/rejected": -1160.0, + "loss": 0.1991, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 3.734375, + "rewards/rejected": -9.875, + "step": 18730 + }, + { + "epoch": 0.9658549156036593, + "grad_norm": 9.684144503698983, + "learning_rate": 1.7712261796643812e-09, + "logits/chosen": -3.359375, + "logits/rejected": -2.859375, + "logps/chosen": -828.0, + "logps/rejected": -1176.0, + "loss": 0.2195, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.5, + "rewards/margins": 3.5625, + "rewards/rejected": -10.0625, + "step": 18740 + }, + { + "epoch": 0.9663703131039815, + "grad_norm": 8.901548982525577, + "learning_rate": 1.7181798960003434e-09, + "logits/chosen": -3.328125, + "logits/rejected": -2.96875, + "logps/chosen": -804.0, + "logps/rejected": -1096.0, + "loss": 0.2209, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.3125, + "rewards/margins": 3.25, + "rewards/rejected": -9.5625, + "step": 18750 + }, + { + "epoch": 0.9668857106043036, + "grad_norm": 10.204073680251726, + "learning_rate": 1.6659373352018723e-09, + "logits/chosen": -3.328125, + "logits/rejected": -3.015625, + "logps/chosen": -796.0, + "logps/rejected": -1136.0, + "loss": 0.235, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -6.25, + "rewards/margins": 3.53125, + "rewards/rejected": -9.8125, + "step": 18760 + }, + { + "epoch": 0.9674011081046257, + "grad_norm": 8.395051743777762, + "learning_rate": 1.6144986663854943e-09, + "logits/chosen": -3.421875, + "logits/rejected": -3.265625, + "logps/chosen": -792.0, + "logps/rejected": -1120.0, + "loss": 0.223, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.25, + "rewards/margins": 3.34375, + "rewards/rejected": -9.5625, + "step": 18770 + }, + { + "epoch": 0.9679165056049478, + "grad_norm": 7.881280216274359, + "learning_rate": 1.5638640560652616e-09, + "logits/chosen": -3.125, + "logits/rejected": -2.8125, + "logps/chosen": -844.0, + "logps/rejected": -1192.0, + "loss": 0.2178, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.5625, + "rewards/margins": 3.625, + "rewards/rejected": -10.1875, + "step": 18780 + }, + { + "epoch": 0.9684319031052699, + "grad_norm": 9.565781282188969, + "learning_rate": 1.5140336681525035e-09, + "logits/chosen": -3.234375, + "logits/rejected": -2.734375, + "logps/chosen": -804.0, + "logps/rejected": -1176.0, + "loss": 0.2493, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.40625, + "rewards/margins": 3.78125, + "rewards/rejected": -10.1875, + "step": 18790 + }, + { + "epoch": 0.9689473006055921, + "grad_norm": 7.721068371490624, + "learning_rate": 1.4650076639551312e-09, + "logits/chosen": -3.28125, + "logits/rejected": -3.0, + "logps/chosen": -824.0, + "logps/rejected": -1168.0, + "loss": 0.2105, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.578125, + "rewards/rejected": -9.9375, + "step": 18800 + }, + { + "epoch": 0.9694626981059142, + "grad_norm": 9.785807024565905, + "learning_rate": 1.4167862021771671e-09, + "logits/chosen": -3.234375, + "logits/rejected": -2.921875, + "logps/chosen": -816.0, + "logps/rejected": -1176.0, + "loss": 0.2162, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.3125, + "rewards/margins": 3.84375, + "rewards/rejected": -10.125, + "step": 18810 + }, + { + "epoch": 0.9699780956062363, + "grad_norm": 8.250865726511591, + "learning_rate": 1.3693694389182165e-09, + "logits/chosen": -3.46875, + "logits/rejected": -3.21875, + "logps/chosen": -832.0, + "logps/rejected": -1176.0, + "loss": 0.2387, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.4375, + "rewards/margins": 3.59375, + "rewards/rejected": -10.0, + "step": 18820 + }, + { + "epoch": 0.9704934931065584, + "grad_norm": 6.835409988787498, + "learning_rate": 1.322757527672913e-09, + "logits/chosen": -3.21875, + "logits/rejected": -2.921875, + "logps/chosen": -832.0, + "logps/rejected": -1160.0, + "loss": 0.2168, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.5625, + "rewards/margins": 3.296875, + "rewards/rejected": -9.875, + "step": 18830 + }, + { + "epoch": 0.9710088906068806, + "grad_norm": 7.118531908024317, + "learning_rate": 1.2769506193305302e-09, + "logits/chosen": -3.375, + "logits/rejected": -3.09375, + "logps/chosen": -840.0, + "logps/rejected": -1152.0, + "loss": 0.2302, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.625, + "rewards/margins": 3.25, + "rewards/rejected": -9.875, + "step": 18840 + }, + { + "epoch": 0.9715242881072027, + "grad_norm": 8.44080702596654, + "learning_rate": 1.2319488621744256e-09, + "logits/chosen": -3.46875, + "logits/rejected": -3.140625, + "logps/chosen": -772.0, + "logps/rejected": -1096.0, + "loss": 0.2221, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.09375, + "rewards/margins": 3.40625, + "rewards/rejected": -9.5, + "step": 18850 + }, + { + "epoch": 0.9720396856075249, + "grad_norm": 5.8395679293256455, + "learning_rate": 1.1877524018815421e-09, + "logits/chosen": -3.328125, + "logits/rejected": -3.140625, + "logps/chosen": -808.0, + "logps/rejected": -1120.0, + "loss": 0.2145, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.3125, + "rewards/margins": 3.34375, + "rewards/rejected": -9.6875, + "step": 18860 + }, + { + "epoch": 0.9725550831078469, + "grad_norm": 7.7489214529607, + "learning_rate": 1.144361381521991e-09, + "logits/chosen": -3.1875, + "logits/rejected": -2.953125, + "logps/chosen": -784.0, + "logps/rejected": -1136.0, + "loss": 0.2289, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.0625, + "rewards/margins": 3.671875, + "rewards/rejected": -9.75, + "step": 18870 + }, + { + "epoch": 0.973070480608169, + "grad_norm": 8.586795889034205, + "learning_rate": 1.1017759415585526e-09, + "logits/chosen": -3.28125, + "logits/rejected": -2.890625, + "logps/chosen": -828.0, + "logps/rejected": -1152.0, + "loss": 0.2205, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.46875, + "rewards/margins": 3.40625, + "rewards/rejected": -9.875, + "step": 18880 + }, + { + "epoch": 0.9735858781084912, + "grad_norm": 7.285185440866089, + "learning_rate": 1.0599962198462598e-09, + "logits/chosen": -3.296875, + "logits/rejected": -2.921875, + "logps/chosen": -772.0, + "logps/rejected": -1152.0, + "loss": 0.2001, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.828125, + "rewards/rejected": -10.0, + "step": 18890 + }, + { + "epoch": 0.9741012756088133, + "grad_norm": 7.482463159426966, + "learning_rate": 1.0190223516318708e-09, + "logits/chosen": -3.40625, + "logits/rejected": -3.09375, + "logps/chosen": -784.0, + "logps/rejected": -1144.0, + "loss": 0.2169, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.125, + "rewards/margins": 3.671875, + "rewards/rejected": -9.8125, + "step": 18900 + }, + { + "epoch": 0.9746166731091355, + "grad_norm": 10.091631036416421, + "learning_rate": 9.788544695535084e-10, + "logits/chosen": -3.296875, + "logits/rejected": -2.875, + "logps/chosen": -820.0, + "logps/rejected": -1152.0, + "loss": 0.2154, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.46875, + "rewards/margins": 3.40625, + "rewards/rejected": -9.875, + "step": 18910 + }, + { + "epoch": 0.9751320706094575, + "grad_norm": 8.957439805278343, + "learning_rate": 9.394927036402156e-10, + "logits/chosen": -3.25, + "logits/rejected": -2.96875, + "logps/chosen": -824.0, + "logps/rejected": -1168.0, + "loss": 0.2135, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.40625, + "rewards/rejected": -9.75, + "step": 18920 + }, + { + "epoch": 0.9756474681097796, + "grad_norm": 8.044110333326621, + "learning_rate": 9.009371813115119e-10, + "logits/chosen": -3.328125, + "logits/rejected": -3.0, + "logps/chosen": -792.0, + "logps/rejected": -1136.0, + "loss": 0.2244, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.3125, + "rewards/margins": 3.46875, + "rewards/rejected": -9.75, + "step": 18930 + }, + { + "epoch": 0.9761628656101018, + "grad_norm": 6.059604736745284, + "learning_rate": 8.631880273770043e-10, + "logits/chosen": -3.09375, + "logits/rejected": -2.796875, + "logps/chosen": -788.0, + "logps/rejected": -1160.0, + "loss": 0.2143, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 3.6875, + "rewards/rejected": -9.875, + "step": 18940 + }, + { + "epoch": 0.9766782631104239, + "grad_norm": 7.349271362280608, + "learning_rate": 8.262453640359435e-10, + "logits/chosen": -3.265625, + "logits/rejected": -2.90625, + "logps/chosen": -804.0, + "logps/rejected": -1168.0, + "loss": 0.2252, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 3.75, + "rewards/rejected": -9.9375, + "step": 18950 + }, + { + "epoch": 0.9771936606107461, + "grad_norm": 9.53300974297753, + "learning_rate": 7.901093108768909e-10, + "logits/chosen": -3.234375, + "logits/rejected": -2.828125, + "logps/chosen": -784.0, + "logps/rejected": -1144.0, + "loss": 0.2227, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.15625, + "rewards/margins": 3.765625, + "rewards/rejected": -9.9375, + "step": 18960 + }, + { + "epoch": 0.9777090581110681, + "grad_norm": 5.757901661893572, + "learning_rate": 7.547799848773295e-10, + "logits/chosen": -3.265625, + "logits/rejected": -3.0625, + "logps/chosen": -788.0, + "logps/rejected": -1120.0, + "loss": 0.2245, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.25, + "rewards/margins": 3.40625, + "rewards/rejected": -9.625, + "step": 18970 + }, + { + "epoch": 0.9782244556113903, + "grad_norm": 7.391637979069548, + "learning_rate": 7.202575004031652e-10, + "logits/chosen": -3.328125, + "logits/rejected": -3.09375, + "logps/chosen": -788.0, + "logps/rejected": -1136.0, + "loss": 0.203, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.125, + "rewards/margins": 3.734375, + "rewards/rejected": -9.875, + "step": 18980 + }, + { + "epoch": 0.9787398531117124, + "grad_norm": 7.2590773132899304, + "learning_rate": 6.865419692085039e-10, + "logits/chosen": -3.21875, + "logits/rejected": -2.8125, + "logps/chosen": -796.0, + "logps/rejected": -1160.0, + "loss": 0.2219, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.34375, + "rewards/margins": 3.765625, + "rewards/rejected": -10.125, + "step": 18990 + }, + { + "epoch": 0.9792552506120346, + "grad_norm": 8.812005821897449, + "learning_rate": 6.536335004352078e-10, + "logits/chosen": -3.296875, + "logits/rejected": -2.96875, + "logps/chosen": -808.0, + "logps/rejected": -1136.0, + "loss": 0.2482, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -6.34375, + "rewards/margins": 3.34375, + "rewards/rejected": -9.6875, + "step": 19000 + }, + { + "epoch": 0.9797706481123567, + "grad_norm": 8.825426931727138, + "learning_rate": 6.215322006126178e-10, + "logits/chosen": -3.34375, + "logits/rejected": -3.015625, + "logps/chosen": -820.0, + "logps/rejected": -1152.0, + "loss": 0.2401, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.625, + "rewards/margins": 3.328125, + "rewards/rejected": -9.9375, + "step": 19010 + }, + { + "epoch": 0.9802860456126787, + "grad_norm": 7.077464243431834, + "learning_rate": 5.902381736570816e-10, + "logits/chosen": -3.1875, + "logits/rejected": -2.84375, + "logps/chosen": -792.0, + "logps/rejected": -1136.0, + "loss": 0.232, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.1875, + "rewards/margins": 3.640625, + "rewards/rejected": -9.8125, + "step": 19020 + }, + { + "epoch": 0.9808014431130009, + "grad_norm": 9.260118009869878, + "learning_rate": 5.597515208717596e-10, + "logits/chosen": -3.3125, + "logits/rejected": -2.953125, + "logps/chosen": -792.0, + "logps/rejected": -1160.0, + "loss": 0.2491, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.28125, + "rewards/margins": 3.59375, + "rewards/rejected": -9.875, + "step": 19030 + }, + { + "epoch": 0.981316840613323, + "grad_norm": 10.164334128314948, + "learning_rate": 5.300723409461528e-10, + "logits/chosen": -3.3125, + "logits/rejected": -2.921875, + "logps/chosen": -792.0, + "logps/rejected": -1208.0, + "loss": 0.2503, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.15625, + "rewards/margins": 4.25, + "rewards/rejected": -10.4375, + "step": 19040 + }, + { + "epoch": 0.9818322381136452, + "grad_norm": 9.341522785393265, + "learning_rate": 5.012007299559639e-10, + "logits/chosen": -3.1875, + "logits/rejected": -2.921875, + "logps/chosen": -804.0, + "logps/rejected": -1160.0, + "loss": 0.2215, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.21875, + "rewards/margins": 3.515625, + "rewards/rejected": -9.75, + "step": 19050 + }, + { + "epoch": 0.9823476356139673, + "grad_norm": 8.899418736235004, + "learning_rate": 4.731367813625431e-10, + "logits/chosen": -3.28125, + "logits/rejected": -2.96875, + "logps/chosen": -784.0, + "logps/rejected": -1152.0, + "loss": 0.2413, + "rewards/accuracies": 0.84375, + "rewards/chosen": -6.25, + "rewards/margins": 3.75, + "rewards/rejected": -10.0, + "step": 19060 + }, + { + "epoch": 0.9828630331142894, + "grad_norm": 7.714372834205118, + "learning_rate": 4.458805860128867e-10, + "logits/chosen": -3.21875, + "logits/rejected": -3.03125, + "logps/chosen": -808.0, + "logps/rejected": -1128.0, + "loss": 0.2471, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.25, + "rewards/margins": 3.375, + "rewards/rejected": -9.625, + "step": 19070 + }, + { + "epoch": 0.9833784306146115, + "grad_norm": 9.763908957595495, + "learning_rate": 4.1943223213902757e-10, + "logits/chosen": -3.234375, + "logits/rejected": -3.03125, + "logps/chosen": -812.0, + "logps/rejected": -1152.0, + "loss": 0.2399, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.375, + "rewards/margins": 3.640625, + "rewards/rejected": -10.0, + "step": 19080 + }, + { + "epoch": 0.9838938281149336, + "grad_norm": 8.435352523611886, + "learning_rate": 3.937918053580347e-10, + "logits/chosen": -3.203125, + "logits/rejected": -2.96875, + "logps/chosen": -780.0, + "logps/rejected": -1136.0, + "loss": 0.2226, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.515625, + "rewards/rejected": -9.8125, + "step": 19090 + }, + { + "epoch": 0.9844092256152558, + "grad_norm": 10.404175716711439, + "learning_rate": 3.6895938867151367e-10, + "logits/chosen": -3.125, + "logits/rejected": -2.859375, + "logps/chosen": -820.0, + "logps/rejected": -1136.0, + "loss": 0.2266, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.40625, + "rewards/margins": 3.46875, + "rewards/rejected": -9.875, + "step": 19100 + }, + { + "epoch": 0.9849246231155779, + "grad_norm": 9.635004195130652, + "learning_rate": 3.449350624654401e-10, + "logits/chosen": -3.3125, + "logits/rejected": -3.21875, + "logps/chosen": -784.0, + "logps/rejected": -1128.0, + "loss": 0.2544, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.96875, + "rewards/margins": 3.65625, + "rewards/rejected": -9.625, + "step": 19110 + }, + { + "epoch": 0.9854400206159, + "grad_norm": 8.319305831523685, + "learning_rate": 3.2171890450993776e-10, + "logits/chosen": -3.203125, + "logits/rejected": -2.875, + "logps/chosen": -816.0, + "logps/rejected": -1216.0, + "loss": 0.2142, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.46875, + "rewards/margins": 4.09375, + "rewards/rejected": -10.5625, + "step": 19120 + }, + { + "epoch": 0.9859554181162221, + "grad_norm": 9.560614539464513, + "learning_rate": 2.993109899589452e-10, + "logits/chosen": -3.375, + "logits/rejected": -2.828125, + "logps/chosen": -784.0, + "logps/rejected": -1168.0, + "loss": 0.2234, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.0625, + "rewards/margins": 3.84375, + "rewards/rejected": -9.9375, + "step": 19130 + }, + { + "epoch": 0.9864708156165443, + "grad_norm": 8.136196767027727, + "learning_rate": 2.7771139135002173e-10, + "logits/chosen": -3.5625, + "logits/rejected": -3.359375, + "logps/chosen": -808.0, + "logps/rejected": -1152.0, + "loss": 0.2265, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.1875, + "rewards/margins": 3.625, + "rewards/rejected": -9.8125, + "step": 19140 + }, + { + "epoch": 0.9869862131168664, + "grad_norm": 7.342642355356072, + "learning_rate": 2.5692017860404204e-10, + "logits/chosen": -3.28125, + "logits/rejected": -2.765625, + "logps/chosen": -780.0, + "logps/rejected": -1176.0, + "loss": 0.2189, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.0, + "rewards/margins": 4.1875, + "rewards/rejected": -10.1875, + "step": 19150 + }, + { + "epoch": 0.9875016106171886, + "grad_norm": 7.8745632180060205, + "learning_rate": 2.3693741902505727e-10, + "logits/chosen": -3.328125, + "logits/rejected": -3.0, + "logps/chosen": -816.0, + "logps/rejected": -1168.0, + "loss": 0.2374, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.5, + "rewards/margins": 3.71875, + "rewards/rejected": -10.25, + "step": 19160 + }, + { + "epoch": 0.9880170081175106, + "grad_norm": 8.436245881055534, + "learning_rate": 2.177631773000732e-10, + "logits/chosen": -3.234375, + "logits/rejected": -2.890625, + "logps/chosen": -816.0, + "logps/rejected": -1144.0, + "loss": 0.2107, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.5625, + "rewards/margins": 3.546875, + "rewards/rejected": -10.125, + "step": 19170 + }, + { + "epoch": 0.9885324056178327, + "grad_norm": 11.753767511082371, + "learning_rate": 1.993975154987726e-10, + "logits/chosen": -3.34375, + "logits/rejected": -2.921875, + "logps/chosen": -792.0, + "logps/rejected": -1144.0, + "loss": 0.2266, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.28125, + "rewards/margins": 3.5625, + "rewards/rejected": -9.8125, + "step": 19180 + }, + { + "epoch": 0.9890478031181549, + "grad_norm": 10.137144301452759, + "learning_rate": 1.8184049307337633e-10, + "logits/chosen": -3.234375, + "logits/rejected": -2.9375, + "logps/chosen": -776.0, + "logps/rejected": -1096.0, + "loss": 0.2313, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.0, + "rewards/margins": 3.53125, + "rewards/rejected": -9.5625, + "step": 19190 + }, + { + "epoch": 0.989563200618477, + "grad_norm": 9.183445290096747, + "learning_rate": 1.6509216685839377e-10, + "logits/chosen": -3.375, + "logits/rejected": -3.125, + "logps/chosen": -812.0, + "logps/rejected": -1160.0, + "loss": 0.2114, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.3125, + "rewards/margins": 3.59375, + "rewards/rejected": -9.875, + "step": 19200 + }, + { + "epoch": 0.9900785981187992, + "grad_norm": 13.250541480153572, + "learning_rate": 1.491525910705116e-10, + "logits/chosen": -3.359375, + "logits/rejected": -3.109375, + "logps/chosen": -824.0, + "logps/rejected": -1136.0, + "loss": 0.231, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.4375, + "rewards/margins": 3.421875, + "rewards/rejected": -9.875, + "step": 19210 + }, + { + "epoch": 0.9905939956191212, + "grad_norm": 6.170286499763311, + "learning_rate": 1.3402181730834406e-10, + "logits/chosen": -3.4375, + "logits/rejected": -3.140625, + "logps/chosen": -800.0, + "logps/rejected": -1160.0, + "loss": 0.2353, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.609375, + "rewards/rejected": -9.875, + "step": 19220 + }, + { + "epoch": 0.9911093931194433, + "grad_norm": 9.89567647902666, + "learning_rate": 1.1969989455229424e-10, + "logits/chosen": -3.421875, + "logits/rejected": -3.15625, + "logps/chosen": -808.0, + "logps/rejected": -1176.0, + "loss": 0.2215, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.21875, + "rewards/margins": 3.71875, + "rewards/rejected": -9.9375, + "step": 19230 + }, + { + "epoch": 0.9916247906197655, + "grad_norm": 9.6638530814763, + "learning_rate": 1.0618686916447073e-10, + "logits/chosen": -3.390625, + "logits/rejected": -3.046875, + "logps/chosen": -784.0, + "logps/rejected": -1104.0, + "loss": 0.2378, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.125, + "rewards/margins": 3.203125, + "rewards/rejected": -9.375, + "step": 19240 + }, + { + "epoch": 0.9921401881200876, + "grad_norm": 10.99223037084181, + "learning_rate": 9.34827848884101e-11, + "logits/chosen": -3.171875, + "logits/rejected": -2.890625, + "logps/chosen": -824.0, + "logps/rejected": -1152.0, + "loss": 0.2393, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.46875, + "rewards/margins": 3.515625, + "rewards/rejected": -10.0, + "step": 19250 + }, + { + "epoch": 0.9926555856204098, + "grad_norm": 11.095120722429824, + "learning_rate": 8.158768284896589e-11, + "logits/chosen": -3.359375, + "logits/rejected": -3.09375, + "logps/chosen": -796.0, + "logps/rejected": -1128.0, + "loss": 0.2271, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.03125, + "rewards/margins": 3.5625, + "rewards/rejected": -9.625, + "step": 19260 + }, + { + "epoch": 0.9931709831207318, + "grad_norm": 9.327952759944056, + "learning_rate": 7.05016015522808e-11, + "logits/chosen": -3.34375, + "logits/rejected": -2.9375, + "logps/chosen": -800.0, + "logps/rejected": -1144.0, + "loss": 0.2318, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.0625, + "rewards/margins": 3.6875, + "rewards/rejected": -9.75, + "step": 19270 + }, + { + "epoch": 0.993686380621054, + "grad_norm": 7.60105385200162, + "learning_rate": 6.022457688553694e-11, + "logits/chosen": -3.203125, + "logits/rejected": -2.765625, + "logps/chosen": -836.0, + "logps/rejected": -1160.0, + "loss": 0.245, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -6.5625, + "rewards/margins": 3.328125, + "rewards/rejected": -9.875, + "step": 19280 + }, + { + "epoch": 0.9942017781213761, + "grad_norm": 8.496222917348753, + "learning_rate": 5.075664211687258e-11, + "logits/chosen": -3.21875, + "logits/rejected": -3.03125, + "logps/chosen": -772.0, + "logps/rejected": -1152.0, + "loss": 0.2095, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.125, + "rewards/margins": 3.96875, + "rewards/rejected": -10.0625, + "step": 19290 + }, + { + "epoch": 0.9947171756216983, + "grad_norm": 6.515129558620156, + "learning_rate": 4.209782789535432e-11, + "logits/chosen": -3.09375, + "logits/rejected": -2.875, + "logps/chosen": -780.0, + "logps/rejected": -1088.0, + "loss": 0.2287, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.1875, + "rewards/margins": 3.0625, + "rewards/rejected": -9.25, + "step": 19300 + }, + { + "epoch": 0.9952325731220204, + "grad_norm": 8.005286486314064, + "learning_rate": 3.424816225072735e-11, + "logits/chosen": -3.21875, + "logits/rejected": -2.9375, + "logps/chosen": -808.0, + "logps/rejected": -1168.0, + "loss": 0.2198, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.375, + "rewards/margins": 3.703125, + "rewards/rejected": -10.0625, + "step": 19310 + }, + { + "epoch": 0.9957479706223424, + "grad_norm": 10.54057846633689, + "learning_rate": 2.7207670593470955e-11, + "logits/chosen": -3.5, + "logits/rejected": -3.171875, + "logps/chosen": -824.0, + "logps/rejected": -1192.0, + "loss": 0.2078, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.375, + "rewards/margins": 3.71875, + "rewards/rejected": -10.0625, + "step": 19320 + }, + { + "epoch": 0.9962633681226646, + "grad_norm": 9.88910360361171, + "learning_rate": 2.097637571465971e-11, + "logits/chosen": -3.15625, + "logits/rejected": -3.0, + "logps/chosen": -804.0, + "logps/rejected": -1160.0, + "loss": 0.2199, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.3125, + "rewards/margins": 3.65625, + "rewards/rejected": -10.0, + "step": 19330 + }, + { + "epoch": 0.9967787656229867, + "grad_norm": 7.539622776707153, + "learning_rate": 1.5554297785824732e-11, + "logits/chosen": -3.375, + "logits/rejected": -3.15625, + "logps/chosen": -840.0, + "logps/rejected": -1104.0, + "loss": 0.2185, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.5625, + "rewards/margins": 2.984375, + "rewards/rejected": -9.5625, + "step": 19340 + }, + { + "epoch": 0.9972941631233089, + "grad_norm": 7.938231539304325, + "learning_rate": 1.0941454359036928e-11, + "logits/chosen": -3.328125, + "logits/rejected": -3.21875, + "logps/chosen": -832.0, + "logps/rejected": -1144.0, + "loss": 0.2115, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.5, + "rewards/margins": 3.296875, + "rewards/rejected": -9.8125, + "step": 19350 + }, + { + "epoch": 0.997809560623631, + "grad_norm": 9.416028294332314, + "learning_rate": 7.1378603666572e-12, + "logits/chosen": -3.234375, + "logits/rejected": -3.046875, + "logps/chosen": -816.0, + "logps/rejected": -1160.0, + "loss": 0.2159, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -6.28125, + "rewards/margins": 3.546875, + "rewards/rejected": -9.8125, + "step": 19360 + }, + { + "epoch": 0.998324958123953, + "grad_norm": 9.799684839069073, + "learning_rate": 4.143528121502982e-12, + "logits/chosen": -3.421875, + "logits/rejected": -3.09375, + "logps/chosen": -824.0, + "logps/rejected": -1120.0, + "loss": 0.242, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.53125, + "rewards/margins": 3.109375, + "rewards/rejected": -9.625, + "step": 19370 + }, + { + "epoch": 0.9988403556242752, + "grad_norm": 8.621746837035232, + "learning_rate": 1.9584673165984335e-12, + "logits/chosen": -3.3125, + "logits/rejected": -3.140625, + "logps/chosen": -792.0, + "logps/rejected": -1160.0, + "loss": 0.2195, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -6.28125, + "rewards/margins": 3.65625, + "rewards/rejected": -9.9375, + "step": 19380 + }, + { + "epoch": 0.9993557531245973, + "grad_norm": 9.339691200973554, + "learning_rate": 5.826850253132231e-13, + "logits/chosen": -3.265625, + "logits/rejected": -3.03125, + "logps/chosen": -792.0, + "logps/rejected": -1144.0, + "loss": 0.2141, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.21875, + "rewards/margins": 3.546875, + "rewards/rejected": -9.75, + "step": 19390 + }, + { + "epoch": 0.9998711506249195, + "grad_norm": 9.410995492931091, + "learning_rate": 1.6185701251503204e-14, + "logits/chosen": -3.3125, + "logits/rejected": -3.03125, + "logps/chosen": -836.0, + "logps/rejected": -1128.0, + "loss": 0.2364, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -6.4375, + "rewards/margins": 3.140625, + "rewards/rejected": -9.625, + "step": 19400 + } + ], + "logging_steps": 10, + "max_steps": 19402, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}