{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.943820224719101, "eval_steps": 500, "global_step": 132, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02247191011235955, "grad_norm": 489.5653076171875, "learning_rate": 2.1428571428571428e-07, "logits/chosen": 1.4551408290863037, "logits/rejected": 1.478129267692566, "logps/chosen": -2968.771240234375, "logps/rejected": -3035.35302734375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0449438202247191, "grad_norm": 419.54876708984375, "learning_rate": 4.2857142857142857e-07, "logits/chosen": 1.5314003229141235, "logits/rejected": 1.4525893926620483, "logps/chosen": -3010.43994140625, "logps/rejected": -2926.948974609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.06741573033707865, "grad_norm": 789.9224243164062, "learning_rate": 6.428571428571428e-07, "logits/chosen": 1.482939600944519, "logits/rejected": 1.5616533756256104, "logps/chosen": -2998.501708984375, "logps/rejected": -3179.81982421875, "loss": 0.9204, "rewards/accuracies": 0.46875, "rewards/chosen": -0.08596238493919373, "rewards/margins": -0.19251862168312073, "rewards/rejected": 0.10655620694160461, "step": 3 }, { "epoch": 0.0898876404494382, "grad_norm": 378.14190673828125, "learning_rate": 8.571428571428571e-07, "logits/chosen": 1.6036081314086914, "logits/rejected": 1.7028334140777588, "logps/chosen": -2979.7060546875, "logps/rejected": -2913.69091796875, "loss": 0.6588, "rewards/accuracies": 0.546875, "rewards/chosen": -0.28274667263031006, "rewards/margins": 0.22482016682624817, "rewards/rejected": -0.5075668692588806, "step": 4 }, { "epoch": 0.11235955056179775, "grad_norm": 307.0648193359375, "learning_rate": 1.0714285714285716e-06, "logits/chosen": 1.3923085927963257, "logits/rejected": 1.4200749397277832, "logps/chosen": -3015.828125, "logps/rejected": -3068.435302734375, "loss": 0.5615, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03155745938420296, "rewards/margins": 0.5133614540100098, "rewards/rejected": -0.5449188947677612, "step": 5 }, { "epoch": 0.1348314606741573, "grad_norm": 282.67034912109375, "learning_rate": 1.2857142857142856e-06, "logits/chosen": 1.5581945180892944, "logits/rejected": 1.405899167060852, "logps/chosen": -3204.767333984375, "logps/rejected": -3163.357177734375, "loss": 0.5469, "rewards/accuracies": 0.71875, "rewards/chosen": -0.25397348403930664, "rewards/margins": 0.6482839584350586, "rewards/rejected": -0.9022574424743652, "step": 6 }, { "epoch": 0.15730337078651685, "grad_norm": 218.5866241455078, "learning_rate": 1.5e-06, "logits/chosen": 1.496790885925293, "logits/rejected": 1.4303985834121704, "logps/chosen": -3185.8203125, "logps/rejected": -3225.123046875, "loss": 0.4709, "rewards/accuracies": 0.734375, "rewards/chosen": -0.04301854223012924, "rewards/margins": 1.6269282102584839, "rewards/rejected": -1.6699466705322266, "step": 7 }, { "epoch": 0.1797752808988764, "grad_norm": 181.489501953125, "learning_rate": 1.7142857142857143e-06, "logits/chosen": 1.6130130290985107, "logits/rejected": 1.5007115602493286, "logps/chosen": -3087.791748046875, "logps/rejected": -2948.8115234375, "loss": 0.396, "rewards/accuracies": 0.75, "rewards/chosen": -0.08753497898578644, "rewards/margins": 2.817833185195923, "rewards/rejected": -2.9053683280944824, "step": 8 }, { "epoch": 0.20224719101123595, "grad_norm": 188.34768676757812, "learning_rate": 1.928571428571429e-06, "logits/chosen": 1.5587732791900635, "logits/rejected": 1.6744489669799805, "logps/chosen": -2374.6494140625, "logps/rejected": -2492.75537109375, "loss": 0.448, "rewards/accuracies": 0.71875, "rewards/chosen": -0.14219728112220764, "rewards/margins": 2.7199909687042236, "rewards/rejected": -2.8621885776519775, "step": 9 }, { "epoch": 0.2247191011235955, "grad_norm": 167.6234588623047, "learning_rate": 2.142857142857143e-06, "logits/chosen": 1.581652283668518, "logits/rejected": 1.5243756771087646, "logps/chosen": -2837.341552734375, "logps/rejected": -2842.2666015625, "loss": 0.3618, "rewards/accuracies": 0.765625, "rewards/chosen": -0.06367100775241852, "rewards/margins": 6.429449081420898, "rewards/rejected": -6.493120193481445, "step": 10 }, { "epoch": 0.24719101123595505, "grad_norm": 195.05810546875, "learning_rate": 2.357142857142857e-06, "logits/chosen": 1.531968355178833, "logits/rejected": 1.5490195751190186, "logps/chosen": -2785.763427734375, "logps/rejected": -2938.71533203125, "loss": 0.3962, "rewards/accuracies": 0.703125, "rewards/chosen": -0.2717077136039734, "rewards/margins": 8.072213172912598, "rewards/rejected": -8.343921661376953, "step": 11 }, { "epoch": 0.2696629213483146, "grad_norm": 204.53872680664062, "learning_rate": 2.571428571428571e-06, "logits/chosen": 1.5632414817810059, "logits/rejected": 1.5352647304534912, "logps/chosen": -2883.001220703125, "logps/rejected": -3065.4296875, "loss": 0.4155, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09219703823328018, "rewards/margins": 11.51332950592041, "rewards/rejected": -11.421133041381836, "step": 12 }, { "epoch": 0.29213483146067415, "grad_norm": 181.2421112060547, "learning_rate": 2.785714285714286e-06, "logits/chosen": 1.5124785900115967, "logits/rejected": 1.4263392686843872, "logps/chosen": -3015.5341796875, "logps/rejected": -3136.56982421875, "loss": 0.3343, "rewards/accuracies": 0.75, "rewards/chosen": -0.1826700121164322, "rewards/margins": 16.418424606323242, "rewards/rejected": -16.601093292236328, "step": 13 }, { "epoch": 0.3146067415730337, "grad_norm": 178.02650451660156, "learning_rate": 3e-06, "logits/chosen": 1.4881091117858887, "logits/rejected": 1.4641259908676147, "logps/chosen": -2906.181396484375, "logps/rejected": -3083.74755859375, "loss": 0.3189, "rewards/accuracies": 0.890625, "rewards/chosen": -0.07007797807455063, "rewards/margins": 18.051210403442383, "rewards/rejected": -18.121288299560547, "step": 14 }, { "epoch": 0.33707865168539325, "grad_norm": 188.4379425048828, "learning_rate": 2.999468416685179e-06, "logits/chosen": 1.4958661794662476, "logits/rejected": 1.5740702152252197, "logps/chosen": -2589.415771484375, "logps/rejected": -2884.312744140625, "loss": 0.3903, "rewards/accuracies": 0.765625, "rewards/chosen": -0.1765696406364441, "rewards/margins": 17.232072830200195, "rewards/rejected": -17.408641815185547, "step": 15 }, { "epoch": 0.3595505617977528, "grad_norm": 161.3037872314453, "learning_rate": 2.9978740435151427e-06, "logits/chosen": 1.5349267721176147, "logits/rejected": 1.491062045097351, "logps/chosen": -2951.84619140625, "logps/rejected": -3206.8662109375, "loss": 0.3059, "rewards/accuracies": 0.78125, "rewards/chosen": -1.7078287601470947, "rewards/margins": 23.868520736694336, "rewards/rejected": -25.57634925842285, "step": 16 }, { "epoch": 0.38202247191011235, "grad_norm": 186.13180541992188, "learning_rate": 2.995218010546125e-06, "logits/chosen": 1.4998528957366943, "logits/rejected": 1.4576878547668457, "logps/chosen": -3011.727783203125, "logps/rejected": -3261.4501953125, "loss": 0.3808, "rewards/accuracies": 0.75, "rewards/chosen": -0.25169306993484497, "rewards/margins": 35.25308609008789, "rewards/rejected": -35.50477600097656, "step": 17 }, { "epoch": 0.4044943820224719, "grad_norm": 185.6712188720703, "learning_rate": 2.9915022003152055e-06, "logits/chosen": 1.6139241456985474, "logits/rejected": 1.5550901889801025, "logps/chosen": -2965.4423828125, "logps/rejected": -3224.514404296875, "loss": 0.3542, "rewards/accuracies": 0.8125, "rewards/chosen": 1.8823347091674805, "rewards/margins": 39.0025634765625, "rewards/rejected": -37.12023162841797, "step": 18 }, { "epoch": 0.42696629213483145, "grad_norm": 182.43603515625, "learning_rate": 2.986729246506011e-06, "logits/chosen": 1.244603157043457, "logits/rejected": 1.2053301334381104, "logps/chosen": -2764.19189453125, "logps/rejected": -3084.441650390625, "loss": 0.367, "rewards/accuracies": 0.765625, "rewards/chosen": -1.6243125200271606, "rewards/margins": 43.56684112548828, "rewards/rejected": -45.1911506652832, "step": 19 }, { "epoch": 0.449438202247191, "grad_norm": 198.76722717285156, "learning_rate": 2.980902532082017e-06, "logits/chosen": 1.4910385608673096, "logits/rejected": 1.4667646884918213, "logps/chosen": -2632.417724609375, "logps/rejected": -2912.476806640625, "loss": 0.4946, "rewards/accuracies": 0.75, "rewards/chosen": -2.317056179046631, "rewards/margins": 34.359012603759766, "rewards/rejected": -36.676063537597656, "step": 20 }, { "epoch": 0.47191011235955055, "grad_norm": 203.78700256347656, "learning_rate": 2.9740261868887817e-06, "logits/chosen": 1.4394636154174805, "logits/rejected": 1.3155745267868042, "logps/chosen": -2808.47509765625, "logps/rejected": -3043.707763671875, "loss": 0.4802, "rewards/accuracies": 0.71875, "rewards/chosen": 1.6056139469146729, "rewards/margins": 43.16130065917969, "rewards/rejected": -41.555686950683594, "step": 21 }, { "epoch": 0.4943820224719101, "grad_norm": 199.40330505371094, "learning_rate": 2.9661050847268e-06, "logits/chosen": 1.3054568767547607, "logits/rejected": 1.2870110273361206, "logps/chosen": -2704.07568359375, "logps/rejected": -3091.42626953125, "loss": 0.4924, "rewards/accuracies": 0.828125, "rewards/chosen": -4.835676670074463, "rewards/margins": 40.92457580566406, "rewards/rejected": -45.76025390625, "step": 22 }, { "epoch": 0.5168539325842697, "grad_norm": 184.34901428222656, "learning_rate": 2.957144839897065e-06, "logits/chosen": 1.5794934034347534, "logits/rejected": 1.374954104423523, "logps/chosen": -2828.36083984375, "logps/rejected": -3111.46875, "loss": 0.4932, "rewards/accuracies": 0.734375, "rewards/chosen": 3.432398796081543, "rewards/margins": 62.3823356628418, "rewards/rejected": -58.9499397277832, "step": 23 }, { "epoch": 0.5393258426966292, "grad_norm": 198.54269409179688, "learning_rate": 2.947151803221774e-06, "logits/chosen": 1.6772565841674805, "logits/rejected": 1.6362934112548828, "logps/chosen": -2880.4677734375, "logps/rejected": -3303.3857421875, "loss": 0.3869, "rewards/accuracies": 0.796875, "rewards/chosen": 0.12497274577617645, "rewards/margins": 53.7283821105957, "rewards/rejected": -53.60340881347656, "step": 24 }, { "epoch": 0.5617977528089888, "grad_norm": 173.3833465576172, "learning_rate": 2.936133057543008e-06, "logits/chosen": 1.4493129253387451, "logits/rejected": 1.3350006341934204, "logps/chosen": -2721.460693359375, "logps/rejected": -3138.864990234375, "loss": 0.3981, "rewards/accuracies": 0.78125, "rewards/chosen": 2.794492244720459, "rewards/margins": 69.71061706542969, "rewards/rejected": -66.91613006591797, "step": 25 }, { "epoch": 0.5842696629213483, "grad_norm": 232.13525390625, "learning_rate": 2.924096412702572e-06, "logits/chosen": 1.7099878787994385, "logits/rejected": 1.5226480960845947, "logps/chosen": -2983.288330078125, "logps/rejected": -3093.673095703125, "loss": 0.613, "rewards/accuracies": 0.703125, "rewards/chosen": 2.1761527061462402, "rewards/margins": 59.57087326049805, "rewards/rejected": -57.394718170166016, "step": 26 }, { "epoch": 0.6067415730337079, "grad_norm": 162.77978515625, "learning_rate": 2.91105040000655e-06, "logits/chosen": 1.4071202278137207, "logits/rejected": 1.4425785541534424, "logps/chosen": -2522.546630859375, "logps/rejected": -3321.0537109375, "loss": 0.4005, "rewards/accuracies": 0.859375, "rewards/chosen": 1.8253318071365356, "rewards/margins": 63.75608825683594, "rewards/rejected": -61.930755615234375, "step": 27 }, { "epoch": 0.6292134831460674, "grad_norm": 207.4031219482422, "learning_rate": 2.897004266178508e-06, "logits/chosen": 1.5841655731201172, "logits/rejected": 1.4097201824188232, "logps/chosen": -3239.787841796875, "logps/rejected": -3663.88232421875, "loss": 0.522, "rewards/accuracies": 0.765625, "rewards/chosen": -0.2217176854610443, "rewards/margins": 58.664180755615234, "rewards/rejected": -58.88589859008789, "step": 28 }, { "epoch": 0.651685393258427, "grad_norm": 172.96218872070312, "learning_rate": 2.8819679668056195e-06, "logits/chosen": 1.6320128440856934, "logits/rejected": 1.5467625856399536, "logps/chosen": -2654.78271484375, "logps/rejected": -3225.193359375, "loss": 0.3816, "rewards/accuracies": 0.765625, "rewards/chosen": 2.769482374191284, "rewards/margins": 65.22299194335938, "rewards/rejected": -62.453514099121094, "step": 29 }, { "epoch": 0.6741573033707865, "grad_norm": 200.36915588378906, "learning_rate": 2.8659521592823702e-06, "logits/chosen": 1.6264617443084717, "logits/rejected": 1.421095848083496, "logps/chosen": -2914.17529296875, "logps/rejected": -3396.08544921875, "loss": 0.4913, "rewards/accuracies": 0.765625, "rewards/chosen": 7.334710121154785, "rewards/margins": 89.93038177490234, "rewards/rejected": -82.59567260742188, "step": 30 }, { "epoch": 0.6966292134831461, "grad_norm": 250.5316162109375, "learning_rate": 2.848968195256829e-06, "logits/chosen": 1.6201553344726562, "logits/rejected": 1.4870961904525757, "logps/chosen": -3036.192138671875, "logps/rejected": -3605.6904296875, "loss": 0.708, "rewards/accuracies": 0.65625, "rewards/chosen": 4.598369121551514, "rewards/margins": 79.35784149169922, "rewards/rejected": -74.75946807861328, "step": 31 }, { "epoch": 0.7191011235955056, "grad_norm": 228.1786346435547, "learning_rate": 2.831028112584857e-06, "logits/chosen": 1.3086817264556885, "logits/rejected": 1.2920796871185303, "logps/chosen": -2828.72900390625, "logps/rejected": -3492.97802734375, "loss": 0.5514, "rewards/accuracies": 0.6875, "rewards/chosen": 0.8046822547912598, "rewards/margins": 77.88575744628906, "rewards/rejected": -77.08108520507812, "step": 32 }, { "epoch": 0.7415730337078652, "grad_norm": 156.25662231445312, "learning_rate": 2.812144626797942e-06, "logits/chosen": 1.3912537097930908, "logits/rejected": 1.1646690368652344, "logps/chosen": -3173.48388671875, "logps/rejected": -3708.0390625, "loss": 0.4043, "rewards/accuracies": 0.875, "rewards/chosen": 2.820896863937378, "rewards/margins": 82.55420684814453, "rewards/rejected": -79.73331451416016, "step": 33 }, { "epoch": 0.7640449438202247, "grad_norm": 189.89682006835938, "learning_rate": 2.792331122090709e-06, "logits/chosen": 1.525010108947754, "logits/rejected": 1.4141947031021118, "logps/chosen": -2818.591064453125, "logps/rejected": -3415.1484375, "loss": 0.4825, "rewards/accuracies": 0.796875, "rewards/chosen": 1.3273561000823975, "rewards/margins": 81.49795532226562, "rewards/rejected": -80.17059326171875, "step": 34 }, { "epoch": 0.7865168539325843, "grad_norm": 198.3324432373047, "learning_rate": 2.7716016418345064e-06, "logits/chosen": 1.5669187307357788, "logits/rejected": 1.3444348573684692, "logps/chosen": -2831.2744140625, "logps/rejected": -3359.554931640625, "loss": 0.4821, "rewards/accuracies": 0.84375, "rewards/chosen": 4.969450950622559, "rewards/margins": 95.5076675415039, "rewards/rejected": -90.53821563720703, "step": 35 }, { "epoch": 0.8089887640449438, "grad_norm": 202.50929260253906, "learning_rate": 2.7499708786237724e-06, "logits/chosen": 1.6073535680770874, "logits/rejected": 1.5690536499023438, "logps/chosen": -2898.311279296875, "logps/rejected": -3199.489013671875, "loss": 0.5359, "rewards/accuracies": 0.796875, "rewards/chosen": -3.0962305068969727, "rewards/margins": 49.8695182800293, "rewards/rejected": -52.96574783325195, "step": 36 }, { "epoch": 0.8314606741573034, "grad_norm": 172.3883056640625, "learning_rate": 2.7274541638622533e-06, "logits/chosen": 1.5025634765625, "logits/rejected": 1.2939093112945557, "logps/chosen": -2682.772705078125, "logps/rejected": -3070.16259765625, "loss": 0.5118, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5182172060012817, "rewards/margins": 86.14014434814453, "rewards/rejected": -86.65835571289062, "step": 37 }, { "epoch": 0.8539325842696629, "grad_norm": 200.7554473876953, "learning_rate": 2.7040674568964452e-06, "logits/chosen": 1.4808025360107422, "logits/rejected": 1.3251252174377441, "logps/chosen": -2854.599365234375, "logps/rejected": -3208.1640625, "loss": 0.5253, "rewards/accuracies": 0.8125, "rewards/chosen": 1.5150139331817627, "rewards/margins": 78.78499603271484, "rewards/rejected": -77.26997375488281, "step": 38 }, { "epoch": 0.8764044943820225, "grad_norm": 217.05526733398438, "learning_rate": 2.679827333703964e-06, "logits/chosen": 1.5550140142440796, "logits/rejected": 1.5405230522155762, "logps/chosen": -2775.199951171875, "logps/rejected": -3292.66650390625, "loss": 0.5094, "rewards/accuracies": 0.765625, "rewards/chosen": -0.5831690430641174, "rewards/margins": 75.25239562988281, "rewards/rejected": -75.8355712890625, "step": 39 }, { "epoch": 0.898876404494382, "grad_norm": 260.61224365234375, "learning_rate": 2.6547509751448593e-06, "logits/chosen": 1.5327131748199463, "logits/rejected": 1.404789924621582, "logps/chosen": -2995.2666015625, "logps/rejected": -3701.7333984375, "loss": 0.7054, "rewards/accuracies": 0.703125, "rewards/chosen": 4.574828147888184, "rewards/margins": 96.09221649169922, "rewards/rejected": -91.51737976074219, "step": 40 }, { "epoch": 0.9213483146067416, "grad_norm": 210.46607971191406, "learning_rate": 2.6288561547842076e-06, "logits/chosen": 1.5143060684204102, "logits/rejected": 1.2557826042175293, "logps/chosen": -2932.751953125, "logps/rejected": -3389.65185546875, "loss": 0.6426, "rewards/accuracies": 0.78125, "rewards/chosen": 3.5902769565582275, "rewards/margins": 102.1531982421875, "rewards/rejected": -98.56291198730469, "step": 41 }, { "epoch": 0.9438202247191011, "grad_norm": 203.90863037109375, "learning_rate": 2.602161226294601e-06, "logits/chosen": 1.4669859409332275, "logits/rejected": 1.254248023033142, "logps/chosen": -3275.650146484375, "logps/rejected": -3885.744873046875, "loss": 0.5032, "rewards/accuracies": 0.796875, "rewards/chosen": -7.145351886749268, "rewards/margins": 94.66647338867188, "rewards/rejected": -101.81182861328125, "step": 42 }, { "epoch": 0.9662921348314607, "grad_norm": 190.71495056152344, "learning_rate": 2.5746851104474728e-06, "logits/chosen": 1.4877179861068726, "logits/rejected": 1.3816105127334595, "logps/chosen": -2700.980224609375, "logps/rejected": -3283.328125, "loss": 0.4432, "rewards/accuracies": 0.828125, "rewards/chosen": 1.710632085800171, "rewards/margins": 75.0985107421875, "rewards/rejected": -73.38786315917969, "step": 43 }, { "epoch": 0.9887640449438202, "grad_norm": 192.31964111328125, "learning_rate": 2.5464472817024772e-06, "logits/chosen": 1.3617230653762817, "logits/rejected": 1.2478257417678833, "logps/chosen": -2841.803466796875, "logps/rejected": -3503.9794921875, "loss": 0.5194, "rewards/accuracies": 0.78125, "rewards/chosen": 4.092733383178711, "rewards/margins": 110.31430053710938, "rewards/rejected": -106.22156524658203, "step": 44 }, { "epoch": 1.0, "grad_norm": 192.31964111328125, "learning_rate": 2.517467754404424e-06, "logits/chosen": 1.3865031003952026, "logits/rejected": 1.2281872034072876, "logps/chosen": -2563.0751953125, "logps/rejected": -2940.1357421875, "loss": 0.2103, "rewards/accuracies": 0.84375, "rewards/chosen": 4.377815246582031, "rewards/margins": 81.93372344970703, "rewards/rejected": -77.555908203125, "step": 45 }, { "epoch": 1.0224719101123596, "grad_norm": 135.86026000976562, "learning_rate": 2.487767068597558e-06, "logits/chosen": 1.5341211557388306, "logits/rejected": 1.4015753269195557, "logps/chosen": -3250.149658203125, "logps/rejected": -3893.629150390625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 19.023714065551758, "rewards/margins": 134.42942810058594, "rewards/rejected": -115.40570068359375, "step": 46 }, { "epoch": 1.0449438202247192, "grad_norm": 1.9560177326202393, "learning_rate": 2.4573662754672303e-06, "logits/chosen": 1.4638060331344604, "logits/rejected": 1.396654486656189, "logps/chosen": -2667.339599609375, "logps/rejected": -3516.595703125, "loss": 0.0114, "rewards/accuracies": 0.984375, "rewards/chosen": 8.45435905456543, "rewards/margins": 107.95783996582031, "rewards/rejected": -99.50347900390625, "step": 47 }, { "epoch": 1.0674157303370786, "grad_norm": 14.909017562866211, "learning_rate": 2.426286922419288e-06, "logits/chosen": 1.6447203159332275, "logits/rejected": 1.6282371282577515, "logps/chosen": -2377.240478515625, "logps/rejected": -2950.48583984375, "loss": 0.0154, "rewards/accuracies": 0.984375, "rewards/chosen": 7.06836462020874, "rewards/margins": 84.36599731445312, "rewards/rejected": -77.29763793945312, "step": 48 }, { "epoch": 1.0898876404494382, "grad_norm": 4.328535556793213, "learning_rate": 2.3945510378077523e-06, "logits/chosen": 1.3356518745422363, "logits/rejected": 1.2965461015701294, "logps/chosen": -2788.0400390625, "logps/rejected": -3457.5185546875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 11.870361328125, "rewards/margins": 103.6649169921875, "rewards/rejected": -91.79456329345703, "step": 49 }, { "epoch": 1.1123595505617978, "grad_norm": 6.1306352615356445, "learning_rate": 2.3621811153216106e-06, "logits/chosen": 1.3586758375167847, "logits/rejected": 1.2172551155090332, "logps/chosen": -3142.0791015625, "logps/rejected": -3848.3056640625, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": 5.018255710601807, "rewards/margins": 121.07866668701172, "rewards/rejected": -116.06040954589844, "step": 50 }, { "epoch": 1.1348314606741572, "grad_norm": 2.2042205333709717, "learning_rate": 2.32920009804179e-06, "logits/chosen": 1.676792860031128, "logits/rejected": 1.4110440015792847, "logps/chosen": -2846.33056640625, "logps/rejected": -3573.93359375, "loss": 0.0116, "rewards/accuracies": 0.984375, "rewards/chosen": 16.190317153930664, "rewards/margins": 119.14263153076172, "rewards/rejected": -102.95230102539062, "step": 51 }, { "epoch": 1.1573033707865168, "grad_norm": 13.62660026550293, "learning_rate": 2.2956313621796135e-06, "logits/chosen": 1.5751538276672363, "logits/rejected": 1.4073097705841064, "logps/chosen": -2536.8515625, "logps/rejected": -3102.68896484375, "loss": 0.0147, "rewards/accuracies": 0.984375, "rewards/chosen": 7.306772232055664, "rewards/margins": 98.24702453613281, "rewards/rejected": -90.94024658203125, "step": 52 }, { "epoch": 1.1797752808988764, "grad_norm": 1.355103850364685, "learning_rate": 2.26149870050826e-06, "logits/chosen": 1.363991618156433, "logits/rejected": 1.1863415241241455, "logps/chosen": -3056.833740234375, "logps/rejected": -3680.160888671875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 9.664068222045898, "rewards/margins": 112.41234588623047, "rewards/rejected": -102.74827575683594, "step": 53 }, { "epoch": 1.202247191011236, "grad_norm": 2.3306772708892822, "learning_rate": 2.2268263054989753e-06, "logits/chosen": 1.54270339012146, "logits/rejected": 1.475841760635376, "logps/chosen": -2780.744384765625, "logps/rejected": -3487.5322265625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 11.756105422973633, "rewards/margins": 107.98931884765625, "rewards/rejected": -96.23321533203125, "step": 54 }, { "epoch": 1.2247191011235956, "grad_norm": 1.47923743724823, "learning_rate": 2.191638752173989e-06, "logits/chosen": 1.6175808906555176, "logits/rejected": 1.5379141569137573, "logps/chosen": -2748.61328125, "logps/rejected": -3274.468017578125, "loss": 0.0117, "rewards/accuracies": 0.984375, "rewards/chosen": 8.739614486694336, "rewards/margins": 110.58942413330078, "rewards/rejected": -101.84980010986328, "step": 55 }, { "epoch": 1.247191011235955, "grad_norm": 3.0752482414245605, "learning_rate": 2.1559609806882834e-06, "logits/chosen": 1.4324688911437988, "logits/rejected": 1.2107815742492676, "logps/chosen": -2790.97509765625, "logps/rejected": -3406.87744140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 5.457365989685059, "rewards/margins": 89.03166198730469, "rewards/rejected": -83.57430267333984, "step": 56 }, { "epoch": 1.2696629213483146, "grad_norm": 0.07106953859329224, "learning_rate": 2.1198182786525674e-06, "logits/chosen": 1.409006118774414, "logits/rejected": 1.2638301849365234, "logps/chosen": -2571.373046875, "logps/rejected": -3436.89892578125, "loss": 0.0112, "rewards/accuracies": 0.984375, "rewards/chosen": 12.910816192626953, "rewards/margins": 133.70639038085938, "rewards/rejected": -120.79557800292969, "step": 57 }, { "epoch": 1.2921348314606742, "grad_norm": 1.3202946186065674, "learning_rate": 2.0832362632099813e-06, "logits/chosen": 1.4980010986328125, "logits/rejected": 1.1623045206069946, "logps/chosen": -3144.611083984375, "logps/rejected": -3731.18212890625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 9.096885681152344, "rewards/margins": 142.87937927246094, "rewards/rejected": -133.78250122070312, "step": 58 }, { "epoch": 1.3146067415730336, "grad_norm": 2.9557082653045654, "learning_rate": 2.0462408628792335e-06, "logits/chosen": 1.6109601259231567, "logits/rejected": 1.4365208148956299, "logps/chosen": -2812.40625, "logps/rejected": -3437.3193359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 8.375179290771484, "rewards/margins": 111.16755676269531, "rewards/rejected": -102.79237365722656, "step": 59 }, { "epoch": 1.3370786516853932, "grad_norm": 0.2892356514930725, "learning_rate": 2.008858299177045e-06, "logits/chosen": 1.4753564596176147, "logits/rejected": 1.2640880346298218, "logps/chosen": -2899.793212890625, "logps/rejected": -3406.771240234375, "loss": 0.0157, "rewards/accuracies": 0.984375, "rewards/chosen": 7.380945682525635, "rewards/margins": 106.26220703125, "rewards/rejected": -98.88125610351562, "step": 60 }, { "epoch": 1.3595505617977528, "grad_norm": 50.00154495239258, "learning_rate": 1.9711150680329234e-06, "logits/chosen": 1.6642662286758423, "logits/rejected": 1.473952054977417, "logps/chosen": -2834.24072265625, "logps/rejected": -3363.942138671875, "loss": 0.0175, "rewards/accuracies": 0.984375, "rewards/chosen": 8.414569854736328, "rewards/margins": 110.77262115478516, "rewards/rejected": -102.35804748535156, "step": 61 }, { "epoch": 1.3820224719101124, "grad_norm": 0.07520447671413422, "learning_rate": 1.9330379210094315e-06, "logits/chosen": 1.5798277854919434, "logits/rejected": 1.4446996450424194, "logps/chosen": -2692.41162109375, "logps/rejected": -3175.50830078125, "loss": 0.0118, "rewards/accuracies": 0.984375, "rewards/chosen": 5.677203178405762, "rewards/margins": 96.32395935058594, "rewards/rejected": -90.64675903320312, "step": 62 }, { "epoch": 1.404494382022472, "grad_norm": 3.16860032081604, "learning_rate": 1.8946538463412818e-06, "logits/chosen": 1.606536865234375, "logits/rejected": 1.5855745077133179, "logps/chosen": -2659.635986328125, "logps/rejected": -3431.36572265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.329705238342285, "rewards/margins": 98.20384216308594, "rewards/rejected": -87.87415313720703, "step": 63 }, { "epoch": 1.4269662921348314, "grad_norm": 0.042245469987392426, "learning_rate": 1.8559900498066726e-06, "logits/chosen": 1.605839490890503, "logits/rejected": 1.3888914585113525, "logps/chosen": -2774.67529296875, "logps/rejected": -3620.492431640625, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 14.000102996826172, "rewards/margins": 140.67535400390625, "rewards/rejected": -126.67523956298828, "step": 64 }, { "epoch": 1.449438202247191, "grad_norm": 28.373090744018555, "learning_rate": 1.8170739354444366e-06, "logits/chosen": 1.5468522310256958, "logits/rejected": 1.316043734550476, "logps/chosen": -2898.541015625, "logps/rejected": -3607.741943359375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 9.336808204650879, "rewards/margins": 125.04135131835938, "rewards/rejected": -115.70454406738281, "step": 65 }, { "epoch": 1.4719101123595506, "grad_norm": 3.688307046890259, "learning_rate": 1.7779330861306717e-06, "logits/chosen": 1.4648973941802979, "logits/rejected": 1.3168296813964844, "logps/chosen": -3060.658935546875, "logps/rejected": -4020.65185546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.3615617752075195, "rewards/margins": 130.01849365234375, "rewards/rejected": -126.65692138671875, "step": 66 }, { "epoch": 1.49438202247191, "grad_norm": 21.308137893676758, "learning_rate": 1.738595244028608e-06, "logits/chosen": 1.4748642444610596, "logits/rejected": 1.3131040334701538, "logps/chosen": -2794.14599609375, "logps/rejected": -3351.5478515625, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 2.8835487365722656, "rewards/margins": 98.07205963134766, "rewards/rejected": -95.18850708007812, "step": 67 }, { "epoch": 1.5168539325842696, "grad_norm": 1.3383527994155884, "learning_rate": 1.699088290925583e-06, "logits/chosen": 1.372517704963684, "logits/rejected": 1.302228569984436, "logps/chosen": -2794.654052734375, "logps/rejected": -3820.33837890625, "loss": 0.0112, "rewards/accuracies": 0.984375, "rewards/chosen": 9.68542766571045, "rewards/margins": 141.4244842529297, "rewards/rejected": -131.73907470703125, "step": 68 }, { "epoch": 1.5393258426966292, "grad_norm": 1.4769072532653809, "learning_rate": 1.6594402284710481e-06, "logits/chosen": 1.5602664947509766, "logits/rejected": 1.4328043460845947, "logps/chosen": -2850.06640625, "logps/rejected": -3549.932861328125, "loss": 0.026, "rewards/accuracies": 0.984375, "rewards/chosen": 5.793665409088135, "rewards/margins": 124.38016510009766, "rewards/rejected": -118.58650970458984, "step": 69 }, { "epoch": 1.5617977528089888, "grad_norm": 5.262300968170166, "learning_rate": 1.6196791583296247e-06, "logits/chosen": 1.4012134075164795, "logits/rejected": 1.2154825925827026, "logps/chosen": -2862.569580078125, "logps/rejected": -3687.36328125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 12.932228088378906, "rewards/margins": 135.03558349609375, "rewards/rejected": -122.10337829589844, "step": 70 }, { "epoch": 1.5842696629213484, "grad_norm": 2.9438984394073486, "learning_rate": 1.579833262263268e-06, "logits/chosen": 1.4590383768081665, "logits/rejected": 1.1356399059295654, "logps/chosen": -2651.068603515625, "logps/rejected": -3142.91455078125, "loss": 0.0118, "rewards/accuracies": 0.984375, "rewards/chosen": 9.391037940979004, "rewards/margins": 119.59295654296875, "rewards/rejected": -110.2019271850586, "step": 71 }, { "epoch": 1.606741573033708, "grad_norm": 0.6242117881774902, "learning_rate": 1.5399307821566623e-06, "logits/chosen": 1.5220391750335693, "logits/rejected": 1.2139172554016113, "logps/chosen": -2834.0634765625, "logps/rejected": -3674.3623046875, "loss": 0.0218, "rewards/accuracies": 0.984375, "rewards/chosen": 14.53393268585205, "rewards/margins": 154.6046142578125, "rewards/rejected": -140.0706787109375, "step": 72 }, { "epoch": 1.6292134831460674, "grad_norm": 0.17758429050445557, "learning_rate": 1.5e-06, "logits/chosen": 1.531368374824524, "logits/rejected": 1.3681552410125732, "logps/chosen": -2943.841064453125, "logps/rejected": -3831.00927734375, "loss": 0.0117, "rewards/accuracies": 0.984375, "rewards/chosen": 11.650660514831543, "rewards/margins": 151.18350219726562, "rewards/rejected": -139.5328369140625, "step": 73 }, { "epoch": 1.651685393258427, "grad_norm": 12.694519996643066, "learning_rate": 1.460069217843338e-06, "logits/chosen": 1.416333794593811, "logits/rejected": 1.1884994506835938, "logps/chosen": -3090.49658203125, "logps/rejected": -3794.48095703125, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 12.209739685058594, "rewards/margins": 145.9217529296875, "rewards/rejected": -133.71200561523438, "step": 74 }, { "epoch": 1.6741573033707864, "grad_norm": 5.181153774261475, "learning_rate": 1.4201667377367324e-06, "logits/chosen": 1.5291459560394287, "logits/rejected": 1.390205979347229, "logps/chosen": -2819.557861328125, "logps/rejected": -3400.41748046875, "loss": 0.0112, "rewards/accuracies": 0.984375, "rewards/chosen": 6.913262367248535, "rewards/margins": 108.99024200439453, "rewards/rejected": -102.07699584960938, "step": 75 }, { "epoch": 1.696629213483146, "grad_norm": 5.866981506347656, "learning_rate": 1.3803208416703752e-06, "logits/chosen": 1.509679913520813, "logits/rejected": 1.3863307237625122, "logps/chosen": -2517.104736328125, "logps/rejected": -3187.1181640625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 6.015058517456055, "rewards/margins": 110.0936508178711, "rewards/rejected": -104.07859802246094, "step": 76 }, { "epoch": 1.7191011235955056, "grad_norm": 3.792738199234009, "learning_rate": 1.3405597715289522e-06, "logits/chosen": 1.4075974225997925, "logits/rejected": 1.297675609588623, "logps/chosen": -3116.082275390625, "logps/rejected": -3820.78271484375, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 6.922908782958984, "rewards/margins": 124.51133728027344, "rewards/rejected": -117.58842468261719, "step": 77 }, { "epoch": 1.7415730337078652, "grad_norm": 8.345385551452637, "learning_rate": 1.3009117090744173e-06, "logits/chosen": 1.5826494693756104, "logits/rejected": 1.2875326871871948, "logps/chosen": -2909.03515625, "logps/rejected": -3438.2587890625, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": 8.310379981994629, "rewards/margins": 140.91641235351562, "rewards/rejected": -132.6060333251953, "step": 78 }, { "epoch": 1.7640449438202248, "grad_norm": 0.4116104245185852, "learning_rate": 1.2614047559713923e-06, "logits/chosen": 1.4220818281173706, "logits/rejected": 1.2691839933395386, "logps/chosen": -3212.60693359375, "logps/rejected": -3793.721435546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.4821667671203613, "rewards/margins": 128.71267700195312, "rewards/rejected": -126.23049926757812, "step": 79 }, { "epoch": 1.7865168539325844, "grad_norm": 0.8209803700447083, "learning_rate": 1.2220669138693288e-06, "logits/chosen": 1.3909624814987183, "logits/rejected": 1.1474812030792236, "logps/chosen": -2994.385009765625, "logps/rejected": -3750.771728515625, "loss": 0.0112, "rewards/accuracies": 0.984375, "rewards/chosen": 9.527303695678711, "rewards/margins": 137.7163543701172, "rewards/rejected": -128.18905639648438, "step": 80 }, { "epoch": 1.8089887640449438, "grad_norm": 1.4425156116485596, "learning_rate": 1.1829260645555634e-06, "logits/chosen": 1.3281006813049316, "logits/rejected": 1.039908766746521, "logps/chosen": -3059.208251953125, "logps/rejected": -3867.33349609375, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 12.086620330810547, "rewards/margins": 160.84959411621094, "rewards/rejected": -148.76295471191406, "step": 81 }, { "epoch": 1.8314606741573034, "grad_norm": 0.7217972278594971, "learning_rate": 1.1440099501933277e-06, "logits/chosen": 1.3363004922866821, "logits/rejected": 1.2744730710983276, "logps/chosen": -3156.716796875, "logps/rejected": -4011.334716796875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 4.8549346923828125, "rewards/margins": 134.17984008789062, "rewards/rejected": -129.3249053955078, "step": 82 }, { "epoch": 1.8539325842696628, "grad_norm": 1.5164899826049805, "learning_rate": 1.1053461536587183e-06, "logits/chosen": 1.4580892324447632, "logits/rejected": 1.2366647720336914, "logps/chosen": -2984.4619140625, "logps/rejected": -3910.234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 9.195051193237305, "rewards/margins": 148.3942413330078, "rewards/rejected": -139.1991729736328, "step": 83 }, { "epoch": 1.8764044943820224, "grad_norm": 3.071080446243286, "learning_rate": 1.0669620789905688e-06, "logits/chosen": 1.5336228609085083, "logits/rejected": 1.3450926542282104, "logps/chosen": -2671.64892578125, "logps/rejected": -3312.888427734375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 5.30421257019043, "rewards/margins": 96.9708023071289, "rewards/rejected": -91.66659545898438, "step": 84 }, { "epoch": 1.898876404494382, "grad_norm": 0.2966591715812683, "learning_rate": 1.0288849319670773e-06, "logits/chosen": 1.5615055561065674, "logits/rejected": 1.4262051582336426, "logps/chosen": -2924.010498046875, "logps/rejected": -3439.7509765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 4.811070919036865, "rewards/margins": 107.32271575927734, "rewards/rejected": -102.51164245605469, "step": 85 }, { "epoch": 1.9213483146067416, "grad_norm": 0.05935266241431236, "learning_rate": 9.911417008229545e-07, "logits/chosen": 1.4063825607299805, "logits/rejected": 1.1860499382019043, "logps/chosen": -2746.5126953125, "logps/rejected": -3493.92578125, "loss": 0.0325, "rewards/accuracies": 0.953125, "rewards/chosen": 11.67589282989502, "rewards/margins": 137.2821502685547, "rewards/rejected": -125.60626220703125, "step": 86 }, { "epoch": 1.9438202247191012, "grad_norm": 0.21089386940002441, "learning_rate": 9.537591371207668e-07, "logits/chosen": 1.5266857147216797, "logits/rejected": 1.4005635976791382, "logps/chosen": -2387.665771484375, "logps/rejected": -3293.546630859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.131157875061035, "rewards/margins": 137.9029083251953, "rewards/rejected": -132.77175903320312, "step": 87 }, { "epoch": 1.9662921348314608, "grad_norm": 0.4727032780647278, "learning_rate": 9.167637367900192e-07, "logits/chosen": 1.5321190357208252, "logits/rejected": 1.3832690715789795, "logps/chosen": -2469.994384765625, "logps/rejected": -3097.712890625, "loss": 0.0117, "rewards/accuracies": 0.984375, "rewards/chosen": 13.177355766296387, "rewards/margins": 116.04686737060547, "rewards/rejected": -102.8695068359375, "step": 88 }, { "epoch": 1.9887640449438202, "grad_norm": 0.39027953147888184, "learning_rate": 8.801817213474331e-07, "logits/chosen": 1.5794587135314941, "logits/rejected": 1.3486638069152832, "logps/chosen": -2815.1982421875, "logps/rejected": -3435.67919921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 8.544872283935547, "rewards/margins": 112.28601837158203, "rewards/rejected": -103.74114227294922, "step": 89 }, { "epoch": 2.0, "grad_norm": 0.14720159769058228, "learning_rate": 8.44039019311717e-07, "logits/chosen": 1.492700457572937, "logits/rejected": 1.3120732307434082, "logps/chosen": -3285.24267578125, "logps/rejected": -3985.763916015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 11.00776481628418, "rewards/margins": 157.06927490234375, "rewards/rejected": -146.06150817871094, "step": 90 }, { "epoch": 2.0224719101123596, "grad_norm": 0.019609661772847176, "learning_rate": 8.08361247826011e-07, "logits/chosen": 1.3633915185928345, "logits/rejected": 1.1915699243545532, "logps/chosen": -3307.618408203125, "logps/rejected": -4103.1875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.502930760383606, "rewards/margins": 150.0188446044922, "rewards/rejected": -151.52178955078125, "step": 91 }, { "epoch": 2.044943820224719, "grad_norm": 0.026041870936751366, "learning_rate": 7.731736945010249e-07, "logits/chosen": 1.4235529899597168, "logits/rejected": 1.0836195945739746, "logps/chosen": -3224.001708984375, "logps/rejected": -3803.459228515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.049484252929688, "rewards/margins": 149.46070861816406, "rewards/rejected": -140.41123962402344, "step": 92 }, { "epoch": 2.067415730337079, "grad_norm": 0.36662229895591736, "learning_rate": 7.385012994917405e-07, "logits/chosen": 1.461303949356079, "logits/rejected": 1.401003360748291, "logps/chosen": -2710.856689453125, "logps/rejected": -3409.259765625, "loss": 0.0109, "rewards/accuracies": 0.984375, "rewards/chosen": 5.063204765319824, "rewards/margins": 96.820068359375, "rewards/rejected": -91.75686645507812, "step": 93 }, { "epoch": 2.0898876404494384, "grad_norm": 0.22327114641666412, "learning_rate": 7.043686378203864e-07, "logits/chosen": 1.5914536714553833, "logits/rejected": 1.3907164335250854, "logps/chosen": -2657.873291015625, "logps/rejected": -3420.0283203125, "loss": 0.0109, "rewards/accuracies": 0.984375, "rewards/chosen": 12.433341979980469, "rewards/margins": 118.74362182617188, "rewards/rejected": -106.31027221679688, "step": 94 }, { "epoch": 2.1123595505617976, "grad_norm": 0.006661942228674889, "learning_rate": 6.707999019582104e-07, "logits/chosen": 1.4297124147415161, "logits/rejected": 1.2694649696350098, "logps/chosen": -2567.587890625, "logps/rejected": -3557.106201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.91953182220459, "rewards/margins": 146.32005310058594, "rewards/rejected": -138.4005126953125, "step": 95 }, { "epoch": 2.134831460674157, "grad_norm": 0.010272935964167118, "learning_rate": 6.378188846783898e-07, "logits/chosen": 1.584874153137207, "logits/rejected": 1.3883558511734009, "logps/chosen": -2836.077880859375, "logps/rejected": -3408.93115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.626905918121338, "rewards/margins": 121.95980834960938, "rewards/rejected": -115.33291625976562, "step": 96 }, { "epoch": 2.157303370786517, "grad_norm": 0.006059441715478897, "learning_rate": 6.054489621922477e-07, "logits/chosen": 1.6233469247817993, "logits/rejected": 1.4364811182022095, "logps/chosen": -2997.014404296875, "logps/rejected": -3488.54150390625, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": 12.179953575134277, "rewards/margins": 123.74882507324219, "rewards/rejected": -111.56886291503906, "step": 97 }, { "epoch": 2.1797752808988764, "grad_norm": 0.23592473566532135, "learning_rate": 5.737130775807122e-07, "logits/chosen": 1.4150291681289673, "logits/rejected": 1.3036937713623047, "logps/chosen": -2623.100830078125, "logps/rejected": -3417.743408203125, "loss": 0.011, "rewards/accuracies": 0.984375, "rewards/chosen": 9.777491569519043, "rewards/margins": 126.9278335571289, "rewards/rejected": -117.15032196044922, "step": 98 }, { "epoch": 2.202247191011236, "grad_norm": 0.0040085772052407265, "learning_rate": 5.426337245327703e-07, "logits/chosen": 1.3026162385940552, "logits/rejected": 1.194283127784729, "logps/chosen": -2882.58154296875, "logps/rejected": -3794.05078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.322346687316895, "rewards/margins": 140.7698211669922, "rewards/rejected": -130.44747924804688, "step": 99 }, { "epoch": 2.2247191011235956, "grad_norm": 0.005036317277699709, "learning_rate": 5.122329314024422e-07, "logits/chosen": 1.4347069263458252, "logits/rejected": 1.2561771869659424, "logps/chosen": -2425.357177734375, "logps/rejected": -3138.833740234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.752297401428223, "rewards/margins": 120.6755599975586, "rewards/rejected": -106.92326354980469, "step": 100 }, { "epoch": 2.247191011235955, "grad_norm": 0.267286479473114, "learning_rate": 4.825322455955759e-07, "logits/chosen": 1.376643419265747, "logits/rejected": 1.2739124298095703, "logps/chosen": -2709.716796875, "logps/rejected": -3520.384765625, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": 10.822145462036133, "rewards/margins": 141.28472900390625, "rewards/rejected": -130.4625701904297, "step": 101 }, { "epoch": 2.2696629213483144, "grad_norm": 0.37806662917137146, "learning_rate": 4.5355271829752307e-07, "logits/chosen": 1.4881722927093506, "logits/rejected": 1.346581220626831, "logps/chosen": -2821.6923828125, "logps/rejected": -3442.4619140625, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": 9.021244049072266, "rewards/margins": 126.26439666748047, "rewards/rejected": -117.2431640625, "step": 102 }, { "epoch": 2.292134831460674, "grad_norm": 0.0023486721329391003, "learning_rate": 4.2531488955252726e-07, "logits/chosen": 1.4559850692749023, "logits/rejected": 1.1960179805755615, "logps/chosen": -2982.266357421875, "logps/rejected": -3776.720458984375, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": 13.267072677612305, "rewards/margins": 156.5282440185547, "rewards/rejected": -143.26113891601562, "step": 103 }, { "epoch": 2.3146067415730336, "grad_norm": 0.006942716892808676, "learning_rate": 3.978387737053994e-07, "logits/chosen": 1.5748894214630127, "logits/rejected": 1.4408270120620728, "logps/chosen": -2752.75634765625, "logps/rejected": -3425.216064453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.169326782226562, "rewards/margins": 107.41685485839844, "rewards/rejected": -92.24752807617188, "step": 104 }, { "epoch": 2.337078651685393, "grad_norm": 0.1621246337890625, "learning_rate": 3.7114384521579234e-07, "logits/chosen": 1.6052483320236206, "logits/rejected": 1.446576714515686, "logps/chosen": -2733.099609375, "logps/rejected": -3558.54931640625, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 6.2836151123046875, "rewards/margins": 120.5184326171875, "rewards/rejected": -114.23482513427734, "step": 105 }, { "epoch": 2.359550561797753, "grad_norm": 0.0010318144923076034, "learning_rate": 3.4524902485514043e-07, "logits/chosen": 1.5261331796646118, "logits/rejected": 1.2617827653884888, "logps/chosen": -2832.090576171875, "logps/rejected": -3448.433837890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.34963607788086, "rewards/margins": 127.82434844970703, "rewards/rejected": -119.47471618652344, "step": 106 }, { "epoch": 2.3820224719101124, "grad_norm": 0.001886666170321405, "learning_rate": 3.201726662960363e-07, "logits/chosen": 1.4487926959991455, "logits/rejected": 1.2953495979309082, "logps/chosen": -2931.4873046875, "logps/rejected": -3765.528564453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.4385576248168945, "rewards/margins": 141.048583984375, "rewards/rejected": -135.6100311279297, "step": 107 }, { "epoch": 2.404494382022472, "grad_norm": 0.0003725312708411366, "learning_rate": 2.9593254310355485e-07, "logits/chosen": 1.5249533653259277, "logits/rejected": 1.36188805103302, "logps/chosen": -2958.6279296875, "logps/rejected": -3625.80859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.046311378479004, "rewards/margins": 136.48867797851562, "rewards/rejected": -128.44235229492188, "step": 108 }, { "epoch": 2.4269662921348316, "grad_norm": 0.0058527453802526, "learning_rate": 2.725458361377465e-07, "logits/chosen": 1.449507236480713, "logits/rejected": 1.195552110671997, "logps/chosen": -3101.913330078125, "logps/rejected": -3919.42626953125, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": 9.668648719787598, "rewards/margins": 170.04879760742188, "rewards/rejected": -160.38015747070312, "step": 109 }, { "epoch": 2.449438202247191, "grad_norm": 0.004259227309376001, "learning_rate": 2.5002912137622743e-07, "logits/chosen": 1.3936243057250977, "logits/rejected": 1.1740200519561768, "logps/chosen": -2701.333740234375, "logps/rejected": -3472.6923828125, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": 11.122644424438477, "rewards/margins": 145.8236083984375, "rewards/rejected": -134.70095825195312, "step": 110 }, { "epoch": 2.4719101123595504, "grad_norm": 0.010651292279362679, "learning_rate": 2.2839835816549365e-07, "logits/chosen": 1.711632490158081, "logits/rejected": 1.4845446348190308, "logps/chosen": -3014.84912109375, "logps/rejected": -3401.6298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 8.185779571533203, "rewards/margins": 117.65122985839844, "rewards/rejected": -109.4654541015625, "step": 111 }, { "epoch": 2.49438202247191, "grad_norm": 0.21365472674369812, "learning_rate": 2.0766887790929072e-07, "logits/chosen": 1.5201102495193481, "logits/rejected": 1.3360121250152588, "logps/chosen": -2596.279296875, "logps/rejected": -3536.295166015625, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": 9.575386047363281, "rewards/margins": 136.92886352539062, "rewards/rejected": -127.35346221923828, "step": 112 }, { "epoch": 2.5168539325842696, "grad_norm": 0.06359975039958954, "learning_rate": 1.8785537320205808e-07, "logits/chosen": 1.4054570198059082, "logits/rejected": 1.304233431816101, "logps/chosen": -2882.770263671875, "logps/rejected": -3637.910888671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 9.500956535339355, "rewards/margins": 114.78219604492188, "rewards/rejected": -105.28123474121094, "step": 113 }, { "epoch": 2.539325842696629, "grad_norm": 0.039696987718343735, "learning_rate": 1.6897188741514286e-07, "logits/chosen": 1.3486000299453735, "logits/rejected": 1.2321511507034302, "logps/chosen": -2972.344970703125, "logps/rejected": -3984.229248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.131504535675049, "rewards/margins": 162.7792205810547, "rewards/rejected": -157.64772033691406, "step": 114 }, { "epoch": 2.561797752808989, "grad_norm": 0.002948309760540724, "learning_rate": 1.510318047431713e-07, "logits/chosen": 1.4727129936218262, "logits/rejected": 1.3785285949707031, "logps/chosen": -2675.683837890625, "logps/rejected": -3297.158447265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.861666679382324, "rewards/margins": 110.47186279296875, "rewards/rejected": -102.61019134521484, "step": 115 }, { "epoch": 2.5842696629213484, "grad_norm": 0.07731137424707413, "learning_rate": 1.3404784071763015e-07, "logits/chosen": 1.4941082000732422, "logits/rejected": 1.4053186178207397, "logps/chosen": -2728.80615234375, "logps/rejected": -3415.1708984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 10.857705116271973, "rewards/margins": 109.21708679199219, "rewards/rejected": -98.35939025878906, "step": 116 }, { "epoch": 2.606741573033708, "grad_norm": 0.01123058795928955, "learning_rate": 1.1803203319438056e-07, "logits/chosen": 1.4337643384933472, "logits/rejected": 1.2645751237869263, "logps/chosen": -2684.67041015625, "logps/rejected": -3446.0908203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 13.534300804138184, "rewards/margins": 135.90628051757812, "rewards/rejected": -122.37198638916016, "step": 117 }, { "epoch": 2.629213483146067, "grad_norm": 0.7818881869316101, "learning_rate": 1.0299573382149235e-07, "logits/chosen": 1.4340091943740845, "logits/rejected": 1.2151674032211304, "logps/chosen": -3169.663330078125, "logps/rejected": -4115.5751953125, "loss": 0.0219, "rewards/accuracies": 0.984375, "rewards/chosen": 11.765824317932129, "rewards/margins": 178.14181518554688, "rewards/rejected": -166.37596130371094, "step": 118 }, { "epoch": 2.6516853932584272, "grad_norm": 0.11178380995988846, "learning_rate": 8.894959999345015e-08, "logits/chosen": 1.4085586071014404, "logits/rejected": 1.317073941230774, "logps/chosen": -2706.8623046875, "logps/rejected": -3629.9091796875, "loss": 0.0109, "rewards/accuracies": 1.0, "rewards/chosen": 6.750637531280518, "rewards/margins": 140.9330291748047, "rewards/rejected": -134.18240356445312, "step": 119 }, { "epoch": 2.6741573033707864, "grad_norm": 0.009486271999776363, "learning_rate": 7.590358729742808e-08, "logits/chosen": 1.5044245719909668, "logits/rejected": 1.3787866830825806, "logps/chosen": -2867.752197265625, "logps/rejected": -3833.509765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.230460166931152, "rewards/margins": 134.28904724121094, "rewards/rejected": -128.05857849121094, "step": 120 }, { "epoch": 2.696629213483146, "grad_norm": 0.009250489063560963, "learning_rate": 6.386694245699181e-08, "logits/chosen": 1.5157657861709595, "logits/rejected": 1.2433254718780518, "logps/chosen": -3022.373046875, "logps/rejected": -3732.22900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.7557570934295654, "rewards/margins": 130.84677124023438, "rewards/rejected": -128.0910186767578, "step": 121 }, { "epoch": 2.7191011235955056, "grad_norm": 0.1917319893836975, "learning_rate": 5.284819677822611e-08, "logits/chosen": 1.6072005033493042, "logits/rejected": 1.528849720954895, "logps/chosen": -2894.672119140625, "logps/rejected": -3495.853515625, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": 3.3133740425109863, "rewards/margins": 105.75206756591797, "rewards/rejected": -102.43870544433594, "step": 122 }, { "epoch": 2.741573033707865, "grad_norm": 0.03384300321340561, "learning_rate": 4.285516010293522e-08, "logits/chosen": 1.4517195224761963, "logits/rejected": 1.3014264106750488, "logps/chosen": -2851.070556640625, "logps/rejected": -3593.665771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.4544267654418945, "rewards/margins": 122.42274475097656, "rewards/rejected": -114.96832275390625, "step": 123 }, { "epoch": 2.764044943820225, "grad_norm": 0.24889694154262543, "learning_rate": 3.389491527319999e-08, "logits/chosen": 1.4583051204681396, "logits/rejected": 1.2614139318466187, "logps/chosen": -2827.8134765625, "logps/rejected": -3561.30810546875, "loss": 0.0217, "rewards/accuracies": 0.984375, "rewards/chosen": 0.6058197617530823, "rewards/margins": 129.5867919921875, "rewards/rejected": -128.98095703125, "step": 124 }, { "epoch": 2.7865168539325844, "grad_norm": 0.06888113170862198, "learning_rate": 2.5973813111218548e-08, "logits/chosen": 1.529250144958496, "logits/rejected": 1.247063159942627, "logps/chosen": -2882.323974609375, "logps/rejected": -3656.96044921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 9.58204174041748, "rewards/margins": 154.1719970703125, "rewards/rejected": -144.5899658203125, "step": 125 }, { "epoch": 2.808988764044944, "grad_norm": 0.0029755791183561087, "learning_rate": 1.909746791798317e-08, "logits/chosen": 1.4555425643920898, "logits/rejected": 1.2920844554901123, "logps/chosen": -2807.64208984375, "logps/rejected": -3475.54931640625, "loss": 0.0217, "rewards/accuracies": 0.984375, "rewards/chosen": 5.643215179443359, "rewards/margins": 125.7391128540039, "rewards/rejected": -120.09590148925781, "step": 126 }, { "epoch": 2.831460674157303, "grad_norm": 0.009821542538702488, "learning_rate": 1.3270753493989374e-08, "logits/chosen": 1.535863995552063, "logits/rejected": 1.3580735921859741, "logps/chosen": -2754.88818359375, "logps/rejected": -3732.697021484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 7.623423099517822, "rewards/margins": 136.6768035888672, "rewards/rejected": -129.05337524414062, "step": 127 }, { "epoch": 2.853932584269663, "grad_norm": 0.5018057227134705, "learning_rate": 8.49779968479436e-09, "logits/chosen": 1.3728063106536865, "logits/rejected": 1.154386281967163, "logps/chosen": -3219.5546875, "logps/rejected": -3955.0615234375, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 2.1031904220581055, "rewards/margins": 135.11688232421875, "rewards/rejected": -133.01368713378906, "step": 128 }, { "epoch": 2.8764044943820224, "grad_norm": 0.0029928251169621944, "learning_rate": 4.781989453874814e-09, "logits/chosen": 1.589327335357666, "logits/rejected": 1.44749116897583, "logps/chosen": -2659.24462890625, "logps/rejected": -3233.244873046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 10.386514663696289, "rewards/margins": 102.26481628417969, "rewards/rejected": -91.87830352783203, "step": 129 }, { "epoch": 2.898876404494382, "grad_norm": 0.009541017934679985, "learning_rate": 2.1259564848570834e-09, "logits/chosen": 1.5677722692489624, "logits/rejected": 1.2758667469024658, "logps/chosen": -2889.547607421875, "logps/rejected": -3603.37109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 15.972006797790527, "rewards/margins": 140.3019256591797, "rewards/rejected": -124.32991790771484, "step": 130 }, { "epoch": 2.9213483146067416, "grad_norm": 0.007502752356231213, "learning_rate": 5.315833148210603e-10, "logits/chosen": 1.6323837041854858, "logits/rejected": 1.446678876876831, "logps/chosen": -2922.07568359375, "logps/rejected": -3691.432373046875, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": 12.317670822143555, "rewards/margins": 135.18690490722656, "rewards/rejected": -122.86924743652344, "step": 131 }, { "epoch": 2.943820224719101, "grad_norm": 0.2958358824253082, "learning_rate": 0.0, "logits/chosen": 1.4742579460144043, "logits/rejected": 1.2774202823638916, "logps/chosen": -2621.55615234375, "logps/rejected": -3527.73193359375, "loss": 0.0217, "rewards/accuracies": 0.984375, "rewards/chosen": 11.16303539276123, "rewards/margins": 133.13824462890625, "rewards/rejected": -121.9752197265625, "step": 132 }, { "epoch": 2.943820224719101, "step": 132, "total_flos": 228521444442112.0, "train_loss": 0.17045999738028772, "train_runtime": 5166.54, "train_samples_per_second": 1.651, "train_steps_per_second": 0.026 } ], "logging_steps": 1, "max_steps": 132, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 228521444442112.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }