Qwen2-72B-SFT-Step-DPO / trainer_state.json
xinlai's picture
upload model
e350c62
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.982222222222222,
"eval_steps": 1,
"global_step": 336,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011851851851851851,
"grad_norm": 44.10000740195015,
"learning_rate": 1.4705882352941176e-08,
"logits/chosen": -1.1635093688964844,
"logits/rejected": -0.9440154433250427,
"logps/chosen": -26.389511108398438,
"logps/rejected": -42.156002044677734,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.023703703703703703,
"grad_norm": 45.622821831639094,
"learning_rate": 2.941176470588235e-08,
"logits/chosen": -0.8899029493331909,
"logits/rejected": -0.9265471696853638,
"logps/chosen": -24.45637321472168,
"logps/rejected": -38.72291564941406,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.035555555555555556,
"grad_norm": 41.287867804704256,
"learning_rate": 4.411764705882353e-08,
"logits/chosen": -0.9218576550483704,
"logits/rejected": -0.8510868549346924,
"logps/chosen": -23.573394775390625,
"logps/rejected": -31.830120086669922,
"loss": 0.6917,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.05094228684902191,
"rewards/margins": 0.055795177817344666,
"rewards/rejected": -0.004852890968322754,
"step": 3
},
{
"epoch": 0.047407407407407405,
"grad_norm": 41.148615147033524,
"learning_rate": 5.88235294117647e-08,
"logits/chosen": -0.8889421820640564,
"logits/rejected": -0.7832293510437012,
"logps/chosen": -27.102622985839844,
"logps/rejected": -32.83424377441406,
"loss": 0.692,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.005930736660957336,
"rewards/margins": 0.013045087456703186,
"rewards/rejected": -0.00711435079574585,
"step": 4
},
{
"epoch": 0.05925925925925926,
"grad_norm": 41.57192528486562,
"learning_rate": 7.352941176470588e-08,
"logits/chosen": -0.8269144296646118,
"logits/rejected": -0.8342342376708984,
"logps/chosen": -26.83285903930664,
"logps/rejected": -33.845359802246094,
"loss": 0.7004,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.002873659133911133,
"rewards/margins": 0.0599842369556427,
"rewards/rejected": -0.05711057782173157,
"step": 5
},
{
"epoch": 0.07111111111111111,
"grad_norm": 42.964145384550164,
"learning_rate": 8.823529411764706e-08,
"logits/chosen": -0.9288309216499329,
"logits/rejected": -0.9066528677940369,
"logps/chosen": -31.687969207763672,
"logps/rejected": -35.163841247558594,
"loss": 0.701,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.005395621061325073,
"rewards/margins": -0.016778230667114258,
"rewards/rejected": 0.02217385172843933,
"step": 6
},
{
"epoch": 0.08296296296296296,
"grad_norm": 38.3846396537961,
"learning_rate": 1.0294117647058822e-07,
"logits/chosen": -0.9132620096206665,
"logits/rejected": -0.7912867665290833,
"logps/chosen": -24.47614860534668,
"logps/rejected": -32.74094009399414,
"loss": 0.6924,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.02499394118785858,
"rewards/margins": 0.010348424315452576,
"rewards/rejected": 0.014645516872406006,
"step": 7
},
{
"epoch": 0.09481481481481481,
"grad_norm": 43.34894792705672,
"learning_rate": 1.176470588235294e-07,
"logits/chosen": -0.8170281648635864,
"logits/rejected": -0.8093118667602539,
"logps/chosen": -21.367229461669922,
"logps/rejected": -30.556249618530273,
"loss": 0.6943,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.021743685007095337,
"rewards/margins": 0.05349762737751007,
"rewards/rejected": -0.031753942370414734,
"step": 8
},
{
"epoch": 0.10666666666666667,
"grad_norm": 44.768590418142296,
"learning_rate": 1.3235294117647057e-07,
"logits/chosen": -0.8906874656677246,
"logits/rejected": -0.8358623385429382,
"logps/chosen": -27.88587760925293,
"logps/rejected": -30.677749633789062,
"loss": 0.7014,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.00475698709487915,
"rewards/margins": 0.035931557416915894,
"rewards/rejected": -0.031174570322036743,
"step": 9
},
{
"epoch": 0.11851851851851852,
"grad_norm": 41.386325746824284,
"learning_rate": 1.4705882352941175e-07,
"logits/chosen": -1.0302842855453491,
"logits/rejected": -0.8634576201438904,
"logps/chosen": -28.216838836669922,
"logps/rejected": -38.4200553894043,
"loss": 0.6967,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.02494041621685028,
"rewards/margins": 0.07226283848285675,
"rewards/rejected": -0.04732242226600647,
"step": 10
},
{
"epoch": 0.13037037037037036,
"grad_norm": 42.87170433913047,
"learning_rate": 1.6176470588235293e-07,
"logits/chosen": -0.8272039890289307,
"logits/rejected": -0.8201614618301392,
"logps/chosen": -24.542991638183594,
"logps/rejected": -33.56885528564453,
"loss": 0.7016,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.04099439084529877,
"rewards/margins": 0.02981768548488617,
"rewards/rejected": 0.011176705360412598,
"step": 11
},
{
"epoch": 0.14222222222222222,
"grad_norm": 41.54515829050869,
"learning_rate": 1.764705882352941e-07,
"logits/chosen": -0.8868040442466736,
"logits/rejected": -0.8360949158668518,
"logps/chosen": -29.391693115234375,
"logps/rejected": -39.35624694824219,
"loss": 0.6989,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.020398467779159546,
"rewards/margins": 0.034372299909591675,
"rewards/rejected": -0.013973832130432129,
"step": 12
},
{
"epoch": 0.15407407407407409,
"grad_norm": 40.960317074043914,
"learning_rate": 1.9117647058823527e-07,
"logits/chosen": -0.9931007623672485,
"logits/rejected": -0.9051375985145569,
"logps/chosen": -21.935997009277344,
"logps/rejected": -29.908475875854492,
"loss": 0.6973,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.006315797567367554,
"rewards/margins": 0.032275840640068054,
"rewards/rejected": -0.03859163820743561,
"step": 13
},
{
"epoch": 0.16592592592592592,
"grad_norm": 46.37678646749312,
"learning_rate": 2.0588235294117645e-07,
"logits/chosen": -0.736880898475647,
"logits/rejected": -0.6582351326942444,
"logps/chosen": -28.070615768432617,
"logps/rejected": -37.080623626708984,
"loss": 0.6942,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.0168197900056839,
"rewards/margins": 0.020758137106895447,
"rewards/rejected": -0.037577927112579346,
"step": 14
},
{
"epoch": 0.17777777777777778,
"grad_norm": 38.418772787456916,
"learning_rate": 2.2058823529411763e-07,
"logits/chosen": -0.8958194851875305,
"logits/rejected": -0.8823959827423096,
"logps/chosen": -24.240140914916992,
"logps/rejected": -36.511985778808594,
"loss": 0.6853,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.010584741830825806,
"rewards/margins": 0.013375014066696167,
"rewards/rejected": -0.023959755897521973,
"step": 15
},
{
"epoch": 0.18962962962962962,
"grad_norm": 39.784578323473944,
"learning_rate": 2.352941176470588e-07,
"logits/chosen": -1.044739007949829,
"logits/rejected": -0.9721382260322571,
"logps/chosen": -24.203937530517578,
"logps/rejected": -38.13182830810547,
"loss": 0.6644,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.008091084659099579,
"rewards/margins": 0.03979543596506119,
"rewards/rejected": -0.04788652062416077,
"step": 16
},
{
"epoch": 0.20148148148148148,
"grad_norm": 38.68619452262893,
"learning_rate": 2.5e-07,
"logits/chosen": -0.9131325483322144,
"logits/rejected": -0.9099739193916321,
"logps/chosen": -23.27505874633789,
"logps/rejected": -25.550016403198242,
"loss": 0.6639,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.04493655264377594,
"rewards/margins": 0.11525127291679382,
"rewards/rejected": -0.07031472027301788,
"step": 17
},
{
"epoch": 0.21333333333333335,
"grad_norm": 38.89353521239618,
"learning_rate": 2.6470588235294114e-07,
"logits/chosen": -1.1501476764678955,
"logits/rejected": -1.0104213953018188,
"logps/chosen": -28.398540496826172,
"logps/rejected": -40.202754974365234,
"loss": 0.6675,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0002931952476501465,
"rewards/margins": 0.1416773796081543,
"rewards/rejected": -0.14197057485580444,
"step": 18
},
{
"epoch": 0.22518518518518518,
"grad_norm": 39.73809119940035,
"learning_rate": 2.7941176470588235e-07,
"logits/chosen": -0.6393623948097229,
"logits/rejected": -0.5715636014938354,
"logps/chosen": -23.02471160888672,
"logps/rejected": -29.500215530395508,
"loss": 0.6618,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.012419655919075012,
"rewards/margins": 0.04500822722911835,
"rewards/rejected": -0.032588571310043335,
"step": 19
},
{
"epoch": 0.23703703703703705,
"grad_norm": 36.24445457135461,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": -1.090634822845459,
"logits/rejected": -1.0109808444976807,
"logps/chosen": -22.518497467041016,
"logps/rejected": -28.288860321044922,
"loss": 0.6407,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.004169940948486328,
"rewards/margins": 0.08663815259933472,
"rewards/rejected": -0.08246821165084839,
"step": 20
},
{
"epoch": 0.24888888888888888,
"grad_norm": 37.48843626542997,
"learning_rate": 3.088235294117647e-07,
"logits/chosen": -0.9397974610328674,
"logits/rejected": -0.8281663060188293,
"logps/chosen": -29.923145294189453,
"logps/rejected": -37.80279541015625,
"loss": 0.6361,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.010303795337677002,
"rewards/margins": 0.1979476809501648,
"rewards/rejected": -0.2082514762878418,
"step": 21
},
{
"epoch": 0.2607407407407407,
"grad_norm": 37.17562909629751,
"learning_rate": 3.2352941176470586e-07,
"logits/chosen": -0.8852977752685547,
"logits/rejected": -0.8319816589355469,
"logps/chosen": -23.00829315185547,
"logps/rejected": -28.55397605895996,
"loss": 0.6446,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.001584082841873169,
"rewards/margins": 0.16339415311813354,
"rewards/rejected": -0.1649782359600067,
"step": 22
},
{
"epoch": 0.2725925925925926,
"grad_norm": 35.98779991504583,
"learning_rate": 3.3823529411764707e-07,
"logits/chosen": -0.7651995420455933,
"logits/rejected": -0.7312899827957153,
"logps/chosen": -31.04439926147461,
"logps/rejected": -37.98454284667969,
"loss": 0.6453,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.023012787103652954,
"rewards/margins": 0.09663936495780945,
"rewards/rejected": -0.1196521520614624,
"step": 23
},
{
"epoch": 0.28444444444444444,
"grad_norm": 36.65037386431617,
"learning_rate": 3.529411764705882e-07,
"logits/chosen": -0.9652918577194214,
"logits/rejected": -0.9185481071472168,
"logps/chosen": -30.223522186279297,
"logps/rejected": -34.86516189575195,
"loss": 0.6319,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.025013744831085205,
"rewards/margins": 0.17298102378845215,
"rewards/rejected": -0.19799476861953735,
"step": 24
},
{
"epoch": 0.2962962962962963,
"grad_norm": 36.325213836190166,
"learning_rate": 3.6764705882352943e-07,
"logits/chosen": -0.8377700448036194,
"logits/rejected": -0.7563367486000061,
"logps/chosen": -19.788166046142578,
"logps/rejected": -32.94764709472656,
"loss": 0.603,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.00691574439406395,
"rewards/margins": 0.249167799949646,
"rewards/rejected": -0.25608354806900024,
"step": 25
},
{
"epoch": 0.30814814814814817,
"grad_norm": 33.21927808914221,
"learning_rate": 3.8235294117647053e-07,
"logits/chosen": -0.9247075319290161,
"logits/rejected": -0.9600427746772766,
"logps/chosen": -22.75655746459961,
"logps/rejected": -33.42902374267578,
"loss": 0.5963,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.012206077575683594,
"rewards/margins": 0.18068939447402954,
"rewards/rejected": -0.16848331689834595,
"step": 26
},
{
"epoch": 0.32,
"grad_norm": 33.919179281256405,
"learning_rate": 3.9705882352941174e-07,
"logits/chosen": -1.0090656280517578,
"logits/rejected": -0.8680551052093506,
"logps/chosen": -27.313983917236328,
"logps/rejected": -32.803958892822266,
"loss": 0.5868,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.006254285573959351,
"rewards/margins": 0.3513309061527252,
"rewards/rejected": -0.34507662057876587,
"step": 27
},
{
"epoch": 0.33185185185185184,
"grad_norm": 32.88800631538997,
"learning_rate": 4.117647058823529e-07,
"logits/chosen": -0.7507399320602417,
"logits/rejected": -0.6654347777366638,
"logps/chosen": -33.17474365234375,
"logps/rejected": -37.52992248535156,
"loss": 0.5582,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.018849045038223267,
"rewards/margins": 0.5149893164634705,
"rewards/rejected": -0.5338383913040161,
"step": 28
},
{
"epoch": 0.3437037037037037,
"grad_norm": 32.187131672205425,
"learning_rate": 4.264705882352941e-07,
"logits/chosen": -0.9114011526107788,
"logits/rejected": -0.7332407236099243,
"logps/chosen": -27.552963256835938,
"logps/rejected": -33.381103515625,
"loss": 0.569,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.034624576568603516,
"rewards/margins": 0.2657691240310669,
"rewards/rejected": -0.3003937005996704,
"step": 29
},
{
"epoch": 0.35555555555555557,
"grad_norm": 31.43465207056781,
"learning_rate": 4.4117647058823526e-07,
"logits/chosen": -1.080330491065979,
"logits/rejected": -1.018049716949463,
"logps/chosen": -24.93523406982422,
"logps/rejected": -33.0054817199707,
"loss": 0.5787,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.08209644258022308,
"rewards/margins": 0.2938240170478821,
"rewards/rejected": -0.37592047452926636,
"step": 30
},
{
"epoch": 0.3674074074074074,
"grad_norm": 29.8319071034986,
"learning_rate": 4.5588235294117646e-07,
"logits/chosen": -0.7354201078414917,
"logits/rejected": -0.5976296663284302,
"logps/chosen": -20.997676849365234,
"logps/rejected": -32.08062744140625,
"loss": 0.5421,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.06063076853752136,
"rewards/margins": 0.5570548176765442,
"rewards/rejected": -0.6176855564117432,
"step": 31
},
{
"epoch": 0.37925925925925924,
"grad_norm": 32.37223816472854,
"learning_rate": 4.705882352941176e-07,
"logits/chosen": -0.9014286398887634,
"logits/rejected": -0.868757963180542,
"logps/chosen": -23.115407943725586,
"logps/rejected": -39.159507751464844,
"loss": 0.549,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.0811956524848938,
"rewards/margins": 0.4106258153915405,
"rewards/rejected": -0.4918214678764343,
"step": 32
},
{
"epoch": 0.39111111111111113,
"grad_norm": 30.829704180417103,
"learning_rate": 4.852941176470588e-07,
"logits/chosen": -0.8415942788124084,
"logits/rejected": -0.826940655708313,
"logps/chosen": -25.28696060180664,
"logps/rejected": -36.247039794921875,
"loss": 0.5377,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.11577820032835007,
"rewards/margins": 0.5010173916816711,
"rewards/rejected": -0.616795539855957,
"step": 33
},
{
"epoch": 0.40296296296296297,
"grad_norm": 34.240433832755805,
"learning_rate": 5e-07,
"logits/chosen": -1.071217656135559,
"logits/rejected": -0.8587817549705505,
"logps/chosen": -23.079936981201172,
"logps/rejected": -32.364227294921875,
"loss": 0.554,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.06819352507591248,
"rewards/margins": 0.4207611680030823,
"rewards/rejected": -0.48895469307899475,
"step": 34
},
{
"epoch": 0.4148148148148148,
"grad_norm": 31.79158261280939,
"learning_rate": 4.999864732969518e-07,
"logits/chosen": -1.041569471359253,
"logits/rejected": -0.9538137912750244,
"logps/chosen": -29.438274383544922,
"logps/rejected": -35.4671745300293,
"loss": 0.5322,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.10224419832229614,
"rewards/margins": 0.5241090059280396,
"rewards/rejected": -0.6263532042503357,
"step": 35
},
{
"epoch": 0.4266666666666667,
"grad_norm": 28.443303374361268,
"learning_rate": 4.999458946515807e-07,
"logits/chosen": -1.1223492622375488,
"logits/rejected": -1.040766954421997,
"logps/chosen": -32.29949951171875,
"logps/rejected": -41.46755599975586,
"loss": 0.5017,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.0577593594789505,
"rewards/margins": 0.6482563018798828,
"rewards/rejected": -0.7060155868530273,
"step": 36
},
{
"epoch": 0.43851851851851853,
"grad_norm": 30.648927045340578,
"learning_rate": 4.998782684550491e-07,
"logits/chosen": -0.9065847992897034,
"logits/rejected": -0.8718705177307129,
"logps/chosen": -21.124893188476562,
"logps/rejected": -39.29669952392578,
"loss": 0.5147,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.09413473308086395,
"rewards/margins": 0.6028537154197693,
"rewards/rejected": -0.6969884634017944,
"step": 37
},
{
"epoch": 0.45037037037037037,
"grad_norm": 29.437195830990852,
"learning_rate": 4.997836020254328e-07,
"logits/chosen": -0.9325073957443237,
"logits/rejected": -0.8846120238304138,
"logps/chosen": -27.168790817260742,
"logps/rejected": -36.877262115478516,
"loss": 0.5122,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.15598426759243011,
"rewards/margins": 0.6510501503944397,
"rewards/rejected": -0.807034432888031,
"step": 38
},
{
"epoch": 0.4622222222222222,
"grad_norm": 28.44428517855095,
"learning_rate": 4.996619056069291e-07,
"logits/chosen": -0.8960347771644592,
"logits/rejected": -0.8378150463104248,
"logps/chosen": -28.43727684020996,
"logps/rejected": -40.62827682495117,
"loss": 0.4705,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.17505469918251038,
"rewards/margins": 0.8592283725738525,
"rewards/rejected": -1.0342830419540405,
"step": 39
},
{
"epoch": 0.4740740740740741,
"grad_norm": 29.94537092561941,
"learning_rate": 4.995131923687487e-07,
"logits/chosen": -0.9718501567840576,
"logits/rejected": -0.8560028076171875,
"logps/chosen": -29.755184173583984,
"logps/rejected": -37.2801399230957,
"loss": 0.4835,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.11063119769096375,
"rewards/margins": 0.7615076899528503,
"rewards/rejected": -0.8721388578414917,
"step": 40
},
{
"epoch": 0.48592592592592593,
"grad_norm": 26.638351450808948,
"learning_rate": 4.993374784036901e-07,
"logits/chosen": -1.006788969039917,
"logits/rejected": -0.8062241077423096,
"logps/chosen": -27.824739456176758,
"logps/rejected": -37.465415954589844,
"loss": 0.4489,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.16224287450313568,
"rewards/margins": 0.9281247854232788,
"rewards/rejected": -1.0903676748275757,
"step": 41
},
{
"epoch": 0.49777777777777776,
"grad_norm": 29.703403664234436,
"learning_rate": 4.991347827263982e-07,
"logits/chosen": -1.0439155101776123,
"logits/rejected": -0.8992699384689331,
"logps/chosen": -28.927303314208984,
"logps/rejected": -42.13187026977539,
"loss": 0.488,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.13541710376739502,
"rewards/margins": 0.8769669532775879,
"rewards/rejected": -1.0123839378356934,
"step": 42
},
{
"epoch": 0.5096296296296297,
"grad_norm": 25.194805243065485,
"learning_rate": 4.989051272713069e-07,
"logits/chosen": -0.9479715824127197,
"logits/rejected": -0.808491051197052,
"logps/chosen": -30.748804092407227,
"logps/rejected": -48.32786178588867,
"loss": 0.4055,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.0868428647518158,
"rewards/margins": 1.7449877262115479,
"rewards/rejected": -1.8318307399749756,
"step": 43
},
{
"epoch": 0.5214814814814814,
"grad_norm": 28.50704779191256,
"learning_rate": 4.986485368902656e-07,
"logits/chosen": -1.003732681274414,
"logits/rejected": -0.9534778594970703,
"logps/chosen": -25.17104148864746,
"logps/rejected": -36.80795669555664,
"loss": 0.4687,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.15720072388648987,
"rewards/margins": 0.7120774984359741,
"rewards/rejected": -0.8692781925201416,
"step": 44
},
{
"epoch": 0.5333333333333333,
"grad_norm": 26.654378912528262,
"learning_rate": 4.983650393498489e-07,
"logits/chosen": -0.9796334505081177,
"logits/rejected": -0.8810800313949585,
"logps/chosen": -34.67963790893555,
"logps/rejected": -37.48582077026367,
"loss": 0.4059,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.22126227617263794,
"rewards/margins": 1.05548095703125,
"rewards/rejected": -1.2767431735992432,
"step": 45
},
{
"epoch": 0.5451851851851852,
"grad_norm": 25.91641243212481,
"learning_rate": 4.980546653283537e-07,
"logits/chosen": -1.1144230365753174,
"logits/rejected": -0.9187833666801453,
"logps/chosen": -27.469764709472656,
"logps/rejected": -42.77268981933594,
"loss": 0.4794,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.20582953095436096,
"rewards/margins": 1.8931379318237305,
"rewards/rejected": -2.0989675521850586,
"step": 46
},
{
"epoch": 0.557037037037037,
"grad_norm": 27.616713081396448,
"learning_rate": 4.977174484124775e-07,
"logits/chosen": -0.9438971877098083,
"logits/rejected": -0.9460131525993347,
"logps/chosen": -28.729183197021484,
"logps/rejected": -30.642105102539062,
"loss": 0.4464,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.13018304109573364,
"rewards/margins": 0.8073292970657349,
"rewards/rejected": -0.9375122785568237,
"step": 47
},
{
"epoch": 0.5688888888888889,
"grad_norm": 26.228638287015333,
"learning_rate": 4.97353425093685e-07,
"logits/chosen": -1.2007321119308472,
"logits/rejected": -1.0530825853347778,
"logps/chosen": -25.535133361816406,
"logps/rejected": -35.96273422241211,
"loss": 0.4261,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.14477074146270752,
"rewards/margins": 1.4705314636230469,
"rewards/rejected": -1.615302324295044,
"step": 48
},
{
"epoch": 0.5807407407407408,
"grad_norm": 27.447706308710917,
"learning_rate": 4.96962634764259e-07,
"logits/chosen": -1.0324229001998901,
"logits/rejected": -1.000633955001831,
"logps/chosen": -31.232351303100586,
"logps/rejected": -40.054874420166016,
"loss": 0.4274,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.37540578842163086,
"rewards/margins": 0.9162301421165466,
"rewards/rejected": -1.2916358709335327,
"step": 49
},
{
"epoch": 0.5925925925925926,
"grad_norm": 27.537626334544292,
"learning_rate": 4.965451197130372e-07,
"logits/chosen": -1.0934017896652222,
"logits/rejected": -0.9698958396911621,
"logps/chosen": -25.604278564453125,
"logps/rejected": -41.89402770996094,
"loss": 0.4418,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.11558225750923157,
"rewards/margins": 1.3389551639556885,
"rewards/rejected": -1.4545375108718872,
"step": 50
},
{
"epoch": 0.6044444444444445,
"grad_norm": 26.396954082977054,
"learning_rate": 4.961009251208367e-07,
"logits/chosen": -1.071451187133789,
"logits/rejected": -0.9166553616523743,
"logps/chosen": -21.116607666015625,
"logps/rejected": -34.15024948120117,
"loss": 0.4173,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.06339044868946075,
"rewards/margins": 1.8111618757247925,
"rewards/rejected": -1.8745522499084473,
"step": 51
},
{
"epoch": 0.6162962962962963,
"grad_norm": 24.23948883073191,
"learning_rate": 4.956300990555643e-07,
"logits/chosen": -1.0040934085845947,
"logits/rejected": -0.8644249439239502,
"logps/chosen": -24.51968002319336,
"logps/rejected": -32.15287399291992,
"loss": 0.3977,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.1651010513305664,
"rewards/margins": 1.301413655281067,
"rewards/rejected": -1.4665147066116333,
"step": 52
},
{
"epoch": 0.6281481481481481,
"grad_norm": 29.178528579105812,
"learning_rate": 4.951326924670147e-07,
"logits/chosen": -0.8935304880142212,
"logits/rejected": -0.9188090562820435,
"logps/chosen": -29.823339462280273,
"logps/rejected": -42.743675231933594,
"loss": 0.4615,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.31552594900131226,
"rewards/margins": 1.0024209022521973,
"rewards/rejected": -1.3179469108581543,
"step": 53
},
{
"epoch": 0.64,
"grad_norm": 24.40363992735679,
"learning_rate": 4.94608759181358e-07,
"logits/chosen": -0.9994638562202454,
"logits/rejected": -0.8031306266784668,
"logps/chosen": -32.72019577026367,
"logps/rejected": -39.62814712524414,
"loss": 0.3302,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.1551201343536377,
"rewards/margins": 1.3950880765914917,
"rewards/rejected": -1.5502082109451294,
"step": 54
},
{
"epoch": 0.6518518518518519,
"grad_norm": 27.80698317557724,
"learning_rate": 4.940583558953137e-07,
"logits/chosen": -1.1568812131881714,
"logits/rejected": -1.083202838897705,
"logps/chosen": -28.588844299316406,
"logps/rejected": -46.40166091918945,
"loss": 0.4196,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.32859814167022705,
"rewards/margins": 1.721780776977539,
"rewards/rejected": -2.0503790378570557,
"step": 55
},
{
"epoch": 0.6637037037037037,
"grad_norm": 27.03342498011367,
"learning_rate": 4.934815421700164e-07,
"logits/chosen": -0.9664996266365051,
"logits/rejected": -0.9351974725723267,
"logps/chosen": -25.929637908935547,
"logps/rejected": -36.615997314453125,
"loss": 0.4234,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.14317776262760162,
"rewards/margins": 1.6834478378295898,
"rewards/rejected": -1.8266258239746094,
"step": 56
},
{
"epoch": 0.6755555555555556,
"grad_norm": 27.024427262923552,
"learning_rate": 4.928783804245699e-07,
"logits/chosen": -0.8274962902069092,
"logits/rejected": -0.745110273361206,
"logps/chosen": -32.589447021484375,
"logps/rejected": -34.72138977050781,
"loss": 0.3984,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.42406025528907776,
"rewards/margins": 0.8041820526123047,
"rewards/rejected": -1.2282423973083496,
"step": 57
},
{
"epoch": 0.6874074074074074,
"grad_norm": 24.14506468826234,
"learning_rate": 4.922489359292927e-07,
"logits/chosen": -0.920275866985321,
"logits/rejected": -0.7754595279693604,
"logps/chosen": -30.828351974487305,
"logps/rejected": -49.377220153808594,
"loss": 0.3514,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.3007601797580719,
"rewards/margins": 2.031721830368042,
"rewards/rejected": -2.33248233795166,
"step": 58
},
{
"epoch": 0.6992592592592592,
"grad_norm": 22.656374640286362,
"learning_rate": 4.915932767986551e-07,
"logits/chosen": -1.103749394416809,
"logits/rejected": -1.0164357423782349,
"logps/chosen": -26.017108917236328,
"logps/rejected": -43.8387565612793,
"loss": 0.3561,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.2933482527732849,
"rewards/margins": 1.7674319744110107,
"rewards/rejected": -2.0607800483703613,
"step": 59
},
{
"epoch": 0.7111111111111111,
"grad_norm": 24.99314823194104,
"learning_rate": 4.909114739839079e-07,
"logits/chosen": -0.9634025692939758,
"logits/rejected": -0.9252867102622986,
"logps/chosen": -23.952117919921875,
"logps/rejected": -34.92929458618164,
"loss": 0.3598,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.1855652928352356,
"rewards/margins": 1.803605079650879,
"rewards/rejected": -1.9891700744628906,
"step": 60
},
{
"epoch": 0.7229629629629629,
"grad_norm": 22.905046033248826,
"learning_rate": 4.902036012654048e-07,
"logits/chosen": -0.7937788963317871,
"logits/rejected": -0.7061766982078552,
"logps/chosen": -22.034412384033203,
"logps/rejected": -33.86552047729492,
"loss": 0.3401,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.3443925678730011,
"rewards/margins": 1.395371913909912,
"rewards/rejected": -1.73976469039917,
"step": 61
},
{
"epoch": 0.7348148148148148,
"grad_norm": 25.28725048216447,
"learning_rate": 4.894697352446182e-07,
"logits/chosen": -1.0165841579437256,
"logits/rejected": -1.0237828493118286,
"logps/chosen": -24.306283950805664,
"logps/rejected": -39.6012077331543,
"loss": 0.3453,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.09554791450500488,
"rewards/margins": 1.746566891670227,
"rewards/rejected": -1.8421146869659424,
"step": 62
},
{
"epoch": 0.7466666666666667,
"grad_norm": 27.905008683571545,
"learning_rate": 4.887099553358501e-07,
"logits/chosen": -1.087665319442749,
"logits/rejected": -0.9620079398155212,
"logps/chosen": -29.117008209228516,
"logps/rejected": -37.334896087646484,
"loss": 0.3946,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.18033871054649353,
"rewards/margins": 1.7729251384735107,
"rewards/rejected": -1.953263759613037,
"step": 63
},
{
"epoch": 0.7585185185185185,
"grad_norm": 26.563175740341975,
"learning_rate": 4.879243437576383e-07,
"logits/chosen": -1.0562440156936646,
"logits/rejected": -0.8816579580307007,
"logps/chosen": -23.48358726501465,
"logps/rejected": -34.346927642822266,
"loss": 0.369,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.34856918454170227,
"rewards/margins": 1.5337965488433838,
"rewards/rejected": -1.8823657035827637,
"step": 64
},
{
"epoch": 0.7703703703703704,
"grad_norm": 28.950708662099014,
"learning_rate": 4.871129855238588e-07,
"logits/chosen": -1.031766653060913,
"logits/rejected": -1.0294549465179443,
"logps/chosen": -31.139263153076172,
"logps/rejected": -41.21425247192383,
"loss": 0.3715,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.3050842881202698,
"rewards/margins": 1.8005170822143555,
"rewards/rejected": -2.1056013107299805,
"step": 65
},
{
"epoch": 0.7822222222222223,
"grad_norm": 27.546216408337372,
"learning_rate": 4.862759684345269e-07,
"logits/chosen": -1.203002691268921,
"logits/rejected": -1.0988627672195435,
"logps/chosen": -29.396411895751953,
"logps/rejected": -35.40150833129883,
"loss": 0.3922,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.12675023078918457,
"rewards/margins": 2.0646703243255615,
"rewards/rejected": -2.191420316696167,
"step": 66
},
{
"epoch": 0.794074074074074,
"grad_norm": 23.894441975814534,
"learning_rate": 4.854133830662955e-07,
"logits/chosen": -0.9780765771865845,
"logits/rejected": -0.8497614860534668,
"logps/chosen": -28.06260871887207,
"logps/rejected": -34.55665588378906,
"loss": 0.3334,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6227935552597046,
"rewards/margins": 2.2487592697143555,
"rewards/rejected": -2.8715527057647705,
"step": 67
},
{
"epoch": 0.8059259259259259,
"grad_norm": 30.617173652616593,
"learning_rate": 4.845253227626536e-07,
"logits/chosen": -1.0398799180984497,
"logits/rejected": -0.907300591468811,
"logps/chosen": -41.52682876586914,
"logps/rejected": -43.311920166015625,
"loss": 0.4022,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7630512714385986,
"rewards/margins": 1.217781662940979,
"rewards/rejected": -1.9808329343795776,
"step": 68
},
{
"epoch": 0.8177777777777778,
"grad_norm": 24.025263203043526,
"learning_rate": 4.836118836238252e-07,
"logits/chosen": -1.1331119537353516,
"logits/rejected": -1.0378354787826538,
"logps/chosen": -27.220407485961914,
"logps/rejected": -41.87384796142578,
"loss": 0.3431,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.057599157094955444,
"rewards/margins": 1.6851834058761597,
"rewards/rejected": -1.7427825927734375,
"step": 69
},
{
"epoch": 0.8296296296296296,
"grad_norm": 23.34599437673964,
"learning_rate": 4.826731644963704e-07,
"logits/chosen": -1.0917811393737793,
"logits/rejected": -1.0149914026260376,
"logps/chosen": -25.583330154418945,
"logps/rejected": -33.85319900512695,
"loss": 0.3162,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.5846102237701416,
"rewards/margins": 1.9573626518249512,
"rewards/rejected": -2.5419728755950928,
"step": 70
},
{
"epoch": 0.8414814814814815,
"grad_norm": 24.42006807604626,
"learning_rate": 4.817092669624882e-07,
"logits/chosen": -1.0650672912597656,
"logits/rejected": -0.9445031881332397,
"logps/chosen": -22.825862884521484,
"logps/rejected": -33.60643768310547,
"loss": 0.3745,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.11327299475669861,
"rewards/margins": 2.1697635650634766,
"rewards/rejected": -2.283036708831787,
"step": 71
},
{
"epoch": 0.8533333333333334,
"grad_norm": 24.54245031605526,
"learning_rate": 4.807202953290243e-07,
"logits/chosen": -1.1544904708862305,
"logits/rejected": -0.9994347095489502,
"logps/chosen": -23.641387939453125,
"logps/rejected": -38.42119216918945,
"loss": 0.3599,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.23269107937812805,
"rewards/margins": 2.1029093265533447,
"rewards/rejected": -2.3356003761291504,
"step": 72
},
{
"epoch": 0.8651851851851852,
"grad_norm": 25.210130682755583,
"learning_rate": 4.797063566161834e-07,
"logits/chosen": -0.9285881519317627,
"logits/rejected": -0.8881164789199829,
"logps/chosen": -31.189298629760742,
"logps/rejected": -35.99159622192383,
"loss": 0.3768,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.41402971744537354,
"rewards/margins": 1.2696895599365234,
"rewards/rejected": -1.6837193965911865,
"step": 73
},
{
"epoch": 0.8770370370370371,
"grad_norm": 22.99038510220094,
"learning_rate": 4.786675605459487e-07,
"logits/chosen": -1.1656837463378906,
"logits/rejected": -1.1220611333847046,
"logps/chosen": -28.37079620361328,
"logps/rejected": -45.16815185546875,
"loss": 0.3318,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.2702009975910187,
"rewards/margins": 2.232954978942871,
"rewards/rejected": -2.5031557083129883,
"step": 74
},
{
"epoch": 0.8888888888888888,
"grad_norm": 24.11796136324434,
"learning_rate": 4.776040195302079e-07,
"logits/chosen": -1.112859845161438,
"logits/rejected": -0.9862438440322876,
"logps/chosen": -22.272464752197266,
"logps/rejected": -35.39492416381836,
"loss": 0.3439,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.34517136216163635,
"rewards/margins": 2.139002561569214,
"rewards/rejected": -2.4841737747192383,
"step": 75
},
{
"epoch": 0.9007407407407407,
"grad_norm": 29.8497129464844,
"learning_rate": 4.76515848658589e-07,
"logits/chosen": -1.182924747467041,
"logits/rejected": -1.0297247171401978,
"logps/chosen": -30.078699111938477,
"logps/rejected": -39.582275390625,
"loss": 0.3452,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.5807650089263916,
"rewards/margins": 2.0797762870788574,
"rewards/rejected": -2.660541534423828,
"step": 76
},
{
"epoch": 0.9125925925925926,
"grad_norm": 25.533689636810493,
"learning_rate": 4.754031656860059e-07,
"logits/chosen": -1.0601996183395386,
"logits/rejected": -0.968002200126648,
"logps/chosen": -25.98404312133789,
"logps/rejected": -29.14290428161621,
"loss": 0.3515,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.17048078775405884,
"rewards/margins": 1.8824352025985718,
"rewards/rejected": -2.0529160499572754,
"step": 77
},
{
"epoch": 0.9244444444444444,
"grad_norm": 21.394058422904486,
"learning_rate": 4.74266091019916e-07,
"logits/chosen": -1.1088751554489136,
"logits/rejected": -0.9137270450592041,
"logps/chosen": -28.85074806213379,
"logps/rejected": -34.893470764160156,
"loss": 0.2988,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.05692651867866516,
"rewards/margins": 1.6240626573562622,
"rewards/rejected": -1.6809892654418945,
"step": 78
},
{
"epoch": 0.9362962962962963,
"grad_norm": 25.697276730733257,
"learning_rate": 4.7310474770728996e-07,
"logits/chosen": -1.2263762950897217,
"logits/rejected": -1.1397736072540283,
"logps/chosen": -28.09562873840332,
"logps/rejected": -35.75029754638672,
"loss": 0.3664,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.18649393320083618,
"rewards/margins": 1.1695051193237305,
"rewards/rejected": -1.3559989929199219,
"step": 79
},
{
"epoch": 0.9481481481481482,
"grad_norm": 21.662832078683152,
"learning_rate": 4.719192614212969e-07,
"logits/chosen": -0.9513252377510071,
"logits/rejected": -0.9007601141929626,
"logps/chosen": -34.18433380126953,
"logps/rejected": -53.043609619140625,
"loss": 0.2814,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.33905377984046936,
"rewards/margins": 2.0920355319976807,
"rewards/rejected": -2.431089401245117,
"step": 80
},
{
"epoch": 0.96,
"grad_norm": 24.69839835625674,
"learning_rate": 4.707097604477045e-07,
"logits/chosen": -1.1311062574386597,
"logits/rejected": -0.9999745488166809,
"logps/chosen": -32.54650115966797,
"logps/rejected": -34.888450622558594,
"loss": 0.3278,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.328436940908432,
"rewards/margins": 1.7844316959381104,
"rewards/rejected": -2.112868547439575,
"step": 81
},
{
"epoch": 0.9718518518518519,
"grad_norm": 21.411242391551657,
"learning_rate": 4.694763756709967e-07,
"logits/chosen": -1.1982715129852295,
"logits/rejected": -1.1674623489379883,
"logps/chosen": -28.029937744140625,
"logps/rejected": -37.19408416748047,
"loss": 0.2882,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3920401930809021,
"rewards/margins": 1.973564624786377,
"rewards/rejected": -2.365604877471924,
"step": 82
},
{
"epoch": 0.9837037037037037,
"grad_norm": 21.7744311573738,
"learning_rate": 4.6821924056021053e-07,
"logits/chosen": -1.0800765752792358,
"logits/rejected": -0.9170486330986023,
"logps/chosen": -22.360857009887695,
"logps/rejected": -41.66752624511719,
"loss": 0.3088,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.21913698315620422,
"rewards/margins": 2.08003830909729,
"rewards/rejected": -2.299175262451172,
"step": 83
},
{
"epoch": 0.9955555555555555,
"grad_norm": 24.355082987137063,
"learning_rate": 4.669384911544926e-07,
"logits/chosen": -1.06318199634552,
"logits/rejected": -1.0848791599273682,
"logps/chosen": -24.275285720825195,
"logps/rejected": -37.596893310546875,
"loss": 0.3674,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2712962031364441,
"rewards/margins": 1.7089827060699463,
"rewards/rejected": -1.9802789688110352,
"step": 84
},
{
"epoch": 1.0074074074074073,
"grad_norm": 22.616093539594576,
"learning_rate": 4.6563426604837817e-07,
"logits/chosen": -1.2081141471862793,
"logits/rejected": -0.9877020716667175,
"logps/chosen": -34.070823669433594,
"logps/rejected": -40.52888107299805,
"loss": 0.2829,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.10701459646224976,
"rewards/margins": 3.043393611907959,
"rewards/rejected": -3.1504077911376953,
"step": 85
},
{
"epoch": 1.0192592592592593,
"grad_norm": 16.700104066458838,
"learning_rate": 4.6430670637679294e-07,
"logits/chosen": -1.0600411891937256,
"logits/rejected": -0.8425652384757996,
"logps/chosen": -22.52095603942871,
"logps/rejected": -33.55463409423828,
"loss": 0.2269,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14321041107177734,
"rewards/margins": 2.309981346130371,
"rewards/rejected": -2.1667709350585938,
"step": 86
},
{
"epoch": 1.031111111111111,
"grad_norm": 15.240586856186553,
"learning_rate": 4.629559557997804e-07,
"logits/chosen": -1.3102786540985107,
"logits/rejected": -1.143240213394165,
"logps/chosen": -31.257415771484375,
"logps/rejected": -47.26383590698242,
"loss": 0.1831,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.7230758666992188,
"rewards/margins": 3.270418882369995,
"rewards/rejected": -3.993495225906372,
"step": 87
},
{
"epoch": 1.0429629629629629,
"grad_norm": 14.157542057104557,
"learning_rate": 4.615821604869563e-07,
"logits/chosen": -1.094043254852295,
"logits/rejected": -0.8985757827758789,
"logps/chosen": -28.409828186035156,
"logps/rejected": -47.5828971862793,
"loss": 0.1842,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.18681968748569489,
"rewards/margins": 3.8075270652770996,
"rewards/rejected": -3.9943466186523438,
"step": 88
},
{
"epoch": 1.0548148148148149,
"grad_norm": 17.38420675108177,
"learning_rate": 4.6018546910169067e-07,
"logits/chosen": -1.0334746837615967,
"logits/rejected": -0.9715449810028076,
"logps/chosen": -25.995702743530273,
"logps/rejected": -38.42037582397461,
"loss": 0.2053,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.36298614740371704,
"rewards/margins": 2.626688241958618,
"rewards/rejected": -2.9896743297576904,
"step": 89
},
{
"epoch": 1.0666666666666667,
"grad_norm": 16.906629376553013,
"learning_rate": 4.5876603278502027e-07,
"logits/chosen": -1.0619425773620605,
"logits/rejected": -0.9389445781707764,
"logps/chosen": -28.09102439880371,
"logps/rejected": -51.08159255981445,
"loss": 0.2098,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.0606449693441391,
"rewards/margins": 3.6463186740875244,
"rewards/rejected": -3.5856735706329346,
"step": 90
},
{
"epoch": 1.0785185185185184,
"grad_norm": 18.96732689014115,
"learning_rate": 4.573240051392935e-07,
"logits/chosen": -0.9454656839370728,
"logits/rejected": -0.9307714700698853,
"logps/chosen": -26.379640579223633,
"logps/rejected": -37.363258361816406,
"loss": 0.238,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.14031583070755005,
"rewards/margins": 2.1791586875915527,
"rewards/rejected": -2.319474458694458,
"step": 91
},
{
"epoch": 1.0903703703703704,
"grad_norm": 16.671437504434632,
"learning_rate": 4.5585954221154853e-07,
"logits/chosen": -1.3018877506256104,
"logits/rejected": -1.1478052139282227,
"logps/chosen": -25.605445861816406,
"logps/rejected": -44.80401611328125,
"loss": 0.2076,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.03381985425949097,
"rewards/margins": 3.086803436279297,
"rewards/rejected": -3.1206235885620117,
"step": 92
},
{
"epoch": 1.1022222222222222,
"grad_norm": 16.654640941302485,
"learning_rate": 4.5437280247662646e-07,
"logits/chosen": -1.0023672580718994,
"logits/rejected": -0.9070078134536743,
"logps/chosen": -29.185150146484375,
"logps/rejected": -37.990234375,
"loss": 0.1961,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.18216750025749207,
"rewards/margins": 2.39959716796875,
"rewards/rejected": -2.5817646980285645,
"step": 93
},
{
"epoch": 1.114074074074074,
"grad_norm": 16.12699044310946,
"learning_rate": 4.528639468200226e-07,
"logits/chosen": -1.1345858573913574,
"logits/rejected": -1.107000470161438,
"logps/chosen": -28.13390350341797,
"logps/rejected": -36.65238571166992,
"loss": 0.204,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.10749045014381409,
"rewards/margins": 2.2392215728759766,
"rewards/rejected": -2.1317310333251953,
"step": 94
},
{
"epoch": 1.125925925925926,
"grad_norm": 14.378767798932659,
"learning_rate": 4.5133313852047613e-07,
"logits/chosen": -1.058295726776123,
"logits/rejected": -1.0083810091018677,
"logps/chosen": -27.640762329101562,
"logps/rejected": -42.5653076171875,
"loss": 0.1812,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1484062671661377,
"rewards/margins": 2.2676548957824707,
"rewards/rejected": -2.119248390197754,
"step": 95
},
{
"epoch": 1.1377777777777778,
"grad_norm": 20.808144652094654,
"learning_rate": 4.4978054323230144e-07,
"logits/chosen": -1.0242708921432495,
"logits/rejected": -0.9334837198257446,
"logps/chosen": -24.16075897216797,
"logps/rejected": -34.90480041503906,
"loss": 0.241,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.17029838263988495,
"rewards/margins": 2.3052542209625244,
"rewards/rejected": -2.134955406188965,
"step": 96
},
{
"epoch": 1.1496296296296296,
"grad_norm": 14.579273235897853,
"learning_rate": 4.482063289674618e-07,
"logits/chosen": -1.0504794120788574,
"logits/rejected": -0.9864072799682617,
"logps/chosen": -25.85841178894043,
"logps/rejected": -44.5855598449707,
"loss": 0.1552,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2036604881286621,
"rewards/margins": 3.2001941204071045,
"rewards/rejected": -2.9965333938598633,
"step": 97
},
{
"epoch": 1.1614814814814816,
"grad_norm": 14.479069724776132,
"learning_rate": 4.466106660773884e-07,
"logits/chosen": -1.2236568927764893,
"logits/rejected": -1.0246343612670898,
"logps/chosen": -30.013458251953125,
"logps/rejected": -40.343631744384766,
"loss": 0.176,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.13774560391902924,
"rewards/margins": 2.9517884254455566,
"rewards/rejected": -3.089534044265747,
"step": 98
},
{
"epoch": 1.1733333333333333,
"grad_norm": 16.052170855559773,
"learning_rate": 4.44993727234546e-07,
"logits/chosen": -1.102075457572937,
"logits/rejected": -0.9819889664649963,
"logps/chosen": -30.00847816467285,
"logps/rejected": -35.746273040771484,
"loss": 0.1827,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.23734648525714874,
"rewards/margins": 2.4544928073883057,
"rewards/rejected": -2.6918392181396484,
"step": 99
},
{
"epoch": 1.1851851851851851,
"grad_norm": 14.71406650743676,
"learning_rate": 4.4335568741374695e-07,
"logits/chosen": -1.3955886363983154,
"logits/rejected": -1.1072180271148682,
"logps/chosen": -29.151214599609375,
"logps/rejected": -35.26973342895508,
"loss": 0.1753,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.24000686407089233,
"rewards/margins": 2.8170034885406494,
"rewards/rejected": -2.576996326446533,
"step": 100
},
{
"epoch": 1.1970370370370371,
"grad_norm": 15.185117866368294,
"learning_rate": 4.4169672387321735e-07,
"logits/chosen": -0.9774000644683838,
"logits/rejected": -0.8965713977813721,
"logps/chosen": -28.971498489379883,
"logps/rejected": -42.8656120300293,
"loss": 0.1719,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.21834176778793335,
"rewards/margins": 3.638746738433838,
"rewards/rejected": -3.4204049110412598,
"step": 101
},
{
"epoch": 1.208888888888889,
"grad_norm": 19.818913364910017,
"learning_rate": 4.4001701613541454e-07,
"logits/chosen": -0.9378620982170105,
"logits/rejected": -0.8033993244171143,
"logps/chosen": -25.265066146850586,
"logps/rejected": -34.76940155029297,
"loss": 0.2147,
"rewards/accuracies": 0.8125,
"rewards/chosen": 0.2527243196964264,
"rewards/margins": 2.660951852798462,
"rewards/rejected": -2.4082274436950684,
"step": 102
},
{
"epoch": 1.2207407407407407,
"grad_norm": 17.017386662283865,
"learning_rate": 4.383167459676008e-07,
"logits/chosen": -1.101958155632019,
"logits/rejected": -1.0334186553955078,
"logps/chosen": -27.581031799316406,
"logps/rejected": -41.83063507080078,
"loss": 0.2141,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.033310309052467346,
"rewards/margins": 2.295804262161255,
"rewards/rejected": -2.2624940872192383,
"step": 103
},
{
"epoch": 1.2325925925925927,
"grad_norm": 15.01263977310487,
"learning_rate": 4.365960973621734e-07,
"logits/chosen": -1.261305570602417,
"logits/rejected": -1.1650094985961914,
"logps/chosen": -21.846336364746094,
"logps/rejected": -38.35143280029297,
"loss": 0.1664,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08976972103118896,
"rewards/margins": 2.9284555912017822,
"rewards/rejected": -2.838685989379883,
"step": 104
},
{
"epoch": 1.2444444444444445,
"grad_norm": 15.499811043472015,
"learning_rate": 4.348552565167542e-07,
"logits/chosen": -0.9682034850120544,
"logits/rejected": -0.8779630064964294,
"logps/chosen": -26.32052993774414,
"logps/rejected": -33.074302673339844,
"loss": 0.1766,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.027928471565246582,
"rewards/margins": 2.689946413040161,
"rewards/rejected": -2.717874765396118,
"step": 105
},
{
"epoch": 1.2562962962962962,
"grad_norm": 16.751326465749557,
"learning_rate": 4.330944118140406e-07,
"logits/chosen": -0.9463189840316772,
"logits/rejected": -0.8563187718391418,
"logps/chosen": -29.297607421875,
"logps/rejected": -38.705177307128906,
"loss": 0.1839,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.17362913489341736,
"rewards/margins": 2.7915725708007812,
"rewards/rejected": -2.617943286895752,
"step": 106
},
{
"epoch": 1.268148148148148,
"grad_norm": 13.213493074609195,
"learning_rate": 4.313137538014198e-07,
"logits/chosen": -1.0986582040786743,
"logits/rejected": -0.9737260937690735,
"logps/chosen": -25.97295570373535,
"logps/rejected": -27.29983901977539,
"loss": 0.1545,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4464994966983795,
"rewards/margins": 2.5145790576934814,
"rewards/rejected": -2.0680792331695557,
"step": 107
},
{
"epoch": 1.28,
"grad_norm": 21.550277344518772,
"learning_rate": 4.295134751703492e-07,
"logits/chosen": -0.9147591590881348,
"logits/rejected": -0.8136166334152222,
"logps/chosen": -39.372562408447266,
"logps/rejected": -40.19895935058594,
"loss": 0.2066,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.10953384637832642,
"rewards/margins": 3.0302987098693848,
"rewards/rejected": -3.1398324966430664,
"step": 108
},
{
"epoch": 1.2918518518518518,
"grad_norm": 15.95008980481358,
"learning_rate": 4.276937707355044e-07,
"logits/chosen": -1.119678020477295,
"logits/rejected": -0.9529648423194885,
"logps/chosen": -29.550357818603516,
"logps/rejected": -40.979732513427734,
"loss": 0.1793,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.23076438903808594,
"rewards/margins": 3.9992775917053223,
"rewards/rejected": -3.7685132026672363,
"step": 109
},
{
"epoch": 1.3037037037037038,
"grad_norm": 14.896618310434517,
"learning_rate": 4.2585483741369755e-07,
"logits/chosen": -1.1377118825912476,
"logits/rejected": -1.0649988651275635,
"logps/chosen": -20.728757858276367,
"logps/rejected": -42.846527099609375,
"loss": 0.1515,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1395069807767868,
"rewards/margins": 3.1843342781066895,
"rewards/rejected": -3.3238413333892822,
"step": 110
},
{
"epoch": 1.3155555555555556,
"grad_norm": 15.287898186475319,
"learning_rate": 4.239968742025684e-07,
"logits/chosen": -0.9551693797111511,
"logits/rejected": -0.8516461253166199,
"logps/chosen": -22.917587280273438,
"logps/rejected": -43.595619201660156,
"loss": 0.184,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.1407267451286316,
"rewards/margins": 3.2762203216552734,
"rewards/rejected": -3.4169468879699707,
"step": 111
},
{
"epoch": 1.3274074074074074,
"grad_norm": 13.13930765742771,
"learning_rate": 4.2212008215905e-07,
"logits/chosen": -1.309780240058899,
"logits/rejected": -1.1697163581848145,
"logps/chosen": -23.579864501953125,
"logps/rejected": -39.38568115234375,
"loss": 0.1529,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.12219972908496857,
"rewards/margins": 2.721135139465332,
"rewards/rejected": -2.843334913253784,
"step": 112
},
{
"epoch": 1.3392592592592591,
"grad_norm": 16.93467958306283,
"learning_rate": 4.2022466437761154e-07,
"logits/chosen": -1.0195517539978027,
"logits/rejected": -0.9710554480552673,
"logps/chosen": -27.96396255493164,
"logps/rejected": -39.36810302734375,
"loss": 0.1946,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.13629719614982605,
"rewards/margins": 1.8954023122787476,
"rewards/rejected": -2.0316996574401855,
"step": 113
},
{
"epoch": 1.3511111111111112,
"grad_norm": 16.185982425906115,
"learning_rate": 4.18310825968281e-07,
"logits/chosen": -1.085777997970581,
"logits/rejected": -1.0098400115966797,
"logps/chosen": -31.38774871826172,
"logps/rejected": -44.18259811401367,
"loss": 0.1856,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.33986663818359375,
"rewards/margins": 3.2784461975097656,
"rewards/rejected": -3.618312358856201,
"step": 114
},
{
"epoch": 1.362962962962963,
"grad_norm": 14.370437677602862,
"learning_rate": 4.1637877403444923e-07,
"logits/chosen": -1.1370917558670044,
"logits/rejected": -1.076406478881836,
"logps/chosen": -21.368831634521484,
"logps/rejected": -37.987247467041016,
"loss": 0.1862,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.27133771777153015,
"rewards/margins": 3.750422716140747,
"rewards/rejected": -3.4790849685668945,
"step": 115
},
{
"epoch": 1.374814814814815,
"grad_norm": 14.315285669788084,
"learning_rate": 4.144287176504582e-07,
"logits/chosen": -1.0781633853912354,
"logits/rejected": -0.9295682907104492,
"logps/chosen": -27.247238159179688,
"logps/rejected": -39.297607421875,
"loss": 0.1807,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07360666990280151,
"rewards/margins": 2.9343483448028564,
"rewards/rejected": -2.860741376876831,
"step": 116
},
{
"epoch": 1.3866666666666667,
"grad_norm": 13.224703617010858,
"learning_rate": 4.1246086783897713e-07,
"logits/chosen": -1.143677830696106,
"logits/rejected": -1.035298228263855,
"logps/chosen": -21.692089080810547,
"logps/rejected": -39.77001953125,
"loss": 0.1324,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11114715039730072,
"rewards/margins": 4.035545825958252,
"rewards/rejected": -3.924398422241211,
"step": 117
},
{
"epoch": 1.3985185185185185,
"grad_norm": 13.386330467851073,
"learning_rate": 4.104754375481664e-07,
"logits/chosen": -1.1449244022369385,
"logits/rejected": -1.0441653728485107,
"logps/chosen": -24.610374450683594,
"logps/rejected": -36.322635650634766,
"loss": 0.148,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.24931076169013977,
"rewards/margins": 2.860081195831299,
"rewards/rejected": -3.1093921661376953,
"step": 118
},
{
"epoch": 1.4103703703703703,
"grad_norm": 17.903128810468665,
"learning_rate": 4.084726416286337e-07,
"logits/chosen": -1.1355631351470947,
"logits/rejected": -1.0569454431533813,
"logps/chosen": -22.172731399536133,
"logps/rejected": -38.71437072753906,
"loss": 0.1681,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.004118114709854126,
"rewards/margins": 3.1719160079956055,
"rewards/rejected": -3.1760339736938477,
"step": 119
},
{
"epoch": 1.4222222222222223,
"grad_norm": 14.325608299731273,
"learning_rate": 4.0645269681018434e-07,
"logits/chosen": -1.2059547901153564,
"logits/rejected": -1.132045030593872,
"logps/chosen": -24.006052017211914,
"logps/rejected": -37.643314361572266,
"loss": 0.1583,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3129858076572418,
"rewards/margins": 2.9086873531341553,
"rewards/rejected": -3.221672773361206,
"step": 120
},
{
"epoch": 1.434074074074074,
"grad_norm": 13.002484277938684,
"learning_rate": 4.044158216783684e-07,
"logits/chosen": -1.369994044303894,
"logits/rejected": -1.179801344871521,
"logps/chosen": -28.838666915893555,
"logps/rejected": -49.269287109375,
"loss": 0.1372,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.21779999136924744,
"rewards/margins": 4.504581928253174,
"rewards/rejected": -4.722381591796875,
"step": 121
},
{
"epoch": 1.445925925925926,
"grad_norm": 16.113792921785464,
"learning_rate": 4.0236223665082605e-07,
"logits/chosen": -1.1226955652236938,
"logits/rejected": -1.0712882280349731,
"logps/chosen": -21.75322151184082,
"logps/rejected": -35.07586669921875,
"loss": 0.1625,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1918860822916031,
"rewards/margins": 3.3565304279327393,
"rewards/rejected": -3.164644479751587,
"step": 122
},
{
"epoch": 1.4577777777777778,
"grad_norm": 11.423804755471494,
"learning_rate": 4.0029216395343617e-07,
"logits/chosen": -1.0564236640930176,
"logits/rejected": -0.9565566778182983,
"logps/chosen": -27.292240142822266,
"logps/rejected": -41.23828887939453,
"loss": 0.1276,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.42787694931030273,
"rewards/margins": 3.2124743461608887,
"rewards/rejected": -3.6403515338897705,
"step": 123
},
{
"epoch": 1.4696296296296296,
"grad_norm": 11.96487396864106,
"learning_rate": 3.982058275962682e-07,
"logits/chosen": -1.2627426385879517,
"logits/rejected": -1.163001298904419,
"logps/chosen": -20.64603614807129,
"logps/rejected": -39.54261016845703,
"loss": 0.1485,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.4101359248161316,
"rewards/margins": 2.903512716293335,
"rewards/rejected": -2.4933767318725586,
"step": 124
},
{
"epoch": 1.4814814814814814,
"grad_norm": 13.800579072803204,
"learning_rate": 3.9610345334934094e-07,
"logits/chosen": -1.2117929458618164,
"logits/rejected": -0.9392006993293762,
"logps/chosen": -28.66204071044922,
"logps/rejected": -40.63731002807617,
"loss": 0.1596,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1593039333820343,
"rewards/margins": 3.4954304695129395,
"rewards/rejected": -3.3361263275146484,
"step": 125
},
{
"epoch": 1.4933333333333334,
"grad_norm": 12.680404338446278,
"learning_rate": 3.939852687181915e-07,
"logits/chosen": -1.1634321212768555,
"logits/rejected": -1.0764764547348022,
"logps/chosen": -24.423765182495117,
"logps/rejected": -45.39548873901367,
"loss": 0.1324,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17568892240524292,
"rewards/margins": 4.0248494148254395,
"rewards/rejected": -3.8491601943969727,
"step": 126
},
{
"epoch": 1.5051851851851852,
"grad_norm": 13.14161578490378,
"learning_rate": 3.9185150291925585e-07,
"logits/chosen": -1.0429072380065918,
"logits/rejected": -1.0684268474578857,
"logps/chosen": -26.456886291503906,
"logps/rejected": -39.13412094116211,
"loss": 0.1397,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.43806853890419006,
"rewards/margins": 3.234588146209717,
"rewards/rejected": -3.672656536102295,
"step": 127
},
{
"epoch": 1.5170370370370372,
"grad_norm": 14.252517134892512,
"learning_rate": 3.8970238685506486e-07,
"logits/chosen": -1.0745394229888916,
"logits/rejected": -1.0680888891220093,
"logps/chosen": -26.106287002563477,
"logps/rejected": -45.78963088989258,
"loss": 0.1535,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.10547050833702087,
"rewards/margins": 3.6777379512786865,
"rewards/rejected": -3.5722672939300537,
"step": 128
},
{
"epoch": 1.528888888888889,
"grad_norm": 13.410270453749325,
"learning_rate": 3.8753815308925685e-07,
"logits/chosen": -1.3084537982940674,
"logits/rejected": -1.1879018545150757,
"logps/chosen": -22.162595748901367,
"logps/rejected": -42.90380096435547,
"loss": 0.1354,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.29345619678497314,
"rewards/margins": 3.8301730155944824,
"rewards/rejected": -4.123629570007324,
"step": 129
},
{
"epoch": 1.5407407407407407,
"grad_norm": 16.65901363698597,
"learning_rate": 3.8535903582141184e-07,
"logits/chosen": -1.1705418825149536,
"logits/rejected": -1.053526520729065,
"logps/chosen": -22.083023071289062,
"logps/rejected": -43.40499496459961,
"loss": 0.1819,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.09943583607673645,
"rewards/margins": 3.597656011581421,
"rewards/rejected": -3.498220443725586,
"step": 130
},
{
"epoch": 1.5525925925925925,
"grad_norm": 15.81048973784746,
"learning_rate": 3.8316527086170727e-07,
"logits/chosen": -1.1002339124679565,
"logits/rejected": -0.9635283946990967,
"logps/chosen": -22.6536865234375,
"logps/rejected": -35.75001907348633,
"loss": 0.1862,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.08621586859226227,
"rewards/margins": 3.319308280944824,
"rewards/rejected": -3.2330923080444336,
"step": 131
},
{
"epoch": 1.5644444444444443,
"grad_norm": 13.934303626010081,
"learning_rate": 3.809570956054003e-07,
"logits/chosen": -1.2058043479919434,
"logits/rejected": -1.1326546669006348,
"logps/chosen": -20.698150634765625,
"logps/rejected": -43.496559143066406,
"loss": 0.1502,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.10407552123069763,
"rewards/margins": 4.241490364074707,
"rewards/rejected": -4.3455657958984375,
"step": 132
},
{
"epoch": 1.5762962962962963,
"grad_norm": 13.808397445470401,
"learning_rate": 3.787347490071389e-07,
"logits/chosen": -1.2017699480056763,
"logits/rejected": -1.1394281387329102,
"logps/chosen": -29.24155044555664,
"logps/rejected": -45.46855163574219,
"loss": 0.1565,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.3030049204826355,
"rewards/margins": 3.9124467372894287,
"rewards/rejected": -4.215451240539551,
"step": 133
},
{
"epoch": 1.5881481481481483,
"grad_norm": 13.152290267087837,
"learning_rate": 3.764984715551031e-07,
"logits/chosen": -1.1422480344772339,
"logits/rejected": -1.053503155708313,
"logps/chosen": -20.119190216064453,
"logps/rejected": -41.04280090332031,
"loss": 0.1632,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.04641704261302948,
"rewards/margins": 3.333278179168701,
"rewards/rejected": -3.379695415496826,
"step": 134
},
{
"epoch": 1.6,
"grad_norm": 12.384641280044091,
"learning_rate": 3.7424850524498113e-07,
"logits/chosen": -1.1235531568527222,
"logits/rejected": -1.016575574874878,
"logps/chosen": -23.927431106567383,
"logps/rejected": -38.624183654785156,
"loss": 0.1505,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.145728200674057,
"rewards/margins": 3.4623892307281494,
"rewards/rejected": -3.6081173419952393,
"step": 135
},
{
"epoch": 1.6118518518518519,
"grad_norm": 13.297788267005293,
"learning_rate": 3.7198509355378207e-07,
"logits/chosen": -1.1904593706130981,
"logits/rejected": -1.0650973320007324,
"logps/chosen": -30.460954666137695,
"logps/rejected": -35.29721450805664,
"loss": 0.1623,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.5998407602310181,
"rewards/margins": 2.190915822982788,
"rewards/rejected": -2.7907564640045166,
"step": 136
},
{
"epoch": 1.6237037037037036,
"grad_norm": 17.654879145447634,
"learning_rate": 3.6970848141348855e-07,
"logits/chosen": -1.2997840642929077,
"logits/rejected": -1.1812993288040161,
"logps/chosen": -29.659500122070312,
"logps/rejected": -39.244354248046875,
"loss": 0.1878,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.24524670839309692,
"rewards/margins": 3.048208713531494,
"rewards/rejected": -3.2934556007385254,
"step": 137
},
{
"epoch": 1.6355555555555554,
"grad_norm": 9.713259026639975,
"learning_rate": 3.6741891518455146e-07,
"logits/chosen": -1.0600968599319458,
"logits/rejected": -0.9694119691848755,
"logps/chosen": -26.941146850585938,
"logps/rejected": -45.241539001464844,
"loss": 0.099,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2543194591999054,
"rewards/margins": 3.474762201309204,
"rewards/rejected": -3.729081392288208,
"step": 138
},
{
"epoch": 1.6474074074074074,
"grad_norm": 11.146298314879976,
"learning_rate": 3.6511664262923094e-07,
"logits/chosen": -1.1857203245162964,
"logits/rejected": -1.1235812902450562,
"logps/chosen": -20.542293548583984,
"logps/rejected": -38.22064971923828,
"loss": 0.1272,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.22452278435230255,
"rewards/margins": 3.8128674030303955,
"rewards/rejected": -4.037390232086182,
"step": 139
},
{
"epoch": 1.6592592592592592,
"grad_norm": 11.77226347660767,
"learning_rate": 3.6280191288478435e-07,
"logits/chosen": -1.2729771137237549,
"logits/rejected": -1.1265182495117188,
"logps/chosen": -26.0278377532959,
"logps/rejected": -44.57939147949219,
"loss": 0.1158,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.20267322659492493,
"rewards/margins": 3.678438901901245,
"rewards/rejected": -3.8811120986938477,
"step": 140
},
{
"epoch": 1.6711111111111112,
"grad_norm": 12.442016266819769,
"learning_rate": 3.604749764365069e-07,
"logits/chosen": -1.1912599802017212,
"logits/rejected": -1.084775686264038,
"logps/chosen": -20.05962371826172,
"logps/rejected": -39.900665283203125,
"loss": 0.1196,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.07022520899772644,
"rewards/margins": 3.820122718811035,
"rewards/rejected": -3.890347957611084,
"step": 141
},
{
"epoch": 1.682962962962963,
"grad_norm": 14.443169294013128,
"learning_rate": 3.5813608509062526e-07,
"logits/chosen": -0.998296856880188,
"logits/rejected": -1.11066472530365,
"logps/chosen": -26.359149932861328,
"logps/rejected": -48.0468635559082,
"loss": 0.1386,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2483871877193451,
"rewards/margins": 3.9444689750671387,
"rewards/rejected": -4.192856311798096,
"step": 142
},
{
"epoch": 1.6948148148148148,
"grad_norm": 12.88438627763912,
"learning_rate": 3.557854919470491e-07,
"logits/chosen": -1.1343494653701782,
"logits/rejected": -1.1029855012893677,
"logps/chosen": -32.05289077758789,
"logps/rejected": -38.77518081665039,
"loss": 0.1465,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.08383223414421082,
"rewards/margins": 2.9178643226623535,
"rewards/rejected": -3.001697063446045,
"step": 143
},
{
"epoch": 1.7066666666666666,
"grad_norm": 12.409012501344572,
"learning_rate": 3.5342345137198206e-07,
"logits/chosen": -1.0480347871780396,
"logits/rejected": -0.9312314391136169,
"logps/chosen": -30.324771881103516,
"logps/rejected": -36.17607116699219,
"loss": 0.1341,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.2758581340312958,
"rewards/margins": 2.6668765544891357,
"rewards/rejected": -2.942734718322754,
"step": 144
},
{
"epoch": 1.7185185185185186,
"grad_norm": 14.582949797718573,
"learning_rate": 3.510502189703954e-07,
"logits/chosen": -0.97275710105896,
"logits/rejected": -0.7612693905830383,
"logps/chosen": -28.907245635986328,
"logps/rejected": -45.605037689208984,
"loss": 0.1472,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.35651320219039917,
"rewards/margins": 4.817986011505127,
"rewards/rejected": -5.17449951171875,
"step": 145
},
{
"epoch": 1.7303703703703703,
"grad_norm": 13.66922326611715,
"learning_rate": 3.486660515583691e-07,
"logits/chosen": -1.1288774013519287,
"logits/rejected": -1.1245758533477783,
"logps/chosen": -23.699264526367188,
"logps/rejected": -42.97127914428711,
"loss": 0.1285,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11036735773086548,
"rewards/margins": 4.373822212219238,
"rewards/rejected": -4.263454914093018,
"step": 146
},
{
"epoch": 1.7422222222222223,
"grad_norm": 13.037114765866198,
"learning_rate": 3.4627120713529983e-07,
"logits/chosen": -0.9598813056945801,
"logits/rejected": -0.8330179452896118,
"logps/chosen": -22.383928298950195,
"logps/rejected": -45.0758171081543,
"loss": 0.1429,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.034085407853126526,
"rewards/margins": 4.736968040466309,
"rewards/rejected": -4.771053314208984,
"step": 147
},
{
"epoch": 1.7540740740740741,
"grad_norm": 10.872543956486167,
"learning_rate": 3.438659448559825e-07,
"logits/chosen": -1.1963474750518799,
"logits/rejected": -1.0486239194869995,
"logps/chosen": -27.349458694458008,
"logps/rejected": -48.23403549194336,
"loss": 0.1038,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.17232058942317963,
"rewards/margins": 4.531591892242432,
"rewards/rejected": -4.703912734985352,
"step": 148
},
{
"epoch": 1.765925925925926,
"grad_norm": 10.7720279947233,
"learning_rate": 3.414505250025659e-07,
"logits/chosen": -0.9560255408287048,
"logits/rejected": -1.0075461864471436,
"logps/chosen": -30.97559928894043,
"logps/rejected": -42.89778518676758,
"loss": 0.1031,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.011755384504795074,
"rewards/margins": 3.706606864929199,
"rewards/rejected": -3.718362331390381,
"step": 149
},
{
"epoch": 1.7777777777777777,
"grad_norm": 14.01453220823484,
"learning_rate": 3.390252089563867e-07,
"logits/chosen": -1.167525291442871,
"logits/rejected": -1.008201241493225,
"logps/chosen": -24.03421401977539,
"logps/rejected": -37.12451171875,
"loss": 0.147,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.05710184574127197,
"rewards/margins": 3.54923939704895,
"rewards/rejected": -3.492137908935547,
"step": 150
},
{
"epoch": 1.7896296296296297,
"grad_norm": 18.40124537105695,
"learning_rate": 3.3659025916968475e-07,
"logits/chosen": -1.1562587022781372,
"logits/rejected": -1.0596400499343872,
"logps/chosen": -27.828075408935547,
"logps/rejected": -50.78956985473633,
"loss": 0.1666,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.4149998426437378,
"rewards/margins": 4.103493690490723,
"rewards/rejected": -4.51849365234375,
"step": 151
},
{
"epoch": 1.8014814814814815,
"grad_norm": 15.187471450574751,
"learning_rate": 3.3414593913720155e-07,
"logits/chosen": -1.1149495840072632,
"logits/rejected": -0.9014438986778259,
"logps/chosen": -24.957393646240234,
"logps/rejected": -38.273773193359375,
"loss": 0.1572,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.3299483358860016,
"rewards/margins": 3.5365545749664307,
"rewards/rejected": -3.206606388092041,
"step": 152
},
{
"epoch": 1.8133333333333335,
"grad_norm": 11.786430269793136,
"learning_rate": 3.3169251336766697e-07,
"logits/chosen": -1.0765142440795898,
"logits/rejected": -0.9713940620422363,
"logps/chosen": -23.6178035736084,
"logps/rejected": -36.39717102050781,
"loss": 0.1303,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.5628844499588013,
"rewards/margins": 3.1841235160827637,
"rewards/rejected": -3.7470080852508545,
"step": 153
},
{
"epoch": 1.8251851851851852,
"grad_norm": 15.707535366344572,
"learning_rate": 3.2923024735517567e-07,
"logits/chosen": -1.2396905422210693,
"logits/rejected": -1.13885498046875,
"logps/chosen": -25.60649871826172,
"logps/rejected": -41.11204147338867,
"loss": 0.1377,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.42897889018058777,
"rewards/margins": 3.3137550354003906,
"rewards/rejected": -3.742733955383301,
"step": 154
},
{
"epoch": 1.837037037037037,
"grad_norm": 16.002573607260132,
"learning_rate": 3.2675940755045713e-07,
"logits/chosen": -1.1592830419540405,
"logits/rejected": -1.069584846496582,
"logps/chosen": -34.06727600097656,
"logps/rejected": -54.026817321777344,
"loss": 0.1949,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.43196994066238403,
"rewards/margins": 4.310949802398682,
"rewards/rejected": -4.742919445037842,
"step": 155
},
{
"epoch": 1.8488888888888888,
"grad_norm": 14.394511048135854,
"learning_rate": 3.242802613320418e-07,
"logits/chosen": -1.0737497806549072,
"logits/rejected": -0.9672637581825256,
"logps/chosen": -27.148597717285156,
"logps/rejected": -41.859004974365234,
"loss": 0.1554,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2204400897026062,
"rewards/margins": 3.8589026927948,
"rewards/rejected": -4.079343318939209,
"step": 156
},
{
"epoch": 1.8607407407407406,
"grad_norm": 13.068510095436686,
"learning_rate": 3.217930769773275e-07,
"logits/chosen": -1.2130502462387085,
"logits/rejected": -1.0399776697158813,
"logps/chosen": -20.487337112426758,
"logps/rejected": -35.530582427978516,
"loss": 0.1261,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.049159154295921326,
"rewards/margins": 4.460110187530518,
"rewards/rejected": -4.410951614379883,
"step": 157
},
{
"epoch": 1.8725925925925926,
"grad_norm": 12.727841490377434,
"learning_rate": 3.1929812363354764e-07,
"logits/chosen": -1.1142170429229736,
"logits/rejected": -0.979875385761261,
"logps/chosen": -25.325483322143555,
"logps/rejected": -46.20812225341797,
"loss": 0.1047,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1650889664888382,
"rewards/margins": 4.539978504180908,
"rewards/rejected": -4.7050676345825195,
"step": 158
},
{
"epoch": 1.8844444444444446,
"grad_norm": 13.783921189406176,
"learning_rate": 3.167956712886463e-07,
"logits/chosen": -1.0069048404693604,
"logits/rejected": -0.9355603456497192,
"logps/chosen": -29.581226348876953,
"logps/rejected": -37.52265167236328,
"loss": 0.1372,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.499575138092041,
"rewards/margins": 2.6125097274780273,
"rewards/rejected": -3.1120848655700684,
"step": 159
},
{
"epoch": 1.8962962962962964,
"grad_norm": 12.862775831490238,
"learning_rate": 3.142859907420615e-07,
"logits/chosen": -1.0252788066864014,
"logits/rejected": -1.0804516077041626,
"logps/chosen": -24.711009979248047,
"logps/rejected": -42.78890609741211,
"loss": 0.1256,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3273608684539795,
"rewards/margins": 3.2098522186279297,
"rewards/rejected": -3.5372135639190674,
"step": 160
},
{
"epoch": 1.9081481481481481,
"grad_norm": 11.856116486125906,
"learning_rate": 3.117693535754213e-07,
"logits/chosen": -1.069286823272705,
"logits/rejected": -0.9155316948890686,
"logps/chosen": -23.146581649780273,
"logps/rejected": -43.31779479980469,
"loss": 0.1256,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.038980498909950256,
"rewards/margins": 4.636472702026367,
"rewards/rejected": -4.597492218017578,
"step": 161
},
{
"epoch": 1.92,
"grad_norm": 15.032149567521808,
"learning_rate": 3.092460321231547e-07,
"logits/chosen": -1.0839258432388306,
"logits/rejected": -1.006733775138855,
"logps/chosen": -24.381574630737305,
"logps/rejected": -40.473060607910156,
"loss": 0.1488,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.02435511350631714,
"rewards/margins": 4.535048007965088,
"rewards/rejected": -4.559402942657471,
"step": 162
},
{
"epoch": 1.9318518518518517,
"grad_norm": 14.8363884279284,
"learning_rate": 3.0671629944302164e-07,
"logits/chosen": -1.0501927137374878,
"logits/rejected": -0.9243767261505127,
"logps/chosen": -27.61357879638672,
"logps/rejected": -36.362586975097656,
"loss": 0.1177,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.23934195935726166,
"rewards/margins": 3.6352920532226562,
"rewards/rejected": -3.8746337890625,
"step": 163
},
{
"epoch": 1.9437037037037037,
"grad_norm": 12.238985051757798,
"learning_rate": 3.0418042928656415e-07,
"logits/chosen": -1.1459879875183105,
"logits/rejected": -0.9831377267837524,
"logps/chosen": -23.33287811279297,
"logps/rejected": -43.29710006713867,
"loss": 0.1341,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.029119372367858887,
"rewards/margins": 4.274390697479248,
"rewards/rejected": -4.3035101890563965,
"step": 164
},
{
"epoch": 1.9555555555555557,
"grad_norm": 16.045991654119778,
"learning_rate": 3.016386960694827e-07,
"logits/chosen": -1.0820094347000122,
"logits/rejected": -0.9164285063743591,
"logps/chosen": -29.36737823486328,
"logps/rejected": -45.8538818359375,
"loss": 0.1575,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.5107632875442505,
"rewards/margins": 3.8868861198425293,
"rewards/rejected": -4.39764928817749,
"step": 165
},
{
"epoch": 1.9674074074074075,
"grad_norm": 15.658417100599408,
"learning_rate": 2.990913748419411e-07,
"logits/chosen": -1.1057474613189697,
"logits/rejected": -1.0400460958480835,
"logps/chosen": -32.17692565917969,
"logps/rejected": -43.858551025390625,
"loss": 0.1491,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.028441503643989563,
"rewards/margins": 3.5364620685577393,
"rewards/rejected": -3.5080206394195557,
"step": 166
},
{
"epoch": 1.9792592592592593,
"grad_norm": 17.182247947721276,
"learning_rate": 2.9653874125880167e-07,
"logits/chosen": -1.1606206893920898,
"logits/rejected": -1.0265402793884277,
"logps/chosen": -24.273101806640625,
"logps/rejected": -43.97246551513672,
"loss": 0.1734,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.012486815452575684,
"rewards/margins": 3.4821486473083496,
"rewards/rejected": -3.4946351051330566,
"step": 167
},
{
"epoch": 1.991111111111111,
"grad_norm": 8.93976424369471,
"learning_rate": 2.9398107154979634e-07,
"logits/chosen": -1.1381988525390625,
"logits/rejected": -1.03400456905365,
"logps/chosen": -21.53853416442871,
"logps/rejected": -48.0505256652832,
"loss": 0.0872,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.05716177821159363,
"rewards/margins": 4.557176113128662,
"rewards/rejected": -4.500014305114746,
"step": 168
},
{
"epoch": 2.002962962962963,
"grad_norm": 11.949224405327886,
"learning_rate": 2.9141864248963427e-07,
"logits/chosen": -1.2692681550979614,
"logits/rejected": -1.0146331787109375,
"logps/chosen": -27.361726760864258,
"logps/rejected": -35.84319305419922,
"loss": 0.1362,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.12528757750988007,
"rewards/margins": 4.429131984710693,
"rewards/rejected": -4.303844451904297,
"step": 169
},
{
"epoch": 2.0148148148148146,
"grad_norm": 7.858640781523143,
"learning_rate": 2.8885173136805125e-07,
"logits/chosen": -1.1425201892852783,
"logits/rejected": -1.0211284160614014,
"logps/chosen": -26.627113342285156,
"logps/rejected": -51.298709869384766,
"loss": 0.0958,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.16640473902225494,
"rewards/margins": 4.273306846618652,
"rewards/rejected": -4.439712047576904,
"step": 170
},
{
"epoch": 2.026666666666667,
"grad_norm": 5.791091337239758,
"learning_rate": 2.862806159598032e-07,
"logits/chosen": -1.246085286140442,
"logits/rejected": -1.1816462278366089,
"logps/chosen": -23.06086540222168,
"logps/rejected": -39.5461540222168,
"loss": 0.0582,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.21991188824176788,
"rewards/margins": 4.500458717346191,
"rewards/rejected": -4.28054666519165,
"step": 171
},
{
"epoch": 2.0385185185185186,
"grad_norm": 8.464583168455022,
"learning_rate": 2.837055744946072e-07,
"logits/chosen": -0.9950094819068909,
"logits/rejected": -0.9867933392524719,
"logps/chosen": -20.085613250732422,
"logps/rejected": -39.374183654785156,
"loss": 0.0846,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.23747408390045166,
"rewards/margins": 4.035274505615234,
"rewards/rejected": -3.797800064086914,
"step": 172
},
{
"epoch": 2.0503703703703704,
"grad_norm": 7.246388422688696,
"learning_rate": 2.811268856270332e-07,
"logits/chosen": -1.149637222290039,
"logits/rejected": -1.1608506441116333,
"logps/chosen": -22.0140380859375,
"logps/rejected": -42.8390998840332,
"loss": 0.081,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.34959834814071655,
"rewards/margins": 4.302677154541016,
"rewards/rejected": -3.9530792236328125,
"step": 173
},
{
"epoch": 2.062222222222222,
"grad_norm": 7.340518516395049,
"learning_rate": 2.7854482840634965e-07,
"logits/chosen": -1.2548686265945435,
"logits/rejected": -1.127457618713379,
"logps/chosen": -21.352310180664062,
"logps/rejected": -43.30939483642578,
"loss": 0.0859,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04353713244199753,
"rewards/margins": 5.536983013153076,
"rewards/rejected": -5.49344539642334,
"step": 174
},
{
"epoch": 2.074074074074074,
"grad_norm": 9.753614692470563,
"learning_rate": 2.759596822463267e-07,
"logits/chosen": -1.1281955242156982,
"logits/rejected": -0.9843631386756897,
"logps/chosen": -28.948612213134766,
"logps/rejected": -37.4376335144043,
"loss": 0.0864,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.19420504570007324,
"rewards/margins": 3.627711772918701,
"rewards/rejected": -3.8219170570373535,
"step": 175
},
{
"epoch": 2.0859259259259257,
"grad_norm": 6.267240444464727,
"learning_rate": 2.73371726895e-07,
"logits/chosen": -1.1884928941726685,
"logits/rejected": -1.0611791610717773,
"logps/chosen": -29.869997024536133,
"logps/rejected": -49.20811462402344,
"loss": 0.0636,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.011849135160446167,
"rewards/margins": 4.665461540222168,
"rewards/rejected": -4.6536126136779785,
"step": 176
},
{
"epoch": 2.097777777777778,
"grad_norm": 6.022136138537939,
"learning_rate": 2.7078124240439793e-07,
"logits/chosen": -1.1008820533752441,
"logits/rejected": -0.9790475368499756,
"logps/chosen": -29.616289138793945,
"logps/rejected": -57.20648193359375,
"loss": 0.0594,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5932025909423828,
"rewards/margins": 6.053226947784424,
"rewards/rejected": -6.64642858505249,
"step": 177
},
{
"epoch": 2.1096296296296297,
"grad_norm": 6.379960194971949,
"learning_rate": 2.68188509100236e-07,
"logits/chosen": -1.0663186311721802,
"logits/rejected": -0.994686484336853,
"logps/chosen": -26.227067947387695,
"logps/rejected": -50.95429229736328,
"loss": 0.0638,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.13309167325496674,
"rewards/margins": 4.429349422454834,
"rewards/rejected": -4.562440872192383,
"step": 178
},
{
"epoch": 2.1214814814814815,
"grad_norm": 7.642435740805011,
"learning_rate": 2.6559380755158206e-07,
"logits/chosen": -1.1984007358551025,
"logits/rejected": -1.1312189102172852,
"logps/chosen": -29.640098571777344,
"logps/rejected": -48.15163040161133,
"loss": 0.0936,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.03109852969646454,
"rewards/margins": 4.580999851226807,
"rewards/rejected": -4.61209774017334,
"step": 179
},
{
"epoch": 2.1333333333333333,
"grad_norm": 7.309302464370304,
"learning_rate": 2.629974185404951e-07,
"logits/chosen": -1.232039451599121,
"logits/rejected": -1.1574738025665283,
"logps/chosen": -24.592525482177734,
"logps/rejected": -58.08824157714844,
"loss": 0.0754,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.607149600982666,
"rewards/margins": 5.233615875244141,
"rewards/rejected": -5.840765476226807,
"step": 180
},
{
"epoch": 2.145185185185185,
"grad_norm": 7.918401262658898,
"learning_rate": 2.603996230316402e-07,
"logits/chosen": -1.1730706691741943,
"logits/rejected": -1.1893783807754517,
"logps/chosen": -20.52701187133789,
"logps/rejected": -32.62423324584961,
"loss": 0.093,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2257765233516693,
"rewards/margins": 3.6342880725860596,
"rewards/rejected": -3.4085114002227783,
"step": 181
},
{
"epoch": 2.157037037037037,
"grad_norm": 8.076843746703107,
"learning_rate": 2.5780070214188474e-07,
"logits/chosen": -1.2444607019424438,
"logits/rejected": -1.1096103191375732,
"logps/chosen": -33.14277267456055,
"logps/rejected": -46.21152114868164,
"loss": 0.0751,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3877983093261719,
"rewards/margins": 3.8209316730499268,
"rewards/rejected": -4.2087297439575195,
"step": 182
},
{
"epoch": 2.168888888888889,
"grad_norm": 7.21014521039241,
"learning_rate": 2.552009371098778e-07,
"logits/chosen": -1.132177472114563,
"logits/rejected": -1.0657352209091187,
"logps/chosen": -27.557518005371094,
"logps/rejected": -44.8818473815918,
"loss": 0.0694,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.03888387978076935,
"rewards/margins": 4.166874885559082,
"rewards/rejected": -4.205758571624756,
"step": 183
},
{
"epoch": 2.180740740740741,
"grad_norm": 7.31003315950285,
"learning_rate": 2.5260060926561604e-07,
"logits/chosen": -1.1547397375106812,
"logits/rejected": -1.0553665161132812,
"logps/chosen": -22.003814697265625,
"logps/rejected": -42.98273849487305,
"loss": 0.0753,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.14262710511684418,
"rewards/margins": 4.841116905212402,
"rewards/rejected": -4.9837446212768555,
"step": 184
},
{
"epoch": 2.1925925925925926,
"grad_norm": 8.27339627937372,
"learning_rate": 2.5e-07,
"logits/chosen": -1.2605483531951904,
"logits/rejected": -1.0690468549728394,
"logps/chosen": -28.908740997314453,
"logps/rejected": -40.10096740722656,
"loss": 0.0956,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.43739017844200134,
"rewards/margins": 3.4248218536376953,
"rewards/rejected": -3.8622121810913086,
"step": 185
},
{
"epoch": 2.2044444444444444,
"grad_norm": 8.253216832927258,
"learning_rate": 2.4739939073438393e-07,
"logits/chosen": -1.3061436414718628,
"logits/rejected": -1.1886006593704224,
"logps/chosen": -33.44011688232422,
"logps/rejected": -46.8795166015625,
"loss": 0.0904,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5726553201675415,
"rewards/margins": 3.8218576908111572,
"rewards/rejected": -4.39451265335083,
"step": 186
},
{
"epoch": 2.216296296296296,
"grad_norm": 7.807015119173489,
"learning_rate": 2.4479906289012216e-07,
"logits/chosen": -1.345091462135315,
"logits/rejected": -1.0644184350967407,
"logps/chosen": -25.767536163330078,
"logps/rejected": -41.148502349853516,
"loss": 0.0849,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5413724780082703,
"rewards/margins": 4.547415256500244,
"rewards/rejected": -4.006042957305908,
"step": 187
},
{
"epoch": 2.228148148148148,
"grad_norm": 8.268473966183542,
"learning_rate": 2.421992978581152e-07,
"logits/chosen": -1.2509685754776,
"logits/rejected": -1.1202762126922607,
"logps/chosen": -26.480911254882812,
"logps/rejected": -41.798858642578125,
"loss": 0.0768,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4163511097431183,
"rewards/margins": 4.190377235412598,
"rewards/rejected": -4.606728553771973,
"step": 188
},
{
"epoch": 2.24,
"grad_norm": 6.31545694362126,
"learning_rate": 2.3960037696835987e-07,
"logits/chosen": -0.9931889772415161,
"logits/rejected": -0.9487002491950989,
"logps/chosen": -23.28666877746582,
"logps/rejected": -45.82819366455078,
"loss": 0.0528,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.22770199179649353,
"rewards/margins": 5.227255344390869,
"rewards/rejected": -5.454957008361816,
"step": 189
},
{
"epoch": 2.251851851851852,
"grad_norm": 8.198173492670941,
"learning_rate": 2.3700258145950493e-07,
"logits/chosen": -1.2542146444320679,
"logits/rejected": -1.296125888824463,
"logps/chosen": -23.325332641601562,
"logps/rejected": -42.396663665771484,
"loss": 0.074,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.37019920349121094,
"rewards/margins": 4.538805961608887,
"rewards/rejected": -4.909005165100098,
"step": 190
},
{
"epoch": 2.2637037037037038,
"grad_norm": 6.252335723496194,
"learning_rate": 2.3440619244841794e-07,
"logits/chosen": -1.0998159646987915,
"logits/rejected": -1.0990605354309082,
"logps/chosen": -24.507465362548828,
"logps/rejected": -36.9913330078125,
"loss": 0.0693,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07138313353061676,
"rewards/margins": 3.743443727493286,
"rewards/rejected": -3.67206072807312,
"step": 191
},
{
"epoch": 2.2755555555555556,
"grad_norm": 5.937599917562406,
"learning_rate": 2.3181149089976404e-07,
"logits/chosen": -1.1160556077957153,
"logits/rejected": -0.9888994693756104,
"logps/chosen": -25.562957763671875,
"logps/rejected": -44.06254959106445,
"loss": 0.0597,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.11315414309501648,
"rewards/margins": 5.40950345993042,
"rewards/rejected": -5.52265739440918,
"step": 192
},
{
"epoch": 2.2874074074074073,
"grad_norm": 8.140792637653023,
"learning_rate": 2.2921875759560207e-07,
"logits/chosen": -1.2146611213684082,
"logits/rejected": -1.1461243629455566,
"logps/chosen": -36.22383499145508,
"logps/rejected": -46.22894287109375,
"loss": 0.0893,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8142991065979004,
"rewards/margins": 4.104118347167969,
"rewards/rejected": -4.918417930603027,
"step": 193
},
{
"epoch": 2.299259259259259,
"grad_norm": 7.224664725024332,
"learning_rate": 2.2662827310499995e-07,
"logits/chosen": -1.0874426364898682,
"logits/rejected": -0.9829124212265015,
"logps/chosen": -24.988603591918945,
"logps/rejected": -42.57012939453125,
"loss": 0.0656,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10310526937246323,
"rewards/margins": 5.060862064361572,
"rewards/rejected": -4.957756996154785,
"step": 194
},
{
"epoch": 2.311111111111111,
"grad_norm": 7.027603500584767,
"learning_rate": 2.2404031775367332e-07,
"logits/chosen": -1.1362197399139404,
"logits/rejected": -1.0883052349090576,
"logps/chosen": -24.717567443847656,
"logps/rejected": -43.55390167236328,
"loss": 0.0582,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.07100862264633179,
"rewards/margins": 4.886796474456787,
"rewards/rejected": -4.815788269042969,
"step": 195
},
{
"epoch": 2.322962962962963,
"grad_norm": 6.3481105853123,
"learning_rate": 2.2145517159365043e-07,
"logits/chosen": -1.2440788745880127,
"logits/rejected": -1.0895586013793945,
"logps/chosen": -27.22349739074707,
"logps/rejected": -39.78349304199219,
"loss": 0.0609,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12459969520568848,
"rewards/margins": 4.133920192718506,
"rewards/rejected": -4.0093207359313965,
"step": 196
},
{
"epoch": 2.334814814814815,
"grad_norm": 8.448014970739372,
"learning_rate": 2.1887311437296684e-07,
"logits/chosen": -1.2059340476989746,
"logits/rejected": -1.1843221187591553,
"logps/chosen": -22.853811264038086,
"logps/rejected": -32.71154022216797,
"loss": 0.0912,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.47247427701950073,
"rewards/margins": 3.9881787300109863,
"rewards/rejected": -3.51570463180542,
"step": 197
},
{
"epoch": 2.3466666666666667,
"grad_norm": 8.053586024276273,
"learning_rate": 2.162944255053928e-07,
"logits/chosen": -1.1554303169250488,
"logits/rejected": -1.0401800870895386,
"logps/chosen": -20.67418670654297,
"logps/rejected": -37.24845504760742,
"loss": 0.0809,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2763448655605316,
"rewards/margins": 4.477565288543701,
"rewards/rejected": -4.201220512390137,
"step": 198
},
{
"epoch": 2.3585185185185185,
"grad_norm": 7.516398498619182,
"learning_rate": 2.137193840401968e-07,
"logits/chosen": -1.1824381351470947,
"logits/rejected": -1.1074461936950684,
"logps/chosen": -28.55365562438965,
"logps/rejected": -41.09587478637695,
"loss": 0.0734,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2404264211654663,
"rewards/margins": 3.8725597858428955,
"rewards/rejected": -3.6321334838867188,
"step": 199
},
{
"epoch": 2.3703703703703702,
"grad_norm": 5.954177017572196,
"learning_rate": 2.1114826863194878e-07,
"logits/chosen": -1.24180269241333,
"logits/rejected": -1.0925354957580566,
"logps/chosen": -28.197025299072266,
"logps/rejected": -46.81939697265625,
"loss": 0.0549,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.15689772367477417,
"rewards/margins": 5.056156635284424,
"rewards/rejected": -5.213054180145264,
"step": 200
},
{
"epoch": 2.3822222222222225,
"grad_norm": 5.991252280343694,
"learning_rate": 2.0858135751036568e-07,
"logits/chosen": -1.222536325454712,
"logits/rejected": -1.1197445392608643,
"logps/chosen": -32.660709381103516,
"logps/rejected": -46.89257049560547,
"loss": 0.0509,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.014814764261245728,
"rewards/margins": 5.237975120544434,
"rewards/rejected": -5.223160743713379,
"step": 201
},
{
"epoch": 2.3940740740740742,
"grad_norm": 6.65615573416704,
"learning_rate": 2.060189284502037e-07,
"logits/chosen": -1.1877946853637695,
"logits/rejected": -1.1109426021575928,
"logps/chosen": -25.55805206298828,
"logps/rejected": -44.239295959472656,
"loss": 0.0634,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.14644792675971985,
"rewards/margins": 4.83575439453125,
"rewards/rejected": -4.689306259155273,
"step": 202
},
{
"epoch": 2.405925925925926,
"grad_norm": 6.275499946646439,
"learning_rate": 2.0346125874119838e-07,
"logits/chosen": -1.132055401802063,
"logits/rejected": -1.0429214239120483,
"logps/chosen": -24.973257064819336,
"logps/rejected": -42.17146682739258,
"loss": 0.071,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.11066167056560516,
"rewards/margins": 4.5910515785217285,
"rewards/rejected": -4.7017130851745605,
"step": 203
},
{
"epoch": 2.417777777777778,
"grad_norm": 7.65769891944596,
"learning_rate": 2.0090862515805895e-07,
"logits/chosen": -1.0738351345062256,
"logits/rejected": -0.8972642421722412,
"logps/chosen": -33.31107711791992,
"logps/rejected": -41.709693908691406,
"loss": 0.0813,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.21268025040626526,
"rewards/margins": 4.644548416137695,
"rewards/rejected": -4.857229232788086,
"step": 204
},
{
"epoch": 2.4296296296296296,
"grad_norm": 7.640686179230129,
"learning_rate": 1.983613039305173e-07,
"logits/chosen": -1.2996752262115479,
"logits/rejected": -1.12294340133667,
"logps/chosen": -18.794048309326172,
"logps/rejected": -45.74852752685547,
"loss": 0.0789,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.16848334670066833,
"rewards/margins": 4.915053367614746,
"rewards/rejected": -5.0835371017456055,
"step": 205
},
{
"epoch": 2.4414814814814814,
"grad_norm": 7.524471411959897,
"learning_rate": 1.9581957071343588e-07,
"logits/chosen": -1.0391274690628052,
"logits/rejected": -0.9014835357666016,
"logps/chosen": -33.915252685546875,
"logps/rejected": -57.86189270019531,
"loss": 0.0805,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4224894046783447,
"rewards/margins": 4.6927666664123535,
"rewards/rejected": -5.115255832672119,
"step": 206
},
{
"epoch": 2.453333333333333,
"grad_norm": 6.9279059385356305,
"learning_rate": 1.9328370055697832e-07,
"logits/chosen": -1.1469345092773438,
"logits/rejected": -0.9380808472633362,
"logps/chosen": -24.10541343688965,
"logps/rejected": -44.4921760559082,
"loss": 0.0595,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.29525160789489746,
"rewards/margins": 4.89801549911499,
"rewards/rejected": -4.602763652801514,
"step": 207
},
{
"epoch": 2.4651851851851854,
"grad_norm": 6.54091678469529,
"learning_rate": 1.907539678768453e-07,
"logits/chosen": -1.1986242532730103,
"logits/rejected": -1.1000490188598633,
"logps/chosen": -22.64141273498535,
"logps/rejected": -53.74283981323242,
"loss": 0.068,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.23207074403762817,
"rewards/margins": 5.020073413848877,
"rewards/rejected": -5.2521443367004395,
"step": 208
},
{
"epoch": 2.477037037037037,
"grad_norm": 7.3835745720901365,
"learning_rate": 1.8823064642457876e-07,
"logits/chosen": -1.1322101354599,
"logits/rejected": -1.0012404918670654,
"logps/chosen": -25.564584732055664,
"logps/rejected": -52.52565002441406,
"loss": 0.0701,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2706539034843445,
"rewards/margins": 4.544902801513672,
"rewards/rejected": -4.815556526184082,
"step": 209
},
{
"epoch": 2.488888888888889,
"grad_norm": 6.037126217772019,
"learning_rate": 1.8571400925793852e-07,
"logits/chosen": -1.32914137840271,
"logits/rejected": -1.199539303779602,
"logps/chosen": -27.011600494384766,
"logps/rejected": -42.806114196777344,
"loss": 0.0639,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.05861341953277588,
"rewards/margins": 4.113726615905762,
"rewards/rejected": -4.055113315582275,
"step": 210
},
{
"epoch": 2.5007407407407407,
"grad_norm": 6.792003028800643,
"learning_rate": 1.8320432871135376e-07,
"logits/chosen": -0.9643785357475281,
"logits/rejected": -0.8642684817314148,
"logps/chosen": -32.56407928466797,
"logps/rejected": -48.981529235839844,
"loss": 0.0659,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.09449410438537598,
"rewards/margins": 4.1956257820129395,
"rewards/rejected": -4.2901201248168945,
"step": 211
},
{
"epoch": 2.5125925925925925,
"grad_norm": 6.652434536599441,
"learning_rate": 1.8070187636645237e-07,
"logits/chosen": -1.1183323860168457,
"logits/rejected": -1.0643121004104614,
"logps/chosen": -23.476839065551758,
"logps/rejected": -46.453697204589844,
"loss": 0.0651,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.05248948931694031,
"rewards/margins": 4.479131698608398,
"rewards/rejected": -4.426641941070557,
"step": 212
},
{
"epoch": 2.5244444444444447,
"grad_norm": 6.873490871799767,
"learning_rate": 1.782069230226725e-07,
"logits/chosen": -0.9355219602584839,
"logits/rejected": -0.8760642409324646,
"logps/chosen": -26.840740203857422,
"logps/rejected": -46.565147399902344,
"loss": 0.0716,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.27372512221336365,
"rewards/margins": 4.666114330291748,
"rewards/rejected": -4.9398393630981445,
"step": 213
},
{
"epoch": 2.536296296296296,
"grad_norm": 6.477809311744379,
"learning_rate": 1.7571973866795813e-07,
"logits/chosen": -1.3275456428527832,
"logits/rejected": -1.1785155534744263,
"logps/chosen": -19.671016693115234,
"logps/rejected": -40.520137786865234,
"loss": 0.0696,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.06844872236251831,
"rewards/margins": 4.899576663970947,
"rewards/rejected": -4.8311285972595215,
"step": 214
},
{
"epoch": 2.5481481481481483,
"grad_norm": 4.940048002831371,
"learning_rate": 1.7324059244954292e-07,
"logits/chosen": -1.461755633354187,
"logits/rejected": -1.3273966312408447,
"logps/chosen": -23.988277435302734,
"logps/rejected": -35.3886604309082,
"loss": 0.0481,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3582080900669098,
"rewards/margins": 4.515974044799805,
"rewards/rejected": -4.874181747436523,
"step": 215
},
{
"epoch": 2.56,
"grad_norm": 9.005658987409907,
"learning_rate": 1.7076975264482433e-07,
"logits/chosen": -1.2200323343276978,
"logits/rejected": -1.0738322734832764,
"logps/chosen": -22.159700393676758,
"logps/rejected": -41.98440170288086,
"loss": 0.0807,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.023108944296836853,
"rewards/margins": 4.085160255432129,
"rewards/rejected": -4.062050819396973,
"step": 216
},
{
"epoch": 2.571851851851852,
"grad_norm": 6.115258133963013,
"learning_rate": 1.6830748663233303e-07,
"logits/chosen": -1.135589599609375,
"logits/rejected": -1.0998283624649048,
"logps/chosen": -22.15255355834961,
"logps/rejected": -39.37363815307617,
"loss": 0.0597,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2580828368663788,
"rewards/margins": 4.229098796844482,
"rewards/rejected": -4.487181186676025,
"step": 217
},
{
"epoch": 2.5837037037037036,
"grad_norm": 7.594741719247832,
"learning_rate": 1.6585406086279846e-07,
"logits/chosen": -1.3007519245147705,
"logits/rejected": -1.258547306060791,
"logps/chosen": -29.01621437072754,
"logps/rejected": -51.67272186279297,
"loss": 0.0695,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.06502366065979004,
"rewards/margins": 5.339412212371826,
"rewards/rejected": -5.274388313293457,
"step": 218
},
{
"epoch": 2.5955555555555554,
"grad_norm": 5.212981266507165,
"learning_rate": 1.6340974083031523e-07,
"logits/chosen": -1.2680379152297974,
"logits/rejected": -1.2023954391479492,
"logps/chosen": -25.777963638305664,
"logps/rejected": -38.38170623779297,
"loss": 0.057,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.02264055609703064,
"rewards/margins": 3.978463649749756,
"rewards/rejected": -3.9558229446411133,
"step": 219
},
{
"epoch": 2.6074074074074076,
"grad_norm": 5.672295808616577,
"learning_rate": 1.6097479104361326e-07,
"logits/chosen": -1.2693517208099365,
"logits/rejected": -1.2250739336013794,
"logps/chosen": -21.100271224975586,
"logps/rejected": -41.79471969604492,
"loss": 0.0562,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08915658295154572,
"rewards/margins": 4.542138576507568,
"rewards/rejected": -4.452981948852539,
"step": 220
},
{
"epoch": 2.6192592592592594,
"grad_norm": 6.347499166452346,
"learning_rate": 1.5854947499743413e-07,
"logits/chosen": -1.0178323984146118,
"logits/rejected": -0.9484214186668396,
"logps/chosen": -18.72942543029785,
"logps/rejected": -43.50739288330078,
"loss": 0.058,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.00246235728263855,
"rewards/margins": 5.049181938171387,
"rewards/rejected": -5.046720027923584,
"step": 221
},
{
"epoch": 2.631111111111111,
"grad_norm": 7.517395617419555,
"learning_rate": 1.5613405514401757e-07,
"logits/chosen": -1.3176552057266235,
"logits/rejected": -1.2037431001663208,
"logps/chosen": -23.663074493408203,
"logps/rejected": -38.63740158081055,
"loss": 0.0784,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.46497219800949097,
"rewards/margins": 3.637241840362549,
"rewards/rejected": -4.1022138595581055,
"step": 222
},
{
"epoch": 2.642962962962963,
"grad_norm": 5.580464995595371,
"learning_rate": 1.537287928647002e-07,
"logits/chosen": -1.1343742609024048,
"logits/rejected": -1.0372800827026367,
"logps/chosen": -24.60474395751953,
"logps/rejected": -35.45951843261719,
"loss": 0.0552,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.19021296501159668,
"rewards/margins": 3.999257802963257,
"rewards/rejected": -4.1894707679748535,
"step": 223
},
{
"epoch": 2.6548148148148147,
"grad_norm": 6.8709626079577175,
"learning_rate": 1.513339484416309e-07,
"logits/chosen": -1.1663920879364014,
"logits/rejected": -1.151513695716858,
"logps/chosen": -34.081424713134766,
"logps/rejected": -52.950035095214844,
"loss": 0.0634,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6248299479484558,
"rewards/margins": 5.083865165710449,
"rewards/rejected": -5.708695411682129,
"step": 224
},
{
"epoch": 2.6666666666666665,
"grad_norm": 5.241170241551687,
"learning_rate": 1.489497810296046e-07,
"logits/chosen": -1.1173107624053955,
"logits/rejected": -1.0356335639953613,
"logps/chosen": -23.928882598876953,
"logps/rejected": -59.75672912597656,
"loss": 0.0511,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.154710054397583,
"rewards/margins": 6.490203857421875,
"rewards/rejected": -6.644913673400879,
"step": 225
},
{
"epoch": 2.6785185185185183,
"grad_norm": 6.118871434229746,
"learning_rate": 1.4657654862801797e-07,
"logits/chosen": -1.1692712306976318,
"logits/rejected": -1.1598937511444092,
"logps/chosen": -21.213607788085938,
"logps/rejected": -43.659019470214844,
"loss": 0.0535,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0038762539625167847,
"rewards/margins": 4.2396321296691895,
"rewards/rejected": -4.243508338928223,
"step": 226
},
{
"epoch": 2.6903703703703705,
"grad_norm": 6.573686325728602,
"learning_rate": 1.4421450805295082e-07,
"logits/chosen": -1.3742166757583618,
"logits/rejected": -1.2483296394348145,
"logps/chosen": -26.414283752441406,
"logps/rejected": -36.898033142089844,
"loss": 0.0631,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4164190888404846,
"rewards/margins": 3.4958038330078125,
"rewards/rejected": -3.0793848037719727,
"step": 227
},
{
"epoch": 2.7022222222222223,
"grad_norm": 6.627117841873176,
"learning_rate": 1.418639149093748e-07,
"logits/chosen": -1.252882719039917,
"logits/rejected": -1.1287035942077637,
"logps/chosen": -27.196077346801758,
"logps/rejected": -36.04934310913086,
"loss": 0.0652,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3088911473751068,
"rewards/margins": 3.228538990020752,
"rewards/rejected": -3.5374302864074707,
"step": 228
},
{
"epoch": 2.714074074074074,
"grad_norm": 5.7383606439736985,
"learning_rate": 1.3952502356349323e-07,
"logits/chosen": -1.134902000427246,
"logits/rejected": -1.048799753189087,
"logps/chosen": -24.576427459716797,
"logps/rejected": -45.68292236328125,
"loss": 0.0555,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.00011165440082550049,
"rewards/margins": 5.5121378898620605,
"rewards/rejected": -5.512249946594238,
"step": 229
},
{
"epoch": 2.725925925925926,
"grad_norm": 7.011579914365523,
"learning_rate": 1.371980871152157e-07,
"logits/chosen": -1.0634100437164307,
"logits/rejected": -0.9104180335998535,
"logps/chosen": -29.859907150268555,
"logps/rejected": -50.70886993408203,
"loss": 0.0694,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.16749510169029236,
"rewards/margins": 5.730169773101807,
"rewards/rejected": -5.5626749992370605,
"step": 230
},
{
"epoch": 2.7377777777777776,
"grad_norm": 6.154624592473375,
"learning_rate": 1.3488335737076911e-07,
"logits/chosen": -1.196423888206482,
"logits/rejected": -1.0755786895751953,
"logps/chosen": -22.506702423095703,
"logps/rejected": -31.105947494506836,
"loss": 0.0662,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.14943012595176697,
"rewards/margins": 3.1946725845336914,
"rewards/rejected": -3.344102621078491,
"step": 231
},
{
"epoch": 2.74962962962963,
"grad_norm": 5.957255330795934,
"learning_rate": 1.3258108481544847e-07,
"logits/chosen": -1.1230725049972534,
"logits/rejected": -1.0154623985290527,
"logps/chosen": -32.393314361572266,
"logps/rejected": -46.890968322753906,
"loss": 0.0571,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3677994906902313,
"rewards/margins": 4.284073829650879,
"rewards/rejected": -4.651873588562012,
"step": 232
},
{
"epoch": 2.7614814814814816,
"grad_norm": 7.438230804694601,
"learning_rate": 1.3029151858651143e-07,
"logits/chosen": -1.351361632347107,
"logits/rejected": -1.2523919343948364,
"logps/chosen": -21.477752685546875,
"logps/rejected": -47.73276138305664,
"loss": 0.072,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.33381107449531555,
"rewards/margins": 5.271888256072998,
"rewards/rejected": -5.60569953918457,
"step": 233
},
{
"epoch": 2.7733333333333334,
"grad_norm": 6.539977486206468,
"learning_rate": 1.2801490644621788e-07,
"logits/chosen": -0.9469627141952515,
"logits/rejected": -0.7967553734779358,
"logps/chosen": -29.131805419921875,
"logps/rejected": -47.47956085205078,
"loss": 0.0694,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.4405498802661896,
"rewards/margins": 4.784643650054932,
"rewards/rejected": -5.225193500518799,
"step": 234
},
{
"epoch": 2.785185185185185,
"grad_norm": 5.650929076564459,
"learning_rate": 1.257514947550189e-07,
"logits/chosen": -1.1391454935073853,
"logits/rejected": -0.9985545873641968,
"logps/chosen": -19.8972110748291,
"logps/rejected": -33.077980041503906,
"loss": 0.0546,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12327444553375244,
"rewards/margins": 4.0027875900268555,
"rewards/rejected": -3.8795135021209717,
"step": 235
},
{
"epoch": 2.797037037037037,
"grad_norm": 7.402429067879936,
"learning_rate": 1.2350152844489688e-07,
"logits/chosen": -1.1549052000045776,
"logits/rejected": -0.9909151792526245,
"logps/chosen": -30.456247329711914,
"logps/rejected": -48.731536865234375,
"loss": 0.0793,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4372573494911194,
"rewards/margins": 4.651694297790527,
"rewards/rejected": -5.088951587677002,
"step": 236
},
{
"epoch": 2.8088888888888888,
"grad_norm": 6.734173424308296,
"learning_rate": 1.2126525099286108e-07,
"logits/chosen": -1.180855631828308,
"logits/rejected": -1.2272781133651733,
"logps/chosen": -28.35424041748047,
"logps/rejected": -48.205318450927734,
"loss": 0.0687,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.47708311676979065,
"rewards/margins": 5.187458515167236,
"rewards/rejected": -5.664542198181152,
"step": 237
},
{
"epoch": 2.8207407407407405,
"grad_norm": 6.387888892476844,
"learning_rate": 1.1904290439459971e-07,
"logits/chosen": -1.1783702373504639,
"logits/rejected": -1.0934996604919434,
"logps/chosen": -23.247806549072266,
"logps/rejected": -42.38697814941406,
"loss": 0.0566,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.272434800863266,
"rewards/margins": 3.895128011703491,
"rewards/rejected": -4.167562484741211,
"step": 238
},
{
"epoch": 2.8325925925925928,
"grad_norm": 5.6141759750684015,
"learning_rate": 1.1683472913829284e-07,
"logits/chosen": -1.2703089714050293,
"logits/rejected": -1.1347819566726685,
"logps/chosen": -36.7236213684082,
"logps/rejected": -49.431922912597656,
"loss": 0.0502,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3381836414337158,
"rewards/margins": 4.108426094055176,
"rewards/rejected": -4.446609973907471,
"step": 239
},
{
"epoch": 2.8444444444444446,
"grad_norm": 5.932933616519591,
"learning_rate": 1.146409641785882e-07,
"logits/chosen": -1.1102083921432495,
"logits/rejected": -1.0604140758514404,
"logps/chosen": -27.76748275756836,
"logps/rejected": -34.07774353027344,
"loss": 0.0608,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.2582487463951111,
"rewards/margins": 2.856698513031006,
"rewards/rejected": -3.1149468421936035,
"step": 240
},
{
"epoch": 2.8562962962962963,
"grad_norm": 6.7530047905552735,
"learning_rate": 1.1246184691074314e-07,
"logits/chosen": -1.2408270835876465,
"logits/rejected": -1.1994930505752563,
"logps/chosen": -28.50021743774414,
"logps/rejected": -49.54254150390625,
"loss": 0.0722,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0546003133058548,
"rewards/margins": 6.17899227142334,
"rewards/rejected": -6.124391555786133,
"step": 241
},
{
"epoch": 2.868148148148148,
"grad_norm": 7.401984494431854,
"learning_rate": 1.1029761314493518e-07,
"logits/chosen": -1.3563504219055176,
"logits/rejected": -1.2836796045303345,
"logps/chosen": -29.872364044189453,
"logps/rejected": -42.799747467041016,
"loss": 0.0685,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.22778728604316711,
"rewards/margins": 5.134041786193848,
"rewards/rejected": -5.3618292808532715,
"step": 242
},
{
"epoch": 2.88,
"grad_norm": 7.471266580413762,
"learning_rate": 1.0814849708074414e-07,
"logits/chosen": -1.128278136253357,
"logits/rejected": -0.9680910706520081,
"logps/chosen": -38.86433792114258,
"logps/rejected": -47.132667541503906,
"loss": 0.0593,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.25867849588394165,
"rewards/margins": 4.5064005851745605,
"rewards/rejected": -4.247722625732422,
"step": 243
},
{
"epoch": 2.891851851851852,
"grad_norm": 6.390593039880407,
"learning_rate": 1.0601473128180854e-07,
"logits/chosen": -1.2510465383529663,
"logits/rejected": -1.100001573562622,
"logps/chosen": -33.47804260253906,
"logps/rejected": -41.27080154418945,
"loss": 0.0621,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0875249058008194,
"rewards/margins": 4.39518404006958,
"rewards/rejected": -4.307658672332764,
"step": 244
},
{
"epoch": 2.9037037037037035,
"grad_norm": 8.267732345292577,
"learning_rate": 1.0389654665065908e-07,
"logits/chosen": -1.1220481395721436,
"logits/rejected": -1.0034825801849365,
"logps/chosen": -24.331592559814453,
"logps/rejected": -41.46772003173828,
"loss": 0.0865,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.26567134261131287,
"rewards/margins": 4.682834148406982,
"rewards/rejected": -4.948505401611328,
"step": 245
},
{
"epoch": 2.9155555555555557,
"grad_norm": 7.488610652410469,
"learning_rate": 1.0179417240373182e-07,
"logits/chosen": -1.176962971687317,
"logits/rejected": -1.1089400053024292,
"logps/chosen": -34.5350341796875,
"logps/rejected": -56.02618408203125,
"loss": 0.0643,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.9151340126991272,
"rewards/margins": 5.155758380889893,
"rewards/rejected": -6.070892333984375,
"step": 246
},
{
"epoch": 2.9274074074074075,
"grad_norm": 6.376533768492628,
"learning_rate": 9.970783604656383e-08,
"logits/chosen": -1.3059768676757812,
"logits/rejected": -1.0361342430114746,
"logps/chosen": -28.046321868896484,
"logps/rejected": -48.62135696411133,
"loss": 0.0632,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.13395918905735016,
"rewards/margins": 5.584090232849121,
"rewards/rejected": -5.718049049377441,
"step": 247
},
{
"epoch": 2.9392592592592592,
"grad_norm": 7.764371689739165,
"learning_rate": 9.763776334917398e-08,
"logits/chosen": -1.3117642402648926,
"logits/rejected": -1.1723650693893433,
"logps/chosen": -28.31963348388672,
"logps/rejected": -37.416561126708984,
"loss": 0.0825,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4260867238044739,
"rewards/margins": 2.7781217098236084,
"rewards/rejected": -3.2042083740234375,
"step": 248
},
{
"epoch": 2.951111111111111,
"grad_norm": 6.603531713615615,
"learning_rate": 9.558417832163162e-08,
"logits/chosen": -1.0509438514709473,
"logits/rejected": -1.1028845310211182,
"logps/chosen": -29.35840606689453,
"logps/rejected": -39.08806610107422,
"loss": 0.0572,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.06763426959514618,
"rewards/margins": 4.334118843078613,
"rewards/rejected": -4.401752948760986,
"step": 249
},
{
"epoch": 2.962962962962963,
"grad_norm": 6.641636931789762,
"learning_rate": 9.354730318981561e-08,
"logits/chosen": -1.269490122795105,
"logits/rejected": -1.1995911598205566,
"logps/chosen": -23.048587799072266,
"logps/rejected": -41.5166015625,
"loss": 0.0718,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.22312739491462708,
"rewards/margins": 4.370500564575195,
"rewards/rejected": -4.5936279296875,
"step": 250
},
{
"epoch": 2.974814814814815,
"grad_norm": 5.554303148575841,
"learning_rate": 9.15273583713663e-08,
"logits/chosen": -1.2579662799835205,
"logits/rejected": -1.0015959739685059,
"logps/chosen": -31.479568481445312,
"logps/rejected": -56.00233459472656,
"loss": 0.0555,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5022794008255005,
"rewards/margins": 6.369531154632568,
"rewards/rejected": -6.871809959411621,
"step": 251
},
{
"epoch": 2.986666666666667,
"grad_norm": 5.613495199138643,
"learning_rate": 8.95245624518336e-08,
"logits/chosen": -1.2209105491638184,
"logits/rejected": -1.217021107673645,
"logps/chosen": -25.06351089477539,
"logps/rejected": -47.17867660522461,
"loss": 0.0597,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4572719633579254,
"rewards/margins": 4.925302028656006,
"rewards/rejected": -5.382573127746582,
"step": 252
},
{
"epoch": 2.9985185185185186,
"grad_norm": 5.721091066167364,
"learning_rate": 8.753913216102285e-08,
"logits/chosen": -1.257638931274414,
"logits/rejected": -1.1348259449005127,
"logps/chosen": -28.36161231994629,
"logps/rejected": -52.211952209472656,
"loss": 0.0512,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5001235604286194,
"rewards/margins": 4.936119079589844,
"rewards/rejected": -5.436242580413818,
"step": 253
},
{
"epoch": 3.0103703703703704,
"grad_norm": 6.164342961198106,
"learning_rate": 8.557128234954189e-08,
"logits/chosen": -1.16610848903656,
"logits/rejected": -1.0525445938110352,
"logps/chosen": -19.37337875366211,
"logps/rejected": -44.04081344604492,
"loss": 0.0608,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3864176273345947,
"rewards/margins": 5.418819427490234,
"rewards/rejected": -5.805237293243408,
"step": 254
},
{
"epoch": 3.022222222222222,
"grad_norm": 4.836985245782948,
"learning_rate": 8.362122596555088e-08,
"logits/chosen": -1.1399970054626465,
"logits/rejected": -0.9710614681243896,
"logps/chosen": -23.326759338378906,
"logps/rejected": -46.79590606689453,
"loss": 0.0426,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2866979241371155,
"rewards/margins": 6.633603572845459,
"rewards/rejected": -6.3469061851501465,
"step": 255
},
{
"epoch": 3.034074074074074,
"grad_norm": 5.427568975360207,
"learning_rate": 8.16891740317189e-08,
"logits/chosen": -1.2294830083847046,
"logits/rejected": -1.1226603984832764,
"logps/chosen": -23.196685791015625,
"logps/rejected": -38.58136749267578,
"loss": 0.0515,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.16551783680915833,
"rewards/margins": 4.354981899261475,
"rewards/rejected": -4.520500183105469,
"step": 256
},
{
"epoch": 3.0459259259259257,
"grad_norm": 5.924541071404178,
"learning_rate": 7.977533562238838e-08,
"logits/chosen": -1.1663788557052612,
"logits/rejected": -1.1404701471328735,
"logps/chosen": -26.776004791259766,
"logps/rejected": -50.571266174316406,
"loss": 0.059,
"rewards/accuracies": 0.9375,
"rewards/chosen": -0.18248632550239563,
"rewards/margins": 5.887378692626953,
"rewards/rejected": -6.069864273071289,
"step": 257
},
{
"epoch": 3.057777777777778,
"grad_norm": 4.128731375178606,
"learning_rate": 7.787991784094999e-08,
"logits/chosen": -1.2448476552963257,
"logits/rejected": -1.0964651107788086,
"logps/chosen": -29.85052490234375,
"logps/rejected": -62.34690856933594,
"loss": 0.0365,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4742766320705414,
"rewards/margins": 5.80230712890625,
"rewards/rejected": -6.276583671569824,
"step": 258
},
{
"epoch": 3.0696296296296297,
"grad_norm": 6.5179983840331825,
"learning_rate": 7.60031257974316e-08,
"logits/chosen": -1.1081359386444092,
"logits/rejected": -1.0185449123382568,
"logps/chosen": -23.463979721069336,
"logps/rejected": -50.03909683227539,
"loss": 0.0571,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.14104682207107544,
"rewards/margins": 5.487791538238525,
"rewards/rejected": -5.628839015960693,
"step": 259
},
{
"epoch": 3.0814814814814815,
"grad_norm": 6.670813820042167,
"learning_rate": 7.414516258630244e-08,
"logits/chosen": -1.0931766033172607,
"logits/rejected": -0.9176234602928162,
"logps/chosen": -35.09284210205078,
"logps/rejected": -56.267723083496094,
"loss": 0.0615,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.34566253423690796,
"rewards/margins": 5.847842216491699,
"rewards/rejected": -6.193504810333252,
"step": 260
},
{
"epoch": 3.0933333333333333,
"grad_norm": 4.886020098171949,
"learning_rate": 7.230622926449564e-08,
"logits/chosen": -1.2389843463897705,
"logits/rejected": -1.1709716320037842,
"logps/chosen": -23.021934509277344,
"logps/rejected": -42.478797912597656,
"loss": 0.0446,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.25014615058898926,
"rewards/margins": 5.359426975250244,
"rewards/rejected": -5.6095733642578125,
"step": 261
},
{
"epoch": 3.105185185185185,
"grad_norm": 5.470390367743688,
"learning_rate": 7.048652482965078e-08,
"logits/chosen": -1.250532865524292,
"logits/rejected": -1.098189353942871,
"logps/chosen": -33.6146354675293,
"logps/rejected": -41.64539337158203,
"loss": 0.0575,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.16313320398330688,
"rewards/margins": 4.307170391082764,
"rewards/rejected": -4.470303535461426,
"step": 262
},
{
"epoch": 3.117037037037037,
"grad_norm": 4.836356334775007,
"learning_rate": 6.868624619858021e-08,
"logits/chosen": -1.4147872924804688,
"logits/rejected": -1.4524210691452026,
"logps/chosen": -28.40629768371582,
"logps/rejected": -56.72626495361328,
"loss": 0.0447,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.03009369969367981,
"rewards/margins": 5.194394111633301,
"rewards/rejected": -5.164300918579102,
"step": 263
},
{
"epoch": 3.128888888888889,
"grad_norm": 4.750367218060603,
"learning_rate": 6.690558818595943e-08,
"logits/chosen": -1.2358546257019043,
"logits/rejected": -1.1999270915985107,
"logps/chosen": -25.05208969116211,
"logps/rejected": -48.712806701660156,
"loss": 0.0435,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.637313723564148,
"rewards/margins": 5.243877410888672,
"rewards/rejected": -5.881191253662109,
"step": 264
},
{
"epoch": 3.140740740740741,
"grad_norm": 4.863983890990079,
"learning_rate": 6.514474348324581e-08,
"logits/chosen": -1.2671034336090088,
"logits/rejected": -1.1254373788833618,
"logps/chosen": -32.094966888427734,
"logps/rejected": -52.297821044921875,
"loss": 0.0446,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4840225875377655,
"rewards/margins": 5.210573196411133,
"rewards/rejected": -5.6945953369140625,
"step": 265
},
{
"epoch": 3.1525925925925926,
"grad_norm": 6.337695693323137,
"learning_rate": 6.340390263782655e-08,
"logits/chosen": -1.2698873281478882,
"logits/rejected": -1.172045111656189,
"logps/chosen": -24.47865867614746,
"logps/rejected": -54.05537796020508,
"loss": 0.0665,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3584892153739929,
"rewards/margins": 5.420907020568848,
"rewards/rejected": -5.779396057128906,
"step": 266
},
{
"epoch": 3.1644444444444444,
"grad_norm": 6.315515433549729,
"learning_rate": 6.168325403239913e-08,
"logits/chosen": -1.2651307582855225,
"logits/rejected": -1.1162527799606323,
"logps/chosen": -19.784488677978516,
"logps/rejected": -40.73728942871094,
"loss": 0.0556,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.27537021040916443,
"rewards/margins": 5.313858985900879,
"rewards/rejected": -5.038488388061523,
"step": 267
},
{
"epoch": 3.176296296296296,
"grad_norm": 4.582040973118046,
"learning_rate": 5.998298386458545e-08,
"logits/chosen": -1.0796051025390625,
"logits/rejected": -1.0264118909835815,
"logps/chosen": -27.581031799316406,
"logps/rejected": -49.427703857421875,
"loss": 0.0417,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.022144198417663574,
"rewards/margins": 5.080024719238281,
"rewards/rejected": -5.057880878448486,
"step": 268
},
{
"epoch": 3.188148148148148,
"grad_norm": 4.952404939534042,
"learning_rate": 5.830327612678265e-08,
"logits/chosen": -1.0570693016052246,
"logits/rejected": -1.0790140628814697,
"logps/chosen": -27.206192016601562,
"logps/rejected": -52.819984436035156,
"loss": 0.0419,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.8033032417297363,
"rewards/margins": 4.914515495300293,
"rewards/rejected": -5.717819690704346,
"step": 269
},
{
"epoch": 3.2,
"grad_norm": 5.724512806119854,
"learning_rate": 5.6644312586253044e-08,
"logits/chosen": -1.0734919309616089,
"logits/rejected": -1.0849241018295288,
"logps/chosen": -41.63764572143555,
"logps/rejected": -48.729576110839844,
"loss": 0.0562,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.17860059440135956,
"rewards/margins": 4.616375923156738,
"rewards/rejected": -4.794977188110352,
"step": 270
},
{
"epoch": 3.211851851851852,
"grad_norm": 5.991455888598502,
"learning_rate": 5.5006272765454056e-08,
"logits/chosen": -1.2988901138305664,
"logits/rejected": -1.1308969259262085,
"logps/chosen": -22.436080932617188,
"logps/rejected": -34.09817123413086,
"loss": 0.0584,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.005201712250709534,
"rewards/margins": 3.529590606689453,
"rewards/rejected": -3.534792423248291,
"step": 271
},
{
"epoch": 3.2237037037037037,
"grad_norm": 5.413534996418431,
"learning_rate": 5.338933392261158e-08,
"logits/chosen": -1.222093105316162,
"logits/rejected": -1.1171449422836304,
"logps/chosen": -26.16643714904785,
"logps/rejected": -42.16415023803711,
"loss": 0.0508,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.20804953575134277,
"rewards/margins": 5.081421852111816,
"rewards/rejected": -5.2894721031188965,
"step": 272
},
{
"epoch": 3.2355555555555555,
"grad_norm": 5.91458057536832,
"learning_rate": 5.1793671032538206e-08,
"logits/chosen": -1.2229275703430176,
"logits/rejected": -1.3230491876602173,
"logps/chosen": -23.901247024536133,
"logps/rejected": -45.79841995239258,
"loss": 0.0586,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.171402707695961,
"rewards/margins": 4.954162120819092,
"rewards/rejected": -5.125565052032471,
"step": 273
},
{
"epoch": 3.2474074074074073,
"grad_norm": 5.22719369235926,
"learning_rate": 5.021945676769859e-08,
"logits/chosen": -1.2852232456207275,
"logits/rejected": -1.2391951084136963,
"logps/chosen": -20.282339096069336,
"logps/rejected": -42.286293029785156,
"loss": 0.0503,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.014406859874725342,
"rewards/margins": 4.513213157653809,
"rewards/rejected": -4.5276198387146,
"step": 274
},
{
"epoch": 3.259259259259259,
"grad_norm": 5.73422178803048,
"learning_rate": 4.866686147952387e-08,
"logits/chosen": -1.0481388568878174,
"logits/rejected": -0.9910224676132202,
"logps/chosen": -31.128089904785156,
"logps/rejected": -48.627586364746094,
"loss": 0.0565,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.11383038759231567,
"rewards/margins": 4.8710784912109375,
"rewards/rejected": -4.757248401641846,
"step": 275
},
{
"epoch": 3.2711111111111113,
"grad_norm": 5.655456397723797,
"learning_rate": 4.71360531799774e-08,
"logits/chosen": -1.1052677631378174,
"logits/rejected": -1.0184680223464966,
"logps/chosen": -36.36450958251953,
"logps/rejected": -51.73442840576172,
"loss": 0.0596,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5170344114303589,
"rewards/margins": 5.25890588760376,
"rewards/rejected": -5.775939464569092,
"step": 276
},
{
"epoch": 3.282962962962963,
"grad_norm": 4.996738283026781,
"learning_rate": 4.562719752337349e-08,
"logits/chosen": -1.266676664352417,
"logits/rejected": -1.1158446073532104,
"logps/chosen": -33.958919525146484,
"logps/rejected": -66.85248565673828,
"loss": 0.0523,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6868615746498108,
"rewards/margins": 6.892773628234863,
"rewards/rejected": -7.579635143280029,
"step": 277
},
{
"epoch": 3.294814814814815,
"grad_norm": 5.1730881424971535,
"learning_rate": 4.4140457788451434e-08,
"logits/chosen": -1.3682211637496948,
"logits/rejected": -1.2177406549453735,
"logps/chosen": -23.593040466308594,
"logps/rejected": -43.28880310058594,
"loss": 0.047,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.21666671335697174,
"rewards/margins": 4.890883445739746,
"rewards/rejected": -4.674216270446777,
"step": 278
},
{
"epoch": 3.3066666666666666,
"grad_norm": 4.729619192449929,
"learning_rate": 4.267599486070647e-08,
"logits/chosen": -1.2258741855621338,
"logits/rejected": -1.1649140119552612,
"logps/chosen": -31.068470001220703,
"logps/rejected": -36.381038665771484,
"loss": 0.0481,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.24162916839122772,
"rewards/margins": 4.573906898498535,
"rewards/rejected": -4.8155364990234375,
"step": 279
},
{
"epoch": 3.3185185185185184,
"grad_norm": 5.122216550777693,
"learning_rate": 4.1233967214979764e-08,
"logits/chosen": -1.198957920074463,
"logits/rejected": -1.06025230884552,
"logps/chosen": -33.02262496948242,
"logps/rejected": -41.4984130859375,
"loss": 0.049,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.23480704426765442,
"rewards/margins": 3.949801445007324,
"rewards/rejected": -3.714993953704834,
"step": 280
},
{
"epoch": 3.33037037037037,
"grad_norm": 4.305629596628497,
"learning_rate": 3.9814530898309356e-08,
"logits/chosen": -1.0878995656967163,
"logits/rejected": -1.0379247665405273,
"logps/chosen": -27.192787170410156,
"logps/rejected": -46.65719223022461,
"loss": 0.0366,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.029019802808761597,
"rewards/margins": 5.454700946807861,
"rewards/rejected": -5.483720779418945,
"step": 281
},
{
"epoch": 3.3422222222222224,
"grad_norm": 5.169778953020736,
"learning_rate": 3.8417839513043646e-08,
"logits/chosen": -1.2834384441375732,
"logits/rejected": -1.2438150644302368,
"logps/chosen": -30.712045669555664,
"logps/rejected": -37.924110412597656,
"loss": 0.0538,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.27224576473236084,
"rewards/margins": 3.7623844146728516,
"rewards/rejected": -4.034629821777344,
"step": 282
},
{
"epoch": 3.354074074074074,
"grad_norm": 6.097603815404355,
"learning_rate": 3.704404420021956e-08,
"logits/chosen": -1.1656073331832886,
"logits/rejected": -0.950996994972229,
"logps/chosen": -27.072315216064453,
"logps/rejected": -46.62635040283203,
"loss": 0.0607,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.001886114478111267,
"rewards/margins": 5.3247785568237305,
"rewards/rejected": -5.326663970947266,
"step": 283
},
{
"epoch": 3.365925925925926,
"grad_norm": 5.599744322780303,
"learning_rate": 3.569329362320708e-08,
"logits/chosen": -1.015643835067749,
"logits/rejected": -0.936226487159729,
"logps/chosen": -21.00103187561035,
"logps/rejected": -49.05156326293945,
"loss": 0.0462,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.10715761035680771,
"rewards/margins": 5.087098121643066,
"rewards/rejected": -5.19425630569458,
"step": 284
},
{
"epoch": 3.3777777777777778,
"grad_norm": 5.412291665519436,
"learning_rate": 3.436573395162179e-08,
"logits/chosen": -1.2125096321105957,
"logits/rejected": -1.0717750787734985,
"logps/chosen": -26.21784782409668,
"logps/rejected": -44.80372619628906,
"loss": 0.0562,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4322938024997711,
"rewards/margins": 4.486114025115967,
"rewards/rejected": -4.918407440185547,
"step": 285
},
{
"epoch": 3.3896296296296295,
"grad_norm": 5.068684864066647,
"learning_rate": 3.306150884550732e-08,
"logits/chosen": -1.306767225265503,
"logits/rejected": -1.136150598526001,
"logps/chosen": -28.90319061279297,
"logps/rejected": -48.472164154052734,
"loss": 0.0491,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3896186947822571,
"rewards/margins": 4.801982879638672,
"rewards/rejected": -5.191601753234863,
"step": 286
},
{
"epoch": 3.4014814814814813,
"grad_norm": 6.257371157657287,
"learning_rate": 3.17807594397895e-08,
"logits/chosen": -1.2118041515350342,
"logits/rejected": -1.007792353630066,
"logps/chosen": -26.383615493774414,
"logps/rejected": -46.10572052001953,
"loss": 0.0542,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.43773341178894043,
"rewards/margins": 5.824153900146484,
"rewards/rejected": -6.261887550354004,
"step": 287
},
{
"epoch": 3.413333333333333,
"grad_norm": 5.353883051519317,
"learning_rate": 3.052362432900332e-08,
"logits/chosen": -1.447021245956421,
"logits/rejected": -1.2934633493423462,
"logps/chosen": -25.619125366210938,
"logps/rejected": -42.07542037963867,
"loss": 0.0495,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3246709406375885,
"rewards/margins": 7.011876106262207,
"rewards/rejected": -6.687204360961914,
"step": 288
},
{
"epoch": 3.4251851851851853,
"grad_norm": 5.399450219209751,
"learning_rate": 2.9290239552295538e-08,
"logits/chosen": -1.0401594638824463,
"logits/rejected": -1.0249950885772705,
"logps/chosen": -32.01249313354492,
"logps/rejected": -38.693145751953125,
"loss": 0.0501,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.22764620184898376,
"rewards/margins": 4.879059314727783,
"rewards/rejected": -4.6514129638671875,
"step": 289
},
{
"epoch": 3.437037037037037,
"grad_norm": 5.942445036249677,
"learning_rate": 2.8080738578703052e-08,
"logits/chosen": -1.2160862684249878,
"logits/rejected": -1.1057730913162231,
"logps/chosen": -26.857769012451172,
"logps/rejected": -49.42009735107422,
"loss": 0.0644,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0421622097492218,
"rewards/margins": 7.015720844268799,
"rewards/rejected": -6.973557472229004,
"step": 290
},
{
"epoch": 3.448888888888889,
"grad_norm": 4.452390830898345,
"learning_rate": 2.6895252292709974e-08,
"logits/chosen": -1.0676244497299194,
"logits/rejected": -1.078723669052124,
"logps/chosen": -31.738510131835938,
"logps/rejected": -45.86015319824219,
"loss": 0.0474,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5407127737998962,
"rewards/margins": 4.980555057525635,
"rewards/rejected": -5.521267890930176,
"step": 291
},
{
"epoch": 3.4607407407407407,
"grad_norm": 6.147853636678421,
"learning_rate": 2.5733908980083984e-08,
"logits/chosen": -1.2384705543518066,
"logits/rejected": -1.112764835357666,
"logps/chosen": -26.170108795166016,
"logps/rejected": -45.731956481933594,
"loss": 0.0609,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5723249316215515,
"rewards/margins": 4.402219772338867,
"rewards/rejected": -4.974545001983643,
"step": 292
},
{
"epoch": 3.4725925925925925,
"grad_norm": 5.914419745435524,
"learning_rate": 2.4596834313994037e-08,
"logits/chosen": -1.1161627769470215,
"logits/rejected": -1.0215301513671875,
"logps/chosen": -28.129005432128906,
"logps/rejected": -33.972686767578125,
"loss": 0.0566,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1716342568397522,
"rewards/margins": 4.373476028442383,
"rewards/rejected": -4.201840877532959,
"step": 293
},
{
"epoch": 3.4844444444444447,
"grad_norm": 4.7471616018558285,
"learning_rate": 2.3484151341411018e-08,
"logits/chosen": -1.1082960367202759,
"logits/rejected": -1.0436348915100098,
"logps/chosen": -20.280670166015625,
"logps/rejected": -46.68223190307617,
"loss": 0.0442,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2738330066204071,
"rewards/margins": 5.264364719390869,
"rewards/rejected": -5.5381975173950195,
"step": 294
},
{
"epoch": 3.4962962962962965,
"grad_norm": 4.718228569099853,
"learning_rate": 2.23959804697921e-08,
"logits/chosen": -1.0989983081817627,
"logits/rejected": -1.0200862884521484,
"logps/chosen": -28.536529541015625,
"logps/rejected": -44.35844421386719,
"loss": 0.039,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.01516886055469513,
"rewards/margins": 5.231680870056152,
"rewards/rejected": -5.216512680053711,
"step": 295
},
{
"epoch": 3.5081481481481482,
"grad_norm": 4.412160626992289,
"learning_rate": 2.1332439454051277e-08,
"logits/chosen": -1.0349336862564087,
"logits/rejected": -0.9772415161132812,
"logps/chosen": -24.290695190429688,
"logps/rejected": -34.85298538208008,
"loss": 0.0405,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.006944596767425537,
"rewards/margins": 3.6680963039398193,
"rewards/rejected": -3.661151647567749,
"step": 296
},
{
"epoch": 3.52,
"grad_norm": 5.698184998134574,
"learning_rate": 2.029364338381656e-08,
"logits/chosen": -1.373365879058838,
"logits/rejected": -1.2929483652114868,
"logps/chosen": -34.31553649902344,
"logps/rejected": -35.5068359375,
"loss": 0.0555,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.09695194661617279,
"rewards/margins": 3.730624198913574,
"rewards/rejected": -3.8275763988494873,
"step": 297
},
{
"epoch": 3.531851851851852,
"grad_norm": 5.166813211580323,
"learning_rate": 1.9279704670975726e-08,
"logits/chosen": -1.0577523708343506,
"logits/rejected": -0.9344998598098755,
"logps/chosen": -25.05517578125,
"logps/rejected": -48.95963668823242,
"loss": 0.0435,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.17365989089012146,
"rewards/margins": 4.252870559692383,
"rewards/rejected": -4.426530838012695,
"step": 298
},
{
"epoch": 3.5437037037037036,
"grad_norm": 4.976330098589956,
"learning_rate": 1.829073303751172e-08,
"logits/chosen": -1.071714162826538,
"logits/rejected": -1.0084483623504639,
"logps/chosen": -20.396150588989258,
"logps/rejected": -38.729373931884766,
"loss": 0.0463,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.11510992050170898,
"rewards/margins": 5.170332431793213,
"rewards/rejected": -5.2854413986206055,
"step": 299
},
{
"epoch": 3.5555555555555554,
"grad_norm": 4.623020185136485,
"learning_rate": 1.732683550362954e-08,
"logits/chosen": -1.06589674949646,
"logits/rejected": -1.0053000450134277,
"logps/chosen": -33.81154251098633,
"logps/rejected": -48.16522216796875,
"loss": 0.0401,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0654190182685852,
"rewards/margins": 4.919932842254639,
"rewards/rejected": -4.985352516174316,
"step": 300
},
{
"epoch": 3.5674074074074076,
"grad_norm": 4.373917316257469,
"learning_rate": 1.6388116376174765e-08,
"logits/chosen": -1.1930819749832153,
"logits/rejected": -1.1007626056671143,
"logps/chosen": -24.583969116210938,
"logps/rejected": -48.29629898071289,
"loss": 0.0404,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5538195371627808,
"rewards/margins": 5.18541145324707,
"rewards/rejected": -5.739231109619141,
"step": 301
},
{
"epoch": 3.5792592592592594,
"grad_norm": 4.944808160984247,
"learning_rate": 1.5474677237346468e-08,
"logits/chosen": -1.1952768564224243,
"logits/rejected": -1.1539109945297241,
"logps/chosen": -29.354717254638672,
"logps/rejected": -49.623294830322266,
"loss": 0.0512,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.31121665239334106,
"rewards/margins": 4.585163116455078,
"rewards/rejected": -4.896379470825195,
"step": 302
},
{
"epoch": 3.591111111111111,
"grad_norm": 6.237582774941322,
"learning_rate": 1.4586616933704527e-08,
"logits/chosen": -1.0483250617980957,
"logits/rejected": -1.0512489080429077,
"logps/chosen": -36.7315788269043,
"logps/rejected": -52.41490173339844,
"loss": 0.063,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.060242414474487305,
"rewards/margins": 5.002007484436035,
"rewards/rejected": -4.941765308380127,
"step": 303
},
{
"epoch": 3.602962962962963,
"grad_norm": 5.366887328514776,
"learning_rate": 1.372403156547311e-08,
"logits/chosen": -1.2591538429260254,
"logits/rejected": -1.1872644424438477,
"logps/chosen": -22.69057273864746,
"logps/rejected": -38.499332427978516,
"loss": 0.0535,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.3308228552341461,
"rewards/margins": 4.569196701049805,
"rewards/rejected": -4.900019645690918,
"step": 304
},
{
"epoch": 3.6148148148148147,
"grad_norm": 4.383186056032288,
"learning_rate": 1.2887014476141212e-08,
"logits/chosen": -1.1302443742752075,
"logits/rejected": -1.1017392873764038,
"logps/chosen": -27.243087768554688,
"logps/rejected": -47.09513473510742,
"loss": 0.0441,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2740994989871979,
"rewards/margins": 6.748981475830078,
"rewards/rejected": -6.474882125854492,
"step": 305
},
{
"epoch": 3.626666666666667,
"grad_norm": 5.520520861273014,
"learning_rate": 1.2075656242361732e-08,
"logits/chosen": -1.1834189891815186,
"logits/rejected": -1.0502477884292603,
"logps/chosen": -24.07543182373047,
"logps/rejected": -44.05875778198242,
"loss": 0.0493,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1085430383682251,
"rewards/margins": 4.616766452789307,
"rewards/rejected": -4.725309371948242,
"step": 306
},
{
"epoch": 3.6385185185185183,
"grad_norm": 4.5846368218080045,
"learning_rate": 1.1290044664149873e-08,
"logits/chosen": -1.0908325910568237,
"logits/rejected": -1.0090572834014893,
"logps/chosen": -32.33647918701172,
"logps/rejected": -47.15243148803711,
"loss": 0.0393,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.17506128549575806,
"rewards/margins": 4.792283058166504,
"rewards/rejected": -4.967344284057617,
"step": 307
},
{
"epoch": 3.6503703703703705,
"grad_norm": 5.28209891846498,
"learning_rate": 1.0530264755381824e-08,
"logits/chosen": -1.2786378860473633,
"logits/rejected": -1.3132318258285522,
"logps/chosen": -26.759113311767578,
"logps/rejected": -41.227149963378906,
"loss": 0.0528,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.08898818492889404,
"rewards/margins": 3.8004322052001953,
"rewards/rejected": -3.8894202709198,
"step": 308
},
{
"epoch": 3.6622222222222223,
"grad_norm": 4.960907388580732,
"learning_rate": 9.796398734595284e-09,
"logits/chosen": -1.1778481006622314,
"logits/rejected": -1.181472897529602,
"logps/chosen": -20.444726943969727,
"logps/rejected": -33.29534149169922,
"loss": 0.0466,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2902683913707733,
"rewards/margins": 3.6233019828796387,
"rewards/rejected": -3.9135704040527344,
"step": 309
},
{
"epoch": 3.674074074074074,
"grad_norm": 5.737646906284586,
"learning_rate": 9.088526016092141e-09,
"logits/chosen": -1.1990212202072144,
"logits/rejected": -1.1145985126495361,
"logps/chosen": -23.687454223632812,
"logps/rejected": -40.095672607421875,
"loss": 0.0516,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.447526752948761,
"rewards/margins": 5.467113018035889,
"rewards/rejected": -5.019586086273193,
"step": 310
},
{
"epoch": 3.685925925925926,
"grad_norm": 5.7150399704998245,
"learning_rate": 8.40672320134489e-09,
"logits/chosen": -1.146994948387146,
"logits/rejected": -0.9583498239517212,
"logps/chosen": -27.36312484741211,
"logps/rejected": -43.72743225097656,
"loss": 0.0543,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.11620418727397919,
"rewards/margins": 5.6578192710876465,
"rewards/rejected": -5.774023056030273,
"step": 311
},
{
"epoch": 3.6977777777777776,
"grad_norm": 4.67711156350355,
"learning_rate": 7.751064070707247e-09,
"logits/chosen": -1.3420299291610718,
"logits/rejected": -1.3341833353042603,
"logps/chosen": -31.239133834838867,
"logps/rejected": -41.84351348876953,
"loss": 0.0412,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3695347011089325,
"rewards/margins": 4.75352668762207,
"rewards/rejected": -4.3839921951293945,
"step": 312
},
{
"epoch": 3.70962962962963,
"grad_norm": 5.331465549642304,
"learning_rate": 7.12161957543006e-09,
"logits/chosen": -1.1273610591888428,
"logits/rejected": -1.1161746978759766,
"logps/chosen": -37.207733154296875,
"logps/rejected": -61.19139862060547,
"loss": 0.0479,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4315568804740906,
"rewards/margins": 5.310615539550781,
"rewards/rejected": -5.742172718048096,
"step": 313
},
{
"epoch": 3.7214814814814816,
"grad_norm": 5.478798851131127,
"learning_rate": 6.518457829983559e-09,
"logits/chosen": -1.3124021291732788,
"logits/rejected": -1.2279609441757202,
"logps/chosen": -34.83631896972656,
"logps/rejected": -44.276790618896484,
"loss": 0.0511,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.18586915731430054,
"rewards/margins": 3.245110511779785,
"rewards/rejected": -3.4309799671173096,
"step": 314
},
{
"epoch": 3.7333333333333334,
"grad_norm": 4.7427648272619,
"learning_rate": 5.9416441046862555e-09,
"logits/chosen": -1.1716216802597046,
"logits/rejected": -1.2297029495239258,
"logps/chosen": -21.677108764648438,
"logps/rejected": -35.96882247924805,
"loss": 0.0502,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.003640979528427124,
"rewards/margins": 3.7295522689819336,
"rewards/rejected": -3.7259111404418945,
"step": 315
},
{
"epoch": 3.745185185185185,
"grad_norm": 5.760686688528461,
"learning_rate": 5.3912408186420064e-09,
"logits/chosen": -1.038623332977295,
"logits/rejected": -0.9665778875350952,
"logps/chosen": -27.82607650756836,
"logps/rejected": -35.596378326416016,
"loss": 0.0571,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.08153516054153442,
"rewards/margins": 4.407654762268066,
"rewards/rejected": -4.326119422912598,
"step": 316
},
{
"epoch": 3.757037037037037,
"grad_norm": 4.629475217167777,
"learning_rate": 4.867307532985227e-09,
"logits/chosen": -1.2615653276443481,
"logits/rejected": -1.1494407653808594,
"logps/chosen": -40.15790557861328,
"logps/rejected": -60.7736701965332,
"loss": 0.0398,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.6902495622634888,
"rewards/margins": 5.601743698120117,
"rewards/rejected": -6.291993141174316,
"step": 317
},
{
"epoch": 3.7688888888888887,
"grad_norm": 6.454886951587756,
"learning_rate": 4.369900944435734e-09,
"logits/chosen": -1.0968234539031982,
"logits/rejected": -1.026517391204834,
"logps/chosen": -31.793502807617188,
"logps/rejected": -60.37879180908203,
"loss": 0.0663,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.31835824251174927,
"rewards/margins": 5.233622074127197,
"rewards/rejected": -5.551980495452881,
"step": 318
},
{
"epoch": 3.7807407407407405,
"grad_norm": 5.37027735834608,
"learning_rate": 3.899074879163244e-09,
"logits/chosen": -1.2527568340301514,
"logits/rejected": -1.0810654163360596,
"logps/chosen": -24.402645111083984,
"logps/rejected": -39.67679977416992,
"loss": 0.0538,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4089929461479187,
"rewards/margins": 4.298420429229736,
"rewards/rejected": -4.707413196563721,
"step": 319
},
{
"epoch": 3.7925925925925927,
"grad_norm": 5.568233279162257,
"learning_rate": 3.4548802869627804e-09,
"logits/chosen": -1.291711688041687,
"logits/rejected": -1.2471994161605835,
"logps/chosen": -31.061437606811523,
"logps/rejected": -49.516639709472656,
"loss": 0.0473,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.07120761275291443,
"rewards/margins": 3.723219394683838,
"rewards/rejected": -3.794426918029785,
"step": 320
},
{
"epoch": 3.8044444444444445,
"grad_norm": 6.171276653233977,
"learning_rate": 3.037365235741024e-09,
"logits/chosen": -1.3342313766479492,
"logits/rejected": -1.187886357307434,
"logps/chosen": -24.079877853393555,
"logps/rejected": -38.28224182128906,
"loss": 0.0611,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4851805865764618,
"rewards/margins": 4.108402252197266,
"rewards/rejected": -4.593582630157471,
"step": 321
},
{
"epoch": 3.8162962962962963,
"grad_norm": 6.301615641450496,
"learning_rate": 2.6465749063149245e-09,
"logits/chosen": -1.4614932537078857,
"logits/rejected": -1.3210101127624512,
"logps/chosen": -24.112567901611328,
"logps/rejected": -51.42138671875,
"loss": 0.0652,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.5791712999343872,
"rewards/margins": 6.299165725708008,
"rewards/rejected": -6.8783369064331055,
"step": 322
},
{
"epoch": 3.828148148148148,
"grad_norm": 6.87975838997433,
"learning_rate": 2.282551587522441e-09,
"logits/chosen": -1.406750202178955,
"logits/rejected": -1.3338254690170288,
"logps/chosen": -22.056568145751953,
"logps/rejected": -34.89329147338867,
"loss": 0.0758,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.25121578574180603,
"rewards/margins": 4.385520935058594,
"rewards/rejected": -4.636736869812012,
"step": 323
},
{
"epoch": 3.84,
"grad_norm": 4.966352446635051,
"learning_rate": 1.9453346716462316e-09,
"logits/chosen": -1.211751937866211,
"logits/rejected": -1.1320858001708984,
"logps/chosen": -27.62029457092285,
"logps/rejected": -32.46119689941406,
"loss": 0.0454,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.004868373274803162,
"rewards/margins": 3.807752847671509,
"rewards/rejected": -3.802884578704834,
"step": 324
},
{
"epoch": 3.851851851851852,
"grad_norm": 5.653095930506065,
"learning_rate": 1.6349606501509794e-09,
"logits/chosen": -1.1088950634002686,
"logits/rejected": -0.9607290029525757,
"logps/chosen": -28.395509719848633,
"logps/rejected": -34.00682830810547,
"loss": 0.0544,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.15473833680152893,
"rewards/margins": 3.99048113822937,
"rewards/rejected": -3.835742473602295,
"step": 325
},
{
"epoch": 3.863703703703704,
"grad_norm": 5.3966444428734945,
"learning_rate": 1.351463109734441e-09,
"logits/chosen": -1.3495458364486694,
"logits/rejected": -1.0097894668579102,
"logps/chosen": -22.80147933959961,
"logps/rejected": -41.809940338134766,
"loss": 0.0496,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.19359610974788666,
"rewards/margins": 5.500581741333008,
"rewards/rejected": -5.694178104400635,
"step": 326
},
{
"epoch": 3.8755555555555556,
"grad_norm": 5.006770074945758,
"learning_rate": 1.0948727286930192e-09,
"logits/chosen": -1.1479936838150024,
"logits/rejected": -0.9590707421302795,
"logps/chosen": -27.08885955810547,
"logps/rejected": -40.10725402832031,
"loss": 0.0455,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.17611512541770935,
"rewards/margins": 3.613635540008545,
"rewards/rejected": -3.4375205039978027,
"step": 327
},
{
"epoch": 3.8874074074074074,
"grad_norm": 6.085390667471827,
"learning_rate": 8.652172736017816e-10,
"logits/chosen": -1.1275379657745361,
"logits/rejected": -1.116228461265564,
"logps/chosen": -33.487083435058594,
"logps/rejected": -52.050228118896484,
"loss": 0.0628,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.16681703925132751,
"rewards/margins": 4.557419776916504,
"rewards/rejected": -4.724237442016602,
"step": 328
},
{
"epoch": 3.899259259259259,
"grad_norm": 6.597375260168904,
"learning_rate": 6.625215963098896e-10,
"logits/chosen": -1.234811782836914,
"logits/rejected": -1.1153168678283691,
"logps/chosen": -27.0404052734375,
"logps/rejected": -34.0019416809082,
"loss": 0.065,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.32952964305877686,
"rewards/margins": 4.547809600830078,
"rewards/rejected": -4.8773393630981445,
"step": 329
},
{
"epoch": 3.911111111111111,
"grad_norm": 5.399445593167999,
"learning_rate": 4.868076312512515e-10,
"logits/chosen": -1.1961758136749268,
"logits/rejected": -1.034976840019226,
"logps/chosen": -22.31209945678711,
"logps/rejected": -44.69541931152344,
"loss": 0.0563,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.18147775530815125,
"rewards/margins": 5.29000186920166,
"rewards/rejected": -5.108523845672607,
"step": 330
},
{
"epoch": 3.9229629629629628,
"grad_norm": 4.687101989180421,
"learning_rate": 3.3809439307086463e-10,
"logits/chosen": -1.204687237739563,
"logits/rejected": -1.126007318496704,
"logps/chosen": -24.837623596191406,
"logps/rejected": -40.658023834228516,
"loss": 0.0473,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2791484594345093,
"rewards/margins": 4.140464782714844,
"rewards/rejected": -3.8613169193267822,
"step": 331
},
{
"epoch": 3.934814814814815,
"grad_norm": 5.7183873880444045,
"learning_rate": 2.1639797456723952e-10,
"logits/chosen": -1.2559609413146973,
"logits/rejected": -1.0792549848556519,
"logps/chosen": -35.796287536621094,
"logps/rejected": -46.229820251464844,
"loss": 0.049,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.015116512775421143,
"rewards/margins": 5.156147480010986,
"rewards/rejected": -5.141030311584473,
"step": 332
},
{
"epoch": 3.9466666666666668,
"grad_norm": 4.801576190645628,
"learning_rate": 1.21731544950876e-10,
"logits/chosen": -1.227901816368103,
"logits/rejected": -1.2207201719284058,
"logps/chosen": -31.329517364501953,
"logps/rejected": -51.822059631347656,
"loss": 0.0438,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.12939153611660004,
"rewards/margins": 5.30501651763916,
"rewards/rejected": -5.434407711029053,
"step": 333
},
{
"epoch": 3.9585185185185185,
"grad_norm": 5.366333281325966,
"learning_rate": 5.4105348419264394e-11,
"logits/chosen": -1.474123239517212,
"logits/rejected": -1.370969295501709,
"logps/chosen": -21.29511260986328,
"logps/rejected": -37.816551208496094,
"loss": 0.0584,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.26586639881134033,
"rewards/margins": 4.2116827964782715,
"rewards/rejected": -3.9458167552948,
"step": 334
},
{
"epoch": 3.9703703703703703,
"grad_norm": 4.961233689259609,
"learning_rate": 1.3526703048216682e-11,
"logits/chosen": -1.2672888040542603,
"logits/rejected": -1.0974268913269043,
"logps/chosen": -25.828834533691406,
"logps/rejected": -52.68805694580078,
"loss": 0.0468,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10839378833770752,
"rewards/margins": 6.133199691772461,
"rewards/rejected": -6.024805545806885,
"step": 335
},
{
"epoch": 3.982222222222222,
"grad_norm": 5.7068180002610625,
"learning_rate": 0.0,
"logits/chosen": -1.3201903104782104,
"logits/rejected": -1.2799780368804932,
"logps/chosen": -26.542402267456055,
"logps/rejected": -42.164154052734375,
"loss": 0.0559,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.13592669367790222,
"rewards/margins": 4.714659690856934,
"rewards/rejected": -4.578732967376709,
"step": 336
},
{
"epoch": 3.982222222222222,
"step": 336,
"total_flos": 0.0,
"train_loss": 0.19470643034825721,
"train_runtime": 59934.0013,
"train_samples_per_second": 0.72,
"train_steps_per_second": 0.006
}
],
"logging_steps": 1,
"max_steps": 336,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}