{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 80000000000, "global_step": 6380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 28.0, "learning_rate": 7.836990595611285e-09, "logits/chosen": 2.823613166809082, "logits/rejected": 1.4552888870239258, "logps/chosen": -545.9974365234375, "logps/rejected": -345.03973388671875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": 0.0, "rewards/student_margin": 0.0, "rewards/teacher_margin": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 33.5, "learning_rate": 7.836990595611285e-08, "logits/chosen": 1.927430510520935, "logits/rejected": 1.9023399353027344, "logps/chosen": -411.6571960449219, "logps/rejected": -495.5906677246094, "loss": 0.7255, "rewards/accuracies": 0.2222222238779068, "rewards/chosen": -0.141819566488266, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": 0.010378235951066017, "rewards/student_margin": -0.15219779312610626, "rewards/teacher_margin": 0.0, "step": 10 }, { "epoch": 0.0, "grad_norm": 34.5, "learning_rate": 1.567398119122257e-07, "logits/chosen": 2.1551527976989746, "logits/rejected": 2.102078914642334, "logps/chosen": -393.61212158203125, "logps/rejected": -442.87451171875, "loss": 0.7316, "rewards/accuracies": 0.5, "rewards/chosen": -0.03079478070139885, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -0.0712582916021347, "rewards/student_margin": 0.040463510900735855, "rewards/teacher_margin": 0.0, "step": 20 }, { "epoch": 0.0, "grad_norm": 31.125, "learning_rate": 2.3510971786833858e-07, "logits/chosen": 2.1545891761779785, "logits/rejected": 2.0464978218078613, "logps/chosen": -352.2803955078125, "logps/rejected": -446.78131103515625, "loss": 0.7144, "rewards/accuracies": 0.36666667461395264, "rewards/chosen": -0.0764111876487732, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -0.03122202679514885, "rewards/student_margin": -0.045189157128334045, "rewards/teacher_margin": 0.0, "step": 30 }, { "epoch": 0.01, "grad_norm": 30.625, "learning_rate": 3.134796238244514e-07, "logits/chosen": 2.36665940284729, "logits/rejected": 2.311516046524048, "logps/chosen": -369.3040771484375, "logps/rejected": -474.19921875, "loss": 0.692, "rewards/accuracies": 0.4333333373069763, "rewards/chosen": -0.027357231825590134, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": 0.06278394162654877, "rewards/student_margin": -0.0901411697268486, "rewards/teacher_margin": 0.0, "step": 40 }, { "epoch": 0.01, "grad_norm": 30.5, "learning_rate": 3.918495297805643e-07, "logits/chosen": 2.3542869091033936, "logits/rejected": 2.3711459636688232, "logps/chosen": -386.4942321777344, "logps/rejected": -443.01751708984375, "loss": 0.7093, "rewards/accuracies": 0.46666669845581055, "rewards/chosen": -0.09300258010625839, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": 0.012741600163280964, "rewards/student_margin": -0.10574416071176529, "rewards/teacher_margin": 0.0, "step": 50 }, { "epoch": 0.01, "grad_norm": 29.0, "learning_rate": 4.7021943573667715e-07, "logits/chosen": 2.3006792068481445, "logits/rejected": 2.1928327083587646, "logps/chosen": -347.8493347167969, "logps/rejected": -338.8526916503906, "loss": 0.6865, "rewards/accuracies": 0.5333333015441895, "rewards/chosen": 0.039476651698350906, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": 0.05235341936349869, "rewards/student_margin": -0.012876768596470356, "rewards/teacher_margin": 0.0, "step": 60 }, { "epoch": 0.01, "grad_norm": 24.875, "learning_rate": 5.485893416927901e-07, "logits/chosen": 2.387723922729492, "logits/rejected": 2.4438114166259766, "logps/chosen": -328.1762390136719, "logps/rejected": -403.99365234375, "loss": 0.6421, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06824061274528503, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -0.23530200123786926, "rewards/student_margin": 0.16706135869026184, "rewards/teacher_margin": 0.0, "step": 70 }, { "epoch": 0.01, "grad_norm": 27.5, "learning_rate": 6.269592476489028e-07, "logits/chosen": 2.5362980365753174, "logits/rejected": 2.300431489944458, "logps/chosen": -342.4037170410156, "logps/rejected": -384.273681640625, "loss": 0.6002, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -0.03849436342716217, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -0.41073402762413025, "rewards/student_margin": 0.37223973870277405, "rewards/teacher_margin": 0.0, "step": 80 }, { "epoch": 0.01, "grad_norm": 27.875, "learning_rate": 7.053291536050157e-07, "logits/chosen": 2.178856372833252, "logits/rejected": 2.213784694671631, "logps/chosen": -368.0638427734375, "logps/rejected": -478.83026123046875, "loss": 0.5361, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.13164520263671875, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -0.4704175591468811, "rewards/student_margin": 0.33877235651016235, "rewards/teacher_margin": 0.0, "step": 90 }, { "epoch": 0.02, "grad_norm": 24.875, "learning_rate": 7.836990595611286e-07, "logits/chosen": 2.110853910446167, "logits/rejected": 2.239248514175415, "logps/chosen": -372.6839904785156, "logps/rejected": -436.00079345703125, "loss": 0.5543, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.06339085847139359, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -0.5883966684341431, "rewards/student_margin": 0.6517876386642456, "rewards/teacher_margin": 0.0, "step": 100 }, { "epoch": 0.02, "grad_norm": 23.75, "learning_rate": 8.620689655172415e-07, "logits/chosen": 2.2184062004089355, "logits/rejected": 2.2864716053009033, "logps/chosen": -390.29595947265625, "logps/rejected": -468.29302978515625, "loss": 0.5267, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -0.04247945174574852, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -0.9685481190681458, "rewards/student_margin": 0.9260686635971069, "rewards/teacher_margin": 0.0, "step": 110 }, { "epoch": 0.02, "grad_norm": 23.375, "learning_rate": 9.404388714733543e-07, "logits/chosen": 2.22385573387146, "logits/rejected": 2.2214207649230957, "logps/chosen": -369.9247131347656, "logps/rejected": -459.4775390625, "loss": 0.5197, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.26339417695999146, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.146983027458191, "rewards/student_margin": 0.8835889101028442, "rewards/teacher_margin": 0.0, "step": 120 }, { "epoch": 0.02, "grad_norm": 21.625, "learning_rate": 1.0188087774294672e-06, "logits/chosen": 2.194343090057373, "logits/rejected": 2.3365678787231445, "logps/chosen": -326.28326416015625, "logps/rejected": -434.6817932128906, "loss": 0.4859, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -0.30163320899009705, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.2639799118041992, "rewards/student_margin": 0.9623466730117798, "rewards/teacher_margin": 0.0, "step": 130 }, { "epoch": 0.02, "grad_norm": 23.5, "learning_rate": 1.0971786833855801e-06, "logits/chosen": 2.2458949089050293, "logits/rejected": 2.3308143615722656, "logps/chosen": -321.7738952636719, "logps/rejected": -428.9666442871094, "loss": 0.4853, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.030444085597991943, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.157867431640625, "rewards/student_margin": 1.1274235248565674, "rewards/teacher_margin": 0.0, "step": 140 }, { "epoch": 0.02, "grad_norm": 20.125, "learning_rate": 1.1755485893416929e-06, "logits/chosen": 2.12040638923645, "logits/rejected": 2.3787925243377686, "logps/chosen": -338.150146484375, "logps/rejected": -510.31591796875, "loss": 0.4077, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.05263698101043701, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.5507991313934326, "rewards/student_margin": 1.498162031173706, "rewards/teacher_margin": 0.0, "step": 150 }, { "epoch": 0.03, "grad_norm": 23.875, "learning_rate": 1.2539184952978056e-06, "logits/chosen": 2.37615704536438, "logits/rejected": 2.3313465118408203, "logps/chosen": -338.21270751953125, "logps/rejected": -400.43341064453125, "loss": 0.4662, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03429489582777023, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.3294470310211182, "rewards/student_margin": 1.3637417554855347, "rewards/teacher_margin": 0.0, "step": 160 }, { "epoch": 0.03, "grad_norm": 24.0, "learning_rate": 1.3322884012539186e-06, "logits/chosen": 2.2349793910980225, "logits/rejected": 2.040658712387085, "logps/chosen": -442.40252685546875, "logps/rejected": -477.47052001953125, "loss": 0.4231, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": -0.07953541725873947, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.288071632385254, "rewards/student_margin": 1.2085362672805786, "rewards/teacher_margin": 0.0, "step": 170 }, { "epoch": 0.03, "grad_norm": 21.875, "learning_rate": 1.4106583072100313e-06, "logits/chosen": 2.249545097351074, "logits/rejected": 2.3875250816345215, "logps/chosen": -448.8692321777344, "logps/rejected": -513.055908203125, "loss": 0.3863, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2250821590423584, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.462332248687744, "rewards/student_margin": 2.237250328063965, "rewards/teacher_margin": 0.0, "step": 180 }, { "epoch": 0.03, "grad_norm": 23.875, "learning_rate": 1.4890282131661443e-06, "logits/chosen": 2.2982354164123535, "logits/rejected": 2.393425941467285, "logps/chosen": -354.1462707519531, "logps/rejected": -444.1820373535156, "loss": 0.457, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -0.37192535400390625, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.9471790790557861, "rewards/student_margin": 1.5752534866333008, "rewards/teacher_margin": 0.0, "step": 190 }, { "epoch": 0.03, "grad_norm": 16.75, "learning_rate": 1.5673981191222572e-06, "logits/chosen": 2.282116174697876, "logits/rejected": 2.2872157096862793, "logps/chosen": -350.09979248046875, "logps/rejected": -444.9610900878906, "loss": 0.3999, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": -0.25568491220474243, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.8648840188980103, "rewards/student_margin": 1.6091991662979126, "rewards/teacher_margin": 0.0, "step": 200 }, { "epoch": 0.03, "grad_norm": 25.875, "learning_rate": 1.64576802507837e-06, "logits/chosen": 2.2302212715148926, "logits/rejected": 2.4427990913391113, "logps/chosen": -369.0305480957031, "logps/rejected": -466.35107421875, "loss": 0.3751, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.08400142192840576, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.2602508068084717, "rewards/student_margin": 2.344252347946167, "rewards/teacher_margin": 0.0, "step": 210 }, { "epoch": 0.03, "grad_norm": 17.25, "learning_rate": 1.724137931034483e-06, "logits/chosen": 2.4002585411071777, "logits/rejected": 2.468780040740967, "logps/chosen": -338.50848388671875, "logps/rejected": -383.3343811035156, "loss": 0.4336, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.19083760678768158, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.6729093790054321, "rewards/student_margin": 1.4820717573165894, "rewards/teacher_margin": 0.0, "step": 220 }, { "epoch": 0.04, "grad_norm": 15.5, "learning_rate": 1.8025078369905957e-06, "logits/chosen": 2.201570749282837, "logits/rejected": 2.347830295562744, "logps/chosen": -363.1189880371094, "logps/rejected": -442.08642578125, "loss": 0.4556, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4014434814453125, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.4565956592559814, "rewards/student_margin": 1.055152177810669, "rewards/teacher_margin": 0.0, "step": 230 }, { "epoch": 0.04, "grad_norm": 18.5, "learning_rate": 1.8808777429467086e-06, "logits/chosen": 1.8651161193847656, "logits/rejected": 2.0073065757751465, "logps/chosen": -341.8658142089844, "logps/rejected": -407.6085510253906, "loss": 0.4392, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": -0.10438945144414902, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.2551590204238892, "rewards/student_margin": 1.1507694721221924, "rewards/teacher_margin": 0.0, "step": 240 }, { "epoch": 0.04, "grad_norm": 20.625, "learning_rate": 1.9592476489028214e-06, "logits/chosen": 2.2734878063201904, "logits/rejected": 2.3731117248535156, "logps/chosen": -320.9939270019531, "logps/rejected": -460.5008850097656, "loss": 0.429, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09011359512805939, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.0900018215179443, "rewards/student_margin": 0.9998881220817566, "rewards/teacher_margin": 0.0, "step": 250 }, { "epoch": 0.04, "grad_norm": 16.625, "learning_rate": 2.0376175548589343e-06, "logits/chosen": 2.2008090019226074, "logits/rejected": 2.0241432189941406, "logps/chosen": -364.24114990234375, "logps/rejected": -458.1392517089844, "loss": 0.4255, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.13753663003444672, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.7853968143463135, "rewards/student_margin": 1.922933578491211, "rewards/teacher_margin": 0.0, "step": 260 }, { "epoch": 0.04, "grad_norm": 24.375, "learning_rate": 2.1159874608150473e-06, "logits/chosen": 2.2729110717773438, "logits/rejected": 2.1600234508514404, "logps/chosen": -364.36981201171875, "logps/rejected": -491.1966247558594, "loss": 0.419, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.03324992209672928, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.6763681173324585, "rewards/student_margin": 1.7096182107925415, "rewards/teacher_margin": 0.0, "step": 270 }, { "epoch": 0.04, "grad_norm": 23.0, "learning_rate": 2.1943573667711602e-06, "logits/chosen": 2.2034125328063965, "logits/rejected": 2.2017383575439453, "logps/chosen": -363.89715576171875, "logps/rejected": -484.51116943359375, "loss": 0.3962, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.230382040143013, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.035240650177002, "rewards/student_margin": 2.265622854232788, "rewards/teacher_margin": 0.0, "step": 280 }, { "epoch": 0.05, "grad_norm": 19.625, "learning_rate": 2.2727272727272728e-06, "logits/chosen": 2.290755271911621, "logits/rejected": 2.276491403579712, "logps/chosen": -296.02618408203125, "logps/rejected": -398.37689208984375, "loss": 0.3649, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.37919336557388306, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.4979604482650757, "rewards/student_margin": 1.877153754234314, "rewards/teacher_margin": 0.0, "step": 290 }, { "epoch": 0.05, "grad_norm": 21.875, "learning_rate": 2.3510971786833857e-06, "logits/chosen": 2.4366583824157715, "logits/rejected": 2.3412327766418457, "logps/chosen": -383.0442199707031, "logps/rejected": -398.50518798828125, "loss": 0.4419, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.28211697936058044, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.1785156726837158, "rewards/student_margin": 1.4606326818466187, "rewards/teacher_margin": 0.0, "step": 300 }, { "epoch": 0.05, "grad_norm": 19.875, "learning_rate": 2.4294670846394987e-06, "logits/chosen": 2.2485082149505615, "logits/rejected": 2.148284673690796, "logps/chosen": -345.15093994140625, "logps/rejected": -447.29742431640625, "loss": 0.3907, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.4087551236152649, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.6644394397735596, "rewards/student_margin": 2.0731945037841797, "rewards/teacher_margin": 0.0, "step": 310 }, { "epoch": 0.05, "grad_norm": 13.875, "learning_rate": 2.507836990595611e-06, "logits/chosen": 2.3357791900634766, "logits/rejected": 2.0635435581207275, "logps/chosen": -386.22662353515625, "logps/rejected": -449.47216796875, "loss": 0.3728, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.4933919906616211, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.7445013523101807, "rewards/student_margin": 2.2378931045532227, "rewards/teacher_margin": 0.0, "step": 320 }, { "epoch": 0.05, "grad_norm": 19.0, "learning_rate": 2.5862068965517246e-06, "logits/chosen": 2.113351583480835, "logits/rejected": 2.1406049728393555, "logps/chosen": -304.8935241699219, "logps/rejected": -373.6761474609375, "loss": 0.4637, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.14335303008556366, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.3245261907577515, "rewards/student_margin": 1.467879056930542, "rewards/teacher_margin": 0.0, "step": 330 }, { "epoch": 0.05, "grad_norm": 13.0, "learning_rate": 2.664576802507837e-06, "logits/chosen": 2.0369467735290527, "logits/rejected": 2.0569188594818115, "logps/chosen": -363.6729431152344, "logps/rejected": -426.5743103027344, "loss": 0.3447, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 0.114227294921875, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.794350028038025, "rewards/student_margin": 1.9085772037506104, "rewards/teacher_margin": 0.0, "step": 340 }, { "epoch": 0.05, "grad_norm": 17.125, "learning_rate": 2.74294670846395e-06, "logits/chosen": 2.1934168338775635, "logits/rejected": 2.2040586471557617, "logps/chosen": -333.93707275390625, "logps/rejected": -402.4310302734375, "loss": 0.3234, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.662988543510437, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.0982108116149902, "rewards/student_margin": 2.7611989974975586, "rewards/teacher_margin": 0.0, "step": 350 }, { "epoch": 0.06, "grad_norm": 22.125, "learning_rate": 2.8213166144200626e-06, "logits/chosen": 2.177361011505127, "logits/rejected": 2.0621471405029297, "logps/chosen": -384.903076171875, "logps/rejected": -397.87200927734375, "loss": 0.3718, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6229776740074158, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.455620527267456, "rewards/student_margin": 2.0785984992980957, "rewards/teacher_margin": 0.0, "step": 360 }, { "epoch": 0.06, "grad_norm": 16.75, "learning_rate": 2.899686520376176e-06, "logits/chosen": 2.0624585151672363, "logits/rejected": 2.3779311180114746, "logps/chosen": -433.61798095703125, "logps/rejected": -524.899169921875, "loss": 0.3987, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.16819992661476135, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.662959098815918, "rewards/student_margin": 1.8311588764190674, "rewards/teacher_margin": 0.0, "step": 370 }, { "epoch": 0.06, "grad_norm": 18.25, "learning_rate": 2.9780564263322885e-06, "logits/chosen": 2.36124324798584, "logits/rejected": 2.314995765686035, "logps/chosen": -433.23992919921875, "logps/rejected": -478.0809020996094, "loss": 0.4069, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.4563807547092438, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.7036340236663818, "rewards/student_margin": 2.1600148677825928, "rewards/teacher_margin": 0.0, "step": 380 }, { "epoch": 0.06, "grad_norm": 24.5, "learning_rate": 3.0564263322884015e-06, "logits/chosen": 2.2857918739318848, "logits/rejected": 2.309779405593872, "logps/chosen": -390.44317626953125, "logps/rejected": -467.6398010253906, "loss": 0.4119, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 0.5236002802848816, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.686099648475647, "rewards/student_margin": 2.209700107574463, "rewards/teacher_margin": 0.0, "step": 390 }, { "epoch": 0.06, "grad_norm": 19.875, "learning_rate": 3.1347962382445144e-06, "logits/chosen": 2.643850803375244, "logits/rejected": 2.611961841583252, "logps/chosen": -386.1805725097656, "logps/rejected": -438.4471130371094, "loss": 0.341, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.12163287401199341, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6392881870269775, "rewards/student_margin": 2.760921001434326, "rewards/teacher_margin": 0.0, "step": 400 }, { "epoch": 0.06, "grad_norm": 18.0, "learning_rate": 3.2131661442006274e-06, "logits/chosen": 2.259416103363037, "logits/rejected": 2.2809250354766846, "logps/chosen": -377.7174377441406, "logps/rejected": -457.1698303222656, "loss": 0.3199, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6127907037734985, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6672379970550537, "rewards/student_margin": 3.280028820037842, "rewards/teacher_margin": 0.0, "step": 410 }, { "epoch": 0.07, "grad_norm": 18.75, "learning_rate": 3.29153605015674e-06, "logits/chosen": 2.2675483226776123, "logits/rejected": 2.198174476623535, "logps/chosen": -386.1883239746094, "logps/rejected": -461.9312438964844, "loss": 0.3602, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": -0.13544133305549622, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.995842456817627, "rewards/student_margin": 2.860401153564453, "rewards/teacher_margin": 0.0, "step": 420 }, { "epoch": 0.07, "grad_norm": 17.75, "learning_rate": 3.369905956112853e-06, "logits/chosen": 2.3144519329071045, "logits/rejected": 2.0859196186065674, "logps/chosen": -352.28399658203125, "logps/rejected": -412.254638671875, "loss": 0.4505, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 0.06295698881149292, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.8022692203521729, "rewards/student_margin": 1.8652260303497314, "rewards/teacher_margin": 0.0, "step": 430 }, { "epoch": 0.07, "grad_norm": 18.5, "learning_rate": 3.448275862068966e-06, "logits/chosen": 2.311047077178955, "logits/rejected": 2.2913379669189453, "logps/chosen": -374.22802734375, "logps/rejected": -434.5338439941406, "loss": 0.4136, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.30223265290260315, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7289443016052246, "rewards/student_margin": 3.031177043914795, "rewards/teacher_margin": 0.0, "step": 440 }, { "epoch": 0.07, "grad_norm": 17.5, "learning_rate": 3.5266457680250788e-06, "logits/chosen": 2.4307050704956055, "logits/rejected": 2.561244249343872, "logps/chosen": -379.9247131347656, "logps/rejected": -407.8270568847656, "loss": 0.3582, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.29674965143203735, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5200042724609375, "rewards/student_margin": 2.81675386428833, "rewards/teacher_margin": 0.0, "step": 450 }, { "epoch": 0.07, "grad_norm": 27.75, "learning_rate": 3.6050156739811913e-06, "logits/chosen": 2.128385543823242, "logits/rejected": 2.0907838344573975, "logps/chosen": -400.91717529296875, "logps/rejected": -530.9824829101562, "loss": 0.356, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5192385911941528, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.551103115081787, "rewards/student_margin": 3.0703415870666504, "rewards/teacher_margin": 0.0, "step": 460 }, { "epoch": 0.07, "grad_norm": 14.75, "learning_rate": 3.6833855799373043e-06, "logits/chosen": 2.4146134853363037, "logits/rejected": 2.2238388061523438, "logps/chosen": -420.5039978027344, "logps/rejected": -461.2884826660156, "loss": 0.3722, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.08131153881549835, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.0483007431030273, "rewards/student_margin": 2.129612445831299, "rewards/teacher_margin": 0.0, "step": 470 }, { "epoch": 0.08, "grad_norm": 20.5, "learning_rate": 3.7617554858934172e-06, "logits/chosen": 2.269953727722168, "logits/rejected": 2.3099522590637207, "logps/chosen": -360.2138366699219, "logps/rejected": -537.5364990234375, "loss": 0.3649, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.43445664644241333, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.42378568649292, "rewards/student_margin": 2.8582420349121094, "rewards/teacher_margin": 0.0, "step": 480 }, { "epoch": 0.08, "grad_norm": 21.75, "learning_rate": 3.84012539184953e-06, "logits/chosen": 2.1988048553466797, "logits/rejected": 2.157762050628662, "logps/chosen": -370.66583251953125, "logps/rejected": -431.24444580078125, "loss": 0.3423, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.23767447471618652, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.3868513107299805, "rewards/student_margin": 2.624525785446167, "rewards/teacher_margin": 0.0, "step": 490 }, { "epoch": 0.08, "grad_norm": 15.625, "learning_rate": 3.918495297805643e-06, "logits/chosen": 2.492316722869873, "logits/rejected": 2.462441921234131, "logps/chosen": -363.45001220703125, "logps/rejected": -420.40814208984375, "loss": 0.4114, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.04861477017402649, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.2330451011657715, "rewards/student_margin": 2.1844305992126465, "rewards/teacher_margin": 0.0, "step": 500 }, { "epoch": 0.08, "grad_norm": 22.25, "learning_rate": 3.996865203761755e-06, "logits/chosen": 2.073357582092285, "logits/rejected": 1.9686062335968018, "logps/chosen": -375.708740234375, "logps/rejected": -419.54534912109375, "loss": 0.3897, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -0.0827634185552597, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.2681710720062256, "rewards/student_margin": 2.185407876968384, "rewards/teacher_margin": 0.0, "step": 510 }, { "epoch": 0.08, "grad_norm": 13.125, "learning_rate": 4.075235109717869e-06, "logits/chosen": 2.098198890686035, "logits/rejected": 2.156982898712158, "logps/chosen": -386.9971923828125, "logps/rejected": -457.8114318847656, "loss": 0.3548, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.2538744807243347, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.220729112625122, "rewards/student_margin": 1.4746036529541016, "rewards/teacher_margin": 0.0, "step": 520 }, { "epoch": 0.08, "grad_norm": 18.375, "learning_rate": 4.153605015673981e-06, "logits/chosen": 2.239912986755371, "logits/rejected": 2.2412004470825195, "logps/chosen": -324.81683349609375, "logps/rejected": -415.37841796875, "loss": 0.3492, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.29275792837142944, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.158926248550415, "rewards/student_margin": 2.4516842365264893, "rewards/teacher_margin": 0.0, "step": 530 }, { "epoch": 0.08, "grad_norm": 17.75, "learning_rate": 4.2319749216300945e-06, "logits/chosen": 2.3173203468322754, "logits/rejected": 2.3197200298309326, "logps/chosen": -364.68072509765625, "logps/rejected": -457.248046875, "loss": 0.385, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.002521765185520053, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.685851573944092, "rewards/student_margin": 2.6883738040924072, "rewards/teacher_margin": 0.0, "step": 540 }, { "epoch": 0.09, "grad_norm": 16.5, "learning_rate": 4.310344827586207e-06, "logits/chosen": 2.224043369293213, "logits/rejected": 2.124126672744751, "logps/chosen": -397.21484375, "logps/rejected": -454.77606201171875, "loss": 0.3855, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.15688516199588776, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.7797231674194336, "rewards/student_margin": 1.9366083145141602, "rewards/teacher_margin": 0.0, "step": 550 }, { "epoch": 0.09, "grad_norm": 19.125, "learning_rate": 4.3887147335423205e-06, "logits/chosen": 2.142578601837158, "logits/rejected": 2.3369054794311523, "logps/chosen": -327.46014404296875, "logps/rejected": -501.7063903808594, "loss": 0.3457, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.03988009691238403, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.4761157035827637, "rewards/student_margin": 2.515995979309082, "rewards/teacher_margin": 0.0, "step": 560 }, { "epoch": 0.09, "grad_norm": 22.625, "learning_rate": 4.467084639498433e-06, "logits/chosen": 2.065657138824463, "logits/rejected": 1.6880791187286377, "logps/chosen": -354.273193359375, "logps/rejected": -396.13458251953125, "loss": 0.379, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.2126554250717163, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.1840319633483887, "rewards/student_margin": 2.3966870307922363, "rewards/teacher_margin": 0.0, "step": 570 }, { "epoch": 0.09, "grad_norm": 18.25, "learning_rate": 4.5454545454545455e-06, "logits/chosen": 2.3829522132873535, "logits/rejected": 2.342242479324341, "logps/chosen": -410.91375732421875, "logps/rejected": -456.8291931152344, "loss": 0.3369, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.3803706467151642, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.2380497455596924, "rewards/student_margin": 2.618420362472534, "rewards/teacher_margin": 0.0, "step": 580 }, { "epoch": 0.09, "grad_norm": 20.25, "learning_rate": 4.623824451410659e-06, "logits/chosen": 2.3038744926452637, "logits/rejected": 2.0741543769836426, "logps/chosen": -406.5392761230469, "logps/rejected": -408.89617919921875, "loss": 0.4305, "rewards/accuracies": 0.6666666269302368, "rewards/chosen": -0.17663268744945526, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.4715148210525513, "rewards/student_margin": 1.2948819398880005, "rewards/teacher_margin": 0.0, "step": 590 }, { "epoch": 0.09, "grad_norm": 17.625, "learning_rate": 4.7021943573667714e-06, "logits/chosen": 2.1380269527435303, "logits/rejected": 2.250474452972412, "logps/chosen": -373.09320068359375, "logps/rejected": -414.43304443359375, "loss": 0.3722, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.018924076110124588, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.669830083847046, "rewards/student_margin": 1.650905966758728, "rewards/teacher_margin": 0.0, "step": 600 }, { "epoch": 0.1, "grad_norm": 24.375, "learning_rate": 4.780564263322884e-06, "logits/chosen": 2.2485146522521973, "logits/rejected": 2.1549315452575684, "logps/chosen": -440.39288330078125, "logps/rejected": -431.9788513183594, "loss": 0.4068, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.1762092411518097, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.680384874343872, "rewards/student_margin": 1.8565940856933594, "rewards/teacher_margin": 0.0, "step": 610 }, { "epoch": 0.1, "grad_norm": 20.0, "learning_rate": 4.858934169278997e-06, "logits/chosen": 2.2714178562164307, "logits/rejected": 2.0903637409210205, "logps/chosen": -416.11236572265625, "logps/rejected": -428.8457946777344, "loss": 0.4127, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.4204396605491638, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.0484182834625244, "rewards/student_margin": 2.4688572883605957, "rewards/teacher_margin": 0.0, "step": 620 }, { "epoch": 0.1, "grad_norm": 21.0, "learning_rate": 4.937304075235111e-06, "logits/chosen": 2.2357707023620605, "logits/rejected": 2.1548380851745605, "logps/chosen": -381.8794250488281, "logps/rejected": -413.0296936035156, "loss": 0.3346, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.582564115524292, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6162660121917725, "rewards/student_margin": 3.1988296508789062, "rewards/teacher_margin": 0.0, "step": 630 }, { "epoch": 0.1, "grad_norm": 16.25, "learning_rate": 4.999998503270461e-06, "logits/chosen": 2.16475248336792, "logits/rejected": 2.2281501293182373, "logps/chosen": -391.668701171875, "logps/rejected": -461.87744140625, "loss": 0.4073, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 0.7223118543624878, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.6918067932128906, "rewards/student_margin": 2.414118766784668, "rewards/teacher_margin": 0.0, "step": 640 }, { "epoch": 0.1, "grad_norm": 21.625, "learning_rate": 4.999946117924745e-06, "logits/chosen": 2.0960400104522705, "logits/rejected": 2.283792734146118, "logps/chosen": -350.0085144042969, "logps/rejected": -470.9830017089844, "loss": 0.3773, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.1939437985420227, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.7379710674285889, "rewards/student_margin": 1.9319149255752563, "rewards/teacher_margin": 0.0, "step": 650 }, { "epoch": 0.1, "grad_norm": 20.375, "learning_rate": 4.999818897894192e-06, "logits/chosen": 2.2479116916656494, "logits/rejected": 2.261665105819702, "logps/chosen": -387.2687072753906, "logps/rejected": -368.5941162109375, "loss": 0.3767, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": 0.2011927366256714, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.61771559715271, "rewards/student_margin": 1.818908452987671, "rewards/teacher_margin": 0.0, "step": 660 }, { "epoch": 0.11, "grad_norm": 16.0, "learning_rate": 4.999616846987071e-06, "logits/chosen": 2.1456961631774902, "logits/rejected": 2.0120089054107666, "logps/chosen": -382.92718505859375, "logps/rejected": -444.59033203125, "loss": 0.3489, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.3482722043991089, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.399381160736084, "rewards/student_margin": 2.7476532459259033, "rewards/teacher_margin": 0.0, "step": 670 }, { "epoch": 0.11, "grad_norm": 22.625, "learning_rate": 4.999339971251679e-06, "logits/chosen": 2.2489054203033447, "logits/rejected": 2.2089006900787354, "logps/chosen": -392.1227722167969, "logps/rejected": -466.62335205078125, "loss": 0.371, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.35043638944625854, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.2226755619049072, "rewards/student_margin": 2.5731120109558105, "rewards/teacher_margin": 0.0, "step": 680 }, { "epoch": 0.11, "grad_norm": 23.375, "learning_rate": 4.998988278976157e-06, "logits/chosen": 2.3761959075927734, "logits/rejected": 2.371328830718994, "logps/chosen": -422.1768493652344, "logps/rejected": -503.87957763671875, "loss": 0.4003, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.19609221816062927, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5681087970733643, "rewards/student_margin": 2.7642006874084473, "rewards/teacher_margin": 0.0, "step": 690 }, { "epoch": 0.11, "grad_norm": 20.125, "learning_rate": 4.998561780688246e-06, "logits/chosen": 2.0721497535705566, "logits/rejected": 2.1726222038269043, "logps/chosen": -376.9571228027344, "logps/rejected": -436.64971923828125, "loss": 0.3295, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.0686085894703865, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.527146339416504, "rewards/student_margin": 2.595755100250244, "rewards/teacher_margin": 0.0, "step": 700 }, { "epoch": 0.11, "grad_norm": 18.75, "learning_rate": 4.998060489154965e-06, "logits/chosen": 2.0344719886779785, "logits/rejected": 2.128448247909546, "logps/chosen": -400.06884765625, "logps/rejected": -403.7366027832031, "loss": 0.4133, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.591033935546875, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.4933950901031494, "rewards/student_margin": 3.0844290256500244, "rewards/teacher_margin": 0.0, "step": 710 }, { "epoch": 0.11, "grad_norm": 18.625, "learning_rate": 4.997484419382237e-06, "logits/chosen": 2.12632155418396, "logits/rejected": 2.394336223602295, "logps/chosen": -334.028564453125, "logps/rejected": -533.8587036132812, "loss": 0.3461, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5567973256111145, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8452367782592773, "rewards/student_margin": 3.402034044265747, "rewards/teacher_margin": 0.0, "step": 720 }, { "epoch": 0.11, "grad_norm": 20.0, "learning_rate": 4.996833588614432e-06, "logits/chosen": 2.3135998249053955, "logits/rejected": 2.427696466445923, "logps/chosen": -391.9068298339844, "logps/rejected": -431.922607421875, "loss": 0.4065, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.3110305666923523, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.3743510246276855, "rewards/student_margin": 2.6853814125061035, "rewards/teacher_margin": 0.0, "step": 730 }, { "epoch": 0.12, "grad_norm": 20.625, "learning_rate": 4.996108016333859e-06, "logits/chosen": 2.298734188079834, "logits/rejected": 2.141279458999634, "logps/chosen": -371.62408447265625, "logps/rejected": -490.76312255859375, "loss": 0.3614, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": -0.1348625123500824, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6863017082214355, "rewards/student_margin": 3.5514392852783203, "rewards/teacher_margin": 0.0, "step": 740 }, { "epoch": 0.12, "grad_norm": 25.125, "learning_rate": 4.995307724260171e-06, "logits/chosen": 2.208245277404785, "logits/rejected": 2.391312599182129, "logps/chosen": -392.109130859375, "logps/rejected": -491.1319885253906, "loss": 0.4275, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": -0.647506594657898, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.78999400138855, "rewards/student_margin": 2.1424872875213623, "rewards/teacher_margin": 0.0, "step": 750 }, { "epoch": 0.12, "grad_norm": 16.875, "learning_rate": 4.994432736349729e-06, "logits/chosen": 2.2047789096832275, "logits/rejected": 2.415001630783081, "logps/chosen": -321.9795837402344, "logps/rejected": -449.22479248046875, "loss": 0.4141, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -0.10200933367013931, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.225008726119995, "rewards/student_margin": 2.122999668121338, "rewards/teacher_margin": 0.0, "step": 760 }, { "epoch": 0.12, "grad_norm": 19.5, "learning_rate": 4.9934830787948756e-06, "logits/chosen": 2.545530319213867, "logits/rejected": 2.5257840156555176, "logps/chosen": -320.6497802734375, "logps/rejected": -449.13482666015625, "loss": 0.3567, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.05362142249941826, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.1033291816711426, "rewards/student_margin": 2.1569507122039795, "rewards/teacher_margin": 0.0, "step": 770 }, { "epoch": 0.12, "grad_norm": 25.375, "learning_rate": 4.992458780023151e-06, "logits/chosen": 2.4481022357940674, "logits/rejected": 2.3354721069335938, "logps/chosen": -368.9129638671875, "logps/rejected": -403.81011962890625, "loss": 0.4023, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.035367585718631744, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.220233917236328, "rewards/student_margin": 2.255601406097412, "rewards/teacher_margin": 0.0, "step": 780 }, { "epoch": 0.12, "grad_norm": 20.5, "learning_rate": 4.991359870696448e-06, "logits/chosen": 2.5052151679992676, "logits/rejected": 2.377469539642334, "logps/chosen": -370.687255859375, "logps/rejected": -464.0367126464844, "loss": 0.3393, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.38655346632003784, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0698726177215576, "rewards/student_margin": 3.456425905227661, "rewards/teacher_margin": 0.0, "step": 790 }, { "epoch": 0.13, "grad_norm": 9.75, "learning_rate": 4.990186383710089e-06, "logits/chosen": 2.248818874359131, "logits/rejected": 2.241952419281006, "logps/chosen": -364.9921569824219, "logps/rejected": -429.6702575683594, "loss": 0.3291, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.26953354477882385, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.759406328201294, "rewards/student_margin": 3.028939723968506, "rewards/teacher_margin": 0.0, "step": 800 }, { "epoch": 0.13, "grad_norm": 15.0, "learning_rate": 4.988938354191842e-06, "logits/chosen": 2.4116063117980957, "logits/rejected": 2.1507606506347656, "logps/chosen": -401.8000183105469, "logps/rejected": -470.48974609375, "loss": 0.3186, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.2210436314344406, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5051519870758057, "rewards/student_margin": 2.7261955738067627, "rewards/teacher_margin": 0.0, "step": 810 }, { "epoch": 0.13, "grad_norm": 9.6875, "learning_rate": 4.98761581950087e-06, "logits/chosen": 2.310250997543335, "logits/rejected": 2.491212844848633, "logps/chosen": -357.6111755371094, "logps/rejected": -462.36883544921875, "loss": 0.3756, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.36351194977760315, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0282015800476074, "rewards/student_margin": 3.3917136192321777, "rewards/teacher_margin": 0.0, "step": 820 }, { "epoch": 0.13, "grad_norm": 25.125, "learning_rate": 4.986218819226614e-06, "logits/chosen": 2.1903536319732666, "logits/rejected": 2.2725143432617188, "logps/chosen": -364.6377868652344, "logps/rejected": -481.3650817871094, "loss": 0.3934, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.14307276904582977, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.8698813915252686, "rewards/student_margin": 2.0129542350769043, "rewards/teacher_margin": 0.0, "step": 830 }, { "epoch": 0.13, "grad_norm": 14.625, "learning_rate": 4.984747395187604e-06, "logits/chosen": 2.2614586353302, "logits/rejected": 2.3284449577331543, "logps/chosen": -372.1788635253906, "logps/rejected": -460.71905517578125, "loss": 0.2562, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.8193306922912598, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.206346273422241, "rewards/student_margin": 3.025676727294922, "rewards/teacher_margin": 0.0, "step": 840 }, { "epoch": 0.13, "grad_norm": 8.0, "learning_rate": 4.9832015914302135e-06, "logits/chosen": 2.4473578929901123, "logits/rejected": 2.130159616470337, "logps/chosen": -387.8343200683594, "logps/rejected": -452.24560546875, "loss": 0.2887, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.5022855401039124, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.698909044265747, "rewards/student_margin": 3.2011942863464355, "rewards/teacher_margin": 0.0, "step": 850 }, { "epoch": 0.13, "grad_norm": 10.5, "learning_rate": 4.981581454227332e-06, "logits/chosen": 2.3531577587127686, "logits/rejected": 2.160940647125244, "logps/chosen": -350.1921691894531, "logps/rejected": -398.22265625, "loss": 0.3697, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.02706654742360115, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.192899227142334, "rewards/student_margin": 3.2199654579162598, "rewards/teacher_margin": 0.0, "step": 860 }, { "epoch": 0.14, "grad_norm": 17.875, "learning_rate": 4.9798870320769884e-06, "logits/chosen": 2.405717134475708, "logits/rejected": 2.1969313621520996, "logps/chosen": -376.37066650390625, "logps/rejected": -406.08441162109375, "loss": 0.3651, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.12809982895851135, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.363137722015381, "rewards/student_margin": 3.491237163543701, "rewards/teacher_margin": 0.0, "step": 870 }, { "epoch": 0.14, "grad_norm": 18.625, "learning_rate": 4.978118375700895e-06, "logits/chosen": 2.3619024753570557, "logits/rejected": 2.455719470977783, "logps/chosen": -326.07952880859375, "logps/rejected": -449.20013427734375, "loss": 0.2944, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5576108694076538, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2496941089630127, "rewards/student_margin": 3.807304859161377, "rewards/teacher_margin": 0.0, "step": 880 }, { "epoch": 0.14, "grad_norm": 14.625, "learning_rate": 4.976275538042932e-06, "logits/chosen": 2.200669765472412, "logits/rejected": 2.385596990585327, "logps/chosen": -334.2998352050781, "logps/rejected": -443.8241271972656, "loss": 0.314, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.3688916862010956, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.984567403793335, "rewards/student_margin": 3.353458881378174, "rewards/teacher_margin": 0.0, "step": 890 }, { "epoch": 0.14, "grad_norm": 20.75, "learning_rate": 4.974358574267554e-06, "logits/chosen": 2.33606219291687, "logits/rejected": 2.087332010269165, "logps/chosen": -373.33819580078125, "logps/rejected": -376.1592712402344, "loss": 0.3301, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.6809409856796265, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.24197518825531, "rewards/student_margin": 1.9229161739349365, "rewards/teacher_margin": 0.0, "step": 900 }, { "epoch": 0.14, "grad_norm": 19.375, "learning_rate": 4.972367541758154e-06, "logits/chosen": 2.0645499229431152, "logits/rejected": 1.9899866580963135, "logps/chosen": -356.3707275390625, "logps/rejected": -449.3792419433594, "loss": 0.4092, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.20348969101905823, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.072772264480591, "rewards/student_margin": 3.276261806488037, "rewards/teacher_margin": 0.0, "step": 910 }, { "epoch": 0.14, "grad_norm": 18.5, "learning_rate": 4.97030250011533e-06, "logits/chosen": 2.27887225151062, "logits/rejected": 2.632761001586914, "logps/chosen": -346.4476013183594, "logps/rejected": -414.49810791015625, "loss": 0.3497, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.10400823503732681, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.812917709350586, "rewards/student_margin": 2.9169259071350098, "rewards/teacher_margin": 0.0, "step": 920 }, { "epoch": 0.15, "grad_norm": 15.0, "learning_rate": 4.968163511155112e-06, "logits/chosen": 2.2531771659851074, "logits/rejected": 2.3235390186309814, "logps/chosen": -359.2082214355469, "logps/rejected": -439.1959533691406, "loss": 0.3367, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.14408263564109802, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5682601928710938, "rewards/student_margin": 3.424177646636963, "rewards/teacher_margin": 0.0, "step": 930 }, { "epoch": 0.15, "grad_norm": 16.625, "learning_rate": 4.965950638907106e-06, "logits/chosen": 2.1399567127227783, "logits/rejected": 2.0601232051849365, "logps/chosen": -384.9977111816406, "logps/rejected": -398.41448974609375, "loss": 0.359, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.4951428771018982, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1657090187072754, "rewards/student_margin": 2.6705660820007324, "rewards/teacher_margin": 0.0, "step": 940 }, { "epoch": 0.15, "grad_norm": 12.6875, "learning_rate": 4.963663949612576e-06, "logits/chosen": 2.467695474624634, "logits/rejected": 2.4705662727355957, "logps/chosen": -328.66278076171875, "logps/rejected": -423.71746826171875, "loss": 0.315, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.02754516899585724, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4418301582336426, "rewards/student_margin": 3.414285182952881, "rewards/teacher_margin": 0.0, "step": 950 }, { "epoch": 0.15, "grad_norm": 18.875, "learning_rate": 4.961303511722469e-06, "logits/chosen": 2.202942371368408, "logits/rejected": 2.3181934356689453, "logps/chosen": -319.66064453125, "logps/rejected": -454.7606506347656, "loss": 0.3774, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.026477938517928123, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6223151683807373, "rewards/student_margin": 3.6487934589385986, "rewards/teacher_margin": 0.0, "step": 960 }, { "epoch": 0.15, "grad_norm": 16.75, "learning_rate": 4.95886939589536e-06, "logits/chosen": 2.2630813121795654, "logits/rejected": 1.994014024734497, "logps/chosen": -391.45745849609375, "logps/rejected": -449.36920166015625, "loss": 0.3939, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.10374321043491364, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.140214443206787, "rewards/student_margin": 2.24395751953125, "rewards/teacher_margin": 0.0, "step": 970 }, { "epoch": 0.15, "grad_norm": 15.9375, "learning_rate": 4.95636167499533e-06, "logits/chosen": 2.113713026046753, "logits/rejected": 2.1676106452941895, "logps/chosen": -353.131591796875, "logps/rejected": -491.37066650390625, "loss": 0.317, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5847376585006714, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6874327659606934, "rewards/student_margin": 3.2721710205078125, "rewards/teacher_margin": 0.0, "step": 980 }, { "epoch": 0.16, "grad_norm": 18.75, "learning_rate": 4.953780424089803e-06, "logits/chosen": 2.2561230659484863, "logits/rejected": 2.163105010986328, "logps/chosen": -393.0497131347656, "logps/rejected": -402.49920654296875, "loss": 0.3376, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 1.1391042470932007, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -0.6542249321937561, "rewards/student_margin": 1.7933292388916016, "rewards/teacher_margin": 0.0, "step": 990 }, { "epoch": 0.16, "grad_norm": 20.75, "learning_rate": 4.951125720447282e-06, "logits/chosen": 2.352250337600708, "logits/rejected": 2.550795555114746, "logps/chosen": -347.7257080078125, "logps/rejected": -564.503662109375, "loss": 0.3063, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3694305419921875, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8610310554504395, "rewards/student_margin": 3.230461835861206, "rewards/teacher_margin": 0.0, "step": 1000 }, { "epoch": 0.16, "grad_norm": 27.375, "learning_rate": 4.9483976435350436e-06, "logits/chosen": 2.2802579402923584, "logits/rejected": 2.4262197017669678, "logps/chosen": -367.87811279296875, "logps/rejected": -423.06500244140625, "loss": 0.4143, "rewards/accuracies": 0.6999999284744263, "rewards/chosen": 0.2212369441986084, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.9231888055801392, "rewards/student_margin": 2.144425392150879, "rewards/teacher_margin": 0.0, "step": 1010 }, { "epoch": 0.16, "grad_norm": 19.0, "learning_rate": 4.945596275016758e-06, "logits/chosen": 2.2507548332214355, "logits/rejected": 2.159379482269287, "logps/chosen": -394.84320068359375, "logps/rejected": -425.9612731933594, "loss": 0.3631, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.6104202270507812, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.9127671718597412, "rewards/student_margin": 2.5231871604919434, "rewards/teacher_margin": 0.0, "step": 1020 }, { "epoch": 0.16, "grad_norm": 14.375, "learning_rate": 4.942721698750047e-06, "logits/chosen": 2.170393705368042, "logits/rejected": 1.962284803390503, "logps/chosen": -412.2069396972656, "logps/rejected": -475.2394104003906, "loss": 0.3645, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.21626384556293488, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.8311259746551514, "rewards/student_margin": 2.0473897457122803, "rewards/teacher_margin": 0.0, "step": 1030 }, { "epoch": 0.16, "grad_norm": 17.625, "learning_rate": 4.939774000783966e-06, "logits/chosen": 2.1060290336608887, "logits/rejected": 2.2292065620422363, "logps/chosen": -387.0289306640625, "logps/rejected": -494.63909912109375, "loss": 0.335, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.10872574895620346, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.845547676086426, "rewards/student_margin": 2.9542737007141113, "rewards/teacher_margin": 0.0, "step": 1040 }, { "epoch": 0.16, "grad_norm": 20.875, "learning_rate": 4.936753269356436e-06, "logits/chosen": 2.249981164932251, "logits/rejected": 2.1618547439575195, "logps/chosen": -370.42950439453125, "logps/rejected": -401.3464050292969, "loss": 0.376, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.11796979606151581, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.6812560558319092, "rewards/student_margin": 1.7992258071899414, "rewards/teacher_margin": 0.0, "step": 1050 }, { "epoch": 0.17, "grad_norm": 22.625, "learning_rate": 4.933659594891602e-06, "logits/chosen": 2.393725872039795, "logits/rejected": 1.917820930480957, "logps/chosen": -424.61956787109375, "logps/rejected": -376.91534423828125, "loss": 0.3492, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.5922211408615112, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.1211442947387695, "rewards/student_margin": 2.713365316390991, "rewards/teacher_margin": 0.0, "step": 1060 }, { "epoch": 0.17, "grad_norm": 16.625, "learning_rate": 4.93049306999712e-06, "logits/chosen": 2.294188976287842, "logits/rejected": 2.4502148628234863, "logps/chosen": -351.43743896484375, "logps/rejected": -482.1832580566406, "loss": 0.3421, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.104090116918087, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.4911551475524902, "rewards/student_margin": 2.595245361328125, "rewards/teacher_margin": 0.0, "step": 1070 }, { "epoch": 0.17, "grad_norm": 11.4375, "learning_rate": 4.92725378946139e-06, "logits/chosen": 2.226865768432617, "logits/rejected": 2.241239547729492, "logps/chosen": -295.4040222167969, "logps/rejected": -426.7386169433594, "loss": 0.3086, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2998916506767273, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.104494571685791, "rewards/student_margin": 2.404386281967163, "rewards/teacher_margin": 0.0, "step": 1080 }, { "epoch": 0.17, "grad_norm": 24.5, "learning_rate": 4.923941850250717e-06, "logits/chosen": 2.078420639038086, "logits/rejected": 2.1207845211029053, "logps/chosen": -352.8011474609375, "logps/rejected": -452.44927978515625, "loss": 0.3491, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.2823028564453125, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.3118271827697754, "rewards/student_margin": 2.594130039215088, "rewards/teacher_margin": 0.0, "step": 1090 }, { "epoch": 0.17, "grad_norm": 23.125, "learning_rate": 4.920557351506409e-06, "logits/chosen": 2.2514822483062744, "logits/rejected": 2.2906341552734375, "logps/chosen": -381.4742431640625, "logps/rejected": -468.31756591796875, "loss": 0.3207, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.36983174085617065, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3936171531677246, "rewards/student_margin": 3.763448715209961, "rewards/teacher_margin": 0.0, "step": 1100 }, { "epoch": 0.17, "grad_norm": 21.25, "learning_rate": 4.9171003945418074e-06, "logits/chosen": 2.278604030609131, "logits/rejected": 2.12575101852417, "logps/chosen": -388.5594482421875, "logps/rejected": -458.32220458984375, "loss": 0.2538, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.27666550874710083, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5685842037200928, "rewards/student_margin": 2.8452494144439697, "rewards/teacher_margin": 0.0, "step": 1110 }, { "epoch": 0.18, "grad_norm": 11.75, "learning_rate": 4.913571082839258e-06, "logits/chosen": 2.2906651496887207, "logits/rejected": 2.2557358741760254, "logps/chosen": -386.88494873046875, "logps/rejected": -422.2568359375, "loss": 0.3352, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3296402096748352, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.666569471359253, "rewards/student_margin": 2.9962096214294434, "rewards/teacher_margin": 0.0, "step": 1120 }, { "epoch": 0.18, "grad_norm": 19.625, "learning_rate": 4.909969522047008e-06, "logits/chosen": 2.2493722438812256, "logits/rejected": 2.245151996612549, "logps/chosen": -385.2981262207031, "logps/rejected": -479.27996826171875, "loss": 0.3376, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.15771588683128357, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6178359985351562, "rewards/student_margin": 3.7755520343780518, "rewards/teacher_margin": 0.0, "step": 1130 }, { "epoch": 0.18, "grad_norm": 16.875, "learning_rate": 4.906295819976049e-06, "logits/chosen": 2.169713020324707, "logits/rejected": 2.076906204223633, "logps/chosen": -310.92315673828125, "logps/rejected": -435.09991455078125, "loss": 0.3303, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.045978546142578125, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.840859889984131, "rewards/student_margin": 3.886838912963867, "rewards/teacher_margin": 0.0, "step": 1140 }, { "epoch": 0.18, "grad_norm": 23.5, "learning_rate": 4.902550086596885e-06, "logits/chosen": 2.280611753463745, "logits/rejected": 2.209174394607544, "logps/chosen": -358.68743896484375, "logps/rejected": -434.3912658691406, "loss": 0.3295, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.03143030405044556, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6862168312072754, "rewards/student_margin": 3.717646837234497, "rewards/teacher_margin": 0.0, "step": 1150 }, { "epoch": 0.18, "grad_norm": 8.875, "learning_rate": 4.8987324340362445e-06, "logits/chosen": 2.3937580585479736, "logits/rejected": 2.3334829807281494, "logps/chosen": -374.60052490234375, "logps/rejected": -433.16156005859375, "loss": 0.2845, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.6600649952888489, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2615723609924316, "rewards/student_margin": 3.9216365814208984, "rewards/teacher_margin": 0.0, "step": 1160 }, { "epoch": 0.18, "grad_norm": 24.625, "learning_rate": 4.8948429765737185e-06, "logits/chosen": 2.3700509071350098, "logits/rejected": 2.172513484954834, "logps/chosen": -437.3172302246094, "logps/rejected": -421.365966796875, "loss": 0.4215, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.33285266160964966, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.726017475128174, "rewards/student_margin": 2.39316463470459, "rewards/teacher_margin": 0.0, "step": 1170 }, { "epoch": 0.18, "grad_norm": 18.125, "learning_rate": 4.890881830638349e-06, "logits/chosen": 2.269519329071045, "logits/rejected": 2.1703853607177734, "logps/chosen": -375.9482421875, "logps/rejected": -373.0008544921875, "loss": 0.3419, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5368738770484924, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.476065158843994, "rewards/student_margin": 3.012938976287842, "rewards/teacher_margin": 0.0, "step": 1180 }, { "epoch": 0.19, "grad_norm": 18.875, "learning_rate": 4.8868491148051315e-06, "logits/chosen": 2.3238651752471924, "logits/rejected": 2.3727147579193115, "logps/chosen": -385.24810791015625, "logps/rejected": -442.97052001953125, "loss": 0.3154, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.4260340631008148, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7639269828796387, "rewards/student_margin": 3.1899609565734863, "rewards/teacher_margin": 0.0, "step": 1190 }, { "epoch": 0.19, "grad_norm": 19.875, "learning_rate": 4.882744949791478e-06, "logits/chosen": 2.160252571105957, "logits/rejected": 2.1706321239471436, "logps/chosen": -364.6128845214844, "logps/rejected": -440.3382263183594, "loss": 0.2733, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 0.16803261637687683, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.659932851791382, "rewards/student_margin": 2.827965259552002, "rewards/teacher_margin": 0.0, "step": 1200 }, { "epoch": 0.19, "grad_norm": 15.5, "learning_rate": 4.878569458453592e-06, "logits/chosen": 2.5424752235412598, "logits/rejected": 2.3839545249938965, "logps/chosen": -341.1519775390625, "logps/rejected": -383.97393798828125, "loss": 0.3737, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.6576718688011169, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.734546661376953, "rewards/student_margin": 3.3922183513641357, "rewards/teacher_margin": 0.0, "step": 1210 }, { "epoch": 0.19, "grad_norm": 23.125, "learning_rate": 4.874322765782802e-06, "logits/chosen": 2.1967220306396484, "logits/rejected": 2.3293397426605225, "logps/chosen": -362.31134033203125, "logps/rejected": -442.8485412597656, "loss": 0.3891, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.37989911437034607, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2361598014831543, "rewards/student_margin": 3.6160590648651123, "rewards/teacher_margin": 0.0, "step": 1220 }, { "epoch": 0.19, "grad_norm": 15.375, "learning_rate": 4.870004998901807e-06, "logits/chosen": 2.2760727405548096, "logits/rejected": 2.4965484142303467, "logps/chosen": -396.95220947265625, "logps/rejected": -476.5635681152344, "loss": 0.3376, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.05275803059339523, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.4440207481384277, "rewards/student_margin": 2.3912625312805176, "rewards/teacher_margin": 0.0, "step": 1230 }, { "epoch": 0.19, "grad_norm": 14.1875, "learning_rate": 4.8656162870608855e-06, "logits/chosen": 1.8774135112762451, "logits/rejected": 2.4191958904266357, "logps/chosen": -295.61846923828125, "logps/rejected": -503.3675842285156, "loss": 0.2857, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.5429339408874512, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0950636863708496, "rewards/student_margin": 3.6379973888397217, "rewards/teacher_margin": 0.0, "step": 1240 }, { "epoch": 0.2, "grad_norm": 10.8125, "learning_rate": 4.861156761634014e-06, "logits/chosen": 2.1938087940216064, "logits/rejected": 2.002340793609619, "logps/chosen": -316.1131286621094, "logps/rejected": -410.16839599609375, "loss": 0.2649, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.7077531814575195, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8897695541381836, "rewards/student_margin": 3.5975232124328613, "rewards/teacher_margin": 0.0, "step": 1250 }, { "epoch": 0.2, "grad_norm": 22.0, "learning_rate": 4.856626556114942e-06, "logits/chosen": 2.255521774291992, "logits/rejected": 2.4610867500305176, "logps/chosen": -363.5118408203125, "logps/rejected": -506.02978515625, "loss": 0.37, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.504284679889679, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8391990661621094, "rewards/student_margin": 3.3434836864471436, "rewards/teacher_margin": 0.0, "step": 1260 }, { "epoch": 0.2, "grad_norm": 17.125, "learning_rate": 4.852025806113194e-06, "logits/chosen": 2.0991051197052, "logits/rejected": 2.092215061187744, "logps/chosen": -357.8221740722656, "logps/rejected": -460.8573303222656, "loss": 0.4165, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": -0.04353028163313866, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.8869564533233643, "rewards/student_margin": 1.8434261083602905, "rewards/teacher_margin": 0.0, "step": 1270 }, { "epoch": 0.2, "grad_norm": 19.625, "learning_rate": 4.847354649350008e-06, "logits/chosen": 2.4254791736602783, "logits/rejected": 2.3545644283294678, "logps/chosen": -368.5026550292969, "logps/rejected": -470.0302734375, "loss": 0.3041, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3097531795501709, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0843381881713867, "rewards/student_margin": 3.3940911293029785, "rewards/teacher_margin": 0.0, "step": 1280 }, { "epoch": 0.2, "grad_norm": 12.125, "learning_rate": 4.842613225654216e-06, "logits/chosen": 2.165343761444092, "logits/rejected": 2.4036753177642822, "logps/chosen": -377.92913818359375, "logps/rejected": -475.9405822753906, "loss": 0.33, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.188862606883049, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.929375410079956, "rewards/student_margin": 3.1182379722595215, "rewards/teacher_margin": 0.0, "step": 1290 }, { "epoch": 0.2, "grad_norm": 25.25, "learning_rate": 4.837801676958055e-06, "logits/chosen": 2.1773734092712402, "logits/rejected": 2.0151267051696777, "logps/chosen": -401.71221923828125, "logps/rejected": -431.99066162109375, "loss": 0.3157, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.8157631158828735, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.2871954441070557, "rewards/student_margin": 3.1029586791992188, "rewards/teacher_margin": 0.0, "step": 1300 }, { "epoch": 0.21, "grad_norm": 15.375, "learning_rate": 4.832920147292923e-06, "logits/chosen": 1.9885914325714111, "logits/rejected": 2.177318572998047, "logps/chosen": -351.4902038574219, "logps/rejected": -432.939208984375, "loss": 0.3695, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.15478616952896118, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8492910861968994, "rewards/student_margin": 3.004077434539795, "rewards/teacher_margin": 0.0, "step": 1310 }, { "epoch": 0.21, "grad_norm": 17.75, "learning_rate": 4.827968782785062e-06, "logits/chosen": 2.1921262741088867, "logits/rejected": 2.2043137550354004, "logps/chosen": -431.90997314453125, "logps/rejected": -447.6964416503906, "loss": 0.3148, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.4941243529319763, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.261469602584839, "rewards/student_margin": 2.755593776702881, "rewards/teacher_margin": 0.0, "step": 1320 }, { "epoch": 0.21, "grad_norm": 10.0625, "learning_rate": 4.8229477316511875e-06, "logits/chosen": 2.441920280456543, "logits/rejected": 2.3516249656677246, "logps/chosen": -308.2803955078125, "logps/rejected": -393.41986083984375, "loss": 0.2672, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.6950597167015076, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.89546537399292, "rewards/student_margin": 3.5905251502990723, "rewards/teacher_margin": 0.0, "step": 1330 }, { "epoch": 0.21, "grad_norm": 13.5625, "learning_rate": 4.8178571441940515e-06, "logits/chosen": 2.332716941833496, "logits/rejected": 2.0264029502868652, "logps/chosen": -425.8597717285156, "logps/rejected": -412.5272521972656, "loss": 0.3628, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5689414739608765, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.919766664505005, "rewards/student_margin": 3.488708019256592, "rewards/teacher_margin": 0.0, "step": 1340 }, { "epoch": 0.21, "grad_norm": 11.0625, "learning_rate": 4.81269717279794e-06, "logits/chosen": 2.3111302852630615, "logits/rejected": 2.4411330223083496, "logps/chosen": -344.7596130371094, "logps/rejected": -455.4334411621094, "loss": 0.2574, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.8965075612068176, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6903486251831055, "rewards/student_margin": 3.5868563652038574, "rewards/teacher_margin": 0.0, "step": 1350 }, { "epoch": 0.21, "grad_norm": 19.375, "learning_rate": 4.8074679719241164e-06, "logits/chosen": 2.2593300342559814, "logits/rejected": 2.3117194175720215, "logps/chosen": -353.96051025390625, "logps/rejected": -432.08111572265625, "loss": 0.3173, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 1.1041176319122314, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.9407297372817993, "rewards/student_margin": 3.044847249984741, "rewards/teacher_margin": 0.0, "step": 1360 }, { "epoch": 0.21, "grad_norm": 21.0, "learning_rate": 4.8021696981061946e-06, "logits/chosen": 2.317605495452881, "logits/rejected": 2.3511059284210205, "logps/chosen": -338.8973388671875, "logps/rejected": -430.4156799316406, "loss": 0.4019, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.1750636100769043, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.558255672454834, "rewards/student_margin": 3.7333195209503174, "rewards/teacher_margin": 0.0, "step": 1370 }, { "epoch": 0.22, "grad_norm": 18.0, "learning_rate": 4.796802509945453e-06, "logits/chosen": 2.1983585357666016, "logits/rejected": 2.290005922317505, "logps/chosen": -362.72491455078125, "logps/rejected": -420.1893005371094, "loss": 0.2684, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.07416917383670807, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.406132221221924, "rewards/student_margin": 2.4803013801574707, "rewards/teacher_margin": 0.0, "step": 1380 }, { "epoch": 0.22, "grad_norm": 20.25, "learning_rate": 4.791366568106087e-06, "logits/chosen": 2.228454828262329, "logits/rejected": 2.370490550994873, "logps/chosen": -319.6820983886719, "logps/rejected": -350.31109619140625, "loss": 0.4145, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.41793814301490784, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.933131217956543, "rewards/student_margin": 2.351069450378418, "rewards/teacher_margin": 0.0, "step": 1390 }, { "epoch": 0.22, "grad_norm": 21.375, "learning_rate": 4.7858620353104035e-06, "logits/chosen": 2.4289729595184326, "logits/rejected": 2.522684335708618, "logps/chosen": -354.356689453125, "logps/rejected": -485.9098205566406, "loss": 0.2402, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 1.0219841003417969, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5192878246307373, "rewards/student_margin": 3.541271924972534, "rewards/teacher_margin": 0.0, "step": 1400 }, { "epoch": 0.22, "grad_norm": 22.375, "learning_rate": 4.780289076333944e-06, "logits/chosen": 2.4191651344299316, "logits/rejected": 2.3104605674743652, "logps/chosen": -383.91668701171875, "logps/rejected": -432.3193359375, "loss": 0.2648, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.6892379522323608, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.025803804397583, "rewards/student_margin": 3.7150416374206543, "rewards/teacher_margin": 0.0, "step": 1410 }, { "epoch": 0.22, "grad_norm": 21.0, "learning_rate": 4.774647858000554e-06, "logits/chosen": 2.437727451324463, "logits/rejected": 2.4620790481567383, "logps/chosen": -368.26605224609375, "logps/rejected": -445.8623046875, "loss": 0.3025, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.05018618702888489, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2815146446228027, "rewards/student_margin": 3.2313284873962402, "rewards/teacher_margin": 0.0, "step": 1420 }, { "epoch": 0.22, "grad_norm": 13.875, "learning_rate": 4.7689385491773934e-06, "logits/chosen": 2.413599967956543, "logits/rejected": 2.444441080093384, "logps/chosen": -406.80731201171875, "logps/rejected": -479.5438537597656, "loss": 0.291, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.4798935055732727, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9961471557617188, "rewards/student_margin": 4.476040840148926, "rewards/teacher_margin": 0.0, "step": 1430 }, { "epoch": 0.23, "grad_norm": 14.625, "learning_rate": 4.763161320769875e-06, "logits/chosen": 2.052940845489502, "logits/rejected": 2.2601702213287354, "logps/chosen": -375.41497802734375, "logps/rejected": -448.98651123046875, "loss": 0.3702, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.8520803451538086, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.104666233062744, "rewards/student_margin": 2.9567465782165527, "rewards/teacher_margin": 0.0, "step": 1440 }, { "epoch": 0.23, "grad_norm": 22.875, "learning_rate": 4.757316345716554e-06, "logits/chosen": 2.1091980934143066, "logits/rejected": 2.1563944816589355, "logps/chosen": -413.22650146484375, "logps/rejected": -449.48345947265625, "loss": 0.3383, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.4480951726436615, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7518672943115234, "rewards/student_margin": 3.1999621391296387, "rewards/teacher_margin": 0.0, "step": 1450 }, { "epoch": 0.23, "grad_norm": 19.875, "learning_rate": 4.751403798983946e-06, "logits/chosen": 2.420492649078369, "logits/rejected": 2.2154927253723145, "logps/chosen": -353.75286865234375, "logps/rejected": -378.8622131347656, "loss": 0.3719, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.48272910714149475, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.011146068572998, "rewards/student_margin": 2.493875026702881, "rewards/teacher_margin": 0.0, "step": 1460 }, { "epoch": 0.23, "grad_norm": 17.375, "learning_rate": 4.7454238575612965e-06, "logits/chosen": 2.088392972946167, "logits/rejected": 2.173022747039795, "logps/chosen": -372.3373718261719, "logps/rejected": -413.6541442871094, "loss": 0.3051, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.3563021719455719, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.3876583576202393, "rewards/student_margin": 2.7439606189727783, "rewards/teacher_margin": 0.0, "step": 1470 }, { "epoch": 0.23, "grad_norm": 13.875, "learning_rate": 4.739376700455275e-06, "logits/chosen": 2.137221336364746, "logits/rejected": 2.225682258605957, "logps/chosen": -309.23876953125, "logps/rejected": -452.2105407714844, "loss": 0.3229, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.4685831665992737, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.294102907180786, "rewards/student_margin": 2.762686014175415, "rewards/teacher_margin": 0.0, "step": 1480 }, { "epoch": 0.23, "grad_norm": 9.3125, "learning_rate": 4.733262508684622e-06, "logits/chosen": 2.208930492401123, "logits/rejected": 1.9213775396347046, "logps/chosen": -451.2225036621094, "logps/rejected": -451.5113220214844, "loss": 0.2791, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.22047753632068634, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9555349349975586, "rewards/student_margin": 3.1760125160217285, "rewards/teacher_margin": 0.0, "step": 1490 }, { "epoch": 0.24, "grad_norm": 22.125, "learning_rate": 4.727081465274727e-06, "logits/chosen": 2.067206621170044, "logits/rejected": 2.1477036476135254, "logps/chosen": -317.67230224609375, "logps/rejected": -471.792236328125, "loss": 0.3725, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2624381482601166, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.4429574012756348, "rewards/student_margin": 2.705395460128784, "rewards/teacher_margin": 0.0, "step": 1500 }, { "epoch": 0.24, "grad_norm": 19.25, "learning_rate": 4.720833755252154e-06, "logits/chosen": 2.4636502265930176, "logits/rejected": 2.186276435852051, "logps/chosen": -393.92572021484375, "logps/rejected": -395.9743347167969, "loss": 0.3356, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.16287533938884735, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.943005084991455, "rewards/student_margin": 3.1058807373046875, "rewards/teacher_margin": 0.0, "step": 1510 }, { "epoch": 0.24, "grad_norm": 22.375, "learning_rate": 4.714519565639095e-06, "logits/chosen": 2.4763877391815186, "logits/rejected": 2.6625988483428955, "logps/chosen": -384.75030517578125, "logps/rejected": -474.10882568359375, "loss": 0.2403, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.471353679895401, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.203866481781006, "rewards/student_margin": 4.675220012664795, "rewards/teacher_margin": 0.0, "step": 1520 }, { "epoch": 0.24, "grad_norm": 25.25, "learning_rate": 4.7081390854477815e-06, "logits/chosen": 2.0345566272735596, "logits/rejected": 2.147205114364624, "logps/chosen": -315.8880615234375, "logps/rejected": -411.25421142578125, "loss": 0.3292, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.47202712297439575, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.308072566986084, "rewards/student_margin": 3.780099391937256, "rewards/teacher_margin": 0.0, "step": 1530 }, { "epoch": 0.24, "grad_norm": 26.25, "learning_rate": 4.70169250567482e-06, "logits/chosen": 2.2077202796936035, "logits/rejected": 2.2679662704467773, "logps/chosen": -362.367431640625, "logps/rejected": -460.81707763671875, "loss": 0.3154, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.6270269155502319, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8426284790039062, "rewards/student_margin": 3.4696555137634277, "rewards/teacher_margin": 0.0, "step": 1540 }, { "epoch": 0.24, "grad_norm": 16.0, "learning_rate": 4.695180019295476e-06, "logits/chosen": 2.1336255073547363, "logits/rejected": 1.9491550922393799, "logps/chosen": -353.28778076171875, "logps/rejected": -441.46099853515625, "loss": 0.2586, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.426022469997406, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7616546154022217, "rewards/student_margin": 4.187676906585693, "rewards/teacher_margin": 0.0, "step": 1550 }, { "epoch": 0.24, "grad_norm": 13.25, "learning_rate": 4.688601821257895e-06, "logits/chosen": 2.2268261909484863, "logits/rejected": 2.272552967071533, "logps/chosen": -351.8265686035156, "logps/rejected": -471.8194274902344, "loss": 0.2466, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 1.074945330619812, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5835683345794678, "rewards/student_margin": 3.6585135459899902, "rewards/teacher_margin": 0.0, "step": 1560 }, { "epoch": 0.25, "grad_norm": 15.3125, "learning_rate": 4.6819581084772754e-06, "logits/chosen": 2.265319585800171, "logits/rejected": 2.2556183338165283, "logps/chosen": -404.9942321777344, "logps/rejected": -497.8770446777344, "loss": 0.2324, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.164621502161026, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1226675510406494, "rewards/student_margin": 3.2872891426086426, "rewards/teacher_margin": 0.0, "step": 1570 }, { "epoch": 0.25, "grad_norm": 10.25, "learning_rate": 4.675249079829962e-06, "logits/chosen": 2.379080295562744, "logits/rejected": 2.155766010284424, "logps/chosen": -385.7734069824219, "logps/rejected": -523.2696533203125, "loss": 0.2713, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.8772011995315552, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3319172859191895, "rewards/student_margin": 4.209117889404297, "rewards/teacher_margin": 0.0, "step": 1580 }, { "epoch": 0.25, "grad_norm": 19.75, "learning_rate": 4.668474936147502e-06, "logits/chosen": 2.1541495323181152, "logits/rejected": 2.3214666843414307, "logps/chosen": -381.10723876953125, "logps/rejected": -476.5653381347656, "loss": 0.3971, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.05469512939453125, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.138357400894165, "rewards/student_margin": 3.1930525302886963, "rewards/teacher_margin": 0.0, "step": 1590 }, { "epoch": 0.25, "grad_norm": 17.5, "learning_rate": 4.661635880210628e-06, "logits/chosen": 2.2829995155334473, "logits/rejected": 2.254088878631592, "logps/chosen": -444.8953552246094, "logps/rejected": -551.3001098632812, "loss": 0.3423, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.4792858064174652, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7514395713806152, "rewards/student_margin": 3.2307255268096924, "rewards/teacher_margin": 0.0, "step": 1600 }, { "epoch": 0.25, "grad_norm": 15.3125, "learning_rate": 4.654732116743193e-06, "logits/chosen": 2.1041347980499268, "logits/rejected": 2.3102529048919678, "logps/chosen": -351.86895751953125, "logps/rejected": -471.9771423339844, "loss": 0.3189, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9735881090164185, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7338075637817383, "rewards/student_margin": 3.7073960304260254, "rewards/teacher_margin": 0.0, "step": 1610 }, { "epoch": 0.25, "grad_norm": 18.625, "learning_rate": 4.647763852406034e-06, "logits/chosen": 2.1601524353027344, "logits/rejected": 2.3111846446990967, "logps/chosen": -330.41400146484375, "logps/rejected": -475.339599609375, "loss": 0.349, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.6567579507827759, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.533489227294922, "rewards/student_margin": 5.190247535705566, "rewards/teacher_margin": 0.0, "step": 1620 }, { "epoch": 0.26, "grad_norm": 14.0, "learning_rate": 4.640731295790793e-06, "logits/chosen": 2.3584513664245605, "logits/rejected": 2.1575541496276855, "logps/chosen": -375.91217041015625, "logps/rejected": -393.40020751953125, "loss": 0.2929, "rewards/accuracies": 0.6333333253860474, "rewards/chosen": 0.6348249316215515, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.7159254550933838, "rewards/student_margin": 2.35075044631958, "rewards/teacher_margin": 0.0, "step": 1630 }, { "epoch": 0.26, "grad_norm": 13.25, "learning_rate": 4.633634657413673e-06, "logits/chosen": 1.9272451400756836, "logits/rejected": 2.114234209060669, "logps/chosen": -415.65301513671875, "logps/rejected": -496.1805725097656, "loss": 0.2812, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.19299495220184326, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4423460960388184, "rewards/student_margin": 3.635340929031372, "rewards/teacher_margin": 0.0, "step": 1640 }, { "epoch": 0.26, "grad_norm": 20.375, "learning_rate": 4.626474149709127e-06, "logits/chosen": 2.3737661838531494, "logits/rejected": 2.359593629837036, "logps/chosen": -389.91644287109375, "logps/rejected": -495.78594970703125, "loss": 0.2574, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.7729851007461548, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2747325897216797, "rewards/student_margin": 4.047717094421387, "rewards/teacher_margin": 0.0, "step": 1650 }, { "epoch": 0.26, "grad_norm": 16.375, "learning_rate": 4.619249987023513e-06, "logits/chosen": 2.1319570541381836, "logits/rejected": 2.2157890796661377, "logps/chosen": -365.80987548828125, "logps/rejected": -430.28875732421875, "loss": 0.2578, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1358619928359985, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.20314621925354, "rewards/student_margin": 3.339008331298828, "rewards/teacher_margin": 0.0, "step": 1660 }, { "epoch": 0.26, "grad_norm": 20.0, "learning_rate": 4.6119623856086655e-06, "logits/chosen": 2.3459951877593994, "logits/rejected": 2.170069932937622, "logps/chosen": -341.6787414550781, "logps/rejected": -374.75457763671875, "loss": 0.3468, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.5894002318382263, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.4443989992141724, "rewards/student_margin": 2.033799171447754, "rewards/teacher_margin": 0.0, "step": 1670 }, { "epoch": 0.26, "grad_norm": 21.0, "learning_rate": 4.604611563615428e-06, "logits/chosen": 2.3754425048828125, "logits/rejected": 2.578775405883789, "logps/chosen": -403.689697265625, "logps/rejected": -460.97735595703125, "loss": 0.351, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.23234355449676514, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8285892009735107, "rewards/student_margin": 3.0609323978424072, "rewards/teacher_margin": 0.0, "step": 1680 }, { "epoch": 0.26, "grad_norm": 6.09375, "learning_rate": 4.597197741087124e-06, "logits/chosen": 2.260488510131836, "logits/rejected": 2.0914623737335205, "logps/chosen": -389.6407165527344, "logps/rejected": -409.7547302246094, "loss": 0.283, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.5259709358215332, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.028218984603882, "rewards/student_margin": 3.554190158843994, "rewards/teacher_margin": 0.0, "step": 1690 }, { "epoch": 0.27, "grad_norm": 16.375, "learning_rate": 4.589721139952964e-06, "logits/chosen": 2.324218273162842, "logits/rejected": 2.0411252975463867, "logps/chosen": -391.35577392578125, "logps/rejected": -429.2552795410156, "loss": 0.2778, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.5151987671852112, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7141525745391846, "rewards/student_margin": 3.229351043701172, "rewards/teacher_margin": 0.0, "step": 1700 }, { "epoch": 0.27, "grad_norm": 24.875, "learning_rate": 4.582181984021407e-06, "logits/chosen": 2.0803442001342773, "logits/rejected": 2.34643292427063, "logps/chosen": -382.03692626953125, "logps/rejected": -552.8040771484375, "loss": 0.3395, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.684313952922821, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.0848798751831055, "rewards/student_margin": 4.769193649291992, "rewards/teacher_margin": 0.0, "step": 1710 }, { "epoch": 0.27, "grad_norm": 22.625, "learning_rate": 4.574580498973462e-06, "logits/chosen": 2.358665943145752, "logits/rejected": 2.3511786460876465, "logps/chosen": -399.3021240234375, "logps/rejected": -518.3663330078125, "loss": 0.2704, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": 0.36852699518203735, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0613694190979004, "rewards/student_margin": 3.429896593093872, "rewards/teacher_margin": 0.0, "step": 1720 }, { "epoch": 0.27, "grad_norm": 17.25, "learning_rate": 4.566916912355926e-06, "logits/chosen": 2.2575502395629883, "logits/rejected": 2.233567953109741, "logps/chosen": -402.64813232421875, "logps/rejected": -480.33038330078125, "loss": 0.3164, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.771281898021698, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0568840503692627, "rewards/student_margin": 3.8281655311584473, "rewards/teacher_margin": 0.0, "step": 1730 }, { "epoch": 0.27, "grad_norm": 6.78125, "learning_rate": 4.559191453574582e-06, "logits/chosen": 2.307366371154785, "logits/rejected": 2.103435754776001, "logps/chosen": -390.2318420410156, "logps/rejected": -414.7093200683594, "loss": 0.2413, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 1.2031283378601074, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.36790132522583, "rewards/student_margin": 3.5710291862487793, "rewards/teacher_margin": 0.0, "step": 1740 }, { "epoch": 0.27, "grad_norm": 16.75, "learning_rate": 4.551404353887322e-06, "logits/chosen": 2.034942150115967, "logits/rejected": 2.0218663215637207, "logps/chosen": -382.4777526855469, "logps/rejected": -440.51031494140625, "loss": 0.2378, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8022573590278625, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.452645778656006, "rewards/student_margin": 3.2549030780792236, "rewards/teacher_margin": 0.0, "step": 1750 }, { "epoch": 0.28, "grad_norm": 16.625, "learning_rate": 4.54355584639723e-06, "logits/chosen": 2.197904109954834, "logits/rejected": 2.2808008193969727, "logps/chosen": -408.74017333984375, "logps/rejected": -473.0552673339844, "loss": 0.3453, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.4888620972633362, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.433514356613159, "rewards/student_margin": 3.922377109527588, "rewards/teacher_margin": 0.0, "step": 1760 }, { "epoch": 0.28, "grad_norm": 17.0, "learning_rate": 4.5356461660456045e-06, "logits/chosen": 2.164057970046997, "logits/rejected": 2.2220771312713623, "logps/chosen": -377.1018371582031, "logps/rejected": -479.0262145996094, "loss": 0.2559, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.7700233459472656, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.1522347927093506, "rewards/student_margin": 2.922258138656616, "rewards/teacher_margin": 0.0, "step": 1770 }, { "epoch": 0.28, "grad_norm": 18.0, "learning_rate": 4.527675549604921e-06, "logits/chosen": 2.0765137672424316, "logits/rejected": 2.0573270320892334, "logps/chosen": -369.86907958984375, "logps/rejected": -471.10772705078125, "loss": 0.2606, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.7050718069076538, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8469743728637695, "rewards/student_margin": 3.552046298980713, "rewards/teacher_margin": 0.0, "step": 1780 }, { "epoch": 0.28, "grad_norm": 13.1875, "learning_rate": 4.5196442356717526e-06, "logits/chosen": 2.36519193649292, "logits/rejected": 2.098045825958252, "logps/chosen": -351.6300354003906, "logps/rejected": -414.138916015625, "loss": 0.2398, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 0.5385591983795166, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6042535305023193, "rewards/student_margin": 3.142812728881836, "rewards/teacher_margin": 0.0, "step": 1790 }, { "epoch": 0.28, "grad_norm": 12.9375, "learning_rate": 4.511552464659617e-06, "logits/chosen": 2.006380796432495, "logits/rejected": 2.1198132038116455, "logps/chosen": -411.06671142578125, "logps/rejected": -483.88983154296875, "loss": 0.2057, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.7215427160263062, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9890451431274414, "rewards/student_margin": 3.7105884552001953, "rewards/teacher_margin": 0.0, "step": 1800 }, { "epoch": 0.28, "grad_norm": 18.125, "learning_rate": 4.5034004787917905e-06, "logits/chosen": 2.125535011291504, "logits/rejected": 2.2896571159362793, "logps/chosen": -340.19482421875, "logps/rejected": -465.0826110839844, "loss": 0.2562, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.056809235364198685, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.745903491973877, "rewards/student_margin": 3.8027126789093018, "rewards/teacher_margin": 0.0, "step": 1810 }, { "epoch": 0.29, "grad_norm": 15.3125, "learning_rate": 4.49518852209405e-06, "logits/chosen": 2.24306058883667, "logits/rejected": 1.8288602828979492, "logps/chosen": -355.8966979980469, "logps/rejected": -383.4837951660156, "loss": 0.3246, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.6151221990585327, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.861602544784546, "rewards/student_margin": 3.4767251014709473, "rewards/teacher_margin": 0.0, "step": 1820 }, { "epoch": 0.29, "grad_norm": 16.75, "learning_rate": 4.486916840387366e-06, "logits/chosen": 2.3944616317749023, "logits/rejected": 2.1694493293762207, "logps/chosen": -401.08599853515625, "logps/rejected": -397.0449523925781, "loss": 0.3608, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.22733919322490692, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6191203594207764, "rewards/student_margin": 2.8464598655700684, "rewards/teacher_margin": 0.0, "step": 1830 }, { "epoch": 0.29, "grad_norm": 17.0, "learning_rate": 4.4785856812805575e-06, "logits/chosen": 1.9857267141342163, "logits/rejected": 2.2452406883239746, "logps/chosen": -370.99114990234375, "logps/rejected": -478.3790588378906, "loss": 0.279, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.05579733848571777, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.295580863952637, "rewards/student_margin": 4.351378440856934, "rewards/teacher_margin": 0.0, "step": 1840 }, { "epoch": 0.29, "grad_norm": 18.375, "learning_rate": 4.470195294162863e-06, "logits/chosen": 2.214505672454834, "logits/rejected": 2.1600468158721924, "logps/chosen": -356.6380920410156, "logps/rejected": -429.8399353027344, "loss": 0.3058, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.24384383857250214, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.390477418899536, "rewards/student_margin": 3.6343212127685547, "rewards/teacher_margin": 0.0, "step": 1850 }, { "epoch": 0.29, "grad_norm": 17.25, "learning_rate": 4.461745930196488e-06, "logits/chosen": 2.304492473602295, "logits/rejected": 2.2834553718566895, "logps/chosen": -351.2618103027344, "logps/rejected": -446.447265625, "loss": 0.3119, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.3111915588378906, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7691330909729004, "rewards/student_margin": 3.080324649810791, "rewards/teacher_margin": 0.0, "step": 1860 }, { "epoch": 0.29, "grad_norm": 20.75, "learning_rate": 4.45323784230908e-06, "logits/chosen": 2.0996408462524414, "logits/rejected": 2.3790407180786133, "logps/chosen": -353.23980712890625, "logps/rejected": -435.61419677734375, "loss": 0.277, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.19356688857078552, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2387123107910156, "rewards/student_margin": 3.432279109954834, "rewards/teacher_margin": 0.0, "step": 1870 }, { "epoch": 0.29, "grad_norm": 11.1875, "learning_rate": 4.444671285186155e-06, "logits/chosen": 2.405585765838623, "logits/rejected": 2.414564609527588, "logps/chosen": -371.3876953125, "logps/rejected": -414.8853454589844, "loss": 0.3107, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.3823794424533844, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.05053448677063, "rewards/student_margin": 3.4329142570495605, "rewards/teacher_margin": 0.0, "step": 1880 }, { "epoch": 0.3, "grad_norm": 13.875, "learning_rate": 4.436046515263484e-06, "logits/chosen": 2.3927066326141357, "logits/rejected": 2.5018389225006104, "logps/chosen": -350.46051025390625, "logps/rejected": -426.7987365722656, "loss": 0.3189, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.46986547112464905, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6895954608917236, "rewards/student_margin": 3.1594607830047607, "rewards/teacher_margin": 0.0, "step": 1890 }, { "epoch": 0.3, "grad_norm": 11.125, "learning_rate": 4.427363790719406e-06, "logits/chosen": 2.118764638900757, "logits/rejected": 2.0256009101867676, "logps/chosen": -359.9532165527344, "logps/rejected": -454.215087890625, "loss": 0.2217, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.7649208903312683, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.139664649963379, "rewards/student_margin": 2.904585361480713, "rewards/teacher_margin": 0.0, "step": 1900 }, { "epoch": 0.3, "grad_norm": 13.5, "learning_rate": 4.418623371467104e-06, "logits/chosen": 2.3266701698303223, "logits/rejected": 2.2130565643310547, "logps/chosen": -351.64813232421875, "logps/rejected": -442.3761291503906, "loss": 0.3528, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.761719822883606, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8674395084381104, "rewards/student_margin": 3.6291592121124268, "rewards/teacher_margin": 0.0, "step": 1910 }, { "epoch": 0.3, "grad_norm": 17.0, "learning_rate": 4.409825519146827e-06, "logits/chosen": 2.3715548515319824, "logits/rejected": 2.254122018814087, "logps/chosen": -343.1055908203125, "logps/rejected": -429.45904541015625, "loss": 0.2404, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3804914057254791, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.921339988708496, "rewards/student_margin": 3.301831007003784, "rewards/teacher_margin": 0.0, "step": 1920 }, { "epoch": 0.3, "grad_norm": 17.0, "learning_rate": 4.400970497118052e-06, "logits/chosen": 2.2375292778015137, "logits/rejected": 2.035963773727417, "logps/chosen": -379.23663330078125, "logps/rejected": -474.33837890625, "loss": 0.3545, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.8393847346305847, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.995181083679199, "rewards/student_margin": 3.834566116333008, "rewards/teacher_margin": 0.0, "step": 1930 }, { "epoch": 0.3, "grad_norm": 8.375, "learning_rate": 4.392058570451605e-06, "logits/chosen": 2.0531041622161865, "logits/rejected": 2.0584397315979004, "logps/chosen": -404.27874755859375, "logps/rejected": -477.31097412109375, "loss": 0.2526, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.6113362908363342, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3851611614227295, "rewards/student_margin": 3.996497631072998, "rewards/teacher_margin": 0.0, "step": 1940 }, { "epoch": 0.31, "grad_norm": 14.0625, "learning_rate": 4.3830900059217265e-06, "logits/chosen": 2.2479584217071533, "logits/rejected": 2.0480430126190186, "logps/chosen": -396.1625061035156, "logps/rejected": -396.0016174316406, "loss": 0.2374, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.4396853446960449, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4994571208953857, "rewards/student_margin": 3.9391422271728516, "rewards/teacher_margin": 0.0, "step": 1950 }, { "epoch": 0.31, "grad_norm": 12.125, "learning_rate": 4.374065071998081e-06, "logits/chosen": 2.0664987564086914, "logits/rejected": 2.1982455253601074, "logps/chosen": -331.90289306640625, "logps/rejected": -425.8673400878906, "loss": 0.2905, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.9614151120185852, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5811235904693604, "rewards/student_margin": 3.5425384044647217, "rewards/teacher_margin": 0.0, "step": 1960 }, { "epoch": 0.31, "grad_norm": 23.125, "learning_rate": 4.364984038837727e-06, "logits/chosen": 2.291724443435669, "logits/rejected": 2.2664742469787598, "logps/chosen": -367.8359375, "logps/rejected": -418.68634033203125, "loss": 0.3219, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.5604678392410278, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.2192113399505615, "rewards/student_margin": 2.7796790599823, "rewards/teacher_margin": 0.0, "step": 1970 }, { "epoch": 0.31, "grad_norm": 18.625, "learning_rate": 4.355847178277025e-06, "logits/chosen": 2.3209898471832275, "logits/rejected": 2.3509840965270996, "logps/chosen": -325.83489990234375, "logps/rejected": -377.1394348144531, "loss": 0.2667, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.5430029630661011, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7181413173675537, "rewards/student_margin": 3.2611446380615234, "rewards/teacher_margin": 0.0, "step": 1980 }, { "epoch": 0.31, "grad_norm": 15.8125, "learning_rate": 4.346654763823501e-06, "logits/chosen": 2.183046817779541, "logits/rejected": 1.9843237400054932, "logps/chosen": -360.84307861328125, "logps/rejected": -412.257568359375, "loss": 0.3142, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.40714240074157715, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.948389768600464, "rewards/student_margin": 3.355532169342041, "rewards/teacher_margin": 0.0, "step": 1990 }, { "epoch": 0.31, "grad_norm": 19.375, "learning_rate": 4.337407070647662e-06, "logits/chosen": 2.1943626403808594, "logits/rejected": 2.096409320831299, "logps/chosen": -398.80340576171875, "logps/rejected": -463.10760498046875, "loss": 0.2982, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.49700015783309937, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9807357788085938, "rewards/student_margin": 3.477735996246338, "rewards/teacher_margin": 0.0, "step": 2000 }, { "epoch": 0.32, "grad_norm": 14.5625, "learning_rate": 4.328104375574756e-06, "logits/chosen": 2.01296067237854, "logits/rejected": 1.958081841468811, "logps/chosen": -336.89556884765625, "logps/rejected": -390.3586120605469, "loss": 0.3148, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.529579222202301, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9655940532684326, "rewards/student_margin": 3.495173215866089, "rewards/teacher_margin": 0.0, "step": 2010 }, { "epoch": 0.32, "grad_norm": 13.0, "learning_rate": 4.318746957076486e-06, "logits/chosen": 2.1673381328582764, "logits/rejected": 2.236161708831787, "logps/chosen": -381.1363525390625, "logps/rejected": -494.91351318359375, "loss": 0.3579, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.6766161918640137, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.954303026199341, "rewards/student_margin": 4.630919456481934, "rewards/teacher_margin": 0.0, "step": 2020 }, { "epoch": 0.32, "grad_norm": 17.25, "learning_rate": 4.309335095262675e-06, "logits/chosen": 2.2419989109039307, "logits/rejected": 2.255988597869873, "logps/chosen": -390.44195556640625, "logps/rejected": -534.3837280273438, "loss": 0.2538, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 0.167364239692688, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3419766426086426, "rewards/student_margin": 3.509340763092041, "rewards/teacher_margin": 0.0, "step": 2030 }, { "epoch": 0.32, "grad_norm": 25.5, "learning_rate": 4.299869071872882e-06, "logits/chosen": 2.2394516468048096, "logits/rejected": 2.4231152534484863, "logps/chosen": -370.52862548828125, "logps/rejected": -478.20501708984375, "loss": 0.3318, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.022884875535964966, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5218944549560547, "rewards/student_margin": 2.544779062271118, "rewards/teacher_margin": 0.0, "step": 2040 }, { "epoch": 0.32, "grad_norm": 8.1875, "learning_rate": 4.290349170267961e-06, "logits/chosen": 2.2540676593780518, "logits/rejected": 2.154564619064331, "logps/chosen": -318.80499267578125, "logps/rejected": -373.9777526855469, "loss": 0.2288, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 1.1226544380187988, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.579138994216919, "rewards/student_margin": 2.7017934322357178, "rewards/teacher_margin": 0.0, "step": 2050 }, { "epoch": 0.32, "grad_norm": 16.5, "learning_rate": 4.2807756754215926e-06, "logits/chosen": 2.469200611114502, "logits/rejected": 2.289149284362793, "logps/chosen": -371.82708740234375, "logps/rejected": -413.82843017578125, "loss": 0.2621, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6293642520904541, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.8229892253875732, "rewards/student_margin": 2.4523534774780273, "rewards/teacher_margin": 0.0, "step": 2060 }, { "epoch": 0.32, "grad_norm": 17.375, "learning_rate": 4.271148873911739e-06, "logits/chosen": 2.1347861289978027, "logits/rejected": 2.0511717796325684, "logps/chosen": -370.75054931640625, "logps/rejected": -376.75396728515625, "loss": 0.2806, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.3916610777378082, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5346503257751465, "rewards/student_margin": 2.9263112545013428, "rewards/teacher_margin": 0.0, "step": 2070 }, { "epoch": 0.33, "grad_norm": 19.375, "learning_rate": 4.261469053912075e-06, "logits/chosen": 2.548414945602417, "logits/rejected": 2.3109049797058105, "logps/chosen": -390.76873779296875, "logps/rejected": -392.36077880859375, "loss": 0.272, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.8124133944511414, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.1957788467407227, "rewards/student_margin": 3.0081918239593506, "rewards/teacher_margin": 0.0, "step": 2080 }, { "epoch": 0.33, "grad_norm": 17.625, "learning_rate": 4.2517365051833564e-06, "logits/chosen": 2.0476605892181396, "logits/rejected": 2.250492811203003, "logps/chosen": -389.4486389160156, "logps/rejected": -472.5542907714844, "loss": 0.2874, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5750201344490051, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.316688060760498, "rewards/student_margin": 3.8917083740234375, "rewards/teacher_margin": 0.0, "step": 2090 }, { "epoch": 0.33, "grad_norm": 13.5, "learning_rate": 4.24195151906475e-06, "logits/chosen": 2.1893954277038574, "logits/rejected": 2.3351621627807617, "logps/chosen": -333.552490234375, "logps/rejected": -412.4090881347656, "loss": 0.297, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.459265798330307, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.725071430206299, "rewards/student_margin": 3.1843373775482178, "rewards/teacher_margin": 0.0, "step": 2100 }, { "epoch": 0.33, "grad_norm": 13.875, "learning_rate": 4.23211438846511e-06, "logits/chosen": 2.2877869606018066, "logits/rejected": 2.4389865398406982, "logps/chosen": -361.0514831542969, "logps/rejected": -408.6927795410156, "loss": 0.2372, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.01857222244143486, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7370243072509766, "rewards/student_margin": 3.75559663772583, "rewards/teacher_margin": 0.0, "step": 2110 }, { "epoch": 0.33, "grad_norm": 20.75, "learning_rate": 4.222225407854208e-06, "logits/chosen": 2.2848269939422607, "logits/rejected": 2.4588122367858887, "logps/chosen": -359.1607971191406, "logps/rejected": -433.43927001953125, "loss": 0.2584, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.8308922052383423, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0730090141296387, "rewards/student_margin": 3.9039013385772705, "rewards/teacher_margin": 0.0, "step": 2120 }, { "epoch": 0.33, "grad_norm": 16.375, "learning_rate": 4.212284873253926e-06, "logits/chosen": 2.157104969024658, "logits/rejected": 2.0904102325439453, "logps/chosen": -408.9845886230469, "logps/rejected": -471.6368713378906, "loss": 0.2616, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7046324014663696, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0994210243225098, "rewards/student_margin": 3.804053783416748, "rewards/teacher_margin": 0.0, "step": 2130 }, { "epoch": 0.34, "grad_norm": 19.875, "learning_rate": 4.202293082229385e-06, "logits/chosen": 2.399350643157959, "logits/rejected": 2.261389970779419, "logps/chosen": -359.5699462890625, "logps/rejected": -410.29449462890625, "loss": 0.2595, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.14682084321975708, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3434479236602783, "rewards/student_margin": 3.196627378463745, "rewards/teacher_margin": 0.0, "step": 2140 }, { "epoch": 0.34, "grad_norm": 14.0, "learning_rate": 4.192250333880045e-06, "logits/chosen": 2.157667398452759, "logits/rejected": 2.443382978439331, "logps/chosen": -393.087890625, "logps/rejected": -494.3221740722656, "loss": 0.2168, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3280692994594574, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.877622365951538, "rewards/student_margin": 3.2056918144226074, "rewards/teacher_margin": 0.0, "step": 2150 }, { "epoch": 0.34, "grad_norm": 20.375, "learning_rate": 4.182156928830749e-06, "logits/chosen": 2.2621493339538574, "logits/rejected": 2.099200963973999, "logps/chosen": -344.75665283203125, "logps/rejected": -392.2640075683594, "loss": 0.3052, "rewards/accuracies": 0.7333333492279053, "rewards/chosen": 0.17640702426433563, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0423808097839355, "rewards/student_margin": 3.218787670135498, "rewards/teacher_margin": 0.0, "step": 2160 }, { "epoch": 0.34, "grad_norm": 18.25, "learning_rate": 4.172013169222722e-06, "logits/chosen": 2.442689895629883, "logits/rejected": 2.1981256008148193, "logps/chosen": -390.1517639160156, "logps/rejected": -449.3399353027344, "loss": 0.3174, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4473528265953064, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.682013750076294, "rewards/student_margin": 3.129366874694824, "rewards/teacher_margin": 0.0, "step": 2170 }, { "epoch": 0.34, "grad_norm": 12.375, "learning_rate": 4.1618193587045305e-06, "logits/chosen": 2.3212926387786865, "logits/rejected": 2.446316957473755, "logps/chosen": -386.0205383300781, "logps/rejected": -436.2451171875, "loss": 0.2519, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 1.0397193431854248, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4377517700195312, "rewards/student_margin": 4.477471351623535, "rewards/teacher_margin": 0.0, "step": 2180 }, { "epoch": 0.34, "grad_norm": 17.75, "learning_rate": 4.151575802422992e-06, "logits/chosen": 2.4286575317382812, "logits/rejected": 2.3589932918548584, "logps/chosen": -369.14410400390625, "logps/rejected": -459.66552734375, "loss": 0.2938, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.47952526807785034, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4370856285095215, "rewards/student_margin": 3.9166111946105957, "rewards/teacher_margin": 0.0, "step": 2190 }, { "epoch": 0.34, "grad_norm": 20.625, "learning_rate": 4.141282807014034e-06, "logits/chosen": 2.086730480194092, "logits/rejected": 2.1971802711486816, "logps/chosen": -341.469482421875, "logps/rejected": -431.07745361328125, "loss": 0.274, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.5356088876724243, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.769986391067505, "rewards/student_margin": 3.3055953979492188, "rewards/teacher_margin": 0.0, "step": 2200 }, { "epoch": 0.35, "grad_norm": 22.875, "learning_rate": 4.130940680593527e-06, "logits/chosen": 2.2593677043914795, "logits/rejected": 2.264709949493408, "logps/chosen": -340.01263427734375, "logps/rejected": -404.6569519042969, "loss": 0.3011, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.45611804723739624, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.737945079803467, "rewards/student_margin": 3.1940629482269287, "rewards/teacher_margin": 0.0, "step": 2210 }, { "epoch": 0.35, "grad_norm": 16.125, "learning_rate": 4.120549732748051e-06, "logits/chosen": 2.4592833518981934, "logits/rejected": 2.4011738300323486, "logps/chosen": -382.6155700683594, "logps/rejected": -420.83087158203125, "loss": 0.2169, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.25808972120285034, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5128509998321533, "rewards/student_margin": 3.7709407806396484, "rewards/teacher_margin": 0.0, "step": 2220 }, { "epoch": 0.35, "grad_norm": 15.4375, "learning_rate": 4.1101102745256335e-06, "logits/chosen": 2.310042381286621, "logits/rejected": 2.509584903717041, "logps/chosen": -425.1043395996094, "logps/rejected": -517.6640014648438, "loss": 0.349, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5127886533737183, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.159834384918213, "rewards/student_margin": 3.6726231575012207, "rewards/teacher_margin": 0.0, "step": 2230 }, { "epoch": 0.35, "grad_norm": 18.125, "learning_rate": 4.0996226184264355e-06, "logits/chosen": 2.5404293537139893, "logits/rejected": 2.287459373474121, "logps/chosen": -394.8978271484375, "logps/rejected": -383.2523193359375, "loss": 0.2622, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.1426587700843811, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6931746006011963, "rewards/student_margin": 2.8358333110809326, "rewards/teacher_margin": 0.0, "step": 2240 }, { "epoch": 0.35, "grad_norm": 19.625, "learning_rate": 4.0890870783934e-06, "logits/chosen": 2.3715660572052, "logits/rejected": 2.264880418777466, "logps/chosen": -348.4560546875, "logps/rejected": -352.72491455078125, "loss": 0.2587, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.06754124909639359, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2925896644592285, "rewards/student_margin": 3.360131025314331, "rewards/teacher_margin": 0.0, "step": 2250 }, { "epoch": 0.35, "grad_norm": 13.1875, "learning_rate": 4.0785039698028495e-06, "logits/chosen": 2.0120110511779785, "logits/rejected": 2.2751669883728027, "logps/chosen": -363.1047668457031, "logps/rejected": -449.82659912109375, "loss": 0.283, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.26059380173683167, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4347293376922607, "rewards/student_margin": 3.174135684967041, "rewards/teacher_margin": 0.0, "step": 2260 }, { "epoch": 0.36, "grad_norm": 20.5, "learning_rate": 4.067873609455052e-06, "logits/chosen": 2.1835646629333496, "logits/rejected": 2.3163673877716064, "logps/chosen": -342.4147033691406, "logps/rejected": -429.95904541015625, "loss": 0.3188, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.08957010507583618, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6255412101745605, "rewards/student_margin": 3.715111494064331, "rewards/teacher_margin": 0.0, "step": 2270 }, { "epoch": 0.36, "grad_norm": 12.0, "learning_rate": 4.057196315564732e-06, "logits/chosen": 2.1850028038024902, "logits/rejected": 2.3072800636291504, "logps/chosen": -380.82537841796875, "logps/rejected": -479.2842712402344, "loss": 0.262, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.27549538016319275, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.265378475189209, "rewards/student_margin": 4.5408735275268555, "rewards/teacher_margin": 0.0, "step": 2280 }, { "epoch": 0.36, "grad_norm": 20.875, "learning_rate": 4.0464724077515474e-06, "logits/chosen": 2.1061110496520996, "logits/rejected": 2.158477306365967, "logps/chosen": -379.86883544921875, "logps/rejected": -438.06597900390625, "loss": 0.2517, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 1.038312554359436, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.1670308113098145, "rewards/student_margin": 3.205343723297119, "rewards/teacher_margin": 0.0, "step": 2290 }, { "epoch": 0.36, "grad_norm": 21.125, "learning_rate": 4.0357022070305205e-06, "logits/chosen": 1.8222116231918335, "logits/rejected": 2.140615463256836, "logps/chosen": -334.448974609375, "logps/rejected": -552.8897705078125, "loss": 0.2521, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.0023529292084276676, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9737651348114014, "rewards/student_margin": 3.9761176109313965, "rewards/teacher_margin": 0.0, "step": 2300 }, { "epoch": 0.36, "grad_norm": 23.25, "learning_rate": 4.024886035802432e-06, "logits/chosen": 2.3093795776367188, "logits/rejected": 2.3128671646118164, "logps/chosen": -412.97149658203125, "logps/rejected": -513.7916259765625, "loss": 0.282, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.7244001626968384, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4940237998962402, "rewards/student_margin": 4.218424320220947, "rewards/teacher_margin": 0.0, "step": 2310 }, { "epoch": 0.36, "grad_norm": 8.0, "learning_rate": 4.014024217844167e-06, "logits/chosen": 2.0881800651550293, "logits/rejected": 1.9600870609283447, "logps/chosen": -345.86590576171875, "logps/rejected": -434.638671875, "loss": 0.2027, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.19055531919002533, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7857460975646973, "rewards/student_margin": 3.9763011932373047, "rewards/teacher_margin": 0.0, "step": 2320 }, { "epoch": 0.37, "grad_norm": 20.375, "learning_rate": 4.0031170782990214e-06, "logits/chosen": 2.2908711433410645, "logits/rejected": 2.211184024810791, "logps/chosen": -350.46783447265625, "logps/rejected": -425.9938049316406, "loss": 0.2841, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.7492457628250122, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.77624249458313, "rewards/student_margin": 3.5254883766174316, "rewards/teacher_margin": 0.0, "step": 2330 }, { "epoch": 0.37, "grad_norm": 22.625, "learning_rate": 3.992164943666972e-06, "logits/chosen": 2.4556171894073486, "logits/rejected": 2.6449596881866455, "logps/chosen": -358.88970947265625, "logps/rejected": -477.9335021972656, "loss": 0.2542, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.30440419912338257, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9228973388671875, "rewards/student_margin": 4.227301597595215, "rewards/teacher_margin": 0.0, "step": 2340 }, { "epoch": 0.37, "grad_norm": 15.6875, "learning_rate": 3.981168141794902e-06, "logits/chosen": 2.2117793560028076, "logits/rejected": 2.430795431137085, "logps/chosen": -351.9301452636719, "logps/rejected": -470.13330078125, "loss": 0.1932, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.10783439874649048, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.2909650802612305, "rewards/student_margin": 4.398799419403076, "rewards/teacher_margin": 0.0, "step": 2350 }, { "epoch": 0.37, "grad_norm": 20.0, "learning_rate": 3.9701270018667895e-06, "logits/chosen": 2.1149024963378906, "logits/rejected": 2.275358200073242, "logps/chosen": -363.36993408203125, "logps/rejected": -484.3304138183594, "loss": 0.2609, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.05589407682418823, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8164174556732178, "rewards/student_margin": 2.87231183052063, "rewards/teacher_margin": 0.0, "step": 2360 }, { "epoch": 0.37, "grad_norm": 7.6875, "learning_rate": 3.959041854393846e-06, "logits/chosen": 2.4439048767089844, "logits/rejected": 2.4801313877105713, "logps/chosen": -403.86810302734375, "logps/rejected": -455.4736328125, "loss": 0.2527, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.7013717889785767, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.079593181610107, "rewards/student_margin": 3.3782215118408203, "rewards/teacher_margin": 0.0, "step": 2370 }, { "epoch": 0.37, "grad_norm": 17.625, "learning_rate": 3.947913031204631e-06, "logits/chosen": 2.4617161750793457, "logits/rejected": 2.375124454498291, "logps/chosen": -413.83782958984375, "logps/rejected": -420.19696044921875, "loss": 0.3199, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.08388874679803848, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4833407402038574, "rewards/student_margin": 3.567229747772217, "rewards/teacher_margin": 0.0, "step": 2380 }, { "epoch": 0.37, "grad_norm": 18.875, "learning_rate": 3.936740865435116e-06, "logits/chosen": 2.2610387802124023, "logits/rejected": 2.328009843826294, "logps/chosen": -362.6685791015625, "logps/rejected": -449.802734375, "loss": 0.3415, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5953782200813293, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6166610717773438, "rewards/student_margin": 3.21203875541687, "rewards/teacher_margin": 0.0, "step": 2390 }, { "epoch": 0.38, "grad_norm": 11.4375, "learning_rate": 3.925525691518711e-06, "logits/chosen": 2.4029855728149414, "logits/rejected": 2.4755091667175293, "logps/chosen": -375.99481201171875, "logps/rejected": -435.3857421875, "loss": 0.2572, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -0.1986234039068222, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.678576946258545, "rewards/student_margin": 4.479953289031982, "rewards/teacher_margin": 0.0, "step": 2400 }, { "epoch": 0.38, "grad_norm": 17.75, "learning_rate": 3.9142678451762516e-06, "logits/chosen": 2.146836757659912, "logits/rejected": 2.232630968093872, "logps/chosen": -428.52435302734375, "logps/rejected": -498.97882080078125, "loss": 0.2474, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.22230203449726105, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.329835414886475, "rewards/student_margin": 4.55213737487793, "rewards/teacher_margin": 0.0, "step": 2410 }, { "epoch": 0.38, "grad_norm": 14.25, "learning_rate": 3.9029676634059565e-06, "logits/chosen": 2.550261974334717, "logits/rejected": 2.011990785598755, "logps/chosen": -403.0806579589844, "logps/rejected": -444.563720703125, "loss": 0.2699, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.7714961767196655, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.248587131500244, "rewards/student_margin": 4.020082950592041, "rewards/teacher_margin": 0.0, "step": 2420 }, { "epoch": 0.38, "grad_norm": 12.75, "learning_rate": 3.891625484473331e-06, "logits/chosen": 2.1088860034942627, "logits/rejected": 2.4027085304260254, "logps/chosen": -378.26434326171875, "logps/rejected": -499.4009704589844, "loss": 0.2732, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.3703666627407074, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1761250495910645, "rewards/student_margin": 3.5464916229248047, "rewards/teacher_margin": 0.0, "step": 2430 }, { "epoch": 0.38, "grad_norm": 8.9375, "learning_rate": 3.88024164790105e-06, "logits/chosen": 2.406759738922119, "logits/rejected": 2.088613510131836, "logps/chosen": -394.82049560546875, "logps/rejected": -419.43975830078125, "loss": 0.243, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.41398367285728455, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0463218688964844, "rewards/student_margin": 3.460305690765381, "rewards/teacher_margin": 0.0, "step": 2440 }, { "epoch": 0.38, "grad_norm": 10.625, "learning_rate": 3.868816494458783e-06, "logits/chosen": 2.397116184234619, "logits/rejected": 2.3680195808410645, "logps/chosen": -374.85308837890625, "logps/rejected": -447.60882568359375, "loss": 0.2706, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.9430896043777466, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5239174365997314, "rewards/student_margin": 4.467007160186768, "rewards/teacher_margin": 0.0, "step": 2450 }, { "epoch": 0.39, "grad_norm": 25.125, "learning_rate": 3.857350366153007e-06, "logits/chosen": 2.3722329139709473, "logits/rejected": 2.278595447540283, "logps/chosen": -431.8951721191406, "logps/rejected": -490.44549560546875, "loss": 0.1818, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.17575711011886597, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.8241474628448486, "rewards/student_margin": 3.9999046325683594, "rewards/teacher_margin": 0.0, "step": 2460 }, { "epoch": 0.39, "grad_norm": 8.75, "learning_rate": 3.845843606216758e-06, "logits/chosen": 2.213392734527588, "logits/rejected": 2.548081636428833, "logps/chosen": -330.73162841796875, "logps/rejected": -474.1424865722656, "loss": 0.2506, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0669407993555069, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.437326431274414, "rewards/student_margin": 4.370385646820068, "rewards/teacher_margin": 0.0, "step": 2470 }, { "epoch": 0.39, "grad_norm": 12.75, "learning_rate": 3.83429655909936e-06, "logits/chosen": 2.271848440170288, "logits/rejected": 2.217095136642456, "logps/chosen": -395.7364196777344, "logps/rejected": -425.7189025878906, "loss": 0.2513, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.09038683772087097, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0842795372009277, "rewards/student_margin": 3.174666166305542, "rewards/teacher_margin": 0.0, "step": 2480 }, { "epoch": 0.39, "grad_norm": 10.375, "learning_rate": 3.8227095704561175e-06, "logits/chosen": 2.3145384788513184, "logits/rejected": 2.3819522857666016, "logps/chosen": -427.932373046875, "logps/rejected": -516.2389526367188, "loss": 0.2666, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.36437174677848816, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9510481357574463, "rewards/student_margin": 3.3154196739196777, "rewards/teacher_margin": 0.0, "step": 2490 }, { "epoch": 0.39, "grad_norm": 24.875, "learning_rate": 3.8110829871379622e-06, "logits/chosen": 2.4711530208587646, "logits/rejected": 2.221726894378662, "logps/chosen": -390.0331115722656, "logps/rejected": -426.604248046875, "loss": 0.3809, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3402051329612732, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.777930736541748, "rewards/student_margin": 4.118135452270508, "rewards/teacher_margin": 0.0, "step": 2500 }, { "epoch": 0.39, "grad_norm": 13.625, "learning_rate": 3.7994171571810756e-06, "logits/chosen": 2.2909791469573975, "logits/rejected": 2.004585027694702, "logps/chosen": -393.61016845703125, "logps/rejected": -423.322509765625, "loss": 0.2766, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.506365180015564, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3920435905456543, "rewards/student_margin": 3.898408889770508, "rewards/teacher_margin": 0.0, "step": 2510 }, { "epoch": 0.39, "grad_norm": 12.375, "learning_rate": 3.7877124297964666e-06, "logits/chosen": 2.2105064392089844, "logits/rejected": 2.2123634815216064, "logps/chosen": -336.07977294921875, "logps/rejected": -445.05889892578125, "loss": 0.245, "rewards/accuracies": 1.0, "rewards/chosen": 0.6864665150642395, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.234098434448242, "rewards/student_margin": 4.920565128326416, "rewards/teacher_margin": 0.0, "step": 2520 }, { "epoch": 0.4, "grad_norm": 18.875, "learning_rate": 3.7759691553595214e-06, "logits/chosen": 1.999338150024414, "logits/rejected": 2.18291974067688, "logps/chosen": -350.80419921875, "logps/rejected": -430.95745849609375, "loss": 0.3008, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -0.2831433415412903, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2123916149139404, "rewards/student_margin": 2.929248332977295, "rewards/teacher_margin": 0.0, "step": 2530 }, { "epoch": 0.4, "grad_norm": 13.875, "learning_rate": 3.7641876853995124e-06, "logits/chosen": 2.162703037261963, "logits/rejected": 2.210523843765259, "logps/chosen": -396.792724609375, "logps/rejected": -441.58648681640625, "loss": 0.2395, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6270571351051331, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6433043479919434, "rewards/student_margin": 3.2703616619110107, "rewards/teacher_margin": 0.0, "step": 2540 }, { "epoch": 0.4, "grad_norm": 14.4375, "learning_rate": 3.752368372589078e-06, "logits/chosen": 2.0900580883026123, "logits/rejected": 2.1564900875091553, "logps/chosen": -341.9209899902344, "logps/rejected": -429.7032165527344, "loss": 0.246, "rewards/accuracies": 1.0, "rewards/chosen": 0.6051188707351685, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7318496704101562, "rewards/student_margin": 3.3369686603546143, "rewards/teacher_margin": 0.0, "step": 2550 }, { "epoch": 0.4, "grad_norm": 27.25, "learning_rate": 3.7405115707336612e-06, "logits/chosen": 2.440706253051758, "logits/rejected": 2.1124367713928223, "logps/chosen": -407.38201904296875, "logps/rejected": -431.97833251953125, "loss": 0.2349, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.35553717613220215, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0866031646728516, "rewards/student_margin": 3.4421401023864746, "rewards/teacher_margin": 0.0, "step": 2560 }, { "epoch": 0.4, "grad_norm": 15.875, "learning_rate": 3.7286176347609282e-06, "logits/chosen": 2.2072603702545166, "logits/rejected": 2.1973233222961426, "logps/chosen": -338.1429443359375, "logps/rejected": -407.81378173828125, "loss": 0.2881, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 0.5499972105026245, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.049400568008423, "rewards/student_margin": 3.599397659301758, "rewards/teacher_margin": 0.0, "step": 2570 }, { "epoch": 0.4, "grad_norm": 10.875, "learning_rate": 3.716686920710128e-06, "logits/chosen": 2.2564117908477783, "logits/rejected": 2.2996981143951416, "logps/chosen": -350.6414794921875, "logps/rejected": -383.01763916015625, "loss": 0.2191, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.833136260509491, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1922011375427246, "rewards/student_margin": 4.025337219238281, "rewards/teacher_margin": 0.0, "step": 2580 }, { "epoch": 0.41, "grad_norm": 22.25, "learning_rate": 3.7047197857214505e-06, "logits/chosen": 1.9421443939208984, "logits/rejected": 2.0635464191436768, "logps/chosen": -370.78497314453125, "logps/rejected": -494.9637756347656, "loss": 0.2209, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.5799004435539246, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5687034130096436, "rewards/student_margin": 4.148603439331055, "rewards/teacher_margin": 0.0, "step": 2590 }, { "epoch": 0.41, "grad_norm": 5.75, "learning_rate": 3.692716588025327e-06, "logits/chosen": 2.3203647136688232, "logits/rejected": 2.2240707874298096, "logps/chosen": -398.8072814941406, "logps/rejected": -430.3450622558594, "loss": 0.2411, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 1.3029650449752808, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.511976957321167, "rewards/student_margin": 3.814941883087158, "rewards/teacher_margin": 0.0, "step": 2600 }, { "epoch": 0.41, "grad_norm": 16.0, "learning_rate": 3.6806776869317074e-06, "logits/chosen": 2.2366137504577637, "logits/rejected": 2.091334342956543, "logps/chosen": -355.5997009277344, "logps/rejected": -407.6437072753906, "loss": 0.2254, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.45720139145851135, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1405997276306152, "rewards/student_margin": 3.5978012084960938, "rewards/teacher_margin": 0.0, "step": 2610 }, { "epoch": 0.41, "grad_norm": 14.0625, "learning_rate": 3.668603442819307e-06, "logits/chosen": 2.1990723609924316, "logits/rejected": 2.0838794708251953, "logps/chosen": -357.3851013183594, "logps/rejected": -447.6002502441406, "loss": 0.2121, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.3204575777053833, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3603832721710205, "rewards/student_margin": 3.6808407306671143, "rewards/teacher_margin": 0.0, "step": 2620 }, { "epoch": 0.41, "grad_norm": 18.875, "learning_rate": 3.6564942171248164e-06, "logits/chosen": 2.3496224880218506, "logits/rejected": 2.397185802459717, "logps/chosen": -388.1718444824219, "logps/rejected": -438.5511169433594, "loss": 0.2741, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 1.1179084777832031, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9450879096984863, "rewards/student_margin": 4.062995910644531, "rewards/teacher_margin": 0.0, "step": 2630 }, { "epoch": 0.41, "grad_norm": 15.6875, "learning_rate": 3.6443503723320837e-06, "logits/chosen": 2.1123147010803223, "logits/rejected": 2.1328189373016357, "logps/chosen": -350.64141845703125, "logps/rejected": -442.03369140625, "loss": 0.2041, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2710772454738617, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1655421257019043, "rewards/student_margin": 3.4366188049316406, "rewards/teacher_margin": 0.0, "step": 2640 }, { "epoch": 0.42, "grad_norm": 18.5, "learning_rate": 3.632172271961264e-06, "logits/chosen": 2.43031644821167, "logits/rejected": 2.099778413772583, "logps/chosen": -359.9398498535156, "logps/rejected": -423.88055419921875, "loss": 0.2262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.40013784170150757, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6482977867126465, "rewards/student_margin": 3.0484354496002197, "rewards/teacher_margin": 0.0, "step": 2650 }, { "epoch": 0.42, "grad_norm": 10.375, "learning_rate": 3.619960280557934e-06, "logits/chosen": 2.40014910697937, "logits/rejected": 2.2928993701934814, "logps/chosen": -368.8309631347656, "logps/rejected": -468.2025451660156, "loss": 0.2987, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.3458234369754791, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.208742618560791, "rewards/student_margin": 3.5545654296875, "rewards/teacher_margin": 0.0, "step": 2660 }, { "epoch": 0.42, "grad_norm": 15.1875, "learning_rate": 3.6077147636821847e-06, "logits/chosen": 2.2615249156951904, "logits/rejected": 2.1530754566192627, "logps/chosen": -352.0721130371094, "logps/rejected": -375.41595458984375, "loss": 0.2542, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.42203807830810547, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5779712200164795, "rewards/student_margin": 4.000009059906006, "rewards/teacher_margin": 0.0, "step": 2670 }, { "epoch": 0.42, "grad_norm": 22.0, "learning_rate": 3.595436087897675e-06, "logits/chosen": 2.2553186416625977, "logits/rejected": 2.326979875564575, "logps/chosen": -342.98419189453125, "logps/rejected": -382.9024353027344, "loss": 0.2293, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.22376033663749695, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.329378128051758, "rewards/student_margin": 3.553138256072998, "rewards/teacher_margin": 0.0, "step": 2680 }, { "epoch": 0.42, "grad_norm": 18.625, "learning_rate": 3.5831246207606597e-06, "logits/chosen": 2.2838730812072754, "logits/rejected": 2.118323802947998, "logps/chosen": -337.77447509765625, "logps/rejected": -398.12261962890625, "loss": 0.3161, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5371744632720947, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0975286960601807, "rewards/student_margin": 3.6347031593322754, "rewards/teacher_margin": 0.0, "step": 2690 }, { "epoch": 0.42, "grad_norm": 13.8125, "learning_rate": 3.570780730808986e-06, "logits/chosen": 2.3086845874786377, "logits/rejected": 2.6637866497039795, "logps/chosen": -360.9557189941406, "logps/rejected": -434.5098571777344, "loss": 0.262, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.436821848154068, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9482104778289795, "rewards/student_margin": 4.385032653808594, "rewards/teacher_margin": 0.0, "step": 2700 }, { "epoch": 0.42, "grad_norm": 13.6875, "learning_rate": 3.5584047875510646e-06, "logits/chosen": 2.2629265785217285, "logits/rejected": 2.065871000289917, "logps/chosen": -417.76617431640625, "logps/rejected": -439.2709045410156, "loss": 0.2402, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.49786892533302307, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.419957399368286, "rewards/student_margin": 3.9178268909454346, "rewards/teacher_margin": 0.0, "step": 2710 }, { "epoch": 0.43, "grad_norm": 15.4375, "learning_rate": 3.5459971614548056e-06, "logits/chosen": 2.2914280891418457, "logits/rejected": 2.219886302947998, "logps/chosen": -360.3038635253906, "logps/rejected": -411.9893493652344, "loss": 0.217, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.320984423160553, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.688615083694458, "rewards/student_margin": 3.0095996856689453, "rewards/teacher_margin": 0.0, "step": 2720 }, { "epoch": 0.43, "grad_norm": 13.875, "learning_rate": 3.5335582239365297e-06, "logits/chosen": 2.171496629714966, "logits/rejected": 2.2908554077148438, "logps/chosen": -341.51031494140625, "logps/rejected": -398.7691650390625, "loss": 0.2738, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.7232702970504761, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.734180212020874, "rewards/student_margin": 3.4574503898620605, "rewards/teacher_margin": 0.0, "step": 2730 }, { "epoch": 0.43, "grad_norm": 18.75, "learning_rate": 3.521088347349848e-06, "logits/chosen": 2.4681715965270996, "logits/rejected": 2.0134692192077637, "logps/chosen": -391.6719665527344, "logps/rejected": -397.56640625, "loss": 0.2595, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 1.3762166500091553, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.694150447845459, "rewards/student_margin": 4.070366859436035, "rewards/teacher_margin": 0.0, "step": 2740 }, { "epoch": 0.43, "grad_norm": 13.6875, "learning_rate": 3.508587904974522e-06, "logits/chosen": 2.3746092319488525, "logits/rejected": 2.262995958328247, "logps/chosen": -337.2062072753906, "logps/rejected": -355.24407958984375, "loss": 0.2091, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.7187009453773499, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7206175327301025, "rewards/student_margin": 3.4393184185028076, "rewards/teacher_margin": 0.0, "step": 2750 }, { "epoch": 0.43, "grad_norm": 14.1875, "learning_rate": 3.4960572710052804e-06, "logits/chosen": 2.2188398838043213, "logits/rejected": 2.039242744445801, "logps/chosen": -368.8310546875, "logps/rejected": -416.52642822265625, "loss": 0.2423, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.21069283783435822, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.2424139976501465, "rewards/student_margin": 2.453106641769409, "rewards/teacher_margin": 0.0, "step": 2760 }, { "epoch": 0.43, "grad_norm": 13.25, "learning_rate": 3.483496820540626e-06, "logits/chosen": 2.204479694366455, "logits/rejected": 2.2028932571411133, "logps/chosen": -358.66546630859375, "logps/rejected": -396.17071533203125, "loss": 0.2279, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.269366979598999, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2958297729492188, "rewards/student_margin": 3.5651965141296387, "rewards/teacher_margin": 0.0, "step": 2770 }, { "epoch": 0.44, "grad_norm": 18.75, "learning_rate": 3.470906929571605e-06, "logits/chosen": 2.4260220527648926, "logits/rejected": 1.9704691171646118, "logps/chosen": -423.75543212890625, "logps/rejected": -363.49462890625, "loss": 0.2542, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.3744148313999176, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4745655059814453, "rewards/student_margin": 3.848979949951172, "rewards/teacher_margin": 0.0, "step": 2780 }, { "epoch": 0.44, "grad_norm": 16.5, "learning_rate": 3.458287974970547e-06, "logits/chosen": 2.2904181480407715, "logits/rejected": 2.2327446937561035, "logps/chosen": -340.1028747558594, "logps/rejected": -411.589111328125, "loss": 0.1955, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.7340807914733887, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.4019551277160645, "rewards/student_margin": 5.136035442352295, "rewards/teacher_margin": 0.0, "step": 2790 }, { "epoch": 0.44, "grad_norm": 7.34375, "learning_rate": 3.4456403344797905e-06, "logits/chosen": 2.2651209831237793, "logits/rejected": 2.257246732711792, "logps/chosen": -327.26397705078125, "logps/rejected": -470.7875061035156, "loss": 0.2437, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.10406925529241562, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.128390312194824, "rewards/student_margin": 4.024320602416992, "rewards/teacher_margin": 0.0, "step": 2800 }, { "epoch": 0.44, "grad_norm": 14.4375, "learning_rate": 3.4329643867003715e-06, "logits/chosen": 2.299778461456299, "logits/rejected": 2.36427903175354, "logps/chosen": -343.0499572753906, "logps/rejected": -431.79962158203125, "loss": 0.2522, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": -0.22638142108917236, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.024296760559082, "rewards/student_margin": 3.79791522026062, "rewards/teacher_margin": 0.0, "step": 2810 }, { "epoch": 0.44, "grad_norm": 7.5625, "learning_rate": 3.4202605110806906e-06, "logits/chosen": 2.2362706661224365, "logits/rejected": 2.1268322467803955, "logps/chosen": -409.0557556152344, "logps/rejected": -466.540771484375, "loss": 0.2573, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.1055247038602829, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8472554683685303, "rewards/student_margin": 2.95278000831604, "rewards/teacher_margin": 0.0, "step": 2820 }, { "epoch": 0.44, "grad_norm": 13.0, "learning_rate": 3.407529087905156e-06, "logits/chosen": 2.4477391242980957, "logits/rejected": 2.393184185028076, "logps/chosen": -353.10980224609375, "logps/rejected": -432.911376953125, "loss": 0.2196, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.18899457156658173, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2837612628936768, "rewards/student_margin": 3.4727559089660645, "rewards/teacher_margin": 0.0, "step": 2830 }, { "epoch": 0.45, "grad_norm": 16.5, "learning_rate": 3.3947704982827968e-06, "logits/chosen": 2.367135524749756, "logits/rejected": 2.1990301609039307, "logps/chosen": -413.9443359375, "logps/rejected": -445.1016540527344, "loss": 0.2459, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.17410914599895477, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.982217311859131, "rewards/student_margin": 4.1563262939453125, "rewards/teacher_margin": 0.0, "step": 2840 }, { "epoch": 0.45, "grad_norm": 19.0, "learning_rate": 3.3819851241358592e-06, "logits/chosen": 2.2958569526672363, "logits/rejected": 2.32338285446167, "logps/chosen": -364.51873779296875, "logps/rejected": -439.712890625, "loss": 0.2578, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.33544084429740906, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7783234119415283, "rewards/student_margin": 4.113764762878418, "rewards/teacher_margin": 0.0, "step": 2850 }, { "epoch": 0.45, "grad_norm": 10.3125, "learning_rate": 3.3691733481883693e-06, "logits/chosen": 2.0372023582458496, "logits/rejected": 2.293612003326416, "logps/chosen": -356.5626220703125, "logps/rejected": -526.7611083984375, "loss": 0.2338, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.6936900019645691, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.731656312942505, "rewards/student_margin": 4.425345420837402, "rewards/teacher_margin": 0.0, "step": 2860 }, { "epoch": 0.45, "grad_norm": 15.5, "learning_rate": 3.3563355539546795e-06, "logits/chosen": 2.4403467178344727, "logits/rejected": 2.4208643436431885, "logps/chosen": -415.3199157714844, "logps/rejected": -452.3612365722656, "loss": 0.1906, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.6735491752624512, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2452950477600098, "rewards/student_margin": 3.918844223022461, "rewards/teacher_margin": 0.0, "step": 2870 }, { "epoch": 0.45, "grad_norm": 9.875, "learning_rate": 3.3434721257279853e-06, "logits/chosen": 2.1227784156799316, "logits/rejected": 2.0975255966186523, "logps/chosen": -375.83953857421875, "logps/rejected": -450.77471923828125, "loss": 0.244, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5497146844863892, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.343022108078003, "rewards/student_margin": 3.8927369117736816, "rewards/teacher_margin": 0.0, "step": 2880 }, { "epoch": 0.45, "grad_norm": 19.125, "learning_rate": 3.3305834485688267e-06, "logits/chosen": 2.2371647357940674, "logits/rejected": 2.3524863719940186, "logps/chosen": -397.8186950683594, "logps/rejected": -450.29937744140625, "loss": 0.2679, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4096117913722992, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.763610363006592, "rewards/student_margin": 4.173222541809082, "rewards/teacher_margin": 0.0, "step": 2890 }, { "epoch": 0.45, "grad_norm": 9.5625, "learning_rate": 3.3176699082935546e-06, "logits/chosen": 2.3676345348358154, "logits/rejected": 2.309113025665283, "logps/chosen": -377.0630187988281, "logps/rejected": -424.2857360839844, "loss": 0.2005, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.6000615358352661, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.160989284515381, "rewards/student_margin": 3.7610511779785156, "rewards/teacher_margin": 0.0, "step": 2900 }, { "epoch": 0.46, "grad_norm": 14.4375, "learning_rate": 3.304731891462788e-06, "logits/chosen": 2.3007915019989014, "logits/rejected": 2.194401502609253, "logps/chosen": -339.5563049316406, "logps/rejected": -427.13140869140625, "loss": 0.2371, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.6036936044692993, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.199122905731201, "rewards/student_margin": 3.802816390991211, "rewards/teacher_margin": 0.0, "step": 2910 }, { "epoch": 0.46, "grad_norm": 17.375, "learning_rate": 3.2917697853698387e-06, "logits/chosen": 2.25099515914917, "logits/rejected": 2.391965389251709, "logps/chosen": -324.9006652832031, "logps/rejected": -414.39544677734375, "loss": 0.2107, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5956756472587585, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.658155918121338, "rewards/student_margin": 3.253831386566162, "rewards/teacher_margin": 0.0, "step": 2920 }, { "epoch": 0.46, "grad_norm": 16.375, "learning_rate": 3.2787839780291197e-06, "logits/chosen": 2.2894983291625977, "logits/rejected": 2.2689950466156006, "logps/chosen": -344.76959228515625, "logps/rejected": -401.6066589355469, "loss": 0.2829, "rewards/accuracies": 0.7666667103767395, "rewards/chosen": 0.1024453192949295, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.1665778160095215, "rewards/student_margin": 4.2690229415893555, "rewards/teacher_margin": 0.0, "step": 2930 }, { "epoch": 0.46, "grad_norm": 15.1875, "learning_rate": 3.2657748581645287e-06, "logits/chosen": 2.4147586822509766, "logits/rejected": 2.314513921737671, "logps/chosen": -375.0296325683594, "logps/rejected": -482.91497802734375, "loss": 0.1915, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.746520459651947, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.227827548980713, "rewards/student_margin": 4.974348068237305, "rewards/teacher_margin": 0.0, "step": 2940 }, { "epoch": 0.46, "grad_norm": 16.5, "learning_rate": 3.252742815197813e-06, "logits/chosen": 2.2344329357147217, "logits/rejected": 2.0863192081451416, "logps/chosen": -362.58642578125, "logps/rejected": -432.3460998535156, "loss": 0.1902, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 1.0001742839813232, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.413084983825684, "rewards/student_margin": 5.413259029388428, "rewards/teacher_margin": 0.0, "step": 2950 }, { "epoch": 0.46, "grad_norm": 21.0, "learning_rate": 3.239688239236911e-06, "logits/chosen": 2.0183589458465576, "logits/rejected": 2.1004397869110107, "logps/chosen": -354.41778564453125, "logps/rejected": -453.76190185546875, "loss": 0.266, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3502083122730255, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.139425754547119, "rewards/student_margin": 3.4896340370178223, "rewards/teacher_margin": 0.0, "step": 2960 }, { "epoch": 0.47, "grad_norm": 19.0, "learning_rate": 3.226611521064278e-06, "logits/chosen": 2.215217113494873, "logits/rejected": 2.269537925720215, "logps/chosen": -387.2310485839844, "logps/rejected": -427.2652282714844, "loss": 0.2014, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.43073707818984985, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9213948249816895, "rewards/student_margin": 3.3521316051483154, "rewards/teacher_margin": 0.0, "step": 2970 }, { "epoch": 0.47, "grad_norm": 7.1875, "learning_rate": 3.213513052125182e-06, "logits/chosen": 2.264648914337158, "logits/rejected": 2.368467330932617, "logps/chosen": -337.35113525390625, "logps/rejected": -404.4619140625, "loss": 0.2343, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.4190541207790375, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3140735626220703, "rewards/student_margin": 3.7331275939941406, "rewards/teacher_margin": 0.0, "step": 2980 }, { "epoch": 0.47, "grad_norm": 11.3125, "learning_rate": 3.200393224515993e-06, "logits/chosen": 2.347146987915039, "logits/rejected": 2.3630881309509277, "logps/chosen": -351.7055358886719, "logps/rejected": -453.29315185546875, "loss": 0.2479, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8046822547912598, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.515284776687622, "rewards/student_margin": 4.3199663162231445, "rewards/teacher_margin": 0.0, "step": 2990 }, { "epoch": 0.47, "grad_norm": 19.5, "learning_rate": 3.1872524309724412e-06, "logits/chosen": 2.610239267349243, "logits/rejected": 2.3775582313537598, "logps/chosen": -358.210693359375, "logps/rejected": -367.8978576660156, "loss": 0.2619, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.8314231038093567, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6737313270568848, "rewards/student_margin": 3.5051543712615967, "rewards/teacher_margin": 0.0, "step": 3000 }, { "epoch": 0.47, "grad_norm": 19.5, "learning_rate": 3.1740910648578614e-06, "logits/chosen": 2.3571410179138184, "logits/rejected": 2.2388644218444824, "logps/chosen": -416.7958984375, "logps/rejected": -477.7660217285156, "loss": 0.2504, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 1.0125916004180908, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9751479625701904, "rewards/student_margin": 3.9877395629882812, "rewards/teacher_margin": 0.0, "step": 3010 }, { "epoch": 0.47, "grad_norm": 10.5, "learning_rate": 3.1609095201514193e-06, "logits/chosen": 2.0754928588867188, "logits/rejected": 2.0234599113464355, "logps/chosen": -370.8299255371094, "logps/rejected": -442.7757873535156, "loss": 0.2585, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.550487756729126, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.60387921333313, "rewards/student_margin": 4.154366970062256, "rewards/teacher_margin": 0.0, "step": 3020 }, { "epoch": 0.47, "grad_norm": 20.875, "learning_rate": 3.1477081914363174e-06, "logits/chosen": 2.3996293544769287, "logits/rejected": 2.509965658187866, "logps/chosen": -369.8542175292969, "logps/rejected": -506.729248046875, "loss": 0.2189, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.5509821176528931, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.141905784606934, "rewards/student_margin": 4.692887306213379, "rewards/teacher_margin": 0.0, "step": 3030 }, { "epoch": 0.48, "grad_norm": 12.125, "learning_rate": 3.1344874738879823e-06, "logits/chosen": 2.0692124366760254, "logits/rejected": 2.0163118839263916, "logps/chosen": -438.5457458496094, "logps/rejected": -467.59649658203125, "loss": 0.2616, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.7837346792221069, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.3626346588134766, "rewards/student_margin": 3.146369218826294, "rewards/teacher_margin": 0.0, "step": 3040 }, { "epoch": 0.48, "grad_norm": 6.0, "learning_rate": 3.121247763262235e-06, "logits/chosen": 2.377230167388916, "logits/rejected": 2.107757329940796, "logps/chosen": -429.8155822753906, "logps/rejected": -426.9541015625, "loss": 0.2401, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2864219546318054, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.094973087310791, "rewards/student_margin": 3.381394863128662, "rewards/teacher_margin": 0.0, "step": 3050 }, { "epoch": 0.48, "grad_norm": 13.1875, "learning_rate": 3.1079894558834474e-06, "logits/chosen": 2.1431241035461426, "logits/rejected": 2.100407838821411, "logps/chosen": -343.11309814453125, "logps/rejected": -378.8443298339844, "loss": 0.2617, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.7765836119651794, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6467397212982178, "rewards/student_margin": 3.423323392868042, "rewards/teacher_margin": 0.0, "step": 3060 }, { "epoch": 0.48, "grad_norm": 11.0, "learning_rate": 3.0947129486326745e-06, "logits/chosen": 2.2760090827941895, "logits/rejected": 2.122431993484497, "logps/chosen": -363.107666015625, "logps/rejected": -427.31500244140625, "loss": 0.2324, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -0.1468345671892166, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.045578956604004, "rewards/student_margin": 3.898744583129883, "rewards/teacher_margin": 0.0, "step": 3070 }, { "epoch": 0.48, "grad_norm": 11.3125, "learning_rate": 3.0814186389357765e-06, "logits/chosen": 2.330493688583374, "logits/rejected": 2.0978925228118896, "logps/chosen": -407.52447509765625, "logps/rejected": -391.86505126953125, "loss": 0.238, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.5319676995277405, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.955634117126465, "rewards/student_margin": 3.4876017570495605, "rewards/teacher_margin": 0.0, "step": 3080 }, { "epoch": 0.48, "grad_norm": 15.75, "learning_rate": 3.068106924751521e-06, "logits/chosen": 2.051382303237915, "logits/rejected": 2.0976693630218506, "logps/chosen": -373.179931640625, "logps/rejected": -411.8519592285156, "loss": 0.2428, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3659347593784332, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0203680992126465, "rewards/student_margin": 2.654433250427246, "rewards/teacher_margin": 0.0, "step": 3090 }, { "epoch": 0.49, "grad_norm": 15.0, "learning_rate": 3.0547782045596708e-06, "logits/chosen": 2.290377616882324, "logits/rejected": 2.4751532077789307, "logps/chosen": -355.5189208984375, "logps/rejected": -467.42730712890625, "loss": 0.2155, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.02586769498884678, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.694409132003784, "rewards/student_margin": 3.7202765941619873, "rewards/teacher_margin": 0.0, "step": 3100 }, { "epoch": 0.49, "grad_norm": 16.5, "learning_rate": 3.0414328773490546e-06, "logits/chosen": 2.4533419609069824, "logits/rejected": 2.193533182144165, "logps/chosen": -371.39404296875, "logps/rejected": -402.93121337890625, "loss": 0.2096, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.48314565420150757, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1560425758361816, "rewards/student_margin": 3.639188051223755, "rewards/teacher_margin": 0.0, "step": 3110 }, { "epoch": 0.49, "grad_norm": 19.375, "learning_rate": 3.028071342605625e-06, "logits/chosen": 2.1506896018981934, "logits/rejected": 2.169945240020752, "logps/chosen": -346.757568359375, "logps/rejected": -409.0155334472656, "loss": 0.3379, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2748657763004303, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.875403881072998, "rewards/student_margin": 3.1502695083618164, "rewards/teacher_margin": 0.0, "step": 3120 }, { "epoch": 0.49, "grad_norm": 12.4375, "learning_rate": 3.0146940003004993e-06, "logits/chosen": 2.507147789001465, "logits/rejected": 2.5541577339172363, "logps/chosen": -395.2455749511719, "logps/rejected": -476.75238037109375, "loss": 0.2376, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.7538625001907349, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.055445671081543, "rewards/student_margin": 4.80930757522583, "rewards/teacher_margin": 0.0, "step": 3130 }, { "epoch": 0.49, "grad_norm": 18.0, "learning_rate": 3.001301250877987e-06, "logits/chosen": 2.1209867000579834, "logits/rejected": 2.0309531688690186, "logps/chosen": -353.945556640625, "logps/rejected": -397.07354736328125, "loss": 0.2065, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.528801441192627, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8886709213256836, "rewards/student_margin": 3.4174728393554688, "rewards/teacher_margin": 0.0, "step": 3140 }, { "epoch": 0.49, "grad_norm": 14.0625, "learning_rate": 2.987893495243601e-06, "logits/chosen": 2.306917667388916, "logits/rejected": 2.25931715965271, "logps/chosen": -424.87860107421875, "logps/rejected": -444.0254821777344, "loss": 0.2483, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.16171619296073914, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8270695209503174, "rewards/student_margin": 2.9887855052948, "rewards/teacher_margin": 0.0, "step": 3150 }, { "epoch": 0.5, "grad_norm": 19.0, "learning_rate": 2.974471134752058e-06, "logits/chosen": 2.288144588470459, "logits/rejected": 2.250706911087036, "logps/chosen": -384.644287109375, "logps/rejected": -488.08795166015625, "loss": 0.3177, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.045200102031230927, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.465588092803955, "rewards/student_margin": 4.5107879638671875, "rewards/teacher_margin": 0.0, "step": 3160 }, { "epoch": 0.5, "grad_norm": 13.375, "learning_rate": 2.9610345711952655e-06, "logits/chosen": 2.0723891258239746, "logits/rejected": 2.1462111473083496, "logps/chosen": -363.16644287109375, "logps/rejected": -479.6796875, "loss": 0.2146, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -0.06971307098865509, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.810393810272217, "rewards/student_margin": 3.7406811714172363, "rewards/teacher_margin": 0.0, "step": 3170 }, { "epoch": 0.5, "grad_norm": 15.8125, "learning_rate": 2.9475842067902915e-06, "logits/chosen": 1.9900572299957275, "logits/rejected": 2.1901602745056152, "logps/chosen": -349.59796142578125, "logps/rejected": -454.04193115234375, "loss": 0.215, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.514102041721344, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7387378215789795, "rewards/student_margin": 4.252840042114258, "rewards/teacher_margin": 0.0, "step": 3180 }, { "epoch": 0.5, "grad_norm": 23.125, "learning_rate": 2.9341204441673267e-06, "logits/chosen": 2.336085557937622, "logits/rejected": 2.2899043560028076, "logps/chosen": -352.58258056640625, "logps/rejected": -386.87347412109375, "loss": 0.2891, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.19839784502983093, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.880344867706299, "rewards/student_margin": 3.078742742538452, "rewards/teacher_margin": 0.0, "step": 3190 }, { "epoch": 0.5, "grad_norm": 13.875, "learning_rate": 2.92064368635763e-06, "logits/chosen": 2.2074406147003174, "logits/rejected": 2.1881444454193115, "logps/chosen": -373.75860595703125, "logps/rejected": -479.1675720214844, "loss": 0.1859, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.7743901014328003, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.749584674835205, "rewards/student_margin": 4.523974895477295, "rewards/teacher_margin": 0.0, "step": 3200 }, { "epoch": 0.5, "grad_norm": 11.25, "learning_rate": 2.9071543367814657e-06, "logits/chosen": 2.1032676696777344, "logits/rejected": 1.9540249109268188, "logps/chosen": -432.912109375, "logps/rejected": -461.303466796875, "loss": 0.1937, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7490127682685852, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4244656562805176, "rewards/student_margin": 4.173478603363037, "rewards/teacher_margin": 0.0, "step": 3210 }, { "epoch": 0.5, "grad_norm": 13.3125, "learning_rate": 2.8936527992360273e-06, "logits/chosen": 2.310997247695923, "logits/rejected": 2.1953234672546387, "logps/chosen": -370.6199645996094, "logps/rejected": -496.1392517089844, "loss": 0.1528, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 1.0951523780822754, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.16379451751709, "rewards/student_margin": 5.258946895599365, "rewards/teacher_margin": 0.0, "step": 3220 }, { "epoch": 0.51, "grad_norm": 15.3125, "learning_rate": 2.8801394778833475e-06, "logits/chosen": 2.167978525161743, "logits/rejected": 2.3006348609924316, "logps/chosen": -381.28656005859375, "logps/rejected": -460.5723571777344, "loss": 0.2727, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.5259613394737244, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6617493629455566, "rewards/student_margin": 3.187710762023926, "rewards/teacher_margin": 0.0, "step": 3230 }, { "epoch": 0.51, "grad_norm": 14.375, "learning_rate": 2.8666147772382034e-06, "logits/chosen": 2.101233959197998, "logits/rejected": 2.1007916927337646, "logps/chosen": -364.1319885253906, "logps/rejected": -442.06787109375, "loss": 0.3026, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4731379449367523, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.312168121337891, "rewards/student_margin": 4.785305976867676, "rewards/teacher_margin": 0.0, "step": 3240 }, { "epoch": 0.51, "grad_norm": 10.25, "learning_rate": 2.8530791021560045e-06, "logits/chosen": 2.1129748821258545, "logits/rejected": 2.2223691940307617, "logps/chosen": -294.7200622558594, "logps/rejected": -437.4198303222656, "loss": 0.2648, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.885317325592041, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.805337429046631, "rewards/student_margin": 3.69065523147583, "rewards/teacher_margin": 0.0, "step": 3250 }, { "epoch": 0.51, "grad_norm": 12.5, "learning_rate": 2.8395328578206756e-06, "logits/chosen": 2.174830675125122, "logits/rejected": 2.404393196105957, "logps/chosen": -374.43121337890625, "logps/rejected": -525.418212890625, "loss": 0.1949, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.9664198160171509, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9952728748321533, "rewards/student_margin": 4.961692810058594, "rewards/teacher_margin": 0.0, "step": 3260 }, { "epoch": 0.51, "grad_norm": 10.3125, "learning_rate": 2.825976449732525e-06, "logits/chosen": 2.159381151199341, "logits/rejected": 2.0782291889190674, "logps/chosen": -336.55096435546875, "logps/rejected": -412.1483459472656, "loss": 0.2297, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.5453509092330933, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.947563409805298, "rewards/student_margin": 3.4929141998291016, "rewards/teacher_margin": 0.0, "step": 3270 }, { "epoch": 0.51, "grad_norm": 10.3125, "learning_rate": 2.8124102836961113e-06, "logits/chosen": 2.1955528259277344, "logits/rejected": 2.13395357131958, "logps/chosen": -391.7129211425781, "logps/rejected": -480.88482666015625, "loss": 0.2356, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 1.1434831619262695, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3922219276428223, "rewards/student_margin": 4.535704612731934, "rewards/teacher_margin": 0.0, "step": 3280 }, { "epoch": 0.52, "grad_norm": 25.0, "learning_rate": 2.7988347658080906e-06, "logits/chosen": 2.1978774070739746, "logits/rejected": 2.357642650604248, "logps/chosen": -380.69384765625, "logps/rejected": -499.8038635253906, "loss": 0.1988, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.4792620539665222, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.050598621368408, "rewards/student_margin": 3.529860734939575, "rewards/teacher_margin": 0.0, "step": 3290 }, { "epoch": 0.52, "grad_norm": 9.75, "learning_rate": 2.785250302445062e-06, "logits/chosen": 2.213548183441162, "logits/rejected": 2.227273464202881, "logps/chosen": -351.055419921875, "logps/rejected": -410.69842529296875, "loss": 0.1757, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.6280609369277954, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.492575168609619, "rewards/student_margin": 4.120635986328125, "rewards/teacher_margin": 0.0, "step": 3300 }, { "epoch": 0.52, "grad_norm": 18.125, "learning_rate": 2.7716573002514047e-06, "logits/chosen": 2.1122398376464844, "logits/rejected": 2.424421787261963, "logps/chosen": -389.32098388671875, "logps/rejected": -473.4154357910156, "loss": 0.1759, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.5370089411735535, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4182610511779785, "rewards/student_margin": 3.9552695751190186, "rewards/teacher_margin": 0.0, "step": 3310 }, { "epoch": 0.52, "grad_norm": 14.1875, "learning_rate": 2.7580561661271015e-06, "logits/chosen": 2.4003238677978516, "logits/rejected": 2.2500274181365967, "logps/chosen": -384.6837463378906, "logps/rejected": -420.10040283203125, "loss": 0.2434, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5723447203636169, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.46054744720459, "rewards/student_margin": 5.032892227172852, "rewards/teacher_margin": 0.0, "step": 3320 }, { "epoch": 0.52, "grad_norm": 11.8125, "learning_rate": 2.7444473072155624e-06, "logits/chosen": 2.372487783432007, "logits/rejected": 2.3858721256256104, "logps/chosen": -341.2293395996094, "logps/rejected": -388.5402526855469, "loss": 0.2018, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.8238633871078491, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9152262210845947, "rewards/student_margin": 3.7390899658203125, "rewards/teacher_margin": 0.0, "step": 3330 }, { "epoch": 0.52, "grad_norm": 9.875, "learning_rate": 2.730831130891434e-06, "logits/chosen": 2.1053388118743896, "logits/rejected": 2.426819324493408, "logps/chosen": -325.96221923828125, "logps/rejected": -447.5068359375, "loss": 0.1942, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.37924525141716003, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7468554973602295, "rewards/student_margin": 4.126101016998291, "rewards/teacher_margin": 0.0, "step": 3340 }, { "epoch": 0.53, "grad_norm": 14.6875, "learning_rate": 2.7172080447484074e-06, "logits/chosen": 2.3462748527526855, "logits/rejected": 2.5254805088043213, "logps/chosen": -337.8982238769531, "logps/rejected": -416.61236572265625, "loss": 0.258, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.18362095952033997, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2570464611053467, "rewards/student_margin": 3.4406676292419434, "rewards/teacher_margin": 0.0, "step": 3350 }, { "epoch": 0.53, "grad_norm": 10.4375, "learning_rate": 2.703578456587015e-06, "logits/chosen": 2.2353405952453613, "logits/rejected": 2.277163028717041, "logps/chosen": -323.01202392578125, "logps/rejected": -388.23333740234375, "loss": 0.225, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.14114722609519958, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.82804274559021, "rewards/student_margin": 2.9691901206970215, "rewards/teacher_margin": 0.0, "step": 3360 }, { "epoch": 0.53, "grad_norm": 13.625, "learning_rate": 2.689942774402423e-06, "logits/chosen": 2.233466148376465, "logits/rejected": 2.1455206871032715, "logps/chosen": -403.6963806152344, "logps/rejected": -485.756591796875, "loss": 0.1677, "rewards/accuracies": 1.0, "rewards/chosen": 1.4890658855438232, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.111947536468506, "rewards/student_margin": 5.601013660430908, "rewards/teacher_margin": 0.0, "step": 3370 }, { "epoch": 0.53, "grad_norm": 13.0, "learning_rate": 2.676301406372221e-06, "logits/chosen": 2.095114231109619, "logits/rejected": 2.1753928661346436, "logps/chosen": -350.6921081542969, "logps/rejected": -493.73583984375, "loss": 0.1882, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.241744726896286, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6957168579101562, "rewards/student_margin": 3.9374618530273438, "rewards/teacher_margin": 0.0, "step": 3380 }, { "epoch": 0.53, "grad_norm": 8.0625, "learning_rate": 2.662654760844201e-06, "logits/chosen": 2.399343729019165, "logits/rejected": 2.3131113052368164, "logps/chosen": -416.31585693359375, "logps/rejected": -476.9352111816406, "loss": 0.203, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6221926808357239, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.174395561218262, "rewards/student_margin": 4.79658842086792, "rewards/teacher_margin": 0.0, "step": 3390 }, { "epoch": 0.53, "grad_norm": 22.0, "learning_rate": 2.649003246324135e-06, "logits/chosen": 2.0387094020843506, "logits/rejected": 2.1155505180358887, "logps/chosen": -400.13470458984375, "logps/rejected": -499.76348876953125, "loss": 0.2173, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0205734968185425, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.8018431663513184, "rewards/student_margin": 4.82241678237915, "rewards/teacher_margin": 0.0, "step": 3400 }, { "epoch": 0.53, "grad_norm": 14.4375, "learning_rate": 2.6353472714635443e-06, "logits/chosen": 2.182504892349243, "logits/rejected": 2.2360215187072754, "logps/chosen": -377.00323486328125, "logps/rejected": -484.99542236328125, "loss": 0.2126, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.8529586791992188, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.279623985290527, "rewards/student_margin": 5.132582664489746, "rewards/teacher_margin": 0.0, "step": 3410 }, { "epoch": 0.54, "grad_norm": 10.25, "learning_rate": 2.6216872450474695e-06, "logits/chosen": 2.421586275100708, "logits/rejected": 2.495795965194702, "logps/chosen": -369.4314880371094, "logps/rejected": -407.2565002441406, "loss": 0.161, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.5809465646743774, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.810878038406372, "rewards/student_margin": 4.391824722290039, "rewards/teacher_margin": 0.0, "step": 3420 }, { "epoch": 0.54, "grad_norm": 19.125, "learning_rate": 2.6080235759822325e-06, "logits/chosen": 2.4609487056732178, "logits/rejected": 2.5285089015960693, "logps/chosen": -364.0375671386719, "logps/rejected": -432.07080078125, "loss": 0.2565, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.10315883159637451, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.791532039642334, "rewards/student_margin": 3.894690990447998, "rewards/teacher_margin": 0.0, "step": 3430 }, { "epoch": 0.54, "grad_norm": 10.3125, "learning_rate": 2.5943566732831965e-06, "logits/chosen": 2.266489267349243, "logits/rejected": 2.345585584640503, "logps/chosen": -375.3038330078125, "logps/rejected": -429.507568359375, "loss": 0.2454, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.7917287349700928, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6586544513702393, "rewards/student_margin": 4.450383186340332, "rewards/teacher_margin": 0.0, "step": 3440 }, { "epoch": 0.54, "grad_norm": 15.0625, "learning_rate": 2.58068694606252e-06, "logits/chosen": 2.0476601123809814, "logits/rejected": 2.201812505722046, "logps/chosen": -353.490478515625, "logps/rejected": -458.107421875, "loss": 0.2184, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.052721668034791946, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.8606624603271484, "rewards/student_margin": 3.913384199142456, "rewards/teacher_margin": 0.0, "step": 3450 }, { "epoch": 0.54, "grad_norm": 13.3125, "learning_rate": 2.5670148035169156e-06, "logits/chosen": 2.319406747817993, "logits/rejected": 1.9899654388427734, "logps/chosen": -368.28936767578125, "logps/rejected": -351.22479248046875, "loss": 0.2731, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.058328498154878616, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.4036848545074463, "rewards/student_margin": 2.4620132446289062, "rewards/teacher_margin": 0.0, "step": 3460 }, { "epoch": 0.54, "grad_norm": 18.5, "learning_rate": 2.5533406549153953e-06, "logits/chosen": 2.133512258529663, "logits/rejected": 2.1391520500183105, "logps/chosen": -389.7716064453125, "logps/rejected": -406.0328063964844, "loss": 0.2861, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.22314047813415527, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7764532566070557, "rewards/student_margin": 2.999593496322632, "rewards/teacher_margin": 0.0, "step": 3470 }, { "epoch": 0.55, "grad_norm": 13.25, "learning_rate": 2.53966490958702e-06, "logits/chosen": 2.2071762084960938, "logits/rejected": 2.027562379837036, "logps/chosen": -424.583984375, "logps/rejected": -360.71337890625, "loss": 0.1714, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.9901229739189148, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8530969619750977, "rewards/student_margin": 3.843219757080078, "rewards/teacher_margin": 0.0, "step": 3480 }, { "epoch": 0.55, "grad_norm": 12.625, "learning_rate": 2.5259879769086517e-06, "logits/chosen": 2.3718647956848145, "logits/rejected": 2.312702178955078, "logps/chosen": -339.8014221191406, "logps/rejected": -370.7449035644531, "loss": 0.221, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.4879492223262787, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.551971912384033, "rewards/student_margin": 4.039920806884766, "rewards/teacher_margin": 0.0, "step": 3490 }, { "epoch": 0.55, "grad_norm": 9.0, "learning_rate": 2.5123102662926912e-06, "logits/chosen": 2.2475686073303223, "logits/rejected": 1.858041524887085, "logps/chosen": -428.4232482910156, "logps/rejected": -430.6632385253906, "loss": 0.2196, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.954119861125946, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.841684579849243, "rewards/student_margin": 3.795804500579834, "rewards/teacher_margin": 0.0, "step": 3500 }, { "epoch": 0.55, "grad_norm": 19.75, "learning_rate": 2.4986321871748294e-06, "logits/chosen": 2.108095645904541, "logits/rejected": 2.2903430461883545, "logps/chosen": -350.99078369140625, "logps/rejected": -432.5859375, "loss": 0.2309, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3405437469482422, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7620322704315186, "rewards/student_margin": 4.10257625579834, "rewards/teacher_margin": 0.0, "step": 3510 }, { "epoch": 0.55, "grad_norm": 17.625, "learning_rate": 2.4849541490017868e-06, "logits/chosen": 2.1404101848602295, "logits/rejected": 2.221020460128784, "logps/chosen": -320.7823181152344, "logps/rejected": -377.9104919433594, "loss": 0.2606, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.282367467880249, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.410953044891357, "rewards/student_margin": 4.6933207511901855, "rewards/teacher_margin": 0.0, "step": 3520 }, { "epoch": 0.55, "grad_norm": 16.75, "learning_rate": 2.4712765612190583e-06, "logits/chosen": 2.259737968444824, "logits/rejected": 2.1508030891418457, "logps/chosen": -387.46795654296875, "logps/rejected": -521.582763671875, "loss": 0.2575, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.658551812171936, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7274487018585205, "rewards/student_margin": 4.386000633239746, "rewards/teacher_margin": 0.0, "step": 3530 }, { "epoch": 0.55, "grad_norm": 18.625, "learning_rate": 2.4575998332586573e-06, "logits/chosen": 2.518524646759033, "logits/rejected": 2.1819119453430176, "logps/chosen": -391.85797119140625, "logps/rejected": -446.6455078125, "loss": 0.2712, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.2988213002681732, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.161787986755371, "rewards/student_margin": 4.460609436035156, "rewards/teacher_margin": 0.0, "step": 3540 }, { "epoch": 0.56, "grad_norm": 10.6875, "learning_rate": 2.4439243745268576e-06, "logits/chosen": 2.21928334236145, "logits/rejected": 2.1078569889068604, "logps/chosen": -413.432373046875, "logps/rejected": -452.96710205078125, "loss": 0.2166, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5337194204330444, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4814116954803467, "rewards/student_margin": 4.01513147354126, "rewards/teacher_margin": 0.0, "step": 3550 }, { "epoch": 0.56, "grad_norm": 7.75, "learning_rate": 2.43025059439194e-06, "logits/chosen": 2.528026580810547, "logits/rejected": 2.2313992977142334, "logps/chosen": -397.0318298339844, "logps/rejected": -442.065185546875, "loss": 0.1932, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.8028140068054199, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1704564094543457, "rewards/student_margin": 3.9732704162597656, "rewards/teacher_margin": 0.0, "step": 3560 }, { "epoch": 0.56, "grad_norm": 19.5, "learning_rate": 2.4165789021719373e-06, "logits/chosen": 2.380279541015625, "logits/rejected": 1.9540138244628906, "logps/chosen": -425.80499267578125, "logps/rejected": -430.75531005859375, "loss": 0.1735, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.7520251274108887, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.166973352432251, "rewards/student_margin": 2.9189982414245605, "rewards/teacher_margin": 0.0, "step": 3570 }, { "epoch": 0.56, "grad_norm": 17.875, "learning_rate": 2.4029097071223815e-06, "logits/chosen": 2.1330151557922363, "logits/rejected": 2.021304130554199, "logps/chosen": -400.5568542480469, "logps/rejected": -413.1788635253906, "loss": 0.2206, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.8463171124458313, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.841163158416748, "rewards/student_margin": 3.6874802112579346, "rewards/teacher_margin": 0.0, "step": 3580 }, { "epoch": 0.56, "grad_norm": 21.375, "learning_rate": 2.3892434184240536e-06, "logits/chosen": 2.1433746814727783, "logits/rejected": 2.4154748916625977, "logps/chosen": -363.99114990234375, "logps/rejected": -440.7875061035156, "loss": 0.221, "rewards/accuracies": 1.0, "rewards/chosen": 0.6583948731422424, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.377272129058838, "rewards/student_margin": 5.035666465759277, "rewards/teacher_margin": 0.0, "step": 3590 }, { "epoch": 0.56, "grad_norm": 15.0, "learning_rate": 2.3755804451707333e-06, "logits/chosen": 2.335618257522583, "logits/rejected": 2.3779683113098145, "logps/chosen": -422.3512268066406, "logps/rejected": -495.6741638183594, "loss": 0.1847, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.4791710376739502, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5901615619659424, "rewards/student_margin": 4.069332599639893, "rewards/teacher_margin": 0.0, "step": 3600 }, { "epoch": 0.57, "grad_norm": 17.875, "learning_rate": 2.3619211963569545e-06, "logits/chosen": 2.2997450828552246, "logits/rejected": 2.2331604957580566, "logps/chosen": -396.25543212890625, "logps/rejected": -423.6015625, "loss": 0.2052, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.6045502424240112, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.381983757019043, "rewards/student_margin": 4.9865336418151855, "rewards/teacher_margin": 0.0, "step": 3610 }, { "epoch": 0.57, "grad_norm": 18.375, "learning_rate": 2.348266080865762e-06, "logits/chosen": 2.302664279937744, "logits/rejected": 2.293337821960449, "logps/chosen": -371.8497619628906, "logps/rejected": -452.75323486328125, "loss": 0.2447, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.7904747724533081, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.722474098205566, "rewards/student_margin": 5.512949466705322, "rewards/teacher_margin": 0.0, "step": 3620 }, { "epoch": 0.57, "grad_norm": 21.375, "learning_rate": 2.3346155074564712e-06, "logits/chosen": 2.2145354747772217, "logits/rejected": 2.3611884117126465, "logps/chosen": -379.44549560546875, "logps/rejected": -542.7459716796875, "loss": 0.2557, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.07235515117645264, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.767797470092773, "rewards/student_margin": 4.840151786804199, "rewards/teacher_margin": 0.0, "step": 3630 }, { "epoch": 0.57, "grad_norm": 11.4375, "learning_rate": 2.3209698847524316e-06, "logits/chosen": 2.3657450675964355, "logits/rejected": 2.4999051094055176, "logps/chosen": -329.00042724609375, "logps/rejected": -427.77655029296875, "loss": 0.1861, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.6146613359451294, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9574196338653564, "rewards/student_margin": 4.572081565856934, "rewards/teacher_margin": 0.0, "step": 3640 }, { "epoch": 0.57, "grad_norm": 12.5625, "learning_rate": 2.3073296212287994e-06, "logits/chosen": 2.3500430583953857, "logits/rejected": 2.255985736846924, "logps/chosen": -412.987060546875, "logps/rejected": -437.62371826171875, "loss": 0.2356, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.5405006408691406, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6325275897979736, "rewards/student_margin": 4.173027992248535, "rewards/teacher_margin": 0.0, "step": 3650 }, { "epoch": 0.57, "grad_norm": 11.375, "learning_rate": 2.293695125200302e-06, "logits/chosen": 2.3791496753692627, "logits/rejected": 2.298616886138916, "logps/chosen": -377.6425476074219, "logps/rejected": -492.912841796875, "loss": 0.1653, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.582294225692749, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.921860456466675, "rewards/student_margin": 4.504155158996582, "rewards/teacher_margin": 0.0, "step": 3660 }, { "epoch": 0.58, "grad_norm": 17.125, "learning_rate": 2.2800668048090213e-06, "logits/chosen": 2.19169020652771, "logits/rejected": 2.13714599609375, "logps/chosen": -383.288818359375, "logps/rejected": -460.9693298339844, "loss": 0.2661, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 1.019307255744934, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5573196411132812, "rewards/student_margin": 3.576627254486084, "rewards/teacher_margin": 0.0, "step": 3670 }, { "epoch": 0.58, "grad_norm": 13.0625, "learning_rate": 2.2664450680121757e-06, "logits/chosen": 2.701406955718994, "logits/rejected": 2.409045696258545, "logps/chosen": -364.9839782714844, "logps/rejected": -441.0972595214844, "loss": 0.2295, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.35218074917793274, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7102432250976562, "rewards/student_margin": 4.0624237060546875, "rewards/teacher_margin": 0.0, "step": 3680 }, { "epoch": 0.58, "grad_norm": 9.625, "learning_rate": 2.2528303225699036e-06, "logits/chosen": 2.3105616569519043, "logits/rejected": 2.448324680328369, "logps/chosen": -304.02764892578125, "logps/rejected": -448.71881103515625, "loss": 0.2157, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.8077465295791626, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.955540418624878, "rewards/student_margin": 4.76328706741333, "rewards/teacher_margin": 0.0, "step": 3690 }, { "epoch": 0.58, "grad_norm": 18.5, "learning_rate": 2.239222976033064e-06, "logits/chosen": 2.0238544940948486, "logits/rejected": 1.9927690029144287, "logps/chosen": -413.3540954589844, "logps/rejected": -513.38427734375, "loss": 0.2249, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5376284718513489, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.414843797683716, "rewards/student_margin": 3.95247220993042, "rewards/teacher_margin": 0.0, "step": 3700 }, { "epoch": 0.58, "grad_norm": 13.3125, "learning_rate": 2.2256234357310304e-06, "logits/chosen": 2.276885986328125, "logits/rejected": 2.1446967124938965, "logps/chosen": -390.608154296875, "logps/rejected": -394.8718566894531, "loss": 0.1939, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.46289747953414917, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.881957530975342, "rewards/student_margin": 4.344854831695557, "rewards/teacher_margin": 0.0, "step": 3710 }, { "epoch": 0.58, "grad_norm": 9.625, "learning_rate": 2.2120321087595045e-06, "logits/chosen": 1.8932933807373047, "logits/rejected": 2.250673532485962, "logps/chosen": -369.96051025390625, "logps/rejected": -566.11328125, "loss": 0.2225, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.20951715111732483, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.475049018859863, "rewards/student_margin": 4.684566020965576, "rewards/teacher_margin": 0.0, "step": 3720 }, { "epoch": 0.58, "grad_norm": 17.125, "learning_rate": 2.1984494019683212e-06, "logits/chosen": 2.03346586227417, "logits/rejected": 2.0300803184509277, "logps/chosen": -383.0079650878906, "logps/rejected": -520.2147216796875, "loss": 0.2609, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": -0.008518969640135765, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5929675102233887, "rewards/student_margin": 3.5844485759735107, "rewards/teacher_margin": 0.0, "step": 3730 }, { "epoch": 0.59, "grad_norm": 11.125, "learning_rate": 2.184875721949277e-06, "logits/chosen": 2.2196459770202637, "logits/rejected": 2.225369930267334, "logps/chosen": -315.83966064453125, "logps/rejected": -399.6290588378906, "loss": 0.228, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.411459356546402, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8817734718322754, "rewards/student_margin": 3.2932331562042236, "rewards/teacher_margin": 0.0, "step": 3740 }, { "epoch": 0.59, "grad_norm": 7.3125, "learning_rate": 2.171311475023956e-06, "logits/chosen": 2.2237534523010254, "logits/rejected": 2.2150678634643555, "logps/chosen": -345.4691467285156, "logps/rejected": -419.6263122558594, "loss": 0.2689, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 1.0103347301483154, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.552267074584961, "rewards/student_margin": 4.5626020431518555, "rewards/teacher_margin": 0.0, "step": 3750 }, { "epoch": 0.59, "grad_norm": 10.6875, "learning_rate": 2.1577570672315662e-06, "logits/chosen": 2.2325844764709473, "logits/rejected": 2.2768213748931885, "logps/chosen": -347.60418701171875, "logps/rejected": -440.8597717285156, "loss": 0.2189, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.9593130350112915, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.555112838745117, "rewards/student_margin": 4.514425754547119, "rewards/teacher_margin": 0.0, "step": 3760 }, { "epoch": 0.59, "grad_norm": 7.71875, "learning_rate": 2.1442129043167877e-06, "logits/chosen": 2.2568788528442383, "logits/rejected": 2.029522180557251, "logps/chosen": -392.84820556640625, "logps/rejected": -463.40692138671875, "loss": 0.2254, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.515551745891571, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.407654285430908, "rewards/student_margin": 3.923206329345703, "rewards/teacher_margin": 0.0, "step": 3770 }, { "epoch": 0.59, "grad_norm": 17.75, "learning_rate": 2.130679391717623e-06, "logits/chosen": 2.312624216079712, "logits/rejected": 2.240938663482666, "logps/chosen": -398.32672119140625, "logps/rejected": -471.4715270996094, "loss": 0.2627, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6971611380577087, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.089206695556641, "rewards/student_margin": 4.786368370056152, "rewards/teacher_margin": 0.0, "step": 3780 }, { "epoch": 0.59, "grad_norm": 12.875, "learning_rate": 2.1171569345532646e-06, "logits/chosen": 2.3735430240631104, "logits/rejected": 2.447174549102783, "logps/chosen": -368.5428466796875, "logps/rejected": -383.60870361328125, "loss": 0.2087, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.504956066608429, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0010478496551514, "rewards/student_margin": 3.5060038566589355, "rewards/teacher_margin": 0.0, "step": 3790 }, { "epoch": 0.6, "grad_norm": 22.0, "learning_rate": 2.103645937611965e-06, "logits/chosen": 1.9013102054595947, "logits/rejected": 2.170565128326416, "logps/chosen": -349.91619873046875, "logps/rejected": -449.0887145996094, "loss": 0.2499, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.29979297518730164, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.329071998596191, "rewards/student_margin": 4.628865718841553, "rewards/teacher_margin": 0.0, "step": 3800 }, { "epoch": 0.6, "grad_norm": 11.5625, "learning_rate": 2.0901468053389194e-06, "logits/chosen": 2.1704487800598145, "logits/rejected": 2.312422752380371, "logps/chosen": -468.676513671875, "logps/rejected": -513.9473266601562, "loss": 0.1741, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.396463006734848, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.035261154174805, "rewards/student_margin": 4.431724548339844, "rewards/teacher_margin": 0.0, "step": 3810 }, { "epoch": 0.6, "grad_norm": 6.6875, "learning_rate": 2.0766599418241616e-06, "logits/chosen": 2.1784441471099854, "logits/rejected": 2.1361289024353027, "logps/chosen": -329.8717956542969, "logps/rejected": -407.87249755859375, "loss": 0.2403, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 1.2488939762115479, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9971418380737305, "rewards/student_margin": 4.246035575866699, "rewards/teacher_margin": 0.0, "step": 3820 }, { "epoch": 0.6, "grad_norm": 6.75, "learning_rate": 2.0631857507904645e-06, "logits/chosen": 2.297903537750244, "logits/rejected": 2.155238628387451, "logps/chosen": -377.43609619140625, "logps/rejected": -460.2127380371094, "loss": 0.192, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 1.455108404159546, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.194455146789551, "rewards/student_margin": 4.649563789367676, "rewards/teacher_margin": 0.0, "step": 3830 }, { "epoch": 0.6, "grad_norm": 15.625, "learning_rate": 2.049724635581258e-06, "logits/chosen": 2.433605909347534, "logits/rejected": 2.3757710456848145, "logps/chosen": -372.36114501953125, "logps/rejected": -452.69879150390625, "loss": 0.224, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3206843137741089, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4990296363830566, "rewards/student_margin": 3.819714307785034, "rewards/teacher_margin": 0.0, "step": 3840 }, { "epoch": 0.6, "grad_norm": 14.9375, "learning_rate": 2.0362769991485514e-06, "logits/chosen": 2.276495933532715, "logits/rejected": 2.3173041343688965, "logps/chosen": -363.28094482421875, "logps/rejected": -457.95263671875, "loss": 0.2022, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.8752276301383972, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.469148635864258, "rewards/student_margin": 4.3443756103515625, "rewards/teacher_margin": 0.0, "step": 3850 }, { "epoch": 0.61, "grad_norm": 12.9375, "learning_rate": 2.022843244040874e-06, "logits/chosen": 2.3171744346618652, "logits/rejected": 2.1923203468322754, "logps/chosen": -374.4541931152344, "logps/rejected": -415.54595947265625, "loss": 0.1624, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.3494204580783844, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1482293605804443, "rewards/student_margin": 3.497650146484375, "rewards/teacher_margin": 0.0, "step": 3860 }, { "epoch": 0.61, "grad_norm": 10.3125, "learning_rate": 2.009423772391227e-06, "logits/chosen": 2.558969736099243, "logits/rejected": 2.214752435684204, "logps/chosen": -373.34967041015625, "logps/rejected": -418.3020935058594, "loss": 0.252, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": -0.026145944371819496, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.621061086654663, "rewards/student_margin": 2.5949156284332275, "rewards/teacher_margin": 0.0, "step": 3870 }, { "epoch": 0.61, "grad_norm": 12.4375, "learning_rate": 1.99601898590504e-06, "logits/chosen": 2.327904462814331, "logits/rejected": 2.2376770973205566, "logps/chosen": -361.3371887207031, "logps/rejected": -459.8245544433594, "loss": 0.2435, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 1.2409169673919678, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5375027656555176, "rewards/student_margin": 4.7784199714660645, "rewards/teacher_margin": 0.0, "step": 3880 }, { "epoch": 0.61, "grad_norm": 18.25, "learning_rate": 1.98262928584815e-06, "logits/chosen": 1.9560056924819946, "logits/rejected": 2.22693133354187, "logps/chosen": -353.8571472167969, "logps/rejected": -476.8414001464844, "loss": 0.2282, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.6393729448318481, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.335660934448242, "rewards/student_margin": 3.9750335216522217, "rewards/teacher_margin": 0.0, "step": 3890 }, { "epoch": 0.61, "grad_norm": 12.1875, "learning_rate": 1.969255073034789e-06, "logits/chosen": 2.320183515548706, "logits/rejected": 2.14522385597229, "logps/chosen": -333.9617919921875, "logps/rejected": -401.8171691894531, "loss": 0.2012, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.2806742787361145, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.473022937774658, "rewards/student_margin": 3.753697156906128, "rewards/teacher_margin": 0.0, "step": 3900 }, { "epoch": 0.61, "grad_norm": 22.125, "learning_rate": 1.955896747815586e-06, "logits/chosen": 2.07922625541687, "logits/rejected": 2.2764229774475098, "logps/chosen": -400.4126892089844, "logps/rejected": -477.13427734375, "loss": 0.234, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.14358584582805634, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.004758358001709, "rewards/student_margin": 3.148343563079834, "rewards/teacher_margin": 0.0, "step": 3910 }, { "epoch": 0.61, "grad_norm": 11.375, "learning_rate": 1.9425547100655825e-06, "logits/chosen": 1.9863029718399048, "logits/rejected": 2.3017525672912598, "logps/chosen": -338.62738037109375, "logps/rejected": -488.9197692871094, "loss": 0.1993, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.3663279414176941, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.945026397705078, "rewards/student_margin": 4.311354637145996, "rewards/teacher_margin": 0.0, "step": 3920 }, { "epoch": 0.62, "grad_norm": 22.25, "learning_rate": 1.9292293591722623e-06, "logits/chosen": 2.4022693634033203, "logits/rejected": 2.2365543842315674, "logps/chosen": -397.0676574707031, "logps/rejected": -451.76507568359375, "loss": 0.2569, "rewards/accuracies": 1.0, "rewards/chosen": 0.9065150022506714, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.001762390136719, "rewards/student_margin": 4.9082770347595215, "rewards/teacher_margin": 0.0, "step": 3930 }, { "epoch": 0.62, "grad_norm": 10.5625, "learning_rate": 1.9159210940235977e-06, "logits/chosen": 2.1494364738464355, "logits/rejected": 2.2778542041778564, "logps/chosen": -384.95489501953125, "logps/rejected": -501.65325927734375, "loss": 0.1995, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.8951177597045898, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.1695404052734375, "rewards/student_margin": 5.064658164978027, "rewards/teacher_margin": 0.0, "step": 3940 }, { "epoch": 0.62, "grad_norm": 17.375, "learning_rate": 1.9026303129961049e-06, "logits/chosen": 2.0833165645599365, "logits/rejected": 2.3335959911346436, "logps/chosen": -312.3328857421875, "logps/rejected": -415.20831298828125, "loss": 0.1949, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5699090361595154, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2504830360412598, "rewards/student_margin": 3.820391893386841, "rewards/teacher_margin": 0.0, "step": 3950 }, { "epoch": 0.62, "grad_norm": 18.0, "learning_rate": 1.8893574139429226e-06, "logits/chosen": 2.3678994178771973, "logits/rejected": 2.118623971939087, "logps/chosen": -362.4637451171875, "logps/rejected": -396.91070556640625, "loss": 0.2149, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.10757827758789062, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0848395824432373, "rewards/student_margin": 3.192417860031128, "rewards/teacher_margin": 0.0, "step": 3960 }, { "epoch": 0.62, "grad_norm": 12.625, "learning_rate": 1.8761027941819015e-06, "logits/chosen": 2.118600845336914, "logits/rejected": 1.9349584579467773, "logps/chosen": -370.16351318359375, "logps/rejected": -445.29595947265625, "loss": 0.2034, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.6241740584373474, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3816821575164795, "rewards/student_margin": 4.005856513977051, "rewards/teacher_margin": 0.0, "step": 3970 }, { "epoch": 0.62, "grad_norm": 11.0, "learning_rate": 1.86286685048371e-06, "logits/chosen": 2.1980221271514893, "logits/rejected": 1.9180787801742554, "logps/chosen": -400.87908935546875, "logps/rejected": -446.2035217285156, "loss": 0.1687, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.9995938539505005, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.640249252319336, "rewards/student_margin": 4.639843940734863, "rewards/teacher_margin": 0.0, "step": 3980 }, { "epoch": 0.63, "grad_norm": 13.1875, "learning_rate": 1.8496499790599576e-06, "logits/chosen": 2.1617043018341064, "logits/rejected": 2.4177193641662598, "logps/chosen": -418.474365234375, "logps/rejected": -522.3171997070312, "loss": 0.2008, "rewards/accuracies": 0.7666666507720947, "rewards/chosen": 0.4389050602912903, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.831400156021118, "rewards/student_margin": 3.2703049182891846, "rewards/teacher_margin": 0.0, "step": 3990 }, { "epoch": 0.63, "grad_norm": 12.5, "learning_rate": 1.836452575551334e-06, "logits/chosen": 2.1598098278045654, "logits/rejected": 1.9940067529678345, "logps/chosen": -392.3472595214844, "logps/rejected": -479.0543518066406, "loss": 0.2044, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.8060212135314941, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2296130657196045, "rewards/student_margin": 4.0356340408325195, "rewards/teacher_margin": 0.0, "step": 4000 }, { "epoch": 0.63, "grad_norm": 13.125, "learning_rate": 1.8232750350157679e-06, "logits/chosen": 2.406219720840454, "logits/rejected": 2.242527484893799, "logps/chosen": -318.98284912109375, "logps/rejected": -422.5018005371094, "loss": 0.2052, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.37432101368904114, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4287688732147217, "rewards/student_margin": 3.8030898571014404, "rewards/teacher_margin": 0.0, "step": 4010 }, { "epoch": 0.63, "grad_norm": 16.875, "learning_rate": 1.8101177519165974e-06, "logits/chosen": 2.0695815086364746, "logits/rejected": 2.488751173019409, "logps/chosen": -358.83428955078125, "logps/rejected": -509.04766845703125, "loss": 0.2287, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.2573285698890686, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.60225772857666, "rewards/student_margin": 4.859586238861084, "rewards/teacher_margin": 0.0, "step": 4020 }, { "epoch": 0.63, "grad_norm": 12.25, "learning_rate": 1.796981120110765e-06, "logits/chosen": 2.4509191513061523, "logits/rejected": 2.1260035037994385, "logps/chosen": -351.1524963378906, "logps/rejected": -426.05938720703125, "loss": 0.2365, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.5135831236839294, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.213418006896973, "rewards/student_margin": 4.727001667022705, "rewards/teacher_margin": 0.0, "step": 4030 }, { "epoch": 0.63, "grad_norm": 8.4375, "learning_rate": 1.7838655328370268e-06, "logits/chosen": 2.5682601928710938, "logits/rejected": 2.426596164703369, "logps/chosen": -371.00286865234375, "logps/rejected": -389.0164794921875, "loss": 0.1383, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.9797958135604858, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2294936180114746, "rewards/student_margin": 4.209290504455566, "rewards/teacher_margin": 0.0, "step": 4040 }, { "epoch": 0.63, "grad_norm": 19.125, "learning_rate": 1.7707713827041808e-06, "logits/chosen": 2.1681597232818604, "logits/rejected": 2.320230722427368, "logps/chosen": -400.6273193359375, "logps/rejected": -496.3367614746094, "loss": 0.1996, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.42232999205589294, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.025732040405273, "rewards/student_margin": 4.448061943054199, "rewards/teacher_margin": 0.0, "step": 4050 }, { "epoch": 0.64, "grad_norm": 2.15625, "learning_rate": 1.7576990616793139e-06, "logits/chosen": 2.1751160621643066, "logits/rejected": 1.8546712398529053, "logps/chosen": -449.2755432128906, "logps/rejected": -417.5882263183594, "loss": 0.1582, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.8441101908683777, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7444796562194824, "rewards/student_margin": 3.588589906692505, "rewards/teacher_margin": 0.0, "step": 4060 }, { "epoch": 0.64, "grad_norm": 18.5, "learning_rate": 1.744648961076068e-06, "logits/chosen": 2.1425530910491943, "logits/rejected": 2.3875584602355957, "logps/chosen": -344.25115966796875, "logps/rejected": -411.7806091308594, "loss": 0.2393, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9676275253295898, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9560160636901855, "rewards/student_margin": 3.9236435890197754, "rewards/teacher_margin": 0.0, "step": 4070 }, { "epoch": 0.64, "grad_norm": 6.25, "learning_rate": 1.7316214715429322e-06, "logits/chosen": 2.491248607635498, "logits/rejected": 2.4586715698242188, "logps/chosen": -372.4438781738281, "logps/rejected": -440.86834716796875, "loss": 0.206, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.20205967128276825, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.8040242195129395, "rewards/student_margin": 4.0060834884643555, "rewards/teacher_margin": 0.0, "step": 4080 }, { "epoch": 0.64, "grad_norm": 13.625, "learning_rate": 1.7186169830515399e-06, "logits/chosen": 2.3731017112731934, "logits/rejected": 2.359196424484253, "logps/chosen": -348.0357971191406, "logps/rejected": -446.9637756347656, "loss": 0.2055, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.4058133661746979, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1918137073516846, "rewards/student_margin": 3.5976271629333496, "rewards/teacher_margin": 0.0, "step": 4090 }, { "epoch": 0.64, "grad_norm": 19.375, "learning_rate": 1.705635884884999e-06, "logits/chosen": 2.424001693725586, "logits/rejected": 2.2628700733184814, "logps/chosen": -420.54498291015625, "logps/rejected": -514.2991943359375, "loss": 0.2183, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.5077044367790222, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8657665252685547, "rewards/student_margin": 3.3734707832336426, "rewards/teacher_margin": 0.0, "step": 4100 }, { "epoch": 0.64, "grad_norm": 13.4375, "learning_rate": 1.6926785656262417e-06, "logits/chosen": 2.2128641605377197, "logits/rejected": 2.1223859786987305, "logps/chosen": -350.88677978515625, "logps/rejected": -352.3712158203125, "loss": 0.2089, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.46475401520729065, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.104180097579956, "rewards/student_margin": 3.5689339637756348, "rewards/teacher_margin": 0.0, "step": 4110 }, { "epoch": 0.65, "grad_norm": 15.4375, "learning_rate": 1.6797454131463885e-06, "logits/chosen": 2.384117603302002, "logits/rejected": 2.4138007164001465, "logps/chosen": -337.30084228515625, "logps/rejected": -415.44427490234375, "loss": 0.1828, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.6626492738723755, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5069947242736816, "rewards/student_margin": 4.169643878936768, "rewards/teacher_margin": 0.0, "step": 4120 }, { "epoch": 0.65, "grad_norm": 10.3125, "learning_rate": 1.66683681459314e-06, "logits/chosen": 2.226247549057007, "logits/rejected": 2.3713502883911133, "logps/chosen": -337.50299072265625, "logps/rejected": -454.22039794921875, "loss": 0.2295, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.33039093017578125, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9453320503234863, "rewards/student_margin": 4.275723457336426, "rewards/teacher_margin": 0.0, "step": 4130 }, { "epoch": 0.65, "grad_norm": 19.375, "learning_rate": 1.653953156379185e-06, "logits/chosen": 2.285707950592041, "logits/rejected": 2.2883620262145996, "logps/chosen": -388.2783203125, "logps/rejected": -400.43951416015625, "loss": 0.2366, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.36054128408432007, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.81415057182312, "rewards/student_margin": 4.174691677093506, "rewards/teacher_margin": 0.0, "step": 4140 }, { "epoch": 0.65, "grad_norm": 10.125, "learning_rate": 1.641094824170638e-06, "logits/chosen": 2.2254040241241455, "logits/rejected": 2.362788200378418, "logps/chosen": -376.3564758300781, "logps/rejected": -426.7521057128906, "loss": 0.2549, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.43624091148376465, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5602798461914062, "rewards/student_margin": 3.996520519256592, "rewards/teacher_margin": 0.0, "step": 4150 }, { "epoch": 0.65, "grad_norm": 15.0, "learning_rate": 1.6282622028754897e-06, "logits/chosen": 2.130876064300537, "logits/rejected": 2.2034571170806885, "logps/chosen": -334.30328369140625, "logps/rejected": -442.67401123046875, "loss": 0.2429, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9439103007316589, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.286083221435547, "rewards/student_margin": 4.229994297027588, "rewards/teacher_margin": 0.0, "step": 4160 }, { "epoch": 0.65, "grad_norm": 17.625, "learning_rate": 1.6154556766320875e-06, "logits/chosen": 2.200023651123047, "logits/rejected": 2.3823628425598145, "logps/chosen": -353.2941589355469, "logps/rejected": -451.27099609375, "loss": 0.2403, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.637153685092926, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.889683961868286, "rewards/student_margin": 3.5268378257751465, "rewards/teacher_margin": 0.0, "step": 4170 }, { "epoch": 0.66, "grad_norm": 16.125, "learning_rate": 1.602675628797636e-06, "logits/chosen": 2.402381420135498, "logits/rejected": 2.3626885414123535, "logps/chosen": -345.10498046875, "logps/rejected": -379.0228271484375, "loss": 0.2619, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.6528035998344421, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7316203117370605, "rewards/student_margin": 3.3844237327575684, "rewards/teacher_margin": 0.0, "step": 4180 }, { "epoch": 0.66, "grad_norm": 8.3125, "learning_rate": 1.5899224419367206e-06, "logits/chosen": 2.3541531562805176, "logits/rejected": 1.941227912902832, "logps/chosen": -452.4227600097656, "logps/rejected": -503.11004638671875, "loss": 0.1664, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.0959924459457397, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.516533613204956, "rewards/student_margin": 4.612525463104248, "rewards/teacher_margin": 0.0, "step": 4190 }, { "epoch": 0.66, "grad_norm": 9.4375, "learning_rate": 1.5771964978098582e-06, "logits/chosen": 2.429255962371826, "logits/rejected": 2.2475154399871826, "logps/chosen": -410.1239318847656, "logps/rejected": -435.70172119140625, "loss": 0.1991, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.37092310190200806, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.40700101852417, "rewards/student_margin": 3.777924060821533, "rewards/teacher_margin": 0.0, "step": 4200 }, { "epoch": 0.66, "grad_norm": 11.4375, "learning_rate": 1.564498177362065e-06, "logits/chosen": 2.140536308288574, "logits/rejected": 2.2048118114471436, "logps/chosen": -376.2337646484375, "logps/rejected": -433.76104736328125, "loss": 0.1851, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.8298137784004211, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.1906914710998535, "rewards/student_margin": 3.020505428314209, "rewards/teacher_margin": 0.0, "step": 4210 }, { "epoch": 0.66, "grad_norm": 6.6875, "learning_rate": 1.5518278607114585e-06, "logits/chosen": 2.2253987789154053, "logits/rejected": 2.379819631576538, "logps/chosen": -366.1931457519531, "logps/rejected": -445.821044921875, "loss": 0.2006, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.30069810152053833, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3067867755889893, "rewards/student_margin": 3.6074843406677246, "rewards/teacher_margin": 0.0, "step": 4220 }, { "epoch": 0.66, "grad_norm": 12.125, "learning_rate": 1.539185927137874e-06, "logits/chosen": 2.2728793621063232, "logits/rejected": 2.1780149936676025, "logps/chosen": -380.3988342285156, "logps/rejected": -363.99566650390625, "loss": 0.1979, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.8607684373855591, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1549408435821533, "rewards/student_margin": 4.015708923339844, "rewards/teacher_margin": 0.0, "step": 4230 }, { "epoch": 0.66, "grad_norm": 16.875, "learning_rate": 1.526572755071514e-06, "logits/chosen": 2.2408108711242676, "logits/rejected": 2.1123244762420654, "logps/chosen": -365.9927673339844, "logps/rejected": -557.75439453125, "loss": 0.21, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.6940973997116089, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9921607971191406, "rewards/student_margin": 4.686258792877197, "rewards/teacher_margin": 0.0, "step": 4240 }, { "epoch": 0.67, "grad_norm": 6.90625, "learning_rate": 1.5139887220816182e-06, "logits/chosen": 2.3250720500946045, "logits/rejected": 2.1486306190490723, "logps/chosen": -404.0055847167969, "logps/rejected": -501.9654846191406, "loss": 0.17, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.47140803933143616, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.591813325881958, "rewards/student_margin": 4.0632219314575195, "rewards/teacher_margin": 0.0, "step": 4250 }, { "epoch": 0.67, "grad_norm": 15.0, "learning_rate": 1.5014342048651636e-06, "logits/chosen": 2.1650147438049316, "logits/rejected": 2.19319486618042, "logps/chosen": -426.5665588378906, "logps/rejected": -461.7464904785156, "loss": 0.2001, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.6052268743515015, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.958289384841919, "rewards/student_margin": 3.563516139984131, "rewards/teacher_margin": 0.0, "step": 4260 }, { "epoch": 0.67, "grad_norm": 8.1875, "learning_rate": 1.4889095792355842e-06, "logits/chosen": 2.345414638519287, "logits/rejected": 2.335036039352417, "logps/chosen": -376.9546813964844, "logps/rejected": -499.475830078125, "loss": 0.1785, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7843676209449768, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7308247089385986, "rewards/student_margin": 4.51519250869751, "rewards/teacher_margin": 0.0, "step": 4270 }, { "epoch": 0.67, "grad_norm": 16.5, "learning_rate": 1.4764152201115251e-06, "logits/chosen": 2.4229483604431152, "logits/rejected": 2.4653823375701904, "logps/chosen": -329.5719299316406, "logps/rejected": -410.22772216796875, "loss": 0.237, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.48011675477027893, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3626980781555176, "rewards/student_margin": 3.8428149223327637, "rewards/teacher_margin": 0.0, "step": 4280 }, { "epoch": 0.67, "grad_norm": 13.4375, "learning_rate": 1.4639515015056205e-06, "logits/chosen": 2.2521984577178955, "logits/rejected": 2.2531790733337402, "logps/chosen": -390.44970703125, "logps/rejected": -411.6861877441406, "loss": 0.2181, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5905542373657227, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.048703670501709, "rewards/student_margin": 3.6392579078674316, "rewards/teacher_margin": 0.0, "step": 4290 }, { "epoch": 0.67, "grad_norm": 14.8125, "learning_rate": 1.4515187965132916e-06, "logits/chosen": 2.148357629776001, "logits/rejected": 2.2932491302490234, "logps/chosen": -344.1498107910156, "logps/rejected": -498.25115966796875, "loss": 0.1877, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.588786244392395, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6832547187805176, "rewards/student_margin": 4.272041320800781, "rewards/teacher_margin": 0.0, "step": 4300 }, { "epoch": 0.68, "grad_norm": 16.875, "learning_rate": 1.4391174773015836e-06, "logits/chosen": 2.322120428085327, "logits/rejected": 2.115831136703491, "logps/chosen": -352.9857482910156, "logps/rejected": -432.473388671875, "loss": 0.1966, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.666633129119873, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7332310676574707, "rewards/student_margin": 4.399864196777344, "rewards/teacher_margin": 0.0, "step": 4310 }, { "epoch": 0.68, "grad_norm": 16.625, "learning_rate": 1.426747915098024e-06, "logits/chosen": 2.249241828918457, "logits/rejected": 2.341294765472412, "logps/chosen": -369.5724792480469, "logps/rejected": -423.00958251953125, "loss": 0.2558, "rewards/accuracies": 1.0, "rewards/chosen": 0.9959958791732788, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.716212749481201, "rewards/student_margin": 4.7122087478637695, "rewards/teacher_margin": 0.0, "step": 4320 }, { "epoch": 0.68, "grad_norm": 13.8125, "learning_rate": 1.4144104801795084e-06, "logits/chosen": 2.0937018394470215, "logits/rejected": 2.022745132446289, "logps/chosen": -397.8958740234375, "logps/rejected": -432.82342529296875, "loss": 0.2005, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7341985106468201, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.147383213043213, "rewards/student_margin": 3.8815817832946777, "rewards/teacher_margin": 0.0, "step": 4330 }, { "epoch": 0.68, "grad_norm": 16.5, "learning_rate": 1.4021055418612192e-06, "logits/chosen": 2.597449541091919, "logits/rejected": 2.3199515342712402, "logps/chosen": -427.53753662109375, "logps/rejected": -430.06573486328125, "loss": 0.2271, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.4447408616542816, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8538248538970947, "rewards/student_margin": 3.298565626144409, "rewards/teacher_margin": 0.0, "step": 4340 }, { "epoch": 0.68, "grad_norm": 13.4375, "learning_rate": 1.3898334684855647e-06, "logits/chosen": 2.2624268531799316, "logits/rejected": 2.2079737186431885, "logps/chosen": -392.041015625, "logps/rejected": -465.6962890625, "loss": 0.1794, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.9841591119766235, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.234818935394287, "rewards/student_margin": 4.218977928161621, "rewards/teacher_margin": 0.0, "step": 4350 }, { "epoch": 0.68, "grad_norm": 16.625, "learning_rate": 1.3775946274111632e-06, "logits/chosen": 2.1273396015167236, "logits/rejected": 2.158954381942749, "logps/chosen": -407.2088928222656, "logps/rejected": -520.5064086914062, "loss": 0.1732, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.7951123714447021, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5791778564453125, "rewards/student_margin": 4.374290466308594, "rewards/teacher_margin": 0.0, "step": 4360 }, { "epoch": 0.68, "grad_norm": 13.6875, "learning_rate": 1.3653893850018351e-06, "logits/chosen": 2.182701587677002, "logits/rejected": 2.1715781688690186, "logps/chosen": -378.4584045410156, "logps/rejected": -415.1776428222656, "loss": 0.1877, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.8146513104438782, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6057095527648926, "rewards/student_margin": 4.420360565185547, "rewards/teacher_margin": 0.0, "step": 4370 }, { "epoch": 0.69, "grad_norm": 21.125, "learning_rate": 1.353218106615643e-06, "logits/chosen": 2.305262804031372, "logits/rejected": 2.365464448928833, "logps/chosen": -386.16729736328125, "logps/rejected": -474.8497009277344, "loss": 0.1781, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.8675039410591125, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.257737159729004, "rewards/student_margin": 5.125240802764893, "rewards/teacher_margin": 0.0, "step": 4380 }, { "epoch": 0.69, "grad_norm": 11.1875, "learning_rate": 1.3410811565939522e-06, "logits/chosen": 2.42240571975708, "logits/rejected": 2.4300317764282227, "logps/chosen": -369.504150390625, "logps/rejected": -465.0870666503906, "loss": 0.2313, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.40142887830734253, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4254188537597656, "rewards/student_margin": 3.826847553253174, "rewards/teacher_margin": 0.0, "step": 4390 }, { "epoch": 0.69, "grad_norm": 12.5625, "learning_rate": 1.328978898250525e-06, "logits/chosen": 2.1981430053710938, "logits/rejected": 2.1189374923706055, "logps/chosen": -341.4212646484375, "logps/rejected": -423.22283935546875, "loss": 0.1943, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.8387746810913086, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -1.9836686849594116, "rewards/student_margin": 2.8224432468414307, "rewards/teacher_margin": 0.0, "step": 4400 }, { "epoch": 0.69, "grad_norm": 3.75, "learning_rate": 1.3169116938606452e-06, "logits/chosen": 2.318127155303955, "logits/rejected": 2.1642205715179443, "logps/chosen": -375.04132080078125, "logps/rejected": -454.688232421875, "loss": 0.1764, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.7740648984909058, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.350397825241089, "rewards/student_margin": 4.124463081359863, "rewards/teacher_margin": 0.0, "step": 4410 }, { "epoch": 0.69, "grad_norm": 11.4375, "learning_rate": 1.3048799046502728e-06, "logits/chosen": 2.4050726890563965, "logits/rejected": 2.185568332672119, "logps/chosen": -380.9561767578125, "logps/rejected": -455.29486083984375, "loss": 0.1758, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.9718208312988281, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.776228666305542, "rewards/student_margin": 4.748049736022949, "rewards/teacher_margin": 0.0, "step": 4420 }, { "epoch": 0.69, "grad_norm": 8.1875, "learning_rate": 1.292883890785232e-06, "logits/chosen": 2.292860746383667, "logits/rejected": 2.16355037689209, "logps/chosen": -374.21051025390625, "logps/rejected": -444.0502014160156, "loss": 0.1535, "rewards/accuracies": 1.0, "rewards/chosen": 1.2792543172836304, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.199206829071045, "rewards/student_margin": 4.478461265563965, "rewards/teacher_margin": 0.0, "step": 4430 }, { "epoch": 0.7, "grad_norm": 11.3125, "learning_rate": 1.2809240113604293e-06, "logits/chosen": 2.0508453845977783, "logits/rejected": 2.1279282569885254, "logps/chosen": -341.65167236328125, "logps/rejected": -451.89202880859375, "loss": 0.1692, "rewards/accuracies": 1.0, "rewards/chosen": 0.4559069573879242, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7608139514923096, "rewards/student_margin": 4.216721057891846, "rewards/teacher_margin": 0.0, "step": 4440 }, { "epoch": 0.7, "grad_norm": 17.875, "learning_rate": 1.269000624389104e-06, "logits/chosen": 2.210927963256836, "logits/rejected": 2.175581932067871, "logps/chosen": -379.268798828125, "logps/rejected": -477.5199279785156, "loss": 0.1872, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.6265663504600525, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1053733825683594, "rewards/student_margin": 3.7319397926330566, "rewards/teacher_margin": 0.0, "step": 4450 }, { "epoch": 0.7, "grad_norm": 5.65625, "learning_rate": 1.2571140867921108e-06, "logits/chosen": 2.4008982181549072, "logits/rejected": 2.1090166568756104, "logps/chosen": -398.1876220703125, "logps/rejected": -397.8680419921875, "loss": 0.1838, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.7496792078018188, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6183419227600098, "rewards/student_margin": 3.368021011352539, "rewards/teacher_margin": 0.0, "step": 4460 }, { "epoch": 0.7, "grad_norm": 13.125, "learning_rate": 1.2452647543872368e-06, "logits/chosen": 2.4607419967651367, "logits/rejected": 2.3071541786193848, "logps/chosen": -372.62646484375, "logps/rejected": -461.21038818359375, "loss": 0.1556, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 1.0401886701583862, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.642698287963867, "rewards/student_margin": 3.682887315750122, "rewards/teacher_margin": 0.0, "step": 4470 }, { "epoch": 0.7, "grad_norm": 15.1875, "learning_rate": 1.233452981878549e-06, "logits/chosen": 2.371138095855713, "logits/rejected": 2.393810272216797, "logps/chosen": -404.2279052734375, "logps/rejected": -474.05548095703125, "loss": 0.2227, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.9403649568557739, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.3432083129882812, "rewards/student_margin": 3.2835731506347656, "rewards/teacher_margin": 0.0, "step": 4480 }, { "epoch": 0.7, "grad_norm": 13.375, "learning_rate": 1.2216791228457778e-06, "logits/chosen": 2.3199284076690674, "logits/rejected": 2.1189212799072266, "logps/chosen": -398.3868713378906, "logps/rejected": -498.7290954589844, "loss": 0.1915, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 1.0342378616333008, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4660282135009766, "rewards/student_margin": 4.500266075134277, "rewards/teacher_margin": 0.0, "step": 4490 }, { "epoch": 0.71, "grad_norm": 14.75, "learning_rate": 1.20994352973373e-06, "logits/chosen": 2.329070568084717, "logits/rejected": 2.4055492877960205, "logps/chosen": -306.6795654296875, "logps/rejected": -418.5924377441406, "loss": 0.1846, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.3376057744026184, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2351009845733643, "rewards/student_margin": 3.572706937789917, "rewards/teacher_margin": 0.0, "step": 4500 }, { "epoch": 0.71, "grad_norm": 21.25, "learning_rate": 1.198246553841744e-06, "logits/chosen": 2.4479258060455322, "logits/rejected": 2.4245755672454834, "logps/chosen": -411.2915954589844, "logps/rejected": -503.4586486816406, "loss": 0.2047, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7828003168106079, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.8683807849884033, "rewards/student_margin": 4.651180267333984, "rewards/teacher_margin": 0.0, "step": 4510 }, { "epoch": 0.71, "grad_norm": 17.5, "learning_rate": 1.186588545313167e-06, "logits/chosen": 2.116206169128418, "logits/rejected": 2.336148977279663, "logps/chosen": -352.74383544921875, "logps/rejected": -510.85430908203125, "loss": 0.1851, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 1.0128092765808105, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4747490882873535, "rewards/student_margin": 4.487558364868164, "rewards/teacher_margin": 0.0, "step": 4520 }, { "epoch": 0.71, "grad_norm": 11.8125, "learning_rate": 1.1749698531248783e-06, "logits/chosen": 2.420536756515503, "logits/rejected": 2.2323310375213623, "logps/chosen": -407.015625, "logps/rejected": -500.64031982421875, "loss": 0.2255, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.019533798098564148, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.437556028366089, "rewards/student_margin": 3.45708966255188, "rewards/teacher_margin": 0.0, "step": 4530 }, { "epoch": 0.71, "grad_norm": 16.5, "learning_rate": 1.1633908250768407e-06, "logits/chosen": 2.346482992172241, "logits/rejected": 2.4074902534484863, "logps/chosen": -325.85723876953125, "logps/rejected": -453.3356018066406, "loss": 0.2102, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.7215293645858765, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.153322696685791, "rewards/student_margin": 4.874851703643799, "rewards/teacher_margin": 0.0, "step": 4540 }, { "epoch": 0.71, "grad_norm": 20.25, "learning_rate": 1.1518518077816925e-06, "logits/chosen": 2.187791347503662, "logits/rejected": 2.142148971557617, "logps/chosen": -395.7634582519531, "logps/rejected": -456.22747802734375, "loss": 0.228, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.20826086401939392, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.600189685821533, "rewards/student_margin": 3.808450698852539, "rewards/teacher_margin": 0.0, "step": 4550 }, { "epoch": 0.71, "grad_norm": 13.75, "learning_rate": 1.1403531466543647e-06, "logits/chosen": 2.1464340686798096, "logits/rejected": 2.1971054077148438, "logps/chosen": -392.3583068847656, "logps/rejected": -412.98822021484375, "loss": 0.1922, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.4458504319190979, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.70526123046875, "rewards/student_margin": 4.151111602783203, "rewards/teacher_margin": 0.0, "step": 4560 }, { "epoch": 0.72, "grad_norm": 8.5, "learning_rate": 1.1288951859017488e-06, "logits/chosen": 2.2134852409362793, "logits/rejected": 2.233588218688965, "logps/chosen": -365.6943359375, "logps/rejected": -487.455322265625, "loss": 0.1702, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.4695436954498291, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.899659156799316, "rewards/student_margin": 5.369202613830566, "rewards/teacher_margin": 0.0, "step": 4570 }, { "epoch": 0.72, "grad_norm": 10.875, "learning_rate": 1.1174782685123919e-06, "logits/chosen": 2.3453192710876465, "logits/rejected": 2.39570689201355, "logps/chosen": -367.35797119140625, "logps/rejected": -455.792724609375, "loss": 0.1856, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.45448359847068787, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.4926705360412598, "rewards/student_margin": 2.9471538066864014, "rewards/teacher_margin": 0.0, "step": 4580 }, { "epoch": 0.72, "grad_norm": 6.3125, "learning_rate": 1.106102736246225e-06, "logits/chosen": 2.1479272842407227, "logits/rejected": 2.2734103202819824, "logps/chosen": -314.1912536621094, "logps/rejected": -411.30517578125, "loss": 0.1825, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.38981834053993225, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4364051818847656, "rewards/student_margin": 3.826223373413086, "rewards/teacher_margin": 0.0, "step": 4590 }, { "epoch": 0.72, "grad_norm": 17.5, "learning_rate": 1.0947689296243352e-06, "logits/chosen": 2.3885436058044434, "logits/rejected": 2.036431312561035, "logps/chosen": -405.0794982910156, "logps/rejected": -389.13165283203125, "loss": 0.1469, "rewards/accuracies": 1.0, "rewards/chosen": 0.8896406888961792, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.8038108348846436, "rewards/student_margin": 4.693451404571533, "rewards/teacher_margin": 0.0, "step": 4600 }, { "epoch": 0.72, "grad_norm": 9.0625, "learning_rate": 1.0834771879187741e-06, "logits/chosen": 2.2399938106536865, "logits/rejected": 2.2264115810394287, "logps/chosen": -440.02838134765625, "logps/rejected": -500.850341796875, "loss": 0.158, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.08392997086048126, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.019566059112549, "rewards/student_margin": 4.103496074676514, "rewards/teacher_margin": 0.0, "step": 4610 }, { "epoch": 0.72, "grad_norm": 21.25, "learning_rate": 1.0722278491423998e-06, "logits/chosen": 2.449798107147217, "logits/rejected": 2.432065010070801, "logps/chosen": -409.23101806640625, "logps/rejected": -449.4249572753906, "loss": 0.2577, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.7606010437011719, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4439101219177246, "rewards/student_margin": 4.2045111656188965, "rewards/teacher_margin": 0.0, "step": 4620 }, { "epoch": 0.73, "grad_norm": 12.75, "learning_rate": 1.0610212500387584e-06, "logits/chosen": 2.126857280731201, "logits/rejected": 2.017108201980591, "logps/chosen": -434.02685546875, "logps/rejected": -429.6539611816406, "loss": 0.2105, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.8691893815994263, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.4859824180603027, "rewards/student_margin": 3.3551719188690186, "rewards/teacher_margin": 0.0, "step": 4630 }, { "epoch": 0.73, "grad_norm": 15.875, "learning_rate": 1.049857726072005e-06, "logits/chosen": 2.146465539932251, "logits/rejected": 2.216127395629883, "logps/chosen": -342.6104431152344, "logps/rejected": -440.58135986328125, "loss": 0.2064, "rewards/accuracies": 1.0, "rewards/chosen": 0.4689766764640808, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6667208671569824, "rewards/student_margin": 3.135697364807129, "rewards/teacher_margin": 0.0, "step": 4640 }, { "epoch": 0.73, "grad_norm": 15.8125, "learning_rate": 1.038737611416861e-06, "logits/chosen": 2.1464524269104004, "logits/rejected": 2.1502575874328613, "logps/chosen": -441.51361083984375, "logps/rejected": -526.0476684570312, "loss": 0.1561, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6486836671829224, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4056506156921387, "rewards/student_margin": 4.05433464050293, "rewards/teacher_margin": 0.0, "step": 4650 }, { "epoch": 0.73, "grad_norm": 8.25, "learning_rate": 1.0276612389486112e-06, "logits/chosen": 2.2533810138702393, "logits/rejected": 2.3166136741638184, "logps/chosen": -338.15203857421875, "logps/rejected": -424.89080810546875, "loss": 0.2512, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3315960764884949, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6144444942474365, "rewards/student_margin": 2.946040630340576, "rewards/teacher_margin": 0.0, "step": 4660 }, { "epoch": 0.73, "grad_norm": 15.125, "learning_rate": 1.0166289402331391e-06, "logits/chosen": 2.1138951778411865, "logits/rejected": 2.0586953163146973, "logps/chosen": -387.335205078125, "logps/rejected": -436.0008850097656, "loss": 0.222, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.312610387802124, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6769402027130127, "rewards/student_margin": 3.989550828933716, "rewards/teacher_margin": 0.0, "step": 4670 }, { "epoch": 0.73, "grad_norm": 13.1875, "learning_rate": 1.0056410455170013e-06, "logits/chosen": 2.418975353240967, "logits/rejected": 2.5750701427459717, "logps/chosen": -408.5782165527344, "logps/rejected": -472.0025329589844, "loss": 0.1739, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.5650798082351685, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.8151066303253174, "rewards/student_margin": 4.380185604095459, "rewards/teacher_margin": 0.0, "step": 4680 }, { "epoch": 0.74, "grad_norm": 15.875, "learning_rate": 9.946978837175418e-07, "logits/chosen": 2.336592197418213, "logits/rejected": 2.0165512561798096, "logps/chosen": -403.698486328125, "logps/rejected": -394.62420654296875, "loss": 0.1765, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.6634343862533569, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.845256805419922, "rewards/student_margin": 4.508691310882568, "rewards/teacher_margin": 0.0, "step": 4690 }, { "epoch": 0.74, "grad_norm": 12.625, "learning_rate": 9.837997824130468e-07, "logits/chosen": 2.1321518421173096, "logits/rejected": 2.27929949760437, "logps/chosen": -311.05572509765625, "logps/rejected": -413.9977111816406, "loss": 0.2011, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.5490480661392212, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.0460591316223145, "rewards/student_margin": 4.595107078552246, "rewards/teacher_margin": 0.0, "step": 4700 }, { "epoch": 0.74, "grad_norm": 9.9375, "learning_rate": 9.729470678329375e-07, "logits/chosen": 2.4814484119415283, "logits/rejected": 2.428584575653076, "logps/chosen": -383.8559265136719, "logps/rejected": -449.4302673339844, "loss": 0.155, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.6394717693328857, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.772118091583252, "rewards/student_margin": 4.411590099334717, "rewards/teacher_margin": 0.0, "step": 4710 }, { "epoch": 0.74, "grad_norm": 14.875, "learning_rate": 9.62140064848007e-07, "logits/chosen": 2.1247520446777344, "logits/rejected": 2.2458348274230957, "logps/chosen": -342.9242858886719, "logps/rejected": -501.0965270996094, "loss": 0.1785, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.0699262022972107, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.375868797302246, "rewards/student_margin": 4.44579553604126, "rewards/teacher_margin": 0.0, "step": 4720 }, { "epoch": 0.74, "grad_norm": 20.375, "learning_rate": 9.513790969606926e-07, "logits/chosen": 2.2374188899993896, "logits/rejected": 2.1970958709716797, "logps/chosen": -364.10986328125, "logps/rejected": -454.8092346191406, "loss": 0.2483, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3717343211174011, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5343616008758545, "rewards/student_margin": 3.9060962200164795, "rewards/teacher_margin": 0.0, "step": 4730 }, { "epoch": 0.74, "grad_norm": 10.75, "learning_rate": 9.406644862953923e-07, "logits/chosen": 2.2181410789489746, "logits/rejected": 2.2355470657348633, "logps/chosen": -345.8125915527344, "logps/rejected": -498.0513610839844, "loss": 0.157, "rewards/accuracies": 1.0, "rewards/chosen": 0.5674850344657898, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.480647087097168, "rewards/student_margin": 5.048131942749023, "rewards/teacher_margin": 0.0, "step": 4740 }, { "epoch": 0.74, "grad_norm": 20.25, "learning_rate": 9.299965535888245e-07, "logits/chosen": 2.34041428565979, "logits/rejected": 2.301543951034546, "logps/chosen": -419.5838928222656, "logps/rejected": -401.06060791015625, "loss": 0.25, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5575419068336487, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0623602867126465, "rewards/student_margin": 3.6199023723602295, "rewards/teacher_margin": 0.0, "step": 4750 }, { "epoch": 0.75, "grad_norm": 12.4375, "learning_rate": 9.193756181804248e-07, "logits/chosen": 2.386845111846924, "logits/rejected": 2.337221622467041, "logps/chosen": -344.47589111328125, "logps/rejected": -412.7548828125, "loss": 0.2004, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.9086974859237671, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7918388843536377, "rewards/student_margin": 4.700536727905273, "rewards/teacher_margin": 0.0, "step": 4760 }, { "epoch": 0.75, "grad_norm": 5.09375, "learning_rate": 9.088019980027862e-07, "logits/chosen": 2.342294692993164, "logits/rejected": 2.151555299758911, "logps/chosen": -328.8809509277344, "logps/rejected": -380.1582946777344, "loss": 0.2133, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.7753934860229492, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.111673355102539, "rewards/student_margin": 2.8870668411254883, "rewards/teacher_margin": 0.0, "step": 4770 }, { "epoch": 0.75, "grad_norm": 16.25, "learning_rate": 8.982760095721435e-07, "logits/chosen": 2.191879987716675, "logits/rejected": 2.237039089202881, "logps/chosen": -368.62066650390625, "logps/rejected": -441.68994140625, "loss": 0.1902, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7186470031738281, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.382054090499878, "rewards/student_margin": 4.100700855255127, "rewards/teacher_margin": 0.0, "step": 4780 }, { "epoch": 0.75, "grad_norm": 10.25, "learning_rate": 8.877979679789003e-07, "logits/chosen": 2.0346829891204834, "logits/rejected": 2.274494171142578, "logps/chosen": -369.5391845703125, "logps/rejected": -485.61328125, "loss": 0.2107, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.6239652037620544, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.454137325286865, "rewards/student_margin": 5.0781025886535645, "rewards/teacher_margin": 0.0, "step": 4790 }, { "epoch": 0.75, "grad_norm": 10.6875, "learning_rate": 8.773681868781922e-07, "logits/chosen": 2.3317553997039795, "logits/rejected": 2.253122568130493, "logps/chosen": -434.63995361328125, "logps/rejected": -419.7216796875, "loss": 0.1411, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.4950445294380188, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.237788677215576, "rewards/student_margin": 3.732832670211792, "rewards/teacher_margin": 0.0, "step": 4800 }, { "epoch": 0.75, "grad_norm": 8.0625, "learning_rate": 8.669869784805004e-07, "logits/chosen": 2.1285929679870605, "logits/rejected": 2.1949281692504883, "logps/chosen": -370.31927490234375, "logps/rejected": -489.98406982421875, "loss": 0.2371, "rewards/accuracies": 1.0, "rewards/chosen": 0.8148528933525085, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.862879991531372, "rewards/student_margin": 4.677732944488525, "rewards/teacher_margin": 0.0, "step": 4810 }, { "epoch": 0.76, "grad_norm": 20.5, "learning_rate": 8.566546535423067e-07, "logits/chosen": 2.423316478729248, "logits/rejected": 2.033618450164795, "logps/chosen": -412.862548828125, "logps/rejected": -389.4032287597656, "loss": 0.2452, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.682807207107544, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9670217037200928, "rewards/student_margin": 3.649829387664795, "rewards/teacher_margin": 0.0, "step": 4820 }, { "epoch": 0.76, "grad_norm": 17.625, "learning_rate": 8.463715213567889e-07, "logits/chosen": 2.0933845043182373, "logits/rejected": 1.9579381942749023, "logps/chosen": -339.6474914550781, "logps/rejected": -370.60723876953125, "loss": 0.1749, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.44126051664352417, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.195952892303467, "rewards/student_margin": 3.6372132301330566, "rewards/teacher_margin": 0.0, "step": 4830 }, { "epoch": 0.76, "grad_norm": 20.625, "learning_rate": 8.361378897445643e-07, "logits/chosen": 2.3362605571746826, "logits/rejected": 2.3216171264648438, "logps/chosen": -327.75140380859375, "logps/rejected": -417.5182189941406, "loss": 0.2222, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.7961492538452148, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.203826904296875, "rewards/student_margin": 3.9999759197235107, "rewards/teacher_margin": 0.0, "step": 4840 }, { "epoch": 0.76, "grad_norm": 13.0625, "learning_rate": 8.259540650444736e-07, "logits/chosen": 2.3477838039398193, "logits/rejected": 2.3859994411468506, "logps/chosen": -344.9651794433594, "logps/rejected": -436.0146484375, "loss": 0.1586, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.4386192262172699, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.499537944793701, "rewards/student_margin": 3.938157320022583, "rewards/teacher_margin": 0.0, "step": 4850 }, { "epoch": 0.76, "grad_norm": 12.625, "learning_rate": 8.15820352104412e-07, "logits/chosen": 2.174537420272827, "logits/rejected": 2.066080093383789, "logps/chosen": -367.1825256347656, "logps/rejected": -414.8009338378906, "loss": 0.2105, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8279439806938171, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2905402183532715, "rewards/student_margin": 4.118483543395996, "rewards/teacher_margin": 0.0, "step": 4860 }, { "epoch": 0.76, "grad_norm": 11.375, "learning_rate": 8.057370542722032e-07, "logits/chosen": 2.274807929992676, "logits/rejected": 2.10182523727417, "logps/chosen": -345.66796875, "logps/rejected": -375.68414306640625, "loss": 0.2158, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.39200183749198914, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.376962184906006, "rewards/student_margin": 3.7689642906188965, "rewards/teacher_margin": 0.0, "step": 4870 }, { "epoch": 0.76, "grad_norm": 17.75, "learning_rate": 7.957044733865188e-07, "logits/chosen": 2.2859017848968506, "logits/rejected": 1.9933927059173584, "logps/chosen": -386.2018737792969, "logps/rejected": -390.8060302734375, "loss": 0.1807, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.6668523550033569, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3219590187072754, "rewards/student_margin": 3.98881196975708, "rewards/teacher_margin": 0.0, "step": 4880 }, { "epoch": 0.77, "grad_norm": 10.3125, "learning_rate": 7.857229097678431e-07, "logits/chosen": 2.1686654090881348, "logits/rejected": 2.19273042678833, "logps/chosen": -346.4832458496094, "logps/rejected": -437.9906311035156, "loss": 0.2168, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.984961986541748, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6690850257873535, "rewards/student_margin": 3.6540470123291016, "rewards/teacher_margin": 0.0, "step": 4890 }, { "epoch": 0.77, "grad_norm": 13.9375, "learning_rate": 7.75792662209483e-07, "logits/chosen": 2.3466107845306396, "logits/rejected": 2.2471745014190674, "logps/chosen": -384.7683410644531, "logps/rejected": -414.97515869140625, "loss": 0.2204, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.7697904109954834, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.367492198944092, "rewards/student_margin": 4.137282848358154, "rewards/teacher_margin": 0.0, "step": 4900 }, { "epoch": 0.77, "grad_norm": 13.9375, "learning_rate": 7.659140279686236e-07, "logits/chosen": 2.5020437240600586, "logits/rejected": 2.4461376667022705, "logps/chosen": -391.22064208984375, "logps/rejected": -430.6355895996094, "loss": 0.2197, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5198295712471008, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.743666410446167, "rewards/student_margin": 3.2634963989257812, "rewards/teacher_margin": 0.0, "step": 4910 }, { "epoch": 0.77, "grad_norm": 7.84375, "learning_rate": 7.560873027574297e-07, "logits/chosen": 2.1466622352600098, "logits/rejected": 2.194134473800659, "logps/chosen": -350.7427673339844, "logps/rejected": -499.7720642089844, "loss": 0.1552, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.9746049642562866, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2624409198760986, "rewards/student_margin": 4.237045764923096, "rewards/teacher_margin": 0.0, "step": 4920 }, { "epoch": 0.77, "grad_norm": 16.125, "learning_rate": 7.463127807341966e-07, "logits/chosen": 2.1758341789245605, "logits/rejected": 2.2997305393218994, "logps/chosen": -376.2896423339844, "logps/rejected": -482.694580078125, "loss": 0.2343, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.24577435851097107, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.2431139945983887, "rewards/student_margin": 2.4888882637023926, "rewards/teacher_margin": 0.0, "step": 4930 }, { "epoch": 0.77, "grad_norm": 5.03125, "learning_rate": 7.365907544945398e-07, "logits/chosen": 2.3579952716827393, "logits/rejected": 2.104785203933716, "logps/chosen": -389.17755126953125, "logps/rejected": -411.75677490234375, "loss": 0.1862, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6767646670341492, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9576096534729004, "rewards/student_margin": 3.6343741416931152, "rewards/teacher_margin": 0.0, "step": 4940 }, { "epoch": 0.78, "grad_norm": 13.8125, "learning_rate": 7.269215150626391e-07, "logits/chosen": 2.380134105682373, "logits/rejected": 2.134861946105957, "logps/chosen": -450.21630859375, "logps/rejected": -482.78582763671875, "loss": 0.1934, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.784981369972229, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.2988996505737305, "rewards/student_margin": 5.08388090133667, "rewards/teacher_margin": 0.0, "step": 4950 }, { "epoch": 0.78, "grad_norm": 11.875, "learning_rate": 7.173053518825274e-07, "logits/chosen": 2.4318015575408936, "logits/rejected": 2.4395248889923096, "logps/chosen": -408.6988220214844, "logps/rejected": -433.99249267578125, "loss": 0.1678, "rewards/accuracies": 1.0, "rewards/chosen": 1.3959100246429443, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5151829719543457, "rewards/student_margin": 4.911092758178711, "rewards/teacher_margin": 0.0, "step": 4960 }, { "epoch": 0.78, "grad_norm": 9.5625, "learning_rate": 7.077425528094259e-07, "logits/chosen": 2.3370726108551025, "logits/rejected": 2.0847952365875244, "logps/chosen": -367.232666015625, "logps/rejected": -416.49627685546875, "loss": 0.2276, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 1.0696319341659546, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.059650897979736, "rewards/student_margin": 5.1292829513549805, "rewards/teacher_margin": 0.0, "step": 4970 }, { "epoch": 0.78, "grad_norm": 8.375, "learning_rate": 6.982334041011249e-07, "logits/chosen": 2.067645788192749, "logits/rejected": 2.289499044418335, "logps/chosen": -327.84539794921875, "logps/rejected": -490.4068298339844, "loss": 0.176, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5802655816078186, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.104984283447266, "rewards/student_margin": 4.685250282287598, "rewards/teacher_margin": 0.0, "step": 4980 }, { "epoch": 0.78, "grad_norm": 10.4375, "learning_rate": 6.887781904094184e-07, "logits/chosen": 2.1711831092834473, "logits/rejected": 2.3205275535583496, "logps/chosen": -363.71820068359375, "logps/rejected": -426.7178649902344, "loss": 0.1588, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.933092474937439, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.714293956756592, "rewards/student_margin": 3.6473865509033203, "rewards/teacher_margin": 0.0, "step": 4990 }, { "epoch": 0.78, "grad_norm": 7.875, "learning_rate": 6.793771947715808e-07, "logits/chosen": 2.2875924110412598, "logits/rejected": 2.275660991668701, "logps/chosen": -342.6695861816406, "logps/rejected": -441.390625, "loss": 0.2059, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.6675311326980591, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.742429733276367, "rewards/student_margin": 4.409960746765137, "rewards/teacher_margin": 0.0, "step": 5000 }, { "epoch": 0.79, "grad_norm": 17.625, "learning_rate": 6.700306986018973e-07, "logits/chosen": 2.366389751434326, "logits/rejected": 2.5010457038879395, "logps/chosen": -379.03656005859375, "logps/rejected": -378.0776672363281, "loss": 0.1603, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.6534426808357239, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.616217613220215, "rewards/student_margin": 3.269660234451294, "rewards/teacher_margin": 0.0, "step": 5010 }, { "epoch": 0.79, "grad_norm": 14.375, "learning_rate": 6.607389816832352e-07, "logits/chosen": 2.5686137676239014, "logits/rejected": 2.325084924697876, "logps/chosen": -376.7912902832031, "logps/rejected": -432.4188537597656, "loss": 0.2145, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.040656328201294, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2443325519561768, "rewards/student_margin": 4.284988880157471, "rewards/teacher_margin": 0.0, "step": 5020 }, { "epoch": 0.79, "grad_norm": 10.9375, "learning_rate": 6.515023221586722e-07, "logits/chosen": 2.102614164352417, "logits/rejected": 2.1724472045898438, "logps/chosen": -371.5576171875, "logps/rejected": -437.1917419433594, "loss": 0.1723, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.6238963603973389, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.420559406280518, "rewards/student_margin": 5.044455528259277, "rewards/teacher_margin": 0.0, "step": 5030 }, { "epoch": 0.79, "grad_norm": 16.375, "learning_rate": 6.423209965231694e-07, "logits/chosen": 2.3633923530578613, "logits/rejected": 2.519592761993408, "logps/chosen": -396.9071350097656, "logps/rejected": -478.1936950683594, "loss": 0.216, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4063758850097656, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0615077018737793, "rewards/student_margin": 3.467883348464966, "rewards/teacher_margin": 0.0, "step": 5040 }, { "epoch": 0.79, "grad_norm": 13.6875, "learning_rate": 6.331952796152943e-07, "logits/chosen": 2.1723713874816895, "logits/rejected": 2.2117018699645996, "logps/chosen": -392.2982482910156, "logps/rejected": -464.2071838378906, "loss": 0.195, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.5718798637390137, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.630582094192505, "rewards/student_margin": 4.202462196350098, "rewards/teacher_margin": 0.0, "step": 5050 }, { "epoch": 0.79, "grad_norm": 10.3125, "learning_rate": 6.241254446089942e-07, "logits/chosen": 2.3491721153259277, "logits/rejected": 2.3552095890045166, "logps/chosen": -367.91943359375, "logps/rejected": -416.3814392089844, "loss": 0.1929, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5121696591377258, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6782596111297607, "rewards/student_margin": 4.190428733825684, "rewards/teacher_margin": 0.0, "step": 5060 }, { "epoch": 0.79, "grad_norm": 5.71875, "learning_rate": 6.151117630054185e-07, "logits/chosen": 2.196230411529541, "logits/rejected": 2.012369394302368, "logps/chosen": -362.2015380859375, "logps/rejected": -374.1309509277344, "loss": 0.2154, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.6486814618110657, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.089136838912964, "rewards/student_margin": 3.7378182411193848, "rewards/teacher_margin": 0.0, "step": 5070 }, { "epoch": 0.8, "grad_norm": 18.75, "learning_rate": 6.06154504624791e-07, "logits/chosen": 2.2344307899475098, "logits/rejected": 2.4990742206573486, "logps/chosen": -376.5575256347656, "logps/rejected": -451.63751220703125, "loss": 0.1806, "rewards/accuracies": 1.0, "rewards/chosen": 0.8733956217765808, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4088923931121826, "rewards/student_margin": 4.282288551330566, "rewards/teacher_margin": 0.0, "step": 5080 }, { "epoch": 0.8, "grad_norm": 10.5625, "learning_rate": 5.972539375983344e-07, "logits/chosen": 2.348113775253296, "logits/rejected": 2.357048511505127, "logps/chosen": -376.7982177734375, "logps/rejected": -468.2998962402344, "loss": 0.1946, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9349287152290344, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.812060594558716, "rewards/student_margin": 4.7469892501831055, "rewards/teacher_margin": 0.0, "step": 5090 }, { "epoch": 0.8, "grad_norm": 17.625, "learning_rate": 5.884103283602422e-07, "logits/chosen": 2.100098133087158, "logits/rejected": 2.2741966247558594, "logps/chosen": -395.5623779296875, "logps/rejected": -436.90435791015625, "loss": 0.2906, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5344882011413574, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8020853996276855, "rewards/student_margin": 3.336573839187622, "rewards/teacher_margin": 0.0, "step": 5100 }, { "epoch": 0.8, "grad_norm": 15.375, "learning_rate": 5.79623941639704e-07, "logits/chosen": 2.223958969116211, "logits/rejected": 2.1651275157928467, "logps/chosen": -346.91461181640625, "logps/rejected": -437.204833984375, "loss": 0.1606, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.6021250486373901, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4132282733917236, "rewards/student_margin": 4.015353202819824, "rewards/teacher_margin": 0.0, "step": 5110 }, { "epoch": 0.8, "grad_norm": 16.75, "learning_rate": 5.708950404529812e-07, "logits/chosen": 1.9179728031158447, "logits/rejected": 2.0714385509490967, "logps/chosen": -367.74188232421875, "logps/rejected": -471.6404724121094, "loss": 0.2615, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.6238868832588196, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.788740634918213, "rewards/student_margin": 4.412627220153809, "rewards/teacher_margin": 0.0, "step": 5120 }, { "epoch": 0.8, "grad_norm": 18.75, "learning_rate": 5.622238860955329e-07, "logits/chosen": 2.075538158416748, "logits/rejected": 2.3453330993652344, "logps/chosen": -335.178466796875, "logps/rejected": -466.2454528808594, "loss": 0.2607, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.6069015264511108, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7469239234924316, "rewards/student_margin": 3.353825330734253, "rewards/teacher_margin": 0.0, "step": 5130 }, { "epoch": 0.81, "grad_norm": 10.5625, "learning_rate": 5.536107381341943e-07, "logits/chosen": 2.3911397457122803, "logits/rejected": 2.3547255992889404, "logps/chosen": -370.7298889160156, "logps/rejected": -437.2976989746094, "loss": 0.2104, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 1.0226166248321533, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.329169750213623, "rewards/student_margin": 4.3517866134643555, "rewards/teacher_margin": 0.0, "step": 5140 }, { "epoch": 0.81, "grad_norm": 14.4375, "learning_rate": 5.450558543994094e-07, "logits/chosen": 2.1940689086914062, "logits/rejected": 2.181312322616577, "logps/chosen": -386.1596374511719, "logps/rejected": -445.49871826171875, "loss": 0.1879, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2865888178348541, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.136366844177246, "rewards/student_margin": 4.422955513000488, "rewards/teacher_margin": 0.0, "step": 5150 }, { "epoch": 0.81, "grad_norm": 13.125, "learning_rate": 5.365594909775077e-07, "logits/chosen": 2.139578342437744, "logits/rejected": 2.216346025466919, "logps/chosen": -337.63128662109375, "logps/rejected": -475.4978942871094, "loss": 0.1607, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.7066879868507385, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5730011463165283, "rewards/student_margin": 4.279689788818359, "rewards/teacher_margin": 0.0, "step": 5160 }, { "epoch": 0.81, "grad_norm": 8.5, "learning_rate": 5.281219022030423e-07, "logits/chosen": 2.2072532176971436, "logits/rejected": 2.321816921234131, "logps/chosen": -337.2447509765625, "logps/rejected": -397.9025573730469, "loss": 0.2241, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.8121663928031921, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1043238639831543, "rewards/student_margin": 3.916489839553833, "rewards/teacher_margin": 0.0, "step": 5170 }, { "epoch": 0.81, "grad_norm": 12.5625, "learning_rate": 5.197433406511771e-07, "logits/chosen": 2.3946003913879395, "logits/rejected": 2.136025905609131, "logps/chosen": -406.6019592285156, "logps/rejected": -414.6551818847656, "loss": 0.1987, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.39392292499542236, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7375683784484863, "rewards/student_margin": 4.131491184234619, "rewards/teacher_margin": 0.0, "step": 5180 }, { "epoch": 0.81, "grad_norm": 13.375, "learning_rate": 5.114240571301205e-07, "logits/chosen": 2.0955450534820557, "logits/rejected": 2.3253350257873535, "logps/chosen": -341.7557067871094, "logps/rejected": -452.169189453125, "loss": 0.1911, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": -0.11268768459558487, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5104000568389893, "rewards/student_margin": 3.397712230682373, "rewards/teacher_margin": 0.0, "step": 5190 }, { "epoch": 0.82, "grad_norm": 8.9375, "learning_rate": 5.031643006736256e-07, "logits/chosen": 2.2627463340759277, "logits/rejected": 2.274893045425415, "logps/chosen": -369.69403076171875, "logps/rejected": -378.95947265625, "loss": 0.1755, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.41244229674339294, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2940800189971924, "rewards/student_margin": 3.7065224647521973, "rewards/teacher_margin": 0.0, "step": 5200 }, { "epoch": 0.82, "grad_norm": 10.5, "learning_rate": 4.949643185335288e-07, "logits/chosen": 2.3585586547851562, "logits/rejected": 1.9730157852172852, "logps/chosen": -391.43206787109375, "logps/rejected": -386.8642272949219, "loss": 0.1989, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.865001380443573, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0373826026916504, "rewards/student_margin": 3.902383804321289, "rewards/teacher_margin": 0.0, "step": 5210 }, { "epoch": 0.82, "grad_norm": 6.6875, "learning_rate": 4.868243561723535e-07, "logits/chosen": 2.445131301879883, "logits/rejected": 2.4065158367156982, "logps/chosen": -373.07672119140625, "logps/rejected": -498.70513916015625, "loss": 0.1955, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.8417023420333862, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.124225616455078, "rewards/student_margin": 4.965927600860596, "rewards/teacher_margin": 0.0, "step": 5220 }, { "epoch": 0.82, "grad_norm": 16.25, "learning_rate": 4.787446572559582e-07, "logits/chosen": 2.282120943069458, "logits/rejected": 2.0630853176116943, "logps/chosen": -400.98980712890625, "logps/rejected": -444.30633544921875, "loss": 0.1923, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.38360315561294556, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1675338745117188, "rewards/student_margin": 3.5511374473571777, "rewards/teacher_margin": 0.0, "step": 5230 }, { "epoch": 0.82, "grad_norm": 12.0, "learning_rate": 4.7072546364624383e-07, "logits/chosen": 2.4046263694763184, "logits/rejected": 2.2510952949523926, "logps/chosen": -409.61676025390625, "logps/rejected": -418.1024475097656, "loss": 0.2234, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.591124951839447, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.495370388031006, "rewards/student_margin": 3.0864953994750977, "rewards/teacher_margin": 0.0, "step": 5240 }, { "epoch": 0.82, "grad_norm": 10.5, "learning_rate": 4.6276701539391567e-07, "logits/chosen": 2.2259552478790283, "logits/rejected": 2.3141391277313232, "logps/chosen": -397.34649658203125, "logps/rejected": -475.9881896972656, "loss": 0.2202, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.9363609552383423, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6788430213928223, "rewards/student_margin": 4.615203857421875, "rewards/teacher_margin": 0.0, "step": 5250 }, { "epoch": 0.82, "grad_norm": 6.46875, "learning_rate": 4.548695507312942e-07, "logits/chosen": 1.9528383016586304, "logits/rejected": 2.267435073852539, "logps/chosen": -347.9878234863281, "logps/rejected": -475.6080017089844, "loss": 0.2089, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.15762636065483093, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.2009596824646, "rewards/student_margin": 4.358586311340332, "rewards/teacher_margin": 0.0, "step": 5260 }, { "epoch": 0.83, "grad_norm": 13.5625, "learning_rate": 4.4703330606518657e-07, "logits/chosen": 2.3648600578308105, "logits/rejected": 2.217665910720825, "logps/chosen": -421.70489501953125, "logps/rejected": -441.8717346191406, "loss": 0.1828, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.7234677672386169, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1376452445983887, "rewards/student_margin": 3.8611130714416504, "rewards/teacher_margin": 0.0, "step": 5270 }, { "epoch": 0.83, "grad_norm": 9.75, "learning_rate": 4.392585159698087e-07, "logits/chosen": 2.0349690914154053, "logits/rejected": 2.2352256774902344, "logps/chosen": -315.5826416015625, "logps/rejected": -415.2647399902344, "loss": 0.1774, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.36136603355407715, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7365405559539795, "rewards/student_margin": 4.097906589508057, "rewards/teacher_margin": 0.0, "step": 5280 }, { "epoch": 0.83, "grad_norm": 10.5625, "learning_rate": 4.3154541317976306e-07, "logits/chosen": 1.9423707723617554, "logits/rejected": 1.8010339736938477, "logps/chosen": -394.10748291015625, "logps/rejected": -423.43292236328125, "loss": 0.1501, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.6639676690101624, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7518794536590576, "rewards/student_margin": 4.415846824645996, "rewards/teacher_margin": 0.0, "step": 5290 }, { "epoch": 0.83, "grad_norm": 11.3125, "learning_rate": 4.2389422858307244e-07, "logits/chosen": 2.3784549236297607, "logits/rejected": 2.2591519355773926, "logps/chosen": -415.6517639160156, "logps/rejected": -512.3271484375, "loss": 0.1826, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.4276931881904602, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4912402629852295, "rewards/student_margin": 3.918933391571045, "rewards/teacher_margin": 0.0, "step": 5300 }, { "epoch": 0.83, "grad_norm": 11.125, "learning_rate": 4.163051912142685e-07, "logits/chosen": 2.1806135177612305, "logits/rejected": 2.221679925918579, "logps/chosen": -446.80072021484375, "logps/rejected": -483.662353515625, "loss": 0.1645, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.5257972478866577, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.635450839996338, "rewards/student_margin": 4.161247730255127, "rewards/teacher_margin": 0.0, "step": 5310 }, { "epoch": 0.83, "grad_norm": 15.0, "learning_rate": 4.087785282475351e-07, "logits/chosen": 2.2724108695983887, "logits/rejected": 2.196840286254883, "logps/chosen": -337.02105712890625, "logps/rejected": -418.55426025390625, "loss": 0.188, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.7632239460945129, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2096943855285645, "rewards/student_margin": 3.9729180335998535, "rewards/teacher_margin": 0.0, "step": 5320 }, { "epoch": 0.84, "grad_norm": 8.0625, "learning_rate": 4.0131446498990896e-07, "logits/chosen": 2.383780002593994, "logits/rejected": 2.356071949005127, "logps/chosen": -391.28778076171875, "logps/rejected": -500.7630310058594, "loss": 0.164, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.28271356225013733, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9272048473358154, "rewards/student_margin": 4.209918975830078, "rewards/teacher_margin": 0.0, "step": 5330 }, { "epoch": 0.84, "grad_norm": 16.0, "learning_rate": 3.939132248745342e-07, "logits/chosen": 2.202019453048706, "logits/rejected": 2.4069035053253174, "logps/chosen": -416.1748962402344, "logps/rejected": -442.7652282714844, "loss": 0.1987, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8236830830574036, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.8221423625946045, "rewards/student_margin": 4.645825386047363, "rewards/teacher_margin": 0.0, "step": 5340 }, { "epoch": 0.84, "grad_norm": 9.5625, "learning_rate": 3.8657502945397385e-07, "logits/chosen": 2.3065547943115234, "logits/rejected": 2.2571303844451904, "logps/chosen": -417.66534423828125, "logps/rejected": -386.8114318847656, "loss": 0.1701, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6518687009811401, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.794309377670288, "rewards/student_margin": 3.4461779594421387, "rewards/teacher_margin": 0.0, "step": 5350 }, { "epoch": 0.84, "grad_norm": 12.0625, "learning_rate": 3.7930009839358016e-07, "logits/chosen": 2.056572675704956, "logits/rejected": 2.1151654720306396, "logps/chosen": -386.936767578125, "logps/rejected": -505.4623107910156, "loss": 0.1808, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.6277281045913696, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1349220275878906, "rewards/student_margin": 3.7626500129699707, "rewards/teacher_margin": 0.0, "step": 5360 }, { "epoch": 0.84, "grad_norm": 14.0625, "learning_rate": 3.7208864946491565e-07, "logits/chosen": 2.2746710777282715, "logits/rejected": 2.2143688201904297, "logps/chosen": -402.032470703125, "logps/rejected": -520.7393798828125, "loss": 0.1978, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.6573580503463745, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.340886116027832, "rewards/student_margin": 4.998244285583496, "rewards/teacher_margin": 0.0, "step": 5370 }, { "epoch": 0.84, "grad_norm": 12.125, "learning_rate": 3.6494089853923553e-07, "logits/chosen": 2.2729554176330566, "logits/rejected": 2.1069273948669434, "logps/chosen": -332.3842468261719, "logps/rejected": -442.7864685058594, "loss": 0.1447, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.5674639940261841, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3418426513671875, "rewards/student_margin": 3.909306287765503, "rewards/teacher_margin": 0.0, "step": 5380 }, { "epoch": 0.84, "grad_norm": 7.875, "learning_rate": 3.578570595810274e-07, "logits/chosen": 2.106508493423462, "logits/rejected": 2.312915563583374, "logps/chosen": -424.59295654296875, "logps/rejected": -507.313232421875, "loss": 0.2074, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.7015904188156128, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9406981468200684, "rewards/student_margin": 4.642289161682129, "rewards/teacher_margin": 0.0, "step": 5390 }, { "epoch": 0.85, "grad_norm": 5.34375, "learning_rate": 3.508373446416027e-07, "logits/chosen": 2.138469696044922, "logits/rejected": 2.3363680839538574, "logps/chosen": -369.6650390625, "logps/rejected": -397.35626220703125, "loss": 0.1885, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.7446642518043518, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.141200542449951, "rewards/student_margin": 3.8858649730682373, "rewards/teacher_margin": 0.0, "step": 5400 }, { "epoch": 0.85, "grad_norm": 15.375, "learning_rate": 3.4388196385275297e-07, "logits/chosen": 2.3210737705230713, "logits/rejected": 2.4671313762664795, "logps/chosen": -371.82757568359375, "logps/rejected": -406.30792236328125, "loss": 0.2187, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 1.1517667770385742, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.3165955543518066, "rewards/student_margin": 3.468362331390381, "rewards/teacher_margin": 0.0, "step": 5410 }, { "epoch": 0.85, "grad_norm": 19.75, "learning_rate": 3.3699112542045663e-07, "logits/chosen": 2.1681766510009766, "logits/rejected": 1.9920454025268555, "logps/chosen": -408.90228271484375, "logps/rejected": -364.9134826660156, "loss": 0.2085, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6114395260810852, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.076265811920166, "rewards/student_margin": 3.6877052783966064, "rewards/teacher_margin": 0.0, "step": 5420 }, { "epoch": 0.85, "grad_norm": 10.4375, "learning_rate": 3.3016503561865003e-07, "logits/chosen": 2.3190836906433105, "logits/rejected": 2.5411553382873535, "logps/chosen": -373.18634033203125, "logps/rejected": -481.63031005859375, "loss": 0.2465, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.32860034704208374, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9675533771514893, "rewards/student_margin": 3.2961533069610596, "rewards/teacher_margin": 0.0, "step": 5430 }, { "epoch": 0.85, "grad_norm": 10.5625, "learning_rate": 3.234038987830476e-07, "logits/chosen": 2.280510425567627, "logits/rejected": 2.3124380111694336, "logps/chosen": -421.57904052734375, "logps/rejected": -464.0086975097656, "loss": 0.2359, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.09813537448644638, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.385851860046387, "rewards/student_margin": 4.483986854553223, "rewards/teacher_margin": 0.0, "step": 5440 }, { "epoch": 0.85, "grad_norm": 11.125, "learning_rate": 3.167079173050297e-07, "logits/chosen": 2.148624897003174, "logits/rejected": 2.2616467475891113, "logps/chosen": -394.287109375, "logps/rejected": -447.1385803222656, "loss": 0.2213, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.7595364451408386, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4219062328338623, "rewards/student_margin": 4.181443214416504, "rewards/teacher_margin": 0.0, "step": 5450 }, { "epoch": 0.86, "grad_norm": 8.1875, "learning_rate": 3.100772916255815e-07, "logits/chosen": 2.4783742427825928, "logits/rejected": 2.3122711181640625, "logps/chosen": -407.8063049316406, "logps/rejected": -406.1996154785156, "loss": 0.1481, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.24006757140159607, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2049050331115723, "rewards/student_margin": 3.444972515106201, "rewards/teacher_margin": 0.0, "step": 5460 }, { "epoch": 0.86, "grad_norm": 15.375, "learning_rate": 3.035122202292942e-07, "logits/chosen": 2.170015335083008, "logits/rejected": 2.217313528060913, "logps/chosen": -416.874755859375, "logps/rejected": -474.3807067871094, "loss": 0.2259, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.11562144756317139, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9507224559783936, "rewards/student_margin": 4.066344261169434, "rewards/teacher_margin": 0.0, "step": 5470 }, { "epoch": 0.86, "grad_norm": 19.375, "learning_rate": 2.970128996384228e-07, "logits/chosen": 2.277169704437256, "logits/rejected": 2.039562702178955, "logps/chosen": -414.03045654296875, "logps/rejected": -404.8936462402344, "loss": 0.2151, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.508808434009552, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5929744243621826, "rewards/student_margin": 3.1017825603485107, "rewards/teacher_margin": 0.0, "step": 5480 }, { "epoch": 0.86, "grad_norm": 16.875, "learning_rate": 2.9057952440700396e-07, "logits/chosen": 2.4115138053894043, "logits/rejected": 1.956770658493042, "logps/chosen": -377.45062255859375, "logps/rejected": -341.13653564453125, "loss": 0.175, "rewards/accuracies": 1.0, "rewards/chosen": 1.2293256521224976, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.763292074203491, "rewards/student_margin": 3.9926180839538574, "rewards/teacher_margin": 0.0, "step": 5490 }, { "epoch": 0.86, "grad_norm": 15.125, "learning_rate": 2.8421228711503127e-07, "logits/chosen": 1.8642022609710693, "logits/rejected": 2.280454158782959, "logps/chosen": -327.9745178222656, "logps/rejected": -521.4654541015625, "loss": 0.2262, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7534988522529602, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.438559532165527, "rewards/student_margin": 5.192058086395264, "rewards/teacher_margin": 0.0, "step": 5500 }, { "epoch": 0.86, "grad_norm": 17.25, "learning_rate": 2.779113783626916e-07, "logits/chosen": 2.1880173683166504, "logits/rejected": 2.3169894218444824, "logps/chosen": -343.0785217285156, "logps/rejected": -431.2730407714844, "loss": 0.1329, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.272409588098526, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.8545937538146973, "rewards/student_margin": 4.127003192901611, "rewards/teacher_margin": 0.0, "step": 5510 }, { "epoch": 0.87, "grad_norm": 14.25, "learning_rate": 2.7167698676465765e-07, "logits/chosen": 2.233309268951416, "logits/rejected": 2.3881468772888184, "logps/chosen": -358.3476257324219, "logps/rejected": -431.0907287597656, "loss": 0.1831, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.6958979368209839, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5248045921325684, "rewards/student_margin": 4.220702648162842, "rewards/teacher_margin": 0.0, "step": 5520 }, { "epoch": 0.87, "grad_norm": 10.375, "learning_rate": 2.655092989444444e-07, "logits/chosen": 2.050823211669922, "logits/rejected": 2.2496607303619385, "logps/chosen": -377.2169189453125, "logps/rejected": -504.4391174316406, "loss": 0.2207, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.3412223756313324, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9181742668151855, "rewards/student_margin": 4.259397029876709, "rewards/teacher_margin": 0.0, "step": 5530 }, { "epoch": 0.87, "grad_norm": 6.65625, "learning_rate": 2.594084995288207e-07, "logits/chosen": 2.191013813018799, "logits/rejected": 2.110527276992798, "logps/chosen": -374.1242370605469, "logps/rejected": -417.874267578125, "loss": 0.1842, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.525336742401123, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7702865600585938, "rewards/student_margin": 3.295623302459717, "rewards/teacher_margin": 0.0, "step": 5540 }, { "epoch": 0.87, "grad_norm": 12.4375, "learning_rate": 2.5337477114228295e-07, "logits/chosen": 2.3595540523529053, "logits/rejected": 2.5519797801971436, "logps/chosen": -350.56085205078125, "logps/rejected": -498.60504150390625, "loss": 0.2053, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4597729742527008, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0506908893585205, "rewards/student_margin": 3.5104637145996094, "rewards/teacher_margin": 0.0, "step": 5550 }, { "epoch": 0.87, "grad_norm": 16.625, "learning_rate": 2.4740829440158904e-07, "logits/chosen": 2.2592995166778564, "logits/rejected": 2.211606740951538, "logps/chosen": -370.6821594238281, "logps/rejected": -427.726318359375, "loss": 0.2343, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.7817676067352295, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9983975887298584, "rewards/student_margin": 3.780165195465088, "rewards/teacher_margin": 0.0, "step": 5560 }, { "epoch": 0.87, "grad_norm": 17.25, "learning_rate": 2.4150924791035037e-07, "logits/chosen": 2.306429386138916, "logits/rejected": 2.251401424407959, "logps/chosen": -376.0966796875, "logps/rejected": -393.9678649902344, "loss": 0.2104, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.23830631375312805, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.4037153720855713, "rewards/student_margin": 2.642021656036377, "rewards/teacher_margin": 0.0, "step": 5570 }, { "epoch": 0.87, "grad_norm": 19.125, "learning_rate": 2.3567780825368813e-07, "logits/chosen": 2.209865093231201, "logits/rejected": 2.4303536415100098, "logps/chosen": -371.0525207519531, "logps/rejected": -373.54736328125, "loss": 0.222, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.5070635676383972, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0891990661621094, "rewards/student_margin": 3.5962624549865723, "rewards/teacher_margin": 0.0, "step": 5580 }, { "epoch": 0.88, "grad_norm": 16.75, "learning_rate": 2.2991414999294386e-07, "logits/chosen": 2.4971327781677246, "logits/rejected": 2.4894907474517822, "logps/chosen": -390.57366943359375, "logps/rejected": -450.89093017578125, "loss": 0.2288, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.37669551372528076, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2893288135528564, "rewards/student_margin": 3.6660244464874268, "rewards/teacher_margin": 0.0, "step": 5590 }, { "epoch": 0.88, "grad_norm": 13.75, "learning_rate": 2.2421844566045515e-07, "logits/chosen": 2.165761709213257, "logits/rejected": 2.269256591796875, "logps/chosen": -367.3092041015625, "logps/rejected": -441.0704650878906, "loss": 0.1847, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.5646764636039734, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3486995697021484, "rewards/student_margin": 3.9133763313293457, "rewards/teacher_margin": 0.0, "step": 5600 }, { "epoch": 0.88, "grad_norm": 15.6875, "learning_rate": 2.1859086575439225e-07, "logits/chosen": 2.269378185272217, "logits/rejected": 2.379396915435791, "logps/chosen": -352.1163635253906, "logps/rejected": -426.03729248046875, "loss": 0.2526, "rewards/accuracies": 1.0, "rewards/chosen": 0.7257265448570251, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0956614017486572, "rewards/student_margin": 3.821387767791748, "rewards/teacher_margin": 0.0, "step": 5610 }, { "epoch": 0.88, "grad_norm": 11.125, "learning_rate": 2.13031578733654e-07, "logits/chosen": 2.3883848190307617, "logits/rejected": 2.2737228870391846, "logps/chosen": -340.14398193359375, "logps/rejected": -428.3273010253906, "loss": 0.1887, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8617734909057617, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7830960750579834, "rewards/student_margin": 3.644869565963745, "rewards/teacher_margin": 0.0, "step": 5620 }, { "epoch": 0.88, "grad_norm": 14.875, "learning_rate": 2.0754075101282327e-07, "logits/chosen": 2.43986177444458, "logits/rejected": 2.1640894412994385, "logps/chosen": -349.7734680175781, "logps/rejected": -381.86895751953125, "loss": 0.218, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.1860099881887436, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.61075496673584, "rewards/student_margin": 2.796765089035034, "rewards/teacher_margin": 0.0, "step": 5630 }, { "epoch": 0.88, "grad_norm": 14.0, "learning_rate": 2.0211854695718763e-07, "logits/chosen": 2.421998977661133, "logits/rejected": 2.2017688751220703, "logps/chosen": -384.3017272949219, "logps/rejected": -381.19403076171875, "loss": 0.2091, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 1.0884554386138916, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3045496940612793, "rewards/student_margin": 4.39300537109375, "rewards/teacher_margin": 0.0, "step": 5640 }, { "epoch": 0.89, "grad_norm": 13.1875, "learning_rate": 1.967651288778183e-07, "logits/chosen": 2.279479742050171, "logits/rejected": 2.5703928470611572, "logps/chosen": -354.9503479003906, "logps/rejected": -480.7311096191406, "loss": 0.239, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.4314715266227722, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.269718885421753, "rewards/student_margin": 3.70119047164917, "rewards/teacher_margin": 0.0, "step": 5650 }, { "epoch": 0.89, "grad_norm": 8.9375, "learning_rate": 1.914806570267111e-07, "logits/chosen": 2.318463087081909, "logits/rejected": 2.181204319000244, "logps/chosen": -374.6690368652344, "logps/rejected": -442.94219970703125, "loss": 0.2115, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5986078977584839, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.089754581451416, "rewards/student_margin": 3.6883628368377686, "rewards/teacher_margin": 0.0, "step": 5660 }, { "epoch": 0.89, "grad_norm": 10.375, "learning_rate": 1.8626528959198947e-07, "logits/chosen": 2.324489116668701, "logits/rejected": 2.265256643295288, "logps/chosen": -392.2458190917969, "logps/rejected": -407.7958679199219, "loss": 0.1575, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.8621466755867004, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6850292682647705, "rewards/student_margin": 3.547175884246826, "rewards/teacher_margin": 0.0, "step": 5670 }, { "epoch": 0.89, "grad_norm": 10.3125, "learning_rate": 1.8111918269316935e-07, "logits/chosen": 2.3972115516662598, "logits/rejected": 2.273054361343384, "logps/chosen": -330.9543151855469, "logps/rejected": -403.77886962890625, "loss": 0.1953, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.7225832939147949, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.867365598678589, "rewards/student_margin": 4.589948654174805, "rewards/teacher_margin": 0.0, "step": 5680 }, { "epoch": 0.89, "grad_norm": 11.0, "learning_rate": 1.7604249037648625e-07, "logits/chosen": 2.0604913234710693, "logits/rejected": 2.1035468578338623, "logps/chosen": -383.2791442871094, "logps/rejected": -453.7605895996094, "loss": 0.1434, "rewards/accuracies": 1.0, "rewards/chosen": 0.44861069321632385, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.280402660369873, "rewards/student_margin": 4.729013919830322, "rewards/teacher_margin": 0.0, "step": 5690 }, { "epoch": 0.89, "grad_norm": 16.375, "learning_rate": 1.7103536461028331e-07, "logits/chosen": 2.1436574459075928, "logits/rejected": 2.0301384925842285, "logps/chosen": -365.8976745605469, "logps/rejected": -425.86676025390625, "loss": 0.1747, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.46591606736183167, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.449326753616333, "rewards/student_margin": 3.915243148803711, "rewards/teacher_margin": 0.0, "step": 5700 }, { "epoch": 0.89, "grad_norm": 13.375, "learning_rate": 1.6609795528046257e-07, "logits/chosen": 2.2140603065490723, "logits/rejected": 2.3792645931243896, "logps/chosen": -383.3377380371094, "logps/rejected": -378.3917236328125, "loss": 0.2091, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.19762104749679565, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9785523414611816, "rewards/student_margin": 3.176173210144043, "rewards/teacher_margin": 0.0, "step": 5710 }, { "epoch": 0.9, "grad_norm": 7.59375, "learning_rate": 1.6123041018599766e-07, "logits/chosen": 2.2791829109191895, "logits/rejected": 2.195279598236084, "logps/chosen": -415.80224609375, "logps/rejected": -463.51312255859375, "loss": 0.1877, "rewards/accuracies": 1.0, "rewards/chosen": 0.04624149575829506, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.453300476074219, "rewards/student_margin": 4.499542236328125, "rewards/teacher_margin": 0.0, "step": 5720 }, { "epoch": 0.9, "grad_norm": 8.8125, "learning_rate": 1.5643287503451038e-07, "logits/chosen": 2.294829845428467, "logits/rejected": 2.1253809928894043, "logps/chosen": -431.16778564453125, "logps/rejected": -438.08526611328125, "loss": 0.1969, "rewards/accuracies": 0.7999999523162842, "rewards/chosen": 0.3945757746696472, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.193666458129883, "rewards/student_margin": 3.5882420539855957, "rewards/teacher_margin": 0.0, "step": 5730 }, { "epoch": 0.9, "grad_norm": 7.8125, "learning_rate": 1.5170549343790837e-07, "logits/chosen": 2.0279994010925293, "logits/rejected": 2.304466724395752, "logps/chosen": -305.9178466796875, "logps/rejected": -465.63433837890625, "loss": 0.1582, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.35116907954216003, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6634509563446045, "rewards/student_margin": 4.01461935043335, "rewards/teacher_margin": 0.0, "step": 5740 }, { "epoch": 0.9, "grad_norm": 12.5, "learning_rate": 1.4704840690808658e-07, "logits/chosen": 2.1558053493499756, "logits/rejected": 2.358475923538208, "logps/chosen": -329.4479064941406, "logps/rejected": -411.7767639160156, "loss": 0.2033, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7207157015800476, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5077903270721436, "rewards/student_margin": 4.228506565093994, "rewards/teacher_margin": 0.0, "step": 5750 }, { "epoch": 0.9, "grad_norm": 14.375, "learning_rate": 1.4246175485269047e-07, "logits/chosen": 2.1714162826538086, "logits/rejected": 2.022117853164673, "logps/chosen": -366.1888122558594, "logps/rejected": -422.42626953125, "loss": 0.2103, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.362636923789978, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2962279319763184, "rewards/student_margin": 3.658864974975586, "rewards/teacher_margin": 0.0, "step": 5760 }, { "epoch": 0.9, "grad_norm": 15.25, "learning_rate": 1.3794567457094371e-07, "logits/chosen": 2.333744525909424, "logits/rejected": 2.3007044792175293, "logps/chosen": -368.6081848144531, "logps/rejected": -428.64605712890625, "loss": 0.2041, "rewards/accuracies": 1.0, "rewards/chosen": 0.6755335927009583, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2065482139587402, "rewards/student_margin": 3.8820815086364746, "rewards/teacher_margin": 0.0, "step": 5770 }, { "epoch": 0.91, "grad_norm": 28.25, "learning_rate": 1.335003012495373e-07, "logits/chosen": 2.0182271003723145, "logits/rejected": 2.167402982711792, "logps/chosen": -332.85736083984375, "logps/rejected": -443.0613708496094, "loss": 0.2356, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.25208717584609985, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.926774501800537, "rewards/student_margin": 3.1788618564605713, "rewards/teacher_margin": 0.0, "step": 5780 }, { "epoch": 0.91, "grad_norm": 6.625, "learning_rate": 1.29125767958585e-07, "logits/chosen": 2.3444199562072754, "logits/rejected": 2.2177212238311768, "logps/chosen": -375.41802978515625, "logps/rejected": -433.70074462890625, "loss": 0.2194, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 1.1283342838287354, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2408955097198486, "rewards/student_margin": 4.369229316711426, "rewards/teacher_margin": 0.0, "step": 5790 }, { "epoch": 0.91, "grad_norm": 12.3125, "learning_rate": 1.2482220564763669e-07, "logits/chosen": 2.0574798583984375, "logits/rejected": 2.199319362640381, "logps/chosen": -352.0958251953125, "logps/rejected": -377.81707763671875, "loss": 0.2672, "rewards/accuracies": 0.7000000476837158, "rewards/chosen": 0.1466474086046219, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.373234987258911, "rewards/student_margin": 2.5198819637298584, "rewards/teacher_margin": 0.0, "step": 5800 }, { "epoch": 0.91, "grad_norm": 11.9375, "learning_rate": 1.2058974314176048e-07, "logits/chosen": 2.376814365386963, "logits/rejected": 2.296910524368286, "logps/chosen": -411.83489990234375, "logps/rejected": -492.22442626953125, "loss": 0.1796, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.9793182611465454, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.7316460609436035, "rewards/student_margin": 3.7109646797180176, "rewards/teacher_margin": 0.0, "step": 5810 }, { "epoch": 0.91, "grad_norm": 14.6875, "learning_rate": 1.1642850713768583e-07, "logits/chosen": 2.021230459213257, "logits/rejected": 2.328230381011963, "logps/chosen": -339.5244140625, "logps/rejected": -449.17022705078125, "loss": 0.2074, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.4842238426208496, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5763092041015625, "rewards/student_margin": 4.060533046722412, "rewards/teacher_margin": 0.0, "step": 5820 }, { "epoch": 0.91, "grad_norm": 8.3125, "learning_rate": 1.1233862220001168e-07, "logits/chosen": 2.210416078567505, "logits/rejected": 2.3002922534942627, "logps/chosen": -350.97369384765625, "logps/rejected": -490.4478454589844, "loss": 0.1564, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.5073890686035156, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.409335136413574, "rewards/student_margin": 4.91672420501709, "rewards/teacher_margin": 0.0, "step": 5830 }, { "epoch": 0.92, "grad_norm": 9.9375, "learning_rate": 1.0832021075747712e-07, "logits/chosen": 2.040278673171997, "logits/rejected": 2.0895941257476807, "logps/chosen": -339.07159423828125, "logps/rejected": -380.5693664550781, "loss": 0.1863, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.6481130123138428, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2045464515686035, "rewards/student_margin": 3.852658748626709, "rewards/teacher_margin": 0.0, "step": 5840 }, { "epoch": 0.92, "grad_norm": 18.625, "learning_rate": 1.0437339309929573e-07, "logits/chosen": 2.296123504638672, "logits/rejected": 2.2315683364868164, "logps/chosen": -325.5566101074219, "logps/rejected": -471.41180419921875, "loss": 0.1842, "rewards/accuracies": 1.0, "rewards/chosen": 0.7518499493598938, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.535308361053467, "rewards/student_margin": 5.287158489227295, "rewards/teacher_margin": 0.0, "step": 5850 }, { "epoch": 0.92, "grad_norm": 16.375, "learning_rate": 1.0049828737155653e-07, "logits/chosen": 2.2149507999420166, "logits/rejected": 2.379211902618408, "logps/chosen": -353.892822265625, "logps/rejected": -458.6172790527344, "loss": 0.1562, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.5059636831283569, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.673590898513794, "rewards/student_margin": 4.179554462432861, "rewards/teacher_margin": 0.0, "step": 5860 }, { "epoch": 0.92, "grad_norm": 15.375, "learning_rate": 9.669500957368627e-08, "logits/chosen": 2.5048789978027344, "logits/rejected": 2.53952956199646, "logps/chosen": -371.6074523925781, "logps/rejected": -416.8810119628906, "loss": 0.1503, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.46981382369995117, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4466464519500732, "rewards/student_margin": 3.9164605140686035, "rewards/teacher_margin": 0.0, "step": 5870 }, { "epoch": 0.92, "grad_norm": 14.0, "learning_rate": 9.296367355497604e-08, "logits/chosen": 2.2422890663146973, "logits/rejected": 2.423337697982788, "logps/chosen": -344.4833068847656, "logps/rejected": -431.587890625, "loss": 0.1796, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.8225002288818359, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.654350757598877, "rewards/student_margin": 4.476850986480713, "rewards/teacher_margin": 0.0, "step": 5880 }, { "epoch": 0.92, "grad_norm": 16.75, "learning_rate": 8.9304391011176e-08, "logits/chosen": 2.257303476333618, "logits/rejected": 2.262951374053955, "logps/chosen": -391.962890625, "logps/rejected": -429.45550537109375, "loss": 0.2272, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.5440480709075928, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.210204601287842, "rewards/student_margin": 3.7542526721954346, "rewards/teacher_margin": 0.0, "step": 5890 }, { "epoch": 0.92, "grad_norm": 14.0625, "learning_rate": 8.571727148114884e-08, "logits/chosen": 2.2049593925476074, "logits/rejected": 2.0458126068115234, "logps/chosen": -362.9419860839844, "logps/rejected": -449.9620666503906, "loss": 0.1786, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": 0.35495924949645996, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2998344898223877, "rewards/student_margin": 3.6547939777374268, "rewards/teacher_margin": 0.0, "step": 5900 }, { "epoch": 0.93, "grad_norm": 12.9375, "learning_rate": 8.220242234359271e-08, "logits/chosen": 2.601264476776123, "logits/rejected": 2.3658103942871094, "logps/chosen": -400.4070739746094, "logps/rejected": -376.959228515625, "loss": 0.1877, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.24305877089500427, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.822248935699463, "rewards/student_margin": 3.065307378768921, "rewards/teacher_margin": 0.0, "step": 5910 }, { "epoch": 0.93, "grad_norm": 10.5625, "learning_rate": 7.875994881382659e-08, "logits/chosen": 2.4243650436401367, "logits/rejected": 2.2012972831726074, "logps/chosen": -347.5898742675781, "logps/rejected": -388.90472412109375, "loss": 0.1867, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 1.0953638553619385, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.792360305786133, "rewards/student_margin": 3.8877243995666504, "rewards/teacher_margin": 0.0, "step": 5920 }, { "epoch": 0.93, "grad_norm": 19.0, "learning_rate": 7.538995394063996e-08, "logits/chosen": 2.3261067867279053, "logits/rejected": 2.0677506923675537, "logps/chosen": -376.04901123046875, "logps/rejected": -433.77862548828125, "loss": 0.1707, "rewards/accuracies": 1.0, "rewards/chosen": 0.1455022096633911, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.305983543395996, "rewards/student_margin": 4.451485633850098, "rewards/teacher_margin": 0.0, "step": 5930 }, { "epoch": 0.93, "grad_norm": 19.375, "learning_rate": 7.209253860320897e-08, "logits/chosen": 2.3210227489471436, "logits/rejected": 2.41642427444458, "logps/chosen": -372.62847900390625, "logps/rejected": -437.6602478027344, "loss": 0.2088, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.8154757618904114, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.7553513050079346, "rewards/student_margin": 4.570827007293701, "rewards/teacher_margin": 0.0, "step": 5940 }, { "epoch": 0.93, "grad_norm": 8.375, "learning_rate": 6.88678015080757e-08, "logits/chosen": 2.2320191860198975, "logits/rejected": 2.2844643592834473, "logps/chosen": -369.1745300292969, "logps/rejected": -397.2835693359375, "loss": 0.2143, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.4421082139015198, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.4783167839050293, "rewards/student_margin": 2.9204251766204834, "rewards/teacher_margin": 0.0, "step": 5950 }, { "epoch": 0.93, "grad_norm": 8.0625, "learning_rate": 6.571583918619423e-08, "logits/chosen": 2.510165214538574, "logits/rejected": 2.403012752532959, "logps/chosen": -395.22076416015625, "logps/rejected": -407.90838623046875, "loss": 0.1978, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.543927013874054, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.545433759689331, "rewards/student_margin": 4.089361190795898, "rewards/teacher_margin": 0.0, "step": 5960 }, { "epoch": 0.94, "grad_norm": 10.375, "learning_rate": 6.26367459900415e-08, "logits/chosen": 2.410365581512451, "logits/rejected": 2.4213943481445312, "logps/chosen": -394.50616455078125, "logps/rejected": -517.7322387695312, "loss": 0.1824, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7646802663803101, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.3061652183532715, "rewards/student_margin": 4.070845603942871, "rewards/teacher_margin": 0.0, "step": 5970 }, { "epoch": 0.94, "grad_norm": 15.5625, "learning_rate": 5.963061409079124e-08, "logits/chosen": 2.1213741302490234, "logits/rejected": 2.3274176120758057, "logps/chosen": -325.91851806640625, "logps/rejected": -415.89990234375, "loss": 0.1881, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5136106014251709, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.057328701019287, "rewards/student_margin": 3.570939302444458, "rewards/teacher_margin": 0.0, "step": 5980 }, { "epoch": 0.94, "grad_norm": 17.375, "learning_rate": 5.669753347555596e-08, "logits/chosen": 2.4347760677337646, "logits/rejected": 2.437208414077759, "logps/chosen": -368.4062805175781, "logps/rejected": -464.50604248046875, "loss": 0.2103, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.04481887072324753, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.282297134399414, "rewards/student_margin": 3.327116012573242, "rewards/teacher_margin": 0.0, "step": 5990 }, { "epoch": 0.94, "grad_norm": 12.25, "learning_rate": 5.3837591944694014e-08, "logits/chosen": 2.360888719558716, "logits/rejected": 2.2320427894592285, "logps/chosen": -366.5610046386719, "logps/rejected": -455.919677734375, "loss": 0.2051, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9655609130859375, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.4876437187194824, "rewards/student_margin": 3.45320463180542, "rewards/teacher_margin": 0.0, "step": 6000 }, { "epoch": 0.94, "grad_norm": 13.6875, "learning_rate": 5.105087510917956e-08, "logits/chosen": 2.292454719543457, "logits/rejected": 2.3031833171844482, "logps/chosen": -356.5537109375, "logps/rejected": -420.85009765625, "loss": 0.2196, "rewards/accuracies": 1.0, "rewards/chosen": 0.8026086091995239, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.03272819519043, "rewards/student_margin": 4.835336208343506, "rewards/teacher_margin": 0.0, "step": 6010 }, { "epoch": 0.94, "grad_norm": 10.25, "learning_rate": 4.8337466388040935e-08, "logits/chosen": 2.4439713954925537, "logits/rejected": 2.3645057678222656, "logps/chosen": -371.02337646484375, "logps/rejected": -418.4205017089844, "loss": 0.2269, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 1.0299944877624512, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.1699047088623047, "rewards/student_margin": 3.199899196624756, "rewards/teacher_margin": 0.0, "step": 6020 }, { "epoch": 0.95, "grad_norm": 11.5625, "learning_rate": 4.569744700586326e-08, "logits/chosen": 2.4523720741271973, "logits/rejected": 2.2242815494537354, "logps/chosen": -406.94378662109375, "logps/rejected": -439.6375427246094, "loss": 0.2308, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7007296681404114, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2837066650390625, "rewards/student_margin": 3.98443603515625, "rewards/teacher_margin": 0.0, "step": 6030 }, { "epoch": 0.95, "grad_norm": 8.5625, "learning_rate": 4.31308959903573e-08, "logits/chosen": 2.154148578643799, "logits/rejected": 2.0714235305786133, "logps/chosen": -352.1864929199219, "logps/rejected": -363.07366943359375, "loss": 0.2136, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.5506538152694702, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.6307919025421143, "rewards/student_margin": 3.181445837020874, "rewards/teacher_margin": 0.0, "step": 6040 }, { "epoch": 0.95, "grad_norm": 14.8125, "learning_rate": 4.063789016999331e-08, "logits/chosen": 2.075705051422119, "logits/rejected": 2.0914149284362793, "logps/chosen": -336.5975341796875, "logps/rejected": -408.7382507324219, "loss": 0.2154, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.5506619811058044, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.587484359741211, "rewards/student_margin": 4.13814640045166, "rewards/teacher_margin": 0.0, "step": 6050 }, { "epoch": 0.95, "grad_norm": 17.75, "learning_rate": 3.8218504171701473e-08, "logits/chosen": 2.2179901599884033, "logits/rejected": 2.3311285972595215, "logps/chosen": -357.0268249511719, "logps/rejected": -451.97540283203125, "loss": 0.1769, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5059942007064819, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0013937950134277, "rewards/student_margin": 3.50738787651062, "rewards/teacher_margin": 0.0, "step": 6060 }, { "epoch": 0.95, "grad_norm": 20.5, "learning_rate": 3.587281041863899e-08, "logits/chosen": 1.892382264137268, "logits/rejected": 2.167736530303955, "logps/chosen": -402.40899658203125, "logps/rejected": -476.56829833984375, "loss": 0.2475, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.2499643862247467, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5292022228240967, "rewards/student_margin": 3.7791666984558105, "rewards/teacher_margin": 0.0, "step": 6070 }, { "epoch": 0.95, "grad_norm": 11.9375, "learning_rate": 3.360087912801957e-08, "logits/chosen": 2.2083137035369873, "logits/rejected": 2.148740291595459, "logps/chosen": -347.1419372558594, "logps/rejected": -407.12152099609375, "loss": 0.1713, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.5535652041435242, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5324130058288574, "rewards/student_margin": 3.0859785079956055, "rewards/teacher_margin": 0.0, "step": 6080 }, { "epoch": 0.95, "grad_norm": 15.375, "learning_rate": 3.1402778309014284e-08, "logits/chosen": 2.164210557937622, "logits/rejected": 2.4700570106506348, "logps/chosen": -389.385498046875, "logps/rejected": -503.19189453125, "loss": 0.2169, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.5740918517112732, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9634647369384766, "rewards/student_margin": 4.5375566482543945, "rewards/teacher_margin": 0.0, "step": 6090 }, { "epoch": 0.96, "grad_norm": 17.75, "learning_rate": 2.9278573760713745e-08, "logits/chosen": 2.090214252471924, "logits/rejected": 2.0234415531158447, "logps/chosen": -390.03948974609375, "logps/rejected": -484.2791442871094, "loss": 0.1864, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.05563787370920181, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.501677989959717, "rewards/student_margin": 4.446040153503418, "rewards/teacher_margin": 0.0, "step": 6100 }, { "epoch": 0.96, "grad_norm": 11.25, "learning_rate": 2.722832907015971e-08, "logits/chosen": 2.4228386878967285, "logits/rejected": 2.2034754753112793, "logps/chosen": -373.6684875488281, "logps/rejected": -419.432861328125, "loss": 0.1903, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.4400637745857239, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0755481719970703, "rewards/student_margin": 3.5156121253967285, "rewards/teacher_margin": 0.0, "step": 6110 }, { "epoch": 0.96, "grad_norm": 18.625, "learning_rate": 2.525210561044128e-08, "logits/chosen": 2.226818799972534, "logits/rejected": 2.0953850746154785, "logps/chosen": -379.4895324707031, "logps/rejected": -426.3072814941406, "loss": 0.2425, "rewards/accuracies": 0.7333332896232605, "rewards/chosen": -0.02732607163488865, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.697058916091919, "rewards/student_margin": 2.6697325706481934, "rewards/teacher_margin": 0.0, "step": 6120 }, { "epoch": 0.96, "grad_norm": 13.6875, "learning_rate": 2.3349962538857518e-08, "logits/chosen": 2.231477975845337, "logits/rejected": 2.083103656768799, "logps/chosen": -350.20501708984375, "logps/rejected": -400.52435302734375, "loss": 0.1727, "rewards/accuracies": 0.8666666746139526, "rewards/chosen": 0.5855715274810791, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.5345120429992676, "rewards/student_margin": 4.120083332061768, "rewards/teacher_margin": 0.0, "step": 6130 }, { "epoch": 0.96, "grad_norm": 9.5, "learning_rate": 2.1521956795145517e-08, "logits/chosen": 2.0766186714172363, "logits/rejected": 2.0765154361724854, "logps/chosen": -417.31085205078125, "logps/rejected": -453.8123474121094, "loss": 0.167, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.6634305119514465, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9899914264678955, "rewards/student_margin": 4.6534223556518555, "rewards/teacher_margin": 0.0, "step": 6140 }, { "epoch": 0.96, "grad_norm": 16.25, "learning_rate": 1.9768143099779258e-08, "logits/chosen": 2.4145724773406982, "logits/rejected": 2.328265905380249, "logps/chosen": -350.8409118652344, "logps/rejected": -414.33135986328125, "loss": 0.1854, "rewards/accuracies": 0.9000000953674316, "rewards/chosen": 0.124974824488163, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1804845333099365, "rewards/student_margin": 3.305459499359131, "rewards/teacher_margin": 0.0, "step": 6150 }, { "epoch": 0.97, "grad_norm": 16.625, "learning_rate": 1.808857395232788e-08, "logits/chosen": 2.437615156173706, "logits/rejected": 2.3351635932922363, "logps/chosen": -375.98870849609375, "logps/rejected": -474.4404296875, "loss": 0.2143, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 1.0847854614257812, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.1947996616363525, "rewards/student_margin": 4.279585838317871, "rewards/teacher_margin": 0.0, "step": 6160 }, { "epoch": 0.97, "grad_norm": 11.6875, "learning_rate": 1.6483299629886372e-08, "logits/chosen": 2.3586320877075195, "logits/rejected": 2.4213781356811523, "logps/chosen": -375.6097717285156, "logps/rejected": -429.35382080078125, "loss": 0.1894, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.6681365966796875, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.427309036254883, "rewards/student_margin": 5.09544563293457, "rewards/teacher_margin": 0.0, "step": 6170 }, { "epoch": 0.97, "grad_norm": 13.375, "learning_rate": 1.4952368185569277e-08, "logits/chosen": 2.138272762298584, "logits/rejected": 2.0282528400421143, "logps/chosen": -396.82122802734375, "logps/rejected": -472.81951904296875, "loss": 0.2031, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.659168004989624, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.0978198051452637, "rewards/student_margin": 3.756988048553467, "rewards/teacher_margin": 0.0, "step": 6180 }, { "epoch": 0.97, "grad_norm": 12.3125, "learning_rate": 1.349582544707323e-08, "logits/chosen": 2.481755256652832, "logits/rejected": 2.1981589794158936, "logps/chosen": -382.43438720703125, "logps/rejected": -355.4049377441406, "loss": 0.1927, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.5684838891029358, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9659812450408936, "rewards/student_margin": 3.5344650745391846, "rewards/teacher_margin": 0.0, "step": 6190 }, { "epoch": 0.97, "grad_norm": 10.625, "learning_rate": 1.2113715015304728e-08, "logits/chosen": 2.5248427391052246, "logits/rejected": 2.5294265747070312, "logps/chosen": -348.305419921875, "logps/rejected": -471.517333984375, "loss": 0.1562, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.4350802004337311, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.9717915058135986, "rewards/student_margin": 4.406871318817139, "rewards/teacher_margin": 0.0, "step": 6200 }, { "epoch": 0.97, "grad_norm": 18.0, "learning_rate": 1.0806078263074783e-08, "logits/chosen": 2.095660448074341, "logits/rejected": 2.119906425476074, "logps/chosen": -407.7420654296875, "logps/rejected": -441.375, "loss": 0.1681, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.17975158989429474, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.4144134521484375, "rewards/student_margin": 4.594164848327637, "rewards/teacher_margin": 0.0, "step": 6210 }, { "epoch": 0.97, "grad_norm": 9.9375, "learning_rate": 9.572954333861018e-09, "logits/chosen": 2.108302116394043, "logits/rejected": 2.222754955291748, "logps/chosen": -362.50811767578125, "logps/rejected": -448.9073181152344, "loss": 0.1797, "rewards/accuracies": 0.966666579246521, "rewards/chosen": 0.8445938229560852, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.448662519454956, "rewards/student_margin": 4.293255805969238, "rewards/teacher_margin": 0.0, "step": 6220 }, { "epoch": 0.98, "grad_norm": 11.0, "learning_rate": 8.414380140635281e-09, "logits/chosen": 2.219505786895752, "logits/rejected": 2.4196956157684326, "logps/chosen": -355.3612060546875, "logps/rejected": -474.388916015625, "loss": 0.2034, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.43251150846481323, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.494264602661133, "rewards/student_margin": 3.92677640914917, "rewards/teacher_margin": 0.0, "step": 6230 }, { "epoch": 0.98, "grad_norm": 10.0625, "learning_rate": 7.3303903647584106e-09, "logits/chosen": 2.1508216857910156, "logits/rejected": 2.4214441776275635, "logps/chosen": -352.2890319824219, "logps/rejected": -407.17669677734375, "loss": 0.2079, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.6938777565956116, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.5106959342956543, "rewards/student_margin": 3.2045738697052, "rewards/teacher_margin": 0.0, "step": 6240 }, { "epoch": 0.98, "grad_norm": 15.875, "learning_rate": 6.321017454943568e-09, "logits/chosen": 2.3090951442718506, "logits/rejected": 2.201667547225952, "logps/chosen": -390.9002990722656, "logps/rejected": -439.030029296875, "loss": 0.1998, "rewards/accuracies": 1.0, "rewards/chosen": 0.8811970949172974, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.2257513999938965, "rewards/student_margin": 4.1069488525390625, "rewards/teacher_margin": 0.0, "step": 6250 }, { "epoch": 0.98, "grad_norm": 18.375, "learning_rate": 5.386291626283124e-09, "logits/chosen": 2.4156525135040283, "logits/rejected": 2.5021700859069824, "logps/chosen": -312.3028869628906, "logps/rejected": -395.7488098144531, "loss": 0.1951, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.7052678465843201, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.4201385974884033, "rewards/student_margin": 4.1254072189331055, "rewards/teacher_margin": 0.0, "step": 6260 }, { "epoch": 0.98, "grad_norm": 13.1875, "learning_rate": 4.526240859345499e-09, "logits/chosen": 2.0396711826324463, "logits/rejected": 1.9359092712402344, "logps/chosen": -383.11761474609375, "logps/rejected": -492.93719482421875, "loss": 0.1891, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.28198346495628357, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.1349196434021, "rewards/student_margin": 4.416903495788574, "rewards/teacher_margin": 0.0, "step": 6270 }, { "epoch": 0.98, "grad_norm": 11.25, "learning_rate": 3.7408908993363805e-09, "logits/chosen": 2.216543197631836, "logits/rejected": 1.9850622415542603, "logps/chosen": -380.0614013671875, "logps/rejected": -437.53564453125, "loss": 0.2281, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.42258912324905396, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.139047384262085, "rewards/student_margin": 3.5616366863250732, "rewards/teacher_margin": 0.0, "step": 6280 }, { "epoch": 0.99, "grad_norm": 10.0625, "learning_rate": 3.030265255329623e-09, "logits/chosen": 2.0900778770446777, "logits/rejected": 2.225165843963623, "logps/chosen": -333.2851867675781, "logps/rejected": -407.6112976074219, "loss": 0.1975, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2916669249534607, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.9080076217651367, "rewards/student_margin": 3.199674606323242, "rewards/teacher_margin": 0.0, "step": 6290 }, { "epoch": 0.99, "grad_norm": 16.75, "learning_rate": 2.394385199561977e-09, "logits/chosen": 2.524141550064087, "logits/rejected": 2.4509177207946777, "logps/chosen": -387.2109680175781, "logps/rejected": -448.238037109375, "loss": 0.1794, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.7161127924919128, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.09369421005249, "rewards/student_margin": 4.809807777404785, "rewards/teacher_margin": 0.0, "step": 6300 }, { "epoch": 0.99, "grad_norm": 10.1875, "learning_rate": 1.8332697667972078e-09, "logits/chosen": 2.0809245109558105, "logits/rejected": 2.262782573699951, "logps/chosen": -350.5285949707031, "logps/rejected": -491.6537170410156, "loss": 0.1455, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": 0.3026334345340729, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.433758735656738, "rewards/student_margin": 4.736392498016357, "rewards/teacher_margin": 0.0, "step": 6310 }, { "epoch": 0.99, "grad_norm": 11.4375, "learning_rate": 1.3469357537557182e-09, "logits/chosen": 2.0380942821502686, "logits/rejected": 2.1137866973876953, "logps/chosen": -362.50726318359375, "logps/rejected": -430.01513671875, "loss": 0.2662, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.5755778551101685, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.8395731449127197, "rewards/student_margin": 3.4151508808135986, "rewards/teacher_margin": 0.0, "step": 6320 }, { "epoch": 0.99, "grad_norm": 13.3125, "learning_rate": 9.35397718612452e-10, "logits/chosen": 2.4518303871154785, "logits/rejected": 2.5649991035461426, "logps/chosen": -390.725830078125, "logps/rejected": -472.42266845703125, "loss": 0.199, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.5518854856491089, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.644479274749756, "rewards/student_margin": 4.196364879608154, "rewards/teacher_margin": 0.0, "step": 6330 }, { "epoch": 0.99, "grad_norm": 13.25, "learning_rate": 5.98667980560852e-10, "logits/chosen": 2.2874343395233154, "logits/rejected": 2.3878045082092285, "logps/chosen": -371.6022033691406, "logps/rejected": -513.467529296875, "loss": 0.2012, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 0.9210349917411804, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.116450786590576, "rewards/student_margin": 5.037485599517822, "rewards/teacher_margin": 0.0, "step": 6340 }, { "epoch": 1.0, "grad_norm": 7.625, "learning_rate": 3.367566194431571e-10, "logits/chosen": 2.338473081588745, "logits/rejected": 2.35361909866333, "logps/chosen": -370.60791015625, "logps/rejected": -452.43511962890625, "loss": 0.2006, "rewards/accuracies": 0.9666666984558105, "rewards/chosen": -0.07709326595067978, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.028176307678223, "rewards/student_margin": 3.951082706451416, "rewards/teacher_margin": 0.0, "step": 6350 }, { "epoch": 1.0, "grad_norm": 8.125, "learning_rate": 1.4967147545036364e-10, "logits/chosen": 2.25748610496521, "logits/rejected": 2.434911012649536, "logps/chosen": -301.7140808105469, "logps/rejected": -449.4032287597656, "loss": 0.1883, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.5325816869735718, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -4.2593207359313965, "rewards/student_margin": 4.791902542114258, "rewards/teacher_margin": 0.0, "step": 6360 }, { "epoch": 1.0, "grad_norm": 11.0625, "learning_rate": 3.741814888602591e-11, "logits/chosen": 2.2587971687316895, "logits/rejected": 2.1336870193481445, "logps/chosen": -397.2039489746094, "logps/rejected": -449.08978271484375, "loss": 0.1628, "rewards/accuracies": 0.9333332777023315, "rewards/chosen": 1.0158222913742065, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -2.873974084854126, "rewards/student_margin": 3.889796733856201, "rewards/teacher_margin": 0.0, "step": 6370 }, { "epoch": 1.0, "grad_norm": 14.3125, "learning_rate": 0.0, "logits/chosen": 2.277320623397827, "logits/rejected": 2.526331663131714, "logps/chosen": -332.88226318359375, "logps/rejected": -430.95733642578125, "loss": 0.1968, "rewards/accuracies": 0.8666666150093079, "rewards/chosen": 0.713263213634491, "rewards/diff": 0.0, "rewards/diff_abs": 0.0, "rewards/rejected": -3.6614181995391846, "rewards/student_margin": 4.374680995941162, "rewards/teacher_margin": 0.0, "step": 6380 }, { "epoch": 1.0, "step": 6380, "total_flos": 0.0, "train_loss": 0.26257968655200586, "train_runtime": 7112.1171, "train_samples_per_second": 21.529, "train_steps_per_second": 0.897 } ], "logging_steps": 10, "max_steps": 6380, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "total_flos": 0.0, "train_batch_size": 3, "trial_name": null, "trial_params": null }