{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 50.0, "eval_steps": 500, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 89.6328125, "epoch": 0.25, "grad_norm": 2.560847282409668, "kl": 0.0, "learning_rate": 9.99375e-07, "loss": 0.0, "reward": 1.43950754404068, "reward_std": 0.4176497310400009, "rewards/accuracy_reward": 0.5098200291395187, "rewards/format_reward": 0.9296875, "step": 1 }, { "completion_length": 87.2734375, "epoch": 0.5, "grad_norm": 2.703657865524292, "kl": 0.00046539306640625, "learning_rate": 9.9875e-07, "loss": 0.0, "reward": 1.2813043594360352, "reward_std": 0.47014792263507843, "rewards/accuracy_reward": 0.42192937433719635, "rewards/format_reward": 0.859375, "step": 2 }, { "completion_length": 80.0078125, "epoch": 0.75, "grad_norm": 5.132789134979248, "kl": 0.0010738372802734375, "learning_rate": 9.98125e-07, "loss": 0.0, "reward": 1.3763126730918884, "reward_std": 0.3502582609653473, "rewards/accuracy_reward": 0.4075627326965332, "rewards/format_reward": 0.96875, "step": 3 }, { "completion_length": 94.28571701049805, "epoch": 1.0, "grad_norm": 2.7119903564453125, "kl": 0.00128173828125, "learning_rate": 9.975e-07, "loss": 0.0001, "reward": 1.5021255612373352, "reward_std": 0.3990684002637863, "rewards/accuracy_reward": 0.502125546336174, "rewards/format_reward": 1.0, "step": 4 }, { "completion_length": 84.453125, "epoch": 1.25, "grad_norm": 2.7416577339172363, "kl": 0.0021209716796875, "learning_rate": 9.968749999999999e-07, "loss": 0.0001, "reward": 1.3453806638717651, "reward_std": 0.4074166566133499, "rewards/accuracy_reward": 0.4000682085752487, "rewards/format_reward": 0.9453125, "step": 5 }, { "completion_length": 78.625, "epoch": 1.5, "grad_norm": 2.8765275478363037, "kl": 0.006072998046875, "learning_rate": 9.9625e-07, "loss": 0.0002, "reward": 1.4404960870742798, "reward_std": 0.3917630910873413, "rewards/accuracy_reward": 0.44830864667892456, "rewards/format_reward": 0.9921875, "step": 6 }, { "completion_length": 70.7578125, "epoch": 1.75, "grad_norm": 4.0928168296813965, "kl": 0.0057220458984375, "learning_rate": 9.956249999999999e-07, "loss": 0.0002, "reward": 1.5452297925949097, "reward_std": 0.2363404855132103, "rewards/accuracy_reward": 0.5608547776937485, "rewards/format_reward": 0.984375, "step": 7 }, { "completion_length": 69.57143211364746, "epoch": 2.0, "grad_norm": 2.1500496864318848, "kl": 0.00787353515625, "learning_rate": 9.95e-07, "loss": 0.0003, "reward": 1.540364921092987, "reward_std": 0.03179515106603503, "rewards/accuracy_reward": 0.5403649136424065, "rewards/format_reward": 1.0, "step": 8 }, { "completion_length": 74.5234375, "epoch": 2.25, "grad_norm": 2.890000343322754, "kl": 0.009307861328125, "learning_rate": 9.94375e-07, "loss": 0.0004, "reward": 1.4441289901733398, "reward_std": 0.2048807591199875, "rewards/accuracy_reward": 0.44412901997566223, "rewards/format_reward": 1.0, "step": 9 }, { "completion_length": 73.109375, "epoch": 2.5, "grad_norm": 2.234746217727661, "kl": 0.00933837890625, "learning_rate": 9.9375e-07, "loss": 0.0004, "reward": 1.4635842442512512, "reward_std": 0.2971457466483116, "rewards/accuracy_reward": 0.46358419954776764, "rewards/format_reward": 1.0, "step": 10 }, { "completion_length": 67.8203125, "epoch": 2.75, "grad_norm": 2.0221376419067383, "kl": 0.018035888671875, "learning_rate": 9.93125e-07, "loss": 0.0007, "reward": 1.6205239295959473, "reward_std": 0.18765632808208466, "rewards/accuracy_reward": 0.6205238401889801, "rewards/format_reward": 1.0, "step": 11 }, { "completion_length": 75.00000381469727, "epoch": 3.0, "grad_norm": 2.337059259414673, "kl": 0.014556884765625, "learning_rate": 9.925e-07, "loss": 0.0006, "reward": 1.785714328289032, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.7857142984867096, "rewards/format_reward": 1.0, "step": 12 }, { "completion_length": 67.9765625, "epoch": 3.25, "grad_norm": 2.605905294418335, "kl": 0.0230712890625, "learning_rate": 9.91875e-07, "loss": 0.0009, "reward": 1.5252612233161926, "reward_std": 0.2860804498195648, "rewards/accuracy_reward": 0.5330736935138702, "rewards/format_reward": 0.9921875, "step": 13 }, { "completion_length": 71.0703125, "epoch": 3.5, "grad_norm": 2.2355077266693115, "kl": 0.02154541015625, "learning_rate": 9.912499999999998e-07, "loss": 0.0009, "reward": 1.5774829387664795, "reward_std": 0.1989663988351822, "rewards/accuracy_reward": 0.5774829983711243, "rewards/format_reward": 1.0, "step": 14 }, { "completion_length": 65.875, "epoch": 3.75, "grad_norm": 2.508030652999878, "kl": 0.0369873046875, "learning_rate": 9.90625e-07, "loss": 0.0015, "reward": 1.6362086534500122, "reward_std": 0.2645450085401535, "rewards/accuracy_reward": 0.6362085938453674, "rewards/format_reward": 1.0, "step": 15 }, { "completion_length": 58.00000190734863, "epoch": 4.0, "grad_norm": 2.397307872772217, "kl": 0.02752685546875, "learning_rate": 9.9e-07, "loss": 0.001, "reward": 1.837504506111145, "reward_std": 0.34773190319538116, "rewards/accuracy_reward": 0.8375044763088226, "rewards/format_reward": 1.0, "step": 16 }, { "completion_length": 62.1328125, "epoch": 4.25, "grad_norm": 1.985022783279419, "kl": 0.02044677734375, "learning_rate": 9.89375e-07, "loss": 0.0008, "reward": 1.8274397253990173, "reward_std": 0.15590714663267136, "rewards/accuracy_reward": 0.8274396657943726, "rewards/format_reward": 1.0, "step": 17 }, { "completion_length": 66.078125, "epoch": 4.5, "grad_norm": 2.398780584335327, "kl": 0.0401611328125, "learning_rate": 9.8875e-07, "loss": 0.0016, "reward": 1.642715036869049, "reward_std": 0.19355066865682602, "rewards/accuracy_reward": 0.6427150070667267, "rewards/format_reward": 1.0, "step": 18 }, { "completion_length": 66.0625, "epoch": 4.75, "grad_norm": 2.3296172618865967, "kl": 0.048095703125, "learning_rate": 9.88125e-07, "loss": 0.0019, "reward": 1.6349385380744934, "reward_std": 0.30714260041713715, "rewards/accuracy_reward": 0.6427510678768158, "rewards/format_reward": 0.9921875, "step": 19 }, { "completion_length": 73.50000381469727, "epoch": 5.0, "grad_norm": 2.703334331512451, "kl": 0.03143310546875, "learning_rate": 9.875e-07, "loss": 0.0012, "reward": 1.521121323108673, "reward_std": 0.409614622592926, "rewards/accuracy_reward": 0.5211213529109955, "rewards/format_reward": 1.0, "step": 20 }, { "completion_length": 63.859375, "epoch": 5.25, "grad_norm": 3.7955989837646484, "kl": 0.0419921875, "learning_rate": 9.86875e-07, "loss": 0.0017, "reward": 1.6154157519340515, "reward_std": 0.17696820944547653, "rewards/accuracy_reward": 0.6154157221317291, "rewards/format_reward": 1.0, "step": 21 }, { "completion_length": 62.3359375, "epoch": 5.5, "grad_norm": 2.0224432945251465, "kl": 0.04486083984375, "learning_rate": 9.862499999999999e-07, "loss": 0.0018, "reward": 1.7184607982635498, "reward_std": 0.16713028401136398, "rewards/accuracy_reward": 0.7184608280658722, "rewards/format_reward": 1.0, "step": 22 }, { "completion_length": 65.8515625, "epoch": 5.75, "grad_norm": 2.972362756729126, "kl": 0.0313720703125, "learning_rate": 9.85625e-07, "loss": 0.0013, "reward": 1.7493125200271606, "reward_std": 0.184370219707489, "rewards/accuracy_reward": 0.7493124902248383, "rewards/format_reward": 1.0, "step": 23 }, { "completion_length": 74.21429061889648, "epoch": 6.0, "grad_norm": 1.9288957118988037, "kl": 0.0411376953125, "learning_rate": 9.849999999999999e-07, "loss": 0.0019, "reward": 1.6142857074737549, "reward_std": 0.26726123690605164, "rewards/accuracy_reward": 0.6142857670783997, "rewards/format_reward": 1.0, "step": 24 }, { "completion_length": 61.2265625, "epoch": 6.25, "grad_norm": 7.291033744812012, "kl": 0.260986328125, "learning_rate": 9.84375e-07, "loss": 0.0104, "reward": 1.7374743819236755, "reward_std": 0.2810707241296768, "rewards/accuracy_reward": 0.7374744713306427, "rewards/format_reward": 1.0, "step": 25 }, { "completion_length": 60.390625, "epoch": 6.5, "grad_norm": 2.046499013900757, "kl": 0.03759765625, "learning_rate": 9.8375e-07, "loss": 0.0015, "reward": 1.663317084312439, "reward_std": 0.216216042637825, "rewards/accuracy_reward": 0.6633170545101166, "rewards/format_reward": 1.0, "step": 26 }, { "completion_length": 60.2421875, "epoch": 6.75, "grad_norm": 2.2611141204833984, "kl": 0.046875, "learning_rate": 9.83125e-07, "loss": 0.0019, "reward": 1.5771763920783997, "reward_std": 0.2472986802458763, "rewards/accuracy_reward": 0.5849888920783997, "rewards/format_reward": 0.9921875, "step": 27 }, { "completion_length": 64.28571891784668, "epoch": 7.0, "grad_norm": 2.10233736038208, "kl": 0.086181640625, "learning_rate": 9.825e-07, "loss": 0.0036, "reward": 1.82956200838089, "reward_std": 0.3444478511810303, "rewards/accuracy_reward": 0.8295620679855347, "rewards/format_reward": 1.0, "step": 28 }, { "completion_length": 63.015625, "epoch": 7.25, "grad_norm": 2.0856142044067383, "kl": 0.0609130859375, "learning_rate": 9.81875e-07, "loss": 0.0024, "reward": 1.6384294033050537, "reward_std": 0.20048531889915466, "rewards/accuracy_reward": 0.6384294033050537, "rewards/format_reward": 1.0, "step": 29 }, { "completion_length": 56.7109375, "epoch": 7.5, "grad_norm": 1.9775277376174927, "kl": 0.06494140625, "learning_rate": 9.8125e-07, "loss": 0.0026, "reward": 1.726142942905426, "reward_std": 0.183742456138134, "rewards/accuracy_reward": 0.733955442905426, "rewards/format_reward": 0.9921875, "step": 30 }, { "completion_length": 62.640625, "epoch": 7.75, "grad_norm": 4.462751388549805, "kl": 0.0439453125, "learning_rate": 9.806249999999998e-07, "loss": 0.0018, "reward": 1.7479652166366577, "reward_std": 0.23261219263076782, "rewards/accuracy_reward": 0.7479651868343353, "rewards/format_reward": 1.0, "step": 31 }, { "completion_length": 56.64285850524902, "epoch": 8.0, "grad_norm": 1.7369378805160522, "kl": 0.067138671875, "learning_rate": 9.8e-07, "loss": 0.0026, "reward": 1.9810991883277893, "reward_std": 0.024930346757173538, "rewards/accuracy_reward": 0.9810990691184998, "rewards/format_reward": 1.0, "step": 32 }, { "completion_length": 54.9609375, "epoch": 8.25, "grad_norm": 2.8938117027282715, "kl": 0.0703125, "learning_rate": 9.79375e-07, "loss": 0.0028, "reward": 1.7204629778862, "reward_std": 0.16244513541460037, "rewards/accuracy_reward": 0.7204630076885223, "rewards/format_reward": 1.0, "step": 33 }, { "completion_length": 56.1640625, "epoch": 8.5, "grad_norm": 1.817608118057251, "kl": 0.0526123046875, "learning_rate": 9.7875e-07, "loss": 0.0021, "reward": 1.8434885740280151, "reward_std": 0.13239304721355438, "rewards/accuracy_reward": 0.8434885144233704, "rewards/format_reward": 1.0, "step": 34 }, { "completion_length": 61.7421875, "epoch": 8.75, "grad_norm": 2.240640878677368, "kl": 0.0614013671875, "learning_rate": 9.78125e-07, "loss": 0.0025, "reward": 1.6918946504592896, "reward_std": 0.23886261880397797, "rewards/accuracy_reward": 0.6918946206569672, "rewards/format_reward": 1.0, "step": 35 }, { "completion_length": 67.07143020629883, "epoch": 9.0, "grad_norm": 2.1461708545684814, "kl": 0.0511474609375, "learning_rate": 9.775e-07, "loss": 0.0023, "reward": 1.5855827927589417, "reward_std": 0.4357292503118515, "rewards/accuracy_reward": 0.6570113599300385, "rewards/format_reward": 0.9285714626312256, "step": 36 }, { "completion_length": 59.109375, "epoch": 9.25, "grad_norm": 1.676393747329712, "kl": 0.07080078125, "learning_rate": 9.76875e-07, "loss": 0.0028, "reward": 1.7769874930381775, "reward_std": 0.18539611995220184, "rewards/accuracy_reward": 0.7769874632358551, "rewards/format_reward": 1.0, "step": 37 }, { "completion_length": 59.2109375, "epoch": 9.5, "grad_norm": 2.115464925765991, "kl": 0.0623779296875, "learning_rate": 9.7625e-07, "loss": 0.0025, "reward": 1.6574002504348755, "reward_std": 0.23898707330226898, "rewards/accuracy_reward": 0.6574002504348755, "rewards/format_reward": 1.0, "step": 38 }, { "completion_length": 58.7734375, "epoch": 9.75, "grad_norm": 2.0188374519348145, "kl": 0.0640869140625, "learning_rate": 9.756249999999999e-07, "loss": 0.0026, "reward": 1.8230915069580078, "reward_std": 0.07023201137781143, "rewards/accuracy_reward": 0.8230914771556854, "rewards/format_reward": 1.0, "step": 39 }, { "completion_length": 50.57143020629883, "epoch": 10.0, "grad_norm": 1.818213701248169, "kl": 0.069580078125, "learning_rate": 9.75e-07, "loss": 0.0027, "reward": 1.898565948009491, "reward_std": 0.1666813576593995, "rewards/accuracy_reward": 0.8985659182071686, "rewards/format_reward": 1.0, "step": 40 }, { "completion_length": 61.828125, "epoch": 10.25, "grad_norm": 2.0514211654663086, "kl": 0.054443359375, "learning_rate": 9.743749999999999e-07, "loss": 0.0022, "reward": 1.770473837852478, "reward_std": 0.26378537714481354, "rewards/accuracy_reward": 0.7860988080501556, "rewards/format_reward": 0.984375, "step": 41 }, { "completion_length": 62.3203125, "epoch": 10.5, "grad_norm": 1.9126454591751099, "kl": 0.0628662109375, "learning_rate": 9.7375e-07, "loss": 0.0025, "reward": 1.8098745346069336, "reward_std": 0.158866249024868, "rewards/accuracy_reward": 0.8176870048046112, "rewards/format_reward": 0.9921875, "step": 42 }, { "completion_length": 58.640625, "epoch": 10.75, "grad_norm": 1.7069599628448486, "kl": 0.0595703125, "learning_rate": 9.73125e-07, "loss": 0.0024, "reward": 1.7534176111221313, "reward_std": 0.15397731214761734, "rewards/accuracy_reward": 0.7534177303314209, "rewards/format_reward": 1.0, "step": 43 }, { "completion_length": 51.21428871154785, "epoch": 11.0, "grad_norm": 2.326500415802002, "kl": 0.0687255859375, "learning_rate": 9.725e-07, "loss": 0.0027, "reward": 1.612587034702301, "reward_std": 0.19589456543326378, "rewards/accuracy_reward": 0.6125869750976562, "rewards/format_reward": 1.0, "step": 44 }, { "completion_length": 63.1953125, "epoch": 11.25, "grad_norm": 2.318408966064453, "kl": 0.05615234375, "learning_rate": 9.71875e-07, "loss": 0.0022, "reward": 1.731492519378662, "reward_std": 0.1684369444847107, "rewards/accuracy_reward": 0.7314925193786621, "rewards/format_reward": 1.0, "step": 45 }, { "completion_length": 62.1796875, "epoch": 11.5, "grad_norm": 2.0479931831359863, "kl": 0.067138671875, "learning_rate": 9.712499999999998e-07, "loss": 0.0027, "reward": 1.8079904317855835, "reward_std": 0.14444740116596222, "rewards/accuracy_reward": 0.8079904615879059, "rewards/format_reward": 1.0, "step": 46 }, { "completion_length": 58.5234375, "epoch": 11.75, "grad_norm": 2.1100575923919678, "kl": 0.0506591796875, "learning_rate": 9.70625e-07, "loss": 0.002, "reward": 1.849604606628418, "reward_std": 0.1187722496688366, "rewards/accuracy_reward": 0.8496046662330627, "rewards/format_reward": 1.0, "step": 47 }, { "completion_length": 78.5714340209961, "epoch": 12.0, "grad_norm": 2.278730630874634, "kl": 0.0589599609375, "learning_rate": 9.7e-07, "loss": 0.0023, "reward": 1.650295615196228, "reward_std": 0.3869354873895645, "rewards/accuracy_reward": 0.6502955406904221, "rewards/format_reward": 1.0, "step": 48 }, { "completion_length": 62.8203125, "epoch": 12.25, "grad_norm": 1.7048614025115967, "kl": 0.0550537109375, "learning_rate": 9.69375e-07, "loss": 0.0022, "reward": 1.8256508708000183, "reward_std": 0.12432926893234253, "rewards/accuracy_reward": 0.8256509006023407, "rewards/format_reward": 1.0, "step": 49 }, { "completion_length": 67.4375, "epoch": 12.5, "grad_norm": 1.5910967588424683, "kl": 0.0621337890625, "learning_rate": 9.6875e-07, "loss": 0.0025, "reward": 1.801437258720398, "reward_std": 0.12012555077672005, "rewards/accuracy_reward": 0.8014372885227203, "rewards/format_reward": 1.0, "step": 50 }, { "completion_length": 60.3515625, "epoch": 12.75, "grad_norm": 1.7540335655212402, "kl": 0.07177734375, "learning_rate": 9.68125e-07, "loss": 0.0029, "reward": 1.7970203161239624, "reward_std": 0.09028816036880016, "rewards/accuracy_reward": 0.79702028632164, "rewards/format_reward": 1.0, "step": 51 }, { "completion_length": 68.85714721679688, "epoch": 13.0, "grad_norm": 1.9271409511566162, "kl": 0.057861328125, "learning_rate": 9.675e-07, "loss": 0.0022, "reward": 1.725570797920227, "reward_std": 0.2023605689755641, "rewards/accuracy_reward": 0.725570797920227, "rewards/format_reward": 1.0, "step": 52 }, { "completion_length": 64.7890625, "epoch": 13.25, "grad_norm": 1.6656194925308228, "kl": 0.057861328125, "learning_rate": 9.66875e-07, "loss": 0.0023, "reward": 1.762970507144928, "reward_std": 0.1429205760359764, "rewards/accuracy_reward": 0.7629704773426056, "rewards/format_reward": 1.0, "step": 53 }, { "completion_length": 69.1796875, "epoch": 13.5, "grad_norm": 1.7760093212127686, "kl": 0.084228515625, "learning_rate": 9.6625e-07, "loss": 0.0034, "reward": 1.7609952092170715, "reward_std": 0.11301954090595245, "rewards/accuracy_reward": 0.7609952092170715, "rewards/format_reward": 1.0, "step": 54 }, { "completion_length": 65.984375, "epoch": 13.75, "grad_norm": 1.944703459739685, "kl": 0.0472412109375, "learning_rate": 9.65625e-07, "loss": 0.0019, "reward": 1.8564435839653015, "reward_std": 0.14264069870114326, "rewards/accuracy_reward": 0.8564436435699463, "rewards/format_reward": 1.0, "step": 55 }, { "completion_length": 70.85714340209961, "epoch": 14.0, "grad_norm": 1.7048606872558594, "kl": 0.059326171875, "learning_rate": 9.649999999999999e-07, "loss": 0.0023, "reward": 1.8061460256576538, "reward_std": 0.2514180298894644, "rewards/accuracy_reward": 0.8061459064483643, "rewards/format_reward": 1.0, "step": 56 }, { "completion_length": 64.2421875, "epoch": 14.25, "grad_norm": 1.8009275197982788, "kl": 0.083251953125, "learning_rate": 9.64375e-07, "loss": 0.0033, "reward": 1.7855259776115417, "reward_std": 0.08876464702188969, "rewards/accuracy_reward": 0.785525918006897, "rewards/format_reward": 1.0, "step": 57 }, { "completion_length": 68.515625, "epoch": 14.5, "grad_norm": 1.522387981414795, "kl": 0.074951171875, "learning_rate": 9.637499999999999e-07, "loss": 0.003, "reward": 1.8352751731872559, "reward_std": 0.12848591804504395, "rewards/accuracy_reward": 0.8352752029895782, "rewards/format_reward": 1.0, "step": 58 }, { "completion_length": 67.0390625, "epoch": 14.75, "grad_norm": 2.3351709842681885, "kl": 0.065673828125, "learning_rate": 9.63125e-07, "loss": 0.0026, "reward": 1.8882685899734497, "reward_std": 0.11743934452533722, "rewards/accuracy_reward": 0.8882685601711273, "rewards/format_reward": 1.0, "step": 59 }, { "completion_length": 58.71428871154785, "epoch": 15.0, "grad_norm": 1.921271800994873, "kl": 0.05419921875, "learning_rate": 9.624999999999999e-07, "loss": 0.0022, "reward": 1.53987056016922, "reward_std": 0.16531461104750633, "rewards/accuracy_reward": 0.5398704707622528, "rewards/format_reward": 1.0, "step": 60 }, { "completion_length": 68.25, "epoch": 15.25, "grad_norm": 1.8104687929153442, "kl": 0.060302734375, "learning_rate": 9.61875e-07, "loss": 0.0024, "reward": 1.8509221076965332, "reward_std": 0.11642135679721832, "rewards/accuracy_reward": 0.8509220480918884, "rewards/format_reward": 1.0, "step": 61 }, { "completion_length": 67.453125, "epoch": 15.5, "grad_norm": 2.454641819000244, "kl": 0.0623779296875, "learning_rate": 9.6125e-07, "loss": 0.0025, "reward": 1.7732171416282654, "reward_std": 0.1827603131532669, "rewards/accuracy_reward": 0.7732171416282654, "rewards/format_reward": 1.0, "step": 62 }, { "completion_length": 70.375, "epoch": 15.75, "grad_norm": 3.006380558013916, "kl": 0.123046875, "learning_rate": 9.606249999999998e-07, "loss": 0.0049, "reward": 1.774861454963684, "reward_std": 0.1715347319841385, "rewards/accuracy_reward": 0.7748615145683289, "rewards/format_reward": 1.0, "step": 63 }, { "completion_length": 56.142860412597656, "epoch": 16.0, "grad_norm": 3.028219223022461, "kl": 0.0634765625, "learning_rate": 9.6e-07, "loss": 0.0025, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 64 }, { "completion_length": 74.2890625, "epoch": 16.25, "grad_norm": 1.6325531005859375, "kl": 0.06103515625, "learning_rate": 9.59375e-07, "loss": 0.0024, "reward": 1.7759021520614624, "reward_std": 0.15456650406122208, "rewards/accuracy_reward": 0.7759020924568176, "rewards/format_reward": 1.0, "step": 65 }, { "completion_length": 69.484375, "epoch": 16.5, "grad_norm": 1.501849889755249, "kl": 0.068115234375, "learning_rate": 9.5875e-07, "loss": 0.0027, "reward": 1.7950308322906494, "reward_std": 0.12456715479493141, "rewards/accuracy_reward": 0.7950307726860046, "rewards/format_reward": 1.0, "step": 66 }, { "completion_length": 64.953125, "epoch": 16.75, "grad_norm": 2.5453922748565674, "kl": 0.0596923828125, "learning_rate": 9.58125e-07, "loss": 0.0024, "reward": 1.8739762902259827, "reward_std": 0.11443666741251945, "rewards/accuracy_reward": 0.8817887902259827, "rewards/format_reward": 0.9921875, "step": 67 }, { "completion_length": 61.42857551574707, "epoch": 17.0, "grad_norm": 1.8238996267318726, "kl": 0.04443359375, "learning_rate": 9.575e-07, "loss": 0.0017, "reward": 1.8428571224212646, "reward_std": 0.1414213627576828, "rewards/accuracy_reward": 0.8428571820259094, "rewards/format_reward": 1.0, "step": 68 }, { "completion_length": 72.0234375, "epoch": 17.25, "grad_norm": 1.7668324708938599, "kl": 0.07373046875, "learning_rate": 9.56875e-07, "loss": 0.003, "reward": 1.8650294542312622, "reward_std": 0.13415485620498657, "rewards/accuracy_reward": 0.8650294542312622, "rewards/format_reward": 1.0, "step": 69 }, { "completion_length": 64.9453125, "epoch": 17.5, "grad_norm": 1.4389671087265015, "kl": 0.047607421875, "learning_rate": 9.5625e-07, "loss": 0.0019, "reward": 1.927801787853241, "reward_std": 0.06346799433231354, "rewards/accuracy_reward": 0.927801787853241, "rewards/format_reward": 1.0, "step": 70 }, { "completion_length": 67.9296875, "epoch": 17.75, "grad_norm": 1.4634050130844116, "kl": 0.072509765625, "learning_rate": 9.556249999999999e-07, "loss": 0.0029, "reward": 1.8554179668426514, "reward_std": 0.13238264620304108, "rewards/accuracy_reward": 0.8554179966449738, "rewards/format_reward": 1.0, "step": 71 }, { "completion_length": 57.07143020629883, "epoch": 18.0, "grad_norm": 4.220071792602539, "kl": 0.17724609375, "learning_rate": 9.55e-07, "loss": 0.0064, "reward": 1.9889448285102844, "reward_std": 0.006739838980138302, "rewards/accuracy_reward": 0.988944798707962, "rewards/format_reward": 1.0, "step": 72 }, { "completion_length": 68.5546875, "epoch": 18.25, "grad_norm": 1.6068540811538696, "kl": 0.0767822265625, "learning_rate": 9.543749999999999e-07, "loss": 0.0031, "reward": 1.8943632245063782, "reward_std": 0.07841086108237505, "rewards/accuracy_reward": 0.8943631649017334, "rewards/format_reward": 1.0, "step": 73 }, { "completion_length": 68.2421875, "epoch": 18.5, "grad_norm": 1.6987547874450684, "kl": 0.07958984375, "learning_rate": 9.5375e-07, "loss": 0.0032, "reward": 1.8070343136787415, "reward_std": 0.14604970812797546, "rewards/accuracy_reward": 0.8304717838764191, "rewards/format_reward": 0.9765625, "step": 74 }, { "completion_length": 65.3046875, "epoch": 18.75, "grad_norm": 1.6191295385360718, "kl": 0.04541015625, "learning_rate": 9.53125e-07, "loss": 0.0018, "reward": 1.7468606233596802, "reward_std": 0.16931980848312378, "rewards/accuracy_reward": 0.7468606233596802, "rewards/format_reward": 1.0, "step": 75 }, { "completion_length": 50.78571891784668, "epoch": 19.0, "grad_norm": 1.6109150648117065, "kl": 0.066162109375, "learning_rate": 9.525e-07, "loss": 0.0027, "reward": 1.9664621353149414, "reward_std": 0.00548175536096096, "rewards/accuracy_reward": 0.9664620459079742, "rewards/format_reward": 1.0, "step": 76 }, { "completion_length": 66.2109375, "epoch": 19.25, "grad_norm": 1.385863184928894, "kl": 0.059326171875, "learning_rate": 9.51875e-07, "loss": 0.0024, "reward": 1.849595844745636, "reward_std": 0.0718111265450716, "rewards/accuracy_reward": 0.849595844745636, "rewards/format_reward": 1.0, "step": 77 }, { "completion_length": 75.578125, "epoch": 19.5, "grad_norm": 1.782371163368225, "kl": 0.073974609375, "learning_rate": 9.5125e-07, "loss": 0.003, "reward": 1.8164063096046448, "reward_std": 0.16241852939128876, "rewards/accuracy_reward": 0.8320313096046448, "rewards/format_reward": 0.984375, "step": 78 }, { "completion_length": 69.609375, "epoch": 19.75, "grad_norm": 1.2934050559997559, "kl": 0.0748291015625, "learning_rate": 9.50625e-07, "loss": 0.003, "reward": 1.8654966354370117, "reward_std": 0.09261503256857395, "rewards/accuracy_reward": 0.8654966354370117, "rewards/format_reward": 1.0, "step": 79 }, { "completion_length": 58.00000190734863, "epoch": 20.0, "grad_norm": 1.742002248764038, "kl": 0.082275390625, "learning_rate": 9.499999999999999e-07, "loss": 0.0033, "reward": 1.8342429995536804, "reward_std": 0.2157389521598816, "rewards/accuracy_reward": 0.8342429995536804, "rewards/format_reward": 1.0, "step": 80 }, { "completion_length": 67.65625, "epoch": 20.25, "grad_norm": 1.1428124904632568, "kl": 0.0526123046875, "learning_rate": 9.493749999999999e-07, "loss": 0.0021, "reward": 1.9487730264663696, "reward_std": 0.07040097191929817, "rewards/accuracy_reward": 0.9565855264663696, "rewards/format_reward": 0.9921875, "step": 81 }, { "completion_length": 73.515625, "epoch": 20.5, "grad_norm": 2.601008653640747, "kl": 0.0626220703125, "learning_rate": 9.487499999999999e-07, "loss": 0.0025, "reward": 1.770107924938202, "reward_std": 0.11608634144067764, "rewards/accuracy_reward": 0.7701078951358795, "rewards/format_reward": 1.0, "step": 82 }, { "completion_length": 70.1640625, "epoch": 20.75, "grad_norm": 1.5553102493286133, "kl": 0.07421875, "learning_rate": 9.481249999999999e-07, "loss": 0.003, "reward": 1.8716879487037659, "reward_std": 0.13853080570697784, "rewards/accuracy_reward": 0.8716880083084106, "rewards/format_reward": 1.0, "step": 83 }, { "completion_length": 100.9285774230957, "epoch": 21.0, "grad_norm": 1.4347081184387207, "kl": 0.081298828125, "learning_rate": 9.474999999999999e-07, "loss": 0.0032, "reward": 1.7912707328796387, "reward_std": 0.22874768637120724, "rewards/accuracy_reward": 0.7912707030773163, "rewards/format_reward": 1.0, "step": 84 }, { "completion_length": 75.5078125, "epoch": 21.25, "grad_norm": 1.4661049842834473, "kl": 0.05859375, "learning_rate": 9.468749999999999e-07, "loss": 0.0023, "reward": 1.7916218042373657, "reward_std": 0.07297072582878172, "rewards/accuracy_reward": 0.7916218042373657, "rewards/format_reward": 1.0, "step": 85 }, { "completion_length": 67.9296875, "epoch": 21.5, "grad_norm": 1.3486883640289307, "kl": 0.0849609375, "learning_rate": 9.462499999999999e-07, "loss": 0.0034, "reward": 1.9112024307250977, "reward_std": 0.07055860431864858, "rewards/accuracy_reward": 0.9112024307250977, "rewards/format_reward": 1.0, "step": 86 }, { "completion_length": 83.8359375, "epoch": 21.75, "grad_norm": 1.7671387195587158, "kl": 0.062744140625, "learning_rate": 9.45625e-07, "loss": 0.0025, "reward": 1.8552258610725403, "reward_std": 0.14477503299713135, "rewards/accuracy_reward": 0.8630383908748627, "rewards/format_reward": 0.9921875, "step": 87 }, { "completion_length": 68.42857360839844, "epoch": 22.0, "grad_norm": 1.4051504135131836, "kl": 0.0599365234375, "learning_rate": 9.45e-07, "loss": 0.0023, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 88 }, { "completion_length": 67.8203125, "epoch": 22.25, "grad_norm": 1.5293912887573242, "kl": 0.083251953125, "learning_rate": 9.44375e-07, "loss": 0.0033, "reward": 1.9049308896064758, "reward_std": 0.10063770413398743, "rewards/accuracy_reward": 0.9049308598041534, "rewards/format_reward": 1.0, "step": 89 }, { "completion_length": 73.7890625, "epoch": 22.5, "grad_norm": 1.6518833637237549, "kl": 0.07080078125, "learning_rate": 9.4375e-07, "loss": 0.0028, "reward": 1.9123117327690125, "reward_std": 0.10563771054148674, "rewards/accuracy_reward": 0.9201242327690125, "rewards/format_reward": 0.9921875, "step": 90 }, { "completion_length": 76.1484375, "epoch": 22.75, "grad_norm": 1.4721721410751343, "kl": 0.0540771484375, "learning_rate": 9.43125e-07, "loss": 0.0022, "reward": 1.7826260328292847, "reward_std": 0.09974323213100433, "rewards/accuracy_reward": 0.7904385328292847, "rewards/format_reward": 0.9921875, "step": 91 }, { "completion_length": 76.28571701049805, "epoch": 23.0, "grad_norm": 1.7413424253463745, "kl": 0.0732421875, "learning_rate": 9.425e-07, "loss": 0.003, "reward": 1.7768830060958862, "reward_std": 0.27347957249730825, "rewards/accuracy_reward": 0.7768829166889191, "rewards/format_reward": 1.0, "step": 92 }, { "completion_length": 74.9296875, "epoch": 23.25, "grad_norm": 1.5143275260925293, "kl": 0.08154296875, "learning_rate": 9.41875e-07, "loss": 0.0033, "reward": 1.8272383213043213, "reward_std": 0.14210523292422295, "rewards/accuracy_reward": 0.8350508213043213, "rewards/format_reward": 0.9921875, "step": 93 }, { "completion_length": 69.9453125, "epoch": 23.5, "grad_norm": 1.544258713722229, "kl": 0.08349609375, "learning_rate": 9.4125e-07, "loss": 0.0033, "reward": 1.937786877155304, "reward_std": 0.07279435358941555, "rewards/accuracy_reward": 0.9377869367599487, "rewards/format_reward": 1.0, "step": 94 }, { "completion_length": 74.7109375, "epoch": 23.75, "grad_norm": 1.8090089559555054, "kl": 0.0645751953125, "learning_rate": 9.40625e-07, "loss": 0.0026, "reward": 1.8037108778953552, "reward_std": 0.16137579828500748, "rewards/accuracy_reward": 0.8037109076976776, "rewards/format_reward": 1.0, "step": 95 }, { "completion_length": 58.07143020629883, "epoch": 24.0, "grad_norm": 1.5920876264572144, "kl": 0.0655517578125, "learning_rate": 9.399999999999999e-07, "loss": 0.0028, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 96 }, { "completion_length": 71.515625, "epoch": 24.25, "grad_norm": 1.088055968284607, "kl": 0.0806884765625, "learning_rate": 9.393749999999999e-07, "loss": 0.0032, "reward": 1.924683690071106, "reward_std": 0.054580602794885635, "rewards/accuracy_reward": 0.924683690071106, "rewards/format_reward": 1.0, "step": 97 }, { "completion_length": 67.359375, "epoch": 24.5, "grad_norm": 1.886084794998169, "kl": 0.062255859375, "learning_rate": 9.387499999999999e-07, "loss": 0.0025, "reward": 1.8458982706069946, "reward_std": 0.08389683440327644, "rewards/accuracy_reward": 0.8458982408046722, "rewards/format_reward": 1.0, "step": 98 }, { "completion_length": 67.765625, "epoch": 24.75, "grad_norm": 1.9300857782363892, "kl": 0.086181640625, "learning_rate": 9.381249999999999e-07, "loss": 0.0035, "reward": 1.8570204973220825, "reward_std": 0.1493590921163559, "rewards/accuracy_reward": 0.8570204675197601, "rewards/format_reward": 1.0, "step": 99 }, { "completion_length": 69.14286041259766, "epoch": 25.0, "grad_norm": 1.8485654592514038, "kl": 0.08447265625, "learning_rate": 9.374999999999999e-07, "loss": 0.0035, "reward": 1.9482696056365967, "reward_std": 0.042133186012506485, "rewards/accuracy_reward": 0.9482696056365967, "rewards/format_reward": 1.0, "step": 100 }, { "completion_length": 69.8046875, "epoch": 25.25, "grad_norm": 1.7303087711334229, "kl": 0.0626220703125, "learning_rate": 9.368749999999999e-07, "loss": 0.0025, "reward": 1.772914171218872, "reward_std": 0.13923951238393784, "rewards/accuracy_reward": 0.7807266712188721, "rewards/format_reward": 0.9921875, "step": 101 }, { "completion_length": 64.5234375, "epoch": 25.5, "grad_norm": 1.4180161952972412, "kl": 0.092041015625, "learning_rate": 9.3625e-07, "loss": 0.0037, "reward": 1.9157771468162537, "reward_std": 0.09159575775265694, "rewards/accuracy_reward": 0.9157771468162537, "rewards/format_reward": 1.0, "step": 102 }, { "completion_length": 66.8515625, "epoch": 25.75, "grad_norm": 1.7584847211837769, "kl": 0.098388671875, "learning_rate": 9.35625e-07, "loss": 0.0039, "reward": 1.9192783832550049, "reward_std": 0.05962797999382019, "rewards/accuracy_reward": 0.9192784130573273, "rewards/format_reward": 1.0, "step": 103 }, { "completion_length": 80.28571701049805, "epoch": 26.0, "grad_norm": 1.4904193878173828, "kl": 0.079345703125, "learning_rate": 9.35e-07, "loss": 0.003, "reward": 1.8510922193527222, "reward_std": 0.12523864209651947, "rewards/accuracy_reward": 0.8510921597480774, "rewards/format_reward": 1.0, "step": 104 }, { "completion_length": 61.109375, "epoch": 26.25, "grad_norm": 1.324255108833313, "kl": 0.0677490234375, "learning_rate": 9.34375e-07, "loss": 0.0027, "reward": 1.9567571878433228, "reward_std": 0.0685323141515255, "rewards/accuracy_reward": 0.9567572176456451, "rewards/format_reward": 1.0, "step": 105 }, { "completion_length": 71.4296875, "epoch": 26.5, "grad_norm": 2.364198684692383, "kl": 0.07373046875, "learning_rate": 9.3375e-07, "loss": 0.0029, "reward": 1.808899700641632, "reward_std": 0.1115667074918747, "rewards/accuracy_reward": 0.8167122006416321, "rewards/format_reward": 0.9921875, "step": 106 }, { "completion_length": 73.1171875, "epoch": 26.75, "grad_norm": 2.9395134449005127, "kl": 0.0947265625, "learning_rate": 9.33125e-07, "loss": 0.0038, "reward": 1.7833083868026733, "reward_std": 0.13094724714756012, "rewards/accuracy_reward": 0.7911209166049957, "rewards/format_reward": 0.9921875, "step": 107 }, { "completion_length": 58.42857551574707, "epoch": 27.0, "grad_norm": 1.540488839149475, "kl": 0.093994140625, "learning_rate": 9.325e-07, "loss": 0.0036, "reward": 1.9905372262001038, "reward_std": 0.008901512250304222, "rewards/accuracy_reward": 0.9905371963977814, "rewards/format_reward": 1.0, "step": 108 }, { "completion_length": 74.328125, "epoch": 27.25, "grad_norm": 1.7257088422775269, "kl": 0.068115234375, "learning_rate": 9.31875e-07, "loss": 0.0027, "reward": 1.8329947590827942, "reward_std": 0.1075306311249733, "rewards/accuracy_reward": 0.8329947590827942, "rewards/format_reward": 1.0, "step": 109 }, { "completion_length": 65.4375, "epoch": 27.5, "grad_norm": 1.3653507232666016, "kl": 0.072265625, "learning_rate": 9.3125e-07, "loss": 0.0029, "reward": 1.9556488394737244, "reward_std": 0.022309845313429832, "rewards/accuracy_reward": 0.9556488394737244, "rewards/format_reward": 1.0, "step": 110 }, { "completion_length": 66.34375, "epoch": 27.75, "grad_norm": 2.26039981842041, "kl": 0.07080078125, "learning_rate": 9.30625e-07, "loss": 0.0028, "reward": 1.8479012846946716, "reward_std": 0.1186746098101139, "rewards/accuracy_reward": 0.8479012846946716, "rewards/format_reward": 1.0, "step": 111 }, { "completion_length": 73.64286231994629, "epoch": 28.0, "grad_norm": 1.920817494392395, "kl": 0.089599609375, "learning_rate": 9.3e-07, "loss": 0.0036, "reward": 1.8746840953826904, "reward_std": 0.15773950517177582, "rewards/accuracy_reward": 0.874684065580368, "rewards/format_reward": 1.0, "step": 112 }, { "completion_length": 70.9375, "epoch": 28.25, "grad_norm": 2.101787805557251, "kl": 0.070068359375, "learning_rate": 9.293749999999999e-07, "loss": 0.0028, "reward": 1.9114864468574524, "reward_std": 0.08409742452204227, "rewards/accuracy_reward": 0.9114864468574524, "rewards/format_reward": 1.0, "step": 113 }, { "completion_length": 68.3671875, "epoch": 28.5, "grad_norm": 1.2061454057693481, "kl": 0.071044921875, "learning_rate": 9.287499999999999e-07, "loss": 0.0028, "reward": 1.8754997849464417, "reward_std": 0.1343640312552452, "rewards/accuracy_reward": 0.883312314748764, "rewards/format_reward": 0.9921875, "step": 114 }, { "completion_length": 75.0390625, "epoch": 28.75, "grad_norm": 1.5401500463485718, "kl": 0.102294921875, "learning_rate": 9.281249999999999e-07, "loss": 0.0041, "reward": 1.7917361855506897, "reward_std": 0.11317018419504166, "rewards/accuracy_reward": 0.7917361855506897, "rewards/format_reward": 1.0, "step": 115 }, { "completion_length": 73.21429061889648, "epoch": 29.0, "grad_norm": 1.6947107315063477, "kl": 0.06884765625, "learning_rate": 9.274999999999999e-07, "loss": 0.0027, "reward": 1.8370923399925232, "reward_std": 0.21845543384552002, "rewards/accuracy_reward": 0.8370923399925232, "rewards/format_reward": 1.0, "step": 116 }, { "completion_length": 75.4765625, "epoch": 29.25, "grad_norm": 1.9241127967834473, "kl": 0.085693359375, "learning_rate": 9.268749999999999e-07, "loss": 0.0034, "reward": 1.8733400702476501, "reward_std": 0.15294026210904121, "rewards/accuracy_reward": 0.8811526000499725, "rewards/format_reward": 0.9921875, "step": 117 }, { "completion_length": 73.2890625, "epoch": 29.5, "grad_norm": 1.4431734085083008, "kl": 0.076171875, "learning_rate": 9.2625e-07, "loss": 0.0031, "reward": 1.829237937927246, "reward_std": 0.12272904813289642, "rewards/accuracy_reward": 0.8370503783226013, "rewards/format_reward": 0.9921875, "step": 118 }, { "completion_length": 63.21875, "epoch": 29.75, "grad_norm": 0.8593989014625549, "kl": 0.092529296875, "learning_rate": 9.25625e-07, "loss": 0.0037, "reward": 1.89995938539505, "reward_std": 0.026171773206442595, "rewards/accuracy_reward": 0.8999594449996948, "rewards/format_reward": 1.0, "step": 119 }, { "completion_length": 61.92857360839844, "epoch": 30.0, "grad_norm": 1.6377516984939575, "kl": 0.094970703125, "learning_rate": 9.25e-07, "loss": 0.0037, "reward": 1.8513376116752625, "reward_std": 0.1493394821882248, "rewards/accuracy_reward": 0.8513375520706177, "rewards/format_reward": 1.0, "step": 120 }, { "completion_length": 76.2578125, "epoch": 30.25, "grad_norm": 1.9491173028945923, "kl": 0.07861328125, "learning_rate": 9.243749999999999e-07, "loss": 0.0031, "reward": 1.874415099620819, "reward_std": 0.10396317765116692, "rewards/accuracy_reward": 0.8744150996208191, "rewards/format_reward": 1.0, "step": 121 }, { "completion_length": 71.0703125, "epoch": 30.5, "grad_norm": 1.4187074899673462, "kl": 0.0557861328125, "learning_rate": 9.237499999999999e-07, "loss": 0.0022, "reward": 1.8533148169517517, "reward_std": 0.1055279728025198, "rewards/accuracy_reward": 0.8533147573471069, "rewards/format_reward": 1.0, "step": 122 }, { "completion_length": 68.4921875, "epoch": 30.75, "grad_norm": 1.2265938520431519, "kl": 0.08837890625, "learning_rate": 9.23125e-07, "loss": 0.0035, "reward": 1.8870754837989807, "reward_std": 0.05487770680338144, "rewards/accuracy_reward": 0.8870754837989807, "rewards/format_reward": 1.0, "step": 123 }, { "completion_length": 68.42857551574707, "epoch": 31.0, "grad_norm": 1.4902782440185547, "kl": 0.141357421875, "learning_rate": 9.225e-07, "loss": 0.0054, "reward": 1.8678642511367798, "reward_std": 0.08099648356437683, "rewards/accuracy_reward": 0.8678641617298126, "rewards/format_reward": 1.0, "step": 124 }, { "completion_length": 65.359375, "epoch": 31.25, "grad_norm": 1.6962063312530518, "kl": 0.115478515625, "learning_rate": 9.21875e-07, "loss": 0.0046, "reward": 1.9279668927192688, "reward_std": 0.08291410095989704, "rewards/accuracy_reward": 0.9279668629169464, "rewards/format_reward": 1.0, "step": 125 }, { "completion_length": 66.3671875, "epoch": 31.5, "grad_norm": 1.7165579795837402, "kl": 0.072265625, "learning_rate": 9.2125e-07, "loss": 0.0029, "reward": 1.8470426201820374, "reward_std": 0.11905767396092415, "rewards/accuracy_reward": 0.8626675605773926, "rewards/format_reward": 0.984375, "step": 126 }, { "completion_length": 67.90625, "epoch": 31.75, "grad_norm": 1.3967280387878418, "kl": 0.070068359375, "learning_rate": 9.20625e-07, "loss": 0.0028, "reward": 1.8610433340072632, "reward_std": 0.07865873631089926, "rewards/accuracy_reward": 0.868855893611908, "rewards/format_reward": 0.9921875, "step": 127 }, { "completion_length": 89.50000381469727, "epoch": 32.0, "grad_norm": 1.5761455297470093, "kl": 0.08544921875, "learning_rate": 9.2e-07, "loss": 0.0036, "reward": 1.8485036492347717, "reward_std": 0.1893613114953041, "rewards/accuracy_reward": 0.848503589630127, "rewards/format_reward": 1.0, "step": 128 }, { "completion_length": 61.921875, "epoch": 32.25, "grad_norm": 2.392521619796753, "kl": 0.07666015625, "learning_rate": 9.19375e-07, "loss": 0.0031, "reward": 1.9423640966415405, "reward_std": 0.07001920230686665, "rewards/accuracy_reward": 0.9423640370368958, "rewards/format_reward": 1.0, "step": 129 }, { "completion_length": 63.15625, "epoch": 32.5, "grad_norm": 1.7595425844192505, "kl": 0.067138671875, "learning_rate": 9.187499999999999e-07, "loss": 0.0027, "reward": 1.7660340666770935, "reward_std": 0.13782843947410583, "rewards/accuracy_reward": 0.7738466858863831, "rewards/format_reward": 0.9921875, "step": 130 }, { "completion_length": 66.59375, "epoch": 32.75, "grad_norm": 1.5350968837738037, "kl": 0.07666015625, "learning_rate": 9.181249999999999e-07, "loss": 0.0031, "reward": 1.9150999784469604, "reward_std": 0.08709576074033976, "rewards/accuracy_reward": 0.9150999784469604, "rewards/format_reward": 1.0, "step": 131 }, { "completion_length": 76.21428680419922, "epoch": 33.0, "grad_norm": 1.6754910945892334, "kl": 0.10986328125, "learning_rate": 9.174999999999999e-07, "loss": 0.0044, "reward": 1.748177945613861, "reward_std": 0.24180777370929718, "rewards/accuracy_reward": 0.7481780052185059, "rewards/format_reward": 1.0, "step": 132 }, { "completion_length": 60.9921875, "epoch": 33.25, "grad_norm": 1.6315778493881226, "kl": 0.102294921875, "learning_rate": 9.168749999999999e-07, "loss": 0.0041, "reward": 1.9423342943191528, "reward_std": 0.08625898323953152, "rewards/accuracy_reward": 0.94233438372612, "rewards/format_reward": 1.0, "step": 133 }, { "completion_length": 65.8515625, "epoch": 33.5, "grad_norm": 1.549924373626709, "kl": 0.08837890625, "learning_rate": 9.1625e-07, "loss": 0.0035, "reward": 1.8791784048080444, "reward_std": 0.07251664437353611, "rewards/accuracy_reward": 0.8791784048080444, "rewards/format_reward": 1.0, "step": 134 }, { "completion_length": 64.0, "epoch": 33.75, "grad_norm": 1.5533236265182495, "kl": 0.1025390625, "learning_rate": 9.15625e-07, "loss": 0.0041, "reward": 1.8645389080047607, "reward_std": 0.06934745609760284, "rewards/accuracy_reward": 0.8645389974117279, "rewards/format_reward": 1.0, "step": 135 }, { "completion_length": 53.50000190734863, "epoch": 34.0, "grad_norm": 1.9051095247268677, "kl": 0.08349609375, "learning_rate": 9.15e-07, "loss": 0.0035, "reward": 1.9944872856140137, "reward_std": 0.002273906720802188, "rewards/accuracy_reward": 0.9944872260093689, "rewards/format_reward": 1.0, "step": 136 }, { "completion_length": 64.984375, "epoch": 34.25, "grad_norm": 1.688677191734314, "kl": 0.09130859375, "learning_rate": 9.14375e-07, "loss": 0.0037, "reward": 1.877675175666809, "reward_std": 0.07563214749097824, "rewards/accuracy_reward": 0.8776752054691315, "rewards/format_reward": 1.0, "step": 137 }, { "completion_length": 69.9375, "epoch": 34.5, "grad_norm": 2.37134051322937, "kl": 0.08447265625, "learning_rate": 9.137499999999999e-07, "loss": 0.0034, "reward": 1.8840071558952332, "reward_std": 0.0991634838283062, "rewards/accuracy_reward": 0.8840071260929108, "rewards/format_reward": 1.0, "step": 138 }, { "completion_length": 62.5078125, "epoch": 34.75, "grad_norm": 3.3183658123016357, "kl": 0.091064453125, "learning_rate": 9.131249999999999e-07, "loss": 0.0036, "reward": 1.9298002123832703, "reward_std": 0.04489972349256277, "rewards/accuracy_reward": 0.929800271987915, "rewards/format_reward": 1.0, "step": 139 }, { "completion_length": 43.64285850524902, "epoch": 35.0, "grad_norm": 1.5096288919448853, "kl": 0.11328125, "learning_rate": 9.124999999999999e-07, "loss": 0.0046, "reward": 2.000000238418579, "reward_std": 0.0, "rewards/accuracy_reward": 1.0000001788139343, "rewards/format_reward": 1.0, "step": 140 }, { "completion_length": 66.15625, "epoch": 35.25, "grad_norm": 1.5723927021026611, "kl": 0.125244140625, "learning_rate": 9.11875e-07, "loss": 0.005, "reward": 1.883579969406128, "reward_std": 0.11640417203307152, "rewards/accuracy_reward": 0.8835799992084503, "rewards/format_reward": 1.0, "step": 141 }, { "completion_length": 63.1875, "epoch": 35.5, "grad_norm": 1.9653661251068115, "kl": 0.112060546875, "learning_rate": 9.1125e-07, "loss": 0.0045, "reward": 1.9033833146095276, "reward_std": 0.1032666489481926, "rewards/accuracy_reward": 0.9033832252025604, "rewards/format_reward": 1.0, "step": 142 }, { "completion_length": 68.21875, "epoch": 35.75, "grad_norm": 1.6285479068756104, "kl": 0.098876953125, "learning_rate": 9.10625e-07, "loss": 0.004, "reward": 1.9241021275520325, "reward_std": 0.048172490671277046, "rewards/accuracy_reward": 0.9241021573543549, "rewards/format_reward": 1.0, "step": 143 }, { "completion_length": 66.00000381469727, "epoch": 36.0, "grad_norm": 1.6228117942810059, "kl": 0.086669921875, "learning_rate": 9.1e-07, "loss": 0.0036, "reward": 1.9067211747169495, "reward_std": 0.1559794396162033, "rewards/accuracy_reward": 0.9067211747169495, "rewards/format_reward": 1.0, "step": 144 }, { "completion_length": 66.4140625, "epoch": 36.25, "grad_norm": 1.8398008346557617, "kl": 0.11865234375, "learning_rate": 9.09375e-07, "loss": 0.0047, "reward": 1.8845882415771484, "reward_std": 0.09509449079632759, "rewards/accuracy_reward": 0.8845882713794708, "rewards/format_reward": 1.0, "step": 145 }, { "completion_length": 62.125, "epoch": 36.5, "grad_norm": 1.786875605583191, "kl": 0.133544921875, "learning_rate": 9.087499999999999e-07, "loss": 0.0053, "reward": 1.9288102984428406, "reward_std": 0.10526704788208008, "rewards/accuracy_reward": 0.928810328245163, "rewards/format_reward": 1.0, "step": 146 }, { "completion_length": 66.4609375, "epoch": 36.75, "grad_norm": 1.0654469728469849, "kl": 0.085693359375, "learning_rate": 9.081249999999999e-07, "loss": 0.0034, "reward": 1.9138463139533997, "reward_std": 0.05325407162308693, "rewards/accuracy_reward": 0.9138462841510773, "rewards/format_reward": 1.0, "step": 147 }, { "completion_length": 73.92857360839844, "epoch": 37.0, "grad_norm": 3.4973747730255127, "kl": 0.08349609375, "learning_rate": 9.074999999999999e-07, "loss": 0.0033, "reward": 1.9407565593719482, "reward_std": 0.02056967036332935, "rewards/accuracy_reward": 0.9407565593719482, "rewards/format_reward": 1.0, "step": 148 }, { "completion_length": 64.2421875, "epoch": 37.25, "grad_norm": 1.0742721557617188, "kl": 0.0662841796875, "learning_rate": 9.068749999999999e-07, "loss": 0.0027, "reward": 1.95524662733078, "reward_std": 0.03273457381874323, "rewards/accuracy_reward": 0.9552466571331024, "rewards/format_reward": 1.0, "step": 149 }, { "completion_length": 65.5234375, "epoch": 37.5, "grad_norm": 1.77584707736969, "kl": 0.13134765625, "learning_rate": 9.0625e-07, "loss": 0.0053, "reward": 1.9271560907363892, "reward_std": 0.09471606463193893, "rewards/accuracy_reward": 0.9271561205387115, "rewards/format_reward": 1.0, "step": 150 }, { "completion_length": 64.6484375, "epoch": 37.75, "grad_norm": 1.662251353263855, "kl": 0.10595703125, "learning_rate": 9.05625e-07, "loss": 0.0042, "reward": 1.886656403541565, "reward_std": 0.061278367415070534, "rewards/accuracy_reward": 0.8866565227508545, "rewards/format_reward": 1.0, "step": 151 }, { "completion_length": 62.14285850524902, "epoch": 38.0, "grad_norm": 1.0349845886230469, "kl": 0.084716796875, "learning_rate": 9.05e-07, "loss": 0.0033, "reward": 1.9759380221366882, "reward_std": 0.006050224881619215, "rewards/accuracy_reward": 0.9759379625320435, "rewards/format_reward": 1.0, "step": 152 }, { "completion_length": 68.9453125, "epoch": 38.25, "grad_norm": 1.4474000930786133, "kl": 0.081298828125, "learning_rate": 9.04375e-07, "loss": 0.0033, "reward": 1.899282455444336, "reward_std": 0.07941721752285957, "rewards/accuracy_reward": 0.8992824554443359, "rewards/format_reward": 1.0, "step": 153 }, { "completion_length": 67.90625, "epoch": 38.5, "grad_norm": 1.698777437210083, "kl": 0.091552734375, "learning_rate": 9.0375e-07, "loss": 0.0037, "reward": 1.8697898387908936, "reward_std": 0.096246431581676, "rewards/accuracy_reward": 0.8697898387908936, "rewards/format_reward": 1.0, "step": 154 }, { "completion_length": 62.40625, "epoch": 38.75, "grad_norm": 2.1754443645477295, "kl": 0.142578125, "learning_rate": 9.031249999999999e-07, "loss": 0.0057, "reward": 1.938372015953064, "reward_std": 0.05481863580644131, "rewards/accuracy_reward": 0.9383720755577087, "rewards/format_reward": 1.0, "step": 155 }, { "completion_length": 61.42857551574707, "epoch": 39.0, "grad_norm": 1.1363853216171265, "kl": 0.10986328125, "learning_rate": 9.024999999999999e-07, "loss": 0.0041, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.9999999403953552, "rewards/format_reward": 1.0, "step": 156 }, { "completion_length": 63.625, "epoch": 39.25, "grad_norm": 2.1956725120544434, "kl": 0.09814453125, "learning_rate": 9.018749999999999e-07, "loss": 0.0039, "reward": 1.9699036478996277, "reward_std": 0.028281668201088905, "rewards/accuracy_reward": 0.9699036478996277, "rewards/format_reward": 1.0, "step": 157 }, { "completion_length": 68.1953125, "epoch": 39.5, "grad_norm": 12.062308311462402, "kl": 0.089111328125, "learning_rate": 9.0125e-07, "loss": 0.0036, "reward": 1.9097211360931396, "reward_std": 0.028848751448094845, "rewards/accuracy_reward": 0.9097210764884949, "rewards/format_reward": 1.0, "step": 158 }, { "completion_length": 60.328125, "epoch": 39.75, "grad_norm": 1.7780170440673828, "kl": 0.117431640625, "learning_rate": 9.00625e-07, "loss": 0.0047, "reward": 1.9212573766708374, "reward_std": 0.05963377561420202, "rewards/accuracy_reward": 0.9212574064731598, "rewards/format_reward": 1.0, "step": 159 }, { "completion_length": 74.35714721679688, "epoch": 40.0, "grad_norm": 1.2525172233581543, "kl": 0.092041015625, "learning_rate": 9e-07, "loss": 0.0034, "reward": 1.60818749666214, "reward_std": 0.27048664540052414, "rewards/accuracy_reward": 0.6081875115633011, "rewards/format_reward": 1.0, "step": 160 }, { "completion_length": 64.375, "epoch": 40.25, "grad_norm": 1.7184271812438965, "kl": 0.124267578125, "learning_rate": 8.99375e-07, "loss": 0.005, "reward": 1.8835479021072388, "reward_std": 0.12309728935360909, "rewards/accuracy_reward": 0.8835478723049164, "rewards/format_reward": 1.0, "step": 161 }, { "completion_length": 64.40625, "epoch": 40.5, "grad_norm": 1.8725802898406982, "kl": 0.093017578125, "learning_rate": 8.9875e-07, "loss": 0.0037, "reward": 1.8645429015159607, "reward_std": 0.11185415461659431, "rewards/accuracy_reward": 0.8645428419113159, "rewards/format_reward": 1.0, "step": 162 }, { "completion_length": 63.2890625, "epoch": 40.75, "grad_norm": 1.6588366031646729, "kl": 0.103759765625, "learning_rate": 8.981249999999999e-07, "loss": 0.0042, "reward": 1.9586772918701172, "reward_std": 0.0604820279404521, "rewards/accuracy_reward": 0.9586772918701172, "rewards/format_reward": 1.0, "step": 163 }, { "completion_length": 57.07143020629883, "epoch": 41.0, "grad_norm": 1.847510576248169, "kl": 0.0975341796875, "learning_rate": 8.974999999999999e-07, "loss": 0.004, "reward": 1.996271789073944, "reward_std": 0.009226946160197258, "rewards/accuracy_reward": 0.9962717592716217, "rewards/format_reward": 1.0, "step": 164 }, { "completion_length": 65.078125, "epoch": 41.25, "grad_norm": 38.17945861816406, "kl": 0.104736328125, "learning_rate": 8.96875e-07, "loss": 0.0042, "reward": 1.972551941871643, "reward_std": 0.029624830232933164, "rewards/accuracy_reward": 0.9725519418716431, "rewards/format_reward": 1.0, "step": 165 }, { "completion_length": 61.625, "epoch": 41.5, "grad_norm": 1.6226741075515747, "kl": 0.10888671875, "learning_rate": 8.9625e-07, "loss": 0.0044, "reward": 1.9604188203811646, "reward_std": 0.032178135588765144, "rewards/accuracy_reward": 0.9604187309741974, "rewards/format_reward": 1.0, "step": 166 }, { "completion_length": 62.6953125, "epoch": 41.75, "grad_norm": 1.6520531177520752, "kl": 0.10791015625, "learning_rate": 8.95625e-07, "loss": 0.0043, "reward": 1.927883267402649, "reward_std": 0.13132158294320107, "rewards/accuracy_reward": 0.9278833270072937, "rewards/format_reward": 1.0, "step": 167 }, { "completion_length": 59.500003814697266, "epoch": 42.0, "grad_norm": 3.64481520652771, "kl": 0.224365234375, "learning_rate": 8.95e-07, "loss": 0.0083, "reward": 1.990772783756256, "reward_std": 0.014955918304622173, "rewards/accuracy_reward": 0.9907727539539337, "rewards/format_reward": 1.0, "step": 168 }, { "completion_length": 64.328125, "epoch": 42.25, "grad_norm": 1.8193657398223877, "kl": 0.139404296875, "learning_rate": 8.94375e-07, "loss": 0.0056, "reward": 1.8847713470458984, "reward_std": 0.07002338580787182, "rewards/accuracy_reward": 0.8847712576389313, "rewards/format_reward": 1.0, "step": 169 }, { "completion_length": 64.3203125, "epoch": 42.5, "grad_norm": 1.530835509300232, "kl": 0.081298828125, "learning_rate": 8.9375e-07, "loss": 0.0032, "reward": 1.932590126991272, "reward_std": 0.08853724412620068, "rewards/accuracy_reward": 0.932590126991272, "rewards/format_reward": 1.0, "step": 170 }, { "completion_length": 64.421875, "epoch": 42.75, "grad_norm": 1.8089220523834229, "kl": 0.10009765625, "learning_rate": 8.931249999999999e-07, "loss": 0.004, "reward": 1.9313350915908813, "reward_std": 0.10991925746202469, "rewards/accuracy_reward": 0.9313351213932037, "rewards/format_reward": 1.0, "step": 171 }, { "completion_length": 57.78571891784668, "epoch": 43.0, "grad_norm": 1.598233938217163, "kl": 0.1025390625, "learning_rate": 8.924999999999999e-07, "loss": 0.004, "reward": 1.9523810148239136, "reward_std": 0.117851123213768, "rewards/accuracy_reward": 0.9523809850215912, "rewards/format_reward": 1.0, "step": 172 }, { "completion_length": 71.3046875, "epoch": 43.25, "grad_norm": 1.3065526485443115, "kl": 0.10498046875, "learning_rate": 8.918749999999999e-07, "loss": 0.0042, "reward": 1.9195694327354431, "reward_std": 0.09789480268955231, "rewards/accuracy_reward": 0.9195694029331207, "rewards/format_reward": 1.0, "step": 173 }, { "completion_length": 65.5234375, "epoch": 43.5, "grad_norm": 2.1177382469177246, "kl": 0.077880859375, "learning_rate": 8.912499999999999e-07, "loss": 0.0031, "reward": 1.9476656317710876, "reward_std": 0.1074841171503067, "rewards/accuracy_reward": 0.9476656317710876, "rewards/format_reward": 1.0, "step": 174 }, { "completion_length": 65.203125, "epoch": 43.75, "grad_norm": 1.707092523574829, "kl": 0.092529296875, "learning_rate": 8.906249999999999e-07, "loss": 0.0037, "reward": 1.957956075668335, "reward_std": 0.08192763105034828, "rewards/accuracy_reward": 0.9579560160636902, "rewards/format_reward": 1.0, "step": 175 }, { "completion_length": 63.92857360839844, "epoch": 44.0, "grad_norm": 2.0081725120544434, "kl": 0.112060546875, "learning_rate": 8.9e-07, "loss": 0.0042, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 176 }, { "completion_length": 70.5234375, "epoch": 44.25, "grad_norm": 1.1276594400405884, "kl": 0.078125, "learning_rate": 8.89375e-07, "loss": 0.0031, "reward": 1.9139772653579712, "reward_std": 0.04851855710148811, "rewards/accuracy_reward": 0.9139772355556488, "rewards/format_reward": 1.0, "step": 177 }, { "completion_length": 66.9609375, "epoch": 44.5, "grad_norm": 1.5237661600112915, "kl": 0.087646484375, "learning_rate": 8.8875e-07, "loss": 0.0035, "reward": 1.9119119048118591, "reward_std": 0.11337531358003616, "rewards/accuracy_reward": 0.9119119048118591, "rewards/format_reward": 1.0, "step": 178 }, { "completion_length": 67.40625, "epoch": 44.75, "grad_norm": 1.9239343404769897, "kl": 0.118408203125, "learning_rate": 8.88125e-07, "loss": 0.0047, "reward": 1.950349748134613, "reward_std": 0.08595369663089514, "rewards/accuracy_reward": 0.950349748134613, "rewards/format_reward": 1.0, "step": 179 }, { "completion_length": 65.92857360839844, "epoch": 45.0, "grad_norm": 1.3003308773040771, "kl": 0.089111328125, "learning_rate": 8.874999999999999e-07, "loss": 0.0033, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 180 }, { "completion_length": 62.4296875, "epoch": 45.25, "grad_norm": 1.5781491994857788, "kl": 0.1025390625, "learning_rate": 8.86875e-07, "loss": 0.0041, "reward": 1.9496909379959106, "reward_std": 0.09814758412539959, "rewards/accuracy_reward": 0.957503467798233, "rewards/format_reward": 0.9921875, "step": 181 }, { "completion_length": 68.4453125, "epoch": 45.5, "grad_norm": 1.3683621883392334, "kl": 0.066162109375, "learning_rate": 8.8625e-07, "loss": 0.0027, "reward": 1.9691051840782166, "reward_std": 0.05337556218728423, "rewards/accuracy_reward": 0.9691051244735718, "rewards/format_reward": 1.0, "step": 182 }, { "completion_length": 68.578125, "epoch": 45.75, "grad_norm": 2.0152835845947266, "kl": 0.1162109375, "learning_rate": 8.85625e-07, "loss": 0.0047, "reward": 1.8733105659484863, "reward_std": 0.10198326967656612, "rewards/accuracy_reward": 0.8733105063438416, "rewards/format_reward": 1.0, "step": 183 }, { "completion_length": 67.00000190734863, "epoch": 46.0, "grad_norm": 1.3855398893356323, "kl": 0.091796875, "learning_rate": 8.85e-07, "loss": 0.0041, "reward": 2.0, "reward_std": 0.0, "rewards/accuracy_reward": 1.0, "rewards/format_reward": 1.0, "step": 184 }, { "completion_length": 64.0859375, "epoch": 46.25, "grad_norm": 1.4427531957626343, "kl": 0.12060546875, "learning_rate": 8.84375e-07, "loss": 0.0048, "reward": 1.9910829067230225, "reward_std": 0.013608792796730995, "rewards/accuracy_reward": 0.9910828173160553, "rewards/format_reward": 1.0, "step": 185 }, { "completion_length": 68.8984375, "epoch": 46.5, "grad_norm": 1.636596918106079, "kl": 0.11083984375, "learning_rate": 8.8375e-07, "loss": 0.0044, "reward": 1.8811240196228027, "reward_std": 0.08701632916927338, "rewards/accuracy_reward": 0.8811240196228027, "rewards/format_reward": 1.0, "step": 186 }, { "completion_length": 62.34375, "epoch": 46.75, "grad_norm": 1.5652858018875122, "kl": 0.095947265625, "learning_rate": 8.83125e-07, "loss": 0.0038, "reward": 1.9629307389259338, "reward_std": 0.04571997746825218, "rewards/accuracy_reward": 0.9629306495189667, "rewards/format_reward": 1.0, "step": 187 }, { "completion_length": 52.85714530944824, "epoch": 47.0, "grad_norm": 1.4924471378326416, "kl": 0.0849609375, "learning_rate": 8.824999999999999e-07, "loss": 0.0035, "reward": 1.8819976449012756, "reward_std": 0.043679721653461456, "rewards/accuracy_reward": 0.8819977045059204, "rewards/format_reward": 1.0, "step": 188 }, { "completion_length": 60.375, "epoch": 47.25, "grad_norm": 1.2424156665802002, "kl": 0.11572265625, "learning_rate": 8.818749999999999e-07, "loss": 0.0046, "reward": 1.9622814059257507, "reward_std": 0.03379644639790058, "rewards/accuracy_reward": 0.9622813165187836, "rewards/format_reward": 1.0, "step": 189 }, { "completion_length": 64.8125, "epoch": 47.5, "grad_norm": 1.445113182067871, "kl": 0.107421875, "learning_rate": 8.812499999999999e-07, "loss": 0.0043, "reward": 1.969697892665863, "reward_std": 0.025794532150030136, "rewards/accuracy_reward": 0.9696978330612183, "rewards/format_reward": 1.0, "step": 190 }, { "completion_length": 62.25, "epoch": 47.75, "grad_norm": 1.8363816738128662, "kl": 0.161865234375, "learning_rate": 8.806249999999999e-07, "loss": 0.0065, "reward": 1.900344431400299, "reward_std": 0.05351579561829567, "rewards/accuracy_reward": 0.9003444910049438, "rewards/format_reward": 1.0, "step": 191 }, { "completion_length": 65.71428680419922, "epoch": 48.0, "grad_norm": 1.4372133016586304, "kl": 0.103271484375, "learning_rate": 8.799999999999999e-07, "loss": 0.0038, "reward": 1.9987189769744873, "reward_std": 0.0020755312871187925, "rewards/accuracy_reward": 0.9987190961837769, "rewards/format_reward": 1.0, "step": 192 }, { "completion_length": 60.0859375, "epoch": 48.25, "grad_norm": 1.662278175354004, "kl": 0.087646484375, "learning_rate": 8.793749999999999e-07, "loss": 0.0035, "reward": 1.9600813388824463, "reward_std": 0.07481173612177372, "rewards/accuracy_reward": 0.9600813686847687, "rewards/format_reward": 1.0, "step": 193 }, { "completion_length": 60.2890625, "epoch": 48.5, "grad_norm": 1.3797621726989746, "kl": 0.10595703125, "learning_rate": 8.7875e-07, "loss": 0.0042, "reward": 1.9719964265823364, "reward_std": 0.02817021100781858, "rewards/accuracy_reward": 0.9719964861869812, "rewards/format_reward": 1.0, "step": 194 }, { "completion_length": 60.7578125, "epoch": 48.75, "grad_norm": 1.4646984338760376, "kl": 0.13037109375, "learning_rate": 8.78125e-07, "loss": 0.0052, "reward": 1.9132311940193176, "reward_std": 0.04202069714665413, "rewards/accuracy_reward": 0.9132311940193176, "rewards/format_reward": 1.0, "step": 195 }, { "completion_length": 64.21428871154785, "epoch": 49.0, "grad_norm": 1.513399600982666, "kl": 0.1103515625, "learning_rate": 8.774999999999999e-07, "loss": 0.0041, "reward": 1.9454046487808228, "reward_std": 0.055479995906353, "rewards/accuracy_reward": 0.9454046189785004, "rewards/format_reward": 1.0, "step": 196 }, { "completion_length": 61.5, "epoch": 49.25, "grad_norm": 1.0642904043197632, "kl": 0.094482421875, "learning_rate": 8.76875e-07, "loss": 0.0038, "reward": 1.9798645377159119, "reward_std": 0.031060860259458423, "rewards/accuracy_reward": 0.9798645973205566, "rewards/format_reward": 1.0, "step": 197 }, { "completion_length": 61.5, "epoch": 49.5, "grad_norm": 2.066206455230713, "kl": 0.1689453125, "learning_rate": 8.7625e-07, "loss": 0.0067, "reward": 1.917717456817627, "reward_std": 0.027819208800792694, "rewards/accuracy_reward": 0.9177174866199493, "rewards/format_reward": 1.0, "step": 198 }, { "completion_length": 59.9296875, "epoch": 49.75, "grad_norm": 1.2293668985366821, "kl": 0.114990234375, "learning_rate": 8.75625e-07, "loss": 0.0046, "reward": 1.9745354652404785, "reward_std": 0.048739076184574515, "rewards/accuracy_reward": 0.9823479354381561, "rewards/format_reward": 0.9921875, "step": 199 }, { "completion_length": 75.85714721679688, "epoch": 50.0, "grad_norm": 1.5204596519470215, "kl": 0.095703125, "learning_rate": 8.75e-07, "loss": 0.0041, "reward": 1.9712833166122437, "reward_std": 0.02327703475020826, "rewards/accuracy_reward": 0.9712833166122437, "rewards/format_reward": 1.0, "step": 200 } ], "logging_steps": 1.0, "max_steps": 1600, "num_input_tokens_seen": 0, "num_train_epochs": 400, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }