{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9523809523809526, "eval_steps": 500, "global_step": 45, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 668.669677734375, "epoch": 0.06349206349206349, "grad_norm": 14.599732398986816, "kl": 0.0, "learning_rate": 4.000000000000001e-06, "loss": -0.0007, "reward": 0.3214285895228386, "reward_std": 0.535038448870182, "rewards/graph_reward": 0.3214285895228386, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 680.9509124755859, "epoch": 0.12698412698412698, "grad_norm": 0.4279649555683136, "kl": 0.0, "learning_rate": 8.000000000000001e-06, "loss": 0.0317, "reward": 0.3214285857975483, "reward_std": 0.49289844930171967, "rewards/graph_reward": 0.3214285857975483, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 648.4464721679688, "epoch": 0.19047619047619047, "grad_norm": 1.7730225324630737, "kl": 0.00031948089599609375, "learning_rate": 1.2e-05, "loss": 0.0226, "reward": 0.2857142984867096, "reward_std": 0.47218646109104156, "rewards/graph_reward": 0.2857142984867096, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 600.6607360839844, "epoch": 0.25396825396825395, "grad_norm": 2.329758644104004, "kl": 0.028167724609375, "learning_rate": 1.6000000000000003e-05, "loss": 0.0518, "reward": 0.4017857313156128, "reward_std": 0.5280746445059776, "rewards/graph_reward": 0.4017857313156128, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 357.64286041259766, "epoch": 0.31746031746031744, "grad_norm": 2.8278896808624268, "kl": 0.07501220703125, "learning_rate": 2e-05, "loss": 0.0103, "reward": 0.79464291036129, "reward_std": 0.747464045882225, "rewards/graph_reward": 0.79464291036129, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 457.1071548461914, "epoch": 0.38095238095238093, "grad_norm": 13820146688.0, "kl": 117374976.0, "learning_rate": 1.9969173337331283e-05, "loss": 4097932.25, "reward": 0.98214291036129, "reward_std": 0.6609893068671227, "rewards/graph_reward": 0.98214291036129, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 541.0937728881836, "epoch": 0.4444444444444444, "grad_norm": 1.6704585552215576, "kl": 0.164306640625, "learning_rate": 1.9876883405951378e-05, "loss": 0.0721, "reward": 0.7946428805589676, "reward_std": 0.6530626565217972, "rewards/graph_reward": 0.7946428805589676, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 626.7411041259766, "epoch": 0.5079365079365079, "grad_norm": 1.676254153251648, "kl": 0.204833984375, "learning_rate": 1.9723699203976768e-05, "loss": 0.0474, "reward": 0.3482143059372902, "reward_std": 0.5222531035542488, "rewards/graph_reward": 0.3482143059372902, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 900.7589569091797, "epoch": 0.5714285714285714, "grad_norm": 0.6683644652366638, "kl": 0.181884765625, "learning_rate": 1.9510565162951538e-05, "loss": 0.0936, "reward": 0.6250000298023224, "reward_std": 0.5735516771674156, "rewards/graph_reward": 0.6250000298023224, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 959.1518402099609, "epoch": 0.6349206349206349, "grad_norm": 0.3719010055065155, "kl": 0.19287109375, "learning_rate": 1.9238795325112867e-05, "loss": 0.0338, "reward": 0.0803571455180645, "reward_std": 0.14164696633815765, "rewards/graph_reward": 0.0803571455180645, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 872.8616485595703, "epoch": 0.6984126984126984, "grad_norm": 1.9810298681259155, "kl": 0.220703125, "learning_rate": 1.891006524188368e-05, "loss": 0.0044, "reward": 0.08928571920841932, "reward_std": 0.18201843090355396, "rewards/graph_reward": 0.08928571920841932, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 635.1830596923828, "epoch": 0.7619047619047619, "grad_norm": 0.4636230766773224, "kl": 0.210693359375, "learning_rate": 1.8526401643540924e-05, "loss": 0.02, "reward": 0.2232142984867096, "reward_std": 0.3668579272925854, "rewards/graph_reward": 0.2232142984867096, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 573.5089569091797, "epoch": 0.8253968253968254, "grad_norm": 0.672173798084259, "kl": 0.2294921875, "learning_rate": 1.8090169943749477e-05, "loss": 0.0232, "reward": 0.6607143133878708, "reward_std": 0.7296628206968307, "rewards/graph_reward": 0.6607143133878708, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 518.3259201049805, "epoch": 0.8888888888888888, "grad_norm": 0.5373820662498474, "kl": 0.19970703125, "learning_rate": 1.7604059656000313e-05, "loss": 0.0148, "reward": 0.5000000223517418, "reward_std": 0.6410829946398735, "rewards/graph_reward": 0.5000000223517418, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 506.68751525878906, "epoch": 0.9523809523809523, "grad_norm": 0.5141621232032776, "kl": 0.1875, "learning_rate": 1.7071067811865477e-05, "loss": 0.0312, "reward": 0.4821428805589676, "reward_std": 0.5212902799248695, "rewards/graph_reward": 0.4821428805589676, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 449.55358123779297, "epoch": 1.0634920634920635, "grad_norm": 0.8779112100601196, "kl": 0.212158203125, "learning_rate": 1.6494480483301836e-05, "loss": 0.0131, "reward": 0.6071428805589676, "reward_std": 0.43190471827983856, "rewards/graph_reward": 0.6071428805589676, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 404.7098388671875, "epoch": 1.126984126984127, "grad_norm": 1.4357943534851074, "kl": 0.225830078125, "learning_rate": 1.5877852522924733e-05, "loss": 0.0361, "reward": 0.7767857760190964, "reward_std": 0.7070925980806351, "rewards/graph_reward": 0.7767857760190964, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 392.4151916503906, "epoch": 1.1904761904761905, "grad_norm": 0.8047733902931213, "kl": 0.263916015625, "learning_rate": 1.5224985647159489e-05, "loss": 0.0837, "reward": 0.517857164144516, "reward_std": 0.6126912012696266, "rewards/graph_reward": 0.517857164144516, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 663.2901916503906, "epoch": 1.253968253968254, "grad_norm": 0.7689018249511719, "kl": 0.32177734375, "learning_rate": 1.4539904997395468e-05, "loss": 0.1363, "reward": 0.4375000298023224, "reward_std": 0.5959424823522568, "rewards/graph_reward": 0.4375000298023224, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 1077.3661499023438, "epoch": 1.3174603174603174, "grad_norm": 0.4689948558807373, "kl": 0.3798828125, "learning_rate": 1.3826834323650899e-05, "loss": 0.1309, "reward": 0.2589285895228386, "reward_std": 0.3649100065231323, "rewards/graph_reward": 0.2589285895228386, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 750.763427734375, "epoch": 1.380952380952381, "grad_norm": 0.575568437576294, "kl": 0.42138671875, "learning_rate": 1.3090169943749475e-05, "loss": 0.1088, "reward": 0.27678572200238705, "reward_std": 0.44856370612978935, "rewards/graph_reward": 0.27678572200238705, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 558.2544860839844, "epoch": 1.4444444444444444, "grad_norm": 0.6248221397399902, "kl": 0.4111328125, "learning_rate": 1.2334453638559057e-05, "loss": 0.0542, "reward": 0.2589285718277097, "reward_std": 0.35302002541720867, "rewards/graph_reward": 0.2589285718277097, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 405.4062728881836, "epoch": 1.507936507936508, "grad_norm": 0.6308891177177429, "kl": 0.3486328125, "learning_rate": 1.156434465040231e-05, "loss": 0.0644, "reward": 0.383928582072258, "reward_std": 0.5448233261704445, "rewards/graph_reward": 0.383928582072258, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 339.0178680419922, "epoch": 1.5714285714285714, "grad_norm": 0.6035424470901489, "kl": 0.31884765625, "learning_rate": 1.0784590957278452e-05, "loss": 0.0192, "reward": 0.5446428880095482, "reward_std": 0.4957195296883583, "rewards/graph_reward": 0.5446428880095482, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 326.39733123779297, "epoch": 1.6349206349206349, "grad_norm": 0.5706361532211304, "kl": 0.28271484375, "learning_rate": 1e-05, "loss": 0.0109, "reward": 0.571428582072258, "reward_std": 0.3363610692322254, "rewards/graph_reward": 0.571428582072258, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 319.0133972167969, "epoch": 1.6984126984126984, "grad_norm": 0.6702840328216553, "kl": 0.27587890625, "learning_rate": 9.215409042721553e-06, "loss": 0.0264, "reward": 0.6607143133878708, "reward_std": 0.576372779905796, "rewards/graph_reward": 0.6607143133878708, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 338.82144927978516, "epoch": 1.7619047619047619, "grad_norm": 0.49153271317481995, "kl": 0.283203125, "learning_rate": 8.43565534959769e-06, "loss": -0.0115, "reward": 0.6250000298023224, "reward_std": 0.5419125556945801, "rewards/graph_reward": 0.6250000298023224, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 348.3169708251953, "epoch": 1.8253968253968254, "grad_norm": 0.5534822940826416, "kl": 0.2578125, "learning_rate": 7.66554636144095e-06, "loss": 0.012, "reward": 0.8750000447034836, "reward_std": 0.5143264532089233, "rewards/graph_reward": 0.8750000447034836, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 367.44197845458984, "epoch": 1.8888888888888888, "grad_norm": 0.8025973439216614, "kl": 0.254638671875, "learning_rate": 6.909830056250527e-06, "loss": 0.0294, "reward": 0.6071428805589676, "reward_std": 0.655883751809597, "rewards/graph_reward": 0.6071428805589676, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 396.97596740722656, "epoch": 1.9523809523809523, "grad_norm": 0.7672126293182373, "kl": 0.25244140625, "learning_rate": 6.173165676349103e-06, "loss": 0.0346, "reward": 0.8571428805589676, "reward_std": 0.5488763749599457, "rewards/graph_reward": 0.8571428805589676, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 424.0044860839844, "epoch": 2.0634920634920633, "grad_norm": 0.5899645090103149, "kl": 0.27001953125, "learning_rate": 5.460095002604533e-06, "loss": 0.0286, "reward": 0.8839286267757416, "reward_std": 0.5037358924746513, "rewards/graph_reward": 0.8839286267757416, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 455.21876525878906, "epoch": 2.126984126984127, "grad_norm": 0.8172342777252197, "kl": 0.25830078125, "learning_rate": 4.775014352840512e-06, "loss": 0.0636, "reward": 0.6607143171131611, "reward_std": 0.42899394035339355, "rewards/graph_reward": 0.6607143171131611, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 485.95538330078125, "epoch": 2.1904761904761907, "grad_norm": 0.8087209463119507, "kl": 0.283203125, "learning_rate": 4.12214747707527e-06, "loss": 0.049, "reward": 0.785714328289032, "reward_std": 0.5360910147428513, "rewards/graph_reward": 0.785714328289032, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 537.3839492797852, "epoch": 2.253968253968254, "grad_norm": 1.2612076997756958, "kl": 0.29833984375, "learning_rate": 3.505519516698165e-06, "loss": 0.1034, "reward": 0.7767857611179352, "reward_std": 0.5999058485031128, "rewards/graph_reward": 0.7767857611179352, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 531.1652069091797, "epoch": 2.317460317460317, "grad_norm": 2.282433032989502, "kl": 0.36474609375, "learning_rate": 2.9289321881345257e-06, "loss": 0.1471, "reward": 0.8750000298023224, "reward_std": 0.5596240684390068, "rewards/graph_reward": 0.8750000298023224, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 476.2053756713867, "epoch": 2.380952380952381, "grad_norm": 3.4955978393554688, "kl": 0.49169921875, "learning_rate": 2.395940343999691e-06, "loss": 0.0662, "reward": 0.6339286118745804, "reward_std": 0.481971338391304, "rewards/graph_reward": 0.6339286118745804, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 435.54019927978516, "epoch": 2.4444444444444446, "grad_norm": 4.864017963409424, "kl": 0.8310546875, "learning_rate": 1.9098300562505266e-06, "loss": 0.0783, "reward": 0.8660714775323868, "reward_std": 0.7266623377799988, "rewards/graph_reward": 0.8660714775323868, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 385.86608123779297, "epoch": 2.507936507936508, "grad_norm": 31.254209518432617, "kl": 1.837890625, "learning_rate": 1.4735983564590784e-06, "loss": 0.0904, "reward": 0.8928571790456772, "reward_std": 0.6362242549657822, "rewards/graph_reward": 0.8928571790456772, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 408.38841247558594, "epoch": 2.571428571428571, "grad_norm": 76.5787124633789, "kl": 1.4765625, "learning_rate": 1.0899347581163222e-06, "loss": 0.0649, "reward": 0.7053571790456772, "reward_std": 0.6126912087202072, "rewards/graph_reward": 0.7053571790456772, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 396.6071548461914, "epoch": 2.634920634920635, "grad_norm": 8.922845840454102, "kl": 0.59521484375, "learning_rate": 7.612046748871327e-07, "loss": 0.0264, "reward": 0.8303571790456772, "reward_std": 0.6264393925666809, "rewards/graph_reward": 0.8303571790456772, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 396.32591247558594, "epoch": 2.6984126984126986, "grad_norm": 19.087890625, "kl": 0.50244140625, "learning_rate": 4.894348370484648e-07, "loss": -0.0059, "reward": 0.9017857909202576, "reward_std": 0.6444200202822685, "rewards/graph_reward": 0.9017857909202576, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 407.43751525878906, "epoch": 2.761904761904762, "grad_norm": 11.51771354675293, "kl": 0.43115234375, "learning_rate": 2.7630079602323447e-07, "loss": 0.0172, "reward": 0.839285746216774, "reward_std": 0.6167442202568054, "rewards/graph_reward": 0.839285746216774, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 381.80358123779297, "epoch": 2.825396825396825, "grad_norm": 10.169096946716309, "kl": 0.38623046875, "learning_rate": 1.231165940486234e-07, "loss": 0.0034, "reward": 0.79464291036129, "reward_std": 0.6334031894803047, "rewards/graph_reward": 0.79464291036129, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 384.7143020629883, "epoch": 2.888888888888889, "grad_norm": 7.334926128387451, "kl": 0.373046875, "learning_rate": 3.082666266872036e-08, "loss": -0.0354, "reward": 1.0000000596046448, "reward_std": 0.6530626267194748, "rewards/graph_reward": 1.0000000596046448, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 395.1586608886719, "epoch": 2.9523809523809526, "grad_norm": 4.834416389465332, "kl": 0.4365234375, "learning_rate": 0.0, "loss": 0.0216, "reward": 0.6339286118745804, "reward_std": 0.4484739974141121, "rewards/graph_reward": 0.6339286118745804, "step": 45 }, { "epoch": 2.9523809523809526, "step": 45, "total_flos": 0.0, "train_loss": 91065.20385508855, "train_runtime": 5939.8823, "train_samples_per_second": 0.253, "train_steps_per_second": 0.008 } ], "logging_steps": 1, "max_steps": 45, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }