{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9523809523809526,
  "eval_steps": 500,
  "global_step": 45,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio": 0.0,
      "completion_length": 668.669677734375,
      "epoch": 0.06349206349206349,
      "grad_norm": 14.599732398986816,
      "kl": 0.0,
      "learning_rate": 4.000000000000001e-06,
      "loss": -0.0007,
      "reward": 0.3214285895228386,
      "reward_std": 0.535038448870182,
      "rewards/graph_reward": 0.3214285895228386,
      "step": 1
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 680.9509124755859,
      "epoch": 0.12698412698412698,
      "grad_norm": 0.4279649555683136,
      "kl": 0.0,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.0317,
      "reward": 0.3214285857975483,
      "reward_std": 0.49289844930171967,
      "rewards/graph_reward": 0.3214285857975483,
      "step": 2
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 648.4464721679688,
      "epoch": 0.19047619047619047,
      "grad_norm": 1.7730225324630737,
      "kl": 0.00031948089599609375,
      "learning_rate": 1.2e-05,
      "loss": 0.0226,
      "reward": 0.2857142984867096,
      "reward_std": 0.47218646109104156,
      "rewards/graph_reward": 0.2857142984867096,
      "step": 3
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 600.6607360839844,
      "epoch": 0.25396825396825395,
      "grad_norm": 2.329758644104004,
      "kl": 0.028167724609375,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.0518,
      "reward": 0.4017857313156128,
      "reward_std": 0.5280746445059776,
      "rewards/graph_reward": 0.4017857313156128,
      "step": 4
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 357.64286041259766,
      "epoch": 0.31746031746031744,
      "grad_norm": 2.8278896808624268,
      "kl": 0.07501220703125,
      "learning_rate": 2e-05,
      "loss": 0.0103,
      "reward": 0.79464291036129,
      "reward_std": 0.747464045882225,
      "rewards/graph_reward": 0.79464291036129,
      "step": 5
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 457.1071548461914,
      "epoch": 0.38095238095238093,
      "grad_norm": 13820146688.0,
      "kl": 117374976.0,
      "learning_rate": 1.9969173337331283e-05,
      "loss": 4097932.25,
      "reward": 0.98214291036129,
      "reward_std": 0.6609893068671227,
      "rewards/graph_reward": 0.98214291036129,
      "step": 6
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 541.0937728881836,
      "epoch": 0.4444444444444444,
      "grad_norm": 1.6704585552215576,
      "kl": 0.164306640625,
      "learning_rate": 1.9876883405951378e-05,
      "loss": 0.0721,
      "reward": 0.7946428805589676,
      "reward_std": 0.6530626565217972,
      "rewards/graph_reward": 0.7946428805589676,
      "step": 7
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 626.7411041259766,
      "epoch": 0.5079365079365079,
      "grad_norm": 1.676254153251648,
      "kl": 0.204833984375,
      "learning_rate": 1.9723699203976768e-05,
      "loss": 0.0474,
      "reward": 0.3482143059372902,
      "reward_std": 0.5222531035542488,
      "rewards/graph_reward": 0.3482143059372902,
      "step": 8
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 900.7589569091797,
      "epoch": 0.5714285714285714,
      "grad_norm": 0.6683644652366638,
      "kl": 0.181884765625,
      "learning_rate": 1.9510565162951538e-05,
      "loss": 0.0936,
      "reward": 0.6250000298023224,
      "reward_std": 0.5735516771674156,
      "rewards/graph_reward": 0.6250000298023224,
      "step": 9
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 959.1518402099609,
      "epoch": 0.6349206349206349,
      "grad_norm": 0.3719010055065155,
      "kl": 0.19287109375,
      "learning_rate": 1.9238795325112867e-05,
      "loss": 0.0338,
      "reward": 0.0803571455180645,
      "reward_std": 0.14164696633815765,
      "rewards/graph_reward": 0.0803571455180645,
      "step": 10
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 872.8616485595703,
      "epoch": 0.6984126984126984,
      "grad_norm": 1.9810298681259155,
      "kl": 0.220703125,
      "learning_rate": 1.891006524188368e-05,
      "loss": 0.0044,
      "reward": 0.08928571920841932,
      "reward_std": 0.18201843090355396,
      "rewards/graph_reward": 0.08928571920841932,
      "step": 11
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 635.1830596923828,
      "epoch": 0.7619047619047619,
      "grad_norm": 0.4636230766773224,
      "kl": 0.210693359375,
      "learning_rate": 1.8526401643540924e-05,
      "loss": 0.02,
      "reward": 0.2232142984867096,
      "reward_std": 0.3668579272925854,
      "rewards/graph_reward": 0.2232142984867096,
      "step": 12
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 573.5089569091797,
      "epoch": 0.8253968253968254,
      "grad_norm": 0.672173798084259,
      "kl": 0.2294921875,
      "learning_rate": 1.8090169943749477e-05,
      "loss": 0.0232,
      "reward": 0.6607143133878708,
      "reward_std": 0.7296628206968307,
      "rewards/graph_reward": 0.6607143133878708,
      "step": 13
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 518.3259201049805,
      "epoch": 0.8888888888888888,
      "grad_norm": 0.5373820662498474,
      "kl": 0.19970703125,
      "learning_rate": 1.7604059656000313e-05,
      "loss": 0.0148,
      "reward": 0.5000000223517418,
      "reward_std": 0.6410829946398735,
      "rewards/graph_reward": 0.5000000223517418,
      "step": 14
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 506.68751525878906,
      "epoch": 0.9523809523809523,
      "grad_norm": 0.5141621232032776,
      "kl": 0.1875,
      "learning_rate": 1.7071067811865477e-05,
      "loss": 0.0312,
      "reward": 0.4821428805589676,
      "reward_std": 0.5212902799248695,
      "rewards/graph_reward": 0.4821428805589676,
      "step": 15
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 449.55358123779297,
      "epoch": 1.0634920634920635,
      "grad_norm": 0.8779112100601196,
      "kl": 0.212158203125,
      "learning_rate": 1.6494480483301836e-05,
      "loss": 0.0131,
      "reward": 0.6071428805589676,
      "reward_std": 0.43190471827983856,
      "rewards/graph_reward": 0.6071428805589676,
      "step": 16
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 404.7098388671875,
      "epoch": 1.126984126984127,
      "grad_norm": 1.4357943534851074,
      "kl": 0.225830078125,
      "learning_rate": 1.5877852522924733e-05,
      "loss": 0.0361,
      "reward": 0.7767857760190964,
      "reward_std": 0.7070925980806351,
      "rewards/graph_reward": 0.7767857760190964,
      "step": 17
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 392.4151916503906,
      "epoch": 1.1904761904761905,
      "grad_norm": 0.8047733902931213,
      "kl": 0.263916015625,
      "learning_rate": 1.5224985647159489e-05,
      "loss": 0.0837,
      "reward": 0.517857164144516,
      "reward_std": 0.6126912012696266,
      "rewards/graph_reward": 0.517857164144516,
      "step": 18
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 663.2901916503906,
      "epoch": 1.253968253968254,
      "grad_norm": 0.7689018249511719,
      "kl": 0.32177734375,
      "learning_rate": 1.4539904997395468e-05,
      "loss": 0.1363,
      "reward": 0.4375000298023224,
      "reward_std": 0.5959424823522568,
      "rewards/graph_reward": 0.4375000298023224,
      "step": 19
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 1077.3661499023438,
      "epoch": 1.3174603174603174,
      "grad_norm": 0.4689948558807373,
      "kl": 0.3798828125,
      "learning_rate": 1.3826834323650899e-05,
      "loss": 0.1309,
      "reward": 0.2589285895228386,
      "reward_std": 0.3649100065231323,
      "rewards/graph_reward": 0.2589285895228386,
      "step": 20
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 750.763427734375,
      "epoch": 1.380952380952381,
      "grad_norm": 0.575568437576294,
      "kl": 0.42138671875,
      "learning_rate": 1.3090169943749475e-05,
      "loss": 0.1088,
      "reward": 0.27678572200238705,
      "reward_std": 0.44856370612978935,
      "rewards/graph_reward": 0.27678572200238705,
      "step": 21
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 558.2544860839844,
      "epoch": 1.4444444444444444,
      "grad_norm": 0.6248221397399902,
      "kl": 0.4111328125,
      "learning_rate": 1.2334453638559057e-05,
      "loss": 0.0542,
      "reward": 0.2589285718277097,
      "reward_std": 0.35302002541720867,
      "rewards/graph_reward": 0.2589285718277097,
      "step": 22
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 405.4062728881836,
      "epoch": 1.507936507936508,
      "grad_norm": 0.6308891177177429,
      "kl": 0.3486328125,
      "learning_rate": 1.156434465040231e-05,
      "loss": 0.0644,
      "reward": 0.383928582072258,
      "reward_std": 0.5448233261704445,
      "rewards/graph_reward": 0.383928582072258,
      "step": 23
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 339.0178680419922,
      "epoch": 1.5714285714285714,
      "grad_norm": 0.6035424470901489,
      "kl": 0.31884765625,
      "learning_rate": 1.0784590957278452e-05,
      "loss": 0.0192,
      "reward": 0.5446428880095482,
      "reward_std": 0.4957195296883583,
      "rewards/graph_reward": 0.5446428880095482,
      "step": 24
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 326.39733123779297,
      "epoch": 1.6349206349206349,
      "grad_norm": 0.5706361532211304,
      "kl": 0.28271484375,
      "learning_rate": 1e-05,
      "loss": 0.0109,
      "reward": 0.571428582072258,
      "reward_std": 0.3363610692322254,
      "rewards/graph_reward": 0.571428582072258,
      "step": 25
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 319.0133972167969,
      "epoch": 1.6984126984126984,
      "grad_norm": 0.6702840328216553,
      "kl": 0.27587890625,
      "learning_rate": 9.215409042721553e-06,
      "loss": 0.0264,
      "reward": 0.6607143133878708,
      "reward_std": 0.576372779905796,
      "rewards/graph_reward": 0.6607143133878708,
      "step": 26
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 338.82144927978516,
      "epoch": 1.7619047619047619,
      "grad_norm": 0.49153271317481995,
      "kl": 0.283203125,
      "learning_rate": 8.43565534959769e-06,
      "loss": -0.0115,
      "reward": 0.6250000298023224,
      "reward_std": 0.5419125556945801,
      "rewards/graph_reward": 0.6250000298023224,
      "step": 27
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 348.3169708251953,
      "epoch": 1.8253968253968254,
      "grad_norm": 0.5534822940826416,
      "kl": 0.2578125,
      "learning_rate": 7.66554636144095e-06,
      "loss": 0.012,
      "reward": 0.8750000447034836,
      "reward_std": 0.5143264532089233,
      "rewards/graph_reward": 0.8750000447034836,
      "step": 28
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 367.44197845458984,
      "epoch": 1.8888888888888888,
      "grad_norm": 0.8025973439216614,
      "kl": 0.254638671875,
      "learning_rate": 6.909830056250527e-06,
      "loss": 0.0294,
      "reward": 0.6071428805589676,
      "reward_std": 0.655883751809597,
      "rewards/graph_reward": 0.6071428805589676,
      "step": 29
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 396.97596740722656,
      "epoch": 1.9523809523809523,
      "grad_norm": 0.7672126293182373,
      "kl": 0.25244140625,
      "learning_rate": 6.173165676349103e-06,
      "loss": 0.0346,
      "reward": 0.8571428805589676,
      "reward_std": 0.5488763749599457,
      "rewards/graph_reward": 0.8571428805589676,
      "step": 30
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 424.0044860839844,
      "epoch": 2.0634920634920633,
      "grad_norm": 0.5899645090103149,
      "kl": 0.27001953125,
      "learning_rate": 5.460095002604533e-06,
      "loss": 0.0286,
      "reward": 0.8839286267757416,
      "reward_std": 0.5037358924746513,
      "rewards/graph_reward": 0.8839286267757416,
      "step": 31
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 455.21876525878906,
      "epoch": 2.126984126984127,
      "grad_norm": 0.8172342777252197,
      "kl": 0.25830078125,
      "learning_rate": 4.775014352840512e-06,
      "loss": 0.0636,
      "reward": 0.6607143171131611,
      "reward_std": 0.42899394035339355,
      "rewards/graph_reward": 0.6607143171131611,
      "step": 32
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 485.95538330078125,
      "epoch": 2.1904761904761907,
      "grad_norm": 0.8087209463119507,
      "kl": 0.283203125,
      "learning_rate": 4.12214747707527e-06,
      "loss": 0.049,
      "reward": 0.785714328289032,
      "reward_std": 0.5360910147428513,
      "rewards/graph_reward": 0.785714328289032,
      "step": 33
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 537.3839492797852,
      "epoch": 2.253968253968254,
      "grad_norm": 1.2612076997756958,
      "kl": 0.29833984375,
      "learning_rate": 3.505519516698165e-06,
      "loss": 0.1034,
      "reward": 0.7767857611179352,
      "reward_std": 0.5999058485031128,
      "rewards/graph_reward": 0.7767857611179352,
      "step": 34
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 531.1652069091797,
      "epoch": 2.317460317460317,
      "grad_norm": 2.282433032989502,
      "kl": 0.36474609375,
      "learning_rate": 2.9289321881345257e-06,
      "loss": 0.1471,
      "reward": 0.8750000298023224,
      "reward_std": 0.5596240684390068,
      "rewards/graph_reward": 0.8750000298023224,
      "step": 35
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 476.2053756713867,
      "epoch": 2.380952380952381,
      "grad_norm": 3.4955978393554688,
      "kl": 0.49169921875,
      "learning_rate": 2.395940343999691e-06,
      "loss": 0.0662,
      "reward": 0.6339286118745804,
      "reward_std": 0.481971338391304,
      "rewards/graph_reward": 0.6339286118745804,
      "step": 36
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 435.54019927978516,
      "epoch": 2.4444444444444446,
      "grad_norm": 4.864017963409424,
      "kl": 0.8310546875,
      "learning_rate": 1.9098300562505266e-06,
      "loss": 0.0783,
      "reward": 0.8660714775323868,
      "reward_std": 0.7266623377799988,
      "rewards/graph_reward": 0.8660714775323868,
      "step": 37
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 385.86608123779297,
      "epoch": 2.507936507936508,
      "grad_norm": 31.254209518432617,
      "kl": 1.837890625,
      "learning_rate": 1.4735983564590784e-06,
      "loss": 0.0904,
      "reward": 0.8928571790456772,
      "reward_std": 0.6362242549657822,
      "rewards/graph_reward": 0.8928571790456772,
      "step": 38
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 408.38841247558594,
      "epoch": 2.571428571428571,
      "grad_norm": 76.5787124633789,
      "kl": 1.4765625,
      "learning_rate": 1.0899347581163222e-06,
      "loss": 0.0649,
      "reward": 0.7053571790456772,
      "reward_std": 0.6126912087202072,
      "rewards/graph_reward": 0.7053571790456772,
      "step": 39
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 396.6071548461914,
      "epoch": 2.634920634920635,
      "grad_norm": 8.922845840454102,
      "kl": 0.59521484375,
      "learning_rate": 7.612046748871327e-07,
      "loss": 0.0264,
      "reward": 0.8303571790456772,
      "reward_std": 0.6264393925666809,
      "rewards/graph_reward": 0.8303571790456772,
      "step": 40
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 396.32591247558594,
      "epoch": 2.6984126984126986,
      "grad_norm": 19.087890625,
      "kl": 0.50244140625,
      "learning_rate": 4.894348370484648e-07,
      "loss": -0.0059,
      "reward": 0.9017857909202576,
      "reward_std": 0.6444200202822685,
      "rewards/graph_reward": 0.9017857909202576,
      "step": 41
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 407.43751525878906,
      "epoch": 2.761904761904762,
      "grad_norm": 11.51771354675293,
      "kl": 0.43115234375,
      "learning_rate": 2.7630079602323447e-07,
      "loss": 0.0172,
      "reward": 0.839285746216774,
      "reward_std": 0.6167442202568054,
      "rewards/graph_reward": 0.839285746216774,
      "step": 42
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 381.80358123779297,
      "epoch": 2.825396825396825,
      "grad_norm": 10.169096946716309,
      "kl": 0.38623046875,
      "learning_rate": 1.231165940486234e-07,
      "loss": 0.0034,
      "reward": 0.79464291036129,
      "reward_std": 0.6334031894803047,
      "rewards/graph_reward": 0.79464291036129,
      "step": 43
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 384.7143020629883,
      "epoch": 2.888888888888889,
      "grad_norm": 7.334926128387451,
      "kl": 0.373046875,
      "learning_rate": 3.082666266872036e-08,
      "loss": -0.0354,
      "reward": 1.0000000596046448,
      "reward_std": 0.6530626267194748,
      "rewards/graph_reward": 1.0000000596046448,
      "step": 44
    },
    {
      "clip_ratio": 0.0,
      "completion_length": 395.1586608886719,
      "epoch": 2.9523809523809526,
      "grad_norm": 4.834416389465332,
      "kl": 0.4365234375,
      "learning_rate": 0.0,
      "loss": 0.0216,
      "reward": 0.6339286118745804,
      "reward_std": 0.4484739974141121,
      "rewards/graph_reward": 0.6339286118745804,
      "step": 45
    },
    {
      "epoch": 2.9523809523809526,
      "step": 45,
      "total_flos": 0.0,
      "train_loss": 91065.20385508855,
      "train_runtime": 5939.8823,
      "train_samples_per_second": 0.253,
      "train_steps_per_second": 0.008
    }
  ],
  "logging_steps": 1,
  "max_steps": 45,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}