{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.30140908748398765,
  "eval_steps": 400,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.006028181749679753,
      "grad_norm": 0.38845974438826136,
      "learning_rate": 1.0040160642570282e-06,
      "loss": 1.6618,
      "step": 10
    },
    {
      "epoch": 0.012056363499359506,
      "grad_norm": 0.255603405930248,
      "learning_rate": 2.0080321285140564e-06,
      "loss": 1.6698,
      "step": 20
    },
    {
      "epoch": 0.01808454524903926,
      "grad_norm": 0.20116505722657768,
      "learning_rate": 3.0120481927710846e-06,
      "loss": 1.6264,
      "step": 30
    },
    {
      "epoch": 0.02411272699871901,
      "grad_norm": 0.1795881732266397,
      "learning_rate": 4.016064257028113e-06,
      "loss": 1.6025,
      "step": 40
    },
    {
      "epoch": 0.030140908748398764,
      "grad_norm": 0.14600421495766222,
      "learning_rate": 5.020080321285141e-06,
      "loss": 1.5584,
      "step": 50
    },
    {
      "epoch": 0.03616909049807852,
      "grad_norm": 0.1170942466306718,
      "learning_rate": 6.024096385542169e-06,
      "loss": 1.5604,
      "step": 60
    },
    {
      "epoch": 0.04219727224775827,
      "grad_norm": 0.2445974669666656,
      "learning_rate": 7.028112449799197e-06,
      "loss": 1.5057,
      "step": 70
    },
    {
      "epoch": 0.04822545399743802,
      "grad_norm": 0.11238025351136724,
      "learning_rate": 8.032128514056226e-06,
      "loss": 1.4724,
      "step": 80
    },
    {
      "epoch": 0.05425363574711778,
      "grad_norm": 0.13639577534070754,
      "learning_rate": 9.036144578313253e-06,
      "loss": 1.4962,
      "step": 90
    },
    {
      "epoch": 0.06028181749679753,
      "grad_norm": 0.17113178431310286,
      "learning_rate": 1.0040160642570281e-05,
      "loss": 1.4765,
      "step": 100
    },
    {
      "epoch": 0.06630999924647728,
      "grad_norm": 0.4795401637258333,
      "learning_rate": 1.104417670682731e-05,
      "loss": 1.4468,
      "step": 110
    },
    {
      "epoch": 0.07233818099615703,
      "grad_norm": 0.13385762516900662,
      "learning_rate": 1.2048192771084338e-05,
      "loss": 1.4134,
      "step": 120
    },
    {
      "epoch": 0.07836636274583679,
      "grad_norm": 0.12303707813666019,
      "learning_rate": 1.3052208835341367e-05,
      "loss": 1.4191,
      "step": 130
    },
    {
      "epoch": 0.08439454449551655,
      "grad_norm": 0.10822073133399364,
      "learning_rate": 1.4056224899598394e-05,
      "loss": 1.397,
      "step": 140
    },
    {
      "epoch": 0.09042272624519629,
      "grad_norm": 0.1109270134990499,
      "learning_rate": 1.5060240963855424e-05,
      "loss": 1.3818,
      "step": 150
    },
    {
      "epoch": 0.09645090799487605,
      "grad_norm": 0.14622173134033867,
      "learning_rate": 1.606425702811245e-05,
      "loss": 1.4002,
      "step": 160
    },
    {
      "epoch": 0.1024790897445558,
      "grad_norm": 0.10587626114414271,
      "learning_rate": 1.706827309236948e-05,
      "loss": 1.4123,
      "step": 170
    },
    {
      "epoch": 0.10850727149423556,
      "grad_norm": 0.10302196814593138,
      "learning_rate": 1.8072289156626505e-05,
      "loss": 1.4016,
      "step": 180
    },
    {
      "epoch": 0.1145354532439153,
      "grad_norm": 0.1359849724314843,
      "learning_rate": 1.9076305220883535e-05,
      "loss": 1.404,
      "step": 190
    },
    {
      "epoch": 0.12056363499359506,
      "grad_norm": 0.10587622358885339,
      "learning_rate": 2.0080321285140562e-05,
      "loss": 1.4019,
      "step": 200
    },
    {
      "epoch": 0.1265918167432748,
      "grad_norm": 0.15017595066321648,
      "learning_rate": 2.1084337349397593e-05,
      "loss": 1.393,
      "step": 210
    },
    {
      "epoch": 0.13261999849295456,
      "grad_norm": 0.19475575142022897,
      "learning_rate": 2.208835341365462e-05,
      "loss": 1.3876,
      "step": 220
    },
    {
      "epoch": 0.1386481802426343,
      "grad_norm": 0.12084095277263424,
      "learning_rate": 2.309236947791165e-05,
      "loss": 1.3916,
      "step": 230
    },
    {
      "epoch": 0.14467636199231407,
      "grad_norm": 0.11857482977859173,
      "learning_rate": 2.4096385542168677e-05,
      "loss": 1.4056,
      "step": 240
    },
    {
      "epoch": 0.15070454374199382,
      "grad_norm": 0.1403959719635503,
      "learning_rate": 2.5100401606425704e-05,
      "loss": 1.3935,
      "step": 250
    },
    {
      "epoch": 0.15673272549167358,
      "grad_norm": 0.10800155257965392,
      "learning_rate": 2.6104417670682734e-05,
      "loss": 1.3826,
      "step": 260
    },
    {
      "epoch": 0.16276090724135334,
      "grad_norm": 0.10598439909830581,
      "learning_rate": 2.7108433734939758e-05,
      "loss": 1.3999,
      "step": 270
    },
    {
      "epoch": 0.1687890889910331,
      "grad_norm": 0.10753449693494475,
      "learning_rate": 2.8112449799196788e-05,
      "loss": 1.4047,
      "step": 280
    },
    {
      "epoch": 0.17481727074071282,
      "grad_norm": 0.36718328659037996,
      "learning_rate": 2.911646586345382e-05,
      "loss": 1.3935,
      "step": 290
    },
    {
      "epoch": 0.18084545249039258,
      "grad_norm": 0.10611900000479042,
      "learning_rate": 3.012048192771085e-05,
      "loss": 1.3736,
      "step": 300
    },
    {
      "epoch": 0.18687363424007233,
      "grad_norm": 0.11901555220652378,
      "learning_rate": 3.112449799196787e-05,
      "loss": 1.3927,
      "step": 310
    },
    {
      "epoch": 0.1929018159897521,
      "grad_norm": 0.118935148513695,
      "learning_rate": 3.21285140562249e-05,
      "loss": 1.3636,
      "step": 320
    },
    {
      "epoch": 0.19892999773943185,
      "grad_norm": 0.1974545721831922,
      "learning_rate": 3.313253012048193e-05,
      "loss": 1.3892,
      "step": 330
    },
    {
      "epoch": 0.2049581794891116,
      "grad_norm": 0.13145409772199562,
      "learning_rate": 3.413654618473896e-05,
      "loss": 1.3756,
      "step": 340
    },
    {
      "epoch": 0.21098636123879136,
      "grad_norm": 0.11064380941915805,
      "learning_rate": 3.5140562248995983e-05,
      "loss": 1.3935,
      "step": 350
    },
    {
      "epoch": 0.21701454298847112,
      "grad_norm": 0.12160423827639648,
      "learning_rate": 3.614457831325301e-05,
      "loss": 1.3698,
      "step": 360
    },
    {
      "epoch": 0.22304272473815084,
      "grad_norm": 0.10349641889173723,
      "learning_rate": 3.7148594377510044e-05,
      "loss": 1.3771,
      "step": 370
    },
    {
      "epoch": 0.2290709064878306,
      "grad_norm": 0.10682144059511894,
      "learning_rate": 3.815261044176707e-05,
      "loss": 1.3768,
      "step": 380
    },
    {
      "epoch": 0.23509908823751036,
      "grad_norm": 0.11625245619819907,
      "learning_rate": 3.91566265060241e-05,
      "loss": 1.3795,
      "step": 390
    },
    {
      "epoch": 0.2411272699871901,
      "grad_norm": 0.10327726962763091,
      "learning_rate": 4.0160642570281125e-05,
      "loss": 1.3987,
      "step": 400
    },
    {
      "epoch": 0.2411272699871901,
      "eval_loss": 1.3548544645309448,
      "eval_runtime": 148.2269,
      "eval_samples_per_second": 7.239,
      "eval_steps_per_second": 0.911,
      "step": 400
    },
    {
      "epoch": 0.24715545173686987,
      "grad_norm": 0.10660530950921367,
      "learning_rate": 4.116465863453816e-05,
      "loss": 1.3886,
      "step": 410
    },
    {
      "epoch": 0.2531836334865496,
      "grad_norm": 0.10405582985373843,
      "learning_rate": 4.2168674698795186e-05,
      "loss": 1.3645,
      "step": 420
    },
    {
      "epoch": 0.2592118152362294,
      "grad_norm": 0.3318479326670041,
      "learning_rate": 4.317269076305221e-05,
      "loss": 1.3591,
      "step": 430
    },
    {
      "epoch": 0.2652399969859091,
      "grad_norm": 0.10840544026201794,
      "learning_rate": 4.417670682730924e-05,
      "loss": 1.3805,
      "step": 440
    },
    {
      "epoch": 0.2712681787355889,
      "grad_norm": 0.10730056620740543,
      "learning_rate": 4.5180722891566266e-05,
      "loss": 1.3888,
      "step": 450
    },
    {
      "epoch": 0.2772963604852686,
      "grad_norm": 0.10699620793474768,
      "learning_rate": 4.61847389558233e-05,
      "loss": 1.3935,
      "step": 460
    },
    {
      "epoch": 0.2833245422349484,
      "grad_norm": 0.10595493402596641,
      "learning_rate": 4.718875502008032e-05,
      "loss": 1.3659,
      "step": 470
    },
    {
      "epoch": 0.28935272398462814,
      "grad_norm": 0.14234040947748414,
      "learning_rate": 4.8192771084337354e-05,
      "loss": 1.371,
      "step": 480
    },
    {
      "epoch": 0.29538090573430786,
      "grad_norm": 0.1095349792774781,
      "learning_rate": 4.919678714859438e-05,
      "loss": 1.3647,
      "step": 490
    },
    {
      "epoch": 0.30140908748398765,
      "grad_norm": 0.10655792946130023,
      "learning_rate": 4.999997536857586e-05,
      "loss": 1.3606,
      "step": 500
    }
  ],
  "logging_steps": 10,
  "max_steps": 4974,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 840529663229952.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}