{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 0,
"global_step": 415,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0024096385542168677,
"grad_norm": 0.447265625,
"learning_rate": 9.975903614457833e-06,
"loss": 1.8184,
"step": 1
},
{
"epoch": 0.004819277108433735,
"grad_norm": 0.435546875,
"learning_rate": 9.951807228915663e-06,
"loss": 1.8618,
"step": 2
},
{
"epoch": 0.007228915662650603,
"grad_norm": 0.4609375,
"learning_rate": 9.927710843373494e-06,
"loss": 1.9108,
"step": 3
},
{
"epoch": 0.00963855421686747,
"grad_norm": 0.416015625,
"learning_rate": 9.903614457831326e-06,
"loss": 1.7899,
"step": 4
},
{
"epoch": 0.012048192771084338,
"grad_norm": 0.416015625,
"learning_rate": 9.879518072289156e-06,
"loss": 1.8254,
"step": 5
},
{
"epoch": 0.014457831325301205,
"grad_norm": 0.373046875,
"learning_rate": 9.855421686746988e-06,
"loss": 1.6838,
"step": 6
},
{
"epoch": 0.016867469879518072,
"grad_norm": 0.388671875,
"learning_rate": 9.83132530120482e-06,
"loss": 1.768,
"step": 7
},
{
"epoch": 0.01927710843373494,
"grad_norm": 0.34375,
"learning_rate": 9.807228915662652e-06,
"loss": 1.7494,
"step": 8
},
{
"epoch": 0.021686746987951807,
"grad_norm": 0.361328125,
"learning_rate": 9.783132530120483e-06,
"loss": 1.8167,
"step": 9
},
{
"epoch": 0.024096385542168676,
"grad_norm": 0.318359375,
"learning_rate": 9.759036144578315e-06,
"loss": 1.7875,
"step": 10
},
{
"epoch": 0.02650602409638554,
"grad_norm": 0.306640625,
"learning_rate": 9.734939759036145e-06,
"loss": 1.7663,
"step": 11
},
{
"epoch": 0.02891566265060241,
"grad_norm": 0.30859375,
"learning_rate": 9.710843373493977e-06,
"loss": 1.7343,
"step": 12
},
{
"epoch": 0.03132530120481928,
"grad_norm": 0.265625,
"learning_rate": 9.686746987951809e-06,
"loss": 1.6783,
"step": 13
},
{
"epoch": 0.033734939759036145,
"grad_norm": 0.302734375,
"learning_rate": 9.662650602409639e-06,
"loss": 1.7204,
"step": 14
},
{
"epoch": 0.03614457831325301,
"grad_norm": 0.283203125,
"learning_rate": 9.63855421686747e-06,
"loss": 1.7375,
"step": 15
},
{
"epoch": 0.03855421686746988,
"grad_norm": 0.28125,
"learning_rate": 9.614457831325302e-06,
"loss": 1.6582,
"step": 16
},
{
"epoch": 0.04096385542168675,
"grad_norm": 0.279296875,
"learning_rate": 9.590361445783132e-06,
"loss": 1.6181,
"step": 17
},
{
"epoch": 0.043373493975903614,
"grad_norm": 0.275390625,
"learning_rate": 9.566265060240964e-06,
"loss": 1.606,
"step": 18
},
{
"epoch": 0.04578313253012048,
"grad_norm": 0.271484375,
"learning_rate": 9.542168674698796e-06,
"loss": 1.629,
"step": 19
},
{
"epoch": 0.04819277108433735,
"grad_norm": 0.28515625,
"learning_rate": 9.518072289156628e-06,
"loss": 1.6843,
"step": 20
},
{
"epoch": 0.05060240963855422,
"grad_norm": 0.259765625,
"learning_rate": 9.49397590361446e-06,
"loss": 1.6272,
"step": 21
},
{
"epoch": 0.05301204819277108,
"grad_norm": 0.234375,
"learning_rate": 9.46987951807229e-06,
"loss": 1.6065,
"step": 22
},
{
"epoch": 0.05542168674698795,
"grad_norm": 0.2353515625,
"learning_rate": 9.445783132530121e-06,
"loss": 1.6183,
"step": 23
},
{
"epoch": 0.05783132530120482,
"grad_norm": 0.234375,
"learning_rate": 9.421686746987953e-06,
"loss": 1.6144,
"step": 24
},
{
"epoch": 0.060240963855421686,
"grad_norm": 0.2099609375,
"learning_rate": 9.397590361445785e-06,
"loss": 1.5703,
"step": 25
},
{
"epoch": 0.06265060240963856,
"grad_norm": 0.2119140625,
"learning_rate": 9.373493975903615e-06,
"loss": 1.5837,
"step": 26
},
{
"epoch": 0.06506024096385542,
"grad_norm": 0.2177734375,
"learning_rate": 9.349397590361446e-06,
"loss": 1.5706,
"step": 27
},
{
"epoch": 0.06746987951807229,
"grad_norm": 0.2216796875,
"learning_rate": 9.325301204819278e-06,
"loss": 1.5427,
"step": 28
},
{
"epoch": 0.06987951807228916,
"grad_norm": 0.2177734375,
"learning_rate": 9.301204819277108e-06,
"loss": 1.597,
"step": 29
},
{
"epoch": 0.07228915662650602,
"grad_norm": 0.224609375,
"learning_rate": 9.27710843373494e-06,
"loss": 1.5046,
"step": 30
},
{
"epoch": 0.0746987951807229,
"grad_norm": 0.1904296875,
"learning_rate": 9.253012048192772e-06,
"loss": 1.5434,
"step": 31
},
{
"epoch": 0.07710843373493977,
"grad_norm": 0.1982421875,
"learning_rate": 9.228915662650602e-06,
"loss": 1.5595,
"step": 32
},
{
"epoch": 0.07951807228915662,
"grad_norm": 0.20703125,
"learning_rate": 9.204819277108434e-06,
"loss": 1.4755,
"step": 33
},
{
"epoch": 0.0819277108433735,
"grad_norm": 0.2119140625,
"learning_rate": 9.180722891566265e-06,
"loss": 1.5265,
"step": 34
},
{
"epoch": 0.08433734939759036,
"grad_norm": 0.53125,
"learning_rate": 9.156626506024097e-06,
"loss": 1.4612,
"step": 35
},
{
"epoch": 0.08674698795180723,
"grad_norm": 0.19140625,
"learning_rate": 9.132530120481929e-06,
"loss": 1.5302,
"step": 36
},
{
"epoch": 0.0891566265060241,
"grad_norm": 0.1845703125,
"learning_rate": 9.10843373493976e-06,
"loss": 1.4741,
"step": 37
},
{
"epoch": 0.09156626506024096,
"grad_norm": 0.1904296875,
"learning_rate": 9.08433734939759e-06,
"loss": 1.4975,
"step": 38
},
{
"epoch": 0.09397590361445783,
"grad_norm": 0.1826171875,
"learning_rate": 9.060240963855423e-06,
"loss": 1.4627,
"step": 39
},
{
"epoch": 0.0963855421686747,
"grad_norm": 0.17578125,
"learning_rate": 9.036144578313254e-06,
"loss": 1.395,
"step": 40
},
{
"epoch": 0.09879518072289156,
"grad_norm": 0.1923828125,
"learning_rate": 9.012048192771084e-06,
"loss": 1.4478,
"step": 41
},
{
"epoch": 0.10120481927710843,
"grad_norm": 0.173828125,
"learning_rate": 8.987951807228916e-06,
"loss": 1.4981,
"step": 42
},
{
"epoch": 0.10361445783132531,
"grad_norm": 0.1875,
"learning_rate": 8.963855421686748e-06,
"loss": 1.4501,
"step": 43
},
{
"epoch": 0.10602409638554217,
"grad_norm": 0.1708984375,
"learning_rate": 8.939759036144578e-06,
"loss": 1.3822,
"step": 44
},
{
"epoch": 0.10843373493975904,
"grad_norm": 0.1884765625,
"learning_rate": 8.91566265060241e-06,
"loss": 1.4565,
"step": 45
},
{
"epoch": 0.1108433734939759,
"grad_norm": 0.162109375,
"learning_rate": 8.891566265060241e-06,
"loss": 1.4586,
"step": 46
},
{
"epoch": 0.11325301204819277,
"grad_norm": 0.1591796875,
"learning_rate": 8.867469879518073e-06,
"loss": 1.4511,
"step": 47
},
{
"epoch": 0.11566265060240964,
"grad_norm": 0.1669921875,
"learning_rate": 8.843373493975905e-06,
"loss": 1.4934,
"step": 48
},
{
"epoch": 0.1180722891566265,
"grad_norm": 0.16796875,
"learning_rate": 8.819277108433735e-06,
"loss": 1.3896,
"step": 49
},
{
"epoch": 0.12048192771084337,
"grad_norm": 0.166015625,
"learning_rate": 8.795180722891567e-06,
"loss": 1.3766,
"step": 50
},
{
"epoch": 0.12289156626506025,
"grad_norm": 0.1572265625,
"learning_rate": 8.771084337349399e-06,
"loss": 1.4585,
"step": 51
},
{
"epoch": 0.12530120481927712,
"grad_norm": 0.224609375,
"learning_rate": 8.74698795180723e-06,
"loss": 1.366,
"step": 52
},
{
"epoch": 0.12771084337349398,
"grad_norm": 0.158203125,
"learning_rate": 8.722891566265062e-06,
"loss": 1.3657,
"step": 53
},
{
"epoch": 0.13012048192771083,
"grad_norm": 0.158203125,
"learning_rate": 8.698795180722892e-06,
"loss": 1.4105,
"step": 54
},
{
"epoch": 0.13253012048192772,
"grad_norm": 0.17578125,
"learning_rate": 8.674698795180724e-06,
"loss": 1.4599,
"step": 55
},
{
"epoch": 0.13493975903614458,
"grad_norm": 0.158203125,
"learning_rate": 8.650602409638556e-06,
"loss": 1.4063,
"step": 56
},
{
"epoch": 0.13734939759036144,
"grad_norm": 0.1572265625,
"learning_rate": 8.626506024096386e-06,
"loss": 1.4064,
"step": 57
},
{
"epoch": 0.13975903614457832,
"grad_norm": 0.158203125,
"learning_rate": 8.602409638554217e-06,
"loss": 1.4478,
"step": 58
},
{
"epoch": 0.14216867469879518,
"grad_norm": 0.2314453125,
"learning_rate": 8.57831325301205e-06,
"loss": 1.408,
"step": 59
},
{
"epoch": 0.14457831325301204,
"grad_norm": 0.1884765625,
"learning_rate": 8.55421686746988e-06,
"loss": 1.3526,
"step": 60
},
{
"epoch": 0.14698795180722893,
"grad_norm": 0.150390625,
"learning_rate": 8.530120481927711e-06,
"loss": 1.3377,
"step": 61
},
{
"epoch": 0.1493975903614458,
"grad_norm": 0.240234375,
"learning_rate": 8.506024096385543e-06,
"loss": 1.3711,
"step": 62
},
{
"epoch": 0.15180722891566265,
"grad_norm": 0.1533203125,
"learning_rate": 8.481927710843375e-06,
"loss": 1.4012,
"step": 63
},
{
"epoch": 0.15421686746987953,
"grad_norm": 0.64453125,
"learning_rate": 8.457831325301206e-06,
"loss": 1.3922,
"step": 64
},
{
"epoch": 0.1566265060240964,
"grad_norm": 0.16796875,
"learning_rate": 8.433734939759038e-06,
"loss": 1.3594,
"step": 65
},
{
"epoch": 0.15903614457831325,
"grad_norm": 0.1513671875,
"learning_rate": 8.409638554216868e-06,
"loss": 1.3225,
"step": 66
},
{
"epoch": 0.1614457831325301,
"grad_norm": 0.158203125,
"learning_rate": 8.3855421686747e-06,
"loss": 1.3608,
"step": 67
},
{
"epoch": 0.163855421686747,
"grad_norm": 0.1591796875,
"learning_rate": 8.361445783132532e-06,
"loss": 1.3723,
"step": 68
},
{
"epoch": 0.16626506024096385,
"grad_norm": 0.14453125,
"learning_rate": 8.337349397590362e-06,
"loss": 1.349,
"step": 69
},
{
"epoch": 0.1686746987951807,
"grad_norm": 0.1572265625,
"learning_rate": 8.313253012048194e-06,
"loss": 1.367,
"step": 70
},
{
"epoch": 0.1710843373493976,
"grad_norm": 0.171875,
"learning_rate": 8.289156626506025e-06,
"loss": 1.3529,
"step": 71
},
{
"epoch": 0.17349397590361446,
"grad_norm": 0.1689453125,
"learning_rate": 8.265060240963855e-06,
"loss": 1.3107,
"step": 72
},
{
"epoch": 0.17590361445783131,
"grad_norm": 0.146484375,
"learning_rate": 8.240963855421687e-06,
"loss": 1.3254,
"step": 73
},
{
"epoch": 0.1783132530120482,
"grad_norm": 0.15234375,
"learning_rate": 8.216867469879519e-06,
"loss": 1.3508,
"step": 74
},
{
"epoch": 0.18072289156626506,
"grad_norm": 0.15625,
"learning_rate": 8.19277108433735e-06,
"loss": 1.3236,
"step": 75
},
{
"epoch": 0.18313253012048192,
"grad_norm": 0.2177734375,
"learning_rate": 8.16867469879518e-06,
"loss": 1.3544,
"step": 76
},
{
"epoch": 0.1855421686746988,
"grad_norm": 0.1875,
"learning_rate": 8.144578313253012e-06,
"loss": 1.3338,
"step": 77
},
{
"epoch": 0.18795180722891566,
"grad_norm": 0.1474609375,
"learning_rate": 8.120481927710844e-06,
"loss": 1.2773,
"step": 78
},
{
"epoch": 0.19036144578313252,
"grad_norm": 0.142578125,
"learning_rate": 8.096385542168676e-06,
"loss": 1.3254,
"step": 79
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.171875,
"learning_rate": 8.072289156626508e-06,
"loss": 1.3502,
"step": 80
},
{
"epoch": 0.19518072289156627,
"grad_norm": 0.1435546875,
"learning_rate": 8.048192771084338e-06,
"loss": 1.3205,
"step": 81
},
{
"epoch": 0.19759036144578312,
"grad_norm": 0.1640625,
"learning_rate": 8.02409638554217e-06,
"loss": 1.3201,
"step": 82
},
{
"epoch": 0.2,
"grad_norm": 0.158203125,
"learning_rate": 8.000000000000001e-06,
"loss": 1.3507,
"step": 83
},
{
"epoch": 0.20240963855421687,
"grad_norm": 0.1552734375,
"learning_rate": 7.975903614457831e-06,
"loss": 1.371,
"step": 84
},
{
"epoch": 0.20481927710843373,
"grad_norm": 0.1474609375,
"learning_rate": 7.951807228915663e-06,
"loss": 1.3613,
"step": 85
},
{
"epoch": 0.20722891566265061,
"grad_norm": 0.1630859375,
"learning_rate": 7.927710843373495e-06,
"loss": 1.3667,
"step": 86
},
{
"epoch": 0.20963855421686747,
"grad_norm": 0.296875,
"learning_rate": 7.903614457831325e-06,
"loss": 1.3651,
"step": 87
},
{
"epoch": 0.21204819277108433,
"grad_norm": 0.185546875,
"learning_rate": 7.879518072289157e-06,
"loss": 1.3087,
"step": 88
},
{
"epoch": 0.21445783132530122,
"grad_norm": 0.404296875,
"learning_rate": 7.855421686746989e-06,
"loss": 1.2481,
"step": 89
},
{
"epoch": 0.21686746987951808,
"grad_norm": 0.37890625,
"learning_rate": 7.83132530120482e-06,
"loss": 1.3014,
"step": 90
},
{
"epoch": 0.21927710843373494,
"grad_norm": 0.1484375,
"learning_rate": 7.807228915662652e-06,
"loss": 1.3491,
"step": 91
},
{
"epoch": 0.2216867469879518,
"grad_norm": 0.1396484375,
"learning_rate": 7.783132530120484e-06,
"loss": 1.2814,
"step": 92
},
{
"epoch": 0.22409638554216868,
"grad_norm": 0.1650390625,
"learning_rate": 7.759036144578314e-06,
"loss": 1.2934,
"step": 93
},
{
"epoch": 0.22650602409638554,
"grad_norm": 0.1533203125,
"learning_rate": 7.734939759036146e-06,
"loss": 1.2927,
"step": 94
},
{
"epoch": 0.2289156626506024,
"grad_norm": 0.171875,
"learning_rate": 7.710843373493977e-06,
"loss": 1.3319,
"step": 95
},
{
"epoch": 0.23132530120481928,
"grad_norm": 0.35546875,
"learning_rate": 7.686746987951807e-06,
"loss": 1.2998,
"step": 96
},
{
"epoch": 0.23373493975903614,
"grad_norm": 0.1611328125,
"learning_rate": 7.66265060240964e-06,
"loss": 1.2885,
"step": 97
},
{
"epoch": 0.236144578313253,
"grad_norm": 0.15234375,
"learning_rate": 7.638554216867471e-06,
"loss": 1.3596,
"step": 98
},
{
"epoch": 0.2385542168674699,
"grad_norm": 0.1484375,
"learning_rate": 7.614457831325302e-06,
"loss": 1.3371,
"step": 99
},
{
"epoch": 0.24096385542168675,
"grad_norm": 0.203125,
"learning_rate": 7.590361445783133e-06,
"loss": 1.3512,
"step": 100
},
{
"epoch": 0.2433734939759036,
"grad_norm": 0.21875,
"learning_rate": 7.5662650602409645e-06,
"loss": 1.2999,
"step": 101
},
{
"epoch": 0.2457831325301205,
"grad_norm": 0.1474609375,
"learning_rate": 7.5421686746987955e-06,
"loss": 1.2697,
"step": 102
},
{
"epoch": 0.24819277108433735,
"grad_norm": 0.162109375,
"learning_rate": 7.518072289156627e-06,
"loss": 1.2562,
"step": 103
},
{
"epoch": 0.25060240963855424,
"grad_norm": 0.1552734375,
"learning_rate": 7.493975903614459e-06,
"loss": 1.3084,
"step": 104
},
{
"epoch": 0.25301204819277107,
"grad_norm": 0.23046875,
"learning_rate": 7.469879518072289e-06,
"loss": 1.3325,
"step": 105
},
{
"epoch": 0.25542168674698795,
"grad_norm": 0.16796875,
"learning_rate": 7.445783132530121e-06,
"loss": 1.2602,
"step": 106
},
{
"epoch": 0.25783132530120484,
"grad_norm": 0.1533203125,
"learning_rate": 7.4216867469879526e-06,
"loss": 1.3126,
"step": 107
},
{
"epoch": 0.26024096385542167,
"grad_norm": 0.1474609375,
"learning_rate": 7.3975903614457835e-06,
"loss": 1.3055,
"step": 108
},
{
"epoch": 0.26265060240963856,
"grad_norm": 0.185546875,
"learning_rate": 7.373493975903615e-06,
"loss": 1.3219,
"step": 109
},
{
"epoch": 0.26506024096385544,
"grad_norm": 0.158203125,
"learning_rate": 7.349397590361447e-06,
"loss": 1.3015,
"step": 110
},
{
"epoch": 0.2674698795180723,
"grad_norm": 0.1630859375,
"learning_rate": 7.325301204819277e-06,
"loss": 1.2532,
"step": 111
},
{
"epoch": 0.26987951807228916,
"grad_norm": 0.142578125,
"learning_rate": 7.301204819277109e-06,
"loss": 1.2487,
"step": 112
},
{
"epoch": 0.27228915662650605,
"grad_norm": 0.1474609375,
"learning_rate": 7.277108433734941e-06,
"loss": 1.2979,
"step": 113
},
{
"epoch": 0.2746987951807229,
"grad_norm": 0.15234375,
"learning_rate": 7.2530120481927715e-06,
"loss": 1.3236,
"step": 114
},
{
"epoch": 0.27710843373493976,
"grad_norm": 0.1513671875,
"learning_rate": 7.228915662650603e-06,
"loss": 1.2813,
"step": 115
},
{
"epoch": 0.27951807228915665,
"grad_norm": 0.1982421875,
"learning_rate": 7.204819277108435e-06,
"loss": 1.2829,
"step": 116
},
{
"epoch": 0.2819277108433735,
"grad_norm": 0.15625,
"learning_rate": 7.180722891566265e-06,
"loss": 1.2742,
"step": 117
},
{
"epoch": 0.28433734939759037,
"grad_norm": 0.1650390625,
"learning_rate": 7.156626506024097e-06,
"loss": 1.2569,
"step": 118
},
{
"epoch": 0.28674698795180725,
"grad_norm": 0.154296875,
"learning_rate": 7.132530120481929e-06,
"loss": 1.2845,
"step": 119
},
{
"epoch": 0.2891566265060241,
"grad_norm": 0.1826171875,
"learning_rate": 7.1084337349397595e-06,
"loss": 1.2786,
"step": 120
},
{
"epoch": 0.29156626506024097,
"grad_norm": 0.205078125,
"learning_rate": 7.084337349397591e-06,
"loss": 1.2551,
"step": 121
},
{
"epoch": 0.29397590361445786,
"grad_norm": 0.1904296875,
"learning_rate": 7.060240963855422e-06,
"loss": 1.29,
"step": 122
},
{
"epoch": 0.2963855421686747,
"grad_norm": 0.1572265625,
"learning_rate": 7.036144578313253e-06,
"loss": 1.2848,
"step": 123
},
{
"epoch": 0.2987951807228916,
"grad_norm": 0.15625,
"learning_rate": 7.012048192771085e-06,
"loss": 1.2824,
"step": 124
},
{
"epoch": 0.30120481927710846,
"grad_norm": 0.1591796875,
"learning_rate": 6.987951807228917e-06,
"loss": 1.2811,
"step": 125
},
{
"epoch": 0.3036144578313253,
"grad_norm": 0.1552734375,
"learning_rate": 6.963855421686747e-06,
"loss": 1.2695,
"step": 126
},
{
"epoch": 0.3060240963855422,
"grad_norm": 0.1591796875,
"learning_rate": 6.9397590361445784e-06,
"loss": 1.2875,
"step": 127
},
{
"epoch": 0.30843373493975906,
"grad_norm": 0.1982421875,
"learning_rate": 6.91566265060241e-06,
"loss": 1.2343,
"step": 128
},
{
"epoch": 0.3108433734939759,
"grad_norm": 0.15234375,
"learning_rate": 6.891566265060241e-06,
"loss": 1.2445,
"step": 129
},
{
"epoch": 0.3132530120481928,
"grad_norm": 0.19921875,
"learning_rate": 6.867469879518073e-06,
"loss": 1.2923,
"step": 130
},
{
"epoch": 0.3156626506024096,
"grad_norm": 0.181640625,
"learning_rate": 6.843373493975905e-06,
"loss": 1.2072,
"step": 131
},
{
"epoch": 0.3180722891566265,
"grad_norm": 0.1572265625,
"learning_rate": 6.819277108433735e-06,
"loss": 1.2216,
"step": 132
},
{
"epoch": 0.3204819277108434,
"grad_norm": 0.1630859375,
"learning_rate": 6.7951807228915665e-06,
"loss": 1.3419,
"step": 133
},
{
"epoch": 0.3228915662650602,
"grad_norm": 0.154296875,
"learning_rate": 6.771084337349398e-06,
"loss": 1.2857,
"step": 134
},
{
"epoch": 0.3253012048192771,
"grad_norm": 0.1806640625,
"learning_rate": 6.746987951807229e-06,
"loss": 1.2438,
"step": 135
},
{
"epoch": 0.327710843373494,
"grad_norm": 0.1552734375,
"learning_rate": 6.722891566265061e-06,
"loss": 1.2539,
"step": 136
},
{
"epoch": 0.3301204819277108,
"grad_norm": 0.162109375,
"learning_rate": 6.698795180722893e-06,
"loss": 1.3105,
"step": 137
},
{
"epoch": 0.3325301204819277,
"grad_norm": 0.2041015625,
"learning_rate": 6.674698795180723e-06,
"loss": 1.2704,
"step": 138
},
{
"epoch": 0.3349397590361446,
"grad_norm": 0.1640625,
"learning_rate": 6.6506024096385545e-06,
"loss": 1.3083,
"step": 139
},
{
"epoch": 0.3373493975903614,
"grad_norm": 0.1650390625,
"learning_rate": 6.626506024096386e-06,
"loss": 1.2866,
"step": 140
},
{
"epoch": 0.3397590361445783,
"grad_norm": 0.154296875,
"learning_rate": 6.602409638554217e-06,
"loss": 1.2263,
"step": 141
},
{
"epoch": 0.3421686746987952,
"grad_norm": 0.1796875,
"learning_rate": 6.578313253012049e-06,
"loss": 1.2755,
"step": 142
},
{
"epoch": 0.344578313253012,
"grad_norm": 0.1845703125,
"learning_rate": 6.554216867469881e-06,
"loss": 1.2335,
"step": 143
},
{
"epoch": 0.3469879518072289,
"grad_norm": 0.158203125,
"learning_rate": 6.530120481927711e-06,
"loss": 1.2253,
"step": 144
},
{
"epoch": 0.3493975903614458,
"grad_norm": 0.162109375,
"learning_rate": 6.5060240963855425e-06,
"loss": 1.2575,
"step": 145
},
{
"epoch": 0.35180722891566263,
"grad_norm": 0.1669921875,
"learning_rate": 6.481927710843374e-06,
"loss": 1.2449,
"step": 146
},
{
"epoch": 0.3542168674698795,
"grad_norm": 0.15625,
"learning_rate": 6.457831325301205e-06,
"loss": 1.3099,
"step": 147
},
{
"epoch": 0.3566265060240964,
"grad_norm": 0.15625,
"learning_rate": 6.433734939759036e-06,
"loss": 1.2514,
"step": 148
},
{
"epoch": 0.35903614457831323,
"grad_norm": 0.1630859375,
"learning_rate": 6.409638554216868e-06,
"loss": 1.2865,
"step": 149
},
{
"epoch": 0.3614457831325301,
"grad_norm": 0.162109375,
"learning_rate": 6.385542168674699e-06,
"loss": 1.2442,
"step": 150
},
{
"epoch": 0.363855421686747,
"grad_norm": 0.2138671875,
"learning_rate": 6.3614457831325305e-06,
"loss": 1.2158,
"step": 151
},
{
"epoch": 0.36626506024096384,
"grad_norm": 0.1708984375,
"learning_rate": 6.337349397590362e-06,
"loss": 1.2311,
"step": 152
},
{
"epoch": 0.3686746987951807,
"grad_norm": 0.1728515625,
"learning_rate": 6.313253012048192e-06,
"loss": 1.2171,
"step": 153
},
{
"epoch": 0.3710843373493976,
"grad_norm": 0.1572265625,
"learning_rate": 6.289156626506024e-06,
"loss": 1.1648,
"step": 154
},
{
"epoch": 0.37349397590361444,
"grad_norm": 0.1611328125,
"learning_rate": 6.265060240963856e-06,
"loss": 1.2545,
"step": 155
},
{
"epoch": 0.3759036144578313,
"grad_norm": 0.171875,
"learning_rate": 6.240963855421688e-06,
"loss": 1.2812,
"step": 156
},
{
"epoch": 0.3783132530120482,
"grad_norm": 0.1689453125,
"learning_rate": 6.2168674698795185e-06,
"loss": 1.2385,
"step": 157
},
{
"epoch": 0.38072289156626504,
"grad_norm": 0.16015625,
"learning_rate": 6.19277108433735e-06,
"loss": 1.2154,
"step": 158
},
{
"epoch": 0.38313253012048193,
"grad_norm": 0.1630859375,
"learning_rate": 6.168674698795182e-06,
"loss": 1.2235,
"step": 159
},
{
"epoch": 0.3855421686746988,
"grad_norm": 0.251953125,
"learning_rate": 6.144578313253012e-06,
"loss": 1.2003,
"step": 160
},
{
"epoch": 0.38795180722891565,
"grad_norm": 0.22265625,
"learning_rate": 6.120481927710844e-06,
"loss": 1.2295,
"step": 161
},
{
"epoch": 0.39036144578313253,
"grad_norm": 0.1787109375,
"learning_rate": 6.096385542168676e-06,
"loss": 1.2656,
"step": 162
},
{
"epoch": 0.3927710843373494,
"grad_norm": 0.16796875,
"learning_rate": 6.0722891566265066e-06,
"loss": 1.2085,
"step": 163
},
{
"epoch": 0.39518072289156625,
"grad_norm": 0.26171875,
"learning_rate": 6.048192771084338e-06,
"loss": 1.3061,
"step": 164
},
{
"epoch": 0.39759036144578314,
"grad_norm": 0.166015625,
"learning_rate": 6.02409638554217e-06,
"loss": 1.2648,
"step": 165
},
{
"epoch": 0.4,
"grad_norm": 0.1650390625,
"learning_rate": 6e-06,
"loss": 1.2459,
"step": 166
},
{
"epoch": 0.40240963855421685,
"grad_norm": 0.162109375,
"learning_rate": 5.975903614457832e-06,
"loss": 1.2652,
"step": 167
},
{
"epoch": 0.40481927710843374,
"grad_norm": 0.16796875,
"learning_rate": 5.951807228915664e-06,
"loss": 1.2533,
"step": 168
},
{
"epoch": 0.4072289156626506,
"grad_norm": 0.1923828125,
"learning_rate": 5.927710843373495e-06,
"loss": 1.282,
"step": 169
},
{
"epoch": 0.40963855421686746,
"grad_norm": 0.162109375,
"learning_rate": 5.9036144578313255e-06,
"loss": 1.244,
"step": 170
},
{
"epoch": 0.41204819277108434,
"grad_norm": 0.1923828125,
"learning_rate": 5.879518072289157e-06,
"loss": 1.2396,
"step": 171
},
{
"epoch": 0.41445783132530123,
"grad_norm": 0.193359375,
"learning_rate": 5.855421686746988e-06,
"loss": 1.199,
"step": 172
},
{
"epoch": 0.41686746987951806,
"grad_norm": 0.1630859375,
"learning_rate": 5.83132530120482e-06,
"loss": 1.1981,
"step": 173
},
{
"epoch": 0.41927710843373495,
"grad_norm": 0.154296875,
"learning_rate": 5.807228915662652e-06,
"loss": 1.2312,
"step": 174
},
{
"epoch": 0.42168674698795183,
"grad_norm": 0.1591796875,
"learning_rate": 5.783132530120482e-06,
"loss": 1.2546,
"step": 175
},
{
"epoch": 0.42409638554216866,
"grad_norm": 0.314453125,
"learning_rate": 5.7590361445783135e-06,
"loss": 1.233,
"step": 176
},
{
"epoch": 0.42650602409638555,
"grad_norm": 0.1943359375,
"learning_rate": 5.734939759036145e-06,
"loss": 1.2589,
"step": 177
},
{
"epoch": 0.42891566265060244,
"grad_norm": 0.18359375,
"learning_rate": 5.710843373493976e-06,
"loss": 1.1707,
"step": 178
},
{
"epoch": 0.43132530120481927,
"grad_norm": 0.1728515625,
"learning_rate": 5.686746987951808e-06,
"loss": 1.2565,
"step": 179
},
{
"epoch": 0.43373493975903615,
"grad_norm": 0.166015625,
"learning_rate": 5.66265060240964e-06,
"loss": 1.1999,
"step": 180
},
{
"epoch": 0.43614457831325304,
"grad_norm": 0.1708984375,
"learning_rate": 5.63855421686747e-06,
"loss": 1.2431,
"step": 181
},
{
"epoch": 0.43855421686746987,
"grad_norm": 0.1630859375,
"learning_rate": 5.6144578313253015e-06,
"loss": 1.2073,
"step": 182
},
{
"epoch": 0.44096385542168676,
"grad_norm": 3.984375,
"learning_rate": 5.590361445783133e-06,
"loss": 1.2639,
"step": 183
},
{
"epoch": 0.4433734939759036,
"grad_norm": 0.2001953125,
"learning_rate": 5.566265060240964e-06,
"loss": 1.2562,
"step": 184
},
{
"epoch": 0.4457831325301205,
"grad_norm": 0.267578125,
"learning_rate": 5.542168674698796e-06,
"loss": 1.211,
"step": 185
},
{
"epoch": 0.44819277108433736,
"grad_norm": 0.169921875,
"learning_rate": 5.518072289156628e-06,
"loss": 1.2354,
"step": 186
},
{
"epoch": 0.4506024096385542,
"grad_norm": 0.1767578125,
"learning_rate": 5.493975903614458e-06,
"loss": 1.2065,
"step": 187
},
{
"epoch": 0.4530120481927711,
"grad_norm": 0.16796875,
"learning_rate": 5.4698795180722896e-06,
"loss": 1.2138,
"step": 188
},
{
"epoch": 0.45542168674698796,
"grad_norm": 0.1611328125,
"learning_rate": 5.445783132530121e-06,
"loss": 1.2445,
"step": 189
},
{
"epoch": 0.4578313253012048,
"grad_norm": 0.169921875,
"learning_rate": 5.421686746987952e-06,
"loss": 1.2164,
"step": 190
},
{
"epoch": 0.4602409638554217,
"grad_norm": 0.197265625,
"learning_rate": 5.397590361445784e-06,
"loss": 1.2706,
"step": 191
},
{
"epoch": 0.46265060240963857,
"grad_norm": 0.2060546875,
"learning_rate": 5.373493975903615e-06,
"loss": 1.1854,
"step": 192
},
{
"epoch": 0.4650602409638554,
"grad_norm": 0.1953125,
"learning_rate": 5.349397590361446e-06,
"loss": 1.19,
"step": 193
},
{
"epoch": 0.4674698795180723,
"grad_norm": 0.2138671875,
"learning_rate": 5.325301204819278e-06,
"loss": 1.1898,
"step": 194
},
{
"epoch": 0.46987951807228917,
"grad_norm": 0.1787109375,
"learning_rate": 5.301204819277109e-06,
"loss": 1.173,
"step": 195
},
{
"epoch": 0.472289156626506,
"grad_norm": 0.216796875,
"learning_rate": 5.27710843373494e-06,
"loss": 1.243,
"step": 196
},
{
"epoch": 0.4746987951807229,
"grad_norm": 0.2236328125,
"learning_rate": 5.253012048192771e-06,
"loss": 1.2215,
"step": 197
},
{
"epoch": 0.4771084337349398,
"grad_norm": 0.2353515625,
"learning_rate": 5.228915662650603e-06,
"loss": 1.206,
"step": 198
},
{
"epoch": 0.4795180722891566,
"grad_norm": 0.16796875,
"learning_rate": 5.204819277108434e-06,
"loss": 1.2018,
"step": 199
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.1923828125,
"learning_rate": 5.180722891566266e-06,
"loss": 1.2342,
"step": 200
},
{
"epoch": 0.4843373493975904,
"grad_norm": 0.173828125,
"learning_rate": 5.156626506024097e-06,
"loss": 1.2479,
"step": 201
},
{
"epoch": 0.4867469879518072,
"grad_norm": 0.16796875,
"learning_rate": 5.132530120481927e-06,
"loss": 1.2312,
"step": 202
},
{
"epoch": 0.4891566265060241,
"grad_norm": 0.16796875,
"learning_rate": 5.108433734939759e-06,
"loss": 1.1447,
"step": 203
},
{
"epoch": 0.491566265060241,
"grad_norm": 0.21484375,
"learning_rate": 5.084337349397591e-06,
"loss": 1.2487,
"step": 204
},
{
"epoch": 0.4939759036144578,
"grad_norm": 0.189453125,
"learning_rate": 5.060240963855422e-06,
"loss": 1.2035,
"step": 205
},
{
"epoch": 0.4963855421686747,
"grad_norm": 0.1796875,
"learning_rate": 5.036144578313254e-06,
"loss": 1.2573,
"step": 206
},
{
"epoch": 0.4987951807228916,
"grad_norm": 0.171875,
"learning_rate": 5.012048192771085e-06,
"loss": 1.2324,
"step": 207
},
{
"epoch": 0.5012048192771085,
"grad_norm": 0.16796875,
"learning_rate": 4.987951807228916e-06,
"loss": 1.1985,
"step": 208
},
{
"epoch": 0.5036144578313253,
"grad_norm": 0.1767578125,
"learning_rate": 4.963855421686747e-06,
"loss": 1.2165,
"step": 209
},
{
"epoch": 0.5060240963855421,
"grad_norm": 0.173828125,
"learning_rate": 4.939759036144578e-06,
"loss": 1.2033,
"step": 210
},
{
"epoch": 0.5084337349397591,
"grad_norm": 0.1630859375,
"learning_rate": 4.91566265060241e-06,
"loss": 1.1665,
"step": 211
},
{
"epoch": 0.5108433734939759,
"grad_norm": 0.1796875,
"learning_rate": 4.891566265060242e-06,
"loss": 1.1848,
"step": 212
},
{
"epoch": 0.5132530120481927,
"grad_norm": 0.1669921875,
"learning_rate": 4.8674698795180725e-06,
"loss": 1.2168,
"step": 213
},
{
"epoch": 0.5156626506024097,
"grad_norm": 0.17578125,
"learning_rate": 4.843373493975904e-06,
"loss": 1.2074,
"step": 214
},
{
"epoch": 0.5180722891566265,
"grad_norm": 0.1962890625,
"learning_rate": 4.819277108433735e-06,
"loss": 1.2167,
"step": 215
},
{
"epoch": 0.5204819277108433,
"grad_norm": 0.1787109375,
"learning_rate": 4.795180722891566e-06,
"loss": 1.1521,
"step": 216
},
{
"epoch": 0.5228915662650603,
"grad_norm": 0.1748046875,
"learning_rate": 4.771084337349398e-06,
"loss": 1.2391,
"step": 217
},
{
"epoch": 0.5253012048192771,
"grad_norm": 0.1806640625,
"learning_rate": 4.74698795180723e-06,
"loss": 1.2543,
"step": 218
},
{
"epoch": 0.5277108433734939,
"grad_norm": 0.1787109375,
"learning_rate": 4.7228915662650606e-06,
"loss": 1.2133,
"step": 219
},
{
"epoch": 0.5301204819277109,
"grad_norm": 0.48046875,
"learning_rate": 4.698795180722892e-06,
"loss": 1.1813,
"step": 220
},
{
"epoch": 0.5325301204819277,
"grad_norm": 0.1689453125,
"learning_rate": 4.674698795180723e-06,
"loss": 1.2244,
"step": 221
},
{
"epoch": 0.5349397590361445,
"grad_norm": 0.220703125,
"learning_rate": 4.650602409638554e-06,
"loss": 1.2437,
"step": 222
},
{
"epoch": 0.5373493975903615,
"grad_norm": 0.1796875,
"learning_rate": 4.626506024096386e-06,
"loss": 1.1834,
"step": 223
},
{
"epoch": 0.5397590361445783,
"grad_norm": 0.1669921875,
"learning_rate": 4.602409638554217e-06,
"loss": 1.1981,
"step": 224
},
{
"epoch": 0.5421686746987951,
"grad_norm": 0.1865234375,
"learning_rate": 4.578313253012049e-06,
"loss": 1.195,
"step": 225
},
{
"epoch": 0.5445783132530121,
"grad_norm": 0.1826171875,
"learning_rate": 4.55421686746988e-06,
"loss": 1.199,
"step": 226
},
{
"epoch": 0.5469879518072289,
"grad_norm": 0.1787109375,
"learning_rate": 4.530120481927711e-06,
"loss": 1.286,
"step": 227
},
{
"epoch": 0.5493975903614458,
"grad_norm": 0.2236328125,
"learning_rate": 4.506024096385542e-06,
"loss": 1.2637,
"step": 228
},
{
"epoch": 0.5518072289156627,
"grad_norm": 0.2060546875,
"learning_rate": 4.481927710843374e-06,
"loss": 1.2137,
"step": 229
},
{
"epoch": 0.5542168674698795,
"grad_norm": 0.234375,
"learning_rate": 4.457831325301205e-06,
"loss": 1.2075,
"step": 230
},
{
"epoch": 0.5566265060240964,
"grad_norm": 0.1962890625,
"learning_rate": 4.433734939759037e-06,
"loss": 1.1673,
"step": 231
},
{
"epoch": 0.5590361445783133,
"grad_norm": 0.201171875,
"learning_rate": 4.4096385542168675e-06,
"loss": 1.1583,
"step": 232
},
{
"epoch": 0.5614457831325301,
"grad_norm": 0.1845703125,
"learning_rate": 4.385542168674699e-06,
"loss": 1.2021,
"step": 233
},
{
"epoch": 0.563855421686747,
"grad_norm": 0.169921875,
"learning_rate": 4.361445783132531e-06,
"loss": 1.1934,
"step": 234
},
{
"epoch": 0.5662650602409639,
"grad_norm": 0.166015625,
"learning_rate": 4.337349397590362e-06,
"loss": 1.1967,
"step": 235
},
{
"epoch": 0.5686746987951807,
"grad_norm": 0.166015625,
"learning_rate": 4.313253012048193e-06,
"loss": 1.2122,
"step": 236
},
{
"epoch": 0.5710843373493976,
"grad_norm": 0.166015625,
"learning_rate": 4.289156626506025e-06,
"loss": 1.2235,
"step": 237
},
{
"epoch": 0.5734939759036145,
"grad_norm": 0.1806640625,
"learning_rate": 4.2650602409638555e-06,
"loss": 1.246,
"step": 238
},
{
"epoch": 0.5759036144578313,
"grad_norm": 0.1953125,
"learning_rate": 4.240963855421687e-06,
"loss": 1.2286,
"step": 239
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.1640625,
"learning_rate": 4.216867469879519e-06,
"loss": 1.2062,
"step": 240
},
{
"epoch": 0.5807228915662651,
"grad_norm": 0.1689453125,
"learning_rate": 4.19277108433735e-06,
"loss": 1.2193,
"step": 241
},
{
"epoch": 0.5831325301204819,
"grad_norm": 0.1708984375,
"learning_rate": 4.168674698795181e-06,
"loss": 1.191,
"step": 242
},
{
"epoch": 0.5855421686746988,
"grad_norm": 0.2255859375,
"learning_rate": 4.144578313253013e-06,
"loss": 1.1525,
"step": 243
},
{
"epoch": 0.5879518072289157,
"grad_norm": 0.1767578125,
"learning_rate": 4.1204819277108436e-06,
"loss": 1.1852,
"step": 244
},
{
"epoch": 0.5903614457831325,
"grad_norm": 0.1787109375,
"learning_rate": 4.096385542168675e-06,
"loss": 1.1511,
"step": 245
},
{
"epoch": 0.5927710843373494,
"grad_norm": 0.2021484375,
"learning_rate": 4.072289156626506e-06,
"loss": 1.1303,
"step": 246
},
{
"epoch": 0.5951807228915663,
"grad_norm": 0.16796875,
"learning_rate": 4.048192771084338e-06,
"loss": 1.2076,
"step": 247
},
{
"epoch": 0.5975903614457831,
"grad_norm": 0.18359375,
"learning_rate": 4.024096385542169e-06,
"loss": 1.1557,
"step": 248
},
{
"epoch": 0.6,
"grad_norm": 0.1767578125,
"learning_rate": 4.000000000000001e-06,
"loss": 1.1526,
"step": 249
},
{
"epoch": 0.6024096385542169,
"grad_norm": 0.1923828125,
"learning_rate": 3.975903614457832e-06,
"loss": 1.2734,
"step": 250
},
{
"epoch": 0.6048192771084338,
"grad_norm": 0.2060546875,
"learning_rate": 3.9518072289156625e-06,
"loss": 1.151,
"step": 251
},
{
"epoch": 0.6072289156626506,
"grad_norm": 0.18359375,
"learning_rate": 3.927710843373494e-06,
"loss": 1.2488,
"step": 252
},
{
"epoch": 0.6096385542168675,
"grad_norm": 0.1748046875,
"learning_rate": 3.903614457831326e-06,
"loss": 1.2773,
"step": 253
},
{
"epoch": 0.6120481927710844,
"grad_norm": 0.1845703125,
"learning_rate": 3.879518072289157e-06,
"loss": 1.1906,
"step": 254
},
{
"epoch": 0.6144578313253012,
"grad_norm": 0.201171875,
"learning_rate": 3.855421686746989e-06,
"loss": 1.1557,
"step": 255
},
{
"epoch": 0.6168674698795181,
"grad_norm": 0.185546875,
"learning_rate": 3.83132530120482e-06,
"loss": 1.1864,
"step": 256
},
{
"epoch": 0.619277108433735,
"grad_norm": 0.19921875,
"learning_rate": 3.807228915662651e-06,
"loss": 1.1687,
"step": 257
},
{
"epoch": 0.6216867469879518,
"grad_norm": 0.177734375,
"learning_rate": 3.7831325301204823e-06,
"loss": 1.2305,
"step": 258
},
{
"epoch": 0.6240963855421687,
"grad_norm": 0.2236328125,
"learning_rate": 3.7590361445783136e-06,
"loss": 1.244,
"step": 259
},
{
"epoch": 0.6265060240963856,
"grad_norm": 0.17578125,
"learning_rate": 3.7349397590361445e-06,
"loss": 1.2029,
"step": 260
},
{
"epoch": 0.6289156626506024,
"grad_norm": 0.181640625,
"learning_rate": 3.7108433734939763e-06,
"loss": 1.2267,
"step": 261
},
{
"epoch": 0.6313253012048192,
"grad_norm": 0.2021484375,
"learning_rate": 3.6867469879518076e-06,
"loss": 1.1931,
"step": 262
},
{
"epoch": 0.6337349397590362,
"grad_norm": 0.177734375,
"learning_rate": 3.6626506024096385e-06,
"loss": 1.1854,
"step": 263
},
{
"epoch": 0.636144578313253,
"grad_norm": 0.1748046875,
"learning_rate": 3.6385542168674703e-06,
"loss": 1.2057,
"step": 264
},
{
"epoch": 0.6385542168674698,
"grad_norm": 0.177734375,
"learning_rate": 3.6144578313253016e-06,
"loss": 1.2216,
"step": 265
},
{
"epoch": 0.6409638554216868,
"grad_norm": 0.1796875,
"learning_rate": 3.5903614457831325e-06,
"loss": 1.1754,
"step": 266
},
{
"epoch": 0.6433734939759036,
"grad_norm": 0.177734375,
"learning_rate": 3.5662650602409643e-06,
"loss": 1.2114,
"step": 267
},
{
"epoch": 0.6457831325301204,
"grad_norm": 0.1796875,
"learning_rate": 3.5421686746987956e-06,
"loss": 1.1781,
"step": 268
},
{
"epoch": 0.6481927710843374,
"grad_norm": 0.51171875,
"learning_rate": 3.5180722891566266e-06,
"loss": 1.1875,
"step": 269
},
{
"epoch": 0.6506024096385542,
"grad_norm": 0.2255859375,
"learning_rate": 3.4939759036144583e-06,
"loss": 1.2973,
"step": 270
},
{
"epoch": 0.653012048192771,
"grad_norm": 0.16796875,
"learning_rate": 3.4698795180722892e-06,
"loss": 1.2095,
"step": 271
},
{
"epoch": 0.655421686746988,
"grad_norm": 0.169921875,
"learning_rate": 3.4457831325301206e-06,
"loss": 1.1939,
"step": 272
},
{
"epoch": 0.6578313253012048,
"grad_norm": 0.1748046875,
"learning_rate": 3.4216867469879523e-06,
"loss": 1.1503,
"step": 273
},
{
"epoch": 0.6602409638554216,
"grad_norm": 0.16796875,
"learning_rate": 3.3975903614457832e-06,
"loss": 1.1429,
"step": 274
},
{
"epoch": 0.6626506024096386,
"grad_norm": 0.1826171875,
"learning_rate": 3.3734939759036146e-06,
"loss": 1.1845,
"step": 275
},
{
"epoch": 0.6650602409638554,
"grad_norm": 0.353515625,
"learning_rate": 3.3493975903614463e-06,
"loss": 1.1825,
"step": 276
},
{
"epoch": 0.6674698795180722,
"grad_norm": 0.341796875,
"learning_rate": 3.3253012048192772e-06,
"loss": 1.1827,
"step": 277
},
{
"epoch": 0.6698795180722892,
"grad_norm": 0.1669921875,
"learning_rate": 3.3012048192771086e-06,
"loss": 1.1598,
"step": 278
},
{
"epoch": 0.672289156626506,
"grad_norm": 0.1767578125,
"learning_rate": 3.2771084337349403e-06,
"loss": 1.2151,
"step": 279
},
{
"epoch": 0.6746987951807228,
"grad_norm": 0.181640625,
"learning_rate": 3.2530120481927713e-06,
"loss": 1.1548,
"step": 280
},
{
"epoch": 0.6771084337349398,
"grad_norm": 0.1689453125,
"learning_rate": 3.2289156626506026e-06,
"loss": 1.2327,
"step": 281
},
{
"epoch": 0.6795180722891566,
"grad_norm": 0.2138671875,
"learning_rate": 3.204819277108434e-06,
"loss": 1.1985,
"step": 282
},
{
"epoch": 0.6819277108433734,
"grad_norm": 0.1962890625,
"learning_rate": 3.1807228915662653e-06,
"loss": 1.2324,
"step": 283
},
{
"epoch": 0.6843373493975904,
"grad_norm": 0.169921875,
"learning_rate": 3.156626506024096e-06,
"loss": 1.1778,
"step": 284
},
{
"epoch": 0.6867469879518072,
"grad_norm": 0.220703125,
"learning_rate": 3.132530120481928e-06,
"loss": 1.2193,
"step": 285
},
{
"epoch": 0.689156626506024,
"grad_norm": 0.1767578125,
"learning_rate": 3.1084337349397593e-06,
"loss": 1.1658,
"step": 286
},
{
"epoch": 0.691566265060241,
"grad_norm": 0.2236328125,
"learning_rate": 3.084337349397591e-06,
"loss": 1.2838,
"step": 287
},
{
"epoch": 0.6939759036144578,
"grad_norm": 0.1748046875,
"learning_rate": 3.060240963855422e-06,
"loss": 1.2215,
"step": 288
},
{
"epoch": 0.6963855421686747,
"grad_norm": 0.1953125,
"learning_rate": 3.0361445783132533e-06,
"loss": 1.1141,
"step": 289
},
{
"epoch": 0.6987951807228916,
"grad_norm": 0.171875,
"learning_rate": 3.012048192771085e-06,
"loss": 1.2054,
"step": 290
},
{
"epoch": 0.7012048192771084,
"grad_norm": 0.177734375,
"learning_rate": 2.987951807228916e-06,
"loss": 1.1681,
"step": 291
},
{
"epoch": 0.7036144578313253,
"grad_norm": 0.1826171875,
"learning_rate": 2.9638554216867473e-06,
"loss": 1.1685,
"step": 292
},
{
"epoch": 0.7060240963855422,
"grad_norm": 0.1748046875,
"learning_rate": 2.9397590361445786e-06,
"loss": 1.1455,
"step": 293
},
{
"epoch": 0.708433734939759,
"grad_norm": 0.1826171875,
"learning_rate": 2.91566265060241e-06,
"loss": 1.2131,
"step": 294
},
{
"epoch": 0.7108433734939759,
"grad_norm": 0.173828125,
"learning_rate": 2.891566265060241e-06,
"loss": 1.2477,
"step": 295
},
{
"epoch": 0.7132530120481928,
"grad_norm": 0.2119140625,
"learning_rate": 2.8674698795180726e-06,
"loss": 1.2399,
"step": 296
},
{
"epoch": 0.7156626506024096,
"grad_norm": 0.17578125,
"learning_rate": 2.843373493975904e-06,
"loss": 1.2477,
"step": 297
},
{
"epoch": 0.7180722891566265,
"grad_norm": 0.1806640625,
"learning_rate": 2.819277108433735e-06,
"loss": 1.1986,
"step": 298
},
{
"epoch": 0.7204819277108434,
"grad_norm": 0.1728515625,
"learning_rate": 2.7951807228915666e-06,
"loss": 1.1513,
"step": 299
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.1728515625,
"learning_rate": 2.771084337349398e-06,
"loss": 1.2202,
"step": 300
},
{
"epoch": 0.7253012048192771,
"grad_norm": 0.1875,
"learning_rate": 2.746987951807229e-06,
"loss": 1.2278,
"step": 301
},
{
"epoch": 0.727710843373494,
"grad_norm": 0.1728515625,
"learning_rate": 2.7228915662650607e-06,
"loss": 1.2158,
"step": 302
},
{
"epoch": 0.7301204819277108,
"grad_norm": 0.16796875,
"learning_rate": 2.698795180722892e-06,
"loss": 1.1696,
"step": 303
},
{
"epoch": 0.7325301204819277,
"grad_norm": 0.177734375,
"learning_rate": 2.674698795180723e-06,
"loss": 1.2091,
"step": 304
},
{
"epoch": 0.7349397590361446,
"grad_norm": 0.1728515625,
"learning_rate": 2.6506024096385547e-06,
"loss": 1.1598,
"step": 305
},
{
"epoch": 0.7373493975903614,
"grad_norm": 0.1796875,
"learning_rate": 2.6265060240963856e-06,
"loss": 1.207,
"step": 306
},
{
"epoch": 0.7397590361445783,
"grad_norm": 0.1923828125,
"learning_rate": 2.602409638554217e-06,
"loss": 1.2258,
"step": 307
},
{
"epoch": 0.7421686746987952,
"grad_norm": 0.177734375,
"learning_rate": 2.5783132530120487e-06,
"loss": 1.1626,
"step": 308
},
{
"epoch": 0.744578313253012,
"grad_norm": 0.201171875,
"learning_rate": 2.5542168674698796e-06,
"loss": 1.2301,
"step": 309
},
{
"epoch": 0.7469879518072289,
"grad_norm": 0.21484375,
"learning_rate": 2.530120481927711e-06,
"loss": 1.1551,
"step": 310
},
{
"epoch": 0.7493975903614458,
"grad_norm": 0.23046875,
"learning_rate": 2.5060240963855427e-06,
"loss": 1.1655,
"step": 311
},
{
"epoch": 0.7518072289156627,
"grad_norm": 0.173828125,
"learning_rate": 2.4819277108433736e-06,
"loss": 1.2255,
"step": 312
},
{
"epoch": 0.7542168674698795,
"grad_norm": 0.1708984375,
"learning_rate": 2.457831325301205e-06,
"loss": 1.2154,
"step": 313
},
{
"epoch": 0.7566265060240964,
"grad_norm": 0.166015625,
"learning_rate": 2.4337349397590363e-06,
"loss": 1.1923,
"step": 314
},
{
"epoch": 0.7590361445783133,
"grad_norm": 0.16796875,
"learning_rate": 2.4096385542168676e-06,
"loss": 1.217,
"step": 315
},
{
"epoch": 0.7614457831325301,
"grad_norm": 0.181640625,
"learning_rate": 2.385542168674699e-06,
"loss": 1.2221,
"step": 316
},
{
"epoch": 0.763855421686747,
"grad_norm": 0.2578125,
"learning_rate": 2.3614457831325303e-06,
"loss": 1.163,
"step": 317
},
{
"epoch": 0.7662650602409639,
"grad_norm": 0.2021484375,
"learning_rate": 2.3373493975903616e-06,
"loss": 1.187,
"step": 318
},
{
"epoch": 0.7686746987951807,
"grad_norm": 0.1953125,
"learning_rate": 2.313253012048193e-06,
"loss": 1.1824,
"step": 319
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.1708984375,
"learning_rate": 2.2891566265060243e-06,
"loss": 1.1858,
"step": 320
},
{
"epoch": 0.7734939759036145,
"grad_norm": 0.1708984375,
"learning_rate": 2.2650602409638556e-06,
"loss": 1.1922,
"step": 321
},
{
"epoch": 0.7759036144578313,
"grad_norm": 0.17578125,
"learning_rate": 2.240963855421687e-06,
"loss": 1.2378,
"step": 322
},
{
"epoch": 0.7783132530120482,
"grad_norm": 0.201171875,
"learning_rate": 2.2168674698795183e-06,
"loss": 1.1722,
"step": 323
},
{
"epoch": 0.7807228915662651,
"grad_norm": 0.173828125,
"learning_rate": 2.1927710843373496e-06,
"loss": 1.1634,
"step": 324
},
{
"epoch": 0.7831325301204819,
"grad_norm": 0.251953125,
"learning_rate": 2.168674698795181e-06,
"loss": 1.1894,
"step": 325
},
{
"epoch": 0.7855421686746988,
"grad_norm": 0.1943359375,
"learning_rate": 2.1445783132530123e-06,
"loss": 1.2023,
"step": 326
},
{
"epoch": 0.7879518072289157,
"grad_norm": 0.1904296875,
"learning_rate": 2.1204819277108437e-06,
"loss": 1.1758,
"step": 327
},
{
"epoch": 0.7903614457831325,
"grad_norm": 0.1748046875,
"learning_rate": 2.096385542168675e-06,
"loss": 1.1747,
"step": 328
},
{
"epoch": 0.7927710843373494,
"grad_norm": 0.1943359375,
"learning_rate": 2.0722891566265063e-06,
"loss": 1.228,
"step": 329
},
{
"epoch": 0.7951807228915663,
"grad_norm": 0.1943359375,
"learning_rate": 2.0481927710843377e-06,
"loss": 1.1958,
"step": 330
},
{
"epoch": 0.7975903614457831,
"grad_norm": 0.1923828125,
"learning_rate": 2.024096385542169e-06,
"loss": 1.1237,
"step": 331
},
{
"epoch": 0.8,
"grad_norm": 0.18359375,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.1705,
"step": 332
},
{
"epoch": 0.8024096385542169,
"grad_norm": 0.181640625,
"learning_rate": 1.9759036144578312e-06,
"loss": 1.2265,
"step": 333
},
{
"epoch": 0.8048192771084337,
"grad_norm": 0.177734375,
"learning_rate": 1.951807228915663e-06,
"loss": 1.1741,
"step": 334
},
{
"epoch": 0.8072289156626506,
"grad_norm": 0.173828125,
"learning_rate": 1.9277108433734943e-06,
"loss": 1.18,
"step": 335
},
{
"epoch": 0.8096385542168675,
"grad_norm": 0.26171875,
"learning_rate": 1.9036144578313255e-06,
"loss": 1.1867,
"step": 336
},
{
"epoch": 0.8120481927710843,
"grad_norm": 0.1796875,
"learning_rate": 1.8795180722891568e-06,
"loss": 1.2315,
"step": 337
},
{
"epoch": 0.8144578313253013,
"grad_norm": 0.1767578125,
"learning_rate": 1.8554216867469881e-06,
"loss": 1.2101,
"step": 338
},
{
"epoch": 0.8168674698795181,
"grad_norm": 0.1787109375,
"learning_rate": 1.8313253012048193e-06,
"loss": 1.1729,
"step": 339
},
{
"epoch": 0.8192771084337349,
"grad_norm": 0.18359375,
"learning_rate": 1.8072289156626508e-06,
"loss": 1.2136,
"step": 340
},
{
"epoch": 0.8216867469879519,
"grad_norm": 0.2470703125,
"learning_rate": 1.7831325301204822e-06,
"loss": 1.1676,
"step": 341
},
{
"epoch": 0.8240963855421687,
"grad_norm": 0.1962890625,
"learning_rate": 1.7590361445783133e-06,
"loss": 1.2039,
"step": 342
},
{
"epoch": 0.8265060240963855,
"grad_norm": 0.1826171875,
"learning_rate": 1.7349397590361446e-06,
"loss": 1.2019,
"step": 343
},
{
"epoch": 0.8289156626506025,
"grad_norm": 0.1826171875,
"learning_rate": 1.7108433734939762e-06,
"loss": 1.256,
"step": 344
},
{
"epoch": 0.8313253012048193,
"grad_norm": 0.1708984375,
"learning_rate": 1.6867469879518073e-06,
"loss": 1.1741,
"step": 345
},
{
"epoch": 0.8337349397590361,
"grad_norm": 0.1728515625,
"learning_rate": 1.6626506024096386e-06,
"loss": 1.2059,
"step": 346
},
{
"epoch": 0.8361445783132531,
"grad_norm": 0.171875,
"learning_rate": 1.6385542168674702e-06,
"loss": 1.2006,
"step": 347
},
{
"epoch": 0.8385542168674699,
"grad_norm": 0.1806640625,
"learning_rate": 1.6144578313253013e-06,
"loss": 1.2194,
"step": 348
},
{
"epoch": 0.8409638554216867,
"grad_norm": 0.1748046875,
"learning_rate": 1.5903614457831326e-06,
"loss": 1.1615,
"step": 349
},
{
"epoch": 0.8433734939759037,
"grad_norm": 0.1845703125,
"learning_rate": 1.566265060240964e-06,
"loss": 1.2338,
"step": 350
},
{
"epoch": 0.8457831325301205,
"grad_norm": 0.1708984375,
"learning_rate": 1.5421686746987955e-06,
"loss": 1.2289,
"step": 351
},
{
"epoch": 0.8481927710843373,
"grad_norm": 0.2255859375,
"learning_rate": 1.5180722891566266e-06,
"loss": 1.2027,
"step": 352
},
{
"epoch": 0.8506024096385543,
"grad_norm": 0.1865234375,
"learning_rate": 1.493975903614458e-06,
"loss": 1.2452,
"step": 353
},
{
"epoch": 0.8530120481927711,
"grad_norm": 0.1767578125,
"learning_rate": 1.4698795180722893e-06,
"loss": 1.2186,
"step": 354
},
{
"epoch": 0.8554216867469879,
"grad_norm": 0.1748046875,
"learning_rate": 1.4457831325301204e-06,
"loss": 1.1827,
"step": 355
},
{
"epoch": 0.8578313253012049,
"grad_norm": 0.2255859375,
"learning_rate": 1.421686746987952e-06,
"loss": 1.1932,
"step": 356
},
{
"epoch": 0.8602409638554217,
"grad_norm": 0.169921875,
"learning_rate": 1.3975903614457833e-06,
"loss": 1.1829,
"step": 357
},
{
"epoch": 0.8626506024096385,
"grad_norm": 0.1884765625,
"learning_rate": 1.3734939759036144e-06,
"loss": 1.1716,
"step": 358
},
{
"epoch": 0.8650602409638555,
"grad_norm": 0.1767578125,
"learning_rate": 1.349397590361446e-06,
"loss": 1.2319,
"step": 359
},
{
"epoch": 0.8674698795180723,
"grad_norm": 0.25390625,
"learning_rate": 1.3253012048192773e-06,
"loss": 1.1795,
"step": 360
},
{
"epoch": 0.8698795180722891,
"grad_norm": 0.1767578125,
"learning_rate": 1.3012048192771085e-06,
"loss": 1.1556,
"step": 361
},
{
"epoch": 0.8722891566265061,
"grad_norm": 0.255859375,
"learning_rate": 1.2771084337349398e-06,
"loss": 1.1645,
"step": 362
},
{
"epoch": 0.8746987951807229,
"grad_norm": 0.1787109375,
"learning_rate": 1.2530120481927713e-06,
"loss": 1.2587,
"step": 363
},
{
"epoch": 0.8771084337349397,
"grad_norm": 0.2041015625,
"learning_rate": 1.2289156626506025e-06,
"loss": 1.2195,
"step": 364
},
{
"epoch": 0.8795180722891566,
"grad_norm": 0.169921875,
"learning_rate": 1.2048192771084338e-06,
"loss": 1.1616,
"step": 365
},
{
"epoch": 0.8819277108433735,
"grad_norm": 0.2138671875,
"learning_rate": 1.1807228915662651e-06,
"loss": 1.185,
"step": 366
},
{
"epoch": 0.8843373493975903,
"grad_norm": 0.208984375,
"learning_rate": 1.1566265060240965e-06,
"loss": 1.1904,
"step": 367
},
{
"epoch": 0.8867469879518072,
"grad_norm": 0.1796875,
"learning_rate": 1.1325301204819278e-06,
"loss": 1.1684,
"step": 368
},
{
"epoch": 0.8891566265060241,
"grad_norm": 0.1943359375,
"learning_rate": 1.1084337349397592e-06,
"loss": 1.2207,
"step": 369
},
{
"epoch": 0.891566265060241,
"grad_norm": 0.1806640625,
"learning_rate": 1.0843373493975905e-06,
"loss": 1.1843,
"step": 370
},
{
"epoch": 0.8939759036144578,
"grad_norm": 0.171875,
"learning_rate": 1.0602409638554218e-06,
"loss": 1.191,
"step": 371
},
{
"epoch": 0.8963855421686747,
"grad_norm": 0.1875,
"learning_rate": 1.0361445783132532e-06,
"loss": 1.2322,
"step": 372
},
{
"epoch": 0.8987951807228916,
"grad_norm": 0.19921875,
"learning_rate": 1.0120481927710845e-06,
"loss": 1.155,
"step": 373
},
{
"epoch": 0.9012048192771084,
"grad_norm": 0.1953125,
"learning_rate": 9.879518072289156e-07,
"loss": 1.2351,
"step": 374
},
{
"epoch": 0.9036144578313253,
"grad_norm": 0.1767578125,
"learning_rate": 9.638554216867472e-07,
"loss": 1.1366,
"step": 375
},
{
"epoch": 0.9060240963855422,
"grad_norm": 0.1845703125,
"learning_rate": 9.397590361445784e-07,
"loss": 1.237,
"step": 376
},
{
"epoch": 0.908433734939759,
"grad_norm": 0.169921875,
"learning_rate": 9.156626506024096e-07,
"loss": 1.1675,
"step": 377
},
{
"epoch": 0.9108433734939759,
"grad_norm": 0.19140625,
"learning_rate": 8.915662650602411e-07,
"loss": 1.193,
"step": 378
},
{
"epoch": 0.9132530120481928,
"grad_norm": 0.1943359375,
"learning_rate": 8.674698795180723e-07,
"loss": 1.232,
"step": 379
},
{
"epoch": 0.9156626506024096,
"grad_norm": 0.171875,
"learning_rate": 8.433734939759036e-07,
"loss": 1.2171,
"step": 380
},
{
"epoch": 0.9180722891566265,
"grad_norm": 0.2109375,
"learning_rate": 8.192771084337351e-07,
"loss": 1.1892,
"step": 381
},
{
"epoch": 0.9204819277108434,
"grad_norm": 0.1748046875,
"learning_rate": 7.951807228915663e-07,
"loss": 1.2025,
"step": 382
},
{
"epoch": 0.9228915662650602,
"grad_norm": 0.1767578125,
"learning_rate": 7.710843373493978e-07,
"loss": 1.1685,
"step": 383
},
{
"epoch": 0.9253012048192771,
"grad_norm": 0.1767578125,
"learning_rate": 7.46987951807229e-07,
"loss": 1.2348,
"step": 384
},
{
"epoch": 0.927710843373494,
"grad_norm": 0.1787109375,
"learning_rate": 7.228915662650602e-07,
"loss": 1.1254,
"step": 385
},
{
"epoch": 0.9301204819277108,
"grad_norm": 0.2294921875,
"learning_rate": 6.987951807228917e-07,
"loss": 1.2119,
"step": 386
},
{
"epoch": 0.9325301204819277,
"grad_norm": 0.17578125,
"learning_rate": 6.74698795180723e-07,
"loss": 1.2123,
"step": 387
},
{
"epoch": 0.9349397590361446,
"grad_norm": 0.185546875,
"learning_rate": 6.506024096385542e-07,
"loss": 1.1913,
"step": 388
},
{
"epoch": 0.9373493975903614,
"grad_norm": 0.1767578125,
"learning_rate": 6.265060240963857e-07,
"loss": 1.2331,
"step": 389
},
{
"epoch": 0.9397590361445783,
"grad_norm": 0.1748046875,
"learning_rate": 6.024096385542169e-07,
"loss": 1.196,
"step": 390
},
{
"epoch": 0.9421686746987952,
"grad_norm": 0.193359375,
"learning_rate": 5.783132530120482e-07,
"loss": 1.1444,
"step": 391
},
{
"epoch": 0.944578313253012,
"grad_norm": 0.2021484375,
"learning_rate": 5.542168674698796e-07,
"loss": 1.2528,
"step": 392
},
{
"epoch": 0.946987951807229,
"grad_norm": 0.19140625,
"learning_rate": 5.301204819277109e-07,
"loss": 1.178,
"step": 393
},
{
"epoch": 0.9493975903614458,
"grad_norm": 0.1767578125,
"learning_rate": 5.060240963855422e-07,
"loss": 1.1813,
"step": 394
},
{
"epoch": 0.9518072289156626,
"grad_norm": 0.322265625,
"learning_rate": 4.819277108433736e-07,
"loss": 1.1372,
"step": 395
},
{
"epoch": 0.9542168674698795,
"grad_norm": 0.208984375,
"learning_rate": 4.578313253012048e-07,
"loss": 1.2609,
"step": 396
},
{
"epoch": 0.9566265060240964,
"grad_norm": 0.17578125,
"learning_rate": 4.3373493975903615e-07,
"loss": 1.1818,
"step": 397
},
{
"epoch": 0.9590361445783132,
"grad_norm": 0.1826171875,
"learning_rate": 4.0963855421686754e-07,
"loss": 1.1709,
"step": 398
},
{
"epoch": 0.9614457831325302,
"grad_norm": 0.2216796875,
"learning_rate": 3.855421686746989e-07,
"loss": 1.1202,
"step": 399
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.1826171875,
"learning_rate": 3.614457831325301e-07,
"loss": 1.1467,
"step": 400
},
{
"epoch": 0.9662650602409638,
"grad_norm": 0.1884765625,
"learning_rate": 3.373493975903615e-07,
"loss": 1.1886,
"step": 401
},
{
"epoch": 0.9686746987951808,
"grad_norm": 0.1689453125,
"learning_rate": 3.1325301204819284e-07,
"loss": 1.2327,
"step": 402
},
{
"epoch": 0.9710843373493976,
"grad_norm": 0.171875,
"learning_rate": 2.891566265060241e-07,
"loss": 1.1978,
"step": 403
},
{
"epoch": 0.9734939759036144,
"grad_norm": 0.208984375,
"learning_rate": 2.6506024096385546e-07,
"loss": 1.1517,
"step": 404
},
{
"epoch": 0.9759036144578314,
"grad_norm": 0.173828125,
"learning_rate": 2.409638554216868e-07,
"loss": 1.1905,
"step": 405
},
{
"epoch": 0.9783132530120482,
"grad_norm": 0.1796875,
"learning_rate": 2.1686746987951808e-07,
"loss": 1.2248,
"step": 406
},
{
"epoch": 0.980722891566265,
"grad_norm": 0.1884765625,
"learning_rate": 1.9277108433734944e-07,
"loss": 1.204,
"step": 407
},
{
"epoch": 0.983132530120482,
"grad_norm": 0.19140625,
"learning_rate": 1.6867469879518075e-07,
"loss": 1.168,
"step": 408
},
{
"epoch": 0.9855421686746988,
"grad_norm": 0.1689453125,
"learning_rate": 1.4457831325301206e-07,
"loss": 1.1737,
"step": 409
},
{
"epoch": 0.9879518072289156,
"grad_norm": 0.1845703125,
"learning_rate": 1.204819277108434e-07,
"loss": 1.1892,
"step": 410
},
{
"epoch": 0.9903614457831326,
"grad_norm": 0.30078125,
"learning_rate": 9.638554216867472e-08,
"loss": 1.2251,
"step": 411
},
{
"epoch": 0.9927710843373494,
"grad_norm": 0.1728515625,
"learning_rate": 7.228915662650603e-08,
"loss": 1.1567,
"step": 412
},
{
"epoch": 0.9951807228915662,
"grad_norm": 0.1728515625,
"learning_rate": 4.819277108433736e-08,
"loss": 1.1682,
"step": 413
},
{
"epoch": 0.9975903614457832,
"grad_norm": 0.1875,
"learning_rate": 2.409638554216868e-08,
"loss": 1.2771,
"step": 414
},
{
"epoch": 1.0,
"grad_norm": 0.1865234375,
"learning_rate": 0.0,
"loss": 1.1797,
"step": 415
}
],
"logging_steps": 1.0,
"max_steps": 415,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3142898503833354e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}