{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002,
"grad_norm": 6.779170989990234,
"learning_rate": 6.666666666666668e-08,
"loss": 1.2436,
"step": 1
},
{
"epoch": 0.004,
"grad_norm": 6.8514909744262695,
"learning_rate": 1.3333333333333336e-07,
"loss": 1.2722,
"step": 2
},
{
"epoch": 0.006,
"grad_norm": 6.65818977355957,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.2199,
"step": 3
},
{
"epoch": 0.008,
"grad_norm": 6.83758020401001,
"learning_rate": 2.666666666666667e-07,
"loss": 1.2601,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 6.710981369018555,
"learning_rate": 3.3333333333333335e-07,
"loss": 1.279,
"step": 5
},
{
"epoch": 0.012,
"grad_norm": 6.6265668869018555,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.2378,
"step": 6
},
{
"epoch": 0.014,
"grad_norm": 6.815594673156738,
"learning_rate": 4.666666666666667e-07,
"loss": 1.2796,
"step": 7
},
{
"epoch": 0.016,
"grad_norm": 6.420166969299316,
"learning_rate": 5.333333333333335e-07,
"loss": 1.2348,
"step": 8
},
{
"epoch": 0.018,
"grad_norm": 6.717895030975342,
"learning_rate": 6.000000000000001e-07,
"loss": 1.2454,
"step": 9
},
{
"epoch": 0.02,
"grad_norm": 6.134315490722656,
"learning_rate": 6.666666666666667e-07,
"loss": 1.2112,
"step": 10
},
{
"epoch": 0.022,
"grad_norm": 6.2358598709106445,
"learning_rate": 7.333333333333334e-07,
"loss": 1.2471,
"step": 11
},
{
"epoch": 0.024,
"grad_norm": 6.138259410858154,
"learning_rate": 8.000000000000001e-07,
"loss": 1.2422,
"step": 12
},
{
"epoch": 0.026,
"grad_norm": 4.941827297210693,
"learning_rate": 8.666666666666668e-07,
"loss": 1.2303,
"step": 13
},
{
"epoch": 0.028,
"grad_norm": 4.79518461227417,
"learning_rate": 9.333333333333334e-07,
"loss": 1.2304,
"step": 14
},
{
"epoch": 0.03,
"grad_norm": 4.52637243270874,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.2018,
"step": 15
},
{
"epoch": 0.032,
"grad_norm": 4.465169906616211,
"learning_rate": 1.066666666666667e-06,
"loss": 1.2221,
"step": 16
},
{
"epoch": 0.034,
"grad_norm": 2.7714080810546875,
"learning_rate": 1.1333333333333334e-06,
"loss": 1.139,
"step": 17
},
{
"epoch": 0.036,
"grad_norm": 2.8010947704315186,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.1654,
"step": 18
},
{
"epoch": 0.038,
"grad_norm": 2.7349588871002197,
"learning_rate": 1.2666666666666669e-06,
"loss": 1.1389,
"step": 19
},
{
"epoch": 0.04,
"grad_norm": 2.573075294494629,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.1583,
"step": 20
},
{
"epoch": 0.042,
"grad_norm": 2.594188928604126,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.1342,
"step": 21
},
{
"epoch": 0.044,
"grad_norm": 2.4305062294006348,
"learning_rate": 1.4666666666666669e-06,
"loss": 1.159,
"step": 22
},
{
"epoch": 0.046,
"grad_norm": 2.1182713508605957,
"learning_rate": 1.5333333333333334e-06,
"loss": 1.1454,
"step": 23
},
{
"epoch": 0.048,
"grad_norm": 3.179542064666748,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.1222,
"step": 24
},
{
"epoch": 0.05,
"grad_norm": 3.3224637508392334,
"learning_rate": 1.6666666666666667e-06,
"loss": 1.123,
"step": 25
},
{
"epoch": 0.052,
"grad_norm": 3.318582773208618,
"learning_rate": 1.7333333333333336e-06,
"loss": 1.1258,
"step": 26
},
{
"epoch": 0.054,
"grad_norm": 3.051558494567871,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.0751,
"step": 27
},
{
"epoch": 0.056,
"grad_norm": 2.4479663372039795,
"learning_rate": 1.8666666666666669e-06,
"loss": 1.0648,
"step": 28
},
{
"epoch": 0.058,
"grad_norm": 2.288482904434204,
"learning_rate": 1.9333333333333336e-06,
"loss": 1.0898,
"step": 29
},
{
"epoch": 0.06,
"grad_norm": 2.026151657104492,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.0935,
"step": 30
},
{
"epoch": 0.062,
"grad_norm": 1.4403475522994995,
"learning_rate": 2.0666666666666666e-06,
"loss": 1.0833,
"step": 31
},
{
"epoch": 0.064,
"grad_norm": 1.496666431427002,
"learning_rate": 2.133333333333334e-06,
"loss": 1.0441,
"step": 32
},
{
"epoch": 0.066,
"grad_norm": 1.6388070583343506,
"learning_rate": 2.2e-06,
"loss": 1.0262,
"step": 33
},
{
"epoch": 0.068,
"grad_norm": 1.682080626487732,
"learning_rate": 2.266666666666667e-06,
"loss": 1.0388,
"step": 34
},
{
"epoch": 0.07,
"grad_norm": 1.5605063438415527,
"learning_rate": 2.3333333333333336e-06,
"loss": 1.037,
"step": 35
},
{
"epoch": 0.072,
"grad_norm": 1.287360429763794,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.0476,
"step": 36
},
{
"epoch": 0.074,
"grad_norm": 1.0461649894714355,
"learning_rate": 2.466666666666667e-06,
"loss": 1.0496,
"step": 37
},
{
"epoch": 0.076,
"grad_norm": 1.0821465253829956,
"learning_rate": 2.5333333333333338e-06,
"loss": 1.0307,
"step": 38
},
{
"epoch": 0.078,
"grad_norm": 1.061495065689087,
"learning_rate": 2.6e-06,
"loss": 1.0279,
"step": 39
},
{
"epoch": 0.08,
"grad_norm": 1.0558407306671143,
"learning_rate": 2.666666666666667e-06,
"loss": 1.0201,
"step": 40
},
{
"epoch": 0.082,
"grad_norm": 0.923545777797699,
"learning_rate": 2.7333333333333336e-06,
"loss": 1.0028,
"step": 41
},
{
"epoch": 0.084,
"grad_norm": 0.8768528699874878,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.0085,
"step": 42
},
{
"epoch": 0.086,
"grad_norm": 0.891363263130188,
"learning_rate": 2.866666666666667e-06,
"loss": 1.0024,
"step": 43
},
{
"epoch": 0.088,
"grad_norm": 0.8647627830505371,
"learning_rate": 2.9333333333333338e-06,
"loss": 0.9924,
"step": 44
},
{
"epoch": 0.09,
"grad_norm": 0.8449153304100037,
"learning_rate": 3e-06,
"loss": 1.0107,
"step": 45
},
{
"epoch": 0.092,
"grad_norm": 0.7374930381774902,
"learning_rate": 3.066666666666667e-06,
"loss": 0.9944,
"step": 46
},
{
"epoch": 0.094,
"grad_norm": 0.6910094022750854,
"learning_rate": 3.133333333333334e-06,
"loss": 0.9756,
"step": 47
},
{
"epoch": 0.096,
"grad_norm": 0.7899953722953796,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.9564,
"step": 48
},
{
"epoch": 0.098,
"grad_norm": 0.7347140312194824,
"learning_rate": 3.266666666666667e-06,
"loss": 0.9787,
"step": 49
},
{
"epoch": 0.1,
"grad_norm": 0.71909499168396,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.9744,
"step": 50
},
{
"epoch": 0.102,
"grad_norm": 0.704480767250061,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.9686,
"step": 51
},
{
"epoch": 0.104,
"grad_norm": 0.6947300434112549,
"learning_rate": 3.4666666666666672e-06,
"loss": 0.9644,
"step": 52
},
{
"epoch": 0.106,
"grad_norm": 0.66453617811203,
"learning_rate": 3.5333333333333335e-06,
"loss": 0.9486,
"step": 53
},
{
"epoch": 0.108,
"grad_norm": 0.6524918675422668,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.9473,
"step": 54
},
{
"epoch": 0.11,
"grad_norm": 0.7257620096206665,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.9851,
"step": 55
},
{
"epoch": 0.112,
"grad_norm": 0.6214912533760071,
"learning_rate": 3.7333333333333337e-06,
"loss": 0.9226,
"step": 56
},
{
"epoch": 0.114,
"grad_norm": 0.6151769161224365,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.9252,
"step": 57
},
{
"epoch": 0.116,
"grad_norm": 0.6252180337905884,
"learning_rate": 3.866666666666667e-06,
"loss": 0.9495,
"step": 58
},
{
"epoch": 0.118,
"grad_norm": 0.6552236080169678,
"learning_rate": 3.9333333333333335e-06,
"loss": 0.9693,
"step": 59
},
{
"epoch": 0.12,
"grad_norm": 0.648305356502533,
"learning_rate": 4.000000000000001e-06,
"loss": 0.9473,
"step": 60
},
{
"epoch": 0.122,
"grad_norm": 0.6331591010093689,
"learning_rate": 4.066666666666667e-06,
"loss": 0.9563,
"step": 61
},
{
"epoch": 0.124,
"grad_norm": 0.6448110342025757,
"learning_rate": 4.133333333333333e-06,
"loss": 0.9355,
"step": 62
},
{
"epoch": 0.126,
"grad_norm": 0.6444178223609924,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.9706,
"step": 63
},
{
"epoch": 0.128,
"grad_norm": 0.6279839873313904,
"learning_rate": 4.266666666666668e-06,
"loss": 0.955,
"step": 64
},
{
"epoch": 0.13,
"grad_norm": 0.6672361493110657,
"learning_rate": 4.333333333333334e-06,
"loss": 0.9396,
"step": 65
},
{
"epoch": 0.132,
"grad_norm": 0.6860214471817017,
"learning_rate": 4.4e-06,
"loss": 0.9023,
"step": 66
},
{
"epoch": 0.134,
"grad_norm": 0.5856081247329712,
"learning_rate": 4.4666666666666665e-06,
"loss": 0.914,
"step": 67
},
{
"epoch": 0.136,
"grad_norm": 0.5969718098640442,
"learning_rate": 4.533333333333334e-06,
"loss": 0.9292,
"step": 68
},
{
"epoch": 0.138,
"grad_norm": 0.6717826128005981,
"learning_rate": 4.600000000000001e-06,
"loss": 0.9057,
"step": 69
},
{
"epoch": 0.14,
"grad_norm": 0.6435735821723938,
"learning_rate": 4.666666666666667e-06,
"loss": 0.9358,
"step": 70
},
{
"epoch": 0.142,
"grad_norm": 0.6428582668304443,
"learning_rate": 4.7333333333333335e-06,
"loss": 0.9357,
"step": 71
},
{
"epoch": 0.144,
"grad_norm": 0.6402033567428589,
"learning_rate": 4.800000000000001e-06,
"loss": 0.9601,
"step": 72
},
{
"epoch": 0.146,
"grad_norm": 0.5693813562393188,
"learning_rate": 4.866666666666667e-06,
"loss": 0.9408,
"step": 73
},
{
"epoch": 0.148,
"grad_norm": 0.604894757270813,
"learning_rate": 4.933333333333334e-06,
"loss": 0.9143,
"step": 74
},
{
"epoch": 0.15,
"grad_norm": 0.5587258338928223,
"learning_rate": 5e-06,
"loss": 0.9461,
"step": 75
},
{
"epoch": 0.152,
"grad_norm": 0.5895867347717285,
"learning_rate": 5.0666666666666676e-06,
"loss": 0.9324,
"step": 76
},
{
"epoch": 0.154,
"grad_norm": 0.6396773457527161,
"learning_rate": 5.133333333333334e-06,
"loss": 0.9284,
"step": 77
},
{
"epoch": 0.156,
"grad_norm": 0.6042245030403137,
"learning_rate": 5.2e-06,
"loss": 0.9189,
"step": 78
},
{
"epoch": 0.158,
"grad_norm": 0.7052513957023621,
"learning_rate": 5.2666666666666665e-06,
"loss": 0.8917,
"step": 79
},
{
"epoch": 0.16,
"grad_norm": 0.5608431696891785,
"learning_rate": 5.333333333333334e-06,
"loss": 0.9101,
"step": 80
},
{
"epoch": 0.162,
"grad_norm": 0.6625503301620483,
"learning_rate": 5.400000000000001e-06,
"loss": 0.9442,
"step": 81
},
{
"epoch": 0.164,
"grad_norm": 0.7061005234718323,
"learning_rate": 5.466666666666667e-06,
"loss": 0.9173,
"step": 82
},
{
"epoch": 0.166,
"grad_norm": 0.6100620627403259,
"learning_rate": 5.533333333333334e-06,
"loss": 0.8942,
"step": 83
},
{
"epoch": 0.168,
"grad_norm": 0.5722076296806335,
"learning_rate": 5.600000000000001e-06,
"loss": 0.9025,
"step": 84
},
{
"epoch": 0.17,
"grad_norm": 0.5684266686439514,
"learning_rate": 5.666666666666667e-06,
"loss": 0.9191,
"step": 85
},
{
"epoch": 0.172,
"grad_norm": 0.5633739233016968,
"learning_rate": 5.733333333333334e-06,
"loss": 0.9195,
"step": 86
},
{
"epoch": 0.174,
"grad_norm": 0.5535778999328613,
"learning_rate": 5.8e-06,
"loss": 0.8992,
"step": 87
},
{
"epoch": 0.176,
"grad_norm": 0.5984314680099487,
"learning_rate": 5.8666666666666675e-06,
"loss": 0.9046,
"step": 88
},
{
"epoch": 0.178,
"grad_norm": 0.5641950964927673,
"learning_rate": 5.933333333333335e-06,
"loss": 0.9309,
"step": 89
},
{
"epoch": 0.18,
"grad_norm": 0.6352724432945251,
"learning_rate": 6e-06,
"loss": 0.9449,
"step": 90
},
{
"epoch": 0.182,
"grad_norm": 0.655491054058075,
"learning_rate": 6.066666666666667e-06,
"loss": 0.959,
"step": 91
},
{
"epoch": 0.184,
"grad_norm": 0.5627034902572632,
"learning_rate": 6.133333333333334e-06,
"loss": 0.9479,
"step": 92
},
{
"epoch": 0.186,
"grad_norm": 0.7245451807975769,
"learning_rate": 6.200000000000001e-06,
"loss": 0.8791,
"step": 93
},
{
"epoch": 0.188,
"grad_norm": 0.6565855145454407,
"learning_rate": 6.266666666666668e-06,
"loss": 0.9234,
"step": 94
},
{
"epoch": 0.19,
"grad_norm": 0.5744818449020386,
"learning_rate": 6.333333333333333e-06,
"loss": 0.9209,
"step": 95
},
{
"epoch": 0.192,
"grad_norm": 0.7991737127304077,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.931,
"step": 96
},
{
"epoch": 0.194,
"grad_norm": 0.542374312877655,
"learning_rate": 6.466666666666667e-06,
"loss": 0.9381,
"step": 97
},
{
"epoch": 0.196,
"grad_norm": 0.6645175814628601,
"learning_rate": 6.533333333333334e-06,
"loss": 0.9074,
"step": 98
},
{
"epoch": 0.198,
"grad_norm": 0.6946215033531189,
"learning_rate": 6.600000000000001e-06,
"loss": 0.9359,
"step": 99
},
{
"epoch": 0.2,
"grad_norm": 0.6256315112113953,
"learning_rate": 6.666666666666667e-06,
"loss": 0.9097,
"step": 100
},
{
"epoch": 0.202,
"grad_norm": 0.6502851843833923,
"learning_rate": 6.733333333333334e-06,
"loss": 0.9355,
"step": 101
},
{
"epoch": 0.204,
"grad_norm": 0.6452805399894714,
"learning_rate": 6.800000000000001e-06,
"loss": 0.8883,
"step": 102
},
{
"epoch": 0.206,
"grad_norm": 0.6026374697685242,
"learning_rate": 6.866666666666667e-06,
"loss": 0.9054,
"step": 103
},
{
"epoch": 0.208,
"grad_norm": 0.5819752812385559,
"learning_rate": 6.9333333333333344e-06,
"loss": 0.9196,
"step": 104
},
{
"epoch": 0.21,
"grad_norm": 0.74315345287323,
"learning_rate": 7e-06,
"loss": 0.9175,
"step": 105
},
{
"epoch": 0.212,
"grad_norm": 0.6989079713821411,
"learning_rate": 7.066666666666667e-06,
"loss": 0.9186,
"step": 106
},
{
"epoch": 0.214,
"grad_norm": 0.6356122493743896,
"learning_rate": 7.133333333333334e-06,
"loss": 0.9102,
"step": 107
},
{
"epoch": 0.216,
"grad_norm": 0.7254390120506287,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.9181,
"step": 108
},
{
"epoch": 0.218,
"grad_norm": 0.6062964200973511,
"learning_rate": 7.266666666666668e-06,
"loss": 0.8487,
"step": 109
},
{
"epoch": 0.22,
"grad_norm": 0.6720328330993652,
"learning_rate": 7.333333333333333e-06,
"loss": 0.8934,
"step": 110
},
{
"epoch": 0.222,
"grad_norm": 0.6350739002227783,
"learning_rate": 7.4e-06,
"loss": 0.927,
"step": 111
},
{
"epoch": 0.224,
"grad_norm": 0.5598141551017761,
"learning_rate": 7.4666666666666675e-06,
"loss": 0.9275,
"step": 112
},
{
"epoch": 0.226,
"grad_norm": 0.6055253744125366,
"learning_rate": 7.533333333333334e-06,
"loss": 0.9078,
"step": 113
},
{
"epoch": 0.228,
"grad_norm": 0.5682628154754639,
"learning_rate": 7.600000000000001e-06,
"loss": 0.9462,
"step": 114
},
{
"epoch": 0.23,
"grad_norm": 0.5697638988494873,
"learning_rate": 7.666666666666667e-06,
"loss": 0.9008,
"step": 115
},
{
"epoch": 0.232,
"grad_norm": 0.6082512736320496,
"learning_rate": 7.733333333333334e-06,
"loss": 0.8878,
"step": 116
},
{
"epoch": 0.234,
"grad_norm": 0.6491492390632629,
"learning_rate": 7.800000000000002e-06,
"loss": 0.9021,
"step": 117
},
{
"epoch": 0.236,
"grad_norm": 0.5754338502883911,
"learning_rate": 7.866666666666667e-06,
"loss": 0.9012,
"step": 118
},
{
"epoch": 0.238,
"grad_norm": 0.6297829151153564,
"learning_rate": 7.933333333333334e-06,
"loss": 0.936,
"step": 119
},
{
"epoch": 0.24,
"grad_norm": 0.6097427606582642,
"learning_rate": 8.000000000000001e-06,
"loss": 0.8952,
"step": 120
},
{
"epoch": 0.242,
"grad_norm": 0.5730277895927429,
"learning_rate": 8.066666666666667e-06,
"loss": 0.9103,
"step": 121
},
{
"epoch": 0.244,
"grad_norm": 0.48332902789115906,
"learning_rate": 8.133333333333334e-06,
"loss": 0.9291,
"step": 122
},
{
"epoch": 0.246,
"grad_norm": 0.554993212223053,
"learning_rate": 8.2e-06,
"loss": 0.905,
"step": 123
},
{
"epoch": 0.248,
"grad_norm": 0.5835518836975098,
"learning_rate": 8.266666666666667e-06,
"loss": 0.8933,
"step": 124
},
{
"epoch": 0.25,
"grad_norm": 0.5752037763595581,
"learning_rate": 8.333333333333334e-06,
"loss": 0.905,
"step": 125
},
{
"epoch": 0.252,
"grad_norm": 0.5807712078094482,
"learning_rate": 8.400000000000001e-06,
"loss": 0.8634,
"step": 126
},
{
"epoch": 0.254,
"grad_norm": 0.6821209788322449,
"learning_rate": 8.466666666666668e-06,
"loss": 0.8879,
"step": 127
},
{
"epoch": 0.256,
"grad_norm": 0.5632617473602295,
"learning_rate": 8.533333333333335e-06,
"loss": 0.8993,
"step": 128
},
{
"epoch": 0.258,
"grad_norm": 0.58171546459198,
"learning_rate": 8.6e-06,
"loss": 0.8596,
"step": 129
},
{
"epoch": 0.26,
"grad_norm": 0.6329283714294434,
"learning_rate": 8.666666666666668e-06,
"loss": 0.8848,
"step": 130
},
{
"epoch": 0.262,
"grad_norm": 0.5913123488426208,
"learning_rate": 8.733333333333333e-06,
"loss": 0.8715,
"step": 131
},
{
"epoch": 0.264,
"grad_norm": 0.6618683338165283,
"learning_rate": 8.8e-06,
"loss": 0.898,
"step": 132
},
{
"epoch": 0.266,
"grad_norm": 0.6601865887641907,
"learning_rate": 8.866666666666668e-06,
"loss": 0.8785,
"step": 133
},
{
"epoch": 0.268,
"grad_norm": 0.715740442276001,
"learning_rate": 8.933333333333333e-06,
"loss": 0.8904,
"step": 134
},
{
"epoch": 0.27,
"grad_norm": 0.7350261211395264,
"learning_rate": 9e-06,
"loss": 0.9241,
"step": 135
},
{
"epoch": 0.272,
"grad_norm": 0.6438612341880798,
"learning_rate": 9.066666666666667e-06,
"loss": 0.8931,
"step": 136
},
{
"epoch": 0.274,
"grad_norm": 0.8216118812561035,
"learning_rate": 9.133333333333335e-06,
"loss": 0.8773,
"step": 137
},
{
"epoch": 0.276,
"grad_norm": 0.6327617764472961,
"learning_rate": 9.200000000000002e-06,
"loss": 0.8808,
"step": 138
},
{
"epoch": 0.278,
"grad_norm": 0.6212260723114014,
"learning_rate": 9.266666666666667e-06,
"loss": 0.8811,
"step": 139
},
{
"epoch": 0.28,
"grad_norm": 0.627037525177002,
"learning_rate": 9.333333333333334e-06,
"loss": 0.9266,
"step": 140
},
{
"epoch": 0.282,
"grad_norm": 0.624682605266571,
"learning_rate": 9.4e-06,
"loss": 0.9108,
"step": 141
},
{
"epoch": 0.284,
"grad_norm": 0.6022452712059021,
"learning_rate": 9.466666666666667e-06,
"loss": 0.9115,
"step": 142
},
{
"epoch": 0.286,
"grad_norm": 0.5518873333930969,
"learning_rate": 9.533333333333334e-06,
"loss": 0.8877,
"step": 143
},
{
"epoch": 0.288,
"grad_norm": 0.7182208299636841,
"learning_rate": 9.600000000000001e-06,
"loss": 0.8841,
"step": 144
},
{
"epoch": 0.29,
"grad_norm": 0.6279104351997375,
"learning_rate": 9.666666666666667e-06,
"loss": 0.8631,
"step": 145
},
{
"epoch": 0.292,
"grad_norm": 0.5953511595726013,
"learning_rate": 9.733333333333334e-06,
"loss": 0.8906,
"step": 146
},
{
"epoch": 0.294,
"grad_norm": 0.9255452752113342,
"learning_rate": 9.800000000000001e-06,
"loss": 0.8899,
"step": 147
},
{
"epoch": 0.296,
"grad_norm": 0.6811776161193848,
"learning_rate": 9.866666666666668e-06,
"loss": 0.9043,
"step": 148
},
{
"epoch": 0.298,
"grad_norm": 0.8268923759460449,
"learning_rate": 9.933333333333334e-06,
"loss": 0.8981,
"step": 149
},
{
"epoch": 0.3,
"grad_norm": 0.8442743420600891,
"learning_rate": 1e-05,
"loss": 0.8739,
"step": 150
},
{
"epoch": 0.302,
"grad_norm": 0.8098756670951843,
"learning_rate": 9.99998646145412e-06,
"loss": 0.8607,
"step": 151
},
{
"epoch": 0.304,
"grad_norm": 0.8181275129318237,
"learning_rate": 9.999945845889795e-06,
"loss": 0.8963,
"step": 152
},
{
"epoch": 0.306,
"grad_norm": 0.6322422623634338,
"learning_rate": 9.999878153526974e-06,
"loss": 0.8965,
"step": 153
},
{
"epoch": 0.308,
"grad_norm": 0.8551559448242188,
"learning_rate": 9.999783384732242e-06,
"loss": 0.902,
"step": 154
},
{
"epoch": 0.31,
"grad_norm": 0.6801779270172119,
"learning_rate": 9.999661540018812e-06,
"loss": 0.8598,
"step": 155
},
{
"epoch": 0.312,
"grad_norm": 0.7247231006622314,
"learning_rate": 9.999512620046523e-06,
"loss": 0.9204,
"step": 156
},
{
"epoch": 0.314,
"grad_norm": 0.8099005818367004,
"learning_rate": 9.999336625621836e-06,
"loss": 0.8934,
"step": 157
},
{
"epoch": 0.316,
"grad_norm": 0.6449592113494873,
"learning_rate": 9.99913355769784e-06,
"loss": 0.8633,
"step": 158
},
{
"epoch": 0.318,
"grad_norm": 0.7183147668838501,
"learning_rate": 9.998903417374228e-06,
"loss": 0.894,
"step": 159
},
{
"epoch": 0.32,
"grad_norm": 0.6774334907531738,
"learning_rate": 9.99864620589731e-06,
"loss": 0.9034,
"step": 160
},
{
"epoch": 0.322,
"grad_norm": 0.6847894787788391,
"learning_rate": 9.998361924659989e-06,
"loss": 0.9033,
"step": 161
},
{
"epoch": 0.324,
"grad_norm": 0.6603731513023376,
"learning_rate": 9.998050575201772e-06,
"loss": 0.8633,
"step": 162
},
{
"epoch": 0.326,
"grad_norm": 0.6944355368614197,
"learning_rate": 9.997712159208745e-06,
"loss": 0.8819,
"step": 163
},
{
"epoch": 0.328,
"grad_norm": 0.6257456541061401,
"learning_rate": 9.99734667851357e-06,
"loss": 0.8876,
"step": 164
},
{
"epoch": 0.33,
"grad_norm": 0.702239453792572,
"learning_rate": 9.99695413509548e-06,
"loss": 0.8782,
"step": 165
},
{
"epoch": 0.332,
"grad_norm": 0.5888884663581848,
"learning_rate": 9.99653453108026e-06,
"loss": 0.8554,
"step": 166
},
{
"epoch": 0.334,
"grad_norm": 0.6543298959732056,
"learning_rate": 9.996087868740244e-06,
"loss": 0.8662,
"step": 167
},
{
"epoch": 0.336,
"grad_norm": 0.8132950663566589,
"learning_rate": 9.995614150494293e-06,
"loss": 0.9093,
"step": 168
},
{
"epoch": 0.338,
"grad_norm": 0.5895329117774963,
"learning_rate": 9.995113378907791e-06,
"loss": 0.8916,
"step": 169
},
{
"epoch": 0.34,
"grad_norm": 0.8918687701225281,
"learning_rate": 9.994585556692624e-06,
"loss": 0.8586,
"step": 170
},
{
"epoch": 0.342,
"grad_norm": 0.8333307504653931,
"learning_rate": 9.994030686707171e-06,
"loss": 0.8919,
"step": 171
},
{
"epoch": 0.344,
"grad_norm": 0.737712025642395,
"learning_rate": 9.993448771956285e-06,
"loss": 0.9085,
"step": 172
},
{
"epoch": 0.346,
"grad_norm": 0.7404334545135498,
"learning_rate": 9.99283981559128e-06,
"loss": 0.8941,
"step": 173
},
{
"epoch": 0.348,
"grad_norm": 0.7040583491325378,
"learning_rate": 9.992203820909906e-06,
"loss": 0.8901,
"step": 174
},
{
"epoch": 0.35,
"grad_norm": 0.7103179097175598,
"learning_rate": 9.991540791356342e-06,
"loss": 0.8806,
"step": 175
},
{
"epoch": 0.352,
"grad_norm": 0.7652331590652466,
"learning_rate": 9.99085073052117e-06,
"loss": 0.8689,
"step": 176
},
{
"epoch": 0.354,
"grad_norm": 0.6264358758926392,
"learning_rate": 9.990133642141359e-06,
"loss": 0.9046,
"step": 177
},
{
"epoch": 0.356,
"grad_norm": 0.8190493583679199,
"learning_rate": 9.989389530100242e-06,
"loss": 0.9127,
"step": 178
},
{
"epoch": 0.358,
"grad_norm": 0.6480888724327087,
"learning_rate": 9.988618398427495e-06,
"loss": 0.8533,
"step": 179
},
{
"epoch": 0.36,
"grad_norm": 0.7520120739936829,
"learning_rate": 9.987820251299121e-06,
"loss": 0.8705,
"step": 180
},
{
"epoch": 0.362,
"grad_norm": 0.8041049242019653,
"learning_rate": 9.986995093037422e-06,
"loss": 0.8946,
"step": 181
},
{
"epoch": 0.364,
"grad_norm": 0.6484959125518799,
"learning_rate": 9.986142928110972e-06,
"loss": 0.8988,
"step": 182
},
{
"epoch": 0.366,
"grad_norm": 0.7548107504844666,
"learning_rate": 9.985263761134602e-06,
"loss": 0.875,
"step": 183
},
{
"epoch": 0.368,
"grad_norm": 0.6038651466369629,
"learning_rate": 9.984357596869369e-06,
"loss": 0.8696,
"step": 184
},
{
"epoch": 0.37,
"grad_norm": 0.761684000492096,
"learning_rate": 9.98342444022253e-06,
"loss": 0.9005,
"step": 185
},
{
"epoch": 0.372,
"grad_norm": 0.7544848322868347,
"learning_rate": 9.982464296247523e-06,
"loss": 0.8743,
"step": 186
},
{
"epoch": 0.374,
"grad_norm": 0.6661540865898132,
"learning_rate": 9.981477170143924e-06,
"loss": 0.8793,
"step": 187
},
{
"epoch": 0.376,
"grad_norm": 0.6984407305717468,
"learning_rate": 9.980463067257437e-06,
"loss": 0.8982,
"step": 188
},
{
"epoch": 0.378,
"grad_norm": 0.7096168994903564,
"learning_rate": 9.979421993079853e-06,
"loss": 0.8327,
"step": 189
},
{
"epoch": 0.38,
"grad_norm": 0.7101868391036987,
"learning_rate": 9.978353953249023e-06,
"loss": 0.8653,
"step": 190
},
{
"epoch": 0.382,
"grad_norm": 0.6241440773010254,
"learning_rate": 9.977258953548831e-06,
"loss": 0.8479,
"step": 191
},
{
"epoch": 0.384,
"grad_norm": 0.8079991936683655,
"learning_rate": 9.976136999909156e-06,
"loss": 0.8801,
"step": 192
},
{
"epoch": 0.386,
"grad_norm": 0.8028410077095032,
"learning_rate": 9.97498809840585e-06,
"loss": 0.8703,
"step": 193
},
{
"epoch": 0.388,
"grad_norm": 0.8198662400245667,
"learning_rate": 9.973812255260692e-06,
"loss": 0.8566,
"step": 194
},
{
"epoch": 0.39,
"grad_norm": 0.8669592142105103,
"learning_rate": 9.972609476841368e-06,
"loss": 0.9185,
"step": 195
},
{
"epoch": 0.392,
"grad_norm": 0.8332952857017517,
"learning_rate": 9.971379769661422e-06,
"loss": 0.878,
"step": 196
},
{
"epoch": 0.394,
"grad_norm": 0.7698492407798767,
"learning_rate": 9.970123140380237e-06,
"loss": 0.91,
"step": 197
},
{
"epoch": 0.396,
"grad_norm": 0.8548876047134399,
"learning_rate": 9.968839595802982e-06,
"loss": 0.8952,
"step": 198
},
{
"epoch": 0.398,
"grad_norm": 0.6185474991798401,
"learning_rate": 9.967529142880592e-06,
"loss": 0.9048,
"step": 199
},
{
"epoch": 0.4,
"grad_norm": 0.8127085566520691,
"learning_rate": 9.966191788709716e-06,
"loss": 0.9067,
"step": 200
},
{
"epoch": 0.402,
"grad_norm": 0.6378841400146484,
"learning_rate": 9.964827540532685e-06,
"loss": 0.8992,
"step": 201
},
{
"epoch": 0.404,
"grad_norm": 0.8299638032913208,
"learning_rate": 9.963436405737476e-06,
"loss": 0.8849,
"step": 202
},
{
"epoch": 0.406,
"grad_norm": 0.8502849340438843,
"learning_rate": 9.962018391857665e-06,
"loss": 0.88,
"step": 203
},
{
"epoch": 0.408,
"grad_norm": 0.8172805905342102,
"learning_rate": 9.960573506572391e-06,
"loss": 0.9101,
"step": 204
},
{
"epoch": 0.41,
"grad_norm": 0.7766000628471375,
"learning_rate": 9.959101757706308e-06,
"loss": 0.9005,
"step": 205
},
{
"epoch": 0.412,
"grad_norm": 0.8308401107788086,
"learning_rate": 9.957603153229559e-06,
"loss": 0.8975,
"step": 206
},
{
"epoch": 0.414,
"grad_norm": 0.7129584550857544,
"learning_rate": 9.95607770125771e-06,
"loss": 0.8636,
"step": 207
},
{
"epoch": 0.416,
"grad_norm": 0.7642067074775696,
"learning_rate": 9.95452541005172e-06,
"loss": 0.866,
"step": 208
},
{
"epoch": 0.418,
"grad_norm": 0.9380465745925903,
"learning_rate": 9.952946288017899e-06,
"loss": 0.8999,
"step": 209
},
{
"epoch": 0.42,
"grad_norm": 0.7058805823326111,
"learning_rate": 9.951340343707852e-06,
"loss": 0.9092,
"step": 210
},
{
"epoch": 0.422,
"grad_norm": 0.7754363417625427,
"learning_rate": 9.94970758581844e-06,
"loss": 0.8728,
"step": 211
},
{
"epoch": 0.424,
"grad_norm": 0.7104613184928894,
"learning_rate": 9.948048023191728e-06,
"loss": 0.8731,
"step": 212
},
{
"epoch": 0.426,
"grad_norm": 0.820107102394104,
"learning_rate": 9.946361664814942e-06,
"loss": 0.9037,
"step": 213
},
{
"epoch": 0.428,
"grad_norm": 0.611301600933075,
"learning_rate": 9.94464851982042e-06,
"loss": 0.8822,
"step": 214
},
{
"epoch": 0.43,
"grad_norm": 0.7367827296257019,
"learning_rate": 9.942908597485558e-06,
"loss": 0.8675,
"step": 215
},
{
"epoch": 0.432,
"grad_norm": 0.595421552658081,
"learning_rate": 9.941141907232766e-06,
"loss": 0.9004,
"step": 216
},
{
"epoch": 0.434,
"grad_norm": 0.8655984997749329,
"learning_rate": 9.939348458629406e-06,
"loss": 0.8943,
"step": 217
},
{
"epoch": 0.436,
"grad_norm": 0.687925398349762,
"learning_rate": 9.937528261387753e-06,
"loss": 0.902,
"step": 218
},
{
"epoch": 0.438,
"grad_norm": 0.7494369149208069,
"learning_rate": 9.93568132536494e-06,
"loss": 0.8586,
"step": 219
},
{
"epoch": 0.44,
"grad_norm": 0.72107994556427,
"learning_rate": 9.933807660562898e-06,
"loss": 0.8662,
"step": 220
},
{
"epoch": 0.442,
"grad_norm": 0.7555108666419983,
"learning_rate": 9.9319072771283e-06,
"loss": 0.8689,
"step": 221
},
{
"epoch": 0.444,
"grad_norm": 0.6739816665649414,
"learning_rate": 9.929980185352525e-06,
"loss": 0.8736,
"step": 222
},
{
"epoch": 0.446,
"grad_norm": 0.6736449003219604,
"learning_rate": 9.928026395671577e-06,
"loss": 0.8675,
"step": 223
},
{
"epoch": 0.448,
"grad_norm": 0.7635306119918823,
"learning_rate": 9.926045918666045e-06,
"loss": 0.8943,
"step": 224
},
{
"epoch": 0.45,
"grad_norm": 0.647369384765625,
"learning_rate": 9.924038765061042e-06,
"loss": 0.8695,
"step": 225
},
{
"epoch": 0.452,
"grad_norm": 0.8473477959632874,
"learning_rate": 9.92200494572614e-06,
"loss": 0.9111,
"step": 226
},
{
"epoch": 0.454,
"grad_norm": 0.6710712909698486,
"learning_rate": 9.919944471675328e-06,
"loss": 0.925,
"step": 227
},
{
"epoch": 0.456,
"grad_norm": 0.6937105655670166,
"learning_rate": 9.91785735406693e-06,
"loss": 0.8782,
"step": 228
},
{
"epoch": 0.458,
"grad_norm": 0.658285915851593,
"learning_rate": 9.915743604203563e-06,
"loss": 0.8742,
"step": 229
},
{
"epoch": 0.46,
"grad_norm": 0.6236475706100464,
"learning_rate": 9.913603233532067e-06,
"loss": 0.8671,
"step": 230
},
{
"epoch": 0.462,
"grad_norm": 0.689721941947937,
"learning_rate": 9.911436253643445e-06,
"loss": 0.8656,
"step": 231
},
{
"epoch": 0.464,
"grad_norm": 0.6586703062057495,
"learning_rate": 9.909242676272797e-06,
"loss": 0.8761,
"step": 232
},
{
"epoch": 0.466,
"grad_norm": 0.6571497917175293,
"learning_rate": 9.907022513299264e-06,
"loss": 0.8914,
"step": 233
},
{
"epoch": 0.468,
"grad_norm": 0.6405252814292908,
"learning_rate": 9.904775776745959e-06,
"loss": 0.8499,
"step": 234
},
{
"epoch": 0.47,
"grad_norm": 0.7265756726264954,
"learning_rate": 9.902502478779897e-06,
"loss": 0.849,
"step": 235
},
{
"epoch": 0.472,
"grad_norm": 0.7562282681465149,
"learning_rate": 9.90020263171194e-06,
"loss": 0.935,
"step": 236
},
{
"epoch": 0.474,
"grad_norm": 0.9941809773445129,
"learning_rate": 9.89787624799672e-06,
"loss": 0.8422,
"step": 237
},
{
"epoch": 0.476,
"grad_norm": 0.5920974016189575,
"learning_rate": 9.89552334023258e-06,
"loss": 0.8999,
"step": 238
},
{
"epoch": 0.478,
"grad_norm": 1.0161575078964233,
"learning_rate": 9.893143921161501e-06,
"loss": 0.849,
"step": 239
},
{
"epoch": 0.48,
"grad_norm": 0.5542423129081726,
"learning_rate": 9.890738003669029e-06,
"loss": 0.8743,
"step": 240
},
{
"epoch": 0.482,
"grad_norm": 0.8454483151435852,
"learning_rate": 9.888305600784217e-06,
"loss": 0.8796,
"step": 241
},
{
"epoch": 0.484,
"grad_norm": 0.6125505566596985,
"learning_rate": 9.88584672567954e-06,
"loss": 0.8423,
"step": 242
},
{
"epoch": 0.486,
"grad_norm": 0.6614047884941101,
"learning_rate": 9.883361391670841e-06,
"loss": 0.9044,
"step": 243
},
{
"epoch": 0.488,
"grad_norm": 0.6495211720466614,
"learning_rate": 9.880849612217238e-06,
"loss": 0.8751,
"step": 244
},
{
"epoch": 0.49,
"grad_norm": 0.6626394391059875,
"learning_rate": 9.878311400921072e-06,
"loss": 0.8748,
"step": 245
},
{
"epoch": 0.492,
"grad_norm": 0.780263364315033,
"learning_rate": 9.875746771527817e-06,
"loss": 0.8951,
"step": 246
},
{
"epoch": 0.494,
"grad_norm": 0.5627451539039612,
"learning_rate": 9.873155737926014e-06,
"loss": 0.8619,
"step": 247
},
{
"epoch": 0.496,
"grad_norm": 0.8236713409423828,
"learning_rate": 9.870538314147194e-06,
"loss": 0.8861,
"step": 248
},
{
"epoch": 0.498,
"grad_norm": 0.5814322829246521,
"learning_rate": 9.867894514365802e-06,
"loss": 0.8435,
"step": 249
},
{
"epoch": 0.5,
"grad_norm": 0.7435102462768555,
"learning_rate": 9.86522435289912e-06,
"loss": 0.847,
"step": 250
},
{
"epoch": 0.502,
"grad_norm": 0.6055560111999512,
"learning_rate": 9.862527844207189e-06,
"loss": 0.9026,
"step": 251
},
{
"epoch": 0.504,
"grad_norm": 0.7876644134521484,
"learning_rate": 9.859805002892733e-06,
"loss": 0.8963,
"step": 252
},
{
"epoch": 0.506,
"grad_norm": 0.7262117862701416,
"learning_rate": 9.857055843701073e-06,
"loss": 0.8749,
"step": 253
},
{
"epoch": 0.508,
"grad_norm": 0.6386765837669373,
"learning_rate": 9.85428038152006e-06,
"loss": 0.8652,
"step": 254
},
{
"epoch": 0.51,
"grad_norm": 0.6865687370300293,
"learning_rate": 9.851478631379982e-06,
"loss": 0.8486,
"step": 255
},
{
"epoch": 0.512,
"grad_norm": 0.5925199389457703,
"learning_rate": 9.84865060845349e-06,
"loss": 0.8874,
"step": 256
},
{
"epoch": 0.514,
"grad_norm": 0.7218337655067444,
"learning_rate": 9.845796328055505e-06,
"loss": 0.8516,
"step": 257
},
{
"epoch": 0.516,
"grad_norm": 0.5752958655357361,
"learning_rate": 9.842915805643156e-06,
"loss": 0.8895,
"step": 258
},
{
"epoch": 0.518,
"grad_norm": 0.7929096817970276,
"learning_rate": 9.840009056815674e-06,
"loss": 0.8975,
"step": 259
},
{
"epoch": 0.52,
"grad_norm": 0.6541514992713928,
"learning_rate": 9.83707609731432e-06,
"loss": 0.85,
"step": 260
},
{
"epoch": 0.522,
"grad_norm": 0.7700029015541077,
"learning_rate": 9.834116943022299e-06,
"loss": 0.8465,
"step": 261
},
{
"epoch": 0.524,
"grad_norm": 0.7397891283035278,
"learning_rate": 9.831131609964664e-06,
"loss": 0.858,
"step": 262
},
{
"epoch": 0.526,
"grad_norm": 0.7328517436981201,
"learning_rate": 9.828120114308248e-06,
"loss": 0.9133,
"step": 263
},
{
"epoch": 0.528,
"grad_norm": 0.6791554093360901,
"learning_rate": 9.825082472361558e-06,
"loss": 0.8991,
"step": 264
},
{
"epoch": 0.53,
"grad_norm": 0.633378267288208,
"learning_rate": 9.822018700574696e-06,
"loss": 0.875,
"step": 265
},
{
"epoch": 0.532,
"grad_norm": 0.6736602783203125,
"learning_rate": 9.818928815539266e-06,
"loss": 0.8935,
"step": 266
},
{
"epoch": 0.534,
"grad_norm": 0.6292229294776917,
"learning_rate": 9.815812833988292e-06,
"loss": 0.8636,
"step": 267
},
{
"epoch": 0.536,
"grad_norm": 0.5925595164299011,
"learning_rate": 9.812670772796113e-06,
"loss": 0.8796,
"step": 268
},
{
"epoch": 0.538,
"grad_norm": 0.6620059609413147,
"learning_rate": 9.809502648978311e-06,
"loss": 0.8507,
"step": 269
},
{
"epoch": 0.54,
"grad_norm": 0.590861976146698,
"learning_rate": 9.806308479691595e-06,
"loss": 0.8668,
"step": 270
},
{
"epoch": 0.542,
"grad_norm": 0.6953304409980774,
"learning_rate": 9.803088282233733e-06,
"loss": 0.8753,
"step": 271
},
{
"epoch": 0.544,
"grad_norm": 0.7721243500709534,
"learning_rate": 9.799842074043438e-06,
"loss": 0.888,
"step": 272
},
{
"epoch": 0.546,
"grad_norm": 0.6552896499633789,
"learning_rate": 9.796569872700287e-06,
"loss": 0.8614,
"step": 273
},
{
"epoch": 0.548,
"grad_norm": 0.7471222877502441,
"learning_rate": 9.793271695924621e-06,
"loss": 0.8587,
"step": 274
},
{
"epoch": 0.55,
"grad_norm": 0.6494088172912598,
"learning_rate": 9.789947561577445e-06,
"loss": 0.8382,
"step": 275
},
{
"epoch": 0.552,
"grad_norm": 0.6371944546699524,
"learning_rate": 9.786597487660336e-06,
"loss": 0.8866,
"step": 276
},
{
"epoch": 0.554,
"grad_norm": 0.6909160614013672,
"learning_rate": 9.78322149231535e-06,
"loss": 0.8749,
"step": 277
},
{
"epoch": 0.556,
"grad_norm": 0.544785737991333,
"learning_rate": 9.779819593824909e-06,
"loss": 0.8504,
"step": 278
},
{
"epoch": 0.558,
"grad_norm": 0.8029861450195312,
"learning_rate": 9.776391810611719e-06,
"loss": 0.8585,
"step": 279
},
{
"epoch": 0.56,
"grad_norm": 0.5448933243751526,
"learning_rate": 9.77293816123866e-06,
"loss": 0.8724,
"step": 280
},
{
"epoch": 0.562,
"grad_norm": 0.644865870475769,
"learning_rate": 9.769458664408689e-06,
"loss": 0.8534,
"step": 281
},
{
"epoch": 0.564,
"grad_norm": 0.6825750470161438,
"learning_rate": 9.765953338964736e-06,
"loss": 0.8765,
"step": 282
},
{
"epoch": 0.566,
"grad_norm": 0.5962495803833008,
"learning_rate": 9.762422203889604e-06,
"loss": 0.852,
"step": 283
},
{
"epoch": 0.568,
"grad_norm": 0.6786333918571472,
"learning_rate": 9.75886527830587e-06,
"loss": 0.8427,
"step": 284
},
{
"epoch": 0.57,
"grad_norm": 0.6863124370574951,
"learning_rate": 9.755282581475769e-06,
"loss": 0.8853,
"step": 285
},
{
"epoch": 0.572,
"grad_norm": 0.596169114112854,
"learning_rate": 9.751674132801106e-06,
"loss": 0.8583,
"step": 286
},
{
"epoch": 0.574,
"grad_norm": 0.6611921191215515,
"learning_rate": 9.748039951823141e-06,
"loss": 0.8584,
"step": 287
},
{
"epoch": 0.576,
"grad_norm": 0.5860627889633179,
"learning_rate": 9.744380058222483e-06,
"loss": 0.8574,
"step": 288
},
{
"epoch": 0.578,
"grad_norm": 0.5513158440589905,
"learning_rate": 9.740694471818988e-06,
"loss": 0.8507,
"step": 289
},
{
"epoch": 0.58,
"grad_norm": 0.6688372492790222,
"learning_rate": 9.736983212571646e-06,
"loss": 0.8638,
"step": 290
},
{
"epoch": 0.582,
"grad_norm": 0.6180363297462463,
"learning_rate": 9.733246300578482e-06,
"loss": 0.8778,
"step": 291
},
{
"epoch": 0.584,
"grad_norm": 0.7170926928520203,
"learning_rate": 9.729483756076436e-06,
"loss": 0.8586,
"step": 292
},
{
"epoch": 0.586,
"grad_norm": 0.6541260480880737,
"learning_rate": 9.72569559944126e-06,
"loss": 0.8799,
"step": 293
},
{
"epoch": 0.588,
"grad_norm": 0.5732930302619934,
"learning_rate": 9.721881851187406e-06,
"loss": 0.8669,
"step": 294
},
{
"epoch": 0.59,
"grad_norm": 0.6722111105918884,
"learning_rate": 9.718042531967918e-06,
"loss": 0.8898,
"step": 295
},
{
"epoch": 0.592,
"grad_norm": 0.5557838678359985,
"learning_rate": 9.714177662574316e-06,
"loss": 0.8883,
"step": 296
},
{
"epoch": 0.594,
"grad_norm": 0.7094323039054871,
"learning_rate": 9.710287263936485e-06,
"loss": 0.864,
"step": 297
},
{
"epoch": 0.596,
"grad_norm": 0.5642362236976624,
"learning_rate": 9.70637135712256e-06,
"loss": 0.8742,
"step": 298
},
{
"epoch": 0.598,
"grad_norm": 0.7420487403869629,
"learning_rate": 9.702429963338812e-06,
"loss": 0.8753,
"step": 299
},
{
"epoch": 0.6,
"grad_norm": 0.583194375038147,
"learning_rate": 9.698463103929542e-06,
"loss": 0.8638,
"step": 300
},
{
"epoch": 0.602,
"grad_norm": 0.714911162853241,
"learning_rate": 9.694470800376951e-06,
"loss": 0.8452,
"step": 301
},
{
"epoch": 0.604,
"grad_norm": 0.551218569278717,
"learning_rate": 9.690453074301035e-06,
"loss": 0.865,
"step": 302
},
{
"epoch": 0.606,
"grad_norm": 0.777683675289154,
"learning_rate": 9.68640994745946e-06,
"loss": 0.8554,
"step": 303
},
{
"epoch": 0.608,
"grad_norm": 0.6686451435089111,
"learning_rate": 9.682341441747446e-06,
"loss": 0.9091,
"step": 304
},
{
"epoch": 0.61,
"grad_norm": 0.847751259803772,
"learning_rate": 9.678247579197658e-06,
"loss": 0.8649,
"step": 305
},
{
"epoch": 0.612,
"grad_norm": 0.6995366215705872,
"learning_rate": 9.674128381980073e-06,
"loss": 0.8565,
"step": 306
},
{
"epoch": 0.614,
"grad_norm": 0.7599447965621948,
"learning_rate": 9.669983872401868e-06,
"loss": 0.8861,
"step": 307
},
{
"epoch": 0.616,
"grad_norm": 0.8379013538360596,
"learning_rate": 9.665814072907293e-06,
"loss": 0.8731,
"step": 308
},
{
"epoch": 0.618,
"grad_norm": 0.6999011039733887,
"learning_rate": 9.661619006077562e-06,
"loss": 0.8891,
"step": 309
},
{
"epoch": 0.62,
"grad_norm": 0.8724938035011292,
"learning_rate": 9.657398694630713e-06,
"loss": 0.8677,
"step": 310
},
{
"epoch": 0.622,
"grad_norm": 0.5456347465515137,
"learning_rate": 9.653153161421497e-06,
"loss": 0.8357,
"step": 311
},
{
"epoch": 0.624,
"grad_norm": 0.8137102127075195,
"learning_rate": 9.648882429441258e-06,
"loss": 0.864,
"step": 312
},
{
"epoch": 0.626,
"grad_norm": 0.6591514945030212,
"learning_rate": 9.644586521817792e-06,
"loss": 0.8651,
"step": 313
},
{
"epoch": 0.628,
"grad_norm": 0.6591551303863525,
"learning_rate": 9.640265461815235e-06,
"loss": 0.8742,
"step": 314
},
{
"epoch": 0.63,
"grad_norm": 0.8578521609306335,
"learning_rate": 9.635919272833938e-06,
"loss": 0.8796,
"step": 315
},
{
"epoch": 0.632,
"grad_norm": 0.7640730738639832,
"learning_rate": 9.63154797841033e-06,
"loss": 0.8631,
"step": 316
},
{
"epoch": 0.634,
"grad_norm": 0.5908384919166565,
"learning_rate": 9.627151602216801e-06,
"loss": 0.852,
"step": 317
},
{
"epoch": 0.636,
"grad_norm": 0.6175269484519958,
"learning_rate": 9.622730168061568e-06,
"loss": 0.8714,
"step": 318
},
{
"epoch": 0.638,
"grad_norm": 0.6520965695381165,
"learning_rate": 9.618283699888543e-06,
"loss": 0.9097,
"step": 319
},
{
"epoch": 0.64,
"grad_norm": 0.6406873464584351,
"learning_rate": 9.613812221777212e-06,
"loss": 0.864,
"step": 320
},
{
"epoch": 0.642,
"grad_norm": 0.8102174997329712,
"learning_rate": 9.609315757942504e-06,
"loss": 0.8618,
"step": 321
},
{
"epoch": 0.644,
"grad_norm": 0.5675696730613708,
"learning_rate": 9.604794332734647e-06,
"loss": 0.8578,
"step": 322
},
{
"epoch": 0.646,
"grad_norm": 0.6569576263427734,
"learning_rate": 9.600247970639053e-06,
"loss": 0.8508,
"step": 323
},
{
"epoch": 0.648,
"grad_norm": 0.6983848214149475,
"learning_rate": 9.595676696276173e-06,
"loss": 0.9102,
"step": 324
},
{
"epoch": 0.65,
"grad_norm": 0.6264669895172119,
"learning_rate": 9.591080534401371e-06,
"loss": 0.8651,
"step": 325
},
{
"epoch": 0.652,
"grad_norm": 0.7677618861198425,
"learning_rate": 9.586459509904786e-06,
"loss": 0.8864,
"step": 326
},
{
"epoch": 0.654,
"grad_norm": 0.6663860082626343,
"learning_rate": 9.581813647811199e-06,
"loss": 0.8762,
"step": 327
},
{
"epoch": 0.656,
"grad_norm": 0.6873608231544495,
"learning_rate": 9.577142973279896e-06,
"loss": 0.8404,
"step": 328
},
{
"epoch": 0.658,
"grad_norm": 0.7165527939796448,
"learning_rate": 9.572447511604536e-06,
"loss": 0.8472,
"step": 329
},
{
"epoch": 0.66,
"grad_norm": 0.5741456747055054,
"learning_rate": 9.567727288213005e-06,
"loss": 0.9202,
"step": 330
},
{
"epoch": 0.662,
"grad_norm": 0.6779457926750183,
"learning_rate": 9.56298232866729e-06,
"loss": 0.876,
"step": 331
},
{
"epoch": 0.664,
"grad_norm": 0.6113842129707336,
"learning_rate": 9.55821265866333e-06,
"loss": 0.8977,
"step": 332
},
{
"epoch": 0.666,
"grad_norm": 0.6604581475257874,
"learning_rate": 9.553418304030886e-06,
"loss": 0.8066,
"step": 333
},
{
"epoch": 0.668,
"grad_norm": 0.6155861616134644,
"learning_rate": 9.548599290733393e-06,
"loss": 0.8716,
"step": 334
},
{
"epoch": 0.67,
"grad_norm": 0.5810530781745911,
"learning_rate": 9.543755644867823e-06,
"loss": 0.8626,
"step": 335
},
{
"epoch": 0.672,
"grad_norm": 0.5998817682266235,
"learning_rate": 9.538887392664544e-06,
"loss": 0.8535,
"step": 336
},
{
"epoch": 0.674,
"grad_norm": 0.5710519552230835,
"learning_rate": 9.53399456048718e-06,
"loss": 0.858,
"step": 337
},
{
"epoch": 0.676,
"grad_norm": 0.5840133428573608,
"learning_rate": 9.529077174832466e-06,
"loss": 0.8794,
"step": 338
},
{
"epoch": 0.678,
"grad_norm": 0.6933937668800354,
"learning_rate": 9.524135262330098e-06,
"loss": 0.8652,
"step": 339
},
{
"epoch": 0.68,
"grad_norm": 0.538587749004364,
"learning_rate": 9.519168849742603e-06,
"loss": 0.8828,
"step": 340
},
{
"epoch": 0.682,
"grad_norm": 0.6397183537483215,
"learning_rate": 9.514177963965181e-06,
"loss": 0.85,
"step": 341
},
{
"epoch": 0.684,
"grad_norm": 0.655133068561554,
"learning_rate": 9.50916263202557e-06,
"loss": 0.8623,
"step": 342
},
{
"epoch": 0.686,
"grad_norm": 0.5885834693908691,
"learning_rate": 9.504122881083886e-06,
"loss": 0.8575,
"step": 343
},
{
"epoch": 0.688,
"grad_norm": 0.651634693145752,
"learning_rate": 9.499058738432492e-06,
"loss": 0.8873,
"step": 344
},
{
"epoch": 0.69,
"grad_norm": 0.6309255957603455,
"learning_rate": 9.493970231495836e-06,
"loss": 0.8831,
"step": 345
},
{
"epoch": 0.692,
"grad_norm": 0.7011136412620544,
"learning_rate": 9.488857387830315e-06,
"loss": 0.8665,
"step": 346
},
{
"epoch": 0.694,
"grad_norm": 0.6683589816093445,
"learning_rate": 9.483720235124113e-06,
"loss": 0.8505,
"step": 347
},
{
"epoch": 0.696,
"grad_norm": 0.7045072317123413,
"learning_rate": 9.478558801197065e-06,
"loss": 0.8524,
"step": 348
},
{
"epoch": 0.698,
"grad_norm": 0.7451434135437012,
"learning_rate": 9.473373114000493e-06,
"loss": 0.8424,
"step": 349
},
{
"epoch": 0.7,
"grad_norm": 0.6919768452644348,
"learning_rate": 9.468163201617063e-06,
"loss": 0.8715,
"step": 350
},
{
"epoch": 0.702,
"grad_norm": 0.7216975688934326,
"learning_rate": 9.46292909226063e-06,
"loss": 0.8191,
"step": 351
},
{
"epoch": 0.704,
"grad_norm": 0.7797873020172119,
"learning_rate": 9.457670814276083e-06,
"loss": 0.8458,
"step": 352
},
{
"epoch": 0.706,
"grad_norm": 0.6440073847770691,
"learning_rate": 9.452388396139202e-06,
"loss": 0.8391,
"step": 353
},
{
"epoch": 0.708,
"grad_norm": 0.7262352108955383,
"learning_rate": 9.44708186645649e-06,
"loss": 0.8578,
"step": 354
},
{
"epoch": 0.71,
"grad_norm": 0.7259324193000793,
"learning_rate": 9.441751253965022e-06,
"loss": 0.857,
"step": 355
},
{
"epoch": 0.712,
"grad_norm": 0.6144593358039856,
"learning_rate": 9.436396587532297e-06,
"loss": 0.86,
"step": 356
},
{
"epoch": 0.714,
"grad_norm": 0.8025081753730774,
"learning_rate": 9.431017896156074e-06,
"loss": 0.8748,
"step": 357
},
{
"epoch": 0.716,
"grad_norm": 0.5604358911514282,
"learning_rate": 9.425615208964217e-06,
"loss": 0.8504,
"step": 358
},
{
"epoch": 0.718,
"grad_norm": 0.785858690738678,
"learning_rate": 9.420188555214537e-06,
"loss": 0.886,
"step": 359
},
{
"epoch": 0.72,
"grad_norm": 0.5290353894233704,
"learning_rate": 9.414737964294636e-06,
"loss": 0.8224,
"step": 360
},
{
"epoch": 0.722,
"grad_norm": 0.6365246772766113,
"learning_rate": 9.40926346572174e-06,
"loss": 0.8578,
"step": 361
},
{
"epoch": 0.724,
"grad_norm": 0.7250834107398987,
"learning_rate": 9.403765089142554e-06,
"loss": 0.8292,
"step": 362
},
{
"epoch": 0.726,
"grad_norm": 0.51982581615448,
"learning_rate": 9.398242864333084e-06,
"loss": 0.8582,
"step": 363
},
{
"epoch": 0.728,
"grad_norm": 0.7608000040054321,
"learning_rate": 9.392696821198488e-06,
"loss": 0.8679,
"step": 364
},
{
"epoch": 0.73,
"grad_norm": 0.6473584771156311,
"learning_rate": 9.38712698977291e-06,
"loss": 0.8497,
"step": 365
},
{
"epoch": 0.732,
"grad_norm": 0.7003604769706726,
"learning_rate": 9.381533400219319e-06,
"loss": 0.8666,
"step": 366
},
{
"epoch": 0.734,
"grad_norm": 0.7169507145881653,
"learning_rate": 9.375916082829341e-06,
"loss": 0.8702,
"step": 367
},
{
"epoch": 0.736,
"grad_norm": 0.6383227109909058,
"learning_rate": 9.370275068023097e-06,
"loss": 0.8897,
"step": 368
},
{
"epoch": 0.738,
"grad_norm": 0.694448709487915,
"learning_rate": 9.364610386349048e-06,
"loss": 0.8448,
"step": 369
},
{
"epoch": 0.74,
"grad_norm": 0.6426830887794495,
"learning_rate": 9.358922068483813e-06,
"loss": 0.8737,
"step": 370
},
{
"epoch": 0.742,
"grad_norm": 0.7932099103927612,
"learning_rate": 9.35321014523201e-06,
"loss": 0.9002,
"step": 371
},
{
"epoch": 0.744,
"grad_norm": 0.6667394042015076,
"learning_rate": 9.347474647526095e-06,
"loss": 0.8579,
"step": 372
},
{
"epoch": 0.746,
"grad_norm": 0.7726197242736816,
"learning_rate": 9.34171560642619e-06,
"loss": 0.8494,
"step": 373
},
{
"epoch": 0.748,
"grad_norm": 0.780178427696228,
"learning_rate": 9.335933053119906e-06,
"loss": 0.8598,
"step": 374
},
{
"epoch": 0.75,
"grad_norm": 0.6369235515594482,
"learning_rate": 9.330127018922195e-06,
"loss": 0.8739,
"step": 375
},
{
"epoch": 0.752,
"grad_norm": 0.6134310364723206,
"learning_rate": 9.324297535275156e-06,
"loss": 0.8205,
"step": 376
},
{
"epoch": 0.754,
"grad_norm": 0.8698707818984985,
"learning_rate": 9.318444633747884e-06,
"loss": 0.858,
"step": 377
},
{
"epoch": 0.756,
"grad_norm": 0.5858055353164673,
"learning_rate": 9.312568346036288e-06,
"loss": 0.8341,
"step": 378
},
{
"epoch": 0.758,
"grad_norm": 0.7148376107215881,
"learning_rate": 9.306668703962927e-06,
"loss": 0.8241,
"step": 379
},
{
"epoch": 0.76,
"grad_norm": 0.9750962257385254,
"learning_rate": 9.30074573947683e-06,
"loss": 0.8986,
"step": 380
},
{
"epoch": 0.762,
"grad_norm": 0.7608172297477722,
"learning_rate": 9.294799484653323e-06,
"loss": 0.8645,
"step": 381
},
{
"epoch": 0.764,
"grad_norm": 0.9215735197067261,
"learning_rate": 9.288829971693869e-06,
"loss": 0.8283,
"step": 382
},
{
"epoch": 0.766,
"grad_norm": 0.8112582564353943,
"learning_rate": 9.282837232925876e-06,
"loss": 0.86,
"step": 383
},
{
"epoch": 0.768,
"grad_norm": 0.8226185441017151,
"learning_rate": 9.276821300802535e-06,
"loss": 0.8746,
"step": 384
},
{
"epoch": 0.77,
"grad_norm": 0.7064146399497986,
"learning_rate": 9.27078220790263e-06,
"loss": 0.8813,
"step": 385
},
{
"epoch": 0.772,
"grad_norm": 0.9706873297691345,
"learning_rate": 9.264719986930376e-06,
"loss": 0.8761,
"step": 386
},
{
"epoch": 0.774,
"grad_norm": 0.6123415231704712,
"learning_rate": 9.25863467071524e-06,
"loss": 0.8337,
"step": 387
},
{
"epoch": 0.776,
"grad_norm": 0.7688935399055481,
"learning_rate": 9.25252629221175e-06,
"loss": 0.8527,
"step": 388
},
{
"epoch": 0.778,
"grad_norm": 0.6873968243598938,
"learning_rate": 9.246394884499334e-06,
"loss": 0.8725,
"step": 389
},
{
"epoch": 0.78,
"grad_norm": 0.6200389862060547,
"learning_rate": 9.24024048078213e-06,
"loss": 0.8426,
"step": 390
},
{
"epoch": 0.782,
"grad_norm": 0.555893063545227,
"learning_rate": 9.234063114388809e-06,
"loss": 0.8545,
"step": 391
},
{
"epoch": 0.784,
"grad_norm": 0.677457869052887,
"learning_rate": 9.227862818772392e-06,
"loss": 0.8346,
"step": 392
},
{
"epoch": 0.786,
"grad_norm": 0.6086246371269226,
"learning_rate": 9.221639627510076e-06,
"loss": 0.8622,
"step": 393
},
{
"epoch": 0.788,
"grad_norm": 0.6295508742332458,
"learning_rate": 9.215393574303043e-06,
"loss": 0.8601,
"step": 394
},
{
"epoch": 0.79,
"grad_norm": 0.6351264119148254,
"learning_rate": 9.209124692976287e-06,
"loss": 0.8487,
"step": 395
},
{
"epoch": 0.792,
"grad_norm": 0.6365606784820557,
"learning_rate": 9.202833017478421e-06,
"loss": 0.844,
"step": 396
},
{
"epoch": 0.794,
"grad_norm": 0.6243758201599121,
"learning_rate": 9.196518581881502e-06,
"loss": 0.8448,
"step": 397
},
{
"epoch": 0.796,
"grad_norm": 0.6153333783149719,
"learning_rate": 9.190181420380838e-06,
"loss": 0.8535,
"step": 398
},
{
"epoch": 0.798,
"grad_norm": 0.6188914179801941,
"learning_rate": 9.18382156729481e-06,
"loss": 0.8507,
"step": 399
},
{
"epoch": 0.8,
"grad_norm": 0.6090688109397888,
"learning_rate": 9.177439057064684e-06,
"loss": 0.8617,
"step": 400
},
{
"epoch": 0.802,
"grad_norm": 0.6110431551933289,
"learning_rate": 9.171033924254421e-06,
"loss": 0.8766,
"step": 401
},
{
"epoch": 0.804,
"grad_norm": 0.6193629503250122,
"learning_rate": 9.164606203550498e-06,
"loss": 0.8283,
"step": 402
},
{
"epoch": 0.806,
"grad_norm": 0.6263427734375,
"learning_rate": 9.15815592976171e-06,
"loss": 0.8593,
"step": 403
},
{
"epoch": 0.808,
"grad_norm": 0.604211151599884,
"learning_rate": 9.151683137818989e-06,
"loss": 0.8423,
"step": 404
},
{
"epoch": 0.81,
"grad_norm": 0.6154013276100159,
"learning_rate": 9.145187862775208e-06,
"loss": 0.8409,
"step": 405
},
{
"epoch": 0.812,
"grad_norm": 0.534557044506073,
"learning_rate": 9.138670139805004e-06,
"loss": 0.8458,
"step": 406
},
{
"epoch": 0.814,
"grad_norm": 0.6704394817352295,
"learning_rate": 9.132130004204569e-06,
"loss": 0.8556,
"step": 407
},
{
"epoch": 0.816,
"grad_norm": 0.6993472576141357,
"learning_rate": 9.125567491391476e-06,
"loss": 0.8907,
"step": 408
},
{
"epoch": 0.818,
"grad_norm": 0.6247013807296753,
"learning_rate": 9.118982636904476e-06,
"loss": 0.8372,
"step": 409
},
{
"epoch": 0.82,
"grad_norm": 0.6774156093597412,
"learning_rate": 9.112375476403313e-06,
"loss": 0.8363,
"step": 410
},
{
"epoch": 0.822,
"grad_norm": 0.6017762422561646,
"learning_rate": 9.10574604566852e-06,
"loss": 0.8808,
"step": 411
},
{
"epoch": 0.824,
"grad_norm": 0.7543519139289856,
"learning_rate": 9.099094380601244e-06,
"loss": 0.8395,
"step": 412
},
{
"epoch": 0.826,
"grad_norm": 0.5303599834442139,
"learning_rate": 9.09242051722303e-06,
"loss": 0.8495,
"step": 413
},
{
"epoch": 0.828,
"grad_norm": 0.6581388711929321,
"learning_rate": 9.085724491675642e-06,
"loss": 0.8482,
"step": 414
},
{
"epoch": 0.83,
"grad_norm": 0.6038942337036133,
"learning_rate": 9.079006340220862e-06,
"loss": 0.8646,
"step": 415
},
{
"epoch": 0.832,
"grad_norm": 0.6721682548522949,
"learning_rate": 9.072266099240286e-06,
"loss": 0.8486,
"step": 416
},
{
"epoch": 0.834,
"grad_norm": 0.688840925693512,
"learning_rate": 9.065503805235139e-06,
"loss": 0.8797,
"step": 417
},
{
"epoch": 0.836,
"grad_norm": 0.9070942401885986,
"learning_rate": 9.058719494826076e-06,
"loss": 0.8313,
"step": 418
},
{
"epoch": 0.838,
"grad_norm": 0.5890297293663025,
"learning_rate": 9.051913204752972e-06,
"loss": 0.8577,
"step": 419
},
{
"epoch": 0.84,
"grad_norm": 0.9257925748825073,
"learning_rate": 9.045084971874738e-06,
"loss": 0.8476,
"step": 420
},
{
"epoch": 0.842,
"grad_norm": 0.6360816359519958,
"learning_rate": 9.03823483316911e-06,
"loss": 0.8365,
"step": 421
},
{
"epoch": 0.844,
"grad_norm": 0.7895241975784302,
"learning_rate": 9.031362825732456e-06,
"loss": 0.9092,
"step": 422
},
{
"epoch": 0.846,
"grad_norm": 0.8315562009811401,
"learning_rate": 9.02446898677957e-06,
"loss": 0.8825,
"step": 423
},
{
"epoch": 0.848,
"grad_norm": 0.6070747971534729,
"learning_rate": 9.017553353643479e-06,
"loss": 0.8157,
"step": 424
},
{
"epoch": 0.85,
"grad_norm": 0.8066498041152954,
"learning_rate": 9.01061596377522e-06,
"loss": 0.8614,
"step": 425
},
{
"epoch": 0.852,
"grad_norm": 0.6945021748542786,
"learning_rate": 9.003656854743667e-06,
"loss": 0.8652,
"step": 426
},
{
"epoch": 0.854,
"grad_norm": 0.7237082719802856,
"learning_rate": 8.996676064235308e-06,
"loss": 0.8033,
"step": 427
},
{
"epoch": 0.856,
"grad_norm": 0.7030566930770874,
"learning_rate": 8.989673630054044e-06,
"loss": 0.8576,
"step": 428
},
{
"epoch": 0.858,
"grad_norm": 0.6638972163200378,
"learning_rate": 8.982649590120982e-06,
"loss": 0.8649,
"step": 429
},
{
"epoch": 0.86,
"grad_norm": 0.6616825461387634,
"learning_rate": 8.97560398247424e-06,
"loss": 0.8683,
"step": 430
},
{
"epoch": 0.862,
"grad_norm": 0.6521521806716919,
"learning_rate": 8.96853684526873e-06,
"loss": 0.8796,
"step": 431
},
{
"epoch": 0.864,
"grad_norm": 0.7620105743408203,
"learning_rate": 8.961448216775955e-06,
"loss": 0.8421,
"step": 432
},
{
"epoch": 0.866,
"grad_norm": 0.5343947410583496,
"learning_rate": 8.954338135383804e-06,
"loss": 0.8413,
"step": 433
},
{
"epoch": 0.868,
"grad_norm": 0.8251727223396301,
"learning_rate": 8.947206639596346e-06,
"loss": 0.8446,
"step": 434
},
{
"epoch": 0.87,
"grad_norm": 0.6117093563079834,
"learning_rate": 8.94005376803361e-06,
"loss": 0.8391,
"step": 435
},
{
"epoch": 0.872,
"grad_norm": 0.7564120888710022,
"learning_rate": 8.932879559431392e-06,
"loss": 0.8555,
"step": 436
},
{
"epoch": 0.874,
"grad_norm": 0.7144505381584167,
"learning_rate": 8.925684052641027e-06,
"loss": 0.8485,
"step": 437
},
{
"epoch": 0.876,
"grad_norm": 0.6239364743232727,
"learning_rate": 8.9184672866292e-06,
"loss": 0.8347,
"step": 438
},
{
"epoch": 0.878,
"grad_norm": 0.6891036629676819,
"learning_rate": 8.911229300477716e-06,
"loss": 0.8764,
"step": 439
},
{
"epoch": 0.88,
"grad_norm": 0.5618371367454529,
"learning_rate": 8.903970133383297e-06,
"loss": 0.8486,
"step": 440
},
{
"epoch": 0.882,
"grad_norm": 0.6398274302482605,
"learning_rate": 8.896689824657371e-06,
"loss": 0.8586,
"step": 441
},
{
"epoch": 0.884,
"grad_norm": 0.6113273501396179,
"learning_rate": 8.889388413725857e-06,
"loss": 0.8255,
"step": 442
},
{
"epoch": 0.886,
"grad_norm": 0.6663311719894409,
"learning_rate": 8.882065940128946e-06,
"loss": 0.8686,
"step": 443
},
{
"epoch": 0.888,
"grad_norm": 0.561815083026886,
"learning_rate": 8.874722443520898e-06,
"loss": 0.8468,
"step": 444
},
{
"epoch": 0.89,
"grad_norm": 0.6316409111022949,
"learning_rate": 8.867357963669821e-06,
"loss": 0.8376,
"step": 445
},
{
"epoch": 0.892,
"grad_norm": 0.719174325466156,
"learning_rate": 8.859972540457451e-06,
"loss": 0.826,
"step": 446
},
{
"epoch": 0.894,
"grad_norm": 0.6321907639503479,
"learning_rate": 8.852566213878947e-06,
"loss": 0.852,
"step": 447
},
{
"epoch": 0.896,
"grad_norm": 0.6965914964675903,
"learning_rate": 8.845139024042664e-06,
"loss": 0.8513,
"step": 448
},
{
"epoch": 0.898,
"grad_norm": 0.6648813486099243,
"learning_rate": 8.837691011169944e-06,
"loss": 0.8531,
"step": 449
},
{
"epoch": 0.9,
"grad_norm": 0.6940134167671204,
"learning_rate": 8.83022221559489e-06,
"loss": 0.8919,
"step": 450
},
{
"epoch": 0.902,
"grad_norm": 0.5885754823684692,
"learning_rate": 8.822732677764158e-06,
"loss": 0.9006,
"step": 451
},
{
"epoch": 0.904,
"grad_norm": 0.6959089636802673,
"learning_rate": 8.815222438236726e-06,
"loss": 0.8686,
"step": 452
},
{
"epoch": 0.906,
"grad_norm": 0.6296737790107727,
"learning_rate": 8.807691537683685e-06,
"loss": 0.8542,
"step": 453
},
{
"epoch": 0.908,
"grad_norm": 0.6397737264633179,
"learning_rate": 8.800140016888009e-06,
"loss": 0.8584,
"step": 454
},
{
"epoch": 0.91,
"grad_norm": 0.714893639087677,
"learning_rate": 8.792567916744346e-06,
"loss": 0.8548,
"step": 455
},
{
"epoch": 0.912,
"grad_norm": 0.5381580591201782,
"learning_rate": 8.784975278258783e-06,
"loss": 0.8714,
"step": 456
},
{
"epoch": 0.914,
"grad_norm": 0.7903892397880554,
"learning_rate": 8.777362142548636e-06,
"loss": 0.8662,
"step": 457
},
{
"epoch": 0.916,
"grad_norm": 0.6871955394744873,
"learning_rate": 8.769728550842217e-06,
"loss": 0.8304,
"step": 458
},
{
"epoch": 0.918,
"grad_norm": 0.5707519054412842,
"learning_rate": 8.762074544478622e-06,
"loss": 0.8184,
"step": 459
},
{
"epoch": 0.92,
"grad_norm": 0.7742196321487427,
"learning_rate": 8.754400164907496e-06,
"loss": 0.854,
"step": 460
},
{
"epoch": 0.922,
"grad_norm": 0.7090582847595215,
"learning_rate": 8.746705453688815e-06,
"loss": 0.82,
"step": 461
},
{
"epoch": 0.924,
"grad_norm": 0.5473904609680176,
"learning_rate": 8.73899045249266e-06,
"loss": 0.8278,
"step": 462
},
{
"epoch": 0.926,
"grad_norm": 0.5831822752952576,
"learning_rate": 8.73125520309899e-06,
"loss": 0.8389,
"step": 463
},
{
"epoch": 0.928,
"grad_norm": 0.645477831363678,
"learning_rate": 8.723499747397415e-06,
"loss": 0.8692,
"step": 464
},
{
"epoch": 0.93,
"grad_norm": 0.5839529037475586,
"learning_rate": 8.715724127386971e-06,
"loss": 0.8682,
"step": 465
},
{
"epoch": 0.932,
"grad_norm": 0.5304246544837952,
"learning_rate": 8.707928385175898e-06,
"loss": 0.8257,
"step": 466
},
{
"epoch": 0.934,
"grad_norm": 0.5904834866523743,
"learning_rate": 8.700112562981398e-06,
"loss": 0.8548,
"step": 467
},
{
"epoch": 0.936,
"grad_norm": 0.6015157103538513,
"learning_rate": 8.692276703129421e-06,
"loss": 0.8589,
"step": 468
},
{
"epoch": 0.938,
"grad_norm": 0.5249965786933899,
"learning_rate": 8.68442084805442e-06,
"loss": 0.8387,
"step": 469
},
{
"epoch": 0.94,
"grad_norm": 0.6521996259689331,
"learning_rate": 8.676545040299145e-06,
"loss": 0.8649,
"step": 470
},
{
"epoch": 0.942,
"grad_norm": 0.6516804099082947,
"learning_rate": 8.668649322514382e-06,
"loss": 0.867,
"step": 471
},
{
"epoch": 0.944,
"grad_norm": 0.5435242652893066,
"learning_rate": 8.660733737458751e-06,
"loss": 0.8723,
"step": 472
},
{
"epoch": 0.946,
"grad_norm": 0.7011072635650635,
"learning_rate": 8.652798327998458e-06,
"loss": 0.8535,
"step": 473
},
{
"epoch": 0.948,
"grad_norm": 0.5955620408058167,
"learning_rate": 8.644843137107058e-06,
"loss": 0.8747,
"step": 474
},
{
"epoch": 0.95,
"grad_norm": 0.6632460355758667,
"learning_rate": 8.636868207865244e-06,
"loss": 0.8493,
"step": 475
},
{
"epoch": 0.952,
"grad_norm": 0.5909416675567627,
"learning_rate": 8.628873583460593e-06,
"loss": 0.8726,
"step": 476
},
{
"epoch": 0.954,
"grad_norm": 0.5914918184280396,
"learning_rate": 8.620859307187339e-06,
"loss": 0.8766,
"step": 477
},
{
"epoch": 0.956,
"grad_norm": 0.6765621304512024,
"learning_rate": 8.61282542244614e-06,
"loss": 0.8589,
"step": 478
},
{
"epoch": 0.958,
"grad_norm": 0.5578816533088684,
"learning_rate": 8.604771972743848e-06,
"loss": 0.8483,
"step": 479
},
{
"epoch": 0.96,
"grad_norm": 0.6967804431915283,
"learning_rate": 8.596699001693257e-06,
"loss": 0.8311,
"step": 480
},
{
"epoch": 0.962,
"grad_norm": 0.7719427347183228,
"learning_rate": 8.588606553012884e-06,
"loss": 0.8645,
"step": 481
},
{
"epoch": 0.964,
"grad_norm": 0.5344576835632324,
"learning_rate": 8.580494670526725e-06,
"loss": 0.8759,
"step": 482
},
{
"epoch": 0.966,
"grad_norm": 0.5716308951377869,
"learning_rate": 8.572363398164017e-06,
"loss": 0.8299,
"step": 483
},
{
"epoch": 0.968,
"grad_norm": 0.6527791619300842,
"learning_rate": 8.564212779959003e-06,
"loss": 0.8398,
"step": 484
},
{
"epoch": 0.97,
"grad_norm": 0.6142027974128723,
"learning_rate": 8.556042860050686e-06,
"loss": 0.8251,
"step": 485
},
{
"epoch": 0.972,
"grad_norm": 0.5965133309364319,
"learning_rate": 8.547853682682605e-06,
"loss": 0.8587,
"step": 486
},
{
"epoch": 0.974,
"grad_norm": 0.5377973318099976,
"learning_rate": 8.539645292202579e-06,
"loss": 0.8724,
"step": 487
},
{
"epoch": 0.976,
"grad_norm": 0.6018757820129395,
"learning_rate": 8.531417733062476e-06,
"loss": 0.8646,
"step": 488
},
{
"epoch": 0.978,
"grad_norm": 0.6402251720428467,
"learning_rate": 8.523171049817974e-06,
"loss": 0.8562,
"step": 489
},
{
"epoch": 0.98,
"grad_norm": 0.5891185402870178,
"learning_rate": 8.51490528712831e-06,
"loss": 0.8098,
"step": 490
},
{
"epoch": 0.982,
"grad_norm": 0.5361101627349854,
"learning_rate": 8.506620489756045e-06,
"loss": 0.8605,
"step": 491
},
{
"epoch": 0.984,
"grad_norm": 0.5156423449516296,
"learning_rate": 8.498316702566828e-06,
"loss": 0.8312,
"step": 492
},
{
"epoch": 0.986,
"grad_norm": 0.5701969265937805,
"learning_rate": 8.489993970529137e-06,
"loss": 0.857,
"step": 493
},
{
"epoch": 0.988,
"grad_norm": 0.6193264126777649,
"learning_rate": 8.481652338714048e-06,
"loss": 0.8571,
"step": 494
},
{
"epoch": 0.99,
"grad_norm": 0.5284368991851807,
"learning_rate": 8.473291852294986e-06,
"loss": 0.8774,
"step": 495
},
{
"epoch": 0.992,
"grad_norm": 0.644671618938446,
"learning_rate": 8.464912556547486e-06,
"loss": 0.8824,
"step": 496
},
{
"epoch": 0.994,
"grad_norm": 0.5708516836166382,
"learning_rate": 8.456514496848938e-06,
"loss": 0.8418,
"step": 497
},
{
"epoch": 0.996,
"grad_norm": 0.491625040769577,
"learning_rate": 8.44809771867835e-06,
"loss": 0.8599,
"step": 498
},
{
"epoch": 0.998,
"grad_norm": 0.6016339659690857,
"learning_rate": 8.439662267616093e-06,
"loss": 0.8589,
"step": 499
},
{
"epoch": 1.0,
"grad_norm": 0.6193352341651917,
"learning_rate": 8.43120818934367e-06,
"loss": 0.8247,
"step": 500
},
{
"epoch": 1.002,
"grad_norm": 0.5769928693771362,
"learning_rate": 8.422735529643445e-06,
"loss": 0.7995,
"step": 501
},
{
"epoch": 1.004,
"grad_norm": 0.5737547874450684,
"learning_rate": 8.414244334398418e-06,
"loss": 0.7948,
"step": 502
},
{
"epoch": 1.006,
"grad_norm": 0.6058068871498108,
"learning_rate": 8.405734649591964e-06,
"loss": 0.7863,
"step": 503
},
{
"epoch": 1.008,
"grad_norm": 0.5428916811943054,
"learning_rate": 8.397206521307584e-06,
"loss": 0.8049,
"step": 504
},
{
"epoch": 1.01,
"grad_norm": 0.7062215209007263,
"learning_rate": 8.388659995728662e-06,
"loss": 0.8319,
"step": 505
},
{
"epoch": 1.012,
"grad_norm": 0.4917738139629364,
"learning_rate": 8.380095119138209e-06,
"loss": 0.844,
"step": 506
},
{
"epoch": 1.014,
"grad_norm": 0.7528649568557739,
"learning_rate": 8.371511937918616e-06,
"loss": 0.8274,
"step": 507
},
{
"epoch": 1.016,
"grad_norm": 0.513393759727478,
"learning_rate": 8.362910498551402e-06,
"loss": 0.8135,
"step": 508
},
{
"epoch": 1.018,
"grad_norm": 0.6357613801956177,
"learning_rate": 8.354290847616954e-06,
"loss": 0.7991,
"step": 509
},
{
"epoch": 1.02,
"grad_norm": 0.6039820909500122,
"learning_rate": 8.345653031794292e-06,
"loss": 0.8176,
"step": 510
},
{
"epoch": 1.022,
"grad_norm": 0.5468531847000122,
"learning_rate": 8.3369970978608e-06,
"loss": 0.8629,
"step": 511
},
{
"epoch": 1.024,
"grad_norm": 0.5981908440589905,
"learning_rate": 8.328323092691985e-06,
"loss": 0.7772,
"step": 512
},
{
"epoch": 1.026,
"grad_norm": 0.509652853012085,
"learning_rate": 8.319631063261209e-06,
"loss": 0.842,
"step": 513
},
{
"epoch": 1.028,
"grad_norm": 0.5468859672546387,
"learning_rate": 8.310921056639451e-06,
"loss": 0.8299,
"step": 514
},
{
"epoch": 1.03,
"grad_norm": 0.6187793016433716,
"learning_rate": 8.302193119995038e-06,
"loss": 0.7958,
"step": 515
},
{
"epoch": 1.032,
"grad_norm": 0.5641921758651733,
"learning_rate": 8.293447300593402e-06,
"loss": 0.8341,
"step": 516
},
{
"epoch": 1.034,
"grad_norm": 0.6467854380607605,
"learning_rate": 8.284683645796814e-06,
"loss": 0.7682,
"step": 517
},
{
"epoch": 1.036,
"grad_norm": 0.4969216585159302,
"learning_rate": 8.275902203064125e-06,
"loss": 0.8247,
"step": 518
},
{
"epoch": 1.038,
"grad_norm": 0.5728968977928162,
"learning_rate": 8.267103019950529e-06,
"loss": 0.8065,
"step": 519
},
{
"epoch": 1.04,
"grad_norm": 0.5534486174583435,
"learning_rate": 8.258286144107277e-06,
"loss": 0.818,
"step": 520
},
{
"epoch": 1.042,
"grad_norm": 0.5607255101203918,
"learning_rate": 8.249451623281444e-06,
"loss": 0.7975,
"step": 521
},
{
"epoch": 1.044,
"grad_norm": 0.537211000919342,
"learning_rate": 8.240599505315656e-06,
"loss": 0.8345,
"step": 522
},
{
"epoch": 1.046,
"grad_norm": 0.5845485925674438,
"learning_rate": 8.231729838147833e-06,
"loss": 0.8191,
"step": 523
},
{
"epoch": 1.048,
"grad_norm": 0.5804502964019775,
"learning_rate": 8.222842669810936e-06,
"loss": 0.8313,
"step": 524
},
{
"epoch": 1.05,
"grad_norm": 0.5499438047409058,
"learning_rate": 8.213938048432697e-06,
"loss": 0.8242,
"step": 525
},
{
"epoch": 1.052,
"grad_norm": 0.6084002256393433,
"learning_rate": 8.205016022235368e-06,
"loss": 0.8272,
"step": 526
},
{
"epoch": 1.054,
"grad_norm": 0.5327991247177124,
"learning_rate": 8.196076639535453e-06,
"loss": 0.8309,
"step": 527
},
{
"epoch": 1.056,
"grad_norm": 0.5128622651100159,
"learning_rate": 8.18711994874345e-06,
"loss": 0.8158,
"step": 528
},
{
"epoch": 1.058,
"grad_norm": 0.6121509075164795,
"learning_rate": 8.178145998363585e-06,
"loss": 0.7969,
"step": 529
},
{
"epoch": 1.06,
"grad_norm": 0.5912569761276245,
"learning_rate": 8.16915483699355e-06,
"loss": 0.8165,
"step": 530
},
{
"epoch": 1.062,
"grad_norm": 0.6030197143554688,
"learning_rate": 8.160146513324256e-06,
"loss": 0.8328,
"step": 531
},
{
"epoch": 1.064,
"grad_norm": 0.5357882380485535,
"learning_rate": 8.151121076139534e-06,
"loss": 0.8209,
"step": 532
},
{
"epoch": 1.066,
"grad_norm": 0.5966700315475464,
"learning_rate": 8.142078574315907e-06,
"loss": 0.807,
"step": 533
},
{
"epoch": 1.068,
"grad_norm": 0.6120830774307251,
"learning_rate": 8.133019056822303e-06,
"loss": 0.7926,
"step": 534
},
{
"epoch": 1.07,
"grad_norm": 0.6333845257759094,
"learning_rate": 8.123942572719801e-06,
"loss": 0.8208,
"step": 535
},
{
"epoch": 1.072,
"grad_norm": 0.6417355537414551,
"learning_rate": 8.11484917116136e-06,
"loss": 0.822,
"step": 536
},
{
"epoch": 1.074,
"grad_norm": 0.498068243265152,
"learning_rate": 8.105738901391553e-06,
"loss": 0.8476,
"step": 537
},
{
"epoch": 1.076,
"grad_norm": 0.6133856177330017,
"learning_rate": 8.096611812746302e-06,
"loss": 0.7765,
"step": 538
},
{
"epoch": 1.078,
"grad_norm": 0.5497493147850037,
"learning_rate": 8.087467954652608e-06,
"loss": 0.8126,
"step": 539
},
{
"epoch": 1.08,
"grad_norm": 0.5176334381103516,
"learning_rate": 8.078307376628292e-06,
"loss": 0.8018,
"step": 540
},
{
"epoch": 1.082,
"grad_norm": 0.5719330906867981,
"learning_rate": 8.069130128281714e-06,
"loss": 0.8003,
"step": 541
},
{
"epoch": 1.084,
"grad_norm": 0.6029807329177856,
"learning_rate": 8.059936259311514e-06,
"loss": 0.8541,
"step": 542
},
{
"epoch": 1.086,
"grad_norm": 0.5657724738121033,
"learning_rate": 8.05072581950634e-06,
"loss": 0.7943,
"step": 543
},
{
"epoch": 1.088,
"grad_norm": 0.6055567860603333,
"learning_rate": 8.041498858744572e-06,
"loss": 0.7925,
"step": 544
},
{
"epoch": 1.09,
"grad_norm": 0.5769219994544983,
"learning_rate": 8.032255426994069e-06,
"loss": 0.8003,
"step": 545
},
{
"epoch": 1.092,
"grad_norm": 0.5590239763259888,
"learning_rate": 8.022995574311876e-06,
"loss": 0.8006,
"step": 546
},
{
"epoch": 1.094,
"grad_norm": 0.613633930683136,
"learning_rate": 8.013719350843969e-06,
"loss": 0.7964,
"step": 547
},
{
"epoch": 1.096,
"grad_norm": 0.6089486479759216,
"learning_rate": 8.004426806824985e-06,
"loss": 0.8035,
"step": 548
},
{
"epoch": 1.098,
"grad_norm": 0.5584562420845032,
"learning_rate": 7.99511799257793e-06,
"loss": 0.8435,
"step": 549
},
{
"epoch": 1.1,
"grad_norm": 0.6562801003456116,
"learning_rate": 7.985792958513932e-06,
"loss": 0.8297,
"step": 550
},
{
"epoch": 1.102,
"grad_norm": 0.573478639125824,
"learning_rate": 7.97645175513195e-06,
"loss": 0.839,
"step": 551
},
{
"epoch": 1.104,
"grad_norm": 0.5595790147781372,
"learning_rate": 7.967094433018508e-06,
"loss": 0.7936,
"step": 552
},
{
"epoch": 1.106,
"grad_norm": 0.5592019557952881,
"learning_rate": 7.95772104284742e-06,
"loss": 0.8311,
"step": 553
},
{
"epoch": 1.108,
"grad_norm": 0.5474761128425598,
"learning_rate": 7.948331635379517e-06,
"loss": 0.7777,
"step": 554
},
{
"epoch": 1.11,
"grad_norm": 0.6115286946296692,
"learning_rate": 7.938926261462366e-06,
"loss": 0.821,
"step": 555
},
{
"epoch": 1.112,
"grad_norm": 0.4691702425479889,
"learning_rate": 7.929504972030003e-06,
"loss": 0.7774,
"step": 556
},
{
"epoch": 1.114,
"grad_norm": 0.6433555483818054,
"learning_rate": 7.920067818102652e-06,
"loss": 0.8085,
"step": 557
},
{
"epoch": 1.116,
"grad_norm": 0.5453904271125793,
"learning_rate": 7.910614850786448e-06,
"loss": 0.804,
"step": 558
},
{
"epoch": 1.118,
"grad_norm": 0.5915471911430359,
"learning_rate": 7.901146121273165e-06,
"loss": 0.8135,
"step": 559
},
{
"epoch": 1.12,
"grad_norm": 0.519405722618103,
"learning_rate": 7.891661680839932e-06,
"loss": 0.8083,
"step": 560
},
{
"epoch": 1.122,
"grad_norm": 0.7413570880889893,
"learning_rate": 7.882161580848966e-06,
"loss": 0.8363,
"step": 561
},
{
"epoch": 1.124,
"grad_norm": 0.6559600830078125,
"learning_rate": 7.872645872747281e-06,
"loss": 0.8312,
"step": 562
},
{
"epoch": 1.126,
"grad_norm": 0.5560040473937988,
"learning_rate": 7.863114608066417e-06,
"loss": 0.7881,
"step": 563
},
{
"epoch": 1.128,
"grad_norm": 0.6661034226417542,
"learning_rate": 7.85356783842216e-06,
"loss": 0.8372,
"step": 564
},
{
"epoch": 1.13,
"grad_norm": 0.5710970759391785,
"learning_rate": 7.84400561551426e-06,
"loss": 0.8015,
"step": 565
},
{
"epoch": 1.132,
"grad_norm": 0.5782989263534546,
"learning_rate": 7.834427991126155e-06,
"loss": 0.7983,
"step": 566
},
{
"epoch": 1.134,
"grad_norm": 0.6261492371559143,
"learning_rate": 7.82483501712469e-06,
"loss": 0.8461,
"step": 567
},
{
"epoch": 1.136,
"grad_norm": 0.513586163520813,
"learning_rate": 7.815226745459831e-06,
"loss": 0.7761,
"step": 568
},
{
"epoch": 1.138,
"grad_norm": 0.639958918094635,
"learning_rate": 7.80560322816439e-06,
"loss": 0.8502,
"step": 569
},
{
"epoch": 1.14,
"grad_norm": 0.48928067088127136,
"learning_rate": 7.795964517353734e-06,
"loss": 0.8222,
"step": 570
},
{
"epoch": 1.142,
"grad_norm": 0.527836263179779,
"learning_rate": 7.786310665225522e-06,
"loss": 0.8261,
"step": 571
},
{
"epoch": 1.144,
"grad_norm": 0.6203815937042236,
"learning_rate": 7.776641724059398e-06,
"loss": 0.8052,
"step": 572
},
{
"epoch": 1.146,
"grad_norm": 0.5364488363265991,
"learning_rate": 7.76695774621672e-06,
"loss": 0.8178,
"step": 573
},
{
"epoch": 1.148,
"grad_norm": 0.5550497174263,
"learning_rate": 7.757258784140286e-06,
"loss": 0.7836,
"step": 574
},
{
"epoch": 1.15,
"grad_norm": 0.6141073107719421,
"learning_rate": 7.747544890354031e-06,
"loss": 0.7771,
"step": 575
},
{
"epoch": 1.152,
"grad_norm": 0.5875626802444458,
"learning_rate": 7.737816117462752e-06,
"loss": 0.8157,
"step": 576
},
{
"epoch": 1.154,
"grad_norm": 0.5942591428756714,
"learning_rate": 7.728072518151826e-06,
"loss": 0.8049,
"step": 577
},
{
"epoch": 1.156,
"grad_norm": 0.6154875755310059,
"learning_rate": 7.718314145186918e-06,
"loss": 0.8229,
"step": 578
},
{
"epoch": 1.158,
"grad_norm": 0.5936452150344849,
"learning_rate": 7.7085410514137e-06,
"loss": 0.8372,
"step": 579
},
{
"epoch": 1.16,
"grad_norm": 0.5601768493652344,
"learning_rate": 7.698753289757565e-06,
"loss": 0.8246,
"step": 580
},
{
"epoch": 1.162,
"grad_norm": 0.5993269085884094,
"learning_rate": 7.688950913223336e-06,
"loss": 0.8146,
"step": 581
},
{
"epoch": 1.164,
"grad_norm": 0.5179044604301453,
"learning_rate": 7.679133974894984e-06,
"loss": 0.7709,
"step": 582
},
{
"epoch": 1.166,
"grad_norm": 0.5535378456115723,
"learning_rate": 7.669302527935334e-06,
"loss": 0.7898,
"step": 583
},
{
"epoch": 1.168,
"grad_norm": 0.611381471157074,
"learning_rate": 7.65945662558579e-06,
"loss": 0.8016,
"step": 584
},
{
"epoch": 1.17,
"grad_norm": 0.6293729543685913,
"learning_rate": 7.649596321166024e-06,
"loss": 0.7995,
"step": 585
},
{
"epoch": 1.172,
"grad_norm": 0.5044072866439819,
"learning_rate": 7.639721668073718e-06,
"loss": 0.8043,
"step": 586
},
{
"epoch": 1.174,
"grad_norm": 0.5509117245674133,
"learning_rate": 7.629832719784245e-06,
"loss": 0.802,
"step": 587
},
{
"epoch": 1.176,
"grad_norm": 0.5565614104270935,
"learning_rate": 7.619929529850397e-06,
"loss": 0.8155,
"step": 588
},
{
"epoch": 1.178,
"grad_norm": 0.6130761504173279,
"learning_rate": 7.610012151902091e-06,
"loss": 0.8334,
"step": 589
},
{
"epoch": 1.18,
"grad_norm": 0.5241647362709045,
"learning_rate": 7.600080639646077e-06,
"loss": 0.8129,
"step": 590
},
{
"epoch": 1.182,
"grad_norm": 0.6383605003356934,
"learning_rate": 7.590135046865652e-06,
"loss": 0.8431,
"step": 591
},
{
"epoch": 1.184,
"grad_norm": 0.6660565137863159,
"learning_rate": 7.580175427420358e-06,
"loss": 0.8099,
"step": 592
},
{
"epoch": 1.186,
"grad_norm": 0.5172692537307739,
"learning_rate": 7.570201835245703e-06,
"loss": 0.7888,
"step": 593
},
{
"epoch": 1.188,
"grad_norm": 0.6571612358093262,
"learning_rate": 7.560214324352858e-06,
"loss": 0.7891,
"step": 594
},
{
"epoch": 1.19,
"grad_norm": 0.6135300993919373,
"learning_rate": 7.550212948828377e-06,
"loss": 0.8296,
"step": 595
},
{
"epoch": 1.192,
"grad_norm": 0.5668914318084717,
"learning_rate": 7.54019776283389e-06,
"loss": 0.7931,
"step": 596
},
{
"epoch": 1.194,
"grad_norm": 0.6143723726272583,
"learning_rate": 7.530168820605819e-06,
"loss": 0.8178,
"step": 597
},
{
"epoch": 1.196,
"grad_norm": 0.7302437424659729,
"learning_rate": 7.520126176455084e-06,
"loss": 0.8434,
"step": 598
},
{
"epoch": 1.198,
"grad_norm": 0.5691401362419128,
"learning_rate": 7.510069884766802e-06,
"loss": 0.8216,
"step": 599
},
{
"epoch": 1.2,
"grad_norm": 0.72467041015625,
"learning_rate": 7.500000000000001e-06,
"loss": 0.8043,
"step": 600
},
{
"epoch": 1.202,
"grad_norm": 0.5468252301216125,
"learning_rate": 7.489916576687318e-06,
"loss": 0.7883,
"step": 601
},
{
"epoch": 1.204,
"grad_norm": 0.6991376876831055,
"learning_rate": 7.479819669434712e-06,
"loss": 0.8545,
"step": 602
},
{
"epoch": 1.206,
"grad_norm": 0.5413354635238647,
"learning_rate": 7.469709332921155e-06,
"loss": 0.8286,
"step": 603
},
{
"epoch": 1.208,
"grad_norm": 0.6795758008956909,
"learning_rate": 7.459585621898353e-06,
"loss": 0.7882,
"step": 604
},
{
"epoch": 1.21,
"grad_norm": 0.5337622165679932,
"learning_rate": 7.449448591190436e-06,
"loss": 0.8151,
"step": 605
},
{
"epoch": 1.212,
"grad_norm": 0.5041375160217285,
"learning_rate": 7.4392982956936644e-06,
"loss": 0.7572,
"step": 606
},
{
"epoch": 1.214,
"grad_norm": 0.5212530493736267,
"learning_rate": 7.429134790376136e-06,
"loss": 0.7994,
"step": 607
},
{
"epoch": 1.216,
"grad_norm": 0.5540004968643188,
"learning_rate": 7.418958130277483e-06,
"loss": 0.8277,
"step": 608
},
{
"epoch": 1.218,
"grad_norm": 0.5500084757804871,
"learning_rate": 7.408768370508577e-06,
"loss": 0.7894,
"step": 609
},
{
"epoch": 1.22,
"grad_norm": 0.5433350205421448,
"learning_rate": 7.398565566251232e-06,
"loss": 0.8024,
"step": 610
},
{
"epoch": 1.222,
"grad_norm": 0.6301877498626709,
"learning_rate": 7.3883497727579e-06,
"loss": 0.8209,
"step": 611
},
{
"epoch": 1.224,
"grad_norm": 0.4610621929168701,
"learning_rate": 7.378121045351378e-06,
"loss": 0.8452,
"step": 612
},
{
"epoch": 1.226,
"grad_norm": 0.5440369248390198,
"learning_rate": 7.3678794394245e-06,
"loss": 0.8119,
"step": 613
},
{
"epoch": 1.228,
"grad_norm": 0.5543891191482544,
"learning_rate": 7.357625010439853e-06,
"loss": 0.8434,
"step": 614
},
{
"epoch": 1.23,
"grad_norm": 0.6058164834976196,
"learning_rate": 7.347357813929455e-06,
"loss": 0.7724,
"step": 615
},
{
"epoch": 1.232,
"grad_norm": 0.5185509920120239,
"learning_rate": 7.337077905494472e-06,
"loss": 0.8108,
"step": 616
},
{
"epoch": 1.234,
"grad_norm": 0.6211835145950317,
"learning_rate": 7.326785340804908e-06,
"loss": 0.8424,
"step": 617
},
{
"epoch": 1.236,
"grad_norm": 0.6689534187316895,
"learning_rate": 7.31648017559931e-06,
"loss": 0.8203,
"step": 618
},
{
"epoch": 1.238,
"grad_norm": 0.6034107804298401,
"learning_rate": 7.3061624656844544e-06,
"loss": 0.8432,
"step": 619
},
{
"epoch": 1.24,
"grad_norm": 0.6534163355827332,
"learning_rate": 7.295832266935059e-06,
"loss": 0.8214,
"step": 620
},
{
"epoch": 1.242,
"grad_norm": 0.6900277137756348,
"learning_rate": 7.285489635293472e-06,
"loss": 0.8246,
"step": 621
},
{
"epoch": 1.244,
"grad_norm": 0.49560463428497314,
"learning_rate": 7.275134626769369e-06,
"loss": 0.7885,
"step": 622
},
{
"epoch": 1.246,
"grad_norm": 0.7048647403717041,
"learning_rate": 7.264767297439455e-06,
"loss": 0.7972,
"step": 623
},
{
"epoch": 1.248,
"grad_norm": 0.6690518260002136,
"learning_rate": 7.254387703447154e-06,
"loss": 0.8183,
"step": 624
},
{
"epoch": 1.25,
"grad_norm": 0.5271294713020325,
"learning_rate": 7.243995901002312e-06,
"loss": 0.7815,
"step": 625
},
{
"epoch": 1.252,
"grad_norm": 0.5115328431129456,
"learning_rate": 7.233591946380884e-06,
"loss": 0.7985,
"step": 626
},
{
"epoch": 1.254,
"grad_norm": 0.9464442133903503,
"learning_rate": 7.223175895924638e-06,
"loss": 0.8168,
"step": 627
},
{
"epoch": 1.256,
"grad_norm": 0.5649638772010803,
"learning_rate": 7.212747806040845e-06,
"loss": 0.818,
"step": 628
},
{
"epoch": 1.258,
"grad_norm": 0.5858141183853149,
"learning_rate": 7.2023077332019755e-06,
"loss": 0.7964,
"step": 629
},
{
"epoch": 1.26,
"grad_norm": 0.6316258907318115,
"learning_rate": 7.191855733945388e-06,
"loss": 0.819,
"step": 630
},
{
"epoch": 1.262,
"grad_norm": 0.6449371576309204,
"learning_rate": 7.181391864873034e-06,
"loss": 0.8433,
"step": 631
},
{
"epoch": 1.264,
"grad_norm": 0.5685151219367981,
"learning_rate": 7.170916182651141e-06,
"loss": 0.8091,
"step": 632
},
{
"epoch": 1.266,
"grad_norm": 0.6365333795547485,
"learning_rate": 7.160428744009913e-06,
"loss": 0.7851,
"step": 633
},
{
"epoch": 1.268,
"grad_norm": 0.6000338196754456,
"learning_rate": 7.149929605743214e-06,
"loss": 0.8176,
"step": 634
},
{
"epoch": 1.27,
"grad_norm": 0.6080004572868347,
"learning_rate": 7.1394188247082715e-06,
"loss": 0.7953,
"step": 635
},
{
"epoch": 1.272,
"grad_norm": 0.6208333373069763,
"learning_rate": 7.128896457825364e-06,
"loss": 0.812,
"step": 636
},
{
"epoch": 1.274,
"grad_norm": 0.6057702302932739,
"learning_rate": 7.118362562077508e-06,
"loss": 0.8129,
"step": 637
},
{
"epoch": 1.276,
"grad_norm": 0.562838613986969,
"learning_rate": 7.107817194510157e-06,
"loss": 0.8127,
"step": 638
},
{
"epoch": 1.278,
"grad_norm": 0.641021728515625,
"learning_rate": 7.0972604122308865e-06,
"loss": 0.8113,
"step": 639
},
{
"epoch": 1.28,
"grad_norm": 0.6385996341705322,
"learning_rate": 7.08669227240909e-06,
"loss": 0.8378,
"step": 640
},
{
"epoch": 1.282,
"grad_norm": 0.6018445491790771,
"learning_rate": 7.076112832275667e-06,
"loss": 0.823,
"step": 641
},
{
"epoch": 1.284,
"grad_norm": 0.6377434134483337,
"learning_rate": 7.06552214912271e-06,
"loss": 0.8271,
"step": 642
},
{
"epoch": 1.286,
"grad_norm": 0.6069837212562561,
"learning_rate": 7.054920280303199e-06,
"loss": 0.806,
"step": 643
},
{
"epoch": 1.288,
"grad_norm": 0.5235119462013245,
"learning_rate": 7.04430728323069e-06,
"loss": 0.8077,
"step": 644
},
{
"epoch": 1.29,
"grad_norm": 0.5952073335647583,
"learning_rate": 7.033683215379002e-06,
"loss": 0.7621,
"step": 645
},
{
"epoch": 1.292,
"grad_norm": 0.5681912899017334,
"learning_rate": 7.023048134281907e-06,
"loss": 0.7912,
"step": 646
},
{
"epoch": 1.294,
"grad_norm": 0.4946375787258148,
"learning_rate": 7.012402097532815e-06,
"loss": 0.8125,
"step": 647
},
{
"epoch": 1.296,
"grad_norm": 0.6761035919189453,
"learning_rate": 7.0017451627844765e-06,
"loss": 0.8038,
"step": 648
},
{
"epoch": 1.298,
"grad_norm": 0.6653770208358765,
"learning_rate": 6.991077387748643e-06,
"loss": 0.7879,
"step": 649
},
{
"epoch": 1.3,
"grad_norm": 0.510911762714386,
"learning_rate": 6.980398830195785e-06,
"loss": 0.7873,
"step": 650
},
{
"epoch": 1.302,
"grad_norm": 0.6981252431869507,
"learning_rate": 6.9697095479547564e-06,
"loss": 0.8002,
"step": 651
},
{
"epoch": 1.304,
"grad_norm": 0.6994757652282715,
"learning_rate": 6.959009598912493e-06,
"loss": 0.7766,
"step": 652
},
{
"epoch": 1.306,
"grad_norm": 0.5115617513656616,
"learning_rate": 6.948299041013695e-06,
"loss": 0.827,
"step": 653
},
{
"epoch": 1.308,
"grad_norm": 0.6113659739494324,
"learning_rate": 6.9375779322605154e-06,
"loss": 0.787,
"step": 654
},
{
"epoch": 1.31,
"grad_norm": 0.5647181272506714,
"learning_rate": 6.9268463307122425e-06,
"loss": 0.7437,
"step": 655
},
{
"epoch": 1.312,
"grad_norm": 0.4827154278755188,
"learning_rate": 6.916104294484988e-06,
"loss": 0.8348,
"step": 656
},
{
"epoch": 1.314,
"grad_norm": 0.61137455701828,
"learning_rate": 6.905351881751372e-06,
"loss": 0.8054,
"step": 657
},
{
"epoch": 1.316,
"grad_norm": 0.606617271900177,
"learning_rate": 6.8945891507402075e-06,
"loss": 0.8113,
"step": 658
},
{
"epoch": 1.318,
"grad_norm": 0.7584879994392395,
"learning_rate": 6.883816159736187e-06,
"loss": 0.8542,
"step": 659
},
{
"epoch": 1.32,
"grad_norm": 0.4678022563457489,
"learning_rate": 6.873032967079562e-06,
"loss": 0.7836,
"step": 660
},
{
"epoch": 1.322,
"grad_norm": 0.7454090118408203,
"learning_rate": 6.862239631165831e-06,
"loss": 0.8062,
"step": 661
},
{
"epoch": 1.324,
"grad_norm": 0.692216694355011,
"learning_rate": 6.851436210445427e-06,
"loss": 0.795,
"step": 662
},
{
"epoch": 1.326,
"grad_norm": 0.466247022151947,
"learning_rate": 6.840622763423391e-06,
"loss": 0.8105,
"step": 663
},
{
"epoch": 1.328,
"grad_norm": 0.6885058283805847,
"learning_rate": 6.829799348659061e-06,
"loss": 0.852,
"step": 664
},
{
"epoch": 1.33,
"grad_norm": 0.6030435562133789,
"learning_rate": 6.818966024765758e-06,
"loss": 0.8173,
"step": 665
},
{
"epoch": 1.332,
"grad_norm": 0.580797553062439,
"learning_rate": 6.808122850410461e-06,
"loss": 0.8009,
"step": 666
},
{
"epoch": 1.334,
"grad_norm": 0.6403875350952148,
"learning_rate": 6.7972698843135e-06,
"loss": 0.8135,
"step": 667
},
{
"epoch": 1.336,
"grad_norm": 0.6278457045555115,
"learning_rate": 6.7864071852482205e-06,
"loss": 0.8025,
"step": 668
},
{
"epoch": 1.338,
"grad_norm": 0.6138670444488525,
"learning_rate": 6.775534812040686e-06,
"loss": 0.8516,
"step": 669
},
{
"epoch": 1.34,
"grad_norm": 0.5964850783348083,
"learning_rate": 6.7646528235693445e-06,
"loss": 0.8037,
"step": 670
},
{
"epoch": 1.342,
"grad_norm": 0.4964216351509094,
"learning_rate": 6.753761278764719e-06,
"loss": 0.8162,
"step": 671
},
{
"epoch": 1.344,
"grad_norm": 0.6246684789657593,
"learning_rate": 6.7428602366090764e-06,
"loss": 0.8007,
"step": 672
},
{
"epoch": 1.346,
"grad_norm": 0.5912864208221436,
"learning_rate": 6.7319497561361245e-06,
"loss": 0.8589,
"step": 673
},
{
"epoch": 1.348,
"grad_norm": 0.634901762008667,
"learning_rate": 6.721029896430678e-06,
"loss": 0.8166,
"step": 674
},
{
"epoch": 1.35,
"grad_norm": 0.5598165988922119,
"learning_rate": 6.710100716628345e-06,
"loss": 0.8003,
"step": 675
},
{
"epoch": 1.352,
"grad_norm": 0.5302711725234985,
"learning_rate": 6.699162275915208e-06,
"loss": 0.7989,
"step": 676
},
{
"epoch": 1.354,
"grad_norm": 0.6884709596633911,
"learning_rate": 6.6882146335274955e-06,
"loss": 0.8377,
"step": 677
},
{
"epoch": 1.356,
"grad_norm": 0.5577335357666016,
"learning_rate": 6.677257848751276e-06,
"loss": 0.8159,
"step": 678
},
{
"epoch": 1.358,
"grad_norm": 0.5639868974685669,
"learning_rate": 6.666291980922122e-06,
"loss": 0.8189,
"step": 679
},
{
"epoch": 1.36,
"grad_norm": 0.6146543622016907,
"learning_rate": 6.655317089424791e-06,
"loss": 0.8428,
"step": 680
},
{
"epoch": 1.362,
"grad_norm": 0.4616728127002716,
"learning_rate": 6.644333233692917e-06,
"loss": 0.8361,
"step": 681
},
{
"epoch": 1.364,
"grad_norm": 0.556716799736023,
"learning_rate": 6.633340473208673e-06,
"loss": 0.7989,
"step": 682
},
{
"epoch": 1.366,
"grad_norm": 0.5753201246261597,
"learning_rate": 6.622338867502452e-06,
"loss": 0.8213,
"step": 683
},
{
"epoch": 1.368,
"grad_norm": 0.5656223893165588,
"learning_rate": 6.611328476152557e-06,
"loss": 0.8155,
"step": 684
},
{
"epoch": 1.37,
"grad_norm": 0.5939602255821228,
"learning_rate": 6.600309358784858e-06,
"loss": 0.8176,
"step": 685
},
{
"epoch": 1.372,
"grad_norm": 0.5385190844535828,
"learning_rate": 6.58928157507249e-06,
"loss": 0.8592,
"step": 686
},
{
"epoch": 1.374,
"grad_norm": 0.5945911407470703,
"learning_rate": 6.578245184735513e-06,
"loss": 0.8017,
"step": 687
},
{
"epoch": 1.376,
"grad_norm": 0.6518405079841614,
"learning_rate": 6.567200247540599e-06,
"loss": 0.841,
"step": 688
},
{
"epoch": 1.378,
"grad_norm": 0.6033949255943298,
"learning_rate": 6.556146823300701e-06,
"loss": 0.8232,
"step": 689
},
{
"epoch": 1.38,
"grad_norm": 0.570328414440155,
"learning_rate": 6.545084971874738e-06,
"loss": 0.805,
"step": 690
},
{
"epoch": 1.382,
"grad_norm": 0.5114735960960388,
"learning_rate": 6.534014753167263e-06,
"loss": 0.8165,
"step": 691
},
{
"epoch": 1.384,
"grad_norm": 0.5306764245033264,
"learning_rate": 6.522936227128139e-06,
"loss": 0.7843,
"step": 692
},
{
"epoch": 1.386,
"grad_norm": 0.5091618895530701,
"learning_rate": 6.5118494537522235e-06,
"loss": 0.7579,
"step": 693
},
{
"epoch": 1.388,
"grad_norm": 0.4887152910232544,
"learning_rate": 6.500754493079029e-06,
"loss": 0.7969,
"step": 694
},
{
"epoch": 1.39,
"grad_norm": 0.49224093556404114,
"learning_rate": 6.48965140519241e-06,
"loss": 0.837,
"step": 695
},
{
"epoch": 1.392,
"grad_norm": 0.47571027278900146,
"learning_rate": 6.4785402502202345e-06,
"loss": 0.7802,
"step": 696
},
{
"epoch": 1.394,
"grad_norm": 0.46153298020362854,
"learning_rate": 6.467421088334052e-06,
"loss": 0.7947,
"step": 697
},
{
"epoch": 1.396,
"grad_norm": 0.5026895403862,
"learning_rate": 6.456293979748778e-06,
"loss": 0.7754,
"step": 698
},
{
"epoch": 1.398,
"grad_norm": 0.512188732624054,
"learning_rate": 6.445158984722358e-06,
"loss": 0.856,
"step": 699
},
{
"epoch": 1.4,
"grad_norm": 0.5626097917556763,
"learning_rate": 6.434016163555452e-06,
"loss": 0.8204,
"step": 700
},
{
"epoch": 1.402,
"grad_norm": 0.49451103806495667,
"learning_rate": 6.422865576591096e-06,
"loss": 0.8005,
"step": 701
},
{
"epoch": 1.404,
"grad_norm": 0.47687214612960815,
"learning_rate": 6.411707284214384e-06,
"loss": 0.813,
"step": 702
},
{
"epoch": 1.406,
"grad_norm": 0.5690708160400391,
"learning_rate": 6.400541346852136e-06,
"loss": 0.8292,
"step": 703
},
{
"epoch": 1.408,
"grad_norm": 0.4766281247138977,
"learning_rate": 6.389367824972575e-06,
"loss": 0.7606,
"step": 704
},
{
"epoch": 1.41,
"grad_norm": 0.48364976048469543,
"learning_rate": 6.378186779084996e-06,
"loss": 0.7973,
"step": 705
},
{
"epoch": 1.412,
"grad_norm": 0.51153564453125,
"learning_rate": 6.366998269739442e-06,
"loss": 0.817,
"step": 706
},
{
"epoch": 1.414,
"grad_norm": 0.4661514461040497,
"learning_rate": 6.35580235752637e-06,
"loss": 0.8269,
"step": 707
},
{
"epoch": 1.416,
"grad_norm": 0.4984007477760315,
"learning_rate": 6.344599103076329e-06,
"loss": 0.8395,
"step": 708
},
{
"epoch": 1.418,
"grad_norm": 0.5143007636070251,
"learning_rate": 6.3333885670596285e-06,
"loss": 0.7778,
"step": 709
},
{
"epoch": 1.42,
"grad_norm": 0.48550623655319214,
"learning_rate": 6.322170810186013e-06,
"loss": 0.8195,
"step": 710
},
{
"epoch": 1.422,
"grad_norm": 0.49492278695106506,
"learning_rate": 6.310945893204324e-06,
"loss": 0.8467,
"step": 711
},
{
"epoch": 1.424,
"grad_norm": 0.521795392036438,
"learning_rate": 6.299713876902188e-06,
"loss": 0.8383,
"step": 712
},
{
"epoch": 1.426,
"grad_norm": 0.544080913066864,
"learning_rate": 6.28847482210567e-06,
"loss": 0.8189,
"step": 713
},
{
"epoch": 1.428,
"grad_norm": 0.47585853934288025,
"learning_rate": 6.277228789678953e-06,
"loss": 0.8071,
"step": 714
},
{
"epoch": 1.43,
"grad_norm": 0.5880405306816101,
"learning_rate": 6.26597584052401e-06,
"loss": 0.8156,
"step": 715
},
{
"epoch": 1.432,
"grad_norm": 0.5470208525657654,
"learning_rate": 6.254716035580264e-06,
"loss": 0.8118,
"step": 716
},
{
"epoch": 1.434,
"grad_norm": 0.5518249273300171,
"learning_rate": 6.243449435824276e-06,
"loss": 0.8103,
"step": 717
},
{
"epoch": 1.436,
"grad_norm": 0.5518653988838196,
"learning_rate": 6.23217610226939e-06,
"loss": 0.8304,
"step": 718
},
{
"epoch": 1.438,
"grad_norm": 0.5015103220939636,
"learning_rate": 6.220896095965428e-06,
"loss": 0.8015,
"step": 719
},
{
"epoch": 1.44,
"grad_norm": 0.5413691997528076,
"learning_rate": 6.209609477998339e-06,
"loss": 0.8005,
"step": 720
},
{
"epoch": 1.442,
"grad_norm": 0.5215966105461121,
"learning_rate": 6.198316309489886e-06,
"loss": 0.8073,
"step": 721
},
{
"epoch": 1.444,
"grad_norm": 0.471635103225708,
"learning_rate": 6.187016651597299e-06,
"loss": 0.8202,
"step": 722
},
{
"epoch": 1.446,
"grad_norm": 0.6489139795303345,
"learning_rate": 6.17571056551295e-06,
"loss": 0.8254,
"step": 723
},
{
"epoch": 1.448,
"grad_norm": 0.44364041090011597,
"learning_rate": 6.16439811246403e-06,
"loss": 0.7725,
"step": 724
},
{
"epoch": 1.45,
"grad_norm": 0.5453159809112549,
"learning_rate": 6.153079353712201e-06,
"loss": 0.798,
"step": 725
},
{
"epoch": 1.452,
"grad_norm": 0.5725481510162354,
"learning_rate": 6.141754350553279e-06,
"loss": 0.8243,
"step": 726
},
{
"epoch": 1.454,
"grad_norm": 0.5413415431976318,
"learning_rate": 6.130423164316893e-06,
"loss": 0.8279,
"step": 727
},
{
"epoch": 1.456,
"grad_norm": 0.4809170961380005,
"learning_rate": 6.119085856366158e-06,
"loss": 0.797,
"step": 728
},
{
"epoch": 1.458,
"grad_norm": 0.5237243175506592,
"learning_rate": 6.107742488097338e-06,
"loss": 0.8013,
"step": 729
},
{
"epoch": 1.46,
"grad_norm": 0.5416289567947388,
"learning_rate": 6.0963931209395165e-06,
"loss": 0.809,
"step": 730
},
{
"epoch": 1.462,
"grad_norm": 0.47033780813217163,
"learning_rate": 6.085037816354269e-06,
"loss": 0.8291,
"step": 731
},
{
"epoch": 1.464,
"grad_norm": 0.5361016988754272,
"learning_rate": 6.073676635835317e-06,
"loss": 0.8432,
"step": 732
},
{
"epoch": 1.466,
"grad_norm": 0.5197705626487732,
"learning_rate": 6.062309640908206e-06,
"loss": 0.804,
"step": 733
},
{
"epoch": 1.468,
"grad_norm": 0.5188045501708984,
"learning_rate": 6.05093689312997e-06,
"loss": 0.8176,
"step": 734
},
{
"epoch": 1.47,
"grad_norm": 0.5773131251335144,
"learning_rate": 6.039558454088796e-06,
"loss": 0.8276,
"step": 735
},
{
"epoch": 1.472,
"grad_norm": 0.5026968121528625,
"learning_rate": 6.028174385403693e-06,
"loss": 0.8054,
"step": 736
},
{
"epoch": 1.474,
"grad_norm": 0.481131911277771,
"learning_rate": 6.016784748724153e-06,
"loss": 0.7974,
"step": 737
},
{
"epoch": 1.476,
"grad_norm": 0.6374879479408264,
"learning_rate": 6.005389605729824e-06,
"loss": 0.7866,
"step": 738
},
{
"epoch": 1.478,
"grad_norm": 0.5750545263290405,
"learning_rate": 5.993989018130173e-06,
"loss": 0.799,
"step": 739
},
{
"epoch": 1.48,
"grad_norm": 0.5512261986732483,
"learning_rate": 5.982583047664151e-06,
"loss": 0.7951,
"step": 740
},
{
"epoch": 1.482,
"grad_norm": 0.4982967674732208,
"learning_rate": 5.97117175609986e-06,
"loss": 0.8104,
"step": 741
},
{
"epoch": 1.484,
"grad_norm": 0.5609320402145386,
"learning_rate": 5.9597552052342174e-06,
"loss": 0.8109,
"step": 742
},
{
"epoch": 1.486,
"grad_norm": 0.5253490209579468,
"learning_rate": 5.948333456892624e-06,
"loss": 0.7986,
"step": 743
},
{
"epoch": 1.488,
"grad_norm": 0.6019419431686401,
"learning_rate": 5.936906572928625e-06,
"loss": 0.8111,
"step": 744
},
{
"epoch": 1.49,
"grad_norm": 0.4798535108566284,
"learning_rate": 5.925474615223573e-06,
"loss": 0.8139,
"step": 745
},
{
"epoch": 1.492,
"grad_norm": 0.5777360200881958,
"learning_rate": 5.914037645686308e-06,
"loss": 0.7785,
"step": 746
},
{
"epoch": 1.494,
"grad_norm": 0.5345956087112427,
"learning_rate": 5.902595726252801e-06,
"loss": 0.8064,
"step": 747
},
{
"epoch": 1.496,
"grad_norm": 0.5394262671470642,
"learning_rate": 5.891148918885834e-06,
"loss": 0.7876,
"step": 748
},
{
"epoch": 1.498,
"grad_norm": 0.575792670249939,
"learning_rate": 5.879697285574655e-06,
"loss": 0.8266,
"step": 749
},
{
"epoch": 1.5,
"grad_norm": 0.4988614320755005,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.8024,
"step": 750
},
{
"epoch": 1.502,
"grad_norm": 0.4781121015548706,
"learning_rate": 5.85677978920701e-06,
"loss": 0.8014,
"step": 751
},
{
"epoch": 1.504,
"grad_norm": 0.5493645668029785,
"learning_rate": 5.84531405025837e-06,
"loss": 0.7882,
"step": 752
},
{
"epoch": 1.506,
"grad_norm": 0.5231897234916687,
"learning_rate": 5.8338437335805124e-06,
"loss": 0.7806,
"step": 753
},
{
"epoch": 1.508,
"grad_norm": 0.4463392496109009,
"learning_rate": 5.8223689012899945e-06,
"loss": 0.7944,
"step": 754
},
{
"epoch": 1.51,
"grad_norm": 0.4627637565135956,
"learning_rate": 5.810889615527839e-06,
"loss": 0.796,
"step": 755
},
{
"epoch": 1.512,
"grad_norm": 0.540871798992157,
"learning_rate": 5.799405938459175e-06,
"loss": 0.7873,
"step": 756
},
{
"epoch": 1.514,
"grad_norm": 0.5337440371513367,
"learning_rate": 5.787917932272922e-06,
"loss": 0.7991,
"step": 757
},
{
"epoch": 1.516,
"grad_norm": 0.5268739461898804,
"learning_rate": 5.776425659181438e-06,
"loss": 0.773,
"step": 758
},
{
"epoch": 1.518,
"grad_norm": 0.5249832272529602,
"learning_rate": 5.764929181420191e-06,
"loss": 0.7912,
"step": 759
},
{
"epoch": 1.52,
"grad_norm": 0.7338144779205322,
"learning_rate": 5.753428561247416e-06,
"loss": 0.7758,
"step": 760
},
{
"epoch": 1.522,
"grad_norm": 0.603810727596283,
"learning_rate": 5.741923860943783e-06,
"loss": 0.8154,
"step": 761
},
{
"epoch": 1.524,
"grad_norm": 0.5017214417457581,
"learning_rate": 5.730415142812059e-06,
"loss": 0.8125,
"step": 762
},
{
"epoch": 1.526,
"grad_norm": 0.6532445549964905,
"learning_rate": 5.718902469176765e-06,
"loss": 0.8281,
"step": 763
},
{
"epoch": 1.528,
"grad_norm": 0.554637610912323,
"learning_rate": 5.707385902383845e-06,
"loss": 0.8564,
"step": 764
},
{
"epoch": 1.53,
"grad_norm": 0.47505423426628113,
"learning_rate": 5.695865504800328e-06,
"loss": 0.818,
"step": 765
},
{
"epoch": 1.532,
"grad_norm": 0.5482528805732727,
"learning_rate": 5.684341338813986e-06,
"loss": 0.8216,
"step": 766
},
{
"epoch": 1.534,
"grad_norm": 0.49230730533599854,
"learning_rate": 5.672813466832998e-06,
"loss": 0.7835,
"step": 767
},
{
"epoch": 1.536,
"grad_norm": 0.5212811231613159,
"learning_rate": 5.661281951285613e-06,
"loss": 0.8284,
"step": 768
},
{
"epoch": 1.538,
"grad_norm": 0.4840611517429352,
"learning_rate": 5.649746854619814e-06,
"loss": 0.8013,
"step": 769
},
{
"epoch": 1.54,
"grad_norm": 0.4953254163265228,
"learning_rate": 5.638208239302975e-06,
"loss": 0.782,
"step": 770
},
{
"epoch": 1.542,
"grad_norm": 0.570379376411438,
"learning_rate": 5.626666167821522e-06,
"loss": 0.8124,
"step": 771
},
{
"epoch": 1.544,
"grad_norm": 0.5032173991203308,
"learning_rate": 5.615120702680604e-06,
"loss": 0.8016,
"step": 772
},
{
"epoch": 1.546,
"grad_norm": 0.5023512840270996,
"learning_rate": 5.6035719064037446e-06,
"loss": 0.7978,
"step": 773
},
{
"epoch": 1.548,
"grad_norm": 0.5365172028541565,
"learning_rate": 5.592019841532507e-06,
"loss": 0.8113,
"step": 774
},
{
"epoch": 1.55,
"grad_norm": 0.6160049438476562,
"learning_rate": 5.5804645706261515e-06,
"loss": 0.7946,
"step": 775
},
{
"epoch": 1.552,
"grad_norm": 0.46849778294563293,
"learning_rate": 5.568906156261309e-06,
"loss": 0.8342,
"step": 776
},
{
"epoch": 1.554,
"grad_norm": 0.5677435994148254,
"learning_rate": 5.557344661031628e-06,
"loss": 0.8129,
"step": 777
},
{
"epoch": 1.556,
"grad_norm": 0.5215854644775391,
"learning_rate": 5.54578014754744e-06,
"loss": 0.7856,
"step": 778
},
{
"epoch": 1.558,
"grad_norm": 0.4937414824962616,
"learning_rate": 5.5342126784354265e-06,
"loss": 0.8075,
"step": 779
},
{
"epoch": 1.56,
"grad_norm": 0.4672093987464905,
"learning_rate": 5.522642316338268e-06,
"loss": 0.8288,
"step": 780
},
{
"epoch": 1.562,
"grad_norm": 0.5195022225379944,
"learning_rate": 5.511069123914319e-06,
"loss": 0.7994,
"step": 781
},
{
"epoch": 1.564,
"grad_norm": 0.4978856146335602,
"learning_rate": 5.499493163837258e-06,
"loss": 0.817,
"step": 782
},
{
"epoch": 1.566,
"grad_norm": 0.5288148522377014,
"learning_rate": 5.487914498795748e-06,
"loss": 0.7984,
"step": 783
},
{
"epoch": 1.568,
"grad_norm": 0.47676193714141846,
"learning_rate": 5.476333191493108e-06,
"loss": 0.7923,
"step": 784
},
{
"epoch": 1.57,
"grad_norm": 0.5122764706611633,
"learning_rate": 5.464749304646963e-06,
"loss": 0.7706,
"step": 785
},
{
"epoch": 1.572,
"grad_norm": 0.4978078007698059,
"learning_rate": 5.453162900988902e-06,
"loss": 0.7747,
"step": 786
},
{
"epoch": 1.574,
"grad_norm": 0.4881911277770996,
"learning_rate": 5.44157404326415e-06,
"loss": 0.829,
"step": 787
},
{
"epoch": 1.576,
"grad_norm": 0.5487351417541504,
"learning_rate": 5.429982794231221e-06,
"loss": 0.8169,
"step": 788
},
{
"epoch": 1.578,
"grad_norm": 0.5416744947433472,
"learning_rate": 5.41838921666158e-06,
"loss": 0.8234,
"step": 789
},
{
"epoch": 1.58,
"grad_norm": 0.5604218244552612,
"learning_rate": 5.406793373339292e-06,
"loss": 0.809,
"step": 790
},
{
"epoch": 1.582,
"grad_norm": 0.5446667075157166,
"learning_rate": 5.395195327060707e-06,
"loss": 0.8159,
"step": 791
},
{
"epoch": 1.584,
"grad_norm": 0.5692674517631531,
"learning_rate": 5.383595140634093e-06,
"loss": 0.8395,
"step": 792
},
{
"epoch": 1.586,
"grad_norm": 0.5842452049255371,
"learning_rate": 5.371992876879318e-06,
"loss": 0.8206,
"step": 793
},
{
"epoch": 1.588,
"grad_norm": 0.5441771149635315,
"learning_rate": 5.360388598627487e-06,
"loss": 0.8254,
"step": 794
},
{
"epoch": 1.59,
"grad_norm": 0.4671582579612732,
"learning_rate": 5.348782368720627e-06,
"loss": 0.7993,
"step": 795
},
{
"epoch": 1.592,
"grad_norm": 0.599138617515564,
"learning_rate": 5.337174250011326e-06,
"loss": 0.8183,
"step": 796
},
{
"epoch": 1.594,
"grad_norm": 0.4990823268890381,
"learning_rate": 5.325564305362404e-06,
"loss": 0.8081,
"step": 797
},
{
"epoch": 1.596,
"grad_norm": 0.5241624116897583,
"learning_rate": 5.3139525976465675e-06,
"loss": 0.8425,
"step": 798
},
{
"epoch": 1.598,
"grad_norm": 0.572844386100769,
"learning_rate": 5.3023391897460715e-06,
"loss": 0.8241,
"step": 799
},
{
"epoch": 1.6,
"grad_norm": 0.5502427220344543,
"learning_rate": 5.290724144552379e-06,
"loss": 0.7951,
"step": 800
},
{
"epoch": 1.602,
"grad_norm": 0.4780225157737732,
"learning_rate": 5.27910752496582e-06,
"loss": 0.8158,
"step": 801
},
{
"epoch": 1.604,
"grad_norm": 0.4722861051559448,
"learning_rate": 5.267489393895247e-06,
"loss": 0.7858,
"step": 802
},
{
"epoch": 1.606,
"grad_norm": 0.47437071800231934,
"learning_rate": 5.255869814257701e-06,
"loss": 0.8218,
"step": 803
},
{
"epoch": 1.608,
"grad_norm": 0.5584851503372192,
"learning_rate": 5.244248848978067e-06,
"loss": 0.8444,
"step": 804
},
{
"epoch": 1.61,
"grad_norm": 0.5402180552482605,
"learning_rate": 5.232626560988735e-06,
"loss": 0.8223,
"step": 805
},
{
"epoch": 1.612,
"grad_norm": 0.5967413187026978,
"learning_rate": 5.221003013229253e-06,
"loss": 0.8017,
"step": 806
},
{
"epoch": 1.614,
"grad_norm": 0.5010871887207031,
"learning_rate": 5.209378268645998e-06,
"loss": 0.8087,
"step": 807
},
{
"epoch": 1.616,
"grad_norm": 0.48053157329559326,
"learning_rate": 5.197752390191827e-06,
"loss": 0.7959,
"step": 808
},
{
"epoch": 1.618,
"grad_norm": 0.5281156301498413,
"learning_rate": 5.18612544082573e-06,
"loss": 0.8326,
"step": 809
},
{
"epoch": 1.62,
"grad_norm": 0.5991333723068237,
"learning_rate": 5.174497483512506e-06,
"loss": 0.792,
"step": 810
},
{
"epoch": 1.622,
"grad_norm": 0.46182727813720703,
"learning_rate": 5.162868581222407e-06,
"loss": 0.8037,
"step": 811
},
{
"epoch": 1.624,
"grad_norm": 0.5693475604057312,
"learning_rate": 5.151238796930804e-06,
"loss": 0.8441,
"step": 812
},
{
"epoch": 1.626,
"grad_norm": 0.4842833876609802,
"learning_rate": 5.139608193617846e-06,
"loss": 0.7986,
"step": 813
},
{
"epoch": 1.628,
"grad_norm": 0.4889780580997467,
"learning_rate": 5.127976834268112e-06,
"loss": 0.7985,
"step": 814
},
{
"epoch": 1.63,
"grad_norm": 0.5266144871711731,
"learning_rate": 5.116344781870282e-06,
"loss": 0.7968,
"step": 815
},
{
"epoch": 1.632,
"grad_norm": 0.4901430010795593,
"learning_rate": 5.1047120994167855e-06,
"loss": 0.7998,
"step": 816
},
{
"epoch": 1.634,
"grad_norm": 0.47501808404922485,
"learning_rate": 5.093078849903464e-06,
"loss": 0.7756,
"step": 817
},
{
"epoch": 1.636,
"grad_norm": 0.5496917963027954,
"learning_rate": 5.081445096329229e-06,
"loss": 0.7817,
"step": 818
},
{
"epoch": 1.638,
"grad_norm": 0.43850964307785034,
"learning_rate": 5.069810901695727e-06,
"loss": 0.8252,
"step": 819
},
{
"epoch": 1.64,
"grad_norm": 0.5552741289138794,
"learning_rate": 5.0581763290069865e-06,
"loss": 0.8149,
"step": 820
},
{
"epoch": 1.642,
"grad_norm": 0.5505797266960144,
"learning_rate": 5.046541441269085e-06,
"loss": 0.8131,
"step": 821
},
{
"epoch": 1.644,
"grad_norm": 0.4630671441555023,
"learning_rate": 5.034906301489808e-06,
"loss": 0.7794,
"step": 822
},
{
"epoch": 1.646,
"grad_norm": 0.49438831210136414,
"learning_rate": 5.0232709726783065e-06,
"loss": 0.8188,
"step": 823
},
{
"epoch": 1.648,
"grad_norm": 0.453652560710907,
"learning_rate": 5.011635517844753e-06,
"loss": 0.786,
"step": 824
},
{
"epoch": 1.65,
"grad_norm": 0.49963581562042236,
"learning_rate": 5e-06,
"loss": 0.8272,
"step": 825
},
{
"epoch": 1.652,
"grad_norm": 0.5175125598907471,
"learning_rate": 4.988364482155249e-06,
"loss": 0.7994,
"step": 826
},
{
"epoch": 1.654,
"grad_norm": 0.4657656252384186,
"learning_rate": 4.976729027321694e-06,
"loss": 0.7749,
"step": 827
},
{
"epoch": 1.656,
"grad_norm": 0.4991082549095154,
"learning_rate": 4.965093698510192e-06,
"loss": 0.8031,
"step": 828
},
{
"epoch": 1.658,
"grad_norm": 0.47223085165023804,
"learning_rate": 4.953458558730917e-06,
"loss": 0.826,
"step": 829
},
{
"epoch": 1.66,
"grad_norm": 0.5325574278831482,
"learning_rate": 4.941823670993016e-06,
"loss": 0.8018,
"step": 830
},
{
"epoch": 1.662,
"grad_norm": 0.5229061245918274,
"learning_rate": 4.9301890983042744e-06,
"loss": 0.8187,
"step": 831
},
{
"epoch": 1.664,
"grad_norm": 0.47113412618637085,
"learning_rate": 4.9185549036707715e-06,
"loss": 0.8358,
"step": 832
},
{
"epoch": 1.666,
"grad_norm": 0.49168315529823303,
"learning_rate": 4.906921150096538e-06,
"loss": 0.8258,
"step": 833
},
{
"epoch": 1.668,
"grad_norm": 0.5069917440414429,
"learning_rate": 4.895287900583216e-06,
"loss": 0.8118,
"step": 834
},
{
"epoch": 1.67,
"grad_norm": 0.518430769443512,
"learning_rate": 4.883655218129719e-06,
"loss": 0.8007,
"step": 835
},
{
"epoch": 1.672,
"grad_norm": 0.5682756900787354,
"learning_rate": 4.87202316573189e-06,
"loss": 0.7916,
"step": 836
},
{
"epoch": 1.674,
"grad_norm": 0.5430126190185547,
"learning_rate": 4.860391806382157e-06,
"loss": 0.841,
"step": 837
},
{
"epoch": 1.676,
"grad_norm": 0.516560435295105,
"learning_rate": 4.8487612030691975e-06,
"loss": 0.8029,
"step": 838
},
{
"epoch": 1.678,
"grad_norm": 0.5646497011184692,
"learning_rate": 4.837131418777595e-06,
"loss": 0.8312,
"step": 839
},
{
"epoch": 1.68,
"grad_norm": 0.5456799864768982,
"learning_rate": 4.825502516487497e-06,
"loss": 0.8234,
"step": 840
},
{
"epoch": 1.682,
"grad_norm": 0.5457205176353455,
"learning_rate": 4.813874559174271e-06,
"loss": 0.7864,
"step": 841
},
{
"epoch": 1.684,
"grad_norm": 0.5338269472122192,
"learning_rate": 4.802247609808175e-06,
"loss": 0.8137,
"step": 842
},
{
"epoch": 1.686,
"grad_norm": 0.5644640922546387,
"learning_rate": 4.7906217313540035e-06,
"loss": 0.8368,
"step": 843
},
{
"epoch": 1.688,
"grad_norm": 0.5289748311042786,
"learning_rate": 4.778996986770747e-06,
"loss": 0.8174,
"step": 844
},
{
"epoch": 1.69,
"grad_norm": 0.48800283670425415,
"learning_rate": 4.767373439011267e-06,
"loss": 0.8208,
"step": 845
},
{
"epoch": 1.692,
"grad_norm": 0.5060573816299438,
"learning_rate": 4.755751151021934e-06,
"loss": 0.7976,
"step": 846
},
{
"epoch": 1.694,
"grad_norm": 0.5901538133621216,
"learning_rate": 4.744130185742301e-06,
"loss": 0.7963,
"step": 847
},
{
"epoch": 1.696,
"grad_norm": 0.43792036175727844,
"learning_rate": 4.732510606104754e-06,
"loss": 0.7713,
"step": 848
},
{
"epoch": 1.698,
"grad_norm": 0.46001285314559937,
"learning_rate": 4.720892475034181e-06,
"loss": 0.7949,
"step": 849
},
{
"epoch": 1.7,
"grad_norm": 0.5100943446159363,
"learning_rate": 4.7092758554476215e-06,
"loss": 0.867,
"step": 850
},
{
"epoch": 1.702,
"grad_norm": 0.4933598041534424,
"learning_rate": 4.6976608102539285e-06,
"loss": 0.8546,
"step": 851
},
{
"epoch": 1.704,
"grad_norm": 0.4918576776981354,
"learning_rate": 4.686047402353433e-06,
"loss": 0.8219,
"step": 852
},
{
"epoch": 1.706,
"grad_norm": 0.451834499835968,
"learning_rate": 4.674435694637597e-06,
"loss": 0.8474,
"step": 853
},
{
"epoch": 1.708,
"grad_norm": 0.47564661502838135,
"learning_rate": 4.662825749988675e-06,
"loss": 0.7999,
"step": 854
},
{
"epoch": 1.71,
"grad_norm": 0.5556415319442749,
"learning_rate": 4.651217631279374e-06,
"loss": 0.7976,
"step": 855
},
{
"epoch": 1.712,
"grad_norm": 0.4383542537689209,
"learning_rate": 4.639611401372514e-06,
"loss": 0.8453,
"step": 856
},
{
"epoch": 1.714,
"grad_norm": 0.49957823753356934,
"learning_rate": 4.6280071231206845e-06,
"loss": 0.831,
"step": 857
},
{
"epoch": 1.716,
"grad_norm": 0.5337651968002319,
"learning_rate": 4.6164048593659076e-06,
"loss": 0.8025,
"step": 858
},
{
"epoch": 1.718,
"grad_norm": 0.5165109038352966,
"learning_rate": 4.604804672939295e-06,
"loss": 0.7944,
"step": 859
},
{
"epoch": 1.72,
"grad_norm": 0.5279600024223328,
"learning_rate": 4.59320662666071e-06,
"loss": 0.7874,
"step": 860
},
{
"epoch": 1.722,
"grad_norm": 0.4770665168762207,
"learning_rate": 4.581610783338424e-06,
"loss": 0.8016,
"step": 861
},
{
"epoch": 1.724,
"grad_norm": 0.513299822807312,
"learning_rate": 4.570017205768779e-06,
"loss": 0.7832,
"step": 862
},
{
"epoch": 1.726,
"grad_norm": 0.5326491594314575,
"learning_rate": 4.5584259567358505e-06,
"loss": 0.8094,
"step": 863
},
{
"epoch": 1.728,
"grad_norm": 0.46006107330322266,
"learning_rate": 4.546837099011101e-06,
"loss": 0.8345,
"step": 864
},
{
"epoch": 1.73,
"grad_norm": 0.5608815550804138,
"learning_rate": 4.53525069535304e-06,
"loss": 0.7947,
"step": 865
},
{
"epoch": 1.732,
"grad_norm": 0.5806840062141418,
"learning_rate": 4.523666808506893e-06,
"loss": 0.7849,
"step": 866
},
{
"epoch": 1.734,
"grad_norm": 0.47665658593177795,
"learning_rate": 4.512085501204254e-06,
"loss": 0.7925,
"step": 867
},
{
"epoch": 1.736,
"grad_norm": 0.4672011435031891,
"learning_rate": 4.500506836162746e-06,
"loss": 0.8218,
"step": 868
},
{
"epoch": 1.738,
"grad_norm": 0.505943238735199,
"learning_rate": 4.4889308760856826e-06,
"loss": 0.8182,
"step": 869
},
{
"epoch": 1.74,
"grad_norm": 0.6454104781150818,
"learning_rate": 4.477357683661734e-06,
"loss": 0.8054,
"step": 870
},
{
"epoch": 1.742,
"grad_norm": 0.46112364530563354,
"learning_rate": 4.465787321564576e-06,
"loss": 0.8006,
"step": 871
},
{
"epoch": 1.744,
"grad_norm": 0.4697955250740051,
"learning_rate": 4.45421985245256e-06,
"loss": 0.775,
"step": 872
},
{
"epoch": 1.746,
"grad_norm": 0.5397466421127319,
"learning_rate": 4.442655338968373e-06,
"loss": 0.8024,
"step": 873
},
{
"epoch": 1.748,
"grad_norm": 0.5473257303237915,
"learning_rate": 4.431093843738693e-06,
"loss": 0.7931,
"step": 874
},
{
"epoch": 1.75,
"grad_norm": 0.5032723546028137,
"learning_rate": 4.4195354293738484e-06,
"loss": 0.8341,
"step": 875
},
{
"epoch": 1.752,
"grad_norm": 0.44628429412841797,
"learning_rate": 4.4079801584674955e-06,
"loss": 0.768,
"step": 876
},
{
"epoch": 1.754,
"grad_norm": 0.5417639017105103,
"learning_rate": 4.396428093596258e-06,
"loss": 0.7915,
"step": 877
},
{
"epoch": 1.756,
"grad_norm": 0.5060398578643799,
"learning_rate": 4.384879297319398e-06,
"loss": 0.8007,
"step": 878
},
{
"epoch": 1.758,
"grad_norm": 0.44769251346588135,
"learning_rate": 4.373333832178478e-06,
"loss": 0.8074,
"step": 879
},
{
"epoch": 1.76,
"grad_norm": 0.5053737759590149,
"learning_rate": 4.361791760697027e-06,
"loss": 0.7963,
"step": 880
},
{
"epoch": 1.762,
"grad_norm": 0.4751681685447693,
"learning_rate": 4.3502531453801885e-06,
"loss": 0.8116,
"step": 881
},
{
"epoch": 1.764,
"grad_norm": 0.576187789440155,
"learning_rate": 4.3387180487143875e-06,
"loss": 0.8269,
"step": 882
},
{
"epoch": 1.766,
"grad_norm": 0.49040693044662476,
"learning_rate": 4.3271865331670036e-06,
"loss": 0.8172,
"step": 883
},
{
"epoch": 1.768,
"grad_norm": 0.44776564836502075,
"learning_rate": 4.315658661186016e-06,
"loss": 0.8085,
"step": 884
},
{
"epoch": 1.77,
"grad_norm": 0.5188393592834473,
"learning_rate": 4.304134495199675e-06,
"loss": 0.7949,
"step": 885
},
{
"epoch": 1.772,
"grad_norm": 0.5137274861335754,
"learning_rate": 4.2926140976161555e-06,
"loss": 0.7843,
"step": 886
},
{
"epoch": 1.774,
"grad_norm": 0.5027738809585571,
"learning_rate": 4.281097530823237e-06,
"loss": 0.8109,
"step": 887
},
{
"epoch": 1.776,
"grad_norm": 0.5102717280387878,
"learning_rate": 4.269584857187942e-06,
"loss": 0.8051,
"step": 888
},
{
"epoch": 1.778,
"grad_norm": 0.49146613478660583,
"learning_rate": 4.258076139056217e-06,
"loss": 0.8164,
"step": 889
},
{
"epoch": 1.78,
"grad_norm": 0.47425997257232666,
"learning_rate": 4.246571438752585e-06,
"loss": 0.7982,
"step": 890
},
{
"epoch": 1.782,
"grad_norm": 0.509017825126648,
"learning_rate": 4.23507081857981e-06,
"loss": 0.8061,
"step": 891
},
{
"epoch": 1.784,
"grad_norm": 0.5787748694419861,
"learning_rate": 4.2235743408185635e-06,
"loss": 0.7887,
"step": 892
},
{
"epoch": 1.786,
"grad_norm": 0.46629399061203003,
"learning_rate": 4.212082067727079e-06,
"loss": 0.8101,
"step": 893
},
{
"epoch": 1.788,
"grad_norm": 0.6112287044525146,
"learning_rate": 4.200594061540827e-06,
"loss": 0.7886,
"step": 894
},
{
"epoch": 1.79,
"grad_norm": 0.510998547077179,
"learning_rate": 4.189110384472164e-06,
"loss": 0.7793,
"step": 895
},
{
"epoch": 1.792,
"grad_norm": 0.5151376128196716,
"learning_rate": 4.1776310987100054e-06,
"loss": 0.7949,
"step": 896
},
{
"epoch": 1.794,
"grad_norm": 0.4956475496292114,
"learning_rate": 4.166156266419489e-06,
"loss": 0.787,
"step": 897
},
{
"epoch": 1.796,
"grad_norm": 0.5557451248168945,
"learning_rate": 4.154685949741631e-06,
"loss": 0.8264,
"step": 898
},
{
"epoch": 1.798,
"grad_norm": 0.50986248254776,
"learning_rate": 4.143220210792993e-06,
"loss": 0.7918,
"step": 899
},
{
"epoch": 1.8,
"grad_norm": 0.47877979278564453,
"learning_rate": 4.131759111665349e-06,
"loss": 0.7743,
"step": 900
},
{
"epoch": 1.802,
"grad_norm": 0.4706796407699585,
"learning_rate": 4.1203027144253466e-06,
"loss": 0.8061,
"step": 901
},
{
"epoch": 1.804,
"grad_norm": 0.5295886397361755,
"learning_rate": 4.108851081114169e-06,
"loss": 0.8092,
"step": 902
},
{
"epoch": 1.806,
"grad_norm": 0.461213082075119,
"learning_rate": 4.0974042737472005e-06,
"loss": 0.7893,
"step": 903
},
{
"epoch": 1.808,
"grad_norm": 0.5076703429222107,
"learning_rate": 4.0859623543136935e-06,
"loss": 0.8037,
"step": 904
},
{
"epoch": 1.81,
"grad_norm": 0.47249898314476013,
"learning_rate": 4.074525384776428e-06,
"loss": 0.8312,
"step": 905
},
{
"epoch": 1.812,
"grad_norm": 0.45104101300239563,
"learning_rate": 4.063093427071376e-06,
"loss": 0.7977,
"step": 906
},
{
"epoch": 1.814,
"grad_norm": 0.48447185754776,
"learning_rate": 4.051666543107377e-06,
"loss": 0.8073,
"step": 907
},
{
"epoch": 1.816,
"grad_norm": 0.5335273146629333,
"learning_rate": 4.040244794765783e-06,
"loss": 0.7914,
"step": 908
},
{
"epoch": 1.818,
"grad_norm": 0.4864543676376343,
"learning_rate": 4.028828243900141e-06,
"loss": 0.822,
"step": 909
},
{
"epoch": 1.82,
"grad_norm": 0.5080208778381348,
"learning_rate": 4.017416952335849e-06,
"loss": 0.8095,
"step": 910
},
{
"epoch": 1.822,
"grad_norm": 0.5187543630599976,
"learning_rate": 4.006010981869829e-06,
"loss": 0.7817,
"step": 911
},
{
"epoch": 1.824,
"grad_norm": 0.4684050679206848,
"learning_rate": 3.994610394270178e-06,
"loss": 0.8149,
"step": 912
},
{
"epoch": 1.826,
"grad_norm": 0.4266953468322754,
"learning_rate": 3.983215251275847e-06,
"loss": 0.808,
"step": 913
},
{
"epoch": 1.828,
"grad_norm": 0.41727814078330994,
"learning_rate": 3.971825614596308e-06,
"loss": 0.8322,
"step": 914
},
{
"epoch": 1.83,
"grad_norm": 0.5039946436882019,
"learning_rate": 3.960441545911205e-06,
"loss": 0.8174,
"step": 915
},
{
"epoch": 1.832,
"grad_norm": 0.48520591855049133,
"learning_rate": 3.949063106870031e-06,
"loss": 0.764,
"step": 916
},
{
"epoch": 1.834,
"grad_norm": 0.550553560256958,
"learning_rate": 3.9376903590917945e-06,
"loss": 0.8395,
"step": 917
},
{
"epoch": 1.836,
"grad_norm": 0.5808132886886597,
"learning_rate": 3.926323364164684e-06,
"loss": 0.8217,
"step": 918
},
{
"epoch": 1.838,
"grad_norm": 0.5393190979957581,
"learning_rate": 3.914962183645733e-06,
"loss": 0.7786,
"step": 919
},
{
"epoch": 1.84,
"grad_norm": 0.5174140334129333,
"learning_rate": 3.903606879060483e-06,
"loss": 0.8074,
"step": 920
},
{
"epoch": 1.842,
"grad_norm": 0.5107792615890503,
"learning_rate": 3.892257511902664e-06,
"loss": 0.7904,
"step": 921
},
{
"epoch": 1.844,
"grad_norm": 0.5404635667800903,
"learning_rate": 3.880914143633844e-06,
"loss": 0.8241,
"step": 922
},
{
"epoch": 1.846,
"grad_norm": 0.5038130283355713,
"learning_rate": 3.869576835683109e-06,
"loss": 0.8188,
"step": 923
},
{
"epoch": 1.848,
"grad_norm": 0.4418555796146393,
"learning_rate": 3.8582456494467214e-06,
"loss": 0.8185,
"step": 924
},
{
"epoch": 1.85,
"grad_norm": 0.46331271529197693,
"learning_rate": 3.8469206462878e-06,
"loss": 0.8046,
"step": 925
},
{
"epoch": 1.852,
"grad_norm": 0.48213040828704834,
"learning_rate": 3.835601887535971e-06,
"loss": 0.8279,
"step": 926
},
{
"epoch": 1.854,
"grad_norm": 0.48893412947654724,
"learning_rate": 3.82428943448705e-06,
"loss": 0.7719,
"step": 927
},
{
"epoch": 1.856,
"grad_norm": 0.4834563136100769,
"learning_rate": 3.812983348402703e-06,
"loss": 0.7815,
"step": 928
},
{
"epoch": 1.858,
"grad_norm": 0.4763684570789337,
"learning_rate": 3.8016836905101157e-06,
"loss": 0.7872,
"step": 929
},
{
"epoch": 1.86,
"grad_norm": 0.4794900417327881,
"learning_rate": 3.790390522001662e-06,
"loss": 0.7982,
"step": 930
},
{
"epoch": 1.862,
"grad_norm": 0.4603365659713745,
"learning_rate": 3.7791039040345743e-06,
"loss": 0.806,
"step": 931
},
{
"epoch": 1.8639999999999999,
"grad_norm": 0.5639286041259766,
"learning_rate": 3.767823897730612e-06,
"loss": 0.7946,
"step": 932
},
{
"epoch": 1.866,
"grad_norm": 0.4854590594768524,
"learning_rate": 3.756550564175727e-06,
"loss": 0.7826,
"step": 933
},
{
"epoch": 1.8679999999999999,
"grad_norm": 0.4922090768814087,
"learning_rate": 3.745283964419736e-06,
"loss": 0.8108,
"step": 934
},
{
"epoch": 1.87,
"grad_norm": 0.46560823917388916,
"learning_rate": 3.7340241594759917e-06,
"loss": 0.8257,
"step": 935
},
{
"epoch": 1.8719999999999999,
"grad_norm": 0.45512527227401733,
"learning_rate": 3.7227712103210485e-06,
"loss": 0.8273,
"step": 936
},
{
"epoch": 1.874,
"grad_norm": 0.5078556537628174,
"learning_rate": 3.7115251778943314e-06,
"loss": 0.7712,
"step": 937
},
{
"epoch": 1.876,
"grad_norm": 0.483775794506073,
"learning_rate": 3.700286123097814e-06,
"loss": 0.7934,
"step": 938
},
{
"epoch": 1.8780000000000001,
"grad_norm": 0.4577796757221222,
"learning_rate": 3.6890541067956775e-06,
"loss": 0.7992,
"step": 939
},
{
"epoch": 1.88,
"grad_norm": 0.4452671408653259,
"learning_rate": 3.6778291898139907e-06,
"loss": 0.7963,
"step": 940
},
{
"epoch": 1.8820000000000001,
"grad_norm": 0.48819735646247864,
"learning_rate": 3.6666114329403723e-06,
"loss": 0.8253,
"step": 941
},
{
"epoch": 1.884,
"grad_norm": 0.505906879901886,
"learning_rate": 3.655400896923672e-06,
"loss": 0.8247,
"step": 942
},
{
"epoch": 1.8860000000000001,
"grad_norm": 0.4865492582321167,
"learning_rate": 3.6441976424736315e-06,
"loss": 0.8308,
"step": 943
},
{
"epoch": 1.888,
"grad_norm": 0.49690744280815125,
"learning_rate": 3.633001730260558e-06,
"loss": 0.7937,
"step": 944
},
{
"epoch": 1.8900000000000001,
"grad_norm": 0.5015735626220703,
"learning_rate": 3.6218132209150047e-06,
"loss": 0.7736,
"step": 945
},
{
"epoch": 1.892,
"grad_norm": 0.4772734045982361,
"learning_rate": 3.6106321750274275e-06,
"loss": 0.8031,
"step": 946
},
{
"epoch": 1.8940000000000001,
"grad_norm": 0.4924619495868683,
"learning_rate": 3.5994586531478672e-06,
"loss": 0.8156,
"step": 947
},
{
"epoch": 1.896,
"grad_norm": 0.5400336980819702,
"learning_rate": 3.5882927157856175e-06,
"loss": 0.8079,
"step": 948
},
{
"epoch": 1.8980000000000001,
"grad_norm": 0.47806546092033386,
"learning_rate": 3.577134423408906e-06,
"loss": 0.7797,
"step": 949
},
{
"epoch": 1.9,
"grad_norm": 0.4568106532096863,
"learning_rate": 3.5659838364445505e-06,
"loss": 0.7719,
"step": 950
},
{
"epoch": 1.9020000000000001,
"grad_norm": 0.4688374400138855,
"learning_rate": 3.5548410152776414e-06,
"loss": 0.7971,
"step": 951
},
{
"epoch": 1.904,
"grad_norm": 0.4213714003562927,
"learning_rate": 3.543706020251223e-06,
"loss": 0.8177,
"step": 952
},
{
"epoch": 1.9060000000000001,
"grad_norm": 0.5132880210876465,
"learning_rate": 3.5325789116659493e-06,
"loss": 0.8076,
"step": 953
},
{
"epoch": 1.908,
"grad_norm": 0.5077955722808838,
"learning_rate": 3.521459749779769e-06,
"loss": 0.7994,
"step": 954
},
{
"epoch": 1.9100000000000001,
"grad_norm": 0.5192594528198242,
"learning_rate": 3.51034859480759e-06,
"loss": 0.7992,
"step": 955
},
{
"epoch": 1.912,
"grad_norm": 0.5100328326225281,
"learning_rate": 3.4992455069209717e-06,
"loss": 0.8076,
"step": 956
},
{
"epoch": 1.9140000000000001,
"grad_norm": 0.42553383111953735,
"learning_rate": 3.488150546247778e-06,
"loss": 0.817,
"step": 957
},
{
"epoch": 1.916,
"grad_norm": 0.4890826642513275,
"learning_rate": 3.4770637728718608e-06,
"loss": 0.8088,
"step": 958
},
{
"epoch": 1.9180000000000001,
"grad_norm": 0.4364239573478699,
"learning_rate": 3.465985246832739e-06,
"loss": 0.8242,
"step": 959
},
{
"epoch": 1.92,
"grad_norm": 0.418891578912735,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.7886,
"step": 960
},
{
"epoch": 1.9220000000000002,
"grad_norm": 0.4589402377605438,
"learning_rate": 3.4438531766993012e-06,
"loss": 0.7952,
"step": 961
},
{
"epoch": 1.924,
"grad_norm": 0.4044283926486969,
"learning_rate": 3.4327997524594026e-06,
"loss": 0.8409,
"step": 962
},
{
"epoch": 1.9260000000000002,
"grad_norm": 0.4343528151512146,
"learning_rate": 3.4217548152644887e-06,
"loss": 0.8252,
"step": 963
},
{
"epoch": 1.928,
"grad_norm": 0.46095573902130127,
"learning_rate": 3.4107184249275114e-06,
"loss": 0.8167,
"step": 964
},
{
"epoch": 1.9300000000000002,
"grad_norm": 0.4581632614135742,
"learning_rate": 3.399690641215142e-06,
"loss": 0.7869,
"step": 965
},
{
"epoch": 1.932,
"grad_norm": 0.48575082421302795,
"learning_rate": 3.3886715238474454e-06,
"loss": 0.7851,
"step": 966
},
{
"epoch": 1.9340000000000002,
"grad_norm": 0.47937479615211487,
"learning_rate": 3.3776611324975496e-06,
"loss": 0.7889,
"step": 967
},
{
"epoch": 1.936,
"grad_norm": 0.44034528732299805,
"learning_rate": 3.3666595267913293e-06,
"loss": 0.8248,
"step": 968
},
{
"epoch": 1.938,
"grad_norm": 0.47162750363349915,
"learning_rate": 3.355666766307084e-06,
"loss": 0.7813,
"step": 969
},
{
"epoch": 1.94,
"grad_norm": 0.44156888127326965,
"learning_rate": 3.3446829105752103e-06,
"loss": 0.8,
"step": 970
},
{
"epoch": 1.942,
"grad_norm": 0.5223917365074158,
"learning_rate": 3.3337080190778816e-06,
"loss": 0.7972,
"step": 971
},
{
"epoch": 1.944,
"grad_norm": 0.46116048097610474,
"learning_rate": 3.322742151248726e-06,
"loss": 0.7999,
"step": 972
},
{
"epoch": 1.946,
"grad_norm": 0.4879921078681946,
"learning_rate": 3.311785366472506e-06,
"loss": 0.8009,
"step": 973
},
{
"epoch": 1.948,
"grad_norm": 0.4909932017326355,
"learning_rate": 3.3008377240847955e-06,
"loss": 0.8025,
"step": 974
},
{
"epoch": 1.95,
"grad_norm": 0.5128923058509827,
"learning_rate": 3.289899283371657e-06,
"loss": 0.8343,
"step": 975
},
{
"epoch": 1.952,
"grad_norm": 0.47355395555496216,
"learning_rate": 3.2789701035693242e-06,
"loss": 0.8171,
"step": 976
},
{
"epoch": 1.954,
"grad_norm": 0.4412969648838043,
"learning_rate": 3.268050243863877e-06,
"loss": 0.8041,
"step": 977
},
{
"epoch": 1.956,
"grad_norm": 0.42818376421928406,
"learning_rate": 3.2571397633909252e-06,
"loss": 0.7968,
"step": 978
},
{
"epoch": 1.958,
"grad_norm": 0.4641779363155365,
"learning_rate": 3.246238721235283e-06,
"loss": 0.7914,
"step": 979
},
{
"epoch": 1.96,
"grad_norm": 0.46108317375183105,
"learning_rate": 3.2353471764306567e-06,
"loss": 0.7926,
"step": 980
},
{
"epoch": 1.962,
"grad_norm": 0.4477689266204834,
"learning_rate": 3.224465187959316e-06,
"loss": 0.8085,
"step": 981
},
{
"epoch": 1.964,
"grad_norm": 0.43308204412460327,
"learning_rate": 3.2135928147517803e-06,
"loss": 0.792,
"step": 982
},
{
"epoch": 1.966,
"grad_norm": 0.43959012627601624,
"learning_rate": 3.2027301156865015e-06,
"loss": 0.7827,
"step": 983
},
{
"epoch": 1.968,
"grad_norm": 0.47771644592285156,
"learning_rate": 3.1918771495895395e-06,
"loss": 0.8044,
"step": 984
},
{
"epoch": 1.97,
"grad_norm": 0.486515611410141,
"learning_rate": 3.1810339752342446e-06,
"loss": 0.7992,
"step": 985
},
{
"epoch": 1.972,
"grad_norm": 0.41616618633270264,
"learning_rate": 3.1702006513409393e-06,
"loss": 0.807,
"step": 986
},
{
"epoch": 1.974,
"grad_norm": 0.43619367480278015,
"learning_rate": 3.1593772365766107e-06,
"loss": 0.795,
"step": 987
},
{
"epoch": 1.976,
"grad_norm": 0.4389900863170624,
"learning_rate": 3.148563789554575e-06,
"loss": 0.8113,
"step": 988
},
{
"epoch": 1.978,
"grad_norm": 0.4348282516002655,
"learning_rate": 3.137760368834169e-06,
"loss": 0.7804,
"step": 989
},
{
"epoch": 1.98,
"grad_norm": 0.4243306815624237,
"learning_rate": 3.12696703292044e-06,
"loss": 0.8005,
"step": 990
},
{
"epoch": 1.982,
"grad_norm": 0.4412820339202881,
"learning_rate": 3.1161838402638158e-06,
"loss": 0.8126,
"step": 991
},
{
"epoch": 1.984,
"grad_norm": 0.44826000928878784,
"learning_rate": 3.105410849259796e-06,
"loss": 0.8276,
"step": 992
},
{
"epoch": 1.986,
"grad_norm": 0.448371559381485,
"learning_rate": 3.09464811824863e-06,
"loss": 0.8171,
"step": 993
},
{
"epoch": 1.988,
"grad_norm": 0.4271306097507477,
"learning_rate": 3.0838957055150136e-06,
"loss": 0.8174,
"step": 994
},
{
"epoch": 1.99,
"grad_norm": 0.455337792634964,
"learning_rate": 3.0731536692877596e-06,
"loss": 0.7809,
"step": 995
},
{
"epoch": 1.992,
"grad_norm": 0.4774368405342102,
"learning_rate": 3.0624220677394854e-06,
"loss": 0.8019,
"step": 996
},
{
"epoch": 1.994,
"grad_norm": 0.4929225444793701,
"learning_rate": 3.0517009589863057e-06,
"loss": 0.7836,
"step": 997
},
{
"epoch": 1.996,
"grad_norm": 0.42296701669692993,
"learning_rate": 3.040990401087508e-06,
"loss": 0.7909,
"step": 998
},
{
"epoch": 1.998,
"grad_norm": 0.43920400738716125,
"learning_rate": 3.030290452045245e-06,
"loss": 0.8158,
"step": 999
},
{
"epoch": 2.0,
"grad_norm": 0.4366856813430786,
"learning_rate": 3.019601169804216e-06,
"loss": 0.8151,
"step": 1000
},
{
"epoch": 2.002,
"grad_norm": 0.4748608469963074,
"learning_rate": 3.0089226122513583e-06,
"loss": 0.7909,
"step": 1001
},
{
"epoch": 2.004,
"grad_norm": 0.44225451350212097,
"learning_rate": 2.9982548372155264e-06,
"loss": 0.7375,
"step": 1002
},
{
"epoch": 2.006,
"grad_norm": 0.46516430377960205,
"learning_rate": 2.9875979024671846e-06,
"loss": 0.7774,
"step": 1003
},
{
"epoch": 2.008,
"grad_norm": 0.5143842697143555,
"learning_rate": 2.9769518657180953e-06,
"loss": 0.7638,
"step": 1004
},
{
"epoch": 2.01,
"grad_norm": 0.429679811000824,
"learning_rate": 2.966316784621e-06,
"loss": 0.7848,
"step": 1005
},
{
"epoch": 2.012,
"grad_norm": 0.4154725670814514,
"learning_rate": 2.9556927167693107e-06,
"loss": 0.7841,
"step": 1006
},
{
"epoch": 2.014,
"grad_norm": 0.4369898736476898,
"learning_rate": 2.945079719696802e-06,
"loss": 0.7826,
"step": 1007
},
{
"epoch": 2.016,
"grad_norm": 0.4525117874145508,
"learning_rate": 2.934477850877292e-06,
"loss": 0.7601,
"step": 1008
},
{
"epoch": 2.018,
"grad_norm": 0.494784951210022,
"learning_rate": 2.9238871677243354e-06,
"loss": 0.7692,
"step": 1009
},
{
"epoch": 2.02,
"grad_norm": 0.43322470784187317,
"learning_rate": 2.9133077275909112e-06,
"loss": 0.7638,
"step": 1010
},
{
"epoch": 2.022,
"grad_norm": 0.4489244818687439,
"learning_rate": 2.9027395877691143e-06,
"loss": 0.7872,
"step": 1011
},
{
"epoch": 2.024,
"grad_norm": 0.4334550201892853,
"learning_rate": 2.892182805489846e-06,
"loss": 0.7805,
"step": 1012
},
{
"epoch": 2.026,
"grad_norm": 0.48196423053741455,
"learning_rate": 2.8816374379224932e-06,
"loss": 0.77,
"step": 1013
},
{
"epoch": 2.028,
"grad_norm": 0.46943002939224243,
"learning_rate": 2.871103542174637e-06,
"loss": 0.7638,
"step": 1014
},
{
"epoch": 2.03,
"grad_norm": 0.40652287006378174,
"learning_rate": 2.86058117529173e-06,
"loss": 0.7708,
"step": 1015
},
{
"epoch": 2.032,
"grad_norm": 0.43071281909942627,
"learning_rate": 2.8500703942567874e-06,
"loss": 0.8108,
"step": 1016
},
{
"epoch": 2.034,
"grad_norm": 0.46008026599884033,
"learning_rate": 2.839571255990088e-06,
"loss": 0.7641,
"step": 1017
},
{
"epoch": 2.036,
"grad_norm": 0.409462571144104,
"learning_rate": 2.82908381734886e-06,
"loss": 0.7784,
"step": 1018
},
{
"epoch": 2.038,
"grad_norm": 0.4535731077194214,
"learning_rate": 2.818608135126967e-06,
"loss": 0.7266,
"step": 1019
},
{
"epoch": 2.04,
"grad_norm": 0.4551372528076172,
"learning_rate": 2.8081442660546126e-06,
"loss": 0.7685,
"step": 1020
},
{
"epoch": 2.042,
"grad_norm": 0.43031415343284607,
"learning_rate": 2.797692266798027e-06,
"loss": 0.7547,
"step": 1021
},
{
"epoch": 2.044,
"grad_norm": 0.47990843653678894,
"learning_rate": 2.7872521939591556e-06,
"loss": 0.7547,
"step": 1022
},
{
"epoch": 2.046,
"grad_norm": 0.4494011104106903,
"learning_rate": 2.776824104075364e-06,
"loss": 0.7153,
"step": 1023
},
{
"epoch": 2.048,
"grad_norm": 0.42083409428596497,
"learning_rate": 2.7664080536191178e-06,
"loss": 0.7896,
"step": 1024
},
{
"epoch": 2.05,
"grad_norm": 0.42462587356567383,
"learning_rate": 2.7560040989976894e-06,
"loss": 0.7692,
"step": 1025
},
{
"epoch": 2.052,
"grad_norm": 0.44874322414398193,
"learning_rate": 2.7456122965528475e-06,
"loss": 0.7709,
"step": 1026
},
{
"epoch": 2.054,
"grad_norm": 0.47028055787086487,
"learning_rate": 2.7352327025605464e-06,
"loss": 0.7434,
"step": 1027
},
{
"epoch": 2.056,
"grad_norm": 0.42561060190200806,
"learning_rate": 2.724865373230632e-06,
"loss": 0.7651,
"step": 1028
},
{
"epoch": 2.058,
"grad_norm": 0.4689915180206299,
"learning_rate": 2.714510364706531e-06,
"loss": 0.7594,
"step": 1029
},
{
"epoch": 2.06,
"grad_norm": 0.44147977232933044,
"learning_rate": 2.7041677330649408e-06,
"loss": 0.7366,
"step": 1030
},
{
"epoch": 2.062,
"grad_norm": 0.493244469165802,
"learning_rate": 2.6938375343155464e-06,
"loss": 0.7734,
"step": 1031
},
{
"epoch": 2.064,
"grad_norm": 0.42558082938194275,
"learning_rate": 2.683519824400693e-06,
"loss": 0.7742,
"step": 1032
},
{
"epoch": 2.066,
"grad_norm": 0.4560059607028961,
"learning_rate": 2.6732146591950924e-06,
"loss": 0.7595,
"step": 1033
},
{
"epoch": 2.068,
"grad_norm": 0.43729954957962036,
"learning_rate": 2.662922094505529e-06,
"loss": 0.7466,
"step": 1034
},
{
"epoch": 2.07,
"grad_norm": 0.46198272705078125,
"learning_rate": 2.6526421860705474e-06,
"loss": 0.7644,
"step": 1035
},
{
"epoch": 2.072,
"grad_norm": 0.4170866310596466,
"learning_rate": 2.6423749895601494e-06,
"loss": 0.7849,
"step": 1036
},
{
"epoch": 2.074,
"grad_norm": 0.4126523733139038,
"learning_rate": 2.6321205605755002e-06,
"loss": 0.7772,
"step": 1037
},
{
"epoch": 2.076,
"grad_norm": 0.4304735064506531,
"learning_rate": 2.6218789546486235e-06,
"loss": 0.7569,
"step": 1038
},
{
"epoch": 2.078,
"grad_norm": 0.42081594467163086,
"learning_rate": 2.611650227242102e-06,
"loss": 0.7815,
"step": 1039
},
{
"epoch": 2.08,
"grad_norm": 0.4308110773563385,
"learning_rate": 2.601434433748771e-06,
"loss": 0.7791,
"step": 1040
},
{
"epoch": 2.082,
"grad_norm": 0.42325472831726074,
"learning_rate": 2.5912316294914232e-06,
"loss": 0.7789,
"step": 1041
},
{
"epoch": 2.084,
"grad_norm": 0.44107508659362793,
"learning_rate": 2.581041869722519e-06,
"loss": 0.7685,
"step": 1042
},
{
"epoch": 2.086,
"grad_norm": 0.4687047004699707,
"learning_rate": 2.5708652096238674e-06,
"loss": 0.7712,
"step": 1043
},
{
"epoch": 2.088,
"grad_norm": 0.5534901022911072,
"learning_rate": 2.560701704306336e-06,
"loss": 0.7938,
"step": 1044
},
{
"epoch": 2.09,
"grad_norm": 0.46949270367622375,
"learning_rate": 2.550551408809566e-06,
"loss": 0.7937,
"step": 1045
},
{
"epoch": 2.092,
"grad_norm": 0.43184739351272583,
"learning_rate": 2.540414378101647e-06,
"loss": 0.7584,
"step": 1046
},
{
"epoch": 2.094,
"grad_norm": 0.41342681646347046,
"learning_rate": 2.5302906670788463e-06,
"loss": 0.7393,
"step": 1047
},
{
"epoch": 2.096,
"grad_norm": 0.5106869339942932,
"learning_rate": 2.52018033056529e-06,
"loss": 0.7937,
"step": 1048
},
{
"epoch": 2.098,
"grad_norm": 0.4273001551628113,
"learning_rate": 2.5100834233126827e-06,
"loss": 0.7461,
"step": 1049
},
{
"epoch": 2.1,
"grad_norm": 0.43153709173202515,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.7631,
"step": 1050
},
{
"epoch": 2.102,
"grad_norm": 0.4402224123477936,
"learning_rate": 2.489930115233199e-06,
"loss": 0.8026,
"step": 1051
},
{
"epoch": 2.104,
"grad_norm": 0.45185586810112,
"learning_rate": 2.4798738235449164e-06,
"loss": 0.7683,
"step": 1052
},
{
"epoch": 2.106,
"grad_norm": 0.4645299017429352,
"learning_rate": 2.469831179394182e-06,
"loss": 0.7985,
"step": 1053
},
{
"epoch": 2.108,
"grad_norm": 0.4175006151199341,
"learning_rate": 2.4598022371661113e-06,
"loss": 0.8002,
"step": 1054
},
{
"epoch": 2.11,
"grad_norm": 0.4740184545516968,
"learning_rate": 2.4497870511716237e-06,
"loss": 0.796,
"step": 1055
},
{
"epoch": 2.112,
"grad_norm": 0.485872745513916,
"learning_rate": 2.4397856756471435e-06,
"loss": 0.7947,
"step": 1056
},
{
"epoch": 2.114,
"grad_norm": 0.43455788493156433,
"learning_rate": 2.429798164754299e-06,
"loss": 0.7656,
"step": 1057
},
{
"epoch": 2.116,
"grad_norm": 0.44379016757011414,
"learning_rate": 2.4198245725796427e-06,
"loss": 0.7622,
"step": 1058
},
{
"epoch": 2.118,
"grad_norm": 0.4761369824409485,
"learning_rate": 2.40986495313435e-06,
"loss": 0.7647,
"step": 1059
},
{
"epoch": 2.12,
"grad_norm": 0.4579893946647644,
"learning_rate": 2.3999193603539234e-06,
"loss": 0.7693,
"step": 1060
},
{
"epoch": 2.122,
"grad_norm": 0.4474698007106781,
"learning_rate": 2.3899878480979098e-06,
"loss": 0.7826,
"step": 1061
},
{
"epoch": 2.124,
"grad_norm": 0.4562933146953583,
"learning_rate": 2.380070470149605e-06,
"loss": 0.8071,
"step": 1062
},
{
"epoch": 2.126,
"grad_norm": 0.5011743903160095,
"learning_rate": 2.3701672802157567e-06,
"loss": 0.7657,
"step": 1063
},
{
"epoch": 2.128,
"grad_norm": 0.4564701020717621,
"learning_rate": 2.3602783319262847e-06,
"loss": 0.746,
"step": 1064
},
{
"epoch": 2.13,
"grad_norm": 0.4964149296283722,
"learning_rate": 2.3504036788339763e-06,
"loss": 0.7611,
"step": 1065
},
{
"epoch": 2.132,
"grad_norm": 0.4345194101333618,
"learning_rate": 2.340543374414212e-06,
"loss": 0.7925,
"step": 1066
},
{
"epoch": 2.134,
"grad_norm": 0.40942201018333435,
"learning_rate": 2.330697472064667e-06,
"loss": 0.7567,
"step": 1067
},
{
"epoch": 2.136,
"grad_norm": 0.4680323302745819,
"learning_rate": 2.320866025105016e-06,
"loss": 0.7702,
"step": 1068
},
{
"epoch": 2.138,
"grad_norm": 0.4870208501815796,
"learning_rate": 2.3110490867766644e-06,
"loss": 0.7438,
"step": 1069
},
{
"epoch": 2.14,
"grad_norm": 0.4345625340938568,
"learning_rate": 2.3012467102424373e-06,
"loss": 0.754,
"step": 1070
},
{
"epoch": 2.142,
"grad_norm": 0.4366339445114136,
"learning_rate": 2.2914589485863015e-06,
"loss": 0.7586,
"step": 1071
},
{
"epoch": 2.144,
"grad_norm": 0.4483181834220886,
"learning_rate": 2.2816858548130837e-06,
"loss": 0.8244,
"step": 1072
},
{
"epoch": 2.146,
"grad_norm": 0.4304027259349823,
"learning_rate": 2.2719274818481767e-06,
"loss": 0.7521,
"step": 1073
},
{
"epoch": 2.148,
"grad_norm": 0.41920629143714905,
"learning_rate": 2.2621838825372496e-06,
"loss": 0.797,
"step": 1074
},
{
"epoch": 2.15,
"grad_norm": 0.42902520298957825,
"learning_rate": 2.2524551096459703e-06,
"loss": 0.772,
"step": 1075
},
{
"epoch": 2.152,
"grad_norm": 0.42061668634414673,
"learning_rate": 2.2427412158597133e-06,
"loss": 0.7737,
"step": 1076
},
{
"epoch": 2.154,
"grad_norm": 0.4826355278491974,
"learning_rate": 2.23304225378328e-06,
"loss": 0.7701,
"step": 1077
},
{
"epoch": 2.156,
"grad_norm": 0.44024163484573364,
"learning_rate": 2.2233582759406065e-06,
"loss": 0.7849,
"step": 1078
},
{
"epoch": 2.158,
"grad_norm": 0.4087906777858734,
"learning_rate": 2.213689334774479e-06,
"loss": 0.7615,
"step": 1079
},
{
"epoch": 2.16,
"grad_norm": 0.4429933428764343,
"learning_rate": 2.204035482646267e-06,
"loss": 0.7765,
"step": 1080
},
{
"epoch": 2.162,
"grad_norm": 0.4163159430027008,
"learning_rate": 2.1943967718356123e-06,
"loss": 0.7497,
"step": 1081
},
{
"epoch": 2.164,
"grad_norm": 0.4267112910747528,
"learning_rate": 2.184773254540169e-06,
"loss": 0.7756,
"step": 1082
},
{
"epoch": 2.166,
"grad_norm": 0.40076255798339844,
"learning_rate": 2.175164982875311e-06,
"loss": 0.7566,
"step": 1083
},
{
"epoch": 2.168,
"grad_norm": 0.4351121783256531,
"learning_rate": 2.165572008873845e-06,
"loss": 0.772,
"step": 1084
},
{
"epoch": 2.17,
"grad_norm": 0.40487784147262573,
"learning_rate": 2.155994384485742e-06,
"loss": 0.7754,
"step": 1085
},
{
"epoch": 2.172,
"grad_norm": 0.4281260371208191,
"learning_rate": 2.146432161577842e-06,
"loss": 0.7542,
"step": 1086
},
{
"epoch": 2.174,
"grad_norm": 0.44176384806632996,
"learning_rate": 2.1368853919335835e-06,
"loss": 0.7782,
"step": 1087
},
{
"epoch": 2.176,
"grad_norm": 0.4439259469509125,
"learning_rate": 2.12735412725272e-06,
"loss": 0.7806,
"step": 1088
},
{
"epoch": 2.178,
"grad_norm": 0.4487484097480774,
"learning_rate": 2.1178384191510344e-06,
"loss": 0.7741,
"step": 1089
},
{
"epoch": 2.18,
"grad_norm": 0.4279979467391968,
"learning_rate": 2.1083383191600676e-06,
"loss": 0.7632,
"step": 1090
},
{
"epoch": 2.182,
"grad_norm": 0.43812668323516846,
"learning_rate": 2.0988538787268374e-06,
"loss": 0.7504,
"step": 1091
},
{
"epoch": 2.184,
"grad_norm": 0.4351823925971985,
"learning_rate": 2.0893851492135536e-06,
"loss": 0.7838,
"step": 1092
},
{
"epoch": 2.186,
"grad_norm": 0.4904820919036865,
"learning_rate": 2.0799321818973488e-06,
"loss": 0.7421,
"step": 1093
},
{
"epoch": 2.188,
"grad_norm": 0.38253724575042725,
"learning_rate": 2.0704950279699986e-06,
"loss": 0.7744,
"step": 1094
},
{
"epoch": 2.19,
"grad_norm": 0.4415190815925598,
"learning_rate": 2.061073738537635e-06,
"loss": 0.7871,
"step": 1095
},
{
"epoch": 2.192,
"grad_norm": 0.4349212050437927,
"learning_rate": 2.0516683646204836e-06,
"loss": 0.7796,
"step": 1096
},
{
"epoch": 2.194,
"grad_norm": 0.45319244265556335,
"learning_rate": 2.0422789571525813e-06,
"loss": 0.7536,
"step": 1097
},
{
"epoch": 2.196,
"grad_norm": 0.4984447658061981,
"learning_rate": 2.0329055669814936e-06,
"loss": 0.7683,
"step": 1098
},
{
"epoch": 2.198,
"grad_norm": 0.4400745630264282,
"learning_rate": 2.023548244868051e-06,
"loss": 0.7293,
"step": 1099
},
{
"epoch": 2.2,
"grad_norm": 0.40443891286849976,
"learning_rate": 2.0142070414860704e-06,
"loss": 0.7865,
"step": 1100
},
{
"epoch": 2.202,
"grad_norm": 0.45138484239578247,
"learning_rate": 2.0048820074220716e-06,
"loss": 0.7632,
"step": 1101
},
{
"epoch": 2.204,
"grad_norm": 0.4614701569080353,
"learning_rate": 1.9955731931750182e-06,
"loss": 0.7997,
"step": 1102
},
{
"epoch": 2.206,
"grad_norm": 0.5170438289642334,
"learning_rate": 1.9862806491560315e-06,
"loss": 0.7392,
"step": 1103
},
{
"epoch": 2.208,
"grad_norm": 0.4604811668395996,
"learning_rate": 1.977004425688126e-06,
"loss": 0.7267,
"step": 1104
},
{
"epoch": 2.21,
"grad_norm": 0.4471680521965027,
"learning_rate": 1.9677445730059348e-06,
"loss": 0.7519,
"step": 1105
},
{
"epoch": 2.212,
"grad_norm": 0.4620627760887146,
"learning_rate": 1.958501141255427e-06,
"loss": 0.7664,
"step": 1106
},
{
"epoch": 2.214,
"grad_norm": 0.4312162697315216,
"learning_rate": 1.9492741804936623e-06,
"loss": 0.7635,
"step": 1107
},
{
"epoch": 2.216,
"grad_norm": 0.48040759563446045,
"learning_rate": 1.9400637406884875e-06,
"loss": 0.7728,
"step": 1108
},
{
"epoch": 2.218,
"grad_norm": 0.48567670583724976,
"learning_rate": 1.9308698717182874e-06,
"loss": 0.7546,
"step": 1109
},
{
"epoch": 2.22,
"grad_norm": 0.4145522117614746,
"learning_rate": 1.9216926233717087e-06,
"loss": 0.7375,
"step": 1110
},
{
"epoch": 2.222,
"grad_norm": 0.3948729634284973,
"learning_rate": 1.9125320453473923e-06,
"loss": 0.7747,
"step": 1111
},
{
"epoch": 2.224,
"grad_norm": 0.40589264035224915,
"learning_rate": 1.9033881872537009e-06,
"loss": 0.7519,
"step": 1112
},
{
"epoch": 2.226,
"grad_norm": 0.42979031801223755,
"learning_rate": 1.8942610986084487e-06,
"loss": 0.8096,
"step": 1113
},
{
"epoch": 2.228,
"grad_norm": 0.43356558680534363,
"learning_rate": 1.88515082883864e-06,
"loss": 0.7853,
"step": 1114
},
{
"epoch": 2.23,
"grad_norm": 0.4155976474285126,
"learning_rate": 1.8760574272802002e-06,
"loss": 0.7433,
"step": 1115
},
{
"epoch": 2.232,
"grad_norm": 0.41546252369880676,
"learning_rate": 1.8669809431776991e-06,
"loss": 0.771,
"step": 1116
},
{
"epoch": 2.234,
"grad_norm": 0.42242228984832764,
"learning_rate": 1.8579214256840938e-06,
"loss": 0.7931,
"step": 1117
},
{
"epoch": 2.2359999999999998,
"grad_norm": 0.3971981704235077,
"learning_rate": 1.8488789238604676e-06,
"loss": 0.7894,
"step": 1118
},
{
"epoch": 2.238,
"grad_norm": 0.4262888729572296,
"learning_rate": 1.8398534866757455e-06,
"loss": 0.7469,
"step": 1119
},
{
"epoch": 2.24,
"grad_norm": 0.42837753891944885,
"learning_rate": 1.8308451630064484e-06,
"loss": 0.7523,
"step": 1120
},
{
"epoch": 2.242,
"grad_norm": 0.4347648620605469,
"learning_rate": 1.8218540016364178e-06,
"loss": 0.7754,
"step": 1121
},
{
"epoch": 2.2439999999999998,
"grad_norm": 0.43818947672843933,
"learning_rate": 1.8128800512565514e-06,
"loss": 0.7487,
"step": 1122
},
{
"epoch": 2.246,
"grad_norm": 0.3970399498939514,
"learning_rate": 1.8039233604645468e-06,
"loss": 0.7838,
"step": 1123
},
{
"epoch": 2.248,
"grad_norm": 0.40745770931243896,
"learning_rate": 1.7949839777646327e-06,
"loss": 0.7994,
"step": 1124
},
{
"epoch": 2.25,
"grad_norm": 0.43025413155555725,
"learning_rate": 1.7860619515673034e-06,
"loss": 0.7662,
"step": 1125
},
{
"epoch": 2.252,
"grad_norm": 0.46322372555732727,
"learning_rate": 1.7771573301890666e-06,
"loss": 0.8064,
"step": 1126
},
{
"epoch": 2.254,
"grad_norm": 0.47347068786621094,
"learning_rate": 1.7682701618521687e-06,
"loss": 0.7538,
"step": 1127
},
{
"epoch": 2.2560000000000002,
"grad_norm": 0.4192434847354889,
"learning_rate": 1.7594004946843458e-06,
"loss": 0.8046,
"step": 1128
},
{
"epoch": 2.258,
"grad_norm": 0.4811704456806183,
"learning_rate": 1.7505483767185583e-06,
"loss": 0.7348,
"step": 1129
},
{
"epoch": 2.26,
"grad_norm": 0.42036548256874084,
"learning_rate": 1.7417138558927244e-06,
"loss": 0.7466,
"step": 1130
},
{
"epoch": 2.262,
"grad_norm": 0.46703797578811646,
"learning_rate": 1.7328969800494727e-06,
"loss": 0.8125,
"step": 1131
},
{
"epoch": 2.2640000000000002,
"grad_norm": 0.4545026421546936,
"learning_rate": 1.7240977969358757e-06,
"loss": 0.756,
"step": 1132
},
{
"epoch": 2.266,
"grad_norm": 0.39864465594291687,
"learning_rate": 1.7153163542031881e-06,
"loss": 0.7741,
"step": 1133
},
{
"epoch": 2.268,
"grad_norm": 0.4210382401943207,
"learning_rate": 1.7065526994065973e-06,
"loss": 0.7489,
"step": 1134
},
{
"epoch": 2.27,
"grad_norm": 0.3919101357460022,
"learning_rate": 1.6978068800049624e-06,
"loss": 0.7977,
"step": 1135
},
{
"epoch": 2.2720000000000002,
"grad_norm": 0.4045415222644806,
"learning_rate": 1.6890789433605508e-06,
"loss": 0.7482,
"step": 1136
},
{
"epoch": 2.274,
"grad_norm": 0.42059701681137085,
"learning_rate": 1.680368936738792e-06,
"loss": 0.7717,
"step": 1137
},
{
"epoch": 2.276,
"grad_norm": 0.4480821490287781,
"learning_rate": 1.671676907308018e-06,
"loss": 0.7716,
"step": 1138
},
{
"epoch": 2.278,
"grad_norm": 0.4437813460826874,
"learning_rate": 1.6630029021392007e-06,
"loss": 0.7507,
"step": 1139
},
{
"epoch": 2.2800000000000002,
"grad_norm": 0.4174870252609253,
"learning_rate": 1.6543469682057105e-06,
"loss": 0.7388,
"step": 1140
},
{
"epoch": 2.282,
"grad_norm": 0.4287874102592468,
"learning_rate": 1.645709152383046e-06,
"loss": 0.7615,
"step": 1141
},
{
"epoch": 2.284,
"grad_norm": 0.4152935743331909,
"learning_rate": 1.6370895014486e-06,
"loss": 0.758,
"step": 1142
},
{
"epoch": 2.286,
"grad_norm": 0.41198644042015076,
"learning_rate": 1.6284880620813847e-06,
"loss": 0.7776,
"step": 1143
},
{
"epoch": 2.288,
"grad_norm": 0.40575531125068665,
"learning_rate": 1.6199048808617896e-06,
"loss": 0.7964,
"step": 1144
},
{
"epoch": 2.29,
"grad_norm": 0.3933468461036682,
"learning_rate": 1.611340004271339e-06,
"loss": 0.7683,
"step": 1145
},
{
"epoch": 2.292,
"grad_norm": 0.42372068762779236,
"learning_rate": 1.6027934786924187e-06,
"loss": 0.7576,
"step": 1146
},
{
"epoch": 2.294,
"grad_norm": 0.40251022577285767,
"learning_rate": 1.594265350408039e-06,
"loss": 0.76,
"step": 1147
},
{
"epoch": 2.296,
"grad_norm": 0.40240368247032166,
"learning_rate": 1.5857556656015837e-06,
"loss": 0.7936,
"step": 1148
},
{
"epoch": 2.298,
"grad_norm": 0.5056098103523254,
"learning_rate": 1.5772644703565564e-06,
"loss": 0.7714,
"step": 1149
},
{
"epoch": 2.3,
"grad_norm": 0.4141823351383209,
"learning_rate": 1.5687918106563326e-06,
"loss": 0.7508,
"step": 1150
},
{
"epoch": 2.302,
"grad_norm": 0.4253833591938019,
"learning_rate": 1.5603377323839069e-06,
"loss": 0.7452,
"step": 1151
},
{
"epoch": 2.304,
"grad_norm": 0.4501397907733917,
"learning_rate": 1.551902281321651e-06,
"loss": 0.7613,
"step": 1152
},
{
"epoch": 2.306,
"grad_norm": 0.4004981517791748,
"learning_rate": 1.5434855031510626e-06,
"loss": 0.7794,
"step": 1153
},
{
"epoch": 2.308,
"grad_norm": 0.44913139939308167,
"learning_rate": 1.5350874434525142e-06,
"loss": 0.7587,
"step": 1154
},
{
"epoch": 2.31,
"grad_norm": 0.4194786548614502,
"learning_rate": 1.5267081477050132e-06,
"loss": 0.7882,
"step": 1155
},
{
"epoch": 2.312,
"grad_norm": 0.43769264221191406,
"learning_rate": 1.5183476612859538e-06,
"loss": 0.7932,
"step": 1156
},
{
"epoch": 2.314,
"grad_norm": 0.43893754482269287,
"learning_rate": 1.5100060294708647e-06,
"loss": 0.7729,
"step": 1157
},
{
"epoch": 2.316,
"grad_norm": 0.4052613377571106,
"learning_rate": 1.5016832974331725e-06,
"loss": 0.7768,
"step": 1158
},
{
"epoch": 2.318,
"grad_norm": 0.41634050011634827,
"learning_rate": 1.4933795102439558e-06,
"loss": 0.7517,
"step": 1159
},
{
"epoch": 2.32,
"grad_norm": 0.515920102596283,
"learning_rate": 1.4850947128716914e-06,
"loss": 0.7657,
"step": 1160
},
{
"epoch": 2.322,
"grad_norm": 0.43313372135162354,
"learning_rate": 1.4768289501820265e-06,
"loss": 0.7392,
"step": 1161
},
{
"epoch": 2.324,
"grad_norm": 0.4506223797798157,
"learning_rate": 1.4685822669375239e-06,
"loss": 0.7826,
"step": 1162
},
{
"epoch": 2.326,
"grad_norm": 0.44990140199661255,
"learning_rate": 1.4603547077974217e-06,
"loss": 0.7833,
"step": 1163
},
{
"epoch": 2.328,
"grad_norm": 0.43483781814575195,
"learning_rate": 1.4521463173173966e-06,
"loss": 0.7771,
"step": 1164
},
{
"epoch": 2.33,
"grad_norm": 0.41871559619903564,
"learning_rate": 1.4439571399493146e-06,
"loss": 0.77,
"step": 1165
},
{
"epoch": 2.332,
"grad_norm": 0.41554340720176697,
"learning_rate": 1.4357872200409988e-06,
"loss": 0.8184,
"step": 1166
},
{
"epoch": 2.334,
"grad_norm": 0.41174760460853577,
"learning_rate": 1.4276366018359845e-06,
"loss": 0.7728,
"step": 1167
},
{
"epoch": 2.336,
"grad_norm": 0.426435649394989,
"learning_rate": 1.4195053294732757e-06,
"loss": 0.7597,
"step": 1168
},
{
"epoch": 2.338,
"grad_norm": 0.4220362603664398,
"learning_rate": 1.4113934469871166e-06,
"loss": 0.7222,
"step": 1169
},
{
"epoch": 2.34,
"grad_norm": 0.4154855012893677,
"learning_rate": 1.4033009983067454e-06,
"loss": 0.8104,
"step": 1170
},
{
"epoch": 2.342,
"grad_norm": 0.4327816069126129,
"learning_rate": 1.3952280272561541e-06,
"loss": 0.7503,
"step": 1171
},
{
"epoch": 2.344,
"grad_norm": 0.409800261259079,
"learning_rate": 1.3871745775538598e-06,
"loss": 0.7895,
"step": 1172
},
{
"epoch": 2.346,
"grad_norm": 0.4550241231918335,
"learning_rate": 1.3791406928126638e-06,
"loss": 0.7569,
"step": 1173
},
{
"epoch": 2.348,
"grad_norm": 0.41326087713241577,
"learning_rate": 1.371126416539409e-06,
"loss": 0.7821,
"step": 1174
},
{
"epoch": 2.35,
"grad_norm": 0.42594876885414124,
"learning_rate": 1.3631317921347564e-06,
"loss": 0.754,
"step": 1175
},
{
"epoch": 2.352,
"grad_norm": 0.428521066904068,
"learning_rate": 1.3551568628929434e-06,
"loss": 0.7867,
"step": 1176
},
{
"epoch": 2.354,
"grad_norm": 0.39713218808174133,
"learning_rate": 1.3472016720015447e-06,
"loss": 0.7685,
"step": 1177
},
{
"epoch": 2.356,
"grad_norm": 0.4291415512561798,
"learning_rate": 1.339266262541249e-06,
"loss": 0.7723,
"step": 1178
},
{
"epoch": 2.358,
"grad_norm": 0.4253414273262024,
"learning_rate": 1.3313506774856177e-06,
"loss": 0.7748,
"step": 1179
},
{
"epoch": 2.36,
"grad_norm": 0.409959077835083,
"learning_rate": 1.3234549597008572e-06,
"loss": 0.7861,
"step": 1180
},
{
"epoch": 2.362,
"grad_norm": 0.4249674379825592,
"learning_rate": 1.3155791519455812e-06,
"loss": 0.8093,
"step": 1181
},
{
"epoch": 2.364,
"grad_norm": 0.4227546155452728,
"learning_rate": 1.3077232968705805e-06,
"loss": 0.7827,
"step": 1182
},
{
"epoch": 2.366,
"grad_norm": 0.4601157009601593,
"learning_rate": 1.2998874370186026e-06,
"loss": 0.7788,
"step": 1183
},
{
"epoch": 2.368,
"grad_norm": 0.4204160273075104,
"learning_rate": 1.2920716148241036e-06,
"loss": 0.752,
"step": 1184
},
{
"epoch": 2.37,
"grad_norm": 0.42416802048683167,
"learning_rate": 1.2842758726130283e-06,
"loss": 0.7587,
"step": 1185
},
{
"epoch": 2.372,
"grad_norm": 0.42040911316871643,
"learning_rate": 1.2765002526025871e-06,
"loss": 0.7622,
"step": 1186
},
{
"epoch": 2.374,
"grad_norm": 0.38308432698249817,
"learning_rate": 1.2687447969010113e-06,
"loss": 0.7838,
"step": 1187
},
{
"epoch": 2.376,
"grad_norm": 0.40888527035713196,
"learning_rate": 1.2610095475073415e-06,
"loss": 0.783,
"step": 1188
},
{
"epoch": 2.378,
"grad_norm": 0.40721824765205383,
"learning_rate": 1.2532945463111856e-06,
"loss": 0.7861,
"step": 1189
},
{
"epoch": 2.38,
"grad_norm": 0.43069660663604736,
"learning_rate": 1.2455998350925042e-06,
"loss": 0.7398,
"step": 1190
},
{
"epoch": 2.382,
"grad_norm": 0.36445775628089905,
"learning_rate": 1.2379254555213788e-06,
"loss": 0.7637,
"step": 1191
},
{
"epoch": 2.384,
"grad_norm": 0.40194427967071533,
"learning_rate": 1.2302714491577834e-06,
"loss": 0.7485,
"step": 1192
},
{
"epoch": 2.386,
"grad_norm": 0.40127691626548767,
"learning_rate": 1.2226378574513654e-06,
"loss": 0.7683,
"step": 1193
},
{
"epoch": 2.388,
"grad_norm": 0.4438623785972595,
"learning_rate": 1.2150247217412186e-06,
"loss": 0.7896,
"step": 1194
},
{
"epoch": 2.39,
"grad_norm": 0.45208215713500977,
"learning_rate": 1.2074320832556558e-06,
"loss": 0.743,
"step": 1195
},
{
"epoch": 2.392,
"grad_norm": 0.4029155373573303,
"learning_rate": 1.1998599831119912e-06,
"loss": 0.7622,
"step": 1196
},
{
"epoch": 2.394,
"grad_norm": 0.41432464122772217,
"learning_rate": 1.1923084623163172e-06,
"loss": 0.7699,
"step": 1197
},
{
"epoch": 2.396,
"grad_norm": 0.42966488003730774,
"learning_rate": 1.1847775617632746e-06,
"loss": 0.7534,
"step": 1198
},
{
"epoch": 2.398,
"grad_norm": 0.4253801107406616,
"learning_rate": 1.1772673222358421e-06,
"loss": 0.7597,
"step": 1199
},
{
"epoch": 2.4,
"grad_norm": 0.3783406913280487,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.7738,
"step": 1200
},
{
"epoch": 2.402,
"grad_norm": 0.44550472497940063,
"learning_rate": 1.162308988830057e-06,
"loss": 0.8131,
"step": 1201
},
{
"epoch": 2.404,
"grad_norm": 0.4549965560436249,
"learning_rate": 1.1548609759573375e-06,
"loss": 0.7755,
"step": 1202
},
{
"epoch": 2.406,
"grad_norm": 0.4233042001724243,
"learning_rate": 1.1474337861210543e-06,
"loss": 0.7592,
"step": 1203
},
{
"epoch": 2.408,
"grad_norm": 0.4179689884185791,
"learning_rate": 1.1400274595425499e-06,
"loss": 0.7675,
"step": 1204
},
{
"epoch": 2.41,
"grad_norm": 0.3878929615020752,
"learning_rate": 1.132642036330181e-06,
"loss": 0.7756,
"step": 1205
},
{
"epoch": 2.412,
"grad_norm": 0.43287208676338196,
"learning_rate": 1.1252775564791023e-06,
"loss": 0.7551,
"step": 1206
},
{
"epoch": 2.414,
"grad_norm": 0.40028390288352966,
"learning_rate": 1.1179340598710547e-06,
"loss": 0.7572,
"step": 1207
},
{
"epoch": 2.416,
"grad_norm": 0.3733893930912018,
"learning_rate": 1.1106115862741457e-06,
"loss": 0.7952,
"step": 1208
},
{
"epoch": 2.418,
"grad_norm": 0.3904898464679718,
"learning_rate": 1.1033101753426285e-06,
"loss": 0.7745,
"step": 1209
},
{
"epoch": 2.42,
"grad_norm": 0.4095996916294098,
"learning_rate": 1.096029866616704e-06,
"loss": 0.7647,
"step": 1210
},
{
"epoch": 2.422,
"grad_norm": 0.4702613055706024,
"learning_rate": 1.0887706995222864e-06,
"loss": 0.768,
"step": 1211
},
{
"epoch": 2.424,
"grad_norm": 0.4261472225189209,
"learning_rate": 1.0815327133708015e-06,
"loss": 0.752,
"step": 1212
},
{
"epoch": 2.426,
"grad_norm": 0.38912856578826904,
"learning_rate": 1.0743159473589738e-06,
"loss": 0.7822,
"step": 1213
},
{
"epoch": 2.428,
"grad_norm": 0.42375972867012024,
"learning_rate": 1.0671204405686108e-06,
"loss": 0.7756,
"step": 1214
},
{
"epoch": 2.43,
"grad_norm": 0.40271031856536865,
"learning_rate": 1.0599462319663906e-06,
"loss": 0.7919,
"step": 1215
},
{
"epoch": 2.432,
"grad_norm": 0.39157113432884216,
"learning_rate": 1.052793360403655e-06,
"loss": 0.7749,
"step": 1216
},
{
"epoch": 2.434,
"grad_norm": 0.4087899923324585,
"learning_rate": 1.0456618646161954e-06,
"loss": 0.8007,
"step": 1217
},
{
"epoch": 2.436,
"grad_norm": 0.4762036204338074,
"learning_rate": 1.0385517832240472e-06,
"loss": 0.7607,
"step": 1218
},
{
"epoch": 2.438,
"grad_norm": 0.41489630937576294,
"learning_rate": 1.0314631547312738e-06,
"loss": 0.779,
"step": 1219
},
{
"epoch": 2.44,
"grad_norm": 0.3774562180042267,
"learning_rate": 1.0243960175257605e-06,
"loss": 0.8008,
"step": 1220
},
{
"epoch": 2.442,
"grad_norm": 0.39354345202445984,
"learning_rate": 1.0173504098790188e-06,
"loss": 0.7714,
"step": 1221
},
{
"epoch": 2.444,
"grad_norm": 0.4119318425655365,
"learning_rate": 1.010326369945957e-06,
"loss": 0.7856,
"step": 1222
},
{
"epoch": 2.446,
"grad_norm": 0.3862462639808655,
"learning_rate": 1.0033239357646913e-06,
"loss": 0.7589,
"step": 1223
},
{
"epoch": 2.448,
"grad_norm": 0.44779011607170105,
"learning_rate": 9.963431452563331e-07,
"loss": 0.7884,
"step": 1224
},
{
"epoch": 2.45,
"grad_norm": 0.4043843746185303,
"learning_rate": 9.893840362247809e-07,
"loss": 0.7568,
"step": 1225
},
{
"epoch": 2.452,
"grad_norm": 0.43136683106422424,
"learning_rate": 9.824466463565246e-07,
"loss": 0.7687,
"step": 1226
},
{
"epoch": 2.454,
"grad_norm": 0.40449437499046326,
"learning_rate": 9.7553101322043e-07,
"loss": 0.8022,
"step": 1227
},
{
"epoch": 2.456,
"grad_norm": 0.39529287815093994,
"learning_rate": 9.686371742675443e-07,
"loss": 0.7691,
"step": 1228
},
{
"epoch": 2.458,
"grad_norm": 0.3712971806526184,
"learning_rate": 9.617651668308914e-07,
"loss": 0.7897,
"step": 1229
},
{
"epoch": 2.46,
"grad_norm": 0.41396647691726685,
"learning_rate": 9.549150281252633e-07,
"loss": 0.765,
"step": 1230
},
{
"epoch": 2.462,
"grad_norm": 0.419058620929718,
"learning_rate": 9.480867952470285e-07,
"loss": 0.7615,
"step": 1231
},
{
"epoch": 2.464,
"grad_norm": 0.38833707571029663,
"learning_rate": 9.412805051739266e-07,
"loss": 0.796,
"step": 1232
},
{
"epoch": 2.466,
"grad_norm": 0.38123270869255066,
"learning_rate": 9.344961947648624e-07,
"loss": 0.7856,
"step": 1233
},
{
"epoch": 2.468,
"grad_norm": 0.39136457443237305,
"learning_rate": 9.277339007597158e-07,
"loss": 0.7702,
"step": 1234
},
{
"epoch": 2.4699999999999998,
"grad_norm": 0.424621045589447,
"learning_rate": 9.209936597791407e-07,
"loss": 0.7604,
"step": 1235
},
{
"epoch": 2.472,
"grad_norm": 0.42278164625167847,
"learning_rate": 9.142755083243577e-07,
"loss": 0.8015,
"step": 1236
},
{
"epoch": 2.474,
"grad_norm": 0.39735937118530273,
"learning_rate": 9.075794827769696e-07,
"loss": 0.7122,
"step": 1237
},
{
"epoch": 2.476,
"grad_norm": 0.4141498804092407,
"learning_rate": 9.009056193987569e-07,
"loss": 0.8107,
"step": 1238
},
{
"epoch": 2.4779999999999998,
"grad_norm": 0.38823202252388,
"learning_rate": 8.942539543314799e-07,
"loss": 0.7794,
"step": 1239
},
{
"epoch": 2.48,
"grad_norm": 0.40469056367874146,
"learning_rate": 8.876245235966884e-07,
"loss": 0.7747,
"step": 1240
},
{
"epoch": 2.482,
"grad_norm": 0.4189411401748657,
"learning_rate": 8.810173630955249e-07,
"loss": 0.7616,
"step": 1241
},
{
"epoch": 2.484,
"grad_norm": 0.396389365196228,
"learning_rate": 8.744325086085248e-07,
"loss": 0.7603,
"step": 1242
},
{
"epoch": 2.4859999999999998,
"grad_norm": 0.4292638301849365,
"learning_rate": 8.678699957954323e-07,
"loss": 0.7578,
"step": 1243
},
{
"epoch": 2.488,
"grad_norm": 0.3797649145126343,
"learning_rate": 8.613298601949971e-07,
"loss": 0.7562,
"step": 1244
},
{
"epoch": 2.49,
"grad_norm": 0.4183959662914276,
"learning_rate": 8.54812137224792e-07,
"loss": 0.7773,
"step": 1245
},
{
"epoch": 2.492,
"grad_norm": 0.367262065410614,
"learning_rate": 8.483168621810133e-07,
"loss": 0.7601,
"step": 1246
},
{
"epoch": 2.4939999999999998,
"grad_norm": 0.39003264904022217,
"learning_rate": 8.418440702382897e-07,
"loss": 0.7667,
"step": 1247
},
{
"epoch": 2.496,
"grad_norm": 0.3786110579967499,
"learning_rate": 8.353937964495029e-07,
"loss": 0.8138,
"step": 1248
},
{
"epoch": 2.498,
"grad_norm": 0.4133704900741577,
"learning_rate": 8.289660757455803e-07,
"loss": 0.7382,
"step": 1249
},
{
"epoch": 2.5,
"grad_norm": 0.39127856492996216,
"learning_rate": 8.225609429353187e-07,
"loss": 0.752,
"step": 1250
},
{
"epoch": 2.502,
"grad_norm": 0.43218833208084106,
"learning_rate": 8.161784327051919e-07,
"loss": 0.7998,
"step": 1251
},
{
"epoch": 2.504,
"grad_norm": 0.3970191478729248,
"learning_rate": 8.098185796191632e-07,
"loss": 0.7362,
"step": 1252
},
{
"epoch": 2.5060000000000002,
"grad_norm": 0.4043598175048828,
"learning_rate": 8.034814181184996e-07,
"loss": 0.7921,
"step": 1253
},
{
"epoch": 2.508,
"grad_norm": 0.3990473747253418,
"learning_rate": 7.971669825215789e-07,
"loss": 0.7728,
"step": 1254
},
{
"epoch": 2.51,
"grad_norm": 0.4073302745819092,
"learning_rate": 7.908753070237124e-07,
"loss": 0.7421,
"step": 1255
},
{
"epoch": 2.512,
"grad_norm": 0.48361241817474365,
"learning_rate": 7.846064256969571e-07,
"loss": 0.7522,
"step": 1256
},
{
"epoch": 2.5140000000000002,
"grad_norm": 0.39428237080574036,
"learning_rate": 7.783603724899258e-07,
"loss": 0.76,
"step": 1257
},
{
"epoch": 2.516,
"grad_norm": 0.3941364586353302,
"learning_rate": 7.72137181227608e-07,
"loss": 0.7715,
"step": 1258
},
{
"epoch": 2.518,
"grad_norm": 0.4074542224407196,
"learning_rate": 7.659368856111926e-07,
"loss": 0.7611,
"step": 1259
},
{
"epoch": 2.52,
"grad_norm": 0.4131048619747162,
"learning_rate": 7.597595192178702e-07,
"loss": 0.7384,
"step": 1260
},
{
"epoch": 2.5220000000000002,
"grad_norm": 0.34917521476745605,
"learning_rate": 7.536051155006657e-07,
"loss": 0.7621,
"step": 1261
},
{
"epoch": 2.524,
"grad_norm": 0.40943166613578796,
"learning_rate": 7.47473707788251e-07,
"loss": 0.7669,
"step": 1262
},
{
"epoch": 2.526,
"grad_norm": 0.42000773549079895,
"learning_rate": 7.413653292847617e-07,
"loss": 0.7689,
"step": 1263
},
{
"epoch": 2.528,
"grad_norm": 0.432198166847229,
"learning_rate": 7.352800130696253e-07,
"loss": 0.7821,
"step": 1264
},
{
"epoch": 2.5300000000000002,
"grad_norm": 0.4601210653781891,
"learning_rate": 7.292177920973726e-07,
"loss": 0.7382,
"step": 1265
},
{
"epoch": 2.532,
"grad_norm": 0.398960679769516,
"learning_rate": 7.23178699197467e-07,
"loss": 0.7778,
"step": 1266
},
{
"epoch": 2.534,
"grad_norm": 0.4066762924194336,
"learning_rate": 7.171627670741243e-07,
"loss": 0.7505,
"step": 1267
},
{
"epoch": 2.536,
"grad_norm": 0.4193468689918518,
"learning_rate": 7.111700283061318e-07,
"loss": 0.7472,
"step": 1268
},
{
"epoch": 2.5380000000000003,
"grad_norm": 0.4361197054386139,
"learning_rate": 7.052005153466779e-07,
"loss": 0.7514,
"step": 1269
},
{
"epoch": 2.54,
"grad_norm": 0.433932900428772,
"learning_rate": 6.992542605231739e-07,
"loss": 0.7533,
"step": 1270
},
{
"epoch": 2.542,
"grad_norm": 0.3849650025367737,
"learning_rate": 6.933312960370748e-07,
"loss": 0.7595,
"step": 1271
},
{
"epoch": 2.544,
"grad_norm": 0.4118126630783081,
"learning_rate": 6.874316539637127e-07,
"loss": 0.7549,
"step": 1272
},
{
"epoch": 2.5460000000000003,
"grad_norm": 0.4175261855125427,
"learning_rate": 6.815553662521185e-07,
"loss": 0.7291,
"step": 1273
},
{
"epoch": 2.548,
"grad_norm": 0.40914928913116455,
"learning_rate": 6.757024647248456e-07,
"loss": 0.7868,
"step": 1274
},
{
"epoch": 2.55,
"grad_norm": 0.37963372468948364,
"learning_rate": 6.698729810778065e-07,
"loss": 0.7663,
"step": 1275
},
{
"epoch": 2.552,
"grad_norm": 0.44203123450279236,
"learning_rate": 6.640669468800947e-07,
"loss": 0.7593,
"step": 1276
},
{
"epoch": 2.5540000000000003,
"grad_norm": 0.38774022459983826,
"learning_rate": 6.58284393573812e-07,
"loss": 0.7571,
"step": 1277
},
{
"epoch": 2.556,
"grad_norm": 0.40861040353775024,
"learning_rate": 6.52525352473905e-07,
"loss": 0.7659,
"step": 1278
},
{
"epoch": 2.558,
"grad_norm": 0.4282105565071106,
"learning_rate": 6.467898547679913e-07,
"loss": 0.7602,
"step": 1279
},
{
"epoch": 2.56,
"grad_norm": 0.39104947447776794,
"learning_rate": 6.410779315161885e-07,
"loss": 0.7473,
"step": 1280
},
{
"epoch": 2.5620000000000003,
"grad_norm": 0.411494642496109,
"learning_rate": 6.353896136509524e-07,
"loss": 0.77,
"step": 1281
},
{
"epoch": 2.564,
"grad_norm": 0.386294960975647,
"learning_rate": 6.297249319769016e-07,
"loss": 0.7938,
"step": 1282
},
{
"epoch": 2.566,
"grad_norm": 0.41267791390419006,
"learning_rate": 6.240839171706608e-07,
"loss": 0.7528,
"step": 1283
},
{
"epoch": 2.568,
"grad_norm": 0.41963523626327515,
"learning_rate": 6.184665997806832e-07,
"loss": 0.7562,
"step": 1284
},
{
"epoch": 2.57,
"grad_norm": 0.401456356048584,
"learning_rate": 6.128730102270897e-07,
"loss": 0.7551,
"step": 1285
},
{
"epoch": 2.572,
"grad_norm": 0.38480910658836365,
"learning_rate": 6.073031788015133e-07,
"loss": 0.7358,
"step": 1286
},
{
"epoch": 2.574,
"grad_norm": 0.36396434903144836,
"learning_rate": 6.017571356669183e-07,
"loss": 0.7695,
"step": 1287
},
{
"epoch": 2.576,
"grad_norm": 0.394317626953125,
"learning_rate": 5.962349108574478e-07,
"loss": 0.7786,
"step": 1288
},
{
"epoch": 2.578,
"grad_norm": 0.37718522548675537,
"learning_rate": 5.9073653427826e-07,
"loss": 0.762,
"step": 1289
},
{
"epoch": 2.58,
"grad_norm": 0.3722103238105774,
"learning_rate": 5.852620357053651e-07,
"loss": 0.7687,
"step": 1290
},
{
"epoch": 2.582,
"grad_norm": 0.4033386707305908,
"learning_rate": 5.798114447854636e-07,
"loss": 0.7757,
"step": 1291
},
{
"epoch": 2.584,
"grad_norm": 0.440335750579834,
"learning_rate": 5.743847910357836e-07,
"loss": 0.7752,
"step": 1292
},
{
"epoch": 2.586,
"grad_norm": 0.41635289788246155,
"learning_rate": 5.689821038439264e-07,
"loss": 0.7256,
"step": 1293
},
{
"epoch": 2.588,
"grad_norm": 0.41737210750579834,
"learning_rate": 5.636034124677043e-07,
"loss": 0.7684,
"step": 1294
},
{
"epoch": 2.59,
"grad_norm": 0.37913477420806885,
"learning_rate": 5.582487460349806e-07,
"loss": 0.7918,
"step": 1295
},
{
"epoch": 2.592,
"grad_norm": 0.40841925144195557,
"learning_rate": 5.529181335435124e-07,
"loss": 0.7521,
"step": 1296
},
{
"epoch": 2.594,
"grad_norm": 0.4041779339313507,
"learning_rate": 5.476116038607993e-07,
"loss": 0.7527,
"step": 1297
},
{
"epoch": 2.596,
"grad_norm": 0.4226526916027069,
"learning_rate": 5.423291857239177e-07,
"loss": 0.8115,
"step": 1298
},
{
"epoch": 2.598,
"grad_norm": 0.40755200386047363,
"learning_rate": 5.370709077393721e-07,
"loss": 0.7997,
"step": 1299
},
{
"epoch": 2.6,
"grad_norm": 0.3873971402645111,
"learning_rate": 5.318367983829393e-07,
"loss": 0.7787,
"step": 1300
},
{
"epoch": 2.602,
"grad_norm": 0.443805992603302,
"learning_rate": 5.266268859995083e-07,
"loss": 0.7616,
"step": 1301
},
{
"epoch": 2.604,
"grad_norm": 0.39908817410469055,
"learning_rate": 5.214411988029355e-07,
"loss": 0.7876,
"step": 1302
},
{
"epoch": 2.606,
"grad_norm": 0.43873775005340576,
"learning_rate": 5.162797648758877e-07,
"loss": 0.7772,
"step": 1303
},
{
"epoch": 2.608,
"grad_norm": 0.39780065417289734,
"learning_rate": 5.111426121696866e-07,
"loss": 0.7736,
"step": 1304
},
{
"epoch": 2.61,
"grad_norm": 0.4444182515144348,
"learning_rate": 5.06029768504166e-07,
"loss": 0.7951,
"step": 1305
},
{
"epoch": 2.612,
"grad_norm": 0.4113735854625702,
"learning_rate": 5.009412615675102e-07,
"loss": 0.7346,
"step": 1306
},
{
"epoch": 2.614,
"grad_norm": 0.39655396342277527,
"learning_rate": 4.958771189161149e-07,
"loss": 0.7395,
"step": 1307
},
{
"epoch": 2.616,
"grad_norm": 0.423323392868042,
"learning_rate": 4.908373679744316e-07,
"loss": 0.7711,
"step": 1308
},
{
"epoch": 2.618,
"grad_norm": 0.4427488446235657,
"learning_rate": 4.858220360348187e-07,
"loss": 0.7558,
"step": 1309
},
{
"epoch": 2.62,
"grad_norm": 0.3920653760433197,
"learning_rate": 4.808311502573976e-07,
"loss": 0.7444,
"step": 1310
},
{
"epoch": 2.622,
"grad_norm": 0.40778854489326477,
"learning_rate": 4.758647376699033e-07,
"loss": 0.7639,
"step": 1311
},
{
"epoch": 2.624,
"grad_norm": 0.4384738802909851,
"learning_rate": 4.709228251675357e-07,
"loss": 0.7281,
"step": 1312
},
{
"epoch": 2.626,
"grad_norm": 0.3867555856704712,
"learning_rate": 4.6600543951281995e-07,
"loss": 0.7692,
"step": 1313
},
{
"epoch": 2.628,
"grad_norm": 0.3772483170032501,
"learning_rate": 4.6111260733545714e-07,
"loss": 0.7303,
"step": 1314
},
{
"epoch": 2.63,
"grad_norm": 0.3986133933067322,
"learning_rate": 4.562443551321788e-07,
"loss": 0.7802,
"step": 1315
},
{
"epoch": 2.632,
"grad_norm": 0.39893269538879395,
"learning_rate": 4.514007092666084e-07,
"loss": 0.7718,
"step": 1316
},
{
"epoch": 2.634,
"grad_norm": 0.4097014367580414,
"learning_rate": 4.4658169596911493e-07,
"loss": 0.7906,
"step": 1317
},
{
"epoch": 2.636,
"grad_norm": 0.3478214144706726,
"learning_rate": 4.417873413366702e-07,
"loss": 0.7696,
"step": 1318
},
{
"epoch": 2.638,
"grad_norm": 0.42011260986328125,
"learning_rate": 4.370176713327118e-07,
"loss": 0.7825,
"step": 1319
},
{
"epoch": 2.64,
"grad_norm": 0.40823644399642944,
"learning_rate": 4.322727117869951e-07,
"loss": 0.7785,
"step": 1320
},
{
"epoch": 2.642,
"grad_norm": 0.4038843512535095,
"learning_rate": 4.275524883954657e-07,
"loss": 0.7932,
"step": 1321
},
{
"epoch": 2.644,
"grad_norm": 0.417581170797348,
"learning_rate": 4.228570267201049e-07,
"loss": 0.7759,
"step": 1322
},
{
"epoch": 2.646,
"grad_norm": 0.4698950946331024,
"learning_rate": 4.1818635218880186e-07,
"loss": 0.7731,
"step": 1323
},
{
"epoch": 2.648,
"grad_norm": 0.4142085313796997,
"learning_rate": 4.1354049009521504e-07,
"loss": 0.773,
"step": 1324
},
{
"epoch": 2.65,
"grad_norm": 0.4050668776035309,
"learning_rate": 4.089194655986306e-07,
"loss": 0.801,
"step": 1325
},
{
"epoch": 2.652,
"grad_norm": 0.41010555624961853,
"learning_rate": 4.043233037238281e-07,
"loss": 0.7833,
"step": 1326
},
{
"epoch": 2.654,
"grad_norm": 0.3984374701976776,
"learning_rate": 3.99752029360948e-07,
"loss": 0.7807,
"step": 1327
},
{
"epoch": 2.656,
"grad_norm": 0.41552355885505676,
"learning_rate": 3.9520566726535367e-07,
"loss": 0.7502,
"step": 1328
},
{
"epoch": 2.658,
"grad_norm": 0.4388125240802765,
"learning_rate": 3.90684242057498e-07,
"loss": 0.7821,
"step": 1329
},
{
"epoch": 2.66,
"grad_norm": 0.4021058976650238,
"learning_rate": 3.8618777822278854e-07,
"loss": 0.8142,
"step": 1330
},
{
"epoch": 2.662,
"grad_norm": 0.38491585850715637,
"learning_rate": 3.8171630011145877e-07,
"loss": 0.7804,
"step": 1331
},
{
"epoch": 2.664,
"grad_norm": 0.43489962816238403,
"learning_rate": 3.772698319384349e-07,
"loss": 0.7255,
"step": 1332
},
{
"epoch": 2.666,
"grad_norm": 0.4907485544681549,
"learning_rate": 3.728483977831998e-07,
"loss": 0.7768,
"step": 1333
},
{
"epoch": 2.668,
"grad_norm": 0.4141354262828827,
"learning_rate": 3.684520215896703e-07,
"loss": 0.7552,
"step": 1334
},
{
"epoch": 2.67,
"grad_norm": 0.40464743971824646,
"learning_rate": 3.6408072716606346e-07,
"loss": 0.75,
"step": 1335
},
{
"epoch": 2.672,
"grad_norm": 0.4001968204975128,
"learning_rate": 3.597345381847656e-07,
"loss": 0.7509,
"step": 1336
},
{
"epoch": 2.674,
"grad_norm": 0.39966756105422974,
"learning_rate": 3.554134781822094e-07,
"loss": 0.7706,
"step": 1337
},
{
"epoch": 2.676,
"grad_norm": 0.4206313192844391,
"learning_rate": 3.511175705587433e-07,
"loss": 0.7745,
"step": 1338
},
{
"epoch": 2.678,
"grad_norm": 0.4104755222797394,
"learning_rate": 3.468468385785023e-07,
"loss": 0.7618,
"step": 1339
},
{
"epoch": 2.68,
"grad_norm": 0.4081501364707947,
"learning_rate": 3.426013053692878e-07,
"loss": 0.7593,
"step": 1340
},
{
"epoch": 2.682,
"grad_norm": 0.4143938720226288,
"learning_rate": 3.3838099392243915e-07,
"loss": 0.7515,
"step": 1341
},
{
"epoch": 2.684,
"grad_norm": 0.37549248337745667,
"learning_rate": 3.341859270927067e-07,
"loss": 0.8178,
"step": 1342
},
{
"epoch": 2.686,
"grad_norm": 0.3817611038684845,
"learning_rate": 3.30016127598134e-07,
"loss": 0.799,
"step": 1343
},
{
"epoch": 2.6879999999999997,
"grad_norm": 0.38387176394462585,
"learning_rate": 3.258716180199278e-07,
"loss": 0.7526,
"step": 1344
},
{
"epoch": 2.69,
"grad_norm": 0.42001819610595703,
"learning_rate": 3.2175242080234314e-07,
"loss": 0.7845,
"step": 1345
},
{
"epoch": 2.692,
"grad_norm": 0.4304758906364441,
"learning_rate": 3.1765855825255543e-07,
"loss": 0.7381,
"step": 1346
},
{
"epoch": 2.694,
"grad_norm": 0.3993408977985382,
"learning_rate": 3.135900525405428e-07,
"loss": 0.7858,
"step": 1347
},
{
"epoch": 2.6959999999999997,
"grad_norm": 0.3781108260154724,
"learning_rate": 3.0954692569896585e-07,
"loss": 0.7184,
"step": 1348
},
{
"epoch": 2.698,
"grad_norm": 0.41282615065574646,
"learning_rate": 3.055291996230492e-07,
"loss": 0.7445,
"step": 1349
},
{
"epoch": 2.7,
"grad_norm": 0.39070644974708557,
"learning_rate": 3.015368960704584e-07,
"loss": 0.7616,
"step": 1350
},
{
"epoch": 2.702,
"grad_norm": 0.3836466670036316,
"learning_rate": 2.975700366611883e-07,
"loss": 0.7625,
"step": 1351
},
{
"epoch": 2.7039999999999997,
"grad_norm": 0.36659085750579834,
"learning_rate": 2.9362864287744266e-07,
"loss": 0.8153,
"step": 1352
},
{
"epoch": 2.706,
"grad_norm": 0.37427932024002075,
"learning_rate": 2.8971273606351656e-07,
"loss": 0.7523,
"step": 1353
},
{
"epoch": 2.708,
"grad_norm": 0.4171502888202667,
"learning_rate": 2.858223374256841e-07,
"loss": 0.7399,
"step": 1354
},
{
"epoch": 2.71,
"grad_norm": 0.38254213333129883,
"learning_rate": 2.819574680320825e-07,
"loss": 0.7741,
"step": 1355
},
{
"epoch": 2.7119999999999997,
"grad_norm": 0.4144255816936493,
"learning_rate": 2.7811814881259503e-07,
"loss": 0.7869,
"step": 1356
},
{
"epoch": 2.714,
"grad_norm": 0.3962395191192627,
"learning_rate": 2.743044005587425e-07,
"loss": 0.7871,
"step": 1357
},
{
"epoch": 2.716,
"grad_norm": 0.36894917488098145,
"learning_rate": 2.705162439235648e-07,
"loss": 0.7705,
"step": 1358
},
{
"epoch": 2.718,
"grad_norm": 0.39567211270332336,
"learning_rate": 2.6675369942151864e-07,
"loss": 0.7581,
"step": 1359
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.4306613504886627,
"learning_rate": 2.63016787428354e-07,
"loss": 0.7602,
"step": 1360
},
{
"epoch": 2.722,
"grad_norm": 0.40510469675064087,
"learning_rate": 2.593055281810125e-07,
"loss": 0.7311,
"step": 1361
},
{
"epoch": 2.724,
"grad_norm": 0.3800606429576874,
"learning_rate": 2.556199417775174e-07,
"loss": 0.7655,
"step": 1362
},
{
"epoch": 2.726,
"grad_norm": 0.4124450385570526,
"learning_rate": 2.519600481768597e-07,
"loss": 0.7687,
"step": 1363
},
{
"epoch": 2.7279999999999998,
"grad_norm": 0.3907739222049713,
"learning_rate": 2.483258671988942e-07,
"loss": 0.7697,
"step": 1364
},
{
"epoch": 2.73,
"grad_norm": 0.3988780081272125,
"learning_rate": 2.447174185242324e-07,
"loss": 0.7653,
"step": 1365
},
{
"epoch": 2.732,
"grad_norm": 0.43391841650009155,
"learning_rate": 2.4113472169413176e-07,
"loss": 0.6854,
"step": 1366
},
{
"epoch": 2.734,
"grad_norm": 0.43772202730178833,
"learning_rate": 2.37577796110397e-07,
"loss": 0.7842,
"step": 1367
},
{
"epoch": 2.7359999999999998,
"grad_norm": 0.37034618854522705,
"learning_rate": 2.3404666103526542e-07,
"loss": 0.7639,
"step": 1368
},
{
"epoch": 2.738,
"grad_norm": 0.35954827070236206,
"learning_rate": 2.3054133559131163e-07,
"loss": 0.7998,
"step": 1369
},
{
"epoch": 2.74,
"grad_norm": 0.38162147998809814,
"learning_rate": 2.2706183876134047e-07,
"loss": 0.7764,
"step": 1370
},
{
"epoch": 2.742,
"grad_norm": 0.3861159086227417,
"learning_rate": 2.2360818938828189e-07,
"loss": 0.792,
"step": 1371
},
{
"epoch": 2.7439999999999998,
"grad_norm": 0.37095239758491516,
"learning_rate": 2.2018040617509174e-07,
"loss": 0.7745,
"step": 1372
},
{
"epoch": 2.746,
"grad_norm": 0.3978652358055115,
"learning_rate": 2.167785076846518e-07,
"loss": 0.7546,
"step": 1373
},
{
"epoch": 2.748,
"grad_norm": 0.39223480224609375,
"learning_rate": 2.134025123396638e-07,
"loss": 0.7821,
"step": 1374
},
{
"epoch": 2.75,
"grad_norm": 0.4191019535064697,
"learning_rate": 2.1005243842255552e-07,
"loss": 0.7574,
"step": 1375
},
{
"epoch": 2.752,
"grad_norm": 0.4072646200656891,
"learning_rate": 2.0672830407537925e-07,
"loss": 0.7591,
"step": 1376
},
{
"epoch": 2.754,
"grad_norm": 0.4050624370574951,
"learning_rate": 2.0343012729971244e-07,
"loss": 0.8114,
"step": 1377
},
{
"epoch": 2.7560000000000002,
"grad_norm": 0.4051263630390167,
"learning_rate": 2.0015792595656225e-07,
"loss": 0.7642,
"step": 1378
},
{
"epoch": 2.758,
"grad_norm": 0.3808835446834564,
"learning_rate": 1.9691171776626882e-07,
"loss": 0.7824,
"step": 1379
},
{
"epoch": 2.76,
"grad_norm": 0.4035777449607849,
"learning_rate": 1.9369152030840553e-07,
"loss": 0.7698,
"step": 1380
},
{
"epoch": 2.762,
"grad_norm": 0.382522851228714,
"learning_rate": 1.904973510216912e-07,
"loss": 0.7742,
"step": 1381
},
{
"epoch": 2.7640000000000002,
"grad_norm": 0.3774438798427582,
"learning_rate": 1.873292272038868e-07,
"loss": 0.791,
"step": 1382
},
{
"epoch": 2.766,
"grad_norm": 0.417490154504776,
"learning_rate": 1.841871660117095e-07,
"loss": 0.762,
"step": 1383
},
{
"epoch": 2.768,
"grad_norm": 0.40647685527801514,
"learning_rate": 1.8107118446073492e-07,
"loss": 0.7638,
"step": 1384
},
{
"epoch": 2.77,
"grad_norm": 0.40906310081481934,
"learning_rate": 1.779812994253055e-07,
"loss": 0.8037,
"step": 1385
},
{
"epoch": 2.7720000000000002,
"grad_norm": 0.418174684047699,
"learning_rate": 1.7491752763844294e-07,
"loss": 0.7855,
"step": 1386
},
{
"epoch": 2.774,
"grad_norm": 0.4123242497444153,
"learning_rate": 1.7187988569175307e-07,
"loss": 0.7552,
"step": 1387
},
{
"epoch": 2.776,
"grad_norm": 0.4758404493331909,
"learning_rate": 1.688683900353366e-07,
"loss": 0.7618,
"step": 1388
},
{
"epoch": 2.778,
"grad_norm": 0.392837256193161,
"learning_rate": 1.6588305697770313e-07,
"loss": 0.7906,
"step": 1389
},
{
"epoch": 2.7800000000000002,
"grad_norm": 0.3605124354362488,
"learning_rate": 1.6292390268568103e-07,
"loss": 0.7709,
"step": 1390
},
{
"epoch": 2.782,
"grad_norm": 0.39685410261154175,
"learning_rate": 1.5999094318432662e-07,
"loss": 0.7712,
"step": 1391
},
{
"epoch": 2.784,
"grad_norm": 0.3873448073863983,
"learning_rate": 1.5708419435684463e-07,
"loss": 0.7559,
"step": 1392
},
{
"epoch": 2.786,
"grad_norm": 0.4224396347999573,
"learning_rate": 1.5420367194449448e-07,
"loss": 0.7599,
"step": 1393
},
{
"epoch": 2.7880000000000003,
"grad_norm": 0.36535367369651794,
"learning_rate": 1.5134939154651196e-07,
"loss": 0.7822,
"step": 1394
},
{
"epoch": 2.79,
"grad_norm": 0.3644055724143982,
"learning_rate": 1.4852136862001766e-07,
"loss": 0.7514,
"step": 1395
},
{
"epoch": 2.792,
"grad_norm": 0.423311322927475,
"learning_rate": 1.4571961847993977e-07,
"loss": 0.7406,
"step": 1396
},
{
"epoch": 2.794,
"grad_norm": 0.3825368881225586,
"learning_rate": 1.4294415629892756e-07,
"loss": 0.7779,
"step": 1397
},
{
"epoch": 2.7960000000000003,
"grad_norm": 0.4092647433280945,
"learning_rate": 1.4019499710726913e-07,
"loss": 0.7341,
"step": 1398
},
{
"epoch": 2.798,
"grad_norm": 0.38083040714263916,
"learning_rate": 1.374721557928116e-07,
"loss": 0.7631,
"step": 1399
},
{
"epoch": 2.8,
"grad_norm": 0.45573127269744873,
"learning_rate": 1.3477564710088097e-07,
"loss": 0.7536,
"step": 1400
},
{
"epoch": 2.802,
"grad_norm": 0.38706058263778687,
"learning_rate": 1.3210548563419857e-07,
"loss": 0.7856,
"step": 1401
},
{
"epoch": 2.8040000000000003,
"grad_norm": 0.3757847547531128,
"learning_rate": 1.294616858528064e-07,
"loss": 0.8257,
"step": 1402
},
{
"epoch": 2.806,
"grad_norm": 0.44547516107559204,
"learning_rate": 1.268442620739868e-07,
"loss": 0.7904,
"step": 1403
},
{
"epoch": 2.808,
"grad_norm": 0.3914513885974884,
"learning_rate": 1.2425322847218368e-07,
"loss": 0.7536,
"step": 1404
},
{
"epoch": 2.81,
"grad_norm": 0.40451890230178833,
"learning_rate": 1.2168859907892904e-07,
"loss": 0.7698,
"step": 1405
},
{
"epoch": 2.8120000000000003,
"grad_norm": 0.3859497010707855,
"learning_rate": 1.1915038778276212e-07,
"loss": 0.7556,
"step": 1406
},
{
"epoch": 2.814,
"grad_norm": 0.4025975465774536,
"learning_rate": 1.166386083291604e-07,
"loss": 0.7818,
"step": 1407
},
{
"epoch": 2.816,
"grad_norm": 0.3883987367153168,
"learning_rate": 1.1415327432046041e-07,
"loss": 0.7682,
"step": 1408
},
{
"epoch": 2.818,
"grad_norm": 0.41288435459136963,
"learning_rate": 1.1169439921578485e-07,
"loss": 0.7435,
"step": 1409
},
{
"epoch": 2.82,
"grad_norm": 0.39167869091033936,
"learning_rate": 1.0926199633097156e-07,
"loss": 0.7778,
"step": 1410
},
{
"epoch": 2.822,
"grad_norm": 0.41951802372932434,
"learning_rate": 1.0685607883850035e-07,
"loss": 0.7935,
"step": 1411
},
{
"epoch": 2.824,
"grad_norm": 0.40632784366607666,
"learning_rate": 1.044766597674196e-07,
"loss": 0.7273,
"step": 1412
},
{
"epoch": 2.826,
"grad_norm": 0.379300594329834,
"learning_rate": 1.0212375200327973e-07,
"loss": 0.7752,
"step": 1413
},
{
"epoch": 2.828,
"grad_norm": 0.3707320988178253,
"learning_rate": 9.979736828806096e-08,
"loss": 0.725,
"step": 1414
},
{
"epoch": 2.83,
"grad_norm": 0.371960312128067,
"learning_rate": 9.749752122010347e-08,
"loss": 0.7946,
"step": 1415
},
{
"epoch": 2.832,
"grad_norm": 0.3983508348464966,
"learning_rate": 9.522422325404234e-08,
"loss": 0.7541,
"step": 1416
},
{
"epoch": 2.834,
"grad_norm": 0.38345709443092346,
"learning_rate": 9.297748670073658e-08,
"loss": 0.7919,
"step": 1417
},
{
"epoch": 2.836,
"grad_norm": 0.36458224058151245,
"learning_rate": 9.075732372720414e-08,
"loss": 0.765,
"step": 1418
},
{
"epoch": 2.838,
"grad_norm": 0.37088239192962646,
"learning_rate": 8.856374635655696e-08,
"loss": 0.7838,
"step": 1419
},
{
"epoch": 2.84,
"grad_norm": 0.41397547721862793,
"learning_rate": 8.639676646793382e-08,
"loss": 0.7823,
"step": 1420
},
{
"epoch": 2.842,
"grad_norm": 0.3749987781047821,
"learning_rate": 8.425639579643763e-08,
"loss": 0.7627,
"step": 1421
},
{
"epoch": 2.844,
"grad_norm": 0.4029843807220459,
"learning_rate": 8.214264593307097e-08,
"loss": 0.7964,
"step": 1422
},
{
"epoch": 2.846,
"grad_norm": 0.4003535211086273,
"learning_rate": 8.00555283246729e-08,
"loss": 0.7721,
"step": 1423
},
{
"epoch": 2.848,
"grad_norm": 0.3872876465320587,
"learning_rate": 7.799505427386001e-08,
"loss": 0.7721,
"step": 1424
},
{
"epoch": 2.85,
"grad_norm": 0.4099399447441101,
"learning_rate": 7.59612349389599e-08,
"loss": 0.742,
"step": 1425
},
{
"epoch": 2.852,
"grad_norm": 0.3872721791267395,
"learning_rate": 7.395408133395509e-08,
"loss": 0.8037,
"step": 1426
},
{
"epoch": 2.854,
"grad_norm": 0.3637619614601135,
"learning_rate": 7.197360432842359e-08,
"loss": 0.7783,
"step": 1427
},
{
"epoch": 2.856,
"grad_norm": 0.3728599548339844,
"learning_rate": 7.001981464747565e-08,
"loss": 0.7498,
"step": 1428
},
{
"epoch": 2.858,
"grad_norm": 0.37154945731163025,
"learning_rate": 6.809272287169988e-08,
"loss": 0.7953,
"step": 1429
},
{
"epoch": 2.86,
"grad_norm": 0.41290783882141113,
"learning_rate": 6.61923394371039e-08,
"loss": 0.7582,
"step": 1430
},
{
"epoch": 2.862,
"grad_norm": 0.37080150842666626,
"learning_rate": 6.431867463506047e-08,
"loss": 0.7905,
"step": 1431
},
{
"epoch": 2.864,
"grad_norm": 0.3939642608165741,
"learning_rate": 6.247173861224753e-08,
"loss": 0.7539,
"step": 1432
},
{
"epoch": 2.866,
"grad_norm": 0.3907812833786011,
"learning_rate": 6.065154137059603e-08,
"loss": 0.769,
"step": 1433
},
{
"epoch": 2.868,
"grad_norm": 0.3818773329257965,
"learning_rate": 5.8858092767236084e-08,
"loss": 0.7335,
"step": 1434
},
{
"epoch": 2.87,
"grad_norm": 0.4225902855396271,
"learning_rate": 5.709140251444201e-08,
"loss": 0.7649,
"step": 1435
},
{
"epoch": 2.872,
"grad_norm": 0.41155657172203064,
"learning_rate": 5.535148017958014e-08,
"loss": 0.7602,
"step": 1436
},
{
"epoch": 2.874,
"grad_norm": 0.4028669595718384,
"learning_rate": 5.363833518505834e-08,
"loss": 0.7674,
"step": 1437
},
{
"epoch": 2.876,
"grad_norm": 0.38350680470466614,
"learning_rate": 5.19519768082738e-08,
"loss": 0.7224,
"step": 1438
},
{
"epoch": 2.878,
"grad_norm": 0.3763676881790161,
"learning_rate": 5.029241418156139e-08,
"loss": 0.7941,
"step": 1439
},
{
"epoch": 2.88,
"grad_norm": 0.3712924122810364,
"learning_rate": 4.865965629214819e-08,
"loss": 0.7631,
"step": 1440
},
{
"epoch": 2.882,
"grad_norm": 0.3843233585357666,
"learning_rate": 4.7053711982101294e-08,
"loss": 0.8029,
"step": 1441
},
{
"epoch": 2.884,
"grad_norm": 0.3684174120426178,
"learning_rate": 4.5474589948280026e-08,
"loss": 0.7565,
"step": 1442
},
{
"epoch": 2.886,
"grad_norm": 0.3982797861099243,
"learning_rate": 4.392229874229159e-08,
"loss": 0.7471,
"step": 1443
},
{
"epoch": 2.888,
"grad_norm": 0.41067230701446533,
"learning_rate": 4.2396846770441644e-08,
"loss": 0.7696,
"step": 1444
},
{
"epoch": 2.89,
"grad_norm": 0.38704684376716614,
"learning_rate": 4.0898242293691546e-08,
"loss": 0.7443,
"step": 1445
},
{
"epoch": 2.892,
"grad_norm": 0.38135451078414917,
"learning_rate": 3.9426493427611177e-08,
"loss": 0.7472,
"step": 1446
},
{
"epoch": 2.894,
"grad_norm": 0.3887581527233124,
"learning_rate": 3.7981608142335644e-08,
"loss": 0.7413,
"step": 1447
},
{
"epoch": 2.896,
"grad_norm": 0.3935358226299286,
"learning_rate": 3.65635942625242e-08,
"loss": 0.7873,
"step": 1448
},
{
"epoch": 2.898,
"grad_norm": 0.3650510311126709,
"learning_rate": 3.517245946731529e-08,
"loss": 0.7751,
"step": 1449
},
{
"epoch": 2.9,
"grad_norm": 0.38925668597221375,
"learning_rate": 3.3808211290284886e-08,
"loss": 0.7693,
"step": 1450
},
{
"epoch": 2.902,
"grad_norm": 0.4022022783756256,
"learning_rate": 3.247085711940878e-08,
"loss": 0.7388,
"step": 1451
},
{
"epoch": 2.904,
"grad_norm": 0.38158661127090454,
"learning_rate": 3.1160404197018155e-08,
"loss": 0.7582,
"step": 1452
},
{
"epoch": 2.906,
"grad_norm": 0.39832374453544617,
"learning_rate": 2.9876859619764606e-08,
"loss": 0.7544,
"step": 1453
},
{
"epoch": 2.908,
"grad_norm": 0.3519267439842224,
"learning_rate": 2.8620230338578526e-08,
"loss": 0.7685,
"step": 1454
},
{
"epoch": 2.91,
"grad_norm": 0.40946221351623535,
"learning_rate": 2.7390523158633552e-08,
"loss": 0.7249,
"step": 1455
},
{
"epoch": 2.912,
"grad_norm": 0.41239914298057556,
"learning_rate": 2.6187744739308297e-08,
"loss": 0.7743,
"step": 1456
},
{
"epoch": 2.914,
"grad_norm": 0.3555036187171936,
"learning_rate": 2.501190159415079e-08,
"loss": 0.7681,
"step": 1457
},
{
"epoch": 2.916,
"grad_norm": 0.37814003229141235,
"learning_rate": 2.386300009084408e-08,
"loss": 0.7537,
"step": 1458
},
{
"epoch": 2.918,
"grad_norm": 0.37123093008995056,
"learning_rate": 2.27410464511707e-08,
"loss": 0.7865,
"step": 1459
},
{
"epoch": 2.92,
"grad_norm": 0.35215526819229126,
"learning_rate": 2.1646046750978255e-08,
"loss": 0.7807,
"step": 1460
},
{
"epoch": 2.922,
"grad_norm": 0.39182814955711365,
"learning_rate": 2.057800692014833e-08,
"loss": 0.7786,
"step": 1461
},
{
"epoch": 2.924,
"grad_norm": 0.3995373249053955,
"learning_rate": 1.953693274256374e-08,
"loss": 0.7739,
"step": 1462
},
{
"epoch": 2.926,
"grad_norm": 0.3778274357318878,
"learning_rate": 1.8522829856076895e-08,
"loss": 0.7992,
"step": 1463
},
{
"epoch": 2.928,
"grad_norm": 0.4012286365032196,
"learning_rate": 1.753570375247815e-08,
"loss": 0.7559,
"step": 1464
},
{
"epoch": 2.93,
"grad_norm": 0.37980130314826965,
"learning_rate": 1.657555977746972e-08,
"loss": 0.7885,
"step": 1465
},
{
"epoch": 2.932,
"grad_norm": 0.37036362290382385,
"learning_rate": 1.5642403130632367e-08,
"loss": 0.7941,
"step": 1466
},
{
"epoch": 2.934,
"grad_norm": 0.38239234685897827,
"learning_rate": 1.4736238865398766e-08,
"loss": 0.7658,
"step": 1467
},
{
"epoch": 2.936,
"grad_norm": 0.36525359749794006,
"learning_rate": 1.3857071889029073e-08,
"loss": 0.8153,
"step": 1468
},
{
"epoch": 2.9379999999999997,
"grad_norm": 0.41786691546440125,
"learning_rate": 1.3004906962578723e-08,
"loss": 0.7857,
"step": 1469
},
{
"epoch": 2.94,
"grad_norm": 0.3932099938392639,
"learning_rate": 1.2179748700879013e-08,
"loss": 0.7324,
"step": 1470
},
{
"epoch": 2.942,
"grad_norm": 0.4426632225513458,
"learning_rate": 1.1381601572505452e-08,
"loss": 0.7635,
"step": 1471
},
{
"epoch": 2.944,
"grad_norm": 0.3719755709171295,
"learning_rate": 1.0610469899760001e-08,
"loss": 0.8129,
"step": 1472
},
{
"epoch": 2.9459999999999997,
"grad_norm": 0.4014662206172943,
"learning_rate": 9.866357858642206e-09,
"loss": 0.7705,
"step": 1473
},
{
"epoch": 2.948,
"grad_norm": 0.35570037364959717,
"learning_rate": 9.14926947883088e-09,
"loss": 0.7696,
"step": 1474
},
{
"epoch": 2.95,
"grad_norm": 0.3750092089176178,
"learning_rate": 8.459208643659122e-09,
"loss": 0.7757,
"step": 1475
},
{
"epoch": 2.952,
"grad_norm": 0.41063806414604187,
"learning_rate": 7.796179090094891e-09,
"loss": 0.8003,
"step": 1476
},
{
"epoch": 2.9539999999999997,
"grad_norm": 0.4088057577610016,
"learning_rate": 7.160184408721571e-09,
"loss": 0.7465,
"step": 1477
},
{
"epoch": 2.956,
"grad_norm": 0.368656724691391,
"learning_rate": 6.551228043715218e-09,
"loss": 0.7716,
"step": 1478
},
{
"epoch": 2.958,
"grad_norm": 0.4089622497558594,
"learning_rate": 5.969313292830126e-09,
"loss": 0.7852,
"step": 1479
},
{
"epoch": 2.96,
"grad_norm": 0.3801428973674774,
"learning_rate": 5.414443307377171e-09,
"loss": 0.772,
"step": 1480
},
{
"epoch": 2.9619999999999997,
"grad_norm": 0.39099130034446716,
"learning_rate": 4.8866210922110525e-09,
"loss": 0.7901,
"step": 1481
},
{
"epoch": 2.964,
"grad_norm": 0.38790273666381836,
"learning_rate": 4.385849505708084e-09,
"loss": 0.7628,
"step": 1482
},
{
"epoch": 2.966,
"grad_norm": 0.3607439398765564,
"learning_rate": 3.912131259757313e-09,
"loss": 0.781,
"step": 1483
},
{
"epoch": 2.968,
"grad_norm": 0.3746965825557709,
"learning_rate": 3.4654689197405335e-09,
"loss": 0.7677,
"step": 1484
},
{
"epoch": 2.9699999999999998,
"grad_norm": 0.3762003779411316,
"learning_rate": 3.0458649045211897e-09,
"loss": 0.7549,
"step": 1485
},
{
"epoch": 2.972,
"grad_norm": 0.4223495423793793,
"learning_rate": 2.6533214864310485e-09,
"loss": 0.7428,
"step": 1486
},
{
"epoch": 2.974,
"grad_norm": 0.3829779326915741,
"learning_rate": 2.287840791256324e-09,
"loss": 0.7882,
"step": 1487
},
{
"epoch": 2.976,
"grad_norm": 0.3744191527366638,
"learning_rate": 1.9494247982282386e-09,
"loss": 0.7786,
"step": 1488
},
{
"epoch": 2.9779999999999998,
"grad_norm": 0.3640522062778473,
"learning_rate": 1.638075340010814e-09,
"loss": 0.7755,
"step": 1489
},
{
"epoch": 2.98,
"grad_norm": 0.36369234323501587,
"learning_rate": 1.3537941026914302e-09,
"loss": 0.7661,
"step": 1490
},
{
"epoch": 2.982,
"grad_norm": 0.3734087646007538,
"learning_rate": 1.096582625772502e-09,
"loss": 0.7361,
"step": 1491
},
{
"epoch": 2.984,
"grad_norm": 0.43970629572868347,
"learning_rate": 8.664423021614854e-10,
"loss": 0.752,
"step": 1492
},
{
"epoch": 2.9859999999999998,
"grad_norm": 0.3792881965637207,
"learning_rate": 6.633743781642166e-10,
"loss": 0.7787,
"step": 1493
},
{
"epoch": 2.988,
"grad_norm": 0.4296637177467346,
"learning_rate": 4.87379953478806e-10,
"loss": 0.7623,
"step": 1494
},
{
"epoch": 2.99,
"grad_norm": 0.4059789180755615,
"learning_rate": 3.384599811889766e-10,
"loss": 0.7643,
"step": 1495
},
{
"epoch": 2.992,
"grad_norm": 0.35259121656417847,
"learning_rate": 2.1661526775795804e-10,
"loss": 0.7792,
"step": 1496
},
{
"epoch": 2.9939999999999998,
"grad_norm": 0.3793678283691406,
"learning_rate": 1.2184647302626585e-10,
"loss": 0.7959,
"step": 1497
},
{
"epoch": 2.996,
"grad_norm": 0.40443864464759827,
"learning_rate": 5.4154110206150465e-11,
"loss": 0.763,
"step": 1498
},
{
"epoch": 2.998,
"grad_norm": 0.35355669260025024,
"learning_rate": 1.3538545881042198e-11,
"loss": 0.8012,
"step": 1499
},
{
"epoch": 3.0,
"grad_norm": 0.3940981328487396,
"learning_rate": 0.0,
"loss": 0.7917,
"step": 1500
},
{
"epoch": 3.0,
"step": 1500,
"total_flos": 1507181614268416.0,
"train_loss": 0.8263951101700465,
"train_runtime": 83725.9772,
"train_samples_per_second": 1.72,
"train_steps_per_second": 0.018
}
],
"logging_steps": 1.0,
"max_steps": 1500,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1507181614268416.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}