{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 736,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002717391304347826,
"grad_norm": 1.1728428602218628,
"learning_rate": 1.0958904109589041e-07,
"loss": 1.5302,
"step": 1
},
{
"epoch": 0.005434782608695652,
"grad_norm": 1.2432262897491455,
"learning_rate": 2.1917808219178082e-07,
"loss": 1.5229,
"step": 2
},
{
"epoch": 0.008152173913043478,
"grad_norm": 1.276131510734558,
"learning_rate": 3.2876712328767123e-07,
"loss": 1.465,
"step": 3
},
{
"epoch": 0.010869565217391304,
"grad_norm": 1.3332633972167969,
"learning_rate": 4.3835616438356164e-07,
"loss": 1.4591,
"step": 4
},
{
"epoch": 0.01358695652173913,
"grad_norm": 1.3067371845245361,
"learning_rate": 5.47945205479452e-07,
"loss": 1.4768,
"step": 5
},
{
"epoch": 0.016304347826086956,
"grad_norm": 1.343665361404419,
"learning_rate": 6.575342465753425e-07,
"loss": 1.3851,
"step": 6
},
{
"epoch": 0.019021739130434784,
"grad_norm": 1.3181569576263428,
"learning_rate": 7.671232876712329e-07,
"loss": 1.465,
"step": 7
},
{
"epoch": 0.021739130434782608,
"grad_norm": 1.3282394409179688,
"learning_rate": 8.767123287671233e-07,
"loss": 1.4626,
"step": 8
},
{
"epoch": 0.024456521739130436,
"grad_norm": 1.3123646974563599,
"learning_rate": 9.863013698630137e-07,
"loss": 1.372,
"step": 9
},
{
"epoch": 0.02717391304347826,
"grad_norm": 1.1737868785858154,
"learning_rate": 1.095890410958904e-06,
"loss": 1.4761,
"step": 10
},
{
"epoch": 0.029891304347826088,
"grad_norm": 1.1780813932418823,
"learning_rate": 1.2054794520547945e-06,
"loss": 1.4729,
"step": 11
},
{
"epoch": 0.03260869565217391,
"grad_norm": 1.2191578149795532,
"learning_rate": 1.315068493150685e-06,
"loss": 1.5081,
"step": 12
},
{
"epoch": 0.035326086956521736,
"grad_norm": 1.082284927368164,
"learning_rate": 1.4246575342465753e-06,
"loss": 1.4792,
"step": 13
},
{
"epoch": 0.03804347826086957,
"grad_norm": 1.013970136642456,
"learning_rate": 1.5342465753424657e-06,
"loss": 1.4913,
"step": 14
},
{
"epoch": 0.04076086956521739,
"grad_norm": 1.0283228158950806,
"learning_rate": 1.643835616438356e-06,
"loss": 1.4546,
"step": 15
},
{
"epoch": 0.043478260869565216,
"grad_norm": 1.0099676847457886,
"learning_rate": 1.7534246575342465e-06,
"loss": 1.43,
"step": 16
},
{
"epoch": 0.04619565217391304,
"grad_norm": 0.8908332586288452,
"learning_rate": 1.863013698630137e-06,
"loss": 1.606,
"step": 17
},
{
"epoch": 0.04891304347826087,
"grad_norm": 0.8198248744010925,
"learning_rate": 1.9726027397260274e-06,
"loss": 1.4755,
"step": 18
},
{
"epoch": 0.051630434782608696,
"grad_norm": 0.8422051072120667,
"learning_rate": 2.0821917808219176e-06,
"loss": 1.4863,
"step": 19
},
{
"epoch": 0.05434782608695652,
"grad_norm": 0.7524480819702148,
"learning_rate": 2.191780821917808e-06,
"loss": 1.4982,
"step": 20
},
{
"epoch": 0.057065217391304345,
"grad_norm": 0.7762174010276794,
"learning_rate": 2.3013698630136984e-06,
"loss": 1.4734,
"step": 21
},
{
"epoch": 0.059782608695652176,
"grad_norm": 0.7196646332740784,
"learning_rate": 2.410958904109589e-06,
"loss": 1.5099,
"step": 22
},
{
"epoch": 0.0625,
"grad_norm": 0.7181140780448914,
"learning_rate": 2.5205479452054796e-06,
"loss": 1.4242,
"step": 23
},
{
"epoch": 0.06521739130434782,
"grad_norm": 0.7445970177650452,
"learning_rate": 2.63013698630137e-06,
"loss": 1.4157,
"step": 24
},
{
"epoch": 0.06793478260869565,
"grad_norm": 0.6932166814804077,
"learning_rate": 2.73972602739726e-06,
"loss": 1.4898,
"step": 25
},
{
"epoch": 0.07065217391304347,
"grad_norm": 0.6732593178749084,
"learning_rate": 2.8493150684931506e-06,
"loss": 1.4643,
"step": 26
},
{
"epoch": 0.07336956521739131,
"grad_norm": 0.6919940114021301,
"learning_rate": 2.958904109589041e-06,
"loss": 1.4602,
"step": 27
},
{
"epoch": 0.07608695652173914,
"grad_norm": 0.6646421551704407,
"learning_rate": 3.0684931506849314e-06,
"loss": 1.5683,
"step": 28
},
{
"epoch": 0.07880434782608696,
"grad_norm": 0.6307073831558228,
"learning_rate": 3.178082191780822e-06,
"loss": 1.4543,
"step": 29
},
{
"epoch": 0.08152173913043478,
"grad_norm": 0.6445148587226868,
"learning_rate": 3.287671232876712e-06,
"loss": 1.4359,
"step": 30
},
{
"epoch": 0.08423913043478261,
"grad_norm": 0.6217952370643616,
"learning_rate": 3.397260273972603e-06,
"loss": 1.4731,
"step": 31
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.5980370044708252,
"learning_rate": 3.506849315068493e-06,
"loss": 1.4586,
"step": 32
},
{
"epoch": 0.08967391304347826,
"grad_norm": 0.5764021873474121,
"learning_rate": 3.6164383561643833e-06,
"loss": 1.5436,
"step": 33
},
{
"epoch": 0.09239130434782608,
"grad_norm": 0.5938739776611328,
"learning_rate": 3.726027397260274e-06,
"loss": 1.439,
"step": 34
},
{
"epoch": 0.09510869565217392,
"grad_norm": 0.5414128303527832,
"learning_rate": 3.835616438356164e-06,
"loss": 1.4686,
"step": 35
},
{
"epoch": 0.09782608695652174,
"grad_norm": 0.5185249447822571,
"learning_rate": 3.945205479452055e-06,
"loss": 1.5181,
"step": 36
},
{
"epoch": 0.10054347826086957,
"grad_norm": 0.4969967007637024,
"learning_rate": 4.054794520547945e-06,
"loss": 1.3971,
"step": 37
},
{
"epoch": 0.10326086956521739,
"grad_norm": 0.4812517464160919,
"learning_rate": 4.164383561643835e-06,
"loss": 1.4484,
"step": 38
},
{
"epoch": 0.10597826086956522,
"grad_norm": 0.45815640687942505,
"learning_rate": 4.273972602739726e-06,
"loss": 1.3937,
"step": 39
},
{
"epoch": 0.10869565217391304,
"grad_norm": 0.46528932452201843,
"learning_rate": 4.383561643835616e-06,
"loss": 1.4014,
"step": 40
},
{
"epoch": 0.11141304347826086,
"grad_norm": 0.47149142622947693,
"learning_rate": 4.4931506849315066e-06,
"loss": 1.379,
"step": 41
},
{
"epoch": 0.11413043478260869,
"grad_norm": 0.452342689037323,
"learning_rate": 4.602739726027397e-06,
"loss": 1.525,
"step": 42
},
{
"epoch": 0.11684782608695653,
"grad_norm": 0.43710047006607056,
"learning_rate": 4.712328767123287e-06,
"loss": 1.3985,
"step": 43
},
{
"epoch": 0.11956521739130435,
"grad_norm": 0.45290184020996094,
"learning_rate": 4.821917808219178e-06,
"loss": 1.3659,
"step": 44
},
{
"epoch": 0.12228260869565218,
"grad_norm": 0.4344335198402405,
"learning_rate": 4.931506849315068e-06,
"loss": 1.3643,
"step": 45
},
{
"epoch": 0.125,
"grad_norm": 0.4450250566005707,
"learning_rate": 5.041095890410959e-06,
"loss": 1.4228,
"step": 46
},
{
"epoch": 0.12771739130434784,
"grad_norm": 0.42544320225715637,
"learning_rate": 5.1506849315068494e-06,
"loss": 1.4202,
"step": 47
},
{
"epoch": 0.13043478260869565,
"grad_norm": 0.40666356682777405,
"learning_rate": 5.26027397260274e-06,
"loss": 1.4102,
"step": 48
},
{
"epoch": 0.1331521739130435,
"grad_norm": 0.43085426092147827,
"learning_rate": 5.36986301369863e-06,
"loss": 1.4723,
"step": 49
},
{
"epoch": 0.1358695652173913,
"grad_norm": 0.42747458815574646,
"learning_rate": 5.47945205479452e-06,
"loss": 1.3923,
"step": 50
},
{
"epoch": 0.13858695652173914,
"grad_norm": 0.42834821343421936,
"learning_rate": 5.589041095890411e-06,
"loss": 1.5331,
"step": 51
},
{
"epoch": 0.14130434782608695,
"grad_norm": 0.4218186140060425,
"learning_rate": 5.698630136986301e-06,
"loss": 1.5421,
"step": 52
},
{
"epoch": 0.14402173913043478,
"grad_norm": 0.4152364432811737,
"learning_rate": 5.8082191780821915e-06,
"loss": 1.3454,
"step": 53
},
{
"epoch": 0.14673913043478262,
"grad_norm": 0.4136430621147156,
"learning_rate": 5.917808219178082e-06,
"loss": 1.4521,
"step": 54
},
{
"epoch": 0.14945652173913043,
"grad_norm": 0.40648165345191956,
"learning_rate": 6.027397260273972e-06,
"loss": 1.4088,
"step": 55
},
{
"epoch": 0.15217391304347827,
"grad_norm": 0.4164981544017792,
"learning_rate": 6.136986301369863e-06,
"loss": 1.4305,
"step": 56
},
{
"epoch": 0.15489130434782608,
"grad_norm": 0.39643916487693787,
"learning_rate": 6.246575342465753e-06,
"loss": 1.4827,
"step": 57
},
{
"epoch": 0.15760869565217392,
"grad_norm": 0.38055455684661865,
"learning_rate": 6.356164383561644e-06,
"loss": 1.4725,
"step": 58
},
{
"epoch": 0.16032608695652173,
"grad_norm": 0.37801629304885864,
"learning_rate": 6.465753424657534e-06,
"loss": 1.3829,
"step": 59
},
{
"epoch": 0.16304347826086957,
"grad_norm": 0.3744637370109558,
"learning_rate": 6.575342465753424e-06,
"loss": 1.3753,
"step": 60
},
{
"epoch": 0.16576086956521738,
"grad_norm": 0.3783905506134033,
"learning_rate": 6.684931506849315e-06,
"loss": 1.393,
"step": 61
},
{
"epoch": 0.16847826086956522,
"grad_norm": 0.40007972717285156,
"learning_rate": 6.794520547945206e-06,
"loss": 1.4476,
"step": 62
},
{
"epoch": 0.17119565217391305,
"grad_norm": 0.39294755458831787,
"learning_rate": 6.904109589041096e-06,
"loss": 1.3876,
"step": 63
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.377951979637146,
"learning_rate": 7.013698630136986e-06,
"loss": 1.4476,
"step": 64
},
{
"epoch": 0.1766304347826087,
"grad_norm": 0.3997436463832855,
"learning_rate": 7.123287671232876e-06,
"loss": 1.3766,
"step": 65
},
{
"epoch": 0.1793478260869565,
"grad_norm": 0.38416406512260437,
"learning_rate": 7.2328767123287666e-06,
"loss": 1.3872,
"step": 66
},
{
"epoch": 0.18206521739130435,
"grad_norm": 0.3918316662311554,
"learning_rate": 7.342465753424658e-06,
"loss": 1.3798,
"step": 67
},
{
"epoch": 0.18478260869565216,
"grad_norm": 0.38442927598953247,
"learning_rate": 7.452054794520548e-06,
"loss": 1.4045,
"step": 68
},
{
"epoch": 0.1875,
"grad_norm": 0.41492289304733276,
"learning_rate": 7.561643835616438e-06,
"loss": 1.4799,
"step": 69
},
{
"epoch": 0.19021739130434784,
"grad_norm": 0.368286669254303,
"learning_rate": 7.671232876712327e-06,
"loss": 1.4169,
"step": 70
},
{
"epoch": 0.19293478260869565,
"grad_norm": 0.3741121292114258,
"learning_rate": 7.780821917808218e-06,
"loss": 1.3663,
"step": 71
},
{
"epoch": 0.1956521739130435,
"grad_norm": 0.4842956066131592,
"learning_rate": 7.89041095890411e-06,
"loss": 1.4287,
"step": 72
},
{
"epoch": 0.1983695652173913,
"grad_norm": 0.3899683654308319,
"learning_rate": 8e-06,
"loss": 1.347,
"step": 73
},
{
"epoch": 0.20108695652173914,
"grad_norm": 0.4036129117012024,
"learning_rate": 7.998791723304636e-06,
"loss": 1.5064,
"step": 74
},
{
"epoch": 0.20380434782608695,
"grad_norm": 0.35146766901016235,
"learning_rate": 7.997580157289776e-06,
"loss": 1.3794,
"step": 75
},
{
"epoch": 0.20652173913043478,
"grad_norm": 0.3625771403312683,
"learning_rate": 7.996365288505225e-06,
"loss": 1.3338,
"step": 76
},
{
"epoch": 0.20923913043478262,
"grad_norm": 0.3601396679878235,
"learning_rate": 7.995147103427359e-06,
"loss": 1.5026,
"step": 77
},
{
"epoch": 0.21195652173913043,
"grad_norm": 0.37662559747695923,
"learning_rate": 7.993925588458618e-06,
"loss": 1.3867,
"step": 78
},
{
"epoch": 0.21467391304347827,
"grad_norm": 0.37634652853012085,
"learning_rate": 7.992700729927007e-06,
"loss": 1.4557,
"step": 79
},
{
"epoch": 0.21739130434782608,
"grad_norm": 0.3699498772621155,
"learning_rate": 7.99147251408558e-06,
"loss": 1.4237,
"step": 80
},
{
"epoch": 0.22010869565217392,
"grad_norm": 0.3638997972011566,
"learning_rate": 7.990240927111924e-06,
"loss": 1.3822,
"step": 81
},
{
"epoch": 0.22282608695652173,
"grad_norm": 0.3786831498146057,
"learning_rate": 7.989005955107651e-06,
"loss": 1.3545,
"step": 82
},
{
"epoch": 0.22554347826086957,
"grad_norm": 0.3499252200126648,
"learning_rate": 7.987767584097859e-06,
"loss": 1.4278,
"step": 83
},
{
"epoch": 0.22826086956521738,
"grad_norm": 0.3761653006076813,
"learning_rate": 7.986525800030624e-06,
"loss": 1.4175,
"step": 84
},
{
"epoch": 0.23097826086956522,
"grad_norm": 0.3493477702140808,
"learning_rate": 7.985280588776449e-06,
"loss": 1.3315,
"step": 85
},
{
"epoch": 0.23369565217391305,
"grad_norm": 0.3659382462501526,
"learning_rate": 7.984031936127745e-06,
"loss": 1.3883,
"step": 86
},
{
"epoch": 0.23641304347826086,
"grad_norm": 0.381062388420105,
"learning_rate": 7.982779827798278e-06,
"loss": 1.3715,
"step": 87
},
{
"epoch": 0.2391304347826087,
"grad_norm": 0.3763432800769806,
"learning_rate": 7.981524249422633e-06,
"loss": 1.4077,
"step": 88
},
{
"epoch": 0.2418478260869565,
"grad_norm": 0.3684273064136505,
"learning_rate": 7.980265186555657e-06,
"loss": 1.4948,
"step": 89
},
{
"epoch": 0.24456521739130435,
"grad_norm": 0.36673790216445923,
"learning_rate": 7.979002624671916e-06,
"loss": 1.3961,
"step": 90
},
{
"epoch": 0.24728260869565216,
"grad_norm": 0.35955995321273804,
"learning_rate": 7.97773654916512e-06,
"loss": 1.3453,
"step": 91
},
{
"epoch": 0.25,
"grad_norm": 0.3854355812072754,
"learning_rate": 7.976466945347576e-06,
"loss": 1.4365,
"step": 92
},
{
"epoch": 0.25271739130434784,
"grad_norm": 0.3639521896839142,
"learning_rate": 7.975193798449611e-06,
"loss": 1.4144,
"step": 93
},
{
"epoch": 0.2554347826086957,
"grad_norm": 0.3835349678993225,
"learning_rate": 7.973917093619002e-06,
"loss": 1.4329,
"step": 94
},
{
"epoch": 0.25815217391304346,
"grad_norm": 0.3855659067630768,
"learning_rate": 7.972636815920398e-06,
"loss": 1.4355,
"step": 95
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.3564344644546509,
"learning_rate": 7.971352950334734e-06,
"loss": 1.3849,
"step": 96
},
{
"epoch": 0.26358695652173914,
"grad_norm": 0.3524990975856781,
"learning_rate": 7.970065481758653e-06,
"loss": 1.3864,
"step": 97
},
{
"epoch": 0.266304347826087,
"grad_norm": 0.3606851398944855,
"learning_rate": 7.968774395003903e-06,
"loss": 1.4575,
"step": 98
},
{
"epoch": 0.26902173913043476,
"grad_norm": 0.35041898488998413,
"learning_rate": 7.967479674796748e-06,
"loss": 1.4361,
"step": 99
},
{
"epoch": 0.2717391304347826,
"grad_norm": 0.35816073417663574,
"learning_rate": 7.96618130577736e-06,
"loss": 1.3062,
"step": 100
},
{
"epoch": 0.27445652173913043,
"grad_norm": 0.3600623309612274,
"learning_rate": 7.964879272499215e-06,
"loss": 1.2906,
"step": 101
},
{
"epoch": 0.27717391304347827,
"grad_norm": 0.36027124524116516,
"learning_rate": 7.96357355942848e-06,
"loss": 1.3665,
"step": 102
},
{
"epoch": 0.2798913043478261,
"grad_norm": 0.3527257442474365,
"learning_rate": 7.962264150943397e-06,
"loss": 1.374,
"step": 103
},
{
"epoch": 0.2826086956521739,
"grad_norm": 0.352342814207077,
"learning_rate": 7.960951031333648e-06,
"loss": 1.379,
"step": 104
},
{
"epoch": 0.28532608695652173,
"grad_norm": 0.365664541721344,
"learning_rate": 7.959634184799749e-06,
"loss": 1.4299,
"step": 105
},
{
"epoch": 0.28804347826086957,
"grad_norm": 0.34988468885421753,
"learning_rate": 7.958313595452392e-06,
"loss": 1.4161,
"step": 106
},
{
"epoch": 0.2907608695652174,
"grad_norm": 0.3694222569465637,
"learning_rate": 7.956989247311828e-06,
"loss": 1.371,
"step": 107
},
{
"epoch": 0.29347826086956524,
"grad_norm": 0.3477892577648163,
"learning_rate": 7.955661124307205e-06,
"loss": 1.4617,
"step": 108
},
{
"epoch": 0.296195652173913,
"grad_norm": 0.34717634320259094,
"learning_rate": 7.954329210275928e-06,
"loss": 1.3667,
"step": 109
},
{
"epoch": 0.29891304347826086,
"grad_norm": 0.3612186014652252,
"learning_rate": 7.952993488962999e-06,
"loss": 1.4065,
"step": 110
},
{
"epoch": 0.3016304347826087,
"grad_norm": 0.3764769434928894,
"learning_rate": 7.951653944020356e-06,
"loss": 1.3538,
"step": 111
},
{
"epoch": 0.30434782608695654,
"grad_norm": 0.34798580408096313,
"learning_rate": 7.95031055900621e-06,
"loss": 1.3902,
"step": 112
},
{
"epoch": 0.3070652173913043,
"grad_norm": 0.3572092652320862,
"learning_rate": 7.94896331738437e-06,
"loss": 1.3614,
"step": 113
},
{
"epoch": 0.30978260869565216,
"grad_norm": 0.34781020879745483,
"learning_rate": 7.94761220252356e-06,
"loss": 1.406,
"step": 114
},
{
"epoch": 0.3125,
"grad_norm": 0.37208840250968933,
"learning_rate": 7.946257197696737e-06,
"loss": 1.4388,
"step": 115
},
{
"epoch": 0.31521739130434784,
"grad_norm": 0.3797365427017212,
"learning_rate": 7.94489828608041e-06,
"loss": 1.3632,
"step": 116
},
{
"epoch": 0.3179347826086957,
"grad_norm": 0.34749776124954224,
"learning_rate": 7.94353545075393e-06,
"loss": 1.3529,
"step": 117
},
{
"epoch": 0.32065217391304346,
"grad_norm": 0.3999796211719513,
"learning_rate": 7.942168674698794e-06,
"loss": 1.3701,
"step": 118
},
{
"epoch": 0.3233695652173913,
"grad_norm": 0.38252121210098267,
"learning_rate": 7.94079794079794e-06,
"loss": 1.4129,
"step": 119
},
{
"epoch": 0.32608695652173914,
"grad_norm": 0.3657996356487274,
"learning_rate": 7.939423231835025e-06,
"loss": 1.3735,
"step": 120
},
{
"epoch": 0.328804347826087,
"grad_norm": 0.39039069414138794,
"learning_rate": 7.938044530493707e-06,
"loss": 1.4762,
"step": 121
},
{
"epoch": 0.33152173913043476,
"grad_norm": 0.35975077748298645,
"learning_rate": 7.936661819356923e-06,
"loss": 1.4388,
"step": 122
},
{
"epoch": 0.3342391304347826,
"grad_norm": 0.36750245094299316,
"learning_rate": 7.93527508090615e-06,
"loss": 1.3754,
"step": 123
},
{
"epoch": 0.33695652173913043,
"grad_norm": 0.38908377289772034,
"learning_rate": 7.933884297520661e-06,
"loss": 1.2874,
"step": 124
},
{
"epoch": 0.33967391304347827,
"grad_norm": 0.3685767352581024,
"learning_rate": 7.932489451476793e-06,
"loss": 1.4003,
"step": 125
},
{
"epoch": 0.3423913043478261,
"grad_norm": 0.3810523450374603,
"learning_rate": 7.93109052494718e-06,
"loss": 1.4163,
"step": 126
},
{
"epoch": 0.3451086956521739,
"grad_norm": 0.3630084693431854,
"learning_rate": 7.9296875e-06,
"loss": 1.3586,
"step": 127
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.35243499279022217,
"learning_rate": 7.928280358598207e-06,
"loss": 1.4657,
"step": 128
},
{
"epoch": 0.35054347826086957,
"grad_norm": 0.3778194785118103,
"learning_rate": 7.926869082598759e-06,
"loss": 1.3592,
"step": 129
},
{
"epoch": 0.3532608695652174,
"grad_norm": 0.3549621105194092,
"learning_rate": 7.925453653751838e-06,
"loss": 1.3663,
"step": 130
},
{
"epoch": 0.35597826086956524,
"grad_norm": 0.37039250135421753,
"learning_rate": 7.924034053700065e-06,
"loss": 1.4375,
"step": 131
},
{
"epoch": 0.358695652173913,
"grad_norm": 0.3636303246021271,
"learning_rate": 7.922610263977701e-06,
"loss": 1.4134,
"step": 132
},
{
"epoch": 0.36141304347826086,
"grad_norm": 0.3509652614593506,
"learning_rate": 7.921182266009852e-06,
"loss": 1.4279,
"step": 133
},
{
"epoch": 0.3641304347826087,
"grad_norm": 0.3587414622306824,
"learning_rate": 7.919750041111659e-06,
"loss": 1.4191,
"step": 134
},
{
"epoch": 0.36684782608695654,
"grad_norm": 0.3569912910461426,
"learning_rate": 7.918313570487484e-06,
"loss": 1.4302,
"step": 135
},
{
"epoch": 0.3695652173913043,
"grad_norm": 0.3657996654510498,
"learning_rate": 7.916872835230084e-06,
"loss": 1.3599,
"step": 136
},
{
"epoch": 0.37228260869565216,
"grad_norm": 0.3624045252799988,
"learning_rate": 7.915427816319788e-06,
"loss": 1.309,
"step": 137
},
{
"epoch": 0.375,
"grad_norm": 0.34657520055770874,
"learning_rate": 7.913978494623655e-06,
"loss": 1.3104,
"step": 138
},
{
"epoch": 0.37771739130434784,
"grad_norm": 0.3397389352321625,
"learning_rate": 7.912524850894631e-06,
"loss": 1.4129,
"step": 139
},
{
"epoch": 0.3804347826086957,
"grad_norm": 0.356838583946228,
"learning_rate": 7.911066865770698e-06,
"loss": 1.4378,
"step": 140
},
{
"epoch": 0.38315217391304346,
"grad_norm": 0.37165331840515137,
"learning_rate": 7.909604519774012e-06,
"loss": 1.4103,
"step": 141
},
{
"epoch": 0.3858695652173913,
"grad_norm": 0.3432267904281616,
"learning_rate": 7.908137793310034e-06,
"loss": 1.3907,
"step": 142
},
{
"epoch": 0.38858695652173914,
"grad_norm": 0.36105579137802124,
"learning_rate": 7.906666666666667e-06,
"loss": 1.3705,
"step": 143
},
{
"epoch": 0.391304347826087,
"grad_norm": 0.37910157442092896,
"learning_rate": 7.905191120013353e-06,
"loss": 1.3521,
"step": 144
},
{
"epoch": 0.39402173913043476,
"grad_norm": 0.35084208846092224,
"learning_rate": 7.9037111334002e-06,
"loss": 1.4527,
"step": 145
},
{
"epoch": 0.3967391304347826,
"grad_norm": 0.3619990050792694,
"learning_rate": 7.902226686757073e-06,
"loss": 1.4008,
"step": 146
},
{
"epoch": 0.39945652173913043,
"grad_norm": 0.3557009994983673,
"learning_rate": 7.90073775989269e-06,
"loss": 1.4537,
"step": 147
},
{
"epoch": 0.40217391304347827,
"grad_norm": 0.36351799964904785,
"learning_rate": 7.899244332493702e-06,
"loss": 1.3187,
"step": 148
},
{
"epoch": 0.4048913043478261,
"grad_norm": 0.3715021312236786,
"learning_rate": 7.897746384123781e-06,
"loss": 1.4033,
"step": 149
},
{
"epoch": 0.4076086956521739,
"grad_norm": 0.353608638048172,
"learning_rate": 7.896243894222672e-06,
"loss": 1.4045,
"step": 150
},
{
"epoch": 0.41032608695652173,
"grad_norm": 0.36378586292266846,
"learning_rate": 7.894736842105263e-06,
"loss": 1.3938,
"step": 151
},
{
"epoch": 0.41304347826086957,
"grad_norm": 0.34492647647857666,
"learning_rate": 7.893225206960635e-06,
"loss": 1.3993,
"step": 152
},
{
"epoch": 0.4157608695652174,
"grad_norm": 0.3632250726222992,
"learning_rate": 7.891708967851099e-06,
"loss": 1.3533,
"step": 153
},
{
"epoch": 0.41847826086956524,
"grad_norm": 0.349479079246521,
"learning_rate": 7.890188103711236e-06,
"loss": 1.3298,
"step": 154
},
{
"epoch": 0.421195652173913,
"grad_norm": 0.3889572322368622,
"learning_rate": 7.88866259334691e-06,
"loss": 1.3372,
"step": 155
},
{
"epoch": 0.42391304347826086,
"grad_norm": 0.38098421692848206,
"learning_rate": 7.887132415434302e-06,
"loss": 1.3923,
"step": 156
},
{
"epoch": 0.4266304347826087,
"grad_norm": 0.36011362075805664,
"learning_rate": 7.885597548518897e-06,
"loss": 1.4739,
"step": 157
},
{
"epoch": 0.42934782608695654,
"grad_norm": 0.3780601918697357,
"learning_rate": 7.884057971014493e-06,
"loss": 1.366,
"step": 158
},
{
"epoch": 0.4320652173913043,
"grad_norm": 0.3612130582332611,
"learning_rate": 7.882513661202186e-06,
"loss": 1.3845,
"step": 159
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.36176979541778564,
"learning_rate": 7.88096459722935e-06,
"loss": 1.4583,
"step": 160
},
{
"epoch": 0.4375,
"grad_norm": 0.3663961589336395,
"learning_rate": 7.8794107571086e-06,
"loss": 1.3918,
"step": 161
},
{
"epoch": 0.44021739130434784,
"grad_norm": 0.3562765121459961,
"learning_rate": 7.877852118716762e-06,
"loss": 1.389,
"step": 162
},
{
"epoch": 0.4429347826086957,
"grad_norm": 0.35205936431884766,
"learning_rate": 7.876288659793814e-06,
"loss": 1.3838,
"step": 163
},
{
"epoch": 0.44565217391304346,
"grad_norm": 0.36153075098991394,
"learning_rate": 7.874720357941835e-06,
"loss": 1.3615,
"step": 164
},
{
"epoch": 0.4483695652173913,
"grad_norm": 0.39255401492118835,
"learning_rate": 7.873147190623922e-06,
"loss": 1.3994,
"step": 165
},
{
"epoch": 0.45108695652173914,
"grad_norm": 0.3576897084712982,
"learning_rate": 7.871569135163129e-06,
"loss": 1.3612,
"step": 166
},
{
"epoch": 0.453804347826087,
"grad_norm": 0.3810504674911499,
"learning_rate": 7.869986168741356e-06,
"loss": 1.3979,
"step": 167
},
{
"epoch": 0.45652173913043476,
"grad_norm": 0.3665294349193573,
"learning_rate": 7.868398268398268e-06,
"loss": 1.416,
"step": 168
},
{
"epoch": 0.4592391304347826,
"grad_norm": 0.37046587467193604,
"learning_rate": 7.866805411030177e-06,
"loss": 1.3696,
"step": 169
},
{
"epoch": 0.46195652173913043,
"grad_norm": 0.3550441265106201,
"learning_rate": 7.865207573388918e-06,
"loss": 1.4137,
"step": 170
},
{
"epoch": 0.46467391304347827,
"grad_norm": 0.35164910554885864,
"learning_rate": 7.863604732080725e-06,
"loss": 1.3666,
"step": 171
},
{
"epoch": 0.4673913043478261,
"grad_norm": 0.334942489862442,
"learning_rate": 7.861996863565082e-06,
"loss": 1.3725,
"step": 172
},
{
"epoch": 0.4701086956521739,
"grad_norm": 0.36114388704299927,
"learning_rate": 7.860383944153578e-06,
"loss": 1.361,
"step": 173
},
{
"epoch": 0.47282608695652173,
"grad_norm": 0.3654029965400696,
"learning_rate": 7.85876595000874e-06,
"loss": 1.3703,
"step": 174
},
{
"epoch": 0.47554347826086957,
"grad_norm": 0.3569169044494629,
"learning_rate": 7.857142857142858e-06,
"loss": 1.384,
"step": 175
},
{
"epoch": 0.4782608695652174,
"grad_norm": 0.38499578833580017,
"learning_rate": 7.855514641416798e-06,
"loss": 1.3818,
"step": 176
},
{
"epoch": 0.48097826086956524,
"grad_norm": 0.353002667427063,
"learning_rate": 7.853881278538813e-06,
"loss": 1.4214,
"step": 177
},
{
"epoch": 0.483695652173913,
"grad_norm": 0.34458428621292114,
"learning_rate": 7.852242744063325e-06,
"loss": 1.4166,
"step": 178
},
{
"epoch": 0.48641304347826086,
"grad_norm": 0.3483969271183014,
"learning_rate": 7.850599013389712e-06,
"loss": 1.3618,
"step": 179
},
{
"epoch": 0.4891304347826087,
"grad_norm": 0.35649237036705017,
"learning_rate": 7.848950061761072e-06,
"loss": 1.4088,
"step": 180
},
{
"epoch": 0.49184782608695654,
"grad_norm": 0.37080127000808716,
"learning_rate": 7.84729586426299e-06,
"loss": 1.3909,
"step": 181
},
{
"epoch": 0.4945652173913043,
"grad_norm": 0.35322239995002747,
"learning_rate": 7.845636395822269e-06,
"loss": 1.2835,
"step": 182
},
{
"epoch": 0.49728260869565216,
"grad_norm": 0.34811776876449585,
"learning_rate": 7.843971631205673e-06,
"loss": 1.3693,
"step": 183
},
{
"epoch": 0.5,
"grad_norm": 0.3507843315601349,
"learning_rate": 7.842301545018646e-06,
"loss": 1.4295,
"step": 184
},
{
"epoch": 0.5027173913043478,
"grad_norm": 0.36517709493637085,
"learning_rate": 7.84062611170402e-06,
"loss": 1.3763,
"step": 185
},
{
"epoch": 0.5054347826086957,
"grad_norm": 0.3709813058376312,
"learning_rate": 7.83894530554071e-06,
"loss": 1.3681,
"step": 186
},
{
"epoch": 0.5081521739130435,
"grad_norm": 0.3524121046066284,
"learning_rate": 7.837259100642398e-06,
"loss": 1.3644,
"step": 187
},
{
"epoch": 0.5108695652173914,
"grad_norm": 0.3615017533302307,
"learning_rate": 7.83556747095621e-06,
"loss": 1.4233,
"step": 188
},
{
"epoch": 0.5135869565217391,
"grad_norm": 0.3449147939682007,
"learning_rate": 7.833870390261366e-06,
"loss": 1.4012,
"step": 189
},
{
"epoch": 0.5163043478260869,
"grad_norm": 0.3625212013721466,
"learning_rate": 7.832167832167831e-06,
"loss": 1.4591,
"step": 190
},
{
"epoch": 0.5190217391304348,
"grad_norm": 0.390602707862854,
"learning_rate": 7.830459770114941e-06,
"loss": 1.4077,
"step": 191
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.3582949936389923,
"learning_rate": 7.82874617737003e-06,
"loss": 1.3576,
"step": 192
},
{
"epoch": 0.5244565217391305,
"grad_norm": 0.3695563077926636,
"learning_rate": 7.827027027027026e-06,
"loss": 1.3497,
"step": 193
},
{
"epoch": 0.5271739130434783,
"grad_norm": 0.3619958162307739,
"learning_rate": 7.825302292005052e-06,
"loss": 1.319,
"step": 194
},
{
"epoch": 0.529891304347826,
"grad_norm": 0.364831805229187,
"learning_rate": 7.823571945046999e-06,
"loss": 1.3109,
"step": 195
},
{
"epoch": 0.532608695652174,
"grad_norm": 0.3568253219127655,
"learning_rate": 7.821835958718086e-06,
"loss": 1.3776,
"step": 196
},
{
"epoch": 0.5353260869565217,
"grad_norm": 0.37014317512512207,
"learning_rate": 7.820094305404424e-06,
"loss": 1.4481,
"step": 197
},
{
"epoch": 0.5380434782608695,
"grad_norm": 0.37578845024108887,
"learning_rate": 7.818346957311535e-06,
"loss": 1.3715,
"step": 198
},
{
"epoch": 0.5407608695652174,
"grad_norm": 0.347926527261734,
"learning_rate": 7.816593886462881e-06,
"loss": 1.3542,
"step": 199
},
{
"epoch": 0.5434782608695652,
"grad_norm": 0.36149612069129944,
"learning_rate": 7.814835064698378e-06,
"loss": 1.4431,
"step": 200
},
{
"epoch": 0.5461956521739131,
"grad_norm": 0.35200005769729614,
"learning_rate": 7.813070463672874e-06,
"loss": 1.3958,
"step": 201
},
{
"epoch": 0.5489130434782609,
"grad_norm": 0.34820863604545593,
"learning_rate": 7.811300054854634e-06,
"loss": 1.2673,
"step": 202
},
{
"epoch": 0.5516304347826086,
"grad_norm": 0.36692818999290466,
"learning_rate": 7.80952380952381e-06,
"loss": 1.3734,
"step": 203
},
{
"epoch": 0.5543478260869565,
"grad_norm": 0.3500845432281494,
"learning_rate": 7.807741698770867e-06,
"loss": 1.3535,
"step": 204
},
{
"epoch": 0.5570652173913043,
"grad_norm": 0.3685120642185211,
"learning_rate": 7.805953693495038e-06,
"loss": 1.3531,
"step": 205
},
{
"epoch": 0.5597826086956522,
"grad_norm": 0.3602903485298157,
"learning_rate": 7.804159764402723e-06,
"loss": 1.3251,
"step": 206
},
{
"epoch": 0.5625,
"grad_norm": 0.359762966632843,
"learning_rate": 7.802359882005899e-06,
"loss": 1.3668,
"step": 207
},
{
"epoch": 0.5652173913043478,
"grad_norm": 0.3595513701438904,
"learning_rate": 7.800554016620498e-06,
"loss": 1.5346,
"step": 208
},
{
"epoch": 0.5679347826086957,
"grad_norm": 0.372349351644516,
"learning_rate": 7.79874213836478e-06,
"loss": 1.431,
"step": 209
},
{
"epoch": 0.5706521739130435,
"grad_norm": 0.3687814474105835,
"learning_rate": 7.79692421715768e-06,
"loss": 1.3849,
"step": 210
},
{
"epoch": 0.5733695652173914,
"grad_norm": 0.3541337251663208,
"learning_rate": 7.795100222717148e-06,
"loss": 1.3575,
"step": 211
},
{
"epoch": 0.5760869565217391,
"grad_norm": 0.3784167170524597,
"learning_rate": 7.793270124558468e-06,
"loss": 1.3768,
"step": 212
},
{
"epoch": 0.5788043478260869,
"grad_norm": 0.3548452854156494,
"learning_rate": 7.79143389199255e-06,
"loss": 1.416,
"step": 213
},
{
"epoch": 0.5815217391304348,
"grad_norm": 0.3744758367538452,
"learning_rate": 7.78959149412423e-06,
"loss": 1.4005,
"step": 214
},
{
"epoch": 0.5842391304347826,
"grad_norm": 0.34252363443374634,
"learning_rate": 7.787742899850522e-06,
"loss": 1.3614,
"step": 215
},
{
"epoch": 0.5869565217391305,
"grad_norm": 0.34789708256721497,
"learning_rate": 7.78588807785888e-06,
"loss": 1.3754,
"step": 216
},
{
"epoch": 0.5896739130434783,
"grad_norm": 0.366502046585083,
"learning_rate": 7.784026996625422e-06,
"loss": 1.3839,
"step": 217
},
{
"epoch": 0.592391304347826,
"grad_norm": 0.39890167117118835,
"learning_rate": 7.782159624413145e-06,
"loss": 1.4088,
"step": 218
},
{
"epoch": 0.595108695652174,
"grad_norm": 0.35646381974220276,
"learning_rate": 7.780285929270127e-06,
"loss": 1.3362,
"step": 219
},
{
"epoch": 0.5978260869565217,
"grad_norm": 0.36038488149642944,
"learning_rate": 7.778405879027698e-06,
"loss": 1.349,
"step": 220
},
{
"epoch": 0.6005434782608695,
"grad_norm": 0.35663580894470215,
"learning_rate": 7.776519441298602e-06,
"loss": 1.3727,
"step": 221
},
{
"epoch": 0.6032608695652174,
"grad_norm": 0.36461469531059265,
"learning_rate": 7.774626583475137e-06,
"loss": 1.4629,
"step": 222
},
{
"epoch": 0.6059782608695652,
"grad_norm": 0.35047125816345215,
"learning_rate": 7.772727272727272e-06,
"loss": 1.2795,
"step": 223
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.36245492100715637,
"learning_rate": 7.770821476000759e-06,
"loss": 1.3618,
"step": 224
},
{
"epoch": 0.6114130434782609,
"grad_norm": 0.3742018938064575,
"learning_rate": 7.768909160015202e-06,
"loss": 1.459,
"step": 225
},
{
"epoch": 0.6141304347826086,
"grad_norm": 0.35593125224113464,
"learning_rate": 7.766990291262136e-06,
"loss": 1.4521,
"step": 226
},
{
"epoch": 0.6168478260869565,
"grad_norm": 0.3579864203929901,
"learning_rate": 7.76506483600305e-06,
"loss": 1.4157,
"step": 227
},
{
"epoch": 0.6195652173913043,
"grad_norm": 0.352098286151886,
"learning_rate": 7.76313276026743e-06,
"loss": 1.4013,
"step": 228
},
{
"epoch": 0.6222826086956522,
"grad_norm": 0.36963212490081787,
"learning_rate": 7.761194029850745e-06,
"loss": 1.4101,
"step": 229
},
{
"epoch": 0.625,
"grad_norm": 0.36608660221099854,
"learning_rate": 7.75924861031244e-06,
"loss": 1.3273,
"step": 230
},
{
"epoch": 0.6277173913043478,
"grad_norm": 0.37208378314971924,
"learning_rate": 7.757296466973885e-06,
"loss": 1.388,
"step": 231
},
{
"epoch": 0.6304347826086957,
"grad_norm": 0.34387239813804626,
"learning_rate": 7.75533756491633e-06,
"loss": 1.3463,
"step": 232
},
{
"epoch": 0.6331521739130435,
"grad_norm": 0.338460773229599,
"learning_rate": 7.753371868978805e-06,
"loss": 1.3765,
"step": 233
},
{
"epoch": 0.6358695652173914,
"grad_norm": 0.3468306064605713,
"learning_rate": 7.751399343756032e-06,
"loss": 1.311,
"step": 234
},
{
"epoch": 0.6385869565217391,
"grad_norm": 0.3542969524860382,
"learning_rate": 7.749419953596288e-06,
"loss": 1.3258,
"step": 235
},
{
"epoch": 0.6413043478260869,
"grad_norm": 0.3702642023563385,
"learning_rate": 7.747433662599263e-06,
"loss": 1.4132,
"step": 236
},
{
"epoch": 0.6440217391304348,
"grad_norm": 0.42715317010879517,
"learning_rate": 7.745440434613891e-06,
"loss": 1.3031,
"step": 237
},
{
"epoch": 0.6467391304347826,
"grad_norm": 0.35499247908592224,
"learning_rate": 7.74344023323615e-06,
"loss": 1.4496,
"step": 238
},
{
"epoch": 0.6494565217391305,
"grad_norm": 0.37534981966018677,
"learning_rate": 7.741433021806853e-06,
"loss": 1.4033,
"step": 239
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.35908043384552,
"learning_rate": 7.739418763409401e-06,
"loss": 1.4686,
"step": 240
},
{
"epoch": 0.654891304347826,
"grad_norm": 0.365974485874176,
"learning_rate": 7.737397420867525e-06,
"loss": 1.3485,
"step": 241
},
{
"epoch": 0.657608695652174,
"grad_norm": 0.3661860227584839,
"learning_rate": 7.735368956743002e-06,
"loss": 1.3576,
"step": 242
},
{
"epoch": 0.6603260869565217,
"grad_norm": 0.352400004863739,
"learning_rate": 7.733333333333333e-06,
"loss": 1.4192,
"step": 243
},
{
"epoch": 0.6630434782608695,
"grad_norm": 0.3496619760990143,
"learning_rate": 7.731290512669416e-06,
"loss": 1.3675,
"step": 244
},
{
"epoch": 0.6657608695652174,
"grad_norm": 0.34792232513427734,
"learning_rate": 7.729240456513184e-06,
"loss": 1.3972,
"step": 245
},
{
"epoch": 0.6684782608695652,
"grad_norm": 0.346755713224411,
"learning_rate": 7.727183126355213e-06,
"loss": 1.4235,
"step": 246
},
{
"epoch": 0.6711956521739131,
"grad_norm": 0.35014012455940247,
"learning_rate": 7.725118483412322e-06,
"loss": 1.3392,
"step": 247
},
{
"epoch": 0.6739130434782609,
"grad_norm": 0.3645906150341034,
"learning_rate": 7.723046488625123e-06,
"loss": 1.3828,
"step": 248
},
{
"epoch": 0.6766304347826086,
"grad_norm": 0.3508262634277344,
"learning_rate": 7.720967102655568e-06,
"loss": 1.3201,
"step": 249
},
{
"epoch": 0.6793478260869565,
"grad_norm": 0.3524917960166931,
"learning_rate": 7.718880285884455e-06,
"loss": 1.3746,
"step": 250
},
{
"epoch": 0.6820652173913043,
"grad_norm": 0.35207194089889526,
"learning_rate": 7.71678599840891e-06,
"loss": 1.3566,
"step": 251
},
{
"epoch": 0.6847826086956522,
"grad_norm": 0.34460002183914185,
"learning_rate": 7.714684200039848e-06,
"loss": 1.3067,
"step": 252
},
{
"epoch": 0.6875,
"grad_norm": 0.3507847785949707,
"learning_rate": 7.7125748502994e-06,
"loss": 1.3619,
"step": 253
},
{
"epoch": 0.6902173913043478,
"grad_norm": 0.36836346983909607,
"learning_rate": 7.710457908418317e-06,
"loss": 1.3668,
"step": 254
},
{
"epoch": 0.6929347826086957,
"grad_norm": 0.35324111580848694,
"learning_rate": 7.708333333333332e-06,
"loss": 1.384,
"step": 255
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.3476124405860901,
"learning_rate": 7.706201083684526e-06,
"loss": 1.3887,
"step": 256
},
{
"epoch": 0.6983695652173914,
"grad_norm": 0.3560345768928528,
"learning_rate": 7.704061117812625e-06,
"loss": 1.3833,
"step": 257
},
{
"epoch": 0.7010869565217391,
"grad_norm": 0.35968106985092163,
"learning_rate": 7.701913393756293e-06,
"loss": 1.3493,
"step": 258
},
{
"epoch": 0.7038043478260869,
"grad_norm": 0.3483898639678955,
"learning_rate": 7.699757869249394e-06,
"loss": 1.3651,
"step": 259
},
{
"epoch": 0.7065217391304348,
"grad_norm": 0.37258586287498474,
"learning_rate": 7.697594501718214e-06,
"loss": 1.4654,
"step": 260
},
{
"epoch": 0.7092391304347826,
"grad_norm": 0.37677285075187683,
"learning_rate": 7.695423248278656e-06,
"loss": 1.331,
"step": 261
},
{
"epoch": 0.7119565217391305,
"grad_norm": 0.3459789454936981,
"learning_rate": 7.693244065733414e-06,
"loss": 1.4426,
"step": 262
},
{
"epoch": 0.7146739130434783,
"grad_norm": 0.35081684589385986,
"learning_rate": 7.691056910569105e-06,
"loss": 1.3873,
"step": 263
},
{
"epoch": 0.717391304347826,
"grad_norm": 0.377763956785202,
"learning_rate": 7.68886173895337e-06,
"loss": 1.3201,
"step": 264
},
{
"epoch": 0.720108695652174,
"grad_norm": 0.35093316435813904,
"learning_rate": 7.686658506731945e-06,
"loss": 1.3274,
"step": 265
},
{
"epoch": 0.7228260869565217,
"grad_norm": 0.3494810461997986,
"learning_rate": 7.68444716942571e-06,
"loss": 1.3173,
"step": 266
},
{
"epoch": 0.7255434782608695,
"grad_norm": 0.3792516887187958,
"learning_rate": 7.682227682227683e-06,
"loss": 1.4449,
"step": 267
},
{
"epoch": 0.7282608695652174,
"grad_norm": 0.3558602035045624,
"learning_rate": 7.68e-06,
"loss": 1.3737,
"step": 268
},
{
"epoch": 0.7309782608695652,
"grad_norm": 0.3495853543281555,
"learning_rate": 7.677764077270858e-06,
"loss": 1.4514,
"step": 269
},
{
"epoch": 0.7336956521739131,
"grad_norm": 0.3459034860134125,
"learning_rate": 7.675519868231419e-06,
"loss": 1.3891,
"step": 270
},
{
"epoch": 0.7364130434782609,
"grad_norm": 0.37044718861579895,
"learning_rate": 7.673267326732673e-06,
"loss": 1.4816,
"step": 271
},
{
"epoch": 0.7391304347826086,
"grad_norm": 0.33735278248786926,
"learning_rate": 7.671006406282289e-06,
"loss": 1.3679,
"step": 272
},
{
"epoch": 0.7418478260869565,
"grad_norm": 0.3841879069805145,
"learning_rate": 7.668737060041406e-06,
"loss": 1.3936,
"step": 273
},
{
"epoch": 0.7445652173913043,
"grad_norm": 0.3517480790615082,
"learning_rate": 7.666459240821406e-06,
"loss": 1.3055,
"step": 274
},
{
"epoch": 0.7472826086956522,
"grad_norm": 0.34539881348609924,
"learning_rate": 7.664172901080633e-06,
"loss": 1.3261,
"step": 275
},
{
"epoch": 0.75,
"grad_norm": 0.3954020142555237,
"learning_rate": 7.66187799292109e-06,
"loss": 1.3094,
"step": 276
},
{
"epoch": 0.7527173913043478,
"grad_norm": 0.3624208867549896,
"learning_rate": 7.659574468085107e-06,
"loss": 1.3864,
"step": 277
},
{
"epoch": 0.7554347826086957,
"grad_norm": 0.34993529319763184,
"learning_rate": 7.657262277951932e-06,
"loss": 1.4306,
"step": 278
},
{
"epoch": 0.7581521739130435,
"grad_norm": 0.37371665239334106,
"learning_rate": 7.654941373534337e-06,
"loss": 1.3844,
"step": 279
},
{
"epoch": 0.7608695652173914,
"grad_norm": 0.36566704511642456,
"learning_rate": 7.652611705475141e-06,
"loss": 1.4169,
"step": 280
},
{
"epoch": 0.7635869565217391,
"grad_norm": 0.3945908546447754,
"learning_rate": 7.650273224043716e-06,
"loss": 1.4021,
"step": 281
},
{
"epoch": 0.7663043478260869,
"grad_norm": 0.36350396275520325,
"learning_rate": 7.647925879132448e-06,
"loss": 1.4002,
"step": 282
},
{
"epoch": 0.7690217391304348,
"grad_norm": 0.3854394555091858,
"learning_rate": 7.645569620253164e-06,
"loss": 1.4119,
"step": 283
},
{
"epoch": 0.7717391304347826,
"grad_norm": 0.3648253083229065,
"learning_rate": 7.643204396533503e-06,
"loss": 1.3952,
"step": 284
},
{
"epoch": 0.7744565217391305,
"grad_norm": 0.3636375665664673,
"learning_rate": 7.640830156713257e-06,
"loss": 1.4338,
"step": 285
},
{
"epoch": 0.7771739130434783,
"grad_norm": 0.3662267327308655,
"learning_rate": 7.638446849140676e-06,
"loss": 1.3377,
"step": 286
},
{
"epoch": 0.779891304347826,
"grad_norm": 0.3441767394542694,
"learning_rate": 7.636054421768708e-06,
"loss": 1.373,
"step": 287
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.3580285310745239,
"learning_rate": 7.633652822151224e-06,
"loss": 1.4107,
"step": 288
},
{
"epoch": 0.7853260869565217,
"grad_norm": 0.3530341386795044,
"learning_rate": 7.63124199743918e-06,
"loss": 1.3704,
"step": 289
},
{
"epoch": 0.7880434782608695,
"grad_norm": 0.35250723361968994,
"learning_rate": 7.628821894376737e-06,
"loss": 1.3558,
"step": 290
},
{
"epoch": 0.7907608695652174,
"grad_norm": 0.35989367961883545,
"learning_rate": 7.626392459297343e-06,
"loss": 1.3583,
"step": 291
},
{
"epoch": 0.7934782608695652,
"grad_norm": 0.34206631779670715,
"learning_rate": 7.623953638119768e-06,
"loss": 1.4009,
"step": 292
},
{
"epoch": 0.7961956521739131,
"grad_norm": 0.3655489683151245,
"learning_rate": 7.621505376344085e-06,
"loss": 1.3665,
"step": 293
},
{
"epoch": 0.7989130434782609,
"grad_norm": 0.36722490191459656,
"learning_rate": 7.619047619047619e-06,
"loss": 1.4071,
"step": 294
},
{
"epoch": 0.8016304347826086,
"grad_norm": 0.36561647057533264,
"learning_rate": 7.616580310880829e-06,
"loss": 1.3369,
"step": 295
},
{
"epoch": 0.8043478260869565,
"grad_norm": 0.3570895195007324,
"learning_rate": 7.614103396063162e-06,
"loss": 1.3277,
"step": 296
},
{
"epoch": 0.8070652173913043,
"grad_norm": 0.3854903280735016,
"learning_rate": 7.611616818378846e-06,
"loss": 1.4852,
"step": 297
},
{
"epoch": 0.8097826086956522,
"grad_norm": 0.3468440771102905,
"learning_rate": 7.609120521172638e-06,
"loss": 1.3272,
"step": 298
},
{
"epoch": 0.8125,
"grad_norm": 0.3752823770046234,
"learning_rate": 7.606614447345517e-06,
"loss": 1.4438,
"step": 299
},
{
"epoch": 0.8152173913043478,
"grad_norm": 0.3850380480289459,
"learning_rate": 7.6040985393503375e-06,
"loss": 1.3909,
"step": 300
},
{
"epoch": 0.8179347826086957,
"grad_norm": 0.34979549050331116,
"learning_rate": 7.6015727391874176e-06,
"loss": 1.3671,
"step": 301
},
{
"epoch": 0.8206521739130435,
"grad_norm": 0.37657612562179565,
"learning_rate": 7.599036988400088e-06,
"loss": 1.3691,
"step": 302
},
{
"epoch": 0.8233695652173914,
"grad_norm": 0.36025533080101013,
"learning_rate": 7.596491228070175e-06,
"loss": 1.4006,
"step": 303
},
{
"epoch": 0.8260869565217391,
"grad_norm": 0.3619629442691803,
"learning_rate": 7.593935398813448e-06,
"loss": 1.4321,
"step": 304
},
{
"epoch": 0.8288043478260869,
"grad_norm": 0.3546094596385956,
"learning_rate": 7.5913694407749885e-06,
"loss": 1.2595,
"step": 305
},
{
"epoch": 0.8315217391304348,
"grad_norm": 0.3541072607040405,
"learning_rate": 7.588793293624531e-06,
"loss": 1.3578,
"step": 306
},
{
"epoch": 0.8342391304347826,
"grad_norm": 0.3686266541481018,
"learning_rate": 7.586206896551724e-06,
"loss": 1.4234,
"step": 307
},
{
"epoch": 0.8369565217391305,
"grad_norm": 0.3526665270328522,
"learning_rate": 7.583610188261351e-06,
"loss": 1.4652,
"step": 308
},
{
"epoch": 0.8396739130434783,
"grad_norm": 0.3411577045917511,
"learning_rate": 7.5810031069684864e-06,
"loss": 1.3768,
"step": 309
},
{
"epoch": 0.842391304347826,
"grad_norm": 0.35096436738967896,
"learning_rate": 7.578385590393596e-06,
"loss": 1.438,
"step": 310
},
{
"epoch": 0.845108695652174,
"grad_norm": 0.36183464527130127,
"learning_rate": 7.575757575757576e-06,
"loss": 1.3347,
"step": 311
},
{
"epoch": 0.8478260869565217,
"grad_norm": 0.362402081489563,
"learning_rate": 7.5731189997767355e-06,
"loss": 1.3988,
"step": 312
},
{
"epoch": 0.8505434782608695,
"grad_norm": 0.3439631462097168,
"learning_rate": 7.570469798657717e-06,
"loss": 1.3964,
"step": 313
},
{
"epoch": 0.8532608695652174,
"grad_norm": 0.34573471546173096,
"learning_rate": 7.567809908092356e-06,
"loss": 1.4407,
"step": 314
},
{
"epoch": 0.8559782608695652,
"grad_norm": 0.37581852078437805,
"learning_rate": 7.5651392632524705e-06,
"loss": 1.3481,
"step": 315
},
{
"epoch": 0.8586956521739131,
"grad_norm": 0.37714245915412903,
"learning_rate": 7.5624577987846045e-06,
"loss": 1.3223,
"step": 316
},
{
"epoch": 0.8614130434782609,
"grad_norm": 0.3433416485786438,
"learning_rate": 7.559765448804691e-06,
"loss": 1.3849,
"step": 317
},
{
"epoch": 0.8641304347826086,
"grad_norm": 0.35157305002212524,
"learning_rate": 7.5570621468926556e-06,
"loss": 1.4306,
"step": 318
},
{
"epoch": 0.8668478260869565,
"grad_norm": 0.3580811321735382,
"learning_rate": 7.554347826086957e-06,
"loss": 1.4124,
"step": 319
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.35856080055236816,
"learning_rate": 7.5516224188790555e-06,
"loss": 1.3477,
"step": 320
},
{
"epoch": 0.8722826086956522,
"grad_norm": 0.35569003224372864,
"learning_rate": 7.548885857207822e-06,
"loss": 1.4032,
"step": 321
},
{
"epoch": 0.875,
"grad_norm": 0.34170466661453247,
"learning_rate": 7.546138072453862e-06,
"loss": 1.3616,
"step": 322
},
{
"epoch": 0.8777173913043478,
"grad_norm": 0.34092995524406433,
"learning_rate": 7.54337899543379e-06,
"loss": 1.3248,
"step": 323
},
{
"epoch": 0.8804347826086957,
"grad_norm": 0.3783915340900421,
"learning_rate": 7.540608556394418e-06,
"loss": 1.3918,
"step": 324
},
{
"epoch": 0.8831521739130435,
"grad_norm": 0.35835227370262146,
"learning_rate": 7.537826685006877e-06,
"loss": 1.3379,
"step": 325
},
{
"epoch": 0.8858695652173914,
"grad_norm": 0.3583104908466339,
"learning_rate": 7.535033310360671e-06,
"loss": 1.4401,
"step": 326
},
{
"epoch": 0.8885869565217391,
"grad_norm": 0.3653166592121124,
"learning_rate": 7.532228360957643e-06,
"loss": 1.3051,
"step": 327
},
{
"epoch": 0.8913043478260869,
"grad_norm": 0.3713383376598358,
"learning_rate": 7.529411764705882e-06,
"loss": 1.3492,
"step": 328
},
{
"epoch": 0.8940217391304348,
"grad_norm": 0.3575204908847809,
"learning_rate": 7.5265834489135455e-06,
"loss": 1.3615,
"step": 329
},
{
"epoch": 0.8967391304347826,
"grad_norm": 0.366793692111969,
"learning_rate": 7.523743340282604e-06,
"loss": 1.3239,
"step": 330
},
{
"epoch": 0.8994565217391305,
"grad_norm": 0.3851720988750458,
"learning_rate": 7.520891364902507e-06,
"loss": 1.4105,
"step": 331
},
{
"epoch": 0.9021739130434783,
"grad_norm": 0.3682537078857422,
"learning_rate": 7.518027448243778e-06,
"loss": 1.3926,
"step": 332
},
{
"epoch": 0.904891304347826,
"grad_norm": 0.35508108139038086,
"learning_rate": 7.515151515151516e-06,
"loss": 1.39,
"step": 333
},
{
"epoch": 0.907608695652174,
"grad_norm": 0.35429316759109497,
"learning_rate": 7.512263489838823e-06,
"loss": 1.4277,
"step": 334
},
{
"epoch": 0.9103260869565217,
"grad_norm": 0.35008707642555237,
"learning_rate": 7.50936329588015e-06,
"loss": 1.3593,
"step": 335
},
{
"epoch": 0.9130434782608695,
"grad_norm": 0.35157814621925354,
"learning_rate": 7.5064508562045505e-06,
"loss": 1.3293,
"step": 336
},
{
"epoch": 0.9157608695652174,
"grad_norm": 0.3862317204475403,
"learning_rate": 7.503526093088858e-06,
"loss": 1.3472,
"step": 337
},
{
"epoch": 0.9184782608695652,
"grad_norm": 0.36031222343444824,
"learning_rate": 7.500588928150765e-06,
"loss": 1.3446,
"step": 338
},
{
"epoch": 0.9211956521739131,
"grad_norm": 0.3770779073238373,
"learning_rate": 7.497639282341832e-06,
"loss": 1.3005,
"step": 339
},
{
"epoch": 0.9239130434782609,
"grad_norm": 0.3638140857219696,
"learning_rate": 7.494677075940384e-06,
"loss": 1.3519,
"step": 340
},
{
"epoch": 0.9266304347826086,
"grad_norm": 0.3515496253967285,
"learning_rate": 7.491702228544334e-06,
"loss": 1.4093,
"step": 341
},
{
"epoch": 0.9293478260869565,
"grad_norm": 0.3889096975326538,
"learning_rate": 7.48871465906391e-06,
"loss": 1.3816,
"step": 342
},
{
"epoch": 0.9320652173913043,
"grad_norm": 0.3584050238132477,
"learning_rate": 7.485714285714285e-06,
"loss": 1.4043,
"step": 343
},
{
"epoch": 0.9347826086956522,
"grad_norm": 0.360524982213974,
"learning_rate": 7.4827010260081125e-06,
"loss": 1.3481,
"step": 344
},
{
"epoch": 0.9375,
"grad_norm": 0.3846801817417145,
"learning_rate": 7.4796747967479676e-06,
"loss": 1.4366,
"step": 345
},
{
"epoch": 0.9402173913043478,
"grad_norm": 0.35864099860191345,
"learning_rate": 7.476635514018692e-06,
"loss": 1.3096,
"step": 346
},
{
"epoch": 0.9429347826086957,
"grad_norm": 0.37097325921058655,
"learning_rate": 7.473583093179635e-06,
"loss": 1.2847,
"step": 347
},
{
"epoch": 0.9456521739130435,
"grad_norm": 0.396785169839859,
"learning_rate": 7.4705174488568e-06,
"loss": 1.3337,
"step": 348
},
{
"epoch": 0.9483695652173914,
"grad_norm": 0.3547925353050232,
"learning_rate": 7.467438494934877e-06,
"loss": 1.3112,
"step": 349
},
{
"epoch": 0.9510869565217391,
"grad_norm": 0.3848293721675873,
"learning_rate": 7.46434614454919e-06,
"loss": 1.3002,
"step": 350
},
{
"epoch": 0.9538043478260869,
"grad_norm": 0.38206636905670166,
"learning_rate": 7.46124031007752e-06,
"loss": 1.3621,
"step": 351
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.3636261820793152,
"learning_rate": 7.458120903131829e-06,
"loss": 1.3595,
"step": 352
},
{
"epoch": 0.9592391304347826,
"grad_norm": 0.3859856426715851,
"learning_rate": 7.454987834549879e-06,
"loss": 1.3782,
"step": 353
},
{
"epoch": 0.9619565217391305,
"grad_norm": 0.3692803382873535,
"learning_rate": 7.451841014386735e-06,
"loss": 1.3679,
"step": 354
},
{
"epoch": 0.9646739130434783,
"grad_norm": 0.3576744794845581,
"learning_rate": 7.448680351906158e-06,
"loss": 1.3806,
"step": 355
},
{
"epoch": 0.967391304347826,
"grad_norm": 0.4264325499534607,
"learning_rate": 7.4455057555718835e-06,
"loss": 1.4033,
"step": 356
},
{
"epoch": 0.970108695652174,
"grad_norm": 0.3938155770301819,
"learning_rate": 7.442317133038782e-06,
"loss": 1.3224,
"step": 357
},
{
"epoch": 0.9728260869565217,
"grad_norm": 0.37210017442703247,
"learning_rate": 7.439114391143912e-06,
"loss": 1.3148,
"step": 358
},
{
"epoch": 0.9755434782608695,
"grad_norm": 0.35559719800949097,
"learning_rate": 7.435897435897436e-06,
"loss": 1.3092,
"step": 359
},
{
"epoch": 0.9782608695652174,
"grad_norm": 0.4250313937664032,
"learning_rate": 7.432666172473437e-06,
"loss": 1.3665,
"step": 360
},
{
"epoch": 0.9809782608695652,
"grad_norm": 0.34938138723373413,
"learning_rate": 7.429420505200594e-06,
"loss": 1.3979,
"step": 361
},
{
"epoch": 0.9836956521739131,
"grad_norm": 0.3839857876300812,
"learning_rate": 7.426160337552743e-06,
"loss": 1.3008,
"step": 362
},
{
"epoch": 0.9864130434782609,
"grad_norm": 0.37689098715782166,
"learning_rate": 7.422885572139303e-06,
"loss": 1.4584,
"step": 363
},
{
"epoch": 0.9891304347826086,
"grad_norm": 0.35095831751823425,
"learning_rate": 7.4195961106955874e-06,
"loss": 1.286,
"step": 364
},
{
"epoch": 0.9918478260869565,
"grad_norm": 0.34754878282546997,
"learning_rate": 7.416291854072963e-06,
"loss": 1.2923,
"step": 365
},
{
"epoch": 0.9945652173913043,
"grad_norm": 0.3640921413898468,
"learning_rate": 7.412972702228902e-06,
"loss": 1.4107,
"step": 366
},
{
"epoch": 0.9972826086956522,
"grad_norm": 0.33963489532470703,
"learning_rate": 7.409638554216868e-06,
"loss": 1.3914,
"step": 367
},
{
"epoch": 1.0,
"grad_norm": 0.36554214358329773,
"learning_rate": 7.406289308176099e-06,
"loss": 1.4259,
"step": 368
},
{
"epoch": 1.002717391304348,
"grad_norm": 0.34235456585884094,
"learning_rate": 7.40292486132123e-06,
"loss": 1.4269,
"step": 369
},
{
"epoch": 1.0054347826086956,
"grad_norm": 0.34481701254844666,
"learning_rate": 7.399545109931765e-06,
"loss": 1.4128,
"step": 370
},
{
"epoch": 1.0081521739130435,
"grad_norm": 0.362263023853302,
"learning_rate": 7.396149949341437e-06,
"loss": 1.3562,
"step": 371
},
{
"epoch": 1.0108695652173914,
"grad_norm": 0.351425439119339,
"learning_rate": 7.392739273927391e-06,
"loss": 1.3482,
"step": 372
},
{
"epoch": 1.013586956521739,
"grad_norm": 0.33792799711227417,
"learning_rate": 7.3893129770992355e-06,
"loss": 1.3668,
"step": 373
},
{
"epoch": 1.016304347826087,
"grad_norm": 0.35396307706832886,
"learning_rate": 7.385870951287936e-06,
"loss": 1.2686,
"step": 374
},
{
"epoch": 1.0190217391304348,
"grad_norm": 0.3580890893936157,
"learning_rate": 7.38241308793456e-06,
"loss": 1.3531,
"step": 375
},
{
"epoch": 1.0217391304347827,
"grad_norm": 0.3556496798992157,
"learning_rate": 7.3789392774788615e-06,
"loss": 1.347,
"step": 376
},
{
"epoch": 1.0244565217391304,
"grad_norm": 0.3453238904476166,
"learning_rate": 7.3754494093477135e-06,
"loss": 1.2562,
"step": 377
},
{
"epoch": 1.0271739130434783,
"grad_norm": 0.3520653545856476,
"learning_rate": 7.371943371943371e-06,
"loss": 1.3663,
"step": 378
},
{
"epoch": 1.0298913043478262,
"grad_norm": 0.35983365774154663,
"learning_rate": 7.368421052631578e-06,
"loss": 1.3657,
"step": 379
},
{
"epoch": 1.0326086956521738,
"grad_norm": 0.34599393606185913,
"learning_rate": 7.364882337729505e-06,
"loss": 1.3984,
"step": 380
},
{
"epoch": 1.0353260869565217,
"grad_norm": 0.34382811188697815,
"learning_rate": 7.361327112493519e-06,
"loss": 1.3781,
"step": 381
},
{
"epoch": 1.0380434782608696,
"grad_norm": 0.340087890625,
"learning_rate": 7.357755261106781e-06,
"loss": 1.3895,
"step": 382
},
{
"epoch": 1.0407608695652173,
"grad_norm": 0.3491702079772949,
"learning_rate": 7.3541666666666656e-06,
"loss": 1.3423,
"step": 383
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.3492245674133301,
"learning_rate": 7.350561211172017e-06,
"loss": 1.3223,
"step": 384
},
{
"epoch": 1.046195652173913,
"grad_norm": 0.35501372814178467,
"learning_rate": 7.346938775510203e-06,
"loss": 1.5079,
"step": 385
},
{
"epoch": 1.048913043478261,
"grad_norm": 0.3509463965892792,
"learning_rate": 7.343299239444006e-06,
"loss": 1.3769,
"step": 386
},
{
"epoch": 1.0516304347826086,
"grad_norm": 0.3643050789833069,
"learning_rate": 7.339642481598317e-06,
"loss": 1.3845,
"step": 387
},
{
"epoch": 1.0543478260869565,
"grad_norm": 0.3647449016571045,
"learning_rate": 7.335968379446639e-06,
"loss": 1.3997,
"step": 388
},
{
"epoch": 1.0570652173913044,
"grad_norm": 0.34432703256607056,
"learning_rate": 7.33227680929741e-06,
"loss": 1.3715,
"step": 389
},
{
"epoch": 1.059782608695652,
"grad_norm": 0.3494293689727783,
"learning_rate": 7.328567646280116e-06,
"loss": 1.4117,
"step": 390
},
{
"epoch": 1.0625,
"grad_norm": 0.35947683453559875,
"learning_rate": 7.324840764331209e-06,
"loss": 1.3205,
"step": 391
},
{
"epoch": 1.065217391304348,
"grad_norm": 0.35863590240478516,
"learning_rate": 7.321096036179834e-06,
"loss": 1.3203,
"step": 392
},
{
"epoch": 1.0679347826086956,
"grad_norm": 0.35642603039741516,
"learning_rate": 7.317333333333333e-06,
"loss": 1.3938,
"step": 393
},
{
"epoch": 1.0706521739130435,
"grad_norm": 0.35906854271888733,
"learning_rate": 7.313552526062549e-06,
"loss": 1.3695,
"step": 394
},
{
"epoch": 1.0733695652173914,
"grad_norm": 0.3712838590145111,
"learning_rate": 7.309753483386923e-06,
"loss": 1.3674,
"step": 395
},
{
"epoch": 1.0760869565217392,
"grad_norm": 0.38039541244506836,
"learning_rate": 7.30593607305936e-06,
"loss": 1.4794,
"step": 396
},
{
"epoch": 1.078804347826087,
"grad_norm": 0.3319692015647888,
"learning_rate": 7.302100161550887e-06,
"loss": 1.3669,
"step": 397
},
{
"epoch": 1.0815217391304348,
"grad_norm": 0.3548648953437805,
"learning_rate": 7.298245614035087e-06,
"loss": 1.3456,
"step": 398
},
{
"epoch": 1.0842391304347827,
"grad_norm": 0.4208085238933563,
"learning_rate": 7.2943722943722935e-06,
"loss": 1.3859,
"step": 399
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.34764373302459717,
"learning_rate": 7.290480065093571e-06,
"loss": 1.3748,
"step": 400
},
{
"epoch": 1.0896739130434783,
"grad_norm": 0.35038769245147705,
"learning_rate": 7.286568787384447e-06,
"loss": 1.4612,
"step": 401
},
{
"epoch": 1.0923913043478262,
"grad_norm": 0.337618887424469,
"learning_rate": 7.2826383210684106e-06,
"loss": 1.3535,
"step": 402
},
{
"epoch": 1.0951086956521738,
"grad_norm": 0.3615802824497223,
"learning_rate": 7.278688524590163e-06,
"loss": 1.3879,
"step": 403
},
{
"epoch": 1.0978260869565217,
"grad_norm": 0.3457699418067932,
"learning_rate": 7.2747192549986296e-06,
"loss": 1.4415,
"step": 404
},
{
"epoch": 1.1005434782608696,
"grad_norm": 0.34804657101631165,
"learning_rate": 7.270730367929708e-06,
"loss": 1.3189,
"step": 405
},
{
"epoch": 1.1032608695652173,
"grad_norm": 0.3465460240840912,
"learning_rate": 7.266721717588769e-06,
"loss": 1.3687,
"step": 406
},
{
"epoch": 1.1059782608695652,
"grad_norm": 0.345615416765213,
"learning_rate": 7.262693156732891e-06,
"loss": 1.3165,
"step": 407
},
{
"epoch": 1.108695652173913,
"grad_norm": 0.34842392802238464,
"learning_rate": 7.258644536652834e-06,
"loss": 1.3238,
"step": 408
},
{
"epoch": 1.1114130434782608,
"grad_norm": 0.3517705798149109,
"learning_rate": 7.254575707154742e-06,
"loss": 1.3034,
"step": 409
},
{
"epoch": 1.1141304347826086,
"grad_norm": 0.3425719141960144,
"learning_rate": 7.2504865165415625e-06,
"loss": 1.4513,
"step": 410
},
{
"epoch": 1.1168478260869565,
"grad_norm": 0.34302225708961487,
"learning_rate": 7.2463768115942025e-06,
"loss": 1.3241,
"step": 411
},
{
"epoch": 1.1195652173913044,
"grad_norm": 0.36630138754844666,
"learning_rate": 7.242246437552389e-06,
"loss": 1.2878,
"step": 412
},
{
"epoch": 1.122282608695652,
"grad_norm": 0.35825982689857483,
"learning_rate": 7.238095238095238e-06,
"loss": 1.2899,
"step": 413
},
{
"epoch": 1.125,
"grad_norm": 0.3747299015522003,
"learning_rate": 7.2339230553215386e-06,
"loss": 1.3485,
"step": 414
},
{
"epoch": 1.127717391304348,
"grad_norm": 0.35318565368652344,
"learning_rate": 7.229729729729729e-06,
"loss": 1.3475,
"step": 415
},
{
"epoch": 1.1304347826086956,
"grad_norm": 0.3485441505908966,
"learning_rate": 7.225515100197572e-06,
"loss": 1.3389,
"step": 416
},
{
"epoch": 1.1331521739130435,
"grad_norm": 0.37150004506111145,
"learning_rate": 7.221279003961517e-06,
"loss": 1.4033,
"step": 417
},
{
"epoch": 1.1358695652173914,
"grad_norm": 0.34884974360466003,
"learning_rate": 7.217021276595745e-06,
"loss": 1.3228,
"step": 418
},
{
"epoch": 1.1385869565217392,
"grad_norm": 0.36258143186569214,
"learning_rate": 7.212741751990899e-06,
"loss": 1.4687,
"step": 419
},
{
"epoch": 1.141304347826087,
"grad_norm": 0.3562002182006836,
"learning_rate": 7.208440262332477e-06,
"loss": 1.4778,
"step": 420
},
{
"epoch": 1.1440217391304348,
"grad_norm": 0.35977649688720703,
"learning_rate": 7.204116638078902e-06,
"loss": 1.2789,
"step": 421
},
{
"epoch": 1.1467391304347827,
"grad_norm": 0.36146384477615356,
"learning_rate": 7.1997707079392365e-06,
"loss": 1.3834,
"step": 422
},
{
"epoch": 1.1494565217391304,
"grad_norm": 0.3516390323638916,
"learning_rate": 7.195402298850574e-06,
"loss": 1.3415,
"step": 423
},
{
"epoch": 1.1521739130434783,
"grad_norm": 0.33636316657066345,
"learning_rate": 7.1910112359550555e-06,
"loss": 1.37,
"step": 424
},
{
"epoch": 1.1548913043478262,
"grad_norm": 0.3520665168762207,
"learning_rate": 7.186597342576545e-06,
"loss": 1.4197,
"step": 425
},
{
"epoch": 1.1576086956521738,
"grad_norm": 0.3456652760505676,
"learning_rate": 7.18216044019693e-06,
"loss": 1.4093,
"step": 426
},
{
"epoch": 1.1603260869565217,
"grad_norm": 0.34344473481178284,
"learning_rate": 7.177700348432055e-06,
"loss": 1.3166,
"step": 427
},
{
"epoch": 1.1630434782608696,
"grad_norm": 0.3432557284832001,
"learning_rate": 7.173216885007277e-06,
"loss": 1.314,
"step": 428
},
{
"epoch": 1.1657608695652173,
"grad_norm": 0.3404994308948517,
"learning_rate": 7.168709865732632e-06,
"loss": 1.3291,
"step": 429
},
{
"epoch": 1.1684782608695652,
"grad_norm": 0.34031039476394653,
"learning_rate": 7.164179104477611e-06,
"loss": 1.3882,
"step": 430
},
{
"epoch": 1.171195652173913,
"grad_norm": 0.34020742774009705,
"learning_rate": 7.159624413145539e-06,
"loss": 1.3246,
"step": 431
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.34915924072265625,
"learning_rate": 7.155045601647543e-06,
"loss": 1.3882,
"step": 432
},
{
"epoch": 1.1766304347826086,
"grad_norm": 0.35569998621940613,
"learning_rate": 7.1504424778761065e-06,
"loss": 1.3184,
"step": 433
},
{
"epoch": 1.1793478260869565,
"grad_norm": 0.34406909346580505,
"learning_rate": 7.145814847678202e-06,
"loss": 1.3279,
"step": 434
},
{
"epoch": 1.1820652173913044,
"grad_norm": 0.3368189334869385,
"learning_rate": 7.1411625148279956e-06,
"loss": 1.3227,
"step": 435
},
{
"epoch": 1.184782608695652,
"grad_norm": 0.3536527454853058,
"learning_rate": 7.136485280999108e-06,
"loss": 1.3437,
"step": 436
},
{
"epoch": 1.1875,
"grad_norm": 0.34697532653808594,
"learning_rate": 7.131782945736434e-06,
"loss": 1.4258,
"step": 437
},
{
"epoch": 1.190217391304348,
"grad_norm": 0.3366316258907318,
"learning_rate": 7.127055306427504e-06,
"loss": 1.3611,
"step": 438
},
{
"epoch": 1.1929347826086956,
"grad_norm": 0.3574765622615814,
"learning_rate": 7.122302158273381e-06,
"loss": 1.307,
"step": 439
},
{
"epoch": 1.1956521739130435,
"grad_norm": 0.36839932203292847,
"learning_rate": 7.117523294259092e-06,
"loss": 1.3751,
"step": 440
},
{
"epoch": 1.1983695652173914,
"grad_norm": 0.3561205565929413,
"learning_rate": 7.112718505123568e-06,
"loss": 1.2903,
"step": 441
},
{
"epoch": 1.2010869565217392,
"grad_norm": 0.360176682472229,
"learning_rate": 7.107887579329101e-06,
"loss": 1.4551,
"step": 442
},
{
"epoch": 1.203804347826087,
"grad_norm": 0.36860206723213196,
"learning_rate": 7.1030303030303025e-06,
"loss": 1.3247,
"step": 443
},
{
"epoch": 1.2065217391304348,
"grad_norm": 0.35110506415367126,
"learning_rate": 7.09814646004254e-06,
"loss": 1.2791,
"step": 444
},
{
"epoch": 1.2092391304347827,
"grad_norm": 0.3497629761695862,
"learning_rate": 7.093235831809871e-06,
"loss": 1.4523,
"step": 445
},
{
"epoch": 1.2119565217391304,
"grad_norm": 0.5049267411231995,
"learning_rate": 7.088298197372441e-06,
"loss": 1.3368,
"step": 446
},
{
"epoch": 1.2146739130434783,
"grad_norm": 0.3517695367336273,
"learning_rate": 7.083333333333333e-06,
"loss": 1.4039,
"step": 447
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.3525075316429138,
"learning_rate": 7.078341013824884e-06,
"loss": 1.3692,
"step": 448
},
{
"epoch": 1.2201086956521738,
"grad_norm": 0.3429877758026123,
"learning_rate": 7.07332101047443e-06,
"loss": 1.3325,
"step": 449
},
{
"epoch": 1.2228260869565217,
"grad_norm": 0.3495529592037201,
"learning_rate": 7.068273092369477e-06,
"loss": 1.3013,
"step": 450
},
{
"epoch": 1.2255434782608696,
"grad_norm": 0.36226147413253784,
"learning_rate": 7.0631970260223045e-06,
"loss": 1.3787,
"step": 451
},
{
"epoch": 1.2282608695652173,
"grad_norm": 0.33679068088531494,
"learning_rate": 7.058092575333955e-06,
"loss": 1.366,
"step": 452
},
{
"epoch": 1.2309782608695652,
"grad_norm": 0.33339637517929077,
"learning_rate": 7.052959501557633e-06,
"loss": 1.2804,
"step": 453
},
{
"epoch": 1.233695652173913,
"grad_norm": 0.3441121280193329,
"learning_rate": 7.047797563261481e-06,
"loss": 1.3386,
"step": 454
},
{
"epoch": 1.2364130434782608,
"grad_norm": 0.36244091391563416,
"learning_rate": 7.042606516290728e-06,
"loss": 1.3221,
"step": 455
},
{
"epoch": 1.2391304347826086,
"grad_norm": 0.374056875705719,
"learning_rate": 7.037386113729187e-06,
"loss": 1.3576,
"step": 456
},
{
"epoch": 1.2418478260869565,
"grad_norm": 0.339832067489624,
"learning_rate": 7.032136105860114e-06,
"loss": 1.4454,
"step": 457
},
{
"epoch": 1.2445652173913044,
"grad_norm": 0.35934752225875854,
"learning_rate": 7.0268562401263825e-06,
"loss": 1.3476,
"step": 458
},
{
"epoch": 1.247282608695652,
"grad_norm": 0.35539355874061584,
"learning_rate": 7.0215462610899875e-06,
"loss": 1.2979,
"step": 459
},
{
"epoch": 1.25,
"grad_norm": 0.3446095287799835,
"learning_rate": 7.016205910390849e-06,
"loss": 1.3906,
"step": 460
},
{
"epoch": 1.252717391304348,
"grad_norm": 0.35285550355911255,
"learning_rate": 7.010834926704908e-06,
"loss": 1.3686,
"step": 461
},
{
"epoch": 1.2554347826086958,
"grad_norm": 0.3725644052028656,
"learning_rate": 7.005433045701502e-06,
"loss": 1.3869,
"step": 462
},
{
"epoch": 1.2581521739130435,
"grad_norm": 0.3460637032985687,
"learning_rate": 7e-06,
"loss": 1.3895,
"step": 463
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.3563375771045685,
"learning_rate": 6.994535519125682e-06,
"loss": 1.3394,
"step": 464
},
{
"epoch": 1.2635869565217392,
"grad_norm": 0.3403022289276123,
"learning_rate": 6.989039329464861e-06,
"loss": 1.3424,
"step": 465
},
{
"epoch": 1.266304347826087,
"grad_norm": 0.3511143922805786,
"learning_rate": 6.983511154219204e-06,
"loss": 1.415,
"step": 466
},
{
"epoch": 1.2690217391304348,
"grad_norm": 0.35280826687812805,
"learning_rate": 6.9779507133592734e-06,
"loss": 1.3912,
"step": 467
},
{
"epoch": 1.2717391304347827,
"grad_norm": 0.34647977352142334,
"learning_rate": 6.972357723577236e-06,
"loss": 1.2599,
"step": 468
},
{
"epoch": 1.2744565217391304,
"grad_norm": 0.3498916029930115,
"learning_rate": 6.966731898238747e-06,
"loss": 1.2458,
"step": 469
},
{
"epoch": 1.2771739130434783,
"grad_norm": 0.35044994950294495,
"learning_rate": 6.961072947333987e-06,
"loss": 1.3233,
"step": 470
},
{
"epoch": 1.2798913043478262,
"grad_norm": 0.34845730662345886,
"learning_rate": 6.955380577427821e-06,
"loss": 1.3276,
"step": 471
},
{
"epoch": 1.2826086956521738,
"grad_norm": 0.33805760741233826,
"learning_rate": 6.949654491609081e-06,
"loss": 1.3344,
"step": 472
},
{
"epoch": 1.2853260869565217,
"grad_norm": 0.3485923409461975,
"learning_rate": 6.943894389438942e-06,
"loss": 1.388,
"step": 473
},
{
"epoch": 1.2880434782608696,
"grad_norm": 0.34043747186660767,
"learning_rate": 6.9380999668983765e-06,
"loss": 1.375,
"step": 474
},
{
"epoch": 1.2907608695652173,
"grad_norm": 0.3383161723613739,
"learning_rate": 6.93227091633466e-06,
"loss": 1.3298,
"step": 475
},
{
"epoch": 1.2934782608695652,
"grad_norm": 0.35131675004959106,
"learning_rate": 6.926406926406925e-06,
"loss": 1.423,
"step": 476
},
{
"epoch": 1.296195652173913,
"grad_norm": 0.3680573105812073,
"learning_rate": 6.920507682030727e-06,
"loss": 1.3236,
"step": 477
},
{
"epoch": 1.2989130434782608,
"grad_norm": 0.3596939146518707,
"learning_rate": 6.914572864321606e-06,
"loss": 1.3664,
"step": 478
},
{
"epoch": 1.3016304347826086,
"grad_norm": 0.3593509793281555,
"learning_rate": 6.908602150537633e-06,
"loss": 1.3124,
"step": 479
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.33961907029151917,
"learning_rate": 6.902595214020896e-06,
"loss": 1.3484,
"step": 480
},
{
"epoch": 1.3070652173913042,
"grad_norm": 0.3712293207645416,
"learning_rate": 6.89655172413793e-06,
"loss": 1.3233,
"step": 481
},
{
"epoch": 1.309782608695652,
"grad_norm": 0.3482085168361664,
"learning_rate": 6.890471346219057e-06,
"loss": 1.3653,
"step": 482
},
{
"epoch": 1.3125,
"grad_norm": 0.3515682816505432,
"learning_rate": 6.884353741496598e-06,
"loss": 1.3987,
"step": 483
},
{
"epoch": 1.315217391304348,
"grad_norm": 0.35586169362068176,
"learning_rate": 6.878198567041965e-06,
"loss": 1.3217,
"step": 484
},
{
"epoch": 1.3179347826086958,
"grad_norm": 0.34130343794822693,
"learning_rate": 6.872005475701574e-06,
"loss": 1.3132,
"step": 485
},
{
"epoch": 1.3206521739130435,
"grad_norm": 0.39066967368125916,
"learning_rate": 6.8657741160315826e-06,
"loss": 1.3306,
"step": 486
},
{
"epoch": 1.3233695652173914,
"grad_norm": 0.35300007462501526,
"learning_rate": 6.859504132231405e-06,
"loss": 1.3755,
"step": 487
},
{
"epoch": 1.3260869565217392,
"grad_norm": 0.3553847372531891,
"learning_rate": 6.853195164075993e-06,
"loss": 1.3317,
"step": 488
},
{
"epoch": 1.328804347826087,
"grad_norm": 0.3760699927806854,
"learning_rate": 6.846846846846847e-06,
"loss": 1.4368,
"step": 489
},
{
"epoch": 1.3315217391304348,
"grad_norm": 0.3570723533630371,
"learning_rate": 6.84045881126173e-06,
"loss": 1.4,
"step": 490
},
{
"epoch": 1.3342391304347827,
"grad_norm": 0.3696223497390747,
"learning_rate": 6.834030683403068e-06,
"loss": 1.3349,
"step": 491
},
{
"epoch": 1.3369565217391304,
"grad_norm": 0.37452593445777893,
"learning_rate": 6.827562084644981e-06,
"loss": 1.2458,
"step": 492
},
{
"epoch": 1.3396739130434783,
"grad_norm": 0.3435407280921936,
"learning_rate": 6.821052631578946e-06,
"loss": 1.3624,
"step": 493
},
{
"epoch": 1.3423913043478262,
"grad_norm": 0.3696284592151642,
"learning_rate": 6.814501935938049e-06,
"loss": 1.3795,
"step": 494
},
{
"epoch": 1.3451086956521738,
"grad_norm": 0.35256364941596985,
"learning_rate": 6.807909604519773e-06,
"loss": 1.3196,
"step": 495
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.34246110916137695,
"learning_rate": 6.801275239107331e-06,
"loss": 1.4291,
"step": 496
},
{
"epoch": 1.3505434782608696,
"grad_norm": 0.3617890775203705,
"learning_rate": 6.79459843638948e-06,
"loss": 1.3208,
"step": 497
},
{
"epoch": 1.3532608695652173,
"grad_norm": 0.35320863127708435,
"learning_rate": 6.787878787878787e-06,
"loss": 1.3272,
"step": 498
},
{
"epoch": 1.3559782608695652,
"grad_norm": 0.3528405427932739,
"learning_rate": 6.781115879828325e-06,
"loss": 1.3978,
"step": 499
},
{
"epoch": 1.358695652173913,
"grad_norm": 0.3528098165988922,
"learning_rate": 6.774309293146751e-06,
"loss": 1.3747,
"step": 500
},
{
"epoch": 1.3614130434782608,
"grad_norm": 0.345384806394577,
"learning_rate": 6.767458603311734e-06,
"loss": 1.3925,
"step": 501
},
{
"epoch": 1.3641304347826086,
"grad_norm": 0.34573063254356384,
"learning_rate": 6.760563380281689e-06,
"loss": 1.383,
"step": 502
},
{
"epoch": 1.3668478260869565,
"grad_norm": 0.3485819697380066,
"learning_rate": 6.753623188405796e-06,
"loss": 1.3945,
"step": 503
},
{
"epoch": 1.3695652173913042,
"grad_norm": 0.35015052556991577,
"learning_rate": 6.746637586332242e-06,
"loss": 1.323,
"step": 504
},
{
"epoch": 1.372282608695652,
"grad_norm": 0.3457247316837311,
"learning_rate": 6.739606126914661e-06,
"loss": 1.2701,
"step": 505
},
{
"epoch": 1.375,
"grad_norm": 0.3384177088737488,
"learning_rate": 6.732528357116721e-06,
"loss": 1.2737,
"step": 506
},
{
"epoch": 1.377717391304348,
"grad_norm": 0.3327966034412384,
"learning_rate": 6.725403817914831e-06,
"loss": 1.3766,
"step": 507
},
{
"epoch": 1.3804347826086958,
"grad_norm": 0.3439366817474365,
"learning_rate": 6.718232044198895e-06,
"loss": 1.4009,
"step": 508
},
{
"epoch": 1.3831521739130435,
"grad_norm": 0.35546526312828064,
"learning_rate": 6.711012564671101e-06,
"loss": 1.374,
"step": 509
},
{
"epoch": 1.3858695652173914,
"grad_norm": 0.3421470522880554,
"learning_rate": 6.703744901742677e-06,
"loss": 1.3559,
"step": 510
},
{
"epoch": 1.3885869565217392,
"grad_norm": 0.334219366312027,
"learning_rate": 6.696428571428572e-06,
"loss": 1.3361,
"step": 511
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.33939287066459656,
"learning_rate": 6.6890630832400145e-06,
"loss": 1.3162,
"step": 512
},
{
"epoch": 1.3940217391304348,
"grad_norm": 0.35583412647247314,
"learning_rate": 6.681647940074906e-06,
"loss": 1.4194,
"step": 513
},
{
"epoch": 1.3967391304347827,
"grad_norm": 0.3360309600830078,
"learning_rate": 6.674182638105974e-06,
"loss": 1.3667,
"step": 514
},
{
"epoch": 1.3994565217391304,
"grad_norm": 0.34478163719177246,
"learning_rate": 6.666666666666666e-06,
"loss": 1.4185,
"step": 515
},
{
"epoch": 1.4021739130434783,
"grad_norm": 0.3429558277130127,
"learning_rate": 6.659099508134694e-06,
"loss": 1.2838,
"step": 516
},
{
"epoch": 1.4048913043478262,
"grad_norm": 0.373536080121994,
"learning_rate": 6.6514806378132115e-06,
"loss": 1.3696,
"step": 517
},
{
"epoch": 1.4076086956521738,
"grad_norm": 0.34450167417526245,
"learning_rate": 6.643809523809523e-06,
"loss": 1.3697,
"step": 518
},
{
"epoch": 1.4103260869565217,
"grad_norm": 0.3542488217353821,
"learning_rate": 6.636085626911314e-06,
"loss": 1.3584,
"step": 519
},
{
"epoch": 1.4130434782608696,
"grad_norm": 0.3472454845905304,
"learning_rate": 6.628308400460298e-06,
"loss": 1.3639,
"step": 520
},
{
"epoch": 1.4157608695652173,
"grad_norm": 0.36916840076446533,
"learning_rate": 6.620477290223248e-06,
"loss": 1.3159,
"step": 521
},
{
"epoch": 1.4184782608695652,
"grad_norm": 0.3357577323913574,
"learning_rate": 6.612591734260331e-06,
"loss": 1.2943,
"step": 522
},
{
"epoch": 1.421195652173913,
"grad_norm": 0.3517664074897766,
"learning_rate": 6.604651162790697e-06,
"loss": 1.3014,
"step": 523
},
{
"epoch": 1.4239130434782608,
"grad_norm": 0.3861806094646454,
"learning_rate": 6.596654998055231e-06,
"loss": 1.3575,
"step": 524
},
{
"epoch": 1.4266304347826086,
"grad_norm": 0.347937673330307,
"learning_rate": 6.588602654176425e-06,
"loss": 1.4401,
"step": 525
},
{
"epoch": 1.4293478260869565,
"grad_norm": 0.34917065501213074,
"learning_rate": 6.580493537015276e-06,
"loss": 1.3305,
"step": 526
},
{
"epoch": 1.4320652173913042,
"grad_norm": 0.35585877299308777,
"learning_rate": 6.572327044025157e-06,
"loss": 1.3503,
"step": 527
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.36193299293518066,
"learning_rate": 6.564102564102565e-06,
"loss": 1.4254,
"step": 528
},
{
"epoch": 1.4375,
"grad_norm": 0.3477296233177185,
"learning_rate": 6.55581947743468e-06,
"loss": 1.3582,
"step": 529
},
{
"epoch": 1.440217391304348,
"grad_norm": 0.348344624042511,
"learning_rate": 6.547477155343664e-06,
"loss": 1.3552,
"step": 530
},
{
"epoch": 1.4429347826086958,
"grad_norm": 0.34499526023864746,
"learning_rate": 6.5390749601275915e-06,
"loss": 1.3497,
"step": 531
},
{
"epoch": 1.4456521739130435,
"grad_norm": 0.3457295000553131,
"learning_rate": 6.530612244897959e-06,
"loss": 1.3276,
"step": 532
},
{
"epoch": 1.4483695652173914,
"grad_norm": 0.37890803813934326,
"learning_rate": 6.522088353413655e-06,
"loss": 1.3666,
"step": 533
},
{
"epoch": 1.4510869565217392,
"grad_norm": 0.36650505661964417,
"learning_rate": 6.513502619911325e-06,
"loss": 1.3268,
"step": 534
},
{
"epoch": 1.453804347826087,
"grad_norm": 0.3550858497619629,
"learning_rate": 6.5048543689320385e-06,
"loss": 1.3665,
"step": 535
},
{
"epoch": 1.4565217391304348,
"grad_norm": 0.35670801997184753,
"learning_rate": 6.496142915144132e-06,
"loss": 1.3853,
"step": 536
},
{
"epoch": 1.4592391304347827,
"grad_norm": 0.34603556990623474,
"learning_rate": 6.487367563162183e-06,
"loss": 1.3374,
"step": 537
},
{
"epoch": 1.4619565217391304,
"grad_norm": 0.343791663646698,
"learning_rate": 6.478527607361963e-06,
"loss": 1.3811,
"step": 538
},
{
"epoch": 1.4646739130434783,
"grad_norm": 0.3415122926235199,
"learning_rate": 6.469622331691296e-06,
"loss": 1.3357,
"step": 539
},
{
"epoch": 1.4673913043478262,
"grad_norm": 0.3270445168018341,
"learning_rate": 6.460651009476719e-06,
"loss": 1.3408,
"step": 540
},
{
"epoch": 1.4701086956521738,
"grad_norm": 0.3364504277706146,
"learning_rate": 6.4516129032258055e-06,
"loss": 1.3302,
"step": 541
},
{
"epoch": 1.4728260869565217,
"grad_norm": 0.3475323021411896,
"learning_rate": 6.4425072644250714e-06,
"loss": 1.3357,
"step": 542
},
{
"epoch": 1.4755434782608696,
"grad_norm": 0.34420526027679443,
"learning_rate": 6.4333333333333324e-06,
"loss": 1.3509,
"step": 543
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.3597885072231293,
"learning_rate": 6.424090338770388e-06,
"loss": 1.3475,
"step": 544
},
{
"epoch": 1.4809782608695652,
"grad_norm": 0.3500368893146515,
"learning_rate": 6.414777497900924e-06,
"loss": 1.3867,
"step": 545
},
{
"epoch": 1.483695652173913,
"grad_norm": 0.3487471640110016,
"learning_rate": 6.405394016013486e-06,
"loss": 1.3833,
"step": 546
},
{
"epoch": 1.4864130434782608,
"grad_norm": 0.3366873562335968,
"learning_rate": 6.395939086294416e-06,
"loss": 1.3299,
"step": 547
},
{
"epoch": 1.4891304347826086,
"grad_norm": 0.355654239654541,
"learning_rate": 6.386411889596603e-06,
"loss": 1.3747,
"step": 548
},
{
"epoch": 1.4918478260869565,
"grad_norm": 0.35797804594039917,
"learning_rate": 6.376811594202899e-06,
"loss": 1.3601,
"step": 549
},
{
"epoch": 1.4945652173913042,
"grad_norm": 0.34977495670318604,
"learning_rate": 6.367137355584082e-06,
"loss": 1.2506,
"step": 550
},
{
"epoch": 1.497282608695652,
"grad_norm": 0.3429720401763916,
"learning_rate": 6.357388316151203e-06,
"loss": 1.336,
"step": 551
},
{
"epoch": 1.5,
"grad_norm": 0.343697726726532,
"learning_rate": 6.347563605002156e-06,
"loss": 1.3987,
"step": 552
},
{
"epoch": 1.5027173913043477,
"grad_norm": 0.36152300238609314,
"learning_rate": 6.337662337662337e-06,
"loss": 1.345,
"step": 553
},
{
"epoch": 1.5054347826086958,
"grad_norm": 0.35157766938209534,
"learning_rate": 6.327683615819208e-06,
"loss": 1.3364,
"step": 554
},
{
"epoch": 1.5081521739130435,
"grad_norm": 0.3413873314857483,
"learning_rate": 6.31762652705061e-06,
"loss": 1.3341,
"step": 555
},
{
"epoch": 1.5108695652173914,
"grad_norm": 0.35999447107315063,
"learning_rate": 6.307490144546649e-06,
"loss": 1.3927,
"step": 556
},
{
"epoch": 1.5135869565217392,
"grad_norm": 0.35518884658813477,
"learning_rate": 6.297273526824978e-06,
"loss": 1.3702,
"step": 557
},
{
"epoch": 1.516304347826087,
"grad_norm": 0.3421798348426819,
"learning_rate": 6.2869757174392925e-06,
"loss": 1.428,
"step": 558
},
{
"epoch": 1.5190217391304348,
"grad_norm": 0.36976388096809387,
"learning_rate": 6.27659574468085e-06,
"loss": 1.3784,
"step": 559
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.35902440547943115,
"learning_rate": 6.266132621272808e-06,
"loss": 1.327,
"step": 560
},
{
"epoch": 1.5244565217391304,
"grad_norm": 0.360893577337265,
"learning_rate": 6.2555853440571934e-06,
"loss": 1.316,
"step": 561
},
{
"epoch": 1.5271739130434783,
"grad_norm": 0.35077449679374695,
"learning_rate": 6.244952893674293e-06,
"loss": 1.2877,
"step": 562
},
{
"epoch": 1.5298913043478262,
"grad_norm": 0.3528415262699127,
"learning_rate": 6.234234234234233e-06,
"loss": 1.2806,
"step": 563
},
{
"epoch": 1.5326086956521738,
"grad_norm": 0.3422030508518219,
"learning_rate": 6.223428312980551e-06,
"loss": 1.3473,
"step": 564
},
{
"epoch": 1.5353260869565217,
"grad_norm": 0.3537536859512329,
"learning_rate": 6.212534059945504e-06,
"loss": 1.4186,
"step": 565
},
{
"epoch": 1.5380434782608696,
"grad_norm": 0.3570748269557953,
"learning_rate": 6.201550387596899e-06,
"loss": 1.3426,
"step": 566
},
{
"epoch": 1.5407608695652173,
"grad_norm": 0.346245139837265,
"learning_rate": 6.19047619047619e-06,
"loss": 1.3228,
"step": 567
},
{
"epoch": 1.5434782608695652,
"grad_norm": 0.35338735580444336,
"learning_rate": 6.179310344827586e-06,
"loss": 1.4134,
"step": 568
},
{
"epoch": 1.546195652173913,
"grad_norm": 0.3490867614746094,
"learning_rate": 6.168051708217913e-06,
"loss": 1.3663,
"step": 569
},
{
"epoch": 1.5489130434782608,
"grad_norm": 0.349432110786438,
"learning_rate": 6.1566991191469625e-06,
"loss": 1.2371,
"step": 570
},
{
"epoch": 1.5516304347826086,
"grad_norm": 0.3474854528903961,
"learning_rate": 6.145251396648044e-06,
"loss": 1.3437,
"step": 571
},
{
"epoch": 1.5543478260869565,
"grad_norm": 0.3465191125869751,
"learning_rate": 6.133707339878447e-06,
"loss": 1.3227,
"step": 572
},
{
"epoch": 1.5570652173913042,
"grad_norm": 0.3537481725215912,
"learning_rate": 6.122065727699531e-06,
"loss": 1.3241,
"step": 573
},
{
"epoch": 1.5597826086956523,
"grad_norm": 0.3460562825202942,
"learning_rate": 6.110325318246111e-06,
"loss": 1.2951,
"step": 574
},
{
"epoch": 1.5625,
"grad_norm": 0.3643260896205902,
"learning_rate": 6.098484848484849e-06,
"loss": 1.3374,
"step": 575
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.36155954003334045,
"learning_rate": 6.0865430337612935e-06,
"loss": 1.5067,
"step": 576
},
{
"epoch": 1.5679347826086958,
"grad_norm": 0.37407439947128296,
"learning_rate": 6.074498567335243e-06,
"loss": 1.4037,
"step": 577
},
{
"epoch": 1.5706521739130435,
"grad_norm": 0.3926370143890381,
"learning_rate": 6.062350119904077e-06,
"loss": 1.356,
"step": 578
},
{
"epoch": 1.5733695652173914,
"grad_norm": 0.3557644486427307,
"learning_rate": 6.05009633911368e-06,
"loss": 1.3288,
"step": 579
},
{
"epoch": 1.5760869565217392,
"grad_norm": 0.36532676219940186,
"learning_rate": 6.037735849056603e-06,
"loss": 1.3494,
"step": 580
},
{
"epoch": 1.578804347826087,
"grad_norm": 0.36569759249687195,
"learning_rate": 6.025267249757046e-06,
"loss": 1.3887,
"step": 581
},
{
"epoch": 1.5815217391304348,
"grad_norm": 0.3753635883331299,
"learning_rate": 6.012689116642264e-06,
"loss": 1.3738,
"step": 582
},
{
"epoch": 1.5842391304347827,
"grad_norm": 0.34146779775619507,
"learning_rate": 6e-06,
"loss": 1.3313,
"step": 583
},
{
"epoch": 1.5869565217391304,
"grad_norm": 0.32988572120666504,
"learning_rate": 5.987198424421466e-06,
"loss": 1.347,
"step": 584
},
{
"epoch": 1.5896739130434783,
"grad_norm": 0.3355376124382019,
"learning_rate": 5.974282888229475e-06,
"loss": 1.3562,
"step": 585
},
{
"epoch": 1.5923913043478262,
"grad_norm": 0.3790322542190552,
"learning_rate": 5.961251862891206e-06,
"loss": 1.3789,
"step": 586
},
{
"epoch": 1.5951086956521738,
"grad_norm": 0.34783735871315,
"learning_rate": 5.948103792415168e-06,
"loss": 1.307,
"step": 587
},
{
"epoch": 1.5978260869565217,
"grad_norm": 0.34875112771987915,
"learning_rate": 5.934837092731829e-06,
"loss": 1.3205,
"step": 588
},
{
"epoch": 1.6005434782608696,
"grad_norm": 0.3391176462173462,
"learning_rate": 5.921450151057401e-06,
"loss": 1.3438,
"step": 589
},
{
"epoch": 1.6032608695652173,
"grad_norm": 0.35487234592437744,
"learning_rate": 5.907941325240263e-06,
"loss": 1.4358,
"step": 590
},
{
"epoch": 1.6059782608695652,
"grad_norm": 0.3468010127544403,
"learning_rate": 5.894308943089431e-06,
"loss": 1.2499,
"step": 591
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.34538012742996216,
"learning_rate": 5.8805513016845326e-06,
"loss": 1.3331,
"step": 592
},
{
"epoch": 1.6114130434782608,
"grad_norm": 0.34510117769241333,
"learning_rate": 5.866666666666667e-06,
"loss": 1.4319,
"step": 593
},
{
"epoch": 1.6141304347826086,
"grad_norm": 0.3392553925514221,
"learning_rate": 5.852653271509531e-06,
"loss": 1.4242,
"step": 594
},
{
"epoch": 1.6168478260869565,
"grad_norm": 0.344718873500824,
"learning_rate": 5.838509316770187e-06,
"loss": 1.3883,
"step": 595
},
{
"epoch": 1.6195652173913042,
"grad_norm": 0.348323792219162,
"learning_rate": 5.824232969318773e-06,
"loss": 1.3739,
"step": 596
},
{
"epoch": 1.6222826086956523,
"grad_norm": 0.3823015093803406,
"learning_rate": 5.8098223615465e-06,
"loss": 1.3831,
"step": 597
},
{
"epoch": 1.625,
"grad_norm": 0.34378868341445923,
"learning_rate": 5.795275590551181e-06,
"loss": 1.2974,
"step": 598
},
{
"epoch": 1.6277173913043477,
"grad_norm": 0.35214224457740784,
"learning_rate": 5.780590717299578e-06,
"loss": 1.3613,
"step": 599
},
{
"epoch": 1.6304347826086958,
"grad_norm": 0.3451964557170868,
"learning_rate": 5.765765765765765e-06,
"loss": 1.319,
"step": 600
},
{
"epoch": 1.6331521739130435,
"grad_norm": 0.32084909081459045,
"learning_rate": 5.750798722044728e-06,
"loss": 1.3498,
"step": 601
},
{
"epoch": 1.6358695652173914,
"grad_norm": 0.3332853615283966,
"learning_rate": 5.735687533440342e-06,
"loss": 1.284,
"step": 602
},
{
"epoch": 1.6385869565217392,
"grad_norm": 0.34017214179039,
"learning_rate": 5.720430107526882e-06,
"loss": 1.2987,
"step": 603
},
{
"epoch": 1.641304347826087,
"grad_norm": 0.3513610064983368,
"learning_rate": 5.705024311183144e-06,
"loss": 1.3854,
"step": 604
},
{
"epoch": 1.6440217391304348,
"grad_norm": 0.3369273841381073,
"learning_rate": 5.689467969598262e-06,
"loss": 1.2769,
"step": 605
},
{
"epoch": 1.6467391304347827,
"grad_norm": 0.3546997904777527,
"learning_rate": 5.673758865248226e-06,
"loss": 1.423,
"step": 606
},
{
"epoch": 1.6494565217391304,
"grad_norm": 0.3369233310222626,
"learning_rate": 5.657894736842105e-06,
"loss": 1.3771,
"step": 607
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.35690680146217346,
"learning_rate": 5.641873278236913e-06,
"loss": 1.4428,
"step": 608
},
{
"epoch": 1.6548913043478262,
"grad_norm": 0.35253193974494934,
"learning_rate": 5.625692137320044e-06,
"loss": 1.3207,
"step": 609
},
{
"epoch": 1.6576086956521738,
"grad_norm": 0.3626672923564911,
"learning_rate": 5.609348914858095e-06,
"loss": 1.3305,
"step": 610
},
{
"epoch": 1.6603260869565217,
"grad_norm": 0.3617280423641205,
"learning_rate": 5.592841163310961e-06,
"loss": 1.3925,
"step": 611
},
{
"epoch": 1.6630434782608696,
"grad_norm": 0.3400178551673889,
"learning_rate": 5.576166385609892e-06,
"loss": 1.3414,
"step": 612
},
{
"epoch": 1.6657608695652173,
"grad_norm": 0.3407033085823059,
"learning_rate": 5.559322033898304e-06,
"loss": 1.3731,
"step": 613
},
{
"epoch": 1.6684782608695652,
"grad_norm": 0.34921663999557495,
"learning_rate": 5.542305508233957e-06,
"loss": 1.3955,
"step": 614
},
{
"epoch": 1.671195652173913,
"grad_norm": 0.3465006649494171,
"learning_rate": 5.525114155251141e-06,
"loss": 1.3132,
"step": 615
},
{
"epoch": 1.6739130434782608,
"grad_norm": 0.35166648030281067,
"learning_rate": 5.507745266781411e-06,
"loss": 1.3554,
"step": 616
},
{
"epoch": 1.6766304347826086,
"grad_norm": 0.3840665817260742,
"learning_rate": 5.490196078431373e-06,
"loss": 1.2921,
"step": 617
},
{
"epoch": 1.6793478260869565,
"grad_norm": 0.3391146957874298,
"learning_rate": 5.4724637681159414e-06,
"loss": 1.3488,
"step": 618
},
{
"epoch": 1.6820652173913042,
"grad_norm": 0.3500737249851227,
"learning_rate": 5.454545454545454e-06,
"loss": 1.3308,
"step": 619
},
{
"epoch": 1.6847826086956523,
"grad_norm": 0.36856669187545776,
"learning_rate": 5.436438195664909e-06,
"loss": 1.2802,
"step": 620
},
{
"epoch": 1.6875,
"grad_norm": 0.3350989818572998,
"learning_rate": 5.418138987043581e-06,
"loss": 1.336,
"step": 621
},
{
"epoch": 1.6902173913043477,
"grad_norm": 0.35188931226730347,
"learning_rate": 5.399644760213144e-06,
"loss": 1.3416,
"step": 622
},
{
"epoch": 1.6929347826086958,
"grad_norm": 0.3366836607456207,
"learning_rate": 5.380952380952381e-06,
"loss": 1.3587,
"step": 623
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.3324086368083954,
"learning_rate": 5.3620586475164575e-06,
"loss": 1.3623,
"step": 624
},
{
"epoch": 1.6983695652173914,
"grad_norm": 0.3365149199962616,
"learning_rate": 5.3429602888086635e-06,
"loss": 1.3576,
"step": 625
},
{
"epoch": 1.7010869565217392,
"grad_norm": 0.3542598783969879,
"learning_rate": 5.323653962492438e-06,
"loss": 1.3239,
"step": 626
},
{
"epoch": 1.703804347826087,
"grad_norm": 0.35152384638786316,
"learning_rate": 5.304136253041362e-06,
"loss": 1.3395,
"step": 627
},
{
"epoch": 1.7065217391304348,
"grad_norm": 0.345515638589859,
"learning_rate": 5.284403669724771e-06,
"loss": 1.4407,
"step": 628
},
{
"epoch": 1.7092391304347827,
"grad_norm": 0.333773672580719,
"learning_rate": 5.2644526445264444e-06,
"loss": 1.3048,
"step": 629
},
{
"epoch": 1.7119565217391304,
"grad_norm": 0.3490050137042999,
"learning_rate": 5.244279529993816e-06,
"loss": 1.4172,
"step": 630
},
{
"epoch": 1.7146739130434783,
"grad_norm": 0.3542667031288147,
"learning_rate": 5.223880597014925e-06,
"loss": 1.3624,
"step": 631
},
{
"epoch": 1.7173913043478262,
"grad_norm": 0.36096838116645813,
"learning_rate": 5.203252032520325e-06,
"loss": 1.2933,
"step": 632
},
{
"epoch": 1.7201086956521738,
"grad_norm": 0.3591996133327484,
"learning_rate": 5.1823899371069175e-06,
"loss": 1.3022,
"step": 633
},
{
"epoch": 1.7228260869565217,
"grad_norm": 0.33392882347106934,
"learning_rate": 5.161290322580645e-06,
"loss": 1.292,
"step": 634
},
{
"epoch": 1.7255434782608696,
"grad_norm": 0.37061673402786255,
"learning_rate": 5.139949109414757e-06,
"loss": 1.4192,
"step": 635
},
{
"epoch": 1.7282608695652173,
"grad_norm": 0.3426995277404785,
"learning_rate": 5.1183621241202815e-06,
"loss": 1.3483,
"step": 636
},
{
"epoch": 1.7309782608695652,
"grad_norm": 0.33502069115638733,
"learning_rate": 5.096525096525096e-06,
"loss": 1.4262,
"step": 637
},
{
"epoch": 1.733695652173913,
"grad_norm": 0.34400445222854614,
"learning_rate": 5.074433656957928e-06,
"loss": 1.3643,
"step": 638
},
{
"epoch": 1.7364130434782608,
"grad_norm": 0.32812371850013733,
"learning_rate": 5.052083333333333e-06,
"loss": 1.4587,
"step": 639
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.3361847996711731,
"learning_rate": 5.029469548133595e-06,
"loss": 1.3439,
"step": 640
},
{
"epoch": 1.7418478260869565,
"grad_norm": 0.34841278195381165,
"learning_rate": 5.0065876152832674e-06,
"loss": 1.3689,
"step": 641
},
{
"epoch": 1.7445652173913042,
"grad_norm": 0.33427974581718445,
"learning_rate": 4.983432736911861e-06,
"loss": 1.2804,
"step": 642
},
{
"epoch": 1.7472826086956523,
"grad_norm": 0.32697078585624695,
"learning_rate": 4.96e-06,
"loss": 1.3019,
"step": 643
},
{
"epoch": 1.75,
"grad_norm": 0.352584570646286,
"learning_rate": 4.93628437290409e-06,
"loss": 1.2847,
"step": 644
},
{
"epoch": 1.7527173913043477,
"grad_norm": 0.3434447944164276,
"learning_rate": 4.912280701754385e-06,
"loss": 1.3631,
"step": 645
},
{
"epoch": 1.7554347826086958,
"grad_norm": 0.3368462920188904,
"learning_rate": 4.887983706720978e-06,
"loss": 1.4064,
"step": 646
},
{
"epoch": 1.7581521739130435,
"grad_norm": 0.33226725459098816,
"learning_rate": 4.8633879781420755e-06,
"loss": 1.3605,
"step": 647
},
{
"epoch": 1.7608695652173914,
"grad_norm": 0.34347623586654663,
"learning_rate": 4.83848797250859e-06,
"loss": 1.3918,
"step": 648
},
{
"epoch": 1.7635869565217392,
"grad_norm": 0.346498966217041,
"learning_rate": 4.8132780082987544e-06,
"loss": 1.3772,
"step": 649
},
{
"epoch": 1.766304347826087,
"grad_norm": 0.3317979574203491,
"learning_rate": 4.787752261656228e-06,
"loss": 1.3775,
"step": 650
},
{
"epoch": 1.7690217391304348,
"grad_norm": 0.36047762632369995,
"learning_rate": 4.761904761904761e-06,
"loss": 1.3881,
"step": 651
},
{
"epoch": 1.7717391304347827,
"grad_norm": 0.3384213447570801,
"learning_rate": 4.735729386892177e-06,
"loss": 1.3693,
"step": 652
},
{
"epoch": 1.7744565217391304,
"grad_norm": 0.33621373772621155,
"learning_rate": 4.709219858156028e-06,
"loss": 1.4111,
"step": 653
},
{
"epoch": 1.7771739130434783,
"grad_norm": 0.33636173605918884,
"learning_rate": 4.6823697359029255e-06,
"loss": 1.3144,
"step": 654
},
{
"epoch": 1.7798913043478262,
"grad_norm": 0.33351048827171326,
"learning_rate": 4.655172413793103e-06,
"loss": 1.349,
"step": 655
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.32726943492889404,
"learning_rate": 4.62762111352133e-06,
"loss": 1.3875,
"step": 656
},
{
"epoch": 1.7853260869565217,
"grad_norm": 0.3338076174259186,
"learning_rate": 4.5997088791848614e-06,
"loss": 1.3458,
"step": 657
},
{
"epoch": 1.7880434782608696,
"grad_norm": 0.3343966603279114,
"learning_rate": 4.571428571428571e-06,
"loss": 1.3314,
"step": 658
},
{
"epoch": 1.7907608695652173,
"grad_norm": 0.33754849433898926,
"learning_rate": 4.5427728613569326e-06,
"loss": 1.3338,
"step": 659
},
{
"epoch": 1.7934782608695652,
"grad_norm": 0.34028756618499756,
"learning_rate": 4.51373422420193e-06,
"loss": 1.3775,
"step": 660
},
{
"epoch": 1.796195652173913,
"grad_norm": 0.3283347189426422,
"learning_rate": 4.4843049327354254e-06,
"loss": 1.3437,
"step": 661
},
{
"epoch": 1.7989130434782608,
"grad_norm": 0.35330507159233093,
"learning_rate": 4.454477050413845e-06,
"loss": 1.3844,
"step": 662
},
{
"epoch": 1.8016304347826086,
"grad_norm": 0.33189696073532104,
"learning_rate": 4.424242424242424e-06,
"loss": 1.3126,
"step": 663
},
{
"epoch": 1.8043478260869565,
"grad_norm": 0.3308519721031189,
"learning_rate": 4.393592677345538e-06,
"loss": 1.3042,
"step": 664
},
{
"epoch": 1.8070652173913042,
"grad_norm": 0.3336990177631378,
"learning_rate": 4.362519201228878e-06,
"loss": 1.4633,
"step": 665
},
{
"epoch": 1.8097826086956523,
"grad_norm": 0.33930081129074097,
"learning_rate": 4.331013147718484e-06,
"loss": 1.3027,
"step": 666
},
{
"epoch": 1.8125,
"grad_norm": 0.33651721477508545,
"learning_rate": 4.299065420560747e-06,
"loss": 1.4218,
"step": 667
},
{
"epoch": 1.8152173913043477,
"grad_norm": 0.3540966808795929,
"learning_rate": 4.266666666666667e-06,
"loss": 1.3684,
"step": 668
},
{
"epoch": 1.8179347826086958,
"grad_norm": 0.3553326427936554,
"learning_rate": 4.2338072669826225e-06,
"loss": 1.343,
"step": 669
},
{
"epoch": 1.8206521739130435,
"grad_norm": 0.3493621349334717,
"learning_rate": 4.2004773269689735e-06,
"loss": 1.3468,
"step": 670
},
{
"epoch": 1.8233695652173914,
"grad_norm": 0.3427792191505432,
"learning_rate": 4.166666666666667e-06,
"loss": 1.3775,
"step": 671
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.3357887864112854,
"learning_rate": 4.1323648103309125e-06,
"loss": 1.4103,
"step": 672
},
{
"epoch": 1.828804347826087,
"grad_norm": 0.3339937627315521,
"learning_rate": 4.097560975609756e-06,
"loss": 1.2352,
"step": 673
},
{
"epoch": 1.8315217391304348,
"grad_norm": 0.33390963077545166,
"learning_rate": 4.062244062244062e-06,
"loss": 1.3348,
"step": 674
},
{
"epoch": 1.8342391304347827,
"grad_norm": 0.33141598105430603,
"learning_rate": 4.0264026402640265e-06,
"loss": 1.4008,
"step": 675
},
{
"epoch": 1.8369565217391304,
"grad_norm": 0.3481482267379761,
"learning_rate": 3.990024937655859e-06,
"loss": 1.4426,
"step": 676
},
{
"epoch": 1.8396739130434783,
"grad_norm": 0.3348937928676605,
"learning_rate": 3.9530988274706864e-06,
"loss": 1.3531,
"step": 677
},
{
"epoch": 1.8423913043478262,
"grad_norm": 0.34614527225494385,
"learning_rate": 3.915611814345991e-06,
"loss": 1.4149,
"step": 678
},
{
"epoch": 1.8451086956521738,
"grad_norm": 0.34718191623687744,
"learning_rate": 3.877551020408163e-06,
"loss": 1.3121,
"step": 679
},
{
"epoch": 1.8478260869565217,
"grad_norm": 0.3331008553504944,
"learning_rate": 3.8389031705227075e-06,
"loss": 1.3773,
"step": 680
},
{
"epoch": 1.8505434782608696,
"grad_norm": 0.3339367210865021,
"learning_rate": 3.799654576856649e-06,
"loss": 1.3741,
"step": 681
},
{
"epoch": 1.8532608695652173,
"grad_norm": 0.34059810638427734,
"learning_rate": 3.7597911227154046e-06,
"loss": 1.4186,
"step": 682
},
{
"epoch": 1.8559782608695652,
"grad_norm": 0.34646710753440857,
"learning_rate": 3.7192982456140345e-06,
"loss": 1.3247,
"step": 683
},
{
"epoch": 1.858695652173913,
"grad_norm": 0.33644041419029236,
"learning_rate": 3.6781609195402296e-06,
"loss": 1.2994,
"step": 684
},
{
"epoch": 1.8614130434782608,
"grad_norm": 0.33584222197532654,
"learning_rate": 3.636363636363636e-06,
"loss": 1.3625,
"step": 685
},
{
"epoch": 1.8641304347826086,
"grad_norm": 0.3542543053627014,
"learning_rate": 3.593890386343216e-06,
"loss": 1.4085,
"step": 686
},
{
"epoch": 1.8668478260869565,
"grad_norm": 0.3339426815509796,
"learning_rate": 3.550724637681159e-06,
"loss": 1.3908,
"step": 687
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.3317144215106964,
"learning_rate": 3.5068493150684927e-06,
"loss": 1.3254,
"step": 688
},
{
"epoch": 1.8722826086956523,
"grad_norm": 0.32547229528427124,
"learning_rate": 3.462246777163904e-06,
"loss": 1.3813,
"step": 689
},
{
"epoch": 1.875,
"grad_norm": 0.3365728557109833,
"learning_rate": 3.416898792943361e-06,
"loss": 1.3404,
"step": 690
},
{
"epoch": 1.8777173913043477,
"grad_norm": 0.33152568340301514,
"learning_rate": 3.370786516853932e-06,
"loss": 1.303,
"step": 691
},
{
"epoch": 1.8804347826086958,
"grad_norm": 0.34790927171707153,
"learning_rate": 3.3238904627006605e-06,
"loss": 1.3703,
"step": 692
},
{
"epoch": 1.8831521739130435,
"grad_norm": 0.3446458876132965,
"learning_rate": 3.276190476190476e-06,
"loss": 1.316,
"step": 693
},
{
"epoch": 1.8858695652173914,
"grad_norm": 0.334216445684433,
"learning_rate": 3.2276657060518727e-06,
"loss": 1.4189,
"step": 694
},
{
"epoch": 1.8885869565217392,
"grad_norm": 0.3326725661754608,
"learning_rate": 3.1782945736434107e-06,
"loss": 1.2832,
"step": 695
},
{
"epoch": 1.891304347826087,
"grad_norm": 0.34493488073349,
"learning_rate": 3.1280547409579666e-06,
"loss": 1.3276,
"step": 696
},
{
"epoch": 1.8940217391304348,
"grad_norm": 0.34864330291748047,
"learning_rate": 3.0769230769230766e-06,
"loss": 1.3397,
"step": 697
},
{
"epoch": 1.8967391304347827,
"grad_norm": 0.3301394581794739,
"learning_rate": 3.0248756218905467e-06,
"loss": 1.3018,
"step": 698
},
{
"epoch": 1.8994565217391304,
"grad_norm": 0.33378443121910095,
"learning_rate": 2.9718875502008025e-06,
"loss": 1.3894,
"step": 699
},
{
"epoch": 1.9021739130434783,
"grad_norm": 0.33840638399124146,
"learning_rate": 2.917933130699088e-06,
"loss": 1.371,
"step": 700
},
{
"epoch": 1.9048913043478262,
"grad_norm": 0.3386146128177643,
"learning_rate": 2.8629856850715747e-06,
"loss": 1.3683,
"step": 701
},
{
"epoch": 1.9076086956521738,
"grad_norm": 0.33338841795921326,
"learning_rate": 2.807017543859649e-06,
"loss": 1.4066,
"step": 702
},
{
"epoch": 1.9103260869565217,
"grad_norm": 0.32861799001693726,
"learning_rate": 2.7499999999999995e-06,
"loss": 1.3388,
"step": 703
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.3410465717315674,
"learning_rate": 2.691903259726603e-06,
"loss": 1.3076,
"step": 704
},
{
"epoch": 1.9157608695652173,
"grad_norm": 0.33878254890441895,
"learning_rate": 2.6326963906581743e-06,
"loss": 1.3261,
"step": 705
},
{
"epoch": 1.9184782608695652,
"grad_norm": 0.32100728154182434,
"learning_rate": 2.5723472668810287e-06,
"loss": 1.324,
"step": 706
},
{
"epoch": 1.921195652173913,
"grad_norm": 0.3330436646938324,
"learning_rate": 2.5108225108225104e-06,
"loss": 1.278,
"step": 707
},
{
"epoch": 1.9239130434782608,
"grad_norm": 0.340689092874527,
"learning_rate": 2.448087431693989e-06,
"loss": 1.3297,
"step": 708
},
{
"epoch": 1.9266304347826086,
"grad_norm": 0.3378203511238098,
"learning_rate": 2.3841059602649004e-06,
"loss": 1.3884,
"step": 709
},
{
"epoch": 1.9293478260869565,
"grad_norm": 0.35144487023353577,
"learning_rate": 2.3188405797101444e-06,
"loss": 1.36,
"step": 710
},
{
"epoch": 1.9320652173913042,
"grad_norm": 0.329775333404541,
"learning_rate": 2.2522522522522524e-06,
"loss": 1.384,
"step": 711
},
{
"epoch": 1.9347826086956523,
"grad_norm": 0.33017706871032715,
"learning_rate": 2.184300341296928e-06,
"loss": 1.3275,
"step": 712
},
{
"epoch": 1.9375,
"grad_norm": 0.3360842764377594,
"learning_rate": 2.114942528735632e-06,
"loss": 1.4158,
"step": 713
},
{
"epoch": 1.9402173913043477,
"grad_norm": 0.32160329818725586,
"learning_rate": 2.044134727061556e-06,
"loss": 1.2895,
"step": 714
},
{
"epoch": 1.9429347826086958,
"grad_norm": 0.33702775835990906,
"learning_rate": 1.9718309859154927e-06,
"loss": 1.2626,
"step": 715
},
{
"epoch": 1.9456521739130435,
"grad_norm": 0.3408706784248352,
"learning_rate": 1.8979833926453144e-06,
"loss": 1.3131,
"step": 716
},
{
"epoch": 1.9483695652173914,
"grad_norm": 0.3265095353126526,
"learning_rate": 1.8225419664268583e-06,
"loss": 1.2896,
"step": 717
},
{
"epoch": 1.9510869565217392,
"grad_norm": 0.34003371000289917,
"learning_rate": 1.7454545454545452e-06,
"loss": 1.2791,
"step": 718
},
{
"epoch": 1.953804347826087,
"grad_norm": 0.33744871616363525,
"learning_rate": 1.6666666666666665e-06,
"loss": 1.3417,
"step": 719
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.33043915033340454,
"learning_rate": 1.5861214374225524e-06,
"loss": 1.3383,
"step": 720
},
{
"epoch": 1.9592391304347827,
"grad_norm": 0.3270052969455719,
"learning_rate": 1.5037593984962404e-06,
"loss": 1.358,
"step": 721
},
{
"epoch": 1.9619565217391304,
"grad_norm": 0.32354652881622314,
"learning_rate": 1.4195183776932827e-06,
"loss": 1.3476,
"step": 722
},
{
"epoch": 1.9646739130434783,
"grad_norm": 0.33073997497558594,
"learning_rate": 1.3333333333333332e-06,
"loss": 1.3588,
"step": 723
},
{
"epoch": 1.9673913043478262,
"grad_norm": 0.33804917335510254,
"learning_rate": 1.245136186770428e-06,
"loss": 1.3842,
"step": 724
},
{
"epoch": 1.9701086956521738,
"grad_norm": 0.3286682367324829,
"learning_rate": 1.1548556430446192e-06,
"loss": 1.303,
"step": 725
},
{
"epoch": 1.9728260869565217,
"grad_norm": 0.33602073788642883,
"learning_rate": 1.0624169986719786e-06,
"loss": 1.2939,
"step": 726
},
{
"epoch": 1.9755434782608696,
"grad_norm": 0.33070436120033264,
"learning_rate": 9.67741935483871e-07,
"loss": 1.2889,
"step": 727
},
{
"epoch": 1.9782608695652173,
"grad_norm": 0.33206212520599365,
"learning_rate": 8.707482993197278e-07,
"loss": 1.3471,
"step": 728
},
{
"epoch": 1.9809782608695652,
"grad_norm": 0.32087206840515137,
"learning_rate": 7.713498622589532e-07,
"loss": 1.3782,
"step": 729
},
{
"epoch": 1.983695652173913,
"grad_norm": 0.32111042737960815,
"learning_rate": 6.694560669456067e-07,
"loss": 1.2801,
"step": 730
},
{
"epoch": 1.9864130434782608,
"grad_norm": 0.3236371874809265,
"learning_rate": 5.649717514124293e-07,
"loss": 1.4388,
"step": 731
},
{
"epoch": 1.9891304347826086,
"grad_norm": 0.3244304656982422,
"learning_rate": 4.5779685264663803e-07,
"loss": 1.2649,
"step": 732
},
{
"epoch": 1.9918478260869565,
"grad_norm": 0.505721926689148,
"learning_rate": 3.478260869565217e-07,
"loss": 1.272,
"step": 733
},
{
"epoch": 1.9945652173913042,
"grad_norm": 0.3223818838596344,
"learning_rate": 2.3494860499265783e-07,
"loss": 1.3915,
"step": 734
},
{
"epoch": 1.9972826086956523,
"grad_norm": 0.3199419677257538,
"learning_rate": 1.1904761904761904e-07,
"loss": 1.3722,
"step": 735
},
{
"epoch": 2.0,
"grad_norm": 0.3224795460700989,
"learning_rate": 0,
"loss": 1.4072,
"step": 736
}
],
"logging_steps": 1,
"max_steps": 736,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 184,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.746705359123513e+19,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}