lewtun's picture
lewtun HF staff
Model save
182ed3e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.0,
"eval_steps": 500,
"global_step": 330,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.045454545454545456,
"grad_norm": 257.23835842552523,
"learning_rate": 2.9411764705882355e-06,
"loss": 8.3183,
"step": 1
},
{
"epoch": 0.09090909090909091,
"grad_norm": 212.69538455789322,
"learning_rate": 5.882352941176471e-06,
"loss": 8.7163,
"step": 2
},
{
"epoch": 0.13636363636363635,
"grad_norm": 112.61346458118909,
"learning_rate": 8.823529411764707e-06,
"loss": 7.6494,
"step": 3
},
{
"epoch": 0.18181818181818182,
"grad_norm": 784.8951183910573,
"learning_rate": 1.1764705882352942e-05,
"loss": 8.0969,
"step": 4
},
{
"epoch": 0.22727272727272727,
"grad_norm": 185.45732108671288,
"learning_rate": 1.4705882352941177e-05,
"loss": 7.311,
"step": 5
},
{
"epoch": 0.2727272727272727,
"grad_norm": 195.28031105021546,
"learning_rate": 1.7647058823529414e-05,
"loss": 7.8658,
"step": 6
},
{
"epoch": 0.3181818181818182,
"grad_norm": 36.48221664801266,
"learning_rate": 2.058823529411765e-05,
"loss": 5.2603,
"step": 7
},
{
"epoch": 0.36363636363636365,
"grad_norm": 9.55998838923863,
"learning_rate": 2.3529411764705884e-05,
"loss": 4.8036,
"step": 8
},
{
"epoch": 0.4090909090909091,
"grad_norm": 9.615632721341097,
"learning_rate": 2.647058823529412e-05,
"loss": 4.8279,
"step": 9
},
{
"epoch": 0.45454545454545453,
"grad_norm": 7.669264353119744,
"learning_rate": 2.9411764705882354e-05,
"loss": 4.6123,
"step": 10
},
{
"epoch": 0.5,
"grad_norm": 6.9967605072507215,
"learning_rate": 3.235294117647059e-05,
"loss": 4.3857,
"step": 11
},
{
"epoch": 0.5454545454545454,
"grad_norm": 6.587199773697142,
"learning_rate": 3.529411764705883e-05,
"loss": 4.4648,
"step": 12
},
{
"epoch": 0.5909090909090909,
"grad_norm": 5.505554834249296,
"learning_rate": 3.8235294117647055e-05,
"loss": 4.1377,
"step": 13
},
{
"epoch": 0.6363636363636364,
"grad_norm": 4.9970053869195965,
"learning_rate": 4.11764705882353e-05,
"loss": 3.8989,
"step": 14
},
{
"epoch": 0.6818181818181818,
"grad_norm": 3.79904253928739,
"learning_rate": 4.411764705882353e-05,
"loss": 3.8294,
"step": 15
},
{
"epoch": 0.7272727272727273,
"grad_norm": 3.508407692792644,
"learning_rate": 4.705882352941177e-05,
"loss": 4.0649,
"step": 16
},
{
"epoch": 0.7727272727272727,
"grad_norm": 3.2871677206858054,
"learning_rate": 5e-05,
"loss": 3.7045,
"step": 17
},
{
"epoch": 0.8181818181818182,
"grad_norm": 3.3241161598281,
"learning_rate": 4.999886666070519e-05,
"loss": 3.7635,
"step": 18
},
{
"epoch": 0.8636363636363636,
"grad_norm": 3.1152646953619567,
"learning_rate": 4.9995466756994795e-05,
"loss": 3.6178,
"step": 19
},
{
"epoch": 0.9090909090909091,
"grad_norm": 3.273528675012558,
"learning_rate": 4.9989800631379443e-05,
"loss": 3.5688,
"step": 20
},
{
"epoch": 0.9545454545454546,
"grad_norm": 3.115834430389958,
"learning_rate": 4.998186885467182e-05,
"loss": 3.2943,
"step": 21
},
{
"epoch": 1.0,
"grad_norm": 2.661570245411782,
"learning_rate": 4.99716722259292e-05,
"loss": 3.0865,
"step": 22
},
{
"epoch": 1.0454545454545454,
"grad_norm": 2.849664680666923,
"learning_rate": 4.99592117723729e-05,
"loss": 3.1392,
"step": 23
},
{
"epoch": 1.0909090909090908,
"grad_norm": 2.4018072414305895,
"learning_rate": 4.994448874928487e-05,
"loss": 2.9298,
"step": 24
},
{
"epoch": 1.1363636363636362,
"grad_norm": 2.0206665673120896,
"learning_rate": 4.992750463988114e-05,
"loss": 2.8627,
"step": 25
},
{
"epoch": 1.1818181818181819,
"grad_norm": 1.9920698277851823,
"learning_rate": 4.990826115516248e-05,
"loss": 2.8083,
"step": 26
},
{
"epoch": 1.2272727272727273,
"grad_norm": 1.7918724410197877,
"learning_rate": 4.9886760233742e-05,
"loss": 2.6199,
"step": 27
},
{
"epoch": 1.2727272727272727,
"grad_norm": 1.5505702505545602,
"learning_rate": 4.986300404164984e-05,
"loss": 2.6521,
"step": 28
},
{
"epoch": 1.3181818181818181,
"grad_norm": 1.2616722287294364,
"learning_rate": 4.9836994972114974e-05,
"loss": 2.6015,
"step": 29
},
{
"epoch": 1.3636363636363638,
"grad_norm": 1.1638338779341943,
"learning_rate": 4.9808735645324125e-05,
"loss": 2.5141,
"step": 30
},
{
"epoch": 1.4090909090909092,
"grad_norm": 1.0794908268185501,
"learning_rate": 4.9778228908157766e-05,
"loss": 2.4714,
"step": 31
},
{
"epoch": 1.4545454545454546,
"grad_norm": 0.9412858812798977,
"learning_rate": 4.9745477833903364e-05,
"loss": 2.4086,
"step": 32
},
{
"epoch": 1.5,
"grad_norm": 0.9703710030327403,
"learning_rate": 4.971048572194577e-05,
"loss": 2.3233,
"step": 33
},
{
"epoch": 1.5454545454545454,
"grad_norm": 0.8923121257838885,
"learning_rate": 4.9673256097434793e-05,
"loss": 2.3252,
"step": 34
},
{
"epoch": 1.5909090909090908,
"grad_norm": 0.7408466397481557,
"learning_rate": 4.963379271093012e-05,
"loss": 2.3592,
"step": 35
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.8332937369940602,
"learning_rate": 4.959209953802344e-05,
"loss": 2.3153,
"step": 36
},
{
"epoch": 1.6818181818181817,
"grad_norm": 0.8198715006056391,
"learning_rate": 4.954818077893798e-05,
"loss": 2.14,
"step": 37
},
{
"epoch": 1.7272727272727273,
"grad_norm": 0.7030061091284978,
"learning_rate": 4.950204085810533e-05,
"loss": 2.1745,
"step": 38
},
{
"epoch": 1.7727272727272727,
"grad_norm": 0.7045320084558211,
"learning_rate": 4.945368442371974e-05,
"loss": 2.0868,
"step": 39
},
{
"epoch": 1.8181818181818183,
"grad_norm": 0.6475997413192687,
"learning_rate": 4.9403116347269866e-05,
"loss": 2.0927,
"step": 40
},
{
"epoch": 1.8636363636363638,
"grad_norm": 0.6818237942981833,
"learning_rate": 4.935034172304797e-05,
"loss": 2.1228,
"step": 41
},
{
"epoch": 1.9090909090909092,
"grad_norm": 0.6349632576540991,
"learning_rate": 4.9295365867636766e-05,
"loss": 2.1594,
"step": 42
},
{
"epoch": 1.9545454545454546,
"grad_norm": 0.6123700654773722,
"learning_rate": 4.923819431937377e-05,
"loss": 1.9419,
"step": 43
},
{
"epoch": 2.0,
"grad_norm": 0.628265874333403,
"learning_rate": 4.9178832837793415e-05,
"loss": 1.9591,
"step": 44
},
{
"epoch": 2.0454545454545454,
"grad_norm": 0.6079957574674165,
"learning_rate": 4.9117287403046766e-05,
"loss": 1.9066,
"step": 45
},
{
"epoch": 2.090909090909091,
"grad_norm": 0.5913194294689115,
"learning_rate": 4.9053564215299135e-05,
"loss": 1.9269,
"step": 46
},
{
"epoch": 2.1363636363636362,
"grad_norm": 0.524599692265706,
"learning_rate": 4.898766969410542e-05,
"loss": 1.8848,
"step": 47
},
{
"epoch": 2.1818181818181817,
"grad_norm": 0.5449852882597904,
"learning_rate": 4.891961047776342e-05,
"loss": 1.8835,
"step": 48
},
{
"epoch": 2.227272727272727,
"grad_norm": 0.5842912439698547,
"learning_rate": 4.8849393422645054e-05,
"loss": 1.8353,
"step": 49
},
{
"epoch": 2.2727272727272725,
"grad_norm": 0.5004654366250076,
"learning_rate": 4.87770256025057e-05,
"loss": 1.8698,
"step": 50
},
{
"epoch": 2.3181818181818183,
"grad_norm": 0.5000460896838382,
"learning_rate": 4.870251430777148e-05,
"loss": 1.8355,
"step": 51
},
{
"epoch": 2.3636363636363638,
"grad_norm": 0.47646909638568696,
"learning_rate": 4.862586704480494e-05,
"loss": 1.8062,
"step": 52
},
{
"epoch": 2.409090909090909,
"grad_norm": 0.4775044778243631,
"learning_rate": 4.8547091535148725e-05,
"loss": 1.7511,
"step": 53
},
{
"epoch": 2.4545454545454546,
"grad_norm": 0.4271462696410255,
"learning_rate": 4.846619571474777e-05,
"loss": 1.819,
"step": 54
},
{
"epoch": 2.5,
"grad_norm": 0.4687848016567625,
"learning_rate": 4.8383187733149814e-05,
"loss": 1.7687,
"step": 55
},
{
"epoch": 2.5454545454545454,
"grad_norm": 0.49290753360843537,
"learning_rate": 4.8298075952684406e-05,
"loss": 1.7602,
"step": 56
},
{
"epoch": 2.590909090909091,
"grad_norm": 0.4145717434905905,
"learning_rate": 4.821086894762045e-05,
"loss": 1.6849,
"step": 57
},
{
"epoch": 2.6363636363636362,
"grad_norm": 0.47758526019077047,
"learning_rate": 4.812157550330246e-05,
"loss": 1.8031,
"step": 58
},
{
"epoch": 2.6818181818181817,
"grad_norm": 0.43090399608171975,
"learning_rate": 4.8030204615265445e-05,
"loss": 1.6979,
"step": 59
},
{
"epoch": 2.7272727272727275,
"grad_norm": 0.42558498758567664,
"learning_rate": 4.7936765488328794e-05,
"loss": 1.6167,
"step": 60
},
{
"epoch": 2.7727272727272725,
"grad_norm": 0.42771373575336713,
"learning_rate": 4.7841267535668876e-05,
"loss": 1.6126,
"step": 61
},
{
"epoch": 2.8181818181818183,
"grad_norm": 0.45605982497696307,
"learning_rate": 4.7743720377870786e-05,
"loss": 1.7051,
"step": 62
},
{
"epoch": 2.8636363636363638,
"grad_norm": 0.36383290803582924,
"learning_rate": 4.764413384195915e-05,
"loss": 1.5355,
"step": 63
},
{
"epoch": 2.909090909090909,
"grad_norm": 0.35888633901842737,
"learning_rate": 4.7542517960408125e-05,
"loss": 1.6037,
"step": 64
},
{
"epoch": 2.9545454545454546,
"grad_norm": 0.35247547310658606,
"learning_rate": 4.7438882970130756e-05,
"loss": 1.6403,
"step": 65
},
{
"epoch": 3.0,
"grad_norm": 0.4123547479714445,
"learning_rate": 4.7333239311447634e-05,
"loss": 1.6687,
"step": 66
},
{
"epoch": 3.0454545454545454,
"grad_norm": 0.3575991218498864,
"learning_rate": 4.7225597627035176e-05,
"loss": 1.6531,
"step": 67
},
{
"epoch": 3.090909090909091,
"grad_norm": 0.3303944518163955,
"learning_rate": 4.711596876085344e-05,
"loss": 1.4537,
"step": 68
},
{
"epoch": 3.1363636363636362,
"grad_norm": 0.43041449387205766,
"learning_rate": 4.70043637570537e-05,
"loss": 1.6096,
"step": 69
},
{
"epoch": 3.1818181818181817,
"grad_norm": 0.3646696726581941,
"learning_rate": 4.6890793858865865e-05,
"loss": 1.586,
"step": 70
},
{
"epoch": 3.227272727272727,
"grad_norm": 0.3260419946678917,
"learning_rate": 4.677527050746577e-05,
"loss": 1.5422,
"step": 71
},
{
"epoch": 3.2727272727272725,
"grad_norm": 0.33570338878732947,
"learning_rate": 4.665780534082264e-05,
"loss": 1.5553,
"step": 72
},
{
"epoch": 3.3181818181818183,
"grad_norm": 0.3085499618355785,
"learning_rate": 4.6538410192526613e-05,
"loss": 1.5067,
"step": 73
},
{
"epoch": 3.3636363636363638,
"grad_norm": 0.30727638027509513,
"learning_rate": 4.6417097090596637e-05,
"loss": 1.5667,
"step": 74
},
{
"epoch": 3.409090909090909,
"grad_norm": 0.3021730689826977,
"learning_rate": 4.629387825626875e-05,
"loss": 1.5795,
"step": 75
},
{
"epoch": 3.4545454545454546,
"grad_norm": 0.32502917714553514,
"learning_rate": 4.6168766102764874e-05,
"loss": 1.5154,
"step": 76
},
{
"epoch": 3.5,
"grad_norm": 0.3301265717537239,
"learning_rate": 4.604177323404235e-05,
"loss": 1.5048,
"step": 77
},
{
"epoch": 3.5454545454545454,
"grad_norm": 0.323165145487666,
"learning_rate": 4.591291244352413e-05,
"loss": 1.5117,
"step": 78
},
{
"epoch": 3.590909090909091,
"grad_norm": 0.3082703852698703,
"learning_rate": 4.578219671280998e-05,
"loss": 1.4521,
"step": 79
},
{
"epoch": 3.6363636363636362,
"grad_norm": 0.29479913403691677,
"learning_rate": 4.5649639210368714e-05,
"loss": 1.4487,
"step": 80
},
{
"epoch": 3.6818181818181817,
"grad_norm": 0.3092257716704811,
"learning_rate": 4.551525329021155e-05,
"loss": 1.447,
"step": 81
},
{
"epoch": 3.7272727272727275,
"grad_norm": 0.37229811196636947,
"learning_rate": 4.5379052490546855e-05,
"loss": 1.4919,
"step": 82
},
{
"epoch": 3.7727272727272725,
"grad_norm": 0.31138376308603494,
"learning_rate": 4.524105053241625e-05,
"loss": 1.4865,
"step": 83
},
{
"epoch": 3.8181818181818183,
"grad_norm": 0.3584821617778458,
"learning_rate": 4.510126131831234e-05,
"loss": 1.5092,
"step": 84
},
{
"epoch": 3.8636363636363638,
"grad_norm": 0.32342053983138597,
"learning_rate": 4.4959698930778184e-05,
"loss": 1.3528,
"step": 85
},
{
"epoch": 3.909090909090909,
"grad_norm": 0.3755258228072684,
"learning_rate": 4.481637763098858e-05,
"loss": 1.5071,
"step": 86
},
{
"epoch": 3.9545454545454546,
"grad_norm": 0.31577787863565876,
"learning_rate": 4.4671311857313376e-05,
"loss": 1.5149,
"step": 87
},
{
"epoch": 4.0,
"grad_norm": 0.3274660630382459,
"learning_rate": 4.452451622386294e-05,
"loss": 1.5101,
"step": 88
},
{
"epoch": 4.045454545454546,
"grad_norm": 0.295499868623689,
"learning_rate": 4.437600551901591e-05,
"loss": 1.4591,
"step": 89
},
{
"epoch": 4.090909090909091,
"grad_norm": 0.28617863379643455,
"learning_rate": 4.422579470392941e-05,
"loss": 1.4866,
"step": 90
},
{
"epoch": 4.136363636363637,
"grad_norm": 0.27613544331064876,
"learning_rate": 4.40738989110318e-05,
"loss": 1.4244,
"step": 91
},
{
"epoch": 4.181818181818182,
"grad_norm": 0.2969006794655833,
"learning_rate": 4.392033344249827e-05,
"loss": 1.3955,
"step": 92
},
{
"epoch": 4.2272727272727275,
"grad_norm": 0.2833654953485509,
"learning_rate": 4.376511376870925e-05,
"loss": 1.4366,
"step": 93
},
{
"epoch": 4.2727272727272725,
"grad_norm": 0.32286555195443095,
"learning_rate": 4.36082555266919e-05,
"loss": 1.3779,
"step": 94
},
{
"epoch": 4.318181818181818,
"grad_norm": 0.30339867592392933,
"learning_rate": 4.3449774518544837e-05,
"loss": 1.3523,
"step": 95
},
{
"epoch": 4.363636363636363,
"grad_norm": 0.3678773849209462,
"learning_rate": 4.328968670984621e-05,
"loss": 1.4055,
"step": 96
},
{
"epoch": 4.409090909090909,
"grad_norm": 0.2809671108473524,
"learning_rate": 4.3128008228045264e-05,
"loss": 1.3742,
"step": 97
},
{
"epoch": 4.454545454545454,
"grad_norm": 0.3264760569541278,
"learning_rate": 4.296475536083769e-05,
"loss": 1.3946,
"step": 98
},
{
"epoch": 4.5,
"grad_norm": 0.258862105020995,
"learning_rate": 4.279994455452478e-05,
"loss": 1.3284,
"step": 99
},
{
"epoch": 4.545454545454545,
"grad_norm": 0.3134553840821824,
"learning_rate": 4.263359241235657e-05,
"loss": 1.3851,
"step": 100
},
{
"epoch": 4.590909090909091,
"grad_norm": 0.30170722376996695,
"learning_rate": 4.246571569285925e-05,
"loss": 1.371,
"step": 101
},
{
"epoch": 4.636363636363637,
"grad_norm": 0.2887395932931785,
"learning_rate": 4.229633130814685e-05,
"loss": 1.3292,
"step": 102
},
{
"epoch": 4.681818181818182,
"grad_norm": 0.3218428316599217,
"learning_rate": 4.212545632221751e-05,
"loss": 1.4558,
"step": 103
},
{
"epoch": 4.7272727272727275,
"grad_norm": 0.27631578638146514,
"learning_rate": 4.1953107949234414e-05,
"loss": 1.2989,
"step": 104
},
{
"epoch": 4.7727272727272725,
"grad_norm": 0.2688375043457644,
"learning_rate": 4.1779303551791695e-05,
"loss": 1.3677,
"step": 105
},
{
"epoch": 4.818181818181818,
"grad_norm": 0.28592384506961227,
"learning_rate": 4.160406063916517e-05,
"loss": 1.2749,
"step": 106
},
{
"epoch": 4.863636363636363,
"grad_norm": 0.30087535430968293,
"learning_rate": 4.142739686554853e-05,
"loss": 1.405,
"step": 107
},
{
"epoch": 4.909090909090909,
"grad_norm": 0.27354449180224716,
"learning_rate": 4.124933002827481e-05,
"loss": 1.3642,
"step": 108
},
{
"epoch": 4.954545454545455,
"grad_norm": 0.29727660205166156,
"learning_rate": 4.106987806602345e-05,
"loss": 1.3686,
"step": 109
},
{
"epoch": 5.0,
"grad_norm": 0.29032585014056134,
"learning_rate": 4.088905905701316e-05,
"loss": 1.3651,
"step": 110
},
{
"epoch": 5.045454545454546,
"grad_norm": 0.2900324279431226,
"learning_rate": 4.070689121718066e-05,
"loss": 1.3556,
"step": 111
},
{
"epoch": 5.090909090909091,
"grad_norm": 0.24863713928813203,
"learning_rate": 4.0523392898345604e-05,
"loss": 1.341,
"step": 112
},
{
"epoch": 5.136363636363637,
"grad_norm": 0.2790657225717466,
"learning_rate": 4.03385825863618e-05,
"loss": 1.3565,
"step": 113
},
{
"epoch": 5.181818181818182,
"grad_norm": 0.2665012607059084,
"learning_rate": 4.0152478899254906e-05,
"loss": 1.2776,
"step": 114
},
{
"epoch": 5.2272727272727275,
"grad_norm": 0.29328265354243366,
"learning_rate": 3.996510058534682e-05,
"loss": 1.3697,
"step": 115
},
{
"epoch": 5.2727272727272725,
"grad_norm": 0.2636658932396027,
"learning_rate": 3.9776466521366995e-05,
"loss": 1.3208,
"step": 116
},
{
"epoch": 5.318181818181818,
"grad_norm": 0.23509864156431806,
"learning_rate": 3.958659571055071e-05,
"loss": 1.2671,
"step": 117
},
{
"epoch": 5.363636363636363,
"grad_norm": 0.2621476324195016,
"learning_rate": 3.939550728072473e-05,
"loss": 1.3526,
"step": 118
},
{
"epoch": 5.409090909090909,
"grad_norm": 0.24664500701059844,
"learning_rate": 3.920322048238024e-05,
"loss": 1.2985,
"step": 119
},
{
"epoch": 5.454545454545454,
"grad_norm": 0.28092507999952776,
"learning_rate": 3.900975468673368e-05,
"loss": 1.2591,
"step": 120
},
{
"epoch": 5.5,
"grad_norm": 0.2604137388452376,
"learning_rate": 3.8815129383775104e-05,
"loss": 1.3022,
"step": 121
},
{
"epoch": 5.545454545454545,
"grad_norm": 0.25485436564236713,
"learning_rate": 3.861936418030483e-05,
"loss": 1.3511,
"step": 122
},
{
"epoch": 5.590909090909091,
"grad_norm": 0.22709249270142517,
"learning_rate": 3.842247879795822e-05,
"loss": 1.3479,
"step": 123
},
{
"epoch": 5.636363636363637,
"grad_norm": 0.23971939043085813,
"learning_rate": 3.822449307121886e-05,
"loss": 1.2734,
"step": 124
},
{
"epoch": 5.681818181818182,
"grad_norm": 0.27819081228375214,
"learning_rate": 3.8025426945420426e-05,
"loss": 1.2971,
"step": 125
},
{
"epoch": 5.7272727272727275,
"grad_norm": 0.25456906150670894,
"learning_rate": 3.782530047473739e-05,
"loss": 1.2765,
"step": 126
},
{
"epoch": 5.7727272727272725,
"grad_norm": 0.2235373692230836,
"learning_rate": 3.762413382016467e-05,
"loss": 1.2652,
"step": 127
},
{
"epoch": 5.818181818181818,
"grad_norm": 0.29051536045091336,
"learning_rate": 3.742194724748668e-05,
"loss": 1.2158,
"step": 128
},
{
"epoch": 5.863636363636363,
"grad_norm": 0.25505139923240183,
"learning_rate": 3.721876112523566e-05,
"loss": 1.2749,
"step": 129
},
{
"epoch": 5.909090909090909,
"grad_norm": 0.2297174194118098,
"learning_rate": 3.701459592263974e-05,
"loss": 1.2686,
"step": 130
},
{
"epoch": 5.954545454545455,
"grad_norm": 1.1819543960335286,
"learning_rate": 3.680947220756086e-05,
"loss": 1.2669,
"step": 131
},
{
"epoch": 6.0,
"grad_norm": 0.3596649944995116,
"learning_rate": 3.6603410644422703e-05,
"loss": 1.2553,
"step": 132
},
{
"epoch": 6.045454545454546,
"grad_norm": 0.2373357658133507,
"learning_rate": 3.639643199212899e-05,
"loss": 1.2475,
"step": 133
},
{
"epoch": 6.090909090909091,
"grad_norm": 0.22493113890489372,
"learning_rate": 3.618855710197212e-05,
"loss": 1.2343,
"step": 134
},
{
"epoch": 6.136363636363637,
"grad_norm": 0.2186099012077051,
"learning_rate": 3.59798069155327e-05,
"loss": 1.2583,
"step": 135
},
{
"epoch": 6.181818181818182,
"grad_norm": 0.2198359168498248,
"learning_rate": 3.577020246256974e-05,
"loss": 1.2124,
"step": 136
},
{
"epoch": 6.2272727272727275,
"grad_norm": 0.2418275850989451,
"learning_rate": 3.555976485890216e-05,
"loss": 1.2652,
"step": 137
},
{
"epoch": 6.2727272727272725,
"grad_norm": 0.24177652899109378,
"learning_rate": 3.5348515304281567e-05,
"loss": 1.2718,
"step": 138
},
{
"epoch": 6.318181818181818,
"grad_norm": 0.23176107126497403,
"learning_rate": 3.5136475080256504e-05,
"loss": 1.2815,
"step": 139
},
{
"epoch": 6.363636363636363,
"grad_norm": 0.2186114898802362,
"learning_rate": 3.492366554802856e-05,
"loss": 1.2278,
"step": 140
},
{
"epoch": 6.409090909090909,
"grad_norm": 0.24894955084370854,
"learning_rate": 3.471010814630044e-05,
"loss": 1.2528,
"step": 141
},
{
"epoch": 6.454545454545454,
"grad_norm": 0.26036666060205077,
"learning_rate": 3.449582438911613e-05,
"loss": 1.3011,
"step": 142
},
{
"epoch": 6.5,
"grad_norm": 0.21464875894128543,
"learning_rate": 3.428083586369362e-05,
"loss": 1.2153,
"step": 143
},
{
"epoch": 6.545454545454545,
"grad_norm": 0.24924104575584335,
"learning_rate": 3.406516422825013e-05,
"loss": 1.2149,
"step": 144
},
{
"epoch": 6.590909090909091,
"grad_norm": 0.22796512903628247,
"learning_rate": 3.384883120982027e-05,
"loss": 1.2057,
"step": 145
},
{
"epoch": 6.636363636363637,
"grad_norm": 0.22412816101017885,
"learning_rate": 3.363185860206719e-05,
"loss": 1.2879,
"step": 146
},
{
"epoch": 6.681818181818182,
"grad_norm": 0.22898070560796288,
"learning_rate": 3.341426826308708e-05,
"loss": 1.2407,
"step": 147
},
{
"epoch": 6.7272727272727275,
"grad_norm": 0.24599569608907884,
"learning_rate": 3.319608211320719e-05,
"loss": 1.193,
"step": 148
},
{
"epoch": 6.7727272727272725,
"grad_norm": 0.22837138512494187,
"learning_rate": 3.29773221327775e-05,
"loss": 1.2173,
"step": 149
},
{
"epoch": 6.818181818181818,
"grad_norm": 0.23242892820855598,
"learning_rate": 3.2758010359956376e-05,
"loss": 1.3222,
"step": 150
},
{
"epoch": 6.863636363636363,
"grad_norm": 0.2543864237654759,
"learning_rate": 3.253816888849051e-05,
"loss": 1.24,
"step": 151
},
{
"epoch": 6.909090909090909,
"grad_norm": 0.24092106206033628,
"learning_rate": 3.2317819865489066e-05,
"loss": 1.2964,
"step": 152
},
{
"epoch": 6.954545454545455,
"grad_norm": 0.24386711464365543,
"learning_rate": 3.209698548919262e-05,
"loss": 1.2041,
"step": 153
},
{
"epoch": 7.0,
"grad_norm": 0.2507114565585548,
"learning_rate": 3.187568800673682e-05,
"loss": 1.2057,
"step": 154
},
{
"epoch": 7.045454545454546,
"grad_norm": 0.21813780408934713,
"learning_rate": 3.165394971191125e-05,
"loss": 1.1822,
"step": 155
},
{
"epoch": 7.090909090909091,
"grad_norm": 0.22737082010889867,
"learning_rate": 3.143179294291351e-05,
"loss": 1.2516,
"step": 156
},
{
"epoch": 7.136363636363637,
"grad_norm": 0.2168145915465791,
"learning_rate": 3.120924008009875e-05,
"loss": 1.1801,
"step": 157
},
{
"epoch": 7.181818181818182,
"grad_norm": 0.19136333990689988,
"learning_rate": 3.0986313543725174e-05,
"loss": 1.1683,
"step": 158
},
{
"epoch": 7.2272727272727275,
"grad_norm": 0.2261635672146715,
"learning_rate": 3.0763035791695335e-05,
"loss": 1.2801,
"step": 159
},
{
"epoch": 7.2727272727272725,
"grad_norm": 0.1900863256471995,
"learning_rate": 3.053942931729365e-05,
"loss": 1.2395,
"step": 160
},
{
"epoch": 7.318181818181818,
"grad_norm": 0.20083348886939187,
"learning_rate": 3.0315516646920494e-05,
"loss": 1.1789,
"step": 161
},
{
"epoch": 7.363636363636363,
"grad_norm": 0.22527651747823768,
"learning_rate": 3.0091320337822793e-05,
"loss": 1.1912,
"step": 162
},
{
"epoch": 7.409090909090909,
"grad_norm": 0.22935986524053414,
"learning_rate": 2.9866862975821596e-05,
"loss": 1.2043,
"step": 163
},
{
"epoch": 7.454545454545454,
"grad_norm": 0.25053211749891174,
"learning_rate": 2.9642167173036768e-05,
"loss": 1.2245,
"step": 164
},
{
"epoch": 7.5,
"grad_norm": 0.24058101769072435,
"learning_rate": 2.9417255565608982e-05,
"loss": 1.1887,
"step": 165
},
{
"epoch": 7.545454545454545,
"grad_norm": 0.2236093293557457,
"learning_rate": 2.9192150811419343e-05,
"loss": 1.1546,
"step": 166
},
{
"epoch": 7.590909090909091,
"grad_norm": 0.2536710657188106,
"learning_rate": 2.8966875587806842e-05,
"loss": 1.2302,
"step": 167
},
{
"epoch": 7.636363636363637,
"grad_norm": 0.23635026185440464,
"learning_rate": 2.8741452589283747e-05,
"loss": 1.2491,
"step": 168
},
{
"epoch": 7.681818181818182,
"grad_norm": 0.20829672573008348,
"learning_rate": 2.8515904525249342e-05,
"loss": 1.1821,
"step": 169
},
{
"epoch": 7.7272727272727275,
"grad_norm": 0.19887830172000012,
"learning_rate": 2.8290254117702204e-05,
"loss": 1.2327,
"step": 170
},
{
"epoch": 7.7727272727272725,
"grad_norm": 0.22301542864724377,
"learning_rate": 2.8064524098951122e-05,
"loss": 1.1883,
"step": 171
},
{
"epoch": 7.818181818181818,
"grad_norm": 0.20156843827561532,
"learning_rate": 2.7838737209324995e-05,
"loss": 1.2065,
"step": 172
},
{
"epoch": 7.863636363636363,
"grad_norm": 0.2196470118065061,
"learning_rate": 2.761291619488198e-05,
"loss": 1.2109,
"step": 173
},
{
"epoch": 7.909090909090909,
"grad_norm": 0.23170457369155842,
"learning_rate": 2.738708380511803e-05,
"loss": 1.211,
"step": 174
},
{
"epoch": 7.954545454545455,
"grad_norm": 0.20233640061480676,
"learning_rate": 2.7161262790675013e-05,
"loss": 1.1566,
"step": 175
},
{
"epoch": 8.0,
"grad_norm": 0.22428142795845282,
"learning_rate": 2.6935475901048884e-05,
"loss": 1.2359,
"step": 176
},
{
"epoch": 8.045454545454545,
"grad_norm": 0.21470110790573302,
"learning_rate": 2.6709745882297805e-05,
"loss": 1.2061,
"step": 177
},
{
"epoch": 8.090909090909092,
"grad_norm": 0.19724648690029864,
"learning_rate": 2.6484095474750663e-05,
"loss": 1.2481,
"step": 178
},
{
"epoch": 8.136363636363637,
"grad_norm": 0.20720915390036856,
"learning_rate": 2.6258547410716272e-05,
"loss": 1.1459,
"step": 179
},
{
"epoch": 8.181818181818182,
"grad_norm": 0.2279215538554465,
"learning_rate": 2.6033124412193167e-05,
"loss": 1.1456,
"step": 180
},
{
"epoch": 8.227272727272727,
"grad_norm": 0.1919565575948159,
"learning_rate": 2.580784918858066e-05,
"loss": 1.2178,
"step": 181
},
{
"epoch": 8.272727272727273,
"grad_norm": 0.20123788080080415,
"learning_rate": 2.558274443439103e-05,
"loss": 1.1572,
"step": 182
},
{
"epoch": 8.318181818181818,
"grad_norm": 0.18813695162235894,
"learning_rate": 2.535783282696324e-05,
"loss": 1.1923,
"step": 183
},
{
"epoch": 8.363636363636363,
"grad_norm": 0.20823032710382397,
"learning_rate": 2.5133137024178406e-05,
"loss": 1.1843,
"step": 184
},
{
"epoch": 8.409090909090908,
"grad_norm": 0.2099319254835431,
"learning_rate": 2.4908679662177216e-05,
"loss": 1.1993,
"step": 185
},
{
"epoch": 8.454545454545455,
"grad_norm": 0.19824053523947052,
"learning_rate": 2.468448335307951e-05,
"loss": 1.1826,
"step": 186
},
{
"epoch": 8.5,
"grad_norm": 0.19231852836204918,
"learning_rate": 2.4460570682706362e-05,
"loss": 1.1279,
"step": 187
},
{
"epoch": 8.545454545454545,
"grad_norm": 0.20741037413832103,
"learning_rate": 2.4236964208304673e-05,
"loss": 1.0622,
"step": 188
},
{
"epoch": 8.590909090909092,
"grad_norm": 0.3975271395657707,
"learning_rate": 2.4013686456274824e-05,
"loss": 1.1743,
"step": 189
},
{
"epoch": 8.636363636363637,
"grad_norm": 0.1911090472778124,
"learning_rate": 2.379075991990126e-05,
"loss": 1.1241,
"step": 190
},
{
"epoch": 8.681818181818182,
"grad_norm": 0.19428196767319525,
"learning_rate": 2.35682070570865e-05,
"loss": 1.2169,
"step": 191
},
{
"epoch": 8.727272727272727,
"grad_norm": 0.1929543238663036,
"learning_rate": 2.3346050288088743e-05,
"loss": 1.1213,
"step": 192
},
{
"epoch": 8.772727272727273,
"grad_norm": 0.1859295707214649,
"learning_rate": 2.3124311993263192e-05,
"loss": 1.2022,
"step": 193
},
{
"epoch": 8.818181818181818,
"grad_norm": 0.19215762964127076,
"learning_rate": 2.2903014510807392e-05,
"loss": 1.1756,
"step": 194
},
{
"epoch": 8.863636363636363,
"grad_norm": 0.19577206240809536,
"learning_rate": 2.2682180134510943e-05,
"loss": 1.1492,
"step": 195
},
{
"epoch": 8.909090909090908,
"grad_norm": 0.18354313338272446,
"learning_rate": 2.2461831111509496e-05,
"loss": 1.1474,
"step": 196
},
{
"epoch": 8.954545454545455,
"grad_norm": 0.1959877870143329,
"learning_rate": 2.2241989640043633e-05,
"loss": 1.2621,
"step": 197
},
{
"epoch": 9.0,
"grad_norm": 0.1948482879201238,
"learning_rate": 2.202267786722252e-05,
"loss": 1.2449,
"step": 198
},
{
"epoch": 9.045454545454545,
"grad_norm": 0.21120390875412814,
"learning_rate": 2.1803917886792812e-05,
"loss": 1.1333,
"step": 199
},
{
"epoch": 9.090909090909092,
"grad_norm": 0.18499690032381164,
"learning_rate": 2.1585731736912922e-05,
"loss": 1.2015,
"step": 200
},
{
"epoch": 9.136363636363637,
"grad_norm": 0.19403738840593351,
"learning_rate": 2.136814139793282e-05,
"loss": 1.1961,
"step": 201
},
{
"epoch": 9.181818181818182,
"grad_norm": 0.2116507051866016,
"learning_rate": 2.1151168790179738e-05,
"loss": 1.215,
"step": 202
},
{
"epoch": 9.227272727272727,
"grad_norm": 0.2091130488246949,
"learning_rate": 2.0934835771749872e-05,
"loss": 1.1493,
"step": 203
},
{
"epoch": 9.272727272727273,
"grad_norm": 0.21656692423046117,
"learning_rate": 2.0719164136306386e-05,
"loss": 1.1132,
"step": 204
},
{
"epoch": 9.318181818181818,
"grad_norm": 0.20613565196879666,
"learning_rate": 2.0504175610883876e-05,
"loss": 1.2056,
"step": 205
},
{
"epoch": 9.363636363636363,
"grad_norm": 0.18361410512148918,
"learning_rate": 2.0289891853699573e-05,
"loss": 1.2396,
"step": 206
},
{
"epoch": 9.409090909090908,
"grad_norm": 0.1970968023390117,
"learning_rate": 2.0076334451971447e-05,
"loss": 1.1505,
"step": 207
},
{
"epoch": 9.454545454545455,
"grad_norm": 0.2802489746470572,
"learning_rate": 1.9863524919743505e-05,
"loss": 1.0803,
"step": 208
},
{
"epoch": 9.5,
"grad_norm": 0.18361667226221806,
"learning_rate": 1.9651484695718435e-05,
"loss": 1.1293,
"step": 209
},
{
"epoch": 9.545454545454545,
"grad_norm": 0.1836624775996522,
"learning_rate": 1.944023514109784e-05,
"loss": 1.1646,
"step": 210
},
{
"epoch": 9.590909090909092,
"grad_norm": 0.7985028121765728,
"learning_rate": 1.922979753743027e-05,
"loss": 1.1631,
"step": 211
},
{
"epoch": 9.636363636363637,
"grad_norm": 0.18982874728183255,
"learning_rate": 1.9020193084467303e-05,
"loss": 1.0795,
"step": 212
},
{
"epoch": 9.681818181818182,
"grad_norm": 0.2738998625073182,
"learning_rate": 1.881144289802788e-05,
"loss": 1.0812,
"step": 213
},
{
"epoch": 9.727272727272727,
"grad_norm": 0.19287056890311244,
"learning_rate": 1.8603568007871025e-05,
"loss": 1.1318,
"step": 214
},
{
"epoch": 9.772727272727273,
"grad_norm": 0.19551935271275803,
"learning_rate": 1.83965893555773e-05,
"loss": 1.1903,
"step": 215
},
{
"epoch": 9.818181818181818,
"grad_norm": 0.20289464182378333,
"learning_rate": 1.8190527792439145e-05,
"loss": 1.1716,
"step": 216
},
{
"epoch": 9.863636363636363,
"grad_norm": 0.18427534137801654,
"learning_rate": 1.7985404077360258e-05,
"loss": 1.181,
"step": 217
},
{
"epoch": 9.909090909090908,
"grad_norm": 0.17269516793495363,
"learning_rate": 1.7781238874764337e-05,
"loss": 1.1443,
"step": 218
},
{
"epoch": 9.954545454545455,
"grad_norm": 0.20393617981346993,
"learning_rate": 1.757805275251333e-05,
"loss": 1.1991,
"step": 219
},
{
"epoch": 10.0,
"grad_norm": 0.19442135716601172,
"learning_rate": 1.737586617983534e-05,
"loss": 1.1121,
"step": 220
},
{
"epoch": 10.045454545454545,
"grad_norm": 0.17057632115064372,
"learning_rate": 1.717469952526262e-05,
"loss": 1.162,
"step": 221
},
{
"epoch": 10.090909090909092,
"grad_norm": 0.17688084966953885,
"learning_rate": 1.6974573054579582e-05,
"loss": 1.177,
"step": 222
},
{
"epoch": 10.136363636363637,
"grad_norm": 0.17485623392128088,
"learning_rate": 1.6775506928781146e-05,
"loss": 1.1594,
"step": 223
},
{
"epoch": 10.181818181818182,
"grad_norm": 0.1915709146213454,
"learning_rate": 1.6577521202041775e-05,
"loss": 1.1637,
"step": 224
},
{
"epoch": 10.227272727272727,
"grad_norm": 0.18779150088359975,
"learning_rate": 1.6380635819695172e-05,
"loss": 1.1325,
"step": 225
},
{
"epoch": 10.272727272727273,
"grad_norm": 0.17675767122639172,
"learning_rate": 1.6184870616224905e-05,
"loss": 1.1283,
"step": 226
},
{
"epoch": 10.318181818181818,
"grad_norm": 0.20187810956460173,
"learning_rate": 1.599024531326632e-05,
"loss": 1.1362,
"step": 227
},
{
"epoch": 10.363636363636363,
"grad_norm": 0.18429771837794573,
"learning_rate": 1.5796779517619757e-05,
"loss": 1.0782,
"step": 228
},
{
"epoch": 10.409090909090908,
"grad_norm": 0.18129745466430086,
"learning_rate": 1.560449271927528e-05,
"loss": 1.1556,
"step": 229
},
{
"epoch": 10.454545454545455,
"grad_norm": 0.18156111868915045,
"learning_rate": 1.541340428944929e-05,
"loss": 1.1472,
"step": 230
},
{
"epoch": 10.5,
"grad_norm": 0.2058178822209493,
"learning_rate": 1.5223533478633012e-05,
"loss": 1.1436,
"step": 231
},
{
"epoch": 10.545454545454545,
"grad_norm": 0.19420543695117135,
"learning_rate": 1.5034899414653183e-05,
"loss": 1.1632,
"step": 232
},
{
"epoch": 10.590909090909092,
"grad_norm": 0.1816721312324971,
"learning_rate": 1.4847521100745101e-05,
"loss": 1.0919,
"step": 233
},
{
"epoch": 10.636363636363637,
"grad_norm": 0.17503046181743187,
"learning_rate": 1.4661417413638206e-05,
"loss": 1.177,
"step": 234
},
{
"epoch": 10.681818181818182,
"grad_norm": 0.1996340286183738,
"learning_rate": 1.44766071016544e-05,
"loss": 1.1901,
"step": 235
},
{
"epoch": 10.727272727272727,
"grad_norm": 0.18843938813925634,
"learning_rate": 1.4293108782819345e-05,
"loss": 1.1081,
"step": 236
},
{
"epoch": 10.772727272727273,
"grad_norm": 0.18750865793808888,
"learning_rate": 1.4110940942986844e-05,
"loss": 1.0781,
"step": 237
},
{
"epoch": 10.818181818181818,
"grad_norm": 0.17785144309042253,
"learning_rate": 1.3930121933976556e-05,
"loss": 1.1961,
"step": 238
},
{
"epoch": 10.863636363636363,
"grad_norm": 0.17015756605905757,
"learning_rate": 1.37506699717252e-05,
"loss": 1.142,
"step": 239
},
{
"epoch": 10.909090909090908,
"grad_norm": 0.1976824831465818,
"learning_rate": 1.3572603134451479e-05,
"loss": 1.1024,
"step": 240
},
{
"epoch": 10.954545454545455,
"grad_norm": 0.17152795538031174,
"learning_rate": 1.3395939360834845e-05,
"loss": 1.136,
"step": 241
},
{
"epoch": 11.0,
"grad_norm": 0.1667419474833486,
"learning_rate": 1.3220696448208308e-05,
"loss": 1.1114,
"step": 242
},
{
"epoch": 11.045454545454545,
"grad_norm": 0.16490649981297714,
"learning_rate": 1.304689205076558e-05,
"loss": 1.1569,
"step": 243
},
{
"epoch": 11.090909090909092,
"grad_norm": 0.17142788488665067,
"learning_rate": 1.2874543677782508e-05,
"loss": 1.1667,
"step": 244
},
{
"epoch": 11.136363636363637,
"grad_norm": 0.1747032605597451,
"learning_rate": 1.2703668691853155e-05,
"loss": 1.1422,
"step": 245
},
{
"epoch": 11.181818181818182,
"grad_norm": 0.1680960418576758,
"learning_rate": 1.253428430714076e-05,
"loss": 1.1462,
"step": 246
},
{
"epoch": 11.227272727272727,
"grad_norm": 0.1893549688172363,
"learning_rate": 1.2366407587643432e-05,
"loss": 1.1496,
"step": 247
},
{
"epoch": 11.272727272727273,
"grad_norm": 0.2930049956267172,
"learning_rate": 1.220005544547522e-05,
"loss": 1.0953,
"step": 248
},
{
"epoch": 11.318181818181818,
"grad_norm": 0.16093892837183954,
"learning_rate": 1.2035244639162319e-05,
"loss": 1.1001,
"step": 249
},
{
"epoch": 11.363636363636363,
"grad_norm": 0.16689857435832806,
"learning_rate": 1.1871991771954748e-05,
"loss": 1.0471,
"step": 250
},
{
"epoch": 11.409090909090908,
"grad_norm": 0.17166817028161555,
"learning_rate": 1.1710313290153795e-05,
"loss": 1.0986,
"step": 251
},
{
"epoch": 11.454545454545455,
"grad_norm": 0.18190627031196485,
"learning_rate": 1.1550225481455165e-05,
"loss": 1.1788,
"step": 252
},
{
"epoch": 11.5,
"grad_norm": 0.17512791533622662,
"learning_rate": 1.1391744473308106e-05,
"loss": 1.1673,
"step": 253
},
{
"epoch": 11.545454545454545,
"grad_norm": 0.16757254457143245,
"learning_rate": 1.1234886231290759e-05,
"loss": 1.1746,
"step": 254
},
{
"epoch": 11.590909090909092,
"grad_norm": 0.1668004234347932,
"learning_rate": 1.1079666557501736e-05,
"loss": 1.1107,
"step": 255
},
{
"epoch": 11.636363636363637,
"grad_norm": 0.15359971441629242,
"learning_rate": 1.0926101088968207e-05,
"loss": 1.1658,
"step": 256
},
{
"epoch": 11.681818181818182,
"grad_norm": 0.16049551854978608,
"learning_rate": 1.0774205296070597e-05,
"loss": 1.0853,
"step": 257
},
{
"epoch": 11.727272727272727,
"grad_norm": 0.20099872095589202,
"learning_rate": 1.062399448098409e-05,
"loss": 1.0978,
"step": 258
},
{
"epoch": 11.772727272727273,
"grad_norm": 0.18578304835004,
"learning_rate": 1.0475483776137062e-05,
"loss": 1.1296,
"step": 259
},
{
"epoch": 11.818181818181818,
"grad_norm": 0.16704809777000973,
"learning_rate": 1.0328688142686627e-05,
"loss": 1.0695,
"step": 260
},
{
"epoch": 11.863636363636363,
"grad_norm": 0.17323425454468377,
"learning_rate": 1.0183622369011422e-05,
"loss": 1.077,
"step": 261
},
{
"epoch": 11.909090909090908,
"grad_norm": 0.17800221938786567,
"learning_rate": 1.0040301069221823e-05,
"loss": 1.122,
"step": 262
},
{
"epoch": 11.954545454545455,
"grad_norm": 0.17418177714949332,
"learning_rate": 9.89873868168766e-06,
"loss": 1.1375,
"step": 263
},
{
"epoch": 12.0,
"grad_norm": 0.16676746363627873,
"learning_rate": 9.758949467583754e-06,
"loss": 1.1737,
"step": 264
},
{
"epoch": 12.045454545454545,
"grad_norm": 0.1527614833387276,
"learning_rate": 9.620947509453155e-06,
"loss": 1.1136,
"step": 265
},
{
"epoch": 12.090909090909092,
"grad_norm": 0.17699034639073358,
"learning_rate": 9.484746709788451e-06,
"loss": 1.1231,
"step": 266
},
{
"epoch": 12.136363636363637,
"grad_norm": 0.17421157455050928,
"learning_rate": 9.350360789631291e-06,
"loss": 1.148,
"step": 267
},
{
"epoch": 12.181818181818182,
"grad_norm": 0.15462065956553298,
"learning_rate": 9.217803287190029e-06,
"loss": 1.1435,
"step": 268
},
{
"epoch": 12.227272727272727,
"grad_norm": 0.16543045040679552,
"learning_rate": 9.087087556475873e-06,
"loss": 1.1312,
"step": 269
},
{
"epoch": 12.272727272727273,
"grad_norm": 0.17092139127270728,
"learning_rate": 8.958226765957655e-06,
"loss": 1.1164,
"step": 270
},
{
"epoch": 12.318181818181818,
"grad_norm": 0.16435344514564526,
"learning_rate": 8.831233897235128e-06,
"loss": 1.075,
"step": 271
},
{
"epoch": 12.363636363636363,
"grad_norm": 0.16450679267899113,
"learning_rate": 8.706121743731256e-06,
"loss": 1.1508,
"step": 272
},
{
"epoch": 12.409090909090908,
"grad_norm": 0.15282580972945572,
"learning_rate": 8.58290290940337e-06,
"loss": 1.135,
"step": 273
},
{
"epoch": 12.454545454545455,
"grad_norm": 0.1555918103689812,
"learning_rate": 8.461589807473392e-06,
"loss": 1.121,
"step": 274
},
{
"epoch": 12.5,
"grad_norm": 0.17353062952008638,
"learning_rate": 8.342194659177358e-06,
"loss": 1.1849,
"step": 275
},
{
"epoch": 12.545454545454545,
"grad_norm": 0.15700310300217593,
"learning_rate": 8.224729492534231e-06,
"loss": 1.1479,
"step": 276
},
{
"epoch": 12.590909090909092,
"grad_norm": 0.23929770557768484,
"learning_rate": 8.109206141134142e-06,
"loss": 1.0834,
"step": 277
},
{
"epoch": 12.636363636363637,
"grad_norm": 0.18493048427935435,
"learning_rate": 7.995636242946305e-06,
"loss": 1.1398,
"step": 278
},
{
"epoch": 12.681818181818182,
"grad_norm": 0.16961784748611264,
"learning_rate": 7.884031239146569e-06,
"loss": 1.0651,
"step": 279
},
{
"epoch": 12.727272727272727,
"grad_norm": 0.1597693846586007,
"learning_rate": 7.774402372964833e-06,
"loss": 1.0952,
"step": 280
},
{
"epoch": 12.772727272727273,
"grad_norm": 0.16022321839887022,
"learning_rate": 7.666760688552371e-06,
"loss": 1.1269,
"step": 281
},
{
"epoch": 12.818181818181818,
"grad_norm": 0.1661994031922762,
"learning_rate": 7.5611170298692466e-06,
"loss": 1.0682,
"step": 282
},
{
"epoch": 12.863636363636363,
"grad_norm": 0.16932158388771293,
"learning_rate": 7.4574820395918735e-06,
"loss": 1.163,
"step": 283
},
{
"epoch": 12.909090909090908,
"grad_norm": 0.15828697644374046,
"learning_rate": 7.3558661580408545e-06,
"loss": 1.1466,
"step": 284
},
{
"epoch": 12.954545454545455,
"grad_norm": 0.25782602954860684,
"learning_rate": 7.256279622129215e-06,
"loss": 0.9785,
"step": 285
},
{
"epoch": 13.0,
"grad_norm": 0.15598380461908062,
"learning_rate": 7.15873246433113e-06,
"loss": 1.1248,
"step": 286
},
{
"epoch": 13.045454545454545,
"grad_norm": 0.16456745649594778,
"learning_rate": 7.063234511671206e-06,
"loss": 1.1426,
"step": 287
},
{
"epoch": 13.090909090909092,
"grad_norm": 0.17833577442195056,
"learning_rate": 6.969795384734556e-06,
"loss": 1.1278,
"step": 288
},
{
"epoch": 13.136363636363637,
"grad_norm": 0.16618049623992226,
"learning_rate": 6.878424496697554e-06,
"loss": 1.0637,
"step": 289
},
{
"epoch": 13.181818181818182,
"grad_norm": 0.1548002033964038,
"learning_rate": 6.789131052379549e-06,
"loss": 1.1438,
"step": 290
},
{
"epoch": 13.227272727272727,
"grad_norm": 0.15277655748023802,
"learning_rate": 6.7019240473155924e-06,
"loss": 1.0657,
"step": 291
},
{
"epoch": 13.272727272727273,
"grad_norm": 0.24777282879928914,
"learning_rate": 6.616812266850187e-06,
"loss": 1.159,
"step": 292
},
{
"epoch": 13.318181818181818,
"grad_norm": 0.19128988519206985,
"learning_rate": 6.5338042852522305e-06,
"loss": 1.1772,
"step": 293
},
{
"epoch": 13.363636363636363,
"grad_norm": 0.17365339407446134,
"learning_rate": 6.4529084648512815e-06,
"loss": 1.0807,
"step": 294
},
{
"epoch": 13.409090909090908,
"grad_norm": 0.16321204192747293,
"learning_rate": 6.374132955195062e-06,
"loss": 1.1293,
"step": 295
},
{
"epoch": 13.454545454545455,
"grad_norm": 0.1754755861725117,
"learning_rate": 6.297485692228512e-06,
"loss": 1.1434,
"step": 296
},
{
"epoch": 13.5,
"grad_norm": 0.14452663905974894,
"learning_rate": 6.222974397494309e-06,
"loss": 1.0709,
"step": 297
},
{
"epoch": 13.545454545454545,
"grad_norm": 0.14708535357644448,
"learning_rate": 6.150606577354948e-06,
"loss": 1.0964,
"step": 298
},
{
"epoch": 13.590909090909092,
"grad_norm": 0.16471061622026806,
"learning_rate": 6.080389522236585e-06,
"loss": 1.114,
"step": 299
},
{
"epoch": 13.636363636363637,
"grad_norm": 0.175358096168744,
"learning_rate": 6.012330305894584e-06,
"loss": 1.0573,
"step": 300
},
{
"epoch": 13.681818181818182,
"grad_norm": 0.15744915474687687,
"learning_rate": 5.946435784700869e-06,
"loss": 1.1256,
"step": 301
},
{
"epoch": 13.727272727272727,
"grad_norm": 0.1594724312088306,
"learning_rate": 5.8827125969532365e-06,
"loss": 1.0757,
"step": 302
},
{
"epoch": 13.772727272727273,
"grad_norm": 0.16045998825705504,
"learning_rate": 5.82116716220659e-06,
"loss": 1.1302,
"step": 303
},
{
"epoch": 13.818181818181818,
"grad_norm": 0.15174228657227892,
"learning_rate": 5.76180568062623e-06,
"loss": 1.1555,
"step": 304
},
{
"epoch": 13.863636363636363,
"grad_norm": 0.16291888537269678,
"learning_rate": 5.704634132363239e-06,
"loss": 1.131,
"step": 305
},
{
"epoch": 13.909090909090908,
"grad_norm": 0.15722592315890294,
"learning_rate": 5.649658276952029e-06,
"loss": 1.0328,
"step": 306
},
{
"epoch": 13.954545454545455,
"grad_norm": 0.15455642700386937,
"learning_rate": 5.596883652730137e-06,
"loss": 1.0786,
"step": 307
},
{
"epoch": 14.0,
"grad_norm": 0.16018814750974167,
"learning_rate": 5.546315576280258e-06,
"loss": 1.0977,
"step": 308
},
{
"epoch": 14.045454545454545,
"grad_norm": 0.15691439028906481,
"learning_rate": 5.497959141894671e-06,
"loss": 1.1606,
"step": 309
},
{
"epoch": 14.090909090909092,
"grad_norm": 0.15808702196585686,
"learning_rate": 5.451819221062024e-06,
"loss": 1.1181,
"step": 310
},
{
"epoch": 14.136363636363637,
"grad_norm": 0.15115950620778784,
"learning_rate": 5.4079004619765614e-06,
"loss": 1.1576,
"step": 311
},
{
"epoch": 14.181818181818182,
"grad_norm": 0.15083892267087215,
"learning_rate": 5.3662072890698845e-06,
"loss": 1.1048,
"step": 312
},
{
"epoch": 14.227272727272727,
"grad_norm": 0.1481235847517611,
"learning_rate": 5.326743902565208e-06,
"loss": 1.0597,
"step": 313
},
{
"epoch": 14.272727272727273,
"grad_norm": 0.17487949113995144,
"learning_rate": 5.289514278054232e-06,
"loss": 1.1048,
"step": 314
},
{
"epoch": 14.318181818181818,
"grad_norm": 0.1565612302630643,
"learning_rate": 5.254522166096635e-06,
"loss": 1.1404,
"step": 315
},
{
"epoch": 14.363636363636363,
"grad_norm": 0.15463375307382266,
"learning_rate": 5.221771091842242e-06,
"loss": 1.0867,
"step": 316
},
{
"epoch": 14.409090909090908,
"grad_norm": 0.14777974752978296,
"learning_rate": 5.191264354675882e-06,
"loss": 1.1297,
"step": 317
},
{
"epoch": 14.454545454545455,
"grad_norm": 0.15705272728049427,
"learning_rate": 5.1630050278850275e-06,
"loss": 1.1302,
"step": 318
},
{
"epoch": 14.5,
"grad_norm": 0.15391726300293704,
"learning_rate": 5.136995958350162e-06,
"loss": 1.1421,
"step": 319
},
{
"epoch": 14.545454545454545,
"grad_norm": 0.15288254329961756,
"learning_rate": 5.113239766257999e-06,
"loss": 1.1455,
"step": 320
},
{
"epoch": 14.590909090909092,
"grad_norm": 0.14947122968152077,
"learning_rate": 5.091738844837518e-06,
"loss": 1.0706,
"step": 321
},
{
"epoch": 14.636363636363637,
"grad_norm": 0.16745909284503774,
"learning_rate": 5.0724953601188635e-06,
"loss": 1.0375,
"step": 322
},
{
"epoch": 14.681818181818182,
"grad_norm": 0.1515771191801389,
"learning_rate": 5.0555112507151364e-06,
"loss": 1.1166,
"step": 323
},
{
"epoch": 14.727272727272727,
"grad_norm": 0.15386111047539605,
"learning_rate": 5.0407882276271015e-06,
"loss": 1.0891,
"step": 324
},
{
"epoch": 14.772727272727273,
"grad_norm": 1.3735036578655186,
"learning_rate": 5.028327774070807e-06,
"loss": 1.0836,
"step": 325
},
{
"epoch": 14.818181818181818,
"grad_norm": 0.1570470593366731,
"learning_rate": 5.018131145328181e-06,
"loss": 1.0566,
"step": 326
},
{
"epoch": 14.863636363636363,
"grad_norm": 0.15950636214118602,
"learning_rate": 5.0101993686205585e-06,
"loss": 1.1019,
"step": 327
},
{
"epoch": 14.909090909090908,
"grad_norm": 0.15627633379629186,
"learning_rate": 5.004533243005204e-06,
"loss": 1.0666,
"step": 328
},
{
"epoch": 14.954545454545455,
"grad_norm": 0.157736145851946,
"learning_rate": 5.0011333392948126e-06,
"loss": 1.1023,
"step": 329
},
{
"epoch": 15.0,
"grad_norm": 0.15122659418177306,
"learning_rate": 5e-06,
"loss": 1.1027,
"step": 330
},
{
"epoch": 15.0,
"step": 330,
"total_flos": 103645451026432.0,
"train_loss": 1.592864108988733,
"train_runtime": 2075.5919,
"train_samples_per_second": 2.544,
"train_steps_per_second": 0.159
}
],
"logging_steps": 1,
"max_steps": 330,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 103645451026432.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}