{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7122198120028923,
"eval_steps": 692,
"global_step": 985,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007230657989877079,
"grad_norm": 0.37217167019844055,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.5113,
"step": 1
},
{
"epoch": 0.0007230657989877079,
"eval_loss": 1.3881797790527344,
"eval_runtime": 667.6567,
"eval_samples_per_second": 3.49,
"eval_steps_per_second": 0.873,
"step": 1
},
{
"epoch": 0.0014461315979754157,
"grad_norm": 0.26884201169013977,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.3991,
"step": 2
},
{
"epoch": 0.0021691973969631237,
"grad_norm": 0.42644935846328735,
"learning_rate": 6.000000000000001e-07,
"loss": 1.5952,
"step": 3
},
{
"epoch": 0.0028922631959508315,
"grad_norm": 0.29478296637535095,
"learning_rate": 8.000000000000001e-07,
"loss": 1.3243,
"step": 4
},
{
"epoch": 0.0036153289949385392,
"grad_norm": 0.41465193033218384,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.4471,
"step": 5
},
{
"epoch": 0.004338394793926247,
"grad_norm": 0.25165989995002747,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.3216,
"step": 6
},
{
"epoch": 0.005061460592913955,
"grad_norm": 0.2920430600643158,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.4116,
"step": 7
},
{
"epoch": 0.005784526391901663,
"grad_norm": 0.3292546272277832,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.4948,
"step": 8
},
{
"epoch": 0.006507592190889371,
"grad_norm": 0.2700996696949005,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.2588,
"step": 9
},
{
"epoch": 0.0072306579898770785,
"grad_norm": 0.268759161233902,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.4552,
"step": 10
},
{
"epoch": 0.007953723788864787,
"grad_norm": 0.3097935616970062,
"learning_rate": 2.2e-06,
"loss": 1.2995,
"step": 11
},
{
"epoch": 0.008676789587852495,
"grad_norm": 0.3226897120475769,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.439,
"step": 12
},
{
"epoch": 0.009399855386840203,
"grad_norm": 0.2564990222454071,
"learning_rate": 2.6e-06,
"loss": 1.4056,
"step": 13
},
{
"epoch": 0.01012292118582791,
"grad_norm": 0.2508530020713806,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.3216,
"step": 14
},
{
"epoch": 0.010845986984815618,
"grad_norm": 0.2531338632106781,
"learning_rate": 3e-06,
"loss": 1.4021,
"step": 15
},
{
"epoch": 0.011569052783803326,
"grad_norm": 0.45660316944122314,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.8103,
"step": 16
},
{
"epoch": 0.012292118582791034,
"grad_norm": 0.2740483283996582,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.2811,
"step": 17
},
{
"epoch": 0.013015184381778741,
"grad_norm": 0.262483686208725,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.4296,
"step": 18
},
{
"epoch": 0.01373825018076645,
"grad_norm": 0.23837727308273315,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.3429,
"step": 19
},
{
"epoch": 0.014461315979754157,
"grad_norm": 0.2985168993473053,
"learning_rate": 4.000000000000001e-06,
"loss": 1.53,
"step": 20
},
{
"epoch": 0.015184381778741865,
"grad_norm": 0.3057151138782501,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.3555,
"step": 21
},
{
"epoch": 0.015907447577729574,
"grad_norm": 0.2920631766319275,
"learning_rate": 4.4e-06,
"loss": 1.3522,
"step": 22
},
{
"epoch": 0.016630513376717282,
"grad_norm": 0.2631171941757202,
"learning_rate": 4.600000000000001e-06,
"loss": 1.2649,
"step": 23
},
{
"epoch": 0.01735357917570499,
"grad_norm": 0.2993241250514984,
"learning_rate": 4.800000000000001e-06,
"loss": 1.4431,
"step": 24
},
{
"epoch": 0.018076644974692697,
"grad_norm": 0.2654544711112976,
"learning_rate": 5e-06,
"loss": 1.3019,
"step": 25
},
{
"epoch": 0.018799710773680405,
"grad_norm": 0.28488221764564514,
"learning_rate": 5.2e-06,
"loss": 1.3105,
"step": 26
},
{
"epoch": 0.019522776572668113,
"grad_norm": 0.28586897253990173,
"learning_rate": 5.400000000000001e-06,
"loss": 1.3502,
"step": 27
},
{
"epoch": 0.02024584237165582,
"grad_norm": 0.2833098769187927,
"learning_rate": 5.600000000000001e-06,
"loss": 1.3046,
"step": 28
},
{
"epoch": 0.02096890817064353,
"grad_norm": 0.24501250684261322,
"learning_rate": 5.8e-06,
"loss": 1.2376,
"step": 29
},
{
"epoch": 0.021691973969631236,
"grad_norm": 0.4207770824432373,
"learning_rate": 6e-06,
"loss": 1.5645,
"step": 30
},
{
"epoch": 0.022415039768618944,
"grad_norm": 0.5098739862442017,
"learning_rate": 6.200000000000001e-06,
"loss": 1.5241,
"step": 31
},
{
"epoch": 0.023138105567606652,
"grad_norm": 0.31374362111091614,
"learning_rate": 6.4000000000000006e-06,
"loss": 1.4496,
"step": 32
},
{
"epoch": 0.02386117136659436,
"grad_norm": 0.49290239810943604,
"learning_rate": 6.600000000000001e-06,
"loss": 1.626,
"step": 33
},
{
"epoch": 0.024584237165582067,
"grad_norm": 0.31210431456565857,
"learning_rate": 6.800000000000001e-06,
"loss": 1.3339,
"step": 34
},
{
"epoch": 0.025307302964569775,
"grad_norm": 0.3143630623817444,
"learning_rate": 7e-06,
"loss": 1.2261,
"step": 35
},
{
"epoch": 0.026030368763557483,
"grad_norm": 0.4381314218044281,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.4051,
"step": 36
},
{
"epoch": 0.02675343456254519,
"grad_norm": 0.3636917769908905,
"learning_rate": 7.4e-06,
"loss": 1.3815,
"step": 37
},
{
"epoch": 0.0274765003615329,
"grad_norm": 0.3755267560482025,
"learning_rate": 7.600000000000001e-06,
"loss": 1.3677,
"step": 38
},
{
"epoch": 0.028199566160520606,
"grad_norm": 0.3734245300292969,
"learning_rate": 7.800000000000002e-06,
"loss": 1.4356,
"step": 39
},
{
"epoch": 0.028922631959508314,
"grad_norm": 0.3986704349517822,
"learning_rate": 8.000000000000001e-06,
"loss": 1.4567,
"step": 40
},
{
"epoch": 0.02964569775849602,
"grad_norm": 0.39432185888290405,
"learning_rate": 8.2e-06,
"loss": 1.5894,
"step": 41
},
{
"epoch": 0.03036876355748373,
"grad_norm": 0.5450723767280579,
"learning_rate": 8.400000000000001e-06,
"loss": 1.569,
"step": 42
},
{
"epoch": 0.03109182935647144,
"grad_norm": 0.38127774000167847,
"learning_rate": 8.6e-06,
"loss": 1.472,
"step": 43
},
{
"epoch": 0.03181489515545915,
"grad_norm": 0.4520113468170166,
"learning_rate": 8.8e-06,
"loss": 1.4103,
"step": 44
},
{
"epoch": 0.03253796095444685,
"grad_norm": 0.382798969745636,
"learning_rate": 9e-06,
"loss": 1.3647,
"step": 45
},
{
"epoch": 0.033261026753434564,
"grad_norm": 0.4572380483150482,
"learning_rate": 9.200000000000002e-06,
"loss": 1.4196,
"step": 46
},
{
"epoch": 0.03398409255242227,
"grad_norm": 0.3955709934234619,
"learning_rate": 9.4e-06,
"loss": 1.4654,
"step": 47
},
{
"epoch": 0.03470715835140998,
"grad_norm": 0.5052328109741211,
"learning_rate": 9.600000000000001e-06,
"loss": 1.6222,
"step": 48
},
{
"epoch": 0.035430224150397684,
"grad_norm": 0.3734697699546814,
"learning_rate": 9.800000000000001e-06,
"loss": 1.3473,
"step": 49
},
{
"epoch": 0.036153289949385395,
"grad_norm": 0.3865366280078888,
"learning_rate": 1e-05,
"loss": 1.4105,
"step": 50
},
{
"epoch": 0.0368763557483731,
"grad_norm": 0.42371755838394165,
"learning_rate": 1.02e-05,
"loss": 1.4933,
"step": 51
},
{
"epoch": 0.03759942154736081,
"grad_norm": 0.34533318877220154,
"learning_rate": 1.04e-05,
"loss": 1.214,
"step": 52
},
{
"epoch": 0.038322487346348515,
"grad_norm": 0.3520753085613251,
"learning_rate": 1.0600000000000002e-05,
"loss": 1.2373,
"step": 53
},
{
"epoch": 0.039045553145336226,
"grad_norm": 0.6355977058410645,
"learning_rate": 1.0800000000000002e-05,
"loss": 1.5411,
"step": 54
},
{
"epoch": 0.03976861894432393,
"grad_norm": 0.46781396865844727,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.4157,
"step": 55
},
{
"epoch": 0.04049168474331164,
"grad_norm": 0.3881015479564667,
"learning_rate": 1.1200000000000001e-05,
"loss": 1.3046,
"step": 56
},
{
"epoch": 0.04121475054229935,
"grad_norm": 0.5843562483787537,
"learning_rate": 1.14e-05,
"loss": 1.5255,
"step": 57
},
{
"epoch": 0.04193781634128706,
"grad_norm": 0.36714574694633484,
"learning_rate": 1.16e-05,
"loss": 1.356,
"step": 58
},
{
"epoch": 0.04266088214027477,
"grad_norm": 0.3784966468811035,
"learning_rate": 1.18e-05,
"loss": 1.1724,
"step": 59
},
{
"epoch": 0.04338394793926247,
"grad_norm": 0.421464204788208,
"learning_rate": 1.2e-05,
"loss": 1.4358,
"step": 60
},
{
"epoch": 0.044107013738250184,
"grad_norm": 0.3749872148036957,
"learning_rate": 1.22e-05,
"loss": 1.2683,
"step": 61
},
{
"epoch": 0.04483007953723789,
"grad_norm": 0.3953036665916443,
"learning_rate": 1.2400000000000002e-05,
"loss": 1.2323,
"step": 62
},
{
"epoch": 0.0455531453362256,
"grad_norm": 0.47576940059661865,
"learning_rate": 1.2600000000000001e-05,
"loss": 1.5067,
"step": 63
},
{
"epoch": 0.046276211135213303,
"grad_norm": 0.3674280643463135,
"learning_rate": 1.2800000000000001e-05,
"loss": 1.2926,
"step": 64
},
{
"epoch": 0.046999276934201015,
"grad_norm": 0.4777793288230896,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.4084,
"step": 65
},
{
"epoch": 0.04772234273318872,
"grad_norm": 0.5235625505447388,
"learning_rate": 1.3200000000000002e-05,
"loss": 1.4148,
"step": 66
},
{
"epoch": 0.04844540853217643,
"grad_norm": 0.43167218565940857,
"learning_rate": 1.3400000000000002e-05,
"loss": 1.2986,
"step": 67
},
{
"epoch": 0.049168474331164135,
"grad_norm": 0.35833850502967834,
"learning_rate": 1.3600000000000002e-05,
"loss": 1.2428,
"step": 68
},
{
"epoch": 0.049891540130151846,
"grad_norm": 0.4315280616283417,
"learning_rate": 1.38e-05,
"loss": 1.2497,
"step": 69
},
{
"epoch": 0.05061460592913955,
"grad_norm": 0.5640541315078735,
"learning_rate": 1.4e-05,
"loss": 1.3195,
"step": 70
},
{
"epoch": 0.05133767172812726,
"grad_norm": 0.3889266848564148,
"learning_rate": 1.4200000000000001e-05,
"loss": 1.2181,
"step": 71
},
{
"epoch": 0.052060737527114966,
"grad_norm": 0.5410600900650024,
"learning_rate": 1.4400000000000001e-05,
"loss": 1.3649,
"step": 72
},
{
"epoch": 0.05278380332610268,
"grad_norm": 0.4043705463409424,
"learning_rate": 1.46e-05,
"loss": 1.3354,
"step": 73
},
{
"epoch": 0.05350686912509038,
"grad_norm": 0.2976624071598053,
"learning_rate": 1.48e-05,
"loss": 1.2689,
"step": 74
},
{
"epoch": 0.05422993492407809,
"grad_norm": 0.4049951136112213,
"learning_rate": 1.5000000000000002e-05,
"loss": 1.3255,
"step": 75
},
{
"epoch": 0.0549530007230658,
"grad_norm": 0.35892027616500854,
"learning_rate": 1.5200000000000002e-05,
"loss": 1.1894,
"step": 76
},
{
"epoch": 0.05567606652205351,
"grad_norm": 0.46196767687797546,
"learning_rate": 1.54e-05,
"loss": 1.3013,
"step": 77
},
{
"epoch": 0.05639913232104121,
"grad_norm": 0.46217429637908936,
"learning_rate": 1.5600000000000003e-05,
"loss": 1.1938,
"step": 78
},
{
"epoch": 0.05712219812002892,
"grad_norm": 1.1090481281280518,
"learning_rate": 1.58e-05,
"loss": 1.5665,
"step": 79
},
{
"epoch": 0.05784526391901663,
"grad_norm": 0.5215122699737549,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.3107,
"step": 80
},
{
"epoch": 0.05856832971800434,
"grad_norm": 0.31574738025665283,
"learning_rate": 1.62e-05,
"loss": 1.2218,
"step": 81
},
{
"epoch": 0.05929139551699204,
"grad_norm": 0.34580984711647034,
"learning_rate": 1.64e-05,
"loss": 1.1997,
"step": 82
},
{
"epoch": 0.060014461315979754,
"grad_norm": 0.4590187966823578,
"learning_rate": 1.66e-05,
"loss": 1.3401,
"step": 83
},
{
"epoch": 0.06073752711496746,
"grad_norm": 0.40272387862205505,
"learning_rate": 1.6800000000000002e-05,
"loss": 1.256,
"step": 84
},
{
"epoch": 0.06146059291395517,
"grad_norm": 0.4131518006324768,
"learning_rate": 1.7e-05,
"loss": 1.1237,
"step": 85
},
{
"epoch": 0.06218365871294288,
"grad_norm": 0.3303501605987549,
"learning_rate": 1.72e-05,
"loss": 1.2023,
"step": 86
},
{
"epoch": 0.06290672451193059,
"grad_norm": 0.314738929271698,
"learning_rate": 1.7400000000000003e-05,
"loss": 1.2153,
"step": 87
},
{
"epoch": 0.0636297903109183,
"grad_norm": 0.31625810265541077,
"learning_rate": 1.76e-05,
"loss": 1.2048,
"step": 88
},
{
"epoch": 0.064352856109906,
"grad_norm": 0.36682891845703125,
"learning_rate": 1.7800000000000002e-05,
"loss": 1.278,
"step": 89
},
{
"epoch": 0.0650759219088937,
"grad_norm": 0.3271387219429016,
"learning_rate": 1.8e-05,
"loss": 1.3014,
"step": 90
},
{
"epoch": 0.06579898770788142,
"grad_norm": 0.38205042481422424,
"learning_rate": 1.8200000000000002e-05,
"loss": 1.2683,
"step": 91
},
{
"epoch": 0.06652205350686913,
"grad_norm": 0.3368231952190399,
"learning_rate": 1.8400000000000003e-05,
"loss": 1.3805,
"step": 92
},
{
"epoch": 0.06724511930585683,
"grad_norm": 0.462415874004364,
"learning_rate": 1.86e-05,
"loss": 1.2521,
"step": 93
},
{
"epoch": 0.06796818510484454,
"grad_norm": 0.3378755450248718,
"learning_rate": 1.88e-05,
"loss": 1.1164,
"step": 94
},
{
"epoch": 0.06869125090383225,
"grad_norm": 0.3311493694782257,
"learning_rate": 1.9e-05,
"loss": 1.223,
"step": 95
},
{
"epoch": 0.06941431670281996,
"grad_norm": 0.3691946268081665,
"learning_rate": 1.9200000000000003e-05,
"loss": 1.1876,
"step": 96
},
{
"epoch": 0.07013738250180766,
"grad_norm": 0.40734970569610596,
"learning_rate": 1.94e-05,
"loss": 1.2732,
"step": 97
},
{
"epoch": 0.07086044830079537,
"grad_norm": 1.0996010303497314,
"learning_rate": 1.9600000000000002e-05,
"loss": 1.4193,
"step": 98
},
{
"epoch": 0.07158351409978309,
"grad_norm": 0.4469655752182007,
"learning_rate": 1.98e-05,
"loss": 1.1684,
"step": 99
},
{
"epoch": 0.07230657989877079,
"grad_norm": 0.3527953028678894,
"learning_rate": 2e-05,
"loss": 1.1232,
"step": 100
},
{
"epoch": 0.0730296456977585,
"grad_norm": 0.33126530051231384,
"learning_rate": 1.999999893747778e-05,
"loss": 1.1679,
"step": 101
},
{
"epoch": 0.0737527114967462,
"grad_norm": 0.2917117476463318,
"learning_rate": 1.999999574991134e-05,
"loss": 1.1278,
"step": 102
},
{
"epoch": 0.07447577729573392,
"grad_norm": 0.3925279676914215,
"learning_rate": 1.999999043730136e-05,
"loss": 1.2538,
"step": 103
},
{
"epoch": 0.07519884309472162,
"grad_norm": 0.5860010981559753,
"learning_rate": 1.999998299964897e-05,
"loss": 1.0725,
"step": 104
},
{
"epoch": 0.07592190889370933,
"grad_norm": 0.37837302684783936,
"learning_rate": 1.999997343695575e-05,
"loss": 1.2131,
"step": 105
},
{
"epoch": 0.07664497469269703,
"grad_norm": 0.32298776507377625,
"learning_rate": 1.999996174922373e-05,
"loss": 1.1068,
"step": 106
},
{
"epoch": 0.07736804049168475,
"grad_norm": 0.2889445424079895,
"learning_rate": 1.999994793645539e-05,
"loss": 1.1648,
"step": 107
},
{
"epoch": 0.07809110629067245,
"grad_norm": 0.326594740152359,
"learning_rate": 1.9999931998653677e-05,
"loss": 1.1019,
"step": 108
},
{
"epoch": 0.07881417208966016,
"grad_norm": 0.37818118929862976,
"learning_rate": 1.9999913935821973e-05,
"loss": 1.1402,
"step": 109
},
{
"epoch": 0.07953723788864786,
"grad_norm": 0.3480512201786041,
"learning_rate": 1.9999893747964108e-05,
"loss": 1.1359,
"step": 110
},
{
"epoch": 0.08026030368763558,
"grad_norm": 0.2945075035095215,
"learning_rate": 1.9999871435084384e-05,
"loss": 1.1261,
"step": 111
},
{
"epoch": 0.08098336948662328,
"grad_norm": 0.3005722463130951,
"learning_rate": 1.9999846997187535e-05,
"loss": 1.1119,
"step": 112
},
{
"epoch": 0.08170643528561099,
"grad_norm": 0.38726744055747986,
"learning_rate": 1.9999820434278755e-05,
"loss": 1.231,
"step": 113
},
{
"epoch": 0.0824295010845987,
"grad_norm": 0.3677898943424225,
"learning_rate": 1.999979174636369e-05,
"loss": 1.0692,
"step": 114
},
{
"epoch": 0.08315256688358641,
"grad_norm": 0.45312801003456116,
"learning_rate": 1.9999760933448443e-05,
"loss": 1.1273,
"step": 115
},
{
"epoch": 0.08387563268257411,
"grad_norm": 0.33305323123931885,
"learning_rate": 1.999972799553955e-05,
"loss": 1.2399,
"step": 116
},
{
"epoch": 0.08459869848156182,
"grad_norm": 0.2953278124332428,
"learning_rate": 1.9999692932644016e-05,
"loss": 1.2297,
"step": 117
},
{
"epoch": 0.08532176428054954,
"grad_norm": 0.5426760315895081,
"learning_rate": 1.9999655744769292e-05,
"loss": 1.0915,
"step": 118
},
{
"epoch": 0.08604483007953724,
"grad_norm": 0.7710702419281006,
"learning_rate": 1.999961643192328e-05,
"loss": 1.1106,
"step": 119
},
{
"epoch": 0.08676789587852494,
"grad_norm": 0.4938242435455322,
"learning_rate": 1.9999574994114336e-05,
"loss": 1.1751,
"step": 120
},
{
"epoch": 0.08749096167751265,
"grad_norm": 0.2793751657009125,
"learning_rate": 1.9999531431351263e-05,
"loss": 1.0712,
"step": 121
},
{
"epoch": 0.08821402747650037,
"grad_norm": 0.37205490469932556,
"learning_rate": 1.999948574364332e-05,
"loss": 1.1263,
"step": 122
},
{
"epoch": 0.08893709327548807,
"grad_norm": 0.25429850816726685,
"learning_rate": 1.9999437931000213e-05,
"loss": 1.0495,
"step": 123
},
{
"epoch": 0.08966015907447578,
"grad_norm": 0.3983619213104248,
"learning_rate": 1.9999387993432107e-05,
"loss": 1.1422,
"step": 124
},
{
"epoch": 0.09038322487346348,
"grad_norm": 0.27335548400878906,
"learning_rate": 1.9999335930949612e-05,
"loss": 1.1263,
"step": 125
},
{
"epoch": 0.0911062906724512,
"grad_norm": 0.24053499102592468,
"learning_rate": 1.9999281743563788e-05,
"loss": 1.1613,
"step": 126
},
{
"epoch": 0.0918293564714389,
"grad_norm": 0.2247527539730072,
"learning_rate": 1.999922543128616e-05,
"loss": 1.0831,
"step": 127
},
{
"epoch": 0.09255242227042661,
"grad_norm": 0.3363755941390991,
"learning_rate": 1.9999166994128686e-05,
"loss": 1.2648,
"step": 128
},
{
"epoch": 0.09327548806941431,
"grad_norm": 0.31380873918533325,
"learning_rate": 1.9999106432103785e-05,
"loss": 1.1549,
"step": 129
},
{
"epoch": 0.09399855386840203,
"grad_norm": 0.40868130326271057,
"learning_rate": 1.9999043745224324e-05,
"loss": 1.0049,
"step": 130
},
{
"epoch": 0.09472161966738973,
"grad_norm": 0.46924683451652527,
"learning_rate": 1.999897893350363e-05,
"loss": 1.3451,
"step": 131
},
{
"epoch": 0.09544468546637744,
"grad_norm": 0.42218780517578125,
"learning_rate": 1.9998911996955478e-05,
"loss": 1.1618,
"step": 132
},
{
"epoch": 0.09616775126536514,
"grad_norm": 0.24168308079242706,
"learning_rate": 1.999884293559408e-05,
"loss": 1.1712,
"step": 133
},
{
"epoch": 0.09689081706435286,
"grad_norm": 0.41752415895462036,
"learning_rate": 1.9998771749434128e-05,
"loss": 1.2094,
"step": 134
},
{
"epoch": 0.09761388286334056,
"grad_norm": 0.4695405960083008,
"learning_rate": 1.999869843849074e-05,
"loss": 1.08,
"step": 135
},
{
"epoch": 0.09833694866232827,
"grad_norm": 0.36969393491744995,
"learning_rate": 1.9998623002779493e-05,
"loss": 1.1485,
"step": 136
},
{
"epoch": 0.09906001446131597,
"grad_norm": 0.31913572549819946,
"learning_rate": 1.9998545442316422e-05,
"loss": 1.1797,
"step": 137
},
{
"epoch": 0.09978308026030369,
"grad_norm": 0.3533536493778229,
"learning_rate": 1.9998465757118007e-05,
"loss": 1.0771,
"step": 138
},
{
"epoch": 0.1005061460592914,
"grad_norm": 0.31527888774871826,
"learning_rate": 1.999838394720118e-05,
"loss": 1.1364,
"step": 139
},
{
"epoch": 0.1012292118582791,
"grad_norm": 0.274924635887146,
"learning_rate": 1.9998300012583333e-05,
"loss": 1.0948,
"step": 140
},
{
"epoch": 0.1019522776572668,
"grad_norm": 0.6106323599815369,
"learning_rate": 1.9998213953282297e-05,
"loss": 1.2224,
"step": 141
},
{
"epoch": 0.10267534345625452,
"grad_norm": 0.4037460386753082,
"learning_rate": 1.999812576931636e-05,
"loss": 1.1291,
"step": 142
},
{
"epoch": 0.10339840925524223,
"grad_norm": 0.3101276457309723,
"learning_rate": 1.999803546070426e-05,
"loss": 1.2032,
"step": 143
},
{
"epoch": 0.10412147505422993,
"grad_norm": 0.2807331085205078,
"learning_rate": 1.9997943027465187e-05,
"loss": 1.0995,
"step": 144
},
{
"epoch": 0.10484454085321765,
"grad_norm": 0.2920645773410797,
"learning_rate": 1.999784846961879e-05,
"loss": 1.0696,
"step": 145
},
{
"epoch": 0.10556760665220535,
"grad_norm": 0.2744218409061432,
"learning_rate": 1.9997751787185163e-05,
"loss": 1.0342,
"step": 146
},
{
"epoch": 0.10629067245119306,
"grad_norm": 0.3686463534832001,
"learning_rate": 1.999765298018484e-05,
"loss": 1.0401,
"step": 147
},
{
"epoch": 0.10701373825018076,
"grad_norm": 0.3631075322628021,
"learning_rate": 1.9997552048638833e-05,
"loss": 1.083,
"step": 148
},
{
"epoch": 0.10773680404916848,
"grad_norm": 0.4366123080253601,
"learning_rate": 1.999744899256858e-05,
"loss": 1.077,
"step": 149
},
{
"epoch": 0.10845986984815618,
"grad_norm": 0.27256667613983154,
"learning_rate": 1.9997343811995985e-05,
"loss": 1.0411,
"step": 150
},
{
"epoch": 0.10918293564714389,
"grad_norm": 0.2851492464542389,
"learning_rate": 1.99972365069434e-05,
"loss": 1.033,
"step": 151
},
{
"epoch": 0.1099060014461316,
"grad_norm": 0.2830626368522644,
"learning_rate": 1.999712707743362e-05,
"loss": 1.0917,
"step": 152
},
{
"epoch": 0.11062906724511931,
"grad_norm": 0.30642372369766235,
"learning_rate": 1.9997015523489912e-05,
"loss": 1.0848,
"step": 153
},
{
"epoch": 0.11135213304410702,
"grad_norm": 0.29700708389282227,
"learning_rate": 1.999690184513597e-05,
"loss": 1.1311,
"step": 154
},
{
"epoch": 0.11207519884309472,
"grad_norm": 0.3125067949295044,
"learning_rate": 1.999678604239596e-05,
"loss": 1.0089,
"step": 155
},
{
"epoch": 0.11279826464208242,
"grad_norm": 0.26634660363197327,
"learning_rate": 1.9996668115294486e-05,
"loss": 0.8978,
"step": 156
},
{
"epoch": 0.11352133044107014,
"grad_norm": 0.3419530689716339,
"learning_rate": 1.9996548063856606e-05,
"loss": 1.1423,
"step": 157
},
{
"epoch": 0.11424439624005785,
"grad_norm": 0.3322874903678894,
"learning_rate": 1.999642588810784e-05,
"loss": 1.1393,
"step": 158
},
{
"epoch": 0.11496746203904555,
"grad_norm": 0.39157527685165405,
"learning_rate": 1.999630158807414e-05,
"loss": 1.1042,
"step": 159
},
{
"epoch": 0.11569052783803326,
"grad_norm": 0.2635056674480438,
"learning_rate": 1.999617516378193e-05,
"loss": 1.1925,
"step": 160
},
{
"epoch": 0.11641359363702097,
"grad_norm": 0.28288525342941284,
"learning_rate": 1.999604661525807e-05,
"loss": 1.1529,
"step": 161
},
{
"epoch": 0.11713665943600868,
"grad_norm": 0.2699298560619354,
"learning_rate": 1.9995915942529875e-05,
"loss": 1.1823,
"step": 162
},
{
"epoch": 0.11785972523499638,
"grad_norm": 0.2862537205219269,
"learning_rate": 1.999578314562512e-05,
"loss": 1.1446,
"step": 163
},
{
"epoch": 0.11858279103398409,
"grad_norm": 0.3110126852989197,
"learning_rate": 1.9995648224572023e-05,
"loss": 1.1324,
"step": 164
},
{
"epoch": 0.1193058568329718,
"grad_norm": 0.29105737805366516,
"learning_rate": 1.9995511179399253e-05,
"loss": 1.1826,
"step": 165
},
{
"epoch": 0.12002892263195951,
"grad_norm": 0.29570046067237854,
"learning_rate": 1.999537201013593e-05,
"loss": 1.1039,
"step": 166
},
{
"epoch": 0.12075198843094721,
"grad_norm": 0.3625316619873047,
"learning_rate": 1.9995230716811637e-05,
"loss": 1.22,
"step": 167
},
{
"epoch": 0.12147505422993492,
"grad_norm": 0.2676149606704712,
"learning_rate": 1.9995087299456395e-05,
"loss": 1.112,
"step": 168
},
{
"epoch": 0.12219812002892264,
"grad_norm": 0.2860702574253082,
"learning_rate": 1.9994941758100677e-05,
"loss": 1.1972,
"step": 169
},
{
"epoch": 0.12292118582791034,
"grad_norm": 0.22130216658115387,
"learning_rate": 1.9994794092775418e-05,
"loss": 1.0739,
"step": 170
},
{
"epoch": 0.12364425162689804,
"grad_norm": 0.28100821375846863,
"learning_rate": 1.9994644303511994e-05,
"loss": 1.0996,
"step": 171
},
{
"epoch": 0.12436731742588576,
"grad_norm": 0.2977873980998993,
"learning_rate": 1.9994492390342233e-05,
"loss": 1.0319,
"step": 172
},
{
"epoch": 0.12509038322487345,
"grad_norm": 0.28634506464004517,
"learning_rate": 1.999433835329842e-05,
"loss": 1.1696,
"step": 173
},
{
"epoch": 0.12581344902386118,
"grad_norm": 0.2574990689754486,
"learning_rate": 1.999418219241329e-05,
"loss": 1.1368,
"step": 174
},
{
"epoch": 0.1265365148228489,
"grad_norm": 0.48853209614753723,
"learning_rate": 1.9994023907720027e-05,
"loss": 1.15,
"step": 175
},
{
"epoch": 0.1272595806218366,
"grad_norm": 0.2706013023853302,
"learning_rate": 1.9993863499252265e-05,
"loss": 1.123,
"step": 176
},
{
"epoch": 0.1279826464208243,
"grad_norm": 0.3121446669101715,
"learning_rate": 1.9993700967044097e-05,
"loss": 1.1917,
"step": 177
},
{
"epoch": 0.128705712219812,
"grad_norm": 0.31216657161712646,
"learning_rate": 1.9993536311130054e-05,
"loss": 1.1167,
"step": 178
},
{
"epoch": 0.1294287780187997,
"grad_norm": 0.36718523502349854,
"learning_rate": 1.9993369531545134e-05,
"loss": 1.1922,
"step": 179
},
{
"epoch": 0.1301518438177874,
"grad_norm": 0.37485387921333313,
"learning_rate": 1.999320062832477e-05,
"loss": 1.1381,
"step": 180
},
{
"epoch": 0.13087490961677511,
"grad_norm": 0.2811019718647003,
"learning_rate": 1.9993029601504865e-05,
"loss": 1.0746,
"step": 181
},
{
"epoch": 0.13159797541576285,
"grad_norm": 0.30120980739593506,
"learning_rate": 1.9992856451121754e-05,
"loss": 1.1391,
"step": 182
},
{
"epoch": 0.13232104121475055,
"grad_norm": 0.3567376136779785,
"learning_rate": 1.9992681177212236e-05,
"loss": 1.0458,
"step": 183
},
{
"epoch": 0.13304410701373826,
"grad_norm": 0.2923922538757324,
"learning_rate": 1.9992503779813558e-05,
"loss": 1.1919,
"step": 184
},
{
"epoch": 0.13376717281272596,
"grad_norm": 0.4777401089668274,
"learning_rate": 1.9992324258963414e-05,
"loss": 1.1606,
"step": 185
},
{
"epoch": 0.13449023861171366,
"grad_norm": 0.4806600511074066,
"learning_rate": 1.9992142614699958e-05,
"loss": 1.1763,
"step": 186
},
{
"epoch": 0.13521330441070137,
"grad_norm": 0.3039063811302185,
"learning_rate": 1.9991958847061786e-05,
"loss": 1.0922,
"step": 187
},
{
"epoch": 0.13593637020968907,
"grad_norm": 0.27506738901138306,
"learning_rate": 1.9991772956087952e-05,
"loss": 1.0489,
"step": 188
},
{
"epoch": 0.13665943600867678,
"grad_norm": 0.2944995164871216,
"learning_rate": 1.999158494181796e-05,
"loss": 1.0292,
"step": 189
},
{
"epoch": 0.1373825018076645,
"grad_norm": 0.347499817609787,
"learning_rate": 1.999139480429176e-05,
"loss": 1.1791,
"step": 190
},
{
"epoch": 0.1381055676066522,
"grad_norm": 0.2530309855937958,
"learning_rate": 1.9991202543549758e-05,
"loss": 1.0919,
"step": 191
},
{
"epoch": 0.13882863340563992,
"grad_norm": 0.3032122850418091,
"learning_rate": 1.9991008159632816e-05,
"loss": 1.0343,
"step": 192
},
{
"epoch": 0.13955169920462762,
"grad_norm": 0.32035306096076965,
"learning_rate": 1.999081165258223e-05,
"loss": 1.1535,
"step": 193
},
{
"epoch": 0.14027476500361533,
"grad_norm": 0.3276461958885193,
"learning_rate": 1.999061302243977e-05,
"loss": 1.1237,
"step": 194
},
{
"epoch": 0.14099783080260303,
"grad_norm": 0.27263158559799194,
"learning_rate": 1.9990412269247637e-05,
"loss": 1.1925,
"step": 195
},
{
"epoch": 0.14172089660159073,
"grad_norm": 0.48212355375289917,
"learning_rate": 1.9990209393048497e-05,
"loss": 1.0973,
"step": 196
},
{
"epoch": 0.14244396240057844,
"grad_norm": 0.2263614982366562,
"learning_rate": 1.9990004393885466e-05,
"loss": 1.1126,
"step": 197
},
{
"epoch": 0.14316702819956617,
"grad_norm": 0.24680563807487488,
"learning_rate": 1.99897972718021e-05,
"loss": 1.1049,
"step": 198
},
{
"epoch": 0.14389009399855388,
"grad_norm": 0.2854582667350769,
"learning_rate": 1.9989588026842416e-05,
"loss": 1.2055,
"step": 199
},
{
"epoch": 0.14461315979754158,
"grad_norm": 0.2650730609893799,
"learning_rate": 1.9989376659050878e-05,
"loss": 1.1494,
"step": 200
},
{
"epoch": 0.14533622559652928,
"grad_norm": 0.4389187693595886,
"learning_rate": 1.99891631684724e-05,
"loss": 1.162,
"step": 201
},
{
"epoch": 0.146059291395517,
"grad_norm": 0.2606823444366455,
"learning_rate": 1.998894755515236e-05,
"loss": 1.2099,
"step": 202
},
{
"epoch": 0.1467823571945047,
"grad_norm": 0.3051895201206207,
"learning_rate": 1.9988729819136568e-05,
"loss": 1.055,
"step": 203
},
{
"epoch": 0.1475054229934924,
"grad_norm": 0.2886350452899933,
"learning_rate": 1.9988509960471294e-05,
"loss": 1.1345,
"step": 204
},
{
"epoch": 0.14822848879248013,
"grad_norm": 0.2689605951309204,
"learning_rate": 1.9988287979203264e-05,
"loss": 1.0322,
"step": 205
},
{
"epoch": 0.14895155459146783,
"grad_norm": 0.2789241075515747,
"learning_rate": 1.9988063875379645e-05,
"loss": 1.2015,
"step": 206
},
{
"epoch": 0.14967462039045554,
"grad_norm": 0.4138123095035553,
"learning_rate": 1.9987837649048062e-05,
"loss": 1.1172,
"step": 207
},
{
"epoch": 0.15039768618944324,
"grad_norm": 0.49634483456611633,
"learning_rate": 1.998760930025659e-05,
"loss": 1.1927,
"step": 208
},
{
"epoch": 0.15112075198843095,
"grad_norm": 0.3492712378501892,
"learning_rate": 1.9987378829053756e-05,
"loss": 1.0852,
"step": 209
},
{
"epoch": 0.15184381778741865,
"grad_norm": 0.42525535821914673,
"learning_rate": 1.9987146235488532e-05,
"loss": 1.1665,
"step": 210
},
{
"epoch": 0.15256688358640635,
"grad_norm": 0.23187308013439178,
"learning_rate": 1.9986911519610346e-05,
"loss": 1.0863,
"step": 211
},
{
"epoch": 0.15328994938539406,
"grad_norm": 0.23776406049728394,
"learning_rate": 1.9986674681469074e-05,
"loss": 0.9548,
"step": 212
},
{
"epoch": 0.1540130151843818,
"grad_norm": 0.23905649781227112,
"learning_rate": 1.998643572111505e-05,
"loss": 1.0704,
"step": 213
},
{
"epoch": 0.1547360809833695,
"grad_norm": 0.31379473209381104,
"learning_rate": 1.9986194638599056e-05,
"loss": 1.0357,
"step": 214
},
{
"epoch": 0.1554591467823572,
"grad_norm": 0.31585368514060974,
"learning_rate": 1.9985951433972313e-05,
"loss": 1.0999,
"step": 215
},
{
"epoch": 0.1561822125813449,
"grad_norm": 0.42276912927627563,
"learning_rate": 1.9985706107286515e-05,
"loss": 1.0196,
"step": 216
},
{
"epoch": 0.1569052783803326,
"grad_norm": 0.2652490437030792,
"learning_rate": 1.9985458658593787e-05,
"loss": 1.0087,
"step": 217
},
{
"epoch": 0.1576283441793203,
"grad_norm": 0.2674664855003357,
"learning_rate": 1.9985209087946717e-05,
"loss": 1.1576,
"step": 218
},
{
"epoch": 0.15835140997830802,
"grad_norm": 0.3311745524406433,
"learning_rate": 1.9984957395398336e-05,
"loss": 1.2496,
"step": 219
},
{
"epoch": 0.15907447577729572,
"grad_norm": 0.31602323055267334,
"learning_rate": 1.998470358100213e-05,
"loss": 1.0267,
"step": 220
},
{
"epoch": 0.15979754157628345,
"grad_norm": 0.42870277166366577,
"learning_rate": 1.998444764481204e-05,
"loss": 1.0689,
"step": 221
},
{
"epoch": 0.16052060737527116,
"grad_norm": 0.2735862731933594,
"learning_rate": 1.9984189586882455e-05,
"loss": 1.1395,
"step": 222
},
{
"epoch": 0.16124367317425886,
"grad_norm": 0.36028143763542175,
"learning_rate": 1.9983929407268206e-05,
"loss": 1.176,
"step": 223
},
{
"epoch": 0.16196673897324657,
"grad_norm": 0.35686901211738586,
"learning_rate": 1.9983667106024584e-05,
"loss": 1.165,
"step": 224
},
{
"epoch": 0.16268980477223427,
"grad_norm": 0.25849634408950806,
"learning_rate": 1.9983402683207334e-05,
"loss": 1.0527,
"step": 225
},
{
"epoch": 0.16341287057122197,
"grad_norm": 0.2960383892059326,
"learning_rate": 1.9983136138872644e-05,
"loss": 1.1526,
"step": 226
},
{
"epoch": 0.16413593637020968,
"grad_norm": 0.31200936436653137,
"learning_rate": 1.9982867473077155e-05,
"loss": 1.001,
"step": 227
},
{
"epoch": 0.1648590021691974,
"grad_norm": 0.2932310998439789,
"learning_rate": 1.998259668587796e-05,
"loss": 1.0264,
"step": 228
},
{
"epoch": 0.16558206796818511,
"grad_norm": 0.31865110993385315,
"learning_rate": 1.9982323777332605e-05,
"loss": 1.1389,
"step": 229
},
{
"epoch": 0.16630513376717282,
"grad_norm": 0.3649924397468567,
"learning_rate": 1.9982048747499082e-05,
"loss": 1.1133,
"step": 230
},
{
"epoch": 0.16702819956616052,
"grad_norm": 0.27349087595939636,
"learning_rate": 1.9981771596435834e-05,
"loss": 1.1829,
"step": 231
},
{
"epoch": 0.16775126536514823,
"grad_norm": 0.49940750002861023,
"learning_rate": 1.9981492324201762e-05,
"loss": 1.1312,
"step": 232
},
{
"epoch": 0.16847433116413593,
"grad_norm": 0.3599984049797058,
"learning_rate": 1.998121093085621e-05,
"loss": 1.0232,
"step": 233
},
{
"epoch": 0.16919739696312364,
"grad_norm": 0.3514828681945801,
"learning_rate": 1.9980927416458976e-05,
"loss": 1.1149,
"step": 234
},
{
"epoch": 0.16992046276211134,
"grad_norm": 0.29723837971687317,
"learning_rate": 1.998064178107031e-05,
"loss": 1.0841,
"step": 235
},
{
"epoch": 0.17064352856109907,
"grad_norm": 0.33873599767684937,
"learning_rate": 1.9980354024750903e-05,
"loss": 1.046,
"step": 236
},
{
"epoch": 0.17136659436008678,
"grad_norm": 0.32947036623954773,
"learning_rate": 1.998006414756191e-05,
"loss": 1.0069,
"step": 237
},
{
"epoch": 0.17208966015907448,
"grad_norm": 0.26573359966278076,
"learning_rate": 1.9979772149564932e-05,
"loss": 1.0213,
"step": 238
},
{
"epoch": 0.17281272595806219,
"grad_norm": 0.3644008934497833,
"learning_rate": 1.9979478030822022e-05,
"loss": 1.153,
"step": 239
},
{
"epoch": 0.1735357917570499,
"grad_norm": 0.337447851896286,
"learning_rate": 1.997918179139567e-05,
"loss": 1.0932,
"step": 240
},
{
"epoch": 0.1742588575560376,
"grad_norm": 0.2889169752597809,
"learning_rate": 1.9978883431348845e-05,
"loss": 1.1418,
"step": 241
},
{
"epoch": 0.1749819233550253,
"grad_norm": 0.3004131019115448,
"learning_rate": 1.9978582950744938e-05,
"loss": 1.1679,
"step": 242
},
{
"epoch": 0.175704989154013,
"grad_norm": 0.28067609667778015,
"learning_rate": 1.9978280349647808e-05,
"loss": 1.2233,
"step": 243
},
{
"epoch": 0.17642805495300073,
"grad_norm": 0.3912954330444336,
"learning_rate": 1.9977975628121753e-05,
"loss": 1.0767,
"step": 244
},
{
"epoch": 0.17715112075198844,
"grad_norm": 0.2701180875301361,
"learning_rate": 1.9977668786231536e-05,
"loss": 1.1795,
"step": 245
},
{
"epoch": 0.17787418655097614,
"grad_norm": 0.27517423033714294,
"learning_rate": 1.9977359824042353e-05,
"loss": 1.0639,
"step": 246
},
{
"epoch": 0.17859725234996385,
"grad_norm": 0.33523860573768616,
"learning_rate": 1.9977048741619866e-05,
"loss": 1.0979,
"step": 247
},
{
"epoch": 0.17932031814895155,
"grad_norm": 0.25108376145362854,
"learning_rate": 1.9976735539030182e-05,
"loss": 1.1019,
"step": 248
},
{
"epoch": 0.18004338394793926,
"grad_norm": 0.35542207956314087,
"learning_rate": 1.9976420216339854e-05,
"loss": 1.0838,
"step": 249
},
{
"epoch": 0.18076644974692696,
"grad_norm": 0.3071356415748596,
"learning_rate": 1.9976102773615894e-05,
"loss": 1.024,
"step": 250
},
{
"epoch": 0.18148951554591466,
"grad_norm": 0.2952810227870941,
"learning_rate": 1.9975783210925752e-05,
"loss": 0.9775,
"step": 251
},
{
"epoch": 0.1822125813449024,
"grad_norm": 0.29858988523483276,
"learning_rate": 1.9975461528337345e-05,
"loss": 0.9622,
"step": 252
},
{
"epoch": 0.1829356471438901,
"grad_norm": 0.26090505719184875,
"learning_rate": 1.9975137725919032e-05,
"loss": 1.061,
"step": 253
},
{
"epoch": 0.1836587129428778,
"grad_norm": 0.3304395079612732,
"learning_rate": 1.9974811803739617e-05,
"loss": 1.0657,
"step": 254
},
{
"epoch": 0.1843817787418655,
"grad_norm": 0.4547516405582428,
"learning_rate": 1.997448376186836e-05,
"loss": 1.1043,
"step": 255
},
{
"epoch": 0.18510484454085321,
"grad_norm": 0.28662002086639404,
"learning_rate": 1.997415360037498e-05,
"loss": 1.0743,
"step": 256
},
{
"epoch": 0.18582791033984092,
"grad_norm": 0.266493022441864,
"learning_rate": 1.9973821319329625e-05,
"loss": 0.9767,
"step": 257
},
{
"epoch": 0.18655097613882862,
"grad_norm": 0.34035131335258484,
"learning_rate": 1.9973486918802912e-05,
"loss": 1.2844,
"step": 258
},
{
"epoch": 0.18727404193781635,
"grad_norm": 0.34165364503860474,
"learning_rate": 1.9973150398865908e-05,
"loss": 1.2808,
"step": 259
},
{
"epoch": 0.18799710773680406,
"grad_norm": 0.3018459975719452,
"learning_rate": 1.9972811759590117e-05,
"loss": 0.9891,
"step": 260
},
{
"epoch": 0.18872017353579176,
"grad_norm": 0.3636222779750824,
"learning_rate": 1.9972471001047505e-05,
"loss": 1.1218,
"step": 261
},
{
"epoch": 0.18944323933477947,
"grad_norm": 0.3037750720977783,
"learning_rate": 1.9972128123310485e-05,
"loss": 1.0688,
"step": 262
},
{
"epoch": 0.19016630513376717,
"grad_norm": 0.3097022771835327,
"learning_rate": 1.997178312645192e-05,
"loss": 1.135,
"step": 263
},
{
"epoch": 0.19088937093275488,
"grad_norm": 0.29391196370124817,
"learning_rate": 1.9971436010545125e-05,
"loss": 1.0739,
"step": 264
},
{
"epoch": 0.19161243673174258,
"grad_norm": 0.2928149104118347,
"learning_rate": 1.9971086775663856e-05,
"loss": 1.0486,
"step": 265
},
{
"epoch": 0.19233550253073028,
"grad_norm": 0.32840797305107117,
"learning_rate": 1.9970735421882334e-05,
"loss": 1.0065,
"step": 266
},
{
"epoch": 0.19305856832971802,
"grad_norm": 0.2717532217502594,
"learning_rate": 1.997038194927522e-05,
"loss": 0.9868,
"step": 267
},
{
"epoch": 0.19378163412870572,
"grad_norm": 0.3612685799598694,
"learning_rate": 1.9970026357917636e-05,
"loss": 1.055,
"step": 268
},
{
"epoch": 0.19450469992769343,
"grad_norm": 0.31416481733322144,
"learning_rate": 1.9969668647885136e-05,
"loss": 1.0907,
"step": 269
},
{
"epoch": 0.19522776572668113,
"grad_norm": 0.31718918681144714,
"learning_rate": 1.996930881925374e-05,
"loss": 1.048,
"step": 270
},
{
"epoch": 0.19595083152566883,
"grad_norm": 0.41617873311042786,
"learning_rate": 1.9968946872099915e-05,
"loss": 1.1356,
"step": 271
},
{
"epoch": 0.19667389732465654,
"grad_norm": 0.6090404987335205,
"learning_rate": 1.9968582806500572e-05,
"loss": 1.0258,
"step": 272
},
{
"epoch": 0.19739696312364424,
"grad_norm": 0.3227296471595764,
"learning_rate": 1.9968216622533082e-05,
"loss": 1.0439,
"step": 273
},
{
"epoch": 0.19812002892263195,
"grad_norm": 0.5116894245147705,
"learning_rate": 1.9967848320275253e-05,
"loss": 1.2046,
"step": 274
},
{
"epoch": 0.19884309472161968,
"grad_norm": 0.30214396119117737,
"learning_rate": 1.996747789980536e-05,
"loss": 1.0646,
"step": 275
},
{
"epoch": 0.19956616052060738,
"grad_norm": 0.32545679807662964,
"learning_rate": 1.996710536120211e-05,
"loss": 1.0382,
"step": 276
},
{
"epoch": 0.2002892263195951,
"grad_norm": 0.3302004933357239,
"learning_rate": 1.9966730704544677e-05,
"loss": 1.3115,
"step": 277
},
{
"epoch": 0.2010122921185828,
"grad_norm": 0.29362550377845764,
"learning_rate": 1.9966353929912672e-05,
"loss": 1.1415,
"step": 278
},
{
"epoch": 0.2017353579175705,
"grad_norm": 0.41206830739974976,
"learning_rate": 1.9965975037386164e-05,
"loss": 1.1537,
"step": 279
},
{
"epoch": 0.2024584237165582,
"grad_norm": 0.28330183029174805,
"learning_rate": 1.9965594027045668e-05,
"loss": 0.9909,
"step": 280
},
{
"epoch": 0.2031814895155459,
"grad_norm": 0.3228115439414978,
"learning_rate": 1.996521089897215e-05,
"loss": 1.1051,
"step": 281
},
{
"epoch": 0.2039045553145336,
"grad_norm": 0.2800455689430237,
"learning_rate": 1.9964825653247026e-05,
"loss": 1.0582,
"step": 282
},
{
"epoch": 0.20462762111352134,
"grad_norm": 0.3681683838367462,
"learning_rate": 1.9964438289952167e-05,
"loss": 1.1653,
"step": 283
},
{
"epoch": 0.20535068691250905,
"grad_norm": 0.3052619695663452,
"learning_rate": 1.9964048809169885e-05,
"loss": 1.0226,
"step": 284
},
{
"epoch": 0.20607375271149675,
"grad_norm": 0.32279151678085327,
"learning_rate": 1.9963657210982947e-05,
"loss": 1.143,
"step": 285
},
{
"epoch": 0.20679681851048445,
"grad_norm": 0.3428622782230377,
"learning_rate": 1.9963263495474573e-05,
"loss": 1.0171,
"step": 286
},
{
"epoch": 0.20751988430947216,
"grad_norm": 0.2577044665813446,
"learning_rate": 1.9962867662728422e-05,
"loss": 0.9957,
"step": 287
},
{
"epoch": 0.20824295010845986,
"grad_norm": 0.4019884467124939,
"learning_rate": 1.9962469712828613e-05,
"loss": 1.2837,
"step": 288
},
{
"epoch": 0.20896601590744757,
"grad_norm": 0.3153854012489319,
"learning_rate": 1.9962069645859717e-05,
"loss": 1.1967,
"step": 289
},
{
"epoch": 0.2096890817064353,
"grad_norm": 0.3266783356666565,
"learning_rate": 1.9961667461906743e-05,
"loss": 1.015,
"step": 290
},
{
"epoch": 0.210412147505423,
"grad_norm": 0.5336325168609619,
"learning_rate": 1.9961263161055163e-05,
"loss": 1.1563,
"step": 291
},
{
"epoch": 0.2111352133044107,
"grad_norm": 0.4741445481777191,
"learning_rate": 1.996085674339089e-05,
"loss": 1.1913,
"step": 292
},
{
"epoch": 0.2118582791033984,
"grad_norm": 0.30581969022750854,
"learning_rate": 1.996044820900029e-05,
"loss": 1.0972,
"step": 293
},
{
"epoch": 0.21258134490238612,
"grad_norm": 0.31274256110191345,
"learning_rate": 1.996003755797018e-05,
"loss": 1.1664,
"step": 294
},
{
"epoch": 0.21330441070137382,
"grad_norm": 0.32022932171821594,
"learning_rate": 1.995962479038782e-05,
"loss": 1.0773,
"step": 295
},
{
"epoch": 0.21402747650036152,
"grad_norm": 0.3246869444847107,
"learning_rate": 1.9959209906340925e-05,
"loss": 1.1162,
"step": 296
},
{
"epoch": 0.21475054229934923,
"grad_norm": 0.44917920231819153,
"learning_rate": 1.995879290591767e-05,
"loss": 1.0324,
"step": 297
},
{
"epoch": 0.21547360809833696,
"grad_norm": 0.30781978368759155,
"learning_rate": 1.9958373789206656e-05,
"loss": 1.0437,
"step": 298
},
{
"epoch": 0.21619667389732466,
"grad_norm": 0.28506171703338623,
"learning_rate": 1.995795255629696e-05,
"loss": 1.0357,
"step": 299
},
{
"epoch": 0.21691973969631237,
"grad_norm": 0.3168564438819885,
"learning_rate": 1.9957529207278082e-05,
"loss": 1.0496,
"step": 300
},
{
"epoch": 0.21764280549530007,
"grad_norm": 0.4779270589351654,
"learning_rate": 1.9957103742239997e-05,
"loss": 1.0295,
"step": 301
},
{
"epoch": 0.21836587129428778,
"grad_norm": 0.2756384611129761,
"learning_rate": 1.9956676161273114e-05,
"loss": 0.9995,
"step": 302
},
{
"epoch": 0.21908893709327548,
"grad_norm": 0.28942012786865234,
"learning_rate": 1.9956246464468294e-05,
"loss": 1.0739,
"step": 303
},
{
"epoch": 0.2198120028922632,
"grad_norm": 0.3324378430843353,
"learning_rate": 1.9955814651916853e-05,
"loss": 1.061,
"step": 304
},
{
"epoch": 0.2205350686912509,
"grad_norm": 0.3170448839664459,
"learning_rate": 1.995538072371055e-05,
"loss": 1.2148,
"step": 305
},
{
"epoch": 0.22125813449023862,
"grad_norm": 0.4121737778186798,
"learning_rate": 1.9954944679941602e-05,
"loss": 0.9862,
"step": 306
},
{
"epoch": 0.22198120028922633,
"grad_norm": 0.3573335111141205,
"learning_rate": 1.9954506520702662e-05,
"loss": 1.0321,
"step": 307
},
{
"epoch": 0.22270426608821403,
"grad_norm": 0.36431220173835754,
"learning_rate": 1.995406624608685e-05,
"loss": 1.1277,
"step": 308
},
{
"epoch": 0.22342733188720174,
"grad_norm": 0.4707132577896118,
"learning_rate": 1.9953623856187714e-05,
"loss": 0.9929,
"step": 309
},
{
"epoch": 0.22415039768618944,
"grad_norm": 0.3705374598503113,
"learning_rate": 1.9953179351099276e-05,
"loss": 1.0452,
"step": 310
},
{
"epoch": 0.22487346348517714,
"grad_norm": 0.2698614299297333,
"learning_rate": 1.9952732730915993e-05,
"loss": 0.9796,
"step": 311
},
{
"epoch": 0.22559652928416485,
"grad_norm": 0.3234151303768158,
"learning_rate": 1.9952283995732765e-05,
"loss": 1.0508,
"step": 312
},
{
"epoch": 0.22631959508315258,
"grad_norm": 0.3262479603290558,
"learning_rate": 1.9951833145644962e-05,
"loss": 1.2089,
"step": 313
},
{
"epoch": 0.22704266088214028,
"grad_norm": 0.5179426670074463,
"learning_rate": 1.9951380180748383e-05,
"loss": 1.1464,
"step": 314
},
{
"epoch": 0.227765726681128,
"grad_norm": 0.4757941961288452,
"learning_rate": 1.9950925101139292e-05,
"loss": 1.0322,
"step": 315
},
{
"epoch": 0.2284887924801157,
"grad_norm": 0.40444672107696533,
"learning_rate": 1.9950467906914387e-05,
"loss": 1.0495,
"step": 316
},
{
"epoch": 0.2292118582791034,
"grad_norm": 0.3511693775653839,
"learning_rate": 1.995000859817083e-05,
"loss": 0.9481,
"step": 317
},
{
"epoch": 0.2299349240780911,
"grad_norm": 0.5550206303596497,
"learning_rate": 1.9949547175006227e-05,
"loss": 1.036,
"step": 318
},
{
"epoch": 0.2306579898770788,
"grad_norm": 0.31517210602760315,
"learning_rate": 1.9949083637518628e-05,
"loss": 1.0197,
"step": 319
},
{
"epoch": 0.2313810556760665,
"grad_norm": 0.3532278835773468,
"learning_rate": 1.994861798580654e-05,
"loss": 1.0144,
"step": 320
},
{
"epoch": 0.23210412147505424,
"grad_norm": 0.3524475693702698,
"learning_rate": 1.9948150219968917e-05,
"loss": 1.2165,
"step": 321
},
{
"epoch": 0.23282718727404195,
"grad_norm": 0.30520960688591003,
"learning_rate": 1.9947680340105156e-05,
"loss": 1.0552,
"step": 322
},
{
"epoch": 0.23355025307302965,
"grad_norm": 0.36823803186416626,
"learning_rate": 1.9947208346315112e-05,
"loss": 1.0136,
"step": 323
},
{
"epoch": 0.23427331887201736,
"grad_norm": 0.33798906207084656,
"learning_rate": 1.994673423869909e-05,
"loss": 1.0973,
"step": 324
},
{
"epoch": 0.23499638467100506,
"grad_norm": 0.4596942961215973,
"learning_rate": 1.994625801735783e-05,
"loss": 0.9646,
"step": 325
},
{
"epoch": 0.23571945046999276,
"grad_norm": 0.27387481927871704,
"learning_rate": 1.9945779682392538e-05,
"loss": 1.1533,
"step": 326
},
{
"epoch": 0.23644251626898047,
"grad_norm": 0.3908016085624695,
"learning_rate": 1.994529923390486e-05,
"loss": 1.1398,
"step": 327
},
{
"epoch": 0.23716558206796817,
"grad_norm": 0.5063049793243408,
"learning_rate": 1.99448166719969e-05,
"loss": 1.1009,
"step": 328
},
{
"epoch": 0.2378886478669559,
"grad_norm": 0.4367094337940216,
"learning_rate": 1.9944331996771194e-05,
"loss": 1.1103,
"step": 329
},
{
"epoch": 0.2386117136659436,
"grad_norm": 0.3248264193534851,
"learning_rate": 1.9943845208330742e-05,
"loss": 1.1392,
"step": 330
},
{
"epoch": 0.2393347794649313,
"grad_norm": 0.31271231174468994,
"learning_rate": 1.9943356306778995e-05,
"loss": 1.0572,
"step": 331
},
{
"epoch": 0.24005784526391902,
"grad_norm": 0.31461718678474426,
"learning_rate": 1.9942865292219837e-05,
"loss": 1.0907,
"step": 332
},
{
"epoch": 0.24078091106290672,
"grad_norm": 0.40921902656555176,
"learning_rate": 1.9942372164757616e-05,
"loss": 1.0401,
"step": 333
},
{
"epoch": 0.24150397686189443,
"grad_norm": 0.39852192997932434,
"learning_rate": 1.994187692449712e-05,
"loss": 1.1206,
"step": 334
},
{
"epoch": 0.24222704266088213,
"grad_norm": 0.30172234773635864,
"learning_rate": 1.9941379571543597e-05,
"loss": 1.0826,
"step": 335
},
{
"epoch": 0.24295010845986983,
"grad_norm": 0.32275334000587463,
"learning_rate": 1.994088010600273e-05,
"loss": 0.9221,
"step": 336
},
{
"epoch": 0.24367317425885757,
"grad_norm": 0.32760071754455566,
"learning_rate": 1.994037852798066e-05,
"loss": 1.1498,
"step": 337
},
{
"epoch": 0.24439624005784527,
"grad_norm": 0.33486208319664,
"learning_rate": 1.9939874837583977e-05,
"loss": 1.0462,
"step": 338
},
{
"epoch": 0.24511930585683298,
"grad_norm": 0.4280707836151123,
"learning_rate": 1.9939369034919712e-05,
"loss": 0.981,
"step": 339
},
{
"epoch": 0.24584237165582068,
"grad_norm": 0.3889998495578766,
"learning_rate": 1.9938861120095353e-05,
"loss": 1.1114,
"step": 340
},
{
"epoch": 0.24656543745480838,
"grad_norm": 0.2864742577075958,
"learning_rate": 1.9938351093218833e-05,
"loss": 1.0915,
"step": 341
},
{
"epoch": 0.2472885032537961,
"grad_norm": 0.390601247549057,
"learning_rate": 1.9937838954398542e-05,
"loss": 1.0737,
"step": 342
},
{
"epoch": 0.2480115690527838,
"grad_norm": 0.4023664593696594,
"learning_rate": 1.99373247037433e-05,
"loss": 1.1513,
"step": 343
},
{
"epoch": 0.24873463485177152,
"grad_norm": 0.2922250032424927,
"learning_rate": 1.9936808341362396e-05,
"loss": 1.1422,
"step": 344
},
{
"epoch": 0.24945770065075923,
"grad_norm": 0.5238683819770813,
"learning_rate": 1.9936289867365557e-05,
"loss": 1.1925,
"step": 345
},
{
"epoch": 0.2501807664497469,
"grad_norm": 0.2811175584793091,
"learning_rate": 1.993576928186296e-05,
"loss": 1.0295,
"step": 346
},
{
"epoch": 0.25090383224873464,
"grad_norm": 0.3582068383693695,
"learning_rate": 1.9935246584965237e-05,
"loss": 1.0085,
"step": 347
},
{
"epoch": 0.25162689804772237,
"grad_norm": 0.29945480823516846,
"learning_rate": 1.993472177678345e-05,
"loss": 0.9309,
"step": 348
},
{
"epoch": 0.25234996384671005,
"grad_norm": 0.2941429018974304,
"learning_rate": 1.993419485742914e-05,
"loss": 1.0875,
"step": 349
},
{
"epoch": 0.2530730296456978,
"grad_norm": 0.33406949043273926,
"learning_rate": 1.9933665827014272e-05,
"loss": 1.1204,
"step": 350
},
{
"epoch": 0.25379609544468545,
"grad_norm": 0.30616679787635803,
"learning_rate": 1.9933134685651267e-05,
"loss": 1.1959,
"step": 351
},
{
"epoch": 0.2545191612436732,
"grad_norm": 0.3512917459011078,
"learning_rate": 1.993260143345299e-05,
"loss": 1.1553,
"step": 352
},
{
"epoch": 0.25524222704266086,
"grad_norm": 0.40841343998908997,
"learning_rate": 1.9932066070532768e-05,
"loss": 1.0604,
"step": 353
},
{
"epoch": 0.2559652928416486,
"grad_norm": 0.4030756950378418,
"learning_rate": 1.9931528597004363e-05,
"loss": 1.2246,
"step": 354
},
{
"epoch": 0.25668835864063627,
"grad_norm": 0.3495555520057678,
"learning_rate": 1.9930989012981992e-05,
"loss": 1.1397,
"step": 355
},
{
"epoch": 0.257411424439624,
"grad_norm": 0.31172969937324524,
"learning_rate": 1.9930447318580323e-05,
"loss": 1.0977,
"step": 356
},
{
"epoch": 0.25813449023861174,
"grad_norm": 0.3468332886695862,
"learning_rate": 1.992990351391446e-05,
"loss": 1.2147,
"step": 357
},
{
"epoch": 0.2588575560375994,
"grad_norm": 0.4538349211215973,
"learning_rate": 1.992935759909997e-05,
"loss": 1.0623,
"step": 358
},
{
"epoch": 0.25958062183658714,
"grad_norm": 0.5491474270820618,
"learning_rate": 1.9928809574252864e-05,
"loss": 1.1133,
"step": 359
},
{
"epoch": 0.2603036876355748,
"grad_norm": 0.33087098598480225,
"learning_rate": 1.992825943948959e-05,
"loss": 1.0613,
"step": 360
},
{
"epoch": 0.26102675343456255,
"grad_norm": 0.4498991370201111,
"learning_rate": 1.9927707194927067e-05,
"loss": 1.1283,
"step": 361
},
{
"epoch": 0.26174981923355023,
"grad_norm": 0.3464474678039551,
"learning_rate": 1.9927152840682636e-05,
"loss": 1.1189,
"step": 362
},
{
"epoch": 0.26247288503253796,
"grad_norm": 0.32526206970214844,
"learning_rate": 1.9926596376874112e-05,
"loss": 0.9857,
"step": 363
},
{
"epoch": 0.2631959508315257,
"grad_norm": 0.36080119013786316,
"learning_rate": 1.9926037803619744e-05,
"loss": 1.0095,
"step": 364
},
{
"epoch": 0.26391901663051337,
"grad_norm": 0.4334959089756012,
"learning_rate": 1.9925477121038218e-05,
"loss": 1.1569,
"step": 365
},
{
"epoch": 0.2646420824295011,
"grad_norm": 0.33947449922561646,
"learning_rate": 1.99249143292487e-05,
"loss": 1.067,
"step": 366
},
{
"epoch": 0.2653651482284888,
"grad_norm": 0.3930415213108063,
"learning_rate": 1.9924349428370774e-05,
"loss": 1.1315,
"step": 367
},
{
"epoch": 0.2660882140274765,
"grad_norm": 0.47822701930999756,
"learning_rate": 1.992378241852449e-05,
"loss": 1.0941,
"step": 368
},
{
"epoch": 0.2668112798264642,
"grad_norm": 0.4171973764896393,
"learning_rate": 1.9923213299830336e-05,
"loss": 1.2023,
"step": 369
},
{
"epoch": 0.2675343456254519,
"grad_norm": 0.29545828700065613,
"learning_rate": 1.992264207240925e-05,
"loss": 1.1136,
"step": 370
},
{
"epoch": 0.26825741142443965,
"grad_norm": 0.3832903504371643,
"learning_rate": 1.9922068736382627e-05,
"loss": 1.0233,
"step": 371
},
{
"epoch": 0.26898047722342733,
"grad_norm": 0.5224931836128235,
"learning_rate": 1.99214932918723e-05,
"loss": 1.178,
"step": 372
},
{
"epoch": 0.26970354302241506,
"grad_norm": 0.3658877909183502,
"learning_rate": 1.9920915739000555e-05,
"loss": 1.1076,
"step": 373
},
{
"epoch": 0.27042660882140274,
"grad_norm": 0.325195848941803,
"learning_rate": 1.9920336077890122e-05,
"loss": 1.0595,
"step": 374
},
{
"epoch": 0.27114967462039047,
"grad_norm": 0.33365723490715027,
"learning_rate": 1.991975430866419e-05,
"loss": 1.0157,
"step": 375
},
{
"epoch": 0.27187274041937814,
"grad_norm": 0.3699803948402405,
"learning_rate": 1.9919170431446374e-05,
"loss": 0.9225,
"step": 376
},
{
"epoch": 0.2725958062183659,
"grad_norm": 0.4542098939418793,
"learning_rate": 1.9918584446360755e-05,
"loss": 1.0914,
"step": 377
},
{
"epoch": 0.27331887201735355,
"grad_norm": 0.2979832589626312,
"learning_rate": 1.9917996353531864e-05,
"loss": 1.1258,
"step": 378
},
{
"epoch": 0.2740419378163413,
"grad_norm": 0.3481557369232178,
"learning_rate": 1.9917406153084668e-05,
"loss": 1.1902,
"step": 379
},
{
"epoch": 0.274765003615329,
"grad_norm": 0.3932206928730011,
"learning_rate": 1.9916813845144587e-05,
"loss": 1.1773,
"step": 380
},
{
"epoch": 0.2754880694143167,
"grad_norm": 0.32080596685409546,
"learning_rate": 1.991621942983749e-05,
"loss": 1.0663,
"step": 381
},
{
"epoch": 0.2762111352133044,
"grad_norm": 0.38809677958488464,
"learning_rate": 1.9915622907289695e-05,
"loss": 0.9916,
"step": 382
},
{
"epoch": 0.2769342010122921,
"grad_norm": 0.2993149757385254,
"learning_rate": 1.9915024277627965e-05,
"loss": 1.0444,
"step": 383
},
{
"epoch": 0.27765726681127983,
"grad_norm": 0.39644211530685425,
"learning_rate": 1.991442354097951e-05,
"loss": 1.1248,
"step": 384
},
{
"epoch": 0.2783803326102675,
"grad_norm": 0.3163677752017975,
"learning_rate": 1.9913820697471988e-05,
"loss": 1.1467,
"step": 385
},
{
"epoch": 0.27910339840925524,
"grad_norm": 0.3997037410736084,
"learning_rate": 1.9913215747233505e-05,
"loss": 1.0009,
"step": 386
},
{
"epoch": 0.279826464208243,
"grad_norm": 0.43837687373161316,
"learning_rate": 1.991260869039262e-05,
"loss": 1.1285,
"step": 387
},
{
"epoch": 0.28054953000723065,
"grad_norm": 0.3039294481277466,
"learning_rate": 1.9911999527078333e-05,
"loss": 1.1063,
"step": 388
},
{
"epoch": 0.2812725958062184,
"grad_norm": 0.2840760052204132,
"learning_rate": 1.9911388257420093e-05,
"loss": 1.0023,
"step": 389
},
{
"epoch": 0.28199566160520606,
"grad_norm": 0.34338292479515076,
"learning_rate": 1.9910774881547803e-05,
"loss": 0.9788,
"step": 390
},
{
"epoch": 0.2827187274041938,
"grad_norm": 0.30882540345191956,
"learning_rate": 1.99101593995918e-05,
"loss": 0.949,
"step": 391
},
{
"epoch": 0.28344179320318147,
"grad_norm": 0.29385194182395935,
"learning_rate": 1.9909541811682883e-05,
"loss": 1.0488,
"step": 392
},
{
"epoch": 0.2841648590021692,
"grad_norm": 0.2834920585155487,
"learning_rate": 1.9908922117952288e-05,
"loss": 1.1649,
"step": 393
},
{
"epoch": 0.2848879248011569,
"grad_norm": 0.451275110244751,
"learning_rate": 1.9908300318531707e-05,
"loss": 0.9627,
"step": 394
},
{
"epoch": 0.2856109906001446,
"grad_norm": 0.3283119201660156,
"learning_rate": 1.990767641355327e-05,
"loss": 0.9928,
"step": 395
},
{
"epoch": 0.28633405639913234,
"grad_norm": 0.4345645308494568,
"learning_rate": 1.990705040314956e-05,
"loss": 1.121,
"step": 396
},
{
"epoch": 0.28705712219812,
"grad_norm": 0.4899539649486542,
"learning_rate": 1.9906422287453614e-05,
"loss": 1.1305,
"step": 397
},
{
"epoch": 0.28778018799710775,
"grad_norm": 0.3533375859260559,
"learning_rate": 1.99057920665989e-05,
"loss": 1.1405,
"step": 398
},
{
"epoch": 0.2885032537960954,
"grad_norm": 0.2978805899620056,
"learning_rate": 1.990515974071935e-05,
"loss": 1.0067,
"step": 399
},
{
"epoch": 0.28922631959508316,
"grad_norm": 0.39741653203964233,
"learning_rate": 1.9904525309949332e-05,
"loss": 1.1486,
"step": 400
},
{
"epoch": 0.28994938539407084,
"grad_norm": 0.3140316903591156,
"learning_rate": 1.990388877442367e-05,
"loss": 1.0824,
"step": 401
},
{
"epoch": 0.29067245119305857,
"grad_norm": 0.5698236227035522,
"learning_rate": 1.9903250134277622e-05,
"loss": 1.155,
"step": 402
},
{
"epoch": 0.2913955169920463,
"grad_norm": 0.3922751545906067,
"learning_rate": 1.990260938964691e-05,
"loss": 1.0638,
"step": 403
},
{
"epoch": 0.292118582791034,
"grad_norm": 0.39551931619644165,
"learning_rate": 1.990196654066769e-05,
"loss": 1.1221,
"step": 404
},
{
"epoch": 0.2928416485900217,
"grad_norm": 0.4097634255886078,
"learning_rate": 1.9901321587476573e-05,
"loss": 1.1902,
"step": 405
},
{
"epoch": 0.2935647143890094,
"grad_norm": 0.32898181676864624,
"learning_rate": 1.9900674530210617e-05,
"loss": 1.1063,
"step": 406
},
{
"epoch": 0.2942877801879971,
"grad_norm": 0.3113490045070648,
"learning_rate": 1.9900025369007316e-05,
"loss": 1.1066,
"step": 407
},
{
"epoch": 0.2950108459869848,
"grad_norm": 0.38681137561798096,
"learning_rate": 1.9899374104004628e-05,
"loss": 0.9615,
"step": 408
},
{
"epoch": 0.2957339117859725,
"grad_norm": 0.3377947509288788,
"learning_rate": 1.9898720735340948e-05,
"loss": 1.1625,
"step": 409
},
{
"epoch": 0.29645697758496026,
"grad_norm": 0.347249299287796,
"learning_rate": 1.9898065263155118e-05,
"loss": 1.0252,
"step": 410
},
{
"epoch": 0.29718004338394793,
"grad_norm": 0.301384299993515,
"learning_rate": 1.989740768758643e-05,
"loss": 1.0796,
"step": 411
},
{
"epoch": 0.29790310918293567,
"grad_norm": 0.5834956765174866,
"learning_rate": 1.9896748008774618e-05,
"loss": 1.0108,
"step": 412
},
{
"epoch": 0.29862617498192334,
"grad_norm": 0.4832312762737274,
"learning_rate": 1.989608622685987e-05,
"loss": 1.2985,
"step": 413
},
{
"epoch": 0.2993492407809111,
"grad_norm": 0.28051361441612244,
"learning_rate": 1.989542234198282e-05,
"loss": 1.0726,
"step": 414
},
{
"epoch": 0.30007230657989875,
"grad_norm": 0.38429728150367737,
"learning_rate": 1.989475635428454e-05,
"loss": 1.1316,
"step": 415
},
{
"epoch": 0.3007953723788865,
"grad_norm": 1.0346802473068237,
"learning_rate": 1.9894088263906563e-05,
"loss": 0.9848,
"step": 416
},
{
"epoch": 0.30151843817787416,
"grad_norm": 0.4266470670700073,
"learning_rate": 1.9893418070990855e-05,
"loss": 1.0715,
"step": 417
},
{
"epoch": 0.3022415039768619,
"grad_norm": 0.3880995810031891,
"learning_rate": 1.9892745775679837e-05,
"loss": 1.0393,
"step": 418
},
{
"epoch": 0.3029645697758496,
"grad_norm": 0.3417673408985138,
"learning_rate": 1.9892071378116378e-05,
"loss": 0.9673,
"step": 419
},
{
"epoch": 0.3036876355748373,
"grad_norm": 0.35822057723999023,
"learning_rate": 1.9891394878443783e-05,
"loss": 1.0562,
"step": 420
},
{
"epoch": 0.30441070137382503,
"grad_norm": 0.5743041038513184,
"learning_rate": 1.989071627680582e-05,
"loss": 1.0709,
"step": 421
},
{
"epoch": 0.3051337671728127,
"grad_norm": 0.3537566363811493,
"learning_rate": 1.9890035573346685e-05,
"loss": 1.0405,
"step": 422
},
{
"epoch": 0.30585683297180044,
"grad_norm": 0.3579690456390381,
"learning_rate": 1.988935276821104e-05,
"loss": 1.0664,
"step": 423
},
{
"epoch": 0.3065798987707881,
"grad_norm": 0.3571028709411621,
"learning_rate": 1.988866786154398e-05,
"loss": 1.1104,
"step": 424
},
{
"epoch": 0.30730296456977585,
"grad_norm": 0.3370681405067444,
"learning_rate": 1.988798085349105e-05,
"loss": 1.1617,
"step": 425
},
{
"epoch": 0.3080260303687636,
"grad_norm": 0.34891048073768616,
"learning_rate": 1.9887291744198242e-05,
"loss": 1.1739,
"step": 426
},
{
"epoch": 0.30874909616775126,
"grad_norm": 0.36562421917915344,
"learning_rate": 1.9886600533812e-05,
"loss": 0.9671,
"step": 427
},
{
"epoch": 0.309472161966739,
"grad_norm": 0.291559636592865,
"learning_rate": 1.9885907222479202e-05,
"loss": 1.0322,
"step": 428
},
{
"epoch": 0.31019522776572667,
"grad_norm": 0.35871514678001404,
"learning_rate": 1.9885211810347185e-05,
"loss": 0.9888,
"step": 429
},
{
"epoch": 0.3109182935647144,
"grad_norm": 0.38193368911743164,
"learning_rate": 1.9884514297563722e-05,
"loss": 1.048,
"step": 430
},
{
"epoch": 0.3116413593637021,
"grad_norm": 0.45247647166252136,
"learning_rate": 1.9883814684277043e-05,
"loss": 0.9847,
"step": 431
},
{
"epoch": 0.3123644251626898,
"grad_norm": 0.32414931058883667,
"learning_rate": 1.9883112970635812e-05,
"loss": 1.0522,
"step": 432
},
{
"epoch": 0.31308749096167754,
"grad_norm": 0.32052549719810486,
"learning_rate": 1.988240915678916e-05,
"loss": 1.2259,
"step": 433
},
{
"epoch": 0.3138105567606652,
"grad_norm": 0.5931859016418457,
"learning_rate": 1.9881703242886635e-05,
"loss": 1.1631,
"step": 434
},
{
"epoch": 0.31453362255965295,
"grad_norm": 0.34109485149383545,
"learning_rate": 1.9880995229078253e-05,
"loss": 1.1199,
"step": 435
},
{
"epoch": 0.3152566883586406,
"grad_norm": 0.5068467855453491,
"learning_rate": 1.988028511551447e-05,
"loss": 1.0753,
"step": 436
},
{
"epoch": 0.31597975415762836,
"grad_norm": 0.3728184103965759,
"learning_rate": 1.987957290234619e-05,
"loss": 1.075,
"step": 437
},
{
"epoch": 0.31670281995661603,
"grad_norm": 0.3531060814857483,
"learning_rate": 1.987885858972476e-05,
"loss": 1.0648,
"step": 438
},
{
"epoch": 0.31742588575560376,
"grad_norm": 0.39028334617614746,
"learning_rate": 1.9878142177801977e-05,
"loss": 1.1465,
"step": 439
},
{
"epoch": 0.31814895155459144,
"grad_norm": 0.4468533992767334,
"learning_rate": 1.9877423666730075e-05,
"loss": 1.0639,
"step": 440
},
{
"epoch": 0.3188720173535792,
"grad_norm": 0.4782250225543976,
"learning_rate": 1.9876703056661748e-05,
"loss": 1.0668,
"step": 441
},
{
"epoch": 0.3195950831525669,
"grad_norm": 0.321390837430954,
"learning_rate": 1.9875980347750125e-05,
"loss": 1.1202,
"step": 442
},
{
"epoch": 0.3203181489515546,
"grad_norm": 0.3330671191215515,
"learning_rate": 1.9875255540148787e-05,
"loss": 1.0998,
"step": 443
},
{
"epoch": 0.3210412147505423,
"grad_norm": 0.3717515170574188,
"learning_rate": 1.9874528634011758e-05,
"loss": 1.0561,
"step": 444
},
{
"epoch": 0.32176428054953,
"grad_norm": 0.42336907982826233,
"learning_rate": 1.9873799629493507e-05,
"loss": 1.0152,
"step": 445
},
{
"epoch": 0.3224873463485177,
"grad_norm": 0.3879169821739197,
"learning_rate": 1.9873068526748957e-05,
"loss": 1.1424,
"step": 446
},
{
"epoch": 0.3232104121475054,
"grad_norm": 0.3264402151107788,
"learning_rate": 1.987233532593346e-05,
"loss": 1.062,
"step": 447
},
{
"epoch": 0.32393347794649313,
"grad_norm": 0.45311230421066284,
"learning_rate": 1.987160002720283e-05,
"loss": 1.0058,
"step": 448
},
{
"epoch": 0.32465654374548086,
"grad_norm": 0.3314540982246399,
"learning_rate": 1.9870862630713325e-05,
"loss": 1.0831,
"step": 449
},
{
"epoch": 0.32537960954446854,
"grad_norm": 0.46991175413131714,
"learning_rate": 1.9870123136621638e-05,
"loss": 1.0709,
"step": 450
},
{
"epoch": 0.32610267534345627,
"grad_norm": 0.344123899936676,
"learning_rate": 1.9869381545084924e-05,
"loss": 1.1122,
"step": 451
},
{
"epoch": 0.32682574114244395,
"grad_norm": 0.34936752915382385,
"learning_rate": 1.9868637856260764e-05,
"loss": 1.1093,
"step": 452
},
{
"epoch": 0.3275488069414317,
"grad_norm": 0.3147508502006531,
"learning_rate": 1.9867892070307202e-05,
"loss": 0.9859,
"step": 453
},
{
"epoch": 0.32827187274041936,
"grad_norm": 0.3766098916530609,
"learning_rate": 1.9867144187382718e-05,
"loss": 1.0976,
"step": 454
},
{
"epoch": 0.3289949385394071,
"grad_norm": 0.3415137827396393,
"learning_rate": 1.986639420764624e-05,
"loss": 1.0321,
"step": 455
},
{
"epoch": 0.3297180043383948,
"grad_norm": 0.34528324007987976,
"learning_rate": 1.9865642131257147e-05,
"loss": 1.0696,
"step": 456
},
{
"epoch": 0.3304410701373825,
"grad_norm": 0.445751428604126,
"learning_rate": 1.986488795837525e-05,
"loss": 1.1925,
"step": 457
},
{
"epoch": 0.33116413593637023,
"grad_norm": 0.38328754901885986,
"learning_rate": 1.9864131689160822e-05,
"loss": 1.2289,
"step": 458
},
{
"epoch": 0.3318872017353579,
"grad_norm": 0.3823976218700409,
"learning_rate": 1.986337332377457e-05,
"loss": 1.0853,
"step": 459
},
{
"epoch": 0.33261026753434564,
"grad_norm": 0.467807412147522,
"learning_rate": 1.9862612862377652e-05,
"loss": 1.1639,
"step": 460
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.38709041476249695,
"learning_rate": 1.9861850305131666e-05,
"loss": 1.1848,
"step": 461
},
{
"epoch": 0.33405639913232105,
"grad_norm": 0.36389395594596863,
"learning_rate": 1.986108565219866e-05,
"loss": 1.0158,
"step": 462
},
{
"epoch": 0.3347794649313087,
"grad_norm": 0.31022781133651733,
"learning_rate": 1.986031890374113e-05,
"loss": 1.1638,
"step": 463
},
{
"epoch": 0.33550253073029646,
"grad_norm": 0.4525332748889923,
"learning_rate": 1.985955005992201e-05,
"loss": 1.0545,
"step": 464
},
{
"epoch": 0.3362255965292842,
"grad_norm": 0.3653198480606079,
"learning_rate": 1.985877912090468e-05,
"loss": 1.1992,
"step": 465
},
{
"epoch": 0.33694866232827186,
"grad_norm": 0.5307183265686035,
"learning_rate": 1.985800608685297e-05,
"loss": 1.0617,
"step": 466
},
{
"epoch": 0.3376717281272596,
"grad_norm": 0.32720625400543213,
"learning_rate": 1.985723095793116e-05,
"loss": 0.9739,
"step": 467
},
{
"epoch": 0.3383947939262473,
"grad_norm": 0.3342708945274353,
"learning_rate": 1.9856453734303958e-05,
"loss": 0.9406,
"step": 468
},
{
"epoch": 0.339117859725235,
"grad_norm": 0.5348480939865112,
"learning_rate": 1.9855674416136536e-05,
"loss": 1.123,
"step": 469
},
{
"epoch": 0.3398409255242227,
"grad_norm": 0.37697339057922363,
"learning_rate": 1.9854893003594492e-05,
"loss": 1.1665,
"step": 470
},
{
"epoch": 0.3405639913232104,
"grad_norm": 0.36113324761390686,
"learning_rate": 1.985410949684389e-05,
"loss": 1.0995,
"step": 471
},
{
"epoch": 0.34128705712219815,
"grad_norm": 0.39146167039871216,
"learning_rate": 1.9853323896051226e-05,
"loss": 1.0418,
"step": 472
},
{
"epoch": 0.3420101229211858,
"grad_norm": 0.36467745900154114,
"learning_rate": 1.9852536201383444e-05,
"loss": 1.0498,
"step": 473
},
{
"epoch": 0.34273318872017355,
"grad_norm": 0.34970226883888245,
"learning_rate": 1.985174641300793e-05,
"loss": 0.88,
"step": 474
},
{
"epoch": 0.34345625451916123,
"grad_norm": 0.5417302846908569,
"learning_rate": 1.9850954531092515e-05,
"loss": 1.0795,
"step": 475
},
{
"epoch": 0.34417932031814896,
"grad_norm": 0.39648255705833435,
"learning_rate": 1.9850160555805485e-05,
"loss": 1.006,
"step": 476
},
{
"epoch": 0.34490238611713664,
"grad_norm": 0.29076090455055237,
"learning_rate": 1.984936448731556e-05,
"loss": 1.0339,
"step": 477
},
{
"epoch": 0.34562545191612437,
"grad_norm": 0.32601386308670044,
"learning_rate": 1.9848566325791906e-05,
"loss": 1.1229,
"step": 478
},
{
"epoch": 0.34634851771511205,
"grad_norm": 0.3112106919288635,
"learning_rate": 1.984776607140414e-05,
"loss": 1.019,
"step": 479
},
{
"epoch": 0.3470715835140998,
"grad_norm": 0.46314576268196106,
"learning_rate": 1.984696372432231e-05,
"loss": 1.0548,
"step": 480
},
{
"epoch": 0.3477946493130875,
"grad_norm": 0.3397153615951538,
"learning_rate": 1.9846159284716933e-05,
"loss": 1.0716,
"step": 481
},
{
"epoch": 0.3485177151120752,
"grad_norm": 0.3064862787723541,
"learning_rate": 1.9845352752758943e-05,
"loss": 1.0731,
"step": 482
},
{
"epoch": 0.3492407809110629,
"grad_norm": 0.3357180953025818,
"learning_rate": 1.984454412861974e-05,
"loss": 1.1419,
"step": 483
},
{
"epoch": 0.3499638467100506,
"grad_norm": 0.34050750732421875,
"learning_rate": 1.9843733412471155e-05,
"loss": 0.9212,
"step": 484
},
{
"epoch": 0.35068691250903833,
"grad_norm": 0.3766930103302002,
"learning_rate": 1.9842920604485474e-05,
"loss": 1.0685,
"step": 485
},
{
"epoch": 0.351409978308026,
"grad_norm": 0.39459505677223206,
"learning_rate": 1.9842105704835416e-05,
"loss": 1.0002,
"step": 486
},
{
"epoch": 0.35213304410701374,
"grad_norm": 0.552499532699585,
"learning_rate": 1.9841288713694155e-05,
"loss": 0.9073,
"step": 487
},
{
"epoch": 0.35285610990600147,
"grad_norm": 0.34755146503448486,
"learning_rate": 1.9840469631235305e-05,
"loss": 1.0715,
"step": 488
},
{
"epoch": 0.35357917570498915,
"grad_norm": 0.33920028805732727,
"learning_rate": 1.9839648457632928e-05,
"loss": 1.0243,
"step": 489
},
{
"epoch": 0.3543022415039769,
"grad_norm": 0.3754862844944,
"learning_rate": 1.9838825193061518e-05,
"loss": 1.0662,
"step": 490
},
{
"epoch": 0.35502530730296455,
"grad_norm": 0.4731397330760956,
"learning_rate": 1.9837999837696028e-05,
"loss": 1.1542,
"step": 491
},
{
"epoch": 0.3557483731019523,
"grad_norm": 0.3418963551521301,
"learning_rate": 1.983717239171185e-05,
"loss": 1.1911,
"step": 492
},
{
"epoch": 0.35647143890093996,
"grad_norm": 0.40142807364463806,
"learning_rate": 1.9836342855284817e-05,
"loss": 1.0625,
"step": 493
},
{
"epoch": 0.3571945046999277,
"grad_norm": 0.856560468673706,
"learning_rate": 1.9835511228591214e-05,
"loss": 1.0711,
"step": 494
},
{
"epoch": 0.3579175704989154,
"grad_norm": 0.4791191816329956,
"learning_rate": 1.983467751180776e-05,
"loss": 1.0933,
"step": 495
},
{
"epoch": 0.3586406362979031,
"grad_norm": 0.41239994764328003,
"learning_rate": 1.983384170511163e-05,
"loss": 1.0406,
"step": 496
},
{
"epoch": 0.35936370209689084,
"grad_norm": 0.5610681772232056,
"learning_rate": 1.983300380868043e-05,
"loss": 1.1115,
"step": 497
},
{
"epoch": 0.3600867678958785,
"grad_norm": 0.46288248896598816,
"learning_rate": 1.9832163822692217e-05,
"loss": 0.975,
"step": 498
},
{
"epoch": 0.36080983369486624,
"grad_norm": 0.4447326958179474,
"learning_rate": 1.98313217473255e-05,
"loss": 1.1229,
"step": 499
},
{
"epoch": 0.3615328994938539,
"grad_norm": 0.3198843002319336,
"learning_rate": 1.9830477582759213e-05,
"loss": 1.1134,
"step": 500
},
{
"epoch": 0.36225596529284165,
"grad_norm": 0.5029221177101135,
"learning_rate": 1.9829631329172754e-05,
"loss": 1.0069,
"step": 501
},
{
"epoch": 0.36297903109182933,
"grad_norm": 0.3619605302810669,
"learning_rate": 1.982878298674595e-05,
"loss": 1.114,
"step": 502
},
{
"epoch": 0.36370209689081706,
"grad_norm": 0.9314847588539124,
"learning_rate": 1.9827932555659076e-05,
"loss": 1.1359,
"step": 503
},
{
"epoch": 0.3644251626898048,
"grad_norm": 0.47591856122016907,
"learning_rate": 1.9827080036092857e-05,
"loss": 0.9483,
"step": 504
},
{
"epoch": 0.36514822848879247,
"grad_norm": 0.4096466600894928,
"learning_rate": 1.9826225428228455e-05,
"loss": 1.1312,
"step": 505
},
{
"epoch": 0.3658712942877802,
"grad_norm": 0.4787607192993164,
"learning_rate": 1.982536873224748e-05,
"loss": 0.9659,
"step": 506
},
{
"epoch": 0.3665943600867679,
"grad_norm": 0.4249323308467865,
"learning_rate": 1.9824509948331983e-05,
"loss": 1.1238,
"step": 507
},
{
"epoch": 0.3673174258857556,
"grad_norm": 0.6485395431518555,
"learning_rate": 1.9823649076664456e-05,
"loss": 1.094,
"step": 508
},
{
"epoch": 0.3680404916847433,
"grad_norm": 0.38700076937675476,
"learning_rate": 1.982278611742784e-05,
"loss": 1.0641,
"step": 509
},
{
"epoch": 0.368763557483731,
"grad_norm": 0.33102619647979736,
"learning_rate": 1.9821921070805522e-05,
"loss": 1.0633,
"step": 510
},
{
"epoch": 0.36948662328271875,
"grad_norm": 0.36750614643096924,
"learning_rate": 1.982105393698132e-05,
"loss": 1.0691,
"step": 511
},
{
"epoch": 0.37020968908170643,
"grad_norm": 0.5264472365379333,
"learning_rate": 1.9820184716139513e-05,
"loss": 1.0395,
"step": 512
},
{
"epoch": 0.37093275488069416,
"grad_norm": 0.333700567483902,
"learning_rate": 1.9819313408464804e-05,
"loss": 1.0712,
"step": 513
},
{
"epoch": 0.37165582067968184,
"grad_norm": 0.3045554757118225,
"learning_rate": 1.9818440014142363e-05,
"loss": 1.0887,
"step": 514
},
{
"epoch": 0.37237888647866957,
"grad_norm": 0.4070206880569458,
"learning_rate": 1.9817564533357775e-05,
"loss": 0.9634,
"step": 515
},
{
"epoch": 0.37310195227765725,
"grad_norm": 0.3737180829048157,
"learning_rate": 1.9816686966297095e-05,
"loss": 1.0509,
"step": 516
},
{
"epoch": 0.373825018076645,
"grad_norm": 0.40559202432632446,
"learning_rate": 1.9815807313146803e-05,
"loss": 1.1261,
"step": 517
},
{
"epoch": 0.3745480838756327,
"grad_norm": 0.36162590980529785,
"learning_rate": 1.9814925574093836e-05,
"loss": 0.9911,
"step": 518
},
{
"epoch": 0.3752711496746204,
"grad_norm": 0.355801522731781,
"learning_rate": 1.981404174932556e-05,
"loss": 1.0928,
"step": 519
},
{
"epoch": 0.3759942154736081,
"grad_norm": 0.4543927311897278,
"learning_rate": 1.98131558390298e-05,
"loss": 1.0528,
"step": 520
},
{
"epoch": 0.3767172812725958,
"grad_norm": 0.3490901291370392,
"learning_rate": 1.981226784339481e-05,
"loss": 1.1732,
"step": 521
},
{
"epoch": 0.3774403470715835,
"grad_norm": 0.3499595820903778,
"learning_rate": 1.981137776260929e-05,
"loss": 1.0,
"step": 522
},
{
"epoch": 0.3781634128705712,
"grad_norm": 0.3718455731868744,
"learning_rate": 1.981048559686239e-05,
"loss": 1.0863,
"step": 523
},
{
"epoch": 0.37888647866955893,
"grad_norm": 0.5918817520141602,
"learning_rate": 1.9809591346343705e-05,
"loss": 1.0692,
"step": 524
},
{
"epoch": 0.3796095444685466,
"grad_norm": 0.32191401720046997,
"learning_rate": 1.980869501124326e-05,
"loss": 1.0125,
"step": 525
},
{
"epoch": 0.38033261026753434,
"grad_norm": 0.30539965629577637,
"learning_rate": 1.9807796591751535e-05,
"loss": 0.9828,
"step": 526
},
{
"epoch": 0.3810556760665221,
"grad_norm": 0.28572794795036316,
"learning_rate": 1.980689608805944e-05,
"loss": 1.0433,
"step": 527
},
{
"epoch": 0.38177874186550975,
"grad_norm": 0.4277295768260956,
"learning_rate": 1.980599350035834e-05,
"loss": 0.9482,
"step": 528
},
{
"epoch": 0.3825018076644975,
"grad_norm": 0.49444156885147095,
"learning_rate": 1.9805088828840043e-05,
"loss": 1.0394,
"step": 529
},
{
"epoch": 0.38322487346348516,
"grad_norm": 0.3143816888332367,
"learning_rate": 1.9804182073696793e-05,
"loss": 0.9569,
"step": 530
},
{
"epoch": 0.3839479392624729,
"grad_norm": 0.7650742530822754,
"learning_rate": 1.980327323512128e-05,
"loss": 0.8502,
"step": 531
},
{
"epoch": 0.38467100506146057,
"grad_norm": 0.4131964147090912,
"learning_rate": 1.9802362313306633e-05,
"loss": 1.0786,
"step": 532
},
{
"epoch": 0.3853940708604483,
"grad_norm": 0.4136810600757599,
"learning_rate": 1.9801449308446428e-05,
"loss": 1.1747,
"step": 533
},
{
"epoch": 0.38611713665943603,
"grad_norm": 0.7104756832122803,
"learning_rate": 1.980053422073469e-05,
"loss": 1.0974,
"step": 534
},
{
"epoch": 0.3868402024584237,
"grad_norm": 0.31901639699935913,
"learning_rate": 1.979961705036587e-05,
"loss": 1.0914,
"step": 535
},
{
"epoch": 0.38756326825741144,
"grad_norm": 0.4051487147808075,
"learning_rate": 1.9798697797534875e-05,
"loss": 0.9603,
"step": 536
},
{
"epoch": 0.3882863340563991,
"grad_norm": 0.45901933312416077,
"learning_rate": 1.9797776462437048e-05,
"loss": 1.02,
"step": 537
},
{
"epoch": 0.38900939985538685,
"grad_norm": 0.32152485847473145,
"learning_rate": 1.9796853045268177e-05,
"loss": 1.0367,
"step": 538
},
{
"epoch": 0.3897324656543745,
"grad_norm": 0.3409494161605835,
"learning_rate": 1.9795927546224495e-05,
"loss": 1.0584,
"step": 539
},
{
"epoch": 0.39045553145336226,
"grad_norm": 0.36208903789520264,
"learning_rate": 1.979499996550267e-05,
"loss": 1.1319,
"step": 540
},
{
"epoch": 0.39117859725235,
"grad_norm": 0.4814053177833557,
"learning_rate": 1.9794070303299824e-05,
"loss": 1.1161,
"step": 541
},
{
"epoch": 0.39190166305133767,
"grad_norm": 0.5428364276885986,
"learning_rate": 1.979313855981351e-05,
"loss": 1.0326,
"step": 542
},
{
"epoch": 0.3926247288503254,
"grad_norm": 0.336286336183548,
"learning_rate": 1.9792204735241726e-05,
"loss": 0.9927,
"step": 543
},
{
"epoch": 0.3933477946493131,
"grad_norm": 0.38685551285743713,
"learning_rate": 1.979126882978292e-05,
"loss": 1.0352,
"step": 544
},
{
"epoch": 0.3940708604483008,
"grad_norm": 0.4633883833885193,
"learning_rate": 1.9790330843635967e-05,
"loss": 1.0327,
"step": 545
},
{
"epoch": 0.3947939262472885,
"grad_norm": 0.35226261615753174,
"learning_rate": 1.97893907770002e-05,
"loss": 1.1259,
"step": 546
},
{
"epoch": 0.3955169920462762,
"grad_norm": 0.4218463897705078,
"learning_rate": 1.9788448630075385e-05,
"loss": 1.1796,
"step": 547
},
{
"epoch": 0.3962400578452639,
"grad_norm": 0.36684003472328186,
"learning_rate": 1.9787504403061733e-05,
"loss": 1.134,
"step": 548
},
{
"epoch": 0.3969631236442516,
"grad_norm": 0.3835614025592804,
"learning_rate": 1.97865580961599e-05,
"loss": 1.0669,
"step": 549
},
{
"epoch": 0.39768618944323936,
"grad_norm": 0.36046484112739563,
"learning_rate": 1.9785609709570973e-05,
"loss": 1.1683,
"step": 550
},
{
"epoch": 0.39840925524222703,
"grad_norm": 0.4382922649383545,
"learning_rate": 1.9784659243496492e-05,
"loss": 1.0883,
"step": 551
},
{
"epoch": 0.39913232104121477,
"grad_norm": 0.343426913022995,
"learning_rate": 1.9783706698138438e-05,
"loss": 1.1423,
"step": 552
},
{
"epoch": 0.39985538684020244,
"grad_norm": 0.4072953760623932,
"learning_rate": 1.9782752073699224e-05,
"loss": 1.1642,
"step": 553
},
{
"epoch": 0.4005784526391902,
"grad_norm": 0.3550209701061249,
"learning_rate": 1.978179537038172e-05,
"loss": 1.0956,
"step": 554
},
{
"epoch": 0.40130151843817785,
"grad_norm": 0.4303446412086487,
"learning_rate": 1.9780836588389225e-05,
"loss": 1.0257,
"step": 555
},
{
"epoch": 0.4020245842371656,
"grad_norm": 0.3410395681858063,
"learning_rate": 1.9779875727925487e-05,
"loss": 0.9585,
"step": 556
},
{
"epoch": 0.4027476500361533,
"grad_norm": 0.5033888816833496,
"learning_rate": 1.9778912789194692e-05,
"loss": 1.0376,
"step": 557
},
{
"epoch": 0.403470715835141,
"grad_norm": 0.4729475677013397,
"learning_rate": 1.9777947772401468e-05,
"loss": 1.1985,
"step": 558
},
{
"epoch": 0.4041937816341287,
"grad_norm": 0.36693134903907776,
"learning_rate": 1.9776980677750884e-05,
"loss": 1.1011,
"step": 559
},
{
"epoch": 0.4049168474331164,
"grad_norm": 0.49466729164123535,
"learning_rate": 1.9776011505448455e-05,
"loss": 0.9232,
"step": 560
},
{
"epoch": 0.40563991323210413,
"grad_norm": 0.3103843033313751,
"learning_rate": 1.9775040255700137e-05,
"loss": 0.9774,
"step": 561
},
{
"epoch": 0.4063629790310918,
"grad_norm": 0.3624059855937958,
"learning_rate": 1.9774066928712315e-05,
"loss": 1.0841,
"step": 562
},
{
"epoch": 0.40708604483007954,
"grad_norm": 0.4419246017932892,
"learning_rate": 1.9773091524691833e-05,
"loss": 1.1977,
"step": 563
},
{
"epoch": 0.4078091106290672,
"grad_norm": 0.43419817090034485,
"learning_rate": 1.9772114043845968e-05,
"loss": 1.0559,
"step": 564
},
{
"epoch": 0.40853217642805495,
"grad_norm": 0.5691271424293518,
"learning_rate": 1.9771134486382436e-05,
"loss": 1.1231,
"step": 565
},
{
"epoch": 0.4092552422270427,
"grad_norm": 0.366300493478775,
"learning_rate": 1.9770152852509403e-05,
"loss": 1.1711,
"step": 566
},
{
"epoch": 0.40997830802603036,
"grad_norm": 0.38072991371154785,
"learning_rate": 1.9769169142435463e-05,
"loss": 1.0633,
"step": 567
},
{
"epoch": 0.4107013738250181,
"grad_norm": 0.33428722620010376,
"learning_rate": 1.9768183356369666e-05,
"loss": 1.0056,
"step": 568
},
{
"epoch": 0.41142443962400577,
"grad_norm": 0.31342649459838867,
"learning_rate": 1.9767195494521493e-05,
"loss": 0.9913,
"step": 569
},
{
"epoch": 0.4121475054229935,
"grad_norm": 0.3302362561225891,
"learning_rate": 1.976620555710087e-05,
"loss": 1.0847,
"step": 570
},
{
"epoch": 0.4128705712219812,
"grad_norm": 0.41525211930274963,
"learning_rate": 1.976521354431816e-05,
"loss": 1.081,
"step": 571
},
{
"epoch": 0.4135936370209689,
"grad_norm": 0.39214402437210083,
"learning_rate": 1.976421945638417e-05,
"loss": 1.0983,
"step": 572
},
{
"epoch": 0.41431670281995664,
"grad_norm": 0.5267409086227417,
"learning_rate": 1.976322329351015e-05,
"loss": 1.0452,
"step": 573
},
{
"epoch": 0.4150397686189443,
"grad_norm": 0.40632006525993347,
"learning_rate": 1.976222505590779e-05,
"loss": 1.1461,
"step": 574
},
{
"epoch": 0.41576283441793205,
"grad_norm": 0.43246760964393616,
"learning_rate": 1.976122474378922e-05,
"loss": 1.1012,
"step": 575
},
{
"epoch": 0.4164859002169197,
"grad_norm": 0.6482414603233337,
"learning_rate": 1.976022235736701e-05,
"loss": 1.0559,
"step": 576
},
{
"epoch": 0.41720896601590746,
"grad_norm": 0.381939560174942,
"learning_rate": 1.9759217896854167e-05,
"loss": 1.0672,
"step": 577
},
{
"epoch": 0.41793203181489513,
"grad_norm": 0.32086628675460815,
"learning_rate": 1.9758211362464155e-05,
"loss": 1.1237,
"step": 578
},
{
"epoch": 0.41865509761388287,
"grad_norm": 0.5381978750228882,
"learning_rate": 1.9757202754410857e-05,
"loss": 1.1335,
"step": 579
},
{
"epoch": 0.4193781634128706,
"grad_norm": 0.3503972589969635,
"learning_rate": 1.9756192072908605e-05,
"loss": 1.0443,
"step": 580
},
{
"epoch": 0.4201012292118583,
"grad_norm": 0.3661491572856903,
"learning_rate": 1.975517931817218e-05,
"loss": 1.1533,
"step": 581
},
{
"epoch": 0.420824295010846,
"grad_norm": 0.36555972695350647,
"learning_rate": 1.9754164490416796e-05,
"loss": 1.1034,
"step": 582
},
{
"epoch": 0.4215473608098337,
"grad_norm": 0.7466145157814026,
"learning_rate": 1.975314758985811e-05,
"loss": 1.011,
"step": 583
},
{
"epoch": 0.4222704266088214,
"grad_norm": 0.4798685610294342,
"learning_rate": 1.975212861671221e-05,
"loss": 1.1509,
"step": 584
},
{
"epoch": 0.4229934924078091,
"grad_norm": 0.4475909471511841,
"learning_rate": 1.975110757119564e-05,
"loss": 1.0896,
"step": 585
},
{
"epoch": 0.4237165582067968,
"grad_norm": 0.6410030126571655,
"learning_rate": 1.9750084453525372e-05,
"loss": 1.0774,
"step": 586
},
{
"epoch": 0.4244396240057845,
"grad_norm": 0.34272611141204834,
"learning_rate": 1.9749059263918825e-05,
"loss": 1.1316,
"step": 587
},
{
"epoch": 0.42516268980477223,
"grad_norm": 0.3565758168697357,
"learning_rate": 1.9748032002593854e-05,
"loss": 1.0329,
"step": 588
},
{
"epoch": 0.42588575560375996,
"grad_norm": 0.40375301241874695,
"learning_rate": 1.9747002669768763e-05,
"loss": 0.9806,
"step": 589
},
{
"epoch": 0.42660882140274764,
"grad_norm": 0.2997436225414276,
"learning_rate": 1.9745971265662286e-05,
"loss": 1.0032,
"step": 590
},
{
"epoch": 0.42733188720173537,
"grad_norm": 0.36661413311958313,
"learning_rate": 1.9744937790493595e-05,
"loss": 1.0444,
"step": 591
},
{
"epoch": 0.42805495300072305,
"grad_norm": 0.5585591793060303,
"learning_rate": 1.974390224448232e-05,
"loss": 0.9029,
"step": 592
},
{
"epoch": 0.4287780187997108,
"grad_norm": 0.38664501905441284,
"learning_rate": 1.974286462784851e-05,
"loss": 1.0633,
"step": 593
},
{
"epoch": 0.42950108459869846,
"grad_norm": 0.6772089004516602,
"learning_rate": 1.9741824940812664e-05,
"loss": 1.1014,
"step": 594
},
{
"epoch": 0.4302241503976862,
"grad_norm": 0.42573368549346924,
"learning_rate": 1.9740783183595726e-05,
"loss": 0.8741,
"step": 595
},
{
"epoch": 0.4309472161966739,
"grad_norm": 0.6061074137687683,
"learning_rate": 1.973973935641907e-05,
"loss": 1.1427,
"step": 596
},
{
"epoch": 0.4316702819956616,
"grad_norm": 0.45339277386665344,
"learning_rate": 1.9738693459504514e-05,
"loss": 1.0578,
"step": 597
},
{
"epoch": 0.43239334779464933,
"grad_norm": 0.4401942193508148,
"learning_rate": 1.9737645493074313e-05,
"loss": 1.0383,
"step": 598
},
{
"epoch": 0.433116413593637,
"grad_norm": 0.37971097230911255,
"learning_rate": 1.9736595457351167e-05,
"loss": 1.1324,
"step": 599
},
{
"epoch": 0.43383947939262474,
"grad_norm": 0.5753974914550781,
"learning_rate": 1.973554335255822e-05,
"loss": 1.1619,
"step": 600
},
{
"epoch": 0.4345625451916124,
"grad_norm": 0.39130666851997375,
"learning_rate": 1.973448917891904e-05,
"loss": 1.1005,
"step": 601
},
{
"epoch": 0.43528561099060015,
"grad_norm": 0.48390281200408936,
"learning_rate": 1.9733432936657643e-05,
"loss": 1.0415,
"step": 602
},
{
"epoch": 0.4360086767895879,
"grad_norm": 0.5121054649353027,
"learning_rate": 1.973237462599849e-05,
"loss": 1.0392,
"step": 603
},
{
"epoch": 0.43673174258857556,
"grad_norm": 0.4659397602081299,
"learning_rate": 1.9731314247166474e-05,
"loss": 1.0161,
"step": 604
},
{
"epoch": 0.4374548083875633,
"grad_norm": 0.5566121339797974,
"learning_rate": 1.973025180038693e-05,
"loss": 1.2346,
"step": 605
},
{
"epoch": 0.43817787418655096,
"grad_norm": 0.4821203052997589,
"learning_rate": 1.9729187285885636e-05,
"loss": 1.0823,
"step": 606
},
{
"epoch": 0.4389009399855387,
"grad_norm": 0.47735702991485596,
"learning_rate": 1.9728120703888804e-05,
"loss": 1.0736,
"step": 607
},
{
"epoch": 0.4396240057845264,
"grad_norm": 0.34330862760543823,
"learning_rate": 1.9727052054623086e-05,
"loss": 1.1462,
"step": 608
},
{
"epoch": 0.4403470715835141,
"grad_norm": 0.36775317788124084,
"learning_rate": 1.972598133831558e-05,
"loss": 1.1986,
"step": 609
},
{
"epoch": 0.4410701373825018,
"grad_norm": 0.329261839389801,
"learning_rate": 1.972490855519381e-05,
"loss": 1.1152,
"step": 610
},
{
"epoch": 0.4417932031814895,
"grad_norm": 0.3638715445995331,
"learning_rate": 1.9723833705485752e-05,
"loss": 0.9042,
"step": 611
},
{
"epoch": 0.44251626898047725,
"grad_norm": 0.3313756585121155,
"learning_rate": 1.9722756789419816e-05,
"loss": 1.1223,
"step": 612
},
{
"epoch": 0.4432393347794649,
"grad_norm": 0.46972256898880005,
"learning_rate": 1.9721677807224853e-05,
"loss": 0.9796,
"step": 613
},
{
"epoch": 0.44396240057845265,
"grad_norm": 0.37405598163604736,
"learning_rate": 1.9720596759130146e-05,
"loss": 0.9614,
"step": 614
},
{
"epoch": 0.44468546637744033,
"grad_norm": 0.4029143154621124,
"learning_rate": 1.9719513645365426e-05,
"loss": 1.1846,
"step": 615
},
{
"epoch": 0.44540853217642806,
"grad_norm": 0.3397390842437744,
"learning_rate": 1.9718428466160863e-05,
"loss": 1.2796,
"step": 616
},
{
"epoch": 0.44613159797541574,
"grad_norm": 0.43354395031929016,
"learning_rate": 1.9717341221747056e-05,
"loss": 1.1272,
"step": 617
},
{
"epoch": 0.44685466377440347,
"grad_norm": 0.3265979290008545,
"learning_rate": 1.9716251912355053e-05,
"loss": 0.9609,
"step": 618
},
{
"epoch": 0.4475777295733912,
"grad_norm": 0.4178304374217987,
"learning_rate": 1.9715160538216337e-05,
"loss": 0.9845,
"step": 619
},
{
"epoch": 0.4483007953723789,
"grad_norm": 0.4773651957511902,
"learning_rate": 1.971406709956283e-05,
"loss": 1.1367,
"step": 620
},
{
"epoch": 0.4490238611713666,
"grad_norm": 0.5595555901527405,
"learning_rate": 1.9712971596626894e-05,
"loss": 1.1154,
"step": 621
},
{
"epoch": 0.4497469269703543,
"grad_norm": 0.5785884261131287,
"learning_rate": 1.971187402964132e-05,
"loss": 1.2254,
"step": 622
},
{
"epoch": 0.450469992769342,
"grad_norm": 0.38726550340652466,
"learning_rate": 1.9710774398839354e-05,
"loss": 1.0962,
"step": 623
},
{
"epoch": 0.4511930585683297,
"grad_norm": 0.3972480893135071,
"learning_rate": 1.970967270445467e-05,
"loss": 1.0462,
"step": 624
},
{
"epoch": 0.45191612436731743,
"grad_norm": 0.3542553186416626,
"learning_rate": 1.970856894672139e-05,
"loss": 1.139,
"step": 625
},
{
"epoch": 0.45263919016630516,
"grad_norm": 0.3438204526901245,
"learning_rate": 1.9707463125874052e-05,
"loss": 1.0925,
"step": 626
},
{
"epoch": 0.45336225596529284,
"grad_norm": 0.3725246489048004,
"learning_rate": 1.9706355242147656e-05,
"loss": 1.1189,
"step": 627
},
{
"epoch": 0.45408532176428057,
"grad_norm": 0.5289852619171143,
"learning_rate": 1.9705245295777636e-05,
"loss": 1.0969,
"step": 628
},
{
"epoch": 0.45480838756326825,
"grad_norm": 0.3476713001728058,
"learning_rate": 1.970413328699986e-05,
"loss": 1.1203,
"step": 629
},
{
"epoch": 0.455531453362256,
"grad_norm": 0.4541453719139099,
"learning_rate": 1.9703019216050627e-05,
"loss": 0.9669,
"step": 630
},
{
"epoch": 0.45625451916124365,
"grad_norm": 0.3815220296382904,
"learning_rate": 1.9701903083166692e-05,
"loss": 1.0995,
"step": 631
},
{
"epoch": 0.4569775849602314,
"grad_norm": 0.5542870759963989,
"learning_rate": 1.970078488858523e-05,
"loss": 1.069,
"step": 632
},
{
"epoch": 0.45770065075921906,
"grad_norm": 0.4734939634799957,
"learning_rate": 1.9699664632543868e-05,
"loss": 1.0801,
"step": 633
},
{
"epoch": 0.4584237165582068,
"grad_norm": 0.45855480432510376,
"learning_rate": 1.9698542315280658e-05,
"loss": 1.1013,
"step": 634
},
{
"epoch": 0.4591467823571945,
"grad_norm": 0.48604539036750793,
"learning_rate": 1.9697417937034106e-05,
"loss": 1.0505,
"step": 635
},
{
"epoch": 0.4598698481561822,
"grad_norm": 0.4221401810646057,
"learning_rate": 1.9696291498043144e-05,
"loss": 1.0743,
"step": 636
},
{
"epoch": 0.46059291395516994,
"grad_norm": 0.39816153049468994,
"learning_rate": 1.9695162998547145e-05,
"loss": 1.0368,
"step": 637
},
{
"epoch": 0.4613159797541576,
"grad_norm": 0.3210352957248688,
"learning_rate": 1.969403243878592e-05,
"loss": 0.9725,
"step": 638
},
{
"epoch": 0.46203904555314534,
"grad_norm": 0.332942932844162,
"learning_rate": 1.969289981899972e-05,
"loss": 1.0431,
"step": 639
},
{
"epoch": 0.462762111352133,
"grad_norm": 0.5648030638694763,
"learning_rate": 1.9691765139429227e-05,
"loss": 1.0316,
"step": 640
},
{
"epoch": 0.46348517715112075,
"grad_norm": 0.3773210048675537,
"learning_rate": 1.969062840031557e-05,
"loss": 0.9669,
"step": 641
},
{
"epoch": 0.4642082429501085,
"grad_norm": 0.37246161699295044,
"learning_rate": 1.968948960190031e-05,
"loss": 1.0267,
"step": 642
},
{
"epoch": 0.46493130874909616,
"grad_norm": 0.30961740016937256,
"learning_rate": 1.9688348744425443e-05,
"loss": 1.0467,
"step": 643
},
{
"epoch": 0.4656543745480839,
"grad_norm": 0.4040377736091614,
"learning_rate": 1.968720582813341e-05,
"loss": 1.0727,
"step": 644
},
{
"epoch": 0.46637744034707157,
"grad_norm": 0.37091973423957825,
"learning_rate": 1.9686060853267088e-05,
"loss": 1.1161,
"step": 645
},
{
"epoch": 0.4671005061460593,
"grad_norm": 0.33641329407691956,
"learning_rate": 1.9684913820069785e-05,
"loss": 0.9528,
"step": 646
},
{
"epoch": 0.467823571945047,
"grad_norm": 0.36881205439567566,
"learning_rate": 1.9683764728785255e-05,
"loss": 1.0615,
"step": 647
},
{
"epoch": 0.4685466377440347,
"grad_norm": 0.37982848286628723,
"learning_rate": 1.968261357965768e-05,
"loss": 1.0018,
"step": 648
},
{
"epoch": 0.4692697035430224,
"grad_norm": 0.4756353497505188,
"learning_rate": 1.9681460372931688e-05,
"loss": 1.0773,
"step": 649
},
{
"epoch": 0.4699927693420101,
"grad_norm": 0.4749247133731842,
"learning_rate": 1.9680305108852335e-05,
"loss": 0.9998,
"step": 650
},
{
"epoch": 0.47071583514099785,
"grad_norm": 0.309193879365921,
"learning_rate": 1.9679147787665128e-05,
"loss": 1.0227,
"step": 651
},
{
"epoch": 0.47143890093998553,
"grad_norm": 0.44455206394195557,
"learning_rate": 1.9677988409615996e-05,
"loss": 1.1575,
"step": 652
},
{
"epoch": 0.47216196673897326,
"grad_norm": 0.3658543527126312,
"learning_rate": 1.9676826974951316e-05,
"loss": 1.1213,
"step": 653
},
{
"epoch": 0.47288503253796094,
"grad_norm": 0.3084392249584198,
"learning_rate": 1.9675663483917896e-05,
"loss": 0.9861,
"step": 654
},
{
"epoch": 0.47360809833694867,
"grad_norm": 0.39952704310417175,
"learning_rate": 1.9674497936762984e-05,
"loss": 1.1173,
"step": 655
},
{
"epoch": 0.47433116413593635,
"grad_norm": 0.37585368752479553,
"learning_rate": 1.9673330333734263e-05,
"loss": 1.0684,
"step": 656
},
{
"epoch": 0.4750542299349241,
"grad_norm": 0.4608975946903229,
"learning_rate": 1.9672160675079857e-05,
"loss": 1.0324,
"step": 657
},
{
"epoch": 0.4757772957339118,
"grad_norm": 0.3381264805793762,
"learning_rate": 1.9670988961048318e-05,
"loss": 1.0594,
"step": 658
},
{
"epoch": 0.4765003615328995,
"grad_norm": 0.402926504611969,
"learning_rate": 1.9669815191888647e-05,
"loss": 1.0805,
"step": 659
},
{
"epoch": 0.4772234273318872,
"grad_norm": 0.4931265115737915,
"learning_rate": 1.966863936785027e-05,
"loss": 0.9502,
"step": 660
},
{
"epoch": 0.4779464931308749,
"grad_norm": 0.3112112879753113,
"learning_rate": 1.9667461489183056e-05,
"loss": 1.081,
"step": 661
},
{
"epoch": 0.4786695589298626,
"grad_norm": 0.4174824059009552,
"learning_rate": 1.9666281556137313e-05,
"loss": 0.9463,
"step": 662
},
{
"epoch": 0.4793926247288503,
"grad_norm": 0.45544588565826416,
"learning_rate": 1.9665099568963777e-05,
"loss": 0.9999,
"step": 663
},
{
"epoch": 0.48011569052783803,
"grad_norm": 0.3407338559627533,
"learning_rate": 1.9663915527913628e-05,
"loss": 1.0472,
"step": 664
},
{
"epoch": 0.48083875632682577,
"grad_norm": 0.4073576033115387,
"learning_rate": 1.9662729433238477e-05,
"loss": 1.1324,
"step": 665
},
{
"epoch": 0.48156182212581344,
"grad_norm": 0.3806203305721283,
"learning_rate": 1.966154128519038e-05,
"loss": 0.9075,
"step": 666
},
{
"epoch": 0.4822848879248012,
"grad_norm": 0.8510825634002686,
"learning_rate": 1.966035108402182e-05,
"loss": 1.0489,
"step": 667
},
{
"epoch": 0.48300795372378885,
"grad_norm": 0.36897632479667664,
"learning_rate": 1.965915882998572e-05,
"loss": 1.0699,
"step": 668
},
{
"epoch": 0.4837310195227766,
"grad_norm": 0.6465381979942322,
"learning_rate": 1.9657964523335443e-05,
"loss": 1.1132,
"step": 669
},
{
"epoch": 0.48445408532176426,
"grad_norm": 0.5258365273475647,
"learning_rate": 1.965676816432478e-05,
"loss": 1.0174,
"step": 670
},
{
"epoch": 0.485177151120752,
"grad_norm": 0.5828375816345215,
"learning_rate": 1.9655569753207962e-05,
"loss": 1.106,
"step": 671
},
{
"epoch": 0.48590021691973967,
"grad_norm": 0.32821497321128845,
"learning_rate": 1.965436929023966e-05,
"loss": 0.9611,
"step": 672
},
{
"epoch": 0.4866232827187274,
"grad_norm": 0.37569311261177063,
"learning_rate": 1.9653166775674976e-05,
"loss": 1.0434,
"step": 673
},
{
"epoch": 0.48734634851771513,
"grad_norm": 0.41483476758003235,
"learning_rate": 1.965196220976945e-05,
"loss": 1.1067,
"step": 674
},
{
"epoch": 0.4880694143167028,
"grad_norm": 0.5044158697128296,
"learning_rate": 1.965075559277906e-05,
"loss": 1.1482,
"step": 675
},
{
"epoch": 0.48879248011569054,
"grad_norm": 0.4230242371559143,
"learning_rate": 1.9649546924960217e-05,
"loss": 0.8987,
"step": 676
},
{
"epoch": 0.4895155459146782,
"grad_norm": 0.5350583791732788,
"learning_rate": 1.964833620656976e-05,
"loss": 1.0624,
"step": 677
},
{
"epoch": 0.49023861171366595,
"grad_norm": 0.3142975866794586,
"learning_rate": 1.9647123437864985e-05,
"loss": 1.1541,
"step": 678
},
{
"epoch": 0.4909616775126536,
"grad_norm": 0.44349947571754456,
"learning_rate": 1.96459086191036e-05,
"loss": 1.1184,
"step": 679
},
{
"epoch": 0.49168474331164136,
"grad_norm": 0.36088117957115173,
"learning_rate": 1.964469175054377e-05,
"loss": 1.0658,
"step": 680
},
{
"epoch": 0.4924078091106291,
"grad_norm": 0.37263569235801697,
"learning_rate": 1.964347283244407e-05,
"loss": 0.9996,
"step": 681
},
{
"epoch": 0.49313087490961677,
"grad_norm": 0.34794577956199646,
"learning_rate": 1.964225186506354e-05,
"loss": 1.119,
"step": 682
},
{
"epoch": 0.4938539407086045,
"grad_norm": 0.3667242228984833,
"learning_rate": 1.9641028848661633e-05,
"loss": 1.1527,
"step": 683
},
{
"epoch": 0.4945770065075922,
"grad_norm": 0.3178524076938629,
"learning_rate": 1.963980378349825e-05,
"loss": 1.1277,
"step": 684
},
{
"epoch": 0.4953000723065799,
"grad_norm": 0.669657289981842,
"learning_rate": 1.963857666983372e-05,
"loss": 1.026,
"step": 685
},
{
"epoch": 0.4960231381055676,
"grad_norm": 0.4365706145763397,
"learning_rate": 1.963734750792881e-05,
"loss": 1.0566,
"step": 686
},
{
"epoch": 0.4967462039045553,
"grad_norm": 0.3377775549888611,
"learning_rate": 1.963611629804472e-05,
"loss": 1.0243,
"step": 687
},
{
"epoch": 0.49746926970354305,
"grad_norm": 0.49133753776550293,
"learning_rate": 1.9634883040443093e-05,
"loss": 1.0347,
"step": 688
},
{
"epoch": 0.4981923355025307,
"grad_norm": 0.46029427647590637,
"learning_rate": 1.9633647735386002e-05,
"loss": 1.2015,
"step": 689
},
{
"epoch": 0.49891540130151846,
"grad_norm": 0.4478205442428589,
"learning_rate": 1.9632410383135946e-05,
"loss": 1.2785,
"step": 690
},
{
"epoch": 0.49963846710050613,
"grad_norm": 0.44710350036621094,
"learning_rate": 1.9631170983955878e-05,
"loss": 1.0269,
"step": 691
},
{
"epoch": 0.5003615328994938,
"grad_norm": 0.5935525298118591,
"learning_rate": 1.9629929538109175e-05,
"loss": 0.8842,
"step": 692
},
{
"epoch": 0.5003615328994938,
"eval_loss": 1.0690364837646484,
"eval_runtime": 669.5491,
"eval_samples_per_second": 3.48,
"eval_steps_per_second": 0.871,
"step": 692
},
{
"epoch": 0.5010845986984815,
"grad_norm": 0.7159201502799988,
"learning_rate": 1.962868604585964e-05,
"loss": 1.0409,
"step": 693
},
{
"epoch": 0.5018076644974693,
"grad_norm": 0.5066662430763245,
"learning_rate": 1.962744050747153e-05,
"loss": 1.0734,
"step": 694
},
{
"epoch": 0.502530730296457,
"grad_norm": 0.44471684098243713,
"learning_rate": 1.9626192923209524e-05,
"loss": 1.1123,
"step": 695
},
{
"epoch": 0.5032537960954447,
"grad_norm": 0.5092771053314209,
"learning_rate": 1.962494329333874e-05,
"loss": 1.0229,
"step": 696
},
{
"epoch": 0.5039768618944324,
"grad_norm": 0.39915162324905396,
"learning_rate": 1.962369161812473e-05,
"loss": 1.0708,
"step": 697
},
{
"epoch": 0.5046999276934201,
"grad_norm": 0.4196009039878845,
"learning_rate": 1.962243789783348e-05,
"loss": 0.9515,
"step": 698
},
{
"epoch": 0.5054229934924078,
"grad_norm": 0.3307042717933655,
"learning_rate": 1.962118213273141e-05,
"loss": 1.021,
"step": 699
},
{
"epoch": 0.5061460592913956,
"grad_norm": 0.40860000252723694,
"learning_rate": 1.961992432308538e-05,
"loss": 1.098,
"step": 700
},
{
"epoch": 0.5068691250903832,
"grad_norm": 0.5225731134414673,
"learning_rate": 1.9618664469162673e-05,
"loss": 1.1183,
"step": 701
},
{
"epoch": 0.5075921908893709,
"grad_norm": 0.3705964684486389,
"learning_rate": 1.9617402571231022e-05,
"loss": 1.0388,
"step": 702
},
{
"epoch": 0.5083152566883586,
"grad_norm": 0.48947831988334656,
"learning_rate": 1.9616138629558577e-05,
"loss": 1.1357,
"step": 703
},
{
"epoch": 0.5090383224873464,
"grad_norm": 0.4139689803123474,
"learning_rate": 1.9614872644413943e-05,
"loss": 1.1402,
"step": 704
},
{
"epoch": 0.5097613882863341,
"grad_norm": 0.5978605151176453,
"learning_rate": 1.9613604616066137e-05,
"loss": 1.0413,
"step": 705
},
{
"epoch": 0.5104844540853217,
"grad_norm": 0.5595225095748901,
"learning_rate": 1.961233454478462e-05,
"loss": 0.9795,
"step": 706
},
{
"epoch": 0.5112075198843095,
"grad_norm": 0.5151244401931763,
"learning_rate": 1.9611062430839296e-05,
"loss": 0.9897,
"step": 707
},
{
"epoch": 0.5119305856832972,
"grad_norm": 0.43460798263549805,
"learning_rate": 1.960978827450049e-05,
"loss": 1.0279,
"step": 708
},
{
"epoch": 0.5126536514822849,
"grad_norm": 0.3416450023651123,
"learning_rate": 1.9608512076038964e-05,
"loss": 1.0831,
"step": 709
},
{
"epoch": 0.5133767172812725,
"grad_norm": 0.4662010371685028,
"learning_rate": 1.960723383572592e-05,
"loss": 1.0736,
"step": 710
},
{
"epoch": 0.5140997830802603,
"grad_norm": 0.3611615300178528,
"learning_rate": 1.9605953553832987e-05,
"loss": 0.9366,
"step": 711
},
{
"epoch": 0.514822848879248,
"grad_norm": 0.3886420726776123,
"learning_rate": 1.9604671230632234e-05,
"loss": 1.1591,
"step": 712
},
{
"epoch": 0.5155459146782357,
"grad_norm": 0.45299410820007324,
"learning_rate": 1.9603386866396155e-05,
"loss": 1.0887,
"step": 713
},
{
"epoch": 0.5162689804772235,
"grad_norm": 0.3800199627876282,
"learning_rate": 1.960210046139769e-05,
"loss": 1.1232,
"step": 714
},
{
"epoch": 0.5169920462762111,
"grad_norm": 0.5120952129364014,
"learning_rate": 1.9600812015910203e-05,
"loss": 1.1163,
"step": 715
},
{
"epoch": 0.5177151120751988,
"grad_norm": 0.5335928797721863,
"learning_rate": 1.9599521530207492e-05,
"loss": 0.8892,
"step": 716
},
{
"epoch": 0.5184381778741866,
"grad_norm": 0.5247392058372498,
"learning_rate": 1.959822900456379e-05,
"loss": 1.1459,
"step": 717
},
{
"epoch": 0.5191612436731743,
"grad_norm": 0.3226061165332794,
"learning_rate": 1.9596934439253768e-05,
"loss": 0.9743,
"step": 718
},
{
"epoch": 0.519884309472162,
"grad_norm": 0.616631031036377,
"learning_rate": 1.9595637834552524e-05,
"loss": 1.1154,
"step": 719
},
{
"epoch": 0.5206073752711496,
"grad_norm": 0.48514634370803833,
"learning_rate": 1.9594339190735594e-05,
"loss": 1.0391,
"step": 720
},
{
"epoch": 0.5213304410701374,
"grad_norm": 0.36247485876083374,
"learning_rate": 1.959303850807895e-05,
"loss": 0.85,
"step": 721
},
{
"epoch": 0.5220535068691251,
"grad_norm": 0.36446431279182434,
"learning_rate": 1.9591735786858985e-05,
"loss": 1.063,
"step": 722
},
{
"epoch": 0.5227765726681128,
"grad_norm": 0.44948527216911316,
"learning_rate": 1.9590431027352533e-05,
"loss": 1.032,
"step": 723
},
{
"epoch": 0.5234996384671005,
"grad_norm": 0.612940788269043,
"learning_rate": 1.958912422983687e-05,
"loss": 1.0337,
"step": 724
},
{
"epoch": 0.5242227042660882,
"grad_norm": 0.48580238223075867,
"learning_rate": 1.958781539458969e-05,
"loss": 1.0932,
"step": 725
},
{
"epoch": 0.5249457700650759,
"grad_norm": 0.4745420813560486,
"learning_rate": 1.9586504521889122e-05,
"loss": 1.2802,
"step": 726
},
{
"epoch": 0.5256688358640637,
"grad_norm": 0.35681384801864624,
"learning_rate": 1.9585191612013745e-05,
"loss": 1.1219,
"step": 727
},
{
"epoch": 0.5263919016630514,
"grad_norm": 0.4016458988189697,
"learning_rate": 1.9583876665242548e-05,
"loss": 1.0063,
"step": 728
},
{
"epoch": 0.527114967462039,
"grad_norm": 0.43212026357650757,
"learning_rate": 1.9582559681854962e-05,
"loss": 1.1472,
"step": 729
},
{
"epoch": 0.5278380332610267,
"grad_norm": 0.4073551595211029,
"learning_rate": 1.958124066213086e-05,
"loss": 0.9998,
"step": 730
},
{
"epoch": 0.5285610990600145,
"grad_norm": 0.4123099446296692,
"learning_rate": 1.957991960635053e-05,
"loss": 1.0065,
"step": 731
},
{
"epoch": 0.5292841648590022,
"grad_norm": 0.3612479567527771,
"learning_rate": 1.9578596514794714e-05,
"loss": 1.1218,
"step": 732
},
{
"epoch": 0.5300072306579898,
"grad_norm": 0.5187839865684509,
"learning_rate": 1.957727138774456e-05,
"loss": 1.0317,
"step": 733
},
{
"epoch": 0.5307302964569776,
"grad_norm": 0.3550787568092346,
"learning_rate": 1.957594422548168e-05,
"loss": 1.0982,
"step": 734
},
{
"epoch": 0.5314533622559653,
"grad_norm": 0.38324612379074097,
"learning_rate": 1.957461502828809e-05,
"loss": 0.9884,
"step": 735
},
{
"epoch": 0.532176428054953,
"grad_norm": 0.9438675045967102,
"learning_rate": 1.957328379644625e-05,
"loss": 1.1052,
"step": 736
},
{
"epoch": 0.5328994938539408,
"grad_norm": 0.35722121596336365,
"learning_rate": 1.9571950530239062e-05,
"loss": 0.9642,
"step": 737
},
{
"epoch": 0.5336225596529284,
"grad_norm": 0.42087095975875854,
"learning_rate": 1.9570615229949844e-05,
"loss": 1.1394,
"step": 738
},
{
"epoch": 0.5343456254519161,
"grad_norm": 0.4152733087539673,
"learning_rate": 1.956927789586235e-05,
"loss": 1.0182,
"step": 739
},
{
"epoch": 0.5350686912509038,
"grad_norm": 0.4146084785461426,
"learning_rate": 1.9567938528260778e-05,
"loss": 0.981,
"step": 740
},
{
"epoch": 0.5357917570498916,
"grad_norm": 0.32588517665863037,
"learning_rate": 1.9566597127429746e-05,
"loss": 1.0129,
"step": 741
},
{
"epoch": 0.5365148228488793,
"grad_norm": 0.3670799136161804,
"learning_rate": 1.9565253693654307e-05,
"loss": 1.0014,
"step": 742
},
{
"epoch": 0.5372378886478669,
"grad_norm": 0.41809889674186707,
"learning_rate": 1.9563908227219945e-05,
"loss": 0.9734,
"step": 743
},
{
"epoch": 0.5379609544468547,
"grad_norm": 0.37623125314712524,
"learning_rate": 1.956256072841258e-05,
"loss": 1.0281,
"step": 744
},
{
"epoch": 0.5386840202458424,
"grad_norm": 0.4615330100059509,
"learning_rate": 1.9561211197518564e-05,
"loss": 0.8803,
"step": 745
},
{
"epoch": 0.5394070860448301,
"grad_norm": 0.3422069847583771,
"learning_rate": 1.9559859634824675e-05,
"loss": 1.0829,
"step": 746
},
{
"epoch": 0.5401301518438177,
"grad_norm": 0.5453998446464539,
"learning_rate": 1.9558506040618122e-05,
"loss": 1.137,
"step": 747
},
{
"epoch": 0.5408532176428055,
"grad_norm": 0.5272181034088135,
"learning_rate": 1.9557150415186558e-05,
"loss": 1.1079,
"step": 748
},
{
"epoch": 0.5415762834417932,
"grad_norm": 0.5351109504699707,
"learning_rate": 1.9555792758818052e-05,
"loss": 1.0158,
"step": 749
},
{
"epoch": 0.5422993492407809,
"grad_norm": 0.45870500802993774,
"learning_rate": 1.9554433071801117e-05,
"loss": 1.1699,
"step": 750
},
{
"epoch": 0.5430224150397687,
"grad_norm": 0.42589834332466125,
"learning_rate": 1.9553071354424692e-05,
"loss": 1.0275,
"step": 751
},
{
"epoch": 0.5437454808387563,
"grad_norm": 0.43729254603385925,
"learning_rate": 1.955170760697815e-05,
"loss": 0.9456,
"step": 752
},
{
"epoch": 0.544468546637744,
"grad_norm": 0.39081788063049316,
"learning_rate": 1.9550341829751283e-05,
"loss": 1.0078,
"step": 753
},
{
"epoch": 0.5451916124367318,
"grad_norm": 0.47970667481422424,
"learning_rate": 1.9548974023034337e-05,
"loss": 1.2289,
"step": 754
},
{
"epoch": 0.5459146782357195,
"grad_norm": 0.3771878182888031,
"learning_rate": 1.9547604187117974e-05,
"loss": 0.9818,
"step": 755
},
{
"epoch": 0.5466377440347071,
"grad_norm": 0.3280777335166931,
"learning_rate": 1.9546232322293285e-05,
"loss": 1.0503,
"step": 756
},
{
"epoch": 0.5473608098336948,
"grad_norm": 0.5649731159210205,
"learning_rate": 1.95448584288518e-05,
"loss": 1.0285,
"step": 757
},
{
"epoch": 0.5480838756326826,
"grad_norm": 0.6320711374282837,
"learning_rate": 1.9543482507085484e-05,
"loss": 1.2219,
"step": 758
},
{
"epoch": 0.5488069414316703,
"grad_norm": 0.36972150206565857,
"learning_rate": 1.9542104557286715e-05,
"loss": 1.0396,
"step": 759
},
{
"epoch": 0.549530007230658,
"grad_norm": 0.6155106425285339,
"learning_rate": 1.9540724579748323e-05,
"loss": 0.9265,
"step": 760
},
{
"epoch": 0.5502530730296457,
"grad_norm": 0.3946453630924225,
"learning_rate": 1.9539342574763554e-05,
"loss": 1.2332,
"step": 761
},
{
"epoch": 0.5509761388286334,
"grad_norm": 0.31569650769233704,
"learning_rate": 1.953795854262609e-05,
"loss": 1.0735,
"step": 762
},
{
"epoch": 0.5516992046276211,
"grad_norm": 0.35076481103897095,
"learning_rate": 1.9536572483630048e-05,
"loss": 1.0889,
"step": 763
},
{
"epoch": 0.5524222704266089,
"grad_norm": 0.39931586384773254,
"learning_rate": 1.953518439806997e-05,
"loss": 0.904,
"step": 764
},
{
"epoch": 0.5531453362255966,
"grad_norm": 0.394999623298645,
"learning_rate": 1.9533794286240828e-05,
"loss": 1.0382,
"step": 765
},
{
"epoch": 0.5538684020245842,
"grad_norm": 0.3732694089412689,
"learning_rate": 1.953240214843803e-05,
"loss": 1.0024,
"step": 766
},
{
"epoch": 0.5545914678235719,
"grad_norm": 0.4705665409564972,
"learning_rate": 1.9531007984957408e-05,
"loss": 1.0037,
"step": 767
},
{
"epoch": 0.5553145336225597,
"grad_norm": 0.3279126286506653,
"learning_rate": 1.9529611796095232e-05,
"loss": 0.9385,
"step": 768
},
{
"epoch": 0.5560375994215474,
"grad_norm": 0.5398396849632263,
"learning_rate": 1.95282135821482e-05,
"loss": 1.0472,
"step": 769
},
{
"epoch": 0.556760665220535,
"grad_norm": 0.5406724810600281,
"learning_rate": 1.952681334341343e-05,
"loss": 1.0669,
"step": 770
},
{
"epoch": 0.5574837310195228,
"grad_norm": 0.4217100441455841,
"learning_rate": 1.952541108018849e-05,
"loss": 1.1648,
"step": 771
},
{
"epoch": 0.5582067968185105,
"grad_norm": 0.5078068375587463,
"learning_rate": 1.9524006792771354e-05,
"loss": 1.1129,
"step": 772
},
{
"epoch": 0.5589298626174982,
"grad_norm": 0.39175429940223694,
"learning_rate": 1.952260048146045e-05,
"loss": 1.1255,
"step": 773
},
{
"epoch": 0.559652928416486,
"grad_norm": 0.43805810809135437,
"learning_rate": 1.9521192146554623e-05,
"loss": 1.1986,
"step": 774
},
{
"epoch": 0.5603759942154736,
"grad_norm": 0.3366648554801941,
"learning_rate": 1.9519781788353148e-05,
"loss": 1.1253,
"step": 775
},
{
"epoch": 0.5610990600144613,
"grad_norm": 0.35892999172210693,
"learning_rate": 1.9518369407155732e-05,
"loss": 0.998,
"step": 776
},
{
"epoch": 0.561822125813449,
"grad_norm": 0.4056641757488251,
"learning_rate": 1.9516955003262517e-05,
"loss": 0.9725,
"step": 777
},
{
"epoch": 0.5625451916124368,
"grad_norm": 0.3325026333332062,
"learning_rate": 1.9515538576974067e-05,
"loss": 1.1256,
"step": 778
},
{
"epoch": 0.5632682574114244,
"grad_norm": 0.4020673930644989,
"learning_rate": 1.951412012859138e-05,
"loss": 1.0631,
"step": 779
},
{
"epoch": 0.5639913232104121,
"grad_norm": 0.3648395836353302,
"learning_rate": 1.9512699658415882e-05,
"loss": 1.1435,
"step": 780
},
{
"epoch": 0.5647143890093999,
"grad_norm": 0.35294806957244873,
"learning_rate": 1.9511277166749425e-05,
"loss": 0.9151,
"step": 781
},
{
"epoch": 0.5654374548083876,
"grad_norm": 0.4098382592201233,
"learning_rate": 1.95098526538943e-05,
"loss": 0.9072,
"step": 782
},
{
"epoch": 0.5661605206073753,
"grad_norm": 0.377547025680542,
"learning_rate": 1.950842612015322e-05,
"loss": 0.8726,
"step": 783
},
{
"epoch": 0.5668835864063629,
"grad_norm": 0.42288947105407715,
"learning_rate": 1.9506997565829335e-05,
"loss": 1.0385,
"step": 784
},
{
"epoch": 0.5676066522053507,
"grad_norm": 0.39082857966423035,
"learning_rate": 1.9505566991226214e-05,
"loss": 0.9935,
"step": 785
},
{
"epoch": 0.5683297180043384,
"grad_norm": 0.3439696729183197,
"learning_rate": 1.950413439664786e-05,
"loss": 1.1061,
"step": 786
},
{
"epoch": 0.5690527838033261,
"grad_norm": 0.4018382728099823,
"learning_rate": 1.950269978239871e-05,
"loss": 0.9143,
"step": 787
},
{
"epoch": 0.5697758496023138,
"grad_norm": 0.42380473017692566,
"learning_rate": 1.950126314878362e-05,
"loss": 1.1054,
"step": 788
},
{
"epoch": 0.5704989154013015,
"grad_norm": 0.4379919767379761,
"learning_rate": 1.9499824496107883e-05,
"loss": 1.0511,
"step": 789
},
{
"epoch": 0.5712219812002892,
"grad_norm": 0.4412190318107605,
"learning_rate": 1.9498383824677223e-05,
"loss": 1.153,
"step": 790
},
{
"epoch": 0.571945046999277,
"grad_norm": 0.4430684447288513,
"learning_rate": 1.9496941134797784e-05,
"loss": 0.9391,
"step": 791
},
{
"epoch": 0.5726681127982647,
"grad_norm": 0.3570266664028168,
"learning_rate": 1.9495496426776147e-05,
"loss": 1.1379,
"step": 792
},
{
"epoch": 0.5733911785972523,
"grad_norm": 0.4187745451927185,
"learning_rate": 1.949404970091932e-05,
"loss": 1.0523,
"step": 793
},
{
"epoch": 0.57411424439624,
"grad_norm": 0.4245019853115082,
"learning_rate": 1.9492600957534735e-05,
"loss": 1.0555,
"step": 794
},
{
"epoch": 0.5748373101952278,
"grad_norm": 0.5078554749488831,
"learning_rate": 1.9491150196930258e-05,
"loss": 1.1153,
"step": 795
},
{
"epoch": 0.5755603759942155,
"grad_norm": 0.4756588041782379,
"learning_rate": 1.948969741941418e-05,
"loss": 1.157,
"step": 796
},
{
"epoch": 0.5762834417932032,
"grad_norm": 0.43571797013282776,
"learning_rate": 1.948824262529523e-05,
"loss": 0.9788,
"step": 797
},
{
"epoch": 0.5770065075921909,
"grad_norm": 0.4313880205154419,
"learning_rate": 1.948678581488255e-05,
"loss": 0.9907,
"step": 798
},
{
"epoch": 0.5777295733911786,
"grad_norm": 0.4783801734447479,
"learning_rate": 1.948532698848572e-05,
"loss": 1.1045,
"step": 799
},
{
"epoch": 0.5784526391901663,
"grad_norm": 0.41679173707962036,
"learning_rate": 1.9483866146414756e-05,
"loss": 1.1111,
"step": 800
},
{
"epoch": 0.579175704989154,
"grad_norm": 0.3428262174129486,
"learning_rate": 1.9482403288980082e-05,
"loss": 1.1117,
"step": 801
},
{
"epoch": 0.5798987707881417,
"grad_norm": 0.4037555456161499,
"learning_rate": 1.9480938416492564e-05,
"loss": 1.0466,
"step": 802
},
{
"epoch": 0.5806218365871294,
"grad_norm": 0.423533171415329,
"learning_rate": 1.9479471529263502e-05,
"loss": 1.0328,
"step": 803
},
{
"epoch": 0.5813449023861171,
"grad_norm": 0.42857813835144043,
"learning_rate": 1.9478002627604605e-05,
"loss": 0.9655,
"step": 804
},
{
"epoch": 0.5820679681851049,
"grad_norm": 0.4149419963359833,
"learning_rate": 1.9476531711828027e-05,
"loss": 1.0997,
"step": 805
},
{
"epoch": 0.5827910339840926,
"grad_norm": 0.5268839001655579,
"learning_rate": 1.9475058782246342e-05,
"loss": 0.9518,
"step": 806
},
{
"epoch": 0.5835140997830802,
"grad_norm": 0.4031262695789337,
"learning_rate": 1.947358383917256e-05,
"loss": 1.0986,
"step": 807
},
{
"epoch": 0.584237165582068,
"grad_norm": 0.3794477581977844,
"learning_rate": 1.9472106882920103e-05,
"loss": 1.0462,
"step": 808
},
{
"epoch": 0.5849602313810557,
"grad_norm": 0.4023298919200897,
"learning_rate": 1.947062791380284e-05,
"loss": 0.972,
"step": 809
},
{
"epoch": 0.5856832971800434,
"grad_norm": 0.3912923038005829,
"learning_rate": 1.946914693213505e-05,
"loss": 1.0204,
"step": 810
},
{
"epoch": 0.586406362979031,
"grad_norm": 0.4873286187648773,
"learning_rate": 1.946766393823146e-05,
"loss": 1.1627,
"step": 811
},
{
"epoch": 0.5871294287780188,
"grad_norm": 0.4002951681613922,
"learning_rate": 1.94661789324072e-05,
"loss": 0.9713,
"step": 812
},
{
"epoch": 0.5878524945770065,
"grad_norm": 0.3794306516647339,
"learning_rate": 1.946469191497785e-05,
"loss": 1.0521,
"step": 813
},
{
"epoch": 0.5885755603759942,
"grad_norm": 0.5814877152442932,
"learning_rate": 1.9463202886259398e-05,
"loss": 1.0715,
"step": 814
},
{
"epoch": 0.589298626174982,
"grad_norm": 0.46600183844566345,
"learning_rate": 1.946171184656828e-05,
"loss": 1.0208,
"step": 815
},
{
"epoch": 0.5900216919739696,
"grad_norm": 0.6167263984680176,
"learning_rate": 1.946021879622134e-05,
"loss": 1.0131,
"step": 816
},
{
"epoch": 0.5907447577729573,
"grad_norm": 0.45610466599464417,
"learning_rate": 1.9458723735535866e-05,
"loss": 1.1696,
"step": 817
},
{
"epoch": 0.591467823571945,
"grad_norm": 0.3938460052013397,
"learning_rate": 1.9457226664829555e-05,
"loss": 0.9354,
"step": 818
},
{
"epoch": 0.5921908893709328,
"grad_norm": 0.7263491749763489,
"learning_rate": 1.945572758442055e-05,
"loss": 0.9243,
"step": 819
},
{
"epoch": 0.5929139551699205,
"grad_norm": 0.39691534638404846,
"learning_rate": 1.945422649462741e-05,
"loss": 1.0144,
"step": 820
},
{
"epoch": 0.5936370209689081,
"grad_norm": 0.4366442561149597,
"learning_rate": 1.9452723395769118e-05,
"loss": 1.1116,
"step": 821
},
{
"epoch": 0.5943600867678959,
"grad_norm": 0.5859283208847046,
"learning_rate": 1.9451218288165098e-05,
"loss": 0.8539,
"step": 822
},
{
"epoch": 0.5950831525668836,
"grad_norm": 0.4116322696208954,
"learning_rate": 1.9449711172135185e-05,
"loss": 1.0003,
"step": 823
},
{
"epoch": 0.5958062183658713,
"grad_norm": 0.3992029130458832,
"learning_rate": 1.9448202047999653e-05,
"loss": 1.0766,
"step": 824
},
{
"epoch": 0.596529284164859,
"grad_norm": 0.42713025212287903,
"learning_rate": 1.944669091607919e-05,
"loss": 1.0108,
"step": 825
},
{
"epoch": 0.5972523499638467,
"grad_norm": 0.36716726422309875,
"learning_rate": 1.9445177776694923e-05,
"loss": 0.9922,
"step": 826
},
{
"epoch": 0.5979754157628344,
"grad_norm": 0.39609575271606445,
"learning_rate": 1.9443662630168404e-05,
"loss": 1.2268,
"step": 827
},
{
"epoch": 0.5986984815618221,
"grad_norm": 0.4199231266975403,
"learning_rate": 1.9442145476821607e-05,
"loss": 1.0309,
"step": 828
},
{
"epoch": 0.5994215473608099,
"grad_norm": 0.3968781530857086,
"learning_rate": 1.9440626316976926e-05,
"loss": 1.068,
"step": 829
},
{
"epoch": 0.6001446131597975,
"grad_norm": 0.39676693081855774,
"learning_rate": 1.94391051509572e-05,
"loss": 1.0705,
"step": 830
},
{
"epoch": 0.6008676789587852,
"grad_norm": 0.4254015386104584,
"learning_rate": 1.9437581979085678e-05,
"loss": 1.1494,
"step": 831
},
{
"epoch": 0.601590744757773,
"grad_norm": 0.5715295672416687,
"learning_rate": 1.943605680168604e-05,
"loss": 1.1313,
"step": 832
},
{
"epoch": 0.6023138105567607,
"grad_norm": 0.44537991285324097,
"learning_rate": 1.9434529619082396e-05,
"loss": 1.1528,
"step": 833
},
{
"epoch": 0.6030368763557483,
"grad_norm": 0.4265975058078766,
"learning_rate": 1.943300043159928e-05,
"loss": 1.111,
"step": 834
},
{
"epoch": 0.603759942154736,
"grad_norm": 0.4071260988712311,
"learning_rate": 1.9431469239561646e-05,
"loss": 0.9698,
"step": 835
},
{
"epoch": 0.6044830079537238,
"grad_norm": 0.5448580384254456,
"learning_rate": 1.942993604329488e-05,
"loss": 1.0849,
"step": 836
},
{
"epoch": 0.6052060737527115,
"grad_norm": 0.40993860363960266,
"learning_rate": 1.94284008431248e-05,
"loss": 1.054,
"step": 837
},
{
"epoch": 0.6059291395516992,
"grad_norm": 0.418747216463089,
"learning_rate": 1.9426863639377634e-05,
"loss": 0.9819,
"step": 838
},
{
"epoch": 0.6066522053506869,
"grad_norm": 0.47144564986228943,
"learning_rate": 1.942532443238005e-05,
"loss": 1.1143,
"step": 839
},
{
"epoch": 0.6073752711496746,
"grad_norm": 0.38305413722991943,
"learning_rate": 1.9423783222459135e-05,
"loss": 1.0806,
"step": 840
},
{
"epoch": 0.6080983369486623,
"grad_norm": 0.3502989113330841,
"learning_rate": 1.9422240009942403e-05,
"loss": 1.0041,
"step": 841
},
{
"epoch": 0.6088214027476501,
"grad_norm": 0.3907722234725952,
"learning_rate": 1.9420694795157792e-05,
"loss": 1.051,
"step": 842
},
{
"epoch": 0.6095444685466378,
"grad_norm": 0.35976073145866394,
"learning_rate": 1.9419147578433667e-05,
"loss": 1.1117,
"step": 843
},
{
"epoch": 0.6102675343456254,
"grad_norm": 0.4920249879360199,
"learning_rate": 1.9417598360098822e-05,
"loss": 1.1318,
"step": 844
},
{
"epoch": 0.6109906001446131,
"grad_norm": 0.3940010666847229,
"learning_rate": 1.941604714048247e-05,
"loss": 0.8906,
"step": 845
},
{
"epoch": 0.6117136659436009,
"grad_norm": 0.38385289907455444,
"learning_rate": 1.9414493919914253e-05,
"loss": 1.0902,
"step": 846
},
{
"epoch": 0.6124367317425886,
"grad_norm": 0.375847190618515,
"learning_rate": 1.9412938698724237e-05,
"loss": 1.1212,
"step": 847
},
{
"epoch": 0.6131597975415762,
"grad_norm": 0.4347201883792877,
"learning_rate": 1.9411381477242913e-05,
"loss": 0.9429,
"step": 848
},
{
"epoch": 0.613882863340564,
"grad_norm": 0.42075204849243164,
"learning_rate": 1.9409822255801197e-05,
"loss": 1.2179,
"step": 849
},
{
"epoch": 0.6146059291395517,
"grad_norm": 0.9480845928192139,
"learning_rate": 1.940826103473043e-05,
"loss": 1.1344,
"step": 850
},
{
"epoch": 0.6153289949385394,
"grad_norm": 0.44934409856796265,
"learning_rate": 1.9406697814362382e-05,
"loss": 1.0579,
"step": 851
},
{
"epoch": 0.6160520607375272,
"grad_norm": 0.48897886276245117,
"learning_rate": 1.940513259502924e-05,
"loss": 1.0972,
"step": 852
},
{
"epoch": 0.6167751265365148,
"grad_norm": 0.35401520133018494,
"learning_rate": 1.9403565377063624e-05,
"loss": 0.9755,
"step": 853
},
{
"epoch": 0.6174981923355025,
"grad_norm": 0.3998658359050751,
"learning_rate": 1.9401996160798574e-05,
"loss": 0.9717,
"step": 854
},
{
"epoch": 0.6182212581344902,
"grad_norm": 0.3708108067512512,
"learning_rate": 1.9400424946567552e-05,
"loss": 0.9571,
"step": 855
},
{
"epoch": 0.618944323933478,
"grad_norm": 0.36125117540359497,
"learning_rate": 1.939885173470445e-05,
"loss": 1.0311,
"step": 856
},
{
"epoch": 0.6196673897324656,
"grad_norm": 0.35216933488845825,
"learning_rate": 1.9397276525543583e-05,
"loss": 1.0258,
"step": 857
},
{
"epoch": 0.6203904555314533,
"grad_norm": 0.3627132475376129,
"learning_rate": 1.9395699319419687e-05,
"loss": 1.0178,
"step": 858
},
{
"epoch": 0.6211135213304411,
"grad_norm": 0.5006598830223083,
"learning_rate": 1.9394120116667932e-05,
"loss": 1.138,
"step": 859
},
{
"epoch": 0.6218365871294288,
"grad_norm": 0.4495599865913391,
"learning_rate": 1.93925389176239e-05,
"loss": 1.2473,
"step": 860
},
{
"epoch": 0.6225596529284165,
"grad_norm": 0.7601077556610107,
"learning_rate": 1.9390955722623602e-05,
"loss": 1.0375,
"step": 861
},
{
"epoch": 0.6232827187274042,
"grad_norm": 0.38106614351272583,
"learning_rate": 1.9389370532003483e-05,
"loss": 1.1907,
"step": 862
},
{
"epoch": 0.6240057845263919,
"grad_norm": 0.4347703754901886,
"learning_rate": 1.938778334610039e-05,
"loss": 1.0626,
"step": 863
},
{
"epoch": 0.6247288503253796,
"grad_norm": 0.41734766960144043,
"learning_rate": 1.9386194165251616e-05,
"loss": 1.0065,
"step": 864
},
{
"epoch": 0.6254519161243673,
"grad_norm": 0.397935688495636,
"learning_rate": 1.9384602989794868e-05,
"loss": 1.0134,
"step": 865
},
{
"epoch": 0.6261749819233551,
"grad_norm": 0.3894132077693939,
"learning_rate": 1.9383009820068275e-05,
"loss": 1.0335,
"step": 866
},
{
"epoch": 0.6268980477223427,
"grad_norm": 0.4423835277557373,
"learning_rate": 1.938141465641039e-05,
"loss": 0.9506,
"step": 867
},
{
"epoch": 0.6276211135213304,
"grad_norm": 0.524779200553894,
"learning_rate": 1.9379817499160202e-05,
"loss": 0.9809,
"step": 868
},
{
"epoch": 0.6283441793203182,
"grad_norm": 0.8196138739585876,
"learning_rate": 1.9378218348657104e-05,
"loss": 1.0157,
"step": 869
},
{
"epoch": 0.6290672451193059,
"grad_norm": 0.4658561050891876,
"learning_rate": 1.937661720524093e-05,
"loss": 1.0556,
"step": 870
},
{
"epoch": 0.6297903109182935,
"grad_norm": 0.41377320885658264,
"learning_rate": 1.9375014069251928e-05,
"loss": 1.0779,
"step": 871
},
{
"epoch": 0.6305133767172812,
"grad_norm": 0.5276013016700745,
"learning_rate": 1.937340894103077e-05,
"loss": 1.1533,
"step": 872
},
{
"epoch": 0.631236442516269,
"grad_norm": 0.410815954208374,
"learning_rate": 1.937180182091855e-05,
"loss": 1.0898,
"step": 873
},
{
"epoch": 0.6319595083152567,
"grad_norm": 0.37035036087036133,
"learning_rate": 1.9370192709256795e-05,
"loss": 1.1058,
"step": 874
},
{
"epoch": 0.6326825741142444,
"grad_norm": 0.4849984049797058,
"learning_rate": 1.9368581606387442e-05,
"loss": 0.9201,
"step": 875
},
{
"epoch": 0.6334056399132321,
"grad_norm": 0.61427903175354,
"learning_rate": 1.9366968512652863e-05,
"loss": 1.0005,
"step": 876
},
{
"epoch": 0.6341287057122198,
"grad_norm": 0.4817984402179718,
"learning_rate": 1.9365353428395845e-05,
"loss": 1.1757,
"step": 877
},
{
"epoch": 0.6348517715112075,
"grad_norm": 0.44777020812034607,
"learning_rate": 1.9363736353959603e-05,
"loss": 1.0455,
"step": 878
},
{
"epoch": 0.6355748373101953,
"grad_norm": 0.41143718361854553,
"learning_rate": 1.9362117289687764e-05,
"loss": 0.9868,
"step": 879
},
{
"epoch": 0.6362979031091829,
"grad_norm": 0.8019953966140747,
"learning_rate": 1.9360496235924396e-05,
"loss": 1.0541,
"step": 880
},
{
"epoch": 0.6370209689081706,
"grad_norm": 0.5058273673057556,
"learning_rate": 1.9358873193013975e-05,
"loss": 1.1222,
"step": 881
},
{
"epoch": 0.6377440347071583,
"grad_norm": 0.4158300459384918,
"learning_rate": 1.935724816130141e-05,
"loss": 1.1177,
"step": 882
},
{
"epoch": 0.6384671005061461,
"grad_norm": 0.3587720990180969,
"learning_rate": 1.9355621141132022e-05,
"loss": 1.0659,
"step": 883
},
{
"epoch": 0.6391901663051338,
"grad_norm": 0.5225945711135864,
"learning_rate": 1.935399213285156e-05,
"loss": 1.1002,
"step": 884
},
{
"epoch": 0.6399132321041214,
"grad_norm": 0.3899383544921875,
"learning_rate": 1.93523611368062e-05,
"loss": 1.023,
"step": 885
},
{
"epoch": 0.6406362979031092,
"grad_norm": 0.3299097716808319,
"learning_rate": 1.9350728153342533e-05,
"loss": 0.9908,
"step": 886
},
{
"epoch": 0.6413593637020969,
"grad_norm": 0.4942092001438141,
"learning_rate": 1.9349093182807574e-05,
"loss": 1.0417,
"step": 887
},
{
"epoch": 0.6420824295010846,
"grad_norm": 0.6462168097496033,
"learning_rate": 1.9347456225548767e-05,
"loss": 1.0398,
"step": 888
},
{
"epoch": 0.6428054953000724,
"grad_norm": 0.47286465764045715,
"learning_rate": 1.9345817281913964e-05,
"loss": 1.0001,
"step": 889
},
{
"epoch": 0.64352856109906,
"grad_norm": 0.40762990713119507,
"learning_rate": 1.9344176352251456e-05,
"loss": 1.0202,
"step": 890
},
{
"epoch": 0.6442516268980477,
"grad_norm": 0.4899740517139435,
"learning_rate": 1.9342533436909942e-05,
"loss": 1.0916,
"step": 891
},
{
"epoch": 0.6449746926970354,
"grad_norm": 0.5071614384651184,
"learning_rate": 1.9340888536238555e-05,
"loss": 1.0278,
"step": 892
},
{
"epoch": 0.6456977584960232,
"grad_norm": 0.36371201276779175,
"learning_rate": 1.9339241650586835e-05,
"loss": 1.1233,
"step": 893
},
{
"epoch": 0.6464208242950108,
"grad_norm": 0.5467074513435364,
"learning_rate": 1.933759278030476e-05,
"loss": 1.0505,
"step": 894
},
{
"epoch": 0.6471438900939985,
"grad_norm": 0.3950304388999939,
"learning_rate": 1.933594192574272e-05,
"loss": 0.8757,
"step": 895
},
{
"epoch": 0.6478669558929863,
"grad_norm": 0.44331902265548706,
"learning_rate": 1.933428908725153e-05,
"loss": 1.0699,
"step": 896
},
{
"epoch": 0.648590021691974,
"grad_norm": 0.5385304093360901,
"learning_rate": 1.9332634265182422e-05,
"loss": 0.9539,
"step": 897
},
{
"epoch": 0.6493130874909617,
"grad_norm": 0.3947201371192932,
"learning_rate": 1.9330977459887058e-05,
"loss": 1.0645,
"step": 898
},
{
"epoch": 0.6500361532899493,
"grad_norm": 0.4047844707965851,
"learning_rate": 1.932931867171751e-05,
"loss": 1.0795,
"step": 899
},
{
"epoch": 0.6507592190889371,
"grad_norm": 0.45291417837142944,
"learning_rate": 1.9327657901026284e-05,
"loss": 0.8067,
"step": 900
},
{
"epoch": 0.6514822848879248,
"grad_norm": 0.45252764225006104,
"learning_rate": 1.93259951481663e-05,
"loss": 0.9772,
"step": 901
},
{
"epoch": 0.6522053506869125,
"grad_norm": 0.8900644183158875,
"learning_rate": 1.9324330413490896e-05,
"loss": 1.0711,
"step": 902
},
{
"epoch": 0.6529284164859002,
"grad_norm": 0.5582137107849121,
"learning_rate": 1.932266369735384e-05,
"loss": 1.1331,
"step": 903
},
{
"epoch": 0.6536514822848879,
"grad_norm": 0.488629549741745,
"learning_rate": 1.9320995000109315e-05,
"loss": 1.1425,
"step": 904
},
{
"epoch": 0.6543745480838756,
"grad_norm": 0.5531617403030396,
"learning_rate": 1.9319324322111928e-05,
"loss": 0.9534,
"step": 905
},
{
"epoch": 0.6550976138828634,
"grad_norm": 0.4221404790878296,
"learning_rate": 1.9317651663716704e-05,
"loss": 1.0654,
"step": 906
},
{
"epoch": 0.6558206796818511,
"grad_norm": 0.36095425486564636,
"learning_rate": 1.9315977025279088e-05,
"loss": 1.0173,
"step": 907
},
{
"epoch": 0.6565437454808387,
"grad_norm": 0.5640881061553955,
"learning_rate": 1.9314300407154954e-05,
"loss": 1.0946,
"step": 908
},
{
"epoch": 0.6572668112798264,
"grad_norm": 0.3728702962398529,
"learning_rate": 1.9312621809700586e-05,
"loss": 1.1767,
"step": 909
},
{
"epoch": 0.6579898770788142,
"grad_norm": 0.5009912252426147,
"learning_rate": 1.9310941233272698e-05,
"loss": 0.9627,
"step": 910
},
{
"epoch": 0.6587129428778019,
"grad_norm": 0.4499903619289398,
"learning_rate": 1.9309258678228412e-05,
"loss": 1.0449,
"step": 911
},
{
"epoch": 0.6594360086767896,
"grad_norm": 0.7598214745521545,
"learning_rate": 1.9307574144925288e-05,
"loss": 1.0257,
"step": 912
},
{
"epoch": 0.6601590744757773,
"grad_norm": 0.413637638092041,
"learning_rate": 1.930588763372129e-05,
"loss": 1.0121,
"step": 913
},
{
"epoch": 0.660882140274765,
"grad_norm": 0.4112358093261719,
"learning_rate": 1.930419914497481e-05,
"loss": 1.165,
"step": 914
},
{
"epoch": 0.6616052060737527,
"grad_norm": 0.41854724287986755,
"learning_rate": 1.9302508679044662e-05,
"loss": 1.1068,
"step": 915
},
{
"epoch": 0.6623282718727405,
"grad_norm": 0.4287382662296295,
"learning_rate": 1.9300816236290077e-05,
"loss": 0.9718,
"step": 916
},
{
"epoch": 0.6630513376717281,
"grad_norm": 0.3937189280986786,
"learning_rate": 1.9299121817070706e-05,
"loss": 0.9827,
"step": 917
},
{
"epoch": 0.6637744034707158,
"grad_norm": 0.6436665058135986,
"learning_rate": 1.929742542174662e-05,
"loss": 0.9858,
"step": 918
},
{
"epoch": 0.6644974692697035,
"grad_norm": 0.4006657600402832,
"learning_rate": 1.929572705067831e-05,
"loss": 0.9564,
"step": 919
},
{
"epoch": 0.6652205350686913,
"grad_norm": 0.6361526250839233,
"learning_rate": 1.929402670422669e-05,
"loss": 1.1058,
"step": 920
},
{
"epoch": 0.665943600867679,
"grad_norm": 0.4914890229701996,
"learning_rate": 1.9292324382753088e-05,
"loss": 1.0892,
"step": 921
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.4729480743408203,
"learning_rate": 1.9290620086619255e-05,
"loss": 1.2001,
"step": 922
},
{
"epoch": 0.6673897324656544,
"grad_norm": 0.37761080265045166,
"learning_rate": 1.9288913816187365e-05,
"loss": 1.1583,
"step": 923
},
{
"epoch": 0.6681127982646421,
"grad_norm": 0.4286913275718689,
"learning_rate": 1.9287205571820007e-05,
"loss": 1.0615,
"step": 924
},
{
"epoch": 0.6688358640636298,
"grad_norm": 0.3655116558074951,
"learning_rate": 1.9285495353880187e-05,
"loss": 1.1904,
"step": 925
},
{
"epoch": 0.6695589298626174,
"grad_norm": 0.5355531573295593,
"learning_rate": 1.928378316273134e-05,
"loss": 1.1509,
"step": 926
},
{
"epoch": 0.6702819956616052,
"grad_norm": 0.49476566910743713,
"learning_rate": 1.9282068998737306e-05,
"loss": 1.0475,
"step": 927
},
{
"epoch": 0.6710050614605929,
"grad_norm": 0.3970414102077484,
"learning_rate": 1.928035286226236e-05,
"loss": 1.1631,
"step": 928
},
{
"epoch": 0.6717281272595806,
"grad_norm": 0.4435936212539673,
"learning_rate": 1.9278634753671185e-05,
"loss": 1.13,
"step": 929
},
{
"epoch": 0.6724511930585684,
"grad_norm": 0.47896090149879456,
"learning_rate": 1.927691467332889e-05,
"loss": 1.0606,
"step": 930
},
{
"epoch": 0.673174258857556,
"grad_norm": 0.5069797039031982,
"learning_rate": 1.9275192621600993e-05,
"loss": 1.1685,
"step": 931
},
{
"epoch": 0.6738973246565437,
"grad_norm": 0.4008742868900299,
"learning_rate": 1.9273468598853446e-05,
"loss": 1.0611,
"step": 932
},
{
"epoch": 0.6746203904555315,
"grad_norm": 0.41504350304603577,
"learning_rate": 1.9271742605452604e-05,
"loss": 1.0769,
"step": 933
},
{
"epoch": 0.6753434562545192,
"grad_norm": 0.4604232609272003,
"learning_rate": 1.927001464176525e-05,
"loss": 1.0744,
"step": 934
},
{
"epoch": 0.6760665220535069,
"grad_norm": 0.4554314613342285,
"learning_rate": 1.926828470815859e-05,
"loss": 1.031,
"step": 935
},
{
"epoch": 0.6767895878524945,
"grad_norm": 0.4168878197669983,
"learning_rate": 1.9266552805000236e-05,
"loss": 1.1499,
"step": 936
},
{
"epoch": 0.6775126536514823,
"grad_norm": 0.37697312235832214,
"learning_rate": 1.926481893265823e-05,
"loss": 1.0173,
"step": 937
},
{
"epoch": 0.67823571945047,
"grad_norm": 0.48489493131637573,
"learning_rate": 1.926308309150102e-05,
"loss": 0.8716,
"step": 938
},
{
"epoch": 0.6789587852494577,
"grad_norm": 0.4797099530696869,
"learning_rate": 1.926134528189749e-05,
"loss": 0.9966,
"step": 939
},
{
"epoch": 0.6796818510484454,
"grad_norm": 0.4304414987564087,
"learning_rate": 1.9259605504216922e-05,
"loss": 1.05,
"step": 940
},
{
"epoch": 0.6804049168474331,
"grad_norm": 0.5568910837173462,
"learning_rate": 1.9257863758829038e-05,
"loss": 1.1633,
"step": 941
},
{
"epoch": 0.6811279826464208,
"grad_norm": 0.4955918788909912,
"learning_rate": 1.9256120046103954e-05,
"loss": 0.9605,
"step": 942
},
{
"epoch": 0.6818510484454086,
"grad_norm": 0.44979023933410645,
"learning_rate": 1.9254374366412225e-05,
"loss": 1.0132,
"step": 943
},
{
"epoch": 0.6825741142443963,
"grad_norm": 0.5083341598510742,
"learning_rate": 1.9252626720124813e-05,
"loss": 1.0155,
"step": 944
},
{
"epoch": 0.6832971800433839,
"grad_norm": 0.40737634897232056,
"learning_rate": 1.92508771076131e-05,
"loss": 1.1144,
"step": 945
},
{
"epoch": 0.6840202458423716,
"grad_norm": 0.43715277314186096,
"learning_rate": 1.924912552924889e-05,
"loss": 1.2293,
"step": 946
},
{
"epoch": 0.6847433116413594,
"grad_norm": 0.8877456188201904,
"learning_rate": 1.9247371985404397e-05,
"loss": 1.0002,
"step": 947
},
{
"epoch": 0.6854663774403471,
"grad_norm": 0.45003005862236023,
"learning_rate": 1.9245616476452263e-05,
"loss": 0.9714,
"step": 948
},
{
"epoch": 0.6861894432393347,
"grad_norm": 0.5420054197311401,
"learning_rate": 1.924385900276553e-05,
"loss": 0.854,
"step": 949
},
{
"epoch": 0.6869125090383225,
"grad_norm": 0.49163565039634705,
"learning_rate": 1.9242099564717683e-05,
"loss": 1.1079,
"step": 950
},
{
"epoch": 0.6876355748373102,
"grad_norm": 0.561029314994812,
"learning_rate": 1.9240338162682598e-05,
"loss": 1.1312,
"step": 951
},
{
"epoch": 0.6883586406362979,
"grad_norm": 0.43775081634521484,
"learning_rate": 1.923857479703459e-05,
"loss": 1.1497,
"step": 952
},
{
"epoch": 0.6890817064352857,
"grad_norm": 0.5425902009010315,
"learning_rate": 1.9236809468148378e-05,
"loss": 1.0263,
"step": 953
},
{
"epoch": 0.6898047722342733,
"grad_norm": 0.4741000235080719,
"learning_rate": 1.92350421763991e-05,
"loss": 1.108,
"step": 954
},
{
"epoch": 0.690527838033261,
"grad_norm": 0.36742356419563293,
"learning_rate": 1.9233272922162318e-05,
"loss": 0.9945,
"step": 955
},
{
"epoch": 0.6912509038322487,
"grad_norm": 0.6709416508674622,
"learning_rate": 1.9231501705814005e-05,
"loss": 1.1226,
"step": 956
},
{
"epoch": 0.6919739696312365,
"grad_norm": 0.39497923851013184,
"learning_rate": 1.922972852773055e-05,
"loss": 1.0677,
"step": 957
},
{
"epoch": 0.6926970354302241,
"grad_norm": 0.43766117095947266,
"learning_rate": 1.922795338828876e-05,
"loss": 1.1008,
"step": 958
},
{
"epoch": 0.6934201012292118,
"grad_norm": 0.6151456832885742,
"learning_rate": 1.922617628786587e-05,
"loss": 1.0524,
"step": 959
},
{
"epoch": 0.6941431670281996,
"grad_norm": 0.4668988883495331,
"learning_rate": 1.922439722683951e-05,
"loss": 0.9703,
"step": 960
},
{
"epoch": 0.6948662328271873,
"grad_norm": 0.43881484866142273,
"learning_rate": 1.9222616205587742e-05,
"loss": 1.1008,
"step": 961
},
{
"epoch": 0.695589298626175,
"grad_norm": 0.4098372161388397,
"learning_rate": 1.9220833224489045e-05,
"loss": 1.1125,
"step": 962
},
{
"epoch": 0.6963123644251626,
"grad_norm": 0.3672609329223633,
"learning_rate": 1.9219048283922305e-05,
"loss": 0.9593,
"step": 963
},
{
"epoch": 0.6970354302241504,
"grad_norm": 0.4422583281993866,
"learning_rate": 1.9217261384266833e-05,
"loss": 1.1713,
"step": 964
},
{
"epoch": 0.6977584960231381,
"grad_norm": 0.3935304284095764,
"learning_rate": 1.921547252590235e-05,
"loss": 1.1546,
"step": 965
},
{
"epoch": 0.6984815618221258,
"grad_norm": 0.3898194134235382,
"learning_rate": 1.9213681709209e-05,
"loss": 1.044,
"step": 966
},
{
"epoch": 0.6992046276211136,
"grad_norm": 0.5345346331596375,
"learning_rate": 1.921188893456734e-05,
"loss": 0.9389,
"step": 967
},
{
"epoch": 0.6999276934201012,
"grad_norm": 0.4857673645019531,
"learning_rate": 1.921009420235834e-05,
"loss": 1.0613,
"step": 968
},
{
"epoch": 0.7006507592190889,
"grad_norm": 0.4024747312068939,
"learning_rate": 1.920829751296339e-05,
"loss": 1.0423,
"step": 969
},
{
"epoch": 0.7013738250180767,
"grad_norm": 0.4439171850681305,
"learning_rate": 1.920649886676429e-05,
"loss": 1.0442,
"step": 970
},
{
"epoch": 0.7020968908170644,
"grad_norm": 0.41875940561294556,
"learning_rate": 1.9204698264143268e-05,
"loss": 1.0491,
"step": 971
},
{
"epoch": 0.702819956616052,
"grad_norm": 0.5009695887565613,
"learning_rate": 1.9202895705482952e-05,
"loss": 1.057,
"step": 972
},
{
"epoch": 0.7035430224150397,
"grad_norm": 0.6199245452880859,
"learning_rate": 1.92010911911664e-05,
"loss": 1.0034,
"step": 973
},
{
"epoch": 0.7042660882140275,
"grad_norm": 0.4618159830570221,
"learning_rate": 1.919928472157708e-05,
"loss": 1.0351,
"step": 974
},
{
"epoch": 0.7049891540130152,
"grad_norm": 0.4229826033115387,
"learning_rate": 1.9197476297098868e-05,
"loss": 1.2061,
"step": 975
},
{
"epoch": 0.7057122198120029,
"grad_norm": 0.49111953377723694,
"learning_rate": 1.9195665918116068e-05,
"loss": 1.0294,
"step": 976
},
{
"epoch": 0.7064352856109906,
"grad_norm": 0.4833580255508423,
"learning_rate": 1.919385358501339e-05,
"loss": 1.0781,
"step": 977
},
{
"epoch": 0.7071583514099783,
"grad_norm": 0.42100799083709717,
"learning_rate": 1.9192039298175965e-05,
"loss": 1.03,
"step": 978
},
{
"epoch": 0.707881417208966,
"grad_norm": 0.5994098782539368,
"learning_rate": 1.9190223057989337e-05,
"loss": 0.8802,
"step": 979
},
{
"epoch": 0.7086044830079538,
"grad_norm": 0.4303410053253174,
"learning_rate": 1.9188404864839465e-05,
"loss": 1.1212,
"step": 980
},
{
"epoch": 0.7093275488069414,
"grad_norm": 0.453218013048172,
"learning_rate": 1.9186584719112724e-05,
"loss": 0.9847,
"step": 981
},
{
"epoch": 0.7100506146059291,
"grad_norm": 0.43997496366500854,
"learning_rate": 1.9184762621195897e-05,
"loss": 0.9984,
"step": 982
},
{
"epoch": 0.7107736804049168,
"grad_norm": 0.5103498101234436,
"learning_rate": 1.91829385714762e-05,
"loss": 1.0727,
"step": 983
},
{
"epoch": 0.7114967462039046,
"grad_norm": 0.5352555513381958,
"learning_rate": 1.918111257034124e-05,
"loss": 1.02,
"step": 984
},
{
"epoch": 0.7122198120028923,
"grad_norm": 0.40594494342803955,
"learning_rate": 1.917928461817906e-05,
"loss": 1.0947,
"step": 985
}
],
"logging_steps": 1,
"max_steps": 6915,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 5,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.1929190399934464e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}