{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.997242140099283,
"eval_steps": 500,
"global_step": 339,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00882515168229454,
"grad_norm": 0.40674829483032227,
"learning_rate": 9.999785297426788e-05,
"loss": 0.2055,
"num_input_tokens_seen": 203120,
"step": 1
},
{
"epoch": 0.01765030336458908,
"grad_norm": 0.4242195785045624,
"learning_rate": 9.999141208146028e-05,
"loss": 0.1902,
"num_input_tokens_seen": 406048,
"step": 2
},
{
"epoch": 0.026475455046883617,
"grad_norm": 0.3813261389732361,
"learning_rate": 9.998067787472772e-05,
"loss": 0.1421,
"num_input_tokens_seen": 614736,
"step": 3
},
{
"epoch": 0.03530060672917816,
"grad_norm": 0.28003761172294617,
"learning_rate": 9.996565127593488e-05,
"loss": 0.1102,
"num_input_tokens_seen": 816416,
"step": 4
},
{
"epoch": 0.0441257584114727,
"grad_norm": 0.25300610065460205,
"learning_rate": 9.994633357558158e-05,
"loss": 0.0801,
"num_input_tokens_seen": 1024272,
"step": 5
},
{
"epoch": 0.052950910093767234,
"grad_norm": 0.2328871786594391,
"learning_rate": 9.99227264326918e-05,
"loss": 0.0574,
"num_input_tokens_seen": 1228192,
"step": 6
},
{
"epoch": 0.06177606177606178,
"grad_norm": 0.17362241446971893,
"learning_rate": 9.989483187467127e-05,
"loss": 0.0401,
"num_input_tokens_seen": 1434992,
"step": 7
},
{
"epoch": 0.07060121345835632,
"grad_norm": 0.09250874817371368,
"learning_rate": 9.986265229713331e-05,
"loss": 0.0295,
"num_input_tokens_seen": 1646560,
"step": 8
},
{
"epoch": 0.07942636514065085,
"grad_norm": 0.08936059474945068,
"learning_rate": 9.982619046369321e-05,
"loss": 0.0262,
"num_input_tokens_seen": 1838624,
"step": 9
},
{
"epoch": 0.0882515168229454,
"grad_norm": 0.08603595942258835,
"learning_rate": 9.978544950573074e-05,
"loss": 0.0263,
"num_input_tokens_seen": 2053488,
"step": 10
},
{
"epoch": 0.09707666850523994,
"grad_norm": 0.07848804444074631,
"learning_rate": 9.974043292212128e-05,
"loss": 0.022,
"num_input_tokens_seen": 2253680,
"step": 11
},
{
"epoch": 0.10590182018753447,
"grad_norm": 0.06246768683195114,
"learning_rate": 9.96911445789354e-05,
"loss": 0.0202,
"num_input_tokens_seen": 2442000,
"step": 12
},
{
"epoch": 0.11472697186982901,
"grad_norm": 0.048259809613227844,
"learning_rate": 9.963758870910671e-05,
"loss": 0.0202,
"num_input_tokens_seen": 2655920,
"step": 13
},
{
"epoch": 0.12355212355212356,
"grad_norm": 0.03917853534221649,
"learning_rate": 9.957976991206846e-05,
"loss": 0.0178,
"num_input_tokens_seen": 2874064,
"step": 14
},
{
"epoch": 0.13237727523441808,
"grad_norm": 0.040510393679142,
"learning_rate": 9.951769315335844e-05,
"loss": 0.0158,
"num_input_tokens_seen": 3071744,
"step": 15
},
{
"epoch": 0.14120242691671264,
"grad_norm": 0.035558607429265976,
"learning_rate": 9.945136376419259e-05,
"loss": 0.0159,
"num_input_tokens_seen": 3277904,
"step": 16
},
{
"epoch": 0.15002757859900717,
"grad_norm": 0.034995947033166885,
"learning_rate": 9.938078744100712e-05,
"loss": 0.0147,
"num_input_tokens_seen": 3493136,
"step": 17
},
{
"epoch": 0.1588527302813017,
"grad_norm": 0.03230876475572586,
"learning_rate": 9.930597024496931e-05,
"loss": 0.0138,
"num_input_tokens_seen": 3704288,
"step": 18
},
{
"epoch": 0.16767788196359626,
"grad_norm": 0.028281500563025475,
"learning_rate": 9.922691860145696e-05,
"loss": 0.0128,
"num_input_tokens_seen": 3904352,
"step": 19
},
{
"epoch": 0.1765030336458908,
"grad_norm": 0.026264235377311707,
"learning_rate": 9.914363929950659e-05,
"loss": 0.0124,
"num_input_tokens_seen": 4113888,
"step": 20
},
{
"epoch": 0.18532818532818532,
"grad_norm": 0.023232094943523407,
"learning_rate": 9.905613949123036e-05,
"loss": 0.0116,
"num_input_tokens_seen": 4323504,
"step": 21
},
{
"epoch": 0.19415333701047988,
"grad_norm": 0.02393435873091221,
"learning_rate": 9.896442669120187e-05,
"loss": 0.0109,
"num_input_tokens_seen": 4523008,
"step": 22
},
{
"epoch": 0.2029784886927744,
"grad_norm": 0.024421676993370056,
"learning_rate": 9.886850877581079e-05,
"loss": 0.0106,
"num_input_tokens_seen": 4732864,
"step": 23
},
{
"epoch": 0.21180364037506894,
"grad_norm": 0.022869078442454338,
"learning_rate": 9.876839398258641e-05,
"loss": 0.0099,
"num_input_tokens_seen": 4941936,
"step": 24
},
{
"epoch": 0.2206287920573635,
"grad_norm": 0.025933578610420227,
"learning_rate": 9.866409090949022e-05,
"loss": 0.0109,
"num_input_tokens_seen": 5143584,
"step": 25
},
{
"epoch": 0.22945394373965802,
"grad_norm": 0.02043001353740692,
"learning_rate": 9.855560851417752e-05,
"loss": 0.0084,
"num_input_tokens_seen": 5351024,
"step": 26
},
{
"epoch": 0.23827909542195255,
"grad_norm": 0.02140035293996334,
"learning_rate": 9.844295611322804e-05,
"loss": 0.0081,
"num_input_tokens_seen": 5563760,
"step": 27
},
{
"epoch": 0.2471042471042471,
"grad_norm": 0.019948888570070267,
"learning_rate": 9.832614338134595e-05,
"loss": 0.0078,
"num_input_tokens_seen": 5772416,
"step": 28
},
{
"epoch": 0.25592939878654164,
"grad_norm": 0.021153336390852928,
"learning_rate": 9.820518035052889e-05,
"loss": 0.0081,
"num_input_tokens_seen": 5974464,
"step": 29
},
{
"epoch": 0.26475455046883617,
"grad_norm": 0.02002059668302536,
"learning_rate": 9.808007740920646e-05,
"loss": 0.0087,
"num_input_tokens_seen": 6193520,
"step": 30
},
{
"epoch": 0.2735797021511307,
"grad_norm": 0.029256833717226982,
"learning_rate": 9.795084530134801e-05,
"loss": 0.0079,
"num_input_tokens_seen": 6399792,
"step": 31
},
{
"epoch": 0.2824048538334253,
"grad_norm": 0.02395695447921753,
"learning_rate": 9.781749512553999e-05,
"loss": 0.0086,
"num_input_tokens_seen": 6603584,
"step": 32
},
{
"epoch": 0.2912300055157198,
"grad_norm": 0.02185678854584694,
"learning_rate": 9.768003833403278e-05,
"loss": 0.0079,
"num_input_tokens_seen": 6810656,
"step": 33
},
{
"epoch": 0.30005515719801434,
"grad_norm": 0.02072463184595108,
"learning_rate": 9.753848673175707e-05,
"loss": 0.0069,
"num_input_tokens_seen": 7001792,
"step": 34
},
{
"epoch": 0.3088803088803089,
"grad_norm": 0.018024709075689316,
"learning_rate": 9.739285247531018e-05,
"loss": 0.0064,
"num_input_tokens_seen": 7205952,
"step": 35
},
{
"epoch": 0.3177054605626034,
"grad_norm": 0.019729286432266235,
"learning_rate": 9.724314807191195e-05,
"loss": 0.006,
"num_input_tokens_seen": 7406304,
"step": 36
},
{
"epoch": 0.32653061224489793,
"grad_norm": 0.01830880530178547,
"learning_rate": 9.708938637833065e-05,
"loss": 0.0067,
"num_input_tokens_seen": 7629568,
"step": 37
},
{
"epoch": 0.3353557639271925,
"grad_norm": 0.021113887429237366,
"learning_rate": 9.693158059977878e-05,
"loss": 0.0063,
"num_input_tokens_seen": 7845200,
"step": 38
},
{
"epoch": 0.34418091560948705,
"grad_norm": 0.015138108283281326,
"learning_rate": 9.676974428877901e-05,
"loss": 0.0058,
"num_input_tokens_seen": 8061840,
"step": 39
},
{
"epoch": 0.3530060672917816,
"grad_norm": 0.017043087631464005,
"learning_rate": 9.660389134400033e-05,
"loss": 0.0061,
"num_input_tokens_seen": 8279664,
"step": 40
},
{
"epoch": 0.3618312189740761,
"grad_norm": 0.01955767348408699,
"learning_rate": 9.643403600906433e-05,
"loss": 0.0055,
"num_input_tokens_seen": 8475376,
"step": 41
},
{
"epoch": 0.37065637065637064,
"grad_norm": 0.014688636176288128,
"learning_rate": 9.626019287132203e-05,
"loss": 0.005,
"num_input_tokens_seen": 8691760,
"step": 42
},
{
"epoch": 0.3794815223386652,
"grad_norm": 0.01973150670528412,
"learning_rate": 9.608237686060099e-05,
"loss": 0.006,
"num_input_tokens_seen": 8884736,
"step": 43
},
{
"epoch": 0.38830667402095975,
"grad_norm": 0.01489401888102293,
"learning_rate": 9.590060324792327e-05,
"loss": 0.0048,
"num_input_tokens_seen": 9084064,
"step": 44
},
{
"epoch": 0.3971318257032543,
"grad_norm": 0.015995647758245468,
"learning_rate": 9.571488764419381e-05,
"loss": 0.0047,
"num_input_tokens_seen": 9302144,
"step": 45
},
{
"epoch": 0.4059569773855488,
"grad_norm": 0.01859475113451481,
"learning_rate": 9.552524599885981e-05,
"loss": 0.0053,
"num_input_tokens_seen": 9517456,
"step": 46
},
{
"epoch": 0.41478212906784334,
"grad_norm": 0.018746482208371162,
"learning_rate": 9.533169459854098e-05,
"loss": 0.0044,
"num_input_tokens_seen": 9710768,
"step": 47
},
{
"epoch": 0.42360728075013787,
"grad_norm": 0.017155013978481293,
"learning_rate": 9.513425006563079e-05,
"loss": 0.0043,
"num_input_tokens_seen": 9914064,
"step": 48
},
{
"epoch": 0.43243243243243246,
"grad_norm": 0.015938682481646538,
"learning_rate": 9.493292935686895e-05,
"loss": 0.0041,
"num_input_tokens_seen": 10120208,
"step": 49
},
{
"epoch": 0.441257584114727,
"grad_norm": 0.017114240676164627,
"learning_rate": 9.472774976188515e-05,
"loss": 0.0044,
"num_input_tokens_seen": 10346304,
"step": 50
},
{
"epoch": 0.4500827357970215,
"grad_norm": 0.014332287944853306,
"learning_rate": 9.451872890171419e-05,
"loss": 0.004,
"num_input_tokens_seen": 10547984,
"step": 51
},
{
"epoch": 0.45890788747931605,
"grad_norm": 0.017018554732203484,
"learning_rate": 9.43058847272827e-05,
"loss": 0.0045,
"num_input_tokens_seen": 10754288,
"step": 52
},
{
"epoch": 0.4677330391616106,
"grad_norm": 0.013670100830495358,
"learning_rate": 9.408923551786743e-05,
"loss": 0.0028,
"num_input_tokens_seen": 10942704,
"step": 53
},
{
"epoch": 0.4765581908439051,
"grad_norm": 0.016749229282140732,
"learning_rate": 9.386879987952549e-05,
"loss": 0.0034,
"num_input_tokens_seen": 11150864,
"step": 54
},
{
"epoch": 0.4853833425261997,
"grad_norm": 0.01554529182612896,
"learning_rate": 9.364459674349641e-05,
"loss": 0.0042,
"num_input_tokens_seen": 11367728,
"step": 55
},
{
"epoch": 0.4942084942084942,
"grad_norm": 0.015070905908942223,
"learning_rate": 9.341664536457626e-05,
"loss": 0.0028,
"num_input_tokens_seen": 11575536,
"step": 56
},
{
"epoch": 0.5030336458907887,
"grad_norm": 0.016440849751234055,
"learning_rate": 9.31849653194641e-05,
"loss": 0.0035,
"num_input_tokens_seen": 11781328,
"step": 57
},
{
"epoch": 0.5118587975730833,
"grad_norm": 0.014468475244939327,
"learning_rate": 9.294957650508065e-05,
"loss": 0.0029,
"num_input_tokens_seen": 11981232,
"step": 58
},
{
"epoch": 0.5206839492553779,
"grad_norm": 0.014588565565645695,
"learning_rate": 9.27104991368596e-05,
"loss": 0.0028,
"num_input_tokens_seen": 12187296,
"step": 59
},
{
"epoch": 0.5295091009376723,
"grad_norm": 0.0141281234100461,
"learning_rate": 9.246775374701139e-05,
"loss": 0.0027,
"num_input_tokens_seen": 12385632,
"step": 60
},
{
"epoch": 0.5383342526199669,
"grad_norm": 0.013463583774864674,
"learning_rate": 9.222136118275995e-05,
"loss": 0.0022,
"num_input_tokens_seen": 12588928,
"step": 61
},
{
"epoch": 0.5471594043022614,
"grad_norm": 0.014033553190529346,
"learning_rate": 9.197134260455233e-05,
"loss": 0.0027,
"num_input_tokens_seen": 12825616,
"step": 62
},
{
"epoch": 0.555984555984556,
"grad_norm": 0.013906535692512989,
"learning_rate": 9.171771948424137e-05,
"loss": 0.0025,
"num_input_tokens_seen": 13044976,
"step": 63
},
{
"epoch": 0.5648097076668506,
"grad_norm": 0.012418747879564762,
"learning_rate": 9.146051360324166e-05,
"loss": 0.0025,
"num_input_tokens_seen": 13255280,
"step": 64
},
{
"epoch": 0.573634859349145,
"grad_norm": 0.015126565471291542,
"learning_rate": 9.119974705065901e-05,
"loss": 0.0022,
"num_input_tokens_seen": 13463456,
"step": 65
},
{
"epoch": 0.5824600110314396,
"grad_norm": 0.013123284094035625,
"learning_rate": 9.093544222139337e-05,
"loss": 0.0023,
"num_input_tokens_seen": 13667744,
"step": 66
},
{
"epoch": 0.5912851627137341,
"grad_norm": 0.014246366918087006,
"learning_rate": 9.066762181421552e-05,
"loss": 0.0024,
"num_input_tokens_seen": 13874240,
"step": 67
},
{
"epoch": 0.6001103143960287,
"grad_norm": 0.011402356438338757,
"learning_rate": 9.039630882981768e-05,
"loss": 0.0015,
"num_input_tokens_seen": 14081392,
"step": 68
},
{
"epoch": 0.6089354660783233,
"grad_norm": 0.014725148677825928,
"learning_rate": 9.012152656883823e-05,
"loss": 0.0033,
"num_input_tokens_seen": 14300896,
"step": 69
},
{
"epoch": 0.6177606177606177,
"grad_norm": 0.014837515540421009,
"learning_rate": 8.984329862986056e-05,
"loss": 0.0021,
"num_input_tokens_seen": 14523968,
"step": 70
},
{
"epoch": 0.6265857694429123,
"grad_norm": 0.014493652619421482,
"learning_rate": 8.956164890738643e-05,
"loss": 0.0013,
"num_input_tokens_seen": 14728960,
"step": 71
},
{
"epoch": 0.6354109211252068,
"grad_norm": 0.011806878261268139,
"learning_rate": 8.927660158978392e-05,
"loss": 0.0016,
"num_input_tokens_seen": 14912480,
"step": 72
},
{
"epoch": 0.6442360728075014,
"grad_norm": 0.01818985864520073,
"learning_rate": 8.898818115721008e-05,
"loss": 0.0019,
"num_input_tokens_seen": 15114608,
"step": 73
},
{
"epoch": 0.6530612244897959,
"grad_norm": 0.015412255190312862,
"learning_rate": 8.86964123795085e-05,
"loss": 0.0017,
"num_input_tokens_seen": 15326112,
"step": 74
},
{
"epoch": 0.6618863761720905,
"grad_norm": 0.013063928112387657,
"learning_rate": 8.84013203140821e-05,
"loss": 0.0015,
"num_input_tokens_seen": 15545248,
"step": 75
},
{
"epoch": 0.670711527854385,
"grad_norm": 0.016336796805262566,
"learning_rate": 8.810293030374126e-05,
"loss": 0.0017,
"num_input_tokens_seen": 15751872,
"step": 76
},
{
"epoch": 0.6795366795366795,
"grad_norm": 0.010313590988516808,
"learning_rate": 8.780126797452713e-05,
"loss": 0.001,
"num_input_tokens_seen": 15957872,
"step": 77
},
{
"epoch": 0.6883618312189741,
"grad_norm": 0.015468253754079342,
"learning_rate": 8.749635923351107e-05,
"loss": 0.0018,
"num_input_tokens_seen": 16162640,
"step": 78
},
{
"epoch": 0.6971869829012686,
"grad_norm": 0.01543041318655014,
"learning_rate": 8.71882302665696e-05,
"loss": 0.001,
"num_input_tokens_seen": 16352368,
"step": 79
},
{
"epoch": 0.7060121345835632,
"grad_norm": 0.01957864873111248,
"learning_rate": 8.687690753613554e-05,
"loss": 0.0014,
"num_input_tokens_seen": 16563920,
"step": 80
},
{
"epoch": 0.7148372862658577,
"grad_norm": 0.012508533895015717,
"learning_rate": 8.656241777892543e-05,
"loss": 0.001,
"num_input_tokens_seen": 16759024,
"step": 81
},
{
"epoch": 0.7236624379481522,
"grad_norm": 0.012273616157472134,
"learning_rate": 8.624478800364332e-05,
"loss": 0.0013,
"num_input_tokens_seen": 16973728,
"step": 82
},
{
"epoch": 0.7324875896304468,
"grad_norm": 0.01503776852041483,
"learning_rate": 8.592404548866123e-05,
"loss": 0.0012,
"num_input_tokens_seen": 17162752,
"step": 83
},
{
"epoch": 0.7413127413127413,
"grad_norm": 0.014227951876819134,
"learning_rate": 8.560021777967649e-05,
"loss": 0.0013,
"num_input_tokens_seen": 17364064,
"step": 84
},
{
"epoch": 0.7501378929950359,
"grad_norm": 0.01252016518265009,
"learning_rate": 8.527333268734606e-05,
"loss": 0.0011,
"num_input_tokens_seen": 17564576,
"step": 85
},
{
"epoch": 0.7589630446773304,
"grad_norm": 0.011520475149154663,
"learning_rate": 8.494341828489812e-05,
"loss": 0.0037,
"num_input_tokens_seen": 17778752,
"step": 86
},
{
"epoch": 0.7677881963596249,
"grad_norm": 0.010531144216656685,
"learning_rate": 8.461050290572114e-05,
"loss": 0.0007,
"num_input_tokens_seen": 17982448,
"step": 87
},
{
"epoch": 0.7766133480419195,
"grad_norm": 0.010875461623072624,
"learning_rate": 8.427461514093056e-05,
"loss": 0.0008,
"num_input_tokens_seen": 18180608,
"step": 88
},
{
"epoch": 0.785438499724214,
"grad_norm": 0.007611530367285013,
"learning_rate": 8.393578383691329e-05,
"loss": 0.0006,
"num_input_tokens_seen": 18384496,
"step": 89
},
{
"epoch": 0.7942636514065086,
"grad_norm": 0.010159923695027828,
"learning_rate": 8.359403809285053e-05,
"loss": 0.001,
"num_input_tokens_seen": 18587744,
"step": 90
},
{
"epoch": 0.803088803088803,
"grad_norm": 0.011715343222022057,
"learning_rate": 8.324940725821852e-05,
"loss": 0.001,
"num_input_tokens_seen": 18791056,
"step": 91
},
{
"epoch": 0.8119139547710976,
"grad_norm": 0.012972251512110233,
"learning_rate": 8.290192093026805e-05,
"loss": 0.0008,
"num_input_tokens_seen": 18985008,
"step": 92
},
{
"epoch": 0.8207391064533922,
"grad_norm": 0.0135871022939682,
"learning_rate": 8.255160895148263e-05,
"loss": 0.0014,
"num_input_tokens_seen": 19193888,
"step": 93
},
{
"epoch": 0.8295642581356867,
"grad_norm": 0.011914449743926525,
"learning_rate": 8.219850140701557e-05,
"loss": 0.001,
"num_input_tokens_seen": 19399552,
"step": 94
},
{
"epoch": 0.8383894098179813,
"grad_norm": 0.009591113775968552,
"learning_rate": 8.184262862210624e-05,
"loss": 0.0007,
"num_input_tokens_seen": 19605120,
"step": 95
},
{
"epoch": 0.8472145615002757,
"grad_norm": 0.009942690841853619,
"learning_rate": 8.148402115947571e-05,
"loss": 0.0008,
"num_input_tokens_seen": 19802480,
"step": 96
},
{
"epoch": 0.8560397131825703,
"grad_norm": 0.012667879462242126,
"learning_rate": 8.112270981670196e-05,
"loss": 0.0011,
"num_input_tokens_seen": 20009520,
"step": 97
},
{
"epoch": 0.8648648648648649,
"grad_norm": 0.010983509942889214,
"learning_rate": 8.075872562357501e-05,
"loss": 0.0009,
"num_input_tokens_seen": 20235888,
"step": 98
},
{
"epoch": 0.8736900165471594,
"grad_norm": 0.011479397304356098,
"learning_rate": 8.039209983943201e-05,
"loss": 0.0006,
"num_input_tokens_seen": 20433600,
"step": 99
},
{
"epoch": 0.882515168229454,
"grad_norm": 0.012184002436697483,
"learning_rate": 8.002286395047267e-05,
"loss": 0.0009,
"num_input_tokens_seen": 20631664,
"step": 100
},
{
"epoch": 0.8913403199117484,
"grad_norm": 0.009395604953169823,
"learning_rate": 7.965104966705518e-05,
"loss": 0.0006,
"num_input_tokens_seen": 20833056,
"step": 101
},
{
"epoch": 0.900165471594043,
"grad_norm": 0.013585143722593784,
"learning_rate": 7.927668892097289e-05,
"loss": 0.0008,
"num_input_tokens_seen": 21051104,
"step": 102
},
{
"epoch": 0.9089906232763376,
"grad_norm": 0.008882119320333004,
"learning_rate": 7.889981386271201e-05,
"loss": 0.0005,
"num_input_tokens_seen": 21246080,
"step": 103
},
{
"epoch": 0.9178157749586321,
"grad_norm": 0.010433576069772243,
"learning_rate": 7.852045685869045e-05,
"loss": 0.0006,
"num_input_tokens_seen": 21439696,
"step": 104
},
{
"epoch": 0.9266409266409267,
"grad_norm": 0.01474383007735014,
"learning_rate": 7.813865048847819e-05,
"loss": 0.0008,
"num_input_tokens_seen": 21648432,
"step": 105
},
{
"epoch": 0.9354660783232212,
"grad_norm": 0.011113091371953487,
"learning_rate": 7.775442754199928e-05,
"loss": 0.0007,
"num_input_tokens_seen": 21864368,
"step": 106
},
{
"epoch": 0.9442912300055157,
"grad_norm": 0.009181715548038483,
"learning_rate": 7.736782101671587e-05,
"loss": 0.0006,
"num_input_tokens_seen": 22061968,
"step": 107
},
{
"epoch": 0.9531163816878102,
"grad_norm": 0.0140100521966815,
"learning_rate": 7.697886411479423e-05,
"loss": 0.0012,
"num_input_tokens_seen": 22278128,
"step": 108
},
{
"epoch": 0.9619415333701048,
"grad_norm": 0.007349591236561537,
"learning_rate": 7.658759024025349e-05,
"loss": 0.0004,
"num_input_tokens_seen": 22469056,
"step": 109
},
{
"epoch": 0.9707666850523994,
"grad_norm": 0.01252900529652834,
"learning_rate": 7.619403299609668e-05,
"loss": 0.0008,
"num_input_tokens_seen": 22662128,
"step": 110
},
{
"epoch": 0.9795918367346939,
"grad_norm": 0.012083148583769798,
"learning_rate": 7.579822618142505e-05,
"loss": 0.0007,
"num_input_tokens_seen": 22883216,
"step": 111
},
{
"epoch": 0.9884169884169884,
"grad_norm": 0.010517132468521595,
"learning_rate": 7.540020378853523e-05,
"loss": 0.0005,
"num_input_tokens_seen": 23085888,
"step": 112
},
{
"epoch": 0.9972421400992829,
"grad_norm": 0.01143716461956501,
"learning_rate": 7.500000000000001e-05,
"loss": 0.0007,
"num_input_tokens_seen": 23307520,
"step": 113
},
{
"epoch": 1.0088251516822946,
"grad_norm": 0.0287212785333395,
"learning_rate": 7.459764918573264e-05,
"loss": 0.0014,
"num_input_tokens_seen": 23564192,
"step": 114
},
{
"epoch": 1.0176503033645892,
"grad_norm": 0.010353313758969307,
"learning_rate": 7.419318590003523e-05,
"loss": 0.0007,
"num_input_tokens_seen": 23768816,
"step": 115
},
{
"epoch": 1.0264754550468835,
"grad_norm": 0.013796573504805565,
"learning_rate": 7.378664487863103e-05,
"loss": 0.0006,
"num_input_tokens_seen": 23974096,
"step": 116
},
{
"epoch": 1.0353006067291781,
"grad_norm": 0.006352484691888094,
"learning_rate": 7.33780610356814e-05,
"loss": 0.0003,
"num_input_tokens_seen": 24172256,
"step": 117
},
{
"epoch": 1.0441257584114727,
"grad_norm": 0.007957457564771175,
"learning_rate": 7.296746946078736e-05,
"loss": 0.0004,
"num_input_tokens_seen": 24362208,
"step": 118
},
{
"epoch": 1.0529509100937673,
"grad_norm": 0.0068214968778193,
"learning_rate": 7.255490541597594e-05,
"loss": 0.0003,
"num_input_tokens_seen": 24562224,
"step": 119
},
{
"epoch": 1.0617760617760619,
"grad_norm": 0.00877879373729229,
"learning_rate": 7.214040433267198e-05,
"loss": 0.0005,
"num_input_tokens_seen": 24776528,
"step": 120
},
{
"epoch": 1.0706012134583562,
"grad_norm": 0.007200079504400492,
"learning_rate": 7.172400180865513e-05,
"loss": 0.0003,
"num_input_tokens_seen": 24985008,
"step": 121
},
{
"epoch": 1.0794263651406508,
"grad_norm": 0.010829208418726921,
"learning_rate": 7.130573360500276e-05,
"loss": 0.0005,
"num_input_tokens_seen": 25200720,
"step": 122
},
{
"epoch": 1.0882515168229454,
"grad_norm": 0.010170291177928448,
"learning_rate": 7.088563564301873e-05,
"loss": 0.0004,
"num_input_tokens_seen": 25413568,
"step": 123
},
{
"epoch": 1.09707666850524,
"grad_norm": 0.007032219786196947,
"learning_rate": 7.046374400114842e-05,
"loss": 0.0003,
"num_input_tokens_seen": 25608576,
"step": 124
},
{
"epoch": 1.1059018201875346,
"grad_norm": 0.00843306165188551,
"learning_rate": 7.004009491188022e-05,
"loss": 0.0003,
"num_input_tokens_seen": 25818400,
"step": 125
},
{
"epoch": 1.114726971869829,
"grad_norm": 0.00947788916528225,
"learning_rate": 6.961472475863405e-05,
"loss": 0.0005,
"num_input_tokens_seen": 26037424,
"step": 126
},
{
"epoch": 1.1235521235521235,
"grad_norm": 0.009593469090759754,
"learning_rate": 6.918767007263646e-05,
"loss": 0.0005,
"num_input_tokens_seen": 26250480,
"step": 127
},
{
"epoch": 1.1323772752344181,
"grad_norm": 0.012611499056220055,
"learning_rate": 6.875896752978344e-05,
"loss": 0.0005,
"num_input_tokens_seen": 26458592,
"step": 128
},
{
"epoch": 1.1412024269167127,
"grad_norm": 0.005860932637006044,
"learning_rate": 6.832865394749065e-05,
"loss": 0.0004,
"num_input_tokens_seen": 26680256,
"step": 129
},
{
"epoch": 1.150027578599007,
"grad_norm": 0.008905632421374321,
"learning_rate": 6.789676628153143e-05,
"loss": 0.0004,
"num_input_tokens_seen": 26887424,
"step": 130
},
{
"epoch": 1.1588527302813016,
"grad_norm": 0.00839240662753582,
"learning_rate": 6.746334162286307e-05,
"loss": 0.0003,
"num_input_tokens_seen": 27112736,
"step": 131
},
{
"epoch": 1.1676778819635962,
"grad_norm": 0.010829194448888302,
"learning_rate": 6.702841719444141e-05,
"loss": 0.0004,
"num_input_tokens_seen": 27320064,
"step": 132
},
{
"epoch": 1.1765030336458908,
"grad_norm": 0.005576102528721094,
"learning_rate": 6.659203034802397e-05,
"loss": 0.0003,
"num_input_tokens_seen": 27520544,
"step": 133
},
{
"epoch": 1.1853281853281854,
"grad_norm": 0.008609413169324398,
"learning_rate": 6.615421856096231e-05,
"loss": 0.0009,
"num_input_tokens_seen": 27737920,
"step": 134
},
{
"epoch": 1.19415333701048,
"grad_norm": 0.013195198960602283,
"learning_rate": 6.571501943298334e-05,
"loss": 0.0014,
"num_input_tokens_seen": 27947552,
"step": 135
},
{
"epoch": 1.2029784886927744,
"grad_norm": 0.008647961542010307,
"learning_rate": 6.527447068296026e-05,
"loss": 0.0003,
"num_input_tokens_seen": 28143808,
"step": 136
},
{
"epoch": 1.211803640375069,
"grad_norm": 0.006975845899432898,
"learning_rate": 6.483261014567311e-05,
"loss": 0.0002,
"num_input_tokens_seen": 28349312,
"step": 137
},
{
"epoch": 1.2206287920573635,
"grad_norm": 0.013750969432294369,
"learning_rate": 6.438947576855968e-05,
"loss": 0.0002,
"num_input_tokens_seen": 28560096,
"step": 138
},
{
"epoch": 1.229453943739658,
"grad_norm": 0.009799162857234478,
"learning_rate": 6.394510560845637e-05,
"loss": 0.0005,
"num_input_tokens_seen": 28764544,
"step": 139
},
{
"epoch": 1.2382790954219525,
"grad_norm": 0.00819414108991623,
"learning_rate": 6.349953782832991e-05,
"loss": 0.0004,
"num_input_tokens_seen": 28949360,
"step": 140
},
{
"epoch": 1.247104247104247,
"grad_norm": 0.008884673938155174,
"learning_rate": 6.305281069399989e-05,
"loss": 0.0002,
"num_input_tokens_seen": 29148112,
"step": 141
},
{
"epoch": 1.2559293987865416,
"grad_norm": 0.009248818270862103,
"learning_rate": 6.26049625708524e-05,
"loss": 0.0004,
"num_input_tokens_seen": 29370624,
"step": 142
},
{
"epoch": 1.2647545504688362,
"grad_norm": 0.008902438916265965,
"learning_rate": 6.215603192054522e-05,
"loss": 0.0003,
"num_input_tokens_seen": 29572464,
"step": 143
},
{
"epoch": 1.2735797021511308,
"grad_norm": 0.012439709156751633,
"learning_rate": 6.17060572977047e-05,
"loss": 0.0006,
"num_input_tokens_seen": 29771152,
"step": 144
},
{
"epoch": 1.2824048538334254,
"grad_norm": 0.013059360906481743,
"learning_rate": 6.125507734661458e-05,
"loss": 0.0003,
"num_input_tokens_seen": 29954960,
"step": 145
},
{
"epoch": 1.2912300055157198,
"grad_norm": 0.011295526288449764,
"learning_rate": 6.080313079789723e-05,
"loss": 0.0004,
"num_input_tokens_seen": 30165568,
"step": 146
},
{
"epoch": 1.3000551571980143,
"grad_norm": 0.01000818982720375,
"learning_rate": 6.035025646518746e-05,
"loss": 0.0005,
"num_input_tokens_seen": 30372160,
"step": 147
},
{
"epoch": 1.308880308880309,
"grad_norm": 0.010914387181401253,
"learning_rate": 5.989649324179911e-05,
"loss": 0.0003,
"num_input_tokens_seen": 30572752,
"step": 148
},
{
"epoch": 1.3177054605626033,
"grad_norm": 0.009289560839533806,
"learning_rate": 5.944188009738483e-05,
"loss": 0.0004,
"num_input_tokens_seen": 30780496,
"step": 149
},
{
"epoch": 1.3265306122448979,
"grad_norm": 0.015559184364974499,
"learning_rate": 5.8986456074589404e-05,
"loss": 0.0004,
"num_input_tokens_seen": 30975120,
"step": 150
},
{
"epoch": 1.3353557639271925,
"grad_norm": 0.00643413420766592,
"learning_rate": 5.853026028569667e-05,
"loss": 0.0002,
"num_input_tokens_seen": 31174000,
"step": 151
},
{
"epoch": 1.344180915609487,
"grad_norm": 0.0077626509591937065,
"learning_rate": 5.807333190927053e-05,
"loss": 0.0003,
"num_input_tokens_seen": 31387088,
"step": 152
},
{
"epoch": 1.3530060672917816,
"grad_norm": 0.0083751380443573,
"learning_rate": 5.761571018679025e-05,
"loss": 0.0003,
"num_input_tokens_seen": 31576400,
"step": 153
},
{
"epoch": 1.3618312189740762,
"grad_norm": 0.007961435243487358,
"learning_rate": 5.715743441928041e-05,
"loss": 0.0003,
"num_input_tokens_seen": 31784320,
"step": 154
},
{
"epoch": 1.3706563706563706,
"grad_norm": 0.006737589370459318,
"learning_rate": 5.669854396393559e-05,
"loss": 0.0004,
"num_input_tokens_seen": 31987520,
"step": 155
},
{
"epoch": 1.3794815223386652,
"grad_norm": 0.014642222784459591,
"learning_rate": 5.6239078230740436e-05,
"loss": 0.0004,
"num_input_tokens_seen": 32187456,
"step": 156
},
{
"epoch": 1.3883066740209598,
"grad_norm": 0.006064648274332285,
"learning_rate": 5.5779076679085054e-05,
"loss": 0.0002,
"num_input_tokens_seen": 32384528,
"step": 157
},
{
"epoch": 1.3971318257032543,
"grad_norm": 0.009461612440645695,
"learning_rate": 5.531857881437612e-05,
"loss": 0.0004,
"num_input_tokens_seen": 32593040,
"step": 158
},
{
"epoch": 1.4059569773855487,
"grad_norm": 0.007511747535318136,
"learning_rate": 5.48576241846443e-05,
"loss": 0.0003,
"num_input_tokens_seen": 32797952,
"step": 159
},
{
"epoch": 1.4147821290678433,
"grad_norm": 0.02702983096241951,
"learning_rate": 5.4396252377147615e-05,
"loss": 0.0003,
"num_input_tokens_seen": 33008800,
"step": 160
},
{
"epoch": 1.4236072807501379,
"grad_norm": 0.008439299650490284,
"learning_rate": 5.3934503014971793e-05,
"loss": 0.0003,
"num_input_tokens_seen": 33208352,
"step": 161
},
{
"epoch": 1.4324324324324325,
"grad_norm": 0.0037907836958765984,
"learning_rate": 5.347241575362729e-05,
"loss": 0.0002,
"num_input_tokens_seen": 33410208,
"step": 162
},
{
"epoch": 1.441257584114727,
"grad_norm": 0.008237862028181553,
"learning_rate": 5.30100302776438e-05,
"loss": 0.0003,
"num_input_tokens_seen": 33631888,
"step": 163
},
{
"epoch": 1.4500827357970216,
"grad_norm": 0.009860441088676453,
"learning_rate": 5.254738629716186e-05,
"loss": 0.0004,
"num_input_tokens_seen": 33825152,
"step": 164
},
{
"epoch": 1.458907887479316,
"grad_norm": 0.007564296945929527,
"learning_rate": 5.208452354452274e-05,
"loss": 0.0003,
"num_input_tokens_seen": 34020352,
"step": 165
},
{
"epoch": 1.4677330391616106,
"grad_norm": 0.019607344642281532,
"learning_rate": 5.162148177085604e-05,
"loss": 0.0004,
"num_input_tokens_seen": 34226288,
"step": 166
},
{
"epoch": 1.4765581908439052,
"grad_norm": 0.007924061268568039,
"learning_rate": 5.115830074266591e-05,
"loss": 0.0016,
"num_input_tokens_seen": 34426672,
"step": 167
},
{
"epoch": 1.4853833425261997,
"grad_norm": 0.006358864717185497,
"learning_rate": 5.0695020238415756e-05,
"loss": 0.0002,
"num_input_tokens_seen": 34636944,
"step": 168
},
{
"epoch": 1.494208494208494,
"grad_norm": 0.010681587271392345,
"learning_rate": 5.0231680045112176e-05,
"loss": 0.0003,
"num_input_tokens_seen": 34839456,
"step": 169
},
{
"epoch": 1.5030336458907887,
"grad_norm": 0.01033815648406744,
"learning_rate": 4.976831995488784e-05,
"loss": 0.0002,
"num_input_tokens_seen": 35031600,
"step": 170
},
{
"epoch": 1.5118587975730833,
"grad_norm": 0.016812577843666077,
"learning_rate": 4.9304979761584256e-05,
"loss": 0.0004,
"num_input_tokens_seen": 35227728,
"step": 171
},
{
"epoch": 1.5206839492553779,
"grad_norm": 0.008957776241004467,
"learning_rate": 4.884169925733409e-05,
"loss": 0.0002,
"num_input_tokens_seen": 35436528,
"step": 172
},
{
"epoch": 1.5295091009376725,
"grad_norm": 0.006675931625068188,
"learning_rate": 4.837851822914397e-05,
"loss": 0.0002,
"num_input_tokens_seen": 35628624,
"step": 173
},
{
"epoch": 1.538334252619967,
"grad_norm": 0.006146900821477175,
"learning_rate": 4.791547645547726e-05,
"loss": 0.0002,
"num_input_tokens_seen": 35827376,
"step": 174
},
{
"epoch": 1.5471594043022614,
"grad_norm": 0.012180755846202374,
"learning_rate": 4.745261370283817e-05,
"loss": 0.0003,
"num_input_tokens_seen": 36056560,
"step": 175
},
{
"epoch": 1.555984555984556,
"grad_norm": 0.00920344889163971,
"learning_rate": 4.698996972235622e-05,
"loss": 0.0002,
"num_input_tokens_seen": 36267568,
"step": 176
},
{
"epoch": 1.5648097076668506,
"grad_norm": 0.010103096254169941,
"learning_rate": 4.652758424637271e-05,
"loss": 0.0027,
"num_input_tokens_seen": 36473008,
"step": 177
},
{
"epoch": 1.573634859349145,
"grad_norm": 0.012086655013263226,
"learning_rate": 4.606549698502823e-05,
"loss": 0.0004,
"num_input_tokens_seen": 36670944,
"step": 178
},
{
"epoch": 1.5824600110314395,
"grad_norm": 0.0054108137264847755,
"learning_rate": 4.56037476228524e-05,
"loss": 0.0001,
"num_input_tokens_seen": 36882256,
"step": 179
},
{
"epoch": 1.591285162713734,
"grad_norm": 0.014871139079332352,
"learning_rate": 4.5142375815355706e-05,
"loss": 0.0004,
"num_input_tokens_seen": 37091392,
"step": 180
},
{
"epoch": 1.6001103143960287,
"grad_norm": 0.005915229208767414,
"learning_rate": 4.468142118562389e-05,
"loss": 0.0002,
"num_input_tokens_seen": 37309680,
"step": 181
},
{
"epoch": 1.6089354660783233,
"grad_norm": 0.006937643978744745,
"learning_rate": 4.4220923320914964e-05,
"loss": 0.0003,
"num_input_tokens_seen": 37517952,
"step": 182
},
{
"epoch": 1.6177606177606179,
"grad_norm": 0.00866376981139183,
"learning_rate": 4.376092176925958e-05,
"loss": 0.0003,
"num_input_tokens_seen": 37732160,
"step": 183
},
{
"epoch": 1.6265857694429124,
"grad_norm": 0.007841500453650951,
"learning_rate": 4.330145603606441e-05,
"loss": 0.0004,
"num_input_tokens_seen": 37940368,
"step": 184
},
{
"epoch": 1.6354109211252068,
"grad_norm": 0.008568421937525272,
"learning_rate": 4.2842565580719595e-05,
"loss": 0.0004,
"num_input_tokens_seen": 38135024,
"step": 185
},
{
"epoch": 1.6442360728075014,
"grad_norm": 0.011796732433140278,
"learning_rate": 4.238428981320975e-05,
"loss": 0.0002,
"num_input_tokens_seen": 38336176,
"step": 186
},
{
"epoch": 1.6530612244897958,
"grad_norm": 0.00755694042891264,
"learning_rate": 4.192666809072948e-05,
"loss": 0.0003,
"num_input_tokens_seen": 38548880,
"step": 187
},
{
"epoch": 1.6618863761720903,
"grad_norm": 0.01243317686021328,
"learning_rate": 4.146973971430333e-05,
"loss": 0.0003,
"num_input_tokens_seen": 38755920,
"step": 188
},
{
"epoch": 1.670711527854385,
"grad_norm": 0.006207725498825312,
"learning_rate": 4.101354392541061e-05,
"loss": 0.0002,
"num_input_tokens_seen": 38973328,
"step": 189
},
{
"epoch": 1.6795366795366795,
"grad_norm": 0.008532355539500713,
"learning_rate": 4.0558119902615174e-05,
"loss": 0.0003,
"num_input_tokens_seen": 39193232,
"step": 190
},
{
"epoch": 1.688361831218974,
"grad_norm": 0.008602111600339413,
"learning_rate": 4.010350675820091e-05,
"loss": 0.0003,
"num_input_tokens_seen": 39406608,
"step": 191
},
{
"epoch": 1.6971869829012687,
"grad_norm": 0.008903734385967255,
"learning_rate": 3.964974353481254e-05,
"loss": 0.0004,
"num_input_tokens_seen": 39620160,
"step": 192
},
{
"epoch": 1.7060121345835633,
"grad_norm": 0.005871508736163378,
"learning_rate": 3.919686920210277e-05,
"loss": 0.0001,
"num_input_tokens_seen": 39815952,
"step": 193
},
{
"epoch": 1.7148372862658579,
"grad_norm": 0.008220325224101543,
"learning_rate": 3.874492265338544e-05,
"loss": 0.0003,
"num_input_tokens_seen": 40015408,
"step": 194
},
{
"epoch": 1.7236624379481522,
"grad_norm": 0.00940727163106203,
"learning_rate": 3.829394270229531e-05,
"loss": 0.0002,
"num_input_tokens_seen": 40215328,
"step": 195
},
{
"epoch": 1.7324875896304468,
"grad_norm": 0.005745697300881147,
"learning_rate": 3.784396807945477e-05,
"loss": 0.0002,
"num_input_tokens_seen": 40414384,
"step": 196
},
{
"epoch": 1.7413127413127412,
"grad_norm": 0.009524352848529816,
"learning_rate": 3.7395037429147615e-05,
"loss": 0.0002,
"num_input_tokens_seen": 40620656,
"step": 197
},
{
"epoch": 1.7501378929950357,
"grad_norm": 0.00809427909553051,
"learning_rate": 3.694718930600012e-05,
"loss": 0.0003,
"num_input_tokens_seen": 40847008,
"step": 198
},
{
"epoch": 1.7589630446773303,
"grad_norm": 0.0051635075360536575,
"learning_rate": 3.65004621716701e-05,
"loss": 0.0001,
"num_input_tokens_seen": 41036368,
"step": 199
},
{
"epoch": 1.767788196359625,
"grad_norm": 0.006504002492874861,
"learning_rate": 3.6054894391543646e-05,
"loss": 0.0003,
"num_input_tokens_seen": 41252976,
"step": 200
},
{
"epoch": 1.7766133480419195,
"grad_norm": 0.009855791926383972,
"learning_rate": 3.561052423144032e-05,
"loss": 0.0002,
"num_input_tokens_seen": 41465104,
"step": 201
},
{
"epoch": 1.785438499724214,
"grad_norm": 0.004304118454456329,
"learning_rate": 3.5167389854326905e-05,
"loss": 0.0002,
"num_input_tokens_seen": 41670800,
"step": 202
},
{
"epoch": 1.7942636514065087,
"grad_norm": 0.014682441018521786,
"learning_rate": 3.4725529317039754e-05,
"loss": 0.0013,
"num_input_tokens_seen": 41883536,
"step": 203
},
{
"epoch": 1.803088803088803,
"grad_norm": 0.0061918287537992,
"learning_rate": 3.428498056701665e-05,
"loss": 0.0001,
"num_input_tokens_seen": 42083360,
"step": 204
},
{
"epoch": 1.8119139547710976,
"grad_norm": 0.009490927681326866,
"learning_rate": 3.38457814390377e-05,
"loss": 0.0002,
"num_input_tokens_seen": 42283120,
"step": 205
},
{
"epoch": 1.8207391064533922,
"grad_norm": 0.008434086106717587,
"learning_rate": 3.340796965197604e-05,
"loss": 0.0003,
"num_input_tokens_seen": 42499088,
"step": 206
},
{
"epoch": 1.8295642581356866,
"grad_norm": 0.004052174277603626,
"learning_rate": 3.297158280555862e-05,
"loss": 0.0001,
"num_input_tokens_seen": 42692976,
"step": 207
},
{
"epoch": 1.8383894098179812,
"grad_norm": 0.007411065977066755,
"learning_rate": 3.2536658377136935e-05,
"loss": 0.0003,
"num_input_tokens_seen": 42907216,
"step": 208
},
{
"epoch": 1.8472145615002757,
"grad_norm": 0.006996455602347851,
"learning_rate": 3.210323371846857e-05,
"loss": 0.0001,
"num_input_tokens_seen": 43112448,
"step": 209
},
{
"epoch": 1.8560397131825703,
"grad_norm": 0.006998082622885704,
"learning_rate": 3.167134605250938e-05,
"loss": 0.0003,
"num_input_tokens_seen": 43340096,
"step": 210
},
{
"epoch": 1.864864864864865,
"grad_norm": 0.006418649572879076,
"learning_rate": 3.124103247021657e-05,
"loss": 0.0001,
"num_input_tokens_seen": 43539664,
"step": 211
},
{
"epoch": 1.8736900165471595,
"grad_norm": 0.009151714853942394,
"learning_rate": 3.081232992736355e-05,
"loss": 0.0003,
"num_input_tokens_seen": 43727664,
"step": 212
},
{
"epoch": 1.882515168229454,
"grad_norm": 0.004692760296165943,
"learning_rate": 3.0385275241365962e-05,
"loss": 0.0002,
"num_input_tokens_seen": 43953584,
"step": 213
},
{
"epoch": 1.8913403199117484,
"grad_norm": 0.006455820985138416,
"learning_rate": 2.9959905088119776e-05,
"loss": 0.0002,
"num_input_tokens_seen": 44157504,
"step": 214
},
{
"epoch": 1.900165471594043,
"grad_norm": 0.006325691007077694,
"learning_rate": 2.9536255998851613e-05,
"loss": 0.0001,
"num_input_tokens_seen": 44350448,
"step": 215
},
{
"epoch": 1.9089906232763376,
"grad_norm": 0.006784004159271717,
"learning_rate": 2.9114364356981272e-05,
"loss": 0.0002,
"num_input_tokens_seen": 44561472,
"step": 216
},
{
"epoch": 1.917815774958632,
"grad_norm": 0.008874817751348019,
"learning_rate": 2.8694266394997238e-05,
"loss": 0.0002,
"num_input_tokens_seen": 44769936,
"step": 217
},
{
"epoch": 1.9266409266409266,
"grad_norm": 0.006964050233364105,
"learning_rate": 2.8275998191344888e-05,
"loss": 0.0002,
"num_input_tokens_seen": 44979344,
"step": 218
},
{
"epoch": 1.9354660783232212,
"grad_norm": 0.014264012686908245,
"learning_rate": 2.7859595667328026e-05,
"loss": 0.0002,
"num_input_tokens_seen": 45196944,
"step": 219
},
{
"epoch": 1.9442912300055157,
"grad_norm": 0.005279663018882275,
"learning_rate": 2.7445094584024067e-05,
"loss": 0.0001,
"num_input_tokens_seen": 45406832,
"step": 220
},
{
"epoch": 1.9531163816878103,
"grad_norm": 0.0171637125313282,
"learning_rate": 2.7032530539212658e-05,
"loss": 0.0003,
"num_input_tokens_seen": 45603120,
"step": 221
},
{
"epoch": 1.961941533370105,
"grad_norm": 0.007687513716518879,
"learning_rate": 2.6621938964318595e-05,
"loss": 0.0002,
"num_input_tokens_seen": 45805184,
"step": 222
},
{
"epoch": 1.9707666850523995,
"grad_norm": 0.0034611017908900976,
"learning_rate": 2.621335512136899e-05,
"loss": 0.0001,
"num_input_tokens_seen": 46001184,
"step": 223
},
{
"epoch": 1.9795918367346939,
"grad_norm": 0.004358428996056318,
"learning_rate": 2.5806814099964772e-05,
"loss": 0.0002,
"num_input_tokens_seen": 46206288,
"step": 224
},
{
"epoch": 1.9884169884169884,
"grad_norm": 0.008765267208218575,
"learning_rate": 2.540235081426736e-05,
"loss": 0.0002,
"num_input_tokens_seen": 46427344,
"step": 225
},
{
"epoch": 1.9972421400992828,
"grad_norm": 0.006889387033879757,
"learning_rate": 2.500000000000001e-05,
"loss": 0.0003,
"num_input_tokens_seen": 46627344,
"step": 226
},
{
"epoch": 2.0088251516822946,
"grad_norm": 0.043494511395692825,
"learning_rate": 2.459979621146477e-05,
"loss": 0.0011,
"num_input_tokens_seen": 46901504,
"step": 227
},
{
"epoch": 2.017650303364589,
"grad_norm": 0.007718184031546116,
"learning_rate": 2.4201773818574956e-05,
"loss": 0.0001,
"num_input_tokens_seen": 47104400,
"step": 228
},
{
"epoch": 2.0264754550468838,
"grad_norm": 0.003912526648491621,
"learning_rate": 2.3805967003903333e-05,
"loss": 0.0001,
"num_input_tokens_seen": 47314176,
"step": 229
},
{
"epoch": 2.0353006067291783,
"grad_norm": 0.010783454403281212,
"learning_rate": 2.3412409759746528e-05,
"loss": 0.0003,
"num_input_tokens_seen": 47525264,
"step": 230
},
{
"epoch": 2.0441257584114725,
"grad_norm": 0.0026623259764164686,
"learning_rate": 2.302113588520578e-05,
"loss": 0.0001,
"num_input_tokens_seen": 47724528,
"step": 231
},
{
"epoch": 2.052950910093767,
"grad_norm": 0.00557671207934618,
"learning_rate": 2.2632178983284153e-05,
"loss": 0.0002,
"num_input_tokens_seen": 47932624,
"step": 232
},
{
"epoch": 2.0617760617760617,
"grad_norm": 0.003710981458425522,
"learning_rate": 2.2245572458000712e-05,
"loss": 0.0001,
"num_input_tokens_seen": 48148608,
"step": 233
},
{
"epoch": 2.0706012134583562,
"grad_norm": 0.009742701426148415,
"learning_rate": 2.1861349511521815e-05,
"loss": 0.0025,
"num_input_tokens_seen": 48373632,
"step": 234
},
{
"epoch": 2.079426365140651,
"grad_norm": 0.009755464270710945,
"learning_rate": 2.147954314130955e-05,
"loss": 0.0013,
"num_input_tokens_seen": 48586512,
"step": 235
},
{
"epoch": 2.0882515168229454,
"grad_norm": 0.002706202445551753,
"learning_rate": 2.1100186137288e-05,
"loss": 0.0001,
"num_input_tokens_seen": 48793568,
"step": 236
},
{
"epoch": 2.09707666850524,
"grad_norm": 0.005180325359106064,
"learning_rate": 2.072331107902713e-05,
"loss": 0.0001,
"num_input_tokens_seen": 49006224,
"step": 237
},
{
"epoch": 2.1059018201875346,
"grad_norm": 0.005968959536403418,
"learning_rate": 2.0348950332944834e-05,
"loss": 0.0002,
"num_input_tokens_seen": 49217632,
"step": 238
},
{
"epoch": 2.114726971869829,
"grad_norm": 0.0063306307420134544,
"learning_rate": 1.9977136049527345e-05,
"loss": 0.0001,
"num_input_tokens_seen": 49426624,
"step": 239
},
{
"epoch": 2.1235521235521237,
"grad_norm": 0.005157762672752142,
"learning_rate": 1.960790016056801e-05,
"loss": 0.0001,
"num_input_tokens_seen": 49623376,
"step": 240
},
{
"epoch": 2.132377275234418,
"grad_norm": 0.005218483041971922,
"learning_rate": 1.9241274376425e-05,
"loss": 0.0002,
"num_input_tokens_seen": 49828144,
"step": 241
},
{
"epoch": 2.1412024269167125,
"grad_norm": 0.00744604179635644,
"learning_rate": 1.8877290183298057e-05,
"loss": 0.0002,
"num_input_tokens_seen": 50018448,
"step": 242
},
{
"epoch": 2.150027578599007,
"grad_norm": 0.005399591755121946,
"learning_rate": 1.8515978840524302e-05,
"loss": 0.0001,
"num_input_tokens_seen": 50218176,
"step": 243
},
{
"epoch": 2.1588527302813016,
"grad_norm": 0.005761398002505302,
"learning_rate": 1.815737137789377e-05,
"loss": 0.0002,
"num_input_tokens_seen": 50424896,
"step": 244
},
{
"epoch": 2.1676778819635962,
"grad_norm": 0.006964447908103466,
"learning_rate": 1.7801498592984446e-05,
"loss": 0.0006,
"num_input_tokens_seen": 50635088,
"step": 245
},
{
"epoch": 2.176503033645891,
"grad_norm": 0.002962745726108551,
"learning_rate": 1.7448391048517376e-05,
"loss": 0.0001,
"num_input_tokens_seen": 50849552,
"step": 246
},
{
"epoch": 2.1853281853281854,
"grad_norm": 0.005332667380571365,
"learning_rate": 1.7098079069731958e-05,
"loss": 0.0002,
"num_input_tokens_seen": 51037776,
"step": 247
},
{
"epoch": 2.19415333701048,
"grad_norm": 0.006928949151188135,
"learning_rate": 1.6750592741781497e-05,
"loss": 0.0002,
"num_input_tokens_seen": 51242672,
"step": 248
},
{
"epoch": 2.2029784886927746,
"grad_norm": 0.004213888198137283,
"learning_rate": 1.640596190714947e-05,
"loss": 0.0001,
"num_input_tokens_seen": 51437008,
"step": 249
},
{
"epoch": 2.211803640375069,
"grad_norm": 0.010446918196976185,
"learning_rate": 1.6064216163086716e-05,
"loss": 0.0001,
"num_input_tokens_seen": 51641264,
"step": 250
},
{
"epoch": 2.2206287920573633,
"grad_norm": 0.004029524512588978,
"learning_rate": 1.5725384859069455e-05,
"loss": 0.0001,
"num_input_tokens_seen": 51842592,
"step": 251
},
{
"epoch": 2.229453943739658,
"grad_norm": 0.006790219806134701,
"learning_rate": 1.538949709427886e-05,
"loss": 0.0012,
"num_input_tokens_seen": 52047456,
"step": 252
},
{
"epoch": 2.2382790954219525,
"grad_norm": 0.003987099044024944,
"learning_rate": 1.5056581715101886e-05,
"loss": 0.0001,
"num_input_tokens_seen": 52242208,
"step": 253
},
{
"epoch": 2.247104247104247,
"grad_norm": 0.008930574171245098,
"learning_rate": 1.472666731265394e-05,
"loss": 0.0003,
"num_input_tokens_seen": 52436800,
"step": 254
},
{
"epoch": 2.2559293987865416,
"grad_norm": 0.004108684603124857,
"learning_rate": 1.4399782220323515e-05,
"loss": 0.0001,
"num_input_tokens_seen": 52624752,
"step": 255
},
{
"epoch": 2.2647545504688362,
"grad_norm": 0.00732703972607851,
"learning_rate": 1.4075954511338785e-05,
"loss": 0.0001,
"num_input_tokens_seen": 52836384,
"step": 256
},
{
"epoch": 2.273579702151131,
"grad_norm": 0.006608397234231234,
"learning_rate": 1.3755211996356687e-05,
"loss": 0.0001,
"num_input_tokens_seen": 53059296,
"step": 257
},
{
"epoch": 2.2824048538334254,
"grad_norm": 0.002376733347773552,
"learning_rate": 1.3437582221074573e-05,
"loss": 0.0001,
"num_input_tokens_seen": 53267440,
"step": 258
},
{
"epoch": 2.29123000551572,
"grad_norm": 0.004921163432300091,
"learning_rate": 1.3123092463864456e-05,
"loss": 0.0001,
"num_input_tokens_seen": 53501008,
"step": 259
},
{
"epoch": 2.300055157198014,
"grad_norm": 0.0034377635456621647,
"learning_rate": 1.2811769733430406e-05,
"loss": 0.0001,
"num_input_tokens_seen": 53700432,
"step": 260
},
{
"epoch": 2.3088803088803087,
"grad_norm": 0.006821690127253532,
"learning_rate": 1.250364076648894e-05,
"loss": 0.0002,
"num_input_tokens_seen": 53919616,
"step": 261
},
{
"epoch": 2.3177054605626033,
"grad_norm": 0.004776927176862955,
"learning_rate": 1.2198732025472876e-05,
"loss": 0.0001,
"num_input_tokens_seen": 54130528,
"step": 262
},
{
"epoch": 2.326530612244898,
"grad_norm": 0.004824692849069834,
"learning_rate": 1.1897069696258755e-05,
"loss": 0.0002,
"num_input_tokens_seen": 54350560,
"step": 263
},
{
"epoch": 2.3353557639271925,
"grad_norm": 0.005174586083739996,
"learning_rate": 1.1598679685917901e-05,
"loss": 0.0001,
"num_input_tokens_seen": 54542224,
"step": 264
},
{
"epoch": 2.344180915609487,
"grad_norm": 0.012352543883025646,
"learning_rate": 1.1303587620491513e-05,
"loss": 0.0002,
"num_input_tokens_seen": 54745136,
"step": 265
},
{
"epoch": 2.3530060672917816,
"grad_norm": 0.005056153051555157,
"learning_rate": 1.1011818842789928e-05,
"loss": 0.0001,
"num_input_tokens_seen": 54957584,
"step": 266
},
{
"epoch": 2.361831218974076,
"grad_norm": 0.010525842197239399,
"learning_rate": 1.0723398410216084e-05,
"loss": 0.0001,
"num_input_tokens_seen": 55162496,
"step": 267
},
{
"epoch": 2.370656370656371,
"grad_norm": 0.0092442212626338,
"learning_rate": 1.0438351092613569e-05,
"loss": 0.0002,
"num_input_tokens_seen": 55376544,
"step": 268
},
{
"epoch": 2.3794815223386654,
"grad_norm": 0.00699999462813139,
"learning_rate": 1.0156701370139454e-05,
"loss": 0.0001,
"num_input_tokens_seen": 55583072,
"step": 269
},
{
"epoch": 2.38830667402096,
"grad_norm": 0.007677710149437189,
"learning_rate": 9.878473431161767e-06,
"loss": 0.0002,
"num_input_tokens_seen": 55801200,
"step": 270
},
{
"epoch": 2.397131825703254,
"grad_norm": 0.003174175275489688,
"learning_rate": 9.603691170182317e-06,
"loss": 0.0001,
"num_input_tokens_seen": 55998080,
"step": 271
},
{
"epoch": 2.4059569773855487,
"grad_norm": 0.005871200002729893,
"learning_rate": 9.33237818578449e-06,
"loss": 0.0002,
"num_input_tokens_seen": 56200448,
"step": 272
},
{
"epoch": 2.4147821290678433,
"grad_norm": 0.00371691957116127,
"learning_rate": 9.064557778606631e-06,
"loss": 0.0001,
"num_input_tokens_seen": 56400416,
"step": 273
},
{
"epoch": 2.423607280750138,
"grad_norm": 0.007599337492138147,
"learning_rate": 8.800252949340998e-06,
"loss": 0.0002,
"num_input_tokens_seen": 56606128,
"step": 274
},
{
"epoch": 2.4324324324324325,
"grad_norm": 0.0015243644593283534,
"learning_rate": 8.539486396758356e-06,
"loss": 0.0,
"num_input_tokens_seen": 56797824,
"step": 275
},
{
"epoch": 2.441257584114727,
"grad_norm": 0.0030196798034012318,
"learning_rate": 8.28228051575864e-06,
"loss": 0.0001,
"num_input_tokens_seen": 57006384,
"step": 276
},
{
"epoch": 2.4500827357970216,
"grad_norm": 0.005347589962184429,
"learning_rate": 8.02865739544767e-06,
"loss": 0.0001,
"num_input_tokens_seen": 57207824,
"step": 277
},
{
"epoch": 2.458907887479316,
"grad_norm": 0.005150883924216032,
"learning_rate": 7.778638817240042e-06,
"loss": 0.0001,
"num_input_tokens_seen": 57415152,
"step": 278
},
{
"epoch": 2.467733039161611,
"grad_norm": 0.006857512053102255,
"learning_rate": 7.532246252988617e-06,
"loss": 0.0001,
"num_input_tokens_seen": 57628096,
"step": 279
},
{
"epoch": 2.476558190843905,
"grad_norm": 0.005364645272493362,
"learning_rate": 7.289500863140414e-06,
"loss": 0.0001,
"num_input_tokens_seen": 57824064,
"step": 280
},
{
"epoch": 2.4853833425261995,
"grad_norm": 0.007198365870863199,
"learning_rate": 7.05042349491935e-06,
"loss": 0.0002,
"num_input_tokens_seen": 58042720,
"step": 281
},
{
"epoch": 2.494208494208494,
"grad_norm": 0.005014900583773851,
"learning_rate": 6.815034680535915e-06,
"loss": 0.0001,
"num_input_tokens_seen": 58255408,
"step": 282
},
{
"epoch": 2.5030336458907887,
"grad_norm": 0.008873779326677322,
"learning_rate": 6.5833546354237556e-06,
"loss": 0.0001,
"num_input_tokens_seen": 58464800,
"step": 283
},
{
"epoch": 2.5118587975730833,
"grad_norm": 0.0044725253246724606,
"learning_rate": 6.355403256503595e-06,
"loss": 0.0001,
"num_input_tokens_seen": 58672496,
"step": 284
},
{
"epoch": 2.520683949255378,
"grad_norm": 0.0047348616644740105,
"learning_rate": 6.1312001204745115e-06,
"loss": 0.0002,
"num_input_tokens_seen": 58898256,
"step": 285
},
{
"epoch": 2.5295091009376725,
"grad_norm": 0.00710884016007185,
"learning_rate": 5.910764482132575e-06,
"loss": 0.0001,
"num_input_tokens_seen": 59107152,
"step": 286
},
{
"epoch": 2.538334252619967,
"grad_norm": 0.007686229422688484,
"learning_rate": 5.6941152727173265e-06,
"loss": 0.0002,
"num_input_tokens_seen": 59307664,
"step": 287
},
{
"epoch": 2.5471594043022616,
"grad_norm": 0.014555118046700954,
"learning_rate": 5.481271098285817e-06,
"loss": 0.0003,
"num_input_tokens_seen": 59514736,
"step": 288
},
{
"epoch": 2.5559845559845558,
"grad_norm": 0.0028200196102261543,
"learning_rate": 5.272250238114856e-06,
"loss": 0.0001,
"num_input_tokens_seen": 59712512,
"step": 289
},
{
"epoch": 2.564809707666851,
"grad_norm": 0.004194322973489761,
"learning_rate": 5.067070643131055e-06,
"loss": 0.0001,
"num_input_tokens_seen": 59910000,
"step": 290
},
{
"epoch": 2.573634859349145,
"grad_norm": 0.006987538188695908,
"learning_rate": 4.865749934369223e-06,
"loss": 0.0002,
"num_input_tokens_seen": 60116400,
"step": 291
},
{
"epoch": 2.5824600110314395,
"grad_norm": 0.003778768004849553,
"learning_rate": 4.668305401459022e-06,
"loss": 0.0002,
"num_input_tokens_seen": 60320368,
"step": 292
},
{
"epoch": 2.591285162713734,
"grad_norm": 0.003472360782325268,
"learning_rate": 4.474754001140191e-06,
"loss": 0.0001,
"num_input_tokens_seen": 60536528,
"step": 293
},
{
"epoch": 2.6001103143960287,
"grad_norm": 0.009052475914359093,
"learning_rate": 4.285112355806192e-06,
"loss": 0.001,
"num_input_tokens_seen": 60743120,
"step": 294
},
{
"epoch": 2.6089354660783233,
"grad_norm": 0.0060082292184233665,
"learning_rate": 4.099396752076745e-06,
"loss": 0.0001,
"num_input_tokens_seen": 60942704,
"step": 295
},
{
"epoch": 2.617760617760618,
"grad_norm": 0.0075798071920871735,
"learning_rate": 3.917623139399018e-06,
"loss": 0.0001,
"num_input_tokens_seen": 61140128,
"step": 296
},
{
"epoch": 2.6265857694429124,
"grad_norm": 0.0055752964690327644,
"learning_rate": 3.7398071286779857e-06,
"loss": 0.0001,
"num_input_tokens_seen": 61334224,
"step": 297
},
{
"epoch": 2.6354109211252066,
"grad_norm": 0.007863204926252365,
"learning_rate": 3.5659639909356723e-06,
"loss": 0.0001,
"num_input_tokens_seen": 61543280,
"step": 298
},
{
"epoch": 2.6442360728075016,
"grad_norm": 0.006538075394928455,
"learning_rate": 3.3961086559996803e-06,
"loss": 0.0002,
"num_input_tokens_seen": 61750720,
"step": 299
},
{
"epoch": 2.6530612244897958,
"grad_norm": 0.002779777627438307,
"learning_rate": 3.230255711220992e-06,
"loss": 0.0,
"num_input_tokens_seen": 61945952,
"step": 300
},
{
"epoch": 2.6618863761720903,
"grad_norm": 0.004271807614713907,
"learning_rate": 3.0684194002212287e-06,
"loss": 0.0001,
"num_input_tokens_seen": 62155632,
"step": 301
},
{
"epoch": 2.670711527854385,
"grad_norm": 0.00638817623257637,
"learning_rate": 2.910613621669356e-06,
"loss": 0.0001,
"num_input_tokens_seen": 62353216,
"step": 302
},
{
"epoch": 2.6795366795366795,
"grad_norm": 0.00442032516002655,
"learning_rate": 2.7568519280880558e-06,
"loss": 0.0001,
"num_input_tokens_seen": 62544128,
"step": 303
},
{
"epoch": 2.688361831218974,
"grad_norm": 0.008686737157404423,
"learning_rate": 2.607147524689829e-06,
"loss": 0.0004,
"num_input_tokens_seen": 62752688,
"step": 304
},
{
"epoch": 2.6971869829012687,
"grad_norm": 0.0059651597402989864,
"learning_rate": 2.4615132682429374e-06,
"loss": 0.0001,
"num_input_tokens_seen": 62963296,
"step": 305
},
{
"epoch": 2.7060121345835633,
"grad_norm": 0.0056177834048867226,
"learning_rate": 2.3199616659672354e-06,
"loss": 0.0002,
"num_input_tokens_seen": 63161904,
"step": 306
},
{
"epoch": 2.714837286265858,
"grad_norm": 0.0029979923274368048,
"learning_rate": 2.182504874460006e-06,
"loss": 0.0001,
"num_input_tokens_seen": 63365744,
"step": 307
},
{
"epoch": 2.7236624379481524,
"grad_norm": 0.004314000252634287,
"learning_rate": 2.049154698651989e-06,
"loss": 0.0001,
"num_input_tokens_seen": 63571808,
"step": 308
},
{
"epoch": 2.7324875896304466,
"grad_norm": 0.006837273947894573,
"learning_rate": 1.919922590793549e-06,
"loss": 0.0001,
"num_input_tokens_seen": 63768960,
"step": 309
},
{
"epoch": 2.741312741312741,
"grad_norm": 0.0037646403070539236,
"learning_rate": 1.7948196494711188e-06,
"loss": 0.0001,
"num_input_tokens_seen": 63979648,
"step": 310
},
{
"epoch": 2.7501378929950357,
"grad_norm": 0.0031723175197839737,
"learning_rate": 1.6738566186540627e-06,
"loss": 0.0001,
"num_input_tokens_seen": 64189712,
"step": 311
},
{
"epoch": 2.7589630446773303,
"grad_norm": 0.005477920174598694,
"learning_rate": 1.5570438867719694e-06,
"loss": 0.0001,
"num_input_tokens_seen": 64400624,
"step": 312
},
{
"epoch": 2.767788196359625,
"grad_norm": 0.006315939594060183,
"learning_rate": 1.4443914858224938e-06,
"loss": 0.0001,
"num_input_tokens_seen": 64626320,
"step": 313
},
{
"epoch": 2.7766133480419195,
"grad_norm": 0.004753002431243658,
"learning_rate": 1.3359090905097848e-06,
"loss": 0.0001,
"num_input_tokens_seen": 64826480,
"step": 314
},
{
"epoch": 2.785438499724214,
"grad_norm": 0.00812880601733923,
"learning_rate": 1.2316060174136002e-06,
"loss": 0.0003,
"num_input_tokens_seen": 65031984,
"step": 315
},
{
"epoch": 2.7942636514065087,
"grad_norm": 0.0029212606605142355,
"learning_rate": 1.1314912241892183e-06,
"loss": 0.0001,
"num_input_tokens_seen": 65239456,
"step": 316
},
{
"epoch": 2.8030888030888033,
"grad_norm": 0.006850802339613438,
"learning_rate": 1.0355733087981378e-06,
"loss": 0.0002,
"num_input_tokens_seen": 65433888,
"step": 317
},
{
"epoch": 2.8119139547710974,
"grad_norm": 0.0020711093675345182,
"learning_rate": 9.43860508769645e-07,
"loss": 0.0001,
"num_input_tokens_seen": 65638288,
"step": 318
},
{
"epoch": 2.8207391064533924,
"grad_norm": 0.004868640564382076,
"learning_rate": 8.563607004934193e-07,
"loss": 0.0002,
"num_input_tokens_seen": 65855952,
"step": 319
},
{
"epoch": 2.8295642581356866,
"grad_norm": 0.006297328509390354,
"learning_rate": 7.730813985430407e-07,
"loss": 0.0002,
"num_input_tokens_seen": 66070192,
"step": 320
},
{
"epoch": 2.838389409817981,
"grad_norm": 0.0036759376525878906,
"learning_rate": 6.940297550306896e-07,
"loss": 0.0001,
"num_input_tokens_seen": 66283808,
"step": 321
},
{
"epoch": 2.8472145615002757,
"grad_norm": 0.0120092136785388,
"learning_rate": 6.192125589928821e-07,
"loss": 0.0002,
"num_input_tokens_seen": 66507776,
"step": 322
},
{
"epoch": 2.8560397131825703,
"grad_norm": 0.005414010491222143,
"learning_rate": 5.486362358074094e-07,
"loss": 0.0002,
"num_input_tokens_seen": 66708320,
"step": 323
},
{
"epoch": 2.864864864864865,
"grad_norm": 0.007992051541805267,
"learning_rate": 4.823068466415615e-07,
"loss": 0.0001,
"num_input_tokens_seen": 66910032,
"step": 324
},
{
"epoch": 2.8736900165471595,
"grad_norm": 0.006493248511105776,
"learning_rate": 4.202300879315446e-07,
"loss": 0.0001,
"num_input_tokens_seen": 67112784,
"step": 325
},
{
"epoch": 2.882515168229454,
"grad_norm": 0.004381334874778986,
"learning_rate": 3.624112908932942e-07,
"loss": 0.0001,
"num_input_tokens_seen": 67306464,
"step": 326
},
{
"epoch": 2.8913403199117482,
"grad_norm": 0.00577085604891181,
"learning_rate": 3.088554210646133e-07,
"loss": 0.0001,
"num_input_tokens_seen": 67504720,
"step": 327
},
{
"epoch": 2.9001654715940433,
"grad_norm": 0.003793071024119854,
"learning_rate": 2.595670778787196e-07,
"loss": 0.0001,
"num_input_tokens_seen": 67694048,
"step": 328
},
{
"epoch": 2.9089906232763374,
"grad_norm": 0.00835067592561245,
"learning_rate": 2.1455049426926666e-07,
"loss": 0.0002,
"num_input_tokens_seen": 67895008,
"step": 329
},
{
"epoch": 2.917815774958632,
"grad_norm": 0.005372443702071905,
"learning_rate": 1.7380953630678488e-07,
"loss": 0.0001,
"num_input_tokens_seen": 68093168,
"step": 330
},
{
"epoch": 2.9266409266409266,
"grad_norm": 0.010219305753707886,
"learning_rate": 1.373477028666803e-07,
"loss": 0.0002,
"num_input_tokens_seen": 68305568,
"step": 331
},
{
"epoch": 2.935466078323221,
"grad_norm": 0.0038206197787076235,
"learning_rate": 1.0516812532873621e-07,
"loss": 0.0001,
"num_input_tokens_seen": 68506384,
"step": 332
},
{
"epoch": 2.9442912300055157,
"grad_norm": 0.007432411424815655,
"learning_rate": 7.727356730820035e-08,
"loss": 0.0002,
"num_input_tokens_seen": 68716160,
"step": 333
},
{
"epoch": 2.9531163816878103,
"grad_norm": 0.004036502446979284,
"learning_rate": 5.3666424418413744e-08,
"loss": 0.0001,
"num_input_tokens_seen": 68918048,
"step": 334
},
{
"epoch": 2.961941533370105,
"grad_norm": 0.0045955548994243145,
"learning_rate": 3.4348724065119685e-08,
"loss": 0.0001,
"num_input_tokens_seen": 69129152,
"step": 335
},
{
"epoch": 2.9707666850523995,
"grad_norm": 0.012164157815277576,
"learning_rate": 1.9322125272297488e-08,
"loss": 0.0003,
"num_input_tokens_seen": 69328576,
"step": 336
},
{
"epoch": 2.979591836734694,
"grad_norm": 0.0029640356078743935,
"learning_rate": 8.587918539726402e-09,
"loss": 0.0001,
"num_input_tokens_seen": 69537232,
"step": 337
},
{
"epoch": 2.988416988416988,
"grad_norm": 0.005239939782768488,
"learning_rate": 2.1470257321298813e-09,
"loss": 0.0001,
"num_input_tokens_seen": 69761008,
"step": 338
},
{
"epoch": 2.997242140099283,
"grad_norm": 0.0060053626075387,
"learning_rate": 0.0,
"loss": 0.0001,
"num_input_tokens_seen": 69953200,
"step": 339
},
{
"epoch": 2.997242140099283,
"num_input_tokens_seen": 69953200,
"step": 339,
"total_flos": 2.976146663409713e+18,
"train_loss": 0.004280612113766934,
"train_runtime": 8852.4475,
"train_samples_per_second": 4.914,
"train_steps_per_second": 0.038
}
],
"logging_steps": 1,
"max_steps": 339,
"num_input_tokens_seen": 69953200,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.976146663409713e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}