{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.5197568389057752, "eval_steps": 500, "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004342162396873643, "grad_norm": 0.7083401679992676, "learning_rate": 4e-05, "loss": 2.137, "step": 1 }, { "epoch": 0.0008684324793747286, "grad_norm": 0.699398934841156, "learning_rate": 8e-05, "loss": 2.1093, "step": 2 }, { "epoch": 0.0013026487190620929, "grad_norm": 0.6803368926048279, "learning_rate": 0.00012, "loss": 2.0977, "step": 3 }, { "epoch": 0.0017368649587494573, "grad_norm": 0.6662035584449768, "learning_rate": 0.00016, "loss": 2.0676, "step": 4 }, { "epoch": 0.0021710811984368217, "grad_norm": 0.31832820177078247, "learning_rate": 0.0002, "loss": 1.9584, "step": 5 }, { "epoch": 0.0026052974381241857, "grad_norm": 0.21583382785320282, "learning_rate": 0.0001999710312862109, "loss": 1.8854, "step": 6 }, { "epoch": 0.00303951367781155, "grad_norm": 0.23672160506248474, "learning_rate": 0.0001999420625724218, "loss": 1.8224, "step": 7 }, { "epoch": 0.0034737299174989146, "grad_norm": 0.2646632194519043, "learning_rate": 0.0001999130938586327, "loss": 1.7351, "step": 8 }, { "epoch": 0.0039079461571862786, "grad_norm": 0.3025886118412018, "learning_rate": 0.00019988412514484357, "loss": 1.6451, "step": 9 }, { "epoch": 0.004342162396873643, "grad_norm": 0.33446529507637024, "learning_rate": 0.00019985515643105447, "loss": 1.5329, "step": 10 }, { "epoch": 0.004776378636561007, "grad_norm": 0.35481929779052734, "learning_rate": 0.00019982618771726535, "loss": 1.4224, "step": 11 }, { "epoch": 0.005210594876248371, "grad_norm": 0.3858325183391571, "learning_rate": 0.00019979721900347625, "loss": 1.2945, "step": 12 }, { "epoch": 0.005644811115935736, "grad_norm": 0.41268807649612427, "learning_rate": 0.00019976825028968716, "loss": 1.1373, "step": 13 }, { "epoch": 0.0060790273556231, "grad_norm": 0.43161213397979736, "learning_rate": 0.00019973928157589803, "loss": 0.978, "step": 14 }, { "epoch": 0.006513243595310464, "grad_norm": 0.4535561203956604, "learning_rate": 0.00019971031286210894, "loss": 0.8297, "step": 15 }, { "epoch": 0.006947459834997829, "grad_norm": 0.45605772733688354, "learning_rate": 0.00019968134414831981, "loss": 0.6684, "step": 16 }, { "epoch": 0.007381676074685193, "grad_norm": 0.5142773389816284, "learning_rate": 0.00019965237543453072, "loss": 0.5302, "step": 17 }, { "epoch": 0.007815892314372557, "grad_norm": 0.6368698477745056, "learning_rate": 0.0001996234067207416, "loss": 0.3915, "step": 18 }, { "epoch": 0.008250108554059922, "grad_norm": 0.5368231534957886, "learning_rate": 0.0001995944380069525, "loss": 0.2547, "step": 19 }, { "epoch": 0.008684324793747287, "grad_norm": 0.5204238295555115, "learning_rate": 0.0001995654692931634, "loss": 0.2155, "step": 20 }, { "epoch": 0.00911854103343465, "grad_norm": 0.3791578710079193, "learning_rate": 0.00019953650057937428, "loss": 0.1348, "step": 21 }, { "epoch": 0.009552757273122015, "grad_norm": 0.26642370223999023, "learning_rate": 0.00019950753186558518, "loss": 0.0953, "step": 22 }, { "epoch": 0.00998697351280938, "grad_norm": 0.2439703792333603, "learning_rate": 0.00019947856315179606, "loss": 0.0991, "step": 23 }, { "epoch": 0.010421189752496743, "grad_norm": 0.1552814394235611, "learning_rate": 0.00019944959443800696, "loss": 0.0834, "step": 24 }, { "epoch": 0.010855405992184108, "grad_norm": 0.17741930484771729, "learning_rate": 0.00019942062572421784, "loss": 0.0764, "step": 25 }, { "epoch": 0.011289622231871473, "grad_norm": 0.15319164097309113, "learning_rate": 0.00019939165701042874, "loss": 0.0948, "step": 26 }, { "epoch": 0.011723838471558836, "grad_norm": 0.10254763066768646, "learning_rate": 0.00019936268829663965, "loss": 0.0621, "step": 27 }, { "epoch": 0.0121580547112462, "grad_norm": 0.13184596598148346, "learning_rate": 0.00019933371958285052, "loss": 0.0694, "step": 28 }, { "epoch": 0.012592270950933565, "grad_norm": 0.11803077906370163, "learning_rate": 0.00019930475086906143, "loss": 0.0732, "step": 29 }, { "epoch": 0.013026487190620929, "grad_norm": 0.11552850157022476, "learning_rate": 0.0001992757821552723, "loss": 0.0865, "step": 30 }, { "epoch": 0.013460703430308293, "grad_norm": 0.08317773044109344, "learning_rate": 0.0001992468134414832, "loss": 0.0745, "step": 31 }, { "epoch": 0.013894919669995658, "grad_norm": 0.09790172427892685, "learning_rate": 0.00019921784472769408, "loss": 0.0793, "step": 32 }, { "epoch": 0.014329135909683021, "grad_norm": 0.06966142356395721, "learning_rate": 0.000199188876013905, "loss": 0.0705, "step": 33 }, { "epoch": 0.014763352149370386, "grad_norm": 0.06615456938743591, "learning_rate": 0.0001991599073001159, "loss": 0.0726, "step": 34 }, { "epoch": 0.015197568389057751, "grad_norm": 0.06885384023189545, "learning_rate": 0.0001991309385863268, "loss": 0.072, "step": 35 }, { "epoch": 0.015631784628745114, "grad_norm": 0.07089430838823318, "learning_rate": 0.00019910196987253767, "loss": 0.0662, "step": 36 }, { "epoch": 0.01606600086843248, "grad_norm": 0.07172323763370514, "learning_rate": 0.00019907300115874855, "loss": 0.0647, "step": 37 }, { "epoch": 0.016500217108119844, "grad_norm": 0.06592019647359848, "learning_rate": 0.00019904403244495945, "loss": 0.0655, "step": 38 }, { "epoch": 0.01693443334780721, "grad_norm": 0.05621284991502762, "learning_rate": 0.00019901506373117033, "loss": 0.0647, "step": 39 }, { "epoch": 0.017368649587494574, "grad_norm": 0.07261176407337189, "learning_rate": 0.00019898609501738123, "loss": 0.0878, "step": 40 }, { "epoch": 0.017802865827181935, "grad_norm": 0.05676492676138878, "learning_rate": 0.00019895712630359214, "loss": 0.0675, "step": 41 }, { "epoch": 0.0182370820668693, "grad_norm": 0.07016979157924652, "learning_rate": 0.00019892815758980304, "loss": 0.0707, "step": 42 }, { "epoch": 0.018671298306556665, "grad_norm": 0.061570633202791214, "learning_rate": 0.00019889918887601392, "loss": 0.0685, "step": 43 }, { "epoch": 0.01910551454624403, "grad_norm": 0.05502679571509361, "learning_rate": 0.0001988702201622248, "loss": 0.0551, "step": 44 }, { "epoch": 0.019539730785931395, "grad_norm": 0.05480094999074936, "learning_rate": 0.0001988412514484357, "loss": 0.0544, "step": 45 }, { "epoch": 0.01997394702561876, "grad_norm": 0.0445089265704155, "learning_rate": 0.00019881228273464657, "loss": 0.0546, "step": 46 }, { "epoch": 0.02040816326530612, "grad_norm": 0.04600195586681366, "learning_rate": 0.00019878331402085748, "loss": 0.0477, "step": 47 }, { "epoch": 0.020842379504993486, "grad_norm": 0.07088567316532135, "learning_rate": 0.00019875434530706838, "loss": 0.0807, "step": 48 }, { "epoch": 0.02127659574468085, "grad_norm": 0.07396797090768814, "learning_rate": 0.00019872537659327929, "loss": 0.0771, "step": 49 }, { "epoch": 0.021710811984368215, "grad_norm": 0.064840167760849, "learning_rate": 0.00019869640787949016, "loss": 0.0611, "step": 50 }, { "epoch": 0.02214502822405558, "grad_norm": 0.05322154611349106, "learning_rate": 0.00019866743916570107, "loss": 0.0525, "step": 51 }, { "epoch": 0.022579244463742945, "grad_norm": 0.07603923231363297, "learning_rate": 0.00019863847045191194, "loss": 0.0867, "step": 52 }, { "epoch": 0.02301346070343031, "grad_norm": 0.0672057643532753, "learning_rate": 0.00019860950173812282, "loss": 0.0656, "step": 53 }, { "epoch": 0.02344767694311767, "grad_norm": 0.04825486242771149, "learning_rate": 0.00019858053302433372, "loss": 0.0465, "step": 54 }, { "epoch": 0.023881893182805036, "grad_norm": 0.055328965187072754, "learning_rate": 0.00019855156431054463, "loss": 0.0527, "step": 55 }, { "epoch": 0.0243161094224924, "grad_norm": 0.04741791635751724, "learning_rate": 0.00019852259559675553, "loss": 0.0596, "step": 56 }, { "epoch": 0.024750325662179766, "grad_norm": 0.05637412518262863, "learning_rate": 0.0001984936268829664, "loss": 0.0531, "step": 57 }, { "epoch": 0.02518454190186713, "grad_norm": 0.05381873622536659, "learning_rate": 0.0001984646581691773, "loss": 0.0759, "step": 58 }, { "epoch": 0.025618758141554496, "grad_norm": 0.05130472779273987, "learning_rate": 0.0001984356894553882, "loss": 0.0514, "step": 59 }, { "epoch": 0.026052974381241857, "grad_norm": 0.051273003220558167, "learning_rate": 0.00019840672074159906, "loss": 0.0607, "step": 60 }, { "epoch": 0.026487190620929222, "grad_norm": 0.052414774894714355, "learning_rate": 0.00019837775202780997, "loss": 0.0563, "step": 61 }, { "epoch": 0.026921406860616587, "grad_norm": 0.04929771646857262, "learning_rate": 0.00019834878331402087, "loss": 0.0604, "step": 62 }, { "epoch": 0.02735562310030395, "grad_norm": 0.05067106708884239, "learning_rate": 0.00019831981460023177, "loss": 0.0452, "step": 63 }, { "epoch": 0.027789839339991317, "grad_norm": 0.03915831819176674, "learning_rate": 0.00019829084588644265, "loss": 0.0515, "step": 64 }, { "epoch": 0.02822405557967868, "grad_norm": 0.06256204098463058, "learning_rate": 0.00019826187717265356, "loss": 0.0738, "step": 65 }, { "epoch": 0.028658271819366043, "grad_norm": 0.03837879002094269, "learning_rate": 0.00019823290845886443, "loss": 0.0433, "step": 66 }, { "epoch": 0.029092488059053408, "grad_norm": 0.050799887627363205, "learning_rate": 0.0001982039397450753, "loss": 0.0602, "step": 67 }, { "epoch": 0.029526704298740773, "grad_norm": 0.05626372992992401, "learning_rate": 0.0001981749710312862, "loss": 0.0663, "step": 68 }, { "epoch": 0.029960920538428137, "grad_norm": 0.05932501703500748, "learning_rate": 0.00019814600231749712, "loss": 0.0695, "step": 69 }, { "epoch": 0.030395136778115502, "grad_norm": 0.046737752854824066, "learning_rate": 0.00019811703360370802, "loss": 0.0607, "step": 70 }, { "epoch": 0.030829353017802867, "grad_norm": 0.05745745822787285, "learning_rate": 0.0001980880648899189, "loss": 0.0585, "step": 71 }, { "epoch": 0.03126356925749023, "grad_norm": 0.04742376133799553, "learning_rate": 0.0001980590961761298, "loss": 0.0403, "step": 72 }, { "epoch": 0.0316977854971776, "grad_norm": 0.06668157875537872, "learning_rate": 0.00019803012746234068, "loss": 0.0963, "step": 73 }, { "epoch": 0.03213200173686496, "grad_norm": 0.06483882665634155, "learning_rate": 0.00019800115874855155, "loss": 0.0752, "step": 74 }, { "epoch": 0.03256621797655232, "grad_norm": 0.04573976621031761, "learning_rate": 0.00019797219003476246, "loss": 0.0607, "step": 75 }, { "epoch": 0.03300043421623969, "grad_norm": 0.050016701221466064, "learning_rate": 0.00019794322132097336, "loss": 0.052, "step": 76 }, { "epoch": 0.03343465045592705, "grad_norm": 0.06422847509384155, "learning_rate": 0.00019791425260718426, "loss": 0.0662, "step": 77 }, { "epoch": 0.03386886669561442, "grad_norm": 0.04904785379767418, "learning_rate": 0.00019788528389339514, "loss": 0.061, "step": 78 }, { "epoch": 0.03430308293530178, "grad_norm": 0.04826809838414192, "learning_rate": 0.00019785631517960604, "loss": 0.0629, "step": 79 }, { "epoch": 0.03473729917498915, "grad_norm": 0.051802679896354675, "learning_rate": 0.00019782734646581692, "loss": 0.0539, "step": 80 }, { "epoch": 0.03517151541467651, "grad_norm": 0.04649043083190918, "learning_rate": 0.00019779837775202782, "loss": 0.0522, "step": 81 }, { "epoch": 0.03560573165436387, "grad_norm": 0.04552188515663147, "learning_rate": 0.0001977694090382387, "loss": 0.0677, "step": 82 }, { "epoch": 0.03603994789405124, "grad_norm": 0.044875796884298325, "learning_rate": 0.0001977404403244496, "loss": 0.0561, "step": 83 }, { "epoch": 0.0364741641337386, "grad_norm": 0.0387980155646801, "learning_rate": 0.0001977114716106605, "loss": 0.0407, "step": 84 }, { "epoch": 0.03690838037342597, "grad_norm": 0.04789264127612114, "learning_rate": 0.00019768250289687139, "loss": 0.06, "step": 85 }, { "epoch": 0.03734259661311333, "grad_norm": 0.04808861389756203, "learning_rate": 0.0001976535341830823, "loss": 0.0708, "step": 86 }, { "epoch": 0.0377768128528007, "grad_norm": 0.04716057330369949, "learning_rate": 0.00019762456546929317, "loss": 0.0534, "step": 87 }, { "epoch": 0.03821102909248806, "grad_norm": 0.046040259301662445, "learning_rate": 0.00019759559675550407, "loss": 0.0536, "step": 88 }, { "epoch": 0.03864524533217542, "grad_norm": 0.05645117536187172, "learning_rate": 0.00019756662804171495, "loss": 0.06, "step": 89 }, { "epoch": 0.03907946157186279, "grad_norm": 0.041596803814172745, "learning_rate": 0.00019753765932792585, "loss": 0.0476, "step": 90 }, { "epoch": 0.03951367781155015, "grad_norm": 0.03925769031047821, "learning_rate": 0.00019750869061413675, "loss": 0.0475, "step": 91 }, { "epoch": 0.03994789405123752, "grad_norm": 0.04416707158088684, "learning_rate": 0.00019747972190034763, "loss": 0.0523, "step": 92 }, { "epoch": 0.04038211029092488, "grad_norm": 0.04556754231452942, "learning_rate": 0.00019745075318655853, "loss": 0.0504, "step": 93 }, { "epoch": 0.04081632653061224, "grad_norm": 0.040624238550662994, "learning_rate": 0.0001974217844727694, "loss": 0.0636, "step": 94 }, { "epoch": 0.04125054277029961, "grad_norm": 0.04595480114221573, "learning_rate": 0.00019739281575898031, "loss": 0.0501, "step": 95 }, { "epoch": 0.04168475900998697, "grad_norm": 0.04735013470053673, "learning_rate": 0.0001973638470451912, "loss": 0.0658, "step": 96 }, { "epoch": 0.04211897524967434, "grad_norm": 0.05094614252448082, "learning_rate": 0.0001973348783314021, "loss": 0.0613, "step": 97 }, { "epoch": 0.0425531914893617, "grad_norm": 0.05568967014551163, "learning_rate": 0.000197305909617613, "loss": 0.061, "step": 98 }, { "epoch": 0.04298740772904907, "grad_norm": 0.05080579221248627, "learning_rate": 0.00019727694090382387, "loss": 0.064, "step": 99 }, { "epoch": 0.04342162396873643, "grad_norm": 0.03608544543385506, "learning_rate": 0.00019724797219003478, "loss": 0.0482, "step": 100 }, { "epoch": 0.04385584020842379, "grad_norm": 0.044908370822668076, "learning_rate": 0.00019721900347624566, "loss": 0.0735, "step": 101 }, { "epoch": 0.04429005644811116, "grad_norm": 0.04627292975783348, "learning_rate": 0.00019719003476245656, "loss": 0.0637, "step": 102 }, { "epoch": 0.04472427268779852, "grad_norm": 0.04019039124250412, "learning_rate": 0.00019716106604866744, "loss": 0.0521, "step": 103 }, { "epoch": 0.04515848892748589, "grad_norm": 0.04855141043663025, "learning_rate": 0.00019713209733487834, "loss": 0.0629, "step": 104 }, { "epoch": 0.04559270516717325, "grad_norm": 0.049006566405296326, "learning_rate": 0.00019710312862108924, "loss": 0.0592, "step": 105 }, { "epoch": 0.04602692140686062, "grad_norm": 0.050240881741046906, "learning_rate": 0.00019707415990730012, "loss": 0.0629, "step": 106 }, { "epoch": 0.04646113764654798, "grad_norm": 0.04127007722854614, "learning_rate": 0.00019704519119351102, "loss": 0.0512, "step": 107 }, { "epoch": 0.04689535388623534, "grad_norm": 0.04567702114582062, "learning_rate": 0.0001970162224797219, "loss": 0.0669, "step": 108 }, { "epoch": 0.04732957012592271, "grad_norm": 0.04622099921107292, "learning_rate": 0.0001969872537659328, "loss": 0.0584, "step": 109 }, { "epoch": 0.04776378636561007, "grad_norm": 0.0505375936627388, "learning_rate": 0.00019695828505214368, "loss": 0.0481, "step": 110 }, { "epoch": 0.04819800260529744, "grad_norm": 0.04182496666908264, "learning_rate": 0.00019692931633835458, "loss": 0.0466, "step": 111 }, { "epoch": 0.0486322188449848, "grad_norm": 0.04947333410382271, "learning_rate": 0.0001969003476245655, "loss": 0.0611, "step": 112 }, { "epoch": 0.049066435084672164, "grad_norm": 0.04917158931493759, "learning_rate": 0.00019687137891077636, "loss": 0.0636, "step": 113 }, { "epoch": 0.04950065132435953, "grad_norm": 0.06067786365747452, "learning_rate": 0.00019684241019698727, "loss": 0.0748, "step": 114 }, { "epoch": 0.04993486756404689, "grad_norm": 0.05078978091478348, "learning_rate": 0.00019681344148319814, "loss": 0.058, "step": 115 }, { "epoch": 0.05036908380373426, "grad_norm": 0.05092448741197586, "learning_rate": 0.00019678447276940905, "loss": 0.0592, "step": 116 }, { "epoch": 0.05080330004342162, "grad_norm": 0.039484139531850815, "learning_rate": 0.00019675550405561993, "loss": 0.046, "step": 117 }, { "epoch": 0.05123751628310899, "grad_norm": 0.05611754581332207, "learning_rate": 0.00019672653534183083, "loss": 0.0476, "step": 118 }, { "epoch": 0.05167173252279635, "grad_norm": 0.05244527384638786, "learning_rate": 0.00019669756662804173, "loss": 0.0485, "step": 119 }, { "epoch": 0.052105948762483714, "grad_norm": 0.06425631046295166, "learning_rate": 0.0001966685979142526, "loss": 0.0802, "step": 120 }, { "epoch": 0.05254016500217108, "grad_norm": 0.05750326067209244, "learning_rate": 0.0001966396292004635, "loss": 0.0634, "step": 121 }, { "epoch": 0.052974381241858444, "grad_norm": 0.05549105256795883, "learning_rate": 0.0001966106604866744, "loss": 0.0717, "step": 122 }, { "epoch": 0.05340859748154581, "grad_norm": 0.05031603202223778, "learning_rate": 0.0001965816917728853, "loss": 0.0507, "step": 123 }, { "epoch": 0.053842813721233174, "grad_norm": 0.03995596617460251, "learning_rate": 0.00019655272305909617, "loss": 0.0399, "step": 124 }, { "epoch": 0.054277029960920535, "grad_norm": 0.058115407824516296, "learning_rate": 0.00019652375434530707, "loss": 0.0597, "step": 125 }, { "epoch": 0.0547112462006079, "grad_norm": 0.05351436510682106, "learning_rate": 0.00019649478563151798, "loss": 0.057, "step": 126 }, { "epoch": 0.055145462440295265, "grad_norm": 0.04320720210671425, "learning_rate": 0.00019646581691772888, "loss": 0.0552, "step": 127 }, { "epoch": 0.05557967867998263, "grad_norm": 0.050133172422647476, "learning_rate": 0.00019643684820393976, "loss": 0.0462, "step": 128 }, { "epoch": 0.056013894919669995, "grad_norm": 0.036959223449230194, "learning_rate": 0.00019640787949015066, "loss": 0.0416, "step": 129 }, { "epoch": 0.05644811115935736, "grad_norm": 0.03414500877261162, "learning_rate": 0.00019637891077636154, "loss": 0.0466, "step": 130 }, { "epoch": 0.056882327399044724, "grad_norm": 0.054074011743068695, "learning_rate": 0.00019634994206257241, "loss": 0.0778, "step": 131 }, { "epoch": 0.057316543638732086, "grad_norm": 0.05731811746954918, "learning_rate": 0.00019632097334878332, "loss": 0.0626, "step": 132 }, { "epoch": 0.057750759878419454, "grad_norm": 0.039866212755441666, "learning_rate": 0.00019629200463499422, "loss": 0.0466, "step": 133 }, { "epoch": 0.058184976118106815, "grad_norm": 0.059963490813970566, "learning_rate": 0.00019626303592120513, "loss": 0.0538, "step": 134 }, { "epoch": 0.058619192357794184, "grad_norm": 0.04319881275296211, "learning_rate": 0.000196234067207416, "loss": 0.0524, "step": 135 }, { "epoch": 0.059053408597481545, "grad_norm": 0.05221778526902199, "learning_rate": 0.0001962050984936269, "loss": 0.0747, "step": 136 }, { "epoch": 0.05948762483716891, "grad_norm": 0.046094805002212524, "learning_rate": 0.00019617612977983778, "loss": 0.0577, "step": 137 }, { "epoch": 0.059921841076856275, "grad_norm": 0.06755319237709045, "learning_rate": 0.00019614716106604866, "loss": 0.1037, "step": 138 }, { "epoch": 0.060356057316543636, "grad_norm": 0.04487508535385132, "learning_rate": 0.00019611819235225956, "loss": 0.0697, "step": 139 }, { "epoch": 0.060790273556231005, "grad_norm": 0.07269088923931122, "learning_rate": 0.00019608922363847047, "loss": 0.0909, "step": 140 }, { "epoch": 0.061224489795918366, "grad_norm": 0.04226171597838402, "learning_rate": 0.00019606025492468137, "loss": 0.0477, "step": 141 }, { "epoch": 0.061658706035605734, "grad_norm": 0.041675273329019547, "learning_rate": 0.00019603128621089225, "loss": 0.0493, "step": 142 }, { "epoch": 0.062092922275293096, "grad_norm": 0.039942920207977295, "learning_rate": 0.00019600231749710315, "loss": 0.054, "step": 143 }, { "epoch": 0.06252713851498046, "grad_norm": 0.04449339956045151, "learning_rate": 0.00019597334878331403, "loss": 0.043, "step": 144 }, { "epoch": 0.06296135475466783, "grad_norm": 0.045853856950998306, "learning_rate": 0.0001959443800695249, "loss": 0.0622, "step": 145 }, { "epoch": 0.0633955709943552, "grad_norm": 0.04212164878845215, "learning_rate": 0.0001959154113557358, "loss": 0.0507, "step": 146 }, { "epoch": 0.06382978723404255, "grad_norm": 0.07130462676286697, "learning_rate": 0.0001958864426419467, "loss": 0.0973, "step": 147 }, { "epoch": 0.06426400347372992, "grad_norm": 0.04055679216980934, "learning_rate": 0.00019585747392815762, "loss": 0.0474, "step": 148 }, { "epoch": 0.06469821971341728, "grad_norm": 0.04265017434954643, "learning_rate": 0.0001958285052143685, "loss": 0.0521, "step": 149 }, { "epoch": 0.06513243595310464, "grad_norm": 0.0457426980137825, "learning_rate": 0.0001957995365005794, "loss": 0.0534, "step": 150 }, { "epoch": 0.06556665219279201, "grad_norm": 0.05117351561784744, "learning_rate": 0.00019577056778679027, "loss": 0.062, "step": 151 }, { "epoch": 0.06600086843247938, "grad_norm": 0.048427000641822815, "learning_rate": 0.00019574159907300115, "loss": 0.0571, "step": 152 }, { "epoch": 0.06643508467216674, "grad_norm": 0.06607496738433838, "learning_rate": 0.00019571263035921205, "loss": 0.0886, "step": 153 }, { "epoch": 0.0668693009118541, "grad_norm": 0.04602469876408577, "learning_rate": 0.00019568366164542296, "loss": 0.0483, "step": 154 }, { "epoch": 0.06730351715154147, "grad_norm": 0.044537901878356934, "learning_rate": 0.00019565469293163386, "loss": 0.0427, "step": 155 }, { "epoch": 0.06773773339122884, "grad_norm": 0.04190573841333389, "learning_rate": 0.00019562572421784474, "loss": 0.0393, "step": 156 }, { "epoch": 0.06817194963091619, "grad_norm": 0.048481203615665436, "learning_rate": 0.00019559675550405564, "loss": 0.0541, "step": 157 }, { "epoch": 0.06860616587060356, "grad_norm": 0.045507464557886124, "learning_rate": 0.00019556778679026652, "loss": 0.0507, "step": 158 }, { "epoch": 0.06904038211029093, "grad_norm": 0.050230398774147034, "learning_rate": 0.0001955388180764774, "loss": 0.0565, "step": 159 }, { "epoch": 0.0694745983499783, "grad_norm": 0.05014928802847862, "learning_rate": 0.0001955098493626883, "loss": 0.0682, "step": 160 }, { "epoch": 0.06990881458966565, "grad_norm": 0.04155123978853226, "learning_rate": 0.0001954808806488992, "loss": 0.0513, "step": 161 }, { "epoch": 0.07034303082935302, "grad_norm": 0.04276610538363457, "learning_rate": 0.0001954519119351101, "loss": 0.0565, "step": 162 }, { "epoch": 0.07077724706904039, "grad_norm": 0.03481678292155266, "learning_rate": 0.00019542294322132098, "loss": 0.0478, "step": 163 }, { "epoch": 0.07121146330872774, "grad_norm": 0.052877917885780334, "learning_rate": 0.00019539397450753188, "loss": 0.0521, "step": 164 }, { "epoch": 0.07164567954841511, "grad_norm": 0.04270905256271362, "learning_rate": 0.00019536500579374276, "loss": 0.0669, "step": 165 }, { "epoch": 0.07207989578810248, "grad_norm": 0.0436183325946331, "learning_rate": 0.00019533603707995364, "loss": 0.0543, "step": 166 }, { "epoch": 0.07251411202778985, "grad_norm": 0.04904114827513695, "learning_rate": 0.00019530706836616454, "loss": 0.0511, "step": 167 }, { "epoch": 0.0729483282674772, "grad_norm": 0.04059867188334465, "learning_rate": 0.00019527809965237545, "loss": 0.0449, "step": 168 }, { "epoch": 0.07338254450716457, "grad_norm": 0.040497638285160065, "learning_rate": 0.00019524913093858635, "loss": 0.0598, "step": 169 }, { "epoch": 0.07381676074685194, "grad_norm": 0.04683464393019676, "learning_rate": 0.00019522016222479723, "loss": 0.0528, "step": 170 }, { "epoch": 0.07425097698653929, "grad_norm": 0.05493398755788803, "learning_rate": 0.00019519119351100813, "loss": 0.0787, "step": 171 }, { "epoch": 0.07468519322622666, "grad_norm": 0.04186181724071503, "learning_rate": 0.000195162224797219, "loss": 0.0562, "step": 172 }, { "epoch": 0.07511940946591403, "grad_norm": 0.0359225831925869, "learning_rate": 0.0001951332560834299, "loss": 0.0514, "step": 173 }, { "epoch": 0.0755536257056014, "grad_norm": 0.045267727226018906, "learning_rate": 0.0001951042873696408, "loss": 0.0569, "step": 174 }, { "epoch": 0.07598784194528875, "grad_norm": 0.05066123977303505, "learning_rate": 0.0001950753186558517, "loss": 0.0594, "step": 175 }, { "epoch": 0.07642205818497612, "grad_norm": 0.04520639032125473, "learning_rate": 0.0001950463499420626, "loss": 0.0615, "step": 176 }, { "epoch": 0.07685627442466349, "grad_norm": 0.03975681960582733, "learning_rate": 0.00019501738122827347, "loss": 0.0422, "step": 177 }, { "epoch": 0.07729049066435084, "grad_norm": 0.04762616008520126, "learning_rate": 0.00019498841251448437, "loss": 0.0536, "step": 178 }, { "epoch": 0.07772470690403821, "grad_norm": 0.04295413941144943, "learning_rate": 0.00019495944380069525, "loss": 0.0527, "step": 179 }, { "epoch": 0.07815892314372558, "grad_norm": 0.03900258615612984, "learning_rate": 0.00019493047508690615, "loss": 0.0455, "step": 180 }, { "epoch": 0.07859313938341293, "grad_norm": 0.04929991811513901, "learning_rate": 0.00019490150637311703, "loss": 0.0501, "step": 181 }, { "epoch": 0.0790273556231003, "grad_norm": 0.04396167770028114, "learning_rate": 0.00019487253765932793, "loss": 0.0409, "step": 182 }, { "epoch": 0.07946157186278767, "grad_norm": 0.04506872221827507, "learning_rate": 0.00019484356894553884, "loss": 0.0584, "step": 183 }, { "epoch": 0.07989578810247504, "grad_norm": 0.040463682264089584, "learning_rate": 0.00019481460023174972, "loss": 0.0502, "step": 184 }, { "epoch": 0.08033000434216239, "grad_norm": 0.045519910752773285, "learning_rate": 0.00019478563151796062, "loss": 0.051, "step": 185 }, { "epoch": 0.08076422058184976, "grad_norm": 0.04745452105998993, "learning_rate": 0.0001947566628041715, "loss": 0.0531, "step": 186 }, { "epoch": 0.08119843682153713, "grad_norm": 0.03863684460520744, "learning_rate": 0.0001947276940903824, "loss": 0.0444, "step": 187 }, { "epoch": 0.08163265306122448, "grad_norm": 0.0527241975069046, "learning_rate": 0.00019469872537659328, "loss": 0.0641, "step": 188 }, { "epoch": 0.08206686930091185, "grad_norm": 0.047999512404203415, "learning_rate": 0.00019466975666280418, "loss": 0.0658, "step": 189 }, { "epoch": 0.08250108554059922, "grad_norm": 0.04404706880450249, "learning_rate": 0.00019464078794901508, "loss": 0.0599, "step": 190 }, { "epoch": 0.08293530178028659, "grad_norm": 0.034148599952459335, "learning_rate": 0.00019461181923522596, "loss": 0.0444, "step": 191 }, { "epoch": 0.08336951801997394, "grad_norm": 0.038160666823387146, "learning_rate": 0.00019458285052143686, "loss": 0.0467, "step": 192 }, { "epoch": 0.08380373425966131, "grad_norm": 0.056508056819438934, "learning_rate": 0.00019455388180764774, "loss": 0.0618, "step": 193 }, { "epoch": 0.08423795049934868, "grad_norm": 0.045706652104854584, "learning_rate": 0.00019452491309385864, "loss": 0.0587, "step": 194 }, { "epoch": 0.08467216673903603, "grad_norm": 0.03600676730275154, "learning_rate": 0.00019449594438006952, "loss": 0.0451, "step": 195 }, { "epoch": 0.0851063829787234, "grad_norm": 0.029732802882790565, "learning_rate": 0.00019446697566628042, "loss": 0.0385, "step": 196 }, { "epoch": 0.08554059921841077, "grad_norm": 0.03288305923342705, "learning_rate": 0.00019443800695249133, "loss": 0.0408, "step": 197 }, { "epoch": 0.08597481545809814, "grad_norm": 0.028448892757296562, "learning_rate": 0.0001944090382387022, "loss": 0.0369, "step": 198 }, { "epoch": 0.0864090316977855, "grad_norm": 0.04292777553200722, "learning_rate": 0.0001943800695249131, "loss": 0.0532, "step": 199 }, { "epoch": 0.08684324793747286, "grad_norm": 0.03773938864469528, "learning_rate": 0.00019435110081112398, "loss": 0.0444, "step": 200 }, { "epoch": 0.08727746417716023, "grad_norm": 0.03954479470849037, "learning_rate": 0.0001943221320973349, "loss": 0.0485, "step": 201 }, { "epoch": 0.08771168041684758, "grad_norm": 0.04170330613851547, "learning_rate": 0.00019429316338354577, "loss": 0.0459, "step": 202 }, { "epoch": 0.08814589665653495, "grad_norm": 0.035083312541246414, "learning_rate": 0.00019426419466975667, "loss": 0.0424, "step": 203 }, { "epoch": 0.08858011289622232, "grad_norm": 0.05554633215069771, "learning_rate": 0.00019423522595596757, "loss": 0.07, "step": 204 }, { "epoch": 0.08901432913590969, "grad_norm": 0.04733762890100479, "learning_rate": 0.00019420625724217845, "loss": 0.0591, "step": 205 }, { "epoch": 0.08944854537559704, "grad_norm": 0.04187049716711044, "learning_rate": 0.00019417728852838935, "loss": 0.0486, "step": 206 }, { "epoch": 0.08988276161528441, "grad_norm": 0.04470975324511528, "learning_rate": 0.00019414831981460026, "loss": 0.0595, "step": 207 }, { "epoch": 0.09031697785497178, "grad_norm": 0.03953153267502785, "learning_rate": 0.00019411935110081113, "loss": 0.0432, "step": 208 }, { "epoch": 0.09075119409465914, "grad_norm": 0.055020980536937714, "learning_rate": 0.000194090382387022, "loss": 0.0879, "step": 209 }, { "epoch": 0.0911854103343465, "grad_norm": 0.04815017431974411, "learning_rate": 0.00019406141367323291, "loss": 0.0539, "step": 210 }, { "epoch": 0.09161962657403387, "grad_norm": 0.0442352257668972, "learning_rate": 0.00019403244495944382, "loss": 0.0451, "step": 211 }, { "epoch": 0.09205384281372124, "grad_norm": 0.040680866688489914, "learning_rate": 0.0001940034762456547, "loss": 0.0579, "step": 212 }, { "epoch": 0.0924880590534086, "grad_norm": 0.03736395388841629, "learning_rate": 0.0001939745075318656, "loss": 0.0415, "step": 213 }, { "epoch": 0.09292227529309596, "grad_norm": 0.039253294467926025, "learning_rate": 0.0001939455388180765, "loss": 0.0432, "step": 214 }, { "epoch": 0.09335649153278333, "grad_norm": 0.04981299862265587, "learning_rate": 0.00019391657010428738, "loss": 0.069, "step": 215 }, { "epoch": 0.09379070777247069, "grad_norm": 0.04173027351498604, "learning_rate": 0.00019388760139049825, "loss": 0.0522, "step": 216 }, { "epoch": 0.09422492401215805, "grad_norm": 0.034836046397686005, "learning_rate": 0.00019385863267670916, "loss": 0.0458, "step": 217 }, { "epoch": 0.09465914025184542, "grad_norm": 0.047457341104745865, "learning_rate": 0.00019382966396292006, "loss": 0.0586, "step": 218 }, { "epoch": 0.09509335649153278, "grad_norm": 0.03782190755009651, "learning_rate": 0.00019380069524913094, "loss": 0.0444, "step": 219 }, { "epoch": 0.09552757273122015, "grad_norm": 0.03896765410900116, "learning_rate": 0.00019377172653534184, "loss": 0.0515, "step": 220 }, { "epoch": 0.09596178897090751, "grad_norm": 0.04006579518318176, "learning_rate": 0.00019374275782155275, "loss": 0.0598, "step": 221 }, { "epoch": 0.09639600521059488, "grad_norm": 0.03563413396477699, "learning_rate": 0.00019371378910776362, "loss": 0.05, "step": 222 }, { "epoch": 0.09683022145028224, "grad_norm": 0.0475783534348011, "learning_rate": 0.0001936848203939745, "loss": 0.0671, "step": 223 }, { "epoch": 0.0972644376899696, "grad_norm": 0.04286491870880127, "learning_rate": 0.0001936558516801854, "loss": 0.0461, "step": 224 }, { "epoch": 0.09769865392965697, "grad_norm": 0.055003900080919266, "learning_rate": 0.0001936268829663963, "loss": 0.067, "step": 225 }, { "epoch": 0.09813287016934433, "grad_norm": 0.03747004643082619, "learning_rate": 0.0001935979142526072, "loss": 0.0435, "step": 226 }, { "epoch": 0.0985670864090317, "grad_norm": 0.05067036673426628, "learning_rate": 0.0001935689455388181, "loss": 0.0509, "step": 227 }, { "epoch": 0.09900130264871906, "grad_norm": 0.04623427614569664, "learning_rate": 0.000193539976825029, "loss": 0.0535, "step": 228 }, { "epoch": 0.09943551888840643, "grad_norm": 0.04230823367834091, "learning_rate": 0.00019351100811123987, "loss": 0.0492, "step": 229 }, { "epoch": 0.09986973512809379, "grad_norm": 0.047490935772657394, "learning_rate": 0.00019348203939745074, "loss": 0.0579, "step": 230 }, { "epoch": 0.10030395136778116, "grad_norm": 0.05475654825568199, "learning_rate": 0.00019345307068366165, "loss": 0.0718, "step": 231 }, { "epoch": 0.10073816760746852, "grad_norm": 0.03551526367664337, "learning_rate": 0.00019342410196987255, "loss": 0.042, "step": 232 }, { "epoch": 0.10117238384715588, "grad_norm": 0.04030029475688934, "learning_rate": 0.00019339513325608346, "loss": 0.0504, "step": 233 }, { "epoch": 0.10160660008684325, "grad_norm": 0.04321757331490517, "learning_rate": 0.00019336616454229433, "loss": 0.0524, "step": 234 }, { "epoch": 0.10204081632653061, "grad_norm": 0.04394688084721565, "learning_rate": 0.00019333719582850524, "loss": 0.0625, "step": 235 }, { "epoch": 0.10247503256621798, "grad_norm": 0.039190277457237244, "learning_rate": 0.0001933082271147161, "loss": 0.0382, "step": 236 }, { "epoch": 0.10290924880590534, "grad_norm": 0.05130652338266373, "learning_rate": 0.000193279258400927, "loss": 0.0706, "step": 237 }, { "epoch": 0.1033434650455927, "grad_norm": 0.03996153175830841, "learning_rate": 0.0001932502896871379, "loss": 0.0518, "step": 238 }, { "epoch": 0.10377768128528007, "grad_norm": 0.042815543711185455, "learning_rate": 0.0001932213209733488, "loss": 0.0511, "step": 239 }, { "epoch": 0.10421189752496743, "grad_norm": 0.04235919937491417, "learning_rate": 0.0001931923522595597, "loss": 0.0493, "step": 240 }, { "epoch": 0.1046461137646548, "grad_norm": 0.04234609380364418, "learning_rate": 0.00019316338354577058, "loss": 0.0459, "step": 241 }, { "epoch": 0.10508033000434217, "grad_norm": 0.040160950273275375, "learning_rate": 0.00019313441483198148, "loss": 0.0472, "step": 242 }, { "epoch": 0.10551454624402953, "grad_norm": 0.037086643278598785, "learning_rate": 0.00019310544611819236, "loss": 0.0461, "step": 243 }, { "epoch": 0.10594876248371689, "grad_norm": 0.05389266833662987, "learning_rate": 0.00019307647740440323, "loss": 0.0819, "step": 244 }, { "epoch": 0.10638297872340426, "grad_norm": 0.052627865225076675, "learning_rate": 0.00019304750869061414, "loss": 0.0691, "step": 245 }, { "epoch": 0.10681719496309162, "grad_norm": 0.044948361814022064, "learning_rate": 0.00019301853997682504, "loss": 0.0582, "step": 246 }, { "epoch": 0.10725141120277898, "grad_norm": 0.0334913469851017, "learning_rate": 0.00019298957126303594, "loss": 0.0455, "step": 247 }, { "epoch": 0.10768562744246635, "grad_norm": 0.04388870671391487, "learning_rate": 0.00019296060254924682, "loss": 0.0589, "step": 248 }, { "epoch": 0.10811984368215372, "grad_norm": 0.04316788539290428, "learning_rate": 0.00019293163383545773, "loss": 0.0542, "step": 249 }, { "epoch": 0.10855405992184107, "grad_norm": 0.03094732202589512, "learning_rate": 0.0001929026651216686, "loss": 0.042, "step": 250 }, { "epoch": 0.10898827616152844, "grad_norm": 0.030171135440468788, "learning_rate": 0.00019287369640787948, "loss": 0.0407, "step": 251 }, { "epoch": 0.1094224924012158, "grad_norm": 0.03201733157038689, "learning_rate": 0.00019284472769409038, "loss": 0.0499, "step": 252 }, { "epoch": 0.10985670864090318, "grad_norm": 0.04464307427406311, "learning_rate": 0.00019281575898030129, "loss": 0.0481, "step": 253 }, { "epoch": 0.11029092488059053, "grad_norm": 0.04349854588508606, "learning_rate": 0.0001927867902665122, "loss": 0.0544, "step": 254 }, { "epoch": 0.1107251411202779, "grad_norm": 0.0460648313164711, "learning_rate": 0.00019275782155272307, "loss": 0.0609, "step": 255 }, { "epoch": 0.11115935735996527, "grad_norm": 0.03829682245850563, "learning_rate": 0.00019272885283893397, "loss": 0.0598, "step": 256 }, { "epoch": 0.11159357359965262, "grad_norm": 0.035956040024757385, "learning_rate": 0.00019269988412514485, "loss": 0.0478, "step": 257 }, { "epoch": 0.11202778983933999, "grad_norm": 0.03577110916376114, "learning_rate": 0.00019267091541135572, "loss": 0.0502, "step": 258 }, { "epoch": 0.11246200607902736, "grad_norm": 0.05599001795053482, "learning_rate": 0.00019264194669756663, "loss": 0.0803, "step": 259 }, { "epoch": 0.11289622231871473, "grad_norm": 0.038812778890132904, "learning_rate": 0.00019261297798377753, "loss": 0.0435, "step": 260 }, { "epoch": 0.11333043855840208, "grad_norm": 0.04182914271950722, "learning_rate": 0.00019258400926998843, "loss": 0.0599, "step": 261 }, { "epoch": 0.11376465479808945, "grad_norm": 0.03961155563592911, "learning_rate": 0.0001925550405561993, "loss": 0.0516, "step": 262 }, { "epoch": 0.11419887103777682, "grad_norm": 0.036500945687294006, "learning_rate": 0.00019252607184241021, "loss": 0.0441, "step": 263 }, { "epoch": 0.11463308727746417, "grad_norm": 0.03595854714512825, "learning_rate": 0.0001924971031286211, "loss": 0.0506, "step": 264 }, { "epoch": 0.11506730351715154, "grad_norm": 0.044774286448955536, "learning_rate": 0.00019246813441483197, "loss": 0.0655, "step": 265 }, { "epoch": 0.11550151975683891, "grad_norm": 0.03854202479124069, "learning_rate": 0.00019243916570104287, "loss": 0.0527, "step": 266 }, { "epoch": 0.11593573599652628, "grad_norm": 0.03486412763595581, "learning_rate": 0.00019241019698725378, "loss": 0.0472, "step": 267 }, { "epoch": 0.11636995223621363, "grad_norm": 0.039601098746061325, "learning_rate": 0.00019238122827346468, "loss": 0.0424, "step": 268 }, { "epoch": 0.116804168475901, "grad_norm": 0.03565143048763275, "learning_rate": 0.00019235225955967556, "loss": 0.0443, "step": 269 }, { "epoch": 0.11723838471558837, "grad_norm": 0.040365058928728104, "learning_rate": 0.00019232329084588646, "loss": 0.0487, "step": 270 }, { "epoch": 0.11767260095527572, "grad_norm": 0.03593125939369202, "learning_rate": 0.00019229432213209734, "loss": 0.0479, "step": 271 }, { "epoch": 0.11810681719496309, "grad_norm": 0.03898940235376358, "learning_rate": 0.00019226535341830824, "loss": 0.0486, "step": 272 }, { "epoch": 0.11854103343465046, "grad_norm": 0.04047354310750961, "learning_rate": 0.00019223638470451912, "loss": 0.0467, "step": 273 }, { "epoch": 0.11897524967433783, "grad_norm": 0.032248109579086304, "learning_rate": 0.00019220741599073002, "loss": 0.0425, "step": 274 }, { "epoch": 0.11940946591402518, "grad_norm": 0.037578847259283066, "learning_rate": 0.00019217844727694092, "loss": 0.0515, "step": 275 }, { "epoch": 0.11984368215371255, "grad_norm": 0.04379972815513611, "learning_rate": 0.0001921494785631518, "loss": 0.0489, "step": 276 }, { "epoch": 0.12027789839339992, "grad_norm": 0.045067738741636276, "learning_rate": 0.0001921205098493627, "loss": 0.055, "step": 277 }, { "epoch": 0.12071211463308727, "grad_norm": 0.038350384682416916, "learning_rate": 0.00019209154113557358, "loss": 0.0523, "step": 278 }, { "epoch": 0.12114633087277464, "grad_norm": 0.045552853494882584, "learning_rate": 0.00019206257242178448, "loss": 0.0593, "step": 279 }, { "epoch": 0.12158054711246201, "grad_norm": 0.03652540594339371, "learning_rate": 0.00019203360370799536, "loss": 0.0401, "step": 280 }, { "epoch": 0.12201476335214936, "grad_norm": 0.046073734760284424, "learning_rate": 0.00019200463499420626, "loss": 0.057, "step": 281 }, { "epoch": 0.12244897959183673, "grad_norm": 0.03719460591673851, "learning_rate": 0.00019197566628041717, "loss": 0.0507, "step": 282 }, { "epoch": 0.1228831958315241, "grad_norm": 0.046581070870161057, "learning_rate": 0.00019194669756662804, "loss": 0.0667, "step": 283 }, { "epoch": 0.12331741207121147, "grad_norm": 0.06181512773036957, "learning_rate": 0.00019191772885283895, "loss": 0.0689, "step": 284 }, { "epoch": 0.12375162831089882, "grad_norm": 0.034502238035202026, "learning_rate": 0.00019188876013904985, "loss": 0.0451, "step": 285 }, { "epoch": 0.12418584455058619, "grad_norm": 0.033609505742788315, "learning_rate": 0.00019185979142526073, "loss": 0.042, "step": 286 }, { "epoch": 0.12462006079027356, "grad_norm": 0.05046140030026436, "learning_rate": 0.0001918308227114716, "loss": 0.0751, "step": 287 }, { "epoch": 0.12505427702996091, "grad_norm": 0.03362179547548294, "learning_rate": 0.0001918018539976825, "loss": 0.0427, "step": 288 }, { "epoch": 0.12548849326964828, "grad_norm": 0.037858638912439346, "learning_rate": 0.0001917728852838934, "loss": 0.04, "step": 289 }, { "epoch": 0.12592270950933565, "grad_norm": 0.03764700889587402, "learning_rate": 0.0001917439165701043, "loss": 0.0589, "step": 290 }, { "epoch": 0.12635692574902302, "grad_norm": 0.033437393605709076, "learning_rate": 0.0001917149478563152, "loss": 0.0396, "step": 291 }, { "epoch": 0.1267911419887104, "grad_norm": 0.03524801507592201, "learning_rate": 0.0001916859791425261, "loss": 0.0451, "step": 292 }, { "epoch": 0.12722535822839776, "grad_norm": 0.03711460903286934, "learning_rate": 0.00019165701042873697, "loss": 0.0407, "step": 293 }, { "epoch": 0.1276595744680851, "grad_norm": 0.04663747549057007, "learning_rate": 0.00019162804171494785, "loss": 0.0439, "step": 294 }, { "epoch": 0.12809379070777246, "grad_norm": 0.04616788774728775, "learning_rate": 0.00019159907300115875, "loss": 0.0587, "step": 295 }, { "epoch": 0.12852800694745983, "grad_norm": 0.037069790065288544, "learning_rate": 0.00019157010428736966, "loss": 0.0537, "step": 296 }, { "epoch": 0.1289622231871472, "grad_norm": 0.0388314351439476, "learning_rate": 0.00019154113557358053, "loss": 0.0493, "step": 297 }, { "epoch": 0.12939643942683457, "grad_norm": 0.031936164945364, "learning_rate": 0.00019151216685979144, "loss": 0.0395, "step": 298 }, { "epoch": 0.12983065566652194, "grad_norm": 0.040307071059942245, "learning_rate": 0.00019148319814600234, "loss": 0.0543, "step": 299 }, { "epoch": 0.13026487190620928, "grad_norm": 0.03362736105918884, "learning_rate": 0.00019145422943221322, "loss": 0.04, "step": 300 }, { "epoch": 0.13069908814589665, "grad_norm": 0.03325377404689789, "learning_rate": 0.0001914252607184241, "loss": 0.0375, "step": 301 }, { "epoch": 0.13113330438558402, "grad_norm": 0.04420468956232071, "learning_rate": 0.000191396292004635, "loss": 0.0588, "step": 302 }, { "epoch": 0.13156752062527138, "grad_norm": 0.042306333780288696, "learning_rate": 0.0001913673232908459, "loss": 0.0578, "step": 303 }, { "epoch": 0.13200173686495875, "grad_norm": 0.03993650898337364, "learning_rate": 0.00019133835457705678, "loss": 0.0445, "step": 304 }, { "epoch": 0.13243595310464612, "grad_norm": 0.03690562769770622, "learning_rate": 0.00019130938586326768, "loss": 0.0511, "step": 305 }, { "epoch": 0.1328701693443335, "grad_norm": 0.05009304732084274, "learning_rate": 0.0001912804171494786, "loss": 0.0632, "step": 306 }, { "epoch": 0.13330438558402083, "grad_norm": 0.04142018035054207, "learning_rate": 0.00019125144843568946, "loss": 0.0508, "step": 307 }, { "epoch": 0.1337386018237082, "grad_norm": 0.040976040065288544, "learning_rate": 0.00019122247972190034, "loss": 0.0502, "step": 308 }, { "epoch": 0.13417281806339557, "grad_norm": 0.042092591524124146, "learning_rate": 0.00019119351100811124, "loss": 0.052, "step": 309 }, { "epoch": 0.13460703430308293, "grad_norm": 0.037369925528764725, "learning_rate": 0.00019116454229432215, "loss": 0.0432, "step": 310 }, { "epoch": 0.1350412505427703, "grad_norm": 0.037208475172519684, "learning_rate": 0.00019113557358053302, "loss": 0.049, "step": 311 }, { "epoch": 0.13547546678245767, "grad_norm": 0.03912423551082611, "learning_rate": 0.00019110660486674393, "loss": 0.0517, "step": 312 }, { "epoch": 0.13590968302214504, "grad_norm": 0.04449324309825897, "learning_rate": 0.00019107763615295483, "loss": 0.0613, "step": 313 }, { "epoch": 0.13634389926183238, "grad_norm": 0.04702894017100334, "learning_rate": 0.0001910486674391657, "loss": 0.0666, "step": 314 }, { "epoch": 0.13677811550151975, "grad_norm": 0.04253809526562691, "learning_rate": 0.00019101969872537658, "loss": 0.0621, "step": 315 }, { "epoch": 0.13721233174120712, "grad_norm": 0.03856262192130089, "learning_rate": 0.0001909907300115875, "loss": 0.0506, "step": 316 }, { "epoch": 0.13764654798089448, "grad_norm": 0.04405730962753296, "learning_rate": 0.0001909617612977984, "loss": 0.0488, "step": 317 }, { "epoch": 0.13808076422058185, "grad_norm": 0.03860783204436302, "learning_rate": 0.0001909327925840093, "loss": 0.0488, "step": 318 }, { "epoch": 0.13851498046026922, "grad_norm": 0.042276572436094284, "learning_rate": 0.00019090382387022017, "loss": 0.0537, "step": 319 }, { "epoch": 0.1389491966999566, "grad_norm": 0.043404750525951385, "learning_rate": 0.00019087485515643108, "loss": 0.0614, "step": 320 }, { "epoch": 0.13938341293964393, "grad_norm": 0.03293661028146744, "learning_rate": 0.00019084588644264195, "loss": 0.0437, "step": 321 }, { "epoch": 0.1398176291793313, "grad_norm": 0.04241139441728592, "learning_rate": 0.00019081691772885283, "loss": 0.0644, "step": 322 }, { "epoch": 0.14025184541901867, "grad_norm": 0.04477868229150772, "learning_rate": 0.00019078794901506373, "loss": 0.0581, "step": 323 }, { "epoch": 0.14068606165870604, "grad_norm": 0.04254178702831268, "learning_rate": 0.00019075898030127464, "loss": 0.0475, "step": 324 }, { "epoch": 0.1411202778983934, "grad_norm": 0.039650171995162964, "learning_rate": 0.00019073001158748554, "loss": 0.0441, "step": 325 }, { "epoch": 0.14155449413808077, "grad_norm": 0.03547080606222153, "learning_rate": 0.00019070104287369642, "loss": 0.0459, "step": 326 }, { "epoch": 0.14198871037776814, "grad_norm": 0.030985152348876, "learning_rate": 0.00019067207415990732, "loss": 0.0405, "step": 327 }, { "epoch": 0.14242292661745548, "grad_norm": 0.03633008524775505, "learning_rate": 0.0001906431054461182, "loss": 0.0426, "step": 328 }, { "epoch": 0.14285714285714285, "grad_norm": 0.03835967183113098, "learning_rate": 0.00019061413673232907, "loss": 0.0511, "step": 329 }, { "epoch": 0.14329135909683022, "grad_norm": 0.031091703101992607, "learning_rate": 0.00019058516801853998, "loss": 0.0412, "step": 330 }, { "epoch": 0.14372557533651759, "grad_norm": 0.045019570738077164, "learning_rate": 0.00019055619930475088, "loss": 0.0524, "step": 331 }, { "epoch": 0.14415979157620495, "grad_norm": 0.042671624571084976, "learning_rate": 0.00019052723059096179, "loss": 0.0465, "step": 332 }, { "epoch": 0.14459400781589232, "grad_norm": 0.03176131471991539, "learning_rate": 0.00019049826187717266, "loss": 0.0383, "step": 333 }, { "epoch": 0.1450282240555797, "grad_norm": 0.039035145193338394, "learning_rate": 0.00019046929316338357, "loss": 0.0416, "step": 334 }, { "epoch": 0.14546244029526703, "grad_norm": 0.03603845089673996, "learning_rate": 0.00019044032444959444, "loss": 0.0479, "step": 335 }, { "epoch": 0.1458966565349544, "grad_norm": 0.05446283146739006, "learning_rate": 0.00019041135573580532, "loss": 0.058, "step": 336 }, { "epoch": 0.14633087277464177, "grad_norm": 0.03653489425778389, "learning_rate": 0.00019038238702201622, "loss": 0.0505, "step": 337 }, { "epoch": 0.14676508901432914, "grad_norm": 0.03518545627593994, "learning_rate": 0.00019035341830822713, "loss": 0.0446, "step": 338 }, { "epoch": 0.1471993052540165, "grad_norm": 0.03894200548529625, "learning_rate": 0.00019032444959443803, "loss": 0.0446, "step": 339 }, { "epoch": 0.14763352149370387, "grad_norm": 0.05154534801840782, "learning_rate": 0.0001902954808806489, "loss": 0.0537, "step": 340 }, { "epoch": 0.14806773773339124, "grad_norm": 0.0428602434694767, "learning_rate": 0.0001902665121668598, "loss": 0.0535, "step": 341 }, { "epoch": 0.14850195397307858, "grad_norm": 0.03125575929880142, "learning_rate": 0.0001902375434530707, "loss": 0.0434, "step": 342 }, { "epoch": 0.14893617021276595, "grad_norm": 0.039174456149339676, "learning_rate": 0.00019020857473928156, "loss": 0.0513, "step": 343 }, { "epoch": 0.14937038645245332, "grad_norm": 0.0422285832464695, "learning_rate": 0.00019017960602549247, "loss": 0.0578, "step": 344 }, { "epoch": 0.1498046026921407, "grad_norm": 0.041519392281770706, "learning_rate": 0.00019015063731170337, "loss": 0.057, "step": 345 }, { "epoch": 0.15023881893182806, "grad_norm": 0.036152131855487823, "learning_rate": 0.00019012166859791427, "loss": 0.05, "step": 346 }, { "epoch": 0.15067303517151542, "grad_norm": 0.0387120246887207, "learning_rate": 0.00019009269988412515, "loss": 0.057, "step": 347 }, { "epoch": 0.1511072514112028, "grad_norm": 0.03455391153693199, "learning_rate": 0.00019006373117033605, "loss": 0.0453, "step": 348 }, { "epoch": 0.15154146765089013, "grad_norm": 0.034672752022743225, "learning_rate": 0.00019003476245654693, "loss": 0.0407, "step": 349 }, { "epoch": 0.1519756838905775, "grad_norm": 0.036628905683755875, "learning_rate": 0.0001900057937427578, "loss": 0.0446, "step": 350 }, { "epoch": 0.15240990013026487, "grad_norm": 0.04567549750208855, "learning_rate": 0.0001899768250289687, "loss": 0.055, "step": 351 }, { "epoch": 0.15284411636995224, "grad_norm": 0.039983540773391724, "learning_rate": 0.00018994785631517962, "loss": 0.0501, "step": 352 }, { "epoch": 0.1532783326096396, "grad_norm": 0.0370592400431633, "learning_rate": 0.00018991888760139052, "loss": 0.0387, "step": 353 }, { "epoch": 0.15371254884932697, "grad_norm": 0.03668994456529617, "learning_rate": 0.0001898899188876014, "loss": 0.0433, "step": 354 }, { "epoch": 0.15414676508901434, "grad_norm": 0.04388640448451042, "learning_rate": 0.0001898609501738123, "loss": 0.061, "step": 355 }, { "epoch": 0.15458098132870168, "grad_norm": 0.03503924235701561, "learning_rate": 0.00018983198146002318, "loss": 0.045, "step": 356 }, { "epoch": 0.15501519756838905, "grad_norm": 0.05477984622120857, "learning_rate": 0.00018980301274623405, "loss": 0.0677, "step": 357 }, { "epoch": 0.15544941380807642, "grad_norm": 0.03422663360834122, "learning_rate": 0.00018977404403244496, "loss": 0.047, "step": 358 }, { "epoch": 0.1558836300477638, "grad_norm": 0.0409727543592453, "learning_rate": 0.00018974507531865586, "loss": 0.0468, "step": 359 }, { "epoch": 0.15631784628745116, "grad_norm": 0.03525396063923836, "learning_rate": 0.00018971610660486676, "loss": 0.0404, "step": 360 }, { "epoch": 0.15675206252713852, "grad_norm": 0.04045272618532181, "learning_rate": 0.00018968713789107764, "loss": 0.0478, "step": 361 }, { "epoch": 0.15718627876682587, "grad_norm": 0.042639102786779404, "learning_rate": 0.00018965816917728854, "loss": 0.0464, "step": 362 }, { "epoch": 0.15762049500651323, "grad_norm": 0.03818485140800476, "learning_rate": 0.00018962920046349945, "loss": 0.0512, "step": 363 }, { "epoch": 0.1580547112462006, "grad_norm": 0.03609969839453697, "learning_rate": 0.0001896002317497103, "loss": 0.0458, "step": 364 }, { "epoch": 0.15848892748588797, "grad_norm": 0.04345929995179176, "learning_rate": 0.0001895712630359212, "loss": 0.0516, "step": 365 }, { "epoch": 0.15892314372557534, "grad_norm": 0.04102470353245735, "learning_rate": 0.0001895422943221321, "loss": 0.0564, "step": 366 }, { "epoch": 0.1593573599652627, "grad_norm": 0.042316827923059464, "learning_rate": 0.000189513325608343, "loss": 0.0597, "step": 367 }, { "epoch": 0.15979157620495008, "grad_norm": 0.032522041350603104, "learning_rate": 0.00018948435689455389, "loss": 0.0437, "step": 368 }, { "epoch": 0.16022579244463742, "grad_norm": 0.03136477246880531, "learning_rate": 0.0001894553881807648, "loss": 0.0437, "step": 369 }, { "epoch": 0.16066000868432478, "grad_norm": 0.041031040251255035, "learning_rate": 0.0001894264194669757, "loss": 0.0569, "step": 370 }, { "epoch": 0.16109422492401215, "grad_norm": 0.03324378654360771, "learning_rate": 0.00018939745075318657, "loss": 0.043, "step": 371 }, { "epoch": 0.16152844116369952, "grad_norm": 0.03663293272256851, "learning_rate": 0.00018936848203939745, "loss": 0.048, "step": 372 }, { "epoch": 0.1619626574033869, "grad_norm": 0.10473132878541946, "learning_rate": 0.00018933951332560835, "loss": 0.1247, "step": 373 }, { "epoch": 0.16239687364307426, "grad_norm": 0.044064246118068695, "learning_rate": 0.00018931054461181925, "loss": 0.0587, "step": 374 }, { "epoch": 0.16283108988276163, "grad_norm": 0.0406067818403244, "learning_rate": 0.00018928157589803013, "loss": 0.0511, "step": 375 }, { "epoch": 0.16326530612244897, "grad_norm": 0.04347710311412811, "learning_rate": 0.00018925260718424103, "loss": 0.0645, "step": 376 }, { "epoch": 0.16369952236213633, "grad_norm": 0.039979416877031326, "learning_rate": 0.00018922363847045194, "loss": 0.0603, "step": 377 }, { "epoch": 0.1641337386018237, "grad_norm": 0.028565770015120506, "learning_rate": 0.00018919466975666281, "loss": 0.041, "step": 378 }, { "epoch": 0.16456795484151107, "grad_norm": 0.034830570220947266, "learning_rate": 0.0001891657010428737, "loss": 0.0453, "step": 379 }, { "epoch": 0.16500217108119844, "grad_norm": 0.03196999058127403, "learning_rate": 0.0001891367323290846, "loss": 0.0503, "step": 380 }, { "epoch": 0.1654363873208858, "grad_norm": 0.02930157072842121, "learning_rate": 0.0001891077636152955, "loss": 0.046, "step": 381 }, { "epoch": 0.16587060356057318, "grad_norm": 0.028370272368192673, "learning_rate": 0.00018907879490150637, "loss": 0.0406, "step": 382 }, { "epoch": 0.16630481980026052, "grad_norm": 0.030735645443201065, "learning_rate": 0.00018904982618771728, "loss": 0.0452, "step": 383 }, { "epoch": 0.16673903603994789, "grad_norm": 0.04211806878447533, "learning_rate": 0.00018902085747392818, "loss": 0.0517, "step": 384 }, { "epoch": 0.16717325227963525, "grad_norm": 0.03514954820275307, "learning_rate": 0.00018899188876013906, "loss": 0.0462, "step": 385 }, { "epoch": 0.16760746851932262, "grad_norm": 0.03706466034054756, "learning_rate": 0.00018896292004634994, "loss": 0.0484, "step": 386 }, { "epoch": 0.16804168475901, "grad_norm": 0.040088851004838943, "learning_rate": 0.00018893395133256084, "loss": 0.0616, "step": 387 }, { "epoch": 0.16847590099869736, "grad_norm": 0.03620345517992973, "learning_rate": 0.00018890498261877174, "loss": 0.0542, "step": 388 }, { "epoch": 0.16891011723838473, "grad_norm": 0.035707663744688034, "learning_rate": 0.00018887601390498262, "loss": 0.0486, "step": 389 }, { "epoch": 0.16934433347807207, "grad_norm": 0.03514106571674347, "learning_rate": 0.00018884704519119352, "loss": 0.0533, "step": 390 }, { "epoch": 0.16977854971775944, "grad_norm": 0.03197222575545311, "learning_rate": 0.00018881807647740443, "loss": 0.0426, "step": 391 }, { "epoch": 0.1702127659574468, "grad_norm": 0.039178889244794846, "learning_rate": 0.0001887891077636153, "loss": 0.0535, "step": 392 }, { "epoch": 0.17064698219713417, "grad_norm": 0.046329937875270844, "learning_rate": 0.00018876013904982618, "loss": 0.0664, "step": 393 }, { "epoch": 0.17108119843682154, "grad_norm": 0.04918839782476425, "learning_rate": 0.00018873117033603708, "loss": 0.0578, "step": 394 }, { "epoch": 0.1715154146765089, "grad_norm": 0.03348748758435249, "learning_rate": 0.000188702201622248, "loss": 0.0498, "step": 395 }, { "epoch": 0.17194963091619628, "grad_norm": 0.033797796815633774, "learning_rate": 0.00018867323290845886, "loss": 0.0404, "step": 396 }, { "epoch": 0.17238384715588362, "grad_norm": 0.03595108166337013, "learning_rate": 0.00018864426419466977, "loss": 0.0564, "step": 397 }, { "epoch": 0.172818063395571, "grad_norm": 0.035284657031297684, "learning_rate": 0.00018861529548088067, "loss": 0.0532, "step": 398 }, { "epoch": 0.17325227963525835, "grad_norm": 0.038479581475257874, "learning_rate": 0.00018858632676709155, "loss": 0.0463, "step": 399 }, { "epoch": 0.17368649587494572, "grad_norm": 0.04863559082150459, "learning_rate": 0.00018855735805330242, "loss": 0.0669, "step": 400 }, { "epoch": 0.1741207121146331, "grad_norm": 0.050505075603723526, "learning_rate": 0.00018852838933951333, "loss": 0.068, "step": 401 }, { "epoch": 0.17455492835432046, "grad_norm": 0.03283257037401199, "learning_rate": 0.00018849942062572423, "loss": 0.0439, "step": 402 }, { "epoch": 0.17498914459400783, "grad_norm": 0.03564135357737541, "learning_rate": 0.0001884704519119351, "loss": 0.0521, "step": 403 }, { "epoch": 0.17542336083369517, "grad_norm": 0.04079529643058777, "learning_rate": 0.000188441483198146, "loss": 0.0536, "step": 404 }, { "epoch": 0.17585757707338254, "grad_norm": 0.03021615743637085, "learning_rate": 0.00018841251448435692, "loss": 0.0408, "step": 405 }, { "epoch": 0.1762917933130699, "grad_norm": 0.04835856333374977, "learning_rate": 0.0001883835457705678, "loss": 0.0575, "step": 406 }, { "epoch": 0.17672600955275727, "grad_norm": 0.02825934812426567, "learning_rate": 0.00018835457705677867, "loss": 0.0398, "step": 407 }, { "epoch": 0.17716022579244464, "grad_norm": 0.03277318924665451, "learning_rate": 0.00018832560834298957, "loss": 0.0511, "step": 408 }, { "epoch": 0.177594442032132, "grad_norm": 0.03525669872760773, "learning_rate": 0.00018829663962920048, "loss": 0.0489, "step": 409 }, { "epoch": 0.17802865827181938, "grad_norm": 0.04269387200474739, "learning_rate": 0.00018826767091541135, "loss": 0.0568, "step": 410 }, { "epoch": 0.17846287451150672, "grad_norm": 0.03460850194096565, "learning_rate": 0.00018823870220162226, "loss": 0.0443, "step": 411 }, { "epoch": 0.1788970907511941, "grad_norm": 0.04240446910262108, "learning_rate": 0.00018820973348783316, "loss": 0.0547, "step": 412 }, { "epoch": 0.17933130699088146, "grad_norm": 0.03730539232492447, "learning_rate": 0.00018818076477404404, "loss": 0.0517, "step": 413 }, { "epoch": 0.17976552323056882, "grad_norm": 0.04469822719693184, "learning_rate": 0.00018815179606025491, "loss": 0.057, "step": 414 }, { "epoch": 0.1801997394702562, "grad_norm": 0.03404216840863228, "learning_rate": 0.00018812282734646582, "loss": 0.0398, "step": 415 }, { "epoch": 0.18063395570994356, "grad_norm": 0.03696839138865471, "learning_rate": 0.00018809385863267672, "loss": 0.0566, "step": 416 }, { "epoch": 0.18106817194963093, "grad_norm": 0.037126317620277405, "learning_rate": 0.00018806488991888763, "loss": 0.0453, "step": 417 }, { "epoch": 0.18150238818931827, "grad_norm": 0.029424721375107765, "learning_rate": 0.0001880359212050985, "loss": 0.0454, "step": 418 }, { "epoch": 0.18193660442900564, "grad_norm": 0.034976352006196976, "learning_rate": 0.0001880069524913094, "loss": 0.0435, "step": 419 }, { "epoch": 0.182370820668693, "grad_norm": 0.03662649914622307, "learning_rate": 0.00018797798377752028, "loss": 0.0499, "step": 420 }, { "epoch": 0.18280503690838038, "grad_norm": 0.0361071415245533, "learning_rate": 0.00018794901506373116, "loss": 0.0536, "step": 421 }, { "epoch": 0.18323925314806774, "grad_norm": 0.040313661098480225, "learning_rate": 0.00018792004634994206, "loss": 0.0512, "step": 422 }, { "epoch": 0.1836734693877551, "grad_norm": 0.04793829098343849, "learning_rate": 0.00018789107763615297, "loss": 0.0457, "step": 423 }, { "epoch": 0.18410768562744248, "grad_norm": 0.043424248695373535, "learning_rate": 0.00018786210892236387, "loss": 0.0531, "step": 424 }, { "epoch": 0.18454190186712982, "grad_norm": 0.05836452543735504, "learning_rate": 0.00018783314020857475, "loss": 0.0701, "step": 425 }, { "epoch": 0.1849761181068172, "grad_norm": 0.040584318339824677, "learning_rate": 0.00018780417149478565, "loss": 0.0463, "step": 426 }, { "epoch": 0.18541033434650456, "grad_norm": 0.04535475745797157, "learning_rate": 0.00018777520278099653, "loss": 0.0689, "step": 427 }, { "epoch": 0.18584455058619193, "grad_norm": 0.034920502454042435, "learning_rate": 0.0001877462340672074, "loss": 0.0473, "step": 428 }, { "epoch": 0.1862787668258793, "grad_norm": 0.050958339124917984, "learning_rate": 0.0001877172653534183, "loss": 0.0437, "step": 429 }, { "epoch": 0.18671298306556666, "grad_norm": 0.030035076662898064, "learning_rate": 0.0001876882966396292, "loss": 0.0408, "step": 430 }, { "epoch": 0.187147199305254, "grad_norm": 0.039723604917526245, "learning_rate": 0.00018765932792584011, "loss": 0.0509, "step": 431 }, { "epoch": 0.18758141554494137, "grad_norm": 0.03725224360823631, "learning_rate": 0.000187630359212051, "loss": 0.0474, "step": 432 }, { "epoch": 0.18801563178462874, "grad_norm": 0.03871114179491997, "learning_rate": 0.0001876013904982619, "loss": 0.0498, "step": 433 }, { "epoch": 0.1884498480243161, "grad_norm": 0.03517184779047966, "learning_rate": 0.00018757242178447277, "loss": 0.0502, "step": 434 }, { "epoch": 0.18888406426400348, "grad_norm": 0.039909590035676956, "learning_rate": 0.00018754345307068365, "loss": 0.0521, "step": 435 }, { "epoch": 0.18931828050369084, "grad_norm": 0.0414666086435318, "learning_rate": 0.00018751448435689455, "loss": 0.0568, "step": 436 }, { "epoch": 0.1897524967433782, "grad_norm": 0.03581927344202995, "learning_rate": 0.00018748551564310546, "loss": 0.0425, "step": 437 }, { "epoch": 0.19018671298306555, "grad_norm": 0.03278425708413124, "learning_rate": 0.00018745654692931636, "loss": 0.0428, "step": 438 }, { "epoch": 0.19062092922275292, "grad_norm": 0.038047291338443756, "learning_rate": 0.00018742757821552724, "loss": 0.0497, "step": 439 }, { "epoch": 0.1910551454624403, "grad_norm": 0.04604040086269379, "learning_rate": 0.00018739860950173814, "loss": 0.0574, "step": 440 }, { "epoch": 0.19148936170212766, "grad_norm": 0.042475033551454544, "learning_rate": 0.00018736964078794904, "loss": 0.0505, "step": 441 }, { "epoch": 0.19192357794181503, "grad_norm": 0.05272972211241722, "learning_rate": 0.0001873406720741599, "loss": 0.0816, "step": 442 }, { "epoch": 0.1923577941815024, "grad_norm": 0.03222045674920082, "learning_rate": 0.0001873117033603708, "loss": 0.0386, "step": 443 }, { "epoch": 0.19279201042118976, "grad_norm": 0.04291742667555809, "learning_rate": 0.0001872827346465817, "loss": 0.0441, "step": 444 }, { "epoch": 0.1932262266608771, "grad_norm": 0.040193844586610794, "learning_rate": 0.0001872537659327926, "loss": 0.0563, "step": 445 }, { "epoch": 0.19366044290056447, "grad_norm": 0.034066155552864075, "learning_rate": 0.00018722479721900348, "loss": 0.0413, "step": 446 }, { "epoch": 0.19409465914025184, "grad_norm": 0.04078992083668709, "learning_rate": 0.00018719582850521438, "loss": 0.0649, "step": 447 }, { "epoch": 0.1945288753799392, "grad_norm": 0.04151568189263344, "learning_rate": 0.0001871668597914253, "loss": 0.0459, "step": 448 }, { "epoch": 0.19496309161962658, "grad_norm": 0.036577727645635605, "learning_rate": 0.00018713789107763616, "loss": 0.0513, "step": 449 }, { "epoch": 0.19539730785931395, "grad_norm": 0.03562239557504654, "learning_rate": 0.00018710892236384704, "loss": 0.0452, "step": 450 }, { "epoch": 0.19583152409900131, "grad_norm": 0.032215919345617294, "learning_rate": 0.00018707995365005795, "loss": 0.036, "step": 451 }, { "epoch": 0.19626574033868865, "grad_norm": 0.03549022600054741, "learning_rate": 0.00018705098493626885, "loss": 0.0528, "step": 452 }, { "epoch": 0.19669995657837602, "grad_norm": 0.03795130178332329, "learning_rate": 0.00018702201622247973, "loss": 0.0577, "step": 453 }, { "epoch": 0.1971341728180634, "grad_norm": 0.037727952003479004, "learning_rate": 0.00018699304750869063, "loss": 0.053, "step": 454 }, { "epoch": 0.19756838905775076, "grad_norm": 0.03944394737482071, "learning_rate": 0.00018696407879490153, "loss": 0.0598, "step": 455 }, { "epoch": 0.19800260529743813, "grad_norm": 0.033672090619802475, "learning_rate": 0.0001869351100811124, "loss": 0.0473, "step": 456 }, { "epoch": 0.1984368215371255, "grad_norm": 0.03219886124134064, "learning_rate": 0.00018690614136732329, "loss": 0.0376, "step": 457 }, { "epoch": 0.19887103777681286, "grad_norm": 0.03385812044143677, "learning_rate": 0.0001868771726535342, "loss": 0.0451, "step": 458 }, { "epoch": 0.1993052540165002, "grad_norm": 0.03345385193824768, "learning_rate": 0.0001868482039397451, "loss": 0.0486, "step": 459 }, { "epoch": 0.19973947025618757, "grad_norm": 0.03472769260406494, "learning_rate": 0.00018681923522595597, "loss": 0.0514, "step": 460 }, { "epoch": 0.20017368649587494, "grad_norm": 0.05609716847538948, "learning_rate": 0.00018679026651216687, "loss": 0.0892, "step": 461 }, { "epoch": 0.2006079027355623, "grad_norm": 0.03173099085688591, "learning_rate": 0.00018676129779837778, "loss": 0.0448, "step": 462 }, { "epoch": 0.20104211897524968, "grad_norm": 0.030016480013728142, "learning_rate": 0.00018673232908458865, "loss": 0.0422, "step": 463 }, { "epoch": 0.20147633521493705, "grad_norm": 0.034051086753606796, "learning_rate": 0.00018670336037079953, "loss": 0.0472, "step": 464 }, { "epoch": 0.20191055145462442, "grad_norm": 0.03701572120189667, "learning_rate": 0.00018667439165701043, "loss": 0.0523, "step": 465 }, { "epoch": 0.20234476769431176, "grad_norm": 0.029499176889657974, "learning_rate": 0.00018664542294322134, "loss": 0.0455, "step": 466 }, { "epoch": 0.20277898393399912, "grad_norm": 0.032826367765665054, "learning_rate": 0.00018661645422943221, "loss": 0.0496, "step": 467 }, { "epoch": 0.2032132001736865, "grad_norm": 0.04142281040549278, "learning_rate": 0.00018658748551564312, "loss": 0.0587, "step": 468 }, { "epoch": 0.20364741641337386, "grad_norm": 0.03528276085853577, "learning_rate": 0.00018655851680185402, "loss": 0.0441, "step": 469 }, { "epoch": 0.20408163265306123, "grad_norm": 0.030323248356580734, "learning_rate": 0.0001865295480880649, "loss": 0.0429, "step": 470 }, { "epoch": 0.2045158488927486, "grad_norm": 0.029259048402309418, "learning_rate": 0.00018650057937427578, "loss": 0.0351, "step": 471 }, { "epoch": 0.20495006513243597, "grad_norm": 0.033875975757837296, "learning_rate": 0.00018647161066048668, "loss": 0.0462, "step": 472 }, { "epoch": 0.2053842813721233, "grad_norm": 0.04482308775186539, "learning_rate": 0.00018644264194669758, "loss": 0.0555, "step": 473 }, { "epoch": 0.20581849761181067, "grad_norm": 0.03636910021305084, "learning_rate": 0.00018641367323290846, "loss": 0.0488, "step": 474 }, { "epoch": 0.20625271385149804, "grad_norm": 0.032956209033727646, "learning_rate": 0.00018638470451911936, "loss": 0.0422, "step": 475 }, { "epoch": 0.2066869300911854, "grad_norm": 0.03562194108963013, "learning_rate": 0.00018635573580533027, "loss": 0.0431, "step": 476 }, { "epoch": 0.20712114633087278, "grad_norm": 0.04482211172580719, "learning_rate": 0.00018632676709154114, "loss": 0.0547, "step": 477 }, { "epoch": 0.20755536257056015, "grad_norm": 0.034216947853565216, "learning_rate": 0.00018629779837775202, "loss": 0.0478, "step": 478 }, { "epoch": 0.20798957881024752, "grad_norm": 0.02818777970969677, "learning_rate": 0.00018626882966396292, "loss": 0.042, "step": 479 }, { "epoch": 0.20842379504993486, "grad_norm": 0.030506035313010216, "learning_rate": 0.00018623986095017383, "loss": 0.0385, "step": 480 }, { "epoch": 0.20885801128962223, "grad_norm": 0.032848045229911804, "learning_rate": 0.0001862108922363847, "loss": 0.0434, "step": 481 }, { "epoch": 0.2092922275293096, "grad_norm": 0.03476947173476219, "learning_rate": 0.0001861819235225956, "loss": 0.0473, "step": 482 }, { "epoch": 0.20972644376899696, "grad_norm": 0.03295399621129036, "learning_rate": 0.0001861529548088065, "loss": 0.0429, "step": 483 }, { "epoch": 0.21016066000868433, "grad_norm": 0.037790436297655106, "learning_rate": 0.0001861239860950174, "loss": 0.0559, "step": 484 }, { "epoch": 0.2105948762483717, "grad_norm": 0.03582257404923439, "learning_rate": 0.00018609501738122827, "loss": 0.0541, "step": 485 }, { "epoch": 0.21102909248805907, "grad_norm": 0.03188883885741234, "learning_rate": 0.00018606604866743917, "loss": 0.0462, "step": 486 }, { "epoch": 0.2114633087277464, "grad_norm": 0.037030480802059174, "learning_rate": 0.00018603707995365007, "loss": 0.0487, "step": 487 }, { "epoch": 0.21189752496743378, "grad_norm": 0.033926915377378464, "learning_rate": 0.00018600811123986095, "loss": 0.0442, "step": 488 }, { "epoch": 0.21233174120712114, "grad_norm": 0.03145325183868408, "learning_rate": 0.00018597914252607185, "loss": 0.0394, "step": 489 }, { "epoch": 0.2127659574468085, "grad_norm": 0.03710542246699333, "learning_rate": 0.00018595017381228276, "loss": 0.0558, "step": 490 }, { "epoch": 0.21320017368649588, "grad_norm": 0.03222250938415527, "learning_rate": 0.00018592120509849363, "loss": 0.0462, "step": 491 }, { "epoch": 0.21363438992618325, "grad_norm": 0.032189395278692245, "learning_rate": 0.0001858922363847045, "loss": 0.0345, "step": 492 }, { "epoch": 0.21406860616587062, "grad_norm": 0.0345730222761631, "learning_rate": 0.0001858632676709154, "loss": 0.0526, "step": 493 }, { "epoch": 0.21450282240555796, "grad_norm": 0.035095468163490295, "learning_rate": 0.00018583429895712632, "loss": 0.0431, "step": 494 }, { "epoch": 0.21493703864524533, "grad_norm": 0.04218198359012604, "learning_rate": 0.0001858053302433372, "loss": 0.0578, "step": 495 }, { "epoch": 0.2153712548849327, "grad_norm": 0.03327875956892967, "learning_rate": 0.0001857763615295481, "loss": 0.0464, "step": 496 }, { "epoch": 0.21580547112462006, "grad_norm": 0.04200523719191551, "learning_rate": 0.000185747392815759, "loss": 0.0542, "step": 497 }, { "epoch": 0.21623968736430743, "grad_norm": 0.02822558768093586, "learning_rate": 0.00018571842410196988, "loss": 0.0412, "step": 498 }, { "epoch": 0.2166739036039948, "grad_norm": 0.031007803976535797, "learning_rate": 0.00018568945538818075, "loss": 0.0367, "step": 499 }, { "epoch": 0.21710811984368214, "grad_norm": 0.039218004792928696, "learning_rate": 0.00018566048667439166, "loss": 0.0503, "step": 500 }, { "epoch": 0.2175423360833695, "grad_norm": 0.037772033363580704, "learning_rate": 0.00018563151796060256, "loss": 0.0526, "step": 501 }, { "epoch": 0.21797655232305688, "grad_norm": 0.028330016881227493, "learning_rate": 0.00018560254924681344, "loss": 0.0382, "step": 502 }, { "epoch": 0.21841076856274425, "grad_norm": 0.04708074405789375, "learning_rate": 0.00018557358053302434, "loss": 0.0622, "step": 503 }, { "epoch": 0.2188449848024316, "grad_norm": 0.03838808089494705, "learning_rate": 0.00018554461181923525, "loss": 0.05, "step": 504 }, { "epoch": 0.21927920104211898, "grad_norm": 0.03537886217236519, "learning_rate": 0.00018551564310544612, "loss": 0.0525, "step": 505 }, { "epoch": 0.21971341728180635, "grad_norm": 0.030282124876976013, "learning_rate": 0.000185486674391657, "loss": 0.0475, "step": 506 }, { "epoch": 0.2201476335214937, "grad_norm": 0.030899209901690483, "learning_rate": 0.0001854577056778679, "loss": 0.0423, "step": 507 }, { "epoch": 0.22058184976118106, "grad_norm": 0.04036472737789154, "learning_rate": 0.0001854287369640788, "loss": 0.059, "step": 508 }, { "epoch": 0.22101606600086843, "grad_norm": 0.03361595794558525, "learning_rate": 0.0001853997682502897, "loss": 0.0408, "step": 509 }, { "epoch": 0.2214502822405558, "grad_norm": 0.03743249177932739, "learning_rate": 0.0001853707995365006, "loss": 0.0449, "step": 510 }, { "epoch": 0.22188449848024316, "grad_norm": 0.034678492695093155, "learning_rate": 0.0001853418308227115, "loss": 0.0421, "step": 511 }, { "epoch": 0.22231871471993053, "grad_norm": 0.04270632192492485, "learning_rate": 0.00018531286210892237, "loss": 0.0615, "step": 512 }, { "epoch": 0.2227529309596179, "grad_norm": 0.03959297761321068, "learning_rate": 0.00018528389339513324, "loss": 0.0545, "step": 513 }, { "epoch": 0.22318714719930524, "grad_norm": 0.029921963810920715, "learning_rate": 0.00018525492468134415, "loss": 0.0473, "step": 514 }, { "epoch": 0.2236213634389926, "grad_norm": 0.033521562814712524, "learning_rate": 0.00018522595596755505, "loss": 0.0409, "step": 515 }, { "epoch": 0.22405557967867998, "grad_norm": 0.028009548783302307, "learning_rate": 0.00018519698725376596, "loss": 0.037, "step": 516 }, { "epoch": 0.22448979591836735, "grad_norm": 0.030624153092503548, "learning_rate": 0.00018516801853997683, "loss": 0.0466, "step": 517 }, { "epoch": 0.22492401215805471, "grad_norm": 0.04178798571228981, "learning_rate": 0.00018513904982618774, "loss": 0.0527, "step": 518 }, { "epoch": 0.22535822839774208, "grad_norm": 0.034036047756671906, "learning_rate": 0.00018511008111239864, "loss": 0.0517, "step": 519 }, { "epoch": 0.22579244463742945, "grad_norm": 0.03999226540327072, "learning_rate": 0.0001850811123986095, "loss": 0.0566, "step": 520 }, { "epoch": 0.2262266608771168, "grad_norm": 0.03676460310816765, "learning_rate": 0.0001850521436848204, "loss": 0.0507, "step": 521 }, { "epoch": 0.22666087711680416, "grad_norm": 0.03431371599435806, "learning_rate": 0.0001850231749710313, "loss": 0.0541, "step": 522 }, { "epoch": 0.22709509335649153, "grad_norm": 0.03752364218235016, "learning_rate": 0.0001849942062572422, "loss": 0.0558, "step": 523 }, { "epoch": 0.2275293095961789, "grad_norm": 0.03015449084341526, "learning_rate": 0.00018496523754345308, "loss": 0.038, "step": 524 }, { "epoch": 0.22796352583586627, "grad_norm": 0.035126399248838425, "learning_rate": 0.00018493626882966398, "loss": 0.0537, "step": 525 }, { "epoch": 0.22839774207555363, "grad_norm": 0.03131519630551338, "learning_rate": 0.00018490730011587488, "loss": 0.0474, "step": 526 }, { "epoch": 0.228831958315241, "grad_norm": 0.0279023889452219, "learning_rate": 0.00018487833140208576, "loss": 0.0418, "step": 527 }, { "epoch": 0.22926617455492834, "grad_norm": 0.03264223039150238, "learning_rate": 0.00018484936268829664, "loss": 0.049, "step": 528 }, { "epoch": 0.2297003907946157, "grad_norm": 0.03934367373585701, "learning_rate": 0.00018482039397450754, "loss": 0.0489, "step": 529 }, { "epoch": 0.23013460703430308, "grad_norm": 0.03522181510925293, "learning_rate": 0.00018479142526071844, "loss": 0.0587, "step": 530 }, { "epoch": 0.23056882327399045, "grad_norm": 0.029996689409017563, "learning_rate": 0.00018476245654692932, "loss": 0.0454, "step": 531 }, { "epoch": 0.23100303951367782, "grad_norm": 0.034754350781440735, "learning_rate": 0.00018473348783314022, "loss": 0.0462, "step": 532 }, { "epoch": 0.23143725575336518, "grad_norm": 0.03367964178323746, "learning_rate": 0.00018470451911935113, "loss": 0.0535, "step": 533 }, { "epoch": 0.23187147199305255, "grad_norm": 0.0329362154006958, "learning_rate": 0.000184675550405562, "loss": 0.0467, "step": 534 }, { "epoch": 0.2323056882327399, "grad_norm": 0.03338341787457466, "learning_rate": 0.00018464658169177288, "loss": 0.0475, "step": 535 }, { "epoch": 0.23273990447242726, "grad_norm": 0.028364917263388634, "learning_rate": 0.00018461761297798379, "loss": 0.0373, "step": 536 }, { "epoch": 0.23317412071211463, "grad_norm": 0.03057895228266716, "learning_rate": 0.0001845886442641947, "loss": 0.0412, "step": 537 }, { "epoch": 0.233608336951802, "grad_norm": 0.035345304757356644, "learning_rate": 0.00018455967555040557, "loss": 0.0469, "step": 538 }, { "epoch": 0.23404255319148937, "grad_norm": 0.029698606580495834, "learning_rate": 0.00018453070683661647, "loss": 0.0369, "step": 539 }, { "epoch": 0.23447676943117673, "grad_norm": 0.047548290342092514, "learning_rate": 0.00018450173812282737, "loss": 0.0718, "step": 540 }, { "epoch": 0.2349109856708641, "grad_norm": 0.026436997577548027, "learning_rate": 0.00018447276940903825, "loss": 0.0336, "step": 541 }, { "epoch": 0.23534520191055144, "grad_norm": 0.047358106821775436, "learning_rate": 0.00018444380069524913, "loss": 0.0635, "step": 542 }, { "epoch": 0.2357794181502388, "grad_norm": 0.03718942031264305, "learning_rate": 0.00018441483198146003, "loss": 0.0499, "step": 543 }, { "epoch": 0.23621363438992618, "grad_norm": 0.047774482518434525, "learning_rate": 0.00018438586326767093, "loss": 0.0613, "step": 544 }, { "epoch": 0.23664785062961355, "grad_norm": 0.034137409180402756, "learning_rate": 0.0001843568945538818, "loss": 0.046, "step": 545 }, { "epoch": 0.23708206686930092, "grad_norm": 0.03172863647341728, "learning_rate": 0.00018432792584009271, "loss": 0.0347, "step": 546 }, { "epoch": 0.23751628310898829, "grad_norm": 0.031843140721321106, "learning_rate": 0.00018429895712630362, "loss": 0.0438, "step": 547 }, { "epoch": 0.23795049934867565, "grad_norm": 0.03779742494225502, "learning_rate": 0.0001842699884125145, "loss": 0.0505, "step": 548 }, { "epoch": 0.238384715588363, "grad_norm": 0.035327598452568054, "learning_rate": 0.00018424101969872537, "loss": 0.0486, "step": 549 }, { "epoch": 0.23881893182805036, "grad_norm": 0.04511626437306404, "learning_rate": 0.00018421205098493627, "loss": 0.0489, "step": 550 }, { "epoch": 0.23925314806773773, "grad_norm": 0.03448478877544403, "learning_rate": 0.00018418308227114718, "loss": 0.0585, "step": 551 }, { "epoch": 0.2396873643074251, "grad_norm": 0.028998849913477898, "learning_rate": 0.00018415411355735806, "loss": 0.0406, "step": 552 }, { "epoch": 0.24012158054711247, "grad_norm": 0.02575867809355259, "learning_rate": 0.00018412514484356896, "loss": 0.0373, "step": 553 }, { "epoch": 0.24055579678679984, "grad_norm": 0.028639430180191994, "learning_rate": 0.00018409617612977986, "loss": 0.0441, "step": 554 }, { "epoch": 0.2409900130264872, "grad_norm": 0.038669489324092865, "learning_rate": 0.00018406720741599074, "loss": 0.0455, "step": 555 }, { "epoch": 0.24142422926617454, "grad_norm": 0.03676658123731613, "learning_rate": 0.00018403823870220162, "loss": 0.0503, "step": 556 }, { "epoch": 0.2418584455058619, "grad_norm": 0.03790974244475365, "learning_rate": 0.00018400926998841252, "loss": 0.0574, "step": 557 }, { "epoch": 0.24229266174554928, "grad_norm": 0.038189467042684555, "learning_rate": 0.00018398030127462342, "loss": 0.0442, "step": 558 }, { "epoch": 0.24272687798523665, "grad_norm": 0.03880944103002548, "learning_rate": 0.0001839513325608343, "loss": 0.0582, "step": 559 }, { "epoch": 0.24316109422492402, "grad_norm": 0.03059050813317299, "learning_rate": 0.0001839223638470452, "loss": 0.0427, "step": 560 }, { "epoch": 0.2435953104646114, "grad_norm": 0.03202185779809952, "learning_rate": 0.0001838933951332561, "loss": 0.0457, "step": 561 }, { "epoch": 0.24402952670429873, "grad_norm": 0.03469632565975189, "learning_rate": 0.00018386442641946698, "loss": 0.0469, "step": 562 }, { "epoch": 0.2444637429439861, "grad_norm": 0.038638871163129807, "learning_rate": 0.00018383545770567786, "loss": 0.0531, "step": 563 }, { "epoch": 0.24489795918367346, "grad_norm": 0.031056666746735573, "learning_rate": 0.00018380648899188876, "loss": 0.0439, "step": 564 }, { "epoch": 0.24533217542336083, "grad_norm": 0.027191102504730225, "learning_rate": 0.00018377752027809967, "loss": 0.0389, "step": 565 }, { "epoch": 0.2457663916630482, "grad_norm": 0.03715234622359276, "learning_rate": 0.00018374855156431054, "loss": 0.0481, "step": 566 }, { "epoch": 0.24620060790273557, "grad_norm": 0.03761928528547287, "learning_rate": 0.00018371958285052145, "loss": 0.0464, "step": 567 }, { "epoch": 0.24663482414242294, "grad_norm": 0.041414253413677216, "learning_rate": 0.00018369061413673235, "loss": 0.0764, "step": 568 }, { "epoch": 0.24706904038211028, "grad_norm": 0.03229385241866112, "learning_rate": 0.00018366164542294323, "loss": 0.0475, "step": 569 }, { "epoch": 0.24750325662179765, "grad_norm": 0.04780471324920654, "learning_rate": 0.0001836326767091541, "loss": 0.0582, "step": 570 }, { "epoch": 0.24793747286148501, "grad_norm": 0.035480841994285583, "learning_rate": 0.000183603707995365, "loss": 0.0472, "step": 571 }, { "epoch": 0.24837168910117238, "grad_norm": 0.03601158410310745, "learning_rate": 0.0001835747392815759, "loss": 0.0453, "step": 572 }, { "epoch": 0.24880590534085975, "grad_norm": 0.03631042316555977, "learning_rate": 0.0001835457705677868, "loss": 0.0501, "step": 573 }, { "epoch": 0.24924012158054712, "grad_norm": 0.03484423831105232, "learning_rate": 0.0001835168018539977, "loss": 0.048, "step": 574 }, { "epoch": 0.2496743378202345, "grad_norm": 0.026493581011891365, "learning_rate": 0.0001834878331402086, "loss": 0.0378, "step": 575 }, { "epoch": 0.25010855405992183, "grad_norm": 0.053333114832639694, "learning_rate": 0.00018345886442641947, "loss": 0.064, "step": 576 }, { "epoch": 0.2505427702996092, "grad_norm": 0.04745639115571976, "learning_rate": 0.00018342989571263035, "loss": 0.0647, "step": 577 }, { "epoch": 0.25097698653929656, "grad_norm": 0.03873669356107712, "learning_rate": 0.00018340092699884125, "loss": 0.051, "step": 578 }, { "epoch": 0.25141120277898393, "grad_norm": 0.03193731606006622, "learning_rate": 0.00018337195828505216, "loss": 0.0498, "step": 579 }, { "epoch": 0.2518454190186713, "grad_norm": 0.031808141618967056, "learning_rate": 0.00018334298957126303, "loss": 0.0454, "step": 580 }, { "epoch": 0.25227963525835867, "grad_norm": 0.033278994262218475, "learning_rate": 0.00018331402085747394, "loss": 0.0411, "step": 581 }, { "epoch": 0.25271385149804604, "grad_norm": 0.0367935411632061, "learning_rate": 0.00018328505214368484, "loss": 0.0516, "step": 582 }, { "epoch": 0.2531480677377334, "grad_norm": 0.039125893265008926, "learning_rate": 0.00018325608342989572, "loss": 0.0509, "step": 583 }, { "epoch": 0.2535822839774208, "grad_norm": 0.048576854169368744, "learning_rate": 0.0001832271147161066, "loss": 0.0597, "step": 584 }, { "epoch": 0.25401650021710814, "grad_norm": 0.029854651540517807, "learning_rate": 0.0001831981460023175, "loss": 0.0415, "step": 585 }, { "epoch": 0.2544507164567955, "grad_norm": 0.028105296194553375, "learning_rate": 0.0001831691772885284, "loss": 0.0441, "step": 586 }, { "epoch": 0.2548849326964828, "grad_norm": 0.03419432416558266, "learning_rate": 0.00018314020857473928, "loss": 0.0516, "step": 587 }, { "epoch": 0.2553191489361702, "grad_norm": 0.03126964345574379, "learning_rate": 0.00018311123986095018, "loss": 0.0439, "step": 588 }, { "epoch": 0.25575336517585756, "grad_norm": 0.04228653386235237, "learning_rate": 0.00018308227114716109, "loss": 0.0508, "step": 589 }, { "epoch": 0.25618758141554493, "grad_norm": 0.04101753979921341, "learning_rate": 0.00018305330243337196, "loss": 0.0498, "step": 590 }, { "epoch": 0.2566217976552323, "grad_norm": 0.03216276690363884, "learning_rate": 0.00018302433371958284, "loss": 0.0477, "step": 591 }, { "epoch": 0.25705601389491967, "grad_norm": 0.027524884790182114, "learning_rate": 0.00018299536500579374, "loss": 0.038, "step": 592 }, { "epoch": 0.25749023013460703, "grad_norm": 0.03750011324882507, "learning_rate": 0.00018296639629200465, "loss": 0.0477, "step": 593 }, { "epoch": 0.2579244463742944, "grad_norm": 0.024011045694351196, "learning_rate": 0.00018293742757821552, "loss": 0.0373, "step": 594 }, { "epoch": 0.25835866261398177, "grad_norm": 0.048863571137189865, "learning_rate": 0.00018290845886442643, "loss": 0.0486, "step": 595 }, { "epoch": 0.25879287885366914, "grad_norm": 0.03564349561929703, "learning_rate": 0.00018287949015063733, "loss": 0.0435, "step": 596 }, { "epoch": 0.2592270950933565, "grad_norm": 0.04342077299952507, "learning_rate": 0.0001828505214368482, "loss": 0.0493, "step": 597 }, { "epoch": 0.2596613113330439, "grad_norm": 0.03753993660211563, "learning_rate": 0.00018282155272305908, "loss": 0.0543, "step": 598 }, { "epoch": 0.26009552757273124, "grad_norm": 0.03594928979873657, "learning_rate": 0.00018279258400927, "loss": 0.047, "step": 599 }, { "epoch": 0.26052974381241856, "grad_norm": 0.033650897443294525, "learning_rate": 0.0001827636152954809, "loss": 0.0502, "step": 600 }, { "epoch": 0.2609639600521059, "grad_norm": 0.026554353535175323, "learning_rate": 0.00018273464658169177, "loss": 0.0449, "step": 601 }, { "epoch": 0.2613981762917933, "grad_norm": 0.040216732770204544, "learning_rate": 0.00018270567786790267, "loss": 0.0682, "step": 602 }, { "epoch": 0.26183239253148066, "grad_norm": 0.02836969494819641, "learning_rate": 0.00018267670915411358, "loss": 0.0466, "step": 603 }, { "epoch": 0.26226660877116803, "grad_norm": 0.03196130692958832, "learning_rate": 0.00018264774044032448, "loss": 0.0495, "step": 604 }, { "epoch": 0.2627008250108554, "grad_norm": 0.02840746007859707, "learning_rate": 0.00018261877172653536, "loss": 0.0418, "step": 605 }, { "epoch": 0.26313504125054277, "grad_norm": 0.03264332190155983, "learning_rate": 0.00018258980301274623, "loss": 0.0513, "step": 606 }, { "epoch": 0.26356925749023014, "grad_norm": 0.03162454068660736, "learning_rate": 0.00018256083429895714, "loss": 0.0466, "step": 607 }, { "epoch": 0.2640034737299175, "grad_norm": 0.029927769675850868, "learning_rate": 0.00018253186558516804, "loss": 0.0438, "step": 608 }, { "epoch": 0.26443768996960487, "grad_norm": 0.03436635807156563, "learning_rate": 0.00018250289687137892, "loss": 0.0474, "step": 609 }, { "epoch": 0.26487190620929224, "grad_norm": 0.031710848212242126, "learning_rate": 0.00018247392815758982, "loss": 0.0484, "step": 610 }, { "epoch": 0.2653061224489796, "grad_norm": 0.035200051963329315, "learning_rate": 0.00018244495944380072, "loss": 0.0482, "step": 611 }, { "epoch": 0.265740338688667, "grad_norm": 0.03654586896300316, "learning_rate": 0.0001824159907300116, "loss": 0.05, "step": 612 }, { "epoch": 0.26617455492835435, "grad_norm": 0.034888964146375656, "learning_rate": 0.00018238702201622248, "loss": 0.0506, "step": 613 }, { "epoch": 0.26660877116804166, "grad_norm": 0.034716833382844925, "learning_rate": 0.00018235805330243338, "loss": 0.0526, "step": 614 }, { "epoch": 0.267042987407729, "grad_norm": 0.0297621451318264, "learning_rate": 0.00018232908458864428, "loss": 0.042, "step": 615 }, { "epoch": 0.2674772036474164, "grad_norm": 0.03025101125240326, "learning_rate": 0.00018230011587485516, "loss": 0.0399, "step": 616 }, { "epoch": 0.26791141988710376, "grad_norm": 0.04356067627668381, "learning_rate": 0.00018227114716106607, "loss": 0.0603, "step": 617 }, { "epoch": 0.26834563612679113, "grad_norm": 0.03192654997110367, "learning_rate": 0.00018224217844727697, "loss": 0.0447, "step": 618 }, { "epoch": 0.2687798523664785, "grad_norm": 0.03219345584511757, "learning_rate": 0.00018221320973348785, "loss": 0.0432, "step": 619 }, { "epoch": 0.26921406860616587, "grad_norm": 0.04129676893353462, "learning_rate": 0.00018218424101969872, "loss": 0.0458, "step": 620 }, { "epoch": 0.26964828484585324, "grad_norm": 0.049140963703393936, "learning_rate": 0.00018215527230590963, "loss": 0.067, "step": 621 }, { "epoch": 0.2700825010855406, "grad_norm": 0.030686063691973686, "learning_rate": 0.00018212630359212053, "loss": 0.0364, "step": 622 }, { "epoch": 0.270516717325228, "grad_norm": 0.028311580419540405, "learning_rate": 0.0001820973348783314, "loss": 0.0384, "step": 623 }, { "epoch": 0.27095093356491534, "grad_norm": 0.02727629616856575, "learning_rate": 0.0001820683661645423, "loss": 0.0372, "step": 624 }, { "epoch": 0.2713851498046027, "grad_norm": 0.03879628702998161, "learning_rate": 0.0001820393974507532, "loss": 0.0489, "step": 625 }, { "epoch": 0.2718193660442901, "grad_norm": 0.03557194769382477, "learning_rate": 0.0001820104287369641, "loss": 0.0538, "step": 626 }, { "epoch": 0.27225358228397745, "grad_norm": 0.03163350746035576, "learning_rate": 0.00018198146002317497, "loss": 0.0471, "step": 627 }, { "epoch": 0.27268779852366476, "grad_norm": 0.03281879797577858, "learning_rate": 0.00018195249130938587, "loss": 0.0411, "step": 628 }, { "epoch": 0.27312201476335213, "grad_norm": 0.03194601461291313, "learning_rate": 0.00018192352259559677, "loss": 0.0469, "step": 629 }, { "epoch": 0.2735562310030395, "grad_norm": 0.033936552703380585, "learning_rate": 0.00018189455388180765, "loss": 0.0474, "step": 630 }, { "epoch": 0.27399044724272686, "grad_norm": 0.025946224108338356, "learning_rate": 0.00018186558516801855, "loss": 0.0358, "step": 631 }, { "epoch": 0.27442466348241423, "grad_norm": 0.04391651973128319, "learning_rate": 0.00018183661645422946, "loss": 0.065, "step": 632 }, { "epoch": 0.2748588797221016, "grad_norm": 0.050722911953926086, "learning_rate": 0.00018180764774044033, "loss": 0.0756, "step": 633 }, { "epoch": 0.27529309596178897, "grad_norm": 0.027286313474178314, "learning_rate": 0.0001817786790266512, "loss": 0.0397, "step": 634 }, { "epoch": 0.27572731220147634, "grad_norm": 0.0289190411567688, "learning_rate": 0.00018174971031286212, "loss": 0.037, "step": 635 }, { "epoch": 0.2761615284411637, "grad_norm": 0.028289958834648132, "learning_rate": 0.00018172074159907302, "loss": 0.0397, "step": 636 }, { "epoch": 0.2765957446808511, "grad_norm": 0.033169280737638474, "learning_rate": 0.0001816917728852839, "loss": 0.0417, "step": 637 }, { "epoch": 0.27702996092053844, "grad_norm": 0.042330868542194366, "learning_rate": 0.0001816628041714948, "loss": 0.0551, "step": 638 }, { "epoch": 0.2774641771602258, "grad_norm": 0.030897991731762886, "learning_rate": 0.0001816338354577057, "loss": 0.041, "step": 639 }, { "epoch": 0.2778983933999132, "grad_norm": 0.04191277548670769, "learning_rate": 0.00018160486674391658, "loss": 0.0591, "step": 640 }, { "epoch": 0.27833260963960055, "grad_norm": 0.027812523767352104, "learning_rate": 0.00018157589803012746, "loss": 0.0349, "step": 641 }, { "epoch": 0.27876682587928786, "grad_norm": 0.030904652550816536, "learning_rate": 0.00018154692931633836, "loss": 0.0382, "step": 642 }, { "epoch": 0.27920104211897523, "grad_norm": 0.031487010419368744, "learning_rate": 0.00018151796060254926, "loss": 0.0397, "step": 643 }, { "epoch": 0.2796352583586626, "grad_norm": 0.03278053179383278, "learning_rate": 0.00018148899188876014, "loss": 0.0466, "step": 644 }, { "epoch": 0.28006947459834997, "grad_norm": 0.042889855802059174, "learning_rate": 0.00018146002317497104, "loss": 0.0486, "step": 645 }, { "epoch": 0.28050369083803733, "grad_norm": 0.03492758050560951, "learning_rate": 0.00018143105446118195, "loss": 0.0441, "step": 646 }, { "epoch": 0.2809379070777247, "grad_norm": 0.03515778109431267, "learning_rate": 0.00018140208574739282, "loss": 0.0547, "step": 647 }, { "epoch": 0.28137212331741207, "grad_norm": 0.03690196946263313, "learning_rate": 0.0001813731170336037, "loss": 0.0548, "step": 648 }, { "epoch": 0.28180633955709944, "grad_norm": 0.024906666949391365, "learning_rate": 0.0001813441483198146, "loss": 0.036, "step": 649 }, { "epoch": 0.2822405557967868, "grad_norm": 0.03311024606227875, "learning_rate": 0.0001813151796060255, "loss": 0.0409, "step": 650 }, { "epoch": 0.2826747720364742, "grad_norm": 0.038247447460889816, "learning_rate": 0.00018128621089223638, "loss": 0.0492, "step": 651 }, { "epoch": 0.28310898827616154, "grad_norm": 0.03390941768884659, "learning_rate": 0.0001812572421784473, "loss": 0.0491, "step": 652 }, { "epoch": 0.2835432045158489, "grad_norm": 0.03830808401107788, "learning_rate": 0.0001812282734646582, "loss": 0.0521, "step": 653 }, { "epoch": 0.2839774207555363, "grad_norm": 0.04006361961364746, "learning_rate": 0.00018119930475086907, "loss": 0.051, "step": 654 }, { "epoch": 0.28441163699522365, "grad_norm": 0.03229132667183876, "learning_rate": 0.00018117033603707995, "loss": 0.0458, "step": 655 }, { "epoch": 0.28484585323491096, "grad_norm": 0.036902815103530884, "learning_rate": 0.00018114136732329085, "loss": 0.055, "step": 656 }, { "epoch": 0.28528006947459833, "grad_norm": 0.03119436465203762, "learning_rate": 0.00018111239860950175, "loss": 0.0372, "step": 657 }, { "epoch": 0.2857142857142857, "grad_norm": 0.03793225437402725, "learning_rate": 0.00018108342989571263, "loss": 0.0546, "step": 658 }, { "epoch": 0.28614850195397307, "grad_norm": 0.03288763761520386, "learning_rate": 0.00018105446118192353, "loss": 0.0547, "step": 659 }, { "epoch": 0.28658271819366044, "grad_norm": 0.03336387127637863, "learning_rate": 0.00018102549246813444, "loss": 0.046, "step": 660 }, { "epoch": 0.2870169344333478, "grad_norm": 0.02799719013273716, "learning_rate": 0.00018099652375434531, "loss": 0.042, "step": 661 }, { "epoch": 0.28745115067303517, "grad_norm": 0.04032472148537636, "learning_rate": 0.0001809675550405562, "loss": 0.0493, "step": 662 }, { "epoch": 0.28788536691272254, "grad_norm": 0.03296314552426338, "learning_rate": 0.0001809385863267671, "loss": 0.0525, "step": 663 }, { "epoch": 0.2883195831524099, "grad_norm": 0.032065317034721375, "learning_rate": 0.000180909617612978, "loss": 0.047, "step": 664 }, { "epoch": 0.2887537993920973, "grad_norm": 0.027486996725201607, "learning_rate": 0.00018088064889918887, "loss": 0.0452, "step": 665 }, { "epoch": 0.28918801563178465, "grad_norm": 0.030706362798810005, "learning_rate": 0.00018085168018539978, "loss": 0.0428, "step": 666 }, { "epoch": 0.289622231871472, "grad_norm": 0.033242758363485336, "learning_rate": 0.00018082271147161068, "loss": 0.0484, "step": 667 }, { "epoch": 0.2900564481111594, "grad_norm": 0.024343930184841156, "learning_rate": 0.00018079374275782156, "loss": 0.0417, "step": 668 }, { "epoch": 0.2904906643508467, "grad_norm": 0.034791357815265656, "learning_rate": 0.00018076477404403244, "loss": 0.0487, "step": 669 }, { "epoch": 0.29092488059053406, "grad_norm": 0.03738206997513771, "learning_rate": 0.00018073580533024334, "loss": 0.0624, "step": 670 }, { "epoch": 0.29135909683022143, "grad_norm": 0.031620655208826065, "learning_rate": 0.00018070683661645424, "loss": 0.0425, "step": 671 }, { "epoch": 0.2917933130699088, "grad_norm": 0.03915546461939812, "learning_rate": 0.00018067786790266512, "loss": 0.0583, "step": 672 }, { "epoch": 0.29222752930959617, "grad_norm": 0.02743075042963028, "learning_rate": 0.00018064889918887602, "loss": 0.0395, "step": 673 }, { "epoch": 0.29266174554928354, "grad_norm": 0.04210470989346504, "learning_rate": 0.00018061993047508693, "loss": 0.0602, "step": 674 }, { "epoch": 0.2930959617889709, "grad_norm": 0.026883462443947792, "learning_rate": 0.0001805909617612978, "loss": 0.0388, "step": 675 }, { "epoch": 0.2935301780286583, "grad_norm": 0.031257402151823044, "learning_rate": 0.00018056199304750868, "loss": 0.0432, "step": 676 }, { "epoch": 0.29396439426834564, "grad_norm": 0.03315794840455055, "learning_rate": 0.00018053302433371958, "loss": 0.0412, "step": 677 }, { "epoch": 0.294398610508033, "grad_norm": 0.0570780485868454, "learning_rate": 0.0001805040556199305, "loss": 0.0682, "step": 678 }, { "epoch": 0.2948328267477204, "grad_norm": 0.02653060480952263, "learning_rate": 0.00018047508690614136, "loss": 0.034, "step": 679 }, { "epoch": 0.29526704298740775, "grad_norm": 0.033346988260746, "learning_rate": 0.00018044611819235227, "loss": 0.0504, "step": 680 }, { "epoch": 0.2957012592270951, "grad_norm": 0.02801245078444481, "learning_rate": 0.00018041714947856317, "loss": 0.0393, "step": 681 }, { "epoch": 0.2961354754667825, "grad_norm": 0.03244206681847572, "learning_rate": 0.00018038818076477407, "loss": 0.0393, "step": 682 }, { "epoch": 0.2965696917064698, "grad_norm": 0.03742178902029991, "learning_rate": 0.00018035921205098495, "loss": 0.055, "step": 683 }, { "epoch": 0.29700390794615716, "grad_norm": 0.03287533298134804, "learning_rate": 0.00018033024333719583, "loss": 0.0452, "step": 684 }, { "epoch": 0.29743812418584453, "grad_norm": 0.037382807582616806, "learning_rate": 0.00018030127462340673, "loss": 0.0578, "step": 685 }, { "epoch": 0.2978723404255319, "grad_norm": 0.028814688324928284, "learning_rate": 0.0001802723059096176, "loss": 0.0395, "step": 686 }, { "epoch": 0.29830655666521927, "grad_norm": 0.030391838401556015, "learning_rate": 0.0001802433371958285, "loss": 0.0456, "step": 687 }, { "epoch": 0.29874077290490664, "grad_norm": 0.028521986678242683, "learning_rate": 0.00018021436848203942, "loss": 0.0424, "step": 688 }, { "epoch": 0.299174989144594, "grad_norm": 0.03750445321202278, "learning_rate": 0.00018018539976825032, "loss": 0.0547, "step": 689 }, { "epoch": 0.2996092053842814, "grad_norm": 0.03369120880961418, "learning_rate": 0.0001801564310544612, "loss": 0.0543, "step": 690 }, { "epoch": 0.30004342162396874, "grad_norm": 0.036905452609062195, "learning_rate": 0.00018012746234067207, "loss": 0.0526, "step": 691 }, { "epoch": 0.3004776378636561, "grad_norm": 0.028907816857099533, "learning_rate": 0.00018009849362688298, "loss": 0.041, "step": 692 }, { "epoch": 0.3009118541033435, "grad_norm": 0.039762016385793686, "learning_rate": 0.00018006952491309385, "loss": 0.0626, "step": 693 }, { "epoch": 0.30134607034303085, "grad_norm": 0.032995376735925674, "learning_rate": 0.00018004055619930476, "loss": 0.0445, "step": 694 }, { "epoch": 0.3017802865827182, "grad_norm": 0.031932998448610306, "learning_rate": 0.00018001158748551566, "loss": 0.0486, "step": 695 }, { "epoch": 0.3022145028224056, "grad_norm": 0.03803734481334686, "learning_rate": 0.00017998261877172656, "loss": 0.0525, "step": 696 }, { "epoch": 0.3026487190620929, "grad_norm": 0.03954422473907471, "learning_rate": 0.00017995365005793744, "loss": 0.0555, "step": 697 }, { "epoch": 0.30308293530178027, "grad_norm": 0.04148051515221596, "learning_rate": 0.00017992468134414832, "loss": 0.0532, "step": 698 }, { "epoch": 0.30351715154146763, "grad_norm": 0.03282377868890762, "learning_rate": 0.00017989571263035922, "loss": 0.0459, "step": 699 }, { "epoch": 0.303951367781155, "grad_norm": 0.036772143095731735, "learning_rate": 0.0001798667439165701, "loss": 0.0458, "step": 700 }, { "epoch": 0.30438558402084237, "grad_norm": 0.032748933881521225, "learning_rate": 0.000179837775202781, "loss": 0.046, "step": 701 }, { "epoch": 0.30481980026052974, "grad_norm": 0.03613371402025223, "learning_rate": 0.0001798088064889919, "loss": 0.0501, "step": 702 }, { "epoch": 0.3052540165002171, "grad_norm": 0.028015349060297012, "learning_rate": 0.0001797798377752028, "loss": 0.0408, "step": 703 }, { "epoch": 0.3056882327399045, "grad_norm": 0.03355312719941139, "learning_rate": 0.00017975086906141369, "loss": 0.0479, "step": 704 }, { "epoch": 0.30612244897959184, "grad_norm": 0.03612188622355461, "learning_rate": 0.00017972190034762456, "loss": 0.0544, "step": 705 }, { "epoch": 0.3065566652192792, "grad_norm": 0.043637290596961975, "learning_rate": 0.00017969293163383547, "loss": 0.0519, "step": 706 }, { "epoch": 0.3069908814589666, "grad_norm": 0.029089052230119705, "learning_rate": 0.00017966396292004637, "loss": 0.047, "step": 707 }, { "epoch": 0.30742509769865395, "grad_norm": 0.041218243539333344, "learning_rate": 0.00017963499420625725, "loss": 0.0561, "step": 708 }, { "epoch": 0.3078593139383413, "grad_norm": 0.030620872974395752, "learning_rate": 0.00017960602549246815, "loss": 0.0478, "step": 709 }, { "epoch": 0.3082935301780287, "grad_norm": 0.039345983415842056, "learning_rate": 0.00017957705677867905, "loss": 0.0584, "step": 710 }, { "epoch": 0.308727746417716, "grad_norm": 0.024773817509412766, "learning_rate": 0.00017954808806488993, "loss": 0.0327, "step": 711 }, { "epoch": 0.30916196265740337, "grad_norm": 0.03313567116856575, "learning_rate": 0.0001795191193511008, "loss": 0.055, "step": 712 }, { "epoch": 0.30959617889709073, "grad_norm": 0.03679310157895088, "learning_rate": 0.0001794901506373117, "loss": 0.0516, "step": 713 }, { "epoch": 0.3100303951367781, "grad_norm": 0.03680863603949547, "learning_rate": 0.00017946118192352261, "loss": 0.0433, "step": 714 }, { "epoch": 0.31046461137646547, "grad_norm": 0.043273404240608215, "learning_rate": 0.0001794322132097335, "loss": 0.0823, "step": 715 }, { "epoch": 0.31089882761615284, "grad_norm": 0.03878754749894142, "learning_rate": 0.0001794032444959444, "loss": 0.0567, "step": 716 }, { "epoch": 0.3113330438558402, "grad_norm": 0.03643855080008507, "learning_rate": 0.0001793742757821553, "loss": 0.0614, "step": 717 }, { "epoch": 0.3117672600955276, "grad_norm": 0.032215382903814316, "learning_rate": 0.00017934530706836618, "loss": 0.0424, "step": 718 }, { "epoch": 0.31220147633521494, "grad_norm": 0.031243428587913513, "learning_rate": 0.00017931633835457705, "loss": 0.0476, "step": 719 }, { "epoch": 0.3126356925749023, "grad_norm": 0.03387127444148064, "learning_rate": 0.00017928736964078796, "loss": 0.0565, "step": 720 }, { "epoch": 0.3130699088145897, "grad_norm": 0.03808264806866646, "learning_rate": 0.00017925840092699886, "loss": 0.0538, "step": 721 }, { "epoch": 0.31350412505427705, "grad_norm": 0.041264161467552185, "learning_rate": 0.00017922943221320974, "loss": 0.0672, "step": 722 }, { "epoch": 0.3139383412939644, "grad_norm": 0.04635841026902199, "learning_rate": 0.00017920046349942064, "loss": 0.0596, "step": 723 }, { "epoch": 0.31437255753365173, "grad_norm": 0.034705616533756256, "learning_rate": 0.00017917149478563154, "loss": 0.0571, "step": 724 }, { "epoch": 0.3148067737733391, "grad_norm": 0.030876323580741882, "learning_rate": 0.00017914252607184242, "loss": 0.0454, "step": 725 }, { "epoch": 0.31524099001302647, "grad_norm": 0.02999948337674141, "learning_rate": 0.0001791135573580533, "loss": 0.0422, "step": 726 }, { "epoch": 0.31567520625271384, "grad_norm": 0.03863374516367912, "learning_rate": 0.0001790845886442642, "loss": 0.0583, "step": 727 }, { "epoch": 0.3161094224924012, "grad_norm": 0.030729513615369797, "learning_rate": 0.0001790556199304751, "loss": 0.0423, "step": 728 }, { "epoch": 0.3165436387320886, "grad_norm": 0.03130311891436577, "learning_rate": 0.00017902665121668598, "loss": 0.0424, "step": 729 }, { "epoch": 0.31697785497177594, "grad_norm": 0.03811647370457649, "learning_rate": 0.00017899768250289688, "loss": 0.0542, "step": 730 }, { "epoch": 0.3174120712114633, "grad_norm": 0.030381064862012863, "learning_rate": 0.0001789687137891078, "loss": 0.0465, "step": 731 }, { "epoch": 0.3178462874511507, "grad_norm": 0.03165702894330025, "learning_rate": 0.00017893974507531866, "loss": 0.0426, "step": 732 }, { "epoch": 0.31828050369083805, "grad_norm": 0.027540862560272217, "learning_rate": 0.00017891077636152954, "loss": 0.0378, "step": 733 }, { "epoch": 0.3187147199305254, "grad_norm": 0.02677670679986477, "learning_rate": 0.00017888180764774044, "loss": 0.0391, "step": 734 }, { "epoch": 0.3191489361702128, "grad_norm": 0.04038514569401741, "learning_rate": 0.00017885283893395135, "loss": 0.0541, "step": 735 }, { "epoch": 0.31958315240990015, "grad_norm": 0.03133655712008476, "learning_rate": 0.00017882387022016223, "loss": 0.0469, "step": 736 }, { "epoch": 0.3200173686495875, "grad_norm": 0.03268914669752121, "learning_rate": 0.00017879490150637313, "loss": 0.0401, "step": 737 }, { "epoch": 0.32045158488927483, "grad_norm": 0.033875495195388794, "learning_rate": 0.00017876593279258403, "loss": 0.044, "step": 738 }, { "epoch": 0.3208858011289622, "grad_norm": 0.04104755073785782, "learning_rate": 0.0001787369640787949, "loss": 0.0625, "step": 739 }, { "epoch": 0.32132001736864957, "grad_norm": 0.03961517661809921, "learning_rate": 0.00017870799536500579, "loss": 0.0447, "step": 740 }, { "epoch": 0.32175423360833694, "grad_norm": 0.034381914883852005, "learning_rate": 0.0001786790266512167, "loss": 0.0459, "step": 741 }, { "epoch": 0.3221884498480243, "grad_norm": 0.02899251878261566, "learning_rate": 0.0001786500579374276, "loss": 0.036, "step": 742 }, { "epoch": 0.3226226660877117, "grad_norm": 0.035669952630996704, "learning_rate": 0.00017862108922363847, "loss": 0.0528, "step": 743 }, { "epoch": 0.32305688232739904, "grad_norm": 0.0378962866961956, "learning_rate": 0.00017859212050984937, "loss": 0.0447, "step": 744 }, { "epoch": 0.3234910985670864, "grad_norm": 0.0491696298122406, "learning_rate": 0.00017856315179606028, "loss": 0.0448, "step": 745 }, { "epoch": 0.3239253148067738, "grad_norm": 0.023796536028385162, "learning_rate": 0.00017853418308227115, "loss": 0.0337, "step": 746 }, { "epoch": 0.32435953104646115, "grad_norm": 0.03326389938592911, "learning_rate": 0.00017850521436848203, "loss": 0.0446, "step": 747 }, { "epoch": 0.3247937472861485, "grad_norm": 0.03254936635494232, "learning_rate": 0.00017847624565469293, "loss": 0.0374, "step": 748 }, { "epoch": 0.3252279635258359, "grad_norm": 0.02903556451201439, "learning_rate": 0.00017844727694090384, "loss": 0.0378, "step": 749 }, { "epoch": 0.32566217976552325, "grad_norm": 0.035492800176143646, "learning_rate": 0.00017841830822711471, "loss": 0.0495, "step": 750 }, { "epoch": 0.3260963960052106, "grad_norm": 0.03714393079280853, "learning_rate": 0.00017838933951332562, "loss": 0.0479, "step": 751 }, { "epoch": 0.32653061224489793, "grad_norm": 0.041072774678468704, "learning_rate": 0.00017836037079953652, "loss": 0.047, "step": 752 }, { "epoch": 0.3269648284845853, "grad_norm": 0.03148976340889931, "learning_rate": 0.0001783314020857474, "loss": 0.0393, "step": 753 }, { "epoch": 0.32739904472427267, "grad_norm": 0.03571334481239319, "learning_rate": 0.00017830243337195828, "loss": 0.0514, "step": 754 }, { "epoch": 0.32783326096396004, "grad_norm": 0.03773289918899536, "learning_rate": 0.00017827346465816918, "loss": 0.0561, "step": 755 }, { "epoch": 0.3282674772036474, "grad_norm": 0.03419439122080803, "learning_rate": 0.00017824449594438008, "loss": 0.0505, "step": 756 }, { "epoch": 0.3287016934433348, "grad_norm": 0.03685329109430313, "learning_rate": 0.00017821552723059096, "loss": 0.0493, "step": 757 }, { "epoch": 0.32913590968302214, "grad_norm": 0.027102336287498474, "learning_rate": 0.00017818655851680186, "loss": 0.0416, "step": 758 }, { "epoch": 0.3295701259227095, "grad_norm": 0.03721369802951813, "learning_rate": 0.00017815758980301277, "loss": 0.0578, "step": 759 }, { "epoch": 0.3300043421623969, "grad_norm": 0.028465505689382553, "learning_rate": 0.00017812862108922367, "loss": 0.0426, "step": 760 }, { "epoch": 0.33043855840208425, "grad_norm": 0.040655717253685, "learning_rate": 0.00017809965237543455, "loss": 0.0589, "step": 761 }, { "epoch": 0.3308727746417716, "grad_norm": 0.03556610271334648, "learning_rate": 0.00017807068366164542, "loss": 0.0506, "step": 762 }, { "epoch": 0.331306990881459, "grad_norm": 0.027362601831555367, "learning_rate": 0.00017804171494785633, "loss": 0.035, "step": 763 }, { "epoch": 0.33174120712114635, "grad_norm": 0.028054367750883102, "learning_rate": 0.0001780127462340672, "loss": 0.04, "step": 764 }, { "epoch": 0.3321754233608337, "grad_norm": 0.04060087352991104, "learning_rate": 0.0001779837775202781, "loss": 0.0606, "step": 765 }, { "epoch": 0.33260963960052103, "grad_norm": 0.03554994612932205, "learning_rate": 0.000177954808806489, "loss": 0.0531, "step": 766 }, { "epoch": 0.3330438558402084, "grad_norm": 0.031461309641599655, "learning_rate": 0.00017792584009269992, "loss": 0.0408, "step": 767 }, { "epoch": 0.33347807207989577, "grad_norm": 0.03467322140932083, "learning_rate": 0.0001778968713789108, "loss": 0.0494, "step": 768 }, { "epoch": 0.33391228831958314, "grad_norm": 0.03043636865913868, "learning_rate": 0.00017786790266512167, "loss": 0.0461, "step": 769 }, { "epoch": 0.3343465045592705, "grad_norm": 0.02550501935184002, "learning_rate": 0.00017783893395133257, "loss": 0.0333, "step": 770 }, { "epoch": 0.3347807207989579, "grad_norm": 0.030812392011284828, "learning_rate": 0.00017780996523754345, "loss": 0.0459, "step": 771 }, { "epoch": 0.33521493703864524, "grad_norm": 0.029298804700374603, "learning_rate": 0.00017778099652375435, "loss": 0.0395, "step": 772 }, { "epoch": 0.3356491532783326, "grad_norm": 0.03454957157373428, "learning_rate": 0.00017775202780996526, "loss": 0.0482, "step": 773 }, { "epoch": 0.33608336951802, "grad_norm": 0.029581697657704353, "learning_rate": 0.00017772305909617616, "loss": 0.0393, "step": 774 }, { "epoch": 0.33651758575770735, "grad_norm": 0.033482737839221954, "learning_rate": 0.00017769409038238704, "loss": 0.0439, "step": 775 }, { "epoch": 0.3369518019973947, "grad_norm": 0.03552280738949776, "learning_rate": 0.0001776651216685979, "loss": 0.0436, "step": 776 }, { "epoch": 0.3373860182370821, "grad_norm": 0.02487865649163723, "learning_rate": 0.00017763615295480882, "loss": 0.0392, "step": 777 }, { "epoch": 0.33782023447676945, "grad_norm": 0.03153971955180168, "learning_rate": 0.0001776071842410197, "loss": 0.0361, "step": 778 }, { "epoch": 0.3382544507164568, "grad_norm": 0.03945176675915718, "learning_rate": 0.0001775782155272306, "loss": 0.0663, "step": 779 }, { "epoch": 0.33868866695614414, "grad_norm": 0.036744795739650726, "learning_rate": 0.0001775492468134415, "loss": 0.0406, "step": 780 }, { "epoch": 0.3391228831958315, "grad_norm": 0.04276387393474579, "learning_rate": 0.0001775202780996524, "loss": 0.0484, "step": 781 }, { "epoch": 0.33955709943551887, "grad_norm": 0.03183692693710327, "learning_rate": 0.00017749130938586328, "loss": 0.0377, "step": 782 }, { "epoch": 0.33999131567520624, "grad_norm": 0.03713219240307808, "learning_rate": 0.00017746234067207416, "loss": 0.052, "step": 783 }, { "epoch": 0.3404255319148936, "grad_norm": 0.03282071277499199, "learning_rate": 0.00017743337195828506, "loss": 0.0441, "step": 784 }, { "epoch": 0.340859748154581, "grad_norm": 0.03618557006120682, "learning_rate": 0.00017740440324449594, "loss": 0.0502, "step": 785 }, { "epoch": 0.34129396439426835, "grad_norm": 0.03606418892741203, "learning_rate": 0.00017737543453070684, "loss": 0.0448, "step": 786 }, { "epoch": 0.3417281806339557, "grad_norm": 0.02929144911468029, "learning_rate": 0.00017734646581691775, "loss": 0.0403, "step": 787 }, { "epoch": 0.3421623968736431, "grad_norm": 0.05539466813206673, "learning_rate": 0.00017731749710312865, "loss": 0.0607, "step": 788 }, { "epoch": 0.34259661311333045, "grad_norm": 0.03443344309926033, "learning_rate": 0.00017728852838933953, "loss": 0.0563, "step": 789 }, { "epoch": 0.3430308293530178, "grad_norm": 0.03521762043237686, "learning_rate": 0.0001772595596755504, "loss": 0.0496, "step": 790 }, { "epoch": 0.3434650455927052, "grad_norm": 0.02786187268793583, "learning_rate": 0.0001772305909617613, "loss": 0.0392, "step": 791 }, { "epoch": 0.34389926183239256, "grad_norm": 0.03294853866100311, "learning_rate": 0.00017720162224797218, "loss": 0.0552, "step": 792 }, { "epoch": 0.34433347807207987, "grad_norm": 0.03113682009279728, "learning_rate": 0.0001771726535341831, "loss": 0.0437, "step": 793 }, { "epoch": 0.34476769431176724, "grad_norm": 0.03387298434972763, "learning_rate": 0.000177143684820394, "loss": 0.0487, "step": 794 }, { "epoch": 0.3452019105514546, "grad_norm": 0.03514786809682846, "learning_rate": 0.0001771147161066049, "loss": 0.0531, "step": 795 }, { "epoch": 0.345636126791142, "grad_norm": 0.024749118834733963, "learning_rate": 0.00017708574739281577, "loss": 0.0343, "step": 796 }, { "epoch": 0.34607034303082934, "grad_norm": 0.031341057270765305, "learning_rate": 0.00017705677867902665, "loss": 0.0502, "step": 797 }, { "epoch": 0.3465045592705167, "grad_norm": 0.03254907578229904, "learning_rate": 0.00017702780996523755, "loss": 0.0475, "step": 798 }, { "epoch": 0.3469387755102041, "grad_norm": 0.02680947259068489, "learning_rate": 0.00017699884125144845, "loss": 0.0415, "step": 799 }, { "epoch": 0.34737299174989145, "grad_norm": 0.02892196550965309, "learning_rate": 0.00017696987253765933, "loss": 0.0464, "step": 800 }, { "epoch": 0.3478072079895788, "grad_norm": 0.04500152915716171, "learning_rate": 0.00017694090382387024, "loss": 0.0573, "step": 801 }, { "epoch": 0.3482414242292662, "grad_norm": 0.02908325381577015, "learning_rate": 0.00017691193511008114, "loss": 0.0436, "step": 802 }, { "epoch": 0.34867564046895355, "grad_norm": 0.0353679396212101, "learning_rate": 0.00017688296639629202, "loss": 0.0498, "step": 803 }, { "epoch": 0.3491098567086409, "grad_norm": 0.027662228792905807, "learning_rate": 0.0001768539976825029, "loss": 0.0423, "step": 804 }, { "epoch": 0.3495440729483283, "grad_norm": 0.03129921853542328, "learning_rate": 0.0001768250289687138, "loss": 0.0492, "step": 805 }, { "epoch": 0.34997828918801566, "grad_norm": 0.02768275886774063, "learning_rate": 0.0001767960602549247, "loss": 0.0424, "step": 806 }, { "epoch": 0.35041250542770297, "grad_norm": 0.034509435296058655, "learning_rate": 0.00017676709154113558, "loss": 0.0489, "step": 807 }, { "epoch": 0.35084672166739034, "grad_norm": 0.032575029879808426, "learning_rate": 0.00017673812282734648, "loss": 0.0514, "step": 808 }, { "epoch": 0.3512809379070777, "grad_norm": 0.036231186240911484, "learning_rate": 0.00017670915411355738, "loss": 0.0508, "step": 809 }, { "epoch": 0.3517151541467651, "grad_norm": 0.034158580005168915, "learning_rate": 0.00017668018539976826, "loss": 0.0572, "step": 810 }, { "epoch": 0.35214937038645244, "grad_norm": 0.02618558146059513, "learning_rate": 0.00017665121668597914, "loss": 0.0383, "step": 811 }, { "epoch": 0.3525835866261398, "grad_norm": 0.04136792570352554, "learning_rate": 0.00017662224797219004, "loss": 0.0504, "step": 812 }, { "epoch": 0.3530178028658272, "grad_norm": 0.03034045733511448, "learning_rate": 0.00017659327925840094, "loss": 0.0401, "step": 813 }, { "epoch": 0.35345201910551455, "grad_norm": 0.03294108062982559, "learning_rate": 0.00017656431054461182, "loss": 0.044, "step": 814 }, { "epoch": 0.3538862353452019, "grad_norm": 0.03632107377052307, "learning_rate": 0.00017653534183082272, "loss": 0.0573, "step": 815 }, { "epoch": 0.3543204515848893, "grad_norm": 0.03182584419846535, "learning_rate": 0.00017650637311703363, "loss": 0.0483, "step": 816 }, { "epoch": 0.35475466782457665, "grad_norm": 0.051935162395238876, "learning_rate": 0.0001764774044032445, "loss": 0.0602, "step": 817 }, { "epoch": 0.355188884064264, "grad_norm": 0.025639750063419342, "learning_rate": 0.00017644843568945538, "loss": 0.0377, "step": 818 }, { "epoch": 0.3556231003039514, "grad_norm": 0.02837713249027729, "learning_rate": 0.00017641946697566629, "loss": 0.0402, "step": 819 }, { "epoch": 0.35605731654363876, "grad_norm": 0.0286050233989954, "learning_rate": 0.0001763904982618772, "loss": 0.0455, "step": 820 }, { "epoch": 0.35649153278332607, "grad_norm": 0.025697926059365273, "learning_rate": 0.00017636152954808807, "loss": 0.0426, "step": 821 }, { "epoch": 0.35692574902301344, "grad_norm": 0.03804675117135048, "learning_rate": 0.00017633256083429897, "loss": 0.0543, "step": 822 }, { "epoch": 0.3573599652627008, "grad_norm": 0.03078329935669899, "learning_rate": 0.00017630359212050987, "loss": 0.0438, "step": 823 }, { "epoch": 0.3577941815023882, "grad_norm": 0.030522380024194717, "learning_rate": 0.00017627462340672075, "loss": 0.0433, "step": 824 }, { "epoch": 0.35822839774207554, "grad_norm": 0.0301913321018219, "learning_rate": 0.00017624565469293163, "loss": 0.0424, "step": 825 }, { "epoch": 0.3586626139817629, "grad_norm": 0.03994838148355484, "learning_rate": 0.00017621668597914253, "loss": 0.0629, "step": 826 }, { "epoch": 0.3590968302214503, "grad_norm": 0.030238604173064232, "learning_rate": 0.00017618771726535343, "loss": 0.0459, "step": 827 }, { "epoch": 0.35953104646113765, "grad_norm": 0.039191439747810364, "learning_rate": 0.0001761587485515643, "loss": 0.0544, "step": 828 }, { "epoch": 0.359965262700825, "grad_norm": 0.03680328279733658, "learning_rate": 0.00017612977983777521, "loss": 0.0453, "step": 829 }, { "epoch": 0.3603994789405124, "grad_norm": 0.033231280744075775, "learning_rate": 0.00017610081112398612, "loss": 0.0547, "step": 830 }, { "epoch": 0.36083369518019975, "grad_norm": 0.03846496343612671, "learning_rate": 0.000176071842410197, "loss": 0.0594, "step": 831 }, { "epoch": 0.3612679114198871, "grad_norm": 0.032541606575250626, "learning_rate": 0.00017604287369640787, "loss": 0.0521, "step": 832 }, { "epoch": 0.3617021276595745, "grad_norm": 0.03952357918024063, "learning_rate": 0.00017601390498261877, "loss": 0.0638, "step": 833 }, { "epoch": 0.36213634389926186, "grad_norm": 0.03913275524973869, "learning_rate": 0.00017598493626882968, "loss": 0.062, "step": 834 }, { "epoch": 0.36257056013894917, "grad_norm": 0.028632601723074913, "learning_rate": 0.00017595596755504055, "loss": 0.0389, "step": 835 }, { "epoch": 0.36300477637863654, "grad_norm": 0.0306613277643919, "learning_rate": 0.00017592699884125146, "loss": 0.0392, "step": 836 }, { "epoch": 0.3634389926183239, "grad_norm": 0.034744858741760254, "learning_rate": 0.00017589803012746236, "loss": 0.0432, "step": 837 }, { "epoch": 0.3638732088580113, "grad_norm": 0.042042575776576996, "learning_rate": 0.00017586906141367324, "loss": 0.0584, "step": 838 }, { "epoch": 0.36430742509769865, "grad_norm": 0.031945545226335526, "learning_rate": 0.00017584009269988412, "loss": 0.0446, "step": 839 }, { "epoch": 0.364741641337386, "grad_norm": 0.03574046865105629, "learning_rate": 0.00017581112398609502, "loss": 0.0517, "step": 840 }, { "epoch": 0.3651758575770734, "grad_norm": 0.03979581966996193, "learning_rate": 0.00017578215527230592, "loss": 0.0493, "step": 841 }, { "epoch": 0.36561007381676075, "grad_norm": 0.032785564661026, "learning_rate": 0.0001757531865585168, "loss": 0.0418, "step": 842 }, { "epoch": 0.3660442900564481, "grad_norm": 0.028881819918751717, "learning_rate": 0.0001757242178447277, "loss": 0.0412, "step": 843 }, { "epoch": 0.3664785062961355, "grad_norm": 0.03528248518705368, "learning_rate": 0.0001756952491309386, "loss": 0.0501, "step": 844 }, { "epoch": 0.36691272253582286, "grad_norm": 0.030725127086043358, "learning_rate": 0.0001756662804171495, "loss": 0.0463, "step": 845 }, { "epoch": 0.3673469387755102, "grad_norm": 0.04483892023563385, "learning_rate": 0.0001756373117033604, "loss": 0.0657, "step": 846 }, { "epoch": 0.3677811550151976, "grad_norm": 0.03310380503535271, "learning_rate": 0.00017560834298957126, "loss": 0.046, "step": 847 }, { "epoch": 0.36821537125488496, "grad_norm": 0.056998878717422485, "learning_rate": 0.00017557937427578217, "loss": 0.1006, "step": 848 }, { "epoch": 0.3686495874945723, "grad_norm": 0.036690156906843185, "learning_rate": 0.00017555040556199304, "loss": 0.0605, "step": 849 }, { "epoch": 0.36908380373425964, "grad_norm": 0.03215920925140381, "learning_rate": 0.00017552143684820395, "loss": 0.0495, "step": 850 }, { "epoch": 0.369518019973947, "grad_norm": 0.03417474031448364, "learning_rate": 0.00017549246813441485, "loss": 0.0549, "step": 851 }, { "epoch": 0.3699522362136344, "grad_norm": 0.03306019678711891, "learning_rate": 0.00017546349942062576, "loss": 0.0481, "step": 852 }, { "epoch": 0.37038645245332175, "grad_norm": 0.03975765407085419, "learning_rate": 0.00017543453070683663, "loss": 0.0591, "step": 853 }, { "epoch": 0.3708206686930091, "grad_norm": 0.031365878880023956, "learning_rate": 0.0001754055619930475, "loss": 0.0465, "step": 854 }, { "epoch": 0.3712548849326965, "grad_norm": 0.0351068489253521, "learning_rate": 0.0001753765932792584, "loss": 0.0561, "step": 855 }, { "epoch": 0.37168910117238385, "grad_norm": 0.0460599921643734, "learning_rate": 0.0001753476245654693, "loss": 0.064, "step": 856 }, { "epoch": 0.3721233174120712, "grad_norm": 0.02412169799208641, "learning_rate": 0.0001753186558516802, "loss": 0.0366, "step": 857 }, { "epoch": 0.3725575336517586, "grad_norm": 0.03461502119898796, "learning_rate": 0.0001752896871378911, "loss": 0.0482, "step": 858 }, { "epoch": 0.37299174989144596, "grad_norm": 0.03145555034279823, "learning_rate": 0.000175260718424102, "loss": 0.0505, "step": 859 }, { "epoch": 0.3734259661311333, "grad_norm": 0.028822802007198334, "learning_rate": 0.00017523174971031288, "loss": 0.0437, "step": 860 }, { "epoch": 0.3738601823708207, "grad_norm": 0.031313102692365646, "learning_rate": 0.00017520278099652375, "loss": 0.0458, "step": 861 }, { "epoch": 0.374294398610508, "grad_norm": 0.029907209798693657, "learning_rate": 0.00017517381228273466, "loss": 0.0433, "step": 862 }, { "epoch": 0.3747286148501954, "grad_norm": 0.03319232165813446, "learning_rate": 0.00017514484356894553, "loss": 0.0479, "step": 863 }, { "epoch": 0.37516283108988274, "grad_norm": 0.03179497644305229, "learning_rate": 0.00017511587485515644, "loss": 0.05, "step": 864 }, { "epoch": 0.3755970473295701, "grad_norm": 0.03649448603391647, "learning_rate": 0.00017508690614136734, "loss": 0.0512, "step": 865 }, { "epoch": 0.3760312635692575, "grad_norm": 0.032105471938848495, "learning_rate": 0.00017505793742757824, "loss": 0.0399, "step": 866 }, { "epoch": 0.37646547980894485, "grad_norm": 0.034236740320920944, "learning_rate": 0.00017502896871378912, "loss": 0.0507, "step": 867 }, { "epoch": 0.3768996960486322, "grad_norm": 0.03651820123195648, "learning_rate": 0.000175, "loss": 0.0478, "step": 868 }, { "epoch": 0.3773339122883196, "grad_norm": 0.030957210808992386, "learning_rate": 0.0001749710312862109, "loss": 0.0477, "step": 869 }, { "epoch": 0.37776812852800695, "grad_norm": 0.03258143737912178, "learning_rate": 0.00017494206257242178, "loss": 0.0556, "step": 870 }, { "epoch": 0.3782023447676943, "grad_norm": 0.03698136657476425, "learning_rate": 0.00017491309385863268, "loss": 0.0489, "step": 871 }, { "epoch": 0.3786365610073817, "grad_norm": 0.03633569926023483, "learning_rate": 0.00017488412514484359, "loss": 0.05, "step": 872 }, { "epoch": 0.37907077724706906, "grad_norm": 0.028125159442424774, "learning_rate": 0.0001748551564310545, "loss": 0.0394, "step": 873 }, { "epoch": 0.3795049934867564, "grad_norm": 0.03799419477581978, "learning_rate": 0.00017482618771726537, "loss": 0.0501, "step": 874 }, { "epoch": 0.3799392097264438, "grad_norm": 0.025164257735013962, "learning_rate": 0.00017479721900347624, "loss": 0.0363, "step": 875 }, { "epoch": 0.3803734259661311, "grad_norm": 0.03272455558180809, "learning_rate": 0.00017476825028968715, "loss": 0.0474, "step": 876 }, { "epoch": 0.3808076422058185, "grad_norm": 0.02922961860895157, "learning_rate": 0.00017473928157589802, "loss": 0.0394, "step": 877 }, { "epoch": 0.38124185844550584, "grad_norm": 0.02723761461675167, "learning_rate": 0.00017471031286210893, "loss": 0.0385, "step": 878 }, { "epoch": 0.3816760746851932, "grad_norm": 0.03589416667819023, "learning_rate": 0.00017468134414831983, "loss": 0.051, "step": 879 }, { "epoch": 0.3821102909248806, "grad_norm": 0.03555101528763771, "learning_rate": 0.00017465237543453073, "loss": 0.0496, "step": 880 }, { "epoch": 0.38254450716456795, "grad_norm": 0.03393559530377388, "learning_rate": 0.0001746234067207416, "loss": 0.0453, "step": 881 }, { "epoch": 0.3829787234042553, "grad_norm": 0.03748873621225357, "learning_rate": 0.0001745944380069525, "loss": 0.0506, "step": 882 }, { "epoch": 0.3834129396439427, "grad_norm": 0.03534743934869766, "learning_rate": 0.0001745654692931634, "loss": 0.0534, "step": 883 }, { "epoch": 0.38384715588363005, "grad_norm": 0.03603522479534149, "learning_rate": 0.00017453650057937427, "loss": 0.043, "step": 884 }, { "epoch": 0.3842813721233174, "grad_norm": 0.0324513204395771, "learning_rate": 0.00017450753186558517, "loss": 0.0472, "step": 885 }, { "epoch": 0.3847155883630048, "grad_norm": 0.028880387544631958, "learning_rate": 0.00017447856315179608, "loss": 0.0414, "step": 886 }, { "epoch": 0.38514980460269216, "grad_norm": 0.03771184757351875, "learning_rate": 0.00017444959443800698, "loss": 0.05, "step": 887 }, { "epoch": 0.3855840208423795, "grad_norm": 0.03606418892741203, "learning_rate": 0.00017442062572421786, "loss": 0.0502, "step": 888 }, { "epoch": 0.3860182370820669, "grad_norm": 0.03513363003730774, "learning_rate": 0.00017439165701042873, "loss": 0.0484, "step": 889 }, { "epoch": 0.3864524533217542, "grad_norm": 0.028294511139392853, "learning_rate": 0.00017436268829663964, "loss": 0.0443, "step": 890 }, { "epoch": 0.3868866695614416, "grad_norm": 0.0335603803396225, "learning_rate": 0.0001743337195828505, "loss": 0.048, "step": 891 }, { "epoch": 0.38732088580112894, "grad_norm": 0.0278327539563179, "learning_rate": 0.00017430475086906142, "loss": 0.0379, "step": 892 }, { "epoch": 0.3877551020408163, "grad_norm": 0.0279851071536541, "learning_rate": 0.00017427578215527232, "loss": 0.0369, "step": 893 }, { "epoch": 0.3881893182805037, "grad_norm": 0.032952681183815, "learning_rate": 0.00017424681344148322, "loss": 0.0469, "step": 894 }, { "epoch": 0.38862353452019105, "grad_norm": 0.028144773095846176, "learning_rate": 0.0001742178447276941, "loss": 0.0441, "step": 895 }, { "epoch": 0.3890577507598784, "grad_norm": 0.02970105968415737, "learning_rate": 0.00017418887601390498, "loss": 0.0376, "step": 896 }, { "epoch": 0.3894919669995658, "grad_norm": 0.024731142446398735, "learning_rate": 0.00017415990730011588, "loss": 0.0377, "step": 897 }, { "epoch": 0.38992618323925315, "grad_norm": 0.030018705874681473, "learning_rate": 0.00017413093858632678, "loss": 0.0424, "step": 898 }, { "epoch": 0.3903603994789405, "grad_norm": 0.028162164613604546, "learning_rate": 0.00017410196987253766, "loss": 0.0436, "step": 899 }, { "epoch": 0.3907946157186279, "grad_norm": 0.035514798015356064, "learning_rate": 0.00017407300115874856, "loss": 0.052, "step": 900 }, { "epoch": 0.39122883195831526, "grad_norm": 0.032899241894483566, "learning_rate": 0.00017404403244495947, "loss": 0.0503, "step": 901 }, { "epoch": 0.39166304819800263, "grad_norm": 0.028493542224168777, "learning_rate": 0.00017401506373117035, "loss": 0.0398, "step": 902 }, { "epoch": 0.39209726443769, "grad_norm": 0.03478928655385971, "learning_rate": 0.00017398609501738122, "loss": 0.0453, "step": 903 }, { "epoch": 0.3925314806773773, "grad_norm": 0.03423327952623367, "learning_rate": 0.00017395712630359213, "loss": 0.054, "step": 904 }, { "epoch": 0.3929656969170647, "grad_norm": 0.035531461238861084, "learning_rate": 0.00017392815758980303, "loss": 0.0542, "step": 905 }, { "epoch": 0.39339991315675205, "grad_norm": 0.037643611431121826, "learning_rate": 0.0001738991888760139, "loss": 0.0561, "step": 906 }, { "epoch": 0.3938341293964394, "grad_norm": 0.035972096025943756, "learning_rate": 0.0001738702201622248, "loss": 0.049, "step": 907 }, { "epoch": 0.3942683456361268, "grad_norm": 0.030961114913225174, "learning_rate": 0.0001738412514484357, "loss": 0.0512, "step": 908 }, { "epoch": 0.39470256187581415, "grad_norm": 0.028138622641563416, "learning_rate": 0.0001738122827346466, "loss": 0.041, "step": 909 }, { "epoch": 0.3951367781155015, "grad_norm": 0.027596397325396538, "learning_rate": 0.00017378331402085747, "loss": 0.0388, "step": 910 }, { "epoch": 0.3955709943551889, "grad_norm": 0.036916330456733704, "learning_rate": 0.00017375434530706837, "loss": 0.0578, "step": 911 }, { "epoch": 0.39600521059487626, "grad_norm": 0.03938530012965202, "learning_rate": 0.00017372537659327927, "loss": 0.0574, "step": 912 }, { "epoch": 0.3964394268345636, "grad_norm": 0.029536884278059006, "learning_rate": 0.00017369640787949015, "loss": 0.0412, "step": 913 }, { "epoch": 0.396873643074251, "grad_norm": 0.023957977071404457, "learning_rate": 0.00017366743916570105, "loss": 0.0391, "step": 914 }, { "epoch": 0.39730785931393836, "grad_norm": 0.03193599358201027, "learning_rate": 0.00017363847045191196, "loss": 0.0479, "step": 915 }, { "epoch": 0.39774207555362573, "grad_norm": 0.027932552620768547, "learning_rate": 0.00017360950173812283, "loss": 0.0391, "step": 916 }, { "epoch": 0.3981762917933131, "grad_norm": 0.026805579662322998, "learning_rate": 0.0001735805330243337, "loss": 0.0342, "step": 917 }, { "epoch": 0.3986105080330004, "grad_norm": 0.02726161666214466, "learning_rate": 0.00017355156431054461, "loss": 0.0384, "step": 918 }, { "epoch": 0.3990447242726878, "grad_norm": 0.033837608993053436, "learning_rate": 0.00017352259559675552, "loss": 0.0476, "step": 919 }, { "epoch": 0.39947894051237515, "grad_norm": 0.02968410588800907, "learning_rate": 0.0001734936268829664, "loss": 0.0342, "step": 920 }, { "epoch": 0.3999131567520625, "grad_norm": 0.04314741864800453, "learning_rate": 0.0001734646581691773, "loss": 0.0576, "step": 921 }, { "epoch": 0.4003473729917499, "grad_norm": 0.032130517065525055, "learning_rate": 0.0001734356894553882, "loss": 0.0463, "step": 922 }, { "epoch": 0.40078158923143725, "grad_norm": 0.025740861892700195, "learning_rate": 0.00017340672074159908, "loss": 0.0345, "step": 923 }, { "epoch": 0.4012158054711246, "grad_norm": 0.03703179955482483, "learning_rate": 0.00017337775202780998, "loss": 0.0455, "step": 924 }, { "epoch": 0.401650021710812, "grad_norm": 0.033029261976480484, "learning_rate": 0.00017334878331402086, "loss": 0.0493, "step": 925 }, { "epoch": 0.40208423795049936, "grad_norm": 0.0313020683825016, "learning_rate": 0.00017331981460023176, "loss": 0.0482, "step": 926 }, { "epoch": 0.4025184541901867, "grad_norm": 0.025354797020554543, "learning_rate": 0.00017329084588644264, "loss": 0.0391, "step": 927 }, { "epoch": 0.4029526704298741, "grad_norm": 0.03517385944724083, "learning_rate": 0.00017326187717265354, "loss": 0.0551, "step": 928 }, { "epoch": 0.40338688666956146, "grad_norm": 0.0337984524667263, "learning_rate": 0.00017323290845886445, "loss": 0.0465, "step": 929 }, { "epoch": 0.40382110290924883, "grad_norm": 0.039554376155138016, "learning_rate": 0.00017320393974507532, "loss": 0.052, "step": 930 }, { "epoch": 0.40425531914893614, "grad_norm": 0.027436422184109688, "learning_rate": 0.00017317497103128623, "loss": 0.0407, "step": 931 }, { "epoch": 0.4046895353886235, "grad_norm": 0.03020455501973629, "learning_rate": 0.0001731460023174971, "loss": 0.038, "step": 932 }, { "epoch": 0.4051237516283109, "grad_norm": 0.034671034663915634, "learning_rate": 0.000173117033603708, "loss": 0.0482, "step": 933 }, { "epoch": 0.40555796786799825, "grad_norm": 0.030291734263300896, "learning_rate": 0.00017308806488991888, "loss": 0.0385, "step": 934 }, { "epoch": 0.4059921841076856, "grad_norm": 0.03580262139439583, "learning_rate": 0.0001730590961761298, "loss": 0.0513, "step": 935 }, { "epoch": 0.406426400347373, "grad_norm": 0.032775212079286575, "learning_rate": 0.0001730301274623407, "loss": 0.045, "step": 936 }, { "epoch": 0.40686061658706035, "grad_norm": 0.029379909858107567, "learning_rate": 0.00017300115874855157, "loss": 0.0346, "step": 937 }, { "epoch": 0.4072948328267477, "grad_norm": 0.0345950685441494, "learning_rate": 0.00017297219003476247, "loss": 0.0488, "step": 938 }, { "epoch": 0.4077290490664351, "grad_norm": 0.02821270190179348, "learning_rate": 0.00017294322132097335, "loss": 0.0421, "step": 939 }, { "epoch": 0.40816326530612246, "grad_norm": 0.031935833394527435, "learning_rate": 0.00017291425260718425, "loss": 0.0421, "step": 940 }, { "epoch": 0.4085974815458098, "grad_norm": 0.03434150665998459, "learning_rate": 0.00017288528389339513, "loss": 0.0408, "step": 941 }, { "epoch": 0.4090316977854972, "grad_norm": 0.03548818454146385, "learning_rate": 0.00017285631517960603, "loss": 0.0472, "step": 942 }, { "epoch": 0.40946591402518456, "grad_norm": 0.03209386765956879, "learning_rate": 0.00017282734646581694, "loss": 0.051, "step": 943 }, { "epoch": 0.40990013026487193, "grad_norm": 0.03075801022350788, "learning_rate": 0.00017279837775202784, "loss": 0.0408, "step": 944 }, { "epoch": 0.41033434650455924, "grad_norm": 0.040367018431425095, "learning_rate": 0.00017276940903823872, "loss": 0.0577, "step": 945 }, { "epoch": 0.4107685627442466, "grad_norm": 0.03569037839770317, "learning_rate": 0.0001727404403244496, "loss": 0.0428, "step": 946 }, { "epoch": 0.411202778983934, "grad_norm": 0.034485381096601486, "learning_rate": 0.0001727114716106605, "loss": 0.0502, "step": 947 }, { "epoch": 0.41163699522362135, "grad_norm": 0.030553176999092102, "learning_rate": 0.00017268250289687137, "loss": 0.0474, "step": 948 }, { "epoch": 0.4120712114633087, "grad_norm": 0.03037351183593273, "learning_rate": 0.00017265353418308228, "loss": 0.0441, "step": 949 }, { "epoch": 0.4125054277029961, "grad_norm": 0.029221635311841965, "learning_rate": 0.00017262456546929318, "loss": 0.0451, "step": 950 }, { "epoch": 0.41293964394268345, "grad_norm": 0.030891064554452896, "learning_rate": 0.00017259559675550409, "loss": 0.0469, "step": 951 }, { "epoch": 0.4133738601823708, "grad_norm": 0.038812361657619476, "learning_rate": 0.00017256662804171496, "loss": 0.0546, "step": 952 }, { "epoch": 0.4138080764220582, "grad_norm": 0.03283114731311798, "learning_rate": 0.00017253765932792584, "loss": 0.0377, "step": 953 }, { "epoch": 0.41424229266174556, "grad_norm": 0.031972333788871765, "learning_rate": 0.00017250869061413674, "loss": 0.0501, "step": 954 }, { "epoch": 0.4146765089014329, "grad_norm": 0.03369893133640289, "learning_rate": 0.00017247972190034762, "loss": 0.0508, "step": 955 }, { "epoch": 0.4151107251411203, "grad_norm": 0.029352284967899323, "learning_rate": 0.00017245075318655852, "loss": 0.0416, "step": 956 }, { "epoch": 0.41554494138080766, "grad_norm": 0.04007987305521965, "learning_rate": 0.00017242178447276943, "loss": 0.0509, "step": 957 }, { "epoch": 0.41597915762049503, "grad_norm": 0.03789031505584717, "learning_rate": 0.00017239281575898033, "loss": 0.0536, "step": 958 }, { "epoch": 0.41641337386018235, "grad_norm": 0.027904894202947617, "learning_rate": 0.0001723638470451912, "loss": 0.0411, "step": 959 }, { "epoch": 0.4168475900998697, "grad_norm": 0.024632325395941734, "learning_rate": 0.00017233487833140208, "loss": 0.0332, "step": 960 }, { "epoch": 0.4172818063395571, "grad_norm": 0.03315972909331322, "learning_rate": 0.000172305909617613, "loss": 0.0514, "step": 961 }, { "epoch": 0.41771602257924445, "grad_norm": 0.02900533378124237, "learning_rate": 0.00017227694090382386, "loss": 0.0449, "step": 962 }, { "epoch": 0.4181502388189318, "grad_norm": 0.03435368835926056, "learning_rate": 0.00017224797219003477, "loss": 0.052, "step": 963 }, { "epoch": 0.4185844550586192, "grad_norm": 0.03469766676425934, "learning_rate": 0.00017221900347624567, "loss": 0.0488, "step": 964 }, { "epoch": 0.41901867129830656, "grad_norm": 0.02837817743420601, "learning_rate": 0.00017219003476245657, "loss": 0.0385, "step": 965 }, { "epoch": 0.4194528875379939, "grad_norm": 0.03546156734228134, "learning_rate": 0.00017216106604866745, "loss": 0.0511, "step": 966 }, { "epoch": 0.4198871037776813, "grad_norm": 0.03588046878576279, "learning_rate": 0.00017213209733487833, "loss": 0.0549, "step": 967 }, { "epoch": 0.42032132001736866, "grad_norm": 0.030910860747098923, "learning_rate": 0.00017210312862108923, "loss": 0.0556, "step": 968 }, { "epoch": 0.42075553625705603, "grad_norm": 0.027249084785580635, "learning_rate": 0.0001720741599073001, "loss": 0.0449, "step": 969 }, { "epoch": 0.4211897524967434, "grad_norm": 0.032478030771017075, "learning_rate": 0.000172045191193511, "loss": 0.0489, "step": 970 }, { "epoch": 0.42162396873643077, "grad_norm": 0.03276272118091583, "learning_rate": 0.00017201622247972192, "loss": 0.0506, "step": 971 }, { "epoch": 0.42205818497611813, "grad_norm": 0.03584790229797363, "learning_rate": 0.00017198725376593282, "loss": 0.0461, "step": 972 }, { "epoch": 0.42249240121580545, "grad_norm": 0.034953001886606216, "learning_rate": 0.0001719582850521437, "loss": 0.0524, "step": 973 }, { "epoch": 0.4229266174554928, "grad_norm": 0.028279446065425873, "learning_rate": 0.00017192931633835457, "loss": 0.0446, "step": 974 }, { "epoch": 0.4233608336951802, "grad_norm": 0.041365597397089005, "learning_rate": 0.00017190034762456548, "loss": 0.0583, "step": 975 }, { "epoch": 0.42379504993486755, "grad_norm": 0.04159562662243843, "learning_rate": 0.00017187137891077635, "loss": 0.0645, "step": 976 }, { "epoch": 0.4242292661745549, "grad_norm": 0.02933673933148384, "learning_rate": 0.00017184241019698726, "loss": 0.0454, "step": 977 }, { "epoch": 0.4246634824142423, "grad_norm": 0.03089239075779915, "learning_rate": 0.00017181344148319816, "loss": 0.0481, "step": 978 }, { "epoch": 0.42509769865392966, "grad_norm": 0.0347575843334198, "learning_rate": 0.00017178447276940906, "loss": 0.0537, "step": 979 }, { "epoch": 0.425531914893617, "grad_norm": 0.02515512891113758, "learning_rate": 0.00017175550405561994, "loss": 0.0368, "step": 980 }, { "epoch": 0.4259661311333044, "grad_norm": 0.0293170977383852, "learning_rate": 0.00017172653534183082, "loss": 0.0404, "step": 981 }, { "epoch": 0.42640034737299176, "grad_norm": 0.033776022493839264, "learning_rate": 0.00017169756662804172, "loss": 0.044, "step": 982 }, { "epoch": 0.42683456361267913, "grad_norm": 0.028236735612154007, "learning_rate": 0.0001716685979142526, "loss": 0.0403, "step": 983 }, { "epoch": 0.4272687798523665, "grad_norm": 0.06138623133301735, "learning_rate": 0.0001716396292004635, "loss": 0.0929, "step": 984 }, { "epoch": 0.42770299609205387, "grad_norm": 0.037684179842472076, "learning_rate": 0.0001716106604866744, "loss": 0.0487, "step": 985 }, { "epoch": 0.42813721233174123, "grad_norm": 0.02903871424496174, "learning_rate": 0.0001715816917728853, "loss": 0.0422, "step": 986 }, { "epoch": 0.42857142857142855, "grad_norm": 0.030636316165328026, "learning_rate": 0.00017155272305909619, "loss": 0.0437, "step": 987 }, { "epoch": 0.4290056448111159, "grad_norm": 0.028834963217377663, "learning_rate": 0.00017152375434530706, "loss": 0.0481, "step": 988 }, { "epoch": 0.4294398610508033, "grad_norm": 0.024905307218432426, "learning_rate": 0.00017149478563151797, "loss": 0.0443, "step": 989 }, { "epoch": 0.42987407729049065, "grad_norm": 0.028678173199295998, "learning_rate": 0.00017146581691772887, "loss": 0.0395, "step": 990 }, { "epoch": 0.430308293530178, "grad_norm": 0.0322023443877697, "learning_rate": 0.00017143684820393975, "loss": 0.0459, "step": 991 }, { "epoch": 0.4307425097698654, "grad_norm": 0.027417009696364403, "learning_rate": 0.00017140787949015065, "loss": 0.0449, "step": 992 }, { "epoch": 0.43117672600955276, "grad_norm": 0.02587359957396984, "learning_rate": 0.00017137891077636155, "loss": 0.0395, "step": 993 }, { "epoch": 0.4316109422492401, "grad_norm": 0.033392224460840225, "learning_rate": 0.00017134994206257243, "loss": 0.0477, "step": 994 }, { "epoch": 0.4320451584889275, "grad_norm": 0.029676226899027824, "learning_rate": 0.0001713209733487833, "loss": 0.0465, "step": 995 }, { "epoch": 0.43247937472861486, "grad_norm": 0.03040267713367939, "learning_rate": 0.0001712920046349942, "loss": 0.0452, "step": 996 }, { "epoch": 0.43291359096830223, "grad_norm": 0.03452040255069733, "learning_rate": 0.00017126303592120511, "loss": 0.055, "step": 997 }, { "epoch": 0.4333478072079896, "grad_norm": 0.02380337007343769, "learning_rate": 0.000171234067207416, "loss": 0.0386, "step": 998 }, { "epoch": 0.43378202344767697, "grad_norm": 0.03348243609070778, "learning_rate": 0.0001712050984936269, "loss": 0.0696, "step": 999 }, { "epoch": 0.4342162396873643, "grad_norm": 0.03576110675930977, "learning_rate": 0.0001711761297798378, "loss": 0.0554, "step": 1000 }, { "epoch": 0.43465045592705165, "grad_norm": 0.028079437091946602, "learning_rate": 0.00017114716106604867, "loss": 0.04, "step": 1001 }, { "epoch": 0.435084672166739, "grad_norm": 0.033638644963502884, "learning_rate": 0.00017111819235225958, "loss": 0.0513, "step": 1002 }, { "epoch": 0.4355188884064264, "grad_norm": 0.031191788613796234, "learning_rate": 0.00017108922363847046, "loss": 0.0471, "step": 1003 }, { "epoch": 0.43595310464611375, "grad_norm": 0.027175866067409515, "learning_rate": 0.00017106025492468136, "loss": 0.0417, "step": 1004 }, { "epoch": 0.4363873208858011, "grad_norm": 0.029152212664484978, "learning_rate": 0.00017103128621089224, "loss": 0.0443, "step": 1005 }, { "epoch": 0.4368215371254885, "grad_norm": 0.02231225185096264, "learning_rate": 0.00017100231749710314, "loss": 0.038, "step": 1006 }, { "epoch": 0.43725575336517586, "grad_norm": 0.03328331187367439, "learning_rate": 0.00017097334878331404, "loss": 0.0398, "step": 1007 }, { "epoch": 0.4376899696048632, "grad_norm": 0.031838707625865936, "learning_rate": 0.00017094438006952492, "loss": 0.0433, "step": 1008 }, { "epoch": 0.4381241858445506, "grad_norm": 0.031498026102781296, "learning_rate": 0.00017091541135573582, "loss": 0.04, "step": 1009 }, { "epoch": 0.43855840208423796, "grad_norm": 0.03430824726819992, "learning_rate": 0.0001708864426419467, "loss": 0.0486, "step": 1010 }, { "epoch": 0.43899261832392533, "grad_norm": 0.024896496906876564, "learning_rate": 0.0001708574739281576, "loss": 0.0432, "step": 1011 }, { "epoch": 0.4394268345636127, "grad_norm": 0.025910722091794014, "learning_rate": 0.00017082850521436848, "loss": 0.0416, "step": 1012 }, { "epoch": 0.43986105080330007, "grad_norm": 0.04174089804291725, "learning_rate": 0.00017079953650057938, "loss": 0.0572, "step": 1013 }, { "epoch": 0.4402952670429874, "grad_norm": 0.02988690324127674, "learning_rate": 0.0001707705677867903, "loss": 0.04, "step": 1014 }, { "epoch": 0.44072948328267475, "grad_norm": 0.032539624720811844, "learning_rate": 0.00017074159907300116, "loss": 0.0481, "step": 1015 }, { "epoch": 0.4411636995223621, "grad_norm": 0.03739083930850029, "learning_rate": 0.00017071263035921207, "loss": 0.0431, "step": 1016 }, { "epoch": 0.4415979157620495, "grad_norm": 0.02792794071137905, "learning_rate": 0.00017068366164542294, "loss": 0.0431, "step": 1017 }, { "epoch": 0.44203213200173685, "grad_norm": 0.034184530377388, "learning_rate": 0.00017065469293163385, "loss": 0.0495, "step": 1018 }, { "epoch": 0.4424663482414242, "grad_norm": 0.03526041656732559, "learning_rate": 0.00017062572421784472, "loss": 0.0433, "step": 1019 }, { "epoch": 0.4429005644811116, "grad_norm": 0.03394777700304985, "learning_rate": 0.00017059675550405563, "loss": 0.0441, "step": 1020 }, { "epoch": 0.44333478072079896, "grad_norm": 0.043113965541124344, "learning_rate": 0.00017056778679026653, "loss": 0.0541, "step": 1021 }, { "epoch": 0.44376899696048633, "grad_norm": 0.028107045218348503, "learning_rate": 0.0001705388180764774, "loss": 0.0473, "step": 1022 }, { "epoch": 0.4442032132001737, "grad_norm": 0.030500447377562523, "learning_rate": 0.0001705098493626883, "loss": 0.0426, "step": 1023 }, { "epoch": 0.44463742943986106, "grad_norm": 0.030155744403600693, "learning_rate": 0.0001704808806488992, "loss": 0.0448, "step": 1024 }, { "epoch": 0.44507164567954843, "grad_norm": 0.030469929799437523, "learning_rate": 0.0001704519119351101, "loss": 0.0449, "step": 1025 }, { "epoch": 0.4455058619192358, "grad_norm": 0.03531277924776077, "learning_rate": 0.00017042294322132097, "loss": 0.0522, "step": 1026 }, { "epoch": 0.44594007815892317, "grad_norm": 0.04284236580133438, "learning_rate": 0.00017039397450753187, "loss": 0.0534, "step": 1027 }, { "epoch": 0.4463742943986105, "grad_norm": 0.03668149560689926, "learning_rate": 0.00017036500579374278, "loss": 0.0516, "step": 1028 }, { "epoch": 0.44680851063829785, "grad_norm": 0.029756693169474602, "learning_rate": 0.00017033603707995365, "loss": 0.0489, "step": 1029 }, { "epoch": 0.4472427268779852, "grad_norm": 0.027604207396507263, "learning_rate": 0.00017030706836616456, "loss": 0.0441, "step": 1030 }, { "epoch": 0.4476769431176726, "grad_norm": 0.02891652286052704, "learning_rate": 0.00017027809965237543, "loss": 0.0359, "step": 1031 }, { "epoch": 0.44811115935735996, "grad_norm": 0.031301744282245636, "learning_rate": 0.00017024913093858634, "loss": 0.0381, "step": 1032 }, { "epoch": 0.4485453755970473, "grad_norm": 0.03003089316189289, "learning_rate": 0.00017022016222479721, "loss": 0.0387, "step": 1033 }, { "epoch": 0.4489795918367347, "grad_norm": 0.029097583144903183, "learning_rate": 0.00017019119351100812, "loss": 0.043, "step": 1034 }, { "epoch": 0.44941380807642206, "grad_norm": 0.03599700331687927, "learning_rate": 0.00017016222479721902, "loss": 0.0515, "step": 1035 }, { "epoch": 0.44984802431610943, "grad_norm": 0.03560885414481163, "learning_rate": 0.00017013325608342993, "loss": 0.0462, "step": 1036 }, { "epoch": 0.4502822405557968, "grad_norm": 0.03297913074493408, "learning_rate": 0.0001701042873696408, "loss": 0.0559, "step": 1037 }, { "epoch": 0.45071645679548417, "grad_norm": 0.02658795565366745, "learning_rate": 0.00017007531865585168, "loss": 0.0373, "step": 1038 }, { "epoch": 0.45115067303517153, "grad_norm": 0.029905473813414574, "learning_rate": 0.00017004634994206258, "loss": 0.0377, "step": 1039 }, { "epoch": 0.4515848892748589, "grad_norm": 0.029432514682412148, "learning_rate": 0.00017001738122827346, "loss": 0.0474, "step": 1040 }, { "epoch": 0.45201910551454627, "grad_norm": 0.03128014877438545, "learning_rate": 0.00016998841251448436, "loss": 0.0482, "step": 1041 }, { "epoch": 0.4524533217542336, "grad_norm": 0.023013576865196228, "learning_rate": 0.00016995944380069527, "loss": 0.0322, "step": 1042 }, { "epoch": 0.45288753799392095, "grad_norm": 0.032422661781311035, "learning_rate": 0.00016993047508690617, "loss": 0.045, "step": 1043 }, { "epoch": 0.4533217542336083, "grad_norm": 0.024171508848667145, "learning_rate": 0.00016990150637311705, "loss": 0.0328, "step": 1044 }, { "epoch": 0.4537559704732957, "grad_norm": 0.02731773629784584, "learning_rate": 0.00016987253765932792, "loss": 0.0422, "step": 1045 }, { "epoch": 0.45419018671298306, "grad_norm": 0.02717318944633007, "learning_rate": 0.00016984356894553883, "loss": 0.0398, "step": 1046 }, { "epoch": 0.4546244029526704, "grad_norm": 0.027664760127663612, "learning_rate": 0.0001698146002317497, "loss": 0.0503, "step": 1047 }, { "epoch": 0.4550586191923578, "grad_norm": 0.036886509507894516, "learning_rate": 0.0001697856315179606, "loss": 0.0527, "step": 1048 }, { "epoch": 0.45549283543204516, "grad_norm": 0.024572983384132385, "learning_rate": 0.0001697566628041715, "loss": 0.0384, "step": 1049 }, { "epoch": 0.45592705167173253, "grad_norm": 0.036228764802217484, "learning_rate": 0.00016972769409038242, "loss": 0.0505, "step": 1050 }, { "epoch": 0.4563612679114199, "grad_norm": 0.040163446217775345, "learning_rate": 0.0001696987253765933, "loss": 0.0532, "step": 1051 }, { "epoch": 0.45679548415110727, "grad_norm": 0.04033424332737923, "learning_rate": 0.00016966975666280417, "loss": 0.0536, "step": 1052 }, { "epoch": 0.45722970039079464, "grad_norm": 0.034723129123449326, "learning_rate": 0.00016964078794901507, "loss": 0.0519, "step": 1053 }, { "epoch": 0.457663916630482, "grad_norm": 0.03609688580036163, "learning_rate": 0.00016961181923522595, "loss": 0.0541, "step": 1054 }, { "epoch": 0.4580981328701693, "grad_norm": 0.04541436582803726, "learning_rate": 0.00016958285052143685, "loss": 0.0508, "step": 1055 }, { "epoch": 0.4585323491098567, "grad_norm": 0.03178521618247032, "learning_rate": 0.00016955388180764776, "loss": 0.0441, "step": 1056 }, { "epoch": 0.45896656534954405, "grad_norm": 0.03495994955301285, "learning_rate": 0.00016952491309385866, "loss": 0.0505, "step": 1057 }, { "epoch": 0.4594007815892314, "grad_norm": 0.02626221254467964, "learning_rate": 0.00016949594438006954, "loss": 0.0361, "step": 1058 }, { "epoch": 0.4598349978289188, "grad_norm": 0.031108492985367775, "learning_rate": 0.0001694669756662804, "loss": 0.043, "step": 1059 }, { "epoch": 0.46026921406860616, "grad_norm": 0.028029577806591988, "learning_rate": 0.00016943800695249132, "loss": 0.0384, "step": 1060 }, { "epoch": 0.4607034303082935, "grad_norm": 0.03848305344581604, "learning_rate": 0.0001694090382387022, "loss": 0.049, "step": 1061 }, { "epoch": 0.4611376465479809, "grad_norm": 0.0273699052631855, "learning_rate": 0.0001693800695249131, "loss": 0.0401, "step": 1062 }, { "epoch": 0.46157186278766826, "grad_norm": 0.02904667891561985, "learning_rate": 0.000169351100811124, "loss": 0.043, "step": 1063 }, { "epoch": 0.46200607902735563, "grad_norm": 0.03567121922969818, "learning_rate": 0.0001693221320973349, "loss": 0.0541, "step": 1064 }, { "epoch": 0.462440295267043, "grad_norm": 0.028400305658578873, "learning_rate": 0.00016929316338354578, "loss": 0.0444, "step": 1065 }, { "epoch": 0.46287451150673037, "grad_norm": 0.020795537158846855, "learning_rate": 0.00016926419466975666, "loss": 0.0277, "step": 1066 }, { "epoch": 0.46330872774641774, "grad_norm": 0.023880701512098312, "learning_rate": 0.00016923522595596756, "loss": 0.0342, "step": 1067 }, { "epoch": 0.4637429439861051, "grad_norm": 0.021049542352557182, "learning_rate": 0.00016920625724217844, "loss": 0.0277, "step": 1068 }, { "epoch": 0.4641771602257924, "grad_norm": 0.03706882521510124, "learning_rate": 0.00016917728852838934, "loss": 0.053, "step": 1069 }, { "epoch": 0.4646113764654798, "grad_norm": 0.04312799870967865, "learning_rate": 0.00016914831981460025, "loss": 0.0473, "step": 1070 }, { "epoch": 0.46504559270516715, "grad_norm": 0.03167681396007538, "learning_rate": 0.00016911935110081115, "loss": 0.05, "step": 1071 }, { "epoch": 0.4654798089448545, "grad_norm": 0.03814166039228439, "learning_rate": 0.00016909038238702203, "loss": 0.0529, "step": 1072 }, { "epoch": 0.4659140251845419, "grad_norm": 0.04097817465662956, "learning_rate": 0.0001690614136732329, "loss": 0.0531, "step": 1073 }, { "epoch": 0.46634824142422926, "grad_norm": 0.03297990933060646, "learning_rate": 0.0001690324449594438, "loss": 0.0404, "step": 1074 }, { "epoch": 0.46678245766391663, "grad_norm": 0.031626272946596146, "learning_rate": 0.00016900347624565468, "loss": 0.0423, "step": 1075 }, { "epoch": 0.467216673903604, "grad_norm": 0.03450481593608856, "learning_rate": 0.00016897450753186559, "loss": 0.0481, "step": 1076 }, { "epoch": 0.46765089014329136, "grad_norm": 0.025056125596165657, "learning_rate": 0.0001689455388180765, "loss": 0.0351, "step": 1077 }, { "epoch": 0.46808510638297873, "grad_norm": 0.02817426435649395, "learning_rate": 0.0001689165701042874, "loss": 0.0389, "step": 1078 }, { "epoch": 0.4685193226226661, "grad_norm": 0.03598317503929138, "learning_rate": 0.00016888760139049827, "loss": 0.0492, "step": 1079 }, { "epoch": 0.46895353886235347, "grad_norm": 0.032032888382673264, "learning_rate": 0.00016885863267670917, "loss": 0.0445, "step": 1080 }, { "epoch": 0.46938775510204084, "grad_norm": 0.030501795932650566, "learning_rate": 0.00016882966396292005, "loss": 0.0393, "step": 1081 }, { "epoch": 0.4698219713417282, "grad_norm": 0.03600820526480675, "learning_rate": 0.00016880069524913093, "loss": 0.0492, "step": 1082 }, { "epoch": 0.4702561875814155, "grad_norm": 0.033180974423885345, "learning_rate": 0.00016877172653534183, "loss": 0.0374, "step": 1083 }, { "epoch": 0.4706904038211029, "grad_norm": 0.03191152215003967, "learning_rate": 0.00016874275782155273, "loss": 0.051, "step": 1084 }, { "epoch": 0.47112462006079026, "grad_norm": 0.024999383836984634, "learning_rate": 0.00016871378910776364, "loss": 0.0375, "step": 1085 }, { "epoch": 0.4715588363004776, "grad_norm": 0.03237331286072731, "learning_rate": 0.00016868482039397452, "loss": 0.042, "step": 1086 }, { "epoch": 0.471993052540165, "grad_norm": 0.029195362702012062, "learning_rate": 0.00016865585168018542, "loss": 0.0444, "step": 1087 }, { "epoch": 0.47242726877985236, "grad_norm": 0.02476334385573864, "learning_rate": 0.0001686268829663963, "loss": 0.035, "step": 1088 }, { "epoch": 0.47286148501953973, "grad_norm": 0.02538246288895607, "learning_rate": 0.0001685979142526072, "loss": 0.0335, "step": 1089 }, { "epoch": 0.4732957012592271, "grad_norm": 0.046175405383110046, "learning_rate": 0.00016856894553881808, "loss": 0.0749, "step": 1090 }, { "epoch": 0.47372991749891447, "grad_norm": 0.027586601674556732, "learning_rate": 0.00016853997682502898, "loss": 0.0451, "step": 1091 }, { "epoch": 0.47416413373860183, "grad_norm": 0.04073783755302429, "learning_rate": 0.00016851100811123988, "loss": 0.0561, "step": 1092 }, { "epoch": 0.4745983499782892, "grad_norm": 0.03376505896449089, "learning_rate": 0.00016848203939745076, "loss": 0.0547, "step": 1093 }, { "epoch": 0.47503256621797657, "grad_norm": 0.03574476018548012, "learning_rate": 0.00016845307068366166, "loss": 0.0515, "step": 1094 }, { "epoch": 0.47546678245766394, "grad_norm": 0.03387194871902466, "learning_rate": 0.00016842410196987254, "loss": 0.0488, "step": 1095 }, { "epoch": 0.4759009986973513, "grad_norm": 0.033859074115753174, "learning_rate": 0.00016839513325608344, "loss": 0.0478, "step": 1096 }, { "epoch": 0.4763352149370386, "grad_norm": 0.030819758772850037, "learning_rate": 0.00016836616454229432, "loss": 0.051, "step": 1097 }, { "epoch": 0.476769431176726, "grad_norm": 0.026491645723581314, "learning_rate": 0.00016833719582850522, "loss": 0.0427, "step": 1098 }, { "epoch": 0.47720364741641336, "grad_norm": 0.03483892232179642, "learning_rate": 0.00016830822711471613, "loss": 0.0506, "step": 1099 }, { "epoch": 0.4776378636561007, "grad_norm": 0.036432333290576935, "learning_rate": 0.000168279258400927, "loss": 0.0465, "step": 1100 }, { "epoch": 0.4780720798957881, "grad_norm": 0.03572588786482811, "learning_rate": 0.0001682502896871379, "loss": 0.051, "step": 1101 }, { "epoch": 0.47850629613547546, "grad_norm": 0.03004995919764042, "learning_rate": 0.00016822132097334878, "loss": 0.0422, "step": 1102 }, { "epoch": 0.47894051237516283, "grad_norm": 0.034211207181215286, "learning_rate": 0.0001681923522595597, "loss": 0.0455, "step": 1103 }, { "epoch": 0.4793747286148502, "grad_norm": 0.03502094745635986, "learning_rate": 0.00016816338354577057, "loss": 0.0456, "step": 1104 }, { "epoch": 0.47980894485453757, "grad_norm": 0.027699032798409462, "learning_rate": 0.00016813441483198147, "loss": 0.0405, "step": 1105 }, { "epoch": 0.48024316109422494, "grad_norm": 0.027569226920604706, "learning_rate": 0.00016810544611819237, "loss": 0.0397, "step": 1106 }, { "epoch": 0.4806773773339123, "grad_norm": 0.03084222786128521, "learning_rate": 0.00016807647740440325, "loss": 0.0422, "step": 1107 }, { "epoch": 0.48111159357359967, "grad_norm": 0.03008195199072361, "learning_rate": 0.00016804750869061415, "loss": 0.0406, "step": 1108 }, { "epoch": 0.48154580981328704, "grad_norm": 0.03481116145849228, "learning_rate": 0.00016801853997682503, "loss": 0.0488, "step": 1109 }, { "epoch": 0.4819800260529744, "grad_norm": 0.029380012303590775, "learning_rate": 0.00016798957126303593, "loss": 0.0402, "step": 1110 }, { "epoch": 0.4824142422926617, "grad_norm": 0.03915943205356598, "learning_rate": 0.0001679606025492468, "loss": 0.0508, "step": 1111 }, { "epoch": 0.4828484585323491, "grad_norm": 0.030712837353348732, "learning_rate": 0.00016793163383545771, "loss": 0.0495, "step": 1112 }, { "epoch": 0.48328267477203646, "grad_norm": 0.032065920531749725, "learning_rate": 0.00016790266512166862, "loss": 0.0427, "step": 1113 }, { "epoch": 0.4837168910117238, "grad_norm": 0.027290496975183487, "learning_rate": 0.0001678736964078795, "loss": 0.0386, "step": 1114 }, { "epoch": 0.4841511072514112, "grad_norm": 0.03288483992218971, "learning_rate": 0.0001678447276940904, "loss": 0.0522, "step": 1115 }, { "epoch": 0.48458532349109856, "grad_norm": 0.023322634398937225, "learning_rate": 0.00016781575898030127, "loss": 0.038, "step": 1116 }, { "epoch": 0.48501953973078593, "grad_norm": 0.03188279643654823, "learning_rate": 0.00016778679026651218, "loss": 0.0484, "step": 1117 }, { "epoch": 0.4854537559704733, "grad_norm": 0.027059955522418022, "learning_rate": 0.00016775782155272305, "loss": 0.0412, "step": 1118 }, { "epoch": 0.48588797221016067, "grad_norm": 0.02600109577178955, "learning_rate": 0.00016772885283893396, "loss": 0.0393, "step": 1119 }, { "epoch": 0.48632218844984804, "grad_norm": 0.03897339105606079, "learning_rate": 0.00016769988412514486, "loss": 0.0543, "step": 1120 }, { "epoch": 0.4867564046895354, "grad_norm": 0.02521538734436035, "learning_rate": 0.00016767091541135574, "loss": 0.0409, "step": 1121 }, { "epoch": 0.4871906209292228, "grad_norm": 0.0300771351903677, "learning_rate": 0.00016764194669756664, "loss": 0.047, "step": 1122 }, { "epoch": 0.48762483716891014, "grad_norm": 0.028528962284326553, "learning_rate": 0.00016761297798377752, "loss": 0.0399, "step": 1123 }, { "epoch": 0.48805905340859745, "grad_norm": 0.038500938564538956, "learning_rate": 0.00016758400926998842, "loss": 0.0573, "step": 1124 }, { "epoch": 0.4884932696482848, "grad_norm": 0.03472080081701279, "learning_rate": 0.0001675550405561993, "loss": 0.0522, "step": 1125 }, { "epoch": 0.4889274858879722, "grad_norm": 0.03446793556213379, "learning_rate": 0.0001675260718424102, "loss": 0.0527, "step": 1126 }, { "epoch": 0.48936170212765956, "grad_norm": 0.031338706612586975, "learning_rate": 0.0001674971031286211, "loss": 0.0457, "step": 1127 }, { "epoch": 0.4897959183673469, "grad_norm": 0.030975641682744026, "learning_rate": 0.00016746813441483198, "loss": 0.0486, "step": 1128 }, { "epoch": 0.4902301346070343, "grad_norm": 0.02801419049501419, "learning_rate": 0.0001674391657010429, "loss": 0.0421, "step": 1129 }, { "epoch": 0.49066435084672166, "grad_norm": 0.03950496017932892, "learning_rate": 0.00016741019698725376, "loss": 0.0494, "step": 1130 }, { "epoch": 0.49109856708640903, "grad_norm": 0.03970480337738991, "learning_rate": 0.00016738122827346467, "loss": 0.0517, "step": 1131 }, { "epoch": 0.4915327833260964, "grad_norm": 0.036078229546546936, "learning_rate": 0.00016735225955967554, "loss": 0.0476, "step": 1132 }, { "epoch": 0.49196699956578377, "grad_norm": 0.03275501728057861, "learning_rate": 0.00016732329084588645, "loss": 0.0486, "step": 1133 }, { "epoch": 0.49240121580547114, "grad_norm": 0.02779034525156021, "learning_rate": 0.00016729432213209735, "loss": 0.0408, "step": 1134 }, { "epoch": 0.4928354320451585, "grad_norm": 0.055153049528598785, "learning_rate": 0.00016726535341830826, "loss": 0.0473, "step": 1135 }, { "epoch": 0.4932696482848459, "grad_norm": 0.026913471519947052, "learning_rate": 0.00016723638470451913, "loss": 0.04, "step": 1136 }, { "epoch": 0.49370386452453324, "grad_norm": 0.03216119483113289, "learning_rate": 0.00016720741599073, "loss": 0.0504, "step": 1137 }, { "epoch": 0.49413808076422056, "grad_norm": 0.038199283182621, "learning_rate": 0.0001671784472769409, "loss": 0.0551, "step": 1138 }, { "epoch": 0.4945722970039079, "grad_norm": 0.032919686287641525, "learning_rate": 0.0001671494785631518, "loss": 0.0474, "step": 1139 }, { "epoch": 0.4950065132435953, "grad_norm": 0.02840450033545494, "learning_rate": 0.0001671205098493627, "loss": 0.0443, "step": 1140 }, { "epoch": 0.49544072948328266, "grad_norm": 0.03209400549530983, "learning_rate": 0.0001670915411355736, "loss": 0.0472, "step": 1141 }, { "epoch": 0.49587494572297003, "grad_norm": 0.030079878866672516, "learning_rate": 0.0001670625724217845, "loss": 0.0439, "step": 1142 }, { "epoch": 0.4963091619626574, "grad_norm": 0.02849772945046425, "learning_rate": 0.00016703360370799538, "loss": 0.0396, "step": 1143 }, { "epoch": 0.49674337820234477, "grad_norm": 0.026997458189725876, "learning_rate": 0.00016700463499420625, "loss": 0.0351, "step": 1144 }, { "epoch": 0.49717759444203213, "grad_norm": 0.028411293402314186, "learning_rate": 0.00016697566628041716, "loss": 0.0428, "step": 1145 }, { "epoch": 0.4976118106817195, "grad_norm": 0.04569128528237343, "learning_rate": 0.00016694669756662803, "loss": 0.0533, "step": 1146 }, { "epoch": 0.49804602692140687, "grad_norm": 0.03562667965888977, "learning_rate": 0.00016691772885283894, "loss": 0.0552, "step": 1147 }, { "epoch": 0.49848024316109424, "grad_norm": 0.031561631709337234, "learning_rate": 0.00016688876013904984, "loss": 0.0413, "step": 1148 }, { "epoch": 0.4989144594007816, "grad_norm": 0.03660324588418007, "learning_rate": 0.00016685979142526074, "loss": 0.0581, "step": 1149 }, { "epoch": 0.499348675640469, "grad_norm": 0.029149340465664864, "learning_rate": 0.00016683082271147162, "loss": 0.041, "step": 1150 }, { "epoch": 0.49978289188015634, "grad_norm": 0.03745060786604881, "learning_rate": 0.0001668018539976825, "loss": 0.0553, "step": 1151 }, { "epoch": 0.5002171081198437, "grad_norm": 0.0313265398144722, "learning_rate": 0.0001667728852838934, "loss": 0.0463, "step": 1152 }, { "epoch": 0.5006513243595311, "grad_norm": 0.03631487116217613, "learning_rate": 0.00016674391657010428, "loss": 0.0549, "step": 1153 }, { "epoch": 0.5010855405992184, "grad_norm": 0.035685837268829346, "learning_rate": 0.00016671494785631518, "loss": 0.0529, "step": 1154 }, { "epoch": 0.5015197568389058, "grad_norm": 0.03156263008713722, "learning_rate": 0.00016668597914252609, "loss": 0.0456, "step": 1155 }, { "epoch": 0.5019539730785931, "grad_norm": 0.027245815843343735, "learning_rate": 0.000166657010428737, "loss": 0.0406, "step": 1156 }, { "epoch": 0.5023881893182806, "grad_norm": 0.02925187721848488, "learning_rate": 0.00016662804171494787, "loss": 0.0431, "step": 1157 }, { "epoch": 0.5028224055579679, "grad_norm": 0.028183279559016228, "learning_rate": 0.00016659907300115877, "loss": 0.0344, "step": 1158 }, { "epoch": 0.5032566217976552, "grad_norm": 0.03626574948430061, "learning_rate": 0.00016657010428736965, "loss": 0.0522, "step": 1159 }, { "epoch": 0.5036908380373426, "grad_norm": 0.025158178061246872, "learning_rate": 0.00016654113557358052, "loss": 0.0328, "step": 1160 }, { "epoch": 0.5041250542770299, "grad_norm": 0.035807520151138306, "learning_rate": 0.00016651216685979143, "loss": 0.0497, "step": 1161 }, { "epoch": 0.5045592705167173, "grad_norm": 0.03843451663851738, "learning_rate": 0.00016648319814600233, "loss": 0.0591, "step": 1162 }, { "epoch": 0.5049934867564047, "grad_norm": 0.025603579357266426, "learning_rate": 0.00016645422943221323, "loss": 0.0391, "step": 1163 }, { "epoch": 0.5054277029960921, "grad_norm": 0.029121678322553635, "learning_rate": 0.0001664252607184241, "loss": 0.0455, "step": 1164 }, { "epoch": 0.5058619192357794, "grad_norm": 0.02927287109196186, "learning_rate": 0.00016639629200463501, "loss": 0.0409, "step": 1165 }, { "epoch": 0.5062961354754668, "grad_norm": 0.038543980568647385, "learning_rate": 0.0001663673232908459, "loss": 0.0556, "step": 1166 }, { "epoch": 0.5067303517151541, "grad_norm": 0.027179867029190063, "learning_rate": 0.00016633835457705677, "loss": 0.0423, "step": 1167 }, { "epoch": 0.5071645679548415, "grad_norm": 0.04105430468916893, "learning_rate": 0.00016630938586326767, "loss": 0.0508, "step": 1168 }, { "epoch": 0.5075987841945289, "grad_norm": 0.02582898549735546, "learning_rate": 0.00016628041714947858, "loss": 0.0313, "step": 1169 }, { "epoch": 0.5080330004342163, "grad_norm": 0.029978204518556595, "learning_rate": 0.00016625144843568948, "loss": 0.0469, "step": 1170 }, { "epoch": 0.5084672166739036, "grad_norm": 0.024874160066246986, "learning_rate": 0.00016622247972190036, "loss": 0.0348, "step": 1171 }, { "epoch": 0.508901432913591, "grad_norm": 0.029731178656220436, "learning_rate": 0.00016619351100811126, "loss": 0.0439, "step": 1172 }, { "epoch": 0.5093356491532783, "grad_norm": 0.03124183975160122, "learning_rate": 0.00016616454229432214, "loss": 0.0453, "step": 1173 }, { "epoch": 0.5097698653929656, "grad_norm": 0.034222207963466644, "learning_rate": 0.000166135573580533, "loss": 0.0575, "step": 1174 }, { "epoch": 0.5102040816326531, "grad_norm": 0.029798392206430435, "learning_rate": 0.00016610660486674392, "loss": 0.0391, "step": 1175 }, { "epoch": 0.5106382978723404, "grad_norm": 0.02710781805217266, "learning_rate": 0.00016607763615295482, "loss": 0.0385, "step": 1176 }, { "epoch": 0.5110725141120278, "grad_norm": 0.02773641236126423, "learning_rate": 0.00016604866743916572, "loss": 0.043, "step": 1177 }, { "epoch": 0.5115067303517151, "grad_norm": 0.03035992756485939, "learning_rate": 0.0001660196987253766, "loss": 0.0436, "step": 1178 }, { "epoch": 0.5119409465914025, "grad_norm": 0.03085564449429512, "learning_rate": 0.0001659907300115875, "loss": 0.0426, "step": 1179 }, { "epoch": 0.5123751628310899, "grad_norm": 0.03249194100499153, "learning_rate": 0.00016596176129779838, "loss": 0.045, "step": 1180 }, { "epoch": 0.5128093790707773, "grad_norm": 0.025214090943336487, "learning_rate": 0.00016593279258400928, "loss": 0.0391, "step": 1181 }, { "epoch": 0.5132435953104646, "grad_norm": 0.03008168190717697, "learning_rate": 0.00016590382387022016, "loss": 0.0447, "step": 1182 }, { "epoch": 0.513677811550152, "grad_norm": 0.025950267910957336, "learning_rate": 0.00016587485515643106, "loss": 0.0394, "step": 1183 }, { "epoch": 0.5141120277898393, "grad_norm": 0.03078186884522438, "learning_rate": 0.00016584588644264197, "loss": 0.0506, "step": 1184 }, { "epoch": 0.5145462440295268, "grad_norm": 0.02535959519445896, "learning_rate": 0.00016581691772885284, "loss": 0.0379, "step": 1185 }, { "epoch": 0.5149804602692141, "grad_norm": 0.029065504670143127, "learning_rate": 0.00016578794901506375, "loss": 0.0458, "step": 1186 }, { "epoch": 0.5154146765089014, "grad_norm": 0.032836709171533585, "learning_rate": 0.00016575898030127463, "loss": 0.0575, "step": 1187 }, { "epoch": 0.5158488927485888, "grad_norm": 0.031558144837617874, "learning_rate": 0.00016573001158748553, "loss": 0.0412, "step": 1188 }, { "epoch": 0.5162831089882761, "grad_norm": 0.02469601295888424, "learning_rate": 0.0001657010428736964, "loss": 0.0354, "step": 1189 }, { "epoch": 0.5167173252279635, "grad_norm": 0.03263497352600098, "learning_rate": 0.0001656720741599073, "loss": 0.0428, "step": 1190 }, { "epoch": 0.5171515414676509, "grad_norm": 0.029674503952264786, "learning_rate": 0.0001656431054461182, "loss": 0.0451, "step": 1191 }, { "epoch": 0.5175857577073383, "grad_norm": 0.03898968920111656, "learning_rate": 0.0001656141367323291, "loss": 0.0642, "step": 1192 }, { "epoch": 0.5180199739470256, "grad_norm": 0.03775753825902939, "learning_rate": 0.00016558516801854, "loss": 0.0494, "step": 1193 }, { "epoch": 0.518454190186713, "grad_norm": 0.030130507424473763, "learning_rate": 0.00016555619930475087, "loss": 0.0407, "step": 1194 }, { "epoch": 0.5188884064264003, "grad_norm": 0.03746167570352554, "learning_rate": 0.00016552723059096177, "loss": 0.0558, "step": 1195 }, { "epoch": 0.5193226226660878, "grad_norm": 0.025117045268416405, "learning_rate": 0.00016549826187717265, "loss": 0.0392, "step": 1196 }, { "epoch": 0.5197568389057751, "grad_norm": 0.04885869845747948, "learning_rate": 0.00016546929316338355, "loss": 0.0666, "step": 1197 }, { "epoch": 0.5201910551454625, "grad_norm": 0.03204965218901634, "learning_rate": 0.00016544032444959446, "loss": 0.0494, "step": 1198 }, { "epoch": 0.5206252713851498, "grad_norm": 0.03521885350346565, "learning_rate": 0.00016541135573580533, "loss": 0.0605, "step": 1199 }, { "epoch": 0.5210594876248371, "grad_norm": 0.030366791412234306, "learning_rate": 0.00016538238702201624, "loss": 0.044, "step": 1200 }, { "epoch": 0.5214937038645245, "grad_norm": 0.02963079698383808, "learning_rate": 0.00016535341830822711, "loss": 0.0481, "step": 1201 }, { "epoch": 0.5219279201042119, "grad_norm": 0.03448466211557388, "learning_rate": 0.00016532444959443802, "loss": 0.0446, "step": 1202 }, { "epoch": 0.5223621363438993, "grad_norm": 0.03431907668709755, "learning_rate": 0.0001652954808806489, "loss": 0.0498, "step": 1203 }, { "epoch": 0.5227963525835866, "grad_norm": 0.03172852098941803, "learning_rate": 0.0001652665121668598, "loss": 0.0458, "step": 1204 }, { "epoch": 0.523230568823274, "grad_norm": 0.03242357075214386, "learning_rate": 0.0001652375434530707, "loss": 0.0474, "step": 1205 }, { "epoch": 0.5236647850629613, "grad_norm": 0.03321189433336258, "learning_rate": 0.00016520857473928158, "loss": 0.0508, "step": 1206 }, { "epoch": 0.5240990013026487, "grad_norm": 0.03514818102121353, "learning_rate": 0.00016517960602549248, "loss": 0.0532, "step": 1207 }, { "epoch": 0.5245332175423361, "grad_norm": 0.026950767263770103, "learning_rate": 0.00016515063731170336, "loss": 0.0347, "step": 1208 }, { "epoch": 0.5249674337820235, "grad_norm": 0.04146554693579674, "learning_rate": 0.00016512166859791426, "loss": 0.061, "step": 1209 }, { "epoch": 0.5254016500217108, "grad_norm": 0.03356388956308365, "learning_rate": 0.00016509269988412514, "loss": 0.0432, "step": 1210 }, { "epoch": 0.5258358662613982, "grad_norm": 0.03596331551671028, "learning_rate": 0.00016506373117033604, "loss": 0.0516, "step": 1211 }, { "epoch": 0.5262700825010855, "grad_norm": 0.03201964870095253, "learning_rate": 0.00016503476245654695, "loss": 0.0446, "step": 1212 }, { "epoch": 0.526704298740773, "grad_norm": 0.031360529363155365, "learning_rate": 0.00016500579374275782, "loss": 0.0424, "step": 1213 }, { "epoch": 0.5271385149804603, "grad_norm": 0.029337311163544655, "learning_rate": 0.00016497682502896873, "loss": 0.0389, "step": 1214 }, { "epoch": 0.5275727312201476, "grad_norm": 0.03680749237537384, "learning_rate": 0.0001649478563151796, "loss": 0.0419, "step": 1215 }, { "epoch": 0.528006947459835, "grad_norm": 0.033498022705316544, "learning_rate": 0.0001649188876013905, "loss": 0.0517, "step": 1216 }, { "epoch": 0.5284411636995223, "grad_norm": 0.032476719468832016, "learning_rate": 0.00016488991888760138, "loss": 0.0468, "step": 1217 }, { "epoch": 0.5288753799392097, "grad_norm": 0.031201379373669624, "learning_rate": 0.0001648609501738123, "loss": 0.0491, "step": 1218 }, { "epoch": 0.5293095961788971, "grad_norm": 0.030970316380262375, "learning_rate": 0.0001648319814600232, "loss": 0.0428, "step": 1219 }, { "epoch": 0.5297438124185845, "grad_norm": 0.026726339012384415, "learning_rate": 0.00016480301274623407, "loss": 0.0354, "step": 1220 }, { "epoch": 0.5301780286582718, "grad_norm": 0.03373072296380997, "learning_rate": 0.00016477404403244497, "loss": 0.053, "step": 1221 }, { "epoch": 0.5306122448979592, "grad_norm": 0.03432188183069229, "learning_rate": 0.00016474507531865585, "loss": 0.0501, "step": 1222 }, { "epoch": 0.5310464611376465, "grad_norm": 0.025819454342126846, "learning_rate": 0.00016471610660486675, "loss": 0.0342, "step": 1223 }, { "epoch": 0.531480677377334, "grad_norm": 0.03156053274869919, "learning_rate": 0.00016468713789107763, "loss": 0.0448, "step": 1224 }, { "epoch": 0.5319148936170213, "grad_norm": 0.035119008272886276, "learning_rate": 0.00016465816917728853, "loss": 0.0433, "step": 1225 }, { "epoch": 0.5323491098567087, "grad_norm": 0.027719827368855476, "learning_rate": 0.00016462920046349944, "loss": 0.0401, "step": 1226 }, { "epoch": 0.532783326096396, "grad_norm": 0.031441863626241684, "learning_rate": 0.0001646002317497103, "loss": 0.0455, "step": 1227 }, { "epoch": 0.5332175423360833, "grad_norm": 0.04018000140786171, "learning_rate": 0.00016457126303592122, "loss": 0.0507, "step": 1228 }, { "epoch": 0.5336517585757707, "grad_norm": 0.028481340035796165, "learning_rate": 0.0001645422943221321, "loss": 0.0436, "step": 1229 }, { "epoch": 0.534085974815458, "grad_norm": 0.035127073526382446, "learning_rate": 0.000164513325608343, "loss": 0.0507, "step": 1230 }, { "epoch": 0.5345201910551455, "grad_norm": 0.0319664441049099, "learning_rate": 0.00016448435689455387, "loss": 0.0419, "step": 1231 }, { "epoch": 0.5349544072948328, "grad_norm": 0.03699595108628273, "learning_rate": 0.00016445538818076478, "loss": 0.0498, "step": 1232 }, { "epoch": 0.5353886235345202, "grad_norm": 0.028063226491212845, "learning_rate": 0.00016442641946697568, "loss": 0.0392, "step": 1233 }, { "epoch": 0.5358228397742075, "grad_norm": 0.03085392154753208, "learning_rate": 0.00016439745075318659, "loss": 0.0479, "step": 1234 }, { "epoch": 0.536257056013895, "grad_norm": 0.027172120288014412, "learning_rate": 0.00016436848203939746, "loss": 0.0396, "step": 1235 }, { "epoch": 0.5366912722535823, "grad_norm": 0.03202288597822189, "learning_rate": 0.00016433951332560837, "loss": 0.045, "step": 1236 }, { "epoch": 0.5371254884932697, "grad_norm": 0.03194000944495201, "learning_rate": 0.00016431054461181924, "loss": 0.0419, "step": 1237 }, { "epoch": 0.537559704732957, "grad_norm": 0.02779650129377842, "learning_rate": 0.00016428157589803012, "loss": 0.0394, "step": 1238 }, { "epoch": 0.5379939209726444, "grad_norm": 0.03396061435341835, "learning_rate": 0.00016425260718424102, "loss": 0.0558, "step": 1239 }, { "epoch": 0.5384281372123317, "grad_norm": 0.035278066992759705, "learning_rate": 0.00016422363847045193, "loss": 0.0505, "step": 1240 }, { "epoch": 0.5388623534520192, "grad_norm": 0.029738249257206917, "learning_rate": 0.00016419466975666283, "loss": 0.0445, "step": 1241 }, { "epoch": 0.5392965696917065, "grad_norm": 0.02887635864317417, "learning_rate": 0.0001641657010428737, "loss": 0.0483, "step": 1242 }, { "epoch": 0.5397307859313938, "grad_norm": 0.03161432221531868, "learning_rate": 0.0001641367323290846, "loss": 0.0451, "step": 1243 }, { "epoch": 0.5401650021710812, "grad_norm": 0.031038248911499977, "learning_rate": 0.0001641077636152955, "loss": 0.0476, "step": 1244 }, { "epoch": 0.5405992184107685, "grad_norm": 0.029218066483736038, "learning_rate": 0.00016407879490150636, "loss": 0.0392, "step": 1245 }, { "epoch": 0.541033434650456, "grad_norm": 0.022427549585700035, "learning_rate": 0.00016404982618771727, "loss": 0.0336, "step": 1246 }, { "epoch": 0.5414676508901433, "grad_norm": 0.03182215616106987, "learning_rate": 0.00016402085747392817, "loss": 0.0448, "step": 1247 }, { "epoch": 0.5419018671298307, "grad_norm": 0.04725756123661995, "learning_rate": 0.00016399188876013907, "loss": 0.0672, "step": 1248 }, { "epoch": 0.542336083369518, "grad_norm": 0.02602890506386757, "learning_rate": 0.00016396292004634995, "loss": 0.0399, "step": 1249 }, { "epoch": 0.5427702996092054, "grad_norm": 0.03450959175825119, "learning_rate": 0.00016393395133256085, "loss": 0.0522, "step": 1250 }, { "epoch": 0.5432045158488927, "grad_norm": 0.026919152587652206, "learning_rate": 0.00016390498261877173, "loss": 0.0421, "step": 1251 }, { "epoch": 0.5436387320885802, "grad_norm": 0.03205368295311928, "learning_rate": 0.0001638760139049826, "loss": 0.0444, "step": 1252 }, { "epoch": 0.5440729483282675, "grad_norm": 0.034211914986371994, "learning_rate": 0.0001638470451911935, "loss": 0.0526, "step": 1253 }, { "epoch": 0.5445071645679549, "grad_norm": 0.0332583524286747, "learning_rate": 0.00016381807647740442, "loss": 0.052, "step": 1254 }, { "epoch": 0.5449413808076422, "grad_norm": 0.026499096304178238, "learning_rate": 0.00016378910776361532, "loss": 0.041, "step": 1255 }, { "epoch": 0.5453755970473295, "grad_norm": 0.03625990077853203, "learning_rate": 0.0001637601390498262, "loss": 0.061, "step": 1256 }, { "epoch": 0.5458098132870169, "grad_norm": 0.03130260854959488, "learning_rate": 0.0001637311703360371, "loss": 0.0521, "step": 1257 }, { "epoch": 0.5462440295267043, "grad_norm": 0.02735084667801857, "learning_rate": 0.00016370220162224798, "loss": 0.0401, "step": 1258 }, { "epoch": 0.5466782457663917, "grad_norm": 0.04306270554661751, "learning_rate": 0.00016367323290845885, "loss": 0.0448, "step": 1259 }, { "epoch": 0.547112462006079, "grad_norm": 0.0372079573571682, "learning_rate": 0.00016364426419466976, "loss": 0.0528, "step": 1260 }, { "epoch": 0.5475466782457664, "grad_norm": 0.033278532326221466, "learning_rate": 0.00016361529548088066, "loss": 0.0474, "step": 1261 }, { "epoch": 0.5479808944854537, "grad_norm": 0.028759637847542763, "learning_rate": 0.00016358632676709156, "loss": 0.0368, "step": 1262 }, { "epoch": 0.5484151107251412, "grad_norm": 0.0344727449119091, "learning_rate": 0.00016355735805330244, "loss": 0.0474, "step": 1263 }, { "epoch": 0.5488493269648285, "grad_norm": 0.02515021525323391, "learning_rate": 0.00016352838933951334, "loss": 0.0386, "step": 1264 }, { "epoch": 0.5492835432045159, "grad_norm": 0.03275750204920769, "learning_rate": 0.00016349942062572422, "loss": 0.0454, "step": 1265 }, { "epoch": 0.5497177594442032, "grad_norm": 0.031178684905171394, "learning_rate": 0.0001634704519119351, "loss": 0.0461, "step": 1266 }, { "epoch": 0.5501519756838906, "grad_norm": 0.02901088260114193, "learning_rate": 0.000163441483198146, "loss": 0.0373, "step": 1267 }, { "epoch": 0.5505861919235779, "grad_norm": 0.03075389750301838, "learning_rate": 0.0001634125144843569, "loss": 0.0442, "step": 1268 }, { "epoch": 0.5510204081632653, "grad_norm": 0.027834711596369743, "learning_rate": 0.0001633835457705678, "loss": 0.0411, "step": 1269 }, { "epoch": 0.5514546244029527, "grad_norm": 0.029654689133167267, "learning_rate": 0.00016335457705677869, "loss": 0.0458, "step": 1270 }, { "epoch": 0.55188884064264, "grad_norm": 0.0335739366710186, "learning_rate": 0.0001633256083429896, "loss": 0.0438, "step": 1271 }, { "epoch": 0.5523230568823274, "grad_norm": 0.026805724948644638, "learning_rate": 0.00016329663962920047, "loss": 0.0385, "step": 1272 }, { "epoch": 0.5527572731220147, "grad_norm": 0.029269607737660408, "learning_rate": 0.00016326767091541134, "loss": 0.0381, "step": 1273 }, { "epoch": 0.5531914893617021, "grad_norm": 0.024596083909273148, "learning_rate": 0.00016323870220162225, "loss": 0.0323, "step": 1274 }, { "epoch": 0.5536257056013895, "grad_norm": 0.032651882618665695, "learning_rate": 0.00016320973348783315, "loss": 0.0422, "step": 1275 }, { "epoch": 0.5540599218410769, "grad_norm": 0.03486670181155205, "learning_rate": 0.00016318076477404405, "loss": 0.0511, "step": 1276 }, { "epoch": 0.5544941380807642, "grad_norm": 0.029618704691529274, "learning_rate": 0.00016315179606025493, "loss": 0.0423, "step": 1277 }, { "epoch": 0.5549283543204516, "grad_norm": 0.022985681891441345, "learning_rate": 0.00016312282734646583, "loss": 0.0328, "step": 1278 }, { "epoch": 0.5553625705601389, "grad_norm": 0.0381246916949749, "learning_rate": 0.0001630938586326767, "loss": 0.0512, "step": 1279 }, { "epoch": 0.5557967867998264, "grad_norm": 0.03607594594359398, "learning_rate": 0.00016306488991888761, "loss": 0.0522, "step": 1280 }, { "epoch": 0.5562310030395137, "grad_norm": 0.02748439833521843, "learning_rate": 0.0001630359212050985, "loss": 0.0318, "step": 1281 }, { "epoch": 0.5566652192792011, "grad_norm": 0.04156062379479408, "learning_rate": 0.0001630069524913094, "loss": 0.0505, "step": 1282 }, { "epoch": 0.5570994355188884, "grad_norm": 0.032055918127298355, "learning_rate": 0.0001629779837775203, "loss": 0.0408, "step": 1283 }, { "epoch": 0.5575336517585757, "grad_norm": 0.032944101840257645, "learning_rate": 0.00016294901506373117, "loss": 0.0403, "step": 1284 }, { "epoch": 0.5579678679982631, "grad_norm": 0.030304135754704475, "learning_rate": 0.00016292004634994208, "loss": 0.0428, "step": 1285 }, { "epoch": 0.5584020842379505, "grad_norm": 0.04210275784134865, "learning_rate": 0.00016289107763615295, "loss": 0.0602, "step": 1286 }, { "epoch": 0.5588363004776379, "grad_norm": 0.03526769205927849, "learning_rate": 0.00016286210892236386, "loss": 0.0474, "step": 1287 }, { "epoch": 0.5592705167173252, "grad_norm": 0.03166527673602104, "learning_rate": 0.00016283314020857474, "loss": 0.0459, "step": 1288 }, { "epoch": 0.5597047329570126, "grad_norm": 0.037422165274620056, "learning_rate": 0.00016280417149478564, "loss": 0.0501, "step": 1289 }, { "epoch": 0.5601389491966999, "grad_norm": 0.030003409832715988, "learning_rate": 0.00016277520278099654, "loss": 0.0387, "step": 1290 }, { "epoch": 0.5605731654363874, "grad_norm": 0.02560872584581375, "learning_rate": 0.00016274623406720742, "loss": 0.0369, "step": 1291 }, { "epoch": 0.5610073816760747, "grad_norm": 0.031058790162205696, "learning_rate": 0.00016271726535341832, "loss": 0.0472, "step": 1292 }, { "epoch": 0.5614415979157621, "grad_norm": 0.027479395270347595, "learning_rate": 0.0001626882966396292, "loss": 0.0415, "step": 1293 }, { "epoch": 0.5618758141554494, "grad_norm": 0.03153546527028084, "learning_rate": 0.0001626593279258401, "loss": 0.0513, "step": 1294 }, { "epoch": 0.5623100303951368, "grad_norm": 0.02401675470173359, "learning_rate": 0.00016263035921205098, "loss": 0.0359, "step": 1295 }, { "epoch": 0.5627442466348241, "grad_norm": 0.046038225293159485, "learning_rate": 0.00016260139049826188, "loss": 0.0497, "step": 1296 }, { "epoch": 0.5631784628745115, "grad_norm": 0.03048243187367916, "learning_rate": 0.0001625724217844728, "loss": 0.0383, "step": 1297 }, { "epoch": 0.5636126791141989, "grad_norm": 0.029688769951462746, "learning_rate": 0.00016254345307068366, "loss": 0.0465, "step": 1298 }, { "epoch": 0.5640468953538862, "grad_norm": 0.03205108270049095, "learning_rate": 0.00016251448435689457, "loss": 0.0469, "step": 1299 }, { "epoch": 0.5644811115935736, "grad_norm": 0.030784739181399345, "learning_rate": 0.00016248551564310544, "loss": 0.0463, "step": 1300 }, { "epoch": 0.5649153278332609, "grad_norm": 0.03051881305873394, "learning_rate": 0.00016245654692931635, "loss": 0.047, "step": 1301 }, { "epoch": 0.5653495440729484, "grad_norm": 0.026900973170995712, "learning_rate": 0.00016242757821552722, "loss": 0.039, "step": 1302 }, { "epoch": 0.5657837603126357, "grad_norm": 0.03469308093190193, "learning_rate": 0.00016239860950173813, "loss": 0.0415, "step": 1303 }, { "epoch": 0.5662179765523231, "grad_norm": 0.02492031455039978, "learning_rate": 0.00016236964078794903, "loss": 0.038, "step": 1304 }, { "epoch": 0.5666521927920104, "grad_norm": 0.03731763735413551, "learning_rate": 0.0001623406720741599, "loss": 0.0528, "step": 1305 }, { "epoch": 0.5670864090316978, "grad_norm": 0.03264200687408447, "learning_rate": 0.0001623117033603708, "loss": 0.043, "step": 1306 }, { "epoch": 0.5675206252713851, "grad_norm": 0.033300116658210754, "learning_rate": 0.0001622827346465817, "loss": 0.0481, "step": 1307 }, { "epoch": 0.5679548415110726, "grad_norm": 0.0314621701836586, "learning_rate": 0.0001622537659327926, "loss": 0.0446, "step": 1308 }, { "epoch": 0.5683890577507599, "grad_norm": 0.030670808628201485, "learning_rate": 0.00016222479721900347, "loss": 0.0422, "step": 1309 }, { "epoch": 0.5688232739904473, "grad_norm": 0.03397829458117485, "learning_rate": 0.00016219582850521437, "loss": 0.0494, "step": 1310 }, { "epoch": 0.5692574902301346, "grad_norm": 0.029048221185803413, "learning_rate": 0.00016216685979142528, "loss": 0.0418, "step": 1311 }, { "epoch": 0.5696917064698219, "grad_norm": 0.028934579342603683, "learning_rate": 0.00016213789107763615, "loss": 0.042, "step": 1312 }, { "epoch": 0.5701259227095093, "grad_norm": 0.03228578716516495, "learning_rate": 0.00016210892236384706, "loss": 0.0459, "step": 1313 }, { "epoch": 0.5705601389491967, "grad_norm": 0.02762845903635025, "learning_rate": 0.00016207995365005796, "loss": 0.0376, "step": 1314 }, { "epoch": 0.5709943551888841, "grad_norm": 0.02905391901731491, "learning_rate": 0.00016205098493626884, "loss": 0.0391, "step": 1315 }, { "epoch": 0.5714285714285714, "grad_norm": 0.02581918239593506, "learning_rate": 0.00016202201622247971, "loss": 0.0356, "step": 1316 }, { "epoch": 0.5718627876682588, "grad_norm": 0.029246967285871506, "learning_rate": 0.00016199304750869062, "loss": 0.0455, "step": 1317 }, { "epoch": 0.5722970039079461, "grad_norm": 0.02899780683219433, "learning_rate": 0.00016196407879490152, "loss": 0.0432, "step": 1318 }, { "epoch": 0.5727312201476336, "grad_norm": 0.039482228457927704, "learning_rate": 0.0001619351100811124, "loss": 0.0443, "step": 1319 }, { "epoch": 0.5731654363873209, "grad_norm": 0.034356825053691864, "learning_rate": 0.0001619061413673233, "loss": 0.0499, "step": 1320 }, { "epoch": 0.5735996526270083, "grad_norm": 0.03914685919880867, "learning_rate": 0.0001618771726535342, "loss": 0.0485, "step": 1321 }, { "epoch": 0.5740338688666956, "grad_norm": 0.033090054988861084, "learning_rate": 0.00016184820393974508, "loss": 0.0541, "step": 1322 }, { "epoch": 0.574468085106383, "grad_norm": 0.03390613570809364, "learning_rate": 0.00016181923522595596, "loss": 0.0452, "step": 1323 }, { "epoch": 0.5749023013460703, "grad_norm": 0.03822924941778183, "learning_rate": 0.00016179026651216686, "loss": 0.0447, "step": 1324 }, { "epoch": 0.5753365175857577, "grad_norm": 0.026290079578757286, "learning_rate": 0.00016176129779837777, "loss": 0.0402, "step": 1325 }, { "epoch": 0.5757707338254451, "grad_norm": 0.03671284765005112, "learning_rate": 0.00016173232908458867, "loss": 0.0593, "step": 1326 }, { "epoch": 0.5762049500651324, "grad_norm": 0.02860778197646141, "learning_rate": 0.00016170336037079955, "loss": 0.0378, "step": 1327 }, { "epoch": 0.5766391663048198, "grad_norm": 0.024853887036442757, "learning_rate": 0.00016167439165701045, "loss": 0.0337, "step": 1328 }, { "epoch": 0.5770733825445071, "grad_norm": 0.03022726997733116, "learning_rate": 0.00016164542294322133, "loss": 0.0436, "step": 1329 }, { "epoch": 0.5775075987841946, "grad_norm": 0.02601066417992115, "learning_rate": 0.0001616164542294322, "loss": 0.0415, "step": 1330 }, { "epoch": 0.5779418150238819, "grad_norm": 0.031893569976091385, "learning_rate": 0.0001615874855156431, "loss": 0.0466, "step": 1331 }, { "epoch": 0.5783760312635693, "grad_norm": 0.027164380997419357, "learning_rate": 0.000161558516801854, "loss": 0.0413, "step": 1332 }, { "epoch": 0.5788102475032566, "grad_norm": 0.03162367641925812, "learning_rate": 0.00016152954808806491, "loss": 0.047, "step": 1333 }, { "epoch": 0.579244463742944, "grad_norm": 0.029615914449095726, "learning_rate": 0.0001615005793742758, "loss": 0.0428, "step": 1334 }, { "epoch": 0.5796786799826313, "grad_norm": 0.03465209901332855, "learning_rate": 0.0001614716106604867, "loss": 0.0498, "step": 1335 }, { "epoch": 0.5801128962223188, "grad_norm": 0.026132311671972275, "learning_rate": 0.00016144264194669757, "loss": 0.0407, "step": 1336 }, { "epoch": 0.5805471124620061, "grad_norm": 0.026560978963971138, "learning_rate": 0.00016141367323290845, "loss": 0.0448, "step": 1337 }, { "epoch": 0.5809813287016934, "grad_norm": 0.03075362555682659, "learning_rate": 0.00016138470451911935, "loss": 0.0468, "step": 1338 }, { "epoch": 0.5814155449413808, "grad_norm": 0.028235264122486115, "learning_rate": 0.00016135573580533026, "loss": 0.0404, "step": 1339 }, { "epoch": 0.5818497611810681, "grad_norm": 0.02864878810942173, "learning_rate": 0.00016132676709154116, "loss": 0.0398, "step": 1340 }, { "epoch": 0.5822839774207555, "grad_norm": 0.028696032240986824, "learning_rate": 0.00016129779837775204, "loss": 0.041, "step": 1341 }, { "epoch": 0.5827181936604429, "grad_norm": 0.029841836541891098, "learning_rate": 0.00016126882966396294, "loss": 0.0348, "step": 1342 }, { "epoch": 0.5831524099001303, "grad_norm": 0.025827180594205856, "learning_rate": 0.00016123986095017382, "loss": 0.042, "step": 1343 }, { "epoch": 0.5835866261398176, "grad_norm": 0.03461744263768196, "learning_rate": 0.0001612108922363847, "loss": 0.047, "step": 1344 }, { "epoch": 0.584020842379505, "grad_norm": 0.03352322429418564, "learning_rate": 0.0001611819235225956, "loss": 0.0501, "step": 1345 }, { "epoch": 0.5844550586191923, "grad_norm": 0.027093160897493362, "learning_rate": 0.0001611529548088065, "loss": 0.0454, "step": 1346 }, { "epoch": 0.5848892748588798, "grad_norm": 0.0316416472196579, "learning_rate": 0.0001611239860950174, "loss": 0.0402, "step": 1347 }, { "epoch": 0.5853234910985671, "grad_norm": 0.02874227985739708, "learning_rate": 0.00016109501738122828, "loss": 0.0437, "step": 1348 }, { "epoch": 0.5857577073382545, "grad_norm": 0.03506254404783249, "learning_rate": 0.00016106604866743918, "loss": 0.0459, "step": 1349 }, { "epoch": 0.5861919235779418, "grad_norm": 0.02847885712981224, "learning_rate": 0.00016103707995365006, "loss": 0.0473, "step": 1350 }, { "epoch": 0.5866261398176292, "grad_norm": 0.029924539849162102, "learning_rate": 0.00016100811123986094, "loss": 0.0448, "step": 1351 }, { "epoch": 0.5870603560573165, "grad_norm": 0.0325111523270607, "learning_rate": 0.00016097914252607184, "loss": 0.0498, "step": 1352 }, { "epoch": 0.5874945722970039, "grad_norm": 0.038553234189748764, "learning_rate": 0.00016095017381228275, "loss": 0.0479, "step": 1353 }, { "epoch": 0.5879287885366913, "grad_norm": 0.029682187363505363, "learning_rate": 0.00016092120509849365, "loss": 0.0416, "step": 1354 }, { "epoch": 0.5883630047763786, "grad_norm": 0.025454016402363777, "learning_rate": 0.00016089223638470453, "loss": 0.0343, "step": 1355 }, { "epoch": 0.588797221016066, "grad_norm": 0.028420891612768173, "learning_rate": 0.00016086326767091543, "loss": 0.0391, "step": 1356 }, { "epoch": 0.5892314372557533, "grad_norm": 0.03187655657529831, "learning_rate": 0.0001608342989571263, "loss": 0.0449, "step": 1357 }, { "epoch": 0.5896656534954408, "grad_norm": 0.03671014681458473, "learning_rate": 0.00016080533024333718, "loss": 0.0511, "step": 1358 }, { "epoch": 0.5900998697351281, "grad_norm": 0.03399726375937462, "learning_rate": 0.00016077636152954809, "loss": 0.0503, "step": 1359 }, { "epoch": 0.5905340859748155, "grad_norm": 0.038145627826452255, "learning_rate": 0.000160747392815759, "loss": 0.0549, "step": 1360 }, { "epoch": 0.5909683022145028, "grad_norm": 0.03139147907495499, "learning_rate": 0.0001607184241019699, "loss": 0.0447, "step": 1361 }, { "epoch": 0.5914025184541902, "grad_norm": 0.0284607894718647, "learning_rate": 0.00016068945538818077, "loss": 0.0354, "step": 1362 }, { "epoch": 0.5918367346938775, "grad_norm": 0.03729249909520149, "learning_rate": 0.00016066048667439167, "loss": 0.0508, "step": 1363 }, { "epoch": 0.592270950933565, "grad_norm": 0.038784608244895935, "learning_rate": 0.00016063151796060255, "loss": 0.0526, "step": 1364 }, { "epoch": 0.5927051671732523, "grad_norm": 0.037945643067359924, "learning_rate": 0.00016060254924681343, "loss": 0.0588, "step": 1365 }, { "epoch": 0.5931393834129396, "grad_norm": 0.03401834890246391, "learning_rate": 0.00016057358053302433, "loss": 0.0477, "step": 1366 }, { "epoch": 0.593573599652627, "grad_norm": 0.04581061378121376, "learning_rate": 0.00016054461181923523, "loss": 0.0466, "step": 1367 }, { "epoch": 0.5940078158923143, "grad_norm": 0.03548217937350273, "learning_rate": 0.00016051564310544614, "loss": 0.0442, "step": 1368 }, { "epoch": 0.5944420321320018, "grad_norm": 0.03710260987281799, "learning_rate": 0.00016048667439165701, "loss": 0.0529, "step": 1369 }, { "epoch": 0.5948762483716891, "grad_norm": 0.0400405079126358, "learning_rate": 0.00016045770567786792, "loss": 0.0575, "step": 1370 }, { "epoch": 0.5953104646113765, "grad_norm": 0.02826886996626854, "learning_rate": 0.0001604287369640788, "loss": 0.0441, "step": 1371 }, { "epoch": 0.5957446808510638, "grad_norm": 0.03382157161831856, "learning_rate": 0.0001603997682502897, "loss": 0.0554, "step": 1372 }, { "epoch": 0.5961788970907512, "grad_norm": 0.03422621265053749, "learning_rate": 0.00016037079953650058, "loss": 0.0476, "step": 1373 }, { "epoch": 0.5966131133304385, "grad_norm": 0.027322113513946533, "learning_rate": 0.00016034183082271148, "loss": 0.0321, "step": 1374 }, { "epoch": 0.597047329570126, "grad_norm": 0.029011622071266174, "learning_rate": 0.00016031286210892238, "loss": 0.0433, "step": 1375 }, { "epoch": 0.5974815458098133, "grad_norm": 0.030647289007902145, "learning_rate": 0.00016028389339513326, "loss": 0.0497, "step": 1376 }, { "epoch": 0.5979157620495007, "grad_norm": 0.03468073159456253, "learning_rate": 0.00016025492468134416, "loss": 0.0537, "step": 1377 }, { "epoch": 0.598349978289188, "grad_norm": 0.031014850363135338, "learning_rate": 0.00016022595596755504, "loss": 0.047, "step": 1378 }, { "epoch": 0.5987841945288754, "grad_norm": 0.04171089828014374, "learning_rate": 0.00016019698725376594, "loss": 0.047, "step": 1379 }, { "epoch": 0.5992184107685627, "grad_norm": 0.024174867197871208, "learning_rate": 0.00016016801853997682, "loss": 0.0314, "step": 1380 }, { "epoch": 0.5996526270082501, "grad_norm": 0.03935502842068672, "learning_rate": 0.00016013904982618772, "loss": 0.0606, "step": 1381 }, { "epoch": 0.6000868432479375, "grad_norm": 0.032736629247665405, "learning_rate": 0.00016011008111239863, "loss": 0.041, "step": 1382 }, { "epoch": 0.6005210594876248, "grad_norm": 0.03864171355962753, "learning_rate": 0.0001600811123986095, "loss": 0.0482, "step": 1383 }, { "epoch": 0.6009552757273122, "grad_norm": 0.0342947356402874, "learning_rate": 0.0001600521436848204, "loss": 0.0472, "step": 1384 }, { "epoch": 0.6013894919669995, "grad_norm": 0.030353771522641182, "learning_rate": 0.00016002317497103128, "loss": 0.0472, "step": 1385 }, { "epoch": 0.601823708206687, "grad_norm": 0.027645757421851158, "learning_rate": 0.0001599942062572422, "loss": 0.0379, "step": 1386 }, { "epoch": 0.6022579244463743, "grad_norm": 0.029738077893853188, "learning_rate": 0.00015996523754345307, "loss": 0.0445, "step": 1387 }, { "epoch": 0.6026921406860617, "grad_norm": 0.03135683014988899, "learning_rate": 0.00015993626882966397, "loss": 0.0459, "step": 1388 }, { "epoch": 0.603126356925749, "grad_norm": 0.021702921018004417, "learning_rate": 0.00015990730011587487, "loss": 0.0288, "step": 1389 }, { "epoch": 0.6035605731654364, "grad_norm": 0.02330235205590725, "learning_rate": 0.00015987833140208575, "loss": 0.0357, "step": 1390 }, { "epoch": 0.6039947894051237, "grad_norm": 0.03311017528176308, "learning_rate": 0.00015984936268829665, "loss": 0.0436, "step": 1391 }, { "epoch": 0.6044290056448112, "grad_norm": 0.02543545700609684, "learning_rate": 0.00015982039397450753, "loss": 0.0345, "step": 1392 }, { "epoch": 0.6048632218844985, "grad_norm": 0.029905343428254128, "learning_rate": 0.00015979142526071843, "loss": 0.0437, "step": 1393 }, { "epoch": 0.6052974381241858, "grad_norm": 0.02809019573032856, "learning_rate": 0.0001597624565469293, "loss": 0.0414, "step": 1394 }, { "epoch": 0.6057316543638732, "grad_norm": 0.03084579110145569, "learning_rate": 0.0001597334878331402, "loss": 0.0468, "step": 1395 }, { "epoch": 0.6061658706035605, "grad_norm": 0.026850976049900055, "learning_rate": 0.00015970451911935112, "loss": 0.0387, "step": 1396 }, { "epoch": 0.606600086843248, "grad_norm": 0.030758708715438843, "learning_rate": 0.000159675550405562, "loss": 0.0366, "step": 1397 }, { "epoch": 0.6070343030829353, "grad_norm": 0.04806855693459511, "learning_rate": 0.0001596465816917729, "loss": 0.0637, "step": 1398 }, { "epoch": 0.6074685193226227, "grad_norm": 0.029812928289175034, "learning_rate": 0.0001596176129779838, "loss": 0.0454, "step": 1399 }, { "epoch": 0.60790273556231, "grad_norm": 0.027667725458741188, "learning_rate": 0.00015958864426419468, "loss": 0.0392, "step": 1400 }, { "epoch": 0.6083369518019974, "grad_norm": 0.030985813587903976, "learning_rate": 0.00015955967555040555, "loss": 0.0422, "step": 1401 }, { "epoch": 0.6087711680416847, "grad_norm": 0.029574550688266754, "learning_rate": 0.00015953070683661646, "loss": 0.046, "step": 1402 }, { "epoch": 0.6092053842813722, "grad_norm": 0.03623111546039581, "learning_rate": 0.00015950173812282736, "loss": 0.0467, "step": 1403 }, { "epoch": 0.6096396005210595, "grad_norm": 0.041254203766584396, "learning_rate": 0.00015947276940903824, "loss": 0.0568, "step": 1404 }, { "epoch": 0.6100738167607469, "grad_norm": 0.02670598030090332, "learning_rate": 0.00015944380069524914, "loss": 0.0374, "step": 1405 }, { "epoch": 0.6105080330004342, "grad_norm": 0.028457332402467728, "learning_rate": 0.00015941483198146005, "loss": 0.0407, "step": 1406 }, { "epoch": 0.6109422492401215, "grad_norm": 0.029367325827479362, "learning_rate": 0.00015938586326767092, "loss": 0.0392, "step": 1407 }, { "epoch": 0.611376465479809, "grad_norm": 0.03546149656176567, "learning_rate": 0.0001593568945538818, "loss": 0.0516, "step": 1408 }, { "epoch": 0.6118106817194963, "grad_norm": 0.035985130816698074, "learning_rate": 0.0001593279258400927, "loss": 0.0569, "step": 1409 }, { "epoch": 0.6122448979591837, "grad_norm": 0.02659233845770359, "learning_rate": 0.0001592989571263036, "loss": 0.0373, "step": 1410 }, { "epoch": 0.612679114198871, "grad_norm": 0.03350624442100525, "learning_rate": 0.00015926998841251448, "loss": 0.0497, "step": 1411 }, { "epoch": 0.6131133304385584, "grad_norm": 0.028257472440600395, "learning_rate": 0.0001592410196987254, "loss": 0.0433, "step": 1412 }, { "epoch": 0.6135475466782457, "grad_norm": 0.02619759738445282, "learning_rate": 0.0001592120509849363, "loss": 0.0386, "step": 1413 }, { "epoch": 0.6139817629179332, "grad_norm": 0.035130567848682404, "learning_rate": 0.00015918308227114717, "loss": 0.0513, "step": 1414 }, { "epoch": 0.6144159791576205, "grad_norm": 0.031923066824674606, "learning_rate": 0.00015915411355735804, "loss": 0.0518, "step": 1415 }, { "epoch": 0.6148501953973079, "grad_norm": 0.0302781630307436, "learning_rate": 0.00015912514484356895, "loss": 0.0415, "step": 1416 }, { "epoch": 0.6152844116369952, "grad_norm": 0.030894562602043152, "learning_rate": 0.00015909617612977985, "loss": 0.0421, "step": 1417 }, { "epoch": 0.6157186278766826, "grad_norm": 0.03255515918135643, "learning_rate": 0.00015906720741599073, "loss": 0.0472, "step": 1418 }, { "epoch": 0.61615284411637, "grad_norm": 0.029538707807660103, "learning_rate": 0.00015903823870220163, "loss": 0.0405, "step": 1419 }, { "epoch": 0.6165870603560574, "grad_norm": 0.02745675854384899, "learning_rate": 0.00015900926998841254, "loss": 0.042, "step": 1420 }, { "epoch": 0.6170212765957447, "grad_norm": 0.02740616910159588, "learning_rate": 0.0001589803012746234, "loss": 0.04, "step": 1421 }, { "epoch": 0.617455492835432, "grad_norm": 0.03410978615283966, "learning_rate": 0.0001589513325608343, "loss": 0.0517, "step": 1422 }, { "epoch": 0.6178897090751194, "grad_norm": 0.0260813906788826, "learning_rate": 0.0001589223638470452, "loss": 0.0418, "step": 1423 }, { "epoch": 0.6183239253148067, "grad_norm": 0.035039037466049194, "learning_rate": 0.0001588933951332561, "loss": 0.0595, "step": 1424 }, { "epoch": 0.6187581415544942, "grad_norm": 0.033302485942840576, "learning_rate": 0.000158864426419467, "loss": 0.0494, "step": 1425 }, { "epoch": 0.6191923577941815, "grad_norm": 0.035447340458631516, "learning_rate": 0.00015883545770567788, "loss": 0.0469, "step": 1426 }, { "epoch": 0.6196265740338689, "grad_norm": 0.031205635517835617, "learning_rate": 0.00015880648899188878, "loss": 0.053, "step": 1427 }, { "epoch": 0.6200607902735562, "grad_norm": 0.0473213717341423, "learning_rate": 0.00015877752027809966, "loss": 0.0416, "step": 1428 }, { "epoch": 0.6204950065132436, "grad_norm": 0.029683653265237808, "learning_rate": 0.00015874855156431053, "loss": 0.0479, "step": 1429 }, { "epoch": 0.6209292227529309, "grad_norm": 0.03130192309617996, "learning_rate": 0.00015871958285052144, "loss": 0.0488, "step": 1430 }, { "epoch": 0.6213634389926184, "grad_norm": 0.03174491226673126, "learning_rate": 0.00015869061413673234, "loss": 0.0407, "step": 1431 }, { "epoch": 0.6217976552323057, "grad_norm": 0.029887469485402107, "learning_rate": 0.00015866164542294324, "loss": 0.0471, "step": 1432 }, { "epoch": 0.6222318714719931, "grad_norm": 0.0335966981947422, "learning_rate": 0.00015863267670915412, "loss": 0.0426, "step": 1433 }, { "epoch": 0.6226660877116804, "grad_norm": 0.03579239919781685, "learning_rate": 0.00015860370799536502, "loss": 0.051, "step": 1434 }, { "epoch": 0.6231003039513677, "grad_norm": 0.03454295173287392, "learning_rate": 0.0001585747392815759, "loss": 0.0555, "step": 1435 }, { "epoch": 0.6235345201910552, "grad_norm": 0.031216062605381012, "learning_rate": 0.00015854577056778678, "loss": 0.0559, "step": 1436 }, { "epoch": 0.6239687364307425, "grad_norm": 0.028171978890895844, "learning_rate": 0.00015851680185399768, "loss": 0.0413, "step": 1437 }, { "epoch": 0.6244029526704299, "grad_norm": 0.025501063093543053, "learning_rate": 0.00015848783314020859, "loss": 0.0384, "step": 1438 }, { "epoch": 0.6248371689101172, "grad_norm": 0.029797883704304695, "learning_rate": 0.0001584588644264195, "loss": 0.0443, "step": 1439 }, { "epoch": 0.6252713851498046, "grad_norm": 0.027923738583922386, "learning_rate": 0.00015842989571263037, "loss": 0.0404, "step": 1440 }, { "epoch": 0.6257056013894919, "grad_norm": 0.023112023249268532, "learning_rate": 0.00015840092699884127, "loss": 0.0351, "step": 1441 }, { "epoch": 0.6261398176291794, "grad_norm": 0.024696853011846542, "learning_rate": 0.00015837195828505215, "loss": 0.0325, "step": 1442 }, { "epoch": 0.6265740338688667, "grad_norm": 0.031041134148836136, "learning_rate": 0.00015834298957126302, "loss": 0.0462, "step": 1443 }, { "epoch": 0.6270082501085541, "grad_norm": 0.03362066298723221, "learning_rate": 0.00015831402085747393, "loss": 0.0492, "step": 1444 }, { "epoch": 0.6274424663482414, "grad_norm": 0.027071524411439896, "learning_rate": 0.00015828505214368483, "loss": 0.0366, "step": 1445 }, { "epoch": 0.6278766825879288, "grad_norm": 0.03125172108411789, "learning_rate": 0.00015825608342989573, "loss": 0.0462, "step": 1446 }, { "epoch": 0.6283108988276161, "grad_norm": 0.02561967447400093, "learning_rate": 0.0001582271147161066, "loss": 0.0347, "step": 1447 }, { "epoch": 0.6287451150673035, "grad_norm": 0.03574226796627045, "learning_rate": 0.00015819814600231751, "loss": 0.0558, "step": 1448 }, { "epoch": 0.6291793313069909, "grad_norm": 0.0336029976606369, "learning_rate": 0.0001581691772885284, "loss": 0.0482, "step": 1449 }, { "epoch": 0.6296135475466782, "grad_norm": 0.02697230689227581, "learning_rate": 0.00015814020857473927, "loss": 0.0361, "step": 1450 }, { "epoch": 0.6300477637863656, "grad_norm": 0.0266861729323864, "learning_rate": 0.00015811123986095017, "loss": 0.0361, "step": 1451 }, { "epoch": 0.6304819800260529, "grad_norm": 0.026108670979738235, "learning_rate": 0.00015808227114716107, "loss": 0.0361, "step": 1452 }, { "epoch": 0.6309161962657404, "grad_norm": 0.0344550721347332, "learning_rate": 0.00015805330243337198, "loss": 0.0463, "step": 1453 }, { "epoch": 0.6313504125054277, "grad_norm": 0.03225186467170715, "learning_rate": 0.00015802433371958286, "loss": 0.0392, "step": 1454 }, { "epoch": 0.6317846287451151, "grad_norm": 0.028626568615436554, "learning_rate": 0.00015799536500579376, "loss": 0.0423, "step": 1455 }, { "epoch": 0.6322188449848024, "grad_norm": 0.03330875560641289, "learning_rate": 0.00015796639629200464, "loss": 0.0431, "step": 1456 }, { "epoch": 0.6326530612244898, "grad_norm": 0.030760563910007477, "learning_rate": 0.0001579374275782155, "loss": 0.0485, "step": 1457 }, { "epoch": 0.6330872774641771, "grad_norm": 0.027630742639303207, "learning_rate": 0.00015790845886442642, "loss": 0.038, "step": 1458 }, { "epoch": 0.6335214937038646, "grad_norm": 0.02897372469305992, "learning_rate": 0.00015787949015063732, "loss": 0.0422, "step": 1459 }, { "epoch": 0.6339557099435519, "grad_norm": 0.029042452573776245, "learning_rate": 0.00015785052143684822, "loss": 0.0451, "step": 1460 }, { "epoch": 0.6343899261832393, "grad_norm": 0.028798189014196396, "learning_rate": 0.0001578215527230591, "loss": 0.0357, "step": 1461 }, { "epoch": 0.6348241424229266, "grad_norm": 0.03698223829269409, "learning_rate": 0.00015779258400927, "loss": 0.0547, "step": 1462 }, { "epoch": 0.6352583586626139, "grad_norm": 0.03436628356575966, "learning_rate": 0.00015776361529548088, "loss": 0.048, "step": 1463 }, { "epoch": 0.6356925749023014, "grad_norm": 0.03226415067911148, "learning_rate": 0.00015773464658169176, "loss": 0.0478, "step": 1464 }, { "epoch": 0.6361267911419887, "grad_norm": 0.02829672209918499, "learning_rate": 0.00015770567786790266, "loss": 0.0454, "step": 1465 }, { "epoch": 0.6365610073816761, "grad_norm": 0.027147255837917328, "learning_rate": 0.00015767670915411356, "loss": 0.0392, "step": 1466 }, { "epoch": 0.6369952236213634, "grad_norm": 0.0271847452968359, "learning_rate": 0.00015764774044032447, "loss": 0.034, "step": 1467 }, { "epoch": 0.6374294398610508, "grad_norm": 0.03143755719065666, "learning_rate": 0.00015761877172653534, "loss": 0.0481, "step": 1468 }, { "epoch": 0.6378636561007381, "grad_norm": 0.025414325296878815, "learning_rate": 0.00015758980301274625, "loss": 0.0405, "step": 1469 }, { "epoch": 0.6382978723404256, "grad_norm": 0.0272732712328434, "learning_rate": 0.00015756083429895712, "loss": 0.0424, "step": 1470 }, { "epoch": 0.6387320885801129, "grad_norm": 0.02573825977742672, "learning_rate": 0.00015753186558516803, "loss": 0.0373, "step": 1471 }, { "epoch": 0.6391663048198003, "grad_norm": 0.03564717248082161, "learning_rate": 0.0001575028968713789, "loss": 0.0565, "step": 1472 }, { "epoch": 0.6396005210594876, "grad_norm": 0.025914974510669708, "learning_rate": 0.0001574739281575898, "loss": 0.0391, "step": 1473 }, { "epoch": 0.640034737299175, "grad_norm": 0.034012485295534134, "learning_rate": 0.0001574449594438007, "loss": 0.0522, "step": 1474 }, { "epoch": 0.6404689535388624, "grad_norm": 0.030032087117433548, "learning_rate": 0.0001574159907300116, "loss": 0.0419, "step": 1475 }, { "epoch": 0.6409031697785497, "grad_norm": 0.032127801328897476, "learning_rate": 0.0001573870220162225, "loss": 0.0515, "step": 1476 }, { "epoch": 0.6413373860182371, "grad_norm": 0.02447037771344185, "learning_rate": 0.0001573580533024334, "loss": 0.0365, "step": 1477 }, { "epoch": 0.6417716022579244, "grad_norm": 0.026313472539186478, "learning_rate": 0.00015732908458864427, "loss": 0.0377, "step": 1478 }, { "epoch": 0.6422058184976118, "grad_norm": 0.028584297746419907, "learning_rate": 0.00015730011587485515, "loss": 0.0423, "step": 1479 }, { "epoch": 0.6426400347372991, "grad_norm": 0.040137555450201035, "learning_rate": 0.00015727114716106605, "loss": 0.0528, "step": 1480 }, { "epoch": 0.6430742509769866, "grad_norm": 0.03320203721523285, "learning_rate": 0.00015724217844727696, "loss": 0.0466, "step": 1481 }, { "epoch": 0.6435084672166739, "grad_norm": 0.05221818387508392, "learning_rate": 0.00015721320973348783, "loss": 0.052, "step": 1482 }, { "epoch": 0.6439426834563613, "grad_norm": 0.030936021357774734, "learning_rate": 0.00015718424101969874, "loss": 0.0459, "step": 1483 }, { "epoch": 0.6443768996960486, "grad_norm": 0.03493255004286766, "learning_rate": 0.00015715527230590964, "loss": 0.0512, "step": 1484 }, { "epoch": 0.644811115935736, "grad_norm": 0.03586660325527191, "learning_rate": 0.00015712630359212052, "loss": 0.0549, "step": 1485 }, { "epoch": 0.6452453321754233, "grad_norm": 0.03144745156168938, "learning_rate": 0.0001570973348783314, "loss": 0.045, "step": 1486 }, { "epoch": 0.6456795484151108, "grad_norm": 0.03280746936798096, "learning_rate": 0.0001570683661645423, "loss": 0.0435, "step": 1487 }, { "epoch": 0.6461137646547981, "grad_norm": 0.03312225639820099, "learning_rate": 0.0001570393974507532, "loss": 0.0471, "step": 1488 }, { "epoch": 0.6465479808944855, "grad_norm": 0.031325943768024445, "learning_rate": 0.00015701042873696408, "loss": 0.0422, "step": 1489 }, { "epoch": 0.6469821971341728, "grad_norm": 0.03685877099633217, "learning_rate": 0.00015698146002317498, "loss": 0.0363, "step": 1490 }, { "epoch": 0.6474164133738601, "grad_norm": 0.03220660984516144, "learning_rate": 0.00015695249130938589, "loss": 0.0536, "step": 1491 }, { "epoch": 0.6478506296135476, "grad_norm": 0.025879016146063805, "learning_rate": 0.00015692352259559676, "loss": 0.0389, "step": 1492 }, { "epoch": 0.6482848458532349, "grad_norm": 0.035252224653959274, "learning_rate": 0.00015689455388180764, "loss": 0.0451, "step": 1493 }, { "epoch": 0.6487190620929223, "grad_norm": 0.030161762610077858, "learning_rate": 0.00015686558516801854, "loss": 0.0402, "step": 1494 }, { "epoch": 0.6491532783326096, "grad_norm": 0.04123440384864807, "learning_rate": 0.00015683661645422945, "loss": 0.0432, "step": 1495 }, { "epoch": 0.649587494572297, "grad_norm": 0.029704352840781212, "learning_rate": 0.00015680764774044032, "loss": 0.0403, "step": 1496 }, { "epoch": 0.6500217108119843, "grad_norm": 0.039619430899620056, "learning_rate": 0.00015677867902665123, "loss": 0.0568, "step": 1497 }, { "epoch": 0.6504559270516718, "grad_norm": 0.02978592738509178, "learning_rate": 0.00015674971031286213, "loss": 0.0374, "step": 1498 }, { "epoch": 0.6508901432913591, "grad_norm": 0.03304333612322807, "learning_rate": 0.000156720741599073, "loss": 0.0481, "step": 1499 }, { "epoch": 0.6513243595310465, "grad_norm": 0.031089186668395996, "learning_rate": 0.00015669177288528388, "loss": 0.0403, "step": 1500 }, { "epoch": 0.6517585757707338, "grad_norm": 0.031233279034495354, "learning_rate": 0.0001566628041714948, "loss": 0.0478, "step": 1501 }, { "epoch": 0.6521927920104212, "grad_norm": 0.03391152247786522, "learning_rate": 0.0001566338354577057, "loss": 0.0417, "step": 1502 }, { "epoch": 0.6526270082501086, "grad_norm": 0.037436068058013916, "learning_rate": 0.00015660486674391657, "loss": 0.0569, "step": 1503 }, { "epoch": 0.6530612244897959, "grad_norm": 0.024861354380846024, "learning_rate": 0.00015657589803012747, "loss": 0.0316, "step": 1504 }, { "epoch": 0.6534954407294833, "grad_norm": 0.030735300853848457, "learning_rate": 0.00015654692931633838, "loss": 0.0391, "step": 1505 }, { "epoch": 0.6539296569691706, "grad_norm": 0.040059398859739304, "learning_rate": 0.00015651796060254925, "loss": 0.0539, "step": 1506 }, { "epoch": 0.654363873208858, "grad_norm": 0.038532424718141556, "learning_rate": 0.00015648899188876013, "loss": 0.0515, "step": 1507 }, { "epoch": 0.6547980894485453, "grad_norm": 0.03260180354118347, "learning_rate": 0.00015646002317497103, "loss": 0.0497, "step": 1508 }, { "epoch": 0.6552323056882328, "grad_norm": 0.032405491918325424, "learning_rate": 0.00015643105446118194, "loss": 0.0467, "step": 1509 }, { "epoch": 0.6556665219279201, "grad_norm": 0.03679279610514641, "learning_rate": 0.0001564020857473928, "loss": 0.0568, "step": 1510 }, { "epoch": 0.6561007381676075, "grad_norm": 0.030573029071092606, "learning_rate": 0.00015637311703360372, "loss": 0.0488, "step": 1511 }, { "epoch": 0.6565349544072948, "grad_norm": 0.030603496357798576, "learning_rate": 0.00015634414831981462, "loss": 0.0472, "step": 1512 }, { "epoch": 0.6569691706469822, "grad_norm": 0.02690718322992325, "learning_rate": 0.0001563151796060255, "loss": 0.0382, "step": 1513 }, { "epoch": 0.6574033868866695, "grad_norm": 0.02830420807003975, "learning_rate": 0.00015628621089223637, "loss": 0.044, "step": 1514 }, { "epoch": 0.657837603126357, "grad_norm": 0.029479078948497772, "learning_rate": 0.00015625724217844728, "loss": 0.0431, "step": 1515 }, { "epoch": 0.6582718193660443, "grad_norm": 0.028950070962309837, "learning_rate": 0.00015622827346465818, "loss": 0.0425, "step": 1516 }, { "epoch": 0.6587060356057316, "grad_norm": 0.02706664800643921, "learning_rate": 0.00015619930475086908, "loss": 0.0403, "step": 1517 }, { "epoch": 0.659140251845419, "grad_norm": 0.032483600080013275, "learning_rate": 0.00015617033603707996, "loss": 0.052, "step": 1518 }, { "epoch": 0.6595744680851063, "grad_norm": 0.03448007255792618, "learning_rate": 0.00015614136732329087, "loss": 0.0544, "step": 1519 }, { "epoch": 0.6600086843247938, "grad_norm": 0.0382423922419548, "learning_rate": 0.00015611239860950174, "loss": 0.0775, "step": 1520 }, { "epoch": 0.6604429005644811, "grad_norm": 0.03768420219421387, "learning_rate": 0.00015608342989571262, "loss": 0.0531, "step": 1521 }, { "epoch": 0.6608771168041685, "grad_norm": 0.03606289625167847, "learning_rate": 0.00015605446118192352, "loss": 0.0496, "step": 1522 }, { "epoch": 0.6613113330438558, "grad_norm": 0.03435833007097244, "learning_rate": 0.00015602549246813443, "loss": 0.0552, "step": 1523 }, { "epoch": 0.6617455492835432, "grad_norm": 0.03713810816407204, "learning_rate": 0.00015599652375434533, "loss": 0.0505, "step": 1524 }, { "epoch": 0.6621797655232305, "grad_norm": 0.023504288867115974, "learning_rate": 0.0001559675550405562, "loss": 0.0406, "step": 1525 }, { "epoch": 0.662613981762918, "grad_norm": 0.028103724122047424, "learning_rate": 0.0001559385863267671, "loss": 0.046, "step": 1526 }, { "epoch": 0.6630481980026053, "grad_norm": 0.02438259869813919, "learning_rate": 0.00015590961761297799, "loss": 0.0383, "step": 1527 }, { "epoch": 0.6634824142422927, "grad_norm": 0.03879522904753685, "learning_rate": 0.00015588064889918886, "loss": 0.0408, "step": 1528 }, { "epoch": 0.66391663048198, "grad_norm": 0.02911362610757351, "learning_rate": 0.00015585168018539977, "loss": 0.0449, "step": 1529 }, { "epoch": 0.6643508467216674, "grad_norm": 0.03206560015678406, "learning_rate": 0.00015582271147161067, "loss": 0.0468, "step": 1530 }, { "epoch": 0.6647850629613548, "grad_norm": 0.024589447304606438, "learning_rate": 0.00015579374275782157, "loss": 0.0344, "step": 1531 }, { "epoch": 0.6652192792010421, "grad_norm": 0.028990186750888824, "learning_rate": 0.00015576477404403245, "loss": 0.0406, "step": 1532 }, { "epoch": 0.6656534954407295, "grad_norm": 0.02764812670648098, "learning_rate": 0.00015573580533024335, "loss": 0.0427, "step": 1533 }, { "epoch": 0.6660877116804168, "grad_norm": 0.0297355055809021, "learning_rate": 0.00015570683661645423, "loss": 0.0387, "step": 1534 }, { "epoch": 0.6665219279201042, "grad_norm": 0.031544238328933716, "learning_rate": 0.0001556778679026651, "loss": 0.0478, "step": 1535 }, { "epoch": 0.6669561441597915, "grad_norm": 0.03116876818239689, "learning_rate": 0.000155648899188876, "loss": 0.0416, "step": 1536 }, { "epoch": 0.667390360399479, "grad_norm": 0.02915465459227562, "learning_rate": 0.00015561993047508692, "loss": 0.047, "step": 1537 }, { "epoch": 0.6678245766391663, "grad_norm": 0.030096804723143578, "learning_rate": 0.00015559096176129782, "loss": 0.0457, "step": 1538 }, { "epoch": 0.6682587928788537, "grad_norm": 0.02904406562447548, "learning_rate": 0.0001555619930475087, "loss": 0.0381, "step": 1539 }, { "epoch": 0.668693009118541, "grad_norm": 0.029649319127202034, "learning_rate": 0.0001555330243337196, "loss": 0.0441, "step": 1540 }, { "epoch": 0.6691272253582284, "grad_norm": 0.033825065940618515, "learning_rate": 0.00015550405561993048, "loss": 0.0493, "step": 1541 }, { "epoch": 0.6695614415979158, "grad_norm": 0.029024844989180565, "learning_rate": 0.00015547508690614135, "loss": 0.0414, "step": 1542 }, { "epoch": 0.6699956578376032, "grad_norm": 0.03481533005833626, "learning_rate": 0.00015544611819235226, "loss": 0.0494, "step": 1543 }, { "epoch": 0.6704298740772905, "grad_norm": 0.038320258259773254, "learning_rate": 0.00015541714947856316, "loss": 0.0531, "step": 1544 }, { "epoch": 0.6708640903169778, "grad_norm": 0.03488198295235634, "learning_rate": 0.00015538818076477406, "loss": 0.0427, "step": 1545 }, { "epoch": 0.6712983065566652, "grad_norm": 0.03144368901848793, "learning_rate": 0.00015535921205098494, "loss": 0.0414, "step": 1546 }, { "epoch": 0.6717325227963525, "grad_norm": 0.03119914047420025, "learning_rate": 0.00015533024333719584, "loss": 0.0454, "step": 1547 }, { "epoch": 0.67216673903604, "grad_norm": 0.03290736302733421, "learning_rate": 0.00015530127462340672, "loss": 0.0485, "step": 1548 }, { "epoch": 0.6726009552757273, "grad_norm": 0.024072032421827316, "learning_rate": 0.0001552723059096176, "loss": 0.0362, "step": 1549 }, { "epoch": 0.6730351715154147, "grad_norm": 0.03336826339364052, "learning_rate": 0.0001552433371958285, "loss": 0.0458, "step": 1550 }, { "epoch": 0.673469387755102, "grad_norm": 0.024622049182653427, "learning_rate": 0.0001552143684820394, "loss": 0.039, "step": 1551 }, { "epoch": 0.6739036039947894, "grad_norm": 0.034681860357522964, "learning_rate": 0.0001551853997682503, "loss": 0.0386, "step": 1552 }, { "epoch": 0.6743378202344767, "grad_norm": 0.028760554268956184, "learning_rate": 0.00015515643105446118, "loss": 0.0393, "step": 1553 }, { "epoch": 0.6747720364741642, "grad_norm": 0.023305267095565796, "learning_rate": 0.0001551274623406721, "loss": 0.0343, "step": 1554 }, { "epoch": 0.6752062527138515, "grad_norm": 0.031411312520504, "learning_rate": 0.000155098493626883, "loss": 0.0484, "step": 1555 }, { "epoch": 0.6756404689535389, "grad_norm": 0.02729707770049572, "learning_rate": 0.00015506952491309384, "loss": 0.0423, "step": 1556 }, { "epoch": 0.6760746851932262, "grad_norm": 0.03239104524254799, "learning_rate": 0.00015504055619930475, "loss": 0.0549, "step": 1557 }, { "epoch": 0.6765089014329136, "grad_norm": 0.034528665244579315, "learning_rate": 0.00015501158748551565, "loss": 0.0436, "step": 1558 }, { "epoch": 0.676943117672601, "grad_norm": 0.028282299637794495, "learning_rate": 0.00015498261877172655, "loss": 0.0471, "step": 1559 }, { "epoch": 0.6773773339122883, "grad_norm": 0.028063194826245308, "learning_rate": 0.00015495365005793743, "loss": 0.0368, "step": 1560 }, { "epoch": 0.6778115501519757, "grad_norm": 0.032785333693027496, "learning_rate": 0.00015492468134414833, "loss": 0.0513, "step": 1561 }, { "epoch": 0.678245766391663, "grad_norm": 0.03177879750728607, "learning_rate": 0.00015489571263035924, "loss": 0.045, "step": 1562 }, { "epoch": 0.6786799826313504, "grad_norm": 0.03344086557626724, "learning_rate": 0.00015486674391657011, "loss": 0.0439, "step": 1563 }, { "epoch": 0.6791141988710377, "grad_norm": 0.043403975665569305, "learning_rate": 0.000154837775202781, "loss": 0.0644, "step": 1564 }, { "epoch": 0.6795484151107252, "grad_norm": 0.03149336203932762, "learning_rate": 0.0001548088064889919, "loss": 0.0489, "step": 1565 }, { "epoch": 0.6799826313504125, "grad_norm": 0.027319349348545074, "learning_rate": 0.0001547798377752028, "loss": 0.038, "step": 1566 }, { "epoch": 0.6804168475900999, "grad_norm": 0.03337555378675461, "learning_rate": 0.00015475086906141367, "loss": 0.0454, "step": 1567 }, { "epoch": 0.6808510638297872, "grad_norm": 0.030514812096953392, "learning_rate": 0.00015472190034762458, "loss": 0.0439, "step": 1568 }, { "epoch": 0.6812852800694746, "grad_norm": 0.03613261133432388, "learning_rate": 0.00015469293163383548, "loss": 0.0527, "step": 1569 }, { "epoch": 0.681719496309162, "grad_norm": 0.026893509551882744, "learning_rate": 0.00015466396292004636, "loss": 0.0297, "step": 1570 }, { "epoch": 0.6821537125488494, "grad_norm": 0.04109145328402519, "learning_rate": 0.00015463499420625724, "loss": 0.0544, "step": 1571 }, { "epoch": 0.6825879287885367, "grad_norm": 0.029721425846219063, "learning_rate": 0.00015460602549246814, "loss": 0.0498, "step": 1572 }, { "epoch": 0.683022145028224, "grad_norm": 0.031085073947906494, "learning_rate": 0.00015457705677867904, "loss": 0.0453, "step": 1573 }, { "epoch": 0.6834563612679114, "grad_norm": 0.029351839795708656, "learning_rate": 0.00015454808806488992, "loss": 0.0363, "step": 1574 }, { "epoch": 0.6838905775075987, "grad_norm": 0.027086962014436722, "learning_rate": 0.00015451911935110082, "loss": 0.043, "step": 1575 }, { "epoch": 0.6843247937472862, "grad_norm": 0.027616078034043312, "learning_rate": 0.00015449015063731173, "loss": 0.0389, "step": 1576 }, { "epoch": 0.6847590099869735, "grad_norm": 0.06689398735761642, "learning_rate": 0.0001544611819235226, "loss": 0.0563, "step": 1577 }, { "epoch": 0.6851932262266609, "grad_norm": 0.024439208209514618, "learning_rate": 0.00015443221320973348, "loss": 0.0363, "step": 1578 }, { "epoch": 0.6856274424663482, "grad_norm": 0.026680512353777885, "learning_rate": 0.00015440324449594438, "loss": 0.0412, "step": 1579 }, { "epoch": 0.6860616587060356, "grad_norm": 0.02772834524512291, "learning_rate": 0.0001543742757821553, "loss": 0.0458, "step": 1580 }, { "epoch": 0.686495874945723, "grad_norm": 0.03820192813873291, "learning_rate": 0.00015434530706836616, "loss": 0.0567, "step": 1581 }, { "epoch": 0.6869300911854104, "grad_norm": 0.03356015682220459, "learning_rate": 0.00015431633835457707, "loss": 0.0535, "step": 1582 }, { "epoch": 0.6873643074250977, "grad_norm": 0.0358148068189621, "learning_rate": 0.00015428736964078797, "loss": 0.0484, "step": 1583 }, { "epoch": 0.6877985236647851, "grad_norm": 0.03814147412776947, "learning_rate": 0.00015425840092699885, "loss": 0.055, "step": 1584 }, { "epoch": 0.6882327399044724, "grad_norm": 0.028581861406564713, "learning_rate": 0.00015422943221320972, "loss": 0.0397, "step": 1585 }, { "epoch": 0.6886669561441597, "grad_norm": 0.031398843973875046, "learning_rate": 0.00015420046349942063, "loss": 0.0448, "step": 1586 }, { "epoch": 0.6891011723838472, "grad_norm": 0.025692125782370567, "learning_rate": 0.00015417149478563153, "loss": 0.0344, "step": 1587 }, { "epoch": 0.6895353886235345, "grad_norm": 0.027093559503555298, "learning_rate": 0.0001541425260718424, "loss": 0.0398, "step": 1588 }, { "epoch": 0.6899696048632219, "grad_norm": 0.032455578446388245, "learning_rate": 0.0001541135573580533, "loss": 0.0485, "step": 1589 }, { "epoch": 0.6904038211029092, "grad_norm": 0.026808438822627068, "learning_rate": 0.00015408458864426422, "loss": 0.0393, "step": 1590 }, { "epoch": 0.6908380373425966, "grad_norm": 0.030161531642079353, "learning_rate": 0.0001540556199304751, "loss": 0.0425, "step": 1591 }, { "epoch": 0.691272253582284, "grad_norm": 0.03138190135359764, "learning_rate": 0.00015402665121668597, "loss": 0.0375, "step": 1592 }, { "epoch": 0.6917064698219714, "grad_norm": 0.028684111312031746, "learning_rate": 0.00015399768250289687, "loss": 0.0425, "step": 1593 }, { "epoch": 0.6921406860616587, "grad_norm": 0.025148481130599976, "learning_rate": 0.00015396871378910778, "loss": 0.0309, "step": 1594 }, { "epoch": 0.6925749023013461, "grad_norm": 0.037633106112480164, "learning_rate": 0.00015393974507531865, "loss": 0.0523, "step": 1595 }, { "epoch": 0.6930091185410334, "grad_norm": 0.03062477521598339, "learning_rate": 0.00015391077636152956, "loss": 0.0418, "step": 1596 }, { "epoch": 0.6934433347807208, "grad_norm": 0.02685404196381569, "learning_rate": 0.00015388180764774046, "loss": 0.0363, "step": 1597 }, { "epoch": 0.6938775510204082, "grad_norm": 0.024993907660245895, "learning_rate": 0.00015385283893395134, "loss": 0.037, "step": 1598 }, { "epoch": 0.6943117672600956, "grad_norm": 0.028064941987395287, "learning_rate": 0.00015382387022016221, "loss": 0.0407, "step": 1599 }, { "epoch": 0.6947459834997829, "grad_norm": 0.03451312705874443, "learning_rate": 0.00015379490150637312, "loss": 0.0429, "step": 1600 }, { "epoch": 0.6951801997394702, "grad_norm": 0.03249617666006088, "learning_rate": 0.00015376593279258402, "loss": 0.0394, "step": 1601 }, { "epoch": 0.6956144159791576, "grad_norm": 0.02652798593044281, "learning_rate": 0.0001537369640787949, "loss": 0.0339, "step": 1602 }, { "epoch": 0.6960486322188449, "grad_norm": 0.029300322756171227, "learning_rate": 0.0001537079953650058, "loss": 0.0328, "step": 1603 }, { "epoch": 0.6964828484585324, "grad_norm": 0.02747003175318241, "learning_rate": 0.0001536790266512167, "loss": 0.0335, "step": 1604 }, { "epoch": 0.6969170646982197, "grad_norm": 0.03720302879810333, "learning_rate": 0.00015365005793742758, "loss": 0.0578, "step": 1605 }, { "epoch": 0.6973512809379071, "grad_norm": 0.03467237949371338, "learning_rate": 0.00015362108922363846, "loss": 0.0396, "step": 1606 }, { "epoch": 0.6977854971775944, "grad_norm": 0.03890318050980568, "learning_rate": 0.00015359212050984936, "loss": 0.0573, "step": 1607 }, { "epoch": 0.6982197134172818, "grad_norm": 0.030959803611040115, "learning_rate": 0.00015356315179606027, "loss": 0.0427, "step": 1608 }, { "epoch": 0.6986539296569692, "grad_norm": 0.022995518520474434, "learning_rate": 0.00015353418308227114, "loss": 0.0268, "step": 1609 }, { "epoch": 0.6990881458966566, "grad_norm": 0.03458115831017494, "learning_rate": 0.00015350521436848205, "loss": 0.0449, "step": 1610 }, { "epoch": 0.6995223621363439, "grad_norm": 0.036853477358818054, "learning_rate": 0.00015347624565469295, "loss": 0.0516, "step": 1611 }, { "epoch": 0.6999565783760313, "grad_norm": 0.03013487532734871, "learning_rate": 0.00015344727694090383, "loss": 0.0444, "step": 1612 }, { "epoch": 0.7003907946157186, "grad_norm": 0.030262039974331856, "learning_rate": 0.0001534183082271147, "loss": 0.0485, "step": 1613 }, { "epoch": 0.7008250108554059, "grad_norm": 0.03277484327554703, "learning_rate": 0.0001533893395133256, "loss": 0.0479, "step": 1614 }, { "epoch": 0.7012592270950934, "grad_norm": 0.03163651376962662, "learning_rate": 0.0001533603707995365, "loss": 0.0424, "step": 1615 }, { "epoch": 0.7016934433347807, "grad_norm": 0.03240131959319115, "learning_rate": 0.00015333140208574741, "loss": 0.0383, "step": 1616 }, { "epoch": 0.7021276595744681, "grad_norm": 0.03493519127368927, "learning_rate": 0.0001533024333719583, "loss": 0.0447, "step": 1617 }, { "epoch": 0.7025618758141554, "grad_norm": 0.03503197431564331, "learning_rate": 0.0001532734646581692, "loss": 0.0507, "step": 1618 }, { "epoch": 0.7029960920538428, "grad_norm": 0.02385438047349453, "learning_rate": 0.00015324449594438007, "loss": 0.0352, "step": 1619 }, { "epoch": 0.7034303082935301, "grad_norm": 0.031034676358103752, "learning_rate": 0.00015321552723059095, "loss": 0.0448, "step": 1620 }, { "epoch": 0.7038645245332176, "grad_norm": 0.03196074813604355, "learning_rate": 0.00015318655851680185, "loss": 0.0436, "step": 1621 }, { "epoch": 0.7042987407729049, "grad_norm": 0.04043018817901611, "learning_rate": 0.00015315758980301276, "loss": 0.0439, "step": 1622 }, { "epoch": 0.7047329570125923, "grad_norm": 0.03125888854265213, "learning_rate": 0.00015312862108922366, "loss": 0.0439, "step": 1623 }, { "epoch": 0.7051671732522796, "grad_norm": 0.029770929366350174, "learning_rate": 0.00015309965237543454, "loss": 0.0389, "step": 1624 }, { "epoch": 0.705601389491967, "grad_norm": 0.040373291820287704, "learning_rate": 0.00015307068366164544, "loss": 0.0439, "step": 1625 }, { "epoch": 0.7060356057316544, "grad_norm": 0.02756352722644806, "learning_rate": 0.00015304171494785632, "loss": 0.0353, "step": 1626 }, { "epoch": 0.7064698219713418, "grad_norm": 0.04499821737408638, "learning_rate": 0.0001530127462340672, "loss": 0.0565, "step": 1627 }, { "epoch": 0.7069040382110291, "grad_norm": 0.033047325909137726, "learning_rate": 0.0001529837775202781, "loss": 0.0456, "step": 1628 }, { "epoch": 0.7073382544507164, "grad_norm": 0.029898907989263535, "learning_rate": 0.000152954808806489, "loss": 0.0425, "step": 1629 }, { "epoch": 0.7077724706904038, "grad_norm": 0.024088894948363304, "learning_rate": 0.0001529258400926999, "loss": 0.0323, "step": 1630 }, { "epoch": 0.7082066869300911, "grad_norm": 0.02366037853062153, "learning_rate": 0.00015289687137891078, "loss": 0.0331, "step": 1631 }, { "epoch": 0.7086409031697786, "grad_norm": 0.03334007039666176, "learning_rate": 0.00015286790266512168, "loss": 0.0533, "step": 1632 }, { "epoch": 0.7090751194094659, "grad_norm": 0.024834249168634415, "learning_rate": 0.0001528389339513326, "loss": 0.0418, "step": 1633 }, { "epoch": 0.7095093356491533, "grad_norm": 0.022928979247808456, "learning_rate": 0.00015280996523754344, "loss": 0.0318, "step": 1634 }, { "epoch": 0.7099435518888406, "grad_norm": 0.032361529767513275, "learning_rate": 0.00015278099652375434, "loss": 0.045, "step": 1635 }, { "epoch": 0.710377768128528, "grad_norm": 0.026199564337730408, "learning_rate": 0.00015275202780996524, "loss": 0.0401, "step": 1636 }, { "epoch": 0.7108119843682154, "grad_norm": 0.028543060645461082, "learning_rate": 0.00015272305909617615, "loss": 0.0435, "step": 1637 }, { "epoch": 0.7112462006079028, "grad_norm": 0.03430711850523949, "learning_rate": 0.00015269409038238703, "loss": 0.0453, "step": 1638 }, { "epoch": 0.7116804168475901, "grad_norm": 0.031084297224879265, "learning_rate": 0.00015266512166859793, "loss": 0.0479, "step": 1639 }, { "epoch": 0.7121146330872775, "grad_norm": 0.05658046901226044, "learning_rate": 0.00015263615295480883, "loss": 0.0699, "step": 1640 }, { "epoch": 0.7125488493269648, "grad_norm": 0.027034010738134384, "learning_rate": 0.0001526071842410197, "loss": 0.046, "step": 1641 }, { "epoch": 0.7129830655666521, "grad_norm": 0.028292851522564888, "learning_rate": 0.00015257821552723059, "loss": 0.0393, "step": 1642 }, { "epoch": 0.7134172818063396, "grad_norm": 0.023892875760793686, "learning_rate": 0.0001525492468134415, "loss": 0.0348, "step": 1643 }, { "epoch": 0.7138514980460269, "grad_norm": 0.03437713906168938, "learning_rate": 0.0001525202780996524, "loss": 0.0508, "step": 1644 }, { "epoch": 0.7142857142857143, "grad_norm": 0.030125407502055168, "learning_rate": 0.00015249130938586327, "loss": 0.0384, "step": 1645 }, { "epoch": 0.7147199305254016, "grad_norm": 0.033370718359947205, "learning_rate": 0.00015246234067207417, "loss": 0.0474, "step": 1646 }, { "epoch": 0.715154146765089, "grad_norm": 0.03798152878880501, "learning_rate": 0.00015243337195828508, "loss": 0.0549, "step": 1647 }, { "epoch": 0.7155883630047764, "grad_norm": 0.03317031264305115, "learning_rate": 0.00015240440324449595, "loss": 0.0504, "step": 1648 }, { "epoch": 0.7160225792444638, "grad_norm": 0.02536538615822792, "learning_rate": 0.00015237543453070683, "loss": 0.0404, "step": 1649 }, { "epoch": 0.7164567954841511, "grad_norm": 0.031564582139253616, "learning_rate": 0.00015234646581691773, "loss": 0.0497, "step": 1650 }, { "epoch": 0.7168910117238385, "grad_norm": 0.02587929554283619, "learning_rate": 0.00015231749710312864, "loss": 0.0434, "step": 1651 }, { "epoch": 0.7173252279635258, "grad_norm": 0.027461275458335876, "learning_rate": 0.00015228852838933951, "loss": 0.0429, "step": 1652 }, { "epoch": 0.7177594442032132, "grad_norm": 0.029351389035582542, "learning_rate": 0.00015225955967555042, "loss": 0.0452, "step": 1653 }, { "epoch": 0.7181936604429006, "grad_norm": 0.030585337430238724, "learning_rate": 0.00015223059096176132, "loss": 0.0444, "step": 1654 }, { "epoch": 0.7186278766825879, "grad_norm": 0.033091943711042404, "learning_rate": 0.0001522016222479722, "loss": 0.0518, "step": 1655 }, { "epoch": 0.7190620929222753, "grad_norm": 0.03343244269490242, "learning_rate": 0.00015217265353418308, "loss": 0.048, "step": 1656 }, { "epoch": 0.7194963091619626, "grad_norm": 0.025115443393588066, "learning_rate": 0.00015214368482039398, "loss": 0.0353, "step": 1657 }, { "epoch": 0.71993052540165, "grad_norm": 0.03206261247396469, "learning_rate": 0.00015211471610660488, "loss": 0.0424, "step": 1658 }, { "epoch": 0.7203647416413373, "grad_norm": 0.030260808765888214, "learning_rate": 0.00015208574739281576, "loss": 0.0434, "step": 1659 }, { "epoch": 0.7207989578810248, "grad_norm": 0.03323477879166603, "learning_rate": 0.00015205677867902666, "loss": 0.0502, "step": 1660 }, { "epoch": 0.7212331741207121, "grad_norm": 0.03251457214355469, "learning_rate": 0.00015202780996523757, "loss": 0.0394, "step": 1661 }, { "epoch": 0.7216673903603995, "grad_norm": 0.04038078710436821, "learning_rate": 0.00015199884125144844, "loss": 0.045, "step": 1662 }, { "epoch": 0.7221016066000868, "grad_norm": 0.03354761004447937, "learning_rate": 0.00015196987253765932, "loss": 0.0494, "step": 1663 }, { "epoch": 0.7225358228397742, "grad_norm": 0.02768930047750473, "learning_rate": 0.00015194090382387022, "loss": 0.0373, "step": 1664 }, { "epoch": 0.7229700390794616, "grad_norm": 0.03479521721601486, "learning_rate": 0.00015191193511008113, "loss": 0.0613, "step": 1665 }, { "epoch": 0.723404255319149, "grad_norm": 0.0355863980948925, "learning_rate": 0.000151882966396292, "loss": 0.0449, "step": 1666 }, { "epoch": 0.7238384715588363, "grad_norm": 0.027273626998066902, "learning_rate": 0.0001518539976825029, "loss": 0.0437, "step": 1667 }, { "epoch": 0.7242726877985237, "grad_norm": 0.034624580293893814, "learning_rate": 0.0001518250289687138, "loss": 0.0449, "step": 1668 }, { "epoch": 0.724706904038211, "grad_norm": 0.025260083377361298, "learning_rate": 0.0001517960602549247, "loss": 0.0371, "step": 1669 }, { "epoch": 0.7251411202778983, "grad_norm": 0.031167054548859596, "learning_rate": 0.00015176709154113556, "loss": 0.0461, "step": 1670 }, { "epoch": 0.7255753365175858, "grad_norm": 0.029753761366009712, "learning_rate": 0.00015173812282734647, "loss": 0.0413, "step": 1671 }, { "epoch": 0.7260095527572731, "grad_norm": 0.03511873260140419, "learning_rate": 0.00015170915411355737, "loss": 0.051, "step": 1672 }, { "epoch": 0.7264437689969605, "grad_norm": 0.03533425182104111, "learning_rate": 0.00015168018539976825, "loss": 0.0382, "step": 1673 }, { "epoch": 0.7268779852366478, "grad_norm": 0.025410547852516174, "learning_rate": 0.00015165121668597915, "loss": 0.0327, "step": 1674 }, { "epoch": 0.7273122014763352, "grad_norm": 0.031114498153328896, "learning_rate": 0.00015162224797219006, "loss": 0.0474, "step": 1675 }, { "epoch": 0.7277464177160226, "grad_norm": 0.028971880674362183, "learning_rate": 0.00015159327925840093, "loss": 0.0445, "step": 1676 }, { "epoch": 0.72818063395571, "grad_norm": 0.024142326787114143, "learning_rate": 0.0001515643105446118, "loss": 0.0287, "step": 1677 }, { "epoch": 0.7286148501953973, "grad_norm": 0.04137092083692551, "learning_rate": 0.0001515353418308227, "loss": 0.057, "step": 1678 }, { "epoch": 0.7290490664350847, "grad_norm": 0.024550095200538635, "learning_rate": 0.00015150637311703362, "loss": 0.035, "step": 1679 }, { "epoch": 0.729483282674772, "grad_norm": 0.032809630036354065, "learning_rate": 0.0001514774044032445, "loss": 0.045, "step": 1680 }, { "epoch": 0.7299174989144595, "grad_norm": 0.025663431733846664, "learning_rate": 0.0001514484356894554, "loss": 0.0388, "step": 1681 }, { "epoch": 0.7303517151541468, "grad_norm": 0.02450251765549183, "learning_rate": 0.0001514194669756663, "loss": 0.0416, "step": 1682 }, { "epoch": 0.7307859313938341, "grad_norm": 0.025116385892033577, "learning_rate": 0.00015139049826187718, "loss": 0.0362, "step": 1683 }, { "epoch": 0.7312201476335215, "grad_norm": 0.027830135077238083, "learning_rate": 0.00015136152954808805, "loss": 0.0411, "step": 1684 }, { "epoch": 0.7316543638732088, "grad_norm": 0.03095991350710392, "learning_rate": 0.00015133256083429896, "loss": 0.0443, "step": 1685 }, { "epoch": 0.7320885801128962, "grad_norm": 0.04089808836579323, "learning_rate": 0.00015130359212050986, "loss": 0.054, "step": 1686 }, { "epoch": 0.7325227963525835, "grad_norm": 0.026857538148760796, "learning_rate": 0.00015127462340672074, "loss": 0.0404, "step": 1687 }, { "epoch": 0.732957012592271, "grad_norm": 0.031129350885748863, "learning_rate": 0.00015124565469293164, "loss": 0.0397, "step": 1688 }, { "epoch": 0.7333912288319583, "grad_norm": 0.0353299118578434, "learning_rate": 0.00015121668597914255, "loss": 0.0527, "step": 1689 }, { "epoch": 0.7338254450716457, "grad_norm": 0.030377980321645737, "learning_rate": 0.00015118771726535342, "loss": 0.0423, "step": 1690 }, { "epoch": 0.734259661311333, "grad_norm": 0.03202570602297783, "learning_rate": 0.0001511587485515643, "loss": 0.0422, "step": 1691 }, { "epoch": 0.7346938775510204, "grad_norm": 0.032367855310440063, "learning_rate": 0.0001511297798377752, "loss": 0.0416, "step": 1692 }, { "epoch": 0.7351280937907078, "grad_norm": 0.02346085198223591, "learning_rate": 0.0001511008111239861, "loss": 0.0348, "step": 1693 }, { "epoch": 0.7355623100303952, "grad_norm": 0.03206723928451538, "learning_rate": 0.00015107184241019698, "loss": 0.0468, "step": 1694 }, { "epoch": 0.7359965262700825, "grad_norm": 0.031016884371638298, "learning_rate": 0.0001510428736964079, "loss": 0.0405, "step": 1695 }, { "epoch": 0.7364307425097699, "grad_norm": 0.024076448753476143, "learning_rate": 0.0001510139049826188, "loss": 0.0336, "step": 1696 }, { "epoch": 0.7368649587494572, "grad_norm": 0.027587424963712692, "learning_rate": 0.00015098493626882967, "loss": 0.0382, "step": 1697 }, { "epoch": 0.7372991749891445, "grad_norm": 0.03351308032870293, "learning_rate": 0.00015095596755504054, "loss": 0.0496, "step": 1698 }, { "epoch": 0.737733391228832, "grad_norm": 0.02458728663623333, "learning_rate": 0.00015092699884125145, "loss": 0.0363, "step": 1699 }, { "epoch": 0.7381676074685193, "grad_norm": 0.028546888381242752, "learning_rate": 0.00015089803012746235, "loss": 0.0406, "step": 1700 }, { "epoch": 0.7386018237082067, "grad_norm": 0.030979886651039124, "learning_rate": 0.00015086906141367323, "loss": 0.0459, "step": 1701 }, { "epoch": 0.739036039947894, "grad_norm": 0.032552603632211685, "learning_rate": 0.00015084009269988413, "loss": 0.0455, "step": 1702 }, { "epoch": 0.7394702561875814, "grad_norm": 0.032135140150785446, "learning_rate": 0.00015081112398609504, "loss": 0.0395, "step": 1703 }, { "epoch": 0.7399044724272688, "grad_norm": 0.029641365632414818, "learning_rate": 0.0001507821552723059, "loss": 0.0422, "step": 1704 }, { "epoch": 0.7403386886669562, "grad_norm": 0.02725786715745926, "learning_rate": 0.0001507531865585168, "loss": 0.0352, "step": 1705 }, { "epoch": 0.7407729049066435, "grad_norm": 0.03272905945777893, "learning_rate": 0.0001507242178447277, "loss": 0.0467, "step": 1706 }, { "epoch": 0.7412071211463309, "grad_norm": 0.030197452753782272, "learning_rate": 0.0001506952491309386, "loss": 0.0475, "step": 1707 }, { "epoch": 0.7416413373860182, "grad_norm": 0.0308021642267704, "learning_rate": 0.0001506662804171495, "loss": 0.0365, "step": 1708 }, { "epoch": 0.7420755536257057, "grad_norm": 0.028341153636574745, "learning_rate": 0.00015063731170336038, "loss": 0.0386, "step": 1709 }, { "epoch": 0.742509769865393, "grad_norm": 0.025394557043910027, "learning_rate": 0.00015060834298957128, "loss": 0.037, "step": 1710 }, { "epoch": 0.7429439861050803, "grad_norm": 0.029661649838089943, "learning_rate": 0.00015057937427578218, "loss": 0.0384, "step": 1711 }, { "epoch": 0.7433782023447677, "grad_norm": 0.037196483463048935, "learning_rate": 0.00015055040556199303, "loss": 0.0586, "step": 1712 }, { "epoch": 0.743812418584455, "grad_norm": 0.02299555391073227, "learning_rate": 0.00015052143684820394, "loss": 0.0345, "step": 1713 }, { "epoch": 0.7442466348241424, "grad_norm": 0.0393168106675148, "learning_rate": 0.00015049246813441484, "loss": 0.0651, "step": 1714 }, { "epoch": 0.7446808510638298, "grad_norm": 0.03396942466497421, "learning_rate": 0.00015046349942062574, "loss": 0.0468, "step": 1715 }, { "epoch": 0.7451150673035172, "grad_norm": 0.03599150478839874, "learning_rate": 0.00015043453070683662, "loss": 0.047, "step": 1716 }, { "epoch": 0.7455492835432045, "grad_norm": 0.030744124203920364, "learning_rate": 0.00015040556199304752, "loss": 0.0444, "step": 1717 }, { "epoch": 0.7459834997828919, "grad_norm": 0.025842992588877678, "learning_rate": 0.00015037659327925843, "loss": 0.0343, "step": 1718 }, { "epoch": 0.7464177160225792, "grad_norm": 0.03039276972413063, "learning_rate": 0.0001503476245654693, "loss": 0.0403, "step": 1719 }, { "epoch": 0.7468519322622666, "grad_norm": 0.03663505241274834, "learning_rate": 0.00015031865585168018, "loss": 0.046, "step": 1720 }, { "epoch": 0.747286148501954, "grad_norm": 0.03413698077201843, "learning_rate": 0.00015028968713789109, "loss": 0.053, "step": 1721 }, { "epoch": 0.7477203647416414, "grad_norm": 0.0284839179366827, "learning_rate": 0.000150260718424102, "loss": 0.0452, "step": 1722 }, { "epoch": 0.7481545809813287, "grad_norm": 0.03334270790219307, "learning_rate": 0.00015023174971031287, "loss": 0.0523, "step": 1723 }, { "epoch": 0.748588797221016, "grad_norm": 0.021794423460960388, "learning_rate": 0.00015020278099652377, "loss": 0.0342, "step": 1724 }, { "epoch": 0.7490230134607034, "grad_norm": 0.035922981798648834, "learning_rate": 0.00015017381228273467, "loss": 0.0507, "step": 1725 }, { "epoch": 0.7494572297003907, "grad_norm": 0.03297586739063263, "learning_rate": 0.00015014484356894555, "loss": 0.0436, "step": 1726 }, { "epoch": 0.7498914459400782, "grad_norm": 0.02781081385910511, "learning_rate": 0.00015011587485515643, "loss": 0.0445, "step": 1727 }, { "epoch": 0.7503256621797655, "grad_norm": 0.032125506550073624, "learning_rate": 0.00015008690614136733, "loss": 0.0476, "step": 1728 }, { "epoch": 0.7507598784194529, "grad_norm": 0.031952571123838425, "learning_rate": 0.00015005793742757823, "loss": 0.0526, "step": 1729 }, { "epoch": 0.7511940946591402, "grad_norm": 0.027338506653904915, "learning_rate": 0.0001500289687137891, "loss": 0.0421, "step": 1730 }, { "epoch": 0.7516283108988276, "grad_norm": 0.03605492785573006, "learning_rate": 0.00015000000000000001, "loss": 0.0547, "step": 1731 }, { "epoch": 0.752062527138515, "grad_norm": 0.03435375913977623, "learning_rate": 0.00014997103128621092, "loss": 0.0398, "step": 1732 }, { "epoch": 0.7524967433782024, "grad_norm": 0.023491602391004562, "learning_rate": 0.0001499420625724218, "loss": 0.0385, "step": 1733 }, { "epoch": 0.7529309596178897, "grad_norm": 0.024943174794316292, "learning_rate": 0.00014991309385863267, "loss": 0.0402, "step": 1734 }, { "epoch": 0.7533651758575771, "grad_norm": 0.03355515003204346, "learning_rate": 0.00014988412514484357, "loss": 0.047, "step": 1735 }, { "epoch": 0.7537993920972644, "grad_norm": 0.047809816896915436, "learning_rate": 0.00014985515643105448, "loss": 0.0658, "step": 1736 }, { "epoch": 0.7542336083369519, "grad_norm": 0.031939759850502014, "learning_rate": 0.00014982618771726535, "loss": 0.0478, "step": 1737 }, { "epoch": 0.7546678245766392, "grad_norm": 0.02584601752460003, "learning_rate": 0.00014979721900347626, "loss": 0.0376, "step": 1738 }, { "epoch": 0.7551020408163265, "grad_norm": 0.038913361728191376, "learning_rate": 0.00014976825028968716, "loss": 0.06, "step": 1739 }, { "epoch": 0.7555362570560139, "grad_norm": 0.0334354005753994, "learning_rate": 0.00014973928157589804, "loss": 0.0539, "step": 1740 }, { "epoch": 0.7559704732957012, "grad_norm": 0.03711209073662758, "learning_rate": 0.00014971031286210892, "loss": 0.0588, "step": 1741 }, { "epoch": 0.7564046895353886, "grad_norm": 0.03223710134625435, "learning_rate": 0.00014968134414831982, "loss": 0.0485, "step": 1742 }, { "epoch": 0.756838905775076, "grad_norm": 0.033604152500629425, "learning_rate": 0.00014965237543453072, "loss": 0.0437, "step": 1743 }, { "epoch": 0.7572731220147634, "grad_norm": 0.03259902819991112, "learning_rate": 0.0001496234067207416, "loss": 0.0486, "step": 1744 }, { "epoch": 0.7577073382544507, "grad_norm": 0.02811816707253456, "learning_rate": 0.0001495944380069525, "loss": 0.047, "step": 1745 }, { "epoch": 0.7581415544941381, "grad_norm": 0.02877199277281761, "learning_rate": 0.0001495654692931634, "loss": 0.0413, "step": 1746 }, { "epoch": 0.7585757707338254, "grad_norm": 0.024196425452828407, "learning_rate": 0.00014953650057937428, "loss": 0.034, "step": 1747 }, { "epoch": 0.7590099869735129, "grad_norm": 0.03628256544470787, "learning_rate": 0.00014950753186558516, "loss": 0.0498, "step": 1748 }, { "epoch": 0.7594442032132002, "grad_norm": 0.03684200718998909, "learning_rate": 0.00014947856315179606, "loss": 0.0498, "step": 1749 }, { "epoch": 0.7598784194528876, "grad_norm": 0.028674205765128136, "learning_rate": 0.00014944959443800697, "loss": 0.0382, "step": 1750 }, { "epoch": 0.7603126356925749, "grad_norm": 0.029287708923220634, "learning_rate": 0.00014942062572421784, "loss": 0.0417, "step": 1751 }, { "epoch": 0.7607468519322622, "grad_norm": 0.0278943981975317, "learning_rate": 0.00014939165701042875, "loss": 0.0347, "step": 1752 }, { "epoch": 0.7611810681719496, "grad_norm": 0.029339062049984932, "learning_rate": 0.00014936268829663965, "loss": 0.0415, "step": 1753 }, { "epoch": 0.761615284411637, "grad_norm": 0.029827000573277473, "learning_rate": 0.00014933371958285053, "loss": 0.046, "step": 1754 }, { "epoch": 0.7620495006513244, "grad_norm": 0.030939823016524315, "learning_rate": 0.0001493047508690614, "loss": 0.0488, "step": 1755 }, { "epoch": 0.7624837168910117, "grad_norm": 0.028964776545763016, "learning_rate": 0.0001492757821552723, "loss": 0.0368, "step": 1756 }, { "epoch": 0.7629179331306991, "grad_norm": 0.03190678730607033, "learning_rate": 0.0001492468134414832, "loss": 0.0426, "step": 1757 }, { "epoch": 0.7633521493703864, "grad_norm": 0.02950860746204853, "learning_rate": 0.0001492178447276941, "loss": 0.0474, "step": 1758 }, { "epoch": 0.7637863656100738, "grad_norm": 0.035344015806913376, "learning_rate": 0.000149188876013905, "loss": 0.0534, "step": 1759 }, { "epoch": 0.7642205818497612, "grad_norm": 0.03360963985323906, "learning_rate": 0.0001491599073001159, "loss": 0.0476, "step": 1760 }, { "epoch": 0.7646547980894486, "grad_norm": 0.0340595617890358, "learning_rate": 0.00014913093858632677, "loss": 0.048, "step": 1761 }, { "epoch": 0.7650890143291359, "grad_norm": 0.034271929413080215, "learning_rate": 0.00014910196987253765, "loss": 0.05, "step": 1762 }, { "epoch": 0.7655232305688233, "grad_norm": 0.042245201766490936, "learning_rate": 0.00014907300115874855, "loss": 0.0658, "step": 1763 }, { "epoch": 0.7659574468085106, "grad_norm": 0.024996474385261536, "learning_rate": 0.00014904403244495946, "loss": 0.0357, "step": 1764 }, { "epoch": 0.7663916630481981, "grad_norm": 0.03811528533697128, "learning_rate": 0.00014901506373117033, "loss": 0.0534, "step": 1765 }, { "epoch": 0.7668258792878854, "grad_norm": 0.039648521691560745, "learning_rate": 0.00014898609501738124, "loss": 0.0462, "step": 1766 }, { "epoch": 0.7672600955275727, "grad_norm": 0.028559433296322823, "learning_rate": 0.00014895712630359214, "loss": 0.0478, "step": 1767 }, { "epoch": 0.7676943117672601, "grad_norm": 0.030502310022711754, "learning_rate": 0.00014892815758980302, "loss": 0.0465, "step": 1768 }, { "epoch": 0.7681285280069474, "grad_norm": 0.02556605450809002, "learning_rate": 0.0001488991888760139, "loss": 0.0372, "step": 1769 }, { "epoch": 0.7685627442466348, "grad_norm": 0.02742738276720047, "learning_rate": 0.0001488702201622248, "loss": 0.0392, "step": 1770 }, { "epoch": 0.7689969604863222, "grad_norm": 0.027262717485427856, "learning_rate": 0.0001488412514484357, "loss": 0.0479, "step": 1771 }, { "epoch": 0.7694311767260096, "grad_norm": 0.030208837240934372, "learning_rate": 0.00014881228273464658, "loss": 0.0432, "step": 1772 }, { "epoch": 0.7698653929656969, "grad_norm": 0.03257419168949127, "learning_rate": 0.00014878331402085748, "loss": 0.0536, "step": 1773 }, { "epoch": 0.7702996092053843, "grad_norm": 0.028239509090781212, "learning_rate": 0.00014875434530706839, "loss": 0.0448, "step": 1774 }, { "epoch": 0.7707338254450716, "grad_norm": 0.025310970842838287, "learning_rate": 0.00014872537659327926, "loss": 0.0344, "step": 1775 }, { "epoch": 0.771168041684759, "grad_norm": 0.024019144475460052, "learning_rate": 0.00014869640787949014, "loss": 0.0373, "step": 1776 }, { "epoch": 0.7716022579244464, "grad_norm": 0.027221249416470528, "learning_rate": 0.00014866743916570104, "loss": 0.0321, "step": 1777 }, { "epoch": 0.7720364741641338, "grad_norm": 0.034973032772541046, "learning_rate": 0.00014863847045191195, "loss": 0.0484, "step": 1778 }, { "epoch": 0.7724706904038211, "grad_norm": 0.030006496235728264, "learning_rate": 0.00014860950173812282, "loss": 0.0348, "step": 1779 }, { "epoch": 0.7729049066435084, "grad_norm": 0.029215192422270775, "learning_rate": 0.00014858053302433373, "loss": 0.0458, "step": 1780 }, { "epoch": 0.7733391228831958, "grad_norm": 0.028527947142720222, "learning_rate": 0.00014855156431054463, "loss": 0.0402, "step": 1781 }, { "epoch": 0.7737733391228832, "grad_norm": 0.029747627675533295, "learning_rate": 0.0001485225955967555, "loss": 0.0498, "step": 1782 }, { "epoch": 0.7742075553625706, "grad_norm": 0.034727491438388824, "learning_rate": 0.00014849362688296638, "loss": 0.0467, "step": 1783 }, { "epoch": 0.7746417716022579, "grad_norm": 0.036345381289720535, "learning_rate": 0.0001484646581691773, "loss": 0.053, "step": 1784 }, { "epoch": 0.7750759878419453, "grad_norm": 0.030906224623322487, "learning_rate": 0.0001484356894553882, "loss": 0.0426, "step": 1785 }, { "epoch": 0.7755102040816326, "grad_norm": 0.030028384178876877, "learning_rate": 0.00014840672074159907, "loss": 0.0402, "step": 1786 }, { "epoch": 0.77594442032132, "grad_norm": 0.03044203668832779, "learning_rate": 0.00014837775202780997, "loss": 0.0465, "step": 1787 }, { "epoch": 0.7763786365610074, "grad_norm": 0.02572561986744404, "learning_rate": 0.00014834878331402088, "loss": 0.042, "step": 1788 }, { "epoch": 0.7768128528006948, "grad_norm": 0.02515152469277382, "learning_rate": 0.00014831981460023178, "loss": 0.0389, "step": 1789 }, { "epoch": 0.7772470690403821, "grad_norm": 0.02753361687064171, "learning_rate": 0.00014829084588644263, "loss": 0.0394, "step": 1790 }, { "epoch": 0.7776812852800695, "grad_norm": 0.03932685777544975, "learning_rate": 0.00014826187717265353, "loss": 0.0567, "step": 1791 }, { "epoch": 0.7781155015197568, "grad_norm": 0.02772199735045433, "learning_rate": 0.00014823290845886444, "loss": 0.0434, "step": 1792 }, { "epoch": 0.7785497177594441, "grad_norm": 0.02586045302450657, "learning_rate": 0.0001482039397450753, "loss": 0.0301, "step": 1793 }, { "epoch": 0.7789839339991316, "grad_norm": 0.02751094289124012, "learning_rate": 0.00014817497103128622, "loss": 0.0347, "step": 1794 }, { "epoch": 0.7794181502388189, "grad_norm": 0.030598388984799385, "learning_rate": 0.00014814600231749712, "loss": 0.045, "step": 1795 }, { "epoch": 0.7798523664785063, "grad_norm": 0.031561478972435, "learning_rate": 0.00014811703360370802, "loss": 0.0388, "step": 1796 }, { "epoch": 0.7802865827181936, "grad_norm": 0.030463743954896927, "learning_rate": 0.0001480880648899189, "loss": 0.041, "step": 1797 }, { "epoch": 0.780720798957881, "grad_norm": 0.027677573263645172, "learning_rate": 0.00014805909617612978, "loss": 0.0409, "step": 1798 }, { "epoch": 0.7811550151975684, "grad_norm": 0.03033914789557457, "learning_rate": 0.00014803012746234068, "loss": 0.0405, "step": 1799 }, { "epoch": 0.7815892314372558, "grad_norm": 0.03364015370607376, "learning_rate": 0.00014800115874855156, "loss": 0.0485, "step": 1800 }, { "epoch": 0.7820234476769431, "grad_norm": 0.033328402787446976, "learning_rate": 0.00014797219003476246, "loss": 0.0545, "step": 1801 }, { "epoch": 0.7824576639166305, "grad_norm": 0.027658531442284584, "learning_rate": 0.00014794322132097336, "loss": 0.0403, "step": 1802 }, { "epoch": 0.7828918801563178, "grad_norm": 0.028674453496932983, "learning_rate": 0.00014791425260718427, "loss": 0.0466, "step": 1803 }, { "epoch": 0.7833260963960053, "grad_norm": 0.03161695599555969, "learning_rate": 0.00014788528389339515, "loss": 0.0442, "step": 1804 }, { "epoch": 0.7837603126356926, "grad_norm": 0.030682679265737534, "learning_rate": 0.00014785631517960602, "loss": 0.0499, "step": 1805 }, { "epoch": 0.78419452887538, "grad_norm": 0.026170626282691956, "learning_rate": 0.00014782734646581693, "loss": 0.0427, "step": 1806 }, { "epoch": 0.7846287451150673, "grad_norm": 0.027271928265690804, "learning_rate": 0.00014779837775202783, "loss": 0.0417, "step": 1807 }, { "epoch": 0.7850629613547546, "grad_norm": 0.030316602438688278, "learning_rate": 0.0001477694090382387, "loss": 0.043, "step": 1808 }, { "epoch": 0.785497177594442, "grad_norm": 0.02797536365687847, "learning_rate": 0.0001477404403244496, "loss": 0.0398, "step": 1809 }, { "epoch": 0.7859313938341294, "grad_norm": 0.03680885210633278, "learning_rate": 0.0001477114716106605, "loss": 0.0487, "step": 1810 }, { "epoch": 0.7863656100738168, "grad_norm": 0.032716114073991776, "learning_rate": 0.0001476825028968714, "loss": 0.0434, "step": 1811 }, { "epoch": 0.7867998263135041, "grad_norm": 0.021492447704076767, "learning_rate": 0.00014765353418308227, "loss": 0.0359, "step": 1812 }, { "epoch": 0.7872340425531915, "grad_norm": 0.02830984629690647, "learning_rate": 0.00014762456546929317, "loss": 0.0458, "step": 1813 }, { "epoch": 0.7876682587928788, "grad_norm": 0.026318999007344246, "learning_rate": 0.00014759559675550407, "loss": 0.0385, "step": 1814 }, { "epoch": 0.7881024750325663, "grad_norm": 0.03066597692668438, "learning_rate": 0.00014756662804171495, "loss": 0.0447, "step": 1815 }, { "epoch": 0.7885366912722536, "grad_norm": 0.033667802810668945, "learning_rate": 0.00014753765932792585, "loss": 0.0527, "step": 1816 }, { "epoch": 0.788970907511941, "grad_norm": 0.030761638656258583, "learning_rate": 0.00014750869061413676, "loss": 0.0425, "step": 1817 }, { "epoch": 0.7894051237516283, "grad_norm": 0.024018844589591026, "learning_rate": 0.00014747972190034763, "loss": 0.0386, "step": 1818 }, { "epoch": 0.7898393399913157, "grad_norm": 0.026631463319063187, "learning_rate": 0.0001474507531865585, "loss": 0.0411, "step": 1819 }, { "epoch": 0.790273556231003, "grad_norm": 0.029595956206321716, "learning_rate": 0.00014742178447276941, "loss": 0.0454, "step": 1820 }, { "epoch": 0.7907077724706904, "grad_norm": 0.027319859713315964, "learning_rate": 0.00014739281575898032, "loss": 0.034, "step": 1821 }, { "epoch": 0.7911419887103778, "grad_norm": 0.03452128916978836, "learning_rate": 0.0001473638470451912, "loss": 0.0485, "step": 1822 }, { "epoch": 0.7915762049500651, "grad_norm": 0.03518221527338028, "learning_rate": 0.0001473348783314021, "loss": 0.0431, "step": 1823 }, { "epoch": 0.7920104211897525, "grad_norm": 0.031061619520187378, "learning_rate": 0.000147305909617613, "loss": 0.0467, "step": 1824 }, { "epoch": 0.7924446374294398, "grad_norm": 0.022288599982857704, "learning_rate": 0.00014727694090382388, "loss": 0.0352, "step": 1825 }, { "epoch": 0.7928788536691272, "grad_norm": 0.029140155762434006, "learning_rate": 0.00014724797219003476, "loss": 0.047, "step": 1826 }, { "epoch": 0.7933130699088146, "grad_norm": 0.04250635951757431, "learning_rate": 0.00014721900347624566, "loss": 0.0686, "step": 1827 }, { "epoch": 0.793747286148502, "grad_norm": 0.037123698741197586, "learning_rate": 0.00014719003476245656, "loss": 0.0511, "step": 1828 }, { "epoch": 0.7941815023881893, "grad_norm": 0.03835690766572952, "learning_rate": 0.00014716106604866744, "loss": 0.0562, "step": 1829 }, { "epoch": 0.7946157186278767, "grad_norm": 0.035167913883924484, "learning_rate": 0.00014713209733487834, "loss": 0.0464, "step": 1830 }, { "epoch": 0.795049934867564, "grad_norm": 0.026079334318637848, "learning_rate": 0.00014710312862108925, "loss": 0.0379, "step": 1831 }, { "epoch": 0.7954841511072515, "grad_norm": 0.03247830644249916, "learning_rate": 0.00014707415990730012, "loss": 0.0536, "step": 1832 }, { "epoch": 0.7959183673469388, "grad_norm": 0.02406691201031208, "learning_rate": 0.000147045191193511, "loss": 0.0381, "step": 1833 }, { "epoch": 0.7963525835866262, "grad_norm": 0.0370798259973526, "learning_rate": 0.0001470162224797219, "loss": 0.048, "step": 1834 }, { "epoch": 0.7967867998263135, "grad_norm": 0.026781072840094566, "learning_rate": 0.0001469872537659328, "loss": 0.0372, "step": 1835 }, { "epoch": 0.7972210160660008, "grad_norm": 0.033603325486183167, "learning_rate": 0.00014695828505214368, "loss": 0.0465, "step": 1836 }, { "epoch": 0.7976552323056882, "grad_norm": 0.03513015806674957, "learning_rate": 0.0001469293163383546, "loss": 0.0445, "step": 1837 }, { "epoch": 0.7980894485453756, "grad_norm": 0.03301960602402687, "learning_rate": 0.0001469003476245655, "loss": 0.0436, "step": 1838 }, { "epoch": 0.798523664785063, "grad_norm": 0.032173238694667816, "learning_rate": 0.00014687137891077637, "loss": 0.0398, "step": 1839 }, { "epoch": 0.7989578810247503, "grad_norm": 0.02650519087910652, "learning_rate": 0.00014684241019698725, "loss": 0.0437, "step": 1840 }, { "epoch": 0.7993920972644377, "grad_norm": 0.02845388650894165, "learning_rate": 0.00014681344148319815, "loss": 0.0385, "step": 1841 }, { "epoch": 0.799826313504125, "grad_norm": 0.03403950855135918, "learning_rate": 0.00014678447276940905, "loss": 0.0449, "step": 1842 }, { "epoch": 0.8002605297438125, "grad_norm": 0.02591974474489689, "learning_rate": 0.00014675550405561993, "loss": 0.0372, "step": 1843 }, { "epoch": 0.8006947459834998, "grad_norm": 0.027077946811914444, "learning_rate": 0.00014672653534183083, "loss": 0.0434, "step": 1844 }, { "epoch": 0.8011289622231872, "grad_norm": 0.027987563982605934, "learning_rate": 0.00014669756662804174, "loss": 0.0396, "step": 1845 }, { "epoch": 0.8015631784628745, "grad_norm": 0.026050908491015434, "learning_rate": 0.0001466685979142526, "loss": 0.0375, "step": 1846 }, { "epoch": 0.8019973947025619, "grad_norm": 0.027219710871577263, "learning_rate": 0.0001466396292004635, "loss": 0.0415, "step": 1847 }, { "epoch": 0.8024316109422492, "grad_norm": 0.04450646787881851, "learning_rate": 0.0001466106604866744, "loss": 0.0754, "step": 1848 }, { "epoch": 0.8028658271819366, "grad_norm": 0.02623901702463627, "learning_rate": 0.0001465816917728853, "loss": 0.0367, "step": 1849 }, { "epoch": 0.803300043421624, "grad_norm": 0.032335102558135986, "learning_rate": 0.00014655272305909617, "loss": 0.0503, "step": 1850 }, { "epoch": 0.8037342596613113, "grad_norm": 0.0233046505600214, "learning_rate": 0.00014652375434530708, "loss": 0.0349, "step": 1851 }, { "epoch": 0.8041684759009987, "grad_norm": 0.02847515605390072, "learning_rate": 0.00014649478563151798, "loss": 0.0454, "step": 1852 }, { "epoch": 0.804602692140686, "grad_norm": 0.03247516602277756, "learning_rate": 0.00014646581691772886, "loss": 0.0407, "step": 1853 }, { "epoch": 0.8050369083803735, "grad_norm": 0.030429450795054436, "learning_rate": 0.00014643684820393973, "loss": 0.0462, "step": 1854 }, { "epoch": 0.8054711246200608, "grad_norm": 0.02825009636580944, "learning_rate": 0.00014640787949015064, "loss": 0.0414, "step": 1855 }, { "epoch": 0.8059053408597482, "grad_norm": 0.025530800223350525, "learning_rate": 0.00014637891077636154, "loss": 0.0366, "step": 1856 }, { "epoch": 0.8063395570994355, "grad_norm": 0.025670845061540604, "learning_rate": 0.00014634994206257242, "loss": 0.0393, "step": 1857 }, { "epoch": 0.8067737733391229, "grad_norm": 0.025360921397805214, "learning_rate": 0.00014632097334878332, "loss": 0.035, "step": 1858 }, { "epoch": 0.8072079895788102, "grad_norm": 0.028164690360426903, "learning_rate": 0.00014629200463499423, "loss": 0.0456, "step": 1859 }, { "epoch": 0.8076422058184977, "grad_norm": 0.035446759313344955, "learning_rate": 0.0001462630359212051, "loss": 0.0553, "step": 1860 }, { "epoch": 0.808076422058185, "grad_norm": 0.028687305748462677, "learning_rate": 0.00014623406720741598, "loss": 0.043, "step": 1861 }, { "epoch": 0.8085106382978723, "grad_norm": 0.03656477481126785, "learning_rate": 0.00014620509849362688, "loss": 0.0545, "step": 1862 }, { "epoch": 0.8089448545375597, "grad_norm": 0.029394442215561867, "learning_rate": 0.0001461761297798378, "loss": 0.0461, "step": 1863 }, { "epoch": 0.809379070777247, "grad_norm": 0.02989893965423107, "learning_rate": 0.00014614716106604866, "loss": 0.0438, "step": 1864 }, { "epoch": 0.8098132870169344, "grad_norm": 0.04195946082472801, "learning_rate": 0.00014611819235225957, "loss": 0.0663, "step": 1865 }, { "epoch": 0.8102475032566218, "grad_norm": 0.02269413322210312, "learning_rate": 0.00014608922363847047, "loss": 0.0308, "step": 1866 }, { "epoch": 0.8106817194963092, "grad_norm": 0.023781191557645798, "learning_rate": 0.00014606025492468137, "loss": 0.0323, "step": 1867 }, { "epoch": 0.8111159357359965, "grad_norm": 0.032525647431612015, "learning_rate": 0.00014603128621089222, "loss": 0.0445, "step": 1868 }, { "epoch": 0.8115501519756839, "grad_norm": 0.03401252254843712, "learning_rate": 0.00014600231749710313, "loss": 0.0393, "step": 1869 }, { "epoch": 0.8119843682153712, "grad_norm": 0.03092198446393013, "learning_rate": 0.00014597334878331403, "loss": 0.0482, "step": 1870 }, { "epoch": 0.8124185844550587, "grad_norm": 0.03762177377939224, "learning_rate": 0.0001459443800695249, "loss": 0.057, "step": 1871 }, { "epoch": 0.812852800694746, "grad_norm": 0.03299945592880249, "learning_rate": 0.0001459154113557358, "loss": 0.0467, "step": 1872 }, { "epoch": 0.8132870169344334, "grad_norm": 0.03162165358662605, "learning_rate": 0.00014588644264194672, "loss": 0.0473, "step": 1873 }, { "epoch": 0.8137212331741207, "grad_norm": 0.035246264189481735, "learning_rate": 0.00014585747392815762, "loss": 0.0449, "step": 1874 }, { "epoch": 0.8141554494138081, "grad_norm": 0.03287537768483162, "learning_rate": 0.0001458285052143685, "loss": 0.0473, "step": 1875 }, { "epoch": 0.8145896656534954, "grad_norm": 0.043122243136167526, "learning_rate": 0.00014579953650057937, "loss": 0.055, "step": 1876 }, { "epoch": 0.8150238818931828, "grad_norm": 0.028666973114013672, "learning_rate": 0.00014577056778679028, "loss": 0.0353, "step": 1877 }, { "epoch": 0.8154580981328702, "grad_norm": 0.03774398937821388, "learning_rate": 0.00014574159907300115, "loss": 0.0508, "step": 1878 }, { "epoch": 0.8158923143725575, "grad_norm": 0.026573821902275085, "learning_rate": 0.00014571263035921206, "loss": 0.0335, "step": 1879 }, { "epoch": 0.8163265306122449, "grad_norm": 0.02338903769850731, "learning_rate": 0.00014568366164542296, "loss": 0.0317, "step": 1880 }, { "epoch": 0.8167607468519322, "grad_norm": 0.03318272903561592, "learning_rate": 0.00014565469293163386, "loss": 0.0409, "step": 1881 }, { "epoch": 0.8171949630916197, "grad_norm": 0.030008522793650627, "learning_rate": 0.00014562572421784474, "loss": 0.0427, "step": 1882 }, { "epoch": 0.817629179331307, "grad_norm": 0.04296838864684105, "learning_rate": 0.00014559675550405562, "loss": 0.0456, "step": 1883 }, { "epoch": 0.8180633955709944, "grad_norm": 0.026238685473799706, "learning_rate": 0.00014556778679026652, "loss": 0.0335, "step": 1884 }, { "epoch": 0.8184976118106817, "grad_norm": 0.02662724256515503, "learning_rate": 0.0001455388180764774, "loss": 0.0352, "step": 1885 }, { "epoch": 0.8189318280503691, "grad_norm": 0.03382530435919762, "learning_rate": 0.0001455098493626883, "loss": 0.052, "step": 1886 }, { "epoch": 0.8193660442900564, "grad_norm": 0.029934851452708244, "learning_rate": 0.0001454808806488992, "loss": 0.0432, "step": 1887 }, { "epoch": 0.8198002605297439, "grad_norm": 0.033714670687913895, "learning_rate": 0.0001454519119351101, "loss": 0.0435, "step": 1888 }, { "epoch": 0.8202344767694312, "grad_norm": 0.031081276014447212, "learning_rate": 0.00014542294322132099, "loss": 0.0408, "step": 1889 }, { "epoch": 0.8206686930091185, "grad_norm": 0.03201887011528015, "learning_rate": 0.00014539397450753186, "loss": 0.0413, "step": 1890 }, { "epoch": 0.8211029092488059, "grad_norm": 0.02954074926674366, "learning_rate": 0.00014536500579374277, "loss": 0.0418, "step": 1891 }, { "epoch": 0.8215371254884932, "grad_norm": 0.03410540521144867, "learning_rate": 0.00014533603707995364, "loss": 0.0459, "step": 1892 }, { "epoch": 0.8219713417281806, "grad_norm": 0.02821778692305088, "learning_rate": 0.00014530706836616455, "loss": 0.0349, "step": 1893 }, { "epoch": 0.822405557967868, "grad_norm": 0.038199715316295624, "learning_rate": 0.00014527809965237545, "loss": 0.0598, "step": 1894 }, { "epoch": 0.8228397742075554, "grad_norm": 0.03057371824979782, "learning_rate": 0.00014524913093858635, "loss": 0.0411, "step": 1895 }, { "epoch": 0.8232739904472427, "grad_norm": 0.026911864057183266, "learning_rate": 0.00014522016222479723, "loss": 0.0423, "step": 1896 }, { "epoch": 0.8237082066869301, "grad_norm": 0.032636940479278564, "learning_rate": 0.0001451911935110081, "loss": 0.0458, "step": 1897 }, { "epoch": 0.8241424229266174, "grad_norm": 0.03207670897245407, "learning_rate": 0.000145162224797219, "loss": 0.0428, "step": 1898 }, { "epoch": 0.8245766391663049, "grad_norm": 0.03155216947197914, "learning_rate": 0.00014513325608342991, "loss": 0.041, "step": 1899 }, { "epoch": 0.8250108554059922, "grad_norm": 0.02336292341351509, "learning_rate": 0.0001451042873696408, "loss": 0.0327, "step": 1900 }, { "epoch": 0.8254450716456796, "grad_norm": 0.02507534623146057, "learning_rate": 0.0001450753186558517, "loss": 0.038, "step": 1901 }, { "epoch": 0.8258792878853669, "grad_norm": 0.03299584612250328, "learning_rate": 0.0001450463499420626, "loss": 0.0442, "step": 1902 }, { "epoch": 0.8263135041250543, "grad_norm": 0.035627298057079315, "learning_rate": 0.00014501738122827347, "loss": 0.0435, "step": 1903 }, { "epoch": 0.8267477203647416, "grad_norm": 0.0291341133415699, "learning_rate": 0.00014498841251448435, "loss": 0.0437, "step": 1904 }, { "epoch": 0.827181936604429, "grad_norm": 0.027151335030794144, "learning_rate": 0.00014495944380069526, "loss": 0.0411, "step": 1905 }, { "epoch": 0.8276161528441164, "grad_norm": 0.023610608652234077, "learning_rate": 0.00014493047508690616, "loss": 0.0384, "step": 1906 }, { "epoch": 0.8280503690838037, "grad_norm": 0.033798687160015106, "learning_rate": 0.00014490150637311704, "loss": 0.0549, "step": 1907 }, { "epoch": 0.8284845853234911, "grad_norm": 0.03178274258971214, "learning_rate": 0.00014487253765932794, "loss": 0.0438, "step": 1908 }, { "epoch": 0.8289188015631784, "grad_norm": 0.03396683931350708, "learning_rate": 0.00014484356894553884, "loss": 0.0469, "step": 1909 }, { "epoch": 0.8293530178028659, "grad_norm": 0.027894634753465652, "learning_rate": 0.00014481460023174972, "loss": 0.0431, "step": 1910 }, { "epoch": 0.8297872340425532, "grad_norm": 0.0358998142182827, "learning_rate": 0.0001447856315179606, "loss": 0.0512, "step": 1911 }, { "epoch": 0.8302214502822406, "grad_norm": 0.02486182376742363, "learning_rate": 0.0001447566628041715, "loss": 0.0404, "step": 1912 }, { "epoch": 0.8306556665219279, "grad_norm": 0.030249720439314842, "learning_rate": 0.0001447276940903824, "loss": 0.0421, "step": 1913 }, { "epoch": 0.8310898827616153, "grad_norm": 0.07239435613155365, "learning_rate": 0.00014469872537659328, "loss": 0.157, "step": 1914 }, { "epoch": 0.8315240990013026, "grad_norm": 0.03189070150256157, "learning_rate": 0.00014466975666280418, "loss": 0.0468, "step": 1915 }, { "epoch": 0.8319583152409901, "grad_norm": 0.027312200516462326, "learning_rate": 0.0001446407879490151, "loss": 0.0399, "step": 1916 }, { "epoch": 0.8323925314806774, "grad_norm": 0.02700837515294552, "learning_rate": 0.00014461181923522596, "loss": 0.0399, "step": 1917 }, { "epoch": 0.8328267477203647, "grad_norm": 0.029052110388875008, "learning_rate": 0.00014458285052143684, "loss": 0.047, "step": 1918 }, { "epoch": 0.8332609639600521, "grad_norm": 0.026150856167078018, "learning_rate": 0.00014455388180764774, "loss": 0.0387, "step": 1919 }, { "epoch": 0.8336951801997394, "grad_norm": 0.024249473586678505, "learning_rate": 0.00014452491309385865, "loss": 0.0429, "step": 1920 }, { "epoch": 0.8341293964394269, "grad_norm": 0.025982819497585297, "learning_rate": 0.00014449594438006952, "loss": 0.0369, "step": 1921 }, { "epoch": 0.8345636126791142, "grad_norm": 0.02723195217549801, "learning_rate": 0.00014446697566628043, "loss": 0.0393, "step": 1922 }, { "epoch": 0.8349978289188016, "grad_norm": 0.025857344269752502, "learning_rate": 0.00014443800695249133, "loss": 0.0378, "step": 1923 }, { "epoch": 0.8354320451584889, "grad_norm": 0.02449696511030197, "learning_rate": 0.0001444090382387022, "loss": 0.0367, "step": 1924 }, { "epoch": 0.8358662613981763, "grad_norm": 0.0273366030305624, "learning_rate": 0.00014438006952491309, "loss": 0.0367, "step": 1925 }, { "epoch": 0.8363004776378636, "grad_norm": 0.028952650725841522, "learning_rate": 0.000144351100811124, "loss": 0.0501, "step": 1926 }, { "epoch": 0.8367346938775511, "grad_norm": 0.02991628833115101, "learning_rate": 0.0001443221320973349, "loss": 0.0438, "step": 1927 }, { "epoch": 0.8371689101172384, "grad_norm": 0.0242062509059906, "learning_rate": 0.00014429316338354577, "loss": 0.0387, "step": 1928 }, { "epoch": 0.8376031263569258, "grad_norm": 0.02626001089811325, "learning_rate": 0.00014426419466975667, "loss": 0.0373, "step": 1929 }, { "epoch": 0.8380373425966131, "grad_norm": 0.03065614216029644, "learning_rate": 0.00014423522595596758, "loss": 0.0426, "step": 1930 }, { "epoch": 0.8384715588363004, "grad_norm": 0.03538075461983681, "learning_rate": 0.00014420625724217845, "loss": 0.0497, "step": 1931 }, { "epoch": 0.8389057750759878, "grad_norm": 0.02666524425148964, "learning_rate": 0.00014417728852838933, "loss": 0.0379, "step": 1932 }, { "epoch": 0.8393399913156752, "grad_norm": 0.02606046572327614, "learning_rate": 0.00014414831981460023, "loss": 0.0363, "step": 1933 }, { "epoch": 0.8397742075553626, "grad_norm": 0.030609961599111557, "learning_rate": 0.00014411935110081114, "loss": 0.0407, "step": 1934 }, { "epoch": 0.8402084237950499, "grad_norm": 0.03225742653012276, "learning_rate": 0.00014409038238702201, "loss": 0.0493, "step": 1935 }, { "epoch": 0.8406426400347373, "grad_norm": 0.03525082767009735, "learning_rate": 0.00014406141367323292, "loss": 0.0406, "step": 1936 }, { "epoch": 0.8410768562744246, "grad_norm": 0.03507285937666893, "learning_rate": 0.00014403244495944382, "loss": 0.0558, "step": 1937 }, { "epoch": 0.8415110725141121, "grad_norm": 0.03357028216123581, "learning_rate": 0.0001440034762456547, "loss": 0.0492, "step": 1938 }, { "epoch": 0.8419452887537994, "grad_norm": 0.028803694993257523, "learning_rate": 0.00014397450753186558, "loss": 0.0428, "step": 1939 }, { "epoch": 0.8423795049934868, "grad_norm": 0.038658507168293, "learning_rate": 0.00014394553881807648, "loss": 0.0495, "step": 1940 }, { "epoch": 0.8428137212331741, "grad_norm": 0.0392301119863987, "learning_rate": 0.00014391657010428738, "loss": 0.0618, "step": 1941 }, { "epoch": 0.8432479374728615, "grad_norm": 0.03335396945476532, "learning_rate": 0.00014388760139049826, "loss": 0.0367, "step": 1942 }, { "epoch": 0.8436821537125488, "grad_norm": 0.03346993029117584, "learning_rate": 0.00014385863267670916, "loss": 0.0521, "step": 1943 }, { "epoch": 0.8441163699522363, "grad_norm": 0.028737748041749, "learning_rate": 0.00014382966396292007, "loss": 0.044, "step": 1944 }, { "epoch": 0.8445505861919236, "grad_norm": 0.027634650468826294, "learning_rate": 0.00014380069524913094, "loss": 0.046, "step": 1945 }, { "epoch": 0.8449848024316109, "grad_norm": 0.025698808953166008, "learning_rate": 0.00014377172653534182, "loss": 0.0384, "step": 1946 }, { "epoch": 0.8454190186712983, "grad_norm": 0.020889753475785255, "learning_rate": 0.00014374275782155272, "loss": 0.0304, "step": 1947 }, { "epoch": 0.8458532349109856, "grad_norm": 0.03461039811372757, "learning_rate": 0.00014371378910776363, "loss": 0.0536, "step": 1948 }, { "epoch": 0.846287451150673, "grad_norm": 0.022404206916689873, "learning_rate": 0.0001436848203939745, "loss": 0.0306, "step": 1949 }, { "epoch": 0.8467216673903604, "grad_norm": 0.03199774771928787, "learning_rate": 0.0001436558516801854, "loss": 0.0467, "step": 1950 }, { "epoch": 0.8471558836300478, "grad_norm": 0.03409325331449509, "learning_rate": 0.0001436268829663963, "loss": 0.0495, "step": 1951 }, { "epoch": 0.8475900998697351, "grad_norm": 0.028994014486670494, "learning_rate": 0.00014359791425260721, "loss": 0.0424, "step": 1952 }, { "epoch": 0.8480243161094225, "grad_norm": 0.031292881816625595, "learning_rate": 0.0001435689455388181, "loss": 0.0426, "step": 1953 }, { "epoch": 0.8484585323491098, "grad_norm": 0.0330350361764431, "learning_rate": 0.00014353997682502897, "loss": 0.0546, "step": 1954 }, { "epoch": 0.8488927485887973, "grad_norm": 0.026546306908130646, "learning_rate": 0.00014351100811123987, "loss": 0.0365, "step": 1955 }, { "epoch": 0.8493269648284846, "grad_norm": 0.05160243809223175, "learning_rate": 0.00014348203939745075, "loss": 0.0977, "step": 1956 }, { "epoch": 0.849761181068172, "grad_norm": 0.02403208240866661, "learning_rate": 0.00014345307068366165, "loss": 0.0378, "step": 1957 }, { "epoch": 0.8501953973078593, "grad_norm": 0.031874917447566986, "learning_rate": 0.00014342410196987256, "loss": 0.0378, "step": 1958 }, { "epoch": 0.8506296135475466, "grad_norm": 0.03166304528713226, "learning_rate": 0.00014339513325608346, "loss": 0.045, "step": 1959 }, { "epoch": 0.851063829787234, "grad_norm": 0.02752501703798771, "learning_rate": 0.00014336616454229434, "loss": 0.0376, "step": 1960 }, { "epoch": 0.8514980460269214, "grad_norm": 0.035289134830236435, "learning_rate": 0.0001433371958285052, "loss": 0.0527, "step": 1961 }, { "epoch": 0.8519322622666088, "grad_norm": 0.03643534705042839, "learning_rate": 0.00014330822711471612, "loss": 0.0393, "step": 1962 }, { "epoch": 0.8523664785062961, "grad_norm": 0.03255149722099304, "learning_rate": 0.000143279258400927, "loss": 0.0376, "step": 1963 }, { "epoch": 0.8528006947459835, "grad_norm": 0.030566062778234482, "learning_rate": 0.0001432502896871379, "loss": 0.0421, "step": 1964 }, { "epoch": 0.8532349109856708, "grad_norm": 0.028955625370144844, "learning_rate": 0.0001432213209733488, "loss": 0.0424, "step": 1965 }, { "epoch": 0.8536691272253583, "grad_norm": 0.02823384292423725, "learning_rate": 0.0001431923522595597, "loss": 0.0453, "step": 1966 }, { "epoch": 0.8541033434650456, "grad_norm": 0.028729312121868134, "learning_rate": 0.00014316338354577058, "loss": 0.0384, "step": 1967 }, { "epoch": 0.854537559704733, "grad_norm": 0.03356658294796944, "learning_rate": 0.00014313441483198146, "loss": 0.0466, "step": 1968 }, { "epoch": 0.8549717759444203, "grad_norm": 0.02687694877386093, "learning_rate": 0.00014310544611819236, "loss": 0.035, "step": 1969 }, { "epoch": 0.8554059921841077, "grad_norm": 0.026242408901453018, "learning_rate": 0.00014307647740440324, "loss": 0.0345, "step": 1970 }, { "epoch": 0.855840208423795, "grad_norm": 0.03178303688764572, "learning_rate": 0.00014304750869061414, "loss": 0.0391, "step": 1971 }, { "epoch": 0.8562744246634825, "grad_norm": 0.03588534891605377, "learning_rate": 0.00014301853997682505, "loss": 0.0485, "step": 1972 }, { "epoch": 0.8567086409031698, "grad_norm": 0.029414109885692596, "learning_rate": 0.00014298957126303595, "loss": 0.0376, "step": 1973 }, { "epoch": 0.8571428571428571, "grad_norm": 0.03333282843232155, "learning_rate": 0.00014296060254924683, "loss": 0.0434, "step": 1974 }, { "epoch": 0.8575770733825445, "grad_norm": 0.033117614686489105, "learning_rate": 0.0001429316338354577, "loss": 0.0437, "step": 1975 }, { "epoch": 0.8580112896222318, "grad_norm": 0.030400462448596954, "learning_rate": 0.0001429026651216686, "loss": 0.0408, "step": 1976 }, { "epoch": 0.8584455058619193, "grad_norm": 0.035701557993888855, "learning_rate": 0.00014287369640787948, "loss": 0.0496, "step": 1977 }, { "epoch": 0.8588797221016066, "grad_norm": 0.03105113096535206, "learning_rate": 0.00014284472769409039, "loss": 0.039, "step": 1978 }, { "epoch": 0.859313938341294, "grad_norm": 0.038084838539361954, "learning_rate": 0.0001428157589803013, "loss": 0.0439, "step": 1979 }, { "epoch": 0.8597481545809813, "grad_norm": 0.036602966487407684, "learning_rate": 0.0001427867902665122, "loss": 0.0474, "step": 1980 }, { "epoch": 0.8601823708206687, "grad_norm": 0.0332217775285244, "learning_rate": 0.00014275782155272307, "loss": 0.0458, "step": 1981 }, { "epoch": 0.860616587060356, "grad_norm": 0.03309790417551994, "learning_rate": 0.00014272885283893395, "loss": 0.0429, "step": 1982 }, { "epoch": 0.8610508033000435, "grad_norm": 0.030732357874512672, "learning_rate": 0.00014269988412514485, "loss": 0.0427, "step": 1983 }, { "epoch": 0.8614850195397308, "grad_norm": 0.03058842197060585, "learning_rate": 0.00014267091541135573, "loss": 0.0378, "step": 1984 }, { "epoch": 0.8619192357794182, "grad_norm": 0.02841087616980076, "learning_rate": 0.00014264194669756663, "loss": 0.0347, "step": 1985 }, { "epoch": 0.8623534520191055, "grad_norm": 0.03106006048619747, "learning_rate": 0.00014261297798377753, "loss": 0.041, "step": 1986 }, { "epoch": 0.8627876682587928, "grad_norm": 0.03840110823512077, "learning_rate": 0.00014258400926998844, "loss": 0.0499, "step": 1987 }, { "epoch": 0.8632218844984803, "grad_norm": 0.03096187300980091, "learning_rate": 0.00014255504055619932, "loss": 0.0447, "step": 1988 }, { "epoch": 0.8636561007381676, "grad_norm": 0.027057264000177383, "learning_rate": 0.0001425260718424102, "loss": 0.0342, "step": 1989 }, { "epoch": 0.864090316977855, "grad_norm": 0.025652017444372177, "learning_rate": 0.0001424971031286211, "loss": 0.0396, "step": 1990 }, { "epoch": 0.8645245332175423, "grad_norm": 0.032355524599552155, "learning_rate": 0.00014246813441483197, "loss": 0.0439, "step": 1991 }, { "epoch": 0.8649587494572297, "grad_norm": 0.031072113662958145, "learning_rate": 0.00014243916570104288, "loss": 0.0477, "step": 1992 }, { "epoch": 0.865392965696917, "grad_norm": 0.030140774324536324, "learning_rate": 0.00014241019698725378, "loss": 0.044, "step": 1993 }, { "epoch": 0.8658271819366045, "grad_norm": 0.02587377279996872, "learning_rate": 0.00014238122827346468, "loss": 0.0401, "step": 1994 }, { "epoch": 0.8662613981762918, "grad_norm": 0.030129248276352882, "learning_rate": 0.00014235225955967556, "loss": 0.0438, "step": 1995 }, { "epoch": 0.8666956144159792, "grad_norm": 0.03087014891207218, "learning_rate": 0.00014232329084588644, "loss": 0.0475, "step": 1996 }, { "epoch": 0.8671298306556665, "grad_norm": 0.039598286151885986, "learning_rate": 0.00014229432213209734, "loss": 0.0465, "step": 1997 }, { "epoch": 0.8675640468953539, "grad_norm": 0.03504587709903717, "learning_rate": 0.00014226535341830824, "loss": 0.0522, "step": 1998 }, { "epoch": 0.8679982631350412, "grad_norm": 0.036332350224256516, "learning_rate": 0.00014223638470451912, "loss": 0.0443, "step": 1999 }, { "epoch": 0.8684324793747286, "grad_norm": 0.03455139324069023, "learning_rate": 0.00014220741599073002, "loss": 0.0418, "step": 2000 }, { "epoch": 0.868866695614416, "grad_norm": 0.033495478332042694, "learning_rate": 0.00014217844727694093, "loss": 0.043, "step": 2001 }, { "epoch": 0.8693009118541033, "grad_norm": 0.03274128958582878, "learning_rate": 0.0001421494785631518, "loss": 0.056, "step": 2002 }, { "epoch": 0.8697351280937907, "grad_norm": 0.03046279214322567, "learning_rate": 0.00014212050984936268, "loss": 0.0499, "step": 2003 }, { "epoch": 0.870169344333478, "grad_norm": 0.03886440396308899, "learning_rate": 0.00014209154113557358, "loss": 0.0535, "step": 2004 }, { "epoch": 0.8706035605731655, "grad_norm": 0.02778572402894497, "learning_rate": 0.0001420625724217845, "loss": 0.0416, "step": 2005 }, { "epoch": 0.8710377768128528, "grad_norm": 0.026489345356822014, "learning_rate": 0.00014203360370799537, "loss": 0.0418, "step": 2006 }, { "epoch": 0.8714719930525402, "grad_norm": 0.023104501888155937, "learning_rate": 0.00014200463499420627, "loss": 0.0389, "step": 2007 }, { "epoch": 0.8719062092922275, "grad_norm": 0.02914455160498619, "learning_rate": 0.00014197566628041717, "loss": 0.0469, "step": 2008 }, { "epoch": 0.8723404255319149, "grad_norm": 0.026561478152871132, "learning_rate": 0.00014194669756662805, "loss": 0.0381, "step": 2009 }, { "epoch": 0.8727746417716022, "grad_norm": 0.03218894451856613, "learning_rate": 0.00014191772885283893, "loss": 0.0431, "step": 2010 }, { "epoch": 0.8732088580112897, "grad_norm": 0.027128156274557114, "learning_rate": 0.00014188876013904983, "loss": 0.0403, "step": 2011 }, { "epoch": 0.873643074250977, "grad_norm": 0.03661199286580086, "learning_rate": 0.00014185979142526073, "loss": 0.0537, "step": 2012 }, { "epoch": 0.8740772904906644, "grad_norm": 0.02571532502770424, "learning_rate": 0.0001418308227114716, "loss": 0.0388, "step": 2013 }, { "epoch": 0.8745115067303517, "grad_norm": 0.03134253993630409, "learning_rate": 0.00014180185399768251, "loss": 0.0492, "step": 2014 }, { "epoch": 0.874945722970039, "grad_norm": 0.02391468733549118, "learning_rate": 0.00014177288528389342, "loss": 0.0372, "step": 2015 }, { "epoch": 0.8753799392097265, "grad_norm": 0.028213145211338997, "learning_rate": 0.0001417439165701043, "loss": 0.0375, "step": 2016 }, { "epoch": 0.8758141554494138, "grad_norm": 0.02762652188539505, "learning_rate": 0.00014171494785631517, "loss": 0.0385, "step": 2017 }, { "epoch": 0.8762483716891012, "grad_norm": 0.03688434883952141, "learning_rate": 0.00014168597914252607, "loss": 0.0598, "step": 2018 }, { "epoch": 0.8766825879287885, "grad_norm": 0.035583898425102234, "learning_rate": 0.00014165701042873698, "loss": 0.0515, "step": 2019 }, { "epoch": 0.8771168041684759, "grad_norm": 0.029533490538597107, "learning_rate": 0.00014162804171494785, "loss": 0.0431, "step": 2020 }, { "epoch": 0.8775510204081632, "grad_norm": 0.030698049813508987, "learning_rate": 0.00014159907300115876, "loss": 0.0411, "step": 2021 }, { "epoch": 0.8779852366478507, "grad_norm": 0.0295929703861475, "learning_rate": 0.00014157010428736966, "loss": 0.0398, "step": 2022 }, { "epoch": 0.878419452887538, "grad_norm": 0.03797440603375435, "learning_rate": 0.00014154113557358054, "loss": 0.0491, "step": 2023 }, { "epoch": 0.8788536691272254, "grad_norm": 0.026546452194452286, "learning_rate": 0.00014151216685979142, "loss": 0.0336, "step": 2024 }, { "epoch": 0.8792878853669127, "grad_norm": 0.028520507737994194, "learning_rate": 0.00014148319814600232, "loss": 0.0389, "step": 2025 }, { "epoch": 0.8797221016066001, "grad_norm": 0.03675870969891548, "learning_rate": 0.00014145422943221322, "loss": 0.0452, "step": 2026 }, { "epoch": 0.8801563178462875, "grad_norm": 0.030710697174072266, "learning_rate": 0.0001414252607184241, "loss": 0.0431, "step": 2027 }, { "epoch": 0.8805905340859748, "grad_norm": 0.032596655189991, "learning_rate": 0.000141396292004635, "loss": 0.0467, "step": 2028 }, { "epoch": 0.8810247503256622, "grad_norm": 0.042485423386096954, "learning_rate": 0.0001413673232908459, "loss": 0.0461, "step": 2029 }, { "epoch": 0.8814589665653495, "grad_norm": 0.03450758010149002, "learning_rate": 0.00014133835457705678, "loss": 0.0491, "step": 2030 }, { "epoch": 0.8818931828050369, "grad_norm": 0.02624584175646305, "learning_rate": 0.0001413093858632677, "loss": 0.0402, "step": 2031 }, { "epoch": 0.8823273990447242, "grad_norm": 0.0387093648314476, "learning_rate": 0.00014128041714947856, "loss": 0.0585, "step": 2032 }, { "epoch": 0.8827616152844117, "grad_norm": 0.04121008515357971, "learning_rate": 0.00014125144843568947, "loss": 0.0448, "step": 2033 }, { "epoch": 0.883195831524099, "grad_norm": 0.034964293241500854, "learning_rate": 0.00014122247972190034, "loss": 0.043, "step": 2034 }, { "epoch": 0.8836300477637864, "grad_norm": 0.027142617851495743, "learning_rate": 0.00014119351100811125, "loss": 0.0377, "step": 2035 }, { "epoch": 0.8840642640034737, "grad_norm": 0.03179572522640228, "learning_rate": 0.00014116454229432215, "loss": 0.0428, "step": 2036 }, { "epoch": 0.8844984802431611, "grad_norm": 0.03457105904817581, "learning_rate": 0.00014113557358053303, "loss": 0.0484, "step": 2037 }, { "epoch": 0.8849326964828484, "grad_norm": 0.03622158616781235, "learning_rate": 0.00014110660486674393, "loss": 0.0433, "step": 2038 }, { "epoch": 0.8853669127225359, "grad_norm": 0.03511437028646469, "learning_rate": 0.0001410776361529548, "loss": 0.0503, "step": 2039 }, { "epoch": 0.8858011289622232, "grad_norm": 0.036743778735399246, "learning_rate": 0.0001410486674391657, "loss": 0.0531, "step": 2040 }, { "epoch": 0.8862353452019105, "grad_norm": 0.030400533229112625, "learning_rate": 0.0001410196987253766, "loss": 0.0489, "step": 2041 }, { "epoch": 0.8866695614415979, "grad_norm": 0.024852711707353592, "learning_rate": 0.0001409907300115875, "loss": 0.0367, "step": 2042 }, { "epoch": 0.8871037776812852, "grad_norm": 0.04195273667573929, "learning_rate": 0.0001409617612977984, "loss": 0.057, "step": 2043 }, { "epoch": 0.8875379939209727, "grad_norm": 0.02948159910738468, "learning_rate": 0.0001409327925840093, "loss": 0.0371, "step": 2044 }, { "epoch": 0.88797221016066, "grad_norm": 0.03381238877773285, "learning_rate": 0.00014090382387022018, "loss": 0.0554, "step": 2045 }, { "epoch": 0.8884064264003474, "grad_norm": 0.032675664871931076, "learning_rate": 0.00014087485515643105, "loss": 0.0481, "step": 2046 }, { "epoch": 0.8888406426400347, "grad_norm": 0.027026213705539703, "learning_rate": 0.00014084588644264196, "loss": 0.0415, "step": 2047 }, { "epoch": 0.8892748588797221, "grad_norm": 0.03621970862150192, "learning_rate": 0.00014081691772885283, "loss": 0.0508, "step": 2048 }, { "epoch": 0.8897090751194094, "grad_norm": 0.027637988328933716, "learning_rate": 0.00014078794901506374, "loss": 0.0441, "step": 2049 }, { "epoch": 0.8901432913590969, "grad_norm": 0.030292175710201263, "learning_rate": 0.00014075898030127464, "loss": 0.0465, "step": 2050 }, { "epoch": 0.8905775075987842, "grad_norm": 0.0291909109801054, "learning_rate": 0.00014073001158748554, "loss": 0.044, "step": 2051 }, { "epoch": 0.8910117238384716, "grad_norm": 0.029960067942738533, "learning_rate": 0.00014070104287369642, "loss": 0.0446, "step": 2052 }, { "epoch": 0.8914459400781589, "grad_norm": 0.028511861339211464, "learning_rate": 0.0001406720741599073, "loss": 0.0449, "step": 2053 }, { "epoch": 0.8918801563178463, "grad_norm": 0.03835698589682579, "learning_rate": 0.0001406431054461182, "loss": 0.0658, "step": 2054 }, { "epoch": 0.8923143725575337, "grad_norm": 0.038661133497953415, "learning_rate": 0.00014061413673232908, "loss": 0.0443, "step": 2055 }, { "epoch": 0.892748588797221, "grad_norm": 0.031536765396595, "learning_rate": 0.00014058516801853998, "loss": 0.0373, "step": 2056 }, { "epoch": 0.8931828050369084, "grad_norm": 0.037868682295084, "learning_rate": 0.00014055619930475089, "loss": 0.0651, "step": 2057 }, { "epoch": 0.8936170212765957, "grad_norm": 0.028231054544448853, "learning_rate": 0.0001405272305909618, "loss": 0.0383, "step": 2058 }, { "epoch": 0.8940512375162831, "grad_norm": 0.03248724341392517, "learning_rate": 0.00014049826187717267, "loss": 0.0413, "step": 2059 }, { "epoch": 0.8944854537559704, "grad_norm": 0.030417146161198616, "learning_rate": 0.00014046929316338354, "loss": 0.0469, "step": 2060 }, { "epoch": 0.8949196699956579, "grad_norm": 0.038787152618169785, "learning_rate": 0.00014044032444959445, "loss": 0.0448, "step": 2061 }, { "epoch": 0.8953538862353452, "grad_norm": 0.02868465892970562, "learning_rate": 0.00014041135573580532, "loss": 0.0408, "step": 2062 }, { "epoch": 0.8957881024750326, "grad_norm": 0.027412064373493195, "learning_rate": 0.00014038238702201623, "loss": 0.0377, "step": 2063 }, { "epoch": 0.8962223187147199, "grad_norm": 0.035905227065086365, "learning_rate": 0.00014035341830822713, "loss": 0.0522, "step": 2064 }, { "epoch": 0.8966565349544073, "grad_norm": 0.03611810505390167, "learning_rate": 0.00014032444959443803, "loss": 0.0463, "step": 2065 }, { "epoch": 0.8970907511940946, "grad_norm": 0.03059331141412258, "learning_rate": 0.0001402954808806489, "loss": 0.0434, "step": 2066 }, { "epoch": 0.8975249674337821, "grad_norm": 0.03384896740317345, "learning_rate": 0.0001402665121668598, "loss": 0.0414, "step": 2067 }, { "epoch": 0.8979591836734694, "grad_norm": 0.025099631398916245, "learning_rate": 0.0001402375434530707, "loss": 0.0297, "step": 2068 }, { "epoch": 0.8983933999131567, "grad_norm": 0.03526316583156586, "learning_rate": 0.00014020857473928157, "loss": 0.0481, "step": 2069 }, { "epoch": 0.8988276161528441, "grad_norm": 0.028692418709397316, "learning_rate": 0.00014017960602549247, "loss": 0.0412, "step": 2070 }, { "epoch": 0.8992618323925314, "grad_norm": 0.03414814546704292, "learning_rate": 0.00014015063731170338, "loss": 0.0482, "step": 2071 }, { "epoch": 0.8996960486322189, "grad_norm": 0.03565351665019989, "learning_rate": 0.00014012166859791428, "loss": 0.0477, "step": 2072 }, { "epoch": 0.9001302648719062, "grad_norm": 0.03263545036315918, "learning_rate": 0.00014009269988412516, "loss": 0.0492, "step": 2073 }, { "epoch": 0.9005644811115936, "grad_norm": 0.03266032412648201, "learning_rate": 0.00014006373117033603, "loss": 0.0443, "step": 2074 }, { "epoch": 0.9009986973512809, "grad_norm": 0.02599659375846386, "learning_rate": 0.00014003476245654694, "loss": 0.0323, "step": 2075 }, { "epoch": 0.9014329135909683, "grad_norm": 0.02757035568356514, "learning_rate": 0.0001400057937427578, "loss": 0.0393, "step": 2076 }, { "epoch": 0.9018671298306556, "grad_norm": 0.03702864423394203, "learning_rate": 0.00013997682502896872, "loss": 0.0503, "step": 2077 }, { "epoch": 0.9023013460703431, "grad_norm": 0.032811250537633896, "learning_rate": 0.00013994785631517962, "loss": 0.0506, "step": 2078 }, { "epoch": 0.9027355623100304, "grad_norm": 0.032581593841314316, "learning_rate": 0.00013991888760139052, "loss": 0.04, "step": 2079 }, { "epoch": 0.9031697785497178, "grad_norm": 0.028418032452464104, "learning_rate": 0.0001398899188876014, "loss": 0.0393, "step": 2080 }, { "epoch": 0.9036039947894051, "grad_norm": 0.02439926564693451, "learning_rate": 0.00013986095017381228, "loss": 0.039, "step": 2081 }, { "epoch": 0.9040382110290925, "grad_norm": 0.03621521592140198, "learning_rate": 0.00013983198146002318, "loss": 0.0479, "step": 2082 }, { "epoch": 0.9044724272687799, "grad_norm": 0.027440901845693588, "learning_rate": 0.00013980301274623406, "loss": 0.0377, "step": 2083 }, { "epoch": 0.9049066435084672, "grad_norm": 0.03320474922657013, "learning_rate": 0.00013977404403244496, "loss": 0.047, "step": 2084 }, { "epoch": 0.9053408597481546, "grad_norm": 0.03451510891318321, "learning_rate": 0.00013974507531865586, "loss": 0.0549, "step": 2085 }, { "epoch": 0.9057750759878419, "grad_norm": 0.02999286539852619, "learning_rate": 0.00013971610660486677, "loss": 0.0462, "step": 2086 }, { "epoch": 0.9062092922275293, "grad_norm": 0.03790955990552902, "learning_rate": 0.00013968713789107764, "loss": 0.0533, "step": 2087 }, { "epoch": 0.9066435084672166, "grad_norm": 0.035895366221666336, "learning_rate": 0.00013965816917728852, "loss": 0.0615, "step": 2088 }, { "epoch": 0.9070777247069041, "grad_norm": 0.03197116777300835, "learning_rate": 0.00013962920046349943, "loss": 0.0487, "step": 2089 }, { "epoch": 0.9075119409465914, "grad_norm": 0.02279396913945675, "learning_rate": 0.0001396002317497103, "loss": 0.0311, "step": 2090 }, { "epoch": 0.9079461571862788, "grad_norm": 0.023685257881879807, "learning_rate": 0.0001395712630359212, "loss": 0.0394, "step": 2091 }, { "epoch": 0.9083803734259661, "grad_norm": 0.04520018398761749, "learning_rate": 0.0001395422943221321, "loss": 0.0548, "step": 2092 }, { "epoch": 0.9088145896656535, "grad_norm": 0.031058818101882935, "learning_rate": 0.000139513325608343, "loss": 0.0408, "step": 2093 }, { "epoch": 0.9092488059053409, "grad_norm": 0.029566964134573936, "learning_rate": 0.0001394843568945539, "loss": 0.0414, "step": 2094 }, { "epoch": 0.9096830221450283, "grad_norm": 0.032492849975824356, "learning_rate": 0.00013945538818076477, "loss": 0.0493, "step": 2095 }, { "epoch": 0.9101172383847156, "grad_norm": 0.028952939435839653, "learning_rate": 0.00013942641946697567, "loss": 0.0449, "step": 2096 }, { "epoch": 0.9105514546244029, "grad_norm": 0.025056935846805573, "learning_rate": 0.00013939745075318657, "loss": 0.0395, "step": 2097 }, { "epoch": 0.9109856708640903, "grad_norm": 0.02562245912849903, "learning_rate": 0.00013936848203939745, "loss": 0.0376, "step": 2098 }, { "epoch": 0.9114198871037776, "grad_norm": 0.026553800329566002, "learning_rate": 0.00013933951332560835, "loss": 0.0368, "step": 2099 }, { "epoch": 0.9118541033434651, "grad_norm": 0.02708553895354271, "learning_rate": 0.00013931054461181926, "loss": 0.0456, "step": 2100 }, { "epoch": 0.9122883195831524, "grad_norm": 0.028273994103074074, "learning_rate": 0.00013928157589803013, "loss": 0.0448, "step": 2101 }, { "epoch": 0.9127225358228398, "grad_norm": 0.027410374954342842, "learning_rate": 0.000139252607184241, "loss": 0.0376, "step": 2102 }, { "epoch": 0.9131567520625271, "grad_norm": 0.03329867869615555, "learning_rate": 0.00013922363847045191, "loss": 0.049, "step": 2103 }, { "epoch": 0.9135909683022145, "grad_norm": 0.025568438693881035, "learning_rate": 0.00013919466975666282, "loss": 0.0366, "step": 2104 }, { "epoch": 0.9140251845419018, "grad_norm": 0.029210492968559265, "learning_rate": 0.0001391657010428737, "loss": 0.0403, "step": 2105 }, { "epoch": 0.9144594007815893, "grad_norm": 0.025942735373973846, "learning_rate": 0.0001391367323290846, "loss": 0.0285, "step": 2106 }, { "epoch": 0.9148936170212766, "grad_norm": 0.026792172342538834, "learning_rate": 0.0001391077636152955, "loss": 0.0339, "step": 2107 }, { "epoch": 0.915327833260964, "grad_norm": 0.02706478163599968, "learning_rate": 0.00013907879490150638, "loss": 0.0352, "step": 2108 }, { "epoch": 0.9157620495006513, "grad_norm": 0.02936442382633686, "learning_rate": 0.00013904982618771728, "loss": 0.0424, "step": 2109 }, { "epoch": 0.9161962657403386, "grad_norm": 0.037001460790634155, "learning_rate": 0.00013902085747392816, "loss": 0.0457, "step": 2110 }, { "epoch": 0.9166304819800261, "grad_norm": 0.024711865931749344, "learning_rate": 0.00013899188876013906, "loss": 0.0376, "step": 2111 }, { "epoch": 0.9170646982197134, "grad_norm": 0.04041684791445732, "learning_rate": 0.00013896292004634994, "loss": 0.0512, "step": 2112 }, { "epoch": 0.9174989144594008, "grad_norm": 0.0317920558154583, "learning_rate": 0.00013893395133256084, "loss": 0.045, "step": 2113 }, { "epoch": 0.9179331306990881, "grad_norm": 0.04350608214735985, "learning_rate": 0.00013890498261877175, "loss": 0.0561, "step": 2114 }, { "epoch": 0.9183673469387755, "grad_norm": 0.03163138031959534, "learning_rate": 0.00013887601390498262, "loss": 0.0425, "step": 2115 }, { "epoch": 0.9188015631784628, "grad_norm": 0.029919680207967758, "learning_rate": 0.00013884704519119353, "loss": 0.0324, "step": 2116 }, { "epoch": 0.9192357794181503, "grad_norm": 0.041621677577495575, "learning_rate": 0.0001388180764774044, "loss": 0.0514, "step": 2117 }, { "epoch": 0.9196699956578376, "grad_norm": 0.030498886480927467, "learning_rate": 0.0001387891077636153, "loss": 0.0367, "step": 2118 }, { "epoch": 0.920104211897525, "grad_norm": 0.03399229422211647, "learning_rate": 0.00013876013904982618, "loss": 0.0435, "step": 2119 }, { "epoch": 0.9205384281372123, "grad_norm": 0.027746310457587242, "learning_rate": 0.0001387311703360371, "loss": 0.0338, "step": 2120 }, { "epoch": 0.9209726443768997, "grad_norm": 0.027493588626384735, "learning_rate": 0.000138702201622248, "loss": 0.0365, "step": 2121 }, { "epoch": 0.921406860616587, "grad_norm": 0.02794453129172325, "learning_rate": 0.00013867323290845887, "loss": 0.042, "step": 2122 }, { "epoch": 0.9218410768562745, "grad_norm": 0.030643248930573463, "learning_rate": 0.00013864426419466977, "loss": 0.0371, "step": 2123 }, { "epoch": 0.9222752930959618, "grad_norm": 0.03396167233586311, "learning_rate": 0.00013861529548088065, "loss": 0.0471, "step": 2124 }, { "epoch": 0.9227095093356491, "grad_norm": 0.030941464006900787, "learning_rate": 0.00013858632676709155, "loss": 0.0445, "step": 2125 }, { "epoch": 0.9231437255753365, "grad_norm": 0.032383885234594345, "learning_rate": 0.00013855735805330243, "loss": 0.0419, "step": 2126 }, { "epoch": 0.9235779418150238, "grad_norm": 0.02794160507619381, "learning_rate": 0.00013852838933951333, "loss": 0.039, "step": 2127 }, { "epoch": 0.9240121580547113, "grad_norm": 0.02721387892961502, "learning_rate": 0.00013849942062572424, "loss": 0.0389, "step": 2128 }, { "epoch": 0.9244463742943986, "grad_norm": 0.02475523017346859, "learning_rate": 0.0001384704519119351, "loss": 0.0327, "step": 2129 }, { "epoch": 0.924880590534086, "grad_norm": 0.03334461525082588, "learning_rate": 0.00013844148319814602, "loss": 0.0517, "step": 2130 }, { "epoch": 0.9253148067737733, "grad_norm": 0.022131426259875298, "learning_rate": 0.0001384125144843569, "loss": 0.0323, "step": 2131 }, { "epoch": 0.9257490230134607, "grad_norm": 0.030871136114001274, "learning_rate": 0.0001383835457705678, "loss": 0.0386, "step": 2132 }, { "epoch": 0.926183239253148, "grad_norm": 0.02491099014878273, "learning_rate": 0.00013835457705677867, "loss": 0.041, "step": 2133 }, { "epoch": 0.9266174554928355, "grad_norm": 0.027743477374315262, "learning_rate": 0.00013832560834298958, "loss": 0.0407, "step": 2134 }, { "epoch": 0.9270516717325228, "grad_norm": 0.03120150975883007, "learning_rate": 0.00013829663962920048, "loss": 0.0442, "step": 2135 }, { "epoch": 0.9274858879722102, "grad_norm": 0.03072458505630493, "learning_rate": 0.00013826767091541136, "loss": 0.051, "step": 2136 }, { "epoch": 0.9279201042118975, "grad_norm": 0.025443365797400475, "learning_rate": 0.00013823870220162226, "loss": 0.0403, "step": 2137 }, { "epoch": 0.9283543204515848, "grad_norm": 0.03241339698433876, "learning_rate": 0.00013820973348783314, "loss": 0.0484, "step": 2138 }, { "epoch": 0.9287885366912723, "grad_norm": 0.026722051203250885, "learning_rate": 0.00013818076477404404, "loss": 0.0311, "step": 2139 }, { "epoch": 0.9292227529309596, "grad_norm": 0.037301622331142426, "learning_rate": 0.00013815179606025492, "loss": 0.0547, "step": 2140 }, { "epoch": 0.929656969170647, "grad_norm": 0.10372752696275711, "learning_rate": 0.00013812282734646582, "loss": 0.0437, "step": 2141 }, { "epoch": 0.9300911854103343, "grad_norm": 0.025743165984749794, "learning_rate": 0.00013809385863267673, "loss": 0.0351, "step": 2142 }, { "epoch": 0.9305254016500217, "grad_norm": 0.033833570778369904, "learning_rate": 0.00013806488991888763, "loss": 0.051, "step": 2143 }, { "epoch": 0.930959617889709, "grad_norm": 0.030373496934771538, "learning_rate": 0.0001380359212050985, "loss": 0.0469, "step": 2144 }, { "epoch": 0.9313938341293965, "grad_norm": 0.04016495496034622, "learning_rate": 0.00013800695249130938, "loss": 0.0424, "step": 2145 }, { "epoch": 0.9318280503690838, "grad_norm": 0.0286623015999794, "learning_rate": 0.0001379779837775203, "loss": 0.0389, "step": 2146 }, { "epoch": 0.9322622666087712, "grad_norm": 0.0321274995803833, "learning_rate": 0.00013794901506373116, "loss": 0.0446, "step": 2147 }, { "epoch": 0.9326964828484585, "grad_norm": 0.03006785735487938, "learning_rate": 0.00013792004634994207, "loss": 0.0473, "step": 2148 }, { "epoch": 0.9331306990881459, "grad_norm": 0.02398059517145157, "learning_rate": 0.00013789107763615297, "loss": 0.0365, "step": 2149 }, { "epoch": 0.9335649153278333, "grad_norm": 0.02793448232114315, "learning_rate": 0.00013786210892236387, "loss": 0.0397, "step": 2150 }, { "epoch": 0.9339991315675207, "grad_norm": 0.031009534373879433, "learning_rate": 0.00013783314020857475, "loss": 0.0421, "step": 2151 }, { "epoch": 0.934433347807208, "grad_norm": 0.029026534408330917, "learning_rate": 0.00013780417149478563, "loss": 0.0392, "step": 2152 }, { "epoch": 0.9348675640468953, "grad_norm": 0.03120335377752781, "learning_rate": 0.00013777520278099653, "loss": 0.0411, "step": 2153 }, { "epoch": 0.9353017802865827, "grad_norm": 0.028501121327280998, "learning_rate": 0.0001377462340672074, "loss": 0.0377, "step": 2154 }, { "epoch": 0.93573599652627, "grad_norm": 0.031863339245319366, "learning_rate": 0.0001377172653534183, "loss": 0.043, "step": 2155 }, { "epoch": 0.9361702127659575, "grad_norm": 0.028894485905766487, "learning_rate": 0.00013768829663962922, "loss": 0.0404, "step": 2156 }, { "epoch": 0.9366044290056448, "grad_norm": 0.031099170446395874, "learning_rate": 0.00013765932792584012, "loss": 0.0398, "step": 2157 }, { "epoch": 0.9370386452453322, "grad_norm": 0.036405209451913834, "learning_rate": 0.000137630359212051, "loss": 0.0417, "step": 2158 }, { "epoch": 0.9374728614850195, "grad_norm": 0.030180014669895172, "learning_rate": 0.00013760139049826187, "loss": 0.0363, "step": 2159 }, { "epoch": 0.9379070777247069, "grad_norm": 0.027602504938840866, "learning_rate": 0.00013757242178447278, "loss": 0.0413, "step": 2160 }, { "epoch": 0.9383412939643943, "grad_norm": 0.031432200223207474, "learning_rate": 0.00013754345307068365, "loss": 0.0423, "step": 2161 }, { "epoch": 0.9387755102040817, "grad_norm": 0.03124765306711197, "learning_rate": 0.00013751448435689456, "loss": 0.0457, "step": 2162 }, { "epoch": 0.939209726443769, "grad_norm": 0.03963832929730415, "learning_rate": 0.00013748551564310546, "loss": 0.0586, "step": 2163 }, { "epoch": 0.9396439426834564, "grad_norm": 0.040504153817892075, "learning_rate": 0.00013745654692931636, "loss": 0.0452, "step": 2164 }, { "epoch": 0.9400781589231437, "grad_norm": 0.031160999089479446, "learning_rate": 0.00013742757821552724, "loss": 0.0463, "step": 2165 }, { "epoch": 0.940512375162831, "grad_norm": 0.028206581249833107, "learning_rate": 0.00013739860950173812, "loss": 0.0405, "step": 2166 }, { "epoch": 0.9409465914025185, "grad_norm": 0.03421599045395851, "learning_rate": 0.00013736964078794902, "loss": 0.0595, "step": 2167 }, { "epoch": 0.9413808076422058, "grad_norm": 0.028872806578874588, "learning_rate": 0.0001373406720741599, "loss": 0.0419, "step": 2168 }, { "epoch": 0.9418150238818932, "grad_norm": 0.028335964307188988, "learning_rate": 0.0001373117033603708, "loss": 0.043, "step": 2169 }, { "epoch": 0.9422492401215805, "grad_norm": 0.03556100279092789, "learning_rate": 0.0001372827346465817, "loss": 0.0484, "step": 2170 }, { "epoch": 0.9426834563612679, "grad_norm": 0.024907942861318588, "learning_rate": 0.0001372537659327926, "loss": 0.0352, "step": 2171 }, { "epoch": 0.9431176726009552, "grad_norm": 0.02658097632229328, "learning_rate": 0.00013722479721900349, "loss": 0.0366, "step": 2172 }, { "epoch": 0.9435518888406427, "grad_norm": 0.03057592734694481, "learning_rate": 0.00013719582850521436, "loss": 0.0418, "step": 2173 }, { "epoch": 0.94398610508033, "grad_norm": 0.030184350907802582, "learning_rate": 0.00013716685979142527, "loss": 0.0413, "step": 2174 }, { "epoch": 0.9444203213200174, "grad_norm": 0.02817760594189167, "learning_rate": 0.00013713789107763614, "loss": 0.0442, "step": 2175 }, { "epoch": 0.9448545375597047, "grad_norm": 0.028308512642979622, "learning_rate": 0.00013710892236384705, "loss": 0.0388, "step": 2176 }, { "epoch": 0.9452887537993921, "grad_norm": 0.026963790878653526, "learning_rate": 0.00013707995365005795, "loss": 0.0434, "step": 2177 }, { "epoch": 0.9457229700390795, "grad_norm": 0.027523212134838104, "learning_rate": 0.00013705098493626885, "loss": 0.0378, "step": 2178 }, { "epoch": 0.9461571862787668, "grad_norm": 0.035585321485996246, "learning_rate": 0.00013702201622247973, "loss": 0.0598, "step": 2179 }, { "epoch": 0.9465914025184542, "grad_norm": 0.03184712678194046, "learning_rate": 0.0001369930475086906, "loss": 0.0462, "step": 2180 }, { "epoch": 0.9470256187581415, "grad_norm": 0.04311623424291611, "learning_rate": 0.0001369640787949015, "loss": 0.0582, "step": 2181 }, { "epoch": 0.9474598349978289, "grad_norm": 0.027615820989012718, "learning_rate": 0.0001369351100811124, "loss": 0.0411, "step": 2182 }, { "epoch": 0.9478940512375162, "grad_norm": 0.030661169439554214, "learning_rate": 0.0001369061413673233, "loss": 0.0397, "step": 2183 }, { "epoch": 0.9483282674772037, "grad_norm": 0.021827930584549904, "learning_rate": 0.0001368771726535342, "loss": 0.0339, "step": 2184 }, { "epoch": 0.948762483716891, "grad_norm": 0.030129745602607727, "learning_rate": 0.0001368482039397451, "loss": 0.0398, "step": 2185 }, { "epoch": 0.9491966999565784, "grad_norm": 0.02397645264863968, "learning_rate": 0.00013681923522595597, "loss": 0.0337, "step": 2186 }, { "epoch": 0.9496309161962657, "grad_norm": 0.0337529294192791, "learning_rate": 0.00013679026651216685, "loss": 0.0441, "step": 2187 }, { "epoch": 0.9500651324359531, "grad_norm": 0.031537119299173355, "learning_rate": 0.00013676129779837775, "loss": 0.0549, "step": 2188 }, { "epoch": 0.9504993486756405, "grad_norm": 0.028810830786824226, "learning_rate": 0.00013673232908458866, "loss": 0.039, "step": 2189 }, { "epoch": 0.9509335649153279, "grad_norm": 0.03335162252187729, "learning_rate": 0.00013670336037079954, "loss": 0.0472, "step": 2190 }, { "epoch": 0.9513677811550152, "grad_norm": 0.03277963027358055, "learning_rate": 0.00013667439165701044, "loss": 0.0493, "step": 2191 }, { "epoch": 0.9518019973947026, "grad_norm": 0.037257421761751175, "learning_rate": 0.00013664542294322134, "loss": 0.0491, "step": 2192 }, { "epoch": 0.9522362136343899, "grad_norm": 0.031910981982946396, "learning_rate": 0.00013661645422943222, "loss": 0.0382, "step": 2193 }, { "epoch": 0.9526704298740772, "grad_norm": 0.02935083769261837, "learning_rate": 0.00013658748551564312, "loss": 0.0437, "step": 2194 }, { "epoch": 0.9531046461137647, "grad_norm": 0.02893870510160923, "learning_rate": 0.000136558516801854, "loss": 0.0408, "step": 2195 }, { "epoch": 0.953538862353452, "grad_norm": 0.023851303383708, "learning_rate": 0.0001365295480880649, "loss": 0.0278, "step": 2196 }, { "epoch": 0.9539730785931394, "grad_norm": 0.02870131842792034, "learning_rate": 0.00013650057937427578, "loss": 0.0417, "step": 2197 }, { "epoch": 0.9544072948328267, "grad_norm": 0.03842976689338684, "learning_rate": 0.00013647161066048668, "loss": 0.0422, "step": 2198 }, { "epoch": 0.9548415110725141, "grad_norm": 0.027746714651584625, "learning_rate": 0.0001364426419466976, "loss": 0.0447, "step": 2199 }, { "epoch": 0.9552757273122015, "grad_norm": 0.027727734297513962, "learning_rate": 0.00013641367323290846, "loss": 0.0393, "step": 2200 }, { "epoch": 0.9557099435518889, "grad_norm": 0.024328382685780525, "learning_rate": 0.00013638470451911937, "loss": 0.0362, "step": 2201 }, { "epoch": 0.9561441597915762, "grad_norm": 0.030501583591103554, "learning_rate": 0.00013635573580533024, "loss": 0.043, "step": 2202 }, { "epoch": 0.9565783760312636, "grad_norm": 0.03709418699145317, "learning_rate": 0.00013632676709154115, "loss": 0.0529, "step": 2203 }, { "epoch": 0.9570125922709509, "grad_norm": 0.027952171862125397, "learning_rate": 0.00013629779837775202, "loss": 0.0345, "step": 2204 }, { "epoch": 0.9574468085106383, "grad_norm": 0.028287675231695175, "learning_rate": 0.00013626882966396293, "loss": 0.0386, "step": 2205 }, { "epoch": 0.9578810247503257, "grad_norm": 0.03183449059724808, "learning_rate": 0.00013623986095017383, "loss": 0.0431, "step": 2206 }, { "epoch": 0.958315240990013, "grad_norm": 0.03488324210047722, "learning_rate": 0.0001362108922363847, "loss": 0.0418, "step": 2207 }, { "epoch": 0.9587494572297004, "grad_norm": 0.026730617508292198, "learning_rate": 0.0001361819235225956, "loss": 0.0327, "step": 2208 }, { "epoch": 0.9591836734693877, "grad_norm": 0.03089052252471447, "learning_rate": 0.0001361529548088065, "loss": 0.0347, "step": 2209 }, { "epoch": 0.9596178897090751, "grad_norm": 0.02888919599354267, "learning_rate": 0.0001361239860950174, "loss": 0.0434, "step": 2210 }, { "epoch": 0.9600521059487624, "grad_norm": 0.03462725505232811, "learning_rate": 0.00013609501738122827, "loss": 0.0429, "step": 2211 }, { "epoch": 0.9604863221884499, "grad_norm": 0.028506189584732056, "learning_rate": 0.00013606604866743917, "loss": 0.0363, "step": 2212 }, { "epoch": 0.9609205384281372, "grad_norm": 0.03749020770192146, "learning_rate": 0.00013603707995365008, "loss": 0.0484, "step": 2213 }, { "epoch": 0.9613547546678246, "grad_norm": 0.029138952493667603, "learning_rate": 0.00013600811123986095, "loss": 0.0409, "step": 2214 }, { "epoch": 0.9617889709075119, "grad_norm": 0.041693080216646194, "learning_rate": 0.00013597914252607186, "loss": 0.061, "step": 2215 }, { "epoch": 0.9622231871471993, "grad_norm": 0.035778727382421494, "learning_rate": 0.00013595017381228273, "loss": 0.0471, "step": 2216 }, { "epoch": 0.9626574033868867, "grad_norm": 0.033804427832365036, "learning_rate": 0.00013592120509849364, "loss": 0.0462, "step": 2217 }, { "epoch": 0.9630916196265741, "grad_norm": 0.023441001772880554, "learning_rate": 0.00013589223638470451, "loss": 0.0325, "step": 2218 }, { "epoch": 0.9635258358662614, "grad_norm": 0.023705758154392242, "learning_rate": 0.00013586326767091542, "loss": 0.0293, "step": 2219 }, { "epoch": 0.9639600521059488, "grad_norm": 0.04140575975179672, "learning_rate": 0.00013583429895712632, "loss": 0.0407, "step": 2220 }, { "epoch": 0.9643942683456361, "grad_norm": 0.02872782200574875, "learning_rate": 0.0001358053302433372, "loss": 0.0374, "step": 2221 }, { "epoch": 0.9648284845853234, "grad_norm": 0.03456737846136093, "learning_rate": 0.0001357763615295481, "loss": 0.0477, "step": 2222 }, { "epoch": 0.9652627008250109, "grad_norm": 0.03260280191898346, "learning_rate": 0.00013574739281575898, "loss": 0.0464, "step": 2223 }, { "epoch": 0.9656969170646982, "grad_norm": 0.02683817781507969, "learning_rate": 0.00013571842410196988, "loss": 0.0345, "step": 2224 }, { "epoch": 0.9661311333043856, "grad_norm": 0.034209538251161575, "learning_rate": 0.00013568945538818076, "loss": 0.042, "step": 2225 }, { "epoch": 0.9665653495440729, "grad_norm": 0.036412402987480164, "learning_rate": 0.00013566048667439166, "loss": 0.0491, "step": 2226 }, { "epoch": 0.9669995657837603, "grad_norm": 0.032398246228694916, "learning_rate": 0.00013563151796060257, "loss": 0.0515, "step": 2227 }, { "epoch": 0.9674337820234477, "grad_norm": 0.03629367798566818, "learning_rate": 0.00013560254924681344, "loss": 0.0411, "step": 2228 }, { "epoch": 0.9678679982631351, "grad_norm": 0.02964608743786812, "learning_rate": 0.00013557358053302435, "loss": 0.0396, "step": 2229 }, { "epoch": 0.9683022145028224, "grad_norm": 0.036275263875722885, "learning_rate": 0.00013554461181923522, "loss": 0.0496, "step": 2230 }, { "epoch": 0.9687364307425098, "grad_norm": 0.03429960459470749, "learning_rate": 0.00013551564310544613, "loss": 0.0462, "step": 2231 }, { "epoch": 0.9691706469821971, "grad_norm": 0.024065155535936356, "learning_rate": 0.000135486674391657, "loss": 0.0331, "step": 2232 }, { "epoch": 0.9696048632218845, "grad_norm": 0.03262323513627052, "learning_rate": 0.0001354577056778679, "loss": 0.0452, "step": 2233 }, { "epoch": 0.9700390794615719, "grad_norm": 0.0382208377122879, "learning_rate": 0.0001354287369640788, "loss": 0.0514, "step": 2234 }, { "epoch": 0.9704732957012592, "grad_norm": 0.032265305519104004, "learning_rate": 0.00013539976825028971, "loss": 0.043, "step": 2235 }, { "epoch": 0.9709075119409466, "grad_norm": 0.029365187510848045, "learning_rate": 0.0001353707995365006, "loss": 0.0384, "step": 2236 }, { "epoch": 0.9713417281806339, "grad_norm": 0.023244334384799004, "learning_rate": 0.00013534183082271147, "loss": 0.0356, "step": 2237 }, { "epoch": 0.9717759444203213, "grad_norm": 0.027667613700032234, "learning_rate": 0.00013531286210892237, "loss": 0.0433, "step": 2238 }, { "epoch": 0.9722101606600086, "grad_norm": 0.033056750893592834, "learning_rate": 0.00013528389339513325, "loss": 0.0457, "step": 2239 }, { "epoch": 0.9726443768996961, "grad_norm": 0.03479159623384476, "learning_rate": 0.00013525492468134415, "loss": 0.0468, "step": 2240 }, { "epoch": 0.9730785931393834, "grad_norm": 0.03536512702703476, "learning_rate": 0.00013522595596755506, "loss": 0.0382, "step": 2241 }, { "epoch": 0.9735128093790708, "grad_norm": 0.02667381428182125, "learning_rate": 0.00013519698725376596, "loss": 0.0401, "step": 2242 }, { "epoch": 0.9739470256187581, "grad_norm": 0.0359371080994606, "learning_rate": 0.00013516801853997684, "loss": 0.0531, "step": 2243 }, { "epoch": 0.9743812418584455, "grad_norm": 0.03615528717637062, "learning_rate": 0.0001351390498261877, "loss": 0.0438, "step": 2244 }, { "epoch": 0.9748154580981329, "grad_norm": 0.02989623136818409, "learning_rate": 0.00013511008111239862, "loss": 0.046, "step": 2245 }, { "epoch": 0.9752496743378203, "grad_norm": 0.038469765335321426, "learning_rate": 0.0001350811123986095, "loss": 0.0613, "step": 2246 }, { "epoch": 0.9756838905775076, "grad_norm": 0.035166870802640915, "learning_rate": 0.0001350521436848204, "loss": 0.0556, "step": 2247 }, { "epoch": 0.9761181068171949, "grad_norm": 0.02952173911035061, "learning_rate": 0.0001350231749710313, "loss": 0.0426, "step": 2248 }, { "epoch": 0.9765523230568823, "grad_norm": 0.03314448893070221, "learning_rate": 0.0001349942062572422, "loss": 0.0579, "step": 2249 }, { "epoch": 0.9769865392965696, "grad_norm": 0.02709200605750084, "learning_rate": 0.00013496523754345308, "loss": 0.0437, "step": 2250 }, { "epoch": 0.9774207555362571, "grad_norm": 0.02738839015364647, "learning_rate": 0.00013493626882966396, "loss": 0.0402, "step": 2251 }, { "epoch": 0.9778549717759444, "grad_norm": 0.027244890108704567, "learning_rate": 0.00013490730011587486, "loss": 0.0424, "step": 2252 }, { "epoch": 0.9782891880156318, "grad_norm": 0.023743722587823868, "learning_rate": 0.00013487833140208574, "loss": 0.0371, "step": 2253 }, { "epoch": 0.9787234042553191, "grad_norm": 0.03127893805503845, "learning_rate": 0.00013484936268829664, "loss": 0.0426, "step": 2254 }, { "epoch": 0.9791576204950065, "grad_norm": 0.03587310388684273, "learning_rate": 0.00013482039397450755, "loss": 0.0491, "step": 2255 }, { "epoch": 0.9795918367346939, "grad_norm": 0.02438114583492279, "learning_rate": 0.00013479142526071845, "loss": 0.0362, "step": 2256 }, { "epoch": 0.9800260529743813, "grad_norm": 0.036399051547050476, "learning_rate": 0.00013476245654692933, "loss": 0.0454, "step": 2257 }, { "epoch": 0.9804602692140686, "grad_norm": 0.02578856609761715, "learning_rate": 0.0001347334878331402, "loss": 0.0316, "step": 2258 }, { "epoch": 0.980894485453756, "grad_norm": 0.026096750050783157, "learning_rate": 0.0001347045191193511, "loss": 0.0373, "step": 2259 }, { "epoch": 0.9813287016934433, "grad_norm": 0.03240537643432617, "learning_rate": 0.00013467555040556198, "loss": 0.0419, "step": 2260 }, { "epoch": 0.9817629179331308, "grad_norm": 0.030255481600761414, "learning_rate": 0.00013464658169177289, "loss": 0.0372, "step": 2261 }, { "epoch": 0.9821971341728181, "grad_norm": 0.03389328345656395, "learning_rate": 0.0001346176129779838, "loss": 0.0475, "step": 2262 }, { "epoch": 0.9826313504125054, "grad_norm": 0.02976074069738388, "learning_rate": 0.0001345886442641947, "loss": 0.0483, "step": 2263 }, { "epoch": 0.9830655666521928, "grad_norm": 0.025279808789491653, "learning_rate": 0.00013455967555040557, "loss": 0.035, "step": 2264 }, { "epoch": 0.9834997828918801, "grad_norm": 0.02859262377023697, "learning_rate": 0.00013453070683661645, "loss": 0.0423, "step": 2265 }, { "epoch": 0.9839339991315675, "grad_norm": 0.031738266348838806, "learning_rate": 0.00013450173812282735, "loss": 0.0453, "step": 2266 }, { "epoch": 0.9843682153712549, "grad_norm": 0.02764860913157463, "learning_rate": 0.00013447276940903823, "loss": 0.038, "step": 2267 }, { "epoch": 0.9848024316109423, "grad_norm": 0.034258704632520676, "learning_rate": 0.00013444380069524913, "loss": 0.0529, "step": 2268 }, { "epoch": 0.9852366478506296, "grad_norm": 0.028737129643559456, "learning_rate": 0.00013441483198146003, "loss": 0.0397, "step": 2269 }, { "epoch": 0.985670864090317, "grad_norm": 0.03589112311601639, "learning_rate": 0.00013438586326767094, "loss": 0.0534, "step": 2270 }, { "epoch": 0.9861050803300043, "grad_norm": 0.03500163555145264, "learning_rate": 0.00013435689455388181, "loss": 0.0479, "step": 2271 }, { "epoch": 0.9865392965696917, "grad_norm": 0.02292477712035179, "learning_rate": 0.00013432792584009272, "loss": 0.0314, "step": 2272 }, { "epoch": 0.9869735128093791, "grad_norm": 0.03799629583954811, "learning_rate": 0.0001342989571263036, "loss": 0.0579, "step": 2273 }, { "epoch": 0.9874077290490665, "grad_norm": 0.02583610825240612, "learning_rate": 0.00013426998841251447, "loss": 0.0361, "step": 2274 }, { "epoch": 0.9878419452887538, "grad_norm": 0.03954479098320007, "learning_rate": 0.00013424101969872538, "loss": 0.0516, "step": 2275 }, { "epoch": 0.9882761615284411, "grad_norm": 0.02709767408668995, "learning_rate": 0.00013421205098493628, "loss": 0.0398, "step": 2276 }, { "epoch": 0.9887103777681285, "grad_norm": 0.037551648914813995, "learning_rate": 0.00013418308227114718, "loss": 0.0579, "step": 2277 }, { "epoch": 0.9891445940078158, "grad_norm": 0.02648613229393959, "learning_rate": 0.00013415411355735806, "loss": 0.0331, "step": 2278 }, { "epoch": 0.9895788102475033, "grad_norm": 0.02786169946193695, "learning_rate": 0.00013412514484356896, "loss": 0.0487, "step": 2279 }, { "epoch": 0.9900130264871906, "grad_norm": 0.029947757720947266, "learning_rate": 0.00013409617612977984, "loss": 0.0418, "step": 2280 }, { "epoch": 0.990447242726878, "grad_norm": 0.04127148538827896, "learning_rate": 0.00013406720741599072, "loss": 0.0593, "step": 2281 }, { "epoch": 0.9908814589665653, "grad_norm": 0.02721256949007511, "learning_rate": 0.00013403823870220162, "loss": 0.0434, "step": 2282 }, { "epoch": 0.9913156752062527, "grad_norm": 0.024182768538594246, "learning_rate": 0.00013400926998841252, "loss": 0.0356, "step": 2283 }, { "epoch": 0.9917498914459401, "grad_norm": 0.02751404047012329, "learning_rate": 0.00013398030127462343, "loss": 0.0383, "step": 2284 }, { "epoch": 0.9921841076856275, "grad_norm": 0.026285607367753983, "learning_rate": 0.0001339513325608343, "loss": 0.0392, "step": 2285 }, { "epoch": 0.9926183239253148, "grad_norm": 0.025645237416028976, "learning_rate": 0.0001339223638470452, "loss": 0.0357, "step": 2286 }, { "epoch": 0.9930525401650022, "grad_norm": 0.02565135806798935, "learning_rate": 0.00013389339513325608, "loss": 0.0359, "step": 2287 }, { "epoch": 0.9934867564046895, "grad_norm": 0.03245379775762558, "learning_rate": 0.000133864426419467, "loss": 0.0458, "step": 2288 }, { "epoch": 0.993920972644377, "grad_norm": 0.02426012046635151, "learning_rate": 0.00013383545770567787, "loss": 0.0374, "step": 2289 }, { "epoch": 0.9943551888840643, "grad_norm": 0.021702991798520088, "learning_rate": 0.00013380648899188877, "loss": 0.0321, "step": 2290 }, { "epoch": 0.9947894051237516, "grad_norm": 0.03216599300503731, "learning_rate": 0.00013377752027809967, "loss": 0.0468, "step": 2291 }, { "epoch": 0.995223621363439, "grad_norm": 0.02601948194205761, "learning_rate": 0.00013374855156431055, "loss": 0.0403, "step": 2292 }, { "epoch": 0.9956578376031263, "grad_norm": 0.028861071914434433, "learning_rate": 0.00013371958285052145, "loss": 0.0402, "step": 2293 }, { "epoch": 0.9960920538428137, "grad_norm": 0.024505512788891792, "learning_rate": 0.00013369061413673233, "loss": 0.0341, "step": 2294 }, { "epoch": 0.996526270082501, "grad_norm": 0.03261742368340492, "learning_rate": 0.00013366164542294323, "loss": 0.0467, "step": 2295 }, { "epoch": 0.9969604863221885, "grad_norm": 0.0356551855802536, "learning_rate": 0.0001336326767091541, "loss": 0.055, "step": 2296 }, { "epoch": 0.9973947025618758, "grad_norm": 0.02777998521924019, "learning_rate": 0.000133603707995365, "loss": 0.0434, "step": 2297 }, { "epoch": 0.9978289188015632, "grad_norm": 0.026408227160573006, "learning_rate": 0.00013357473928157592, "loss": 0.0423, "step": 2298 }, { "epoch": 0.9982631350412505, "grad_norm": 0.03586670756340027, "learning_rate": 0.0001335457705677868, "loss": 0.0363, "step": 2299 }, { "epoch": 0.998697351280938, "grad_norm": 0.0244100671261549, "learning_rate": 0.0001335168018539977, "loss": 0.0336, "step": 2300 }, { "epoch": 0.9991315675206253, "grad_norm": 0.041754890233278275, "learning_rate": 0.00013348783314020857, "loss": 0.0597, "step": 2301 }, { "epoch": 0.9995657837603127, "grad_norm": 0.025611279532313347, "learning_rate": 0.00013345886442641948, "loss": 0.0382, "step": 2302 }, { "epoch": 1.0, "grad_norm": 0.030549731105566025, "learning_rate": 0.00013342989571263035, "loss": 0.0423, "step": 2303 }, { "epoch": 1.0004342162396873, "grad_norm": 0.03086947090923786, "learning_rate": 0.00013340092699884126, "loss": 0.0379, "step": 2304 }, { "epoch": 1.0008684324793746, "grad_norm": 0.02804921381175518, "learning_rate": 0.00013337195828505216, "loss": 0.0419, "step": 2305 }, { "epoch": 1.0013026487190622, "grad_norm": 0.023069137707352638, "learning_rate": 0.00013334298957126304, "loss": 0.0316, "step": 2306 }, { "epoch": 1.0017368649587495, "grad_norm": 0.02720634452998638, "learning_rate": 0.00013331402085747394, "loss": 0.0361, "step": 2307 }, { "epoch": 1.0021710811984368, "grad_norm": 0.03447691723704338, "learning_rate": 0.00013328505214368482, "loss": 0.0466, "step": 2308 }, { "epoch": 1.002605297438124, "grad_norm": 0.03030562773346901, "learning_rate": 0.00013325608342989572, "loss": 0.0406, "step": 2309 }, { "epoch": 1.0030395136778116, "grad_norm": 0.026614924892783165, "learning_rate": 0.0001332271147161066, "loss": 0.0343, "step": 2310 }, { "epoch": 1.003473729917499, "grad_norm": 0.02843519300222397, "learning_rate": 0.0001331981460023175, "loss": 0.0404, "step": 2311 }, { "epoch": 1.0039079461571863, "grad_norm": 0.030830303207039833, "learning_rate": 0.0001331691772885284, "loss": 0.0425, "step": 2312 }, { "epoch": 1.0043421623968736, "grad_norm": 0.03577243536710739, "learning_rate": 0.00013314020857473928, "loss": 0.0448, "step": 2313 }, { "epoch": 1.004776378636561, "grad_norm": 0.028798885643482208, "learning_rate": 0.0001331112398609502, "loss": 0.0399, "step": 2314 }, { "epoch": 1.0052105948762484, "grad_norm": 0.03741610795259476, "learning_rate": 0.00013308227114716106, "loss": 0.038, "step": 2315 }, { "epoch": 1.0056448111159357, "grad_norm": 0.030297858640551567, "learning_rate": 0.00013305330243337197, "loss": 0.0372, "step": 2316 }, { "epoch": 1.006079027355623, "grad_norm": 0.02625579945743084, "learning_rate": 0.00013302433371958284, "loss": 0.0356, "step": 2317 }, { "epoch": 1.0065132435953104, "grad_norm": 0.028840823099017143, "learning_rate": 0.00013299536500579375, "loss": 0.042, "step": 2318 }, { "epoch": 1.006947459834998, "grad_norm": 0.02382943034172058, "learning_rate": 0.00013296639629200465, "loss": 0.0308, "step": 2319 }, { "epoch": 1.0073816760746852, "grad_norm": 0.035044457763433456, "learning_rate": 0.00013293742757821553, "loss": 0.0434, "step": 2320 }, { "epoch": 1.0078158923143725, "grad_norm": 0.02849504165351391, "learning_rate": 0.00013290845886442643, "loss": 0.0362, "step": 2321 }, { "epoch": 1.0082501085540598, "grad_norm": 0.02801535278558731, "learning_rate": 0.0001328794901506373, "loss": 0.0346, "step": 2322 }, { "epoch": 1.0086843247937474, "grad_norm": 0.03731875494122505, "learning_rate": 0.0001328505214368482, "loss": 0.044, "step": 2323 }, { "epoch": 1.0091185410334347, "grad_norm": 0.03044193983078003, "learning_rate": 0.0001328215527230591, "loss": 0.0378, "step": 2324 }, { "epoch": 1.009552757273122, "grad_norm": 0.030143916606903076, "learning_rate": 0.00013279258400927, "loss": 0.0324, "step": 2325 }, { "epoch": 1.0099869735128093, "grad_norm": 0.030632860958576202, "learning_rate": 0.0001327636152954809, "loss": 0.0393, "step": 2326 }, { "epoch": 1.0104211897524968, "grad_norm": 0.03065096214413643, "learning_rate": 0.00013273464658169177, "loss": 0.0407, "step": 2327 }, { "epoch": 1.0108554059921842, "grad_norm": 0.027884751558303833, "learning_rate": 0.00013270567786790268, "loss": 0.0395, "step": 2328 }, { "epoch": 1.0112896222318715, "grad_norm": 0.031725093722343445, "learning_rate": 0.00013267670915411355, "loss": 0.0349, "step": 2329 }, { "epoch": 1.0117238384715588, "grad_norm": 0.030044177547097206, "learning_rate": 0.00013264774044032446, "loss": 0.0339, "step": 2330 }, { "epoch": 1.012158054711246, "grad_norm": 0.03298114985227585, "learning_rate": 0.00013261877172653533, "loss": 0.0402, "step": 2331 }, { "epoch": 1.0125922709509336, "grad_norm": 0.03858044743537903, "learning_rate": 0.00013258980301274624, "loss": 0.048, "step": 2332 }, { "epoch": 1.013026487190621, "grad_norm": 0.039986006915569305, "learning_rate": 0.00013256083429895714, "loss": 0.0553, "step": 2333 }, { "epoch": 1.0134607034303083, "grad_norm": 0.03128610551357269, "learning_rate": 0.00013253186558516804, "loss": 0.0394, "step": 2334 }, { "epoch": 1.0138949196699956, "grad_norm": 0.031870462000370026, "learning_rate": 0.00013250289687137892, "loss": 0.0402, "step": 2335 }, { "epoch": 1.014329135909683, "grad_norm": 0.031554024666547775, "learning_rate": 0.0001324739281575898, "loss": 0.0382, "step": 2336 }, { "epoch": 1.0147633521493704, "grad_norm": 0.029990192502737045, "learning_rate": 0.0001324449594438007, "loss": 0.038, "step": 2337 }, { "epoch": 1.0151975683890577, "grad_norm": 0.043011169880628586, "learning_rate": 0.00013241599073001158, "loss": 0.0656, "step": 2338 }, { "epoch": 1.015631784628745, "grad_norm": 0.02713867463171482, "learning_rate": 0.00013238702201622248, "loss": 0.0353, "step": 2339 }, { "epoch": 1.0160660008684326, "grad_norm": 0.029613185673952103, "learning_rate": 0.00013235805330243339, "loss": 0.0385, "step": 2340 }, { "epoch": 1.0165002171081199, "grad_norm": 0.03410862758755684, "learning_rate": 0.0001323290845886443, "loss": 0.0424, "step": 2341 }, { "epoch": 1.0169344333478072, "grad_norm": 0.030207326635718346, "learning_rate": 0.00013230011587485517, "loss": 0.0504, "step": 2342 }, { "epoch": 1.0173686495874945, "grad_norm": 0.033077266067266464, "learning_rate": 0.00013227114716106604, "loss": 0.0474, "step": 2343 }, { "epoch": 1.017802865827182, "grad_norm": 0.029622633010149002, "learning_rate": 0.00013224217844727695, "loss": 0.0363, "step": 2344 }, { "epoch": 1.0182370820668694, "grad_norm": 0.036850329488515854, "learning_rate": 0.00013221320973348782, "loss": 0.0369, "step": 2345 }, { "epoch": 1.0186712983065567, "grad_norm": 0.0308463703840971, "learning_rate": 0.00013218424101969873, "loss": 0.0372, "step": 2346 }, { "epoch": 1.019105514546244, "grad_norm": 0.02880735509097576, "learning_rate": 0.00013215527230590963, "loss": 0.0342, "step": 2347 }, { "epoch": 1.0195397307859313, "grad_norm": 0.033778198063373566, "learning_rate": 0.00013212630359212053, "loss": 0.0427, "step": 2348 }, { "epoch": 1.0199739470256188, "grad_norm": 0.06625759601593018, "learning_rate": 0.0001320973348783314, "loss": 0.0701, "step": 2349 }, { "epoch": 1.0204081632653061, "grad_norm": 0.03499298170208931, "learning_rate": 0.00013206836616454231, "loss": 0.0407, "step": 2350 }, { "epoch": 1.0208423795049935, "grad_norm": 0.026471074670553207, "learning_rate": 0.0001320393974507532, "loss": 0.0346, "step": 2351 }, { "epoch": 1.0212765957446808, "grad_norm": 0.03675152733922005, "learning_rate": 0.00013201042873696407, "loss": 0.0479, "step": 2352 }, { "epoch": 1.0217108119843683, "grad_norm": 0.048920512199401855, "learning_rate": 0.00013198146002317497, "loss": 0.0373, "step": 2353 }, { "epoch": 1.0221450282240556, "grad_norm": 0.030769484117627144, "learning_rate": 0.00013195249130938587, "loss": 0.0398, "step": 2354 }, { "epoch": 1.022579244463743, "grad_norm": 0.025104915723204613, "learning_rate": 0.00013192352259559678, "loss": 0.0329, "step": 2355 }, { "epoch": 1.0230134607034302, "grad_norm": 0.02659616619348526, "learning_rate": 0.00013189455388180766, "loss": 0.033, "step": 2356 }, { "epoch": 1.0234476769431178, "grad_norm": 0.028662407770752907, "learning_rate": 0.00013186558516801856, "loss": 0.0392, "step": 2357 }, { "epoch": 1.023881893182805, "grad_norm": 0.027455171570181847, "learning_rate": 0.00013183661645422944, "loss": 0.0337, "step": 2358 }, { "epoch": 1.0243161094224924, "grad_norm": 0.03290437161922455, "learning_rate": 0.0001318076477404403, "loss": 0.038, "step": 2359 }, { "epoch": 1.0247503256621797, "grad_norm": 0.02305612713098526, "learning_rate": 0.00013177867902665122, "loss": 0.0313, "step": 2360 }, { "epoch": 1.025184541901867, "grad_norm": 0.034108828753232956, "learning_rate": 0.00013174971031286212, "loss": 0.0461, "step": 2361 }, { "epoch": 1.0256187581415546, "grad_norm": 0.02930394746363163, "learning_rate": 0.00013172074159907302, "loss": 0.0414, "step": 2362 }, { "epoch": 1.0260529743812419, "grad_norm": 0.027452992275357246, "learning_rate": 0.0001316917728852839, "loss": 0.0318, "step": 2363 }, { "epoch": 1.0264871906209292, "grad_norm": 0.0295764971524477, "learning_rate": 0.0001316628041714948, "loss": 0.0444, "step": 2364 }, { "epoch": 1.0269214068606165, "grad_norm": 0.03134748712182045, "learning_rate": 0.00013163383545770568, "loss": 0.0434, "step": 2365 }, { "epoch": 1.027355623100304, "grad_norm": 0.027049962431192398, "learning_rate": 0.00013160486674391656, "loss": 0.0352, "step": 2366 }, { "epoch": 1.0277898393399914, "grad_norm": 0.026786714792251587, "learning_rate": 0.00013157589803012746, "loss": 0.0318, "step": 2367 }, { "epoch": 1.0282240555796787, "grad_norm": 0.024306096136569977, "learning_rate": 0.00013154692931633836, "loss": 0.0335, "step": 2368 }, { "epoch": 1.028658271819366, "grad_norm": 0.025140974670648575, "learning_rate": 0.00013151796060254927, "loss": 0.0334, "step": 2369 }, { "epoch": 1.0290924880590535, "grad_norm": 0.029016204178333282, "learning_rate": 0.00013148899188876014, "loss": 0.0385, "step": 2370 }, { "epoch": 1.0295267042987408, "grad_norm": 0.033915501087903976, "learning_rate": 0.00013146002317497105, "loss": 0.031, "step": 2371 }, { "epoch": 1.0299609205384281, "grad_norm": 0.029062507674098015, "learning_rate": 0.00013143105446118192, "loss": 0.0393, "step": 2372 }, { "epoch": 1.0303951367781155, "grad_norm": 0.029596207663416862, "learning_rate": 0.0001314020857473928, "loss": 0.0344, "step": 2373 }, { "epoch": 1.0308293530178028, "grad_norm": 0.04062100127339363, "learning_rate": 0.0001313731170336037, "loss": 0.0552, "step": 2374 }, { "epoch": 1.0312635692574903, "grad_norm": 0.03987826406955719, "learning_rate": 0.0001313441483198146, "loss": 0.0414, "step": 2375 }, { "epoch": 1.0316977854971776, "grad_norm": 0.026084046810865402, "learning_rate": 0.0001313151796060255, "loss": 0.0336, "step": 2376 }, { "epoch": 1.032132001736865, "grad_norm": 0.029388993978500366, "learning_rate": 0.0001312862108922364, "loss": 0.0407, "step": 2377 }, { "epoch": 1.0325662179765522, "grad_norm": 0.02555863745510578, "learning_rate": 0.0001312572421784473, "loss": 0.0316, "step": 2378 }, { "epoch": 1.0330004342162398, "grad_norm": 0.030339840799570084, "learning_rate": 0.00013122827346465817, "loss": 0.0428, "step": 2379 }, { "epoch": 1.033434650455927, "grad_norm": 0.02989787980914116, "learning_rate": 0.00013119930475086907, "loss": 0.0406, "step": 2380 }, { "epoch": 1.0338688666956144, "grad_norm": 0.0343019999563694, "learning_rate": 0.00013117033603707995, "loss": 0.0463, "step": 2381 }, { "epoch": 1.0343030829353017, "grad_norm": 0.033009935170412064, "learning_rate": 0.00013114136732329085, "loss": 0.0407, "step": 2382 }, { "epoch": 1.0347372991749892, "grad_norm": 0.0274325180798769, "learning_rate": 0.00013111239860950176, "loss": 0.0349, "step": 2383 }, { "epoch": 1.0351715154146766, "grad_norm": 0.028686370700597763, "learning_rate": 0.00013108342989571263, "loss": 0.0399, "step": 2384 }, { "epoch": 1.0356057316543639, "grad_norm": 0.027370866388082504, "learning_rate": 0.00013105446118192354, "loss": 0.0333, "step": 2385 }, { "epoch": 1.0360399478940512, "grad_norm": 0.027786150574684143, "learning_rate": 0.00013102549246813441, "loss": 0.0346, "step": 2386 }, { "epoch": 1.0364741641337385, "grad_norm": 0.028200514614582062, "learning_rate": 0.00013099652375434532, "loss": 0.0319, "step": 2387 }, { "epoch": 1.036908380373426, "grad_norm": 0.02947312593460083, "learning_rate": 0.0001309675550405562, "loss": 0.0414, "step": 2388 }, { "epoch": 1.0373425966131133, "grad_norm": 0.030526353046298027, "learning_rate": 0.0001309385863267671, "loss": 0.036, "step": 2389 }, { "epoch": 1.0377768128528007, "grad_norm": 0.025646423920989037, "learning_rate": 0.000130909617612978, "loss": 0.0323, "step": 2390 }, { "epoch": 1.038211029092488, "grad_norm": 0.02793518453836441, "learning_rate": 0.00013088064889918888, "loss": 0.033, "step": 2391 }, { "epoch": 1.0386452453321755, "grad_norm": 0.03149619698524475, "learning_rate": 0.00013085168018539978, "loss": 0.0415, "step": 2392 }, { "epoch": 1.0390794615718628, "grad_norm": 0.026426108554005623, "learning_rate": 0.00013082271147161066, "loss": 0.0321, "step": 2393 }, { "epoch": 1.0395136778115501, "grad_norm": 0.023818867281079292, "learning_rate": 0.00013079374275782156, "loss": 0.0264, "step": 2394 }, { "epoch": 1.0399478940512374, "grad_norm": 0.04295625537633896, "learning_rate": 0.00013076477404403244, "loss": 0.0454, "step": 2395 }, { "epoch": 1.040382110290925, "grad_norm": 0.024460244923830032, "learning_rate": 0.00013073580533024334, "loss": 0.0324, "step": 2396 }, { "epoch": 1.0408163265306123, "grad_norm": 0.028979724273085594, "learning_rate": 0.00013070683661645425, "loss": 0.0336, "step": 2397 }, { "epoch": 1.0412505427702996, "grad_norm": 0.030299313366413116, "learning_rate": 0.00013067786790266512, "loss": 0.0397, "step": 2398 }, { "epoch": 1.041684759009987, "grad_norm": 0.03340838477015495, "learning_rate": 0.00013064889918887603, "loss": 0.037, "step": 2399 }, { "epoch": 1.0421189752496742, "grad_norm": 0.026687923818826675, "learning_rate": 0.0001306199304750869, "loss": 0.0281, "step": 2400 }, { "epoch": 1.0425531914893618, "grad_norm": 0.0320209339261055, "learning_rate": 0.0001305909617612978, "loss": 0.0404, "step": 2401 }, { "epoch": 1.042987407729049, "grad_norm": 0.02752070501446724, "learning_rate": 0.00013056199304750868, "loss": 0.0338, "step": 2402 }, { "epoch": 1.0434216239687364, "grad_norm": 0.031067024916410446, "learning_rate": 0.0001305330243337196, "loss": 0.0343, "step": 2403 }, { "epoch": 1.0438558402084237, "grad_norm": 0.032763510942459106, "learning_rate": 0.0001305040556199305, "loss": 0.0393, "step": 2404 }, { "epoch": 1.0442900564481112, "grad_norm": 0.029975712299346924, "learning_rate": 0.00013047508690614137, "loss": 0.0341, "step": 2405 }, { "epoch": 1.0447242726877985, "grad_norm": 0.036660704761743546, "learning_rate": 0.00013044611819235227, "loss": 0.0405, "step": 2406 }, { "epoch": 1.0451584889274859, "grad_norm": 0.0318634919822216, "learning_rate": 0.00013041714947856315, "loss": 0.0396, "step": 2407 }, { "epoch": 1.0455927051671732, "grad_norm": 0.0407436341047287, "learning_rate": 0.00013038818076477405, "loss": 0.0459, "step": 2408 }, { "epoch": 1.0460269214068607, "grad_norm": 0.024078894406557083, "learning_rate": 0.00013035921205098493, "loss": 0.0302, "step": 2409 }, { "epoch": 1.046461137646548, "grad_norm": 0.04145204275846481, "learning_rate": 0.00013033024333719583, "loss": 0.0491, "step": 2410 }, { "epoch": 1.0468953538862353, "grad_norm": 0.029376506805419922, "learning_rate": 0.00013030127462340674, "loss": 0.0412, "step": 2411 }, { "epoch": 1.0473295701259226, "grad_norm": 0.03494599089026451, "learning_rate": 0.0001302723059096176, "loss": 0.0408, "step": 2412 }, { "epoch": 1.0477637863656102, "grad_norm": 0.03157578781247139, "learning_rate": 0.00013024333719582852, "loss": 0.0371, "step": 2413 }, { "epoch": 1.0481980026052975, "grad_norm": 0.038258202373981476, "learning_rate": 0.0001302143684820394, "loss": 0.0422, "step": 2414 }, { "epoch": 1.0486322188449848, "grad_norm": 0.028314372524619102, "learning_rate": 0.0001301853997682503, "loss": 0.0397, "step": 2415 }, { "epoch": 1.0490664350846721, "grad_norm": 0.03046509623527527, "learning_rate": 0.00013015643105446117, "loss": 0.0398, "step": 2416 }, { "epoch": 1.0495006513243594, "grad_norm": 0.03189755976200104, "learning_rate": 0.00013012746234067208, "loss": 0.0405, "step": 2417 }, { "epoch": 1.049934867564047, "grad_norm": 0.03740180656313896, "learning_rate": 0.00013009849362688298, "loss": 0.0412, "step": 2418 }, { "epoch": 1.0503690838037343, "grad_norm": 0.030254777520895004, "learning_rate": 0.00013006952491309386, "loss": 0.0362, "step": 2419 }, { "epoch": 1.0508033000434216, "grad_norm": 0.034417781978845596, "learning_rate": 0.00013004055619930476, "loss": 0.0357, "step": 2420 }, { "epoch": 1.051237516283109, "grad_norm": 0.02720572054386139, "learning_rate": 0.00013001158748551564, "loss": 0.0348, "step": 2421 }, { "epoch": 1.0516717325227964, "grad_norm": 0.031155258417129517, "learning_rate": 0.00012998261877172654, "loss": 0.041, "step": 2422 }, { "epoch": 1.0521059487624838, "grad_norm": 0.03244003653526306, "learning_rate": 0.00012995365005793742, "loss": 0.0445, "step": 2423 }, { "epoch": 1.052540165002171, "grad_norm": 0.031613972038030624, "learning_rate": 0.00012992468134414832, "loss": 0.0377, "step": 2424 }, { "epoch": 1.0529743812418584, "grad_norm": 0.03538982942700386, "learning_rate": 0.00012989571263035923, "loss": 0.0446, "step": 2425 }, { "epoch": 1.053408597481546, "grad_norm": 0.03851898014545441, "learning_rate": 0.0001298667439165701, "loss": 0.0457, "step": 2426 }, { "epoch": 1.0538428137212332, "grad_norm": 0.03028871864080429, "learning_rate": 0.000129837775202781, "loss": 0.0351, "step": 2427 }, { "epoch": 1.0542770299609205, "grad_norm": 0.023443687707185745, "learning_rate": 0.0001298088064889919, "loss": 0.0317, "step": 2428 }, { "epoch": 1.0547112462006079, "grad_norm": 0.02730853110551834, "learning_rate": 0.00012977983777520279, "loss": 0.0356, "step": 2429 }, { "epoch": 1.0551454624402952, "grad_norm": 0.02885373868048191, "learning_rate": 0.00012975086906141366, "loss": 0.0406, "step": 2430 }, { "epoch": 1.0555796786799827, "grad_norm": 0.03105398640036583, "learning_rate": 0.00012972190034762457, "loss": 0.0444, "step": 2431 }, { "epoch": 1.05601389491967, "grad_norm": 0.03212866187095642, "learning_rate": 0.00012969293163383547, "loss": 0.0387, "step": 2432 }, { "epoch": 1.0564481111593573, "grad_norm": 0.03339710086584091, "learning_rate": 0.00012966396292004637, "loss": 0.0332, "step": 2433 }, { "epoch": 1.0568823273990446, "grad_norm": 0.02770499512553215, "learning_rate": 0.00012963499420625725, "loss": 0.0357, "step": 2434 }, { "epoch": 1.0573165436387322, "grad_norm": 0.033496249467134476, "learning_rate": 0.00012960602549246815, "loss": 0.0502, "step": 2435 }, { "epoch": 1.0577507598784195, "grad_norm": 0.029884736984968185, "learning_rate": 0.00012957705677867903, "loss": 0.037, "step": 2436 }, { "epoch": 1.0581849761181068, "grad_norm": 0.027158813551068306, "learning_rate": 0.0001295480880648899, "loss": 0.0379, "step": 2437 }, { "epoch": 1.0586191923577941, "grad_norm": 0.027689753100275993, "learning_rate": 0.0001295191193511008, "loss": 0.039, "step": 2438 }, { "epoch": 1.0590534085974816, "grad_norm": 0.03104972466826439, "learning_rate": 0.00012949015063731172, "loss": 0.0391, "step": 2439 }, { "epoch": 1.059487624837169, "grad_norm": 0.02885962650179863, "learning_rate": 0.00012946118192352262, "loss": 0.0385, "step": 2440 }, { "epoch": 1.0599218410768563, "grad_norm": 0.027306437492370605, "learning_rate": 0.0001294322132097335, "loss": 0.0312, "step": 2441 }, { "epoch": 1.0603560573165436, "grad_norm": 0.030106019228696823, "learning_rate": 0.0001294032444959444, "loss": 0.0341, "step": 2442 }, { "epoch": 1.060790273556231, "grad_norm": 0.029022641479969025, "learning_rate": 0.00012937427578215528, "loss": 0.0307, "step": 2443 }, { "epoch": 1.0612244897959184, "grad_norm": 0.03943420201539993, "learning_rate": 0.00012934530706836615, "loss": 0.0467, "step": 2444 }, { "epoch": 1.0616587060356057, "grad_norm": 0.04098435118794441, "learning_rate": 0.00012931633835457706, "loss": 0.0459, "step": 2445 }, { "epoch": 1.062092922275293, "grad_norm": 0.03454793244600296, "learning_rate": 0.00012928736964078796, "loss": 0.0371, "step": 2446 }, { "epoch": 1.0625271385149804, "grad_norm": 0.05405750125646591, "learning_rate": 0.00012925840092699886, "loss": 0.0415, "step": 2447 }, { "epoch": 1.062961354754668, "grad_norm": 0.03257249295711517, "learning_rate": 0.00012922943221320974, "loss": 0.0453, "step": 2448 }, { "epoch": 1.0633955709943552, "grad_norm": 0.027479713782668114, "learning_rate": 0.00012920046349942064, "loss": 0.0314, "step": 2449 }, { "epoch": 1.0638297872340425, "grad_norm": 0.027265125885605812, "learning_rate": 0.00012917149478563152, "loss": 0.0293, "step": 2450 }, { "epoch": 1.0642640034737298, "grad_norm": 0.035143379122018814, "learning_rate": 0.0001291425260718424, "loss": 0.0457, "step": 2451 }, { "epoch": 1.0646982197134174, "grad_norm": 0.03819432482123375, "learning_rate": 0.0001291135573580533, "loss": 0.0487, "step": 2452 }, { "epoch": 1.0651324359531047, "grad_norm": 0.03533809259533882, "learning_rate": 0.0001290845886442642, "loss": 0.0477, "step": 2453 }, { "epoch": 1.065566652192792, "grad_norm": 0.0316828228533268, "learning_rate": 0.0001290556199304751, "loss": 0.0383, "step": 2454 }, { "epoch": 1.0660008684324793, "grad_norm": 0.02830331027507782, "learning_rate": 0.00012902665121668598, "loss": 0.0336, "step": 2455 }, { "epoch": 1.0664350846721669, "grad_norm": 0.04328421875834465, "learning_rate": 0.0001289976825028969, "loss": 0.0423, "step": 2456 }, { "epoch": 1.0668693009118542, "grad_norm": 0.03268023207783699, "learning_rate": 0.00012896871378910777, "loss": 0.0366, "step": 2457 }, { "epoch": 1.0673035171515415, "grad_norm": 0.02956596575677395, "learning_rate": 0.00012893974507531864, "loss": 0.0368, "step": 2458 }, { "epoch": 1.0677377333912288, "grad_norm": 0.02850380167365074, "learning_rate": 0.00012891077636152955, "loss": 0.0299, "step": 2459 }, { "epoch": 1.068171949630916, "grad_norm": 0.032775457948446274, "learning_rate": 0.00012888180764774045, "loss": 0.0428, "step": 2460 }, { "epoch": 1.0686061658706036, "grad_norm": 0.03359070047736168, "learning_rate": 0.00012885283893395135, "loss": 0.0394, "step": 2461 }, { "epoch": 1.069040382110291, "grad_norm": 0.028799109160900116, "learning_rate": 0.00012882387022016223, "loss": 0.0338, "step": 2462 }, { "epoch": 1.0694745983499783, "grad_norm": 0.02847568690776825, "learning_rate": 0.00012879490150637313, "loss": 0.0362, "step": 2463 }, { "epoch": 1.0699088145896656, "grad_norm": 0.03150567784905434, "learning_rate": 0.000128765932792584, "loss": 0.0385, "step": 2464 }, { "epoch": 1.0703430308293531, "grad_norm": 0.032329261302948, "learning_rate": 0.0001287369640787949, "loss": 0.0482, "step": 2465 }, { "epoch": 1.0707772470690404, "grad_norm": 0.0276672150939703, "learning_rate": 0.0001287079953650058, "loss": 0.0332, "step": 2466 }, { "epoch": 1.0712114633087277, "grad_norm": 0.029301410540938377, "learning_rate": 0.0001286790266512167, "loss": 0.039, "step": 2467 }, { "epoch": 1.071645679548415, "grad_norm": 0.03596123680472374, "learning_rate": 0.0001286500579374276, "loss": 0.0413, "step": 2468 }, { "epoch": 1.0720798957881024, "grad_norm": 0.03181058168411255, "learning_rate": 0.00012862108922363847, "loss": 0.0452, "step": 2469 }, { "epoch": 1.07251411202779, "grad_norm": 0.035492829978466034, "learning_rate": 0.00012859212050984938, "loss": 0.0436, "step": 2470 }, { "epoch": 1.0729483282674772, "grad_norm": 0.030831221491098404, "learning_rate": 0.00012856315179606025, "loss": 0.0381, "step": 2471 }, { "epoch": 1.0733825445071645, "grad_norm": 0.040080588310956955, "learning_rate": 0.00012853418308227113, "loss": 0.0482, "step": 2472 }, { "epoch": 1.0738167607468518, "grad_norm": 0.026362432166934013, "learning_rate": 0.00012850521436848204, "loss": 0.0358, "step": 2473 }, { "epoch": 1.0742509769865394, "grad_norm": 0.026130860671401024, "learning_rate": 0.00012847624565469294, "loss": 0.03, "step": 2474 }, { "epoch": 1.0746851932262267, "grad_norm": 0.030226856470108032, "learning_rate": 0.00012844727694090384, "loss": 0.0366, "step": 2475 }, { "epoch": 1.075119409465914, "grad_norm": 0.03542635589838028, "learning_rate": 0.00012841830822711472, "loss": 0.0474, "step": 2476 }, { "epoch": 1.0755536257056013, "grad_norm": 0.032922666519880295, "learning_rate": 0.00012838933951332562, "loss": 0.0379, "step": 2477 }, { "epoch": 1.0759878419452888, "grad_norm": 0.030406389385461807, "learning_rate": 0.0001283603707995365, "loss": 0.038, "step": 2478 }, { "epoch": 1.0764220581849762, "grad_norm": 0.031172825023531914, "learning_rate": 0.0001283314020857474, "loss": 0.036, "step": 2479 }, { "epoch": 1.0768562744246635, "grad_norm": 0.03052966669201851, "learning_rate": 0.00012830243337195828, "loss": 0.0353, "step": 2480 }, { "epoch": 1.0772904906643508, "grad_norm": 0.03362059220671654, "learning_rate": 0.00012827346465816918, "loss": 0.0433, "step": 2481 }, { "epoch": 1.0777247069040383, "grad_norm": 0.030276693403720856, "learning_rate": 0.0001282444959443801, "loss": 0.039, "step": 2482 }, { "epoch": 1.0781589231437256, "grad_norm": 0.030641235411167145, "learning_rate": 0.00012821552723059096, "loss": 0.0391, "step": 2483 }, { "epoch": 1.078593139383413, "grad_norm": 0.02855825051665306, "learning_rate": 0.00012818655851680187, "loss": 0.0344, "step": 2484 }, { "epoch": 1.0790273556231003, "grad_norm": 0.03668443113565445, "learning_rate": 0.00012815758980301274, "loss": 0.0407, "step": 2485 }, { "epoch": 1.0794615718627876, "grad_norm": 0.029565056785941124, "learning_rate": 0.00012812862108922365, "loss": 0.0393, "step": 2486 }, { "epoch": 1.079895788102475, "grad_norm": 0.034834124147892, "learning_rate": 0.00012809965237543452, "loss": 0.0422, "step": 2487 }, { "epoch": 1.0803300043421624, "grad_norm": 0.03571239858865738, "learning_rate": 0.00012807068366164543, "loss": 0.0508, "step": 2488 }, { "epoch": 1.0807642205818497, "grad_norm": 0.03852108120918274, "learning_rate": 0.00012804171494785633, "loss": 0.0477, "step": 2489 }, { "epoch": 1.081198436821537, "grad_norm": 0.028152965009212494, "learning_rate": 0.0001280127462340672, "loss": 0.033, "step": 2490 }, { "epoch": 1.0816326530612246, "grad_norm": 0.03646685555577278, "learning_rate": 0.0001279837775202781, "loss": 0.0464, "step": 2491 }, { "epoch": 1.082066869300912, "grad_norm": 0.044930726289749146, "learning_rate": 0.000127954808806489, "loss": 0.0483, "step": 2492 }, { "epoch": 1.0825010855405992, "grad_norm": 0.024634433910250664, "learning_rate": 0.0001279258400926999, "loss": 0.0288, "step": 2493 }, { "epoch": 1.0829353017802865, "grad_norm": 0.028381705284118652, "learning_rate": 0.00012789687137891077, "loss": 0.033, "step": 2494 }, { "epoch": 1.0833695180199738, "grad_norm": 0.023782627657055855, "learning_rate": 0.00012786790266512167, "loss": 0.0334, "step": 2495 }, { "epoch": 1.0838037342596614, "grad_norm": 0.027879077941179276, "learning_rate": 0.00012783893395133258, "loss": 0.0338, "step": 2496 }, { "epoch": 1.0842379504993487, "grad_norm": 0.03144657984375954, "learning_rate": 0.00012780996523754345, "loss": 0.0404, "step": 2497 }, { "epoch": 1.084672166739036, "grad_norm": 0.03416517749428749, "learning_rate": 0.00012778099652375436, "loss": 0.0394, "step": 2498 }, { "epoch": 1.0851063829787233, "grad_norm": 0.03528415784239769, "learning_rate": 0.00012775202780996523, "loss": 0.0405, "step": 2499 }, { "epoch": 1.0855405992184108, "grad_norm": 0.039117805659770966, "learning_rate": 0.00012772305909617614, "loss": 0.0423, "step": 2500 }, { "epoch": 1.0859748154580982, "grad_norm": 0.03378056734800339, "learning_rate": 0.00012769409038238701, "loss": 0.0386, "step": 2501 }, { "epoch": 1.0864090316977855, "grad_norm": 0.029471689835190773, "learning_rate": 0.00012766512166859792, "loss": 0.0349, "step": 2502 }, { "epoch": 1.0868432479374728, "grad_norm": 0.025784606114029884, "learning_rate": 0.00012763615295480882, "loss": 0.0312, "step": 2503 }, { "epoch": 1.0872774641771603, "grad_norm": 0.037786632776260376, "learning_rate": 0.0001276071842410197, "loss": 0.0396, "step": 2504 }, { "epoch": 1.0877116804168476, "grad_norm": 0.029232703149318695, "learning_rate": 0.0001275782155272306, "loss": 0.0364, "step": 2505 }, { "epoch": 1.088145896656535, "grad_norm": 0.02802528440952301, "learning_rate": 0.0001275492468134415, "loss": 0.036, "step": 2506 }, { "epoch": 1.0885801128962223, "grad_norm": 0.03140205889940262, "learning_rate": 0.00012752027809965238, "loss": 0.0343, "step": 2507 }, { "epoch": 1.0890143291359098, "grad_norm": 0.03466328978538513, "learning_rate": 0.00012749130938586326, "loss": 0.0372, "step": 2508 }, { "epoch": 1.089448545375597, "grad_norm": 0.03505750373005867, "learning_rate": 0.00012746234067207416, "loss": 0.0468, "step": 2509 }, { "epoch": 1.0898827616152844, "grad_norm": 0.03273766115307808, "learning_rate": 0.00012743337195828507, "loss": 0.0409, "step": 2510 }, { "epoch": 1.0903169778549717, "grad_norm": 0.03252221643924713, "learning_rate": 0.00012740440324449594, "loss": 0.0489, "step": 2511 }, { "epoch": 1.090751194094659, "grad_norm": 0.0284226406365633, "learning_rate": 0.00012737543453070685, "loss": 0.0345, "step": 2512 }, { "epoch": 1.0911854103343466, "grad_norm": 0.033943235874176025, "learning_rate": 0.00012734646581691775, "loss": 0.0406, "step": 2513 }, { "epoch": 1.0916196265740339, "grad_norm": 0.03052627108991146, "learning_rate": 0.00012731749710312863, "loss": 0.0355, "step": 2514 }, { "epoch": 1.0920538428137212, "grad_norm": 0.030543139204382896, "learning_rate": 0.0001272885283893395, "loss": 0.033, "step": 2515 }, { "epoch": 1.0924880590534085, "grad_norm": 0.03386886790394783, "learning_rate": 0.0001272595596755504, "loss": 0.0453, "step": 2516 }, { "epoch": 1.092922275293096, "grad_norm": 0.024116946384310722, "learning_rate": 0.0001272305909617613, "loss": 0.0282, "step": 2517 }, { "epoch": 1.0933564915327834, "grad_norm": 0.035883307456970215, "learning_rate": 0.0001272016222479722, "loss": 0.0401, "step": 2518 }, { "epoch": 1.0937907077724707, "grad_norm": 0.03171491250395775, "learning_rate": 0.0001271726535341831, "loss": 0.0421, "step": 2519 }, { "epoch": 1.094224924012158, "grad_norm": 0.038344383239746094, "learning_rate": 0.000127143684820394, "loss": 0.0554, "step": 2520 }, { "epoch": 1.0946591402518455, "grad_norm": 0.024053802713751793, "learning_rate": 0.00012711471610660487, "loss": 0.03, "step": 2521 }, { "epoch": 1.0950933564915328, "grad_norm": 0.03033187985420227, "learning_rate": 0.00012708574739281575, "loss": 0.0342, "step": 2522 }, { "epoch": 1.0955275727312201, "grad_norm": 0.034343063831329346, "learning_rate": 0.00012705677867902665, "loss": 0.0434, "step": 2523 }, { "epoch": 1.0959617889709075, "grad_norm": 0.02721463143825531, "learning_rate": 0.00012702780996523756, "loss": 0.0343, "step": 2524 }, { "epoch": 1.096396005210595, "grad_norm": 0.035562124103307724, "learning_rate": 0.00012699884125144846, "loss": 0.0361, "step": 2525 }, { "epoch": 1.0968302214502823, "grad_norm": 0.02852759137749672, "learning_rate": 0.00012696987253765934, "loss": 0.0351, "step": 2526 }, { "epoch": 1.0972644376899696, "grad_norm": 0.030773548409342766, "learning_rate": 0.00012694090382387024, "loss": 0.0384, "step": 2527 }, { "epoch": 1.097698653929657, "grad_norm": 0.026188410818576813, "learning_rate": 0.00012691193511008112, "loss": 0.0325, "step": 2528 }, { "epoch": 1.0981328701693442, "grad_norm": 0.032989006489515305, "learning_rate": 0.000126882966396292, "loss": 0.0464, "step": 2529 }, { "epoch": 1.0985670864090318, "grad_norm": 0.023154255002737045, "learning_rate": 0.0001268539976825029, "loss": 0.029, "step": 2530 }, { "epoch": 1.099001302648719, "grad_norm": 0.021991247311234474, "learning_rate": 0.0001268250289687138, "loss": 0.0244, "step": 2531 }, { "epoch": 1.0994355188884064, "grad_norm": 0.03154022619128227, "learning_rate": 0.0001267960602549247, "loss": 0.0343, "step": 2532 }, { "epoch": 1.0998697351280937, "grad_norm": 0.029632005840539932, "learning_rate": 0.00012676709154113558, "loss": 0.0396, "step": 2533 }, { "epoch": 1.1003039513677813, "grad_norm": 0.03726745396852493, "learning_rate": 0.00012673812282734648, "loss": 0.0406, "step": 2534 }, { "epoch": 1.1007381676074686, "grad_norm": 0.03856489062309265, "learning_rate": 0.00012670915411355736, "loss": 0.0436, "step": 2535 }, { "epoch": 1.1011723838471559, "grad_norm": 0.041072964668273926, "learning_rate": 0.00012668018539976824, "loss": 0.0575, "step": 2536 }, { "epoch": 1.1016066000868432, "grad_norm": 0.031296756118535995, "learning_rate": 0.00012665121668597914, "loss": 0.0402, "step": 2537 }, { "epoch": 1.1020408163265305, "grad_norm": 0.035562556236982346, "learning_rate": 0.00012662224797219004, "loss": 0.0399, "step": 2538 }, { "epoch": 1.102475032566218, "grad_norm": 0.03985482081770897, "learning_rate": 0.00012659327925840095, "loss": 0.0449, "step": 2539 }, { "epoch": 1.1029092488059054, "grad_norm": 0.037965770810842514, "learning_rate": 0.00012656431054461183, "loss": 0.0489, "step": 2540 }, { "epoch": 1.1033434650455927, "grad_norm": 0.033631499856710434, "learning_rate": 0.00012653534183082273, "loss": 0.0371, "step": 2541 }, { "epoch": 1.10377768128528, "grad_norm": 0.039605170488357544, "learning_rate": 0.0001265063731170336, "loss": 0.0487, "step": 2542 }, { "epoch": 1.1042118975249675, "grad_norm": 0.027241675183176994, "learning_rate": 0.00012647740440324448, "loss": 0.0307, "step": 2543 }, { "epoch": 1.1046461137646548, "grad_norm": 0.03400669991970062, "learning_rate": 0.00012644843568945539, "loss": 0.039, "step": 2544 }, { "epoch": 1.1050803300043421, "grad_norm": 0.035373345017433167, "learning_rate": 0.0001264194669756663, "loss": 0.0374, "step": 2545 }, { "epoch": 1.1055145462440295, "grad_norm": 0.032689835876226425, "learning_rate": 0.0001263904982618772, "loss": 0.0408, "step": 2546 }, { "epoch": 1.105948762483717, "grad_norm": 0.03441290929913521, "learning_rate": 0.00012636152954808807, "loss": 0.0457, "step": 2547 }, { "epoch": 1.1063829787234043, "grad_norm": 0.03609965369105339, "learning_rate": 0.00012633256083429897, "loss": 0.0392, "step": 2548 }, { "epoch": 1.1068171949630916, "grad_norm": 0.031381186097860336, "learning_rate": 0.00012630359212050985, "loss": 0.0379, "step": 2549 }, { "epoch": 1.107251411202779, "grad_norm": 0.028263196349143982, "learning_rate": 0.00012627462340672073, "loss": 0.0382, "step": 2550 }, { "epoch": 1.1076856274424665, "grad_norm": 0.044052716344594955, "learning_rate": 0.00012624565469293163, "loss": 0.0498, "step": 2551 }, { "epoch": 1.1081198436821538, "grad_norm": 0.028410131111741066, "learning_rate": 0.00012621668597914253, "loss": 0.0341, "step": 2552 }, { "epoch": 1.108554059921841, "grad_norm": 0.032054681330919266, "learning_rate": 0.00012618771726535344, "loss": 0.0392, "step": 2553 }, { "epoch": 1.1089882761615284, "grad_norm": 0.0259060338139534, "learning_rate": 0.00012615874855156431, "loss": 0.0324, "step": 2554 }, { "epoch": 1.1094224924012157, "grad_norm": 0.02634112350642681, "learning_rate": 0.00012612977983777522, "loss": 0.0283, "step": 2555 }, { "epoch": 1.1098567086409032, "grad_norm": 0.030299503356218338, "learning_rate": 0.0001261008111239861, "loss": 0.0346, "step": 2556 }, { "epoch": 1.1102909248805906, "grad_norm": 0.03189830854535103, "learning_rate": 0.00012607184241019697, "loss": 0.0374, "step": 2557 }, { "epoch": 1.1107251411202779, "grad_norm": 0.02752586081624031, "learning_rate": 0.00012604287369640788, "loss": 0.0342, "step": 2558 }, { "epoch": 1.1111593573599652, "grad_norm": 0.025450093671679497, "learning_rate": 0.00012601390498261878, "loss": 0.0349, "step": 2559 }, { "epoch": 1.1115935735996527, "grad_norm": 0.036472368985414505, "learning_rate": 0.00012598493626882968, "loss": 0.0445, "step": 2560 }, { "epoch": 1.11202778983934, "grad_norm": 0.03890056535601616, "learning_rate": 0.00012595596755504056, "loss": 0.0419, "step": 2561 }, { "epoch": 1.1124620060790273, "grad_norm": 0.03232402727007866, "learning_rate": 0.00012592699884125146, "loss": 0.0399, "step": 2562 }, { "epoch": 1.1128962223187147, "grad_norm": 0.03463084623217583, "learning_rate": 0.00012589803012746234, "loss": 0.043, "step": 2563 }, { "epoch": 1.113330438558402, "grad_norm": 0.031389810144901276, "learning_rate": 0.00012586906141367322, "loss": 0.0396, "step": 2564 }, { "epoch": 1.1137646547980895, "grad_norm": 0.03460206463932991, "learning_rate": 0.00012584009269988412, "loss": 0.0382, "step": 2565 }, { "epoch": 1.1141988710377768, "grad_norm": 0.032664183527231216, "learning_rate": 0.00012581112398609502, "loss": 0.0419, "step": 2566 }, { "epoch": 1.1146330872774641, "grad_norm": 0.028210842981934547, "learning_rate": 0.00012578215527230593, "loss": 0.04, "step": 2567 }, { "epoch": 1.1150673035171514, "grad_norm": 0.027972571551799774, "learning_rate": 0.0001257531865585168, "loss": 0.0347, "step": 2568 }, { "epoch": 1.115501519756839, "grad_norm": 0.031685106456279755, "learning_rate": 0.0001257242178447277, "loss": 0.037, "step": 2569 }, { "epoch": 1.1159357359965263, "grad_norm": 0.02954479120671749, "learning_rate": 0.00012569524913093858, "loss": 0.035, "step": 2570 }, { "epoch": 1.1163699522362136, "grad_norm": 0.031048987060785294, "learning_rate": 0.0001256662804171495, "loss": 0.0365, "step": 2571 }, { "epoch": 1.116804168475901, "grad_norm": 0.03579281270503998, "learning_rate": 0.00012563731170336036, "loss": 0.0419, "step": 2572 }, { "epoch": 1.1172383847155885, "grad_norm": 0.026284685358405113, "learning_rate": 0.00012560834298957127, "loss": 0.0315, "step": 2573 }, { "epoch": 1.1176726009552758, "grad_norm": 0.03125812113285065, "learning_rate": 0.00012557937427578217, "loss": 0.0419, "step": 2574 }, { "epoch": 1.118106817194963, "grad_norm": 0.029510769993066788, "learning_rate": 0.00012555040556199305, "loss": 0.0341, "step": 2575 }, { "epoch": 1.1185410334346504, "grad_norm": 0.02720577083528042, "learning_rate": 0.00012552143684820395, "loss": 0.0374, "step": 2576 }, { "epoch": 1.118975249674338, "grad_norm": 0.02508411556482315, "learning_rate": 0.00012549246813441483, "loss": 0.0327, "step": 2577 }, { "epoch": 1.1194094659140252, "grad_norm": 0.02563556097447872, "learning_rate": 0.00012546349942062573, "loss": 0.0322, "step": 2578 }, { "epoch": 1.1198436821537125, "grad_norm": 0.036028340458869934, "learning_rate": 0.0001254345307068366, "loss": 0.0413, "step": 2579 }, { "epoch": 1.1202778983933999, "grad_norm": 0.029660964384675026, "learning_rate": 0.0001254055619930475, "loss": 0.0379, "step": 2580 }, { "epoch": 1.1207121146330872, "grad_norm": 0.034006036818027496, "learning_rate": 0.00012537659327925842, "loss": 0.0401, "step": 2581 }, { "epoch": 1.1211463308727747, "grad_norm": 0.03772401064634323, "learning_rate": 0.0001253476245654693, "loss": 0.0465, "step": 2582 }, { "epoch": 1.121580547112462, "grad_norm": 0.032939523458480835, "learning_rate": 0.0001253186558516802, "loss": 0.0507, "step": 2583 }, { "epoch": 1.1220147633521493, "grad_norm": 0.03197585791349411, "learning_rate": 0.0001252896871378911, "loss": 0.0343, "step": 2584 }, { "epoch": 1.1224489795918366, "grad_norm": 0.03280562534928322, "learning_rate": 0.00012526071842410198, "loss": 0.0319, "step": 2585 }, { "epoch": 1.1228831958315242, "grad_norm": 0.03605299070477486, "learning_rate": 0.00012523174971031285, "loss": 0.0463, "step": 2586 }, { "epoch": 1.1233174120712115, "grad_norm": 0.03192775323987007, "learning_rate": 0.00012520278099652376, "loss": 0.0391, "step": 2587 }, { "epoch": 1.1237516283108988, "grad_norm": 0.03274547681212425, "learning_rate": 0.00012517381228273466, "loss": 0.0423, "step": 2588 }, { "epoch": 1.1241858445505861, "grad_norm": 0.031379520893096924, "learning_rate": 0.00012514484356894554, "loss": 0.0371, "step": 2589 }, { "epoch": 1.1246200607902737, "grad_norm": 0.035379983484745026, "learning_rate": 0.00012511587485515644, "loss": 0.0398, "step": 2590 }, { "epoch": 1.125054277029961, "grad_norm": 0.030178261920809746, "learning_rate": 0.00012508690614136735, "loss": 0.0387, "step": 2591 }, { "epoch": 1.1254884932696483, "grad_norm": 0.028540175408124924, "learning_rate": 0.00012505793742757822, "loss": 0.0405, "step": 2592 }, { "epoch": 1.1259227095093356, "grad_norm": 0.024769525974988937, "learning_rate": 0.0001250289687137891, "loss": 0.0307, "step": 2593 }, { "epoch": 1.1263569257490231, "grad_norm": 0.03165381774306297, "learning_rate": 0.000125, "loss": 0.044, "step": 2594 }, { "epoch": 1.1267911419887104, "grad_norm": 0.024246957153081894, "learning_rate": 0.0001249710312862109, "loss": 0.0308, "step": 2595 }, { "epoch": 1.1272253582283978, "grad_norm": 0.03457241505384445, "learning_rate": 0.00012494206257242178, "loss": 0.0354, "step": 2596 }, { "epoch": 1.127659574468085, "grad_norm": 0.03627859428524971, "learning_rate": 0.0001249130938586327, "loss": 0.0417, "step": 2597 }, { "epoch": 1.1280937907077724, "grad_norm": 0.028001435101032257, "learning_rate": 0.0001248841251448436, "loss": 0.0365, "step": 2598 }, { "epoch": 1.12852800694746, "grad_norm": 0.036363933235406876, "learning_rate": 0.00012485515643105447, "loss": 0.0476, "step": 2599 }, { "epoch": 1.1289622231871472, "grad_norm": 0.027391374111175537, "learning_rate": 0.00012482618771726534, "loss": 0.0357, "step": 2600 }, { "epoch": 1.1293964394268345, "grad_norm": 0.031553614884614944, "learning_rate": 0.00012479721900347625, "loss": 0.0405, "step": 2601 }, { "epoch": 1.1298306556665219, "grad_norm": 0.03029167652130127, "learning_rate": 0.00012476825028968715, "loss": 0.0393, "step": 2602 }, { "epoch": 1.1302648719062094, "grad_norm": 0.02725207805633545, "learning_rate": 0.00012473928157589803, "loss": 0.0341, "step": 2603 }, { "epoch": 1.1306990881458967, "grad_norm": 0.031601689755916595, "learning_rate": 0.00012471031286210893, "loss": 0.0402, "step": 2604 }, { "epoch": 1.131133304385584, "grad_norm": 0.03244725614786148, "learning_rate": 0.00012468134414831984, "loss": 0.0367, "step": 2605 }, { "epoch": 1.1315675206252713, "grad_norm": 0.03136738762259483, "learning_rate": 0.0001246523754345307, "loss": 0.0345, "step": 2606 }, { "epoch": 1.1320017368649586, "grad_norm": 0.027320927008986473, "learning_rate": 0.0001246234067207416, "loss": 0.0363, "step": 2607 }, { "epoch": 1.1324359531046462, "grad_norm": 0.0403415746986866, "learning_rate": 0.0001245944380069525, "loss": 0.0468, "step": 2608 }, { "epoch": 1.1328701693443335, "grad_norm": 0.03412803262472153, "learning_rate": 0.0001245654692931634, "loss": 0.0417, "step": 2609 }, { "epoch": 1.1333043855840208, "grad_norm": 0.02597365342080593, "learning_rate": 0.00012453650057937427, "loss": 0.0387, "step": 2610 }, { "epoch": 1.1337386018237081, "grad_norm": 0.03134492039680481, "learning_rate": 0.00012450753186558518, "loss": 0.0418, "step": 2611 }, { "epoch": 1.1341728180633956, "grad_norm": 0.02752791903913021, "learning_rate": 0.00012447856315179608, "loss": 0.0339, "step": 2612 }, { "epoch": 1.134607034303083, "grad_norm": 0.028080793097615242, "learning_rate": 0.00012444959443800696, "loss": 0.0344, "step": 2613 }, { "epoch": 1.1350412505427703, "grad_norm": 0.031173771247267723, "learning_rate": 0.00012442062572421783, "loss": 0.038, "step": 2614 }, { "epoch": 1.1354754667824576, "grad_norm": 0.0302566010504961, "learning_rate": 0.00012439165701042874, "loss": 0.0381, "step": 2615 }, { "epoch": 1.1359096830221451, "grad_norm": 0.02716781012713909, "learning_rate": 0.00012436268829663964, "loss": 0.0367, "step": 2616 }, { "epoch": 1.1363438992618324, "grad_norm": 0.026070397347211838, "learning_rate": 0.00012433371958285052, "loss": 0.0352, "step": 2617 }, { "epoch": 1.1367781155015197, "grad_norm": 0.028197156265378, "learning_rate": 0.00012430475086906142, "loss": 0.0342, "step": 2618 }, { "epoch": 1.137212331741207, "grad_norm": 0.026335185393691063, "learning_rate": 0.00012427578215527232, "loss": 0.0298, "step": 2619 }, { "epoch": 1.1376465479808946, "grad_norm": 0.034634869545698166, "learning_rate": 0.0001242468134414832, "loss": 0.042, "step": 2620 }, { "epoch": 1.138080764220582, "grad_norm": 0.03242142125964165, "learning_rate": 0.00012421784472769408, "loss": 0.0368, "step": 2621 }, { "epoch": 1.1385149804602692, "grad_norm": 0.037589315325021744, "learning_rate": 0.00012418887601390498, "loss": 0.0425, "step": 2622 }, { "epoch": 1.1389491966999565, "grad_norm": 0.04236025735735893, "learning_rate": 0.00012415990730011589, "loss": 0.0649, "step": 2623 }, { "epoch": 1.1393834129396438, "grad_norm": 0.02821427583694458, "learning_rate": 0.0001241309385863268, "loss": 0.0304, "step": 2624 }, { "epoch": 1.1398176291793314, "grad_norm": 0.03152201324701309, "learning_rate": 0.00012410196987253767, "loss": 0.0363, "step": 2625 }, { "epoch": 1.1402518454190187, "grad_norm": 0.03165849298238754, "learning_rate": 0.00012407300115874857, "loss": 0.0364, "step": 2626 }, { "epoch": 1.140686061658706, "grad_norm": 0.033311545848846436, "learning_rate": 0.00012404403244495945, "loss": 0.0415, "step": 2627 }, { "epoch": 1.1411202778983933, "grad_norm": 0.02829173021018505, "learning_rate": 0.00012401506373117032, "loss": 0.0301, "step": 2628 }, { "epoch": 1.1415544941380809, "grad_norm": 0.03920881822705269, "learning_rate": 0.00012398609501738123, "loss": 0.0454, "step": 2629 }, { "epoch": 1.1419887103777682, "grad_norm": 0.034461867064237595, "learning_rate": 0.00012395712630359213, "loss": 0.0445, "step": 2630 }, { "epoch": 1.1424229266174555, "grad_norm": 0.029790587723255157, "learning_rate": 0.00012392815758980303, "loss": 0.0357, "step": 2631 }, { "epoch": 1.1428571428571428, "grad_norm": 0.03239079937338829, "learning_rate": 0.0001238991888760139, "loss": 0.0322, "step": 2632 }, { "epoch": 1.14329135909683, "grad_norm": 0.032172661274671555, "learning_rate": 0.00012387022016222481, "loss": 0.0384, "step": 2633 }, { "epoch": 1.1437255753365176, "grad_norm": 0.031060023233294487, "learning_rate": 0.0001238412514484357, "loss": 0.0382, "step": 2634 }, { "epoch": 1.144159791576205, "grad_norm": 0.04278445988893509, "learning_rate": 0.00012381228273464657, "loss": 0.0501, "step": 2635 }, { "epoch": 1.1445940078158923, "grad_norm": 0.02790413238108158, "learning_rate": 0.00012378331402085747, "loss": 0.0336, "step": 2636 }, { "epoch": 1.1450282240555798, "grad_norm": 0.037166085094213486, "learning_rate": 0.00012375434530706837, "loss": 0.0445, "step": 2637 }, { "epoch": 1.1454624402952671, "grad_norm": 0.034547992050647736, "learning_rate": 0.00012372537659327928, "loss": 0.0414, "step": 2638 }, { "epoch": 1.1458966565349544, "grad_norm": 0.03248521685600281, "learning_rate": 0.00012369640787949015, "loss": 0.039, "step": 2639 }, { "epoch": 1.1463308727746417, "grad_norm": 0.027289357036352158, "learning_rate": 0.00012366743916570106, "loss": 0.0377, "step": 2640 }, { "epoch": 1.146765089014329, "grad_norm": 0.035692907869815826, "learning_rate": 0.00012363847045191194, "loss": 0.0409, "step": 2641 }, { "epoch": 1.1471993052540166, "grad_norm": 0.02925136871635914, "learning_rate": 0.0001236095017381228, "loss": 0.0336, "step": 2642 }, { "epoch": 1.147633521493704, "grad_norm": 0.03030095063149929, "learning_rate": 0.00012358053302433372, "loss": 0.0423, "step": 2643 }, { "epoch": 1.1480677377333912, "grad_norm": 0.027547160163521767, "learning_rate": 0.00012355156431054462, "loss": 0.0329, "step": 2644 }, { "epoch": 1.1485019539730785, "grad_norm": 0.027768438681960106, "learning_rate": 0.00012352259559675552, "loss": 0.0339, "step": 2645 }, { "epoch": 1.148936170212766, "grad_norm": 0.027045991271734238, "learning_rate": 0.0001234936268829664, "loss": 0.0319, "step": 2646 }, { "epoch": 1.1493703864524534, "grad_norm": 0.03252058103680611, "learning_rate": 0.0001234646581691773, "loss": 0.0386, "step": 2647 }, { "epoch": 1.1498046026921407, "grad_norm": 0.032058533281087875, "learning_rate": 0.00012343568945538818, "loss": 0.0388, "step": 2648 }, { "epoch": 1.150238818931828, "grad_norm": 0.0253597479313612, "learning_rate": 0.00012340672074159906, "loss": 0.0303, "step": 2649 }, { "epoch": 1.1506730351715153, "grad_norm": 0.03661920130252838, "learning_rate": 0.00012337775202780996, "loss": 0.0327, "step": 2650 }, { "epoch": 1.1511072514112028, "grad_norm": 0.030651992186903954, "learning_rate": 0.00012334878331402086, "loss": 0.0348, "step": 2651 }, { "epoch": 1.1515414676508902, "grad_norm": 0.03158602863550186, "learning_rate": 0.00012331981460023177, "loss": 0.0391, "step": 2652 }, { "epoch": 1.1519756838905775, "grad_norm": 0.03361821174621582, "learning_rate": 0.00012329084588644264, "loss": 0.0372, "step": 2653 }, { "epoch": 1.1524099001302648, "grad_norm": 0.03103194199502468, "learning_rate": 0.00012326187717265355, "loss": 0.0348, "step": 2654 }, { "epoch": 1.1528441163699523, "grad_norm": 0.030980534851551056, "learning_rate": 0.00012323290845886442, "loss": 0.036, "step": 2655 }, { "epoch": 1.1532783326096396, "grad_norm": 0.029037483036518097, "learning_rate": 0.0001232039397450753, "loss": 0.0352, "step": 2656 }, { "epoch": 1.153712548849327, "grad_norm": 0.03330636024475098, "learning_rate": 0.0001231749710312862, "loss": 0.0427, "step": 2657 }, { "epoch": 1.1541467650890143, "grad_norm": 0.03869554400444031, "learning_rate": 0.0001231460023174971, "loss": 0.0464, "step": 2658 }, { "epoch": 1.1545809813287016, "grad_norm": 0.037938669323921204, "learning_rate": 0.000123117033603708, "loss": 0.043, "step": 2659 }, { "epoch": 1.155015197568389, "grad_norm": 0.030285336077213287, "learning_rate": 0.0001230880648899189, "loss": 0.0331, "step": 2660 }, { "epoch": 1.1554494138080764, "grad_norm": 0.03161444514989853, "learning_rate": 0.0001230590961761298, "loss": 0.0417, "step": 2661 }, { "epoch": 1.1558836300477637, "grad_norm": 0.028901919722557068, "learning_rate": 0.0001230301274623407, "loss": 0.0314, "step": 2662 }, { "epoch": 1.1563178462874513, "grad_norm": 0.031752265989780426, "learning_rate": 0.00012300115874855155, "loss": 0.0355, "step": 2663 }, { "epoch": 1.1567520625271386, "grad_norm": 0.03106089122593403, "learning_rate": 0.00012297219003476245, "loss": 0.0439, "step": 2664 }, { "epoch": 1.157186278766826, "grad_norm": 0.028864318504929543, "learning_rate": 0.00012294322132097335, "loss": 0.0362, "step": 2665 }, { "epoch": 1.1576204950065132, "grad_norm": 0.03668621554970741, "learning_rate": 0.00012291425260718426, "loss": 0.0365, "step": 2666 }, { "epoch": 1.1580547112462005, "grad_norm": 0.022353660315275192, "learning_rate": 0.00012288528389339513, "loss": 0.03, "step": 2667 }, { "epoch": 1.158488927485888, "grad_norm": 0.03179442882537842, "learning_rate": 0.00012285631517960604, "loss": 0.0403, "step": 2668 }, { "epoch": 1.1589231437255754, "grad_norm": 0.033862799406051636, "learning_rate": 0.00012282734646581694, "loss": 0.0379, "step": 2669 }, { "epoch": 1.1593573599652627, "grad_norm": 0.03265625238418579, "learning_rate": 0.00012279837775202782, "loss": 0.038, "step": 2670 }, { "epoch": 1.15979157620495, "grad_norm": 0.03217211738228798, "learning_rate": 0.0001227694090382387, "loss": 0.0405, "step": 2671 }, { "epoch": 1.1602257924446375, "grad_norm": 0.03351349011063576, "learning_rate": 0.0001227404403244496, "loss": 0.0439, "step": 2672 }, { "epoch": 1.1606600086843248, "grad_norm": 0.03201846405863762, "learning_rate": 0.0001227114716106605, "loss": 0.0431, "step": 2673 }, { "epoch": 1.1610942249240122, "grad_norm": 0.03153888136148453, "learning_rate": 0.00012268250289687138, "loss": 0.0356, "step": 2674 }, { "epoch": 1.1615284411636995, "grad_norm": 0.03875511512160301, "learning_rate": 0.00012265353418308228, "loss": 0.0447, "step": 2675 }, { "epoch": 1.1619626574033868, "grad_norm": 0.029014702886343002, "learning_rate": 0.00012262456546929319, "loss": 0.0376, "step": 2676 }, { "epoch": 1.1623968736430743, "grad_norm": 0.033226028084754944, "learning_rate": 0.00012259559675550406, "loss": 0.0376, "step": 2677 }, { "epoch": 1.1628310898827616, "grad_norm": 0.035794347524642944, "learning_rate": 0.00012256662804171494, "loss": 0.05, "step": 2678 }, { "epoch": 1.163265306122449, "grad_norm": 0.03097541816532612, "learning_rate": 0.00012253765932792584, "loss": 0.0383, "step": 2679 }, { "epoch": 1.1636995223621363, "grad_norm": 0.03218699246644974, "learning_rate": 0.00012250869061413675, "loss": 0.0383, "step": 2680 }, { "epoch": 1.1641337386018238, "grad_norm": 0.02953638881444931, "learning_rate": 0.00012247972190034762, "loss": 0.0336, "step": 2681 }, { "epoch": 1.164567954841511, "grad_norm": 0.031509146094322205, "learning_rate": 0.00012245075318655853, "loss": 0.042, "step": 2682 }, { "epoch": 1.1650021710811984, "grad_norm": 0.03206924721598625, "learning_rate": 0.00012242178447276943, "loss": 0.0355, "step": 2683 }, { "epoch": 1.1654363873208857, "grad_norm": 0.029733512550592422, "learning_rate": 0.0001223928157589803, "loss": 0.0374, "step": 2684 }, { "epoch": 1.1658706035605733, "grad_norm": 0.03374966233968735, "learning_rate": 0.00012236384704519118, "loss": 0.0387, "step": 2685 }, { "epoch": 1.1663048198002606, "grad_norm": 0.02810685709118843, "learning_rate": 0.0001223348783314021, "loss": 0.0346, "step": 2686 }, { "epoch": 1.1667390360399479, "grad_norm": 0.03380461409687996, "learning_rate": 0.000122305909617613, "loss": 0.0393, "step": 2687 }, { "epoch": 1.1671732522796352, "grad_norm": 0.02941637858748436, "learning_rate": 0.00012227694090382387, "loss": 0.0372, "step": 2688 }, { "epoch": 1.1676074685193227, "grad_norm": 0.0397912859916687, "learning_rate": 0.00012224797219003477, "loss": 0.0421, "step": 2689 }, { "epoch": 1.16804168475901, "grad_norm": 0.026806721463799477, "learning_rate": 0.00012221900347624568, "loss": 0.0353, "step": 2690 }, { "epoch": 1.1684759009986974, "grad_norm": 0.036484505981206894, "learning_rate": 0.00012219003476245655, "loss": 0.0418, "step": 2691 }, { "epoch": 1.1689101172383847, "grad_norm": 0.02912399359047413, "learning_rate": 0.00012216106604866743, "loss": 0.0348, "step": 2692 }, { "epoch": 1.169344333478072, "grad_norm": 0.031042441725730896, "learning_rate": 0.00012213209733487833, "loss": 0.0412, "step": 2693 }, { "epoch": 1.1697785497177595, "grad_norm": 0.0406530387699604, "learning_rate": 0.00012210312862108924, "loss": 0.0499, "step": 2694 }, { "epoch": 1.1702127659574468, "grad_norm": 0.029482407495379448, "learning_rate": 0.0001220741599073001, "loss": 0.0359, "step": 2695 }, { "epoch": 1.1706469821971341, "grad_norm": 0.04151599481701851, "learning_rate": 0.00012204519119351102, "loss": 0.0429, "step": 2696 }, { "epoch": 1.1710811984368215, "grad_norm": 0.026229115203022957, "learning_rate": 0.0001220162224797219, "loss": 0.0329, "step": 2697 }, { "epoch": 1.171515414676509, "grad_norm": 0.033560868352651596, "learning_rate": 0.00012198725376593281, "loss": 0.036, "step": 2698 }, { "epoch": 1.1719496309161963, "grad_norm": 0.03425081819295883, "learning_rate": 0.00012195828505214369, "loss": 0.0421, "step": 2699 }, { "epoch": 1.1723838471558836, "grad_norm": 0.031009351834654808, "learning_rate": 0.00012192931633835458, "loss": 0.0311, "step": 2700 }, { "epoch": 1.172818063395571, "grad_norm": 0.028872370719909668, "learning_rate": 0.00012190034762456548, "loss": 0.0358, "step": 2701 }, { "epoch": 1.1732522796352582, "grad_norm": 0.03306965529918671, "learning_rate": 0.00012187137891077636, "loss": 0.0374, "step": 2702 }, { "epoch": 1.1736864958749458, "grad_norm": 0.025960443541407585, "learning_rate": 0.00012184241019698726, "loss": 0.0336, "step": 2703 }, { "epoch": 1.174120712114633, "grad_norm": 0.0420551560819149, "learning_rate": 0.00012181344148319815, "loss": 0.0508, "step": 2704 }, { "epoch": 1.1745549283543204, "grad_norm": 0.03708711639046669, "learning_rate": 0.00012178447276940905, "loss": 0.047, "step": 2705 }, { "epoch": 1.174989144594008, "grad_norm": 0.030126245692372322, "learning_rate": 0.00012175550405561993, "loss": 0.0369, "step": 2706 }, { "epoch": 1.1754233608336953, "grad_norm": 0.03158334642648697, "learning_rate": 0.00012172653534183082, "loss": 0.0365, "step": 2707 }, { "epoch": 1.1758575770733826, "grad_norm": 0.029271027073264122, "learning_rate": 0.00012169756662804173, "loss": 0.0361, "step": 2708 }, { "epoch": 1.1762917933130699, "grad_norm": 0.026995837688446045, "learning_rate": 0.0001216685979142526, "loss": 0.0358, "step": 2709 }, { "epoch": 1.1767260095527572, "grad_norm": 0.03477263078093529, "learning_rate": 0.0001216396292004635, "loss": 0.0434, "step": 2710 }, { "epoch": 1.1771602257924447, "grad_norm": 0.03137827664613724, "learning_rate": 0.0001216106604866744, "loss": 0.0412, "step": 2711 }, { "epoch": 1.177594442032132, "grad_norm": 0.043022822588682175, "learning_rate": 0.0001215816917728853, "loss": 0.046, "step": 2712 }, { "epoch": 1.1780286582718194, "grad_norm": 0.034901995211839676, "learning_rate": 0.00012155272305909618, "loss": 0.041, "step": 2713 }, { "epoch": 1.1784628745115067, "grad_norm": 0.026185251772403717, "learning_rate": 0.00012152375434530707, "loss": 0.0309, "step": 2714 }, { "epoch": 1.1788970907511942, "grad_norm": 0.03407777100801468, "learning_rate": 0.00012149478563151797, "loss": 0.0367, "step": 2715 }, { "epoch": 1.1793313069908815, "grad_norm": 0.03825793042778969, "learning_rate": 0.00012146581691772887, "loss": 0.0476, "step": 2716 }, { "epoch": 1.1797655232305688, "grad_norm": 0.033323682844638824, "learning_rate": 0.00012143684820393975, "loss": 0.032, "step": 2717 }, { "epoch": 1.1801997394702561, "grad_norm": 0.03102484717965126, "learning_rate": 0.00012140787949015064, "loss": 0.0317, "step": 2718 }, { "epoch": 1.1806339557099435, "grad_norm": 0.02975616604089737, "learning_rate": 0.00012137891077636154, "loss": 0.0403, "step": 2719 }, { "epoch": 1.181068171949631, "grad_norm": 0.0373043566942215, "learning_rate": 0.00012134994206257242, "loss": 0.047, "step": 2720 }, { "epoch": 1.1815023881893183, "grad_norm": 0.025004668161273003, "learning_rate": 0.00012132097334878331, "loss": 0.0307, "step": 2721 }, { "epoch": 1.1819366044290056, "grad_norm": 0.03018679842352867, "learning_rate": 0.00012129200463499421, "loss": 0.036, "step": 2722 }, { "epoch": 1.182370820668693, "grad_norm": 0.038219403475522995, "learning_rate": 0.00012126303592120512, "loss": 0.042, "step": 2723 }, { "epoch": 1.1828050369083805, "grad_norm": 0.03357020765542984, "learning_rate": 0.000121234067207416, "loss": 0.0436, "step": 2724 }, { "epoch": 1.1832392531480678, "grad_norm": 0.03269839286804199, "learning_rate": 0.00012120509849362689, "loss": 0.0362, "step": 2725 }, { "epoch": 1.183673469387755, "grad_norm": 0.032603636384010315, "learning_rate": 0.00012117612977983779, "loss": 0.0366, "step": 2726 }, { "epoch": 1.1841076856274424, "grad_norm": 0.027393219992518425, "learning_rate": 0.00012114716106604867, "loss": 0.0323, "step": 2727 }, { "epoch": 1.1845419018671297, "grad_norm": 0.031068425625562668, "learning_rate": 0.00012111819235225957, "loss": 0.0347, "step": 2728 }, { "epoch": 1.1849761181068172, "grad_norm": 0.03924299404025078, "learning_rate": 0.00012108922363847046, "loss": 0.0398, "step": 2729 }, { "epoch": 1.1854103343465046, "grad_norm": 0.027679264545440674, "learning_rate": 0.00012106025492468136, "loss": 0.0369, "step": 2730 }, { "epoch": 1.1858445505861919, "grad_norm": 0.033195700496435165, "learning_rate": 0.00012103128621089224, "loss": 0.0361, "step": 2731 }, { "epoch": 1.1862787668258794, "grad_norm": 0.04086673632264137, "learning_rate": 0.00012100231749710313, "loss": 0.0493, "step": 2732 }, { "epoch": 1.1867129830655667, "grad_norm": 0.03231199085712433, "learning_rate": 0.00012097334878331403, "loss": 0.0384, "step": 2733 }, { "epoch": 1.187147199305254, "grad_norm": 0.03362030163407326, "learning_rate": 0.00012094438006952491, "loss": 0.0366, "step": 2734 }, { "epoch": 1.1875814155449413, "grad_norm": 0.03445395827293396, "learning_rate": 0.00012091541135573581, "loss": 0.035, "step": 2735 }, { "epoch": 1.1880156317846287, "grad_norm": 0.025187311694025993, "learning_rate": 0.0001208864426419467, "loss": 0.032, "step": 2736 }, { "epoch": 1.1884498480243162, "grad_norm": 0.04158493131399155, "learning_rate": 0.00012085747392815761, "loss": 0.0457, "step": 2737 }, { "epoch": 1.1888840642640035, "grad_norm": 0.043713320046663284, "learning_rate": 0.00012082850521436848, "loss": 0.047, "step": 2738 }, { "epoch": 1.1893182805036908, "grad_norm": 0.033965110778808594, "learning_rate": 0.00012079953650057937, "loss": 0.0384, "step": 2739 }, { "epoch": 1.1897524967433781, "grad_norm": 0.038044724613428116, "learning_rate": 0.00012077056778679028, "loss": 0.0362, "step": 2740 }, { "epoch": 1.1901867129830657, "grad_norm": 0.036382727324962616, "learning_rate": 0.00012074159907300116, "loss": 0.0392, "step": 2741 }, { "epoch": 1.190620929222753, "grad_norm": 0.03841260448098183, "learning_rate": 0.00012071263035921206, "loss": 0.0409, "step": 2742 }, { "epoch": 1.1910551454624403, "grad_norm": 0.030238701030611992, "learning_rate": 0.00012068366164542295, "loss": 0.0376, "step": 2743 }, { "epoch": 1.1914893617021276, "grad_norm": 0.029582306742668152, "learning_rate": 0.00012065469293163385, "loss": 0.0383, "step": 2744 }, { "epoch": 1.191923577941815, "grad_norm": 0.03072711080312729, "learning_rate": 0.00012062572421784473, "loss": 0.0398, "step": 2745 }, { "epoch": 1.1923577941815025, "grad_norm": 0.039950285106897354, "learning_rate": 0.00012059675550405562, "loss": 0.05, "step": 2746 }, { "epoch": 1.1927920104211898, "grad_norm": 0.02699861116707325, "learning_rate": 0.00012056778679026652, "loss": 0.0371, "step": 2747 }, { "epoch": 1.193226226660877, "grad_norm": 0.04063248634338379, "learning_rate": 0.0001205388180764774, "loss": 0.0426, "step": 2748 }, { "epoch": 1.1936604429005644, "grad_norm": 0.03440306335687637, "learning_rate": 0.0001205098493626883, "loss": 0.0453, "step": 2749 }, { "epoch": 1.194094659140252, "grad_norm": 0.030750583857297897, "learning_rate": 0.0001204808806488992, "loss": 0.0367, "step": 2750 }, { "epoch": 1.1945288753799392, "grad_norm": 0.03003905899822712, "learning_rate": 0.0001204519119351101, "loss": 0.0313, "step": 2751 }, { "epoch": 1.1949630916196265, "grad_norm": 0.02982192300260067, "learning_rate": 0.00012042294322132097, "loss": 0.0356, "step": 2752 }, { "epoch": 1.1953973078593139, "grad_norm": 0.03319333493709564, "learning_rate": 0.00012039397450753186, "loss": 0.0392, "step": 2753 }, { "epoch": 1.1958315240990014, "grad_norm": 0.029995594173669815, "learning_rate": 0.00012036500579374277, "loss": 0.0342, "step": 2754 }, { "epoch": 1.1962657403386887, "grad_norm": 0.027988415211439133, "learning_rate": 0.00012033603707995364, "loss": 0.0338, "step": 2755 }, { "epoch": 1.196699956578376, "grad_norm": 0.03457064926624298, "learning_rate": 0.00012030706836616455, "loss": 0.0394, "step": 2756 }, { "epoch": 1.1971341728180633, "grad_norm": 0.03244677931070328, "learning_rate": 0.00012027809965237544, "loss": 0.0359, "step": 2757 }, { "epoch": 1.1975683890577509, "grad_norm": 0.030775725841522217, "learning_rate": 0.00012024913093858634, "loss": 0.0371, "step": 2758 }, { "epoch": 1.1980026052974382, "grad_norm": 0.03543740510940552, "learning_rate": 0.00012022016222479722, "loss": 0.038, "step": 2759 }, { "epoch": 1.1984368215371255, "grad_norm": 0.029827335849404335, "learning_rate": 0.00012019119351100811, "loss": 0.0356, "step": 2760 }, { "epoch": 1.1988710377768128, "grad_norm": 0.02681722491979599, "learning_rate": 0.00012016222479721901, "loss": 0.0313, "step": 2761 }, { "epoch": 1.1993052540165001, "grad_norm": 0.042455412447452545, "learning_rate": 0.00012013325608342992, "loss": 0.052, "step": 2762 }, { "epoch": 1.1997394702561877, "grad_norm": 0.023997485637664795, "learning_rate": 0.00012010428736964079, "loss": 0.0289, "step": 2763 }, { "epoch": 1.200173686495875, "grad_norm": 0.03161256015300751, "learning_rate": 0.00012007531865585168, "loss": 0.0379, "step": 2764 }, { "epoch": 1.2006079027355623, "grad_norm": 0.027431510388851166, "learning_rate": 0.00012004634994206259, "loss": 0.0357, "step": 2765 }, { "epoch": 1.2010421189752496, "grad_norm": 0.036753252148628235, "learning_rate": 0.00012001738122827346, "loss": 0.037, "step": 2766 }, { "epoch": 1.2014763352149371, "grad_norm": 0.03916870057582855, "learning_rate": 0.00011998841251448437, "loss": 0.0466, "step": 2767 }, { "epoch": 1.2019105514546244, "grad_norm": 0.031121529638767242, "learning_rate": 0.00011995944380069526, "loss": 0.0346, "step": 2768 }, { "epoch": 1.2023447676943118, "grad_norm": 0.03603595867753029, "learning_rate": 0.00011993047508690616, "loss": 0.0436, "step": 2769 }, { "epoch": 1.202778983933999, "grad_norm": 0.029314348474144936, "learning_rate": 0.00011990150637311704, "loss": 0.0354, "step": 2770 }, { "epoch": 1.2032132001736864, "grad_norm": 0.03282938525080681, "learning_rate": 0.00011987253765932793, "loss": 0.038, "step": 2771 }, { "epoch": 1.203647416413374, "grad_norm": 0.02804744988679886, "learning_rate": 0.00011984356894553883, "loss": 0.033, "step": 2772 }, { "epoch": 1.2040816326530612, "grad_norm": 0.03376150131225586, "learning_rate": 0.00011981460023174971, "loss": 0.0434, "step": 2773 }, { "epoch": 1.2045158488927485, "grad_norm": 0.03400374576449394, "learning_rate": 0.00011978563151796061, "loss": 0.0447, "step": 2774 }, { "epoch": 1.204950065132436, "grad_norm": 0.04957886412739754, "learning_rate": 0.0001197566628041715, "loss": 0.054, "step": 2775 }, { "epoch": 1.2053842813721234, "grad_norm": 0.03741441294550896, "learning_rate": 0.0001197276940903824, "loss": 0.0447, "step": 2776 }, { "epoch": 1.2058184976118107, "grad_norm": 0.030830014497041702, "learning_rate": 0.00011969872537659328, "loss": 0.0401, "step": 2777 }, { "epoch": 1.206252713851498, "grad_norm": 0.03541937842965126, "learning_rate": 0.00011966975666280417, "loss": 0.0374, "step": 2778 }, { "epoch": 1.2066869300911853, "grad_norm": 0.03395817056298256, "learning_rate": 0.00011964078794901508, "loss": 0.0411, "step": 2779 }, { "epoch": 1.2071211463308729, "grad_norm": 0.030810073018074036, "learning_rate": 0.00011961181923522595, "loss": 0.0389, "step": 2780 }, { "epoch": 1.2075553625705602, "grad_norm": 0.03265755996108055, "learning_rate": 0.00011958285052143686, "loss": 0.0381, "step": 2781 }, { "epoch": 1.2079895788102475, "grad_norm": 0.033778052777051926, "learning_rate": 0.00011955388180764775, "loss": 0.038, "step": 2782 }, { "epoch": 1.2084237950499348, "grad_norm": 0.025912310928106308, "learning_rate": 0.00011952491309385865, "loss": 0.0301, "step": 2783 }, { "epoch": 1.2088580112896223, "grad_norm": 0.04260667786002159, "learning_rate": 0.00011949594438006953, "loss": 0.0537, "step": 2784 }, { "epoch": 1.2092922275293096, "grad_norm": 0.02852647379040718, "learning_rate": 0.00011946697566628042, "loss": 0.0316, "step": 2785 }, { "epoch": 1.209726443768997, "grad_norm": 0.04134998098015785, "learning_rate": 0.00011943800695249132, "loss": 0.0482, "step": 2786 }, { "epoch": 1.2101606600086843, "grad_norm": 0.03095860406756401, "learning_rate": 0.0001194090382387022, "loss": 0.038, "step": 2787 }, { "epoch": 1.2105948762483716, "grad_norm": 0.037704650312662125, "learning_rate": 0.0001193800695249131, "loss": 0.038, "step": 2788 }, { "epoch": 1.2110290924880591, "grad_norm": 0.028828635811805725, "learning_rate": 0.00011935110081112399, "loss": 0.0335, "step": 2789 }, { "epoch": 1.2114633087277464, "grad_norm": 0.031223729252815247, "learning_rate": 0.0001193221320973349, "loss": 0.038, "step": 2790 }, { "epoch": 1.2118975249674337, "grad_norm": 0.02799689583480358, "learning_rate": 0.00011929316338354577, "loss": 0.0296, "step": 2791 }, { "epoch": 1.212331741207121, "grad_norm": 0.026522569358348846, "learning_rate": 0.00011926419466975666, "loss": 0.0304, "step": 2792 }, { "epoch": 1.2127659574468086, "grad_norm": 0.034435104578733444, "learning_rate": 0.00011923522595596757, "loss": 0.0538, "step": 2793 }, { "epoch": 1.213200173686496, "grad_norm": 0.04640308767557144, "learning_rate": 0.00011920625724217844, "loss": 0.0499, "step": 2794 }, { "epoch": 1.2136343899261832, "grad_norm": 0.02818608470261097, "learning_rate": 0.00011917728852838935, "loss": 0.0324, "step": 2795 }, { "epoch": 1.2140686061658705, "grad_norm": 0.03657647967338562, "learning_rate": 0.00011914831981460024, "loss": 0.043, "step": 2796 }, { "epoch": 1.2145028224055578, "grad_norm": 0.030338814482092857, "learning_rate": 0.00011911935110081114, "loss": 0.0343, "step": 2797 }, { "epoch": 1.2149370386452454, "grad_norm": 0.04054642841219902, "learning_rate": 0.00011909038238702202, "loss": 0.0478, "step": 2798 }, { "epoch": 1.2153712548849327, "grad_norm": 0.03387877717614174, "learning_rate": 0.0001190614136732329, "loss": 0.0384, "step": 2799 }, { "epoch": 1.21580547112462, "grad_norm": 0.030276183038949966, "learning_rate": 0.00011903244495944381, "loss": 0.0375, "step": 2800 }, { "epoch": 1.2162396873643075, "grad_norm": 0.03658377006649971, "learning_rate": 0.00011900347624565469, "loss": 0.0445, "step": 2801 }, { "epoch": 1.2166739036039949, "grad_norm": 0.037355877459049225, "learning_rate": 0.00011897450753186559, "loss": 0.0441, "step": 2802 }, { "epoch": 1.2171081198436822, "grad_norm": 0.03981072083115578, "learning_rate": 0.00011894553881807648, "loss": 0.0467, "step": 2803 }, { "epoch": 1.2175423360833695, "grad_norm": 0.03614025190472603, "learning_rate": 0.00011891657010428738, "loss": 0.0375, "step": 2804 }, { "epoch": 1.2179765523230568, "grad_norm": 0.03509506210684776, "learning_rate": 0.00011888760139049826, "loss": 0.0432, "step": 2805 }, { "epoch": 1.2184107685627443, "grad_norm": 0.036707330495119095, "learning_rate": 0.00011885863267670917, "loss": 0.0452, "step": 2806 }, { "epoch": 1.2188449848024316, "grad_norm": 0.03362826630473137, "learning_rate": 0.00011882966396292006, "loss": 0.0394, "step": 2807 }, { "epoch": 1.219279201042119, "grad_norm": 0.05304175987839699, "learning_rate": 0.00011880069524913093, "loss": 0.0598, "step": 2808 }, { "epoch": 1.2197134172818063, "grad_norm": 0.02645508013665676, "learning_rate": 0.00011877172653534184, "loss": 0.0326, "step": 2809 }, { "epoch": 1.2201476335214938, "grad_norm": 0.03350101411342621, "learning_rate": 0.00011874275782155273, "loss": 0.0368, "step": 2810 }, { "epoch": 1.2205818497611811, "grad_norm": 0.03268517926335335, "learning_rate": 0.00011871378910776363, "loss": 0.035, "step": 2811 }, { "epoch": 1.2210160660008684, "grad_norm": 0.03565121442079544, "learning_rate": 0.0001186848203939745, "loss": 0.0372, "step": 2812 }, { "epoch": 1.2214502822405557, "grad_norm": 0.029030516743659973, "learning_rate": 0.00011865585168018541, "loss": 0.0348, "step": 2813 }, { "epoch": 1.221884498480243, "grad_norm": 0.026003815233707428, "learning_rate": 0.0001186268829663963, "loss": 0.0314, "step": 2814 }, { "epoch": 1.2223187147199306, "grad_norm": 0.0277452003210783, "learning_rate": 0.0001185979142526072, "loss": 0.0339, "step": 2815 }, { "epoch": 1.222752930959618, "grad_norm": 0.03227734938263893, "learning_rate": 0.00011856894553881808, "loss": 0.0424, "step": 2816 }, { "epoch": 1.2231871471993052, "grad_norm": 0.030812203884124756, "learning_rate": 0.00011853997682502897, "loss": 0.04, "step": 2817 }, { "epoch": 1.2236213634389925, "grad_norm": 0.02895817533135414, "learning_rate": 0.00011851100811123987, "loss": 0.0379, "step": 2818 }, { "epoch": 1.22405557967868, "grad_norm": 0.029404612258076668, "learning_rate": 0.00011848203939745075, "loss": 0.0379, "step": 2819 }, { "epoch": 1.2244897959183674, "grad_norm": 0.03184221312403679, "learning_rate": 0.00011845307068366165, "loss": 0.0368, "step": 2820 }, { "epoch": 1.2249240121580547, "grad_norm": 0.041890766471624374, "learning_rate": 0.00011842410196987254, "loss": 0.0368, "step": 2821 }, { "epoch": 1.225358228397742, "grad_norm": 0.03420192375779152, "learning_rate": 0.00011839513325608345, "loss": 0.0414, "step": 2822 }, { "epoch": 1.2257924446374295, "grad_norm": 0.031105518341064453, "learning_rate": 0.00011836616454229432, "loss": 0.0373, "step": 2823 }, { "epoch": 1.2262266608771168, "grad_norm": 0.029392139986157417, "learning_rate": 0.00011833719582850522, "loss": 0.034, "step": 2824 }, { "epoch": 1.2266608771168042, "grad_norm": 0.028221040964126587, "learning_rate": 0.00011830822711471612, "loss": 0.0324, "step": 2825 }, { "epoch": 1.2270950933564915, "grad_norm": 0.031487371772527695, "learning_rate": 0.000118279258400927, "loss": 0.0367, "step": 2826 }, { "epoch": 1.227529309596179, "grad_norm": 0.02891075797379017, "learning_rate": 0.0001182502896871379, "loss": 0.0325, "step": 2827 }, { "epoch": 1.2279635258358663, "grad_norm": 0.045036111027002335, "learning_rate": 0.00011822132097334879, "loss": 0.04, "step": 2828 }, { "epoch": 1.2283977420755536, "grad_norm": 0.02837185375392437, "learning_rate": 0.00011819235225955969, "loss": 0.0375, "step": 2829 }, { "epoch": 1.228831958315241, "grad_norm": 0.026207871735095978, "learning_rate": 0.00011816338354577057, "loss": 0.0302, "step": 2830 }, { "epoch": 1.2292661745549283, "grad_norm": 0.043009202927351, "learning_rate": 0.00011813441483198146, "loss": 0.0478, "step": 2831 }, { "epoch": 1.2297003907946158, "grad_norm": 0.03355031460523605, "learning_rate": 0.00011810544611819236, "loss": 0.036, "step": 2832 }, { "epoch": 1.230134607034303, "grad_norm": 0.03388876095414162, "learning_rate": 0.00011807647740440324, "loss": 0.0385, "step": 2833 }, { "epoch": 1.2305688232739904, "grad_norm": 0.029853632673621178, "learning_rate": 0.00011804750869061414, "loss": 0.0349, "step": 2834 }, { "epoch": 1.2310030395136777, "grad_norm": 0.03374839574098587, "learning_rate": 0.00011801853997682503, "loss": 0.0392, "step": 2835 }, { "epoch": 1.2314372557533653, "grad_norm": 0.03743988275527954, "learning_rate": 0.00011798957126303594, "loss": 0.0397, "step": 2836 }, { "epoch": 1.2318714719930526, "grad_norm": 0.03144682198762894, "learning_rate": 0.00011796060254924681, "loss": 0.0387, "step": 2837 }, { "epoch": 1.23230568823274, "grad_norm": 0.03257567062973976, "learning_rate": 0.0001179316338354577, "loss": 0.0342, "step": 2838 }, { "epoch": 1.2327399044724272, "grad_norm": 0.028149418532848358, "learning_rate": 0.00011790266512166861, "loss": 0.0341, "step": 2839 }, { "epoch": 1.2331741207121145, "grad_norm": 0.03602912649512291, "learning_rate": 0.00011787369640787948, "loss": 0.0439, "step": 2840 }, { "epoch": 1.233608336951802, "grad_norm": 0.028395509347319603, "learning_rate": 0.00011784472769409039, "loss": 0.0303, "step": 2841 }, { "epoch": 1.2340425531914894, "grad_norm": 0.037411823868751526, "learning_rate": 0.00011781575898030128, "loss": 0.0419, "step": 2842 }, { "epoch": 1.2344767694311767, "grad_norm": 0.04414879158139229, "learning_rate": 0.00011778679026651218, "loss": 0.048, "step": 2843 }, { "epoch": 1.2349109856708642, "grad_norm": 0.02796093560755253, "learning_rate": 0.00011775782155272306, "loss": 0.0324, "step": 2844 }, { "epoch": 1.2353452019105515, "grad_norm": 0.041194695979356766, "learning_rate": 0.00011772885283893396, "loss": 0.0456, "step": 2845 }, { "epoch": 1.2357794181502388, "grad_norm": 0.03559482470154762, "learning_rate": 0.00011769988412514485, "loss": 0.038, "step": 2846 }, { "epoch": 1.2362136343899262, "grad_norm": 0.034583043307065964, "learning_rate": 0.00011767091541135573, "loss": 0.0387, "step": 2847 }, { "epoch": 1.2366478506296135, "grad_norm": 0.0393785797059536, "learning_rate": 0.00011764194669756663, "loss": 0.0381, "step": 2848 }, { "epoch": 1.237082066869301, "grad_norm": 0.037958625704050064, "learning_rate": 0.00011761297798377752, "loss": 0.0419, "step": 2849 }, { "epoch": 1.2375162831089883, "grad_norm": 0.0314825214445591, "learning_rate": 0.00011758400926998843, "loss": 0.0369, "step": 2850 }, { "epoch": 1.2379504993486756, "grad_norm": 0.02562011033296585, "learning_rate": 0.0001175550405561993, "loss": 0.0321, "step": 2851 }, { "epoch": 1.238384715588363, "grad_norm": 0.038788292557001114, "learning_rate": 0.00011752607184241021, "loss": 0.0446, "step": 2852 }, { "epoch": 1.2388189318280505, "grad_norm": 0.03786344826221466, "learning_rate": 0.0001174971031286211, "loss": 0.0466, "step": 2853 }, { "epoch": 1.2392531480677378, "grad_norm": 0.04060401767492294, "learning_rate": 0.00011746813441483197, "loss": 0.0499, "step": 2854 }, { "epoch": 1.239687364307425, "grad_norm": 0.02499685250222683, "learning_rate": 0.00011743916570104288, "loss": 0.0287, "step": 2855 }, { "epoch": 1.2401215805471124, "grad_norm": 0.035491228103637695, "learning_rate": 0.00011741019698725377, "loss": 0.0452, "step": 2856 }, { "epoch": 1.2405557967867997, "grad_norm": 0.032318007200956345, "learning_rate": 0.00011738122827346467, "loss": 0.0343, "step": 2857 }, { "epoch": 1.2409900130264873, "grad_norm": 0.03384597599506378, "learning_rate": 0.00011735225955967555, "loss": 0.0396, "step": 2858 }, { "epoch": 1.2414242292661746, "grad_norm": 0.041230153292417526, "learning_rate": 0.00011732329084588645, "loss": 0.0473, "step": 2859 }, { "epoch": 1.2418584455058619, "grad_norm": 0.028347861021757126, "learning_rate": 0.00011729432213209734, "loss": 0.0323, "step": 2860 }, { "epoch": 1.2422926617455492, "grad_norm": 0.037681758403778076, "learning_rate": 0.00011726535341830825, "loss": 0.0423, "step": 2861 }, { "epoch": 1.2427268779852367, "grad_norm": 0.031744927167892456, "learning_rate": 0.00011723638470451912, "loss": 0.0387, "step": 2862 }, { "epoch": 1.243161094224924, "grad_norm": 0.0437411330640316, "learning_rate": 0.00011720741599073001, "loss": 0.0468, "step": 2863 }, { "epoch": 1.2435953104646114, "grad_norm": 0.03358388692140579, "learning_rate": 0.00011717844727694092, "loss": 0.0387, "step": 2864 }, { "epoch": 1.2440295267042987, "grad_norm": 0.037192452698946, "learning_rate": 0.0001171494785631518, "loss": 0.0399, "step": 2865 }, { "epoch": 1.244463742943986, "grad_norm": 0.029540754854679108, "learning_rate": 0.0001171205098493627, "loss": 0.037, "step": 2866 }, { "epoch": 1.2448979591836735, "grad_norm": 0.039404384791851044, "learning_rate": 0.00011709154113557359, "loss": 0.0433, "step": 2867 }, { "epoch": 1.2453321754233608, "grad_norm": 0.03513019531965256, "learning_rate": 0.00011706257242178449, "loss": 0.0465, "step": 2868 }, { "epoch": 1.2457663916630481, "grad_norm": 0.03809894993901253, "learning_rate": 0.00011703360370799537, "loss": 0.0447, "step": 2869 }, { "epoch": 1.2462006079027357, "grad_norm": 0.031927578151226044, "learning_rate": 0.00011700463499420626, "loss": 0.0395, "step": 2870 }, { "epoch": 1.246634824142423, "grad_norm": 0.032195281237363815, "learning_rate": 0.00011697566628041716, "loss": 0.0403, "step": 2871 }, { "epoch": 1.2470690403821103, "grad_norm": 0.035015542060136795, "learning_rate": 0.00011694669756662804, "loss": 0.0422, "step": 2872 }, { "epoch": 1.2475032566217976, "grad_norm": 0.030612224712967873, "learning_rate": 0.00011691772885283894, "loss": 0.0358, "step": 2873 }, { "epoch": 1.247937472861485, "grad_norm": 0.03541300445795059, "learning_rate": 0.00011688876013904983, "loss": 0.0401, "step": 2874 }, { "epoch": 1.2483716891011725, "grad_norm": 0.02686985954642296, "learning_rate": 0.00011685979142526074, "loss": 0.0319, "step": 2875 }, { "epoch": 1.2488059053408598, "grad_norm": 0.03421938791871071, "learning_rate": 0.00011683082271147161, "loss": 0.0414, "step": 2876 }, { "epoch": 1.249240121580547, "grad_norm": 0.02932366356253624, "learning_rate": 0.0001168018539976825, "loss": 0.0369, "step": 2877 }, { "epoch": 1.2496743378202344, "grad_norm": 0.03921453282237053, "learning_rate": 0.0001167728852838934, "loss": 0.0517, "step": 2878 }, { "epoch": 1.250108554059922, "grad_norm": 0.03850848227739334, "learning_rate": 0.00011674391657010428, "loss": 0.0489, "step": 2879 }, { "epoch": 1.2505427702996093, "grad_norm": 0.028171556070446968, "learning_rate": 0.00011671494785631519, "loss": 0.0334, "step": 2880 }, { "epoch": 1.2509769865392966, "grad_norm": 0.034020841121673584, "learning_rate": 0.00011668597914252608, "loss": 0.0397, "step": 2881 }, { "epoch": 1.2514112027789839, "grad_norm": 0.028083082288503647, "learning_rate": 0.00011665701042873698, "loss": 0.0335, "step": 2882 }, { "epoch": 1.2518454190186712, "grad_norm": 0.037423938512802124, "learning_rate": 0.00011662804171494786, "loss": 0.0529, "step": 2883 }, { "epoch": 1.2522796352583587, "grad_norm": 0.031783442944288254, "learning_rate": 0.00011659907300115876, "loss": 0.0419, "step": 2884 }, { "epoch": 1.252713851498046, "grad_norm": 0.03374040126800537, "learning_rate": 0.00011657010428736965, "loss": 0.0401, "step": 2885 }, { "epoch": 1.2531480677377334, "grad_norm": 0.03862965479493141, "learning_rate": 0.00011654113557358053, "loss": 0.0442, "step": 2886 }, { "epoch": 1.2535822839774209, "grad_norm": 0.027244577184319496, "learning_rate": 0.00011651216685979143, "loss": 0.0309, "step": 2887 }, { "epoch": 1.2540165002171082, "grad_norm": 0.028408952057361603, "learning_rate": 0.00011648319814600232, "loss": 0.0327, "step": 2888 }, { "epoch": 1.2544507164567955, "grad_norm": 0.029843313619494438, "learning_rate": 0.00011645422943221322, "loss": 0.038, "step": 2889 }, { "epoch": 1.2548849326964828, "grad_norm": 0.0267395731061697, "learning_rate": 0.0001164252607184241, "loss": 0.0304, "step": 2890 }, { "epoch": 1.2553191489361701, "grad_norm": 0.0350620411336422, "learning_rate": 0.000116396292004635, "loss": 0.0418, "step": 2891 }, { "epoch": 1.2557533651758574, "grad_norm": 0.03014020249247551, "learning_rate": 0.0001163673232908459, "loss": 0.0349, "step": 2892 }, { "epoch": 1.256187581415545, "grad_norm": 0.03323042392730713, "learning_rate": 0.00011633835457705677, "loss": 0.0403, "step": 2893 }, { "epoch": 1.2566217976552323, "grad_norm": 0.04099719598889351, "learning_rate": 0.00011630938586326768, "loss": 0.0471, "step": 2894 }, { "epoch": 1.2570560138949196, "grad_norm": 0.03533007949590683, "learning_rate": 0.00011628041714947857, "loss": 0.0422, "step": 2895 }, { "epoch": 1.2574902301346071, "grad_norm": 0.033784665167331696, "learning_rate": 0.00011625144843568947, "loss": 0.0346, "step": 2896 }, { "epoch": 1.2579244463742945, "grad_norm": 0.027371341362595558, "learning_rate": 0.00011622247972190035, "loss": 0.0363, "step": 2897 }, { "epoch": 1.2583586626139818, "grad_norm": 0.033461254090070724, "learning_rate": 0.00011619351100811125, "loss": 0.0393, "step": 2898 }, { "epoch": 1.258792878853669, "grad_norm": 0.03496759757399559, "learning_rate": 0.00011616454229432214, "loss": 0.0373, "step": 2899 }, { "epoch": 1.2592270950933564, "grad_norm": 0.024425966665148735, "learning_rate": 0.00011613557358053302, "loss": 0.0291, "step": 2900 }, { "epoch": 1.259661311333044, "grad_norm": 0.030943183228373528, "learning_rate": 0.00011610660486674392, "loss": 0.0381, "step": 2901 }, { "epoch": 1.2600955275727312, "grad_norm": 0.028792420402169228, "learning_rate": 0.00011607763615295481, "loss": 0.0315, "step": 2902 }, { "epoch": 1.2605297438124186, "grad_norm": 0.036893878132104874, "learning_rate": 0.00011604866743916571, "loss": 0.048, "step": 2903 }, { "epoch": 1.2609639600521059, "grad_norm": 0.033690936863422394, "learning_rate": 0.00011601969872537659, "loss": 0.0414, "step": 2904 }, { "epoch": 1.2613981762917934, "grad_norm": 0.040983524173498154, "learning_rate": 0.0001159907300115875, "loss": 0.0402, "step": 2905 }, { "epoch": 1.2618323925314807, "grad_norm": 0.03675007447600365, "learning_rate": 0.00011596176129779838, "loss": 0.0425, "step": 2906 }, { "epoch": 1.262266608771168, "grad_norm": 0.029274579137563705, "learning_rate": 0.00011593279258400929, "loss": 0.0329, "step": 2907 }, { "epoch": 1.2627008250108553, "grad_norm": 0.032087601721286774, "learning_rate": 0.00011590382387022017, "loss": 0.0399, "step": 2908 }, { "epoch": 1.2631350412505427, "grad_norm": 0.03817923739552498, "learning_rate": 0.00011587485515643106, "loss": 0.0437, "step": 2909 }, { "epoch": 1.2635692574902302, "grad_norm": 0.03662949055433273, "learning_rate": 0.00011584588644264196, "loss": 0.042, "step": 2910 }, { "epoch": 1.2640034737299175, "grad_norm": 0.032586149871349335, "learning_rate": 0.00011581691772885284, "loss": 0.0372, "step": 2911 }, { "epoch": 1.2644376899696048, "grad_norm": 0.03348453715443611, "learning_rate": 0.00011578794901506374, "loss": 0.0356, "step": 2912 }, { "epoch": 1.2648719062092924, "grad_norm": 0.035836439579725266, "learning_rate": 0.00011575898030127463, "loss": 0.0397, "step": 2913 }, { "epoch": 1.2653061224489797, "grad_norm": 0.02993515506386757, "learning_rate": 0.00011573001158748553, "loss": 0.0379, "step": 2914 }, { "epoch": 1.265740338688667, "grad_norm": 0.029743948951363564, "learning_rate": 0.00011570104287369641, "loss": 0.0354, "step": 2915 }, { "epoch": 1.2661745549283543, "grad_norm": 0.03358416259288788, "learning_rate": 0.0001156720741599073, "loss": 0.0396, "step": 2916 }, { "epoch": 1.2666087711680416, "grad_norm": 0.043118976056575775, "learning_rate": 0.0001156431054461182, "loss": 0.0503, "step": 2917 }, { "epoch": 1.267042987407729, "grad_norm": 0.039799708873033524, "learning_rate": 0.00011561413673232908, "loss": 0.0507, "step": 2918 }, { "epoch": 1.2674772036474165, "grad_norm": 0.03337894752621651, "learning_rate": 0.00011558516801853998, "loss": 0.0413, "step": 2919 }, { "epoch": 1.2679114198871038, "grad_norm": 0.027650458738207817, "learning_rate": 0.00011555619930475087, "loss": 0.0341, "step": 2920 }, { "epoch": 1.268345636126791, "grad_norm": 0.0431731678545475, "learning_rate": 0.00011552723059096178, "loss": 0.0487, "step": 2921 }, { "epoch": 1.2687798523664786, "grad_norm": 0.03211009502410889, "learning_rate": 0.00011549826187717265, "loss": 0.0346, "step": 2922 }, { "epoch": 1.269214068606166, "grad_norm": 0.031437668949365616, "learning_rate": 0.00011546929316338356, "loss": 0.0376, "step": 2923 }, { "epoch": 1.2696482848458532, "grad_norm": 0.027832185849547386, "learning_rate": 0.00011544032444959445, "loss": 0.0354, "step": 2924 }, { "epoch": 1.2700825010855405, "grad_norm": 0.027526630088686943, "learning_rate": 0.00011541135573580533, "loss": 0.0342, "step": 2925 }, { "epoch": 1.2705167173252279, "grad_norm": 0.029162615537643433, "learning_rate": 0.00011538238702201623, "loss": 0.0398, "step": 2926 }, { "epoch": 1.2709509335649154, "grad_norm": 0.023466680198907852, "learning_rate": 0.00011535341830822712, "loss": 0.0288, "step": 2927 }, { "epoch": 1.2713851498046027, "grad_norm": 0.03391445800662041, "learning_rate": 0.00011532444959443802, "loss": 0.0396, "step": 2928 }, { "epoch": 1.27181936604429, "grad_norm": 0.03450887277722359, "learning_rate": 0.0001152954808806489, "loss": 0.0448, "step": 2929 }, { "epoch": 1.2722535822839776, "grad_norm": 0.0341305285692215, "learning_rate": 0.0001152665121668598, "loss": 0.0418, "step": 2930 }, { "epoch": 1.2726877985236649, "grad_norm": 0.03157223388552666, "learning_rate": 0.0001152375434530707, "loss": 0.0341, "step": 2931 }, { "epoch": 1.2731220147633522, "grad_norm": 0.027836302295327187, "learning_rate": 0.00011520857473928157, "loss": 0.0318, "step": 2932 }, { "epoch": 1.2735562310030395, "grad_norm": 0.03434112295508385, "learning_rate": 0.00011517960602549247, "loss": 0.0404, "step": 2933 }, { "epoch": 1.2739904472427268, "grad_norm": 0.05631440877914429, "learning_rate": 0.00011515063731170336, "loss": 0.066, "step": 2934 }, { "epoch": 1.2744246634824141, "grad_norm": 0.03695321083068848, "learning_rate": 0.00011512166859791427, "loss": 0.0452, "step": 2935 }, { "epoch": 1.2748588797221017, "grad_norm": 0.03717338666319847, "learning_rate": 0.00011509269988412514, "loss": 0.0408, "step": 2936 }, { "epoch": 1.275293095961789, "grad_norm": 0.03316926956176758, "learning_rate": 0.00011506373117033605, "loss": 0.0412, "step": 2937 }, { "epoch": 1.2757273122014763, "grad_norm": 0.02852916531264782, "learning_rate": 0.00011503476245654694, "loss": 0.0366, "step": 2938 }, { "epoch": 1.2761615284411638, "grad_norm": 0.033972062170505524, "learning_rate": 0.00011500579374275781, "loss": 0.043, "step": 2939 }, { "epoch": 1.2765957446808511, "grad_norm": 0.03384673222899437, "learning_rate": 0.00011497682502896872, "loss": 0.0359, "step": 2940 }, { "epoch": 1.2770299609205384, "grad_norm": 0.030915044248104095, "learning_rate": 0.00011494785631517961, "loss": 0.0348, "step": 2941 }, { "epoch": 1.2774641771602258, "grad_norm": 0.02672613598406315, "learning_rate": 0.00011491888760139051, "loss": 0.0372, "step": 2942 }, { "epoch": 1.277898393399913, "grad_norm": 0.03534408286213875, "learning_rate": 0.00011488991888760139, "loss": 0.0365, "step": 2943 }, { "epoch": 1.2783326096396006, "grad_norm": 0.030718421563506126, "learning_rate": 0.00011486095017381229, "loss": 0.0354, "step": 2944 }, { "epoch": 1.278766825879288, "grad_norm": 0.032323844730854034, "learning_rate": 0.00011483198146002318, "loss": 0.0348, "step": 2945 }, { "epoch": 1.2792010421189752, "grad_norm": 0.029317179694771767, "learning_rate": 0.00011480301274623406, "loss": 0.0372, "step": 2946 }, { "epoch": 1.2796352583586625, "grad_norm": 0.029027655720710754, "learning_rate": 0.00011477404403244496, "loss": 0.0345, "step": 2947 }, { "epoch": 1.28006947459835, "grad_norm": 0.03137189894914627, "learning_rate": 0.00011474507531865585, "loss": 0.0367, "step": 2948 }, { "epoch": 1.2805036908380374, "grad_norm": 0.03229035437107086, "learning_rate": 0.00011471610660486676, "loss": 0.0366, "step": 2949 }, { "epoch": 1.2809379070777247, "grad_norm": 0.03556684032082558, "learning_rate": 0.00011468713789107763, "loss": 0.0379, "step": 2950 }, { "epoch": 1.281372123317412, "grad_norm": 0.040012527257204056, "learning_rate": 0.00011465816917728854, "loss": 0.0429, "step": 2951 }, { "epoch": 1.2818063395570993, "grad_norm": 0.030817244201898575, "learning_rate": 0.00011462920046349943, "loss": 0.0339, "step": 2952 }, { "epoch": 1.2822405557967869, "grad_norm": 0.029182320460677147, "learning_rate": 0.0001146002317497103, "loss": 0.0299, "step": 2953 }, { "epoch": 1.2826747720364742, "grad_norm": 0.023354630917310715, "learning_rate": 0.00011457126303592121, "loss": 0.0271, "step": 2954 }, { "epoch": 1.2831089882761615, "grad_norm": 0.03758954256772995, "learning_rate": 0.0001145422943221321, "loss": 0.0433, "step": 2955 }, { "epoch": 1.283543204515849, "grad_norm": 0.032869044691324234, "learning_rate": 0.000114513325608343, "loss": 0.0379, "step": 2956 }, { "epoch": 1.2839774207555363, "grad_norm": 0.032218754291534424, "learning_rate": 0.00011448435689455388, "loss": 0.0389, "step": 2957 }, { "epoch": 1.2844116369952236, "grad_norm": 0.028771517798304558, "learning_rate": 0.00011445538818076478, "loss": 0.0354, "step": 2958 }, { "epoch": 1.284845853234911, "grad_norm": 0.034060582518577576, "learning_rate": 0.00011442641946697567, "loss": 0.046, "step": 2959 }, { "epoch": 1.2852800694745983, "grad_norm": 0.040199633687734604, "learning_rate": 0.00011439745075318658, "loss": 0.0495, "step": 2960 }, { "epoch": 1.2857142857142856, "grad_norm": 0.040659770369529724, "learning_rate": 0.00011436848203939745, "loss": 0.039, "step": 2961 }, { "epoch": 1.2861485019539731, "grad_norm": 0.03732852637767792, "learning_rate": 0.00011433951332560836, "loss": 0.0396, "step": 2962 }, { "epoch": 1.2865827181936604, "grad_norm": 0.0270701814442873, "learning_rate": 0.00011431054461181925, "loss": 0.0325, "step": 2963 }, { "epoch": 1.2870169344333477, "grad_norm": 0.030206210911273956, "learning_rate": 0.00011428157589803012, "loss": 0.0435, "step": 2964 }, { "epoch": 1.2874511506730353, "grad_norm": 0.03181450814008713, "learning_rate": 0.00011425260718424103, "loss": 0.0353, "step": 2965 }, { "epoch": 1.2878853669127226, "grad_norm": 0.026011602953076363, "learning_rate": 0.00011422363847045192, "loss": 0.0323, "step": 2966 }, { "epoch": 1.28831958315241, "grad_norm": 0.030911238864064217, "learning_rate": 0.00011419466975666282, "loss": 0.0389, "step": 2967 }, { "epoch": 1.2887537993920972, "grad_norm": 0.030703088268637657, "learning_rate": 0.0001141657010428737, "loss": 0.0411, "step": 2968 }, { "epoch": 1.2891880156317845, "grad_norm": 0.03004937805235386, "learning_rate": 0.0001141367323290846, "loss": 0.0395, "step": 2969 }, { "epoch": 1.289622231871472, "grad_norm": 0.025101378560066223, "learning_rate": 0.00011410776361529549, "loss": 0.0275, "step": 2970 }, { "epoch": 1.2900564481111594, "grad_norm": 0.024378174915909767, "learning_rate": 0.00011407879490150637, "loss": 0.0284, "step": 2971 }, { "epoch": 1.2904906643508467, "grad_norm": 0.025472097098827362, "learning_rate": 0.00011404982618771727, "loss": 0.0292, "step": 2972 }, { "epoch": 1.290924880590534, "grad_norm": 0.039696238934993744, "learning_rate": 0.00011402085747392816, "loss": 0.041, "step": 2973 }, { "epoch": 1.2913590968302215, "grad_norm": 0.03688874840736389, "learning_rate": 0.00011399188876013907, "loss": 0.0499, "step": 2974 }, { "epoch": 1.2917933130699089, "grad_norm": 0.027188600972294807, "learning_rate": 0.00011396292004634994, "loss": 0.035, "step": 2975 }, { "epoch": 1.2922275293095962, "grad_norm": 0.034622497856616974, "learning_rate": 0.00011393395133256085, "loss": 0.0409, "step": 2976 }, { "epoch": 1.2926617455492835, "grad_norm": 0.029033344238996506, "learning_rate": 0.00011390498261877174, "loss": 0.0359, "step": 2977 }, { "epoch": 1.2930959617889708, "grad_norm": 0.030993996188044548, "learning_rate": 0.00011387601390498261, "loss": 0.0394, "step": 2978 }, { "epoch": 1.2935301780286583, "grad_norm": 0.03255714848637581, "learning_rate": 0.00011384704519119352, "loss": 0.04, "step": 2979 }, { "epoch": 1.2939643942683456, "grad_norm": 0.024630818516016006, "learning_rate": 0.0001138180764774044, "loss": 0.0291, "step": 2980 }, { "epoch": 1.294398610508033, "grad_norm": 0.02463364042341709, "learning_rate": 0.00011378910776361531, "loss": 0.028, "step": 2981 }, { "epoch": 1.2948328267477205, "grad_norm": 0.02617952600121498, "learning_rate": 0.00011376013904982619, "loss": 0.0298, "step": 2982 }, { "epoch": 1.2952670429874078, "grad_norm": 0.03235043212771416, "learning_rate": 0.00011373117033603709, "loss": 0.0388, "step": 2983 }, { "epoch": 1.2957012592270951, "grad_norm": 0.03079437091946602, "learning_rate": 0.00011370220162224798, "loss": 0.0423, "step": 2984 }, { "epoch": 1.2961354754667824, "grad_norm": 0.030089175328612328, "learning_rate": 0.00011367323290845886, "loss": 0.0363, "step": 2985 }, { "epoch": 1.2965696917064697, "grad_norm": 0.03382711112499237, "learning_rate": 0.00011364426419466976, "loss": 0.0433, "step": 2986 }, { "epoch": 1.297003907946157, "grad_norm": 0.028670433908700943, "learning_rate": 0.00011361529548088065, "loss": 0.0372, "step": 2987 }, { "epoch": 1.2974381241858446, "grad_norm": 0.03714992478489876, "learning_rate": 0.00011358632676709155, "loss": 0.0474, "step": 2988 }, { "epoch": 1.297872340425532, "grad_norm": 0.045167628675699234, "learning_rate": 0.00011355735805330243, "loss": 0.0529, "step": 2989 }, { "epoch": 1.2983065566652192, "grad_norm": 0.030785735696554184, "learning_rate": 0.00011352838933951334, "loss": 0.0388, "step": 2990 }, { "epoch": 1.2987407729049067, "grad_norm": 0.0309563260525465, "learning_rate": 0.00011349942062572423, "loss": 0.0343, "step": 2991 }, { "epoch": 1.299174989144594, "grad_norm": 0.02826661989092827, "learning_rate": 0.0001134704519119351, "loss": 0.0369, "step": 2992 }, { "epoch": 1.2996092053842814, "grad_norm": 0.032528847455978394, "learning_rate": 0.000113441483198146, "loss": 0.0386, "step": 2993 }, { "epoch": 1.3000434216239687, "grad_norm": 0.0283330250531435, "learning_rate": 0.0001134125144843569, "loss": 0.035, "step": 2994 }, { "epoch": 1.300477637863656, "grad_norm": 0.04311269894242287, "learning_rate": 0.0001133835457705678, "loss": 0.055, "step": 2995 }, { "epoch": 1.3009118541033435, "grad_norm": 0.0336747020483017, "learning_rate": 0.00011335457705677868, "loss": 0.0407, "step": 2996 }, { "epoch": 1.3013460703430308, "grad_norm": 0.03563176095485687, "learning_rate": 0.00011332560834298958, "loss": 0.0397, "step": 2997 }, { "epoch": 1.3017802865827182, "grad_norm": 0.03596068173646927, "learning_rate": 0.00011329663962920047, "loss": 0.0435, "step": 2998 }, { "epoch": 1.3022145028224057, "grad_norm": 0.03349783271551132, "learning_rate": 0.00011326767091541135, "loss": 0.0398, "step": 2999 }, { "epoch": 1.302648719062093, "grad_norm": 0.0386887826025486, "learning_rate": 0.00011323870220162225, "loss": 0.0441, "step": 3000 }, { "epoch": 1.3030829353017803, "grad_norm": 0.030005143955349922, "learning_rate": 0.00011320973348783315, "loss": 0.0366, "step": 3001 }, { "epoch": 1.3035171515414676, "grad_norm": 0.04301001504063606, "learning_rate": 0.00011318076477404404, "loss": 0.0555, "step": 3002 }, { "epoch": 1.303951367781155, "grad_norm": 0.034578826278448105, "learning_rate": 0.00011315179606025492, "loss": 0.0421, "step": 3003 }, { "epoch": 1.3043855840208423, "grad_norm": 0.032523225992918015, "learning_rate": 0.00011312282734646582, "loss": 0.0419, "step": 3004 }, { "epoch": 1.3048198002605298, "grad_norm": 0.03336628898978233, "learning_rate": 0.00011309385863267671, "loss": 0.0373, "step": 3005 }, { "epoch": 1.305254016500217, "grad_norm": 0.03262297436594963, "learning_rate": 0.00011306488991888762, "loss": 0.0326, "step": 3006 }, { "epoch": 1.3056882327399044, "grad_norm": 0.03868933022022247, "learning_rate": 0.0001130359212050985, "loss": 0.0477, "step": 3007 }, { "epoch": 1.306122448979592, "grad_norm": 0.03538293391466141, "learning_rate": 0.0001130069524913094, "loss": 0.0438, "step": 3008 }, { "epoch": 1.3065566652192793, "grad_norm": 0.030219588428735733, "learning_rate": 0.00011297798377752029, "loss": 0.0343, "step": 3009 }, { "epoch": 1.3069908814589666, "grad_norm": 0.03111080639064312, "learning_rate": 0.00011294901506373117, "loss": 0.0346, "step": 3010 }, { "epoch": 1.307425097698654, "grad_norm": 0.02388201653957367, "learning_rate": 0.00011292004634994207, "loss": 0.031, "step": 3011 }, { "epoch": 1.3078593139383412, "grad_norm": 0.034250207245349884, "learning_rate": 0.00011289107763615296, "loss": 0.0372, "step": 3012 }, { "epoch": 1.3082935301780287, "grad_norm": 0.027203010395169258, "learning_rate": 0.00011286210892236386, "loss": 0.0303, "step": 3013 }, { "epoch": 1.308727746417716, "grad_norm": 0.039065293967723846, "learning_rate": 0.00011283314020857474, "loss": 0.0486, "step": 3014 }, { "epoch": 1.3091619626574034, "grad_norm": 0.029481321573257446, "learning_rate": 0.00011280417149478564, "loss": 0.0389, "step": 3015 }, { "epoch": 1.3095961788970907, "grad_norm": 0.04386517405509949, "learning_rate": 0.00011277520278099653, "loss": 0.0533, "step": 3016 }, { "epoch": 1.3100303951367782, "grad_norm": 0.02907857857644558, "learning_rate": 0.00011274623406720741, "loss": 0.0347, "step": 3017 }, { "epoch": 1.3104646113764655, "grad_norm": 0.03410186618566513, "learning_rate": 0.00011271726535341831, "loss": 0.0398, "step": 3018 }, { "epoch": 1.3108988276161528, "grad_norm": 0.0305402260273695, "learning_rate": 0.0001126882966396292, "loss": 0.0359, "step": 3019 }, { "epoch": 1.3113330438558402, "grad_norm": 0.033788662403821945, "learning_rate": 0.00011265932792584011, "loss": 0.0388, "step": 3020 }, { "epoch": 1.3117672600955275, "grad_norm": 0.03568699583411217, "learning_rate": 0.00011263035921205098, "loss": 0.0372, "step": 3021 }, { "epoch": 1.312201476335215, "grad_norm": 0.04128998890519142, "learning_rate": 0.00011260139049826189, "loss": 0.0402, "step": 3022 }, { "epoch": 1.3126356925749023, "grad_norm": 0.03034403920173645, "learning_rate": 0.00011257242178447278, "loss": 0.0329, "step": 3023 }, { "epoch": 1.3130699088145896, "grad_norm": 0.04070289433002472, "learning_rate": 0.00011254345307068365, "loss": 0.0483, "step": 3024 }, { "epoch": 1.3135041250542772, "grad_norm": 0.032228194177150726, "learning_rate": 0.00011251448435689456, "loss": 0.0382, "step": 3025 }, { "epoch": 1.3139383412939645, "grad_norm": 0.03665580600500107, "learning_rate": 0.00011248551564310545, "loss": 0.0379, "step": 3026 }, { "epoch": 1.3143725575336518, "grad_norm": 0.029792679473757744, "learning_rate": 0.00011245654692931635, "loss": 0.0331, "step": 3027 }, { "epoch": 1.314806773773339, "grad_norm": 0.03698191046714783, "learning_rate": 0.00011242757821552723, "loss": 0.0434, "step": 3028 }, { "epoch": 1.3152409900130264, "grad_norm": 0.03704965114593506, "learning_rate": 0.00011239860950173813, "loss": 0.0434, "step": 3029 }, { "epoch": 1.3156752062527137, "grad_norm": 0.035477109253406525, "learning_rate": 0.00011236964078794902, "loss": 0.0415, "step": 3030 }, { "epoch": 1.3161094224924013, "grad_norm": 0.030192581936717033, "learning_rate": 0.0001123406720741599, "loss": 0.0348, "step": 3031 }, { "epoch": 1.3165436387320886, "grad_norm": 0.028463682159781456, "learning_rate": 0.0001123117033603708, "loss": 0.0335, "step": 3032 }, { "epoch": 1.3169778549717759, "grad_norm": 0.03614094853401184, "learning_rate": 0.0001122827346465817, "loss": 0.0421, "step": 3033 }, { "epoch": 1.3174120712114634, "grad_norm": 0.02876162715256214, "learning_rate": 0.0001122537659327926, "loss": 0.0351, "step": 3034 }, { "epoch": 1.3178462874511507, "grad_norm": 0.0261339470744133, "learning_rate": 0.00011222479721900347, "loss": 0.0324, "step": 3035 }, { "epoch": 1.318280503690838, "grad_norm": 0.03871973231434822, "learning_rate": 0.00011219582850521438, "loss": 0.0429, "step": 3036 }, { "epoch": 1.3187147199305254, "grad_norm": 0.027595600113272667, "learning_rate": 0.00011216685979142527, "loss": 0.0359, "step": 3037 }, { "epoch": 1.3191489361702127, "grad_norm": 0.02937772497534752, "learning_rate": 0.00011213789107763614, "loss": 0.0332, "step": 3038 }, { "epoch": 1.3195831524099002, "grad_norm": 0.02581372670829296, "learning_rate": 0.00011210892236384705, "loss": 0.0324, "step": 3039 }, { "epoch": 1.3200173686495875, "grad_norm": 0.03296217694878578, "learning_rate": 0.00011207995365005795, "loss": 0.0317, "step": 3040 }, { "epoch": 1.3204515848892748, "grad_norm": 0.030116375535726547, "learning_rate": 0.00011205098493626884, "loss": 0.0387, "step": 3041 }, { "epoch": 1.3208858011289621, "grad_norm": 0.02777777798473835, "learning_rate": 0.00011202201622247972, "loss": 0.0322, "step": 3042 }, { "epoch": 1.3213200173686497, "grad_norm": 0.033407438546419144, "learning_rate": 0.00011199304750869062, "loss": 0.0406, "step": 3043 }, { "epoch": 1.321754233608337, "grad_norm": 0.03261242061853409, "learning_rate": 0.00011196407879490151, "loss": 0.035, "step": 3044 }, { "epoch": 1.3221884498480243, "grad_norm": 0.030599107965826988, "learning_rate": 0.00011193511008111239, "loss": 0.0376, "step": 3045 }, { "epoch": 1.3226226660877116, "grad_norm": 0.0321236252784729, "learning_rate": 0.00011190614136732329, "loss": 0.0374, "step": 3046 }, { "epoch": 1.323056882327399, "grad_norm": 0.04012523964047432, "learning_rate": 0.0001118771726535342, "loss": 0.039, "step": 3047 }, { "epoch": 1.3234910985670865, "grad_norm": 0.027187393978238106, "learning_rate": 0.00011184820393974509, "loss": 0.0318, "step": 3048 }, { "epoch": 1.3239253148067738, "grad_norm": 0.026861360296607018, "learning_rate": 0.00011181923522595596, "loss": 0.0322, "step": 3049 }, { "epoch": 1.324359531046461, "grad_norm": 0.03027205914258957, "learning_rate": 0.00011179026651216687, "loss": 0.0413, "step": 3050 }, { "epoch": 1.3247937472861486, "grad_norm": 0.030360663309693336, "learning_rate": 0.00011176129779837776, "loss": 0.0315, "step": 3051 }, { "epoch": 1.325227963525836, "grad_norm": 0.027710681781172752, "learning_rate": 0.00011173232908458866, "loss": 0.029, "step": 3052 }, { "epoch": 1.3256621797655233, "grad_norm": 0.03578774631023407, "learning_rate": 0.00011170336037079954, "loss": 0.0457, "step": 3053 }, { "epoch": 1.3260963960052106, "grad_norm": 0.028475189581513405, "learning_rate": 0.00011167439165701044, "loss": 0.0359, "step": 3054 }, { "epoch": 1.3265306122448979, "grad_norm": 0.04579676315188408, "learning_rate": 0.00011164542294322133, "loss": 0.0459, "step": 3055 }, { "epoch": 1.3269648284845852, "grad_norm": 0.03313445672392845, "learning_rate": 0.00011161645422943221, "loss": 0.0417, "step": 3056 }, { "epoch": 1.3273990447242727, "grad_norm": 0.03940417990088463, "learning_rate": 0.00011158748551564311, "loss": 0.0443, "step": 3057 }, { "epoch": 1.32783326096396, "grad_norm": 0.037194181233644485, "learning_rate": 0.000111558516801854, "loss": 0.0395, "step": 3058 }, { "epoch": 1.3282674772036474, "grad_norm": 0.03901904076337814, "learning_rate": 0.0001115295480880649, "loss": 0.0479, "step": 3059 }, { "epoch": 1.3287016934433349, "grad_norm": 0.03523111343383789, "learning_rate": 0.00011150057937427578, "loss": 0.0389, "step": 3060 }, { "epoch": 1.3291359096830222, "grad_norm": 0.030384505167603493, "learning_rate": 0.00011147161066048669, "loss": 0.0386, "step": 3061 }, { "epoch": 1.3295701259227095, "grad_norm": 0.03593797609210014, "learning_rate": 0.00011144264194669758, "loss": 0.0413, "step": 3062 }, { "epoch": 1.3300043421623968, "grad_norm": 0.04044853150844574, "learning_rate": 0.00011141367323290845, "loss": 0.0531, "step": 3063 }, { "epoch": 1.3304385584020841, "grad_norm": 0.0461442768573761, "learning_rate": 0.00011138470451911936, "loss": 0.0452, "step": 3064 }, { "epoch": 1.3308727746417717, "grad_norm": 0.03080364130437374, "learning_rate": 0.00011135573580533025, "loss": 0.0397, "step": 3065 }, { "epoch": 1.331306990881459, "grad_norm": 0.03281201794743538, "learning_rate": 0.00011132676709154115, "loss": 0.0405, "step": 3066 }, { "epoch": 1.3317412071211463, "grad_norm": 0.03373052179813385, "learning_rate": 0.00011129779837775203, "loss": 0.0453, "step": 3067 }, { "epoch": 1.3321754233608338, "grad_norm": 0.03565700352191925, "learning_rate": 0.00011126882966396293, "loss": 0.0407, "step": 3068 }, { "epoch": 1.3326096396005211, "grad_norm": 0.029081355780363083, "learning_rate": 0.00011123986095017382, "loss": 0.0374, "step": 3069 }, { "epoch": 1.3330438558402085, "grad_norm": 0.030784204602241516, "learning_rate": 0.0001112108922363847, "loss": 0.0385, "step": 3070 }, { "epoch": 1.3334780720798958, "grad_norm": 0.03551880270242691, "learning_rate": 0.0001111819235225956, "loss": 0.0421, "step": 3071 }, { "epoch": 1.333912288319583, "grad_norm": 0.03723979741334915, "learning_rate": 0.00011115295480880649, "loss": 0.046, "step": 3072 }, { "epoch": 1.3343465045592704, "grad_norm": 0.03081456571817398, "learning_rate": 0.0001111239860950174, "loss": 0.0332, "step": 3073 }, { "epoch": 1.334780720798958, "grad_norm": 0.03365355357527733, "learning_rate": 0.00011109501738122827, "loss": 0.0463, "step": 3074 }, { "epoch": 1.3352149370386452, "grad_norm": 0.03612165525555611, "learning_rate": 0.00011106604866743918, "loss": 0.0421, "step": 3075 }, { "epoch": 1.3356491532783326, "grad_norm": 0.03488852456212044, "learning_rate": 0.00011103707995365007, "loss": 0.0397, "step": 3076 }, { "epoch": 1.33608336951802, "grad_norm": 0.03180518373847008, "learning_rate": 0.00011100811123986094, "loss": 0.0343, "step": 3077 }, { "epoch": 1.3365175857577074, "grad_norm": 0.036041561514139175, "learning_rate": 0.00011097914252607185, "loss": 0.0425, "step": 3078 }, { "epoch": 1.3369518019973947, "grad_norm": 0.04433013126254082, "learning_rate": 0.00011095017381228274, "loss": 0.0502, "step": 3079 }, { "epoch": 1.337386018237082, "grad_norm": 0.024332202970981598, "learning_rate": 0.00011092120509849364, "loss": 0.0287, "step": 3080 }, { "epoch": 1.3378202344767693, "grad_norm": 0.028876641765236855, "learning_rate": 0.00011089223638470452, "loss": 0.0364, "step": 3081 }, { "epoch": 1.3382544507164569, "grad_norm": 0.031292304396629333, "learning_rate": 0.00011086326767091542, "loss": 0.0373, "step": 3082 }, { "epoch": 1.3386886669561442, "grad_norm": 0.029405364766716957, "learning_rate": 0.00011083429895712631, "loss": 0.0423, "step": 3083 }, { "epoch": 1.3391228831958315, "grad_norm": 0.032557547092437744, "learning_rate": 0.00011080533024333719, "loss": 0.0419, "step": 3084 }, { "epoch": 1.3395570994355188, "grad_norm": 0.031998731195926666, "learning_rate": 0.00011077636152954809, "loss": 0.0414, "step": 3085 }, { "epoch": 1.3399913156752064, "grad_norm": 0.03126242756843567, "learning_rate": 0.000110747392815759, "loss": 0.0391, "step": 3086 }, { "epoch": 1.3404255319148937, "grad_norm": 0.034262459725141525, "learning_rate": 0.00011071842410196988, "loss": 0.0406, "step": 3087 }, { "epoch": 1.340859748154581, "grad_norm": 0.0388617105782032, "learning_rate": 0.00011068945538818076, "loss": 0.0502, "step": 3088 }, { "epoch": 1.3412939643942683, "grad_norm": 0.03547272831201553, "learning_rate": 0.00011066048667439166, "loss": 0.0403, "step": 3089 }, { "epoch": 1.3417281806339556, "grad_norm": 0.037292834371328354, "learning_rate": 0.00011063151796060255, "loss": 0.0401, "step": 3090 }, { "epoch": 1.3421623968736431, "grad_norm": 0.026430858299136162, "learning_rate": 0.00011060254924681343, "loss": 0.0302, "step": 3091 }, { "epoch": 1.3425966131133305, "grad_norm": 0.03583472967147827, "learning_rate": 0.00011057358053302434, "loss": 0.0336, "step": 3092 }, { "epoch": 1.3430308293530178, "grad_norm": 0.03774934634566307, "learning_rate": 0.00011054461181923524, "loss": 0.042, "step": 3093 }, { "epoch": 1.3434650455927053, "grad_norm": 0.02983023039996624, "learning_rate": 0.00011051564310544613, "loss": 0.0332, "step": 3094 }, { "epoch": 1.3438992618323926, "grad_norm": 0.031750477850437164, "learning_rate": 0.000110486674391657, "loss": 0.0334, "step": 3095 }, { "epoch": 1.34433347807208, "grad_norm": 0.02909061312675476, "learning_rate": 0.00011045770567786791, "loss": 0.0367, "step": 3096 }, { "epoch": 1.3447676943117672, "grad_norm": 0.02900591306388378, "learning_rate": 0.0001104287369640788, "loss": 0.032, "step": 3097 }, { "epoch": 1.3452019105514545, "grad_norm": 0.03231970593333244, "learning_rate": 0.0001103997682502897, "loss": 0.0365, "step": 3098 }, { "epoch": 1.3456361267911419, "grad_norm": 0.04000348597764969, "learning_rate": 0.00011037079953650058, "loss": 0.0483, "step": 3099 }, { "epoch": 1.3460703430308294, "grad_norm": 0.03144702687859535, "learning_rate": 0.00011034183082271148, "loss": 0.0371, "step": 3100 }, { "epoch": 1.3465045592705167, "grad_norm": 0.03656529262661934, "learning_rate": 0.00011031286210892237, "loss": 0.0396, "step": 3101 }, { "epoch": 1.346938775510204, "grad_norm": 0.0341009795665741, "learning_rate": 0.00011028389339513325, "loss": 0.0374, "step": 3102 }, { "epoch": 1.3473729917498916, "grad_norm": 0.03186469525098801, "learning_rate": 0.00011025492468134415, "loss": 0.0382, "step": 3103 }, { "epoch": 1.3478072079895789, "grad_norm": 0.040695372968912125, "learning_rate": 0.00011022595596755504, "loss": 0.0405, "step": 3104 }, { "epoch": 1.3482414242292662, "grad_norm": 0.03081185556948185, "learning_rate": 0.00011019698725376595, "loss": 0.0308, "step": 3105 }, { "epoch": 1.3486756404689535, "grad_norm": 0.028407620266079903, "learning_rate": 0.00011016801853997682, "loss": 0.0397, "step": 3106 }, { "epoch": 1.3491098567086408, "grad_norm": 0.023906176909804344, "learning_rate": 0.00011013904982618773, "loss": 0.0306, "step": 3107 }, { "epoch": 1.3495440729483283, "grad_norm": 0.0347139947116375, "learning_rate": 0.00011011008111239862, "loss": 0.0429, "step": 3108 }, { "epoch": 1.3499782891880157, "grad_norm": 0.03799999877810478, "learning_rate": 0.0001100811123986095, "loss": 0.0377, "step": 3109 }, { "epoch": 1.350412505427703, "grad_norm": 0.03227884694933891, "learning_rate": 0.0001100521436848204, "loss": 0.0368, "step": 3110 }, { "epoch": 1.3508467216673903, "grad_norm": 0.025860344991087914, "learning_rate": 0.00011002317497103129, "loss": 0.0311, "step": 3111 }, { "epoch": 1.3512809379070778, "grad_norm": 0.027932479977607727, "learning_rate": 0.00010999420625724219, "loss": 0.0335, "step": 3112 }, { "epoch": 1.3517151541467651, "grad_norm": 0.029126150533556938, "learning_rate": 0.00010996523754345307, "loss": 0.0374, "step": 3113 }, { "epoch": 1.3521493703864524, "grad_norm": 0.036490365862846375, "learning_rate": 0.00010993626882966397, "loss": 0.0413, "step": 3114 }, { "epoch": 1.3525835866261398, "grad_norm": 0.030705703422427177, "learning_rate": 0.00010990730011587486, "loss": 0.0377, "step": 3115 }, { "epoch": 1.353017802865827, "grad_norm": 0.026701487600803375, "learning_rate": 0.00010987833140208574, "loss": 0.0321, "step": 3116 }, { "epoch": 1.3534520191055146, "grad_norm": 0.035791072994470596, "learning_rate": 0.00010984936268829664, "loss": 0.037, "step": 3117 }, { "epoch": 1.353886235345202, "grad_norm": 0.03799590468406677, "learning_rate": 0.00010982039397450753, "loss": 0.0423, "step": 3118 }, { "epoch": 1.3543204515848892, "grad_norm": 0.03330104425549507, "learning_rate": 0.00010979142526071844, "loss": 0.0375, "step": 3119 }, { "epoch": 1.3547546678245768, "grad_norm": 0.0362006314098835, "learning_rate": 0.00010976245654692931, "loss": 0.0457, "step": 3120 }, { "epoch": 1.355188884064264, "grad_norm": 0.03755546361207962, "learning_rate": 0.00010973348783314022, "loss": 0.0443, "step": 3121 }, { "epoch": 1.3556231003039514, "grad_norm": 0.031083177775144577, "learning_rate": 0.00010970451911935111, "loss": 0.0386, "step": 3122 }, { "epoch": 1.3560573165436387, "grad_norm": 0.036307286471128464, "learning_rate": 0.00010967555040556198, "loss": 0.0496, "step": 3123 }, { "epoch": 1.356491532783326, "grad_norm": 0.02894393540918827, "learning_rate": 0.00010964658169177289, "loss": 0.0359, "step": 3124 }, { "epoch": 1.3569257490230133, "grad_norm": 0.03138035908341408, "learning_rate": 0.00010961761297798379, "loss": 0.0329, "step": 3125 }, { "epoch": 1.3573599652627009, "grad_norm": 0.03450090065598488, "learning_rate": 0.00010958864426419468, "loss": 0.041, "step": 3126 }, { "epoch": 1.3577941815023882, "grad_norm": 0.03599924594163895, "learning_rate": 0.00010955967555040556, "loss": 0.038, "step": 3127 }, { "epoch": 1.3582283977420755, "grad_norm": 0.03015170805156231, "learning_rate": 0.00010953070683661646, "loss": 0.0364, "step": 3128 }, { "epoch": 1.358662613981763, "grad_norm": 0.027220653370022774, "learning_rate": 0.00010950173812282735, "loss": 0.0306, "step": 3129 }, { "epoch": 1.3590968302214503, "grad_norm": 0.026676863431930542, "learning_rate": 0.00010947276940903823, "loss": 0.0289, "step": 3130 }, { "epoch": 1.3595310464611376, "grad_norm": 0.030855510383844376, "learning_rate": 0.00010944380069524913, "loss": 0.0346, "step": 3131 }, { "epoch": 1.359965262700825, "grad_norm": 0.03017914667725563, "learning_rate": 0.00010941483198146004, "loss": 0.036, "step": 3132 }, { "epoch": 1.3603994789405123, "grad_norm": 0.034865446388721466, "learning_rate": 0.00010938586326767093, "loss": 0.0442, "step": 3133 }, { "epoch": 1.3608336951801998, "grad_norm": 0.02486284449696541, "learning_rate": 0.0001093568945538818, "loss": 0.0285, "step": 3134 }, { "epoch": 1.3612679114198871, "grad_norm": 0.028663625940680504, "learning_rate": 0.00010932792584009271, "loss": 0.0332, "step": 3135 }, { "epoch": 1.3617021276595744, "grad_norm": 0.03579244390130043, "learning_rate": 0.0001092989571263036, "loss": 0.0448, "step": 3136 }, { "epoch": 1.362136343899262, "grad_norm": 0.0316963754594326, "learning_rate": 0.00010926998841251447, "loss": 0.0389, "step": 3137 }, { "epoch": 1.3625705601389493, "grad_norm": 0.042476870119571686, "learning_rate": 0.00010924101969872538, "loss": 0.0554, "step": 3138 }, { "epoch": 1.3630047763786366, "grad_norm": 0.03245158493518829, "learning_rate": 0.00010921205098493628, "loss": 0.0412, "step": 3139 }, { "epoch": 1.363438992618324, "grad_norm": 0.02601350098848343, "learning_rate": 0.00010918308227114717, "loss": 0.0358, "step": 3140 }, { "epoch": 1.3638732088580112, "grad_norm": 0.03592539578676224, "learning_rate": 0.00010915411355735805, "loss": 0.0347, "step": 3141 }, { "epoch": 1.3643074250976985, "grad_norm": 0.04516908898949623, "learning_rate": 0.00010912514484356895, "loss": 0.056, "step": 3142 }, { "epoch": 1.364741641337386, "grad_norm": 0.027739830315113068, "learning_rate": 0.00010909617612977984, "loss": 0.0359, "step": 3143 }, { "epoch": 1.3651758575770734, "grad_norm": 0.035830408334732056, "learning_rate": 0.00010906720741599072, "loss": 0.0422, "step": 3144 }, { "epoch": 1.3656100738167607, "grad_norm": 0.025232959538698196, "learning_rate": 0.00010903823870220162, "loss": 0.0284, "step": 3145 }, { "epoch": 1.3660442900564482, "grad_norm": 0.03499378636479378, "learning_rate": 0.00010900926998841253, "loss": 0.0424, "step": 3146 }, { "epoch": 1.3664785062961355, "grad_norm": 0.02978041023015976, "learning_rate": 0.00010898030127462342, "loss": 0.0422, "step": 3147 }, { "epoch": 1.3669127225358229, "grad_norm": 0.03364676982164383, "learning_rate": 0.00010895133256083429, "loss": 0.0388, "step": 3148 }, { "epoch": 1.3673469387755102, "grad_norm": 0.030819695442914963, "learning_rate": 0.0001089223638470452, "loss": 0.0363, "step": 3149 }, { "epoch": 1.3677811550151975, "grad_norm": 0.02389485202729702, "learning_rate": 0.00010889339513325609, "loss": 0.0284, "step": 3150 }, { "epoch": 1.368215371254885, "grad_norm": 0.03985444828867912, "learning_rate": 0.00010886442641946699, "loss": 0.0417, "step": 3151 }, { "epoch": 1.3686495874945723, "grad_norm": 0.038257990032434464, "learning_rate": 0.00010883545770567787, "loss": 0.0497, "step": 3152 }, { "epoch": 1.3690838037342596, "grad_norm": 0.04748479649424553, "learning_rate": 0.00010880648899188877, "loss": 0.0484, "step": 3153 }, { "epoch": 1.369518019973947, "grad_norm": 0.0416133850812912, "learning_rate": 0.00010877752027809966, "loss": 0.0442, "step": 3154 }, { "epoch": 1.3699522362136345, "grad_norm": 0.03841627761721611, "learning_rate": 0.00010874855156431054, "loss": 0.0411, "step": 3155 }, { "epoch": 1.3703864524533218, "grad_norm": 0.04490552842617035, "learning_rate": 0.00010871958285052144, "loss": 0.0504, "step": 3156 }, { "epoch": 1.3708206686930091, "grad_norm": 0.03704807907342911, "learning_rate": 0.00010869061413673233, "loss": 0.0363, "step": 3157 }, { "epoch": 1.3712548849326964, "grad_norm": 0.04014680162072182, "learning_rate": 0.00010866164542294324, "loss": 0.0482, "step": 3158 }, { "epoch": 1.3716891011723837, "grad_norm": 0.028961682692170143, "learning_rate": 0.00010863267670915411, "loss": 0.035, "step": 3159 }, { "epoch": 1.3721233174120713, "grad_norm": 0.03812296688556671, "learning_rate": 0.00010860370799536502, "loss": 0.0368, "step": 3160 }, { "epoch": 1.3725575336517586, "grad_norm": 0.025441652163863182, "learning_rate": 0.0001085747392815759, "loss": 0.0291, "step": 3161 }, { "epoch": 1.372991749891446, "grad_norm": 0.02938506379723549, "learning_rate": 0.00010854577056778678, "loss": 0.0362, "step": 3162 }, { "epoch": 1.3734259661311334, "grad_norm": 0.03512920066714287, "learning_rate": 0.00010851680185399769, "loss": 0.0458, "step": 3163 }, { "epoch": 1.3738601823708207, "grad_norm": 0.031456537544727325, "learning_rate": 0.00010848783314020859, "loss": 0.0333, "step": 3164 }, { "epoch": 1.374294398610508, "grad_norm": 0.03799983486533165, "learning_rate": 0.00010845886442641948, "loss": 0.047, "step": 3165 }, { "epoch": 1.3747286148501954, "grad_norm": 0.03878263384103775, "learning_rate": 0.00010842989571263036, "loss": 0.0512, "step": 3166 }, { "epoch": 1.3751628310898827, "grad_norm": 0.03018483892083168, "learning_rate": 0.00010840092699884126, "loss": 0.0417, "step": 3167 }, { "epoch": 1.37559704732957, "grad_norm": 0.035068970173597336, "learning_rate": 0.00010837195828505215, "loss": 0.0349, "step": 3168 }, { "epoch": 1.3760312635692575, "grad_norm": 0.03779861703515053, "learning_rate": 0.00010834298957126303, "loss": 0.0439, "step": 3169 }, { "epoch": 1.3764654798089448, "grad_norm": 0.0363328717648983, "learning_rate": 0.00010831402085747393, "loss": 0.0389, "step": 3170 }, { "epoch": 1.3768996960486322, "grad_norm": 0.03174567595124245, "learning_rate": 0.00010828505214368483, "loss": 0.0385, "step": 3171 }, { "epoch": 1.3773339122883197, "grad_norm": 0.028425587341189384, "learning_rate": 0.00010825608342989572, "loss": 0.0323, "step": 3172 }, { "epoch": 1.377768128528007, "grad_norm": 0.030422117561101913, "learning_rate": 0.0001082271147161066, "loss": 0.0371, "step": 3173 }, { "epoch": 1.3782023447676943, "grad_norm": 0.025947488844394684, "learning_rate": 0.0001081981460023175, "loss": 0.0336, "step": 3174 }, { "epoch": 1.3786365610073816, "grad_norm": 0.027242116630077362, "learning_rate": 0.0001081691772885284, "loss": 0.0359, "step": 3175 }, { "epoch": 1.379070777247069, "grad_norm": 0.028777867555618286, "learning_rate": 0.00010814020857473927, "loss": 0.0335, "step": 3176 }, { "epoch": 1.3795049934867565, "grad_norm": 0.03243269398808479, "learning_rate": 0.00010811123986095018, "loss": 0.042, "step": 3177 }, { "epoch": 1.3799392097264438, "grad_norm": 0.029834967106580734, "learning_rate": 0.00010808227114716108, "loss": 0.0399, "step": 3178 }, { "epoch": 1.380373425966131, "grad_norm": 0.026142358779907227, "learning_rate": 0.00010805330243337197, "loss": 0.0312, "step": 3179 }, { "epoch": 1.3808076422058184, "grad_norm": 0.036690227687358856, "learning_rate": 0.00010802433371958285, "loss": 0.0416, "step": 3180 }, { "epoch": 1.381241858445506, "grad_norm": 0.028992636129260063, "learning_rate": 0.00010799536500579375, "loss": 0.0346, "step": 3181 }, { "epoch": 1.3816760746851933, "grad_norm": 0.027403950691223145, "learning_rate": 0.00010796639629200464, "loss": 0.0315, "step": 3182 }, { "epoch": 1.3821102909248806, "grad_norm": 0.02984504960477352, "learning_rate": 0.00010793742757821552, "loss": 0.0315, "step": 3183 }, { "epoch": 1.382544507164568, "grad_norm": 0.03903144598007202, "learning_rate": 0.00010790845886442642, "loss": 0.0494, "step": 3184 }, { "epoch": 1.3829787234042552, "grad_norm": 0.04277553781867027, "learning_rate": 0.00010787949015063732, "loss": 0.0488, "step": 3185 }, { "epoch": 1.3834129396439427, "grad_norm": 0.028289254754781723, "learning_rate": 0.00010785052143684821, "loss": 0.0349, "step": 3186 }, { "epoch": 1.38384715588363, "grad_norm": 0.030348733067512512, "learning_rate": 0.00010782155272305909, "loss": 0.0354, "step": 3187 }, { "epoch": 1.3842813721233174, "grad_norm": 0.03448227420449257, "learning_rate": 0.00010779258400927, "loss": 0.0378, "step": 3188 }, { "epoch": 1.384715588363005, "grad_norm": 0.029860729351639748, "learning_rate": 0.00010776361529548088, "loss": 0.0339, "step": 3189 }, { "epoch": 1.3851498046026922, "grad_norm": 0.03594005107879639, "learning_rate": 0.00010773464658169176, "loss": 0.0394, "step": 3190 }, { "epoch": 1.3855840208423795, "grad_norm": 0.027083542197942734, "learning_rate": 0.00010770567786790266, "loss": 0.0328, "step": 3191 }, { "epoch": 1.3860182370820668, "grad_norm": 0.028965191915631294, "learning_rate": 0.00010767670915411357, "loss": 0.037, "step": 3192 }, { "epoch": 1.3864524533217542, "grad_norm": 0.023740047588944435, "learning_rate": 0.00010764774044032446, "loss": 0.0287, "step": 3193 }, { "epoch": 1.3868866695614415, "grad_norm": 0.03428789973258972, "learning_rate": 0.00010761877172653534, "loss": 0.0439, "step": 3194 }, { "epoch": 1.387320885801129, "grad_norm": 0.030780183151364326, "learning_rate": 0.00010758980301274624, "loss": 0.0384, "step": 3195 }, { "epoch": 1.3877551020408163, "grad_norm": 0.03086603432893753, "learning_rate": 0.00010756083429895713, "loss": 0.0341, "step": 3196 }, { "epoch": 1.3881893182805036, "grad_norm": 0.025530098006129265, "learning_rate": 0.00010753186558516803, "loss": 0.0318, "step": 3197 }, { "epoch": 1.3886235345201912, "grad_norm": 0.03410535678267479, "learning_rate": 0.00010750289687137891, "loss": 0.0405, "step": 3198 }, { "epoch": 1.3890577507598785, "grad_norm": 0.03248567506670952, "learning_rate": 0.00010747392815758981, "loss": 0.0434, "step": 3199 }, { "epoch": 1.3894919669995658, "grad_norm": 0.029699400067329407, "learning_rate": 0.0001074449594438007, "loss": 0.0356, "step": 3200 }, { "epoch": 1.389926183239253, "grad_norm": 0.031138869002461433, "learning_rate": 0.00010741599073001158, "loss": 0.0367, "step": 3201 }, { "epoch": 1.3903603994789404, "grad_norm": 0.03523705154657364, "learning_rate": 0.00010738702201622248, "loss": 0.0344, "step": 3202 }, { "epoch": 1.390794615718628, "grad_norm": 0.025838548317551613, "learning_rate": 0.00010735805330243339, "loss": 0.0312, "step": 3203 }, { "epoch": 1.3912288319583153, "grad_norm": 0.02980492077767849, "learning_rate": 0.00010732908458864428, "loss": 0.0325, "step": 3204 }, { "epoch": 1.3916630481980026, "grad_norm": 0.03147491440176964, "learning_rate": 0.00010730011587485515, "loss": 0.0376, "step": 3205 }, { "epoch": 1.39209726443769, "grad_norm": 0.03466599062085152, "learning_rate": 0.00010727114716106606, "loss": 0.0444, "step": 3206 }, { "epoch": 1.3925314806773774, "grad_norm": 0.02900407277047634, "learning_rate": 0.00010724217844727695, "loss": 0.0354, "step": 3207 }, { "epoch": 1.3929656969170647, "grad_norm": 0.03876974433660507, "learning_rate": 0.00010721320973348782, "loss": 0.0412, "step": 3208 }, { "epoch": 1.393399913156752, "grad_norm": 0.028679171577095985, "learning_rate": 0.00010718424101969873, "loss": 0.0359, "step": 3209 }, { "epoch": 1.3938341293964394, "grad_norm": 0.03055979497730732, "learning_rate": 0.00010715527230590963, "loss": 0.0379, "step": 3210 }, { "epoch": 1.3942683456361267, "grad_norm": 0.029706789180636406, "learning_rate": 0.00010712630359212052, "loss": 0.0337, "step": 3211 }, { "epoch": 1.3947025618758142, "grad_norm": 0.036127083003520966, "learning_rate": 0.0001070973348783314, "loss": 0.036, "step": 3212 }, { "epoch": 1.3951367781155015, "grad_norm": 0.025300292298197746, "learning_rate": 0.0001070683661645423, "loss": 0.0294, "step": 3213 }, { "epoch": 1.3955709943551888, "grad_norm": 0.025743434205651283, "learning_rate": 0.00010703939745075319, "loss": 0.0305, "step": 3214 }, { "epoch": 1.3960052105948764, "grad_norm": 0.029615003615617752, "learning_rate": 0.00010701042873696407, "loss": 0.0354, "step": 3215 }, { "epoch": 1.3964394268345637, "grad_norm": 0.03455755114555359, "learning_rate": 0.00010698146002317497, "loss": 0.0368, "step": 3216 }, { "epoch": 1.396873643074251, "grad_norm": 0.02908787690103054, "learning_rate": 0.00010695249130938588, "loss": 0.0308, "step": 3217 }, { "epoch": 1.3973078593139383, "grad_norm": 0.03211832791566849, "learning_rate": 0.00010692352259559677, "loss": 0.0376, "step": 3218 }, { "epoch": 1.3977420755536256, "grad_norm": 0.03817525878548622, "learning_rate": 0.00010689455388180764, "loss": 0.0405, "step": 3219 }, { "epoch": 1.3981762917933132, "grad_norm": 0.025961050763726234, "learning_rate": 0.00010686558516801855, "loss": 0.0323, "step": 3220 }, { "epoch": 1.3986105080330005, "grad_norm": 0.027354005724191666, "learning_rate": 0.00010683661645422944, "loss": 0.0335, "step": 3221 }, { "epoch": 1.3990447242726878, "grad_norm": 0.03182535618543625, "learning_rate": 0.00010680764774044031, "loss": 0.03, "step": 3222 }, { "epoch": 1.399478940512375, "grad_norm": 0.040023282170295715, "learning_rate": 0.00010677867902665122, "loss": 0.0424, "step": 3223 }, { "epoch": 1.3999131567520626, "grad_norm": 0.03446388617157936, "learning_rate": 0.00010674971031286212, "loss": 0.0333, "step": 3224 }, { "epoch": 1.40034737299175, "grad_norm": 0.03562047705054283, "learning_rate": 0.00010672074159907301, "loss": 0.0423, "step": 3225 }, { "epoch": 1.4007815892314373, "grad_norm": 0.034584201872348785, "learning_rate": 0.00010669177288528389, "loss": 0.0403, "step": 3226 }, { "epoch": 1.4012158054711246, "grad_norm": 0.035848475992679596, "learning_rate": 0.00010666280417149479, "loss": 0.0424, "step": 3227 }, { "epoch": 1.4016500217108119, "grad_norm": 0.0364072322845459, "learning_rate": 0.00010663383545770568, "loss": 0.0406, "step": 3228 }, { "epoch": 1.4020842379504994, "grad_norm": 0.029652820900082588, "learning_rate": 0.00010660486674391656, "loss": 0.0332, "step": 3229 }, { "epoch": 1.4025184541901867, "grad_norm": 0.03303779289126396, "learning_rate": 0.00010657589803012746, "loss": 0.0417, "step": 3230 }, { "epoch": 1.402952670429874, "grad_norm": 0.03159058094024658, "learning_rate": 0.00010654692931633837, "loss": 0.0398, "step": 3231 }, { "epoch": 1.4033868866695616, "grad_norm": 0.032879702746868134, "learning_rate": 0.00010651796060254926, "loss": 0.0412, "step": 3232 }, { "epoch": 1.4038211029092489, "grad_norm": 0.03602813929319382, "learning_rate": 0.00010648899188876013, "loss": 0.0354, "step": 3233 }, { "epoch": 1.4042553191489362, "grad_norm": 0.027373963966965675, "learning_rate": 0.00010646002317497104, "loss": 0.0332, "step": 3234 }, { "epoch": 1.4046895353886235, "grad_norm": 0.032608553767204285, "learning_rate": 0.00010643105446118193, "loss": 0.0372, "step": 3235 }, { "epoch": 1.4051237516283108, "grad_norm": 0.03611568361520767, "learning_rate": 0.0001064020857473928, "loss": 0.0434, "step": 3236 }, { "epoch": 1.4055579678679981, "grad_norm": 0.024837814271450043, "learning_rate": 0.00010637311703360371, "loss": 0.0285, "step": 3237 }, { "epoch": 1.4059921841076857, "grad_norm": 0.03480709344148636, "learning_rate": 0.00010634414831981461, "loss": 0.0397, "step": 3238 }, { "epoch": 1.406426400347373, "grad_norm": 0.036852896213531494, "learning_rate": 0.0001063151796060255, "loss": 0.048, "step": 3239 }, { "epoch": 1.4068606165870603, "grad_norm": 0.036686159670352936, "learning_rate": 0.00010628621089223638, "loss": 0.0416, "step": 3240 }, { "epoch": 1.4072948328267478, "grad_norm": 0.03664826601743698, "learning_rate": 0.00010625724217844728, "loss": 0.0448, "step": 3241 }, { "epoch": 1.4077290490664351, "grad_norm": 0.03065604530274868, "learning_rate": 0.00010622827346465819, "loss": 0.0329, "step": 3242 }, { "epoch": 1.4081632653061225, "grad_norm": 0.0347236692905426, "learning_rate": 0.00010619930475086908, "loss": 0.0401, "step": 3243 }, { "epoch": 1.4085974815458098, "grad_norm": 0.03463771566748619, "learning_rate": 0.00010617033603707995, "loss": 0.0421, "step": 3244 }, { "epoch": 1.409031697785497, "grad_norm": 0.03532618284225464, "learning_rate": 0.00010614136732329086, "loss": 0.0419, "step": 3245 }, { "epoch": 1.4094659140251846, "grad_norm": 0.03360011801123619, "learning_rate": 0.00010611239860950175, "loss": 0.0385, "step": 3246 }, { "epoch": 1.409900130264872, "grad_norm": 0.03765629231929779, "learning_rate": 0.00010608342989571262, "loss": 0.0385, "step": 3247 }, { "epoch": 1.4103343465045592, "grad_norm": 0.02594027668237686, "learning_rate": 0.00010605446118192353, "loss": 0.0339, "step": 3248 }, { "epoch": 1.4107685627442466, "grad_norm": 0.032357264310121536, "learning_rate": 0.00010602549246813443, "loss": 0.039, "step": 3249 }, { "epoch": 1.411202778983934, "grad_norm": 0.02691616490483284, "learning_rate": 0.00010599652375434532, "loss": 0.0374, "step": 3250 }, { "epoch": 1.4116369952236214, "grad_norm": 0.02686849609017372, "learning_rate": 0.0001059675550405562, "loss": 0.03, "step": 3251 }, { "epoch": 1.4120712114633087, "grad_norm": 0.0355256050825119, "learning_rate": 0.0001059385863267671, "loss": 0.0416, "step": 3252 }, { "epoch": 1.412505427702996, "grad_norm": 0.03339817002415657, "learning_rate": 0.00010590961761297799, "loss": 0.0443, "step": 3253 }, { "epoch": 1.4129396439426833, "grad_norm": 0.038682859390974045, "learning_rate": 0.00010588064889918887, "loss": 0.0464, "step": 3254 }, { "epoch": 1.4133738601823709, "grad_norm": 0.031128162518143654, "learning_rate": 0.00010585168018539977, "loss": 0.0371, "step": 3255 }, { "epoch": 1.4138080764220582, "grad_norm": 0.02861013635993004, "learning_rate": 0.00010582271147161067, "loss": 0.0306, "step": 3256 }, { "epoch": 1.4142422926617455, "grad_norm": 0.03535071387887001, "learning_rate": 0.00010579374275782157, "loss": 0.0442, "step": 3257 }, { "epoch": 1.414676508901433, "grad_norm": 0.03305106982588768, "learning_rate": 0.00010576477404403244, "loss": 0.039, "step": 3258 }, { "epoch": 1.4151107251411204, "grad_norm": 0.035108137875795364, "learning_rate": 0.00010573580533024335, "loss": 0.0413, "step": 3259 }, { "epoch": 1.4155449413808077, "grad_norm": 0.03309998661279678, "learning_rate": 0.00010570683661645424, "loss": 0.0411, "step": 3260 }, { "epoch": 1.415979157620495, "grad_norm": 0.03299713507294655, "learning_rate": 0.00010567786790266511, "loss": 0.0407, "step": 3261 }, { "epoch": 1.4164133738601823, "grad_norm": 0.031569916754961014, "learning_rate": 0.00010564889918887602, "loss": 0.0383, "step": 3262 }, { "epoch": 1.4168475900998696, "grad_norm": 0.032350361347198486, "learning_rate": 0.00010561993047508692, "loss": 0.0374, "step": 3263 }, { "epoch": 1.4172818063395571, "grad_norm": 0.03273614123463631, "learning_rate": 0.00010559096176129781, "loss": 0.0382, "step": 3264 }, { "epoch": 1.4177160225792445, "grad_norm": 0.028406990692019463, "learning_rate": 0.00010556199304750869, "loss": 0.0396, "step": 3265 }, { "epoch": 1.4181502388189318, "grad_norm": 0.04151050001382828, "learning_rate": 0.00010553302433371959, "loss": 0.0535, "step": 3266 }, { "epoch": 1.4185844550586193, "grad_norm": 0.045416295528411865, "learning_rate": 0.00010550405561993048, "loss": 0.0568, "step": 3267 }, { "epoch": 1.4190186712983066, "grad_norm": 0.035436343401670456, "learning_rate": 0.00010547508690614136, "loss": 0.0451, "step": 3268 }, { "epoch": 1.419452887537994, "grad_norm": 0.028981279581785202, "learning_rate": 0.00010544611819235226, "loss": 0.0325, "step": 3269 }, { "epoch": 1.4198871037776812, "grad_norm": 0.03858104720711708, "learning_rate": 0.00010541714947856316, "loss": 0.0361, "step": 3270 }, { "epoch": 1.4203213200173685, "grad_norm": 0.03171723708510399, "learning_rate": 0.00010538818076477405, "loss": 0.0385, "step": 3271 }, { "epoch": 1.420755536257056, "grad_norm": 0.03213641047477722, "learning_rate": 0.00010535921205098493, "loss": 0.0449, "step": 3272 }, { "epoch": 1.4211897524967434, "grad_norm": 0.02499936893582344, "learning_rate": 0.00010533024333719583, "loss": 0.0287, "step": 3273 }, { "epoch": 1.4216239687364307, "grad_norm": 0.03525727614760399, "learning_rate": 0.00010530127462340672, "loss": 0.0414, "step": 3274 }, { "epoch": 1.4220581849761182, "grad_norm": 0.025460774078965187, "learning_rate": 0.0001052723059096176, "loss": 0.0325, "step": 3275 }, { "epoch": 1.4224924012158056, "grad_norm": 0.034024693071842194, "learning_rate": 0.0001052433371958285, "loss": 0.0412, "step": 3276 }, { "epoch": 1.4229266174554929, "grad_norm": 0.030504699796438217, "learning_rate": 0.00010521436848203941, "loss": 0.0311, "step": 3277 }, { "epoch": 1.4233608336951802, "grad_norm": 0.032529011368751526, "learning_rate": 0.0001051853997682503, "loss": 0.0367, "step": 3278 }, { "epoch": 1.4237950499348675, "grad_norm": 0.038764290511608124, "learning_rate": 0.00010515643105446118, "loss": 0.046, "step": 3279 }, { "epoch": 1.4242292661745548, "grad_norm": 0.042133111506700516, "learning_rate": 0.00010512746234067208, "loss": 0.044, "step": 3280 }, { "epoch": 1.4246634824142423, "grad_norm": 0.027921125292778015, "learning_rate": 0.00010509849362688298, "loss": 0.03, "step": 3281 }, { "epoch": 1.4250976986539297, "grad_norm": 0.030699385330080986, "learning_rate": 0.00010506952491309385, "loss": 0.0307, "step": 3282 }, { "epoch": 1.425531914893617, "grad_norm": 0.02786310389637947, "learning_rate": 0.00010504055619930475, "loss": 0.0346, "step": 3283 }, { "epoch": 1.4259661311333045, "grad_norm": 0.04770581051707268, "learning_rate": 0.00010501158748551565, "loss": 0.0563, "step": 3284 }, { "epoch": 1.4264003473729918, "grad_norm": 0.03366216644644737, "learning_rate": 0.00010498261877172654, "loss": 0.0389, "step": 3285 }, { "epoch": 1.4268345636126791, "grad_norm": 0.03289399296045303, "learning_rate": 0.00010495365005793742, "loss": 0.0326, "step": 3286 }, { "epoch": 1.4272687798523664, "grad_norm": 0.031693994998931885, "learning_rate": 0.00010492468134414832, "loss": 0.0389, "step": 3287 }, { "epoch": 1.4277029960920538, "grad_norm": 0.031907886266708374, "learning_rate": 0.00010489571263035923, "loss": 0.0371, "step": 3288 }, { "epoch": 1.4281372123317413, "grad_norm": 0.027918869629502296, "learning_rate": 0.0001048667439165701, "loss": 0.0334, "step": 3289 }, { "epoch": 1.4285714285714286, "grad_norm": 0.030658742412924767, "learning_rate": 0.000104837775202781, "loss": 0.0384, "step": 3290 }, { "epoch": 1.429005644811116, "grad_norm": 0.0326542891561985, "learning_rate": 0.0001048088064889919, "loss": 0.0409, "step": 3291 }, { "epoch": 1.4294398610508032, "grad_norm": 0.031729135662317276, "learning_rate": 0.00010477983777520279, "loss": 0.0378, "step": 3292 }, { "epoch": 1.4298740772904908, "grad_norm": 0.03256106376647949, "learning_rate": 0.00010475086906141367, "loss": 0.0426, "step": 3293 }, { "epoch": 1.430308293530178, "grad_norm": 0.0483257956802845, "learning_rate": 0.00010472190034762457, "loss": 0.0606, "step": 3294 }, { "epoch": 1.4307425097698654, "grad_norm": 0.030745111405849457, "learning_rate": 0.00010469293163383547, "loss": 0.0399, "step": 3295 }, { "epoch": 1.4311767260095527, "grad_norm": 0.0315384678542614, "learning_rate": 0.00010466396292004636, "loss": 0.0353, "step": 3296 }, { "epoch": 1.43161094224924, "grad_norm": 0.033200617879629135, "learning_rate": 0.00010463499420625724, "loss": 0.0342, "step": 3297 }, { "epoch": 1.4320451584889275, "grad_norm": 0.024957740679383278, "learning_rate": 0.00010460602549246814, "loss": 0.0316, "step": 3298 }, { "epoch": 1.4324793747286149, "grad_norm": 0.024794064462184906, "learning_rate": 0.00010457705677867903, "loss": 0.0304, "step": 3299 }, { "epoch": 1.4329135909683022, "grad_norm": 0.029068659991025925, "learning_rate": 0.00010454808806488991, "loss": 0.0395, "step": 3300 }, { "epoch": 1.4333478072079897, "grad_norm": 0.028627926483750343, "learning_rate": 0.00010451911935110081, "loss": 0.0336, "step": 3301 }, { "epoch": 1.433782023447677, "grad_norm": 0.026776088401675224, "learning_rate": 0.00010449015063731172, "loss": 0.0318, "step": 3302 }, { "epoch": 1.4342162396873643, "grad_norm": 0.026125865057110786, "learning_rate": 0.00010446118192352261, "loss": 0.0322, "step": 3303 }, { "epoch": 1.4346504559270516, "grad_norm": 0.039897192269563675, "learning_rate": 0.00010443221320973348, "loss": 0.0552, "step": 3304 }, { "epoch": 1.435084672166739, "grad_norm": 0.031116807833313942, "learning_rate": 0.00010440324449594439, "loss": 0.0378, "step": 3305 }, { "epoch": 1.4355188884064263, "grad_norm": 0.02994142472743988, "learning_rate": 0.00010437427578215528, "loss": 0.0425, "step": 3306 }, { "epoch": 1.4359531046461138, "grad_norm": 0.04656873643398285, "learning_rate": 0.00010434530706836615, "loss": 0.0546, "step": 3307 }, { "epoch": 1.4363873208858011, "grad_norm": 0.031884320080280304, "learning_rate": 0.00010431633835457706, "loss": 0.0378, "step": 3308 }, { "epoch": 1.4368215371254884, "grad_norm": 0.026836002245545387, "learning_rate": 0.00010428736964078796, "loss": 0.0284, "step": 3309 }, { "epoch": 1.437255753365176, "grad_norm": 0.03831776604056358, "learning_rate": 0.00010425840092699885, "loss": 0.0453, "step": 3310 }, { "epoch": 1.4376899696048633, "grad_norm": 0.03864622488617897, "learning_rate": 0.00010422943221320973, "loss": 0.042, "step": 3311 }, { "epoch": 1.4381241858445506, "grad_norm": 0.030636310577392578, "learning_rate": 0.00010420046349942063, "loss": 0.0421, "step": 3312 }, { "epoch": 1.438558402084238, "grad_norm": 0.03151032701134682, "learning_rate": 0.00010417149478563152, "loss": 0.0363, "step": 3313 }, { "epoch": 1.4389926183239252, "grad_norm": 0.03308417648077011, "learning_rate": 0.0001041425260718424, "loss": 0.042, "step": 3314 }, { "epoch": 1.4394268345636128, "grad_norm": 0.028614139184355736, "learning_rate": 0.0001041135573580533, "loss": 0.0346, "step": 3315 }, { "epoch": 1.4398610508033, "grad_norm": 0.03070632368326187, "learning_rate": 0.00010408458864426421, "loss": 0.0392, "step": 3316 }, { "epoch": 1.4402952670429874, "grad_norm": 0.030251063406467438, "learning_rate": 0.0001040556199304751, "loss": 0.0386, "step": 3317 }, { "epoch": 1.4407294832826747, "grad_norm": 0.025365928187966347, "learning_rate": 0.00010402665121668597, "loss": 0.0319, "step": 3318 }, { "epoch": 1.4411636995223622, "grad_norm": 0.02654959075152874, "learning_rate": 0.00010399768250289688, "loss": 0.0345, "step": 3319 }, { "epoch": 1.4415979157620495, "grad_norm": 0.04044915363192558, "learning_rate": 0.00010396871378910778, "loss": 0.046, "step": 3320 }, { "epoch": 1.4420321320017369, "grad_norm": 0.029332028701901436, "learning_rate": 0.00010393974507531864, "loss": 0.0352, "step": 3321 }, { "epoch": 1.4424663482414242, "grad_norm": 0.03038622811436653, "learning_rate": 0.00010391077636152955, "loss": 0.037, "step": 3322 }, { "epoch": 1.4429005644811115, "grad_norm": 0.026014646515250206, "learning_rate": 0.00010388180764774045, "loss": 0.0343, "step": 3323 }, { "epoch": 1.443334780720799, "grad_norm": 0.03728277236223221, "learning_rate": 0.00010385283893395134, "loss": 0.0416, "step": 3324 }, { "epoch": 1.4437689969604863, "grad_norm": 0.03403148800134659, "learning_rate": 0.00010382387022016222, "loss": 0.0415, "step": 3325 }, { "epoch": 1.4442032132001736, "grad_norm": 0.02657419629395008, "learning_rate": 0.00010379490150637312, "loss": 0.0275, "step": 3326 }, { "epoch": 1.4446374294398612, "grad_norm": 0.03451921045780182, "learning_rate": 0.00010376593279258403, "loss": 0.0411, "step": 3327 }, { "epoch": 1.4450716456795485, "grad_norm": 0.04036589339375496, "learning_rate": 0.0001037369640787949, "loss": 0.0426, "step": 3328 }, { "epoch": 1.4455058619192358, "grad_norm": 0.03352858126163483, "learning_rate": 0.00010370799536500579, "loss": 0.037, "step": 3329 }, { "epoch": 1.4459400781589231, "grad_norm": 0.028667258098721504, "learning_rate": 0.0001036790266512167, "loss": 0.0337, "step": 3330 }, { "epoch": 1.4463742943986104, "grad_norm": 0.03004029020667076, "learning_rate": 0.00010365005793742759, "loss": 0.0328, "step": 3331 }, { "epoch": 1.4468085106382977, "grad_norm": 0.03188534080982208, "learning_rate": 0.00010362108922363846, "loss": 0.0372, "step": 3332 }, { "epoch": 1.4472427268779853, "grad_norm": 0.02760126441717148, "learning_rate": 0.00010359212050984937, "loss": 0.0326, "step": 3333 }, { "epoch": 1.4476769431176726, "grad_norm": 0.03667876869440079, "learning_rate": 0.00010356315179606027, "loss": 0.0417, "step": 3334 }, { "epoch": 1.44811115935736, "grad_norm": 0.029996925964951515, "learning_rate": 0.00010353418308227115, "loss": 0.0365, "step": 3335 }, { "epoch": 1.4485453755970474, "grad_norm": 0.040038205683231354, "learning_rate": 0.00010350521436848204, "loss": 0.0438, "step": 3336 }, { "epoch": 1.4489795918367347, "grad_norm": 0.03176156058907509, "learning_rate": 0.00010347624565469294, "loss": 0.0326, "step": 3337 }, { "epoch": 1.449413808076422, "grad_norm": 0.040942415595054626, "learning_rate": 0.00010344727694090383, "loss": 0.0449, "step": 3338 }, { "epoch": 1.4498480243161094, "grad_norm": 0.03038792870938778, "learning_rate": 0.00010341830822711471, "loss": 0.0335, "step": 3339 }, { "epoch": 1.4502822405557967, "grad_norm": 0.03754470869898796, "learning_rate": 0.00010338933951332561, "loss": 0.0437, "step": 3340 }, { "epoch": 1.4507164567954842, "grad_norm": 0.02866341732442379, "learning_rate": 0.00010336037079953652, "loss": 0.0271, "step": 3341 }, { "epoch": 1.4511506730351715, "grad_norm": 0.03174388408660889, "learning_rate": 0.0001033314020857474, "loss": 0.0338, "step": 3342 }, { "epoch": 1.4515848892748588, "grad_norm": 0.031860291957855225, "learning_rate": 0.00010330243337195828, "loss": 0.0383, "step": 3343 }, { "epoch": 1.4520191055145464, "grad_norm": 0.033638469874858856, "learning_rate": 0.00010327346465816919, "loss": 0.036, "step": 3344 }, { "epoch": 1.4524533217542337, "grad_norm": 0.031240377575159073, "learning_rate": 0.00010324449594438008, "loss": 0.0394, "step": 3345 }, { "epoch": 1.452887537993921, "grad_norm": 0.030133496969938278, "learning_rate": 0.00010321552723059095, "loss": 0.0381, "step": 3346 }, { "epoch": 1.4533217542336083, "grad_norm": 0.028217071667313576, "learning_rate": 0.00010318655851680186, "loss": 0.0357, "step": 3347 }, { "epoch": 1.4537559704732956, "grad_norm": 0.03458056598901749, "learning_rate": 0.00010315758980301276, "loss": 0.0384, "step": 3348 }, { "epoch": 1.454190186712983, "grad_norm": 0.032685186713933945, "learning_rate": 0.00010312862108922365, "loss": 0.0397, "step": 3349 }, { "epoch": 1.4546244029526705, "grad_norm": 0.034984290599823, "learning_rate": 0.00010309965237543453, "loss": 0.0382, "step": 3350 }, { "epoch": 1.4550586191923578, "grad_norm": 0.028849076479673386, "learning_rate": 0.00010307068366164543, "loss": 0.0378, "step": 3351 }, { "epoch": 1.455492835432045, "grad_norm": 0.025639649480581284, "learning_rate": 0.00010304171494785632, "loss": 0.0304, "step": 3352 }, { "epoch": 1.4559270516717326, "grad_norm": 0.03236394748091698, "learning_rate": 0.0001030127462340672, "loss": 0.0387, "step": 3353 }, { "epoch": 1.45636126791142, "grad_norm": 0.03394026681780815, "learning_rate": 0.0001029837775202781, "loss": 0.0384, "step": 3354 }, { "epoch": 1.4567954841511073, "grad_norm": 0.026026029139757156, "learning_rate": 0.000102954808806489, "loss": 0.03, "step": 3355 }, { "epoch": 1.4572297003907946, "grad_norm": 0.04353984445333481, "learning_rate": 0.0001029258400926999, "loss": 0.0388, "step": 3356 }, { "epoch": 1.457663916630482, "grad_norm": 0.022466301918029785, "learning_rate": 0.00010289687137891077, "loss": 0.027, "step": 3357 }, { "epoch": 1.4580981328701692, "grad_norm": 0.0388190895318985, "learning_rate": 0.00010286790266512168, "loss": 0.0388, "step": 3358 }, { "epoch": 1.4585323491098567, "grad_norm": 0.027438731864094734, "learning_rate": 0.00010283893395133258, "loss": 0.0308, "step": 3359 }, { "epoch": 1.458966565349544, "grad_norm": 0.03653266653418541, "learning_rate": 0.00010280996523754344, "loss": 0.0437, "step": 3360 }, { "epoch": 1.4594007815892314, "grad_norm": 0.04031620919704437, "learning_rate": 0.00010278099652375435, "loss": 0.0435, "step": 3361 }, { "epoch": 1.459834997828919, "grad_norm": 0.03225674852728844, "learning_rate": 0.00010275202780996525, "loss": 0.0373, "step": 3362 }, { "epoch": 1.4602692140686062, "grad_norm": 0.04214489087462425, "learning_rate": 0.00010272305909617614, "loss": 0.0524, "step": 3363 }, { "epoch": 1.4607034303082935, "grad_norm": 0.031885795295238495, "learning_rate": 0.00010269409038238702, "loss": 0.0329, "step": 3364 }, { "epoch": 1.4611376465479808, "grad_norm": 0.03231288120150566, "learning_rate": 0.00010266512166859792, "loss": 0.0378, "step": 3365 }, { "epoch": 1.4615718627876682, "grad_norm": 0.03609391301870346, "learning_rate": 0.00010263615295480882, "loss": 0.0365, "step": 3366 }, { "epoch": 1.4620060790273557, "grad_norm": 0.034693483263254166, "learning_rate": 0.0001026071842410197, "loss": 0.0422, "step": 3367 }, { "epoch": 1.462440295267043, "grad_norm": 0.03828537464141846, "learning_rate": 0.00010257821552723059, "loss": 0.0456, "step": 3368 }, { "epoch": 1.4628745115067303, "grad_norm": 0.03838023915886879, "learning_rate": 0.0001025492468134415, "loss": 0.0437, "step": 3369 }, { "epoch": 1.4633087277464178, "grad_norm": 0.037904292345047, "learning_rate": 0.00010252027809965238, "loss": 0.0421, "step": 3370 }, { "epoch": 1.4637429439861052, "grad_norm": 0.029323799535632133, "learning_rate": 0.00010249130938586326, "loss": 0.0342, "step": 3371 }, { "epoch": 1.4641771602257925, "grad_norm": 0.03726832941174507, "learning_rate": 0.00010246234067207416, "loss": 0.0379, "step": 3372 }, { "epoch": 1.4646113764654798, "grad_norm": 0.028734877705574036, "learning_rate": 0.00010243337195828507, "loss": 0.0325, "step": 3373 }, { "epoch": 1.465045592705167, "grad_norm": 0.031351253390312195, "learning_rate": 0.00010240440324449594, "loss": 0.0375, "step": 3374 }, { "epoch": 1.4654798089448544, "grad_norm": 0.034263450652360916, "learning_rate": 0.00010237543453070683, "loss": 0.0405, "step": 3375 }, { "epoch": 1.465914025184542, "grad_norm": 0.03257537633180618, "learning_rate": 0.00010234646581691774, "loss": 0.0393, "step": 3376 }, { "epoch": 1.4663482414242293, "grad_norm": 0.03476444259285927, "learning_rate": 0.00010231749710312863, "loss": 0.0387, "step": 3377 }, { "epoch": 1.4667824576639166, "grad_norm": 0.027681834995746613, "learning_rate": 0.0001022885283893395, "loss": 0.0336, "step": 3378 }, { "epoch": 1.467216673903604, "grad_norm": 0.031740736216306686, "learning_rate": 0.00010225955967555041, "loss": 0.037, "step": 3379 }, { "epoch": 1.4676508901432914, "grad_norm": 0.028416205197572708, "learning_rate": 0.00010223059096176131, "loss": 0.031, "step": 3380 }, { "epoch": 1.4680851063829787, "grad_norm": 0.04032799229025841, "learning_rate": 0.00010220162224797219, "loss": 0.0506, "step": 3381 }, { "epoch": 1.468519322622666, "grad_norm": 0.042136695235967636, "learning_rate": 0.00010217265353418308, "loss": 0.0531, "step": 3382 }, { "epoch": 1.4689535388623534, "grad_norm": 0.035203997045755386, "learning_rate": 0.00010214368482039398, "loss": 0.0387, "step": 3383 }, { "epoch": 1.469387755102041, "grad_norm": 0.029803968966007233, "learning_rate": 0.00010211471610660487, "loss": 0.0354, "step": 3384 }, { "epoch": 1.4698219713417282, "grad_norm": 0.032798588275909424, "learning_rate": 0.00010208574739281575, "loss": 0.0372, "step": 3385 }, { "epoch": 1.4702561875814155, "grad_norm": 0.03250119462609291, "learning_rate": 0.00010205677867902665, "loss": 0.0381, "step": 3386 }, { "epoch": 1.4706904038211028, "grad_norm": 0.03178366273641586, "learning_rate": 0.00010202780996523756, "loss": 0.0374, "step": 3387 }, { "epoch": 1.4711246200607904, "grad_norm": 0.045984238386154175, "learning_rate": 0.00010199884125144845, "loss": 0.0509, "step": 3388 }, { "epoch": 1.4715588363004777, "grad_norm": 0.029115086421370506, "learning_rate": 0.00010196987253765932, "loss": 0.0375, "step": 3389 }, { "epoch": 1.471993052540165, "grad_norm": 0.03576627001166344, "learning_rate": 0.00010194090382387023, "loss": 0.0467, "step": 3390 }, { "epoch": 1.4724272687798523, "grad_norm": 0.02859627641737461, "learning_rate": 0.00010191193511008112, "loss": 0.0325, "step": 3391 }, { "epoch": 1.4728614850195396, "grad_norm": 0.03251340240240097, "learning_rate": 0.000101882966396292, "loss": 0.0382, "step": 3392 }, { "epoch": 1.4732957012592272, "grad_norm": 0.03137362003326416, "learning_rate": 0.0001018539976825029, "loss": 0.0368, "step": 3393 }, { "epoch": 1.4737299174989145, "grad_norm": 0.029423438012599945, "learning_rate": 0.0001018250289687138, "loss": 0.0337, "step": 3394 }, { "epoch": 1.4741641337386018, "grad_norm": 0.03970061615109444, "learning_rate": 0.00010179606025492469, "loss": 0.0475, "step": 3395 }, { "epoch": 1.4745983499782893, "grad_norm": 0.034941863268613815, "learning_rate": 0.00010176709154113557, "loss": 0.0431, "step": 3396 }, { "epoch": 1.4750325662179766, "grad_norm": 0.03466002643108368, "learning_rate": 0.00010173812282734647, "loss": 0.0347, "step": 3397 }, { "epoch": 1.475466782457664, "grad_norm": 0.030201608315110207, "learning_rate": 0.00010170915411355738, "loss": 0.0413, "step": 3398 }, { "epoch": 1.4759009986973513, "grad_norm": 0.030097804963588715, "learning_rate": 0.00010168018539976824, "loss": 0.033, "step": 3399 }, { "epoch": 1.4763352149370386, "grad_norm": 0.029949581250548363, "learning_rate": 0.00010165121668597914, "loss": 0.033, "step": 3400 }, { "epoch": 1.4767694311767259, "grad_norm": 0.040669020265340805, "learning_rate": 0.00010162224797219005, "loss": 0.0459, "step": 3401 }, { "epoch": 1.4772036474164134, "grad_norm": 0.03031211346387863, "learning_rate": 0.00010159327925840094, "loss": 0.0302, "step": 3402 }, { "epoch": 1.4776378636561007, "grad_norm": 0.039463043212890625, "learning_rate": 0.00010156431054461181, "loss": 0.0473, "step": 3403 }, { "epoch": 1.478072079895788, "grad_norm": 0.03974929079413414, "learning_rate": 0.00010153534183082272, "loss": 0.0448, "step": 3404 }, { "epoch": 1.4785062961354756, "grad_norm": 0.03540648892521858, "learning_rate": 0.00010150637311703362, "loss": 0.0442, "step": 3405 }, { "epoch": 1.4789405123751629, "grad_norm": 0.03644976764917374, "learning_rate": 0.0001014774044032445, "loss": 0.0455, "step": 3406 }, { "epoch": 1.4793747286148502, "grad_norm": 0.026639649644494057, "learning_rate": 0.00010144843568945539, "loss": 0.0334, "step": 3407 }, { "epoch": 1.4798089448545375, "grad_norm": 0.03132033348083496, "learning_rate": 0.00010141946697566629, "loss": 0.0373, "step": 3408 }, { "epoch": 1.4802431610942248, "grad_norm": 0.03561646491289139, "learning_rate": 0.00010139049826187718, "loss": 0.0456, "step": 3409 }, { "epoch": 1.4806773773339124, "grad_norm": 0.03400665521621704, "learning_rate": 0.00010136152954808806, "loss": 0.0412, "step": 3410 }, { "epoch": 1.4811115935735997, "grad_norm": 0.02562865987420082, "learning_rate": 0.00010133256083429896, "loss": 0.0333, "step": 3411 }, { "epoch": 1.481545809813287, "grad_norm": 0.027811691164970398, "learning_rate": 0.00010130359212050987, "loss": 0.0328, "step": 3412 }, { "epoch": 1.4819800260529745, "grad_norm": 0.034765202552080154, "learning_rate": 0.00010127462340672074, "loss": 0.0468, "step": 3413 }, { "epoch": 1.4824142422926618, "grad_norm": 0.03131668269634247, "learning_rate": 0.00010124565469293163, "loss": 0.0386, "step": 3414 }, { "epoch": 1.4828484585323491, "grad_norm": 0.028582720085978508, "learning_rate": 0.00010121668597914254, "loss": 0.0338, "step": 3415 }, { "epoch": 1.4832826747720365, "grad_norm": 0.031423334032297134, "learning_rate": 0.00010118771726535343, "loss": 0.0338, "step": 3416 }, { "epoch": 1.4837168910117238, "grad_norm": 0.02855740115046501, "learning_rate": 0.0001011587485515643, "loss": 0.0331, "step": 3417 }, { "epoch": 1.484151107251411, "grad_norm": 0.03403674438595772, "learning_rate": 0.00010112977983777521, "loss": 0.04, "step": 3418 }, { "epoch": 1.4845853234910986, "grad_norm": 0.049074187874794006, "learning_rate": 0.00010110081112398611, "loss": 0.0429, "step": 3419 }, { "epoch": 1.485019539730786, "grad_norm": 0.028765501454472542, "learning_rate": 0.00010107184241019699, "loss": 0.0341, "step": 3420 }, { "epoch": 1.4854537559704732, "grad_norm": 0.043500449508428574, "learning_rate": 0.00010104287369640788, "loss": 0.046, "step": 3421 }, { "epoch": 1.4858879722101608, "grad_norm": 0.03735543042421341, "learning_rate": 0.00010101390498261878, "loss": 0.0377, "step": 3422 }, { "epoch": 1.486322188449848, "grad_norm": 0.031222129240632057, "learning_rate": 0.00010098493626882967, "loss": 0.0354, "step": 3423 }, { "epoch": 1.4867564046895354, "grad_norm": 0.03807615861296654, "learning_rate": 0.00010095596755504055, "loss": 0.0465, "step": 3424 }, { "epoch": 1.4871906209292227, "grad_norm": 0.029024573042988777, "learning_rate": 0.00010092699884125145, "loss": 0.0327, "step": 3425 }, { "epoch": 1.48762483716891, "grad_norm": 0.03897327929735184, "learning_rate": 0.00010089803012746236, "loss": 0.051, "step": 3426 }, { "epoch": 1.4880590534085973, "grad_norm": 0.06425464898347855, "learning_rate": 0.00010086906141367323, "loss": 0.0654, "step": 3427 }, { "epoch": 1.4884932696482849, "grad_norm": 0.037279073148965836, "learning_rate": 0.00010084009269988412, "loss": 0.0398, "step": 3428 }, { "epoch": 1.4889274858879722, "grad_norm": 0.03177998960018158, "learning_rate": 0.00010081112398609503, "loss": 0.0331, "step": 3429 }, { "epoch": 1.4893617021276595, "grad_norm": 0.0315001904964447, "learning_rate": 0.00010078215527230592, "loss": 0.0418, "step": 3430 }, { "epoch": 1.489795918367347, "grad_norm": 0.03018135018646717, "learning_rate": 0.00010075318655851679, "loss": 0.0343, "step": 3431 }, { "epoch": 1.4902301346070344, "grad_norm": 0.023591920733451843, "learning_rate": 0.0001007242178447277, "loss": 0.0272, "step": 3432 }, { "epoch": 1.4906643508467217, "grad_norm": 0.03260718286037445, "learning_rate": 0.0001006952491309386, "loss": 0.0375, "step": 3433 }, { "epoch": 1.491098567086409, "grad_norm": 0.02847980707883835, "learning_rate": 0.00010066628041714949, "loss": 0.0366, "step": 3434 }, { "epoch": 1.4915327833260963, "grad_norm": 0.04872502014040947, "learning_rate": 0.00010063731170336037, "loss": 0.0664, "step": 3435 }, { "epoch": 1.4919669995657838, "grad_norm": 0.03252248466014862, "learning_rate": 0.00010060834298957127, "loss": 0.0357, "step": 3436 }, { "epoch": 1.4924012158054711, "grad_norm": 0.031972359865903854, "learning_rate": 0.00010057937427578217, "loss": 0.0363, "step": 3437 }, { "epoch": 1.4928354320451585, "grad_norm": 0.03260812163352966, "learning_rate": 0.00010055040556199304, "loss": 0.0378, "step": 3438 }, { "epoch": 1.493269648284846, "grad_norm": 0.028248609974980354, "learning_rate": 0.00010052143684820394, "loss": 0.0335, "step": 3439 }, { "epoch": 1.4937038645245333, "grad_norm": 0.029487421736121178, "learning_rate": 0.00010049246813441484, "loss": 0.0349, "step": 3440 }, { "epoch": 1.4941380807642206, "grad_norm": 0.03654861822724342, "learning_rate": 0.00010046349942062574, "loss": 0.047, "step": 3441 }, { "epoch": 1.494572297003908, "grad_norm": 0.03188052028417587, "learning_rate": 0.00010043453070683661, "loss": 0.0416, "step": 3442 }, { "epoch": 1.4950065132435952, "grad_norm": 0.043259184807538986, "learning_rate": 0.00010040556199304752, "loss": 0.05, "step": 3443 }, { "epoch": 1.4954407294832825, "grad_norm": 0.03597338870167732, "learning_rate": 0.00010037659327925842, "loss": 0.0392, "step": 3444 }, { "epoch": 1.49587494572297, "grad_norm": 0.04451984539628029, "learning_rate": 0.0001003476245654693, "loss": 0.0507, "step": 3445 }, { "epoch": 1.4963091619626574, "grad_norm": 0.03480066731572151, "learning_rate": 0.00010031865585168019, "loss": 0.0367, "step": 3446 }, { "epoch": 1.4967433782023447, "grad_norm": 0.036181192845106125, "learning_rate": 0.00010028968713789109, "loss": 0.0469, "step": 3447 }, { "epoch": 1.4971775944420322, "grad_norm": 0.02753407321870327, "learning_rate": 0.00010026071842410198, "loss": 0.0312, "step": 3448 }, { "epoch": 1.4976118106817196, "grad_norm": 0.029450923204421997, "learning_rate": 0.00010023174971031286, "loss": 0.0344, "step": 3449 }, { "epoch": 1.4980460269214069, "grad_norm": 0.04387199133634567, "learning_rate": 0.00010020278099652376, "loss": 0.0425, "step": 3450 }, { "epoch": 1.4984802431610942, "grad_norm": 0.034993767738342285, "learning_rate": 0.00010017381228273466, "loss": 0.0386, "step": 3451 }, { "epoch": 1.4989144594007815, "grad_norm": 0.02988545224070549, "learning_rate": 0.00010014484356894554, "loss": 0.0366, "step": 3452 }, { "epoch": 1.499348675640469, "grad_norm": 0.039620961993932724, "learning_rate": 0.00010011587485515643, "loss": 0.0404, "step": 3453 }, { "epoch": 1.4997828918801563, "grad_norm": 0.0360754132270813, "learning_rate": 0.00010008690614136733, "loss": 0.0352, "step": 3454 }, { "epoch": 1.5002171081198437, "grad_norm": 0.030626730993390083, "learning_rate": 0.00010005793742757822, "loss": 0.0371, "step": 3455 }, { "epoch": 1.5006513243595312, "grad_norm": 0.04193166643381119, "learning_rate": 0.0001000289687137891, "loss": 0.0409, "step": 3456 }, { "epoch": 1.5010855405992185, "grad_norm": 0.03243810683488846, "learning_rate": 0.0001, "loss": 0.0394, "step": 3457 }, { "epoch": 1.5015197568389058, "grad_norm": 0.032020773738622665, "learning_rate": 9.99710312862109e-05, "loss": 0.0455, "step": 3458 }, { "epoch": 1.5019539730785931, "grad_norm": 0.033629849553108215, "learning_rate": 9.994206257242179e-05, "loss": 0.0374, "step": 3459 }, { "epoch": 1.5023881893182804, "grad_norm": 0.045886922627687454, "learning_rate": 9.991309385863268e-05, "loss": 0.0367, "step": 3460 }, { "epoch": 1.5028224055579678, "grad_norm": 0.026090702041983604, "learning_rate": 9.988412514484358e-05, "loss": 0.034, "step": 3461 }, { "epoch": 1.503256621797655, "grad_norm": 0.02858106605708599, "learning_rate": 9.985515643105447e-05, "loss": 0.0377, "step": 3462 }, { "epoch": 1.5036908380373426, "grad_norm": 0.03803960233926773, "learning_rate": 9.982618771726536e-05, "loss": 0.0474, "step": 3463 }, { "epoch": 1.50412505427703, "grad_norm": 0.031110800802707672, "learning_rate": 9.979721900347625e-05, "loss": 0.0317, "step": 3464 }, { "epoch": 1.5045592705167175, "grad_norm": 0.025930413976311684, "learning_rate": 9.976825028968714e-05, "loss": 0.0313, "step": 3465 }, { "epoch": 1.5049934867564048, "grad_norm": 0.034064143896102905, "learning_rate": 9.973928157589803e-05, "loss": 0.0385, "step": 3466 }, { "epoch": 1.505427702996092, "grad_norm": 0.03255598992109299, "learning_rate": 9.971031286210892e-05, "loss": 0.0407, "step": 3467 }, { "epoch": 1.5058619192357794, "grad_norm": 0.031519632786512375, "learning_rate": 9.968134414831982e-05, "loss": 0.0371, "step": 3468 }, { "epoch": 1.5062961354754667, "grad_norm": 0.024471892043948174, "learning_rate": 9.965237543453071e-05, "loss": 0.0275, "step": 3469 }, { "epoch": 1.506730351715154, "grad_norm": 0.03765787556767464, "learning_rate": 9.96234067207416e-05, "loss": 0.0408, "step": 3470 }, { "epoch": 1.5071645679548415, "grad_norm": 0.03735783323645592, "learning_rate": 9.95944380069525e-05, "loss": 0.042, "step": 3471 }, { "epoch": 1.5075987841945289, "grad_norm": 0.031440261751413345, "learning_rate": 9.95654692931634e-05, "loss": 0.0383, "step": 3472 }, { "epoch": 1.5080330004342164, "grad_norm": 0.03892599418759346, "learning_rate": 9.953650057937427e-05, "loss": 0.0469, "step": 3473 }, { "epoch": 1.5084672166739037, "grad_norm": 0.03277905657887459, "learning_rate": 9.950753186558516e-05, "loss": 0.0421, "step": 3474 }, { "epoch": 1.508901432913591, "grad_norm": 0.03168017417192459, "learning_rate": 9.947856315179607e-05, "loss": 0.0377, "step": 3475 }, { "epoch": 1.5093356491532783, "grad_norm": 0.042192284017801285, "learning_rate": 9.944959443800696e-05, "loss": 0.0457, "step": 3476 }, { "epoch": 1.5097698653929656, "grad_norm": 0.04205150529742241, "learning_rate": 9.942062572421785e-05, "loss": 0.0418, "step": 3477 }, { "epoch": 1.510204081632653, "grad_norm": 0.05229579284787178, "learning_rate": 9.939165701042874e-05, "loss": 0.0476, "step": 3478 }, { "epoch": 1.5106382978723403, "grad_norm": 0.030060801655054092, "learning_rate": 9.936268829663964e-05, "loss": 0.0356, "step": 3479 }, { "epoch": 1.5110725141120278, "grad_norm": 0.033268120139837265, "learning_rate": 9.933371958285053e-05, "loss": 0.0396, "step": 3480 }, { "epoch": 1.5115067303517151, "grad_norm": 0.032858263701200485, "learning_rate": 9.930475086906141e-05, "loss": 0.0389, "step": 3481 }, { "epoch": 1.5119409465914027, "grad_norm": 0.029995379969477654, "learning_rate": 9.927578215527231e-05, "loss": 0.037, "step": 3482 }, { "epoch": 1.51237516283109, "grad_norm": 0.029411448165774345, "learning_rate": 9.92468134414832e-05, "loss": 0.0372, "step": 3483 }, { "epoch": 1.5128093790707773, "grad_norm": 0.02558985911309719, "learning_rate": 9.92178447276941e-05, "loss": 0.0302, "step": 3484 }, { "epoch": 1.5132435953104646, "grad_norm": 0.029958924278616905, "learning_rate": 9.918887601390498e-05, "loss": 0.0349, "step": 3485 }, { "epoch": 1.513677811550152, "grad_norm": 0.03937581554055214, "learning_rate": 9.915990730011589e-05, "loss": 0.0473, "step": 3486 }, { "epoch": 1.5141120277898392, "grad_norm": 0.03671418875455856, "learning_rate": 9.913093858632678e-05, "loss": 0.0441, "step": 3487 }, { "epoch": 1.5145462440295268, "grad_norm": 0.038126688450574875, "learning_rate": 9.910196987253765e-05, "loss": 0.0506, "step": 3488 }, { "epoch": 1.514980460269214, "grad_norm": 0.033849652856588364, "learning_rate": 9.907300115874856e-05, "loss": 0.044, "step": 3489 }, { "epoch": 1.5154146765089014, "grad_norm": 0.029398959130048752, "learning_rate": 9.904403244495945e-05, "loss": 0.0362, "step": 3490 }, { "epoch": 1.515848892748589, "grad_norm": 0.037130411714315414, "learning_rate": 9.901506373117034e-05, "loss": 0.0422, "step": 3491 }, { "epoch": 1.5162831089882762, "grad_norm": 0.034389983862638474, "learning_rate": 9.898609501738123e-05, "loss": 0.0411, "step": 3492 }, { "epoch": 1.5167173252279635, "grad_norm": 0.02658918686211109, "learning_rate": 9.895712630359213e-05, "loss": 0.0298, "step": 3493 }, { "epoch": 1.5171515414676509, "grad_norm": 0.033717505633831024, "learning_rate": 9.892815758980302e-05, "loss": 0.0386, "step": 3494 }, { "epoch": 1.5175857577073382, "grad_norm": 0.0295511893928051, "learning_rate": 9.889918887601391e-05, "loss": 0.031, "step": 3495 }, { "epoch": 1.5180199739470255, "grad_norm": 0.02841942571103573, "learning_rate": 9.88702201622248e-05, "loss": 0.0331, "step": 3496 }, { "epoch": 1.518454190186713, "grad_norm": 0.03160611912608147, "learning_rate": 9.884125144843569e-05, "loss": 0.0377, "step": 3497 }, { "epoch": 1.5188884064264003, "grad_norm": 0.025281641632318497, "learning_rate": 9.881228273464658e-05, "loss": 0.0335, "step": 3498 }, { "epoch": 1.5193226226660879, "grad_norm": 0.027666987851262093, "learning_rate": 9.878331402085747e-05, "loss": 0.0329, "step": 3499 }, { "epoch": 1.5197568389057752, "grad_norm": 0.042447470128536224, "learning_rate": 9.875434530706838e-05, "loss": 0.0427, "step": 3500 } ], "logging_steps": 1, "max_steps": 6909, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.254060958945403e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }