{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9997083272568177, "eval_steps": 200, "global_step": 3428, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005833454863642993, "grad_norm": 17.78308868408203, "learning_rate": 0.0, "loss": 5.8378, "step": 1 }, { "epoch": 0.0011666909727285986, "grad_norm": 5.597744464874268, "learning_rate": 7.525749891599529e-06, "loss": 2.3217, "step": 2 }, { "epoch": 0.0017500364590928978, "grad_norm": 9.02523136138916, "learning_rate": 1.192803136799156e-05, "loss": 2.1816, "step": 3 }, { "epoch": 0.002333381945457197, "grad_norm": 3.38098406791687, "learning_rate": 1.5051499783199057e-05, "loss": 1.7371, "step": 4 }, { "epoch": 0.002916727431821496, "grad_norm": 3.2741100788116455, "learning_rate": 1.7474250108400467e-05, "loss": 1.6348, "step": 5 }, { "epoch": 0.0035000729181857955, "grad_norm": 3.280085563659668, "learning_rate": 1.945378125959109e-05, "loss": 1.6431, "step": 6 }, { "epoch": 0.004083418404550095, "grad_norm": 2.7186739444732666, "learning_rate": 2.1127451000356418e-05, "loss": 1.452, "step": 7 }, { "epoch": 0.004666763890914394, "grad_norm": 2.5042686462402344, "learning_rate": 2.2577249674798584e-05, "loss": 1.3986, "step": 8 }, { "epoch": 0.005250109377278694, "grad_norm": 2.6429643630981445, "learning_rate": 2.385606273598312e-05, "loss": 1.5973, "step": 9 }, { "epoch": 0.005833454863642992, "grad_norm": 2.739990711212158, "learning_rate": 2.4999999999999998e-05, "loss": 1.4543, "step": 10 }, { "epoch": 0.006416800350007292, "grad_norm": 2.085124969482422, "learning_rate": 2.6034817128955623e-05, "loss": 1.1983, "step": 11 }, { "epoch": 0.007000145836371591, "grad_norm": 2.3114278316497803, "learning_rate": 2.6979531151190617e-05, "loss": 1.5163, "step": 12 }, { "epoch": 0.0075834913227358905, "grad_norm": 2.1083810329437256, "learning_rate": 2.7848583807670913e-05, "loss": 1.3752, "step": 13 }, { "epoch": 0.00816683680910019, "grad_norm": 2.667152166366577, "learning_rate": 2.8653200891955945e-05, "loss": 1.3118, "step": 14 }, { "epoch": 0.008750182295464488, "grad_norm": 2.096151113510132, "learning_rate": 2.940228147639203e-05, "loss": 1.1005, "step": 15 }, { "epoch": 0.009333527781828789, "grad_norm": 2.579360246658325, "learning_rate": 3.0102999566398115e-05, "loss": 1.5375, "step": 16 }, { "epoch": 0.009916873268193087, "grad_norm": 2.0573806762695312, "learning_rate": 3.076122303445685e-05, "loss": 1.3534, "step": 17 }, { "epoch": 0.010500218754557387, "grad_norm": 1.6891591548919678, "learning_rate": 3.1381812627582646e-05, "loss": 1.1602, "step": 18 }, { "epoch": 0.011083564240921686, "grad_norm": 1.654665470123291, "learning_rate": 3.1968840023820715e-05, "loss": 1.2598, "step": 19 }, { "epoch": 0.011666909727285985, "grad_norm": 1.51304030418396, "learning_rate": 3.2525749891599525e-05, "loss": 1.3738, "step": 20 }, { "epoch": 0.012250255213650285, "grad_norm": 1.7645764350891113, "learning_rate": 3.305548236834798e-05, "loss": 1.3982, "step": 21 }, { "epoch": 0.012833600700014583, "grad_norm": 2.0786468982696533, "learning_rate": 3.3560567020555153e-05, "loss": 1.2424, "step": 22 }, { "epoch": 0.013416946186378884, "grad_norm": 1.5325442552566528, "learning_rate": 3.404319590043982e-05, "loss": 1.2803, "step": 23 }, { "epoch": 0.014000291672743182, "grad_norm": 1.5579572916030884, "learning_rate": 3.450528104279015e-05, "loss": 1.4129, "step": 24 }, { "epoch": 0.01458363715910748, "grad_norm": 1.6682405471801758, "learning_rate": 3.4948500216800935e-05, "loss": 1.1793, "step": 25 }, { "epoch": 0.015166982645471781, "grad_norm": 1.558914303779602, "learning_rate": 3.537433369927044e-05, "loss": 1.2917, "step": 26 }, { "epoch": 0.01575032813183608, "grad_norm": 2.367410182952881, "learning_rate": 3.578409410397468e-05, "loss": 1.3104, "step": 27 }, { "epoch": 0.01633367361820038, "grad_norm": 1.7541717290878296, "learning_rate": 3.6178950783555475e-05, "loss": 1.1249, "step": 28 }, { "epoch": 0.016917019104564678, "grad_norm": 1.3140240907669067, "learning_rate": 3.65599499474739e-05, "loss": 1.0676, "step": 29 }, { "epoch": 0.017500364590928977, "grad_norm": 2.0365586280822754, "learning_rate": 3.6928031367991554e-05, "loss": 1.2787, "step": 30 }, { "epoch": 0.018083710077293275, "grad_norm": 1.875240683555603, "learning_rate": 3.728404234585681e-05, "loss": 1.2876, "step": 31 }, { "epoch": 0.018667055563657577, "grad_norm": 1.6128060817718506, "learning_rate": 3.762874945799765e-05, "loss": 1.1359, "step": 32 }, { "epoch": 0.019250401050021876, "grad_norm": 1.761427640914917, "learning_rate": 3.796284849694718e-05, "loss": 1.2898, "step": 33 }, { "epoch": 0.019833746536386174, "grad_norm": 1.8005716800689697, "learning_rate": 3.8286972926056376e-05, "loss": 1.0982, "step": 34 }, { "epoch": 0.020417092022750473, "grad_norm": 1.7455655336380005, "learning_rate": 3.8601701108756885e-05, "loss": 1.1419, "step": 35 }, { "epoch": 0.021000437509114775, "grad_norm": 1.207555890083313, "learning_rate": 3.890756251918218e-05, "loss": 1.3689, "step": 36 }, { "epoch": 0.021583782995479073, "grad_norm": 1.4690238237380981, "learning_rate": 3.920504310167487e-05, "loss": 1.0896, "step": 37 }, { "epoch": 0.022167128481843372, "grad_norm": 1.31458580493927, "learning_rate": 3.949458991542025e-05, "loss": 1.2124, "step": 38 }, { "epoch": 0.02275047396820767, "grad_norm": 1.7508232593536377, "learning_rate": 3.977661517566247e-05, "loss": 1.0062, "step": 39 }, { "epoch": 0.02333381945457197, "grad_norm": 1.7391034364700317, "learning_rate": 4.005149978319905e-05, "loss": 1.1858, "step": 40 }, { "epoch": 0.02391716494093627, "grad_norm": 1.479203462600708, "learning_rate": 4.031959641799338e-05, "loss": 1.1886, "step": 41 }, { "epoch": 0.02450051042730057, "grad_norm": 1.7898560762405396, "learning_rate": 4.058123225994751e-05, "loss": 1.204, "step": 42 }, { "epoch": 0.025083855913664868, "grad_norm": 1.422057867050171, "learning_rate": 4.0836711389489654e-05, "loss": 1.1607, "step": 43 }, { "epoch": 0.025667201400029167, "grad_norm": 2.161949872970581, "learning_rate": 4.108631691215468e-05, "loss": 1.1471, "step": 44 }, { "epoch": 0.026250546886393465, "grad_norm": 1.9058781862258911, "learning_rate": 4.133031284438358e-05, "loss": 1.0656, "step": 45 }, { "epoch": 0.026833892372757767, "grad_norm": 1.6354972124099731, "learning_rate": 4.156894579203935e-05, "loss": 1.0579, "step": 46 }, { "epoch": 0.027417237859122066, "grad_norm": 1.6435108184814453, "learning_rate": 4.180244644839293e-05, "loss": 1.0281, "step": 47 }, { "epoch": 0.028000583345486364, "grad_norm": 1.7230783700942993, "learning_rate": 4.203103093438968e-05, "loss": 1.086, "step": 48 }, { "epoch": 0.028583928831850663, "grad_norm": 1.4264886379241943, "learning_rate": 4.2254902000712836e-05, "loss": 1.1384, "step": 49 }, { "epoch": 0.02916727431821496, "grad_norm": 1.6517153978347778, "learning_rate": 4.247425010840046e-05, "loss": 1.2282, "step": 50 }, { "epoch": 0.029750619804579263, "grad_norm": 2.1615817546844482, "learning_rate": 4.2689254402448405e-05, "loss": 1.2475, "step": 51 }, { "epoch": 0.030333965290943562, "grad_norm": 1.5791531801223755, "learning_rate": 4.290008359086998e-05, "loss": 1.4099, "step": 52 }, { "epoch": 0.03091731077730786, "grad_norm": 1.5546684265136719, "learning_rate": 4.310689674001973e-05, "loss": 1.2336, "step": 53 }, { "epoch": 0.03150065626367216, "grad_norm": 1.5784398317337036, "learning_rate": 4.330984399557421e-05, "loss": 1.3104, "step": 54 }, { "epoch": 0.03208400175003646, "grad_norm": 1.2873289585113525, "learning_rate": 4.350906723735609e-05, "loss": 1.3342, "step": 55 }, { "epoch": 0.03266734723640076, "grad_norm": 1.6933963298797607, "learning_rate": 4.370470067515501e-05, "loss": 1.1348, "step": 56 }, { "epoch": 0.033250692722765054, "grad_norm": 1.5604673624038696, "learning_rate": 4.3896871391812285e-05, "loss": 1.3968, "step": 57 }, { "epoch": 0.033834038209129357, "grad_norm": 1.5108568668365479, "learning_rate": 4.408569983907343e-05, "loss": 1.1997, "step": 58 }, { "epoch": 0.03441738369549366, "grad_norm": 3.0541892051696777, "learning_rate": 4.42713002910536e-05, "loss": 1.3484, "step": 59 }, { "epoch": 0.035000729181857954, "grad_norm": 1.2510807514190674, "learning_rate": 4.445378125959108e-05, "loss": 1.3166, "step": 60 }, { "epoch": 0.035584074668222256, "grad_norm": 1.6028800010681152, "learning_rate": 4.463324587526917e-05, "loss": 1.145, "step": 61 }, { "epoch": 0.03616742015458655, "grad_norm": 1.5321674346923828, "learning_rate": 4.4809792237456346e-05, "loss": 1.2243, "step": 62 }, { "epoch": 0.03675076564095085, "grad_norm": 1.4574953317642212, "learning_rate": 4.498351373633954e-05, "loss": 0.9716, "step": 63 }, { "epoch": 0.037334111127315155, "grad_norm": 1.2741219997406006, "learning_rate": 4.515449934959717e-05, "loss": 1.083, "step": 64 }, { "epoch": 0.03791745661367945, "grad_norm": 1.4046497344970703, "learning_rate": 4.532283391607138e-05, "loss": 1.1772, "step": 65 }, { "epoch": 0.03850080210004375, "grad_norm": 1.3555320501327515, "learning_rate": 4.548859838854671e-05, "loss": 0.9133, "step": 66 }, { "epoch": 0.03908414758640805, "grad_norm": 1.6886674165725708, "learning_rate": 4.565187006752065e-05, "loss": 1.1938, "step": 67 }, { "epoch": 0.03966749307277235, "grad_norm": 1.430363416671753, "learning_rate": 4.581272281765591e-05, "loss": 1.1537, "step": 68 }, { "epoch": 0.04025083855913665, "grad_norm": 1.6385464668273926, "learning_rate": 4.597122726843138e-05, "loss": 1.1556, "step": 69 }, { "epoch": 0.040834184045500946, "grad_norm": 1.4001507759094238, "learning_rate": 4.612745100035642e-05, "loss": 1.1733, "step": 70 }, { "epoch": 0.04141752953186525, "grad_norm": 1.5451918840408325, "learning_rate": 4.628145871797688e-05, "loss": 1.197, "step": 71 }, { "epoch": 0.04200087501822955, "grad_norm": 1.5024776458740234, "learning_rate": 4.643331241078171e-05, "loss": 1.3481, "step": 72 }, { "epoch": 0.042584220504593845, "grad_norm": 1.5428893566131592, "learning_rate": 4.658307150301139e-05, "loss": 1.1476, "step": 73 }, { "epoch": 0.04316756599095815, "grad_norm": 1.2959415912628174, "learning_rate": 4.67307929932744e-05, "loss": 1.0119, "step": 74 }, { "epoch": 0.04375091147732244, "grad_norm": 1.198490023612976, "learning_rate": 4.687653158479249e-05, "loss": 1.24, "step": 75 }, { "epoch": 0.044334256963686744, "grad_norm": 1.3311015367507935, "learning_rate": 4.702033980701978e-05, "loss": 1.1441, "step": 76 }, { "epoch": 0.044917602450051046, "grad_norm": 1.4103642702102661, "learning_rate": 4.716226812931204e-05, "loss": 1.0517, "step": 77 }, { "epoch": 0.04550094793641534, "grad_norm": 1.260272741317749, "learning_rate": 4.7302365067262006e-05, "loss": 1.2173, "step": 78 }, { "epoch": 0.04608429342277964, "grad_norm": 1.4968616962432861, "learning_rate": 4.744067728226103e-05, "loss": 1.0362, "step": 79 }, { "epoch": 0.04666763890914394, "grad_norm": 1.2590322494506836, "learning_rate": 4.757724967479858e-05, "loss": 1.3106, "step": 80 }, { "epoch": 0.04725098439550824, "grad_norm": 1.2166228294372559, "learning_rate": 4.771212547196624e-05, "loss": 1.0736, "step": 81 }, { "epoch": 0.04783432988187254, "grad_norm": 1.19132399559021, "learning_rate": 4.7845346309592914e-05, "loss": 0.9754, "step": 82 }, { "epoch": 0.04841767536823684, "grad_norm": 1.3279188871383667, "learning_rate": 4.7976952309401844e-05, "loss": 1.1815, "step": 83 }, { "epoch": 0.04900102085460114, "grad_norm": 1.5890907049179077, "learning_rate": 4.810698215154703e-05, "loss": 1.0381, "step": 84 }, { "epoch": 0.049584366340965434, "grad_norm": 1.2219558954238892, "learning_rate": 4.823547314285732e-05, "loss": 0.8901, "step": 85 }, { "epoch": 0.050167711827329736, "grad_norm": 1.9673529863357544, "learning_rate": 4.836246128108918e-05, "loss": 1.087, "step": 86 }, { "epoch": 0.05075105731369404, "grad_norm": 1.6241381168365479, "learning_rate": 4.8487981315465456e-05, "loss": 1.328, "step": 87 }, { "epoch": 0.05133440280005833, "grad_norm": 1.289197325706482, "learning_rate": 4.8612066803754214e-05, "loss": 1.3474, "step": 88 }, { "epoch": 0.051917748286422635, "grad_norm": 1.4262926578521729, "learning_rate": 4.873475016612281e-05, "loss": 1.0049, "step": 89 }, { "epoch": 0.05250109377278693, "grad_norm": 1.382620930671692, "learning_rate": 4.885606273598312e-05, "loss": 0.94, "step": 90 }, { "epoch": 0.05308443925915123, "grad_norm": 1.5160958766937256, "learning_rate": 4.897603480802733e-05, "loss": 1.1705, "step": 91 }, { "epoch": 0.053667784745515534, "grad_norm": 2.5832581520080566, "learning_rate": 4.909469568363888e-05, "loss": 1.2074, "step": 92 }, { "epoch": 0.05425113023187983, "grad_norm": 1.540708065032959, "learning_rate": 4.9212073713848375e-05, "loss": 1.2703, "step": 93 }, { "epoch": 0.05483447571824413, "grad_norm": 1.8622777462005615, "learning_rate": 4.932819633999246e-05, "loss": 1.1169, "step": 94 }, { "epoch": 0.055417821204608426, "grad_norm": 1.4035429954528809, "learning_rate": 4.9443090132221186e-05, "loss": 1.17, "step": 95 }, { "epoch": 0.05600116669097273, "grad_norm": 1.4052140712738037, "learning_rate": 4.9556780825989205e-05, "loss": 0.9749, "step": 96 }, { "epoch": 0.05658451217733703, "grad_norm": 1.3691339492797852, "learning_rate": 4.9669293356656114e-05, "loss": 0.9748, "step": 97 }, { "epoch": 0.057167857663701326, "grad_norm": 1.2556352615356445, "learning_rate": 4.978065189231237e-05, "loss": 0.9405, "step": 98 }, { "epoch": 0.05775120315006563, "grad_norm": 1.2853363752365112, "learning_rate": 4.989087986493874e-05, "loss": 1.3587, "step": 99 }, { "epoch": 0.05833454863642992, "grad_norm": 2.179720640182495, "learning_rate": 4.9999999999999996e-05, "loss": 1.181, "step": 100 }, { "epoch": 0.058917894122794225, "grad_norm": 1.1849037408828735, "learning_rate": 5e-05, "loss": 1.0379, "step": 101 }, { "epoch": 0.05950123960915853, "grad_norm": 1.335351586341858, "learning_rate": 4.998497596153847e-05, "loss": 1.2294, "step": 102 }, { "epoch": 0.06008458509552282, "grad_norm": 1.5505980253219604, "learning_rate": 4.9969951923076926e-05, "loss": 0.9985, "step": 103 }, { "epoch": 0.060667930581887124, "grad_norm": 1.2492345571517944, "learning_rate": 4.9954927884615385e-05, "loss": 1.009, "step": 104 }, { "epoch": 0.06125127606825142, "grad_norm": 1.2737586498260498, "learning_rate": 4.993990384615384e-05, "loss": 1.075, "step": 105 }, { "epoch": 0.06183462155461572, "grad_norm": 1.6333897113800049, "learning_rate": 4.992487980769231e-05, "loss": 1.2081, "step": 106 }, { "epoch": 0.06241796704098002, "grad_norm": 1.5222417116165161, "learning_rate": 4.9909855769230774e-05, "loss": 1.2142, "step": 107 }, { "epoch": 0.06300131252734432, "grad_norm": 1.683530569076538, "learning_rate": 4.989483173076923e-05, "loss": 0.9966, "step": 108 }, { "epoch": 0.06358465801370862, "grad_norm": 1.8551238775253296, "learning_rate": 4.98798076923077e-05, "loss": 1.0807, "step": 109 }, { "epoch": 0.06416800350007291, "grad_norm": 1.721418857574463, "learning_rate": 4.9864783653846156e-05, "loss": 0.907, "step": 110 }, { "epoch": 0.06475134898643722, "grad_norm": 1.1599464416503906, "learning_rate": 4.9849759615384615e-05, "loss": 1.0121, "step": 111 }, { "epoch": 0.06533469447280152, "grad_norm": 1.1434050798416138, "learning_rate": 4.983473557692308e-05, "loss": 1.0762, "step": 112 }, { "epoch": 0.06591803995916581, "grad_norm": 1.3993638753890991, "learning_rate": 4.981971153846154e-05, "loss": 1.2264, "step": 113 }, { "epoch": 0.06650138544553011, "grad_norm": 1.278794288635254, "learning_rate": 4.9804687500000004e-05, "loss": 1.1514, "step": 114 }, { "epoch": 0.06708473093189442, "grad_norm": 4.100466728210449, "learning_rate": 4.978966346153847e-05, "loss": 1.1164, "step": 115 }, { "epoch": 0.06766807641825871, "grad_norm": 1.2248213291168213, "learning_rate": 4.977463942307693e-05, "loss": 1.1503, "step": 116 }, { "epoch": 0.06825142190462301, "grad_norm": 1.7549059391021729, "learning_rate": 4.9759615384615386e-05, "loss": 1.185, "step": 117 }, { "epoch": 0.06883476739098732, "grad_norm": 1.747718334197998, "learning_rate": 4.9744591346153844e-05, "loss": 1.1494, "step": 118 }, { "epoch": 0.06941811287735161, "grad_norm": 1.4090042114257812, "learning_rate": 4.972956730769231e-05, "loss": 1.0752, "step": 119 }, { "epoch": 0.07000145836371591, "grad_norm": 1.3040906190872192, "learning_rate": 4.9714543269230775e-05, "loss": 0.9461, "step": 120 }, { "epoch": 0.07058480385008022, "grad_norm": 1.625506043434143, "learning_rate": 4.9699519230769233e-05, "loss": 1.0583, "step": 121 }, { "epoch": 0.07116814933644451, "grad_norm": 1.273437261581421, "learning_rate": 4.968449519230769e-05, "loss": 1.1448, "step": 122 }, { "epoch": 0.0717514948228088, "grad_norm": 1.3089839220046997, "learning_rate": 4.966947115384616e-05, "loss": 1.0703, "step": 123 }, { "epoch": 0.0723348403091731, "grad_norm": 1.1929200887680054, "learning_rate": 4.9654447115384616e-05, "loss": 0.9157, "step": 124 }, { "epoch": 0.07291818579553741, "grad_norm": 1.1315945386886597, "learning_rate": 4.963942307692308e-05, "loss": 1.2748, "step": 125 }, { "epoch": 0.0735015312819017, "grad_norm": 1.4779376983642578, "learning_rate": 4.962439903846154e-05, "loss": 1.0475, "step": 126 }, { "epoch": 0.074084876768266, "grad_norm": 1.3037022352218628, "learning_rate": 4.9609375000000005e-05, "loss": 1.1542, "step": 127 }, { "epoch": 0.07466822225463031, "grad_norm": 1.454216480255127, "learning_rate": 4.959435096153846e-05, "loss": 1.3286, "step": 128 }, { "epoch": 0.0752515677409946, "grad_norm": 1.5513452291488647, "learning_rate": 4.957932692307692e-05, "loss": 1.2599, "step": 129 }, { "epoch": 0.0758349132273589, "grad_norm": 1.7299277782440186, "learning_rate": 4.956430288461539e-05, "loss": 1.2156, "step": 130 }, { "epoch": 0.07641825871372321, "grad_norm": 1.2468066215515137, "learning_rate": 4.9549278846153846e-05, "loss": 1.0193, "step": 131 }, { "epoch": 0.0770016042000875, "grad_norm": 1.1203701496124268, "learning_rate": 4.953425480769231e-05, "loss": 1.0451, "step": 132 }, { "epoch": 0.0775849496864518, "grad_norm": 1.4089607000350952, "learning_rate": 4.9519230769230776e-05, "loss": 1.0041, "step": 133 }, { "epoch": 0.0781682951728161, "grad_norm": 1.40877103805542, "learning_rate": 4.9504206730769235e-05, "loss": 1.1952, "step": 134 }, { "epoch": 0.0787516406591804, "grad_norm": 1.2676079273223877, "learning_rate": 4.948918269230769e-05, "loss": 0.9846, "step": 135 }, { "epoch": 0.0793349861455447, "grad_norm": 1.3630707263946533, "learning_rate": 4.947415865384616e-05, "loss": 1.2925, "step": 136 }, { "epoch": 0.07991833163190899, "grad_norm": 1.4193191528320312, "learning_rate": 4.945913461538462e-05, "loss": 0.9525, "step": 137 }, { "epoch": 0.0805016771182733, "grad_norm": 1.4956103563308716, "learning_rate": 4.944411057692308e-05, "loss": 1.0733, "step": 138 }, { "epoch": 0.0810850226046376, "grad_norm": 1.4280532598495483, "learning_rate": 4.942908653846154e-05, "loss": 0.9752, "step": 139 }, { "epoch": 0.08166836809100189, "grad_norm": 1.4108835458755493, "learning_rate": 4.94140625e-05, "loss": 1.0615, "step": 140 }, { "epoch": 0.0822517135773662, "grad_norm": 1.3212484121322632, "learning_rate": 4.9399038461538464e-05, "loss": 0.957, "step": 141 }, { "epoch": 0.0828350590637305, "grad_norm": 1.3035906553268433, "learning_rate": 4.938401442307692e-05, "loss": 1.0273, "step": 142 }, { "epoch": 0.08341840455009479, "grad_norm": 1.9610090255737305, "learning_rate": 4.936899038461539e-05, "loss": 1.2096, "step": 143 }, { "epoch": 0.0840017500364591, "grad_norm": 1.5158385038375854, "learning_rate": 4.935396634615385e-05, "loss": 1.056, "step": 144 }, { "epoch": 0.0845850955228234, "grad_norm": 1.9964501857757568, "learning_rate": 4.933894230769231e-05, "loss": 1.0332, "step": 145 }, { "epoch": 0.08516844100918769, "grad_norm": 1.5921212434768677, "learning_rate": 4.932391826923077e-05, "loss": 1.0627, "step": 146 }, { "epoch": 0.08575178649555198, "grad_norm": 1.6017435789108276, "learning_rate": 4.930889423076923e-05, "loss": 1.0619, "step": 147 }, { "epoch": 0.0863351319819163, "grad_norm": 1.4326040744781494, "learning_rate": 4.9293870192307694e-05, "loss": 1.0111, "step": 148 }, { "epoch": 0.08691847746828059, "grad_norm": 1.0582656860351562, "learning_rate": 4.927884615384616e-05, "loss": 0.978, "step": 149 }, { "epoch": 0.08750182295464488, "grad_norm": 1.1291985511779785, "learning_rate": 4.926382211538462e-05, "loss": 1.2015, "step": 150 }, { "epoch": 0.08808516844100919, "grad_norm": 1.6709433794021606, "learning_rate": 4.924879807692308e-05, "loss": 1.086, "step": 151 }, { "epoch": 0.08866851392737349, "grad_norm": 1.3261191844940186, "learning_rate": 4.923377403846154e-05, "loss": 1.2622, "step": 152 }, { "epoch": 0.08925185941373778, "grad_norm": 1.3250443935394287, "learning_rate": 4.921875e-05, "loss": 0.9489, "step": 153 }, { "epoch": 0.08983520490010209, "grad_norm": 1.3647021055221558, "learning_rate": 4.9203725961538466e-05, "loss": 1.0411, "step": 154 }, { "epoch": 0.09041855038646639, "grad_norm": 1.5882066488265991, "learning_rate": 4.9188701923076924e-05, "loss": 1.2391, "step": 155 }, { "epoch": 0.09100189587283068, "grad_norm": 1.2136874198913574, "learning_rate": 4.917367788461539e-05, "loss": 1.0796, "step": 156 }, { "epoch": 0.09158524135919498, "grad_norm": 1.3928202390670776, "learning_rate": 4.915865384615385e-05, "loss": 1.2077, "step": 157 }, { "epoch": 0.09216858684555929, "grad_norm": 1.487825632095337, "learning_rate": 4.9143629807692306e-05, "loss": 1.1106, "step": 158 }, { "epoch": 0.09275193233192358, "grad_norm": 1.7569891214370728, "learning_rate": 4.912860576923077e-05, "loss": 1.2142, "step": 159 }, { "epoch": 0.09333527781828788, "grad_norm": 1.2210508584976196, "learning_rate": 4.911358173076923e-05, "loss": 1.1436, "step": 160 }, { "epoch": 0.09391862330465219, "grad_norm": 1.2036561965942383, "learning_rate": 4.9098557692307695e-05, "loss": 1.227, "step": 161 }, { "epoch": 0.09450196879101648, "grad_norm": 1.2647464275360107, "learning_rate": 4.908353365384616e-05, "loss": 0.9552, "step": 162 }, { "epoch": 0.09508531427738078, "grad_norm": 1.2640854120254517, "learning_rate": 4.906850961538462e-05, "loss": 1.2703, "step": 163 }, { "epoch": 0.09566865976374508, "grad_norm": 1.8966978788375854, "learning_rate": 4.905348557692308e-05, "loss": 1.2019, "step": 164 }, { "epoch": 0.09625200525010938, "grad_norm": 1.3982689380645752, "learning_rate": 4.9038461538461536e-05, "loss": 1.0861, "step": 165 }, { "epoch": 0.09683535073647367, "grad_norm": 1.3104490041732788, "learning_rate": 4.90234375e-05, "loss": 1.0978, "step": 166 }, { "epoch": 0.09741869622283797, "grad_norm": 1.3252923488616943, "learning_rate": 4.900841346153847e-05, "loss": 0.984, "step": 167 }, { "epoch": 0.09800204170920228, "grad_norm": 1.636171579360962, "learning_rate": 4.8993389423076925e-05, "loss": 1.0612, "step": 168 }, { "epoch": 0.09858538719556657, "grad_norm": 1.3443262577056885, "learning_rate": 4.897836538461539e-05, "loss": 1.0664, "step": 169 }, { "epoch": 0.09916873268193087, "grad_norm": 1.2440283298492432, "learning_rate": 4.896334134615385e-05, "loss": 1.083, "step": 170 }, { "epoch": 0.09975207816829518, "grad_norm": 1.125069499015808, "learning_rate": 4.894831730769231e-05, "loss": 1.1206, "step": 171 }, { "epoch": 0.10033542365465947, "grad_norm": 1.3264683485031128, "learning_rate": 4.893329326923077e-05, "loss": 0.9415, "step": 172 }, { "epoch": 0.10091876914102377, "grad_norm": 1.1505907773971558, "learning_rate": 4.891826923076923e-05, "loss": 1.0793, "step": 173 }, { "epoch": 0.10150211462738808, "grad_norm": 1.1637053489685059, "learning_rate": 4.89032451923077e-05, "loss": 0.9331, "step": 174 }, { "epoch": 0.10208546011375237, "grad_norm": 1.6416714191436768, "learning_rate": 4.888822115384616e-05, "loss": 0.8152, "step": 175 }, { "epoch": 0.10266880560011667, "grad_norm": 1.0731501579284668, "learning_rate": 4.8873197115384614e-05, "loss": 1.188, "step": 176 }, { "epoch": 0.10325215108648096, "grad_norm": 1.5831621885299683, "learning_rate": 4.885817307692308e-05, "loss": 1.1591, "step": 177 }, { "epoch": 0.10383549657284527, "grad_norm": 1.221384882926941, "learning_rate": 4.884314903846154e-05, "loss": 1.0392, "step": 178 }, { "epoch": 0.10441884205920957, "grad_norm": 1.2436399459838867, "learning_rate": 4.8828125e-05, "loss": 1.0176, "step": 179 }, { "epoch": 0.10500218754557386, "grad_norm": 1.6474889516830444, "learning_rate": 4.881310096153847e-05, "loss": 1.294, "step": 180 }, { "epoch": 0.10558553303193817, "grad_norm": 1.7568507194519043, "learning_rate": 4.8798076923076926e-05, "loss": 1.1217, "step": 181 }, { "epoch": 0.10616887851830246, "grad_norm": 1.6313364505767822, "learning_rate": 4.8783052884615385e-05, "loss": 1.0687, "step": 182 }, { "epoch": 0.10675222400466676, "grad_norm": 1.3143326044082642, "learning_rate": 4.8768028846153843e-05, "loss": 1.001, "step": 183 }, { "epoch": 0.10733556949103107, "grad_norm": 1.7886399030685425, "learning_rate": 4.875300480769231e-05, "loss": 1.2089, "step": 184 }, { "epoch": 0.10791891497739536, "grad_norm": 1.494966983795166, "learning_rate": 4.8737980769230774e-05, "loss": 1.096, "step": 185 }, { "epoch": 0.10850226046375966, "grad_norm": 1.311043620109558, "learning_rate": 4.872295673076923e-05, "loss": 1.0791, "step": 186 }, { "epoch": 0.10908560595012397, "grad_norm": 1.4188566207885742, "learning_rate": 4.87079326923077e-05, "loss": 0.8453, "step": 187 }, { "epoch": 0.10966895143648826, "grad_norm": 1.2535464763641357, "learning_rate": 4.8692908653846156e-05, "loss": 0.9653, "step": 188 }, { "epoch": 0.11025229692285256, "grad_norm": 1.2611030340194702, "learning_rate": 4.8677884615384615e-05, "loss": 1.2116, "step": 189 }, { "epoch": 0.11083564240921685, "grad_norm": 1.33787202835083, "learning_rate": 4.866286057692308e-05, "loss": 1.0238, "step": 190 }, { "epoch": 0.11141898789558116, "grad_norm": 1.256029486656189, "learning_rate": 4.864783653846154e-05, "loss": 1.123, "step": 191 }, { "epoch": 0.11200233338194546, "grad_norm": 1.2442421913146973, "learning_rate": 4.8632812500000004e-05, "loss": 1.0065, "step": 192 }, { "epoch": 0.11258567886830975, "grad_norm": 1.540716290473938, "learning_rate": 4.861778846153847e-05, "loss": 1.1194, "step": 193 }, { "epoch": 0.11316902435467406, "grad_norm": 1.2217652797698975, "learning_rate": 4.860276442307692e-05, "loss": 1.0814, "step": 194 }, { "epoch": 0.11375236984103836, "grad_norm": 1.1543757915496826, "learning_rate": 4.8587740384615386e-05, "loss": 1.1429, "step": 195 }, { "epoch": 0.11433571532740265, "grad_norm": 1.2116531133651733, "learning_rate": 4.8572716346153845e-05, "loss": 0.8905, "step": 196 }, { "epoch": 0.11491906081376696, "grad_norm": 1.2289031744003296, "learning_rate": 4.855769230769231e-05, "loss": 0.9011, "step": 197 }, { "epoch": 0.11550240630013126, "grad_norm": 1.6803086996078491, "learning_rate": 4.8542668269230775e-05, "loss": 1.0375, "step": 198 }, { "epoch": 0.11608575178649555, "grad_norm": 1.5701960325241089, "learning_rate": 4.8527644230769234e-05, "loss": 1.0346, "step": 199 }, { "epoch": 0.11666909727285985, "grad_norm": 1.2407475709915161, "learning_rate": 4.851262019230769e-05, "loss": 0.7745, "step": 200 }, { "epoch": 0.11666909727285985, "eval_loss_squad": 1.1932364337146282, "eval_perplexity": 7.5639171404401075, "eval_perplexity_reconstruct": 1.8755490788116311, "step": 200 }, { "epoch": 0.11725244275922415, "grad_norm": 1.604348063468933, "learning_rate": 4.849759615384616e-05, "loss": 1.1338, "step": 201 }, { "epoch": 0.11783578824558845, "grad_norm": 1.2575210332870483, "learning_rate": 4.8482572115384616e-05, "loss": 1.1509, "step": 202 }, { "epoch": 0.11841913373195274, "grad_norm": 1.662980318069458, "learning_rate": 4.846754807692308e-05, "loss": 1.1348, "step": 203 }, { "epoch": 0.11900247921831705, "grad_norm": 1.3378013372421265, "learning_rate": 4.845252403846154e-05, "loss": 1.2898, "step": 204 }, { "epoch": 0.11958582470468135, "grad_norm": 1.4811590909957886, "learning_rate": 4.8437500000000005e-05, "loss": 0.9847, "step": 205 }, { "epoch": 0.12016917019104564, "grad_norm": 1.452512502670288, "learning_rate": 4.8422475961538464e-05, "loss": 1.0591, "step": 206 }, { "epoch": 0.12075251567740995, "grad_norm": 1.1843714714050293, "learning_rate": 4.840745192307692e-05, "loss": 0.7955, "step": 207 }, { "epoch": 0.12133586116377425, "grad_norm": 1.3748295307159424, "learning_rate": 4.839242788461539e-05, "loss": 1.0937, "step": 208 }, { "epoch": 0.12191920665013854, "grad_norm": 1.1598683595657349, "learning_rate": 4.8377403846153846e-05, "loss": 1.0785, "step": 209 }, { "epoch": 0.12250255213650284, "grad_norm": 1.3770662546157837, "learning_rate": 4.836237980769231e-05, "loss": 1.1478, "step": 210 }, { "epoch": 0.12308589762286715, "grad_norm": 1.3409414291381836, "learning_rate": 4.8347355769230776e-05, "loss": 1.2035, "step": 211 }, { "epoch": 0.12366924310923144, "grad_norm": 1.0668489933013916, "learning_rate": 4.833233173076923e-05, "loss": 1.148, "step": 212 }, { "epoch": 0.12425258859559574, "grad_norm": 1.2233036756515503, "learning_rate": 4.8317307692307693e-05, "loss": 0.9074, "step": 213 }, { "epoch": 0.12483593408196005, "grad_norm": 1.2284631729125977, "learning_rate": 4.830228365384616e-05, "loss": 1.006, "step": 214 }, { "epoch": 0.12541927956832433, "grad_norm": 1.0120066404342651, "learning_rate": 4.828725961538462e-05, "loss": 1.1474, "step": 215 }, { "epoch": 0.12600262505468865, "grad_norm": 1.47971773147583, "learning_rate": 4.827223557692308e-05, "loss": 0.7954, "step": 216 }, { "epoch": 0.12658597054105294, "grad_norm": 1.3422911167144775, "learning_rate": 4.825721153846154e-05, "loss": 1.0716, "step": 217 }, { "epoch": 0.12716931602741724, "grad_norm": 1.1010000705718994, "learning_rate": 4.82421875e-05, "loss": 1.1334, "step": 218 }, { "epoch": 0.12775266151378153, "grad_norm": 1.4309356212615967, "learning_rate": 4.8227163461538465e-05, "loss": 1.137, "step": 219 }, { "epoch": 0.12833600700014583, "grad_norm": 1.333970308303833, "learning_rate": 4.821213942307692e-05, "loss": 1.2948, "step": 220 }, { "epoch": 0.12891935248651012, "grad_norm": 1.3190792798995972, "learning_rate": 4.819711538461539e-05, "loss": 1.061, "step": 221 }, { "epoch": 0.12950269797287445, "grad_norm": 1.3843986988067627, "learning_rate": 4.818209134615385e-05, "loss": 1.1526, "step": 222 }, { "epoch": 0.13008604345923874, "grad_norm": 1.6656113862991333, "learning_rate": 4.816706730769231e-05, "loss": 1.2739, "step": 223 }, { "epoch": 0.13066938894560304, "grad_norm": 1.3917752504348755, "learning_rate": 4.815204326923077e-05, "loss": 1.1356, "step": 224 }, { "epoch": 0.13125273443196733, "grad_norm": 1.3735393285751343, "learning_rate": 4.813701923076923e-05, "loss": 0.9553, "step": 225 }, { "epoch": 0.13183607991833163, "grad_norm": 1.253967523574829, "learning_rate": 4.8121995192307695e-05, "loss": 1.1367, "step": 226 }, { "epoch": 0.13241942540469592, "grad_norm": 1.2027031183242798, "learning_rate": 4.810697115384616e-05, "loss": 1.129, "step": 227 }, { "epoch": 0.13300277089106022, "grad_norm": 1.2082641124725342, "learning_rate": 4.809194711538462e-05, "loss": 1.0744, "step": 228 }, { "epoch": 0.13358611637742454, "grad_norm": 1.4022235870361328, "learning_rate": 4.8076923076923084e-05, "loss": 0.9777, "step": 229 }, { "epoch": 0.13416946186378884, "grad_norm": 1.1199394464492798, "learning_rate": 4.8061899038461535e-05, "loss": 0.9213, "step": 230 }, { "epoch": 0.13475280735015313, "grad_norm": 1.1612290143966675, "learning_rate": 4.8046875e-05, "loss": 1.1336, "step": 231 }, { "epoch": 0.13533615283651743, "grad_norm": 1.43931245803833, "learning_rate": 4.8031850961538466e-05, "loss": 1.2295, "step": 232 }, { "epoch": 0.13591949832288172, "grad_norm": 1.4430086612701416, "learning_rate": 4.8016826923076924e-05, "loss": 1.1819, "step": 233 }, { "epoch": 0.13650284380924602, "grad_norm": 1.1566003561019897, "learning_rate": 4.800180288461539e-05, "loss": 0.9493, "step": 234 }, { "epoch": 0.13708618929561034, "grad_norm": 1.1817492246627808, "learning_rate": 4.798677884615385e-05, "loss": 1.0723, "step": 235 }, { "epoch": 0.13766953478197463, "grad_norm": 1.0918781757354736, "learning_rate": 4.797175480769231e-05, "loss": 1.1297, "step": 236 }, { "epoch": 0.13825288026833893, "grad_norm": 1.2325400114059448, "learning_rate": 4.795673076923077e-05, "loss": 1.0465, "step": 237 }, { "epoch": 0.13883622575470322, "grad_norm": 1.4473427534103394, "learning_rate": 4.794170673076923e-05, "loss": 1.1118, "step": 238 }, { "epoch": 0.13941957124106752, "grad_norm": 1.2843005657196045, "learning_rate": 4.7926682692307696e-05, "loss": 0.7924, "step": 239 }, { "epoch": 0.14000291672743181, "grad_norm": 1.225659728050232, "learning_rate": 4.791165865384616e-05, "loss": 0.9887, "step": 240 }, { "epoch": 0.1405862622137961, "grad_norm": 1.3597815036773682, "learning_rate": 4.789663461538462e-05, "loss": 1.063, "step": 241 }, { "epoch": 0.14116960770016043, "grad_norm": 1.2931313514709473, "learning_rate": 4.788161057692308e-05, "loss": 1.1008, "step": 242 }, { "epoch": 0.14175295318652473, "grad_norm": 1.6043051481246948, "learning_rate": 4.7866586538461537e-05, "loss": 1.2153, "step": 243 }, { "epoch": 0.14233629867288902, "grad_norm": 1.2612937688827515, "learning_rate": 4.78515625e-05, "loss": 1.05, "step": 244 }, { "epoch": 0.14291964415925332, "grad_norm": 1.4583909511566162, "learning_rate": 4.783653846153847e-05, "loss": 1.0092, "step": 245 }, { "epoch": 0.1435029896456176, "grad_norm": 1.4382933378219604, "learning_rate": 4.7821514423076926e-05, "loss": 0.8433, "step": 246 }, { "epoch": 0.1440863351319819, "grad_norm": 1.1039222478866577, "learning_rate": 4.780649038461539e-05, "loss": 0.9546, "step": 247 }, { "epoch": 0.1446696806183462, "grad_norm": 1.46884024143219, "learning_rate": 4.779146634615384e-05, "loss": 1.1191, "step": 248 }, { "epoch": 0.14525302610471053, "grad_norm": 1.221130609512329, "learning_rate": 4.777644230769231e-05, "loss": 0.9995, "step": 249 }, { "epoch": 0.14583637159107482, "grad_norm": 1.2932945489883423, "learning_rate": 4.776141826923077e-05, "loss": 1.0508, "step": 250 }, { "epoch": 0.14641971707743912, "grad_norm": 1.174483060836792, "learning_rate": 4.774639423076923e-05, "loss": 1.1433, "step": 251 }, { "epoch": 0.1470030625638034, "grad_norm": 1.366163969039917, "learning_rate": 4.77313701923077e-05, "loss": 1.1716, "step": 252 }, { "epoch": 0.1475864080501677, "grad_norm": 1.9801900386810303, "learning_rate": 4.7716346153846155e-05, "loss": 0.9884, "step": 253 }, { "epoch": 0.148169753536532, "grad_norm": 1.3597514629364014, "learning_rate": 4.7701322115384614e-05, "loss": 1.0706, "step": 254 }, { "epoch": 0.14875309902289632, "grad_norm": 1.1929199695587158, "learning_rate": 4.768629807692308e-05, "loss": 0.996, "step": 255 }, { "epoch": 0.14933644450926062, "grad_norm": 1.0896735191345215, "learning_rate": 4.767127403846154e-05, "loss": 1.2185, "step": 256 }, { "epoch": 0.1499197899956249, "grad_norm": 1.0902819633483887, "learning_rate": 4.765625e-05, "loss": 1.3523, "step": 257 }, { "epoch": 0.1505031354819892, "grad_norm": 1.1990022659301758, "learning_rate": 4.764122596153847e-05, "loss": 1.049, "step": 258 }, { "epoch": 0.1510864809683535, "grad_norm": 1.2676620483398438, "learning_rate": 4.762620192307693e-05, "loss": 1.2471, "step": 259 }, { "epoch": 0.1516698264547178, "grad_norm": 1.0942872762680054, "learning_rate": 4.7611177884615385e-05, "loss": 1.0276, "step": 260 }, { "epoch": 0.1522531719410821, "grad_norm": 1.886791706085205, "learning_rate": 4.7596153846153844e-05, "loss": 0.9001, "step": 261 }, { "epoch": 0.15283651742744642, "grad_norm": 1.3173785209655762, "learning_rate": 4.758112980769231e-05, "loss": 1.0866, "step": 262 }, { "epoch": 0.1534198629138107, "grad_norm": 1.3886914253234863, "learning_rate": 4.7566105769230774e-05, "loss": 1.0645, "step": 263 }, { "epoch": 0.154003208400175, "grad_norm": 1.5575705766677856, "learning_rate": 4.755108173076923e-05, "loss": 1.0723, "step": 264 }, { "epoch": 0.1545865538865393, "grad_norm": 1.2715169191360474, "learning_rate": 4.75360576923077e-05, "loss": 0.7773, "step": 265 }, { "epoch": 0.1551698993729036, "grad_norm": 1.5639898777008057, "learning_rate": 4.752103365384616e-05, "loss": 1.0814, "step": 266 }, { "epoch": 0.1557532448592679, "grad_norm": 1.5546399354934692, "learning_rate": 4.7506009615384615e-05, "loss": 0.9091, "step": 267 }, { "epoch": 0.1563365903456322, "grad_norm": 1.1600172519683838, "learning_rate": 4.749098557692308e-05, "loss": 1.2764, "step": 268 }, { "epoch": 0.1569199358319965, "grad_norm": 1.4420632123947144, "learning_rate": 4.747596153846154e-05, "loss": 1.1393, "step": 269 }, { "epoch": 0.1575032813183608, "grad_norm": 1.2289047241210938, "learning_rate": 4.7460937500000004e-05, "loss": 0.9656, "step": 270 }, { "epoch": 0.1580866268047251, "grad_norm": 1.165598750114441, "learning_rate": 4.744591346153846e-05, "loss": 0.9472, "step": 271 }, { "epoch": 0.1586699722910894, "grad_norm": 1.1388275623321533, "learning_rate": 4.743088942307692e-05, "loss": 1.0464, "step": 272 }, { "epoch": 0.1592533177774537, "grad_norm": 1.1367133855819702, "learning_rate": 4.7415865384615386e-05, "loss": 0.9505, "step": 273 }, { "epoch": 0.15983666326381799, "grad_norm": 1.086214542388916, "learning_rate": 4.7400841346153845e-05, "loss": 0.8583, "step": 274 }, { "epoch": 0.1604200087501823, "grad_norm": 1.464219093322754, "learning_rate": 4.738581730769231e-05, "loss": 0.9827, "step": 275 }, { "epoch": 0.1610033542365466, "grad_norm": 1.486207365989685, "learning_rate": 4.7370793269230776e-05, "loss": 1.0802, "step": 276 }, { "epoch": 0.1615866997229109, "grad_norm": 1.265541434288025, "learning_rate": 4.7355769230769234e-05, "loss": 1.0812, "step": 277 }, { "epoch": 0.1621700452092752, "grad_norm": 1.532962441444397, "learning_rate": 4.734074519230769e-05, "loss": 0.9444, "step": 278 }, { "epoch": 0.1627533906956395, "grad_norm": 1.329508900642395, "learning_rate": 4.732572115384616e-05, "loss": 1.1593, "step": 279 }, { "epoch": 0.16333673618200378, "grad_norm": 1.2211445569992065, "learning_rate": 4.7310697115384616e-05, "loss": 1.1621, "step": 280 }, { "epoch": 0.16392008166836808, "grad_norm": 1.0366151332855225, "learning_rate": 4.729567307692308e-05, "loss": 1.0649, "step": 281 }, { "epoch": 0.1645034271547324, "grad_norm": 1.4749221801757812, "learning_rate": 4.728064903846154e-05, "loss": 1.1227, "step": 282 }, { "epoch": 0.1650867726410967, "grad_norm": 1.3342058658599854, "learning_rate": 4.7265625000000005e-05, "loss": 1.1781, "step": 283 }, { "epoch": 0.165670118127461, "grad_norm": 1.0524542331695557, "learning_rate": 4.7250600961538464e-05, "loss": 1.0728, "step": 284 }, { "epoch": 0.1662534636138253, "grad_norm": 1.2550618648529053, "learning_rate": 4.723557692307692e-05, "loss": 0.9786, "step": 285 }, { "epoch": 0.16683680910018958, "grad_norm": 1.6661171913146973, "learning_rate": 4.722055288461539e-05, "loss": 1.1991, "step": 286 }, { "epoch": 0.16742015458655388, "grad_norm": 1.480756163597107, "learning_rate": 4.7205528846153846e-05, "loss": 1.1048, "step": 287 }, { "epoch": 0.1680035000729182, "grad_norm": 1.4135624170303345, "learning_rate": 4.719050480769231e-05, "loss": 1.1372, "step": 288 }, { "epoch": 0.1685868455592825, "grad_norm": 1.5737360715866089, "learning_rate": 4.717548076923077e-05, "loss": 1.2462, "step": 289 }, { "epoch": 0.1691701910456468, "grad_norm": 1.2390706539154053, "learning_rate": 4.716045673076923e-05, "loss": 0.9622, "step": 290 }, { "epoch": 0.16975353653201108, "grad_norm": 1.193597435951233, "learning_rate": 4.7145432692307694e-05, "loss": 1.0312, "step": 291 }, { "epoch": 0.17033688201837538, "grad_norm": 1.2671687602996826, "learning_rate": 4.713040865384616e-05, "loss": 1.0976, "step": 292 }, { "epoch": 0.17092022750473967, "grad_norm": 1.220468521118164, "learning_rate": 4.711538461538462e-05, "loss": 0.8168, "step": 293 }, { "epoch": 0.17150357299110397, "grad_norm": 1.7847779989242554, "learning_rate": 4.710036057692308e-05, "loss": 1.1623, "step": 294 }, { "epoch": 0.1720869184774683, "grad_norm": 1.1684701442718506, "learning_rate": 4.708533653846154e-05, "loss": 1.0476, "step": 295 }, { "epoch": 0.1726702639638326, "grad_norm": 1.2110151052474976, "learning_rate": 4.70703125e-05, "loss": 1.2198, "step": 296 }, { "epoch": 0.17325360945019688, "grad_norm": 1.2417210340499878, "learning_rate": 4.7055288461538465e-05, "loss": 0.9928, "step": 297 }, { "epoch": 0.17383695493656118, "grad_norm": 1.4387229681015015, "learning_rate": 4.7040264423076924e-05, "loss": 1.2669, "step": 298 }, { "epoch": 0.17442030042292547, "grad_norm": 1.5898375511169434, "learning_rate": 4.702524038461539e-05, "loss": 1.055, "step": 299 }, { "epoch": 0.17500364590928977, "grad_norm": 1.5764034986495972, "learning_rate": 4.701021634615385e-05, "loss": 1.0568, "step": 300 }, { "epoch": 0.17558699139565406, "grad_norm": 1.1175717115402222, "learning_rate": 4.699519230769231e-05, "loss": 0.9626, "step": 301 }, { "epoch": 0.17617033688201839, "grad_norm": 1.0562156438827515, "learning_rate": 4.698016826923077e-05, "loss": 1.0173, "step": 302 }, { "epoch": 0.17675368236838268, "grad_norm": 1.1503074169158936, "learning_rate": 4.696514423076923e-05, "loss": 1.0513, "step": 303 }, { "epoch": 0.17733702785474698, "grad_norm": 1.1285890340805054, "learning_rate": 4.6950120192307695e-05, "loss": 0.966, "step": 304 }, { "epoch": 0.17792037334111127, "grad_norm": 1.0698230266571045, "learning_rate": 4.693509615384616e-05, "loss": 1.1029, "step": 305 }, { "epoch": 0.17850371882747557, "grad_norm": 1.3292865753173828, "learning_rate": 4.692007211538462e-05, "loss": 1.19, "step": 306 }, { "epoch": 0.17908706431383986, "grad_norm": 1.1927766799926758, "learning_rate": 4.690504807692308e-05, "loss": 1.1935, "step": 307 }, { "epoch": 0.17967040980020418, "grad_norm": 2.0182723999023438, "learning_rate": 4.6890024038461536e-05, "loss": 1.2596, "step": 308 }, { "epoch": 0.18025375528656848, "grad_norm": 1.8118665218353271, "learning_rate": 4.6875e-05, "loss": 1.0279, "step": 309 }, { "epoch": 0.18083710077293277, "grad_norm": 1.3893784284591675, "learning_rate": 4.6859975961538466e-05, "loss": 1.0969, "step": 310 }, { "epoch": 0.18142044625929707, "grad_norm": 4.505359172821045, "learning_rate": 4.6844951923076925e-05, "loss": 1.04, "step": 311 }, { "epoch": 0.18200379174566136, "grad_norm": 1.258991003036499, "learning_rate": 4.682992788461539e-05, "loss": 0.9914, "step": 312 }, { "epoch": 0.18258713723202566, "grad_norm": 1.0383793115615845, "learning_rate": 4.681490384615385e-05, "loss": 1.0399, "step": 313 }, { "epoch": 0.18317048271838995, "grad_norm": 1.4799776077270508, "learning_rate": 4.679987980769231e-05, "loss": 0.9968, "step": 314 }, { "epoch": 0.18375382820475428, "grad_norm": 1.3189499378204346, "learning_rate": 4.678485576923077e-05, "loss": 1.0315, "step": 315 }, { "epoch": 0.18433717369111857, "grad_norm": 1.2589702606201172, "learning_rate": 4.676983173076923e-05, "loss": 0.8627, "step": 316 }, { "epoch": 0.18492051917748287, "grad_norm": 1.2101918458938599, "learning_rate": 4.6754807692307696e-05, "loss": 1.2105, "step": 317 }, { "epoch": 0.18550386466384716, "grad_norm": 1.3174864053726196, "learning_rate": 4.673978365384616e-05, "loss": 0.9905, "step": 318 }, { "epoch": 0.18608721015021146, "grad_norm": 1.2675038576126099, "learning_rate": 4.672475961538462e-05, "loss": 1.0751, "step": 319 }, { "epoch": 0.18667055563657575, "grad_norm": 1.8191653490066528, "learning_rate": 4.670973557692308e-05, "loss": 1.1342, "step": 320 }, { "epoch": 0.18725390112294005, "grad_norm": 1.3822802305221558, "learning_rate": 4.669471153846154e-05, "loss": 0.9589, "step": 321 }, { "epoch": 0.18783724660930437, "grad_norm": 1.386513590812683, "learning_rate": 4.66796875e-05, "loss": 1.1085, "step": 322 }, { "epoch": 0.18842059209566867, "grad_norm": 1.1989296674728394, "learning_rate": 4.666466346153847e-05, "loss": 1.1038, "step": 323 }, { "epoch": 0.18900393758203296, "grad_norm": 1.1725409030914307, "learning_rate": 4.6649639423076926e-05, "loss": 0.9671, "step": 324 }, { "epoch": 0.18958728306839726, "grad_norm": 1.1871923208236694, "learning_rate": 4.6634615384615384e-05, "loss": 0.9863, "step": 325 }, { "epoch": 0.19017062855476155, "grad_norm": 1.4059133529663086, "learning_rate": 4.661959134615384e-05, "loss": 0.858, "step": 326 }, { "epoch": 0.19075397404112585, "grad_norm": 1.4571958780288696, "learning_rate": 4.660456730769231e-05, "loss": 1.1032, "step": 327 }, { "epoch": 0.19133731952749017, "grad_norm": 1.4414441585540771, "learning_rate": 4.6589543269230773e-05, "loss": 1.1884, "step": 328 }, { "epoch": 0.19192066501385446, "grad_norm": 1.9027329683303833, "learning_rate": 4.657451923076923e-05, "loss": 1.0415, "step": 329 }, { "epoch": 0.19250401050021876, "grad_norm": 1.2300493717193604, "learning_rate": 4.65594951923077e-05, "loss": 0.8911, "step": 330 }, { "epoch": 0.19308735598658305, "grad_norm": 3.4765067100524902, "learning_rate": 4.6544471153846156e-05, "loss": 0.9321, "step": 331 }, { "epoch": 0.19367070147294735, "grad_norm": 1.4455662965774536, "learning_rate": 4.6529447115384614e-05, "loss": 0.9986, "step": 332 }, { "epoch": 0.19425404695931164, "grad_norm": 1.371666669845581, "learning_rate": 4.651442307692308e-05, "loss": 1.1358, "step": 333 }, { "epoch": 0.19483739244567594, "grad_norm": 1.2533602714538574, "learning_rate": 4.649939903846154e-05, "loss": 0.9561, "step": 334 }, { "epoch": 0.19542073793204026, "grad_norm": 1.152901291847229, "learning_rate": 4.6484375e-05, "loss": 1.0428, "step": 335 }, { "epoch": 0.19600408341840456, "grad_norm": 1.435378074645996, "learning_rate": 4.646935096153847e-05, "loss": 0.9433, "step": 336 }, { "epoch": 0.19658742890476885, "grad_norm": 1.480175495147705, "learning_rate": 4.645432692307693e-05, "loss": 0.9606, "step": 337 }, { "epoch": 0.19717077439113315, "grad_norm": 1.4078121185302734, "learning_rate": 4.6439302884615386e-05, "loss": 1.078, "step": 338 }, { "epoch": 0.19775411987749744, "grad_norm": 1.3780559301376343, "learning_rate": 4.6424278846153844e-05, "loss": 1.1521, "step": 339 }, { "epoch": 0.19833746536386174, "grad_norm": 1.5239243507385254, "learning_rate": 4.640925480769231e-05, "loss": 0.9855, "step": 340 }, { "epoch": 0.19892081085022606, "grad_norm": 1.095953106880188, "learning_rate": 4.6394230769230775e-05, "loss": 1.1482, "step": 341 }, { "epoch": 0.19950415633659035, "grad_norm": 1.4764552116394043, "learning_rate": 4.637920673076923e-05, "loss": 1.0308, "step": 342 }, { "epoch": 0.20008750182295465, "grad_norm": 1.2678275108337402, "learning_rate": 4.636418269230769e-05, "loss": 1.0498, "step": 343 }, { "epoch": 0.20067084730931894, "grad_norm": 1.3208849430084229, "learning_rate": 4.634915865384616e-05, "loss": 0.8625, "step": 344 }, { "epoch": 0.20125419279568324, "grad_norm": 1.3627556562423706, "learning_rate": 4.6334134615384615e-05, "loss": 1.0739, "step": 345 }, { "epoch": 0.20183753828204753, "grad_norm": 1.3360272645950317, "learning_rate": 4.631911057692308e-05, "loss": 0.9951, "step": 346 }, { "epoch": 0.20242088376841183, "grad_norm": 1.2518396377563477, "learning_rate": 4.630408653846154e-05, "loss": 1.0037, "step": 347 }, { "epoch": 0.20300422925477615, "grad_norm": 1.2477564811706543, "learning_rate": 4.6289062500000005e-05, "loss": 1.0325, "step": 348 }, { "epoch": 0.20358757474114045, "grad_norm": 0.9390896558761597, "learning_rate": 4.627403846153846e-05, "loss": 0.9671, "step": 349 }, { "epoch": 0.20417092022750474, "grad_norm": 1.2609106302261353, "learning_rate": 4.625901442307692e-05, "loss": 0.9002, "step": 350 }, { "epoch": 0.20475426571386904, "grad_norm": 1.3142434358596802, "learning_rate": 4.624399038461539e-05, "loss": 1.0176, "step": 351 }, { "epoch": 0.20533761120023333, "grad_norm": 2.33760142326355, "learning_rate": 4.6228966346153845e-05, "loss": 1.2718, "step": 352 }, { "epoch": 0.20592095668659763, "grad_norm": 1.0962882041931152, "learning_rate": 4.621394230769231e-05, "loss": 1.1066, "step": 353 }, { "epoch": 0.20650430217296192, "grad_norm": 1.1384212970733643, "learning_rate": 4.6198918269230776e-05, "loss": 0.986, "step": 354 }, { "epoch": 0.20708764765932625, "grad_norm": 1.3480985164642334, "learning_rate": 4.6183894230769234e-05, "loss": 0.9396, "step": 355 }, { "epoch": 0.20767099314569054, "grad_norm": 1.3062304258346558, "learning_rate": 4.616887019230769e-05, "loss": 1.1495, "step": 356 }, { "epoch": 0.20825433863205484, "grad_norm": 1.1252844333648682, "learning_rate": 4.615384615384616e-05, "loss": 1.2231, "step": 357 }, { "epoch": 0.20883768411841913, "grad_norm": 1.3172509670257568, "learning_rate": 4.613882211538462e-05, "loss": 1.181, "step": 358 }, { "epoch": 0.20942102960478343, "grad_norm": 1.2648638486862183, "learning_rate": 4.612379807692308e-05, "loss": 1.2795, "step": 359 }, { "epoch": 0.21000437509114772, "grad_norm": 1.266517162322998, "learning_rate": 4.610877403846154e-05, "loss": 1.1233, "step": 360 }, { "epoch": 0.21058772057751204, "grad_norm": 1.5995312929153442, "learning_rate": 4.609375e-05, "loss": 0.9868, "step": 361 }, { "epoch": 0.21117106606387634, "grad_norm": 1.338070034980774, "learning_rate": 4.6078725961538464e-05, "loss": 1.0835, "step": 362 }, { "epoch": 0.21175441155024063, "grad_norm": 1.2133007049560547, "learning_rate": 4.606370192307692e-05, "loss": 1.0838, "step": 363 }, { "epoch": 0.21233775703660493, "grad_norm": 1.2662042379379272, "learning_rate": 4.604867788461539e-05, "loss": 1.0508, "step": 364 }, { "epoch": 0.21292110252296922, "grad_norm": 1.1545066833496094, "learning_rate": 4.6033653846153846e-05, "loss": 1.0582, "step": 365 }, { "epoch": 0.21350444800933352, "grad_norm": 1.2459946870803833, "learning_rate": 4.601862980769231e-05, "loss": 1.0589, "step": 366 }, { "epoch": 0.21408779349569781, "grad_norm": 1.1584820747375488, "learning_rate": 4.600360576923077e-05, "loss": 0.9987, "step": 367 }, { "epoch": 0.21467113898206214, "grad_norm": 1.049363136291504, "learning_rate": 4.598858173076923e-05, "loss": 1.0067, "step": 368 }, { "epoch": 0.21525448446842643, "grad_norm": 1.3197144269943237, "learning_rate": 4.5973557692307694e-05, "loss": 1.1545, "step": 369 }, { "epoch": 0.21583782995479073, "grad_norm": 1.3010430335998535, "learning_rate": 4.595853365384616e-05, "loss": 1.0185, "step": 370 }, { "epoch": 0.21642117544115502, "grad_norm": 1.2531121969223022, "learning_rate": 4.594350961538462e-05, "loss": 1.1825, "step": 371 }, { "epoch": 0.21700452092751932, "grad_norm": 1.7184710502624512, "learning_rate": 4.592848557692308e-05, "loss": 1.0196, "step": 372 }, { "epoch": 0.2175878664138836, "grad_norm": 1.173525094985962, "learning_rate": 4.591346153846154e-05, "loss": 1.2479, "step": 373 }, { "epoch": 0.21817121190024794, "grad_norm": 1.450506567955017, "learning_rate": 4.58984375e-05, "loss": 1.1017, "step": 374 }, { "epoch": 0.21875455738661223, "grad_norm": 1.1033124923706055, "learning_rate": 4.5883413461538465e-05, "loss": 0.8721, "step": 375 }, { "epoch": 0.21933790287297653, "grad_norm": 1.3852168321609497, "learning_rate": 4.5868389423076924e-05, "loss": 1.0496, "step": 376 }, { "epoch": 0.21992124835934082, "grad_norm": 1.2315133810043335, "learning_rate": 4.585336538461539e-05, "loss": 1.2742, "step": 377 }, { "epoch": 0.22050459384570512, "grad_norm": 1.1665183305740356, "learning_rate": 4.583834134615385e-05, "loss": 0.8682, "step": 378 }, { "epoch": 0.2210879393320694, "grad_norm": 1.1642358303070068, "learning_rate": 4.5823317307692306e-05, "loss": 1.0468, "step": 379 }, { "epoch": 0.2216712848184337, "grad_norm": 1.1943094730377197, "learning_rate": 4.580829326923077e-05, "loss": 0.9771, "step": 380 }, { "epoch": 0.22225463030479803, "grad_norm": 1.4384686946868896, "learning_rate": 4.579326923076923e-05, "loss": 1.1928, "step": 381 }, { "epoch": 0.22283797579116232, "grad_norm": 2.045624256134033, "learning_rate": 4.5778245192307695e-05, "loss": 1.1249, "step": 382 }, { "epoch": 0.22342132127752662, "grad_norm": 1.4147062301635742, "learning_rate": 4.576322115384616e-05, "loss": 1.0644, "step": 383 }, { "epoch": 0.22400466676389091, "grad_norm": 1.4192973375320435, "learning_rate": 4.574819711538462e-05, "loss": 1.0954, "step": 384 }, { "epoch": 0.2245880122502552, "grad_norm": 2.5359246730804443, "learning_rate": 4.573317307692308e-05, "loss": 0.9555, "step": 385 }, { "epoch": 0.2251713577366195, "grad_norm": 1.3057085275650024, "learning_rate": 4.5718149038461536e-05, "loss": 1.2688, "step": 386 }, { "epoch": 0.2257547032229838, "grad_norm": 1.5768152475357056, "learning_rate": 4.5703125e-05, "loss": 1.0067, "step": 387 }, { "epoch": 0.22633804870934812, "grad_norm": 1.3447943925857544, "learning_rate": 4.5688100961538467e-05, "loss": 1.1379, "step": 388 }, { "epoch": 0.22692139419571242, "grad_norm": 1.4754528999328613, "learning_rate": 4.5673076923076925e-05, "loss": 0.8801, "step": 389 }, { "epoch": 0.2275047396820767, "grad_norm": 1.4515520334243774, "learning_rate": 4.565805288461539e-05, "loss": 1.0797, "step": 390 }, { "epoch": 0.228088085168441, "grad_norm": 1.317018747329712, "learning_rate": 4.564302884615385e-05, "loss": 1.0002, "step": 391 }, { "epoch": 0.2286714306548053, "grad_norm": 1.7866982221603394, "learning_rate": 4.562800480769231e-05, "loss": 1.0067, "step": 392 }, { "epoch": 0.2292547761411696, "grad_norm": 1.3489786386489868, "learning_rate": 4.561298076923077e-05, "loss": 1.0879, "step": 393 }, { "epoch": 0.22983812162753392, "grad_norm": 1.4815866947174072, "learning_rate": 4.559795673076923e-05, "loss": 1.1511, "step": 394 }, { "epoch": 0.23042146711389822, "grad_norm": 1.099830985069275, "learning_rate": 4.5582932692307696e-05, "loss": 1.2723, "step": 395 }, { "epoch": 0.2310048126002625, "grad_norm": 1.2879087924957275, "learning_rate": 4.556790865384616e-05, "loss": 1.0579, "step": 396 }, { "epoch": 0.2315881580866268, "grad_norm": 1.2693564891815186, "learning_rate": 4.5552884615384613e-05, "loss": 1.2117, "step": 397 }, { "epoch": 0.2321715035729911, "grad_norm": 1.5512031316757202, "learning_rate": 4.553786057692308e-05, "loss": 0.9932, "step": 398 }, { "epoch": 0.2327548490593554, "grad_norm": 2.0312271118164062, "learning_rate": 4.552283653846154e-05, "loss": 0.8886, "step": 399 }, { "epoch": 0.2333381945457197, "grad_norm": 1.1745553016662598, "learning_rate": 4.55078125e-05, "loss": 1.1362, "step": 400 }, { "epoch": 0.2333381945457197, "eval_loss_squad": 1.0443613978661597, "eval_perplexity": 7.789854227908129, "eval_perplexity_reconstruct": 1.9117095009373144, "step": 400 }, { "epoch": 0.233921540032084, "grad_norm": 1.2545403242111206, "learning_rate": 4.549278846153847e-05, "loss": 1.1109, "step": 401 }, { "epoch": 0.2345048855184483, "grad_norm": 1.2915353775024414, "learning_rate": 4.5477764423076926e-05, "loss": 0.9874, "step": 402 }, { "epoch": 0.2350882310048126, "grad_norm": 1.3028852939605713, "learning_rate": 4.5462740384615385e-05, "loss": 0.9613, "step": 403 }, { "epoch": 0.2356715764911769, "grad_norm": 1.5871734619140625, "learning_rate": 4.544771634615384e-05, "loss": 1.0638, "step": 404 }, { "epoch": 0.2362549219775412, "grad_norm": 1.4773112535476685, "learning_rate": 4.543269230769231e-05, "loss": 0.9079, "step": 405 }, { "epoch": 0.2368382674639055, "grad_norm": 1.2807551622390747, "learning_rate": 4.5417668269230774e-05, "loss": 1.0703, "step": 406 }, { "epoch": 0.23742161295026978, "grad_norm": 1.3610466718673706, "learning_rate": 4.540264423076923e-05, "loss": 0.962, "step": 407 }, { "epoch": 0.2380049584366341, "grad_norm": 1.3228495121002197, "learning_rate": 4.53876201923077e-05, "loss": 1.0652, "step": 408 }, { "epoch": 0.2385883039229984, "grad_norm": 1.153469443321228, "learning_rate": 4.5372596153846156e-05, "loss": 1.2528, "step": 409 }, { "epoch": 0.2391716494093627, "grad_norm": 1.5087329149246216, "learning_rate": 4.5357572115384615e-05, "loss": 0.9128, "step": 410 }, { "epoch": 0.239754994895727, "grad_norm": 1.523759365081787, "learning_rate": 4.534254807692308e-05, "loss": 1.2005, "step": 411 }, { "epoch": 0.2403383403820913, "grad_norm": 1.2065186500549316, "learning_rate": 4.532752403846154e-05, "loss": 0.9159, "step": 412 }, { "epoch": 0.24092168586845558, "grad_norm": 1.2266783714294434, "learning_rate": 4.5312500000000004e-05, "loss": 1.1309, "step": 413 }, { "epoch": 0.2415050313548199, "grad_norm": 1.5053402185440063, "learning_rate": 4.529747596153847e-05, "loss": 1.1071, "step": 414 }, { "epoch": 0.2420883768411842, "grad_norm": 1.978948712348938, "learning_rate": 4.528245192307692e-05, "loss": 1.1895, "step": 415 }, { "epoch": 0.2426717223275485, "grad_norm": 1.3853555917739868, "learning_rate": 4.5267427884615386e-05, "loss": 1.0861, "step": 416 }, { "epoch": 0.2432550678139128, "grad_norm": 1.2731225490570068, "learning_rate": 4.5252403846153844e-05, "loss": 0.8939, "step": 417 }, { "epoch": 0.24383841330027708, "grad_norm": 1.1784796714782715, "learning_rate": 4.523737980769231e-05, "loss": 1.1628, "step": 418 }, { "epoch": 0.24442175878664138, "grad_norm": 1.4403506517410278, "learning_rate": 4.5222355769230775e-05, "loss": 1.2741, "step": 419 }, { "epoch": 0.24500510427300567, "grad_norm": 1.3002524375915527, "learning_rate": 4.5207331730769233e-05, "loss": 1.0343, "step": 420 }, { "epoch": 0.24558844975937, "grad_norm": 1.323982834815979, "learning_rate": 4.519230769230769e-05, "loss": 1.0944, "step": 421 }, { "epoch": 0.2461717952457343, "grad_norm": 1.2002545595169067, "learning_rate": 4.517728365384616e-05, "loss": 1.0209, "step": 422 }, { "epoch": 0.2467551407320986, "grad_norm": 1.1465034484863281, "learning_rate": 4.5162259615384616e-05, "loss": 1.0743, "step": 423 }, { "epoch": 0.24733848621846288, "grad_norm": 1.2983150482177734, "learning_rate": 4.514723557692308e-05, "loss": 1.1408, "step": 424 }, { "epoch": 0.24792183170482718, "grad_norm": 1.2283337116241455, "learning_rate": 4.513221153846154e-05, "loss": 0.8972, "step": 425 }, { "epoch": 0.24850517719119147, "grad_norm": 1.3164033889770508, "learning_rate": 4.5117187500000005e-05, "loss": 1.2549, "step": 426 }, { "epoch": 0.2490885226775558, "grad_norm": 1.2797776460647583, "learning_rate": 4.510216346153846e-05, "loss": 0.9427, "step": 427 }, { "epoch": 0.2496718681639201, "grad_norm": 1.0905731916427612, "learning_rate": 4.508713942307692e-05, "loss": 1.127, "step": 428 }, { "epoch": 0.25025521365028436, "grad_norm": 1.4118478298187256, "learning_rate": 4.507211538461539e-05, "loss": 0.9869, "step": 429 }, { "epoch": 0.25083855913664865, "grad_norm": 1.125809907913208, "learning_rate": 4.5057091346153846e-05, "loss": 1.0308, "step": 430 }, { "epoch": 0.251421904623013, "grad_norm": 1.1028636693954468, "learning_rate": 4.504206730769231e-05, "loss": 1.0587, "step": 431 }, { "epoch": 0.2520052501093773, "grad_norm": 1.59440279006958, "learning_rate": 4.5027043269230776e-05, "loss": 1.0767, "step": 432 }, { "epoch": 0.2525885955957416, "grad_norm": 1.067054033279419, "learning_rate": 4.501201923076923e-05, "loss": 1.1787, "step": 433 }, { "epoch": 0.2531719410821059, "grad_norm": 1.1260448694229126, "learning_rate": 4.499699519230769e-05, "loss": 1.0111, "step": 434 }, { "epoch": 0.2537552865684702, "grad_norm": 1.079969048500061, "learning_rate": 4.498197115384616e-05, "loss": 1.0204, "step": 435 }, { "epoch": 0.2543386320548345, "grad_norm": 1.1547199487686157, "learning_rate": 4.496694711538462e-05, "loss": 0.9295, "step": 436 }, { "epoch": 0.2549219775411988, "grad_norm": 1.3175557851791382, "learning_rate": 4.495192307692308e-05, "loss": 0.9133, "step": 437 }, { "epoch": 0.25550532302756307, "grad_norm": 1.323306918144226, "learning_rate": 4.493689903846154e-05, "loss": 0.9118, "step": 438 }, { "epoch": 0.25608866851392736, "grad_norm": 1.2758287191390991, "learning_rate": 4.4921875e-05, "loss": 1.2725, "step": 439 }, { "epoch": 0.25667201400029166, "grad_norm": 1.7028545141220093, "learning_rate": 4.4906850961538465e-05, "loss": 1.088, "step": 440 }, { "epoch": 0.25725535948665595, "grad_norm": 1.8174322843551636, "learning_rate": 4.489182692307692e-05, "loss": 1.1273, "step": 441 }, { "epoch": 0.25783870497302025, "grad_norm": 1.1969882249832153, "learning_rate": 4.487680288461539e-05, "loss": 1.2244, "step": 442 }, { "epoch": 0.25842205045938454, "grad_norm": 0.9966511726379395, "learning_rate": 4.486177884615385e-05, "loss": 1.1073, "step": 443 }, { "epoch": 0.2590053959457489, "grad_norm": 1.1165422201156616, "learning_rate": 4.484675480769231e-05, "loss": 0.9374, "step": 444 }, { "epoch": 0.2595887414321132, "grad_norm": 1.374568223953247, "learning_rate": 4.483173076923077e-05, "loss": 1.0658, "step": 445 }, { "epoch": 0.2601720869184775, "grad_norm": 1.2743726968765259, "learning_rate": 4.481670673076923e-05, "loss": 1.1376, "step": 446 }, { "epoch": 0.2607554324048418, "grad_norm": 1.2559353113174438, "learning_rate": 4.4801682692307694e-05, "loss": 0.9594, "step": 447 }, { "epoch": 0.2613387778912061, "grad_norm": 1.220955491065979, "learning_rate": 4.478665865384616e-05, "loss": 1.0246, "step": 448 }, { "epoch": 0.26192212337757037, "grad_norm": 1.0389074087142944, "learning_rate": 4.477163461538462e-05, "loss": 0.973, "step": 449 }, { "epoch": 0.26250546886393467, "grad_norm": 1.1613105535507202, "learning_rate": 4.4756610576923083e-05, "loss": 1.1086, "step": 450 }, { "epoch": 0.26308881435029896, "grad_norm": 1.147077202796936, "learning_rate": 4.4741586538461535e-05, "loss": 1.2115, "step": 451 }, { "epoch": 0.26367215983666326, "grad_norm": 1.1235055923461914, "learning_rate": 4.47265625e-05, "loss": 1.1007, "step": 452 }, { "epoch": 0.26425550532302755, "grad_norm": 2.1269114017486572, "learning_rate": 4.4711538461538466e-05, "loss": 1.1473, "step": 453 }, { "epoch": 0.26483885080939185, "grad_norm": 1.0030604600906372, "learning_rate": 4.4696514423076924e-05, "loss": 0.9971, "step": 454 }, { "epoch": 0.26542219629575614, "grad_norm": 1.013385534286499, "learning_rate": 4.468149038461539e-05, "loss": 0.9949, "step": 455 }, { "epoch": 0.26600554178212044, "grad_norm": 1.333153486251831, "learning_rate": 4.466646634615385e-05, "loss": 0.9749, "step": 456 }, { "epoch": 0.2665888872684848, "grad_norm": 1.3025758266448975, "learning_rate": 4.4651442307692306e-05, "loss": 0.961, "step": 457 }, { "epoch": 0.2671722327548491, "grad_norm": 1.1991068124771118, "learning_rate": 4.463641826923077e-05, "loss": 0.9674, "step": 458 }, { "epoch": 0.2677555782412134, "grad_norm": 1.5575584173202515, "learning_rate": 4.462139423076923e-05, "loss": 0.9976, "step": 459 }, { "epoch": 0.26833892372757767, "grad_norm": 1.234240174293518, "learning_rate": 4.4606370192307696e-05, "loss": 0.969, "step": 460 }, { "epoch": 0.26892226921394197, "grad_norm": 1.2866865396499634, "learning_rate": 4.459134615384616e-05, "loss": 0.7539, "step": 461 }, { "epoch": 0.26950561470030626, "grad_norm": 1.3814231157302856, "learning_rate": 4.457632211538462e-05, "loss": 1.014, "step": 462 }, { "epoch": 0.27008896018667056, "grad_norm": 1.2410728931427002, "learning_rate": 4.456129807692308e-05, "loss": 1.1885, "step": 463 }, { "epoch": 0.27067230567303485, "grad_norm": 1.1972429752349854, "learning_rate": 4.4546274038461536e-05, "loss": 1.1027, "step": 464 }, { "epoch": 0.27125565115939915, "grad_norm": 1.2238330841064453, "learning_rate": 4.453125e-05, "loss": 0.9025, "step": 465 }, { "epoch": 0.27183899664576344, "grad_norm": 1.2489745616912842, "learning_rate": 4.451622596153847e-05, "loss": 1.0432, "step": 466 }, { "epoch": 0.27242234213212774, "grad_norm": 1.2359799146652222, "learning_rate": 4.4501201923076925e-05, "loss": 1.0075, "step": 467 }, { "epoch": 0.27300568761849203, "grad_norm": 1.4064029455184937, "learning_rate": 4.448617788461539e-05, "loss": 1.1092, "step": 468 }, { "epoch": 0.2735890331048563, "grad_norm": 1.2376477718353271, "learning_rate": 4.447115384615384e-05, "loss": 1.0766, "step": 469 }, { "epoch": 0.2741723785912207, "grad_norm": 1.1279197931289673, "learning_rate": 4.445612980769231e-05, "loss": 1.1218, "step": 470 }, { "epoch": 0.274755724077585, "grad_norm": 1.5163652896881104, "learning_rate": 4.444110576923077e-05, "loss": 0.9557, "step": 471 }, { "epoch": 0.27533906956394927, "grad_norm": 1.198194146156311, "learning_rate": 4.442608173076923e-05, "loss": 0.8365, "step": 472 }, { "epoch": 0.27592241505031356, "grad_norm": 1.205476999282837, "learning_rate": 4.44110576923077e-05, "loss": 0.9057, "step": 473 }, { "epoch": 0.27650576053667786, "grad_norm": 1.0138362646102905, "learning_rate": 4.4396033653846155e-05, "loss": 1.0689, "step": 474 }, { "epoch": 0.27708910602304215, "grad_norm": 1.353697657585144, "learning_rate": 4.4381009615384614e-05, "loss": 0.9925, "step": 475 }, { "epoch": 0.27767245150940645, "grad_norm": 1.3365930318832397, "learning_rate": 4.436598557692308e-05, "loss": 0.8991, "step": 476 }, { "epoch": 0.27825579699577074, "grad_norm": 1.3026283979415894, "learning_rate": 4.435096153846154e-05, "loss": 1.1419, "step": 477 }, { "epoch": 0.27883914248213504, "grad_norm": 1.311883807182312, "learning_rate": 4.43359375e-05, "loss": 0.9451, "step": 478 }, { "epoch": 0.27942248796849933, "grad_norm": 0.950835108757019, "learning_rate": 4.432091346153847e-05, "loss": 0.8928, "step": 479 }, { "epoch": 0.28000583345486363, "grad_norm": 1.2371882200241089, "learning_rate": 4.4305889423076927e-05, "loss": 1.076, "step": 480 }, { "epoch": 0.2805891789412279, "grad_norm": 1.6330941915512085, "learning_rate": 4.4290865384615385e-05, "loss": 1.2121, "step": 481 }, { "epoch": 0.2811725244275922, "grad_norm": 1.3535494804382324, "learning_rate": 4.4275841346153844e-05, "loss": 1.1427, "step": 482 }, { "epoch": 0.2817558699139565, "grad_norm": 1.4552642107009888, "learning_rate": 4.426081730769231e-05, "loss": 1.0305, "step": 483 }, { "epoch": 0.28233921540032086, "grad_norm": 0.9442708492279053, "learning_rate": 4.4245793269230774e-05, "loss": 1.0597, "step": 484 }, { "epoch": 0.28292256088668516, "grad_norm": 1.0428327322006226, "learning_rate": 4.423076923076923e-05, "loss": 0.9091, "step": 485 }, { "epoch": 0.28350590637304945, "grad_norm": 1.2180997133255005, "learning_rate": 4.42157451923077e-05, "loss": 0.9809, "step": 486 }, { "epoch": 0.28408925185941375, "grad_norm": 1.153455138206482, "learning_rate": 4.4200721153846156e-05, "loss": 1.0152, "step": 487 }, { "epoch": 0.28467259734577804, "grad_norm": 1.2968738079071045, "learning_rate": 4.4185697115384615e-05, "loss": 1.0907, "step": 488 }, { "epoch": 0.28525594283214234, "grad_norm": 1.2618736028671265, "learning_rate": 4.417067307692308e-05, "loss": 0.9007, "step": 489 }, { "epoch": 0.28583928831850663, "grad_norm": 1.4131522178649902, "learning_rate": 4.415564903846154e-05, "loss": 1.0028, "step": 490 }, { "epoch": 0.28642263380487093, "grad_norm": 1.2588627338409424, "learning_rate": 4.4140625000000004e-05, "loss": 1.1391, "step": 491 }, { "epoch": 0.2870059792912352, "grad_norm": 1.4097051620483398, "learning_rate": 4.412560096153846e-05, "loss": 1.1168, "step": 492 }, { "epoch": 0.2875893247775995, "grad_norm": 0.9398725032806396, "learning_rate": 4.411057692307692e-05, "loss": 0.961, "step": 493 }, { "epoch": 0.2881726702639638, "grad_norm": 1.3721821308135986, "learning_rate": 4.4095552884615386e-05, "loss": 0.8813, "step": 494 }, { "epoch": 0.2887560157503281, "grad_norm": 1.4227555990219116, "learning_rate": 4.4080528846153845e-05, "loss": 1.2802, "step": 495 }, { "epoch": 0.2893393612366924, "grad_norm": 1.5191651582717896, "learning_rate": 4.406550480769231e-05, "loss": 1.1544, "step": 496 }, { "epoch": 0.28992270672305676, "grad_norm": 1.350059986114502, "learning_rate": 4.4050480769230775e-05, "loss": 1.1791, "step": 497 }, { "epoch": 0.29050605220942105, "grad_norm": 1.2704778909683228, "learning_rate": 4.4035456730769234e-05, "loss": 1.0238, "step": 498 }, { "epoch": 0.29108939769578535, "grad_norm": 1.3052599430084229, "learning_rate": 4.402043269230769e-05, "loss": 0.8804, "step": 499 }, { "epoch": 0.29167274318214964, "grad_norm": 1.4801050424575806, "learning_rate": 4.400540865384616e-05, "loss": 1.0356, "step": 500 }, { "epoch": 0.29225608866851394, "grad_norm": 1.1770102977752686, "learning_rate": 4.3990384615384616e-05, "loss": 0.9618, "step": 501 }, { "epoch": 0.29283943415487823, "grad_norm": 1.216208577156067, "learning_rate": 4.397536057692308e-05, "loss": 1.1352, "step": 502 }, { "epoch": 0.2934227796412425, "grad_norm": 1.1072677373886108, "learning_rate": 4.396033653846154e-05, "loss": 1.2635, "step": 503 }, { "epoch": 0.2940061251276068, "grad_norm": 1.2480254173278809, "learning_rate": 4.3945312500000005e-05, "loss": 1.0547, "step": 504 }, { "epoch": 0.2945894706139711, "grad_norm": 1.227055549621582, "learning_rate": 4.3930288461538464e-05, "loss": 1.1527, "step": 505 }, { "epoch": 0.2951728161003354, "grad_norm": 1.1896963119506836, "learning_rate": 4.391526442307692e-05, "loss": 1.0816, "step": 506 }, { "epoch": 0.2957561615866997, "grad_norm": 1.480958342552185, "learning_rate": 4.390024038461539e-05, "loss": 1.0701, "step": 507 }, { "epoch": 0.296339507073064, "grad_norm": 1.4093650579452515, "learning_rate": 4.3885216346153846e-05, "loss": 1.1059, "step": 508 }, { "epoch": 0.2969228525594283, "grad_norm": 1.1637531518936157, "learning_rate": 4.387019230769231e-05, "loss": 1.1347, "step": 509 }, { "epoch": 0.29750619804579265, "grad_norm": 1.185320496559143, "learning_rate": 4.385516826923077e-05, "loss": 1.0149, "step": 510 }, { "epoch": 0.29808954353215694, "grad_norm": 1.3317254781723022, "learning_rate": 4.384014423076923e-05, "loss": 0.9834, "step": 511 }, { "epoch": 0.29867288901852124, "grad_norm": 1.0526453256607056, "learning_rate": 4.3825120192307693e-05, "loss": 0.9779, "step": 512 }, { "epoch": 0.29925623450488553, "grad_norm": 1.2846801280975342, "learning_rate": 4.381009615384616e-05, "loss": 1.1655, "step": 513 }, { "epoch": 0.2998395799912498, "grad_norm": 1.4375282526016235, "learning_rate": 4.379507211538462e-05, "loss": 1.1173, "step": 514 }, { "epoch": 0.3004229254776141, "grad_norm": 1.0759106874465942, "learning_rate": 4.378004807692308e-05, "loss": 0.909, "step": 515 }, { "epoch": 0.3010062709639784, "grad_norm": 1.262014389038086, "learning_rate": 4.376502403846154e-05, "loss": 1.1499, "step": 516 }, { "epoch": 0.3015896164503427, "grad_norm": 1.5526944398880005, "learning_rate": 4.375e-05, "loss": 0.975, "step": 517 }, { "epoch": 0.302172961936707, "grad_norm": 1.2012171745300293, "learning_rate": 4.3734975961538465e-05, "loss": 0.791, "step": 518 }, { "epoch": 0.3027563074230713, "grad_norm": 1.248779058456421, "learning_rate": 4.371995192307692e-05, "loss": 0.9285, "step": 519 }, { "epoch": 0.3033396529094356, "grad_norm": 1.387321949005127, "learning_rate": 4.370492788461539e-05, "loss": 1.0779, "step": 520 }, { "epoch": 0.3039229983957999, "grad_norm": 1.4328728914260864, "learning_rate": 4.368990384615385e-05, "loss": 1.1295, "step": 521 }, { "epoch": 0.3045063438821642, "grad_norm": 1.231877326965332, "learning_rate": 4.367487980769231e-05, "loss": 0.9938, "step": 522 }, { "epoch": 0.30508968936852854, "grad_norm": 1.1031498908996582, "learning_rate": 4.365985576923077e-05, "loss": 1.0653, "step": 523 }, { "epoch": 0.30567303485489283, "grad_norm": 1.112518310546875, "learning_rate": 4.364483173076923e-05, "loss": 1.033, "step": 524 }, { "epoch": 0.30625638034125713, "grad_norm": 1.2784297466278076, "learning_rate": 4.3629807692307695e-05, "loss": 1.026, "step": 525 }, { "epoch": 0.3068397258276214, "grad_norm": 1.298776626586914, "learning_rate": 4.361478365384616e-05, "loss": 0.8997, "step": 526 }, { "epoch": 0.3074230713139857, "grad_norm": 1.134751319885254, "learning_rate": 4.359975961538462e-05, "loss": 1.2168, "step": 527 }, { "epoch": 0.30800641680035, "grad_norm": 1.0475013256072998, "learning_rate": 4.358473557692308e-05, "loss": 1.1632, "step": 528 }, { "epoch": 0.3085897622867143, "grad_norm": 1.1225156784057617, "learning_rate": 4.3569711538461535e-05, "loss": 1.0383, "step": 529 }, { "epoch": 0.3091731077730786, "grad_norm": 1.3873623609542847, "learning_rate": 4.35546875e-05, "loss": 1.0286, "step": 530 }, { "epoch": 0.3097564532594429, "grad_norm": 1.9602346420288086, "learning_rate": 4.3539663461538466e-05, "loss": 1.1737, "step": 531 }, { "epoch": 0.3103397987458072, "grad_norm": 1.5184245109558105, "learning_rate": 4.3524639423076925e-05, "loss": 1.0894, "step": 532 }, { "epoch": 0.3109231442321715, "grad_norm": 1.3252931833267212, "learning_rate": 4.350961538461539e-05, "loss": 1.0596, "step": 533 }, { "epoch": 0.3115064897185358, "grad_norm": 1.6967353820800781, "learning_rate": 4.349459134615385e-05, "loss": 1.0257, "step": 534 }, { "epoch": 0.3120898352049001, "grad_norm": 1.2516505718231201, "learning_rate": 4.347956730769231e-05, "loss": 1.3018, "step": 535 }, { "epoch": 0.3126731806912644, "grad_norm": 2.050544500350952, "learning_rate": 4.346454326923077e-05, "loss": 1.155, "step": 536 }, { "epoch": 0.3132565261776287, "grad_norm": 1.6617178916931152, "learning_rate": 4.344951923076923e-05, "loss": 0.9574, "step": 537 }, { "epoch": 0.313839871663993, "grad_norm": 1.0113744735717773, "learning_rate": 4.3434495192307696e-05, "loss": 1.0248, "step": 538 }, { "epoch": 0.3144232171503573, "grad_norm": 1.1463007926940918, "learning_rate": 4.341947115384616e-05, "loss": 1.0335, "step": 539 }, { "epoch": 0.3150065626367216, "grad_norm": 1.0310202836990356, "learning_rate": 4.340444711538462e-05, "loss": 0.8483, "step": 540 }, { "epoch": 0.3155899081230859, "grad_norm": 1.2262356281280518, "learning_rate": 4.338942307692308e-05, "loss": 1.1332, "step": 541 }, { "epoch": 0.3161732536094502, "grad_norm": 1.4348859786987305, "learning_rate": 4.337439903846154e-05, "loss": 0.9394, "step": 542 }, { "epoch": 0.3167565990958145, "grad_norm": 1.4490423202514648, "learning_rate": 4.3359375e-05, "loss": 1.1187, "step": 543 }, { "epoch": 0.3173399445821788, "grad_norm": 1.3444559574127197, "learning_rate": 4.334435096153847e-05, "loss": 1.0358, "step": 544 }, { "epoch": 0.3179232900685431, "grad_norm": 1.1789164543151855, "learning_rate": 4.3329326923076926e-05, "loss": 0.9231, "step": 545 }, { "epoch": 0.3185066355549074, "grad_norm": 1.231382966041565, "learning_rate": 4.3314302884615384e-05, "loss": 1.0525, "step": 546 }, { "epoch": 0.3190899810412717, "grad_norm": 1.0804648399353027, "learning_rate": 4.329927884615384e-05, "loss": 0.9816, "step": 547 }, { "epoch": 0.31967332652763597, "grad_norm": 1.2451800107955933, "learning_rate": 4.328425480769231e-05, "loss": 0.8803, "step": 548 }, { "epoch": 0.32025667201400027, "grad_norm": 1.2045650482177734, "learning_rate": 4.326923076923077e-05, "loss": 1.1134, "step": 549 }, { "epoch": 0.3208400175003646, "grad_norm": 1.3157379627227783, "learning_rate": 4.325420673076923e-05, "loss": 1.2101, "step": 550 }, { "epoch": 0.3214233629867289, "grad_norm": 1.2755893468856812, "learning_rate": 4.32391826923077e-05, "loss": 0.9826, "step": 551 }, { "epoch": 0.3220067084730932, "grad_norm": 1.2792096138000488, "learning_rate": 4.3224158653846156e-05, "loss": 0.8629, "step": 552 }, { "epoch": 0.3225900539594575, "grad_norm": 1.1155221462249756, "learning_rate": 4.3209134615384614e-05, "loss": 1.1298, "step": 553 }, { "epoch": 0.3231733994458218, "grad_norm": 1.3425167798995972, "learning_rate": 4.319411057692308e-05, "loss": 1.0914, "step": 554 }, { "epoch": 0.3237567449321861, "grad_norm": 2.2145228385925293, "learning_rate": 4.317908653846154e-05, "loss": 0.9944, "step": 555 }, { "epoch": 0.3243400904185504, "grad_norm": 1.3437769412994385, "learning_rate": 4.31640625e-05, "loss": 1.0431, "step": 556 }, { "epoch": 0.3249234359049147, "grad_norm": 1.253940463066101, "learning_rate": 4.314903846153847e-05, "loss": 0.8568, "step": 557 }, { "epoch": 0.325506781391279, "grad_norm": 1.6543011665344238, "learning_rate": 4.313401442307693e-05, "loss": 0.9353, "step": 558 }, { "epoch": 0.32609012687764327, "grad_norm": 1.3182803392410278, "learning_rate": 4.3118990384615385e-05, "loss": 0.8821, "step": 559 }, { "epoch": 0.32667347236400757, "grad_norm": 1.3994224071502686, "learning_rate": 4.3103966346153844e-05, "loss": 0.9704, "step": 560 }, { "epoch": 0.32725681785037186, "grad_norm": 1.1596894264221191, "learning_rate": 4.308894230769231e-05, "loss": 1.1954, "step": 561 }, { "epoch": 0.32784016333673616, "grad_norm": 1.1696090698242188, "learning_rate": 4.3073918269230774e-05, "loss": 0.994, "step": 562 }, { "epoch": 0.3284235088231005, "grad_norm": 1.0677425861358643, "learning_rate": 4.305889423076923e-05, "loss": 0.9962, "step": 563 }, { "epoch": 0.3290068543094648, "grad_norm": 1.27321457862854, "learning_rate": 4.304387019230769e-05, "loss": 1.1947, "step": 564 }, { "epoch": 0.3295901997958291, "grad_norm": 1.2170658111572266, "learning_rate": 4.302884615384616e-05, "loss": 1.2042, "step": 565 }, { "epoch": 0.3301735452821934, "grad_norm": 1.4280612468719482, "learning_rate": 4.3013822115384615e-05, "loss": 1.3016, "step": 566 }, { "epoch": 0.3307568907685577, "grad_norm": 1.2479727268218994, "learning_rate": 4.299879807692308e-05, "loss": 1.172, "step": 567 }, { "epoch": 0.331340236254922, "grad_norm": 1.192344069480896, "learning_rate": 4.298377403846154e-05, "loss": 1.0476, "step": 568 }, { "epoch": 0.3319235817412863, "grad_norm": 1.4639259576797485, "learning_rate": 4.2968750000000004e-05, "loss": 0.8669, "step": 569 }, { "epoch": 0.3325069272276506, "grad_norm": 1.4050641059875488, "learning_rate": 4.295372596153846e-05, "loss": 1.1723, "step": 570 }, { "epoch": 0.33309027271401487, "grad_norm": 1.117199420928955, "learning_rate": 4.293870192307692e-05, "loss": 1.0075, "step": 571 }, { "epoch": 0.33367361820037916, "grad_norm": 1.2617186307907104, "learning_rate": 4.2923677884615387e-05, "loss": 1.1577, "step": 572 }, { "epoch": 0.33425696368674346, "grad_norm": 1.269479513168335, "learning_rate": 4.2908653846153845e-05, "loss": 1.1862, "step": 573 }, { "epoch": 0.33484030917310775, "grad_norm": 1.3484537601470947, "learning_rate": 4.289362980769231e-05, "loss": 1.0458, "step": 574 }, { "epoch": 0.33542365465947205, "grad_norm": 1.2798513174057007, "learning_rate": 4.2878605769230776e-05, "loss": 1.1412, "step": 575 }, { "epoch": 0.3360070001458364, "grad_norm": 1.1217023134231567, "learning_rate": 4.2863581730769234e-05, "loss": 0.8785, "step": 576 }, { "epoch": 0.3365903456322007, "grad_norm": 1.2047514915466309, "learning_rate": 4.284855769230769e-05, "loss": 1.0627, "step": 577 }, { "epoch": 0.337173691118565, "grad_norm": 1.3945984840393066, "learning_rate": 4.283353365384616e-05, "loss": 1.122, "step": 578 }, { "epoch": 0.3377570366049293, "grad_norm": 1.466818928718567, "learning_rate": 4.2818509615384616e-05, "loss": 1.0109, "step": 579 }, { "epoch": 0.3383403820912936, "grad_norm": 1.4969720840454102, "learning_rate": 4.280348557692308e-05, "loss": 1.0855, "step": 580 }, { "epoch": 0.3389237275776579, "grad_norm": 1.2596534490585327, "learning_rate": 4.278846153846154e-05, "loss": 1.0654, "step": 581 }, { "epoch": 0.33950707306402217, "grad_norm": 1.1512525081634521, "learning_rate": 4.27734375e-05, "loss": 1.1286, "step": 582 }, { "epoch": 0.34009041855038646, "grad_norm": 1.2515615224838257, "learning_rate": 4.2758413461538464e-05, "loss": 1.2436, "step": 583 }, { "epoch": 0.34067376403675076, "grad_norm": 1.2093133926391602, "learning_rate": 4.274338942307692e-05, "loss": 1.0487, "step": 584 }, { "epoch": 0.34125710952311505, "grad_norm": 1.1588521003723145, "learning_rate": 4.272836538461539e-05, "loss": 1.1469, "step": 585 }, { "epoch": 0.34184045500947935, "grad_norm": 1.1132664680480957, "learning_rate": 4.2713341346153846e-05, "loss": 1.0547, "step": 586 }, { "epoch": 0.34242380049584364, "grad_norm": 1.2172892093658447, "learning_rate": 4.269831730769231e-05, "loss": 1.2139, "step": 587 }, { "epoch": 0.34300714598220794, "grad_norm": 1.605918526649475, "learning_rate": 4.268329326923077e-05, "loss": 0.9379, "step": 588 }, { "epoch": 0.34359049146857223, "grad_norm": 1.560250163078308, "learning_rate": 4.266826923076923e-05, "loss": 0.992, "step": 589 }, { "epoch": 0.3441738369549366, "grad_norm": 1.7285302877426147, "learning_rate": 4.2653245192307694e-05, "loss": 0.9653, "step": 590 }, { "epoch": 0.3447571824413009, "grad_norm": 1.458788514137268, "learning_rate": 4.263822115384616e-05, "loss": 1.1627, "step": 591 }, { "epoch": 0.3453405279276652, "grad_norm": 1.444248080253601, "learning_rate": 4.262319711538462e-05, "loss": 1.1464, "step": 592 }, { "epoch": 0.34592387341402947, "grad_norm": 1.3933836221694946, "learning_rate": 4.260817307692308e-05, "loss": 1.0868, "step": 593 }, { "epoch": 0.34650721890039377, "grad_norm": 1.6488311290740967, "learning_rate": 4.259314903846154e-05, "loss": 1.0009, "step": 594 }, { "epoch": 0.34709056438675806, "grad_norm": 1.1838133335113525, "learning_rate": 4.2578125e-05, "loss": 0.884, "step": 595 }, { "epoch": 0.34767390987312236, "grad_norm": 1.3585830926895142, "learning_rate": 4.2563100961538465e-05, "loss": 1.1594, "step": 596 }, { "epoch": 0.34825725535948665, "grad_norm": 1.4692409038543701, "learning_rate": 4.2548076923076924e-05, "loss": 1.0424, "step": 597 }, { "epoch": 0.34884060084585095, "grad_norm": 1.2625113725662231, "learning_rate": 4.253305288461539e-05, "loss": 1.171, "step": 598 }, { "epoch": 0.34942394633221524, "grad_norm": 1.2111209630966187, "learning_rate": 4.251802884615385e-05, "loss": 0.9783, "step": 599 }, { "epoch": 0.35000729181857954, "grad_norm": 1.224150538444519, "learning_rate": 4.2503004807692306e-05, "loss": 1.1189, "step": 600 }, { "epoch": 0.35000729181857954, "eval_loss_squad": 1.0769534187577665, "eval_perplexity": 7.957609155257721, "eval_perplexity_reconstruct": 1.950415498039559, "step": 600 }, { "epoch": 0.35059063730494383, "grad_norm": 2.5112569332122803, "learning_rate": 4.248798076923077e-05, "loss": 1.1328, "step": 601 }, { "epoch": 0.3511739827913081, "grad_norm": 1.6178394556045532, "learning_rate": 4.247295673076923e-05, "loss": 1.1492, "step": 602 }, { "epoch": 0.3517573282776725, "grad_norm": 0.9533804059028625, "learning_rate": 4.2457932692307695e-05, "loss": 1.1439, "step": 603 }, { "epoch": 0.35234067376403677, "grad_norm": 1.2348041534423828, "learning_rate": 4.244290865384616e-05, "loss": 0.8496, "step": 604 }, { "epoch": 0.35292401925040107, "grad_norm": 1.199015498161316, "learning_rate": 4.242788461538462e-05, "loss": 1.0841, "step": 605 }, { "epoch": 0.35350736473676536, "grad_norm": 1.264140248298645, "learning_rate": 4.241286057692308e-05, "loss": 0.9882, "step": 606 }, { "epoch": 0.35409071022312966, "grad_norm": 1.1264489889144897, "learning_rate": 4.2397836538461536e-05, "loss": 0.8837, "step": 607 }, { "epoch": 0.35467405570949395, "grad_norm": 1.0938150882720947, "learning_rate": 4.23828125e-05, "loss": 0.9441, "step": 608 }, { "epoch": 0.35525740119585825, "grad_norm": 1.190896987915039, "learning_rate": 4.2367788461538466e-05, "loss": 1.0634, "step": 609 }, { "epoch": 0.35584074668222254, "grad_norm": 1.4452359676361084, "learning_rate": 4.2352764423076925e-05, "loss": 0.786, "step": 610 }, { "epoch": 0.35642409216858684, "grad_norm": 1.373305082321167, "learning_rate": 4.233774038461539e-05, "loss": 0.9567, "step": 611 }, { "epoch": 0.35700743765495113, "grad_norm": 1.3388804197311401, "learning_rate": 4.232271634615385e-05, "loss": 0.9216, "step": 612 }, { "epoch": 0.3575907831413154, "grad_norm": 2.152236223220825, "learning_rate": 4.230769230769231e-05, "loss": 1.1881, "step": 613 }, { "epoch": 0.3581741286276797, "grad_norm": 1.3865232467651367, "learning_rate": 4.229266826923077e-05, "loss": 1.0252, "step": 614 }, { "epoch": 0.358757474114044, "grad_norm": 1.3287843465805054, "learning_rate": 4.227764423076923e-05, "loss": 1.1208, "step": 615 }, { "epoch": 0.35934081960040837, "grad_norm": 1.2160487174987793, "learning_rate": 4.2262620192307696e-05, "loss": 0.7792, "step": 616 }, { "epoch": 0.35992416508677266, "grad_norm": 1.3507564067840576, "learning_rate": 4.224759615384616e-05, "loss": 0.8473, "step": 617 }, { "epoch": 0.36050751057313696, "grad_norm": 1.738625168800354, "learning_rate": 4.223257211538461e-05, "loss": 1.016, "step": 618 }, { "epoch": 0.36109085605950125, "grad_norm": 1.3850781917572021, "learning_rate": 4.221754807692308e-05, "loss": 1.1247, "step": 619 }, { "epoch": 0.36167420154586555, "grad_norm": 1.1078225374221802, "learning_rate": 4.220252403846154e-05, "loss": 1.0403, "step": 620 }, { "epoch": 0.36225754703222984, "grad_norm": 1.5114189386367798, "learning_rate": 4.21875e-05, "loss": 1.136, "step": 621 }, { "epoch": 0.36284089251859414, "grad_norm": 1.1798583269119263, "learning_rate": 4.217247596153847e-05, "loss": 1.0254, "step": 622 }, { "epoch": 0.36342423800495843, "grad_norm": 1.6477906703948975, "learning_rate": 4.2157451923076926e-05, "loss": 1.0695, "step": 623 }, { "epoch": 0.36400758349132273, "grad_norm": 1.1232982873916626, "learning_rate": 4.2142427884615385e-05, "loss": 0.7686, "step": 624 }, { "epoch": 0.364590928977687, "grad_norm": 1.483515739440918, "learning_rate": 4.212740384615384e-05, "loss": 1.0878, "step": 625 }, { "epoch": 0.3651742744640513, "grad_norm": 1.361569881439209, "learning_rate": 4.211237980769231e-05, "loss": 1.0778, "step": 626 }, { "epoch": 0.3657576199504156, "grad_norm": 1.3304964303970337, "learning_rate": 4.2097355769230774e-05, "loss": 1.2297, "step": 627 }, { "epoch": 0.3663409654367799, "grad_norm": 1.2258186340332031, "learning_rate": 4.208233173076923e-05, "loss": 1.0869, "step": 628 }, { "epoch": 0.36692431092314426, "grad_norm": 2.0285935401916504, "learning_rate": 4.20673076923077e-05, "loss": 1.0043, "step": 629 }, { "epoch": 0.36750765640950855, "grad_norm": 1.278552770614624, "learning_rate": 4.2052283653846156e-05, "loss": 0.9999, "step": 630 }, { "epoch": 0.36809100189587285, "grad_norm": 1.9170531034469604, "learning_rate": 4.2037259615384614e-05, "loss": 0.8887, "step": 631 }, { "epoch": 0.36867434738223714, "grad_norm": 1.070654034614563, "learning_rate": 4.202223557692308e-05, "loss": 0.8389, "step": 632 }, { "epoch": 0.36925769286860144, "grad_norm": 1.3350582122802734, "learning_rate": 4.200721153846154e-05, "loss": 1.1057, "step": 633 }, { "epoch": 0.36984103835496573, "grad_norm": 1.2793062925338745, "learning_rate": 4.1992187500000003e-05, "loss": 1.0407, "step": 634 }, { "epoch": 0.37042438384133003, "grad_norm": 1.2050164937973022, "learning_rate": 4.197716346153847e-05, "loss": 1.0973, "step": 635 }, { "epoch": 0.3710077293276943, "grad_norm": 1.1398264169692993, "learning_rate": 4.196213942307692e-05, "loss": 1.162, "step": 636 }, { "epoch": 0.3715910748140586, "grad_norm": 1.2741928100585938, "learning_rate": 4.1947115384615386e-05, "loss": 0.9104, "step": 637 }, { "epoch": 0.3721744203004229, "grad_norm": 1.6818182468414307, "learning_rate": 4.1932091346153844e-05, "loss": 0.9304, "step": 638 }, { "epoch": 0.3727577657867872, "grad_norm": 1.0803583860397339, "learning_rate": 4.191706730769231e-05, "loss": 1.1518, "step": 639 }, { "epoch": 0.3733411112731515, "grad_norm": 1.178480625152588, "learning_rate": 4.1902043269230775e-05, "loss": 0.9778, "step": 640 }, { "epoch": 0.3739244567595158, "grad_norm": 1.3216358423233032, "learning_rate": 4.188701923076923e-05, "loss": 1.0803, "step": 641 }, { "epoch": 0.3745078022458801, "grad_norm": 1.191552758216858, "learning_rate": 4.187199519230769e-05, "loss": 1.0794, "step": 642 }, { "epoch": 0.37509114773224445, "grad_norm": 1.240075945854187, "learning_rate": 4.185697115384616e-05, "loss": 1.2029, "step": 643 }, { "epoch": 0.37567449321860874, "grad_norm": 1.2938767671585083, "learning_rate": 4.1841947115384616e-05, "loss": 1.1531, "step": 644 }, { "epoch": 0.37625783870497304, "grad_norm": 1.4226038455963135, "learning_rate": 4.182692307692308e-05, "loss": 1.0929, "step": 645 }, { "epoch": 0.37684118419133733, "grad_norm": 1.0986605882644653, "learning_rate": 4.181189903846154e-05, "loss": 0.8105, "step": 646 }, { "epoch": 0.3774245296777016, "grad_norm": 1.0812294483184814, "learning_rate": 4.1796875000000005e-05, "loss": 1.0476, "step": 647 }, { "epoch": 0.3780078751640659, "grad_norm": 1.7585773468017578, "learning_rate": 4.178185096153846e-05, "loss": 1.2113, "step": 648 }, { "epoch": 0.3785912206504302, "grad_norm": 1.1438621282577515, "learning_rate": 4.176682692307692e-05, "loss": 0.8594, "step": 649 }, { "epoch": 0.3791745661367945, "grad_norm": 1.1411939859390259, "learning_rate": 4.175180288461539e-05, "loss": 0.8793, "step": 650 }, { "epoch": 0.3797579116231588, "grad_norm": 1.1287474632263184, "learning_rate": 4.1736778846153845e-05, "loss": 0.8364, "step": 651 }, { "epoch": 0.3803412571095231, "grad_norm": 1.285798192024231, "learning_rate": 4.172175480769231e-05, "loss": 1.1573, "step": 652 }, { "epoch": 0.3809246025958874, "grad_norm": 1.1466621160507202, "learning_rate": 4.1706730769230776e-05, "loss": 0.9106, "step": 653 }, { "epoch": 0.3815079480822517, "grad_norm": 1.2659651041030884, "learning_rate": 4.1691706730769234e-05, "loss": 0.9637, "step": 654 }, { "epoch": 0.382091293568616, "grad_norm": 1.6592333316802979, "learning_rate": 4.167668269230769e-05, "loss": 0.9402, "step": 655 }, { "epoch": 0.38267463905498034, "grad_norm": 0.9642985463142395, "learning_rate": 4.166165865384616e-05, "loss": 0.9339, "step": 656 }, { "epoch": 0.38325798454134463, "grad_norm": 1.3237916231155396, "learning_rate": 4.164663461538462e-05, "loss": 1.1165, "step": 657 }, { "epoch": 0.3838413300277089, "grad_norm": 1.2888929843902588, "learning_rate": 4.163161057692308e-05, "loss": 1.1668, "step": 658 }, { "epoch": 0.3844246755140732, "grad_norm": 1.3504743576049805, "learning_rate": 4.161658653846154e-05, "loss": 1.2046, "step": 659 }, { "epoch": 0.3850080210004375, "grad_norm": 1.2326107025146484, "learning_rate": 4.16015625e-05, "loss": 0.8843, "step": 660 }, { "epoch": 0.3855913664868018, "grad_norm": 1.083722472190857, "learning_rate": 4.1586538461538464e-05, "loss": 0.8806, "step": 661 }, { "epoch": 0.3861747119731661, "grad_norm": 1.2427271604537964, "learning_rate": 4.157151442307692e-05, "loss": 1.1388, "step": 662 }, { "epoch": 0.3867580574595304, "grad_norm": 1.2181854248046875, "learning_rate": 4.155649038461539e-05, "loss": 0.979, "step": 663 }, { "epoch": 0.3873414029458947, "grad_norm": 1.6642446517944336, "learning_rate": 4.1541466346153847e-05, "loss": 1.0273, "step": 664 }, { "epoch": 0.387924748432259, "grad_norm": 3.092848539352417, "learning_rate": 4.152644230769231e-05, "loss": 0.9759, "step": 665 }, { "epoch": 0.3885080939186233, "grad_norm": 1.4167457818984985, "learning_rate": 4.151141826923077e-05, "loss": 1.1041, "step": 666 }, { "epoch": 0.3890914394049876, "grad_norm": 1.4154654741287231, "learning_rate": 4.149639423076923e-05, "loss": 1.0213, "step": 667 }, { "epoch": 0.3896747848913519, "grad_norm": 1.112176775932312, "learning_rate": 4.1481370192307694e-05, "loss": 0.9382, "step": 668 }, { "epoch": 0.39025813037771623, "grad_norm": 1.0001271963119507, "learning_rate": 4.146634615384616e-05, "loss": 1.0377, "step": 669 }, { "epoch": 0.3908414758640805, "grad_norm": 1.0494935512542725, "learning_rate": 4.145132211538462e-05, "loss": 1.0897, "step": 670 }, { "epoch": 0.3914248213504448, "grad_norm": 1.2128801345825195, "learning_rate": 4.143629807692308e-05, "loss": 0.9406, "step": 671 }, { "epoch": 0.3920081668368091, "grad_norm": 1.1072816848754883, "learning_rate": 4.142127403846154e-05, "loss": 0.9358, "step": 672 }, { "epoch": 0.3925915123231734, "grad_norm": 1.251513123512268, "learning_rate": 4.140625e-05, "loss": 1.0557, "step": 673 }, { "epoch": 0.3931748578095377, "grad_norm": 1.124248743057251, "learning_rate": 4.1391225961538465e-05, "loss": 1.1116, "step": 674 }, { "epoch": 0.393758203295902, "grad_norm": 1.1827398538589478, "learning_rate": 4.1376201923076924e-05, "loss": 1.1149, "step": 675 }, { "epoch": 0.3943415487822663, "grad_norm": 1.1932939291000366, "learning_rate": 4.136117788461539e-05, "loss": 0.865, "step": 676 }, { "epoch": 0.3949248942686306, "grad_norm": 1.1423388719558716, "learning_rate": 4.134615384615385e-05, "loss": 1.0161, "step": 677 }, { "epoch": 0.3955082397549949, "grad_norm": 1.1362109184265137, "learning_rate": 4.1331129807692306e-05, "loss": 1.1078, "step": 678 }, { "epoch": 0.3960915852413592, "grad_norm": 1.173803448677063, "learning_rate": 4.131610576923077e-05, "loss": 1.0828, "step": 679 }, { "epoch": 0.3966749307277235, "grad_norm": 1.1851235628128052, "learning_rate": 4.130108173076923e-05, "loss": 1.0101, "step": 680 }, { "epoch": 0.39725827621408777, "grad_norm": 1.1966065168380737, "learning_rate": 4.1286057692307695e-05, "loss": 1.0556, "step": 681 }, { "epoch": 0.3978416217004521, "grad_norm": 1.042441725730896, "learning_rate": 4.127103365384616e-05, "loss": 0.9449, "step": 682 }, { "epoch": 0.3984249671868164, "grad_norm": 1.3966326713562012, "learning_rate": 4.125600961538462e-05, "loss": 1.2354, "step": 683 }, { "epoch": 0.3990083126731807, "grad_norm": 1.7100861072540283, "learning_rate": 4.124098557692308e-05, "loss": 1.0369, "step": 684 }, { "epoch": 0.399591658159545, "grad_norm": 1.1547572612762451, "learning_rate": 4.1225961538461536e-05, "loss": 1.2298, "step": 685 }, { "epoch": 0.4001750036459093, "grad_norm": 1.2655994892120361, "learning_rate": 4.12109375e-05, "loss": 1.0823, "step": 686 }, { "epoch": 0.4007583491322736, "grad_norm": 1.1846497058868408, "learning_rate": 4.119591346153847e-05, "loss": 1.127, "step": 687 }, { "epoch": 0.4013416946186379, "grad_norm": 1.3547903299331665, "learning_rate": 4.1180889423076925e-05, "loss": 0.9981, "step": 688 }, { "epoch": 0.4019250401050022, "grad_norm": 1.16106379032135, "learning_rate": 4.116586538461539e-05, "loss": 1.0675, "step": 689 }, { "epoch": 0.4025083855913665, "grad_norm": 1.3457812070846558, "learning_rate": 4.115084134615385e-05, "loss": 1.0359, "step": 690 }, { "epoch": 0.4030917310777308, "grad_norm": 1.440121054649353, "learning_rate": 4.113581730769231e-05, "loss": 1.0681, "step": 691 }, { "epoch": 0.40367507656409507, "grad_norm": 1.382415533065796, "learning_rate": 4.112079326923077e-05, "loss": 0.9819, "step": 692 }, { "epoch": 0.40425842205045937, "grad_norm": 1.391350507736206, "learning_rate": 4.110576923076923e-05, "loss": 1.0845, "step": 693 }, { "epoch": 0.40484176753682366, "grad_norm": 1.2903047800064087, "learning_rate": 4.1090745192307696e-05, "loss": 1.0631, "step": 694 }, { "epoch": 0.405425113023188, "grad_norm": 1.1688337326049805, "learning_rate": 4.107572115384616e-05, "loss": 0.9691, "step": 695 }, { "epoch": 0.4060084585095523, "grad_norm": 1.327972173690796, "learning_rate": 4.1060697115384613e-05, "loss": 1.0152, "step": 696 }, { "epoch": 0.4065918039959166, "grad_norm": 1.1001508235931396, "learning_rate": 4.104567307692308e-05, "loss": 0.9932, "step": 697 }, { "epoch": 0.4071751494822809, "grad_norm": 1.1654032468795776, "learning_rate": 4.103064903846154e-05, "loss": 1.0077, "step": 698 }, { "epoch": 0.4077584949686452, "grad_norm": 1.3325926065444946, "learning_rate": 4.1015625e-05, "loss": 0.9754, "step": 699 }, { "epoch": 0.4083418404550095, "grad_norm": 1.0101975202560425, "learning_rate": 4.100060096153847e-05, "loss": 0.835, "step": 700 }, { "epoch": 0.4089251859413738, "grad_norm": 1.0377428531646729, "learning_rate": 4.0985576923076926e-05, "loss": 0.9193, "step": 701 }, { "epoch": 0.4095085314277381, "grad_norm": 1.4344565868377686, "learning_rate": 4.0970552884615385e-05, "loss": 0.981, "step": 702 }, { "epoch": 0.41009187691410237, "grad_norm": 1.0151726007461548, "learning_rate": 4.095552884615384e-05, "loss": 0.8954, "step": 703 }, { "epoch": 0.41067522240046667, "grad_norm": 1.2651805877685547, "learning_rate": 4.094050480769231e-05, "loss": 0.8135, "step": 704 }, { "epoch": 0.41125856788683096, "grad_norm": 1.1307834386825562, "learning_rate": 4.0925480769230774e-05, "loss": 1.0629, "step": 705 }, { "epoch": 0.41184191337319526, "grad_norm": 1.5362266302108765, "learning_rate": 4.091045673076923e-05, "loss": 1.1536, "step": 706 }, { "epoch": 0.41242525885955955, "grad_norm": 1.1772881746292114, "learning_rate": 4.08954326923077e-05, "loss": 1.3347, "step": 707 }, { "epoch": 0.41300860434592385, "grad_norm": 1.2242248058319092, "learning_rate": 4.0880408653846156e-05, "loss": 0.9295, "step": 708 }, { "epoch": 0.4135919498322882, "grad_norm": 1.2769187688827515, "learning_rate": 4.0865384615384615e-05, "loss": 1.0842, "step": 709 }, { "epoch": 0.4141752953186525, "grad_norm": 0.9419771432876587, "learning_rate": 4.085036057692308e-05, "loss": 0.9203, "step": 710 }, { "epoch": 0.4147586408050168, "grad_norm": 1.5197808742523193, "learning_rate": 4.083533653846154e-05, "loss": 0.8543, "step": 711 }, { "epoch": 0.4153419862913811, "grad_norm": 1.1694600582122803, "learning_rate": 4.0820312500000004e-05, "loss": 1.262, "step": 712 }, { "epoch": 0.4159253317777454, "grad_norm": 1.1101324558258057, "learning_rate": 4.080528846153847e-05, "loss": 0.8561, "step": 713 }, { "epoch": 0.4165086772641097, "grad_norm": 1.1828705072402954, "learning_rate": 4.079026442307692e-05, "loss": 0.8932, "step": 714 }, { "epoch": 0.41709202275047397, "grad_norm": 1.1211762428283691, "learning_rate": 4.0775240384615386e-05, "loss": 0.9816, "step": 715 }, { "epoch": 0.41767536823683826, "grad_norm": 1.2857762575149536, "learning_rate": 4.0760216346153845e-05, "loss": 1.0257, "step": 716 }, { "epoch": 0.41825871372320256, "grad_norm": 1.0974875688552856, "learning_rate": 4.074519230769231e-05, "loss": 0.6593, "step": 717 }, { "epoch": 0.41884205920956685, "grad_norm": 1.2576038837432861, "learning_rate": 4.0730168269230775e-05, "loss": 0.9487, "step": 718 }, { "epoch": 0.41942540469593115, "grad_norm": 1.1279481649398804, "learning_rate": 4.0715144230769234e-05, "loss": 1.0483, "step": 719 }, { "epoch": 0.42000875018229544, "grad_norm": 1.191821575164795, "learning_rate": 4.070012019230769e-05, "loss": 1.0075, "step": 720 }, { "epoch": 0.42059209566865974, "grad_norm": 0.9579320549964905, "learning_rate": 4.068509615384616e-05, "loss": 1.1948, "step": 721 }, { "epoch": 0.4211754411550241, "grad_norm": 1.1844723224639893, "learning_rate": 4.0670072115384616e-05, "loss": 1.0395, "step": 722 }, { "epoch": 0.4217587866413884, "grad_norm": 1.192607045173645, "learning_rate": 4.065504807692308e-05, "loss": 0.8447, "step": 723 }, { "epoch": 0.4223421321277527, "grad_norm": 1.256598949432373, "learning_rate": 4.064002403846154e-05, "loss": 0.8741, "step": 724 }, { "epoch": 0.422925477614117, "grad_norm": 1.1558480262756348, "learning_rate": 4.0625000000000005e-05, "loss": 1.1007, "step": 725 }, { "epoch": 0.42350882310048127, "grad_norm": 1.2486516237258911, "learning_rate": 4.0609975961538463e-05, "loss": 1.0104, "step": 726 }, { "epoch": 0.42409216858684556, "grad_norm": 1.1530859470367432, "learning_rate": 4.059495192307692e-05, "loss": 0.9664, "step": 727 }, { "epoch": 0.42467551407320986, "grad_norm": 1.1082444190979004, "learning_rate": 4.057992788461539e-05, "loss": 1.2137, "step": 728 }, { "epoch": 0.42525885955957415, "grad_norm": 1.1947239637374878, "learning_rate": 4.0564903846153846e-05, "loss": 1.1242, "step": 729 }, { "epoch": 0.42584220504593845, "grad_norm": 1.185736060142517, "learning_rate": 4.054987980769231e-05, "loss": 1.0753, "step": 730 }, { "epoch": 0.42642555053230274, "grad_norm": 1.292222499847412, "learning_rate": 4.0534855769230776e-05, "loss": 1.2377, "step": 731 }, { "epoch": 0.42700889601866704, "grad_norm": 1.3122601509094238, "learning_rate": 4.051983173076923e-05, "loss": 1.0001, "step": 732 }, { "epoch": 0.42759224150503133, "grad_norm": 1.284077525138855, "learning_rate": 4.050480769230769e-05, "loss": 0.8811, "step": 733 }, { "epoch": 0.42817558699139563, "grad_norm": 1.266852855682373, "learning_rate": 4.048978365384616e-05, "loss": 0.8545, "step": 734 }, { "epoch": 0.42875893247776, "grad_norm": 1.1647552251815796, "learning_rate": 4.047475961538462e-05, "loss": 0.9484, "step": 735 }, { "epoch": 0.4293422779641243, "grad_norm": 1.3035751581192017, "learning_rate": 4.045973557692308e-05, "loss": 1.0405, "step": 736 }, { "epoch": 0.42992562345048857, "grad_norm": 1.1369545459747314, "learning_rate": 4.044471153846154e-05, "loss": 0.8982, "step": 737 }, { "epoch": 0.43050896893685286, "grad_norm": 1.1648366451263428, "learning_rate": 4.04296875e-05, "loss": 1.0459, "step": 738 }, { "epoch": 0.43109231442321716, "grad_norm": 1.3287396430969238, "learning_rate": 4.0414663461538465e-05, "loss": 1.0519, "step": 739 }, { "epoch": 0.43167565990958146, "grad_norm": 1.1275254487991333, "learning_rate": 4.039963942307692e-05, "loss": 0.8556, "step": 740 }, { "epoch": 0.43225900539594575, "grad_norm": 1.3335869312286377, "learning_rate": 4.038461538461539e-05, "loss": 0.9997, "step": 741 }, { "epoch": 0.43284235088231005, "grad_norm": 1.073275089263916, "learning_rate": 4.036959134615385e-05, "loss": 1.1496, "step": 742 }, { "epoch": 0.43342569636867434, "grad_norm": 1.194717288017273, "learning_rate": 4.035456730769231e-05, "loss": 0.958, "step": 743 }, { "epoch": 0.43400904185503864, "grad_norm": 1.55983567237854, "learning_rate": 4.033954326923077e-05, "loss": 1.2247, "step": 744 }, { "epoch": 0.43459238734140293, "grad_norm": 2.449402093887329, "learning_rate": 4.032451923076923e-05, "loss": 1.2115, "step": 745 }, { "epoch": 0.4351757328277672, "grad_norm": 1.2797013521194458, "learning_rate": 4.0309495192307694e-05, "loss": 0.8518, "step": 746 }, { "epoch": 0.4357590783141315, "grad_norm": 1.1281001567840576, "learning_rate": 4.029447115384616e-05, "loss": 1.0215, "step": 747 }, { "epoch": 0.43634242380049587, "grad_norm": 1.1771413087844849, "learning_rate": 4.027944711538462e-05, "loss": 1.1645, "step": 748 }, { "epoch": 0.43692576928686017, "grad_norm": 1.181060552597046, "learning_rate": 4.0264423076923083e-05, "loss": 0.8806, "step": 749 }, { "epoch": 0.43750911477322446, "grad_norm": 1.2284411191940308, "learning_rate": 4.0249399038461535e-05, "loss": 1.1494, "step": 750 }, { "epoch": 0.43809246025958876, "grad_norm": 1.0676560401916504, "learning_rate": 4.0234375e-05, "loss": 0.9234, "step": 751 }, { "epoch": 0.43867580574595305, "grad_norm": 1.481394648551941, "learning_rate": 4.0219350961538466e-05, "loss": 0.904, "step": 752 }, { "epoch": 0.43925915123231735, "grad_norm": 1.0527502298355103, "learning_rate": 4.0204326923076924e-05, "loss": 1.0413, "step": 753 }, { "epoch": 0.43984249671868164, "grad_norm": 1.2585145235061646, "learning_rate": 4.018930288461539e-05, "loss": 0.9207, "step": 754 }, { "epoch": 0.44042584220504594, "grad_norm": 1.5194056034088135, "learning_rate": 4.017427884615385e-05, "loss": 1.0445, "step": 755 }, { "epoch": 0.44100918769141023, "grad_norm": 1.1574163436889648, "learning_rate": 4.0159254807692307e-05, "loss": 0.8065, "step": 756 }, { "epoch": 0.4415925331777745, "grad_norm": 1.351405143737793, "learning_rate": 4.014423076923077e-05, "loss": 0.914, "step": 757 }, { "epoch": 0.4421758786641388, "grad_norm": 1.1521351337432861, "learning_rate": 4.012920673076923e-05, "loss": 0.9017, "step": 758 }, { "epoch": 0.4427592241505031, "grad_norm": 1.9487134218215942, "learning_rate": 4.0114182692307696e-05, "loss": 1.03, "step": 759 }, { "epoch": 0.4433425696368674, "grad_norm": 2.560194969177246, "learning_rate": 4.009915865384616e-05, "loss": 1.0656, "step": 760 }, { "epoch": 0.4439259151232317, "grad_norm": 1.140284538269043, "learning_rate": 4.008413461538462e-05, "loss": 1.0233, "step": 761 }, { "epoch": 0.44450926060959606, "grad_norm": 1.3260776996612549, "learning_rate": 4.006911057692308e-05, "loss": 1.0017, "step": 762 }, { "epoch": 0.44509260609596035, "grad_norm": 1.1119332313537598, "learning_rate": 4.0054086538461536e-05, "loss": 1.2699, "step": 763 }, { "epoch": 0.44567595158232465, "grad_norm": 1.3355944156646729, "learning_rate": 4.00390625e-05, "loss": 0.9841, "step": 764 }, { "epoch": 0.44625929706868894, "grad_norm": 1.3637195825576782, "learning_rate": 4.002403846153847e-05, "loss": 1.0625, "step": 765 }, { "epoch": 0.44684264255505324, "grad_norm": 1.1988649368286133, "learning_rate": 4.0009014423076925e-05, "loss": 1.0488, "step": 766 }, { "epoch": 0.44742598804141753, "grad_norm": 2.336710214614868, "learning_rate": 3.999399038461539e-05, "loss": 1.1126, "step": 767 }, { "epoch": 0.44800933352778183, "grad_norm": 1.3706448078155518, "learning_rate": 3.997896634615384e-05, "loss": 1.0118, "step": 768 }, { "epoch": 0.4485926790141461, "grad_norm": 1.171186089515686, "learning_rate": 3.996394230769231e-05, "loss": 1.024, "step": 769 }, { "epoch": 0.4491760245005104, "grad_norm": 1.424375057220459, "learning_rate": 3.994891826923077e-05, "loss": 1.0885, "step": 770 }, { "epoch": 0.4497593699868747, "grad_norm": 1.218055009841919, "learning_rate": 3.993389423076923e-05, "loss": 0.9143, "step": 771 }, { "epoch": 0.450342715473239, "grad_norm": 1.1051896810531616, "learning_rate": 3.99188701923077e-05, "loss": 0.8874, "step": 772 }, { "epoch": 0.4509260609596033, "grad_norm": 1.2627577781677246, "learning_rate": 3.9903846153846155e-05, "loss": 0.9562, "step": 773 }, { "epoch": 0.4515094064459676, "grad_norm": 2.4132933616638184, "learning_rate": 3.9888822115384614e-05, "loss": 1.0477, "step": 774 }, { "epoch": 0.45209275193233195, "grad_norm": 1.4425148963928223, "learning_rate": 3.987379807692308e-05, "loss": 0.8765, "step": 775 }, { "epoch": 0.45267609741869624, "grad_norm": 1.0001459121704102, "learning_rate": 3.985877403846154e-05, "loss": 0.9992, "step": 776 }, { "epoch": 0.45325944290506054, "grad_norm": 1.2499384880065918, "learning_rate": 3.984375e-05, "loss": 1.0017, "step": 777 }, { "epoch": 0.45384278839142483, "grad_norm": 1.0234681367874146, "learning_rate": 3.982872596153847e-05, "loss": 0.9316, "step": 778 }, { "epoch": 0.45442613387778913, "grad_norm": 1.1663625240325928, "learning_rate": 3.981370192307693e-05, "loss": 1.0178, "step": 779 }, { "epoch": 0.4550094793641534, "grad_norm": 1.0986937284469604, "learning_rate": 3.9798677884615385e-05, "loss": 0.9092, "step": 780 }, { "epoch": 0.4555928248505177, "grad_norm": 1.2490792274475098, "learning_rate": 3.9783653846153844e-05, "loss": 1.046, "step": 781 }, { "epoch": 0.456176170336882, "grad_norm": 1.217383861541748, "learning_rate": 3.976862980769231e-05, "loss": 0.9513, "step": 782 }, { "epoch": 0.4567595158232463, "grad_norm": 1.4201329946517944, "learning_rate": 3.9753605769230774e-05, "loss": 1.0895, "step": 783 }, { "epoch": 0.4573428613096106, "grad_norm": 1.3387080430984497, "learning_rate": 3.973858173076923e-05, "loss": 0.9925, "step": 784 }, { "epoch": 0.4579262067959749, "grad_norm": 1.2447290420532227, "learning_rate": 3.97235576923077e-05, "loss": 1.1017, "step": 785 }, { "epoch": 0.4585095522823392, "grad_norm": 1.403903841972351, "learning_rate": 3.9708533653846156e-05, "loss": 0.8692, "step": 786 }, { "epoch": 0.4590928977687035, "grad_norm": 1.3049734830856323, "learning_rate": 3.9693509615384615e-05, "loss": 1.1045, "step": 787 }, { "epoch": 0.45967624325506784, "grad_norm": 2.3408448696136475, "learning_rate": 3.967848557692308e-05, "loss": 1.0606, "step": 788 }, { "epoch": 0.46025958874143214, "grad_norm": 1.189640760421753, "learning_rate": 3.966346153846154e-05, "loss": 1.0287, "step": 789 }, { "epoch": 0.46084293422779643, "grad_norm": 1.1157665252685547, "learning_rate": 3.9648437500000004e-05, "loss": 0.9822, "step": 790 }, { "epoch": 0.4614262797141607, "grad_norm": 1.242864966392517, "learning_rate": 3.963341346153846e-05, "loss": 0.827, "step": 791 }, { "epoch": 0.462009625200525, "grad_norm": 1.1176204681396484, "learning_rate": 3.961838942307692e-05, "loss": 1.2449, "step": 792 }, { "epoch": 0.4625929706868893, "grad_norm": 1.0381675958633423, "learning_rate": 3.9603365384615386e-05, "loss": 1.1754, "step": 793 }, { "epoch": 0.4631763161732536, "grad_norm": 1.2429289817810059, "learning_rate": 3.9588341346153845e-05, "loss": 1.0103, "step": 794 }, { "epoch": 0.4637596616596179, "grad_norm": 1.214370608329773, "learning_rate": 3.957331730769231e-05, "loss": 0.9133, "step": 795 }, { "epoch": 0.4643430071459822, "grad_norm": 1.249859094619751, "learning_rate": 3.9558293269230775e-05, "loss": 1.2252, "step": 796 }, { "epoch": 0.4649263526323465, "grad_norm": 1.1522125005722046, "learning_rate": 3.9543269230769234e-05, "loss": 0.8547, "step": 797 }, { "epoch": 0.4655096981187108, "grad_norm": 1.0914942026138306, "learning_rate": 3.952824519230769e-05, "loss": 1.1068, "step": 798 }, { "epoch": 0.4660930436050751, "grad_norm": 1.2309699058532715, "learning_rate": 3.951322115384616e-05, "loss": 1.1301, "step": 799 }, { "epoch": 0.4666763890914394, "grad_norm": 1.3135758638381958, "learning_rate": 3.9498197115384616e-05, "loss": 0.947, "step": 800 }, { "epoch": 0.4666763890914394, "eval_loss_squad": 0.9123072922043503, "eval_perplexity": 7.919284365740796, "eval_perplexity_reconstruct": 1.9212373943429781, "step": 800 }, { "epoch": 0.46725973457780373, "grad_norm": 1.0412598848342896, "learning_rate": 3.948317307692308e-05, "loss": 1.1682, "step": 801 }, { "epoch": 0.467843080064168, "grad_norm": 0.986031711101532, "learning_rate": 3.946814903846154e-05, "loss": 0.7983, "step": 802 }, { "epoch": 0.4684264255505323, "grad_norm": 1.149339199066162, "learning_rate": 3.9453125000000005e-05, "loss": 0.9834, "step": 803 }, { "epoch": 0.4690097710368966, "grad_norm": 1.2143068313598633, "learning_rate": 3.9438100961538464e-05, "loss": 1.1033, "step": 804 }, { "epoch": 0.4695931165232609, "grad_norm": 1.015660285949707, "learning_rate": 3.942307692307692e-05, "loss": 0.8502, "step": 805 }, { "epoch": 0.4701764620096252, "grad_norm": 1.3838074207305908, "learning_rate": 3.940805288461539e-05, "loss": 0.9986, "step": 806 }, { "epoch": 0.4707598074959895, "grad_norm": 1.1482564210891724, "learning_rate": 3.9393028846153846e-05, "loss": 1.0143, "step": 807 }, { "epoch": 0.4713431529823538, "grad_norm": 1.0859732627868652, "learning_rate": 3.937800480769231e-05, "loss": 0.9108, "step": 808 }, { "epoch": 0.4719264984687181, "grad_norm": 1.4561444520950317, "learning_rate": 3.936298076923077e-05, "loss": 0.7463, "step": 809 }, { "epoch": 0.4725098439550824, "grad_norm": 1.909414529800415, "learning_rate": 3.934795673076923e-05, "loss": 1.0691, "step": 810 }, { "epoch": 0.4730931894414467, "grad_norm": 2.1978914737701416, "learning_rate": 3.9332932692307694e-05, "loss": 1.263, "step": 811 }, { "epoch": 0.473676534927811, "grad_norm": 1.4734632968902588, "learning_rate": 3.931790865384616e-05, "loss": 0.8494, "step": 812 }, { "epoch": 0.47425988041417527, "grad_norm": 1.2567843198776245, "learning_rate": 3.930288461538462e-05, "loss": 1.0615, "step": 813 }, { "epoch": 0.47484322590053957, "grad_norm": 1.2729908227920532, "learning_rate": 3.928786057692308e-05, "loss": 0.8998, "step": 814 }, { "epoch": 0.4754265713869039, "grad_norm": 1.0162193775177002, "learning_rate": 3.927283653846154e-05, "loss": 1.0442, "step": 815 }, { "epoch": 0.4760099168732682, "grad_norm": 1.0893824100494385, "learning_rate": 3.92578125e-05, "loss": 0.9158, "step": 816 }, { "epoch": 0.4765932623596325, "grad_norm": 0.9300756454467773, "learning_rate": 3.9242788461538465e-05, "loss": 0.9869, "step": 817 }, { "epoch": 0.4771766078459968, "grad_norm": 1.3809287548065186, "learning_rate": 3.9227764423076923e-05, "loss": 0.9937, "step": 818 }, { "epoch": 0.4777599533323611, "grad_norm": 1.1738115549087524, "learning_rate": 3.921274038461539e-05, "loss": 1.0634, "step": 819 }, { "epoch": 0.4783432988187254, "grad_norm": 1.2441519498825073, "learning_rate": 3.919771634615385e-05, "loss": 0.7852, "step": 820 }, { "epoch": 0.4789266443050897, "grad_norm": 0.8443605303764343, "learning_rate": 3.918269230769231e-05, "loss": 1.0473, "step": 821 }, { "epoch": 0.479509989791454, "grad_norm": 1.2155978679656982, "learning_rate": 3.916766826923077e-05, "loss": 0.9415, "step": 822 }, { "epoch": 0.4800933352778183, "grad_norm": 1.1366358995437622, "learning_rate": 3.915264423076923e-05, "loss": 0.9134, "step": 823 }, { "epoch": 0.4806766807641826, "grad_norm": 1.1710559129714966, "learning_rate": 3.9137620192307695e-05, "loss": 1.0786, "step": 824 }, { "epoch": 0.48126002625054687, "grad_norm": 1.0255122184753418, "learning_rate": 3.912259615384616e-05, "loss": 1.0628, "step": 825 }, { "epoch": 0.48184337173691116, "grad_norm": 0.9826768636703491, "learning_rate": 3.910757211538462e-05, "loss": 0.8179, "step": 826 }, { "epoch": 0.48242671722327546, "grad_norm": 1.3819292783737183, "learning_rate": 3.909254807692308e-05, "loss": 1.0797, "step": 827 }, { "epoch": 0.4830100627096398, "grad_norm": 1.038661003112793, "learning_rate": 3.9077524038461536e-05, "loss": 1.2463, "step": 828 }, { "epoch": 0.4835934081960041, "grad_norm": 1.517325758934021, "learning_rate": 3.90625e-05, "loss": 1.0843, "step": 829 }, { "epoch": 0.4841767536823684, "grad_norm": 1.1275871992111206, "learning_rate": 3.9047475961538466e-05, "loss": 1.0089, "step": 830 }, { "epoch": 0.4847600991687327, "grad_norm": 2.5894081592559814, "learning_rate": 3.9032451923076925e-05, "loss": 1.0206, "step": 831 }, { "epoch": 0.485343444655097, "grad_norm": 1.2647501230239868, "learning_rate": 3.901742788461539e-05, "loss": 1.0476, "step": 832 }, { "epoch": 0.4859267901414613, "grad_norm": 1.1321070194244385, "learning_rate": 3.900240384615385e-05, "loss": 0.9772, "step": 833 }, { "epoch": 0.4865101356278256, "grad_norm": 1.4880715608596802, "learning_rate": 3.898737980769231e-05, "loss": 0.7914, "step": 834 }, { "epoch": 0.4870934811141899, "grad_norm": 1.07522714138031, "learning_rate": 3.897235576923077e-05, "loss": 0.9263, "step": 835 }, { "epoch": 0.48767682660055417, "grad_norm": 1.0414209365844727, "learning_rate": 3.895733173076923e-05, "loss": 1.2107, "step": 836 }, { "epoch": 0.48826017208691846, "grad_norm": 1.1117522716522217, "learning_rate": 3.8942307692307696e-05, "loss": 0.833, "step": 837 }, { "epoch": 0.48884351757328276, "grad_norm": 1.225772500038147, "learning_rate": 3.892728365384616e-05, "loss": 1.0641, "step": 838 }, { "epoch": 0.48942686305964705, "grad_norm": 1.2368497848510742, "learning_rate": 3.891225961538462e-05, "loss": 0.9278, "step": 839 }, { "epoch": 0.49001020854601135, "grad_norm": 1.068732500076294, "learning_rate": 3.889723557692308e-05, "loss": 0.8247, "step": 840 }, { "epoch": 0.4905935540323757, "grad_norm": 1.1534667015075684, "learning_rate": 3.888221153846154e-05, "loss": 0.9824, "step": 841 }, { "epoch": 0.49117689951874, "grad_norm": 1.3575752973556519, "learning_rate": 3.88671875e-05, "loss": 1.0802, "step": 842 }, { "epoch": 0.4917602450051043, "grad_norm": 1.2261594533920288, "learning_rate": 3.885216346153847e-05, "loss": 1.1202, "step": 843 }, { "epoch": 0.4923435904914686, "grad_norm": 1.2742036581039429, "learning_rate": 3.8837139423076926e-05, "loss": 1.0831, "step": 844 }, { "epoch": 0.4929269359778329, "grad_norm": 1.2143781185150146, "learning_rate": 3.8822115384615384e-05, "loss": 1.0475, "step": 845 }, { "epoch": 0.4935102814641972, "grad_norm": 1.1177541017532349, "learning_rate": 3.880709134615384e-05, "loss": 1.0494, "step": 846 }, { "epoch": 0.49409362695056147, "grad_norm": 1.1861687898635864, "learning_rate": 3.879206730769231e-05, "loss": 0.9165, "step": 847 }, { "epoch": 0.49467697243692577, "grad_norm": 1.2765378952026367, "learning_rate": 3.877704326923077e-05, "loss": 0.891, "step": 848 }, { "epoch": 0.49526031792329006, "grad_norm": 1.1258078813552856, "learning_rate": 3.876201923076923e-05, "loss": 0.9761, "step": 849 }, { "epoch": 0.49584366340965436, "grad_norm": 1.1065664291381836, "learning_rate": 3.87469951923077e-05, "loss": 0.9383, "step": 850 }, { "epoch": 0.49642700889601865, "grad_norm": 1.1673333644866943, "learning_rate": 3.8731971153846156e-05, "loss": 0.9632, "step": 851 }, { "epoch": 0.49701035438238295, "grad_norm": 1.428477168083191, "learning_rate": 3.8716947115384614e-05, "loss": 0.9079, "step": 852 }, { "epoch": 0.49759369986874724, "grad_norm": 1.1232894659042358, "learning_rate": 3.870192307692308e-05, "loss": 0.9956, "step": 853 }, { "epoch": 0.4981770453551116, "grad_norm": 1.2544480562210083, "learning_rate": 3.868689903846154e-05, "loss": 1.0855, "step": 854 }, { "epoch": 0.4987603908414759, "grad_norm": 1.152759313583374, "learning_rate": 3.8671875e-05, "loss": 1.0324, "step": 855 }, { "epoch": 0.4993437363278402, "grad_norm": 1.1294784545898438, "learning_rate": 3.865685096153847e-05, "loss": 0.9175, "step": 856 }, { "epoch": 0.4999270818142045, "grad_norm": 1.2857279777526855, "learning_rate": 3.864182692307693e-05, "loss": 1.2653, "step": 857 }, { "epoch": 0.5005104273005687, "grad_norm": 1.9902052879333496, "learning_rate": 3.8626802884615385e-05, "loss": 0.9604, "step": 858 }, { "epoch": 0.501093772786933, "grad_norm": 1.4721367359161377, "learning_rate": 3.8611778846153844e-05, "loss": 1.066, "step": 859 }, { "epoch": 0.5016771182732973, "grad_norm": 1.022174596786499, "learning_rate": 3.859675480769231e-05, "loss": 0.9604, "step": 860 }, { "epoch": 0.5022604637596617, "grad_norm": 1.3919785022735596, "learning_rate": 3.8581730769230775e-05, "loss": 0.9068, "step": 861 }, { "epoch": 0.502843809246026, "grad_norm": 1.2962651252746582, "learning_rate": 3.856670673076923e-05, "loss": 1.1166, "step": 862 }, { "epoch": 0.5034271547323903, "grad_norm": 1.2550551891326904, "learning_rate": 3.855168269230769e-05, "loss": 0.9605, "step": 863 }, { "epoch": 0.5040105002187546, "grad_norm": 1.1890307664871216, "learning_rate": 3.853665865384616e-05, "loss": 1.1311, "step": 864 }, { "epoch": 0.5045938457051189, "grad_norm": 1.2987920045852661, "learning_rate": 3.8521634615384615e-05, "loss": 1.0043, "step": 865 }, { "epoch": 0.5051771911914832, "grad_norm": 1.0546956062316895, "learning_rate": 3.850661057692308e-05, "loss": 1.1154, "step": 866 }, { "epoch": 0.5057605366778475, "grad_norm": 1.1739052534103394, "learning_rate": 3.849158653846154e-05, "loss": 1.0501, "step": 867 }, { "epoch": 0.5063438821642118, "grad_norm": 1.171311378479004, "learning_rate": 3.8476562500000004e-05, "loss": 1.109, "step": 868 }, { "epoch": 0.5069272276505761, "grad_norm": 1.0606648921966553, "learning_rate": 3.846153846153846e-05, "loss": 0.9756, "step": 869 }, { "epoch": 0.5075105731369404, "grad_norm": 1.1545205116271973, "learning_rate": 3.844651442307692e-05, "loss": 1.073, "step": 870 }, { "epoch": 0.5080939186233047, "grad_norm": 1.2131757736206055, "learning_rate": 3.843149038461539e-05, "loss": 1.1398, "step": 871 }, { "epoch": 0.508677264109669, "grad_norm": 1.140005111694336, "learning_rate": 3.8416466346153845e-05, "loss": 0.9707, "step": 872 }, { "epoch": 0.5092606095960333, "grad_norm": 1.253605604171753, "learning_rate": 3.840144230769231e-05, "loss": 1.1389, "step": 873 }, { "epoch": 0.5098439550823975, "grad_norm": 1.2395936250686646, "learning_rate": 3.8386418269230776e-05, "loss": 1.0647, "step": 874 }, { "epoch": 0.5104273005687618, "grad_norm": 1.092179298400879, "learning_rate": 3.8371394230769234e-05, "loss": 1.0585, "step": 875 }, { "epoch": 0.5110106460551261, "grad_norm": 1.6231690645217896, "learning_rate": 3.835637019230769e-05, "loss": 0.8506, "step": 876 }, { "epoch": 0.5115939915414904, "grad_norm": 1.1169666051864624, "learning_rate": 3.834134615384616e-05, "loss": 0.8958, "step": 877 }, { "epoch": 0.5121773370278547, "grad_norm": 1.2263548374176025, "learning_rate": 3.8326322115384616e-05, "loss": 0.9337, "step": 878 }, { "epoch": 0.512760682514219, "grad_norm": 1.1182410717010498, "learning_rate": 3.831129807692308e-05, "loss": 0.8743, "step": 879 }, { "epoch": 0.5133440280005833, "grad_norm": 1.1924457550048828, "learning_rate": 3.829627403846154e-05, "loss": 0.9632, "step": 880 }, { "epoch": 0.5139273734869476, "grad_norm": 1.2248971462249756, "learning_rate": 3.828125e-05, "loss": 1.1191, "step": 881 }, { "epoch": 0.5145107189733119, "grad_norm": 1.1758016347885132, "learning_rate": 3.8266225961538464e-05, "loss": 0.9399, "step": 882 }, { "epoch": 0.5150940644596762, "grad_norm": 1.1751339435577393, "learning_rate": 3.825120192307692e-05, "loss": 0.8132, "step": 883 }, { "epoch": 0.5156774099460405, "grad_norm": 1.093839168548584, "learning_rate": 3.823617788461539e-05, "loss": 1.1629, "step": 884 }, { "epoch": 0.5162607554324048, "grad_norm": 3.200582265853882, "learning_rate": 3.8221153846153846e-05, "loss": 1.1263, "step": 885 }, { "epoch": 0.5168441009187691, "grad_norm": 1.2929569482803345, "learning_rate": 3.820612980769231e-05, "loss": 1.1846, "step": 886 }, { "epoch": 0.5174274464051335, "grad_norm": 1.3224855661392212, "learning_rate": 3.819110576923077e-05, "loss": 0.9277, "step": 887 }, { "epoch": 0.5180107918914978, "grad_norm": 1.1607059240341187, "learning_rate": 3.817608173076923e-05, "loss": 0.9946, "step": 888 }, { "epoch": 0.5185941373778621, "grad_norm": 2.7251431941986084, "learning_rate": 3.8161057692307694e-05, "loss": 1.1296, "step": 889 }, { "epoch": 0.5191774828642264, "grad_norm": 1.1163549423217773, "learning_rate": 3.814603365384616e-05, "loss": 0.9401, "step": 890 }, { "epoch": 0.5197608283505907, "grad_norm": 1.1019116640090942, "learning_rate": 3.813100961538462e-05, "loss": 0.8071, "step": 891 }, { "epoch": 0.520344173836955, "grad_norm": 1.1490522623062134, "learning_rate": 3.811598557692308e-05, "loss": 0.9441, "step": 892 }, { "epoch": 0.5209275193233193, "grad_norm": 1.1176124811172485, "learning_rate": 3.810096153846154e-05, "loss": 1.0805, "step": 893 }, { "epoch": 0.5215108648096836, "grad_norm": 1.4119200706481934, "learning_rate": 3.80859375e-05, "loss": 1.3708, "step": 894 }, { "epoch": 0.5220942102960479, "grad_norm": 1.2920783758163452, "learning_rate": 3.8070913461538465e-05, "loss": 1.2315, "step": 895 }, { "epoch": 0.5226775557824122, "grad_norm": 1.2457376718521118, "learning_rate": 3.8055889423076924e-05, "loss": 1.0076, "step": 896 }, { "epoch": 0.5232609012687764, "grad_norm": 1.5919691324234009, "learning_rate": 3.804086538461539e-05, "loss": 0.9201, "step": 897 }, { "epoch": 0.5238442467551407, "grad_norm": 1.3675076961517334, "learning_rate": 3.802584134615385e-05, "loss": 0.9394, "step": 898 }, { "epoch": 0.524427592241505, "grad_norm": 1.148927927017212, "learning_rate": 3.8010817307692306e-05, "loss": 0.8612, "step": 899 }, { "epoch": 0.5250109377278693, "grad_norm": 1.258003830909729, "learning_rate": 3.799579326923077e-05, "loss": 1.1393, "step": 900 }, { "epoch": 0.5255942832142336, "grad_norm": 1.0879266262054443, "learning_rate": 3.798076923076923e-05, "loss": 1.2231, "step": 901 }, { "epoch": 0.5261776287005979, "grad_norm": 1.3922462463378906, "learning_rate": 3.7965745192307695e-05, "loss": 1.0593, "step": 902 }, { "epoch": 0.5267609741869622, "grad_norm": 1.225117564201355, "learning_rate": 3.795072115384616e-05, "loss": 1.2312, "step": 903 }, { "epoch": 0.5273443196733265, "grad_norm": 1.3208086490631104, "learning_rate": 3.793569711538462e-05, "loss": 1.3092, "step": 904 }, { "epoch": 0.5279276651596908, "grad_norm": 1.0285180807113647, "learning_rate": 3.792067307692308e-05, "loss": 0.9734, "step": 905 }, { "epoch": 0.5285110106460551, "grad_norm": 1.4396753311157227, "learning_rate": 3.7905649038461536e-05, "loss": 1.2026, "step": 906 }, { "epoch": 0.5290943561324194, "grad_norm": 0.9777195453643799, "learning_rate": 3.7890625e-05, "loss": 0.9129, "step": 907 }, { "epoch": 0.5296777016187837, "grad_norm": 1.0864167213439941, "learning_rate": 3.7875600961538466e-05, "loss": 0.8855, "step": 908 }, { "epoch": 0.530261047105148, "grad_norm": 1.0939980745315552, "learning_rate": 3.7860576923076925e-05, "loss": 1.0853, "step": 909 }, { "epoch": 0.5308443925915123, "grad_norm": 1.1439995765686035, "learning_rate": 3.784555288461539e-05, "loss": 1.1568, "step": 910 }, { "epoch": 0.5314277380778766, "grad_norm": 1.2120246887207031, "learning_rate": 3.783052884615385e-05, "loss": 0.9977, "step": 911 }, { "epoch": 0.5320110835642409, "grad_norm": 1.0372374057769775, "learning_rate": 3.781550480769231e-05, "loss": 0.8251, "step": 912 }, { "epoch": 0.5325944290506052, "grad_norm": 1.2008637189865112, "learning_rate": 3.780048076923077e-05, "loss": 1.0291, "step": 913 }, { "epoch": 0.5331777745369696, "grad_norm": 1.475831151008606, "learning_rate": 3.778545673076923e-05, "loss": 1.0554, "step": 914 }, { "epoch": 0.5337611200233339, "grad_norm": 1.3396097421646118, "learning_rate": 3.7770432692307696e-05, "loss": 1.0433, "step": 915 }, { "epoch": 0.5343444655096982, "grad_norm": 1.40950345993042, "learning_rate": 3.775540865384616e-05, "loss": 0.963, "step": 916 }, { "epoch": 0.5349278109960625, "grad_norm": 1.8577003479003906, "learning_rate": 3.774038461538461e-05, "loss": 0.747, "step": 917 }, { "epoch": 0.5355111564824268, "grad_norm": 1.0990678071975708, "learning_rate": 3.772536057692308e-05, "loss": 0.869, "step": 918 }, { "epoch": 0.536094501968791, "grad_norm": 1.4199498891830444, "learning_rate": 3.771033653846154e-05, "loss": 0.8398, "step": 919 }, { "epoch": 0.5366778474551553, "grad_norm": 1.5259379148483276, "learning_rate": 3.76953125e-05, "loss": 1.0978, "step": 920 }, { "epoch": 0.5372611929415196, "grad_norm": 1.208513617515564, "learning_rate": 3.768028846153847e-05, "loss": 0.9836, "step": 921 }, { "epoch": 0.5378445384278839, "grad_norm": 1.3118703365325928, "learning_rate": 3.7665264423076926e-05, "loss": 0.9274, "step": 922 }, { "epoch": 0.5384278839142482, "grad_norm": 1.2613270282745361, "learning_rate": 3.7650240384615385e-05, "loss": 1.095, "step": 923 }, { "epoch": 0.5390112294006125, "grad_norm": 1.056458592414856, "learning_rate": 3.763521634615384e-05, "loss": 1.0447, "step": 924 }, { "epoch": 0.5395945748869768, "grad_norm": 1.104421854019165, "learning_rate": 3.762019230769231e-05, "loss": 0.9413, "step": 925 }, { "epoch": 0.5401779203733411, "grad_norm": 1.1966310739517212, "learning_rate": 3.7605168269230774e-05, "loss": 1.0595, "step": 926 }, { "epoch": 0.5407612658597054, "grad_norm": 1.1758238077163696, "learning_rate": 3.759014423076923e-05, "loss": 1.0223, "step": 927 }, { "epoch": 0.5413446113460697, "grad_norm": 1.850199580192566, "learning_rate": 3.75751201923077e-05, "loss": 1.0916, "step": 928 }, { "epoch": 0.541927956832434, "grad_norm": 1.0883837938308716, "learning_rate": 3.7560096153846156e-05, "loss": 1.1043, "step": 929 }, { "epoch": 0.5425113023187983, "grad_norm": 1.1901664733886719, "learning_rate": 3.7545072115384614e-05, "loss": 1.0474, "step": 930 }, { "epoch": 0.5430946478051626, "grad_norm": 1.5722588300704956, "learning_rate": 3.753004807692308e-05, "loss": 0.9751, "step": 931 }, { "epoch": 0.5436779932915269, "grad_norm": 1.2307900190353394, "learning_rate": 3.751502403846154e-05, "loss": 1.0156, "step": 932 }, { "epoch": 0.5442613387778912, "grad_norm": 1.954958200454712, "learning_rate": 3.7500000000000003e-05, "loss": 1.0679, "step": 933 }, { "epoch": 0.5448446842642555, "grad_norm": 1.6473888158798218, "learning_rate": 3.748497596153847e-05, "loss": 0.9303, "step": 934 }, { "epoch": 0.5454280297506198, "grad_norm": 1.390716314315796, "learning_rate": 3.746995192307692e-05, "loss": 1.0829, "step": 935 }, { "epoch": 0.5460113752369841, "grad_norm": 1.2111623287200928, "learning_rate": 3.7454927884615386e-05, "loss": 1.1691, "step": 936 }, { "epoch": 0.5465947207233484, "grad_norm": 1.2147167921066284, "learning_rate": 3.7439903846153844e-05, "loss": 1.0071, "step": 937 }, { "epoch": 0.5471780662097127, "grad_norm": 1.4748153686523438, "learning_rate": 3.742487980769231e-05, "loss": 0.945, "step": 938 }, { "epoch": 0.547761411696077, "grad_norm": 1.2974998950958252, "learning_rate": 3.7409855769230775e-05, "loss": 0.9262, "step": 939 }, { "epoch": 0.5483447571824414, "grad_norm": 1.0850262641906738, "learning_rate": 3.739483173076923e-05, "loss": 0.8514, "step": 940 }, { "epoch": 0.5489281026688057, "grad_norm": 1.409722924232483, "learning_rate": 3.737980769230769e-05, "loss": 1.0453, "step": 941 }, { "epoch": 0.54951144815517, "grad_norm": 1.253421664237976, "learning_rate": 3.736478365384616e-05, "loss": 0.8305, "step": 942 }, { "epoch": 0.5500947936415342, "grad_norm": 1.1697018146514893, "learning_rate": 3.7349759615384616e-05, "loss": 1.0097, "step": 943 }, { "epoch": 0.5506781391278985, "grad_norm": 1.3280630111694336, "learning_rate": 3.733473557692308e-05, "loss": 1.1064, "step": 944 }, { "epoch": 0.5512614846142628, "grad_norm": 1.7148746252059937, "learning_rate": 3.731971153846154e-05, "loss": 1.152, "step": 945 }, { "epoch": 0.5518448301006271, "grad_norm": 1.2414015531539917, "learning_rate": 3.7304687500000005e-05, "loss": 1.4578, "step": 946 }, { "epoch": 0.5524281755869914, "grad_norm": 1.5570706129074097, "learning_rate": 3.728966346153846e-05, "loss": 1.1254, "step": 947 }, { "epoch": 0.5530115210733557, "grad_norm": 1.2762681245803833, "learning_rate": 3.727463942307692e-05, "loss": 1.0296, "step": 948 }, { "epoch": 0.55359486655972, "grad_norm": 1.1472910642623901, "learning_rate": 3.725961538461539e-05, "loss": 0.9723, "step": 949 }, { "epoch": 0.5541782120460843, "grad_norm": 1.429591417312622, "learning_rate": 3.7244591346153845e-05, "loss": 1.0805, "step": 950 }, { "epoch": 0.5547615575324486, "grad_norm": 1.136590600013733, "learning_rate": 3.722956730769231e-05, "loss": 1.2051, "step": 951 }, { "epoch": 0.5553449030188129, "grad_norm": 1.1960887908935547, "learning_rate": 3.7214543269230776e-05, "loss": 0.9221, "step": 952 }, { "epoch": 0.5559282485051772, "grad_norm": 1.2155873775482178, "learning_rate": 3.719951923076923e-05, "loss": 0.8845, "step": 953 }, { "epoch": 0.5565115939915415, "grad_norm": 1.4684885740280151, "learning_rate": 3.718449519230769e-05, "loss": 1.1217, "step": 954 }, { "epoch": 0.5570949394779058, "grad_norm": 1.1232365369796753, "learning_rate": 3.716947115384616e-05, "loss": 0.96, "step": 955 }, { "epoch": 0.5576782849642701, "grad_norm": 1.3967763185501099, "learning_rate": 3.715444711538462e-05, "loss": 1.1756, "step": 956 }, { "epoch": 0.5582616304506344, "grad_norm": 1.3869478702545166, "learning_rate": 3.713942307692308e-05, "loss": 0.904, "step": 957 }, { "epoch": 0.5588449759369987, "grad_norm": 1.5184909105300903, "learning_rate": 3.712439903846154e-05, "loss": 1.065, "step": 958 }, { "epoch": 0.559428321423363, "grad_norm": 1.1123069524765015, "learning_rate": 3.7109375e-05, "loss": 0.9782, "step": 959 }, { "epoch": 0.5600116669097273, "grad_norm": 1.0042744874954224, "learning_rate": 3.7094350961538464e-05, "loss": 1.0655, "step": 960 }, { "epoch": 0.5605950123960916, "grad_norm": 1.103111743927002, "learning_rate": 3.707932692307692e-05, "loss": 1.0633, "step": 961 }, { "epoch": 0.5611783578824558, "grad_norm": 1.1853861808776855, "learning_rate": 3.706430288461539e-05, "loss": 1.1423, "step": 962 }, { "epoch": 0.5617617033688201, "grad_norm": 1.1413124799728394, "learning_rate": 3.704927884615385e-05, "loss": 0.9878, "step": 963 }, { "epoch": 0.5623450488551844, "grad_norm": 1.1289325952529907, "learning_rate": 3.703425480769231e-05, "loss": 0.9265, "step": 964 }, { "epoch": 0.5629283943415487, "grad_norm": 1.21286141872406, "learning_rate": 3.701923076923077e-05, "loss": 0.9536, "step": 965 }, { "epoch": 0.563511739827913, "grad_norm": 1.1824188232421875, "learning_rate": 3.700420673076923e-05, "loss": 0.8876, "step": 966 }, { "epoch": 0.5640950853142774, "grad_norm": 1.1528284549713135, "learning_rate": 3.6989182692307694e-05, "loss": 0.9186, "step": 967 }, { "epoch": 0.5646784308006417, "grad_norm": 0.9889248609542847, "learning_rate": 3.697415865384616e-05, "loss": 1.0231, "step": 968 }, { "epoch": 0.565261776287006, "grad_norm": 1.1389588117599487, "learning_rate": 3.695913461538462e-05, "loss": 0.9333, "step": 969 }, { "epoch": 0.5658451217733703, "grad_norm": 1.3849660158157349, "learning_rate": 3.694411057692308e-05, "loss": 1.016, "step": 970 }, { "epoch": 0.5664284672597346, "grad_norm": 1.1598434448242188, "learning_rate": 3.6929086538461535e-05, "loss": 1.2011, "step": 971 }, { "epoch": 0.5670118127460989, "grad_norm": 1.3359456062316895, "learning_rate": 3.69140625e-05, "loss": 1.0756, "step": 972 }, { "epoch": 0.5675951582324632, "grad_norm": 1.4474626779556274, "learning_rate": 3.6899038461538466e-05, "loss": 1.0561, "step": 973 }, { "epoch": 0.5681785037188275, "grad_norm": 1.1154292821884155, "learning_rate": 3.6884014423076924e-05, "loss": 0.8292, "step": 974 }, { "epoch": 0.5687618492051918, "grad_norm": 1.0911437273025513, "learning_rate": 3.686899038461539e-05, "loss": 0.9813, "step": 975 }, { "epoch": 0.5693451946915561, "grad_norm": 1.3380028009414673, "learning_rate": 3.685396634615385e-05, "loss": 1.2311, "step": 976 }, { "epoch": 0.5699285401779204, "grad_norm": 1.0942449569702148, "learning_rate": 3.6838942307692306e-05, "loss": 0.8704, "step": 977 }, { "epoch": 0.5705118856642847, "grad_norm": 1.0522291660308838, "learning_rate": 3.682391826923077e-05, "loss": 0.7929, "step": 978 }, { "epoch": 0.571095231150649, "grad_norm": 1.7213867902755737, "learning_rate": 3.680889423076923e-05, "loss": 1.2793, "step": 979 }, { "epoch": 0.5716785766370133, "grad_norm": 1.2406498193740845, "learning_rate": 3.6793870192307695e-05, "loss": 0.8329, "step": 980 }, { "epoch": 0.5722619221233776, "grad_norm": 1.3200228214263916, "learning_rate": 3.677884615384616e-05, "loss": 1.0274, "step": 981 }, { "epoch": 0.5728452676097419, "grad_norm": 1.1541839838027954, "learning_rate": 3.676382211538462e-05, "loss": 0.9626, "step": 982 }, { "epoch": 0.5734286130961062, "grad_norm": 1.1976728439331055, "learning_rate": 3.674879807692308e-05, "loss": 0.8529, "step": 983 }, { "epoch": 0.5740119585824704, "grad_norm": 1.458693027496338, "learning_rate": 3.6733774038461536e-05, "loss": 1.1267, "step": 984 }, { "epoch": 0.5745953040688347, "grad_norm": 1.5604653358459473, "learning_rate": 3.671875e-05, "loss": 1.0798, "step": 985 }, { "epoch": 0.575178649555199, "grad_norm": 1.269396424293518, "learning_rate": 3.670372596153847e-05, "loss": 1.0479, "step": 986 }, { "epoch": 0.5757619950415633, "grad_norm": 1.206891655921936, "learning_rate": 3.6688701923076925e-05, "loss": 0.7532, "step": 987 }, { "epoch": 0.5763453405279276, "grad_norm": 1.3230568170547485, "learning_rate": 3.667367788461539e-05, "loss": 0.9374, "step": 988 }, { "epoch": 0.5769286860142919, "grad_norm": 1.2090986967086792, "learning_rate": 3.665865384615384e-05, "loss": 0.8378, "step": 989 }, { "epoch": 0.5775120315006562, "grad_norm": 1.1172541379928589, "learning_rate": 3.664362980769231e-05, "loss": 0.9294, "step": 990 }, { "epoch": 0.5780953769870205, "grad_norm": 1.2599154710769653, "learning_rate": 3.662860576923077e-05, "loss": 1.0339, "step": 991 }, { "epoch": 0.5786787224733848, "grad_norm": 1.5349094867706299, "learning_rate": 3.661358173076923e-05, "loss": 1.2734, "step": 992 }, { "epoch": 0.5792620679597492, "grad_norm": 1.4437059164047241, "learning_rate": 3.6598557692307697e-05, "loss": 1.0486, "step": 993 }, { "epoch": 0.5798454134461135, "grad_norm": 1.0442750453948975, "learning_rate": 3.6583533653846155e-05, "loss": 0.9691, "step": 994 }, { "epoch": 0.5804287589324778, "grad_norm": 1.3448185920715332, "learning_rate": 3.6568509615384614e-05, "loss": 1.2887, "step": 995 }, { "epoch": 0.5810121044188421, "grad_norm": 1.0911777019500732, "learning_rate": 3.655348557692308e-05, "loss": 1.0297, "step": 996 }, { "epoch": 0.5815954499052064, "grad_norm": 1.1703611612319946, "learning_rate": 3.653846153846154e-05, "loss": 0.8926, "step": 997 }, { "epoch": 0.5821787953915707, "grad_norm": 1.7544491291046143, "learning_rate": 3.65234375e-05, "loss": 1.1923, "step": 998 }, { "epoch": 0.582762140877935, "grad_norm": 1.3902952671051025, "learning_rate": 3.650841346153847e-05, "loss": 0.8508, "step": 999 }, { "epoch": 0.5833454863642993, "grad_norm": 1.1169430017471313, "learning_rate": 3.6493389423076926e-05, "loss": 0.9376, "step": 1000 }, { "epoch": 0.5833454863642993, "eval_loss_squad": 0.8289175107888878, "eval_perplexity": 8.038335329537642, "eval_perplexity_reconstruct": 1.9354374343602856, "step": 1000 }, { "epoch": 0.5839288318506636, "grad_norm": 1.0589932203292847, "learning_rate": 3.6478365384615385e-05, "loss": 0.8848, "step": 1001 }, { "epoch": 0.5845121773370279, "grad_norm": 1.3385180234909058, "learning_rate": 3.6463341346153843e-05, "loss": 1.005, "step": 1002 }, { "epoch": 0.5850955228233922, "grad_norm": 1.3170318603515625, "learning_rate": 3.644831730769231e-05, "loss": 1.0717, "step": 1003 }, { "epoch": 0.5856788683097565, "grad_norm": 1.134464144706726, "learning_rate": 3.6433293269230774e-05, "loss": 1.1363, "step": 1004 }, { "epoch": 0.5862622137961208, "grad_norm": 1.2407094240188599, "learning_rate": 3.641826923076923e-05, "loss": 0.8989, "step": 1005 }, { "epoch": 0.586845559282485, "grad_norm": 1.040368914604187, "learning_rate": 3.64032451923077e-05, "loss": 0.7677, "step": 1006 }, { "epoch": 0.5874289047688493, "grad_norm": 1.2355806827545166, "learning_rate": 3.6388221153846156e-05, "loss": 1.0903, "step": 1007 }, { "epoch": 0.5880122502552136, "grad_norm": 2.0347537994384766, "learning_rate": 3.6373197115384615e-05, "loss": 1.0069, "step": 1008 }, { "epoch": 0.5885955957415779, "grad_norm": 1.3925468921661377, "learning_rate": 3.635817307692308e-05, "loss": 1.1185, "step": 1009 }, { "epoch": 0.5891789412279422, "grad_norm": 1.4087902307510376, "learning_rate": 3.634314903846154e-05, "loss": 1.0334, "step": 1010 }, { "epoch": 0.5897622867143065, "grad_norm": 1.5874099731445312, "learning_rate": 3.6328125000000004e-05, "loss": 1.1728, "step": 1011 }, { "epoch": 0.5903456322006708, "grad_norm": 1.1461565494537354, "learning_rate": 3.631310096153846e-05, "loss": 1.1395, "step": 1012 }, { "epoch": 0.5909289776870351, "grad_norm": 1.3284554481506348, "learning_rate": 3.629807692307692e-05, "loss": 0.9558, "step": 1013 }, { "epoch": 0.5915123231733994, "grad_norm": 1.1890766620635986, "learning_rate": 3.6283052884615386e-05, "loss": 0.8911, "step": 1014 }, { "epoch": 0.5920956686597637, "grad_norm": 1.15906822681427, "learning_rate": 3.6268028846153845e-05, "loss": 0.8283, "step": 1015 }, { "epoch": 0.592679014146128, "grad_norm": 2.1104965209960938, "learning_rate": 3.625300480769231e-05, "loss": 0.8631, "step": 1016 }, { "epoch": 0.5932623596324923, "grad_norm": 1.0914028882980347, "learning_rate": 3.6237980769230775e-05, "loss": 0.9244, "step": 1017 }, { "epoch": 0.5938457051188566, "grad_norm": 1.1843571662902832, "learning_rate": 3.6222956730769234e-05, "loss": 0.9798, "step": 1018 }, { "epoch": 0.5944290506052209, "grad_norm": 2.413841724395752, "learning_rate": 3.620793269230769e-05, "loss": 0.9308, "step": 1019 }, { "epoch": 0.5950123960915853, "grad_norm": 2.1389975547790527, "learning_rate": 3.619290865384616e-05, "loss": 1.0376, "step": 1020 }, { "epoch": 0.5955957415779496, "grad_norm": 1.1918492317199707, "learning_rate": 3.6177884615384616e-05, "loss": 0.998, "step": 1021 }, { "epoch": 0.5961790870643139, "grad_norm": 1.1064860820770264, "learning_rate": 3.616286057692308e-05, "loss": 1.0828, "step": 1022 }, { "epoch": 0.5967624325506782, "grad_norm": 1.1806142330169678, "learning_rate": 3.614783653846154e-05, "loss": 1.1629, "step": 1023 }, { "epoch": 0.5973457780370425, "grad_norm": 1.266573429107666, "learning_rate": 3.6132812500000005e-05, "loss": 0.9503, "step": 1024 }, { "epoch": 0.5979291235234068, "grad_norm": 1.1975195407867432, "learning_rate": 3.6117788461538463e-05, "loss": 0.9898, "step": 1025 }, { "epoch": 0.5985124690097711, "grad_norm": 1.3326911926269531, "learning_rate": 3.610276442307692e-05, "loss": 1.0076, "step": 1026 }, { "epoch": 0.5990958144961354, "grad_norm": 1.2400132417678833, "learning_rate": 3.608774038461539e-05, "loss": 1.0404, "step": 1027 }, { "epoch": 0.5996791599824997, "grad_norm": 1.5326324701309204, "learning_rate": 3.6072716346153846e-05, "loss": 0.9418, "step": 1028 }, { "epoch": 0.600262505468864, "grad_norm": 1.4190855026245117, "learning_rate": 3.605769230769231e-05, "loss": 1.0581, "step": 1029 }, { "epoch": 0.6008458509552282, "grad_norm": 1.4408974647521973, "learning_rate": 3.604266826923077e-05, "loss": 0.8873, "step": 1030 }, { "epoch": 0.6014291964415925, "grad_norm": 1.0051006078720093, "learning_rate": 3.602764423076923e-05, "loss": 0.9694, "step": 1031 }, { "epoch": 0.6020125419279568, "grad_norm": 1.304622769355774, "learning_rate": 3.601262019230769e-05, "loss": 0.894, "step": 1032 }, { "epoch": 0.6025958874143211, "grad_norm": 1.6079553365707397, "learning_rate": 3.599759615384616e-05, "loss": 1.134, "step": 1033 }, { "epoch": 0.6031792329006854, "grad_norm": 1.3666532039642334, "learning_rate": 3.598257211538462e-05, "loss": 1.171, "step": 1034 }, { "epoch": 0.6037625783870497, "grad_norm": 1.1331239938735962, "learning_rate": 3.596754807692308e-05, "loss": 1.014, "step": 1035 }, { "epoch": 0.604345923873414, "grad_norm": 1.153573989868164, "learning_rate": 3.595252403846154e-05, "loss": 1.1011, "step": 1036 }, { "epoch": 0.6049292693597783, "grad_norm": 1.182096004486084, "learning_rate": 3.59375e-05, "loss": 1.1392, "step": 1037 }, { "epoch": 0.6055126148461426, "grad_norm": 1.1555769443511963, "learning_rate": 3.5922475961538465e-05, "loss": 0.8643, "step": 1038 }, { "epoch": 0.6060959603325069, "grad_norm": 1.103007197380066, "learning_rate": 3.590745192307692e-05, "loss": 0.8785, "step": 1039 }, { "epoch": 0.6066793058188712, "grad_norm": 1.2092280387878418, "learning_rate": 3.589242788461539e-05, "loss": 1.0269, "step": 1040 }, { "epoch": 0.6072626513052355, "grad_norm": 1.2705990076065063, "learning_rate": 3.587740384615385e-05, "loss": 0.8287, "step": 1041 }, { "epoch": 0.6078459967915998, "grad_norm": 1.1593817472457886, "learning_rate": 3.586237980769231e-05, "loss": 0.8295, "step": 1042 }, { "epoch": 0.6084293422779641, "grad_norm": 1.1436785459518433, "learning_rate": 3.584735576923077e-05, "loss": 0.9481, "step": 1043 }, { "epoch": 0.6090126877643284, "grad_norm": 1.4250012636184692, "learning_rate": 3.583233173076923e-05, "loss": 1.2019, "step": 1044 }, { "epoch": 0.6095960332506927, "grad_norm": 1.3171687126159668, "learning_rate": 3.5817307692307695e-05, "loss": 0.8549, "step": 1045 }, { "epoch": 0.6101793787370571, "grad_norm": 1.4231929779052734, "learning_rate": 3.580228365384616e-05, "loss": 1.1054, "step": 1046 }, { "epoch": 0.6107627242234214, "grad_norm": 1.2546910047531128, "learning_rate": 3.578725961538462e-05, "loss": 1.0624, "step": 1047 }, { "epoch": 0.6113460697097857, "grad_norm": 1.1451596021652222, "learning_rate": 3.577223557692308e-05, "loss": 1.1191, "step": 1048 }, { "epoch": 0.61192941519615, "grad_norm": 1.250524640083313, "learning_rate": 3.5757211538461535e-05, "loss": 1.0673, "step": 1049 }, { "epoch": 0.6125127606825143, "grad_norm": 1.2106703519821167, "learning_rate": 3.57421875e-05, "loss": 0.9558, "step": 1050 }, { "epoch": 0.6130961061688786, "grad_norm": 1.4316116571426392, "learning_rate": 3.5727163461538466e-05, "loss": 1.146, "step": 1051 }, { "epoch": 0.6136794516552428, "grad_norm": 1.1764512062072754, "learning_rate": 3.5712139423076924e-05, "loss": 0.8677, "step": 1052 }, { "epoch": 0.6142627971416071, "grad_norm": 1.0635274648666382, "learning_rate": 3.569711538461539e-05, "loss": 0.9785, "step": 1053 }, { "epoch": 0.6148461426279714, "grad_norm": 1.1735188961029053, "learning_rate": 3.568209134615385e-05, "loss": 0.8116, "step": 1054 }, { "epoch": 0.6154294881143357, "grad_norm": 1.1931421756744385, "learning_rate": 3.566706730769231e-05, "loss": 1.0047, "step": 1055 }, { "epoch": 0.6160128336007, "grad_norm": 1.4057892560958862, "learning_rate": 3.565204326923077e-05, "loss": 0.9297, "step": 1056 }, { "epoch": 0.6165961790870643, "grad_norm": 1.0752261877059937, "learning_rate": 3.563701923076923e-05, "loss": 0.9164, "step": 1057 }, { "epoch": 0.6171795245734286, "grad_norm": 1.052547574043274, "learning_rate": 3.5621995192307696e-05, "loss": 1.0254, "step": 1058 }, { "epoch": 0.6177628700597929, "grad_norm": 1.1009021997451782, "learning_rate": 3.560697115384616e-05, "loss": 0.888, "step": 1059 }, { "epoch": 0.6183462155461572, "grad_norm": 1.2356963157653809, "learning_rate": 3.559194711538462e-05, "loss": 1.1234, "step": 1060 }, { "epoch": 0.6189295610325215, "grad_norm": 1.765424370765686, "learning_rate": 3.557692307692308e-05, "loss": 1.2263, "step": 1061 }, { "epoch": 0.6195129065188858, "grad_norm": 1.3897560834884644, "learning_rate": 3.5561899038461536e-05, "loss": 1.0729, "step": 1062 }, { "epoch": 0.6200962520052501, "grad_norm": 1.1901566982269287, "learning_rate": 3.5546875e-05, "loss": 0.9953, "step": 1063 }, { "epoch": 0.6206795974916144, "grad_norm": 1.4633679389953613, "learning_rate": 3.553185096153847e-05, "loss": 0.9616, "step": 1064 }, { "epoch": 0.6212629429779787, "grad_norm": 1.3287920951843262, "learning_rate": 3.5516826923076926e-05, "loss": 1.0438, "step": 1065 }, { "epoch": 0.621846288464343, "grad_norm": 1.7268593311309814, "learning_rate": 3.5501802884615384e-05, "loss": 0.8745, "step": 1066 }, { "epoch": 0.6224296339507073, "grad_norm": 1.0686813592910767, "learning_rate": 3.548677884615384e-05, "loss": 0.8533, "step": 1067 }, { "epoch": 0.6230129794370716, "grad_norm": 1.2637293338775635, "learning_rate": 3.547175480769231e-05, "loss": 0.8116, "step": 1068 }, { "epoch": 0.6235963249234359, "grad_norm": 1.3439655303955078, "learning_rate": 3.545673076923077e-05, "loss": 1.1462, "step": 1069 }, { "epoch": 0.6241796704098002, "grad_norm": 1.277295708656311, "learning_rate": 3.544170673076923e-05, "loss": 0.9344, "step": 1070 }, { "epoch": 0.6247630158961645, "grad_norm": 1.2058827877044678, "learning_rate": 3.54266826923077e-05, "loss": 1.0197, "step": 1071 }, { "epoch": 0.6253463613825287, "grad_norm": 1.288257360458374, "learning_rate": 3.5411658653846155e-05, "loss": 0.925, "step": 1072 }, { "epoch": 0.6259297068688932, "grad_norm": 1.1143733263015747, "learning_rate": 3.5396634615384614e-05, "loss": 1.0083, "step": 1073 }, { "epoch": 0.6265130523552574, "grad_norm": 1.0898163318634033, "learning_rate": 3.538161057692308e-05, "loss": 1.0865, "step": 1074 }, { "epoch": 0.6270963978416217, "grad_norm": 1.1873056888580322, "learning_rate": 3.536658653846154e-05, "loss": 1.0579, "step": 1075 }, { "epoch": 0.627679743327986, "grad_norm": 1.0526707172393799, "learning_rate": 3.53515625e-05, "loss": 1.0129, "step": 1076 }, { "epoch": 0.6282630888143503, "grad_norm": 1.6970293521881104, "learning_rate": 3.533653846153847e-05, "loss": 1.1972, "step": 1077 }, { "epoch": 0.6288464343007146, "grad_norm": 1.2696473598480225, "learning_rate": 3.532151442307693e-05, "loss": 0.8886, "step": 1078 }, { "epoch": 0.6294297797870789, "grad_norm": 1.1362708806991577, "learning_rate": 3.5306490384615385e-05, "loss": 1.1162, "step": 1079 }, { "epoch": 0.6300131252734432, "grad_norm": 1.0971933603286743, "learning_rate": 3.5291466346153844e-05, "loss": 0.9418, "step": 1080 }, { "epoch": 0.6305964707598075, "grad_norm": 1.5346184968948364, "learning_rate": 3.527644230769231e-05, "loss": 1.3067, "step": 1081 }, { "epoch": 0.6311798162461718, "grad_norm": 1.10757315158844, "learning_rate": 3.5261418269230774e-05, "loss": 0.9629, "step": 1082 }, { "epoch": 0.6317631617325361, "grad_norm": 1.1153584718704224, "learning_rate": 3.524639423076923e-05, "loss": 0.9693, "step": 1083 }, { "epoch": 0.6323465072189004, "grad_norm": 1.2248104810714722, "learning_rate": 3.523137019230769e-05, "loss": 0.9687, "step": 1084 }, { "epoch": 0.6329298527052647, "grad_norm": 1.2040361166000366, "learning_rate": 3.5216346153846157e-05, "loss": 0.8451, "step": 1085 }, { "epoch": 0.633513198191629, "grad_norm": 1.2189350128173828, "learning_rate": 3.5201322115384615e-05, "loss": 0.8775, "step": 1086 }, { "epoch": 0.6340965436779933, "grad_norm": 1.453861117362976, "learning_rate": 3.518629807692308e-05, "loss": 0.9802, "step": 1087 }, { "epoch": 0.6346798891643576, "grad_norm": 1.6878211498260498, "learning_rate": 3.517127403846154e-05, "loss": 0.8514, "step": 1088 }, { "epoch": 0.6352632346507219, "grad_norm": 1.2242681980133057, "learning_rate": 3.5156250000000004e-05, "loss": 0.7271, "step": 1089 }, { "epoch": 0.6358465801370862, "grad_norm": 1.1206653118133545, "learning_rate": 3.514122596153846e-05, "loss": 1.0562, "step": 1090 }, { "epoch": 0.6364299256234505, "grad_norm": 1.2914477586746216, "learning_rate": 3.512620192307692e-05, "loss": 0.943, "step": 1091 }, { "epoch": 0.6370132711098148, "grad_norm": 1.4054170846939087, "learning_rate": 3.5111177884615386e-05, "loss": 1.064, "step": 1092 }, { "epoch": 0.637596616596179, "grad_norm": 1.1132476329803467, "learning_rate": 3.5096153846153845e-05, "loss": 1.0468, "step": 1093 }, { "epoch": 0.6381799620825434, "grad_norm": 1.4797582626342773, "learning_rate": 3.508112980769231e-05, "loss": 1.0369, "step": 1094 }, { "epoch": 0.6387633075689076, "grad_norm": 1.1891480684280396, "learning_rate": 3.5066105769230775e-05, "loss": 0.93, "step": 1095 }, { "epoch": 0.6393466530552719, "grad_norm": 1.1823198795318604, "learning_rate": 3.5051081730769234e-05, "loss": 1.2398, "step": 1096 }, { "epoch": 0.6399299985416362, "grad_norm": 1.0506694316864014, "learning_rate": 3.503605769230769e-05, "loss": 1.0028, "step": 1097 }, { "epoch": 0.6405133440280005, "grad_norm": 1.1625219583511353, "learning_rate": 3.502103365384616e-05, "loss": 0.9668, "step": 1098 }, { "epoch": 0.6410966895143649, "grad_norm": 1.3133008480072021, "learning_rate": 3.5006009615384616e-05, "loss": 0.8842, "step": 1099 }, { "epoch": 0.6416800350007292, "grad_norm": 1.2925740480422974, "learning_rate": 3.499098557692308e-05, "loss": 0.9443, "step": 1100 }, { "epoch": 0.6422633804870935, "grad_norm": 1.1056509017944336, "learning_rate": 3.497596153846154e-05, "loss": 1.0339, "step": 1101 }, { "epoch": 0.6428467259734578, "grad_norm": 1.4267328977584839, "learning_rate": 3.49609375e-05, "loss": 0.9475, "step": 1102 }, { "epoch": 0.6434300714598221, "grad_norm": 1.0847243070602417, "learning_rate": 3.4945913461538464e-05, "loss": 1.0789, "step": 1103 }, { "epoch": 0.6440134169461864, "grad_norm": 1.2231626510620117, "learning_rate": 3.493088942307692e-05, "loss": 0.9817, "step": 1104 }, { "epoch": 0.6445967624325507, "grad_norm": 1.1734437942504883, "learning_rate": 3.491586538461539e-05, "loss": 1.0806, "step": 1105 }, { "epoch": 0.645180107918915, "grad_norm": 1.1729824542999268, "learning_rate": 3.4900841346153846e-05, "loss": 1.0979, "step": 1106 }, { "epoch": 0.6457634534052793, "grad_norm": 1.1877245903015137, "learning_rate": 3.488581730769231e-05, "loss": 1.0407, "step": 1107 }, { "epoch": 0.6463467988916436, "grad_norm": 1.0097910165786743, "learning_rate": 3.487079326923077e-05, "loss": 0.9112, "step": 1108 }, { "epoch": 0.6469301443780079, "grad_norm": 1.1973975896835327, "learning_rate": 3.485576923076923e-05, "loss": 0.873, "step": 1109 }, { "epoch": 0.6475134898643722, "grad_norm": 1.1800600290298462, "learning_rate": 3.4840745192307694e-05, "loss": 0.9863, "step": 1110 }, { "epoch": 0.6480968353507365, "grad_norm": 1.5829706192016602, "learning_rate": 3.482572115384616e-05, "loss": 0.9469, "step": 1111 }, { "epoch": 0.6486801808371008, "grad_norm": 1.5107544660568237, "learning_rate": 3.481069711538462e-05, "loss": 0.8981, "step": 1112 }, { "epoch": 0.6492635263234651, "grad_norm": 1.1659681797027588, "learning_rate": 3.479567307692308e-05, "loss": 1.0151, "step": 1113 }, { "epoch": 0.6498468718098294, "grad_norm": 1.2110625505447388, "learning_rate": 3.478064903846154e-05, "loss": 0.8399, "step": 1114 }, { "epoch": 0.6504302172961937, "grad_norm": 1.2130272388458252, "learning_rate": 3.4765625e-05, "loss": 0.9638, "step": 1115 }, { "epoch": 0.651013562782558, "grad_norm": 1.4531303644180298, "learning_rate": 3.4750600961538465e-05, "loss": 0.9451, "step": 1116 }, { "epoch": 0.6515969082689222, "grad_norm": 1.2456012964248657, "learning_rate": 3.4735576923076923e-05, "loss": 0.9804, "step": 1117 }, { "epoch": 0.6521802537552865, "grad_norm": 1.8596121072769165, "learning_rate": 3.472055288461539e-05, "loss": 1.0599, "step": 1118 }, { "epoch": 0.6527635992416508, "grad_norm": 1.2399401664733887, "learning_rate": 3.470552884615385e-05, "loss": 0.978, "step": 1119 }, { "epoch": 0.6533469447280151, "grad_norm": 1.25180983543396, "learning_rate": 3.4690504807692306e-05, "loss": 1.1725, "step": 1120 }, { "epoch": 0.6539302902143794, "grad_norm": 1.2084770202636719, "learning_rate": 3.467548076923077e-05, "loss": 0.8519, "step": 1121 }, { "epoch": 0.6545136357007437, "grad_norm": 1.1235297918319702, "learning_rate": 3.466045673076923e-05, "loss": 0.9979, "step": 1122 }, { "epoch": 0.655096981187108, "grad_norm": 1.1955933570861816, "learning_rate": 3.4645432692307695e-05, "loss": 1.2237, "step": 1123 }, { "epoch": 0.6556803266734723, "grad_norm": 1.144810676574707, "learning_rate": 3.463040865384616e-05, "loss": 0.7198, "step": 1124 }, { "epoch": 0.6562636721598366, "grad_norm": 1.1226835250854492, "learning_rate": 3.461538461538462e-05, "loss": 1.0139, "step": 1125 }, { "epoch": 0.656847017646201, "grad_norm": 1.0557211637496948, "learning_rate": 3.460036057692308e-05, "loss": 1.2123, "step": 1126 }, { "epoch": 0.6574303631325653, "grad_norm": 1.051958441734314, "learning_rate": 3.4585336538461536e-05, "loss": 1.1768, "step": 1127 }, { "epoch": 0.6580137086189296, "grad_norm": 1.0615653991699219, "learning_rate": 3.45703125e-05, "loss": 0.9798, "step": 1128 }, { "epoch": 0.6585970541052939, "grad_norm": 1.022555947303772, "learning_rate": 3.4555288461538466e-05, "loss": 0.7892, "step": 1129 }, { "epoch": 0.6591803995916582, "grad_norm": 1.3960226774215698, "learning_rate": 3.4540264423076925e-05, "loss": 0.922, "step": 1130 }, { "epoch": 0.6597637450780225, "grad_norm": 1.0755395889282227, "learning_rate": 3.452524038461539e-05, "loss": 1.0775, "step": 1131 }, { "epoch": 0.6603470905643868, "grad_norm": 1.075884222984314, "learning_rate": 3.451021634615385e-05, "loss": 1.149, "step": 1132 }, { "epoch": 0.6609304360507511, "grad_norm": 1.4113826751708984, "learning_rate": 3.449519230769231e-05, "loss": 0.8611, "step": 1133 }, { "epoch": 0.6615137815371154, "grad_norm": 1.0343314409255981, "learning_rate": 3.448016826923077e-05, "loss": 0.8762, "step": 1134 }, { "epoch": 0.6620971270234797, "grad_norm": 1.0894453525543213, "learning_rate": 3.446514423076923e-05, "loss": 1.0281, "step": 1135 }, { "epoch": 0.662680472509844, "grad_norm": 1.2878773212432861, "learning_rate": 3.4450120192307696e-05, "loss": 1.0719, "step": 1136 }, { "epoch": 0.6632638179962083, "grad_norm": 1.1200342178344727, "learning_rate": 3.443509615384616e-05, "loss": 1.0834, "step": 1137 }, { "epoch": 0.6638471634825726, "grad_norm": 1.2115342617034912, "learning_rate": 3.442007211538461e-05, "loss": 1.1008, "step": 1138 }, { "epoch": 0.6644305089689369, "grad_norm": 1.214706301689148, "learning_rate": 3.440504807692308e-05, "loss": 1.2134, "step": 1139 }, { "epoch": 0.6650138544553011, "grad_norm": 1.3579165935516357, "learning_rate": 3.439002403846154e-05, "loss": 0.8706, "step": 1140 }, { "epoch": 0.6655971999416654, "grad_norm": 1.3200846910476685, "learning_rate": 3.4375e-05, "loss": 0.9311, "step": 1141 }, { "epoch": 0.6661805454280297, "grad_norm": 1.4525578022003174, "learning_rate": 3.435997596153847e-05, "loss": 1.0345, "step": 1142 }, { "epoch": 0.666763890914394, "grad_norm": 1.2601604461669922, "learning_rate": 3.4344951923076926e-05, "loss": 1.1084, "step": 1143 }, { "epoch": 0.6673472364007583, "grad_norm": 1.278788447380066, "learning_rate": 3.4329927884615384e-05, "loss": 0.9541, "step": 1144 }, { "epoch": 0.6679305818871226, "grad_norm": 0.9493159651756287, "learning_rate": 3.431490384615384e-05, "loss": 0.9375, "step": 1145 }, { "epoch": 0.6685139273734869, "grad_norm": 1.0539159774780273, "learning_rate": 3.429987980769231e-05, "loss": 0.9884, "step": 1146 }, { "epoch": 0.6690972728598512, "grad_norm": 1.1678050756454468, "learning_rate": 3.4284855769230773e-05, "loss": 1.0402, "step": 1147 }, { "epoch": 0.6696806183462155, "grad_norm": 1.2369019985198975, "learning_rate": 3.426983173076923e-05, "loss": 0.9254, "step": 1148 }, { "epoch": 0.6702639638325798, "grad_norm": 1.539076566696167, "learning_rate": 3.42548076923077e-05, "loss": 1.1155, "step": 1149 }, { "epoch": 0.6708473093189441, "grad_norm": 1.074963092803955, "learning_rate": 3.4239783653846156e-05, "loss": 1.0213, "step": 1150 }, { "epoch": 0.6714306548053084, "grad_norm": 1.2291454076766968, "learning_rate": 3.4224759615384614e-05, "loss": 0.898, "step": 1151 }, { "epoch": 0.6720140002916728, "grad_norm": 1.1478317975997925, "learning_rate": 3.420973557692308e-05, "loss": 0.8589, "step": 1152 }, { "epoch": 0.6725973457780371, "grad_norm": 1.096078872680664, "learning_rate": 3.419471153846154e-05, "loss": 0.8751, "step": 1153 }, { "epoch": 0.6731806912644014, "grad_norm": 1.4119384288787842, "learning_rate": 3.41796875e-05, "loss": 1.0693, "step": 1154 }, { "epoch": 0.6737640367507657, "grad_norm": 1.2378814220428467, "learning_rate": 3.416466346153847e-05, "loss": 1.2076, "step": 1155 }, { "epoch": 0.67434738223713, "grad_norm": 1.2882436513900757, "learning_rate": 3.414963942307692e-05, "loss": 1.1177, "step": 1156 }, { "epoch": 0.6749307277234943, "grad_norm": 1.1691817045211792, "learning_rate": 3.4134615384615386e-05, "loss": 1.0826, "step": 1157 }, { "epoch": 0.6755140732098586, "grad_norm": 1.168468713760376, "learning_rate": 3.4119591346153844e-05, "loss": 0.9258, "step": 1158 }, { "epoch": 0.6760974186962229, "grad_norm": 1.3444772958755493, "learning_rate": 3.410456730769231e-05, "loss": 1.1413, "step": 1159 }, { "epoch": 0.6766807641825872, "grad_norm": 1.2808760404586792, "learning_rate": 3.4089543269230775e-05, "loss": 0.8778, "step": 1160 }, { "epoch": 0.6772641096689515, "grad_norm": 1.1354130506515503, "learning_rate": 3.407451923076923e-05, "loss": 0.8865, "step": 1161 }, { "epoch": 0.6778474551553157, "grad_norm": 1.0770645141601562, "learning_rate": 3.405949519230769e-05, "loss": 1.0861, "step": 1162 }, { "epoch": 0.67843080064168, "grad_norm": 1.1578465700149536, "learning_rate": 3.404447115384616e-05, "loss": 1.0487, "step": 1163 }, { "epoch": 0.6790141461280443, "grad_norm": 1.0803139209747314, "learning_rate": 3.4029447115384615e-05, "loss": 0.8303, "step": 1164 }, { "epoch": 0.6795974916144086, "grad_norm": 1.0990511178970337, "learning_rate": 3.401442307692308e-05, "loss": 1.1653, "step": 1165 }, { "epoch": 0.6801808371007729, "grad_norm": 0.9813050627708435, "learning_rate": 3.399939903846154e-05, "loss": 0.9875, "step": 1166 }, { "epoch": 0.6807641825871372, "grad_norm": 1.0541377067565918, "learning_rate": 3.3984375000000004e-05, "loss": 0.9924, "step": 1167 }, { "epoch": 0.6813475280735015, "grad_norm": 1.2727155685424805, "learning_rate": 3.396935096153846e-05, "loss": 0.7893, "step": 1168 }, { "epoch": 0.6819308735598658, "grad_norm": 1.1019082069396973, "learning_rate": 3.395432692307692e-05, "loss": 1.2714, "step": 1169 }, { "epoch": 0.6825142190462301, "grad_norm": 0.9809292554855347, "learning_rate": 3.393930288461539e-05, "loss": 0.8502, "step": 1170 }, { "epoch": 0.6830975645325944, "grad_norm": 0.8613129258155823, "learning_rate": 3.3924278846153845e-05, "loss": 0.8276, "step": 1171 }, { "epoch": 0.6836809100189587, "grad_norm": 1.049072265625, "learning_rate": 3.390925480769231e-05, "loss": 1.0627, "step": 1172 }, { "epoch": 0.684264255505323, "grad_norm": 1.205977201461792, "learning_rate": 3.3894230769230776e-05, "loss": 0.9767, "step": 1173 }, { "epoch": 0.6848476009916873, "grad_norm": 1.2076629400253296, "learning_rate": 3.387920673076923e-05, "loss": 0.8472, "step": 1174 }, { "epoch": 0.6854309464780516, "grad_norm": 1.244746208190918, "learning_rate": 3.386418269230769e-05, "loss": 1.0404, "step": 1175 }, { "epoch": 0.6860142919644159, "grad_norm": 1.0318353176116943, "learning_rate": 3.384915865384616e-05, "loss": 1.2889, "step": 1176 }, { "epoch": 0.6865976374507802, "grad_norm": 1.1536865234375, "learning_rate": 3.3834134615384617e-05, "loss": 1.0932, "step": 1177 }, { "epoch": 0.6871809829371445, "grad_norm": 1.0421112775802612, "learning_rate": 3.381911057692308e-05, "loss": 1.1513, "step": 1178 }, { "epoch": 0.6877643284235089, "grad_norm": 1.2524076700210571, "learning_rate": 3.380408653846154e-05, "loss": 1.0571, "step": 1179 }, { "epoch": 0.6883476739098732, "grad_norm": 1.3088963031768799, "learning_rate": 3.37890625e-05, "loss": 1.0539, "step": 1180 }, { "epoch": 0.6889310193962375, "grad_norm": 1.4658859968185425, "learning_rate": 3.3774038461538464e-05, "loss": 0.9256, "step": 1181 }, { "epoch": 0.6895143648826018, "grad_norm": 1.3050382137298584, "learning_rate": 3.375901442307692e-05, "loss": 1.1349, "step": 1182 }, { "epoch": 0.690097710368966, "grad_norm": 1.318977952003479, "learning_rate": 3.374399038461539e-05, "loss": 1.0711, "step": 1183 }, { "epoch": 0.6906810558553304, "grad_norm": 1.2855241298675537, "learning_rate": 3.3728966346153846e-05, "loss": 0.9612, "step": 1184 }, { "epoch": 0.6912644013416946, "grad_norm": 1.3077634572982788, "learning_rate": 3.371394230769231e-05, "loss": 1.0041, "step": 1185 }, { "epoch": 0.6918477468280589, "grad_norm": 1.1256729364395142, "learning_rate": 3.369891826923077e-05, "loss": 1.0035, "step": 1186 }, { "epoch": 0.6924310923144232, "grad_norm": 1.3386636972427368, "learning_rate": 3.368389423076923e-05, "loss": 0.9681, "step": 1187 }, { "epoch": 0.6930144378007875, "grad_norm": 1.4480712413787842, "learning_rate": 3.3668870192307694e-05, "loss": 1.1093, "step": 1188 }, { "epoch": 0.6935977832871518, "grad_norm": 1.1406118869781494, "learning_rate": 3.365384615384616e-05, "loss": 1.0623, "step": 1189 }, { "epoch": 0.6941811287735161, "grad_norm": 1.1809027194976807, "learning_rate": 3.363882211538462e-05, "loss": 0.9942, "step": 1190 }, { "epoch": 0.6947644742598804, "grad_norm": 1.3855853080749512, "learning_rate": 3.362379807692308e-05, "loss": 0.9911, "step": 1191 }, { "epoch": 0.6953478197462447, "grad_norm": 1.2072291374206543, "learning_rate": 3.3608774038461535e-05, "loss": 0.9015, "step": 1192 }, { "epoch": 0.695931165232609, "grad_norm": 1.3127961158752441, "learning_rate": 3.359375e-05, "loss": 0.8459, "step": 1193 }, { "epoch": 0.6965145107189733, "grad_norm": 1.1735903024673462, "learning_rate": 3.3578725961538465e-05, "loss": 0.9313, "step": 1194 }, { "epoch": 0.6970978562053376, "grad_norm": 1.1430635452270508, "learning_rate": 3.3563701923076924e-05, "loss": 1.1029, "step": 1195 }, { "epoch": 0.6976812016917019, "grad_norm": 1.0707919597625732, "learning_rate": 3.354867788461539e-05, "loss": 1.0674, "step": 1196 }, { "epoch": 0.6982645471780662, "grad_norm": 1.1463159322738647, "learning_rate": 3.353365384615385e-05, "loss": 1.0293, "step": 1197 }, { "epoch": 0.6988478926644305, "grad_norm": 0.9569932818412781, "learning_rate": 3.3518629807692306e-05, "loss": 0.726, "step": 1198 }, { "epoch": 0.6994312381507948, "grad_norm": 1.4707874059677124, "learning_rate": 3.350360576923077e-05, "loss": 0.9595, "step": 1199 }, { "epoch": 0.7000145836371591, "grad_norm": 1.0669324398040771, "learning_rate": 3.348858173076923e-05, "loss": 1.1805, "step": 1200 }, { "epoch": 0.7000145836371591, "eval_loss_squad": 0.8398913412541151, "eval_perplexity": 8.243036213620654, "eval_perplexity_reconstruct": 1.9400207914704957, "step": 1200 }, { "epoch": 0.7005979291235234, "grad_norm": 1.3081518411636353, "learning_rate": 3.3473557692307695e-05, "loss": 1.2301, "step": 1201 }, { "epoch": 0.7011812746098877, "grad_norm": 1.2794771194458008, "learning_rate": 3.345853365384616e-05, "loss": 0.7746, "step": 1202 }, { "epoch": 0.701764620096252, "grad_norm": 1.2433689832687378, "learning_rate": 3.344350961538462e-05, "loss": 0.9256, "step": 1203 }, { "epoch": 0.7023479655826163, "grad_norm": 1.1298249959945679, "learning_rate": 3.342848557692308e-05, "loss": 1.0316, "step": 1204 }, { "epoch": 0.7029313110689807, "grad_norm": 1.1544654369354248, "learning_rate": 3.3413461538461536e-05, "loss": 0.9899, "step": 1205 }, { "epoch": 0.703514656555345, "grad_norm": 1.4191893339157104, "learning_rate": 3.33984375e-05, "loss": 0.8943, "step": 1206 }, { "epoch": 0.7040980020417092, "grad_norm": 1.1830098628997803, "learning_rate": 3.3383413461538466e-05, "loss": 0.9173, "step": 1207 }, { "epoch": 0.7046813475280735, "grad_norm": 1.1737726926803589, "learning_rate": 3.3368389423076925e-05, "loss": 0.9999, "step": 1208 }, { "epoch": 0.7052646930144378, "grad_norm": 1.3882325887680054, "learning_rate": 3.335336538461539e-05, "loss": 1.0172, "step": 1209 }, { "epoch": 0.7058480385008021, "grad_norm": 1.1368088722229004, "learning_rate": 3.333834134615384e-05, "loss": 0.9639, "step": 1210 }, { "epoch": 0.7064313839871664, "grad_norm": 1.0348821878433228, "learning_rate": 3.332331730769231e-05, "loss": 1.0874, "step": 1211 }, { "epoch": 0.7070147294735307, "grad_norm": 1.3544259071350098, "learning_rate": 3.330829326923077e-05, "loss": 1.0331, "step": 1212 }, { "epoch": 0.707598074959895, "grad_norm": 0.9908936619758606, "learning_rate": 3.329326923076923e-05, "loss": 0.7225, "step": 1213 }, { "epoch": 0.7081814204462593, "grad_norm": 1.2547880411148071, "learning_rate": 3.3278245192307696e-05, "loss": 1.0878, "step": 1214 }, { "epoch": 0.7087647659326236, "grad_norm": 1.617482304573059, "learning_rate": 3.3263221153846155e-05, "loss": 1.0804, "step": 1215 }, { "epoch": 0.7093481114189879, "grad_norm": 1.039589285850525, "learning_rate": 3.324819711538461e-05, "loss": 0.9664, "step": 1216 }, { "epoch": 0.7099314569053522, "grad_norm": 1.2409045696258545, "learning_rate": 3.323317307692308e-05, "loss": 0.9162, "step": 1217 }, { "epoch": 0.7105148023917165, "grad_norm": 1.286959171295166, "learning_rate": 3.321814903846154e-05, "loss": 1.032, "step": 1218 }, { "epoch": 0.7110981478780808, "grad_norm": 1.3693422079086304, "learning_rate": 3.3203125e-05, "loss": 1.0232, "step": 1219 }, { "epoch": 0.7116814933644451, "grad_norm": 1.3210753202438354, "learning_rate": 3.318810096153847e-05, "loss": 0.9577, "step": 1220 }, { "epoch": 0.7122648388508094, "grad_norm": 1.1538783311843872, "learning_rate": 3.3173076923076926e-05, "loss": 1.0804, "step": 1221 }, { "epoch": 0.7128481843371737, "grad_norm": 1.1558293104171753, "learning_rate": 3.3158052884615385e-05, "loss": 1.1509, "step": 1222 }, { "epoch": 0.713431529823538, "grad_norm": 0.9820008277893066, "learning_rate": 3.314302884615384e-05, "loss": 0.814, "step": 1223 }, { "epoch": 0.7140148753099023, "grad_norm": 0.9828884601593018, "learning_rate": 3.312800480769231e-05, "loss": 0.8456, "step": 1224 }, { "epoch": 0.7145982207962666, "grad_norm": 1.3056386709213257, "learning_rate": 3.3112980769230774e-05, "loss": 1.0279, "step": 1225 }, { "epoch": 0.7151815662826309, "grad_norm": 1.1356948614120483, "learning_rate": 3.309795673076923e-05, "loss": 1.0234, "step": 1226 }, { "epoch": 0.7157649117689951, "grad_norm": 1.2792552709579468, "learning_rate": 3.30829326923077e-05, "loss": 1.1383, "step": 1227 }, { "epoch": 0.7163482572553594, "grad_norm": 1.0980956554412842, "learning_rate": 3.3067908653846156e-05, "loss": 1.0534, "step": 1228 }, { "epoch": 0.7169316027417237, "grad_norm": 1.2338215112686157, "learning_rate": 3.3052884615384615e-05, "loss": 0.9844, "step": 1229 }, { "epoch": 0.717514948228088, "grad_norm": 1.4420562982559204, "learning_rate": 3.303786057692308e-05, "loss": 0.8789, "step": 1230 }, { "epoch": 0.7180982937144523, "grad_norm": 1.26366126537323, "learning_rate": 3.302283653846154e-05, "loss": 0.8189, "step": 1231 }, { "epoch": 0.7186816392008167, "grad_norm": 1.1623914241790771, "learning_rate": 3.3007812500000004e-05, "loss": 0.9789, "step": 1232 }, { "epoch": 0.719264984687181, "grad_norm": 1.0107698440551758, "learning_rate": 3.299278846153846e-05, "loss": 0.7271, "step": 1233 }, { "epoch": 0.7198483301735453, "grad_norm": 1.185608148574829, "learning_rate": 3.297776442307692e-05, "loss": 1.1248, "step": 1234 }, { "epoch": 0.7204316756599096, "grad_norm": 1.077970027923584, "learning_rate": 3.2962740384615386e-05, "loss": 1.1963, "step": 1235 }, { "epoch": 0.7210150211462739, "grad_norm": 1.684244990348816, "learning_rate": 3.2947716346153844e-05, "loss": 0.8828, "step": 1236 }, { "epoch": 0.7215983666326382, "grad_norm": 1.24001944065094, "learning_rate": 3.293269230769231e-05, "loss": 0.8627, "step": 1237 }, { "epoch": 0.7221817121190025, "grad_norm": 1.0665417909622192, "learning_rate": 3.2917668269230775e-05, "loss": 0.7553, "step": 1238 }, { "epoch": 0.7227650576053668, "grad_norm": 1.1986167430877686, "learning_rate": 3.2902644230769233e-05, "loss": 0.7785, "step": 1239 }, { "epoch": 0.7233484030917311, "grad_norm": 1.313407301902771, "learning_rate": 3.288762019230769e-05, "loss": 0.9125, "step": 1240 }, { "epoch": 0.7239317485780954, "grad_norm": 1.1136894226074219, "learning_rate": 3.287259615384616e-05, "loss": 0.975, "step": 1241 }, { "epoch": 0.7245150940644597, "grad_norm": 1.2097238302230835, "learning_rate": 3.2857572115384616e-05, "loss": 0.8688, "step": 1242 }, { "epoch": 0.725098439550824, "grad_norm": 1.280327320098877, "learning_rate": 3.284254807692308e-05, "loss": 0.9758, "step": 1243 }, { "epoch": 0.7256817850371883, "grad_norm": 1.2413567304611206, "learning_rate": 3.282752403846154e-05, "loss": 0.9494, "step": 1244 }, { "epoch": 0.7262651305235526, "grad_norm": 1.1193758249282837, "learning_rate": 3.2812500000000005e-05, "loss": 1.0908, "step": 1245 }, { "epoch": 0.7268484760099169, "grad_norm": 1.1849331855773926, "learning_rate": 3.279747596153846e-05, "loss": 1.0299, "step": 1246 }, { "epoch": 0.7274318214962812, "grad_norm": 2.731739044189453, "learning_rate": 3.278245192307692e-05, "loss": 0.8046, "step": 1247 }, { "epoch": 0.7280151669826455, "grad_norm": 1.145367980003357, "learning_rate": 3.276742788461539e-05, "loss": 0.9687, "step": 1248 }, { "epoch": 0.7285985124690098, "grad_norm": 1.2098665237426758, "learning_rate": 3.2752403846153846e-05, "loss": 0.8909, "step": 1249 }, { "epoch": 0.729181857955374, "grad_norm": 1.4284601211547852, "learning_rate": 3.273737980769231e-05, "loss": 0.8545, "step": 1250 }, { "epoch": 0.7297652034417383, "grad_norm": 1.4550679922103882, "learning_rate": 3.272235576923077e-05, "loss": 0.9656, "step": 1251 }, { "epoch": 0.7303485489281026, "grad_norm": 1.2722722291946411, "learning_rate": 3.270733173076923e-05, "loss": 0.8022, "step": 1252 }, { "epoch": 0.7309318944144669, "grad_norm": 1.3001720905303955, "learning_rate": 3.269230769230769e-05, "loss": 0.9008, "step": 1253 }, { "epoch": 0.7315152399008312, "grad_norm": 1.610422968864441, "learning_rate": 3.267728365384616e-05, "loss": 0.7881, "step": 1254 }, { "epoch": 0.7320985853871955, "grad_norm": 1.0116015672683716, "learning_rate": 3.266225961538462e-05, "loss": 0.7952, "step": 1255 }, { "epoch": 0.7326819308735598, "grad_norm": 1.4856303930282593, "learning_rate": 3.264723557692308e-05, "loss": 1.0552, "step": 1256 }, { "epoch": 0.7332652763599241, "grad_norm": 1.7719351053237915, "learning_rate": 3.263221153846154e-05, "loss": 1.058, "step": 1257 }, { "epoch": 0.7338486218462885, "grad_norm": 1.1480412483215332, "learning_rate": 3.26171875e-05, "loss": 0.8906, "step": 1258 }, { "epoch": 0.7344319673326528, "grad_norm": 1.2761352062225342, "learning_rate": 3.2602163461538464e-05, "loss": 0.8908, "step": 1259 }, { "epoch": 0.7350153128190171, "grad_norm": 1.1891727447509766, "learning_rate": 3.258713942307692e-05, "loss": 0.9932, "step": 1260 }, { "epoch": 0.7355986583053814, "grad_norm": 1.0514845848083496, "learning_rate": 3.257211538461539e-05, "loss": 1.1528, "step": 1261 }, { "epoch": 0.7361820037917457, "grad_norm": 1.4285988807678223, "learning_rate": 3.255709134615385e-05, "loss": 0.8171, "step": 1262 }, { "epoch": 0.73676534927811, "grad_norm": 1.2109655141830444, "learning_rate": 3.254206730769231e-05, "loss": 1.1154, "step": 1263 }, { "epoch": 0.7373486947644743, "grad_norm": 1.417160153388977, "learning_rate": 3.252704326923077e-05, "loss": 1.0018, "step": 1264 }, { "epoch": 0.7379320402508386, "grad_norm": 1.045836091041565, "learning_rate": 3.251201923076923e-05, "loss": 0.9084, "step": 1265 }, { "epoch": 0.7385153857372029, "grad_norm": 1.0985413789749146, "learning_rate": 3.2496995192307694e-05, "loss": 0.7832, "step": 1266 }, { "epoch": 0.7390987312235672, "grad_norm": 1.1846632957458496, "learning_rate": 3.248197115384616e-05, "loss": 0.9933, "step": 1267 }, { "epoch": 0.7396820767099315, "grad_norm": 1.1051980257034302, "learning_rate": 3.246694711538462e-05, "loss": 1.2966, "step": 1268 }, { "epoch": 0.7402654221962958, "grad_norm": 1.1988706588745117, "learning_rate": 3.2451923076923077e-05, "loss": 1.052, "step": 1269 }, { "epoch": 0.7408487676826601, "grad_norm": 1.1623855829238892, "learning_rate": 3.2436899038461535e-05, "loss": 0.9502, "step": 1270 }, { "epoch": 0.7414321131690244, "grad_norm": 1.6816089153289795, "learning_rate": 3.2421875e-05, "loss": 1.1816, "step": 1271 }, { "epoch": 0.7420154586553886, "grad_norm": 1.2776967287063599, "learning_rate": 3.2406850961538466e-05, "loss": 1.0167, "step": 1272 }, { "epoch": 0.7425988041417529, "grad_norm": 1.7386460304260254, "learning_rate": 3.2391826923076924e-05, "loss": 1.0841, "step": 1273 }, { "epoch": 0.7431821496281172, "grad_norm": 1.1177300214767456, "learning_rate": 3.237680288461539e-05, "loss": 0.8789, "step": 1274 }, { "epoch": 0.7437654951144815, "grad_norm": 1.161293864250183, "learning_rate": 3.236177884615385e-05, "loss": 0.9454, "step": 1275 }, { "epoch": 0.7443488406008458, "grad_norm": 1.609604001045227, "learning_rate": 3.2346754807692306e-05, "loss": 0.9307, "step": 1276 }, { "epoch": 0.7449321860872101, "grad_norm": 0.9415054321289062, "learning_rate": 3.233173076923077e-05, "loss": 0.9462, "step": 1277 }, { "epoch": 0.7455155315735744, "grad_norm": 1.4762784242630005, "learning_rate": 3.231670673076923e-05, "loss": 1.0188, "step": 1278 }, { "epoch": 0.7460988770599387, "grad_norm": 1.2860867977142334, "learning_rate": 3.2301682692307695e-05, "loss": 0.8886, "step": 1279 }, { "epoch": 0.746682222546303, "grad_norm": 1.2090308666229248, "learning_rate": 3.228665865384616e-05, "loss": 1.0265, "step": 1280 }, { "epoch": 0.7472655680326673, "grad_norm": 1.0692979097366333, "learning_rate": 3.227163461538462e-05, "loss": 1.014, "step": 1281 }, { "epoch": 0.7478489135190316, "grad_norm": 1.2457678318023682, "learning_rate": 3.225661057692308e-05, "loss": 1.0442, "step": 1282 }, { "epoch": 0.7484322590053959, "grad_norm": 1.1981621980667114, "learning_rate": 3.2241586538461536e-05, "loss": 0.924, "step": 1283 }, { "epoch": 0.7490156044917602, "grad_norm": 1.448926568031311, "learning_rate": 3.22265625e-05, "loss": 1.2461, "step": 1284 }, { "epoch": 0.7495989499781246, "grad_norm": 1.2373019456863403, "learning_rate": 3.221153846153847e-05, "loss": 1.1221, "step": 1285 }, { "epoch": 0.7501822954644889, "grad_norm": 1.135291337966919, "learning_rate": 3.2196514423076925e-05, "loss": 0.9693, "step": 1286 }, { "epoch": 0.7507656409508532, "grad_norm": 1.3530988693237305, "learning_rate": 3.2181490384615384e-05, "loss": 0.9934, "step": 1287 }, { "epoch": 0.7513489864372175, "grad_norm": 1.216654658317566, "learning_rate": 3.216646634615384e-05, "loss": 1.1738, "step": 1288 }, { "epoch": 0.7519323319235818, "grad_norm": 1.1719613075256348, "learning_rate": 3.215144230769231e-05, "loss": 0.8775, "step": 1289 }, { "epoch": 0.7525156774099461, "grad_norm": 1.223272442817688, "learning_rate": 3.213641826923077e-05, "loss": 1.0273, "step": 1290 }, { "epoch": 0.7530990228963104, "grad_norm": 1.4900156259536743, "learning_rate": 3.212139423076923e-05, "loss": 1.0808, "step": 1291 }, { "epoch": 0.7536823683826747, "grad_norm": 1.1983774900436401, "learning_rate": 3.21063701923077e-05, "loss": 1.1111, "step": 1292 }, { "epoch": 0.754265713869039, "grad_norm": 1.4071288108825684, "learning_rate": 3.2091346153846155e-05, "loss": 0.8877, "step": 1293 }, { "epoch": 0.7548490593554033, "grad_norm": 1.1557461023330688, "learning_rate": 3.2076322115384614e-05, "loss": 0.8536, "step": 1294 }, { "epoch": 0.7554324048417675, "grad_norm": 1.1718254089355469, "learning_rate": 3.206129807692308e-05, "loss": 0.8273, "step": 1295 }, { "epoch": 0.7560157503281318, "grad_norm": 1.2104593515396118, "learning_rate": 3.204627403846154e-05, "loss": 1.2501, "step": 1296 }, { "epoch": 0.7565990958144961, "grad_norm": 2.3339264392852783, "learning_rate": 3.203125e-05, "loss": 0.8378, "step": 1297 }, { "epoch": 0.7571824413008604, "grad_norm": 1.0478349924087524, "learning_rate": 3.201622596153847e-05, "loss": 1.0836, "step": 1298 }, { "epoch": 0.7577657867872247, "grad_norm": 1.2753820419311523, "learning_rate": 3.2001201923076926e-05, "loss": 1.1445, "step": 1299 }, { "epoch": 0.758349132273589, "grad_norm": 1.194672703742981, "learning_rate": 3.1986177884615385e-05, "loss": 1.0424, "step": 1300 }, { "epoch": 0.7589324777599533, "grad_norm": 1.2020694017410278, "learning_rate": 3.1971153846153843e-05, "loss": 1.0068, "step": 1301 }, { "epoch": 0.7595158232463176, "grad_norm": 2.032259941101074, "learning_rate": 3.195612980769231e-05, "loss": 0.946, "step": 1302 }, { "epoch": 0.7600991687326819, "grad_norm": 1.2049493789672852, "learning_rate": 3.1941105769230774e-05, "loss": 1.053, "step": 1303 }, { "epoch": 0.7606825142190462, "grad_norm": 1.1551873683929443, "learning_rate": 3.192608173076923e-05, "loss": 1.0539, "step": 1304 }, { "epoch": 0.7612658597054105, "grad_norm": 1.0860520601272583, "learning_rate": 3.191105769230769e-05, "loss": 1.1625, "step": 1305 }, { "epoch": 0.7618492051917748, "grad_norm": 1.1426883935928345, "learning_rate": 3.1896033653846156e-05, "loss": 0.8321, "step": 1306 }, { "epoch": 0.7624325506781391, "grad_norm": 1.2449581623077393, "learning_rate": 3.1881009615384615e-05, "loss": 1.0748, "step": 1307 }, { "epoch": 0.7630158961645034, "grad_norm": 1.2475258111953735, "learning_rate": 3.186598557692308e-05, "loss": 0.9779, "step": 1308 }, { "epoch": 0.7635992416508677, "grad_norm": 1.0726374387741089, "learning_rate": 3.185096153846154e-05, "loss": 0.9079, "step": 1309 }, { "epoch": 0.764182587137232, "grad_norm": 1.169661521911621, "learning_rate": 3.1835937500000004e-05, "loss": 1.1156, "step": 1310 }, { "epoch": 0.7647659326235964, "grad_norm": 1.1962292194366455, "learning_rate": 3.182091346153846e-05, "loss": 0.9258, "step": 1311 }, { "epoch": 0.7653492781099607, "grad_norm": 1.038885235786438, "learning_rate": 3.180588942307692e-05, "loss": 1.0377, "step": 1312 }, { "epoch": 0.765932623596325, "grad_norm": 1.4127665758132935, "learning_rate": 3.1790865384615386e-05, "loss": 0.982, "step": 1313 }, { "epoch": 0.7665159690826893, "grad_norm": 1.1078689098358154, "learning_rate": 3.1775841346153845e-05, "loss": 1.0159, "step": 1314 }, { "epoch": 0.7670993145690536, "grad_norm": 1.047503113746643, "learning_rate": 3.176081730769231e-05, "loss": 0.7841, "step": 1315 }, { "epoch": 0.7676826600554179, "grad_norm": 1.2618082761764526, "learning_rate": 3.1745793269230775e-05, "loss": 1.0228, "step": 1316 }, { "epoch": 0.7682660055417821, "grad_norm": 2.2956039905548096, "learning_rate": 3.1730769230769234e-05, "loss": 0.8457, "step": 1317 }, { "epoch": 0.7688493510281464, "grad_norm": 1.1614506244659424, "learning_rate": 3.171574519230769e-05, "loss": 1.0914, "step": 1318 }, { "epoch": 0.7694326965145107, "grad_norm": 1.0440285205841064, "learning_rate": 3.170072115384616e-05, "loss": 0.9781, "step": 1319 }, { "epoch": 0.770016042000875, "grad_norm": 1.2385691404342651, "learning_rate": 3.1685697115384616e-05, "loss": 0.904, "step": 1320 }, { "epoch": 0.7705993874872393, "grad_norm": 1.0237793922424316, "learning_rate": 3.167067307692308e-05, "loss": 0.8984, "step": 1321 }, { "epoch": 0.7711827329736036, "grad_norm": 1.4518593549728394, "learning_rate": 3.165564903846154e-05, "loss": 1.1862, "step": 1322 }, { "epoch": 0.7717660784599679, "grad_norm": 1.197102427482605, "learning_rate": 3.1640625e-05, "loss": 1.0267, "step": 1323 }, { "epoch": 0.7723494239463322, "grad_norm": 1.3080718517303467, "learning_rate": 3.1625600961538464e-05, "loss": 1.0663, "step": 1324 }, { "epoch": 0.7729327694326965, "grad_norm": 0.9802163243293762, "learning_rate": 3.161057692307692e-05, "loss": 1.0571, "step": 1325 }, { "epoch": 0.7735161149190608, "grad_norm": 1.969308853149414, "learning_rate": 3.159555288461539e-05, "loss": 1.0338, "step": 1326 }, { "epoch": 0.7740994604054251, "grad_norm": 1.0249823331832886, "learning_rate": 3.1580528846153846e-05, "loss": 0.9921, "step": 1327 }, { "epoch": 0.7746828058917894, "grad_norm": 1.099127173423767, "learning_rate": 3.156550480769231e-05, "loss": 0.9504, "step": 1328 }, { "epoch": 0.7752661513781537, "grad_norm": 1.2968648672103882, "learning_rate": 3.155048076923077e-05, "loss": 0.9594, "step": 1329 }, { "epoch": 0.775849496864518, "grad_norm": 1.0648295879364014, "learning_rate": 3.153545673076923e-05, "loss": 0.8687, "step": 1330 }, { "epoch": 0.7764328423508823, "grad_norm": 1.802680253982544, "learning_rate": 3.1520432692307693e-05, "loss": 1.0598, "step": 1331 }, { "epoch": 0.7770161878372466, "grad_norm": 1.080910563468933, "learning_rate": 3.150540865384616e-05, "loss": 0.9963, "step": 1332 }, { "epoch": 0.7775995333236109, "grad_norm": 1.1521499156951904, "learning_rate": 3.149038461538462e-05, "loss": 1.0519, "step": 1333 }, { "epoch": 0.7781828788099752, "grad_norm": 1.1837037801742554, "learning_rate": 3.147536057692308e-05, "loss": 0.9725, "step": 1334 }, { "epoch": 0.7787662242963395, "grad_norm": 1.085605502128601, "learning_rate": 3.146033653846154e-05, "loss": 1.1865, "step": 1335 }, { "epoch": 0.7793495697827038, "grad_norm": 1.0882564783096313, "learning_rate": 3.14453125e-05, "loss": 0.9782, "step": 1336 }, { "epoch": 0.7799329152690682, "grad_norm": 1.1564704179763794, "learning_rate": 3.1430288461538465e-05, "loss": 0.8602, "step": 1337 }, { "epoch": 0.7805162607554325, "grad_norm": 1.3103642463684082, "learning_rate": 3.141526442307692e-05, "loss": 1.144, "step": 1338 }, { "epoch": 0.7810996062417968, "grad_norm": 1.2268692255020142, "learning_rate": 3.140024038461539e-05, "loss": 0.8534, "step": 1339 }, { "epoch": 0.781682951728161, "grad_norm": 1.2421032190322876, "learning_rate": 3.138521634615385e-05, "loss": 1.1487, "step": 1340 }, { "epoch": 0.7822662972145253, "grad_norm": 1.0822752714157104, "learning_rate": 3.1370192307692306e-05, "loss": 0.963, "step": 1341 }, { "epoch": 0.7828496427008896, "grad_norm": 1.2561531066894531, "learning_rate": 3.135516826923077e-05, "loss": 1.1393, "step": 1342 }, { "epoch": 0.7834329881872539, "grad_norm": 0.9926168918609619, "learning_rate": 3.134014423076923e-05, "loss": 0.8662, "step": 1343 }, { "epoch": 0.7840163336736182, "grad_norm": 1.2411295175552368, "learning_rate": 3.1325120192307695e-05, "loss": 1.0019, "step": 1344 }, { "epoch": 0.7845996791599825, "grad_norm": 1.1900317668914795, "learning_rate": 3.131009615384616e-05, "loss": 1.0258, "step": 1345 }, { "epoch": 0.7851830246463468, "grad_norm": 1.0790519714355469, "learning_rate": 3.129507211538462e-05, "loss": 1.1386, "step": 1346 }, { "epoch": 0.7857663701327111, "grad_norm": 2.3410184383392334, "learning_rate": 3.128004807692308e-05, "loss": 0.8789, "step": 1347 }, { "epoch": 0.7863497156190754, "grad_norm": 1.0258671045303345, "learning_rate": 3.1265024038461535e-05, "loss": 1.015, "step": 1348 }, { "epoch": 0.7869330611054397, "grad_norm": 1.0533411502838135, "learning_rate": 3.125e-05, "loss": 0.8044, "step": 1349 }, { "epoch": 0.787516406591804, "grad_norm": 1.1454368829727173, "learning_rate": 3.1234975961538466e-05, "loss": 1.0102, "step": 1350 }, { "epoch": 0.7880997520781683, "grad_norm": 1.3050988912582397, "learning_rate": 3.1219951923076924e-05, "loss": 1.1298, "step": 1351 }, { "epoch": 0.7886830975645326, "grad_norm": 1.1478936672210693, "learning_rate": 3.120492788461539e-05, "loss": 0.9447, "step": 1352 }, { "epoch": 0.7892664430508969, "grad_norm": 1.2745267152786255, "learning_rate": 3.118990384615385e-05, "loss": 0.9069, "step": 1353 }, { "epoch": 0.7898497885372612, "grad_norm": 1.031055212020874, "learning_rate": 3.117487980769231e-05, "loss": 0.9162, "step": 1354 }, { "epoch": 0.7904331340236255, "grad_norm": 1.1818459033966064, "learning_rate": 3.115985576923077e-05, "loss": 0.9005, "step": 1355 }, { "epoch": 0.7910164795099898, "grad_norm": 1.157064437866211, "learning_rate": 3.114483173076923e-05, "loss": 1.0922, "step": 1356 }, { "epoch": 0.7915998249963541, "grad_norm": 1.3568843603134155, "learning_rate": 3.1129807692307696e-05, "loss": 0.9815, "step": 1357 }, { "epoch": 0.7921831704827184, "grad_norm": 0.9772933125495911, "learning_rate": 3.111478365384616e-05, "loss": 0.848, "step": 1358 }, { "epoch": 0.7927665159690827, "grad_norm": 1.2958146333694458, "learning_rate": 3.109975961538461e-05, "loss": 0.903, "step": 1359 }, { "epoch": 0.793349861455447, "grad_norm": 1.325095772743225, "learning_rate": 3.108473557692308e-05, "loss": 0.8335, "step": 1360 }, { "epoch": 0.7939332069418112, "grad_norm": 1.3909435272216797, "learning_rate": 3.1069711538461537e-05, "loss": 1.1868, "step": 1361 }, { "epoch": 0.7945165524281755, "grad_norm": 1.1557323932647705, "learning_rate": 3.10546875e-05, "loss": 1.2876, "step": 1362 }, { "epoch": 0.7950998979145398, "grad_norm": 1.0653504133224487, "learning_rate": 3.103966346153847e-05, "loss": 1.0039, "step": 1363 }, { "epoch": 0.7956832434009042, "grad_norm": 1.1019622087478638, "learning_rate": 3.1024639423076926e-05, "loss": 0.8072, "step": 1364 }, { "epoch": 0.7962665888872685, "grad_norm": 1.1759988069534302, "learning_rate": 3.1009615384615384e-05, "loss": 1.038, "step": 1365 }, { "epoch": 0.7968499343736328, "grad_norm": 1.2272703647613525, "learning_rate": 3.099459134615384e-05, "loss": 1.0167, "step": 1366 }, { "epoch": 0.7974332798599971, "grad_norm": 1.6301058530807495, "learning_rate": 3.097956730769231e-05, "loss": 0.9102, "step": 1367 }, { "epoch": 0.7980166253463614, "grad_norm": 1.059002161026001, "learning_rate": 3.096454326923077e-05, "loss": 1.1956, "step": 1368 }, { "epoch": 0.7985999708327257, "grad_norm": 1.0058438777923584, "learning_rate": 3.094951923076923e-05, "loss": 0.7865, "step": 1369 }, { "epoch": 0.79918331631909, "grad_norm": 1.3153209686279297, "learning_rate": 3.09344951923077e-05, "loss": 0.995, "step": 1370 }, { "epoch": 0.7997666618054543, "grad_norm": 1.1372942924499512, "learning_rate": 3.0919471153846155e-05, "loss": 0.901, "step": 1371 }, { "epoch": 0.8003500072918186, "grad_norm": 1.0216442346572876, "learning_rate": 3.0904447115384614e-05, "loss": 1.1966, "step": 1372 }, { "epoch": 0.8009333527781829, "grad_norm": 1.054608702659607, "learning_rate": 3.088942307692308e-05, "loss": 0.8792, "step": 1373 }, { "epoch": 0.8015166982645472, "grad_norm": 1.2942436933517456, "learning_rate": 3.087439903846154e-05, "loss": 1.1201, "step": 1374 }, { "epoch": 0.8021000437509115, "grad_norm": 1.2945632934570312, "learning_rate": 3.0859375e-05, "loss": 0.936, "step": 1375 }, { "epoch": 0.8026833892372758, "grad_norm": 1.213426947593689, "learning_rate": 3.084435096153847e-05, "loss": 0.9309, "step": 1376 }, { "epoch": 0.8032667347236401, "grad_norm": 1.0726510286331177, "learning_rate": 3.082932692307692e-05, "loss": 1.1424, "step": 1377 }, { "epoch": 0.8038500802100044, "grad_norm": 1.3145674467086792, "learning_rate": 3.0814302884615385e-05, "loss": 1.0205, "step": 1378 }, { "epoch": 0.8044334256963687, "grad_norm": 1.082029104232788, "learning_rate": 3.0799278846153844e-05, "loss": 0.8689, "step": 1379 }, { "epoch": 0.805016771182733, "grad_norm": 1.0544030666351318, "learning_rate": 3.078425480769231e-05, "loss": 0.8943, "step": 1380 }, { "epoch": 0.8056001166690973, "grad_norm": 1.1208224296569824, "learning_rate": 3.0769230769230774e-05, "loss": 0.9113, "step": 1381 }, { "epoch": 0.8061834621554615, "grad_norm": 1.1705317497253418, "learning_rate": 3.075420673076923e-05, "loss": 0.7789, "step": 1382 }, { "epoch": 0.8067668076418258, "grad_norm": 1.1692845821380615, "learning_rate": 3.073918269230769e-05, "loss": 1.1744, "step": 1383 }, { "epoch": 0.8073501531281901, "grad_norm": 1.0401328802108765, "learning_rate": 3.072415865384616e-05, "loss": 0.9221, "step": 1384 }, { "epoch": 0.8079334986145544, "grad_norm": 1.0775591135025024, "learning_rate": 3.0709134615384615e-05, "loss": 1.2103, "step": 1385 }, { "epoch": 0.8085168441009187, "grad_norm": 1.3836114406585693, "learning_rate": 3.069411057692308e-05, "loss": 0.8438, "step": 1386 }, { "epoch": 0.809100189587283, "grad_norm": 1.1026825904846191, "learning_rate": 3.067908653846154e-05, "loss": 0.8164, "step": 1387 }, { "epoch": 0.8096835350736473, "grad_norm": 1.80268132686615, "learning_rate": 3.0664062500000004e-05, "loss": 1.0345, "step": 1388 }, { "epoch": 0.8102668805600116, "grad_norm": 1.5059008598327637, "learning_rate": 3.064903846153846e-05, "loss": 0.9297, "step": 1389 }, { "epoch": 0.810850226046376, "grad_norm": 1.0632637739181519, "learning_rate": 3.063401442307692e-05, "loss": 1.237, "step": 1390 }, { "epoch": 0.8114335715327403, "grad_norm": 1.1647098064422607, "learning_rate": 3.0618990384615386e-05, "loss": 1.0494, "step": 1391 }, { "epoch": 0.8120169170191046, "grad_norm": 1.0377775430679321, "learning_rate": 3.0603966346153845e-05, "loss": 1.3689, "step": 1392 }, { "epoch": 0.8126002625054689, "grad_norm": 1.713774561882019, "learning_rate": 3.058894230769231e-05, "loss": 1.1149, "step": 1393 }, { "epoch": 0.8131836079918332, "grad_norm": 1.2139819860458374, "learning_rate": 3.0573918269230776e-05, "loss": 0.8279, "step": 1394 }, { "epoch": 0.8137669534781975, "grad_norm": 1.3912312984466553, "learning_rate": 3.055889423076923e-05, "loss": 1.0654, "step": 1395 }, { "epoch": 0.8143502989645618, "grad_norm": 1.276260256767273, "learning_rate": 3.054387019230769e-05, "loss": 0.8086, "step": 1396 }, { "epoch": 0.8149336444509261, "grad_norm": 1.3147262334823608, "learning_rate": 3.052884615384616e-05, "loss": 1.1643, "step": 1397 }, { "epoch": 0.8155169899372904, "grad_norm": 1.760912299156189, "learning_rate": 3.0513822115384616e-05, "loss": 1.2003, "step": 1398 }, { "epoch": 0.8161003354236547, "grad_norm": 1.060642957687378, "learning_rate": 3.0498798076923078e-05, "loss": 0.8297, "step": 1399 }, { "epoch": 0.816683680910019, "grad_norm": 1.1941111087799072, "learning_rate": 3.0483774038461537e-05, "loss": 0.8585, "step": 1400 }, { "epoch": 0.816683680910019, "eval_loss_squad": 0.8743070242926478, "eval_perplexity": 8.211708550571148, "eval_perplexity_reconstruct": 1.9933093086287428, "step": 1400 }, { "epoch": 0.8172670263963833, "grad_norm": 1.1372501850128174, "learning_rate": 3.0468750000000002e-05, "loss": 1.0104, "step": 1401 }, { "epoch": 0.8178503718827476, "grad_norm": 1.203554391860962, "learning_rate": 3.0453725961538464e-05, "loss": 1.1503, "step": 1402 }, { "epoch": 0.8184337173691119, "grad_norm": 1.5660467147827148, "learning_rate": 3.0438701923076922e-05, "loss": 0.9822, "step": 1403 }, { "epoch": 0.8190170628554762, "grad_norm": 1.2876819372177124, "learning_rate": 3.0423677884615388e-05, "loss": 1.1155, "step": 1404 }, { "epoch": 0.8196004083418404, "grad_norm": 1.0389353036880493, "learning_rate": 3.0408653846153846e-05, "loss": 0.8308, "step": 1405 }, { "epoch": 0.8201837538282047, "grad_norm": 1.467089295387268, "learning_rate": 3.0393629807692308e-05, "loss": 1.308, "step": 1406 }, { "epoch": 0.820767099314569, "grad_norm": 1.135607361793518, "learning_rate": 3.0378605769230773e-05, "loss": 0.9116, "step": 1407 }, { "epoch": 0.8213504448009333, "grad_norm": 1.0728017091751099, "learning_rate": 3.0363581730769232e-05, "loss": 0.9576, "step": 1408 }, { "epoch": 0.8219337902872976, "grad_norm": 1.2483171224594116, "learning_rate": 3.0348557692307694e-05, "loss": 1.0636, "step": 1409 }, { "epoch": 0.8225171357736619, "grad_norm": 1.3540332317352295, "learning_rate": 3.033353365384616e-05, "loss": 0.8896, "step": 1410 }, { "epoch": 0.8231004812600262, "grad_norm": 1.4841009378433228, "learning_rate": 3.0318509615384618e-05, "loss": 0.9574, "step": 1411 }, { "epoch": 0.8236838267463905, "grad_norm": 1.2146106958389282, "learning_rate": 3.030348557692308e-05, "loss": 1.0769, "step": 1412 }, { "epoch": 0.8242671722327548, "grad_norm": 1.104175090789795, "learning_rate": 3.0288461538461538e-05, "loss": 0.9756, "step": 1413 }, { "epoch": 0.8248505177191191, "grad_norm": 0.9717370271682739, "learning_rate": 3.02734375e-05, "loss": 0.9478, "step": 1414 }, { "epoch": 0.8254338632054834, "grad_norm": 1.0630606412887573, "learning_rate": 3.0258413461538465e-05, "loss": 0.9061, "step": 1415 }, { "epoch": 0.8260172086918477, "grad_norm": 1.1713072061538696, "learning_rate": 3.0243389423076924e-05, "loss": 1.1889, "step": 1416 }, { "epoch": 0.8266005541782121, "grad_norm": 1.2068907022476196, "learning_rate": 3.0228365384615385e-05, "loss": 0.9438, "step": 1417 }, { "epoch": 0.8271838996645764, "grad_norm": 0.9715260863304138, "learning_rate": 3.0213341346153844e-05, "loss": 0.7337, "step": 1418 }, { "epoch": 0.8277672451509407, "grad_norm": 1.0985807180404663, "learning_rate": 3.019831730769231e-05, "loss": 1.0818, "step": 1419 }, { "epoch": 0.828350590637305, "grad_norm": 1.2472034692764282, "learning_rate": 3.018329326923077e-05, "loss": 0.9131, "step": 1420 }, { "epoch": 0.8289339361236693, "grad_norm": 1.1218814849853516, "learning_rate": 3.016826923076923e-05, "loss": 1.0787, "step": 1421 }, { "epoch": 0.8295172816100336, "grad_norm": 1.0618772506713867, "learning_rate": 3.0153245192307695e-05, "loss": 0.8397, "step": 1422 }, { "epoch": 0.8301006270963979, "grad_norm": 1.021227240562439, "learning_rate": 3.0138221153846157e-05, "loss": 0.7859, "step": 1423 }, { "epoch": 0.8306839725827622, "grad_norm": 1.3854846954345703, "learning_rate": 3.0123197115384615e-05, "loss": 0.9057, "step": 1424 }, { "epoch": 0.8312673180691265, "grad_norm": 1.0573982000350952, "learning_rate": 3.010817307692308e-05, "loss": 0.8431, "step": 1425 }, { "epoch": 0.8318506635554908, "grad_norm": 1.3047147989273071, "learning_rate": 3.009314903846154e-05, "loss": 1.1606, "step": 1426 }, { "epoch": 0.832434009041855, "grad_norm": 1.6169391870498657, "learning_rate": 3.0078125e-05, "loss": 1.0701, "step": 1427 }, { "epoch": 0.8330173545282193, "grad_norm": 1.2799001932144165, "learning_rate": 3.0063100961538466e-05, "loss": 1.0659, "step": 1428 }, { "epoch": 0.8336007000145836, "grad_norm": 1.3066989183425903, "learning_rate": 3.0048076923076925e-05, "loss": 1.0707, "step": 1429 }, { "epoch": 0.8341840455009479, "grad_norm": 1.275020956993103, "learning_rate": 3.0033052884615387e-05, "loss": 1.0019, "step": 1430 }, { "epoch": 0.8347673909873122, "grad_norm": 1.084713101387024, "learning_rate": 3.0018028846153845e-05, "loss": 1.0027, "step": 1431 }, { "epoch": 0.8353507364736765, "grad_norm": 1.222685694694519, "learning_rate": 3.0003004807692307e-05, "loss": 0.8549, "step": 1432 }, { "epoch": 0.8359340819600408, "grad_norm": 1.1162559986114502, "learning_rate": 2.9987980769230772e-05, "loss": 0.9687, "step": 1433 }, { "epoch": 0.8365174274464051, "grad_norm": 1.1409958600997925, "learning_rate": 2.997295673076923e-05, "loss": 0.9169, "step": 1434 }, { "epoch": 0.8371007729327694, "grad_norm": 1.1237506866455078, "learning_rate": 2.9957932692307693e-05, "loss": 1.0596, "step": 1435 }, { "epoch": 0.8376841184191337, "grad_norm": 1.2333991527557373, "learning_rate": 2.9942908653846158e-05, "loss": 1.0413, "step": 1436 }, { "epoch": 0.838267463905498, "grad_norm": 1.2132785320281982, "learning_rate": 2.9927884615384616e-05, "loss": 0.8801, "step": 1437 }, { "epoch": 0.8388508093918623, "grad_norm": 2.4601593017578125, "learning_rate": 2.991286057692308e-05, "loss": 1.0372, "step": 1438 }, { "epoch": 0.8394341548782266, "grad_norm": 1.1504793167114258, "learning_rate": 2.9897836538461537e-05, "loss": 1.0258, "step": 1439 }, { "epoch": 0.8400175003645909, "grad_norm": 1.0757592916488647, "learning_rate": 2.9882812500000002e-05, "loss": 0.925, "step": 1440 }, { "epoch": 0.8406008458509552, "grad_norm": 1.2550078630447388, "learning_rate": 2.9867788461538464e-05, "loss": 1.0328, "step": 1441 }, { "epoch": 0.8411841913373195, "grad_norm": 1.108076810836792, "learning_rate": 2.9852764423076923e-05, "loss": 1.0529, "step": 1442 }, { "epoch": 0.8417675368236839, "grad_norm": 1.1696008443832397, "learning_rate": 2.9837740384615388e-05, "loss": 0.8204, "step": 1443 }, { "epoch": 0.8423508823100482, "grad_norm": 0.9712606072425842, "learning_rate": 2.9822716346153846e-05, "loss": 0.9446, "step": 1444 }, { "epoch": 0.8429342277964125, "grad_norm": 1.253280520439148, "learning_rate": 2.9807692307692308e-05, "loss": 0.9655, "step": 1445 }, { "epoch": 0.8435175732827768, "grad_norm": 1.2332576513290405, "learning_rate": 2.9792668269230773e-05, "loss": 0.9132, "step": 1446 }, { "epoch": 0.8441009187691411, "grad_norm": 0.9151219129562378, "learning_rate": 2.9777644230769232e-05, "loss": 1.0091, "step": 1447 }, { "epoch": 0.8446842642555054, "grad_norm": 1.0670338869094849, "learning_rate": 2.9762620192307694e-05, "loss": 0.9617, "step": 1448 }, { "epoch": 0.8452676097418697, "grad_norm": 1.4182400703430176, "learning_rate": 2.974759615384616e-05, "loss": 0.9599, "step": 1449 }, { "epoch": 0.845850955228234, "grad_norm": 1.4236427545547485, "learning_rate": 2.9732572115384614e-05, "loss": 1.0529, "step": 1450 }, { "epoch": 0.8464343007145982, "grad_norm": 1.3941996097564697, "learning_rate": 2.971754807692308e-05, "loss": 0.8889, "step": 1451 }, { "epoch": 0.8470176462009625, "grad_norm": 1.1831328868865967, "learning_rate": 2.9702524038461538e-05, "loss": 0.9023, "step": 1452 }, { "epoch": 0.8476009916873268, "grad_norm": 1.4158825874328613, "learning_rate": 2.96875e-05, "loss": 1.1108, "step": 1453 }, { "epoch": 0.8481843371736911, "grad_norm": 0.9731126427650452, "learning_rate": 2.9672475961538465e-05, "loss": 0.8187, "step": 1454 }, { "epoch": 0.8487676826600554, "grad_norm": 1.0598933696746826, "learning_rate": 2.9657451923076924e-05, "loss": 0.8798, "step": 1455 }, { "epoch": 0.8493510281464197, "grad_norm": 1.04989492893219, "learning_rate": 2.9642427884615386e-05, "loss": 0.9962, "step": 1456 }, { "epoch": 0.849934373632784, "grad_norm": 1.1285686492919922, "learning_rate": 2.9627403846153844e-05, "loss": 0.8692, "step": 1457 }, { "epoch": 0.8505177191191483, "grad_norm": 1.1516649723052979, "learning_rate": 2.961237980769231e-05, "loss": 0.8191, "step": 1458 }, { "epoch": 0.8511010646055126, "grad_norm": 1.2343637943267822, "learning_rate": 2.959735576923077e-05, "loss": 0.8846, "step": 1459 }, { "epoch": 0.8516844100918769, "grad_norm": 1.295943021774292, "learning_rate": 2.958233173076923e-05, "loss": 0.9154, "step": 1460 }, { "epoch": 0.8522677555782412, "grad_norm": 1.3594167232513428, "learning_rate": 2.9567307692307695e-05, "loss": 0.9516, "step": 1461 }, { "epoch": 0.8528511010646055, "grad_norm": 1.2709529399871826, "learning_rate": 2.9552283653846157e-05, "loss": 1.0265, "step": 1462 }, { "epoch": 0.8534344465509698, "grad_norm": 1.2515851259231567, "learning_rate": 2.9537259615384615e-05, "loss": 0.9376, "step": 1463 }, { "epoch": 0.8540177920373341, "grad_norm": 0.9908419251441956, "learning_rate": 2.952223557692308e-05, "loss": 1.0423, "step": 1464 }, { "epoch": 0.8546011375236984, "grad_norm": 1.3674644231796265, "learning_rate": 2.950721153846154e-05, "loss": 1.2146, "step": 1465 }, { "epoch": 0.8551844830100627, "grad_norm": 1.239865779876709, "learning_rate": 2.94921875e-05, "loss": 0.9979, "step": 1466 }, { "epoch": 0.855767828496427, "grad_norm": 2.099808692932129, "learning_rate": 2.9477163461538466e-05, "loss": 1.1026, "step": 1467 }, { "epoch": 0.8563511739827913, "grad_norm": 1.1289135217666626, "learning_rate": 2.946213942307692e-05, "loss": 1.0327, "step": 1468 }, { "epoch": 0.8569345194691556, "grad_norm": 1.2344995737075806, "learning_rate": 2.9447115384615387e-05, "loss": 0.9183, "step": 1469 }, { "epoch": 0.85751786495552, "grad_norm": 1.2652794122695923, "learning_rate": 2.9432091346153845e-05, "loss": 0.9498, "step": 1470 }, { "epoch": 0.8581012104418843, "grad_norm": 1.1644681692123413, "learning_rate": 2.9417067307692307e-05, "loss": 1.037, "step": 1471 }, { "epoch": 0.8586845559282485, "grad_norm": 1.0714110136032104, "learning_rate": 2.9402043269230772e-05, "loss": 1.0802, "step": 1472 }, { "epoch": 0.8592679014146128, "grad_norm": 1.1564422845840454, "learning_rate": 2.938701923076923e-05, "loss": 1.0889, "step": 1473 }, { "epoch": 0.8598512469009771, "grad_norm": 1.3579368591308594, "learning_rate": 2.9371995192307693e-05, "loss": 0.8359, "step": 1474 }, { "epoch": 0.8604345923873414, "grad_norm": 1.0801093578338623, "learning_rate": 2.9356971153846158e-05, "loss": 0.911, "step": 1475 }, { "epoch": 0.8610179378737057, "grad_norm": 1.1530592441558838, "learning_rate": 2.9341947115384617e-05, "loss": 1.0432, "step": 1476 }, { "epoch": 0.86160128336007, "grad_norm": 1.07007896900177, "learning_rate": 2.932692307692308e-05, "loss": 1.0042, "step": 1477 }, { "epoch": 0.8621846288464343, "grad_norm": 1.128960371017456, "learning_rate": 2.9311899038461537e-05, "loss": 1.0481, "step": 1478 }, { "epoch": 0.8627679743327986, "grad_norm": 1.7839515209197998, "learning_rate": 2.9296875000000002e-05, "loss": 1.1279, "step": 1479 }, { "epoch": 0.8633513198191629, "grad_norm": 1.7222251892089844, "learning_rate": 2.9281850961538464e-05, "loss": 0.9325, "step": 1480 }, { "epoch": 0.8639346653055272, "grad_norm": 0.8684887290000916, "learning_rate": 2.9266826923076923e-05, "loss": 1.1595, "step": 1481 }, { "epoch": 0.8645180107918915, "grad_norm": 1.20137357711792, "learning_rate": 2.9251802884615388e-05, "loss": 1.0726, "step": 1482 }, { "epoch": 0.8651013562782558, "grad_norm": 1.3149369955062866, "learning_rate": 2.9236778846153846e-05, "loss": 1.1417, "step": 1483 }, { "epoch": 0.8656847017646201, "grad_norm": 1.2710528373718262, "learning_rate": 2.922175480769231e-05, "loss": 1.0942, "step": 1484 }, { "epoch": 0.8662680472509844, "grad_norm": 1.145824909210205, "learning_rate": 2.9206730769230774e-05, "loss": 1.0122, "step": 1485 }, { "epoch": 0.8668513927373487, "grad_norm": 1.2785214185714722, "learning_rate": 2.9191706730769232e-05, "loss": 1.1068, "step": 1486 }, { "epoch": 0.867434738223713, "grad_norm": 1.3779847621917725, "learning_rate": 2.9176682692307694e-05, "loss": 1.1604, "step": 1487 }, { "epoch": 0.8680180837100773, "grad_norm": 1.2660037279129028, "learning_rate": 2.916165865384616e-05, "loss": 0.9293, "step": 1488 }, { "epoch": 0.8686014291964416, "grad_norm": 1.2929606437683105, "learning_rate": 2.9146634615384614e-05, "loss": 0.879, "step": 1489 }, { "epoch": 0.8691847746828059, "grad_norm": 1.376483678817749, "learning_rate": 2.913161057692308e-05, "loss": 1.0383, "step": 1490 }, { "epoch": 0.8697681201691702, "grad_norm": 1.1967178583145142, "learning_rate": 2.9116586538461538e-05, "loss": 0.981, "step": 1491 }, { "epoch": 0.8703514656555345, "grad_norm": 1.1415852308273315, "learning_rate": 2.91015625e-05, "loss": 0.8465, "step": 1492 }, { "epoch": 0.8709348111418987, "grad_norm": 1.2080132961273193, "learning_rate": 2.9086538461538465e-05, "loss": 1.0375, "step": 1493 }, { "epoch": 0.871518156628263, "grad_norm": 1.1498886346817017, "learning_rate": 2.9071514423076924e-05, "loss": 1.1026, "step": 1494 }, { "epoch": 0.8721015021146273, "grad_norm": 1.213456630706787, "learning_rate": 2.9056490384615386e-05, "loss": 1.1582, "step": 1495 }, { "epoch": 0.8726848476009917, "grad_norm": 2.127002239227295, "learning_rate": 2.9041466346153844e-05, "loss": 0.9406, "step": 1496 }, { "epoch": 0.873268193087356, "grad_norm": 1.454347848892212, "learning_rate": 2.902644230769231e-05, "loss": 0.9734, "step": 1497 }, { "epoch": 0.8738515385737203, "grad_norm": 1.1562999486923218, "learning_rate": 2.901141826923077e-05, "loss": 1.1249, "step": 1498 }, { "epoch": 0.8744348840600846, "grad_norm": 1.0088770389556885, "learning_rate": 2.899639423076923e-05, "loss": 1.1021, "step": 1499 }, { "epoch": 0.8750182295464489, "grad_norm": 0.972812294960022, "learning_rate": 2.8981370192307695e-05, "loss": 0.8978, "step": 1500 }, { "epoch": 0.8756015750328132, "grad_norm": 1.3404576778411865, "learning_rate": 2.8966346153846157e-05, "loss": 1.0195, "step": 1501 }, { "epoch": 0.8761849205191775, "grad_norm": 1.1055123805999756, "learning_rate": 2.8951322115384616e-05, "loss": 1.1829, "step": 1502 }, { "epoch": 0.8767682660055418, "grad_norm": 0.9296578764915466, "learning_rate": 2.893629807692308e-05, "loss": 1.1158, "step": 1503 }, { "epoch": 0.8773516114919061, "grad_norm": 1.197265625, "learning_rate": 2.892127403846154e-05, "loss": 1.0498, "step": 1504 }, { "epoch": 0.8779349569782704, "grad_norm": 0.9996442794799805, "learning_rate": 2.890625e-05, "loss": 1.0372, "step": 1505 }, { "epoch": 0.8785183024646347, "grad_norm": 1.2330306768417358, "learning_rate": 2.8891225961538467e-05, "loss": 1.0224, "step": 1506 }, { "epoch": 0.879101647950999, "grad_norm": 0.902051568031311, "learning_rate": 2.887620192307692e-05, "loss": 0.6842, "step": 1507 }, { "epoch": 0.8796849934373633, "grad_norm": 1.1468744277954102, "learning_rate": 2.8861177884615387e-05, "loss": 0.7994, "step": 1508 }, { "epoch": 0.8802683389237276, "grad_norm": 1.3102710247039795, "learning_rate": 2.8846153846153845e-05, "loss": 0.8041, "step": 1509 }, { "epoch": 0.8808516844100919, "grad_norm": 1.234621286392212, "learning_rate": 2.8831129807692307e-05, "loss": 0.9069, "step": 1510 }, { "epoch": 0.8814350298964562, "grad_norm": 1.3230934143066406, "learning_rate": 2.8816105769230773e-05, "loss": 0.8703, "step": 1511 }, { "epoch": 0.8820183753828205, "grad_norm": 1.1847695112228394, "learning_rate": 2.880108173076923e-05, "loss": 1.0461, "step": 1512 }, { "epoch": 0.8826017208691848, "grad_norm": 1.535765528678894, "learning_rate": 2.8786057692307693e-05, "loss": 0.9742, "step": 1513 }, { "epoch": 0.883185066355549, "grad_norm": 1.295670509338379, "learning_rate": 2.8771033653846158e-05, "loss": 0.9253, "step": 1514 }, { "epoch": 0.8837684118419133, "grad_norm": 1.4603569507598877, "learning_rate": 2.8756009615384617e-05, "loss": 1.2047, "step": 1515 }, { "epoch": 0.8843517573282776, "grad_norm": 1.117161512374878, "learning_rate": 2.874098557692308e-05, "loss": 1.1832, "step": 1516 }, { "epoch": 0.8849351028146419, "grad_norm": 1.2172470092773438, "learning_rate": 2.8725961538461537e-05, "loss": 1.0312, "step": 1517 }, { "epoch": 0.8855184483010062, "grad_norm": 1.279415249824524, "learning_rate": 2.8710937500000002e-05, "loss": 1.0095, "step": 1518 }, { "epoch": 0.8861017937873705, "grad_norm": 1.3966619968414307, "learning_rate": 2.8695913461538464e-05, "loss": 0.921, "step": 1519 }, { "epoch": 0.8866851392737348, "grad_norm": 1.2895619869232178, "learning_rate": 2.8680889423076923e-05, "loss": 0.9268, "step": 1520 }, { "epoch": 0.8872684847600991, "grad_norm": 1.188140869140625, "learning_rate": 2.8665865384615388e-05, "loss": 0.8283, "step": 1521 }, { "epoch": 0.8878518302464634, "grad_norm": 1.200034499168396, "learning_rate": 2.8650841346153847e-05, "loss": 1.0621, "step": 1522 }, { "epoch": 0.8884351757328278, "grad_norm": 1.3461898565292358, "learning_rate": 2.863581730769231e-05, "loss": 0.9668, "step": 1523 }, { "epoch": 0.8890185212191921, "grad_norm": 1.2042393684387207, "learning_rate": 2.8620793269230774e-05, "loss": 1.145, "step": 1524 }, { "epoch": 0.8896018667055564, "grad_norm": 1.2944340705871582, "learning_rate": 2.860576923076923e-05, "loss": 0.8788, "step": 1525 }, { "epoch": 0.8901852121919207, "grad_norm": 1.2656347751617432, "learning_rate": 2.8590745192307694e-05, "loss": 0.826, "step": 1526 }, { "epoch": 0.890768557678285, "grad_norm": 1.079966425895691, "learning_rate": 2.857572115384616e-05, "loss": 0.9062, "step": 1527 }, { "epoch": 0.8913519031646493, "grad_norm": 1.440693736076355, "learning_rate": 2.8560697115384615e-05, "loss": 1.0446, "step": 1528 }, { "epoch": 0.8919352486510136, "grad_norm": 1.3850165605545044, "learning_rate": 2.854567307692308e-05, "loss": 0.9395, "step": 1529 }, { "epoch": 0.8925185941373779, "grad_norm": 1.3351702690124512, "learning_rate": 2.853064903846154e-05, "loss": 1.1362, "step": 1530 }, { "epoch": 0.8931019396237422, "grad_norm": 0.9820516109466553, "learning_rate": 2.8515625e-05, "loss": 0.8793, "step": 1531 }, { "epoch": 0.8936852851101065, "grad_norm": 1.218936562538147, "learning_rate": 2.8500600961538466e-05, "loss": 0.9328, "step": 1532 }, { "epoch": 0.8942686305964708, "grad_norm": 1.2848559617996216, "learning_rate": 2.8485576923076924e-05, "loss": 0.955, "step": 1533 }, { "epoch": 0.8948519760828351, "grad_norm": 1.3455287218093872, "learning_rate": 2.8470552884615386e-05, "loss": 0.9611, "step": 1534 }, { "epoch": 0.8954353215691994, "grad_norm": 1.62530517578125, "learning_rate": 2.8455528846153844e-05, "loss": 0.9592, "step": 1535 }, { "epoch": 0.8960186670555637, "grad_norm": 1.1084357500076294, "learning_rate": 2.844050480769231e-05, "loss": 1.1606, "step": 1536 }, { "epoch": 0.896602012541928, "grad_norm": 1.11439847946167, "learning_rate": 2.842548076923077e-05, "loss": 0.9126, "step": 1537 }, { "epoch": 0.8971853580282922, "grad_norm": 1.2643011808395386, "learning_rate": 2.841045673076923e-05, "loss": 1.1443, "step": 1538 }, { "epoch": 0.8977687035146565, "grad_norm": 1.1699936389923096, "learning_rate": 2.8395432692307695e-05, "loss": 0.94, "step": 1539 }, { "epoch": 0.8983520490010208, "grad_norm": 1.2582471370697021, "learning_rate": 2.8380408653846157e-05, "loss": 1.0579, "step": 1540 }, { "epoch": 0.8989353944873851, "grad_norm": 1.1401822566986084, "learning_rate": 2.8365384615384616e-05, "loss": 0.8857, "step": 1541 }, { "epoch": 0.8995187399737494, "grad_norm": 1.2778136730194092, "learning_rate": 2.835036057692308e-05, "loss": 0.7792, "step": 1542 }, { "epoch": 0.9001020854601137, "grad_norm": 1.3046090602874756, "learning_rate": 2.8335336538461536e-05, "loss": 1.0387, "step": 1543 }, { "epoch": 0.900685430946478, "grad_norm": 1.1095534563064575, "learning_rate": 2.83203125e-05, "loss": 0.8712, "step": 1544 }, { "epoch": 0.9012687764328423, "grad_norm": 1.116352915763855, "learning_rate": 2.8305288461538467e-05, "loss": 0.9245, "step": 1545 }, { "epoch": 0.9018521219192066, "grad_norm": 1.2703346014022827, "learning_rate": 2.8290264423076922e-05, "loss": 0.8139, "step": 1546 }, { "epoch": 0.9024354674055709, "grad_norm": 1.2357991933822632, "learning_rate": 2.8275240384615387e-05, "loss": 1.1956, "step": 1547 }, { "epoch": 0.9030188128919352, "grad_norm": 1.0954554080963135, "learning_rate": 2.8260216346153846e-05, "loss": 0.8301, "step": 1548 }, { "epoch": 0.9036021583782996, "grad_norm": 1.6321262121200562, "learning_rate": 2.8245192307692307e-05, "loss": 0.9625, "step": 1549 }, { "epoch": 0.9041855038646639, "grad_norm": 1.3000946044921875, "learning_rate": 2.8230168269230773e-05, "loss": 0.9551, "step": 1550 }, { "epoch": 0.9047688493510282, "grad_norm": 1.1882951259613037, "learning_rate": 2.821514423076923e-05, "loss": 0.8661, "step": 1551 }, { "epoch": 0.9053521948373925, "grad_norm": 1.136929988861084, "learning_rate": 2.8200120192307693e-05, "loss": 0.8345, "step": 1552 }, { "epoch": 0.9059355403237568, "grad_norm": 1.2074054479599, "learning_rate": 2.818509615384616e-05, "loss": 1.0495, "step": 1553 }, { "epoch": 0.9065188858101211, "grad_norm": 1.0739816427230835, "learning_rate": 2.8170072115384617e-05, "loss": 0.8297, "step": 1554 }, { "epoch": 0.9071022312964854, "grad_norm": 1.0656182765960693, "learning_rate": 2.815504807692308e-05, "loss": 0.9902, "step": 1555 }, { "epoch": 0.9076855767828497, "grad_norm": 1.072704553604126, "learning_rate": 2.8140024038461537e-05, "loss": 1.1225, "step": 1556 }, { "epoch": 0.908268922269214, "grad_norm": 1.2448283433914185, "learning_rate": 2.8125000000000003e-05, "loss": 1.2656, "step": 1557 }, { "epoch": 0.9088522677555783, "grad_norm": 1.2833285331726074, "learning_rate": 2.8109975961538465e-05, "loss": 1.297, "step": 1558 }, { "epoch": 0.9094356132419426, "grad_norm": 0.9518376588821411, "learning_rate": 2.8094951923076923e-05, "loss": 1.2577, "step": 1559 }, { "epoch": 0.9100189587283068, "grad_norm": 1.1654349565505981, "learning_rate": 2.8079927884615388e-05, "loss": 0.9981, "step": 1560 }, { "epoch": 0.9106023042146711, "grad_norm": 1.316017746925354, "learning_rate": 2.8064903846153843e-05, "loss": 0.8892, "step": 1561 }, { "epoch": 0.9111856497010354, "grad_norm": 1.2900649309158325, "learning_rate": 2.804987980769231e-05, "loss": 0.9713, "step": 1562 }, { "epoch": 0.9117689951873997, "grad_norm": 1.169692039489746, "learning_rate": 2.8034855769230774e-05, "loss": 1.0375, "step": 1563 }, { "epoch": 0.912352340673764, "grad_norm": 1.1953729391098022, "learning_rate": 2.801983173076923e-05, "loss": 0.8052, "step": 1564 }, { "epoch": 0.9129356861601283, "grad_norm": 1.3699668645858765, "learning_rate": 2.8004807692307694e-05, "loss": 0.968, "step": 1565 }, { "epoch": 0.9135190316464926, "grad_norm": 1.2539699077606201, "learning_rate": 2.7989783653846156e-05, "loss": 1.0042, "step": 1566 }, { "epoch": 0.9141023771328569, "grad_norm": 1.1665087938308716, "learning_rate": 2.7974759615384615e-05, "loss": 0.949, "step": 1567 }, { "epoch": 0.9146857226192212, "grad_norm": 1.421539068222046, "learning_rate": 2.795973557692308e-05, "loss": 0.9799, "step": 1568 }, { "epoch": 0.9152690681055855, "grad_norm": 1.0832866430282593, "learning_rate": 2.794471153846154e-05, "loss": 0.8219, "step": 1569 }, { "epoch": 0.9158524135919498, "grad_norm": 1.2853337526321411, "learning_rate": 2.79296875e-05, "loss": 1.0887, "step": 1570 }, { "epoch": 0.9164357590783141, "grad_norm": 1.1525390148162842, "learning_rate": 2.7914663461538466e-05, "loss": 1.0594, "step": 1571 }, { "epoch": 0.9170191045646784, "grad_norm": 1.3522123098373413, "learning_rate": 2.7899639423076924e-05, "loss": 1.012, "step": 1572 }, { "epoch": 0.9176024500510427, "grad_norm": 1.1640657186508179, "learning_rate": 2.7884615384615386e-05, "loss": 0.9551, "step": 1573 }, { "epoch": 0.918185795537407, "grad_norm": 1.099858045578003, "learning_rate": 2.7869591346153845e-05, "loss": 0.9387, "step": 1574 }, { "epoch": 0.9187691410237713, "grad_norm": 1.295347809791565, "learning_rate": 2.785456730769231e-05, "loss": 0.9975, "step": 1575 }, { "epoch": 0.9193524865101357, "grad_norm": 1.0438365936279297, "learning_rate": 2.7839543269230772e-05, "loss": 0.9287, "step": 1576 }, { "epoch": 0.9199358319965, "grad_norm": 1.5085781812667847, "learning_rate": 2.782451923076923e-05, "loss": 0.8949, "step": 1577 }, { "epoch": 0.9205191774828643, "grad_norm": 1.1125335693359375, "learning_rate": 2.7809495192307696e-05, "loss": 1.1754, "step": 1578 }, { "epoch": 0.9211025229692286, "grad_norm": 1.4580250978469849, "learning_rate": 2.7794471153846157e-05, "loss": 1.188, "step": 1579 }, { "epoch": 0.9216858684555929, "grad_norm": 1.5720735788345337, "learning_rate": 2.7779447115384616e-05, "loss": 0.97, "step": 1580 }, { "epoch": 0.9222692139419572, "grad_norm": 1.5077130794525146, "learning_rate": 2.776442307692308e-05, "loss": 0.9886, "step": 1581 }, { "epoch": 0.9228525594283215, "grad_norm": 0.9293258786201477, "learning_rate": 2.7749399038461536e-05, "loss": 0.9714, "step": 1582 }, { "epoch": 0.9234359049146857, "grad_norm": 1.5474597215652466, "learning_rate": 2.7734375e-05, "loss": 0.9918, "step": 1583 }, { "epoch": 0.92401925040105, "grad_norm": 1.3098106384277344, "learning_rate": 2.7719350961538463e-05, "loss": 1.1593, "step": 1584 }, { "epoch": 0.9246025958874143, "grad_norm": 1.1443274021148682, "learning_rate": 2.7704326923076922e-05, "loss": 1.0293, "step": 1585 }, { "epoch": 0.9251859413737786, "grad_norm": 1.362916111946106, "learning_rate": 2.7689302884615387e-05, "loss": 1.2039, "step": 1586 }, { "epoch": 0.9257692868601429, "grad_norm": 1.2175136804580688, "learning_rate": 2.7674278846153846e-05, "loss": 0.9562, "step": 1587 }, { "epoch": 0.9263526323465072, "grad_norm": 1.4667470455169678, "learning_rate": 2.7659254807692308e-05, "loss": 0.9903, "step": 1588 }, { "epoch": 0.9269359778328715, "grad_norm": 1.317681908607483, "learning_rate": 2.7644230769230773e-05, "loss": 1.0178, "step": 1589 }, { "epoch": 0.9275193233192358, "grad_norm": 1.2673649787902832, "learning_rate": 2.762920673076923e-05, "loss": 1.0001, "step": 1590 }, { "epoch": 0.9281026688056001, "grad_norm": 1.055626630783081, "learning_rate": 2.7614182692307693e-05, "loss": 1.0075, "step": 1591 }, { "epoch": 0.9286860142919644, "grad_norm": 0.938339352607727, "learning_rate": 2.759915865384616e-05, "loss": 0.8492, "step": 1592 }, { "epoch": 0.9292693597783287, "grad_norm": 1.1331336498260498, "learning_rate": 2.7584134615384617e-05, "loss": 1.1089, "step": 1593 }, { "epoch": 0.929852705264693, "grad_norm": 0.9863076210021973, "learning_rate": 2.756911057692308e-05, "loss": 0.827, "step": 1594 }, { "epoch": 0.9304360507510573, "grad_norm": 0.8568504452705383, "learning_rate": 2.7554086538461537e-05, "loss": 0.9642, "step": 1595 }, { "epoch": 0.9310193962374216, "grad_norm": 1.3427821397781372, "learning_rate": 2.7539062500000003e-05, "loss": 1.0344, "step": 1596 }, { "epoch": 0.9316027417237859, "grad_norm": 1.1285762786865234, "learning_rate": 2.7524038461538465e-05, "loss": 0.9632, "step": 1597 }, { "epoch": 0.9321860872101502, "grad_norm": 1.079243779182434, "learning_rate": 2.7509014423076923e-05, "loss": 1.1732, "step": 1598 }, { "epoch": 0.9327694326965145, "grad_norm": 1.4731907844543457, "learning_rate": 2.749399038461539e-05, "loss": 1.0798, "step": 1599 }, { "epoch": 0.9333527781828788, "grad_norm": 1.8456404209136963, "learning_rate": 2.7478966346153844e-05, "loss": 0.8122, "step": 1600 }, { "epoch": 0.9333527781828788, "eval_loss_squad": 0.8185375917516649, "eval_perplexity": 8.537540772600966, "eval_perplexity_reconstruct": 1.9311449308939752, "step": 1600 }, { "epoch": 0.9339361236692431, "grad_norm": 1.5223437547683716, "learning_rate": 2.746394230769231e-05, "loss": 1.0494, "step": 1601 }, { "epoch": 0.9345194691556075, "grad_norm": 1.1983387470245361, "learning_rate": 2.744891826923077e-05, "loss": 1.1375, "step": 1602 }, { "epoch": 0.9351028146419718, "grad_norm": 1.253590703010559, "learning_rate": 2.743389423076923e-05, "loss": 0.86, "step": 1603 }, { "epoch": 0.935686160128336, "grad_norm": 1.5768380165100098, "learning_rate": 2.7418870192307695e-05, "loss": 1.0707, "step": 1604 }, { "epoch": 0.9362695056147003, "grad_norm": 1.1141117811203003, "learning_rate": 2.7403846153846156e-05, "loss": 0.9249, "step": 1605 }, { "epoch": 0.9368528511010646, "grad_norm": 1.229311466217041, "learning_rate": 2.7388822115384615e-05, "loss": 1.1422, "step": 1606 }, { "epoch": 0.9374361965874289, "grad_norm": 1.0226390361785889, "learning_rate": 2.737379807692308e-05, "loss": 1.1093, "step": 1607 }, { "epoch": 0.9380195420737932, "grad_norm": 1.055548071861267, "learning_rate": 2.735877403846154e-05, "loss": 1.1264, "step": 1608 }, { "epoch": 0.9386028875601575, "grad_norm": 1.326737642288208, "learning_rate": 2.734375e-05, "loss": 0.7929, "step": 1609 }, { "epoch": 0.9391862330465218, "grad_norm": 1.2810485363006592, "learning_rate": 2.7328725961538466e-05, "loss": 0.9209, "step": 1610 }, { "epoch": 0.9397695785328861, "grad_norm": 1.1614972352981567, "learning_rate": 2.7313701923076924e-05, "loss": 0.8571, "step": 1611 }, { "epoch": 0.9403529240192504, "grad_norm": 1.063585638999939, "learning_rate": 2.7298677884615386e-05, "loss": 1.0951, "step": 1612 }, { "epoch": 0.9409362695056147, "grad_norm": 1.1287500858306885, "learning_rate": 2.7283653846153845e-05, "loss": 0.9155, "step": 1613 }, { "epoch": 0.941519614991979, "grad_norm": 1.202637791633606, "learning_rate": 2.726862980769231e-05, "loss": 0.922, "step": 1614 }, { "epoch": 0.9421029604783433, "grad_norm": 0.9116735458374023, "learning_rate": 2.7253605769230772e-05, "loss": 1.1287, "step": 1615 }, { "epoch": 0.9426863059647076, "grad_norm": 1.2105416059494019, "learning_rate": 2.723858173076923e-05, "loss": 1.0168, "step": 1616 }, { "epoch": 0.9432696514510719, "grad_norm": 1.4131770133972168, "learning_rate": 2.7223557692307696e-05, "loss": 0.8181, "step": 1617 }, { "epoch": 0.9438529969374362, "grad_norm": 1.2585093975067139, "learning_rate": 2.7208533653846158e-05, "loss": 0.8861, "step": 1618 }, { "epoch": 0.9444363424238005, "grad_norm": 1.1039953231811523, "learning_rate": 2.7193509615384616e-05, "loss": 0.9609, "step": 1619 }, { "epoch": 0.9450196879101648, "grad_norm": 1.245587706565857, "learning_rate": 2.7178485576923078e-05, "loss": 0.8698, "step": 1620 }, { "epoch": 0.9456030333965291, "grad_norm": 1.325048565864563, "learning_rate": 2.7163461538461536e-05, "loss": 0.8585, "step": 1621 }, { "epoch": 0.9461863788828934, "grad_norm": 1.3289551734924316, "learning_rate": 2.7148437500000002e-05, "loss": 1.089, "step": 1622 }, { "epoch": 0.9467697243692577, "grad_norm": 1.2364208698272705, "learning_rate": 2.7133413461538464e-05, "loss": 1.0367, "step": 1623 }, { "epoch": 0.947353069855622, "grad_norm": 1.2870745658874512, "learning_rate": 2.7118389423076922e-05, "loss": 1.1767, "step": 1624 }, { "epoch": 0.9479364153419862, "grad_norm": 1.1112834215164185, "learning_rate": 2.7103365384615387e-05, "loss": 0.9196, "step": 1625 }, { "epoch": 0.9485197608283505, "grad_norm": 1.0553075075149536, "learning_rate": 2.7088341346153846e-05, "loss": 1.01, "step": 1626 }, { "epoch": 0.9491031063147148, "grad_norm": 1.2069644927978516, "learning_rate": 2.7073317307692308e-05, "loss": 1.0086, "step": 1627 }, { "epoch": 0.9496864518010791, "grad_norm": 1.0190929174423218, "learning_rate": 2.7058293269230773e-05, "loss": 0.9053, "step": 1628 }, { "epoch": 0.9502697972874435, "grad_norm": 1.1159418821334839, "learning_rate": 2.704326923076923e-05, "loss": 0.9234, "step": 1629 }, { "epoch": 0.9508531427738078, "grad_norm": 1.2140814065933228, "learning_rate": 2.7028245192307693e-05, "loss": 0.8535, "step": 1630 }, { "epoch": 0.9514364882601721, "grad_norm": 1.3430871963500977, "learning_rate": 2.701322115384616e-05, "loss": 1.1157, "step": 1631 }, { "epoch": 0.9520198337465364, "grad_norm": 1.2624151706695557, "learning_rate": 2.6998197115384617e-05, "loss": 0.9076, "step": 1632 }, { "epoch": 0.9526031792329007, "grad_norm": 1.325024127960205, "learning_rate": 2.698317307692308e-05, "loss": 0.9707, "step": 1633 }, { "epoch": 0.953186524719265, "grad_norm": 1.1497493982315063, "learning_rate": 2.6968149038461538e-05, "loss": 0.9365, "step": 1634 }, { "epoch": 0.9537698702056293, "grad_norm": 1.139591097831726, "learning_rate": 2.6953125000000003e-05, "loss": 1.0114, "step": 1635 }, { "epoch": 0.9543532156919936, "grad_norm": 1.202013373374939, "learning_rate": 2.6938100961538465e-05, "loss": 0.8738, "step": 1636 }, { "epoch": 0.9549365611783579, "grad_norm": 1.1490192413330078, "learning_rate": 2.6923076923076923e-05, "loss": 0.9644, "step": 1637 }, { "epoch": 0.9555199066647222, "grad_norm": 1.1956679821014404, "learning_rate": 2.6908052884615385e-05, "loss": 0.8404, "step": 1638 }, { "epoch": 0.9561032521510865, "grad_norm": 1.016502857208252, "learning_rate": 2.6893028846153844e-05, "loss": 0.9757, "step": 1639 }, { "epoch": 0.9566865976374508, "grad_norm": 1.1245112419128418, "learning_rate": 2.687800480769231e-05, "loss": 0.8616, "step": 1640 }, { "epoch": 0.9572699431238151, "grad_norm": 1.326154351234436, "learning_rate": 2.686298076923077e-05, "loss": 0.8661, "step": 1641 }, { "epoch": 0.9578532886101794, "grad_norm": 1.2109782695770264, "learning_rate": 2.684795673076923e-05, "loss": 0.8722, "step": 1642 }, { "epoch": 0.9584366340965437, "grad_norm": 1.2979744672775269, "learning_rate": 2.6832932692307695e-05, "loss": 0.9699, "step": 1643 }, { "epoch": 0.959019979582908, "grad_norm": 1.4275164604187012, "learning_rate": 2.6817908653846157e-05, "loss": 1.3414, "step": 1644 }, { "epoch": 0.9596033250692723, "grad_norm": 2.1155264377593994, "learning_rate": 2.6802884615384615e-05, "loss": 1.061, "step": 1645 }, { "epoch": 0.9601866705556366, "grad_norm": 1.1777634620666504, "learning_rate": 2.678786057692308e-05, "loss": 0.8194, "step": 1646 }, { "epoch": 0.9607700160420009, "grad_norm": 1.3087421655654907, "learning_rate": 2.677283653846154e-05, "loss": 1.084, "step": 1647 }, { "epoch": 0.9613533615283651, "grad_norm": 1.2843700647354126, "learning_rate": 2.67578125e-05, "loss": 0.9156, "step": 1648 }, { "epoch": 0.9619367070147294, "grad_norm": 1.2804768085479736, "learning_rate": 2.6742788461538466e-05, "loss": 1.1256, "step": 1649 }, { "epoch": 0.9625200525010937, "grad_norm": 1.1783385276794434, "learning_rate": 2.6727764423076925e-05, "loss": 0.7652, "step": 1650 }, { "epoch": 0.963103397987458, "grad_norm": 1.0474493503570557, "learning_rate": 2.6712740384615386e-05, "loss": 1.1225, "step": 1651 }, { "epoch": 0.9636867434738223, "grad_norm": 1.003244161605835, "learning_rate": 2.6697716346153845e-05, "loss": 0.9582, "step": 1652 }, { "epoch": 0.9642700889601866, "grad_norm": 1.3323856592178345, "learning_rate": 2.668269230769231e-05, "loss": 0.9056, "step": 1653 }, { "epoch": 0.9648534344465509, "grad_norm": 0.8693028092384338, "learning_rate": 2.6667668269230772e-05, "loss": 1.2649, "step": 1654 }, { "epoch": 0.9654367799329153, "grad_norm": 1.2013863325119019, "learning_rate": 2.665264423076923e-05, "loss": 1.0149, "step": 1655 }, { "epoch": 0.9660201254192796, "grad_norm": 1.061509132385254, "learning_rate": 2.6637620192307692e-05, "loss": 0.9357, "step": 1656 }, { "epoch": 0.9666034709056439, "grad_norm": 1.086358904838562, "learning_rate": 2.6622596153846158e-05, "loss": 0.7849, "step": 1657 }, { "epoch": 0.9671868163920082, "grad_norm": 1.2358739376068115, "learning_rate": 2.6607572115384616e-05, "loss": 1.0885, "step": 1658 }, { "epoch": 0.9677701618783725, "grad_norm": 1.0823692083358765, "learning_rate": 2.6592548076923078e-05, "loss": 0.8326, "step": 1659 }, { "epoch": 0.9683535073647368, "grad_norm": 2.211810827255249, "learning_rate": 2.6577524038461537e-05, "loss": 1.0041, "step": 1660 }, { "epoch": 0.9689368528511011, "grad_norm": 1.0642420053482056, "learning_rate": 2.6562500000000002e-05, "loss": 1.0824, "step": 1661 }, { "epoch": 0.9695201983374654, "grad_norm": 1.0718188285827637, "learning_rate": 2.6547475961538464e-05, "loss": 1.024, "step": 1662 }, { "epoch": 0.9701035438238297, "grad_norm": 0.972342848777771, "learning_rate": 2.6532451923076922e-05, "loss": 1.0009, "step": 1663 }, { "epoch": 0.970686889310194, "grad_norm": 1.5624159574508667, "learning_rate": 2.6517427884615388e-05, "loss": 0.9796, "step": 1664 }, { "epoch": 0.9712702347965583, "grad_norm": 1.251407504081726, "learning_rate": 2.6502403846153846e-05, "loss": 1.0386, "step": 1665 }, { "epoch": 0.9718535802829226, "grad_norm": 1.3637851476669312, "learning_rate": 2.6487379807692308e-05, "loss": 1.0674, "step": 1666 }, { "epoch": 0.9724369257692869, "grad_norm": 1.1037849187850952, "learning_rate": 2.6472355769230773e-05, "loss": 0.8173, "step": 1667 }, { "epoch": 0.9730202712556512, "grad_norm": 1.0050426721572876, "learning_rate": 2.6457331730769232e-05, "loss": 0.9875, "step": 1668 }, { "epoch": 0.9736036167420155, "grad_norm": 1.2693151235580444, "learning_rate": 2.6442307692307694e-05, "loss": 1.0777, "step": 1669 }, { "epoch": 0.9741869622283797, "grad_norm": 1.139029622077942, "learning_rate": 2.642728365384616e-05, "loss": 1.1184, "step": 1670 }, { "epoch": 0.974770307714744, "grad_norm": 1.156640887260437, "learning_rate": 2.6412259615384617e-05, "loss": 1.1217, "step": 1671 }, { "epoch": 0.9753536532011083, "grad_norm": 1.2772996425628662, "learning_rate": 2.639723557692308e-05, "loss": 0.8868, "step": 1672 }, { "epoch": 0.9759369986874726, "grad_norm": 1.1612448692321777, "learning_rate": 2.6382211538461538e-05, "loss": 0.912, "step": 1673 }, { "epoch": 0.9765203441738369, "grad_norm": 1.0551241636276245, "learning_rate": 2.63671875e-05, "loss": 0.9191, "step": 1674 }, { "epoch": 0.9771036896602012, "grad_norm": 1.092178225517273, "learning_rate": 2.6352163461538465e-05, "loss": 0.8177, "step": 1675 }, { "epoch": 0.9776870351465655, "grad_norm": 1.261635422706604, "learning_rate": 2.6337139423076923e-05, "loss": 0.8914, "step": 1676 }, { "epoch": 0.9782703806329298, "grad_norm": 1.3369121551513672, "learning_rate": 2.6322115384615385e-05, "loss": 0.7757, "step": 1677 }, { "epoch": 0.9788537261192941, "grad_norm": 1.1056147813796997, "learning_rate": 2.6307091346153844e-05, "loss": 0.8988, "step": 1678 }, { "epoch": 0.9794370716056584, "grad_norm": 1.2153946161270142, "learning_rate": 2.629206730769231e-05, "loss": 1.0653, "step": 1679 }, { "epoch": 0.9800204170920227, "grad_norm": 1.085607886314392, "learning_rate": 2.627704326923077e-05, "loss": 1.0798, "step": 1680 }, { "epoch": 0.980603762578387, "grad_norm": 1.152902603149414, "learning_rate": 2.626201923076923e-05, "loss": 1.193, "step": 1681 }, { "epoch": 0.9811871080647514, "grad_norm": 1.100484848022461, "learning_rate": 2.6246995192307695e-05, "loss": 0.8515, "step": 1682 }, { "epoch": 0.9817704535511157, "grad_norm": 0.9451040029525757, "learning_rate": 2.6231971153846157e-05, "loss": 1.015, "step": 1683 }, { "epoch": 0.98235379903748, "grad_norm": 1.5376704931259155, "learning_rate": 2.6216947115384615e-05, "loss": 1.1115, "step": 1684 }, { "epoch": 0.9829371445238443, "grad_norm": 1.2775416374206543, "learning_rate": 2.620192307692308e-05, "loss": 0.9463, "step": 1685 }, { "epoch": 0.9835204900102086, "grad_norm": 1.0612167119979858, "learning_rate": 2.618689903846154e-05, "loss": 1.3035, "step": 1686 }, { "epoch": 0.9841038354965729, "grad_norm": 1.1903184652328491, "learning_rate": 2.6171875e-05, "loss": 1.0081, "step": 1687 }, { "epoch": 0.9846871809829372, "grad_norm": 1.1516382694244385, "learning_rate": 2.6156850961538466e-05, "loss": 0.8602, "step": 1688 }, { "epoch": 0.9852705264693015, "grad_norm": 1.1271847486495972, "learning_rate": 2.6141826923076925e-05, "loss": 1.109, "step": 1689 }, { "epoch": 0.9858538719556658, "grad_norm": 1.0397697687149048, "learning_rate": 2.6126802884615387e-05, "loss": 1.2407, "step": 1690 }, { "epoch": 0.9864372174420301, "grad_norm": 1.1553921699523926, "learning_rate": 2.6111778846153845e-05, "loss": 1.0275, "step": 1691 }, { "epoch": 0.9870205629283944, "grad_norm": 1.3742332458496094, "learning_rate": 2.6096754807692307e-05, "loss": 1.0156, "step": 1692 }, { "epoch": 0.9876039084147586, "grad_norm": 1.3807793855667114, "learning_rate": 2.6081730769230772e-05, "loss": 1.0403, "step": 1693 }, { "epoch": 0.9881872539011229, "grad_norm": 2.3124213218688965, "learning_rate": 2.606670673076923e-05, "loss": 1.065, "step": 1694 }, { "epoch": 0.9887705993874872, "grad_norm": 1.1907039880752563, "learning_rate": 2.6051682692307693e-05, "loss": 0.7911, "step": 1695 }, { "epoch": 0.9893539448738515, "grad_norm": 1.244017481803894, "learning_rate": 2.6036658653846158e-05, "loss": 1.0102, "step": 1696 }, { "epoch": 0.9899372903602158, "grad_norm": 1.2556577920913696, "learning_rate": 2.6021634615384616e-05, "loss": 0.949, "step": 1697 }, { "epoch": 0.9905206358465801, "grad_norm": 1.2664211988449097, "learning_rate": 2.6006610576923078e-05, "loss": 0.9661, "step": 1698 }, { "epoch": 0.9911039813329444, "grad_norm": 1.1367192268371582, "learning_rate": 2.5991586538461537e-05, "loss": 1.094, "step": 1699 }, { "epoch": 0.9916873268193087, "grad_norm": 1.4298202991485596, "learning_rate": 2.5976562500000002e-05, "loss": 1.2788, "step": 1700 }, { "epoch": 0.992270672305673, "grad_norm": 1.2524197101593018, "learning_rate": 2.5961538461538464e-05, "loss": 0.7913, "step": 1701 }, { "epoch": 0.9928540177920373, "grad_norm": 1.3437864780426025, "learning_rate": 2.5946514423076922e-05, "loss": 1.1432, "step": 1702 }, { "epoch": 0.9934373632784016, "grad_norm": 1.105908751487732, "learning_rate": 2.5931490384615388e-05, "loss": 0.8405, "step": 1703 }, { "epoch": 0.9940207087647659, "grad_norm": 0.9571016430854797, "learning_rate": 2.5916466346153846e-05, "loss": 0.7865, "step": 1704 }, { "epoch": 0.9946040542511302, "grad_norm": 1.0247722864151, "learning_rate": 2.5901442307692308e-05, "loss": 0.8726, "step": 1705 }, { "epoch": 0.9951873997374945, "grad_norm": 1.160334825515747, "learning_rate": 2.5886418269230773e-05, "loss": 0.8283, "step": 1706 }, { "epoch": 0.9957707452238588, "grad_norm": 1.4978736639022827, "learning_rate": 2.5871394230769232e-05, "loss": 1.101, "step": 1707 }, { "epoch": 0.9963540907102232, "grad_norm": 1.0514538288116455, "learning_rate": 2.5856370192307694e-05, "loss": 0.9573, "step": 1708 }, { "epoch": 0.9969374361965875, "grad_norm": 1.091744303703308, "learning_rate": 2.584134615384616e-05, "loss": 1.0072, "step": 1709 }, { "epoch": 0.9975207816829518, "grad_norm": 1.261958122253418, "learning_rate": 2.5826322115384614e-05, "loss": 0.8659, "step": 1710 }, { "epoch": 0.9981041271693161, "grad_norm": 1.4826487302780151, "learning_rate": 2.581129807692308e-05, "loss": 0.9641, "step": 1711 }, { "epoch": 0.9986874726556804, "grad_norm": 1.4628450870513916, "learning_rate": 2.5796274038461538e-05, "loss": 0.7998, "step": 1712 }, { "epoch": 0.9992708181420447, "grad_norm": 1.1515614986419678, "learning_rate": 2.578125e-05, "loss": 1.1375, "step": 1713 }, { "epoch": 0.999854163628409, "grad_norm": 1.118147373199463, "learning_rate": 2.5766225961538465e-05, "loss": 0.7665, "step": 1714 }, { "epoch": 1.0004375091147732, "grad_norm": 0.9037253260612488, "learning_rate": 2.5751201923076924e-05, "loss": 0.7602, "step": 1715 }, { "epoch": 1.0010208546011374, "grad_norm": 1.0712151527404785, "learning_rate": 2.5736177884615386e-05, "loss": 0.7995, "step": 1716 }, { "epoch": 1.0016042000875018, "grad_norm": 0.8427433371543884, "learning_rate": 2.5721153846153844e-05, "loss": 0.9496, "step": 1717 }, { "epoch": 1.002187545573866, "grad_norm": 0.9450975060462952, "learning_rate": 2.570612980769231e-05, "loss": 0.8904, "step": 1718 }, { "epoch": 1.0027708910602304, "grad_norm": 1.037550926208496, "learning_rate": 2.569110576923077e-05, "loss": 0.8008, "step": 1719 }, { "epoch": 1.0033542365465946, "grad_norm": 0.8638540506362915, "learning_rate": 2.567608173076923e-05, "loss": 0.639, "step": 1720 }, { "epoch": 1.003937582032959, "grad_norm": 1.1321598291397095, "learning_rate": 2.5661057692307695e-05, "loss": 1.0667, "step": 1721 }, { "epoch": 1.0045209275193234, "grad_norm": 1.1372548341751099, "learning_rate": 2.5646033653846157e-05, "loss": 0.8085, "step": 1722 }, { "epoch": 1.0051042730056876, "grad_norm": 1.0561586618423462, "learning_rate": 2.5631009615384615e-05, "loss": 0.8129, "step": 1723 }, { "epoch": 1.005687618492052, "grad_norm": 1.0580649375915527, "learning_rate": 2.561598557692308e-05, "loss": 0.6149, "step": 1724 }, { "epoch": 1.0062709639784162, "grad_norm": 1.236765742301941, "learning_rate": 2.560096153846154e-05, "loss": 0.7003, "step": 1725 }, { "epoch": 1.0068543094647806, "grad_norm": 1.0639208555221558, "learning_rate": 2.55859375e-05, "loss": 0.8107, "step": 1726 }, { "epoch": 1.0074376549511448, "grad_norm": 1.3609492778778076, "learning_rate": 2.5570913461538466e-05, "loss": 0.9411, "step": 1727 }, { "epoch": 1.0080210004375092, "grad_norm": 1.1644700765609741, "learning_rate": 2.555588942307692e-05, "loss": 0.701, "step": 1728 }, { "epoch": 1.0086043459238734, "grad_norm": 1.2108243703842163, "learning_rate": 2.5540865384615387e-05, "loss": 0.9003, "step": 1729 }, { "epoch": 1.0091876914102378, "grad_norm": 1.1662452220916748, "learning_rate": 2.5525841346153845e-05, "loss": 0.8709, "step": 1730 }, { "epoch": 1.009771036896602, "grad_norm": 1.0945322513580322, "learning_rate": 2.5510817307692307e-05, "loss": 1.0255, "step": 1731 }, { "epoch": 1.0103543823829664, "grad_norm": 1.0760470628738403, "learning_rate": 2.5495793269230772e-05, "loss": 0.7592, "step": 1732 }, { "epoch": 1.0109377278693306, "grad_norm": 1.3232825994491577, "learning_rate": 2.548076923076923e-05, "loss": 0.7684, "step": 1733 }, { "epoch": 1.011521073355695, "grad_norm": 1.0295486450195312, "learning_rate": 2.5465745192307693e-05, "loss": 0.741, "step": 1734 }, { "epoch": 1.0121044188420592, "grad_norm": 1.247344970703125, "learning_rate": 2.5450721153846158e-05, "loss": 0.7828, "step": 1735 }, { "epoch": 1.0126877643284236, "grad_norm": 1.0752184391021729, "learning_rate": 2.5435697115384617e-05, "loss": 0.9003, "step": 1736 }, { "epoch": 1.0132711098147877, "grad_norm": 1.0817466974258423, "learning_rate": 2.542067307692308e-05, "loss": 0.8295, "step": 1737 }, { "epoch": 1.0138544553011521, "grad_norm": 1.0235193967819214, "learning_rate": 2.5405649038461537e-05, "loss": 1.0862, "step": 1738 }, { "epoch": 1.0144378007875163, "grad_norm": 1.1558364629745483, "learning_rate": 2.5390625000000002e-05, "loss": 0.7409, "step": 1739 }, { "epoch": 1.0150211462738807, "grad_norm": 1.1982396841049194, "learning_rate": 2.5375600961538464e-05, "loss": 0.8249, "step": 1740 }, { "epoch": 1.015604491760245, "grad_norm": 1.074081540107727, "learning_rate": 2.5360576923076923e-05, "loss": 0.7878, "step": 1741 }, { "epoch": 1.0161878372466093, "grad_norm": 1.4815016984939575, "learning_rate": 2.5345552884615388e-05, "loss": 0.8043, "step": 1742 }, { "epoch": 1.0167711827329735, "grad_norm": 1.1086952686309814, "learning_rate": 2.5330528846153846e-05, "loss": 0.7201, "step": 1743 }, { "epoch": 1.017354528219338, "grad_norm": 1.3218934535980225, "learning_rate": 2.5315504807692308e-05, "loss": 0.6374, "step": 1744 }, { "epoch": 1.017937873705702, "grad_norm": 1.0537618398666382, "learning_rate": 2.5300480769230774e-05, "loss": 0.7224, "step": 1745 }, { "epoch": 1.0185212191920665, "grad_norm": 0.9964005947113037, "learning_rate": 2.528545673076923e-05, "loss": 0.7204, "step": 1746 }, { "epoch": 1.0191045646784307, "grad_norm": 1.1964815855026245, "learning_rate": 2.5270432692307694e-05, "loss": 0.9753, "step": 1747 }, { "epoch": 1.019687910164795, "grad_norm": 1.0601825714111328, "learning_rate": 2.525540865384616e-05, "loss": 0.9208, "step": 1748 }, { "epoch": 1.0202712556511595, "grad_norm": 0.9419945478439331, "learning_rate": 2.5240384615384614e-05, "loss": 0.8482, "step": 1749 }, { "epoch": 1.0208546011375237, "grad_norm": 1.0061907768249512, "learning_rate": 2.522536057692308e-05, "loss": 0.8908, "step": 1750 }, { "epoch": 1.021437946623888, "grad_norm": 1.1203185319900513, "learning_rate": 2.5210336538461538e-05, "loss": 0.725, "step": 1751 }, { "epoch": 1.0220212921102523, "grad_norm": 1.2346861362457275, "learning_rate": 2.51953125e-05, "loss": 0.8266, "step": 1752 }, { "epoch": 1.0226046375966167, "grad_norm": 1.17784583568573, "learning_rate": 2.5180288461538465e-05, "loss": 0.8013, "step": 1753 }, { "epoch": 1.0231879830829809, "grad_norm": 1.0398868322372437, "learning_rate": 2.5165264423076924e-05, "loss": 0.5803, "step": 1754 }, { "epoch": 1.0237713285693453, "grad_norm": 1.372266173362732, "learning_rate": 2.5150240384615386e-05, "loss": 0.6542, "step": 1755 }, { "epoch": 1.0243546740557095, "grad_norm": 1.2637945413589478, "learning_rate": 2.5135216346153844e-05, "loss": 1.0724, "step": 1756 }, { "epoch": 1.0249380195420739, "grad_norm": 1.0931401252746582, "learning_rate": 2.512019230769231e-05, "loss": 0.734, "step": 1757 }, { "epoch": 1.025521365028438, "grad_norm": 1.0705105066299438, "learning_rate": 2.510516826923077e-05, "loss": 0.8975, "step": 1758 }, { "epoch": 1.0261047105148025, "grad_norm": 1.0411278009414673, "learning_rate": 2.509014423076923e-05, "loss": 0.829, "step": 1759 }, { "epoch": 1.0266880560011666, "grad_norm": 1.1690353155136108, "learning_rate": 2.5075120192307695e-05, "loss": 0.8418, "step": 1760 }, { "epoch": 1.027271401487531, "grad_norm": 1.1422929763793945, "learning_rate": 2.5060096153846157e-05, "loss": 1.003, "step": 1761 }, { "epoch": 1.0278547469738952, "grad_norm": 1.2721543312072754, "learning_rate": 2.5045072115384616e-05, "loss": 0.8079, "step": 1762 }, { "epoch": 1.0284380924602596, "grad_norm": 1.3369412422180176, "learning_rate": 2.503004807692308e-05, "loss": 0.9004, "step": 1763 }, { "epoch": 1.0290214379466238, "grad_norm": 1.6825288534164429, "learning_rate": 2.5015024038461536e-05, "loss": 0.9225, "step": 1764 }, { "epoch": 1.0296047834329882, "grad_norm": 1.25969660282135, "learning_rate": 2.5e-05, "loss": 0.7035, "step": 1765 }, { "epoch": 1.0301881289193524, "grad_norm": 1.099972128868103, "learning_rate": 2.4984975961538463e-05, "loss": 0.7718, "step": 1766 }, { "epoch": 1.0307714744057168, "grad_norm": 1.1386865377426147, "learning_rate": 2.496995192307692e-05, "loss": 0.7239, "step": 1767 }, { "epoch": 1.031354819892081, "grad_norm": 1.2388840913772583, "learning_rate": 2.4954927884615387e-05, "loss": 0.7997, "step": 1768 }, { "epoch": 1.0319381653784454, "grad_norm": 1.159300446510315, "learning_rate": 2.493990384615385e-05, "loss": 1.0244, "step": 1769 }, { "epoch": 1.0325215108648096, "grad_norm": 1.186693549156189, "learning_rate": 2.4924879807692307e-05, "loss": 0.7362, "step": 1770 }, { "epoch": 1.033104856351174, "grad_norm": 1.0532599687576294, "learning_rate": 2.490985576923077e-05, "loss": 0.6688, "step": 1771 }, { "epoch": 1.0336882018375382, "grad_norm": 1.267077088356018, "learning_rate": 2.4894831730769234e-05, "loss": 0.7482, "step": 1772 }, { "epoch": 1.0342715473239026, "grad_norm": 1.1788893938064575, "learning_rate": 2.4879807692307693e-05, "loss": 0.6433, "step": 1773 }, { "epoch": 1.0348548928102668, "grad_norm": 1.3695346117019653, "learning_rate": 2.4864783653846155e-05, "loss": 1.0139, "step": 1774 }, { "epoch": 1.0354382382966312, "grad_norm": 1.364996075630188, "learning_rate": 2.4849759615384617e-05, "loss": 0.9402, "step": 1775 }, { "epoch": 1.0360215837829956, "grad_norm": 1.1488473415374756, "learning_rate": 2.483473557692308e-05, "loss": 0.6775, "step": 1776 }, { "epoch": 1.0366049292693598, "grad_norm": 1.1622604131698608, "learning_rate": 2.481971153846154e-05, "loss": 0.9029, "step": 1777 }, { "epoch": 1.0371882747557242, "grad_norm": 1.3263182640075684, "learning_rate": 2.4804687500000002e-05, "loss": 0.7835, "step": 1778 }, { "epoch": 1.0377716202420884, "grad_norm": 1.031337022781372, "learning_rate": 2.478966346153846e-05, "loss": 0.7646, "step": 1779 }, { "epoch": 1.0383549657284528, "grad_norm": 1.2709075212478638, "learning_rate": 2.4774639423076923e-05, "loss": 0.7746, "step": 1780 }, { "epoch": 1.038938311214817, "grad_norm": 1.0792698860168457, "learning_rate": 2.4759615384615388e-05, "loss": 0.4939, "step": 1781 }, { "epoch": 1.0395216567011814, "grad_norm": 1.0642223358154297, "learning_rate": 2.4744591346153847e-05, "loss": 0.7588, "step": 1782 }, { "epoch": 1.0401050021875455, "grad_norm": 1.2650682926177979, "learning_rate": 2.472956730769231e-05, "loss": 0.866, "step": 1783 }, { "epoch": 1.04068834767391, "grad_norm": 1.2211084365844727, "learning_rate": 2.471454326923077e-05, "loss": 0.8201, "step": 1784 }, { "epoch": 1.0412716931602741, "grad_norm": 1.2228623628616333, "learning_rate": 2.4699519230769232e-05, "loss": 0.625, "step": 1785 }, { "epoch": 1.0418550386466385, "grad_norm": 1.2318840026855469, "learning_rate": 2.4684495192307694e-05, "loss": 0.796, "step": 1786 }, { "epoch": 1.0424383841330027, "grad_norm": 1.1144096851348877, "learning_rate": 2.4669471153846156e-05, "loss": 0.9259, "step": 1787 }, { "epoch": 1.0430217296193671, "grad_norm": 1.1762030124664307, "learning_rate": 2.4654447115384615e-05, "loss": 0.8342, "step": 1788 }, { "epoch": 1.0436050751057313, "grad_norm": 1.142924427986145, "learning_rate": 2.463942307692308e-05, "loss": 0.8825, "step": 1789 }, { "epoch": 1.0441884205920957, "grad_norm": 1.3172521591186523, "learning_rate": 2.462439903846154e-05, "loss": 0.8078, "step": 1790 }, { "epoch": 1.04477176607846, "grad_norm": 1.0718015432357788, "learning_rate": 2.4609375e-05, "loss": 0.8854, "step": 1791 }, { "epoch": 1.0453551115648243, "grad_norm": 1.0188226699829102, "learning_rate": 2.4594350961538462e-05, "loss": 1.0314, "step": 1792 }, { "epoch": 1.0459384570511885, "grad_norm": 1.0908758640289307, "learning_rate": 2.4579326923076924e-05, "loss": 0.6848, "step": 1793 }, { "epoch": 1.046521802537553, "grad_norm": 1.072546362876892, "learning_rate": 2.4564302884615386e-05, "loss": 0.657, "step": 1794 }, { "epoch": 1.047105148023917, "grad_norm": 1.0566478967666626, "learning_rate": 2.4549278846153848e-05, "loss": 0.9628, "step": 1795 }, { "epoch": 1.0476884935102815, "grad_norm": 1.16886305809021, "learning_rate": 2.453425480769231e-05, "loss": 0.7434, "step": 1796 }, { "epoch": 1.0482718389966457, "grad_norm": 1.1381522417068481, "learning_rate": 2.4519230769230768e-05, "loss": 0.559, "step": 1797 }, { "epoch": 1.04885518448301, "grad_norm": 1.2783828973770142, "learning_rate": 2.4504206730769233e-05, "loss": 0.9, "step": 1798 }, { "epoch": 1.0494385299693743, "grad_norm": 1.0993632078170776, "learning_rate": 2.4489182692307695e-05, "loss": 0.7602, "step": 1799 }, { "epoch": 1.0500218754557387, "grad_norm": 0.9995693564414978, "learning_rate": 2.4474158653846154e-05, "loss": 0.8363, "step": 1800 }, { "epoch": 1.0500218754557387, "eval_loss_squad": 0.8367310870531947, "eval_perplexity": 8.296506422781007, "eval_perplexity_reconstruct": 1.9039826490870362, "step": 1800 }, { "epoch": 1.0506052209421028, "grad_norm": 1.3584226369857788, "learning_rate": 2.4459134615384616e-05, "loss": 0.7513, "step": 1801 }, { "epoch": 1.0511885664284673, "grad_norm": 1.1026893854141235, "learning_rate": 2.444411057692308e-05, "loss": 0.8734, "step": 1802 }, { "epoch": 1.0517719119148317, "grad_norm": 0.9750449061393738, "learning_rate": 2.442908653846154e-05, "loss": 0.7949, "step": 1803 }, { "epoch": 1.0523552574011958, "grad_norm": 1.3138856887817383, "learning_rate": 2.44140625e-05, "loss": 0.7326, "step": 1804 }, { "epoch": 1.0529386028875602, "grad_norm": 0.9921886920928955, "learning_rate": 2.4399038461538463e-05, "loss": 0.8876, "step": 1805 }, { "epoch": 1.0535219483739244, "grad_norm": 1.2363988161087036, "learning_rate": 2.4384014423076922e-05, "loss": 0.8668, "step": 1806 }, { "epoch": 1.0541052938602888, "grad_norm": 1.2714601755142212, "learning_rate": 2.4368990384615387e-05, "loss": 0.8749, "step": 1807 }, { "epoch": 1.054688639346653, "grad_norm": 1.1158849000930786, "learning_rate": 2.435396634615385e-05, "loss": 1.065, "step": 1808 }, { "epoch": 1.0552719848330174, "grad_norm": 1.125258207321167, "learning_rate": 2.4338942307692307e-05, "loss": 0.8081, "step": 1809 }, { "epoch": 1.0558553303193816, "grad_norm": 1.039055585861206, "learning_rate": 2.432391826923077e-05, "loss": 0.8932, "step": 1810 }, { "epoch": 1.056438675805746, "grad_norm": 1.0936514139175415, "learning_rate": 2.4308894230769235e-05, "loss": 0.9564, "step": 1811 }, { "epoch": 1.0570220212921102, "grad_norm": 1.334757924079895, "learning_rate": 2.4293870192307693e-05, "loss": 0.8967, "step": 1812 }, { "epoch": 1.0576053667784746, "grad_norm": 1.4477955102920532, "learning_rate": 2.4278846153846155e-05, "loss": 0.8336, "step": 1813 }, { "epoch": 1.0581887122648388, "grad_norm": 1.319165825843811, "learning_rate": 2.4263822115384617e-05, "loss": 0.7005, "step": 1814 }, { "epoch": 1.0587720577512032, "grad_norm": 1.1051411628723145, "learning_rate": 2.424879807692308e-05, "loss": 0.7516, "step": 1815 }, { "epoch": 1.0593554032375674, "grad_norm": 1.2956829071044922, "learning_rate": 2.423377403846154e-05, "loss": 0.7716, "step": 1816 }, { "epoch": 1.0599387487239318, "grad_norm": 1.199265956878662, "learning_rate": 2.4218750000000003e-05, "loss": 0.9904, "step": 1817 }, { "epoch": 1.060522094210296, "grad_norm": 1.0964492559432983, "learning_rate": 2.420372596153846e-05, "loss": 0.7208, "step": 1818 }, { "epoch": 1.0611054396966604, "grad_norm": 1.264791488647461, "learning_rate": 2.4188701923076923e-05, "loss": 0.7618, "step": 1819 }, { "epoch": 1.0616887851830246, "grad_norm": 1.1390454769134521, "learning_rate": 2.4173677884615388e-05, "loss": 0.7634, "step": 1820 }, { "epoch": 1.062272130669389, "grad_norm": 1.1044617891311646, "learning_rate": 2.4158653846153847e-05, "loss": 0.6295, "step": 1821 }, { "epoch": 1.0628554761557532, "grad_norm": 1.4096314907073975, "learning_rate": 2.414362980769231e-05, "loss": 0.8327, "step": 1822 }, { "epoch": 1.0634388216421176, "grad_norm": 1.2174628973007202, "learning_rate": 2.412860576923077e-05, "loss": 0.985, "step": 1823 }, { "epoch": 1.0640221671284817, "grad_norm": 0.912973165512085, "learning_rate": 2.4113581730769232e-05, "loss": 1.0427, "step": 1824 }, { "epoch": 1.0646055126148462, "grad_norm": 1.1908468008041382, "learning_rate": 2.4098557692307694e-05, "loss": 0.7616, "step": 1825 }, { "epoch": 1.0651888581012106, "grad_norm": 1.260453462600708, "learning_rate": 2.4083533653846156e-05, "loss": 0.8113, "step": 1826 }, { "epoch": 1.0657722035875747, "grad_norm": 1.2202224731445312, "learning_rate": 2.4068509615384615e-05, "loss": 0.7171, "step": 1827 }, { "epoch": 1.066355549073939, "grad_norm": 0.8979166746139526, "learning_rate": 2.405348557692308e-05, "loss": 0.7942, "step": 1828 }, { "epoch": 1.0669388945603033, "grad_norm": 1.1224358081817627, "learning_rate": 2.4038461538461542e-05, "loss": 0.9005, "step": 1829 }, { "epoch": 1.0675222400466677, "grad_norm": 1.1409502029418945, "learning_rate": 2.40234375e-05, "loss": 0.7607, "step": 1830 }, { "epoch": 1.068105585533032, "grad_norm": 1.1402530670166016, "learning_rate": 2.4008413461538462e-05, "loss": 1.0247, "step": 1831 }, { "epoch": 1.0686889310193963, "grad_norm": 1.0991151332855225, "learning_rate": 2.3993389423076924e-05, "loss": 0.7754, "step": 1832 }, { "epoch": 1.0692722765057605, "grad_norm": 1.229304552078247, "learning_rate": 2.3978365384615386e-05, "loss": 0.8271, "step": 1833 }, { "epoch": 1.069855621992125, "grad_norm": 1.2119697332382202, "learning_rate": 2.3963341346153848e-05, "loss": 0.8626, "step": 1834 }, { "epoch": 1.070438967478489, "grad_norm": 1.4418648481369019, "learning_rate": 2.394831730769231e-05, "loss": 0.8454, "step": 1835 }, { "epoch": 1.0710223129648535, "grad_norm": 1.1836721897125244, "learning_rate": 2.3933293269230768e-05, "loss": 0.8298, "step": 1836 }, { "epoch": 1.0716056584512177, "grad_norm": 1.2164535522460938, "learning_rate": 2.3918269230769234e-05, "loss": 0.8123, "step": 1837 }, { "epoch": 1.072189003937582, "grad_norm": 1.3282859325408936, "learning_rate": 2.3903245192307695e-05, "loss": 1.004, "step": 1838 }, { "epoch": 1.0727723494239463, "grad_norm": 1.1000369787216187, "learning_rate": 2.3888221153846154e-05, "loss": 0.7878, "step": 1839 }, { "epoch": 1.0733556949103107, "grad_norm": 1.3156583309173584, "learning_rate": 2.3873197115384616e-05, "loss": 0.6691, "step": 1840 }, { "epoch": 1.0739390403966749, "grad_norm": 1.264693021774292, "learning_rate": 2.3858173076923078e-05, "loss": 0.8582, "step": 1841 }, { "epoch": 1.0745223858830393, "grad_norm": 1.1336017847061157, "learning_rate": 2.384314903846154e-05, "loss": 0.704, "step": 1842 }, { "epoch": 1.0751057313694035, "grad_norm": 0.8796018362045288, "learning_rate": 2.3828125e-05, "loss": 0.7031, "step": 1843 }, { "epoch": 1.0756890768557679, "grad_norm": 1.2418160438537598, "learning_rate": 2.3813100961538463e-05, "loss": 0.7327, "step": 1844 }, { "epoch": 1.076272422342132, "grad_norm": 1.1050810813903809, "learning_rate": 2.3798076923076922e-05, "loss": 0.8006, "step": 1845 }, { "epoch": 1.0768557678284965, "grad_norm": 1.2827439308166504, "learning_rate": 2.3783052884615387e-05, "loss": 0.777, "step": 1846 }, { "epoch": 1.0774391133148606, "grad_norm": 1.3954315185546875, "learning_rate": 2.376802884615385e-05, "loss": 0.7289, "step": 1847 }, { "epoch": 1.078022458801225, "grad_norm": 1.2399247884750366, "learning_rate": 2.3753004807692308e-05, "loss": 0.7638, "step": 1848 }, { "epoch": 1.0786058042875892, "grad_norm": 1.351985216140747, "learning_rate": 2.373798076923077e-05, "loss": 0.7568, "step": 1849 }, { "epoch": 1.0791891497739536, "grad_norm": 1.1881484985351562, "learning_rate": 2.372295673076923e-05, "loss": 0.8417, "step": 1850 }, { "epoch": 1.0797724952603178, "grad_norm": 1.6460144519805908, "learning_rate": 2.3707932692307693e-05, "loss": 0.8467, "step": 1851 }, { "epoch": 1.0803558407466822, "grad_norm": 1.1815186738967896, "learning_rate": 2.3692908653846155e-05, "loss": 0.7675, "step": 1852 }, { "epoch": 1.0809391862330466, "grad_norm": 1.2811179161071777, "learning_rate": 2.3677884615384617e-05, "loss": 0.7745, "step": 1853 }, { "epoch": 1.0815225317194108, "grad_norm": 1.1434037685394287, "learning_rate": 2.366286057692308e-05, "loss": 0.8752, "step": 1854 }, { "epoch": 1.0821058772057752, "grad_norm": 1.2398650646209717, "learning_rate": 2.364783653846154e-05, "loss": 0.8398, "step": 1855 }, { "epoch": 1.0826892226921394, "grad_norm": 1.2694295644760132, "learning_rate": 2.3632812500000003e-05, "loss": 0.8714, "step": 1856 }, { "epoch": 1.0832725681785038, "grad_norm": 1.1594481468200684, "learning_rate": 2.361778846153846e-05, "loss": 0.6383, "step": 1857 }, { "epoch": 1.083855913664868, "grad_norm": 1.2221319675445557, "learning_rate": 2.3602764423076923e-05, "loss": 0.9375, "step": 1858 }, { "epoch": 1.0844392591512324, "grad_norm": 1.183512568473816, "learning_rate": 2.3587740384615385e-05, "loss": 0.8724, "step": 1859 }, { "epoch": 1.0850226046375966, "grad_norm": 1.3200178146362305, "learning_rate": 2.3572716346153847e-05, "loss": 0.8969, "step": 1860 }, { "epoch": 1.085605950123961, "grad_norm": 1.3450496196746826, "learning_rate": 2.355769230769231e-05, "loss": 0.822, "step": 1861 }, { "epoch": 1.0861892956103252, "grad_norm": 1.2601107358932495, "learning_rate": 2.354266826923077e-05, "loss": 0.7901, "step": 1862 }, { "epoch": 1.0867726410966896, "grad_norm": 1.089453101158142, "learning_rate": 2.3527644230769233e-05, "loss": 0.8132, "step": 1863 }, { "epoch": 1.0873559865830538, "grad_norm": 0.8895475268363953, "learning_rate": 2.3512620192307694e-05, "loss": 0.6964, "step": 1864 }, { "epoch": 1.0879393320694182, "grad_norm": 1.3046108484268188, "learning_rate": 2.3497596153846156e-05, "loss": 0.9721, "step": 1865 }, { "epoch": 1.0885226775557824, "grad_norm": 1.102406620979309, "learning_rate": 2.3482572115384615e-05, "loss": 0.8803, "step": 1866 }, { "epoch": 1.0891060230421468, "grad_norm": 1.1453616619110107, "learning_rate": 2.346754807692308e-05, "loss": 0.8508, "step": 1867 }, { "epoch": 1.089689368528511, "grad_norm": 1.1119948625564575, "learning_rate": 2.345252403846154e-05, "loss": 0.6988, "step": 1868 }, { "epoch": 1.0902727140148754, "grad_norm": 1.3853925466537476, "learning_rate": 2.34375e-05, "loss": 0.8331, "step": 1869 }, { "epoch": 1.0908560595012395, "grad_norm": 1.315190315246582, "learning_rate": 2.3422475961538462e-05, "loss": 0.7391, "step": 1870 }, { "epoch": 1.091439404987604, "grad_norm": 1.070307970046997, "learning_rate": 2.3407451923076924e-05, "loss": 0.8388, "step": 1871 }, { "epoch": 1.0920227504739681, "grad_norm": 0.9866098165512085, "learning_rate": 2.3392427884615386e-05, "loss": 1.0789, "step": 1872 }, { "epoch": 1.0926060959603325, "grad_norm": 1.2839645147323608, "learning_rate": 2.3377403846153848e-05, "loss": 0.7442, "step": 1873 }, { "epoch": 1.0931894414466967, "grad_norm": 1.2288687229156494, "learning_rate": 2.336237980769231e-05, "loss": 0.7923, "step": 1874 }, { "epoch": 1.0937727869330611, "grad_norm": 1.0700854063034058, "learning_rate": 2.334735576923077e-05, "loss": 0.9763, "step": 1875 }, { "epoch": 1.0943561324194253, "grad_norm": 0.9807063937187195, "learning_rate": 2.3332331730769234e-05, "loss": 0.8784, "step": 1876 }, { "epoch": 1.0949394779057897, "grad_norm": 1.1770750284194946, "learning_rate": 2.3317307692307692e-05, "loss": 0.7672, "step": 1877 }, { "epoch": 1.095522823392154, "grad_norm": 1.1473841667175293, "learning_rate": 2.3302283653846154e-05, "loss": 0.7583, "step": 1878 }, { "epoch": 1.0961061688785183, "grad_norm": 1.2041727304458618, "learning_rate": 2.3287259615384616e-05, "loss": 0.8327, "step": 1879 }, { "epoch": 1.0966895143648827, "grad_norm": 1.1696784496307373, "learning_rate": 2.3272235576923078e-05, "loss": 0.853, "step": 1880 }, { "epoch": 1.097272859851247, "grad_norm": 1.1404637098312378, "learning_rate": 2.325721153846154e-05, "loss": 0.8551, "step": 1881 }, { "epoch": 1.0978562053376113, "grad_norm": 1.082005262374878, "learning_rate": 2.32421875e-05, "loss": 0.7462, "step": 1882 }, { "epoch": 1.0984395508239755, "grad_norm": 1.3037984371185303, "learning_rate": 2.3227163461538464e-05, "loss": 1.0494, "step": 1883 }, { "epoch": 1.09902289631034, "grad_norm": 1.3747504949569702, "learning_rate": 2.3212139423076922e-05, "loss": 0.9479, "step": 1884 }, { "epoch": 1.099606241796704, "grad_norm": 1.2729436159133911, "learning_rate": 2.3197115384615387e-05, "loss": 0.8058, "step": 1885 }, { "epoch": 1.1001895872830685, "grad_norm": 0.951274037361145, "learning_rate": 2.3182091346153846e-05, "loss": 0.7261, "step": 1886 }, { "epoch": 1.1007729327694327, "grad_norm": 1.114943027496338, "learning_rate": 2.3167067307692308e-05, "loss": 0.8833, "step": 1887 }, { "epoch": 1.101356278255797, "grad_norm": 1.2294602394104004, "learning_rate": 2.315204326923077e-05, "loss": 0.8286, "step": 1888 }, { "epoch": 1.1019396237421613, "grad_norm": 1.0512102842330933, "learning_rate": 2.313701923076923e-05, "loss": 0.8782, "step": 1889 }, { "epoch": 1.1025229692285257, "grad_norm": 1.2631170749664307, "learning_rate": 2.3121995192307693e-05, "loss": 0.7911, "step": 1890 }, { "epoch": 1.1031063147148898, "grad_norm": 1.3144844770431519, "learning_rate": 2.3106971153846155e-05, "loss": 0.7812, "step": 1891 }, { "epoch": 1.1036896602012543, "grad_norm": 1.0786577463150024, "learning_rate": 2.3091947115384617e-05, "loss": 0.7804, "step": 1892 }, { "epoch": 1.1042730056876184, "grad_norm": 1.3957914113998413, "learning_rate": 2.307692307692308e-05, "loss": 0.9104, "step": 1893 }, { "epoch": 1.1048563511739828, "grad_norm": 1.103511929512024, "learning_rate": 2.306189903846154e-05, "loss": 0.6835, "step": 1894 }, { "epoch": 1.105439696660347, "grad_norm": 1.3668882846832275, "learning_rate": 2.3046875e-05, "loss": 0.8694, "step": 1895 }, { "epoch": 1.1060230421467114, "grad_norm": 1.075162649154663, "learning_rate": 2.303185096153846e-05, "loss": 0.8763, "step": 1896 }, { "epoch": 1.1066063876330756, "grad_norm": 1.2901614904403687, "learning_rate": 2.3016826923076923e-05, "loss": 0.8694, "step": 1897 }, { "epoch": 1.10718973311944, "grad_norm": 1.0675395727157593, "learning_rate": 2.3001802884615385e-05, "loss": 0.7391, "step": 1898 }, { "epoch": 1.1077730786058042, "grad_norm": 1.200480341911316, "learning_rate": 2.2986778846153847e-05, "loss": 0.868, "step": 1899 }, { "epoch": 1.1083564240921686, "grad_norm": 1.254665732383728, "learning_rate": 2.297175480769231e-05, "loss": 0.7587, "step": 1900 }, { "epoch": 1.1089397695785328, "grad_norm": 1.047951102256775, "learning_rate": 2.295673076923077e-05, "loss": 0.8484, "step": 1901 }, { "epoch": 1.1095231150648972, "grad_norm": 1.0804102420806885, "learning_rate": 2.2941706730769233e-05, "loss": 0.7814, "step": 1902 }, { "epoch": 1.1101064605512614, "grad_norm": 1.1867598295211792, "learning_rate": 2.2926682692307695e-05, "loss": 0.75, "step": 1903 }, { "epoch": 1.1106898060376258, "grad_norm": 1.148829460144043, "learning_rate": 2.2911658653846153e-05, "loss": 0.8877, "step": 1904 }, { "epoch": 1.11127315152399, "grad_norm": 1.0924962759017944, "learning_rate": 2.2896634615384615e-05, "loss": 0.7225, "step": 1905 }, { "epoch": 1.1118564970103544, "grad_norm": 1.1306710243225098, "learning_rate": 2.288161057692308e-05, "loss": 0.8202, "step": 1906 }, { "epoch": 1.1124398424967188, "grad_norm": 1.1604644060134888, "learning_rate": 2.286658653846154e-05, "loss": 0.8849, "step": 1907 }, { "epoch": 1.113023187983083, "grad_norm": 1.2594935894012451, "learning_rate": 2.28515625e-05, "loss": 0.8069, "step": 1908 }, { "epoch": 1.1136065334694474, "grad_norm": 1.1446539163589478, "learning_rate": 2.2836538461538463e-05, "loss": 0.6342, "step": 1909 }, { "epoch": 1.1141898789558116, "grad_norm": 1.317840337753296, "learning_rate": 2.2821514423076924e-05, "loss": 0.8211, "step": 1910 }, { "epoch": 1.114773224442176, "grad_norm": 1.0653162002563477, "learning_rate": 2.2806490384615386e-05, "loss": 0.8454, "step": 1911 }, { "epoch": 1.1153565699285402, "grad_norm": 0.9791072607040405, "learning_rate": 2.2791466346153848e-05, "loss": 0.7433, "step": 1912 }, { "epoch": 1.1159399154149046, "grad_norm": 1.1724233627319336, "learning_rate": 2.2776442307692307e-05, "loss": 0.7882, "step": 1913 }, { "epoch": 1.1165232609012687, "grad_norm": 1.2540687322616577, "learning_rate": 2.276141826923077e-05, "loss": 0.7061, "step": 1914 }, { "epoch": 1.1171066063876331, "grad_norm": 1.0810496807098389, "learning_rate": 2.2746394230769234e-05, "loss": 0.973, "step": 1915 }, { "epoch": 1.1176899518739973, "grad_norm": 1.2553794384002686, "learning_rate": 2.2731370192307692e-05, "loss": 0.6909, "step": 1916 }, { "epoch": 1.1182732973603617, "grad_norm": 1.2660142183303833, "learning_rate": 2.2716346153846154e-05, "loss": 0.8742, "step": 1917 }, { "epoch": 1.118856642846726, "grad_norm": 1.4024698734283447, "learning_rate": 2.2701322115384616e-05, "loss": 0.7558, "step": 1918 }, { "epoch": 1.1194399883330903, "grad_norm": 1.2419300079345703, "learning_rate": 2.2686298076923078e-05, "loss": 0.8231, "step": 1919 }, { "epoch": 1.1200233338194545, "grad_norm": 1.2142603397369385, "learning_rate": 2.267127403846154e-05, "loss": 0.8333, "step": 1920 }, { "epoch": 1.120606679305819, "grad_norm": 1.1328840255737305, "learning_rate": 2.2656250000000002e-05, "loss": 0.7284, "step": 1921 }, { "epoch": 1.121190024792183, "grad_norm": 1.256993055343628, "learning_rate": 2.264122596153846e-05, "loss": 0.8874, "step": 1922 }, { "epoch": 1.1217733702785475, "grad_norm": 0.9864529967308044, "learning_rate": 2.2626201923076922e-05, "loss": 0.7252, "step": 1923 }, { "epoch": 1.1223567157649117, "grad_norm": 1.1391963958740234, "learning_rate": 2.2611177884615387e-05, "loss": 0.7308, "step": 1924 }, { "epoch": 1.122940061251276, "grad_norm": 1.3632563352584839, "learning_rate": 2.2596153846153846e-05, "loss": 0.7668, "step": 1925 }, { "epoch": 1.1235234067376403, "grad_norm": 0.9523732662200928, "learning_rate": 2.2581129807692308e-05, "loss": 0.6653, "step": 1926 }, { "epoch": 1.1241067522240047, "grad_norm": 1.2235891819000244, "learning_rate": 2.256610576923077e-05, "loss": 0.7031, "step": 1927 }, { "epoch": 1.1246900977103689, "grad_norm": 1.1225990056991577, "learning_rate": 2.255108173076923e-05, "loss": 0.7132, "step": 1928 }, { "epoch": 1.1252734431967333, "grad_norm": 1.2185968160629272, "learning_rate": 2.2536057692307694e-05, "loss": 0.7747, "step": 1929 }, { "epoch": 1.1258567886830977, "grad_norm": 1.2075011730194092, "learning_rate": 2.2521033653846155e-05, "loss": 0.7532, "step": 1930 }, { "epoch": 1.1264401341694619, "grad_norm": 1.0588568449020386, "learning_rate": 2.2506009615384614e-05, "loss": 0.6433, "step": 1931 }, { "epoch": 1.127023479655826, "grad_norm": 1.154813528060913, "learning_rate": 2.249098557692308e-05, "loss": 0.8853, "step": 1932 }, { "epoch": 1.1276068251421905, "grad_norm": 1.280246376991272, "learning_rate": 2.247596153846154e-05, "loss": 0.8618, "step": 1933 }, { "epoch": 1.1281901706285549, "grad_norm": 1.2844151258468628, "learning_rate": 2.24609375e-05, "loss": 0.7397, "step": 1934 }, { "epoch": 1.128773516114919, "grad_norm": 1.1625947952270508, "learning_rate": 2.244591346153846e-05, "loss": 0.8301, "step": 1935 }, { "epoch": 1.1293568616012835, "grad_norm": 1.3182225227355957, "learning_rate": 2.2430889423076923e-05, "loss": 0.7165, "step": 1936 }, { "epoch": 1.1299402070876476, "grad_norm": 1.4218560457229614, "learning_rate": 2.2415865384615385e-05, "loss": 0.7256, "step": 1937 }, { "epoch": 1.130523552574012, "grad_norm": 1.061953067779541, "learning_rate": 2.2400841346153847e-05, "loss": 0.8491, "step": 1938 }, { "epoch": 1.1311068980603762, "grad_norm": 1.1568974256515503, "learning_rate": 2.238581730769231e-05, "loss": 0.7943, "step": 1939 }, { "epoch": 1.1316902435467406, "grad_norm": 1.0611028671264648, "learning_rate": 2.2370793269230768e-05, "loss": 0.8708, "step": 1940 }, { "epoch": 1.1322735890331048, "grad_norm": 1.1520451307296753, "learning_rate": 2.2355769230769233e-05, "loss": 0.7339, "step": 1941 }, { "epoch": 1.1328569345194692, "grad_norm": 1.2494333982467651, "learning_rate": 2.2340745192307695e-05, "loss": 0.7952, "step": 1942 }, { "epoch": 1.1334402800058334, "grad_norm": 1.1733430624008179, "learning_rate": 2.2325721153846153e-05, "loss": 0.7348, "step": 1943 }, { "epoch": 1.1340236254921978, "grad_norm": 1.1535006761550903, "learning_rate": 2.2310697115384615e-05, "loss": 0.8333, "step": 1944 }, { "epoch": 1.134606970978562, "grad_norm": 1.1618400812149048, "learning_rate": 2.229567307692308e-05, "loss": 0.8816, "step": 1945 }, { "epoch": 1.1351903164649264, "grad_norm": 1.43650221824646, "learning_rate": 2.228064903846154e-05, "loss": 0.9497, "step": 1946 }, { "epoch": 1.1357736619512906, "grad_norm": 1.2452301979064941, "learning_rate": 2.2265625e-05, "loss": 0.6446, "step": 1947 }, { "epoch": 1.136357007437655, "grad_norm": 1.2092143297195435, "learning_rate": 2.2250600961538463e-05, "loss": 0.8083, "step": 1948 }, { "epoch": 1.1369403529240192, "grad_norm": 1.0551011562347412, "learning_rate": 2.223557692307692e-05, "loss": 0.9274, "step": 1949 }, { "epoch": 1.1375236984103836, "grad_norm": 1.204288125038147, "learning_rate": 2.2220552884615386e-05, "loss": 1.043, "step": 1950 }, { "epoch": 1.1381070438967478, "grad_norm": 1.4167194366455078, "learning_rate": 2.220552884615385e-05, "loss": 0.8985, "step": 1951 }, { "epoch": 1.1386903893831122, "grad_norm": 1.2125877141952515, "learning_rate": 2.2190504807692307e-05, "loss": 0.7084, "step": 1952 }, { "epoch": 1.1392737348694764, "grad_norm": 1.3489576578140259, "learning_rate": 2.217548076923077e-05, "loss": 0.7183, "step": 1953 }, { "epoch": 1.1398570803558408, "grad_norm": 1.4352302551269531, "learning_rate": 2.2160456730769234e-05, "loss": 0.9226, "step": 1954 }, { "epoch": 1.140440425842205, "grad_norm": 1.2473138570785522, "learning_rate": 2.2145432692307693e-05, "loss": 0.8703, "step": 1955 }, { "epoch": 1.1410237713285694, "grad_norm": 1.1649906635284424, "learning_rate": 2.2130408653846154e-05, "loss": 0.7713, "step": 1956 }, { "epoch": 1.1416071168149338, "grad_norm": 1.1831884384155273, "learning_rate": 2.2115384615384616e-05, "loss": 0.9852, "step": 1957 }, { "epoch": 1.142190462301298, "grad_norm": 1.1909598112106323, "learning_rate": 2.2100360576923078e-05, "loss": 0.7232, "step": 1958 }, { "epoch": 1.1427738077876621, "grad_norm": 1.0720024108886719, "learning_rate": 2.208533653846154e-05, "loss": 0.8207, "step": 1959 }, { "epoch": 1.1433571532740265, "grad_norm": 1.0868524312973022, "learning_rate": 2.2070312500000002e-05, "loss": 0.7564, "step": 1960 }, { "epoch": 1.143940498760391, "grad_norm": 1.2819868326187134, "learning_rate": 2.205528846153846e-05, "loss": 0.8171, "step": 1961 }, { "epoch": 1.1445238442467551, "grad_norm": 1.2320795059204102, "learning_rate": 2.2040264423076922e-05, "loss": 0.6759, "step": 1962 }, { "epoch": 1.1451071897331195, "grad_norm": 1.2730618715286255, "learning_rate": 2.2025240384615388e-05, "loss": 0.8357, "step": 1963 }, { "epoch": 1.1456905352194837, "grad_norm": 1.3447294235229492, "learning_rate": 2.2010216346153846e-05, "loss": 0.7348, "step": 1964 }, { "epoch": 1.1462738807058481, "grad_norm": 1.215040683746338, "learning_rate": 2.1995192307692308e-05, "loss": 0.9677, "step": 1965 }, { "epoch": 1.1468572261922123, "grad_norm": 0.992956280708313, "learning_rate": 2.198016826923077e-05, "loss": 0.7393, "step": 1966 }, { "epoch": 1.1474405716785767, "grad_norm": 1.204768419265747, "learning_rate": 2.1965144230769232e-05, "loss": 0.7813, "step": 1967 }, { "epoch": 1.148023917164941, "grad_norm": 1.2408292293548584, "learning_rate": 2.1950120192307694e-05, "loss": 0.9303, "step": 1968 }, { "epoch": 1.1486072626513053, "grad_norm": 1.1849360466003418, "learning_rate": 2.1935096153846156e-05, "loss": 0.836, "step": 1969 }, { "epoch": 1.1491906081376695, "grad_norm": 1.2159719467163086, "learning_rate": 2.1920072115384614e-05, "loss": 0.6705, "step": 1970 }, { "epoch": 1.149773953624034, "grad_norm": 1.2858052253723145, "learning_rate": 2.190504807692308e-05, "loss": 0.8785, "step": 1971 }, { "epoch": 1.150357299110398, "grad_norm": 1.2040108442306519, "learning_rate": 2.189002403846154e-05, "loss": 0.7998, "step": 1972 }, { "epoch": 1.1509406445967625, "grad_norm": 1.2969449758529663, "learning_rate": 2.1875e-05, "loss": 0.7393, "step": 1973 }, { "epoch": 1.1515239900831267, "grad_norm": 1.1521106958389282, "learning_rate": 2.185997596153846e-05, "loss": 0.9285, "step": 1974 }, { "epoch": 1.152107335569491, "grad_norm": 1.139011025428772, "learning_rate": 2.1844951923076924e-05, "loss": 0.8499, "step": 1975 }, { "epoch": 1.1526906810558553, "grad_norm": 1.3250030279159546, "learning_rate": 2.1829927884615385e-05, "loss": 0.7346, "step": 1976 }, { "epoch": 1.1532740265422197, "grad_norm": 1.0421650409698486, "learning_rate": 2.1814903846153847e-05, "loss": 0.739, "step": 1977 }, { "epoch": 1.1538573720285838, "grad_norm": 1.3398250341415405, "learning_rate": 2.179987980769231e-05, "loss": 0.644, "step": 1978 }, { "epoch": 1.1544407175149483, "grad_norm": 1.0961638689041138, "learning_rate": 2.1784855769230768e-05, "loss": 0.8046, "step": 1979 }, { "epoch": 1.1550240630013124, "grad_norm": 1.2048981189727783, "learning_rate": 2.1769831730769233e-05, "loss": 0.6226, "step": 1980 }, { "epoch": 1.1556074084876768, "grad_norm": 1.395005702972412, "learning_rate": 2.1754807692307695e-05, "loss": 0.749, "step": 1981 }, { "epoch": 1.156190753974041, "grad_norm": 1.2888487577438354, "learning_rate": 2.1739783653846153e-05, "loss": 0.8828, "step": 1982 }, { "epoch": 1.1567740994604054, "grad_norm": 1.237052083015442, "learning_rate": 2.1724759615384615e-05, "loss": 0.742, "step": 1983 }, { "epoch": 1.1573574449467698, "grad_norm": 1.3370431661605835, "learning_rate": 2.170973557692308e-05, "loss": 0.6737, "step": 1984 }, { "epoch": 1.157940790433134, "grad_norm": 1.3525289297103882, "learning_rate": 2.169471153846154e-05, "loss": 0.6262, "step": 1985 }, { "epoch": 1.1585241359194982, "grad_norm": 1.2294694185256958, "learning_rate": 2.16796875e-05, "loss": 0.6299, "step": 1986 }, { "epoch": 1.1591074814058626, "grad_norm": 1.3560107946395874, "learning_rate": 2.1664663461538463e-05, "loss": 0.8025, "step": 1987 }, { "epoch": 1.159690826892227, "grad_norm": 1.2413029670715332, "learning_rate": 2.164963942307692e-05, "loss": 0.7526, "step": 1988 }, { "epoch": 1.1602741723785912, "grad_norm": 1.2291741371154785, "learning_rate": 2.1634615384615387e-05, "loss": 0.9964, "step": 1989 }, { "epoch": 1.1608575178649556, "grad_norm": 1.1054651737213135, "learning_rate": 2.161959134615385e-05, "loss": 0.8606, "step": 1990 }, { "epoch": 1.1614408633513198, "grad_norm": 1.2737064361572266, "learning_rate": 2.1604567307692307e-05, "loss": 0.7849, "step": 1991 }, { "epoch": 1.1620242088376842, "grad_norm": 1.2561743259429932, "learning_rate": 2.158954326923077e-05, "loss": 0.914, "step": 1992 }, { "epoch": 1.1626075543240484, "grad_norm": 1.4079277515411377, "learning_rate": 2.1574519230769234e-05, "loss": 0.8546, "step": 1993 }, { "epoch": 1.1631908998104128, "grad_norm": 1.1819322109222412, "learning_rate": 2.1559495192307693e-05, "loss": 0.7467, "step": 1994 }, { "epoch": 1.163774245296777, "grad_norm": 1.2287219762802124, "learning_rate": 2.1544471153846155e-05, "loss": 0.7146, "step": 1995 }, { "epoch": 1.1643575907831414, "grad_norm": 1.2632535696029663, "learning_rate": 2.1529447115384616e-05, "loss": 0.7355, "step": 1996 }, { "epoch": 1.1649409362695056, "grad_norm": 1.3165422677993774, "learning_rate": 2.151442307692308e-05, "loss": 0.7853, "step": 1997 }, { "epoch": 1.16552428175587, "grad_norm": 1.056222915649414, "learning_rate": 2.149939903846154e-05, "loss": 0.7185, "step": 1998 }, { "epoch": 1.1661076272422342, "grad_norm": 1.2883901596069336, "learning_rate": 2.1484375000000002e-05, "loss": 0.7979, "step": 1999 }, { "epoch": 1.1666909727285986, "grad_norm": 1.367472767829895, "learning_rate": 2.146935096153846e-05, "loss": 0.8075, "step": 2000 }, { "epoch": 1.1666909727285986, "eval_loss_squad": 0.833221665751189, "eval_perplexity": 8.233223012549466, "eval_perplexity_reconstruct": 1.9100281333491649, "step": 2000 }, { "epoch": 1.1672743182149627, "grad_norm": 1.1449155807495117, "learning_rate": 2.1454326923076923e-05, "loss": 0.865, "step": 2001 }, { "epoch": 1.1678576637013272, "grad_norm": 1.1930732727050781, "learning_rate": 2.1439302884615388e-05, "loss": 0.5906, "step": 2002 }, { "epoch": 1.1684410091876913, "grad_norm": 1.4782718420028687, "learning_rate": 2.1424278846153846e-05, "loss": 0.9741, "step": 2003 }, { "epoch": 1.1690243546740557, "grad_norm": 1.2961211204528809, "learning_rate": 2.1409254807692308e-05, "loss": 0.713, "step": 2004 }, { "epoch": 1.16960770016042, "grad_norm": 1.250820279121399, "learning_rate": 2.139423076923077e-05, "loss": 0.8144, "step": 2005 }, { "epoch": 1.1701910456467843, "grad_norm": 1.1511048078536987, "learning_rate": 2.1379206730769232e-05, "loss": 0.822, "step": 2006 }, { "epoch": 1.1707743911331485, "grad_norm": 1.1328845024108887, "learning_rate": 2.1364182692307694e-05, "loss": 0.8251, "step": 2007 }, { "epoch": 1.171357736619513, "grad_norm": 1.0141953229904175, "learning_rate": 2.1349158653846156e-05, "loss": 0.5794, "step": 2008 }, { "epoch": 1.171941082105877, "grad_norm": 1.3566126823425293, "learning_rate": 2.1334134615384614e-05, "loss": 0.9965, "step": 2009 }, { "epoch": 1.1725244275922415, "grad_norm": 1.246524691581726, "learning_rate": 2.131911057692308e-05, "loss": 0.7536, "step": 2010 }, { "epoch": 1.173107773078606, "grad_norm": 1.3714301586151123, "learning_rate": 2.130408653846154e-05, "loss": 0.7692, "step": 2011 }, { "epoch": 1.17369111856497, "grad_norm": 1.1998827457427979, "learning_rate": 2.12890625e-05, "loss": 0.8805, "step": 2012 }, { "epoch": 1.1742744640513343, "grad_norm": 1.1431297063827515, "learning_rate": 2.1274038461538462e-05, "loss": 0.9395, "step": 2013 }, { "epoch": 1.1748578095376987, "grad_norm": 1.1474422216415405, "learning_rate": 2.1259014423076924e-05, "loss": 0.5988, "step": 2014 }, { "epoch": 1.175441155024063, "grad_norm": 1.1838253736495972, "learning_rate": 2.1243990384615386e-05, "loss": 1.0735, "step": 2015 }, { "epoch": 1.1760245005104273, "grad_norm": 1.1664282083511353, "learning_rate": 2.1228966346153847e-05, "loss": 0.9661, "step": 2016 }, { "epoch": 1.1766078459967917, "grad_norm": 1.2454452514648438, "learning_rate": 2.121394230769231e-05, "loss": 0.8219, "step": 2017 }, { "epoch": 1.1771911914831559, "grad_norm": 1.3535860776901245, "learning_rate": 2.1198918269230768e-05, "loss": 0.855, "step": 2018 }, { "epoch": 1.1777745369695203, "grad_norm": 1.3076273202896118, "learning_rate": 2.1183894230769233e-05, "loss": 0.9618, "step": 2019 }, { "epoch": 1.1783578824558845, "grad_norm": 1.2071937322616577, "learning_rate": 2.1168870192307695e-05, "loss": 0.7426, "step": 2020 }, { "epoch": 1.1789412279422489, "grad_norm": 1.273971676826477, "learning_rate": 2.1153846153846154e-05, "loss": 0.8335, "step": 2021 }, { "epoch": 1.179524573428613, "grad_norm": 1.1485315561294556, "learning_rate": 2.1138822115384615e-05, "loss": 0.9572, "step": 2022 }, { "epoch": 1.1801079189149775, "grad_norm": 1.3041000366210938, "learning_rate": 2.112379807692308e-05, "loss": 0.6271, "step": 2023 }, { "epoch": 1.1806912644013416, "grad_norm": 1.3196794986724854, "learning_rate": 2.110877403846154e-05, "loss": 0.7458, "step": 2024 }, { "epoch": 1.181274609887706, "grad_norm": 1.1841623783111572, "learning_rate": 2.109375e-05, "loss": 0.7578, "step": 2025 }, { "epoch": 1.1818579553740702, "grad_norm": 1.098708152770996, "learning_rate": 2.1078725961538463e-05, "loss": 0.7583, "step": 2026 }, { "epoch": 1.1824413008604346, "grad_norm": 1.4319161176681519, "learning_rate": 2.106370192307692e-05, "loss": 0.6168, "step": 2027 }, { "epoch": 1.1830246463467988, "grad_norm": 1.0580462217330933, "learning_rate": 2.1048677884615387e-05, "loss": 0.9887, "step": 2028 }, { "epoch": 1.1836079918331632, "grad_norm": 0.9840408563613892, "learning_rate": 2.103365384615385e-05, "loss": 0.8125, "step": 2029 }, { "epoch": 1.1841913373195274, "grad_norm": 1.2513033151626587, "learning_rate": 2.1018629807692307e-05, "loss": 0.8072, "step": 2030 }, { "epoch": 1.1847746828058918, "grad_norm": 1.3638144731521606, "learning_rate": 2.100360576923077e-05, "loss": 0.7152, "step": 2031 }, { "epoch": 1.185358028292256, "grad_norm": 1.3127323389053345, "learning_rate": 2.0988581730769234e-05, "loss": 0.8506, "step": 2032 }, { "epoch": 1.1859413737786204, "grad_norm": 1.028713345527649, "learning_rate": 2.0973557692307693e-05, "loss": 0.7184, "step": 2033 }, { "epoch": 1.1865247192649846, "grad_norm": 1.3238139152526855, "learning_rate": 2.0958533653846155e-05, "loss": 0.9182, "step": 2034 }, { "epoch": 1.187108064751349, "grad_norm": 1.1423969268798828, "learning_rate": 2.0943509615384617e-05, "loss": 0.6711, "step": 2035 }, { "epoch": 1.1876914102377132, "grad_norm": 1.0575798749923706, "learning_rate": 2.092848557692308e-05, "loss": 0.6561, "step": 2036 }, { "epoch": 1.1882747557240776, "grad_norm": 1.18056058883667, "learning_rate": 2.091346153846154e-05, "loss": 0.9018, "step": 2037 }, { "epoch": 1.188858101210442, "grad_norm": 1.0494967699050903, "learning_rate": 2.0898437500000002e-05, "loss": 0.7882, "step": 2038 }, { "epoch": 1.1894414466968062, "grad_norm": 1.2525426149368286, "learning_rate": 2.088341346153846e-05, "loss": 0.7677, "step": 2039 }, { "epoch": 1.1900247921831704, "grad_norm": 1.2103064060211182, "learning_rate": 2.0868389423076923e-05, "loss": 0.7308, "step": 2040 }, { "epoch": 1.1906081376695348, "grad_norm": 1.238573670387268, "learning_rate": 2.0853365384615388e-05, "loss": 0.8663, "step": 2041 }, { "epoch": 1.1911914831558992, "grad_norm": 1.1449916362762451, "learning_rate": 2.0838341346153846e-05, "loss": 0.9087, "step": 2042 }, { "epoch": 1.1917748286422634, "grad_norm": 0.9556616544723511, "learning_rate": 2.082331730769231e-05, "loss": 0.8262, "step": 2043 }, { "epoch": 1.1923581741286278, "grad_norm": 1.0977288484573364, "learning_rate": 2.080829326923077e-05, "loss": 0.7063, "step": 2044 }, { "epoch": 1.192941519614992, "grad_norm": 1.09841787815094, "learning_rate": 2.0793269230769232e-05, "loss": 0.8165, "step": 2045 }, { "epoch": 1.1935248651013564, "grad_norm": 1.3817111253738403, "learning_rate": 2.0778245192307694e-05, "loss": 0.8761, "step": 2046 }, { "epoch": 1.1941082105877205, "grad_norm": 1.6460163593292236, "learning_rate": 2.0763221153846156e-05, "loss": 0.9336, "step": 2047 }, { "epoch": 1.194691556074085, "grad_norm": 1.1768983602523804, "learning_rate": 2.0748197115384614e-05, "loss": 0.7355, "step": 2048 }, { "epoch": 1.1952749015604491, "grad_norm": 1.1434657573699951, "learning_rate": 2.073317307692308e-05, "loss": 0.8538, "step": 2049 }, { "epoch": 1.1958582470468135, "grad_norm": 1.3496836423873901, "learning_rate": 2.071814903846154e-05, "loss": 0.6867, "step": 2050 }, { "epoch": 1.1964415925331777, "grad_norm": 1.1143194437026978, "learning_rate": 2.0703125e-05, "loss": 0.4993, "step": 2051 }, { "epoch": 1.1970249380195421, "grad_norm": 1.1078752279281616, "learning_rate": 2.0688100961538462e-05, "loss": 0.8276, "step": 2052 }, { "epoch": 1.1976082835059063, "grad_norm": 1.3760533332824707, "learning_rate": 2.0673076923076924e-05, "loss": 0.7688, "step": 2053 }, { "epoch": 1.1981916289922707, "grad_norm": 1.4191945791244507, "learning_rate": 2.0658052884615386e-05, "loss": 0.7183, "step": 2054 }, { "epoch": 1.198774974478635, "grad_norm": 3.6387343406677246, "learning_rate": 2.0643028846153848e-05, "loss": 0.9276, "step": 2055 }, { "epoch": 1.1993583199649993, "grad_norm": 1.3375924825668335, "learning_rate": 2.062800480769231e-05, "loss": 0.8397, "step": 2056 }, { "epoch": 1.1999416654513635, "grad_norm": 1.2695611715316772, "learning_rate": 2.0612980769230768e-05, "loss": 1.0456, "step": 2057 }, { "epoch": 1.200525010937728, "grad_norm": 1.278247594833374, "learning_rate": 2.0597956730769233e-05, "loss": 0.8718, "step": 2058 }, { "epoch": 1.201108356424092, "grad_norm": 1.2946563959121704, "learning_rate": 2.0582932692307695e-05, "loss": 0.7343, "step": 2059 }, { "epoch": 1.2016917019104565, "grad_norm": 1.5262877941131592, "learning_rate": 2.0567908653846154e-05, "loss": 0.8788, "step": 2060 }, { "epoch": 1.2022750473968207, "grad_norm": 1.3010066747665405, "learning_rate": 2.0552884615384616e-05, "loss": 0.7022, "step": 2061 }, { "epoch": 1.202858392883185, "grad_norm": 1.112301230430603, "learning_rate": 2.053786057692308e-05, "loss": 0.8348, "step": 2062 }, { "epoch": 1.2034417383695493, "grad_norm": 1.1436960697174072, "learning_rate": 2.052283653846154e-05, "loss": 0.7332, "step": 2063 }, { "epoch": 1.2040250838559137, "grad_norm": 1.1401253938674927, "learning_rate": 2.05078125e-05, "loss": 0.7297, "step": 2064 }, { "epoch": 1.204608429342278, "grad_norm": 1.187334418296814, "learning_rate": 2.0492788461538463e-05, "loss": 0.8428, "step": 2065 }, { "epoch": 1.2051917748286423, "grad_norm": 1.499940037727356, "learning_rate": 2.047776442307692e-05, "loss": 0.8032, "step": 2066 }, { "epoch": 1.2057751203150064, "grad_norm": 1.4192659854888916, "learning_rate": 2.0462740384615387e-05, "loss": 0.7676, "step": 2067 }, { "epoch": 1.2063584658013708, "grad_norm": 1.6800339221954346, "learning_rate": 2.044771634615385e-05, "loss": 0.7352, "step": 2068 }, { "epoch": 1.2069418112877353, "grad_norm": 1.1487371921539307, "learning_rate": 2.0432692307692307e-05, "loss": 0.7481, "step": 2069 }, { "epoch": 1.2075251567740994, "grad_norm": 1.2796707153320312, "learning_rate": 2.041766826923077e-05, "loss": 0.7011, "step": 2070 }, { "epoch": 1.2081085022604638, "grad_norm": 2.365525722503662, "learning_rate": 2.0402644230769235e-05, "loss": 0.8039, "step": 2071 }, { "epoch": 1.208691847746828, "grad_norm": 1.1125097274780273, "learning_rate": 2.0387620192307693e-05, "loss": 0.7027, "step": 2072 }, { "epoch": 1.2092751932331924, "grad_norm": 1.3944780826568604, "learning_rate": 2.0372596153846155e-05, "loss": 0.8972, "step": 2073 }, { "epoch": 1.2098585387195566, "grad_norm": 1.1701585054397583, "learning_rate": 2.0357572115384617e-05, "loss": 0.6956, "step": 2074 }, { "epoch": 1.210441884205921, "grad_norm": 1.17959463596344, "learning_rate": 2.034254807692308e-05, "loss": 0.7278, "step": 2075 }, { "epoch": 1.2110252296922852, "grad_norm": 1.3046574592590332, "learning_rate": 2.032752403846154e-05, "loss": 0.7054, "step": 2076 }, { "epoch": 1.2116085751786496, "grad_norm": 1.1868085861206055, "learning_rate": 2.0312500000000002e-05, "loss": 0.76, "step": 2077 }, { "epoch": 1.2121919206650138, "grad_norm": 1.4176987409591675, "learning_rate": 2.029747596153846e-05, "loss": 0.8469, "step": 2078 }, { "epoch": 1.2127752661513782, "grad_norm": 1.0783299207687378, "learning_rate": 2.0282451923076923e-05, "loss": 0.8074, "step": 2079 }, { "epoch": 1.2133586116377424, "grad_norm": 1.113829493522644, "learning_rate": 2.0267427884615388e-05, "loss": 0.7359, "step": 2080 }, { "epoch": 1.2139419571241068, "grad_norm": 1.3299684524536133, "learning_rate": 2.0252403846153847e-05, "loss": 0.8881, "step": 2081 }, { "epoch": 1.214525302610471, "grad_norm": 1.1996777057647705, "learning_rate": 2.023737980769231e-05, "loss": 0.686, "step": 2082 }, { "epoch": 1.2151086480968354, "grad_norm": 1.1687122583389282, "learning_rate": 2.022235576923077e-05, "loss": 0.8864, "step": 2083 }, { "epoch": 1.2156919935831996, "grad_norm": 1.3415162563323975, "learning_rate": 2.0207331730769232e-05, "loss": 0.8052, "step": 2084 }, { "epoch": 1.216275339069564, "grad_norm": 1.184653878211975, "learning_rate": 2.0192307692307694e-05, "loss": 0.7937, "step": 2085 }, { "epoch": 1.2168586845559282, "grad_norm": 1.2197047472000122, "learning_rate": 2.0177283653846156e-05, "loss": 0.7549, "step": 2086 }, { "epoch": 1.2174420300422926, "grad_norm": 1.2201251983642578, "learning_rate": 2.0162259615384615e-05, "loss": 0.818, "step": 2087 }, { "epoch": 1.2180253755286568, "grad_norm": 1.0660450458526611, "learning_rate": 2.014723557692308e-05, "loss": 0.7244, "step": 2088 }, { "epoch": 1.2186087210150212, "grad_norm": 1.112344741821289, "learning_rate": 2.0132211538461542e-05, "loss": 0.8623, "step": 2089 }, { "epoch": 1.2191920665013853, "grad_norm": 1.066853404045105, "learning_rate": 2.01171875e-05, "loss": 0.7755, "step": 2090 }, { "epoch": 1.2197754119877497, "grad_norm": 1.4612683057785034, "learning_rate": 2.0102163461538462e-05, "loss": 0.7503, "step": 2091 }, { "epoch": 1.2203587574741142, "grad_norm": 1.3045361042022705, "learning_rate": 2.0087139423076924e-05, "loss": 1.054, "step": 2092 }, { "epoch": 1.2209421029604783, "grad_norm": 1.891230821609497, "learning_rate": 2.0072115384615386e-05, "loss": 0.801, "step": 2093 }, { "epoch": 1.2215254484468425, "grad_norm": 1.2401351928710938, "learning_rate": 2.0057091346153848e-05, "loss": 1.0192, "step": 2094 }, { "epoch": 1.222108793933207, "grad_norm": 1.1924711465835571, "learning_rate": 2.004206730769231e-05, "loss": 0.7175, "step": 2095 }, { "epoch": 1.2226921394195713, "grad_norm": 1.4150327444076538, "learning_rate": 2.0027043269230768e-05, "loss": 0.8763, "step": 2096 }, { "epoch": 1.2232754849059355, "grad_norm": 1.2163945436477661, "learning_rate": 2.0012019230769233e-05, "loss": 0.678, "step": 2097 }, { "epoch": 1.2238588303923, "grad_norm": 1.3845294713974, "learning_rate": 1.9996995192307695e-05, "loss": 0.658, "step": 2098 }, { "epoch": 1.224442175878664, "grad_norm": 1.308927297592163, "learning_rate": 1.9981971153846154e-05, "loss": 0.6827, "step": 2099 }, { "epoch": 1.2250255213650285, "grad_norm": 1.386710524559021, "learning_rate": 1.9966947115384616e-05, "loss": 0.6608, "step": 2100 }, { "epoch": 1.2256088668513927, "grad_norm": 2.081791877746582, "learning_rate": 1.9951923076923078e-05, "loss": 0.8755, "step": 2101 }, { "epoch": 1.226192212337757, "grad_norm": 1.240175724029541, "learning_rate": 1.993689903846154e-05, "loss": 0.7445, "step": 2102 }, { "epoch": 1.2267755578241213, "grad_norm": 1.1266789436340332, "learning_rate": 1.9921875e-05, "loss": 0.816, "step": 2103 }, { "epoch": 1.2273589033104857, "grad_norm": 1.2244000434875488, "learning_rate": 1.9906850961538463e-05, "loss": 0.8795, "step": 2104 }, { "epoch": 1.2279422487968499, "grad_norm": 1.420829176902771, "learning_rate": 1.9891826923076922e-05, "loss": 0.909, "step": 2105 }, { "epoch": 1.2285255942832143, "grad_norm": 1.031460165977478, "learning_rate": 1.9876802884615387e-05, "loss": 0.557, "step": 2106 }, { "epoch": 1.2291089397695785, "grad_norm": 1.2154114246368408, "learning_rate": 1.986177884615385e-05, "loss": 0.7688, "step": 2107 }, { "epoch": 1.2296922852559429, "grad_norm": 1.0314477682113647, "learning_rate": 1.9846754807692307e-05, "loss": 0.834, "step": 2108 }, { "epoch": 1.230275630742307, "grad_norm": 1.227577805519104, "learning_rate": 1.983173076923077e-05, "loss": 0.7475, "step": 2109 }, { "epoch": 1.2308589762286715, "grad_norm": 1.3141686916351318, "learning_rate": 1.981670673076923e-05, "loss": 0.8031, "step": 2110 }, { "epoch": 1.2314423217150356, "grad_norm": 1.0540612936019897, "learning_rate": 1.9801682692307693e-05, "loss": 0.7732, "step": 2111 }, { "epoch": 1.2320256672014, "grad_norm": 1.198327898979187, "learning_rate": 1.9786658653846155e-05, "loss": 0.8973, "step": 2112 }, { "epoch": 1.2326090126877642, "grad_norm": 1.286871314048767, "learning_rate": 1.9771634615384617e-05, "loss": 0.6474, "step": 2113 }, { "epoch": 1.2331923581741286, "grad_norm": 1.4305251836776733, "learning_rate": 1.975661057692308e-05, "loss": 1.0454, "step": 2114 }, { "epoch": 1.233775703660493, "grad_norm": 1.2102510929107666, "learning_rate": 1.974158653846154e-05, "loss": 0.5195, "step": 2115 }, { "epoch": 1.2343590491468572, "grad_norm": 1.3697700500488281, "learning_rate": 1.9726562500000003e-05, "loss": 0.7984, "step": 2116 }, { "epoch": 1.2349423946332214, "grad_norm": 1.121392846107483, "learning_rate": 1.971153846153846e-05, "loss": 0.9984, "step": 2117 }, { "epoch": 1.2355257401195858, "grad_norm": 1.2788634300231934, "learning_rate": 1.9696514423076923e-05, "loss": 0.7598, "step": 2118 }, { "epoch": 1.2361090856059502, "grad_norm": 1.1477173566818237, "learning_rate": 1.9681490384615385e-05, "loss": 0.7552, "step": 2119 }, { "epoch": 1.2366924310923144, "grad_norm": 1.1774203777313232, "learning_rate": 1.9666466346153847e-05, "loss": 0.8124, "step": 2120 }, { "epoch": 1.2372757765786786, "grad_norm": 1.0521777868270874, "learning_rate": 1.965144230769231e-05, "loss": 0.7731, "step": 2121 }, { "epoch": 1.237859122065043, "grad_norm": 1.3230030536651611, "learning_rate": 1.963641826923077e-05, "loss": 0.8122, "step": 2122 }, { "epoch": 1.2384424675514074, "grad_norm": 1.1762146949768066, "learning_rate": 1.9621394230769232e-05, "loss": 0.7926, "step": 2123 }, { "epoch": 1.2390258130377716, "grad_norm": 1.3138278722763062, "learning_rate": 1.9606370192307694e-05, "loss": 0.8361, "step": 2124 }, { "epoch": 1.239609158524136, "grad_norm": 1.3810380697250366, "learning_rate": 1.9591346153846156e-05, "loss": 0.8379, "step": 2125 }, { "epoch": 1.2401925040105002, "grad_norm": 1.2080543041229248, "learning_rate": 1.9576322115384615e-05, "loss": 1.0409, "step": 2126 }, { "epoch": 1.2407758494968646, "grad_norm": 1.35194730758667, "learning_rate": 1.956129807692308e-05, "loss": 0.5343, "step": 2127 }, { "epoch": 1.2413591949832288, "grad_norm": 1.287524700164795, "learning_rate": 1.954627403846154e-05, "loss": 0.9522, "step": 2128 }, { "epoch": 1.2419425404695932, "grad_norm": 1.3742008209228516, "learning_rate": 1.953125e-05, "loss": 0.6388, "step": 2129 }, { "epoch": 1.2425258859559574, "grad_norm": 1.1241064071655273, "learning_rate": 1.9516225961538462e-05, "loss": 0.6185, "step": 2130 }, { "epoch": 1.2431092314423218, "grad_norm": 1.279675006866455, "learning_rate": 1.9501201923076924e-05, "loss": 0.7721, "step": 2131 }, { "epoch": 1.243692576928686, "grad_norm": 1.381134033203125, "learning_rate": 1.9486177884615386e-05, "loss": 0.7895, "step": 2132 }, { "epoch": 1.2442759224150504, "grad_norm": 1.276642918586731, "learning_rate": 1.9471153846153848e-05, "loss": 0.855, "step": 2133 }, { "epoch": 1.2448592679014145, "grad_norm": 1.0444273948669434, "learning_rate": 1.945612980769231e-05, "loss": 1.0147, "step": 2134 }, { "epoch": 1.245442613387779, "grad_norm": 1.4284225702285767, "learning_rate": 1.944110576923077e-05, "loss": 0.8998, "step": 2135 }, { "epoch": 1.2460259588741431, "grad_norm": 1.3835419416427612, "learning_rate": 1.9426081730769234e-05, "loss": 0.7556, "step": 2136 }, { "epoch": 1.2466093043605075, "grad_norm": 1.3676860332489014, "learning_rate": 1.9411057692307692e-05, "loss": 0.7672, "step": 2137 }, { "epoch": 1.2471926498468717, "grad_norm": 1.2197344303131104, "learning_rate": 1.9396033653846154e-05, "loss": 0.8391, "step": 2138 }, { "epoch": 1.2477759953332361, "grad_norm": 1.1958162784576416, "learning_rate": 1.9381009615384616e-05, "loss": 0.7733, "step": 2139 }, { "epoch": 1.2483593408196003, "grad_norm": 1.3922827243804932, "learning_rate": 1.9365985576923078e-05, "loss": 0.6837, "step": 2140 }, { "epoch": 1.2489426863059647, "grad_norm": 1.1373469829559326, "learning_rate": 1.935096153846154e-05, "loss": 0.8817, "step": 2141 }, { "epoch": 1.2495260317923291, "grad_norm": 1.2826591730117798, "learning_rate": 1.93359375e-05, "loss": 0.8214, "step": 2142 }, { "epoch": 1.2501093772786933, "grad_norm": 1.4706931114196777, "learning_rate": 1.9320913461538463e-05, "loss": 0.8828, "step": 2143 }, { "epoch": 1.2506927227650575, "grad_norm": 1.1995126008987427, "learning_rate": 1.9305889423076922e-05, "loss": 0.8192, "step": 2144 }, { "epoch": 1.251276068251422, "grad_norm": 1.321846842765808, "learning_rate": 1.9290865384615387e-05, "loss": 0.7632, "step": 2145 }, { "epoch": 1.2518594137377863, "grad_norm": 1.2701880931854248, "learning_rate": 1.9275841346153846e-05, "loss": 0.8689, "step": 2146 }, { "epoch": 1.2524427592241505, "grad_norm": 1.2398470640182495, "learning_rate": 1.9260817307692308e-05, "loss": 1.0235, "step": 2147 }, { "epoch": 1.2530261047105147, "grad_norm": 1.2439467906951904, "learning_rate": 1.924579326923077e-05, "loss": 0.6753, "step": 2148 }, { "epoch": 1.253609450196879, "grad_norm": 1.207457184791565, "learning_rate": 1.923076923076923e-05, "loss": 0.9007, "step": 2149 }, { "epoch": 1.2541927956832435, "grad_norm": 1.1856311559677124, "learning_rate": 1.9215745192307693e-05, "loss": 0.6977, "step": 2150 }, { "epoch": 1.2547761411696077, "grad_norm": 1.2477092742919922, "learning_rate": 1.9200721153846155e-05, "loss": 0.7035, "step": 2151 }, { "epoch": 1.255359486655972, "grad_norm": 1.0819847583770752, "learning_rate": 1.9185697115384617e-05, "loss": 0.7668, "step": 2152 }, { "epoch": 1.2559428321423363, "grad_norm": 1.4272301197052002, "learning_rate": 1.917067307692308e-05, "loss": 0.8473, "step": 2153 }, { "epoch": 1.2565261776287007, "grad_norm": 1.478842854499817, "learning_rate": 1.915564903846154e-05, "loss": 0.8842, "step": 2154 }, { "epoch": 1.2571095231150649, "grad_norm": 1.500758409500122, "learning_rate": 1.9140625e-05, "loss": 0.7502, "step": 2155 }, { "epoch": 1.2576928686014293, "grad_norm": 1.173345685005188, "learning_rate": 1.912560096153846e-05, "loss": 0.7785, "step": 2156 }, { "epoch": 1.2582762140877934, "grad_norm": 1.1746881008148193, "learning_rate": 1.9110576923076923e-05, "loss": 0.83, "step": 2157 }, { "epoch": 1.2588595595741578, "grad_norm": 1.263342261314392, "learning_rate": 1.9095552884615385e-05, "loss": 0.896, "step": 2158 }, { "epoch": 1.259442905060522, "grad_norm": 1.374733328819275, "learning_rate": 1.9080528846153847e-05, "loss": 0.8734, "step": 2159 }, { "epoch": 1.2600262505468864, "grad_norm": 1.1060365438461304, "learning_rate": 1.906550480769231e-05, "loss": 0.684, "step": 2160 }, { "epoch": 1.2606095960332506, "grad_norm": 1.289867639541626, "learning_rate": 1.905048076923077e-05, "loss": 0.8807, "step": 2161 }, { "epoch": 1.261192941519615, "grad_norm": 0.9511016607284546, "learning_rate": 1.9035456730769233e-05, "loss": 0.9252, "step": 2162 }, { "epoch": 1.2617762870059792, "grad_norm": 1.3257215023040771, "learning_rate": 1.9020432692307695e-05, "loss": 0.6726, "step": 2163 }, { "epoch": 1.2623596324923436, "grad_norm": 1.180518627166748, "learning_rate": 1.9005408653846153e-05, "loss": 0.6201, "step": 2164 }, { "epoch": 1.262942977978708, "grad_norm": 1.2338030338287354, "learning_rate": 1.8990384615384615e-05, "loss": 0.8228, "step": 2165 }, { "epoch": 1.2635263234650722, "grad_norm": 0.9147897362709045, "learning_rate": 1.897536057692308e-05, "loss": 0.9485, "step": 2166 }, { "epoch": 1.2641096689514364, "grad_norm": 1.209277629852295, "learning_rate": 1.896033653846154e-05, "loss": 0.9845, "step": 2167 }, { "epoch": 1.2646930144378008, "grad_norm": 1.3521689176559448, "learning_rate": 1.89453125e-05, "loss": 0.8049, "step": 2168 }, { "epoch": 1.2652763599241652, "grad_norm": 1.1422154903411865, "learning_rate": 1.8930288461538462e-05, "loss": 0.8085, "step": 2169 }, { "epoch": 1.2658597054105294, "grad_norm": 1.3813396692276, "learning_rate": 1.8915264423076924e-05, "loss": 0.8315, "step": 2170 }, { "epoch": 1.2664430508968936, "grad_norm": 1.1161137819290161, "learning_rate": 1.8900240384615386e-05, "loss": 0.8324, "step": 2171 }, { "epoch": 1.267026396383258, "grad_norm": 1.3338303565979004, "learning_rate": 1.8885216346153848e-05, "loss": 0.8148, "step": 2172 }, { "epoch": 1.2676097418696224, "grad_norm": 1.677724838256836, "learning_rate": 1.8870192307692307e-05, "loss": 0.8382, "step": 2173 }, { "epoch": 1.2681930873559866, "grad_norm": 1.2323002815246582, "learning_rate": 1.885516826923077e-05, "loss": 0.8921, "step": 2174 }, { "epoch": 1.2687764328423508, "grad_norm": 1.3112934827804565, "learning_rate": 1.8840144230769234e-05, "loss": 0.782, "step": 2175 }, { "epoch": 1.2693597783287152, "grad_norm": 1.1889418363571167, "learning_rate": 1.8825120192307692e-05, "loss": 0.6976, "step": 2176 }, { "epoch": 1.2699431238150796, "grad_norm": 1.2303264141082764, "learning_rate": 1.8810096153846154e-05, "loss": 0.7932, "step": 2177 }, { "epoch": 1.2705264693014438, "grad_norm": 1.306663990020752, "learning_rate": 1.8795072115384616e-05, "loss": 0.8867, "step": 2178 }, { "epoch": 1.2711098147878082, "grad_norm": 1.3073201179504395, "learning_rate": 1.8780048076923078e-05, "loss": 0.7185, "step": 2179 }, { "epoch": 1.2716931602741723, "grad_norm": 1.2337119579315186, "learning_rate": 1.876502403846154e-05, "loss": 0.8584, "step": 2180 }, { "epoch": 1.2722765057605367, "grad_norm": 1.0608634948730469, "learning_rate": 1.8750000000000002e-05, "loss": 0.7698, "step": 2181 }, { "epoch": 1.272859851246901, "grad_norm": 1.7613379955291748, "learning_rate": 1.873497596153846e-05, "loss": 0.8036, "step": 2182 }, { "epoch": 1.2734431967332653, "grad_norm": 1.2020319700241089, "learning_rate": 1.8719951923076922e-05, "loss": 0.9223, "step": 2183 }, { "epoch": 1.2740265422196295, "grad_norm": 1.1997487545013428, "learning_rate": 1.8704927884615387e-05, "loss": 0.8401, "step": 2184 }, { "epoch": 1.274609887705994, "grad_norm": 1.2920103073120117, "learning_rate": 1.8689903846153846e-05, "loss": 0.7484, "step": 2185 }, { "epoch": 1.275193233192358, "grad_norm": 1.2046105861663818, "learning_rate": 1.8674879807692308e-05, "loss": 0.6554, "step": 2186 }, { "epoch": 1.2757765786787225, "grad_norm": 1.1587508916854858, "learning_rate": 1.865985576923077e-05, "loss": 0.9453, "step": 2187 }, { "epoch": 1.2763599241650867, "grad_norm": 1.2004345655441284, "learning_rate": 1.864483173076923e-05, "loss": 0.7673, "step": 2188 }, { "epoch": 1.276943269651451, "grad_norm": 1.2180094718933105, "learning_rate": 1.8629807692307693e-05, "loss": 0.7902, "step": 2189 }, { "epoch": 1.2775266151378153, "grad_norm": 1.1839169263839722, "learning_rate": 1.8614783653846155e-05, "loss": 0.9672, "step": 2190 }, { "epoch": 1.2781099606241797, "grad_norm": 1.2951889038085938, "learning_rate": 1.8599759615384614e-05, "loss": 0.8781, "step": 2191 }, { "epoch": 1.278693306110544, "grad_norm": 1.199181318283081, "learning_rate": 1.858473557692308e-05, "loss": 0.8252, "step": 2192 }, { "epoch": 1.2792766515969083, "grad_norm": 1.3422390222549438, "learning_rate": 1.856971153846154e-05, "loss": 0.8287, "step": 2193 }, { "epoch": 1.2798599970832725, "grad_norm": 1.1554558277130127, "learning_rate": 1.85546875e-05, "loss": 0.8332, "step": 2194 }, { "epoch": 1.2804433425696369, "grad_norm": 1.2082306146621704, "learning_rate": 1.853966346153846e-05, "loss": 0.8973, "step": 2195 }, { "epoch": 1.2810266880560013, "grad_norm": 1.3980644941329956, "learning_rate": 1.8524639423076923e-05, "loss": 0.7318, "step": 2196 }, { "epoch": 1.2816100335423655, "grad_norm": 1.290248155593872, "learning_rate": 1.8509615384615385e-05, "loss": 0.9601, "step": 2197 }, { "epoch": 1.2821933790287297, "grad_norm": 1.2903095483779907, "learning_rate": 1.8494591346153847e-05, "loss": 0.853, "step": 2198 }, { "epoch": 1.282776724515094, "grad_norm": 1.1215404272079468, "learning_rate": 1.847956730769231e-05, "loss": 0.8574, "step": 2199 }, { "epoch": 1.2833600700014585, "grad_norm": 1.4814391136169434, "learning_rate": 1.8464543269230767e-05, "loss": 0.7185, "step": 2200 }, { "epoch": 1.2833600700014585, "eval_loss_squad": 0.8695367491571233, "eval_perplexity": 8.314321659478429, "eval_perplexity_reconstruct": 1.905092611405054, "step": 2200 }, { "epoch": 1.2839434154878226, "grad_norm": 1.5173953771591187, "learning_rate": 1.8449519230769233e-05, "loss": 0.7604, "step": 2201 }, { "epoch": 1.2845267609741868, "grad_norm": 1.3316093683242798, "learning_rate": 1.8434495192307695e-05, "loss": 0.7515, "step": 2202 }, { "epoch": 1.2851101064605512, "grad_norm": 1.444630742073059, "learning_rate": 1.8419471153846153e-05, "loss": 1.0531, "step": 2203 }, { "epoch": 1.2856934519469156, "grad_norm": 1.2808470726013184, "learning_rate": 1.8404447115384615e-05, "loss": 0.6771, "step": 2204 }, { "epoch": 1.2862767974332798, "grad_norm": 1.2151483297348022, "learning_rate": 1.838942307692308e-05, "loss": 0.8662, "step": 2205 }, { "epoch": 1.2868601429196442, "grad_norm": 1.46257483959198, "learning_rate": 1.837439903846154e-05, "loss": 0.8319, "step": 2206 }, { "epoch": 1.2874434884060084, "grad_norm": 1.3388731479644775, "learning_rate": 1.8359375e-05, "loss": 0.6788, "step": 2207 }, { "epoch": 1.2880268338923728, "grad_norm": 1.563617467880249, "learning_rate": 1.8344350961538463e-05, "loss": 0.5617, "step": 2208 }, { "epoch": 1.288610179378737, "grad_norm": 1.1539713144302368, "learning_rate": 1.832932692307692e-05, "loss": 0.7174, "step": 2209 }, { "epoch": 1.2891935248651014, "grad_norm": 1.2138261795043945, "learning_rate": 1.8314302884615386e-05, "loss": 0.9121, "step": 2210 }, { "epoch": 1.2897768703514656, "grad_norm": 1.2265892028808594, "learning_rate": 1.8299278846153848e-05, "loss": 0.7816, "step": 2211 }, { "epoch": 1.29036021583783, "grad_norm": 1.1945728063583374, "learning_rate": 1.8284254807692307e-05, "loss": 0.8718, "step": 2212 }, { "epoch": 1.2909435613241942, "grad_norm": 1.3123586177825928, "learning_rate": 1.826923076923077e-05, "loss": 0.7532, "step": 2213 }, { "epoch": 1.2915269068105586, "grad_norm": 1.2278257608413696, "learning_rate": 1.8254206730769234e-05, "loss": 0.9726, "step": 2214 }, { "epoch": 1.2921102522969228, "grad_norm": 1.100456714630127, "learning_rate": 1.8239182692307692e-05, "loss": 0.5921, "step": 2215 }, { "epoch": 1.2926935977832872, "grad_norm": 1.1185805797576904, "learning_rate": 1.8224158653846154e-05, "loss": 0.7453, "step": 2216 }, { "epoch": 1.2932769432696514, "grad_norm": 1.2159292697906494, "learning_rate": 1.8209134615384616e-05, "loss": 0.6872, "step": 2217 }, { "epoch": 1.2938602887560158, "grad_norm": 1.1445262432098389, "learning_rate": 1.8194110576923078e-05, "loss": 0.7672, "step": 2218 }, { "epoch": 1.2944436342423802, "grad_norm": 1.1859655380249023, "learning_rate": 1.817908653846154e-05, "loss": 0.6995, "step": 2219 }, { "epoch": 1.2950269797287444, "grad_norm": 1.2108757495880127, "learning_rate": 1.8164062500000002e-05, "loss": 1.1017, "step": 2220 }, { "epoch": 1.2956103252151085, "grad_norm": 1.0840204954147339, "learning_rate": 1.814903846153846e-05, "loss": 0.9441, "step": 2221 }, { "epoch": 1.296193670701473, "grad_norm": 1.4668989181518555, "learning_rate": 1.8134014423076922e-05, "loss": 0.8163, "step": 2222 }, { "epoch": 1.2967770161878374, "grad_norm": 1.3289495706558228, "learning_rate": 1.8118990384615388e-05, "loss": 0.7978, "step": 2223 }, { "epoch": 1.2973603616742015, "grad_norm": 1.1220710277557373, "learning_rate": 1.8103966346153846e-05, "loss": 0.8322, "step": 2224 }, { "epoch": 1.2979437071605657, "grad_norm": 1.1510266065597534, "learning_rate": 1.8088942307692308e-05, "loss": 0.735, "step": 2225 }, { "epoch": 1.2985270526469301, "grad_norm": 1.145638108253479, "learning_rate": 1.807391826923077e-05, "loss": 0.8109, "step": 2226 }, { "epoch": 1.2991103981332945, "grad_norm": 1.2273057699203491, "learning_rate": 1.8058894230769232e-05, "loss": 0.8697, "step": 2227 }, { "epoch": 1.2996937436196587, "grad_norm": 0.9667252898216248, "learning_rate": 1.8043870192307694e-05, "loss": 0.7946, "step": 2228 }, { "epoch": 1.300277089106023, "grad_norm": 1.183985710144043, "learning_rate": 1.8028846153846156e-05, "loss": 0.9245, "step": 2229 }, { "epoch": 1.3008604345923873, "grad_norm": 1.4120750427246094, "learning_rate": 1.8013822115384614e-05, "loss": 0.899, "step": 2230 }, { "epoch": 1.3014437800787517, "grad_norm": 1.9396076202392578, "learning_rate": 1.799879807692308e-05, "loss": 0.816, "step": 2231 }, { "epoch": 1.302027125565116, "grad_norm": 1.4099103212356567, "learning_rate": 1.798377403846154e-05, "loss": 0.6821, "step": 2232 }, { "epoch": 1.3026104710514803, "grad_norm": 1.2785017490386963, "learning_rate": 1.796875e-05, "loss": 0.9443, "step": 2233 }, { "epoch": 1.3031938165378445, "grad_norm": 1.3573774099349976, "learning_rate": 1.795372596153846e-05, "loss": 0.7205, "step": 2234 }, { "epoch": 1.303777162024209, "grad_norm": 1.1221401691436768, "learning_rate": 1.7938701923076923e-05, "loss": 0.7908, "step": 2235 }, { "epoch": 1.304360507510573, "grad_norm": 1.1731091737747192, "learning_rate": 1.7923677884615385e-05, "loss": 0.6417, "step": 2236 }, { "epoch": 1.3049438529969375, "grad_norm": 1.3627444505691528, "learning_rate": 1.7908653846153847e-05, "loss": 0.7929, "step": 2237 }, { "epoch": 1.3055271984833017, "grad_norm": 1.4136035442352295, "learning_rate": 1.789362980769231e-05, "loss": 0.9224, "step": 2238 }, { "epoch": 1.306110543969666, "grad_norm": 1.138240933418274, "learning_rate": 1.7878605769230768e-05, "loss": 0.8356, "step": 2239 }, { "epoch": 1.3066938894560303, "grad_norm": 1.424336552619934, "learning_rate": 1.7863581730769233e-05, "loss": 0.7738, "step": 2240 }, { "epoch": 1.3072772349423947, "grad_norm": 1.2529820203781128, "learning_rate": 1.7848557692307695e-05, "loss": 0.6549, "step": 2241 }, { "epoch": 1.3078605804287589, "grad_norm": 1.7925548553466797, "learning_rate": 1.7833533653846153e-05, "loss": 0.648, "step": 2242 }, { "epoch": 1.3084439259151233, "grad_norm": 1.3370999097824097, "learning_rate": 1.7818509615384615e-05, "loss": 0.8967, "step": 2243 }, { "epoch": 1.3090272714014874, "grad_norm": 1.1147828102111816, "learning_rate": 1.780348557692308e-05, "loss": 0.8724, "step": 2244 }, { "epoch": 1.3096106168878519, "grad_norm": 1.2542976140975952, "learning_rate": 1.778846153846154e-05, "loss": 0.8384, "step": 2245 }, { "epoch": 1.3101939623742163, "grad_norm": 1.251609444618225, "learning_rate": 1.77734375e-05, "loss": 0.8426, "step": 2246 }, { "epoch": 1.3107773078605804, "grad_norm": 1.2808489799499512, "learning_rate": 1.7758413461538463e-05, "loss": 0.7978, "step": 2247 }, { "epoch": 1.3113606533469446, "grad_norm": 1.2761458158493042, "learning_rate": 1.774338942307692e-05, "loss": 0.9413, "step": 2248 }, { "epoch": 1.311943998833309, "grad_norm": 1.263123631477356, "learning_rate": 1.7728365384615387e-05, "loss": 0.7363, "step": 2249 }, { "epoch": 1.3125273443196734, "grad_norm": 1.2832683324813843, "learning_rate": 1.771334134615385e-05, "loss": 0.5699, "step": 2250 }, { "epoch": 1.3131106898060376, "grad_norm": 1.2383707761764526, "learning_rate": 1.7698317307692307e-05, "loss": 0.9112, "step": 2251 }, { "epoch": 1.3136940352924018, "grad_norm": 1.2074283361434937, "learning_rate": 1.768329326923077e-05, "loss": 0.7454, "step": 2252 }, { "epoch": 1.3142773807787662, "grad_norm": 1.1046134233474731, "learning_rate": 1.7668269230769234e-05, "loss": 0.9412, "step": 2253 }, { "epoch": 1.3148607262651306, "grad_norm": 1.1459115743637085, "learning_rate": 1.7653245192307693e-05, "loss": 0.7046, "step": 2254 }, { "epoch": 1.3154440717514948, "grad_norm": 1.2635064125061035, "learning_rate": 1.7638221153846155e-05, "loss": 0.8815, "step": 2255 }, { "epoch": 1.316027417237859, "grad_norm": 1.1063247919082642, "learning_rate": 1.7623197115384616e-05, "loss": 0.8305, "step": 2256 }, { "epoch": 1.3166107627242234, "grad_norm": 2.1608684062957764, "learning_rate": 1.7608173076923078e-05, "loss": 0.9767, "step": 2257 }, { "epoch": 1.3171941082105878, "grad_norm": 1.1712557077407837, "learning_rate": 1.759314903846154e-05, "loss": 1.0557, "step": 2258 }, { "epoch": 1.317777453696952, "grad_norm": 1.1504344940185547, "learning_rate": 1.7578125000000002e-05, "loss": 1.0544, "step": 2259 }, { "epoch": 1.3183607991833164, "grad_norm": 1.2168277502059937, "learning_rate": 1.756310096153846e-05, "loss": 0.8735, "step": 2260 }, { "epoch": 1.3189441446696806, "grad_norm": 1.2982547283172607, "learning_rate": 1.7548076923076922e-05, "loss": 0.9633, "step": 2261 }, { "epoch": 1.319527490156045, "grad_norm": 1.2747186422348022, "learning_rate": 1.7533052884615388e-05, "loss": 0.6536, "step": 2262 }, { "epoch": 1.3201108356424092, "grad_norm": 1.1642917394638062, "learning_rate": 1.7518028846153846e-05, "loss": 0.7892, "step": 2263 }, { "epoch": 1.3206941811287736, "grad_norm": 1.22393798828125, "learning_rate": 1.7503004807692308e-05, "loss": 0.8314, "step": 2264 }, { "epoch": 1.3212775266151378, "grad_norm": 1.3126612901687622, "learning_rate": 1.748798076923077e-05, "loss": 0.695, "step": 2265 }, { "epoch": 1.3218608721015022, "grad_norm": 1.1996201276779175, "learning_rate": 1.7472956730769232e-05, "loss": 0.7914, "step": 2266 }, { "epoch": 1.3224442175878663, "grad_norm": 1.2702114582061768, "learning_rate": 1.7457932692307694e-05, "loss": 0.835, "step": 2267 }, { "epoch": 1.3230275630742308, "grad_norm": 1.1210219860076904, "learning_rate": 1.7442908653846156e-05, "loss": 0.6297, "step": 2268 }, { "epoch": 1.323610908560595, "grad_norm": 1.0922166109085083, "learning_rate": 1.7427884615384614e-05, "loss": 0.735, "step": 2269 }, { "epoch": 1.3241942540469593, "grad_norm": 1.3747941255569458, "learning_rate": 1.741286057692308e-05, "loss": 0.9677, "step": 2270 }, { "epoch": 1.3247775995333235, "grad_norm": 1.3291850090026855, "learning_rate": 1.739783653846154e-05, "loss": 0.9007, "step": 2271 }, { "epoch": 1.325360945019688, "grad_norm": 1.1687302589416504, "learning_rate": 1.73828125e-05, "loss": 0.7095, "step": 2272 }, { "epoch": 1.3259442905060523, "grad_norm": 1.1394537687301636, "learning_rate": 1.7367788461538462e-05, "loss": 0.6055, "step": 2273 }, { "epoch": 1.3265276359924165, "grad_norm": 1.1962412595748901, "learning_rate": 1.7352764423076924e-05, "loss": 0.7964, "step": 2274 }, { "epoch": 1.3271109814787807, "grad_norm": 1.5585336685180664, "learning_rate": 1.7337740384615386e-05, "loss": 0.725, "step": 2275 }, { "epoch": 1.327694326965145, "grad_norm": 1.1137040853500366, "learning_rate": 1.7322716346153847e-05, "loss": 0.8106, "step": 2276 }, { "epoch": 1.3282776724515095, "grad_norm": 1.2830206155776978, "learning_rate": 1.730769230769231e-05, "loss": 0.8546, "step": 2277 }, { "epoch": 1.3288610179378737, "grad_norm": 1.1548091173171997, "learning_rate": 1.7292668269230768e-05, "loss": 0.8009, "step": 2278 }, { "epoch": 1.3294443634242379, "grad_norm": 1.5664095878601074, "learning_rate": 1.7277644230769233e-05, "loss": 0.7502, "step": 2279 }, { "epoch": 1.3300277089106023, "grad_norm": 1.4026076793670654, "learning_rate": 1.7262620192307695e-05, "loss": 0.886, "step": 2280 }, { "epoch": 1.3306110543969667, "grad_norm": 1.2917288541793823, "learning_rate": 1.7247596153846153e-05, "loss": 0.7834, "step": 2281 }, { "epoch": 1.3311943998833309, "grad_norm": 1.4439992904663086, "learning_rate": 1.7232572115384615e-05, "loss": 0.8932, "step": 2282 }, { "epoch": 1.331777745369695, "grad_norm": 1.2439894676208496, "learning_rate": 1.721754807692308e-05, "loss": 0.7965, "step": 2283 }, { "epoch": 1.3323610908560595, "grad_norm": 1.2655996084213257, "learning_rate": 1.720252403846154e-05, "loss": 0.7999, "step": 2284 }, { "epoch": 1.3329444363424239, "grad_norm": 1.1132214069366455, "learning_rate": 1.71875e-05, "loss": 0.8086, "step": 2285 }, { "epoch": 1.333527781828788, "grad_norm": 1.3198105096817017, "learning_rate": 1.7172475961538463e-05, "loss": 0.824, "step": 2286 }, { "epoch": 1.3341111273151525, "grad_norm": 1.1884894371032715, "learning_rate": 1.715745192307692e-05, "loss": 0.7186, "step": 2287 }, { "epoch": 1.3346944728015167, "grad_norm": 1.179015040397644, "learning_rate": 1.7142427884615387e-05, "loss": 0.9886, "step": 2288 }, { "epoch": 1.335277818287881, "grad_norm": 1.1946966648101807, "learning_rate": 1.712740384615385e-05, "loss": 0.7248, "step": 2289 }, { "epoch": 1.3358611637742452, "grad_norm": 1.2960485219955444, "learning_rate": 1.7112379807692307e-05, "loss": 0.8273, "step": 2290 }, { "epoch": 1.3364445092606096, "grad_norm": 1.2853033542633057, "learning_rate": 1.709735576923077e-05, "loss": 0.8596, "step": 2291 }, { "epoch": 1.3370278547469738, "grad_norm": 1.0934782028198242, "learning_rate": 1.7082331730769234e-05, "loss": 0.71, "step": 2292 }, { "epoch": 1.3376112002333382, "grad_norm": 1.2531418800354004, "learning_rate": 1.7067307692307693e-05, "loss": 0.8957, "step": 2293 }, { "epoch": 1.3381945457197024, "grad_norm": 1.389156460762024, "learning_rate": 1.7052283653846155e-05, "loss": 0.8757, "step": 2294 }, { "epoch": 1.3387778912060668, "grad_norm": 1.4648810625076294, "learning_rate": 1.7037259615384617e-05, "loss": 0.9267, "step": 2295 }, { "epoch": 1.339361236692431, "grad_norm": 1.110659122467041, "learning_rate": 1.702223557692308e-05, "loss": 0.8775, "step": 2296 }, { "epoch": 1.3399445821787954, "grad_norm": 1.2895351648330688, "learning_rate": 1.700721153846154e-05, "loss": 0.7839, "step": 2297 }, { "epoch": 1.3405279276651596, "grad_norm": 1.207476258277893, "learning_rate": 1.6992187500000002e-05, "loss": 0.7952, "step": 2298 }, { "epoch": 1.341111273151524, "grad_norm": 1.142844557762146, "learning_rate": 1.697716346153846e-05, "loss": 0.7482, "step": 2299 }, { "epoch": 1.3416946186378884, "grad_norm": 1.2515149116516113, "learning_rate": 1.6962139423076923e-05, "loss": 0.9569, "step": 2300 }, { "epoch": 1.3422779641242526, "grad_norm": 1.090381145477295, "learning_rate": 1.6947115384615388e-05, "loss": 0.8499, "step": 2301 }, { "epoch": 1.3428613096106168, "grad_norm": 1.2688581943511963, "learning_rate": 1.6932091346153846e-05, "loss": 0.6979, "step": 2302 }, { "epoch": 1.3434446550969812, "grad_norm": 1.2520463466644287, "learning_rate": 1.6917067307692308e-05, "loss": 0.7364, "step": 2303 }, { "epoch": 1.3440280005833456, "grad_norm": 1.1990320682525635, "learning_rate": 1.690204326923077e-05, "loss": 0.652, "step": 2304 }, { "epoch": 1.3446113460697098, "grad_norm": 1.142889380455017, "learning_rate": 1.6887019230769232e-05, "loss": 0.9194, "step": 2305 }, { "epoch": 1.345194691556074, "grad_norm": 1.1990599632263184, "learning_rate": 1.6871995192307694e-05, "loss": 0.7939, "step": 2306 }, { "epoch": 1.3457780370424384, "grad_norm": 1.3533287048339844, "learning_rate": 1.6856971153846156e-05, "loss": 0.7524, "step": 2307 }, { "epoch": 1.3463613825288028, "grad_norm": 1.157096028327942, "learning_rate": 1.6841947115384614e-05, "loss": 0.5834, "step": 2308 }, { "epoch": 1.346944728015167, "grad_norm": 1.2179913520812988, "learning_rate": 1.682692307692308e-05, "loss": 0.9018, "step": 2309 }, { "epoch": 1.3475280735015314, "grad_norm": 1.2479956150054932, "learning_rate": 1.681189903846154e-05, "loss": 0.7633, "step": 2310 }, { "epoch": 1.3481114189878955, "grad_norm": 1.1520054340362549, "learning_rate": 1.6796875e-05, "loss": 0.7101, "step": 2311 }, { "epoch": 1.34869476447426, "grad_norm": 1.21807861328125, "learning_rate": 1.6781850961538462e-05, "loss": 0.7766, "step": 2312 }, { "epoch": 1.3492781099606241, "grad_norm": 1.4313595294952393, "learning_rate": 1.6766826923076924e-05, "loss": 0.6487, "step": 2313 }, { "epoch": 1.3498614554469885, "grad_norm": 1.3884174823760986, "learning_rate": 1.6751802884615386e-05, "loss": 0.7646, "step": 2314 }, { "epoch": 1.3504448009333527, "grad_norm": 1.1427439451217651, "learning_rate": 1.6736778846153848e-05, "loss": 0.8483, "step": 2315 }, { "epoch": 1.3510281464197171, "grad_norm": 1.2759617567062378, "learning_rate": 1.672175480769231e-05, "loss": 0.7245, "step": 2316 }, { "epoch": 1.3516114919060813, "grad_norm": 1.3233177661895752, "learning_rate": 1.6706730769230768e-05, "loss": 1.0692, "step": 2317 }, { "epoch": 1.3521948373924457, "grad_norm": 1.3573335409164429, "learning_rate": 1.6691706730769233e-05, "loss": 0.9137, "step": 2318 }, { "epoch": 1.35277818287881, "grad_norm": 1.0537357330322266, "learning_rate": 1.6676682692307695e-05, "loss": 0.7244, "step": 2319 }, { "epoch": 1.3533615283651743, "grad_norm": 1.126329779624939, "learning_rate": 1.6661658653846154e-05, "loss": 0.7374, "step": 2320 }, { "epoch": 1.3539448738515385, "grad_norm": 1.3769999742507935, "learning_rate": 1.6646634615384616e-05, "loss": 0.7223, "step": 2321 }, { "epoch": 1.354528219337903, "grad_norm": 1.307424545288086, "learning_rate": 1.6631610576923077e-05, "loss": 0.7789, "step": 2322 }, { "epoch": 1.355111564824267, "grad_norm": 1.2053377628326416, "learning_rate": 1.661658653846154e-05, "loss": 0.9478, "step": 2323 }, { "epoch": 1.3556949103106315, "grad_norm": 1.478121280670166, "learning_rate": 1.66015625e-05, "loss": 0.7229, "step": 2324 }, { "epoch": 1.3562782557969957, "grad_norm": 1.2465181350708008, "learning_rate": 1.6586538461538463e-05, "loss": 0.8424, "step": 2325 }, { "epoch": 1.35686160128336, "grad_norm": 1.366040825843811, "learning_rate": 1.657151442307692e-05, "loss": 0.8107, "step": 2326 }, { "epoch": 1.3574449467697245, "grad_norm": 1.2248352766036987, "learning_rate": 1.6556490384615387e-05, "loss": 0.7464, "step": 2327 }, { "epoch": 1.3580282922560887, "grad_norm": 1.4650728702545166, "learning_rate": 1.654146634615385e-05, "loss": 0.7499, "step": 2328 }, { "epoch": 1.3586116377424529, "grad_norm": 1.5269290208816528, "learning_rate": 1.6526442307692307e-05, "loss": 0.8989, "step": 2329 }, { "epoch": 1.3591949832288173, "grad_norm": 1.1337950229644775, "learning_rate": 1.651141826923077e-05, "loss": 0.6656, "step": 2330 }, { "epoch": 1.3597783287151817, "grad_norm": 1.1645163297653198, "learning_rate": 1.649639423076923e-05, "loss": 0.7783, "step": 2331 }, { "epoch": 1.3603616742015459, "grad_norm": 1.1996797323226929, "learning_rate": 1.6481370192307693e-05, "loss": 0.9066, "step": 2332 }, { "epoch": 1.36094501968791, "grad_norm": 1.2484889030456543, "learning_rate": 1.6466346153846155e-05, "loss": 0.761, "step": 2333 }, { "epoch": 1.3615283651742744, "grad_norm": 1.1876178979873657, "learning_rate": 1.6451322115384617e-05, "loss": 0.7001, "step": 2334 }, { "epoch": 1.3621117106606389, "grad_norm": 1.1717162132263184, "learning_rate": 1.643629807692308e-05, "loss": 0.7374, "step": 2335 }, { "epoch": 1.362695056147003, "grad_norm": 1.0891270637512207, "learning_rate": 1.642127403846154e-05, "loss": 0.9183, "step": 2336 }, { "epoch": 1.3632784016333674, "grad_norm": 1.3006951808929443, "learning_rate": 1.6406250000000002e-05, "loss": 0.7246, "step": 2337 }, { "epoch": 1.3638617471197316, "grad_norm": 1.3394348621368408, "learning_rate": 1.639122596153846e-05, "loss": 0.7199, "step": 2338 }, { "epoch": 1.364445092606096, "grad_norm": 1.2449558973312378, "learning_rate": 1.6376201923076923e-05, "loss": 0.8916, "step": 2339 }, { "epoch": 1.3650284380924602, "grad_norm": 1.0279120206832886, "learning_rate": 1.6361177884615385e-05, "loss": 0.9012, "step": 2340 }, { "epoch": 1.3656117835788246, "grad_norm": 1.218778133392334, "learning_rate": 1.6346153846153847e-05, "loss": 0.7377, "step": 2341 }, { "epoch": 1.3661951290651888, "grad_norm": 1.2190182209014893, "learning_rate": 1.633112980769231e-05, "loss": 0.963, "step": 2342 }, { "epoch": 1.3667784745515532, "grad_norm": 1.2658116817474365, "learning_rate": 1.631610576923077e-05, "loss": 0.8298, "step": 2343 }, { "epoch": 1.3673618200379174, "grad_norm": 1.2883538007736206, "learning_rate": 1.6301081730769232e-05, "loss": 0.8406, "step": 2344 }, { "epoch": 1.3679451655242818, "grad_norm": 1.2415398359298706, "learning_rate": 1.6286057692307694e-05, "loss": 0.7271, "step": 2345 }, { "epoch": 1.368528511010646, "grad_norm": 1.2686741352081299, "learning_rate": 1.6271033653846156e-05, "loss": 0.8293, "step": 2346 }, { "epoch": 1.3691118564970104, "grad_norm": 1.3187519311904907, "learning_rate": 1.6256009615384614e-05, "loss": 0.7959, "step": 2347 }, { "epoch": 1.3696952019833746, "grad_norm": 1.2809123992919922, "learning_rate": 1.624098557692308e-05, "loss": 0.657, "step": 2348 }, { "epoch": 1.370278547469739, "grad_norm": 1.2265044450759888, "learning_rate": 1.6225961538461538e-05, "loss": 0.8477, "step": 2349 }, { "epoch": 1.3708618929561034, "grad_norm": 1.0081802606582642, "learning_rate": 1.62109375e-05, "loss": 0.8687, "step": 2350 }, { "epoch": 1.3714452384424676, "grad_norm": 1.3187026977539062, "learning_rate": 1.6195913461538462e-05, "loss": 0.9389, "step": 2351 }, { "epoch": 1.3720285839288318, "grad_norm": 1.283357858657837, "learning_rate": 1.6180889423076924e-05, "loss": 0.7913, "step": 2352 }, { "epoch": 1.3726119294151962, "grad_norm": 1.126367211341858, "learning_rate": 1.6165865384615386e-05, "loss": 0.7502, "step": 2353 }, { "epoch": 1.3731952749015606, "grad_norm": 1.1398643255233765, "learning_rate": 1.6150841346153848e-05, "loss": 0.8787, "step": 2354 }, { "epoch": 1.3737786203879248, "grad_norm": 1.1188660860061646, "learning_rate": 1.613581730769231e-05, "loss": 0.5851, "step": 2355 }, { "epoch": 1.374361965874289, "grad_norm": 1.200810194015503, "learning_rate": 1.6120793269230768e-05, "loss": 0.7128, "step": 2356 }, { "epoch": 1.3749453113606533, "grad_norm": 1.1014255285263062, "learning_rate": 1.6105769230769233e-05, "loss": 0.8273, "step": 2357 }, { "epoch": 1.3755286568470177, "grad_norm": 1.1966328620910645, "learning_rate": 1.6090745192307692e-05, "loss": 0.9342, "step": 2358 }, { "epoch": 1.376112002333382, "grad_norm": 1.3458975553512573, "learning_rate": 1.6075721153846154e-05, "loss": 0.8317, "step": 2359 }, { "epoch": 1.3766953478197461, "grad_norm": 1.364256501197815, "learning_rate": 1.6060697115384616e-05, "loss": 0.7107, "step": 2360 }, { "epoch": 1.3772786933061105, "grad_norm": 1.2403984069824219, "learning_rate": 1.6045673076923078e-05, "loss": 0.6139, "step": 2361 }, { "epoch": 1.377862038792475, "grad_norm": 1.3325952291488647, "learning_rate": 1.603064903846154e-05, "loss": 0.8053, "step": 2362 }, { "epoch": 1.3784453842788391, "grad_norm": 1.0815925598144531, "learning_rate": 1.6015625e-05, "loss": 0.8323, "step": 2363 }, { "epoch": 1.3790287297652035, "grad_norm": 1.0733479261398315, "learning_rate": 1.6000600961538463e-05, "loss": 0.7921, "step": 2364 }, { "epoch": 1.3796120752515677, "grad_norm": 1.1240204572677612, "learning_rate": 1.5985576923076922e-05, "loss": 0.861, "step": 2365 }, { "epoch": 1.380195420737932, "grad_norm": 1.2654609680175781, "learning_rate": 1.5970552884615387e-05, "loss": 0.6925, "step": 2366 }, { "epoch": 1.3807787662242963, "grad_norm": 1.7558287382125854, "learning_rate": 1.5955528846153846e-05, "loss": 0.7423, "step": 2367 }, { "epoch": 1.3813621117106607, "grad_norm": 1.3726961612701416, "learning_rate": 1.5940504807692307e-05, "loss": 0.7108, "step": 2368 }, { "epoch": 1.3819454571970249, "grad_norm": 1.279613971710205, "learning_rate": 1.592548076923077e-05, "loss": 0.8721, "step": 2369 }, { "epoch": 1.3825288026833893, "grad_norm": 1.324630856513977, "learning_rate": 1.591045673076923e-05, "loss": 0.6835, "step": 2370 }, { "epoch": 1.3831121481697535, "grad_norm": 1.2332587242126465, "learning_rate": 1.5895432692307693e-05, "loss": 0.8593, "step": 2371 }, { "epoch": 1.3836954936561179, "grad_norm": 1.0641486644744873, "learning_rate": 1.5880408653846155e-05, "loss": 0.8916, "step": 2372 }, { "epoch": 1.384278839142482, "grad_norm": 1.2237672805786133, "learning_rate": 1.5865384615384617e-05, "loss": 0.8585, "step": 2373 }, { "epoch": 1.3848621846288465, "grad_norm": 1.3848849534988403, "learning_rate": 1.585036057692308e-05, "loss": 0.8661, "step": 2374 }, { "epoch": 1.3854455301152107, "grad_norm": 1.1622084379196167, "learning_rate": 1.583533653846154e-05, "loss": 0.8062, "step": 2375 }, { "epoch": 1.386028875601575, "grad_norm": 1.1375772953033447, "learning_rate": 1.58203125e-05, "loss": 0.8455, "step": 2376 }, { "epoch": 1.3866122210879395, "grad_norm": 1.1407866477966309, "learning_rate": 1.580528846153846e-05, "loss": 0.7599, "step": 2377 }, { "epoch": 1.3871955665743037, "grad_norm": 1.3291012048721313, "learning_rate": 1.5790264423076923e-05, "loss": 0.7574, "step": 2378 }, { "epoch": 1.3877789120606678, "grad_norm": 1.0793458223342896, "learning_rate": 1.5775240384615385e-05, "loss": 0.8483, "step": 2379 }, { "epoch": 1.3883622575470322, "grad_norm": 1.419547438621521, "learning_rate": 1.5760216346153847e-05, "loss": 0.8905, "step": 2380 }, { "epoch": 1.3889456030333966, "grad_norm": 1.1597005128860474, "learning_rate": 1.574519230769231e-05, "loss": 0.7328, "step": 2381 }, { "epoch": 1.3895289485197608, "grad_norm": 1.3240809440612793, "learning_rate": 1.573016826923077e-05, "loss": 0.9296, "step": 2382 }, { "epoch": 1.390112294006125, "grad_norm": 1.1506272554397583, "learning_rate": 1.5715144230769232e-05, "loss": 0.693, "step": 2383 }, { "epoch": 1.3906956394924894, "grad_norm": 1.0295346975326538, "learning_rate": 1.5700120192307694e-05, "loss": 0.8175, "step": 2384 }, { "epoch": 1.3912789849788538, "grad_norm": 1.1327847242355347, "learning_rate": 1.5685096153846153e-05, "loss": 0.8946, "step": 2385 }, { "epoch": 1.391862330465218, "grad_norm": 1.3098478317260742, "learning_rate": 1.5670072115384615e-05, "loss": 0.7245, "step": 2386 }, { "epoch": 1.3924456759515822, "grad_norm": 1.2984569072723389, "learning_rate": 1.565504807692308e-05, "loss": 1.0087, "step": 2387 }, { "epoch": 1.3930290214379466, "grad_norm": 1.20249605178833, "learning_rate": 1.564002403846154e-05, "loss": 0.9404, "step": 2388 }, { "epoch": 1.393612366924311, "grad_norm": 1.116599678993225, "learning_rate": 1.5625e-05, "loss": 0.8229, "step": 2389 }, { "epoch": 1.3941957124106752, "grad_norm": 1.4463697671890259, "learning_rate": 1.5609975961538462e-05, "loss": 0.8928, "step": 2390 }, { "epoch": 1.3947790578970396, "grad_norm": 1.3660231828689575, "learning_rate": 1.5594951923076924e-05, "loss": 0.7133, "step": 2391 }, { "epoch": 1.3953624033834038, "grad_norm": 1.2797685861587524, "learning_rate": 1.5579927884615386e-05, "loss": 0.8965, "step": 2392 }, { "epoch": 1.3959457488697682, "grad_norm": 1.3361741304397583, "learning_rate": 1.5564903846153848e-05, "loss": 0.7193, "step": 2393 }, { "epoch": 1.3965290943561324, "grad_norm": 1.1247235536575317, "learning_rate": 1.5549879807692306e-05, "loss": 0.7304, "step": 2394 }, { "epoch": 1.3971124398424968, "grad_norm": 1.4382926225662231, "learning_rate": 1.5534855769230768e-05, "loss": 0.8633, "step": 2395 }, { "epoch": 1.397695785328861, "grad_norm": 1.2390516996383667, "learning_rate": 1.5519831730769234e-05, "loss": 0.8239, "step": 2396 }, { "epoch": 1.3982791308152254, "grad_norm": 1.3460718393325806, "learning_rate": 1.5504807692307692e-05, "loss": 1.0886, "step": 2397 }, { "epoch": 1.3988624763015896, "grad_norm": 1.125791311264038, "learning_rate": 1.5489783653846154e-05, "loss": 0.8253, "step": 2398 }, { "epoch": 1.399445821787954, "grad_norm": 1.2912181615829468, "learning_rate": 1.5474759615384616e-05, "loss": 0.7138, "step": 2399 }, { "epoch": 1.4000291672743181, "grad_norm": 1.1510553359985352, "learning_rate": 1.5459735576923078e-05, "loss": 0.932, "step": 2400 }, { "epoch": 1.4000291672743181, "eval_loss_squad": 0.8576014773617499, "eval_perplexity": 8.10681335191388, "eval_perplexity_reconstruct": 1.909424481734322, "step": 2400 }, { "epoch": 1.4006125127606825, "grad_norm": 1.429721474647522, "learning_rate": 1.544471153846154e-05, "loss": 0.7134, "step": 2401 }, { "epoch": 1.4011958582470467, "grad_norm": 1.3556239604949951, "learning_rate": 1.54296875e-05, "loss": 0.8164, "step": 2402 }, { "epoch": 1.4017792037334111, "grad_norm": 1.2837475538253784, "learning_rate": 1.541466346153846e-05, "loss": 0.678, "step": 2403 }, { "epoch": 1.4023625492197755, "grad_norm": 1.1232627630233765, "learning_rate": 1.5399639423076922e-05, "loss": 0.8262, "step": 2404 }, { "epoch": 1.4029458947061397, "grad_norm": 1.3286436796188354, "learning_rate": 1.5384615384615387e-05, "loss": 0.6756, "step": 2405 }, { "epoch": 1.403529240192504, "grad_norm": 1.2372446060180664, "learning_rate": 1.5369591346153846e-05, "loss": 0.7997, "step": 2406 }, { "epoch": 1.4041125856788683, "grad_norm": 1.1915537118911743, "learning_rate": 1.5354567307692308e-05, "loss": 0.7118, "step": 2407 }, { "epoch": 1.4046959311652327, "grad_norm": 1.241876244544983, "learning_rate": 1.533954326923077e-05, "loss": 0.7073, "step": 2408 }, { "epoch": 1.405279276651597, "grad_norm": 1.4120349884033203, "learning_rate": 1.532451923076923e-05, "loss": 0.8801, "step": 2409 }, { "epoch": 1.405862622137961, "grad_norm": 1.0920681953430176, "learning_rate": 1.5309495192307693e-05, "loss": 0.8118, "step": 2410 }, { "epoch": 1.4064459676243255, "grad_norm": 1.3178508281707764, "learning_rate": 1.5294471153846155e-05, "loss": 0.8296, "step": 2411 }, { "epoch": 1.40702931311069, "grad_norm": 1.307503342628479, "learning_rate": 1.5279447115384614e-05, "loss": 0.8255, "step": 2412 }, { "epoch": 1.407612658597054, "grad_norm": 1.4173907041549683, "learning_rate": 1.526442307692308e-05, "loss": 0.7455, "step": 2413 }, { "epoch": 1.4081960040834183, "grad_norm": 1.2470276355743408, "learning_rate": 1.5249399038461539e-05, "loss": 0.7058, "step": 2414 }, { "epoch": 1.4087793495697827, "grad_norm": 1.1090408563613892, "learning_rate": 1.5234375000000001e-05, "loss": 0.7262, "step": 2415 }, { "epoch": 1.409362695056147, "grad_norm": 1.0376850366592407, "learning_rate": 1.5219350961538461e-05, "loss": 0.6755, "step": 2416 }, { "epoch": 1.4099460405425113, "grad_norm": 1.1430914402008057, "learning_rate": 1.5204326923076923e-05, "loss": 0.7457, "step": 2417 }, { "epoch": 1.4105293860288757, "grad_norm": 1.2482012510299683, "learning_rate": 1.5189302884615387e-05, "loss": 0.6143, "step": 2418 }, { "epoch": 1.4111127315152399, "grad_norm": 1.3330813646316528, "learning_rate": 1.5174278846153847e-05, "loss": 0.6651, "step": 2419 }, { "epoch": 1.4116960770016043, "grad_norm": 1.3887958526611328, "learning_rate": 1.5159254807692309e-05, "loss": 0.9114, "step": 2420 }, { "epoch": 1.4122794224879684, "grad_norm": 1.328403353691101, "learning_rate": 1.5144230769230769e-05, "loss": 0.8242, "step": 2421 }, { "epoch": 1.4128627679743329, "grad_norm": 1.185078740119934, "learning_rate": 1.5129206730769233e-05, "loss": 0.6619, "step": 2422 }, { "epoch": 1.413446113460697, "grad_norm": 1.2361003160476685, "learning_rate": 1.5114182692307693e-05, "loss": 0.7416, "step": 2423 }, { "epoch": 1.4140294589470614, "grad_norm": 1.1306205987930298, "learning_rate": 1.5099158653846155e-05, "loss": 0.8077, "step": 2424 }, { "epoch": 1.4146128044334256, "grad_norm": 0.9908915162086487, "learning_rate": 1.5084134615384615e-05, "loss": 0.8779, "step": 2425 }, { "epoch": 1.41519614991979, "grad_norm": 1.1640621423721313, "learning_rate": 1.5069110576923078e-05, "loss": 0.8623, "step": 2426 }, { "epoch": 1.4157794954061542, "grad_norm": 2.4493496417999268, "learning_rate": 1.505408653846154e-05, "loss": 0.9976, "step": 2427 }, { "epoch": 1.4163628408925186, "grad_norm": 1.1745401620864868, "learning_rate": 1.50390625e-05, "loss": 0.7252, "step": 2428 }, { "epoch": 1.4169461863788828, "grad_norm": 1.2570523023605347, "learning_rate": 1.5024038461538462e-05, "loss": 0.7836, "step": 2429 }, { "epoch": 1.4175295318652472, "grad_norm": 1.1713680028915405, "learning_rate": 1.5009014423076923e-05, "loss": 0.7318, "step": 2430 }, { "epoch": 1.4181128773516116, "grad_norm": 1.224394679069519, "learning_rate": 1.4993990384615386e-05, "loss": 0.619, "step": 2431 }, { "epoch": 1.4186962228379758, "grad_norm": 1.089404821395874, "learning_rate": 1.4978966346153846e-05, "loss": 0.712, "step": 2432 }, { "epoch": 1.41927956832434, "grad_norm": 1.172507643699646, "learning_rate": 1.4963942307692308e-05, "loss": 0.6563, "step": 2433 }, { "epoch": 1.4198629138107044, "grad_norm": 1.3817616701126099, "learning_rate": 1.4948918269230768e-05, "loss": 0.6572, "step": 2434 }, { "epoch": 1.4204462592970688, "grad_norm": 1.1678885221481323, "learning_rate": 1.4933894230769232e-05, "loss": 0.9326, "step": 2435 }, { "epoch": 1.421029604783433, "grad_norm": 1.2956814765930176, "learning_rate": 1.4918870192307694e-05, "loss": 0.8578, "step": 2436 }, { "epoch": 1.4216129502697972, "grad_norm": 0.7640461325645447, "learning_rate": 1.4903846153846154e-05, "loss": 0.8301, "step": 2437 }, { "epoch": 1.4221962957561616, "grad_norm": 1.1869699954986572, "learning_rate": 1.4888822115384616e-05, "loss": 0.8132, "step": 2438 }, { "epoch": 1.422779641242526, "grad_norm": 1.1306724548339844, "learning_rate": 1.487379807692308e-05, "loss": 0.7028, "step": 2439 }, { "epoch": 1.4233629867288902, "grad_norm": 1.2320770025253296, "learning_rate": 1.485877403846154e-05, "loss": 0.6797, "step": 2440 }, { "epoch": 1.4239463322152544, "grad_norm": 1.3738675117492676, "learning_rate": 1.484375e-05, "loss": 0.752, "step": 2441 }, { "epoch": 1.4245296777016188, "grad_norm": 1.2884392738342285, "learning_rate": 1.4828725961538462e-05, "loss": 0.7435, "step": 2442 }, { "epoch": 1.4251130231879832, "grad_norm": 1.3447365760803223, "learning_rate": 1.4813701923076922e-05, "loss": 0.6959, "step": 2443 }, { "epoch": 1.4256963686743473, "grad_norm": 1.270371913909912, "learning_rate": 1.4798677884615386e-05, "loss": 0.8662, "step": 2444 }, { "epoch": 1.4262797141607118, "grad_norm": 1.3903402090072632, "learning_rate": 1.4783653846153848e-05, "loss": 0.7844, "step": 2445 }, { "epoch": 1.426863059647076, "grad_norm": 1.2270903587341309, "learning_rate": 1.4768629807692308e-05, "loss": 0.6552, "step": 2446 }, { "epoch": 1.4274464051334403, "grad_norm": 1.170554280281067, "learning_rate": 1.475360576923077e-05, "loss": 0.9123, "step": 2447 }, { "epoch": 1.4280297506198045, "grad_norm": 0.9950125217437744, "learning_rate": 1.4738581730769233e-05, "loss": 0.7512, "step": 2448 }, { "epoch": 1.428613096106169, "grad_norm": 1.3289663791656494, "learning_rate": 1.4723557692307693e-05, "loss": 0.7338, "step": 2449 }, { "epoch": 1.4291964415925331, "grad_norm": 1.298034429550171, "learning_rate": 1.4708533653846154e-05, "loss": 0.7775, "step": 2450 }, { "epoch": 1.4297797870788975, "grad_norm": 1.3974965810775757, "learning_rate": 1.4693509615384615e-05, "loss": 0.7119, "step": 2451 }, { "epoch": 1.4303631325652617, "grad_norm": 1.3868650197982788, "learning_rate": 1.4678485576923079e-05, "loss": 0.8351, "step": 2452 }, { "epoch": 1.4309464780516261, "grad_norm": 1.093425989151001, "learning_rate": 1.466346153846154e-05, "loss": 0.505, "step": 2453 }, { "epoch": 1.4315298235379903, "grad_norm": 1.075796127319336, "learning_rate": 1.4648437500000001e-05, "loss": 1.0527, "step": 2454 }, { "epoch": 1.4321131690243547, "grad_norm": 1.2513554096221924, "learning_rate": 1.4633413461538461e-05, "loss": 0.6941, "step": 2455 }, { "epoch": 1.4326965145107189, "grad_norm": 1.2571288347244263, "learning_rate": 1.4618389423076923e-05, "loss": 0.8531, "step": 2456 }, { "epoch": 1.4332798599970833, "grad_norm": 1.139385461807251, "learning_rate": 1.4603365384615387e-05, "loss": 0.648, "step": 2457 }, { "epoch": 1.4338632054834477, "grad_norm": 1.1437668800354004, "learning_rate": 1.4588341346153847e-05, "loss": 0.7422, "step": 2458 }, { "epoch": 1.4344465509698119, "grad_norm": 1.3354110717773438, "learning_rate": 1.4573317307692307e-05, "loss": 0.8899, "step": 2459 }, { "epoch": 1.435029896456176, "grad_norm": 1.1407440900802612, "learning_rate": 1.4558293269230769e-05, "loss": 0.7084, "step": 2460 }, { "epoch": 1.4356132419425405, "grad_norm": 1.2988560199737549, "learning_rate": 1.4543269230769233e-05, "loss": 0.803, "step": 2461 }, { "epoch": 1.4361965874289049, "grad_norm": 1.2152442932128906, "learning_rate": 1.4528245192307693e-05, "loss": 0.5639, "step": 2462 }, { "epoch": 1.436779932915269, "grad_norm": 1.1522806882858276, "learning_rate": 1.4513221153846155e-05, "loss": 0.8283, "step": 2463 }, { "epoch": 1.4373632784016332, "grad_norm": 1.3547604084014893, "learning_rate": 1.4498197115384615e-05, "loss": 0.9728, "step": 2464 }, { "epoch": 1.4379466238879977, "grad_norm": 1.0888975858688354, "learning_rate": 1.4483173076923079e-05, "loss": 0.9095, "step": 2465 }, { "epoch": 1.438529969374362, "grad_norm": 1.2343162298202515, "learning_rate": 1.446814903846154e-05, "loss": 0.8971, "step": 2466 }, { "epoch": 1.4391133148607262, "grad_norm": 1.2880823612213135, "learning_rate": 1.4453125e-05, "loss": 0.758, "step": 2467 }, { "epoch": 1.4396966603470904, "grad_norm": 1.273984670639038, "learning_rate": 1.443810096153846e-05, "loss": 0.8011, "step": 2468 }, { "epoch": 1.4402800058334548, "grad_norm": 1.221825122833252, "learning_rate": 1.4423076923076923e-05, "loss": 0.7709, "step": 2469 }, { "epoch": 1.4408633513198192, "grad_norm": 1.1602380275726318, "learning_rate": 1.4408052884615386e-05, "loss": 0.8517, "step": 2470 }, { "epoch": 1.4414466968061834, "grad_norm": 1.3913099765777588, "learning_rate": 1.4393028846153847e-05, "loss": 0.8339, "step": 2471 }, { "epoch": 1.4420300422925478, "grad_norm": 1.3142465353012085, "learning_rate": 1.4378004807692308e-05, "loss": 0.7022, "step": 2472 }, { "epoch": 1.442613387778912, "grad_norm": 1.2736009359359741, "learning_rate": 1.4362980769230769e-05, "loss": 0.8779, "step": 2473 }, { "epoch": 1.4431967332652764, "grad_norm": 1.3056821823120117, "learning_rate": 1.4347956730769232e-05, "loss": 0.8236, "step": 2474 }, { "epoch": 1.4437800787516406, "grad_norm": 1.3933846950531006, "learning_rate": 1.4332932692307694e-05, "loss": 0.9686, "step": 2475 }, { "epoch": 1.444363424238005, "grad_norm": 1.3702456951141357, "learning_rate": 1.4317908653846154e-05, "loss": 0.8735, "step": 2476 }, { "epoch": 1.4449467697243692, "grad_norm": 1.1643601655960083, "learning_rate": 1.4302884615384614e-05, "loss": 0.6383, "step": 2477 }, { "epoch": 1.4455301152107336, "grad_norm": 1.2481948137283325, "learning_rate": 1.428786057692308e-05, "loss": 0.8129, "step": 2478 }, { "epoch": 1.4461134606970978, "grad_norm": 1.904402494430542, "learning_rate": 1.427283653846154e-05, "loss": 0.6686, "step": 2479 }, { "epoch": 1.4466968061834622, "grad_norm": 1.0331618785858154, "learning_rate": 1.42578125e-05, "loss": 0.6951, "step": 2480 }, { "epoch": 1.4472801516698264, "grad_norm": 1.3622822761535645, "learning_rate": 1.4242788461538462e-05, "loss": 0.8933, "step": 2481 }, { "epoch": 1.4478634971561908, "grad_norm": 1.3097389936447144, "learning_rate": 1.4227764423076922e-05, "loss": 0.8643, "step": 2482 }, { "epoch": 1.448446842642555, "grad_norm": 1.1944518089294434, "learning_rate": 1.4212740384615386e-05, "loss": 0.7313, "step": 2483 }, { "epoch": 1.4490301881289194, "grad_norm": 1.163017749786377, "learning_rate": 1.4197716346153848e-05, "loss": 0.8936, "step": 2484 }, { "epoch": 1.4496135336152838, "grad_norm": 1.350412130355835, "learning_rate": 1.4182692307692308e-05, "loss": 0.6471, "step": 2485 }, { "epoch": 1.450196879101648, "grad_norm": 1.1196433305740356, "learning_rate": 1.4167668269230768e-05, "loss": 0.9139, "step": 2486 }, { "epoch": 1.4507802245880121, "grad_norm": 1.0108141899108887, "learning_rate": 1.4152644230769233e-05, "loss": 0.7733, "step": 2487 }, { "epoch": 1.4513635700743766, "grad_norm": 1.1067391633987427, "learning_rate": 1.4137620192307694e-05, "loss": 0.5762, "step": 2488 }, { "epoch": 1.451946915560741, "grad_norm": 1.3588923215866089, "learning_rate": 1.4122596153846154e-05, "loss": 0.9662, "step": 2489 }, { "epoch": 1.4525302610471051, "grad_norm": 1.15555739402771, "learning_rate": 1.4107572115384616e-05, "loss": 0.7992, "step": 2490 }, { "epoch": 1.4531136065334693, "grad_norm": 1.3854424953460693, "learning_rate": 1.409254807692308e-05, "loss": 0.7705, "step": 2491 }, { "epoch": 1.4536969520198337, "grad_norm": 1.2606521844863892, "learning_rate": 1.407752403846154e-05, "loss": 0.7152, "step": 2492 }, { "epoch": 1.4542802975061981, "grad_norm": 1.3292980194091797, "learning_rate": 1.4062500000000001e-05, "loss": 0.9197, "step": 2493 }, { "epoch": 1.4548636429925623, "grad_norm": 1.5681556463241577, "learning_rate": 1.4047475961538462e-05, "loss": 0.811, "step": 2494 }, { "epoch": 1.4554469884789265, "grad_norm": 1.1809593439102173, "learning_rate": 1.4032451923076922e-05, "loss": 1.0052, "step": 2495 }, { "epoch": 1.456030333965291, "grad_norm": 1.1254372596740723, "learning_rate": 1.4017427884615387e-05, "loss": 0.7758, "step": 2496 }, { "epoch": 1.4566136794516553, "grad_norm": 1.2921695709228516, "learning_rate": 1.4002403846153847e-05, "loss": 0.7628, "step": 2497 }, { "epoch": 1.4571970249380195, "grad_norm": 1.3415536880493164, "learning_rate": 1.3987379807692307e-05, "loss": 0.6855, "step": 2498 }, { "epoch": 1.457780370424384, "grad_norm": 1.3940538167953491, "learning_rate": 1.397235576923077e-05, "loss": 0.7203, "step": 2499 }, { "epoch": 1.458363715910748, "grad_norm": 1.2635236978530884, "learning_rate": 1.3957331730769233e-05, "loss": 0.843, "step": 2500 }, { "epoch": 1.4589470613971125, "grad_norm": 1.1825194358825684, "learning_rate": 1.3942307692307693e-05, "loss": 0.6945, "step": 2501 }, { "epoch": 1.4595304068834767, "grad_norm": 1.2193362712860107, "learning_rate": 1.3927283653846155e-05, "loss": 0.787, "step": 2502 }, { "epoch": 1.460113752369841, "grad_norm": 1.249801754951477, "learning_rate": 1.3912259615384615e-05, "loss": 0.856, "step": 2503 }, { "epoch": 1.4606970978562053, "grad_norm": 1.1366738080978394, "learning_rate": 1.3897235576923079e-05, "loss": 0.696, "step": 2504 }, { "epoch": 1.4612804433425697, "grad_norm": 1.3033206462860107, "learning_rate": 1.388221153846154e-05, "loss": 0.8147, "step": 2505 }, { "epoch": 1.4618637888289339, "grad_norm": 1.3203999996185303, "learning_rate": 1.38671875e-05, "loss": 0.5789, "step": 2506 }, { "epoch": 1.4624471343152983, "grad_norm": 1.3760713338851929, "learning_rate": 1.3852163461538461e-05, "loss": 0.7991, "step": 2507 }, { "epoch": 1.4630304798016625, "grad_norm": 1.2778420448303223, "learning_rate": 1.3837139423076923e-05, "loss": 0.6555, "step": 2508 }, { "epoch": 1.4636138252880269, "grad_norm": 1.1887472867965698, "learning_rate": 1.3822115384615386e-05, "loss": 0.8536, "step": 2509 }, { "epoch": 1.464197170774391, "grad_norm": 1.0331052541732788, "learning_rate": 1.3807091346153847e-05, "loss": 0.6553, "step": 2510 }, { "epoch": 1.4647805162607554, "grad_norm": 1.2358107566833496, "learning_rate": 1.3792067307692309e-05, "loss": 0.8954, "step": 2511 }, { "epoch": 1.4653638617471199, "grad_norm": 1.4614425897598267, "learning_rate": 1.3777043269230769e-05, "loss": 0.7539, "step": 2512 }, { "epoch": 1.465947207233484, "grad_norm": 1.2347095012664795, "learning_rate": 1.3762019230769232e-05, "loss": 0.8088, "step": 2513 }, { "epoch": 1.4665305527198482, "grad_norm": 1.299397587776184, "learning_rate": 1.3746995192307694e-05, "loss": 0.5135, "step": 2514 }, { "epoch": 1.4671138982062126, "grad_norm": 1.1845791339874268, "learning_rate": 1.3731971153846154e-05, "loss": 0.9701, "step": 2515 }, { "epoch": 1.467697243692577, "grad_norm": 1.3026585578918457, "learning_rate": 1.3716947115384615e-05, "loss": 0.8863, "step": 2516 }, { "epoch": 1.4682805891789412, "grad_norm": 1.1109139919281006, "learning_rate": 1.3701923076923078e-05, "loss": 0.703, "step": 2517 }, { "epoch": 1.4688639346653054, "grad_norm": 1.1830486059188843, "learning_rate": 1.368689903846154e-05, "loss": 0.7957, "step": 2518 }, { "epoch": 1.4694472801516698, "grad_norm": 1.1587508916854858, "learning_rate": 1.3671875e-05, "loss": 1.0695, "step": 2519 }, { "epoch": 1.4700306256380342, "grad_norm": 1.0737591981887817, "learning_rate": 1.3656850961538462e-05, "loss": 0.7813, "step": 2520 }, { "epoch": 1.4706139711243984, "grad_norm": 1.2657713890075684, "learning_rate": 1.3641826923076922e-05, "loss": 0.7455, "step": 2521 }, { "epoch": 1.4711973166107628, "grad_norm": 1.3846890926361084, "learning_rate": 1.3626802884615386e-05, "loss": 0.7651, "step": 2522 }, { "epoch": 1.471780662097127, "grad_norm": 1.0715590715408325, "learning_rate": 1.3611778846153848e-05, "loss": 0.8357, "step": 2523 }, { "epoch": 1.4723640075834914, "grad_norm": 1.2787429094314575, "learning_rate": 1.3596754807692308e-05, "loss": 0.8817, "step": 2524 }, { "epoch": 1.4729473530698556, "grad_norm": 1.324184536933899, "learning_rate": 1.3581730769230768e-05, "loss": 0.8724, "step": 2525 }, { "epoch": 1.47353069855622, "grad_norm": 1.1958729028701782, "learning_rate": 1.3566706730769232e-05, "loss": 0.8143, "step": 2526 }, { "epoch": 1.4741140440425842, "grad_norm": 0.9904715418815613, "learning_rate": 1.3551682692307694e-05, "loss": 0.8887, "step": 2527 }, { "epoch": 1.4746973895289486, "grad_norm": 1.2931222915649414, "learning_rate": 1.3536658653846154e-05, "loss": 0.7175, "step": 2528 }, { "epoch": 1.4752807350153128, "grad_norm": 1.3142303228378296, "learning_rate": 1.3521634615384616e-05, "loss": 0.7999, "step": 2529 }, { "epoch": 1.4758640805016772, "grad_norm": 1.3524141311645508, "learning_rate": 1.350661057692308e-05, "loss": 0.7423, "step": 2530 }, { "epoch": 1.4764474259880414, "grad_norm": 1.4455997943878174, "learning_rate": 1.349158653846154e-05, "loss": 0.5973, "step": 2531 }, { "epoch": 1.4770307714744058, "grad_norm": 1.367997169494629, "learning_rate": 1.3476562500000001e-05, "loss": 0.7174, "step": 2532 }, { "epoch": 1.47761411696077, "grad_norm": 1.2403534650802612, "learning_rate": 1.3461538461538462e-05, "loss": 0.8182, "step": 2533 }, { "epoch": 1.4781974624471343, "grad_norm": 1.1000239849090576, "learning_rate": 1.3446514423076922e-05, "loss": 0.8016, "step": 2534 }, { "epoch": 1.4787808079334988, "grad_norm": 1.4198569059371948, "learning_rate": 1.3431490384615385e-05, "loss": 0.6896, "step": 2535 }, { "epoch": 1.479364153419863, "grad_norm": 1.2650169134140015, "learning_rate": 1.3416466346153847e-05, "loss": 0.7742, "step": 2536 }, { "epoch": 1.4799474989062271, "grad_norm": 1.4280163049697876, "learning_rate": 1.3401442307692308e-05, "loss": 0.9482, "step": 2537 }, { "epoch": 1.4805308443925915, "grad_norm": 1.2631523609161377, "learning_rate": 1.338641826923077e-05, "loss": 0.7142, "step": 2538 }, { "epoch": 1.481114189878956, "grad_norm": 1.236832857131958, "learning_rate": 1.3371394230769233e-05, "loss": 0.6786, "step": 2539 }, { "epoch": 1.4816975353653201, "grad_norm": 1.2481725215911865, "learning_rate": 1.3356370192307693e-05, "loss": 0.8536, "step": 2540 }, { "epoch": 1.4822808808516843, "grad_norm": 1.3706049919128418, "learning_rate": 1.3341346153846155e-05, "loss": 0.725, "step": 2541 }, { "epoch": 1.4828642263380487, "grad_norm": 1.2054919004440308, "learning_rate": 1.3326322115384615e-05, "loss": 0.9378, "step": 2542 }, { "epoch": 1.4834475718244131, "grad_norm": 1.659885048866272, "learning_rate": 1.3311298076923079e-05, "loss": 0.7564, "step": 2543 }, { "epoch": 1.4840309173107773, "grad_norm": 1.3692610263824463, "learning_rate": 1.3296274038461539e-05, "loss": 0.8413, "step": 2544 }, { "epoch": 1.4846142627971415, "grad_norm": 1.287308931350708, "learning_rate": 1.3281250000000001e-05, "loss": 0.695, "step": 2545 }, { "epoch": 1.4851976082835059, "grad_norm": 1.5530636310577393, "learning_rate": 1.3266225961538461e-05, "loss": 0.7356, "step": 2546 }, { "epoch": 1.4857809537698703, "grad_norm": 1.2729878425598145, "learning_rate": 1.3251201923076923e-05, "loss": 0.7533, "step": 2547 }, { "epoch": 1.4863642992562345, "grad_norm": 1.4389548301696777, "learning_rate": 1.3236177884615387e-05, "loss": 0.8441, "step": 2548 }, { "epoch": 1.4869476447425989, "grad_norm": 1.0692722797393799, "learning_rate": 1.3221153846153847e-05, "loss": 0.8289, "step": 2549 }, { "epoch": 1.487530990228963, "grad_norm": 1.1718000173568726, "learning_rate": 1.3206129807692309e-05, "loss": 0.7955, "step": 2550 }, { "epoch": 1.4881143357153275, "grad_norm": 1.1499435901641846, "learning_rate": 1.3191105769230769e-05, "loss": 0.7256, "step": 2551 }, { "epoch": 1.4886976812016917, "grad_norm": 1.250289797782898, "learning_rate": 1.3176081730769232e-05, "loss": 0.8081, "step": 2552 }, { "epoch": 1.489281026688056, "grad_norm": 1.243996024131775, "learning_rate": 1.3161057692307693e-05, "loss": 0.6775, "step": 2553 }, { "epoch": 1.4898643721744202, "grad_norm": 1.2744098901748657, "learning_rate": 1.3146033653846155e-05, "loss": 0.8471, "step": 2554 }, { "epoch": 1.4904477176607847, "grad_norm": 1.2613190412521362, "learning_rate": 1.3131009615384615e-05, "loss": 0.7044, "step": 2555 }, { "epoch": 1.4910310631471488, "grad_norm": 1.4642215967178345, "learning_rate": 1.3115985576923078e-05, "loss": 1.0189, "step": 2556 }, { "epoch": 1.4916144086335132, "grad_norm": 1.1679459810256958, "learning_rate": 1.310096153846154e-05, "loss": 0.8816, "step": 2557 }, { "epoch": 1.4921977541198774, "grad_norm": 1.1182615756988525, "learning_rate": 1.30859375e-05, "loss": 0.7353, "step": 2558 }, { "epoch": 1.4927810996062418, "grad_norm": 1.2033658027648926, "learning_rate": 1.3070913461538462e-05, "loss": 0.7741, "step": 2559 }, { "epoch": 1.493364445092606, "grad_norm": 1.1669368743896484, "learning_rate": 1.3055889423076923e-05, "loss": 0.8408, "step": 2560 }, { "epoch": 1.4939477905789704, "grad_norm": 1.4733059406280518, "learning_rate": 1.3040865384615386e-05, "loss": 0.7797, "step": 2561 }, { "epoch": 1.4945311360653348, "grad_norm": 1.431363582611084, "learning_rate": 1.3025841346153846e-05, "loss": 0.794, "step": 2562 }, { "epoch": 1.495114481551699, "grad_norm": 0.9812197685241699, "learning_rate": 1.3010817307692308e-05, "loss": 0.6596, "step": 2563 }, { "epoch": 1.4956978270380632, "grad_norm": 1.124457597732544, "learning_rate": 1.2995793269230768e-05, "loss": 0.906, "step": 2564 }, { "epoch": 1.4962811725244276, "grad_norm": 1.3098465204238892, "learning_rate": 1.2980769230769232e-05, "loss": 1.0137, "step": 2565 }, { "epoch": 1.496864518010792, "grad_norm": 1.0503084659576416, "learning_rate": 1.2965745192307694e-05, "loss": 0.7207, "step": 2566 }, { "epoch": 1.4974478634971562, "grad_norm": 1.2493394613265991, "learning_rate": 1.2950721153846154e-05, "loss": 0.9324, "step": 2567 }, { "epoch": 1.4980312089835204, "grad_norm": 1.1940001249313354, "learning_rate": 1.2935697115384616e-05, "loss": 0.7695, "step": 2568 }, { "epoch": 1.4986145544698848, "grad_norm": 1.2989429235458374, "learning_rate": 1.292067307692308e-05, "loss": 0.5233, "step": 2569 }, { "epoch": 1.4991978999562492, "grad_norm": 1.1896779537200928, "learning_rate": 1.290564903846154e-05, "loss": 0.8223, "step": 2570 }, { "epoch": 1.4997812454426134, "grad_norm": 1.2201910018920898, "learning_rate": 1.2890625e-05, "loss": 0.7695, "step": 2571 }, { "epoch": 1.5003645909289776, "grad_norm": 1.2268931865692139, "learning_rate": 1.2875600961538462e-05, "loss": 0.8114, "step": 2572 }, { "epoch": 1.500947936415342, "grad_norm": 1.1898248195648193, "learning_rate": 1.2860576923076922e-05, "loss": 0.8477, "step": 2573 }, { "epoch": 1.5015312819017064, "grad_norm": 1.4626970291137695, "learning_rate": 1.2845552884615386e-05, "loss": 0.6557, "step": 2574 }, { "epoch": 1.5021146273880706, "grad_norm": 1.2246737480163574, "learning_rate": 1.2830528846153847e-05, "loss": 0.9006, "step": 2575 }, { "epoch": 1.5026979728744347, "grad_norm": 1.2037559747695923, "learning_rate": 1.2815504807692308e-05, "loss": 0.8817, "step": 2576 }, { "epoch": 1.5032813183607991, "grad_norm": 1.3660948276519775, "learning_rate": 1.280048076923077e-05, "loss": 0.6706, "step": 2577 }, { "epoch": 1.5038646638471636, "grad_norm": 1.359358787536621, "learning_rate": 1.2785456730769233e-05, "loss": 0.8003, "step": 2578 }, { "epoch": 1.5044480093335277, "grad_norm": 1.2577052116394043, "learning_rate": 1.2770432692307693e-05, "loss": 0.9274, "step": 2579 }, { "epoch": 1.505031354819892, "grad_norm": 1.2104783058166504, "learning_rate": 1.2755408653846154e-05, "loss": 0.787, "step": 2580 }, { "epoch": 1.5056147003062563, "grad_norm": 1.3401927947998047, "learning_rate": 1.2740384615384615e-05, "loss": 0.7528, "step": 2581 }, { "epoch": 1.5061980457926207, "grad_norm": 1.3545399904251099, "learning_rate": 1.2725360576923079e-05, "loss": 0.7889, "step": 2582 }, { "epoch": 1.506781391278985, "grad_norm": 1.1138622760772705, "learning_rate": 1.271033653846154e-05, "loss": 0.9997, "step": 2583 }, { "epoch": 1.5073647367653493, "grad_norm": 1.2229743003845215, "learning_rate": 1.2695312500000001e-05, "loss": 0.7661, "step": 2584 }, { "epoch": 1.5079480822517137, "grad_norm": 1.1078226566314697, "learning_rate": 1.2680288461538461e-05, "loss": 0.6252, "step": 2585 }, { "epoch": 1.508531427738078, "grad_norm": 1.1622321605682373, "learning_rate": 1.2665264423076923e-05, "loss": 0.8268, "step": 2586 }, { "epoch": 1.509114773224442, "grad_norm": 1.3086098432540894, "learning_rate": 1.2650240384615387e-05, "loss": 0.8047, "step": 2587 }, { "epoch": 1.5096981187108065, "grad_norm": 1.4337576627731323, "learning_rate": 1.2635216346153847e-05, "loss": 0.8714, "step": 2588 }, { "epoch": 1.510281464197171, "grad_norm": 1.358109951019287, "learning_rate": 1.2620192307692307e-05, "loss": 0.7723, "step": 2589 }, { "epoch": 1.510864809683535, "grad_norm": 1.1631211042404175, "learning_rate": 1.2605168269230769e-05, "loss": 0.788, "step": 2590 }, { "epoch": 1.5114481551698993, "grad_norm": 1.421904444694519, "learning_rate": 1.2590144230769233e-05, "loss": 0.9168, "step": 2591 }, { "epoch": 1.5120315006562637, "grad_norm": 1.1729179620742798, "learning_rate": 1.2575120192307693e-05, "loss": 0.9585, "step": 2592 }, { "epoch": 1.512614846142628, "grad_norm": 1.3910799026489258, "learning_rate": 1.2560096153846155e-05, "loss": 0.6919, "step": 2593 }, { "epoch": 1.5131981916289923, "grad_norm": 1.3072963953018188, "learning_rate": 1.2545072115384615e-05, "loss": 0.7613, "step": 2594 }, { "epoch": 1.5137815371153565, "grad_norm": 1.4264005422592163, "learning_rate": 1.2530048076923079e-05, "loss": 0.9186, "step": 2595 }, { "epoch": 1.5143648826017209, "grad_norm": 1.0973105430603027, "learning_rate": 1.251502403846154e-05, "loss": 0.6772, "step": 2596 }, { "epoch": 1.5149482280880853, "grad_norm": 1.4456596374511719, "learning_rate": 1.25e-05, "loss": 0.7738, "step": 2597 }, { "epoch": 1.5155315735744495, "grad_norm": 1.246766209602356, "learning_rate": 1.248497596153846e-05, "loss": 0.9088, "step": 2598 }, { "epoch": 1.5161149190608136, "grad_norm": 1.170255422592163, "learning_rate": 1.2469951923076924e-05, "loss": 0.8891, "step": 2599 }, { "epoch": 1.516698264547178, "grad_norm": 1.2451422214508057, "learning_rate": 1.2454927884615385e-05, "loss": 0.8856, "step": 2600 }, { "epoch": 1.516698264547178, "eval_loss_squad": 0.8741619079979137, "eval_perplexity": 7.975801635804587, "eval_perplexity_reconstruct": 1.8950837793713629, "step": 2600 }, { "epoch": 1.5172816100335424, "grad_norm": 1.1058474779129028, "learning_rate": 1.2439903846153846e-05, "loss": 0.8623, "step": 2601 }, { "epoch": 1.5178649555199066, "grad_norm": 1.1985957622528076, "learning_rate": 1.2424879807692308e-05, "loss": 0.5664, "step": 2602 }, { "epoch": 1.5184483010062708, "grad_norm": 1.2489391565322876, "learning_rate": 1.240985576923077e-05, "loss": 0.9798, "step": 2603 }, { "epoch": 1.5190316464926352, "grad_norm": 1.2178716659545898, "learning_rate": 1.239483173076923e-05, "loss": 0.8585, "step": 2604 }, { "epoch": 1.5196149919789996, "grad_norm": 1.065341591835022, "learning_rate": 1.2379807692307694e-05, "loss": 0.6899, "step": 2605 }, { "epoch": 1.5201983374653638, "grad_norm": 1.3119381666183472, "learning_rate": 1.2364783653846154e-05, "loss": 0.743, "step": 2606 }, { "epoch": 1.520781682951728, "grad_norm": 1.1781443357467651, "learning_rate": 1.2349759615384616e-05, "loss": 0.7418, "step": 2607 }, { "epoch": 1.5213650284380926, "grad_norm": 1.3264269828796387, "learning_rate": 1.2334735576923078e-05, "loss": 0.7401, "step": 2608 }, { "epoch": 1.5219483739244568, "grad_norm": 1.2371203899383545, "learning_rate": 1.231971153846154e-05, "loss": 0.6457, "step": 2609 }, { "epoch": 1.522531719410821, "grad_norm": 1.1963739395141602, "learning_rate": 1.23046875e-05, "loss": 0.871, "step": 2610 }, { "epoch": 1.5231150648971854, "grad_norm": 1.4940956830978394, "learning_rate": 1.2289663461538462e-05, "loss": 0.6387, "step": 2611 }, { "epoch": 1.5236984103835498, "grad_norm": 1.158118724822998, "learning_rate": 1.2274639423076924e-05, "loss": 0.8764, "step": 2612 }, { "epoch": 1.524281755869914, "grad_norm": 1.2709519863128662, "learning_rate": 1.2259615384615384e-05, "loss": 0.795, "step": 2613 }, { "epoch": 1.5248651013562782, "grad_norm": 1.1874806880950928, "learning_rate": 1.2244591346153848e-05, "loss": 0.6796, "step": 2614 }, { "epoch": 1.5254484468426426, "grad_norm": 1.2645267248153687, "learning_rate": 1.2229567307692308e-05, "loss": 0.6134, "step": 2615 }, { "epoch": 1.526031792329007, "grad_norm": 1.4414528608322144, "learning_rate": 1.221454326923077e-05, "loss": 0.661, "step": 2616 }, { "epoch": 1.5266151378153712, "grad_norm": 1.1239551305770874, "learning_rate": 1.2199519230769232e-05, "loss": 0.6612, "step": 2617 }, { "epoch": 1.5271984833017354, "grad_norm": 1.0646827220916748, "learning_rate": 1.2184495192307694e-05, "loss": 0.8923, "step": 2618 }, { "epoch": 1.5277818287880998, "grad_norm": 1.2423840761184692, "learning_rate": 1.2169471153846154e-05, "loss": 1.1771, "step": 2619 }, { "epoch": 1.5283651742744642, "grad_norm": 1.0845423936843872, "learning_rate": 1.2154447115384617e-05, "loss": 0.735, "step": 2620 }, { "epoch": 1.5289485197608284, "grad_norm": 1.2939835786819458, "learning_rate": 1.2139423076923077e-05, "loss": 0.856, "step": 2621 }, { "epoch": 1.5295318652471925, "grad_norm": 1.2328243255615234, "learning_rate": 1.212439903846154e-05, "loss": 0.7687, "step": 2622 }, { "epoch": 1.530115210733557, "grad_norm": 1.2373398542404175, "learning_rate": 1.2109375000000001e-05, "loss": 0.7192, "step": 2623 }, { "epoch": 1.5306985562199213, "grad_norm": 1.3078012466430664, "learning_rate": 1.2094350961538461e-05, "loss": 1.0764, "step": 2624 }, { "epoch": 1.5312819017062855, "grad_norm": 1.34095299243927, "learning_rate": 1.2079326923076923e-05, "loss": 0.7552, "step": 2625 }, { "epoch": 1.5318652471926497, "grad_norm": 1.2611044645309448, "learning_rate": 1.2064302884615385e-05, "loss": 0.5605, "step": 2626 }, { "epoch": 1.5324485926790141, "grad_norm": 1.3855218887329102, "learning_rate": 1.2049278846153847e-05, "loss": 0.7463, "step": 2627 }, { "epoch": 1.5330319381653785, "grad_norm": 1.3558884859085083, "learning_rate": 1.2034254807692307e-05, "loss": 0.8794, "step": 2628 }, { "epoch": 1.5336152836517427, "grad_norm": 1.287604570388794, "learning_rate": 1.2019230769230771e-05, "loss": 0.8108, "step": 2629 }, { "epoch": 1.534198629138107, "grad_norm": 1.35361909866333, "learning_rate": 1.2004206730769231e-05, "loss": 0.7766, "step": 2630 }, { "epoch": 1.5347819746244713, "grad_norm": 1.1538270711898804, "learning_rate": 1.1989182692307693e-05, "loss": 0.8437, "step": 2631 }, { "epoch": 1.5353653201108357, "grad_norm": 1.212172031402588, "learning_rate": 1.1974158653846155e-05, "loss": 0.9256, "step": 2632 }, { "epoch": 1.5359486655972, "grad_norm": 1.2593048810958862, "learning_rate": 1.1959134615384617e-05, "loss": 0.8088, "step": 2633 }, { "epoch": 1.536532011083564, "grad_norm": 1.1822506189346313, "learning_rate": 1.1944110576923077e-05, "loss": 1.1436, "step": 2634 }, { "epoch": 1.5371153565699287, "grad_norm": 1.1885801553726196, "learning_rate": 1.1929086538461539e-05, "loss": 0.7486, "step": 2635 }, { "epoch": 1.5376987020562929, "grad_norm": 1.276310920715332, "learning_rate": 1.19140625e-05, "loss": 0.8676, "step": 2636 }, { "epoch": 1.538282047542657, "grad_norm": 1.1194807291030884, "learning_rate": 1.1899038461538461e-05, "loss": 0.8396, "step": 2637 }, { "epoch": 1.5388653930290215, "grad_norm": 1.291012167930603, "learning_rate": 1.1884014423076925e-05, "loss": 0.9238, "step": 2638 }, { "epoch": 1.5394487385153859, "grad_norm": 1.2062819004058838, "learning_rate": 1.1868990384615385e-05, "loss": 0.714, "step": 2639 }, { "epoch": 1.54003208400175, "grad_norm": 1.2483253479003906, "learning_rate": 1.1853966346153847e-05, "loss": 0.7584, "step": 2640 }, { "epoch": 1.5406154294881143, "grad_norm": 1.4191927909851074, "learning_rate": 1.1838942307692309e-05, "loss": 0.9147, "step": 2641 }, { "epoch": 1.5411987749744787, "grad_norm": 1.292017936706543, "learning_rate": 1.182391826923077e-05, "loss": 0.783, "step": 2642 }, { "epoch": 1.541782120460843, "grad_norm": 1.4348483085632324, "learning_rate": 1.180889423076923e-05, "loss": 0.8497, "step": 2643 }, { "epoch": 1.5423654659472072, "grad_norm": 1.4889435768127441, "learning_rate": 1.1793870192307692e-05, "loss": 0.8401, "step": 2644 }, { "epoch": 1.5429488114335714, "grad_norm": 1.2667125463485718, "learning_rate": 1.1778846153846154e-05, "loss": 0.8579, "step": 2645 }, { "epoch": 1.5435321569199358, "grad_norm": 1.2864303588867188, "learning_rate": 1.1763822115384616e-05, "loss": 0.7385, "step": 2646 }, { "epoch": 1.5441155024063002, "grad_norm": 1.079271912574768, "learning_rate": 1.1748798076923078e-05, "loss": 0.7407, "step": 2647 }, { "epoch": 1.5446988478926644, "grad_norm": 1.0965354442596436, "learning_rate": 1.173377403846154e-05, "loss": 0.7776, "step": 2648 }, { "epoch": 1.5452821933790286, "grad_norm": 1.260859489440918, "learning_rate": 1.171875e-05, "loss": 0.7992, "step": 2649 }, { "epoch": 1.545865538865393, "grad_norm": 1.2462226152420044, "learning_rate": 1.1703725961538462e-05, "loss": 1.0434, "step": 2650 }, { "epoch": 1.5464488843517574, "grad_norm": 1.1139503717422485, "learning_rate": 1.1688701923076924e-05, "loss": 0.6601, "step": 2651 }, { "epoch": 1.5470322298381216, "grad_norm": 1.375958800315857, "learning_rate": 1.1673677884615384e-05, "loss": 0.8542, "step": 2652 }, { "epoch": 1.5476155753244858, "grad_norm": 1.268680214881897, "learning_rate": 1.1658653846153846e-05, "loss": 0.9015, "step": 2653 }, { "epoch": 1.5481989208108502, "grad_norm": 1.3285831212997437, "learning_rate": 1.1643629807692308e-05, "loss": 0.7092, "step": 2654 }, { "epoch": 1.5487822662972146, "grad_norm": 1.4205631017684937, "learning_rate": 1.162860576923077e-05, "loss": 0.7721, "step": 2655 }, { "epoch": 1.5493656117835788, "grad_norm": 1.3641010522842407, "learning_rate": 1.1613581730769232e-05, "loss": 0.6439, "step": 2656 }, { "epoch": 1.549948957269943, "grad_norm": 1.1985429525375366, "learning_rate": 1.1598557692307694e-05, "loss": 0.7375, "step": 2657 }, { "epoch": 1.5505323027563074, "grad_norm": 1.200697898864746, "learning_rate": 1.1583533653846154e-05, "loss": 0.7262, "step": 2658 }, { "epoch": 1.5511156482426718, "grad_norm": 1.057250738143921, "learning_rate": 1.1568509615384616e-05, "loss": 0.9285, "step": 2659 }, { "epoch": 1.551698993729036, "grad_norm": 1.2788975238800049, "learning_rate": 1.1553485576923078e-05, "loss": 0.7772, "step": 2660 }, { "epoch": 1.5522823392154004, "grad_norm": 1.139838695526123, "learning_rate": 1.153846153846154e-05, "loss": 0.949, "step": 2661 }, { "epoch": 1.5528656847017648, "grad_norm": 1.4383840560913086, "learning_rate": 1.15234375e-05, "loss": 0.7398, "step": 2662 }, { "epoch": 1.553449030188129, "grad_norm": 1.2270270586013794, "learning_rate": 1.1508413461538462e-05, "loss": 0.8875, "step": 2663 }, { "epoch": 1.5540323756744931, "grad_norm": 1.2396215200424194, "learning_rate": 1.1493389423076924e-05, "loss": 0.8724, "step": 2664 }, { "epoch": 1.5546157211608576, "grad_norm": 1.183029294013977, "learning_rate": 1.1478365384615385e-05, "loss": 0.9306, "step": 2665 }, { "epoch": 1.555199066647222, "grad_norm": 1.2103503942489624, "learning_rate": 1.1463341346153847e-05, "loss": 0.7269, "step": 2666 }, { "epoch": 1.5557824121335861, "grad_norm": 1.5745123624801636, "learning_rate": 1.1448317307692307e-05, "loss": 0.5633, "step": 2667 }, { "epoch": 1.5563657576199503, "grad_norm": 1.3425438404083252, "learning_rate": 1.143329326923077e-05, "loss": 0.6148, "step": 2668 }, { "epoch": 1.5569491031063147, "grad_norm": 1.2065016031265259, "learning_rate": 1.1418269230769231e-05, "loss": 0.6517, "step": 2669 }, { "epoch": 1.5575324485926791, "grad_norm": 1.1521174907684326, "learning_rate": 1.1403245192307693e-05, "loss": 0.7774, "step": 2670 }, { "epoch": 1.5581157940790433, "grad_norm": 1.0685138702392578, "learning_rate": 1.1388221153846153e-05, "loss": 0.8086, "step": 2671 }, { "epoch": 1.5586991395654075, "grad_norm": 1.131866216659546, "learning_rate": 1.1373197115384617e-05, "loss": 0.8954, "step": 2672 }, { "epoch": 1.559282485051772, "grad_norm": 1.074537754058838, "learning_rate": 1.1358173076923077e-05, "loss": 0.8301, "step": 2673 }, { "epoch": 1.5598658305381363, "grad_norm": 1.255470633506775, "learning_rate": 1.1343149038461539e-05, "loss": 0.7473, "step": 2674 }, { "epoch": 1.5604491760245005, "grad_norm": 1.3070282936096191, "learning_rate": 1.1328125000000001e-05, "loss": 0.7232, "step": 2675 }, { "epoch": 1.5610325215108647, "grad_norm": 1.4808948040008545, "learning_rate": 1.1313100961538461e-05, "loss": 1.0816, "step": 2676 }, { "epoch": 1.561615866997229, "grad_norm": 1.2335243225097656, "learning_rate": 1.1298076923076923e-05, "loss": 0.9074, "step": 2677 }, { "epoch": 1.5621992124835935, "grad_norm": 1.1924381256103516, "learning_rate": 1.1283052884615385e-05, "loss": 0.6063, "step": 2678 }, { "epoch": 1.5627825579699577, "grad_norm": 1.2754387855529785, "learning_rate": 1.1268028846153847e-05, "loss": 0.8983, "step": 2679 }, { "epoch": 1.5633659034563219, "grad_norm": 1.326870083808899, "learning_rate": 1.1253004807692307e-05, "loss": 0.6428, "step": 2680 }, { "epoch": 1.5639492489426863, "grad_norm": 1.1194915771484375, "learning_rate": 1.123798076923077e-05, "loss": 0.6659, "step": 2681 }, { "epoch": 1.5645325944290507, "grad_norm": 1.119391679763794, "learning_rate": 1.122295673076923e-05, "loss": 0.8714, "step": 2682 }, { "epoch": 1.5651159399154149, "grad_norm": 1.299011468887329, "learning_rate": 1.1207932692307693e-05, "loss": 0.8289, "step": 2683 }, { "epoch": 1.565699285401779, "grad_norm": 1.0118248462677002, "learning_rate": 1.1192908653846155e-05, "loss": 0.6258, "step": 2684 }, { "epoch": 1.5662826308881435, "grad_norm": 1.1196413040161133, "learning_rate": 1.1177884615384616e-05, "loss": 0.7186, "step": 2685 }, { "epoch": 1.5668659763745079, "grad_norm": 1.2307488918304443, "learning_rate": 1.1162860576923077e-05, "loss": 0.7569, "step": 2686 }, { "epoch": 1.567449321860872, "grad_norm": 1.2402465343475342, "learning_rate": 1.114783653846154e-05, "loss": 0.8829, "step": 2687 }, { "epoch": 1.5680326673472365, "grad_norm": 1.262913465499878, "learning_rate": 1.11328125e-05, "loss": 0.8594, "step": 2688 }, { "epoch": 1.5686160128336009, "grad_norm": 1.1202564239501953, "learning_rate": 1.111778846153846e-05, "loss": 0.8337, "step": 2689 }, { "epoch": 1.569199358319965, "grad_norm": 1.263912320137024, "learning_rate": 1.1102764423076924e-05, "loss": 0.875, "step": 2690 }, { "epoch": 1.5697827038063292, "grad_norm": 1.3599956035614014, "learning_rate": 1.1087740384615384e-05, "loss": 0.7736, "step": 2691 }, { "epoch": 1.5703660492926936, "grad_norm": 1.435336947441101, "learning_rate": 1.1072716346153846e-05, "loss": 0.5259, "step": 2692 }, { "epoch": 1.570949394779058, "grad_norm": 1.3584843873977661, "learning_rate": 1.1057692307692308e-05, "loss": 0.6383, "step": 2693 }, { "epoch": 1.5715327402654222, "grad_norm": 1.44851815700531, "learning_rate": 1.104266826923077e-05, "loss": 0.8313, "step": 2694 }, { "epoch": 1.5721160857517864, "grad_norm": 1.316826343536377, "learning_rate": 1.102764423076923e-05, "loss": 0.6728, "step": 2695 }, { "epoch": 1.5726994312381508, "grad_norm": 1.5356096029281616, "learning_rate": 1.1012620192307694e-05, "loss": 0.7044, "step": 2696 }, { "epoch": 1.5732827767245152, "grad_norm": 1.1947566270828247, "learning_rate": 1.0997596153846154e-05, "loss": 0.6776, "step": 2697 }, { "epoch": 1.5738661222108794, "grad_norm": 1.3364490270614624, "learning_rate": 1.0982572115384616e-05, "loss": 0.8211, "step": 2698 }, { "epoch": 1.5744494676972436, "grad_norm": 1.2481436729431152, "learning_rate": 1.0967548076923078e-05, "loss": 0.9184, "step": 2699 }, { "epoch": 1.575032813183608, "grad_norm": 1.224586009979248, "learning_rate": 1.095252403846154e-05, "loss": 0.6932, "step": 2700 }, { "epoch": 1.5756161586699724, "grad_norm": 1.1394731998443604, "learning_rate": 1.09375e-05, "loss": 0.8501, "step": 2701 }, { "epoch": 1.5761995041563366, "grad_norm": 1.1618777513504028, "learning_rate": 1.0922475961538462e-05, "loss": 0.9925, "step": 2702 }, { "epoch": 1.5767828496427008, "grad_norm": 1.480178952217102, "learning_rate": 1.0907451923076924e-05, "loss": 0.8851, "step": 2703 }, { "epoch": 1.5773661951290652, "grad_norm": 1.0637935400009155, "learning_rate": 1.0892427884615384e-05, "loss": 0.7666, "step": 2704 }, { "epoch": 1.5779495406154296, "grad_norm": 1.2903245687484741, "learning_rate": 1.0877403846153847e-05, "loss": 0.7153, "step": 2705 }, { "epoch": 1.5785328861017938, "grad_norm": 1.4703887701034546, "learning_rate": 1.0862379807692308e-05, "loss": 0.7709, "step": 2706 }, { "epoch": 1.579116231588158, "grad_norm": 1.12290358543396, "learning_rate": 1.084735576923077e-05, "loss": 0.7594, "step": 2707 }, { "epoch": 1.5796995770745224, "grad_norm": 1.511317253112793, "learning_rate": 1.0832331730769231e-05, "loss": 0.9694, "step": 2708 }, { "epoch": 1.5802829225608868, "grad_norm": 1.799383282661438, "learning_rate": 1.0817307692307693e-05, "loss": 0.6799, "step": 2709 }, { "epoch": 1.580866268047251, "grad_norm": 1.1799027919769287, "learning_rate": 1.0802283653846154e-05, "loss": 0.8357, "step": 2710 }, { "epoch": 1.5814496135336151, "grad_norm": 1.2925686836242676, "learning_rate": 1.0787259615384617e-05, "loss": 0.6977, "step": 2711 }, { "epoch": 1.5820329590199795, "grad_norm": 1.284294843673706, "learning_rate": 1.0772235576923077e-05, "loss": 0.8429, "step": 2712 }, { "epoch": 1.582616304506344, "grad_norm": 1.1772273778915405, "learning_rate": 1.075721153846154e-05, "loss": 0.692, "step": 2713 }, { "epoch": 1.5831996499927081, "grad_norm": 1.1449207067489624, "learning_rate": 1.0742187500000001e-05, "loss": 0.7098, "step": 2714 }, { "epoch": 1.5837829954790725, "grad_norm": 1.4304105043411255, "learning_rate": 1.0727163461538461e-05, "loss": 0.8735, "step": 2715 }, { "epoch": 1.584366340965437, "grad_norm": 1.024904727935791, "learning_rate": 1.0712139423076923e-05, "loss": 0.7593, "step": 2716 }, { "epoch": 1.5849496864518011, "grad_norm": 1.2996480464935303, "learning_rate": 1.0697115384615385e-05, "loss": 0.7183, "step": 2717 }, { "epoch": 1.5855330319381653, "grad_norm": 1.3686482906341553, "learning_rate": 1.0682091346153847e-05, "loss": 0.9928, "step": 2718 }, { "epoch": 1.5861163774245297, "grad_norm": 1.2611160278320312, "learning_rate": 1.0667067307692307e-05, "loss": 0.937, "step": 2719 }, { "epoch": 1.5866997229108941, "grad_norm": 1.0135339498519897, "learning_rate": 1.065204326923077e-05, "loss": 0.8119, "step": 2720 }, { "epoch": 1.5872830683972583, "grad_norm": 1.1465262174606323, "learning_rate": 1.0637019230769231e-05, "loss": 0.788, "step": 2721 }, { "epoch": 1.5878664138836225, "grad_norm": 1.3036527633666992, "learning_rate": 1.0621995192307693e-05, "loss": 0.746, "step": 2722 }, { "epoch": 1.588449759369987, "grad_norm": 1.3011112213134766, "learning_rate": 1.0606971153846155e-05, "loss": 0.9224, "step": 2723 }, { "epoch": 1.5890331048563513, "grad_norm": 1.1597706079483032, "learning_rate": 1.0591947115384617e-05, "loss": 0.9486, "step": 2724 }, { "epoch": 1.5896164503427155, "grad_norm": 1.1415634155273438, "learning_rate": 1.0576923076923077e-05, "loss": 0.6527, "step": 2725 }, { "epoch": 1.5901997958290797, "grad_norm": 1.1724604368209839, "learning_rate": 1.056189903846154e-05, "loss": 0.6936, "step": 2726 }, { "epoch": 1.590783141315444, "grad_norm": 1.014355182647705, "learning_rate": 1.0546875e-05, "loss": 0.7157, "step": 2727 }, { "epoch": 1.5913664868018085, "grad_norm": 1.2243266105651855, "learning_rate": 1.053185096153846e-05, "loss": 0.7737, "step": 2728 }, { "epoch": 1.5919498322881727, "grad_norm": 1.347441554069519, "learning_rate": 1.0516826923076924e-05, "loss": 0.6544, "step": 2729 }, { "epoch": 1.5925331777745368, "grad_norm": 1.1011338233947754, "learning_rate": 1.0501802884615385e-05, "loss": 0.6885, "step": 2730 }, { "epoch": 1.5931165232609013, "grad_norm": 1.2773078680038452, "learning_rate": 1.0486778846153846e-05, "loss": 0.826, "step": 2731 }, { "epoch": 1.5936998687472657, "grad_norm": 1.3734136819839478, "learning_rate": 1.0471754807692308e-05, "loss": 0.8921, "step": 2732 }, { "epoch": 1.5942832142336298, "grad_norm": 1.4568568468093872, "learning_rate": 1.045673076923077e-05, "loss": 0.7519, "step": 2733 }, { "epoch": 1.594866559719994, "grad_norm": 1.0718237161636353, "learning_rate": 1.044170673076923e-05, "loss": 0.7066, "step": 2734 }, { "epoch": 1.5954499052063584, "grad_norm": 1.2433981895446777, "learning_rate": 1.0426682692307694e-05, "loss": 0.5966, "step": 2735 }, { "epoch": 1.5960332506927228, "grad_norm": 1.313547134399414, "learning_rate": 1.0411658653846154e-05, "loss": 0.714, "step": 2736 }, { "epoch": 1.596616596179087, "grad_norm": 1.2181950807571411, "learning_rate": 1.0396634615384616e-05, "loss": 0.8592, "step": 2737 }, { "epoch": 1.5971999416654512, "grad_norm": 1.5490280389785767, "learning_rate": 1.0381610576923078e-05, "loss": 0.8276, "step": 2738 }, { "epoch": 1.5977832871518156, "grad_norm": 1.3620082139968872, "learning_rate": 1.036658653846154e-05, "loss": 1.0517, "step": 2739 }, { "epoch": 1.59836663263818, "grad_norm": 1.155208945274353, "learning_rate": 1.03515625e-05, "loss": 0.6895, "step": 2740 }, { "epoch": 1.5989499781245442, "grad_norm": 1.2351773977279663, "learning_rate": 1.0336538461538462e-05, "loss": 0.5748, "step": 2741 }, { "epoch": 1.5995333236109086, "grad_norm": 0.9607925415039062, "learning_rate": 1.0321514423076924e-05, "loss": 0.87, "step": 2742 }, { "epoch": 1.600116669097273, "grad_norm": 1.342832326889038, "learning_rate": 1.0306490384615384e-05, "loss": 0.6786, "step": 2743 }, { "epoch": 1.6007000145836372, "grad_norm": 1.3285024166107178, "learning_rate": 1.0291466346153848e-05, "loss": 0.6508, "step": 2744 }, { "epoch": 1.6012833600700014, "grad_norm": 1.2747256755828857, "learning_rate": 1.0276442307692308e-05, "loss": 0.7721, "step": 2745 }, { "epoch": 1.6018667055563658, "grad_norm": 1.2518932819366455, "learning_rate": 1.026141826923077e-05, "loss": 0.7242, "step": 2746 }, { "epoch": 1.6024500510427302, "grad_norm": 1.4885451793670654, "learning_rate": 1.0246394230769232e-05, "loss": 0.831, "step": 2747 }, { "epoch": 1.6030333965290944, "grad_norm": 1.324741005897522, "learning_rate": 1.0231370192307693e-05, "loss": 0.7975, "step": 2748 }, { "epoch": 1.6036167420154586, "grad_norm": 1.2721138000488281, "learning_rate": 1.0216346153846154e-05, "loss": 0.8043, "step": 2749 }, { "epoch": 1.604200087501823, "grad_norm": 1.3388422727584839, "learning_rate": 1.0201322115384617e-05, "loss": 0.8146, "step": 2750 }, { "epoch": 1.6047834329881874, "grad_norm": 1.4431674480438232, "learning_rate": 1.0186298076923077e-05, "loss": 0.8857, "step": 2751 }, { "epoch": 1.6053667784745516, "grad_norm": 1.3353809118270874, "learning_rate": 1.017127403846154e-05, "loss": 0.6763, "step": 2752 }, { "epoch": 1.6059501239609157, "grad_norm": 1.4101769924163818, "learning_rate": 1.0156250000000001e-05, "loss": 0.7768, "step": 2753 }, { "epoch": 1.6065334694472801, "grad_norm": 1.2769949436187744, "learning_rate": 1.0141225961538461e-05, "loss": 0.803, "step": 2754 }, { "epoch": 1.6071168149336446, "grad_norm": 1.3187309503555298, "learning_rate": 1.0126201923076923e-05, "loss": 0.7019, "step": 2755 }, { "epoch": 1.6077001604200087, "grad_norm": 1.083141565322876, "learning_rate": 1.0111177884615385e-05, "loss": 0.8864, "step": 2756 }, { "epoch": 1.608283505906373, "grad_norm": 1.8760899305343628, "learning_rate": 1.0096153846153847e-05, "loss": 0.8891, "step": 2757 }, { "epoch": 1.6088668513927373, "grad_norm": 1.2959517240524292, "learning_rate": 1.0081129807692307e-05, "loss": 0.8422, "step": 2758 }, { "epoch": 1.6094501968791017, "grad_norm": 1.4058853387832642, "learning_rate": 1.0066105769230771e-05, "loss": 0.6244, "step": 2759 }, { "epoch": 1.610033542365466, "grad_norm": 1.154642939567566, "learning_rate": 1.0051081730769231e-05, "loss": 0.859, "step": 2760 }, { "epoch": 1.61061688785183, "grad_norm": 1.209807276725769, "learning_rate": 1.0036057692307693e-05, "loss": 0.6576, "step": 2761 }, { "epoch": 1.6112002333381945, "grad_norm": 1.3753244876861572, "learning_rate": 1.0021033653846155e-05, "loss": 0.6883, "step": 2762 }, { "epoch": 1.611783578824559, "grad_norm": 1.2860386371612549, "learning_rate": 1.0006009615384617e-05, "loss": 0.8279, "step": 2763 }, { "epoch": 1.612366924310923, "grad_norm": 1.2552486658096313, "learning_rate": 9.990985576923077e-06, "loss": 0.8707, "step": 2764 }, { "epoch": 1.6129502697972873, "grad_norm": 1.2394871711730957, "learning_rate": 9.975961538461539e-06, "loss": 0.9137, "step": 2765 }, { "epoch": 1.6135336152836517, "grad_norm": 1.0615895986557007, "learning_rate": 9.9609375e-06, "loss": 0.9192, "step": 2766 }, { "epoch": 1.614116960770016, "grad_norm": 1.2719849348068237, "learning_rate": 9.945913461538461e-06, "loss": 0.8623, "step": 2767 }, { "epoch": 1.6147003062563803, "grad_norm": 1.2009553909301758, "learning_rate": 9.930889423076924e-06, "loss": 0.7557, "step": 2768 }, { "epoch": 1.6152836517427447, "grad_norm": 1.2548385858535767, "learning_rate": 9.915865384615385e-06, "loss": 0.8356, "step": 2769 }, { "epoch": 1.615866997229109, "grad_norm": 1.4359416961669922, "learning_rate": 9.900841346153847e-06, "loss": 0.954, "step": 2770 }, { "epoch": 1.6164503427154733, "grad_norm": 1.3595879077911377, "learning_rate": 9.885817307692308e-06, "loss": 0.771, "step": 2771 }, { "epoch": 1.6170336882018375, "grad_norm": 1.4051216840744019, "learning_rate": 9.87079326923077e-06, "loss": 0.7762, "step": 2772 }, { "epoch": 1.6176170336882019, "grad_norm": 1.0524299144744873, "learning_rate": 9.85576923076923e-06, "loss": 0.7751, "step": 2773 }, { "epoch": 1.6182003791745663, "grad_norm": 1.2120068073272705, "learning_rate": 9.840745192307692e-06, "loss": 0.7958, "step": 2774 }, { "epoch": 1.6187837246609305, "grad_norm": 1.271024227142334, "learning_rate": 9.825721153846154e-06, "loss": 0.8882, "step": 2775 }, { "epoch": 1.6193670701472946, "grad_norm": 1.1276352405548096, "learning_rate": 9.810697115384616e-06, "loss": 0.8336, "step": 2776 }, { "epoch": 1.619950415633659, "grad_norm": 1.3857134580612183, "learning_rate": 9.795673076923078e-06, "loss": 0.766, "step": 2777 }, { "epoch": 1.6205337611200235, "grad_norm": 1.1864246129989624, "learning_rate": 9.78064903846154e-06, "loss": 0.6705, "step": 2778 }, { "epoch": 1.6211171066063876, "grad_norm": 1.193347692489624, "learning_rate": 9.765625e-06, "loss": 0.865, "step": 2779 }, { "epoch": 1.6217004520927518, "grad_norm": 1.2671443223953247, "learning_rate": 9.750600961538462e-06, "loss": 0.7479, "step": 2780 }, { "epoch": 1.6222837975791162, "grad_norm": 1.2168381214141846, "learning_rate": 9.735576923076924e-06, "loss": 0.8945, "step": 2781 }, { "epoch": 1.6228671430654806, "grad_norm": 1.0259840488433838, "learning_rate": 9.720552884615384e-06, "loss": 1.031, "step": 2782 }, { "epoch": 1.6234504885518448, "grad_norm": 1.3780772686004639, "learning_rate": 9.705528846153846e-06, "loss": 0.9144, "step": 2783 }, { "epoch": 1.624033834038209, "grad_norm": 1.1934678554534912, "learning_rate": 9.690504807692308e-06, "loss": 0.8131, "step": 2784 }, { "epoch": 1.6246171795245734, "grad_norm": 1.2687208652496338, "learning_rate": 9.67548076923077e-06, "loss": 0.7369, "step": 2785 }, { "epoch": 1.6252005250109378, "grad_norm": 1.2904934883117676, "learning_rate": 9.660456730769232e-06, "loss": 0.6605, "step": 2786 }, { "epoch": 1.625783870497302, "grad_norm": 1.6874489784240723, "learning_rate": 9.645432692307694e-06, "loss": 0.8216, "step": 2787 }, { "epoch": 1.6263672159836662, "grad_norm": 1.156012773513794, "learning_rate": 9.630408653846154e-06, "loss": 0.7344, "step": 2788 }, { "epoch": 1.6269505614700306, "grad_norm": 1.2649706602096558, "learning_rate": 9.615384615384616e-06, "loss": 0.7568, "step": 2789 }, { "epoch": 1.627533906956395, "grad_norm": 1.2135905027389526, "learning_rate": 9.600360576923078e-06, "loss": 0.7164, "step": 2790 }, { "epoch": 1.6281172524427592, "grad_norm": 1.3913686275482178, "learning_rate": 9.58533653846154e-06, "loss": 0.7437, "step": 2791 }, { "epoch": 1.6287005979291234, "grad_norm": 1.119268536567688, "learning_rate": 9.5703125e-06, "loss": 0.8903, "step": 2792 }, { "epoch": 1.6292839434154878, "grad_norm": 1.0997390747070312, "learning_rate": 9.555288461538462e-06, "loss": 0.8839, "step": 2793 }, { "epoch": 1.6298672889018522, "grad_norm": 1.7211315631866455, "learning_rate": 9.540264423076923e-06, "loss": 0.7962, "step": 2794 }, { "epoch": 1.6304506343882164, "grad_norm": 1.1959589719772339, "learning_rate": 9.525240384615385e-06, "loss": 0.7228, "step": 2795 }, { "epoch": 1.6310339798745808, "grad_norm": 1.3861689567565918, "learning_rate": 9.510216346153847e-06, "loss": 0.8062, "step": 2796 }, { "epoch": 1.6316173253609452, "grad_norm": 1.251761794090271, "learning_rate": 9.495192307692307e-06, "loss": 0.8465, "step": 2797 }, { "epoch": 1.6322006708473094, "grad_norm": 1.3125969171524048, "learning_rate": 9.48016826923077e-06, "loss": 0.9648, "step": 2798 }, { "epoch": 1.6327840163336735, "grad_norm": 1.0540010929107666, "learning_rate": 9.465144230769231e-06, "loss": 0.8933, "step": 2799 }, { "epoch": 1.633367361820038, "grad_norm": 1.2731456756591797, "learning_rate": 9.450120192307693e-06, "loss": 0.6307, "step": 2800 }, { "epoch": 1.633367361820038, "eval_loss_squad": 0.8574257939518429, "eval_perplexity": 8.239154888947215, "eval_perplexity_reconstruct": 1.9011622713541043, "step": 2800 }, { "epoch": 1.6339507073064024, "grad_norm": 1.4683425426483154, "learning_rate": 9.435096153846153e-06, "loss": 0.8113, "step": 2801 }, { "epoch": 1.6345340527927665, "grad_norm": 1.1839910745620728, "learning_rate": 9.420072115384617e-06, "loss": 0.6765, "step": 2802 }, { "epoch": 1.6351173982791307, "grad_norm": 1.1331663131713867, "learning_rate": 9.405048076923077e-06, "loss": 0.6725, "step": 2803 }, { "epoch": 1.6357007437654951, "grad_norm": 1.2307205200195312, "learning_rate": 9.390024038461539e-06, "loss": 0.728, "step": 2804 }, { "epoch": 1.6362840892518595, "grad_norm": 1.178563117980957, "learning_rate": 9.375000000000001e-06, "loss": 0.8463, "step": 2805 }, { "epoch": 1.6368674347382237, "grad_norm": 1.1501257419586182, "learning_rate": 9.359975961538461e-06, "loss": 0.9248, "step": 2806 }, { "epoch": 1.637450780224588, "grad_norm": 1.2466018199920654, "learning_rate": 9.344951923076923e-06, "loss": 0.8406, "step": 2807 }, { "epoch": 1.6380341257109523, "grad_norm": 1.2315740585327148, "learning_rate": 9.329927884615385e-06, "loss": 0.8917, "step": 2808 }, { "epoch": 1.6386174711973167, "grad_norm": 1.3349418640136719, "learning_rate": 9.314903846153847e-06, "loss": 0.9475, "step": 2809 }, { "epoch": 1.639200816683681, "grad_norm": 1.3950275182724, "learning_rate": 9.299879807692307e-06, "loss": 0.6757, "step": 2810 }, { "epoch": 1.639784162170045, "grad_norm": 1.2387348413467407, "learning_rate": 9.28485576923077e-06, "loss": 0.7381, "step": 2811 }, { "epoch": 1.6403675076564095, "grad_norm": 1.3129287958145142, "learning_rate": 9.26983173076923e-06, "loss": 0.6148, "step": 2812 }, { "epoch": 1.640950853142774, "grad_norm": 1.1648530960083008, "learning_rate": 9.254807692307693e-06, "loss": 0.7276, "step": 2813 }, { "epoch": 1.641534198629138, "grad_norm": 1.53678560256958, "learning_rate": 9.239783653846154e-06, "loss": 0.9696, "step": 2814 }, { "epoch": 1.6421175441155023, "grad_norm": 1.1140925884246826, "learning_rate": 9.224759615384616e-06, "loss": 0.6841, "step": 2815 }, { "epoch": 1.6427008896018667, "grad_norm": 1.36318838596344, "learning_rate": 9.209735576923077e-06, "loss": 0.8003, "step": 2816 }, { "epoch": 1.643284235088231, "grad_norm": 1.1023435592651367, "learning_rate": 9.19471153846154e-06, "loss": 0.7749, "step": 2817 }, { "epoch": 1.6438675805745953, "grad_norm": 1.7993230819702148, "learning_rate": 9.1796875e-06, "loss": 0.6514, "step": 2818 }, { "epoch": 1.6444509260609594, "grad_norm": 1.1305932998657227, "learning_rate": 9.16466346153846e-06, "loss": 0.802, "step": 2819 }, { "epoch": 1.645034271547324, "grad_norm": 1.2455143928527832, "learning_rate": 9.149639423076924e-06, "loss": 0.6783, "step": 2820 }, { "epoch": 1.6456176170336883, "grad_norm": 1.2018240690231323, "learning_rate": 9.134615384615384e-06, "loss": 0.6678, "step": 2821 }, { "epoch": 1.6462009625200524, "grad_norm": 1.5132325887680054, "learning_rate": 9.119591346153846e-06, "loss": 0.8185, "step": 2822 }, { "epoch": 1.6467843080064168, "grad_norm": 1.1789501905441284, "learning_rate": 9.104567307692308e-06, "loss": 0.7072, "step": 2823 }, { "epoch": 1.6473676534927812, "grad_norm": 1.2296137809753418, "learning_rate": 9.08954326923077e-06, "loss": 0.8957, "step": 2824 }, { "epoch": 1.6479509989791454, "grad_norm": 1.2670032978057861, "learning_rate": 9.07451923076923e-06, "loss": 1.0086, "step": 2825 }, { "epoch": 1.6485343444655096, "grad_norm": 1.2225745916366577, "learning_rate": 9.059495192307694e-06, "loss": 0.7689, "step": 2826 }, { "epoch": 1.649117689951874, "grad_norm": 1.3422561883926392, "learning_rate": 9.044471153846154e-06, "loss": 0.7682, "step": 2827 }, { "epoch": 1.6497010354382384, "grad_norm": 1.7664170265197754, "learning_rate": 9.029447115384616e-06, "loss": 0.9315, "step": 2828 }, { "epoch": 1.6502843809246026, "grad_norm": 1.141822338104248, "learning_rate": 9.014423076923078e-06, "loss": 0.7782, "step": 2829 }, { "epoch": 1.6508677264109668, "grad_norm": 1.266010046005249, "learning_rate": 8.99939903846154e-06, "loss": 0.7129, "step": 2830 }, { "epoch": 1.6514510718973312, "grad_norm": 1.1951196193695068, "learning_rate": 8.984375e-06, "loss": 0.8082, "step": 2831 }, { "epoch": 1.6520344173836956, "grad_norm": 1.1922125816345215, "learning_rate": 8.969350961538462e-06, "loss": 0.8675, "step": 2832 }, { "epoch": 1.6526177628700598, "grad_norm": 1.4151067733764648, "learning_rate": 8.954326923076924e-06, "loss": 0.7596, "step": 2833 }, { "epoch": 1.653201108356424, "grad_norm": 1.496769666671753, "learning_rate": 8.939302884615384e-06, "loss": 0.7633, "step": 2834 }, { "epoch": 1.6537844538427884, "grad_norm": 1.1096853017807007, "learning_rate": 8.924278846153847e-06, "loss": 0.782, "step": 2835 }, { "epoch": 1.6543677993291528, "grad_norm": 1.4348713159561157, "learning_rate": 8.909254807692308e-06, "loss": 0.7652, "step": 2836 }, { "epoch": 1.654951144815517, "grad_norm": 1.3088988065719604, "learning_rate": 8.89423076923077e-06, "loss": 0.7302, "step": 2837 }, { "epoch": 1.6555344903018812, "grad_norm": 1.1694996356964111, "learning_rate": 8.879206730769231e-06, "loss": 0.9367, "step": 2838 }, { "epoch": 1.6561178357882456, "grad_norm": 1.274279236793518, "learning_rate": 8.864182692307693e-06, "loss": 0.7277, "step": 2839 }, { "epoch": 1.65670118127461, "grad_norm": 1.4117878675460815, "learning_rate": 8.849158653846153e-06, "loss": 0.6493, "step": 2840 }, { "epoch": 1.6572845267609742, "grad_norm": 1.1404685974121094, "learning_rate": 8.834134615384617e-06, "loss": 0.8478, "step": 2841 }, { "epoch": 1.6578678722473383, "grad_norm": 1.1155657768249512, "learning_rate": 8.819110576923077e-06, "loss": 0.8945, "step": 2842 }, { "epoch": 1.6584512177337027, "grad_norm": 1.2752269506454468, "learning_rate": 8.804086538461539e-06, "loss": 0.7728, "step": 2843 }, { "epoch": 1.6590345632200671, "grad_norm": 1.6460522413253784, "learning_rate": 8.789062500000001e-06, "loss": 0.7511, "step": 2844 }, { "epoch": 1.6596179087064313, "grad_norm": 1.4639447927474976, "learning_rate": 8.774038461538461e-06, "loss": 0.6837, "step": 2845 }, { "epoch": 1.6602012541927955, "grad_norm": 1.4805471897125244, "learning_rate": 8.759014423076923e-06, "loss": 0.832, "step": 2846 }, { "epoch": 1.6607845996791601, "grad_norm": 1.1659775972366333, "learning_rate": 8.743990384615385e-06, "loss": 0.7268, "step": 2847 }, { "epoch": 1.6613679451655243, "grad_norm": 1.190755009651184, "learning_rate": 8.728966346153847e-06, "loss": 0.6614, "step": 2848 }, { "epoch": 1.6619512906518885, "grad_norm": 1.1789089441299438, "learning_rate": 8.713942307692307e-06, "loss": 0.8071, "step": 2849 }, { "epoch": 1.662534636138253, "grad_norm": 1.2296158075332642, "learning_rate": 8.69891826923077e-06, "loss": 0.6635, "step": 2850 }, { "epoch": 1.6631179816246173, "grad_norm": 1.1168346405029297, "learning_rate": 8.683894230769231e-06, "loss": 0.714, "step": 2851 }, { "epoch": 1.6637013271109815, "grad_norm": 1.1426185369491577, "learning_rate": 8.668870192307693e-06, "loss": 0.7065, "step": 2852 }, { "epoch": 1.6642846725973457, "grad_norm": 1.2550532817840576, "learning_rate": 8.653846153846155e-06, "loss": 0.6332, "step": 2853 }, { "epoch": 1.66486801808371, "grad_norm": 1.2261812686920166, "learning_rate": 8.638822115384617e-06, "loss": 0.7257, "step": 2854 }, { "epoch": 1.6654513635700745, "grad_norm": 1.1258302927017212, "learning_rate": 8.623798076923077e-06, "loss": 0.9171, "step": 2855 }, { "epoch": 1.6660347090564387, "grad_norm": 1.2768467664718628, "learning_rate": 8.60877403846154e-06, "loss": 0.6836, "step": 2856 }, { "epoch": 1.6666180545428029, "grad_norm": 0.9513541460037231, "learning_rate": 8.59375e-06, "loss": 0.7402, "step": 2857 }, { "epoch": 1.6672014000291673, "grad_norm": 1.2110565900802612, "learning_rate": 8.57872596153846e-06, "loss": 0.6598, "step": 2858 }, { "epoch": 1.6677847455155317, "grad_norm": 1.2651914358139038, "learning_rate": 8.563701923076924e-06, "loss": 0.8355, "step": 2859 }, { "epoch": 1.6683680910018959, "grad_norm": 1.2185043096542358, "learning_rate": 8.548677884615384e-06, "loss": 0.6839, "step": 2860 }, { "epoch": 1.66895143648826, "grad_norm": 1.2617533206939697, "learning_rate": 8.533653846153846e-06, "loss": 0.8926, "step": 2861 }, { "epoch": 1.6695347819746245, "grad_norm": 1.3251715898513794, "learning_rate": 8.518629807692308e-06, "loss": 0.956, "step": 2862 }, { "epoch": 1.6701181274609889, "grad_norm": 1.2607977390289307, "learning_rate": 8.50360576923077e-06, "loss": 0.81, "step": 2863 }, { "epoch": 1.670701472947353, "grad_norm": 1.0856292247772217, "learning_rate": 8.48858173076923e-06, "loss": 0.8838, "step": 2864 }, { "epoch": 1.6712848184337172, "grad_norm": 1.4323976039886475, "learning_rate": 8.473557692307694e-06, "loss": 0.908, "step": 2865 }, { "epoch": 1.6718681639200816, "grad_norm": 1.1892898082733154, "learning_rate": 8.458533653846154e-06, "loss": 0.8645, "step": 2866 }, { "epoch": 1.672451509406446, "grad_norm": 1.328147530555725, "learning_rate": 8.443509615384616e-06, "loss": 0.7748, "step": 2867 }, { "epoch": 1.6730348548928102, "grad_norm": 1.4214242696762085, "learning_rate": 8.428485576923078e-06, "loss": 0.8796, "step": 2868 }, { "epoch": 1.6736182003791744, "grad_norm": 0.941706120967865, "learning_rate": 8.41346153846154e-06, "loss": 0.7776, "step": 2869 }, { "epoch": 1.6742015458655388, "grad_norm": 1.255053997039795, "learning_rate": 8.3984375e-06, "loss": 0.6698, "step": 2870 }, { "epoch": 1.6747848913519032, "grad_norm": 1.1494503021240234, "learning_rate": 8.383413461538462e-06, "loss": 0.581, "step": 2871 }, { "epoch": 1.6753682368382674, "grad_norm": 1.3444379568099976, "learning_rate": 8.368389423076924e-06, "loss": 0.7983, "step": 2872 }, { "epoch": 1.6759515823246318, "grad_norm": 1.1211199760437012, "learning_rate": 8.353365384615384e-06, "loss": 0.6528, "step": 2873 }, { "epoch": 1.6765349278109962, "grad_norm": 1.30573570728302, "learning_rate": 8.338341346153848e-06, "loss": 0.7975, "step": 2874 }, { "epoch": 1.6771182732973604, "grad_norm": 1.0336012840270996, "learning_rate": 8.323317307692308e-06, "loss": 0.5304, "step": 2875 }, { "epoch": 1.6777016187837246, "grad_norm": 1.2422670125961304, "learning_rate": 8.30829326923077e-06, "loss": 0.6592, "step": 2876 }, { "epoch": 1.678284964270089, "grad_norm": 1.3174580335617065, "learning_rate": 8.293269230769232e-06, "loss": 0.7867, "step": 2877 }, { "epoch": 1.6788683097564534, "grad_norm": 1.1110163927078247, "learning_rate": 8.278245192307693e-06, "loss": 0.7349, "step": 2878 }, { "epoch": 1.6794516552428176, "grad_norm": 1.2657557725906372, "learning_rate": 8.263221153846154e-06, "loss": 0.9434, "step": 2879 }, { "epoch": 1.6800350007291818, "grad_norm": 1.080836296081543, "learning_rate": 8.248197115384616e-06, "loss": 0.7363, "step": 2880 }, { "epoch": 1.6806183462155462, "grad_norm": 1.3697772026062012, "learning_rate": 8.233173076923077e-06, "loss": 0.7499, "step": 2881 }, { "epoch": 1.6812016917019106, "grad_norm": 1.0886021852493286, "learning_rate": 8.21814903846154e-06, "loss": 0.9581, "step": 2882 }, { "epoch": 1.6817850371882748, "grad_norm": 1.3300132751464844, "learning_rate": 8.203125000000001e-06, "loss": 0.7848, "step": 2883 }, { "epoch": 1.682368382674639, "grad_norm": 1.4838621616363525, "learning_rate": 8.188100961538461e-06, "loss": 0.9271, "step": 2884 }, { "epoch": 1.6829517281610034, "grad_norm": 1.3038266897201538, "learning_rate": 8.173076923076923e-06, "loss": 0.7117, "step": 2885 }, { "epoch": 1.6835350736473678, "grad_norm": 1.1892757415771484, "learning_rate": 8.158052884615385e-06, "loss": 0.5654, "step": 2886 }, { "epoch": 1.684118419133732, "grad_norm": 1.1837880611419678, "learning_rate": 8.143028846153847e-06, "loss": 0.7745, "step": 2887 }, { "epoch": 1.6847017646200961, "grad_norm": 1.2857214212417603, "learning_rate": 8.128004807692307e-06, "loss": 0.7224, "step": 2888 }, { "epoch": 1.6852851101064605, "grad_norm": 1.1604810953140259, "learning_rate": 8.112980769230769e-06, "loss": 0.8615, "step": 2889 }, { "epoch": 1.685868455592825, "grad_norm": 1.1492291688919067, "learning_rate": 8.097956730769231e-06, "loss": 0.8235, "step": 2890 }, { "epoch": 1.6864518010791891, "grad_norm": 1.4180827140808105, "learning_rate": 8.082932692307693e-06, "loss": 0.7844, "step": 2891 }, { "epoch": 1.6870351465655533, "grad_norm": 1.3198400735855103, "learning_rate": 8.067908653846155e-06, "loss": 0.9021, "step": 2892 }, { "epoch": 1.6876184920519177, "grad_norm": 1.2082802057266235, "learning_rate": 8.052884615384617e-06, "loss": 0.8417, "step": 2893 }, { "epoch": 1.6882018375382821, "grad_norm": 1.4099825620651245, "learning_rate": 8.037860576923077e-06, "loss": 0.9223, "step": 2894 }, { "epoch": 1.6887851830246463, "grad_norm": 1.4837796688079834, "learning_rate": 8.022836538461539e-06, "loss": 1.0981, "step": 2895 }, { "epoch": 1.6893685285110105, "grad_norm": 1.2653106451034546, "learning_rate": 8.0078125e-06, "loss": 0.8805, "step": 2896 }, { "epoch": 1.689951873997375, "grad_norm": 1.1850988864898682, "learning_rate": 7.992788461538461e-06, "loss": 0.6671, "step": 2897 }, { "epoch": 1.6905352194837393, "grad_norm": 1.2096939086914062, "learning_rate": 7.977764423076923e-06, "loss": 0.615, "step": 2898 }, { "epoch": 1.6911185649701035, "grad_norm": 1.1689413785934448, "learning_rate": 7.962740384615385e-06, "loss": 0.7358, "step": 2899 }, { "epoch": 1.691701910456468, "grad_norm": 1.2683144807815552, "learning_rate": 7.947716346153847e-06, "loss": 0.8217, "step": 2900 }, { "epoch": 1.6922852559428323, "grad_norm": 1.4966411590576172, "learning_rate": 7.932692307692308e-06, "loss": 0.8058, "step": 2901 }, { "epoch": 1.6928686014291965, "grad_norm": 1.3604378700256348, "learning_rate": 7.91766826923077e-06, "loss": 0.8571, "step": 2902 }, { "epoch": 1.6934519469155607, "grad_norm": 1.2549089193344116, "learning_rate": 7.90264423076923e-06, "loss": 0.8095, "step": 2903 }, { "epoch": 1.694035292401925, "grad_norm": 1.293096899986267, "learning_rate": 7.887620192307692e-06, "loss": 0.888, "step": 2904 }, { "epoch": 1.6946186378882895, "grad_norm": 1.2265640497207642, "learning_rate": 7.872596153846154e-06, "loss": 0.6778, "step": 2905 }, { "epoch": 1.6952019833746537, "grad_norm": 1.3105190992355347, "learning_rate": 7.857572115384616e-06, "loss": 0.6253, "step": 2906 }, { "epoch": 1.6957853288610178, "grad_norm": 1.0808024406433105, "learning_rate": 7.842548076923076e-06, "loss": 0.8923, "step": 2907 }, { "epoch": 1.6963686743473823, "grad_norm": 1.20291268825531, "learning_rate": 7.82752403846154e-06, "loss": 0.5709, "step": 2908 }, { "epoch": 1.6969520198337467, "grad_norm": 1.1902101039886475, "learning_rate": 7.8125e-06, "loss": 0.7631, "step": 2909 }, { "epoch": 1.6975353653201108, "grad_norm": 1.1930304765701294, "learning_rate": 7.797475961538462e-06, "loss": 0.8958, "step": 2910 }, { "epoch": 1.698118710806475, "grad_norm": 1.1427156925201416, "learning_rate": 7.782451923076924e-06, "loss": 0.8889, "step": 2911 }, { "epoch": 1.6987020562928394, "grad_norm": 1.172147512435913, "learning_rate": 7.767427884615384e-06, "loss": 0.733, "step": 2912 }, { "epoch": 1.6992854017792038, "grad_norm": 1.1946724653244019, "learning_rate": 7.752403846153846e-06, "loss": 1.0244, "step": 2913 }, { "epoch": 1.699868747265568, "grad_norm": 1.224003791809082, "learning_rate": 7.737379807692308e-06, "loss": 0.8716, "step": 2914 }, { "epoch": 1.7004520927519322, "grad_norm": 1.2379382848739624, "learning_rate": 7.72235576923077e-06, "loss": 0.9171, "step": 2915 }, { "epoch": 1.7010354382382966, "grad_norm": 1.3675721883773804, "learning_rate": 7.70733173076923e-06, "loss": 0.7572, "step": 2916 }, { "epoch": 1.701618783724661, "grad_norm": 1.1662015914916992, "learning_rate": 7.692307692307694e-06, "loss": 0.726, "step": 2917 }, { "epoch": 1.7022021292110252, "grad_norm": 0.9387063980102539, "learning_rate": 7.677283653846154e-06, "loss": 0.8387, "step": 2918 }, { "epoch": 1.7027854746973894, "grad_norm": 1.1246999502182007, "learning_rate": 7.662259615384616e-06, "loss": 0.7998, "step": 2919 }, { "epoch": 1.7033688201837538, "grad_norm": 1.011136770248413, "learning_rate": 7.647235576923078e-06, "loss": 0.6432, "step": 2920 }, { "epoch": 1.7039521656701182, "grad_norm": 1.1951652765274048, "learning_rate": 7.63221153846154e-06, "loss": 0.9636, "step": 2921 }, { "epoch": 1.7045355111564824, "grad_norm": 1.4355970621109009, "learning_rate": 7.6171875000000005e-06, "loss": 0.6864, "step": 2922 }, { "epoch": 1.7051188566428466, "grad_norm": 1.2137242555618286, "learning_rate": 7.6021634615384615e-06, "loss": 0.9197, "step": 2923 }, { "epoch": 1.705702202129211, "grad_norm": 1.3542622327804565, "learning_rate": 7.5871394230769234e-06, "loss": 0.5522, "step": 2924 }, { "epoch": 1.7062855476155754, "grad_norm": 1.1288126707077026, "learning_rate": 7.5721153846153845e-06, "loss": 0.7477, "step": 2925 }, { "epoch": 1.7068688931019396, "grad_norm": 1.3344682455062866, "learning_rate": 7.557091346153846e-06, "loss": 0.873, "step": 2926 }, { "epoch": 1.707452238588304, "grad_norm": 1.3780889511108398, "learning_rate": 7.542067307692307e-06, "loss": 0.7851, "step": 2927 }, { "epoch": 1.7080355840746684, "grad_norm": 1.1783818006515503, "learning_rate": 7.52704326923077e-06, "loss": 0.8652, "step": 2928 }, { "epoch": 1.7086189295610326, "grad_norm": 1.1674573421478271, "learning_rate": 7.512019230769231e-06, "loss": 0.8358, "step": 2929 }, { "epoch": 1.7092022750473967, "grad_norm": 1.1287046670913696, "learning_rate": 7.496995192307693e-06, "loss": 0.7869, "step": 2930 }, { "epoch": 1.7097856205337612, "grad_norm": 1.240212082862854, "learning_rate": 7.481971153846154e-06, "loss": 0.9021, "step": 2931 }, { "epoch": 1.7103689660201256, "grad_norm": 1.4593373537063599, "learning_rate": 7.466947115384616e-06, "loss": 0.8768, "step": 2932 }, { "epoch": 1.7109523115064897, "grad_norm": 1.5059845447540283, "learning_rate": 7.451923076923077e-06, "loss": 0.7657, "step": 2933 }, { "epoch": 1.711535656992854, "grad_norm": 1.0853300094604492, "learning_rate": 7.43689903846154e-06, "loss": 0.935, "step": 2934 }, { "epoch": 1.7121190024792183, "grad_norm": 1.382460117340088, "learning_rate": 7.421875e-06, "loss": 0.644, "step": 2935 }, { "epoch": 1.7127023479655827, "grad_norm": 1.4362154006958008, "learning_rate": 7.406850961538461e-06, "loss": 0.5482, "step": 2936 }, { "epoch": 1.713285693451947, "grad_norm": 1.2484700679779053, "learning_rate": 7.391826923076924e-06, "loss": 0.799, "step": 2937 }, { "epoch": 1.713869038938311, "grad_norm": 1.0450493097305298, "learning_rate": 7.376802884615385e-06, "loss": 0.7939, "step": 2938 }, { "epoch": 1.7144523844246755, "grad_norm": 1.2090156078338623, "learning_rate": 7.361778846153847e-06, "loss": 0.7845, "step": 2939 }, { "epoch": 1.71503572991104, "grad_norm": 1.0005606412887573, "learning_rate": 7.346754807692308e-06, "loss": 0.8881, "step": 2940 }, { "epoch": 1.715619075397404, "grad_norm": 1.2529767751693726, "learning_rate": 7.33173076923077e-06, "loss": 0.9444, "step": 2941 }, { "epoch": 1.7162024208837683, "grad_norm": 1.3556801080703735, "learning_rate": 7.316706730769231e-06, "loss": 0.8145, "step": 2942 }, { "epoch": 1.7167857663701327, "grad_norm": 1.385504126548767, "learning_rate": 7.301682692307693e-06, "loss": 0.8717, "step": 2943 }, { "epoch": 1.717369111856497, "grad_norm": 1.2711490392684937, "learning_rate": 7.286658653846154e-06, "loss": 0.6642, "step": 2944 }, { "epoch": 1.7179524573428613, "grad_norm": 1.199277639389038, "learning_rate": 7.271634615384616e-06, "loss": 0.908, "step": 2945 }, { "epoch": 1.7185358028292255, "grad_norm": 1.2609366178512573, "learning_rate": 7.256610576923077e-06, "loss": 0.9037, "step": 2946 }, { "epoch": 1.7191191483155899, "grad_norm": 1.1617000102996826, "learning_rate": 7.241586538461539e-06, "loss": 0.8382, "step": 2947 }, { "epoch": 1.7197024938019543, "grad_norm": 1.1657629013061523, "learning_rate": 7.2265625e-06, "loss": 1.1246, "step": 2948 }, { "epoch": 1.7202858392883185, "grad_norm": 1.197189211845398, "learning_rate": 7.211538461538461e-06, "loss": 0.8029, "step": 2949 }, { "epoch": 1.7208691847746826, "grad_norm": 1.611620306968689, "learning_rate": 7.196514423076923e-06, "loss": 0.9494, "step": 2950 }, { "epoch": 1.721452530261047, "grad_norm": 1.2203035354614258, "learning_rate": 7.181490384615384e-06, "loss": 1.0667, "step": 2951 }, { "epoch": 1.7220358757474115, "grad_norm": 1.2028281688690186, "learning_rate": 7.166466346153847e-06, "loss": 0.6483, "step": 2952 }, { "epoch": 1.7226192212337756, "grad_norm": 1.251402735710144, "learning_rate": 7.151442307692307e-06, "loss": 0.8199, "step": 2953 }, { "epoch": 1.72320256672014, "grad_norm": 1.2342396974563599, "learning_rate": 7.13641826923077e-06, "loss": 0.7685, "step": 2954 }, { "epoch": 1.7237859122065045, "grad_norm": 1.1816489696502686, "learning_rate": 7.121394230769231e-06, "loss": 0.7799, "step": 2955 }, { "epoch": 1.7243692576928686, "grad_norm": 1.373840093612671, "learning_rate": 7.106370192307693e-06, "loss": 0.7768, "step": 2956 }, { "epoch": 1.7249526031792328, "grad_norm": 1.6701674461364746, "learning_rate": 7.091346153846154e-06, "loss": 0.953, "step": 2957 }, { "epoch": 1.7255359486655972, "grad_norm": 1.3481673002243042, "learning_rate": 7.076322115384617e-06, "loss": 0.8016, "step": 2958 }, { "epoch": 1.7261192941519616, "grad_norm": 1.2999407052993774, "learning_rate": 7.061298076923077e-06, "loss": 0.5779, "step": 2959 }, { "epoch": 1.7267026396383258, "grad_norm": 1.1688950061798096, "learning_rate": 7.04627403846154e-06, "loss": 0.937, "step": 2960 }, { "epoch": 1.72728598512469, "grad_norm": 1.5214632749557495, "learning_rate": 7.031250000000001e-06, "loss": 0.8384, "step": 2961 }, { "epoch": 1.7278693306110544, "grad_norm": 1.1693167686462402, "learning_rate": 7.016225961538461e-06, "loss": 0.8144, "step": 2962 }, { "epoch": 1.7284526760974188, "grad_norm": 1.2496626377105713, "learning_rate": 7.001201923076924e-06, "loss": 0.7753, "step": 2963 }, { "epoch": 1.729036021583783, "grad_norm": 1.1495461463928223, "learning_rate": 6.986177884615385e-06, "loss": 0.7307, "step": 2964 }, { "epoch": 1.7296193670701472, "grad_norm": 1.15946626663208, "learning_rate": 6.9711538461538465e-06, "loss": 0.8847, "step": 2965 }, { "epoch": 1.7302027125565116, "grad_norm": 1.1948374509811401, "learning_rate": 6.9561298076923076e-06, "loss": 0.7088, "step": 2966 }, { "epoch": 1.730786058042876, "grad_norm": 1.298287272453308, "learning_rate": 6.94110576923077e-06, "loss": 0.6594, "step": 2967 }, { "epoch": 1.7313694035292402, "grad_norm": 1.1406173706054688, "learning_rate": 6.9260817307692305e-06, "loss": 0.9859, "step": 2968 }, { "epoch": 1.7319527490156044, "grad_norm": 1.2161551713943481, "learning_rate": 6.911057692307693e-06, "loss": 0.7891, "step": 2969 }, { "epoch": 1.7325360945019688, "grad_norm": 1.0298594236373901, "learning_rate": 6.896033653846154e-06, "loss": 0.7256, "step": 2970 }, { "epoch": 1.7331194399883332, "grad_norm": 1.2798943519592285, "learning_rate": 6.881009615384616e-06, "loss": 0.8829, "step": 2971 }, { "epoch": 1.7337027854746974, "grad_norm": 1.514907717704773, "learning_rate": 6.865985576923077e-06, "loss": 0.7678, "step": 2972 }, { "epoch": 1.7342861309610615, "grad_norm": 1.2793967723846436, "learning_rate": 6.850961538461539e-06, "loss": 0.8854, "step": 2973 }, { "epoch": 1.734869476447426, "grad_norm": 1.2280986309051514, "learning_rate": 6.8359375e-06, "loss": 0.9037, "step": 2974 }, { "epoch": 1.7354528219337904, "grad_norm": 1.116003155708313, "learning_rate": 6.820913461538461e-06, "loss": 0.7507, "step": 2975 }, { "epoch": 1.7360361674201545, "grad_norm": 1.0788702964782715, "learning_rate": 6.805889423076924e-06, "loss": 0.9569, "step": 2976 }, { "epoch": 1.7366195129065187, "grad_norm": 1.329489827156067, "learning_rate": 6.790865384615384e-06, "loss": 0.7846, "step": 2977 }, { "epoch": 1.7372028583928831, "grad_norm": 1.1528865098953247, "learning_rate": 6.775841346153847e-06, "loss": 0.7985, "step": 2978 }, { "epoch": 1.7377862038792475, "grad_norm": 0.9944823980331421, "learning_rate": 6.760817307692308e-06, "loss": 0.7359, "step": 2979 }, { "epoch": 1.7383695493656117, "grad_norm": 1.269169569015503, "learning_rate": 6.74579326923077e-06, "loss": 0.7273, "step": 2980 }, { "epoch": 1.7389528948519761, "grad_norm": 1.0448265075683594, "learning_rate": 6.730769230769231e-06, "loss": 0.8073, "step": 2981 }, { "epoch": 1.7395362403383405, "grad_norm": 1.3037874698638916, "learning_rate": 6.715745192307693e-06, "loss": 0.6997, "step": 2982 }, { "epoch": 1.7401195858247047, "grad_norm": 1.2340432405471802, "learning_rate": 6.700721153846154e-06, "loss": 0.8573, "step": 2983 }, { "epoch": 1.740702931311069, "grad_norm": 1.128672480583191, "learning_rate": 6.6856971153846165e-06, "loss": 0.8413, "step": 2984 }, { "epoch": 1.7412862767974333, "grad_norm": 1.1064525842666626, "learning_rate": 6.6706730769230775e-06, "loss": 0.9186, "step": 2985 }, { "epoch": 1.7418696222837977, "grad_norm": 1.9784815311431885, "learning_rate": 6.6556490384615394e-06, "loss": 0.8194, "step": 2986 }, { "epoch": 1.742452967770162, "grad_norm": 1.0543274879455566, "learning_rate": 6.6406250000000005e-06, "loss": 0.6185, "step": 2987 }, { "epoch": 1.743036313256526, "grad_norm": 1.2074421644210815, "learning_rate": 6.6256009615384615e-06, "loss": 0.5638, "step": 2988 }, { "epoch": 1.7436196587428905, "grad_norm": 0.9913759231567383, "learning_rate": 6.610576923076923e-06, "loss": 0.7278, "step": 2989 }, { "epoch": 1.744203004229255, "grad_norm": 1.276524543762207, "learning_rate": 6.5955528846153845e-06, "loss": 0.7934, "step": 2990 }, { "epoch": 1.744786349715619, "grad_norm": 1.247419834136963, "learning_rate": 6.580528846153846e-06, "loss": 0.8914, "step": 2991 }, { "epoch": 1.7453696952019833, "grad_norm": 1.3992359638214111, "learning_rate": 6.565504807692307e-06, "loss": 0.7806, "step": 2992 }, { "epoch": 1.7459530406883477, "grad_norm": 1.1906291246414185, "learning_rate": 6.55048076923077e-06, "loss": 0.8607, "step": 2993 }, { "epoch": 1.746536386174712, "grad_norm": 1.327694296836853, "learning_rate": 6.535456730769231e-06, "loss": 0.606, "step": 2994 }, { "epoch": 1.7471197316610763, "grad_norm": 1.3064364194869995, "learning_rate": 6.520432692307693e-06, "loss": 0.8269, "step": 2995 }, { "epoch": 1.7477030771474404, "grad_norm": 1.3761636018753052, "learning_rate": 6.505408653846154e-06, "loss": 0.8794, "step": 2996 }, { "epoch": 1.7482864226338048, "grad_norm": 1.2299649715423584, "learning_rate": 6.490384615384616e-06, "loss": 0.9386, "step": 2997 }, { "epoch": 1.7488697681201693, "grad_norm": 1.324949026107788, "learning_rate": 6.475360576923077e-06, "loss": 0.8732, "step": 2998 }, { "epoch": 1.7494531136065334, "grad_norm": 1.2557692527770996, "learning_rate": 6.46033653846154e-06, "loss": 0.6888, "step": 2999 }, { "epoch": 1.7500364590928976, "grad_norm": 1.2142000198364258, "learning_rate": 6.4453125e-06, "loss": 0.828, "step": 3000 }, { "epoch": 1.7500364590928976, "eval_loss_squad": 0.852206126167439, "eval_perplexity": 8.171628834408976, "eval_perplexity_reconstruct": 1.8986877809494127, "step": 3000 }, { "epoch": 1.750619804579262, "grad_norm": 1.2730783224105835, "learning_rate": 6.430288461538461e-06, "loss": 0.8912, "step": 3001 }, { "epoch": 1.7512031500656264, "grad_norm": 1.2231379747390747, "learning_rate": 6.415264423076924e-06, "loss": 0.8677, "step": 3002 }, { "epoch": 1.7517864955519906, "grad_norm": 1.3539924621582031, "learning_rate": 6.400240384615385e-06, "loss": 0.8291, "step": 3003 }, { "epoch": 1.7523698410383548, "grad_norm": 1.3997881412506104, "learning_rate": 6.385216346153847e-06, "loss": 0.9718, "step": 3004 }, { "epoch": 1.7529531865247194, "grad_norm": 1.2460448741912842, "learning_rate": 6.370192307692308e-06, "loss": 0.9308, "step": 3005 }, { "epoch": 1.7535365320110836, "grad_norm": 1.063209056854248, "learning_rate": 6.35516826923077e-06, "loss": 0.637, "step": 3006 }, { "epoch": 1.7541198774974478, "grad_norm": 1.0733965635299683, "learning_rate": 6.340144230769231e-06, "loss": 0.7837, "step": 3007 }, { "epoch": 1.7547032229838122, "grad_norm": 1.167663335800171, "learning_rate": 6.325120192307693e-06, "loss": 0.8921, "step": 3008 }, { "epoch": 1.7552865684701766, "grad_norm": 1.3398858308792114, "learning_rate": 6.310096153846154e-06, "loss": 0.7743, "step": 3009 }, { "epoch": 1.7558699139565408, "grad_norm": 1.180702805519104, "learning_rate": 6.295072115384616e-06, "loss": 0.6499, "step": 3010 }, { "epoch": 1.756453259442905, "grad_norm": 1.4334017038345337, "learning_rate": 6.280048076923077e-06, "loss": 0.6341, "step": 3011 }, { "epoch": 1.7570366049292694, "grad_norm": 1.1912078857421875, "learning_rate": 6.265024038461539e-06, "loss": 0.6193, "step": 3012 }, { "epoch": 1.7576199504156338, "grad_norm": 1.267592430114746, "learning_rate": 6.25e-06, "loss": 0.8952, "step": 3013 }, { "epoch": 1.758203295901998, "grad_norm": 1.6982357501983643, "learning_rate": 6.234975961538462e-06, "loss": 0.8871, "step": 3014 }, { "epoch": 1.7587866413883622, "grad_norm": 1.41012704372406, "learning_rate": 6.219951923076923e-06, "loss": 0.7593, "step": 3015 }, { "epoch": 1.7593699868747266, "grad_norm": 1.222355842590332, "learning_rate": 6.204927884615385e-06, "loss": 0.8029, "step": 3016 }, { "epoch": 1.759953332361091, "grad_norm": 1.1510112285614014, "learning_rate": 6.189903846153847e-06, "loss": 0.7361, "step": 3017 }, { "epoch": 1.7605366778474552, "grad_norm": 1.1464331150054932, "learning_rate": 6.174879807692308e-06, "loss": 0.6445, "step": 3018 }, { "epoch": 1.7611200233338193, "grad_norm": 1.6973276138305664, "learning_rate": 6.15985576923077e-06, "loss": 0.7339, "step": 3019 }, { "epoch": 1.7617033688201837, "grad_norm": 1.2533224821090698, "learning_rate": 6.144831730769231e-06, "loss": 0.735, "step": 3020 }, { "epoch": 1.7622867143065482, "grad_norm": 0.871324360370636, "learning_rate": 6.129807692307692e-06, "loss": 0.7549, "step": 3021 }, { "epoch": 1.7628700597929123, "grad_norm": 1.1804686784744263, "learning_rate": 6.114783653846154e-06, "loss": 0.9045, "step": 3022 }, { "epoch": 1.7634534052792765, "grad_norm": 1.0688856840133667, "learning_rate": 6.099759615384616e-06, "loss": 0.965, "step": 3023 }, { "epoch": 1.764036750765641, "grad_norm": 1.29346764087677, "learning_rate": 6.084735576923077e-06, "loss": 0.8251, "step": 3024 }, { "epoch": 1.7646200962520053, "grad_norm": 1.247200846672058, "learning_rate": 6.069711538461539e-06, "loss": 0.8057, "step": 3025 }, { "epoch": 1.7652034417383695, "grad_norm": 1.3537094593048096, "learning_rate": 6.054687500000001e-06, "loss": 0.7938, "step": 3026 }, { "epoch": 1.7657867872247337, "grad_norm": 1.2823100090026855, "learning_rate": 6.039663461538462e-06, "loss": 0.9466, "step": 3027 }, { "epoch": 1.766370132711098, "grad_norm": 1.4946316480636597, "learning_rate": 6.0246394230769236e-06, "loss": 0.7986, "step": 3028 }, { "epoch": 1.7669534781974625, "grad_norm": 1.1807023286819458, "learning_rate": 6.0096153846153855e-06, "loss": 0.7605, "step": 3029 }, { "epoch": 1.7675368236838267, "grad_norm": 1.2204418182373047, "learning_rate": 5.9945913461538465e-06, "loss": 0.8484, "step": 3030 }, { "epoch": 1.7681201691701909, "grad_norm": 0.9801056981086731, "learning_rate": 5.979567307692308e-06, "loss": 0.8015, "step": 3031 }, { "epoch": 1.7687035146565555, "grad_norm": 1.2062498331069946, "learning_rate": 5.9645432692307694e-06, "loss": 0.7702, "step": 3032 }, { "epoch": 1.7692868601429197, "grad_norm": 1.5518649816513062, "learning_rate": 5.9495192307692305e-06, "loss": 0.9168, "step": 3033 }, { "epoch": 1.7698702056292839, "grad_norm": 1.196702480316162, "learning_rate": 5.934495192307692e-06, "loss": 0.7405, "step": 3034 }, { "epoch": 1.7704535511156483, "grad_norm": 1.3550660610198975, "learning_rate": 5.919471153846154e-06, "loss": 0.8246, "step": 3035 }, { "epoch": 1.7710368966020127, "grad_norm": 1.0379894971847534, "learning_rate": 5.904447115384615e-06, "loss": 0.7812, "step": 3036 }, { "epoch": 1.7716202420883769, "grad_norm": 1.0726555585861206, "learning_rate": 5.889423076923077e-06, "loss": 1.0311, "step": 3037 }, { "epoch": 1.772203587574741, "grad_norm": 1.0926289558410645, "learning_rate": 5.874399038461539e-06, "loss": 0.8649, "step": 3038 }, { "epoch": 1.7727869330611055, "grad_norm": 1.2088254690170288, "learning_rate": 5.859375e-06, "loss": 0.6787, "step": 3039 }, { "epoch": 1.7733702785474699, "grad_norm": 1.2711060047149658, "learning_rate": 5.844350961538462e-06, "loss": 0.6789, "step": 3040 }, { "epoch": 1.773953624033834, "grad_norm": 1.2558777332305908, "learning_rate": 5.829326923076923e-06, "loss": 0.8561, "step": 3041 }, { "epoch": 1.7745369695201982, "grad_norm": 1.1498215198516846, "learning_rate": 5.814302884615385e-06, "loss": 0.9378, "step": 3042 }, { "epoch": 1.7751203150065626, "grad_norm": 1.2702573537826538, "learning_rate": 5.799278846153847e-06, "loss": 0.7744, "step": 3043 }, { "epoch": 1.775703660492927, "grad_norm": 1.2708110809326172, "learning_rate": 5.784254807692308e-06, "loss": 0.8994, "step": 3044 }, { "epoch": 1.7762870059792912, "grad_norm": 1.1213219165802002, "learning_rate": 5.76923076923077e-06, "loss": 0.5506, "step": 3045 }, { "epoch": 1.7768703514656554, "grad_norm": 1.5344654321670532, "learning_rate": 5.754206730769231e-06, "loss": 0.8867, "step": 3046 }, { "epoch": 1.7774536969520198, "grad_norm": 1.2419880628585815, "learning_rate": 5.739182692307693e-06, "loss": 0.7802, "step": 3047 }, { "epoch": 1.7780370424383842, "grad_norm": 1.1512519121170044, "learning_rate": 5.724158653846154e-06, "loss": 0.6938, "step": 3048 }, { "epoch": 1.7786203879247484, "grad_norm": 1.4187743663787842, "learning_rate": 5.709134615384616e-06, "loss": 0.7221, "step": 3049 }, { "epoch": 1.7792037334111126, "grad_norm": 1.180336594581604, "learning_rate": 5.694110576923077e-06, "loss": 0.9465, "step": 3050 }, { "epoch": 1.779787078897477, "grad_norm": 1.244598627090454, "learning_rate": 5.6790865384615386e-06, "loss": 0.7442, "step": 3051 }, { "epoch": 1.7803704243838414, "grad_norm": 1.2341411113739014, "learning_rate": 5.6640625000000005e-06, "loss": 0.748, "step": 3052 }, { "epoch": 1.7809537698702056, "grad_norm": 1.1170315742492676, "learning_rate": 5.6490384615384615e-06, "loss": 0.7429, "step": 3053 }, { "epoch": 1.7815371153565698, "grad_norm": 1.3362447023391724, "learning_rate": 5.634014423076923e-06, "loss": 0.7197, "step": 3054 }, { "epoch": 1.7821204608429342, "grad_norm": 1.4332809448242188, "learning_rate": 5.618990384615385e-06, "loss": 0.9911, "step": 3055 }, { "epoch": 1.7827038063292986, "grad_norm": 1.2477706670761108, "learning_rate": 5.603966346153846e-06, "loss": 1.1378, "step": 3056 }, { "epoch": 1.7832871518156628, "grad_norm": 1.2143217325210571, "learning_rate": 5.588942307692308e-06, "loss": 0.6478, "step": 3057 }, { "epoch": 1.7838704973020272, "grad_norm": 1.2848975658416748, "learning_rate": 5.57391826923077e-06, "loss": 0.8657, "step": 3058 }, { "epoch": 1.7844538427883916, "grad_norm": 1.3641600608825684, "learning_rate": 5.55889423076923e-06, "loss": 0.9904, "step": 3059 }, { "epoch": 1.7850371882747558, "grad_norm": 1.3162271976470947, "learning_rate": 5.543870192307692e-06, "loss": 0.6504, "step": 3060 }, { "epoch": 1.78562053376112, "grad_norm": 1.1452566385269165, "learning_rate": 5.528846153846154e-06, "loss": 0.8217, "step": 3061 }, { "epoch": 1.7862038792474844, "grad_norm": 1.2015888690948486, "learning_rate": 5.513822115384615e-06, "loss": 0.9543, "step": 3062 }, { "epoch": 1.7867872247338488, "grad_norm": 1.3449658155441284, "learning_rate": 5.498798076923077e-06, "loss": 0.8232, "step": 3063 }, { "epoch": 1.787370570220213, "grad_norm": 1.3045237064361572, "learning_rate": 5.483774038461539e-06, "loss": 0.8037, "step": 3064 }, { "epoch": 1.7879539157065771, "grad_norm": 1.1564418077468872, "learning_rate": 5.46875e-06, "loss": 0.7541, "step": 3065 }, { "epoch": 1.7885372611929415, "grad_norm": 1.2231559753417969, "learning_rate": 5.453725961538462e-06, "loss": 0.7623, "step": 3066 }, { "epoch": 1.789120606679306, "grad_norm": 1.238562822341919, "learning_rate": 5.438701923076924e-06, "loss": 0.8263, "step": 3067 }, { "epoch": 1.7897039521656701, "grad_norm": 1.3414685726165771, "learning_rate": 5.423677884615385e-06, "loss": 0.8375, "step": 3068 }, { "epoch": 1.7902872976520343, "grad_norm": 1.1576594114303589, "learning_rate": 5.408653846153847e-06, "loss": 0.8616, "step": 3069 }, { "epoch": 1.7908706431383987, "grad_norm": 1.2848812341690063, "learning_rate": 5.3936298076923085e-06, "loss": 0.8936, "step": 3070 }, { "epoch": 1.7914539886247631, "grad_norm": 1.2974978685379028, "learning_rate": 5.37860576923077e-06, "loss": 0.7183, "step": 3071 }, { "epoch": 1.7920373341111273, "grad_norm": 1.0673803091049194, "learning_rate": 5.363581730769231e-06, "loss": 0.9361, "step": 3072 }, { "epoch": 1.7926206795974915, "grad_norm": 1.156964659690857, "learning_rate": 5.3485576923076925e-06, "loss": 0.665, "step": 3073 }, { "epoch": 1.793204025083856, "grad_norm": 1.2498539686203003, "learning_rate": 5.3335336538461536e-06, "loss": 0.683, "step": 3074 }, { "epoch": 1.7937873705702203, "grad_norm": 1.0622308254241943, "learning_rate": 5.3185096153846155e-06, "loss": 0.7311, "step": 3075 }, { "epoch": 1.7943707160565845, "grad_norm": 1.147762417793274, "learning_rate": 5.303485576923077e-06, "loss": 0.9415, "step": 3076 }, { "epoch": 1.7949540615429487, "grad_norm": 1.6700465679168701, "learning_rate": 5.288461538461538e-06, "loss": 0.7959, "step": 3077 }, { "epoch": 1.795537407029313, "grad_norm": 1.326983094215393, "learning_rate": 5.2734375e-06, "loss": 0.6907, "step": 3078 }, { "epoch": 1.7961207525156775, "grad_norm": 1.2017778158187866, "learning_rate": 5.258413461538462e-06, "loss": 0.7195, "step": 3079 }, { "epoch": 1.7967040980020417, "grad_norm": 1.3539180755615234, "learning_rate": 5.243389423076923e-06, "loss": 0.7981, "step": 3080 }, { "epoch": 1.7972874434884059, "grad_norm": 1.202142357826233, "learning_rate": 5.228365384615385e-06, "loss": 0.8899, "step": 3081 }, { "epoch": 1.7978707889747703, "grad_norm": 1.4579527378082275, "learning_rate": 5.213341346153847e-06, "loss": 0.7488, "step": 3082 }, { "epoch": 1.7984541344611347, "grad_norm": 1.3240573406219482, "learning_rate": 5.198317307692308e-06, "loss": 0.6901, "step": 3083 }, { "epoch": 1.7990374799474989, "grad_norm": 0.9925790429115295, "learning_rate": 5.18329326923077e-06, "loss": 0.7306, "step": 3084 }, { "epoch": 1.7996208254338633, "grad_norm": 1.2076115608215332, "learning_rate": 5.168269230769231e-06, "loss": 0.7139, "step": 3085 }, { "epoch": 1.8002041709202277, "grad_norm": 1.3439483642578125, "learning_rate": 5.153245192307692e-06, "loss": 0.7188, "step": 3086 }, { "epoch": 1.8007875164065918, "grad_norm": 1.2917673587799072, "learning_rate": 5.138221153846154e-06, "loss": 0.8389, "step": 3087 }, { "epoch": 1.801370861892956, "grad_norm": 1.1600747108459473, "learning_rate": 5.123197115384616e-06, "loss": 0.8402, "step": 3088 }, { "epoch": 1.8019542073793204, "grad_norm": 1.4509084224700928, "learning_rate": 5.108173076923077e-06, "loss": 0.8959, "step": 3089 }, { "epoch": 1.8025375528656848, "grad_norm": 1.1624780893325806, "learning_rate": 5.093149038461539e-06, "loss": 0.7454, "step": 3090 }, { "epoch": 1.803120898352049, "grad_norm": 1.2352399826049805, "learning_rate": 5.078125000000001e-06, "loss": 0.8392, "step": 3091 }, { "epoch": 1.8037042438384132, "grad_norm": 1.2602814435958862, "learning_rate": 5.063100961538462e-06, "loss": 0.6345, "step": 3092 }, { "epoch": 1.8042875893247776, "grad_norm": 1.5061650276184082, "learning_rate": 5.0480769230769235e-06, "loss": 0.6893, "step": 3093 }, { "epoch": 1.804870934811142, "grad_norm": 1.175838828086853, "learning_rate": 5.0330528846153854e-06, "loss": 0.7926, "step": 3094 }, { "epoch": 1.8054542802975062, "grad_norm": 1.4317418336868286, "learning_rate": 5.0180288461538465e-06, "loss": 0.8662, "step": 3095 }, { "epoch": 1.8060376257838704, "grad_norm": 1.0850125551223755, "learning_rate": 5.003004807692308e-06, "loss": 0.67, "step": 3096 }, { "epoch": 1.8066209712702348, "grad_norm": 1.199704885482788, "learning_rate": 4.987980769230769e-06, "loss": 1.0953, "step": 3097 }, { "epoch": 1.8072043167565992, "grad_norm": 1.1284786462783813, "learning_rate": 4.9729567307692305e-06, "loss": 0.8975, "step": 3098 }, { "epoch": 1.8077876622429634, "grad_norm": 1.3013429641723633, "learning_rate": 4.957932692307692e-06, "loss": 0.7548, "step": 3099 }, { "epoch": 1.8083710077293276, "grad_norm": 1.3402200937271118, "learning_rate": 4.942908653846154e-06, "loss": 0.7116, "step": 3100 }, { "epoch": 1.808954353215692, "grad_norm": 1.1357074975967407, "learning_rate": 4.927884615384615e-06, "loss": 0.747, "step": 3101 }, { "epoch": 1.8095376987020564, "grad_norm": 1.097558856010437, "learning_rate": 4.912860576923077e-06, "loss": 0.5583, "step": 3102 }, { "epoch": 1.8101210441884206, "grad_norm": 1.726578712463379, "learning_rate": 4.897836538461539e-06, "loss": 0.9862, "step": 3103 }, { "epoch": 1.8107043896747848, "grad_norm": 1.3363982439041138, "learning_rate": 4.8828125e-06, "loss": 0.7787, "step": 3104 }, { "epoch": 1.8112877351611492, "grad_norm": 1.1213432550430298, "learning_rate": 4.867788461538462e-06, "loss": 0.9938, "step": 3105 }, { "epoch": 1.8118710806475136, "grad_norm": 1.5810742378234863, "learning_rate": 4.852764423076923e-06, "loss": 0.8159, "step": 3106 }, { "epoch": 1.8124544261338777, "grad_norm": 1.1917210817337036, "learning_rate": 4.837740384615385e-06, "loss": 0.8894, "step": 3107 }, { "epoch": 1.813037771620242, "grad_norm": 1.112142562866211, "learning_rate": 4.822716346153847e-06, "loss": 0.7393, "step": 3108 }, { "epoch": 1.8136211171066063, "grad_norm": 1.3024792671203613, "learning_rate": 4.807692307692308e-06, "loss": 0.7867, "step": 3109 }, { "epoch": 1.8142044625929707, "grad_norm": 1.378227949142456, "learning_rate": 4.79266826923077e-06, "loss": 0.6839, "step": 3110 }, { "epoch": 1.814787808079335, "grad_norm": 1.1621403694152832, "learning_rate": 4.777644230769231e-06, "loss": 0.7255, "step": 3111 }, { "epoch": 1.8153711535656993, "grad_norm": 1.3222122192382812, "learning_rate": 4.762620192307693e-06, "loss": 0.7682, "step": 3112 }, { "epoch": 1.8159544990520637, "grad_norm": 1.3286000490188599, "learning_rate": 4.747596153846154e-06, "loss": 0.7242, "step": 3113 }, { "epoch": 1.816537844538428, "grad_norm": 1.1924617290496826, "learning_rate": 4.732572115384616e-06, "loss": 0.9124, "step": 3114 }, { "epoch": 1.817121190024792, "grad_norm": 1.5701050758361816, "learning_rate": 4.717548076923077e-06, "loss": 0.9612, "step": 3115 }, { "epoch": 1.8177045355111565, "grad_norm": 0.9868125319480896, "learning_rate": 4.7025240384615385e-06, "loss": 0.672, "step": 3116 }, { "epoch": 1.818287880997521, "grad_norm": 1.411771535873413, "learning_rate": 4.6875000000000004e-06, "loss": 0.7319, "step": 3117 }, { "epoch": 1.818871226483885, "grad_norm": 1.2565867900848389, "learning_rate": 4.6724759615384615e-06, "loss": 0.6905, "step": 3118 }, { "epoch": 1.8194545719702493, "grad_norm": 1.527400016784668, "learning_rate": 4.657451923076923e-06, "loss": 0.7905, "step": 3119 }, { "epoch": 1.8200379174566137, "grad_norm": 1.2910518646240234, "learning_rate": 4.642427884615385e-06, "loss": 0.7373, "step": 3120 }, { "epoch": 1.820621262942978, "grad_norm": 1.4282567501068115, "learning_rate": 4.627403846153846e-06, "loss": 0.8244, "step": 3121 }, { "epoch": 1.8212046084293423, "grad_norm": 1.2436426877975464, "learning_rate": 4.612379807692308e-06, "loss": 0.8656, "step": 3122 }, { "epoch": 1.8217879539157065, "grad_norm": 1.2247263193130493, "learning_rate": 4.59735576923077e-06, "loss": 0.8739, "step": 3123 }, { "epoch": 1.8223712994020709, "grad_norm": 1.359684705734253, "learning_rate": 4.58233173076923e-06, "loss": 0.6765, "step": 3124 }, { "epoch": 1.8229546448884353, "grad_norm": 1.2492992877960205, "learning_rate": 4.567307692307692e-06, "loss": 0.6232, "step": 3125 }, { "epoch": 1.8235379903747995, "grad_norm": 1.314278244972229, "learning_rate": 4.552283653846154e-06, "loss": 0.7352, "step": 3126 }, { "epoch": 1.8241213358611637, "grad_norm": 1.5234570503234863, "learning_rate": 4.537259615384615e-06, "loss": 0.9334, "step": 3127 }, { "epoch": 1.824704681347528, "grad_norm": 1.2035748958587646, "learning_rate": 4.522235576923077e-06, "loss": 0.8481, "step": 3128 }, { "epoch": 1.8252880268338925, "grad_norm": 1.1306325197219849, "learning_rate": 4.507211538461539e-06, "loss": 0.7214, "step": 3129 }, { "epoch": 1.8258713723202566, "grad_norm": 1.265943169593811, "learning_rate": 4.4921875e-06, "loss": 0.7674, "step": 3130 }, { "epoch": 1.8264547178066208, "grad_norm": 1.22818124294281, "learning_rate": 4.477163461538462e-06, "loss": 0.9115, "step": 3131 }, { "epoch": 1.8270380632929852, "grad_norm": 1.2925962209701538, "learning_rate": 4.462139423076924e-06, "loss": 1.0034, "step": 3132 }, { "epoch": 1.8276214087793496, "grad_norm": 1.3289649486541748, "learning_rate": 4.447115384615385e-06, "loss": 0.8526, "step": 3133 }, { "epoch": 1.8282047542657138, "grad_norm": 1.2596584558486938, "learning_rate": 4.432091346153847e-06, "loss": 0.8599, "step": 3134 }, { "epoch": 1.828788099752078, "grad_norm": 1.5771028995513916, "learning_rate": 4.4170673076923085e-06, "loss": 0.7632, "step": 3135 }, { "epoch": 1.8293714452384424, "grad_norm": 1.3541091680526733, "learning_rate": 4.4020432692307696e-06, "loss": 0.7655, "step": 3136 }, { "epoch": 1.8299547907248068, "grad_norm": 1.2546583414077759, "learning_rate": 4.387019230769231e-06, "loss": 0.7144, "step": 3137 }, { "epoch": 1.830538136211171, "grad_norm": 1.216031789779663, "learning_rate": 4.3719951923076925e-06, "loss": 0.8208, "step": 3138 }, { "epoch": 1.8311214816975354, "grad_norm": 1.219329595565796, "learning_rate": 4.3569711538461535e-06, "loss": 0.5568, "step": 3139 }, { "epoch": 1.8317048271838998, "grad_norm": 1.1087514162063599, "learning_rate": 4.3419471153846154e-06, "loss": 0.9815, "step": 3140 }, { "epoch": 1.832288172670264, "grad_norm": 1.2218693494796753, "learning_rate": 4.326923076923077e-06, "loss": 0.7445, "step": 3141 }, { "epoch": 1.8328715181566282, "grad_norm": 1.487666130065918, "learning_rate": 4.311899038461538e-06, "loss": 0.8892, "step": 3142 }, { "epoch": 1.8334548636429926, "grad_norm": 1.117554783821106, "learning_rate": 4.296875e-06, "loss": 0.6522, "step": 3143 }, { "epoch": 1.834038209129357, "grad_norm": 1.2432799339294434, "learning_rate": 4.281850961538462e-06, "loss": 0.7858, "step": 3144 }, { "epoch": 1.8346215546157212, "grad_norm": 1.2413891553878784, "learning_rate": 4.266826923076923e-06, "loss": 0.6435, "step": 3145 }, { "epoch": 1.8352049001020854, "grad_norm": 1.193045735359192, "learning_rate": 4.251802884615385e-06, "loss": 0.7756, "step": 3146 }, { "epoch": 1.8357882455884498, "grad_norm": 1.2209383249282837, "learning_rate": 4.236778846153847e-06, "loss": 0.7637, "step": 3147 }, { "epoch": 1.8363715910748142, "grad_norm": 1.1595642566680908, "learning_rate": 4.221754807692308e-06, "loss": 0.7416, "step": 3148 }, { "epoch": 1.8369549365611784, "grad_norm": 1.1345254182815552, "learning_rate": 4.20673076923077e-06, "loss": 0.4896, "step": 3149 }, { "epoch": 1.8375382820475425, "grad_norm": 1.1374866962432861, "learning_rate": 4.191706730769231e-06, "loss": 0.8506, "step": 3150 }, { "epoch": 1.838121627533907, "grad_norm": 1.126652717590332, "learning_rate": 4.176682692307692e-06, "loss": 0.7785, "step": 3151 }, { "epoch": 1.8387049730202714, "grad_norm": 1.1542102098464966, "learning_rate": 4.161658653846154e-06, "loss": 0.6363, "step": 3152 }, { "epoch": 1.8392883185066355, "grad_norm": 1.0646016597747803, "learning_rate": 4.146634615384616e-06, "loss": 0.6941, "step": 3153 }, { "epoch": 1.8398716639929997, "grad_norm": 1.2552902698516846, "learning_rate": 4.131610576923077e-06, "loss": 0.7249, "step": 3154 }, { "epoch": 1.8404550094793641, "grad_norm": 1.0623902082443237, "learning_rate": 4.116586538461539e-06, "loss": 0.7205, "step": 3155 }, { "epoch": 1.8410383549657285, "grad_norm": 1.0725420713424683, "learning_rate": 4.101562500000001e-06, "loss": 0.7775, "step": 3156 }, { "epoch": 1.8416217004520927, "grad_norm": 1.095056176185608, "learning_rate": 4.086538461538462e-06, "loss": 0.6987, "step": 3157 }, { "epoch": 1.842205045938457, "grad_norm": 1.199759840965271, "learning_rate": 4.0715144230769235e-06, "loss": 0.7286, "step": 3158 }, { "epoch": 1.8427883914248213, "grad_norm": 0.9884356260299683, "learning_rate": 4.0564903846153846e-06, "loss": 0.7607, "step": 3159 }, { "epoch": 1.8433717369111857, "grad_norm": 1.2776890993118286, "learning_rate": 4.0414663461538465e-06, "loss": 0.6219, "step": 3160 }, { "epoch": 1.84395508239755, "grad_norm": 1.3608481884002686, "learning_rate": 4.026442307692308e-06, "loss": 0.9211, "step": 3161 }, { "epoch": 1.844538427883914, "grad_norm": 1.205276608467102, "learning_rate": 4.011418269230769e-06, "loss": 0.6703, "step": 3162 }, { "epoch": 1.8451217733702785, "grad_norm": 1.4303468465805054, "learning_rate": 3.9963942307692304e-06, "loss": 0.7663, "step": 3163 }, { "epoch": 1.845705118856643, "grad_norm": 1.1754591464996338, "learning_rate": 3.981370192307692e-06, "loss": 0.7512, "step": 3164 }, { "epoch": 1.846288464343007, "grad_norm": 1.1947484016418457, "learning_rate": 3.966346153846154e-06, "loss": 0.715, "step": 3165 }, { "epoch": 1.8468718098293715, "grad_norm": 1.4016227722167969, "learning_rate": 3.951322115384615e-06, "loss": 0.9545, "step": 3166 }, { "epoch": 1.847455155315736, "grad_norm": 1.25657057762146, "learning_rate": 3.936298076923077e-06, "loss": 0.8419, "step": 3167 }, { "epoch": 1.8480385008021, "grad_norm": 1.1799569129943848, "learning_rate": 3.921274038461538e-06, "loss": 0.9247, "step": 3168 }, { "epoch": 1.8486218462884643, "grad_norm": 1.4153069257736206, "learning_rate": 3.90625e-06, "loss": 0.8692, "step": 3169 }, { "epoch": 1.8492051917748287, "grad_norm": 1.103832483291626, "learning_rate": 3.891225961538462e-06, "loss": 0.7473, "step": 3170 }, { "epoch": 1.849788537261193, "grad_norm": 1.3749737739562988, "learning_rate": 3.876201923076923e-06, "loss": 0.669, "step": 3171 }, { "epoch": 1.8503718827475573, "grad_norm": 1.2396540641784668, "learning_rate": 3.861177884615385e-06, "loss": 0.7441, "step": 3172 }, { "epoch": 1.8509552282339214, "grad_norm": 1.2196747064590454, "learning_rate": 3.846153846153847e-06, "loss": 0.7092, "step": 3173 }, { "epoch": 1.8515385737202859, "grad_norm": 1.1058050394058228, "learning_rate": 3.831129807692308e-06, "loss": 0.6491, "step": 3174 }, { "epoch": 1.8521219192066503, "grad_norm": 1.1973541975021362, "learning_rate": 3.81610576923077e-06, "loss": 0.8188, "step": 3175 }, { "epoch": 1.8527052646930144, "grad_norm": 1.4277015924453735, "learning_rate": 3.8010817307692308e-06, "loss": 0.87, "step": 3176 }, { "epoch": 1.8532886101793786, "grad_norm": 1.5026133060455322, "learning_rate": 3.7860576923076922e-06, "loss": 0.864, "step": 3177 }, { "epoch": 1.853871955665743, "grad_norm": 1.1631428003311157, "learning_rate": 3.7710336538461537e-06, "loss": 0.8488, "step": 3178 }, { "epoch": 1.8544553011521074, "grad_norm": 1.1146588325500488, "learning_rate": 3.7560096153846156e-06, "loss": 0.7239, "step": 3179 }, { "epoch": 1.8550386466384716, "grad_norm": 1.4882702827453613, "learning_rate": 3.740985576923077e-06, "loss": 0.723, "step": 3180 }, { "epoch": 1.8556219921248358, "grad_norm": 1.207821011543274, "learning_rate": 3.7259615384615385e-06, "loss": 0.6462, "step": 3181 }, { "epoch": 1.8562053376112002, "grad_norm": 1.1613391637802124, "learning_rate": 3.7109375e-06, "loss": 0.9761, "step": 3182 }, { "epoch": 1.8567886830975646, "grad_norm": 1.2488850355148315, "learning_rate": 3.695913461538462e-06, "loss": 0.9367, "step": 3183 }, { "epoch": 1.8573720285839288, "grad_norm": 1.1730568408966064, "learning_rate": 3.6808894230769233e-06, "loss": 0.7909, "step": 3184 }, { "epoch": 1.857955374070293, "grad_norm": 1.4373273849487305, "learning_rate": 3.665865384615385e-06, "loss": 0.824, "step": 3185 }, { "epoch": 1.8585387195566574, "grad_norm": 1.7913013696670532, "learning_rate": 3.6508413461538467e-06, "loss": 0.8291, "step": 3186 }, { "epoch": 1.8591220650430218, "grad_norm": 1.2750526666641235, "learning_rate": 3.635817307692308e-06, "loss": 0.9025, "step": 3187 }, { "epoch": 1.859705410529386, "grad_norm": 1.5024226903915405, "learning_rate": 3.6207932692307696e-06, "loss": 0.643, "step": 3188 }, { "epoch": 1.8602887560157502, "grad_norm": 1.1984363794326782, "learning_rate": 3.6057692307692307e-06, "loss": 0.8266, "step": 3189 }, { "epoch": 1.8608721015021146, "grad_norm": 1.3658881187438965, "learning_rate": 3.590745192307692e-06, "loss": 0.8933, "step": 3190 }, { "epoch": 1.861455446988479, "grad_norm": 1.5781782865524292, "learning_rate": 3.5757211538461536e-06, "loss": 0.9418, "step": 3191 }, { "epoch": 1.8620387924748432, "grad_norm": 1.1617451906204224, "learning_rate": 3.5606971153846155e-06, "loss": 0.5586, "step": 3192 }, { "epoch": 1.8626221379612076, "grad_norm": 1.3859959840774536, "learning_rate": 3.545673076923077e-06, "loss": 0.971, "step": 3193 }, { "epoch": 1.863205483447572, "grad_norm": 1.154800534248352, "learning_rate": 3.5306490384615384e-06, "loss": 0.9459, "step": 3194 }, { "epoch": 1.8637888289339362, "grad_norm": 1.2615742683410645, "learning_rate": 3.5156250000000003e-06, "loss": 0.8425, "step": 3195 }, { "epoch": 1.8643721744203003, "grad_norm": 1.4125624895095825, "learning_rate": 3.500600961538462e-06, "loss": 0.8873, "step": 3196 }, { "epoch": 1.8649555199066647, "grad_norm": 1.2564436197280884, "learning_rate": 3.4855769230769233e-06, "loss": 0.8137, "step": 3197 }, { "epoch": 1.8655388653930292, "grad_norm": 1.1753637790679932, "learning_rate": 3.470552884615385e-06, "loss": 0.8264, "step": 3198 }, { "epoch": 1.8661222108793933, "grad_norm": 1.0851452350616455, "learning_rate": 3.4555288461538466e-06, "loss": 0.7682, "step": 3199 }, { "epoch": 1.8667055563657575, "grad_norm": 1.3882149457931519, "learning_rate": 3.440504807692308e-06, "loss": 0.723, "step": 3200 }, { "epoch": 1.8667055563657575, "eval_loss_squad": 0.8567332937405445, "eval_perplexity": 8.266156372908618, "eval_perplexity_reconstruct": 1.8969144391866655, "step": 3200 }, { "epoch": 1.867288901852122, "grad_norm": 1.3030450344085693, "learning_rate": 3.4254807692307695e-06, "loss": 0.8131, "step": 3201 }, { "epoch": 1.8678722473384863, "grad_norm": 1.364791989326477, "learning_rate": 3.4104567307692306e-06, "loss": 0.8387, "step": 3202 }, { "epoch": 1.8684555928248505, "grad_norm": 1.1007466316223145, "learning_rate": 3.395432692307692e-06, "loss": 0.745, "step": 3203 }, { "epoch": 1.8690389383112147, "grad_norm": 1.2074437141418457, "learning_rate": 3.380408653846154e-06, "loss": 0.7523, "step": 3204 }, { "epoch": 1.869622283797579, "grad_norm": 1.4134937524795532, "learning_rate": 3.3653846153846154e-06, "loss": 0.7514, "step": 3205 }, { "epoch": 1.8702056292839435, "grad_norm": 1.2706308364868164, "learning_rate": 3.350360576923077e-06, "loss": 0.9497, "step": 3206 }, { "epoch": 1.8707889747703077, "grad_norm": 1.3989231586456299, "learning_rate": 3.3353365384615388e-06, "loss": 0.8504, "step": 3207 }, { "epoch": 1.8713723202566719, "grad_norm": 1.2219780683517456, "learning_rate": 3.3203125000000002e-06, "loss": 0.7592, "step": 3208 }, { "epoch": 1.8719556657430363, "grad_norm": 1.3430885076522827, "learning_rate": 3.3052884615384617e-06, "loss": 0.8517, "step": 3209 }, { "epoch": 1.8725390112294007, "grad_norm": 1.190152883529663, "learning_rate": 3.290264423076923e-06, "loss": 0.6381, "step": 3210 }, { "epoch": 1.8731223567157649, "grad_norm": 1.4218403100967407, "learning_rate": 3.275240384615385e-06, "loss": 0.7845, "step": 3211 }, { "epoch": 1.873705702202129, "grad_norm": 1.5346728563308716, "learning_rate": 3.2602163461538465e-06, "loss": 0.6876, "step": 3212 }, { "epoch": 1.8742890476884935, "grad_norm": 1.3545806407928467, "learning_rate": 3.245192307692308e-06, "loss": 0.7361, "step": 3213 }, { "epoch": 1.8748723931748579, "grad_norm": 1.127886176109314, "learning_rate": 3.23016826923077e-06, "loss": 1.1837, "step": 3214 }, { "epoch": 1.875455738661222, "grad_norm": 1.2765008211135864, "learning_rate": 3.2151442307692305e-06, "loss": 0.7988, "step": 3215 }, { "epoch": 1.8760390841475862, "grad_norm": 1.3014349937438965, "learning_rate": 3.2001201923076924e-06, "loss": 0.8047, "step": 3216 }, { "epoch": 1.8766224296339509, "grad_norm": 1.284070372581482, "learning_rate": 3.185096153846154e-06, "loss": 0.9213, "step": 3217 }, { "epoch": 1.877205775120315, "grad_norm": 1.1256341934204102, "learning_rate": 3.1700721153846153e-06, "loss": 0.9295, "step": 3218 }, { "epoch": 1.8777891206066792, "grad_norm": 1.0452816486358643, "learning_rate": 3.155048076923077e-06, "loss": 0.7096, "step": 3219 }, { "epoch": 1.8783724660930436, "grad_norm": 1.4434984922409058, "learning_rate": 3.1400240384615387e-06, "loss": 0.906, "step": 3220 }, { "epoch": 1.878955811579408, "grad_norm": 1.309296727180481, "learning_rate": 3.125e-06, "loss": 0.8231, "step": 3221 }, { "epoch": 1.8795391570657722, "grad_norm": 1.0300049781799316, "learning_rate": 3.1099759615384616e-06, "loss": 0.8218, "step": 3222 }, { "epoch": 1.8801225025521364, "grad_norm": 1.3848848342895508, "learning_rate": 3.0949519230769235e-06, "loss": 0.6832, "step": 3223 }, { "epoch": 1.8807058480385008, "grad_norm": 1.3134764432907104, "learning_rate": 3.079927884615385e-06, "loss": 0.7673, "step": 3224 }, { "epoch": 1.8812891935248652, "grad_norm": 1.3645648956298828, "learning_rate": 3.064903846153846e-06, "loss": 0.8301, "step": 3225 }, { "epoch": 1.8818725390112294, "grad_norm": 1.2801153659820557, "learning_rate": 3.049879807692308e-06, "loss": 0.8582, "step": 3226 }, { "epoch": 1.8824558844975936, "grad_norm": 1.3583933115005493, "learning_rate": 3.0348557692307694e-06, "loss": 0.8457, "step": 3227 }, { "epoch": 1.883039229983958, "grad_norm": 1.4235001802444458, "learning_rate": 3.019831730769231e-06, "loss": 0.8377, "step": 3228 }, { "epoch": 1.8836225754703224, "grad_norm": 1.4059243202209473, "learning_rate": 3.0048076923076927e-06, "loss": 0.7966, "step": 3229 }, { "epoch": 1.8842059209566866, "grad_norm": 1.2851868867874146, "learning_rate": 2.989783653846154e-06, "loss": 0.655, "step": 3230 }, { "epoch": 1.8847892664430508, "grad_norm": 1.42172110080719, "learning_rate": 2.9747596153846152e-06, "loss": 0.774, "step": 3231 }, { "epoch": 1.8853726119294152, "grad_norm": 1.1469957828521729, "learning_rate": 2.959735576923077e-06, "loss": 0.6521, "step": 3232 }, { "epoch": 1.8859559574157796, "grad_norm": 1.7705121040344238, "learning_rate": 2.9447115384615386e-06, "loss": 0.9157, "step": 3233 }, { "epoch": 1.8865393029021438, "grad_norm": 1.3892390727996826, "learning_rate": 2.9296875e-06, "loss": 0.846, "step": 3234 }, { "epoch": 1.887122648388508, "grad_norm": 1.1506258249282837, "learning_rate": 2.9146634615384615e-06, "loss": 0.8809, "step": 3235 }, { "epoch": 1.8877059938748724, "grad_norm": 1.0249956846237183, "learning_rate": 2.8996394230769234e-06, "loss": 0.7122, "step": 3236 }, { "epoch": 1.8882893393612368, "grad_norm": 1.3840585947036743, "learning_rate": 2.884615384615385e-06, "loss": 0.8086, "step": 3237 }, { "epoch": 1.888872684847601, "grad_norm": 1.3755419254302979, "learning_rate": 2.8695913461538464e-06, "loss": 0.9811, "step": 3238 }, { "epoch": 1.8894560303339651, "grad_norm": 1.3516442775726318, "learning_rate": 2.854567307692308e-06, "loss": 0.7445, "step": 3239 }, { "epoch": 1.8900393758203295, "grad_norm": 1.0328466892242432, "learning_rate": 2.8395432692307693e-06, "loss": 0.6279, "step": 3240 }, { "epoch": 1.890622721306694, "grad_norm": 1.1680728197097778, "learning_rate": 2.8245192307692307e-06, "loss": 0.7424, "step": 3241 }, { "epoch": 1.8912060667930581, "grad_norm": 1.1502512693405151, "learning_rate": 2.8094951923076926e-06, "loss": 0.8519, "step": 3242 }, { "epoch": 1.8917894122794223, "grad_norm": 1.0622332096099854, "learning_rate": 2.794471153846154e-06, "loss": 0.6469, "step": 3243 }, { "epoch": 1.892372757765787, "grad_norm": 1.0960564613342285, "learning_rate": 2.779447115384615e-06, "loss": 0.8083, "step": 3244 }, { "epoch": 1.8929561032521511, "grad_norm": 1.226771593093872, "learning_rate": 2.764423076923077e-06, "loss": 0.6511, "step": 3245 }, { "epoch": 1.8935394487385153, "grad_norm": 1.045430302619934, "learning_rate": 2.7493990384615385e-06, "loss": 0.9077, "step": 3246 }, { "epoch": 1.8941227942248797, "grad_norm": 1.3713380098342896, "learning_rate": 2.734375e-06, "loss": 0.9468, "step": 3247 }, { "epoch": 1.8947061397112441, "grad_norm": 1.352195143699646, "learning_rate": 2.719350961538462e-06, "loss": 1.0285, "step": 3248 }, { "epoch": 1.8952894851976083, "grad_norm": 1.1400947570800781, "learning_rate": 2.7043269230769233e-06, "loss": 0.8711, "step": 3249 }, { "epoch": 1.8958728306839725, "grad_norm": 1.3555407524108887, "learning_rate": 2.689302884615385e-06, "loss": 0.9186, "step": 3250 }, { "epoch": 1.896456176170337, "grad_norm": 1.185897946357727, "learning_rate": 2.6742788461538463e-06, "loss": 0.7208, "step": 3251 }, { "epoch": 1.8970395216567013, "grad_norm": 1.1496164798736572, "learning_rate": 2.6592548076923077e-06, "loss": 0.8495, "step": 3252 }, { "epoch": 1.8976228671430655, "grad_norm": 1.4104948043823242, "learning_rate": 2.644230769230769e-06, "loss": 0.9402, "step": 3253 }, { "epoch": 1.8982062126294297, "grad_norm": 1.2339621782302856, "learning_rate": 2.629206730769231e-06, "loss": 0.8283, "step": 3254 }, { "epoch": 1.898789558115794, "grad_norm": 1.3827406167984009, "learning_rate": 2.6141826923076926e-06, "loss": 0.8392, "step": 3255 }, { "epoch": 1.8993729036021585, "grad_norm": 1.5171111822128296, "learning_rate": 2.599158653846154e-06, "loss": 0.7742, "step": 3256 }, { "epoch": 1.8999562490885227, "grad_norm": 1.340565800666809, "learning_rate": 2.5841346153846155e-06, "loss": 0.7106, "step": 3257 }, { "epoch": 1.9005395945748869, "grad_norm": 1.0620558261871338, "learning_rate": 2.569110576923077e-06, "loss": 0.6781, "step": 3258 }, { "epoch": 1.9011229400612513, "grad_norm": 1.4068678617477417, "learning_rate": 2.5540865384615384e-06, "loss": 0.6035, "step": 3259 }, { "epoch": 1.9017062855476157, "grad_norm": 1.356840968132019, "learning_rate": 2.5390625000000003e-06, "loss": 0.8417, "step": 3260 }, { "epoch": 1.9022896310339799, "grad_norm": 1.0584261417388916, "learning_rate": 2.5240384615384618e-06, "loss": 0.844, "step": 3261 }, { "epoch": 1.902872976520344, "grad_norm": 1.1918290853500366, "learning_rate": 2.5090144230769232e-06, "loss": 0.8733, "step": 3262 }, { "epoch": 1.9034563220067084, "grad_norm": 1.2692493200302124, "learning_rate": 2.4939903846153847e-06, "loss": 0.9443, "step": 3263 }, { "epoch": 1.9040396674930729, "grad_norm": 1.1847847700119019, "learning_rate": 2.478966346153846e-06, "loss": 0.7985, "step": 3264 }, { "epoch": 1.904623012979437, "grad_norm": 1.3458268642425537, "learning_rate": 2.4639423076923076e-06, "loss": 0.7175, "step": 3265 }, { "epoch": 1.9052063584658012, "grad_norm": 1.0749139785766602, "learning_rate": 2.4489182692307695e-06, "loss": 0.7089, "step": 3266 }, { "epoch": 1.9057897039521656, "grad_norm": 1.1422741413116455, "learning_rate": 2.433894230769231e-06, "loss": 0.7655, "step": 3267 }, { "epoch": 1.90637304943853, "grad_norm": 1.0878541469573975, "learning_rate": 2.4188701923076925e-06, "loss": 0.7034, "step": 3268 }, { "epoch": 1.9069563949248942, "grad_norm": 1.0475382804870605, "learning_rate": 2.403846153846154e-06, "loss": 0.8075, "step": 3269 }, { "epoch": 1.9075397404112586, "grad_norm": 1.3869905471801758, "learning_rate": 2.3888221153846154e-06, "loss": 0.8407, "step": 3270 }, { "epoch": 1.908123085897623, "grad_norm": 1.221571445465088, "learning_rate": 2.373798076923077e-06, "loss": 0.9829, "step": 3271 }, { "epoch": 1.9087064313839872, "grad_norm": 1.1875793933868408, "learning_rate": 2.3587740384615383e-06, "loss": 0.7063, "step": 3272 }, { "epoch": 1.9092897768703514, "grad_norm": 1.3426343202590942, "learning_rate": 2.3437500000000002e-06, "loss": 0.8446, "step": 3273 }, { "epoch": 1.9098731223567158, "grad_norm": 1.2949109077453613, "learning_rate": 2.3287259615384617e-06, "loss": 0.7361, "step": 3274 }, { "epoch": 1.9104564678430802, "grad_norm": 1.5180912017822266, "learning_rate": 2.313701923076923e-06, "loss": 0.8648, "step": 3275 }, { "epoch": 1.9110398133294444, "grad_norm": 1.1568496227264404, "learning_rate": 2.298677884615385e-06, "loss": 0.7235, "step": 3276 }, { "epoch": 1.9116231588158086, "grad_norm": 1.4971147775650024, "learning_rate": 2.283653846153846e-06, "loss": 0.5042, "step": 3277 }, { "epoch": 1.912206504302173, "grad_norm": 1.248701572418213, "learning_rate": 2.2686298076923076e-06, "loss": 0.6988, "step": 3278 }, { "epoch": 1.9127898497885374, "grad_norm": 1.498721718788147, "learning_rate": 2.2536057692307694e-06, "loss": 0.7654, "step": 3279 }, { "epoch": 1.9133731952749016, "grad_norm": 1.3017774820327759, "learning_rate": 2.238581730769231e-06, "loss": 0.8718, "step": 3280 }, { "epoch": 1.9139565407612658, "grad_norm": 1.3298211097717285, "learning_rate": 2.2235576923076924e-06, "loss": 0.8205, "step": 3281 }, { "epoch": 1.9145398862476302, "grad_norm": 1.2720110416412354, "learning_rate": 2.2085336538461543e-06, "loss": 0.7552, "step": 3282 }, { "epoch": 1.9151232317339946, "grad_norm": 1.2517110109329224, "learning_rate": 2.1935096153846153e-06, "loss": 0.8019, "step": 3283 }, { "epoch": 1.9157065772203588, "grad_norm": 1.2890255451202393, "learning_rate": 2.1784855769230768e-06, "loss": 0.7499, "step": 3284 }, { "epoch": 1.916289922706723, "grad_norm": 1.0850515365600586, "learning_rate": 2.1634615384615387e-06, "loss": 0.8125, "step": 3285 }, { "epoch": 1.9168732681930873, "grad_norm": 1.2969878911972046, "learning_rate": 2.1484375e-06, "loss": 0.7481, "step": 3286 }, { "epoch": 1.9174566136794517, "grad_norm": 1.0845204591751099, "learning_rate": 2.1334134615384616e-06, "loss": 0.8015, "step": 3287 }, { "epoch": 1.918039959165816, "grad_norm": 1.371109962463379, "learning_rate": 2.1183894230769235e-06, "loss": 0.7493, "step": 3288 }, { "epoch": 1.9186233046521801, "grad_norm": 1.1624324321746826, "learning_rate": 2.103365384615385e-06, "loss": 0.7489, "step": 3289 }, { "epoch": 1.9192066501385445, "grad_norm": 1.3240567445755005, "learning_rate": 2.088341346153846e-06, "loss": 0.847, "step": 3290 }, { "epoch": 1.919789995624909, "grad_norm": 1.2875831127166748, "learning_rate": 2.073317307692308e-06, "loss": 1.0138, "step": 3291 }, { "epoch": 1.9203733411112731, "grad_norm": 1.2602109909057617, "learning_rate": 2.0582932692307694e-06, "loss": 0.9535, "step": 3292 }, { "epoch": 1.9209566865976373, "grad_norm": 1.1849132776260376, "learning_rate": 2.043269230769231e-06, "loss": 0.7271, "step": 3293 }, { "epoch": 1.9215400320840017, "grad_norm": 1.2771323919296265, "learning_rate": 2.0282451923076923e-06, "loss": 0.9094, "step": 3294 }, { "epoch": 1.922123377570366, "grad_norm": 1.2452751398086548, "learning_rate": 2.013221153846154e-06, "loss": 0.8227, "step": 3295 }, { "epoch": 1.9227067230567303, "grad_norm": 1.2949745655059814, "learning_rate": 1.9981971153846152e-06, "loss": 0.776, "step": 3296 }, { "epoch": 1.9232900685430947, "grad_norm": 1.4040955305099487, "learning_rate": 1.983173076923077e-06, "loss": 0.7911, "step": 3297 }, { "epoch": 1.923873414029459, "grad_norm": 1.2795517444610596, "learning_rate": 1.9681490384615386e-06, "loss": 0.7292, "step": 3298 }, { "epoch": 1.9244567595158233, "grad_norm": 1.1882306337356567, "learning_rate": 1.953125e-06, "loss": 0.8261, "step": 3299 }, { "epoch": 1.9250401050021875, "grad_norm": 1.011581540107727, "learning_rate": 1.9381009615384615e-06, "loss": 0.5729, "step": 3300 }, { "epoch": 1.9256234504885519, "grad_norm": 1.0311501026153564, "learning_rate": 1.9230769230769234e-06, "loss": 0.8459, "step": 3301 }, { "epoch": 1.9262067959749163, "grad_norm": 1.1920017004013062, "learning_rate": 1.908052884615385e-06, "loss": 0.9399, "step": 3302 }, { "epoch": 1.9267901414612805, "grad_norm": 1.0624799728393555, "learning_rate": 1.8930288461538461e-06, "loss": 0.6593, "step": 3303 }, { "epoch": 1.9273734869476447, "grad_norm": 1.4519715309143066, "learning_rate": 1.8780048076923078e-06, "loss": 0.7391, "step": 3304 }, { "epoch": 1.927956832434009, "grad_norm": 1.2700550556182861, "learning_rate": 1.8629807692307693e-06, "loss": 0.9545, "step": 3305 }, { "epoch": 1.9285401779203735, "grad_norm": 1.4977349042892456, "learning_rate": 1.847956730769231e-06, "loss": 0.7466, "step": 3306 }, { "epoch": 1.9291235234067377, "grad_norm": 1.3640085458755493, "learning_rate": 1.8329326923076924e-06, "loss": 0.6418, "step": 3307 }, { "epoch": 1.9297068688931018, "grad_norm": 1.5322325229644775, "learning_rate": 1.817908653846154e-06, "loss": 0.8667, "step": 3308 }, { "epoch": 1.9302902143794662, "grad_norm": 1.1606767177581787, "learning_rate": 1.8028846153846153e-06, "loss": 0.618, "step": 3309 }, { "epoch": 1.9308735598658306, "grad_norm": 1.2893586158752441, "learning_rate": 1.7878605769230768e-06, "loss": 0.6867, "step": 3310 }, { "epoch": 1.9314569053521948, "grad_norm": 1.2085678577423096, "learning_rate": 1.7728365384615385e-06, "loss": 0.6753, "step": 3311 }, { "epoch": 1.932040250838559, "grad_norm": 1.0935242176055908, "learning_rate": 1.7578125000000002e-06, "loss": 0.6407, "step": 3312 }, { "epoch": 1.9326235963249234, "grad_norm": 1.3033195734024048, "learning_rate": 1.7427884615384616e-06, "loss": 0.7061, "step": 3313 }, { "epoch": 1.9332069418112878, "grad_norm": 1.1596806049346924, "learning_rate": 1.7277644230769233e-06, "loss": 0.6813, "step": 3314 }, { "epoch": 1.933790287297652, "grad_norm": 1.1415847539901733, "learning_rate": 1.7127403846153848e-06, "loss": 0.8781, "step": 3315 }, { "epoch": 1.9343736327840162, "grad_norm": 1.3523577451705933, "learning_rate": 1.697716346153846e-06, "loss": 0.854, "step": 3316 }, { "epoch": 1.9349569782703806, "grad_norm": 1.3228445053100586, "learning_rate": 1.6826923076923077e-06, "loss": 0.7102, "step": 3317 }, { "epoch": 1.935540323756745, "grad_norm": 1.152653455734253, "learning_rate": 1.6676682692307694e-06, "loss": 0.8008, "step": 3318 }, { "epoch": 1.9361236692431092, "grad_norm": 1.4329392910003662, "learning_rate": 1.6526442307692309e-06, "loss": 0.9733, "step": 3319 }, { "epoch": 1.9367070147294734, "grad_norm": 1.1904407739639282, "learning_rate": 1.6376201923076925e-06, "loss": 0.7539, "step": 3320 }, { "epoch": 1.9372903602158378, "grad_norm": 1.1539748907089233, "learning_rate": 1.622596153846154e-06, "loss": 0.8373, "step": 3321 }, { "epoch": 1.9378737057022022, "grad_norm": 1.211431860923767, "learning_rate": 1.6075721153846153e-06, "loss": 0.7206, "step": 3322 }, { "epoch": 1.9384570511885664, "grad_norm": 1.0680336952209473, "learning_rate": 1.592548076923077e-06, "loss": 0.8083, "step": 3323 }, { "epoch": 1.9390403966749308, "grad_norm": 1.3347814083099365, "learning_rate": 1.5775240384615384e-06, "loss": 0.7336, "step": 3324 }, { "epoch": 1.9396237421612952, "grad_norm": 1.1900486946105957, "learning_rate": 1.5625e-06, "loss": 0.8278, "step": 3325 }, { "epoch": 1.9402070876476594, "grad_norm": 1.2089030742645264, "learning_rate": 1.5474759615384618e-06, "loss": 0.6548, "step": 3326 }, { "epoch": 1.9407904331340236, "grad_norm": 1.1675139665603638, "learning_rate": 1.532451923076923e-06, "loss": 0.8378, "step": 3327 }, { "epoch": 1.941373778620388, "grad_norm": 1.2819353342056274, "learning_rate": 1.5174278846153847e-06, "loss": 0.8911, "step": 3328 }, { "epoch": 1.9419571241067524, "grad_norm": 1.1306663751602173, "learning_rate": 1.5024038461538464e-06, "loss": 0.8226, "step": 3329 }, { "epoch": 1.9425404695931165, "grad_norm": 1.330399990081787, "learning_rate": 1.4873798076923076e-06, "loss": 0.7923, "step": 3330 }, { "epoch": 1.9431238150794807, "grad_norm": 1.2952775955200195, "learning_rate": 1.4723557692307693e-06, "loss": 0.8788, "step": 3331 }, { "epoch": 1.9437071605658451, "grad_norm": 1.1201375722885132, "learning_rate": 1.4573317307692308e-06, "loss": 0.5674, "step": 3332 }, { "epoch": 1.9442905060522095, "grad_norm": 1.1273674964904785, "learning_rate": 1.4423076923076924e-06, "loss": 0.8559, "step": 3333 }, { "epoch": 1.9448738515385737, "grad_norm": 1.462166428565979, "learning_rate": 1.427283653846154e-06, "loss": 0.774, "step": 3334 }, { "epoch": 1.945457197024938, "grad_norm": 1.3099924325942993, "learning_rate": 1.4122596153846154e-06, "loss": 0.6735, "step": 3335 }, { "epoch": 1.9460405425113023, "grad_norm": 1.3009566068649292, "learning_rate": 1.397235576923077e-06, "loss": 0.813, "step": 3336 }, { "epoch": 1.9466238879976667, "grad_norm": 1.3090664148330688, "learning_rate": 1.3822115384615385e-06, "loss": 0.8125, "step": 3337 }, { "epoch": 1.947207233484031, "grad_norm": 1.452247977256775, "learning_rate": 1.3671875e-06, "loss": 0.8553, "step": 3338 }, { "epoch": 1.947790578970395, "grad_norm": 1.308634877204895, "learning_rate": 1.3521634615384617e-06, "loss": 0.7305, "step": 3339 }, { "epoch": 1.9483739244567595, "grad_norm": 1.4182379245758057, "learning_rate": 1.3371394230769231e-06, "loss": 0.6694, "step": 3340 }, { "epoch": 1.948957269943124, "grad_norm": 1.5073226690292358, "learning_rate": 1.3221153846153846e-06, "loss": 0.7478, "step": 3341 }, { "epoch": 1.949540615429488, "grad_norm": 1.1056112051010132, "learning_rate": 1.3070913461538463e-06, "loss": 0.9675, "step": 3342 }, { "epoch": 1.9501239609158523, "grad_norm": 1.3071699142456055, "learning_rate": 1.2920673076923077e-06, "loss": 0.9027, "step": 3343 }, { "epoch": 1.9507073064022167, "grad_norm": 1.2924457788467407, "learning_rate": 1.2770432692307692e-06, "loss": 0.9082, "step": 3344 }, { "epoch": 1.951290651888581, "grad_norm": 1.27849280834198, "learning_rate": 1.2620192307692309e-06, "loss": 0.8798, "step": 3345 }, { "epoch": 1.9518739973749453, "grad_norm": 1.2095881700515747, "learning_rate": 1.2469951923076924e-06, "loss": 0.9073, "step": 3346 }, { "epoch": 1.9524573428613095, "grad_norm": 1.2283076047897339, "learning_rate": 1.2319711538461538e-06, "loss": 0.6455, "step": 3347 }, { "epoch": 1.9530406883476739, "grad_norm": 1.241592526435852, "learning_rate": 1.2169471153846155e-06, "loss": 0.7726, "step": 3348 }, { "epoch": 1.9536240338340383, "grad_norm": 1.2830148935317993, "learning_rate": 1.201923076923077e-06, "loss": 0.7911, "step": 3349 }, { "epoch": 1.9542073793204024, "grad_norm": 1.2363495826721191, "learning_rate": 1.1868990384615384e-06, "loss": 0.7075, "step": 3350 }, { "epoch": 1.9547907248067669, "grad_norm": 1.2319027185440063, "learning_rate": 1.1718750000000001e-06, "loss": 0.8239, "step": 3351 }, { "epoch": 1.9553740702931313, "grad_norm": 1.2770158052444458, "learning_rate": 1.1568509615384616e-06, "loss": 0.6991, "step": 3352 }, { "epoch": 1.9559574157794954, "grad_norm": 1.1271159648895264, "learning_rate": 1.141826923076923e-06, "loss": 0.8684, "step": 3353 }, { "epoch": 1.9565407612658596, "grad_norm": 1.2182567119598389, "learning_rate": 1.1268028846153847e-06, "loss": 0.8585, "step": 3354 }, { "epoch": 1.957124106752224, "grad_norm": 1.2966188192367554, "learning_rate": 1.1117788461538462e-06, "loss": 0.7743, "step": 3355 }, { "epoch": 1.9577074522385884, "grad_norm": 1.308058261871338, "learning_rate": 1.0967548076923077e-06, "loss": 0.7612, "step": 3356 }, { "epoch": 1.9582907977249526, "grad_norm": 1.1872565746307373, "learning_rate": 1.0817307692307693e-06, "loss": 0.9376, "step": 3357 }, { "epoch": 1.9588741432113168, "grad_norm": 1.1720060110092163, "learning_rate": 1.0667067307692308e-06, "loss": 0.689, "step": 3358 }, { "epoch": 1.9594574886976812, "grad_norm": 1.202436923980713, "learning_rate": 1.0516826923076925e-06, "loss": 0.8309, "step": 3359 }, { "epoch": 1.9600408341840456, "grad_norm": 1.271864414215088, "learning_rate": 1.036658653846154e-06, "loss": 0.8216, "step": 3360 }, { "epoch": 1.9606241796704098, "grad_norm": 1.501631259918213, "learning_rate": 1.0216346153846154e-06, "loss": 0.9438, "step": 3361 }, { "epoch": 1.961207525156774, "grad_norm": 1.3683483600616455, "learning_rate": 1.006610576923077e-06, "loss": 0.8276, "step": 3362 }, { "epoch": 1.9617908706431384, "grad_norm": 1.202343225479126, "learning_rate": 9.915865384615386e-07, "loss": 0.682, "step": 3363 }, { "epoch": 1.9623742161295028, "grad_norm": 1.0876096487045288, "learning_rate": 9.765625e-07, "loss": 0.7981, "step": 3364 }, { "epoch": 1.962957561615867, "grad_norm": 1.5290825366973877, "learning_rate": 9.615384615384617e-07, "loss": 0.9378, "step": 3365 }, { "epoch": 1.9635409071022312, "grad_norm": 1.267568588256836, "learning_rate": 9.465144230769231e-07, "loss": 0.7676, "step": 3366 }, { "epoch": 1.9641242525885956, "grad_norm": 1.1771818399429321, "learning_rate": 9.314903846153846e-07, "loss": 0.6243, "step": 3367 }, { "epoch": 1.96470759807496, "grad_norm": 1.4345792531967163, "learning_rate": 9.164663461538462e-07, "loss": 0.7595, "step": 3368 }, { "epoch": 1.9652909435613242, "grad_norm": 2.9055094718933105, "learning_rate": 9.014423076923077e-07, "loss": 0.8694, "step": 3369 }, { "epoch": 1.9658742890476884, "grad_norm": 1.2087029218673706, "learning_rate": 8.864182692307692e-07, "loss": 0.8892, "step": 3370 }, { "epoch": 1.9664576345340528, "grad_norm": 1.1844966411590576, "learning_rate": 8.713942307692308e-07, "loss": 0.9044, "step": 3371 }, { "epoch": 1.9670409800204172, "grad_norm": 1.4420499801635742, "learning_rate": 8.563701923076924e-07, "loss": 0.7416, "step": 3372 }, { "epoch": 1.9676243255067813, "grad_norm": 1.2633183002471924, "learning_rate": 8.413461538461539e-07, "loss": 0.8739, "step": 3373 }, { "epoch": 1.9682076709931455, "grad_norm": 1.2410845756530762, "learning_rate": 8.263221153846154e-07, "loss": 0.9586, "step": 3374 }, { "epoch": 1.96879101647951, "grad_norm": 1.3219760656356812, "learning_rate": 8.11298076923077e-07, "loss": 0.8433, "step": 3375 }, { "epoch": 1.9693743619658743, "grad_norm": 1.2453432083129883, "learning_rate": 7.962740384615385e-07, "loss": 0.6931, "step": 3376 }, { "epoch": 1.9699577074522385, "grad_norm": 1.3527343273162842, "learning_rate": 7.8125e-07, "loss": 0.8611, "step": 3377 }, { "epoch": 1.970541052938603, "grad_norm": 1.1058260202407837, "learning_rate": 7.662259615384615e-07, "loss": 0.9306, "step": 3378 }, { "epoch": 1.9711243984249673, "grad_norm": 1.070041537284851, "learning_rate": 7.512019230769232e-07, "loss": 0.8113, "step": 3379 }, { "epoch": 1.9717077439113315, "grad_norm": 2.1356186866760254, "learning_rate": 7.361778846153846e-07, "loss": 0.7269, "step": 3380 }, { "epoch": 1.9722910893976957, "grad_norm": 1.2367362976074219, "learning_rate": 7.211538461538462e-07, "loss": 0.8238, "step": 3381 }, { "epoch": 1.9728744348840601, "grad_norm": 1.401431679725647, "learning_rate": 7.061298076923077e-07, "loss": 0.6869, "step": 3382 }, { "epoch": 1.9734577803704245, "grad_norm": 1.330978512763977, "learning_rate": 6.911057692307693e-07, "loss": 0.7662, "step": 3383 }, { "epoch": 1.9740411258567887, "grad_norm": 1.0215189456939697, "learning_rate": 6.760817307692308e-07, "loss": 0.7518, "step": 3384 }, { "epoch": 1.9746244713431529, "grad_norm": 1.2366408109664917, "learning_rate": 6.610576923076923e-07, "loss": 0.9705, "step": 3385 }, { "epoch": 1.9752078168295173, "grad_norm": 1.478739857673645, "learning_rate": 6.460336538461539e-07, "loss": 0.691, "step": 3386 }, { "epoch": 1.9757911623158817, "grad_norm": 1.1755001544952393, "learning_rate": 6.310096153846154e-07, "loss": 0.589, "step": 3387 }, { "epoch": 1.9763745078022459, "grad_norm": 1.2783706188201904, "learning_rate": 6.159855769230769e-07, "loss": 0.7802, "step": 3388 }, { "epoch": 1.97695785328861, "grad_norm": 1.1876357793807983, "learning_rate": 6.009615384615385e-07, "loss": 0.7924, "step": 3389 }, { "epoch": 1.9775411987749745, "grad_norm": 1.159026026725769, "learning_rate": 5.859375000000001e-07, "loss": 0.7157, "step": 3390 }, { "epoch": 1.9781245442613389, "grad_norm": 1.2553104162216187, "learning_rate": 5.709134615384615e-07, "loss": 0.7791, "step": 3391 }, { "epoch": 1.978707889747703, "grad_norm": 1.2597992420196533, "learning_rate": 5.558894230769231e-07, "loss": 0.6958, "step": 3392 }, { "epoch": 1.9792912352340672, "grad_norm": 1.3527934551239014, "learning_rate": 5.408653846153847e-07, "loss": 0.6869, "step": 3393 }, { "epoch": 1.9798745807204317, "grad_norm": 1.4447914361953735, "learning_rate": 5.258413461538462e-07, "loss": 0.6232, "step": 3394 }, { "epoch": 1.980457926206796, "grad_norm": 1.3879870176315308, "learning_rate": 5.108173076923077e-07, "loss": 0.7473, "step": 3395 }, { "epoch": 1.9810412716931602, "grad_norm": 1.2205619812011719, "learning_rate": 4.957932692307693e-07, "loss": 0.8624, "step": 3396 }, { "epoch": 1.9816246171795244, "grad_norm": 1.3446601629257202, "learning_rate": 4.807692307692308e-07, "loss": 0.6787, "step": 3397 }, { "epoch": 1.9822079626658888, "grad_norm": 1.372126579284668, "learning_rate": 4.657451923076923e-07, "loss": 0.658, "step": 3398 }, { "epoch": 1.9827913081522532, "grad_norm": 1.195761799812317, "learning_rate": 4.5072115384615384e-07, "loss": 0.8341, "step": 3399 }, { "epoch": 1.9833746536386174, "grad_norm": 1.2088286876678467, "learning_rate": 4.356971153846154e-07, "loss": 0.6969, "step": 3400 }, { "epoch": 1.9833746536386174, "eval_loss_squad": 0.8548139879200608, "eval_perplexity": 8.228127154549277, "eval_perplexity_reconstruct": 1.8982905574950977, "step": 3400 }, { "epoch": 1.9839579991249816, "grad_norm": 1.5069819688796997, "learning_rate": 4.2067307692307693e-07, "loss": 0.9078, "step": 3401 }, { "epoch": 1.984541344611346, "grad_norm": 1.4215034246444702, "learning_rate": 4.056490384615385e-07, "loss": 0.7893, "step": 3402 }, { "epoch": 1.9851246900977104, "grad_norm": 1.417895793914795, "learning_rate": 3.90625e-07, "loss": 0.705, "step": 3403 }, { "epoch": 1.9857080355840746, "grad_norm": 1.336230754852295, "learning_rate": 3.756009615384616e-07, "loss": 0.8535, "step": 3404 }, { "epoch": 1.986291381070439, "grad_norm": 1.057672142982483, "learning_rate": 3.605769230769231e-07, "loss": 0.7968, "step": 3405 }, { "epoch": 1.9868747265568034, "grad_norm": 1.924058198928833, "learning_rate": 3.4555288461538463e-07, "loss": 0.8941, "step": 3406 }, { "epoch": 1.9874580720431676, "grad_norm": 1.2566702365875244, "learning_rate": 3.3052884615384615e-07, "loss": 0.9751, "step": 3407 }, { "epoch": 1.9880414175295318, "grad_norm": 1.2347677946090698, "learning_rate": 3.155048076923077e-07, "loss": 0.9065, "step": 3408 }, { "epoch": 1.9886247630158962, "grad_norm": 1.202021837234497, "learning_rate": 3.0048076923076924e-07, "loss": 1.108, "step": 3409 }, { "epoch": 1.9892081085022606, "grad_norm": 1.1871834993362427, "learning_rate": 2.8545673076923076e-07, "loss": 0.7417, "step": 3410 }, { "epoch": 1.9897914539886248, "grad_norm": 1.482557773590088, "learning_rate": 2.7043269230769233e-07, "loss": 0.8867, "step": 3411 }, { "epoch": 1.990374799474989, "grad_norm": 1.1281570196151733, "learning_rate": 2.5540865384615385e-07, "loss": 0.7128, "step": 3412 }, { "epoch": 1.9909581449613534, "grad_norm": 1.2373967170715332, "learning_rate": 2.403846153846154e-07, "loss": 0.9376, "step": 3413 }, { "epoch": 1.9915414904477178, "grad_norm": 1.0291107892990112, "learning_rate": 2.2536057692307692e-07, "loss": 0.6016, "step": 3414 }, { "epoch": 1.992124835934082, "grad_norm": 1.2457456588745117, "learning_rate": 2.1033653846153846e-07, "loss": 0.7507, "step": 3415 }, { "epoch": 1.9927081814204461, "grad_norm": 1.7160253524780273, "learning_rate": 1.953125e-07, "loss": 0.6249, "step": 3416 }, { "epoch": 1.9932915269068106, "grad_norm": 1.178257703781128, "learning_rate": 1.8028846153846156e-07, "loss": 0.9351, "step": 3417 }, { "epoch": 1.993874872393175, "grad_norm": 1.1656147241592407, "learning_rate": 1.6526442307692307e-07, "loss": 0.7854, "step": 3418 }, { "epoch": 1.9944582178795391, "grad_norm": 1.1849141120910645, "learning_rate": 1.5024038461538462e-07, "loss": 0.7907, "step": 3419 }, { "epoch": 1.9950415633659033, "grad_norm": 1.764297366142273, "learning_rate": 1.3521634615384617e-07, "loss": 0.7752, "step": 3420 }, { "epoch": 1.9956249088522677, "grad_norm": 1.2948668003082275, "learning_rate": 1.201923076923077e-07, "loss": 0.7394, "step": 3421 }, { "epoch": 1.9962082543386321, "grad_norm": 1.251250982284546, "learning_rate": 1.0516826923076923e-07, "loss": 0.7703, "step": 3422 }, { "epoch": 1.9967915998249963, "grad_norm": 1.1011062860488892, "learning_rate": 9.014423076923078e-08, "loss": 0.7541, "step": 3423 }, { "epoch": 1.9973749453113605, "grad_norm": 1.1622389554977417, "learning_rate": 7.512019230769231e-08, "loss": 0.8255, "step": 3424 }, { "epoch": 1.997958290797725, "grad_norm": 1.1566705703735352, "learning_rate": 6.009615384615386e-08, "loss": 0.8762, "step": 3425 }, { "epoch": 1.9985416362840893, "grad_norm": 1.2555816173553467, "learning_rate": 4.507211538461539e-08, "loss": 0.7081, "step": 3426 }, { "epoch": 1.9991249817704535, "grad_norm": 1.1134846210479736, "learning_rate": 3.004807692307693e-08, "loss": 0.7877, "step": 3427 }, { "epoch": 1.9997083272568177, "grad_norm": 1.113712191581726, "learning_rate": 1.5024038461538464e-08, "loss": 0.9928, "step": 3428 } ], "logging_steps": 1.0, "max_steps": 3428, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 652, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6980351498544218e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }