{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.790545125537209, "eval_steps": 200, "global_step": 12800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 2.6337203979492188, "eval_runtime": 33.4429, "eval_samples_per_second": 22.247, "eval_steps_per_second": 5.562, "step": 0 }, { "epoch": 0.00045238633793259444, "grad_norm": 0.10917367786169052, "learning_rate": 1.5082956259426848e-08, "loss": 1.4416, "step": 1 }, { "epoch": 0.0009047726758651889, "grad_norm": 0.32779040932655334, "learning_rate": 3.0165912518853697e-08, "loss": 1.3339, "step": 2 }, { "epoch": 0.0013571590137977834, "grad_norm": 0.3854389488697052, "learning_rate": 4.524886877828055e-08, "loss": 1.5011, "step": 3 }, { "epoch": 0.0018095453517303778, "grad_norm": 0.4182969331741333, "learning_rate": 6.033182503770739e-08, "loss": 1.5091, "step": 4 }, { "epoch": 0.002261931689662972, "grad_norm": 0.4821797013282776, "learning_rate": 7.541478129713425e-08, "loss": 1.7845, "step": 5 }, { "epoch": 0.0027143180275955667, "grad_norm": 0.5031030774116516, "learning_rate": 9.04977375565611e-08, "loss": 1.6489, "step": 6 }, { "epoch": 0.003166704365528161, "grad_norm": 0.5237405300140381, "learning_rate": 1.0558069381598795e-07, "loss": 1.9452, "step": 7 }, { "epoch": 0.0036190907034607555, "grad_norm": 0.5683294534683228, "learning_rate": 1.2066365007541479e-07, "loss": 1.9705, "step": 8 }, { "epoch": 0.00407147704139335, "grad_norm": 0.5541937947273254, "learning_rate": 1.3574660633484163e-07, "loss": 1.8704, "step": 9 }, { "epoch": 0.004523863379325944, "grad_norm": 0.6250733137130737, "learning_rate": 1.508295625942685e-07, "loss": 2.0321, "step": 10 }, { "epoch": 0.0049762497172585385, "grad_norm": 0.655157208442688, "learning_rate": 1.6591251885369535e-07, "loss": 2.0302, "step": 11 }, { "epoch": 0.0054286360551911335, "grad_norm": 0.6597447395324707, "learning_rate": 1.809954751131222e-07, "loss": 2.0193, "step": 12 }, { "epoch": 0.005881022393123728, "grad_norm": 0.6683413982391357, "learning_rate": 1.9607843137254904e-07, "loss": 2.0977, "step": 13 }, { "epoch": 0.006333408731056322, "grad_norm": 0.7074043154716492, "learning_rate": 2.111613876319759e-07, "loss": 2.1781, "step": 14 }, { "epoch": 0.006785795068988917, "grad_norm": 0.7387233972549438, "learning_rate": 2.2624434389140273e-07, "loss": 2.1387, "step": 15 }, { "epoch": 0.007238181406921511, "grad_norm": 0.7269941568374634, "learning_rate": 2.4132730015082957e-07, "loss": 2.1224, "step": 16 }, { "epoch": 0.007690567744854105, "grad_norm": 0.7506342530250549, "learning_rate": 2.564102564102564e-07, "loss": 2.312, "step": 17 }, { "epoch": 0.0081429540827867, "grad_norm": 0.7814583778381348, "learning_rate": 2.7149321266968326e-07, "loss": 2.2944, "step": 18 }, { "epoch": 0.008595340420719294, "grad_norm": 0.7807725071907043, "learning_rate": 2.865761689291101e-07, "loss": 2.2407, "step": 19 }, { "epoch": 0.009047726758651889, "grad_norm": 0.8167830109596252, "learning_rate": 3.01659125188537e-07, "loss": 2.3212, "step": 20 }, { "epoch": 0.009500113096584483, "grad_norm": 0.822982907295227, "learning_rate": 3.167420814479638e-07, "loss": 2.4153, "step": 21 }, { "epoch": 0.009952499434517077, "grad_norm": 0.8619258403778076, "learning_rate": 3.318250377073907e-07, "loss": 2.4481, "step": 22 }, { "epoch": 0.010404885772449673, "grad_norm": 0.8413375616073608, "learning_rate": 3.4690799396681754e-07, "loss": 2.3866, "step": 23 }, { "epoch": 0.010857272110382267, "grad_norm": 0.8759726881980896, "learning_rate": 3.619909502262444e-07, "loss": 2.4725, "step": 24 }, { "epoch": 0.011309658448314861, "grad_norm": 0.9024650454521179, "learning_rate": 3.770739064856712e-07, "loss": 2.5221, "step": 25 }, { "epoch": 0.011762044786247455, "grad_norm": 0.921983540058136, "learning_rate": 3.921568627450981e-07, "loss": 2.6753, "step": 26 }, { "epoch": 0.01221443112418005, "grad_norm": 0.9234832525253296, "learning_rate": 4.072398190045249e-07, "loss": 2.4699, "step": 27 }, { "epoch": 0.012666817462112644, "grad_norm": 0.9715510010719299, "learning_rate": 4.223227752639518e-07, "loss": 2.6709, "step": 28 }, { "epoch": 0.013119203800045238, "grad_norm": 0.9828589558601379, "learning_rate": 4.374057315233786e-07, "loss": 2.7204, "step": 29 }, { "epoch": 0.013571590137977834, "grad_norm": 0.9608268141746521, "learning_rate": 4.5248868778280546e-07, "loss": 2.6052, "step": 30 }, { "epoch": 0.014023976475910428, "grad_norm": 1.0252982378005981, "learning_rate": 4.675716440422323e-07, "loss": 2.8655, "step": 31 }, { "epoch": 0.014476362813843022, "grad_norm": 1.0586740970611572, "learning_rate": 4.826546003016591e-07, "loss": 2.8088, "step": 32 }, { "epoch": 0.014928749151775616, "grad_norm": 1.101237416267395, "learning_rate": 4.977375565610859e-07, "loss": 2.8921, "step": 33 }, { "epoch": 0.01538113548970821, "grad_norm": 1.0488309860229492, "learning_rate": 5.128205128205128e-07, "loss": 2.8521, "step": 34 }, { "epoch": 0.015833521827640806, "grad_norm": 1.1078431606292725, "learning_rate": 5.279034690799397e-07, "loss": 2.9228, "step": 35 }, { "epoch": 0.0162859081655734, "grad_norm": 1.0988504886627197, "learning_rate": 5.429864253393665e-07, "loss": 3.0071, "step": 36 }, { "epoch": 0.016738294503505995, "grad_norm": 1.147328495979309, "learning_rate": 5.580693815987934e-07, "loss": 3.0063, "step": 37 }, { "epoch": 0.01719068084143859, "grad_norm": 1.2106767892837524, "learning_rate": 5.731523378582202e-07, "loss": 3.0507, "step": 38 }, { "epoch": 0.017643067179371183, "grad_norm": 1.2379810810089111, "learning_rate": 5.882352941176471e-07, "loss": 3.1004, "step": 39 }, { "epoch": 0.018095453517303777, "grad_norm": 1.2583941221237183, "learning_rate": 6.03318250377074e-07, "loss": 3.1946, "step": 40 }, { "epoch": 0.01854783985523637, "grad_norm": 1.2818472385406494, "learning_rate": 6.184012066365008e-07, "loss": 3.2485, "step": 41 }, { "epoch": 0.019000226193168965, "grad_norm": 1.2959933280944824, "learning_rate": 6.334841628959276e-07, "loss": 3.2955, "step": 42 }, { "epoch": 0.01945261253110156, "grad_norm": 1.2796090841293335, "learning_rate": 6.485671191553546e-07, "loss": 3.3851, "step": 43 }, { "epoch": 0.019904998869034154, "grad_norm": 1.3375236988067627, "learning_rate": 6.636500754147814e-07, "loss": 3.3483, "step": 44 }, { "epoch": 0.020357385206966748, "grad_norm": 1.3742619752883911, "learning_rate": 6.787330316742082e-07, "loss": 3.3293, "step": 45 }, { "epoch": 0.020809771544899346, "grad_norm": 1.4002019166946411, "learning_rate": 6.938159879336351e-07, "loss": 3.5016, "step": 46 }, { "epoch": 0.02126215788283194, "grad_norm": 1.427987813949585, "learning_rate": 7.088989441930619e-07, "loss": 3.5851, "step": 47 }, { "epoch": 0.021714544220764534, "grad_norm": 1.511704683303833, "learning_rate": 7.239819004524888e-07, "loss": 3.6868, "step": 48 }, { "epoch": 0.022166930558697128, "grad_norm": 1.5928444862365723, "learning_rate": 7.390648567119156e-07, "loss": 3.8425, "step": 49 }, { "epoch": 0.022619316896629722, "grad_norm": 1.749611735343933, "learning_rate": 7.541478129713424e-07, "loss": 4.0948, "step": 50 }, { "epoch": 0.023071703234562316, "grad_norm": 0.09442979097366333, "learning_rate": 7.692307692307694e-07, "loss": 1.1209, "step": 51 }, { "epoch": 0.02352408957249491, "grad_norm": 0.1790979504585266, "learning_rate": 7.843137254901962e-07, "loss": 1.6726, "step": 52 }, { "epoch": 0.023976475910427505, "grad_norm": 0.4066171944141388, "learning_rate": 7.993966817496229e-07, "loss": 1.5858, "step": 53 }, { "epoch": 0.0244288622483601, "grad_norm": 0.4263452887535095, "learning_rate": 8.144796380090498e-07, "loss": 1.5984, "step": 54 }, { "epoch": 0.024881248586292693, "grad_norm": 0.47097599506378174, "learning_rate": 8.295625942684766e-07, "loss": 1.6613, "step": 55 }, { "epoch": 0.025333634924225287, "grad_norm": 0.5136803984642029, "learning_rate": 8.446455505279036e-07, "loss": 1.663, "step": 56 }, { "epoch": 0.02578602126215788, "grad_norm": 0.5091128945350647, "learning_rate": 8.597285067873304e-07, "loss": 1.9515, "step": 57 }, { "epoch": 0.026238407600090476, "grad_norm": 0.5549827814102173, "learning_rate": 8.748114630467572e-07, "loss": 1.8162, "step": 58 }, { "epoch": 0.026690793938023073, "grad_norm": 0.5529032945632935, "learning_rate": 8.898944193061841e-07, "loss": 1.6616, "step": 59 }, { "epoch": 0.027143180275955667, "grad_norm": 0.6001793146133423, "learning_rate": 9.049773755656109e-07, "loss": 1.8955, "step": 60 }, { "epoch": 0.02759556661388826, "grad_norm": 0.6293895840644836, "learning_rate": 9.200603318250378e-07, "loss": 1.9051, "step": 61 }, { "epoch": 0.028047952951820856, "grad_norm": 0.6421718597412109, "learning_rate": 9.351432880844646e-07, "loss": 2.016, "step": 62 }, { "epoch": 0.02850033928975345, "grad_norm": 0.6873517632484436, "learning_rate": 9.502262443438914e-07, "loss": 2.0059, "step": 63 }, { "epoch": 0.028952725627686044, "grad_norm": 0.7199680805206299, "learning_rate": 9.653092006033183e-07, "loss": 2.0199, "step": 64 }, { "epoch": 0.02940511196561864, "grad_norm": 0.7532120943069458, "learning_rate": 9.80392156862745e-07, "loss": 2.1125, "step": 65 }, { "epoch": 0.029857498303551232, "grad_norm": 0.7211053371429443, "learning_rate": 9.954751131221719e-07, "loss": 2.1524, "step": 66 }, { "epoch": 0.030309884641483827, "grad_norm": 0.8080956339836121, "learning_rate": 1.0105580693815989e-06, "loss": 2.2124, "step": 67 }, { "epoch": 0.03076227097941642, "grad_norm": 0.7639372944831848, "learning_rate": 1.0256410256410257e-06, "loss": 2.403, "step": 68 }, { "epoch": 0.031214657317349015, "grad_norm": 0.8137151598930359, "learning_rate": 1.0407239819004527e-06, "loss": 2.2809, "step": 69 }, { "epoch": 0.03166704365528161, "grad_norm": 0.8153931498527527, "learning_rate": 1.0558069381598795e-06, "loss": 2.2654, "step": 70 }, { "epoch": 0.0321194299932142, "grad_norm": 0.8634194731712341, "learning_rate": 1.0708898944193063e-06, "loss": 2.3705, "step": 71 }, { "epoch": 0.0325718163311468, "grad_norm": 0.8791585564613342, "learning_rate": 1.085972850678733e-06, "loss": 2.2649, "step": 72 }, { "epoch": 0.03302420266907939, "grad_norm": 0.8438045978546143, "learning_rate": 1.1010558069381598e-06, "loss": 2.379, "step": 73 }, { "epoch": 0.03347658900701199, "grad_norm": 0.9520341753959656, "learning_rate": 1.1161387631975868e-06, "loss": 2.4034, "step": 74 }, { "epoch": 0.03392897534494458, "grad_norm": 0.9628412127494812, "learning_rate": 1.1312217194570136e-06, "loss": 2.4297, "step": 75 }, { "epoch": 0.03438136168287718, "grad_norm": 0.9521026015281677, "learning_rate": 1.1463046757164404e-06, "loss": 2.4718, "step": 76 }, { "epoch": 0.03483374802080977, "grad_norm": 1.0074130296707153, "learning_rate": 1.1613876319758674e-06, "loss": 2.5501, "step": 77 }, { "epoch": 0.035286134358742366, "grad_norm": 1.0088969469070435, "learning_rate": 1.1764705882352942e-06, "loss": 2.5019, "step": 78 }, { "epoch": 0.035738520696674964, "grad_norm": 1.022720217704773, "learning_rate": 1.1915535444947212e-06, "loss": 2.6745, "step": 79 }, { "epoch": 0.036190907034607554, "grad_norm": 1.0624380111694336, "learning_rate": 1.206636500754148e-06, "loss": 2.7079, "step": 80 }, { "epoch": 0.03664329337254015, "grad_norm": 1.0446101427078247, "learning_rate": 1.2217194570135748e-06, "loss": 2.7244, "step": 81 }, { "epoch": 0.03709567971047274, "grad_norm": 1.1218805313110352, "learning_rate": 1.2368024132730016e-06, "loss": 2.6608, "step": 82 }, { "epoch": 0.03754806604840534, "grad_norm": 1.1701157093048096, "learning_rate": 1.2518853695324284e-06, "loss": 2.8348, "step": 83 }, { "epoch": 0.03800045238633793, "grad_norm": 1.1630805730819702, "learning_rate": 1.2669683257918552e-06, "loss": 2.8945, "step": 84 }, { "epoch": 0.03845283872427053, "grad_norm": 1.2108711004257202, "learning_rate": 1.282051282051282e-06, "loss": 2.901, "step": 85 }, { "epoch": 0.03890522506220312, "grad_norm": 1.2672454118728638, "learning_rate": 1.2971342383107092e-06, "loss": 2.9184, "step": 86 }, { "epoch": 0.03935761140013572, "grad_norm": 1.3102805614471436, "learning_rate": 1.312217194570136e-06, "loss": 2.8933, "step": 87 }, { "epoch": 0.03980999773806831, "grad_norm": 1.3302671909332275, "learning_rate": 1.3273001508295628e-06, "loss": 2.9944, "step": 88 }, { "epoch": 0.040262384076000905, "grad_norm": 1.3279258012771606, "learning_rate": 1.3423831070889896e-06, "loss": 2.9955, "step": 89 }, { "epoch": 0.040714770413933496, "grad_norm": 1.361893892288208, "learning_rate": 1.3574660633484164e-06, "loss": 3.1767, "step": 90 }, { "epoch": 0.041167156751866094, "grad_norm": 1.4249252080917358, "learning_rate": 1.3725490196078434e-06, "loss": 3.04, "step": 91 }, { "epoch": 0.04161954308979869, "grad_norm": 1.3713250160217285, "learning_rate": 1.3876319758672702e-06, "loss": 3.1393, "step": 92 }, { "epoch": 0.04207192942773128, "grad_norm": 1.5161356925964355, "learning_rate": 1.402714932126697e-06, "loss": 3.2194, "step": 93 }, { "epoch": 0.04252431576566388, "grad_norm": 1.5405185222625732, "learning_rate": 1.4177978883861237e-06, "loss": 3.3091, "step": 94 }, { "epoch": 0.04297670210359647, "grad_norm": 1.5984642505645752, "learning_rate": 1.4328808446455505e-06, "loss": 3.2101, "step": 95 }, { "epoch": 0.04342908844152907, "grad_norm": 1.664920449256897, "learning_rate": 1.4479638009049775e-06, "loss": 3.3788, "step": 96 }, { "epoch": 0.04388147477946166, "grad_norm": 1.6275672912597656, "learning_rate": 1.4630467571644043e-06, "loss": 3.5511, "step": 97 }, { "epoch": 0.044333861117394256, "grad_norm": 1.7485661506652832, "learning_rate": 1.4781297134238311e-06, "loss": 3.6812, "step": 98 }, { "epoch": 0.04478624745532685, "grad_norm": 1.8033849000930786, "learning_rate": 1.493212669683258e-06, "loss": 3.7155, "step": 99 }, { "epoch": 0.045238633793259445, "grad_norm": 2.1944878101348877, "learning_rate": 1.5082956259426847e-06, "loss": 3.9791, "step": 100 }, { "epoch": 0.045691020131192035, "grad_norm": 0.18038256466388702, "learning_rate": 1.5233785822021115e-06, "loss": 1.7212, "step": 101 }, { "epoch": 0.04614340646912463, "grad_norm": 0.32883980870246887, "learning_rate": 1.5384615384615387e-06, "loss": 1.3222, "step": 102 }, { "epoch": 0.046595792807057224, "grad_norm": 0.4075303077697754, "learning_rate": 1.5535444947209655e-06, "loss": 1.3425, "step": 103 }, { "epoch": 0.04704817914498982, "grad_norm": 0.4424018859863281, "learning_rate": 1.5686274509803923e-06, "loss": 1.408, "step": 104 }, { "epoch": 0.04750056548292242, "grad_norm": 0.5053431391716003, "learning_rate": 1.583710407239819e-06, "loss": 1.2689, "step": 105 }, { "epoch": 0.04795295182085501, "grad_norm": 0.5555182099342346, "learning_rate": 1.5987933634992459e-06, "loss": 1.5556, "step": 106 }, { "epoch": 0.04840533815878761, "grad_norm": 0.6155335903167725, "learning_rate": 1.6138763197586729e-06, "loss": 1.5952, "step": 107 }, { "epoch": 0.0488577244967202, "grad_norm": 0.6562772989273071, "learning_rate": 1.6289592760180997e-06, "loss": 1.7903, "step": 108 }, { "epoch": 0.049310110834652796, "grad_norm": 0.6916475296020508, "learning_rate": 1.6440422322775265e-06, "loss": 1.6384, "step": 109 }, { "epoch": 0.049762497172585386, "grad_norm": 0.6897794604301453, "learning_rate": 1.6591251885369533e-06, "loss": 1.7174, "step": 110 }, { "epoch": 0.050214883510517984, "grad_norm": 0.7198401689529419, "learning_rate": 1.67420814479638e-06, "loss": 1.9503, "step": 111 }, { "epoch": 0.050667269848450575, "grad_norm": 0.7694827318191528, "learning_rate": 1.6892911010558073e-06, "loss": 1.8035, "step": 112 }, { "epoch": 0.05111965618638317, "grad_norm": 0.7877671718597412, "learning_rate": 1.704374057315234e-06, "loss": 1.8598, "step": 113 }, { "epoch": 0.05157204252431576, "grad_norm": 0.8116180896759033, "learning_rate": 1.7194570135746609e-06, "loss": 1.9961, "step": 114 }, { "epoch": 0.05202442886224836, "grad_norm": 0.8146136999130249, "learning_rate": 1.7345399698340876e-06, "loss": 1.9419, "step": 115 }, { "epoch": 0.05247681520018095, "grad_norm": 0.7880291938781738, "learning_rate": 1.7496229260935144e-06, "loss": 1.9047, "step": 116 }, { "epoch": 0.05292920153811355, "grad_norm": 0.7874240875244141, "learning_rate": 1.7647058823529414e-06, "loss": 1.9559, "step": 117 }, { "epoch": 0.053381587876046147, "grad_norm": 0.7686619162559509, "learning_rate": 1.7797888386123682e-06, "loss": 1.9546, "step": 118 }, { "epoch": 0.05383397421397874, "grad_norm": 0.7842103242874146, "learning_rate": 1.794871794871795e-06, "loss": 2.1373, "step": 119 }, { "epoch": 0.054286360551911335, "grad_norm": 0.7664967179298401, "learning_rate": 1.8099547511312218e-06, "loss": 2.0468, "step": 120 }, { "epoch": 0.054738746889843926, "grad_norm": 0.6990913152694702, "learning_rate": 1.8250377073906486e-06, "loss": 2.2737, "step": 121 }, { "epoch": 0.05519113322777652, "grad_norm": 0.7262146472930908, "learning_rate": 1.8401206636500756e-06, "loss": 2.0078, "step": 122 }, { "epoch": 0.055643519565709114, "grad_norm": 0.7726587653160095, "learning_rate": 1.8552036199095024e-06, "loss": 2.2453, "step": 123 }, { "epoch": 0.05609590590364171, "grad_norm": 0.751103937625885, "learning_rate": 1.8702865761689292e-06, "loss": 2.1787, "step": 124 }, { "epoch": 0.0565482922415743, "grad_norm": 0.751914381980896, "learning_rate": 1.885369532428356e-06, "loss": 2.1497, "step": 125 }, { "epoch": 0.0570006785795069, "grad_norm": 0.7957012057304382, "learning_rate": 1.9004524886877828e-06, "loss": 2.191, "step": 126 }, { "epoch": 0.05745306491743949, "grad_norm": 0.8076576590538025, "learning_rate": 1.91553544494721e-06, "loss": 2.18, "step": 127 }, { "epoch": 0.05790545125537209, "grad_norm": 0.7427197098731995, "learning_rate": 1.9306184012066366e-06, "loss": 2.1301, "step": 128 }, { "epoch": 0.05835783759330468, "grad_norm": 0.7992486357688904, "learning_rate": 1.9457013574660634e-06, "loss": 2.1901, "step": 129 }, { "epoch": 0.05881022393123728, "grad_norm": 0.8398369550704956, "learning_rate": 1.96078431372549e-06, "loss": 2.249, "step": 130 }, { "epoch": 0.059262610269169874, "grad_norm": 0.7970113158226013, "learning_rate": 1.975867269984917e-06, "loss": 2.2561, "step": 131 }, { "epoch": 0.059714996607102465, "grad_norm": 0.8340783715248108, "learning_rate": 1.9909502262443437e-06, "loss": 2.2672, "step": 132 }, { "epoch": 0.06016738294503506, "grad_norm": 0.8238533735275269, "learning_rate": 2.006033182503771e-06, "loss": 2.3234, "step": 133 }, { "epoch": 0.06061976928296765, "grad_norm": 0.8649453520774841, "learning_rate": 2.0211161387631978e-06, "loss": 2.3503, "step": 134 }, { "epoch": 0.06107215562090025, "grad_norm": 0.8898661136627197, "learning_rate": 2.0361990950226245e-06, "loss": 2.419, "step": 135 }, { "epoch": 0.06152454195883284, "grad_norm": 0.8773770928382874, "learning_rate": 2.0512820512820513e-06, "loss": 2.3279, "step": 136 }, { "epoch": 0.06197692829676544, "grad_norm": 0.8952063918113708, "learning_rate": 2.066365007541478e-06, "loss": 2.4266, "step": 137 }, { "epoch": 0.06242931463469803, "grad_norm": 0.9591676592826843, "learning_rate": 2.0814479638009053e-06, "loss": 2.4362, "step": 138 }, { "epoch": 0.06288170097263063, "grad_norm": 0.9695006012916565, "learning_rate": 2.096530920060332e-06, "loss": 2.4726, "step": 139 }, { "epoch": 0.06333408731056323, "grad_norm": 0.9859821796417236, "learning_rate": 2.111613876319759e-06, "loss": 2.4446, "step": 140 }, { "epoch": 0.06378647364849581, "grad_norm": 1.009279489517212, "learning_rate": 2.1266968325791857e-06, "loss": 2.521, "step": 141 }, { "epoch": 0.0642388599864284, "grad_norm": 1.0488793849945068, "learning_rate": 2.1417797888386125e-06, "loss": 2.5219, "step": 142 }, { "epoch": 0.064691246324361, "grad_norm": 1.057810664176941, "learning_rate": 2.1568627450980393e-06, "loss": 2.6104, "step": 143 }, { "epoch": 0.0651436326622936, "grad_norm": 1.1015841960906982, "learning_rate": 2.171945701357466e-06, "loss": 2.6302, "step": 144 }, { "epoch": 0.0655960190002262, "grad_norm": 1.14084792137146, "learning_rate": 2.187028657616893e-06, "loss": 2.5879, "step": 145 }, { "epoch": 0.06604840533815878, "grad_norm": 1.121626377105713, "learning_rate": 2.2021116138763197e-06, "loss": 2.9141, "step": 146 }, { "epoch": 0.06650079167609138, "grad_norm": 1.1546727418899536, "learning_rate": 2.2171945701357465e-06, "loss": 2.9326, "step": 147 }, { "epoch": 0.06695317801402398, "grad_norm": 1.296535849571228, "learning_rate": 2.2322775263951737e-06, "loss": 3.0131, "step": 148 }, { "epoch": 0.06740556435195658, "grad_norm": 1.30239737033844, "learning_rate": 2.2473604826546005e-06, "loss": 3.1553, "step": 149 }, { "epoch": 0.06785795068988916, "grad_norm": 1.4067734479904175, "learning_rate": 2.2624434389140273e-06, "loss": 3.3319, "step": 150 }, { "epoch": 0.06831033702782176, "grad_norm": 0.10112728178501129, "learning_rate": 2.277526395173454e-06, "loss": 1.2496, "step": 151 }, { "epoch": 0.06876272336575436, "grad_norm": 0.2627163529396057, "learning_rate": 2.292609351432881e-06, "loss": 1.0356, "step": 152 }, { "epoch": 0.06921510970368695, "grad_norm": 0.32027748227119446, "learning_rate": 2.307692307692308e-06, "loss": 1.2338, "step": 153 }, { "epoch": 0.06966749604161954, "grad_norm": 0.348748117685318, "learning_rate": 2.322775263951735e-06, "loss": 1.4592, "step": 154 }, { "epoch": 0.07011988237955213, "grad_norm": 0.39392125606536865, "learning_rate": 2.3378582202111617e-06, "loss": 1.3018, "step": 155 }, { "epoch": 0.07057226871748473, "grad_norm": 0.42504221200942993, "learning_rate": 2.3529411764705885e-06, "loss": 1.4482, "step": 156 }, { "epoch": 0.07102465505541733, "grad_norm": 0.45873239636421204, "learning_rate": 2.3680241327300152e-06, "loss": 1.5533, "step": 157 }, { "epoch": 0.07147704139334993, "grad_norm": 0.4763510227203369, "learning_rate": 2.3831070889894425e-06, "loss": 1.4889, "step": 158 }, { "epoch": 0.07192942773128251, "grad_norm": 0.5064267516136169, "learning_rate": 2.3981900452488693e-06, "loss": 1.5821, "step": 159 }, { "epoch": 0.07238181406921511, "grad_norm": 0.5164059400558472, "learning_rate": 2.413273001508296e-06, "loss": 1.7133, "step": 160 }, { "epoch": 0.0728342004071477, "grad_norm": 0.5167456269264221, "learning_rate": 2.428355957767723e-06, "loss": 1.6008, "step": 161 }, { "epoch": 0.0732865867450803, "grad_norm": 0.6022229790687561, "learning_rate": 2.4434389140271496e-06, "loss": 1.6699, "step": 162 }, { "epoch": 0.07373897308301289, "grad_norm": 0.6219989061355591, "learning_rate": 2.4585218702865764e-06, "loss": 1.6593, "step": 163 }, { "epoch": 0.07419135942094549, "grad_norm": 0.6184532046318054, "learning_rate": 2.4736048265460032e-06, "loss": 1.6285, "step": 164 }, { "epoch": 0.07464374575887808, "grad_norm": 0.6529053449630737, "learning_rate": 2.48868778280543e-06, "loss": 1.7135, "step": 165 }, { "epoch": 0.07509613209681068, "grad_norm": 0.6892828941345215, "learning_rate": 2.503770739064857e-06, "loss": 1.7363, "step": 166 }, { "epoch": 0.07554851843474326, "grad_norm": 0.6974689364433289, "learning_rate": 2.5188536953242836e-06, "loss": 1.5484, "step": 167 }, { "epoch": 0.07600090477267586, "grad_norm": 0.7144427299499512, "learning_rate": 2.5339366515837104e-06, "loss": 1.7503, "step": 168 }, { "epoch": 0.07645329111060846, "grad_norm": 0.7137396335601807, "learning_rate": 2.549019607843137e-06, "loss": 1.6568, "step": 169 }, { "epoch": 0.07690567744854106, "grad_norm": 0.7356016039848328, "learning_rate": 2.564102564102564e-06, "loss": 1.6525, "step": 170 }, { "epoch": 0.07735806378647365, "grad_norm": 0.769875705242157, "learning_rate": 2.5791855203619916e-06, "loss": 1.748, "step": 171 }, { "epoch": 0.07781045012440624, "grad_norm": 0.7793062329292297, "learning_rate": 2.5942684766214184e-06, "loss": 1.6969, "step": 172 }, { "epoch": 0.07826283646233884, "grad_norm": 0.7778929471969604, "learning_rate": 2.609351432880845e-06, "loss": 1.7166, "step": 173 }, { "epoch": 0.07871522280027143, "grad_norm": 0.8175855278968811, "learning_rate": 2.624434389140272e-06, "loss": 1.8167, "step": 174 }, { "epoch": 0.07916760913820403, "grad_norm": 0.8316064476966858, "learning_rate": 2.6395173453996988e-06, "loss": 1.8311, "step": 175 }, { "epoch": 0.07961999547613662, "grad_norm": 0.8849754333496094, "learning_rate": 2.6546003016591256e-06, "loss": 1.7604, "step": 176 }, { "epoch": 0.08007238181406921, "grad_norm": 0.870646059513092, "learning_rate": 2.6696832579185524e-06, "loss": 1.8972, "step": 177 }, { "epoch": 0.08052476815200181, "grad_norm": 0.8384636044502258, "learning_rate": 2.684766214177979e-06, "loss": 1.7912, "step": 178 }, { "epoch": 0.08097715448993441, "grad_norm": 0.8384966850280762, "learning_rate": 2.699849170437406e-06, "loss": 1.8524, "step": 179 }, { "epoch": 0.08142954082786699, "grad_norm": 0.8320783376693726, "learning_rate": 2.7149321266968327e-06, "loss": 1.7607, "step": 180 }, { "epoch": 0.08188192716579959, "grad_norm": 0.8649104237556458, "learning_rate": 2.7300150829562595e-06, "loss": 1.7818, "step": 181 }, { "epoch": 0.08233431350373219, "grad_norm": 0.822292149066925, "learning_rate": 2.7450980392156867e-06, "loss": 1.8561, "step": 182 }, { "epoch": 0.08278669984166478, "grad_norm": 0.8105872869491577, "learning_rate": 2.7601809954751135e-06, "loss": 1.8544, "step": 183 }, { "epoch": 0.08323908617959738, "grad_norm": 0.8259276151657104, "learning_rate": 2.7752639517345403e-06, "loss": 1.8873, "step": 184 }, { "epoch": 0.08369147251752997, "grad_norm": 0.803145706653595, "learning_rate": 2.790346907993967e-06, "loss": 1.9109, "step": 185 }, { "epoch": 0.08414385885546256, "grad_norm": 0.8151469230651855, "learning_rate": 2.805429864253394e-06, "loss": 1.9108, "step": 186 }, { "epoch": 0.08459624519339516, "grad_norm": 0.8308303952217102, "learning_rate": 2.8205128205128207e-06, "loss": 1.7894, "step": 187 }, { "epoch": 0.08504863153132776, "grad_norm": 0.8311594128608704, "learning_rate": 2.8355957767722475e-06, "loss": 1.7596, "step": 188 }, { "epoch": 0.08550101786926034, "grad_norm": 0.8182538747787476, "learning_rate": 2.8506787330316743e-06, "loss": 1.8171, "step": 189 }, { "epoch": 0.08595340420719294, "grad_norm": 0.860839307308197, "learning_rate": 2.865761689291101e-06, "loss": 1.8502, "step": 190 }, { "epoch": 0.08640579054512554, "grad_norm": 0.7802768349647522, "learning_rate": 2.880844645550528e-06, "loss": 1.9433, "step": 191 }, { "epoch": 0.08685817688305814, "grad_norm": 0.9046308398246765, "learning_rate": 2.895927601809955e-06, "loss": 2.0078, "step": 192 }, { "epoch": 0.08731056322099072, "grad_norm": 0.8739044070243835, "learning_rate": 2.911010558069382e-06, "loss": 1.9495, "step": 193 }, { "epoch": 0.08776294955892332, "grad_norm": 0.8603385090827942, "learning_rate": 2.9260935143288087e-06, "loss": 1.931, "step": 194 }, { "epoch": 0.08821533589685591, "grad_norm": 0.891846776008606, "learning_rate": 2.9411764705882355e-06, "loss": 1.9977, "step": 195 }, { "epoch": 0.08866772223478851, "grad_norm": 0.8506848216056824, "learning_rate": 2.9562594268476623e-06, "loss": 2.0204, "step": 196 }, { "epoch": 0.08912010857272111, "grad_norm": 0.9019345045089722, "learning_rate": 2.971342383107089e-06, "loss": 2.1948, "step": 197 }, { "epoch": 0.0895724949106537, "grad_norm": 0.9063124656677246, "learning_rate": 2.986425339366516e-06, "loss": 2.1322, "step": 198 }, { "epoch": 0.09002488124858629, "grad_norm": 0.9489585757255554, "learning_rate": 3.0015082956259426e-06, "loss": 2.0869, "step": 199 }, { "epoch": 0.09047726758651889, "grad_norm": 0.9446021914482117, "learning_rate": 3.0165912518853694e-06, "loss": 2.1444, "step": 200 }, { "epoch": 0.09047726758651889, "eval_loss": 1.6063311100006104, "eval_runtime": 28.4884, "eval_samples_per_second": 26.116, "eval_steps_per_second": 6.529, "step": 200 }, { "epoch": 0.09092965392445149, "grad_norm": 0.10284460335969925, "learning_rate": 3.0316742081447962e-06, "loss": 1.2271, "step": 201 }, { "epoch": 0.09138204026238407, "grad_norm": 0.18311525881290436, "learning_rate": 3.046757164404223e-06, "loss": 1.1073, "step": 202 }, { "epoch": 0.09183442660031667, "grad_norm": 0.2547847032546997, "learning_rate": 3.0618401206636506e-06, "loss": 1.2449, "step": 203 }, { "epoch": 0.09228681293824927, "grad_norm": 0.26365217566490173, "learning_rate": 3.0769230769230774e-06, "loss": 1.2136, "step": 204 }, { "epoch": 0.09273919927618186, "grad_norm": 0.28396734595298767, "learning_rate": 3.0920060331825042e-06, "loss": 1.1749, "step": 205 }, { "epoch": 0.09319158561411445, "grad_norm": 0.29910004138946533, "learning_rate": 3.107088989441931e-06, "loss": 1.1401, "step": 206 }, { "epoch": 0.09364397195204704, "grad_norm": 0.31042546033859253, "learning_rate": 3.122171945701358e-06, "loss": 1.1013, "step": 207 }, { "epoch": 0.09409635828997964, "grad_norm": 0.31989386677742004, "learning_rate": 3.1372549019607846e-06, "loss": 1.1356, "step": 208 }, { "epoch": 0.09454874462791224, "grad_norm": 0.33571121096611023, "learning_rate": 3.1523378582202114e-06, "loss": 1.2741, "step": 209 }, { "epoch": 0.09500113096584484, "grad_norm": 0.35753700137138367, "learning_rate": 3.167420814479638e-06, "loss": 1.3804, "step": 210 }, { "epoch": 0.09545351730377742, "grad_norm": 0.37398552894592285, "learning_rate": 3.182503770739065e-06, "loss": 1.2796, "step": 211 }, { "epoch": 0.09590590364171002, "grad_norm": 0.39447376132011414, "learning_rate": 3.1975867269984918e-06, "loss": 1.3809, "step": 212 }, { "epoch": 0.09635828997964262, "grad_norm": 0.4046800136566162, "learning_rate": 3.212669683257919e-06, "loss": 1.4323, "step": 213 }, { "epoch": 0.09681067631757521, "grad_norm": 0.394756019115448, "learning_rate": 3.2277526395173458e-06, "loss": 1.2724, "step": 214 }, { "epoch": 0.0972630626555078, "grad_norm": 0.3865809440612793, "learning_rate": 3.2428355957767726e-06, "loss": 1.2602, "step": 215 }, { "epoch": 0.0977154489934404, "grad_norm": 0.44278863072395325, "learning_rate": 3.2579185520361994e-06, "loss": 1.3541, "step": 216 }, { "epoch": 0.098167835331373, "grad_norm": 0.4161834418773651, "learning_rate": 3.273001508295626e-06, "loss": 1.2901, "step": 217 }, { "epoch": 0.09862022166930559, "grad_norm": 0.43557918071746826, "learning_rate": 3.288084464555053e-06, "loss": 1.3813, "step": 218 }, { "epoch": 0.09907260800723817, "grad_norm": 0.43977466225624084, "learning_rate": 3.3031674208144797e-06, "loss": 1.2976, "step": 219 }, { "epoch": 0.09952499434517077, "grad_norm": 0.4143262505531311, "learning_rate": 3.3182503770739065e-06, "loss": 1.3999, "step": 220 }, { "epoch": 0.09997738068310337, "grad_norm": 0.4420431852340698, "learning_rate": 3.3333333333333333e-06, "loss": 1.3603, "step": 221 }, { "epoch": 0.10042976702103597, "grad_norm": 0.4413033723831177, "learning_rate": 3.34841628959276e-06, "loss": 1.3513, "step": 222 }, { "epoch": 0.10088215335896857, "grad_norm": 0.4599975049495697, "learning_rate": 3.3634992458521878e-06, "loss": 1.3114, "step": 223 }, { "epoch": 0.10133453969690115, "grad_norm": 0.46234825253486633, "learning_rate": 3.3785822021116145e-06, "loss": 1.2637, "step": 224 }, { "epoch": 0.10178692603483375, "grad_norm": 0.45369648933410645, "learning_rate": 3.3936651583710413e-06, "loss": 1.3037, "step": 225 }, { "epoch": 0.10223931237276634, "grad_norm": 0.4890686869621277, "learning_rate": 3.408748114630468e-06, "loss": 1.4448, "step": 226 }, { "epoch": 0.10269169871069894, "grad_norm": 0.4915764331817627, "learning_rate": 3.423831070889895e-06, "loss": 1.4281, "step": 227 }, { "epoch": 0.10314408504863153, "grad_norm": 0.4875490963459015, "learning_rate": 3.4389140271493217e-06, "loss": 1.4102, "step": 228 }, { "epoch": 0.10359647138656412, "grad_norm": 0.4563937783241272, "learning_rate": 3.4539969834087485e-06, "loss": 1.4308, "step": 229 }, { "epoch": 0.10404885772449672, "grad_norm": 0.4910651743412018, "learning_rate": 3.4690799396681753e-06, "loss": 1.3818, "step": 230 }, { "epoch": 0.10450124406242932, "grad_norm": 0.495875746011734, "learning_rate": 3.484162895927602e-06, "loss": 1.3806, "step": 231 }, { "epoch": 0.1049536304003619, "grad_norm": 0.48358720541000366, "learning_rate": 3.499245852187029e-06, "loss": 1.4399, "step": 232 }, { "epoch": 0.1054060167382945, "grad_norm": 0.4876674711704254, "learning_rate": 3.5143288084464557e-06, "loss": 1.3711, "step": 233 }, { "epoch": 0.1058584030762271, "grad_norm": 0.4984152913093567, "learning_rate": 3.529411764705883e-06, "loss": 1.4836, "step": 234 }, { "epoch": 0.1063107894141597, "grad_norm": 0.502505898475647, "learning_rate": 3.5444947209653097e-06, "loss": 1.2476, "step": 235 }, { "epoch": 0.10676317575209229, "grad_norm": 0.4831922948360443, "learning_rate": 3.5595776772247365e-06, "loss": 1.3357, "step": 236 }, { "epoch": 0.10721556209002488, "grad_norm": 0.4995551109313965, "learning_rate": 3.5746606334841633e-06, "loss": 1.4231, "step": 237 }, { "epoch": 0.10766794842795747, "grad_norm": 0.5082549452781677, "learning_rate": 3.58974358974359e-06, "loss": 1.3158, "step": 238 }, { "epoch": 0.10812033476589007, "grad_norm": 0.5121150016784668, "learning_rate": 3.604826546003017e-06, "loss": 1.4592, "step": 239 }, { "epoch": 0.10857272110382267, "grad_norm": 0.5276757478713989, "learning_rate": 3.6199095022624436e-06, "loss": 1.4509, "step": 240 }, { "epoch": 0.10902510744175525, "grad_norm": 0.49839484691619873, "learning_rate": 3.6349924585218704e-06, "loss": 1.4371, "step": 241 }, { "epoch": 0.10947749377968785, "grad_norm": 0.49852508306503296, "learning_rate": 3.6500754147812972e-06, "loss": 1.3854, "step": 242 }, { "epoch": 0.10992988011762045, "grad_norm": 0.5718432068824768, "learning_rate": 3.665158371040724e-06, "loss": 1.5645, "step": 243 }, { "epoch": 0.11038226645555305, "grad_norm": 0.4926327168941498, "learning_rate": 3.6802413273001512e-06, "loss": 1.5596, "step": 244 }, { "epoch": 0.11083465279348563, "grad_norm": 0.45934170484542847, "learning_rate": 3.695324283559578e-06, "loss": 1.4143, "step": 245 }, { "epoch": 0.11128703913141823, "grad_norm": 0.5036425590515137, "learning_rate": 3.710407239819005e-06, "loss": 1.6287, "step": 246 }, { "epoch": 0.11173942546935083, "grad_norm": 0.49225303530693054, "learning_rate": 3.7254901960784316e-06, "loss": 1.5508, "step": 247 }, { "epoch": 0.11219181180728342, "grad_norm": 0.5132015347480774, "learning_rate": 3.7405731523378584e-06, "loss": 1.5665, "step": 248 }, { "epoch": 0.11264419814521602, "grad_norm": 0.6156896352767944, "learning_rate": 3.755656108597285e-06, "loss": 1.683, "step": 249 }, { "epoch": 0.1130965844831486, "grad_norm": 0.6679587960243225, "learning_rate": 3.770739064856712e-06, "loss": 1.9378, "step": 250 }, { "epoch": 0.1135489708210812, "grad_norm": 0.0879235714673996, "learning_rate": 3.7858220211161388e-06, "loss": 1.2545, "step": 251 }, { "epoch": 0.1140013571590138, "grad_norm": 0.13720904290676117, "learning_rate": 3.8009049773755656e-06, "loss": 0.9343, "step": 252 }, { "epoch": 0.1144537434969464, "grad_norm": 0.1759236752986908, "learning_rate": 3.815987933634992e-06, "loss": 0.9893, "step": 253 }, { "epoch": 0.11490612983487898, "grad_norm": 0.18518029153347015, "learning_rate": 3.83107088989442e-06, "loss": 0.9376, "step": 254 }, { "epoch": 0.11535851617281158, "grad_norm": 0.19426366686820984, "learning_rate": 3.846153846153847e-06, "loss": 1.0967, "step": 255 }, { "epoch": 0.11581090251074418, "grad_norm": 0.20127107203006744, "learning_rate": 3.861236802413273e-06, "loss": 1.266, "step": 256 }, { "epoch": 0.11626328884867677, "grad_norm": 0.20982080698013306, "learning_rate": 3.8763197586727e-06, "loss": 1.005, "step": 257 }, { "epoch": 0.11671567518660936, "grad_norm": 0.18045534193515778, "learning_rate": 3.891402714932127e-06, "loss": 0.8872, "step": 258 }, { "epoch": 0.11716806152454196, "grad_norm": 0.21370646357536316, "learning_rate": 3.906485671191554e-06, "loss": 1.0099, "step": 259 }, { "epoch": 0.11762044786247455, "grad_norm": 0.2215162217617035, "learning_rate": 3.92156862745098e-06, "loss": 1.0836, "step": 260 }, { "epoch": 0.11807283420040715, "grad_norm": 0.2297031134366989, "learning_rate": 3.9366515837104075e-06, "loss": 1.1147, "step": 261 }, { "epoch": 0.11852522053833975, "grad_norm": 0.2062511444091797, "learning_rate": 3.951734539969834e-06, "loss": 0.8771, "step": 262 }, { "epoch": 0.11897760687627233, "grad_norm": 0.2405863255262375, "learning_rate": 3.966817496229261e-06, "loss": 1.1957, "step": 263 }, { "epoch": 0.11942999321420493, "grad_norm": 0.24210640788078308, "learning_rate": 3.9819004524886875e-06, "loss": 1.1548, "step": 264 }, { "epoch": 0.11988237955213753, "grad_norm": 0.27825286984443665, "learning_rate": 3.9969834087481156e-06, "loss": 1.1205, "step": 265 }, { "epoch": 0.12033476589007013, "grad_norm": 0.26872166991233826, "learning_rate": 4.012066365007542e-06, "loss": 1.2542, "step": 266 }, { "epoch": 0.12078715222800271, "grad_norm": 0.2763238549232483, "learning_rate": 4.027149321266969e-06, "loss": 1.0622, "step": 267 }, { "epoch": 0.1212395385659353, "grad_norm": 0.2740189731121063, "learning_rate": 4.0422322775263955e-06, "loss": 1.1238, "step": 268 }, { "epoch": 0.1216919249038679, "grad_norm": 0.30493414402008057, "learning_rate": 4.057315233785823e-06, "loss": 1.1866, "step": 269 }, { "epoch": 0.1221443112418005, "grad_norm": 0.2726280987262726, "learning_rate": 4.072398190045249e-06, "loss": 1.036, "step": 270 }, { "epoch": 0.12259669757973309, "grad_norm": 0.2876967489719391, "learning_rate": 4.087481146304676e-06, "loss": 1.0667, "step": 271 }, { "epoch": 0.12304908391766568, "grad_norm": 0.30955561995506287, "learning_rate": 4.102564102564103e-06, "loss": 1.1133, "step": 272 }, { "epoch": 0.12350147025559828, "grad_norm": 0.30965903401374817, "learning_rate": 4.11764705882353e-06, "loss": 1.0552, "step": 273 }, { "epoch": 0.12395385659353088, "grad_norm": 0.31585049629211426, "learning_rate": 4.132730015082956e-06, "loss": 1.1502, "step": 274 }, { "epoch": 0.12440624293146348, "grad_norm": 0.3229442238807678, "learning_rate": 4.1478129713423835e-06, "loss": 1.1419, "step": 275 }, { "epoch": 0.12485862926939606, "grad_norm": 0.32978636026382446, "learning_rate": 4.162895927601811e-06, "loss": 1.102, "step": 276 }, { "epoch": 0.12531101560732866, "grad_norm": 0.2998966574668884, "learning_rate": 4.177978883861237e-06, "loss": 0.965, "step": 277 }, { "epoch": 0.12576340194526126, "grad_norm": 0.31423524022102356, "learning_rate": 4.193061840120664e-06, "loss": 1.1584, "step": 278 }, { "epoch": 0.12621578828319385, "grad_norm": 0.3736515939235687, "learning_rate": 4.208144796380091e-06, "loss": 1.1427, "step": 279 }, { "epoch": 0.12666817462112645, "grad_norm": 0.3666379451751709, "learning_rate": 4.223227752639518e-06, "loss": 1.1368, "step": 280 }, { "epoch": 0.12712056095905905, "grad_norm": 0.4088203012943268, "learning_rate": 4.238310708898944e-06, "loss": 0.9762, "step": 281 }, { "epoch": 0.12757294729699162, "grad_norm": 0.41544854640960693, "learning_rate": 4.2533936651583714e-06, "loss": 1.1345, "step": 282 }, { "epoch": 0.12802533363492422, "grad_norm": 0.459001749753952, "learning_rate": 4.268476621417798e-06, "loss": 1.3368, "step": 283 }, { "epoch": 0.1284777199728568, "grad_norm": 0.40151914954185486, "learning_rate": 4.283559577677225e-06, "loss": 1.2005, "step": 284 }, { "epoch": 0.1289301063107894, "grad_norm": 0.39306652545928955, "learning_rate": 4.298642533936652e-06, "loss": 1.0469, "step": 285 }, { "epoch": 0.129382492648722, "grad_norm": 0.4230174422264099, "learning_rate": 4.313725490196079e-06, "loss": 1.0813, "step": 286 }, { "epoch": 0.1298348789866546, "grad_norm": 0.4264683127403259, "learning_rate": 4.328808446455506e-06, "loss": 1.0626, "step": 287 }, { "epoch": 0.1302872653245872, "grad_norm": 0.44791293144226074, "learning_rate": 4.343891402714932e-06, "loss": 1.1474, "step": 288 }, { "epoch": 0.1307396516625198, "grad_norm": 0.46777433156967163, "learning_rate": 4.358974358974359e-06, "loss": 1.2612, "step": 289 }, { "epoch": 0.1311920380004524, "grad_norm": 0.4682120680809021, "learning_rate": 4.374057315233786e-06, "loss": 1.1123, "step": 290 }, { "epoch": 0.13164442433838497, "grad_norm": 0.4432609975337982, "learning_rate": 4.389140271493213e-06, "loss": 1.2372, "step": 291 }, { "epoch": 0.13209681067631757, "grad_norm": 0.47359392046928406, "learning_rate": 4.404223227752639e-06, "loss": 1.0855, "step": 292 }, { "epoch": 0.13254919701425016, "grad_norm": 0.4715504050254822, "learning_rate": 4.419306184012067e-06, "loss": 1.1948, "step": 293 }, { "epoch": 0.13300158335218276, "grad_norm": 0.46393659710884094, "learning_rate": 4.434389140271493e-06, "loss": 1.0519, "step": 294 }, { "epoch": 0.13345396969011536, "grad_norm": 0.46861955523490906, "learning_rate": 4.44947209653092e-06, "loss": 1.1273, "step": 295 }, { "epoch": 0.13390635602804796, "grad_norm": 0.462346613407135, "learning_rate": 4.464555052790347e-06, "loss": 1.1283, "step": 296 }, { "epoch": 0.13435874236598055, "grad_norm": 0.5664047002792358, "learning_rate": 4.479638009049775e-06, "loss": 1.1268, "step": 297 }, { "epoch": 0.13481112870391315, "grad_norm": 0.4963298439979553, "learning_rate": 4.494720965309201e-06, "loss": 1.2266, "step": 298 }, { "epoch": 0.13526351504184575, "grad_norm": 0.519995927810669, "learning_rate": 4.509803921568628e-06, "loss": 1.2401, "step": 299 }, { "epoch": 0.13571590137977832, "grad_norm": 0.686543345451355, "learning_rate": 4.5248868778280546e-06, "loss": 1.4039, "step": 300 }, { "epoch": 0.13616828771771092, "grad_norm": 0.09637536108493805, "learning_rate": 4.539969834087482e-06, "loss": 1.2394, "step": 301 }, { "epoch": 0.13662067405564352, "grad_norm": 0.15905450284481049, "learning_rate": 4.555052790346908e-06, "loss": 0.8518, "step": 302 }, { "epoch": 0.1370730603935761, "grad_norm": 0.1937795728445053, "learning_rate": 4.570135746606335e-06, "loss": 0.8114, "step": 303 }, { "epoch": 0.1375254467315087, "grad_norm": 0.18738310039043427, "learning_rate": 4.585218702865762e-06, "loss": 0.8479, "step": 304 }, { "epoch": 0.1379778330694413, "grad_norm": 0.17131307721138, "learning_rate": 4.600301659125189e-06, "loss": 0.8014, "step": 305 }, { "epoch": 0.1384302194073739, "grad_norm": 0.19263143837451935, "learning_rate": 4.615384615384616e-06, "loss": 0.9275, "step": 306 }, { "epoch": 0.1388826057453065, "grad_norm": 0.1952895224094391, "learning_rate": 4.6304675716440425e-06, "loss": 0.809, "step": 307 }, { "epoch": 0.13933499208323907, "grad_norm": 0.17560231685638428, "learning_rate": 4.64555052790347e-06, "loss": 0.9095, "step": 308 }, { "epoch": 0.13978737842117167, "grad_norm": 0.2040272355079651, "learning_rate": 4.660633484162896e-06, "loss": 0.9229, "step": 309 }, { "epoch": 0.14023976475910427, "grad_norm": 0.20608969032764435, "learning_rate": 4.675716440422323e-06, "loss": 1.0323, "step": 310 }, { "epoch": 0.14069215109703687, "grad_norm": 0.17143970727920532, "learning_rate": 4.69079939668175e-06, "loss": 0.8816, "step": 311 }, { "epoch": 0.14114453743496946, "grad_norm": 0.21099169552326202, "learning_rate": 4.705882352941177e-06, "loss": 0.8681, "step": 312 }, { "epoch": 0.14159692377290206, "grad_norm": 0.19761766493320465, "learning_rate": 4.720965309200603e-06, "loss": 0.7996, "step": 313 }, { "epoch": 0.14204931011083466, "grad_norm": 0.24397683143615723, "learning_rate": 4.7360482654600305e-06, "loss": 0.9599, "step": 314 }, { "epoch": 0.14250169644876726, "grad_norm": 0.1992105394601822, "learning_rate": 4.751131221719457e-06, "loss": 0.8934, "step": 315 }, { "epoch": 0.14295408278669985, "grad_norm": 0.19837310910224915, "learning_rate": 4.766214177978885e-06, "loss": 0.9096, "step": 316 }, { "epoch": 0.14340646912463242, "grad_norm": 0.2522456645965576, "learning_rate": 4.781297134238311e-06, "loss": 0.987, "step": 317 }, { "epoch": 0.14385885546256502, "grad_norm": 0.22480565309524536, "learning_rate": 4.7963800904977385e-06, "loss": 0.8733, "step": 318 }, { "epoch": 0.14431124180049762, "grad_norm": 0.26319047808647156, "learning_rate": 4.811463046757165e-06, "loss": 0.8841, "step": 319 }, { "epoch": 0.14476362813843022, "grad_norm": 0.22067755460739136, "learning_rate": 4.826546003016592e-06, "loss": 0.8959, "step": 320 }, { "epoch": 0.14521601447636281, "grad_norm": 0.23854433000087738, "learning_rate": 4.8416289592760185e-06, "loss": 1.0243, "step": 321 }, { "epoch": 0.1456684008142954, "grad_norm": 0.28900349140167236, "learning_rate": 4.856711915535446e-06, "loss": 0.9506, "step": 322 }, { "epoch": 0.146120787152228, "grad_norm": 0.20927809178829193, "learning_rate": 4.871794871794872e-06, "loss": 1.0909, "step": 323 }, { "epoch": 0.1465731734901606, "grad_norm": 0.25660234689712524, "learning_rate": 4.886877828054299e-06, "loss": 0.8821, "step": 324 }, { "epoch": 0.1470255598280932, "grad_norm": 0.23427610099315643, "learning_rate": 4.901960784313726e-06, "loss": 0.9389, "step": 325 }, { "epoch": 0.14747794616602578, "grad_norm": 0.22085130214691162, "learning_rate": 4.917043740573153e-06, "loss": 0.8217, "step": 326 }, { "epoch": 0.14793033250395837, "grad_norm": 0.2876066267490387, "learning_rate": 4.93212669683258e-06, "loss": 0.9182, "step": 327 }, { "epoch": 0.14838271884189097, "grad_norm": 0.35493189096450806, "learning_rate": 4.9472096530920064e-06, "loss": 0.9295, "step": 328 }, { "epoch": 0.14883510517982357, "grad_norm": 0.2407694011926651, "learning_rate": 4.962292609351434e-06, "loss": 0.845, "step": 329 }, { "epoch": 0.14928749151775617, "grad_norm": 0.3411289155483246, "learning_rate": 4.97737556561086e-06, "loss": 0.7945, "step": 330 }, { "epoch": 0.14973987785568876, "grad_norm": 0.23552873730659485, "learning_rate": 4.992458521870287e-06, "loss": 0.8632, "step": 331 }, { "epoch": 0.15019226419362136, "grad_norm": 0.23586787283420563, "learning_rate": 5.007541478129714e-06, "loss": 0.9736, "step": 332 }, { "epoch": 0.15064465053155396, "grad_norm": 0.2363610565662384, "learning_rate": 5.022624434389141e-06, "loss": 0.9231, "step": 333 }, { "epoch": 0.15109703686948653, "grad_norm": 0.25294479727745056, "learning_rate": 5.037707390648567e-06, "loss": 0.8952, "step": 334 }, { "epoch": 0.15154942320741913, "grad_norm": 0.27956050634384155, "learning_rate": 5.052790346907994e-06, "loss": 0.9506, "step": 335 }, { "epoch": 0.15200180954535172, "grad_norm": 0.27112793922424316, "learning_rate": 5.067873303167421e-06, "loss": 0.9628, "step": 336 }, { "epoch": 0.15245419588328432, "grad_norm": 0.2823140621185303, "learning_rate": 5.082956259426848e-06, "loss": 0.9911, "step": 337 }, { "epoch": 0.15290658222121692, "grad_norm": 0.2623738944530487, "learning_rate": 5.098039215686274e-06, "loss": 0.8962, "step": 338 }, { "epoch": 0.15335896855914952, "grad_norm": 0.2597084641456604, "learning_rate": 5.1131221719457016e-06, "loss": 0.9255, "step": 339 }, { "epoch": 0.15381135489708211, "grad_norm": 0.2636582553386688, "learning_rate": 5.128205128205128e-06, "loss": 0.9787, "step": 340 }, { "epoch": 0.1542637412350147, "grad_norm": 0.23133310675621033, "learning_rate": 5.143288084464555e-06, "loss": 0.9123, "step": 341 }, { "epoch": 0.1547161275729473, "grad_norm": 0.3040371537208557, "learning_rate": 5.158371040723983e-06, "loss": 0.862, "step": 342 }, { "epoch": 0.15516851391087988, "grad_norm": 0.3434310853481293, "learning_rate": 5.1734539969834096e-06, "loss": 0.9747, "step": 343 }, { "epoch": 0.15562090024881248, "grad_norm": 0.27222827076911926, "learning_rate": 5.188536953242837e-06, "loss": 0.9297, "step": 344 }, { "epoch": 0.15607328658674507, "grad_norm": 0.35387006402015686, "learning_rate": 5.203619909502263e-06, "loss": 0.9441, "step": 345 }, { "epoch": 0.15652567292467767, "grad_norm": 0.3067138195037842, "learning_rate": 5.21870286576169e-06, "loss": 0.996, "step": 346 }, { "epoch": 0.15697805926261027, "grad_norm": 0.2962682247161865, "learning_rate": 5.233785822021117e-06, "loss": 0.9238, "step": 347 }, { "epoch": 0.15743044560054287, "grad_norm": 0.3368314802646637, "learning_rate": 5.248868778280544e-06, "loss": 1.1015, "step": 348 }, { "epoch": 0.15788283193847547, "grad_norm": 0.535832405090332, "learning_rate": 5.26395173453997e-06, "loss": 1.1256, "step": 349 }, { "epoch": 0.15833521827640806, "grad_norm": 0.5459734201431274, "learning_rate": 5.2790346907993975e-06, "loss": 1.2625, "step": 350 }, { "epoch": 0.15878760461434066, "grad_norm": 0.08834061771631241, "learning_rate": 5.294117647058824e-06, "loss": 1.1553, "step": 351 }, { "epoch": 0.15923999095227323, "grad_norm": 0.1375238299369812, "learning_rate": 5.309200603318251e-06, "loss": 1.0825, "step": 352 }, { "epoch": 0.15969237729020583, "grad_norm": 0.12115676701068878, "learning_rate": 5.3242835595776775e-06, "loss": 0.7468, "step": 353 }, { "epoch": 0.16014476362813843, "grad_norm": 0.1841784566640854, "learning_rate": 5.339366515837105e-06, "loss": 0.9026, "step": 354 }, { "epoch": 0.16059714996607102, "grad_norm": 0.16195428371429443, "learning_rate": 5.354449472096531e-06, "loss": 0.8261, "step": 355 }, { "epoch": 0.16104953630400362, "grad_norm": 0.18573614954948425, "learning_rate": 5.369532428355958e-06, "loss": 0.9687, "step": 356 }, { "epoch": 0.16150192264193622, "grad_norm": 0.21415722370147705, "learning_rate": 5.384615384615385e-06, "loss": 0.9621, "step": 357 }, { "epoch": 0.16195430897986882, "grad_norm": 0.15662147104740143, "learning_rate": 5.399698340874812e-06, "loss": 0.8181, "step": 358 }, { "epoch": 0.16240669531780141, "grad_norm": 0.1700572371482849, "learning_rate": 5.414781297134238e-06, "loss": 0.8137, "step": 359 }, { "epoch": 0.16285908165573398, "grad_norm": 0.1401992291212082, "learning_rate": 5.4298642533936655e-06, "loss": 0.877, "step": 360 }, { "epoch": 0.16331146799366658, "grad_norm": 0.17592734098434448, "learning_rate": 5.444947209653092e-06, "loss": 1.1089, "step": 361 }, { "epoch": 0.16376385433159918, "grad_norm": 0.20075660943984985, "learning_rate": 5.460030165912519e-06, "loss": 0.8404, "step": 362 }, { "epoch": 0.16421624066953178, "grad_norm": 0.18985675275325775, "learning_rate": 5.475113122171946e-06, "loss": 0.9128, "step": 363 }, { "epoch": 0.16466862700746437, "grad_norm": 0.19603624939918518, "learning_rate": 5.4901960784313735e-06, "loss": 0.9671, "step": 364 }, { "epoch": 0.16512101334539697, "grad_norm": 0.17259468138217926, "learning_rate": 5.505279034690801e-06, "loss": 0.9424, "step": 365 }, { "epoch": 0.16557339968332957, "grad_norm": 0.17942479252815247, "learning_rate": 5.520361990950227e-06, "loss": 0.8303, "step": 366 }, { "epoch": 0.16602578602126217, "grad_norm": 0.17921984195709229, "learning_rate": 5.535444947209654e-06, "loss": 0.9906, "step": 367 }, { "epoch": 0.16647817235919476, "grad_norm": 0.19414879381656647, "learning_rate": 5.550527903469081e-06, "loss": 0.8392, "step": 368 }, { "epoch": 0.16693055869712733, "grad_norm": 0.18996545672416687, "learning_rate": 5.565610859728508e-06, "loss": 0.915, "step": 369 }, { "epoch": 0.16738294503505993, "grad_norm": 0.196267232298851, "learning_rate": 5.580693815987934e-06, "loss": 0.8974, "step": 370 }, { "epoch": 0.16783533137299253, "grad_norm": 0.18394869565963745, "learning_rate": 5.5957767722473614e-06, "loss": 0.796, "step": 371 }, { "epoch": 0.16828771771092513, "grad_norm": 0.2099035382270813, "learning_rate": 5.610859728506788e-06, "loss": 0.7625, "step": 372 }, { "epoch": 0.16874010404885773, "grad_norm": 0.2028755098581314, "learning_rate": 5.625942684766215e-06, "loss": 0.9264, "step": 373 }, { "epoch": 0.16919249038679032, "grad_norm": 0.1872493326663971, "learning_rate": 5.641025641025641e-06, "loss": 0.8384, "step": 374 }, { "epoch": 0.16964487672472292, "grad_norm": 0.1973748803138733, "learning_rate": 5.656108597285069e-06, "loss": 0.9093, "step": 375 }, { "epoch": 0.17009726306265552, "grad_norm": 0.1850823312997818, "learning_rate": 5.671191553544495e-06, "loss": 0.8255, "step": 376 }, { "epoch": 0.17054964940058812, "grad_norm": 0.1846548169851303, "learning_rate": 5.686274509803922e-06, "loss": 0.8196, "step": 377 }, { "epoch": 0.17100203573852069, "grad_norm": 0.23367774486541748, "learning_rate": 5.7013574660633486e-06, "loss": 0.9778, "step": 378 }, { "epoch": 0.17145442207645328, "grad_norm": 0.20735958218574524, "learning_rate": 5.716440422322776e-06, "loss": 0.8379, "step": 379 }, { "epoch": 0.17190680841438588, "grad_norm": 0.250650554895401, "learning_rate": 5.731523378582202e-06, "loss": 1.0445, "step": 380 }, { "epoch": 0.17235919475231848, "grad_norm": 0.1961720585823059, "learning_rate": 5.746606334841629e-06, "loss": 0.8736, "step": 381 }, { "epoch": 0.17281158109025108, "grad_norm": 0.204644113779068, "learning_rate": 5.761689291101056e-06, "loss": 0.9025, "step": 382 }, { "epoch": 0.17326396742818367, "grad_norm": 0.2267153263092041, "learning_rate": 5.776772247360483e-06, "loss": 1.0285, "step": 383 }, { "epoch": 0.17371635376611627, "grad_norm": 0.21080204844474792, "learning_rate": 5.79185520361991e-06, "loss": 0.963, "step": 384 }, { "epoch": 0.17416874010404887, "grad_norm": 0.20614959299564362, "learning_rate": 5.806938159879337e-06, "loss": 0.9106, "step": 385 }, { "epoch": 0.17462112644198144, "grad_norm": 0.23715493083000183, "learning_rate": 5.822021116138764e-06, "loss": 1.0026, "step": 386 }, { "epoch": 0.17507351277991404, "grad_norm": 0.21489903330802917, "learning_rate": 5.837104072398191e-06, "loss": 0.8546, "step": 387 }, { "epoch": 0.17552589911784663, "grad_norm": 0.26821011304855347, "learning_rate": 5.852187028657617e-06, "loss": 0.869, "step": 388 }, { "epoch": 0.17597828545577923, "grad_norm": 0.24972333014011383, "learning_rate": 5.8672699849170446e-06, "loss": 0.8901, "step": 389 }, { "epoch": 0.17643067179371183, "grad_norm": 0.3039231300354004, "learning_rate": 5.882352941176471e-06, "loss": 0.9961, "step": 390 }, { "epoch": 0.17688305813164443, "grad_norm": 0.32883140444755554, "learning_rate": 5.897435897435898e-06, "loss": 0.9123, "step": 391 }, { "epoch": 0.17733544446957702, "grad_norm": 0.20742687582969666, "learning_rate": 5.9125188536953245e-06, "loss": 0.7313, "step": 392 }, { "epoch": 0.17778783080750962, "grad_norm": 0.31611743569374084, "learning_rate": 5.927601809954752e-06, "loss": 1.0349, "step": 393 }, { "epoch": 0.17824021714544222, "grad_norm": 0.22283443808555603, "learning_rate": 5.942684766214178e-06, "loss": 0.8614, "step": 394 }, { "epoch": 0.1786926034833748, "grad_norm": 0.2965244948863983, "learning_rate": 5.957767722473605e-06, "loss": 0.8888, "step": 395 }, { "epoch": 0.1791449898213074, "grad_norm": 0.28162646293640137, "learning_rate": 5.972850678733032e-06, "loss": 0.9549, "step": 396 }, { "epoch": 0.17959737615923999, "grad_norm": 0.3773486316204071, "learning_rate": 5.987933634992459e-06, "loss": 1.0595, "step": 397 }, { "epoch": 0.18004976249717258, "grad_norm": 0.3221220374107361, "learning_rate": 6.003016591251885e-06, "loss": 0.8952, "step": 398 }, { "epoch": 0.18050214883510518, "grad_norm": 0.31523948907852173, "learning_rate": 6.0180995475113125e-06, "loss": 0.9799, "step": 399 }, { "epoch": 0.18095453517303778, "grad_norm": 0.5409510135650635, "learning_rate": 6.033182503770739e-06, "loss": 1.0975, "step": 400 }, { "epoch": 0.18095453517303778, "eval_loss": 0.8984115719795227, "eval_runtime": 25.6909, "eval_samples_per_second": 28.96, "eval_steps_per_second": 7.24, "step": 400 }, { "epoch": 0.18140692151097038, "grad_norm": 0.11647921800613403, "learning_rate": 6.048265460030166e-06, "loss": 1.3594, "step": 401 }, { "epoch": 0.18185930784890297, "grad_norm": 0.12094968557357788, "learning_rate": 6.0633484162895924e-06, "loss": 0.9529, "step": 402 }, { "epoch": 0.18231169418683557, "grad_norm": 0.16884884238243103, "learning_rate": 6.07843137254902e-06, "loss": 0.8182, "step": 403 }, { "epoch": 0.18276408052476814, "grad_norm": 0.15380021929740906, "learning_rate": 6.093514328808446e-06, "loss": 0.8605, "step": 404 }, { "epoch": 0.18321646686270074, "grad_norm": 0.14434780180454254, "learning_rate": 6.108597285067874e-06, "loss": 0.8326, "step": 405 }, { "epoch": 0.18366885320063334, "grad_norm": 0.155549556016922, "learning_rate": 6.123680241327301e-06, "loss": 0.7846, "step": 406 }, { "epoch": 0.18412123953856593, "grad_norm": 0.15555742383003235, "learning_rate": 6.138763197586728e-06, "loss": 0.7363, "step": 407 }, { "epoch": 0.18457362587649853, "grad_norm": 0.132954940199852, "learning_rate": 6.153846153846155e-06, "loss": 0.8563, "step": 408 }, { "epoch": 0.18502601221443113, "grad_norm": 0.23602581024169922, "learning_rate": 6.168929110105581e-06, "loss": 0.8691, "step": 409 }, { "epoch": 0.18547839855236373, "grad_norm": 0.1736900508403778, "learning_rate": 6.1840120663650085e-06, "loss": 0.7535, "step": 410 }, { "epoch": 0.18593078489029632, "grad_norm": 0.20246095955371857, "learning_rate": 6.199095022624435e-06, "loss": 0.8575, "step": 411 }, { "epoch": 0.1863831712282289, "grad_norm": 0.18285703659057617, "learning_rate": 6.214177978883862e-06, "loss": 0.9712, "step": 412 }, { "epoch": 0.1868355575661615, "grad_norm": 0.1931200921535492, "learning_rate": 6.229260935143288e-06, "loss": 0.9547, "step": 413 }, { "epoch": 0.1872879439040941, "grad_norm": 0.2167317122220993, "learning_rate": 6.244343891402716e-06, "loss": 0.9088, "step": 414 }, { "epoch": 0.1877403302420267, "grad_norm": 0.2324584424495697, "learning_rate": 6.259426847662142e-06, "loss": 0.9433, "step": 415 }, { "epoch": 0.18819271657995928, "grad_norm": 0.17987088859081268, "learning_rate": 6.274509803921569e-06, "loss": 0.8051, "step": 416 }, { "epoch": 0.18864510291789188, "grad_norm": 0.28431153297424316, "learning_rate": 6.2895927601809956e-06, "loss": 0.9108, "step": 417 }, { "epoch": 0.18909748925582448, "grad_norm": 0.1824103593826294, "learning_rate": 6.304675716440423e-06, "loss": 0.8045, "step": 418 }, { "epoch": 0.18954987559375708, "grad_norm": 0.22927996516227722, "learning_rate": 6.319758672699849e-06, "loss": 0.7463, "step": 419 }, { "epoch": 0.19000226193168968, "grad_norm": 0.20925703644752502, "learning_rate": 6.334841628959276e-06, "loss": 0.8437, "step": 420 }, { "epoch": 0.19045464826962225, "grad_norm": 0.19920974969863892, "learning_rate": 6.349924585218703e-06, "loss": 0.7852, "step": 421 }, { "epoch": 0.19090703460755484, "grad_norm": 0.23140886425971985, "learning_rate": 6.36500754147813e-06, "loss": 0.7665, "step": 422 }, { "epoch": 0.19135942094548744, "grad_norm": 0.19694162905216217, "learning_rate": 6.380090497737556e-06, "loss": 0.7985, "step": 423 }, { "epoch": 0.19181180728342004, "grad_norm": 0.19552811980247498, "learning_rate": 6.3951734539969835e-06, "loss": 0.8418, "step": 424 }, { "epoch": 0.19226419362135264, "grad_norm": 0.2538328468799591, "learning_rate": 6.410256410256412e-06, "loss": 0.8638, "step": 425 }, { "epoch": 0.19271657995928523, "grad_norm": 0.23129570484161377, "learning_rate": 6.425339366515838e-06, "loss": 0.7745, "step": 426 }, { "epoch": 0.19316896629721783, "grad_norm": 0.185930535197258, "learning_rate": 6.440422322775265e-06, "loss": 0.7888, "step": 427 }, { "epoch": 0.19362135263515043, "grad_norm": 0.23380377888679504, "learning_rate": 6.4555052790346916e-06, "loss": 0.8858, "step": 428 }, { "epoch": 0.19407373897308303, "grad_norm": 0.19431445002555847, "learning_rate": 6.470588235294119e-06, "loss": 0.8731, "step": 429 }, { "epoch": 0.1945261253110156, "grad_norm": 0.23990508913993835, "learning_rate": 6.485671191553545e-06, "loss": 0.8788, "step": 430 }, { "epoch": 0.1949785116489482, "grad_norm": 0.20464712381362915, "learning_rate": 6.500754147812972e-06, "loss": 0.7798, "step": 431 }, { "epoch": 0.1954308979868808, "grad_norm": 0.23603376746177673, "learning_rate": 6.515837104072399e-06, "loss": 0.7797, "step": 432 }, { "epoch": 0.1958832843248134, "grad_norm": 0.23221230506896973, "learning_rate": 6.530920060331826e-06, "loss": 0.7554, "step": 433 }, { "epoch": 0.196335670662746, "grad_norm": 0.22027575969696045, "learning_rate": 6.546003016591252e-06, "loss": 0.9051, "step": 434 }, { "epoch": 0.19678805700067858, "grad_norm": 0.24925105273723602, "learning_rate": 6.5610859728506795e-06, "loss": 0.8009, "step": 435 }, { "epoch": 0.19724044333861118, "grad_norm": 0.31317660212516785, "learning_rate": 6.576168929110106e-06, "loss": 0.9287, "step": 436 }, { "epoch": 0.19769282967654378, "grad_norm": 0.20909351110458374, "learning_rate": 6.591251885369533e-06, "loss": 0.7765, "step": 437 }, { "epoch": 0.19814521601447635, "grad_norm": 0.3159376084804535, "learning_rate": 6.6063348416289595e-06, "loss": 0.9751, "step": 438 }, { "epoch": 0.19859760235240895, "grad_norm": 0.28327250480651855, "learning_rate": 6.621417797888387e-06, "loss": 1.0207, "step": 439 }, { "epoch": 0.19904998869034154, "grad_norm": 0.24818606674671173, "learning_rate": 6.636500754147813e-06, "loss": 0.8737, "step": 440 }, { "epoch": 0.19950237502827414, "grad_norm": 0.30169394612312317, "learning_rate": 6.65158371040724e-06, "loss": 0.9009, "step": 441 }, { "epoch": 0.19995476136620674, "grad_norm": 0.2656419277191162, "learning_rate": 6.666666666666667e-06, "loss": 0.7735, "step": 442 }, { "epoch": 0.20040714770413934, "grad_norm": 0.34059178829193115, "learning_rate": 6.681749622926094e-06, "loss": 0.9379, "step": 443 }, { "epoch": 0.20085953404207194, "grad_norm": 0.23763591051101685, "learning_rate": 6.69683257918552e-06, "loss": 0.8885, "step": 444 }, { "epoch": 0.20131192038000453, "grad_norm": 0.3554087281227112, "learning_rate": 6.7119155354449474e-06, "loss": 1.0399, "step": 445 }, { "epoch": 0.20176430671793713, "grad_norm": 0.2899327576160431, "learning_rate": 6.7269984917043755e-06, "loss": 0.8956, "step": 446 }, { "epoch": 0.2022166930558697, "grad_norm": 0.2973651587963104, "learning_rate": 6.742081447963802e-06, "loss": 0.9354, "step": 447 }, { "epoch": 0.2026690793938023, "grad_norm": 0.28507593274116516, "learning_rate": 6.757164404223229e-06, "loss": 0.9425, "step": 448 }, { "epoch": 0.2031214657317349, "grad_norm": 0.31953656673431396, "learning_rate": 6.7722473604826555e-06, "loss": 0.9675, "step": 449 }, { "epoch": 0.2035738520696675, "grad_norm": 0.43477436900138855, "learning_rate": 6.787330316742083e-06, "loss": 1.1436, "step": 450 }, { "epoch": 0.2040262384076001, "grad_norm": 0.1283264011144638, "learning_rate": 6.802413273001509e-06, "loss": 1.2414, "step": 451 }, { "epoch": 0.2044786247455327, "grad_norm": 0.15729187428951263, "learning_rate": 6.817496229260936e-06, "loss": 0.9824, "step": 452 }, { "epoch": 0.2049310110834653, "grad_norm": 0.15366607904434204, "learning_rate": 6.832579185520363e-06, "loss": 0.7686, "step": 453 }, { "epoch": 0.20538339742139788, "grad_norm": 0.13312649726867676, "learning_rate": 6.84766214177979e-06, "loss": 0.7286, "step": 454 }, { "epoch": 0.20583578375933048, "grad_norm": 0.14186516404151917, "learning_rate": 6.862745098039216e-06, "loss": 0.7369, "step": 455 }, { "epoch": 0.20628817009726305, "grad_norm": 0.18642565608024597, "learning_rate": 6.8778280542986434e-06, "loss": 0.8598, "step": 456 }, { "epoch": 0.20674055643519565, "grad_norm": 0.2179526537656784, "learning_rate": 6.89291101055807e-06, "loss": 0.8744, "step": 457 }, { "epoch": 0.20719294277312825, "grad_norm": 0.17004598677158356, "learning_rate": 6.907993966817497e-06, "loss": 0.88, "step": 458 }, { "epoch": 0.20764532911106084, "grad_norm": 0.24572843313217163, "learning_rate": 6.923076923076923e-06, "loss": 0.8653, "step": 459 }, { "epoch": 0.20809771544899344, "grad_norm": 0.21894443035125732, "learning_rate": 6.938159879336351e-06, "loss": 0.8654, "step": 460 }, { "epoch": 0.20855010178692604, "grad_norm": 0.19272378087043762, "learning_rate": 6.953242835595777e-06, "loss": 0.8854, "step": 461 }, { "epoch": 0.20900248812485864, "grad_norm": 0.14668114483356476, "learning_rate": 6.968325791855204e-06, "loss": 0.7518, "step": 462 }, { "epoch": 0.20945487446279124, "grad_norm": 0.16697905957698822, "learning_rate": 6.9834087481146306e-06, "loss": 0.7254, "step": 463 }, { "epoch": 0.2099072608007238, "grad_norm": 0.19931451976299286, "learning_rate": 6.998491704374058e-06, "loss": 0.8307, "step": 464 }, { "epoch": 0.2103596471386564, "grad_norm": 0.19550839066505432, "learning_rate": 7.013574660633484e-06, "loss": 0.787, "step": 465 }, { "epoch": 0.210812033476589, "grad_norm": 0.20887577533721924, "learning_rate": 7.028657616892911e-06, "loss": 0.8213, "step": 466 }, { "epoch": 0.2112644198145216, "grad_norm": 0.18829715251922607, "learning_rate": 7.0437405731523386e-06, "loss": 0.7528, "step": 467 }, { "epoch": 0.2117168061524542, "grad_norm": 0.18661807477474213, "learning_rate": 7.058823529411766e-06, "loss": 0.796, "step": 468 }, { "epoch": 0.2121691924903868, "grad_norm": 0.22348552942276, "learning_rate": 7.073906485671192e-06, "loss": 0.8572, "step": 469 }, { "epoch": 0.2126215788283194, "grad_norm": 0.18597401678562164, "learning_rate": 7.088989441930619e-06, "loss": 0.8302, "step": 470 }, { "epoch": 0.213073965166252, "grad_norm": 0.2075556367635727, "learning_rate": 7.104072398190046e-06, "loss": 0.901, "step": 471 }, { "epoch": 0.21352635150418459, "grad_norm": 0.20597046613693237, "learning_rate": 7.119155354449473e-06, "loss": 0.7128, "step": 472 }, { "epoch": 0.21397873784211716, "grad_norm": 0.23889608681201935, "learning_rate": 7.134238310708899e-06, "loss": 0.8738, "step": 473 }, { "epoch": 0.21443112418004975, "grad_norm": 0.19901339709758759, "learning_rate": 7.1493212669683265e-06, "loss": 0.754, "step": 474 }, { "epoch": 0.21488351051798235, "grad_norm": 0.18367744982242584, "learning_rate": 7.164404223227753e-06, "loss": 0.7676, "step": 475 }, { "epoch": 0.21533589685591495, "grad_norm": 0.2213570922613144, "learning_rate": 7.17948717948718e-06, "loss": 0.7566, "step": 476 }, { "epoch": 0.21578828319384755, "grad_norm": 0.24606074392795563, "learning_rate": 7.1945701357466065e-06, "loss": 0.7577, "step": 477 }, { "epoch": 0.21624066953178014, "grad_norm": 0.21020081639289856, "learning_rate": 7.209653092006034e-06, "loss": 0.8151, "step": 478 }, { "epoch": 0.21669305586971274, "grad_norm": 0.2632603347301483, "learning_rate": 7.22473604826546e-06, "loss": 0.9089, "step": 479 }, { "epoch": 0.21714544220764534, "grad_norm": 0.2167726755142212, "learning_rate": 7.239819004524887e-06, "loss": 0.8901, "step": 480 }, { "epoch": 0.21759782854557794, "grad_norm": 0.20921297371387482, "learning_rate": 7.2549019607843145e-06, "loss": 0.9051, "step": 481 }, { "epoch": 0.2180502148835105, "grad_norm": 0.2878897488117218, "learning_rate": 7.269984917043741e-06, "loss": 0.9364, "step": 482 }, { "epoch": 0.2185026012214431, "grad_norm": 0.19915664196014404, "learning_rate": 7.285067873303168e-06, "loss": 0.734, "step": 483 }, { "epoch": 0.2189549875593757, "grad_norm": 0.19715926051139832, "learning_rate": 7.3001508295625945e-06, "loss": 0.78, "step": 484 }, { "epoch": 0.2194073738973083, "grad_norm": 0.24286393821239471, "learning_rate": 7.315233785822022e-06, "loss": 0.7826, "step": 485 }, { "epoch": 0.2198597602352409, "grad_norm": 0.33146363496780396, "learning_rate": 7.330316742081448e-06, "loss": 0.9626, "step": 486 }, { "epoch": 0.2203121465731735, "grad_norm": 0.23406684398651123, "learning_rate": 7.345399698340876e-06, "loss": 0.8211, "step": 487 }, { "epoch": 0.2207645329111061, "grad_norm": 0.3157544732093811, "learning_rate": 7.3604826546003025e-06, "loss": 0.8658, "step": 488 }, { "epoch": 0.2212169192490387, "grad_norm": 0.2813735604286194, "learning_rate": 7.37556561085973e-06, "loss": 0.8353, "step": 489 }, { "epoch": 0.22166930558697126, "grad_norm": 0.34446802735328674, "learning_rate": 7.390648567119156e-06, "loss": 1.0791, "step": 490 }, { "epoch": 0.22212169192490386, "grad_norm": 0.33971962332725525, "learning_rate": 7.405731523378583e-06, "loss": 1.0441, "step": 491 }, { "epoch": 0.22257407826283646, "grad_norm": 0.27615952491760254, "learning_rate": 7.42081447963801e-06, "loss": 0.8723, "step": 492 }, { "epoch": 0.22302646460076905, "grad_norm": 0.35355666279792786, "learning_rate": 7.435897435897437e-06, "loss": 0.8864, "step": 493 }, { "epoch": 0.22347885093870165, "grad_norm": 0.27218109369277954, "learning_rate": 7.450980392156863e-06, "loss": 0.8752, "step": 494 }, { "epoch": 0.22393123727663425, "grad_norm": 0.30784621834754944, "learning_rate": 7.4660633484162904e-06, "loss": 0.9136, "step": 495 }, { "epoch": 0.22438362361456685, "grad_norm": 0.3047894537448883, "learning_rate": 7.481146304675717e-06, "loss": 0.9887, "step": 496 }, { "epoch": 0.22483600995249944, "grad_norm": 0.3511670231819153, "learning_rate": 7.496229260935144e-06, "loss": 0.858, "step": 497 }, { "epoch": 0.22528839629043204, "grad_norm": 0.43295398354530334, "learning_rate": 7.51131221719457e-06, "loss": 0.9888, "step": 498 }, { "epoch": 0.2257407826283646, "grad_norm": 0.3602030277252197, "learning_rate": 7.526395173453998e-06, "loss": 0.7842, "step": 499 }, { "epoch": 0.2261931689662972, "grad_norm": 0.5601304173469543, "learning_rate": 7.541478129713424e-06, "loss": 1.041, "step": 500 }, { "epoch": 0.2266455553042298, "grad_norm": 0.1065191775560379, "learning_rate": 7.556561085972851e-06, "loss": 1.1462, "step": 501 }, { "epoch": 0.2270979416421624, "grad_norm": 0.16639834642410278, "learning_rate": 7.5716440422322776e-06, "loss": 0.6883, "step": 502 }, { "epoch": 0.227550327980095, "grad_norm": 0.13869304955005646, "learning_rate": 7.586726998491705e-06, "loss": 0.6358, "step": 503 }, { "epoch": 0.2280027143180276, "grad_norm": 0.16486752033233643, "learning_rate": 7.601809954751131e-06, "loss": 0.8313, "step": 504 }, { "epoch": 0.2284551006559602, "grad_norm": 0.15883579850196838, "learning_rate": 7.616892911010558e-06, "loss": 0.7975, "step": 505 }, { "epoch": 0.2289074869938928, "grad_norm": 0.22225403785705566, "learning_rate": 7.631975867269985e-06, "loss": 0.8589, "step": 506 }, { "epoch": 0.2293598733318254, "grad_norm": 0.2238887995481491, "learning_rate": 7.647058823529411e-06, "loss": 0.7603, "step": 507 }, { "epoch": 0.22981225966975796, "grad_norm": 0.22994419932365417, "learning_rate": 7.66214177978884e-06, "loss": 0.8146, "step": 508 }, { "epoch": 0.23026464600769056, "grad_norm": 0.20384375751018524, "learning_rate": 7.677224736048267e-06, "loss": 0.8503, "step": 509 }, { "epoch": 0.23071703234562316, "grad_norm": 0.1754351407289505, "learning_rate": 7.692307692307694e-06, "loss": 0.8429, "step": 510 }, { "epoch": 0.23116941868355576, "grad_norm": 0.1935686469078064, "learning_rate": 7.70739064856712e-06, "loss": 0.7756, "step": 511 }, { "epoch": 0.23162180502148835, "grad_norm": 0.21427293121814728, "learning_rate": 7.722473604826546e-06, "loss": 0.8666, "step": 512 }, { "epoch": 0.23207419135942095, "grad_norm": 0.2559388279914856, "learning_rate": 7.737556561085974e-06, "loss": 0.7488, "step": 513 }, { "epoch": 0.23252657769735355, "grad_norm": 0.2108621597290039, "learning_rate": 7.7526395173454e-06, "loss": 0.7812, "step": 514 }, { "epoch": 0.23297896403528615, "grad_norm": 0.22536161541938782, "learning_rate": 7.767722473604827e-06, "loss": 0.914, "step": 515 }, { "epoch": 0.23343135037321872, "grad_norm": 0.16885265707969666, "learning_rate": 7.782805429864253e-06, "loss": 0.745, "step": 516 }, { "epoch": 0.2338837367111513, "grad_norm": 0.22716595232486725, "learning_rate": 7.797888386123682e-06, "loss": 0.871, "step": 517 }, { "epoch": 0.2343361230490839, "grad_norm": 0.21209575235843658, "learning_rate": 7.812971342383108e-06, "loss": 0.9625, "step": 518 }, { "epoch": 0.2347885093870165, "grad_norm": 0.21358048915863037, "learning_rate": 7.828054298642534e-06, "loss": 0.7919, "step": 519 }, { "epoch": 0.2352408957249491, "grad_norm": 0.24690072238445282, "learning_rate": 7.84313725490196e-06, "loss": 0.8775, "step": 520 }, { "epoch": 0.2356932820628817, "grad_norm": 0.20330293476581573, "learning_rate": 7.858220211161389e-06, "loss": 0.8013, "step": 521 }, { "epoch": 0.2361456684008143, "grad_norm": 0.19881942868232727, "learning_rate": 7.873303167420815e-06, "loss": 0.8332, "step": 522 }, { "epoch": 0.2365980547387469, "grad_norm": 0.20951199531555176, "learning_rate": 7.888386123680241e-06, "loss": 0.768, "step": 523 }, { "epoch": 0.2370504410766795, "grad_norm": 0.23973168432712555, "learning_rate": 7.903469079939668e-06, "loss": 0.7857, "step": 524 }, { "epoch": 0.23750282741461207, "grad_norm": 0.22241102159023285, "learning_rate": 7.918552036199096e-06, "loss": 0.7895, "step": 525 }, { "epoch": 0.23795521375254466, "grad_norm": 0.2303517758846283, "learning_rate": 7.933634992458522e-06, "loss": 0.9798, "step": 526 }, { "epoch": 0.23840760009047726, "grad_norm": 0.2059846967458725, "learning_rate": 7.948717948717949e-06, "loss": 0.7254, "step": 527 }, { "epoch": 0.23885998642840986, "grad_norm": 0.22893200814723969, "learning_rate": 7.963800904977375e-06, "loss": 0.8682, "step": 528 }, { "epoch": 0.23931237276634246, "grad_norm": 0.2143840491771698, "learning_rate": 7.978883861236803e-06, "loss": 0.7818, "step": 529 }, { "epoch": 0.23976475910427505, "grad_norm": 0.23112569749355316, "learning_rate": 7.993966817496231e-06, "loss": 0.7892, "step": 530 }, { "epoch": 0.24021714544220765, "grad_norm": 0.22377145290374756, "learning_rate": 8.009049773755657e-06, "loss": 0.8705, "step": 531 }, { "epoch": 0.24066953178014025, "grad_norm": 0.3483259379863739, "learning_rate": 8.024132730015084e-06, "loss": 0.7766, "step": 532 }, { "epoch": 0.24112191811807285, "grad_norm": 0.2611573338508606, "learning_rate": 8.03921568627451e-06, "loss": 0.7419, "step": 533 }, { "epoch": 0.24157430445600542, "grad_norm": 0.269803524017334, "learning_rate": 8.054298642533938e-06, "loss": 0.8501, "step": 534 }, { "epoch": 0.24202669079393802, "grad_norm": 0.2800680100917816, "learning_rate": 8.069381598793365e-06, "loss": 0.7734, "step": 535 }, { "epoch": 0.2424790771318706, "grad_norm": 0.28553932905197144, "learning_rate": 8.084464555052791e-06, "loss": 1.0101, "step": 536 }, { "epoch": 0.2429314634698032, "grad_norm": 0.2857571542263031, "learning_rate": 8.099547511312217e-06, "loss": 0.8015, "step": 537 }, { "epoch": 0.2433838498077358, "grad_norm": 0.3187916874885559, "learning_rate": 8.114630467571645e-06, "loss": 0.8903, "step": 538 }, { "epoch": 0.2438362361456684, "grad_norm": 0.29023629426956177, "learning_rate": 8.129713423831072e-06, "loss": 0.87, "step": 539 }, { "epoch": 0.244288622483601, "grad_norm": 0.32729658484458923, "learning_rate": 8.144796380090498e-06, "loss": 0.9257, "step": 540 }, { "epoch": 0.2447410088215336, "grad_norm": 0.2886449992656708, "learning_rate": 8.159879336349925e-06, "loss": 0.8205, "step": 541 }, { "epoch": 0.24519339515946617, "grad_norm": 0.27254289388656616, "learning_rate": 8.174962292609353e-06, "loss": 0.8188, "step": 542 }, { "epoch": 0.24564578149739877, "grad_norm": 0.30570387840270996, "learning_rate": 8.190045248868779e-06, "loss": 0.9458, "step": 543 }, { "epoch": 0.24609816783533137, "grad_norm": 0.43287283182144165, "learning_rate": 8.205128205128205e-06, "loss": 0.8553, "step": 544 }, { "epoch": 0.24655055417326396, "grad_norm": 0.3033774495124817, "learning_rate": 8.220211161387632e-06, "loss": 0.93, "step": 545 }, { "epoch": 0.24700294051119656, "grad_norm": 0.3175485134124756, "learning_rate": 8.23529411764706e-06, "loss": 0.9236, "step": 546 }, { "epoch": 0.24745532684912916, "grad_norm": 0.33290380239486694, "learning_rate": 8.250377073906486e-06, "loss": 0.9319, "step": 547 }, { "epoch": 0.24790771318706176, "grad_norm": 0.32848671078681946, "learning_rate": 8.265460030165913e-06, "loss": 0.9322, "step": 548 }, { "epoch": 0.24836009952499435, "grad_norm": 0.31983664631843567, "learning_rate": 8.280542986425339e-06, "loss": 0.8845, "step": 549 }, { "epoch": 0.24881248586292695, "grad_norm": 0.6087541580200195, "learning_rate": 8.295625942684767e-06, "loss": 1.0424, "step": 550 }, { "epoch": 0.24926487220085952, "grad_norm": 0.10058319568634033, "learning_rate": 8.310708898944195e-06, "loss": 1.4144, "step": 551 }, { "epoch": 0.24971725853879212, "grad_norm": 0.13980086147785187, "learning_rate": 8.325791855203621e-06, "loss": 0.6471, "step": 552 }, { "epoch": 0.2501696448767247, "grad_norm": 0.12118122726678848, "learning_rate": 8.340874811463048e-06, "loss": 0.741, "step": 553 }, { "epoch": 0.2506220312146573, "grad_norm": 0.168208509683609, "learning_rate": 8.355957767722474e-06, "loss": 0.748, "step": 554 }, { "epoch": 0.2510744175525899, "grad_norm": 0.17964373528957367, "learning_rate": 8.371040723981902e-06, "loss": 0.8874, "step": 555 }, { "epoch": 0.2515268038905225, "grad_norm": 0.16260278224945068, "learning_rate": 8.386123680241329e-06, "loss": 0.7754, "step": 556 }, { "epoch": 0.2519791902284551, "grad_norm": 0.15795819461345673, "learning_rate": 8.401206636500755e-06, "loss": 0.8555, "step": 557 }, { "epoch": 0.2524315765663877, "grad_norm": 0.16253437101840973, "learning_rate": 8.416289592760181e-06, "loss": 0.9137, "step": 558 }, { "epoch": 0.2528839629043203, "grad_norm": 0.19547909498214722, "learning_rate": 8.43137254901961e-06, "loss": 0.8343, "step": 559 }, { "epoch": 0.2533363492422529, "grad_norm": 0.2100573182106018, "learning_rate": 8.446455505279036e-06, "loss": 0.7433, "step": 560 }, { "epoch": 0.2537887355801855, "grad_norm": 0.17543625831604004, "learning_rate": 8.461538461538462e-06, "loss": 0.7512, "step": 561 }, { "epoch": 0.2542411219181181, "grad_norm": 0.25688624382019043, "learning_rate": 8.476621417797888e-06, "loss": 0.8032, "step": 562 }, { "epoch": 0.2546935082560507, "grad_norm": 0.20365720987319946, "learning_rate": 8.491704374057317e-06, "loss": 0.851, "step": 563 }, { "epoch": 0.25514589459398324, "grad_norm": 0.18265452980995178, "learning_rate": 8.506787330316743e-06, "loss": 0.7512, "step": 564 }, { "epoch": 0.25559828093191583, "grad_norm": 0.21435123682022095, "learning_rate": 8.52187028657617e-06, "loss": 0.7501, "step": 565 }, { "epoch": 0.25605066726984843, "grad_norm": 0.20012609660625458, "learning_rate": 8.536953242835596e-06, "loss": 0.8552, "step": 566 }, { "epoch": 0.25650305360778103, "grad_norm": 0.19673648476600647, "learning_rate": 8.552036199095024e-06, "loss": 0.8605, "step": 567 }, { "epoch": 0.2569554399457136, "grad_norm": 0.21650375425815582, "learning_rate": 8.56711915535445e-06, "loss": 0.7221, "step": 568 }, { "epoch": 0.2574078262836462, "grad_norm": 0.19108593463897705, "learning_rate": 8.582202111613876e-06, "loss": 0.8136, "step": 569 }, { "epoch": 0.2578602126215788, "grad_norm": 0.2084406316280365, "learning_rate": 8.597285067873304e-06, "loss": 0.8232, "step": 570 }, { "epoch": 0.2583125989595114, "grad_norm": 0.23692235350608826, "learning_rate": 8.612368024132731e-06, "loss": 0.8233, "step": 571 }, { "epoch": 0.258764985297444, "grad_norm": 0.21277695894241333, "learning_rate": 8.627450980392157e-06, "loss": 0.9638, "step": 572 }, { "epoch": 0.2592173716353766, "grad_norm": 0.20811431109905243, "learning_rate": 8.642533936651585e-06, "loss": 0.8521, "step": 573 }, { "epoch": 0.2596697579733092, "grad_norm": 0.24433712661266327, "learning_rate": 8.657616892911012e-06, "loss": 0.8579, "step": 574 }, { "epoch": 0.2601221443112418, "grad_norm": 0.22188237309455872, "learning_rate": 8.672699849170438e-06, "loss": 0.7753, "step": 575 }, { "epoch": 0.2605745306491744, "grad_norm": 0.24228435754776, "learning_rate": 8.687782805429864e-06, "loss": 0.8543, "step": 576 }, { "epoch": 0.261026916987107, "grad_norm": 0.2433740347623825, "learning_rate": 8.702865761689292e-06, "loss": 0.7934, "step": 577 }, { "epoch": 0.2614793033250396, "grad_norm": 0.25339850783348083, "learning_rate": 8.717948717948719e-06, "loss": 0.8746, "step": 578 }, { "epoch": 0.2619316896629722, "grad_norm": 0.23196004331111908, "learning_rate": 8.733031674208145e-06, "loss": 0.8227, "step": 579 }, { "epoch": 0.2623840760009048, "grad_norm": 0.27828916907310486, "learning_rate": 8.748114630467572e-06, "loss": 0.792, "step": 580 }, { "epoch": 0.26283646233883734, "grad_norm": 0.27962198853492737, "learning_rate": 8.763197586727e-06, "loss": 0.7762, "step": 581 }, { "epoch": 0.26328884867676994, "grad_norm": 0.24770735204219818, "learning_rate": 8.778280542986426e-06, "loss": 0.851, "step": 582 }, { "epoch": 0.26374123501470254, "grad_norm": 0.24233609437942505, "learning_rate": 8.793363499245852e-06, "loss": 0.8128, "step": 583 }, { "epoch": 0.26419362135263513, "grad_norm": 0.2704032361507416, "learning_rate": 8.808446455505279e-06, "loss": 0.8464, "step": 584 }, { "epoch": 0.26464600769056773, "grad_norm": 0.2734048664569855, "learning_rate": 8.823529411764707e-06, "loss": 0.8075, "step": 585 }, { "epoch": 0.26509839402850033, "grad_norm": 0.26624831557273865, "learning_rate": 8.838612368024133e-06, "loss": 0.7842, "step": 586 }, { "epoch": 0.2655507803664329, "grad_norm": 0.26060259342193604, "learning_rate": 8.85369532428356e-06, "loss": 0.9036, "step": 587 }, { "epoch": 0.2660031667043655, "grad_norm": 0.30297520756721497, "learning_rate": 8.868778280542986e-06, "loss": 0.8108, "step": 588 }, { "epoch": 0.2664555530422981, "grad_norm": 0.29915040731430054, "learning_rate": 8.883861236802414e-06, "loss": 0.8006, "step": 589 }, { "epoch": 0.2669079393802307, "grad_norm": 0.36738330125808716, "learning_rate": 8.89894419306184e-06, "loss": 0.8752, "step": 590 }, { "epoch": 0.2673603257181633, "grad_norm": 0.33454471826553345, "learning_rate": 8.914027149321268e-06, "loss": 0.8653, "step": 591 }, { "epoch": 0.2678127120560959, "grad_norm": 0.41570669412612915, "learning_rate": 8.929110105580695e-06, "loss": 0.8287, "step": 592 }, { "epoch": 0.2682650983940285, "grad_norm": 0.27037131786346436, "learning_rate": 8.944193061840121e-06, "loss": 0.7939, "step": 593 }, { "epoch": 0.2687174847319611, "grad_norm": 0.4226844310760498, "learning_rate": 8.95927601809955e-06, "loss": 0.8281, "step": 594 }, { "epoch": 0.2691698710698937, "grad_norm": 0.26709386706352234, "learning_rate": 8.974358974358976e-06, "loss": 0.8209, "step": 595 }, { "epoch": 0.2696222574078263, "grad_norm": 0.36793026328086853, "learning_rate": 8.989441930618402e-06, "loss": 0.872, "step": 596 }, { "epoch": 0.2700746437457589, "grad_norm": 0.3872925341129303, "learning_rate": 9.004524886877828e-06, "loss": 0.913, "step": 597 }, { "epoch": 0.2705270300836915, "grad_norm": 0.5190165638923645, "learning_rate": 9.019607843137256e-06, "loss": 1.0625, "step": 598 }, { "epoch": 0.27097941642162404, "grad_norm": 0.5005229711532593, "learning_rate": 9.034690799396683e-06, "loss": 1.0171, "step": 599 }, { "epoch": 0.27143180275955664, "grad_norm": 0.5042964816093445, "learning_rate": 9.049773755656109e-06, "loss": 1.0804, "step": 600 }, { "epoch": 0.27143180275955664, "eval_loss": 0.8430254459381104, "eval_runtime": 26.6976, "eval_samples_per_second": 27.868, "eval_steps_per_second": 6.967, "step": 600 }, { "epoch": 0.27188418909748924, "grad_norm": 0.11535760015249252, "learning_rate": 9.064856711915535e-06, "loss": 1.247, "step": 601 }, { "epoch": 0.27233657543542183, "grad_norm": 0.1591215580701828, "learning_rate": 9.079939668174964e-06, "loss": 0.7234, "step": 602 }, { "epoch": 0.27278896177335443, "grad_norm": 0.16187569499015808, "learning_rate": 9.09502262443439e-06, "loss": 0.7887, "step": 603 }, { "epoch": 0.27324134811128703, "grad_norm": 0.15579929947853088, "learning_rate": 9.110105580693816e-06, "loss": 0.7652, "step": 604 }, { "epoch": 0.27369373444921963, "grad_norm": 0.18251098692417145, "learning_rate": 9.125188536953243e-06, "loss": 0.7059, "step": 605 }, { "epoch": 0.2741461207871522, "grad_norm": 0.17758069932460785, "learning_rate": 9.14027149321267e-06, "loss": 0.7807, "step": 606 }, { "epoch": 0.2745985071250848, "grad_norm": 0.17907141149044037, "learning_rate": 9.155354449472097e-06, "loss": 0.726, "step": 607 }, { "epoch": 0.2750508934630174, "grad_norm": 0.18857307732105255, "learning_rate": 9.170437405731523e-06, "loss": 0.8419, "step": 608 }, { "epoch": 0.27550327980095, "grad_norm": 0.18397226929664612, "learning_rate": 9.18552036199095e-06, "loss": 0.8134, "step": 609 }, { "epoch": 0.2759556661388826, "grad_norm": 0.16396372020244598, "learning_rate": 9.200603318250378e-06, "loss": 0.7649, "step": 610 }, { "epoch": 0.2764080524768152, "grad_norm": 0.1877203732728958, "learning_rate": 9.215686274509804e-06, "loss": 0.7172, "step": 611 }, { "epoch": 0.2768604388147478, "grad_norm": 0.18403850495815277, "learning_rate": 9.230769230769232e-06, "loss": 0.7604, "step": 612 }, { "epoch": 0.2773128251526804, "grad_norm": 0.18659637868404388, "learning_rate": 9.245852187028659e-06, "loss": 0.6547, "step": 613 }, { "epoch": 0.277765211490613, "grad_norm": 0.18184320628643036, "learning_rate": 9.260935143288085e-06, "loss": 0.8054, "step": 614 }, { "epoch": 0.2782175978285456, "grad_norm": 0.23369327187538147, "learning_rate": 9.276018099547513e-06, "loss": 0.7356, "step": 615 }, { "epoch": 0.27866998416647815, "grad_norm": 0.1868286281824112, "learning_rate": 9.29110105580694e-06, "loss": 0.7642, "step": 616 }, { "epoch": 0.27912237050441074, "grad_norm": 0.20586813986301422, "learning_rate": 9.306184012066366e-06, "loss": 0.8384, "step": 617 }, { "epoch": 0.27957475684234334, "grad_norm": 0.20552411675453186, "learning_rate": 9.321266968325792e-06, "loss": 0.8653, "step": 618 }, { "epoch": 0.28002714318027594, "grad_norm": 0.19008006155490875, "learning_rate": 9.33634992458522e-06, "loss": 0.7473, "step": 619 }, { "epoch": 0.28047952951820854, "grad_norm": 0.21605855226516724, "learning_rate": 9.351432880844647e-06, "loss": 0.7797, "step": 620 }, { "epoch": 0.28093191585614113, "grad_norm": 0.22334149479866028, "learning_rate": 9.366515837104073e-06, "loss": 0.8866, "step": 621 }, { "epoch": 0.28138430219407373, "grad_norm": 0.24114498496055603, "learning_rate": 9.3815987933635e-06, "loss": 0.8625, "step": 622 }, { "epoch": 0.28183668853200633, "grad_norm": 0.2159021943807602, "learning_rate": 9.396681749622927e-06, "loss": 0.7759, "step": 623 }, { "epoch": 0.2822890748699389, "grad_norm": 0.22454793751239777, "learning_rate": 9.411764705882354e-06, "loss": 0.8201, "step": 624 }, { "epoch": 0.2827414612078715, "grad_norm": 0.22020384669303894, "learning_rate": 9.42684766214178e-06, "loss": 0.7951, "step": 625 }, { "epoch": 0.2831938475458041, "grad_norm": 0.22186905145645142, "learning_rate": 9.441930618401207e-06, "loss": 0.7026, "step": 626 }, { "epoch": 0.2836462338837367, "grad_norm": 0.2376614809036255, "learning_rate": 9.457013574660635e-06, "loss": 0.8166, "step": 627 }, { "epoch": 0.2840986202216693, "grad_norm": 0.24639269709587097, "learning_rate": 9.472096530920061e-06, "loss": 0.9116, "step": 628 }, { "epoch": 0.2845510065596019, "grad_norm": 0.26511576771736145, "learning_rate": 9.487179487179487e-06, "loss": 0.7833, "step": 629 }, { "epoch": 0.2850033928975345, "grad_norm": 0.22919566929340363, "learning_rate": 9.502262443438914e-06, "loss": 0.768, "step": 630 }, { "epoch": 0.2854557792354671, "grad_norm": 0.24819107353687286, "learning_rate": 9.517345399698342e-06, "loss": 0.8151, "step": 631 }, { "epoch": 0.2859081655733997, "grad_norm": 0.3019786775112152, "learning_rate": 9.53242835595777e-06, "loss": 0.7932, "step": 632 }, { "epoch": 0.28636055191133225, "grad_norm": 0.2621288001537323, "learning_rate": 9.547511312217196e-06, "loss": 0.8667, "step": 633 }, { "epoch": 0.28681293824926485, "grad_norm": 0.2945888936519623, "learning_rate": 9.562594268476623e-06, "loss": 0.8926, "step": 634 }, { "epoch": 0.28726532458719745, "grad_norm": 0.24975991249084473, "learning_rate": 9.577677224736049e-06, "loss": 0.8074, "step": 635 }, { "epoch": 0.28771771092513004, "grad_norm": 0.2516855299472809, "learning_rate": 9.592760180995477e-06, "loss": 0.6964, "step": 636 }, { "epoch": 0.28817009726306264, "grad_norm": 0.3490786850452423, "learning_rate": 9.607843137254903e-06, "loss": 0.8393, "step": 637 }, { "epoch": 0.28862248360099524, "grad_norm": 0.3269819915294647, "learning_rate": 9.62292609351433e-06, "loss": 0.8173, "step": 638 }, { "epoch": 0.28907486993892784, "grad_norm": 0.2980722486972809, "learning_rate": 9.638009049773756e-06, "loss": 0.8884, "step": 639 }, { "epoch": 0.28952725627686043, "grad_norm": 0.3092951476573944, "learning_rate": 9.653092006033184e-06, "loss": 0.7876, "step": 640 }, { "epoch": 0.28997964261479303, "grad_norm": 0.3104114234447479, "learning_rate": 9.66817496229261e-06, "loss": 0.8811, "step": 641 }, { "epoch": 0.29043202895272563, "grad_norm": 0.35291141271591187, "learning_rate": 9.683257918552037e-06, "loss": 0.8584, "step": 642 }, { "epoch": 0.2908844152906582, "grad_norm": 0.3490295112133026, "learning_rate": 9.698340874811463e-06, "loss": 0.8335, "step": 643 }, { "epoch": 0.2913368016285908, "grad_norm": 0.32756829261779785, "learning_rate": 9.713423831070891e-06, "loss": 0.8756, "step": 644 }, { "epoch": 0.2917891879665234, "grad_norm": 0.32433420419692993, "learning_rate": 9.728506787330318e-06, "loss": 0.9271, "step": 645 }, { "epoch": 0.292241574304456, "grad_norm": 0.3824956715106964, "learning_rate": 9.743589743589744e-06, "loss": 0.7664, "step": 646 }, { "epoch": 0.2926939606423886, "grad_norm": 0.38768506050109863, "learning_rate": 9.75867269984917e-06, "loss": 0.93, "step": 647 }, { "epoch": 0.2931463469803212, "grad_norm": 0.44878461956977844, "learning_rate": 9.773755656108599e-06, "loss": 0.8589, "step": 648 }, { "epoch": 0.2935987333182538, "grad_norm": 0.40161120891571045, "learning_rate": 9.788838612368025e-06, "loss": 0.9263, "step": 649 }, { "epoch": 0.2940511196561864, "grad_norm": 0.6391124725341797, "learning_rate": 9.803921568627451e-06, "loss": 0.9795, "step": 650 }, { "epoch": 0.29450350599411895, "grad_norm": 0.10679467022418976, "learning_rate": 9.819004524886878e-06, "loss": 1.0668, "step": 651 }, { "epoch": 0.29495589233205155, "grad_norm": 0.14910776913166046, "learning_rate": 9.834087481146306e-06, "loss": 0.983, "step": 652 }, { "epoch": 0.29540827866998415, "grad_norm": 0.13148973882198334, "learning_rate": 9.849170437405732e-06, "loss": 0.5884, "step": 653 }, { "epoch": 0.29586066500791675, "grad_norm": 0.15688712894916534, "learning_rate": 9.86425339366516e-06, "loss": 0.8143, "step": 654 }, { "epoch": 0.29631305134584934, "grad_norm": 0.16994206607341766, "learning_rate": 9.879336349924586e-06, "loss": 0.8088, "step": 655 }, { "epoch": 0.29676543768378194, "grad_norm": 0.17601217329502106, "learning_rate": 9.894419306184013e-06, "loss": 0.705, "step": 656 }, { "epoch": 0.29721782402171454, "grad_norm": 0.14906974136829376, "learning_rate": 9.90950226244344e-06, "loss": 0.6621, "step": 657 }, { "epoch": 0.29767021035964714, "grad_norm": 0.17265290021896362, "learning_rate": 9.924585218702867e-06, "loss": 0.7616, "step": 658 }, { "epoch": 0.29812259669757973, "grad_norm": 0.17803266644477844, "learning_rate": 9.939668174962294e-06, "loss": 0.7116, "step": 659 }, { "epoch": 0.29857498303551233, "grad_norm": 0.24010293185710907, "learning_rate": 9.95475113122172e-06, "loss": 0.8353, "step": 660 }, { "epoch": 0.29902736937344493, "grad_norm": 0.19234101474285126, "learning_rate": 9.969834087481146e-06, "loss": 0.6903, "step": 661 }, { "epoch": 0.2994797557113775, "grad_norm": 0.2093159705400467, "learning_rate": 9.984917043740574e-06, "loss": 0.7693, "step": 662 }, { "epoch": 0.2999321420493101, "grad_norm": 0.19772885739803314, "learning_rate": 1e-05, "loss": 0.7907, "step": 663 }, { "epoch": 0.3003845283872427, "grad_norm": 0.21634182333946228, "learning_rate": 9.999999946307734e-06, "loss": 0.7655, "step": 664 }, { "epoch": 0.3008369147251753, "grad_norm": 0.21543526649475098, "learning_rate": 9.999999785230936e-06, "loss": 0.8612, "step": 665 }, { "epoch": 0.3012893010631079, "grad_norm": 0.2269447296857834, "learning_rate": 9.99999951676961e-06, "loss": 0.7504, "step": 666 }, { "epoch": 0.3017416874010405, "grad_norm": 0.21243199706077576, "learning_rate": 9.99999914092376e-06, "loss": 0.7233, "step": 667 }, { "epoch": 0.30219407373897306, "grad_norm": 0.20383904874324799, "learning_rate": 9.999998657693394e-06, "loss": 0.7666, "step": 668 }, { "epoch": 0.30264646007690565, "grad_norm": 0.20236240327358246, "learning_rate": 9.999998067078527e-06, "loss": 0.6562, "step": 669 }, { "epoch": 0.30309884641483825, "grad_norm": 0.24028515815734863, "learning_rate": 9.999997369079166e-06, "loss": 0.752, "step": 670 }, { "epoch": 0.30355123275277085, "grad_norm": 0.2192355990409851, "learning_rate": 9.99999656369533e-06, "loss": 0.8463, "step": 671 }, { "epoch": 0.30400361909070345, "grad_norm": 0.2391347885131836, "learning_rate": 9.999995650927034e-06, "loss": 0.9192, "step": 672 }, { "epoch": 0.30445600542863605, "grad_norm": 0.22476516664028168, "learning_rate": 9.999994630774298e-06, "loss": 0.7864, "step": 673 }, { "epoch": 0.30490839176656864, "grad_norm": 0.24082158505916595, "learning_rate": 9.999993503237146e-06, "loss": 0.6059, "step": 674 }, { "epoch": 0.30536077810450124, "grad_norm": 0.24348177015781403, "learning_rate": 9.9999922683156e-06, "loss": 0.7738, "step": 675 }, { "epoch": 0.30581316444243384, "grad_norm": 0.23889607191085815, "learning_rate": 9.999990926009686e-06, "loss": 0.6804, "step": 676 }, { "epoch": 0.30626555078036644, "grad_norm": 0.26433148980140686, "learning_rate": 9.999989476319434e-06, "loss": 0.7294, "step": 677 }, { "epoch": 0.30671793711829903, "grad_norm": 0.2967691421508789, "learning_rate": 9.999987919244875e-06, "loss": 0.8649, "step": 678 }, { "epoch": 0.30717032345623163, "grad_norm": 0.25524184107780457, "learning_rate": 9.999986254786043e-06, "loss": 0.859, "step": 679 }, { "epoch": 0.30762270979416423, "grad_norm": 0.3223593831062317, "learning_rate": 9.999984482942972e-06, "loss": 0.7437, "step": 680 }, { "epoch": 0.3080750961320968, "grad_norm": 0.32611149549484253, "learning_rate": 9.999982603715701e-06, "loss": 0.7825, "step": 681 }, { "epoch": 0.3085274824700294, "grad_norm": 0.2564953565597534, "learning_rate": 9.999980617104273e-06, "loss": 0.775, "step": 682 }, { "epoch": 0.308979868807962, "grad_norm": 0.2723800241947174, "learning_rate": 9.999978523108726e-06, "loss": 0.7974, "step": 683 }, { "epoch": 0.3094322551458946, "grad_norm": 0.29790589213371277, "learning_rate": 9.999976321729107e-06, "loss": 0.8943, "step": 684 }, { "epoch": 0.30988464148382716, "grad_norm": 0.3206850290298462, "learning_rate": 9.999974012965464e-06, "loss": 0.7787, "step": 685 }, { "epoch": 0.31033702782175976, "grad_norm": 0.3013688921928406, "learning_rate": 9.999971596817846e-06, "loss": 0.8085, "step": 686 }, { "epoch": 0.31078941415969236, "grad_norm": 0.3094845712184906, "learning_rate": 9.999969073286306e-06, "loss": 0.7569, "step": 687 }, { "epoch": 0.31124180049762495, "grad_norm": 0.32132604718208313, "learning_rate": 9.999966442370896e-06, "loss": 0.8314, "step": 688 }, { "epoch": 0.31169418683555755, "grad_norm": 0.3009161949157715, "learning_rate": 9.999963704071673e-06, "loss": 0.775, "step": 689 }, { "epoch": 0.31214657317349015, "grad_norm": 0.32088470458984375, "learning_rate": 9.9999608583887e-06, "loss": 0.7883, "step": 690 }, { "epoch": 0.31259895951142275, "grad_norm": 0.28111082315444946, "learning_rate": 9.999957905322031e-06, "loss": 0.7858, "step": 691 }, { "epoch": 0.31305134584935534, "grad_norm": 0.31146475672721863, "learning_rate": 9.999954844871732e-06, "loss": 0.8233, "step": 692 }, { "epoch": 0.31350373218728794, "grad_norm": 0.33425143361091614, "learning_rate": 9.999951677037873e-06, "loss": 0.7959, "step": 693 }, { "epoch": 0.31395611852522054, "grad_norm": 0.332381933927536, "learning_rate": 9.999948401820518e-06, "loss": 0.8484, "step": 694 }, { "epoch": 0.31440850486315314, "grad_norm": 0.3266429007053375, "learning_rate": 9.999945019219737e-06, "loss": 0.8021, "step": 695 }, { "epoch": 0.31486089120108574, "grad_norm": 0.3864741921424866, "learning_rate": 9.999941529235606e-06, "loss": 0.7799, "step": 696 }, { "epoch": 0.31531327753901833, "grad_norm": 0.40724241733551025, "learning_rate": 9.999937931868196e-06, "loss": 0.8737, "step": 697 }, { "epoch": 0.31576566387695093, "grad_norm": 0.467903733253479, "learning_rate": 9.999934227117585e-06, "loss": 0.8994, "step": 698 }, { "epoch": 0.31621805021488353, "grad_norm": 0.42579084634780884, "learning_rate": 9.999930414983854e-06, "loss": 0.9759, "step": 699 }, { "epoch": 0.3166704365528161, "grad_norm": 0.4810768663883209, "learning_rate": 9.999926495467086e-06, "loss": 0.9591, "step": 700 }, { "epoch": 0.3171228228907487, "grad_norm": 0.09844833612442017, "learning_rate": 9.999922468567362e-06, "loss": 1.3387, "step": 701 }, { "epoch": 0.3175752092286813, "grad_norm": 0.14618127048015594, "learning_rate": 9.999918334284771e-06, "loss": 0.9447, "step": 702 }, { "epoch": 0.31802759556661386, "grad_norm": 0.15217307209968567, "learning_rate": 9.999914092619401e-06, "loss": 0.754, "step": 703 }, { "epoch": 0.31847998190454646, "grad_norm": 0.1749715954065323, "learning_rate": 9.999909743571344e-06, "loss": 0.73, "step": 704 }, { "epoch": 0.31893236824247906, "grad_norm": 0.1773625612258911, "learning_rate": 9.99990528714069e-06, "loss": 0.8201, "step": 705 }, { "epoch": 0.31938475458041166, "grad_norm": 0.2205086648464203, "learning_rate": 9.999900723327538e-06, "loss": 0.6247, "step": 706 }, { "epoch": 0.31983714091834425, "grad_norm": 0.1964729130268097, "learning_rate": 9.999896052131984e-06, "loss": 0.8643, "step": 707 }, { "epoch": 0.32028952725627685, "grad_norm": 0.17778770625591278, "learning_rate": 9.999891273554131e-06, "loss": 0.7307, "step": 708 }, { "epoch": 0.32074191359420945, "grad_norm": 0.19349122047424316, "learning_rate": 9.99988638759408e-06, "loss": 0.7564, "step": 709 }, { "epoch": 0.32119429993214205, "grad_norm": 0.21015988290309906, "learning_rate": 9.999881394251936e-06, "loss": 0.8297, "step": 710 }, { "epoch": 0.32164668627007464, "grad_norm": 0.19625915586948395, "learning_rate": 9.999876293527804e-06, "loss": 0.7653, "step": 711 }, { "epoch": 0.32209907260800724, "grad_norm": 0.22221893072128296, "learning_rate": 9.999871085421798e-06, "loss": 0.7262, "step": 712 }, { "epoch": 0.32255145894593984, "grad_norm": 0.2049301117658615, "learning_rate": 9.999865769934028e-06, "loss": 0.7042, "step": 713 }, { "epoch": 0.32300384528387244, "grad_norm": 0.23430189490318298, "learning_rate": 9.999860347064607e-06, "loss": 0.918, "step": 714 }, { "epoch": 0.32345623162180503, "grad_norm": 0.213454931974411, "learning_rate": 9.999854816813652e-06, "loss": 0.8427, "step": 715 }, { "epoch": 0.32390861795973763, "grad_norm": 0.2399243265390396, "learning_rate": 9.999849179181283e-06, "loss": 0.9116, "step": 716 }, { "epoch": 0.32436100429767023, "grad_norm": 0.23180051147937775, "learning_rate": 9.999843434167618e-06, "loss": 0.8553, "step": 717 }, { "epoch": 0.32481339063560283, "grad_norm": 0.23937080800533295, "learning_rate": 9.999837581772784e-06, "loss": 0.6992, "step": 718 }, { "epoch": 0.3252657769735354, "grad_norm": 0.2757538855075836, "learning_rate": 9.999831621996905e-06, "loss": 0.8991, "step": 719 }, { "epoch": 0.32571816331146797, "grad_norm": 0.23665755987167358, "learning_rate": 9.99982555484011e-06, "loss": 0.7171, "step": 720 }, { "epoch": 0.32617054964940057, "grad_norm": 0.26817864179611206, "learning_rate": 9.999819380302527e-06, "loss": 0.6484, "step": 721 }, { "epoch": 0.32662293598733316, "grad_norm": 0.2628593444824219, "learning_rate": 9.99981309838429e-06, "loss": 0.8, "step": 722 }, { "epoch": 0.32707532232526576, "grad_norm": 0.2720540463924408, "learning_rate": 9.999806709085536e-06, "loss": 0.7754, "step": 723 }, { "epoch": 0.32752770866319836, "grad_norm": 0.2581476867198944, "learning_rate": 9.999800212406399e-06, "loss": 0.8339, "step": 724 }, { "epoch": 0.32798009500113096, "grad_norm": 0.27235347032546997, "learning_rate": 9.99979360834702e-06, "loss": 0.8154, "step": 725 }, { "epoch": 0.32843248133906355, "grad_norm": 0.2872222363948822, "learning_rate": 9.99978689690754e-06, "loss": 0.7426, "step": 726 }, { "epoch": 0.32888486767699615, "grad_norm": 0.29493609070777893, "learning_rate": 9.999780078088104e-06, "loss": 0.8643, "step": 727 }, { "epoch": 0.32933725401492875, "grad_norm": 0.26483091711997986, "learning_rate": 9.999773151888857e-06, "loss": 0.7245, "step": 728 }, { "epoch": 0.32978964035286135, "grad_norm": 0.29625141620635986, "learning_rate": 9.999766118309951e-06, "loss": 0.9654, "step": 729 }, { "epoch": 0.33024202669079394, "grad_norm": 0.2863912582397461, "learning_rate": 9.999758977351533e-06, "loss": 0.8871, "step": 730 }, { "epoch": 0.33069441302872654, "grad_norm": 0.30831360816955566, "learning_rate": 9.999751729013761e-06, "loss": 0.7995, "step": 731 }, { "epoch": 0.33114679936665914, "grad_norm": 0.30263751745224, "learning_rate": 9.999744373296785e-06, "loss": 0.7131, "step": 732 }, { "epoch": 0.33159918570459174, "grad_norm": 0.3050912916660309, "learning_rate": 9.999736910200768e-06, "loss": 0.7507, "step": 733 }, { "epoch": 0.33205157204252433, "grad_norm": 0.3333936333656311, "learning_rate": 9.999729339725868e-06, "loss": 0.7367, "step": 734 }, { "epoch": 0.33250395838045693, "grad_norm": 0.39772823452949524, "learning_rate": 9.999721661872249e-06, "loss": 0.6955, "step": 735 }, { "epoch": 0.33295634471838953, "grad_norm": 0.319406121969223, "learning_rate": 9.999713876640072e-06, "loss": 0.7617, "step": 736 }, { "epoch": 0.33340873105632207, "grad_norm": 0.4028261601924896, "learning_rate": 9.999705984029509e-06, "loss": 0.8306, "step": 737 }, { "epoch": 0.33386111739425467, "grad_norm": 0.32012486457824707, "learning_rate": 9.999697984040727e-06, "loss": 0.7395, "step": 738 }, { "epoch": 0.33431350373218727, "grad_norm": 0.353914350271225, "learning_rate": 9.999689876673898e-06, "loss": 0.7845, "step": 739 }, { "epoch": 0.33476589007011986, "grad_norm": 0.4475042521953583, "learning_rate": 9.999681661929197e-06, "loss": 0.7183, "step": 740 }, { "epoch": 0.33521827640805246, "grad_norm": 0.3851722776889801, "learning_rate": 9.999673339806799e-06, "loss": 0.7413, "step": 741 }, { "epoch": 0.33567066274598506, "grad_norm": 0.4157194495201111, "learning_rate": 9.999664910306884e-06, "loss": 0.792, "step": 742 }, { "epoch": 0.33612304908391766, "grad_norm": 0.40094488859176636, "learning_rate": 9.999656373429633e-06, "loss": 0.6727, "step": 743 }, { "epoch": 0.33657543542185026, "grad_norm": 0.41361546516418457, "learning_rate": 9.999647729175228e-06, "loss": 0.8506, "step": 744 }, { "epoch": 0.33702782175978285, "grad_norm": 0.4474533796310425, "learning_rate": 9.999638977543857e-06, "loss": 0.8406, "step": 745 }, { "epoch": 0.33748020809771545, "grad_norm": 0.4576348662376404, "learning_rate": 9.999630118535705e-06, "loss": 0.9766, "step": 746 }, { "epoch": 0.33793259443564805, "grad_norm": 0.4989112615585327, "learning_rate": 9.999621152150967e-06, "loss": 0.909, "step": 747 }, { "epoch": 0.33838498077358065, "grad_norm": 0.5225561857223511, "learning_rate": 9.999612078389829e-06, "loss": 0.8975, "step": 748 }, { "epoch": 0.33883736711151324, "grad_norm": 0.5309637188911438, "learning_rate": 9.999602897252491e-06, "loss": 0.8105, "step": 749 }, { "epoch": 0.33928975344944584, "grad_norm": 0.5588895082473755, "learning_rate": 9.999593608739148e-06, "loss": 0.8661, "step": 750 }, { "epoch": 0.33974213978737844, "grad_norm": 0.11462723463773727, "learning_rate": 9.999584212849999e-06, "loss": 1.4317, "step": 751 }, { "epoch": 0.34019452612531104, "grad_norm": 0.1657470315694809, "learning_rate": 9.999574709585249e-06, "loss": 1.2895, "step": 752 }, { "epoch": 0.34064691246324363, "grad_norm": 0.15865597128868103, "learning_rate": 9.999565098945098e-06, "loss": 0.7214, "step": 753 }, { "epoch": 0.34109929880117623, "grad_norm": 0.18296761810779572, "learning_rate": 9.999555380929754e-06, "loss": 0.8502, "step": 754 }, { "epoch": 0.3415516851391088, "grad_norm": 0.20520542562007904, "learning_rate": 9.999545555539428e-06, "loss": 0.7752, "step": 755 }, { "epoch": 0.34200407147704137, "grad_norm": 0.22494405508041382, "learning_rate": 9.999535622774327e-06, "loss": 1.0753, "step": 756 }, { "epoch": 0.34245645781497397, "grad_norm": 0.20567281544208527, "learning_rate": 9.999525582634669e-06, "loss": 0.7091, "step": 757 }, { "epoch": 0.34290884415290657, "grad_norm": 0.22138096392154694, "learning_rate": 9.999515435120663e-06, "loss": 0.6825, "step": 758 }, { "epoch": 0.34336123049083916, "grad_norm": 0.23031044006347656, "learning_rate": 9.999505180232535e-06, "loss": 0.6882, "step": 759 }, { "epoch": 0.34381361682877176, "grad_norm": 0.2219041883945465, "learning_rate": 9.999494817970498e-06, "loss": 0.7071, "step": 760 }, { "epoch": 0.34426600316670436, "grad_norm": 0.236533522605896, "learning_rate": 9.999484348334779e-06, "loss": 0.6998, "step": 761 }, { "epoch": 0.34471838950463696, "grad_norm": 0.2410547435283661, "learning_rate": 9.9994737713256e-06, "loss": 0.679, "step": 762 }, { "epoch": 0.34517077584256955, "grad_norm": 0.23924872279167175, "learning_rate": 9.999463086943192e-06, "loss": 0.6628, "step": 763 }, { "epoch": 0.34562316218050215, "grad_norm": 0.2543971538543701, "learning_rate": 9.999452295187782e-06, "loss": 0.6264, "step": 764 }, { "epoch": 0.34607554851843475, "grad_norm": 0.26966366171836853, "learning_rate": 9.9994413960596e-06, "loss": 0.818, "step": 765 }, { "epoch": 0.34652793485636735, "grad_norm": 0.26649099588394165, "learning_rate": 9.999430389558884e-06, "loss": 0.683, "step": 766 }, { "epoch": 0.34698032119429995, "grad_norm": 0.2938804030418396, "learning_rate": 9.999419275685866e-06, "loss": 0.6605, "step": 767 }, { "epoch": 0.34743270753223254, "grad_norm": 0.3137078881263733, "learning_rate": 9.99940805444079e-06, "loss": 0.7299, "step": 768 }, { "epoch": 0.34788509387016514, "grad_norm": 0.33154818415641785, "learning_rate": 9.999396725823893e-06, "loss": 0.9033, "step": 769 }, { "epoch": 0.34833748020809774, "grad_norm": 0.30985385179519653, "learning_rate": 9.99938528983542e-06, "loss": 0.7896, "step": 770 }, { "epoch": 0.34878986654603034, "grad_norm": 0.32063204050064087, "learning_rate": 9.999373746475614e-06, "loss": 0.7019, "step": 771 }, { "epoch": 0.3492422528839629, "grad_norm": 0.3280408978462219, "learning_rate": 9.999362095744727e-06, "loss": 0.722, "step": 772 }, { "epoch": 0.3496946392218955, "grad_norm": 0.3415104150772095, "learning_rate": 9.999350337643006e-06, "loss": 0.8423, "step": 773 }, { "epoch": 0.3501470255598281, "grad_norm": 0.38077643513679504, "learning_rate": 9.999338472170705e-06, "loss": 0.6581, "step": 774 }, { "epoch": 0.35059941189776067, "grad_norm": 0.36855074763298035, "learning_rate": 9.999326499328078e-06, "loss": 0.735, "step": 775 }, { "epoch": 0.35105179823569327, "grad_norm": 0.38462117314338684, "learning_rate": 9.999314419115383e-06, "loss": 0.7566, "step": 776 }, { "epoch": 0.35150418457362587, "grad_norm": 0.3823815584182739, "learning_rate": 9.99930223153288e-06, "loss": 0.6354, "step": 777 }, { "epoch": 0.35195657091155846, "grad_norm": 0.4074817895889282, "learning_rate": 9.999289936580827e-06, "loss": 0.7219, "step": 778 }, { "epoch": 0.35240895724949106, "grad_norm": 0.38901767134666443, "learning_rate": 9.999277534259495e-06, "loss": 0.7519, "step": 779 }, { "epoch": 0.35286134358742366, "grad_norm": 0.4813770055770874, "learning_rate": 9.999265024569143e-06, "loss": 0.7989, "step": 780 }, { "epoch": 0.35331372992535626, "grad_norm": 0.4137837588787079, "learning_rate": 9.999252407510044e-06, "loss": 0.6997, "step": 781 }, { "epoch": 0.35376611626328885, "grad_norm": 0.4605962038040161, "learning_rate": 9.999239683082469e-06, "loss": 0.7328, "step": 782 }, { "epoch": 0.35421850260122145, "grad_norm": 0.5095902681350708, "learning_rate": 9.99922685128669e-06, "loss": 0.7999, "step": 783 }, { "epoch": 0.35467088893915405, "grad_norm": 0.5015769004821777, "learning_rate": 9.99921391212298e-06, "loss": 0.7852, "step": 784 }, { "epoch": 0.35512327527708665, "grad_norm": 0.47332072257995605, "learning_rate": 9.999200865591623e-06, "loss": 0.68, "step": 785 }, { "epoch": 0.35557566161501925, "grad_norm": 0.5698593258857727, "learning_rate": 9.999187711692895e-06, "loss": 0.7752, "step": 786 }, { "epoch": 0.35602804795295184, "grad_norm": 0.5246376395225525, "learning_rate": 9.999174450427078e-06, "loss": 0.6665, "step": 787 }, { "epoch": 0.35648043429088444, "grad_norm": 0.5457838773727417, "learning_rate": 9.999161081794459e-06, "loss": 0.7243, "step": 788 }, { "epoch": 0.356932820628817, "grad_norm": 0.589130699634552, "learning_rate": 9.999147605795325e-06, "loss": 0.7129, "step": 789 }, { "epoch": 0.3573852069667496, "grad_norm": 0.6162753701210022, "learning_rate": 9.999134022429965e-06, "loss": 0.7335, "step": 790 }, { "epoch": 0.3578375933046822, "grad_norm": 0.5804448127746582, "learning_rate": 9.999120331698668e-06, "loss": 0.7091, "step": 791 }, { "epoch": 0.3582899796426148, "grad_norm": 0.6722463369369507, "learning_rate": 9.999106533601733e-06, "loss": 0.7306, "step": 792 }, { "epoch": 0.3587423659805474, "grad_norm": 0.7063418030738831, "learning_rate": 9.999092628139453e-06, "loss": 0.8223, "step": 793 }, { "epoch": 0.35919475231847997, "grad_norm": 0.6791219115257263, "learning_rate": 9.999078615312128e-06, "loss": 0.7469, "step": 794 }, { "epoch": 0.35964713865641257, "grad_norm": 0.6480264663696289, "learning_rate": 9.999064495120057e-06, "loss": 0.7233, "step": 795 }, { "epoch": 0.36009952499434517, "grad_norm": 0.7460899353027344, "learning_rate": 9.999050267563546e-06, "loss": 0.8756, "step": 796 }, { "epoch": 0.36055191133227776, "grad_norm": 0.7969278693199158, "learning_rate": 9.999035932642899e-06, "loss": 0.7362, "step": 797 }, { "epoch": 0.36100429767021036, "grad_norm": 0.8366296291351318, "learning_rate": 9.999021490358423e-06, "loss": 0.785, "step": 798 }, { "epoch": 0.36145668400814296, "grad_norm": 0.8192808032035828, "learning_rate": 9.99900694071043e-06, "loss": 0.7834, "step": 799 }, { "epoch": 0.36190907034607556, "grad_norm": 0.983851969242096, "learning_rate": 9.998992283699234e-06, "loss": 0.8947, "step": 800 }, { "epoch": 0.36190907034607556, "eval_loss": 0.7742355465888977, "eval_runtime": 26.1218, "eval_samples_per_second": 28.482, "eval_steps_per_second": 7.121, "step": 800 }, { "epoch": 0.36236145668400815, "grad_norm": 0.13013441860675812, "learning_rate": 9.998977519325144e-06, "loss": 1.2388, "step": 801 }, { "epoch": 0.36281384302194075, "grad_norm": 0.19337411224842072, "learning_rate": 9.998962647588482e-06, "loss": 0.5692, "step": 802 }, { "epoch": 0.36326622935987335, "grad_norm": 0.25508856773376465, "learning_rate": 9.998947668489566e-06, "loss": 0.7588, "step": 803 }, { "epoch": 0.36371861569780595, "grad_norm": 0.2292693704366684, "learning_rate": 9.99893258202872e-06, "loss": 0.6008, "step": 804 }, { "epoch": 0.36417100203573854, "grad_norm": 0.29470765590667725, "learning_rate": 9.998917388206264e-06, "loss": 0.7276, "step": 805 }, { "epoch": 0.36462338837367114, "grad_norm": 0.34385445713996887, "learning_rate": 9.998902087022526e-06, "loss": 0.9056, "step": 806 }, { "epoch": 0.3650757747116037, "grad_norm": 0.34386369585990906, "learning_rate": 9.998886678477834e-06, "loss": 0.8025, "step": 807 }, { "epoch": 0.3655281610495363, "grad_norm": 0.3392086923122406, "learning_rate": 9.99887116257252e-06, "loss": 0.6399, "step": 808 }, { "epoch": 0.3659805473874689, "grad_norm": 0.35581228137016296, "learning_rate": 9.998855539306918e-06, "loss": 0.6493, "step": 809 }, { "epoch": 0.3664329337254015, "grad_norm": 0.38693687319755554, "learning_rate": 9.99883980868136e-06, "loss": 0.6839, "step": 810 }, { "epoch": 0.3668853200633341, "grad_norm": 0.3500588536262512, "learning_rate": 9.99882397069619e-06, "loss": 0.8086, "step": 811 }, { "epoch": 0.3673377064012667, "grad_norm": 0.4361667335033417, "learning_rate": 9.99880802535174e-06, "loss": 0.7622, "step": 812 }, { "epoch": 0.36779009273919927, "grad_norm": 0.44680923223495483, "learning_rate": 9.99879197264836e-06, "loss": 0.7108, "step": 813 }, { "epoch": 0.36824247907713187, "grad_norm": 0.47540047764778137, "learning_rate": 9.99877581258639e-06, "loss": 0.8171, "step": 814 }, { "epoch": 0.36869486541506447, "grad_norm": 0.46863293647766113, "learning_rate": 9.99875954516618e-06, "loss": 0.8265, "step": 815 }, { "epoch": 0.36914725175299706, "grad_norm": 0.4838193356990814, "learning_rate": 9.998743170388078e-06, "loss": 0.6191, "step": 816 }, { "epoch": 0.36959963809092966, "grad_norm": 0.5180812478065491, "learning_rate": 9.998726688252434e-06, "loss": 0.7903, "step": 817 }, { "epoch": 0.37005202442886226, "grad_norm": 0.551409125328064, "learning_rate": 9.998710098759605e-06, "loss": 0.6051, "step": 818 }, { "epoch": 0.37050441076679486, "grad_norm": 0.5312463045120239, "learning_rate": 9.998693401909946e-06, "loss": 0.746, "step": 819 }, { "epoch": 0.37095679710472745, "grad_norm": 0.5794820785522461, "learning_rate": 9.998676597703816e-06, "loss": 0.6557, "step": 820 }, { "epoch": 0.37140918344266005, "grad_norm": 0.6112874150276184, "learning_rate": 9.998659686141576e-06, "loss": 0.8452, "step": 821 }, { "epoch": 0.37186156978059265, "grad_norm": 0.6000187993049622, "learning_rate": 9.998642667223588e-06, "loss": 0.5744, "step": 822 }, { "epoch": 0.37231395611852525, "grad_norm": 0.6610323786735535, "learning_rate": 9.998625540950217e-06, "loss": 0.7419, "step": 823 }, { "epoch": 0.3727663424564578, "grad_norm": 0.6076019406318665, "learning_rate": 9.998608307321834e-06, "loss": 0.7514, "step": 824 }, { "epoch": 0.3732187287943904, "grad_norm": 0.7043147087097168, "learning_rate": 9.998590966338805e-06, "loss": 0.6237, "step": 825 }, { "epoch": 0.373671115132323, "grad_norm": 0.7443617582321167, "learning_rate": 9.998573518001507e-06, "loss": 0.7521, "step": 826 }, { "epoch": 0.3741235014702556, "grad_norm": 0.765911877155304, "learning_rate": 9.998555962310313e-06, "loss": 0.7085, "step": 827 }, { "epoch": 0.3745758878081882, "grad_norm": 0.7273766398429871, "learning_rate": 9.998538299265597e-06, "loss": 0.7624, "step": 828 }, { "epoch": 0.3750282741461208, "grad_norm": 0.7913145422935486, "learning_rate": 9.99852052886774e-06, "loss": 0.6045, "step": 829 }, { "epoch": 0.3754806604840534, "grad_norm": 0.9015067219734192, "learning_rate": 9.998502651117126e-06, "loss": 0.661, "step": 830 }, { "epoch": 0.37593304682198597, "grad_norm": 0.8965393304824829, "learning_rate": 9.998484666014138e-06, "loss": 0.6534, "step": 831 }, { "epoch": 0.37638543315991857, "grad_norm": 0.8986989855766296, "learning_rate": 9.998466573559161e-06, "loss": 0.7216, "step": 832 }, { "epoch": 0.37683781949785117, "grad_norm": 0.9951068758964539, "learning_rate": 9.998448373752585e-06, "loss": 0.7278, "step": 833 }, { "epoch": 0.37729020583578377, "grad_norm": 0.9988188743591309, "learning_rate": 9.998430066594799e-06, "loss": 0.7339, "step": 834 }, { "epoch": 0.37774259217371636, "grad_norm": 1.0818469524383545, "learning_rate": 9.998411652086198e-06, "loss": 0.6333, "step": 835 }, { "epoch": 0.37819497851164896, "grad_norm": 1.0937849283218384, "learning_rate": 9.998393130227175e-06, "loss": 0.7642, "step": 836 }, { "epoch": 0.37864736484958156, "grad_norm": 0.9533869624137878, "learning_rate": 9.998374501018131e-06, "loss": 0.6959, "step": 837 }, { "epoch": 0.37909975118751416, "grad_norm": 1.1686996221542358, "learning_rate": 9.998355764459464e-06, "loss": 0.6665, "step": 838 }, { "epoch": 0.37955213752544675, "grad_norm": 1.1972026824951172, "learning_rate": 9.998336920551577e-06, "loss": 0.7192, "step": 839 }, { "epoch": 0.38000452386337935, "grad_norm": 1.231026291847229, "learning_rate": 9.998317969294876e-06, "loss": 0.688, "step": 840 }, { "epoch": 0.3804569102013119, "grad_norm": 1.2595820426940918, "learning_rate": 9.998298910689765e-06, "loss": 0.6807, "step": 841 }, { "epoch": 0.3809092965392445, "grad_norm": 1.2780810594558716, "learning_rate": 9.998279744736656e-06, "loss": 0.6943, "step": 842 }, { "epoch": 0.3813616828771771, "grad_norm": 1.409178376197815, "learning_rate": 9.99826047143596e-06, "loss": 0.7914, "step": 843 }, { "epoch": 0.3818140692151097, "grad_norm": 1.4011203050613403, "learning_rate": 9.99824109078809e-06, "loss": 0.7108, "step": 844 }, { "epoch": 0.3822664555530423, "grad_norm": 1.5047789812088013, "learning_rate": 9.998221602793463e-06, "loss": 0.6215, "step": 845 }, { "epoch": 0.3827188418909749, "grad_norm": 1.3803538084030151, "learning_rate": 9.998202007452498e-06, "loss": 0.7614, "step": 846 }, { "epoch": 0.3831712282289075, "grad_norm": 1.6630998849868774, "learning_rate": 9.998182304765615e-06, "loss": 0.6914, "step": 847 }, { "epoch": 0.3836236145668401, "grad_norm": 1.7691935300827026, "learning_rate": 9.998162494733237e-06, "loss": 0.7491, "step": 848 }, { "epoch": 0.3840760009047727, "grad_norm": 1.8621351718902588, "learning_rate": 9.99814257735579e-06, "loss": 0.8306, "step": 849 }, { "epoch": 0.38452838724270527, "grad_norm": 1.7085164785385132, "learning_rate": 9.998122552633702e-06, "loss": 0.9202, "step": 850 }, { "epoch": 0.38498077358063787, "grad_norm": 0.13192981481552124, "learning_rate": 9.998102420567402e-06, "loss": 1.2771, "step": 851 }, { "epoch": 0.38543315991857047, "grad_norm": 0.2560006082057953, "learning_rate": 9.998082181157324e-06, "loss": 0.6754, "step": 852 }, { "epoch": 0.38588554625650306, "grad_norm": 0.3633415699005127, "learning_rate": 9.998061834403901e-06, "loss": 0.8114, "step": 853 }, { "epoch": 0.38633793259443566, "grad_norm": 0.360291063785553, "learning_rate": 9.99804138030757e-06, "loss": 0.594, "step": 854 }, { "epoch": 0.38679031893236826, "grad_norm": 0.3642880618572235, "learning_rate": 9.998020818868773e-06, "loss": 0.8051, "step": 855 }, { "epoch": 0.38724270527030086, "grad_norm": 0.46378961205482483, "learning_rate": 9.998000150087948e-06, "loss": 0.6257, "step": 856 }, { "epoch": 0.38769509160823346, "grad_norm": 0.4210261404514313, "learning_rate": 9.997979373965542e-06, "loss": 0.7118, "step": 857 }, { "epoch": 0.38814747794616605, "grad_norm": 0.44471144676208496, "learning_rate": 9.997958490501997e-06, "loss": 0.743, "step": 858 }, { "epoch": 0.3885998642840986, "grad_norm": 0.47717052698135376, "learning_rate": 9.997937499697767e-06, "loss": 0.7143, "step": 859 }, { "epoch": 0.3890522506220312, "grad_norm": 0.4471987187862396, "learning_rate": 9.9979164015533e-06, "loss": 0.609, "step": 860 }, { "epoch": 0.3895046369599638, "grad_norm": 0.3795545697212219, "learning_rate": 9.997895196069048e-06, "loss": 0.7009, "step": 861 }, { "epoch": 0.3899570232978964, "grad_norm": 0.3128439784049988, "learning_rate": 9.997873883245468e-06, "loss": 0.721, "step": 862 }, { "epoch": 0.390409409635829, "grad_norm": 0.34423184394836426, "learning_rate": 9.997852463083017e-06, "loss": 0.672, "step": 863 }, { "epoch": 0.3908617959737616, "grad_norm": 0.34176871180534363, "learning_rate": 9.997830935582156e-06, "loss": 0.5422, "step": 864 }, { "epoch": 0.3913141823116942, "grad_norm": 0.2432747483253479, "learning_rate": 9.997809300743346e-06, "loss": 0.7503, "step": 865 }, { "epoch": 0.3917665686496268, "grad_norm": 0.23622766137123108, "learning_rate": 9.997787558567053e-06, "loss": 0.7078, "step": 866 }, { "epoch": 0.3922189549875594, "grad_norm": 0.20296727120876312, "learning_rate": 9.997765709053744e-06, "loss": 0.6364, "step": 867 }, { "epoch": 0.392671341325492, "grad_norm": 0.2116369754076004, "learning_rate": 9.997743752203887e-06, "loss": 0.7115, "step": 868 }, { "epoch": 0.39312372766342457, "grad_norm": 0.23516353964805603, "learning_rate": 9.997721688017954e-06, "loss": 0.605, "step": 869 }, { "epoch": 0.39357611400135717, "grad_norm": 0.2763071358203888, "learning_rate": 9.997699516496418e-06, "loss": 0.6597, "step": 870 }, { "epoch": 0.39402850033928977, "grad_norm": 0.34593960642814636, "learning_rate": 9.997677237639758e-06, "loss": 0.6354, "step": 871 }, { "epoch": 0.39448088667722236, "grad_norm": 0.4132538437843323, "learning_rate": 9.997654851448449e-06, "loss": 0.6899, "step": 872 }, { "epoch": 0.39493327301515496, "grad_norm": 0.2890077233314514, "learning_rate": 9.997632357922976e-06, "loss": 0.7491, "step": 873 }, { "epoch": 0.39538565935308756, "grad_norm": 0.2518078088760376, "learning_rate": 9.997609757063818e-06, "loss": 0.5803, "step": 874 }, { "epoch": 0.39583804569102016, "grad_norm": 0.24551144242286682, "learning_rate": 9.99758704887146e-06, "loss": 0.652, "step": 875 }, { "epoch": 0.3962904320289527, "grad_norm": 0.32785192131996155, "learning_rate": 9.997564233346393e-06, "loss": 0.746, "step": 876 }, { "epoch": 0.3967428183668853, "grad_norm": 0.3420783281326294, "learning_rate": 9.997541310489108e-06, "loss": 0.7272, "step": 877 }, { "epoch": 0.3971952047048179, "grad_norm": 0.29900625348091125, "learning_rate": 9.997518280300092e-06, "loss": 0.774, "step": 878 }, { "epoch": 0.3976475910427505, "grad_norm": 0.298814594745636, "learning_rate": 9.997495142779845e-06, "loss": 0.6562, "step": 879 }, { "epoch": 0.3980999773806831, "grad_norm": 0.3153093755245209, "learning_rate": 9.99747189792886e-06, "loss": 0.7288, "step": 880 }, { "epoch": 0.3985523637186157, "grad_norm": 0.26564592123031616, "learning_rate": 9.997448545747637e-06, "loss": 0.6065, "step": 881 }, { "epoch": 0.3990047500565483, "grad_norm": 0.27995941042900085, "learning_rate": 9.99742508623668e-06, "loss": 0.5748, "step": 882 }, { "epoch": 0.3994571363944809, "grad_norm": 0.3446321487426758, "learning_rate": 9.99740151939649e-06, "loss": 0.6157, "step": 883 }, { "epoch": 0.3999095227324135, "grad_norm": 0.28262555599212646, "learning_rate": 9.997377845227577e-06, "loss": 0.6416, "step": 884 }, { "epoch": 0.4003619090703461, "grad_norm": 0.3300037384033203, "learning_rate": 9.997354063730442e-06, "loss": 0.6482, "step": 885 }, { "epoch": 0.4008142954082787, "grad_norm": 0.3528444766998291, "learning_rate": 9.997330174905604e-06, "loss": 0.585, "step": 886 }, { "epoch": 0.4012666817462113, "grad_norm": 0.4055725336074829, "learning_rate": 9.997306178753572e-06, "loss": 0.5812, "step": 887 }, { "epoch": 0.40171906808414387, "grad_norm": 0.4308398365974426, "learning_rate": 9.997282075274862e-06, "loss": 0.771, "step": 888 }, { "epoch": 0.40217145442207647, "grad_norm": 0.4179864525794983, "learning_rate": 9.997257864469989e-06, "loss": 0.7458, "step": 889 }, { "epoch": 0.40262384076000907, "grad_norm": 0.3686334490776062, "learning_rate": 9.997233546339478e-06, "loss": 0.5988, "step": 890 }, { "epoch": 0.40307622709794166, "grad_norm": 0.479705274105072, "learning_rate": 9.997209120883849e-06, "loss": 0.7285, "step": 891 }, { "epoch": 0.40352861343587426, "grad_norm": 0.3767562806606293, "learning_rate": 9.997184588103624e-06, "loss": 0.5294, "step": 892 }, { "epoch": 0.4039809997738068, "grad_norm": 0.4933464229106903, "learning_rate": 9.997159947999335e-06, "loss": 0.7391, "step": 893 }, { "epoch": 0.4044333861117394, "grad_norm": 0.47407832741737366, "learning_rate": 9.997135200571507e-06, "loss": 0.6172, "step": 894 }, { "epoch": 0.404885772449672, "grad_norm": 0.5224399566650391, "learning_rate": 9.997110345820673e-06, "loss": 0.8225, "step": 895 }, { "epoch": 0.4053381587876046, "grad_norm": 0.587566614151001, "learning_rate": 9.997085383747367e-06, "loss": 0.8019, "step": 896 }, { "epoch": 0.4057905451255372, "grad_norm": 0.4114418625831604, "learning_rate": 9.997060314352125e-06, "loss": 0.7366, "step": 897 }, { "epoch": 0.4062429314634698, "grad_norm": 0.516924262046814, "learning_rate": 9.997035137635485e-06, "loss": 0.7288, "step": 898 }, { "epoch": 0.4066953178014024, "grad_norm": 0.5543747544288635, "learning_rate": 9.997009853597989e-06, "loss": 0.6309, "step": 899 }, { "epoch": 0.407147704139335, "grad_norm": 0.757258951663971, "learning_rate": 9.996984462240178e-06, "loss": 0.6985, "step": 900 }, { "epoch": 0.4076000904772676, "grad_norm": 0.125876322388649, "learning_rate": 9.996958963562598e-06, "loss": 1.1391, "step": 901 }, { "epoch": 0.4080524768152002, "grad_norm": 0.13580387830734253, "learning_rate": 9.996933357565798e-06, "loss": 0.7046, "step": 902 }, { "epoch": 0.4085048631531328, "grad_norm": 0.17519347369670868, "learning_rate": 9.996907644250327e-06, "loss": 0.7503, "step": 903 }, { "epoch": 0.4089572494910654, "grad_norm": 0.20265667140483856, "learning_rate": 9.996881823616737e-06, "loss": 0.6743, "step": 904 }, { "epoch": 0.409409635828998, "grad_norm": 0.19673261046409607, "learning_rate": 9.996855895665582e-06, "loss": 0.6046, "step": 905 }, { "epoch": 0.4098620221669306, "grad_norm": 0.1838562786579132, "learning_rate": 9.99682986039742e-06, "loss": 0.5973, "step": 906 }, { "epoch": 0.41031440850486317, "grad_norm": 0.22760982811450958, "learning_rate": 9.99680371781281e-06, "loss": 0.5705, "step": 907 }, { "epoch": 0.41076679484279577, "grad_norm": 0.2246081680059433, "learning_rate": 9.996777467912312e-06, "loss": 0.6983, "step": 908 }, { "epoch": 0.41121918118072837, "grad_norm": 0.1930236667394638, "learning_rate": 9.996751110696493e-06, "loss": 0.6354, "step": 909 }, { "epoch": 0.41167156751866096, "grad_norm": 0.23532505333423615, "learning_rate": 9.996724646165915e-06, "loss": 0.6615, "step": 910 }, { "epoch": 0.4121239538565935, "grad_norm": 0.22764374315738678, "learning_rate": 9.996698074321149e-06, "loss": 0.587, "step": 911 }, { "epoch": 0.4125763401945261, "grad_norm": 0.27815091609954834, "learning_rate": 9.996671395162765e-06, "loss": 0.7372, "step": 912 }, { "epoch": 0.4130287265324587, "grad_norm": 0.2523258924484253, "learning_rate": 9.996644608691337e-06, "loss": 0.7012, "step": 913 }, { "epoch": 0.4134811128703913, "grad_norm": 0.27416548132896423, "learning_rate": 9.996617714907439e-06, "loss": 0.662, "step": 914 }, { "epoch": 0.4139334992083239, "grad_norm": 0.2865001857280731, "learning_rate": 9.996590713811649e-06, "loss": 0.7046, "step": 915 }, { "epoch": 0.4143858855462565, "grad_norm": 0.25491487979888916, "learning_rate": 9.996563605404548e-06, "loss": 0.5397, "step": 916 }, { "epoch": 0.4148382718841891, "grad_norm": 0.2668300271034241, "learning_rate": 9.996536389686715e-06, "loss": 0.6776, "step": 917 }, { "epoch": 0.4152906582221217, "grad_norm": 0.27927693724632263, "learning_rate": 9.996509066658738e-06, "loss": 0.6771, "step": 918 }, { "epoch": 0.4157430445600543, "grad_norm": 0.3100537657737732, "learning_rate": 9.9964816363212e-06, "loss": 0.8201, "step": 919 }, { "epoch": 0.4161954308979869, "grad_norm": 0.30100172758102417, "learning_rate": 9.996454098674696e-06, "loss": 0.7637, "step": 920 }, { "epoch": 0.4166478172359195, "grad_norm": 0.3118777871131897, "learning_rate": 9.996426453719813e-06, "loss": 0.709, "step": 921 }, { "epoch": 0.4171002035738521, "grad_norm": 0.4093967378139496, "learning_rate": 9.996398701457143e-06, "loss": 0.6453, "step": 922 }, { "epoch": 0.4175525899117847, "grad_norm": 0.33057838678359985, "learning_rate": 9.996370841887286e-06, "loss": 0.6195, "step": 923 }, { "epoch": 0.4180049762497173, "grad_norm": 0.40696442127227783, "learning_rate": 9.996342875010841e-06, "loss": 0.7238, "step": 924 }, { "epoch": 0.4184573625876499, "grad_norm": 0.29117631912231445, "learning_rate": 9.996314800828404e-06, "loss": 0.6226, "step": 925 }, { "epoch": 0.41890974892558247, "grad_norm": 0.3346465528011322, "learning_rate": 9.996286619340582e-06, "loss": 0.5805, "step": 926 }, { "epoch": 0.41936213526351507, "grad_norm": 0.28853872418403625, "learning_rate": 9.996258330547978e-06, "loss": 0.72, "step": 927 }, { "epoch": 0.4198145216014476, "grad_norm": 0.33722496032714844, "learning_rate": 9.996229934451203e-06, "loss": 0.5137, "step": 928 }, { "epoch": 0.4202669079393802, "grad_norm": 0.35148394107818604, "learning_rate": 9.99620143105086e-06, "loss": 0.6324, "step": 929 }, { "epoch": 0.4207192942773128, "grad_norm": 0.5020065307617188, "learning_rate": 9.99617282034757e-06, "loss": 0.6133, "step": 930 }, { "epoch": 0.4211716806152454, "grad_norm": 0.47319039702415466, "learning_rate": 9.99614410234194e-06, "loss": 0.5298, "step": 931 }, { "epoch": 0.421624066953178, "grad_norm": 0.4106064736843109, "learning_rate": 9.99611527703459e-06, "loss": 0.6371, "step": 932 }, { "epoch": 0.4220764532911106, "grad_norm": 0.46149110794067383, "learning_rate": 9.996086344426138e-06, "loss": 0.5933, "step": 933 }, { "epoch": 0.4225288396290432, "grad_norm": 0.5579364895820618, "learning_rate": 9.996057304517208e-06, "loss": 0.6007, "step": 934 }, { "epoch": 0.4229812259669758, "grad_norm": 0.507917582988739, "learning_rate": 9.996028157308422e-06, "loss": 0.6643, "step": 935 }, { "epoch": 0.4234336123049084, "grad_norm": 0.5975815653800964, "learning_rate": 9.995998902800406e-06, "loss": 0.6482, "step": 936 }, { "epoch": 0.423885998642841, "grad_norm": 0.5338273644447327, "learning_rate": 9.995969540993786e-06, "loss": 0.5253, "step": 937 }, { "epoch": 0.4243383849807736, "grad_norm": 0.6960197687149048, "learning_rate": 9.995940071889197e-06, "loss": 0.4637, "step": 938 }, { "epoch": 0.4247907713187062, "grad_norm": 0.5651724934577942, "learning_rate": 9.995910495487268e-06, "loss": 0.6454, "step": 939 }, { "epoch": 0.4252431576566388, "grad_norm": 0.536768913269043, "learning_rate": 9.995880811788637e-06, "loss": 0.5224, "step": 940 }, { "epoch": 0.4256955439945714, "grad_norm": 0.9094938635826111, "learning_rate": 9.995851020793938e-06, "loss": 0.5656, "step": 941 }, { "epoch": 0.426147930332504, "grad_norm": 0.5761896967887878, "learning_rate": 9.995821122503816e-06, "loss": 0.6299, "step": 942 }, { "epoch": 0.4266003166704366, "grad_norm": 0.6151994466781616, "learning_rate": 9.99579111691891e-06, "loss": 0.5914, "step": 943 }, { "epoch": 0.42705270300836917, "grad_norm": 0.7778657078742981, "learning_rate": 9.995761004039863e-06, "loss": 0.5897, "step": 944 }, { "epoch": 0.4275050893463017, "grad_norm": 0.6291765570640564, "learning_rate": 9.995730783867324e-06, "loss": 0.5983, "step": 945 }, { "epoch": 0.4279574756842343, "grad_norm": 0.7999734878540039, "learning_rate": 9.995700456401942e-06, "loss": 0.6747, "step": 946 }, { "epoch": 0.4284098620221669, "grad_norm": 0.6723253726959229, "learning_rate": 9.995670021644368e-06, "loss": 0.576, "step": 947 }, { "epoch": 0.4288622483600995, "grad_norm": 0.6459294557571411, "learning_rate": 9.995639479595255e-06, "loss": 0.5891, "step": 948 }, { "epoch": 0.4293146346980321, "grad_norm": 0.5712341070175171, "learning_rate": 9.995608830255259e-06, "loss": 0.7133, "step": 949 }, { "epoch": 0.4297670210359647, "grad_norm": 0.8806389570236206, "learning_rate": 9.99557807362504e-06, "loss": 0.5761, "step": 950 }, { "epoch": 0.4302194073738973, "grad_norm": 0.11210645735263824, "learning_rate": 9.995547209705256e-06, "loss": 1.2161, "step": 951 }, { "epoch": 0.4306717937118299, "grad_norm": 0.17321506142616272, "learning_rate": 9.995516238496571e-06, "loss": 0.7316, "step": 952 }, { "epoch": 0.4311241800497625, "grad_norm": 0.1984458714723587, "learning_rate": 9.995485159999652e-06, "loss": 0.6837, "step": 953 }, { "epoch": 0.4315765663876951, "grad_norm": 0.17609800398349762, "learning_rate": 9.995453974215164e-06, "loss": 0.5556, "step": 954 }, { "epoch": 0.4320289527256277, "grad_norm": 0.19833900034427643, "learning_rate": 9.995422681143777e-06, "loss": 0.6142, "step": 955 }, { "epoch": 0.4324813390635603, "grad_norm": 0.19767452776432037, "learning_rate": 9.995391280786165e-06, "loss": 0.5653, "step": 956 }, { "epoch": 0.4329337254014929, "grad_norm": 0.2051895409822464, "learning_rate": 9.995359773143e-06, "loss": 0.7005, "step": 957 }, { "epoch": 0.4333861117394255, "grad_norm": 0.26143133640289307, "learning_rate": 9.995328158214959e-06, "loss": 0.6867, "step": 958 }, { "epoch": 0.4338384980773581, "grad_norm": 0.2071329951286316, "learning_rate": 9.995296436002722e-06, "loss": 0.7212, "step": 959 }, { "epoch": 0.4342908844152907, "grad_norm": 0.21138189733028412, "learning_rate": 9.99526460650697e-06, "loss": 0.7542, "step": 960 }, { "epoch": 0.4347432707532233, "grad_norm": 0.2084711790084839, "learning_rate": 9.995232669728385e-06, "loss": 0.5863, "step": 961 }, { "epoch": 0.4351956570911559, "grad_norm": 0.188959002494812, "learning_rate": 9.995200625667656e-06, "loss": 0.6639, "step": 962 }, { "epoch": 0.4356480434290884, "grad_norm": 0.2180580049753189, "learning_rate": 9.99516847432547e-06, "loss": 0.7377, "step": 963 }, { "epoch": 0.436100429767021, "grad_norm": 0.23485523462295532, "learning_rate": 9.995136215702517e-06, "loss": 0.7039, "step": 964 }, { "epoch": 0.4365528161049536, "grad_norm": 0.22583754360675812, "learning_rate": 9.99510384979949e-06, "loss": 0.6207, "step": 965 }, { "epoch": 0.4370052024428862, "grad_norm": 0.24633079767227173, "learning_rate": 9.995071376617082e-06, "loss": 0.632, "step": 966 }, { "epoch": 0.4374575887808188, "grad_norm": 0.26217779517173767, "learning_rate": 9.995038796155995e-06, "loss": 0.7087, "step": 967 }, { "epoch": 0.4379099751187514, "grad_norm": 0.22770282626152039, "learning_rate": 9.995006108416925e-06, "loss": 0.6086, "step": 968 }, { "epoch": 0.438362361456684, "grad_norm": 0.23341652750968933, "learning_rate": 9.994973313400577e-06, "loss": 0.7029, "step": 969 }, { "epoch": 0.4388147477946166, "grad_norm": 0.22885240614414215, "learning_rate": 9.994940411107652e-06, "loss": 0.5795, "step": 970 }, { "epoch": 0.4392671341325492, "grad_norm": 0.23843036592006683, "learning_rate": 9.99490740153886e-06, "loss": 0.6408, "step": 971 }, { "epoch": 0.4397195204704818, "grad_norm": 0.20882362127304077, "learning_rate": 9.994874284694906e-06, "loss": 0.5492, "step": 972 }, { "epoch": 0.4401719068084144, "grad_norm": 0.23994167149066925, "learning_rate": 9.994841060576505e-06, "loss": 0.6709, "step": 973 }, { "epoch": 0.440624293146347, "grad_norm": 0.2526256740093231, "learning_rate": 9.994807729184368e-06, "loss": 0.7834, "step": 974 }, { "epoch": 0.4410766794842796, "grad_norm": 0.24630270898342133, "learning_rate": 9.994774290519213e-06, "loss": 0.5928, "step": 975 }, { "epoch": 0.4415290658222122, "grad_norm": 0.2053951770067215, "learning_rate": 9.994740744581756e-06, "loss": 0.5174, "step": 976 }, { "epoch": 0.4419814521601448, "grad_norm": 0.2738940715789795, "learning_rate": 9.994707091372719e-06, "loss": 0.7196, "step": 977 }, { "epoch": 0.4424338384980774, "grad_norm": 0.24516436457633972, "learning_rate": 9.994673330892823e-06, "loss": 0.6927, "step": 978 }, { "epoch": 0.44288622483601, "grad_norm": 0.24486610293388367, "learning_rate": 9.994639463142795e-06, "loss": 0.5502, "step": 979 }, { "epoch": 0.4433386111739425, "grad_norm": 0.2464614063501358, "learning_rate": 9.994605488123364e-06, "loss": 0.5854, "step": 980 }, { "epoch": 0.4437909975118751, "grad_norm": 0.2500174045562744, "learning_rate": 9.994571405835254e-06, "loss": 0.6128, "step": 981 }, { "epoch": 0.4442433838498077, "grad_norm": 0.268976628780365, "learning_rate": 9.994537216279202e-06, "loss": 0.6515, "step": 982 }, { "epoch": 0.4446957701877403, "grad_norm": 0.2567249536514282, "learning_rate": 9.994502919455939e-06, "loss": 0.5639, "step": 983 }, { "epoch": 0.4451481565256729, "grad_norm": 0.2695481479167938, "learning_rate": 9.994468515366205e-06, "loss": 0.606, "step": 984 }, { "epoch": 0.4456005428636055, "grad_norm": 0.31799185276031494, "learning_rate": 9.994434004010738e-06, "loss": 0.6385, "step": 985 }, { "epoch": 0.4460529292015381, "grad_norm": 0.2258293181657791, "learning_rate": 9.994399385390277e-06, "loss": 0.5511, "step": 986 }, { "epoch": 0.4465053155394707, "grad_norm": 0.2498103380203247, "learning_rate": 9.994364659505567e-06, "loss": 0.5415, "step": 987 }, { "epoch": 0.4469577018774033, "grad_norm": 0.297339528799057, "learning_rate": 9.994329826357352e-06, "loss": 0.6116, "step": 988 }, { "epoch": 0.4474100882153359, "grad_norm": 0.31603774428367615, "learning_rate": 9.994294885946383e-06, "loss": 0.6862, "step": 989 }, { "epoch": 0.4478624745532685, "grad_norm": 0.2874099314212799, "learning_rate": 9.99425983827341e-06, "loss": 0.506, "step": 990 }, { "epoch": 0.4483148608912011, "grad_norm": 0.35252365469932556, "learning_rate": 9.994224683339184e-06, "loss": 0.561, "step": 991 }, { "epoch": 0.4487672472291337, "grad_norm": 0.277815580368042, "learning_rate": 9.994189421144461e-06, "loss": 0.5883, "step": 992 }, { "epoch": 0.4492196335670663, "grad_norm": 0.307903528213501, "learning_rate": 9.994154051689998e-06, "loss": 0.6699, "step": 993 }, { "epoch": 0.4496720199049989, "grad_norm": 0.3205600678920746, "learning_rate": 9.994118574976555e-06, "loss": 0.5366, "step": 994 }, { "epoch": 0.4501244062429315, "grad_norm": 0.278727263212204, "learning_rate": 9.994082991004895e-06, "loss": 0.581, "step": 995 }, { "epoch": 0.4505767925808641, "grad_norm": 0.30150890350341797, "learning_rate": 9.99404729977578e-06, "loss": 0.5818, "step": 996 }, { "epoch": 0.4510291789187966, "grad_norm": 0.3260955512523651, "learning_rate": 9.994011501289976e-06, "loss": 0.6278, "step": 997 }, { "epoch": 0.4514815652567292, "grad_norm": 0.3691314160823822, "learning_rate": 9.993975595548255e-06, "loss": 0.6096, "step": 998 }, { "epoch": 0.4519339515946618, "grad_norm": 0.3303999602794647, "learning_rate": 9.993939582551386e-06, "loss": 0.5642, "step": 999 }, { "epoch": 0.4523863379325944, "grad_norm": 0.451384037733078, "learning_rate": 9.993903462300143e-06, "loss": 0.5385, "step": 1000 }, { "epoch": 0.4523863379325944, "eval_loss": 0.6416351795196533, "eval_runtime": 25.7963, "eval_samples_per_second": 28.841, "eval_steps_per_second": 7.21, "step": 1000 }, { "epoch": 0.452838724270527, "grad_norm": 0.12390793859958649, "learning_rate": 9.993867234795303e-06, "loss": 1.1523, "step": 1001 }, { "epoch": 0.4532911106084596, "grad_norm": 0.16071940958499908, "learning_rate": 9.993830900037641e-06, "loss": 1.0301, "step": 1002 }, { "epoch": 0.4537434969463922, "grad_norm": 0.2027565836906433, "learning_rate": 9.99379445802794e-06, "loss": 0.8103, "step": 1003 }, { "epoch": 0.4541958832843248, "grad_norm": 0.17919661104679108, "learning_rate": 9.993757908766982e-06, "loss": 0.7314, "step": 1004 }, { "epoch": 0.4546482696222574, "grad_norm": 0.17646664381027222, "learning_rate": 9.99372125225555e-06, "loss": 0.6261, "step": 1005 }, { "epoch": 0.45510065596019, "grad_norm": 0.18716247379779816, "learning_rate": 9.993684488494434e-06, "loss": 0.6887, "step": 1006 }, { "epoch": 0.4555530422981226, "grad_norm": 0.2538403868675232, "learning_rate": 9.993647617484424e-06, "loss": 0.8436, "step": 1007 }, { "epoch": 0.4560054286360552, "grad_norm": 0.22923101484775543, "learning_rate": 9.993610639226308e-06, "loss": 0.6513, "step": 1008 }, { "epoch": 0.4564578149739878, "grad_norm": 0.19822657108306885, "learning_rate": 9.993573553720886e-06, "loss": 0.7019, "step": 1009 }, { "epoch": 0.4569102013119204, "grad_norm": 0.1967959702014923, "learning_rate": 9.993536360968947e-06, "loss": 0.5727, "step": 1010 }, { "epoch": 0.457362587649853, "grad_norm": 0.2090284824371338, "learning_rate": 9.993499060971297e-06, "loss": 0.7374, "step": 1011 }, { "epoch": 0.4578149739877856, "grad_norm": 0.20263229310512543, "learning_rate": 9.993461653728733e-06, "loss": 0.5099, "step": 1012 }, { "epoch": 0.4582673603257182, "grad_norm": 0.22486424446105957, "learning_rate": 9.99342413924206e-06, "loss": 0.7668, "step": 1013 }, { "epoch": 0.4587197466636508, "grad_norm": 0.22524793446063995, "learning_rate": 9.993386517512082e-06, "loss": 0.6845, "step": 1014 }, { "epoch": 0.4591721330015833, "grad_norm": 0.2178989052772522, "learning_rate": 9.99334878853961e-06, "loss": 0.6213, "step": 1015 }, { "epoch": 0.4596245193395159, "grad_norm": 0.19931374490261078, "learning_rate": 9.99331095232545e-06, "loss": 0.7603, "step": 1016 }, { "epoch": 0.4600769056774485, "grad_norm": 0.219748854637146, "learning_rate": 9.993273008870416e-06, "loss": 0.5478, "step": 1017 }, { "epoch": 0.4605292920153811, "grad_norm": 0.2186930924654007, "learning_rate": 9.993234958175328e-06, "loss": 0.6602, "step": 1018 }, { "epoch": 0.4609816783533137, "grad_norm": 0.22033274173736572, "learning_rate": 9.993196800240996e-06, "loss": 0.6328, "step": 1019 }, { "epoch": 0.4614340646912463, "grad_norm": 0.2136853188276291, "learning_rate": 9.993158535068244e-06, "loss": 0.4945, "step": 1020 }, { "epoch": 0.4618864510291789, "grad_norm": 0.24956578016281128, "learning_rate": 9.993120162657892e-06, "loss": 0.6132, "step": 1021 }, { "epoch": 0.4623388373671115, "grad_norm": 0.2258577197790146, "learning_rate": 9.993081683010764e-06, "loss": 0.5582, "step": 1022 }, { "epoch": 0.4627912237050441, "grad_norm": 0.246862530708313, "learning_rate": 9.993043096127689e-06, "loss": 0.688, "step": 1023 }, { "epoch": 0.4632436100429767, "grad_norm": 0.22965078055858612, "learning_rate": 9.993004402009492e-06, "loss": 0.5921, "step": 1024 }, { "epoch": 0.4636959963809093, "grad_norm": 0.22925209999084473, "learning_rate": 9.992965600657006e-06, "loss": 0.5848, "step": 1025 }, { "epoch": 0.4641483827188419, "grad_norm": 0.22133618593215942, "learning_rate": 9.992926692071063e-06, "loss": 0.6118, "step": 1026 }, { "epoch": 0.4646007690567745, "grad_norm": 0.23023590445518494, "learning_rate": 9.9928876762525e-06, "loss": 0.5534, "step": 1027 }, { "epoch": 0.4650531553947071, "grad_norm": 0.2664046585559845, "learning_rate": 9.992848553202158e-06, "loss": 0.7275, "step": 1028 }, { "epoch": 0.4655055417326397, "grad_norm": 0.2803621292114258, "learning_rate": 9.99280932292087e-06, "loss": 0.7461, "step": 1029 }, { "epoch": 0.4659579280705723, "grad_norm": 0.2601984441280365, "learning_rate": 9.992769985409485e-06, "loss": 0.6603, "step": 1030 }, { "epoch": 0.4664103144085049, "grad_norm": 0.2558339536190033, "learning_rate": 9.992730540668842e-06, "loss": 0.5911, "step": 1031 }, { "epoch": 0.46686270074643743, "grad_norm": 0.24211354553699493, "learning_rate": 9.992690988699795e-06, "loss": 0.521, "step": 1032 }, { "epoch": 0.46731508708437003, "grad_norm": 0.3303705155849457, "learning_rate": 9.992651329503188e-06, "loss": 0.5648, "step": 1033 }, { "epoch": 0.4677674734223026, "grad_norm": 0.23405995965003967, "learning_rate": 9.992611563079875e-06, "loss": 0.4578, "step": 1034 }, { "epoch": 0.4682198597602352, "grad_norm": 0.26902127265930176, "learning_rate": 9.99257168943071e-06, "loss": 0.6402, "step": 1035 }, { "epoch": 0.4686722460981678, "grad_norm": 0.257890522480011, "learning_rate": 9.992531708556548e-06, "loss": 0.522, "step": 1036 }, { "epoch": 0.4691246324361004, "grad_norm": 0.2967270314693451, "learning_rate": 9.992491620458251e-06, "loss": 0.5989, "step": 1037 }, { "epoch": 0.469577018774033, "grad_norm": 0.2902102768421173, "learning_rate": 9.992451425136675e-06, "loss": 0.5489, "step": 1038 }, { "epoch": 0.4700294051119656, "grad_norm": 0.24295422434806824, "learning_rate": 9.992411122592687e-06, "loss": 0.4999, "step": 1039 }, { "epoch": 0.4704817914498982, "grad_norm": 0.3223569691181183, "learning_rate": 9.992370712827154e-06, "loss": 0.598, "step": 1040 }, { "epoch": 0.4709341777878308, "grad_norm": 0.30674290657043457, "learning_rate": 9.992330195840937e-06, "loss": 0.7158, "step": 1041 }, { "epoch": 0.4713865641257634, "grad_norm": 0.3286416232585907, "learning_rate": 9.992289571634913e-06, "loss": 0.6731, "step": 1042 }, { "epoch": 0.471838950463696, "grad_norm": 0.2825571894645691, "learning_rate": 9.992248840209953e-06, "loss": 0.509, "step": 1043 }, { "epoch": 0.4722913368016286, "grad_norm": 0.4455055594444275, "learning_rate": 9.99220800156693e-06, "loss": 0.6523, "step": 1044 }, { "epoch": 0.4727437231395612, "grad_norm": 0.3441793918609619, "learning_rate": 9.992167055706721e-06, "loss": 0.5953, "step": 1045 }, { "epoch": 0.4731961094774938, "grad_norm": 0.3478612005710602, "learning_rate": 9.992126002630207e-06, "loss": 0.588, "step": 1046 }, { "epoch": 0.4736484958154264, "grad_norm": 0.3391546308994293, "learning_rate": 9.99208484233827e-06, "loss": 0.5653, "step": 1047 }, { "epoch": 0.474100882153359, "grad_norm": 0.3254278004169464, "learning_rate": 9.99204357483179e-06, "loss": 0.5153, "step": 1048 }, { "epoch": 0.47455326849129154, "grad_norm": 0.39896348118782043, "learning_rate": 9.99200220011166e-06, "loss": 0.5526, "step": 1049 }, { "epoch": 0.47500565482922413, "grad_norm": 0.4555913209915161, "learning_rate": 9.991960718178764e-06, "loss": 0.6366, "step": 1050 }, { "epoch": 0.47545804116715673, "grad_norm": 0.13157491385936737, "learning_rate": 9.991919129033994e-06, "loss": 1.2662, "step": 1051 }, { "epoch": 0.47591042750508933, "grad_norm": 0.21833932399749756, "learning_rate": 9.991877432678242e-06, "loss": 1.0015, "step": 1052 }, { "epoch": 0.4763628138430219, "grad_norm": 0.19154050946235657, "learning_rate": 9.991835629112404e-06, "loss": 0.8928, "step": 1053 }, { "epoch": 0.4768152001809545, "grad_norm": 0.14038436114788055, "learning_rate": 9.99179371833738e-06, "loss": 0.6292, "step": 1054 }, { "epoch": 0.4772675865188871, "grad_norm": 0.17707695066928864, "learning_rate": 9.991751700354067e-06, "loss": 0.7162, "step": 1055 }, { "epoch": 0.4777199728568197, "grad_norm": 0.17941850423812866, "learning_rate": 9.99170957516337e-06, "loss": 0.6061, "step": 1056 }, { "epoch": 0.4781723591947523, "grad_norm": 0.194647416472435, "learning_rate": 9.991667342766191e-06, "loss": 0.7144, "step": 1057 }, { "epoch": 0.4786247455326849, "grad_norm": 0.19079382717609406, "learning_rate": 9.99162500316344e-06, "loss": 0.7029, "step": 1058 }, { "epoch": 0.4790771318706175, "grad_norm": 0.19127556681632996, "learning_rate": 9.991582556356026e-06, "loss": 0.5746, "step": 1059 }, { "epoch": 0.4795295182085501, "grad_norm": 0.2043769508600235, "learning_rate": 9.991540002344858e-06, "loss": 0.7405, "step": 1060 }, { "epoch": 0.4799819045464827, "grad_norm": 0.18456710875034332, "learning_rate": 9.991497341130852e-06, "loss": 0.5625, "step": 1061 }, { "epoch": 0.4804342908844153, "grad_norm": 0.21234434843063354, "learning_rate": 9.991454572714924e-06, "loss": 0.7047, "step": 1062 }, { "epoch": 0.4808866772223479, "grad_norm": 0.20182600617408752, "learning_rate": 9.991411697097993e-06, "loss": 0.6611, "step": 1063 }, { "epoch": 0.4813390635602805, "grad_norm": 0.21404214203357697, "learning_rate": 9.991368714280976e-06, "loss": 0.6718, "step": 1064 }, { "epoch": 0.4817914498982131, "grad_norm": 0.22211967408657074, "learning_rate": 9.991325624264802e-06, "loss": 0.6897, "step": 1065 }, { "epoch": 0.4822438362361457, "grad_norm": 0.2010943442583084, "learning_rate": 9.991282427050393e-06, "loss": 0.6446, "step": 1066 }, { "epoch": 0.48269622257407824, "grad_norm": 0.23250938951969147, "learning_rate": 9.991239122638678e-06, "loss": 0.6455, "step": 1067 }, { "epoch": 0.48314860891201084, "grad_norm": 0.22624970972537994, "learning_rate": 9.991195711030586e-06, "loss": 0.7235, "step": 1068 }, { "epoch": 0.48360099524994343, "grad_norm": 0.20697760581970215, "learning_rate": 9.99115219222705e-06, "loss": 0.6613, "step": 1069 }, { "epoch": 0.48405338158787603, "grad_norm": 0.20406696200370789, "learning_rate": 9.991108566229005e-06, "loss": 0.6225, "step": 1070 }, { "epoch": 0.48450576792580863, "grad_norm": 0.18762198090553284, "learning_rate": 9.991064833037386e-06, "loss": 0.6149, "step": 1071 }, { "epoch": 0.4849581542637412, "grad_norm": 0.25115323066711426, "learning_rate": 9.991020992653135e-06, "loss": 0.7563, "step": 1072 }, { "epoch": 0.4854105406016738, "grad_norm": 0.20071615278720856, "learning_rate": 9.990977045077191e-06, "loss": 0.5944, "step": 1073 }, { "epoch": 0.4858629269396064, "grad_norm": 0.20694778859615326, "learning_rate": 9.9909329903105e-06, "loss": 0.6142, "step": 1074 }, { "epoch": 0.486315313277539, "grad_norm": 0.22496336698532104, "learning_rate": 9.990888828354006e-06, "loss": 0.7127, "step": 1075 }, { "epoch": 0.4867676996154716, "grad_norm": 0.24247518181800842, "learning_rate": 9.990844559208661e-06, "loss": 0.5852, "step": 1076 }, { "epoch": 0.4872200859534042, "grad_norm": 0.22086453437805176, "learning_rate": 9.990800182875412e-06, "loss": 0.593, "step": 1077 }, { "epoch": 0.4876724722913368, "grad_norm": 0.21583305299282074, "learning_rate": 9.990755699355214e-06, "loss": 0.5746, "step": 1078 }, { "epoch": 0.4881248586292694, "grad_norm": 0.2761453092098236, "learning_rate": 9.99071110864902e-06, "loss": 0.5945, "step": 1079 }, { "epoch": 0.488577244967202, "grad_norm": 0.25453704595565796, "learning_rate": 9.990666410757792e-06, "loss": 0.7108, "step": 1080 }, { "epoch": 0.4890296313051346, "grad_norm": 0.2365685999393463, "learning_rate": 9.990621605682488e-06, "loss": 0.4684, "step": 1081 }, { "epoch": 0.4894820176430672, "grad_norm": 0.26620298624038696, "learning_rate": 9.990576693424068e-06, "loss": 0.6262, "step": 1082 }, { "epoch": 0.4899344039809998, "grad_norm": 0.2557627260684967, "learning_rate": 9.9905316739835e-06, "loss": 0.6348, "step": 1083 }, { "epoch": 0.49038679031893234, "grad_norm": 0.24048881232738495, "learning_rate": 9.990486547361747e-06, "loss": 0.4924, "step": 1084 }, { "epoch": 0.49083917665686494, "grad_norm": 0.28409191966056824, "learning_rate": 9.990441313559782e-06, "loss": 0.6881, "step": 1085 }, { "epoch": 0.49129156299479754, "grad_norm": 0.24379833042621613, "learning_rate": 9.990395972578575e-06, "loss": 0.6929, "step": 1086 }, { "epoch": 0.49174394933273013, "grad_norm": 0.24425964057445526, "learning_rate": 9.9903505244191e-06, "loss": 0.5864, "step": 1087 }, { "epoch": 0.49219633567066273, "grad_norm": 0.23963700234889984, "learning_rate": 9.990304969082332e-06, "loss": 0.5594, "step": 1088 }, { "epoch": 0.49264872200859533, "grad_norm": 0.25409695506095886, "learning_rate": 9.990259306569251e-06, "loss": 0.4957, "step": 1089 }, { "epoch": 0.4931011083465279, "grad_norm": 0.24900631606578827, "learning_rate": 9.990213536880836e-06, "loss": 0.6366, "step": 1090 }, { "epoch": 0.4935534946844605, "grad_norm": 0.31536802649497986, "learning_rate": 9.990167660018072e-06, "loss": 0.7167, "step": 1091 }, { "epoch": 0.4940058810223931, "grad_norm": 0.32627785205841064, "learning_rate": 9.990121675981944e-06, "loss": 0.6861, "step": 1092 }, { "epoch": 0.4944582673603257, "grad_norm": 0.3377833664417267, "learning_rate": 9.990075584773435e-06, "loss": 0.6135, "step": 1093 }, { "epoch": 0.4949106536982583, "grad_norm": 0.3363174796104431, "learning_rate": 9.990029386393542e-06, "loss": 0.5617, "step": 1094 }, { "epoch": 0.4953630400361909, "grad_norm": 0.3058222532272339, "learning_rate": 9.989983080843253e-06, "loss": 0.5041, "step": 1095 }, { "epoch": 0.4958154263741235, "grad_norm": 0.3135640323162079, "learning_rate": 9.989936668123564e-06, "loss": 0.5918, "step": 1096 }, { "epoch": 0.4962678127120561, "grad_norm": 0.3004806935787201, "learning_rate": 9.989890148235468e-06, "loss": 0.5749, "step": 1097 }, { "epoch": 0.4967201990499887, "grad_norm": 0.3473157286643982, "learning_rate": 9.989843521179972e-06, "loss": 0.5876, "step": 1098 }, { "epoch": 0.4971725853879213, "grad_norm": 0.3449253439903259, "learning_rate": 9.98979678695807e-06, "loss": 0.5014, "step": 1099 }, { "epoch": 0.4976249717258539, "grad_norm": 0.4083564877510071, "learning_rate": 9.989749945570768e-06, "loss": 0.5631, "step": 1100 }, { "epoch": 0.49807735806378645, "grad_norm": 0.11532290279865265, "learning_rate": 9.989702997019073e-06, "loss": 1.2258, "step": 1101 }, { "epoch": 0.49852974440171904, "grad_norm": 0.17163391411304474, "learning_rate": 9.989655941303993e-06, "loss": 1.0329, "step": 1102 }, { "epoch": 0.49898213073965164, "grad_norm": 0.16278113424777985, "learning_rate": 9.989608778426538e-06, "loss": 0.7133, "step": 1103 }, { "epoch": 0.49943451707758424, "grad_norm": 0.195398211479187, "learning_rate": 9.989561508387721e-06, "loss": 0.7377, "step": 1104 }, { "epoch": 0.49988690341551684, "grad_norm": 0.19261272251605988, "learning_rate": 9.98951413118856e-06, "loss": 0.6022, "step": 1105 }, { "epoch": 0.5003392897534494, "grad_norm": 0.17450179159641266, "learning_rate": 9.989466646830068e-06, "loss": 0.5894, "step": 1106 }, { "epoch": 0.5007916760913821, "grad_norm": 0.20115816593170166, "learning_rate": 9.989419055313266e-06, "loss": 0.6672, "step": 1107 }, { "epoch": 0.5012440624293146, "grad_norm": 0.18284134566783905, "learning_rate": 9.989371356639177e-06, "loss": 0.6774, "step": 1108 }, { "epoch": 0.5016964487672473, "grad_norm": 0.2355135828256607, "learning_rate": 9.989323550808826e-06, "loss": 0.8437, "step": 1109 }, { "epoch": 0.5021488351051798, "grad_norm": 0.20643211901187897, "learning_rate": 9.989275637823239e-06, "loss": 0.6357, "step": 1110 }, { "epoch": 0.5026012214431124, "grad_norm": 0.18918348848819733, "learning_rate": 9.989227617683445e-06, "loss": 0.622, "step": 1111 }, { "epoch": 0.503053607781045, "grad_norm": 0.20704635977745056, "learning_rate": 9.989179490390475e-06, "loss": 0.684, "step": 1112 }, { "epoch": 0.5035059941189776, "grad_norm": 0.21655140817165375, "learning_rate": 9.989131255945363e-06, "loss": 0.8162, "step": 1113 }, { "epoch": 0.5039583804569102, "grad_norm": 0.1872698962688446, "learning_rate": 9.989082914349146e-06, "loss": 0.5544, "step": 1114 }, { "epoch": 0.5044107667948428, "grad_norm": 0.22418074309825897, "learning_rate": 9.989034465602861e-06, "loss": 0.7542, "step": 1115 }, { "epoch": 0.5048631531327754, "grad_norm": 0.21659909188747406, "learning_rate": 9.988985909707549e-06, "loss": 0.661, "step": 1116 }, { "epoch": 0.505315539470708, "grad_norm": 0.2284967452287674, "learning_rate": 9.98893724666425e-06, "loss": 0.7312, "step": 1117 }, { "epoch": 0.5057679258086406, "grad_norm": 0.21439167857170105, "learning_rate": 9.988888476474014e-06, "loss": 0.6663, "step": 1118 }, { "epoch": 0.5062203121465731, "grad_norm": 0.2458486557006836, "learning_rate": 9.988839599137884e-06, "loss": 0.6207, "step": 1119 }, { "epoch": 0.5066726984845058, "grad_norm": 0.21591107547283173, "learning_rate": 9.988790614656915e-06, "loss": 0.5757, "step": 1120 }, { "epoch": 0.5071250848224383, "grad_norm": 0.2181759923696518, "learning_rate": 9.988741523032154e-06, "loss": 0.638, "step": 1121 }, { "epoch": 0.507577471160371, "grad_norm": 0.24597296118736267, "learning_rate": 9.988692324264657e-06, "loss": 0.6391, "step": 1122 }, { "epoch": 0.5080298574983035, "grad_norm": 0.2307048887014389, "learning_rate": 9.988643018355482e-06, "loss": 0.7311, "step": 1123 }, { "epoch": 0.5084822438362362, "grad_norm": 0.1829221546649933, "learning_rate": 9.988593605305686e-06, "loss": 0.5066, "step": 1124 }, { "epoch": 0.5089346301741687, "grad_norm": 0.21307790279388428, "learning_rate": 9.98854408511633e-06, "loss": 0.6224, "step": 1125 }, { "epoch": 0.5093870165121014, "grad_norm": 0.22613902390003204, "learning_rate": 9.98849445778848e-06, "loss": 0.5074, "step": 1126 }, { "epoch": 0.5098394028500339, "grad_norm": 0.2316797375679016, "learning_rate": 9.9884447233232e-06, "loss": 0.6338, "step": 1127 }, { "epoch": 0.5102917891879665, "grad_norm": 0.22519923746585846, "learning_rate": 9.988394881721558e-06, "loss": 0.6355, "step": 1128 }, { "epoch": 0.5107441755258991, "grad_norm": 0.2065473347902298, "learning_rate": 9.988344932984625e-06, "loss": 0.578, "step": 1129 }, { "epoch": 0.5111965618638317, "grad_norm": 0.21057042479515076, "learning_rate": 9.988294877113475e-06, "loss": 0.5834, "step": 1130 }, { "epoch": 0.5116489482017643, "grad_norm": 0.24896006286144257, "learning_rate": 9.98824471410918e-06, "loss": 0.5832, "step": 1131 }, { "epoch": 0.5121013345396969, "grad_norm": 0.25427478551864624, "learning_rate": 9.988194443972821e-06, "loss": 0.5612, "step": 1132 }, { "epoch": 0.5125537208776295, "grad_norm": 0.238283172249794, "learning_rate": 9.988144066705475e-06, "loss": 0.5364, "step": 1133 }, { "epoch": 0.5130061072155621, "grad_norm": 0.2663726508617401, "learning_rate": 9.988093582308225e-06, "loss": 0.6204, "step": 1134 }, { "epoch": 0.5134584935534947, "grad_norm": 0.27040326595306396, "learning_rate": 9.988042990782155e-06, "loss": 0.6564, "step": 1135 }, { "epoch": 0.5139108798914273, "grad_norm": 0.24623458087444305, "learning_rate": 9.98799229212835e-06, "loss": 0.5317, "step": 1136 }, { "epoch": 0.5143632662293599, "grad_norm": 0.3261900544166565, "learning_rate": 9.9879414863479e-06, "loss": 0.6149, "step": 1137 }, { "epoch": 0.5148156525672924, "grad_norm": 0.27358466386795044, "learning_rate": 9.987890573441899e-06, "loss": 0.6139, "step": 1138 }, { "epoch": 0.5152680389052251, "grad_norm": 0.3847373425960541, "learning_rate": 9.987839553411435e-06, "loss": 0.6772, "step": 1139 }, { "epoch": 0.5157204252431576, "grad_norm": 0.28965502977371216, "learning_rate": 9.98778842625761e-06, "loss": 0.6339, "step": 1140 }, { "epoch": 0.5161728115810903, "grad_norm": 0.259716659784317, "learning_rate": 9.987737191981517e-06, "loss": 0.5134, "step": 1141 }, { "epoch": 0.5166251979190228, "grad_norm": 0.27634143829345703, "learning_rate": 9.987685850584258e-06, "loss": 0.5421, "step": 1142 }, { "epoch": 0.5170775842569555, "grad_norm": 0.37332555651664734, "learning_rate": 9.987634402066937e-06, "loss": 0.6566, "step": 1143 }, { "epoch": 0.517529970594888, "grad_norm": 0.2733032703399658, "learning_rate": 9.987582846430658e-06, "loss": 0.5478, "step": 1144 }, { "epoch": 0.5179823569328206, "grad_norm": 0.3204512298107147, "learning_rate": 9.987531183676525e-06, "loss": 0.5017, "step": 1145 }, { "epoch": 0.5184347432707532, "grad_norm": 0.3475666344165802, "learning_rate": 9.987479413805652e-06, "loss": 0.601, "step": 1146 }, { "epoch": 0.5188871296086858, "grad_norm": 0.37602826952934265, "learning_rate": 9.98742753681915e-06, "loss": 0.6162, "step": 1147 }, { "epoch": 0.5193395159466184, "grad_norm": 0.36161550879478455, "learning_rate": 9.987375552718133e-06, "loss": 0.5903, "step": 1148 }, { "epoch": 0.519791902284551, "grad_norm": 0.36596357822418213, "learning_rate": 9.987323461503717e-06, "loss": 0.553, "step": 1149 }, { "epoch": 0.5202442886224836, "grad_norm": 0.4916616380214691, "learning_rate": 9.98727126317702e-06, "loss": 0.7303, "step": 1150 }, { "epoch": 0.5206966749604162, "grad_norm": 0.12974916398525238, "learning_rate": 9.987218957739165e-06, "loss": 1.1396, "step": 1151 }, { "epoch": 0.5211490612983488, "grad_norm": 0.22159859538078308, "learning_rate": 9.987166545191271e-06, "loss": 1.1169, "step": 1152 }, { "epoch": 0.5216014476362814, "grad_norm": 0.18254216015338898, "learning_rate": 9.98711402553447e-06, "loss": 0.7974, "step": 1153 }, { "epoch": 0.522053833974214, "grad_norm": 0.19563539326190948, "learning_rate": 9.987061398769886e-06, "loss": 0.6617, "step": 1154 }, { "epoch": 0.5225062203121466, "grad_norm": 0.1868840605020523, "learning_rate": 9.987008664898649e-06, "loss": 0.6217, "step": 1155 }, { "epoch": 0.5229586066500792, "grad_norm": 0.18637557327747345, "learning_rate": 9.986955823921895e-06, "loss": 0.7113, "step": 1156 }, { "epoch": 0.5234109929880117, "grad_norm": 0.19314000010490417, "learning_rate": 9.986902875840754e-06, "loss": 0.6904, "step": 1157 }, { "epoch": 0.5238633793259444, "grad_norm": 0.1854528933763504, "learning_rate": 9.986849820656366e-06, "loss": 0.6698, "step": 1158 }, { "epoch": 0.5243157656638769, "grad_norm": 0.20883511006832123, "learning_rate": 9.986796658369872e-06, "loss": 0.7683, "step": 1159 }, { "epoch": 0.5247681520018096, "grad_norm": 0.22165347635746002, "learning_rate": 9.98674338898241e-06, "loss": 0.69, "step": 1160 }, { "epoch": 0.5252205383397421, "grad_norm": 0.22566314041614532, "learning_rate": 9.986690012495127e-06, "loss": 0.6691, "step": 1161 }, { "epoch": 0.5256729246776747, "grad_norm": 0.21695281565189362, "learning_rate": 9.986636528909168e-06, "loss": 0.7055, "step": 1162 }, { "epoch": 0.5261253110156073, "grad_norm": 0.19932959973812103, "learning_rate": 9.986582938225683e-06, "loss": 0.5966, "step": 1163 }, { "epoch": 0.5265776973535399, "grad_norm": 0.2132115215063095, "learning_rate": 9.986529240445819e-06, "loss": 0.5915, "step": 1164 }, { "epoch": 0.5270300836914725, "grad_norm": 0.21845661103725433, "learning_rate": 9.986475435570735e-06, "loss": 0.6835, "step": 1165 }, { "epoch": 0.5274824700294051, "grad_norm": 0.23156805336475372, "learning_rate": 9.986421523601582e-06, "loss": 0.6135, "step": 1166 }, { "epoch": 0.5279348563673377, "grad_norm": 0.21425387263298035, "learning_rate": 9.98636750453952e-06, "loss": 0.5736, "step": 1167 }, { "epoch": 0.5283872427052703, "grad_norm": 0.1753532588481903, "learning_rate": 9.98631337838571e-06, "loss": 0.5242, "step": 1168 }, { "epoch": 0.5288396290432029, "grad_norm": 0.22765645384788513, "learning_rate": 9.986259145141312e-06, "loss": 0.6448, "step": 1169 }, { "epoch": 0.5292920153811355, "grad_norm": 0.22298693656921387, "learning_rate": 9.986204804807493e-06, "loss": 0.5754, "step": 1170 }, { "epoch": 0.5297444017190681, "grad_norm": 0.21083657443523407, "learning_rate": 9.986150357385417e-06, "loss": 0.5217, "step": 1171 }, { "epoch": 0.5301967880570007, "grad_norm": 0.20923402905464172, "learning_rate": 9.986095802876256e-06, "loss": 0.5656, "step": 1172 }, { "epoch": 0.5306491743949333, "grad_norm": 0.21340417861938477, "learning_rate": 9.986041141281183e-06, "loss": 0.5765, "step": 1173 }, { "epoch": 0.5311015607328659, "grad_norm": 0.20001165568828583, "learning_rate": 9.985986372601368e-06, "loss": 0.4708, "step": 1174 }, { "epoch": 0.5315539470707985, "grad_norm": 0.25324857234954834, "learning_rate": 9.985931496837992e-06, "loss": 0.7072, "step": 1175 }, { "epoch": 0.532006333408731, "grad_norm": 0.21564704179763794, "learning_rate": 9.985876513992228e-06, "loss": 0.5792, "step": 1176 }, { "epoch": 0.5324587197466637, "grad_norm": 0.23099857568740845, "learning_rate": 9.98582142406526e-06, "loss": 0.6385, "step": 1177 }, { "epoch": 0.5329111060845962, "grad_norm": 0.2346724271774292, "learning_rate": 9.985766227058273e-06, "loss": 0.6094, "step": 1178 }, { "epoch": 0.5333634924225288, "grad_norm": 0.24632368981838226, "learning_rate": 9.985710922972448e-06, "loss": 0.5782, "step": 1179 }, { "epoch": 0.5338158787604614, "grad_norm": 0.23514077067375183, "learning_rate": 9.985655511808977e-06, "loss": 0.5393, "step": 1180 }, { "epoch": 0.534268265098394, "grad_norm": 0.25014036893844604, "learning_rate": 9.985599993569048e-06, "loss": 0.6524, "step": 1181 }, { "epoch": 0.5347206514363266, "grad_norm": 0.2698171138763428, "learning_rate": 9.985544368253852e-06, "loss": 0.6439, "step": 1182 }, { "epoch": 0.5351730377742592, "grad_norm": 0.21626150608062744, "learning_rate": 9.985488635864586e-06, "loss": 0.5787, "step": 1183 }, { "epoch": 0.5356254241121918, "grad_norm": 0.2514764368534088, "learning_rate": 9.985432796402447e-06, "loss": 0.7504, "step": 1184 }, { "epoch": 0.5360778104501244, "grad_norm": 0.25036701560020447, "learning_rate": 9.985376849868633e-06, "loss": 0.5332, "step": 1185 }, { "epoch": 0.536530196788057, "grad_norm": 0.2652897834777832, "learning_rate": 9.985320796264346e-06, "loss": 0.6027, "step": 1186 }, { "epoch": 0.5369825831259896, "grad_norm": 0.24843886494636536, "learning_rate": 9.985264635590788e-06, "loss": 0.58, "step": 1187 }, { "epoch": 0.5374349694639222, "grad_norm": 0.2557542622089386, "learning_rate": 9.98520836784917e-06, "loss": 0.5637, "step": 1188 }, { "epoch": 0.5378873558018548, "grad_norm": 0.28459012508392334, "learning_rate": 9.985151993040694e-06, "loss": 0.5463, "step": 1189 }, { "epoch": 0.5383397421397874, "grad_norm": 0.24615009129047394, "learning_rate": 9.985095511166576e-06, "loss": 0.5897, "step": 1190 }, { "epoch": 0.53879212847772, "grad_norm": 0.24527877569198608, "learning_rate": 9.985038922228027e-06, "loss": 0.5492, "step": 1191 }, { "epoch": 0.5392445148156526, "grad_norm": 0.27791738510131836, "learning_rate": 9.984982226226263e-06, "loss": 0.6089, "step": 1192 }, { "epoch": 0.5396969011535852, "grad_norm": 0.28754472732543945, "learning_rate": 9.984925423162502e-06, "loss": 0.6338, "step": 1193 }, { "epoch": 0.5401492874915178, "grad_norm": 0.27946940064430237, "learning_rate": 9.984868513037962e-06, "loss": 0.5558, "step": 1194 }, { "epoch": 0.5406016738294503, "grad_norm": 0.3304057717323303, "learning_rate": 9.984811495853868e-06, "loss": 0.6274, "step": 1195 }, { "epoch": 0.541054060167383, "grad_norm": 0.3552910387516022, "learning_rate": 9.98475437161144e-06, "loss": 0.6064, "step": 1196 }, { "epoch": 0.5415064465053155, "grad_norm": 0.3503250479698181, "learning_rate": 9.98469714031191e-06, "loss": 0.59, "step": 1197 }, { "epoch": 0.5419588328432481, "grad_norm": 0.3497371971607208, "learning_rate": 9.984639801956506e-06, "loss": 0.6554, "step": 1198 }, { "epoch": 0.5424112191811807, "grad_norm": 0.3199635148048401, "learning_rate": 9.984582356546457e-06, "loss": 0.4813, "step": 1199 }, { "epoch": 0.5428636055191133, "grad_norm": 0.47353509068489075, "learning_rate": 9.984524804082998e-06, "loss": 0.6778, "step": 1200 }, { "epoch": 0.5428636055191133, "eval_loss": 0.6323645114898682, "eval_runtime": 27.4192, "eval_samples_per_second": 27.134, "eval_steps_per_second": 6.784, "step": 1200 }, { "epoch": 0.5433159918570459, "grad_norm": 0.12181596457958221, "learning_rate": 9.984467144567366e-06, "loss": 1.2488, "step": 1201 }, { "epoch": 0.5437683781949785, "grad_norm": 0.11425438523292542, "learning_rate": 9.9844093780008e-06, "loss": 0.6886, "step": 1202 }, { "epoch": 0.5442207645329111, "grad_norm": 0.16937197744846344, "learning_rate": 9.984351504384535e-06, "loss": 0.9439, "step": 1203 }, { "epoch": 0.5446731508708437, "grad_norm": 0.16759510338306427, "learning_rate": 9.984293523719822e-06, "loss": 0.7698, "step": 1204 }, { "epoch": 0.5451255372087763, "grad_norm": 0.1654934287071228, "learning_rate": 9.984235436007901e-06, "loss": 0.5827, "step": 1205 }, { "epoch": 0.5455779235467089, "grad_norm": 0.1765814572572708, "learning_rate": 9.984177241250022e-06, "loss": 0.6756, "step": 1206 }, { "epoch": 0.5460303098846415, "grad_norm": 0.17892947793006897, "learning_rate": 9.984118939447432e-06, "loss": 0.6187, "step": 1207 }, { "epoch": 0.5464826962225741, "grad_norm": 0.1850893348455429, "learning_rate": 9.984060530601388e-06, "loss": 0.6009, "step": 1208 }, { "epoch": 0.5469350825605067, "grad_norm": 0.19826403260231018, "learning_rate": 9.984002014713139e-06, "loss": 0.7145, "step": 1209 }, { "epoch": 0.5473874688984393, "grad_norm": 0.21711209416389465, "learning_rate": 9.983943391783943e-06, "loss": 0.6125, "step": 1210 }, { "epoch": 0.5478398552363719, "grad_norm": 0.1996651291847229, "learning_rate": 9.983884661815064e-06, "loss": 0.6317, "step": 1211 }, { "epoch": 0.5482922415743045, "grad_norm": 0.2089514136314392, "learning_rate": 9.983825824807756e-06, "loss": 0.6929, "step": 1212 }, { "epoch": 0.5487446279122371, "grad_norm": 0.21617907285690308, "learning_rate": 9.983766880763288e-06, "loss": 0.6992, "step": 1213 }, { "epoch": 0.5491970142501696, "grad_norm": 0.23066048324108124, "learning_rate": 9.983707829682924e-06, "loss": 0.6016, "step": 1214 }, { "epoch": 0.5496494005881022, "grad_norm": 0.18448014557361603, "learning_rate": 9.983648671567931e-06, "loss": 0.666, "step": 1215 }, { "epoch": 0.5501017869260348, "grad_norm": 0.20046593248844147, "learning_rate": 9.983589406419583e-06, "loss": 0.5195, "step": 1216 }, { "epoch": 0.5505541732639674, "grad_norm": 0.20790576934814453, "learning_rate": 9.983530034239151e-06, "loss": 0.5928, "step": 1217 }, { "epoch": 0.5510065596019, "grad_norm": 0.2464827597141266, "learning_rate": 9.983470555027908e-06, "loss": 0.9024, "step": 1218 }, { "epoch": 0.5514589459398326, "grad_norm": 0.2057012915611267, "learning_rate": 9.983410968787134e-06, "loss": 0.4955, "step": 1219 }, { "epoch": 0.5519113322777652, "grad_norm": 0.2335517257452011, "learning_rate": 9.983351275518108e-06, "loss": 0.6654, "step": 1220 }, { "epoch": 0.5523637186156978, "grad_norm": 0.19507312774658203, "learning_rate": 9.983291475222113e-06, "loss": 0.5582, "step": 1221 }, { "epoch": 0.5528161049536304, "grad_norm": 0.21147161722183228, "learning_rate": 9.98323156790043e-06, "loss": 0.6364, "step": 1222 }, { "epoch": 0.553268491291563, "grad_norm": 0.22222808003425598, "learning_rate": 9.98317155355435e-06, "loss": 0.5779, "step": 1223 }, { "epoch": 0.5537208776294956, "grad_norm": 0.194444939494133, "learning_rate": 9.983111432185159e-06, "loss": 0.5638, "step": 1224 }, { "epoch": 0.5541732639674282, "grad_norm": 0.23081545531749725, "learning_rate": 9.98305120379415e-06, "loss": 0.6501, "step": 1225 }, { "epoch": 0.5546256503053608, "grad_norm": 0.23809808492660522, "learning_rate": 9.982990868382614e-06, "loss": 0.6237, "step": 1226 }, { "epoch": 0.5550780366432934, "grad_norm": 0.23167423903942108, "learning_rate": 9.982930425951849e-06, "loss": 0.5559, "step": 1227 }, { "epoch": 0.555530422981226, "grad_norm": 0.24642497301101685, "learning_rate": 9.982869876503154e-06, "loss": 0.6736, "step": 1228 }, { "epoch": 0.5559828093191586, "grad_norm": 0.23241057991981506, "learning_rate": 9.982809220037826e-06, "loss": 0.6905, "step": 1229 }, { "epoch": 0.5564351956570912, "grad_norm": 0.24046652019023895, "learning_rate": 9.982748456557172e-06, "loss": 0.6466, "step": 1230 }, { "epoch": 0.5568875819950238, "grad_norm": 0.23461534082889557, "learning_rate": 9.982687586062494e-06, "loss": 0.495, "step": 1231 }, { "epoch": 0.5573399683329563, "grad_norm": 0.26217392086982727, "learning_rate": 9.9826266085551e-06, "loss": 0.5651, "step": 1232 }, { "epoch": 0.557792354670889, "grad_norm": 0.24970905482769012, "learning_rate": 9.982565524036299e-06, "loss": 0.6441, "step": 1233 }, { "epoch": 0.5582447410088215, "grad_norm": 0.2653008699417114, "learning_rate": 9.982504332507404e-06, "loss": 0.6182, "step": 1234 }, { "epoch": 0.5586971273467541, "grad_norm": 0.23248615860939026, "learning_rate": 9.982443033969727e-06, "loss": 0.5309, "step": 1235 }, { "epoch": 0.5591495136846867, "grad_norm": 0.25020456314086914, "learning_rate": 9.982381628424588e-06, "loss": 0.6471, "step": 1236 }, { "epoch": 0.5596019000226193, "grad_norm": 0.24329841136932373, "learning_rate": 9.982320115873305e-06, "loss": 0.5548, "step": 1237 }, { "epoch": 0.5600542863605519, "grad_norm": 0.3161710500717163, "learning_rate": 9.982258496317196e-06, "loss": 0.614, "step": 1238 }, { "epoch": 0.5605066726984845, "grad_norm": 0.30509883165359497, "learning_rate": 9.982196769757587e-06, "loss": 0.6535, "step": 1239 }, { "epoch": 0.5609590590364171, "grad_norm": 0.27226534485816956, "learning_rate": 9.982134936195805e-06, "loss": 0.563, "step": 1240 }, { "epoch": 0.5614114453743497, "grad_norm": 0.29107049107551575, "learning_rate": 9.982072995633174e-06, "loss": 0.5986, "step": 1241 }, { "epoch": 0.5618638317122823, "grad_norm": 0.2828601598739624, "learning_rate": 9.982010948071028e-06, "loss": 0.461, "step": 1242 }, { "epoch": 0.5623162180502149, "grad_norm": 0.2544436752796173, "learning_rate": 9.981948793510697e-06, "loss": 0.4874, "step": 1243 }, { "epoch": 0.5627686043881475, "grad_norm": 0.2832776606082916, "learning_rate": 9.981886531953518e-06, "loss": 0.6101, "step": 1244 }, { "epoch": 0.5632209907260801, "grad_norm": 0.3170752227306366, "learning_rate": 9.981824163400827e-06, "loss": 0.5124, "step": 1245 }, { "epoch": 0.5636733770640127, "grad_norm": 0.2881445586681366, "learning_rate": 9.981761687853964e-06, "loss": 0.5801, "step": 1246 }, { "epoch": 0.5641257634019453, "grad_norm": 0.33425191044807434, "learning_rate": 9.98169910531427e-06, "loss": 0.5989, "step": 1247 }, { "epoch": 0.5645781497398779, "grad_norm": 0.40390005707740784, "learning_rate": 9.98163641578309e-06, "loss": 0.6176, "step": 1248 }, { "epoch": 0.5650305360778104, "grad_norm": 0.35231706500053406, "learning_rate": 9.981573619261768e-06, "loss": 0.5808, "step": 1249 }, { "epoch": 0.565482922415743, "grad_norm": 0.5114802718162537, "learning_rate": 9.981510715751656e-06, "loss": 0.6656, "step": 1250 }, { "epoch": 0.5659353087536756, "grad_norm": 0.13619577884674072, "learning_rate": 9.981447705254105e-06, "loss": 1.2479, "step": 1251 }, { "epoch": 0.5663876950916082, "grad_norm": 0.18898463249206543, "learning_rate": 9.981384587770467e-06, "loss": 0.9969, "step": 1252 }, { "epoch": 0.5668400814295408, "grad_norm": 0.17754599452018738, "learning_rate": 9.981321363302095e-06, "loss": 0.6875, "step": 1253 }, { "epoch": 0.5672924677674734, "grad_norm": 0.18311157822608948, "learning_rate": 9.981258031850349e-06, "loss": 0.6958, "step": 1254 }, { "epoch": 0.567744854105406, "grad_norm": 0.16261844336986542, "learning_rate": 9.981194593416592e-06, "loss": 0.6673, "step": 1255 }, { "epoch": 0.5681972404433386, "grad_norm": 0.19569909572601318, "learning_rate": 9.981131048002182e-06, "loss": 0.6629, "step": 1256 }, { "epoch": 0.5686496267812712, "grad_norm": 0.20824256539344788, "learning_rate": 9.981067395608486e-06, "loss": 0.7907, "step": 1257 }, { "epoch": 0.5691020131192038, "grad_norm": 0.1952500194311142, "learning_rate": 9.981003636236872e-06, "loss": 0.6779, "step": 1258 }, { "epoch": 0.5695543994571364, "grad_norm": 0.22663311660289764, "learning_rate": 9.980939769888706e-06, "loss": 0.6874, "step": 1259 }, { "epoch": 0.570006785795069, "grad_norm": 0.1940218210220337, "learning_rate": 9.980875796565363e-06, "loss": 0.5406, "step": 1260 }, { "epoch": 0.5704591721330016, "grad_norm": 0.2153218388557434, "learning_rate": 9.980811716268215e-06, "loss": 0.7165, "step": 1261 }, { "epoch": 0.5709115584709342, "grad_norm": 0.2164214551448822, "learning_rate": 9.98074752899864e-06, "loss": 0.7661, "step": 1262 }, { "epoch": 0.5713639448088668, "grad_norm": 0.17862090468406677, "learning_rate": 9.980683234758015e-06, "loss": 0.4828, "step": 1263 }, { "epoch": 0.5718163311467994, "grad_norm": 0.1959761083126068, "learning_rate": 9.98061883354772e-06, "loss": 0.5492, "step": 1264 }, { "epoch": 0.572268717484732, "grad_norm": 0.22993987798690796, "learning_rate": 9.980554325369141e-06, "loss": 0.759, "step": 1265 }, { "epoch": 0.5727211038226645, "grad_norm": 0.25827789306640625, "learning_rate": 9.980489710223662e-06, "loss": 0.6394, "step": 1266 }, { "epoch": 0.5731734901605972, "grad_norm": 0.19186359643936157, "learning_rate": 9.980424988112669e-06, "loss": 0.5943, "step": 1267 }, { "epoch": 0.5736258764985297, "grad_norm": 0.2542232275009155, "learning_rate": 9.980360159037555e-06, "loss": 0.8146, "step": 1268 }, { "epoch": 0.5740782628364623, "grad_norm": 0.22708389163017273, "learning_rate": 9.98029522299971e-06, "loss": 0.6651, "step": 1269 }, { "epoch": 0.5745306491743949, "grad_norm": 0.24626114964485168, "learning_rate": 9.98023018000053e-06, "loss": 0.5761, "step": 1270 }, { "epoch": 0.5749830355123275, "grad_norm": 0.20677554607391357, "learning_rate": 9.980165030041412e-06, "loss": 0.6182, "step": 1271 }, { "epoch": 0.5754354218502601, "grad_norm": 0.2260519564151764, "learning_rate": 9.980099773123754e-06, "loss": 0.6129, "step": 1272 }, { "epoch": 0.5758878081881927, "grad_norm": 0.21111872792243958, "learning_rate": 9.98003440924896e-06, "loss": 0.6403, "step": 1273 }, { "epoch": 0.5763401945261253, "grad_norm": 0.2554069757461548, "learning_rate": 9.979968938418429e-06, "loss": 0.7931, "step": 1274 }, { "epoch": 0.5767925808640579, "grad_norm": 0.23454685509204865, "learning_rate": 9.979903360633573e-06, "loss": 0.667, "step": 1275 }, { "epoch": 0.5772449672019905, "grad_norm": 0.22072899341583252, "learning_rate": 9.979837675895797e-06, "loss": 0.4877, "step": 1276 }, { "epoch": 0.5776973535399231, "grad_norm": 0.24346807599067688, "learning_rate": 9.979771884206512e-06, "loss": 0.5536, "step": 1277 }, { "epoch": 0.5781497398778557, "grad_norm": 0.2247954159975052, "learning_rate": 9.979705985567131e-06, "loss": 0.5953, "step": 1278 }, { "epoch": 0.5786021262157883, "grad_norm": 0.20974291861057281, "learning_rate": 9.97963997997907e-06, "loss": 0.522, "step": 1279 }, { "epoch": 0.5790545125537209, "grad_norm": 0.244164377450943, "learning_rate": 9.979573867443746e-06, "loss": 0.59, "step": 1280 }, { "epoch": 0.5795068988916535, "grad_norm": 0.27396851778030396, "learning_rate": 9.979507647962579e-06, "loss": 0.6718, "step": 1281 }, { "epoch": 0.5799592852295861, "grad_norm": 0.36984163522720337, "learning_rate": 9.979441321536992e-06, "loss": 0.6116, "step": 1282 }, { "epoch": 0.5804116715675186, "grad_norm": 0.2471388578414917, "learning_rate": 9.979374888168406e-06, "loss": 0.5666, "step": 1283 }, { "epoch": 0.5808640579054513, "grad_norm": 0.24115817248821259, "learning_rate": 9.979308347858254e-06, "loss": 0.5271, "step": 1284 }, { "epoch": 0.5813164442433838, "grad_norm": 0.281093955039978, "learning_rate": 9.979241700607959e-06, "loss": 0.5966, "step": 1285 }, { "epoch": 0.5817688305813165, "grad_norm": 0.2402959018945694, "learning_rate": 9.979174946418957e-06, "loss": 0.5503, "step": 1286 }, { "epoch": 0.582221216919249, "grad_norm": 0.30930906534194946, "learning_rate": 9.979108085292678e-06, "loss": 0.6665, "step": 1287 }, { "epoch": 0.5826736032571816, "grad_norm": 0.26872602105140686, "learning_rate": 9.979041117230562e-06, "loss": 0.6084, "step": 1288 }, { "epoch": 0.5831259895951142, "grad_norm": 0.27255573868751526, "learning_rate": 9.978974042234041e-06, "loss": 0.5093, "step": 1289 }, { "epoch": 0.5835783759330468, "grad_norm": 0.3659798800945282, "learning_rate": 9.978906860304563e-06, "loss": 0.6543, "step": 1290 }, { "epoch": 0.5840307622709794, "grad_norm": 0.2861217260360718, "learning_rate": 9.978839571443566e-06, "loss": 0.5068, "step": 1291 }, { "epoch": 0.584483148608912, "grad_norm": 0.3017103970050812, "learning_rate": 9.978772175652495e-06, "loss": 0.5238, "step": 1292 }, { "epoch": 0.5849355349468446, "grad_norm": 0.39951297640800476, "learning_rate": 9.978704672932802e-06, "loss": 0.7201, "step": 1293 }, { "epoch": 0.5853879212847772, "grad_norm": 0.31259772181510925, "learning_rate": 9.978637063285932e-06, "loss": 0.5222, "step": 1294 }, { "epoch": 0.5858403076227098, "grad_norm": 0.3594709038734436, "learning_rate": 9.97856934671334e-06, "loss": 0.5441, "step": 1295 }, { "epoch": 0.5862926939606424, "grad_norm": 0.3655655086040497, "learning_rate": 9.978501523216477e-06, "loss": 0.6331, "step": 1296 }, { "epoch": 0.586745080298575, "grad_norm": 0.336794912815094, "learning_rate": 9.978433592796804e-06, "loss": 0.5302, "step": 1297 }, { "epoch": 0.5871974666365076, "grad_norm": 0.48038870096206665, "learning_rate": 9.978365555455775e-06, "loss": 0.5958, "step": 1298 }, { "epoch": 0.5876498529744402, "grad_norm": 0.36571207642555237, "learning_rate": 9.978297411194856e-06, "loss": 0.6046, "step": 1299 }, { "epoch": 0.5881022393123728, "grad_norm": 0.4999653100967407, "learning_rate": 9.978229160015508e-06, "loss": 0.5005, "step": 1300 }, { "epoch": 0.5885546256503054, "grad_norm": 0.16710057854652405, "learning_rate": 9.978160801919196e-06, "loss": 1.0104, "step": 1301 }, { "epoch": 0.5890070119882379, "grad_norm": 0.178179532289505, "learning_rate": 9.978092336907392e-06, "loss": 0.6367, "step": 1302 }, { "epoch": 0.5894593983261706, "grad_norm": 0.2183419167995453, "learning_rate": 9.978023764981562e-06, "loss": 0.7579, "step": 1303 }, { "epoch": 0.5899117846641031, "grad_norm": 0.18940821290016174, "learning_rate": 9.97795508614318e-06, "loss": 0.6993, "step": 1304 }, { "epoch": 0.5903641710020358, "grad_norm": 0.1830447018146515, "learning_rate": 9.977886300393723e-06, "loss": 0.5671, "step": 1305 }, { "epoch": 0.5908165573399683, "grad_norm": 0.25770366191864014, "learning_rate": 9.977817407734665e-06, "loss": 0.6756, "step": 1306 }, { "epoch": 0.591268943677901, "grad_norm": 0.1859053075313568, "learning_rate": 9.977748408167489e-06, "loss": 0.666, "step": 1307 }, { "epoch": 0.5917213300158335, "grad_norm": 0.20140452682971954, "learning_rate": 9.977679301693673e-06, "loss": 0.7736, "step": 1308 }, { "epoch": 0.5921737163537661, "grad_norm": 0.20370768010616302, "learning_rate": 9.977610088314706e-06, "loss": 0.6368, "step": 1309 }, { "epoch": 0.5926261026916987, "grad_norm": 0.21072159707546234, "learning_rate": 9.97754076803207e-06, "loss": 0.6535, "step": 1310 }, { "epoch": 0.5930784890296313, "grad_norm": 0.21940192580223083, "learning_rate": 9.977471340847259e-06, "loss": 0.7373, "step": 1311 }, { "epoch": 0.5935308753675639, "grad_norm": 0.23162981867790222, "learning_rate": 9.977401806761757e-06, "loss": 0.5776, "step": 1312 }, { "epoch": 0.5939832617054965, "grad_norm": 0.24975769221782684, "learning_rate": 9.977332165777063e-06, "loss": 0.6799, "step": 1313 }, { "epoch": 0.5944356480434291, "grad_norm": 0.19718357920646667, "learning_rate": 9.977262417894671e-06, "loss": 0.5034, "step": 1314 }, { "epoch": 0.5948880343813617, "grad_norm": 0.23429250717163086, "learning_rate": 9.977192563116078e-06, "loss": 0.6988, "step": 1315 }, { "epoch": 0.5953404207192943, "grad_norm": 0.1900215446949005, "learning_rate": 9.977122601442786e-06, "loss": 0.5724, "step": 1316 }, { "epoch": 0.5957928070572269, "grad_norm": 0.21339181065559387, "learning_rate": 9.977052532876296e-06, "loss": 0.6251, "step": 1317 }, { "epoch": 0.5962451933951595, "grad_norm": 0.2803313732147217, "learning_rate": 9.976982357418114e-06, "loss": 0.7155, "step": 1318 }, { "epoch": 0.596697579733092, "grad_norm": 0.23995469510555267, "learning_rate": 9.976912075069747e-06, "loss": 0.7377, "step": 1319 }, { "epoch": 0.5971499660710247, "grad_norm": 0.21672214567661285, "learning_rate": 9.976841685832704e-06, "loss": 0.5918, "step": 1320 }, { "epoch": 0.5976023524089572, "grad_norm": 0.24420355260372162, "learning_rate": 9.976771189708497e-06, "loss": 0.6614, "step": 1321 }, { "epoch": 0.5980547387468899, "grad_norm": 0.24804209172725677, "learning_rate": 9.97670058669864e-06, "loss": 0.6762, "step": 1322 }, { "epoch": 0.5985071250848224, "grad_norm": 0.2200249433517456, "learning_rate": 9.976629876804649e-06, "loss": 0.5277, "step": 1323 }, { "epoch": 0.598959511422755, "grad_norm": 0.23294657468795776, "learning_rate": 9.976559060028042e-06, "loss": 0.6361, "step": 1324 }, { "epoch": 0.5994118977606876, "grad_norm": 0.26586559414863586, "learning_rate": 9.976488136370342e-06, "loss": 0.6499, "step": 1325 }, { "epoch": 0.5998642840986202, "grad_norm": 0.2539757192134857, "learning_rate": 9.97641710583307e-06, "loss": 0.5847, "step": 1326 }, { "epoch": 0.6003166704365528, "grad_norm": 0.2767554223537445, "learning_rate": 9.976345968417754e-06, "loss": 0.6116, "step": 1327 }, { "epoch": 0.6007690567744854, "grad_norm": 0.23403988778591156, "learning_rate": 9.976274724125919e-06, "loss": 0.5964, "step": 1328 }, { "epoch": 0.601221443112418, "grad_norm": 0.2559235394001007, "learning_rate": 9.976203372959097e-06, "loss": 0.6173, "step": 1329 }, { "epoch": 0.6016738294503506, "grad_norm": 0.2381136566400528, "learning_rate": 9.97613191491882e-06, "loss": 0.6197, "step": 1330 }, { "epoch": 0.6021262157882832, "grad_norm": 0.23071886599063873, "learning_rate": 9.976060350006623e-06, "loss": 0.5089, "step": 1331 }, { "epoch": 0.6025786021262158, "grad_norm": 0.2597028613090515, "learning_rate": 9.975988678224042e-06, "loss": 0.6766, "step": 1332 }, { "epoch": 0.6030309884641484, "grad_norm": 0.23802952468395233, "learning_rate": 9.975916899572618e-06, "loss": 0.5732, "step": 1333 }, { "epoch": 0.603483374802081, "grad_norm": 0.2591016888618469, "learning_rate": 9.97584501405389e-06, "loss": 0.5702, "step": 1334 }, { "epoch": 0.6039357611400136, "grad_norm": 0.2882373332977295, "learning_rate": 9.975773021669403e-06, "loss": 0.638, "step": 1335 }, { "epoch": 0.6043881474779461, "grad_norm": 0.252169668674469, "learning_rate": 9.975700922420704e-06, "loss": 0.5446, "step": 1336 }, { "epoch": 0.6048405338158788, "grad_norm": 0.2760002911090851, "learning_rate": 9.975628716309341e-06, "loss": 0.5521, "step": 1337 }, { "epoch": 0.6052929201538113, "grad_norm": 0.30561476945877075, "learning_rate": 9.975556403336865e-06, "loss": 0.6335, "step": 1338 }, { "epoch": 0.605745306491744, "grad_norm": 0.31025025248527527, "learning_rate": 9.975483983504829e-06, "loss": 0.5885, "step": 1339 }, { "epoch": 0.6061976928296765, "grad_norm": 0.26407113671302795, "learning_rate": 9.975411456814787e-06, "loss": 0.4591, "step": 1340 }, { "epoch": 0.6066500791676092, "grad_norm": 0.26874691247940063, "learning_rate": 9.9753388232683e-06, "loss": 0.5574, "step": 1341 }, { "epoch": 0.6071024655055417, "grad_norm": 0.34289222955703735, "learning_rate": 9.975266082866923e-06, "loss": 0.5946, "step": 1342 }, { "epoch": 0.6075548518434744, "grad_norm": 0.2607809603214264, "learning_rate": 9.975193235612223e-06, "loss": 0.5386, "step": 1343 }, { "epoch": 0.6080072381814069, "grad_norm": 0.3389042019844055, "learning_rate": 9.97512028150576e-06, "loss": 0.6709, "step": 1344 }, { "epoch": 0.6084596245193395, "grad_norm": 0.2854250371456146, "learning_rate": 9.975047220549106e-06, "loss": 0.5633, "step": 1345 }, { "epoch": 0.6089120108572721, "grad_norm": 0.32076773047447205, "learning_rate": 9.974974052743826e-06, "loss": 0.5656, "step": 1346 }, { "epoch": 0.6093643971952047, "grad_norm": 0.31141847372055054, "learning_rate": 9.974900778091494e-06, "loss": 0.5517, "step": 1347 }, { "epoch": 0.6098167835331373, "grad_norm": 0.3366420269012451, "learning_rate": 9.974827396593682e-06, "loss": 0.5342, "step": 1348 }, { "epoch": 0.6102691698710699, "grad_norm": 0.45233777165412903, "learning_rate": 9.974753908251967e-06, "loss": 0.6397, "step": 1349 }, { "epoch": 0.6107215562090025, "grad_norm": 0.5165599584579468, "learning_rate": 9.974680313067926e-06, "loss": 0.6562, "step": 1350 }, { "epoch": 0.6111739425469351, "grad_norm": 0.12562334537506104, "learning_rate": 9.974606611043142e-06, "loss": 1.0821, "step": 1351 }, { "epoch": 0.6116263288848677, "grad_norm": 0.1455240249633789, "learning_rate": 9.974532802179195e-06, "loss": 0.7859, "step": 1352 }, { "epoch": 0.6120787152228002, "grad_norm": 0.15720190107822418, "learning_rate": 9.974458886477672e-06, "loss": 0.6446, "step": 1353 }, { "epoch": 0.6125311015607329, "grad_norm": 0.1773960143327713, "learning_rate": 9.97438486394016e-06, "loss": 0.674, "step": 1354 }, { "epoch": 0.6129834878986654, "grad_norm": 0.18249064683914185, "learning_rate": 9.974310734568249e-06, "loss": 0.8129, "step": 1355 }, { "epoch": 0.6134358742365981, "grad_norm": 0.17175471782684326, "learning_rate": 9.97423649836353e-06, "loss": 0.6471, "step": 1356 }, { "epoch": 0.6138882605745306, "grad_norm": 0.1850937306880951, "learning_rate": 9.974162155327599e-06, "loss": 0.6934, "step": 1357 }, { "epoch": 0.6143406469124633, "grad_norm": 0.18064653873443604, "learning_rate": 9.974087705462052e-06, "loss": 0.5308, "step": 1358 }, { "epoch": 0.6147930332503958, "grad_norm": 0.19484154880046844, "learning_rate": 9.974013148768488e-06, "loss": 0.6124, "step": 1359 }, { "epoch": 0.6152454195883285, "grad_norm": 0.1919710785150528, "learning_rate": 9.973938485248508e-06, "loss": 0.7448, "step": 1360 }, { "epoch": 0.615697805926261, "grad_norm": 0.19641529023647308, "learning_rate": 9.973863714903715e-06, "loss": 0.6688, "step": 1361 }, { "epoch": 0.6161501922641937, "grad_norm": 0.19296810030937195, "learning_rate": 9.973788837735717e-06, "loss": 0.6758, "step": 1362 }, { "epoch": 0.6166025786021262, "grad_norm": 0.23394355177879333, "learning_rate": 9.973713853746118e-06, "loss": 0.7419, "step": 1363 }, { "epoch": 0.6170549649400588, "grad_norm": 0.1880982667207718, "learning_rate": 9.973638762936532e-06, "loss": 0.6203, "step": 1364 }, { "epoch": 0.6175073512779914, "grad_norm": 0.210435688495636, "learning_rate": 9.973563565308572e-06, "loss": 0.6901, "step": 1365 }, { "epoch": 0.617959737615924, "grad_norm": 0.20826871693134308, "learning_rate": 9.973488260863851e-06, "loss": 0.5981, "step": 1366 }, { "epoch": 0.6184121239538566, "grad_norm": 0.22301599383354187, "learning_rate": 9.973412849603987e-06, "loss": 0.6653, "step": 1367 }, { "epoch": 0.6188645102917892, "grad_norm": 0.22319377958774567, "learning_rate": 9.9733373315306e-06, "loss": 0.4945, "step": 1368 }, { "epoch": 0.6193168966297218, "grad_norm": 0.22227583825588226, "learning_rate": 9.973261706645312e-06, "loss": 0.4789, "step": 1369 }, { "epoch": 0.6197692829676543, "grad_norm": 0.23231162130832672, "learning_rate": 9.973185974949746e-06, "loss": 0.766, "step": 1370 }, { "epoch": 0.620221669305587, "grad_norm": 0.25594112277030945, "learning_rate": 9.973110136445529e-06, "loss": 0.8316, "step": 1371 }, { "epoch": 0.6206740556435195, "grad_norm": 0.2378379851579666, "learning_rate": 9.973034191134291e-06, "loss": 0.6286, "step": 1372 }, { "epoch": 0.6211264419814522, "grad_norm": 0.22360694408416748, "learning_rate": 9.972958139017664e-06, "loss": 0.5506, "step": 1373 }, { "epoch": 0.6215788283193847, "grad_norm": 0.2304278016090393, "learning_rate": 9.972881980097275e-06, "loss": 0.5562, "step": 1374 }, { "epoch": 0.6220312146573174, "grad_norm": 0.23063276708126068, "learning_rate": 9.972805714374766e-06, "loss": 0.5635, "step": 1375 }, { "epoch": 0.6224836009952499, "grad_norm": 0.233733668923378, "learning_rate": 9.972729341851775e-06, "loss": 0.6608, "step": 1376 }, { "epoch": 0.6229359873331826, "grad_norm": 0.23945015668869019, "learning_rate": 9.972652862529937e-06, "loss": 0.5393, "step": 1377 }, { "epoch": 0.6233883736711151, "grad_norm": 0.22260062396526337, "learning_rate": 9.9725762764109e-06, "loss": 0.5477, "step": 1378 }, { "epoch": 0.6238407600090478, "grad_norm": 0.24138586223125458, "learning_rate": 9.972499583496307e-06, "loss": 0.5909, "step": 1379 }, { "epoch": 0.6242931463469803, "grad_norm": 0.23886005580425262, "learning_rate": 9.972422783787802e-06, "loss": 0.5826, "step": 1380 }, { "epoch": 0.624745532684913, "grad_norm": 0.25808048248291016, "learning_rate": 9.97234587728704e-06, "loss": 0.6892, "step": 1381 }, { "epoch": 0.6251979190228455, "grad_norm": 0.22638222575187683, "learning_rate": 9.97226886399567e-06, "loss": 0.4915, "step": 1382 }, { "epoch": 0.6256503053607781, "grad_norm": 0.2443896383047104, "learning_rate": 9.972191743915346e-06, "loss": 0.5497, "step": 1383 }, { "epoch": 0.6261026916987107, "grad_norm": 0.2354108691215515, "learning_rate": 9.972114517047723e-06, "loss": 0.5688, "step": 1384 }, { "epoch": 0.6265550780366433, "grad_norm": 0.2540557086467743, "learning_rate": 9.972037183394461e-06, "loss": 0.5423, "step": 1385 }, { "epoch": 0.6270074643745759, "grad_norm": 0.28038862347602844, "learning_rate": 9.97195974295722e-06, "loss": 0.6128, "step": 1386 }, { "epoch": 0.6274598507125084, "grad_norm": 0.2310100495815277, "learning_rate": 9.971882195737665e-06, "loss": 0.4942, "step": 1387 }, { "epoch": 0.6279122370504411, "grad_norm": 0.2552047371864319, "learning_rate": 9.971804541737461e-06, "loss": 0.5574, "step": 1388 }, { "epoch": 0.6283646233883736, "grad_norm": 0.3050123453140259, "learning_rate": 9.971726780958275e-06, "loss": 0.4964, "step": 1389 }, { "epoch": 0.6288170097263063, "grad_norm": 0.31352075934410095, "learning_rate": 9.971648913401776e-06, "loss": 0.6025, "step": 1390 }, { "epoch": 0.6292693960642388, "grad_norm": 0.3218671977519989, "learning_rate": 9.971570939069637e-06, "loss": 0.6359, "step": 1391 }, { "epoch": 0.6297217824021715, "grad_norm": 0.2634393274784088, "learning_rate": 9.971492857963536e-06, "loss": 0.4508, "step": 1392 }, { "epoch": 0.630174168740104, "grad_norm": 0.3371010720729828, "learning_rate": 9.971414670085145e-06, "loss": 0.5244, "step": 1393 }, { "epoch": 0.6306265550780367, "grad_norm": 0.34002572298049927, "learning_rate": 9.971336375436147e-06, "loss": 0.5167, "step": 1394 }, { "epoch": 0.6310789414159692, "grad_norm": 0.3060538172721863, "learning_rate": 9.97125797401822e-06, "loss": 0.5143, "step": 1395 }, { "epoch": 0.6315313277539019, "grad_norm": 0.4034123420715332, "learning_rate": 9.97117946583305e-06, "loss": 0.7068, "step": 1396 }, { "epoch": 0.6319837140918344, "grad_norm": 0.2560774087905884, "learning_rate": 9.971100850882324e-06, "loss": 0.5246, "step": 1397 }, { "epoch": 0.6324361004297671, "grad_norm": 0.410073459148407, "learning_rate": 9.971022129167729e-06, "loss": 0.5953, "step": 1398 }, { "epoch": 0.6328884867676996, "grad_norm": 0.37273073196411133, "learning_rate": 9.970943300690956e-06, "loss": 0.4855, "step": 1399 }, { "epoch": 0.6333408731056323, "grad_norm": 0.4520987272262573, "learning_rate": 9.970864365453698e-06, "loss": 0.5791, "step": 1400 }, { "epoch": 0.6333408731056323, "eval_loss": 0.6268455982208252, "eval_runtime": 25.6967, "eval_samples_per_second": 28.953, "eval_steps_per_second": 7.238, "step": 1400 }, { "epoch": 0.6337932594435648, "grad_norm": 0.12781500816345215, "learning_rate": 9.97078532345765e-06, "loss": 1.2767, "step": 1401 }, { "epoch": 0.6342456457814974, "grad_norm": 0.1784905046224594, "learning_rate": 9.97070617470451e-06, "loss": 1.0653, "step": 1402 }, { "epoch": 0.63469803211943, "grad_norm": 0.16945256292819977, "learning_rate": 9.970626919195977e-06, "loss": 0.613, "step": 1403 }, { "epoch": 0.6351504184573626, "grad_norm": 0.18590886890888214, "learning_rate": 9.970547556933754e-06, "loss": 0.6753, "step": 1404 }, { "epoch": 0.6356028047952952, "grad_norm": 0.16516391932964325, "learning_rate": 9.970468087919546e-06, "loss": 0.5266, "step": 1405 }, { "epoch": 0.6360551911332277, "grad_norm": 0.20130391418933868, "learning_rate": 9.970388512155059e-06, "loss": 0.7644, "step": 1406 }, { "epoch": 0.6365075774711604, "grad_norm": 0.2103927582502365, "learning_rate": 9.970308829642e-06, "loss": 0.6258, "step": 1407 }, { "epoch": 0.6369599638090929, "grad_norm": 0.19788284599781036, "learning_rate": 9.970229040382087e-06, "loss": 0.62, "step": 1408 }, { "epoch": 0.6374123501470256, "grad_norm": 0.21132035553455353, "learning_rate": 9.970149144377025e-06, "loss": 0.6906, "step": 1409 }, { "epoch": 0.6378647364849581, "grad_norm": 0.19183321297168732, "learning_rate": 9.970069141628535e-06, "loss": 0.6001, "step": 1410 }, { "epoch": 0.6383171228228908, "grad_norm": 0.21881067752838135, "learning_rate": 9.969989032138335e-06, "loss": 0.7516, "step": 1411 }, { "epoch": 0.6387695091608233, "grad_norm": 0.2006121724843979, "learning_rate": 9.969908815908145e-06, "loss": 0.5288, "step": 1412 }, { "epoch": 0.639221895498756, "grad_norm": 0.1879040151834488, "learning_rate": 9.969828492939686e-06, "loss": 0.5308, "step": 1413 }, { "epoch": 0.6396742818366885, "grad_norm": 0.22798515856266022, "learning_rate": 9.969748063234686e-06, "loss": 0.6938, "step": 1414 }, { "epoch": 0.6401266681746212, "grad_norm": 0.27620819211006165, "learning_rate": 9.969667526794869e-06, "loss": 0.7825, "step": 1415 }, { "epoch": 0.6405790545125537, "grad_norm": 0.20858745276927948, "learning_rate": 9.969586883621967e-06, "loss": 0.6935, "step": 1416 }, { "epoch": 0.6410314408504864, "grad_norm": 0.22855260968208313, "learning_rate": 9.969506133717713e-06, "loss": 0.6328, "step": 1417 }, { "epoch": 0.6414838271884189, "grad_norm": 0.19117134809494019, "learning_rate": 9.96942527708384e-06, "loss": 0.5613, "step": 1418 }, { "epoch": 0.6419362135263516, "grad_norm": 0.24912790954113007, "learning_rate": 9.969344313722084e-06, "loss": 0.6817, "step": 1419 }, { "epoch": 0.6423885998642841, "grad_norm": 0.19380711019039154, "learning_rate": 9.969263243634184e-06, "loss": 0.5143, "step": 1420 }, { "epoch": 0.6428409862022167, "grad_norm": 0.21678103506565094, "learning_rate": 9.96918206682188e-06, "loss": 0.6045, "step": 1421 }, { "epoch": 0.6432933725401493, "grad_norm": 0.22685956954956055, "learning_rate": 9.969100783286919e-06, "loss": 0.5725, "step": 1422 }, { "epoch": 0.6437457588780818, "grad_norm": 0.21802279353141785, "learning_rate": 9.969019393031044e-06, "loss": 0.6103, "step": 1423 }, { "epoch": 0.6441981452160145, "grad_norm": 0.20622971653938293, "learning_rate": 9.968937896056002e-06, "loss": 0.5404, "step": 1424 }, { "epoch": 0.644650531553947, "grad_norm": 0.21049988269805908, "learning_rate": 9.968856292363547e-06, "loss": 0.5628, "step": 1425 }, { "epoch": 0.6451029178918797, "grad_norm": 0.211713969707489, "learning_rate": 9.968774581955429e-06, "loss": 0.5383, "step": 1426 }, { "epoch": 0.6455553042298122, "grad_norm": 0.2627991735935211, "learning_rate": 9.968692764833402e-06, "loss": 0.6911, "step": 1427 }, { "epoch": 0.6460076905677449, "grad_norm": 0.22718249261379242, "learning_rate": 9.968610840999224e-06, "loss": 0.5714, "step": 1428 }, { "epoch": 0.6464600769056774, "grad_norm": 0.2682175040245056, "learning_rate": 9.968528810454657e-06, "loss": 0.579, "step": 1429 }, { "epoch": 0.6469124632436101, "grad_norm": 0.2473694086074829, "learning_rate": 9.96844667320146e-06, "loss": 0.4946, "step": 1430 }, { "epoch": 0.6473648495815426, "grad_norm": 0.2554166316986084, "learning_rate": 9.968364429241399e-06, "loss": 0.7093, "step": 1431 }, { "epoch": 0.6478172359194753, "grad_norm": 0.25379547476768494, "learning_rate": 9.968282078576238e-06, "loss": 0.5423, "step": 1432 }, { "epoch": 0.6482696222574078, "grad_norm": 0.2722306251525879, "learning_rate": 9.968199621207746e-06, "loss": 0.6009, "step": 1433 }, { "epoch": 0.6487220085953405, "grad_norm": 0.3428208529949188, "learning_rate": 9.968117057137695e-06, "loss": 0.764, "step": 1434 }, { "epoch": 0.649174394933273, "grad_norm": 0.2513734996318817, "learning_rate": 9.968034386367858e-06, "loss": 0.562, "step": 1435 }, { "epoch": 0.6496267812712057, "grad_norm": 0.24584731459617615, "learning_rate": 9.967951608900011e-06, "loss": 0.5762, "step": 1436 }, { "epoch": 0.6500791676091382, "grad_norm": 0.28407949209213257, "learning_rate": 9.967868724735932e-06, "loss": 0.6255, "step": 1437 }, { "epoch": 0.6505315539470709, "grad_norm": 0.26947060227394104, "learning_rate": 9.9677857338774e-06, "loss": 0.5916, "step": 1438 }, { "epoch": 0.6509839402850034, "grad_norm": 0.32533249258995056, "learning_rate": 9.967702636326195e-06, "loss": 0.7039, "step": 1439 }, { "epoch": 0.6514363266229359, "grad_norm": 0.2623298168182373, "learning_rate": 9.967619432084108e-06, "loss": 0.547, "step": 1440 }, { "epoch": 0.6518887129608686, "grad_norm": 0.2908569276332855, "learning_rate": 9.967536121152919e-06, "loss": 0.5664, "step": 1441 }, { "epoch": 0.6523410992988011, "grad_norm": 0.290173202753067, "learning_rate": 9.967452703534423e-06, "loss": 0.523, "step": 1442 }, { "epoch": 0.6527934856367338, "grad_norm": 0.3624300956726074, "learning_rate": 9.967369179230409e-06, "loss": 0.7229, "step": 1443 }, { "epoch": 0.6532458719746663, "grad_norm": 0.2927631437778473, "learning_rate": 9.96728554824267e-06, "loss": 0.5275, "step": 1444 }, { "epoch": 0.653698258312599, "grad_norm": 0.3130362629890442, "learning_rate": 9.967201810573005e-06, "loss": 0.5904, "step": 1445 }, { "epoch": 0.6541506446505315, "grad_norm": 0.342825323343277, "learning_rate": 9.96711796622321e-06, "loss": 0.5535, "step": 1446 }, { "epoch": 0.6546030309884642, "grad_norm": 0.27661651372909546, "learning_rate": 9.967034015195084e-06, "loss": 0.5324, "step": 1447 }, { "epoch": 0.6550554173263967, "grad_norm": 0.3581916391849518, "learning_rate": 9.966949957490435e-06, "loss": 0.4824, "step": 1448 }, { "epoch": 0.6555078036643294, "grad_norm": 0.3718360960483551, "learning_rate": 9.966865793111064e-06, "loss": 0.6082, "step": 1449 }, { "epoch": 0.6559601900022619, "grad_norm": 0.49464353919029236, "learning_rate": 9.966781522058781e-06, "loss": 0.5718, "step": 1450 }, { "epoch": 0.6564125763401946, "grad_norm": 0.13165023922920227, "learning_rate": 9.966697144335396e-06, "loss": 1.3094, "step": 1451 }, { "epoch": 0.6568649626781271, "grad_norm": 0.1633816957473755, "learning_rate": 9.966612659942719e-06, "loss": 0.6337, "step": 1452 }, { "epoch": 0.6573173490160598, "grad_norm": 0.17204928398132324, "learning_rate": 9.966528068882566e-06, "loss": 0.723, "step": 1453 }, { "epoch": 0.6577697353539923, "grad_norm": 0.17680150270462036, "learning_rate": 9.966443371156753e-06, "loss": 0.5919, "step": 1454 }, { "epoch": 0.658222121691925, "grad_norm": 0.18131108582019806, "learning_rate": 9.9663585667671e-06, "loss": 0.5704, "step": 1455 }, { "epoch": 0.6586745080298575, "grad_norm": 0.16379058361053467, "learning_rate": 9.966273655715429e-06, "loss": 0.5554, "step": 1456 }, { "epoch": 0.65912689436779, "grad_norm": 0.17874999344348907, "learning_rate": 9.96618863800356e-06, "loss": 0.5645, "step": 1457 }, { "epoch": 0.6595792807057227, "grad_norm": 0.18498623371124268, "learning_rate": 9.966103513633323e-06, "loss": 0.653, "step": 1458 }, { "epoch": 0.6600316670436552, "grad_norm": 0.19966861605644226, "learning_rate": 9.966018282606544e-06, "loss": 0.6349, "step": 1459 }, { "epoch": 0.6604840533815879, "grad_norm": 0.20659005641937256, "learning_rate": 9.965932944925055e-06, "loss": 0.7087, "step": 1460 }, { "epoch": 0.6609364397195204, "grad_norm": 0.23438431322574615, "learning_rate": 9.965847500590686e-06, "loss": 0.8616, "step": 1461 }, { "epoch": 0.6613888260574531, "grad_norm": 0.17983561754226685, "learning_rate": 9.965761949605276e-06, "loss": 0.6661, "step": 1462 }, { "epoch": 0.6618412123953856, "grad_norm": 0.2139839082956314, "learning_rate": 9.965676291970659e-06, "loss": 0.7162, "step": 1463 }, { "epoch": 0.6622935987333183, "grad_norm": 0.19707627594470978, "learning_rate": 9.965590527688678e-06, "loss": 0.6156, "step": 1464 }, { "epoch": 0.6627459850712508, "grad_norm": 0.21852721273899078, "learning_rate": 9.96550465676117e-06, "loss": 0.7107, "step": 1465 }, { "epoch": 0.6631983714091835, "grad_norm": 0.21355625987052917, "learning_rate": 9.965418679189984e-06, "loss": 0.6956, "step": 1466 }, { "epoch": 0.663650757747116, "grad_norm": 0.20228691399097443, "learning_rate": 9.965332594976966e-06, "loss": 0.6121, "step": 1467 }, { "epoch": 0.6641031440850487, "grad_norm": 0.243849515914917, "learning_rate": 9.96524640412396e-06, "loss": 0.7175, "step": 1468 }, { "epoch": 0.6645555304229812, "grad_norm": 0.19063222408294678, "learning_rate": 9.965160106632825e-06, "loss": 0.5089, "step": 1469 }, { "epoch": 0.6650079167609139, "grad_norm": 0.20266565680503845, "learning_rate": 9.965073702505408e-06, "loss": 0.5889, "step": 1470 }, { "epoch": 0.6654603030988464, "grad_norm": 0.23219141364097595, "learning_rate": 9.964987191743566e-06, "loss": 0.6036, "step": 1471 }, { "epoch": 0.6659126894367791, "grad_norm": 0.22463856637477875, "learning_rate": 9.964900574349159e-06, "loss": 0.5764, "step": 1472 }, { "epoch": 0.6663650757747116, "grad_norm": 0.23452535271644592, "learning_rate": 9.964813850324045e-06, "loss": 0.6512, "step": 1473 }, { "epoch": 0.6668174621126441, "grad_norm": 0.20232711732387543, "learning_rate": 9.964727019670087e-06, "loss": 0.5375, "step": 1474 }, { "epoch": 0.6672698484505768, "grad_norm": 0.22294580936431885, "learning_rate": 9.964640082389152e-06, "loss": 0.6543, "step": 1475 }, { "epoch": 0.6677222347885093, "grad_norm": 0.21893228590488434, "learning_rate": 9.964553038483105e-06, "loss": 0.685, "step": 1476 }, { "epoch": 0.668174621126442, "grad_norm": 0.198084756731987, "learning_rate": 9.964465887953816e-06, "loss": 0.4832, "step": 1477 }, { "epoch": 0.6686270074643745, "grad_norm": 0.2310151606798172, "learning_rate": 9.964378630803156e-06, "loss": 0.6168, "step": 1478 }, { "epoch": 0.6690793938023072, "grad_norm": 0.2465490698814392, "learning_rate": 9.964291267033002e-06, "loss": 0.6913, "step": 1479 }, { "epoch": 0.6695317801402397, "grad_norm": 0.22040729224681854, "learning_rate": 9.964203796645226e-06, "loss": 0.5819, "step": 1480 }, { "epoch": 0.6699841664781724, "grad_norm": 0.25976935029029846, "learning_rate": 9.964116219641708e-06, "loss": 0.6493, "step": 1481 }, { "epoch": 0.6704365528161049, "grad_norm": 0.26164567470550537, "learning_rate": 9.964028536024331e-06, "loss": 0.6816, "step": 1482 }, { "epoch": 0.6708889391540376, "grad_norm": 0.22541174292564392, "learning_rate": 9.963940745794977e-06, "loss": 0.556, "step": 1483 }, { "epoch": 0.6713413254919701, "grad_norm": 0.2327873557806015, "learning_rate": 9.96385284895553e-06, "loss": 0.6112, "step": 1484 }, { "epoch": 0.6717937118299028, "grad_norm": 0.28650450706481934, "learning_rate": 9.96376484550788e-06, "loss": 0.6523, "step": 1485 }, { "epoch": 0.6722460981678353, "grad_norm": 0.22856512665748596, "learning_rate": 9.963676735453916e-06, "loss": 0.6632, "step": 1486 }, { "epoch": 0.672698484505768, "grad_norm": 0.2607954442501068, "learning_rate": 9.96358851879553e-06, "loss": 0.5978, "step": 1487 }, { "epoch": 0.6731508708437005, "grad_norm": 0.26961442828178406, "learning_rate": 9.963500195534617e-06, "loss": 0.5205, "step": 1488 }, { "epoch": 0.6736032571816332, "grad_norm": 0.2683515250682831, "learning_rate": 9.963411765673073e-06, "loss": 0.5916, "step": 1489 }, { "epoch": 0.6740556435195657, "grad_norm": 0.26239386200904846, "learning_rate": 9.963323229212799e-06, "loss": 0.5109, "step": 1490 }, { "epoch": 0.6745080298574984, "grad_norm": 0.2834457755088806, "learning_rate": 9.963234586155694e-06, "loss": 0.6141, "step": 1491 }, { "epoch": 0.6749604161954309, "grad_norm": 0.2652958333492279, "learning_rate": 9.963145836503665e-06, "loss": 0.4837, "step": 1492 }, { "epoch": 0.6754128025333634, "grad_norm": 0.2813172936439514, "learning_rate": 9.963056980258617e-06, "loss": 0.5362, "step": 1493 }, { "epoch": 0.6758651888712961, "grad_norm": 0.2737351059913635, "learning_rate": 9.962968017422456e-06, "loss": 0.5566, "step": 1494 }, { "epoch": 0.6763175752092286, "grad_norm": 0.31031593680381775, "learning_rate": 9.962878947997095e-06, "loss": 0.5939, "step": 1495 }, { "epoch": 0.6767699615471613, "grad_norm": 0.3104061186313629, "learning_rate": 9.962789771984446e-06, "loss": 0.6137, "step": 1496 }, { "epoch": 0.6772223478850938, "grad_norm": 0.3096833825111389, "learning_rate": 9.962700489386425e-06, "loss": 0.5918, "step": 1497 }, { "epoch": 0.6776747342230265, "grad_norm": 0.3311043083667755, "learning_rate": 9.962611100204948e-06, "loss": 0.5432, "step": 1498 }, { "epoch": 0.678127120560959, "grad_norm": 0.29286500811576843, "learning_rate": 9.962521604441938e-06, "loss": 0.4672, "step": 1499 }, { "epoch": 0.6785795068988917, "grad_norm": 0.5703890323638916, "learning_rate": 9.962432002099312e-06, "loss": 0.6633, "step": 1500 }, { "epoch": 0.6790318932368242, "grad_norm": 0.12970146536827087, "learning_rate": 9.962342293178999e-06, "loss": 1.3109, "step": 1501 }, { "epoch": 0.6794842795747569, "grad_norm": 0.19667939841747284, "learning_rate": 9.962252477682926e-06, "loss": 0.8072, "step": 1502 }, { "epoch": 0.6799366659126894, "grad_norm": 0.1973164975643158, "learning_rate": 9.962162555613017e-06, "loss": 0.6935, "step": 1503 }, { "epoch": 0.6803890522506221, "grad_norm": 0.18025067448616028, "learning_rate": 9.962072526971207e-06, "loss": 0.6405, "step": 1504 }, { "epoch": 0.6808414385885546, "grad_norm": 0.1932922601699829, "learning_rate": 9.961982391759428e-06, "loss": 0.6855, "step": 1505 }, { "epoch": 0.6812938249264873, "grad_norm": 0.1854274719953537, "learning_rate": 9.961892149979618e-06, "loss": 0.6454, "step": 1506 }, { "epoch": 0.6817462112644198, "grad_norm": 0.16071279346942902, "learning_rate": 9.961801801633712e-06, "loss": 0.6251, "step": 1507 }, { "epoch": 0.6821985976023525, "grad_norm": 0.20949943363666534, "learning_rate": 9.961711346723653e-06, "loss": 0.5596, "step": 1508 }, { "epoch": 0.682650983940285, "grad_norm": 0.18857984244823456, "learning_rate": 9.961620785251383e-06, "loss": 0.6347, "step": 1509 }, { "epoch": 0.6831033702782175, "grad_norm": 0.20849266648292542, "learning_rate": 9.961530117218848e-06, "loss": 0.6919, "step": 1510 }, { "epoch": 0.6835557566161502, "grad_norm": 0.2027835249900818, "learning_rate": 9.961439342627993e-06, "loss": 0.596, "step": 1511 }, { "epoch": 0.6840081429540827, "grad_norm": 0.2168523222208023, "learning_rate": 9.961348461480767e-06, "loss": 0.759, "step": 1512 }, { "epoch": 0.6844605292920154, "grad_norm": 0.18503113090991974, "learning_rate": 9.961257473779124e-06, "loss": 0.6474, "step": 1513 }, { "epoch": 0.6849129156299479, "grad_norm": 0.22063501179218292, "learning_rate": 9.961166379525019e-06, "loss": 0.6811, "step": 1514 }, { "epoch": 0.6853653019678806, "grad_norm": 0.17739875614643097, "learning_rate": 9.961075178720405e-06, "loss": 0.6499, "step": 1515 }, { "epoch": 0.6858176883058131, "grad_norm": 0.23599842190742493, "learning_rate": 9.960983871367245e-06, "loss": 0.7641, "step": 1516 }, { "epoch": 0.6862700746437458, "grad_norm": 0.18831245601177216, "learning_rate": 9.960892457467497e-06, "loss": 0.5909, "step": 1517 }, { "epoch": 0.6867224609816783, "grad_norm": 0.2229490578174591, "learning_rate": 9.960800937023123e-06, "loss": 0.6483, "step": 1518 }, { "epoch": 0.687174847319611, "grad_norm": 0.19945231080055237, "learning_rate": 9.960709310036092e-06, "loss": 0.5544, "step": 1519 }, { "epoch": 0.6876272336575435, "grad_norm": 0.24215635657310486, "learning_rate": 9.960617576508372e-06, "loss": 0.7707, "step": 1520 }, { "epoch": 0.6880796199954762, "grad_norm": 0.21398168802261353, "learning_rate": 9.96052573644193e-06, "loss": 0.557, "step": 1521 }, { "epoch": 0.6885320063334087, "grad_norm": 0.28259217739105225, "learning_rate": 9.960433789838739e-06, "loss": 0.6448, "step": 1522 }, { "epoch": 0.6889843926713414, "grad_norm": 0.2987956404685974, "learning_rate": 9.960341736700776e-06, "loss": 0.6894, "step": 1523 }, { "epoch": 0.6894367790092739, "grad_norm": 0.22914288938045502, "learning_rate": 9.960249577030017e-06, "loss": 0.6097, "step": 1524 }, { "epoch": 0.6898891653472066, "grad_norm": 0.2598280906677246, "learning_rate": 9.96015731082844e-06, "loss": 0.6113, "step": 1525 }, { "epoch": 0.6903415516851391, "grad_norm": 0.22624985873699188, "learning_rate": 9.960064938098028e-06, "loss": 0.5342, "step": 1526 }, { "epoch": 0.6907939380230717, "grad_norm": 0.25702613592147827, "learning_rate": 9.959972458840766e-06, "loss": 0.69, "step": 1527 }, { "epoch": 0.6912463243610043, "grad_norm": 0.224885031580925, "learning_rate": 9.959879873058636e-06, "loss": 0.5508, "step": 1528 }, { "epoch": 0.6916987106989368, "grad_norm": 0.22461828589439392, "learning_rate": 9.959787180753631e-06, "loss": 0.5535, "step": 1529 }, { "epoch": 0.6921510970368695, "grad_norm": 0.294699102640152, "learning_rate": 9.95969438192774e-06, "loss": 0.7313, "step": 1530 }, { "epoch": 0.692603483374802, "grad_norm": 0.25837376713752747, "learning_rate": 9.959601476582955e-06, "loss": 0.6171, "step": 1531 }, { "epoch": 0.6930558697127347, "grad_norm": 0.22843460738658905, "learning_rate": 9.959508464721273e-06, "loss": 0.5634, "step": 1532 }, { "epoch": 0.6935082560506672, "grad_norm": 0.23052377998828888, "learning_rate": 9.95941534634469e-06, "loss": 0.6047, "step": 1533 }, { "epoch": 0.6939606423885999, "grad_norm": 0.2515343427658081, "learning_rate": 9.959322121455209e-06, "loss": 0.5508, "step": 1534 }, { "epoch": 0.6944130287265324, "grad_norm": 0.30879703164100647, "learning_rate": 9.959228790054828e-06, "loss": 0.7161, "step": 1535 }, { "epoch": 0.6948654150644651, "grad_norm": 0.2680801749229431, "learning_rate": 9.959135352145552e-06, "loss": 0.5846, "step": 1536 }, { "epoch": 0.6953178014023976, "grad_norm": 0.27205389738082886, "learning_rate": 9.959041807729393e-06, "loss": 0.6117, "step": 1537 }, { "epoch": 0.6957701877403303, "grad_norm": 0.3231266438961029, "learning_rate": 9.958948156808352e-06, "loss": 0.6399, "step": 1538 }, { "epoch": 0.6962225740782628, "grad_norm": 0.25710493326187134, "learning_rate": 9.958854399384447e-06, "loss": 0.5322, "step": 1539 }, { "epoch": 0.6966749604161955, "grad_norm": 0.30191126465797424, "learning_rate": 9.95876053545969e-06, "loss": 0.6004, "step": 1540 }, { "epoch": 0.697127346754128, "grad_norm": 0.2880746126174927, "learning_rate": 9.958666565036094e-06, "loss": 0.5808, "step": 1541 }, { "epoch": 0.6975797330920607, "grad_norm": 0.29652947187423706, "learning_rate": 9.95857248811568e-06, "loss": 0.535, "step": 1542 }, { "epoch": 0.6980321194299932, "grad_norm": 0.2823028862476349, "learning_rate": 9.958478304700468e-06, "loss": 0.5514, "step": 1543 }, { "epoch": 0.6984845057679258, "grad_norm": 0.31182610988616943, "learning_rate": 9.95838401479248e-06, "loss": 0.5656, "step": 1544 }, { "epoch": 0.6989368921058584, "grad_norm": 0.35688045620918274, "learning_rate": 9.958289618393741e-06, "loss": 0.7065, "step": 1545 }, { "epoch": 0.699389278443791, "grad_norm": 0.3107914328575134, "learning_rate": 9.958195115506282e-06, "loss": 0.4938, "step": 1546 }, { "epoch": 0.6998416647817236, "grad_norm": 0.3479117155075073, "learning_rate": 9.958100506132127e-06, "loss": 0.5174, "step": 1547 }, { "epoch": 0.7002940511196561, "grad_norm": 0.38587549328804016, "learning_rate": 9.95800579027331e-06, "loss": 0.6519, "step": 1548 }, { "epoch": 0.7007464374575888, "grad_norm": 0.3958067297935486, "learning_rate": 9.957910967931868e-06, "loss": 0.5858, "step": 1549 }, { "epoch": 0.7011988237955213, "grad_norm": 0.47771894931793213, "learning_rate": 9.957816039109832e-06, "loss": 0.6056, "step": 1550 }, { "epoch": 0.701651210133454, "grad_norm": 0.13562895357608795, "learning_rate": 9.957721003809246e-06, "loss": 1.1174, "step": 1551 }, { "epoch": 0.7021035964713865, "grad_norm": 0.18771223723888397, "learning_rate": 9.957625862032148e-06, "loss": 0.9653, "step": 1552 }, { "epoch": 0.7025559828093192, "grad_norm": 0.1628502756357193, "learning_rate": 9.957530613780583e-06, "loss": 0.5857, "step": 1553 }, { "epoch": 0.7030083691472517, "grad_norm": 0.18682503700256348, "learning_rate": 9.957435259056595e-06, "loss": 0.7107, "step": 1554 }, { "epoch": 0.7034607554851844, "grad_norm": 0.184464231133461, "learning_rate": 9.957339797862236e-06, "loss": 0.6231, "step": 1555 }, { "epoch": 0.7039131418231169, "grad_norm": 0.18128493428230286, "learning_rate": 9.957244230199549e-06, "loss": 0.5761, "step": 1556 }, { "epoch": 0.7043655281610496, "grad_norm": 0.19868719577789307, "learning_rate": 9.957148556070592e-06, "loss": 0.6592, "step": 1557 }, { "epoch": 0.7048179144989821, "grad_norm": 0.20598994195461273, "learning_rate": 9.95705277547742e-06, "loss": 0.6079, "step": 1558 }, { "epoch": 0.7052703008369148, "grad_norm": 0.2097853571176529, "learning_rate": 9.956956888422087e-06, "loss": 0.5934, "step": 1559 }, { "epoch": 0.7057226871748473, "grad_norm": 0.21630696952342987, "learning_rate": 9.956860894906655e-06, "loss": 0.6553, "step": 1560 }, { "epoch": 0.7061750735127799, "grad_norm": 0.21743115782737732, "learning_rate": 9.956764794933183e-06, "loss": 0.6573, "step": 1561 }, { "epoch": 0.7066274598507125, "grad_norm": 0.23761504888534546, "learning_rate": 9.956668588503738e-06, "loss": 0.7234, "step": 1562 }, { "epoch": 0.7070798461886451, "grad_norm": 0.2085566520690918, "learning_rate": 9.956572275620383e-06, "loss": 0.5877, "step": 1563 }, { "epoch": 0.7075322325265777, "grad_norm": 0.21840453147888184, "learning_rate": 9.956475856285188e-06, "loss": 0.6576, "step": 1564 }, { "epoch": 0.7079846188645103, "grad_norm": 0.2328086942434311, "learning_rate": 9.956379330500226e-06, "loss": 0.6104, "step": 1565 }, { "epoch": 0.7084370052024429, "grad_norm": 0.19800584018230438, "learning_rate": 9.956282698267565e-06, "loss": 0.5276, "step": 1566 }, { "epoch": 0.7088893915403754, "grad_norm": 0.20637576282024384, "learning_rate": 9.956185959589286e-06, "loss": 0.5381, "step": 1567 }, { "epoch": 0.7093417778783081, "grad_norm": 0.23845794796943665, "learning_rate": 9.956089114467463e-06, "loss": 0.6496, "step": 1568 }, { "epoch": 0.7097941642162406, "grad_norm": 0.22436827421188354, "learning_rate": 9.955992162904177e-06, "loss": 0.7007, "step": 1569 }, { "epoch": 0.7102465505541733, "grad_norm": 0.266827791929245, "learning_rate": 9.955895104901509e-06, "loss": 0.7011, "step": 1570 }, { "epoch": 0.7106989368921058, "grad_norm": 0.22693301737308502, "learning_rate": 9.955797940461548e-06, "loss": 0.6245, "step": 1571 }, { "epoch": 0.7111513232300385, "grad_norm": 0.22077462077140808, "learning_rate": 9.955700669586374e-06, "loss": 0.6122, "step": 1572 }, { "epoch": 0.711603709567971, "grad_norm": 0.22682605683803558, "learning_rate": 9.95560329227808e-06, "loss": 0.518, "step": 1573 }, { "epoch": 0.7120560959059037, "grad_norm": 0.22904498875141144, "learning_rate": 9.95550580853876e-06, "loss": 0.5196, "step": 1574 }, { "epoch": 0.7125084822438362, "grad_norm": 0.2062453180551529, "learning_rate": 9.955408218370501e-06, "loss": 0.5972, "step": 1575 }, { "epoch": 0.7129608685817689, "grad_norm": 0.22884199023246765, "learning_rate": 9.955310521775402e-06, "loss": 0.617, "step": 1576 }, { "epoch": 0.7134132549197014, "grad_norm": 0.2100546658039093, "learning_rate": 9.955212718755564e-06, "loss": 0.5426, "step": 1577 }, { "epoch": 0.713865641257634, "grad_norm": 0.2296397089958191, "learning_rate": 9.955114809313085e-06, "loss": 0.639, "step": 1578 }, { "epoch": 0.7143180275955666, "grad_norm": 0.2525530159473419, "learning_rate": 9.955016793450067e-06, "loss": 0.5975, "step": 1579 }, { "epoch": 0.7147704139334992, "grad_norm": 0.24907056987285614, "learning_rate": 9.954918671168617e-06, "loss": 0.5841, "step": 1580 }, { "epoch": 0.7152228002714318, "grad_norm": 0.2610059380531311, "learning_rate": 9.954820442470841e-06, "loss": 0.5999, "step": 1581 }, { "epoch": 0.7156751866093644, "grad_norm": 0.24630360305309296, "learning_rate": 9.95472210735885e-06, "loss": 0.5559, "step": 1582 }, { "epoch": 0.716127572947297, "grad_norm": 0.24608369171619415, "learning_rate": 9.954623665834754e-06, "loss": 0.5439, "step": 1583 }, { "epoch": 0.7165799592852296, "grad_norm": 0.2487465888261795, "learning_rate": 9.954525117900667e-06, "loss": 0.5341, "step": 1584 }, { "epoch": 0.7170323456231622, "grad_norm": 0.2847626209259033, "learning_rate": 9.954426463558708e-06, "loss": 0.5509, "step": 1585 }, { "epoch": 0.7174847319610947, "grad_norm": 0.2389853596687317, "learning_rate": 9.954327702810995e-06, "loss": 0.5648, "step": 1586 }, { "epoch": 0.7179371182990274, "grad_norm": 0.2673209011554718, "learning_rate": 9.954228835659648e-06, "loss": 0.6454, "step": 1587 }, { "epoch": 0.7183895046369599, "grad_norm": 0.2694607377052307, "learning_rate": 9.95412986210679e-06, "loss": 0.6103, "step": 1588 }, { "epoch": 0.7188418909748926, "grad_norm": 0.3023824095726013, "learning_rate": 9.954030782154549e-06, "loss": 0.5958, "step": 1589 }, { "epoch": 0.7192942773128251, "grad_norm": 0.25972411036491394, "learning_rate": 9.953931595805052e-06, "loss": 0.52, "step": 1590 }, { "epoch": 0.7197466636507578, "grad_norm": 0.27051258087158203, "learning_rate": 9.953832303060427e-06, "loss": 0.559, "step": 1591 }, { "epoch": 0.7201990499886903, "grad_norm": 0.2823937237262726, "learning_rate": 9.95373290392281e-06, "loss": 0.5803, "step": 1592 }, { "epoch": 0.720651436326623, "grad_norm": 0.2761114835739136, "learning_rate": 9.953633398394333e-06, "loss": 0.5956, "step": 1593 }, { "epoch": 0.7211038226645555, "grad_norm": 0.285033255815506, "learning_rate": 9.953533786477135e-06, "loss": 0.5559, "step": 1594 }, { "epoch": 0.7215562090024882, "grad_norm": 0.25884848833084106, "learning_rate": 9.953434068173354e-06, "loss": 0.485, "step": 1595 }, { "epoch": 0.7220085953404207, "grad_norm": 0.3428118824958801, "learning_rate": 9.953334243485131e-06, "loss": 0.6829, "step": 1596 }, { "epoch": 0.7224609816783533, "grad_norm": 0.29653480648994446, "learning_rate": 9.953234312414612e-06, "loss": 0.544, "step": 1597 }, { "epoch": 0.7229133680162859, "grad_norm": 0.3016298711299896, "learning_rate": 9.953134274963943e-06, "loss": 0.5218, "step": 1598 }, { "epoch": 0.7233657543542185, "grad_norm": 0.32178837060928345, "learning_rate": 9.95303413113527e-06, "loss": 0.554, "step": 1599 }, { "epoch": 0.7238181406921511, "grad_norm": 0.4066911041736603, "learning_rate": 9.952933880930746e-06, "loss": 0.5414, "step": 1600 }, { "epoch": 0.7238181406921511, "eval_loss": 0.6235954761505127, "eval_runtime": 25.9988, "eval_samples_per_second": 28.617, "eval_steps_per_second": 7.154, "step": 1600 }, { "epoch": 0.7242705270300837, "grad_norm": 0.15288494527339935, "learning_rate": 9.952833524352525e-06, "loss": 1.2105, "step": 1601 }, { "epoch": 0.7247229133680163, "grad_norm": 0.15964891016483307, "learning_rate": 9.952733061402759e-06, "loss": 0.6537, "step": 1602 }, { "epoch": 0.7251752997059489, "grad_norm": 0.16633950173854828, "learning_rate": 9.952632492083609e-06, "loss": 0.6013, "step": 1603 }, { "epoch": 0.7256276860438815, "grad_norm": 0.1716659963130951, "learning_rate": 9.952531816397232e-06, "loss": 0.6488, "step": 1604 }, { "epoch": 0.726080072381814, "grad_norm": 0.16225874423980713, "learning_rate": 9.952431034345793e-06, "loss": 0.588, "step": 1605 }, { "epoch": 0.7265324587197467, "grad_norm": 0.1843012273311615, "learning_rate": 9.952330145931456e-06, "loss": 0.61, "step": 1606 }, { "epoch": 0.7269848450576792, "grad_norm": 0.20660296082496643, "learning_rate": 9.952229151156386e-06, "loss": 0.7434, "step": 1607 }, { "epoch": 0.7274372313956119, "grad_norm": 0.2156895250082016, "learning_rate": 9.952128050022753e-06, "loss": 0.832, "step": 1608 }, { "epoch": 0.7278896177335444, "grad_norm": 0.20777523517608643, "learning_rate": 9.952026842532727e-06, "loss": 0.7704, "step": 1609 }, { "epoch": 0.7283420040714771, "grad_norm": 0.2208341509103775, "learning_rate": 9.951925528688484e-06, "loss": 0.6538, "step": 1610 }, { "epoch": 0.7287943904094096, "grad_norm": 0.23429255187511444, "learning_rate": 9.951824108492199e-06, "loss": 0.6151, "step": 1611 }, { "epoch": 0.7292467767473423, "grad_norm": 0.20662781596183777, "learning_rate": 9.95172258194605e-06, "loss": 0.5808, "step": 1612 }, { "epoch": 0.7296991630852748, "grad_norm": 0.1864316612482071, "learning_rate": 9.951620949052219e-06, "loss": 0.4974, "step": 1613 }, { "epoch": 0.7301515494232074, "grad_norm": 0.21746104955673218, "learning_rate": 9.951519209812885e-06, "loss": 0.6867, "step": 1614 }, { "epoch": 0.73060393576114, "grad_norm": 0.20408713817596436, "learning_rate": 9.951417364230236e-06, "loss": 0.5819, "step": 1615 }, { "epoch": 0.7310563220990726, "grad_norm": 0.2263457328081131, "learning_rate": 9.951315412306459e-06, "loss": 0.6297, "step": 1616 }, { "epoch": 0.7315087084370052, "grad_norm": 0.2133704274892807, "learning_rate": 9.951213354043742e-06, "loss": 0.5784, "step": 1617 }, { "epoch": 0.7319610947749378, "grad_norm": 0.21275506913661957, "learning_rate": 9.95111118944428e-06, "loss": 0.5908, "step": 1618 }, { "epoch": 0.7324134811128704, "grad_norm": 0.2392861694097519, "learning_rate": 9.951008918510265e-06, "loss": 0.6075, "step": 1619 }, { "epoch": 0.732865867450803, "grad_norm": 0.19420816004276276, "learning_rate": 9.950906541243894e-06, "loss": 0.4772, "step": 1620 }, { "epoch": 0.7333182537887356, "grad_norm": 0.22469837963581085, "learning_rate": 9.950804057647366e-06, "loss": 0.6392, "step": 1621 }, { "epoch": 0.7337706401266681, "grad_norm": 0.25256726145744324, "learning_rate": 9.95070146772288e-06, "loss": 0.7059, "step": 1622 }, { "epoch": 0.7342230264646008, "grad_norm": 0.2163977324962616, "learning_rate": 9.950598771472642e-06, "loss": 0.4774, "step": 1623 }, { "epoch": 0.7346754128025333, "grad_norm": 0.20889319479465485, "learning_rate": 9.950495968898856e-06, "loss": 0.4331, "step": 1624 }, { "epoch": 0.735127799140466, "grad_norm": 0.21859392523765564, "learning_rate": 9.95039306000373e-06, "loss": 0.5481, "step": 1625 }, { "epoch": 0.7355801854783985, "grad_norm": 0.23939462006092072, "learning_rate": 9.950290044789474e-06, "loss": 0.617, "step": 1626 }, { "epoch": 0.7360325718163312, "grad_norm": 0.25535428524017334, "learning_rate": 9.950186923258303e-06, "loss": 0.5544, "step": 1627 }, { "epoch": 0.7364849581542637, "grad_norm": 0.26257047057151794, "learning_rate": 9.950083695412428e-06, "loss": 0.563, "step": 1628 }, { "epoch": 0.7369373444921964, "grad_norm": 0.2303607314825058, "learning_rate": 9.94998036125407e-06, "loss": 0.5832, "step": 1629 }, { "epoch": 0.7373897308301289, "grad_norm": 0.25626516342163086, "learning_rate": 9.949876920785445e-06, "loss": 0.6014, "step": 1630 }, { "epoch": 0.7378421171680615, "grad_norm": 0.25983482599258423, "learning_rate": 9.949773374008774e-06, "loss": 0.6373, "step": 1631 }, { "epoch": 0.7382945035059941, "grad_norm": 0.24730277061462402, "learning_rate": 9.949669720926283e-06, "loss": 0.6524, "step": 1632 }, { "epoch": 0.7387468898439267, "grad_norm": 0.2242055982351303, "learning_rate": 9.9495659615402e-06, "loss": 0.4929, "step": 1633 }, { "epoch": 0.7391992761818593, "grad_norm": 0.24964353442192078, "learning_rate": 9.949462095852748e-06, "loss": 0.5172, "step": 1634 }, { "epoch": 0.7396516625197919, "grad_norm": 0.260733425617218, "learning_rate": 9.949358123866163e-06, "loss": 0.5951, "step": 1635 }, { "epoch": 0.7401040488577245, "grad_norm": 0.30710190534591675, "learning_rate": 9.949254045582675e-06, "loss": 0.6301, "step": 1636 }, { "epoch": 0.7405564351956571, "grad_norm": 0.2558826506137848, "learning_rate": 9.94914986100452e-06, "loss": 0.5781, "step": 1637 }, { "epoch": 0.7410088215335897, "grad_norm": 0.27854108810424805, "learning_rate": 9.949045570133935e-06, "loss": 0.6295, "step": 1638 }, { "epoch": 0.7414612078715223, "grad_norm": 0.3134213387966156, "learning_rate": 9.948941172973162e-06, "loss": 0.7301, "step": 1639 }, { "epoch": 0.7419135942094549, "grad_norm": 0.270831435918808, "learning_rate": 9.948836669524439e-06, "loss": 0.5286, "step": 1640 }, { "epoch": 0.7423659805473874, "grad_norm": 0.2578321099281311, "learning_rate": 9.948732059790015e-06, "loss": 0.5717, "step": 1641 }, { "epoch": 0.7428183668853201, "grad_norm": 0.2575458586215973, "learning_rate": 9.948627343772135e-06, "loss": 0.5009, "step": 1642 }, { "epoch": 0.7432707532232526, "grad_norm": 0.3337371349334717, "learning_rate": 9.948522521473047e-06, "loss": 0.8081, "step": 1643 }, { "epoch": 0.7437231395611853, "grad_norm": 0.29258692264556885, "learning_rate": 9.948417592895004e-06, "loss": 0.5072, "step": 1644 }, { "epoch": 0.7441755258991178, "grad_norm": 0.265579491853714, "learning_rate": 9.948312558040258e-06, "loss": 0.487, "step": 1645 }, { "epoch": 0.7446279122370505, "grad_norm": 0.29218634963035583, "learning_rate": 9.948207416911065e-06, "loss": 0.4765, "step": 1646 }, { "epoch": 0.745080298574983, "grad_norm": 0.395523339509964, "learning_rate": 9.948102169509683e-06, "loss": 0.6253, "step": 1647 }, { "epoch": 0.7455326849129156, "grad_norm": 0.3411768674850464, "learning_rate": 9.947996815838371e-06, "loss": 0.5821, "step": 1648 }, { "epoch": 0.7459850712508482, "grad_norm": 0.3848118185997009, "learning_rate": 9.947891355899398e-06, "loss": 0.6116, "step": 1649 }, { "epoch": 0.7464374575887808, "grad_norm": 0.426139771938324, "learning_rate": 9.947785789695021e-06, "loss": 0.5857, "step": 1650 }, { "epoch": 0.7468898439267134, "grad_norm": 0.1363982856273651, "learning_rate": 9.947680117227512e-06, "loss": 0.8221, "step": 1651 }, { "epoch": 0.747342230264646, "grad_norm": 0.14546240866184235, "learning_rate": 9.947574338499138e-06, "loss": 0.6823, "step": 1652 }, { "epoch": 0.7477946166025786, "grad_norm": 0.16529789566993713, "learning_rate": 9.947468453512174e-06, "loss": 0.5842, "step": 1653 }, { "epoch": 0.7482470029405112, "grad_norm": 0.16751468181610107, "learning_rate": 9.94736246226889e-06, "loss": 0.7236, "step": 1654 }, { "epoch": 0.7486993892784438, "grad_norm": 0.19682320952415466, "learning_rate": 9.947256364771564e-06, "loss": 0.6375, "step": 1655 }, { "epoch": 0.7491517756163764, "grad_norm": 0.20790502429008484, "learning_rate": 9.947150161022478e-06, "loss": 0.5764, "step": 1656 }, { "epoch": 0.749604161954309, "grad_norm": 0.1716715544462204, "learning_rate": 9.947043851023908e-06, "loss": 0.5398, "step": 1657 }, { "epoch": 0.7500565482922416, "grad_norm": 0.2095862776041031, "learning_rate": 9.946937434778138e-06, "loss": 0.6384, "step": 1658 }, { "epoch": 0.7505089346301742, "grad_norm": 0.24105378985404968, "learning_rate": 9.946830912287457e-06, "loss": 0.6263, "step": 1659 }, { "epoch": 0.7509613209681067, "grad_norm": 0.1903180480003357, "learning_rate": 9.94672428355415e-06, "loss": 0.546, "step": 1660 }, { "epoch": 0.7514137073060394, "grad_norm": 0.19425669312477112, "learning_rate": 9.946617548580508e-06, "loss": 0.6276, "step": 1661 }, { "epoch": 0.7518660936439719, "grad_norm": 0.22319862246513367, "learning_rate": 9.946510707368821e-06, "loss": 0.8763, "step": 1662 }, { "epoch": 0.7523184799819046, "grad_norm": 0.20758512616157532, "learning_rate": 9.946403759921387e-06, "loss": 0.61, "step": 1663 }, { "epoch": 0.7527708663198371, "grad_norm": 0.2352094054222107, "learning_rate": 9.946296706240502e-06, "loss": 0.6479, "step": 1664 }, { "epoch": 0.7532232526577697, "grad_norm": 0.22369983792304993, "learning_rate": 9.946189546328461e-06, "loss": 0.6607, "step": 1665 }, { "epoch": 0.7536756389957023, "grad_norm": 0.24532990157604218, "learning_rate": 9.946082280187573e-06, "loss": 0.6118, "step": 1666 }, { "epoch": 0.7541280253336349, "grad_norm": 0.2284027338027954, "learning_rate": 9.945974907820136e-06, "loss": 0.6557, "step": 1667 }, { "epoch": 0.7545804116715675, "grad_norm": 0.2529098093509674, "learning_rate": 9.945867429228458e-06, "loss": 0.5261, "step": 1668 }, { "epoch": 0.7550327980095001, "grad_norm": 0.20749486982822418, "learning_rate": 9.945759844414848e-06, "loss": 0.5356, "step": 1669 }, { "epoch": 0.7554851843474327, "grad_norm": 0.2422136515378952, "learning_rate": 9.945652153381614e-06, "loss": 0.6592, "step": 1670 }, { "epoch": 0.7559375706853653, "grad_norm": 0.21793556213378906, "learning_rate": 9.945544356131071e-06, "loss": 0.6012, "step": 1671 }, { "epoch": 0.7563899570232979, "grad_norm": 0.24982388317584991, "learning_rate": 9.945436452665535e-06, "loss": 0.6567, "step": 1672 }, { "epoch": 0.7568423433612305, "grad_norm": 0.22550907731056213, "learning_rate": 9.945328442987321e-06, "loss": 0.5786, "step": 1673 }, { "epoch": 0.7572947296991631, "grad_norm": 0.21940527856349945, "learning_rate": 9.94522032709875e-06, "loss": 0.534, "step": 1674 }, { "epoch": 0.7577471160370957, "grad_norm": 0.22987709939479828, "learning_rate": 9.945112105002143e-06, "loss": 0.5709, "step": 1675 }, { "epoch": 0.7581995023750283, "grad_norm": 0.21057315170764923, "learning_rate": 9.945003776699828e-06, "loss": 0.5333, "step": 1676 }, { "epoch": 0.7586518887129609, "grad_norm": 0.23220348358154297, "learning_rate": 9.944895342194126e-06, "loss": 0.5746, "step": 1677 }, { "epoch": 0.7591042750508935, "grad_norm": 0.24556662142276764, "learning_rate": 9.944786801487369e-06, "loss": 0.5593, "step": 1678 }, { "epoch": 0.759556661388826, "grad_norm": 0.28183165192604065, "learning_rate": 9.944678154581888e-06, "loss": 0.6008, "step": 1679 }, { "epoch": 0.7600090477267587, "grad_norm": 0.2484947144985199, "learning_rate": 9.944569401480017e-06, "loss": 0.5218, "step": 1680 }, { "epoch": 0.7604614340646912, "grad_norm": 0.25118914246559143, "learning_rate": 9.944460542184089e-06, "loss": 0.502, "step": 1681 }, { "epoch": 0.7609138204026238, "grad_norm": 0.2835737466812134, "learning_rate": 9.944351576696445e-06, "loss": 0.6869, "step": 1682 }, { "epoch": 0.7613662067405564, "grad_norm": 0.28277990221977234, "learning_rate": 9.944242505019424e-06, "loss": 0.5903, "step": 1683 }, { "epoch": 0.761818593078489, "grad_norm": 0.26259690523147583, "learning_rate": 9.944133327155367e-06, "loss": 0.5768, "step": 1684 }, { "epoch": 0.7622709794164216, "grad_norm": 0.2569063901901245, "learning_rate": 9.944024043106622e-06, "loss": 0.5409, "step": 1685 }, { "epoch": 0.7627233657543542, "grad_norm": 0.23281346261501312, "learning_rate": 9.943914652875533e-06, "loss": 0.477, "step": 1686 }, { "epoch": 0.7631757520922868, "grad_norm": 0.25558096170425415, "learning_rate": 9.943805156464451e-06, "loss": 0.4969, "step": 1687 }, { "epoch": 0.7636281384302194, "grad_norm": 0.2532190680503845, "learning_rate": 9.943695553875728e-06, "loss": 0.4868, "step": 1688 }, { "epoch": 0.764080524768152, "grad_norm": 0.27512529492378235, "learning_rate": 9.943585845111715e-06, "loss": 0.6746, "step": 1689 }, { "epoch": 0.7645329111060846, "grad_norm": 0.25205036997795105, "learning_rate": 9.943476030174773e-06, "loss": 0.507, "step": 1690 }, { "epoch": 0.7649852974440172, "grad_norm": 0.2911202907562256, "learning_rate": 9.943366109067258e-06, "loss": 0.502, "step": 1691 }, { "epoch": 0.7654376837819498, "grad_norm": 0.2909681797027588, "learning_rate": 9.94325608179153e-06, "loss": 0.6074, "step": 1692 }, { "epoch": 0.7658900701198824, "grad_norm": 0.26536068320274353, "learning_rate": 9.943145948349952e-06, "loss": 0.5294, "step": 1693 }, { "epoch": 0.766342456457815, "grad_norm": 0.2843252122402191, "learning_rate": 9.94303570874489e-06, "loss": 0.5504, "step": 1694 }, { "epoch": 0.7667948427957476, "grad_norm": 0.3059065639972687, "learning_rate": 9.942925362978713e-06, "loss": 0.5365, "step": 1695 }, { "epoch": 0.7672472291336802, "grad_norm": 0.35503271222114563, "learning_rate": 9.94281491105379e-06, "loss": 0.6205, "step": 1696 }, { "epoch": 0.7676996154716128, "grad_norm": 0.35822007060050964, "learning_rate": 9.942704352972491e-06, "loss": 0.553, "step": 1697 }, { "epoch": 0.7681520018095453, "grad_norm": 0.34567561745643616, "learning_rate": 9.942593688737193e-06, "loss": 0.5719, "step": 1698 }, { "epoch": 0.768604388147478, "grad_norm": 0.33098071813583374, "learning_rate": 9.942482918350273e-06, "loss": 0.475, "step": 1699 }, { "epoch": 0.7690567744854105, "grad_norm": 0.501473605632782, "learning_rate": 9.942372041814108e-06, "loss": 0.6434, "step": 1700 }, { "epoch": 0.7695091608233431, "grad_norm": 0.12033925950527191, "learning_rate": 9.94226105913108e-06, "loss": 1.1892, "step": 1701 }, { "epoch": 0.7699615471612757, "grad_norm": 0.14211423695087433, "learning_rate": 9.942149970303571e-06, "loss": 0.8347, "step": 1702 }, { "epoch": 0.7704139334992083, "grad_norm": 0.1712404042482376, "learning_rate": 9.942038775333971e-06, "loss": 0.6157, "step": 1703 }, { "epoch": 0.7708663198371409, "grad_norm": 0.19043201208114624, "learning_rate": 9.941927474224666e-06, "loss": 0.7242, "step": 1704 }, { "epoch": 0.7713187061750735, "grad_norm": 0.1764277219772339, "learning_rate": 9.941816066978045e-06, "loss": 0.582, "step": 1705 }, { "epoch": 0.7717710925130061, "grad_norm": 0.23447130620479584, "learning_rate": 9.941704553596503e-06, "loss": 0.8106, "step": 1706 }, { "epoch": 0.7722234788509387, "grad_norm": 0.18271654844284058, "learning_rate": 9.941592934082433e-06, "loss": 0.5997, "step": 1707 }, { "epoch": 0.7726758651888713, "grad_norm": 0.19217105209827423, "learning_rate": 9.941481208438233e-06, "loss": 0.6135, "step": 1708 }, { "epoch": 0.7731282515268039, "grad_norm": 0.17174877226352692, "learning_rate": 9.941369376666302e-06, "loss": 0.5806, "step": 1709 }, { "epoch": 0.7735806378647365, "grad_norm": 0.1956852376461029, "learning_rate": 9.941257438769043e-06, "loss": 0.6382, "step": 1710 }, { "epoch": 0.7740330242026691, "grad_norm": 0.1872003674507141, "learning_rate": 9.941145394748858e-06, "loss": 0.5535, "step": 1711 }, { "epoch": 0.7744854105406017, "grad_norm": 0.22292013466358185, "learning_rate": 9.941033244608159e-06, "loss": 0.5677, "step": 1712 }, { "epoch": 0.7749377968785343, "grad_norm": 0.23871943354606628, "learning_rate": 9.940920988349345e-06, "loss": 0.6404, "step": 1713 }, { "epoch": 0.7753901832164669, "grad_norm": 0.23612575232982635, "learning_rate": 9.940808625974836e-06, "loss": 0.7288, "step": 1714 }, { "epoch": 0.7758425695543995, "grad_norm": 0.22347278892993927, "learning_rate": 9.94069615748704e-06, "loss": 0.6573, "step": 1715 }, { "epoch": 0.7762949558923321, "grad_norm": 0.223484069108963, "learning_rate": 9.940583582888375e-06, "loss": 0.5799, "step": 1716 }, { "epoch": 0.7767473422302646, "grad_norm": 0.220592200756073, "learning_rate": 9.940470902181259e-06, "loss": 0.6699, "step": 1717 }, { "epoch": 0.7771997285681972, "grad_norm": 0.2178410440683365, "learning_rate": 9.940358115368107e-06, "loss": 0.5533, "step": 1718 }, { "epoch": 0.7776521149061298, "grad_norm": 0.20564156770706177, "learning_rate": 9.940245222451348e-06, "loss": 0.5251, "step": 1719 }, { "epoch": 0.7781045012440624, "grad_norm": 0.24992993474006653, "learning_rate": 9.940132223433403e-06, "loss": 0.5995, "step": 1720 }, { "epoch": 0.778556887581995, "grad_norm": 0.23258136212825775, "learning_rate": 9.9400191183167e-06, "loss": 0.5437, "step": 1721 }, { "epoch": 0.7790092739199276, "grad_norm": 0.2127462476491928, "learning_rate": 9.939905907103667e-06, "loss": 0.6014, "step": 1722 }, { "epoch": 0.7794616602578602, "grad_norm": 0.23902319371700287, "learning_rate": 9.939792589796736e-06, "loss": 0.7079, "step": 1723 }, { "epoch": 0.7799140465957928, "grad_norm": 0.2392314225435257, "learning_rate": 9.939679166398342e-06, "loss": 0.6353, "step": 1724 }, { "epoch": 0.7803664329337254, "grad_norm": 0.25415974855422974, "learning_rate": 9.939565636910919e-06, "loss": 0.5362, "step": 1725 }, { "epoch": 0.780818819271658, "grad_norm": 0.22405123710632324, "learning_rate": 9.939452001336907e-06, "loss": 0.4899, "step": 1726 }, { "epoch": 0.7812712056095906, "grad_norm": 0.2475266009569168, "learning_rate": 9.939338259678743e-06, "loss": 0.6631, "step": 1727 }, { "epoch": 0.7817235919475232, "grad_norm": 0.29091876745224, "learning_rate": 9.939224411938875e-06, "loss": 0.5869, "step": 1728 }, { "epoch": 0.7821759782854558, "grad_norm": 0.24252551794052124, "learning_rate": 9.939110458119745e-06, "loss": 0.565, "step": 1729 }, { "epoch": 0.7826283646233884, "grad_norm": 0.26621562242507935, "learning_rate": 9.938996398223802e-06, "loss": 0.6336, "step": 1730 }, { "epoch": 0.783080750961321, "grad_norm": 0.2874215543270111, "learning_rate": 9.938882232253493e-06, "loss": 0.5893, "step": 1731 }, { "epoch": 0.7835331372992536, "grad_norm": 0.2516578137874603, "learning_rate": 9.938767960211272e-06, "loss": 0.679, "step": 1732 }, { "epoch": 0.7839855236371862, "grad_norm": 0.257149338722229, "learning_rate": 9.938653582099593e-06, "loss": 0.6618, "step": 1733 }, { "epoch": 0.7844379099751188, "grad_norm": 0.22793017327785492, "learning_rate": 9.938539097920912e-06, "loss": 0.5202, "step": 1734 }, { "epoch": 0.7848902963130513, "grad_norm": 0.2576063573360443, "learning_rate": 9.938424507677688e-06, "loss": 0.5827, "step": 1735 }, { "epoch": 0.785342682650984, "grad_norm": 0.2670828104019165, "learning_rate": 9.938309811372381e-06, "loss": 0.5515, "step": 1736 }, { "epoch": 0.7857950689889165, "grad_norm": 0.29944750666618347, "learning_rate": 9.938195009007457e-06, "loss": 0.5247, "step": 1737 }, { "epoch": 0.7862474553268491, "grad_norm": 0.22949376702308655, "learning_rate": 9.93808010058538e-06, "loss": 0.4811, "step": 1738 }, { "epoch": 0.7866998416647817, "grad_norm": 0.29750487208366394, "learning_rate": 9.937965086108617e-06, "loss": 0.6374, "step": 1739 }, { "epoch": 0.7871522280027143, "grad_norm": 0.29284828901290894, "learning_rate": 9.937849965579639e-06, "loss": 0.5454, "step": 1740 }, { "epoch": 0.7876046143406469, "grad_norm": 0.32330185174942017, "learning_rate": 9.93773473900092e-06, "loss": 0.7081, "step": 1741 }, { "epoch": 0.7880570006785795, "grad_norm": 0.3033904731273651, "learning_rate": 9.93761940637493e-06, "loss": 0.5848, "step": 1742 }, { "epoch": 0.7885093870165121, "grad_norm": 0.3161073625087738, "learning_rate": 9.937503967704152e-06, "loss": 0.5305, "step": 1743 }, { "epoch": 0.7889617733544447, "grad_norm": 0.30921483039855957, "learning_rate": 9.93738842299106e-06, "loss": 0.5568, "step": 1744 }, { "epoch": 0.7894141596923773, "grad_norm": 0.37041863799095154, "learning_rate": 9.93727277223814e-06, "loss": 0.6582, "step": 1745 }, { "epoch": 0.7898665460303099, "grad_norm": 0.2798556089401245, "learning_rate": 9.937157015447873e-06, "loss": 0.5184, "step": 1746 }, { "epoch": 0.7903189323682425, "grad_norm": 0.3256586194038391, "learning_rate": 9.937041152622745e-06, "loss": 0.5438, "step": 1747 }, { "epoch": 0.7907713187061751, "grad_norm": 0.336882084608078, "learning_rate": 9.936925183765248e-06, "loss": 0.5889, "step": 1748 }, { "epoch": 0.7912237050441077, "grad_norm": 0.3260588049888611, "learning_rate": 9.936809108877867e-06, "loss": 0.5165, "step": 1749 }, { "epoch": 0.7916760913820403, "grad_norm": 0.3883899748325348, "learning_rate": 9.936692927963099e-06, "loss": 0.4976, "step": 1750 }, { "epoch": 0.7921284777199729, "grad_norm": 0.12935200333595276, "learning_rate": 9.936576641023437e-06, "loss": 1.373, "step": 1751 }, { "epoch": 0.7925808640579054, "grad_norm": 0.1870253086090088, "learning_rate": 9.936460248061379e-06, "loss": 0.7533, "step": 1752 }, { "epoch": 0.793033250395838, "grad_norm": 0.18276508152484894, "learning_rate": 9.936343749079427e-06, "loss": 0.6478, "step": 1753 }, { "epoch": 0.7934856367337706, "grad_norm": 0.17728383839130402, "learning_rate": 9.936227144080081e-06, "loss": 0.6961, "step": 1754 }, { "epoch": 0.7939380230717032, "grad_norm": 0.20370320975780487, "learning_rate": 9.936110433065845e-06, "loss": 0.5838, "step": 1755 }, { "epoch": 0.7943904094096358, "grad_norm": 0.1835714727640152, "learning_rate": 9.935993616039226e-06, "loss": 0.5443, "step": 1756 }, { "epoch": 0.7948427957475684, "grad_norm": 0.1893799602985382, "learning_rate": 9.935876693002735e-06, "loss": 0.5901, "step": 1757 }, { "epoch": 0.795295182085501, "grad_norm": 0.1865258663892746, "learning_rate": 9.93575966395888e-06, "loss": 0.5539, "step": 1758 }, { "epoch": 0.7957475684234336, "grad_norm": 0.20645487308502197, "learning_rate": 9.935642528910174e-06, "loss": 0.6193, "step": 1759 }, { "epoch": 0.7961999547613662, "grad_norm": 0.2134142965078354, "learning_rate": 9.935525287859136e-06, "loss": 0.6412, "step": 1760 }, { "epoch": 0.7966523410992988, "grad_norm": 0.26890528202056885, "learning_rate": 9.935407940808282e-06, "loss": 0.8344, "step": 1761 }, { "epoch": 0.7971047274372314, "grad_norm": 0.22130870819091797, "learning_rate": 9.935290487760134e-06, "loss": 0.6741, "step": 1762 }, { "epoch": 0.797557113775164, "grad_norm": 0.2216442972421646, "learning_rate": 9.935172928717211e-06, "loss": 0.7434, "step": 1763 }, { "epoch": 0.7980095001130966, "grad_norm": 0.24702656269073486, "learning_rate": 9.93505526368204e-06, "loss": 0.6814, "step": 1764 }, { "epoch": 0.7984618864510292, "grad_norm": 0.19701601564884186, "learning_rate": 9.934937492657149e-06, "loss": 0.5603, "step": 1765 }, { "epoch": 0.7989142727889618, "grad_norm": 0.21520377695560455, "learning_rate": 9.934819615645066e-06, "loss": 0.5534, "step": 1766 }, { "epoch": 0.7993666591268944, "grad_norm": 0.2312941998243332, "learning_rate": 9.934701632648323e-06, "loss": 0.5767, "step": 1767 }, { "epoch": 0.799819045464827, "grad_norm": 0.22578534483909607, "learning_rate": 9.934583543669454e-06, "loss": 0.5912, "step": 1768 }, { "epoch": 0.8002714318027595, "grad_norm": 0.21463295817375183, "learning_rate": 9.934465348710995e-06, "loss": 0.5632, "step": 1769 }, { "epoch": 0.8007238181406922, "grad_norm": 0.24886886775493622, "learning_rate": 9.934347047775485e-06, "loss": 0.7035, "step": 1770 }, { "epoch": 0.8011762044786247, "grad_norm": 0.2164354771375656, "learning_rate": 9.934228640865464e-06, "loss": 0.5819, "step": 1771 }, { "epoch": 0.8016285908165574, "grad_norm": 0.23827780783176422, "learning_rate": 9.934110127983474e-06, "loss": 0.5211, "step": 1772 }, { "epoch": 0.8020809771544899, "grad_norm": 0.22380371391773224, "learning_rate": 9.933991509132062e-06, "loss": 0.598, "step": 1773 }, { "epoch": 0.8025333634924225, "grad_norm": 0.24795591831207275, "learning_rate": 9.933872784313776e-06, "loss": 0.66, "step": 1774 }, { "epoch": 0.8029857498303551, "grad_norm": 0.25062283873558044, "learning_rate": 9.933753953531165e-06, "loss": 0.6163, "step": 1775 }, { "epoch": 0.8034381361682877, "grad_norm": 0.24823713302612305, "learning_rate": 9.93363501678678e-06, "loss": 0.5811, "step": 1776 }, { "epoch": 0.8038905225062203, "grad_norm": 0.2735205590724945, "learning_rate": 9.933515974083178e-06, "loss": 0.6202, "step": 1777 }, { "epoch": 0.8043429088441529, "grad_norm": 0.2668386697769165, "learning_rate": 9.933396825422914e-06, "loss": 0.6114, "step": 1778 }, { "epoch": 0.8047952951820855, "grad_norm": 0.2797871530056, "learning_rate": 9.933277570808546e-06, "loss": 0.5994, "step": 1779 }, { "epoch": 0.8052476815200181, "grad_norm": 0.22260968387126923, "learning_rate": 9.933158210242636e-06, "loss": 0.5109, "step": 1780 }, { "epoch": 0.8057000678579507, "grad_norm": 0.26218658685684204, "learning_rate": 9.933038743727749e-06, "loss": 0.5913, "step": 1781 }, { "epoch": 0.8061524541958833, "grad_norm": 0.23676937818527222, "learning_rate": 9.932919171266448e-06, "loss": 0.5916, "step": 1782 }, { "epoch": 0.8066048405338159, "grad_norm": 0.2380950003862381, "learning_rate": 9.932799492861304e-06, "loss": 0.5508, "step": 1783 }, { "epoch": 0.8070572268717485, "grad_norm": 0.2769770920276642, "learning_rate": 9.932679708514886e-06, "loss": 0.6447, "step": 1784 }, { "epoch": 0.8075096132096811, "grad_norm": 0.2518368661403656, "learning_rate": 9.932559818229766e-06, "loss": 0.5038, "step": 1785 }, { "epoch": 0.8079619995476136, "grad_norm": 0.27244627475738525, "learning_rate": 9.932439822008521e-06, "loss": 0.5817, "step": 1786 }, { "epoch": 0.8084143858855463, "grad_norm": 0.2837880253791809, "learning_rate": 9.932319719853725e-06, "loss": 0.5964, "step": 1787 }, { "epoch": 0.8088667722234788, "grad_norm": 0.25563743710517883, "learning_rate": 9.932199511767959e-06, "loss": 0.5217, "step": 1788 }, { "epoch": 0.8093191585614115, "grad_norm": 0.2509622871875763, "learning_rate": 9.932079197753806e-06, "loss": 0.5473, "step": 1789 }, { "epoch": 0.809771544899344, "grad_norm": 0.2932531237602234, "learning_rate": 9.931958777813846e-06, "loss": 0.6059, "step": 1790 }, { "epoch": 0.8102239312372767, "grad_norm": 0.29652878642082214, "learning_rate": 9.93183825195067e-06, "loss": 0.5056, "step": 1791 }, { "epoch": 0.8106763175752092, "grad_norm": 0.2990384101867676, "learning_rate": 9.931717620166865e-06, "loss": 0.5579, "step": 1792 }, { "epoch": 0.8111287039131418, "grad_norm": 0.29636847972869873, "learning_rate": 9.93159688246502e-06, "loss": 0.553, "step": 1793 }, { "epoch": 0.8115810902510744, "grad_norm": 0.28196173906326294, "learning_rate": 9.931476038847729e-06, "loss": 0.5565, "step": 1794 }, { "epoch": 0.812033476589007, "grad_norm": 0.3025619089603424, "learning_rate": 9.931355089317588e-06, "loss": 0.5671, "step": 1795 }, { "epoch": 0.8124858629269396, "grad_norm": 0.3801002502441406, "learning_rate": 9.931234033877195e-06, "loss": 0.6119, "step": 1796 }, { "epoch": 0.8129382492648722, "grad_norm": 0.3074089586734772, "learning_rate": 9.931112872529147e-06, "loss": 0.485, "step": 1797 }, { "epoch": 0.8133906356028048, "grad_norm": 0.35943618416786194, "learning_rate": 9.93099160527605e-06, "loss": 0.5519, "step": 1798 }, { "epoch": 0.8138430219407374, "grad_norm": 0.3916253447532654, "learning_rate": 9.930870232120507e-06, "loss": 0.5738, "step": 1799 }, { "epoch": 0.81429540827867, "grad_norm": 0.4543122351169586, "learning_rate": 9.930748753065126e-06, "loss": 0.5184, "step": 1800 }, { "epoch": 0.81429540827867, "eval_loss": 0.6194272637367249, "eval_runtime": 25.8732, "eval_samples_per_second": 28.756, "eval_steps_per_second": 7.189, "step": 1800 }, { "epoch": 0.8147477946166026, "grad_norm": 0.13915984332561493, "learning_rate": 9.930627168112513e-06, "loss": 0.9707, "step": 1801 }, { "epoch": 0.8152001809545352, "grad_norm": 0.17266234755516052, "learning_rate": 9.93050547726528e-06, "loss": 0.7665, "step": 1802 }, { "epoch": 0.8156525672924678, "grad_norm": 0.18375490605831146, "learning_rate": 9.93038368052604e-06, "loss": 0.6733, "step": 1803 }, { "epoch": 0.8161049536304004, "grad_norm": 0.16439193487167358, "learning_rate": 9.930261777897414e-06, "loss": 0.4537, "step": 1804 }, { "epoch": 0.8165573399683329, "grad_norm": 0.19785697758197784, "learning_rate": 9.930139769382012e-06, "loss": 0.6541, "step": 1805 }, { "epoch": 0.8170097263062656, "grad_norm": 0.2025686800479889, "learning_rate": 9.930017654982463e-06, "loss": 0.7008, "step": 1806 }, { "epoch": 0.8174621126441981, "grad_norm": 0.2026493102312088, "learning_rate": 9.929895434701382e-06, "loss": 0.5535, "step": 1807 }, { "epoch": 0.8179144989821308, "grad_norm": 0.207488551735878, "learning_rate": 9.929773108541398e-06, "loss": 0.5986, "step": 1808 }, { "epoch": 0.8183668853200633, "grad_norm": 0.24539531767368317, "learning_rate": 9.929650676505138e-06, "loss": 0.6426, "step": 1809 }, { "epoch": 0.818819271657996, "grad_norm": 0.20641428232192993, "learning_rate": 9.929528138595231e-06, "loss": 0.5806, "step": 1810 }, { "epoch": 0.8192716579959285, "grad_norm": 0.23191913962364197, "learning_rate": 9.929405494814305e-06, "loss": 0.6765, "step": 1811 }, { "epoch": 0.8197240443338611, "grad_norm": 0.21625791490077972, "learning_rate": 9.929282745165002e-06, "loss": 0.5799, "step": 1812 }, { "epoch": 0.8201764306717937, "grad_norm": 0.2564154267311096, "learning_rate": 9.929159889649952e-06, "loss": 0.7085, "step": 1813 }, { "epoch": 0.8206288170097263, "grad_norm": 0.23455168306827545, "learning_rate": 9.929036928271795e-06, "loss": 0.5552, "step": 1814 }, { "epoch": 0.8210812033476589, "grad_norm": 0.22413752973079681, "learning_rate": 9.928913861033173e-06, "loss": 0.6097, "step": 1815 }, { "epoch": 0.8215335896855915, "grad_norm": 0.2494051456451416, "learning_rate": 9.928790687936727e-06, "loss": 0.8345, "step": 1816 }, { "epoch": 0.8219859760235241, "grad_norm": 0.20581196248531342, "learning_rate": 9.928667408985107e-06, "loss": 0.5829, "step": 1817 }, { "epoch": 0.8224383623614567, "grad_norm": 0.23526811599731445, "learning_rate": 9.928544024180955e-06, "loss": 0.576, "step": 1818 }, { "epoch": 0.8228907486993893, "grad_norm": 0.2380758672952652, "learning_rate": 9.928420533526922e-06, "loss": 0.5893, "step": 1819 }, { "epoch": 0.8233431350373219, "grad_norm": 0.23627948760986328, "learning_rate": 9.928296937025663e-06, "loss": 0.5056, "step": 1820 }, { "epoch": 0.8237955213752545, "grad_norm": 0.2326301485300064, "learning_rate": 9.92817323467983e-06, "loss": 0.5697, "step": 1821 }, { "epoch": 0.824247907713187, "grad_norm": 0.2405351996421814, "learning_rate": 9.928049426492081e-06, "loss": 0.6717, "step": 1822 }, { "epoch": 0.8247002940511197, "grad_norm": 0.2499564290046692, "learning_rate": 9.927925512465076e-06, "loss": 0.7125, "step": 1823 }, { "epoch": 0.8251526803890522, "grad_norm": 0.2690967917442322, "learning_rate": 9.927801492601475e-06, "loss": 0.6069, "step": 1824 }, { "epoch": 0.8256050667269849, "grad_norm": 0.22132965922355652, "learning_rate": 9.92767736690394e-06, "loss": 0.5557, "step": 1825 }, { "epoch": 0.8260574530649174, "grad_norm": 0.2699069678783417, "learning_rate": 9.92755313537514e-06, "loss": 0.5738, "step": 1826 }, { "epoch": 0.82650983940285, "grad_norm": 0.2270096093416214, "learning_rate": 9.927428798017738e-06, "loss": 0.5645, "step": 1827 }, { "epoch": 0.8269622257407826, "grad_norm": 0.25942108035087585, "learning_rate": 9.92730435483441e-06, "loss": 0.5922, "step": 1828 }, { "epoch": 0.8274146120787153, "grad_norm": 0.28886353969573975, "learning_rate": 9.927179805827826e-06, "loss": 0.6411, "step": 1829 }, { "epoch": 0.8278669984166478, "grad_norm": 0.2677817642688751, "learning_rate": 9.927055151000664e-06, "loss": 0.5362, "step": 1830 }, { "epoch": 0.8283193847545804, "grad_norm": 0.2717393636703491, "learning_rate": 9.926930390355596e-06, "loss": 0.6816, "step": 1831 }, { "epoch": 0.828771771092513, "grad_norm": 0.3193564713001251, "learning_rate": 9.926805523895306e-06, "loss": 0.5295, "step": 1832 }, { "epoch": 0.8292241574304456, "grad_norm": 0.25322455167770386, "learning_rate": 9.926680551622471e-06, "loss": 0.5781, "step": 1833 }, { "epoch": 0.8296765437683782, "grad_norm": 0.2768172323703766, "learning_rate": 9.92655547353978e-06, "loss": 0.5513, "step": 1834 }, { "epoch": 0.8301289301063108, "grad_norm": 0.2808765470981598, "learning_rate": 9.926430289649917e-06, "loss": 0.5674, "step": 1835 }, { "epoch": 0.8305813164442434, "grad_norm": 0.2792631685733795, "learning_rate": 9.926304999955572e-06, "loss": 0.5924, "step": 1836 }, { "epoch": 0.831033702782176, "grad_norm": 0.23276881873607635, "learning_rate": 9.926179604459432e-06, "loss": 0.4116, "step": 1837 }, { "epoch": 0.8314860891201086, "grad_norm": 0.269186794757843, "learning_rate": 9.926054103164195e-06, "loss": 0.5598, "step": 1838 }, { "epoch": 0.8319384754580411, "grad_norm": 0.28948506712913513, "learning_rate": 9.925928496072553e-06, "loss": 0.5556, "step": 1839 }, { "epoch": 0.8323908617959738, "grad_norm": 0.26094287633895874, "learning_rate": 9.925802783187203e-06, "loss": 0.5138, "step": 1840 }, { "epoch": 0.8328432481339063, "grad_norm": 0.30145028233528137, "learning_rate": 9.92567696451085e-06, "loss": 0.5005, "step": 1841 }, { "epoch": 0.833295634471839, "grad_norm": 0.3106503188610077, "learning_rate": 9.925551040046191e-06, "loss": 0.5554, "step": 1842 }, { "epoch": 0.8337480208097715, "grad_norm": 0.31692928075790405, "learning_rate": 9.925425009795933e-06, "loss": 0.4801, "step": 1843 }, { "epoch": 0.8342004071477042, "grad_norm": 0.2931511104106903, "learning_rate": 9.92529887376278e-06, "loss": 0.5177, "step": 1844 }, { "epoch": 0.8346527934856367, "grad_norm": 0.2846549153327942, "learning_rate": 9.925172631949445e-06, "loss": 0.4825, "step": 1845 }, { "epoch": 0.8351051798235694, "grad_norm": 0.32234740257263184, "learning_rate": 9.925046284358637e-06, "loss": 0.5557, "step": 1846 }, { "epoch": 0.8355575661615019, "grad_norm": 0.3724619746208191, "learning_rate": 9.92491983099307e-06, "loss": 0.6573, "step": 1847 }, { "epoch": 0.8360099524994345, "grad_norm": 0.36048075556755066, "learning_rate": 9.92479327185546e-06, "loss": 0.5198, "step": 1848 }, { "epoch": 0.8364623388373671, "grad_norm": 0.3661724030971527, "learning_rate": 9.924666606948523e-06, "loss": 0.5583, "step": 1849 }, { "epoch": 0.8369147251752997, "grad_norm": 0.5204914808273315, "learning_rate": 9.924539836274983e-06, "loss": 0.6205, "step": 1850 }, { "epoch": 0.8373671115132323, "grad_norm": 0.15411867201328278, "learning_rate": 9.92441295983756e-06, "loss": 0.9269, "step": 1851 }, { "epoch": 0.8378194978511649, "grad_norm": 0.1886746883392334, "learning_rate": 9.924285977638982e-06, "loss": 0.9349, "step": 1852 }, { "epoch": 0.8382718841890975, "grad_norm": 0.15827172994613647, "learning_rate": 9.92415888968197e-06, "loss": 0.5568, "step": 1853 }, { "epoch": 0.8387242705270301, "grad_norm": 0.18538837134838104, "learning_rate": 9.92403169596926e-06, "loss": 0.6512, "step": 1854 }, { "epoch": 0.8391766568649627, "grad_norm": 0.18556785583496094, "learning_rate": 9.92390439650358e-06, "loss": 0.664, "step": 1855 }, { "epoch": 0.8396290432028952, "grad_norm": 0.20495536923408508, "learning_rate": 9.923776991287667e-06, "loss": 0.6375, "step": 1856 }, { "epoch": 0.8400814295408279, "grad_norm": 0.22830329835414886, "learning_rate": 9.923649480324253e-06, "loss": 0.7986, "step": 1857 }, { "epoch": 0.8405338158787604, "grad_norm": 0.19701389968395233, "learning_rate": 9.923521863616079e-06, "loss": 0.6225, "step": 1858 }, { "epoch": 0.8409862022166931, "grad_norm": 0.2169043868780136, "learning_rate": 9.923394141165887e-06, "loss": 0.6434, "step": 1859 }, { "epoch": 0.8414385885546256, "grad_norm": 0.24179710447788239, "learning_rate": 9.923266312976418e-06, "loss": 0.7066, "step": 1860 }, { "epoch": 0.8418909748925583, "grad_norm": 0.24235989153385162, "learning_rate": 9.923138379050419e-06, "loss": 0.6948, "step": 1861 }, { "epoch": 0.8423433612304908, "grad_norm": 0.2194678783416748, "learning_rate": 9.923010339390636e-06, "loss": 0.632, "step": 1862 }, { "epoch": 0.8427957475684235, "grad_norm": 0.2008039504289627, "learning_rate": 9.92288219399982e-06, "loss": 0.6101, "step": 1863 }, { "epoch": 0.843248133906356, "grad_norm": 0.2094995379447937, "learning_rate": 9.922753942880723e-06, "loss": 0.5817, "step": 1864 }, { "epoch": 0.8437005202442887, "grad_norm": 0.22156840562820435, "learning_rate": 9.922625586036098e-06, "loss": 0.6704, "step": 1865 }, { "epoch": 0.8441529065822212, "grad_norm": 0.20382152497768402, "learning_rate": 9.922497123468707e-06, "loss": 0.4953, "step": 1866 }, { "epoch": 0.8446052929201538, "grad_norm": 0.23910443484783173, "learning_rate": 9.9223685551813e-06, "loss": 0.6483, "step": 1867 }, { "epoch": 0.8450576792580864, "grad_norm": 0.23010508716106415, "learning_rate": 9.922239881176647e-06, "loss": 0.6192, "step": 1868 }, { "epoch": 0.845510065596019, "grad_norm": 0.2147011011838913, "learning_rate": 9.922111101457506e-06, "loss": 0.6211, "step": 1869 }, { "epoch": 0.8459624519339516, "grad_norm": 0.20736181735992432, "learning_rate": 9.921982216026644e-06, "loss": 0.5731, "step": 1870 }, { "epoch": 0.8464148382718842, "grad_norm": 0.22496715188026428, "learning_rate": 9.921853224886831e-06, "loss": 0.575, "step": 1871 }, { "epoch": 0.8468672246098168, "grad_norm": 0.275504469871521, "learning_rate": 9.921724128040835e-06, "loss": 0.6339, "step": 1872 }, { "epoch": 0.8473196109477493, "grad_norm": 0.2784673571586609, "learning_rate": 9.92159492549143e-06, "loss": 0.6547, "step": 1873 }, { "epoch": 0.847771997285682, "grad_norm": 0.22693432867527008, "learning_rate": 9.92146561724139e-06, "loss": 0.6016, "step": 1874 }, { "epoch": 0.8482243836236145, "grad_norm": 0.22746402025222778, "learning_rate": 9.921336203293492e-06, "loss": 0.5733, "step": 1875 }, { "epoch": 0.8486767699615472, "grad_norm": 0.24777831137180328, "learning_rate": 9.921206683650518e-06, "loss": 0.5398, "step": 1876 }, { "epoch": 0.8491291562994797, "grad_norm": 0.2589649260044098, "learning_rate": 9.921077058315245e-06, "loss": 0.6726, "step": 1877 }, { "epoch": 0.8495815426374124, "grad_norm": 0.22223332524299622, "learning_rate": 9.92094732729046e-06, "loss": 0.4809, "step": 1878 }, { "epoch": 0.8500339289753449, "grad_norm": 0.23398295044898987, "learning_rate": 9.92081749057895e-06, "loss": 0.6286, "step": 1879 }, { "epoch": 0.8504863153132776, "grad_norm": 0.2220853865146637, "learning_rate": 9.920687548183504e-06, "loss": 0.4748, "step": 1880 }, { "epoch": 0.8509387016512101, "grad_norm": 0.2345307171344757, "learning_rate": 9.920557500106907e-06, "loss": 0.563, "step": 1881 }, { "epoch": 0.8513910879891428, "grad_norm": 0.2702895402908325, "learning_rate": 9.920427346351958e-06, "loss": 0.4857, "step": 1882 }, { "epoch": 0.8518434743270753, "grad_norm": 0.25763729214668274, "learning_rate": 9.920297086921451e-06, "loss": 0.5944, "step": 1883 }, { "epoch": 0.852295860665008, "grad_norm": 0.27624356746673584, "learning_rate": 9.920166721818183e-06, "loss": 0.55, "step": 1884 }, { "epoch": 0.8527482470029405, "grad_norm": 0.27398183941841125, "learning_rate": 9.920036251044953e-06, "loss": 0.6216, "step": 1885 }, { "epoch": 0.8532006333408731, "grad_norm": 0.2411786913871765, "learning_rate": 9.919905674604564e-06, "loss": 0.5858, "step": 1886 }, { "epoch": 0.8536530196788057, "grad_norm": 0.2745203971862793, "learning_rate": 9.919774992499819e-06, "loss": 0.5094, "step": 1887 }, { "epoch": 0.8541054060167383, "grad_norm": 0.2594059109687805, "learning_rate": 9.919644204733528e-06, "loss": 0.5133, "step": 1888 }, { "epoch": 0.8545577923546709, "grad_norm": 0.2867982089519501, "learning_rate": 9.919513311308497e-06, "loss": 0.5627, "step": 1889 }, { "epoch": 0.8550101786926034, "grad_norm": 0.3471047282218933, "learning_rate": 9.91938231222754e-06, "loss": 0.5865, "step": 1890 }, { "epoch": 0.8554625650305361, "grad_norm": 0.33077380061149597, "learning_rate": 9.919251207493466e-06, "loss": 0.637, "step": 1891 }, { "epoch": 0.8559149513684686, "grad_norm": 0.3237074613571167, "learning_rate": 9.919119997109095e-06, "loss": 0.5891, "step": 1892 }, { "epoch": 0.8563673377064013, "grad_norm": 0.34233617782592773, "learning_rate": 9.918988681077241e-06, "loss": 0.5519, "step": 1893 }, { "epoch": 0.8568197240443338, "grad_norm": 0.3017970621585846, "learning_rate": 9.918857259400726e-06, "loss": 0.4688, "step": 1894 }, { "epoch": 0.8572721103822665, "grad_norm": 0.3239196538925171, "learning_rate": 9.918725732082376e-06, "loss": 0.4755, "step": 1895 }, { "epoch": 0.857724496720199, "grad_norm": 0.30777978897094727, "learning_rate": 9.918594099125012e-06, "loss": 0.5005, "step": 1896 }, { "epoch": 0.8581768830581317, "grad_norm": 0.3859896957874298, "learning_rate": 9.91846236053146e-06, "loss": 0.6756, "step": 1897 }, { "epoch": 0.8586292693960642, "grad_norm": 0.34270113706588745, "learning_rate": 9.918330516304552e-06, "loss": 0.5687, "step": 1898 }, { "epoch": 0.8590816557339969, "grad_norm": 0.4158553183078766, "learning_rate": 9.918198566447121e-06, "loss": 0.4794, "step": 1899 }, { "epoch": 0.8595340420719294, "grad_norm": 0.4645226001739502, "learning_rate": 9.918066510961996e-06, "loss": 0.5203, "step": 1900 }, { "epoch": 0.8599864284098621, "grad_norm": 0.12051043659448624, "learning_rate": 9.917934349852017e-06, "loss": 1.1925, "step": 1901 }, { "epoch": 0.8604388147477946, "grad_norm": 0.16377979516983032, "learning_rate": 9.917802083120021e-06, "loss": 0.6499, "step": 1902 }, { "epoch": 0.8608912010857273, "grad_norm": 0.1698743999004364, "learning_rate": 9.91766971076885e-06, "loss": 0.6264, "step": 1903 }, { "epoch": 0.8613435874236598, "grad_norm": 0.21012336015701294, "learning_rate": 9.917537232801345e-06, "loss": 0.7407, "step": 1904 }, { "epoch": 0.8617959737615924, "grad_norm": 0.21729764342308044, "learning_rate": 9.917404649220351e-06, "loss": 0.6775, "step": 1905 }, { "epoch": 0.862248360099525, "grad_norm": 0.20274266600608826, "learning_rate": 9.917271960028718e-06, "loss": 0.7743, "step": 1906 }, { "epoch": 0.8627007464374576, "grad_norm": 0.20269107818603516, "learning_rate": 9.917139165229294e-06, "loss": 0.6293, "step": 1907 }, { "epoch": 0.8631531327753902, "grad_norm": 0.1963459998369217, "learning_rate": 9.91700626482493e-06, "loss": 0.55, "step": 1908 }, { "epoch": 0.8636055191133227, "grad_norm": 0.19651912152767181, "learning_rate": 9.916873258818485e-06, "loss": 0.54, "step": 1909 }, { "epoch": 0.8640579054512554, "grad_norm": 0.23819167912006378, "learning_rate": 9.916740147212809e-06, "loss": 0.6736, "step": 1910 }, { "epoch": 0.8645102917891879, "grad_norm": 0.17975328862667084, "learning_rate": 9.916606930010765e-06, "loss": 0.5049, "step": 1911 }, { "epoch": 0.8649626781271206, "grad_norm": 0.19823046028614044, "learning_rate": 9.916473607215213e-06, "loss": 0.5644, "step": 1912 }, { "epoch": 0.8654150644650531, "grad_norm": 0.24861767888069153, "learning_rate": 9.916340178829017e-06, "loss": 0.6035, "step": 1913 }, { "epoch": 0.8658674508029858, "grad_norm": 0.21040166914463043, "learning_rate": 9.916206644855042e-06, "loss": 0.5477, "step": 1914 }, { "epoch": 0.8663198371409183, "grad_norm": 0.24092309176921844, "learning_rate": 9.916073005296156e-06, "loss": 0.6388, "step": 1915 }, { "epoch": 0.866772223478851, "grad_norm": 0.2319074124097824, "learning_rate": 9.915939260155229e-06, "loss": 0.5871, "step": 1916 }, { "epoch": 0.8672246098167835, "grad_norm": 0.2611682713031769, "learning_rate": 9.915805409435133e-06, "loss": 0.6441, "step": 1917 }, { "epoch": 0.8676769961547162, "grad_norm": 0.2551206946372986, "learning_rate": 9.915671453138745e-06, "loss": 0.5985, "step": 1918 }, { "epoch": 0.8681293824926487, "grad_norm": 0.23540514707565308, "learning_rate": 9.915537391268938e-06, "loss": 0.5873, "step": 1919 }, { "epoch": 0.8685817688305814, "grad_norm": 0.22303999960422516, "learning_rate": 9.915403223828595e-06, "loss": 0.6102, "step": 1920 }, { "epoch": 0.8690341551685139, "grad_norm": 0.23179209232330322, "learning_rate": 9.915268950820596e-06, "loss": 0.5408, "step": 1921 }, { "epoch": 0.8694865415064466, "grad_norm": 0.25243377685546875, "learning_rate": 9.915134572247823e-06, "loss": 0.6574, "step": 1922 }, { "epoch": 0.8699389278443791, "grad_norm": 0.22317320108413696, "learning_rate": 9.915000088113166e-06, "loss": 0.502, "step": 1923 }, { "epoch": 0.8703913141823117, "grad_norm": 0.23931477963924408, "learning_rate": 9.91486549841951e-06, "loss": 0.5588, "step": 1924 }, { "epoch": 0.8708437005202443, "grad_norm": 0.22352686524391174, "learning_rate": 9.914730803169746e-06, "loss": 0.5042, "step": 1925 }, { "epoch": 0.8712960868581768, "grad_norm": 0.2774117887020111, "learning_rate": 9.91459600236677e-06, "loss": 0.5712, "step": 1926 }, { "epoch": 0.8717484731961095, "grad_norm": 0.27715805172920227, "learning_rate": 9.914461096013473e-06, "loss": 0.7724, "step": 1927 }, { "epoch": 0.872200859534042, "grad_norm": 0.2249561995267868, "learning_rate": 9.914326084112754e-06, "loss": 0.5855, "step": 1928 }, { "epoch": 0.8726532458719747, "grad_norm": 0.3289549648761749, "learning_rate": 9.914190966667511e-06, "loss": 0.5739, "step": 1929 }, { "epoch": 0.8731056322099072, "grad_norm": 0.3038491904735565, "learning_rate": 9.914055743680651e-06, "loss": 0.641, "step": 1930 }, { "epoch": 0.8735580185478399, "grad_norm": 0.34813034534454346, "learning_rate": 9.913920415155072e-06, "loss": 0.7536, "step": 1931 }, { "epoch": 0.8740104048857724, "grad_norm": 0.2757706940174103, "learning_rate": 9.913784981093683e-06, "loss": 0.6473, "step": 1932 }, { "epoch": 0.8744627912237051, "grad_norm": 0.27271199226379395, "learning_rate": 9.913649441499395e-06, "loss": 0.5407, "step": 1933 }, { "epoch": 0.8749151775616376, "grad_norm": 0.2724100351333618, "learning_rate": 9.913513796375117e-06, "loss": 0.6045, "step": 1934 }, { "epoch": 0.8753675638995703, "grad_norm": 0.26961466670036316, "learning_rate": 9.913378045723759e-06, "loss": 0.5668, "step": 1935 }, { "epoch": 0.8758199502375028, "grad_norm": 0.30828818678855896, "learning_rate": 9.913242189548242e-06, "loss": 0.6959, "step": 1936 }, { "epoch": 0.8762723365754355, "grad_norm": 0.30664366483688354, "learning_rate": 9.913106227851481e-06, "loss": 0.6859, "step": 1937 }, { "epoch": 0.876724722913368, "grad_norm": 0.2702022194862366, "learning_rate": 9.912970160636396e-06, "loss": 0.4551, "step": 1938 }, { "epoch": 0.8771771092513007, "grad_norm": 0.27891701459884644, "learning_rate": 9.91283398790591e-06, "loss": 0.5385, "step": 1939 }, { "epoch": 0.8776294955892332, "grad_norm": 0.330201655626297, "learning_rate": 9.912697709662946e-06, "loss": 0.6416, "step": 1940 }, { "epoch": 0.8780818819271659, "grad_norm": 0.29938390851020813, "learning_rate": 9.912561325910433e-06, "loss": 0.6069, "step": 1941 }, { "epoch": 0.8785342682650984, "grad_norm": 0.3211900591850281, "learning_rate": 9.9124248366513e-06, "loss": 0.6186, "step": 1942 }, { "epoch": 0.8789866546030309, "grad_norm": 0.3199925422668457, "learning_rate": 9.912288241888477e-06, "loss": 0.5202, "step": 1943 }, { "epoch": 0.8794390409409636, "grad_norm": 0.2881879508495331, "learning_rate": 9.912151541624897e-06, "loss": 0.5868, "step": 1944 }, { "epoch": 0.8798914272788961, "grad_norm": 0.33023571968078613, "learning_rate": 9.912014735863497e-06, "loss": 0.5553, "step": 1945 }, { "epoch": 0.8803438136168288, "grad_norm": 0.3581477999687195, "learning_rate": 9.911877824607218e-06, "loss": 0.6555, "step": 1946 }, { "epoch": 0.8807961999547613, "grad_norm": 0.31989988684654236, "learning_rate": 9.911740807858996e-06, "loss": 0.5696, "step": 1947 }, { "epoch": 0.881248586292694, "grad_norm": 0.32205021381378174, "learning_rate": 9.911603685621775e-06, "loss": 0.5498, "step": 1948 }, { "epoch": 0.8817009726306265, "grad_norm": 0.3400980532169342, "learning_rate": 9.9114664578985e-06, "loss": 0.5035, "step": 1949 }, { "epoch": 0.8821533589685592, "grad_norm": 0.45414820313453674, "learning_rate": 9.911329124692119e-06, "loss": 0.5929, "step": 1950 }, { "epoch": 0.8826057453064917, "grad_norm": 0.1342628002166748, "learning_rate": 9.911191686005582e-06, "loss": 1.2705, "step": 1951 }, { "epoch": 0.8830581316444244, "grad_norm": 0.15049059689044952, "learning_rate": 9.911054141841838e-06, "loss": 0.6555, "step": 1952 }, { "epoch": 0.8835105179823569, "grad_norm": 0.15667925775051117, "learning_rate": 9.910916492203845e-06, "loss": 0.5213, "step": 1953 }, { "epoch": 0.8839629043202896, "grad_norm": 0.20776143670082092, "learning_rate": 9.910778737094555e-06, "loss": 0.6752, "step": 1954 }, { "epoch": 0.8844152906582221, "grad_norm": 0.1990365833044052, "learning_rate": 9.91064087651693e-06, "loss": 0.6773, "step": 1955 }, { "epoch": 0.8848676769961548, "grad_norm": 0.20535193383693695, "learning_rate": 9.91050291047393e-06, "loss": 0.6525, "step": 1956 }, { "epoch": 0.8853200633340873, "grad_norm": 0.21411405503749847, "learning_rate": 9.910364838968517e-06, "loss": 0.7213, "step": 1957 }, { "epoch": 0.88577244967202, "grad_norm": 0.21521830558776855, "learning_rate": 9.910226662003657e-06, "loss": 0.5902, "step": 1958 }, { "epoch": 0.8862248360099525, "grad_norm": 0.18966157734394073, "learning_rate": 9.910088379582315e-06, "loss": 0.5616, "step": 1959 }, { "epoch": 0.886677222347885, "grad_norm": 0.22312092781066895, "learning_rate": 9.909949991707466e-06, "loss": 0.6473, "step": 1960 }, { "epoch": 0.8871296086858177, "grad_norm": 0.23018325865268707, "learning_rate": 9.90981149838208e-06, "loss": 0.6241, "step": 1961 }, { "epoch": 0.8875819950237502, "grad_norm": 0.20883221924304962, "learning_rate": 9.90967289960913e-06, "loss": 0.5749, "step": 1962 }, { "epoch": 0.8880343813616829, "grad_norm": 0.22713600099086761, "learning_rate": 9.909534195391592e-06, "loss": 0.663, "step": 1963 }, { "epoch": 0.8884867676996154, "grad_norm": 0.223160982131958, "learning_rate": 9.90939538573245e-06, "loss": 0.571, "step": 1964 }, { "epoch": 0.8889391540375481, "grad_norm": 0.2459777146577835, "learning_rate": 9.90925647063468e-06, "loss": 0.5195, "step": 1965 }, { "epoch": 0.8893915403754806, "grad_norm": 0.22204247117042542, "learning_rate": 9.909117450101266e-06, "loss": 0.571, "step": 1966 }, { "epoch": 0.8898439267134133, "grad_norm": 0.205030620098114, "learning_rate": 9.908978324135197e-06, "loss": 0.571, "step": 1967 }, { "epoch": 0.8902963130513458, "grad_norm": 0.23665431141853333, "learning_rate": 9.908839092739458e-06, "loss": 0.6027, "step": 1968 }, { "epoch": 0.8907486993892785, "grad_norm": 0.25045961141586304, "learning_rate": 9.90869975591704e-06, "loss": 0.611, "step": 1969 }, { "epoch": 0.891201085727211, "grad_norm": 0.2345174252986908, "learning_rate": 9.908560313670936e-06, "loss": 0.5445, "step": 1970 }, { "epoch": 0.8916534720651437, "grad_norm": 0.25278639793395996, "learning_rate": 9.90842076600414e-06, "loss": 0.6743, "step": 1971 }, { "epoch": 0.8921058584030762, "grad_norm": 0.3071403205394745, "learning_rate": 9.908281112919652e-06, "loss": 0.793, "step": 1972 }, { "epoch": 0.8925582447410089, "grad_norm": 0.264635294675827, "learning_rate": 9.908141354420467e-06, "loss": 0.6187, "step": 1973 }, { "epoch": 0.8930106310789414, "grad_norm": 0.25523248314857483, "learning_rate": 9.908001490509589e-06, "loss": 0.672, "step": 1974 }, { "epoch": 0.8934630174168741, "grad_norm": 0.25769615173339844, "learning_rate": 9.90786152119002e-06, "loss": 0.52, "step": 1975 }, { "epoch": 0.8939154037548066, "grad_norm": 0.2577648162841797, "learning_rate": 9.90772144646477e-06, "loss": 0.546, "step": 1976 }, { "epoch": 0.8943677900927391, "grad_norm": 0.24441640079021454, "learning_rate": 9.907581266336843e-06, "loss": 0.5864, "step": 1977 }, { "epoch": 0.8948201764306718, "grad_norm": 0.24512708187103271, "learning_rate": 9.907440980809252e-06, "loss": 0.5275, "step": 1978 }, { "epoch": 0.8952725627686043, "grad_norm": 0.2581380307674408, "learning_rate": 9.90730058988501e-06, "loss": 0.5733, "step": 1979 }, { "epoch": 0.895724949106537, "grad_norm": 0.30333638191223145, "learning_rate": 9.90716009356713e-06, "loss": 0.5435, "step": 1980 }, { "epoch": 0.8961773354444695, "grad_norm": 0.26682987809181213, "learning_rate": 9.907019491858631e-06, "loss": 0.6041, "step": 1981 }, { "epoch": 0.8966297217824022, "grad_norm": 0.27210500836372375, "learning_rate": 9.906878784762534e-06, "loss": 0.5925, "step": 1982 }, { "epoch": 0.8970821081203347, "grad_norm": 0.22712227702140808, "learning_rate": 9.906737972281858e-06, "loss": 0.4761, "step": 1983 }, { "epoch": 0.8975344944582674, "grad_norm": 0.27600812911987305, "learning_rate": 9.90659705441963e-06, "loss": 0.5832, "step": 1984 }, { "epoch": 0.8979868807961999, "grad_norm": 0.28075650334358215, "learning_rate": 9.906456031178875e-06, "loss": 0.5891, "step": 1985 }, { "epoch": 0.8984392671341326, "grad_norm": 0.279360294342041, "learning_rate": 9.906314902562622e-06, "loss": 0.5474, "step": 1986 }, { "epoch": 0.8988916534720651, "grad_norm": 0.29199501872062683, "learning_rate": 9.906173668573901e-06, "loss": 0.6759, "step": 1987 }, { "epoch": 0.8993440398099978, "grad_norm": 0.2800823450088501, "learning_rate": 9.906032329215747e-06, "loss": 0.4996, "step": 1988 }, { "epoch": 0.8997964261479303, "grad_norm": 0.30369988083839417, "learning_rate": 9.905890884491196e-06, "loss": 0.6833, "step": 1989 }, { "epoch": 0.900248812485863, "grad_norm": 0.2596442401409149, "learning_rate": 9.905749334403285e-06, "loss": 0.4804, "step": 1990 }, { "epoch": 0.9007011988237955, "grad_norm": 0.2514909505844116, "learning_rate": 9.905607678955052e-06, "loss": 0.5412, "step": 1991 }, { "epoch": 0.9011535851617282, "grad_norm": 0.28082653880119324, "learning_rate": 9.90546591814954e-06, "loss": 0.4977, "step": 1992 }, { "epoch": 0.9016059714996607, "grad_norm": 0.29269319772720337, "learning_rate": 9.905324051989797e-06, "loss": 0.5631, "step": 1993 }, { "epoch": 0.9020583578375932, "grad_norm": 0.32226336002349854, "learning_rate": 9.905182080478866e-06, "loss": 0.5933, "step": 1994 }, { "epoch": 0.9025107441755259, "grad_norm": 0.4421904385089874, "learning_rate": 9.905040003619797e-06, "loss": 0.4426, "step": 1995 }, { "epoch": 0.9029631305134584, "grad_norm": 0.35159894824028015, "learning_rate": 9.904897821415644e-06, "loss": 0.6047, "step": 1996 }, { "epoch": 0.9034155168513911, "grad_norm": 0.37723809480667114, "learning_rate": 9.904755533869456e-06, "loss": 0.624, "step": 1997 }, { "epoch": 0.9038679031893236, "grad_norm": 0.3127713203430176, "learning_rate": 9.904613140984294e-06, "loss": 0.499, "step": 1998 }, { "epoch": 0.9043202895272563, "grad_norm": 0.41582074761390686, "learning_rate": 9.904470642763211e-06, "loss": 0.6314, "step": 1999 }, { "epoch": 0.9047726758651888, "grad_norm": 0.4435099959373474, "learning_rate": 9.904328039209273e-06, "loss": 0.531, "step": 2000 }, { "epoch": 0.9047726758651888, "eval_loss": 0.619054913520813, "eval_runtime": 26.7642, "eval_samples_per_second": 27.798, "eval_steps_per_second": 6.95, "step": 2000 }, { "epoch": 0.9052250622031215, "grad_norm": 0.14360325038433075, "learning_rate": 9.904185330325538e-06, "loss": 1.1027, "step": 2001 }, { "epoch": 0.905677448541054, "grad_norm": 0.21126601099967957, "learning_rate": 9.904042516115073e-06, "loss": 0.9269, "step": 2002 }, { "epoch": 0.9061298348789867, "grad_norm": 0.1636371612548828, "learning_rate": 9.903899596580943e-06, "loss": 0.5531, "step": 2003 }, { "epoch": 0.9065822212169192, "grad_norm": 0.20847375690937042, "learning_rate": 9.903756571726222e-06, "loss": 0.689, "step": 2004 }, { "epoch": 0.9070346075548519, "grad_norm": 0.2071983516216278, "learning_rate": 9.90361344155398e-06, "loss": 0.7751, "step": 2005 }, { "epoch": 0.9074869938927844, "grad_norm": 0.22837767004966736, "learning_rate": 9.903470206067287e-06, "loss": 0.7015, "step": 2006 }, { "epoch": 0.9079393802307171, "grad_norm": 0.1973305493593216, "learning_rate": 9.903326865269223e-06, "loss": 0.6125, "step": 2007 }, { "epoch": 0.9083917665686496, "grad_norm": 0.2247963398694992, "learning_rate": 9.903183419162865e-06, "loss": 0.6278, "step": 2008 }, { "epoch": 0.9088441529065823, "grad_norm": 0.20810896158218384, "learning_rate": 9.903039867751297e-06, "loss": 0.5888, "step": 2009 }, { "epoch": 0.9092965392445148, "grad_norm": 0.21538956463336945, "learning_rate": 9.902896211037599e-06, "loss": 0.5675, "step": 2010 }, { "epoch": 0.9097489255824475, "grad_norm": 0.20286038517951965, "learning_rate": 9.902752449024855e-06, "loss": 0.6328, "step": 2011 }, { "epoch": 0.91020131192038, "grad_norm": 0.21317891776561737, "learning_rate": 9.902608581716157e-06, "loss": 0.6593, "step": 2012 }, { "epoch": 0.9106536982583125, "grad_norm": 0.23193052411079407, "learning_rate": 9.902464609114591e-06, "loss": 0.6324, "step": 2013 }, { "epoch": 0.9111060845962452, "grad_norm": 0.2134193480014801, "learning_rate": 9.90232053122325e-06, "loss": 0.6156, "step": 2014 }, { "epoch": 0.9115584709341777, "grad_norm": 0.21747107803821564, "learning_rate": 9.90217634804523e-06, "loss": 0.5578, "step": 2015 }, { "epoch": 0.9120108572721104, "grad_norm": 0.23720939457416534, "learning_rate": 9.902032059583627e-06, "loss": 0.6441, "step": 2016 }, { "epoch": 0.9124632436100429, "grad_norm": 0.21891646087169647, "learning_rate": 9.901887665841536e-06, "loss": 0.5871, "step": 2017 }, { "epoch": 0.9129156299479756, "grad_norm": 0.22046217322349548, "learning_rate": 9.901743166822065e-06, "loss": 0.5871, "step": 2018 }, { "epoch": 0.9133680162859081, "grad_norm": 0.2251702845096588, "learning_rate": 9.901598562528312e-06, "loss": 0.6906, "step": 2019 }, { "epoch": 0.9138204026238408, "grad_norm": 0.2449696809053421, "learning_rate": 9.901453852963385e-06, "loss": 0.6204, "step": 2020 }, { "epoch": 0.9142727889617733, "grad_norm": 0.22451907396316528, "learning_rate": 9.901309038130392e-06, "loss": 0.5527, "step": 2021 }, { "epoch": 0.914725175299706, "grad_norm": 0.24124842882156372, "learning_rate": 9.90116411803244e-06, "loss": 0.6087, "step": 2022 }, { "epoch": 0.9151775616376385, "grad_norm": 0.2520042657852173, "learning_rate": 9.901019092672646e-06, "loss": 0.5176, "step": 2023 }, { "epoch": 0.9156299479755712, "grad_norm": 0.22945578396320343, "learning_rate": 9.900873962054121e-06, "loss": 0.6196, "step": 2024 }, { "epoch": 0.9160823343135037, "grad_norm": 0.22673816978931427, "learning_rate": 9.900728726179984e-06, "loss": 0.5349, "step": 2025 }, { "epoch": 0.9165347206514364, "grad_norm": 0.26180756092071533, "learning_rate": 9.900583385053354e-06, "loss": 0.7335, "step": 2026 }, { "epoch": 0.9169871069893689, "grad_norm": 0.23748870193958282, "learning_rate": 9.900437938677354e-06, "loss": 0.6113, "step": 2027 }, { "epoch": 0.9174394933273016, "grad_norm": 0.20021040737628937, "learning_rate": 9.900292387055104e-06, "loss": 0.4662, "step": 2028 }, { "epoch": 0.9178918796652341, "grad_norm": 0.24222640693187714, "learning_rate": 9.900146730189734e-06, "loss": 0.5709, "step": 2029 }, { "epoch": 0.9183442660031667, "grad_norm": 0.23283255100250244, "learning_rate": 9.900000968084369e-06, "loss": 0.5677, "step": 2030 }, { "epoch": 0.9187966523410993, "grad_norm": 0.2519229054450989, "learning_rate": 9.899855100742142e-06, "loss": 0.6072, "step": 2031 }, { "epoch": 0.9192490386790318, "grad_norm": 0.2384023517370224, "learning_rate": 9.899709128166182e-06, "loss": 0.6416, "step": 2032 }, { "epoch": 0.9197014250169645, "grad_norm": 0.2274506837129593, "learning_rate": 9.899563050359629e-06, "loss": 0.5013, "step": 2033 }, { "epoch": 0.920153811354897, "grad_norm": 0.2647883892059326, "learning_rate": 9.899416867325618e-06, "loss": 0.5863, "step": 2034 }, { "epoch": 0.9206061976928297, "grad_norm": 0.2692314088344574, "learning_rate": 9.899270579067286e-06, "loss": 0.6101, "step": 2035 }, { "epoch": 0.9210585840307622, "grad_norm": 0.23709815740585327, "learning_rate": 9.899124185587779e-06, "loss": 0.5085, "step": 2036 }, { "epoch": 0.9215109703686949, "grad_norm": 0.2880188822746277, "learning_rate": 9.898977686890239e-06, "loss": 0.6128, "step": 2037 }, { "epoch": 0.9219633567066274, "grad_norm": 0.25777238607406616, "learning_rate": 9.898831082977813e-06, "loss": 0.536, "step": 2038 }, { "epoch": 0.9224157430445601, "grad_norm": 0.2899013161659241, "learning_rate": 9.89868437385365e-06, "loss": 0.5573, "step": 2039 }, { "epoch": 0.9228681293824926, "grad_norm": 0.29605814814567566, "learning_rate": 9.8985375595209e-06, "loss": 0.6421, "step": 2040 }, { "epoch": 0.9233205157204253, "grad_norm": 0.27683472633361816, "learning_rate": 9.898390639982715e-06, "loss": 0.6161, "step": 2041 }, { "epoch": 0.9237729020583578, "grad_norm": 0.29478490352630615, "learning_rate": 9.898243615242253e-06, "loss": 0.5694, "step": 2042 }, { "epoch": 0.9242252883962905, "grad_norm": 0.31841495633125305, "learning_rate": 9.898096485302669e-06, "loss": 0.5696, "step": 2043 }, { "epoch": 0.924677674734223, "grad_norm": 0.29314061999320984, "learning_rate": 9.897949250167125e-06, "loss": 0.5593, "step": 2044 }, { "epoch": 0.9251300610721557, "grad_norm": 0.27373215556144714, "learning_rate": 9.897801909838782e-06, "loss": 0.5506, "step": 2045 }, { "epoch": 0.9255824474100882, "grad_norm": 0.32631391286849976, "learning_rate": 9.897654464320805e-06, "loss": 0.5447, "step": 2046 }, { "epoch": 0.9260348337480208, "grad_norm": 0.35536232590675354, "learning_rate": 9.897506913616362e-06, "loss": 0.5663, "step": 2047 }, { "epoch": 0.9264872200859534, "grad_norm": 0.3701637387275696, "learning_rate": 9.897359257728618e-06, "loss": 0.6655, "step": 2048 }, { "epoch": 0.926939606423886, "grad_norm": 0.3713899254798889, "learning_rate": 9.897211496660748e-06, "loss": 0.5315, "step": 2049 }, { "epoch": 0.9273919927618186, "grad_norm": 0.3803096413612366, "learning_rate": 9.897063630415922e-06, "loss": 0.493, "step": 2050 }, { "epoch": 0.9278443790997511, "grad_norm": 0.12265169620513916, "learning_rate": 9.896915658997321e-06, "loss": 1.1259, "step": 2051 }, { "epoch": 0.9282967654376838, "grad_norm": 0.1841108202934265, "learning_rate": 9.896767582408118e-06, "loss": 1.0843, "step": 2052 }, { "epoch": 0.9287491517756163, "grad_norm": 0.1446286141872406, "learning_rate": 9.896619400651493e-06, "loss": 0.6875, "step": 2053 }, { "epoch": 0.929201538113549, "grad_norm": 0.1730501502752304, "learning_rate": 9.896471113730633e-06, "loss": 0.6253, "step": 2054 }, { "epoch": 0.9296539244514815, "grad_norm": 0.19160407781600952, "learning_rate": 9.896322721648719e-06, "loss": 0.5924, "step": 2055 }, { "epoch": 0.9301063107894142, "grad_norm": 0.18434016406536102, "learning_rate": 9.89617422440894e-06, "loss": 0.5355, "step": 2056 }, { "epoch": 0.9305586971273467, "grad_norm": 0.2648833096027374, "learning_rate": 9.896025622014484e-06, "loss": 0.8535, "step": 2057 }, { "epoch": 0.9310110834652794, "grad_norm": 0.21432453393936157, "learning_rate": 9.895876914468543e-06, "loss": 0.6793, "step": 2058 }, { "epoch": 0.9314634698032119, "grad_norm": 0.21705880761146545, "learning_rate": 9.895728101774311e-06, "loss": 0.651, "step": 2059 }, { "epoch": 0.9319158561411446, "grad_norm": 0.2298915535211563, "learning_rate": 9.895579183934983e-06, "loss": 0.622, "step": 2060 }, { "epoch": 0.9323682424790771, "grad_norm": 0.21085864305496216, "learning_rate": 9.895430160953759e-06, "loss": 0.6039, "step": 2061 }, { "epoch": 0.9328206288170098, "grad_norm": 0.2334752380847931, "learning_rate": 9.89528103283384e-06, "loss": 0.638, "step": 2062 }, { "epoch": 0.9332730151549423, "grad_norm": 0.23269985616207123, "learning_rate": 9.895131799578424e-06, "loss": 0.614, "step": 2063 }, { "epoch": 0.9337254014928749, "grad_norm": 0.20544157922267914, "learning_rate": 9.89498246119072e-06, "loss": 0.5502, "step": 2064 }, { "epoch": 0.9341777878308075, "grad_norm": 0.22380900382995605, "learning_rate": 9.894833017673938e-06, "loss": 0.6106, "step": 2065 }, { "epoch": 0.9346301741687401, "grad_norm": 0.2145359218120575, "learning_rate": 9.894683469031281e-06, "loss": 0.638, "step": 2066 }, { "epoch": 0.9350825605066727, "grad_norm": 0.25300687551498413, "learning_rate": 9.894533815265966e-06, "loss": 0.6545, "step": 2067 }, { "epoch": 0.9355349468446053, "grad_norm": 0.2507840692996979, "learning_rate": 9.894384056381204e-06, "loss": 0.6075, "step": 2068 }, { "epoch": 0.9359873331825379, "grad_norm": 0.21437829732894897, "learning_rate": 9.894234192380214e-06, "loss": 0.5375, "step": 2069 }, { "epoch": 0.9364397195204704, "grad_norm": 0.2482413351535797, "learning_rate": 9.894084223266213e-06, "loss": 0.6581, "step": 2070 }, { "epoch": 0.9368921058584031, "grad_norm": 0.21956706047058105, "learning_rate": 9.893934149042421e-06, "loss": 0.5731, "step": 2071 }, { "epoch": 0.9373444921963356, "grad_norm": 0.20550057291984558, "learning_rate": 9.893783969712065e-06, "loss": 0.5654, "step": 2072 }, { "epoch": 0.9377968785342683, "grad_norm": 0.2467511147260666, "learning_rate": 9.893633685278365e-06, "loss": 0.6079, "step": 2073 }, { "epoch": 0.9382492648722008, "grad_norm": 0.23992124199867249, "learning_rate": 9.893483295744553e-06, "loss": 0.596, "step": 2074 }, { "epoch": 0.9387016512101335, "grad_norm": 0.2555951774120331, "learning_rate": 9.893332801113854e-06, "loss": 0.7037, "step": 2075 }, { "epoch": 0.939154037548066, "grad_norm": 0.2319357693195343, "learning_rate": 9.893182201389508e-06, "loss": 0.5613, "step": 2076 }, { "epoch": 0.9396064238859987, "grad_norm": 0.23677533864974976, "learning_rate": 9.893031496574744e-06, "loss": 0.5228, "step": 2077 }, { "epoch": 0.9400588102239312, "grad_norm": 0.24023036658763885, "learning_rate": 9.892880686672797e-06, "loss": 0.5429, "step": 2078 }, { "epoch": 0.9405111965618639, "grad_norm": 0.2557236850261688, "learning_rate": 9.89272977168691e-06, "loss": 0.5628, "step": 2079 }, { "epoch": 0.9409635828997964, "grad_norm": 0.28941044211387634, "learning_rate": 9.892578751620323e-06, "loss": 0.5771, "step": 2080 }, { "epoch": 0.941415969237729, "grad_norm": 0.2630535662174225, "learning_rate": 9.892427626476277e-06, "loss": 0.5783, "step": 2081 }, { "epoch": 0.9418683555756616, "grad_norm": 0.24537412822246552, "learning_rate": 9.892276396258023e-06, "loss": 0.5512, "step": 2082 }, { "epoch": 0.9423207419135942, "grad_norm": 0.2834022343158722, "learning_rate": 9.892125060968803e-06, "loss": 0.6441, "step": 2083 }, { "epoch": 0.9427731282515268, "grad_norm": 0.2643249034881592, "learning_rate": 9.89197362061187e-06, "loss": 0.5825, "step": 2084 }, { "epoch": 0.9432255145894594, "grad_norm": 0.28452274203300476, "learning_rate": 9.891822075190479e-06, "loss": 0.5899, "step": 2085 }, { "epoch": 0.943677900927392, "grad_norm": 0.2924950122833252, "learning_rate": 9.89167042470788e-06, "loss": 0.5823, "step": 2086 }, { "epoch": 0.9441302872653246, "grad_norm": 0.22856266796588898, "learning_rate": 9.891518669167332e-06, "loss": 0.4947, "step": 2087 }, { "epoch": 0.9445826736032572, "grad_norm": 0.2819935977458954, "learning_rate": 9.891366808572093e-06, "loss": 0.5258, "step": 2088 }, { "epoch": 0.9450350599411897, "grad_norm": 0.3064168393611908, "learning_rate": 9.89121484292543e-06, "loss": 0.4849, "step": 2089 }, { "epoch": 0.9454874462791224, "grad_norm": 0.2921559512615204, "learning_rate": 9.891062772230599e-06, "loss": 0.572, "step": 2090 }, { "epoch": 0.9459398326170549, "grad_norm": 0.27069833874702454, "learning_rate": 9.890910596490872e-06, "loss": 0.5464, "step": 2091 }, { "epoch": 0.9463922189549876, "grad_norm": 0.2748886048793793, "learning_rate": 9.890758315709514e-06, "loss": 0.4706, "step": 2092 }, { "epoch": 0.9468446052929201, "grad_norm": 0.3036223351955414, "learning_rate": 9.890605929889797e-06, "loss": 0.5096, "step": 2093 }, { "epoch": 0.9472969916308528, "grad_norm": 0.31216898560523987, "learning_rate": 9.89045343903499e-06, "loss": 0.5887, "step": 2094 }, { "epoch": 0.9477493779687853, "grad_norm": 0.30365052819252014, "learning_rate": 9.890300843148374e-06, "loss": 0.5481, "step": 2095 }, { "epoch": 0.948201764306718, "grad_norm": 0.385302871465683, "learning_rate": 9.890148142233222e-06, "loss": 0.6547, "step": 2096 }, { "epoch": 0.9486541506446505, "grad_norm": 0.3728286921977997, "learning_rate": 9.889995336292817e-06, "loss": 0.6187, "step": 2097 }, { "epoch": 0.9491065369825831, "grad_norm": 0.36858731508255005, "learning_rate": 9.889842425330438e-06, "loss": 0.5793, "step": 2098 }, { "epoch": 0.9495589233205157, "grad_norm": 0.4614998996257782, "learning_rate": 9.88968940934937e-06, "loss": 0.5471, "step": 2099 }, { "epoch": 0.9500113096584483, "grad_norm": 0.6012897491455078, "learning_rate": 9.8895362883529e-06, "loss": 0.6656, "step": 2100 }, { "epoch": 0.9504636959963809, "grad_norm": 0.11972494423389435, "learning_rate": 9.889383062344312e-06, "loss": 1.3503, "step": 2101 }, { "epoch": 0.9509160823343135, "grad_norm": 0.17273150384426117, "learning_rate": 9.889229731326904e-06, "loss": 0.8265, "step": 2102 }, { "epoch": 0.9513684686722461, "grad_norm": 0.19117364287376404, "learning_rate": 9.889076295303965e-06, "loss": 0.75, "step": 2103 }, { "epoch": 0.9518208550101787, "grad_norm": 0.17853344976902008, "learning_rate": 9.888922754278792e-06, "loss": 0.6678, "step": 2104 }, { "epoch": 0.9522732413481113, "grad_norm": 0.21287302672863007, "learning_rate": 9.88876910825468e-06, "loss": 0.7807, "step": 2105 }, { "epoch": 0.9527256276860439, "grad_norm": 0.21879799664020538, "learning_rate": 9.88861535723493e-06, "loss": 0.6788, "step": 2106 }, { "epoch": 0.9531780140239765, "grad_norm": 0.18773144483566284, "learning_rate": 9.888461501222846e-06, "loss": 0.5682, "step": 2107 }, { "epoch": 0.953630400361909, "grad_norm": 0.20293325185775757, "learning_rate": 9.888307540221731e-06, "loss": 0.6178, "step": 2108 }, { "epoch": 0.9540827866998417, "grad_norm": 0.2101040482521057, "learning_rate": 9.88815347423489e-06, "loss": 0.701, "step": 2109 }, { "epoch": 0.9545351730377742, "grad_norm": 0.21557334065437317, "learning_rate": 9.887999303265633e-06, "loss": 0.7183, "step": 2110 }, { "epoch": 0.9549875593757069, "grad_norm": 0.2168816775083542, "learning_rate": 9.887845027317274e-06, "loss": 0.6207, "step": 2111 }, { "epoch": 0.9554399457136394, "grad_norm": 0.19638828933238983, "learning_rate": 9.887690646393122e-06, "loss": 0.5067, "step": 2112 }, { "epoch": 0.9558923320515721, "grad_norm": 0.2122783064842224, "learning_rate": 9.887536160496494e-06, "loss": 0.6038, "step": 2113 }, { "epoch": 0.9563447183895046, "grad_norm": 0.21316422522068024, "learning_rate": 9.88738156963071e-06, "loss": 0.5485, "step": 2114 }, { "epoch": 0.9567971047274373, "grad_norm": 0.27511927485466003, "learning_rate": 9.887226873799087e-06, "loss": 0.6261, "step": 2115 }, { "epoch": 0.9572494910653698, "grad_norm": 0.2213035225868225, "learning_rate": 9.887072073004949e-06, "loss": 0.6929, "step": 2116 }, { "epoch": 0.9577018774033024, "grad_norm": 0.24409140646457672, "learning_rate": 9.886917167251622e-06, "loss": 0.6288, "step": 2117 }, { "epoch": 0.958154263741235, "grad_norm": 0.24641640484333038, "learning_rate": 9.886762156542428e-06, "loss": 0.646, "step": 2118 }, { "epoch": 0.9586066500791676, "grad_norm": 0.27189698815345764, "learning_rate": 9.886607040880702e-06, "loss": 0.7438, "step": 2119 }, { "epoch": 0.9590590364171002, "grad_norm": 0.24107562005519867, "learning_rate": 9.886451820269773e-06, "loss": 0.6569, "step": 2120 }, { "epoch": 0.9595114227550328, "grad_norm": 0.2487713247537613, "learning_rate": 9.886296494712975e-06, "loss": 0.5869, "step": 2121 }, { "epoch": 0.9599638090929654, "grad_norm": 0.21913014352321625, "learning_rate": 9.886141064213643e-06, "loss": 0.5047, "step": 2122 }, { "epoch": 0.960416195430898, "grad_norm": 0.23018495738506317, "learning_rate": 9.885985528775116e-06, "loss": 0.5779, "step": 2123 }, { "epoch": 0.9608685817688306, "grad_norm": 0.24475885927677155, "learning_rate": 9.885829888400732e-06, "loss": 0.5722, "step": 2124 }, { "epoch": 0.9613209681067632, "grad_norm": 0.2654646039009094, "learning_rate": 9.885674143093838e-06, "loss": 0.6293, "step": 2125 }, { "epoch": 0.9617733544446958, "grad_norm": 0.20823487639427185, "learning_rate": 9.885518292857777e-06, "loss": 0.5809, "step": 2126 }, { "epoch": 0.9622257407826283, "grad_norm": 0.26295629143714905, "learning_rate": 9.885362337695895e-06, "loss": 0.5372, "step": 2127 }, { "epoch": 0.962678127120561, "grad_norm": 0.2332073599100113, "learning_rate": 9.885206277611541e-06, "loss": 0.5417, "step": 2128 }, { "epoch": 0.9631305134584935, "grad_norm": 0.29280591011047363, "learning_rate": 9.88505011260807e-06, "loss": 0.723, "step": 2129 }, { "epoch": 0.9635828997964262, "grad_norm": 0.2587735056877136, "learning_rate": 9.884893842688832e-06, "loss": 0.5945, "step": 2130 }, { "epoch": 0.9640352861343587, "grad_norm": 0.26107752323150635, "learning_rate": 9.884737467857186e-06, "loss": 0.5876, "step": 2131 }, { "epoch": 0.9644876724722914, "grad_norm": 0.24727070331573486, "learning_rate": 9.88458098811649e-06, "loss": 0.5636, "step": 2132 }, { "epoch": 0.9649400588102239, "grad_norm": 0.27886006236076355, "learning_rate": 9.884424403470103e-06, "loss": 0.5684, "step": 2133 }, { "epoch": 0.9653924451481565, "grad_norm": 0.24290956556797028, "learning_rate": 9.884267713921392e-06, "loss": 0.5249, "step": 2134 }, { "epoch": 0.9658448314860891, "grad_norm": 0.3031765818595886, "learning_rate": 9.884110919473717e-06, "loss": 0.541, "step": 2135 }, { "epoch": 0.9662972178240217, "grad_norm": 0.28431984782218933, "learning_rate": 9.88395402013045e-06, "loss": 0.4997, "step": 2136 }, { "epoch": 0.9667496041619543, "grad_norm": 0.28837865591049194, "learning_rate": 9.883797015894956e-06, "loss": 0.66, "step": 2137 }, { "epoch": 0.9672019904998869, "grad_norm": 0.26189786195755005, "learning_rate": 9.88363990677061e-06, "loss": 0.5766, "step": 2138 }, { "epoch": 0.9676543768378195, "grad_norm": 0.2859962582588196, "learning_rate": 9.883482692760787e-06, "loss": 0.5588, "step": 2139 }, { "epoch": 0.9681067631757521, "grad_norm": 0.32845190167427063, "learning_rate": 9.883325373868861e-06, "loss": 0.5907, "step": 2140 }, { "epoch": 0.9685591495136847, "grad_norm": 0.2643766701221466, "learning_rate": 9.883167950098214e-06, "loss": 0.5117, "step": 2141 }, { "epoch": 0.9690115358516173, "grad_norm": 0.3132040202617645, "learning_rate": 9.883010421452225e-06, "loss": 0.5762, "step": 2142 }, { "epoch": 0.9694639221895499, "grad_norm": 0.32733154296875, "learning_rate": 9.882852787934275e-06, "loss": 0.5859, "step": 2143 }, { "epoch": 0.9699163085274825, "grad_norm": 0.34713977575302124, "learning_rate": 9.882695049547752e-06, "loss": 0.6921, "step": 2144 }, { "epoch": 0.9703686948654151, "grad_norm": 0.33942678570747375, "learning_rate": 9.882537206296043e-06, "loss": 0.4894, "step": 2145 }, { "epoch": 0.9708210812033476, "grad_norm": 0.37266775965690613, "learning_rate": 9.88237925818254e-06, "loss": 0.6164, "step": 2146 }, { "epoch": 0.9712734675412803, "grad_norm": 0.3558941185474396, "learning_rate": 9.882221205210634e-06, "loss": 0.6433, "step": 2147 }, { "epoch": 0.9717258538792128, "grad_norm": 0.35554423928260803, "learning_rate": 9.882063047383718e-06, "loss": 0.6135, "step": 2148 }, { "epoch": 0.9721782402171455, "grad_norm": 0.35746583342552185, "learning_rate": 9.88190478470519e-06, "loss": 0.4474, "step": 2149 }, { "epoch": 0.972630626555078, "grad_norm": 0.44526928663253784, "learning_rate": 9.881746417178448e-06, "loss": 0.5443, "step": 2150 }, { "epoch": 0.9730830128930106, "grad_norm": 0.18808668851852417, "learning_rate": 9.881587944806895e-06, "loss": 1.0736, "step": 2151 }, { "epoch": 0.9735353992309432, "grad_norm": 0.16275905072689056, "learning_rate": 9.881429367593933e-06, "loss": 0.617, "step": 2152 }, { "epoch": 0.9739877855688758, "grad_norm": 0.17880140244960785, "learning_rate": 9.88127068554297e-06, "loss": 0.6016, "step": 2153 }, { "epoch": 0.9744401719068084, "grad_norm": 0.19040410220623016, "learning_rate": 9.881111898657411e-06, "loss": 0.6352, "step": 2154 }, { "epoch": 0.974892558244741, "grad_norm": 0.19202178716659546, "learning_rate": 9.880953006940668e-06, "loss": 0.6518, "step": 2155 }, { "epoch": 0.9753449445826736, "grad_norm": 0.23408614099025726, "learning_rate": 9.880794010396153e-06, "loss": 0.6722, "step": 2156 }, { "epoch": 0.9757973309206062, "grad_norm": 0.22987110912799835, "learning_rate": 9.88063490902728e-06, "loss": 0.7547, "step": 2157 }, { "epoch": 0.9762497172585388, "grad_norm": 0.23041164875030518, "learning_rate": 9.880475702837468e-06, "loss": 0.7252, "step": 2158 }, { "epoch": 0.9767021035964714, "grad_norm": 0.19060809910297394, "learning_rate": 9.880316391830135e-06, "loss": 0.6741, "step": 2159 }, { "epoch": 0.977154489934404, "grad_norm": 0.2137368619441986, "learning_rate": 9.880156976008703e-06, "loss": 0.6533, "step": 2160 }, { "epoch": 0.9776068762723366, "grad_norm": 0.20294508337974548, "learning_rate": 9.879997455376593e-06, "loss": 0.6545, "step": 2161 }, { "epoch": 0.9780592626102692, "grad_norm": 0.19335393607616425, "learning_rate": 9.879837829937235e-06, "loss": 0.5036, "step": 2162 }, { "epoch": 0.9785116489482018, "grad_norm": 0.21382734179496765, "learning_rate": 9.879678099694056e-06, "loss": 0.5664, "step": 2163 }, { "epoch": 0.9789640352861344, "grad_norm": 0.23490947484970093, "learning_rate": 9.879518264650488e-06, "loss": 0.5773, "step": 2164 }, { "epoch": 0.979416421624067, "grad_norm": 0.24649561941623688, "learning_rate": 9.879358324809958e-06, "loss": 0.6664, "step": 2165 }, { "epoch": 0.9798688079619996, "grad_norm": 0.255482017993927, "learning_rate": 9.879198280175908e-06, "loss": 0.766, "step": 2166 }, { "epoch": 0.9803211942999321, "grad_norm": 0.24536772072315216, "learning_rate": 9.87903813075177e-06, "loss": 0.6867, "step": 2167 }, { "epoch": 0.9807735806378647, "grad_norm": 0.2589377164840698, "learning_rate": 9.878877876540987e-06, "loss": 0.6616, "step": 2168 }, { "epoch": 0.9812259669757973, "grad_norm": 0.2627277076244354, "learning_rate": 9.878717517546998e-06, "loss": 0.6653, "step": 2169 }, { "epoch": 0.9816783533137299, "grad_norm": 0.25051772594451904, "learning_rate": 9.87855705377325e-06, "loss": 0.5395, "step": 2170 }, { "epoch": 0.9821307396516625, "grad_norm": 0.2502603530883789, "learning_rate": 9.878396485223187e-06, "loss": 0.5673, "step": 2171 }, { "epoch": 0.9825831259895951, "grad_norm": 0.26139798760414124, "learning_rate": 9.878235811900259e-06, "loss": 0.6381, "step": 2172 }, { "epoch": 0.9830355123275277, "grad_norm": 0.2579334080219269, "learning_rate": 9.878075033807917e-06, "loss": 0.5598, "step": 2173 }, { "epoch": 0.9834878986654603, "grad_norm": 0.2626984119415283, "learning_rate": 9.87791415094961e-06, "loss": 0.8739, "step": 2174 }, { "epoch": 0.9839402850033929, "grad_norm": 0.278352290391922, "learning_rate": 9.877753163328798e-06, "loss": 0.7718, "step": 2175 }, { "epoch": 0.9843926713413255, "grad_norm": 0.2546755075454712, "learning_rate": 9.877592070948938e-06, "loss": 0.6545, "step": 2176 }, { "epoch": 0.9848450576792581, "grad_norm": 0.23911695182323456, "learning_rate": 9.877430873813486e-06, "loss": 0.6429, "step": 2177 }, { "epoch": 0.9852974440171907, "grad_norm": 0.20936040580272675, "learning_rate": 9.877269571925909e-06, "loss": 0.4238, "step": 2178 }, { "epoch": 0.9857498303551233, "grad_norm": 0.24479855597019196, "learning_rate": 9.877108165289668e-06, "loss": 0.4964, "step": 2179 }, { "epoch": 0.9862022166930559, "grad_norm": 0.27115336060523987, "learning_rate": 9.87694665390823e-06, "loss": 0.5762, "step": 2180 }, { "epoch": 0.9866546030309885, "grad_norm": 0.24662663042545319, "learning_rate": 9.876785037785065e-06, "loss": 0.5797, "step": 2181 }, { "epoch": 0.987106989368921, "grad_norm": 0.26732537150382996, "learning_rate": 9.876623316923643e-06, "loss": 0.6173, "step": 2182 }, { "epoch": 0.9875593757068537, "grad_norm": 0.28946468234062195, "learning_rate": 9.876461491327437e-06, "loss": 0.6196, "step": 2183 }, { "epoch": 0.9880117620447862, "grad_norm": 0.2762983441352844, "learning_rate": 9.876299560999922e-06, "loss": 0.6138, "step": 2184 }, { "epoch": 0.9884641483827188, "grad_norm": 0.2834378182888031, "learning_rate": 9.876137525944578e-06, "loss": 0.5605, "step": 2185 }, { "epoch": 0.9889165347206514, "grad_norm": 0.3472844362258911, "learning_rate": 9.875975386164884e-06, "loss": 0.6734, "step": 2186 }, { "epoch": 0.989368921058584, "grad_norm": 0.2561889886856079, "learning_rate": 9.87581314166432e-06, "loss": 0.4597, "step": 2187 }, { "epoch": 0.9898213073965166, "grad_norm": 0.2663712501525879, "learning_rate": 9.875650792446376e-06, "loss": 0.5696, "step": 2188 }, { "epoch": 0.9902736937344492, "grad_norm": 0.24061709642410278, "learning_rate": 9.875488338514533e-06, "loss": 0.4405, "step": 2189 }, { "epoch": 0.9907260800723818, "grad_norm": 0.31863993406295776, "learning_rate": 9.875325779872283e-06, "loss": 0.5732, "step": 2190 }, { "epoch": 0.9911784664103144, "grad_norm": 0.30436134338378906, "learning_rate": 9.875163116523116e-06, "loss": 0.5275, "step": 2191 }, { "epoch": 0.991630852748247, "grad_norm": 0.2905287742614746, "learning_rate": 9.875000348470525e-06, "loss": 0.5504, "step": 2192 }, { "epoch": 0.9920832390861796, "grad_norm": 0.28087523579597473, "learning_rate": 9.874837475718008e-06, "loss": 0.5527, "step": 2193 }, { "epoch": 0.9925356254241122, "grad_norm": 0.3169887363910675, "learning_rate": 9.874674498269062e-06, "loss": 0.5348, "step": 2194 }, { "epoch": 0.9929880117620448, "grad_norm": 0.31341618299484253, "learning_rate": 9.874511416127185e-06, "loss": 0.5646, "step": 2195 }, { "epoch": 0.9934403980999774, "grad_norm": 0.3032923936843872, "learning_rate": 9.874348229295886e-06, "loss": 0.4719, "step": 2196 }, { "epoch": 0.99389278443791, "grad_norm": 0.3446543216705322, "learning_rate": 9.87418493777866e-06, "loss": 0.6008, "step": 2197 }, { "epoch": 0.9943451707758426, "grad_norm": 0.31185993552207947, "learning_rate": 9.874021541579022e-06, "loss": 0.4802, "step": 2198 }, { "epoch": 0.9947975571137752, "grad_norm": 0.4014294445514679, "learning_rate": 9.87385804070048e-06, "loss": 0.5501, "step": 2199 }, { "epoch": 0.9952499434517078, "grad_norm": 0.553764820098877, "learning_rate": 9.873694435146541e-06, "loss": 0.6866, "step": 2200 }, { "epoch": 0.9952499434517078, "eval_loss": 0.6145682334899902, "eval_runtime": 26.5736, "eval_samples_per_second": 27.998, "eval_steps_per_second": 6.999, "step": 2200 }, { "epoch": 0.9957023297896404, "grad_norm": 0.16879937052726746, "learning_rate": 9.873530724920724e-06, "loss": 1.187, "step": 2201 }, { "epoch": 0.9961547161275729, "grad_norm": 0.2311924397945404, "learning_rate": 9.87336691002654e-06, "loss": 0.5682, "step": 2202 }, { "epoch": 0.9966071024655055, "grad_norm": 0.2632114589214325, "learning_rate": 9.873202990467514e-06, "loss": 0.7006, "step": 2203 }, { "epoch": 0.9970594888034381, "grad_norm": 0.26498591899871826, "learning_rate": 9.873038966247159e-06, "loss": 0.6568, "step": 2204 }, { "epoch": 0.9975118751413707, "grad_norm": 0.24233557283878326, "learning_rate": 9.872874837369001e-06, "loss": 0.5628, "step": 2205 }, { "epoch": 0.9979642614793033, "grad_norm": 0.28151240944862366, "learning_rate": 9.872710603836568e-06, "loss": 0.5681, "step": 2206 }, { "epoch": 0.9984166478172359, "grad_norm": 0.2961885631084442, "learning_rate": 9.872546265653383e-06, "loss": 0.6273, "step": 2207 }, { "epoch": 0.9988690341551685, "grad_norm": 0.27367666363716125, "learning_rate": 9.872381822822977e-06, "loss": 0.4585, "step": 2208 }, { "epoch": 0.9993214204931011, "grad_norm": 0.29260778427124023, "learning_rate": 9.872217275348883e-06, "loss": 0.4984, "step": 2209 }, { "epoch": 0.9997738068310337, "grad_norm": 0.36707940697669983, "learning_rate": 9.872052623234632e-06, "loss": 0.6236, "step": 2210 }, { "epoch": 1.0002261931689662, "grad_norm": 1.5441112518310547, "learning_rate": 9.87188786648376e-06, "loss": 1.2287, "step": 2211 }, { "epoch": 1.0006785795068989, "grad_norm": 0.16720612347126007, "learning_rate": 9.87172300509981e-06, "loss": 1.718, "step": 2212 }, { "epoch": 1.0011309658448315, "grad_norm": 0.14406581223011017, "learning_rate": 9.871558039086319e-06, "loss": 0.6095, "step": 2213 }, { "epoch": 1.0015833521827642, "grad_norm": 0.1986817866563797, "learning_rate": 9.871392968446831e-06, "loss": 0.7597, "step": 2214 }, { "epoch": 1.0020357385206966, "grad_norm": 0.2034113109111786, "learning_rate": 9.871227793184893e-06, "loss": 0.6265, "step": 2215 }, { "epoch": 1.0024881248586293, "grad_norm": 0.19857801496982574, "learning_rate": 9.871062513304049e-06, "loss": 0.625, "step": 2216 }, { "epoch": 1.002940511196562, "grad_norm": 0.21554477512836456, "learning_rate": 9.870897128807849e-06, "loss": 0.6825, "step": 2217 }, { "epoch": 1.0033928975344946, "grad_norm": 0.2131420522928238, "learning_rate": 9.870731639699846e-06, "loss": 0.6851, "step": 2218 }, { "epoch": 1.003845283872427, "grad_norm": 0.2361288070678711, "learning_rate": 9.870566045983596e-06, "loss": 0.6514, "step": 2219 }, { "epoch": 1.0042976702103596, "grad_norm": 0.24661199748516083, "learning_rate": 9.870400347662653e-06, "loss": 0.6263, "step": 2220 }, { "epoch": 1.0047500565482923, "grad_norm": 0.22476939857006073, "learning_rate": 9.870234544740579e-06, "loss": 0.6979, "step": 2221 }, { "epoch": 1.0052024428862247, "grad_norm": 0.2039821445941925, "learning_rate": 9.870068637220931e-06, "loss": 0.6238, "step": 2222 }, { "epoch": 1.0056548292241574, "grad_norm": 0.22637374699115753, "learning_rate": 9.869902625107273e-06, "loss": 0.5779, "step": 2223 }, { "epoch": 1.00610721556209, "grad_norm": 0.2363424003124237, "learning_rate": 9.869736508403171e-06, "loss": 0.6644, "step": 2224 }, { "epoch": 1.0065596019000227, "grad_norm": 0.20488935708999634, "learning_rate": 9.869570287112193e-06, "loss": 0.5877, "step": 2225 }, { "epoch": 1.0070119882379551, "grad_norm": 0.24870362877845764, "learning_rate": 9.86940396123791e-06, "loss": 0.5869, "step": 2226 }, { "epoch": 1.0074643745758878, "grad_norm": 0.20253540575504303, "learning_rate": 9.869237530783892e-06, "loss": 0.5358, "step": 2227 }, { "epoch": 1.0079167609138204, "grad_norm": 0.20644178986549377, "learning_rate": 9.869070995753714e-06, "loss": 0.5777, "step": 2228 }, { "epoch": 1.008369147251753, "grad_norm": 0.30752065777778625, "learning_rate": 9.868904356150953e-06, "loss": 0.5677, "step": 2229 }, { "epoch": 1.0088215335896855, "grad_norm": 0.2395569533109665, "learning_rate": 9.868737611979189e-06, "loss": 0.5919, "step": 2230 }, { "epoch": 1.0092739199276182, "grad_norm": 0.22404325008392334, "learning_rate": 9.868570763242e-06, "loss": 0.5199, "step": 2231 }, { "epoch": 1.0097263062655508, "grad_norm": 0.24637334048748016, "learning_rate": 9.868403809942972e-06, "loss": 0.6075, "step": 2232 }, { "epoch": 1.0101786926034835, "grad_norm": 0.23149292171001434, "learning_rate": 9.868236752085692e-06, "loss": 0.519, "step": 2233 }, { "epoch": 1.010631078941416, "grad_norm": 0.2389049381017685, "learning_rate": 9.868069589673743e-06, "loss": 0.5432, "step": 2234 }, { "epoch": 1.0110834652793486, "grad_norm": 0.2340010702610016, "learning_rate": 9.86790232271072e-06, "loss": 0.5319, "step": 2235 }, { "epoch": 1.0115358516172812, "grad_norm": 0.2204553782939911, "learning_rate": 9.867734951200212e-06, "loss": 0.5429, "step": 2236 }, { "epoch": 1.0119882379552136, "grad_norm": 0.24849744141101837, "learning_rate": 9.867567475145817e-06, "loss": 0.547, "step": 2237 }, { "epoch": 1.0124406242931463, "grad_norm": 0.2550046145915985, "learning_rate": 9.867399894551128e-06, "loss": 0.6393, "step": 2238 }, { "epoch": 1.012893010631079, "grad_norm": 0.25075507164001465, "learning_rate": 9.867232209419747e-06, "loss": 0.492, "step": 2239 }, { "epoch": 1.0133453969690116, "grad_norm": 0.262724906206131, "learning_rate": 9.867064419755274e-06, "loss": 0.5644, "step": 2240 }, { "epoch": 1.013797783306944, "grad_norm": 0.23622500896453857, "learning_rate": 9.866896525561312e-06, "loss": 0.5149, "step": 2241 }, { "epoch": 1.0142501696448767, "grad_norm": 0.29685622453689575, "learning_rate": 9.866728526841469e-06, "loss": 0.6579, "step": 2242 }, { "epoch": 1.0147025559828093, "grad_norm": 0.2576432526111603, "learning_rate": 9.866560423599351e-06, "loss": 0.5552, "step": 2243 }, { "epoch": 1.015154942320742, "grad_norm": 0.24843378365039825, "learning_rate": 9.866392215838571e-06, "loss": 0.4528, "step": 2244 }, { "epoch": 1.0156073286586744, "grad_norm": 0.2861003279685974, "learning_rate": 9.866223903562739e-06, "loss": 0.6603, "step": 2245 }, { "epoch": 1.016059714996607, "grad_norm": 0.2928241789340973, "learning_rate": 9.866055486775471e-06, "loss": 0.6153, "step": 2246 }, { "epoch": 1.0165121013345397, "grad_norm": 0.26631593704223633, "learning_rate": 9.865886965480383e-06, "loss": 0.4427, "step": 2247 }, { "epoch": 1.0169644876724724, "grad_norm": 0.2472371608018875, "learning_rate": 9.865718339681097e-06, "loss": 0.5349, "step": 2248 }, { "epoch": 1.0174168740104048, "grad_norm": 0.3046855032444, "learning_rate": 9.86554960938123e-06, "loss": 0.6659, "step": 2249 }, { "epoch": 1.0178692603483375, "grad_norm": 0.2859993875026703, "learning_rate": 9.865380774584409e-06, "loss": 0.5892, "step": 2250 }, { "epoch": 1.0183216466862701, "grad_norm": 0.28642183542251587, "learning_rate": 9.86521183529426e-06, "loss": 0.4738, "step": 2251 }, { "epoch": 1.0187740330242028, "grad_norm": 0.2963520884513855, "learning_rate": 9.86504279151441e-06, "loss": 0.5596, "step": 2252 }, { "epoch": 1.0192264193621352, "grad_norm": 0.3261040151119232, "learning_rate": 9.864873643248491e-06, "loss": 0.5603, "step": 2253 }, { "epoch": 1.0196788057000679, "grad_norm": 0.30090999603271484, "learning_rate": 9.864704390500135e-06, "loss": 0.4572, "step": 2254 }, { "epoch": 1.0201311920380005, "grad_norm": 0.33931297063827515, "learning_rate": 9.864535033272978e-06, "loss": 0.5994, "step": 2255 }, { "epoch": 1.020583578375933, "grad_norm": 0.29741644859313965, "learning_rate": 9.864365571570656e-06, "loss": 0.4901, "step": 2256 }, { "epoch": 1.0210359647138656, "grad_norm": 0.3483814001083374, "learning_rate": 9.864196005396808e-06, "loss": 0.5758, "step": 2257 }, { "epoch": 1.0214883510517982, "grad_norm": 0.31525835394859314, "learning_rate": 9.864026334755076e-06, "loss": 0.5268, "step": 2258 }, { "epoch": 1.021940737389731, "grad_norm": 0.3623119592666626, "learning_rate": 9.863856559649107e-06, "loss": 0.5778, "step": 2259 }, { "epoch": 1.0223931237276633, "grad_norm": 0.401611328125, "learning_rate": 9.863686680082543e-06, "loss": 0.492, "step": 2260 }, { "epoch": 1.022845510065596, "grad_norm": 0.38183867931365967, "learning_rate": 9.863516696059035e-06, "loss": 1.3286, "step": 2261 }, { "epoch": 1.0232978964035286, "grad_norm": 0.1223345473408699, "learning_rate": 9.863346607582232e-06, "loss": 0.6331, "step": 2262 }, { "epoch": 1.0237502827414613, "grad_norm": 0.1553351730108261, "learning_rate": 9.86317641465579e-06, "loss": 0.6761, "step": 2263 }, { "epoch": 1.0242026690793937, "grad_norm": 0.14495770633220673, "learning_rate": 9.863006117283362e-06, "loss": 0.5657, "step": 2264 }, { "epoch": 1.0246550554173264, "grad_norm": 0.2068362981081009, "learning_rate": 9.862835715468606e-06, "loss": 0.5807, "step": 2265 }, { "epoch": 1.025107441755259, "grad_norm": 0.22118686139583588, "learning_rate": 9.862665209215178e-06, "loss": 0.7029, "step": 2266 }, { "epoch": 1.0255598280931917, "grad_norm": 0.22194522619247437, "learning_rate": 9.862494598526748e-06, "loss": 0.7244, "step": 2267 }, { "epoch": 1.0260122144311241, "grad_norm": 0.19304399192333221, "learning_rate": 9.862323883406973e-06, "loss": 0.6054, "step": 2268 }, { "epoch": 1.0264646007690568, "grad_norm": 0.20875293016433716, "learning_rate": 9.862153063859523e-06, "loss": 0.584, "step": 2269 }, { "epoch": 1.0269169871069894, "grad_norm": 0.23080646991729736, "learning_rate": 9.861982139888066e-06, "loss": 0.7395, "step": 2270 }, { "epoch": 1.0273693734449219, "grad_norm": 0.21326130628585815, "learning_rate": 9.861811111496271e-06, "loss": 0.6817, "step": 2271 }, { "epoch": 1.0278217597828545, "grad_norm": 0.2224520891904831, "learning_rate": 9.861639978687814e-06, "loss": 0.6861, "step": 2272 }, { "epoch": 1.0282741461207872, "grad_norm": 0.2268802672624588, "learning_rate": 9.861468741466368e-06, "loss": 0.7027, "step": 2273 }, { "epoch": 1.0287265324587198, "grad_norm": 0.2384212166070938, "learning_rate": 9.861297399835614e-06, "loss": 0.655, "step": 2274 }, { "epoch": 1.0291789187966522, "grad_norm": 0.21852904558181763, "learning_rate": 9.861125953799229e-06, "loss": 0.5862, "step": 2275 }, { "epoch": 1.029631305134585, "grad_norm": 0.21972469985485077, "learning_rate": 9.860954403360896e-06, "loss": 0.5613, "step": 2276 }, { "epoch": 1.0300836914725175, "grad_norm": 0.2283354550600052, "learning_rate": 9.860782748524298e-06, "loss": 0.5901, "step": 2277 }, { "epoch": 1.0305360778104502, "grad_norm": 0.2349061220884323, "learning_rate": 9.860610989293124e-06, "loss": 0.6159, "step": 2278 }, { "epoch": 1.0309884641483826, "grad_norm": 0.23910637199878693, "learning_rate": 9.860439125671063e-06, "loss": 0.6248, "step": 2279 }, { "epoch": 1.0314408504863153, "grad_norm": 0.21194390952587128, "learning_rate": 9.860267157661801e-06, "loss": 0.5725, "step": 2280 }, { "epoch": 1.031893236824248, "grad_norm": 0.21338076889514923, "learning_rate": 9.860095085269037e-06, "loss": 0.4942, "step": 2281 }, { "epoch": 1.0323456231621806, "grad_norm": 0.24536918103694916, "learning_rate": 9.859922908496465e-06, "loss": 0.5793, "step": 2282 }, { "epoch": 1.032798009500113, "grad_norm": 0.22204887866973877, "learning_rate": 9.859750627347783e-06, "loss": 0.5152, "step": 2283 }, { "epoch": 1.0332503958380457, "grad_norm": 0.2538560628890991, "learning_rate": 9.859578241826691e-06, "loss": 0.6763, "step": 2284 }, { "epoch": 1.0337027821759783, "grad_norm": 0.23005729913711548, "learning_rate": 9.85940575193689e-06, "loss": 0.5163, "step": 2285 }, { "epoch": 1.034155168513911, "grad_norm": 0.22118908166885376, "learning_rate": 9.859233157682084e-06, "loss": 0.4736, "step": 2286 }, { "epoch": 1.0346075548518434, "grad_norm": 0.2538360357284546, "learning_rate": 9.859060459065983e-06, "loss": 0.5677, "step": 2287 }, { "epoch": 1.035059941189776, "grad_norm": 0.276607871055603, "learning_rate": 9.858887656092296e-06, "loss": 0.5917, "step": 2288 }, { "epoch": 1.0355123275277087, "grad_norm": 0.25718045234680176, "learning_rate": 9.85871474876473e-06, "loss": 0.5123, "step": 2289 }, { "epoch": 1.0359647138656412, "grad_norm": 0.2578164339065552, "learning_rate": 9.858541737087001e-06, "loss": 0.5877, "step": 2290 }, { "epoch": 1.0364171002035738, "grad_norm": 0.2548888623714447, "learning_rate": 9.858368621062825e-06, "loss": 0.5834, "step": 2291 }, { "epoch": 1.0368694865415065, "grad_norm": 0.25552353262901306, "learning_rate": 9.858195400695922e-06, "loss": 0.5285, "step": 2292 }, { "epoch": 1.037321872879439, "grad_norm": 0.2717037796974182, "learning_rate": 9.858022075990009e-06, "loss": 0.5633, "step": 2293 }, { "epoch": 1.0377742592173715, "grad_norm": 0.24881070852279663, "learning_rate": 9.857848646948809e-06, "loss": 0.4754, "step": 2294 }, { "epoch": 1.0382266455553042, "grad_norm": 0.2765263020992279, "learning_rate": 9.857675113576046e-06, "loss": 0.4561, "step": 2295 }, { "epoch": 1.0386790318932368, "grad_norm": 0.25441837310791016, "learning_rate": 9.857501475875449e-06, "loss": 0.5084, "step": 2296 }, { "epoch": 1.0391314182311695, "grad_norm": 0.28643494844436646, "learning_rate": 9.857327733850748e-06, "loss": 0.5952, "step": 2297 }, { "epoch": 1.039583804569102, "grad_norm": 0.30700063705444336, "learning_rate": 9.85715388750567e-06, "loss": 0.6922, "step": 2298 }, { "epoch": 1.0400361909070346, "grad_norm": 0.2971309721469879, "learning_rate": 9.856979936843954e-06, "loss": 0.5151, "step": 2299 }, { "epoch": 1.0404885772449672, "grad_norm": 0.331410676240921, "learning_rate": 9.856805881869334e-06, "loss": 0.579, "step": 2300 }, { "epoch": 1.0409409635829, "grad_norm": 0.27141621708869934, "learning_rate": 9.856631722585545e-06, "loss": 0.5016, "step": 2301 }, { "epoch": 1.0413933499208323, "grad_norm": 0.278810977935791, "learning_rate": 9.856457458996333e-06, "loss": 0.4645, "step": 2302 }, { "epoch": 1.041845736258765, "grad_norm": 0.2708129584789276, "learning_rate": 9.856283091105437e-06, "loss": 0.4642, "step": 2303 }, { "epoch": 1.0422981225966976, "grad_norm": 0.31617921590805054, "learning_rate": 9.8561086189166e-06, "loss": 0.5034, "step": 2304 }, { "epoch": 1.0427505089346303, "grad_norm": 0.31755882501602173, "learning_rate": 9.855934042433575e-06, "loss": 0.5476, "step": 2305 }, { "epoch": 1.0432028952725627, "grad_norm": 0.31297430396080017, "learning_rate": 9.855759361660106e-06, "loss": 0.5201, "step": 2306 }, { "epoch": 1.0436552816104954, "grad_norm": 0.34604862332344055, "learning_rate": 9.855584576599948e-06, "loss": 0.5002, "step": 2307 }, { "epoch": 1.044107667948428, "grad_norm": 0.34763675928115845, "learning_rate": 9.855409687256852e-06, "loss": 0.5763, "step": 2308 }, { "epoch": 1.0445600542863605, "grad_norm": 0.36043617129325867, "learning_rate": 9.855234693634577e-06, "loss": 0.5414, "step": 2309 }, { "epoch": 1.045012440624293, "grad_norm": 0.47719159722328186, "learning_rate": 9.855059595736877e-06, "loss": 0.5909, "step": 2310 }, { "epoch": 1.0454648269622258, "grad_norm": 0.305322527885437, "learning_rate": 9.854884393567519e-06, "loss": 1.1184, "step": 2311 }, { "epoch": 1.0459172133001584, "grad_norm": 0.19404065608978271, "learning_rate": 9.854709087130261e-06, "loss": 1.1123, "step": 2312 }, { "epoch": 1.0463695996380908, "grad_norm": 0.20929744839668274, "learning_rate": 9.854533676428869e-06, "loss": 0.816, "step": 2313 }, { "epoch": 1.0468219859760235, "grad_norm": 0.1903390884399414, "learning_rate": 9.854358161467111e-06, "loss": 0.7359, "step": 2314 }, { "epoch": 1.0472743723139561, "grad_norm": 0.1808023899793625, "learning_rate": 9.854182542248757e-06, "loss": 0.6427, "step": 2315 }, { "epoch": 1.0477267586518888, "grad_norm": 0.18331000208854675, "learning_rate": 9.854006818777575e-06, "loss": 0.5845, "step": 2316 }, { "epoch": 1.0481791449898212, "grad_norm": 0.20025160908699036, "learning_rate": 9.853830991057345e-06, "loss": 0.6208, "step": 2317 }, { "epoch": 1.0486315313277539, "grad_norm": 0.19514337182044983, "learning_rate": 9.853655059091838e-06, "loss": 0.58, "step": 2318 }, { "epoch": 1.0490839176656865, "grad_norm": 0.2310001701116562, "learning_rate": 9.853479022884835e-06, "loss": 0.6956, "step": 2319 }, { "epoch": 1.0495363040036192, "grad_norm": 0.19502289593219757, "learning_rate": 9.853302882440117e-06, "loss": 0.5358, "step": 2320 }, { "epoch": 1.0499886903415516, "grad_norm": 0.20681264996528625, "learning_rate": 9.853126637761465e-06, "loss": 0.6446, "step": 2321 }, { "epoch": 1.0504410766794843, "grad_norm": 0.21474742889404297, "learning_rate": 9.852950288852667e-06, "loss": 0.5068, "step": 2322 }, { "epoch": 1.050893463017417, "grad_norm": 0.2347838133573532, "learning_rate": 9.852773835717508e-06, "loss": 0.6194, "step": 2323 }, { "epoch": 1.0513458493553496, "grad_norm": 0.3008011281490326, "learning_rate": 9.85259727835978e-06, "loss": 0.6392, "step": 2324 }, { "epoch": 1.051798235693282, "grad_norm": 0.27171897888183594, "learning_rate": 9.852420616783272e-06, "loss": 0.6771, "step": 2325 }, { "epoch": 1.0522506220312147, "grad_norm": 0.22864878177642822, "learning_rate": 9.852243850991779e-06, "loss": 0.5805, "step": 2326 }, { "epoch": 1.0527030083691473, "grad_norm": 0.22995908558368683, "learning_rate": 9.852066980989097e-06, "loss": 0.5704, "step": 2327 }, { "epoch": 1.0531553947070798, "grad_norm": 0.21096085011959076, "learning_rate": 9.851890006779027e-06, "loss": 0.5377, "step": 2328 }, { "epoch": 1.0536077810450124, "grad_norm": 0.22012142837047577, "learning_rate": 9.85171292836537e-06, "loss": 0.5398, "step": 2329 }, { "epoch": 1.054060167382945, "grad_norm": 0.20983973145484924, "learning_rate": 9.851535745751926e-06, "loss": 0.5937, "step": 2330 }, { "epoch": 1.0545125537208777, "grad_norm": 0.25712156295776367, "learning_rate": 9.851358458942502e-06, "loss": 0.7666, "step": 2331 }, { "epoch": 1.0549649400588101, "grad_norm": 0.23077645897865295, "learning_rate": 9.851181067940905e-06, "loss": 0.5211, "step": 2332 }, { "epoch": 1.0554173263967428, "grad_norm": 0.22745342552661896, "learning_rate": 9.851003572750946e-06, "loss": 0.6064, "step": 2333 }, { "epoch": 1.0558697127346754, "grad_norm": 0.2520611584186554, "learning_rate": 9.850825973376437e-06, "loss": 0.6233, "step": 2334 }, { "epoch": 1.056322099072608, "grad_norm": 0.25801846385002136, "learning_rate": 9.85064826982119e-06, "loss": 0.547, "step": 2335 }, { "epoch": 1.0567744854105405, "grad_norm": 0.25164365768432617, "learning_rate": 9.850470462089024e-06, "loss": 0.5951, "step": 2336 }, { "epoch": 1.0572268717484732, "grad_norm": 0.29180580377578735, "learning_rate": 9.850292550183755e-06, "loss": 0.6706, "step": 2337 }, { "epoch": 1.0576792580864058, "grad_norm": 0.22358809411525726, "learning_rate": 9.850114534109207e-06, "loss": 0.467, "step": 2338 }, { "epoch": 1.0581316444243385, "grad_norm": 0.23002253472805023, "learning_rate": 9.849936413869203e-06, "loss": 0.5137, "step": 2339 }, { "epoch": 1.058584030762271, "grad_norm": 0.27887162566185, "learning_rate": 9.849758189467567e-06, "loss": 0.5168, "step": 2340 }, { "epoch": 1.0590364171002036, "grad_norm": 0.2497147172689438, "learning_rate": 9.849579860908127e-06, "loss": 0.5112, "step": 2341 }, { "epoch": 1.0594888034381362, "grad_norm": 0.2810595631599426, "learning_rate": 9.849401428194713e-06, "loss": 0.5183, "step": 2342 }, { "epoch": 1.0599411897760687, "grad_norm": 0.30235183238983154, "learning_rate": 9.849222891331158e-06, "loss": 0.6353, "step": 2343 }, { "epoch": 1.0603935761140013, "grad_norm": 0.28833892941474915, "learning_rate": 9.849044250321296e-06, "loss": 0.642, "step": 2344 }, { "epoch": 1.060845962451934, "grad_norm": 0.3128480315208435, "learning_rate": 9.848865505168963e-06, "loss": 0.7397, "step": 2345 }, { "epoch": 1.0612983487898666, "grad_norm": 0.2772349417209625, "learning_rate": 9.848686655877999e-06, "loss": 0.5623, "step": 2346 }, { "epoch": 1.061750735127799, "grad_norm": 0.29785215854644775, "learning_rate": 9.848507702452243e-06, "loss": 0.5602, "step": 2347 }, { "epoch": 1.0622031214657317, "grad_norm": 0.31557029485702515, "learning_rate": 9.848328644895541e-06, "loss": 0.5866, "step": 2348 }, { "epoch": 1.0626555078036644, "grad_norm": 0.27385178208351135, "learning_rate": 9.84814948321174e-06, "loss": 0.5584, "step": 2349 }, { "epoch": 1.063107894141597, "grad_norm": 0.3132200241088867, "learning_rate": 9.847970217404681e-06, "loss": 0.5236, "step": 2350 }, { "epoch": 1.0635602804795294, "grad_norm": 0.29906606674194336, "learning_rate": 9.84779084747822e-06, "loss": 0.6078, "step": 2351 }, { "epoch": 1.064012666817462, "grad_norm": 0.34182897210121155, "learning_rate": 9.847611373436208e-06, "loss": 0.583, "step": 2352 }, { "epoch": 1.0644650531553947, "grad_norm": 0.3431265652179718, "learning_rate": 9.847431795282498e-06, "loss": 0.6023, "step": 2353 }, { "epoch": 1.0649174394933274, "grad_norm": 0.3112800717353821, "learning_rate": 9.84725211302095e-06, "loss": 0.563, "step": 2354 }, { "epoch": 1.0653698258312598, "grad_norm": 0.334804892539978, "learning_rate": 9.84707232665542e-06, "loss": 0.587, "step": 2355 }, { "epoch": 1.0658222121691925, "grad_norm": 0.31172433495521545, "learning_rate": 9.84689243618977e-06, "loss": 0.5482, "step": 2356 }, { "epoch": 1.0662745985071251, "grad_norm": 0.3267514407634735, "learning_rate": 9.846712441627864e-06, "loss": 0.5638, "step": 2357 }, { "epoch": 1.0667269848450576, "grad_norm": 0.3154197037220001, "learning_rate": 9.846532342973568e-06, "loss": 0.5177, "step": 2358 }, { "epoch": 1.0671793711829902, "grad_norm": 0.3685954213142395, "learning_rate": 9.846352140230748e-06, "loss": 0.5593, "step": 2359 }, { "epoch": 1.0676317575209229, "grad_norm": 0.35809093713760376, "learning_rate": 9.846171833403277e-06, "loss": 0.5046, "step": 2360 }, { "epoch": 1.0680841438588555, "grad_norm": 0.3170197606086731, "learning_rate": 9.845991422495027e-06, "loss": 0.9451, "step": 2361 }, { "epoch": 1.068536530196788, "grad_norm": 0.1269351989030838, "learning_rate": 9.84581090750987e-06, "loss": 0.8701, "step": 2362 }, { "epoch": 1.0689889165347206, "grad_norm": 0.2831563353538513, "learning_rate": 9.845630288451686e-06, "loss": 0.667, "step": 2363 }, { "epoch": 1.0694413028726533, "grad_norm": 0.17009875178337097, "learning_rate": 9.845449565324353e-06, "loss": 0.5672, "step": 2364 }, { "epoch": 1.069893689210586, "grad_norm": 0.1940547674894333, "learning_rate": 9.84526873813175e-06, "loss": 0.6723, "step": 2365 }, { "epoch": 1.0703460755485183, "grad_norm": 0.21851439774036407, "learning_rate": 9.845087806877764e-06, "loss": 0.6917, "step": 2366 }, { "epoch": 1.070798461886451, "grad_norm": 0.21021078526973724, "learning_rate": 9.84490677156628e-06, "loss": 0.6504, "step": 2367 }, { "epoch": 1.0712508482243837, "grad_norm": 0.2440629005432129, "learning_rate": 9.844725632201186e-06, "loss": 0.6688, "step": 2368 }, { "epoch": 1.0717032345623163, "grad_norm": 0.24595652520656586, "learning_rate": 9.844544388786372e-06, "loss": 0.6341, "step": 2369 }, { "epoch": 1.0721556209002487, "grad_norm": 0.20862068235874176, "learning_rate": 9.84436304132573e-06, "loss": 0.564, "step": 2370 }, { "epoch": 1.0726080072381814, "grad_norm": 0.2136753350496292, "learning_rate": 9.844181589823156e-06, "loss": 0.5885, "step": 2371 }, { "epoch": 1.073060393576114, "grad_norm": 0.19843965768814087, "learning_rate": 9.844000034282545e-06, "loss": 0.4501, "step": 2372 }, { "epoch": 1.0735127799140467, "grad_norm": 0.2334694117307663, "learning_rate": 9.843818374707799e-06, "loss": 0.5412, "step": 2373 }, { "epoch": 1.0739651662519791, "grad_norm": 0.24094034731388092, "learning_rate": 9.843636611102816e-06, "loss": 0.588, "step": 2374 }, { "epoch": 1.0744175525899118, "grad_norm": 0.2414669543504715, "learning_rate": 9.843454743471505e-06, "loss": 0.6564, "step": 2375 }, { "epoch": 1.0748699389278444, "grad_norm": 0.28983283042907715, "learning_rate": 9.843272771817766e-06, "loss": 0.6788, "step": 2376 }, { "epoch": 1.0753223252657769, "grad_norm": 0.2572443187236786, "learning_rate": 9.843090696145512e-06, "loss": 0.5717, "step": 2377 }, { "epoch": 1.0757747116037095, "grad_norm": 0.23151499032974243, "learning_rate": 9.842908516458652e-06, "loss": 0.593, "step": 2378 }, { "epoch": 1.0762270979416422, "grad_norm": 0.2505719065666199, "learning_rate": 9.842726232761096e-06, "loss": 0.6138, "step": 2379 }, { "epoch": 1.0766794842795748, "grad_norm": 0.3840274512767792, "learning_rate": 9.84254384505676e-06, "loss": 0.7627, "step": 2380 }, { "epoch": 1.0771318706175073, "grad_norm": 0.2344111204147339, "learning_rate": 9.842361353349566e-06, "loss": 0.5618, "step": 2381 }, { "epoch": 1.07758425695544, "grad_norm": 0.2424679547548294, "learning_rate": 9.842178757643426e-06, "loss": 0.5586, "step": 2382 }, { "epoch": 1.0780366432933726, "grad_norm": 0.2442195564508438, "learning_rate": 9.841996057942265e-06, "loss": 0.6357, "step": 2383 }, { "epoch": 1.0784890296313052, "grad_norm": 0.2626122832298279, "learning_rate": 9.841813254250009e-06, "loss": 0.5984, "step": 2384 }, { "epoch": 1.0789414159692376, "grad_norm": 0.2416083663702011, "learning_rate": 9.84163034657058e-06, "loss": 0.5669, "step": 2385 }, { "epoch": 1.0793938023071703, "grad_norm": 0.2543719708919525, "learning_rate": 9.84144733490791e-06, "loss": 0.5377, "step": 2386 }, { "epoch": 1.079846188645103, "grad_norm": 0.2518148422241211, "learning_rate": 9.841264219265927e-06, "loss": 0.5639, "step": 2387 }, { "epoch": 1.0802985749830356, "grad_norm": 0.24812883138656616, "learning_rate": 9.841080999648565e-06, "loss": 0.5587, "step": 2388 }, { "epoch": 1.080750961320968, "grad_norm": 0.25307947397232056, "learning_rate": 9.840897676059758e-06, "loss": 0.5584, "step": 2389 }, { "epoch": 1.0812033476589007, "grad_norm": 0.2501216530799866, "learning_rate": 9.840714248503443e-06, "loss": 0.5124, "step": 2390 }, { "epoch": 1.0816557339968333, "grad_norm": 0.2855483591556549, "learning_rate": 9.840530716983561e-06, "loss": 0.5811, "step": 2391 }, { "epoch": 1.082108120334766, "grad_norm": 0.3123599588871002, "learning_rate": 9.840347081504051e-06, "loss": 0.6384, "step": 2392 }, { "epoch": 1.0825605066726984, "grad_norm": 0.2706471085548401, "learning_rate": 9.84016334206886e-06, "loss": 0.5274, "step": 2393 }, { "epoch": 1.083012893010631, "grad_norm": 0.2865418493747711, "learning_rate": 9.839979498681935e-06, "loss": 0.5852, "step": 2394 }, { "epoch": 1.0834652793485637, "grad_norm": 0.27330490946769714, "learning_rate": 9.83979555134722e-06, "loss": 0.4944, "step": 2395 }, { "epoch": 1.0839176656864962, "grad_norm": 0.3339444696903229, "learning_rate": 9.839611500068667e-06, "loss": 0.6621, "step": 2396 }, { "epoch": 1.0843700520244288, "grad_norm": 0.2823750376701355, "learning_rate": 9.839427344850232e-06, "loss": 0.6016, "step": 2397 }, { "epoch": 1.0848224383623615, "grad_norm": 0.3039710223674774, "learning_rate": 9.839243085695866e-06, "loss": 0.5141, "step": 2398 }, { "epoch": 1.0852748247002941, "grad_norm": 0.29182738065719604, "learning_rate": 9.83905872260953e-06, "loss": 0.6217, "step": 2399 }, { "epoch": 1.0857272110382266, "grad_norm": 0.2983352839946747, "learning_rate": 9.83887425559518e-06, "loss": 0.619, "step": 2400 }, { "epoch": 1.0857272110382266, "eval_loss": 0.6122303009033203, "eval_runtime": 25.745, "eval_samples_per_second": 28.899, "eval_steps_per_second": 7.225, "step": 2400 }, { "epoch": 1.0861795973761592, "grad_norm": 0.3337234556674957, "learning_rate": 9.838689684656781e-06, "loss": 0.6712, "step": 2401 }, { "epoch": 1.0866319837140919, "grad_norm": 0.31108200550079346, "learning_rate": 9.838505009798295e-06, "loss": 0.5155, "step": 2402 }, { "epoch": 1.0870843700520245, "grad_norm": 0.31779491901397705, "learning_rate": 9.83832023102369e-06, "loss": 0.6017, "step": 2403 }, { "epoch": 1.087536756389957, "grad_norm": 0.3165717124938965, "learning_rate": 9.83813534833693e-06, "loss": 0.4522, "step": 2404 }, { "epoch": 1.0879891427278896, "grad_norm": 0.3181943893432617, "learning_rate": 9.837950361741993e-06, "loss": 0.4975, "step": 2405 }, { "epoch": 1.0884415290658223, "grad_norm": 0.33610349893569946, "learning_rate": 9.837765271242845e-06, "loss": 0.5633, "step": 2406 }, { "epoch": 1.088893915403755, "grad_norm": 0.3437788486480713, "learning_rate": 9.837580076843465e-06, "loss": 0.5548, "step": 2407 }, { "epoch": 1.0893463017416873, "grad_norm": 0.3641093373298645, "learning_rate": 9.837394778547828e-06, "loss": 0.5183, "step": 2408 }, { "epoch": 1.08979868807962, "grad_norm": 0.35946059226989746, "learning_rate": 9.837209376359918e-06, "loss": 0.4495, "step": 2409 }, { "epoch": 1.0902510744175526, "grad_norm": 0.445709764957428, "learning_rate": 9.83702387028371e-06, "loss": 0.5624, "step": 2410 }, { "epoch": 1.0907034607554853, "grad_norm": 0.36650002002716064, "learning_rate": 9.836838260323195e-06, "loss": 1.5239, "step": 2411 }, { "epoch": 1.0911558470934177, "grad_norm": 0.15454861521720886, "learning_rate": 9.836652546482356e-06, "loss": 0.6422, "step": 2412 }, { "epoch": 1.0916082334313504, "grad_norm": 0.17814257740974426, "learning_rate": 9.836466728765181e-06, "loss": 0.571, "step": 2413 }, { "epoch": 1.092060619769283, "grad_norm": 0.19099056720733643, "learning_rate": 9.836280807175662e-06, "loss": 0.5591, "step": 2414 }, { "epoch": 1.0925130061072155, "grad_norm": 0.2168884426355362, "learning_rate": 9.836094781717794e-06, "loss": 0.6548, "step": 2415 }, { "epoch": 1.0929653924451481, "grad_norm": 0.24330052733421326, "learning_rate": 9.835908652395568e-06, "loss": 0.6659, "step": 2416 }, { "epoch": 1.0934177787830808, "grad_norm": 0.20505402982234955, "learning_rate": 9.835722419212984e-06, "loss": 0.5758, "step": 2417 }, { "epoch": 1.0938701651210134, "grad_norm": 0.2124132513999939, "learning_rate": 9.83553608217404e-06, "loss": 0.6751, "step": 2418 }, { "epoch": 1.0943225514589459, "grad_norm": 0.22348397970199585, "learning_rate": 9.835349641282742e-06, "loss": 0.6001, "step": 2419 }, { "epoch": 1.0947749377968785, "grad_norm": 0.2671237885951996, "learning_rate": 9.835163096543088e-06, "loss": 0.6913, "step": 2420 }, { "epoch": 1.0952273241348112, "grad_norm": 0.25961804389953613, "learning_rate": 9.834976447959087e-06, "loss": 0.7416, "step": 2421 }, { "epoch": 1.0956797104727438, "grad_norm": 0.22707952558994293, "learning_rate": 9.834789695534753e-06, "loss": 0.6133, "step": 2422 }, { "epoch": 1.0961320968106762, "grad_norm": 0.23653554916381836, "learning_rate": 9.834602839274088e-06, "loss": 0.5719, "step": 2423 }, { "epoch": 1.096584483148609, "grad_norm": 0.237336203455925, "learning_rate": 9.834415879181112e-06, "loss": 0.6472, "step": 2424 }, { "epoch": 1.0970368694865416, "grad_norm": 0.22143177688121796, "learning_rate": 9.834228815259836e-06, "loss": 0.562, "step": 2425 }, { "epoch": 1.097489255824474, "grad_norm": 0.22407911717891693, "learning_rate": 9.83404164751428e-06, "loss": 0.5779, "step": 2426 }, { "epoch": 1.0979416421624066, "grad_norm": 0.2569247782230377, "learning_rate": 9.833854375948462e-06, "loss": 0.5817, "step": 2427 }, { "epoch": 1.0983940285003393, "grad_norm": 0.2361043244600296, "learning_rate": 9.833667000566408e-06, "loss": 0.6324, "step": 2428 }, { "epoch": 1.098846414838272, "grad_norm": 0.23936182260513306, "learning_rate": 9.833479521372136e-06, "loss": 0.5081, "step": 2429 }, { "epoch": 1.0992988011762044, "grad_norm": 0.25754815340042114, "learning_rate": 9.833291938369675e-06, "loss": 0.7809, "step": 2430 }, { "epoch": 1.099751187514137, "grad_norm": 0.2416962832212448, "learning_rate": 9.833104251563058e-06, "loss": 0.6137, "step": 2431 }, { "epoch": 1.1002035738520697, "grad_norm": 0.2612966299057007, "learning_rate": 9.83291646095631e-06, "loss": 0.6573, "step": 2432 }, { "epoch": 1.1006559601900023, "grad_norm": 0.24695676565170288, "learning_rate": 9.832728566553467e-06, "loss": 0.6242, "step": 2433 }, { "epoch": 1.1011083465279348, "grad_norm": 0.24518123269081116, "learning_rate": 9.832540568358563e-06, "loss": 0.627, "step": 2434 }, { "epoch": 1.1015607328658674, "grad_norm": 0.23947270214557648, "learning_rate": 9.832352466375637e-06, "loss": 0.5179, "step": 2435 }, { "epoch": 1.1020131192038, "grad_norm": 0.2487281709909439, "learning_rate": 9.83216426060873e-06, "loss": 0.5962, "step": 2436 }, { "epoch": 1.1024655055417327, "grad_norm": 0.26258572936058044, "learning_rate": 9.831975951061881e-06, "loss": 0.6057, "step": 2437 }, { "epoch": 1.1029178918796652, "grad_norm": 0.2530672550201416, "learning_rate": 9.831787537739135e-06, "loss": 0.607, "step": 2438 }, { "epoch": 1.1033702782175978, "grad_norm": 0.257447212934494, "learning_rate": 9.83159902064454e-06, "loss": 0.5918, "step": 2439 }, { "epoch": 1.1038226645555305, "grad_norm": 0.27570411562919617, "learning_rate": 9.831410399782144e-06, "loss": 0.5807, "step": 2440 }, { "epoch": 1.1042750508934631, "grad_norm": 0.26117369532585144, "learning_rate": 9.831221675155999e-06, "loss": 0.562, "step": 2441 }, { "epoch": 1.1047274372313955, "grad_norm": 0.3082123100757599, "learning_rate": 9.831032846770156e-06, "loss": 0.6232, "step": 2442 }, { "epoch": 1.1051798235693282, "grad_norm": 0.2596628963947296, "learning_rate": 9.830843914628672e-06, "loss": 0.5481, "step": 2443 }, { "epoch": 1.1056322099072609, "grad_norm": 0.26487812399864197, "learning_rate": 9.830654878735605e-06, "loss": 0.5517, "step": 2444 }, { "epoch": 1.1060845962451933, "grad_norm": 0.2862599194049835, "learning_rate": 9.830465739095015e-06, "loss": 0.568, "step": 2445 }, { "epoch": 1.106536982583126, "grad_norm": 0.2595489025115967, "learning_rate": 9.830276495710961e-06, "loss": 0.475, "step": 2446 }, { "epoch": 1.1069893689210586, "grad_norm": 0.2667175531387329, "learning_rate": 9.830087148587511e-06, "loss": 0.5907, "step": 2447 }, { "epoch": 1.1074417552589912, "grad_norm": 0.28451794385910034, "learning_rate": 9.82989769772873e-06, "loss": 0.5405, "step": 2448 }, { "epoch": 1.1078941415969237, "grad_norm": 0.2634422183036804, "learning_rate": 9.829708143138687e-06, "loss": 0.501, "step": 2449 }, { "epoch": 1.1083465279348563, "grad_norm": 0.2435303032398224, "learning_rate": 9.829518484821454e-06, "loss": 0.4031, "step": 2450 }, { "epoch": 1.108798914272789, "grad_norm": 0.30119588971138, "learning_rate": 9.829328722781104e-06, "loss": 0.5062, "step": 2451 }, { "epoch": 1.1092513006107216, "grad_norm": 0.2710387706756592, "learning_rate": 9.829138857021711e-06, "loss": 0.493, "step": 2452 }, { "epoch": 1.109703686948654, "grad_norm": 0.2896463871002197, "learning_rate": 9.828948887547354e-06, "loss": 0.5214, "step": 2453 }, { "epoch": 1.1101560732865867, "grad_norm": 0.32123976945877075, "learning_rate": 9.828758814362114e-06, "loss": 0.6079, "step": 2454 }, { "epoch": 1.1106084596245194, "grad_norm": 0.3064662516117096, "learning_rate": 9.828568637470071e-06, "loss": 0.5207, "step": 2455 }, { "epoch": 1.111060845962452, "grad_norm": 0.3403412699699402, "learning_rate": 9.82837835687531e-06, "loss": 0.5951, "step": 2456 }, { "epoch": 1.1115132323003845, "grad_norm": 0.35401636362075806, "learning_rate": 9.828187972581917e-06, "loss": 0.5873, "step": 2457 }, { "epoch": 1.111965618638317, "grad_norm": 0.32115301489830017, "learning_rate": 9.827997484593983e-06, "loss": 0.4675, "step": 2458 }, { "epoch": 1.1124180049762498, "grad_norm": 0.3858543038368225, "learning_rate": 9.8278068929156e-06, "loss": 0.5627, "step": 2459 }, { "epoch": 1.1128703913141824, "grad_norm": 0.35355573892593384, "learning_rate": 9.827616197550856e-06, "loss": 0.5258, "step": 2460 }, { "epoch": 1.1133227776521148, "grad_norm": 0.3586048483848572, "learning_rate": 9.82742539850385e-06, "loss": 0.9772, "step": 2461 }, { "epoch": 1.1137751639900475, "grad_norm": 0.16418220102787018, "learning_rate": 9.82723449577868e-06, "loss": 0.8346, "step": 2462 }, { "epoch": 1.1142275503279802, "grad_norm": 0.1543901562690735, "learning_rate": 9.827043489379447e-06, "loss": 0.6265, "step": 2463 }, { "epoch": 1.1146799366659126, "grad_norm": 0.1974102109670639, "learning_rate": 9.82685237931025e-06, "loss": 0.6647, "step": 2464 }, { "epoch": 1.1151323230038452, "grad_norm": 0.20150400698184967, "learning_rate": 9.826661165575195e-06, "loss": 0.7221, "step": 2465 }, { "epoch": 1.115584709341778, "grad_norm": 0.21320077776908875, "learning_rate": 9.82646984817839e-06, "loss": 0.7965, "step": 2466 }, { "epoch": 1.1160370956797105, "grad_norm": 0.21766555309295654, "learning_rate": 9.826278427123944e-06, "loss": 0.5361, "step": 2467 }, { "epoch": 1.116489482017643, "grad_norm": 0.2117602527141571, "learning_rate": 9.826086902415963e-06, "loss": 0.6523, "step": 2468 }, { "epoch": 1.1169418683555756, "grad_norm": 0.22314605116844177, "learning_rate": 9.825895274058567e-06, "loss": 0.6924, "step": 2469 }, { "epoch": 1.1173942546935083, "grad_norm": 0.20264467597007751, "learning_rate": 9.82570354205587e-06, "loss": 0.5791, "step": 2470 }, { "epoch": 1.117846641031441, "grad_norm": 0.22370809316635132, "learning_rate": 9.825511706411986e-06, "loss": 0.5915, "step": 2471 }, { "epoch": 1.1182990273693734, "grad_norm": 0.25417986512184143, "learning_rate": 9.82531976713104e-06, "loss": 0.7607, "step": 2472 }, { "epoch": 1.118751413707306, "grad_norm": 0.2577018439769745, "learning_rate": 9.82512772421715e-06, "loss": 0.616, "step": 2473 }, { "epoch": 1.1192038000452387, "grad_norm": 0.23840376734733582, "learning_rate": 9.824935577674444e-06, "loss": 0.5267, "step": 2474 }, { "epoch": 1.1196561863831713, "grad_norm": 0.20549261569976807, "learning_rate": 9.824743327507048e-06, "loss": 0.4893, "step": 2475 }, { "epoch": 1.1201085727211038, "grad_norm": 0.24152794480323792, "learning_rate": 9.824550973719087e-06, "loss": 0.6167, "step": 2476 }, { "epoch": 1.1205609590590364, "grad_norm": 0.269877165555954, "learning_rate": 9.824358516314698e-06, "loss": 0.7293, "step": 2477 }, { "epoch": 1.121013345396969, "grad_norm": 0.22773005068302155, "learning_rate": 9.824165955298011e-06, "loss": 0.5863, "step": 2478 }, { "epoch": 1.1214657317349017, "grad_norm": 0.25413239002227783, "learning_rate": 9.823973290673161e-06, "loss": 0.809, "step": 2479 }, { "epoch": 1.1219181180728341, "grad_norm": 0.22995854914188385, "learning_rate": 9.823780522444288e-06, "loss": 0.5079, "step": 2480 }, { "epoch": 1.1223705044107668, "grad_norm": 0.2620346248149872, "learning_rate": 9.823587650615532e-06, "loss": 0.6943, "step": 2481 }, { "epoch": 1.1228228907486995, "grad_norm": 0.2530684769153595, "learning_rate": 9.823394675191033e-06, "loss": 0.5931, "step": 2482 }, { "epoch": 1.1232752770866319, "grad_norm": 0.2863596975803375, "learning_rate": 9.823201596174938e-06, "loss": 0.6572, "step": 2483 }, { "epoch": 1.1237276634245645, "grad_norm": 0.2752523124217987, "learning_rate": 9.823008413571393e-06, "loss": 0.548, "step": 2484 }, { "epoch": 1.1241800497624972, "grad_norm": 0.2317802757024765, "learning_rate": 9.822815127384546e-06, "loss": 0.5405, "step": 2485 }, { "epoch": 1.1246324361004298, "grad_norm": 0.2727946937084198, "learning_rate": 9.82262173761855e-06, "loss": 0.4691, "step": 2486 }, { "epoch": 1.1250848224383623, "grad_norm": 0.2733858823776245, "learning_rate": 9.822428244277557e-06, "loss": 0.5497, "step": 2487 }, { "epoch": 1.125537208776295, "grad_norm": 0.25531116127967834, "learning_rate": 9.822234647365721e-06, "loss": 0.5584, "step": 2488 }, { "epoch": 1.1259895951142276, "grad_norm": 0.2675964832305908, "learning_rate": 9.822040946887203e-06, "loss": 0.5532, "step": 2489 }, { "epoch": 1.1264419814521602, "grad_norm": 0.23838219046592712, "learning_rate": 9.821847142846164e-06, "loss": 0.4445, "step": 2490 }, { "epoch": 1.1268943677900927, "grad_norm": 0.26327285170555115, "learning_rate": 9.821653235246761e-06, "loss": 0.5254, "step": 2491 }, { "epoch": 1.1273467541280253, "grad_norm": 0.29154691100120544, "learning_rate": 9.821459224093165e-06, "loss": 0.6169, "step": 2492 }, { "epoch": 1.127799140465958, "grad_norm": 0.26513898372650146, "learning_rate": 9.821265109389538e-06, "loss": 0.5659, "step": 2493 }, { "epoch": 1.1282515268038904, "grad_norm": 0.2949455678462982, "learning_rate": 9.82107089114005e-06, "loss": 0.5777, "step": 2494 }, { "epoch": 1.128703913141823, "grad_norm": 0.2738761305809021, "learning_rate": 9.820876569348871e-06, "loss": 0.5555, "step": 2495 }, { "epoch": 1.1291562994797557, "grad_norm": 0.2746478319168091, "learning_rate": 9.82068214402018e-06, "loss": 0.5442, "step": 2496 }, { "epoch": 1.1296086858176884, "grad_norm": 0.2905701994895935, "learning_rate": 9.820487615158145e-06, "loss": 0.5433, "step": 2497 }, { "epoch": 1.130061072155621, "grad_norm": 0.28367531299591064, "learning_rate": 9.820292982766951e-06, "loss": 0.5382, "step": 2498 }, { "epoch": 1.1305134584935534, "grad_norm": 0.2866228222846985, "learning_rate": 9.820098246850772e-06, "loss": 0.5858, "step": 2499 }, { "epoch": 1.130965844831486, "grad_norm": 0.28061574697494507, "learning_rate": 9.819903407413795e-06, "loss": 0.5048, "step": 2500 }, { "epoch": 1.1314182311694188, "grad_norm": 0.25696438550949097, "learning_rate": 9.819708464460202e-06, "loss": 0.5205, "step": 2501 }, { "epoch": 1.1318706175073512, "grad_norm": 0.2935366928577423, "learning_rate": 9.81951341799418e-06, "loss": 0.5737, "step": 2502 }, { "epoch": 1.1323230038452838, "grad_norm": 0.29552534222602844, "learning_rate": 9.819318268019919e-06, "loss": 0.49, "step": 2503 }, { "epoch": 1.1327753901832165, "grad_norm": 0.30825135111808777, "learning_rate": 9.81912301454161e-06, "loss": 0.4879, "step": 2504 }, { "epoch": 1.1332277765211491, "grad_norm": 0.26504072546958923, "learning_rate": 9.818927657563445e-06, "loss": 0.3982, "step": 2505 }, { "epoch": 1.1336801628590816, "grad_norm": 0.3153058588504791, "learning_rate": 9.81873219708962e-06, "loss": 0.5382, "step": 2506 }, { "epoch": 1.1341325491970142, "grad_norm": 0.3677627444267273, "learning_rate": 9.818536633124335e-06, "loss": 0.6334, "step": 2507 }, { "epoch": 1.1345849355349469, "grad_norm": 0.4268166422843933, "learning_rate": 9.818340965671789e-06, "loss": 0.6734, "step": 2508 }, { "epoch": 1.1350373218728795, "grad_norm": 0.40113845467567444, "learning_rate": 9.818145194736185e-06, "loss": 0.5688, "step": 2509 }, { "epoch": 1.135489708210812, "grad_norm": 0.36580830812454224, "learning_rate": 9.817949320321723e-06, "loss": 0.5926, "step": 2510 }, { "epoch": 1.1359420945487446, "grad_norm": 0.31409934163093567, "learning_rate": 9.817753342432616e-06, "loss": 1.2346, "step": 2511 }, { "epoch": 1.1363944808866773, "grad_norm": 0.14542542397975922, "learning_rate": 9.817557261073069e-06, "loss": 0.4676, "step": 2512 }, { "epoch": 1.1368468672246097, "grad_norm": 0.17192688584327698, "learning_rate": 9.817361076247296e-06, "loss": 0.5961, "step": 2513 }, { "epoch": 1.1372992535625424, "grad_norm": 0.21490991115570068, "learning_rate": 9.817164787959509e-06, "loss": 0.594, "step": 2514 }, { "epoch": 1.137751639900475, "grad_norm": 0.19709108769893646, "learning_rate": 9.816968396213924e-06, "loss": 0.671, "step": 2515 }, { "epoch": 1.1382040262384077, "grad_norm": 0.18243227899074554, "learning_rate": 9.816771901014756e-06, "loss": 0.5834, "step": 2516 }, { "epoch": 1.1386564125763403, "grad_norm": 0.22192460298538208, "learning_rate": 9.81657530236623e-06, "loss": 0.652, "step": 2517 }, { "epoch": 1.1391087989142727, "grad_norm": 0.23469610512256622, "learning_rate": 9.816378600272568e-06, "loss": 0.7552, "step": 2518 }, { "epoch": 1.1395611852522054, "grad_norm": 0.22023281455039978, "learning_rate": 9.81618179473799e-06, "loss": 0.6492, "step": 2519 }, { "epoch": 1.140013571590138, "grad_norm": 0.2270551174879074, "learning_rate": 9.815984885766724e-06, "loss": 0.6378, "step": 2520 }, { "epoch": 1.1404659579280705, "grad_norm": 0.22298814356327057, "learning_rate": 9.815787873363002e-06, "loss": 0.6994, "step": 2521 }, { "epoch": 1.1409183442660031, "grad_norm": 0.20856429636478424, "learning_rate": 9.815590757531054e-06, "loss": 0.5794, "step": 2522 }, { "epoch": 1.1413707306039358, "grad_norm": 0.23347941040992737, "learning_rate": 9.815393538275111e-06, "loss": 0.6033, "step": 2523 }, { "epoch": 1.1418231169418684, "grad_norm": 0.23001086711883545, "learning_rate": 9.815196215599414e-06, "loss": 0.6422, "step": 2524 }, { "epoch": 1.1422755032798009, "grad_norm": 0.2439178079366684, "learning_rate": 9.814998789508195e-06, "loss": 0.6558, "step": 2525 }, { "epoch": 1.1427278896177335, "grad_norm": 0.2376650869846344, "learning_rate": 9.814801260005697e-06, "loss": 0.727, "step": 2526 }, { "epoch": 1.1431802759556662, "grad_norm": 0.19957077503204346, "learning_rate": 9.814603627096163e-06, "loss": 0.4534, "step": 2527 }, { "epoch": 1.1436326622935988, "grad_norm": 0.258100688457489, "learning_rate": 9.814405890783836e-06, "loss": 0.668, "step": 2528 }, { "epoch": 1.1440850486315313, "grad_norm": 0.21670351922512054, "learning_rate": 9.814208051072963e-06, "loss": 0.4221, "step": 2529 }, { "epoch": 1.144537434969464, "grad_norm": 0.2595183253288269, "learning_rate": 9.814010107967793e-06, "loss": 0.5988, "step": 2530 }, { "epoch": 1.1449898213073966, "grad_norm": 0.2689942419528961, "learning_rate": 9.813812061472578e-06, "loss": 0.6846, "step": 2531 }, { "epoch": 1.145442207645329, "grad_norm": 0.24989621341228485, "learning_rate": 9.813613911591572e-06, "loss": 0.5814, "step": 2532 }, { "epoch": 1.1458945939832617, "grad_norm": 0.2696648836135864, "learning_rate": 9.813415658329029e-06, "loss": 0.5517, "step": 2533 }, { "epoch": 1.1463469803211943, "grad_norm": 0.2853918671607971, "learning_rate": 9.813217301689206e-06, "loss": 0.7324, "step": 2534 }, { "epoch": 1.146799366659127, "grad_norm": 0.2782808244228363, "learning_rate": 9.813018841676365e-06, "loss": 0.7025, "step": 2535 }, { "epoch": 1.1472517529970594, "grad_norm": 0.23570112884044647, "learning_rate": 9.812820278294769e-06, "loss": 0.5654, "step": 2536 }, { "epoch": 1.147704139334992, "grad_norm": 0.23406711220741272, "learning_rate": 9.812621611548681e-06, "loss": 0.5759, "step": 2537 }, { "epoch": 1.1481565256729247, "grad_norm": 0.2584210932254791, "learning_rate": 9.812422841442367e-06, "loss": 0.4809, "step": 2538 }, { "epoch": 1.1486089120108574, "grad_norm": 0.2631421387195587, "learning_rate": 9.812223967980096e-06, "loss": 0.5757, "step": 2539 }, { "epoch": 1.1490612983487898, "grad_norm": 0.28959617018699646, "learning_rate": 9.812024991166143e-06, "loss": 0.6344, "step": 2540 }, { "epoch": 1.1495136846867224, "grad_norm": 0.23412570357322693, "learning_rate": 9.811825911004777e-06, "loss": 0.5203, "step": 2541 }, { "epoch": 1.149966071024655, "grad_norm": 0.2646080255508423, "learning_rate": 9.811626727500277e-06, "loss": 0.5637, "step": 2542 }, { "epoch": 1.1504184573625877, "grad_norm": 0.29853710532188416, "learning_rate": 9.811427440656918e-06, "loss": 0.5199, "step": 2543 }, { "epoch": 1.1508708437005202, "grad_norm": 0.2520812451839447, "learning_rate": 9.811228050478981e-06, "loss": 0.4563, "step": 2544 }, { "epoch": 1.1513232300384528, "grad_norm": 0.26025477051734924, "learning_rate": 9.81102855697075e-06, "loss": 0.5013, "step": 2545 }, { "epoch": 1.1517756163763855, "grad_norm": 0.277617484331131, "learning_rate": 9.810828960136505e-06, "loss": 0.6033, "step": 2546 }, { "epoch": 1.1522280027143181, "grad_norm": 0.26456478238105774, "learning_rate": 9.810629259980537e-06, "loss": 0.5338, "step": 2547 }, { "epoch": 1.1526803890522506, "grad_norm": 0.31231269240379333, "learning_rate": 9.810429456507134e-06, "loss": 0.5194, "step": 2548 }, { "epoch": 1.1531327753901832, "grad_norm": 0.36271119117736816, "learning_rate": 9.810229549720587e-06, "loss": 0.6258, "step": 2549 }, { "epoch": 1.1535851617281159, "grad_norm": 0.3208990693092346, "learning_rate": 9.810029539625189e-06, "loss": 0.643, "step": 2550 }, { "epoch": 1.1540375480660483, "grad_norm": 0.32507413625717163, "learning_rate": 9.809829426225234e-06, "loss": 0.6161, "step": 2551 }, { "epoch": 1.154489934403981, "grad_norm": 0.3005486726760864, "learning_rate": 9.809629209525024e-06, "loss": 0.5748, "step": 2552 }, { "epoch": 1.1549423207419136, "grad_norm": 0.3254006803035736, "learning_rate": 9.809428889528856e-06, "loss": 0.5679, "step": 2553 }, { "epoch": 1.1553947070798463, "grad_norm": 0.35544267296791077, "learning_rate": 9.809228466241033e-06, "loss": 0.5714, "step": 2554 }, { "epoch": 1.1558470934177787, "grad_norm": 0.3053162693977356, "learning_rate": 9.809027939665859e-06, "loss": 0.4667, "step": 2555 }, { "epoch": 1.1562994797557113, "grad_norm": 0.3476189374923706, "learning_rate": 9.80882730980764e-06, "loss": 0.5243, "step": 2556 }, { "epoch": 1.156751866093644, "grad_norm": 0.38419073820114136, "learning_rate": 9.808626576670689e-06, "loss": 0.5287, "step": 2557 }, { "epoch": 1.1572042524315767, "grad_norm": 0.4111350476741791, "learning_rate": 9.808425740259312e-06, "loss": 0.6009, "step": 2558 }, { "epoch": 1.157656638769509, "grad_norm": 0.3755585253238678, "learning_rate": 9.808224800577824e-06, "loss": 0.5973, "step": 2559 }, { "epoch": 1.1581090251074417, "grad_norm": 0.44885027408599854, "learning_rate": 9.808023757630542e-06, "loss": 0.5557, "step": 2560 }, { "epoch": 1.1585614114453744, "grad_norm": 0.32568517327308655, "learning_rate": 9.807822611421783e-06, "loss": 1.047, "step": 2561 }, { "epoch": 1.1590137977833068, "grad_norm": 0.1438056081533432, "learning_rate": 9.807621361955865e-06, "loss": 0.6143, "step": 2562 }, { "epoch": 1.1594661841212395, "grad_norm": 0.17755864560604095, "learning_rate": 9.807420009237115e-06, "loss": 0.6064, "step": 2563 }, { "epoch": 1.1599185704591721, "grad_norm": 0.17420797049999237, "learning_rate": 9.80721855326985e-06, "loss": 0.5993, "step": 2564 }, { "epoch": 1.1603709567971048, "grad_norm": 0.1819082349538803, "learning_rate": 9.807016994058405e-06, "loss": 0.5699, "step": 2565 }, { "epoch": 1.1608233431350374, "grad_norm": 0.2002139687538147, "learning_rate": 9.806815331607104e-06, "loss": 0.6176, "step": 2566 }, { "epoch": 1.1612757294729699, "grad_norm": 0.21431107819080353, "learning_rate": 9.806613565920277e-06, "loss": 0.6129, "step": 2567 }, { "epoch": 1.1617281158109025, "grad_norm": 0.19741497933864594, "learning_rate": 9.806411697002262e-06, "loss": 0.554, "step": 2568 }, { "epoch": 1.1621805021488352, "grad_norm": 0.213655486702919, "learning_rate": 9.80620972485739e-06, "loss": 0.6348, "step": 2569 }, { "epoch": 1.1626328884867676, "grad_norm": 0.2553747296333313, "learning_rate": 9.806007649490002e-06, "loss": 0.7956, "step": 2570 }, { "epoch": 1.1630852748247003, "grad_norm": 0.2271839827299118, "learning_rate": 9.805805470904435e-06, "loss": 0.6416, "step": 2571 }, { "epoch": 1.163537661162633, "grad_norm": 0.2301003485918045, "learning_rate": 9.805603189105035e-06, "loss": 0.6809, "step": 2572 }, { "epoch": 1.1639900475005656, "grad_norm": 0.24781273305416107, "learning_rate": 9.805400804096142e-06, "loss": 0.6821, "step": 2573 }, { "epoch": 1.164442433838498, "grad_norm": 0.241263285279274, "learning_rate": 9.805198315882105e-06, "loss": 0.7107, "step": 2574 }, { "epoch": 1.1648948201764306, "grad_norm": 0.23734940588474274, "learning_rate": 9.804995724467273e-06, "loss": 0.6146, "step": 2575 }, { "epoch": 1.1653472065143633, "grad_norm": 0.2842394709587097, "learning_rate": 9.804793029855996e-06, "loss": 0.8012, "step": 2576 }, { "epoch": 1.165799592852296, "grad_norm": 0.22866939008235931, "learning_rate": 9.804590232052628e-06, "loss": 0.5751, "step": 2577 }, { "epoch": 1.1662519791902284, "grad_norm": 0.25249090790748596, "learning_rate": 9.804387331061525e-06, "loss": 0.6611, "step": 2578 }, { "epoch": 1.166704365528161, "grad_norm": 0.23642081022262573, "learning_rate": 9.804184326887043e-06, "loss": 0.5964, "step": 2579 }, { "epoch": 1.1671567518660937, "grad_norm": 0.26440155506134033, "learning_rate": 9.803981219533545e-06, "loss": 0.6975, "step": 2580 }, { "epoch": 1.1676091382040261, "grad_norm": 0.2610516846179962, "learning_rate": 9.803778009005387e-06, "loss": 0.586, "step": 2581 }, { "epoch": 1.1680615245419588, "grad_norm": 0.24549221992492676, "learning_rate": 9.803574695306942e-06, "loss": 0.6145, "step": 2582 }, { "epoch": 1.1685139108798914, "grad_norm": 0.30423736572265625, "learning_rate": 9.803371278442568e-06, "loss": 0.7242, "step": 2583 }, { "epoch": 1.168966297217824, "grad_norm": 0.2626190781593323, "learning_rate": 9.803167758416639e-06, "loss": 0.5828, "step": 2584 }, { "epoch": 1.1694186835557567, "grad_norm": 0.2825044095516205, "learning_rate": 9.802964135233525e-06, "loss": 0.6488, "step": 2585 }, { "epoch": 1.1698710698936892, "grad_norm": 0.2671588659286499, "learning_rate": 9.802760408897597e-06, "loss": 0.5725, "step": 2586 }, { "epoch": 1.1703234562316218, "grad_norm": 0.27012062072753906, "learning_rate": 9.802556579413234e-06, "loss": 0.642, "step": 2587 }, { "epoch": 1.1707758425695545, "grad_norm": 0.26430830359458923, "learning_rate": 9.802352646784812e-06, "loss": 0.584, "step": 2588 }, { "epoch": 1.171228228907487, "grad_norm": 0.249406099319458, "learning_rate": 9.80214861101671e-06, "loss": 0.5584, "step": 2589 }, { "epoch": 1.1716806152454196, "grad_norm": 0.25454872846603394, "learning_rate": 9.801944472113312e-06, "loss": 0.5335, "step": 2590 }, { "epoch": 1.1721330015833522, "grad_norm": 0.2685597538948059, "learning_rate": 9.801740230079e-06, "loss": 0.5245, "step": 2591 }, { "epoch": 1.1725853879212849, "grad_norm": 0.2539985179901123, "learning_rate": 9.801535884918161e-06, "loss": 0.6108, "step": 2592 }, { "epoch": 1.1730377742592173, "grad_norm": 0.26760777831077576, "learning_rate": 9.801331436635185e-06, "loss": 0.5808, "step": 2593 }, { "epoch": 1.17349016059715, "grad_norm": 0.29942238330841064, "learning_rate": 9.801126885234461e-06, "loss": 0.6898, "step": 2594 }, { "epoch": 1.1739425469350826, "grad_norm": 0.2918166518211365, "learning_rate": 9.800922230720384e-06, "loss": 0.6123, "step": 2595 }, { "epoch": 1.1743949332730153, "grad_norm": 0.3120470345020294, "learning_rate": 9.80071747309735e-06, "loss": 0.5865, "step": 2596 }, { "epoch": 1.1748473196109477, "grad_norm": 0.2866261899471283, "learning_rate": 9.800512612369752e-06, "loss": 0.519, "step": 2597 }, { "epoch": 1.1752997059488803, "grad_norm": 0.29575663805007935, "learning_rate": 9.800307648541995e-06, "loss": 0.5759, "step": 2598 }, { "epoch": 1.175752092286813, "grad_norm": 0.2746128737926483, "learning_rate": 9.80010258161848e-06, "loss": 0.5562, "step": 2599 }, { "epoch": 1.1762044786247454, "grad_norm": 0.2846544682979584, "learning_rate": 9.799897411603608e-06, "loss": 0.5068, "step": 2600 }, { "epoch": 1.1762044786247454, "eval_loss": 0.6098482012748718, "eval_runtime": 25.5868, "eval_samples_per_second": 29.078, "eval_steps_per_second": 7.269, "step": 2600 }, { "epoch": 1.176656864962678, "grad_norm": 0.3149385452270508, "learning_rate": 9.799692138501788e-06, "loss": 0.617, "step": 2601 }, { "epoch": 1.1771092513006107, "grad_norm": 0.3524077534675598, "learning_rate": 9.79948676231743e-06, "loss": 0.589, "step": 2602 }, { "epoch": 1.1775616376385434, "grad_norm": 0.2798033654689789, "learning_rate": 9.79928128305494e-06, "loss": 0.4276, "step": 2603 }, { "epoch": 1.178014023976476, "grad_norm": 0.34506192803382874, "learning_rate": 9.799075700718737e-06, "loss": 0.4467, "step": 2604 }, { "epoch": 1.1784664103144085, "grad_norm": 0.34672707319259644, "learning_rate": 9.79887001531323e-06, "loss": 0.673, "step": 2605 }, { "epoch": 1.1789187966523411, "grad_norm": 0.2969883680343628, "learning_rate": 9.798664226842843e-06, "loss": 0.4946, "step": 2606 }, { "epoch": 1.1793711829902738, "grad_norm": 0.3654555082321167, "learning_rate": 9.798458335311991e-06, "loss": 0.6061, "step": 2607 }, { "epoch": 1.1798235693282062, "grad_norm": 0.40429526567459106, "learning_rate": 9.798252340725098e-06, "loss": 0.5639, "step": 2608 }, { "epoch": 1.1802759556661389, "grad_norm": 0.3352474570274353, "learning_rate": 9.798046243086588e-06, "loss": 0.4769, "step": 2609 }, { "epoch": 1.1807283420040715, "grad_norm": 0.4351049065589905, "learning_rate": 9.797840042400886e-06, "loss": 0.5892, "step": 2610 }, { "epoch": 1.1811807283420042, "grad_norm": 0.2866094410419464, "learning_rate": 9.797633738672422e-06, "loss": 1.1047, "step": 2611 }, { "epoch": 1.1816331146799366, "grad_norm": 0.1581403613090515, "learning_rate": 9.797427331905626e-06, "loss": 1.0439, "step": 2612 }, { "epoch": 1.1820855010178692, "grad_norm": 0.15168216824531555, "learning_rate": 9.797220822104932e-06, "loss": 0.684, "step": 2613 }, { "epoch": 1.182537887355802, "grad_norm": 0.22462120652198792, "learning_rate": 9.797014209274773e-06, "loss": 0.8654, "step": 2614 }, { "epoch": 1.1829902736937346, "grad_norm": 0.20756413042545319, "learning_rate": 9.796807493419587e-06, "loss": 0.6548, "step": 2615 }, { "epoch": 1.183442660031667, "grad_norm": 0.1997707188129425, "learning_rate": 9.796600674543817e-06, "loss": 0.6436, "step": 2616 }, { "epoch": 1.1838950463695996, "grad_norm": 0.18231651186943054, "learning_rate": 9.7963937526519e-06, "loss": 0.5456, "step": 2617 }, { "epoch": 1.1843474327075323, "grad_norm": 0.18461599946022034, "learning_rate": 9.796186727748285e-06, "loss": 0.5684, "step": 2618 }, { "epoch": 1.1847998190454647, "grad_norm": 0.22200699150562286, "learning_rate": 9.795979599837413e-06, "loss": 0.6849, "step": 2619 }, { "epoch": 1.1852522053833974, "grad_norm": 0.21882492303848267, "learning_rate": 9.795772368923737e-06, "loss": 0.7099, "step": 2620 }, { "epoch": 1.18570459172133, "grad_norm": 0.21284621953964233, "learning_rate": 9.795565035011704e-06, "loss": 0.6201, "step": 2621 }, { "epoch": 1.1861569780592627, "grad_norm": 0.18618664145469666, "learning_rate": 9.795357598105768e-06, "loss": 0.4886, "step": 2622 }, { "epoch": 1.186609364397195, "grad_norm": 0.20074708759784698, "learning_rate": 9.795150058210388e-06, "loss": 0.5922, "step": 2623 }, { "epoch": 1.1870617507351278, "grad_norm": 0.2554704248905182, "learning_rate": 9.794942415330015e-06, "loss": 0.6208, "step": 2624 }, { "epoch": 1.1875141370730604, "grad_norm": 0.2683351933956146, "learning_rate": 9.794734669469113e-06, "loss": 0.7402, "step": 2625 }, { "epoch": 1.187966523410993, "grad_norm": 0.2443404644727707, "learning_rate": 9.79452682063214e-06, "loss": 0.5724, "step": 2626 }, { "epoch": 1.1884189097489255, "grad_norm": 0.24949145317077637, "learning_rate": 9.794318868823564e-06, "loss": 0.5948, "step": 2627 }, { "epoch": 1.1888712960868582, "grad_norm": 0.23997601866722107, "learning_rate": 9.79411081404785e-06, "loss": 0.5767, "step": 2628 }, { "epoch": 1.1893236824247908, "grad_norm": 0.24214860796928406, "learning_rate": 9.793902656309466e-06, "loss": 0.526, "step": 2629 }, { "epoch": 1.1897760687627235, "grad_norm": 0.24755166471004486, "learning_rate": 9.79369439561288e-06, "loss": 0.6036, "step": 2630 }, { "epoch": 1.190228455100656, "grad_norm": 0.25581276416778564, "learning_rate": 9.793486031962568e-06, "loss": 0.6233, "step": 2631 }, { "epoch": 1.1906808414385885, "grad_norm": 0.24254432320594788, "learning_rate": 9.793277565363007e-06, "loss": 0.5685, "step": 2632 }, { "epoch": 1.1911332277765212, "grad_norm": 0.28853270411491394, "learning_rate": 9.793068995818669e-06, "loss": 0.7017, "step": 2633 }, { "epoch": 1.1915856141144539, "grad_norm": 0.27174392342567444, "learning_rate": 9.792860323334034e-06, "loss": 0.7311, "step": 2634 }, { "epoch": 1.1920380004523863, "grad_norm": 0.2724166512489319, "learning_rate": 9.792651547913587e-06, "loss": 0.6578, "step": 2635 }, { "epoch": 1.192490386790319, "grad_norm": 0.26329293847084045, "learning_rate": 9.792442669561813e-06, "loss": 0.5662, "step": 2636 }, { "epoch": 1.1929427731282516, "grad_norm": 0.2838033437728882, "learning_rate": 9.792233688283192e-06, "loss": 0.6081, "step": 2637 }, { "epoch": 1.193395159466184, "grad_norm": 0.24936695396900177, "learning_rate": 9.792024604082217e-06, "loss": 0.5724, "step": 2638 }, { "epoch": 1.1938475458041167, "grad_norm": 0.2756723463535309, "learning_rate": 9.791815416963375e-06, "loss": 0.5387, "step": 2639 }, { "epoch": 1.1942999321420493, "grad_norm": 0.32582348585128784, "learning_rate": 9.791606126931163e-06, "loss": 0.7496, "step": 2640 }, { "epoch": 1.194752318479982, "grad_norm": 0.3300988972187042, "learning_rate": 9.791396733990073e-06, "loss": 0.7418, "step": 2641 }, { "epoch": 1.1952047048179144, "grad_norm": 0.29033032059669495, "learning_rate": 9.791187238144604e-06, "loss": 0.538, "step": 2642 }, { "epoch": 1.195657091155847, "grad_norm": 0.28217560052871704, "learning_rate": 9.790977639399253e-06, "loss": 0.5757, "step": 2643 }, { "epoch": 1.1961094774937797, "grad_norm": 0.26764386892318726, "learning_rate": 9.790767937758522e-06, "loss": 0.5486, "step": 2644 }, { "epoch": 1.1965618638317124, "grad_norm": 0.25737226009368896, "learning_rate": 9.790558133226916e-06, "loss": 0.476, "step": 2645 }, { "epoch": 1.1970142501696448, "grad_norm": 0.2976669669151306, "learning_rate": 9.790348225808941e-06, "loss": 0.6769, "step": 2646 }, { "epoch": 1.1974666365075775, "grad_norm": 0.263624906539917, "learning_rate": 9.790138215509104e-06, "loss": 0.4752, "step": 2647 }, { "epoch": 1.19791902284551, "grad_norm": 0.2926979660987854, "learning_rate": 9.789928102331916e-06, "loss": 0.6428, "step": 2648 }, { "epoch": 1.1983714091834425, "grad_norm": 0.287448912858963, "learning_rate": 9.78971788628189e-06, "loss": 0.5667, "step": 2649 }, { "epoch": 1.1988237955213752, "grad_norm": 0.30107995867729187, "learning_rate": 9.789507567363538e-06, "loss": 0.5414, "step": 2650 }, { "epoch": 1.1992761818593078, "grad_norm": 0.3010142743587494, "learning_rate": 9.789297145581382e-06, "loss": 0.54, "step": 2651 }, { "epoch": 1.1997285681972405, "grad_norm": 0.30980929732322693, "learning_rate": 9.789086620939936e-06, "loss": 0.5353, "step": 2652 }, { "epoch": 1.2001809545351732, "grad_norm": 0.32068049907684326, "learning_rate": 9.788875993443724e-06, "loss": 0.5262, "step": 2653 }, { "epoch": 1.2006333408731056, "grad_norm": 0.3272947072982788, "learning_rate": 9.788665263097272e-06, "loss": 0.5743, "step": 2654 }, { "epoch": 1.2010857272110382, "grad_norm": 0.3413935899734497, "learning_rate": 9.788454429905101e-06, "loss": 0.5496, "step": 2655 }, { "epoch": 1.2015381135489709, "grad_norm": 0.33718597888946533, "learning_rate": 9.788243493871743e-06, "loss": 0.4905, "step": 2656 }, { "epoch": 1.2019904998869033, "grad_norm": 0.40672779083251953, "learning_rate": 9.788032455001726e-06, "loss": 0.5157, "step": 2657 }, { "epoch": 1.202442886224836, "grad_norm": 0.38970276713371277, "learning_rate": 9.787821313299583e-06, "loss": 0.5143, "step": 2658 }, { "epoch": 1.2028952725627686, "grad_norm": 0.36625224351882935, "learning_rate": 9.787610068769851e-06, "loss": 0.4974, "step": 2659 }, { "epoch": 1.2033476589007013, "grad_norm": 0.4209536910057068, "learning_rate": 9.78739872141706e-06, "loss": 0.5388, "step": 2660 }, { "epoch": 1.2038000452386337, "grad_norm": 0.3067794144153595, "learning_rate": 9.787187271245757e-06, "loss": 0.8956, "step": 2661 }, { "epoch": 1.2042524315765664, "grad_norm": 0.16084101796150208, "learning_rate": 9.786975718260481e-06, "loss": 1.0455, "step": 2662 }, { "epoch": 1.204704817914499, "grad_norm": 0.1751405894756317, "learning_rate": 9.786764062465772e-06, "loss": 0.6255, "step": 2663 }, { "epoch": 1.2051572042524317, "grad_norm": 0.20420904457569122, "learning_rate": 9.78655230386618e-06, "loss": 0.7424, "step": 2664 }, { "epoch": 1.205609590590364, "grad_norm": 0.1661645621061325, "learning_rate": 9.786340442466248e-06, "loss": 0.5878, "step": 2665 }, { "epoch": 1.2060619769282968, "grad_norm": 0.19213740527629852, "learning_rate": 9.786128478270534e-06, "loss": 0.6402, "step": 2666 }, { "epoch": 1.2065143632662294, "grad_norm": 0.2354833483695984, "learning_rate": 9.785916411283583e-06, "loss": 0.7261, "step": 2667 }, { "epoch": 1.2069667496041618, "grad_norm": 0.198683500289917, "learning_rate": 9.785704241509952e-06, "loss": 0.618, "step": 2668 }, { "epoch": 1.2074191359420945, "grad_norm": 0.18625035881996155, "learning_rate": 9.785491968954199e-06, "loss": 0.5923, "step": 2669 }, { "epoch": 1.2078715222800271, "grad_norm": 0.2029581516981125, "learning_rate": 9.78527959362088e-06, "loss": 0.5823, "step": 2670 }, { "epoch": 1.2083239086179598, "grad_norm": 0.21315819025039673, "learning_rate": 9.785067115514559e-06, "loss": 0.5927, "step": 2671 }, { "epoch": 1.2087762949558924, "grad_norm": 0.21781376004219055, "learning_rate": 9.7848545346398e-06, "loss": 0.58, "step": 2672 }, { "epoch": 1.2092286812938249, "grad_norm": 0.21460093557834625, "learning_rate": 9.784641851001166e-06, "loss": 0.6573, "step": 2673 }, { "epoch": 1.2096810676317575, "grad_norm": 0.20104597508907318, "learning_rate": 9.784429064603225e-06, "loss": 0.5007, "step": 2674 }, { "epoch": 1.2101334539696902, "grad_norm": 0.25274133682250977, "learning_rate": 9.784216175450548e-06, "loss": 0.7158, "step": 2675 }, { "epoch": 1.2105858403076226, "grad_norm": 0.2598205506801605, "learning_rate": 9.784003183547708e-06, "loss": 0.5695, "step": 2676 }, { "epoch": 1.2110382266455553, "grad_norm": 0.25020071864128113, "learning_rate": 9.783790088899278e-06, "loss": 0.7562, "step": 2677 }, { "epoch": 1.211490612983488, "grad_norm": 0.23188763856887817, "learning_rate": 9.783576891509834e-06, "loss": 0.6544, "step": 2678 }, { "epoch": 1.2119429993214206, "grad_norm": 0.26617324352264404, "learning_rate": 9.783363591383956e-06, "loss": 0.6646, "step": 2679 }, { "epoch": 1.212395385659353, "grad_norm": 0.26586514711380005, "learning_rate": 9.783150188526225e-06, "loss": 0.7897, "step": 2680 }, { "epoch": 1.2128477719972857, "grad_norm": 0.24336515367031097, "learning_rate": 9.782936682941226e-06, "loss": 0.5756, "step": 2681 }, { "epoch": 1.2133001583352183, "grad_norm": 0.23046410083770752, "learning_rate": 9.78272307463354e-06, "loss": 0.5018, "step": 2682 }, { "epoch": 1.213752544673151, "grad_norm": 0.24654120206832886, "learning_rate": 9.782509363607759e-06, "loss": 0.6066, "step": 2683 }, { "epoch": 1.2142049310110834, "grad_norm": 0.2911880910396576, "learning_rate": 9.78229554986847e-06, "loss": 0.6691, "step": 2684 }, { "epoch": 1.214657317349016, "grad_norm": 0.2581994831562042, "learning_rate": 9.782081633420266e-06, "loss": 0.6061, "step": 2685 }, { "epoch": 1.2151097036869487, "grad_norm": 0.25860607624053955, "learning_rate": 9.781867614267741e-06, "loss": 0.6287, "step": 2686 }, { "epoch": 1.2155620900248811, "grad_norm": 0.25601693987846375, "learning_rate": 9.781653492415494e-06, "loss": 0.5787, "step": 2687 }, { "epoch": 1.2160144763628138, "grad_norm": 0.264041930437088, "learning_rate": 9.781439267868121e-06, "loss": 0.6407, "step": 2688 }, { "epoch": 1.2164668627007464, "grad_norm": 0.27877944707870483, "learning_rate": 9.781224940630221e-06, "loss": 0.6094, "step": 2689 }, { "epoch": 1.216919249038679, "grad_norm": 0.24958817660808563, "learning_rate": 9.781010510706402e-06, "loss": 0.5095, "step": 2690 }, { "epoch": 1.2173716353766117, "grad_norm": 0.3063572943210602, "learning_rate": 9.780795978101265e-06, "loss": 0.6793, "step": 2691 }, { "epoch": 1.2178240217145442, "grad_norm": 0.26387014985084534, "learning_rate": 9.78058134281942e-06, "loss": 0.5778, "step": 2692 }, { "epoch": 1.2182764080524768, "grad_norm": 0.2565646171569824, "learning_rate": 9.780366604865475e-06, "loss": 0.4929, "step": 2693 }, { "epoch": 1.2187287943904095, "grad_norm": 0.32384422421455383, "learning_rate": 9.780151764244043e-06, "loss": 0.5989, "step": 2694 }, { "epoch": 1.219181180728342, "grad_norm": 0.30421876907348633, "learning_rate": 9.77993682095974e-06, "loss": 0.613, "step": 2695 }, { "epoch": 1.2196335670662746, "grad_norm": 0.26211652159690857, "learning_rate": 9.779721775017177e-06, "loss": 0.5087, "step": 2696 }, { "epoch": 1.2200859534042072, "grad_norm": 0.3003659248352051, "learning_rate": 9.779506626420978e-06, "loss": 0.6086, "step": 2697 }, { "epoch": 1.2205383397421399, "grad_norm": 0.2906612157821655, "learning_rate": 9.779291375175761e-06, "loss": 0.5871, "step": 2698 }, { "epoch": 1.2209907260800723, "grad_norm": 0.3008609414100647, "learning_rate": 9.77907602128615e-06, "loss": 0.4941, "step": 2699 }, { "epoch": 1.221443112418005, "grad_norm": 0.29604169726371765, "learning_rate": 9.778860564756769e-06, "loss": 0.5473, "step": 2700 }, { "epoch": 1.2218954987559376, "grad_norm": 0.25814300775527954, "learning_rate": 9.778645005592247e-06, "loss": 0.4607, "step": 2701 }, { "epoch": 1.2223478850938703, "grad_norm": 0.26684871315956116, "learning_rate": 9.778429343797211e-06, "loss": 0.5336, "step": 2702 }, { "epoch": 1.2228002714318027, "grad_norm": 0.2889462113380432, "learning_rate": 9.778213579376295e-06, "loss": 0.5126, "step": 2703 }, { "epoch": 1.2232526577697354, "grad_norm": 0.32200923562049866, "learning_rate": 9.777997712334133e-06, "loss": 0.4998, "step": 2704 }, { "epoch": 1.223705044107668, "grad_norm": 0.28773587942123413, "learning_rate": 9.777781742675359e-06, "loss": 0.4669, "step": 2705 }, { "epoch": 1.2241574304456004, "grad_norm": 0.34583839774131775, "learning_rate": 9.777565670404611e-06, "loss": 0.6233, "step": 2706 }, { "epoch": 1.224609816783533, "grad_norm": 0.3283802270889282, "learning_rate": 9.777349495526535e-06, "loss": 0.4576, "step": 2707 }, { "epoch": 1.2250622031214657, "grad_norm": 0.33200719952583313, "learning_rate": 9.77713321804577e-06, "loss": 0.5312, "step": 2708 }, { "epoch": 1.2255145894593984, "grad_norm": 0.358233243227005, "learning_rate": 9.776916837966957e-06, "loss": 0.4857, "step": 2709 }, { "epoch": 1.2259669757973308, "grad_norm": 0.45902401208877563, "learning_rate": 9.77670035529475e-06, "loss": 0.6637, "step": 2710 }, { "epoch": 1.2264193621352635, "grad_norm": 0.4268685281276703, "learning_rate": 9.776483770033795e-06, "loss": 1.1715, "step": 2711 }, { "epoch": 1.2268717484731961, "grad_norm": 0.17808614671230316, "learning_rate": 9.776267082188743e-06, "loss": 1.0442, "step": 2712 }, { "epoch": 1.2273241348111288, "grad_norm": 0.18543362617492676, "learning_rate": 9.776050291764252e-06, "loss": 0.8422, "step": 2713 }, { "epoch": 1.2277765211490612, "grad_norm": 0.19164355099201202, "learning_rate": 9.775833398764974e-06, "loss": 0.5169, "step": 2714 }, { "epoch": 1.2282289074869939, "grad_norm": 0.17044372856616974, "learning_rate": 9.775616403195566e-06, "loss": 0.4798, "step": 2715 }, { "epoch": 1.2286812938249265, "grad_norm": 0.21611164510250092, "learning_rate": 9.775399305060692e-06, "loss": 0.654, "step": 2716 }, { "epoch": 1.2291336801628592, "grad_norm": 0.2410590499639511, "learning_rate": 9.775182104365013e-06, "loss": 0.6952, "step": 2717 }, { "epoch": 1.2295860665007916, "grad_norm": 0.23631104826927185, "learning_rate": 9.774964801113194e-06, "loss": 0.7752, "step": 2718 }, { "epoch": 1.2300384528387243, "grad_norm": 0.2560649514198303, "learning_rate": 9.7747473953099e-06, "loss": 0.6497, "step": 2719 }, { "epoch": 1.230490839176657, "grad_norm": 0.22625499963760376, "learning_rate": 9.774529886959806e-06, "loss": 0.5816, "step": 2720 }, { "epoch": 1.2309432255145896, "grad_norm": 0.22201582789421082, "learning_rate": 9.774312276067575e-06, "loss": 0.6057, "step": 2721 }, { "epoch": 1.231395611852522, "grad_norm": 0.24732157588005066, "learning_rate": 9.774094562637888e-06, "loss": 0.5286, "step": 2722 }, { "epoch": 1.2318479981904547, "grad_norm": 0.24428527057170868, "learning_rate": 9.773876746675417e-06, "loss": 0.5232, "step": 2723 }, { "epoch": 1.2323003845283873, "grad_norm": 0.2620784640312195, "learning_rate": 9.77365882818484e-06, "loss": 0.5585, "step": 2724 }, { "epoch": 1.2327527708663197, "grad_norm": 0.2665769159793854, "learning_rate": 9.77344080717084e-06, "loss": 0.6259, "step": 2725 }, { "epoch": 1.2332051572042524, "grad_norm": 0.25363045930862427, "learning_rate": 9.773222683638098e-06, "loss": 0.6672, "step": 2726 }, { "epoch": 1.233657543542185, "grad_norm": 0.23881900310516357, "learning_rate": 9.773004457591295e-06, "loss": 0.504, "step": 2727 }, { "epoch": 1.2341099298801177, "grad_norm": 0.2424183338880539, "learning_rate": 9.772786129035123e-06, "loss": 0.5115, "step": 2728 }, { "epoch": 1.2345623162180501, "grad_norm": 0.2681165039539337, "learning_rate": 9.77256769797427e-06, "loss": 0.6073, "step": 2729 }, { "epoch": 1.2350147025559828, "grad_norm": 0.2343740016222, "learning_rate": 9.772349164413425e-06, "loss": 0.5001, "step": 2730 }, { "epoch": 1.2354670888939154, "grad_norm": 0.2668372690677643, "learning_rate": 9.772130528357283e-06, "loss": 0.4514, "step": 2731 }, { "epoch": 1.235919475231848, "grad_norm": 0.29295504093170166, "learning_rate": 9.77191178981054e-06, "loss": 0.5978, "step": 2732 }, { "epoch": 1.2363718615697805, "grad_norm": 0.24254776537418365, "learning_rate": 9.77169294877789e-06, "loss": 0.4698, "step": 2733 }, { "epoch": 1.2368242479077132, "grad_norm": 0.288648396730423, "learning_rate": 9.77147400526404e-06, "loss": 0.5985, "step": 2734 }, { "epoch": 1.2372766342456458, "grad_norm": 0.25369277596473694, "learning_rate": 9.771254959273688e-06, "loss": 0.6556, "step": 2735 }, { "epoch": 1.2377290205835783, "grad_norm": 0.2781633734703064, "learning_rate": 9.771035810811537e-06, "loss": 0.5796, "step": 2736 }, { "epoch": 1.238181406921511, "grad_norm": 0.25688862800598145, "learning_rate": 9.770816559882297e-06, "loss": 0.5705, "step": 2737 }, { "epoch": 1.2386337932594436, "grad_norm": 0.2737256586551666, "learning_rate": 9.770597206490672e-06, "loss": 0.6325, "step": 2738 }, { "epoch": 1.2390861795973762, "grad_norm": 0.2853878438472748, "learning_rate": 9.77037775064138e-06, "loss": 0.6122, "step": 2739 }, { "epoch": 1.2395385659353089, "grad_norm": 0.2838756740093231, "learning_rate": 9.77015819233913e-06, "loss": 0.5799, "step": 2740 }, { "epoch": 1.2399909522732413, "grad_norm": 0.24877610802650452, "learning_rate": 9.769938531588637e-06, "loss": 0.5012, "step": 2741 }, { "epoch": 1.240443338611174, "grad_norm": 0.27192676067352295, "learning_rate": 9.76971876839462e-06, "loss": 0.6763, "step": 2742 }, { "epoch": 1.2408957249491066, "grad_norm": 0.30139511823654175, "learning_rate": 9.769498902761798e-06, "loss": 0.6334, "step": 2743 }, { "epoch": 1.241348111287039, "grad_norm": 0.278489887714386, "learning_rate": 9.769278934694894e-06, "loss": 0.5816, "step": 2744 }, { "epoch": 1.2418004976249717, "grad_norm": 0.29455700516700745, "learning_rate": 9.76905886419863e-06, "loss": 0.6602, "step": 2745 }, { "epoch": 1.2422528839629043, "grad_norm": 0.28735411167144775, "learning_rate": 9.768838691277738e-06, "loss": 0.4781, "step": 2746 }, { "epoch": 1.242705270300837, "grad_norm": 0.31243786215782166, "learning_rate": 9.768618415936939e-06, "loss": 0.6766, "step": 2747 }, { "epoch": 1.2431576566387694, "grad_norm": 0.3027532696723938, "learning_rate": 9.76839803818097e-06, "loss": 0.55, "step": 2748 }, { "epoch": 1.243610042976702, "grad_norm": 0.2716444432735443, "learning_rate": 9.76817755801456e-06, "loss": 0.4761, "step": 2749 }, { "epoch": 1.2440624293146347, "grad_norm": 0.31440773606300354, "learning_rate": 9.767956975442447e-06, "loss": 0.5664, "step": 2750 }, { "epoch": 1.2445148156525674, "grad_norm": 0.3552285432815552, "learning_rate": 9.767736290469368e-06, "loss": 0.5797, "step": 2751 }, { "epoch": 1.2449672019904998, "grad_norm": 0.36554864048957825, "learning_rate": 9.767515503100061e-06, "loss": 0.5434, "step": 2752 }, { "epoch": 1.2454195883284325, "grad_norm": 0.2932663559913635, "learning_rate": 9.767294613339267e-06, "loss": 0.4668, "step": 2753 }, { "epoch": 1.2458719746663651, "grad_norm": 0.3122079074382782, "learning_rate": 9.767073621191733e-06, "loss": 0.5438, "step": 2754 }, { "epoch": 1.2463243610042976, "grad_norm": 0.3315339684486389, "learning_rate": 9.766852526662205e-06, "loss": 0.5551, "step": 2755 }, { "epoch": 1.2467767473422302, "grad_norm": 0.3875426948070526, "learning_rate": 9.766631329755429e-06, "loss": 0.6465, "step": 2756 }, { "epoch": 1.2472291336801629, "grad_norm": 0.40473753213882446, "learning_rate": 9.766410030476158e-06, "loss": 0.6053, "step": 2757 }, { "epoch": 1.2476815200180955, "grad_norm": 0.3592061698436737, "learning_rate": 9.766188628829144e-06, "loss": 0.5218, "step": 2758 }, { "epoch": 1.2481339063560282, "grad_norm": 0.3976213335990906, "learning_rate": 9.765967124819142e-06, "loss": 0.5989, "step": 2759 }, { "epoch": 1.2485862926939606, "grad_norm": 0.4188612699508667, "learning_rate": 9.765745518450908e-06, "loss": 0.5892, "step": 2760 }, { "epoch": 1.2490386790318933, "grad_norm": 0.3196594715118408, "learning_rate": 9.765523809729204e-06, "loss": 0.915, "step": 2761 }, { "epoch": 1.249491065369826, "grad_norm": 0.15340954065322876, "learning_rate": 9.76530199865879e-06, "loss": 1.273, "step": 2762 }, { "epoch": 1.2499434517077583, "grad_norm": 0.14391326904296875, "learning_rate": 9.76508008524443e-06, "loss": 0.518, "step": 2763 }, { "epoch": 1.250395838045691, "grad_norm": 0.19327402114868164, "learning_rate": 9.764858069490888e-06, "loss": 0.6389, "step": 2764 }, { "epoch": 1.2508482243836236, "grad_norm": 0.19053660333156586, "learning_rate": 9.764635951402936e-06, "loss": 0.5614, "step": 2765 }, { "epoch": 1.251300610721556, "grad_norm": 0.2077915370464325, "learning_rate": 9.764413730985344e-06, "loss": 0.7043, "step": 2766 }, { "epoch": 1.2517529970594887, "grad_norm": 0.2212970107793808, "learning_rate": 9.764191408242881e-06, "loss": 0.616, "step": 2767 }, { "epoch": 1.2522053833974214, "grad_norm": 0.21007844805717468, "learning_rate": 9.763968983180325e-06, "loss": 0.6165, "step": 2768 }, { "epoch": 1.252657769735354, "grad_norm": 0.2274552285671234, "learning_rate": 9.763746455802453e-06, "loss": 0.702, "step": 2769 }, { "epoch": 1.2531101560732867, "grad_norm": 0.25612592697143555, "learning_rate": 9.76352382611404e-06, "loss": 0.7845, "step": 2770 }, { "epoch": 1.2535625424112191, "grad_norm": 0.23846998810768127, "learning_rate": 9.763301094119873e-06, "loss": 0.7467, "step": 2771 }, { "epoch": 1.2540149287491518, "grad_norm": 0.23483185470104218, "learning_rate": 9.763078259824734e-06, "loss": 0.6155, "step": 2772 }, { "epoch": 1.2544673150870844, "grad_norm": 0.21290446817874908, "learning_rate": 9.762855323233407e-06, "loss": 0.4791, "step": 2773 }, { "epoch": 1.2549197014250169, "grad_norm": 0.25524500012397766, "learning_rate": 9.762632284350681e-06, "loss": 0.6655, "step": 2774 }, { "epoch": 1.2553720877629495, "grad_norm": 0.22807368636131287, "learning_rate": 9.762409143181346e-06, "loss": 0.592, "step": 2775 }, { "epoch": 1.2558244741008822, "grad_norm": 0.2783399224281311, "learning_rate": 9.762185899730194e-06, "loss": 0.5558, "step": 2776 }, { "epoch": 1.2562768604388148, "grad_norm": 0.3078058362007141, "learning_rate": 9.761962554002021e-06, "loss": 0.6358, "step": 2777 }, { "epoch": 1.2567292467767475, "grad_norm": 0.26532453298568726, "learning_rate": 9.761739106001623e-06, "loss": 0.7047, "step": 2778 }, { "epoch": 1.25718163311468, "grad_norm": 0.23617668449878693, "learning_rate": 9.761515555733798e-06, "loss": 0.5888, "step": 2779 }, { "epoch": 1.2576340194526126, "grad_norm": 0.2620866000652313, "learning_rate": 9.761291903203349e-06, "loss": 0.6333, "step": 2780 }, { "epoch": 1.2580864057905452, "grad_norm": 0.2596931457519531, "learning_rate": 9.761068148415078e-06, "loss": 0.5103, "step": 2781 }, { "epoch": 1.2585387921284776, "grad_norm": 0.2806573808193207, "learning_rate": 9.760844291373793e-06, "loss": 0.5844, "step": 2782 }, { "epoch": 1.2589911784664103, "grad_norm": 0.2843141257762909, "learning_rate": 9.760620332084297e-06, "loss": 0.732, "step": 2783 }, { "epoch": 1.259443564804343, "grad_norm": 0.2736939489841461, "learning_rate": 9.760396270551405e-06, "loss": 0.6571, "step": 2784 }, { "epoch": 1.2598959511422754, "grad_norm": 0.25268808007240295, "learning_rate": 9.760172106779923e-06, "loss": 0.5416, "step": 2785 }, { "epoch": 1.260348337480208, "grad_norm": 0.28550878167152405, "learning_rate": 9.759947840774673e-06, "loss": 0.6027, "step": 2786 }, { "epoch": 1.2608007238181407, "grad_norm": 0.2538754343986511, "learning_rate": 9.759723472540466e-06, "loss": 0.5449, "step": 2787 }, { "epoch": 1.2612531101560733, "grad_norm": 0.2780977487564087, "learning_rate": 9.759499002082125e-06, "loss": 0.649, "step": 2788 }, { "epoch": 1.261705496494006, "grad_norm": 0.24029067158699036, "learning_rate": 9.759274429404464e-06, "loss": 0.4798, "step": 2789 }, { "epoch": 1.2621578828319384, "grad_norm": 0.251708060503006, "learning_rate": 9.759049754512314e-06, "loss": 0.4198, "step": 2790 }, { "epoch": 1.262610269169871, "grad_norm": 0.287905216217041, "learning_rate": 9.758824977410496e-06, "loss": 0.5177, "step": 2791 }, { "epoch": 1.2630626555078037, "grad_norm": 0.29518839716911316, "learning_rate": 9.758600098103839e-06, "loss": 0.5985, "step": 2792 }, { "epoch": 1.2635150418457362, "grad_norm": 0.2860504388809204, "learning_rate": 9.75837511659717e-06, "loss": 0.6482, "step": 2793 }, { "epoch": 1.2639674281836688, "grad_norm": 0.28637072443962097, "learning_rate": 9.758150032895325e-06, "loss": 0.5932, "step": 2794 }, { "epoch": 1.2644198145216015, "grad_norm": 0.28173932433128357, "learning_rate": 9.757924847003136e-06, "loss": 0.5592, "step": 2795 }, { "epoch": 1.2648722008595341, "grad_norm": 0.2860463857650757, "learning_rate": 9.757699558925438e-06, "loss": 0.5606, "step": 2796 }, { "epoch": 1.2653245871974668, "grad_norm": 0.2922203242778778, "learning_rate": 9.757474168667072e-06, "loss": 0.578, "step": 2797 }, { "epoch": 1.2657769735353992, "grad_norm": 0.33653339743614197, "learning_rate": 9.757248676232877e-06, "loss": 0.5864, "step": 2798 }, { "epoch": 1.2662293598733319, "grad_norm": 0.29060009121894836, "learning_rate": 9.757023081627697e-06, "loss": 0.5289, "step": 2799 }, { "epoch": 1.2666817462112645, "grad_norm": 0.2769792377948761, "learning_rate": 9.756797384856375e-06, "loss": 0.506, "step": 2800 }, { "epoch": 1.2666817462112645, "eval_loss": 0.6088727116584778, "eval_runtime": 26.0449, "eval_samples_per_second": 28.566, "eval_steps_per_second": 7.142, "step": 2800 }, { "epoch": 1.267134132549197, "grad_norm": 0.32916656136512756, "learning_rate": 9.756571585923762e-06, "loss": 0.5511, "step": 2801 }, { "epoch": 1.2675865188871296, "grad_norm": 0.3326772153377533, "learning_rate": 9.756345684834705e-06, "loss": 0.5531, "step": 2802 }, { "epoch": 1.2680389052250622, "grad_norm": 0.30400726199150085, "learning_rate": 9.756119681594053e-06, "loss": 0.4875, "step": 2803 }, { "epoch": 1.2684912915629947, "grad_norm": 0.3642740249633789, "learning_rate": 9.755893576206665e-06, "loss": 0.5291, "step": 2804 }, { "epoch": 1.2689436779009273, "grad_norm": 0.3178017735481262, "learning_rate": 9.755667368677396e-06, "loss": 0.4886, "step": 2805 }, { "epoch": 1.26939606423886, "grad_norm": 0.348757803440094, "learning_rate": 9.755441059011102e-06, "loss": 0.6434, "step": 2806 }, { "epoch": 1.2698484505767926, "grad_norm": 0.3940362334251404, "learning_rate": 9.755214647212643e-06, "loss": 0.604, "step": 2807 }, { "epoch": 1.2703008369147253, "grad_norm": 0.3434571921825409, "learning_rate": 9.754988133286885e-06, "loss": 0.4419, "step": 2808 }, { "epoch": 1.2707532232526577, "grad_norm": 0.4311162829399109, "learning_rate": 9.75476151723869e-06, "loss": 0.5555, "step": 2809 }, { "epoch": 1.2712056095905904, "grad_norm": 0.4211193025112152, "learning_rate": 9.754534799072926e-06, "loss": 0.5286, "step": 2810 }, { "epoch": 1.271657995928523, "grad_norm": 0.2802656292915344, "learning_rate": 9.754307978794462e-06, "loss": 0.9679, "step": 2811 }, { "epoch": 1.2721103822664555, "grad_norm": 0.1449253261089325, "learning_rate": 9.75408105640817e-06, "loss": 0.8493, "step": 2812 }, { "epoch": 1.272562768604388, "grad_norm": 0.1607683002948761, "learning_rate": 9.753854031918923e-06, "loss": 0.6075, "step": 2813 }, { "epoch": 1.2730151549423208, "grad_norm": 0.18674226105213165, "learning_rate": 9.753626905331596e-06, "loss": 0.622, "step": 2814 }, { "epoch": 1.2734675412802534, "grad_norm": 0.20154114067554474, "learning_rate": 9.75339967665107e-06, "loss": 0.6658, "step": 2815 }, { "epoch": 1.273919927618186, "grad_norm": 0.19679448008537292, "learning_rate": 9.753172345882223e-06, "loss": 0.6245, "step": 2816 }, { "epoch": 1.2743723139561185, "grad_norm": 0.20662441849708557, "learning_rate": 9.752944913029934e-06, "loss": 0.5386, "step": 2817 }, { "epoch": 1.2748247002940511, "grad_norm": 0.23867133259773254, "learning_rate": 9.752717378099094e-06, "loss": 0.7479, "step": 2818 }, { "epoch": 1.2752770866319838, "grad_norm": 0.21890559792518616, "learning_rate": 9.752489741094586e-06, "loss": 0.6345, "step": 2819 }, { "epoch": 1.2757294729699162, "grad_norm": 0.24763163924217224, "learning_rate": 9.7522620020213e-06, "loss": 0.6652, "step": 2820 }, { "epoch": 1.2761818593078489, "grad_norm": 0.242435485124588, "learning_rate": 9.752034160884126e-06, "loss": 0.6558, "step": 2821 }, { "epoch": 1.2766342456457815, "grad_norm": 0.23096707463264465, "learning_rate": 9.751806217687957e-06, "loss": 0.6567, "step": 2822 }, { "epoch": 1.277086631983714, "grad_norm": 0.2529338300228119, "learning_rate": 9.751578172437692e-06, "loss": 0.5713, "step": 2823 }, { "epoch": 1.2775390183216466, "grad_norm": 0.24651603400707245, "learning_rate": 9.751350025138226e-06, "loss": 0.6227, "step": 2824 }, { "epoch": 1.2779914046595793, "grad_norm": 0.25256526470184326, "learning_rate": 9.751121775794459e-06, "loss": 0.5924, "step": 2825 }, { "epoch": 1.278443790997512, "grad_norm": 0.2286653071641922, "learning_rate": 9.750893424411292e-06, "loss": 0.5019, "step": 2826 }, { "epoch": 1.2788961773354446, "grad_norm": 0.23177926242351532, "learning_rate": 9.750664970993632e-06, "loss": 0.5717, "step": 2827 }, { "epoch": 1.279348563673377, "grad_norm": 0.2111201286315918, "learning_rate": 9.750436415546384e-06, "loss": 0.496, "step": 2828 }, { "epoch": 1.2798009500113097, "grad_norm": 0.234357550740242, "learning_rate": 9.750207758074457e-06, "loss": 0.629, "step": 2829 }, { "epoch": 1.2802533363492423, "grad_norm": 0.2725326716899872, "learning_rate": 9.74997899858276e-06, "loss": 0.6482, "step": 2830 }, { "epoch": 1.2807057226871748, "grad_norm": 0.23716451227664948, "learning_rate": 9.749750137076208e-06, "loss": 0.578, "step": 2831 }, { "epoch": 1.2811581090251074, "grad_norm": 0.2620217204093933, "learning_rate": 9.749521173559717e-06, "loss": 0.5655, "step": 2832 }, { "epoch": 1.28161049536304, "grad_norm": 0.24635924398899078, "learning_rate": 9.749292108038202e-06, "loss": 0.5345, "step": 2833 }, { "epoch": 1.2820628817009727, "grad_norm": 0.26375463604927063, "learning_rate": 9.749062940516584e-06, "loss": 0.575, "step": 2834 }, { "epoch": 1.2825152680389051, "grad_norm": 0.23156601190567017, "learning_rate": 9.748833670999786e-06, "loss": 0.489, "step": 2835 }, { "epoch": 1.2829676543768378, "grad_norm": 0.2640095055103302, "learning_rate": 9.748604299492732e-06, "loss": 0.6057, "step": 2836 }, { "epoch": 1.2834200407147704, "grad_norm": 0.2594679594039917, "learning_rate": 9.748374826000344e-06, "loss": 0.4895, "step": 2837 }, { "epoch": 1.283872427052703, "grad_norm": 0.23551541566848755, "learning_rate": 9.748145250527554e-06, "loss": 0.4872, "step": 2838 }, { "epoch": 1.2843248133906355, "grad_norm": 0.2712445855140686, "learning_rate": 9.747915573079292e-06, "loss": 0.6682, "step": 2839 }, { "epoch": 1.2847771997285682, "grad_norm": 0.2622758448123932, "learning_rate": 9.74768579366049e-06, "loss": 0.5798, "step": 2840 }, { "epoch": 1.2852295860665008, "grad_norm": 0.267366886138916, "learning_rate": 9.747455912276087e-06, "loss": 0.5676, "step": 2841 }, { "epoch": 1.2856819724044333, "grad_norm": 0.2843244671821594, "learning_rate": 9.747225928931012e-06, "loss": 0.6166, "step": 2842 }, { "epoch": 1.286134358742366, "grad_norm": 0.3115823268890381, "learning_rate": 9.746995843630212e-06, "loss": 0.5465, "step": 2843 }, { "epoch": 1.2865867450802986, "grad_norm": 0.28552693128585815, "learning_rate": 9.746765656378625e-06, "loss": 0.6057, "step": 2844 }, { "epoch": 1.2870391314182312, "grad_norm": 0.29755693674087524, "learning_rate": 9.746535367181196e-06, "loss": 0.5486, "step": 2845 }, { "epoch": 1.2874915177561639, "grad_norm": 0.28931328654289246, "learning_rate": 9.746304976042868e-06, "loss": 0.5698, "step": 2846 }, { "epoch": 1.2879439040940963, "grad_norm": 0.2808988094329834, "learning_rate": 9.746074482968593e-06, "loss": 0.4883, "step": 2847 }, { "epoch": 1.288396290432029, "grad_norm": 0.3047727942466736, "learning_rate": 9.745843887963319e-06, "loss": 0.4834, "step": 2848 }, { "epoch": 1.2888486767699616, "grad_norm": 0.2808775007724762, "learning_rate": 9.745613191031999e-06, "loss": 0.5346, "step": 2849 }, { "epoch": 1.289301063107894, "grad_norm": 0.3331037759780884, "learning_rate": 9.745382392179587e-06, "loss": 0.5696, "step": 2850 }, { "epoch": 1.2897534494458267, "grad_norm": 0.30670520663261414, "learning_rate": 9.745151491411042e-06, "loss": 0.5338, "step": 2851 }, { "epoch": 1.2902058357837594, "grad_norm": 0.2717808187007904, "learning_rate": 9.744920488731323e-06, "loss": 0.4247, "step": 2852 }, { "epoch": 1.2906582221216918, "grad_norm": 0.3513016402721405, "learning_rate": 9.744689384145386e-06, "loss": 0.5624, "step": 2853 }, { "epoch": 1.2911106084596244, "grad_norm": 0.4569036066532135, "learning_rate": 9.7444581776582e-06, "loss": 0.7577, "step": 2854 }, { "epoch": 1.291562994797557, "grad_norm": 0.3328867256641388, "learning_rate": 9.744226869274728e-06, "loss": 0.5798, "step": 2855 }, { "epoch": 1.2920153811354897, "grad_norm": 0.36308854818344116, "learning_rate": 9.743995458999939e-06, "loss": 0.5814, "step": 2856 }, { "epoch": 1.2924677674734224, "grad_norm": 0.37056562304496765, "learning_rate": 9.743763946838804e-06, "loss": 0.6073, "step": 2857 }, { "epoch": 1.2929201538113548, "grad_norm": 0.3607046902179718, "learning_rate": 9.743532332796292e-06, "loss": 0.4838, "step": 2858 }, { "epoch": 1.2933725401492875, "grad_norm": 0.3965695798397064, "learning_rate": 9.74330061687738e-06, "loss": 0.5366, "step": 2859 }, { "epoch": 1.2938249264872201, "grad_norm": 0.4560915529727936, "learning_rate": 9.743068799087043e-06, "loss": 0.5429, "step": 2860 }, { "epoch": 1.2942773128251526, "grad_norm": 0.4118375778198242, "learning_rate": 9.74283687943026e-06, "loss": 1.2619, "step": 2861 }, { "epoch": 1.2947296991630852, "grad_norm": 0.1499118059873581, "learning_rate": 9.742604857912013e-06, "loss": 0.7557, "step": 2862 }, { "epoch": 1.2951820855010179, "grad_norm": 0.16184449195861816, "learning_rate": 9.742372734537284e-06, "loss": 0.7463, "step": 2863 }, { "epoch": 1.2956344718389505, "grad_norm": 0.17900757491588593, "learning_rate": 9.74214050931106e-06, "loss": 0.6069, "step": 2864 }, { "epoch": 1.2960868581768832, "grad_norm": 0.19849857687950134, "learning_rate": 9.741908182238325e-06, "loss": 0.5979, "step": 2865 }, { "epoch": 1.2965392445148156, "grad_norm": 0.25318604707717896, "learning_rate": 9.741675753324071e-06, "loss": 0.7031, "step": 2866 }, { "epoch": 1.2969916308527483, "grad_norm": 0.22285644710063934, "learning_rate": 9.741443222573291e-06, "loss": 0.6243, "step": 2867 }, { "epoch": 1.297444017190681, "grad_norm": 0.23275543749332428, "learning_rate": 9.741210589990977e-06, "loss": 0.6225, "step": 2868 }, { "epoch": 1.2978964035286134, "grad_norm": 0.2499782145023346, "learning_rate": 9.740977855582126e-06, "loss": 0.5501, "step": 2869 }, { "epoch": 1.298348789866546, "grad_norm": 0.2250739187002182, "learning_rate": 9.740745019351736e-06, "loss": 0.4836, "step": 2870 }, { "epoch": 1.2988011762044787, "grad_norm": 0.21158695220947266, "learning_rate": 9.74051208130481e-06, "loss": 0.5763, "step": 2871 }, { "epoch": 1.299253562542411, "grad_norm": 0.2407611757516861, "learning_rate": 9.740279041446347e-06, "loss": 0.625, "step": 2872 }, { "epoch": 1.2997059488803437, "grad_norm": 0.22856733202934265, "learning_rate": 9.740045899781353e-06, "loss": 0.5543, "step": 2873 }, { "epoch": 1.3001583352182764, "grad_norm": 0.23502188920974731, "learning_rate": 9.739812656314838e-06, "loss": 0.5777, "step": 2874 }, { "epoch": 1.300610721556209, "grad_norm": 0.28303730487823486, "learning_rate": 9.739579311051806e-06, "loss": 0.7181, "step": 2875 }, { "epoch": 1.3010631078941417, "grad_norm": 0.25874608755111694, "learning_rate": 9.739345863997274e-06, "loss": 0.5862, "step": 2876 }, { "epoch": 1.3015154942320741, "grad_norm": 0.24694210290908813, "learning_rate": 9.739112315156254e-06, "loss": 0.5743, "step": 2877 }, { "epoch": 1.3019678805700068, "grad_norm": 0.2258276343345642, "learning_rate": 9.73887866453376e-06, "loss": 0.5134, "step": 2878 }, { "epoch": 1.3024202669079394, "grad_norm": 0.25972452759742737, "learning_rate": 9.738644912134811e-06, "loss": 0.644, "step": 2879 }, { "epoch": 1.3028726532458719, "grad_norm": 0.2313169240951538, "learning_rate": 9.73841105796443e-06, "loss": 0.5352, "step": 2880 }, { "epoch": 1.3033250395838045, "grad_norm": 0.24260742962360382, "learning_rate": 9.738177102027636e-06, "loss": 0.5666, "step": 2881 }, { "epoch": 1.3037774259217372, "grad_norm": 0.3296576142311096, "learning_rate": 9.737943044329454e-06, "loss": 0.8311, "step": 2882 }, { "epoch": 1.3042298122596698, "grad_norm": 0.30143746733665466, "learning_rate": 9.737708884874912e-06, "loss": 0.6354, "step": 2883 }, { "epoch": 1.3046821985976025, "grad_norm": 0.27125799655914307, "learning_rate": 9.737474623669041e-06, "loss": 0.5968, "step": 2884 }, { "epoch": 1.305134584935535, "grad_norm": 0.26029708981513977, "learning_rate": 9.737240260716866e-06, "loss": 0.6065, "step": 2885 }, { "epoch": 1.3055869712734676, "grad_norm": 0.25844934582710266, "learning_rate": 9.737005796023428e-06, "loss": 0.5352, "step": 2886 }, { "epoch": 1.3060393576114002, "grad_norm": 0.3025117516517639, "learning_rate": 9.736771229593758e-06, "loss": 0.6604, "step": 2887 }, { "epoch": 1.3064917439493327, "grad_norm": 0.23897112905979156, "learning_rate": 9.736536561432893e-06, "loss": 0.4419, "step": 2888 }, { "epoch": 1.3069441302872653, "grad_norm": 0.3013847768306732, "learning_rate": 9.736301791545875e-06, "loss": 0.6327, "step": 2889 }, { "epoch": 1.307396516625198, "grad_norm": 0.2662460207939148, "learning_rate": 9.736066919937747e-06, "loss": 0.4472, "step": 2890 }, { "epoch": 1.3078489029631304, "grad_norm": 0.2429116815328598, "learning_rate": 9.735831946613551e-06, "loss": 0.4951, "step": 2891 }, { "epoch": 1.308301289301063, "grad_norm": 0.30945834517478943, "learning_rate": 9.735596871578335e-06, "loss": 0.7022, "step": 2892 }, { "epoch": 1.3087536756389957, "grad_norm": 0.29931777715682983, "learning_rate": 9.735361694837147e-06, "loss": 0.6296, "step": 2893 }, { "epoch": 1.3092060619769283, "grad_norm": 0.30271467566490173, "learning_rate": 9.73512641639504e-06, "loss": 0.5207, "step": 2894 }, { "epoch": 1.309658448314861, "grad_norm": 0.27355891466140747, "learning_rate": 9.734891036257063e-06, "loss": 0.5609, "step": 2895 }, { "epoch": 1.3101108346527934, "grad_norm": 0.3157844841480255, "learning_rate": 9.734655554428274e-06, "loss": 0.5957, "step": 2896 }, { "epoch": 1.310563220990726, "grad_norm": 0.29876768589019775, "learning_rate": 9.734419970913728e-06, "loss": 0.5456, "step": 2897 }, { "epoch": 1.3110156073286587, "grad_norm": 0.281610906124115, "learning_rate": 9.73418428571849e-06, "loss": 0.506, "step": 2898 }, { "epoch": 1.3114679936665912, "grad_norm": 0.33829906582832336, "learning_rate": 9.733948498847616e-06, "loss": 0.6152, "step": 2899 }, { "epoch": 1.3119203800045238, "grad_norm": 0.3623616397380829, "learning_rate": 9.733712610306172e-06, "loss": 0.7329, "step": 2900 }, { "epoch": 1.3123727663424565, "grad_norm": 0.3031523823738098, "learning_rate": 9.733476620099224e-06, "loss": 0.513, "step": 2901 }, { "epoch": 1.3128251526803891, "grad_norm": 0.33097368478775024, "learning_rate": 9.733240528231841e-06, "loss": 0.5554, "step": 2902 }, { "epoch": 1.3132775390183218, "grad_norm": 0.3244868218898773, "learning_rate": 9.733004334709095e-06, "loss": 0.5147, "step": 2903 }, { "epoch": 1.3137299253562542, "grad_norm": 0.3350277543067932, "learning_rate": 9.732768039536056e-06, "loss": 0.5065, "step": 2904 }, { "epoch": 1.3141823116941869, "grad_norm": 0.33807629346847534, "learning_rate": 9.7325316427178e-06, "loss": 0.511, "step": 2905 }, { "epoch": 1.3146346980321195, "grad_norm": 0.41096609830856323, "learning_rate": 9.732295144259402e-06, "loss": 0.5967, "step": 2906 }, { "epoch": 1.315087084370052, "grad_norm": 0.3590152859687805, "learning_rate": 9.732058544165946e-06, "loss": 0.5585, "step": 2907 }, { "epoch": 1.3155394707079846, "grad_norm": 0.40926530957221985, "learning_rate": 9.731821842442508e-06, "loss": 0.552, "step": 2908 }, { "epoch": 1.3159918570459173, "grad_norm": 0.38843950629234314, "learning_rate": 9.731585039094176e-06, "loss": 0.509, "step": 2909 }, { "epoch": 1.3164442433838497, "grad_norm": 0.4852513372898102, "learning_rate": 9.731348134126033e-06, "loss": 0.5956, "step": 2910 }, { "epoch": 1.3168966297217823, "grad_norm": 0.4164871871471405, "learning_rate": 9.731111127543168e-06, "loss": 1.1853, "step": 2911 }, { "epoch": 1.317349016059715, "grad_norm": 0.18803811073303223, "learning_rate": 9.730874019350672e-06, "loss": 0.9321, "step": 2912 }, { "epoch": 1.3178014023976476, "grad_norm": 0.1728222817182541, "learning_rate": 9.730636809553635e-06, "loss": 0.6752, "step": 2913 }, { "epoch": 1.3182537887355803, "grad_norm": 0.2018362432718277, "learning_rate": 9.730399498157152e-06, "loss": 0.7203, "step": 2914 }, { "epoch": 1.3187061750735127, "grad_norm": 0.20791858434677124, "learning_rate": 9.730162085166323e-06, "loss": 0.5356, "step": 2915 }, { "epoch": 1.3191585614114454, "grad_norm": 0.22943022847175598, "learning_rate": 9.729924570586244e-06, "loss": 0.6855, "step": 2916 }, { "epoch": 1.319610947749378, "grad_norm": 0.22725708782672882, "learning_rate": 9.729686954422018e-06, "loss": 0.6133, "step": 2917 }, { "epoch": 1.3200633340873105, "grad_norm": 0.20592091977596283, "learning_rate": 9.729449236678745e-06, "loss": 0.6555, "step": 2918 }, { "epoch": 1.3205157204252431, "grad_norm": 0.24279433488845825, "learning_rate": 9.729211417361534e-06, "loss": 0.7152, "step": 2919 }, { "epoch": 1.3209681067631758, "grad_norm": 0.21192190051078796, "learning_rate": 9.72897349647549e-06, "loss": 0.5909, "step": 2920 }, { "epoch": 1.3214204931011084, "grad_norm": 0.2335977852344513, "learning_rate": 9.728735474025724e-06, "loss": 0.5512, "step": 2921 }, { "epoch": 1.3218728794390409, "grad_norm": 0.23371390998363495, "learning_rate": 9.728497350017348e-06, "loss": 0.6249, "step": 2922 }, { "epoch": 1.3223252657769735, "grad_norm": 0.23240172863006592, "learning_rate": 9.728259124455477e-06, "loss": 0.6066, "step": 2923 }, { "epoch": 1.3227776521149062, "grad_norm": 0.23847509920597076, "learning_rate": 9.728020797345224e-06, "loss": 0.5864, "step": 2924 }, { "epoch": 1.3232300384528388, "grad_norm": 0.23898017406463623, "learning_rate": 9.727782368691712e-06, "loss": 0.5549, "step": 2925 }, { "epoch": 1.3236824247907713, "grad_norm": 0.2632506787776947, "learning_rate": 9.727543838500058e-06, "loss": 0.7097, "step": 2926 }, { "epoch": 1.324134811128704, "grad_norm": 0.24971576035022736, "learning_rate": 9.727305206775389e-06, "loss": 0.5921, "step": 2927 }, { "epoch": 1.3245871974666366, "grad_norm": 0.2564380168914795, "learning_rate": 9.727066473522825e-06, "loss": 0.6133, "step": 2928 }, { "epoch": 1.325039583804569, "grad_norm": 0.2883983552455902, "learning_rate": 9.726827638747495e-06, "loss": 0.6564, "step": 2929 }, { "epoch": 1.3254919701425016, "grad_norm": 0.26965370774269104, "learning_rate": 9.726588702454532e-06, "loss": 0.6058, "step": 2930 }, { "epoch": 1.3259443564804343, "grad_norm": 0.2405070960521698, "learning_rate": 9.726349664649064e-06, "loss": 0.5131, "step": 2931 }, { "epoch": 1.326396742818367, "grad_norm": 0.2699286937713623, "learning_rate": 9.726110525336225e-06, "loss": 0.542, "step": 2932 }, { "epoch": 1.3268491291562996, "grad_norm": 0.26964718103408813, "learning_rate": 9.72587128452115e-06, "loss": 0.5563, "step": 2933 }, { "epoch": 1.327301515494232, "grad_norm": 0.2647685706615448, "learning_rate": 9.725631942208981e-06, "loss": 0.6633, "step": 2934 }, { "epoch": 1.3277539018321647, "grad_norm": 0.2694646716117859, "learning_rate": 9.725392498404857e-06, "loss": 0.6065, "step": 2935 }, { "epoch": 1.3282062881700973, "grad_norm": 0.24669399857521057, "learning_rate": 9.725152953113919e-06, "loss": 0.5039, "step": 2936 }, { "epoch": 1.3286586745080298, "grad_norm": 0.26863643527030945, "learning_rate": 9.72491330634131e-06, "loss": 0.5845, "step": 2937 }, { "epoch": 1.3291110608459624, "grad_norm": 0.27661436796188354, "learning_rate": 9.724673558092182e-06, "loss": 0.6067, "step": 2938 }, { "epoch": 1.329563447183895, "grad_norm": 0.31150490045547485, "learning_rate": 9.72443370837168e-06, "loss": 0.6869, "step": 2939 }, { "epoch": 1.3300158335218275, "grad_norm": 0.27396100759506226, "learning_rate": 9.724193757184956e-06, "loss": 0.595, "step": 2940 }, { "epoch": 1.3304682198597602, "grad_norm": 0.2698312997817993, "learning_rate": 9.723953704537164e-06, "loss": 0.5355, "step": 2941 }, { "epoch": 1.3309206061976928, "grad_norm": 0.33954334259033203, "learning_rate": 9.72371355043346e-06, "loss": 0.6076, "step": 2942 }, { "epoch": 1.3313729925356255, "grad_norm": 0.2463621199131012, "learning_rate": 9.723473294879002e-06, "loss": 0.5027, "step": 2943 }, { "epoch": 1.3318253788735581, "grad_norm": 0.31276941299438477, "learning_rate": 9.723232937878948e-06, "loss": 0.632, "step": 2944 }, { "epoch": 1.3322777652114906, "grad_norm": 0.3196955621242523, "learning_rate": 9.722992479438461e-06, "loss": 0.7107, "step": 2945 }, { "epoch": 1.3327301515494232, "grad_norm": 0.2904650568962097, "learning_rate": 9.722751919562707e-06, "loss": 0.4468, "step": 2946 }, { "epoch": 1.3331825378873559, "grad_norm": 0.2719293236732483, "learning_rate": 9.72251125825685e-06, "loss": 0.548, "step": 2947 }, { "epoch": 1.3336349242252883, "grad_norm": 0.31356915831565857, "learning_rate": 9.72227049552606e-06, "loss": 0.5479, "step": 2948 }, { "epoch": 1.334087310563221, "grad_norm": 0.3463706374168396, "learning_rate": 9.722029631375507e-06, "loss": 0.5884, "step": 2949 }, { "epoch": 1.3345396969011536, "grad_norm": 0.2896590828895569, "learning_rate": 9.721788665810365e-06, "loss": 0.4838, "step": 2950 }, { "epoch": 1.3349920832390862, "grad_norm": 0.3116393983364105, "learning_rate": 9.721547598835807e-06, "loss": 0.4548, "step": 2951 }, { "epoch": 1.335444469577019, "grad_norm": 0.26913323998451233, "learning_rate": 9.721306430457015e-06, "loss": 0.4044, "step": 2952 }, { "epoch": 1.3358968559149513, "grad_norm": 0.32032281160354614, "learning_rate": 9.721065160679165e-06, "loss": 0.4664, "step": 2953 }, { "epoch": 1.336349242252884, "grad_norm": 0.32286694645881653, "learning_rate": 9.720823789507438e-06, "loss": 0.6357, "step": 2954 }, { "epoch": 1.3368016285908166, "grad_norm": 0.3144914209842682, "learning_rate": 9.720582316947022e-06, "loss": 0.4883, "step": 2955 }, { "epoch": 1.337254014928749, "grad_norm": 0.3306204676628113, "learning_rate": 9.720340743003098e-06, "loss": 0.4611, "step": 2956 }, { "epoch": 1.3377064012666817, "grad_norm": 0.3701990842819214, "learning_rate": 9.720099067680857e-06, "loss": 0.5411, "step": 2957 }, { "epoch": 1.3381587876046144, "grad_norm": 0.3390199840068817, "learning_rate": 9.719857290985491e-06, "loss": 0.5221, "step": 2958 }, { "epoch": 1.3386111739425468, "grad_norm": 0.3868628442287445, "learning_rate": 9.719615412922189e-06, "loss": 0.5344, "step": 2959 }, { "epoch": 1.3390635602804795, "grad_norm": 0.3741057813167572, "learning_rate": 9.719373433496148e-06, "loss": 0.5385, "step": 2960 }, { "epoch": 1.3395159466184121, "grad_norm": 0.30349278450012207, "learning_rate": 9.719131352712564e-06, "loss": 0.8185, "step": 2961 }, { "epoch": 1.3399683329563448, "grad_norm": 0.16732406616210938, "learning_rate": 9.718889170576638e-06, "loss": 0.6533, "step": 2962 }, { "epoch": 1.3404207192942774, "grad_norm": 0.16851741075515747, "learning_rate": 9.718646887093568e-06, "loss": 0.5667, "step": 2963 }, { "epoch": 1.3408731056322098, "grad_norm": 0.19961898028850555, "learning_rate": 9.71840450226856e-06, "loss": 0.59, "step": 2964 }, { "epoch": 1.3413254919701425, "grad_norm": 0.21587979793548584, "learning_rate": 9.71816201610682e-06, "loss": 0.7074, "step": 2965 }, { "epoch": 1.3417778783080752, "grad_norm": 0.22144615650177002, "learning_rate": 9.717919428613555e-06, "loss": 0.6621, "step": 2966 }, { "epoch": 1.3422302646460076, "grad_norm": 0.20605920255184174, "learning_rate": 9.717676739793975e-06, "loss": 0.5647, "step": 2967 }, { "epoch": 1.3426826509839402, "grad_norm": 0.2148527204990387, "learning_rate": 9.71743394965329e-06, "loss": 0.4672, "step": 2968 }, { "epoch": 1.343135037321873, "grad_norm": 0.1954173743724823, "learning_rate": 9.71719105819672e-06, "loss": 0.4928, "step": 2969 }, { "epoch": 1.3435874236598055, "grad_norm": 0.22284527122974396, "learning_rate": 9.716948065429476e-06, "loss": 0.5566, "step": 2970 }, { "epoch": 1.3440398099977382, "grad_norm": 0.250431090593338, "learning_rate": 9.71670497135678e-06, "loss": 0.7391, "step": 2971 }, { "epoch": 1.3444921963356706, "grad_norm": 0.23870672285556793, "learning_rate": 9.716461775983852e-06, "loss": 0.6382, "step": 2972 }, { "epoch": 1.3449445826736033, "grad_norm": 0.21375969052314758, "learning_rate": 9.716218479315914e-06, "loss": 0.5095, "step": 2973 }, { "epoch": 1.345396969011536, "grad_norm": 0.24948665499687195, "learning_rate": 9.715975081358192e-06, "loss": 0.628, "step": 2974 }, { "epoch": 1.3458493553494684, "grad_norm": 0.21965231001377106, "learning_rate": 9.715731582115914e-06, "loss": 0.5806, "step": 2975 }, { "epoch": 1.346301741687401, "grad_norm": 0.24827897548675537, "learning_rate": 9.715487981594309e-06, "loss": 0.6255, "step": 2976 }, { "epoch": 1.3467541280253337, "grad_norm": 0.2570228576660156, "learning_rate": 9.71524427979861e-06, "loss": 0.5694, "step": 2977 }, { "epoch": 1.347206514363266, "grad_norm": 0.262031614780426, "learning_rate": 9.715000476734049e-06, "loss": 0.5804, "step": 2978 }, { "epoch": 1.3476589007011988, "grad_norm": 0.2342575341463089, "learning_rate": 9.714756572405864e-06, "loss": 0.5753, "step": 2979 }, { "epoch": 1.3481112870391314, "grad_norm": 0.28116917610168457, "learning_rate": 9.71451256681929e-06, "loss": 0.5416, "step": 2980 }, { "epoch": 1.348563673377064, "grad_norm": 0.2734035551548004, "learning_rate": 9.714268459979572e-06, "loss": 0.6363, "step": 2981 }, { "epoch": 1.3490160597149967, "grad_norm": 0.2475994974374771, "learning_rate": 9.71402425189195e-06, "loss": 0.517, "step": 2982 }, { "epoch": 1.3494684460529291, "grad_norm": 0.3428896963596344, "learning_rate": 9.71377994256167e-06, "loss": 0.7734, "step": 2983 }, { "epoch": 1.3499208323908618, "grad_norm": 0.27399638295173645, "learning_rate": 9.713535531993978e-06, "loss": 0.5498, "step": 2984 }, { "epoch": 1.3503732187287945, "grad_norm": 0.27124565839767456, "learning_rate": 9.713291020194122e-06, "loss": 0.6209, "step": 2985 }, { "epoch": 1.3508256050667269, "grad_norm": 0.24217726290225983, "learning_rate": 9.713046407167356e-06, "loss": 0.4915, "step": 2986 }, { "epoch": 1.3512779914046595, "grad_norm": 0.2910413146018982, "learning_rate": 9.712801692918934e-06, "loss": 0.6639, "step": 2987 }, { "epoch": 1.3517303777425922, "grad_norm": 0.2715666592121124, "learning_rate": 9.712556877454109e-06, "loss": 0.5705, "step": 2988 }, { "epoch": 1.3521827640805248, "grad_norm": 0.2559581995010376, "learning_rate": 9.71231196077814e-06, "loss": 0.5319, "step": 2989 }, { "epoch": 1.3526351504184575, "grad_norm": 0.26369765400886536, "learning_rate": 9.712066942896287e-06, "loss": 0.547, "step": 2990 }, { "epoch": 1.35308753675639, "grad_norm": 0.3004854917526245, "learning_rate": 9.711821823813812e-06, "loss": 0.6459, "step": 2991 }, { "epoch": 1.3535399230943226, "grad_norm": 0.29189351201057434, "learning_rate": 9.711576603535979e-06, "loss": 0.5393, "step": 2992 }, { "epoch": 1.3539923094322552, "grad_norm": 0.28924360871315, "learning_rate": 9.711331282068058e-06, "loss": 0.5364, "step": 2993 }, { "epoch": 1.3544446957701877, "grad_norm": 0.2848178744316101, "learning_rate": 9.711085859415312e-06, "loss": 0.5136, "step": 2994 }, { "epoch": 1.3548970821081203, "grad_norm": 0.27242615818977356, "learning_rate": 9.710840335583017e-06, "loss": 0.4358, "step": 2995 }, { "epoch": 1.355349468446053, "grad_norm": 0.2691543698310852, "learning_rate": 9.710594710576444e-06, "loss": 0.5124, "step": 2996 }, { "epoch": 1.3558018547839854, "grad_norm": 0.3238534927368164, "learning_rate": 9.710348984400867e-06, "loss": 0.6428, "step": 2997 }, { "epoch": 1.356254241121918, "grad_norm": 0.2883700132369995, "learning_rate": 9.710103157061566e-06, "loss": 0.4431, "step": 2998 }, { "epoch": 1.3567066274598507, "grad_norm": 0.29804283380508423, "learning_rate": 9.709857228563818e-06, "loss": 0.5032, "step": 2999 }, { "epoch": 1.3571590137977834, "grad_norm": 0.32726892828941345, "learning_rate": 9.709611198912907e-06, "loss": 0.5915, "step": 3000 }, { "epoch": 1.3571590137977834, "eval_loss": 0.6075624823570251, "eval_runtime": 25.7132, "eval_samples_per_second": 28.935, "eval_steps_per_second": 7.234, "step": 3000 }, { "epoch": 1.357611400135716, "grad_norm": 0.33779066801071167, "learning_rate": 9.709365068114115e-06, "loss": 0.5794, "step": 3001 }, { "epoch": 1.3580637864736484, "grad_norm": 0.3147925138473511, "learning_rate": 9.709118836172732e-06, "loss": 0.5041, "step": 3002 }, { "epoch": 1.358516172811581, "grad_norm": 0.3148426413536072, "learning_rate": 9.708872503094041e-06, "loss": 0.4471, "step": 3003 }, { "epoch": 1.3589685591495138, "grad_norm": 0.29102611541748047, "learning_rate": 9.708626068883335e-06, "loss": 0.4702, "step": 3004 }, { "epoch": 1.3594209454874462, "grad_norm": 0.3225577175617218, "learning_rate": 9.708379533545909e-06, "loss": 0.4337, "step": 3005 }, { "epoch": 1.3598733318253788, "grad_norm": 0.3438999652862549, "learning_rate": 9.708132897087053e-06, "loss": 0.4936, "step": 3006 }, { "epoch": 1.3603257181633115, "grad_norm": 0.3600670397281647, "learning_rate": 9.707886159512068e-06, "loss": 0.547, "step": 3007 }, { "epoch": 1.3607781045012441, "grad_norm": 0.3741298019886017, "learning_rate": 9.707639320826251e-06, "loss": 0.5019, "step": 3008 }, { "epoch": 1.3612304908391766, "grad_norm": 0.3864637017250061, "learning_rate": 9.707392381034904e-06, "loss": 0.5564, "step": 3009 }, { "epoch": 1.3616828771771092, "grad_norm": 0.34787848591804504, "learning_rate": 9.707145340143331e-06, "loss": 0.5266, "step": 3010 }, { "epoch": 1.3621352635150419, "grad_norm": 0.307043194770813, "learning_rate": 9.706898198156836e-06, "loss": 0.904, "step": 3011 }, { "epoch": 1.3625876498529745, "grad_norm": 0.18437831103801727, "learning_rate": 9.706650955080727e-06, "loss": 0.8526, "step": 3012 }, { "epoch": 1.363040036190907, "grad_norm": 0.16763921082019806, "learning_rate": 9.706403610920317e-06, "loss": 0.6073, "step": 3013 }, { "epoch": 1.3634924225288396, "grad_norm": 0.18909572064876556, "learning_rate": 9.706156165680915e-06, "loss": 0.5954, "step": 3014 }, { "epoch": 1.3639448088667723, "grad_norm": 0.1859043538570404, "learning_rate": 9.705908619367838e-06, "loss": 0.5299, "step": 3015 }, { "epoch": 1.3643971952047047, "grad_norm": 0.2420845776796341, "learning_rate": 9.7056609719864e-06, "loss": 0.7338, "step": 3016 }, { "epoch": 1.3648495815426374, "grad_norm": 0.20702306926250458, "learning_rate": 9.705413223541921e-06, "loss": 0.6349, "step": 3017 }, { "epoch": 1.36530196788057, "grad_norm": 0.2347843199968338, "learning_rate": 9.705165374039721e-06, "loss": 0.5292, "step": 3018 }, { "epoch": 1.3657543542185027, "grad_norm": 0.20976729691028595, "learning_rate": 9.704917423485126e-06, "loss": 0.6231, "step": 3019 }, { "epoch": 1.3662067405564353, "grad_norm": 0.22277799248695374, "learning_rate": 9.704669371883457e-06, "loss": 0.646, "step": 3020 }, { "epoch": 1.3666591268943677, "grad_norm": 0.2466108798980713, "learning_rate": 9.704421219240044e-06, "loss": 0.7118, "step": 3021 }, { "epoch": 1.3671115132323004, "grad_norm": 0.22093944251537323, "learning_rate": 9.704172965560215e-06, "loss": 0.5576, "step": 3022 }, { "epoch": 1.367563899570233, "grad_norm": 0.24433393776416779, "learning_rate": 9.703924610849302e-06, "loss": 0.6232, "step": 3023 }, { "epoch": 1.3680162859081655, "grad_norm": 0.24533483386039734, "learning_rate": 9.70367615511264e-06, "loss": 0.6267, "step": 3024 }, { "epoch": 1.3684686722460981, "grad_norm": 0.2602783143520355, "learning_rate": 9.703427598355563e-06, "loss": 0.6891, "step": 3025 }, { "epoch": 1.3689210585840308, "grad_norm": 0.22494252026081085, "learning_rate": 9.703178940583413e-06, "loss": 0.4726, "step": 3026 }, { "epoch": 1.3693734449219632, "grad_norm": 0.23157311975955963, "learning_rate": 9.70293018180153e-06, "loss": 0.5474, "step": 3027 }, { "epoch": 1.3698258312598959, "grad_norm": 0.24887605011463165, "learning_rate": 9.702681322015251e-06, "loss": 0.6193, "step": 3028 }, { "epoch": 1.3702782175978285, "grad_norm": 0.2825077772140503, "learning_rate": 9.702432361229926e-06, "loss": 0.738, "step": 3029 }, { "epoch": 1.3707306039357612, "grad_norm": 0.24347682297229767, "learning_rate": 9.702183299450897e-06, "loss": 0.4895, "step": 3030 }, { "epoch": 1.3711829902736938, "grad_norm": 0.3330383598804474, "learning_rate": 9.70193413668352e-06, "loss": 0.5251, "step": 3031 }, { "epoch": 1.3716353766116263, "grad_norm": 0.2524184584617615, "learning_rate": 9.701684872933143e-06, "loss": 0.4807, "step": 3032 }, { "epoch": 1.372087762949559, "grad_norm": 0.25565457344055176, "learning_rate": 9.701435508205117e-06, "loss": 0.4888, "step": 3033 }, { "epoch": 1.3725401492874916, "grad_norm": 0.24234944581985474, "learning_rate": 9.701186042504802e-06, "loss": 0.4824, "step": 3034 }, { "epoch": 1.372992535625424, "grad_norm": 0.24546019732952118, "learning_rate": 9.70093647583755e-06, "loss": 0.5834, "step": 3035 }, { "epoch": 1.3734449219633567, "grad_norm": 0.2530766725540161, "learning_rate": 9.700686808208728e-06, "loss": 0.5094, "step": 3036 }, { "epoch": 1.3738973083012893, "grad_norm": 0.2763305604457855, "learning_rate": 9.700437039623692e-06, "loss": 0.532, "step": 3037 }, { "epoch": 1.374349694639222, "grad_norm": 0.29738128185272217, "learning_rate": 9.700187170087809e-06, "loss": 0.6295, "step": 3038 }, { "epoch": 1.3748020809771546, "grad_norm": 0.2900155484676361, "learning_rate": 9.699937199606446e-06, "loss": 0.613, "step": 3039 }, { "epoch": 1.375254467315087, "grad_norm": 0.39296430349349976, "learning_rate": 9.69968712818497e-06, "loss": 0.5622, "step": 3040 }, { "epoch": 1.3757068536530197, "grad_norm": 0.2804388403892517, "learning_rate": 9.699436955828753e-06, "loss": 0.5216, "step": 3041 }, { "epoch": 1.3761592399909524, "grad_norm": 0.2731271982192993, "learning_rate": 9.699186682543168e-06, "loss": 0.5108, "step": 3042 }, { "epoch": 1.3766116263288848, "grad_norm": 0.29021310806274414, "learning_rate": 9.698936308333587e-06, "loss": 0.5386, "step": 3043 }, { "epoch": 1.3770640126668174, "grad_norm": 0.28796908259391785, "learning_rate": 9.698685833205392e-06, "loss": 0.5256, "step": 3044 }, { "epoch": 1.37751639900475, "grad_norm": 0.31264951825141907, "learning_rate": 9.698435257163959e-06, "loss": 0.5498, "step": 3045 }, { "epoch": 1.3779687853426825, "grad_norm": 0.3274178206920624, "learning_rate": 9.698184580214671e-06, "loss": 0.5417, "step": 3046 }, { "epoch": 1.3784211716806152, "grad_norm": 0.3071603775024414, "learning_rate": 9.69793380236291e-06, "loss": 0.6309, "step": 3047 }, { "epoch": 1.3788735580185478, "grad_norm": 0.30135852098464966, "learning_rate": 9.697682923614066e-06, "loss": 0.5262, "step": 3048 }, { "epoch": 1.3793259443564805, "grad_norm": 0.2861350476741791, "learning_rate": 9.697431943973525e-06, "loss": 0.4724, "step": 3049 }, { "epoch": 1.3797783306944131, "grad_norm": 0.3148767948150635, "learning_rate": 9.697180863446675e-06, "loss": 0.5669, "step": 3050 }, { "epoch": 1.3802307170323456, "grad_norm": 0.31222766637802124, "learning_rate": 9.69692968203891e-06, "loss": 0.5847, "step": 3051 }, { "epoch": 1.3806831033702782, "grad_norm": 0.3109757602214813, "learning_rate": 9.696678399755625e-06, "loss": 0.492, "step": 3052 }, { "epoch": 1.3811354897082109, "grad_norm": 0.3349771201610565, "learning_rate": 9.696427016602218e-06, "loss": 0.5914, "step": 3053 }, { "epoch": 1.3815878760461433, "grad_norm": 0.37271222472190857, "learning_rate": 9.696175532584085e-06, "loss": 0.6494, "step": 3054 }, { "epoch": 1.382040262384076, "grad_norm": 0.34563562273979187, "learning_rate": 9.69592394770663e-06, "loss": 0.6014, "step": 3055 }, { "epoch": 1.3824926487220086, "grad_norm": 0.34959375858306885, "learning_rate": 9.695672261975254e-06, "loss": 0.5753, "step": 3056 }, { "epoch": 1.3829450350599413, "grad_norm": 0.3792673647403717, "learning_rate": 9.695420475395365e-06, "loss": 0.59, "step": 3057 }, { "epoch": 1.383397421397874, "grad_norm": 0.4022141695022583, "learning_rate": 9.695168587972366e-06, "loss": 0.6018, "step": 3058 }, { "epoch": 1.3838498077358063, "grad_norm": 0.3943226635456085, "learning_rate": 9.694916599711673e-06, "loss": 0.532, "step": 3059 }, { "epoch": 1.384302194073739, "grad_norm": 0.43577829003334045, "learning_rate": 9.694664510618691e-06, "loss": 0.5954, "step": 3060 }, { "epoch": 1.3847545804116717, "grad_norm": 0.2758063077926636, "learning_rate": 9.69441232069884e-06, "loss": 0.9897, "step": 3061 }, { "epoch": 1.385206966749604, "grad_norm": 0.11559892445802689, "learning_rate": 9.694160029957533e-06, "loss": 0.5917, "step": 3062 }, { "epoch": 1.3856593530875367, "grad_norm": 0.18423113226890564, "learning_rate": 9.69390763840019e-06, "loss": 0.7794, "step": 3063 }, { "epoch": 1.3861117394254694, "grad_norm": 0.18359337747097015, "learning_rate": 9.693655146032233e-06, "loss": 0.6965, "step": 3064 }, { "epoch": 1.3865641257634018, "grad_norm": 0.2244405299425125, "learning_rate": 9.69340255285908e-06, "loss": 0.6904, "step": 3065 }, { "epoch": 1.3870165121013345, "grad_norm": 0.21569475531578064, "learning_rate": 9.69314985888616e-06, "loss": 0.5481, "step": 3066 }, { "epoch": 1.3874688984392671, "grad_norm": 0.21870139241218567, "learning_rate": 9.692897064118898e-06, "loss": 0.5954, "step": 3067 }, { "epoch": 1.3879212847771998, "grad_norm": 0.2254749834537506, "learning_rate": 9.692644168562724e-06, "loss": 0.6781, "step": 3068 }, { "epoch": 1.3883736711151324, "grad_norm": 0.21577465534210205, "learning_rate": 9.69239117222307e-06, "loss": 0.6016, "step": 3069 }, { "epoch": 1.3888260574530649, "grad_norm": 0.2135259062051773, "learning_rate": 9.692138075105368e-06, "loss": 0.499, "step": 3070 }, { "epoch": 1.3892784437909975, "grad_norm": 0.2538565397262573, "learning_rate": 9.691884877215056e-06, "loss": 0.7713, "step": 3071 }, { "epoch": 1.3897308301289302, "grad_norm": 0.2575910687446594, "learning_rate": 9.69163157855757e-06, "loss": 0.5194, "step": 3072 }, { "epoch": 1.3901832164668626, "grad_norm": 0.2372572273015976, "learning_rate": 9.69137817913835e-06, "loss": 0.6491, "step": 3073 }, { "epoch": 1.3906356028047953, "grad_norm": 0.26378440856933594, "learning_rate": 9.69112467896284e-06, "loss": 0.6575, "step": 3074 }, { "epoch": 1.391087989142728, "grad_norm": 0.24495281279087067, "learning_rate": 9.690871078036483e-06, "loss": 0.5443, "step": 3075 }, { "epoch": 1.3915403754806606, "grad_norm": 0.2460816204547882, "learning_rate": 9.690617376364726e-06, "loss": 0.5623, "step": 3076 }, { "epoch": 1.3919927618185932, "grad_norm": 0.264324426651001, "learning_rate": 9.690363573953018e-06, "loss": 0.7264, "step": 3077 }, { "epoch": 1.3924451481565256, "grad_norm": 0.29508066177368164, "learning_rate": 9.69010967080681e-06, "loss": 0.6955, "step": 3078 }, { "epoch": 1.3928975344944583, "grad_norm": 0.2750016450881958, "learning_rate": 9.689855666931554e-06, "loss": 0.5881, "step": 3079 }, { "epoch": 1.393349920832391, "grad_norm": 0.25102922320365906, "learning_rate": 9.689601562332704e-06, "loss": 0.5497, "step": 3080 }, { "epoch": 1.3938023071703234, "grad_norm": 0.2982873320579529, "learning_rate": 9.689347357015721e-06, "loss": 0.6778, "step": 3081 }, { "epoch": 1.394254693508256, "grad_norm": 0.26243898272514343, "learning_rate": 9.689093050986063e-06, "loss": 0.5246, "step": 3082 }, { "epoch": 1.3947070798461887, "grad_norm": 0.29669666290283203, "learning_rate": 9.688838644249192e-06, "loss": 0.6714, "step": 3083 }, { "epoch": 1.3951594661841211, "grad_norm": 0.2577560842037201, "learning_rate": 9.68858413681057e-06, "loss": 0.4917, "step": 3084 }, { "epoch": 1.3956118525220538, "grad_norm": 0.29012414813041687, "learning_rate": 9.688329528675666e-06, "loss": 0.6433, "step": 3085 }, { "epoch": 1.3960642388599864, "grad_norm": 0.26258397102355957, "learning_rate": 9.688074819849944e-06, "loss": 0.5156, "step": 3086 }, { "epoch": 1.396516625197919, "grad_norm": 0.28795525431632996, "learning_rate": 9.687820010338878e-06, "loss": 0.6301, "step": 3087 }, { "epoch": 1.3969690115358517, "grad_norm": 0.2633301615715027, "learning_rate": 9.68756510014794e-06, "loss": 0.5474, "step": 3088 }, { "epoch": 1.3974213978737842, "grad_norm": 0.2846234440803528, "learning_rate": 9.687310089282604e-06, "loss": 0.5867, "step": 3089 }, { "epoch": 1.3978737842117168, "grad_norm": 0.30442219972610474, "learning_rate": 9.687054977748347e-06, "loss": 0.5031, "step": 3090 }, { "epoch": 1.3983261705496495, "grad_norm": 0.2883112132549286, "learning_rate": 9.686799765550647e-06, "loss": 0.6258, "step": 3091 }, { "epoch": 1.398778556887582, "grad_norm": 0.2708801031112671, "learning_rate": 9.686544452694987e-06, "loss": 0.5038, "step": 3092 }, { "epoch": 1.3992309432255146, "grad_norm": 0.2945188879966736, "learning_rate": 9.686289039186848e-06, "loss": 0.5634, "step": 3093 }, { "epoch": 1.3996833295634472, "grad_norm": 0.28332194685935974, "learning_rate": 9.68603352503172e-06, "loss": 0.5862, "step": 3094 }, { "epoch": 1.4001357159013796, "grad_norm": 0.29766950011253357, "learning_rate": 9.685777910235085e-06, "loss": 0.5905, "step": 3095 }, { "epoch": 1.4005881022393123, "grad_norm": 0.2869698703289032, "learning_rate": 9.685522194802435e-06, "loss": 0.499, "step": 3096 }, { "epoch": 1.401040488577245, "grad_norm": 0.3004016876220703, "learning_rate": 9.685266378739265e-06, "loss": 0.5641, "step": 3097 }, { "epoch": 1.4014928749151776, "grad_norm": 0.3175637722015381, "learning_rate": 9.685010462051064e-06, "loss": 0.4514, "step": 3098 }, { "epoch": 1.4019452612531103, "grad_norm": 0.30301618576049805, "learning_rate": 9.684754444743331e-06, "loss": 0.5077, "step": 3099 }, { "epoch": 1.4023976475910427, "grad_norm": 0.3049514591693878, "learning_rate": 9.684498326821567e-06, "loss": 0.4543, "step": 3100 }, { "epoch": 1.4028500339289753, "grad_norm": 0.30849847197532654, "learning_rate": 9.684242108291268e-06, "loss": 0.4996, "step": 3101 }, { "epoch": 1.403302420266908, "grad_norm": 0.3386213183403015, "learning_rate": 9.683985789157939e-06, "loss": 0.567, "step": 3102 }, { "epoch": 1.4037548066048404, "grad_norm": 0.32166963815689087, "learning_rate": 9.683729369427084e-06, "loss": 0.5225, "step": 3103 }, { "epoch": 1.404207192942773, "grad_norm": 0.3601136803627014, "learning_rate": 9.68347284910421e-06, "loss": 0.603, "step": 3104 }, { "epoch": 1.4046595792807057, "grad_norm": 0.30504173040390015, "learning_rate": 9.68321622819483e-06, "loss": 0.4868, "step": 3105 }, { "epoch": 1.4051119656186384, "grad_norm": 0.35447564721107483, "learning_rate": 9.68295950670445e-06, "loss": 0.5276, "step": 3106 }, { "epoch": 1.405564351956571, "grad_norm": 0.34289512038230896, "learning_rate": 9.682702684638586e-06, "loss": 0.533, "step": 3107 }, { "epoch": 1.4060167382945035, "grad_norm": 0.33881041407585144, "learning_rate": 9.682445762002754e-06, "loss": 0.5016, "step": 3108 }, { "epoch": 1.4064691246324361, "grad_norm": 0.4190845489501953, "learning_rate": 9.682188738802473e-06, "loss": 0.608, "step": 3109 }, { "epoch": 1.4069215109703688, "grad_norm": 0.4203079044818878, "learning_rate": 9.68193161504326e-06, "loss": 0.5368, "step": 3110 }, { "epoch": 1.4073738973083012, "grad_norm": 0.3402780294418335, "learning_rate": 9.68167439073064e-06, "loss": 1.1502, "step": 3111 }, { "epoch": 1.4078262836462339, "grad_norm": 0.17804096639156342, "learning_rate": 9.681417065870135e-06, "loss": 1.1399, "step": 3112 }, { "epoch": 1.4082786699841665, "grad_norm": 0.1646643429994583, "learning_rate": 9.681159640467274e-06, "loss": 0.5208, "step": 3113 }, { "epoch": 1.408731056322099, "grad_norm": 0.17906537652015686, "learning_rate": 9.680902114527585e-06, "loss": 0.5632, "step": 3114 }, { "epoch": 1.4091834426600316, "grad_norm": 0.18854409456253052, "learning_rate": 9.680644488056598e-06, "loss": 0.6374, "step": 3115 }, { "epoch": 1.4096358289979642, "grad_norm": 0.20768658816814423, "learning_rate": 9.680386761059845e-06, "loss": 0.5831, "step": 3116 }, { "epoch": 1.410088215335897, "grad_norm": 0.2224583625793457, "learning_rate": 9.680128933542866e-06, "loss": 0.5967, "step": 3117 }, { "epoch": 1.4105406016738296, "grad_norm": 0.2185187190771103, "learning_rate": 9.679871005511192e-06, "loss": 0.5736, "step": 3118 }, { "epoch": 1.410992988011762, "grad_norm": 0.2519155740737915, "learning_rate": 9.679612976970367e-06, "loss": 0.7214, "step": 3119 }, { "epoch": 1.4114453743496946, "grad_norm": 0.2529256343841553, "learning_rate": 9.67935484792593e-06, "loss": 0.7075, "step": 3120 }, { "epoch": 1.4118977606876273, "grad_norm": 0.22649984061717987, "learning_rate": 9.679096618383426e-06, "loss": 0.7292, "step": 3121 }, { "epoch": 1.4123501470255597, "grad_norm": 0.22425559163093567, "learning_rate": 9.678838288348401e-06, "loss": 0.5849, "step": 3122 }, { "epoch": 1.4128025333634924, "grad_norm": 0.24688659608364105, "learning_rate": 9.678579857826404e-06, "loss": 0.6241, "step": 3123 }, { "epoch": 1.413254919701425, "grad_norm": 0.22955970466136932, "learning_rate": 9.678321326822983e-06, "loss": 0.4998, "step": 3124 }, { "epoch": 1.4137073060393577, "grad_norm": 0.24897554516792297, "learning_rate": 9.678062695343693e-06, "loss": 0.7757, "step": 3125 }, { "epoch": 1.4141596923772903, "grad_norm": 0.2686886787414551, "learning_rate": 9.677803963394085e-06, "loss": 0.6575, "step": 3126 }, { "epoch": 1.4146120787152228, "grad_norm": 0.2694909870624542, "learning_rate": 9.677545130979722e-06, "loss": 0.6959, "step": 3127 }, { "epoch": 1.4150644650531554, "grad_norm": 0.22811685502529144, "learning_rate": 9.677286198106154e-06, "loss": 0.4995, "step": 3128 }, { "epoch": 1.415516851391088, "grad_norm": 0.2779845595359802, "learning_rate": 9.677027164778952e-06, "loss": 0.5943, "step": 3129 }, { "epoch": 1.4159692377290205, "grad_norm": 0.24365994334220886, "learning_rate": 9.676768031003672e-06, "loss": 0.512, "step": 3130 }, { "epoch": 1.4164216240669532, "grad_norm": 0.25415486097335815, "learning_rate": 9.676508796785882e-06, "loss": 0.5758, "step": 3131 }, { "epoch": 1.4168740104048858, "grad_norm": 0.263223797082901, "learning_rate": 9.676249462131149e-06, "loss": 0.6222, "step": 3132 }, { "epoch": 1.4173263967428182, "grad_norm": 0.2651483118534088, "learning_rate": 9.675990027045043e-06, "loss": 0.5523, "step": 3133 }, { "epoch": 1.417778783080751, "grad_norm": 0.2580200135707855, "learning_rate": 9.675730491533138e-06, "loss": 0.5904, "step": 3134 }, { "epoch": 1.4182311694186835, "grad_norm": 0.26204708218574524, "learning_rate": 9.675470855601003e-06, "loss": 0.598, "step": 3135 }, { "epoch": 1.4186835557566162, "grad_norm": 0.2462567239999771, "learning_rate": 9.675211119254218e-06, "loss": 0.522, "step": 3136 }, { "epoch": 1.4191359420945489, "grad_norm": 0.28796812891960144, "learning_rate": 9.674951282498362e-06, "loss": 0.6107, "step": 3137 }, { "epoch": 1.4195883284324813, "grad_norm": 0.29022416472435, "learning_rate": 9.674691345339012e-06, "loss": 0.6697, "step": 3138 }, { "epoch": 1.420040714770414, "grad_norm": 0.3012951612472534, "learning_rate": 9.674431307781752e-06, "loss": 0.6203, "step": 3139 }, { "epoch": 1.4204931011083466, "grad_norm": 0.26968827843666077, "learning_rate": 9.67417116983217e-06, "loss": 0.5336, "step": 3140 }, { "epoch": 1.420945487446279, "grad_norm": 0.2651456296443939, "learning_rate": 9.673910931495847e-06, "loss": 0.5692, "step": 3141 }, { "epoch": 1.4213978737842117, "grad_norm": 0.28677797317504883, "learning_rate": 9.673650592778378e-06, "loss": 0.5316, "step": 3142 }, { "epoch": 1.4218502601221443, "grad_norm": 0.2993583679199219, "learning_rate": 9.67339015368535e-06, "loss": 0.5798, "step": 3143 }, { "epoch": 1.422302646460077, "grad_norm": 0.29930955171585083, "learning_rate": 9.673129614222359e-06, "loss": 0.6426, "step": 3144 }, { "epoch": 1.4227550327980096, "grad_norm": 0.2807559072971344, "learning_rate": 9.672868974395e-06, "loss": 0.5418, "step": 3145 }, { "epoch": 1.423207419135942, "grad_norm": 0.3198520839214325, "learning_rate": 9.672608234208869e-06, "loss": 0.624, "step": 3146 }, { "epoch": 1.4236598054738747, "grad_norm": 0.2847517728805542, "learning_rate": 9.67234739366957e-06, "loss": 0.4185, "step": 3147 }, { "epoch": 1.4241121918118074, "grad_norm": 0.2849522531032562, "learning_rate": 9.6720864527827e-06, "loss": 0.4896, "step": 3148 }, { "epoch": 1.4245645781497398, "grad_norm": 0.3403318226337433, "learning_rate": 9.671825411553866e-06, "loss": 0.5947, "step": 3149 }, { "epoch": 1.4250169644876725, "grad_norm": 0.3014656901359558, "learning_rate": 9.671564269988674e-06, "loss": 0.5449, "step": 3150 }, { "epoch": 1.425469350825605, "grad_norm": 0.30142247676849365, "learning_rate": 9.671303028092734e-06, "loss": 0.4946, "step": 3151 }, { "epoch": 1.4259217371635375, "grad_norm": 0.31833869218826294, "learning_rate": 9.67104168587165e-06, "loss": 0.4481, "step": 3152 }, { "epoch": 1.4263741235014702, "grad_norm": 0.2967980206012726, "learning_rate": 9.670780243331045e-06, "loss": 0.4174, "step": 3153 }, { "epoch": 1.4268265098394028, "grad_norm": 0.3221031129360199, "learning_rate": 9.670518700476528e-06, "loss": 0.5003, "step": 3154 }, { "epoch": 1.4272788961773355, "grad_norm": 0.3588424026966095, "learning_rate": 9.670257057313717e-06, "loss": 0.5602, "step": 3155 }, { "epoch": 1.4277312825152682, "grad_norm": 0.388143390417099, "learning_rate": 9.66999531384823e-06, "loss": 0.519, "step": 3156 }, { "epoch": 1.4281836688532006, "grad_norm": 0.3580835461616516, "learning_rate": 9.669733470085691e-06, "loss": 0.4803, "step": 3157 }, { "epoch": 1.4286360551911332, "grad_norm": 0.39896100759506226, "learning_rate": 9.669471526031723e-06, "loss": 0.5918, "step": 3158 }, { "epoch": 1.429088441529066, "grad_norm": 0.37206631898880005, "learning_rate": 9.66920948169195e-06, "loss": 0.5446, "step": 3159 }, { "epoch": 1.4295408278669983, "grad_norm": 0.3939513564109802, "learning_rate": 9.668947337072001e-06, "loss": 0.4725, "step": 3160 }, { "epoch": 1.429993214204931, "grad_norm": 0.3994324803352356, "learning_rate": 9.668685092177505e-06, "loss": 1.0442, "step": 3161 }, { "epoch": 1.4304456005428636, "grad_norm": 0.14075881242752075, "learning_rate": 9.668422747014096e-06, "loss": 0.7325, "step": 3162 }, { "epoch": 1.4308979868807963, "grad_norm": 0.19929783046245575, "learning_rate": 9.66816030158741e-06, "loss": 0.6912, "step": 3163 }, { "epoch": 1.431350373218729, "grad_norm": 0.20629748702049255, "learning_rate": 9.66789775590308e-06, "loss": 0.5586, "step": 3164 }, { "epoch": 1.4318027595566614, "grad_norm": 0.24527086317539215, "learning_rate": 9.667635109966745e-06, "loss": 0.7262, "step": 3165 }, { "epoch": 1.432255145894594, "grad_norm": 0.19910970330238342, "learning_rate": 9.667372363784046e-06, "loss": 0.6258, "step": 3166 }, { "epoch": 1.4327075322325267, "grad_norm": 0.21659035980701447, "learning_rate": 9.667109517360627e-06, "loss": 0.6342, "step": 3167 }, { "epoch": 1.433159918570459, "grad_norm": 0.2346005141735077, "learning_rate": 9.666846570702132e-06, "loss": 0.6159, "step": 3168 }, { "epoch": 1.4336123049083918, "grad_norm": 0.22355049848556519, "learning_rate": 9.66658352381421e-06, "loss": 0.7097, "step": 3169 }, { "epoch": 1.4340646912463244, "grad_norm": 0.22951184213161469, "learning_rate": 9.666320376702512e-06, "loss": 0.598, "step": 3170 }, { "epoch": 1.4345170775842568, "grad_norm": 0.2558267116546631, "learning_rate": 9.666057129372683e-06, "loss": 0.6094, "step": 3171 }, { "epoch": 1.4349694639221895, "grad_norm": 0.23532015085220337, "learning_rate": 9.665793781830383e-06, "loss": 0.5387, "step": 3172 }, { "epoch": 1.4354218502601221, "grad_norm": 0.23580504953861237, "learning_rate": 9.665530334081266e-06, "loss": 0.6073, "step": 3173 }, { "epoch": 1.4358742365980548, "grad_norm": 0.25872349739074707, "learning_rate": 9.665266786130988e-06, "loss": 0.6507, "step": 3174 }, { "epoch": 1.4363266229359875, "grad_norm": 0.25519946217536926, "learning_rate": 9.665003137985212e-06, "loss": 0.6901, "step": 3175 }, { "epoch": 1.4367790092739199, "grad_norm": 0.29656362533569336, "learning_rate": 9.664739389649598e-06, "loss": 0.65, "step": 3176 }, { "epoch": 1.4372313956118525, "grad_norm": 0.2160213738679886, "learning_rate": 9.664475541129813e-06, "loss": 0.4947, "step": 3177 }, { "epoch": 1.4376837819497852, "grad_norm": 0.23574191331863403, "learning_rate": 9.664211592431523e-06, "loss": 0.5134, "step": 3178 }, { "epoch": 1.4381361682877176, "grad_norm": 0.2798622250556946, "learning_rate": 9.663947543560395e-06, "loss": 0.7502, "step": 3179 }, { "epoch": 1.4385885546256503, "grad_norm": 0.2547754943370819, "learning_rate": 9.663683394522101e-06, "loss": 0.6251, "step": 3180 }, { "epoch": 1.439040940963583, "grad_norm": 0.26352134346961975, "learning_rate": 9.663419145322315e-06, "loss": 0.5655, "step": 3181 }, { "epoch": 1.4394933273015154, "grad_norm": 0.2861069142818451, "learning_rate": 9.663154795966712e-06, "loss": 0.6542, "step": 3182 }, { "epoch": 1.439945713639448, "grad_norm": 0.27237749099731445, "learning_rate": 9.662890346460966e-06, "loss": 0.5999, "step": 3183 }, { "epoch": 1.4403980999773807, "grad_norm": 0.26142361760139465, "learning_rate": 9.662625796810763e-06, "loss": 0.4615, "step": 3184 }, { "epoch": 1.4408504863153133, "grad_norm": 0.27449944615364075, "learning_rate": 9.66236114702178e-06, "loss": 0.5929, "step": 3185 }, { "epoch": 1.441302872653246, "grad_norm": 0.28402742743492126, "learning_rate": 9.662096397099702e-06, "loss": 0.6332, "step": 3186 }, { "epoch": 1.4417552589911784, "grad_norm": 0.27848848700523376, "learning_rate": 9.661831547050216e-06, "loss": 0.6331, "step": 3187 }, { "epoch": 1.442207645329111, "grad_norm": 0.26836317777633667, "learning_rate": 9.661566596879006e-06, "loss": 0.5858, "step": 3188 }, { "epoch": 1.4426600316670437, "grad_norm": 0.25272297859191895, "learning_rate": 9.661301546591768e-06, "loss": 0.548, "step": 3189 }, { "epoch": 1.4431124180049761, "grad_norm": 0.29371410608291626, "learning_rate": 9.661036396194192e-06, "loss": 0.5723, "step": 3190 }, { "epoch": 1.4435648043429088, "grad_norm": 0.26750633120536804, "learning_rate": 9.660771145691972e-06, "loss": 0.5368, "step": 3191 }, { "epoch": 1.4440171906808414, "grad_norm": 0.26518863439559937, "learning_rate": 9.660505795090807e-06, "loss": 0.503, "step": 3192 }, { "epoch": 1.444469577018774, "grad_norm": 0.3120782971382141, "learning_rate": 9.660240344396392e-06, "loss": 0.5525, "step": 3193 }, { "epoch": 1.4449219633567068, "grad_norm": 0.26772260665893555, "learning_rate": 9.659974793614431e-06, "loss": 0.5653, "step": 3194 }, { "epoch": 1.4453743496946392, "grad_norm": 0.2927623987197876, "learning_rate": 9.659709142750628e-06, "loss": 0.5392, "step": 3195 }, { "epoch": 1.4458267360325718, "grad_norm": 0.28686007857322693, "learning_rate": 9.659443391810686e-06, "loss": 0.5605, "step": 3196 }, { "epoch": 1.4462791223705045, "grad_norm": 0.2719975411891937, "learning_rate": 9.659177540800312e-06, "loss": 0.5569, "step": 3197 }, { "epoch": 1.446731508708437, "grad_norm": 0.29829856753349304, "learning_rate": 9.658911589725219e-06, "loss": 0.5704, "step": 3198 }, { "epoch": 1.4471838950463696, "grad_norm": 0.29760104417800903, "learning_rate": 9.658645538591116e-06, "loss": 0.5414, "step": 3199 }, { "epoch": 1.4476362813843022, "grad_norm": 0.28418222069740295, "learning_rate": 9.658379387403718e-06, "loss": 0.4742, "step": 3200 }, { "epoch": 1.4476362813843022, "eval_loss": 0.6059855818748474, "eval_runtime": 25.9199, "eval_samples_per_second": 28.704, "eval_steps_per_second": 7.176, "step": 3200 }, { "epoch": 1.4480886677222347, "grad_norm": 0.3058840334415436, "learning_rate": 9.658113136168741e-06, "loss": 0.4878, "step": 3201 }, { "epoch": 1.4485410540601673, "grad_norm": 0.3204171657562256, "learning_rate": 9.657846784891902e-06, "loss": 0.5616, "step": 3202 }, { "epoch": 1.4489934403981, "grad_norm": 0.3121301829814911, "learning_rate": 9.657580333578924e-06, "loss": 0.4755, "step": 3203 }, { "epoch": 1.4494458267360326, "grad_norm": 0.299382746219635, "learning_rate": 9.657313782235529e-06, "loss": 0.4415, "step": 3204 }, { "epoch": 1.4498982130739653, "grad_norm": 0.3496190011501312, "learning_rate": 9.657047130867439e-06, "loss": 0.5783, "step": 3205 }, { "epoch": 1.4503505994118977, "grad_norm": 0.3302763104438782, "learning_rate": 9.656780379480383e-06, "loss": 0.5345, "step": 3206 }, { "epoch": 1.4508029857498304, "grad_norm": 0.32881420850753784, "learning_rate": 9.65651352808009e-06, "loss": 0.5013, "step": 3207 }, { "epoch": 1.451255372087763, "grad_norm": 0.362731397151947, "learning_rate": 9.65624657667229e-06, "loss": 0.5789, "step": 3208 }, { "epoch": 1.4517077584256954, "grad_norm": 0.39226338267326355, "learning_rate": 9.655979525262718e-06, "loss": 0.5173, "step": 3209 }, { "epoch": 1.452160144763628, "grad_norm": 0.4402184784412384, "learning_rate": 9.65571237385711e-06, "loss": 0.5466, "step": 3210 }, { "epoch": 1.4526125311015607, "grad_norm": 0.35916799306869507, "learning_rate": 9.6554451224612e-06, "loss": 1.3325, "step": 3211 }, { "epoch": 1.4530649174394934, "grad_norm": 0.19418075680732727, "learning_rate": 9.65517777108073e-06, "loss": 0.9363, "step": 3212 }, { "epoch": 1.453517303777426, "grad_norm": 0.1713201105594635, "learning_rate": 9.654910319721443e-06, "loss": 0.6164, "step": 3213 }, { "epoch": 1.4539696901153585, "grad_norm": 0.2631869316101074, "learning_rate": 9.65464276838908e-06, "loss": 0.7469, "step": 3214 }, { "epoch": 1.4544220764532911, "grad_norm": 0.23019880056381226, "learning_rate": 9.65437511708939e-06, "loss": 0.6899, "step": 3215 }, { "epoch": 1.4548744627912238, "grad_norm": 0.20555782318115234, "learning_rate": 9.654107365828121e-06, "loss": 0.5606, "step": 3216 }, { "epoch": 1.4553268491291562, "grad_norm": 0.20527328550815582, "learning_rate": 9.65383951461102e-06, "loss": 0.5224, "step": 3217 }, { "epoch": 1.4557792354670889, "grad_norm": 0.22936062514781952, "learning_rate": 9.653571563443846e-06, "loss": 0.6091, "step": 3218 }, { "epoch": 1.4562316218050215, "grad_norm": 0.26904037594795227, "learning_rate": 9.653303512332348e-06, "loss": 0.7551, "step": 3219 }, { "epoch": 1.456684008142954, "grad_norm": 0.1986972689628601, "learning_rate": 9.653035361282286e-06, "loss": 0.494, "step": 3220 }, { "epoch": 1.4571363944808866, "grad_norm": 0.21841003000736237, "learning_rate": 9.652767110299417e-06, "loss": 0.6146, "step": 3221 }, { "epoch": 1.4575887808188193, "grad_norm": 0.19977615773677826, "learning_rate": 9.652498759389504e-06, "loss": 0.4388, "step": 3222 }, { "epoch": 1.458041167156752, "grad_norm": 0.25160521268844604, "learning_rate": 9.65223030855831e-06, "loss": 0.5945, "step": 3223 }, { "epoch": 1.4584935534946846, "grad_norm": 0.25070619583129883, "learning_rate": 9.6519617578116e-06, "loss": 0.557, "step": 3224 }, { "epoch": 1.458945939832617, "grad_norm": 0.24794258177280426, "learning_rate": 9.65169310715514e-06, "loss": 0.6748, "step": 3225 }, { "epoch": 1.4593983261705497, "grad_norm": 0.2528335154056549, "learning_rate": 9.651424356594704e-06, "loss": 0.6448, "step": 3226 }, { "epoch": 1.4598507125084823, "grad_norm": 0.2198626697063446, "learning_rate": 9.651155506136059e-06, "loss": 0.5596, "step": 3227 }, { "epoch": 1.4603030988464147, "grad_norm": 0.28523877263069153, "learning_rate": 9.650886555784983e-06, "loss": 0.6339, "step": 3228 }, { "epoch": 1.4607554851843474, "grad_norm": 0.26001259684562683, "learning_rate": 9.650617505547251e-06, "loss": 0.6497, "step": 3229 }, { "epoch": 1.46120787152228, "grad_norm": 0.21787165105342865, "learning_rate": 9.650348355428641e-06, "loss": 0.4626, "step": 3230 }, { "epoch": 1.4616602578602127, "grad_norm": 0.24825803935527802, "learning_rate": 9.650079105434933e-06, "loss": 0.639, "step": 3231 }, { "epoch": 1.4621126441981454, "grad_norm": 0.27076300978660583, "learning_rate": 9.64980975557191e-06, "loss": 0.5587, "step": 3232 }, { "epoch": 1.4625650305360778, "grad_norm": 0.2590966820716858, "learning_rate": 9.649540305845358e-06, "loss": 0.5132, "step": 3233 }, { "epoch": 1.4630174168740104, "grad_norm": 0.2551129162311554, "learning_rate": 9.649270756261063e-06, "loss": 0.6956, "step": 3234 }, { "epoch": 1.463469803211943, "grad_norm": 0.2677765190601349, "learning_rate": 9.649001106824813e-06, "loss": 0.5262, "step": 3235 }, { "epoch": 1.4639221895498755, "grad_norm": 0.25738057494163513, "learning_rate": 9.648731357542403e-06, "loss": 0.4166, "step": 3236 }, { "epoch": 1.4643745758878082, "grad_norm": 0.2673586905002594, "learning_rate": 9.648461508419622e-06, "loss": 0.5748, "step": 3237 }, { "epoch": 1.4648269622257408, "grad_norm": 0.28519243001937866, "learning_rate": 9.648191559462267e-06, "loss": 0.6085, "step": 3238 }, { "epoch": 1.4652793485636733, "grad_norm": 0.2563812732696533, "learning_rate": 9.647921510676137e-06, "loss": 0.5978, "step": 3239 }, { "epoch": 1.465731734901606, "grad_norm": 0.2717427611351013, "learning_rate": 9.64765136206703e-06, "loss": 0.6268, "step": 3240 }, { "epoch": 1.4661841212395386, "grad_norm": 0.2951571047306061, "learning_rate": 9.647381113640749e-06, "loss": 0.5849, "step": 3241 }, { "epoch": 1.4666365075774712, "grad_norm": 0.27763423323631287, "learning_rate": 9.647110765403098e-06, "loss": 0.5271, "step": 3242 }, { "epoch": 1.4670888939154039, "grad_norm": 0.30030950903892517, "learning_rate": 9.646840317359882e-06, "loss": 0.5602, "step": 3243 }, { "epoch": 1.4675412802533363, "grad_norm": 0.33798927068710327, "learning_rate": 9.646569769516912e-06, "loss": 0.5726, "step": 3244 }, { "epoch": 1.467993666591269, "grad_norm": 0.30987969040870667, "learning_rate": 9.646299121879996e-06, "loss": 0.577, "step": 3245 }, { "epoch": 1.4684460529292016, "grad_norm": 0.30212604999542236, "learning_rate": 9.646028374454948e-06, "loss": 0.4974, "step": 3246 }, { "epoch": 1.468898439267134, "grad_norm": 0.26641321182250977, "learning_rate": 9.645757527247583e-06, "loss": 0.4711, "step": 3247 }, { "epoch": 1.4693508256050667, "grad_norm": 0.36004894971847534, "learning_rate": 9.645486580263716e-06, "loss": 0.551, "step": 3248 }, { "epoch": 1.4698032119429993, "grad_norm": 0.2731257975101471, "learning_rate": 9.645215533509168e-06, "loss": 0.4664, "step": 3249 }, { "epoch": 1.470255598280932, "grad_norm": 0.2822580933570862, "learning_rate": 9.644944386989762e-06, "loss": 0.4759, "step": 3250 }, { "epoch": 1.4707079846188644, "grad_norm": 0.2990921437740326, "learning_rate": 9.644673140711318e-06, "loss": 0.5749, "step": 3251 }, { "epoch": 1.471160370956797, "grad_norm": 0.2947103977203369, "learning_rate": 9.644401794679662e-06, "loss": 0.4444, "step": 3252 }, { "epoch": 1.4716127572947297, "grad_norm": 0.2850090265274048, "learning_rate": 9.644130348900621e-06, "loss": 0.4646, "step": 3253 }, { "epoch": 1.4720651436326624, "grad_norm": 0.3320958614349365, "learning_rate": 9.643858803380028e-06, "loss": 0.4968, "step": 3254 }, { "epoch": 1.4725175299705948, "grad_norm": 0.32581627368927, "learning_rate": 9.643587158123714e-06, "loss": 0.4293, "step": 3255 }, { "epoch": 1.4729699163085275, "grad_norm": 0.3328057527542114, "learning_rate": 9.64331541313751e-06, "loss": 0.5007, "step": 3256 }, { "epoch": 1.4734223026464601, "grad_norm": 0.3710852861404419, "learning_rate": 9.643043568427256e-06, "loss": 0.4977, "step": 3257 }, { "epoch": 1.4738746889843926, "grad_norm": 0.39530977606773376, "learning_rate": 9.642771623998789e-06, "loss": 0.6292, "step": 3258 }, { "epoch": 1.4743270753223252, "grad_norm": 0.48512002825737, "learning_rate": 9.64249957985795e-06, "loss": 0.5605, "step": 3259 }, { "epoch": 1.4747794616602579, "grad_norm": 0.4468149244785309, "learning_rate": 9.64222743601058e-06, "loss": 0.584, "step": 3260 }, { "epoch": 1.4752318479981905, "grad_norm": 0.3485540449619293, "learning_rate": 9.641955192462525e-06, "loss": 0.9681, "step": 3261 }, { "epoch": 1.4756842343361232, "grad_norm": 0.15971915423870087, "learning_rate": 9.641682849219631e-06, "loss": 0.8034, "step": 3262 }, { "epoch": 1.4761366206740556, "grad_norm": 0.17139610648155212, "learning_rate": 9.641410406287749e-06, "loss": 0.688, "step": 3263 }, { "epoch": 1.4765890070119883, "grad_norm": 0.1963765174150467, "learning_rate": 9.641137863672728e-06, "loss": 0.713, "step": 3264 }, { "epoch": 1.477041393349921, "grad_norm": 0.215157613158226, "learning_rate": 9.640865221380425e-06, "loss": 0.635, "step": 3265 }, { "epoch": 1.4774937796878533, "grad_norm": 0.2118763029575348, "learning_rate": 9.640592479416691e-06, "loss": 0.6278, "step": 3266 }, { "epoch": 1.477946166025786, "grad_norm": 0.23787331581115723, "learning_rate": 9.640319637787385e-06, "loss": 0.6902, "step": 3267 }, { "epoch": 1.4783985523637186, "grad_norm": 0.232373908162117, "learning_rate": 9.640046696498369e-06, "loss": 0.725, "step": 3268 }, { "epoch": 1.478850938701651, "grad_norm": 0.23418891429901123, "learning_rate": 9.639773655555501e-06, "loss": 0.7344, "step": 3269 }, { "epoch": 1.4793033250395837, "grad_norm": 0.2250089794397354, "learning_rate": 9.63950051496465e-06, "loss": 0.5699, "step": 3270 }, { "epoch": 1.4797557113775164, "grad_norm": 0.25465938448905945, "learning_rate": 9.63922727473168e-06, "loss": 0.7711, "step": 3271 }, { "epoch": 1.480208097715449, "grad_norm": 0.23796729743480682, "learning_rate": 9.638953934862458e-06, "loss": 0.6295, "step": 3272 }, { "epoch": 1.4806604840533817, "grad_norm": 0.2622539699077606, "learning_rate": 9.638680495362857e-06, "loss": 0.7143, "step": 3273 }, { "epoch": 1.4811128703913141, "grad_norm": 0.2418701946735382, "learning_rate": 9.638406956238746e-06, "loss": 0.6399, "step": 3274 }, { "epoch": 1.4815652567292468, "grad_norm": 0.2520108222961426, "learning_rate": 9.638133317496003e-06, "loss": 0.6005, "step": 3275 }, { "epoch": 1.4820176430671794, "grad_norm": 0.24943123757839203, "learning_rate": 9.637859579140505e-06, "loss": 0.6349, "step": 3276 }, { "epoch": 1.4824700294051119, "grad_norm": 0.2205478698015213, "learning_rate": 9.637585741178128e-06, "loss": 0.5284, "step": 3277 }, { "epoch": 1.4829224157430445, "grad_norm": 0.2742500603199005, "learning_rate": 9.637311803614757e-06, "loss": 0.5482, "step": 3278 }, { "epoch": 1.4833748020809772, "grad_norm": 0.2481415718793869, "learning_rate": 9.637037766456272e-06, "loss": 0.5651, "step": 3279 }, { "epoch": 1.4838271884189098, "grad_norm": 0.2759951949119568, "learning_rate": 9.636763629708562e-06, "loss": 0.5694, "step": 3280 }, { "epoch": 1.4842795747568425, "grad_norm": 0.23442964255809784, "learning_rate": 9.636489393377508e-06, "loss": 0.5036, "step": 3281 }, { "epoch": 1.484731961094775, "grad_norm": 0.25160834193229675, "learning_rate": 9.636215057469009e-06, "loss": 0.5206, "step": 3282 }, { "epoch": 1.4851843474327076, "grad_norm": 0.2839876115322113, "learning_rate": 9.63594062198895e-06, "loss": 0.5755, "step": 3283 }, { "epoch": 1.4856367337706402, "grad_norm": 0.26319316029548645, "learning_rate": 9.635666086943228e-06, "loss": 0.5177, "step": 3284 }, { "epoch": 1.4860891201085726, "grad_norm": 0.2991958260536194, "learning_rate": 9.635391452337737e-06, "loss": 0.6341, "step": 3285 }, { "epoch": 1.4865415064465053, "grad_norm": 0.2388707995414734, "learning_rate": 9.635116718178378e-06, "loss": 0.5497, "step": 3286 }, { "epoch": 1.486993892784438, "grad_norm": 0.26529985666275024, "learning_rate": 9.63484188447105e-06, "loss": 0.5474, "step": 3287 }, { "epoch": 1.4874462791223704, "grad_norm": 0.25657564401626587, "learning_rate": 9.634566951221655e-06, "loss": 0.543, "step": 3288 }, { "epoch": 1.487898665460303, "grad_norm": 0.24100954830646515, "learning_rate": 9.634291918436099e-06, "loss": 0.5621, "step": 3289 }, { "epoch": 1.4883510517982357, "grad_norm": 0.29864025115966797, "learning_rate": 9.634016786120289e-06, "loss": 0.6331, "step": 3290 }, { "epoch": 1.4888034381361683, "grad_norm": 0.28396788239479065, "learning_rate": 9.633741554280132e-06, "loss": 0.6672, "step": 3291 }, { "epoch": 1.489255824474101, "grad_norm": 0.2552768588066101, "learning_rate": 9.633466222921539e-06, "loss": 0.5337, "step": 3292 }, { "epoch": 1.4897082108120334, "grad_norm": 0.315841943025589, "learning_rate": 9.633190792050426e-06, "loss": 0.6559, "step": 3293 }, { "epoch": 1.490160597149966, "grad_norm": 0.2601531744003296, "learning_rate": 9.63291526167271e-06, "loss": 0.4349, "step": 3294 }, { "epoch": 1.4906129834878987, "grad_norm": 0.26449164748191833, "learning_rate": 9.632639631794301e-06, "loss": 0.4409, "step": 3295 }, { "epoch": 1.4910653698258312, "grad_norm": 0.3046272397041321, "learning_rate": 9.632363902421124e-06, "loss": 0.5301, "step": 3296 }, { "epoch": 1.4915177561637638, "grad_norm": 0.2822907269001007, "learning_rate": 9.632088073559103e-06, "loss": 0.4506, "step": 3297 }, { "epoch": 1.4919701425016965, "grad_norm": 0.3342924118041992, "learning_rate": 9.631812145214157e-06, "loss": 0.5501, "step": 3298 }, { "epoch": 1.4924225288396291, "grad_norm": 0.3157198429107666, "learning_rate": 9.631536117392213e-06, "loss": 0.5464, "step": 3299 }, { "epoch": 1.4928749151775618, "grad_norm": 0.32474929094314575, "learning_rate": 9.631259990099202e-06, "loss": 0.5902, "step": 3300 }, { "epoch": 1.4933273015154942, "grad_norm": 0.3403853476047516, "learning_rate": 9.630983763341052e-06, "loss": 0.5457, "step": 3301 }, { "epoch": 1.4937796878534269, "grad_norm": 0.2996079623699188, "learning_rate": 9.630707437123697e-06, "loss": 0.4643, "step": 3302 }, { "epoch": 1.4942320741913595, "grad_norm": 0.3485661447048187, "learning_rate": 9.630431011453071e-06, "loss": 0.5962, "step": 3303 }, { "epoch": 1.494684460529292, "grad_norm": 0.33227619528770447, "learning_rate": 9.63015448633511e-06, "loss": 0.5594, "step": 3304 }, { "epoch": 1.4951368468672246, "grad_norm": 0.3870530426502228, "learning_rate": 9.629877861775753e-06, "loss": 0.5857, "step": 3305 }, { "epoch": 1.4955892332051572, "grad_norm": 0.4164879024028778, "learning_rate": 9.629601137780944e-06, "loss": 0.6431, "step": 3306 }, { "epoch": 1.4960416195430897, "grad_norm": 0.39663952589035034, "learning_rate": 9.62932431435662e-06, "loss": 0.5423, "step": 3307 }, { "epoch": 1.4964940058810223, "grad_norm": 0.4031278192996979, "learning_rate": 9.629047391508732e-06, "loss": 0.5185, "step": 3308 }, { "epoch": 1.496946392218955, "grad_norm": 0.3695038855075836, "learning_rate": 9.628770369243225e-06, "loss": 0.5289, "step": 3309 }, { "epoch": 1.4973987785568876, "grad_norm": 0.39945462346076965, "learning_rate": 9.62849324756605e-06, "loss": 0.5093, "step": 3310 }, { "epoch": 1.4978511648948203, "grad_norm": 0.4205876886844635, "learning_rate": 9.628216026483157e-06, "loss": 1.2774, "step": 3311 }, { "epoch": 1.4983035512327527, "grad_norm": 0.17223095893859863, "learning_rate": 9.6279387060005e-06, "loss": 0.749, "step": 3312 }, { "epoch": 1.4987559375706854, "grad_norm": 0.20430748164653778, "learning_rate": 9.627661286124038e-06, "loss": 0.6911, "step": 3313 }, { "epoch": 1.499208323908618, "grad_norm": 0.21624615788459778, "learning_rate": 9.627383766859724e-06, "loss": 0.6963, "step": 3314 }, { "epoch": 1.4996607102465505, "grad_norm": 0.2424689680337906, "learning_rate": 9.627106148213521e-06, "loss": 0.6886, "step": 3315 }, { "epoch": 1.500113096584483, "grad_norm": 0.21178047358989716, "learning_rate": 9.626828430191392e-06, "loss": 0.6026, "step": 3316 }, { "epoch": 1.5005654829224158, "grad_norm": 0.19941122829914093, "learning_rate": 9.626550612799303e-06, "loss": 0.598, "step": 3317 }, { "epoch": 1.5010178692603482, "grad_norm": 0.2716292142868042, "learning_rate": 9.626272696043217e-06, "loss": 0.684, "step": 3318 }, { "epoch": 1.501470255598281, "grad_norm": 0.2535647451877594, "learning_rate": 9.625994679929103e-06, "loss": 0.6896, "step": 3319 }, { "epoch": 1.5019226419362135, "grad_norm": 0.22288668155670166, "learning_rate": 9.625716564462934e-06, "loss": 0.5752, "step": 3320 }, { "epoch": 1.5023750282741462, "grad_norm": 0.21812471747398376, "learning_rate": 9.625438349650683e-06, "loss": 0.5485, "step": 3321 }, { "epoch": 1.5028274146120788, "grad_norm": 0.24036918580532074, "learning_rate": 9.625160035498324e-06, "loss": 0.5954, "step": 3322 }, { "epoch": 1.5032798009500112, "grad_norm": 0.22435691952705383, "learning_rate": 9.624881622011836e-06, "loss": 0.5936, "step": 3323 }, { "epoch": 1.5037321872879439, "grad_norm": 0.26180359721183777, "learning_rate": 9.624603109197196e-06, "loss": 0.7774, "step": 3324 }, { "epoch": 1.5041845736258765, "grad_norm": 0.2463502585887909, "learning_rate": 9.624324497060388e-06, "loss": 0.6044, "step": 3325 }, { "epoch": 1.504636959963809, "grad_norm": 0.2556041181087494, "learning_rate": 9.624045785607395e-06, "loss": 0.6467, "step": 3326 }, { "epoch": 1.5050893463017418, "grad_norm": 0.27579155564308167, "learning_rate": 9.6237669748442e-06, "loss": 0.7459, "step": 3327 }, { "epoch": 1.5055417326396743, "grad_norm": 0.25459977984428406, "learning_rate": 9.623488064776795e-06, "loss": 0.6267, "step": 3328 }, { "epoch": 1.5059941189776067, "grad_norm": 0.24927659332752228, "learning_rate": 9.623209055411168e-06, "loss": 0.4532, "step": 3329 }, { "epoch": 1.5064465053155396, "grad_norm": 0.2639617919921875, "learning_rate": 9.622929946753313e-06, "loss": 0.5932, "step": 3330 }, { "epoch": 1.506898891653472, "grad_norm": 0.266661137342453, "learning_rate": 9.622650738809223e-06, "loss": 0.6276, "step": 3331 }, { "epoch": 1.5073512779914047, "grad_norm": 0.2387927919626236, "learning_rate": 9.622371431584893e-06, "loss": 0.5327, "step": 3332 }, { "epoch": 1.5078036643293373, "grad_norm": 0.22913601994514465, "learning_rate": 9.622092025086326e-06, "loss": 0.4752, "step": 3333 }, { "epoch": 1.5082560506672698, "grad_norm": 0.27213096618652344, "learning_rate": 9.62181251931952e-06, "loss": 0.5426, "step": 3334 }, { "epoch": 1.5087084370052024, "grad_norm": 0.27415311336517334, "learning_rate": 9.621532914290477e-06, "loss": 0.5589, "step": 3335 }, { "epoch": 1.509160823343135, "grad_norm": 0.2736496031284332, "learning_rate": 9.621253210005202e-06, "loss": 0.6745, "step": 3336 }, { "epoch": 1.5096132096810675, "grad_norm": 0.243231862783432, "learning_rate": 9.620973406469705e-06, "loss": 0.4666, "step": 3337 }, { "epoch": 1.5100655960190004, "grad_norm": 0.29179584980010986, "learning_rate": 9.620693503689994e-06, "loss": 0.5102, "step": 3338 }, { "epoch": 1.5105179823569328, "grad_norm": 0.25850117206573486, "learning_rate": 9.620413501672079e-06, "loss": 0.5696, "step": 3339 }, { "epoch": 1.5109703686948655, "grad_norm": 0.29429352283477783, "learning_rate": 9.620133400421976e-06, "loss": 0.5811, "step": 3340 }, { "epoch": 1.511422755032798, "grad_norm": 0.2709670960903168, "learning_rate": 9.619853199945699e-06, "loss": 0.5194, "step": 3341 }, { "epoch": 1.5118751413707305, "grad_norm": 0.2815287709236145, "learning_rate": 9.619572900249266e-06, "loss": 0.5672, "step": 3342 }, { "epoch": 1.5123275277086632, "grad_norm": 0.28932464122772217, "learning_rate": 9.619292501338697e-06, "loss": 0.4825, "step": 3343 }, { "epoch": 1.5127799140465958, "grad_norm": 0.28106048703193665, "learning_rate": 9.619012003220014e-06, "loss": 0.5719, "step": 3344 }, { "epoch": 1.5132323003845283, "grad_norm": 0.29153698682785034, "learning_rate": 9.618731405899242e-06, "loss": 0.5716, "step": 3345 }, { "epoch": 1.5136846867224611, "grad_norm": 0.29371199011802673, "learning_rate": 9.618450709382407e-06, "loss": 0.5001, "step": 3346 }, { "epoch": 1.5141370730603936, "grad_norm": 0.31195491552352905, "learning_rate": 9.618169913675539e-06, "loss": 0.6109, "step": 3347 }, { "epoch": 1.514589459398326, "grad_norm": 0.32994571328163147, "learning_rate": 9.617889018784665e-06, "loss": 0.574, "step": 3348 }, { "epoch": 1.5150418457362589, "grad_norm": 0.315853476524353, "learning_rate": 9.617608024715821e-06, "loss": 0.5586, "step": 3349 }, { "epoch": 1.5154942320741913, "grad_norm": 0.28516295552253723, "learning_rate": 9.617326931475039e-06, "loss": 0.4586, "step": 3350 }, { "epoch": 1.515946618412124, "grad_norm": 0.309844434261322, "learning_rate": 9.617045739068359e-06, "loss": 0.5371, "step": 3351 }, { "epoch": 1.5163990047500566, "grad_norm": 0.33165252208709717, "learning_rate": 9.616764447501818e-06, "loss": 0.5014, "step": 3352 }, { "epoch": 1.516851391087989, "grad_norm": 0.3119548559188843, "learning_rate": 9.616483056781459e-06, "loss": 0.5564, "step": 3353 }, { "epoch": 1.5173037774259217, "grad_norm": 0.30310219526290894, "learning_rate": 9.616201566913325e-06, "loss": 0.4796, "step": 3354 }, { "epoch": 1.5177561637638544, "grad_norm": 0.3069257438182831, "learning_rate": 9.61591997790346e-06, "loss": 0.4368, "step": 3355 }, { "epoch": 1.5182085501017868, "grad_norm": 0.31426483392715454, "learning_rate": 9.615638289757913e-06, "loss": 0.4252, "step": 3356 }, { "epoch": 1.5186609364397197, "grad_norm": 0.44661346077919006, "learning_rate": 9.615356502482734e-06, "loss": 0.6516, "step": 3357 }, { "epoch": 1.519113322777652, "grad_norm": 0.3906629681587219, "learning_rate": 9.615074616083972e-06, "loss": 0.5194, "step": 3358 }, { "epoch": 1.5195657091155848, "grad_norm": 0.41715338826179504, "learning_rate": 9.614792630567687e-06, "loss": 0.4729, "step": 3359 }, { "epoch": 1.5200180954535174, "grad_norm": 0.422318696975708, "learning_rate": 9.61451054593993e-06, "loss": 0.4744, "step": 3360 }, { "epoch": 1.5204704817914498, "grad_norm": 0.391488254070282, "learning_rate": 9.61422836220676e-06, "loss": 1.1494, "step": 3361 }, { "epoch": 1.5209228681293825, "grad_norm": 0.12918426096439362, "learning_rate": 9.613946079374238e-06, "loss": 0.6032, "step": 3362 }, { "epoch": 1.5213752544673151, "grad_norm": 0.20628170669078827, "learning_rate": 9.61366369744843e-06, "loss": 0.7072, "step": 3363 }, { "epoch": 1.5218276408052476, "grad_norm": 0.2174634039402008, "learning_rate": 9.613381216435395e-06, "loss": 0.6558, "step": 3364 }, { "epoch": 1.5222800271431802, "grad_norm": 0.23380467295646667, "learning_rate": 9.613098636341204e-06, "loss": 0.7308, "step": 3365 }, { "epoch": 1.5227324134811129, "grad_norm": 0.21638867259025574, "learning_rate": 9.612815957171923e-06, "loss": 0.604, "step": 3366 }, { "epoch": 1.5231847998190453, "grad_norm": 0.19966576993465424, "learning_rate": 9.612533178933625e-06, "loss": 0.5128, "step": 3367 }, { "epoch": 1.5236371861569782, "grad_norm": 0.22012905776500702, "learning_rate": 9.612250301632382e-06, "loss": 0.6058, "step": 3368 }, { "epoch": 1.5240895724949106, "grad_norm": 0.2559877038002014, "learning_rate": 9.611967325274272e-06, "loss": 0.6789, "step": 3369 }, { "epoch": 1.5245419588328433, "grad_norm": 0.24115870893001556, "learning_rate": 9.61168424986537e-06, "loss": 0.6478, "step": 3370 }, { "epoch": 1.524994345170776, "grad_norm": 0.29904595017433167, "learning_rate": 9.611401075411755e-06, "loss": 0.53, "step": 3371 }, { "epoch": 1.5254467315087084, "grad_norm": 0.23213638365268707, "learning_rate": 9.61111780191951e-06, "loss": 0.5315, "step": 3372 }, { "epoch": 1.525899117846641, "grad_norm": 0.2496502548456192, "learning_rate": 9.610834429394719e-06, "loss": 0.6782, "step": 3373 }, { "epoch": 1.5263515041845737, "grad_norm": 0.27840960025787354, "learning_rate": 9.610550957843467e-06, "loss": 0.6883, "step": 3374 }, { "epoch": 1.526803890522506, "grad_norm": 0.2343616634607315, "learning_rate": 9.610267387271844e-06, "loss": 0.5806, "step": 3375 }, { "epoch": 1.527256276860439, "grad_norm": 0.2771667540073395, "learning_rate": 9.609983717685935e-06, "loss": 0.6891, "step": 3376 }, { "epoch": 1.5277086631983714, "grad_norm": 0.2664724886417389, "learning_rate": 9.60969994909184e-06, "loss": 0.7078, "step": 3377 }, { "epoch": 1.528161049536304, "grad_norm": 0.23847821354866028, "learning_rate": 9.609416081495649e-06, "loss": 0.518, "step": 3378 }, { "epoch": 1.5286134358742367, "grad_norm": 0.30707672238349915, "learning_rate": 9.609132114903458e-06, "loss": 0.6371, "step": 3379 }, { "epoch": 1.5290658222121691, "grad_norm": 0.2796226441860199, "learning_rate": 9.608848049321368e-06, "loss": 0.6637, "step": 3380 }, { "epoch": 1.5295182085501018, "grad_norm": 0.2323017120361328, "learning_rate": 9.60856388475548e-06, "loss": 0.4428, "step": 3381 }, { "epoch": 1.5299705948880344, "grad_norm": 0.27409660816192627, "learning_rate": 9.608279621211893e-06, "loss": 0.6266, "step": 3382 }, { "epoch": 1.5304229812259669, "grad_norm": 0.29068610072135925, "learning_rate": 9.607995258696718e-06, "loss": 0.5873, "step": 3383 }, { "epoch": 1.5308753675638995, "grad_norm": 0.27450260519981384, "learning_rate": 9.607710797216057e-06, "loss": 0.5817, "step": 3384 }, { "epoch": 1.5313277539018322, "grad_norm": 0.26477450132369995, "learning_rate": 9.607426236776022e-06, "loss": 0.488, "step": 3385 }, { "epoch": 1.5317801402397646, "grad_norm": 0.2831242084503174, "learning_rate": 9.607141577382724e-06, "loss": 0.6618, "step": 3386 }, { "epoch": 1.5322325265776975, "grad_norm": 0.2816939353942871, "learning_rate": 9.606856819042277e-06, "loss": 0.6194, "step": 3387 }, { "epoch": 1.53268491291563, "grad_norm": 0.30232179164886475, "learning_rate": 9.606571961760795e-06, "loss": 0.6418, "step": 3388 }, { "epoch": 1.5331372992535626, "grad_norm": 0.27521952986717224, "learning_rate": 9.606287005544399e-06, "loss": 0.525, "step": 3389 }, { "epoch": 1.5335896855914952, "grad_norm": 0.27149927616119385, "learning_rate": 9.606001950399207e-06, "loss": 0.5486, "step": 3390 }, { "epoch": 1.5340420719294277, "grad_norm": 0.27453628182411194, "learning_rate": 9.60571679633134e-06, "loss": 0.5007, "step": 3391 }, { "epoch": 1.5344944582673603, "grad_norm": 0.2726672291755676, "learning_rate": 9.605431543346925e-06, "loss": 0.4132, "step": 3392 }, { "epoch": 1.534946844605293, "grad_norm": 0.2875711917877197, "learning_rate": 9.605146191452085e-06, "loss": 0.5617, "step": 3393 }, { "epoch": 1.5353992309432254, "grad_norm": 0.2742254137992859, "learning_rate": 9.604860740652952e-06, "loss": 0.4639, "step": 3394 }, { "epoch": 1.5358516172811583, "grad_norm": 0.3218792676925659, "learning_rate": 9.604575190955654e-06, "loss": 0.6516, "step": 3395 }, { "epoch": 1.5363040036190907, "grad_norm": 0.2969991862773895, "learning_rate": 9.604289542366325e-06, "loss": 0.4664, "step": 3396 }, { "epoch": 1.5367563899570233, "grad_norm": 0.28579187393188477, "learning_rate": 9.6040037948911e-06, "loss": 0.5834, "step": 3397 }, { "epoch": 1.537208776294956, "grad_norm": 0.3178653419017792, "learning_rate": 9.603717948536113e-06, "loss": 0.4938, "step": 3398 }, { "epoch": 1.5376611626328884, "grad_norm": 0.36365458369255066, "learning_rate": 9.603432003307507e-06, "loss": 0.7027, "step": 3399 }, { "epoch": 1.538113548970821, "grad_norm": 0.37191587686538696, "learning_rate": 9.603145959211424e-06, "loss": 0.5491, "step": 3400 }, { "epoch": 1.538113548970821, "eval_loss": 0.6032029390335083, "eval_runtime": 25.7729, "eval_samples_per_second": 28.867, "eval_steps_per_second": 7.217, "step": 3400 }, { "epoch": 1.5385659353087537, "grad_norm": 0.3351244330406189, "learning_rate": 9.602859816254001e-06, "loss": 0.5782, "step": 3401 }, { "epoch": 1.5390183216466862, "grad_norm": 0.32671791315078735, "learning_rate": 9.602573574441392e-06, "loss": 0.5555, "step": 3402 }, { "epoch": 1.5394707079846188, "grad_norm": 0.3302680253982544, "learning_rate": 9.602287233779737e-06, "loss": 0.5451, "step": 3403 }, { "epoch": 1.5399230943225515, "grad_norm": 0.3128826320171356, "learning_rate": 9.60200079427519e-06, "loss": 0.5457, "step": 3404 }, { "epoch": 1.540375480660484, "grad_norm": 0.33837518095970154, "learning_rate": 9.601714255933902e-06, "loss": 0.5372, "step": 3405 }, { "epoch": 1.5408278669984168, "grad_norm": 0.39983147382736206, "learning_rate": 9.601427618762027e-06, "loss": 0.5823, "step": 3406 }, { "epoch": 1.5412802533363492, "grad_norm": 0.3945506513118744, "learning_rate": 9.601140882765718e-06, "loss": 0.605, "step": 3407 }, { "epoch": 1.5417326396742819, "grad_norm": 0.39802661538124084, "learning_rate": 9.600854047951137e-06, "loss": 0.5281, "step": 3408 }, { "epoch": 1.5421850260122145, "grad_norm": 0.376409649848938, "learning_rate": 9.600567114324446e-06, "loss": 0.4686, "step": 3409 }, { "epoch": 1.542637412350147, "grad_norm": 0.3993771970272064, "learning_rate": 9.600280081891801e-06, "loss": 0.5013, "step": 3410 }, { "epoch": 1.5430897986880796, "grad_norm": 0.36009469628334045, "learning_rate": 9.599992950659374e-06, "loss": 1.134, "step": 3411 }, { "epoch": 1.5435421850260123, "grad_norm": 0.1551712602376938, "learning_rate": 9.599705720633325e-06, "loss": 1.0149, "step": 3412 }, { "epoch": 1.5439945713639447, "grad_norm": 0.1723669320344925, "learning_rate": 9.599418391819827e-06, "loss": 0.6221, "step": 3413 }, { "epoch": 1.5444469577018776, "grad_norm": 0.22033587098121643, "learning_rate": 9.59913096422505e-06, "loss": 0.6835, "step": 3414 }, { "epoch": 1.54489934403981, "grad_norm": 0.24504825472831726, "learning_rate": 9.598843437855167e-06, "loss": 0.7569, "step": 3415 }, { "epoch": 1.5453517303777424, "grad_norm": 0.2014402449131012, "learning_rate": 9.598555812716353e-06, "loss": 0.6037, "step": 3416 }, { "epoch": 1.5458041167156753, "grad_norm": 0.2074400633573532, "learning_rate": 9.598268088814784e-06, "loss": 0.7109, "step": 3417 }, { "epoch": 1.5462565030536077, "grad_norm": 0.21651095151901245, "learning_rate": 9.597980266156641e-06, "loss": 0.5732, "step": 3418 }, { "epoch": 1.5467088893915404, "grad_norm": 0.25055474042892456, "learning_rate": 9.597692344748106e-06, "loss": 0.6449, "step": 3419 }, { "epoch": 1.547161275729473, "grad_norm": 0.2379698008298874, "learning_rate": 9.597404324595361e-06, "loss": 0.6012, "step": 3420 }, { "epoch": 1.5476136620674055, "grad_norm": 0.24687039852142334, "learning_rate": 9.597116205704593e-06, "loss": 0.6029, "step": 3421 }, { "epoch": 1.5480660484053381, "grad_norm": 0.2796812951564789, "learning_rate": 9.59682798808199e-06, "loss": 0.6405, "step": 3422 }, { "epoch": 1.5485184347432708, "grad_norm": 0.2414379119873047, "learning_rate": 9.596539671733741e-06, "loss": 0.571, "step": 3423 }, { "epoch": 1.5489708210812032, "grad_norm": 0.2049388438463211, "learning_rate": 9.596251256666039e-06, "loss": 0.4703, "step": 3424 }, { "epoch": 1.549423207419136, "grad_norm": 0.22208218276500702, "learning_rate": 9.595962742885078e-06, "loss": 0.558, "step": 3425 }, { "epoch": 1.5498755937570685, "grad_norm": 0.24054628610610962, "learning_rate": 9.595674130397055e-06, "loss": 0.636, "step": 3426 }, { "epoch": 1.5503279800950012, "grad_norm": 0.25134196877479553, "learning_rate": 9.595385419208168e-06, "loss": 0.5396, "step": 3427 }, { "epoch": 1.5507803664329338, "grad_norm": 0.2617988884449005, "learning_rate": 9.595096609324616e-06, "loss": 0.5766, "step": 3428 }, { "epoch": 1.5512327527708663, "grad_norm": 0.2558850049972534, "learning_rate": 9.594807700752604e-06, "loss": 0.557, "step": 3429 }, { "epoch": 1.551685139108799, "grad_norm": 0.2215222865343094, "learning_rate": 9.594518693498336e-06, "loss": 0.5002, "step": 3430 }, { "epoch": 1.5521375254467316, "grad_norm": 0.2744232714176178, "learning_rate": 9.594229587568019e-06, "loss": 0.6539, "step": 3431 }, { "epoch": 1.552589911784664, "grad_norm": 0.29493510723114014, "learning_rate": 9.593940382967862e-06, "loss": 0.5937, "step": 3432 }, { "epoch": 1.5530422981225966, "grad_norm": 0.2824174165725708, "learning_rate": 9.593651079704078e-06, "loss": 0.6827, "step": 3433 }, { "epoch": 1.5534946844605293, "grad_norm": 0.23830533027648926, "learning_rate": 9.593361677782876e-06, "loss": 0.5246, "step": 3434 }, { "epoch": 1.5539470707984617, "grad_norm": 0.2827991247177124, "learning_rate": 9.593072177210477e-06, "loss": 0.6286, "step": 3435 }, { "epoch": 1.5543994571363946, "grad_norm": 0.2792828381061554, "learning_rate": 9.592782577993094e-06, "loss": 0.6461, "step": 3436 }, { "epoch": 1.554851843474327, "grad_norm": 0.3000594675540924, "learning_rate": 9.59249288013695e-06, "loss": 0.6386, "step": 3437 }, { "epoch": 1.5553042298122597, "grad_norm": 0.28520944714546204, "learning_rate": 9.592203083648264e-06, "loss": 0.6017, "step": 3438 }, { "epoch": 1.5557566161501923, "grad_norm": 0.2688097059726715, "learning_rate": 9.59191318853326e-06, "loss": 0.5248, "step": 3439 }, { "epoch": 1.5562090024881248, "grad_norm": 0.31226077675819397, "learning_rate": 9.591623194798168e-06, "loss": 0.6243, "step": 3440 }, { "epoch": 1.5566613888260574, "grad_norm": 0.25289392471313477, "learning_rate": 9.591333102449212e-06, "loss": 0.4906, "step": 3441 }, { "epoch": 1.55711377516399, "grad_norm": 0.3071081042289734, "learning_rate": 9.591042911492622e-06, "loss": 0.6628, "step": 3442 }, { "epoch": 1.5575661615019225, "grad_norm": 0.28069204092025757, "learning_rate": 9.590752621934636e-06, "loss": 0.4727, "step": 3443 }, { "epoch": 1.5580185478398554, "grad_norm": 0.3134462535381317, "learning_rate": 9.590462233781483e-06, "loss": 0.5911, "step": 3444 }, { "epoch": 1.5584709341777878, "grad_norm": 0.2828170657157898, "learning_rate": 9.5901717470394e-06, "loss": 0.4864, "step": 3445 }, { "epoch": 1.5589233205157205, "grad_norm": 0.2673971951007843, "learning_rate": 9.58988116171463e-06, "loss": 0.5187, "step": 3446 }, { "epoch": 1.5593757068536531, "grad_norm": 0.30632656812667847, "learning_rate": 9.589590477813408e-06, "loss": 0.4847, "step": 3447 }, { "epoch": 1.5598280931915856, "grad_norm": 0.3117164969444275, "learning_rate": 9.589299695341981e-06, "loss": 0.5196, "step": 3448 }, { "epoch": 1.5602804795295182, "grad_norm": 0.2788640558719635, "learning_rate": 9.589008814306594e-06, "loss": 0.4978, "step": 3449 }, { "epoch": 1.5607328658674509, "grad_norm": 0.3205011188983917, "learning_rate": 9.588717834713492e-06, "loss": 0.5869, "step": 3450 }, { "epoch": 1.5611852522053833, "grad_norm": 0.3297264873981476, "learning_rate": 9.588426756568925e-06, "loss": 0.574, "step": 3451 }, { "epoch": 1.561637638543316, "grad_norm": 0.38639938831329346, "learning_rate": 9.588135579879145e-06, "loss": 0.6621, "step": 3452 }, { "epoch": 1.5620900248812486, "grad_norm": 0.34411191940307617, "learning_rate": 9.587844304650407e-06, "loss": 0.5348, "step": 3453 }, { "epoch": 1.562542411219181, "grad_norm": 0.2944461703300476, "learning_rate": 9.587552930888967e-06, "loss": 0.4937, "step": 3454 }, { "epoch": 1.562994797557114, "grad_norm": 0.3902071416378021, "learning_rate": 9.587261458601079e-06, "loss": 0.6454, "step": 3455 }, { "epoch": 1.5634471838950463, "grad_norm": 0.35786792635917664, "learning_rate": 9.586969887793003e-06, "loss": 0.4795, "step": 3456 }, { "epoch": 1.563899570232979, "grad_norm": 0.3142816424369812, "learning_rate": 9.586678218471006e-06, "loss": 0.4476, "step": 3457 }, { "epoch": 1.5643519565709116, "grad_norm": 0.32983919978141785, "learning_rate": 9.586386450641347e-06, "loss": 0.5307, "step": 3458 }, { "epoch": 1.564804342908844, "grad_norm": 0.37688329815864563, "learning_rate": 9.586094584310296e-06, "loss": 0.4988, "step": 3459 }, { "epoch": 1.5652567292467767, "grad_norm": 0.5330315232276917, "learning_rate": 9.58580261948412e-06, "loss": 0.6356, "step": 3460 }, { "epoch": 1.5657091155847094, "grad_norm": 0.3104362189769745, "learning_rate": 9.58551055616909e-06, "loss": 1.0809, "step": 3461 }, { "epoch": 1.5661615019226418, "grad_norm": 0.16663587093353271, "learning_rate": 9.585218394371476e-06, "loss": 0.734, "step": 3462 }, { "epoch": 1.5666138882605747, "grad_norm": 0.1844397932291031, "learning_rate": 9.584926134097557e-06, "loss": 0.5134, "step": 3463 }, { "epoch": 1.5670662745985071, "grad_norm": 0.19410136342048645, "learning_rate": 9.584633775353607e-06, "loss": 0.6494, "step": 3464 }, { "epoch": 1.5675186609364398, "grad_norm": 0.20430253446102142, "learning_rate": 9.584341318145905e-06, "loss": 0.6295, "step": 3465 }, { "epoch": 1.5679710472743724, "grad_norm": 0.256160706281662, "learning_rate": 9.584048762480732e-06, "loss": 0.7674, "step": 3466 }, { "epoch": 1.5684234336123049, "grad_norm": 0.22155046463012695, "learning_rate": 9.583756108364373e-06, "loss": 0.632, "step": 3467 }, { "epoch": 1.5688758199502375, "grad_norm": 0.21760471165180206, "learning_rate": 9.583463355803111e-06, "loss": 0.5755, "step": 3468 }, { "epoch": 1.5693282062881702, "grad_norm": 0.22662128508090973, "learning_rate": 9.583170504803235e-06, "loss": 0.5841, "step": 3469 }, { "epoch": 1.5697805926261026, "grad_norm": 0.23902878165245056, "learning_rate": 9.582877555371035e-06, "loss": 0.7214, "step": 3470 }, { "epoch": 1.5702329789640352, "grad_norm": 0.26075902581214905, "learning_rate": 9.5825845075128e-06, "loss": 0.7094, "step": 3471 }, { "epoch": 1.570685365301968, "grad_norm": 0.26533257961273193, "learning_rate": 9.582291361234827e-06, "loss": 0.6962, "step": 3472 }, { "epoch": 1.5711377516399003, "grad_norm": 0.2750012278556824, "learning_rate": 9.58199811654341e-06, "loss": 0.7025, "step": 3473 }, { "epoch": 1.5715901379778332, "grad_norm": 0.24078798294067383, "learning_rate": 9.581704773444847e-06, "loss": 0.5917, "step": 3474 }, { "epoch": 1.5720425243157656, "grad_norm": 0.25302553176879883, "learning_rate": 9.581411331945439e-06, "loss": 0.5515, "step": 3475 }, { "epoch": 1.5724949106536983, "grad_norm": 0.2556876242160797, "learning_rate": 9.581117792051487e-06, "loss": 0.6127, "step": 3476 }, { "epoch": 1.572947296991631, "grad_norm": 0.27502578496932983, "learning_rate": 9.580824153769296e-06, "loss": 0.666, "step": 3477 }, { "epoch": 1.5733996833295634, "grad_norm": 0.25153443217277527, "learning_rate": 9.580530417105174e-06, "loss": 0.6206, "step": 3478 }, { "epoch": 1.573852069667496, "grad_norm": 0.2513856887817383, "learning_rate": 9.580236582065426e-06, "loss": 0.5052, "step": 3479 }, { "epoch": 1.5743044560054287, "grad_norm": 0.2660365402698517, "learning_rate": 9.579942648656365e-06, "loss": 0.5704, "step": 3480 }, { "epoch": 1.574756842343361, "grad_norm": 0.2702926695346832, "learning_rate": 9.579648616884304e-06, "loss": 0.5735, "step": 3481 }, { "epoch": 1.575209228681294, "grad_norm": 0.272634357213974, "learning_rate": 9.579354486755556e-06, "loss": 0.5777, "step": 3482 }, { "epoch": 1.5756616150192264, "grad_norm": 0.27864885330200195, "learning_rate": 9.579060258276443e-06, "loss": 0.6624, "step": 3483 }, { "epoch": 1.576114001357159, "grad_norm": 0.2738334834575653, "learning_rate": 9.578765931453279e-06, "loss": 0.5954, "step": 3484 }, { "epoch": 1.5765663876950917, "grad_norm": 0.2717858552932739, "learning_rate": 9.578471506292384e-06, "loss": 0.4949, "step": 3485 }, { "epoch": 1.5770187740330242, "grad_norm": 0.28530120849609375, "learning_rate": 9.578176982800084e-06, "loss": 0.5549, "step": 3486 }, { "epoch": 1.5774711603709568, "grad_norm": 0.23932033777236938, "learning_rate": 9.577882360982707e-06, "loss": 0.5164, "step": 3487 }, { "epoch": 1.5779235467088895, "grad_norm": 0.26478883624076843, "learning_rate": 9.577587640846577e-06, "loss": 0.4707, "step": 3488 }, { "epoch": 1.5783759330468219, "grad_norm": 0.25634559988975525, "learning_rate": 9.577292822398026e-06, "loss": 0.5244, "step": 3489 }, { "epoch": 1.5788283193847545, "grad_norm": 0.25853052735328674, "learning_rate": 9.576997905643384e-06, "loss": 0.5468, "step": 3490 }, { "epoch": 1.5792807057226872, "grad_norm": 0.2902717888355255, "learning_rate": 9.576702890588985e-06, "loss": 0.5041, "step": 3491 }, { "epoch": 1.5797330920606196, "grad_norm": 0.3385380506515503, "learning_rate": 9.576407777241164e-06, "loss": 0.6184, "step": 3492 }, { "epoch": 1.5801854783985525, "grad_norm": 0.3010668456554413, "learning_rate": 9.576112565606263e-06, "loss": 0.5558, "step": 3493 }, { "epoch": 1.580637864736485, "grad_norm": 0.30869418382644653, "learning_rate": 9.575817255690618e-06, "loss": 0.5865, "step": 3494 }, { "epoch": 1.5810902510744176, "grad_norm": 0.3149212896823883, "learning_rate": 9.575521847500575e-06, "loss": 0.5829, "step": 3495 }, { "epoch": 1.5815426374123502, "grad_norm": 0.3225876986980438, "learning_rate": 9.575226341042474e-06, "loss": 0.576, "step": 3496 }, { "epoch": 1.5819950237502827, "grad_norm": 0.29661405086517334, "learning_rate": 9.574930736322665e-06, "loss": 0.5895, "step": 3497 }, { "epoch": 1.5824474100882153, "grad_norm": 0.28631794452667236, "learning_rate": 9.574635033347496e-06, "loss": 0.4628, "step": 3498 }, { "epoch": 1.582899796426148, "grad_norm": 0.30417564511299133, "learning_rate": 9.57433923212332e-06, "loss": 0.4482, "step": 3499 }, { "epoch": 1.5833521827640804, "grad_norm": 0.33763939142227173, "learning_rate": 9.574043332656484e-06, "loss": 0.5567, "step": 3500 }, { "epoch": 1.5838045691020133, "grad_norm": 0.3320923447608948, "learning_rate": 9.57374733495335e-06, "loss": 0.5902, "step": 3501 }, { "epoch": 1.5842569554399457, "grad_norm": 0.3099247217178345, "learning_rate": 9.573451239020268e-06, "loss": 0.5394, "step": 3502 }, { "epoch": 1.5847093417778781, "grad_norm": 0.3251764476299286, "learning_rate": 9.573155044863602e-06, "loss": 0.4733, "step": 3503 }, { "epoch": 1.585161728115811, "grad_norm": 0.3483607769012451, "learning_rate": 9.572858752489713e-06, "loss": 0.483, "step": 3504 }, { "epoch": 1.5856141144537435, "grad_norm": 0.33728283643722534, "learning_rate": 9.572562361904964e-06, "loss": 0.5343, "step": 3505 }, { "epoch": 1.586066500791676, "grad_norm": 0.37801027297973633, "learning_rate": 9.572265873115718e-06, "loss": 0.567, "step": 3506 }, { "epoch": 1.5865188871296088, "grad_norm": 0.37834423780441284, "learning_rate": 9.571969286128346e-06, "loss": 0.6005, "step": 3507 }, { "epoch": 1.5869712734675412, "grad_norm": 0.3795761466026306, "learning_rate": 9.571672600949217e-06, "loss": 0.5067, "step": 3508 }, { "epoch": 1.5874236598054738, "grad_norm": 0.3946326971054077, "learning_rate": 9.571375817584703e-06, "loss": 0.4663, "step": 3509 }, { "epoch": 1.5878760461434065, "grad_norm": 0.44395625591278076, "learning_rate": 9.571078936041177e-06, "loss": 0.5834, "step": 3510 }, { "epoch": 1.588328432481339, "grad_norm": 0.3209308683872223, "learning_rate": 9.570781956325015e-06, "loss": 0.9823, "step": 3511 }, { "epoch": 1.5887808188192718, "grad_norm": 0.1449478417634964, "learning_rate": 9.570484878442596e-06, "loss": 0.5109, "step": 3512 }, { "epoch": 1.5892332051572042, "grad_norm": 0.21067214012145996, "learning_rate": 9.5701877024003e-06, "loss": 0.7856, "step": 3513 }, { "epoch": 1.5896855914951369, "grad_norm": 0.19687515497207642, "learning_rate": 9.56989042820451e-06, "loss": 0.5574, "step": 3514 }, { "epoch": 1.5901379778330695, "grad_norm": 0.2271116077899933, "learning_rate": 9.569593055861611e-06, "loss": 0.8546, "step": 3515 }, { "epoch": 1.590590364171002, "grad_norm": 0.2098984271287918, "learning_rate": 9.569295585377986e-06, "loss": 0.5362, "step": 3516 }, { "epoch": 1.5910427505089346, "grad_norm": 0.2583464980125427, "learning_rate": 9.56899801676003e-06, "loss": 0.7928, "step": 3517 }, { "epoch": 1.5914951368468673, "grad_norm": 0.21456237137317657, "learning_rate": 9.568700350014126e-06, "loss": 0.657, "step": 3518 }, { "epoch": 1.5919475231847997, "grad_norm": 0.2835600674152374, "learning_rate": 9.568402585146674e-06, "loss": 0.7476, "step": 3519 }, { "epoch": 1.5923999095227324, "grad_norm": 0.2548482418060303, "learning_rate": 9.568104722164067e-06, "loss": 0.6418, "step": 3520 }, { "epoch": 1.592852295860665, "grad_norm": 0.25154632329940796, "learning_rate": 9.5678067610727e-06, "loss": 0.5949, "step": 3521 }, { "epoch": 1.5933046821985974, "grad_norm": 0.2534101903438568, "learning_rate": 9.567508701878974e-06, "loss": 0.582, "step": 3522 }, { "epoch": 1.5937570685365303, "grad_norm": 0.24180270731449127, "learning_rate": 9.567210544589289e-06, "loss": 0.5139, "step": 3523 }, { "epoch": 1.5942094548744628, "grad_norm": 0.24699397385120392, "learning_rate": 9.566912289210051e-06, "loss": 0.4628, "step": 3524 }, { "epoch": 1.5946618412123954, "grad_norm": 0.2658122777938843, "learning_rate": 9.566613935747664e-06, "loss": 0.6416, "step": 3525 }, { "epoch": 1.595114227550328, "grad_norm": 0.266683965921402, "learning_rate": 9.566315484208535e-06, "loss": 0.5014, "step": 3526 }, { "epoch": 1.5955666138882605, "grad_norm": 0.2556036412715912, "learning_rate": 9.566016934599078e-06, "loss": 0.6365, "step": 3527 }, { "epoch": 1.5960190002261931, "grad_norm": 0.2458638846874237, "learning_rate": 9.565718286925699e-06, "loss": 0.521, "step": 3528 }, { "epoch": 1.5964713865641258, "grad_norm": 0.2830744981765747, "learning_rate": 9.565419541194815e-06, "loss": 0.6497, "step": 3529 }, { "epoch": 1.5969237729020582, "grad_norm": 0.25663521885871887, "learning_rate": 9.565120697412841e-06, "loss": 0.4891, "step": 3530 }, { "epoch": 1.597376159239991, "grad_norm": 0.2574569284915924, "learning_rate": 9.564821755586197e-06, "loss": 0.4827, "step": 3531 }, { "epoch": 1.5978285455779235, "grad_norm": 0.2450714409351349, "learning_rate": 9.564522715721304e-06, "loss": 0.4668, "step": 3532 }, { "epoch": 1.5982809319158562, "grad_norm": 0.25897806882858276, "learning_rate": 9.56422357782458e-06, "loss": 0.5557, "step": 3533 }, { "epoch": 1.5987333182537888, "grad_norm": 0.25646302103996277, "learning_rate": 9.563924341902454e-06, "loss": 0.5097, "step": 3534 }, { "epoch": 1.5991857045917213, "grad_norm": 0.2656823992729187, "learning_rate": 9.563625007961352e-06, "loss": 0.5629, "step": 3535 }, { "epoch": 1.599638090929654, "grad_norm": 0.2780112028121948, "learning_rate": 9.563325576007702e-06, "loss": 0.5556, "step": 3536 }, { "epoch": 1.6000904772675866, "grad_norm": 0.26123517751693726, "learning_rate": 9.563026046047934e-06, "loss": 0.4955, "step": 3537 }, { "epoch": 1.600542863605519, "grad_norm": 0.2863539755344391, "learning_rate": 9.562726418088482e-06, "loss": 0.4951, "step": 3538 }, { "epoch": 1.6009952499434517, "grad_norm": 0.27989664673805237, "learning_rate": 9.56242669213578e-06, "loss": 0.5187, "step": 3539 }, { "epoch": 1.6014476362813843, "grad_norm": 0.27884286642074585, "learning_rate": 9.562126868196266e-06, "loss": 0.5584, "step": 3540 }, { "epoch": 1.6019000226193167, "grad_norm": 0.28456810116767883, "learning_rate": 9.561826946276379e-06, "loss": 0.5412, "step": 3541 }, { "epoch": 1.6023524089572496, "grad_norm": 0.27652639150619507, "learning_rate": 9.561526926382561e-06, "loss": 0.4601, "step": 3542 }, { "epoch": 1.602804795295182, "grad_norm": 0.28495439887046814, "learning_rate": 9.561226808521256e-06, "loss": 0.5482, "step": 3543 }, { "epoch": 1.6032571816331147, "grad_norm": 0.3128574788570404, "learning_rate": 9.560926592698909e-06, "loss": 0.5495, "step": 3544 }, { "epoch": 1.6037095679710474, "grad_norm": 0.2887755334377289, "learning_rate": 9.560626278921967e-06, "loss": 0.5191, "step": 3545 }, { "epoch": 1.6041619543089798, "grad_norm": 0.28326863050460815, "learning_rate": 9.56032586719688e-06, "loss": 0.5632, "step": 3546 }, { "epoch": 1.6046143406469124, "grad_norm": 0.29230284690856934, "learning_rate": 9.5600253575301e-06, "loss": 0.508, "step": 3547 }, { "epoch": 1.605066726984845, "grad_norm": 0.321900337934494, "learning_rate": 9.55972474992808e-06, "loss": 0.5956, "step": 3548 }, { "epoch": 1.6055191133227775, "grad_norm": 0.3110072910785675, "learning_rate": 9.55942404439728e-06, "loss": 0.5028, "step": 3549 }, { "epoch": 1.6059714996607104, "grad_norm": 0.33015432953834534, "learning_rate": 9.559123240944156e-06, "loss": 0.5355, "step": 3550 }, { "epoch": 1.6064238859986428, "grad_norm": 0.35003146529197693, "learning_rate": 9.558822339575167e-06, "loss": 0.5268, "step": 3551 }, { "epoch": 1.6068762723365755, "grad_norm": 0.3381284177303314, "learning_rate": 9.558521340296775e-06, "loss": 0.5544, "step": 3552 }, { "epoch": 1.6073286586745081, "grad_norm": 0.35171937942504883, "learning_rate": 9.558220243115446e-06, "loss": 0.516, "step": 3553 }, { "epoch": 1.6077810450124406, "grad_norm": 0.3014110028743744, "learning_rate": 9.557919048037646e-06, "loss": 0.5002, "step": 3554 }, { "epoch": 1.6082334313503732, "grad_norm": 0.3832520544528961, "learning_rate": 9.557617755069848e-06, "loss": 0.5559, "step": 3555 }, { "epoch": 1.6086858176883059, "grad_norm": 0.4276885688304901, "learning_rate": 9.557316364218515e-06, "loss": 0.5815, "step": 3556 }, { "epoch": 1.6091382040262383, "grad_norm": 0.32738399505615234, "learning_rate": 9.557014875490127e-06, "loss": 0.4958, "step": 3557 }, { "epoch": 1.609590590364171, "grad_norm": 0.41293540596961975, "learning_rate": 9.556713288891154e-06, "loss": 0.6166, "step": 3558 }, { "epoch": 1.6100429767021036, "grad_norm": 0.37256044149398804, "learning_rate": 9.556411604428077e-06, "loss": 0.4857, "step": 3559 }, { "epoch": 1.610495363040036, "grad_norm": 0.4117400050163269, "learning_rate": 9.556109822107371e-06, "loss": 0.471, "step": 3560 }, { "epoch": 1.610947749377969, "grad_norm": 0.35351279377937317, "learning_rate": 9.555807941935522e-06, "loss": 0.9008, "step": 3561 }, { "epoch": 1.6114001357159013, "grad_norm": 0.16082851588726044, "learning_rate": 9.555505963919012e-06, "loss": 1.3991, "step": 3562 }, { "epoch": 1.611852522053834, "grad_norm": 0.16155856847763062, "learning_rate": 9.555203888064324e-06, "loss": 0.5616, "step": 3563 }, { "epoch": 1.6123049083917667, "grad_norm": 0.20653550326824188, "learning_rate": 9.554901714377947e-06, "loss": 0.6849, "step": 3564 }, { "epoch": 1.612757294729699, "grad_norm": 0.2182815968990326, "learning_rate": 9.554599442866373e-06, "loss": 0.6839, "step": 3565 }, { "epoch": 1.6132096810676317, "grad_norm": 0.20474278926849365, "learning_rate": 9.554297073536091e-06, "loss": 0.5856, "step": 3566 }, { "epoch": 1.6136620674055644, "grad_norm": 0.21613189578056335, "learning_rate": 9.553994606393596e-06, "loss": 0.6285, "step": 3567 }, { "epoch": 1.6141144537434968, "grad_norm": 0.21939405798912048, "learning_rate": 9.553692041445384e-06, "loss": 0.5304, "step": 3568 }, { "epoch": 1.6145668400814297, "grad_norm": 0.2633697986602783, "learning_rate": 9.553389378697955e-06, "loss": 0.767, "step": 3569 }, { "epoch": 1.6150192264193621, "grad_norm": 0.23464249074459076, "learning_rate": 9.553086618157806e-06, "loss": 0.6252, "step": 3570 }, { "epoch": 1.6154716127572948, "grad_norm": 0.21640828251838684, "learning_rate": 9.55278375983144e-06, "loss": 0.6053, "step": 3571 }, { "epoch": 1.6159239990952274, "grad_norm": 0.21039873361587524, "learning_rate": 9.552480803725365e-06, "loss": 0.5037, "step": 3572 }, { "epoch": 1.6163763854331599, "grad_norm": 0.232986718416214, "learning_rate": 9.552177749846083e-06, "loss": 0.5372, "step": 3573 }, { "epoch": 1.6168287717710925, "grad_norm": 0.2288360446691513, "learning_rate": 9.551874598200107e-06, "loss": 0.5588, "step": 3574 }, { "epoch": 1.6172811581090252, "grad_norm": 0.276707261800766, "learning_rate": 9.551571348793942e-06, "loss": 0.7278, "step": 3575 }, { "epoch": 1.6177335444469576, "grad_norm": 0.2537810206413269, "learning_rate": 9.551268001634105e-06, "loss": 0.652, "step": 3576 }, { "epoch": 1.6181859307848903, "grad_norm": 0.2568269968032837, "learning_rate": 9.550964556727111e-06, "loss": 0.5954, "step": 3577 }, { "epoch": 1.618638317122823, "grad_norm": 0.3007785677909851, "learning_rate": 9.550661014079476e-06, "loss": 0.7277, "step": 3578 }, { "epoch": 1.6190907034607553, "grad_norm": 0.29014214873313904, "learning_rate": 9.55035737369772e-06, "loss": 0.637, "step": 3579 }, { "epoch": 1.6195430897986882, "grad_norm": 0.23773737251758575, "learning_rate": 9.550053635588363e-06, "loss": 0.4771, "step": 3580 }, { "epoch": 1.6199954761366206, "grad_norm": 0.29563868045806885, "learning_rate": 9.549749799757928e-06, "loss": 0.614, "step": 3581 }, { "epoch": 1.6204478624745533, "grad_norm": 0.26656338572502136, "learning_rate": 9.549445866212943e-06, "loss": 0.671, "step": 3582 }, { "epoch": 1.620900248812486, "grad_norm": 0.25250133872032166, "learning_rate": 9.549141834959935e-06, "loss": 0.5351, "step": 3583 }, { "epoch": 1.6213526351504184, "grad_norm": 0.25555142760276794, "learning_rate": 9.54883770600543e-06, "loss": 0.463, "step": 3584 }, { "epoch": 1.621805021488351, "grad_norm": 0.24526046216487885, "learning_rate": 9.548533479355964e-06, "loss": 0.5216, "step": 3585 }, { "epoch": 1.6222574078262837, "grad_norm": 0.2739415168762207, "learning_rate": 9.548229155018067e-06, "loss": 0.5277, "step": 3586 }, { "epoch": 1.6227097941642161, "grad_norm": 0.2486017346382141, "learning_rate": 9.54792473299828e-06, "loss": 0.4572, "step": 3587 }, { "epoch": 1.623162180502149, "grad_norm": 0.30602988600730896, "learning_rate": 9.547620213303136e-06, "loss": 0.7148, "step": 3588 }, { "epoch": 1.6236145668400814, "grad_norm": 0.3054349720478058, "learning_rate": 9.54731559593918e-06, "loss": 0.5493, "step": 3589 }, { "epoch": 1.6240669531780139, "grad_norm": 0.263090580701828, "learning_rate": 9.54701088091295e-06, "loss": 0.5235, "step": 3590 }, { "epoch": 1.6245193395159467, "grad_norm": 0.29457423090934753, "learning_rate": 9.54670606823099e-06, "loss": 0.5726, "step": 3591 }, { "epoch": 1.6249717258538792, "grad_norm": 0.2819312512874603, "learning_rate": 9.546401157899852e-06, "loss": 0.5858, "step": 3592 }, { "epoch": 1.6254241121918118, "grad_norm": 0.287306547164917, "learning_rate": 9.54609614992608e-06, "loss": 0.498, "step": 3593 }, { "epoch": 1.6258764985297445, "grad_norm": 0.27810221910476685, "learning_rate": 9.545791044316223e-06, "loss": 0.5535, "step": 3594 }, { "epoch": 1.626328884867677, "grad_norm": 0.2859022617340088, "learning_rate": 9.54548584107684e-06, "loss": 0.5014, "step": 3595 }, { "epoch": 1.6267812712056096, "grad_norm": 0.3190658986568451, "learning_rate": 9.54518054021448e-06, "loss": 0.6299, "step": 3596 }, { "epoch": 1.6272336575435422, "grad_norm": 0.3335822820663452, "learning_rate": 9.544875141735701e-06, "loss": 0.6343, "step": 3597 }, { "epoch": 1.6276860438814746, "grad_norm": 0.30289965867996216, "learning_rate": 9.544569645647065e-06, "loss": 0.5881, "step": 3598 }, { "epoch": 1.6281384302194075, "grad_norm": 0.2997182309627533, "learning_rate": 9.54426405195513e-06, "loss": 0.5246, "step": 3599 }, { "epoch": 1.62859081655734, "grad_norm": 0.3816145360469818, "learning_rate": 9.543958360666462e-06, "loss": 0.5591, "step": 3600 }, { "epoch": 1.62859081655734, "eval_loss": 0.6058939695358276, "eval_runtime": 27.1496, "eval_samples_per_second": 27.404, "eval_steps_per_second": 6.851, "step": 3600 }, { "epoch": 1.6290432028952726, "grad_norm": 0.3303169906139374, "learning_rate": 9.543652571787623e-06, "loss": 0.5994, "step": 3601 }, { "epoch": 1.6294955892332053, "grad_norm": 0.3597542941570282, "learning_rate": 9.54334668532518e-06, "loss": 0.6216, "step": 3602 }, { "epoch": 1.6299479755711377, "grad_norm": 0.35164013504981995, "learning_rate": 9.543040701285709e-06, "loss": 0.5459, "step": 3603 }, { "epoch": 1.6304003619090703, "grad_norm": 0.3457384407520294, "learning_rate": 9.542734619675776e-06, "loss": 0.5436, "step": 3604 }, { "epoch": 1.630852748247003, "grad_norm": 0.32812467217445374, "learning_rate": 9.542428440501954e-06, "loss": 0.4126, "step": 3605 }, { "epoch": 1.6313051345849354, "grad_norm": 0.34545600414276123, "learning_rate": 9.542122163770822e-06, "loss": 0.5051, "step": 3606 }, { "epoch": 1.631757520922868, "grad_norm": 0.36241641640663147, "learning_rate": 9.541815789488956e-06, "loss": 0.5345, "step": 3607 }, { "epoch": 1.6322099072608007, "grad_norm": 0.39829787611961365, "learning_rate": 9.541509317662935e-06, "loss": 0.5639, "step": 3608 }, { "epoch": 1.6326622935987332, "grad_norm": 0.41560232639312744, "learning_rate": 9.541202748299344e-06, "loss": 0.5493, "step": 3609 }, { "epoch": 1.633114679936666, "grad_norm": 0.36672478914260864, "learning_rate": 9.540896081404765e-06, "loss": 0.5188, "step": 3610 }, { "epoch": 1.6335670662745985, "grad_norm": 0.33182772994041443, "learning_rate": 9.540589316985787e-06, "loss": 1.1028, "step": 3611 }, { "epoch": 1.6340194526125311, "grad_norm": 0.15008686482906342, "learning_rate": 9.540282455048994e-06, "loss": 0.9891, "step": 3612 }, { "epoch": 1.6344718389504638, "grad_norm": 0.1502361297607422, "learning_rate": 9.53997549560098e-06, "loss": 0.5102, "step": 3613 }, { "epoch": 1.6349242252883962, "grad_norm": 0.2071141004562378, "learning_rate": 9.539668438648335e-06, "loss": 0.5673, "step": 3614 }, { "epoch": 1.6353766116263289, "grad_norm": 0.19793882966041565, "learning_rate": 9.539361284197655e-06, "loss": 0.6097, "step": 3615 }, { "epoch": 1.6358289979642615, "grad_norm": 0.23062771558761597, "learning_rate": 9.539054032255537e-06, "loss": 0.7457, "step": 3616 }, { "epoch": 1.636281384302194, "grad_norm": 0.2542022466659546, "learning_rate": 9.53874668282858e-06, "loss": 0.599, "step": 3617 }, { "epoch": 1.6367337706401268, "grad_norm": 0.3578844964504242, "learning_rate": 9.538439235923384e-06, "loss": 0.6668, "step": 3618 }, { "epoch": 1.6371861569780592, "grad_norm": 0.2468915581703186, "learning_rate": 9.538131691546552e-06, "loss": 0.5571, "step": 3619 }, { "epoch": 1.637638543315992, "grad_norm": 0.23944710195064545, "learning_rate": 9.53782404970469e-06, "loss": 0.5953, "step": 3620 }, { "epoch": 1.6380909296539246, "grad_norm": 0.2044467329978943, "learning_rate": 9.537516310404404e-06, "loss": 0.48, "step": 3621 }, { "epoch": 1.638543315991857, "grad_norm": 0.24785658717155457, "learning_rate": 9.537208473652306e-06, "loss": 0.6037, "step": 3622 }, { "epoch": 1.6389957023297896, "grad_norm": 0.22904862463474274, "learning_rate": 9.536900539455003e-06, "loss": 0.5871, "step": 3623 }, { "epoch": 1.6394480886677223, "grad_norm": 0.24742132425308228, "learning_rate": 9.53659250781911e-06, "loss": 0.5512, "step": 3624 }, { "epoch": 1.6399004750056547, "grad_norm": 0.2378455549478531, "learning_rate": 9.536284378751247e-06, "loss": 0.5776, "step": 3625 }, { "epoch": 1.6403528613435874, "grad_norm": 0.25116807222366333, "learning_rate": 9.535976152258027e-06, "loss": 0.6073, "step": 3626 }, { "epoch": 1.64080524768152, "grad_norm": 0.2741309702396393, "learning_rate": 9.53566782834607e-06, "loss": 0.6283, "step": 3627 }, { "epoch": 1.6412576340194525, "grad_norm": 0.27395859360694885, "learning_rate": 9.535359407021997e-06, "loss": 0.5461, "step": 3628 }, { "epoch": 1.6417100203573853, "grad_norm": 0.2378496378660202, "learning_rate": 9.535050888292435e-06, "loss": 0.5248, "step": 3629 }, { "epoch": 1.6421624066953178, "grad_norm": 0.2733905613422394, "learning_rate": 9.534742272164009e-06, "loss": 0.6304, "step": 3630 }, { "epoch": 1.6426147930332504, "grad_norm": 0.28660306334495544, "learning_rate": 9.534433558643345e-06, "loss": 0.5357, "step": 3631 }, { "epoch": 1.643067179371183, "grad_norm": 0.23155827820301056, "learning_rate": 9.534124747737077e-06, "loss": 0.4521, "step": 3632 }, { "epoch": 1.6435195657091155, "grad_norm": 0.26880601048469543, "learning_rate": 9.533815839451834e-06, "loss": 0.5032, "step": 3633 }, { "epoch": 1.6439719520470482, "grad_norm": 0.2773178517818451, "learning_rate": 9.533506833794253e-06, "loss": 0.5663, "step": 3634 }, { "epoch": 1.6444243383849808, "grad_norm": 0.27016597986221313, "learning_rate": 9.533197730770967e-06, "loss": 0.5568, "step": 3635 }, { "epoch": 1.6448767247229132, "grad_norm": 0.3098212480545044, "learning_rate": 9.532888530388618e-06, "loss": 0.5464, "step": 3636 }, { "epoch": 1.6453291110608461, "grad_norm": 0.2531755268573761, "learning_rate": 9.532579232653845e-06, "loss": 0.5, "step": 3637 }, { "epoch": 1.6457814973987785, "grad_norm": 0.2542855143547058, "learning_rate": 9.532269837573292e-06, "loss": 0.5216, "step": 3638 }, { "epoch": 1.6462338837367112, "grad_norm": 0.26406964659690857, "learning_rate": 9.531960345153602e-06, "loss": 0.4849, "step": 3639 }, { "epoch": 1.6466862700746439, "grad_norm": 0.27333879470825195, "learning_rate": 9.531650755401423e-06, "loss": 0.5377, "step": 3640 }, { "epoch": 1.6471386564125763, "grad_norm": 0.298801988363266, "learning_rate": 9.531341068323402e-06, "loss": 0.6077, "step": 3641 }, { "epoch": 1.647591042750509, "grad_norm": 0.30528199672698975, "learning_rate": 9.531031283926195e-06, "loss": 0.5287, "step": 3642 }, { "epoch": 1.6480434290884416, "grad_norm": 0.3101183772087097, "learning_rate": 9.530721402216452e-06, "loss": 0.6139, "step": 3643 }, { "epoch": 1.648495815426374, "grad_norm": 0.3785577416419983, "learning_rate": 9.530411423200826e-06, "loss": 0.6112, "step": 3644 }, { "epoch": 1.6489482017643067, "grad_norm": 0.3396469056606293, "learning_rate": 9.53010134688598e-06, "loss": 0.6612, "step": 3645 }, { "epoch": 1.6494005881022393, "grad_norm": 0.30324113368988037, "learning_rate": 9.52979117327857e-06, "loss": 0.4655, "step": 3646 }, { "epoch": 1.6498529744401718, "grad_norm": 0.3661966621875763, "learning_rate": 9.529480902385257e-06, "loss": 0.6253, "step": 3647 }, { "epoch": 1.6503053607781046, "grad_norm": 0.41279804706573486, "learning_rate": 9.529170534212705e-06, "loss": 0.67, "step": 3648 }, { "epoch": 1.650757747116037, "grad_norm": 0.3468129634857178, "learning_rate": 9.528860068767583e-06, "loss": 0.4808, "step": 3649 }, { "epoch": 1.6512101334539697, "grad_norm": 0.3347419798374176, "learning_rate": 9.528549506056554e-06, "loss": 0.538, "step": 3650 }, { "epoch": 1.6516625197919024, "grad_norm": 0.40694522857666016, "learning_rate": 9.52823884608629e-06, "loss": 0.6613, "step": 3651 }, { "epoch": 1.6521149061298348, "grad_norm": 0.343923956155777, "learning_rate": 9.527928088863465e-06, "loss": 0.5862, "step": 3652 }, { "epoch": 1.6525672924677675, "grad_norm": 0.30406972765922546, "learning_rate": 9.52761723439475e-06, "loss": 0.4771, "step": 3653 }, { "epoch": 1.6530196788057, "grad_norm": 0.3562451899051666, "learning_rate": 9.527306282686822e-06, "loss": 0.611, "step": 3654 }, { "epoch": 1.6534720651436325, "grad_norm": 0.3407900631427765, "learning_rate": 9.526995233746362e-06, "loss": 0.4696, "step": 3655 }, { "epoch": 1.6539244514815654, "grad_norm": 0.37303000688552856, "learning_rate": 9.526684087580045e-06, "loss": 0.5385, "step": 3656 }, { "epoch": 1.6543768378194978, "grad_norm": 0.4100414216518402, "learning_rate": 9.526372844194558e-06, "loss": 0.6292, "step": 3657 }, { "epoch": 1.6548292241574305, "grad_norm": 0.3561933934688568, "learning_rate": 9.526061503596585e-06, "loss": 0.4822, "step": 3658 }, { "epoch": 1.6552816104953632, "grad_norm": 0.38393428921699524, "learning_rate": 9.525750065792811e-06, "loss": 0.4856, "step": 3659 }, { "epoch": 1.6557339968332956, "grad_norm": 0.3860374093055725, "learning_rate": 9.525438530789926e-06, "loss": 0.4613, "step": 3660 }, { "epoch": 1.6561863831712282, "grad_norm": 0.3358103632926941, "learning_rate": 9.525126898594618e-06, "loss": 1.171, "step": 3661 }, { "epoch": 1.656638769509161, "grad_norm": 0.15460741519927979, "learning_rate": 9.524815169213586e-06, "loss": 0.6046, "step": 3662 }, { "epoch": 1.6570911558470933, "grad_norm": 0.17096015810966492, "learning_rate": 9.52450334265352e-06, "loss": 0.6301, "step": 3663 }, { "epoch": 1.657543542185026, "grad_norm": 0.20797787606716156, "learning_rate": 9.524191418921116e-06, "loss": 0.7281, "step": 3664 }, { "epoch": 1.6579959285229586, "grad_norm": 0.20084497332572937, "learning_rate": 9.523879398023077e-06, "loss": 0.5906, "step": 3665 }, { "epoch": 1.658448314860891, "grad_norm": 0.2588532269001007, "learning_rate": 9.523567279966101e-06, "loss": 0.5902, "step": 3666 }, { "epoch": 1.658900701198824, "grad_norm": 0.2157156616449356, "learning_rate": 9.523255064756896e-06, "loss": 0.6231, "step": 3667 }, { "epoch": 1.6593530875367564, "grad_norm": 0.23482519388198853, "learning_rate": 9.522942752402162e-06, "loss": 0.648, "step": 3668 }, { "epoch": 1.659805473874689, "grad_norm": 0.23232638835906982, "learning_rate": 9.52263034290861e-06, "loss": 0.5949, "step": 3669 }, { "epoch": 1.6602578602126217, "grad_norm": 0.2111445516347885, "learning_rate": 9.522317836282949e-06, "loss": 0.4622, "step": 3670 }, { "epoch": 1.660710246550554, "grad_norm": 0.2339666783809662, "learning_rate": 9.52200523253189e-06, "loss": 0.5679, "step": 3671 }, { "epoch": 1.6611626328884868, "grad_norm": 0.2731768786907196, "learning_rate": 9.521692531662146e-06, "loss": 0.6869, "step": 3672 }, { "epoch": 1.6616150192264194, "grad_norm": 0.25639551877975464, "learning_rate": 9.521379733680434e-06, "loss": 0.8344, "step": 3673 }, { "epoch": 1.6620674055643518, "grad_norm": 0.22072477638721466, "learning_rate": 9.521066838593471e-06, "loss": 0.5363, "step": 3674 }, { "epoch": 1.6625197919022847, "grad_norm": 0.2672465741634369, "learning_rate": 9.520753846407978e-06, "loss": 0.602, "step": 3675 }, { "epoch": 1.6629721782402171, "grad_norm": 0.2534881830215454, "learning_rate": 9.520440757130678e-06, "loss": 0.61, "step": 3676 }, { "epoch": 1.6634245645781496, "grad_norm": 0.27033117413520813, "learning_rate": 9.520127570768294e-06, "loss": 0.5592, "step": 3677 }, { "epoch": 1.6638769509160825, "grad_norm": 0.2457767277956009, "learning_rate": 9.519814287327552e-06, "loss": 0.5296, "step": 3678 }, { "epoch": 1.6643293372540149, "grad_norm": 0.27373573184013367, "learning_rate": 9.519500906815178e-06, "loss": 0.6866, "step": 3679 }, { "epoch": 1.6647817235919475, "grad_norm": 0.2801673114299774, "learning_rate": 9.519187429237908e-06, "loss": 0.6296, "step": 3680 }, { "epoch": 1.6652341099298802, "grad_norm": 0.3035813868045807, "learning_rate": 9.51887385460247e-06, "loss": 0.6671, "step": 3681 }, { "epoch": 1.6656864962678126, "grad_norm": 0.254302442073822, "learning_rate": 9.518560182915602e-06, "loss": 0.5122, "step": 3682 }, { "epoch": 1.6661388826057453, "grad_norm": 0.31309351325035095, "learning_rate": 9.518246414184038e-06, "loss": 0.6329, "step": 3683 }, { "epoch": 1.666591268943678, "grad_norm": 0.27768775820732117, "learning_rate": 9.51793254841452e-06, "loss": 0.5772, "step": 3684 }, { "epoch": 1.6670436552816104, "grad_norm": 0.2779090106487274, "learning_rate": 9.517618585613783e-06, "loss": 0.5797, "step": 3685 }, { "epoch": 1.6674960416195432, "grad_norm": 0.28067880868911743, "learning_rate": 9.517304525788576e-06, "loss": 0.5473, "step": 3686 }, { "epoch": 1.6679484279574757, "grad_norm": 0.26962709426879883, "learning_rate": 9.51699036894564e-06, "loss": 0.5471, "step": 3687 }, { "epoch": 1.6684008142954083, "grad_norm": 0.29997316002845764, "learning_rate": 9.516676115091722e-06, "loss": 0.6571, "step": 3688 }, { "epoch": 1.668853200633341, "grad_norm": 0.2799164950847626, "learning_rate": 9.516361764233577e-06, "loss": 0.5561, "step": 3689 }, { "epoch": 1.6693055869712734, "grad_norm": 0.3183387219905853, "learning_rate": 9.51604731637795e-06, "loss": 0.5999, "step": 3690 }, { "epoch": 1.669757973309206, "grad_norm": 0.34715595841407776, "learning_rate": 9.515732771531597e-06, "loss": 0.5187, "step": 3691 }, { "epoch": 1.6702103596471387, "grad_norm": 0.28871941566467285, "learning_rate": 9.515418129701273e-06, "loss": 0.5885, "step": 3692 }, { "epoch": 1.6706627459850711, "grad_norm": 0.27207672595977783, "learning_rate": 9.515103390893735e-06, "loss": 0.5325, "step": 3693 }, { "epoch": 1.6711151323230038, "grad_norm": 0.28442102670669556, "learning_rate": 9.514788555115742e-06, "loss": 0.5213, "step": 3694 }, { "epoch": 1.6715675186609364, "grad_norm": 0.3501071035861969, "learning_rate": 9.514473622374058e-06, "loss": 0.6109, "step": 3695 }, { "epoch": 1.6720199049988689, "grad_norm": 0.2864854037761688, "learning_rate": 9.514158592675446e-06, "loss": 0.4419, "step": 3696 }, { "epoch": 1.6724722913368018, "grad_norm": 0.36355534195899963, "learning_rate": 9.513843466026672e-06, "loss": 0.7294, "step": 3697 }, { "epoch": 1.6729246776747342, "grad_norm": 0.36227327585220337, "learning_rate": 9.513528242434502e-06, "loss": 0.6843, "step": 3698 }, { "epoch": 1.6733770640126668, "grad_norm": 0.295256108045578, "learning_rate": 9.513212921905708e-06, "loss": 0.4579, "step": 3699 }, { "epoch": 1.6738294503505995, "grad_norm": 0.3258838951587677, "learning_rate": 9.512897504447063e-06, "loss": 0.5258, "step": 3700 }, { "epoch": 1.674281836688532, "grad_norm": 0.37976840138435364, "learning_rate": 9.512581990065337e-06, "loss": 0.565, "step": 3701 }, { "epoch": 1.6747342230264646, "grad_norm": 0.33488330245018005, "learning_rate": 9.512266378767312e-06, "loss": 0.5048, "step": 3702 }, { "epoch": 1.6751866093643972, "grad_norm": 0.3454996645450592, "learning_rate": 9.511950670559761e-06, "loss": 0.577, "step": 3703 }, { "epoch": 1.6756389957023297, "grad_norm": 0.3374139964580536, "learning_rate": 9.511634865449468e-06, "loss": 0.6018, "step": 3704 }, { "epoch": 1.6760913820402625, "grad_norm": 0.3210242688655853, "learning_rate": 9.511318963443214e-06, "loss": 0.4889, "step": 3705 }, { "epoch": 1.676543768378195, "grad_norm": 0.32872945070266724, "learning_rate": 9.511002964547784e-06, "loss": 0.4973, "step": 3706 }, { "epoch": 1.6769961547161276, "grad_norm": 0.3472447991371155, "learning_rate": 9.510686868769964e-06, "loss": 0.4948, "step": 3707 }, { "epoch": 1.6774485410540603, "grad_norm": 0.49270039796829224, "learning_rate": 9.510370676116544e-06, "loss": 0.7054, "step": 3708 }, { "epoch": 1.6779009273919927, "grad_norm": 0.36635589599609375, "learning_rate": 9.510054386594314e-06, "loss": 0.4787, "step": 3709 }, { "epoch": 1.6783533137299254, "grad_norm": 0.47675949335098267, "learning_rate": 9.509738000210066e-06, "loss": 0.5716, "step": 3710 }, { "epoch": 1.678805700067858, "grad_norm": 0.34525930881500244, "learning_rate": 9.509421516970598e-06, "loss": 0.8963, "step": 3711 }, { "epoch": 1.6792580864057904, "grad_norm": 0.18458569049835205, "learning_rate": 9.509104936882705e-06, "loss": 1.2384, "step": 3712 }, { "epoch": 1.679710472743723, "grad_norm": 0.21398761868476868, "learning_rate": 9.508788259953185e-06, "loss": 0.6297, "step": 3713 }, { "epoch": 1.6801628590816557, "grad_norm": 0.19231107831001282, "learning_rate": 9.508471486188842e-06, "loss": 0.6401, "step": 3714 }, { "epoch": 1.6806152454195882, "grad_norm": 0.19369958341121674, "learning_rate": 9.508154615596477e-06, "loss": 0.6524, "step": 3715 }, { "epoch": 1.681067631757521, "grad_norm": 0.2821360230445862, "learning_rate": 9.507837648182896e-06, "loss": 0.6358, "step": 3716 }, { "epoch": 1.6815200180954535, "grad_norm": 0.2209686040878296, "learning_rate": 9.507520583954908e-06, "loss": 0.7062, "step": 3717 }, { "epoch": 1.6819724044333861, "grad_norm": 0.2191402018070221, "learning_rate": 9.507203422919321e-06, "loss": 0.5521, "step": 3718 }, { "epoch": 1.6824247907713188, "grad_norm": 0.21598361432552338, "learning_rate": 9.506886165082947e-06, "loss": 0.5786, "step": 3719 }, { "epoch": 1.6828771771092512, "grad_norm": 0.2411392480134964, "learning_rate": 9.5065688104526e-06, "loss": 0.7759, "step": 3720 }, { "epoch": 1.6833295634471839, "grad_norm": 0.22247235476970673, "learning_rate": 9.506251359035094e-06, "loss": 0.5941, "step": 3721 }, { "epoch": 1.6837819497851165, "grad_norm": 0.23335392773151398, "learning_rate": 9.505933810837251e-06, "loss": 0.5774, "step": 3722 }, { "epoch": 1.684234336123049, "grad_norm": 0.259004145860672, "learning_rate": 9.505616165865886e-06, "loss": 0.6055, "step": 3723 }, { "epoch": 1.6846867224609818, "grad_norm": 0.2366989403963089, "learning_rate": 9.505298424127825e-06, "loss": 0.5862, "step": 3724 }, { "epoch": 1.6851391087989143, "grad_norm": 0.2736048400402069, "learning_rate": 9.504980585629889e-06, "loss": 0.6758, "step": 3725 }, { "epoch": 1.685591495136847, "grad_norm": 0.23951829969882965, "learning_rate": 9.504662650378907e-06, "loss": 0.5565, "step": 3726 }, { "epoch": 1.6860438814747796, "grad_norm": 0.25442904233932495, "learning_rate": 9.504344618381704e-06, "loss": 0.6115, "step": 3727 }, { "epoch": 1.686496267812712, "grad_norm": 0.2914966344833374, "learning_rate": 9.504026489645115e-06, "loss": 0.7194, "step": 3728 }, { "epoch": 1.6869486541506447, "grad_norm": 0.28388655185699463, "learning_rate": 9.503708264175968e-06, "loss": 0.6696, "step": 3729 }, { "epoch": 1.6874010404885773, "grad_norm": 0.27159208059310913, "learning_rate": 9.5033899419811e-06, "loss": 0.5965, "step": 3730 }, { "epoch": 1.6878534268265097, "grad_norm": 0.2971208095550537, "learning_rate": 9.503071523067346e-06, "loss": 0.6495, "step": 3731 }, { "epoch": 1.6883058131644424, "grad_norm": 0.2667960822582245, "learning_rate": 9.502753007441545e-06, "loss": 0.5157, "step": 3732 }, { "epoch": 1.688758199502375, "grad_norm": 0.2462879866361618, "learning_rate": 9.50243439511054e-06, "loss": 0.5056, "step": 3733 }, { "epoch": 1.6892105858403075, "grad_norm": 0.30008888244628906, "learning_rate": 9.50211568608117e-06, "loss": 0.6392, "step": 3734 }, { "epoch": 1.6896629721782404, "grad_norm": 0.2540644109249115, "learning_rate": 9.501796880360284e-06, "loss": 0.4961, "step": 3735 }, { "epoch": 1.6901153585161728, "grad_norm": 0.28766506910324097, "learning_rate": 9.501477977954724e-06, "loss": 0.5541, "step": 3736 }, { "epoch": 1.6905677448541054, "grad_norm": 0.2969766855239868, "learning_rate": 9.501158978871343e-06, "loss": 0.5895, "step": 3737 }, { "epoch": 1.691020131192038, "grad_norm": 0.26816654205322266, "learning_rate": 9.500839883116992e-06, "loss": 0.4685, "step": 3738 }, { "epoch": 1.6914725175299705, "grad_norm": 0.29058748483657837, "learning_rate": 9.500520690698522e-06, "loss": 0.5776, "step": 3739 }, { "epoch": 1.6919249038679032, "grad_norm": 0.2638840079307556, "learning_rate": 9.50020140162279e-06, "loss": 0.4971, "step": 3740 }, { "epoch": 1.6923772902058358, "grad_norm": 0.27084797620773315, "learning_rate": 9.499882015896653e-06, "loss": 0.4756, "step": 3741 }, { "epoch": 1.6928296765437683, "grad_norm": 0.28203558921813965, "learning_rate": 9.499562533526969e-06, "loss": 0.5504, "step": 3742 }, { "epoch": 1.6932820628817011, "grad_norm": 0.28316113352775574, "learning_rate": 9.499242954520602e-06, "loss": 0.516, "step": 3743 }, { "epoch": 1.6937344492196336, "grad_norm": 0.3019495904445648, "learning_rate": 9.498923278884415e-06, "loss": 0.5335, "step": 3744 }, { "epoch": 1.694186835557566, "grad_norm": 0.32448479533195496, "learning_rate": 9.49860350662527e-06, "loss": 0.5234, "step": 3745 }, { "epoch": 1.6946392218954989, "grad_norm": 0.3092876076698303, "learning_rate": 9.49828363775004e-06, "loss": 0.5904, "step": 3746 }, { "epoch": 1.6950916082334313, "grad_norm": 0.30480724573135376, "learning_rate": 9.497963672265591e-06, "loss": 0.474, "step": 3747 }, { "epoch": 1.695543994571364, "grad_norm": 0.34083712100982666, "learning_rate": 9.497643610178798e-06, "loss": 0.6041, "step": 3748 }, { "epoch": 1.6959963809092966, "grad_norm": 0.37194007635116577, "learning_rate": 9.497323451496533e-06, "loss": 0.6619, "step": 3749 }, { "epoch": 1.696448767247229, "grad_norm": 0.35163015127182007, "learning_rate": 9.497003196225669e-06, "loss": 0.5563, "step": 3750 }, { "epoch": 1.6969011535851617, "grad_norm": 0.3720778226852417, "learning_rate": 9.49668284437309e-06, "loss": 0.5659, "step": 3751 }, { "epoch": 1.6973535399230943, "grad_norm": 0.34316113591194153, "learning_rate": 9.496362395945674e-06, "loss": 0.5344, "step": 3752 }, { "epoch": 1.6978059262610268, "grad_norm": 0.3473109006881714, "learning_rate": 9.496041850950301e-06, "loss": 0.5433, "step": 3753 }, { "epoch": 1.6982583125989597, "grad_norm": 0.34467658400535583, "learning_rate": 9.495721209393858e-06, "loss": 0.4769, "step": 3754 }, { "epoch": 1.698710698936892, "grad_norm": 0.34480616450309753, "learning_rate": 9.49540047128323e-06, "loss": 0.5545, "step": 3755 }, { "epoch": 1.6991630852748247, "grad_norm": 0.395867258310318, "learning_rate": 9.495079636625307e-06, "loss": 0.5483, "step": 3756 }, { "epoch": 1.6996154716127574, "grad_norm": 0.40114057064056396, "learning_rate": 9.494758705426978e-06, "loss": 0.5569, "step": 3757 }, { "epoch": 1.7000678579506898, "grad_norm": 0.34444862604141235, "learning_rate": 9.494437677695135e-06, "loss": 0.5394, "step": 3758 }, { "epoch": 1.7005202442886225, "grad_norm": 0.42817041277885437, "learning_rate": 9.494116553436674e-06, "loss": 0.5871, "step": 3759 }, { "epoch": 1.7009726306265551, "grad_norm": 0.47920238971710205, "learning_rate": 9.493795332658494e-06, "loss": 0.5483, "step": 3760 }, { "epoch": 1.7014250169644876, "grad_norm": 0.3687632381916046, "learning_rate": 9.493474015367487e-06, "loss": 1.0476, "step": 3761 }, { "epoch": 1.7018774033024204, "grad_norm": 0.15819332003593445, "learning_rate": 9.493152601570561e-06, "loss": 1.1616, "step": 3762 }, { "epoch": 1.7023297896403529, "grad_norm": 0.18116655945777893, "learning_rate": 9.492831091274615e-06, "loss": 0.7028, "step": 3763 }, { "epoch": 1.7027821759782853, "grad_norm": 0.1925601363182068, "learning_rate": 9.492509484486556e-06, "loss": 0.632, "step": 3764 }, { "epoch": 1.7032345623162182, "grad_norm": 0.21604616940021515, "learning_rate": 9.49218778121329e-06, "loss": 0.6308, "step": 3765 }, { "epoch": 1.7036869486541506, "grad_norm": 0.24985675513744354, "learning_rate": 9.491865981461727e-06, "loss": 0.6965, "step": 3766 }, { "epoch": 1.7041393349920833, "grad_norm": 0.21168553829193115, "learning_rate": 9.491544085238778e-06, "loss": 0.636, "step": 3767 }, { "epoch": 1.704591721330016, "grad_norm": 0.22135114669799805, "learning_rate": 9.491222092551355e-06, "loss": 0.5654, "step": 3768 }, { "epoch": 1.7050441076679483, "grad_norm": 0.21862253546714783, "learning_rate": 9.490900003406376e-06, "loss": 0.6026, "step": 3769 }, { "epoch": 1.705496494005881, "grad_norm": 0.24599289894104004, "learning_rate": 9.490577817810755e-06, "loss": 0.5822, "step": 3770 }, { "epoch": 1.7059488803438136, "grad_norm": 0.243042454123497, "learning_rate": 9.490255535771416e-06, "loss": 0.7222, "step": 3771 }, { "epoch": 1.706401266681746, "grad_norm": 0.24461546540260315, "learning_rate": 9.489933157295278e-06, "loss": 0.6763, "step": 3772 }, { "epoch": 1.706853653019679, "grad_norm": 0.24784651398658752, "learning_rate": 9.489610682389262e-06, "loss": 0.5521, "step": 3773 }, { "epoch": 1.7073060393576114, "grad_norm": 0.3155197501182556, "learning_rate": 9.489288111060298e-06, "loss": 0.6374, "step": 3774 }, { "epoch": 1.707758425695544, "grad_norm": 0.24913060665130615, "learning_rate": 9.488965443315314e-06, "loss": 0.57, "step": 3775 }, { "epoch": 1.7082108120334767, "grad_norm": 0.26560863852500916, "learning_rate": 9.488642679161237e-06, "loss": 0.6065, "step": 3776 }, { "epoch": 1.7086631983714091, "grad_norm": 0.28908881545066833, "learning_rate": 9.488319818605001e-06, "loss": 0.601, "step": 3777 }, { "epoch": 1.7091155847093418, "grad_norm": 0.292453408241272, "learning_rate": 9.48799686165354e-06, "loss": 0.7082, "step": 3778 }, { "epoch": 1.7095679710472744, "grad_norm": 0.26724424958229065, "learning_rate": 9.487673808313787e-06, "loss": 0.5183, "step": 3779 }, { "epoch": 1.7100203573852069, "grad_norm": 0.28528332710266113, "learning_rate": 9.487350658592684e-06, "loss": 0.4621, "step": 3780 }, { "epoch": 1.7104727437231395, "grad_norm": 0.25301480293273926, "learning_rate": 9.487027412497171e-06, "loss": 0.5085, "step": 3781 }, { "epoch": 1.7109251300610722, "grad_norm": 0.2704107165336609, "learning_rate": 9.486704070034188e-06, "loss": 0.5273, "step": 3782 }, { "epoch": 1.7113775163990046, "grad_norm": 0.2716200053691864, "learning_rate": 9.486380631210682e-06, "loss": 0.6192, "step": 3783 }, { "epoch": 1.7118299027369375, "grad_norm": 0.2811250686645508, "learning_rate": 9.486057096033597e-06, "loss": 0.6189, "step": 3784 }, { "epoch": 1.71228228907487, "grad_norm": 0.26805293560028076, "learning_rate": 9.485733464509883e-06, "loss": 0.4627, "step": 3785 }, { "epoch": 1.7127346754128026, "grad_norm": 0.2728281617164612, "learning_rate": 9.48540973664649e-06, "loss": 0.5449, "step": 3786 }, { "epoch": 1.7131870617507352, "grad_norm": 0.26550835371017456, "learning_rate": 9.485085912450371e-06, "loss": 0.6778, "step": 3787 }, { "epoch": 1.7136394480886676, "grad_norm": 0.2776852250099182, "learning_rate": 9.484761991928481e-06, "loss": 0.5806, "step": 3788 }, { "epoch": 1.7140918344266003, "grad_norm": 0.2725174129009247, "learning_rate": 9.484437975087778e-06, "loss": 0.4934, "step": 3789 }, { "epoch": 1.714544220764533, "grad_norm": 0.2889711260795593, "learning_rate": 9.484113861935218e-06, "loss": 0.6141, "step": 3790 }, { "epoch": 1.7149966071024654, "grad_norm": 0.27682915329933167, "learning_rate": 9.483789652477764e-06, "loss": 0.4888, "step": 3791 }, { "epoch": 1.7154489934403983, "grad_norm": 0.3200529217720032, "learning_rate": 9.483465346722377e-06, "loss": 0.6922, "step": 3792 }, { "epoch": 1.7159013797783307, "grad_norm": 0.31341707706451416, "learning_rate": 9.483140944676024e-06, "loss": 0.6327, "step": 3793 }, { "epoch": 1.7163537661162633, "grad_norm": 0.29655763506889343, "learning_rate": 9.482816446345672e-06, "loss": 0.5698, "step": 3794 }, { "epoch": 1.716806152454196, "grad_norm": 0.2817500829696655, "learning_rate": 9.48249185173829e-06, "loss": 0.5521, "step": 3795 }, { "epoch": 1.7172585387921284, "grad_norm": 0.3385741412639618, "learning_rate": 9.482167160860849e-06, "loss": 0.6433, "step": 3796 }, { "epoch": 1.717710925130061, "grad_norm": 0.3256336450576782, "learning_rate": 9.481842373720323e-06, "loss": 0.5299, "step": 3797 }, { "epoch": 1.7181633114679937, "grad_norm": 0.2909587025642395, "learning_rate": 9.481517490323684e-06, "loss": 0.5402, "step": 3798 }, { "epoch": 1.7186156978059262, "grad_norm": 0.31956425309181213, "learning_rate": 9.481192510677915e-06, "loss": 0.5486, "step": 3799 }, { "epoch": 1.7190680841438588, "grad_norm": 0.30612871050834656, "learning_rate": 9.480867434789993e-06, "loss": 0.4969, "step": 3800 }, { "epoch": 1.7190680841438588, "eval_loss": 0.6016281247138977, "eval_runtime": 25.7893, "eval_samples_per_second": 28.849, "eval_steps_per_second": 7.212, "step": 3800 }, { "epoch": 1.7195204704817915, "grad_norm": 0.3501591384410858, "learning_rate": 9.480542262666899e-06, "loss": 0.5715, "step": 3801 }, { "epoch": 1.719972856819724, "grad_norm": 0.3575507700443268, "learning_rate": 9.480216994315617e-06, "loss": 0.5067, "step": 3802 }, { "epoch": 1.7204252431576568, "grad_norm": 0.3290901184082031, "learning_rate": 9.479891629743134e-06, "loss": 0.5859, "step": 3803 }, { "epoch": 1.7208776294955892, "grad_norm": 0.3990701735019684, "learning_rate": 9.479566168956436e-06, "loss": 0.6423, "step": 3804 }, { "epoch": 1.7213300158335219, "grad_norm": 0.35720473527908325, "learning_rate": 9.479240611962512e-06, "loss": 0.5155, "step": 3805 }, { "epoch": 1.7217824021714545, "grad_norm": 0.41653332114219666, "learning_rate": 9.478914958768358e-06, "loss": 0.6689, "step": 3806 }, { "epoch": 1.722234788509387, "grad_norm": 0.4287664294242859, "learning_rate": 9.478589209380967e-06, "loss": 0.7224, "step": 3807 }, { "epoch": 1.7226871748473196, "grad_norm": 0.3673994243144989, "learning_rate": 9.478263363807331e-06, "loss": 0.5309, "step": 3808 }, { "epoch": 1.7231395611852522, "grad_norm": 0.3634587228298187, "learning_rate": 9.477937422054451e-06, "loss": 0.5209, "step": 3809 }, { "epoch": 1.7235919475231847, "grad_norm": 0.47969287633895874, "learning_rate": 9.477611384129329e-06, "loss": 0.6187, "step": 3810 }, { "epoch": 1.7240443338611176, "grad_norm": 0.3269537091255188, "learning_rate": 9.477285250038963e-06, "loss": 1.2731, "step": 3811 }, { "epoch": 1.72449672019905, "grad_norm": 0.15962186455726624, "learning_rate": 9.476959019790361e-06, "loss": 0.6459, "step": 3812 }, { "epoch": 1.7249491065369826, "grad_norm": 0.20628581941127777, "learning_rate": 9.476632693390531e-06, "loss": 0.6342, "step": 3813 }, { "epoch": 1.7254014928749153, "grad_norm": 0.24202758073806763, "learning_rate": 9.476306270846477e-06, "loss": 0.7583, "step": 3814 }, { "epoch": 1.7258538792128477, "grad_norm": 0.21016138792037964, "learning_rate": 9.47597975216521e-06, "loss": 0.5934, "step": 3815 }, { "epoch": 1.7263062655507804, "grad_norm": 0.25920018553733826, "learning_rate": 9.475653137353742e-06, "loss": 0.7206, "step": 3816 }, { "epoch": 1.726758651888713, "grad_norm": 0.20816510915756226, "learning_rate": 9.475326426419093e-06, "loss": 0.6124, "step": 3817 }, { "epoch": 1.7272110382266455, "grad_norm": 0.21192562580108643, "learning_rate": 9.474999619368274e-06, "loss": 0.5665, "step": 3818 }, { "epoch": 1.727663424564578, "grad_norm": 0.25286442041397095, "learning_rate": 9.474672716208308e-06, "loss": 0.6693, "step": 3819 }, { "epoch": 1.7281158109025108, "grad_norm": 0.24794644117355347, "learning_rate": 9.474345716946213e-06, "loss": 0.6794, "step": 3820 }, { "epoch": 1.7285681972404432, "grad_norm": 0.2561487853527069, "learning_rate": 9.474018621589015e-06, "loss": 0.6062, "step": 3821 }, { "epoch": 1.729020583578376, "grad_norm": 0.2760639786720276, "learning_rate": 9.473691430143735e-06, "loss": 0.6188, "step": 3822 }, { "epoch": 1.7294729699163085, "grad_norm": 0.24453511834144592, "learning_rate": 9.4733641426174e-06, "loss": 0.5285, "step": 3823 }, { "epoch": 1.7299253562542412, "grad_norm": 0.27380135655403137, "learning_rate": 9.473036759017045e-06, "loss": 0.6097, "step": 3824 }, { "epoch": 1.7303777425921738, "grad_norm": 0.2868073582649231, "learning_rate": 9.472709279349694e-06, "loss": 0.6193, "step": 3825 }, { "epoch": 1.7308301289301062, "grad_norm": 0.25567540526390076, "learning_rate": 9.472381703622384e-06, "loss": 0.6025, "step": 3826 }, { "epoch": 1.731282515268039, "grad_norm": 0.24869023263454437, "learning_rate": 9.472054031842152e-06, "loss": 0.5653, "step": 3827 }, { "epoch": 1.7317349016059715, "grad_norm": 0.256209135055542, "learning_rate": 9.471726264016032e-06, "loss": 0.5688, "step": 3828 }, { "epoch": 1.732187287943904, "grad_norm": 0.3013722002506256, "learning_rate": 9.471398400151065e-06, "loss": 0.6499, "step": 3829 }, { "epoch": 1.7326396742818369, "grad_norm": 0.2649279236793518, "learning_rate": 9.471070440254291e-06, "loss": 0.5375, "step": 3830 }, { "epoch": 1.7330920606197693, "grad_norm": 0.2605828046798706, "learning_rate": 9.470742384332754e-06, "loss": 0.5797, "step": 3831 }, { "epoch": 1.7335444469577017, "grad_norm": 0.2954097390174866, "learning_rate": 9.470414232393503e-06, "loss": 0.6041, "step": 3832 }, { "epoch": 1.7339968332956346, "grad_norm": 0.30910634994506836, "learning_rate": 9.47008598444358e-06, "loss": 0.7373, "step": 3833 }, { "epoch": 1.734449219633567, "grad_norm": 0.3055354356765747, "learning_rate": 9.46975764049004e-06, "loss": 0.6468, "step": 3834 }, { "epoch": 1.7349016059714997, "grad_norm": 0.2785838842391968, "learning_rate": 9.469429200539933e-06, "loss": 0.5457, "step": 3835 }, { "epoch": 1.7353539923094323, "grad_norm": 0.2674546539783478, "learning_rate": 9.46910066460031e-06, "loss": 0.456, "step": 3836 }, { "epoch": 1.7358063786473648, "grad_norm": 0.2942405045032501, "learning_rate": 9.468772032678231e-06, "loss": 0.5321, "step": 3837 }, { "epoch": 1.7362587649852974, "grad_norm": 0.30618226528167725, "learning_rate": 9.468443304780752e-06, "loss": 0.5778, "step": 3838 }, { "epoch": 1.73671115132323, "grad_norm": 0.30100002884864807, "learning_rate": 9.468114480914935e-06, "loss": 0.5968, "step": 3839 }, { "epoch": 1.7371635376611625, "grad_norm": 0.26312944293022156, "learning_rate": 9.467785561087838e-06, "loss": 0.4629, "step": 3840 }, { "epoch": 1.7376159239990954, "grad_norm": 0.27138108015060425, "learning_rate": 9.467456545306528e-06, "loss": 0.475, "step": 3841 }, { "epoch": 1.7380683103370278, "grad_norm": 0.30101191997528076, "learning_rate": 9.467127433578073e-06, "loss": 0.5444, "step": 3842 }, { "epoch": 1.7385206966749605, "grad_norm": 0.31372594833374023, "learning_rate": 9.466798225909537e-06, "loss": 0.6186, "step": 3843 }, { "epoch": 1.738973083012893, "grad_norm": 0.33602669835090637, "learning_rate": 9.466468922307994e-06, "loss": 0.6913, "step": 3844 }, { "epoch": 1.7394254693508255, "grad_norm": 0.34362903237342834, "learning_rate": 9.466139522780514e-06, "loss": 0.7066, "step": 3845 }, { "epoch": 1.7398778556887582, "grad_norm": 0.3254224359989166, "learning_rate": 9.465810027334174e-06, "loss": 0.5548, "step": 3846 }, { "epoch": 1.7403302420266908, "grad_norm": 0.3244694173336029, "learning_rate": 9.465480435976048e-06, "loss": 0.6595, "step": 3847 }, { "epoch": 1.7407826283646233, "grad_norm": 0.33571141958236694, "learning_rate": 9.465150748713217e-06, "loss": 0.55, "step": 3848 }, { "epoch": 1.741235014702556, "grad_norm": 0.3131696581840515, "learning_rate": 9.46482096555276e-06, "loss": 0.5555, "step": 3849 }, { "epoch": 1.7416874010404886, "grad_norm": 0.30565786361694336, "learning_rate": 9.464491086501758e-06, "loss": 0.5051, "step": 3850 }, { "epoch": 1.742139787378421, "grad_norm": 0.3174073398113251, "learning_rate": 9.464161111567302e-06, "loss": 0.5158, "step": 3851 }, { "epoch": 1.7425921737163539, "grad_norm": 0.3262297511100769, "learning_rate": 9.463831040756471e-06, "loss": 0.5193, "step": 3852 }, { "epoch": 1.7430445600542863, "grad_norm": 0.418008029460907, "learning_rate": 9.46350087407636e-06, "loss": 0.6442, "step": 3853 }, { "epoch": 1.743496946392219, "grad_norm": 0.3977830111980438, "learning_rate": 9.463170611534056e-06, "loss": 0.619, "step": 3854 }, { "epoch": 1.7439493327301516, "grad_norm": 0.33065715432167053, "learning_rate": 9.462840253136653e-06, "loss": 0.4727, "step": 3855 }, { "epoch": 1.744401719068084, "grad_norm": 0.32187482714653015, "learning_rate": 9.462509798891248e-06, "loss": 0.443, "step": 3856 }, { "epoch": 1.7448541054060167, "grad_norm": 0.410230427980423, "learning_rate": 9.462179248804937e-06, "loss": 0.5524, "step": 3857 }, { "epoch": 1.7453064917439494, "grad_norm": 0.372505247592926, "learning_rate": 9.461848602884817e-06, "loss": 0.5569, "step": 3858 }, { "epoch": 1.7457588780818818, "grad_norm": 0.3815667927265167, "learning_rate": 9.461517861137995e-06, "loss": 0.497, "step": 3859 }, { "epoch": 1.7462112644198147, "grad_norm": 0.5098915696144104, "learning_rate": 9.461187023571568e-06, "loss": 0.6497, "step": 3860 }, { "epoch": 1.746663650757747, "grad_norm": 0.4026460647583008, "learning_rate": 9.460856090192643e-06, "loss": 1.1026, "step": 3861 }, { "epoch": 1.7471160370956798, "grad_norm": 0.19065889716148376, "learning_rate": 9.46052506100833e-06, "loss": 0.9932, "step": 3862 }, { "epoch": 1.7475684234336124, "grad_norm": 0.17454662919044495, "learning_rate": 9.460193936025736e-06, "loss": 0.557, "step": 3863 }, { "epoch": 1.7480208097715448, "grad_norm": 0.1881958693265915, "learning_rate": 9.459862715251973e-06, "loss": 0.5425, "step": 3864 }, { "epoch": 1.7484731961094775, "grad_norm": 0.20409247279167175, "learning_rate": 9.459531398694156e-06, "loss": 0.5201, "step": 3865 }, { "epoch": 1.7489255824474101, "grad_norm": 0.2333524078130722, "learning_rate": 9.4591999863594e-06, "loss": 0.5459, "step": 3866 }, { "epoch": 1.7493779687853426, "grad_norm": 0.22101524472236633, "learning_rate": 9.45886847825482e-06, "loss": 0.5662, "step": 3867 }, { "epoch": 1.7498303551232752, "grad_norm": 0.2579314708709717, "learning_rate": 9.45853687438754e-06, "loss": 0.6235, "step": 3868 }, { "epoch": 1.7502827414612079, "grad_norm": 0.2436724752187729, "learning_rate": 9.45820517476468e-06, "loss": 0.6705, "step": 3869 }, { "epoch": 1.7507351277991403, "grad_norm": 0.28670310974121094, "learning_rate": 9.457873379393363e-06, "loss": 0.7123, "step": 3870 }, { "epoch": 1.7511875141370732, "grad_norm": 0.23572763800621033, "learning_rate": 9.457541488280715e-06, "loss": 0.5695, "step": 3871 }, { "epoch": 1.7516399004750056, "grad_norm": 0.22957415878772736, "learning_rate": 9.457209501433867e-06, "loss": 0.5493, "step": 3872 }, { "epoch": 1.7520922868129383, "grad_norm": 0.21469932794570923, "learning_rate": 9.456877418859945e-06, "loss": 0.5372, "step": 3873 }, { "epoch": 1.752544673150871, "grad_norm": 0.27292895317077637, "learning_rate": 9.456545240566085e-06, "loss": 0.68, "step": 3874 }, { "epoch": 1.7529970594888034, "grad_norm": 0.27543124556541443, "learning_rate": 9.456212966559417e-06, "loss": 0.6296, "step": 3875 }, { "epoch": 1.753449445826736, "grad_norm": 0.25557875633239746, "learning_rate": 9.45588059684708e-06, "loss": 0.5669, "step": 3876 }, { "epoch": 1.7539018321646687, "grad_norm": 0.27001041173934937, "learning_rate": 9.45554813143621e-06, "loss": 0.6348, "step": 3877 }, { "epoch": 1.754354218502601, "grad_norm": 0.28306347131729126, "learning_rate": 9.455215570333951e-06, "loss": 0.5391, "step": 3878 }, { "epoch": 1.754806604840534, "grad_norm": 0.25752830505371094, "learning_rate": 9.454882913547443e-06, "loss": 0.6766, "step": 3879 }, { "epoch": 1.7552589911784664, "grad_norm": 0.3307786285877228, "learning_rate": 9.45455016108383e-06, "loss": 0.9174, "step": 3880 }, { "epoch": 1.755711377516399, "grad_norm": 0.3170951306819916, "learning_rate": 9.45421731295026e-06, "loss": 0.641, "step": 3881 }, { "epoch": 1.7561637638543317, "grad_norm": 0.30443233251571655, "learning_rate": 9.453884369153878e-06, "loss": 0.7336, "step": 3882 }, { "epoch": 1.7566161501922641, "grad_norm": 0.27364736795425415, "learning_rate": 9.45355132970184e-06, "loss": 0.6094, "step": 3883 }, { "epoch": 1.7570685365301968, "grad_norm": 0.25460153818130493, "learning_rate": 9.453218194601295e-06, "loss": 0.538, "step": 3884 }, { "epoch": 1.7575209228681294, "grad_norm": 0.25590425729751587, "learning_rate": 9.4528849638594e-06, "loss": 0.5519, "step": 3885 }, { "epoch": 1.7579733092060619, "grad_norm": 0.28799694776535034, "learning_rate": 9.452551637483309e-06, "loss": 0.5351, "step": 3886 }, { "epoch": 1.7584256955439945, "grad_norm": 0.2883656322956085, "learning_rate": 9.452218215480184e-06, "loss": 0.6395, "step": 3887 }, { "epoch": 1.7588780818819272, "grad_norm": 0.3021397590637207, "learning_rate": 9.451884697857184e-06, "loss": 0.5812, "step": 3888 }, { "epoch": 1.7593304682198596, "grad_norm": 0.28175339102745056, "learning_rate": 9.45155108462147e-06, "loss": 0.5685, "step": 3889 }, { "epoch": 1.7597828545577925, "grad_norm": 0.3410613536834717, "learning_rate": 9.45121737578021e-06, "loss": 0.6427, "step": 3890 }, { "epoch": 1.760235240895725, "grad_norm": 0.2894584536552429, "learning_rate": 9.45088357134057e-06, "loss": 0.5606, "step": 3891 }, { "epoch": 1.7606876272336576, "grad_norm": 0.2920118570327759, "learning_rate": 9.45054967130972e-06, "loss": 0.5971, "step": 3892 }, { "epoch": 1.7611400135715902, "grad_norm": 0.2989917993545532, "learning_rate": 9.450215675694829e-06, "loss": 0.5435, "step": 3893 }, { "epoch": 1.7615923999095227, "grad_norm": 0.29622864723205566, "learning_rate": 9.449881584503073e-06, "loss": 0.508, "step": 3894 }, { "epoch": 1.7620447862474553, "grad_norm": 0.3055253326892853, "learning_rate": 9.449547397741625e-06, "loss": 0.5516, "step": 3895 }, { "epoch": 1.762497172585388, "grad_norm": 0.2840026319026947, "learning_rate": 9.449213115417663e-06, "loss": 0.4569, "step": 3896 }, { "epoch": 1.7629495589233204, "grad_norm": 0.29434698820114136, "learning_rate": 9.448878737538367e-06, "loss": 0.4971, "step": 3897 }, { "epoch": 1.7634019452612533, "grad_norm": 0.2899130582809448, "learning_rate": 9.448544264110919e-06, "loss": 0.4875, "step": 3898 }, { "epoch": 1.7638543315991857, "grad_norm": 0.33816319704055786, "learning_rate": 9.448209695142498e-06, "loss": 0.568, "step": 3899 }, { "epoch": 1.7643067179371184, "grad_norm": 0.3275698721408844, "learning_rate": 9.447875030640295e-06, "loss": 0.5621, "step": 3900 }, { "epoch": 1.764759104275051, "grad_norm": 0.33719974756240845, "learning_rate": 9.447540270611495e-06, "loss": 0.5826, "step": 3901 }, { "epoch": 1.7652114906129834, "grad_norm": 0.3311377763748169, "learning_rate": 9.447205415063288e-06, "loss": 0.5204, "step": 3902 }, { "epoch": 1.765663876950916, "grad_norm": 0.30825355648994446, "learning_rate": 9.446870464002867e-06, "loss": 0.5211, "step": 3903 }, { "epoch": 1.7661162632888487, "grad_norm": 0.3269532024860382, "learning_rate": 9.44653541743742e-06, "loss": 0.499, "step": 3904 }, { "epoch": 1.7665686496267812, "grad_norm": 0.320176362991333, "learning_rate": 9.446200275374151e-06, "loss": 0.4806, "step": 3905 }, { "epoch": 1.7670210359647138, "grad_norm": 0.39062145352363586, "learning_rate": 9.445865037820253e-06, "loss": 0.5798, "step": 3906 }, { "epoch": 1.7674734223026465, "grad_norm": 0.41119077801704407, "learning_rate": 9.445529704782928e-06, "loss": 0.5408, "step": 3907 }, { "epoch": 1.767925808640579, "grad_norm": 0.4149837791919708, "learning_rate": 9.445194276269373e-06, "loss": 0.6118, "step": 3908 }, { "epoch": 1.7683781949785118, "grad_norm": 0.4087030589580536, "learning_rate": 9.444858752286798e-06, "loss": 0.585, "step": 3909 }, { "epoch": 1.7688305813164442, "grad_norm": 0.4505324959754944, "learning_rate": 9.444523132842409e-06, "loss": 0.5424, "step": 3910 }, { "epoch": 1.7692829676543769, "grad_norm": 0.3275240659713745, "learning_rate": 9.44418741794341e-06, "loss": 0.9809, "step": 3911 }, { "epoch": 1.7697353539923095, "grad_norm": 0.1854424774646759, "learning_rate": 9.44385160759701e-06, "loss": 0.5642, "step": 3912 }, { "epoch": 1.770187740330242, "grad_norm": 0.2076703906059265, "learning_rate": 9.443515701810425e-06, "loss": 0.7961, "step": 3913 }, { "epoch": 1.7706401266681746, "grad_norm": 0.2149638533592224, "learning_rate": 9.443179700590872e-06, "loss": 0.6992, "step": 3914 }, { "epoch": 1.7710925130061073, "grad_norm": 0.24657738208770752, "learning_rate": 9.44284360394556e-06, "loss": 0.6066, "step": 3915 }, { "epoch": 1.7715448993440397, "grad_norm": 0.23050233721733093, "learning_rate": 9.442507411881711e-06, "loss": 0.6412, "step": 3916 }, { "epoch": 1.7719972856819726, "grad_norm": 0.2144007682800293, "learning_rate": 9.442171124406548e-06, "loss": 0.6007, "step": 3917 }, { "epoch": 1.772449672019905, "grad_norm": 0.22879065573215485, "learning_rate": 9.441834741527287e-06, "loss": 0.565, "step": 3918 }, { "epoch": 1.7729020583578374, "grad_norm": 0.24971821904182434, "learning_rate": 9.441498263251159e-06, "loss": 0.7168, "step": 3919 }, { "epoch": 1.7733544446957703, "grad_norm": 0.28788575530052185, "learning_rate": 9.441161689585386e-06, "loss": 0.6502, "step": 3920 }, { "epoch": 1.7738068310337027, "grad_norm": 0.22918877005577087, "learning_rate": 9.4408250205372e-06, "loss": 0.5148, "step": 3921 }, { "epoch": 1.7742592173716354, "grad_norm": 0.26751959323883057, "learning_rate": 9.440488256113826e-06, "loss": 0.6428, "step": 3922 }, { "epoch": 1.774711603709568, "grad_norm": 0.2583761513233185, "learning_rate": 9.440151396322504e-06, "loss": 0.6122, "step": 3923 }, { "epoch": 1.7751639900475005, "grad_norm": 0.2327471524477005, "learning_rate": 9.439814441170464e-06, "loss": 0.5564, "step": 3924 }, { "epoch": 1.7756163763854331, "grad_norm": 0.2355627864599228, "learning_rate": 9.439477390664942e-06, "loss": 0.5456, "step": 3925 }, { "epoch": 1.7760687627233658, "grad_norm": 0.31516239047050476, "learning_rate": 9.43914024481318e-06, "loss": 0.6541, "step": 3926 }, { "epoch": 1.7765211490612982, "grad_norm": 0.27553629875183105, "learning_rate": 9.438803003622418e-06, "loss": 0.653, "step": 3927 }, { "epoch": 1.776973535399231, "grad_norm": 0.25080278515815735, "learning_rate": 9.438465667099899e-06, "loss": 0.5489, "step": 3928 }, { "epoch": 1.7774259217371635, "grad_norm": 0.2838370203971863, "learning_rate": 9.438128235252866e-06, "loss": 0.6503, "step": 3929 }, { "epoch": 1.7778783080750962, "grad_norm": 0.2941870093345642, "learning_rate": 9.437790708088569e-06, "loss": 0.6418, "step": 3930 }, { "epoch": 1.7783306944130288, "grad_norm": 0.2697821855545044, "learning_rate": 9.437453085614252e-06, "loss": 0.5228, "step": 3931 }, { "epoch": 1.7787830807509613, "grad_norm": 0.22949202358722687, "learning_rate": 9.437115367837172e-06, "loss": 0.4551, "step": 3932 }, { "epoch": 1.779235467088894, "grad_norm": 0.2660045623779297, "learning_rate": 9.43677755476458e-06, "loss": 0.5226, "step": 3933 }, { "epoch": 1.7796878534268266, "grad_norm": 0.2736235558986664, "learning_rate": 9.43643964640373e-06, "loss": 0.5079, "step": 3934 }, { "epoch": 1.780140239764759, "grad_norm": 0.27052879333496094, "learning_rate": 9.43610164276188e-06, "loss": 0.4966, "step": 3935 }, { "epoch": 1.7805926261026916, "grad_norm": 0.275173157453537, "learning_rate": 9.435763543846288e-06, "loss": 0.5416, "step": 3936 }, { "epoch": 1.7810450124406243, "grad_norm": 0.29647013545036316, "learning_rate": 9.43542534966422e-06, "loss": 0.4575, "step": 3937 }, { "epoch": 1.7814973987785567, "grad_norm": 0.285526841878891, "learning_rate": 9.435087060222935e-06, "loss": 0.5151, "step": 3938 }, { "epoch": 1.7819497851164896, "grad_norm": 0.2788732945919037, "learning_rate": 9.434748675529696e-06, "loss": 0.5402, "step": 3939 }, { "epoch": 1.782402171454422, "grad_norm": 0.2952754497528076, "learning_rate": 9.434410195591776e-06, "loss": 0.5647, "step": 3940 }, { "epoch": 1.7828545577923547, "grad_norm": 0.2884630858898163, "learning_rate": 9.434071620416444e-06, "loss": 0.5519, "step": 3941 }, { "epoch": 1.7833069441302873, "grad_norm": 0.2780975103378296, "learning_rate": 9.43373295001097e-06, "loss": 0.5386, "step": 3942 }, { "epoch": 1.7837593304682198, "grad_norm": 0.2789194583892822, "learning_rate": 9.433394184382625e-06, "loss": 0.4887, "step": 3943 }, { "epoch": 1.7842117168061524, "grad_norm": 0.2795523703098297, "learning_rate": 9.433055323538688e-06, "loss": 0.513, "step": 3944 }, { "epoch": 1.784664103144085, "grad_norm": 0.2932177186012268, "learning_rate": 9.432716367486436e-06, "loss": 0.4757, "step": 3945 }, { "epoch": 1.7851164894820175, "grad_norm": 0.3058220446109772, "learning_rate": 9.432377316233147e-06, "loss": 0.6134, "step": 3946 }, { "epoch": 1.7855688758199504, "grad_norm": 0.31043946743011475, "learning_rate": 9.432038169786107e-06, "loss": 0.5601, "step": 3947 }, { "epoch": 1.7860212621578828, "grad_norm": 0.35394755005836487, "learning_rate": 9.431698928152596e-06, "loss": 0.6277, "step": 3948 }, { "epoch": 1.7864736484958155, "grad_norm": 0.3357166647911072, "learning_rate": 9.431359591339901e-06, "loss": 0.6304, "step": 3949 }, { "epoch": 1.7869260348337481, "grad_norm": 0.39095446467399597, "learning_rate": 9.43102015935531e-06, "loss": 0.616, "step": 3950 }, { "epoch": 1.7873784211716806, "grad_norm": 0.31693127751350403, "learning_rate": 9.430680632206113e-06, "loss": 0.4894, "step": 3951 }, { "epoch": 1.7878308075096132, "grad_norm": 0.32370319962501526, "learning_rate": 9.430341009899602e-06, "loss": 0.5424, "step": 3952 }, { "epoch": 1.7882831938475459, "grad_norm": 0.36244556307792664, "learning_rate": 9.430001292443069e-06, "loss": 0.6081, "step": 3953 }, { "epoch": 1.7887355801854783, "grad_norm": 0.32623982429504395, "learning_rate": 9.429661479843813e-06, "loss": 0.5153, "step": 3954 }, { "epoch": 1.789187966523411, "grad_norm": 0.4036509692668915, "learning_rate": 9.429321572109131e-06, "loss": 0.6256, "step": 3955 }, { "epoch": 1.7896403528613436, "grad_norm": 0.33282893896102905, "learning_rate": 9.428981569246324e-06, "loss": 0.4793, "step": 3956 }, { "epoch": 1.790092739199276, "grad_norm": 0.4184126853942871, "learning_rate": 9.428641471262695e-06, "loss": 0.6568, "step": 3957 }, { "epoch": 1.790545125537209, "grad_norm": 0.4216758906841278, "learning_rate": 9.428301278165544e-06, "loss": 0.5937, "step": 3958 }, { "epoch": 1.7909975118751413, "grad_norm": 0.41134893894195557, "learning_rate": 9.42796098996218e-06, "loss": 0.5011, "step": 3959 }, { "epoch": 1.791449898213074, "grad_norm": 0.40922796726226807, "learning_rate": 9.427620606659914e-06, "loss": 0.563, "step": 3960 }, { "epoch": 1.7919022845510066, "grad_norm": 0.3147052824497223, "learning_rate": 9.427280128266049e-06, "loss": 1.1044, "step": 3961 }, { "epoch": 1.792354670888939, "grad_norm": 0.18318906426429749, "learning_rate": 9.426939554787906e-06, "loss": 0.6016, "step": 3962 }, { "epoch": 1.7928070572268717, "grad_norm": 0.20293650031089783, "learning_rate": 9.426598886232795e-06, "loss": 0.637, "step": 3963 }, { "epoch": 1.7932594435648044, "grad_norm": 0.1860267072916031, "learning_rate": 9.426258122608033e-06, "loss": 0.486, "step": 3964 }, { "epoch": 1.7937118299027368, "grad_norm": 0.20409642159938812, "learning_rate": 9.425917263920939e-06, "loss": 0.5327, "step": 3965 }, { "epoch": 1.7941642162406697, "grad_norm": 0.2335260510444641, "learning_rate": 9.425576310178831e-06, "loss": 0.6119, "step": 3966 }, { "epoch": 1.7946166025786021, "grad_norm": 0.217774897813797, "learning_rate": 9.425235261389036e-06, "loss": 0.5024, "step": 3967 }, { "epoch": 1.7950689889165348, "grad_norm": 0.23390698432922363, "learning_rate": 9.424894117558876e-06, "loss": 0.5514, "step": 3968 }, { "epoch": 1.7955213752544674, "grad_norm": 0.27835357189178467, "learning_rate": 9.424552878695679e-06, "loss": 0.6056, "step": 3969 }, { "epoch": 1.7959737615923999, "grad_norm": 0.24317854642868042, "learning_rate": 9.424211544806772e-06, "loss": 0.5745, "step": 3970 }, { "epoch": 1.7964261479303325, "grad_norm": 0.28743261098861694, "learning_rate": 9.423870115899487e-06, "loss": 0.6448, "step": 3971 }, { "epoch": 1.7968785342682652, "grad_norm": 0.2650405466556549, "learning_rate": 9.423528591981158e-06, "loss": 0.6488, "step": 3972 }, { "epoch": 1.7973309206061976, "grad_norm": 0.2490166872739792, "learning_rate": 9.423186973059115e-06, "loss": 0.6039, "step": 3973 }, { "epoch": 1.7977833069441302, "grad_norm": 0.23539267480373383, "learning_rate": 9.422845259140701e-06, "loss": 0.5156, "step": 3974 }, { "epoch": 1.798235693282063, "grad_norm": 0.2649345397949219, "learning_rate": 9.422503450233253e-06, "loss": 0.6578, "step": 3975 }, { "epoch": 1.7986880796199953, "grad_norm": 0.25256046652793884, "learning_rate": 9.42216154634411e-06, "loss": 0.5437, "step": 3976 }, { "epoch": 1.7991404659579282, "grad_norm": 0.2933811843395233, "learning_rate": 9.421819547480617e-06, "loss": 0.6496, "step": 3977 }, { "epoch": 1.7995928522958606, "grad_norm": 0.26668500900268555, "learning_rate": 9.421477453650118e-06, "loss": 0.505, "step": 3978 }, { "epoch": 1.8000452386337933, "grad_norm": 0.2630860209465027, "learning_rate": 9.421135264859962e-06, "loss": 0.5224, "step": 3979 }, { "epoch": 1.800497624971726, "grad_norm": 0.2612780034542084, "learning_rate": 9.420792981117497e-06, "loss": 0.5637, "step": 3980 }, { "epoch": 1.8009500113096584, "grad_norm": 0.2896125316619873, "learning_rate": 9.420450602430072e-06, "loss": 0.5645, "step": 3981 }, { "epoch": 1.801402397647591, "grad_norm": 0.27498671412467957, "learning_rate": 9.420108128805043e-06, "loss": 0.5488, "step": 3982 }, { "epoch": 1.8018547839855237, "grad_norm": 0.2703060805797577, "learning_rate": 9.419765560249765e-06, "loss": 0.5784, "step": 3983 }, { "epoch": 1.802307170323456, "grad_norm": 0.28238123655319214, "learning_rate": 9.419422896771595e-06, "loss": 0.5607, "step": 3984 }, { "epoch": 1.802759556661389, "grad_norm": 0.3086361885070801, "learning_rate": 9.419080138377892e-06, "loss": 0.5548, "step": 3985 }, { "epoch": 1.8032119429993214, "grad_norm": 0.27144691348075867, "learning_rate": 9.418737285076016e-06, "loss": 0.5096, "step": 3986 }, { "epoch": 1.803664329337254, "grad_norm": 0.27561384439468384, "learning_rate": 9.418394336873333e-06, "loss": 0.4221, "step": 3987 }, { "epoch": 1.8041167156751867, "grad_norm": 0.2899794280529022, "learning_rate": 9.418051293777208e-06, "loss": 0.593, "step": 3988 }, { "epoch": 1.8045691020131192, "grad_norm": 0.28757426142692566, "learning_rate": 9.417708155795007e-06, "loss": 0.5336, "step": 3989 }, { "epoch": 1.8050214883510518, "grad_norm": 0.31562966108322144, "learning_rate": 9.417364922934103e-06, "loss": 0.6138, "step": 3990 }, { "epoch": 1.8054738746889845, "grad_norm": 0.286093533039093, "learning_rate": 9.417021595201864e-06, "loss": 0.5365, "step": 3991 }, { "epoch": 1.805926261026917, "grad_norm": 0.30727651715278625, "learning_rate": 9.416678172605662e-06, "loss": 0.5765, "step": 3992 }, { "epoch": 1.8063786473648495, "grad_norm": 0.2962477207183838, "learning_rate": 9.416334655152876e-06, "loss": 0.6134, "step": 3993 }, { "epoch": 1.8068310337027822, "grad_norm": 0.37647610902786255, "learning_rate": 9.415991042850885e-06, "loss": 0.6963, "step": 3994 }, { "epoch": 1.8072834200407146, "grad_norm": 0.3373658061027527, "learning_rate": 9.415647335707067e-06, "loss": 0.6567, "step": 3995 }, { "epoch": 1.8077358063786475, "grad_norm": 0.3595038652420044, "learning_rate": 9.415303533728802e-06, "loss": 0.6613, "step": 3996 }, { "epoch": 1.80818819271658, "grad_norm": 0.3031587302684784, "learning_rate": 9.414959636923475e-06, "loss": 0.524, "step": 3997 }, { "epoch": 1.8086405790545126, "grad_norm": 0.3039799928665161, "learning_rate": 9.414615645298475e-06, "loss": 0.499, "step": 3998 }, { "epoch": 1.8090929653924452, "grad_norm": 0.3135698735713959, "learning_rate": 9.414271558861185e-06, "loss": 0.5429, "step": 3999 }, { "epoch": 1.8095453517303777, "grad_norm": 0.3089488446712494, "learning_rate": 9.413927377618996e-06, "loss": 0.5541, "step": 4000 }, { "epoch": 1.8095453517303777, "eval_loss": 0.6033166646957397, "eval_runtime": 26.4208, "eval_samples_per_second": 28.16, "eval_steps_per_second": 7.04, "step": 4000 }, { "epoch": 1.8099977380683103, "grad_norm": 0.29833799600601196, "learning_rate": 9.413583101579302e-06, "loss": 0.4937, "step": 4001 }, { "epoch": 1.810450124406243, "grad_norm": 0.3550857901573181, "learning_rate": 9.413238730749497e-06, "loss": 0.5852, "step": 4002 }, { "epoch": 1.8109025107441754, "grad_norm": 0.3239154517650604, "learning_rate": 9.412894265136974e-06, "loss": 0.5238, "step": 4003 }, { "epoch": 1.8113548970821083, "grad_norm": 0.33474254608154297, "learning_rate": 9.412549704749135e-06, "loss": 0.5827, "step": 4004 }, { "epoch": 1.8118072834200407, "grad_norm": 0.3274690508842468, "learning_rate": 9.412205049593377e-06, "loss": 0.5107, "step": 4005 }, { "epoch": 1.8122596697579731, "grad_norm": 0.40966370701789856, "learning_rate": 9.411860299677102e-06, "loss": 0.6572, "step": 4006 }, { "epoch": 1.812712056095906, "grad_norm": 0.39374619722366333, "learning_rate": 9.411515455007716e-06, "loss": 0.5542, "step": 4007 }, { "epoch": 1.8131644424338385, "grad_norm": 0.3948548436164856, "learning_rate": 9.411170515592626e-06, "loss": 0.558, "step": 4008 }, { "epoch": 1.813616828771771, "grad_norm": 0.39563214778900146, "learning_rate": 9.410825481439238e-06, "loss": 0.5984, "step": 4009 }, { "epoch": 1.8140692151097038, "grad_norm": 0.45269134640693665, "learning_rate": 9.410480352554963e-06, "loss": 0.5723, "step": 4010 }, { "epoch": 1.8145216014476362, "grad_norm": 0.3412630558013916, "learning_rate": 9.410135128947213e-06, "loss": 0.9021, "step": 4011 }, { "epoch": 1.8149739877855688, "grad_norm": 0.16502295434474945, "learning_rate": 9.409789810623401e-06, "loss": 1.1869, "step": 4012 }, { "epoch": 1.8154263741235015, "grad_norm": 0.17602404952049255, "learning_rate": 9.409444397590947e-06, "loss": 0.8936, "step": 4013 }, { "epoch": 1.815878760461434, "grad_norm": 0.18048091232776642, "learning_rate": 9.409098889857268e-06, "loss": 0.5137, "step": 4014 }, { "epoch": 1.8163311467993668, "grad_norm": 0.22223486006259918, "learning_rate": 9.408753287429784e-06, "loss": 0.7495, "step": 4015 }, { "epoch": 1.8167835331372992, "grad_norm": 0.21173346042633057, "learning_rate": 9.408407590315918e-06, "loss": 0.5671, "step": 4016 }, { "epoch": 1.8172359194752319, "grad_norm": 0.2031809687614441, "learning_rate": 9.408061798523091e-06, "loss": 0.5887, "step": 4017 }, { "epoch": 1.8176883058131645, "grad_norm": 0.23224566876888275, "learning_rate": 9.407715912058734e-06, "loss": 0.5774, "step": 4018 }, { "epoch": 1.818140692151097, "grad_norm": 0.2639296352863312, "learning_rate": 9.407369930930274e-06, "loss": 0.6535, "step": 4019 }, { "epoch": 1.8185930784890296, "grad_norm": 0.25432008504867554, "learning_rate": 9.407023855145142e-06, "loss": 0.7885, "step": 4020 }, { "epoch": 1.8190454648269623, "grad_norm": 0.26081493496894836, "learning_rate": 9.40667768471077e-06, "loss": 0.5374, "step": 4021 }, { "epoch": 1.8194978511648947, "grad_norm": 0.2829272449016571, "learning_rate": 9.406331419634593e-06, "loss": 0.7957, "step": 4022 }, { "epoch": 1.8199502375028274, "grad_norm": 0.24509139358997345, "learning_rate": 9.405985059924047e-06, "loss": 0.568, "step": 4023 }, { "epoch": 1.82040262384076, "grad_norm": 0.30002516508102417, "learning_rate": 9.40563860558657e-06, "loss": 0.5243, "step": 4024 }, { "epoch": 1.8208550101786924, "grad_norm": 0.27762413024902344, "learning_rate": 9.405292056629607e-06, "loss": 0.6625, "step": 4025 }, { "epoch": 1.8213073965166253, "grad_norm": 0.2666238248348236, "learning_rate": 9.404945413060597e-06, "loss": 0.591, "step": 4026 }, { "epoch": 1.8217597828545578, "grad_norm": 0.28695204854011536, "learning_rate": 9.404598674886985e-06, "loss": 0.6589, "step": 4027 }, { "epoch": 1.8222121691924904, "grad_norm": 0.27570635080337524, "learning_rate": 9.40425184211622e-06, "loss": 0.6472, "step": 4028 }, { "epoch": 1.822664555530423, "grad_norm": 0.25030842423439026, "learning_rate": 9.403904914755748e-06, "loss": 0.5086, "step": 4029 }, { "epoch": 1.8231169418683555, "grad_norm": 0.2578973174095154, "learning_rate": 9.403557892813021e-06, "loss": 0.4942, "step": 4030 }, { "epoch": 1.8235693282062881, "grad_norm": 0.27718886733055115, "learning_rate": 9.403210776295494e-06, "loss": 0.5002, "step": 4031 }, { "epoch": 1.8240217145442208, "grad_norm": 0.27718716859817505, "learning_rate": 9.402863565210618e-06, "loss": 0.611, "step": 4032 }, { "epoch": 1.8244741008821532, "grad_norm": 0.2706460654735565, "learning_rate": 9.402516259565854e-06, "loss": 0.5665, "step": 4033 }, { "epoch": 1.824926487220086, "grad_norm": 0.27675244212150574, "learning_rate": 9.40216885936866e-06, "loss": 0.5601, "step": 4034 }, { "epoch": 1.8253788735580185, "grad_norm": 0.269819974899292, "learning_rate": 9.401821364626496e-06, "loss": 0.5146, "step": 4035 }, { "epoch": 1.8258312598959512, "grad_norm": 0.28453540802001953, "learning_rate": 9.401473775346825e-06, "loss": 0.4538, "step": 4036 }, { "epoch": 1.8262836462338838, "grad_norm": 0.3001590371131897, "learning_rate": 9.401126091537114e-06, "loss": 0.5742, "step": 4037 }, { "epoch": 1.8267360325718163, "grad_norm": 0.2863599359989166, "learning_rate": 9.400778313204828e-06, "loss": 0.5383, "step": 4038 }, { "epoch": 1.827188418909749, "grad_norm": 0.28811943531036377, "learning_rate": 9.400430440357438e-06, "loss": 0.5351, "step": 4039 }, { "epoch": 1.8276408052476816, "grad_norm": 0.29864251613616943, "learning_rate": 9.400082473002413e-06, "loss": 0.4572, "step": 4040 }, { "epoch": 1.828093191585614, "grad_norm": 0.2871939539909363, "learning_rate": 9.399734411147228e-06, "loss": 0.5772, "step": 4041 }, { "epoch": 1.8285455779235467, "grad_norm": 0.26380765438079834, "learning_rate": 9.399386254799359e-06, "loss": 0.4696, "step": 4042 }, { "epoch": 1.8289979642614793, "grad_norm": 0.32017895579338074, "learning_rate": 9.39903800396628e-06, "loss": 0.5907, "step": 4043 }, { "epoch": 1.8294503505994117, "grad_norm": 0.31494101881980896, "learning_rate": 9.398689658655475e-06, "loss": 0.553, "step": 4044 }, { "epoch": 1.8299027369373446, "grad_norm": 0.2899341583251953, "learning_rate": 9.398341218874423e-06, "loss": 0.5554, "step": 4045 }, { "epoch": 1.830355123275277, "grad_norm": 0.3167458474636078, "learning_rate": 9.397992684630606e-06, "loss": 0.6191, "step": 4046 }, { "epoch": 1.8308075096132097, "grad_norm": 0.2893627882003784, "learning_rate": 9.397644055931512e-06, "loss": 0.4798, "step": 4047 }, { "epoch": 1.8312598959511424, "grad_norm": 0.313271164894104, "learning_rate": 9.397295332784625e-06, "loss": 0.5617, "step": 4048 }, { "epoch": 1.8317122822890748, "grad_norm": 0.3165487051010132, "learning_rate": 9.39694651519744e-06, "loss": 0.521, "step": 4049 }, { "epoch": 1.8321646686270074, "grad_norm": 0.37475255131721497, "learning_rate": 9.396597603177445e-06, "loss": 0.5785, "step": 4050 }, { "epoch": 1.83261705496494, "grad_norm": 0.32133054733276367, "learning_rate": 9.39624859673213e-06, "loss": 0.5314, "step": 4051 }, { "epoch": 1.8330694413028725, "grad_norm": 0.36958006024360657, "learning_rate": 9.395899495868998e-06, "loss": 0.657, "step": 4052 }, { "epoch": 1.8335218276408054, "grad_norm": 0.36283910274505615, "learning_rate": 9.395550300595542e-06, "loss": 0.5083, "step": 4053 }, { "epoch": 1.8339742139787378, "grad_norm": 0.3306640386581421, "learning_rate": 9.395201010919263e-06, "loss": 0.46, "step": 4054 }, { "epoch": 1.8344266003166705, "grad_norm": 0.3957188129425049, "learning_rate": 9.394851626847662e-06, "loss": 0.5881, "step": 4055 }, { "epoch": 1.8348789866546031, "grad_norm": 0.3741939663887024, "learning_rate": 9.394502148388245e-06, "loss": 0.5011, "step": 4056 }, { "epoch": 1.8353313729925356, "grad_norm": 0.4040469527244568, "learning_rate": 9.394152575548512e-06, "loss": 0.5407, "step": 4057 }, { "epoch": 1.8357837593304682, "grad_norm": 0.4023972153663635, "learning_rate": 9.393802908335978e-06, "loss": 0.5129, "step": 4058 }, { "epoch": 1.8362361456684009, "grad_norm": 0.4825342893600464, "learning_rate": 9.393453146758147e-06, "loss": 0.5582, "step": 4059 }, { "epoch": 1.8366885320063333, "grad_norm": 0.5093990564346313, "learning_rate": 9.393103290822533e-06, "loss": 0.5682, "step": 4060 }, { "epoch": 1.837140918344266, "grad_norm": 0.3455466330051422, "learning_rate": 9.392753340536651e-06, "loss": 0.9855, "step": 4061 }, { "epoch": 1.8375933046821986, "grad_norm": 0.19139672815799713, "learning_rate": 9.392403295908016e-06, "loss": 0.7108, "step": 4062 }, { "epoch": 1.838045691020131, "grad_norm": 0.19020573794841766, "learning_rate": 9.392053156944145e-06, "loss": 0.5722, "step": 4063 }, { "epoch": 1.838498077358064, "grad_norm": 0.21952813863754272, "learning_rate": 9.391702923652558e-06, "loss": 0.7138, "step": 4064 }, { "epoch": 1.8389504636959964, "grad_norm": 0.20950454473495483, "learning_rate": 9.391352596040778e-06, "loss": 0.6189, "step": 4065 }, { "epoch": 1.839402850033929, "grad_norm": 0.2145305722951889, "learning_rate": 9.391002174116329e-06, "loss": 0.6068, "step": 4066 }, { "epoch": 1.8398552363718617, "grad_norm": 0.21847085654735565, "learning_rate": 9.390651657886735e-06, "loss": 0.732, "step": 4067 }, { "epoch": 1.840307622709794, "grad_norm": 0.2175823301076889, "learning_rate": 9.390301047359526e-06, "loss": 0.5414, "step": 4068 }, { "epoch": 1.8407600090477267, "grad_norm": 0.259246289730072, "learning_rate": 9.389950342542231e-06, "loss": 0.6488, "step": 4069 }, { "epoch": 1.8412123953856594, "grad_norm": 0.24157918989658356, "learning_rate": 9.389599543442383e-06, "loss": 0.6977, "step": 4070 }, { "epoch": 1.8416647817235918, "grad_norm": 0.2630341649055481, "learning_rate": 9.389248650067517e-06, "loss": 0.6061, "step": 4071 }, { "epoch": 1.8421171680615247, "grad_norm": 0.24502508342266083, "learning_rate": 9.388897662425165e-06, "loss": 0.6338, "step": 4072 }, { "epoch": 1.8425695543994571, "grad_norm": 0.27131035923957825, "learning_rate": 9.38854658052287e-06, "loss": 0.647, "step": 4073 }, { "epoch": 1.8430219407373898, "grad_norm": 0.294382780790329, "learning_rate": 9.388195404368168e-06, "loss": 0.6828, "step": 4074 }, { "epoch": 1.8434743270753224, "grad_norm": 0.23715947568416595, "learning_rate": 9.387844133968605e-06, "loss": 0.5261, "step": 4075 }, { "epoch": 1.8439267134132549, "grad_norm": 0.2735869288444519, "learning_rate": 9.38749276933172e-06, "loss": 0.5849, "step": 4076 }, { "epoch": 1.8443790997511875, "grad_norm": 0.2636812627315521, "learning_rate": 9.387141310465067e-06, "loss": 0.5049, "step": 4077 }, { "epoch": 1.8448314860891202, "grad_norm": 0.2947055697441101, "learning_rate": 9.386789757376186e-06, "loss": 0.7329, "step": 4078 }, { "epoch": 1.8452838724270526, "grad_norm": 0.24476715922355652, "learning_rate": 9.386438110072634e-06, "loss": 0.5369, "step": 4079 }, { "epoch": 1.8457362587649853, "grad_norm": 0.2446790188550949, "learning_rate": 9.386086368561957e-06, "loss": 0.4931, "step": 4080 }, { "epoch": 1.846188645102918, "grad_norm": 0.2847767472267151, "learning_rate": 9.385734532851715e-06, "loss": 0.6301, "step": 4081 }, { "epoch": 1.8466410314408503, "grad_norm": 0.31326356530189514, "learning_rate": 9.385382602949462e-06, "loss": 0.6796, "step": 4082 }, { "epoch": 1.8470934177787832, "grad_norm": 0.27539658546447754, "learning_rate": 9.385030578862758e-06, "loss": 0.529, "step": 4083 }, { "epoch": 1.8475458041167157, "grad_norm": 0.30497485399246216, "learning_rate": 9.384678460599158e-06, "loss": 0.6465, "step": 4084 }, { "epoch": 1.8479981904546483, "grad_norm": 0.27235424518585205, "learning_rate": 9.38432624816623e-06, "loss": 0.5543, "step": 4085 }, { "epoch": 1.848450576792581, "grad_norm": 0.3149414658546448, "learning_rate": 9.383973941571538e-06, "loss": 0.5869, "step": 4086 }, { "epoch": 1.8489029631305134, "grad_norm": 0.3120245337486267, "learning_rate": 9.383621540822644e-06, "loss": 0.6397, "step": 4087 }, { "epoch": 1.849355349468446, "grad_norm": 0.2868851125240326, "learning_rate": 9.383269045927121e-06, "loss": 0.5412, "step": 4088 }, { "epoch": 1.8498077358063787, "grad_norm": 0.2793721556663513, "learning_rate": 9.38291645689254e-06, "loss": 0.4935, "step": 4089 }, { "epoch": 1.8502601221443111, "grad_norm": 0.3137177526950836, "learning_rate": 9.382563773726468e-06, "loss": 0.5663, "step": 4090 }, { "epoch": 1.850712508482244, "grad_norm": 0.2932126820087433, "learning_rate": 9.382210996436485e-06, "loss": 0.4892, "step": 4091 }, { "epoch": 1.8511648948201764, "grad_norm": 0.28519099950790405, "learning_rate": 9.381858125030165e-06, "loss": 0.5184, "step": 4092 }, { "epoch": 1.8516172811581089, "grad_norm": 0.3132636845111847, "learning_rate": 9.381505159515088e-06, "loss": 0.5945, "step": 4093 }, { "epoch": 1.8520696674960417, "grad_norm": 0.3089517652988434, "learning_rate": 9.381152099898835e-06, "loss": 0.5282, "step": 4094 }, { "epoch": 1.8525220538339742, "grad_norm": 0.35842326283454895, "learning_rate": 9.380798946188985e-06, "loss": 0.5499, "step": 4095 }, { "epoch": 1.8529744401719068, "grad_norm": 0.28762999176979065, "learning_rate": 9.380445698393126e-06, "loss": 0.4856, "step": 4096 }, { "epoch": 1.8534268265098395, "grad_norm": 0.2983216643333435, "learning_rate": 9.380092356518846e-06, "loss": 0.5533, "step": 4097 }, { "epoch": 1.853879212847772, "grad_norm": 0.3462080955505371, "learning_rate": 9.379738920573728e-06, "loss": 0.5996, "step": 4098 }, { "epoch": 1.8543315991857046, "grad_norm": 0.2848760783672333, "learning_rate": 9.379385390565368e-06, "loss": 0.4735, "step": 4099 }, { "epoch": 1.8547839855236372, "grad_norm": 0.3380444645881653, "learning_rate": 9.379031766501357e-06, "loss": 0.5494, "step": 4100 }, { "epoch": 1.8552363718615696, "grad_norm": 0.34680575132369995, "learning_rate": 9.378678048389288e-06, "loss": 0.5811, "step": 4101 }, { "epoch": 1.8556887581995025, "grad_norm": 0.31134700775146484, "learning_rate": 9.378324236236763e-06, "loss": 0.5159, "step": 4102 }, { "epoch": 1.856141144537435, "grad_norm": 0.40586790442466736, "learning_rate": 9.377970330051375e-06, "loss": 0.6224, "step": 4103 }, { "epoch": 1.8565935308753676, "grad_norm": 0.37333598732948303, "learning_rate": 9.377616329840727e-06, "loss": 0.6726, "step": 4104 }, { "epoch": 1.8570459172133003, "grad_norm": 0.349976122379303, "learning_rate": 9.377262235612423e-06, "loss": 0.5479, "step": 4105 }, { "epoch": 1.8574983035512327, "grad_norm": 0.4378539025783539, "learning_rate": 9.376908047374067e-06, "loss": 0.6002, "step": 4106 }, { "epoch": 1.8579506898891653, "grad_norm": 0.3937978446483612, "learning_rate": 9.376553765133263e-06, "loss": 0.556, "step": 4107 }, { "epoch": 1.858403076227098, "grad_norm": 0.4229775369167328, "learning_rate": 9.376199388897625e-06, "loss": 0.5375, "step": 4108 }, { "epoch": 1.8588554625650304, "grad_norm": 0.4716390371322632, "learning_rate": 9.37584491867476e-06, "loss": 0.6158, "step": 4109 }, { "epoch": 1.859307848902963, "grad_norm": 0.4916490614414215, "learning_rate": 9.375490354472283e-06, "loss": 0.5183, "step": 4110 }, { "epoch": 1.8597602352408957, "grad_norm": 0.37345126271247864, "learning_rate": 9.375135696297808e-06, "loss": 1.0588, "step": 4111 }, { "epoch": 1.8602126215788282, "grad_norm": 0.1698429435491562, "learning_rate": 9.374780944158952e-06, "loss": 0.8678, "step": 4112 }, { "epoch": 1.860665007916761, "grad_norm": 0.18809260427951813, "learning_rate": 9.374426098063334e-06, "loss": 0.5419, "step": 4113 }, { "epoch": 1.8611173942546935, "grad_norm": 0.2085782140493393, "learning_rate": 9.374071158018577e-06, "loss": 0.6856, "step": 4114 }, { "epoch": 1.8615697805926261, "grad_norm": 0.2202918380498886, "learning_rate": 9.3737161240323e-06, "loss": 0.6545, "step": 4115 }, { "epoch": 1.8620221669305588, "grad_norm": 0.2608036994934082, "learning_rate": 9.373360996112131e-06, "loss": 0.6465, "step": 4116 }, { "epoch": 1.8624745532684912, "grad_norm": 0.22932612895965576, "learning_rate": 9.373005774265697e-06, "loss": 0.6415, "step": 4117 }, { "epoch": 1.8629269396064239, "grad_norm": 0.21617858111858368, "learning_rate": 9.372650458500626e-06, "loss": 0.6522, "step": 4118 }, { "epoch": 1.8633793259443565, "grad_norm": 0.24115236103534698, "learning_rate": 9.37229504882455e-06, "loss": 0.6797, "step": 4119 }, { "epoch": 1.863831712282289, "grad_norm": 0.22190287709236145, "learning_rate": 9.3719395452451e-06, "loss": 0.5961, "step": 4120 }, { "epoch": 1.8642840986202218, "grad_norm": 0.2519604563713074, "learning_rate": 9.371583947769914e-06, "loss": 0.6196, "step": 4121 }, { "epoch": 1.8647364849581543, "grad_norm": 0.28353777527809143, "learning_rate": 9.371228256406627e-06, "loss": 0.6999, "step": 4122 }, { "epoch": 1.865188871296087, "grad_norm": 0.2501011788845062, "learning_rate": 9.37087247116288e-06, "loss": 0.6619, "step": 4123 }, { "epoch": 1.8656412576340196, "grad_norm": 0.2432045191526413, "learning_rate": 9.370516592046313e-06, "loss": 0.5434, "step": 4124 }, { "epoch": 1.866093643971952, "grad_norm": 0.2824763357639313, "learning_rate": 9.370160619064568e-06, "loss": 0.5411, "step": 4125 }, { "epoch": 1.8665460303098846, "grad_norm": 0.2680625319480896, "learning_rate": 9.369804552225292e-06, "loss": 0.6272, "step": 4126 }, { "epoch": 1.8669984166478173, "grad_norm": 0.31738367676734924, "learning_rate": 9.369448391536132e-06, "loss": 0.6191, "step": 4127 }, { "epoch": 1.8674508029857497, "grad_norm": 0.2607821822166443, "learning_rate": 9.369092137004738e-06, "loss": 0.6068, "step": 4128 }, { "epoch": 1.8679031893236824, "grad_norm": 0.3001070022583008, "learning_rate": 9.368735788638758e-06, "loss": 0.6766, "step": 4129 }, { "epoch": 1.868355575661615, "grad_norm": 0.23703035712242126, "learning_rate": 9.368379346445848e-06, "loss": 0.5527, "step": 4130 }, { "epoch": 1.8688079619995475, "grad_norm": 0.30180996656417847, "learning_rate": 9.368022810433664e-06, "loss": 0.6047, "step": 4131 }, { "epoch": 1.8692603483374803, "grad_norm": 0.30240869522094727, "learning_rate": 9.367666180609861e-06, "loss": 0.553, "step": 4132 }, { "epoch": 1.8697127346754128, "grad_norm": 0.27976444363594055, "learning_rate": 9.3673094569821e-06, "loss": 0.6439, "step": 4133 }, { "epoch": 1.8701651210133454, "grad_norm": 0.3008115887641907, "learning_rate": 9.366952639558041e-06, "loss": 0.6284, "step": 4134 }, { "epoch": 1.870617507351278, "grad_norm": 0.2683003842830658, "learning_rate": 9.36659572834535e-06, "loss": 0.5276, "step": 4135 }, { "epoch": 1.8710698936892105, "grad_norm": 0.26556307077407837, "learning_rate": 9.366238723351687e-06, "loss": 0.5602, "step": 4136 }, { "epoch": 1.8715222800271432, "grad_norm": 0.26104477047920227, "learning_rate": 9.365881624584725e-06, "loss": 0.5085, "step": 4137 }, { "epoch": 1.8719746663650758, "grad_norm": 0.27763891220092773, "learning_rate": 9.36552443205213e-06, "loss": 0.531, "step": 4138 }, { "epoch": 1.8724270527030082, "grad_norm": 0.340660035610199, "learning_rate": 9.365167145761574e-06, "loss": 0.647, "step": 4139 }, { "epoch": 1.8728794390409411, "grad_norm": 0.2541746497154236, "learning_rate": 9.364809765720732e-06, "loss": 0.5215, "step": 4140 }, { "epoch": 1.8733318253788735, "grad_norm": 0.3141191005706787, "learning_rate": 9.364452291937278e-06, "loss": 0.5968, "step": 4141 }, { "epoch": 1.8737842117168062, "grad_norm": 0.2975021302700043, "learning_rate": 9.36409472441889e-06, "loss": 0.5226, "step": 4142 }, { "epoch": 1.8742365980547389, "grad_norm": 0.35212427377700806, "learning_rate": 9.363737063173246e-06, "loss": 0.7309, "step": 4143 }, { "epoch": 1.8746889843926713, "grad_norm": 0.29625073075294495, "learning_rate": 9.36337930820803e-06, "loss": 0.4783, "step": 4144 }, { "epoch": 1.875141370730604, "grad_norm": 0.32449018955230713, "learning_rate": 9.363021459530923e-06, "loss": 0.5651, "step": 4145 }, { "epoch": 1.8755937570685366, "grad_norm": 0.32368001341819763, "learning_rate": 9.362663517149613e-06, "loss": 0.5439, "step": 4146 }, { "epoch": 1.876046143406469, "grad_norm": 0.32996341586112976, "learning_rate": 9.362305481071785e-06, "loss": 0.5799, "step": 4147 }, { "epoch": 1.8764985297444017, "grad_norm": 0.3121313452720642, "learning_rate": 9.361947351305129e-06, "loss": 0.5986, "step": 4148 }, { "epoch": 1.8769509160823343, "grad_norm": 0.3629032373428345, "learning_rate": 9.361589127857338e-06, "loss": 0.5764, "step": 4149 }, { "epoch": 1.8774033024202668, "grad_norm": 0.3596321642398834, "learning_rate": 9.361230810736105e-06, "loss": 0.5763, "step": 4150 }, { "epoch": 1.8778556887581996, "grad_norm": 0.36809277534484863, "learning_rate": 9.360872399949124e-06, "loss": 0.6591, "step": 4151 }, { "epoch": 1.878308075096132, "grad_norm": 0.4161539077758789, "learning_rate": 9.360513895504096e-06, "loss": 0.6406, "step": 4152 }, { "epoch": 1.8787604614340647, "grad_norm": 0.4022868573665619, "learning_rate": 9.360155297408715e-06, "loss": 0.6252, "step": 4153 }, { "epoch": 1.8792128477719974, "grad_norm": 0.3617763817310333, "learning_rate": 9.359796605670687e-06, "loss": 0.5332, "step": 4154 }, { "epoch": 1.8796652341099298, "grad_norm": 0.4323500990867615, "learning_rate": 9.359437820297716e-06, "loss": 0.6178, "step": 4155 }, { "epoch": 1.8801176204478625, "grad_norm": 0.43233636021614075, "learning_rate": 9.359078941297506e-06, "loss": 0.6097, "step": 4156 }, { "epoch": 1.8805700067857951, "grad_norm": 0.42848506569862366, "learning_rate": 9.358719968677762e-06, "loss": 0.639, "step": 4157 }, { "epoch": 1.8810223931237275, "grad_norm": 0.34121134877204895, "learning_rate": 9.358360902446197e-06, "loss": 0.4618, "step": 4158 }, { "epoch": 1.8814747794616604, "grad_norm": 0.37527135014533997, "learning_rate": 9.358001742610524e-06, "loss": 0.4254, "step": 4159 }, { "epoch": 1.8819271657995928, "grad_norm": 0.4534311294555664, "learning_rate": 9.357642489178454e-06, "loss": 0.5447, "step": 4160 }, { "epoch": 1.8823795521375253, "grad_norm": 0.3468799293041229, "learning_rate": 9.357283142157703e-06, "loss": 0.997, "step": 4161 }, { "epoch": 1.8828319384754582, "grad_norm": 0.1538858860731125, "learning_rate": 9.356923701555987e-06, "loss": 0.9823, "step": 4162 }, { "epoch": 1.8832843248133906, "grad_norm": 0.17008747160434723, "learning_rate": 9.35656416738103e-06, "loss": 0.7288, "step": 4163 }, { "epoch": 1.8837367111513232, "grad_norm": 0.19778954982757568, "learning_rate": 9.35620453964055e-06, "loss": 0.7304, "step": 4164 }, { "epoch": 1.884189097489256, "grad_norm": 0.22242438793182373, "learning_rate": 9.355844818342272e-06, "loss": 0.6495, "step": 4165 }, { "epoch": 1.8846414838271883, "grad_norm": 0.2240648865699768, "learning_rate": 9.355485003493922e-06, "loss": 0.5917, "step": 4166 }, { "epoch": 1.885093870165121, "grad_norm": 0.20494140684604645, "learning_rate": 9.355125095103228e-06, "loss": 0.5839, "step": 4167 }, { "epoch": 1.8855462565030536, "grad_norm": 0.22637780010700226, "learning_rate": 9.354765093177917e-06, "loss": 0.5857, "step": 4168 }, { "epoch": 1.885998642840986, "grad_norm": 0.223556786775589, "learning_rate": 9.354404997725723e-06, "loss": 0.5793, "step": 4169 }, { "epoch": 1.886451029178919, "grad_norm": 0.24500146508216858, "learning_rate": 9.35404480875438e-06, "loss": 0.6175, "step": 4170 }, { "epoch": 1.8869034155168514, "grad_norm": 0.24010124802589417, "learning_rate": 9.353684526271624e-06, "loss": 0.6586, "step": 4171 }, { "epoch": 1.887355801854784, "grad_norm": 0.24521282315254211, "learning_rate": 9.353324150285193e-06, "loss": 0.6206, "step": 4172 }, { "epoch": 1.8878081881927167, "grad_norm": 0.2569812834262848, "learning_rate": 9.352963680802825e-06, "loss": 0.5955, "step": 4173 }, { "epoch": 1.888260574530649, "grad_norm": 0.23371994495391846, "learning_rate": 9.35260311783226e-06, "loss": 0.4788, "step": 4174 }, { "epoch": 1.8887129608685818, "grad_norm": 0.2646344006061554, "learning_rate": 9.352242461381247e-06, "loss": 0.6626, "step": 4175 }, { "epoch": 1.8891653472065144, "grad_norm": 0.22184477746486664, "learning_rate": 9.351881711457529e-06, "loss": 0.4743, "step": 4176 }, { "epoch": 1.8896177335444468, "grad_norm": 0.2629389464855194, "learning_rate": 9.351520868068853e-06, "loss": 0.6192, "step": 4177 }, { "epoch": 1.8900701198823797, "grad_norm": 0.24283616244792938, "learning_rate": 9.351159931222971e-06, "loss": 0.5507, "step": 4178 }, { "epoch": 1.8905225062203121, "grad_norm": 0.2834955155849457, "learning_rate": 9.35079890092763e-06, "loss": 0.567, "step": 4179 }, { "epoch": 1.8909748925582446, "grad_norm": 0.27477285265922546, "learning_rate": 9.35043777719059e-06, "loss": 0.6267, "step": 4180 }, { "epoch": 1.8914272788961775, "grad_norm": 0.24819882214069366, "learning_rate": 9.350076560019606e-06, "loss": 0.459, "step": 4181 }, { "epoch": 1.8918796652341099, "grad_norm": 0.29004546999931335, "learning_rate": 9.349715249422432e-06, "loss": 0.5239, "step": 4182 }, { "epoch": 1.8923320515720425, "grad_norm": 0.3165995180606842, "learning_rate": 9.349353845406829e-06, "loss": 0.5663, "step": 4183 }, { "epoch": 1.8927844379099752, "grad_norm": 0.27413028478622437, "learning_rate": 9.348992347980562e-06, "loss": 0.6802, "step": 4184 }, { "epoch": 1.8932368242479076, "grad_norm": 0.28623050451278687, "learning_rate": 9.348630757151389e-06, "loss": 0.6012, "step": 4185 }, { "epoch": 1.8936892105858403, "grad_norm": 0.26351916790008545, "learning_rate": 9.34826907292708e-06, "loss": 0.5258, "step": 4186 }, { "epoch": 1.894141596923773, "grad_norm": 0.2978702783584595, "learning_rate": 9.347907295315404e-06, "loss": 0.6173, "step": 4187 }, { "epoch": 1.8945939832617054, "grad_norm": 0.2730671763420105, "learning_rate": 9.34754542432413e-06, "loss": 0.5628, "step": 4188 }, { "epoch": 1.8950463695996382, "grad_norm": 0.30547237396240234, "learning_rate": 9.347183459961026e-06, "loss": 0.5241, "step": 4189 }, { "epoch": 1.8954987559375707, "grad_norm": 0.2985450029373169, "learning_rate": 9.34682140223387e-06, "loss": 0.6475, "step": 4190 }, { "epoch": 1.8959511422755033, "grad_norm": 0.30893194675445557, "learning_rate": 9.346459251150436e-06, "loss": 0.609, "step": 4191 }, { "epoch": 1.896403528613436, "grad_norm": 0.3199600577354431, "learning_rate": 9.346097006718505e-06, "loss": 0.595, "step": 4192 }, { "epoch": 1.8968559149513684, "grad_norm": 0.34074077010154724, "learning_rate": 9.345734668945853e-06, "loss": 0.6688, "step": 4193 }, { "epoch": 1.897308301289301, "grad_norm": 0.34868624806404114, "learning_rate": 9.345372237840264e-06, "loss": 0.6527, "step": 4194 }, { "epoch": 1.8977606876272337, "grad_norm": 0.2744690775871277, "learning_rate": 9.34500971340952e-06, "loss": 0.4377, "step": 4195 }, { "epoch": 1.8982130739651661, "grad_norm": 0.283181756734848, "learning_rate": 9.344647095661408e-06, "loss": 0.4444, "step": 4196 }, { "epoch": 1.8986654603030988, "grad_norm": 0.33308401703834534, "learning_rate": 9.344284384603717e-06, "loss": 0.5386, "step": 4197 }, { "epoch": 1.8991178466410314, "grad_norm": 0.2823837995529175, "learning_rate": 9.343921580244235e-06, "loss": 0.492, "step": 4198 }, { "epoch": 1.8995702329789639, "grad_norm": 0.33244776725769043, "learning_rate": 9.343558682590757e-06, "loss": 0.556, "step": 4199 }, { "epoch": 1.9000226193168968, "grad_norm": 0.3066149950027466, "learning_rate": 9.343195691651072e-06, "loss": 0.4229, "step": 4200 }, { "epoch": 1.9000226193168968, "eval_loss": 0.5994104146957397, "eval_runtime": 25.9317, "eval_samples_per_second": 28.691, "eval_steps_per_second": 7.173, "step": 4200 }, { "epoch": 1.9004750056548292, "grad_norm": 0.37630483508110046, "learning_rate": 9.34283260743298e-06, "loss": 0.6722, "step": 4201 }, { "epoch": 1.9009273919927618, "grad_norm": 0.3603948950767517, "learning_rate": 9.342469429944279e-06, "loss": 0.5513, "step": 4202 }, { "epoch": 1.9013797783306945, "grad_norm": 0.3480580449104309, "learning_rate": 9.342106159192766e-06, "loss": 0.5448, "step": 4203 }, { "epoch": 1.901832164668627, "grad_norm": 0.29755058884620667, "learning_rate": 9.341742795186244e-06, "loss": 0.4622, "step": 4204 }, { "epoch": 1.9022845510065596, "grad_norm": 0.35551920533180237, "learning_rate": 9.34137933793252e-06, "loss": 0.5738, "step": 4205 }, { "epoch": 1.9027369373444922, "grad_norm": 0.3994791507720947, "learning_rate": 9.341015787439396e-06, "loss": 0.5745, "step": 4206 }, { "epoch": 1.9031893236824247, "grad_norm": 0.40285980701446533, "learning_rate": 9.34065214371468e-06, "loss": 0.6575, "step": 4207 }, { "epoch": 1.9036417100203575, "grad_norm": 0.45617690682411194, "learning_rate": 9.340288406766183e-06, "loss": 0.6253, "step": 4208 }, { "epoch": 1.90409409635829, "grad_norm": 0.46081310510635376, "learning_rate": 9.33992457660172e-06, "loss": 0.5802, "step": 4209 }, { "epoch": 1.9045464826962226, "grad_norm": 0.5002060532569885, "learning_rate": 9.3395606532291e-06, "loss": 0.5114, "step": 4210 }, { "epoch": 1.9049988690341553, "grad_norm": 0.4312141239643097, "learning_rate": 9.339196636656143e-06, "loss": 1.3023, "step": 4211 }, { "epoch": 1.9054512553720877, "grad_norm": 0.13478952646255493, "learning_rate": 9.338832526890662e-06, "loss": 0.5996, "step": 4212 }, { "epoch": 1.9059036417100204, "grad_norm": 0.23149175941944122, "learning_rate": 9.338468323940483e-06, "loss": 0.8146, "step": 4213 }, { "epoch": 1.906356028047953, "grad_norm": 0.22534915804862976, "learning_rate": 9.338104027813423e-06, "loss": 0.6582, "step": 4214 }, { "epoch": 1.9068084143858854, "grad_norm": 0.22766929864883423, "learning_rate": 9.337739638517309e-06, "loss": 0.7286, "step": 4215 }, { "epoch": 1.907260800723818, "grad_norm": 0.2654440402984619, "learning_rate": 9.337375156059966e-06, "loss": 0.6689, "step": 4216 }, { "epoch": 1.9077131870617507, "grad_norm": 0.2748070955276489, "learning_rate": 9.337010580449222e-06, "loss": 0.5499, "step": 4217 }, { "epoch": 1.9081655733996832, "grad_norm": 0.22983422875404358, "learning_rate": 9.336645911692906e-06, "loss": 0.648, "step": 4218 }, { "epoch": 1.908617959737616, "grad_norm": 0.2323884516954422, "learning_rate": 9.336281149798851e-06, "loss": 0.5147, "step": 4219 }, { "epoch": 1.9090703460755485, "grad_norm": 0.24392792582511902, "learning_rate": 9.33591629477489e-06, "loss": 0.5844, "step": 4220 }, { "epoch": 1.9095227324134811, "grad_norm": 0.2596108913421631, "learning_rate": 9.33555134662886e-06, "loss": 0.6559, "step": 4221 }, { "epoch": 1.9099751187514138, "grad_norm": 0.27091866731643677, "learning_rate": 9.3351863053686e-06, "loss": 0.713, "step": 4222 }, { "epoch": 1.9104275050893462, "grad_norm": 0.2654546797275543, "learning_rate": 9.334821171001947e-06, "loss": 0.6057, "step": 4223 }, { "epoch": 1.9108798914272789, "grad_norm": 0.2505417764186859, "learning_rate": 9.334455943536746e-06, "loss": 0.5061, "step": 4224 }, { "epoch": 1.9113322777652115, "grad_norm": 0.24472370743751526, "learning_rate": 9.334090622980839e-06, "loss": 0.5631, "step": 4225 }, { "epoch": 1.911784664103144, "grad_norm": 0.27359938621520996, "learning_rate": 9.33372520934207e-06, "loss": 0.6737, "step": 4226 }, { "epoch": 1.9122370504410768, "grad_norm": 0.26317936182022095, "learning_rate": 9.333359702628293e-06, "loss": 0.5567, "step": 4227 }, { "epoch": 1.9126894367790093, "grad_norm": 0.29615750908851624, "learning_rate": 9.332994102847353e-06, "loss": 0.6634, "step": 4228 }, { "epoch": 1.913141823116942, "grad_norm": 0.2777847945690155, "learning_rate": 9.332628410007102e-06, "loss": 0.5497, "step": 4229 }, { "epoch": 1.9135942094548746, "grad_norm": 0.2932737469673157, "learning_rate": 9.332262624115395e-06, "loss": 0.635, "step": 4230 }, { "epoch": 1.914046595792807, "grad_norm": 0.27019548416137695, "learning_rate": 9.331896745180092e-06, "loss": 0.5829, "step": 4231 }, { "epoch": 1.9144989821307397, "grad_norm": 0.2939278185367584, "learning_rate": 9.331530773209043e-06, "loss": 0.5927, "step": 4232 }, { "epoch": 1.9149513684686723, "grad_norm": 0.24896448850631714, "learning_rate": 9.331164708210115e-06, "loss": 0.567, "step": 4233 }, { "epoch": 1.9154037548066047, "grad_norm": 0.27642199397087097, "learning_rate": 9.330798550191164e-06, "loss": 0.5342, "step": 4234 }, { "epoch": 1.9158561411445374, "grad_norm": 0.3348884582519531, "learning_rate": 9.330432299160059e-06, "loss": 0.6261, "step": 4235 }, { "epoch": 1.91630852748247, "grad_norm": 0.26693856716156006, "learning_rate": 9.330065955124665e-06, "loss": 0.4576, "step": 4236 }, { "epoch": 1.9167609138204025, "grad_norm": 0.31011632084846497, "learning_rate": 9.329699518092847e-06, "loss": 0.6013, "step": 4237 }, { "epoch": 1.9172133001583354, "grad_norm": 0.3467622399330139, "learning_rate": 9.329332988072478e-06, "loss": 0.5685, "step": 4238 }, { "epoch": 1.9176656864962678, "grad_norm": 0.3162948787212372, "learning_rate": 9.328966365071428e-06, "loss": 0.5636, "step": 4239 }, { "epoch": 1.9181180728342004, "grad_norm": 0.3054732084274292, "learning_rate": 9.328599649097571e-06, "loss": 0.5841, "step": 4240 }, { "epoch": 1.918570459172133, "grad_norm": 0.3360429108142853, "learning_rate": 9.328232840158784e-06, "loss": 0.6009, "step": 4241 }, { "epoch": 1.9190228455100655, "grad_norm": 0.30832639336586, "learning_rate": 9.327865938262946e-06, "loss": 0.6154, "step": 4242 }, { "epoch": 1.9194752318479982, "grad_norm": 0.31772786378860474, "learning_rate": 9.327498943417934e-06, "loss": 0.5181, "step": 4243 }, { "epoch": 1.9199276181859308, "grad_norm": 0.33878418803215027, "learning_rate": 9.32713185563163e-06, "loss": 0.5985, "step": 4244 }, { "epoch": 1.9203800045238633, "grad_norm": 0.3254421353340149, "learning_rate": 9.32676467491192e-06, "loss": 0.5369, "step": 4245 }, { "epoch": 1.9208323908617961, "grad_norm": 0.32181528210639954, "learning_rate": 9.326397401266691e-06, "loss": 0.556, "step": 4246 }, { "epoch": 1.9212847771997286, "grad_norm": 0.3425169885158539, "learning_rate": 9.326030034703828e-06, "loss": 0.604, "step": 4247 }, { "epoch": 1.921737163537661, "grad_norm": 0.35114461183547974, "learning_rate": 9.325662575231222e-06, "loss": 0.5959, "step": 4248 }, { "epoch": 1.9221895498755939, "grad_norm": 0.3606554865837097, "learning_rate": 9.325295022856764e-06, "loss": 0.5521, "step": 4249 }, { "epoch": 1.9226419362135263, "grad_norm": 0.3463476300239563, "learning_rate": 9.32492737758835e-06, "loss": 0.4765, "step": 4250 }, { "epoch": 1.923094322551459, "grad_norm": 0.34173399209976196, "learning_rate": 9.324559639433874e-06, "loss": 0.5557, "step": 4251 }, { "epoch": 1.9235467088893916, "grad_norm": 0.3677184283733368, "learning_rate": 9.324191808401235e-06, "loss": 0.5843, "step": 4252 }, { "epoch": 1.923999095227324, "grad_norm": 0.32888054847717285, "learning_rate": 9.323823884498331e-06, "loss": 0.4855, "step": 4253 }, { "epoch": 1.9244514815652567, "grad_norm": 0.3477068841457367, "learning_rate": 9.323455867733067e-06, "loss": 0.5201, "step": 4254 }, { "epoch": 1.9249038679031893, "grad_norm": 0.41787251830101013, "learning_rate": 9.323087758113343e-06, "loss": 0.4941, "step": 4255 }, { "epoch": 1.9253562542411218, "grad_norm": 0.3804939389228821, "learning_rate": 9.32271955564707e-06, "loss": 0.5138, "step": 4256 }, { "epoch": 1.9258086405790547, "grad_norm": 0.38976866006851196, "learning_rate": 9.32235126034215e-06, "loss": 0.5348, "step": 4257 }, { "epoch": 1.926261026916987, "grad_norm": 0.41056129336357117, "learning_rate": 9.321982872206496e-06, "loss": 0.5287, "step": 4258 }, { "epoch": 1.9267134132549197, "grad_norm": 0.4389215409755707, "learning_rate": 9.321614391248019e-06, "loss": 0.6279, "step": 4259 }, { "epoch": 1.9271657995928524, "grad_norm": 0.5120903253555298, "learning_rate": 9.321245817474636e-06, "loss": 0.5918, "step": 4260 }, { "epoch": 1.9276181859307848, "grad_norm": 0.3893144726753235, "learning_rate": 9.320877150894259e-06, "loss": 1.2712, "step": 4261 }, { "epoch": 1.9280705722687175, "grad_norm": 0.1747835874557495, "learning_rate": 9.320508391514805e-06, "loss": 0.923, "step": 4262 }, { "epoch": 1.9285229586066501, "grad_norm": 0.16871190071105957, "learning_rate": 9.320139539344198e-06, "loss": 0.6928, "step": 4263 }, { "epoch": 1.9289753449445826, "grad_norm": 0.1895984560251236, "learning_rate": 9.319770594390356e-06, "loss": 0.7137, "step": 4264 }, { "epoch": 1.9294277312825152, "grad_norm": 0.25322532653808594, "learning_rate": 9.319401556661205e-06, "loss": 0.6156, "step": 4265 }, { "epoch": 1.9298801176204479, "grad_norm": 0.2098110318183899, "learning_rate": 9.31903242616467e-06, "loss": 0.6471, "step": 4266 }, { "epoch": 1.9303325039583803, "grad_norm": 0.2011091560125351, "learning_rate": 9.31866320290868e-06, "loss": 0.5432, "step": 4267 }, { "epoch": 1.9307848902963132, "grad_norm": 0.26704442501068115, "learning_rate": 9.318293886901161e-06, "loss": 0.7734, "step": 4268 }, { "epoch": 1.9312372766342456, "grad_norm": 0.23079760372638702, "learning_rate": 9.317924478150051e-06, "loss": 0.6222, "step": 4269 }, { "epoch": 1.9316896629721783, "grad_norm": 0.2399549037218094, "learning_rate": 9.317554976663279e-06, "loss": 0.5852, "step": 4270 }, { "epoch": 1.932142049310111, "grad_norm": 0.27231454849243164, "learning_rate": 9.317185382448781e-06, "loss": 0.7175, "step": 4271 }, { "epoch": 1.9325944356480433, "grad_norm": 0.31351035833358765, "learning_rate": 9.316815695514496e-06, "loss": 0.7196, "step": 4272 }, { "epoch": 1.933046821985976, "grad_norm": 0.23336824774742126, "learning_rate": 9.316445915868365e-06, "loss": 0.5297, "step": 4273 }, { "epoch": 1.9334992083239086, "grad_norm": 0.2933565378189087, "learning_rate": 9.316076043518325e-06, "loss": 0.6245, "step": 4274 }, { "epoch": 1.933951594661841, "grad_norm": 0.28344717621803284, "learning_rate": 9.315706078472325e-06, "loss": 0.7243, "step": 4275 }, { "epoch": 1.934403980999774, "grad_norm": 0.28698891401290894, "learning_rate": 9.315336020738308e-06, "loss": 0.6726, "step": 4276 }, { "epoch": 1.9348563673377064, "grad_norm": 0.3110639452934265, "learning_rate": 9.314965870324223e-06, "loss": 0.816, "step": 4277 }, { "epoch": 1.935308753675639, "grad_norm": 0.2990783452987671, "learning_rate": 9.31459562723802e-06, "loss": 0.6945, "step": 4278 }, { "epoch": 1.9357611400135717, "grad_norm": 0.26264452934265137, "learning_rate": 9.314225291487647e-06, "loss": 0.5625, "step": 4279 }, { "epoch": 1.9362135263515041, "grad_norm": 0.2475898116827011, "learning_rate": 9.31385486308106e-06, "loss": 0.5302, "step": 4280 }, { "epoch": 1.9366659126894368, "grad_norm": 0.26067328453063965, "learning_rate": 9.313484342026218e-06, "loss": 0.5055, "step": 4281 }, { "epoch": 1.9371182990273694, "grad_norm": 0.28944411873817444, "learning_rate": 9.313113728331074e-06, "loss": 0.6898, "step": 4282 }, { "epoch": 1.9375706853653019, "grad_norm": 0.26786381006240845, "learning_rate": 9.312743022003589e-06, "loss": 0.4449, "step": 4283 }, { "epoch": 1.9380230717032345, "grad_norm": 0.2596350610256195, "learning_rate": 9.312372223051726e-06, "loss": 0.5243, "step": 4284 }, { "epoch": 1.9384754580411672, "grad_norm": 0.28737035393714905, "learning_rate": 9.312001331483446e-06, "loss": 0.5877, "step": 4285 }, { "epoch": 1.9389278443790996, "grad_norm": 0.26559799909591675, "learning_rate": 9.311630347306717e-06, "loss": 0.5505, "step": 4286 }, { "epoch": 1.9393802307170325, "grad_norm": 0.29865187406539917, "learning_rate": 9.311259270529504e-06, "loss": 0.5462, "step": 4287 }, { "epoch": 1.939832617054965, "grad_norm": 0.30807602405548096, "learning_rate": 9.310888101159781e-06, "loss": 0.5457, "step": 4288 }, { "epoch": 1.9402850033928976, "grad_norm": 0.26655521988868713, "learning_rate": 9.310516839205516e-06, "loss": 0.4964, "step": 4289 }, { "epoch": 1.9407373897308302, "grad_norm": 0.30542975664138794, "learning_rate": 9.310145484674682e-06, "loss": 0.6426, "step": 4290 }, { "epoch": 1.9411897760687626, "grad_norm": 0.3412620425224304, "learning_rate": 9.309774037575258e-06, "loss": 0.7489, "step": 4291 }, { "epoch": 1.9416421624066953, "grad_norm": 0.3374019265174866, "learning_rate": 9.309402497915218e-06, "loss": 0.7501, "step": 4292 }, { "epoch": 1.942094548744628, "grad_norm": 0.27129846811294556, "learning_rate": 9.309030865702543e-06, "loss": 0.4795, "step": 4293 }, { "epoch": 1.9425469350825604, "grad_norm": 0.3128521740436554, "learning_rate": 9.308659140945216e-06, "loss": 0.61, "step": 4294 }, { "epoch": 1.9429993214204933, "grad_norm": 0.3087126612663269, "learning_rate": 9.308287323651218e-06, "loss": 0.5258, "step": 4295 }, { "epoch": 1.9434517077584257, "grad_norm": 0.281817227602005, "learning_rate": 9.307915413828535e-06, "loss": 0.4543, "step": 4296 }, { "epoch": 1.9439040940963583, "grad_norm": 0.32277819514274597, "learning_rate": 9.307543411485154e-06, "loss": 0.5461, "step": 4297 }, { "epoch": 1.944356480434291, "grad_norm": 0.3390980660915375, "learning_rate": 9.307171316629066e-06, "loss": 0.6136, "step": 4298 }, { "epoch": 1.9448088667722234, "grad_norm": 0.3285902142524719, "learning_rate": 9.306799129268264e-06, "loss": 0.5345, "step": 4299 }, { "epoch": 1.945261253110156, "grad_norm": 0.30606240034103394, "learning_rate": 9.306426849410737e-06, "loss": 0.5108, "step": 4300 }, { "epoch": 1.9457136394480887, "grad_norm": 0.4051731824874878, "learning_rate": 9.306054477064485e-06, "loss": 0.5674, "step": 4301 }, { "epoch": 1.9461660257860212, "grad_norm": 0.37684112787246704, "learning_rate": 9.3056820122375e-06, "loss": 0.642, "step": 4302 }, { "epoch": 1.9466184121239538, "grad_norm": 0.31499534845352173, "learning_rate": 9.305309454937788e-06, "loss": 0.4982, "step": 4303 }, { "epoch": 1.9470707984618865, "grad_norm": 0.3347954750061035, "learning_rate": 9.304936805173344e-06, "loss": 0.4727, "step": 4304 }, { "epoch": 1.947523184799819, "grad_norm": 0.3783552646636963, "learning_rate": 9.304564062952175e-06, "loss": 0.5928, "step": 4305 }, { "epoch": 1.9479755711377518, "grad_norm": 0.3712312579154968, "learning_rate": 9.304191228282288e-06, "loss": 0.5609, "step": 4306 }, { "epoch": 1.9484279574756842, "grad_norm": 0.38990217447280884, "learning_rate": 9.303818301171685e-06, "loss": 0.5786, "step": 4307 }, { "epoch": 1.9488803438136169, "grad_norm": 0.40254873037338257, "learning_rate": 9.30344528162838e-06, "loss": 0.5815, "step": 4308 }, { "epoch": 1.9493327301515495, "grad_norm": 0.40296241641044617, "learning_rate": 9.303072169660382e-06, "loss": 0.5717, "step": 4309 }, { "epoch": 1.949785116489482, "grad_norm": 0.41043993830680847, "learning_rate": 9.302698965275705e-06, "loss": 0.5467, "step": 4310 }, { "epoch": 1.9502375028274146, "grad_norm": 0.5544296503067017, "learning_rate": 9.302325668482363e-06, "loss": 1.1053, "step": 4311 }, { "epoch": 1.9506898891653472, "grad_norm": 0.13852572441101074, "learning_rate": 9.301952279288376e-06, "loss": 0.6984, "step": 4312 }, { "epoch": 1.9511422755032797, "grad_norm": 0.20972536504268646, "learning_rate": 9.301578797701761e-06, "loss": 0.6226, "step": 4313 }, { "epoch": 1.9515946618412126, "grad_norm": 0.18415747582912445, "learning_rate": 9.301205223730538e-06, "loss": 0.5725, "step": 4314 }, { "epoch": 1.952047048179145, "grad_norm": 0.24964796006679535, "learning_rate": 9.300831557382735e-06, "loss": 0.6168, "step": 4315 }, { "epoch": 1.9524994345170776, "grad_norm": 0.20702511072158813, "learning_rate": 9.300457798666374e-06, "loss": 0.5585, "step": 4316 }, { "epoch": 1.9529518208550103, "grad_norm": 0.24878598749637604, "learning_rate": 9.300083947589481e-06, "loss": 0.7188, "step": 4317 }, { "epoch": 1.9534042071929427, "grad_norm": 0.2520959973335266, "learning_rate": 9.299710004160088e-06, "loss": 0.5795, "step": 4318 }, { "epoch": 1.9538565935308754, "grad_norm": 0.2577594816684723, "learning_rate": 9.299335968386223e-06, "loss": 0.7268, "step": 4319 }, { "epoch": 1.954308979868808, "grad_norm": 0.273210346698761, "learning_rate": 9.298961840275922e-06, "loss": 0.6937, "step": 4320 }, { "epoch": 1.9547613662067405, "grad_norm": 0.27099427580833435, "learning_rate": 9.298587619837219e-06, "loss": 0.5533, "step": 4321 }, { "epoch": 1.9552137525446731, "grad_norm": 0.24124424159526825, "learning_rate": 9.298213307078152e-06, "loss": 0.6719, "step": 4322 }, { "epoch": 1.9556661388826058, "grad_norm": 0.25158125162124634, "learning_rate": 9.297838902006758e-06, "loss": 0.7386, "step": 4323 }, { "epoch": 1.9561185252205382, "grad_norm": 0.2382429540157318, "learning_rate": 9.29746440463108e-06, "loss": 0.5009, "step": 4324 }, { "epoch": 1.956570911558471, "grad_norm": 0.30589747428894043, "learning_rate": 9.29708981495916e-06, "loss": 0.7352, "step": 4325 }, { "epoch": 1.9570232978964035, "grad_norm": 0.24704915285110474, "learning_rate": 9.296715132999043e-06, "loss": 0.4237, "step": 4326 }, { "epoch": 1.9574756842343362, "grad_norm": 0.2750619053840637, "learning_rate": 9.296340358758776e-06, "loss": 0.5308, "step": 4327 }, { "epoch": 1.9579280705722688, "grad_norm": 0.2812612056732178, "learning_rate": 9.295965492246409e-06, "loss": 0.4201, "step": 4328 }, { "epoch": 1.9583804569102012, "grad_norm": 0.272367388010025, "learning_rate": 9.295590533469992e-06, "loss": 0.5623, "step": 4329 }, { "epoch": 1.958832843248134, "grad_norm": 0.28280049562454224, "learning_rate": 9.295215482437578e-06, "loss": 0.5349, "step": 4330 }, { "epoch": 1.9592852295860665, "grad_norm": 0.3066886365413666, "learning_rate": 9.294840339157222e-06, "loss": 0.6186, "step": 4331 }, { "epoch": 1.959737615923999, "grad_norm": 0.27562370896339417, "learning_rate": 9.294465103636983e-06, "loss": 0.529, "step": 4332 }, { "epoch": 1.9601900022619319, "grad_norm": 0.30538827180862427, "learning_rate": 9.294089775884917e-06, "loss": 0.5854, "step": 4333 }, { "epoch": 1.9606423885998643, "grad_norm": 0.2683422565460205, "learning_rate": 9.293714355909086e-06, "loss": 0.4685, "step": 4334 }, { "epoch": 1.9610947749377967, "grad_norm": 0.3027566373348236, "learning_rate": 9.293338843717552e-06, "loss": 0.5506, "step": 4335 }, { "epoch": 1.9615471612757296, "grad_norm": 0.3129255175590515, "learning_rate": 9.292963239318381e-06, "loss": 0.6173, "step": 4336 }, { "epoch": 1.961999547613662, "grad_norm": 0.26438355445861816, "learning_rate": 9.29258754271964e-06, "loss": 0.4219, "step": 4337 }, { "epoch": 1.9624519339515947, "grad_norm": 0.31430307030677795, "learning_rate": 9.292211753929397e-06, "loss": 0.5716, "step": 4338 }, { "epoch": 1.9629043202895273, "grad_norm": 0.30710896849632263, "learning_rate": 9.291835872955722e-06, "loss": 0.5378, "step": 4339 }, { "epoch": 1.9633567066274598, "grad_norm": 0.2986905872821808, "learning_rate": 9.29145989980669e-06, "loss": 0.5329, "step": 4340 }, { "epoch": 1.9638090929653924, "grad_norm": 0.34259918332099915, "learning_rate": 9.291083834490373e-06, "loss": 0.6539, "step": 4341 }, { "epoch": 1.964261479303325, "grad_norm": 0.3268330693244934, "learning_rate": 9.29070767701485e-06, "loss": 0.5497, "step": 4342 }, { "epoch": 1.9647138656412575, "grad_norm": 0.3172469735145569, "learning_rate": 9.2903314273882e-06, "loss": 0.5146, "step": 4343 }, { "epoch": 1.9651662519791904, "grad_norm": 0.34517598152160645, "learning_rate": 9.289955085618501e-06, "loss": 0.5541, "step": 4344 }, { "epoch": 1.9656186383171228, "grad_norm": 0.33701086044311523, "learning_rate": 9.28957865171384e-06, "loss": 0.4949, "step": 4345 }, { "epoch": 1.9660710246550555, "grad_norm": 0.29630109667778015, "learning_rate": 9.289202125682294e-06, "loss": 0.5475, "step": 4346 }, { "epoch": 1.966523410992988, "grad_norm": 0.344438374042511, "learning_rate": 9.288825507531959e-06, "loss": 0.5363, "step": 4347 }, { "epoch": 1.9669757973309205, "grad_norm": 0.31185442209243774, "learning_rate": 9.288448797270917e-06, "loss": 0.4619, "step": 4348 }, { "epoch": 1.9674281836688532, "grad_norm": 0.33485373854637146, "learning_rate": 9.288071994907262e-06, "loss": 0.5301, "step": 4349 }, { "epoch": 1.9678805700067858, "grad_norm": 0.3308698534965515, "learning_rate": 9.287695100449084e-06, "loss": 0.5744, "step": 4350 }, { "epoch": 1.9683329563447183, "grad_norm": 0.3355395793914795, "learning_rate": 9.28731811390448e-06, "loss": 0.5666, "step": 4351 }, { "epoch": 1.968785342682651, "grad_norm": 0.33999472856521606, "learning_rate": 9.286941035281544e-06, "loss": 0.5003, "step": 4352 }, { "epoch": 1.9692377290205836, "grad_norm": 0.34624820947647095, "learning_rate": 9.286563864588377e-06, "loss": 0.4676, "step": 4353 }, { "epoch": 1.969690115358516, "grad_norm": 0.3880961835384369, "learning_rate": 9.286186601833077e-06, "loss": 0.6027, "step": 4354 }, { "epoch": 1.970142501696449, "grad_norm": 0.3689786195755005, "learning_rate": 9.285809247023747e-06, "loss": 0.5062, "step": 4355 }, { "epoch": 1.9705948880343813, "grad_norm": 0.4041725993156433, "learning_rate": 9.285431800168492e-06, "loss": 0.5608, "step": 4356 }, { "epoch": 1.971047274372314, "grad_norm": 0.4137590527534485, "learning_rate": 9.28505426127542e-06, "loss": 0.599, "step": 4357 }, { "epoch": 1.9714996607102466, "grad_norm": 0.40106651186943054, "learning_rate": 9.284676630352636e-06, "loss": 0.5017, "step": 4358 }, { "epoch": 1.971952047048179, "grad_norm": 0.393573135137558, "learning_rate": 9.284298907408254e-06, "loss": 0.5147, "step": 4359 }, { "epoch": 1.9724044333861117, "grad_norm": 0.518795907497406, "learning_rate": 9.283921092450384e-06, "loss": 0.5968, "step": 4360 }, { "epoch": 1.9728568197240444, "grad_norm": 0.33673378825187683, "learning_rate": 9.28354318548714e-06, "loss": 1.039, "step": 4361 }, { "epoch": 1.9733092060619768, "grad_norm": 0.1845237761735916, "learning_rate": 9.28316518652664e-06, "loss": 0.6792, "step": 4362 }, { "epoch": 1.9737615923999097, "grad_norm": 0.2191820591688156, "learning_rate": 9.282787095577002e-06, "loss": 0.6784, "step": 4363 }, { "epoch": 1.974213978737842, "grad_norm": 0.22241325676441193, "learning_rate": 9.282408912646343e-06, "loss": 0.6002, "step": 4364 }, { "epoch": 1.9746663650757748, "grad_norm": 0.2090712934732437, "learning_rate": 9.282030637742789e-06, "loss": 0.5782, "step": 4365 }, { "epoch": 1.9751187514137074, "grad_norm": 0.22336623072624207, "learning_rate": 9.281652270874464e-06, "loss": 0.5583, "step": 4366 }, { "epoch": 1.9755711377516398, "grad_norm": 0.22159628570079803, "learning_rate": 9.281273812049492e-06, "loss": 0.5935, "step": 4367 }, { "epoch": 1.9760235240895725, "grad_norm": 0.2498958259820938, "learning_rate": 9.280895261276002e-06, "loss": 0.6275, "step": 4368 }, { "epoch": 1.9764759104275051, "grad_norm": 0.25864920020103455, "learning_rate": 9.280516618562124e-06, "loss": 0.6084, "step": 4369 }, { "epoch": 1.9769282967654376, "grad_norm": 0.2597876787185669, "learning_rate": 9.280137883915991e-06, "loss": 0.6172, "step": 4370 }, { "epoch": 1.9773806831033702, "grad_norm": 0.24352486431598663, "learning_rate": 9.279759057345737e-06, "loss": 0.5789, "step": 4371 }, { "epoch": 1.9778330694413029, "grad_norm": 0.2706596553325653, "learning_rate": 9.279380138859495e-06, "loss": 0.6325, "step": 4372 }, { "epoch": 1.9782854557792353, "grad_norm": 0.27861911058425903, "learning_rate": 9.279001128465408e-06, "loss": 0.7357, "step": 4373 }, { "epoch": 1.9787378421171682, "grad_norm": 0.2949981987476349, "learning_rate": 9.278622026171612e-06, "loss": 0.5996, "step": 4374 }, { "epoch": 1.9791902284551006, "grad_norm": 0.27745723724365234, "learning_rate": 9.278242831986251e-06, "loss": 0.6642, "step": 4375 }, { "epoch": 1.9796426147930333, "grad_norm": 0.2617446184158325, "learning_rate": 9.277863545917468e-06, "loss": 0.6814, "step": 4376 }, { "epoch": 1.980095001130966, "grad_norm": 0.25392189621925354, "learning_rate": 9.277484167973408e-06, "loss": 0.5088, "step": 4377 }, { "epoch": 1.9805473874688984, "grad_norm": 0.29808706045150757, "learning_rate": 9.277104698162222e-06, "loss": 0.5309, "step": 4378 }, { "epoch": 1.980999773806831, "grad_norm": 0.26333460211753845, "learning_rate": 9.276725136492057e-06, "loss": 0.6299, "step": 4379 }, { "epoch": 1.9814521601447637, "grad_norm": 0.2920916974544525, "learning_rate": 9.276345482971066e-06, "loss": 0.6109, "step": 4380 }, { "epoch": 1.981904546482696, "grad_norm": 0.30715465545654297, "learning_rate": 9.2759657376074e-06, "loss": 0.5434, "step": 4381 }, { "epoch": 1.982356932820629, "grad_norm": 0.29293206334114075, "learning_rate": 9.27558590040922e-06, "loss": 0.5723, "step": 4382 }, { "epoch": 1.9828093191585614, "grad_norm": 0.32454824447631836, "learning_rate": 9.27520597138468e-06, "loss": 0.5984, "step": 4383 }, { "epoch": 1.983261705496494, "grad_norm": 0.32161030173301697, "learning_rate": 9.274825950541938e-06, "loss": 0.5958, "step": 4384 }, { "epoch": 1.9837140918344267, "grad_norm": 0.29866644740104675, "learning_rate": 9.274445837889162e-06, "loss": 0.5467, "step": 4385 }, { "epoch": 1.9841664781723591, "grad_norm": 0.28492850065231323, "learning_rate": 9.274065633434512e-06, "loss": 0.5036, "step": 4386 }, { "epoch": 1.9846188645102918, "grad_norm": 0.301961213350296, "learning_rate": 9.273685337186152e-06, "loss": 0.5238, "step": 4387 }, { "epoch": 1.9850712508482244, "grad_norm": 0.3288954794406891, "learning_rate": 9.273304949152251e-06, "loss": 0.625, "step": 4388 }, { "epoch": 1.9855236371861569, "grad_norm": 0.32368209958076477, "learning_rate": 9.27292446934098e-06, "loss": 0.5799, "step": 4389 }, { "epoch": 1.9859760235240895, "grad_norm": 0.31327369809150696, "learning_rate": 9.272543897760508e-06, "loss": 0.4834, "step": 4390 }, { "epoch": 1.9864284098620222, "grad_norm": 0.2991783320903778, "learning_rate": 9.272163234419009e-06, "loss": 0.5848, "step": 4391 }, { "epoch": 1.9868807961999546, "grad_norm": 0.30467313528060913, "learning_rate": 9.27178247932466e-06, "loss": 0.4675, "step": 4392 }, { "epoch": 1.9873331825378875, "grad_norm": 0.3569325804710388, "learning_rate": 9.27140163248564e-06, "loss": 0.5656, "step": 4393 }, { "epoch": 1.98778556887582, "grad_norm": 0.30121952295303345, "learning_rate": 9.271020693910122e-06, "loss": 0.5093, "step": 4394 }, { "epoch": 1.9882379552137526, "grad_norm": 0.3011307120323181, "learning_rate": 9.270639663606293e-06, "loss": 0.6072, "step": 4395 }, { "epoch": 1.9886903415516852, "grad_norm": 0.3292078375816345, "learning_rate": 9.270258541582335e-06, "loss": 0.5036, "step": 4396 }, { "epoch": 1.9891427278896177, "grad_norm": 0.33314183354377747, "learning_rate": 9.269877327846435e-06, "loss": 0.5537, "step": 4397 }, { "epoch": 1.9895951142275503, "grad_norm": 0.30374595522880554, "learning_rate": 9.269496022406775e-06, "loss": 0.4696, "step": 4398 }, { "epoch": 1.990047500565483, "grad_norm": 0.32618170976638794, "learning_rate": 9.269114625271549e-06, "loss": 0.5188, "step": 4399 }, { "epoch": 1.9904998869034154, "grad_norm": 0.32356879115104675, "learning_rate": 9.268733136448946e-06, "loss": 0.5067, "step": 4400 }, { "epoch": 1.9904998869034154, "eval_loss": 0.6005846858024597, "eval_runtime": 26.1553, "eval_samples_per_second": 28.445, "eval_steps_per_second": 7.111, "step": 4400 }, { "epoch": 1.9909522732413483, "grad_norm": 0.3321666419506073, "learning_rate": 9.268351555947161e-06, "loss": 0.5415, "step": 4401 }, { "epoch": 1.9914046595792807, "grad_norm": 0.3554307222366333, "learning_rate": 9.26796988377439e-06, "loss": 0.5473, "step": 4402 }, { "epoch": 1.9918570459172134, "grad_norm": 0.32536786794662476, "learning_rate": 9.267588119938827e-06, "loss": 0.5654, "step": 4403 }, { "epoch": 1.992309432255146, "grad_norm": 0.38834652304649353, "learning_rate": 9.267206264448671e-06, "loss": 0.531, "step": 4404 }, { "epoch": 1.9927618185930784, "grad_norm": 0.3848591446876526, "learning_rate": 9.266824317312127e-06, "loss": 0.5182, "step": 4405 }, { "epoch": 1.993214204931011, "grad_norm": 0.3649001121520996, "learning_rate": 9.266442278537393e-06, "loss": 0.5782, "step": 4406 }, { "epoch": 1.9936665912689437, "grad_norm": 0.3552115559577942, "learning_rate": 9.26606014813268e-06, "loss": 0.4448, "step": 4407 }, { "epoch": 1.9941189776068762, "grad_norm": 0.4418588876724243, "learning_rate": 9.265677926106187e-06, "loss": 0.6037, "step": 4408 }, { "epoch": 1.9945713639448088, "grad_norm": 0.3975854218006134, "learning_rate": 9.265295612466131e-06, "loss": 0.5383, "step": 4409 }, { "epoch": 1.9950237502827415, "grad_norm": 0.5287685990333557, "learning_rate": 9.264913207220717e-06, "loss": 0.5965, "step": 4410 }, { "epoch": 1.995476136620674, "grad_norm": 0.3290838897228241, "learning_rate": 9.264530710378164e-06, "loss": 0.5951, "step": 4411 }, { "epoch": 1.9959285229586068, "grad_norm": 0.20669426023960114, "learning_rate": 9.26414812194668e-06, "loss": 0.4848, "step": 4412 }, { "epoch": 1.9963809092965392, "grad_norm": 0.2778239846229553, "learning_rate": 9.263765441934487e-06, "loss": 0.5589, "step": 4413 }, { "epoch": 1.9968332956344719, "grad_norm": 0.29514840245246887, "learning_rate": 9.2633826703498e-06, "loss": 0.5438, "step": 4414 }, { "epoch": 1.9972856819724045, "grad_norm": 0.3357645273208618, "learning_rate": 9.262999807200843e-06, "loss": 0.7021, "step": 4415 }, { "epoch": 1.997738068310337, "grad_norm": 0.286819189786911, "learning_rate": 9.262616852495838e-06, "loss": 0.5792, "step": 4416 }, { "epoch": 1.9981904546482696, "grad_norm": 0.3069615066051483, "learning_rate": 9.262233806243006e-06, "loss": 0.5646, "step": 4417 }, { "epoch": 1.9986428409862023, "grad_norm": 0.35539302229881287, "learning_rate": 9.261850668450579e-06, "loss": 0.5396, "step": 4418 }, { "epoch": 1.9990952273241347, "grad_norm": 0.32711732387542725, "learning_rate": 9.261467439126782e-06, "loss": 0.4792, "step": 4419 }, { "epoch": 1.9995476136620676, "grad_norm": 0.37597399950027466, "learning_rate": 9.261084118279846e-06, "loss": 0.5373, "step": 4420 }, { "epoch": 2.0, "grad_norm": 1.2119807004928589, "learning_rate": 9.260700705918006e-06, "loss": 0.754, "step": 4421 }, { "epoch": 2.0004523863379324, "grad_norm": 0.17688879370689392, "learning_rate": 9.260317202049496e-06, "loss": 1.1561, "step": 4422 }, { "epoch": 2.0009047726758653, "grad_norm": 0.2052316665649414, "learning_rate": 9.259933606682549e-06, "loss": 0.6073, "step": 4423 }, { "epoch": 2.0013571590137977, "grad_norm": 0.2396189570426941, "learning_rate": 9.259549919825406e-06, "loss": 0.7883, "step": 4424 }, { "epoch": 2.00180954535173, "grad_norm": 0.24237491190433502, "learning_rate": 9.259166141486308e-06, "loss": 0.73, "step": 4425 }, { "epoch": 2.002261931689663, "grad_norm": 0.2294415831565857, "learning_rate": 9.258782271673496e-06, "loss": 0.6423, "step": 4426 }, { "epoch": 2.0027143180275955, "grad_norm": 0.24140161275863647, "learning_rate": 9.258398310395214e-06, "loss": 0.6891, "step": 4427 }, { "epoch": 2.0031667043655283, "grad_norm": 0.24352246522903442, "learning_rate": 9.258014257659711e-06, "loss": 0.6045, "step": 4428 }, { "epoch": 2.003619090703461, "grad_norm": 0.2357993721961975, "learning_rate": 9.257630113475232e-06, "loss": 0.5641, "step": 4429 }, { "epoch": 2.004071477041393, "grad_norm": 0.2591051459312439, "learning_rate": 9.25724587785003e-06, "loss": 0.7123, "step": 4430 }, { "epoch": 2.004523863379326, "grad_norm": 0.2474735975265503, "learning_rate": 9.256861550792355e-06, "loss": 0.581, "step": 4431 }, { "epoch": 2.0049762497172585, "grad_norm": 0.2468966245651245, "learning_rate": 9.256477132310462e-06, "loss": 0.5882, "step": 4432 }, { "epoch": 2.005428636055191, "grad_norm": 0.23950546979904175, "learning_rate": 9.256092622412607e-06, "loss": 0.5193, "step": 4433 }, { "epoch": 2.005881022393124, "grad_norm": 0.26674237847328186, "learning_rate": 9.255708021107048e-06, "loss": 0.5957, "step": 4434 }, { "epoch": 2.0063334087310563, "grad_norm": 0.26972055435180664, "learning_rate": 9.255323328402047e-06, "loss": 0.5656, "step": 4435 }, { "epoch": 2.006785795068989, "grad_norm": 0.25277647376060486, "learning_rate": 9.254938544305863e-06, "loss": 0.62, "step": 4436 }, { "epoch": 2.0072381814069216, "grad_norm": 0.2561109960079193, "learning_rate": 9.25455366882676e-06, "loss": 0.5633, "step": 4437 }, { "epoch": 2.007690567744854, "grad_norm": 0.2651765048503876, "learning_rate": 9.254168701973006e-06, "loss": 0.5336, "step": 4438 }, { "epoch": 2.008142954082787, "grad_norm": 0.2593035101890564, "learning_rate": 9.25378364375287e-06, "loss": 0.5717, "step": 4439 }, { "epoch": 2.0085953404207193, "grad_norm": 0.29293328523635864, "learning_rate": 9.253398494174617e-06, "loss": 0.638, "step": 4440 }, { "epoch": 2.0090477267586517, "grad_norm": 0.27362099289894104, "learning_rate": 9.253013253246524e-06, "loss": 0.5845, "step": 4441 }, { "epoch": 2.0095001130965846, "grad_norm": 0.276774138212204, "learning_rate": 9.252627920976861e-06, "loss": 0.5578, "step": 4442 }, { "epoch": 2.009952499434517, "grad_norm": 0.254915714263916, "learning_rate": 9.252242497373907e-06, "loss": 0.5506, "step": 4443 }, { "epoch": 2.0104048857724495, "grad_norm": 0.29341262578964233, "learning_rate": 9.251856982445936e-06, "loss": 0.5776, "step": 4444 }, { "epoch": 2.0108572721103823, "grad_norm": 0.28211086988449097, "learning_rate": 9.25147137620123e-06, "loss": 0.5517, "step": 4445 }, { "epoch": 2.0113096584483148, "grad_norm": 0.3010827600955963, "learning_rate": 9.251085678648072e-06, "loss": 0.5628, "step": 4446 }, { "epoch": 2.0117620447862476, "grad_norm": 0.244198277592659, "learning_rate": 9.250699889794743e-06, "loss": 0.3875, "step": 4447 }, { "epoch": 2.01221443112418, "grad_norm": 0.3076806962490082, "learning_rate": 9.250314009649528e-06, "loss": 0.6094, "step": 4448 }, { "epoch": 2.0126668174621125, "grad_norm": 0.3046729266643524, "learning_rate": 9.249928038220716e-06, "loss": 0.5532, "step": 4449 }, { "epoch": 2.0131192038000454, "grad_norm": 0.30994942784309387, "learning_rate": 9.249541975516599e-06, "loss": 0.4754, "step": 4450 }, { "epoch": 2.013571590137978, "grad_norm": 0.3072285056114197, "learning_rate": 9.249155821545465e-06, "loss": 0.5239, "step": 4451 }, { "epoch": 2.0140239764759102, "grad_norm": 0.2772060036659241, "learning_rate": 9.248769576315607e-06, "loss": 0.4745, "step": 4452 }, { "epoch": 2.014476362813843, "grad_norm": 0.2852628827095032, "learning_rate": 9.248383239835322e-06, "loss": 0.4668, "step": 4453 }, { "epoch": 2.0149287491517756, "grad_norm": 0.3184850811958313, "learning_rate": 9.247996812112907e-06, "loss": 0.5272, "step": 4454 }, { "epoch": 2.0153811354897084, "grad_norm": 0.30968841910362244, "learning_rate": 9.247610293156661e-06, "loss": 0.5054, "step": 4455 }, { "epoch": 2.015833521827641, "grad_norm": 0.29613780975341797, "learning_rate": 9.247223682974886e-06, "loss": 0.4575, "step": 4456 }, { "epoch": 2.0162859081655733, "grad_norm": 0.3134768605232239, "learning_rate": 9.246836981575885e-06, "loss": 0.4935, "step": 4457 }, { "epoch": 2.016738294503506, "grad_norm": 0.3352459967136383, "learning_rate": 9.246450188967962e-06, "loss": 0.5069, "step": 4458 }, { "epoch": 2.0171906808414386, "grad_norm": 0.34909772872924805, "learning_rate": 9.246063305159425e-06, "loss": 0.5361, "step": 4459 }, { "epoch": 2.017643067179371, "grad_norm": 0.32719817757606506, "learning_rate": 9.245676330158582e-06, "loss": 0.4934, "step": 4460 }, { "epoch": 2.018095453517304, "grad_norm": 0.3271111249923706, "learning_rate": 9.245289263973746e-06, "loss": 0.5055, "step": 4461 }, { "epoch": 2.0185478398552363, "grad_norm": 0.3277076482772827, "learning_rate": 9.244902106613229e-06, "loss": 0.4743, "step": 4462 }, { "epoch": 2.0190002261931688, "grad_norm": 0.31817373633384705, "learning_rate": 9.244514858085346e-06, "loss": 0.4128, "step": 4463 }, { "epoch": 2.0194526125311016, "grad_norm": 0.3801358640193939, "learning_rate": 9.244127518398413e-06, "loss": 0.5348, "step": 4464 }, { "epoch": 2.019904998869034, "grad_norm": 0.364067405462265, "learning_rate": 9.24374008756075e-06, "loss": 0.5113, "step": 4465 }, { "epoch": 2.020357385206967, "grad_norm": 0.3996902406215668, "learning_rate": 9.243352565580678e-06, "loss": 0.5473, "step": 4466 }, { "epoch": 2.0208097715448994, "grad_norm": 0.43211790919303894, "learning_rate": 9.242964952466519e-06, "loss": 0.6151, "step": 4467 }, { "epoch": 2.021262157882832, "grad_norm": 0.41928738355636597, "learning_rate": 9.242577248226597e-06, "loss": 0.5146, "step": 4468 }, { "epoch": 2.0217145442207647, "grad_norm": 0.3987017571926117, "learning_rate": 9.242189452869241e-06, "loss": 0.479, "step": 4469 }, { "epoch": 2.022166930558697, "grad_norm": 0.469295471906662, "learning_rate": 9.241801566402778e-06, "loss": 0.5199, "step": 4470 }, { "epoch": 2.0226193168966295, "grad_norm": 0.5328223705291748, "learning_rate": 9.241413588835538e-06, "loss": 0.5712, "step": 4471 }, { "epoch": 2.0230717032345624, "grad_norm": 0.1559407263994217, "learning_rate": 9.241025520175855e-06, "loss": 1.2491, "step": 4472 }, { "epoch": 2.023524089572495, "grad_norm": 0.21182288229465485, "learning_rate": 9.240637360432064e-06, "loss": 0.7611, "step": 4473 }, { "epoch": 2.0239764759104273, "grad_norm": 0.18067719042301178, "learning_rate": 9.240249109612498e-06, "loss": 0.5192, "step": 4474 }, { "epoch": 2.02442886224836, "grad_norm": 0.23733659088611603, "learning_rate": 9.2398607677255e-06, "loss": 0.6268, "step": 4475 }, { "epoch": 2.0248812485862926, "grad_norm": 0.26419597864151, "learning_rate": 9.239472334779407e-06, "loss": 0.6981, "step": 4476 }, { "epoch": 2.0253336349242255, "grad_norm": 0.22760502994060516, "learning_rate": 9.239083810782562e-06, "loss": 0.6978, "step": 4477 }, { "epoch": 2.025786021262158, "grad_norm": 0.27577948570251465, "learning_rate": 9.238695195743312e-06, "loss": 0.7388, "step": 4478 }, { "epoch": 2.0262384076000903, "grad_norm": 0.22184699773788452, "learning_rate": 9.23830648967e-06, "loss": 0.4913, "step": 4479 }, { "epoch": 2.026690793938023, "grad_norm": 0.24987736344337463, "learning_rate": 9.237917692570975e-06, "loss": 0.6047, "step": 4480 }, { "epoch": 2.0271431802759556, "grad_norm": 0.27962392568588257, "learning_rate": 9.237528804454587e-06, "loss": 0.7214, "step": 4481 }, { "epoch": 2.027595566613888, "grad_norm": 0.289972722530365, "learning_rate": 9.23713982532919e-06, "loss": 0.7129, "step": 4482 }, { "epoch": 2.028047952951821, "grad_norm": 0.2614589333534241, "learning_rate": 9.236750755203137e-06, "loss": 0.692, "step": 4483 }, { "epoch": 2.0285003392897534, "grad_norm": 0.26754623651504517, "learning_rate": 9.236361594084783e-06, "loss": 0.5398, "step": 4484 }, { "epoch": 2.0289527256276862, "grad_norm": 0.2830216586589813, "learning_rate": 9.235972341982486e-06, "loss": 0.7043, "step": 4485 }, { "epoch": 2.0294051119656187, "grad_norm": 0.25655511021614075, "learning_rate": 9.235582998904607e-06, "loss": 0.5969, "step": 4486 }, { "epoch": 2.029857498303551, "grad_norm": 0.27310270071029663, "learning_rate": 9.235193564859506e-06, "loss": 0.5335, "step": 4487 }, { "epoch": 2.030309884641484, "grad_norm": 0.315543532371521, "learning_rate": 9.234804039855552e-06, "loss": 0.6021, "step": 4488 }, { "epoch": 2.0307622709794164, "grad_norm": 0.2839200496673584, "learning_rate": 9.234414423901104e-06, "loss": 0.6104, "step": 4489 }, { "epoch": 2.031214657317349, "grad_norm": 0.2957529127597809, "learning_rate": 9.234024717004532e-06, "loss": 0.4646, "step": 4490 }, { "epoch": 2.0316670436552817, "grad_norm": 0.3142116367816925, "learning_rate": 9.23363491917421e-06, "loss": 0.6209, "step": 4491 }, { "epoch": 2.032119429993214, "grad_norm": 0.2772650122642517, "learning_rate": 9.233245030418505e-06, "loss": 0.5396, "step": 4492 }, { "epoch": 2.0325718163311466, "grad_norm": 0.2751467525959015, "learning_rate": 9.23285505074579e-06, "loss": 0.4966, "step": 4493 }, { "epoch": 2.0330242026690795, "grad_norm": 0.28036925196647644, "learning_rate": 9.232464980164444e-06, "loss": 0.5653, "step": 4494 }, { "epoch": 2.033476589007012, "grad_norm": 0.3277169466018677, "learning_rate": 9.232074818682844e-06, "loss": 0.7009, "step": 4495 }, { "epoch": 2.0339289753449448, "grad_norm": 0.28636378049850464, "learning_rate": 9.231684566309367e-06, "loss": 0.6487, "step": 4496 }, { "epoch": 2.034381361682877, "grad_norm": 0.2751009166240692, "learning_rate": 9.231294223052396e-06, "loss": 0.4258, "step": 4497 }, { "epoch": 2.0348337480208096, "grad_norm": 0.29763275384902954, "learning_rate": 9.230903788920314e-06, "loss": 0.5261, "step": 4498 }, { "epoch": 2.0352861343587425, "grad_norm": 0.3053711950778961, "learning_rate": 9.230513263921505e-06, "loss": 0.488, "step": 4499 }, { "epoch": 2.035738520696675, "grad_norm": 0.30605265498161316, "learning_rate": 9.23012264806436e-06, "loss": 0.5623, "step": 4500 }, { "epoch": 2.0361909070346074, "grad_norm": 0.32505443692207336, "learning_rate": 9.229731941357265e-06, "loss": 0.5819, "step": 4501 }, { "epoch": 2.0366432933725402, "grad_norm": 0.3072200119495392, "learning_rate": 9.22934114380861e-06, "loss": 0.4743, "step": 4502 }, { "epoch": 2.0370956797104727, "grad_norm": 0.36995115876197815, "learning_rate": 9.228950255426794e-06, "loss": 0.5881, "step": 4503 }, { "epoch": 2.0375480660484055, "grad_norm": 0.3189568817615509, "learning_rate": 9.228559276220207e-06, "loss": 0.5173, "step": 4504 }, { "epoch": 2.038000452386338, "grad_norm": 0.2888560891151428, "learning_rate": 9.228168206197245e-06, "loss": 0.4665, "step": 4505 }, { "epoch": 2.0384528387242704, "grad_norm": 0.3218246400356293, "learning_rate": 9.227777045366313e-06, "loss": 0.4367, "step": 4506 }, { "epoch": 2.0389052250622033, "grad_norm": 0.3142426311969757, "learning_rate": 9.227385793735806e-06, "loss": 0.4854, "step": 4507 }, { "epoch": 2.0393576114001357, "grad_norm": 0.3272954225540161, "learning_rate": 9.22699445131413e-06, "loss": 0.5402, "step": 4508 }, { "epoch": 2.039809997738068, "grad_norm": 0.34355250000953674, "learning_rate": 9.226603018109687e-06, "loss": 0.4797, "step": 4509 }, { "epoch": 2.040262384076001, "grad_norm": 0.3530751168727875, "learning_rate": 9.226211494130888e-06, "loss": 0.5052, "step": 4510 }, { "epoch": 2.0407147704139335, "grad_norm": 0.33692803978919983, "learning_rate": 9.225819879386137e-06, "loss": 0.5383, "step": 4511 }, { "epoch": 2.041167156751866, "grad_norm": 0.3810349702835083, "learning_rate": 9.22542817388385e-06, "loss": 0.5825, "step": 4512 }, { "epoch": 2.0416195430897988, "grad_norm": 0.3834066689014435, "learning_rate": 9.225036377632435e-06, "loss": 0.5827, "step": 4513 }, { "epoch": 2.042071929427731, "grad_norm": 0.38230904936790466, "learning_rate": 9.22464449064031e-06, "loss": 0.5576, "step": 4514 }, { "epoch": 2.042524315765664, "grad_norm": 0.3370196223258972, "learning_rate": 9.224252512915888e-06, "loss": 0.4822, "step": 4515 }, { "epoch": 2.0429767021035965, "grad_norm": 0.33795303106307983, "learning_rate": 9.22386044446759e-06, "loss": 0.4074, "step": 4516 }, { "epoch": 2.043429088441529, "grad_norm": 0.4013572633266449, "learning_rate": 9.223468285303834e-06, "loss": 0.5171, "step": 4517 }, { "epoch": 2.043881474779462, "grad_norm": 0.38276904821395874, "learning_rate": 9.223076035433046e-06, "loss": 0.4726, "step": 4518 }, { "epoch": 2.0443338611173942, "grad_norm": 0.3838946223258972, "learning_rate": 9.222683694863649e-06, "loss": 0.5115, "step": 4519 }, { "epoch": 2.0447862474553267, "grad_norm": 0.4384206235408783, "learning_rate": 9.222291263604067e-06, "loss": 0.4983, "step": 4520 }, { "epoch": 2.0452386337932595, "grad_norm": 0.4437529742717743, "learning_rate": 9.22189874166273e-06, "loss": 0.5653, "step": 4521 }, { "epoch": 2.045691020131192, "grad_norm": 0.13579267263412476, "learning_rate": 9.221506129048068e-06, "loss": 1.2342, "step": 4522 }, { "epoch": 2.046143406469125, "grad_norm": 0.1793294996023178, "learning_rate": 9.221113425768513e-06, "loss": 0.7196, "step": 4523 }, { "epoch": 2.0465957928070573, "grad_norm": 0.1959105134010315, "learning_rate": 9.2207206318325e-06, "loss": 0.5674, "step": 4524 }, { "epoch": 2.0470481791449897, "grad_norm": 0.2214972823858261, "learning_rate": 9.220327747248462e-06, "loss": 0.6481, "step": 4525 }, { "epoch": 2.0475005654829226, "grad_norm": 0.2453775703907013, "learning_rate": 9.219934772024842e-06, "loss": 0.7075, "step": 4526 }, { "epoch": 2.047952951820855, "grad_norm": 0.2360505312681198, "learning_rate": 9.219541706170076e-06, "loss": 0.5688, "step": 4527 }, { "epoch": 2.0484053381587874, "grad_norm": 0.2728217542171478, "learning_rate": 9.219148549692606e-06, "loss": 0.6924, "step": 4528 }, { "epoch": 2.0488577244967203, "grad_norm": 0.26028478145599365, "learning_rate": 9.218755302600876e-06, "loss": 0.7278, "step": 4529 }, { "epoch": 2.0493101108346528, "grad_norm": 0.2640851140022278, "learning_rate": 9.218361964903335e-06, "loss": 0.5963, "step": 4530 }, { "epoch": 2.049762497172585, "grad_norm": 0.2682202160358429, "learning_rate": 9.217968536608428e-06, "loss": 0.62, "step": 4531 }, { "epoch": 2.050214883510518, "grad_norm": 0.250598669052124, "learning_rate": 9.217575017724603e-06, "loss": 0.5482, "step": 4532 }, { "epoch": 2.0506672698484505, "grad_norm": 0.26430511474609375, "learning_rate": 9.217181408260311e-06, "loss": 0.6324, "step": 4533 }, { "epoch": 2.0511196561863834, "grad_norm": 0.2780570089817047, "learning_rate": 9.216787708224011e-06, "loss": 0.6646, "step": 4534 }, { "epoch": 2.051572042524316, "grad_norm": 0.27304893732070923, "learning_rate": 9.216393917624154e-06, "loss": 0.6269, "step": 4535 }, { "epoch": 2.0520244288622482, "grad_norm": 0.29510241746902466, "learning_rate": 9.2160000364692e-06, "loss": 0.5863, "step": 4536 }, { "epoch": 2.052476815200181, "grad_norm": 0.23805645108222961, "learning_rate": 9.215606064767605e-06, "loss": 0.4718, "step": 4537 }, { "epoch": 2.0529292015381135, "grad_norm": 0.3235904276371002, "learning_rate": 9.215212002527833e-06, "loss": 0.635, "step": 4538 }, { "epoch": 2.053381587876046, "grad_norm": 0.32848256826400757, "learning_rate": 9.214817849758347e-06, "loss": 0.7481, "step": 4539 }, { "epoch": 2.053833974213979, "grad_norm": 0.2595604956150055, "learning_rate": 9.214423606467612e-06, "loss": 0.6016, "step": 4540 }, { "epoch": 2.0542863605519113, "grad_norm": 0.2680409848690033, "learning_rate": 9.214029272664093e-06, "loss": 0.4674, "step": 4541 }, { "epoch": 2.0547387468898437, "grad_norm": 0.3063201904296875, "learning_rate": 9.213634848356262e-06, "loss": 0.6384, "step": 4542 }, { "epoch": 2.0551911332277766, "grad_norm": 0.29734471440315247, "learning_rate": 9.213240333552589e-06, "loss": 0.7259, "step": 4543 }, { "epoch": 2.055643519565709, "grad_norm": 0.2941591739654541, "learning_rate": 9.212845728261548e-06, "loss": 0.4864, "step": 4544 }, { "epoch": 2.056095905903642, "grad_norm": 0.29995134472846985, "learning_rate": 9.21245103249161e-06, "loss": 0.6126, "step": 4545 }, { "epoch": 2.0565482922415743, "grad_norm": 0.2985133230686188, "learning_rate": 9.212056246251256e-06, "loss": 0.5534, "step": 4546 }, { "epoch": 2.0570006785795067, "grad_norm": 0.285209596157074, "learning_rate": 9.211661369548964e-06, "loss": 0.4926, "step": 4547 }, { "epoch": 2.0574530649174396, "grad_norm": 0.2715494930744171, "learning_rate": 9.211266402393213e-06, "loss": 0.4904, "step": 4548 }, { "epoch": 2.057905451255372, "grad_norm": 0.3007429540157318, "learning_rate": 9.210871344792487e-06, "loss": 0.5436, "step": 4549 }, { "epoch": 2.0583578375933045, "grad_norm": 0.3383123278617859, "learning_rate": 9.21047619675527e-06, "loss": 0.6204, "step": 4550 }, { "epoch": 2.0588102239312374, "grad_norm": 0.2909863293170929, "learning_rate": 9.21008095829005e-06, "loss": 0.4887, "step": 4551 }, { "epoch": 2.05926261026917, "grad_norm": 0.30161622166633606, "learning_rate": 9.209685629405312e-06, "loss": 0.5376, "step": 4552 }, { "epoch": 2.0597149966071027, "grad_norm": 0.3086647689342499, "learning_rate": 9.209290210109551e-06, "loss": 0.5712, "step": 4553 }, { "epoch": 2.060167382945035, "grad_norm": 0.31466588377952576, "learning_rate": 9.208894700411257e-06, "loss": 0.6572, "step": 4554 }, { "epoch": 2.0606197692829675, "grad_norm": 0.294220507144928, "learning_rate": 9.208499100318925e-06, "loss": 0.5461, "step": 4555 }, { "epoch": 2.0610721556209004, "grad_norm": 0.35540348291397095, "learning_rate": 9.208103409841051e-06, "loss": 0.6918, "step": 4556 }, { "epoch": 2.061524541958833, "grad_norm": 0.32379522919654846, "learning_rate": 9.207707628986132e-06, "loss": 0.4838, "step": 4557 }, { "epoch": 2.0619769282967653, "grad_norm": 0.36063140630722046, "learning_rate": 9.207311757762669e-06, "loss": 0.5584, "step": 4558 }, { "epoch": 2.062429314634698, "grad_norm": 0.3180944621562958, "learning_rate": 9.206915796179165e-06, "loss": 0.5151, "step": 4559 }, { "epoch": 2.0628817009726306, "grad_norm": 0.3188018500804901, "learning_rate": 9.206519744244123e-06, "loss": 0.4769, "step": 4560 }, { "epoch": 2.0633340873105634, "grad_norm": 0.33385515213012695, "learning_rate": 9.206123601966049e-06, "loss": 0.516, "step": 4561 }, { "epoch": 2.063786473648496, "grad_norm": 0.34221306443214417, "learning_rate": 9.205727369353453e-06, "loss": 0.5501, "step": 4562 }, { "epoch": 2.0642388599864283, "grad_norm": 0.3726198077201843, "learning_rate": 9.20533104641484e-06, "loss": 0.5154, "step": 4563 }, { "epoch": 2.064691246324361, "grad_norm": 0.36021456122398376, "learning_rate": 9.204934633158728e-06, "loss": 0.4558, "step": 4564 }, { "epoch": 2.0651436326622936, "grad_norm": 0.39401283860206604, "learning_rate": 9.204538129593626e-06, "loss": 0.4775, "step": 4565 }, { "epoch": 2.065596019000226, "grad_norm": 0.38163307309150696, "learning_rate": 9.20414153572805e-06, "loss": 0.5174, "step": 4566 }, { "epoch": 2.066048405338159, "grad_norm": 0.4027860164642334, "learning_rate": 9.203744851570521e-06, "loss": 0.5279, "step": 4567 }, { "epoch": 2.0665007916760914, "grad_norm": 0.39395010471343994, "learning_rate": 9.203348077129557e-06, "loss": 0.5304, "step": 4568 }, { "epoch": 2.066953178014024, "grad_norm": 0.43758702278137207, "learning_rate": 9.202951212413675e-06, "loss": 0.5409, "step": 4569 }, { "epoch": 2.0674055643519567, "grad_norm": 0.4203152656555176, "learning_rate": 9.202554257431407e-06, "loss": 0.4436, "step": 4570 }, { "epoch": 2.067857950689889, "grad_norm": 0.5800669193267822, "learning_rate": 9.202157212191271e-06, "loss": 0.6362, "step": 4571 }, { "epoch": 2.068310337027822, "grad_norm": 0.18630428612232208, "learning_rate": 9.201760076701797e-06, "loss": 1.1134, "step": 4572 }, { "epoch": 2.0687627233657544, "grad_norm": 0.19812409579753876, "learning_rate": 9.201362850971514e-06, "loss": 0.638, "step": 4573 }, { "epoch": 2.069215109703687, "grad_norm": 0.2261907160282135, "learning_rate": 9.200965535008954e-06, "loss": 0.6817, "step": 4574 }, { "epoch": 2.0696674960416197, "grad_norm": 0.24894705414772034, "learning_rate": 9.200568128822651e-06, "loss": 0.6302, "step": 4575 }, { "epoch": 2.070119882379552, "grad_norm": 0.25150924921035767, "learning_rate": 9.200170632421138e-06, "loss": 0.7995, "step": 4576 }, { "epoch": 2.0705722687174846, "grad_norm": 0.2389799952507019, "learning_rate": 9.19977304581295e-06, "loss": 0.5913, "step": 4577 }, { "epoch": 2.0710246550554174, "grad_norm": 0.2851313650608063, "learning_rate": 9.199375369006631e-06, "loss": 0.7036, "step": 4578 }, { "epoch": 2.07147704139335, "grad_norm": 0.25615665316581726, "learning_rate": 9.198977602010719e-06, "loss": 0.5839, "step": 4579 }, { "epoch": 2.0719294277312823, "grad_norm": 0.27537980675697327, "learning_rate": 9.198579744833756e-06, "loss": 0.6772, "step": 4580 }, { "epoch": 2.072381814069215, "grad_norm": 0.2895612418651581, "learning_rate": 9.198181797484289e-06, "loss": 0.6046, "step": 4581 }, { "epoch": 2.0728342004071476, "grad_norm": 0.24734365940093994, "learning_rate": 9.197783759970864e-06, "loss": 0.5311, "step": 4582 }, { "epoch": 2.0732865867450805, "grad_norm": 0.2679705023765564, "learning_rate": 9.197385632302028e-06, "loss": 0.6437, "step": 4583 }, { "epoch": 2.073738973083013, "grad_norm": 0.2774890959262848, "learning_rate": 9.196987414486333e-06, "loss": 0.67, "step": 4584 }, { "epoch": 2.0741913594209453, "grad_norm": 0.3016436994075775, "learning_rate": 9.196589106532333e-06, "loss": 0.6938, "step": 4585 }, { "epoch": 2.074643745758878, "grad_norm": 0.26308825612068176, "learning_rate": 9.196190708448578e-06, "loss": 0.5205, "step": 4586 }, { "epoch": 2.0750961320968107, "grad_norm": 0.276523619890213, "learning_rate": 9.195792220243629e-06, "loss": 0.52, "step": 4587 }, { "epoch": 2.075548518434743, "grad_norm": 0.2679850459098816, "learning_rate": 9.195393641926041e-06, "loss": 0.5548, "step": 4588 }, { "epoch": 2.076000904772676, "grad_norm": 0.28660064935684204, "learning_rate": 9.194994973504376e-06, "loss": 0.5584, "step": 4589 }, { "epoch": 2.0764532911106084, "grad_norm": 0.3113636374473572, "learning_rate": 9.194596214987195e-06, "loss": 0.5779, "step": 4590 }, { "epoch": 2.0769056774485413, "grad_norm": 0.3069625496864319, "learning_rate": 9.194197366383064e-06, "loss": 0.5153, "step": 4591 }, { "epoch": 2.0773580637864737, "grad_norm": 0.2976818084716797, "learning_rate": 9.193798427700548e-06, "loss": 0.5563, "step": 4592 }, { "epoch": 2.077810450124406, "grad_norm": 0.31312087178230286, "learning_rate": 9.193399398948213e-06, "loss": 0.5462, "step": 4593 }, { "epoch": 2.078262836462339, "grad_norm": 0.3237786889076233, "learning_rate": 9.193000280134633e-06, "loss": 0.7003, "step": 4594 }, { "epoch": 2.0787152228002714, "grad_norm": 0.2998807728290558, "learning_rate": 9.192601071268377e-06, "loss": 0.534, "step": 4595 }, { "epoch": 2.079167609138204, "grad_norm": 0.32528844475746155, "learning_rate": 9.192201772358019e-06, "loss": 0.6135, "step": 4596 }, { "epoch": 2.0796199954761367, "grad_norm": 0.2675393521785736, "learning_rate": 9.191802383412135e-06, "loss": 0.4759, "step": 4597 }, { "epoch": 2.080072381814069, "grad_norm": 0.27223846316337585, "learning_rate": 9.191402904439301e-06, "loss": 0.4845, "step": 4598 }, { "epoch": 2.0805247681520016, "grad_norm": 0.3293786644935608, "learning_rate": 9.1910033354481e-06, "loss": 0.5719, "step": 4599 }, { "epoch": 2.0809771544899345, "grad_norm": 0.3148655891418457, "learning_rate": 9.190603676447111e-06, "loss": 0.5923, "step": 4600 }, { "epoch": 2.0809771544899345, "eval_loss": 0.5988585352897644, "eval_runtime": 25.7138, "eval_samples_per_second": 28.934, "eval_steps_per_second": 7.233, "step": 4600 }, { "epoch": 2.081429540827867, "grad_norm": 0.2949315011501312, "learning_rate": 9.190203927444921e-06, "loss": 0.4566, "step": 4601 }, { "epoch": 2.0818819271658, "grad_norm": 0.3312596380710602, "learning_rate": 9.18980408845011e-06, "loss": 0.5324, "step": 4602 }, { "epoch": 2.082334313503732, "grad_norm": 0.38051146268844604, "learning_rate": 9.189404159471269e-06, "loss": 0.6992, "step": 4603 }, { "epoch": 2.0827866998416646, "grad_norm": 0.30714330077171326, "learning_rate": 9.189004140516985e-06, "loss": 0.5254, "step": 4604 }, { "epoch": 2.0832390861795975, "grad_norm": 0.34493768215179443, "learning_rate": 9.18860403159585e-06, "loss": 0.5758, "step": 4605 }, { "epoch": 2.08369147251753, "grad_norm": 0.34901049733161926, "learning_rate": 9.188203832716458e-06, "loss": 0.5324, "step": 4606 }, { "epoch": 2.0841438588554624, "grad_norm": 0.3492219150066376, "learning_rate": 9.187803543887404e-06, "loss": 0.6022, "step": 4607 }, { "epoch": 2.0845962451933953, "grad_norm": 0.33575382828712463, "learning_rate": 9.187403165117283e-06, "loss": 0.5534, "step": 4608 }, { "epoch": 2.0850486315313277, "grad_norm": 0.39327552914619446, "learning_rate": 9.187002696414695e-06, "loss": 0.6251, "step": 4609 }, { "epoch": 2.0855010178692606, "grad_norm": 0.3573143482208252, "learning_rate": 9.186602137788244e-06, "loss": 0.5675, "step": 4610 }, { "epoch": 2.085953404207193, "grad_norm": 0.3719951808452606, "learning_rate": 9.186201489246526e-06, "loss": 0.5897, "step": 4611 }, { "epoch": 2.0864057905451254, "grad_norm": 0.35794034600257874, "learning_rate": 9.185800750798152e-06, "loss": 0.4889, "step": 4612 }, { "epoch": 2.0868581768830583, "grad_norm": 0.39969900250434875, "learning_rate": 9.185399922451727e-06, "loss": 0.5267, "step": 4613 }, { "epoch": 2.0873105632209907, "grad_norm": 0.3631002604961395, "learning_rate": 9.184999004215857e-06, "loss": 0.5724, "step": 4614 }, { "epoch": 2.087762949558923, "grad_norm": 0.4169052243232727, "learning_rate": 9.184597996099155e-06, "loss": 0.5887, "step": 4615 }, { "epoch": 2.088215335896856, "grad_norm": 0.3556850254535675, "learning_rate": 9.184196898110232e-06, "loss": 0.3982, "step": 4616 }, { "epoch": 2.0886677222347885, "grad_norm": 0.3708595037460327, "learning_rate": 9.183795710257704e-06, "loss": 0.4286, "step": 4617 }, { "epoch": 2.089120108572721, "grad_norm": 0.39702704548835754, "learning_rate": 9.183394432550184e-06, "loss": 0.5198, "step": 4618 }, { "epoch": 2.0895724949106538, "grad_norm": 0.4348140358924866, "learning_rate": 9.182993064996295e-06, "loss": 0.5096, "step": 4619 }, { "epoch": 2.090024881248586, "grad_norm": 0.45558449625968933, "learning_rate": 9.182591607604654e-06, "loss": 0.5777, "step": 4620 }, { "epoch": 2.090477267586519, "grad_norm": 0.50681471824646, "learning_rate": 9.182190060383882e-06, "loss": 0.5392, "step": 4621 }, { "epoch": 2.0909296539244515, "grad_norm": 0.14881514012813568, "learning_rate": 9.181788423342606e-06, "loss": 1.0793, "step": 4622 }, { "epoch": 2.091382040262384, "grad_norm": 0.23254913091659546, "learning_rate": 9.181386696489451e-06, "loss": 0.7964, "step": 4623 }, { "epoch": 2.091834426600317, "grad_norm": 0.20899827778339386, "learning_rate": 9.180984879833043e-06, "loss": 0.6907, "step": 4624 }, { "epoch": 2.0922868129382493, "grad_norm": 0.23963548243045807, "learning_rate": 9.180582973382015e-06, "loss": 0.7452, "step": 4625 }, { "epoch": 2.0927391992761817, "grad_norm": 0.2508614659309387, "learning_rate": 9.180180977144997e-06, "loss": 0.6595, "step": 4626 }, { "epoch": 2.0931915856141146, "grad_norm": 0.24035267531871796, "learning_rate": 9.179778891130619e-06, "loss": 0.6635, "step": 4627 }, { "epoch": 2.093643971952047, "grad_norm": 0.264955997467041, "learning_rate": 9.179376715347523e-06, "loss": 0.6934, "step": 4628 }, { "epoch": 2.0940963582899794, "grad_norm": 0.24844396114349365, "learning_rate": 9.178974449804343e-06, "loss": 0.5512, "step": 4629 }, { "epoch": 2.0945487446279123, "grad_norm": 0.26757556200027466, "learning_rate": 9.17857209450972e-06, "loss": 0.6436, "step": 4630 }, { "epoch": 2.0950011309658447, "grad_norm": 0.28483912348747253, "learning_rate": 9.178169649472292e-06, "loss": 0.4909, "step": 4631 }, { "epoch": 2.0954535173037776, "grad_norm": 0.25430408120155334, "learning_rate": 9.177767114700708e-06, "loss": 0.5256, "step": 4632 }, { "epoch": 2.09590590364171, "grad_norm": 0.24256475269794464, "learning_rate": 9.177364490203607e-06, "loss": 0.4904, "step": 4633 }, { "epoch": 2.0963582899796425, "grad_norm": 0.26820963621139526, "learning_rate": 9.17696177598964e-06, "loss": 0.5279, "step": 4634 }, { "epoch": 2.0968106763175753, "grad_norm": 0.28288283944129944, "learning_rate": 9.176558972067455e-06, "loss": 0.6336, "step": 4635 }, { "epoch": 2.0972630626555078, "grad_norm": 0.2995779514312744, "learning_rate": 9.176156078445703e-06, "loss": 0.6186, "step": 4636 }, { "epoch": 2.09771544899344, "grad_norm": 0.29004624485969543, "learning_rate": 9.175753095133038e-06, "loss": 0.4548, "step": 4637 }, { "epoch": 2.098167835331373, "grad_norm": 0.2925209403038025, "learning_rate": 9.175350022138113e-06, "loss": 0.5424, "step": 4638 }, { "epoch": 2.0986202216693055, "grad_norm": 0.2896849513053894, "learning_rate": 9.174946859469583e-06, "loss": 0.5289, "step": 4639 }, { "epoch": 2.0990726080072384, "grad_norm": 0.2828749716281891, "learning_rate": 9.174543607136111e-06, "loss": 0.5589, "step": 4640 }, { "epoch": 2.099524994345171, "grad_norm": 0.30332043766975403, "learning_rate": 9.174140265146356e-06, "loss": 0.6364, "step": 4641 }, { "epoch": 2.0999773806831032, "grad_norm": 0.2959150969982147, "learning_rate": 9.173736833508982e-06, "loss": 0.6029, "step": 4642 }, { "epoch": 2.100429767021036, "grad_norm": 0.31091320514678955, "learning_rate": 9.173333312232649e-06, "loss": 0.6512, "step": 4643 }, { "epoch": 2.1008821533589686, "grad_norm": 0.3408825099468231, "learning_rate": 9.172929701326027e-06, "loss": 0.6085, "step": 4644 }, { "epoch": 2.101334539696901, "grad_norm": 0.30544260144233704, "learning_rate": 9.172526000797785e-06, "loss": 0.453, "step": 4645 }, { "epoch": 2.101786926034834, "grad_norm": 0.3241575360298157, "learning_rate": 9.172122210656589e-06, "loss": 0.5107, "step": 4646 }, { "epoch": 2.1022393123727663, "grad_norm": 0.31326401233673096, "learning_rate": 9.171718330911117e-06, "loss": 0.5792, "step": 4647 }, { "epoch": 2.102691698710699, "grad_norm": 0.329722136259079, "learning_rate": 9.171314361570038e-06, "loss": 0.5636, "step": 4648 }, { "epoch": 2.1031440850486316, "grad_norm": 0.2996969223022461, "learning_rate": 9.17091030264203e-06, "loss": 0.4629, "step": 4649 }, { "epoch": 2.103596471386564, "grad_norm": 0.2926982045173645, "learning_rate": 9.170506154135772e-06, "loss": 0.4589, "step": 4650 }, { "epoch": 2.104048857724497, "grad_norm": 0.3087811768054962, "learning_rate": 9.170101916059942e-06, "loss": 0.505, "step": 4651 }, { "epoch": 2.1045012440624293, "grad_norm": 0.30828994512557983, "learning_rate": 9.169697588423222e-06, "loss": 0.5622, "step": 4652 }, { "epoch": 2.1049536304003618, "grad_norm": 0.3300500810146332, "learning_rate": 9.169293171234298e-06, "loss": 0.6251, "step": 4653 }, { "epoch": 2.1054060167382946, "grad_norm": 0.3869660198688507, "learning_rate": 9.168888664501853e-06, "loss": 0.635, "step": 4654 }, { "epoch": 2.105858403076227, "grad_norm": 0.361908495426178, "learning_rate": 9.168484068234575e-06, "loss": 0.5146, "step": 4655 }, { "epoch": 2.1063107894141595, "grad_norm": 0.34273475408554077, "learning_rate": 9.168079382441155e-06, "loss": 0.5924, "step": 4656 }, { "epoch": 2.1067631757520924, "grad_norm": 0.3381654918193817, "learning_rate": 9.167674607130283e-06, "loss": 0.5246, "step": 4657 }, { "epoch": 2.107215562090025, "grad_norm": 0.339428186416626, "learning_rate": 9.167269742310652e-06, "loss": 0.5799, "step": 4658 }, { "epoch": 2.1076679484279577, "grad_norm": 0.37968841195106506, "learning_rate": 9.16686478799096e-06, "loss": 0.6034, "step": 4659 }, { "epoch": 2.10812033476589, "grad_norm": 0.3617025911808014, "learning_rate": 9.1664597441799e-06, "loss": 0.5874, "step": 4660 }, { "epoch": 2.1085727211038225, "grad_norm": 0.4008392095565796, "learning_rate": 9.166054610886173e-06, "loss": 0.5891, "step": 4661 }, { "epoch": 2.1090251074417554, "grad_norm": 0.34652262926101685, "learning_rate": 9.16564938811848e-06, "loss": 0.473, "step": 4662 }, { "epoch": 2.109477493779688, "grad_norm": 0.33470290899276733, "learning_rate": 9.165244075885526e-06, "loss": 0.47, "step": 4663 }, { "epoch": 2.1099298801176203, "grad_norm": 0.38020631670951843, "learning_rate": 9.164838674196013e-06, "loss": 0.5514, "step": 4664 }, { "epoch": 2.110382266455553, "grad_norm": 0.3727567791938782, "learning_rate": 9.164433183058648e-06, "loss": 0.5297, "step": 4665 }, { "epoch": 2.1108346527934856, "grad_norm": 0.34655213356018066, "learning_rate": 9.164027602482141e-06, "loss": 0.4883, "step": 4666 }, { "epoch": 2.111287039131418, "grad_norm": 0.42849963903427124, "learning_rate": 9.163621932475203e-06, "loss": 0.5306, "step": 4667 }, { "epoch": 2.111739425469351, "grad_norm": 0.4233655333518982, "learning_rate": 9.163216173046545e-06, "loss": 0.5736, "step": 4668 }, { "epoch": 2.1121918118072833, "grad_norm": 0.44532260298728943, "learning_rate": 9.16281032420488e-06, "loss": 0.556, "step": 4669 }, { "epoch": 2.112644198145216, "grad_norm": 0.43484964966773987, "learning_rate": 9.16240438595893e-06, "loss": 0.5873, "step": 4670 }, { "epoch": 2.1130965844831486, "grad_norm": 0.692277193069458, "learning_rate": 9.161998358317408e-06, "loss": 0.5544, "step": 4671 }, { "epoch": 2.113548970821081, "grad_norm": 0.17907975614070892, "learning_rate": 9.161592241289035e-06, "loss": 0.9599, "step": 4672 }, { "epoch": 2.114001357159014, "grad_norm": 0.16416090726852417, "learning_rate": 9.161186034882536e-06, "loss": 0.6427, "step": 4673 }, { "epoch": 2.1144537434969464, "grad_norm": 0.20688408613204956, "learning_rate": 9.160779739106631e-06, "loss": 0.6614, "step": 4674 }, { "epoch": 2.114906129834879, "grad_norm": 0.24512115120887756, "learning_rate": 9.160373353970048e-06, "loss": 0.6408, "step": 4675 }, { "epoch": 2.1153585161728117, "grad_norm": 0.22184891998767853, "learning_rate": 9.159966879481518e-06, "loss": 0.7199, "step": 4676 }, { "epoch": 2.115810902510744, "grad_norm": 0.22135542333126068, "learning_rate": 9.159560315649765e-06, "loss": 0.4655, "step": 4677 }, { "epoch": 2.116263288848677, "grad_norm": 0.26300638914108276, "learning_rate": 9.159153662483525e-06, "loss": 0.6204, "step": 4678 }, { "epoch": 2.1167156751866094, "grad_norm": 0.2672659158706665, "learning_rate": 9.15874691999153e-06, "loss": 0.7085, "step": 4679 }, { "epoch": 2.117168061524542, "grad_norm": 0.24593952298164368, "learning_rate": 9.158340088182516e-06, "loss": 0.6269, "step": 4680 }, { "epoch": 2.1176204478624747, "grad_norm": 0.2702465355396271, "learning_rate": 9.157933167065219e-06, "loss": 0.5755, "step": 4681 }, { "epoch": 2.118072834200407, "grad_norm": 0.25614121556282043, "learning_rate": 9.15752615664838e-06, "loss": 0.5977, "step": 4682 }, { "epoch": 2.1185252205383396, "grad_norm": 0.29204410314559937, "learning_rate": 9.157119056940742e-06, "loss": 0.6984, "step": 4683 }, { "epoch": 2.1189776068762725, "grad_norm": 0.3167129158973694, "learning_rate": 9.156711867951044e-06, "loss": 0.6254, "step": 4684 }, { "epoch": 2.119429993214205, "grad_norm": 0.2954310178756714, "learning_rate": 9.156304589688034e-06, "loss": 0.6394, "step": 4685 }, { "epoch": 2.1198823795521373, "grad_norm": 0.2792518436908722, "learning_rate": 9.155897222160459e-06, "loss": 0.6131, "step": 4686 }, { "epoch": 2.12033476589007, "grad_norm": 0.26763254404067993, "learning_rate": 9.155489765377066e-06, "loss": 0.6117, "step": 4687 }, { "epoch": 2.1207871522280026, "grad_norm": 0.29482564330101013, "learning_rate": 9.155082219346609e-06, "loss": 0.6676, "step": 4688 }, { "epoch": 2.1212395385659355, "grad_norm": 0.2745495140552521, "learning_rate": 9.154674584077838e-06, "loss": 0.6421, "step": 4689 }, { "epoch": 2.121691924903868, "grad_norm": 0.29750996828079224, "learning_rate": 9.154266859579511e-06, "loss": 0.556, "step": 4690 }, { "epoch": 2.1221443112418004, "grad_norm": 0.29604190587997437, "learning_rate": 9.15385904586038e-06, "loss": 0.5438, "step": 4691 }, { "epoch": 2.1225966975797332, "grad_norm": 0.3109537363052368, "learning_rate": 9.153451142929208e-06, "loss": 0.5424, "step": 4692 }, { "epoch": 2.1230490839176657, "grad_norm": 0.28467652201652527, "learning_rate": 9.153043150794753e-06, "loss": 0.4907, "step": 4693 }, { "epoch": 2.123501470255598, "grad_norm": 0.3153214752674103, "learning_rate": 9.152635069465779e-06, "loss": 0.5094, "step": 4694 }, { "epoch": 2.123953856593531, "grad_norm": 0.31818482279777527, "learning_rate": 9.152226898951047e-06, "loss": 0.5306, "step": 4695 }, { "epoch": 2.1244062429314634, "grad_norm": 0.29707175493240356, "learning_rate": 9.151818639259328e-06, "loss": 0.583, "step": 4696 }, { "epoch": 2.1248586292693963, "grad_norm": 0.3083985447883606, "learning_rate": 9.151410290399387e-06, "loss": 0.5306, "step": 4697 }, { "epoch": 2.1253110156073287, "grad_norm": 0.291547954082489, "learning_rate": 9.151001852379996e-06, "loss": 0.4936, "step": 4698 }, { "epoch": 2.125763401945261, "grad_norm": 0.3204951286315918, "learning_rate": 9.150593325209924e-06, "loss": 0.5189, "step": 4699 }, { "epoch": 2.126215788283194, "grad_norm": 0.3107239305973053, "learning_rate": 9.150184708897947e-06, "loss": 0.5347, "step": 4700 }, { "epoch": 2.1266681746211265, "grad_norm": 0.3293301463127136, "learning_rate": 9.149776003452841e-06, "loss": 0.5554, "step": 4701 }, { "epoch": 2.127120560959059, "grad_norm": 0.31584352254867554, "learning_rate": 9.149367208883384e-06, "loss": 0.5046, "step": 4702 }, { "epoch": 2.1275729472969918, "grad_norm": 0.3351478576660156, "learning_rate": 9.148958325198354e-06, "loss": 0.558, "step": 4703 }, { "epoch": 2.128025333634924, "grad_norm": 0.37317243218421936, "learning_rate": 9.148549352406534e-06, "loss": 0.5693, "step": 4704 }, { "epoch": 2.1284777199728566, "grad_norm": 0.3345332443714142, "learning_rate": 9.148140290516708e-06, "loss": 0.6067, "step": 4705 }, { "epoch": 2.1289301063107895, "grad_norm": 0.3832906484603882, "learning_rate": 9.147731139537658e-06, "loss": 0.6428, "step": 4706 }, { "epoch": 2.129382492648722, "grad_norm": 0.3281771242618561, "learning_rate": 9.147321899478175e-06, "loss": 0.517, "step": 4707 }, { "epoch": 2.129834878986655, "grad_norm": 0.36724746227264404, "learning_rate": 9.146912570347046e-06, "loss": 0.5848, "step": 4708 }, { "epoch": 2.1302872653245872, "grad_norm": 0.35339438915252686, "learning_rate": 9.146503152153065e-06, "loss": 0.6064, "step": 4709 }, { "epoch": 2.1307396516625197, "grad_norm": 0.3669142723083496, "learning_rate": 9.146093644905022e-06, "loss": 0.6196, "step": 4710 }, { "epoch": 2.1311920380004525, "grad_norm": 0.3604067265987396, "learning_rate": 9.145684048611713e-06, "loss": 0.5283, "step": 4711 }, { "epoch": 2.131644424338385, "grad_norm": 0.3371598720550537, "learning_rate": 9.145274363281934e-06, "loss": 0.5307, "step": 4712 }, { "epoch": 2.1320968106763174, "grad_norm": 0.33309316635131836, "learning_rate": 9.144864588924484e-06, "loss": 0.4151, "step": 4713 }, { "epoch": 2.1325491970142503, "grad_norm": 0.3665007948875427, "learning_rate": 9.144454725548166e-06, "loss": 0.437, "step": 4714 }, { "epoch": 2.1330015833521827, "grad_norm": 0.4033816158771515, "learning_rate": 9.14404477316178e-06, "loss": 0.4958, "step": 4715 }, { "epoch": 2.133453969690115, "grad_norm": 0.38589540123939514, "learning_rate": 9.143634731774133e-06, "loss": 0.518, "step": 4716 }, { "epoch": 2.133906356028048, "grad_norm": 0.4290424585342407, "learning_rate": 9.14322460139403e-06, "loss": 0.6392, "step": 4717 }, { "epoch": 2.1343587423659804, "grad_norm": 0.4411315321922302, "learning_rate": 9.142814382030277e-06, "loss": 0.535, "step": 4718 }, { "epoch": 2.1348111287039133, "grad_norm": 0.4037415087223053, "learning_rate": 9.142404073691687e-06, "loss": 0.439, "step": 4719 }, { "epoch": 2.1352635150418457, "grad_norm": 0.4964406490325928, "learning_rate": 9.141993676387071e-06, "loss": 0.5473, "step": 4720 }, { "epoch": 2.135715901379778, "grad_norm": 0.5181330442428589, "learning_rate": 9.141583190125245e-06, "loss": 0.471, "step": 4721 }, { "epoch": 2.136168287717711, "grad_norm": 0.14024516940116882, "learning_rate": 9.141172614915023e-06, "loss": 1.4427, "step": 4722 }, { "epoch": 2.1366206740556435, "grad_norm": 0.2260255068540573, "learning_rate": 9.140761950765224e-06, "loss": 0.6255, "step": 4723 }, { "epoch": 2.137073060393576, "grad_norm": 0.24443495273590088, "learning_rate": 9.140351197684667e-06, "loss": 0.7292, "step": 4724 }, { "epoch": 2.137525446731509, "grad_norm": 0.2677711546421051, "learning_rate": 9.139940355682174e-06, "loss": 0.6896, "step": 4725 }, { "epoch": 2.1379778330694412, "grad_norm": 0.22615806758403778, "learning_rate": 9.139529424766568e-06, "loss": 0.512, "step": 4726 }, { "epoch": 2.138430219407374, "grad_norm": 0.24086999893188477, "learning_rate": 9.139118404946677e-06, "loss": 0.6154, "step": 4727 }, { "epoch": 2.1388826057453065, "grad_norm": 0.23338675498962402, "learning_rate": 9.138707296231323e-06, "loss": 0.5852, "step": 4728 }, { "epoch": 2.139334992083239, "grad_norm": 0.26504483819007874, "learning_rate": 9.138296098629342e-06, "loss": 0.7011, "step": 4729 }, { "epoch": 2.139787378421172, "grad_norm": 0.27934136986732483, "learning_rate": 9.13788481214956e-06, "loss": 0.6489, "step": 4730 }, { "epoch": 2.1402397647591043, "grad_norm": 0.2556348741054535, "learning_rate": 9.137473436800815e-06, "loss": 0.5746, "step": 4731 }, { "epoch": 2.1406921510970367, "grad_norm": 0.28091225028038025, "learning_rate": 9.137061972591937e-06, "loss": 0.6377, "step": 4732 }, { "epoch": 2.1411445374349696, "grad_norm": 0.2505549192428589, "learning_rate": 9.136650419531766e-06, "loss": 0.5622, "step": 4733 }, { "epoch": 2.141596923772902, "grad_norm": 0.23792442679405212, "learning_rate": 9.13623877762914e-06, "loss": 0.4343, "step": 4734 }, { "epoch": 2.142049310110835, "grad_norm": 0.3089410662651062, "learning_rate": 9.135827046892901e-06, "loss": 0.6099, "step": 4735 }, { "epoch": 2.1425016964487673, "grad_norm": 0.3172557055950165, "learning_rate": 9.135415227331891e-06, "loss": 0.5928, "step": 4736 }, { "epoch": 2.1429540827866997, "grad_norm": 0.285133957862854, "learning_rate": 9.135003318954954e-06, "loss": 0.476, "step": 4737 }, { "epoch": 2.1434064691246326, "grad_norm": 0.30250295996665955, "learning_rate": 9.134591321770936e-06, "loss": 0.6073, "step": 4738 }, { "epoch": 2.143858855462565, "grad_norm": 0.32833561301231384, "learning_rate": 9.134179235788688e-06, "loss": 0.6614, "step": 4739 }, { "epoch": 2.1443112418004975, "grad_norm": 0.2659751772880554, "learning_rate": 9.133767061017055e-06, "loss": 0.4807, "step": 4740 }, { "epoch": 2.1447636281384304, "grad_norm": 0.2637759745121002, "learning_rate": 9.133354797464896e-06, "loss": 0.4245, "step": 4741 }, { "epoch": 2.145216014476363, "grad_norm": 0.3176306486129761, "learning_rate": 9.132942445141062e-06, "loss": 0.5478, "step": 4742 }, { "epoch": 2.145668400814295, "grad_norm": 0.2827008366584778, "learning_rate": 9.132530004054408e-06, "loss": 0.4867, "step": 4743 }, { "epoch": 2.146120787152228, "grad_norm": 0.32834646105766296, "learning_rate": 9.132117474213793e-06, "loss": 0.777, "step": 4744 }, { "epoch": 2.1465731734901605, "grad_norm": 0.3402222990989685, "learning_rate": 9.131704855628079e-06, "loss": 0.542, "step": 4745 }, { "epoch": 2.1470255598280934, "grad_norm": 0.34869474172592163, "learning_rate": 9.131292148306123e-06, "loss": 0.6125, "step": 4746 }, { "epoch": 2.147477946166026, "grad_norm": 0.30599549412727356, "learning_rate": 9.130879352256791e-06, "loss": 0.5508, "step": 4747 }, { "epoch": 2.1479303325039583, "grad_norm": 0.3198413848876953, "learning_rate": 9.130466467488953e-06, "loss": 0.6427, "step": 4748 }, { "epoch": 2.148382718841891, "grad_norm": 0.3190510869026184, "learning_rate": 9.130053494011469e-06, "loss": 0.5573, "step": 4749 }, { "epoch": 2.1488351051798236, "grad_norm": 0.34793728590011597, "learning_rate": 9.129640431833212e-06, "loss": 0.5941, "step": 4750 }, { "epoch": 2.149287491517756, "grad_norm": 0.34647008776664734, "learning_rate": 9.129227280963054e-06, "loss": 0.5837, "step": 4751 }, { "epoch": 2.149739877855689, "grad_norm": 0.3452467620372772, "learning_rate": 9.128814041409865e-06, "loss": 0.5845, "step": 4752 }, { "epoch": 2.1501922641936213, "grad_norm": 0.3497125506401062, "learning_rate": 9.128400713182526e-06, "loss": 0.6132, "step": 4753 }, { "epoch": 2.1506446505315537, "grad_norm": 0.34264978766441345, "learning_rate": 9.127987296289908e-06, "loss": 0.5341, "step": 4754 }, { "epoch": 2.1510970368694866, "grad_norm": 0.3488319218158722, "learning_rate": 9.127573790740894e-06, "loss": 0.522, "step": 4755 }, { "epoch": 2.151549423207419, "grad_norm": 0.34481102228164673, "learning_rate": 9.127160196544361e-06, "loss": 0.5369, "step": 4756 }, { "epoch": 2.152001809545352, "grad_norm": 0.3413321077823639, "learning_rate": 9.126746513709195e-06, "loss": 0.5245, "step": 4757 }, { "epoch": 2.1524541958832843, "grad_norm": 0.3727875351905823, "learning_rate": 9.126332742244279e-06, "loss": 0.6305, "step": 4758 }, { "epoch": 2.152906582221217, "grad_norm": 0.3508927524089813, "learning_rate": 9.125918882158502e-06, "loss": 0.5947, "step": 4759 }, { "epoch": 2.1533589685591497, "grad_norm": 0.3440997898578644, "learning_rate": 9.125504933460749e-06, "loss": 0.5214, "step": 4760 }, { "epoch": 2.153811354897082, "grad_norm": 0.3583473265171051, "learning_rate": 9.125090896159912e-06, "loss": 0.4635, "step": 4761 }, { "epoch": 2.1542637412350145, "grad_norm": 0.34732192754745483, "learning_rate": 9.124676770264883e-06, "loss": 0.479, "step": 4762 }, { "epoch": 2.1547161275729474, "grad_norm": 0.3677120804786682, "learning_rate": 9.124262555784556e-06, "loss": 0.5284, "step": 4763 }, { "epoch": 2.15516851391088, "grad_norm": 0.4256076216697693, "learning_rate": 9.123848252727827e-06, "loss": 0.5509, "step": 4764 }, { "epoch": 2.1556209002488123, "grad_norm": 0.4048481285572052, "learning_rate": 9.123433861103594e-06, "loss": 0.486, "step": 4765 }, { "epoch": 2.156073286586745, "grad_norm": 0.4164426624774933, "learning_rate": 9.12301938092076e-06, "loss": 0.5835, "step": 4766 }, { "epoch": 2.1565256729246776, "grad_norm": 0.4366607666015625, "learning_rate": 9.12260481218822e-06, "loss": 0.5882, "step": 4767 }, { "epoch": 2.1569780592626104, "grad_norm": 0.37675997614860535, "learning_rate": 9.122190154914882e-06, "loss": 0.4824, "step": 4768 }, { "epoch": 2.157430445600543, "grad_norm": 0.5020010471343994, "learning_rate": 9.121775409109652e-06, "loss": 0.6209, "step": 4769 }, { "epoch": 2.1578828319384753, "grad_norm": 0.4790589213371277, "learning_rate": 9.121360574781438e-06, "loss": 0.5822, "step": 4770 }, { "epoch": 2.158335218276408, "grad_norm": 0.5345361232757568, "learning_rate": 9.120945651939146e-06, "loss": 0.5472, "step": 4771 }, { "epoch": 2.1587876046143406, "grad_norm": 0.1336456686258316, "learning_rate": 9.12053064059169e-06, "loss": 1.1222, "step": 4772 }, { "epoch": 2.159239990952273, "grad_norm": 0.18856501579284668, "learning_rate": 9.12011554074798e-06, "loss": 0.6819, "step": 4773 }, { "epoch": 2.159692377290206, "grad_norm": 0.20375534892082214, "learning_rate": 9.119700352416935e-06, "loss": 0.6199, "step": 4774 }, { "epoch": 2.1601447636281383, "grad_norm": 0.2116014063358307, "learning_rate": 9.11928507560747e-06, "loss": 0.5748, "step": 4775 }, { "epoch": 2.160597149966071, "grad_norm": 0.23542553186416626, "learning_rate": 9.118869710328504e-06, "loss": 0.6712, "step": 4776 }, { "epoch": 2.1610495363040036, "grad_norm": 0.2938215136528015, "learning_rate": 9.118454256588958e-06, "loss": 0.7244, "step": 4777 }, { "epoch": 2.161501922641936, "grad_norm": 0.2648598551750183, "learning_rate": 9.118038714397754e-06, "loss": 0.624, "step": 4778 }, { "epoch": 2.161954308979869, "grad_norm": 0.2513084411621094, "learning_rate": 9.117623083763818e-06, "loss": 0.5222, "step": 4779 }, { "epoch": 2.1624066953178014, "grad_norm": 0.21828670799732208, "learning_rate": 9.117207364696077e-06, "loss": 0.4398, "step": 4780 }, { "epoch": 2.162859081655734, "grad_norm": 0.2637549042701721, "learning_rate": 9.116791557203454e-06, "loss": 0.5404, "step": 4781 }, { "epoch": 2.1633114679936667, "grad_norm": 0.2869233787059784, "learning_rate": 9.116375661294888e-06, "loss": 0.6722, "step": 4782 }, { "epoch": 2.163763854331599, "grad_norm": 0.2759084105491638, "learning_rate": 9.115959676979303e-06, "loss": 0.6137, "step": 4783 }, { "epoch": 2.164216240669532, "grad_norm": 0.2971290349960327, "learning_rate": 9.115543604265636e-06, "loss": 0.7314, "step": 4784 }, { "epoch": 2.1646686270074644, "grad_norm": 0.2935580313205719, "learning_rate": 9.115127443162826e-06, "loss": 0.6869, "step": 4785 }, { "epoch": 2.165121013345397, "grad_norm": 0.3139994442462921, "learning_rate": 9.114711193679805e-06, "loss": 0.5918, "step": 4786 }, { "epoch": 2.1655733996833297, "grad_norm": 0.26471471786499023, "learning_rate": 9.114294855825519e-06, "loss": 0.5042, "step": 4787 }, { "epoch": 2.166025786021262, "grad_norm": 0.27881893515586853, "learning_rate": 9.113878429608905e-06, "loss": 0.4379, "step": 4788 }, { "epoch": 2.1664781723591946, "grad_norm": 0.25820446014404297, "learning_rate": 9.113461915038909e-06, "loss": 0.5074, "step": 4789 }, { "epoch": 2.1669305586971275, "grad_norm": 0.30098652839660645, "learning_rate": 9.113045312124474e-06, "loss": 0.5926, "step": 4790 }, { "epoch": 2.16738294503506, "grad_norm": 0.3356352150440216, "learning_rate": 9.11262862087455e-06, "loss": 0.6001, "step": 4791 }, { "epoch": 2.1678353313729923, "grad_norm": 0.3163335919380188, "learning_rate": 9.112211841298086e-06, "loss": 0.6102, "step": 4792 }, { "epoch": 2.168287717710925, "grad_norm": 0.3062126934528351, "learning_rate": 9.11179497340403e-06, "loss": 0.626, "step": 4793 }, { "epoch": 2.1687401040488576, "grad_norm": 0.27903762459754944, "learning_rate": 9.11137801720134e-06, "loss": 0.5646, "step": 4794 }, { "epoch": 2.1691924903867905, "grad_norm": 0.27837395668029785, "learning_rate": 9.110960972698967e-06, "loss": 0.4741, "step": 4795 }, { "epoch": 2.169644876724723, "grad_norm": 0.2938845753669739, "learning_rate": 9.11054383990587e-06, "loss": 0.5617, "step": 4796 }, { "epoch": 2.1700972630626554, "grad_norm": 0.3518490195274353, "learning_rate": 9.110126618831005e-06, "loss": 0.6963, "step": 4797 }, { "epoch": 2.1705496494005883, "grad_norm": 0.31045934557914734, "learning_rate": 9.109709309483335e-06, "loss": 0.5311, "step": 4798 }, { "epoch": 2.1710020357385207, "grad_norm": 0.3382814824581146, "learning_rate": 9.109291911871823e-06, "loss": 0.5691, "step": 4799 }, { "epoch": 2.171454422076453, "grad_norm": 0.3072865903377533, "learning_rate": 9.108874426005432e-06, "loss": 0.5524, "step": 4800 }, { "epoch": 2.171454422076453, "eval_loss": 0.5981824994087219, "eval_runtime": 25.7913, "eval_samples_per_second": 28.847, "eval_steps_per_second": 7.212, "step": 4800 }, { "epoch": 2.171906808414386, "grad_norm": 0.31508150696754456, "learning_rate": 9.108456851893128e-06, "loss": 0.5791, "step": 4801 }, { "epoch": 2.1723591947523184, "grad_norm": 0.2961462438106537, "learning_rate": 9.10803918954388e-06, "loss": 0.4376, "step": 4802 }, { "epoch": 2.172811581090251, "grad_norm": 0.30927619338035583, "learning_rate": 9.107621438966658e-06, "loss": 0.5012, "step": 4803 }, { "epoch": 2.1732639674281837, "grad_norm": 0.3251945376396179, "learning_rate": 9.107203600170434e-06, "loss": 0.5209, "step": 4804 }, { "epoch": 2.173716353766116, "grad_norm": 0.3613864779472351, "learning_rate": 9.106785673164183e-06, "loss": 0.49, "step": 4805 }, { "epoch": 2.174168740104049, "grad_norm": 0.3693954646587372, "learning_rate": 9.106367657956878e-06, "loss": 0.601, "step": 4806 }, { "epoch": 2.1746211264419815, "grad_norm": 0.32705986499786377, "learning_rate": 9.1059495545575e-06, "loss": 0.5363, "step": 4807 }, { "epoch": 2.175073512779914, "grad_norm": 0.3342117369174957, "learning_rate": 9.105531362975025e-06, "loss": 0.5002, "step": 4808 }, { "epoch": 2.1755258991178468, "grad_norm": 0.34524449706077576, "learning_rate": 9.105113083218438e-06, "loss": 0.4949, "step": 4809 }, { "epoch": 2.175978285455779, "grad_norm": 0.3524743616580963, "learning_rate": 9.10469471529672e-06, "loss": 0.5174, "step": 4810 }, { "epoch": 2.1764306717937116, "grad_norm": 0.3785725235939026, "learning_rate": 9.104276259218856e-06, "loss": 0.5745, "step": 4811 }, { "epoch": 2.1768830581316445, "grad_norm": 0.4107390344142914, "learning_rate": 9.103857714993835e-06, "loss": 0.568, "step": 4812 }, { "epoch": 2.177335444469577, "grad_norm": 0.37298810482025146, "learning_rate": 9.103439082630646e-06, "loss": 0.5446, "step": 4813 }, { "epoch": 2.17778783080751, "grad_norm": 0.3629017174243927, "learning_rate": 9.103020362138278e-06, "loss": 0.5498, "step": 4814 }, { "epoch": 2.1782402171454422, "grad_norm": 0.3676045536994934, "learning_rate": 9.102601553525724e-06, "loss": 0.4894, "step": 4815 }, { "epoch": 2.1786926034833747, "grad_norm": 0.3717390298843384, "learning_rate": 9.102182656801978e-06, "loss": 0.4861, "step": 4816 }, { "epoch": 2.1791449898213076, "grad_norm": 0.45678797364234924, "learning_rate": 9.101763671976044e-06, "loss": 0.5979, "step": 4817 }, { "epoch": 2.17959737615924, "grad_norm": 0.5018953084945679, "learning_rate": 9.101344599056909e-06, "loss": 0.5469, "step": 4818 }, { "epoch": 2.1800497624971724, "grad_norm": 0.40001630783081055, "learning_rate": 9.100925438053581e-06, "loss": 0.4974, "step": 4819 }, { "epoch": 2.1805021488351053, "grad_norm": 0.44556084275245667, "learning_rate": 9.100506188975061e-06, "loss": 0.5554, "step": 4820 }, { "epoch": 2.1809545351730377, "grad_norm": 0.5285558104515076, "learning_rate": 9.100086851830352e-06, "loss": 0.6447, "step": 4821 }, { "epoch": 2.1814069215109706, "grad_norm": 0.13656112551689148, "learning_rate": 9.099667426628463e-06, "loss": 0.718, "step": 4822 }, { "epoch": 2.181859307848903, "grad_norm": 0.23124220967292786, "learning_rate": 9.099247913378395e-06, "loss": 0.5896, "step": 4823 }, { "epoch": 2.1823116941868355, "grad_norm": 0.20591382682323456, "learning_rate": 9.098828312089166e-06, "loss": 0.5574, "step": 4824 }, { "epoch": 2.1827640805247683, "grad_norm": 0.25113800168037415, "learning_rate": 9.098408622769783e-06, "loss": 0.6578, "step": 4825 }, { "epoch": 2.1832164668627008, "grad_norm": 0.22453168034553528, "learning_rate": 9.09798884542926e-06, "loss": 0.5651, "step": 4826 }, { "epoch": 2.183668853200633, "grad_norm": 0.25769200921058655, "learning_rate": 9.097568980076614e-06, "loss": 0.6315, "step": 4827 }, { "epoch": 2.184121239538566, "grad_norm": 0.2546674311161041, "learning_rate": 9.097149026720863e-06, "loss": 0.5899, "step": 4828 }, { "epoch": 2.1845736258764985, "grad_norm": 0.2589612305164337, "learning_rate": 9.09672898537102e-06, "loss": 0.6654, "step": 4829 }, { "epoch": 2.185026012214431, "grad_norm": 0.276168555021286, "learning_rate": 9.096308856036116e-06, "loss": 0.6116, "step": 4830 }, { "epoch": 2.185478398552364, "grad_norm": 0.2868282198905945, "learning_rate": 9.095888638725168e-06, "loss": 0.6136, "step": 4831 }, { "epoch": 2.1859307848902962, "grad_norm": 0.2972191274166107, "learning_rate": 9.095468333447202e-06, "loss": 0.5873, "step": 4832 }, { "epoch": 2.186383171228229, "grad_norm": 0.2878153920173645, "learning_rate": 9.095047940211244e-06, "loss": 0.691, "step": 4833 }, { "epoch": 2.1868355575661615, "grad_norm": 0.29900237917900085, "learning_rate": 9.094627459026326e-06, "loss": 0.7225, "step": 4834 }, { "epoch": 2.187287943904094, "grad_norm": 0.2911369800567627, "learning_rate": 9.094206889901476e-06, "loss": 0.6046, "step": 4835 }, { "epoch": 2.187740330242027, "grad_norm": 0.2667067348957062, "learning_rate": 9.093786232845724e-06, "loss": 0.5558, "step": 4836 }, { "epoch": 2.1881927165799593, "grad_norm": 0.29362961649894714, "learning_rate": 9.09336548786811e-06, "loss": 0.5263, "step": 4837 }, { "epoch": 2.1886451029178917, "grad_norm": 0.26975858211517334, "learning_rate": 9.092944654977669e-06, "loss": 0.5369, "step": 4838 }, { "epoch": 2.1890974892558246, "grad_norm": 0.28889000415802, "learning_rate": 9.092523734183437e-06, "loss": 0.4479, "step": 4839 }, { "epoch": 2.189549875593757, "grad_norm": 0.26578837633132935, "learning_rate": 9.092102725494454e-06, "loss": 0.557, "step": 4840 }, { "epoch": 2.1900022619316895, "grad_norm": 0.28526586294174194, "learning_rate": 9.091681628919762e-06, "loss": 0.5832, "step": 4841 }, { "epoch": 2.1904546482696223, "grad_norm": 0.28907087445259094, "learning_rate": 9.091260444468409e-06, "loss": 0.6827, "step": 4842 }, { "epoch": 2.1909070346075548, "grad_norm": 0.2473454773426056, "learning_rate": 9.090839172149434e-06, "loss": 0.4463, "step": 4843 }, { "epoch": 2.1913594209454876, "grad_norm": 0.31189802289009094, "learning_rate": 9.09041781197189e-06, "loss": 0.5569, "step": 4844 }, { "epoch": 2.19181180728342, "grad_norm": 0.3048393726348877, "learning_rate": 9.089996363944824e-06, "loss": 0.5201, "step": 4845 }, { "epoch": 2.1922641936213525, "grad_norm": 0.3235587179660797, "learning_rate": 9.089574828077287e-06, "loss": 0.5666, "step": 4846 }, { "epoch": 2.1927165799592854, "grad_norm": 0.2930219769477844, "learning_rate": 9.089153204378336e-06, "loss": 0.6105, "step": 4847 }, { "epoch": 2.193168966297218, "grad_norm": 0.3088344931602478, "learning_rate": 9.088731492857021e-06, "loss": 0.5597, "step": 4848 }, { "epoch": 2.1936213526351502, "grad_norm": 0.3205549418926239, "learning_rate": 9.088309693522404e-06, "loss": 0.6234, "step": 4849 }, { "epoch": 2.194073738973083, "grad_norm": 0.28884655237197876, "learning_rate": 9.087887806383539e-06, "loss": 0.4759, "step": 4850 }, { "epoch": 2.1945261253110155, "grad_norm": 0.27841803431510925, "learning_rate": 9.08746583144949e-06, "loss": 0.4064, "step": 4851 }, { "epoch": 2.194978511648948, "grad_norm": 0.3680628538131714, "learning_rate": 9.08704376872932e-06, "loss": 0.623, "step": 4852 }, { "epoch": 2.195430897986881, "grad_norm": 0.32543209195137024, "learning_rate": 9.086621618232094e-06, "loss": 0.5537, "step": 4853 }, { "epoch": 2.1958832843248133, "grad_norm": 0.3400430977344513, "learning_rate": 9.086199379966874e-06, "loss": 0.5992, "step": 4854 }, { "epoch": 2.196335670662746, "grad_norm": 0.35195809602737427, "learning_rate": 9.085777053942733e-06, "loss": 0.5284, "step": 4855 }, { "epoch": 2.1967880570006786, "grad_norm": 0.2904324531555176, "learning_rate": 9.08535464016874e-06, "loss": 0.4641, "step": 4856 }, { "epoch": 2.197240443338611, "grad_norm": 0.3637993633747101, "learning_rate": 9.084932138653967e-06, "loss": 0.5309, "step": 4857 }, { "epoch": 2.197692829676544, "grad_norm": 0.3649826943874359, "learning_rate": 9.084509549407488e-06, "loss": 0.5535, "step": 4858 }, { "epoch": 2.1981452160144763, "grad_norm": 0.37128007411956787, "learning_rate": 9.084086872438377e-06, "loss": 0.5601, "step": 4859 }, { "epoch": 2.1985976023524088, "grad_norm": 0.3299643099308014, "learning_rate": 9.083664107755717e-06, "loss": 0.4467, "step": 4860 }, { "epoch": 2.1990499886903416, "grad_norm": 0.40984541177749634, "learning_rate": 9.08324125536858e-06, "loss": 0.6358, "step": 4861 }, { "epoch": 2.199502375028274, "grad_norm": 0.42457157373428345, "learning_rate": 9.082818315286054e-06, "loss": 0.6047, "step": 4862 }, { "epoch": 2.199954761366207, "grad_norm": 0.3329751789569855, "learning_rate": 9.082395287517222e-06, "loss": 0.458, "step": 4863 }, { "epoch": 2.2004071477041394, "grad_norm": 0.4041215777397156, "learning_rate": 9.081972172071166e-06, "loss": 0.6499, "step": 4864 }, { "epoch": 2.200859534042072, "grad_norm": 0.3471682667732239, "learning_rate": 9.081548968956974e-06, "loss": 0.4302, "step": 4865 }, { "epoch": 2.2013119203800047, "grad_norm": 0.38346725702285767, "learning_rate": 9.081125678183738e-06, "loss": 0.5781, "step": 4866 }, { "epoch": 2.201764306717937, "grad_norm": 0.4612744450569153, "learning_rate": 9.080702299760547e-06, "loss": 0.5728, "step": 4867 }, { "epoch": 2.2022166930558695, "grad_norm": 0.4344863295555115, "learning_rate": 9.080278833696491e-06, "loss": 0.5494, "step": 4868 }, { "epoch": 2.2026690793938024, "grad_norm": 0.4545765519142151, "learning_rate": 9.07985528000067e-06, "loss": 0.6292, "step": 4869 }, { "epoch": 2.203121465731735, "grad_norm": 0.4742748737335205, "learning_rate": 9.079431638682178e-06, "loss": 0.5161, "step": 4870 }, { "epoch": 2.2035738520696677, "grad_norm": 0.4121817946434021, "learning_rate": 9.079007909750112e-06, "loss": 0.413, "step": 4871 }, { "epoch": 2.2040262384076, "grad_norm": 0.17437005043029785, "learning_rate": 9.078584093213576e-06, "loss": 1.1044, "step": 4872 }, { "epoch": 2.2044786247455326, "grad_norm": 0.2079775482416153, "learning_rate": 9.07816018908167e-06, "loss": 0.5401, "step": 4873 }, { "epoch": 2.2049310110834655, "grad_norm": 0.26224130392074585, "learning_rate": 9.077736197363498e-06, "loss": 0.6981, "step": 4874 }, { "epoch": 2.205383397421398, "grad_norm": 0.2132185399532318, "learning_rate": 9.077312118068166e-06, "loss": 0.618, "step": 4875 }, { "epoch": 2.2058357837593303, "grad_norm": 0.26049405336380005, "learning_rate": 9.076887951204783e-06, "loss": 0.6011, "step": 4876 }, { "epoch": 2.206288170097263, "grad_norm": 0.23115260899066925, "learning_rate": 9.076463696782458e-06, "loss": 0.5683, "step": 4877 }, { "epoch": 2.2067405564351956, "grad_norm": 0.2878809869289398, "learning_rate": 9.076039354810303e-06, "loss": 0.7035, "step": 4878 }, { "epoch": 2.207192942773128, "grad_norm": 0.27193549275398254, "learning_rate": 9.075614925297431e-06, "loss": 0.5496, "step": 4879 }, { "epoch": 2.207645329111061, "grad_norm": 0.2724071145057678, "learning_rate": 9.075190408252959e-06, "loss": 0.5989, "step": 4880 }, { "epoch": 2.2080977154489934, "grad_norm": 0.2306087613105774, "learning_rate": 9.074765803686001e-06, "loss": 0.5194, "step": 4881 }, { "epoch": 2.2085501017869262, "grad_norm": 0.2742772102355957, "learning_rate": 9.07434111160568e-06, "loss": 0.6109, "step": 4882 }, { "epoch": 2.2090024881248587, "grad_norm": 0.27976539731025696, "learning_rate": 9.073916332021115e-06, "loss": 0.6042, "step": 4883 }, { "epoch": 2.209454874462791, "grad_norm": 0.28413715958595276, "learning_rate": 9.07349146494143e-06, "loss": 0.7039, "step": 4884 }, { "epoch": 2.209907260800724, "grad_norm": 0.3033708333969116, "learning_rate": 9.07306651037575e-06, "loss": 0.6094, "step": 4885 }, { "epoch": 2.2103596471386564, "grad_norm": 0.3021402060985565, "learning_rate": 9.0726414683332e-06, "loss": 0.6158, "step": 4886 }, { "epoch": 2.210812033476589, "grad_norm": 0.29204457998275757, "learning_rate": 9.07221633882291e-06, "loss": 0.6501, "step": 4887 }, { "epoch": 2.2112644198145217, "grad_norm": 0.306743860244751, "learning_rate": 9.071791121854008e-06, "loss": 0.5624, "step": 4888 }, { "epoch": 2.211716806152454, "grad_norm": 0.3066844642162323, "learning_rate": 9.071365817435629e-06, "loss": 0.5478, "step": 4889 }, { "epoch": 2.2121691924903866, "grad_norm": 0.30578508973121643, "learning_rate": 9.070940425576908e-06, "loss": 0.6973, "step": 4890 }, { "epoch": 2.2126215788283194, "grad_norm": 0.2754257321357727, "learning_rate": 9.07051494628698e-06, "loss": 0.5141, "step": 4891 }, { "epoch": 2.213073965166252, "grad_norm": 0.33304524421691895, "learning_rate": 9.07008937957498e-06, "loss": 0.5293, "step": 4892 }, { "epoch": 2.2135263515041848, "grad_norm": 0.31402716040611267, "learning_rate": 9.069663725450053e-06, "loss": 0.61, "step": 4893 }, { "epoch": 2.213978737842117, "grad_norm": 0.27947884798049927, "learning_rate": 9.069237983921336e-06, "loss": 0.5245, "step": 4894 }, { "epoch": 2.2144311241800496, "grad_norm": 0.30486127734184265, "learning_rate": 9.068812154997976e-06, "loss": 0.5734, "step": 4895 }, { "epoch": 2.2148835105179825, "grad_norm": 0.2694014012813568, "learning_rate": 9.068386238689117e-06, "loss": 0.4802, "step": 4896 }, { "epoch": 2.215335896855915, "grad_norm": 0.29667922854423523, "learning_rate": 9.067960235003908e-06, "loss": 0.5419, "step": 4897 }, { "epoch": 2.2157882831938474, "grad_norm": 0.327258437871933, "learning_rate": 9.067534143951495e-06, "loss": 0.5766, "step": 4898 }, { "epoch": 2.2162406695317802, "grad_norm": 0.3081181049346924, "learning_rate": 9.067107965541033e-06, "loss": 0.5419, "step": 4899 }, { "epoch": 2.2166930558697127, "grad_norm": 0.33664995431900024, "learning_rate": 9.066681699781672e-06, "loss": 0.6261, "step": 4900 }, { "epoch": 2.2171454422076455, "grad_norm": 0.31088095903396606, "learning_rate": 9.066255346682567e-06, "loss": 0.5257, "step": 4901 }, { "epoch": 2.217597828545578, "grad_norm": 0.3173364996910095, "learning_rate": 9.065828906252877e-06, "loss": 0.555, "step": 4902 }, { "epoch": 2.2180502148835104, "grad_norm": 0.33311983942985535, "learning_rate": 9.06540237850176e-06, "loss": 0.5253, "step": 4903 }, { "epoch": 2.2185026012214433, "grad_norm": 0.3214361369609833, "learning_rate": 9.064975763438373e-06, "loss": 0.5544, "step": 4904 }, { "epoch": 2.2189549875593757, "grad_norm": 0.332035630941391, "learning_rate": 9.064549061071884e-06, "loss": 0.5193, "step": 4905 }, { "epoch": 2.219407373897308, "grad_norm": 0.37916281819343567, "learning_rate": 9.064122271411452e-06, "loss": 0.6235, "step": 4906 }, { "epoch": 2.219859760235241, "grad_norm": 0.32970693707466125, "learning_rate": 9.063695394466248e-06, "loss": 0.4679, "step": 4907 }, { "epoch": 2.2203121465731734, "grad_norm": 0.32955458760261536, "learning_rate": 9.063268430245435e-06, "loss": 0.5057, "step": 4908 }, { "epoch": 2.2207645329111063, "grad_norm": 0.37566906213760376, "learning_rate": 9.062841378758187e-06, "loss": 0.6484, "step": 4909 }, { "epoch": 2.2212169192490387, "grad_norm": 0.32966357469558716, "learning_rate": 9.062414240013673e-06, "loss": 0.4693, "step": 4910 }, { "epoch": 2.221669305586971, "grad_norm": 0.3835389316082001, "learning_rate": 9.06198701402107e-06, "loss": 0.5581, "step": 4911 }, { "epoch": 2.222121691924904, "grad_norm": 0.3671244978904724, "learning_rate": 9.061559700789549e-06, "loss": 0.488, "step": 4912 }, { "epoch": 2.2225740782628365, "grad_norm": 0.36285296082496643, "learning_rate": 9.06113230032829e-06, "loss": 0.4912, "step": 4913 }, { "epoch": 2.223026464600769, "grad_norm": 0.3844170868396759, "learning_rate": 9.060704812646473e-06, "loss": 0.4724, "step": 4914 }, { "epoch": 2.223478850938702, "grad_norm": 0.39839696884155273, "learning_rate": 9.060277237753277e-06, "loss": 0.5742, "step": 4915 }, { "epoch": 2.223931237276634, "grad_norm": 0.46496787667274475, "learning_rate": 9.059849575657887e-06, "loss": 0.6262, "step": 4916 }, { "epoch": 2.2243836236145667, "grad_norm": 0.4531101584434509, "learning_rate": 9.059421826369484e-06, "loss": 0.6325, "step": 4917 }, { "epoch": 2.2248360099524995, "grad_norm": 0.5098051428794861, "learning_rate": 9.05899398989726e-06, "loss": 0.7464, "step": 4918 }, { "epoch": 2.225288396290432, "grad_norm": 0.4083210527896881, "learning_rate": 9.058566066250401e-06, "loss": 0.4691, "step": 4919 }, { "epoch": 2.225740782628365, "grad_norm": 0.4937748312950134, "learning_rate": 9.058138055438098e-06, "loss": 0.5336, "step": 4920 }, { "epoch": 2.2261931689662973, "grad_norm": 0.5293711423873901, "learning_rate": 9.057709957469541e-06, "loss": 0.5691, "step": 4921 }, { "epoch": 2.2266455553042297, "grad_norm": 0.14016510546207428, "learning_rate": 9.057281772353927e-06, "loss": 1.3072, "step": 4922 }, { "epoch": 2.2270979416421626, "grad_norm": 0.19501350820064545, "learning_rate": 9.05685350010045e-06, "loss": 0.6393, "step": 4923 }, { "epoch": 2.227550327980095, "grad_norm": 0.21258428692817688, "learning_rate": 9.056425140718312e-06, "loss": 0.5514, "step": 4924 }, { "epoch": 2.2280027143180274, "grad_norm": 0.21752649545669556, "learning_rate": 9.05599669421671e-06, "loss": 0.5825, "step": 4925 }, { "epoch": 2.2284551006559603, "grad_norm": 0.24176588654518127, "learning_rate": 9.055568160604844e-06, "loss": 0.6046, "step": 4926 }, { "epoch": 2.2289074869938927, "grad_norm": 0.24149157106876373, "learning_rate": 9.05513953989192e-06, "loss": 0.5627, "step": 4927 }, { "epoch": 2.229359873331825, "grad_norm": 0.24143913388252258, "learning_rate": 9.054710832087143e-06, "loss": 0.5922, "step": 4928 }, { "epoch": 2.229812259669758, "grad_norm": 0.2650712728500366, "learning_rate": 9.05428203719972e-06, "loss": 0.5619, "step": 4929 }, { "epoch": 2.2302646460076905, "grad_norm": 0.3160285949707031, "learning_rate": 9.05385315523886e-06, "loss": 0.7366, "step": 4930 }, { "epoch": 2.2307170323456234, "grad_norm": 0.29887357354164124, "learning_rate": 9.053424186213776e-06, "loss": 0.6409, "step": 4931 }, { "epoch": 2.231169418683556, "grad_norm": 0.2863733768463135, "learning_rate": 9.052995130133677e-06, "loss": 0.6467, "step": 4932 }, { "epoch": 2.231621805021488, "grad_norm": 0.2807140648365021, "learning_rate": 9.05256598700778e-06, "loss": 0.6536, "step": 4933 }, { "epoch": 2.232074191359421, "grad_norm": 0.27378684282302856, "learning_rate": 9.052136756845303e-06, "loss": 0.6948, "step": 4934 }, { "epoch": 2.2325265776973535, "grad_norm": 0.2982933819293976, "learning_rate": 9.051707439655464e-06, "loss": 0.6572, "step": 4935 }, { "epoch": 2.232978964035286, "grad_norm": 0.29963451623916626, "learning_rate": 9.05127803544748e-06, "loss": 0.6837, "step": 4936 }, { "epoch": 2.233431350373219, "grad_norm": 0.2760494351387024, "learning_rate": 9.050848544230579e-06, "loss": 0.4731, "step": 4937 }, { "epoch": 2.2338837367111513, "grad_norm": 0.2496650516986847, "learning_rate": 9.050418966013981e-06, "loss": 0.4377, "step": 4938 }, { "epoch": 2.2343361230490837, "grad_norm": 0.31924301385879517, "learning_rate": 9.049989300806912e-06, "loss": 0.5496, "step": 4939 }, { "epoch": 2.2347885093870166, "grad_norm": 0.3429841101169586, "learning_rate": 9.049559548618603e-06, "loss": 0.7257, "step": 4940 }, { "epoch": 2.235240895724949, "grad_norm": 0.27377966046333313, "learning_rate": 9.04912970945828e-06, "loss": 0.5052, "step": 4941 }, { "epoch": 2.235693282062882, "grad_norm": 0.3303467631340027, "learning_rate": 9.048699783335178e-06, "loss": 0.6186, "step": 4942 }, { "epoch": 2.2361456684008143, "grad_norm": 0.3461376428604126, "learning_rate": 9.048269770258526e-06, "loss": 0.7172, "step": 4943 }, { "epoch": 2.2365980547387467, "grad_norm": 0.32996252179145813, "learning_rate": 9.047839670237566e-06, "loss": 0.7623, "step": 4944 }, { "epoch": 2.2370504410766796, "grad_norm": 0.2911885678768158, "learning_rate": 9.047409483281528e-06, "loss": 0.5121, "step": 4945 }, { "epoch": 2.237502827414612, "grad_norm": 0.3166678845882416, "learning_rate": 9.046979209399657e-06, "loss": 0.54, "step": 4946 }, { "epoch": 2.2379552137525445, "grad_norm": 0.2901346981525421, "learning_rate": 9.046548848601192e-06, "loss": 0.489, "step": 4947 }, { "epoch": 2.2384076000904773, "grad_norm": 0.30270230770111084, "learning_rate": 9.046118400895372e-06, "loss": 0.4296, "step": 4948 }, { "epoch": 2.2388599864284098, "grad_norm": 0.31974536180496216, "learning_rate": 9.045687866291447e-06, "loss": 0.6086, "step": 4949 }, { "epoch": 2.2393123727663427, "grad_norm": 0.3440326154232025, "learning_rate": 9.045257244798661e-06, "loss": 0.6034, "step": 4950 }, { "epoch": 2.239764759104275, "grad_norm": 0.3044274151325226, "learning_rate": 9.044826536426263e-06, "loss": 0.4825, "step": 4951 }, { "epoch": 2.2402171454422075, "grad_norm": 0.3138715326786041, "learning_rate": 9.044395741183504e-06, "loss": 0.4958, "step": 4952 }, { "epoch": 2.2406695317801404, "grad_norm": 0.3441576361656189, "learning_rate": 9.043964859079634e-06, "loss": 0.5325, "step": 4953 }, { "epoch": 2.241121918118073, "grad_norm": 0.37108567357063293, "learning_rate": 9.04353389012391e-06, "loss": 0.6065, "step": 4954 }, { "epoch": 2.2415743044560053, "grad_norm": 0.33772870898246765, "learning_rate": 9.043102834325585e-06, "loss": 0.5071, "step": 4955 }, { "epoch": 2.242026690793938, "grad_norm": 0.35233333706855774, "learning_rate": 9.042671691693918e-06, "loss": 0.5539, "step": 4956 }, { "epoch": 2.2424790771318706, "grad_norm": 0.32301050424575806, "learning_rate": 9.04224046223817e-06, "loss": 0.4139, "step": 4957 }, { "epoch": 2.2429314634698034, "grad_norm": 0.3937550187110901, "learning_rate": 9.041809145967599e-06, "loss": 0.5881, "step": 4958 }, { "epoch": 2.243383849807736, "grad_norm": 0.33781737089157104, "learning_rate": 9.041377742891473e-06, "loss": 0.4724, "step": 4959 }, { "epoch": 2.2438362361456683, "grad_norm": 0.39625775814056396, "learning_rate": 9.040946253019052e-06, "loss": 0.5741, "step": 4960 }, { "epoch": 2.244288622483601, "grad_norm": 0.33644846081733704, "learning_rate": 9.040514676359607e-06, "loss": 0.5261, "step": 4961 }, { "epoch": 2.2447410088215336, "grad_norm": 0.37730371952056885, "learning_rate": 9.040083012922406e-06, "loss": 0.5223, "step": 4962 }, { "epoch": 2.245193395159466, "grad_norm": 0.34415072202682495, "learning_rate": 9.03965126271672e-06, "loss": 0.4767, "step": 4963 }, { "epoch": 2.245645781497399, "grad_norm": 0.3440341055393219, "learning_rate": 9.039219425751819e-06, "loss": 0.4121, "step": 4964 }, { "epoch": 2.2460981678353313, "grad_norm": 0.35665827989578247, "learning_rate": 9.03878750203698e-06, "loss": 0.5181, "step": 4965 }, { "epoch": 2.2465505541732638, "grad_norm": 0.4800388813018799, "learning_rate": 9.038355491581481e-06, "loss": 0.6608, "step": 4966 }, { "epoch": 2.2470029405111966, "grad_norm": 0.3857521414756775, "learning_rate": 9.037923394394596e-06, "loss": 0.4758, "step": 4967 }, { "epoch": 2.247455326849129, "grad_norm": 0.4343438744544983, "learning_rate": 9.037491210485609e-06, "loss": 0.5417, "step": 4968 }, { "epoch": 2.247907713187062, "grad_norm": 0.4176467955112457, "learning_rate": 9.037058939863798e-06, "loss": 0.4627, "step": 4969 }, { "epoch": 2.2483600995249944, "grad_norm": 0.4428592622280121, "learning_rate": 9.036626582538451e-06, "loss": 0.4947, "step": 4970 }, { "epoch": 2.248812485862927, "grad_norm": 0.470426082611084, "learning_rate": 9.036194138518852e-06, "loss": 0.538, "step": 4971 }, { "epoch": 2.2492648722008597, "grad_norm": 0.14477622509002686, "learning_rate": 9.035761607814288e-06, "loss": 1.2038, "step": 4972 }, { "epoch": 2.249717258538792, "grad_norm": 0.1946268081665039, "learning_rate": 9.03532899043405e-06, "loss": 0.7664, "step": 4973 }, { "epoch": 2.2501696448767246, "grad_norm": 0.2438236027956009, "learning_rate": 9.034896286387426e-06, "loss": 0.6659, "step": 4974 }, { "epoch": 2.2506220312146574, "grad_norm": 0.2726745009422302, "learning_rate": 9.034463495683711e-06, "loss": 0.6563, "step": 4975 }, { "epoch": 2.25107441755259, "grad_norm": 0.2689591944217682, "learning_rate": 9.034030618332202e-06, "loss": 0.751, "step": 4976 }, { "epoch": 2.2515268038905223, "grad_norm": 0.29891493916511536, "learning_rate": 9.033597654342194e-06, "loss": 0.6208, "step": 4977 }, { "epoch": 2.251979190228455, "grad_norm": 0.2541316747665405, "learning_rate": 9.033164603722986e-06, "loss": 0.6084, "step": 4978 }, { "epoch": 2.2524315765663876, "grad_norm": 0.30297183990478516, "learning_rate": 9.032731466483877e-06, "loss": 0.7161, "step": 4979 }, { "epoch": 2.2528839629043205, "grad_norm": 0.28768113255500793, "learning_rate": 9.032298242634173e-06, "loss": 0.6425, "step": 4980 }, { "epoch": 2.253336349242253, "grad_norm": 0.3100544810295105, "learning_rate": 9.031864932183174e-06, "loss": 0.7696, "step": 4981 }, { "epoch": 2.2537887355801853, "grad_norm": 0.26647570729255676, "learning_rate": 9.03143153514019e-06, "loss": 0.6115, "step": 4982 }, { "epoch": 2.254241121918118, "grad_norm": 0.2924705743789673, "learning_rate": 9.030998051514526e-06, "loss": 0.5491, "step": 4983 }, { "epoch": 2.2546935082560506, "grad_norm": 0.3129533529281616, "learning_rate": 9.030564481315494e-06, "loss": 0.5835, "step": 4984 }, { "epoch": 2.255145894593983, "grad_norm": 0.2719511389732361, "learning_rate": 9.030130824552405e-06, "loss": 0.4919, "step": 4985 }, { "epoch": 2.255598280931916, "grad_norm": 0.3098921775817871, "learning_rate": 9.029697081234574e-06, "loss": 0.6579, "step": 4986 }, { "epoch": 2.2560506672698484, "grad_norm": 0.31478351354599, "learning_rate": 9.029263251371314e-06, "loss": 0.6315, "step": 4987 }, { "epoch": 2.256503053607781, "grad_norm": 0.3183283507823944, "learning_rate": 9.028829334971942e-06, "loss": 0.6946, "step": 4988 }, { "epoch": 2.2569554399457137, "grad_norm": 0.2809697091579437, "learning_rate": 9.02839533204578e-06, "loss": 0.5502, "step": 4989 }, { "epoch": 2.257407826283646, "grad_norm": 0.31826138496398926, "learning_rate": 9.027961242602148e-06, "loss": 0.564, "step": 4990 }, { "epoch": 2.257860212621579, "grad_norm": 0.31438180804252625, "learning_rate": 9.027527066650368e-06, "loss": 0.6252, "step": 4991 }, { "epoch": 2.2583125989595114, "grad_norm": 0.35785582661628723, "learning_rate": 9.027092804199767e-06, "loss": 0.5382, "step": 4992 }, { "epoch": 2.258764985297444, "grad_norm": 0.3106387257575989, "learning_rate": 9.026658455259666e-06, "loss": 0.5426, "step": 4993 }, { "epoch": 2.2592173716353767, "grad_norm": 0.32802096009254456, "learning_rate": 9.0262240198394e-06, "loss": 0.6511, "step": 4994 }, { "epoch": 2.259669757973309, "grad_norm": 0.28365281224250793, "learning_rate": 9.025789497948297e-06, "loss": 0.494, "step": 4995 }, { "epoch": 2.260122144311242, "grad_norm": 0.3300001323223114, "learning_rate": 9.02535488959569e-06, "loss": 0.5528, "step": 4996 }, { "epoch": 2.2605745306491745, "grad_norm": 0.3573274612426758, "learning_rate": 9.024920194790909e-06, "loss": 0.6028, "step": 4997 }, { "epoch": 2.261026916987107, "grad_norm": 0.31037667393684387, "learning_rate": 9.024485413543295e-06, "loss": 0.5346, "step": 4998 }, { "epoch": 2.2614793033250398, "grad_norm": 0.3569951355457306, "learning_rate": 9.024050545862182e-06, "loss": 0.5514, "step": 4999 }, { "epoch": 2.261931689662972, "grad_norm": 0.32889077067375183, "learning_rate": 9.023615591756913e-06, "loss": 0.5306, "step": 5000 }, { "epoch": 2.261931689662972, "eval_loss": 0.5977937579154968, "eval_runtime": 25.9263, "eval_samples_per_second": 28.697, "eval_steps_per_second": 7.174, "step": 5000 }, { "epoch": 2.2623840760009046, "grad_norm": 0.34328487515449524, "learning_rate": 9.023180551236825e-06, "loss": 0.5668, "step": 5001 }, { "epoch": 2.2628364623388375, "grad_norm": 0.3343515992164612, "learning_rate": 9.022745424311267e-06, "loss": 0.5674, "step": 5002 }, { "epoch": 2.26328884867677, "grad_norm": 0.37320005893707275, "learning_rate": 9.02231021098958e-06, "loss": 0.6291, "step": 5003 }, { "epoch": 2.2637412350147024, "grad_norm": 0.3754653334617615, "learning_rate": 9.021874911281113e-06, "loss": 0.6445, "step": 5004 }, { "epoch": 2.2641936213526352, "grad_norm": 0.3666382133960724, "learning_rate": 9.021439525195213e-06, "loss": 0.5848, "step": 5005 }, { "epoch": 2.2646460076905677, "grad_norm": 0.35166966915130615, "learning_rate": 9.021004052741231e-06, "loss": 0.6292, "step": 5006 }, { "epoch": 2.2650983940285006, "grad_norm": 0.3760874569416046, "learning_rate": 9.020568493928522e-06, "loss": 0.5213, "step": 5007 }, { "epoch": 2.265550780366433, "grad_norm": 0.34898996353149414, "learning_rate": 9.020132848766439e-06, "loss": 0.5707, "step": 5008 }, { "epoch": 2.2660031667043654, "grad_norm": 0.34154435992240906, "learning_rate": 9.019697117264338e-06, "loss": 0.5004, "step": 5009 }, { "epoch": 2.2664555530422983, "grad_norm": 0.347610205411911, "learning_rate": 9.019261299431577e-06, "loss": 0.5219, "step": 5010 }, { "epoch": 2.2669079393802307, "grad_norm": 0.3784012198448181, "learning_rate": 9.018825395277516e-06, "loss": 0.5549, "step": 5011 }, { "epoch": 2.267360325718163, "grad_norm": 0.3647991716861725, "learning_rate": 9.018389404811518e-06, "loss": 0.5157, "step": 5012 }, { "epoch": 2.267812712056096, "grad_norm": 0.4222463071346283, "learning_rate": 9.017953328042947e-06, "loss": 0.6239, "step": 5013 }, { "epoch": 2.2682650983940285, "grad_norm": 0.4551243185997009, "learning_rate": 9.017517164981167e-06, "loss": 0.6538, "step": 5014 }, { "epoch": 2.268717484731961, "grad_norm": 0.3873961567878723, "learning_rate": 9.017080915635544e-06, "loss": 0.467, "step": 5015 }, { "epoch": 2.2691698710698938, "grad_norm": 0.42937663197517395, "learning_rate": 9.016644580015451e-06, "loss": 0.5468, "step": 5016 }, { "epoch": 2.269622257407826, "grad_norm": 0.45904460549354553, "learning_rate": 9.016208158130257e-06, "loss": 0.575, "step": 5017 }, { "epoch": 2.270074643745759, "grad_norm": 0.3992564380168915, "learning_rate": 9.015771649989336e-06, "loss": 0.5556, "step": 5018 }, { "epoch": 2.2705270300836915, "grad_norm": 0.4030390679836273, "learning_rate": 9.015335055602064e-06, "loss": 0.4909, "step": 5019 }, { "epoch": 2.270979416421624, "grad_norm": 0.5135411620140076, "learning_rate": 9.014898374977813e-06, "loss": 0.5175, "step": 5020 }, { "epoch": 2.271431802759557, "grad_norm": 0.5314582586288452, "learning_rate": 9.014461608125966e-06, "loss": 0.5774, "step": 5021 }, { "epoch": 2.2718841890974892, "grad_norm": 0.125913605093956, "learning_rate": 9.0140247550559e-06, "loss": 1.277, "step": 5022 }, { "epoch": 2.2723365754354217, "grad_norm": 0.22721947729587555, "learning_rate": 9.013587815777002e-06, "loss": 0.8646, "step": 5023 }, { "epoch": 2.2727889617733545, "grad_norm": 0.19379466772079468, "learning_rate": 9.013150790298652e-06, "loss": 0.6286, "step": 5024 }, { "epoch": 2.273241348111287, "grad_norm": 0.21950846910476685, "learning_rate": 9.012713678630239e-06, "loss": 0.5474, "step": 5025 }, { "epoch": 2.2736937344492194, "grad_norm": 0.25324851274490356, "learning_rate": 9.012276480781145e-06, "loss": 0.666, "step": 5026 }, { "epoch": 2.2741461207871523, "grad_norm": 0.2522192597389221, "learning_rate": 9.011839196760767e-06, "loss": 0.6145, "step": 5027 }, { "epoch": 2.2745985071250847, "grad_norm": 0.24325290322303772, "learning_rate": 9.011401826578492e-06, "loss": 0.6113, "step": 5028 }, { "epoch": 2.2750508934630176, "grad_norm": 0.25268417596817017, "learning_rate": 9.010964370243716e-06, "loss": 0.6329, "step": 5029 }, { "epoch": 2.27550327980095, "grad_norm": 0.24384495615959167, "learning_rate": 9.010526827765832e-06, "loss": 0.4879, "step": 5030 }, { "epoch": 2.2759556661388824, "grad_norm": 0.2682398855686188, "learning_rate": 9.010089199154238e-06, "loss": 0.6155, "step": 5031 }, { "epoch": 2.2764080524768153, "grad_norm": 0.3030489683151245, "learning_rate": 9.009651484418331e-06, "loss": 0.643, "step": 5032 }, { "epoch": 2.2768604388147478, "grad_norm": 0.2710229456424713, "learning_rate": 9.009213683567515e-06, "loss": 0.5489, "step": 5033 }, { "epoch": 2.2773128251526806, "grad_norm": 0.32904720306396484, "learning_rate": 9.008775796611191e-06, "loss": 0.6376, "step": 5034 }, { "epoch": 2.277765211490613, "grad_norm": 0.2845786511898041, "learning_rate": 9.008337823558763e-06, "loss": 0.6168, "step": 5035 }, { "epoch": 2.2782175978285455, "grad_norm": 0.32649704813957214, "learning_rate": 9.007899764419638e-06, "loss": 0.7091, "step": 5036 }, { "epoch": 2.278669984166478, "grad_norm": 0.30456748604774475, "learning_rate": 9.007461619203226e-06, "loss": 0.5352, "step": 5037 }, { "epoch": 2.279122370504411, "grad_norm": 0.2706780731678009, "learning_rate": 9.007023387918933e-06, "loss": 0.5262, "step": 5038 }, { "epoch": 2.2795747568423432, "grad_norm": 0.303498238325119, "learning_rate": 9.006585070576174e-06, "loss": 0.5586, "step": 5039 }, { "epoch": 2.280027143180276, "grad_norm": 0.31190311908721924, "learning_rate": 9.006146667184362e-06, "loss": 0.5987, "step": 5040 }, { "epoch": 2.2804795295182085, "grad_norm": 0.29405921697616577, "learning_rate": 9.00570817775291e-06, "loss": 0.589, "step": 5041 }, { "epoch": 2.280931915856141, "grad_norm": 0.2994610667228699, "learning_rate": 9.00526960229124e-06, "loss": 0.4962, "step": 5042 }, { "epoch": 2.281384302194074, "grad_norm": 0.3047558069229126, "learning_rate": 9.004830940808767e-06, "loss": 0.5979, "step": 5043 }, { "epoch": 2.2818366885320063, "grad_norm": 0.31192371249198914, "learning_rate": 9.004392193314914e-06, "loss": 0.5822, "step": 5044 }, { "epoch": 2.282289074869939, "grad_norm": 0.27066707611083984, "learning_rate": 9.003953359819105e-06, "loss": 0.4638, "step": 5045 }, { "epoch": 2.2827414612078716, "grad_norm": 0.284915030002594, "learning_rate": 9.003514440330761e-06, "loss": 0.4342, "step": 5046 }, { "epoch": 2.283193847545804, "grad_norm": 0.27902236580848694, "learning_rate": 9.003075434859314e-06, "loss": 0.4266, "step": 5047 }, { "epoch": 2.283646233883737, "grad_norm": 0.29623109102249146, "learning_rate": 9.002636343414188e-06, "loss": 0.4425, "step": 5048 }, { "epoch": 2.2840986202216693, "grad_norm": 0.3421972692012787, "learning_rate": 9.002197166004817e-06, "loss": 0.5933, "step": 5049 }, { "epoch": 2.2845510065596017, "grad_norm": 0.3594212830066681, "learning_rate": 9.00175790264063e-06, "loss": 0.6298, "step": 5050 }, { "epoch": 2.2850033928975346, "grad_norm": 0.32573261857032776, "learning_rate": 9.001318553331062e-06, "loss": 0.5407, "step": 5051 }, { "epoch": 2.285455779235467, "grad_norm": 0.3847179114818573, "learning_rate": 9.00087911808555e-06, "loss": 0.6626, "step": 5052 }, { "epoch": 2.2859081655733995, "grad_norm": 0.3302459120750427, "learning_rate": 9.000439596913531e-06, "loss": 0.5068, "step": 5053 }, { "epoch": 2.2863605519113324, "grad_norm": 0.3077160716056824, "learning_rate": 8.999999989824443e-06, "loss": 0.4695, "step": 5054 }, { "epoch": 2.286812938249265, "grad_norm": 0.361258327960968, "learning_rate": 8.99956029682773e-06, "loss": 0.4937, "step": 5055 }, { "epoch": 2.2872653245871977, "grad_norm": 0.36664658784866333, "learning_rate": 8.999120517932834e-06, "loss": 0.511, "step": 5056 }, { "epoch": 2.28771771092513, "grad_norm": 0.3544704020023346, "learning_rate": 8.998680653149199e-06, "loss": 0.5776, "step": 5057 }, { "epoch": 2.2881700972630625, "grad_norm": 0.3168734610080719, "learning_rate": 8.998240702486274e-06, "loss": 0.5363, "step": 5058 }, { "epoch": 2.2886224836009954, "grad_norm": 0.3637124300003052, "learning_rate": 8.997800665953506e-06, "loss": 0.5421, "step": 5059 }, { "epoch": 2.289074869938928, "grad_norm": 0.37682467699050903, "learning_rate": 8.997360543560346e-06, "loss": 0.5388, "step": 5060 }, { "epoch": 2.2895272562768603, "grad_norm": 0.3614627718925476, "learning_rate": 8.996920335316249e-06, "loss": 0.4859, "step": 5061 }, { "epoch": 2.289979642614793, "grad_norm": 0.3221120238304138, "learning_rate": 8.996480041230665e-06, "loss": 0.4298, "step": 5062 }, { "epoch": 2.2904320289527256, "grad_norm": 0.4305883049964905, "learning_rate": 8.996039661313054e-06, "loss": 0.6727, "step": 5063 }, { "epoch": 2.290884415290658, "grad_norm": 0.35608044266700745, "learning_rate": 8.99559919557287e-06, "loss": 0.5455, "step": 5064 }, { "epoch": 2.291336801628591, "grad_norm": 0.4218127727508545, "learning_rate": 8.995158644019579e-06, "loss": 0.5392, "step": 5065 }, { "epoch": 2.2917891879665233, "grad_norm": 0.4059213697910309, "learning_rate": 8.994718006662635e-06, "loss": 0.5022, "step": 5066 }, { "epoch": 2.292241574304456, "grad_norm": 0.3924534320831299, "learning_rate": 8.994277283511507e-06, "loss": 0.4815, "step": 5067 }, { "epoch": 2.2926939606423886, "grad_norm": 0.39385947585105896, "learning_rate": 8.99383647457566e-06, "loss": 0.4209, "step": 5068 }, { "epoch": 2.293146346980321, "grad_norm": 0.39434316754341125, "learning_rate": 8.993395579864558e-06, "loss": 0.488, "step": 5069 }, { "epoch": 2.293598733318254, "grad_norm": 0.4556533098220825, "learning_rate": 8.992954599387675e-06, "loss": 0.4329, "step": 5070 }, { "epoch": 2.2940511196561864, "grad_norm": 0.6709983348846436, "learning_rate": 8.992513533154477e-06, "loss": 0.6458, "step": 5071 }, { "epoch": 2.294503505994119, "grad_norm": 0.11765443533658981, "learning_rate": 8.99207238117444e-06, "loss": 1.098, "step": 5072 }, { "epoch": 2.2949558923320517, "grad_norm": 0.16917626559734344, "learning_rate": 8.991631143457034e-06, "loss": 0.9668, "step": 5073 }, { "epoch": 2.295408278669984, "grad_norm": 0.22798292338848114, "learning_rate": 8.99118982001174e-06, "loss": 0.7517, "step": 5074 }, { "epoch": 2.2958606650079165, "grad_norm": 0.2264532595872879, "learning_rate": 8.990748410848039e-06, "loss": 0.6623, "step": 5075 }, { "epoch": 2.2963130513458494, "grad_norm": 0.22418828308582306, "learning_rate": 8.990306915975402e-06, "loss": 0.5836, "step": 5076 }, { "epoch": 2.296765437683782, "grad_norm": 0.2567457854747772, "learning_rate": 8.989865335403318e-06, "loss": 0.684, "step": 5077 }, { "epoch": 2.2972178240217147, "grad_norm": 0.24350804090499878, "learning_rate": 8.989423669141268e-06, "loss": 0.5697, "step": 5078 }, { "epoch": 2.297670210359647, "grad_norm": 0.23422926664352417, "learning_rate": 8.98898191719874e-06, "loss": 0.5811, "step": 5079 }, { "epoch": 2.2981225966975796, "grad_norm": 0.24386048316955566, "learning_rate": 8.988540079585218e-06, "loss": 0.556, "step": 5080 }, { "epoch": 2.2985749830355124, "grad_norm": 0.2827109694480896, "learning_rate": 8.988098156310194e-06, "loss": 0.7276, "step": 5081 }, { "epoch": 2.299027369373445, "grad_norm": 0.30223724246025085, "learning_rate": 8.987656147383159e-06, "loss": 0.6538, "step": 5082 }, { "epoch": 2.2994797557113777, "grad_norm": 0.2997609078884125, "learning_rate": 8.987214052813605e-06, "loss": 0.6769, "step": 5083 }, { "epoch": 2.29993214204931, "grad_norm": 0.27591732144355774, "learning_rate": 8.986771872611027e-06, "loss": 0.6522, "step": 5084 }, { "epoch": 2.3003845283872426, "grad_norm": 0.2827262282371521, "learning_rate": 8.986329606784922e-06, "loss": 0.5803, "step": 5085 }, { "epoch": 2.3008369147251755, "grad_norm": 0.2681581974029541, "learning_rate": 8.985887255344788e-06, "loss": 0.5593, "step": 5086 }, { "epoch": 2.301289301063108, "grad_norm": 0.3470786511898041, "learning_rate": 8.985444818300125e-06, "loss": 0.6274, "step": 5087 }, { "epoch": 2.3017416874010403, "grad_norm": 0.27631470561027527, "learning_rate": 8.985002295660437e-06, "loss": 0.5203, "step": 5088 }, { "epoch": 2.3021940737389732, "grad_norm": 0.2744560241699219, "learning_rate": 8.984559687435226e-06, "loss": 0.4772, "step": 5089 }, { "epoch": 2.3026464600769057, "grad_norm": 0.28477469086647034, "learning_rate": 8.984116993634e-06, "loss": 0.5445, "step": 5090 }, { "epoch": 2.303098846414838, "grad_norm": 0.3166279196739197, "learning_rate": 8.983674214266263e-06, "loss": 0.561, "step": 5091 }, { "epoch": 2.303551232752771, "grad_norm": 0.34537747502326965, "learning_rate": 8.98323134934153e-06, "loss": 0.6104, "step": 5092 }, { "epoch": 2.3040036190907034, "grad_norm": 0.2955450415611267, "learning_rate": 8.982788398869306e-06, "loss": 0.6038, "step": 5093 }, { "epoch": 2.3044560054286363, "grad_norm": 0.31033676862716675, "learning_rate": 8.982345362859111e-06, "loss": 0.5511, "step": 5094 }, { "epoch": 2.3049083917665687, "grad_norm": 0.31058287620544434, "learning_rate": 8.981902241320454e-06, "loss": 0.6307, "step": 5095 }, { "epoch": 2.305360778104501, "grad_norm": 0.32820984721183777, "learning_rate": 8.981459034262855e-06, "loss": 0.5424, "step": 5096 }, { "epoch": 2.305813164442434, "grad_norm": 0.29032155871391296, "learning_rate": 8.981015741695832e-06, "loss": 0.4968, "step": 5097 }, { "epoch": 2.3062655507803664, "grad_norm": 0.33791881799697876, "learning_rate": 8.980572363628907e-06, "loss": 0.6353, "step": 5098 }, { "epoch": 2.306717937118299, "grad_norm": 0.3081226348876953, "learning_rate": 8.9801289000716e-06, "loss": 0.5505, "step": 5099 }, { "epoch": 2.3071703234562317, "grad_norm": 0.2916305959224701, "learning_rate": 8.979685351033435e-06, "loss": 0.5097, "step": 5100 }, { "epoch": 2.307622709794164, "grad_norm": 0.3282086253166199, "learning_rate": 8.979241716523941e-06, "loss": 0.5835, "step": 5101 }, { "epoch": 2.3080750961320966, "grad_norm": 0.3564854860305786, "learning_rate": 8.978797996552644e-06, "loss": 0.5134, "step": 5102 }, { "epoch": 2.3085274824700295, "grad_norm": 0.3681142330169678, "learning_rate": 8.978354191129076e-06, "loss": 0.5569, "step": 5103 }, { "epoch": 2.308979868807962, "grad_norm": 0.32936403155326843, "learning_rate": 8.977910300262763e-06, "loss": 0.4775, "step": 5104 }, { "epoch": 2.309432255145895, "grad_norm": 0.3236653506755829, "learning_rate": 8.977466323963243e-06, "loss": 0.5265, "step": 5105 }, { "epoch": 2.309884641483827, "grad_norm": 0.36080583930015564, "learning_rate": 8.977022262240051e-06, "loss": 0.5479, "step": 5106 }, { "epoch": 2.3103370278217596, "grad_norm": 0.35249292850494385, "learning_rate": 8.976578115102722e-06, "loss": 0.5415, "step": 5107 }, { "epoch": 2.3107894141596925, "grad_norm": 0.33883580565452576, "learning_rate": 8.976133882560798e-06, "loss": 0.5422, "step": 5108 }, { "epoch": 2.311241800497625, "grad_norm": 0.3563769459724426, "learning_rate": 8.975689564623817e-06, "loss": 0.487, "step": 5109 }, { "epoch": 2.3116941868355574, "grad_norm": 0.33895671367645264, "learning_rate": 8.975245161301323e-06, "loss": 0.4172, "step": 5110 }, { "epoch": 2.3121465731734903, "grad_norm": 0.37519681453704834, "learning_rate": 8.974800672602859e-06, "loss": 0.5481, "step": 5111 }, { "epoch": 2.3125989595114227, "grad_norm": 0.33460304141044617, "learning_rate": 8.974356098537973e-06, "loss": 0.4441, "step": 5112 }, { "epoch": 2.313051345849355, "grad_norm": 0.38560402393341064, "learning_rate": 8.973911439116212e-06, "loss": 0.5845, "step": 5113 }, { "epoch": 2.313503732187288, "grad_norm": 0.42339739203453064, "learning_rate": 8.973466694347125e-06, "loss": 0.6074, "step": 5114 }, { "epoch": 2.3139561185252204, "grad_norm": 0.34521761536598206, "learning_rate": 8.973021864240269e-06, "loss": 0.5493, "step": 5115 }, { "epoch": 2.3144085048631533, "grad_norm": 0.38674286007881165, "learning_rate": 8.97257694880519e-06, "loss": 0.4909, "step": 5116 }, { "epoch": 2.3148608912010857, "grad_norm": 0.43478986620903015, "learning_rate": 8.972131948051447e-06, "loss": 0.5139, "step": 5117 }, { "epoch": 2.315313277539018, "grad_norm": 0.45223739743232727, "learning_rate": 8.971686861988596e-06, "loss": 0.4881, "step": 5118 }, { "epoch": 2.315765663876951, "grad_norm": 0.41290605068206787, "learning_rate": 8.9712416906262e-06, "loss": 0.4643, "step": 5119 }, { "epoch": 2.3162180502148835, "grad_norm": 0.4456080198287964, "learning_rate": 8.970796433973813e-06, "loss": 0.4521, "step": 5120 }, { "epoch": 2.3166704365528163, "grad_norm": 0.4991777241230011, "learning_rate": 8.970351092041005e-06, "loss": 0.4833, "step": 5121 }, { "epoch": 2.317122822890749, "grad_norm": 0.1372484713792801, "learning_rate": 8.969905664837337e-06, "loss": 1.0853, "step": 5122 }, { "epoch": 2.317575209228681, "grad_norm": 0.2313486784696579, "learning_rate": 8.969460152372376e-06, "loss": 0.9125, "step": 5123 }, { "epoch": 2.3180275955666136, "grad_norm": 0.22185790538787842, "learning_rate": 8.96901455465569e-06, "loss": 0.6269, "step": 5124 }, { "epoch": 2.3184799819045465, "grad_norm": 0.2601901888847351, "learning_rate": 8.968568871696847e-06, "loss": 0.6425, "step": 5125 }, { "epoch": 2.318932368242479, "grad_norm": 0.2323128581047058, "learning_rate": 8.968123103505423e-06, "loss": 0.6711, "step": 5126 }, { "epoch": 2.319384754580412, "grad_norm": 0.2675032317638397, "learning_rate": 8.96767725009099e-06, "loss": 0.6742, "step": 5127 }, { "epoch": 2.3198371409183443, "grad_norm": 0.2531323730945587, "learning_rate": 8.967231311463123e-06, "loss": 0.6191, "step": 5128 }, { "epoch": 2.3202895272562767, "grad_norm": 0.27058520913124084, "learning_rate": 8.966785287631399e-06, "loss": 0.7332, "step": 5129 }, { "epoch": 2.3207419135942096, "grad_norm": 0.25856712460517883, "learning_rate": 8.966339178605397e-06, "loss": 0.5523, "step": 5130 }, { "epoch": 2.321194299932142, "grad_norm": 0.2937946617603302, "learning_rate": 8.965892984394701e-06, "loss": 0.6648, "step": 5131 }, { "epoch": 2.321646686270075, "grad_norm": 0.2797824442386627, "learning_rate": 8.96544670500889e-06, "loss": 0.6273, "step": 5132 }, { "epoch": 2.3220990726080073, "grad_norm": 0.26217928528785706, "learning_rate": 8.96500034045755e-06, "loss": 0.5458, "step": 5133 }, { "epoch": 2.3225514589459397, "grad_norm": 0.28655216097831726, "learning_rate": 8.96455389075027e-06, "loss": 0.5122, "step": 5134 }, { "epoch": 2.3230038452838726, "grad_norm": 0.3050766885280609, "learning_rate": 8.964107355896636e-06, "loss": 0.6645, "step": 5135 }, { "epoch": 2.323456231621805, "grad_norm": 0.2499980479478836, "learning_rate": 8.963660735906237e-06, "loss": 0.3956, "step": 5136 }, { "epoch": 2.3239086179597375, "grad_norm": 0.29579004645347595, "learning_rate": 8.963214030788668e-06, "loss": 0.6203, "step": 5137 }, { "epoch": 2.3243610042976703, "grad_norm": 0.30258843302726746, "learning_rate": 8.96276724055352e-06, "loss": 0.5595, "step": 5138 }, { "epoch": 2.3248133906356028, "grad_norm": 0.3302690386772156, "learning_rate": 8.96232036521039e-06, "loss": 0.6407, "step": 5139 }, { "epoch": 2.325265776973535, "grad_norm": 0.3066994845867157, "learning_rate": 8.961873404768877e-06, "loss": 0.6031, "step": 5140 }, { "epoch": 2.325718163311468, "grad_norm": 0.3121259808540344, "learning_rate": 8.961426359238579e-06, "loss": 0.5662, "step": 5141 }, { "epoch": 2.3261705496494005, "grad_norm": 0.30378177762031555, "learning_rate": 8.960979228629097e-06, "loss": 0.5572, "step": 5142 }, { "epoch": 2.3266229359873334, "grad_norm": 0.29598352313041687, "learning_rate": 8.960532012950032e-06, "loss": 0.4856, "step": 5143 }, { "epoch": 2.327075322325266, "grad_norm": 0.31938499212265015, "learning_rate": 8.960084712210993e-06, "loss": 0.5656, "step": 5144 }, { "epoch": 2.3275277086631982, "grad_norm": 0.31413334608078003, "learning_rate": 8.959637326421583e-06, "loss": 0.5639, "step": 5145 }, { "epoch": 2.327980095001131, "grad_norm": 0.33290380239486694, "learning_rate": 8.959189855591413e-06, "loss": 0.578, "step": 5146 }, { "epoch": 2.3284324813390636, "grad_norm": 0.30578720569610596, "learning_rate": 8.95874229973009e-06, "loss": 0.5357, "step": 5147 }, { "epoch": 2.328884867676996, "grad_norm": 0.29508233070373535, "learning_rate": 8.958294658847231e-06, "loss": 0.5235, "step": 5148 }, { "epoch": 2.329337254014929, "grad_norm": 0.32157933712005615, "learning_rate": 8.957846932952446e-06, "loss": 0.5691, "step": 5149 }, { "epoch": 2.3297896403528613, "grad_norm": 0.3388962149620056, "learning_rate": 8.957399122055353e-06, "loss": 0.5781, "step": 5150 }, { "epoch": 2.3302420266907937, "grad_norm": 0.3129505217075348, "learning_rate": 8.956951226165566e-06, "loss": 0.5283, "step": 5151 }, { "epoch": 2.3306944130287266, "grad_norm": 0.33784371614456177, "learning_rate": 8.95650324529271e-06, "loss": 0.5089, "step": 5152 }, { "epoch": 2.331146799366659, "grad_norm": 0.3461126983165741, "learning_rate": 8.956055179446402e-06, "loss": 0.5452, "step": 5153 }, { "epoch": 2.331599185704592, "grad_norm": 0.35084596276283264, "learning_rate": 8.955607028636267e-06, "loss": 0.5591, "step": 5154 }, { "epoch": 2.3320515720425243, "grad_norm": 0.37502869963645935, "learning_rate": 8.955158792871928e-06, "loss": 0.564, "step": 5155 }, { "epoch": 2.3325039583804568, "grad_norm": 0.33144041895866394, "learning_rate": 8.954710472163016e-06, "loss": 0.4898, "step": 5156 }, { "epoch": 2.3329563447183896, "grad_norm": 0.39707255363464355, "learning_rate": 8.954262066519152e-06, "loss": 0.6094, "step": 5157 }, { "epoch": 2.333408731056322, "grad_norm": 0.3692919909954071, "learning_rate": 8.953813575949974e-06, "loss": 0.5454, "step": 5158 }, { "epoch": 2.3338611173942545, "grad_norm": 0.3529667854309082, "learning_rate": 8.953365000465112e-06, "loss": 0.4735, "step": 5159 }, { "epoch": 2.3343135037321874, "grad_norm": 0.3957003951072693, "learning_rate": 8.952916340074197e-06, "loss": 0.5996, "step": 5160 }, { "epoch": 2.33476589007012, "grad_norm": 0.33116650581359863, "learning_rate": 8.952467594786867e-06, "loss": 0.4758, "step": 5161 }, { "epoch": 2.3352182764080522, "grad_norm": 0.4113021194934845, "learning_rate": 8.952018764612758e-06, "loss": 0.5547, "step": 5162 }, { "epoch": 2.335670662745985, "grad_norm": 0.3700742721557617, "learning_rate": 8.951569849561513e-06, "loss": 0.5547, "step": 5163 }, { "epoch": 2.3361230490839175, "grad_norm": 0.3984812796115875, "learning_rate": 8.951120849642772e-06, "loss": 0.5206, "step": 5164 }, { "epoch": 2.3365754354218504, "grad_norm": 0.4104999601840973, "learning_rate": 8.950671764866177e-06, "loss": 0.5266, "step": 5165 }, { "epoch": 2.337027821759783, "grad_norm": 0.39395037293434143, "learning_rate": 8.950222595241373e-06, "loss": 0.5475, "step": 5166 }, { "epoch": 2.3374802080977153, "grad_norm": 0.40888237953186035, "learning_rate": 8.949773340778007e-06, "loss": 0.4557, "step": 5167 }, { "epoch": 2.337932594435648, "grad_norm": 0.4658520519733429, "learning_rate": 8.949324001485728e-06, "loss": 0.6162, "step": 5168 }, { "epoch": 2.3383849807735806, "grad_norm": 0.40770334005355835, "learning_rate": 8.948874577374187e-06, "loss": 0.5126, "step": 5169 }, { "epoch": 2.3388373671115135, "grad_norm": 0.40405192971229553, "learning_rate": 8.948425068453034e-06, "loss": 0.4911, "step": 5170 }, { "epoch": 2.339289753449446, "grad_norm": 0.5039969086647034, "learning_rate": 8.947975474731927e-06, "loss": 0.5994, "step": 5171 }, { "epoch": 2.3397421397873783, "grad_norm": 0.18074560165405273, "learning_rate": 8.947525796220517e-06, "loss": 1.1309, "step": 5172 }, { "epoch": 2.340194526125311, "grad_norm": 0.22217054665088654, "learning_rate": 8.947076032928466e-06, "loss": 0.5835, "step": 5173 }, { "epoch": 2.3406469124632436, "grad_norm": 0.23608514666557312, "learning_rate": 8.94662618486543e-06, "loss": 0.723, "step": 5174 }, { "epoch": 2.341099298801176, "grad_norm": 0.2772035300731659, "learning_rate": 8.946176252041072e-06, "loss": 0.7182, "step": 5175 }, { "epoch": 2.341551685139109, "grad_norm": 0.21475175023078918, "learning_rate": 8.945726234465058e-06, "loss": 0.5362, "step": 5176 }, { "epoch": 2.3420040714770414, "grad_norm": 0.24416527152061462, "learning_rate": 8.945276132147047e-06, "loss": 0.6344, "step": 5177 }, { "epoch": 2.342456457814974, "grad_norm": 0.2703167498111725, "learning_rate": 8.944825945096709e-06, "loss": 0.6998, "step": 5178 }, { "epoch": 2.3429088441529067, "grad_norm": 0.2773960828781128, "learning_rate": 8.944375673323714e-06, "loss": 0.6797, "step": 5179 }, { "epoch": 2.343361230490839, "grad_norm": 0.2263125330209732, "learning_rate": 8.94392531683773e-06, "loss": 0.5135, "step": 5180 }, { "epoch": 2.343813616828772, "grad_norm": 0.27982455492019653, "learning_rate": 8.943474875648433e-06, "loss": 0.5934, "step": 5181 }, { "epoch": 2.3442660031667044, "grad_norm": 0.27248647809028625, "learning_rate": 8.94302434976549e-06, "loss": 0.5937, "step": 5182 }, { "epoch": 2.344718389504637, "grad_norm": 0.28724026679992676, "learning_rate": 8.942573739198585e-06, "loss": 0.6124, "step": 5183 }, { "epoch": 2.3451707758425697, "grad_norm": 0.26312363147735596, "learning_rate": 8.94212304395739e-06, "loss": 0.5307, "step": 5184 }, { "epoch": 2.345623162180502, "grad_norm": 0.2536335587501526, "learning_rate": 8.941672264051586e-06, "loss": 0.4983, "step": 5185 }, { "epoch": 2.3460755485184346, "grad_norm": 0.331790566444397, "learning_rate": 8.941221399490856e-06, "loss": 0.7405, "step": 5186 }, { "epoch": 2.3465279348563675, "grad_norm": 0.3212413489818573, "learning_rate": 8.940770450284884e-06, "loss": 0.559, "step": 5187 }, { "epoch": 2.3469803211943, "grad_norm": 0.2788722813129425, "learning_rate": 8.94031941644335e-06, "loss": 0.6052, "step": 5188 }, { "epoch": 2.3474327075322323, "grad_norm": 0.28932175040245056, "learning_rate": 8.939868297975945e-06, "loss": 0.565, "step": 5189 }, { "epoch": 2.347885093870165, "grad_norm": 0.28547203540802, "learning_rate": 8.939417094892355e-06, "loss": 0.5101, "step": 5190 }, { "epoch": 2.3483374802080976, "grad_norm": 0.305975079536438, "learning_rate": 8.938965807202273e-06, "loss": 0.4133, "step": 5191 }, { "epoch": 2.3487898665460305, "grad_norm": 0.3111722767353058, "learning_rate": 8.93851443491539e-06, "loss": 0.5426, "step": 5192 }, { "epoch": 2.349242252883963, "grad_norm": 0.3870984613895416, "learning_rate": 8.938062978041401e-06, "loss": 0.6433, "step": 5193 }, { "epoch": 2.3496946392218954, "grad_norm": 0.2995266318321228, "learning_rate": 8.937611436590003e-06, "loss": 0.6131, "step": 5194 }, { "epoch": 2.3501470255598282, "grad_norm": 0.3543240427970886, "learning_rate": 8.93715981057089e-06, "loss": 0.6553, "step": 5195 }, { "epoch": 2.3505994118977607, "grad_norm": 0.3024788796901703, "learning_rate": 8.936708099993763e-06, "loss": 0.5012, "step": 5196 }, { "epoch": 2.351051798235693, "grad_norm": 0.3152886629104614, "learning_rate": 8.936256304868324e-06, "loss": 0.5223, "step": 5197 }, { "epoch": 2.351504184573626, "grad_norm": 0.3279096484184265, "learning_rate": 8.935804425204276e-06, "loss": 0.5887, "step": 5198 }, { "epoch": 2.3519565709115584, "grad_norm": 0.3366962969303131, "learning_rate": 8.935352461011323e-06, "loss": 0.6271, "step": 5199 }, { "epoch": 2.352408957249491, "grad_norm": 0.3460014760494232, "learning_rate": 8.934900412299174e-06, "loss": 0.5276, "step": 5200 }, { "epoch": 2.352408957249491, "eval_loss": 0.5950751900672913, "eval_runtime": 25.8288, "eval_samples_per_second": 28.805, "eval_steps_per_second": 7.201, "step": 5200 }, { "epoch": 2.3528613435874237, "grad_norm": 0.3559178411960602, "learning_rate": 8.934448279077537e-06, "loss": 0.6865, "step": 5201 }, { "epoch": 2.353313729925356, "grad_norm": 0.3454980254173279, "learning_rate": 8.933996061356121e-06, "loss": 0.5437, "step": 5202 }, { "epoch": 2.353766116263289, "grad_norm": 0.3158490061759949, "learning_rate": 8.933543759144639e-06, "loss": 0.5537, "step": 5203 }, { "epoch": 2.3542185026012215, "grad_norm": 0.31986913084983826, "learning_rate": 8.933091372452804e-06, "loss": 0.5003, "step": 5204 }, { "epoch": 2.354670888939154, "grad_norm": 0.3672752380371094, "learning_rate": 8.932638901290334e-06, "loss": 0.5626, "step": 5205 }, { "epoch": 2.3551232752770868, "grad_norm": 0.31871354579925537, "learning_rate": 8.932186345666945e-06, "loss": 0.4804, "step": 5206 }, { "epoch": 2.355575661615019, "grad_norm": 0.36672306060791016, "learning_rate": 8.931733705592357e-06, "loss": 0.5147, "step": 5207 }, { "epoch": 2.356028047952952, "grad_norm": 0.3676467537879944, "learning_rate": 8.931280981076291e-06, "loss": 0.5715, "step": 5208 }, { "epoch": 2.3564804342908845, "grad_norm": 0.4063245356082916, "learning_rate": 8.930828172128471e-06, "loss": 0.5846, "step": 5209 }, { "epoch": 2.356932820628817, "grad_norm": 0.410403311252594, "learning_rate": 8.930375278758623e-06, "loss": 0.575, "step": 5210 }, { "epoch": 2.3573852069667494, "grad_norm": 0.4177560806274414, "learning_rate": 8.92992230097647e-06, "loss": 0.5002, "step": 5211 }, { "epoch": 2.3578375933046822, "grad_norm": 0.4173509180545807, "learning_rate": 8.929469238791744e-06, "loss": 0.5608, "step": 5212 }, { "epoch": 2.3582899796426147, "grad_norm": 0.37240418791770935, "learning_rate": 8.929016092214174e-06, "loss": 0.4982, "step": 5213 }, { "epoch": 2.3587423659805475, "grad_norm": 0.39809659123420715, "learning_rate": 8.928562861253491e-06, "loss": 0.5187, "step": 5214 }, { "epoch": 2.35919475231848, "grad_norm": 0.4240771532058716, "learning_rate": 8.928109545919433e-06, "loss": 0.5562, "step": 5215 }, { "epoch": 2.3596471386564124, "grad_norm": 0.4590861201286316, "learning_rate": 8.92765614622173e-06, "loss": 0.5848, "step": 5216 }, { "epoch": 2.3600995249943453, "grad_norm": 0.395620197057724, "learning_rate": 8.927202662170123e-06, "loss": 0.4758, "step": 5217 }, { "epoch": 2.3605519113322777, "grad_norm": 0.5504400730133057, "learning_rate": 8.926749093774353e-06, "loss": 0.5856, "step": 5218 }, { "epoch": 2.3610042976702106, "grad_norm": 0.49798864126205444, "learning_rate": 8.92629544104416e-06, "loss": 0.63, "step": 5219 }, { "epoch": 2.361456684008143, "grad_norm": 0.41312935948371887, "learning_rate": 8.925841703989285e-06, "loss": 0.4887, "step": 5220 }, { "epoch": 2.3619090703460754, "grad_norm": 0.5409550666809082, "learning_rate": 8.925387882619473e-06, "loss": 0.5653, "step": 5221 }, { "epoch": 2.3623614566840083, "grad_norm": 0.1917334645986557, "learning_rate": 8.924933976944474e-06, "loss": 1.3345, "step": 5222 }, { "epoch": 2.3628138430219408, "grad_norm": 0.21469278633594513, "learning_rate": 8.924479986974035e-06, "loss": 0.7704, "step": 5223 }, { "epoch": 2.363266229359873, "grad_norm": 0.22797371447086334, "learning_rate": 8.924025912717904e-06, "loss": 0.5806, "step": 5224 }, { "epoch": 2.363718615697806, "grad_norm": 0.22975832223892212, "learning_rate": 8.923571754185835e-06, "loss": 0.5494, "step": 5225 }, { "epoch": 2.3641710020357385, "grad_norm": 0.2744516432285309, "learning_rate": 8.923117511387583e-06, "loss": 0.5654, "step": 5226 }, { "epoch": 2.364623388373671, "grad_norm": 0.2559666931629181, "learning_rate": 8.922663184332903e-06, "loss": 0.5695, "step": 5227 }, { "epoch": 2.365075774711604, "grad_norm": 0.24163545668125153, "learning_rate": 8.92220877303155e-06, "loss": 0.5804, "step": 5228 }, { "epoch": 2.3655281610495362, "grad_norm": 0.274517297744751, "learning_rate": 8.921754277493288e-06, "loss": 0.6815, "step": 5229 }, { "epoch": 2.365980547387469, "grad_norm": 0.26811838150024414, "learning_rate": 8.921299697727874e-06, "loss": 0.6609, "step": 5230 }, { "epoch": 2.3664329337254015, "grad_norm": 0.2496194988489151, "learning_rate": 8.920845033745073e-06, "loss": 0.5306, "step": 5231 }, { "epoch": 2.366885320063334, "grad_norm": 0.299078106880188, "learning_rate": 8.92039028555465e-06, "loss": 0.5776, "step": 5232 }, { "epoch": 2.367337706401267, "grad_norm": 0.25788772106170654, "learning_rate": 8.91993545316637e-06, "loss": 0.5541, "step": 5233 }, { "epoch": 2.3677900927391993, "grad_norm": 0.28339770436286926, "learning_rate": 8.919480536590004e-06, "loss": 0.6144, "step": 5234 }, { "epoch": 2.3682424790771317, "grad_norm": 0.2880876958370209, "learning_rate": 8.919025535835318e-06, "loss": 0.5163, "step": 5235 }, { "epoch": 2.3686948654150646, "grad_norm": 0.28834447264671326, "learning_rate": 8.918570450912088e-06, "loss": 0.527, "step": 5236 }, { "epoch": 2.369147251752997, "grad_norm": 0.3201911151409149, "learning_rate": 8.918115281830087e-06, "loss": 0.5668, "step": 5237 }, { "epoch": 2.3695996380909294, "grad_norm": 0.33936285972595215, "learning_rate": 8.917660028599088e-06, "loss": 0.6508, "step": 5238 }, { "epoch": 2.3700520244288623, "grad_norm": 0.28536897897720337, "learning_rate": 8.917204691228873e-06, "loss": 0.4744, "step": 5239 }, { "epoch": 2.3705044107667947, "grad_norm": 0.336946576833725, "learning_rate": 8.916749269729217e-06, "loss": 0.6522, "step": 5240 }, { "epoch": 2.3709567971047276, "grad_norm": 0.3168976306915283, "learning_rate": 8.916293764109903e-06, "loss": 0.5221, "step": 5241 }, { "epoch": 2.37140918344266, "grad_norm": 0.3148195147514343, "learning_rate": 8.915838174380713e-06, "loss": 0.5168, "step": 5242 }, { "epoch": 2.3718615697805925, "grad_norm": 0.3642879128456116, "learning_rate": 8.915382500551434e-06, "loss": 0.6246, "step": 5243 }, { "epoch": 2.3723139561185254, "grad_norm": 0.30143481492996216, "learning_rate": 8.914926742631848e-06, "loss": 0.4783, "step": 5244 }, { "epoch": 2.372766342456458, "grad_norm": 0.30212244391441345, "learning_rate": 8.91447090063175e-06, "loss": 0.5036, "step": 5245 }, { "epoch": 2.37321872879439, "grad_norm": 0.2967592179775238, "learning_rate": 8.914014974560921e-06, "loss": 0.5923, "step": 5246 }, { "epoch": 2.373671115132323, "grad_norm": 0.3159501254558563, "learning_rate": 8.913558964429161e-06, "loss": 0.5432, "step": 5247 }, { "epoch": 2.3741235014702555, "grad_norm": 0.3331361711025238, "learning_rate": 8.91310287024626e-06, "loss": 0.5499, "step": 5248 }, { "epoch": 2.374575887808188, "grad_norm": 0.31080663204193115, "learning_rate": 8.912646692022014e-06, "loss": 0.5621, "step": 5249 }, { "epoch": 2.375028274146121, "grad_norm": 0.3188500702381134, "learning_rate": 8.912190429766222e-06, "loss": 0.5513, "step": 5250 }, { "epoch": 2.3754806604840533, "grad_norm": 0.3618680536746979, "learning_rate": 8.91173408348868e-06, "loss": 0.6293, "step": 5251 }, { "epoch": 2.375933046821986, "grad_norm": 0.33807262778282166, "learning_rate": 8.911277653199189e-06, "loss": 0.5323, "step": 5252 }, { "epoch": 2.3763854331599186, "grad_norm": 0.311208575963974, "learning_rate": 8.910821138907554e-06, "loss": 0.4886, "step": 5253 }, { "epoch": 2.376837819497851, "grad_norm": 0.33905744552612305, "learning_rate": 8.91036454062358e-06, "loss": 0.5034, "step": 5254 }, { "epoch": 2.377290205835784, "grad_norm": 0.3606916069984436, "learning_rate": 8.90990785835707e-06, "loss": 0.5737, "step": 5255 }, { "epoch": 2.3777425921737163, "grad_norm": 0.37854576110839844, "learning_rate": 8.909451092117836e-06, "loss": 0.6181, "step": 5256 }, { "epoch": 2.378194978511649, "grad_norm": 0.38674047589302063, "learning_rate": 8.908994241915685e-06, "loss": 0.5169, "step": 5257 }, { "epoch": 2.3786473648495816, "grad_norm": 0.36725178360939026, "learning_rate": 8.908537307760428e-06, "loss": 0.576, "step": 5258 }, { "epoch": 2.379099751187514, "grad_norm": 0.3875040113925934, "learning_rate": 8.908080289661882e-06, "loss": 0.534, "step": 5259 }, { "epoch": 2.379552137525447, "grad_norm": 0.37505942583084106, "learning_rate": 8.907623187629862e-06, "loss": 0.5002, "step": 5260 }, { "epoch": 2.3800045238633794, "grad_norm": 0.3564598858356476, "learning_rate": 8.907166001674181e-06, "loss": 0.5385, "step": 5261 }, { "epoch": 2.380456910201312, "grad_norm": 0.3699110448360443, "learning_rate": 8.906708731804663e-06, "loss": 0.4688, "step": 5262 }, { "epoch": 2.3809092965392447, "grad_norm": 0.42432114481925964, "learning_rate": 8.906251378031125e-06, "loss": 0.5481, "step": 5263 }, { "epoch": 2.381361682877177, "grad_norm": 0.35957586765289307, "learning_rate": 8.905793940363392e-06, "loss": 0.4283, "step": 5264 }, { "epoch": 2.3818140692151095, "grad_norm": 0.37844258546829224, "learning_rate": 8.905336418811286e-06, "loss": 0.5006, "step": 5265 }, { "epoch": 2.3822664555530424, "grad_norm": 0.41207537055015564, "learning_rate": 8.904878813384636e-06, "loss": 0.5484, "step": 5266 }, { "epoch": 2.382718841890975, "grad_norm": 0.4515474736690521, "learning_rate": 8.904421124093267e-06, "loss": 0.6212, "step": 5267 }, { "epoch": 2.3831712282289077, "grad_norm": 0.4509403705596924, "learning_rate": 8.90396335094701e-06, "loss": 0.496, "step": 5268 }, { "epoch": 2.38362361456684, "grad_norm": 0.46767744421958923, "learning_rate": 8.903505493955697e-06, "loss": 0.5529, "step": 5269 }, { "epoch": 2.3840760009047726, "grad_norm": 0.46141764521598816, "learning_rate": 8.903047553129162e-06, "loss": 0.45, "step": 5270 }, { "epoch": 2.3845283872427054, "grad_norm": 0.5676841139793396, "learning_rate": 8.902589528477239e-06, "loss": 0.5809, "step": 5271 }, { "epoch": 2.384980773580638, "grad_norm": 0.1427653282880783, "learning_rate": 8.902131420009765e-06, "loss": 1.3703, "step": 5272 }, { "epoch": 2.3854331599185703, "grad_norm": 0.2263144701719284, "learning_rate": 8.901673227736578e-06, "loss": 0.9007, "step": 5273 }, { "epoch": 2.385885546256503, "grad_norm": 0.2576144337654114, "learning_rate": 8.901214951667519e-06, "loss": 0.6424, "step": 5274 }, { "epoch": 2.3863379325944356, "grad_norm": 0.2382487952709198, "learning_rate": 8.900756591812433e-06, "loss": 0.5353, "step": 5275 }, { "epoch": 2.386790318932368, "grad_norm": 0.24812601506710052, "learning_rate": 8.900298148181161e-06, "loss": 0.6249, "step": 5276 }, { "epoch": 2.387242705270301, "grad_norm": 0.27381688356399536, "learning_rate": 8.89983962078355e-06, "loss": 0.5722, "step": 5277 }, { "epoch": 2.3876950916082333, "grad_norm": 0.2768847644329071, "learning_rate": 8.899381009629446e-06, "loss": 0.5739, "step": 5278 }, { "epoch": 2.388147477946166, "grad_norm": 0.29781198501586914, "learning_rate": 8.898922314728702e-06, "loss": 0.74, "step": 5279 }, { "epoch": 2.3885998642840987, "grad_norm": 0.27228280901908875, "learning_rate": 8.898463536091167e-06, "loss": 0.5543, "step": 5280 }, { "epoch": 2.389052250622031, "grad_norm": 0.2408645898103714, "learning_rate": 8.898004673726694e-06, "loss": 0.5171, "step": 5281 }, { "epoch": 2.389504636959964, "grad_norm": 0.27253690361976624, "learning_rate": 8.897545727645141e-06, "loss": 0.5835, "step": 5282 }, { "epoch": 2.3899570232978964, "grad_norm": 0.3259333372116089, "learning_rate": 8.89708669785636e-06, "loss": 0.498, "step": 5283 }, { "epoch": 2.390409409635829, "grad_norm": 0.29838910698890686, "learning_rate": 8.896627584370215e-06, "loss": 0.6678, "step": 5284 }, { "epoch": 2.3908617959737617, "grad_norm": 0.2814207077026367, "learning_rate": 8.89616838719656e-06, "loss": 0.6265, "step": 5285 }, { "epoch": 2.391314182311694, "grad_norm": 0.31334343552589417, "learning_rate": 8.89570910634526e-06, "loss": 0.5696, "step": 5286 }, { "epoch": 2.3917665686496266, "grad_norm": 0.3295969069004059, "learning_rate": 8.895249741826181e-06, "loss": 0.709, "step": 5287 }, { "epoch": 2.3922189549875594, "grad_norm": 0.29531171917915344, "learning_rate": 8.894790293649188e-06, "loss": 0.5735, "step": 5288 }, { "epoch": 2.392671341325492, "grad_norm": 0.31224778294563293, "learning_rate": 8.894330761824146e-06, "loss": 0.4875, "step": 5289 }, { "epoch": 2.3931237276634247, "grad_norm": 0.30230623483657837, "learning_rate": 8.893871146360926e-06, "loss": 0.5873, "step": 5290 }, { "epoch": 2.393576114001357, "grad_norm": 0.29957160353660583, "learning_rate": 8.8934114472694e-06, "loss": 0.5191, "step": 5291 }, { "epoch": 2.3940285003392896, "grad_norm": 0.29483267664909363, "learning_rate": 8.89295166455944e-06, "loss": 0.4608, "step": 5292 }, { "epoch": 2.3944808866772225, "grad_norm": 0.3138834238052368, "learning_rate": 8.892491798240919e-06, "loss": 0.5987, "step": 5293 }, { "epoch": 2.394933273015155, "grad_norm": 0.37517717480659485, "learning_rate": 8.892031848323717e-06, "loss": 0.6047, "step": 5294 }, { "epoch": 2.395385659353088, "grad_norm": 0.3198024034500122, "learning_rate": 8.89157181481771e-06, "loss": 0.5539, "step": 5295 }, { "epoch": 2.39583804569102, "grad_norm": 0.29389840364456177, "learning_rate": 8.891111697732777e-06, "loss": 0.4914, "step": 5296 }, { "epoch": 2.3962904320289526, "grad_norm": 0.3205486238002777, "learning_rate": 8.890651497078804e-06, "loss": 0.4389, "step": 5297 }, { "epoch": 2.396742818366885, "grad_norm": 0.3307687044143677, "learning_rate": 8.89019121286567e-06, "loss": 0.5386, "step": 5298 }, { "epoch": 2.397195204704818, "grad_norm": 0.3100384771823883, "learning_rate": 8.889730845103263e-06, "loss": 0.4708, "step": 5299 }, { "epoch": 2.3976475910427504, "grad_norm": 0.29674071073532104, "learning_rate": 8.88927039380147e-06, "loss": 0.5282, "step": 5300 }, { "epoch": 2.3980999773806833, "grad_norm": 0.31692129373550415, "learning_rate": 8.88880985897018e-06, "loss": 0.4814, "step": 5301 }, { "epoch": 2.3985523637186157, "grad_norm": 0.34936773777008057, "learning_rate": 8.888349240619285e-06, "loss": 0.5238, "step": 5302 }, { "epoch": 2.399004750056548, "grad_norm": 0.32773688435554504, "learning_rate": 8.887888538758676e-06, "loss": 0.488, "step": 5303 }, { "epoch": 2.399457136394481, "grad_norm": 0.35089343786239624, "learning_rate": 8.887427753398249e-06, "loss": 0.5211, "step": 5304 }, { "epoch": 2.3999095227324134, "grad_norm": 0.3327306807041168, "learning_rate": 8.886966884547896e-06, "loss": 0.5015, "step": 5305 }, { "epoch": 2.4003619090703463, "grad_norm": 0.32806727290153503, "learning_rate": 8.88650593221752e-06, "loss": 0.4802, "step": 5306 }, { "epoch": 2.4008142954082787, "grad_norm": 0.29592448472976685, "learning_rate": 8.88604489641702e-06, "loss": 0.4475, "step": 5307 }, { "epoch": 2.401266681746211, "grad_norm": 0.3349493443965912, "learning_rate": 8.885583777156295e-06, "loss": 0.4805, "step": 5308 }, { "epoch": 2.401719068084144, "grad_norm": 0.36836034059524536, "learning_rate": 8.88512257444525e-06, "loss": 0.5743, "step": 5309 }, { "epoch": 2.4021714544220765, "grad_norm": 0.31107714772224426, "learning_rate": 8.88466128829379e-06, "loss": 0.3641, "step": 5310 }, { "epoch": 2.402623840760009, "grad_norm": 0.36788785457611084, "learning_rate": 8.884199918711824e-06, "loss": 0.5014, "step": 5311 }, { "epoch": 2.4030762270979418, "grad_norm": 0.3677017092704773, "learning_rate": 8.88373846570926e-06, "loss": 0.5486, "step": 5312 }, { "epoch": 2.403528613435874, "grad_norm": 0.3668062686920166, "learning_rate": 8.883276929296006e-06, "loss": 0.4658, "step": 5313 }, { "epoch": 2.4039809997738066, "grad_norm": 0.36399152874946594, "learning_rate": 8.882815309481975e-06, "loss": 0.4705, "step": 5314 }, { "epoch": 2.4044333861117395, "grad_norm": 0.41944408416748047, "learning_rate": 8.882353606277083e-06, "loss": 0.5707, "step": 5315 }, { "epoch": 2.404885772449672, "grad_norm": 0.4592282176017761, "learning_rate": 8.881891819691246e-06, "loss": 0.6687, "step": 5316 }, { "epoch": 2.405338158787605, "grad_norm": 0.4054851830005646, "learning_rate": 8.88142994973438e-06, "loss": 0.4627, "step": 5317 }, { "epoch": 2.4057905451255372, "grad_norm": 0.4601972699165344, "learning_rate": 8.880967996416404e-06, "loss": 0.5378, "step": 5318 }, { "epoch": 2.4062429314634697, "grad_norm": 0.4713282287120819, "learning_rate": 8.880505959747245e-06, "loss": 0.5941, "step": 5319 }, { "epoch": 2.4066953178014026, "grad_norm": 0.4992575943470001, "learning_rate": 8.880043839736818e-06, "loss": 0.551, "step": 5320 }, { "epoch": 2.407147704139335, "grad_norm": 0.562827467918396, "learning_rate": 8.879581636395054e-06, "loss": 0.5606, "step": 5321 }, { "epoch": 2.4076000904772674, "grad_norm": 0.14555203914642334, "learning_rate": 8.879119349731877e-06, "loss": 1.0784, "step": 5322 }, { "epoch": 2.4080524768152003, "grad_norm": 0.2023267298936844, "learning_rate": 8.878656979757215e-06, "loss": 0.6482, "step": 5323 }, { "epoch": 2.4085048631531327, "grad_norm": 0.2303677201271057, "learning_rate": 8.878194526481e-06, "loss": 0.5892, "step": 5324 }, { "epoch": 2.408957249491065, "grad_norm": 0.23650243878364563, "learning_rate": 8.877731989913163e-06, "loss": 0.5714, "step": 5325 }, { "epoch": 2.409409635828998, "grad_norm": 0.25478923320770264, "learning_rate": 8.877269370063639e-06, "loss": 0.6659, "step": 5326 }, { "epoch": 2.4098620221669305, "grad_norm": 0.279761403799057, "learning_rate": 8.876806666942363e-06, "loss": 0.7131, "step": 5327 }, { "epoch": 2.4103144085048633, "grad_norm": 0.3055695593357086, "learning_rate": 8.876343880559271e-06, "loss": 0.6116, "step": 5328 }, { "epoch": 2.4107667948427958, "grad_norm": 0.28952184319496155, "learning_rate": 8.875881010924305e-06, "loss": 0.6511, "step": 5329 }, { "epoch": 2.411219181180728, "grad_norm": 0.2644006907939911, "learning_rate": 8.875418058047402e-06, "loss": 0.6327, "step": 5330 }, { "epoch": 2.411671567518661, "grad_norm": 0.31877684593200684, "learning_rate": 8.87495502193851e-06, "loss": 0.6455, "step": 5331 }, { "epoch": 2.4121239538565935, "grad_norm": 0.3282528817653656, "learning_rate": 8.87449190260757e-06, "loss": 0.6813, "step": 5332 }, { "epoch": 2.412576340194526, "grad_norm": 0.30799219012260437, "learning_rate": 8.874028700064528e-06, "loss": 0.5922, "step": 5333 }, { "epoch": 2.413028726532459, "grad_norm": 0.2891267240047455, "learning_rate": 8.873565414319336e-06, "loss": 0.5688, "step": 5334 }, { "epoch": 2.4134811128703912, "grad_norm": 0.30684101581573486, "learning_rate": 8.873102045381939e-06, "loss": 0.6179, "step": 5335 }, { "epoch": 2.4139334992083237, "grad_norm": 0.2874778211116791, "learning_rate": 8.87263859326229e-06, "loss": 0.5735, "step": 5336 }, { "epoch": 2.4143858855462565, "grad_norm": 0.3373110592365265, "learning_rate": 8.872175057970347e-06, "loss": 0.6875, "step": 5337 }, { "epoch": 2.414838271884189, "grad_norm": 0.31847095489501953, "learning_rate": 8.87171143951606e-06, "loss": 0.635, "step": 5338 }, { "epoch": 2.415290658222122, "grad_norm": 0.3038906455039978, "learning_rate": 8.871247737909387e-06, "loss": 0.4178, "step": 5339 }, { "epoch": 2.4157430445600543, "grad_norm": 0.29398006200790405, "learning_rate": 8.87078395316029e-06, "loss": 0.529, "step": 5340 }, { "epoch": 2.4161954308979867, "grad_norm": 0.29156097769737244, "learning_rate": 8.870320085278727e-06, "loss": 0.5983, "step": 5341 }, { "epoch": 2.4166478172359196, "grad_norm": 0.3314395844936371, "learning_rate": 8.86985613427466e-06, "loss": 0.6286, "step": 5342 }, { "epoch": 2.417100203573852, "grad_norm": 0.2678174376487732, "learning_rate": 8.869392100158055e-06, "loss": 0.5104, "step": 5343 }, { "epoch": 2.417552589911785, "grad_norm": 0.3382295072078705, "learning_rate": 8.868927982938877e-06, "loss": 0.5069, "step": 5344 }, { "epoch": 2.4180049762497173, "grad_norm": 0.32429739832878113, "learning_rate": 8.868463782627095e-06, "loss": 0.5742, "step": 5345 }, { "epoch": 2.4184573625876498, "grad_norm": 0.3550145924091339, "learning_rate": 8.867999499232675e-06, "loss": 0.6459, "step": 5346 }, { "epoch": 2.4189097489255826, "grad_norm": 0.35352128744125366, "learning_rate": 8.867535132765593e-06, "loss": 0.6231, "step": 5347 }, { "epoch": 2.419362135263515, "grad_norm": 0.31484100222587585, "learning_rate": 8.86707068323582e-06, "loss": 0.6527, "step": 5348 }, { "epoch": 2.4198145216014475, "grad_norm": 0.3595409393310547, "learning_rate": 8.866606150653331e-06, "loss": 0.654, "step": 5349 }, { "epoch": 2.4202669079393804, "grad_norm": 0.3363329768180847, "learning_rate": 8.866141535028104e-06, "loss": 0.5594, "step": 5350 }, { "epoch": 2.420719294277313, "grad_norm": 0.3004513084888458, "learning_rate": 8.865676836370114e-06, "loss": 0.517, "step": 5351 }, { "epoch": 2.4211716806152452, "grad_norm": 0.34054794907569885, "learning_rate": 8.865212054689344e-06, "loss": 0.6087, "step": 5352 }, { "epoch": 2.421624066953178, "grad_norm": 0.3066307306289673, "learning_rate": 8.864747189995775e-06, "loss": 0.3725, "step": 5353 }, { "epoch": 2.4220764532911105, "grad_norm": 0.3988504409790039, "learning_rate": 8.864282242299394e-06, "loss": 0.6691, "step": 5354 }, { "epoch": 2.4225288396290434, "grad_norm": 0.38124319911003113, "learning_rate": 8.863817211610183e-06, "loss": 0.5756, "step": 5355 }, { "epoch": 2.422981225966976, "grad_norm": 0.3242151737213135, "learning_rate": 8.86335209793813e-06, "loss": 0.5636, "step": 5356 }, { "epoch": 2.4234336123049083, "grad_norm": 0.37980321049690247, "learning_rate": 8.862886901293225e-06, "loss": 0.5232, "step": 5357 }, { "epoch": 2.423885998642841, "grad_norm": 0.3541114628314972, "learning_rate": 8.86242162168546e-06, "loss": 0.5046, "step": 5358 }, { "epoch": 2.4243383849807736, "grad_norm": 0.36211729049682617, "learning_rate": 8.861956259124825e-06, "loss": 0.5434, "step": 5359 }, { "epoch": 2.424790771318706, "grad_norm": 0.3420044183731079, "learning_rate": 8.861490813621317e-06, "loss": 0.4857, "step": 5360 }, { "epoch": 2.425243157656639, "grad_norm": 0.39352110028266907, "learning_rate": 8.86102528518493e-06, "loss": 0.5841, "step": 5361 }, { "epoch": 2.4256955439945713, "grad_norm": 0.36126866936683655, "learning_rate": 8.860559673825666e-06, "loss": 0.4875, "step": 5362 }, { "epoch": 2.4261479303325038, "grad_norm": 0.39452895522117615, "learning_rate": 8.86009397955352e-06, "loss": 0.4738, "step": 5363 }, { "epoch": 2.4266003166704366, "grad_norm": 0.42169830203056335, "learning_rate": 8.859628202378496e-06, "loss": 0.4853, "step": 5364 }, { "epoch": 2.427052703008369, "grad_norm": 0.4142342805862427, "learning_rate": 8.859162342310599e-06, "loss": 0.5243, "step": 5365 }, { "epoch": 2.427505089346302, "grad_norm": 0.42054373025894165, "learning_rate": 8.858696399359832e-06, "loss": 0.5433, "step": 5366 }, { "epoch": 2.4279574756842344, "grad_norm": 0.4369262158870697, "learning_rate": 8.858230373536203e-06, "loss": 0.5688, "step": 5367 }, { "epoch": 2.428409862022167, "grad_norm": 0.3775065541267395, "learning_rate": 8.857764264849722e-06, "loss": 0.4837, "step": 5368 }, { "epoch": 2.4288622483600997, "grad_norm": 0.5310124754905701, "learning_rate": 8.857298073310397e-06, "loss": 0.5895, "step": 5369 }, { "epoch": 2.429314634698032, "grad_norm": 0.4975602626800537, "learning_rate": 8.85683179892824e-06, "loss": 0.516, "step": 5370 }, { "epoch": 2.4297670210359645, "grad_norm": 0.5229071974754333, "learning_rate": 8.856365441713269e-06, "loss": 0.5521, "step": 5371 }, { "epoch": 2.4302194073738974, "grad_norm": 0.14225782454013824, "learning_rate": 8.855899001675497e-06, "loss": 0.9887, "step": 5372 }, { "epoch": 2.43067179371183, "grad_norm": 0.19779443740844727, "learning_rate": 8.855432478824941e-06, "loss": 0.8357, "step": 5373 }, { "epoch": 2.4311241800497623, "grad_norm": 0.22921974956989288, "learning_rate": 8.854965873171623e-06, "loss": 0.6282, "step": 5374 }, { "epoch": 2.431576566387695, "grad_norm": 0.25163426995277405, "learning_rate": 8.854499184725562e-06, "loss": 0.5181, "step": 5375 }, { "epoch": 2.4320289527256276, "grad_norm": 0.23717309534549713, "learning_rate": 8.854032413496783e-06, "loss": 0.6041, "step": 5376 }, { "epoch": 2.4324813390635605, "grad_norm": 0.23358270525932312, "learning_rate": 8.853565559495308e-06, "loss": 0.6296, "step": 5377 }, { "epoch": 2.432933725401493, "grad_norm": 0.26128092408180237, "learning_rate": 8.853098622731167e-06, "loss": 0.6375, "step": 5378 }, { "epoch": 2.4333861117394253, "grad_norm": 0.2504749000072479, "learning_rate": 8.852631603214385e-06, "loss": 0.5835, "step": 5379 }, { "epoch": 2.433838498077358, "grad_norm": 0.25571542978286743, "learning_rate": 8.852164500954997e-06, "loss": 0.5651, "step": 5380 }, { "epoch": 2.4342908844152906, "grad_norm": 0.29159918427467346, "learning_rate": 8.851697315963027e-06, "loss": 0.6465, "step": 5381 }, { "epoch": 2.4347432707532235, "grad_norm": 0.2574502229690552, "learning_rate": 8.851230048248518e-06, "loss": 0.5423, "step": 5382 }, { "epoch": 2.435195657091156, "grad_norm": 0.33362847566604614, "learning_rate": 8.8507626978215e-06, "loss": 0.7129, "step": 5383 }, { "epoch": 2.4356480434290884, "grad_norm": 0.2869953215122223, "learning_rate": 8.850295264692011e-06, "loss": 0.5663, "step": 5384 }, { "epoch": 2.436100429767021, "grad_norm": 0.29577234387397766, "learning_rate": 8.849827748870087e-06, "loss": 0.5468, "step": 5385 }, { "epoch": 2.4365528161049537, "grad_norm": 0.2803560495376587, "learning_rate": 8.849360150365776e-06, "loss": 0.5001, "step": 5386 }, { "epoch": 2.437005202442886, "grad_norm": 0.3033377528190613, "learning_rate": 8.848892469189114e-06, "loss": 0.5673, "step": 5387 }, { "epoch": 2.437457588780819, "grad_norm": 0.3291592299938202, "learning_rate": 8.84842470535015e-06, "loss": 0.5718, "step": 5388 }, { "epoch": 2.4379099751187514, "grad_norm": 0.30145835876464844, "learning_rate": 8.847956858858927e-06, "loss": 0.543, "step": 5389 }, { "epoch": 2.438362361456684, "grad_norm": 0.284882128238678, "learning_rate": 8.847488929725494e-06, "loss": 0.5034, "step": 5390 }, { "epoch": 2.4388147477946167, "grad_norm": 0.3367573022842407, "learning_rate": 8.847020917959898e-06, "loss": 0.6522, "step": 5391 }, { "epoch": 2.439267134132549, "grad_norm": 0.29866573214530945, "learning_rate": 8.846552823572196e-06, "loss": 0.5966, "step": 5392 }, { "epoch": 2.439719520470482, "grad_norm": 0.3149200677871704, "learning_rate": 8.846084646572439e-06, "loss": 0.5328, "step": 5393 }, { "epoch": 2.4401719068084144, "grad_norm": 0.35886910557746887, "learning_rate": 8.845616386970679e-06, "loss": 0.6393, "step": 5394 }, { "epoch": 2.440624293146347, "grad_norm": 0.31963664293289185, "learning_rate": 8.845148044776976e-06, "loss": 0.5927, "step": 5395 }, { "epoch": 2.4410766794842798, "grad_norm": 0.33402106165885925, "learning_rate": 8.844679620001388e-06, "loss": 0.5247, "step": 5396 }, { "epoch": 2.441529065822212, "grad_norm": 0.2932829260826111, "learning_rate": 8.844211112653973e-06, "loss": 0.4816, "step": 5397 }, { "epoch": 2.4419814521601446, "grad_norm": 0.329637348651886, "learning_rate": 8.843742522744796e-06, "loss": 0.4942, "step": 5398 }, { "epoch": 2.4424338384980775, "grad_norm": 0.32581427693367004, "learning_rate": 8.84327385028392e-06, "loss": 0.5665, "step": 5399 }, { "epoch": 2.44288622483601, "grad_norm": 0.36157557368278503, "learning_rate": 8.84280509528141e-06, "loss": 0.5778, "step": 5400 }, { "epoch": 2.44288622483601, "eval_loss": 0.5977576375007629, "eval_runtime": 26.1658, "eval_samples_per_second": 28.434, "eval_steps_per_second": 7.109, "step": 5400 }, { "epoch": 2.4433386111739424, "grad_norm": 0.35969844460487366, "learning_rate": 8.842336257747335e-06, "loss": 0.5595, "step": 5401 }, { "epoch": 2.4437909975118752, "grad_norm": 0.35302209854125977, "learning_rate": 8.841867337691762e-06, "loss": 0.4515, "step": 5402 }, { "epoch": 2.4442433838498077, "grad_norm": 0.3452773988246918, "learning_rate": 8.841398335124763e-06, "loss": 0.5106, "step": 5403 }, { "epoch": 2.4446957701877405, "grad_norm": 0.3759860098361969, "learning_rate": 8.840929250056411e-06, "loss": 0.5886, "step": 5404 }, { "epoch": 2.445148156525673, "grad_norm": 0.4010747969150543, "learning_rate": 8.84046008249678e-06, "loss": 0.6131, "step": 5405 }, { "epoch": 2.4456005428636054, "grad_norm": 0.36966726183891296, "learning_rate": 8.839990832455947e-06, "loss": 0.6209, "step": 5406 }, { "epoch": 2.4460529292015383, "grad_norm": 0.35617589950561523, "learning_rate": 8.839521499943989e-06, "loss": 0.4622, "step": 5407 }, { "epoch": 2.4465053155394707, "grad_norm": 0.37337586283683777, "learning_rate": 8.839052084970986e-06, "loss": 0.5624, "step": 5408 }, { "epoch": 2.446957701877403, "grad_norm": 0.3654063940048218, "learning_rate": 8.83858258754702e-06, "loss": 0.5817, "step": 5409 }, { "epoch": 2.447410088215336, "grad_norm": 0.43068212270736694, "learning_rate": 8.838113007682174e-06, "loss": 0.6009, "step": 5410 }, { "epoch": 2.4478624745532684, "grad_norm": 0.386129230260849, "learning_rate": 8.837643345386533e-06, "loss": 0.5289, "step": 5411 }, { "epoch": 2.448314860891201, "grad_norm": 0.41975995898246765, "learning_rate": 8.837173600670186e-06, "loss": 0.5244, "step": 5412 }, { "epoch": 2.4487672472291337, "grad_norm": 0.39822399616241455, "learning_rate": 8.83670377354322e-06, "loss": 0.5425, "step": 5413 }, { "epoch": 2.449219633567066, "grad_norm": 0.3845489025115967, "learning_rate": 8.836233864015725e-06, "loss": 0.5558, "step": 5414 }, { "epoch": 2.449672019904999, "grad_norm": 0.41170090436935425, "learning_rate": 8.83576387209779e-06, "loss": 0.4483, "step": 5415 }, { "epoch": 2.4501244062429315, "grad_norm": 0.37305787205696106, "learning_rate": 8.835293797799517e-06, "loss": 0.4706, "step": 5416 }, { "epoch": 2.450576792580864, "grad_norm": 0.4286135137081146, "learning_rate": 8.834823641130996e-06, "loss": 0.5341, "step": 5417 }, { "epoch": 2.451029178918797, "grad_norm": 0.4053560197353363, "learning_rate": 8.834353402102325e-06, "loss": 0.4988, "step": 5418 }, { "epoch": 2.4514815652567292, "grad_norm": 0.5109438896179199, "learning_rate": 8.833883080723604e-06, "loss": 0.5479, "step": 5419 }, { "epoch": 2.4519339515946617, "grad_norm": 0.5211220383644104, "learning_rate": 8.833412677004936e-06, "loss": 0.5801, "step": 5420 }, { "epoch": 2.4523863379325945, "grad_norm": 0.6040114760398865, "learning_rate": 8.83294219095642e-06, "loss": 0.6083, "step": 5421 }, { "epoch": 2.452838724270527, "grad_norm": 0.13380660116672516, "learning_rate": 8.832471622588164e-06, "loss": 1.2133, "step": 5422 }, { "epoch": 2.4532911106084594, "grad_norm": 0.1705181896686554, "learning_rate": 8.83200097191027e-06, "loss": 0.6989, "step": 5423 }, { "epoch": 2.4537434969463923, "grad_norm": 0.21337592601776123, "learning_rate": 8.831530238932853e-06, "loss": 0.5289, "step": 5424 }, { "epoch": 2.4541958832843247, "grad_norm": 0.263844758272171, "learning_rate": 8.831059423666016e-06, "loss": 0.5439, "step": 5425 }, { "epoch": 2.4546482696222576, "grad_norm": 0.22260598838329315, "learning_rate": 8.830588526119874e-06, "loss": 0.5281, "step": 5426 }, { "epoch": 2.45510065596019, "grad_norm": 0.2515573501586914, "learning_rate": 8.83011754630454e-06, "loss": 0.5627, "step": 5427 }, { "epoch": 2.4555530422981224, "grad_norm": 0.2875995934009552, "learning_rate": 8.82964648423013e-06, "loss": 0.6564, "step": 5428 }, { "epoch": 2.4560054286360553, "grad_norm": 0.286616712808609, "learning_rate": 8.82917533990676e-06, "loss": 0.6741, "step": 5429 }, { "epoch": 2.4564578149739877, "grad_norm": 0.2636890411376953, "learning_rate": 8.828704113344548e-06, "loss": 0.6286, "step": 5430 }, { "epoch": 2.4569102013119206, "grad_norm": 0.29177892208099365, "learning_rate": 8.828232804553615e-06, "loss": 0.6494, "step": 5431 }, { "epoch": 2.457362587649853, "grad_norm": 0.31826746463775635, "learning_rate": 8.827761413544084e-06, "loss": 0.6385, "step": 5432 }, { "epoch": 2.4578149739877855, "grad_norm": 0.3088007867336273, "learning_rate": 8.82728994032608e-06, "loss": 0.6848, "step": 5433 }, { "epoch": 2.4582673603257184, "grad_norm": 0.29524970054626465, "learning_rate": 8.826818384909726e-06, "loss": 0.6576, "step": 5434 }, { "epoch": 2.458719746663651, "grad_norm": 0.2738126218318939, "learning_rate": 8.826346747305151e-06, "loss": 0.5278, "step": 5435 }, { "epoch": 2.459172133001583, "grad_norm": 0.29413822293281555, "learning_rate": 8.825875027522485e-06, "loss": 0.5705, "step": 5436 }, { "epoch": 2.459624519339516, "grad_norm": 0.2851937711238861, "learning_rate": 8.825403225571857e-06, "loss": 0.4966, "step": 5437 }, { "epoch": 2.4600769056774485, "grad_norm": 0.2837385833263397, "learning_rate": 8.824931341463402e-06, "loss": 0.4795, "step": 5438 }, { "epoch": 2.460529292015381, "grad_norm": 0.2803003787994385, "learning_rate": 8.824459375207253e-06, "loss": 0.433, "step": 5439 }, { "epoch": 2.460981678353314, "grad_norm": 0.2901679277420044, "learning_rate": 8.823987326813548e-06, "loss": 0.6018, "step": 5440 }, { "epoch": 2.4614340646912463, "grad_norm": 0.3188396692276001, "learning_rate": 8.823515196292425e-06, "loss": 0.674, "step": 5441 }, { "epoch": 2.461886451029179, "grad_norm": 0.33738958835601807, "learning_rate": 8.823042983654022e-06, "loss": 0.6153, "step": 5442 }, { "epoch": 2.4623388373671116, "grad_norm": 0.3397676348686218, "learning_rate": 8.822570688908481e-06, "loss": 0.6204, "step": 5443 }, { "epoch": 2.462791223705044, "grad_norm": 0.3343583941459656, "learning_rate": 8.822098312065948e-06, "loss": 0.6248, "step": 5444 }, { "epoch": 2.463243610042977, "grad_norm": 0.32449957728385925, "learning_rate": 8.821625853136565e-06, "loss": 0.5107, "step": 5445 }, { "epoch": 2.4636959963809093, "grad_norm": 0.3463996946811676, "learning_rate": 8.821153312130483e-06, "loss": 0.5397, "step": 5446 }, { "epoch": 2.4641483827188417, "grad_norm": 0.3125450611114502, "learning_rate": 8.820680689057845e-06, "loss": 0.5359, "step": 5447 }, { "epoch": 2.4646007690567746, "grad_norm": 0.31700149178504944, "learning_rate": 8.820207983928808e-06, "loss": 0.5376, "step": 5448 }, { "epoch": 2.465053155394707, "grad_norm": 0.3517458438873291, "learning_rate": 8.819735196753518e-06, "loss": 0.6284, "step": 5449 }, { "epoch": 2.4655055417326395, "grad_norm": 0.35212165117263794, "learning_rate": 8.819262327542135e-06, "loss": 0.5274, "step": 5450 }, { "epoch": 2.4659579280705723, "grad_norm": 0.32930946350097656, "learning_rate": 8.818789376304808e-06, "loss": 0.5108, "step": 5451 }, { "epoch": 2.466410314408505, "grad_norm": 0.3142828643321991, "learning_rate": 8.8183163430517e-06, "loss": 0.4675, "step": 5452 }, { "epoch": 2.4668627007464377, "grad_norm": 0.3498554229736328, "learning_rate": 8.817843227792968e-06, "loss": 0.5789, "step": 5453 }, { "epoch": 2.46731508708437, "grad_norm": 0.3208390176296234, "learning_rate": 8.817370030538774e-06, "loss": 0.4351, "step": 5454 }, { "epoch": 2.4677674734223025, "grad_norm": 0.33452680706977844, "learning_rate": 8.81689675129928e-06, "loss": 0.5316, "step": 5455 }, { "epoch": 2.4682198597602354, "grad_norm": 0.3455788195133209, "learning_rate": 8.816423390084651e-06, "loss": 0.5354, "step": 5456 }, { "epoch": 2.468672246098168, "grad_norm": 0.3798849284648895, "learning_rate": 8.815949946905054e-06, "loss": 0.5307, "step": 5457 }, { "epoch": 2.4691246324361003, "grad_norm": 0.4235844314098358, "learning_rate": 8.815476421770657e-06, "loss": 0.6114, "step": 5458 }, { "epoch": 2.469577018774033, "grad_norm": 0.396506130695343, "learning_rate": 8.815002814691627e-06, "loss": 0.4543, "step": 5459 }, { "epoch": 2.4700294051119656, "grad_norm": 0.37156441807746887, "learning_rate": 8.814529125678139e-06, "loss": 0.5481, "step": 5460 }, { "epoch": 2.470481791449898, "grad_norm": 0.39429476857185364, "learning_rate": 8.814055354740366e-06, "loss": 0.5626, "step": 5461 }, { "epoch": 2.470934177787831, "grad_norm": 0.3900661766529083, "learning_rate": 8.81358150188848e-06, "loss": 0.5226, "step": 5462 }, { "epoch": 2.4713865641257633, "grad_norm": 0.36742550134658813, "learning_rate": 8.813107567132661e-06, "loss": 0.4775, "step": 5463 }, { "epoch": 2.471838950463696, "grad_norm": 0.3837139308452606, "learning_rate": 8.812633550483087e-06, "loss": 0.565, "step": 5464 }, { "epoch": 2.4722913368016286, "grad_norm": 0.41402918100357056, "learning_rate": 8.81215945194994e-06, "loss": 0.541, "step": 5465 }, { "epoch": 2.472743723139561, "grad_norm": 0.4743436872959137, "learning_rate": 8.811685271543399e-06, "loss": 0.5781, "step": 5466 }, { "epoch": 2.473196109477494, "grad_norm": 0.4517994523048401, "learning_rate": 8.81121100927365e-06, "loss": 0.6015, "step": 5467 }, { "epoch": 2.4736484958154263, "grad_norm": 0.42518162727355957, "learning_rate": 8.810736665150876e-06, "loss": 0.5241, "step": 5468 }, { "epoch": 2.474100882153359, "grad_norm": 0.4728601276874542, "learning_rate": 8.810262239185269e-06, "loss": 0.5302, "step": 5469 }, { "epoch": 2.4745532684912916, "grad_norm": 0.4844455122947693, "learning_rate": 8.809787731387015e-06, "loss": 0.6006, "step": 5470 }, { "epoch": 2.475005654829224, "grad_norm": 0.5375655293464661, "learning_rate": 8.809313141766305e-06, "loss": 0.5927, "step": 5471 }, { "epoch": 2.4754580411671565, "grad_norm": 0.17531976103782654, "learning_rate": 8.808838470333335e-06, "loss": 1.3645, "step": 5472 }, { "epoch": 2.4759104275050894, "grad_norm": 0.17121252417564392, "learning_rate": 8.808363717098294e-06, "loss": 0.5065, "step": 5473 }, { "epoch": 2.476362813843022, "grad_norm": 0.2330915778875351, "learning_rate": 8.807888882071381e-06, "loss": 0.5904, "step": 5474 }, { "epoch": 2.4768152001809547, "grad_norm": 0.22917616367340088, "learning_rate": 8.807413965262796e-06, "loss": 0.5219, "step": 5475 }, { "epoch": 2.477267586518887, "grad_norm": 0.26592373847961426, "learning_rate": 8.806938966682734e-06, "loss": 0.5928, "step": 5476 }, { "epoch": 2.4777199728568196, "grad_norm": 0.2583407461643219, "learning_rate": 8.806463886341402e-06, "loss": 0.6078, "step": 5477 }, { "epoch": 2.4781723591947524, "grad_norm": 0.294445276260376, "learning_rate": 8.805988724249e-06, "loss": 0.5596, "step": 5478 }, { "epoch": 2.478624745532685, "grad_norm": 0.28283995389938354, "learning_rate": 8.805513480415734e-06, "loss": 0.6396, "step": 5479 }, { "epoch": 2.4790771318706177, "grad_norm": 0.3049236238002777, "learning_rate": 8.805038154851809e-06, "loss": 0.6861, "step": 5480 }, { "epoch": 2.47952951820855, "grad_norm": 0.3004058599472046, "learning_rate": 8.804562747567435e-06, "loss": 0.5916, "step": 5481 }, { "epoch": 2.4799819045464826, "grad_norm": 0.29919689893722534, "learning_rate": 8.804087258572823e-06, "loss": 0.5817, "step": 5482 }, { "epoch": 2.4804342908844155, "grad_norm": 0.3200329542160034, "learning_rate": 8.803611687878184e-06, "loss": 0.6173, "step": 5483 }, { "epoch": 2.480886677222348, "grad_norm": 0.30821382999420166, "learning_rate": 8.803136035493734e-06, "loss": 0.5499, "step": 5484 }, { "epoch": 2.4813390635602803, "grad_norm": 0.3311557471752167, "learning_rate": 8.802660301429683e-06, "loss": 0.5317, "step": 5485 }, { "epoch": 2.481791449898213, "grad_norm": 0.2976446747779846, "learning_rate": 8.802184485696254e-06, "loss": 0.6632, "step": 5486 }, { "epoch": 2.4822438362361456, "grad_norm": 0.4763090908527374, "learning_rate": 8.801708588303663e-06, "loss": 0.5069, "step": 5487 }, { "epoch": 2.482696222574078, "grad_norm": 0.35900288820266724, "learning_rate": 8.801232609262131e-06, "loss": 0.5887, "step": 5488 }, { "epoch": 2.483148608912011, "grad_norm": 0.2977924048900604, "learning_rate": 8.800756548581883e-06, "loss": 0.6129, "step": 5489 }, { "epoch": 2.4836009952499434, "grad_norm": 0.27297067642211914, "learning_rate": 8.80028040627314e-06, "loss": 0.4325, "step": 5490 }, { "epoch": 2.4840533815878763, "grad_norm": 0.3581538498401642, "learning_rate": 8.79980418234613e-06, "loss": 0.6299, "step": 5491 }, { "epoch": 2.4845057679258087, "grad_norm": 0.3818656802177429, "learning_rate": 8.799327876811081e-06, "loss": 0.5843, "step": 5492 }, { "epoch": 2.484958154263741, "grad_norm": 0.3036392629146576, "learning_rate": 8.798851489678221e-06, "loss": 0.4942, "step": 5493 }, { "epoch": 2.485410540601674, "grad_norm": 0.31124022603034973, "learning_rate": 8.798375020957784e-06, "loss": 0.4684, "step": 5494 }, { "epoch": 2.4858629269396064, "grad_norm": 0.35157063603401184, "learning_rate": 8.79789847066e-06, "loss": 0.6759, "step": 5495 }, { "epoch": 2.486315313277539, "grad_norm": 0.3182127773761749, "learning_rate": 8.797421838795105e-06, "loss": 0.5444, "step": 5496 }, { "epoch": 2.4867676996154717, "grad_norm": 0.37617990374565125, "learning_rate": 8.796945125373338e-06, "loss": 0.7013, "step": 5497 }, { "epoch": 2.487220085953404, "grad_norm": 0.3663232624530792, "learning_rate": 8.796468330404932e-06, "loss": 0.5149, "step": 5498 }, { "epoch": 2.4876724722913366, "grad_norm": 0.30819031596183777, "learning_rate": 8.795991453900132e-06, "loss": 0.4926, "step": 5499 }, { "epoch": 2.4881248586292695, "grad_norm": 0.3504073917865753, "learning_rate": 8.79551449586918e-06, "loss": 0.5319, "step": 5500 }, { "epoch": 2.488577244967202, "grad_norm": 0.3637681305408478, "learning_rate": 8.795037456322315e-06, "loss": 0.6245, "step": 5501 }, { "epoch": 2.4890296313051348, "grad_norm": 0.3760673999786377, "learning_rate": 8.794560335269785e-06, "loss": 0.5902, "step": 5502 }, { "epoch": 2.489482017643067, "grad_norm": 0.34268510341644287, "learning_rate": 8.794083132721838e-06, "loss": 0.5009, "step": 5503 }, { "epoch": 2.4899344039809996, "grad_norm": 0.3456726372241974, "learning_rate": 8.793605848688719e-06, "loss": 0.5095, "step": 5504 }, { "epoch": 2.4903867903189325, "grad_norm": 0.33818769454956055, "learning_rate": 8.793128483180685e-06, "loss": 0.5133, "step": 5505 }, { "epoch": 2.490839176656865, "grad_norm": 0.35791754722595215, "learning_rate": 8.792651036207984e-06, "loss": 0.568, "step": 5506 }, { "epoch": 2.4912915629947974, "grad_norm": 0.37415561079978943, "learning_rate": 8.79217350778087e-06, "loss": 0.5367, "step": 5507 }, { "epoch": 2.4917439493327302, "grad_norm": 0.37867471575737, "learning_rate": 8.7916958979096e-06, "loss": 0.544, "step": 5508 }, { "epoch": 2.4921963356706627, "grad_norm": 0.4020763337612152, "learning_rate": 8.791218206604433e-06, "loss": 0.5914, "step": 5509 }, { "epoch": 2.492648722008595, "grad_norm": 0.3509797751903534, "learning_rate": 8.790740433875624e-06, "loss": 0.499, "step": 5510 }, { "epoch": 2.493101108346528, "grad_norm": 0.43847325444221497, "learning_rate": 8.790262579733438e-06, "loss": 0.69, "step": 5511 }, { "epoch": 2.4935534946844604, "grad_norm": 0.3778340518474579, "learning_rate": 8.789784644188138e-06, "loss": 0.4887, "step": 5512 }, { "epoch": 2.4940058810223933, "grad_norm": 0.3468453586101532, "learning_rate": 8.789306627249985e-06, "loss": 0.5303, "step": 5513 }, { "epoch": 2.4944582673603257, "grad_norm": 0.39931607246398926, "learning_rate": 8.788828528929247e-06, "loss": 0.5967, "step": 5514 }, { "epoch": 2.494910653698258, "grad_norm": 0.4232451319694519, "learning_rate": 8.788350349236194e-06, "loss": 0.5305, "step": 5515 }, { "epoch": 2.495363040036191, "grad_norm": 0.39510640501976013, "learning_rate": 8.787872088181093e-06, "loss": 0.4452, "step": 5516 }, { "epoch": 2.4958154263741235, "grad_norm": 0.3951968550682068, "learning_rate": 8.787393745774218e-06, "loss": 0.425, "step": 5517 }, { "epoch": 2.4962678127120563, "grad_norm": 0.4135085642337799, "learning_rate": 8.786915322025841e-06, "loss": 0.4786, "step": 5518 }, { "epoch": 2.4967201990499888, "grad_norm": 0.4086475670337677, "learning_rate": 8.786436816946238e-06, "loss": 0.5256, "step": 5519 }, { "epoch": 2.497172585387921, "grad_norm": 0.4065731167793274, "learning_rate": 8.785958230545684e-06, "loss": 0.3898, "step": 5520 }, { "epoch": 2.497624971725854, "grad_norm": 0.5750518441200256, "learning_rate": 8.785479562834457e-06, "loss": 0.5922, "step": 5521 }, { "epoch": 2.4980773580637865, "grad_norm": 0.18872061371803284, "learning_rate": 8.785000813822842e-06, "loss": 0.9599, "step": 5522 }, { "epoch": 2.498529744401719, "grad_norm": 0.17506158351898193, "learning_rate": 8.784521983521117e-06, "loss": 0.5387, "step": 5523 }, { "epoch": 2.498982130739652, "grad_norm": 0.22900308668613434, "learning_rate": 8.784043071939567e-06, "loss": 0.7292, "step": 5524 }, { "epoch": 2.4994345170775842, "grad_norm": 0.24263858795166016, "learning_rate": 8.783564079088478e-06, "loss": 0.6525, "step": 5525 }, { "epoch": 2.4998869034155167, "grad_norm": 0.34110721945762634, "learning_rate": 8.783085004978134e-06, "loss": 0.6242, "step": 5526 }, { "epoch": 2.5003392897534495, "grad_norm": 0.2513487935066223, "learning_rate": 8.782605849618829e-06, "loss": 0.6884, "step": 5527 }, { "epoch": 2.500791676091382, "grad_norm": 0.2791770398616791, "learning_rate": 8.78212661302085e-06, "loss": 0.5733, "step": 5528 }, { "epoch": 2.501244062429315, "grad_norm": 0.25769346952438354, "learning_rate": 8.781647295194492e-06, "loss": 0.5945, "step": 5529 }, { "epoch": 2.5016964487672473, "grad_norm": 0.25283312797546387, "learning_rate": 8.781167896150048e-06, "loss": 0.56, "step": 5530 }, { "epoch": 2.5021488351051797, "grad_norm": 0.26985058188438416, "learning_rate": 8.780688415897813e-06, "loss": 0.5206, "step": 5531 }, { "epoch": 2.502601221443112, "grad_norm": 0.2842089831829071, "learning_rate": 8.780208854448088e-06, "loss": 0.602, "step": 5532 }, { "epoch": 2.503053607781045, "grad_norm": 0.27800989151000977, "learning_rate": 8.77972921181117e-06, "loss": 0.5454, "step": 5533 }, { "epoch": 2.5035059941189775, "grad_norm": 0.3147179186344147, "learning_rate": 8.77924948799736e-06, "loss": 0.6035, "step": 5534 }, { "epoch": 2.5039583804569103, "grad_norm": 0.3761310577392578, "learning_rate": 8.778769683016962e-06, "loss": 0.6791, "step": 5535 }, { "epoch": 2.5044107667948428, "grad_norm": 0.30373695492744446, "learning_rate": 8.778289796880279e-06, "loss": 0.6146, "step": 5536 }, { "epoch": 2.504863153132775, "grad_norm": 0.30811429023742676, "learning_rate": 8.777809829597622e-06, "loss": 0.6173, "step": 5537 }, { "epoch": 2.505315539470708, "grad_norm": 0.30085769295692444, "learning_rate": 8.777329781179292e-06, "loss": 0.5498, "step": 5538 }, { "epoch": 2.5057679258086405, "grad_norm": 0.2978964149951935, "learning_rate": 8.776849651635605e-06, "loss": 0.5524, "step": 5539 }, { "epoch": 2.5062203121465734, "grad_norm": 0.3047117292881012, "learning_rate": 8.776369440976872e-06, "loss": 0.5527, "step": 5540 }, { "epoch": 2.506672698484506, "grad_norm": 0.3167020082473755, "learning_rate": 8.775889149213404e-06, "loss": 0.5706, "step": 5541 }, { "epoch": 2.5071250848224382, "grad_norm": 0.29579490423202515, "learning_rate": 8.775408776355518e-06, "loss": 0.5198, "step": 5542 }, { "epoch": 2.507577471160371, "grad_norm": 0.3150225579738617, "learning_rate": 8.77492832241353e-06, "loss": 0.5629, "step": 5543 }, { "epoch": 2.5080298574983035, "grad_norm": 0.3396511673927307, "learning_rate": 8.774447787397757e-06, "loss": 0.6428, "step": 5544 }, { "epoch": 2.5084822438362364, "grad_norm": 0.35510683059692383, "learning_rate": 8.773967171318523e-06, "loss": 0.5946, "step": 5545 }, { "epoch": 2.508934630174169, "grad_norm": 0.30553972721099854, "learning_rate": 8.77348647418615e-06, "loss": 0.5042, "step": 5546 }, { "epoch": 2.5093870165121013, "grad_norm": 0.32836756110191345, "learning_rate": 8.773005696010958e-06, "loss": 0.6058, "step": 5547 }, { "epoch": 2.5098394028500337, "grad_norm": 0.34370437264442444, "learning_rate": 8.772524836803276e-06, "loss": 0.5688, "step": 5548 }, { "epoch": 2.5102917891879666, "grad_norm": 0.3454599976539612, "learning_rate": 8.77204389657343e-06, "loss": 0.5365, "step": 5549 }, { "epoch": 2.510744175525899, "grad_norm": 0.2959088087081909, "learning_rate": 8.771562875331752e-06, "loss": 0.4206, "step": 5550 }, { "epoch": 2.511196561863832, "grad_norm": 0.35884201526641846, "learning_rate": 8.771081773088568e-06, "loss": 0.5955, "step": 5551 }, { "epoch": 2.5116489482017643, "grad_norm": 0.33788859844207764, "learning_rate": 8.770600589854212e-06, "loss": 0.5253, "step": 5552 }, { "epoch": 2.5121013345396968, "grad_norm": 0.3288978338241577, "learning_rate": 8.770119325639021e-06, "loss": 0.5147, "step": 5553 }, { "epoch": 2.5125537208776296, "grad_norm": 0.3277188539505005, "learning_rate": 8.769637980453328e-06, "loss": 0.5208, "step": 5554 }, { "epoch": 2.513006107215562, "grad_norm": 0.3025349974632263, "learning_rate": 8.769156554307473e-06, "loss": 0.4191, "step": 5555 }, { "epoch": 2.513458493553495, "grad_norm": 0.36155587434768677, "learning_rate": 8.768675047211795e-06, "loss": 0.602, "step": 5556 }, { "epoch": 2.5139108798914274, "grad_norm": 0.37133994698524475, "learning_rate": 8.768193459176635e-06, "loss": 0.5628, "step": 5557 }, { "epoch": 2.51436326622936, "grad_norm": 0.34585481882095337, "learning_rate": 8.767711790212335e-06, "loss": 0.5158, "step": 5558 }, { "epoch": 2.5148156525672922, "grad_norm": 0.35075026750564575, "learning_rate": 8.767230040329243e-06, "loss": 0.4939, "step": 5559 }, { "epoch": 2.515268038905225, "grad_norm": 0.41321471333503723, "learning_rate": 8.7667482095377e-06, "loss": 0.5936, "step": 5560 }, { "epoch": 2.5157204252431575, "grad_norm": 0.37593916058540344, "learning_rate": 8.766266297848058e-06, "loss": 0.4638, "step": 5561 }, { "epoch": 2.5161728115810904, "grad_norm": 0.413801908493042, "learning_rate": 8.765784305270669e-06, "loss": 0.5435, "step": 5562 }, { "epoch": 2.516625197919023, "grad_norm": 0.36863306164741516, "learning_rate": 8.76530223181588e-06, "loss": 0.4747, "step": 5563 }, { "epoch": 2.5170775842569553, "grad_norm": 0.4153030812740326, "learning_rate": 8.764820077494047e-06, "loss": 0.5654, "step": 5564 }, { "epoch": 2.517529970594888, "grad_norm": 0.3757514953613281, "learning_rate": 8.764337842315524e-06, "loss": 0.4944, "step": 5565 }, { "epoch": 2.5179823569328206, "grad_norm": 0.4394760727882385, "learning_rate": 8.76385552629067e-06, "loss": 0.5335, "step": 5566 }, { "epoch": 2.5184347432707535, "grad_norm": 0.39326348900794983, "learning_rate": 8.763373129429842e-06, "loss": 0.5209, "step": 5567 }, { "epoch": 2.518887129608686, "grad_norm": 0.45392856001853943, "learning_rate": 8.7628906517434e-06, "loss": 0.5917, "step": 5568 }, { "epoch": 2.5193395159466183, "grad_norm": 0.4264508783817291, "learning_rate": 8.762408093241708e-06, "loss": 0.5288, "step": 5569 }, { "epoch": 2.5197919022845507, "grad_norm": 0.48192253708839417, "learning_rate": 8.76192545393513e-06, "loss": 0.5174, "step": 5570 }, { "epoch": 2.5202442886224836, "grad_norm": 0.5121484994888306, "learning_rate": 8.761442733834026e-06, "loss": 0.532, "step": 5571 }, { "epoch": 2.520696674960416, "grad_norm": 0.12397557497024536, "learning_rate": 8.760959932948772e-06, "loss": 1.1354, "step": 5572 }, { "epoch": 2.521149061298349, "grad_norm": 0.19877079129219055, "learning_rate": 8.760477051289731e-06, "loss": 1.0012, "step": 5573 }, { "epoch": 2.5216014476362814, "grad_norm": 0.21857264637947083, "learning_rate": 8.759994088867276e-06, "loss": 0.7228, "step": 5574 }, { "epoch": 2.522053833974214, "grad_norm": 0.2364625632762909, "learning_rate": 8.75951104569178e-06, "loss": 0.6154, "step": 5575 }, { "epoch": 2.5225062203121467, "grad_norm": 0.210057333111763, "learning_rate": 8.759027921773614e-06, "loss": 0.5131, "step": 5576 }, { "epoch": 2.522958606650079, "grad_norm": 0.2563864290714264, "learning_rate": 8.758544717123158e-06, "loss": 0.6393, "step": 5577 }, { "epoch": 2.523410992988012, "grad_norm": 0.2606077790260315, "learning_rate": 8.758061431750786e-06, "loss": 0.6127, "step": 5578 }, { "epoch": 2.5238633793259444, "grad_norm": 0.2907216250896454, "learning_rate": 8.757578065666882e-06, "loss": 0.665, "step": 5579 }, { "epoch": 2.524315765663877, "grad_norm": 0.29943183064460754, "learning_rate": 8.757094618881824e-06, "loss": 0.7515, "step": 5580 }, { "epoch": 2.5247681520018097, "grad_norm": 0.2823304533958435, "learning_rate": 8.756611091405995e-06, "loss": 0.5903, "step": 5581 }, { "epoch": 2.525220538339742, "grad_norm": 0.26731938123703003, "learning_rate": 8.756127483249782e-06, "loss": 0.5631, "step": 5582 }, { "epoch": 2.5256729246776746, "grad_norm": 0.3200847804546356, "learning_rate": 8.755643794423569e-06, "loss": 0.7073, "step": 5583 }, { "epoch": 2.5261253110156074, "grad_norm": 0.2952301502227783, "learning_rate": 8.755160024937745e-06, "loss": 0.6835, "step": 5584 }, { "epoch": 2.52657769735354, "grad_norm": 0.2822968065738678, "learning_rate": 8.754676174802698e-06, "loss": 0.5646, "step": 5585 }, { "epoch": 2.5270300836914723, "grad_norm": 0.2863563895225525, "learning_rate": 8.754192244028825e-06, "loss": 0.5856, "step": 5586 }, { "epoch": 2.527482470029405, "grad_norm": 0.3107351064682007, "learning_rate": 8.753708232626514e-06, "loss": 0.6273, "step": 5587 }, { "epoch": 2.5279348563673376, "grad_norm": 0.30540451407432556, "learning_rate": 8.753224140606162e-06, "loss": 0.4533, "step": 5588 }, { "epoch": 2.5283872427052705, "grad_norm": 0.28850239515304565, "learning_rate": 8.752739967978166e-06, "loss": 0.4728, "step": 5589 }, { "epoch": 2.528839629043203, "grad_norm": 0.32707250118255615, "learning_rate": 8.752255714752923e-06, "loss": 0.5818, "step": 5590 }, { "epoch": 2.5292920153811354, "grad_norm": 0.3007029592990875, "learning_rate": 8.751771380940834e-06, "loss": 0.5612, "step": 5591 }, { "epoch": 2.5297444017190682, "grad_norm": 0.330806702375412, "learning_rate": 8.751286966552304e-06, "loss": 0.5789, "step": 5592 }, { "epoch": 2.5301967880570007, "grad_norm": 0.2931225001811981, "learning_rate": 8.750802471597733e-06, "loss": 0.5072, "step": 5593 }, { "epoch": 2.5306491743949335, "grad_norm": 0.3121397793292999, "learning_rate": 8.750317896087525e-06, "loss": 0.569, "step": 5594 }, { "epoch": 2.531101560732866, "grad_norm": 0.3471183776855469, "learning_rate": 8.749833240032092e-06, "loss": 0.661, "step": 5595 }, { "epoch": 2.5315539470707984, "grad_norm": 0.2589404881000519, "learning_rate": 8.749348503441841e-06, "loss": 0.4077, "step": 5596 }, { "epoch": 2.532006333408731, "grad_norm": 0.32980766892433167, "learning_rate": 8.748863686327183e-06, "loss": 0.5648, "step": 5597 }, { "epoch": 2.5324587197466637, "grad_norm": 0.32762375473976135, "learning_rate": 8.748378788698529e-06, "loss": 0.5106, "step": 5598 }, { "epoch": 2.532911106084596, "grad_norm": 0.28741684556007385, "learning_rate": 8.747893810566292e-06, "loss": 0.3661, "step": 5599 }, { "epoch": 2.533363492422529, "grad_norm": 0.323660671710968, "learning_rate": 8.74740875194089e-06, "loss": 0.5063, "step": 5600 }, { "epoch": 2.533363492422529, "eval_loss": 0.5955378413200378, "eval_runtime": 25.6972, "eval_samples_per_second": 28.953, "eval_steps_per_second": 7.238, "step": 5600 }, { "epoch": 2.5338158787604614, "grad_norm": 0.35302722454071045, "learning_rate": 8.746923612832742e-06, "loss": 0.5463, "step": 5601 }, { "epoch": 2.534268265098394, "grad_norm": 0.32410863041877747, "learning_rate": 8.746438393252266e-06, "loss": 0.6052, "step": 5602 }, { "epoch": 2.5347206514363267, "grad_norm": 0.336868017911911, "learning_rate": 8.745953093209881e-06, "loss": 0.539, "step": 5603 }, { "epoch": 2.535173037774259, "grad_norm": 0.3916645348072052, "learning_rate": 8.745467712716011e-06, "loss": 0.5656, "step": 5604 }, { "epoch": 2.535625424112192, "grad_norm": 0.3937990665435791, "learning_rate": 8.744982251781081e-06, "loss": 0.5411, "step": 5605 }, { "epoch": 2.5360778104501245, "grad_norm": 0.3957420587539673, "learning_rate": 8.744496710415518e-06, "loss": 0.5146, "step": 5606 }, { "epoch": 2.536530196788057, "grad_norm": 0.3657575249671936, "learning_rate": 8.744011088629748e-06, "loss": 0.4544, "step": 5607 }, { "epoch": 2.5369825831259893, "grad_norm": 0.43471744656562805, "learning_rate": 8.743525386434202e-06, "loss": 0.6797, "step": 5608 }, { "epoch": 2.537434969463922, "grad_norm": 0.3351464569568634, "learning_rate": 8.74303960383931e-06, "loss": 0.4644, "step": 5609 }, { "epoch": 2.5378873558018546, "grad_norm": 0.3854024112224579, "learning_rate": 8.742553740855507e-06, "loss": 0.5524, "step": 5610 }, { "epoch": 2.5383397421397875, "grad_norm": 0.44093388319015503, "learning_rate": 8.742067797493226e-06, "loss": 0.5709, "step": 5611 }, { "epoch": 2.53879212847772, "grad_norm": 0.4256325662136078, "learning_rate": 8.741581773762904e-06, "loss": 0.5452, "step": 5612 }, { "epoch": 2.5392445148156524, "grad_norm": 0.4239073693752289, "learning_rate": 8.74109566967498e-06, "loss": 0.547, "step": 5613 }, { "epoch": 2.5396969011535853, "grad_norm": 0.4154922664165497, "learning_rate": 8.740609485239895e-06, "loss": 0.4253, "step": 5614 }, { "epoch": 2.5401492874915177, "grad_norm": 0.3999197781085968, "learning_rate": 8.740123220468088e-06, "loss": 0.4763, "step": 5615 }, { "epoch": 2.5406016738294506, "grad_norm": 0.4685239791870117, "learning_rate": 8.739636875370003e-06, "loss": 0.6161, "step": 5616 }, { "epoch": 2.541054060167383, "grad_norm": 0.4887843728065491, "learning_rate": 8.739150449956088e-06, "loss": 0.567, "step": 5617 }, { "epoch": 2.5415064465053154, "grad_norm": 0.4283953011035919, "learning_rate": 8.738663944236787e-06, "loss": 0.5405, "step": 5618 }, { "epoch": 2.541958832843248, "grad_norm": 0.46427643299102783, "learning_rate": 8.73817735822255e-06, "loss": 0.5799, "step": 5619 }, { "epoch": 2.5424112191811807, "grad_norm": 0.460715115070343, "learning_rate": 8.737690691923827e-06, "loss": 0.5074, "step": 5620 }, { "epoch": 2.542863605519113, "grad_norm": 0.5935936570167542, "learning_rate": 8.73720394535107e-06, "loss": 0.4881, "step": 5621 }, { "epoch": 2.543315991857046, "grad_norm": 0.13201503455638885, "learning_rate": 8.736717118514733e-06, "loss": 1.4032, "step": 5622 }, { "epoch": 2.5437683781949785, "grad_norm": 0.16621127724647522, "learning_rate": 8.736230211425271e-06, "loss": 1.0558, "step": 5623 }, { "epoch": 2.544220764532911, "grad_norm": 0.23950935900211334, "learning_rate": 8.735743224093142e-06, "loss": 0.9122, "step": 5624 }, { "epoch": 2.544673150870844, "grad_norm": 0.2520673871040344, "learning_rate": 8.735256156528804e-06, "loss": 0.6664, "step": 5625 }, { "epoch": 2.545125537208776, "grad_norm": 0.2517290711402893, "learning_rate": 8.734769008742718e-06, "loss": 0.5912, "step": 5626 }, { "epoch": 2.545577923546709, "grad_norm": 0.22960475087165833, "learning_rate": 8.734281780745348e-06, "loss": 0.4918, "step": 5627 }, { "epoch": 2.5460303098846415, "grad_norm": 0.28992030024528503, "learning_rate": 8.733794472547157e-06, "loss": 0.6979, "step": 5628 }, { "epoch": 2.546482696222574, "grad_norm": 0.2601953446865082, "learning_rate": 8.73330708415861e-06, "loss": 0.5439, "step": 5629 }, { "epoch": 2.546935082560507, "grad_norm": 0.27636221051216125, "learning_rate": 8.732819615590175e-06, "loss": 0.7353, "step": 5630 }, { "epoch": 2.5473874688984393, "grad_norm": 0.26246243715286255, "learning_rate": 8.732332066852323e-06, "loss": 0.6194, "step": 5631 }, { "epoch": 2.547839855236372, "grad_norm": 0.298247754573822, "learning_rate": 8.731844437955523e-06, "loss": 0.5807, "step": 5632 }, { "epoch": 2.5482922415743046, "grad_norm": 0.3251953721046448, "learning_rate": 8.73135672891025e-06, "loss": 0.6004, "step": 5633 }, { "epoch": 2.548744627912237, "grad_norm": 0.2805132269859314, "learning_rate": 8.730868939726975e-06, "loss": 0.5924, "step": 5634 }, { "epoch": 2.5491970142501694, "grad_norm": 0.26805180311203003, "learning_rate": 8.730381070416177e-06, "loss": 0.4821, "step": 5635 }, { "epoch": 2.5496494005881023, "grad_norm": 0.32823073863983154, "learning_rate": 8.729893120988333e-06, "loss": 0.6299, "step": 5636 }, { "epoch": 2.5501017869260347, "grad_norm": 0.3108392357826233, "learning_rate": 8.729405091453924e-06, "loss": 0.5801, "step": 5637 }, { "epoch": 2.5505541732639676, "grad_norm": 0.30375343561172485, "learning_rate": 8.728916981823427e-06, "loss": 0.5939, "step": 5638 }, { "epoch": 2.5510065596019, "grad_norm": 0.34095048904418945, "learning_rate": 8.72842879210733e-06, "loss": 0.5125, "step": 5639 }, { "epoch": 2.5514589459398325, "grad_norm": 0.3268079161643982, "learning_rate": 8.727940522316115e-06, "loss": 0.5836, "step": 5640 }, { "epoch": 2.5519113322777653, "grad_norm": 0.3426135182380676, "learning_rate": 8.727452172460273e-06, "loss": 0.5597, "step": 5641 }, { "epoch": 2.5523637186156978, "grad_norm": 0.31250226497650146, "learning_rate": 8.726963742550286e-06, "loss": 0.4853, "step": 5642 }, { "epoch": 2.5528161049536306, "grad_norm": 0.3214510381221771, "learning_rate": 8.726475232596647e-06, "loss": 0.5804, "step": 5643 }, { "epoch": 2.553268491291563, "grad_norm": 0.314134418964386, "learning_rate": 8.725986642609849e-06, "loss": 0.5812, "step": 5644 }, { "epoch": 2.5537208776294955, "grad_norm": 0.3217645287513733, "learning_rate": 8.725497972600382e-06, "loss": 0.5815, "step": 5645 }, { "epoch": 2.554173263967428, "grad_norm": 0.3301340937614441, "learning_rate": 8.725009222578744e-06, "loss": 0.5721, "step": 5646 }, { "epoch": 2.554625650305361, "grad_norm": 0.35982146859169006, "learning_rate": 8.72452039255543e-06, "loss": 0.6859, "step": 5647 }, { "epoch": 2.5550780366432932, "grad_norm": 0.36838826537132263, "learning_rate": 8.72403148254094e-06, "loss": 0.6735, "step": 5648 }, { "epoch": 2.555530422981226, "grad_norm": 0.33974727988243103, "learning_rate": 8.723542492545773e-06, "loss": 0.5913, "step": 5649 }, { "epoch": 2.5559828093191586, "grad_norm": 0.3619416654109955, "learning_rate": 8.723053422580432e-06, "loss": 0.6439, "step": 5650 }, { "epoch": 2.556435195657091, "grad_norm": 0.3243519365787506, "learning_rate": 8.72256427265542e-06, "loss": 0.5435, "step": 5651 }, { "epoch": 2.556887581995024, "grad_norm": 0.33117520809173584, "learning_rate": 8.722075042781243e-06, "loss": 0.5202, "step": 5652 }, { "epoch": 2.5573399683329563, "grad_norm": 0.35326218605041504, "learning_rate": 8.721585732968408e-06, "loss": 0.549, "step": 5653 }, { "epoch": 2.557792354670889, "grad_norm": 0.385469913482666, "learning_rate": 8.721096343227423e-06, "loss": 0.686, "step": 5654 }, { "epoch": 2.5582447410088216, "grad_norm": 0.36834266781806946, "learning_rate": 8.7206068735688e-06, "loss": 0.5806, "step": 5655 }, { "epoch": 2.558697127346754, "grad_norm": 0.4262550473213196, "learning_rate": 8.72011732400305e-06, "loss": 0.6256, "step": 5656 }, { "epoch": 2.5591495136846865, "grad_norm": 0.353619784116745, "learning_rate": 8.719627694540688e-06, "loss": 0.4489, "step": 5657 }, { "epoch": 2.5596019000226193, "grad_norm": 0.3119400143623352, "learning_rate": 8.719137985192231e-06, "loss": 0.4187, "step": 5658 }, { "epoch": 2.5600542863605518, "grad_norm": 0.3942684531211853, "learning_rate": 8.718648195968194e-06, "loss": 0.6301, "step": 5659 }, { "epoch": 2.5605066726984846, "grad_norm": 0.3717525899410248, "learning_rate": 8.718158326879095e-06, "loss": 0.4965, "step": 5660 }, { "epoch": 2.560959059036417, "grad_norm": 0.36713239550590515, "learning_rate": 8.71766837793546e-06, "loss": 0.4128, "step": 5661 }, { "epoch": 2.5614114453743495, "grad_norm": 0.40771400928497314, "learning_rate": 8.717178349147806e-06, "loss": 0.6062, "step": 5662 }, { "epoch": 2.5618638317122824, "grad_norm": 0.3795808255672455, "learning_rate": 8.71668824052666e-06, "loss": 0.4497, "step": 5663 }, { "epoch": 2.562316218050215, "grad_norm": 0.39484071731567383, "learning_rate": 8.716198052082549e-06, "loss": 0.5114, "step": 5664 }, { "epoch": 2.5627686043881477, "grad_norm": 0.37123265862464905, "learning_rate": 8.715707783825998e-06, "loss": 0.4225, "step": 5665 }, { "epoch": 2.56322099072608, "grad_norm": 0.39541491866111755, "learning_rate": 8.715217435767537e-06, "loss": 0.5446, "step": 5666 }, { "epoch": 2.5636733770640125, "grad_norm": 0.4859430193901062, "learning_rate": 8.7147270079177e-06, "loss": 0.6432, "step": 5667 }, { "epoch": 2.5641257634019454, "grad_norm": 0.46263882517814636, "learning_rate": 8.714236500287019e-06, "loss": 0.5864, "step": 5668 }, { "epoch": 2.564578149739878, "grad_norm": 0.4472268521785736, "learning_rate": 8.713745912886026e-06, "loss": 0.5494, "step": 5669 }, { "epoch": 2.5650305360778103, "grad_norm": 0.49752479791641235, "learning_rate": 8.713255245725258e-06, "loss": 0.4868, "step": 5670 }, { "epoch": 2.565482922415743, "grad_norm": 0.6582954525947571, "learning_rate": 8.712764498815255e-06, "loss": 0.6495, "step": 5671 }, { "epoch": 2.5659353087536756, "grad_norm": 0.12223484367132187, "learning_rate": 8.712273672166555e-06, "loss": 1.0663, "step": 5672 }, { "epoch": 2.566387695091608, "grad_norm": 0.2712564766407013, "learning_rate": 8.711782765789702e-06, "loss": 1.0468, "step": 5673 }, { "epoch": 2.566840081429541, "grad_norm": 0.2335863560438156, "learning_rate": 8.711291779695235e-06, "loss": 0.6177, "step": 5674 }, { "epoch": 2.5672924677674733, "grad_norm": 0.303033709526062, "learning_rate": 8.710800713893703e-06, "loss": 0.6396, "step": 5675 }, { "epoch": 2.567744854105406, "grad_norm": 0.2617935240268707, "learning_rate": 8.710309568395648e-06, "loss": 0.7409, "step": 5676 }, { "epoch": 2.5681972404433386, "grad_norm": 0.26163285970687866, "learning_rate": 8.709818343211623e-06, "loss": 0.6564, "step": 5677 }, { "epoch": 2.568649626781271, "grad_norm": 0.2651450037956238, "learning_rate": 8.709327038352175e-06, "loss": 0.5418, "step": 5678 }, { "epoch": 2.569102013119204, "grad_norm": 0.24651269614696503, "learning_rate": 8.708835653827857e-06, "loss": 0.5433, "step": 5679 }, { "epoch": 2.5695543994571364, "grad_norm": 0.2873004972934723, "learning_rate": 8.70834418964922e-06, "loss": 0.6392, "step": 5680 }, { "epoch": 2.5700067857950692, "grad_norm": 0.2958512306213379, "learning_rate": 8.707852645826824e-06, "loss": 0.6077, "step": 5681 }, { "epoch": 2.5704591721330017, "grad_norm": 0.2835501730442047, "learning_rate": 8.707361022371222e-06, "loss": 0.5743, "step": 5682 }, { "epoch": 2.570911558470934, "grad_norm": 0.28843924403190613, "learning_rate": 8.706869319292972e-06, "loss": 0.5957, "step": 5683 }, { "epoch": 2.5713639448088665, "grad_norm": 0.27191323041915894, "learning_rate": 8.706377536602639e-06, "loss": 0.5236, "step": 5684 }, { "epoch": 2.5718163311467994, "grad_norm": 0.31444916129112244, "learning_rate": 8.70588567431078e-06, "loss": 0.6492, "step": 5685 }, { "epoch": 2.572268717484732, "grad_norm": 0.33866432309150696, "learning_rate": 8.70539373242796e-06, "loss": 0.5875, "step": 5686 }, { "epoch": 2.5727211038226647, "grad_norm": 0.2855691611766815, "learning_rate": 8.704901710964746e-06, "loss": 0.5947, "step": 5687 }, { "epoch": 2.573173490160597, "grad_norm": 0.32749077677726746, "learning_rate": 8.704409609931704e-06, "loss": 0.6345, "step": 5688 }, { "epoch": 2.5736258764985296, "grad_norm": 0.2950423061847687, "learning_rate": 8.703917429339401e-06, "loss": 0.5284, "step": 5689 }, { "epoch": 2.5740782628364625, "grad_norm": 0.32511845231056213, "learning_rate": 8.70342516919841e-06, "loss": 0.6683, "step": 5690 }, { "epoch": 2.574530649174395, "grad_norm": 0.3164081871509552, "learning_rate": 8.702932829519303e-06, "loss": 0.5594, "step": 5691 }, { "epoch": 2.5749830355123278, "grad_norm": 0.31446385383605957, "learning_rate": 8.702440410312654e-06, "loss": 0.4412, "step": 5692 }, { "epoch": 2.57543542185026, "grad_norm": 0.30646398663520813, "learning_rate": 8.701947911589036e-06, "loss": 0.4966, "step": 5693 }, { "epoch": 2.5758878081881926, "grad_norm": 0.3169136345386505, "learning_rate": 8.70145533335903e-06, "loss": 0.5262, "step": 5694 }, { "epoch": 2.576340194526125, "grad_norm": 0.34252792596817017, "learning_rate": 8.700962675633213e-06, "loss": 0.5969, "step": 5695 }, { "epoch": 2.576792580864058, "grad_norm": 0.31373050808906555, "learning_rate": 8.700469938422167e-06, "loss": 0.5658, "step": 5696 }, { "epoch": 2.5772449672019904, "grad_norm": 0.34431055188179016, "learning_rate": 8.699977121736473e-06, "loss": 0.6004, "step": 5697 }, { "epoch": 2.5776973535399232, "grad_norm": 0.32223257422447205, "learning_rate": 8.699484225586717e-06, "loss": 0.5642, "step": 5698 }, { "epoch": 2.5781497398778557, "grad_norm": 0.33540141582489014, "learning_rate": 8.698991249983481e-06, "loss": 0.543, "step": 5699 }, { "epoch": 2.578602126215788, "grad_norm": 0.3270452320575714, "learning_rate": 8.698498194937358e-06, "loss": 0.4912, "step": 5700 }, { "epoch": 2.579054512553721, "grad_norm": 0.3769868314266205, "learning_rate": 8.698005060458934e-06, "loss": 0.615, "step": 5701 }, { "epoch": 2.5795068988916534, "grad_norm": 0.38691726326942444, "learning_rate": 8.6975118465588e-06, "loss": 0.6856, "step": 5702 }, { "epoch": 2.5799592852295863, "grad_norm": 0.3647996783256531, "learning_rate": 8.69701855324755e-06, "loss": 0.4814, "step": 5703 }, { "epoch": 2.5804116715675187, "grad_norm": 0.36135345697402954, "learning_rate": 8.696525180535778e-06, "loss": 0.6195, "step": 5704 }, { "epoch": 2.580864057905451, "grad_norm": 0.40125179290771484, "learning_rate": 8.696031728434078e-06, "loss": 0.5817, "step": 5705 }, { "epoch": 2.5813164442433836, "grad_norm": 0.3545471131801605, "learning_rate": 8.695538196953052e-06, "loss": 0.4376, "step": 5706 }, { "epoch": 2.5817688305813165, "grad_norm": 0.3609042465686798, "learning_rate": 8.695044586103297e-06, "loss": 0.4997, "step": 5707 }, { "epoch": 2.582221216919249, "grad_norm": 0.38084378838539124, "learning_rate": 8.694550895895413e-06, "loss": 0.4796, "step": 5708 }, { "epoch": 2.5826736032571818, "grad_norm": 0.425731360912323, "learning_rate": 8.694057126340005e-06, "loss": 0.527, "step": 5709 }, { "epoch": 2.583125989595114, "grad_norm": 0.34577658772468567, "learning_rate": 8.693563277447678e-06, "loss": 0.4256, "step": 5710 }, { "epoch": 2.5835783759330466, "grad_norm": 0.3713057041168213, "learning_rate": 8.693069349229038e-06, "loss": 0.529, "step": 5711 }, { "epoch": 2.5840307622709795, "grad_norm": 0.4152858853340149, "learning_rate": 8.69257534169469e-06, "loss": 0.5209, "step": 5712 }, { "epoch": 2.584483148608912, "grad_norm": 0.3900648355484009, "learning_rate": 8.692081254855248e-06, "loss": 0.4697, "step": 5713 }, { "epoch": 2.584935534946845, "grad_norm": 0.44749006628990173, "learning_rate": 8.691587088721322e-06, "loss": 0.5615, "step": 5714 }, { "epoch": 2.5853879212847772, "grad_norm": 0.4017035961151123, "learning_rate": 8.691092843303524e-06, "loss": 0.4816, "step": 5715 }, { "epoch": 2.5858403076227097, "grad_norm": 0.4374752342700958, "learning_rate": 8.69059851861247e-06, "loss": 0.4761, "step": 5716 }, { "epoch": 2.5862926939606425, "grad_norm": 0.370530903339386, "learning_rate": 8.690104114658777e-06, "loss": 0.4519, "step": 5717 }, { "epoch": 2.586745080298575, "grad_norm": 0.42756134271621704, "learning_rate": 8.689609631453059e-06, "loss": 0.4923, "step": 5718 }, { "epoch": 2.587197466636508, "grad_norm": 0.4655548632144928, "learning_rate": 8.689115069005945e-06, "loss": 0.5419, "step": 5719 }, { "epoch": 2.5876498529744403, "grad_norm": 0.5325247645378113, "learning_rate": 8.688620427328047e-06, "loss": 0.6121, "step": 5720 }, { "epoch": 2.5881022393123727, "grad_norm": 0.4884117841720581, "learning_rate": 8.688125706429995e-06, "loss": 0.4619, "step": 5721 }, { "epoch": 2.588554625650305, "grad_norm": 0.14091332256793976, "learning_rate": 8.68763090632241e-06, "loss": 1.2102, "step": 5722 }, { "epoch": 2.589007011988238, "grad_norm": 0.33930104970932007, "learning_rate": 8.687136027015921e-06, "loss": 0.8043, "step": 5723 }, { "epoch": 2.5894593983261704, "grad_norm": 0.25123104453086853, "learning_rate": 8.686641068521155e-06, "loss": 0.7142, "step": 5724 }, { "epoch": 2.5899117846641033, "grad_norm": 0.2600346803665161, "learning_rate": 8.686146030848744e-06, "loss": 0.5699, "step": 5725 }, { "epoch": 2.5903641710020358, "grad_norm": 0.2569544315338135, "learning_rate": 8.68565091400932e-06, "loss": 0.6092, "step": 5726 }, { "epoch": 2.590816557339968, "grad_norm": 0.23097549378871918, "learning_rate": 8.685155718013514e-06, "loss": 0.5039, "step": 5727 }, { "epoch": 2.591268943677901, "grad_norm": 0.2892136871814728, "learning_rate": 8.684660442871963e-06, "loss": 0.6528, "step": 5728 }, { "epoch": 2.5917213300158335, "grad_norm": 0.27112826704978943, "learning_rate": 8.684165088595304e-06, "loss": 0.5707, "step": 5729 }, { "epoch": 2.5921737163537664, "grad_norm": 0.2675671875476837, "learning_rate": 8.683669655194176e-06, "loss": 0.5866, "step": 5730 }, { "epoch": 2.592626102691699, "grad_norm": 0.3115674555301666, "learning_rate": 8.683174142679218e-06, "loss": 0.7591, "step": 5731 }, { "epoch": 2.5930784890296312, "grad_norm": 0.27504366636276245, "learning_rate": 8.682678551061072e-06, "loss": 0.5647, "step": 5732 }, { "epoch": 2.5935308753675637, "grad_norm": 0.30538126826286316, "learning_rate": 8.682182880350386e-06, "loss": 0.6277, "step": 5733 }, { "epoch": 2.5939832617054965, "grad_norm": 0.3222768306732178, "learning_rate": 8.681687130557799e-06, "loss": 0.7092, "step": 5734 }, { "epoch": 2.594435648043429, "grad_norm": 0.2896077632904053, "learning_rate": 8.68119130169396e-06, "loss": 0.5347, "step": 5735 }, { "epoch": 2.594888034381362, "grad_norm": 0.3077663481235504, "learning_rate": 8.680695393769523e-06, "loss": 0.6544, "step": 5736 }, { "epoch": 2.5953404207192943, "grad_norm": 0.3069363534450531, "learning_rate": 8.680199406795133e-06, "loss": 0.6057, "step": 5737 }, { "epoch": 2.5957928070572267, "grad_norm": 0.3380058705806732, "learning_rate": 8.679703340781445e-06, "loss": 0.5969, "step": 5738 }, { "epoch": 2.5962451933951596, "grad_norm": 0.306671679019928, "learning_rate": 8.67920719573911e-06, "loss": 0.603, "step": 5739 }, { "epoch": 2.596697579733092, "grad_norm": 0.3112654983997345, "learning_rate": 8.678710971678788e-06, "loss": 0.5627, "step": 5740 }, { "epoch": 2.597149966071025, "grad_norm": 0.33297839760780334, "learning_rate": 8.678214668611131e-06, "loss": 0.6289, "step": 5741 }, { "epoch": 2.5976023524089573, "grad_norm": 0.3101857304573059, "learning_rate": 8.677718286546803e-06, "loss": 0.6537, "step": 5742 }, { "epoch": 2.5980547387468897, "grad_norm": 0.2973531186580658, "learning_rate": 8.677221825496465e-06, "loss": 0.5663, "step": 5743 }, { "epoch": 2.598507125084822, "grad_norm": 0.28486600518226624, "learning_rate": 8.676725285470774e-06, "loss": 0.4604, "step": 5744 }, { "epoch": 2.598959511422755, "grad_norm": 0.2979166805744171, "learning_rate": 8.676228666480397e-06, "loss": 0.5021, "step": 5745 }, { "epoch": 2.5994118977606875, "grad_norm": 0.31807345151901245, "learning_rate": 8.675731968536004e-06, "loss": 0.5897, "step": 5746 }, { "epoch": 2.5998642840986204, "grad_norm": 0.31840816140174866, "learning_rate": 8.675235191648255e-06, "loss": 0.5151, "step": 5747 }, { "epoch": 2.600316670436553, "grad_norm": 0.3176335096359253, "learning_rate": 8.674738335827826e-06, "loss": 0.4677, "step": 5748 }, { "epoch": 2.600769056774485, "grad_norm": 0.4141165614128113, "learning_rate": 8.674241401085382e-06, "loss": 0.6834, "step": 5749 }, { "epoch": 2.601221443112418, "grad_norm": 0.2909509241580963, "learning_rate": 8.6737443874316e-06, "loss": 0.4236, "step": 5750 }, { "epoch": 2.6016738294503505, "grad_norm": 0.3898342549800873, "learning_rate": 8.673247294877154e-06, "loss": 0.6643, "step": 5751 }, { "epoch": 2.6021262157882834, "grad_norm": 0.37536779046058655, "learning_rate": 8.672750123432717e-06, "loss": 0.546, "step": 5752 }, { "epoch": 2.602578602126216, "grad_norm": 0.31997349858283997, "learning_rate": 8.672252873108969e-06, "loss": 0.4985, "step": 5753 }, { "epoch": 2.6030309884641483, "grad_norm": 0.34433239698410034, "learning_rate": 8.671755543916589e-06, "loss": 0.4574, "step": 5754 }, { "epoch": 2.603483374802081, "grad_norm": 0.3322123885154724, "learning_rate": 8.671258135866258e-06, "loss": 0.4397, "step": 5755 }, { "epoch": 2.6039357611400136, "grad_norm": 0.35464224219322205, "learning_rate": 8.670760648968659e-06, "loss": 0.5187, "step": 5756 }, { "epoch": 2.604388147477946, "grad_norm": 0.32567301392555237, "learning_rate": 8.670263083234476e-06, "loss": 0.4935, "step": 5757 }, { "epoch": 2.604840533815879, "grad_norm": 0.36342304944992065, "learning_rate": 8.669765438674396e-06, "loss": 0.447, "step": 5758 }, { "epoch": 2.6052929201538113, "grad_norm": 0.3553107678890228, "learning_rate": 8.669267715299109e-06, "loss": 0.4864, "step": 5759 }, { "epoch": 2.6057453064917437, "grad_norm": 0.3823886513710022, "learning_rate": 8.668769913119297e-06, "loss": 0.4619, "step": 5760 }, { "epoch": 2.6061976928296766, "grad_norm": 0.41147294640541077, "learning_rate": 8.668272032145659e-06, "loss": 0.5434, "step": 5761 }, { "epoch": 2.606650079167609, "grad_norm": 0.5086625218391418, "learning_rate": 8.667774072388887e-06, "loss": 0.7275, "step": 5762 }, { "epoch": 2.607102465505542, "grad_norm": 0.405335396528244, "learning_rate": 8.66727603385967e-06, "loss": 0.6193, "step": 5763 }, { "epoch": 2.6075548518434744, "grad_norm": 0.43580707907676697, "learning_rate": 8.66677791656871e-06, "loss": 0.5699, "step": 5764 }, { "epoch": 2.608007238181407, "grad_norm": 0.4135909974575043, "learning_rate": 8.666279720526703e-06, "loss": 0.5123, "step": 5765 }, { "epoch": 2.6084596245193397, "grad_norm": 0.42961230874061584, "learning_rate": 8.665781445744348e-06, "loss": 0.5056, "step": 5766 }, { "epoch": 2.608912010857272, "grad_norm": 0.45658454298973083, "learning_rate": 8.665283092232349e-06, "loss": 0.6246, "step": 5767 }, { "epoch": 2.609364397195205, "grad_norm": 0.5162119269371033, "learning_rate": 8.664784660001405e-06, "loss": 0.6073, "step": 5768 }, { "epoch": 2.6098167835331374, "grad_norm": 0.47569864988327026, "learning_rate": 8.664286149062225e-06, "loss": 0.5487, "step": 5769 }, { "epoch": 2.61026916987107, "grad_norm": 0.4668107330799103, "learning_rate": 8.663787559425511e-06, "loss": 0.5143, "step": 5770 }, { "epoch": 2.6107215562090023, "grad_norm": 0.5413625836372375, "learning_rate": 8.663288891101977e-06, "loss": 0.5387, "step": 5771 }, { "epoch": 2.611173942546935, "grad_norm": 0.1409270167350769, "learning_rate": 8.662790144102326e-06, "loss": 1.0644, "step": 5772 }, { "epoch": 2.6116263288848676, "grad_norm": 0.18240195512771606, "learning_rate": 8.662291318437277e-06, "loss": 0.7605, "step": 5773 }, { "epoch": 2.6120787152228004, "grad_norm": 0.22218354046344757, "learning_rate": 8.661792414117536e-06, "loss": 0.605, "step": 5774 }, { "epoch": 2.612531101560733, "grad_norm": 0.27073076367378235, "learning_rate": 8.661293431153823e-06, "loss": 0.615, "step": 5775 }, { "epoch": 2.6129834878986653, "grad_norm": 0.23725278675556183, "learning_rate": 8.660794369556853e-06, "loss": 0.5848, "step": 5776 }, { "epoch": 2.613435874236598, "grad_norm": 0.26617786288261414, "learning_rate": 8.660295229337344e-06, "loss": 0.6038, "step": 5777 }, { "epoch": 2.6138882605745306, "grad_norm": 0.27532586455345154, "learning_rate": 8.659796010506016e-06, "loss": 0.5602, "step": 5778 }, { "epoch": 2.6143406469124635, "grad_norm": 0.26813235878944397, "learning_rate": 8.659296713073592e-06, "loss": 0.5769, "step": 5779 }, { "epoch": 2.614793033250396, "grad_norm": 0.2891724705696106, "learning_rate": 8.658797337050792e-06, "loss": 0.6165, "step": 5780 }, { "epoch": 2.6152454195883283, "grad_norm": 0.3077624440193176, "learning_rate": 8.658297882448345e-06, "loss": 0.6646, "step": 5781 }, { "epoch": 2.6156978059262608, "grad_norm": 0.31440743803977966, "learning_rate": 8.657798349276975e-06, "loss": 0.6983, "step": 5782 }, { "epoch": 2.6161501922641937, "grad_norm": 0.2603209316730499, "learning_rate": 8.657298737547412e-06, "loss": 0.5566, "step": 5783 }, { "epoch": 2.616602578602126, "grad_norm": 0.3339213728904724, "learning_rate": 8.656799047270385e-06, "loss": 0.5795, "step": 5784 }, { "epoch": 2.617054964940059, "grad_norm": 0.31251803040504456, "learning_rate": 8.656299278456628e-06, "loss": 0.6186, "step": 5785 }, { "epoch": 2.6175073512779914, "grad_norm": 0.27882120013237, "learning_rate": 8.655799431116871e-06, "loss": 0.4976, "step": 5786 }, { "epoch": 2.617959737615924, "grad_norm": 0.3121650516986847, "learning_rate": 8.655299505261852e-06, "loss": 0.6161, "step": 5787 }, { "epoch": 2.6184121239538567, "grad_norm": 0.3181210160255432, "learning_rate": 8.654799500902308e-06, "loss": 0.5731, "step": 5788 }, { "epoch": 2.618864510291789, "grad_norm": 0.30246248841285706, "learning_rate": 8.654299418048974e-06, "loss": 0.6262, "step": 5789 }, { "epoch": 2.619316896629722, "grad_norm": 0.3174123764038086, "learning_rate": 8.653799256712593e-06, "loss": 0.6094, "step": 5790 }, { "epoch": 2.6197692829676544, "grad_norm": 0.3230733275413513, "learning_rate": 8.653299016903909e-06, "loss": 0.6764, "step": 5791 }, { "epoch": 2.620221669305587, "grad_norm": 0.3345838487148285, "learning_rate": 8.652798698633662e-06, "loss": 0.576, "step": 5792 }, { "epoch": 2.6206740556435193, "grad_norm": 0.32709017395973206, "learning_rate": 8.652298301912596e-06, "loss": 0.5909, "step": 5793 }, { "epoch": 2.621126441981452, "grad_norm": 0.31995889544487, "learning_rate": 8.651797826751464e-06, "loss": 0.5534, "step": 5794 }, { "epoch": 2.6215788283193846, "grad_norm": 0.33655601739883423, "learning_rate": 8.65129727316101e-06, "loss": 0.506, "step": 5795 }, { "epoch": 2.6220312146573175, "grad_norm": 0.3260619044303894, "learning_rate": 8.650796641151983e-06, "loss": 0.5013, "step": 5796 }, { "epoch": 2.62248360099525, "grad_norm": 0.32738301157951355, "learning_rate": 8.650295930735141e-06, "loss": 0.5106, "step": 5797 }, { "epoch": 2.6229359873331823, "grad_norm": 0.39378970861434937, "learning_rate": 8.649795141921234e-06, "loss": 0.7101, "step": 5798 }, { "epoch": 2.623388373671115, "grad_norm": 0.34805989265441895, "learning_rate": 8.649294274721015e-06, "loss": 0.554, "step": 5799 }, { "epoch": 2.6238407600090476, "grad_norm": 0.33214929699897766, "learning_rate": 8.648793329145245e-06, "loss": 0.5377, "step": 5800 }, { "epoch": 2.6238407600090476, "eval_loss": 0.5962729454040527, "eval_runtime": 33.8207, "eval_samples_per_second": 21.998, "eval_steps_per_second": 5.5, "step": 5800 }, { "epoch": 2.6242931463469805, "grad_norm": 0.3500797748565674, "learning_rate": 8.648292305204681e-06, "loss": 0.5321, "step": 5801 }, { "epoch": 2.624745532684913, "grad_norm": 0.3390500247478485, "learning_rate": 8.647791202910084e-06, "loss": 0.5305, "step": 5802 }, { "epoch": 2.6251979190228454, "grad_norm": 0.3553808331489563, "learning_rate": 8.647290022272217e-06, "loss": 0.5828, "step": 5803 }, { "epoch": 2.6256503053607783, "grad_norm": 0.3987314999103546, "learning_rate": 8.646788763301842e-06, "loss": 0.64, "step": 5804 }, { "epoch": 2.6261026916987107, "grad_norm": 0.33867475390434265, "learning_rate": 8.646287426009725e-06, "loss": 0.5283, "step": 5805 }, { "epoch": 2.6265550780366436, "grad_norm": 0.37135204672813416, "learning_rate": 8.645786010406634e-06, "loss": 0.5546, "step": 5806 }, { "epoch": 2.627007464374576, "grad_norm": 0.35207614302635193, "learning_rate": 8.645284516503337e-06, "loss": 0.5182, "step": 5807 }, { "epoch": 2.6274598507125084, "grad_norm": 0.36672237515449524, "learning_rate": 8.644782944310605e-06, "loss": 0.5405, "step": 5808 }, { "epoch": 2.627912237050441, "grad_norm": 0.45588403940200806, "learning_rate": 8.64428129383921e-06, "loss": 0.7057, "step": 5809 }, { "epoch": 2.6283646233883737, "grad_norm": 0.38024985790252686, "learning_rate": 8.643779565099926e-06, "loss": 0.503, "step": 5810 }, { "epoch": 2.628817009726306, "grad_norm": 0.36591851711273193, "learning_rate": 8.643277758103527e-06, "loss": 0.473, "step": 5811 }, { "epoch": 2.629269396064239, "grad_norm": 0.3599725365638733, "learning_rate": 8.642775872860794e-06, "loss": 0.4945, "step": 5812 }, { "epoch": 2.6297217824021715, "grad_norm": 0.37575578689575195, "learning_rate": 8.642273909382504e-06, "loss": 0.5517, "step": 5813 }, { "epoch": 2.630174168740104, "grad_norm": 0.44483157992362976, "learning_rate": 8.641771867679436e-06, "loss": 0.5939, "step": 5814 }, { "epoch": 2.6306265550780368, "grad_norm": 0.4410352110862732, "learning_rate": 8.641269747762375e-06, "loss": 0.611, "step": 5815 }, { "epoch": 2.631078941415969, "grad_norm": 0.4362781345844269, "learning_rate": 8.640767549642102e-06, "loss": 0.5245, "step": 5816 }, { "epoch": 2.631531327753902, "grad_norm": 0.4363398253917694, "learning_rate": 8.640265273329408e-06, "loss": 0.5203, "step": 5817 }, { "epoch": 2.6319837140918345, "grad_norm": 0.46487000584602356, "learning_rate": 8.639762918835072e-06, "loss": 0.5683, "step": 5818 }, { "epoch": 2.632436100429767, "grad_norm": 0.4334155321121216, "learning_rate": 8.639260486169889e-06, "loss": 0.5114, "step": 5819 }, { "epoch": 2.6328884867676994, "grad_norm": 0.4765988886356354, "learning_rate": 8.638757975344649e-06, "loss": 0.5052, "step": 5820 }, { "epoch": 2.6333408731056323, "grad_norm": 0.5895520448684692, "learning_rate": 8.638255386370144e-06, "loss": 0.5535, "step": 5821 }, { "epoch": 2.6337932594435647, "grad_norm": 0.1615723967552185, "learning_rate": 8.63775271925717e-06, "loss": 0.9464, "step": 5822 }, { "epoch": 2.6342456457814976, "grad_norm": 0.18582037091255188, "learning_rate": 8.637249974016517e-06, "loss": 0.5896, "step": 5823 }, { "epoch": 2.63469803211943, "grad_norm": 0.2463391125202179, "learning_rate": 8.636747150658986e-06, "loss": 0.5213, "step": 5824 }, { "epoch": 2.6351504184573624, "grad_norm": 0.2861095368862152, "learning_rate": 8.636244249195378e-06, "loss": 0.6983, "step": 5825 }, { "epoch": 2.6356028047952953, "grad_norm": 0.2601008117198944, "learning_rate": 8.635741269636491e-06, "loss": 0.5927, "step": 5826 }, { "epoch": 2.6360551911332277, "grad_norm": 0.2415371686220169, "learning_rate": 8.635238211993128e-06, "loss": 0.5545, "step": 5827 }, { "epoch": 2.6365075774711606, "grad_norm": 0.26031219959259033, "learning_rate": 8.634735076276096e-06, "loss": 0.5497, "step": 5828 }, { "epoch": 2.636959963809093, "grad_norm": 0.28087812662124634, "learning_rate": 8.634231862496194e-06, "loss": 0.6245, "step": 5829 }, { "epoch": 2.6374123501470255, "grad_norm": 0.3039909303188324, "learning_rate": 8.633728570664237e-06, "loss": 0.709, "step": 5830 }, { "epoch": 2.637864736484958, "grad_norm": 0.2554214894771576, "learning_rate": 8.63322520079103e-06, "loss": 0.5942, "step": 5831 }, { "epoch": 2.6383171228228908, "grad_norm": 0.2995755672454834, "learning_rate": 8.632721752887384e-06, "loss": 0.784, "step": 5832 }, { "epoch": 2.638769509160823, "grad_norm": 0.3031977713108063, "learning_rate": 8.632218226964112e-06, "loss": 0.7225, "step": 5833 }, { "epoch": 2.639221895498756, "grad_norm": 0.29337289929389954, "learning_rate": 8.631714623032029e-06, "loss": 0.5486, "step": 5834 }, { "epoch": 2.6396742818366885, "grad_norm": 0.35068967938423157, "learning_rate": 8.631210941101948e-06, "loss": 0.7155, "step": 5835 }, { "epoch": 2.640126668174621, "grad_norm": 0.32930827140808105, "learning_rate": 8.630707181184692e-06, "loss": 0.7258, "step": 5836 }, { "epoch": 2.640579054512554, "grad_norm": 0.326236754655838, "learning_rate": 8.630203343291075e-06, "loss": 0.6357, "step": 5837 }, { "epoch": 2.6410314408504862, "grad_norm": 0.3774075210094452, "learning_rate": 8.629699427431921e-06, "loss": 0.6923, "step": 5838 }, { "epoch": 2.641483827188419, "grad_norm": 0.3506324887275696, "learning_rate": 8.629195433618051e-06, "loss": 0.6596, "step": 5839 }, { "epoch": 2.6419362135263516, "grad_norm": 0.31615689396858215, "learning_rate": 8.628691361860288e-06, "loss": 0.5304, "step": 5840 }, { "epoch": 2.642388599864284, "grad_norm": 0.36527353525161743, "learning_rate": 8.628187212169462e-06, "loss": 0.5768, "step": 5841 }, { "epoch": 2.642840986202217, "grad_norm": 0.2963448166847229, "learning_rate": 8.627682984556396e-06, "loss": 0.536, "step": 5842 }, { "epoch": 2.6432933725401493, "grad_norm": 0.3659953773021698, "learning_rate": 8.627178679031923e-06, "loss": 0.5254, "step": 5843 }, { "epoch": 2.6437457588780817, "grad_norm": 0.3185042440891266, "learning_rate": 8.626674295606872e-06, "loss": 0.5506, "step": 5844 }, { "epoch": 2.6441981452160146, "grad_norm": 0.3444763123989105, "learning_rate": 8.626169834292075e-06, "loss": 0.645, "step": 5845 }, { "epoch": 2.644650531553947, "grad_norm": 0.31447499990463257, "learning_rate": 8.625665295098366e-06, "loss": 0.4502, "step": 5846 }, { "epoch": 2.6451029178918795, "grad_norm": 0.3189042806625366, "learning_rate": 8.625160678036585e-06, "loss": 0.5105, "step": 5847 }, { "epoch": 2.6455553042298123, "grad_norm": 0.32419389486312866, "learning_rate": 8.624655983117565e-06, "loss": 0.4963, "step": 5848 }, { "epoch": 2.6460076905677448, "grad_norm": 0.3164665699005127, "learning_rate": 8.624151210352147e-06, "loss": 0.4884, "step": 5849 }, { "epoch": 2.6464600769056776, "grad_norm": 0.3841109275817871, "learning_rate": 8.623646359751172e-06, "loss": 0.7133, "step": 5850 }, { "epoch": 2.64691246324361, "grad_norm": 0.3158685863018036, "learning_rate": 8.623141431325483e-06, "loss": 0.4991, "step": 5851 }, { "epoch": 2.6473648495815425, "grad_norm": 0.3412874937057495, "learning_rate": 8.622636425085924e-06, "loss": 0.5723, "step": 5852 }, { "epoch": 2.6478172359194754, "grad_norm": 0.3464912176132202, "learning_rate": 8.62213134104334e-06, "loss": 0.4944, "step": 5853 }, { "epoch": 2.648269622257408, "grad_norm": 0.3191237151622772, "learning_rate": 8.62162617920858e-06, "loss": 0.4805, "step": 5854 }, { "epoch": 2.6487220085953407, "grad_norm": 0.38174712657928467, "learning_rate": 8.62112093959249e-06, "loss": 0.5536, "step": 5855 }, { "epoch": 2.649174394933273, "grad_norm": 0.34924548864364624, "learning_rate": 8.620615622205928e-06, "loss": 0.503, "step": 5856 }, { "epoch": 2.6496267812712055, "grad_norm": 0.3762029707431793, "learning_rate": 8.620110227059741e-06, "loss": 0.4502, "step": 5857 }, { "epoch": 2.650079167609138, "grad_norm": 0.35212796926498413, "learning_rate": 8.619604754164782e-06, "loss": 0.4927, "step": 5858 }, { "epoch": 2.650531553947071, "grad_norm": 0.3657449185848236, "learning_rate": 8.619099203531913e-06, "loss": 0.5602, "step": 5859 }, { "epoch": 2.6509839402850033, "grad_norm": 0.3824846148490906, "learning_rate": 8.618593575171987e-06, "loss": 0.5298, "step": 5860 }, { "epoch": 2.651436326622936, "grad_norm": 0.3956758975982666, "learning_rate": 8.618087869095866e-06, "loss": 0.5065, "step": 5861 }, { "epoch": 2.6518887129608686, "grad_norm": 0.38144898414611816, "learning_rate": 8.617582085314408e-06, "loss": 0.5615, "step": 5862 }, { "epoch": 2.652341099298801, "grad_norm": 0.3840895891189575, "learning_rate": 8.617076223838479e-06, "loss": 0.46, "step": 5863 }, { "epoch": 2.652793485636734, "grad_norm": 0.3949694335460663, "learning_rate": 8.61657028467894e-06, "loss": 0.5518, "step": 5864 }, { "epoch": 2.6532458719746663, "grad_norm": 0.4021891951560974, "learning_rate": 8.61606426784666e-06, "loss": 0.5111, "step": 5865 }, { "epoch": 2.653698258312599, "grad_norm": 0.3827269375324249, "learning_rate": 8.615558173352505e-06, "loss": 0.4408, "step": 5866 }, { "epoch": 2.6541506446505316, "grad_norm": 0.42671746015548706, "learning_rate": 8.615052001207346e-06, "loss": 0.5338, "step": 5867 }, { "epoch": 2.654603030988464, "grad_norm": 0.5754228830337524, "learning_rate": 8.61454575142205e-06, "loss": 0.6446, "step": 5868 }, { "epoch": 2.6550554173263965, "grad_norm": 0.4695386290550232, "learning_rate": 8.614039424007496e-06, "loss": 0.453, "step": 5869 }, { "epoch": 2.6555078036643294, "grad_norm": 0.5182480216026306, "learning_rate": 8.613533018974551e-06, "loss": 0.5527, "step": 5870 }, { "epoch": 2.655960190002262, "grad_norm": 0.5306710004806519, "learning_rate": 8.613026536334098e-06, "loss": 0.4908, "step": 5871 }, { "epoch": 2.6564125763401947, "grad_norm": 0.13036352396011353, "learning_rate": 8.61251997609701e-06, "loss": 1.3176, "step": 5872 }, { "epoch": 2.656864962678127, "grad_norm": 0.177462637424469, "learning_rate": 8.612013338274168e-06, "loss": 0.8893, "step": 5873 }, { "epoch": 2.6573173490160595, "grad_norm": 0.22064359486103058, "learning_rate": 8.611506622876454e-06, "loss": 0.8334, "step": 5874 }, { "epoch": 2.6577697353539924, "grad_norm": 0.24542413651943207, "learning_rate": 8.610999829914748e-06, "loss": 0.7118, "step": 5875 }, { "epoch": 2.658222121691925, "grad_norm": 0.2904423177242279, "learning_rate": 8.610492959399935e-06, "loss": 0.8216, "step": 5876 }, { "epoch": 2.6586745080298577, "grad_norm": 0.2526596784591675, "learning_rate": 8.609986011342903e-06, "loss": 0.6472, "step": 5877 }, { "epoch": 2.65912689436779, "grad_norm": 0.2579940855503082, "learning_rate": 8.609478985754541e-06, "loss": 0.5399, "step": 5878 }, { "epoch": 2.6595792807057226, "grad_norm": 0.280368834733963, "learning_rate": 8.608971882645732e-06, "loss": 0.6294, "step": 5879 }, { "epoch": 2.660031667043655, "grad_norm": 0.29944440722465515, "learning_rate": 8.608464702027372e-06, "loss": 0.6221, "step": 5880 }, { "epoch": 2.660484053381588, "grad_norm": 0.3138051927089691, "learning_rate": 8.607957443910352e-06, "loss": 0.5787, "step": 5881 }, { "epoch": 2.6609364397195203, "grad_norm": 0.2739887833595276, "learning_rate": 8.60745010830557e-06, "loss": 0.5439, "step": 5882 }, { "epoch": 2.661388826057453, "grad_norm": 0.28669479489326477, "learning_rate": 8.606942695223917e-06, "loss": 0.6283, "step": 5883 }, { "epoch": 2.6618412123953856, "grad_norm": 0.2542678117752075, "learning_rate": 8.606435204676293e-06, "loss": 0.5061, "step": 5884 }, { "epoch": 2.662293598733318, "grad_norm": 0.2556508779525757, "learning_rate": 8.605927636673596e-06, "loss": 0.3971, "step": 5885 }, { "epoch": 2.662745985071251, "grad_norm": 0.32215261459350586, "learning_rate": 8.605419991226728e-06, "loss": 0.5807, "step": 5886 }, { "epoch": 2.6631983714091834, "grad_norm": 0.30433809757232666, "learning_rate": 8.604912268346593e-06, "loss": 0.6403, "step": 5887 }, { "epoch": 2.6636507577471162, "grad_norm": 0.33275407552719116, "learning_rate": 8.604404468044092e-06, "loss": 0.5005, "step": 5888 }, { "epoch": 2.6641031440850487, "grad_norm": 0.3081875145435333, "learning_rate": 8.603896590330132e-06, "loss": 0.5756, "step": 5889 }, { "epoch": 2.664555530422981, "grad_norm": 0.30854612588882446, "learning_rate": 8.603388635215623e-06, "loss": 0.6873, "step": 5890 }, { "epoch": 2.665007916760914, "grad_norm": 0.31814390420913696, "learning_rate": 8.602880602711475e-06, "loss": 0.5329, "step": 5891 }, { "epoch": 2.6654603030988464, "grad_norm": 0.35187163949012756, "learning_rate": 8.602372492828593e-06, "loss": 0.6234, "step": 5892 }, { "epoch": 2.6659126894367793, "grad_norm": 0.3435465395450592, "learning_rate": 8.601864305577896e-06, "loss": 0.5526, "step": 5893 }, { "epoch": 2.6663650757747117, "grad_norm": 0.3127245604991913, "learning_rate": 8.601356040970294e-06, "loss": 0.5019, "step": 5894 }, { "epoch": 2.666817462112644, "grad_norm": 0.3729829490184784, "learning_rate": 8.600847699016705e-06, "loss": 0.6242, "step": 5895 }, { "epoch": 2.6672698484505766, "grad_norm": 0.35393214225769043, "learning_rate": 8.600339279728046e-06, "loss": 0.657, "step": 5896 }, { "epoch": 2.6677222347885094, "grad_norm": 0.31121888756752014, "learning_rate": 8.599830783115239e-06, "loss": 0.4532, "step": 5897 }, { "epoch": 2.668174621126442, "grad_norm": 0.33110857009887695, "learning_rate": 8.5993222091892e-06, "loss": 0.5632, "step": 5898 }, { "epoch": 2.6686270074643748, "grad_norm": 0.3280004858970642, "learning_rate": 8.598813557960855e-06, "loss": 0.4684, "step": 5899 }, { "epoch": 2.669079393802307, "grad_norm": 0.34328657388687134, "learning_rate": 8.598304829441126e-06, "loss": 0.5648, "step": 5900 }, { "epoch": 2.6695317801402396, "grad_norm": 0.339594304561615, "learning_rate": 8.59779602364094e-06, "loss": 0.5362, "step": 5901 }, { "epoch": 2.6699841664781725, "grad_norm": 0.3640854060649872, "learning_rate": 8.597287140571227e-06, "loss": 0.4854, "step": 5902 }, { "epoch": 2.670436552816105, "grad_norm": 0.36524540185928345, "learning_rate": 8.596778180242913e-06, "loss": 0.5475, "step": 5903 }, { "epoch": 2.670888939154038, "grad_norm": 0.33653587102890015, "learning_rate": 8.59626914266693e-06, "loss": 0.4937, "step": 5904 }, { "epoch": 2.6713413254919702, "grad_norm": 0.4031601548194885, "learning_rate": 8.595760027854211e-06, "loss": 0.6258, "step": 5905 }, { "epoch": 2.6717937118299027, "grad_norm": 0.3691055178642273, "learning_rate": 8.595250835815689e-06, "loss": 0.5385, "step": 5906 }, { "epoch": 2.672246098167835, "grad_norm": 0.36472922563552856, "learning_rate": 8.5947415665623e-06, "loss": 0.503, "step": 5907 }, { "epoch": 2.672698484505768, "grad_norm": 0.4060400724411011, "learning_rate": 8.594232220104983e-06, "loss": 0.5765, "step": 5908 }, { "epoch": 2.6731508708437004, "grad_norm": 0.3995891511440277, "learning_rate": 8.593722796454676e-06, "loss": 0.5407, "step": 5909 }, { "epoch": 2.6736032571816333, "grad_norm": 0.4087095260620117, "learning_rate": 8.593213295622318e-06, "loss": 0.5737, "step": 5910 }, { "epoch": 2.6740556435195657, "grad_norm": 0.38164326548576355, "learning_rate": 8.592703717618857e-06, "loss": 0.4663, "step": 5911 }, { "epoch": 2.674508029857498, "grad_norm": 0.3589520752429962, "learning_rate": 8.592194062455233e-06, "loss": 0.4624, "step": 5912 }, { "epoch": 2.674960416195431, "grad_norm": 0.4345736801624298, "learning_rate": 8.591684330142391e-06, "loss": 0.5823, "step": 5913 }, { "epoch": 2.6754128025333634, "grad_norm": 0.37865597009658813, "learning_rate": 8.59117452069128e-06, "loss": 0.4657, "step": 5914 }, { "epoch": 2.6758651888712963, "grad_norm": 0.41392141580581665, "learning_rate": 8.590664634112851e-06, "loss": 0.5076, "step": 5915 }, { "epoch": 2.6763175752092287, "grad_norm": 0.4385896325111389, "learning_rate": 8.590154670418054e-06, "loss": 0.5605, "step": 5916 }, { "epoch": 2.676769961547161, "grad_norm": 0.4025658369064331, "learning_rate": 8.589644629617837e-06, "loss": 0.5297, "step": 5917 }, { "epoch": 2.6772223478850936, "grad_norm": 0.4431506097316742, "learning_rate": 8.58913451172316e-06, "loss": 0.5257, "step": 5918 }, { "epoch": 2.6776747342230265, "grad_norm": 0.4500413239002228, "learning_rate": 8.588624316744974e-06, "loss": 0.4797, "step": 5919 }, { "epoch": 2.678127120560959, "grad_norm": 0.5156040787696838, "learning_rate": 8.58811404469424e-06, "loss": 0.536, "step": 5920 }, { "epoch": 2.678579506898892, "grad_norm": 0.5599775910377502, "learning_rate": 8.587603695581917e-06, "loss": 0.5624, "step": 5921 }, { "epoch": 2.6790318932368242, "grad_norm": 0.15960478782653809, "learning_rate": 8.587093269418963e-06, "loss": 1.0991, "step": 5922 }, { "epoch": 2.6794842795747567, "grad_norm": 0.1897083818912506, "learning_rate": 8.586582766216342e-06, "loss": 0.5348, "step": 5923 }, { "epoch": 2.6799366659126895, "grad_norm": 0.21489733457565308, "learning_rate": 8.586072185985019e-06, "loss": 0.505, "step": 5924 }, { "epoch": 2.680389052250622, "grad_norm": 0.2414473444223404, "learning_rate": 8.585561528735958e-06, "loss": 0.5865, "step": 5925 }, { "epoch": 2.680841438588555, "grad_norm": 0.277177631855011, "learning_rate": 8.585050794480125e-06, "loss": 0.6154, "step": 5926 }, { "epoch": 2.6812938249264873, "grad_norm": 0.2499813586473465, "learning_rate": 8.584539983228492e-06, "loss": 0.5574, "step": 5927 }, { "epoch": 2.6817462112644197, "grad_norm": 0.2946087718009949, "learning_rate": 8.58402909499203e-06, "loss": 0.6328, "step": 5928 }, { "epoch": 2.6821985976023526, "grad_norm": 0.33109864592552185, "learning_rate": 8.583518129781708e-06, "loss": 0.6726, "step": 5929 }, { "epoch": 2.682650983940285, "grad_norm": 0.34127554297447205, "learning_rate": 8.583007087608502e-06, "loss": 0.5649, "step": 5930 }, { "epoch": 2.6831033702782174, "grad_norm": 0.306516170501709, "learning_rate": 8.582495968483388e-06, "loss": 0.7062, "step": 5931 }, { "epoch": 2.6835557566161503, "grad_norm": 0.2941477596759796, "learning_rate": 8.581984772417342e-06, "loss": 0.5975, "step": 5932 }, { "epoch": 2.6840081429540827, "grad_norm": 0.32822856307029724, "learning_rate": 8.581473499421344e-06, "loss": 0.7474, "step": 5933 }, { "epoch": 2.684460529292015, "grad_norm": 0.2946610152721405, "learning_rate": 8.580962149506376e-06, "loss": 0.5593, "step": 5934 }, { "epoch": 2.684912915629948, "grad_norm": 0.33327516913414, "learning_rate": 8.580450722683415e-06, "loss": 0.4876, "step": 5935 }, { "epoch": 2.6853653019678805, "grad_norm": 0.3614838123321533, "learning_rate": 8.57993921896345e-06, "loss": 0.5973, "step": 5936 }, { "epoch": 2.6858176883058134, "grad_norm": 0.3313530683517456, "learning_rate": 8.579427638357465e-06, "loss": 0.5779, "step": 5937 }, { "epoch": 2.686270074643746, "grad_norm": 0.32026147842407227, "learning_rate": 8.578915980876446e-06, "loss": 0.6341, "step": 5938 }, { "epoch": 2.686722460981678, "grad_norm": 0.2802072763442993, "learning_rate": 8.578404246531382e-06, "loss": 0.539, "step": 5939 }, { "epoch": 2.687174847319611, "grad_norm": 0.36140063405036926, "learning_rate": 8.577892435333265e-06, "loss": 0.5891, "step": 5940 }, { "epoch": 2.6876272336575435, "grad_norm": 0.3187452554702759, "learning_rate": 8.577380547293089e-06, "loss": 0.5084, "step": 5941 }, { "epoch": 2.6880796199954764, "grad_norm": 0.3465421199798584, "learning_rate": 8.576868582421842e-06, "loss": 0.6107, "step": 5942 }, { "epoch": 2.688532006333409, "grad_norm": 0.30356502532958984, "learning_rate": 8.576356540730522e-06, "loss": 0.4748, "step": 5943 }, { "epoch": 2.6889843926713413, "grad_norm": 0.3564748167991638, "learning_rate": 8.57584442223013e-06, "loss": 0.643, "step": 5944 }, { "epoch": 2.6894367790092737, "grad_norm": 0.31590405106544495, "learning_rate": 8.575332226931656e-06, "loss": 0.486, "step": 5945 }, { "epoch": 2.6898891653472066, "grad_norm": 0.3356747329235077, "learning_rate": 8.57481995484611e-06, "loss": 0.5068, "step": 5946 }, { "epoch": 2.690341551685139, "grad_norm": 0.3259486258029938, "learning_rate": 8.574307605984487e-06, "loss": 0.5556, "step": 5947 }, { "epoch": 2.690793938023072, "grad_norm": 0.31241312623023987, "learning_rate": 8.573795180357796e-06, "loss": 0.5076, "step": 5948 }, { "epoch": 2.6912463243610043, "grad_norm": 0.35005614161491394, "learning_rate": 8.573282677977037e-06, "loss": 0.5403, "step": 5949 }, { "epoch": 2.6916987106989367, "grad_norm": 0.3862065076828003, "learning_rate": 8.57277009885322e-06, "loss": 0.5344, "step": 5950 }, { "epoch": 2.6921510970368696, "grad_norm": 0.3446349501609802, "learning_rate": 8.572257442997356e-06, "loss": 0.5846, "step": 5951 }, { "epoch": 2.692603483374802, "grad_norm": 0.3529168963432312, "learning_rate": 8.57174471042045e-06, "loss": 0.5163, "step": 5952 }, { "epoch": 2.693055869712735, "grad_norm": 0.3944449722766876, "learning_rate": 8.571231901133516e-06, "loss": 0.5823, "step": 5953 }, { "epoch": 2.6935082560506673, "grad_norm": 0.4257878363132477, "learning_rate": 8.57071901514757e-06, "loss": 0.646, "step": 5954 }, { "epoch": 2.6939606423886, "grad_norm": 0.33623388409614563, "learning_rate": 8.570206052473624e-06, "loss": 0.552, "step": 5955 }, { "epoch": 2.694413028726532, "grad_norm": 0.41458094120025635, "learning_rate": 8.569693013122696e-06, "loss": 0.6618, "step": 5956 }, { "epoch": 2.694865415064465, "grad_norm": 0.3894663155078888, "learning_rate": 8.569179897105805e-06, "loss": 0.588, "step": 5957 }, { "epoch": 2.6953178014023975, "grad_norm": 0.3560861647129059, "learning_rate": 8.56866670443397e-06, "loss": 0.5031, "step": 5958 }, { "epoch": 2.6957701877403304, "grad_norm": 0.3987221121788025, "learning_rate": 8.568153435118215e-06, "loss": 0.5144, "step": 5959 }, { "epoch": 2.696222574078263, "grad_norm": 0.3966299891471863, "learning_rate": 8.56764008916956e-06, "loss": 0.5246, "step": 5960 }, { "epoch": 2.6966749604161953, "grad_norm": 0.40658140182495117, "learning_rate": 8.567126666599033e-06, "loss": 0.5179, "step": 5961 }, { "epoch": 2.697127346754128, "grad_norm": 0.4140542149543762, "learning_rate": 8.566613167417659e-06, "loss": 0.5881, "step": 5962 }, { "epoch": 2.6975797330920606, "grad_norm": 0.45407652854919434, "learning_rate": 8.566099591636467e-06, "loss": 0.6002, "step": 5963 }, { "epoch": 2.6980321194299934, "grad_norm": 0.39340388774871826, "learning_rate": 8.565585939266487e-06, "loss": 0.4594, "step": 5964 }, { "epoch": 2.698484505767926, "grad_norm": 0.3853323459625244, "learning_rate": 8.565072210318752e-06, "loss": 0.4535, "step": 5965 }, { "epoch": 2.6989368921058583, "grad_norm": 0.39258044958114624, "learning_rate": 8.564558404804294e-06, "loss": 0.4728, "step": 5966 }, { "epoch": 2.6993892784437907, "grad_norm": 0.3978533148765564, "learning_rate": 8.564044522734147e-06, "loss": 0.5302, "step": 5967 }, { "epoch": 2.6998416647817236, "grad_norm": 0.44276711344718933, "learning_rate": 8.56353056411935e-06, "loss": 0.4867, "step": 5968 }, { "epoch": 2.700294051119656, "grad_norm": 0.4374552369117737, "learning_rate": 8.563016528970938e-06, "loss": 0.4379, "step": 5969 }, { "epoch": 2.700746437457589, "grad_norm": 0.5025157928466797, "learning_rate": 8.562502417299953e-06, "loss": 0.5462, "step": 5970 }, { "epoch": 2.7011988237955213, "grad_norm": 0.5852148532867432, "learning_rate": 8.561988229117438e-06, "loss": 0.5105, "step": 5971 }, { "epoch": 2.7016512101334538, "grad_norm": 0.15944591164588928, "learning_rate": 8.561473964434433e-06, "loss": 1.23, "step": 5972 }, { "epoch": 2.7021035964713866, "grad_norm": 0.2247801274061203, "learning_rate": 8.560959623261984e-06, "loss": 0.7306, "step": 5973 }, { "epoch": 2.702555982809319, "grad_norm": 0.29429665207862854, "learning_rate": 8.560445205611139e-06, "loss": 0.6374, "step": 5974 }, { "epoch": 2.703008369147252, "grad_norm": 0.25034964084625244, "learning_rate": 8.559930711492944e-06, "loss": 0.6835, "step": 5975 }, { "epoch": 2.7034607554851844, "grad_norm": 0.2522639036178589, "learning_rate": 8.55941614091845e-06, "loss": 0.5208, "step": 5976 }, { "epoch": 2.703913141823117, "grad_norm": 0.27710628509521484, "learning_rate": 8.558901493898708e-06, "loss": 0.4926, "step": 5977 }, { "epoch": 2.7043655281610497, "grad_norm": 0.2749655544757843, "learning_rate": 8.558386770444771e-06, "loss": 0.5326, "step": 5978 }, { "epoch": 2.704817914498982, "grad_norm": 0.3060790002346039, "learning_rate": 8.557871970567692e-06, "loss": 0.5782, "step": 5979 }, { "epoch": 2.705270300836915, "grad_norm": 0.29909905791282654, "learning_rate": 8.55735709427853e-06, "loss": 0.667, "step": 5980 }, { "epoch": 2.7057226871748474, "grad_norm": 0.32193252444267273, "learning_rate": 8.556842141588343e-06, "loss": 0.6207, "step": 5981 }, { "epoch": 2.70617507351278, "grad_norm": 0.28041914105415344, "learning_rate": 8.55632711250819e-06, "loss": 0.5547, "step": 5982 }, { "epoch": 2.7066274598507123, "grad_norm": 0.28914257884025574, "learning_rate": 8.555812007049128e-06, "loss": 0.5794, "step": 5983 }, { "epoch": 2.707079846188645, "grad_norm": 0.3132675588130951, "learning_rate": 8.555296825222228e-06, "loss": 0.6145, "step": 5984 }, { "epoch": 2.7075322325265776, "grad_norm": 0.28350725769996643, "learning_rate": 8.554781567038549e-06, "loss": 0.545, "step": 5985 }, { "epoch": 2.7079846188645105, "grad_norm": 0.3161270320415497, "learning_rate": 8.554266232509156e-06, "loss": 0.5753, "step": 5986 }, { "epoch": 2.708437005202443, "grad_norm": 0.31187787652015686, "learning_rate": 8.55375082164512e-06, "loss": 0.6094, "step": 5987 }, { "epoch": 2.7088893915403753, "grad_norm": 0.34344482421875, "learning_rate": 8.55323533445751e-06, "loss": 0.6396, "step": 5988 }, { "epoch": 2.709341777878308, "grad_norm": 0.2886374890804291, "learning_rate": 8.552719770957397e-06, "loss": 0.5279, "step": 5989 }, { "epoch": 2.7097941642162406, "grad_norm": 0.3188001215457916, "learning_rate": 8.552204131155852e-06, "loss": 0.6737, "step": 5990 }, { "epoch": 2.7102465505541735, "grad_norm": 0.31022655963897705, "learning_rate": 8.55168841506395e-06, "loss": 0.4959, "step": 5991 }, { "epoch": 2.710698936892106, "grad_norm": 0.32238808274269104, "learning_rate": 8.55117262269277e-06, "loss": 0.5664, "step": 5992 }, { "epoch": 2.7111513232300384, "grad_norm": 0.3375941514968872, "learning_rate": 8.550656754053386e-06, "loss": 0.6322, "step": 5993 }, { "epoch": 2.711603709567971, "grad_norm": 0.29693925380706787, "learning_rate": 8.550140809156879e-06, "loss": 0.5449, "step": 5994 }, { "epoch": 2.7120560959059037, "grad_norm": 0.3392254114151001, "learning_rate": 8.549624788014328e-06, "loss": 0.6548, "step": 5995 }, { "epoch": 2.712508482243836, "grad_norm": 0.305774450302124, "learning_rate": 8.549108690636819e-06, "loss": 0.5296, "step": 5996 }, { "epoch": 2.712960868581769, "grad_norm": 0.3071829676628113, "learning_rate": 8.548592517035433e-06, "loss": 0.45, "step": 5997 }, { "epoch": 2.7134132549197014, "grad_norm": 0.34066006541252136, "learning_rate": 8.548076267221258e-06, "loss": 0.5576, "step": 5998 }, { "epoch": 2.713865641257634, "grad_norm": 0.3425659239292145, "learning_rate": 8.547559941205378e-06, "loss": 0.5667, "step": 5999 }, { "epoch": 2.7143180275955667, "grad_norm": 0.37066271901130676, "learning_rate": 8.547043538998886e-06, "loss": 0.6114, "step": 6000 }, { "epoch": 2.7143180275955667, "eval_loss": 0.5951371192932129, "eval_runtime": 25.8206, "eval_samples_per_second": 28.814, "eval_steps_per_second": 7.204, "step": 6000 }, { "epoch": 2.714770413933499, "grad_norm": 0.3554350733757019, "learning_rate": 8.54652706061287e-06, "loss": 0.5649, "step": 6001 }, { "epoch": 2.715222800271432, "grad_norm": 0.31929486989974976, "learning_rate": 8.546010506058425e-06, "loss": 0.4562, "step": 6002 }, { "epoch": 2.7156751866093645, "grad_norm": 0.3904710114002228, "learning_rate": 8.545493875346642e-06, "loss": 0.6869, "step": 6003 }, { "epoch": 2.716127572947297, "grad_norm": 0.3729155957698822, "learning_rate": 8.54497716848862e-06, "loss": 0.5175, "step": 6004 }, { "epoch": 2.7165799592852293, "grad_norm": 0.37629449367523193, "learning_rate": 8.544460385495451e-06, "loss": 0.5163, "step": 6005 }, { "epoch": 2.717032345623162, "grad_norm": 0.3916374742984772, "learning_rate": 8.543943526378243e-06, "loss": 0.5514, "step": 6006 }, { "epoch": 2.7174847319610946, "grad_norm": 0.36911851167678833, "learning_rate": 8.543426591148087e-06, "loss": 0.5789, "step": 6007 }, { "epoch": 2.7179371182990275, "grad_norm": 0.3568420708179474, "learning_rate": 8.54290957981609e-06, "loss": 0.49, "step": 6008 }, { "epoch": 2.71838950463696, "grad_norm": 0.37122562527656555, "learning_rate": 8.542392492393354e-06, "loss": 0.5286, "step": 6009 }, { "epoch": 2.7188418909748924, "grad_norm": 0.4192439019680023, "learning_rate": 8.541875328890984e-06, "loss": 0.5617, "step": 6010 }, { "epoch": 2.7192942773128252, "grad_norm": 0.3888736069202423, "learning_rate": 8.541358089320091e-06, "loss": 0.5448, "step": 6011 }, { "epoch": 2.7197466636507577, "grad_norm": 0.3961309492588043, "learning_rate": 8.54084077369178e-06, "loss": 0.5706, "step": 6012 }, { "epoch": 2.7201990499886906, "grad_norm": 0.3569673001766205, "learning_rate": 8.540323382017163e-06, "loss": 0.46, "step": 6013 }, { "epoch": 2.720651436326623, "grad_norm": 0.39682477712631226, "learning_rate": 8.539805914307351e-06, "loss": 0.6027, "step": 6014 }, { "epoch": 2.7211038226645554, "grad_norm": 0.37757599353790283, "learning_rate": 8.539288370573457e-06, "loss": 0.4528, "step": 6015 }, { "epoch": 2.7215562090024883, "grad_norm": 0.38525769114494324, "learning_rate": 8.538770750826598e-06, "loss": 0.4984, "step": 6016 }, { "epoch": 2.7220085953404207, "grad_norm": 0.37547436356544495, "learning_rate": 8.53825305507789e-06, "loss": 0.446, "step": 6017 }, { "epoch": 2.722460981678353, "grad_norm": 0.4545373320579529, "learning_rate": 8.53773528333845e-06, "loss": 0.5848, "step": 6018 }, { "epoch": 2.722913368016286, "grad_norm": 0.4310052692890167, "learning_rate": 8.537217435619402e-06, "loss": 0.4488, "step": 6019 }, { "epoch": 2.7233657543542185, "grad_norm": 0.45066842436790466, "learning_rate": 8.536699511931865e-06, "loss": 0.4895, "step": 6020 }, { "epoch": 2.723818140692151, "grad_norm": 0.5249525308609009, "learning_rate": 8.536181512286962e-06, "loss": 0.517, "step": 6021 }, { "epoch": 2.7242705270300838, "grad_norm": 0.13759763538837433, "learning_rate": 8.53566343669582e-06, "loss": 1.3586, "step": 6022 }, { "epoch": 2.724722913368016, "grad_norm": 0.24101150035858154, "learning_rate": 8.535145285169563e-06, "loss": 0.9345, "step": 6023 }, { "epoch": 2.725175299705949, "grad_norm": 0.28699585795402527, "learning_rate": 8.534627057719322e-06, "loss": 0.9876, "step": 6024 }, { "epoch": 2.7256276860438815, "grad_norm": 0.25555410981178284, "learning_rate": 8.534108754356227e-06, "loss": 0.8044, "step": 6025 }, { "epoch": 2.726080072381814, "grad_norm": 0.2256612926721573, "learning_rate": 8.533590375091407e-06, "loss": 0.5085, "step": 6026 }, { "epoch": 2.726532458719747, "grad_norm": 0.26099252700805664, "learning_rate": 8.533071919935996e-06, "loss": 0.6381, "step": 6027 }, { "epoch": 2.7269848450576792, "grad_norm": 0.25663459300994873, "learning_rate": 8.53255338890113e-06, "loss": 0.5102, "step": 6028 }, { "epoch": 2.727437231395612, "grad_norm": 0.2920932173728943, "learning_rate": 8.532034781997944e-06, "loss": 0.5414, "step": 6029 }, { "epoch": 2.7278896177335445, "grad_norm": 0.30182406306266785, "learning_rate": 8.531516099237577e-06, "loss": 0.6224, "step": 6030 }, { "epoch": 2.728342004071477, "grad_norm": 0.3156813979148865, "learning_rate": 8.53099734063117e-06, "loss": 0.5001, "step": 6031 }, { "epoch": 2.7287943904094094, "grad_norm": 0.3112369179725647, "learning_rate": 8.530478506189861e-06, "loss": 0.6409, "step": 6032 }, { "epoch": 2.7292467767473423, "grad_norm": 0.29418858885765076, "learning_rate": 8.529959595924796e-06, "loss": 0.6678, "step": 6033 }, { "epoch": 2.7296991630852747, "grad_norm": 0.2703012526035309, "learning_rate": 8.529440609847118e-06, "loss": 0.5857, "step": 6034 }, { "epoch": 2.7301515494232076, "grad_norm": 0.3554691672325134, "learning_rate": 8.528921547967975e-06, "loss": 0.7982, "step": 6035 }, { "epoch": 2.73060393576114, "grad_norm": 0.3240789771080017, "learning_rate": 8.528402410298511e-06, "loss": 0.6459, "step": 6036 }, { "epoch": 2.7310563220990725, "grad_norm": 0.3128589987754822, "learning_rate": 8.52788319684988e-06, "loss": 0.6302, "step": 6037 }, { "epoch": 2.7315087084370053, "grad_norm": 0.3068155348300934, "learning_rate": 8.527363907633228e-06, "loss": 0.6257, "step": 6038 }, { "epoch": 2.7319610947749378, "grad_norm": 0.30915966629981995, "learning_rate": 8.526844542659713e-06, "loss": 0.5683, "step": 6039 }, { "epoch": 2.7324134811128706, "grad_norm": 0.32039400935173035, "learning_rate": 8.526325101940488e-06, "loss": 0.5233, "step": 6040 }, { "epoch": 2.732865867450803, "grad_norm": 0.34339767694473267, "learning_rate": 8.525805585486708e-06, "loss": 0.5993, "step": 6041 }, { "epoch": 2.7333182537887355, "grad_norm": 0.3259381949901581, "learning_rate": 8.525285993309529e-06, "loss": 0.561, "step": 6042 }, { "epoch": 2.733770640126668, "grad_norm": 0.340766578912735, "learning_rate": 8.524766325420113e-06, "loss": 0.6092, "step": 6043 }, { "epoch": 2.734223026464601, "grad_norm": 0.3114631474018097, "learning_rate": 8.524246581829618e-06, "loss": 0.5029, "step": 6044 }, { "epoch": 2.7346754128025332, "grad_norm": 0.3160081207752228, "learning_rate": 8.52372676254921e-06, "loss": 0.5287, "step": 6045 }, { "epoch": 2.735127799140466, "grad_norm": 0.3532041907310486, "learning_rate": 8.523206867590052e-06, "loss": 0.5498, "step": 6046 }, { "epoch": 2.7355801854783985, "grad_norm": 0.3345959782600403, "learning_rate": 8.522686896963307e-06, "loss": 0.5939, "step": 6047 }, { "epoch": 2.736032571816331, "grad_norm": 0.32937902212142944, "learning_rate": 8.522166850680147e-06, "loss": 0.4986, "step": 6048 }, { "epoch": 2.736484958154264, "grad_norm": 0.33699432015419006, "learning_rate": 8.521646728751737e-06, "loss": 0.566, "step": 6049 }, { "epoch": 2.7369373444921963, "grad_norm": 0.3356224298477173, "learning_rate": 8.52112653118925e-06, "loss": 0.5111, "step": 6050 }, { "epoch": 2.737389730830129, "grad_norm": 0.37009158730506897, "learning_rate": 8.520606258003856e-06, "loss": 0.5687, "step": 6051 }, { "epoch": 2.7378421171680616, "grad_norm": 0.3568638265132904, "learning_rate": 8.520085909206732e-06, "loss": 0.507, "step": 6052 }, { "epoch": 2.738294503505994, "grad_norm": 0.3551192879676819, "learning_rate": 8.51956548480905e-06, "loss": 0.5412, "step": 6053 }, { "epoch": 2.7387468898439264, "grad_norm": 0.3834206759929657, "learning_rate": 8.519044984821989e-06, "loss": 0.5375, "step": 6054 }, { "epoch": 2.7391992761818593, "grad_norm": 0.3322424292564392, "learning_rate": 8.518524409256727e-06, "loss": 0.4624, "step": 6055 }, { "epoch": 2.7396516625197918, "grad_norm": 0.3642440140247345, "learning_rate": 8.518003758124446e-06, "loss": 0.5243, "step": 6056 }, { "epoch": 2.7401040488577246, "grad_norm": 0.3729585111141205, "learning_rate": 8.517483031436328e-06, "loss": 0.5453, "step": 6057 }, { "epoch": 2.740556435195657, "grad_norm": 0.37252336740493774, "learning_rate": 8.516962229203554e-06, "loss": 0.5265, "step": 6058 }, { "epoch": 2.7410088215335895, "grad_norm": 0.3790731132030487, "learning_rate": 8.51644135143731e-06, "loss": 0.5681, "step": 6059 }, { "epoch": 2.7414612078715224, "grad_norm": 0.3946165442466736, "learning_rate": 8.515920398148784e-06, "loss": 0.5496, "step": 6060 }, { "epoch": 2.741913594209455, "grad_norm": 0.43255680799484253, "learning_rate": 8.515399369349164e-06, "loss": 0.5921, "step": 6061 }, { "epoch": 2.7423659805473877, "grad_norm": 0.3812539279460907, "learning_rate": 8.51487826504964e-06, "loss": 0.4937, "step": 6062 }, { "epoch": 2.74281836688532, "grad_norm": 0.43677812814712524, "learning_rate": 8.514357085261405e-06, "loss": 0.5966, "step": 6063 }, { "epoch": 2.7432707532232525, "grad_norm": 0.4026263952255249, "learning_rate": 8.513835829995649e-06, "loss": 0.5584, "step": 6064 }, { "epoch": 2.7437231395611854, "grad_norm": 0.35576343536376953, "learning_rate": 8.51331449926357e-06, "loss": 0.4483, "step": 6065 }, { "epoch": 2.744175525899118, "grad_norm": 0.4558538496494293, "learning_rate": 8.512793093076363e-06, "loss": 0.6426, "step": 6066 }, { "epoch": 2.7446279122370507, "grad_norm": 0.46901488304138184, "learning_rate": 8.512271611445226e-06, "loss": 0.5495, "step": 6067 }, { "epoch": 2.745080298574983, "grad_norm": 0.4998263120651245, "learning_rate": 8.511750054381362e-06, "loss": 0.5882, "step": 6068 }, { "epoch": 2.7455326849129156, "grad_norm": 0.4205273985862732, "learning_rate": 8.51122842189597e-06, "loss": 0.4788, "step": 6069 }, { "epoch": 2.745985071250848, "grad_norm": 0.48713281750679016, "learning_rate": 8.51070671400025e-06, "loss": 0.5052, "step": 6070 }, { "epoch": 2.746437457588781, "grad_norm": 0.5627679824829102, "learning_rate": 8.510184930705413e-06, "loss": 0.5754, "step": 6071 }, { "epoch": 2.7468898439267133, "grad_norm": 0.1362774819135666, "learning_rate": 8.50966307202266e-06, "loss": 1.2551, "step": 6072 }, { "epoch": 2.747342230264646, "grad_norm": 0.21527373790740967, "learning_rate": 8.509141137963202e-06, "loss": 1.0791, "step": 6073 }, { "epoch": 2.7477946166025786, "grad_norm": 0.2941072881221771, "learning_rate": 8.508619128538248e-06, "loss": 0.843, "step": 6074 }, { "epoch": 2.748247002940511, "grad_norm": 0.2490890771150589, "learning_rate": 8.508097043759007e-06, "loss": 0.5636, "step": 6075 }, { "epoch": 2.748699389278444, "grad_norm": 0.2705782353878021, "learning_rate": 8.507574883636695e-06, "loss": 0.625, "step": 6076 }, { "epoch": 2.7491517756163764, "grad_norm": 0.23853208124637604, "learning_rate": 8.507052648182522e-06, "loss": 0.5586, "step": 6077 }, { "epoch": 2.7496041619543092, "grad_norm": 0.3133867681026459, "learning_rate": 8.50653033740771e-06, "loss": 0.7116, "step": 6078 }, { "epoch": 2.7500565482922417, "grad_norm": 0.27598485350608826, "learning_rate": 8.506007951323474e-06, "loss": 0.57, "step": 6079 }, { "epoch": 2.750508934630174, "grad_norm": 0.2919926941394806, "learning_rate": 8.50548548994103e-06, "loss": 0.6581, "step": 6080 }, { "epoch": 2.7509613209681065, "grad_norm": 0.3194435238838196, "learning_rate": 8.504962953271602e-06, "loss": 0.5395, "step": 6081 }, { "epoch": 2.7514137073060394, "grad_norm": 0.31614840030670166, "learning_rate": 8.504440341326413e-06, "loss": 0.5708, "step": 6082 }, { "epoch": 2.751866093643972, "grad_norm": 0.3029356300830841, "learning_rate": 8.503917654116684e-06, "loss": 0.6623, "step": 6083 }, { "epoch": 2.7523184799819047, "grad_norm": 0.29410186409950256, "learning_rate": 8.503394891653644e-06, "loss": 0.565, "step": 6084 }, { "epoch": 2.752770866319837, "grad_norm": 0.3003126084804535, "learning_rate": 8.50287205394852e-06, "loss": 0.5294, "step": 6085 }, { "epoch": 2.7532232526577696, "grad_norm": 0.2832663357257843, "learning_rate": 8.50234914101254e-06, "loss": 0.5655, "step": 6086 }, { "epoch": 2.7536756389957024, "grad_norm": 0.2895132601261139, "learning_rate": 8.501826152856933e-06, "loss": 0.4463, "step": 6087 }, { "epoch": 2.754128025333635, "grad_norm": 0.33470821380615234, "learning_rate": 8.501303089492935e-06, "loss": 0.615, "step": 6088 }, { "epoch": 2.7545804116715678, "grad_norm": 0.3105226755142212, "learning_rate": 8.500779950931776e-06, "loss": 0.5873, "step": 6089 }, { "epoch": 2.7550327980095, "grad_norm": 0.3253321647644043, "learning_rate": 8.500256737184694e-06, "loss": 0.5757, "step": 6090 }, { "epoch": 2.7554851843474326, "grad_norm": 0.343608558177948, "learning_rate": 8.499733448262924e-06, "loss": 0.5273, "step": 6091 }, { "epoch": 2.755937570685365, "grad_norm": 0.38387253880500793, "learning_rate": 8.499210084177705e-06, "loss": 0.7092, "step": 6092 }, { "epoch": 2.756389957023298, "grad_norm": 0.30230918526649475, "learning_rate": 8.498686644940278e-06, "loss": 0.5183, "step": 6093 }, { "epoch": 2.7568423433612304, "grad_norm": 0.3806377351284027, "learning_rate": 8.498163130561885e-06, "loss": 0.5931, "step": 6094 }, { "epoch": 2.7572947296991632, "grad_norm": 0.3210088312625885, "learning_rate": 8.497639541053769e-06, "loss": 0.5267, "step": 6095 }, { "epoch": 2.7577471160370957, "grad_norm": 0.33101341128349304, "learning_rate": 8.497115876427176e-06, "loss": 0.5157, "step": 6096 }, { "epoch": 2.758199502375028, "grad_norm": 0.3695349097251892, "learning_rate": 8.496592136693351e-06, "loss": 0.5843, "step": 6097 }, { "epoch": 2.758651888712961, "grad_norm": 0.32061317563056946, "learning_rate": 8.496068321863544e-06, "loss": 0.5346, "step": 6098 }, { "epoch": 2.7591042750508934, "grad_norm": 0.31404078006744385, "learning_rate": 8.495544431949004e-06, "loss": 0.4654, "step": 6099 }, { "epoch": 2.7595566613888263, "grad_norm": 0.3334069848060608, "learning_rate": 8.495020466960982e-06, "loss": 0.4897, "step": 6100 }, { "epoch": 2.7600090477267587, "grad_norm": 0.35224398970603943, "learning_rate": 8.494496426910732e-06, "loss": 0.5683, "step": 6101 }, { "epoch": 2.760461434064691, "grad_norm": 0.3561459481716156, "learning_rate": 8.493972311809508e-06, "loss": 0.5516, "step": 6102 }, { "epoch": 2.7609138204026236, "grad_norm": 0.3673640787601471, "learning_rate": 8.493448121668568e-06, "loss": 0.5222, "step": 6103 }, { "epoch": 2.7613662067405564, "grad_norm": 0.3821951448917389, "learning_rate": 8.492923856499168e-06, "loss": 0.5881, "step": 6104 }, { "epoch": 2.761818593078489, "grad_norm": 0.41465070843696594, "learning_rate": 8.492399516312571e-06, "loss": 0.5509, "step": 6105 }, { "epoch": 2.7622709794164217, "grad_norm": 0.35343000292778015, "learning_rate": 8.491875101120032e-06, "loss": 0.4832, "step": 6106 }, { "epoch": 2.762723365754354, "grad_norm": 0.39598697423934937, "learning_rate": 8.49135061093282e-06, "loss": 0.5365, "step": 6107 }, { "epoch": 2.7631757520922866, "grad_norm": 0.3799465596675873, "learning_rate": 8.490826045762198e-06, "loss": 0.4823, "step": 6108 }, { "epoch": 2.7636281384302195, "grad_norm": 0.42856287956237793, "learning_rate": 8.490301405619428e-06, "loss": 0.5575, "step": 6109 }, { "epoch": 2.764080524768152, "grad_norm": 0.3869558870792389, "learning_rate": 8.489776690515782e-06, "loss": 0.5191, "step": 6110 }, { "epoch": 2.764532911106085, "grad_norm": 0.38395950198173523, "learning_rate": 8.489251900462528e-06, "loss": 0.56, "step": 6111 }, { "epoch": 2.764985297444017, "grad_norm": 0.416444331407547, "learning_rate": 8.488727035470937e-06, "loss": 0.5061, "step": 6112 }, { "epoch": 2.7654376837819497, "grad_norm": 0.46226152777671814, "learning_rate": 8.488202095552282e-06, "loss": 0.5914, "step": 6113 }, { "epoch": 2.7658900701198825, "grad_norm": 0.4059416949748993, "learning_rate": 8.487677080717836e-06, "loss": 0.4973, "step": 6114 }, { "epoch": 2.766342456457815, "grad_norm": 0.398995965719223, "learning_rate": 8.487151990978873e-06, "loss": 0.4183, "step": 6115 }, { "epoch": 2.766794842795748, "grad_norm": 0.3846459686756134, "learning_rate": 8.486626826346675e-06, "loss": 0.4682, "step": 6116 }, { "epoch": 2.7672472291336803, "grad_norm": 0.48520851135253906, "learning_rate": 8.486101586832516e-06, "loss": 0.5945, "step": 6117 }, { "epoch": 2.7676996154716127, "grad_norm": 0.5179101228713989, "learning_rate": 8.48557627244768e-06, "loss": 0.6097, "step": 6118 }, { "epoch": 2.768152001809545, "grad_norm": 0.43403759598731995, "learning_rate": 8.485050883203449e-06, "loss": 0.4632, "step": 6119 }, { "epoch": 2.768604388147478, "grad_norm": 0.43896594643592834, "learning_rate": 8.484525419111104e-06, "loss": 0.4846, "step": 6120 }, { "epoch": 2.7690567744854104, "grad_norm": 0.4603344202041626, "learning_rate": 8.483999880181932e-06, "loss": 0.4537, "step": 6121 }, { "epoch": 2.7695091608233433, "grad_norm": 0.15166841447353363, "learning_rate": 8.48347426642722e-06, "loss": 1.2633, "step": 6122 }, { "epoch": 2.7699615471612757, "grad_norm": 0.24070915579795837, "learning_rate": 8.482948577858256e-06, "loss": 0.9602, "step": 6123 }, { "epoch": 2.770413933499208, "grad_norm": 0.22136203944683075, "learning_rate": 8.482422814486334e-06, "loss": 0.6419, "step": 6124 }, { "epoch": 2.770866319837141, "grad_norm": 0.24155260622501373, "learning_rate": 8.481896976322738e-06, "loss": 0.5958, "step": 6125 }, { "epoch": 2.7713187061750735, "grad_norm": 0.29409489035606384, "learning_rate": 8.481371063378768e-06, "loss": 0.7142, "step": 6126 }, { "epoch": 2.7717710925130064, "grad_norm": 0.24571989476680756, "learning_rate": 8.480845075665717e-06, "loss": 0.6112, "step": 6127 }, { "epoch": 2.772223478850939, "grad_norm": 0.2618984282016754, "learning_rate": 8.48031901319488e-06, "loss": 0.6434, "step": 6128 }, { "epoch": 2.772675865188871, "grad_norm": 0.27783143520355225, "learning_rate": 8.479792875977557e-06, "loss": 0.5691, "step": 6129 }, { "epoch": 2.7731282515268036, "grad_norm": 0.28954893350601196, "learning_rate": 8.47926666402505e-06, "loss": 0.697, "step": 6130 }, { "epoch": 2.7735806378647365, "grad_norm": 0.27583909034729004, "learning_rate": 8.478740377348655e-06, "loss": 0.6328, "step": 6131 }, { "epoch": 2.774033024202669, "grad_norm": 0.30720651149749756, "learning_rate": 8.478214015959677e-06, "loss": 0.5865, "step": 6132 }, { "epoch": 2.774485410540602, "grad_norm": 0.28928491473197937, "learning_rate": 8.477687579869424e-06, "loss": 0.6233, "step": 6133 }, { "epoch": 2.7749377968785343, "grad_norm": 0.3897515535354614, "learning_rate": 8.477161069089199e-06, "loss": 0.6809, "step": 6134 }, { "epoch": 2.7753901832164667, "grad_norm": 0.2945503890514374, "learning_rate": 8.476634483630308e-06, "loss": 0.5151, "step": 6135 }, { "epoch": 2.7758425695543996, "grad_norm": 0.2932729423046112, "learning_rate": 8.476107823504067e-06, "loss": 0.5371, "step": 6136 }, { "epoch": 2.776294955892332, "grad_norm": 0.31229186058044434, "learning_rate": 8.475581088721779e-06, "loss": 0.5747, "step": 6137 }, { "epoch": 2.776747342230265, "grad_norm": 0.3078504204750061, "learning_rate": 8.475054279294763e-06, "loss": 0.6085, "step": 6138 }, { "epoch": 2.7771997285681973, "grad_norm": 0.3092038035392761, "learning_rate": 8.474527395234329e-06, "loss": 0.5608, "step": 6139 }, { "epoch": 2.7776521149061297, "grad_norm": 0.31650686264038086, "learning_rate": 8.474000436551796e-06, "loss": 0.4749, "step": 6140 }, { "epoch": 2.778104501244062, "grad_norm": 0.3243841230869293, "learning_rate": 8.47347340325848e-06, "loss": 0.5887, "step": 6141 }, { "epoch": 2.778556887581995, "grad_norm": 0.3567022681236267, "learning_rate": 8.472946295365699e-06, "loss": 0.618, "step": 6142 }, { "epoch": 2.7790092739199275, "grad_norm": 0.33521777391433716, "learning_rate": 8.472419112884776e-06, "loss": 0.4754, "step": 6143 }, { "epoch": 2.7794616602578603, "grad_norm": 0.3200432360172272, "learning_rate": 8.471891855827031e-06, "loss": 0.5919, "step": 6144 }, { "epoch": 2.7799140465957928, "grad_norm": 0.31626954674720764, "learning_rate": 8.47136452420379e-06, "loss": 0.4854, "step": 6145 }, { "epoch": 2.780366432933725, "grad_norm": 0.32336506247520447, "learning_rate": 8.470837118026375e-06, "loss": 0.5146, "step": 6146 }, { "epoch": 2.780818819271658, "grad_norm": 0.3448488414287567, "learning_rate": 8.470309637306118e-06, "loss": 0.5991, "step": 6147 }, { "epoch": 2.7812712056095905, "grad_norm": 0.3167770802974701, "learning_rate": 8.469782082054344e-06, "loss": 0.4771, "step": 6148 }, { "epoch": 2.7817235919475234, "grad_norm": 0.3381984233856201, "learning_rate": 8.469254452282385e-06, "loss": 0.5125, "step": 6149 }, { "epoch": 2.782175978285456, "grad_norm": 0.3246023654937744, "learning_rate": 8.46872674800157e-06, "loss": 0.4811, "step": 6150 }, { "epoch": 2.7826283646233883, "grad_norm": 0.3591046631336212, "learning_rate": 8.468198969223236e-06, "loss": 0.6105, "step": 6151 }, { "epoch": 2.783080750961321, "grad_norm": 0.3771142065525055, "learning_rate": 8.467671115958717e-06, "loss": 0.6902, "step": 6152 }, { "epoch": 2.7835331372992536, "grad_norm": 0.3544699549674988, "learning_rate": 8.467143188219349e-06, "loss": 0.5954, "step": 6153 }, { "epoch": 2.7839855236371864, "grad_norm": 0.35321858525276184, "learning_rate": 8.466615186016469e-06, "loss": 0.4597, "step": 6154 }, { "epoch": 2.784437909975119, "grad_norm": 0.39087796211242676, "learning_rate": 8.466087109361421e-06, "loss": 0.5641, "step": 6155 }, { "epoch": 2.7848902963130513, "grad_norm": 0.3677123785018921, "learning_rate": 8.465558958265542e-06, "loss": 0.4967, "step": 6156 }, { "epoch": 2.7853426826509837, "grad_norm": 0.3635774850845337, "learning_rate": 8.465030732740178e-06, "loss": 0.5586, "step": 6157 }, { "epoch": 2.7857950689889166, "grad_norm": 0.37253332138061523, "learning_rate": 8.464502432796672e-06, "loss": 0.5092, "step": 6158 }, { "epoch": 2.786247455326849, "grad_norm": 0.40326499938964844, "learning_rate": 8.46397405844637e-06, "loss": 0.5782, "step": 6159 }, { "epoch": 2.786699841664782, "grad_norm": 0.4213951826095581, "learning_rate": 8.463445609700622e-06, "loss": 0.5551, "step": 6160 }, { "epoch": 2.7871522280027143, "grad_norm": 0.4139993488788605, "learning_rate": 8.462917086570775e-06, "loss": 0.5807, "step": 6161 }, { "epoch": 2.7876046143406468, "grad_norm": 0.3687395751476288, "learning_rate": 8.462388489068182e-06, "loss": 0.472, "step": 6162 }, { "epoch": 2.7880570006785796, "grad_norm": 0.40840139985084534, "learning_rate": 8.461859817204195e-06, "loss": 0.5614, "step": 6163 }, { "epoch": 2.788509387016512, "grad_norm": 0.3758353888988495, "learning_rate": 8.461331070990166e-06, "loss": 0.4622, "step": 6164 }, { "epoch": 2.788961773354445, "grad_norm": 0.4233570992946625, "learning_rate": 8.460802250437455e-06, "loss": 0.5044, "step": 6165 }, { "epoch": 2.7894141596923774, "grad_norm": 0.41003742814064026, "learning_rate": 8.460273355557416e-06, "loss": 0.5805, "step": 6166 }, { "epoch": 2.78986654603031, "grad_norm": 0.395407497882843, "learning_rate": 8.45974438636141e-06, "loss": 0.4871, "step": 6167 }, { "epoch": 2.7903189323682422, "grad_norm": 0.44534987211227417, "learning_rate": 8.459215342860797e-06, "loss": 0.4825, "step": 6168 }, { "epoch": 2.790771318706175, "grad_norm": 0.501579999923706, "learning_rate": 8.45868622506694e-06, "loss": 0.5543, "step": 6169 }, { "epoch": 2.7912237050441076, "grad_norm": 0.49535277485847473, "learning_rate": 8.458157032991199e-06, "loss": 0.4964, "step": 6170 }, { "epoch": 2.7916760913820404, "grad_norm": 0.5585960745811462, "learning_rate": 8.457627766644944e-06, "loss": 0.533, "step": 6171 }, { "epoch": 2.792128477719973, "grad_norm": 0.15836162865161896, "learning_rate": 8.457098426039541e-06, "loss": 1.0764, "step": 6172 }, { "epoch": 2.7925808640579053, "grad_norm": 0.25518423318862915, "learning_rate": 8.456569011186357e-06, "loss": 0.9793, "step": 6173 }, { "epoch": 2.793033250395838, "grad_norm": 0.2559159994125366, "learning_rate": 8.456039522096763e-06, "loss": 0.6363, "step": 6174 }, { "epoch": 2.7934856367337706, "grad_norm": 0.23892712593078613, "learning_rate": 8.455509958782131e-06, "loss": 0.5625, "step": 6175 }, { "epoch": 2.7939380230717035, "grad_norm": 0.2578345239162445, "learning_rate": 8.454980321253837e-06, "loss": 0.6417, "step": 6176 }, { "epoch": 2.794390409409636, "grad_norm": 0.31682127714157104, "learning_rate": 8.45445060952325e-06, "loss": 0.6312, "step": 6177 }, { "epoch": 2.7948427957475683, "grad_norm": 0.28201162815093994, "learning_rate": 8.453920823601751e-06, "loss": 0.5933, "step": 6178 }, { "epoch": 2.7952951820855008, "grad_norm": 0.272927463054657, "learning_rate": 8.453390963500717e-06, "loss": 0.5452, "step": 6179 }, { "epoch": 2.7957475684234336, "grad_norm": 0.26827892661094666, "learning_rate": 8.452861029231527e-06, "loss": 0.6376, "step": 6180 }, { "epoch": 2.796199954761366, "grad_norm": 0.2833632230758667, "learning_rate": 8.452331020805564e-06, "loss": 0.5897, "step": 6181 }, { "epoch": 2.796652341099299, "grad_norm": 0.2700693905353546, "learning_rate": 8.45180093823421e-06, "loss": 0.5155, "step": 6182 }, { "epoch": 2.7971047274372314, "grad_norm": 0.2844320833683014, "learning_rate": 8.451270781528851e-06, "loss": 0.6037, "step": 6183 }, { "epoch": 2.797557113775164, "grad_norm": 0.32812047004699707, "learning_rate": 8.450740550700869e-06, "loss": 0.6709, "step": 6184 }, { "epoch": 2.7980095001130967, "grad_norm": 0.29616957902908325, "learning_rate": 8.450210245761656e-06, "loss": 0.6117, "step": 6185 }, { "epoch": 2.798461886451029, "grad_norm": 0.3134191334247589, "learning_rate": 8.4496798667226e-06, "loss": 0.5129, "step": 6186 }, { "epoch": 2.798914272788962, "grad_norm": 0.2843526601791382, "learning_rate": 8.44914941359509e-06, "loss": 0.4699, "step": 6187 }, { "epoch": 2.7993666591268944, "grad_norm": 0.3438127636909485, "learning_rate": 8.448618886390523e-06, "loss": 0.6904, "step": 6188 }, { "epoch": 2.799819045464827, "grad_norm": 0.3723723292350769, "learning_rate": 8.448088285120289e-06, "loss": 0.6007, "step": 6189 }, { "epoch": 2.8002714318027593, "grad_norm": 0.3546414375305176, "learning_rate": 8.447557609795782e-06, "loss": 0.5321, "step": 6190 }, { "epoch": 2.800723818140692, "grad_norm": 0.3725714683532715, "learning_rate": 8.447026860428404e-06, "loss": 0.6806, "step": 6191 }, { "epoch": 2.8011762044786246, "grad_norm": 0.311830997467041, "learning_rate": 8.446496037029555e-06, "loss": 0.4768, "step": 6192 }, { "epoch": 2.8016285908165575, "grad_norm": 0.31676241755485535, "learning_rate": 8.44596513961063e-06, "loss": 0.5286, "step": 6193 }, { "epoch": 2.80208097715449, "grad_norm": 0.3274959921836853, "learning_rate": 8.445434168183034e-06, "loss": 0.5673, "step": 6194 }, { "epoch": 2.8025333634924223, "grad_norm": 0.3899116516113281, "learning_rate": 8.444903122758169e-06, "loss": 0.6801, "step": 6195 }, { "epoch": 2.802985749830355, "grad_norm": 0.3491309583187103, "learning_rate": 8.44437200334744e-06, "loss": 0.5154, "step": 6196 }, { "epoch": 2.8034381361682876, "grad_norm": 0.33349162340164185, "learning_rate": 8.443840809962259e-06, "loss": 0.5326, "step": 6197 }, { "epoch": 2.8038905225062205, "grad_norm": 0.36534202098846436, "learning_rate": 8.443309542614028e-06, "loss": 0.5926, "step": 6198 }, { "epoch": 2.804342908844153, "grad_norm": 0.41432860493659973, "learning_rate": 8.442778201314161e-06, "loss": 0.5917, "step": 6199 }, { "epoch": 2.8047952951820854, "grad_norm": 0.35014453530311584, "learning_rate": 8.442246786074066e-06, "loss": 0.537, "step": 6200 }, { "epoch": 2.8047952951820854, "eval_loss": 0.5947020053863525, "eval_runtime": 26.1837, "eval_samples_per_second": 28.415, "eval_steps_per_second": 7.104, "step": 6200 }, { "epoch": 2.8052476815200182, "grad_norm": 0.3348887264728546, "learning_rate": 8.44171529690516e-06, "loss": 0.4055, "step": 6201 }, { "epoch": 2.8057000678579507, "grad_norm": 0.46182915568351746, "learning_rate": 8.441183733818855e-06, "loss": 0.5262, "step": 6202 }, { "epoch": 2.8061524541958835, "grad_norm": 0.4023842215538025, "learning_rate": 8.44065209682657e-06, "loss": 0.6211, "step": 6203 }, { "epoch": 2.806604840533816, "grad_norm": 0.39854997396469116, "learning_rate": 8.44012038593972e-06, "loss": 0.6064, "step": 6204 }, { "epoch": 2.8070572268717484, "grad_norm": 0.3701247274875641, "learning_rate": 8.439588601169725e-06, "loss": 0.5426, "step": 6205 }, { "epoch": 2.807509613209681, "grad_norm": 0.3685521185398102, "learning_rate": 8.43905674252801e-06, "loss": 0.5073, "step": 6206 }, { "epoch": 2.8079619995476137, "grad_norm": 0.405699759721756, "learning_rate": 8.438524810025993e-06, "loss": 0.6111, "step": 6207 }, { "epoch": 2.808414385885546, "grad_norm": 0.36581215262413025, "learning_rate": 8.437992803675097e-06, "loss": 0.5355, "step": 6208 }, { "epoch": 2.808866772223479, "grad_norm": 0.38004231452941895, "learning_rate": 8.437460723486752e-06, "loss": 0.5501, "step": 6209 }, { "epoch": 2.8093191585614115, "grad_norm": 0.3786655366420746, "learning_rate": 8.436928569472385e-06, "loss": 0.4448, "step": 6210 }, { "epoch": 2.809771544899344, "grad_norm": 0.38796377182006836, "learning_rate": 8.436396341643424e-06, "loss": 0.4978, "step": 6211 }, { "epoch": 2.8102239312372768, "grad_norm": 0.40608352422714233, "learning_rate": 8.4358640400113e-06, "loss": 0.4881, "step": 6212 }, { "epoch": 2.810676317575209, "grad_norm": 0.40776529908180237, "learning_rate": 8.435331664587444e-06, "loss": 0.4788, "step": 6213 }, { "epoch": 2.811128703913142, "grad_norm": 0.40614232420921326, "learning_rate": 8.434799215383292e-06, "loss": 0.5421, "step": 6214 }, { "epoch": 2.8115810902510745, "grad_norm": 0.42375648021698, "learning_rate": 8.434266692410276e-06, "loss": 0.5089, "step": 6215 }, { "epoch": 2.812033476589007, "grad_norm": 0.4284926950931549, "learning_rate": 8.433734095679837e-06, "loss": 0.5072, "step": 6216 }, { "epoch": 2.8124858629269394, "grad_norm": 0.45583000779151917, "learning_rate": 8.43320142520341e-06, "loss": 0.5247, "step": 6217 }, { "epoch": 2.8129382492648722, "grad_norm": 0.4446853697299957, "learning_rate": 8.432668680992437e-06, "loss": 0.5602, "step": 6218 }, { "epoch": 2.8133906356028047, "grad_norm": 0.4495112895965576, "learning_rate": 8.43213586305836e-06, "loss": 0.4735, "step": 6219 }, { "epoch": 2.8138430219407375, "grad_norm": 0.49791133403778076, "learning_rate": 8.43160297141262e-06, "loss": 0.5081, "step": 6220 }, { "epoch": 2.81429540827867, "grad_norm": 0.6754029393196106, "learning_rate": 8.431070006066664e-06, "loss": 0.6445, "step": 6221 }, { "epoch": 2.8147477946166024, "grad_norm": 0.15185509622097015, "learning_rate": 8.430536967031939e-06, "loss": 1.123, "step": 6222 }, { "epoch": 2.8152001809545353, "grad_norm": 0.2798718512058258, "learning_rate": 8.430003854319891e-06, "loss": 0.7425, "step": 6223 }, { "epoch": 2.8156525672924677, "grad_norm": 0.2528233826160431, "learning_rate": 8.42947066794197e-06, "loss": 0.6606, "step": 6224 }, { "epoch": 2.8161049536304006, "grad_norm": 0.27249518036842346, "learning_rate": 8.42893740790963e-06, "loss": 0.6188, "step": 6225 }, { "epoch": 2.816557339968333, "grad_norm": 0.24340683221817017, "learning_rate": 8.42840407423432e-06, "loss": 0.4934, "step": 6226 }, { "epoch": 2.8170097263062654, "grad_norm": 0.2617064416408539, "learning_rate": 8.427870666927495e-06, "loss": 0.5254, "step": 6227 }, { "epoch": 2.817462112644198, "grad_norm": 0.3037700653076172, "learning_rate": 8.427337186000613e-06, "loss": 0.5005, "step": 6228 }, { "epoch": 2.8179144989821308, "grad_norm": 0.28405898809432983, "learning_rate": 8.426803631465131e-06, "loss": 0.5846, "step": 6229 }, { "epoch": 2.818366885320063, "grad_norm": 0.3150983452796936, "learning_rate": 8.426270003332506e-06, "loss": 0.5733, "step": 6230 }, { "epoch": 2.818819271657996, "grad_norm": 0.32117927074432373, "learning_rate": 8.4257363016142e-06, "loss": 0.6371, "step": 6231 }, { "epoch": 2.8192716579959285, "grad_norm": 0.32360726594924927, "learning_rate": 8.425202526321676e-06, "loss": 0.6899, "step": 6232 }, { "epoch": 2.819724044333861, "grad_norm": 0.3610115051269531, "learning_rate": 8.424668677466397e-06, "loss": 0.7061, "step": 6233 }, { "epoch": 2.820176430671794, "grad_norm": 0.26590695977211, "learning_rate": 8.424134755059829e-06, "loss": 0.3918, "step": 6234 }, { "epoch": 2.8206288170097262, "grad_norm": 0.29354292154312134, "learning_rate": 8.42360075911344e-06, "loss": 0.5325, "step": 6235 }, { "epoch": 2.821081203347659, "grad_norm": 0.34784817695617676, "learning_rate": 8.423066689638696e-06, "loss": 0.5558, "step": 6236 }, { "epoch": 2.8215335896855915, "grad_norm": 0.3220553696155548, "learning_rate": 8.422532546647067e-06, "loss": 0.5869, "step": 6237 }, { "epoch": 2.821985976023524, "grad_norm": 0.315766841173172, "learning_rate": 8.421998330150027e-06, "loss": 0.5216, "step": 6238 }, { "epoch": 2.822438362361457, "grad_norm": 0.36304911971092224, "learning_rate": 8.421464040159047e-06, "loss": 0.719, "step": 6239 }, { "epoch": 2.8228907486993893, "grad_norm": 0.3162936270236969, "learning_rate": 8.420929676685606e-06, "loss": 0.5285, "step": 6240 }, { "epoch": 2.823343135037322, "grad_norm": 0.3200879395008087, "learning_rate": 8.420395239741178e-06, "loss": 0.5743, "step": 6241 }, { "epoch": 2.8237955213752546, "grad_norm": 0.3767230808734894, "learning_rate": 8.419860729337237e-06, "loss": 0.6255, "step": 6242 }, { "epoch": 2.824247907713187, "grad_norm": 0.32472044229507446, "learning_rate": 8.41932614548527e-06, "loss": 0.5679, "step": 6243 }, { "epoch": 2.8247002940511194, "grad_norm": 0.3311210870742798, "learning_rate": 8.418791488196753e-06, "loss": 0.6396, "step": 6244 }, { "epoch": 2.8251526803890523, "grad_norm": 0.3263010084629059, "learning_rate": 8.418256757483172e-06, "loss": 0.5712, "step": 6245 }, { "epoch": 2.8256050667269847, "grad_norm": 0.3193914592266083, "learning_rate": 8.417721953356009e-06, "loss": 0.4518, "step": 6246 }, { "epoch": 2.8260574530649176, "grad_norm": 0.2921936511993408, "learning_rate": 8.417187075826751e-06, "loss": 0.498, "step": 6247 }, { "epoch": 2.82650983940285, "grad_norm": 0.3584350645542145, "learning_rate": 8.416652124906886e-06, "loss": 0.5995, "step": 6248 }, { "epoch": 2.8269622257407825, "grad_norm": 0.34334105253219604, "learning_rate": 8.4161171006079e-06, "loss": 0.4855, "step": 6249 }, { "epoch": 2.8274146120787154, "grad_norm": 0.3686297833919525, "learning_rate": 8.415582002941288e-06, "loss": 0.5166, "step": 6250 }, { "epoch": 2.827866998416648, "grad_norm": 0.38422319293022156, "learning_rate": 8.415046831918541e-06, "loss": 0.564, "step": 6251 }, { "epoch": 2.8283193847545807, "grad_norm": 0.37301504611968994, "learning_rate": 8.41451158755115e-06, "loss": 0.639, "step": 6252 }, { "epoch": 2.828771771092513, "grad_norm": 0.3455844521522522, "learning_rate": 8.413976269850615e-06, "loss": 0.5125, "step": 6253 }, { "epoch": 2.8292241574304455, "grad_norm": 0.34586766362190247, "learning_rate": 8.413440878828427e-06, "loss": 0.4864, "step": 6254 }, { "epoch": 2.829676543768378, "grad_norm": 0.3955437242984772, "learning_rate": 8.41290541449609e-06, "loss": 0.5677, "step": 6255 }, { "epoch": 2.830128930106311, "grad_norm": 0.34955838322639465, "learning_rate": 8.412369876865103e-06, "loss": 0.5168, "step": 6256 }, { "epoch": 2.8305813164442433, "grad_norm": 0.4088304936885834, "learning_rate": 8.411834265946966e-06, "loss": 0.6149, "step": 6257 }, { "epoch": 2.831033702782176, "grad_norm": 0.39521464705467224, "learning_rate": 8.411298581753183e-06, "loss": 0.5725, "step": 6258 }, { "epoch": 2.8314860891201086, "grad_norm": 0.3681979477405548, "learning_rate": 8.41076282429526e-06, "loss": 0.483, "step": 6259 }, { "epoch": 2.831938475458041, "grad_norm": 0.43050578236579895, "learning_rate": 8.4102269935847e-06, "loss": 0.6264, "step": 6260 }, { "epoch": 2.832390861795974, "grad_norm": 0.43465229868888855, "learning_rate": 8.409691089633013e-06, "loss": 0.5555, "step": 6261 }, { "epoch": 2.8328432481339063, "grad_norm": 0.4050455093383789, "learning_rate": 8.409155112451712e-06, "loss": 0.5189, "step": 6262 }, { "epoch": 2.833295634471839, "grad_norm": 0.47633618116378784, "learning_rate": 8.408619062052305e-06, "loss": 0.618, "step": 6263 }, { "epoch": 2.8337480208097716, "grad_norm": 0.4043081998825073, "learning_rate": 8.408082938446302e-06, "loss": 0.5406, "step": 6264 }, { "epoch": 2.834200407147704, "grad_norm": 0.44257932901382446, "learning_rate": 8.407546741645221e-06, "loss": 0.5016, "step": 6265 }, { "epoch": 2.8346527934856365, "grad_norm": 0.45985129475593567, "learning_rate": 8.407010471660578e-06, "loss": 0.5833, "step": 6266 }, { "epoch": 2.8351051798235694, "grad_norm": 0.46535125374794006, "learning_rate": 8.406474128503887e-06, "loss": 0.5524, "step": 6267 }, { "epoch": 2.835557566161502, "grad_norm": 0.4770302176475525, "learning_rate": 8.40593771218667e-06, "loss": 0.5574, "step": 6268 }, { "epoch": 2.8360099524994347, "grad_norm": 0.4380345940589905, "learning_rate": 8.40540122272045e-06, "loss": 0.5196, "step": 6269 }, { "epoch": 2.836462338837367, "grad_norm": 0.4606255888938904, "learning_rate": 8.404864660116743e-06, "loss": 0.422, "step": 6270 }, { "epoch": 2.8369147251752995, "grad_norm": 0.5152679681777954, "learning_rate": 8.404328024387076e-06, "loss": 0.4779, "step": 6271 }, { "epoch": 2.8373671115132324, "grad_norm": 0.12395033240318298, "learning_rate": 8.403791315542976e-06, "loss": 1.0565, "step": 6272 }, { "epoch": 2.837819497851165, "grad_norm": 0.20076867938041687, "learning_rate": 8.403254533595963e-06, "loss": 1.0446, "step": 6273 }, { "epoch": 2.8382718841890977, "grad_norm": 0.23939728736877441, "learning_rate": 8.402717678557576e-06, "loss": 0.5405, "step": 6274 }, { "epoch": 2.83872427052703, "grad_norm": 0.25838860869407654, "learning_rate": 8.402180750439335e-06, "loss": 0.6637, "step": 6275 }, { "epoch": 2.8391766568649626, "grad_norm": 0.2627571225166321, "learning_rate": 8.401643749252777e-06, "loss": 0.5979, "step": 6276 }, { "epoch": 2.839629043202895, "grad_norm": 0.27024760842323303, "learning_rate": 8.401106675009434e-06, "loss": 0.6459, "step": 6277 }, { "epoch": 2.840081429540828, "grad_norm": 0.2880045473575592, "learning_rate": 8.40056952772084e-06, "loss": 0.6346, "step": 6278 }, { "epoch": 2.8405338158787603, "grad_norm": 0.25773370265960693, "learning_rate": 8.400032307398533e-06, "loss": 0.401, "step": 6279 }, { "epoch": 2.840986202216693, "grad_norm": 0.27172422409057617, "learning_rate": 8.399495014054048e-06, "loss": 0.5644, "step": 6280 }, { "epoch": 2.8414385885546256, "grad_norm": 0.3012930750846863, "learning_rate": 8.398957647698928e-06, "loss": 0.5963, "step": 6281 }, { "epoch": 2.841890974892558, "grad_norm": 0.2709486782550812, "learning_rate": 8.398420208344711e-06, "loss": 0.5046, "step": 6282 }, { "epoch": 2.842343361230491, "grad_norm": 0.2732105851173401, "learning_rate": 8.39788269600294e-06, "loss": 0.4549, "step": 6283 }, { "epoch": 2.8427957475684233, "grad_norm": 0.2793128788471222, "learning_rate": 8.39734511068516e-06, "loss": 0.5273, "step": 6284 }, { "epoch": 2.8432481339063562, "grad_norm": 0.29159659147262573, "learning_rate": 8.396807452402916e-06, "loss": 0.5998, "step": 6285 }, { "epoch": 2.8437005202442887, "grad_norm": 0.3168523907661438, "learning_rate": 8.396269721167758e-06, "loss": 0.497, "step": 6286 }, { "epoch": 2.844152906582221, "grad_norm": 0.3229195475578308, "learning_rate": 8.39573191699123e-06, "loss": 0.5162, "step": 6287 }, { "epoch": 2.844605292920154, "grad_norm": 0.3285291790962219, "learning_rate": 8.395194039884885e-06, "loss": 0.5446, "step": 6288 }, { "epoch": 2.8450576792580864, "grad_norm": 0.33733153343200684, "learning_rate": 8.394656089860274e-06, "loss": 0.5985, "step": 6289 }, { "epoch": 2.8455100655960193, "grad_norm": 0.34269580245018005, "learning_rate": 8.394118066928953e-06, "loss": 0.5281, "step": 6290 }, { "epoch": 2.8459624519339517, "grad_norm": 0.3019844591617584, "learning_rate": 8.393579971102474e-06, "loss": 0.4992, "step": 6291 }, { "epoch": 2.846414838271884, "grad_norm": 0.32748478651046753, "learning_rate": 8.393041802392395e-06, "loss": 0.5487, "step": 6292 }, { "epoch": 2.8468672246098166, "grad_norm": 0.32067370414733887, "learning_rate": 8.392503560810275e-06, "loss": 0.5728, "step": 6293 }, { "epoch": 2.8473196109477494, "grad_norm": 0.3407701253890991, "learning_rate": 8.391965246367672e-06, "loss": 0.5817, "step": 6294 }, { "epoch": 2.847771997285682, "grad_norm": 0.3892028331756592, "learning_rate": 8.391426859076148e-06, "loss": 0.6317, "step": 6295 }, { "epoch": 2.8482243836236147, "grad_norm": 0.3245588541030884, "learning_rate": 8.390888398947265e-06, "loss": 0.4593, "step": 6296 }, { "epoch": 2.848676769961547, "grad_norm": 0.4015701413154602, "learning_rate": 8.390349865992591e-06, "loss": 0.6575, "step": 6297 }, { "epoch": 2.8491291562994796, "grad_norm": 0.3548543155193329, "learning_rate": 8.389811260223687e-06, "loss": 0.6207, "step": 6298 }, { "epoch": 2.8495815426374125, "grad_norm": 0.3547293543815613, "learning_rate": 8.389272581652125e-06, "loss": 0.5861, "step": 6299 }, { "epoch": 2.850033928975345, "grad_norm": 0.3704690933227539, "learning_rate": 8.388733830289473e-06, "loss": 0.6059, "step": 6300 }, { "epoch": 2.850486315313278, "grad_norm": 0.3233630061149597, "learning_rate": 8.3881950061473e-06, "loss": 0.4958, "step": 6301 }, { "epoch": 2.85093870165121, "grad_norm": 0.3650766611099243, "learning_rate": 8.38765610923718e-06, "loss": 0.6148, "step": 6302 }, { "epoch": 2.8513910879891426, "grad_norm": 0.34355050325393677, "learning_rate": 8.387117139570684e-06, "loss": 0.5454, "step": 6303 }, { "epoch": 2.851843474327075, "grad_norm": 0.3566795587539673, "learning_rate": 8.38657809715939e-06, "loss": 0.555, "step": 6304 }, { "epoch": 2.852295860665008, "grad_norm": 0.39813438057899475, "learning_rate": 8.386038982014876e-06, "loss": 0.5387, "step": 6305 }, { "epoch": 2.8527482470029404, "grad_norm": 0.3498702049255371, "learning_rate": 8.385499794148716e-06, "loss": 0.4288, "step": 6306 }, { "epoch": 2.8532006333408733, "grad_norm": 0.37301018834114075, "learning_rate": 8.384960533572497e-06, "loss": 0.5045, "step": 6307 }, { "epoch": 2.8536530196788057, "grad_norm": 0.4019213318824768, "learning_rate": 8.384421200297796e-06, "loss": 0.4703, "step": 6308 }, { "epoch": 2.854105406016738, "grad_norm": 0.44594481587409973, "learning_rate": 8.383881794336194e-06, "loss": 0.6261, "step": 6309 }, { "epoch": 2.854557792354671, "grad_norm": 0.41359180212020874, "learning_rate": 8.383342315699281e-06, "loss": 0.5546, "step": 6310 }, { "epoch": 2.8550101786926034, "grad_norm": 0.3804074823856354, "learning_rate": 8.382802764398642e-06, "loss": 0.5159, "step": 6311 }, { "epoch": 2.8554625650305363, "grad_norm": 0.3861176371574402, "learning_rate": 8.382263140445862e-06, "loss": 0.4914, "step": 6312 }, { "epoch": 2.8559149513684687, "grad_norm": 0.3967639207839966, "learning_rate": 8.381723443852533e-06, "loss": 0.5226, "step": 6313 }, { "epoch": 2.856367337706401, "grad_norm": 0.38036972284317017, "learning_rate": 8.381183674630245e-06, "loss": 0.5006, "step": 6314 }, { "epoch": 2.8568197240443336, "grad_norm": 0.3908082842826843, "learning_rate": 8.380643832790591e-06, "loss": 0.4826, "step": 6315 }, { "epoch": 2.8572721103822665, "grad_norm": 0.42447593808174133, "learning_rate": 8.380103918345166e-06, "loss": 0.5403, "step": 6316 }, { "epoch": 2.857724496720199, "grad_norm": 0.40453624725341797, "learning_rate": 8.379563931305565e-06, "loss": 0.4821, "step": 6317 }, { "epoch": 2.858176883058132, "grad_norm": 0.49485066533088684, "learning_rate": 8.379023871683385e-06, "loss": 0.5459, "step": 6318 }, { "epoch": 2.858629269396064, "grad_norm": 0.5098987817764282, "learning_rate": 8.378483739490224e-06, "loss": 0.6846, "step": 6319 }, { "epoch": 2.8590816557339966, "grad_norm": 0.5125452876091003, "learning_rate": 8.377943534737682e-06, "loss": 0.5433, "step": 6320 }, { "epoch": 2.8595340420719295, "grad_norm": 0.5891380310058594, "learning_rate": 8.377403257437364e-06, "loss": 0.6635, "step": 6321 }, { "epoch": 2.859986428409862, "grad_norm": 0.14236848056316376, "learning_rate": 8.376862907600872e-06, "loss": 1.1295, "step": 6322 }, { "epoch": 2.860438814747795, "grad_norm": 0.18700291216373444, "learning_rate": 8.37632248523981e-06, "loss": 0.9502, "step": 6323 }, { "epoch": 2.8608912010857273, "grad_norm": 0.2712372839450836, "learning_rate": 8.375781990365786e-06, "loss": 0.7874, "step": 6324 }, { "epoch": 2.8613435874236597, "grad_norm": 0.26843321323394775, "learning_rate": 8.375241422990405e-06, "loss": 0.6103, "step": 6325 }, { "epoch": 2.8617959737615926, "grad_norm": 0.2712703049182892, "learning_rate": 8.374700783125282e-06, "loss": 0.6004, "step": 6326 }, { "epoch": 2.862248360099525, "grad_norm": 0.25130924582481384, "learning_rate": 8.374160070782024e-06, "loss": 0.5535, "step": 6327 }, { "epoch": 2.862700746437458, "grad_norm": 0.2572259306907654, "learning_rate": 8.373619285972246e-06, "loss": 0.6269, "step": 6328 }, { "epoch": 2.8631531327753903, "grad_norm": 0.2814970910549164, "learning_rate": 8.373078428707562e-06, "loss": 0.5801, "step": 6329 }, { "epoch": 2.8636055191133227, "grad_norm": 0.26195797324180603, "learning_rate": 8.372537498999587e-06, "loss": 0.525, "step": 6330 }, { "epoch": 2.864057905451255, "grad_norm": 0.2859537899494171, "learning_rate": 8.371996496859938e-06, "loss": 0.5194, "step": 6331 }, { "epoch": 2.864510291789188, "grad_norm": 0.2946650981903076, "learning_rate": 8.371455422300237e-06, "loss": 0.545, "step": 6332 }, { "epoch": 2.8649626781271205, "grad_norm": 0.32422712445259094, "learning_rate": 8.370914275332101e-06, "loss": 0.5966, "step": 6333 }, { "epoch": 2.8654150644650533, "grad_norm": 0.31002601981163025, "learning_rate": 8.370373055967155e-06, "loss": 0.5663, "step": 6334 }, { "epoch": 2.8658674508029858, "grad_norm": 0.32718193531036377, "learning_rate": 8.36983176421702e-06, "loss": 0.5318, "step": 6335 }, { "epoch": 2.866319837140918, "grad_norm": 0.2955475151538849, "learning_rate": 8.369290400093325e-06, "loss": 0.4979, "step": 6336 }, { "epoch": 2.866772223478851, "grad_norm": 0.319456547498703, "learning_rate": 8.368748963607694e-06, "loss": 0.5592, "step": 6337 }, { "epoch": 2.8672246098167835, "grad_norm": 0.3479180634021759, "learning_rate": 8.368207454771757e-06, "loss": 0.5238, "step": 6338 }, { "epoch": 2.8676769961547164, "grad_norm": 0.3575822114944458, "learning_rate": 8.367665873597141e-06, "loss": 0.6157, "step": 6339 }, { "epoch": 2.868129382492649, "grad_norm": 0.37350958585739136, "learning_rate": 8.367124220095481e-06, "loss": 0.6526, "step": 6340 }, { "epoch": 2.8685817688305812, "grad_norm": 0.35868382453918457, "learning_rate": 8.366582494278409e-06, "loss": 0.6004, "step": 6341 }, { "epoch": 2.8690341551685137, "grad_norm": 0.368195116519928, "learning_rate": 8.366040696157559e-06, "loss": 0.6457, "step": 6342 }, { "epoch": 2.8694865415064466, "grad_norm": 0.3608800172805786, "learning_rate": 8.365498825744566e-06, "loss": 0.6403, "step": 6343 }, { "epoch": 2.869938927844379, "grad_norm": 0.3426936864852905, "learning_rate": 8.36495688305107e-06, "loss": 0.5174, "step": 6344 }, { "epoch": 2.870391314182312, "grad_norm": 0.3513723611831665, "learning_rate": 8.364414868088708e-06, "loss": 0.5531, "step": 6345 }, { "epoch": 2.8708437005202443, "grad_norm": 0.35436201095581055, "learning_rate": 8.363872780869123e-06, "loss": 0.5607, "step": 6346 }, { "epoch": 2.8712960868581767, "grad_norm": 0.3721621632575989, "learning_rate": 8.363330621403957e-06, "loss": 0.5041, "step": 6347 }, { "epoch": 2.8717484731961096, "grad_norm": 0.42748168110847473, "learning_rate": 8.362788389704852e-06, "loss": 0.6489, "step": 6348 }, { "epoch": 2.872200859534042, "grad_norm": 0.3515227437019348, "learning_rate": 8.362246085783457e-06, "loss": 0.5329, "step": 6349 }, { "epoch": 2.872653245871975, "grad_norm": 0.369052916765213, "learning_rate": 8.361703709651415e-06, "loss": 0.5723, "step": 6350 }, { "epoch": 2.8731056322099073, "grad_norm": 0.3484916090965271, "learning_rate": 8.361161261320376e-06, "loss": 0.4991, "step": 6351 }, { "epoch": 2.8735580185478398, "grad_norm": 0.3657105565071106, "learning_rate": 8.360618740801992e-06, "loss": 0.5667, "step": 6352 }, { "epoch": 2.874010404885772, "grad_norm": 0.3824383020401001, "learning_rate": 8.360076148107911e-06, "loss": 0.5903, "step": 6353 }, { "epoch": 2.874462791223705, "grad_norm": 0.32984092831611633, "learning_rate": 8.359533483249791e-06, "loss": 0.5164, "step": 6354 }, { "epoch": 2.8749151775616375, "grad_norm": 0.3690013885498047, "learning_rate": 8.358990746239284e-06, "loss": 0.5353, "step": 6355 }, { "epoch": 2.8753675638995704, "grad_norm": 0.37027254700660706, "learning_rate": 8.358447937088045e-06, "loss": 0.4871, "step": 6356 }, { "epoch": 2.875819950237503, "grad_norm": 0.35892075300216675, "learning_rate": 8.357905055807733e-06, "loss": 0.5093, "step": 6357 }, { "epoch": 2.8762723365754352, "grad_norm": 0.35304495692253113, "learning_rate": 8.357362102410009e-06, "loss": 0.5035, "step": 6358 }, { "epoch": 2.876724722913368, "grad_norm": 0.38415247201919556, "learning_rate": 8.356819076906532e-06, "loss": 0.4883, "step": 6359 }, { "epoch": 2.8771771092513005, "grad_norm": 0.4389605224132538, "learning_rate": 8.356275979308965e-06, "loss": 0.5556, "step": 6360 }, { "epoch": 2.8776294955892334, "grad_norm": 0.41657590866088867, "learning_rate": 8.355732809628971e-06, "loss": 0.5327, "step": 6361 }, { "epoch": 2.878081881927166, "grad_norm": 0.38421010971069336, "learning_rate": 8.355189567878219e-06, "loss": 0.4958, "step": 6362 }, { "epoch": 2.8785342682650983, "grad_norm": 0.4056447446346283, "learning_rate": 8.354646254068373e-06, "loss": 0.5149, "step": 6363 }, { "epoch": 2.8789866546030307, "grad_norm": 0.45014721155166626, "learning_rate": 8.354102868211103e-06, "loss": 0.5087, "step": 6364 }, { "epoch": 2.8794390409409636, "grad_norm": 0.4109978973865509, "learning_rate": 8.35355941031808e-06, "loss": 0.4705, "step": 6365 }, { "epoch": 2.879891427278896, "grad_norm": 0.4117828905582428, "learning_rate": 8.353015880400974e-06, "loss": 0.5603, "step": 6366 }, { "epoch": 2.880343813616829, "grad_norm": 0.4496031701564789, "learning_rate": 8.352472278471457e-06, "loss": 0.5102, "step": 6367 }, { "epoch": 2.8807961999547613, "grad_norm": 0.43703439831733704, "learning_rate": 8.351928604541208e-06, "loss": 0.5228, "step": 6368 }, { "epoch": 2.8812485862926938, "grad_norm": 0.5062860250473022, "learning_rate": 8.3513848586219e-06, "loss": 0.5033, "step": 6369 }, { "epoch": 2.8817009726306266, "grad_norm": 0.4546487331390381, "learning_rate": 8.350841040725214e-06, "loss": 0.5338, "step": 6370 }, { "epoch": 2.882153358968559, "grad_norm": 0.5733978748321533, "learning_rate": 8.350297150862828e-06, "loss": 0.5403, "step": 6371 }, { "epoch": 2.882605745306492, "grad_norm": 0.13298550248146057, "learning_rate": 8.349753189046421e-06, "loss": 1.1777, "step": 6372 }, { "epoch": 2.8830581316444244, "grad_norm": 0.1755703240633011, "learning_rate": 8.34920915528768e-06, "loss": 0.3953, "step": 6373 }, { "epoch": 2.883510517982357, "grad_norm": 0.27576911449432373, "learning_rate": 8.348665049598286e-06, "loss": 0.7621, "step": 6374 }, { "epoch": 2.8839629043202897, "grad_norm": 0.28176149725914, "learning_rate": 8.348120871989925e-06, "loss": 0.6092, "step": 6375 }, { "epoch": 2.884415290658222, "grad_norm": 0.2563265562057495, "learning_rate": 8.347576622474283e-06, "loss": 0.5349, "step": 6376 }, { "epoch": 2.884867676996155, "grad_norm": 0.29110488295555115, "learning_rate": 8.347032301063054e-06, "loss": 0.692, "step": 6377 }, { "epoch": 2.8853200633340874, "grad_norm": 0.261529803276062, "learning_rate": 8.346487907767922e-06, "loss": 0.5124, "step": 6378 }, { "epoch": 2.88577244967202, "grad_norm": 0.2722216546535492, "learning_rate": 8.345943442600582e-06, "loss": 0.619, "step": 6379 }, { "epoch": 2.8862248360099523, "grad_norm": 0.33604803681373596, "learning_rate": 8.345398905572728e-06, "loss": 0.7843, "step": 6380 }, { "epoch": 2.886677222347885, "grad_norm": 0.3096086084842682, "learning_rate": 8.344854296696054e-06, "loss": 0.6188, "step": 6381 }, { "epoch": 2.8871296086858176, "grad_norm": 0.3112984299659729, "learning_rate": 8.344309615982258e-06, "loss": 0.5735, "step": 6382 }, { "epoch": 2.8875819950237505, "grad_norm": 0.36803704500198364, "learning_rate": 8.343764863443034e-06, "loss": 0.7708, "step": 6383 }, { "epoch": 2.888034381361683, "grad_norm": 0.32453423738479614, "learning_rate": 8.343220039090086e-06, "loss": 0.5144, "step": 6384 }, { "epoch": 2.8884867676996153, "grad_norm": 0.3243732154369354, "learning_rate": 8.342675142935113e-06, "loss": 0.4918, "step": 6385 }, { "epoch": 2.888939154037548, "grad_norm": 0.30582788586616516, "learning_rate": 8.342130174989819e-06, "loss": 0.4091, "step": 6386 }, { "epoch": 2.8893915403754806, "grad_norm": 0.2996274530887604, "learning_rate": 8.341585135265906e-06, "loss": 0.4928, "step": 6387 }, { "epoch": 2.8898439267134135, "grad_norm": 0.3513928949832916, "learning_rate": 8.341040023775083e-06, "loss": 0.5708, "step": 6388 }, { "epoch": 2.890296313051346, "grad_norm": 0.32961711287498474, "learning_rate": 8.340494840529053e-06, "loss": 0.6068, "step": 6389 }, { "epoch": 2.8907486993892784, "grad_norm": 0.3190494477748871, "learning_rate": 8.339949585539528e-06, "loss": 0.5415, "step": 6390 }, { "epoch": 2.891201085727211, "grad_norm": 0.34470149874687195, "learning_rate": 8.339404258818218e-06, "loss": 0.5841, "step": 6391 }, { "epoch": 2.8916534720651437, "grad_norm": 0.3388875126838684, "learning_rate": 8.338858860376836e-06, "loss": 0.5797, "step": 6392 }, { "epoch": 2.892105858403076, "grad_norm": 0.3643176853656769, "learning_rate": 8.338313390227091e-06, "loss": 0.5433, "step": 6393 }, { "epoch": 2.892558244741009, "grad_norm": 0.35315629839897156, "learning_rate": 8.337767848380703e-06, "loss": 0.5094, "step": 6394 }, { "epoch": 2.8930106310789414, "grad_norm": 0.36274734139442444, "learning_rate": 8.337222234849387e-06, "loss": 0.4963, "step": 6395 }, { "epoch": 2.893463017416874, "grad_norm": 0.393794447183609, "learning_rate": 8.33667654964486e-06, "loss": 0.5473, "step": 6396 }, { "epoch": 2.8939154037548067, "grad_norm": 0.37483569979667664, "learning_rate": 8.336130792778842e-06, "loss": 0.5886, "step": 6397 }, { "epoch": 2.894367790092739, "grad_norm": 0.35760408639907837, "learning_rate": 8.335584964263054e-06, "loss": 0.5137, "step": 6398 }, { "epoch": 2.894820176430672, "grad_norm": 0.3614044189453125, "learning_rate": 8.335039064109222e-06, "loss": 0.5745, "step": 6399 }, { "epoch": 2.8952725627686045, "grad_norm": 0.3847323954105377, "learning_rate": 8.334493092329065e-06, "loss": 0.4599, "step": 6400 }, { "epoch": 2.8952725627686045, "eval_loss": 0.5947079062461853, "eval_runtime": 25.7449, "eval_samples_per_second": 28.899, "eval_steps_per_second": 7.225, "step": 6400 }, { "epoch": 2.895724949106537, "grad_norm": 0.3687978982925415, "learning_rate": 8.333947048934312e-06, "loss": 0.5909, "step": 6401 }, { "epoch": 2.8961773354444693, "grad_norm": 0.36289846897125244, "learning_rate": 8.33340093393669e-06, "loss": 0.5377, "step": 6402 }, { "epoch": 2.896629721782402, "grad_norm": 0.3999541997909546, "learning_rate": 8.332854747347927e-06, "loss": 0.6652, "step": 6403 }, { "epoch": 2.8970821081203346, "grad_norm": 0.41098257899284363, "learning_rate": 8.332308489179754e-06, "loss": 0.634, "step": 6404 }, { "epoch": 2.8975344944582675, "grad_norm": 0.38316720724105835, "learning_rate": 8.331762159443901e-06, "loss": 0.5133, "step": 6405 }, { "epoch": 2.8979868807962, "grad_norm": 0.4161182641983032, "learning_rate": 8.331215758152107e-06, "loss": 0.6272, "step": 6406 }, { "epoch": 2.8984392671341324, "grad_norm": 0.36222657561302185, "learning_rate": 8.330669285316099e-06, "loss": 0.4958, "step": 6407 }, { "epoch": 2.8988916534720652, "grad_norm": 0.38928380608558655, "learning_rate": 8.33012274094762e-06, "loss": 0.5663, "step": 6408 }, { "epoch": 2.8993440398099977, "grad_norm": 0.3509397804737091, "learning_rate": 8.329576125058406e-06, "loss": 0.457, "step": 6409 }, { "epoch": 2.8997964261479305, "grad_norm": 0.404513418674469, "learning_rate": 8.329029437660196e-06, "loss": 0.5929, "step": 6410 }, { "epoch": 2.900248812485863, "grad_norm": 0.4439297020435333, "learning_rate": 8.328482678764734e-06, "loss": 0.617, "step": 6411 }, { "epoch": 2.9007011988237954, "grad_norm": 0.3985638916492462, "learning_rate": 8.327935848383758e-06, "loss": 0.5169, "step": 6412 }, { "epoch": 2.9011535851617283, "grad_norm": 0.4392116963863373, "learning_rate": 8.327388946529013e-06, "loss": 0.529, "step": 6413 }, { "epoch": 2.9016059714996607, "grad_norm": 0.4356406629085541, "learning_rate": 8.326841973212249e-06, "loss": 0.5079, "step": 6414 }, { "epoch": 2.902058357837593, "grad_norm": 0.46012967824935913, "learning_rate": 8.32629492844521e-06, "loss": 0.4733, "step": 6415 }, { "epoch": 2.902510744175526, "grad_norm": 0.47029829025268555, "learning_rate": 8.325747812239645e-06, "loss": 0.571, "step": 6416 }, { "epoch": 2.9029631305134584, "grad_norm": 0.4924481511116028, "learning_rate": 8.325200624607304e-06, "loss": 0.5672, "step": 6417 }, { "epoch": 2.903415516851391, "grad_norm": 0.4430055320262909, "learning_rate": 8.324653365559942e-06, "loss": 0.5093, "step": 6418 }, { "epoch": 2.9038679031893238, "grad_norm": 0.4929184317588806, "learning_rate": 8.32410603510931e-06, "loss": 0.4903, "step": 6419 }, { "epoch": 2.904320289527256, "grad_norm": 0.6252223253250122, "learning_rate": 8.32355863326716e-06, "loss": 0.6846, "step": 6420 }, { "epoch": 2.904772675865189, "grad_norm": 0.6128209233283997, "learning_rate": 8.323011160045254e-06, "loss": 0.6849, "step": 6421 }, { "epoch": 2.9052250622031215, "grad_norm": 0.15767472982406616, "learning_rate": 8.322463615455346e-06, "loss": 1.1805, "step": 6422 }, { "epoch": 2.905677448541054, "grad_norm": 0.239568829536438, "learning_rate": 8.3219159995092e-06, "loss": 0.8265, "step": 6423 }, { "epoch": 2.906129834878987, "grad_norm": 0.23465080559253693, "learning_rate": 8.321368312218573e-06, "loss": 0.613, "step": 6424 }, { "epoch": 2.9065822212169192, "grad_norm": 0.24307198822498322, "learning_rate": 8.320820553595228e-06, "loss": 0.6013, "step": 6425 }, { "epoch": 2.907034607554852, "grad_norm": 0.2543877363204956, "learning_rate": 8.32027272365093e-06, "loss": 0.644, "step": 6426 }, { "epoch": 2.9074869938927845, "grad_norm": 0.2623932957649231, "learning_rate": 8.319724822397446e-06, "loss": 0.5806, "step": 6427 }, { "epoch": 2.907939380230717, "grad_norm": 0.2706856429576874, "learning_rate": 8.319176849846543e-06, "loss": 0.6422, "step": 6428 }, { "epoch": 2.9083917665686494, "grad_norm": 0.2954728901386261, "learning_rate": 8.318628806009988e-06, "loss": 0.5537, "step": 6429 }, { "epoch": 2.9088441529065823, "grad_norm": 0.27852821350097656, "learning_rate": 8.31808069089955e-06, "loss": 0.5933, "step": 6430 }, { "epoch": 2.9092965392445147, "grad_norm": 0.3055676519870758, "learning_rate": 8.317532504527003e-06, "loss": 0.6061, "step": 6431 }, { "epoch": 2.9097489255824476, "grad_norm": 0.3056114912033081, "learning_rate": 8.316984246904122e-06, "loss": 0.5414, "step": 6432 }, { "epoch": 2.91020131192038, "grad_norm": 0.29300540685653687, "learning_rate": 8.316435918042682e-06, "loss": 0.5577, "step": 6433 }, { "epoch": 2.9106536982583124, "grad_norm": 0.30266106128692627, "learning_rate": 8.315887517954454e-06, "loss": 0.5932, "step": 6434 }, { "epoch": 2.9111060845962453, "grad_norm": 0.3051559329032898, "learning_rate": 8.31533904665122e-06, "loss": 0.601, "step": 6435 }, { "epoch": 2.9115584709341777, "grad_norm": 0.339847207069397, "learning_rate": 8.31479050414476e-06, "loss": 0.5552, "step": 6436 }, { "epoch": 2.9120108572721106, "grad_norm": 0.32294756174087524, "learning_rate": 8.314241890446853e-06, "loss": 0.5487, "step": 6437 }, { "epoch": 2.912463243610043, "grad_norm": 0.3678239583969116, "learning_rate": 8.313693205569284e-06, "loss": 0.6579, "step": 6438 }, { "epoch": 2.9129156299479755, "grad_norm": 0.3044910728931427, "learning_rate": 8.313144449523833e-06, "loss": 0.5609, "step": 6439 }, { "epoch": 2.913368016285908, "grad_norm": 0.3542015552520752, "learning_rate": 8.31259562232229e-06, "loss": 0.5655, "step": 6440 }, { "epoch": 2.913820402623841, "grad_norm": 0.313269704580307, "learning_rate": 8.31204672397644e-06, "loss": 0.6216, "step": 6441 }, { "epoch": 2.914272788961773, "grad_norm": 0.32869815826416016, "learning_rate": 8.31149775449807e-06, "loss": 0.5048, "step": 6442 }, { "epoch": 2.914725175299706, "grad_norm": 0.3386023938655853, "learning_rate": 8.310948713898973e-06, "loss": 0.6103, "step": 6443 }, { "epoch": 2.9151775616376385, "grad_norm": 0.36298924684524536, "learning_rate": 8.310399602190942e-06, "loss": 0.5955, "step": 6444 }, { "epoch": 2.915629947975571, "grad_norm": 0.3044707775115967, "learning_rate": 8.309850419385765e-06, "loss": 0.4686, "step": 6445 }, { "epoch": 2.916082334313504, "grad_norm": 0.34286779165267944, "learning_rate": 8.30930116549524e-06, "loss": 0.4983, "step": 6446 }, { "epoch": 2.9165347206514363, "grad_norm": 0.3430196940898895, "learning_rate": 8.308751840531165e-06, "loss": 0.5627, "step": 6447 }, { "epoch": 2.916987106989369, "grad_norm": 0.3095053732395172, "learning_rate": 8.308202444505333e-06, "loss": 0.4067, "step": 6448 }, { "epoch": 2.9174394933273016, "grad_norm": 0.38266828656196594, "learning_rate": 8.307652977429548e-06, "loss": 0.5955, "step": 6449 }, { "epoch": 2.917891879665234, "grad_norm": 0.42231717705726624, "learning_rate": 8.307103439315607e-06, "loss": 0.525, "step": 6450 }, { "epoch": 2.9183442660031664, "grad_norm": 0.33850833773612976, "learning_rate": 8.306553830175315e-06, "loss": 0.5195, "step": 6451 }, { "epoch": 2.9187966523410993, "grad_norm": 0.3570455312728882, "learning_rate": 8.306004150020478e-06, "loss": 0.5313, "step": 6452 }, { "epoch": 2.9192490386790317, "grad_norm": 0.36414119601249695, "learning_rate": 8.305454398862895e-06, "loss": 0.5478, "step": 6453 }, { "epoch": 2.9197014250169646, "grad_norm": 0.32384422421455383, "learning_rate": 8.304904576714376e-06, "loss": 0.3619, "step": 6454 }, { "epoch": 2.920153811354897, "grad_norm": 0.3510342240333557, "learning_rate": 8.304354683586733e-06, "loss": 0.5552, "step": 6455 }, { "epoch": 2.9206061976928295, "grad_norm": 0.38970234990119934, "learning_rate": 8.303804719491772e-06, "loss": 0.5715, "step": 6456 }, { "epoch": 2.9210585840307624, "grad_norm": 0.4098857045173645, "learning_rate": 8.303254684441305e-06, "loss": 0.5009, "step": 6457 }, { "epoch": 2.921510970368695, "grad_norm": 0.3909189701080322, "learning_rate": 8.302704578447147e-06, "loss": 0.5223, "step": 6458 }, { "epoch": 2.9219633567066277, "grad_norm": 0.35397499799728394, "learning_rate": 8.302154401521109e-06, "loss": 0.4832, "step": 6459 }, { "epoch": 2.92241574304456, "grad_norm": 0.4282948076725006, "learning_rate": 8.301604153675012e-06, "loss": 0.5813, "step": 6460 }, { "epoch": 2.9228681293824925, "grad_norm": 0.40208008885383606, "learning_rate": 8.30105383492067e-06, "loss": 0.6003, "step": 6461 }, { "epoch": 2.9233205157204254, "grad_norm": 0.3633049428462982, "learning_rate": 8.3005034452699e-06, "loss": 0.4865, "step": 6462 }, { "epoch": 2.923772902058358, "grad_norm": 0.38000816106796265, "learning_rate": 8.29995298473453e-06, "loss": 0.458, "step": 6463 }, { "epoch": 2.9242252883962907, "grad_norm": 0.4080607295036316, "learning_rate": 8.299402453326375e-06, "loss": 0.4992, "step": 6464 }, { "epoch": 2.924677674734223, "grad_norm": 0.4939263164997101, "learning_rate": 8.298851851057264e-06, "loss": 0.6574, "step": 6465 }, { "epoch": 2.9251300610721556, "grad_norm": 0.4863220453262329, "learning_rate": 8.298301177939018e-06, "loss": 0.567, "step": 6466 }, { "epoch": 2.925582447410088, "grad_norm": 0.4935426712036133, "learning_rate": 8.297750433983467e-06, "loss": 0.6121, "step": 6467 }, { "epoch": 2.926034833748021, "grad_norm": 0.5052579045295715, "learning_rate": 8.297199619202436e-06, "loss": 0.5748, "step": 6468 }, { "epoch": 2.9264872200859533, "grad_norm": 0.4389953911304474, "learning_rate": 8.296648733607759e-06, "loss": 0.4123, "step": 6469 }, { "epoch": 2.926939606423886, "grad_norm": 0.5156069397926331, "learning_rate": 8.296097777211263e-06, "loss": 0.4784, "step": 6470 }, { "epoch": 2.9273919927618186, "grad_norm": 0.6600238084793091, "learning_rate": 8.295546750024782e-06, "loss": 0.5544, "step": 6471 }, { "epoch": 2.927844379099751, "grad_norm": 0.14915455877780914, "learning_rate": 8.294995652060155e-06, "loss": 1.1766, "step": 6472 }, { "epoch": 2.928296765437684, "grad_norm": 0.19560065865516663, "learning_rate": 8.294444483329212e-06, "loss": 1.1191, "step": 6473 }, { "epoch": 2.9287491517756163, "grad_norm": 0.22354350984096527, "learning_rate": 8.293893243843792e-06, "loss": 0.6817, "step": 6474 }, { "epoch": 2.929201538113549, "grad_norm": 0.2476997673511505, "learning_rate": 8.293341933615735e-06, "loss": 0.5417, "step": 6475 }, { "epoch": 2.9296539244514817, "grad_norm": 0.2576313316822052, "learning_rate": 8.292790552656881e-06, "loss": 0.6153, "step": 6476 }, { "epoch": 2.930106310789414, "grad_norm": 0.28269582986831665, "learning_rate": 8.292239100979072e-06, "loss": 0.6241, "step": 6477 }, { "epoch": 2.9305586971273465, "grad_norm": 0.27053123712539673, "learning_rate": 8.29168757859415e-06, "loss": 0.517, "step": 6478 }, { "epoch": 2.9310110834652794, "grad_norm": 0.29141396284103394, "learning_rate": 8.291135985513962e-06, "loss": 0.6824, "step": 6479 }, { "epoch": 2.931463469803212, "grad_norm": 0.2964974641799927, "learning_rate": 8.290584321750355e-06, "loss": 0.5586, "step": 6480 }, { "epoch": 2.9319158561411447, "grad_norm": 0.32856056094169617, "learning_rate": 8.290032587315174e-06, "loss": 0.6545, "step": 6481 }, { "epoch": 2.932368242479077, "grad_norm": 0.3230401277542114, "learning_rate": 8.289480782220271e-06, "loss": 0.5869, "step": 6482 }, { "epoch": 2.9328206288170096, "grad_norm": 0.30749186873435974, "learning_rate": 8.288928906477497e-06, "loss": 0.5757, "step": 6483 }, { "epoch": 2.9332730151549424, "grad_norm": 0.29436877369880676, "learning_rate": 8.288376960098704e-06, "loss": 0.548, "step": 6484 }, { "epoch": 2.933725401492875, "grad_norm": 0.2833237946033478, "learning_rate": 8.287824943095746e-06, "loss": 0.6568, "step": 6485 }, { "epoch": 2.9341777878308077, "grad_norm": 0.3945300877094269, "learning_rate": 8.287272855480477e-06, "loss": 0.6288, "step": 6486 }, { "epoch": 2.93463017416874, "grad_norm": 0.2973654866218567, "learning_rate": 8.286720697264759e-06, "loss": 0.5229, "step": 6487 }, { "epoch": 2.9350825605066726, "grad_norm": 0.3297414183616638, "learning_rate": 8.286168468460445e-06, "loss": 0.6377, "step": 6488 }, { "epoch": 2.935534946844605, "grad_norm": 0.3027338683605194, "learning_rate": 8.285616169079399e-06, "loss": 0.564, "step": 6489 }, { "epoch": 2.935987333182538, "grad_norm": 0.33672428131103516, "learning_rate": 8.285063799133482e-06, "loss": 0.5989, "step": 6490 }, { "epoch": 2.9364397195204703, "grad_norm": 0.3296302258968353, "learning_rate": 8.284511358634555e-06, "loss": 0.5383, "step": 6491 }, { "epoch": 2.936892105858403, "grad_norm": 0.34682178497314453, "learning_rate": 8.283958847594485e-06, "loss": 0.5337, "step": 6492 }, { "epoch": 2.9373444921963356, "grad_norm": 0.3448314368724823, "learning_rate": 8.283406266025136e-06, "loss": 0.6904, "step": 6493 }, { "epoch": 2.937796878534268, "grad_norm": 0.33733466267585754, "learning_rate": 8.282853613938379e-06, "loss": 0.5833, "step": 6494 }, { "epoch": 2.938249264872201, "grad_norm": 0.3295680284500122, "learning_rate": 8.28230089134608e-06, "loss": 0.5465, "step": 6495 }, { "epoch": 2.9387016512101334, "grad_norm": 0.3779410123825073, "learning_rate": 8.281748098260113e-06, "loss": 0.6653, "step": 6496 }, { "epoch": 2.9391540375480663, "grad_norm": 0.31704214215278625, "learning_rate": 8.281195234692348e-06, "loss": 0.5232, "step": 6497 }, { "epoch": 2.9396064238859987, "grad_norm": 0.34034934639930725, "learning_rate": 8.28064230065466e-06, "loss": 0.5595, "step": 6498 }, { "epoch": 2.940058810223931, "grad_norm": 0.34248417615890503, "learning_rate": 8.280089296158923e-06, "loss": 0.4991, "step": 6499 }, { "epoch": 2.940511196561864, "grad_norm": 0.37275567650794983, "learning_rate": 8.279536221217012e-06, "loss": 0.6102, "step": 6500 }, { "epoch": 2.9409635828997964, "grad_norm": 0.31930696964263916, "learning_rate": 8.278983075840811e-06, "loss": 0.5216, "step": 6501 }, { "epoch": 2.941415969237729, "grad_norm": 0.3398669362068176, "learning_rate": 8.278429860042197e-06, "loss": 0.5048, "step": 6502 }, { "epoch": 2.9418683555756617, "grad_norm": 0.39144638180732727, "learning_rate": 8.27787657383305e-06, "loss": 0.6151, "step": 6503 }, { "epoch": 2.942320741913594, "grad_norm": 0.3620992600917816, "learning_rate": 8.277323217225254e-06, "loss": 0.625, "step": 6504 }, { "epoch": 2.9427731282515266, "grad_norm": 0.3616740107536316, "learning_rate": 8.276769790230693e-06, "loss": 0.5197, "step": 6505 }, { "epoch": 2.9432255145894595, "grad_norm": 0.38564497232437134, "learning_rate": 8.276216292861254e-06, "loss": 0.4886, "step": 6506 }, { "epoch": 2.943677900927392, "grad_norm": 0.38361725211143494, "learning_rate": 8.275662725128823e-06, "loss": 0.5046, "step": 6507 }, { "epoch": 2.9441302872653248, "grad_norm": 0.34826481342315674, "learning_rate": 8.27510908704529e-06, "loss": 0.4837, "step": 6508 }, { "epoch": 2.944582673603257, "grad_norm": 0.37145963311195374, "learning_rate": 8.274555378622544e-06, "loss": 0.5062, "step": 6509 }, { "epoch": 2.9450350599411896, "grad_norm": 0.45534050464630127, "learning_rate": 8.274001599872479e-06, "loss": 0.7216, "step": 6510 }, { "epoch": 2.9454874462791225, "grad_norm": 0.3901090919971466, "learning_rate": 8.273447750806987e-06, "loss": 0.5135, "step": 6511 }, { "epoch": 2.945939832617055, "grad_norm": 0.425809383392334, "learning_rate": 8.272893831437964e-06, "loss": 0.5727, "step": 6512 }, { "epoch": 2.946392218954988, "grad_norm": 0.39206647872924805, "learning_rate": 8.272339841777305e-06, "loss": 0.4807, "step": 6513 }, { "epoch": 2.9468446052929202, "grad_norm": 0.41766440868377686, "learning_rate": 8.27178578183691e-06, "loss": 0.4827, "step": 6514 }, { "epoch": 2.9472969916308527, "grad_norm": 0.5035632848739624, "learning_rate": 8.271231651628675e-06, "loss": 0.6575, "step": 6515 }, { "epoch": 2.947749377968785, "grad_norm": 0.5096244812011719, "learning_rate": 8.270677451164506e-06, "loss": 0.633, "step": 6516 }, { "epoch": 2.948201764306718, "grad_norm": 0.46289166808128357, "learning_rate": 8.2701231804563e-06, "loss": 0.5066, "step": 6517 }, { "epoch": 2.9486541506446504, "grad_norm": 0.4809775948524475, "learning_rate": 8.269568839515966e-06, "loss": 0.4929, "step": 6518 }, { "epoch": 2.9491065369825833, "grad_norm": 0.42113229632377625, "learning_rate": 8.269014428355407e-06, "loss": 0.4581, "step": 6519 }, { "epoch": 2.9495589233205157, "grad_norm": 0.44944462180137634, "learning_rate": 8.26845994698653e-06, "loss": 0.5129, "step": 6520 }, { "epoch": 2.950011309658448, "grad_norm": 0.572476863861084, "learning_rate": 8.267905395421242e-06, "loss": 0.5083, "step": 6521 }, { "epoch": 2.950463695996381, "grad_norm": 0.1368122547864914, "learning_rate": 8.267350773671459e-06, "loss": 1.1119, "step": 6522 }, { "epoch": 2.9509160823343135, "grad_norm": 0.210594043135643, "learning_rate": 8.266796081749087e-06, "loss": 0.6129, "step": 6523 }, { "epoch": 2.9513684686722463, "grad_norm": 0.24581894278526306, "learning_rate": 8.26624131966604e-06, "loss": 0.5442, "step": 6524 }, { "epoch": 2.9518208550101788, "grad_norm": 0.27198100090026855, "learning_rate": 8.265686487434232e-06, "loss": 0.7932, "step": 6525 }, { "epoch": 2.952273241348111, "grad_norm": 0.2949771285057068, "learning_rate": 8.265131585065582e-06, "loss": 0.7075, "step": 6526 }, { "epoch": 2.9527256276860436, "grad_norm": 0.30533361434936523, "learning_rate": 8.264576612572006e-06, "loss": 0.71, "step": 6527 }, { "epoch": 2.9531780140239765, "grad_norm": 0.2800408899784088, "learning_rate": 8.264021569965423e-06, "loss": 0.4723, "step": 6528 }, { "epoch": 2.953630400361909, "grad_norm": 0.3077542185783386, "learning_rate": 8.263466457257753e-06, "loss": 0.6426, "step": 6529 }, { "epoch": 2.954082786699842, "grad_norm": 0.2911969721317291, "learning_rate": 8.262911274460918e-06, "loss": 0.6905, "step": 6530 }, { "epoch": 2.9545351730377742, "grad_norm": 0.31784096360206604, "learning_rate": 8.262356021586843e-06, "loss": 0.6813, "step": 6531 }, { "epoch": 2.9549875593757067, "grad_norm": 0.3086147904396057, "learning_rate": 8.261800698647453e-06, "loss": 0.5655, "step": 6532 }, { "epoch": 2.9554399457136395, "grad_norm": 0.3133292496204376, "learning_rate": 8.261245305654674e-06, "loss": 0.4419, "step": 6533 }, { "epoch": 2.955892332051572, "grad_norm": 0.2996436357498169, "learning_rate": 8.260689842620433e-06, "loss": 0.5489, "step": 6534 }, { "epoch": 2.956344718389505, "grad_norm": 0.2882857322692871, "learning_rate": 8.260134309556663e-06, "loss": 0.5357, "step": 6535 }, { "epoch": 2.9567971047274373, "grad_norm": 0.37113943696022034, "learning_rate": 8.25957870647529e-06, "loss": 0.5628, "step": 6536 }, { "epoch": 2.9572494910653697, "grad_norm": 0.3403480052947998, "learning_rate": 8.259023033388251e-06, "loss": 0.5486, "step": 6537 }, { "epoch": 2.957701877403302, "grad_norm": 0.336093932390213, "learning_rate": 8.258467290307478e-06, "loss": 0.5739, "step": 6538 }, { "epoch": 2.958154263741235, "grad_norm": 0.3260478675365448, "learning_rate": 8.257911477244908e-06, "loss": 0.5697, "step": 6539 }, { "epoch": 2.9586066500791675, "grad_norm": 0.32656487822532654, "learning_rate": 8.257355594212478e-06, "loss": 0.6629, "step": 6540 }, { "epoch": 2.9590590364171003, "grad_norm": 0.32236507534980774, "learning_rate": 8.256799641222124e-06, "loss": 0.5109, "step": 6541 }, { "epoch": 2.9595114227550328, "grad_norm": 0.32879307866096497, "learning_rate": 8.25624361828579e-06, "loss": 0.5301, "step": 6542 }, { "epoch": 2.959963809092965, "grad_norm": 0.35089895129203796, "learning_rate": 8.255687525415415e-06, "loss": 0.5936, "step": 6543 }, { "epoch": 2.960416195430898, "grad_norm": 0.35999757051467896, "learning_rate": 8.255131362622943e-06, "loss": 0.667, "step": 6544 }, { "epoch": 2.9608685817688305, "grad_norm": 0.3491911292076111, "learning_rate": 8.25457512992032e-06, "loss": 0.555, "step": 6545 }, { "epoch": 2.9613209681067634, "grad_norm": 0.348842978477478, "learning_rate": 8.25401882731949e-06, "loss": 0.5508, "step": 6546 }, { "epoch": 2.961773354444696, "grad_norm": 0.3658214807510376, "learning_rate": 8.253462454832402e-06, "loss": 0.6381, "step": 6547 }, { "epoch": 2.9622257407826282, "grad_norm": 0.3619207441806793, "learning_rate": 8.252906012471003e-06, "loss": 0.6457, "step": 6548 }, { "epoch": 2.962678127120561, "grad_norm": 0.38467347621917725, "learning_rate": 8.252349500247248e-06, "loss": 0.6357, "step": 6549 }, { "epoch": 2.9631305134584935, "grad_norm": 0.3917468190193176, "learning_rate": 8.251792918173086e-06, "loss": 0.6181, "step": 6550 }, { "epoch": 2.9635828997964264, "grad_norm": 0.33200955390930176, "learning_rate": 8.25123626626047e-06, "loss": 0.4966, "step": 6551 }, { "epoch": 2.964035286134359, "grad_norm": 0.33925923705101013, "learning_rate": 8.250679544521356e-06, "loss": 0.4395, "step": 6552 }, { "epoch": 2.9644876724722913, "grad_norm": 0.38064587116241455, "learning_rate": 8.250122752967702e-06, "loss": 0.5814, "step": 6553 }, { "epoch": 2.9649400588102237, "grad_norm": 0.36788293719291687, "learning_rate": 8.249565891611464e-06, "loss": 0.5115, "step": 6554 }, { "epoch": 2.9653924451481566, "grad_norm": 0.36234810948371887, "learning_rate": 8.249008960464605e-06, "loss": 0.4503, "step": 6555 }, { "epoch": 2.965844831486089, "grad_norm": 0.3676859140396118, "learning_rate": 8.24845195953908e-06, "loss": 0.5395, "step": 6556 }, { "epoch": 2.966297217824022, "grad_norm": 0.3638904392719269, "learning_rate": 8.24789488884686e-06, "loss": 0.4563, "step": 6557 }, { "epoch": 2.9667496041619543, "grad_norm": 0.39804038405418396, "learning_rate": 8.247337748399904e-06, "loss": 0.5464, "step": 6558 }, { "epoch": 2.9672019904998868, "grad_norm": 0.428582102060318, "learning_rate": 8.246780538210177e-06, "loss": 0.501, "step": 6559 }, { "epoch": 2.9676543768378196, "grad_norm": 0.3994693458080292, "learning_rate": 8.246223258289647e-06, "loss": 0.4874, "step": 6560 }, { "epoch": 2.968106763175752, "grad_norm": 0.46480679512023926, "learning_rate": 8.245665908650285e-06, "loss": 0.6314, "step": 6561 }, { "epoch": 2.968559149513685, "grad_norm": 0.40016090869903564, "learning_rate": 8.24510848930406e-06, "loss": 0.5058, "step": 6562 }, { "epoch": 2.9690115358516174, "grad_norm": 0.392776757478714, "learning_rate": 8.244551000262941e-06, "loss": 0.5174, "step": 6563 }, { "epoch": 2.96946392218955, "grad_norm": 0.4193871319293976, "learning_rate": 8.243993441538904e-06, "loss": 0.5183, "step": 6564 }, { "epoch": 2.9699163085274822, "grad_norm": 0.3810568153858185, "learning_rate": 8.243435813143922e-06, "loss": 0.4984, "step": 6565 }, { "epoch": 2.970368694865415, "grad_norm": 0.40685927867889404, "learning_rate": 8.242878115089973e-06, "loss": 0.4479, "step": 6566 }, { "epoch": 2.9708210812033475, "grad_norm": 0.5057392716407776, "learning_rate": 8.242320347389032e-06, "loss": 0.549, "step": 6567 }, { "epoch": 2.9712734675412804, "grad_norm": 0.4976295828819275, "learning_rate": 8.241762510053082e-06, "loss": 0.6197, "step": 6568 }, { "epoch": 2.971725853879213, "grad_norm": 0.4724542200565338, "learning_rate": 8.2412046030941e-06, "loss": 0.4537, "step": 6569 }, { "epoch": 2.9721782402171453, "grad_norm": 0.456170916557312, "learning_rate": 8.240646626524068e-06, "loss": 0.4727, "step": 6570 }, { "epoch": 2.972630626555078, "grad_norm": 0.5696062445640564, "learning_rate": 8.240088580354974e-06, "loss": 0.5332, "step": 6571 }, { "epoch": 2.9730830128930106, "grad_norm": 0.18399794399738312, "learning_rate": 8.2395304645988e-06, "loss": 1.181, "step": 6572 }, { "epoch": 2.9735353992309435, "grad_norm": 0.2586807906627655, "learning_rate": 8.23897227926753e-06, "loss": 0.699, "step": 6573 }, { "epoch": 2.973987785568876, "grad_norm": 0.28909751772880554, "learning_rate": 8.238414024373157e-06, "loss": 0.8334, "step": 6574 }, { "epoch": 2.9744401719068083, "grad_norm": 0.24647976458072662, "learning_rate": 8.237855699927668e-06, "loss": 0.5804, "step": 6575 }, { "epoch": 2.9748925582447407, "grad_norm": 0.24367104470729828, "learning_rate": 8.237297305943054e-06, "loss": 0.4695, "step": 6576 }, { "epoch": 2.9753449445826736, "grad_norm": 0.299054890871048, "learning_rate": 8.23673884243131e-06, "loss": 0.6077, "step": 6577 }, { "epoch": 2.975797330920606, "grad_norm": 0.27289867401123047, "learning_rate": 8.236180309404425e-06, "loss": 0.6171, "step": 6578 }, { "epoch": 2.976249717258539, "grad_norm": 0.32594239711761475, "learning_rate": 8.235621706874399e-06, "loss": 0.7083, "step": 6579 }, { "epoch": 2.9767021035964714, "grad_norm": 0.2929418981075287, "learning_rate": 8.235063034853228e-06, "loss": 0.626, "step": 6580 }, { "epoch": 2.977154489934404, "grad_norm": 0.31752094626426697, "learning_rate": 8.23450429335291e-06, "loss": 0.6297, "step": 6581 }, { "epoch": 2.9776068762723367, "grad_norm": 0.28722530603408813, "learning_rate": 8.233945482385445e-06, "loss": 0.5542, "step": 6582 }, { "epoch": 2.978059262610269, "grad_norm": 0.3119654655456543, "learning_rate": 8.233386601962837e-06, "loss": 0.6033, "step": 6583 }, { "epoch": 2.978511648948202, "grad_norm": 0.3192020356655121, "learning_rate": 8.232827652097087e-06, "loss": 0.6346, "step": 6584 }, { "epoch": 2.9789640352861344, "grad_norm": 0.35783812403678894, "learning_rate": 8.232268632800197e-06, "loss": 0.6609, "step": 6585 }, { "epoch": 2.979416421624067, "grad_norm": 0.3125256896018982, "learning_rate": 8.231709544084176e-06, "loss": 0.5814, "step": 6586 }, { "epoch": 2.9798688079619997, "grad_norm": 0.3423612117767334, "learning_rate": 8.231150385961032e-06, "loss": 0.7164, "step": 6587 }, { "epoch": 2.980321194299932, "grad_norm": 0.31167203187942505, "learning_rate": 8.23059115844277e-06, "loss": 0.5688, "step": 6588 }, { "epoch": 2.9807735806378646, "grad_norm": 0.32831916213035583, "learning_rate": 8.230031861541408e-06, "loss": 0.5329, "step": 6589 }, { "epoch": 2.9812259669757974, "grad_norm": 0.3482939898967743, "learning_rate": 8.229472495268951e-06, "loss": 0.5263, "step": 6590 }, { "epoch": 2.98167835331373, "grad_norm": 0.28338027000427246, "learning_rate": 8.228913059637414e-06, "loss": 0.4325, "step": 6591 }, { "epoch": 2.9821307396516623, "grad_norm": 0.32357627153396606, "learning_rate": 8.228353554658813e-06, "loss": 0.5373, "step": 6592 }, { "epoch": 2.982583125989595, "grad_norm": 0.3362346291542053, "learning_rate": 8.227793980345164e-06, "loss": 0.5665, "step": 6593 }, { "epoch": 2.9830355123275276, "grad_norm": 0.362664133310318, "learning_rate": 8.227234336708488e-06, "loss": 0.5782, "step": 6594 }, { "epoch": 2.9834878986654605, "grad_norm": 0.30467164516448975, "learning_rate": 8.2266746237608e-06, "loss": 0.4506, "step": 6595 }, { "epoch": 2.983940285003393, "grad_norm": 0.32177436351776123, "learning_rate": 8.22611484151412e-06, "loss": 0.502, "step": 6596 }, { "epoch": 2.9843926713413254, "grad_norm": 0.34794315695762634, "learning_rate": 8.225554989980476e-06, "loss": 0.5498, "step": 6597 }, { "epoch": 2.9848450576792582, "grad_norm": 0.35369357466697693, "learning_rate": 8.224995069171887e-06, "loss": 0.5038, "step": 6598 }, { "epoch": 2.9852974440171907, "grad_norm": 0.39413681626319885, "learning_rate": 8.22443507910038e-06, "loss": 0.654, "step": 6599 }, { "epoch": 2.9857498303551235, "grad_norm": 0.3572711646556854, "learning_rate": 8.223875019777983e-06, "loss": 0.6249, "step": 6600 }, { "epoch": 2.9857498303551235, "eval_loss": 0.5922526121139526, "eval_runtime": 25.7558, "eval_samples_per_second": 28.887, "eval_steps_per_second": 7.222, "step": 6600 }, { "epoch": 2.986202216693056, "grad_norm": 0.3788277804851532, "learning_rate": 8.223314891216721e-06, "loss": 0.5905, "step": 6601 }, { "epoch": 2.9866546030309884, "grad_norm": 0.4877707064151764, "learning_rate": 8.222754693428628e-06, "loss": 0.704, "step": 6602 }, { "epoch": 2.987106989368921, "grad_norm": 0.35480996966362, "learning_rate": 8.222194426425733e-06, "loss": 0.4601, "step": 6603 }, { "epoch": 2.9875593757068537, "grad_norm": 0.34091654419898987, "learning_rate": 8.22163409022007e-06, "loss": 0.483, "step": 6604 }, { "epoch": 2.988011762044786, "grad_norm": 0.37999263405799866, "learning_rate": 8.221073684823672e-06, "loss": 0.5996, "step": 6605 }, { "epoch": 2.988464148382719, "grad_norm": 0.4182780683040619, "learning_rate": 8.220513210248576e-06, "loss": 0.6674, "step": 6606 }, { "epoch": 2.9889165347206514, "grad_norm": 0.42228347063064575, "learning_rate": 8.219952666506815e-06, "loss": 0.5926, "step": 6607 }, { "epoch": 2.989368921058584, "grad_norm": 0.3738807737827301, "learning_rate": 8.219392053610436e-06, "loss": 0.5227, "step": 6608 }, { "epoch": 2.9898213073965167, "grad_norm": 0.3418598473072052, "learning_rate": 8.218831371571471e-06, "loss": 0.4228, "step": 6609 }, { "epoch": 2.990273693734449, "grad_norm": 0.3374108374118805, "learning_rate": 8.218270620401967e-06, "loss": 0.4191, "step": 6610 }, { "epoch": 2.990726080072382, "grad_norm": 0.38952526450157166, "learning_rate": 8.217709800113965e-06, "loss": 0.4606, "step": 6611 }, { "epoch": 2.9911784664103145, "grad_norm": 0.45494428277015686, "learning_rate": 8.21714891071951e-06, "loss": 0.573, "step": 6612 }, { "epoch": 2.991630852748247, "grad_norm": 0.4464728534221649, "learning_rate": 8.21658795223065e-06, "loss": 0.5806, "step": 6613 }, { "epoch": 2.9920832390861793, "grad_norm": 0.4161461591720581, "learning_rate": 8.216026924659427e-06, "loss": 0.5247, "step": 6614 }, { "epoch": 2.9925356254241122, "grad_norm": 0.5410524606704712, "learning_rate": 8.215465828017898e-06, "loss": 0.629, "step": 6615 }, { "epoch": 2.9929880117620447, "grad_norm": 0.4871745705604553, "learning_rate": 8.214904662318108e-06, "loss": 0.6077, "step": 6616 }, { "epoch": 2.9934403980999775, "grad_norm": 0.4114164710044861, "learning_rate": 8.214343427572111e-06, "loss": 0.4841, "step": 6617 }, { "epoch": 2.99389278443791, "grad_norm": 0.4583822190761566, "learning_rate": 8.21378212379196e-06, "loss": 0.5427, "step": 6618 }, { "epoch": 2.9943451707758424, "grad_norm": 0.4845244884490967, "learning_rate": 8.213220750989712e-06, "loss": 0.5361, "step": 6619 }, { "epoch": 2.9947975571137753, "grad_norm": 0.49081432819366455, "learning_rate": 8.21265930917742e-06, "loss": 0.5121, "step": 6620 }, { "epoch": 2.9952499434517077, "grad_norm": 0.6171821355819702, "learning_rate": 8.212097798367144e-06, "loss": 0.5537, "step": 6621 }, { "epoch": 2.9957023297896406, "grad_norm": 0.2231212556362152, "learning_rate": 8.211536218570946e-06, "loss": 0.9592, "step": 6622 }, { "epoch": 2.996154716127573, "grad_norm": 0.30324411392211914, "learning_rate": 8.210974569800882e-06, "loss": 0.5391, "step": 6623 }, { "epoch": 2.9966071024655054, "grad_norm": 0.32468071579933167, "learning_rate": 8.210412852069019e-06, "loss": 0.584, "step": 6624 }, { "epoch": 2.997059488803438, "grad_norm": 0.3340408205986023, "learning_rate": 8.209851065387418e-06, "loss": 0.6377, "step": 6625 }, { "epoch": 2.9975118751413707, "grad_norm": 0.32492101192474365, "learning_rate": 8.209289209768146e-06, "loss": 0.4719, "step": 6626 }, { "epoch": 2.997964261479303, "grad_norm": 0.3468219041824341, "learning_rate": 8.208727285223268e-06, "loss": 0.4882, "step": 6627 }, { "epoch": 2.998416647817236, "grad_norm": 0.4163586497306824, "learning_rate": 8.208165291764856e-06, "loss": 0.5594, "step": 6628 }, { "epoch": 2.9988690341551685, "grad_norm": 0.3860779404640198, "learning_rate": 8.207603229404975e-06, "loss": 0.4713, "step": 6629 }, { "epoch": 2.999321420493101, "grad_norm": 0.41197606921195984, "learning_rate": 8.207041098155701e-06, "loss": 0.4733, "step": 6630 }, { "epoch": 2.999773806831034, "grad_norm": 0.4274275600910187, "learning_rate": 8.206478898029103e-06, "loss": 0.4284, "step": 6631 }, { "epoch": 3.000226193168966, "grad_norm": 1.6075420379638672, "learning_rate": 8.205916629037259e-06, "loss": 1.1946, "step": 6632 }, { "epoch": 3.000678579506899, "grad_norm": 0.2192709594964981, "learning_rate": 8.205354291192241e-06, "loss": 1.0011, "step": 6633 }, { "epoch": 3.0011309658448315, "grad_norm": 0.21643507480621338, "learning_rate": 8.20479188450613e-06, "loss": 0.6116, "step": 6634 }, { "epoch": 3.001583352182764, "grad_norm": 0.2840622067451477, "learning_rate": 8.204229408991002e-06, "loss": 0.7516, "step": 6635 }, { "epoch": 3.002035738520697, "grad_norm": 0.28083473443984985, "learning_rate": 8.203666864658938e-06, "loss": 0.6994, "step": 6636 }, { "epoch": 3.0024881248586293, "grad_norm": 0.2628157138824463, "learning_rate": 8.203104251522021e-06, "loss": 0.5335, "step": 6637 }, { "epoch": 3.0029405111965617, "grad_norm": 0.2636222839355469, "learning_rate": 8.202541569592332e-06, "loss": 0.5198, "step": 6638 }, { "epoch": 3.0033928975344946, "grad_norm": 0.2968021631240845, "learning_rate": 8.20197881888196e-06, "loss": 0.6741, "step": 6639 }, { "epoch": 3.003845283872427, "grad_norm": 0.2920144200325012, "learning_rate": 8.201415999402984e-06, "loss": 0.6635, "step": 6640 }, { "epoch": 3.0042976702103594, "grad_norm": 0.29476481676101685, "learning_rate": 8.200853111167497e-06, "loss": 0.6652, "step": 6641 }, { "epoch": 3.0047500565482923, "grad_norm": 0.2813597023487091, "learning_rate": 8.200290154187587e-06, "loss": 0.5471, "step": 6642 }, { "epoch": 3.0052024428862247, "grad_norm": 0.3217180669307709, "learning_rate": 8.199727128475343e-06, "loss": 0.6155, "step": 6643 }, { "epoch": 3.0056548292241576, "grad_norm": 0.3012827932834625, "learning_rate": 8.19916403404286e-06, "loss": 0.4581, "step": 6644 }, { "epoch": 3.00610721556209, "grad_norm": 0.2869637906551361, "learning_rate": 8.198600870902229e-06, "loss": 0.5756, "step": 6645 }, { "epoch": 3.0065596019000225, "grad_norm": 0.2953350245952606, "learning_rate": 8.198037639065545e-06, "loss": 0.5186, "step": 6646 }, { "epoch": 3.0070119882379553, "grad_norm": 0.3485586643218994, "learning_rate": 8.197474338544905e-06, "loss": 0.5465, "step": 6647 }, { "epoch": 3.0074643745758878, "grad_norm": 0.3706350326538086, "learning_rate": 8.196910969352408e-06, "loss": 0.6586, "step": 6648 }, { "epoch": 3.00791676091382, "grad_norm": 0.32077476382255554, "learning_rate": 8.196347531500152e-06, "loss": 0.6295, "step": 6649 }, { "epoch": 3.008369147251753, "grad_norm": 0.327134907245636, "learning_rate": 8.19578402500024e-06, "loss": 0.506, "step": 6650 }, { "epoch": 3.0088215335896855, "grad_norm": 0.32740214467048645, "learning_rate": 8.195220449864774e-06, "loss": 0.5452, "step": 6651 }, { "epoch": 3.0092739199276184, "grad_norm": 0.36038798093795776, "learning_rate": 8.194656806105854e-06, "loss": 0.5512, "step": 6652 }, { "epoch": 3.009726306265551, "grad_norm": 0.3592658042907715, "learning_rate": 8.194093093735589e-06, "loss": 0.601, "step": 6653 }, { "epoch": 3.0101786926034833, "grad_norm": 0.31969356536865234, "learning_rate": 8.193529312766085e-06, "loss": 0.5094, "step": 6654 }, { "epoch": 3.010631078941416, "grad_norm": 0.32526567578315735, "learning_rate": 8.19296546320945e-06, "loss": 0.4204, "step": 6655 }, { "epoch": 3.0110834652793486, "grad_norm": 0.3333887457847595, "learning_rate": 8.192401545077795e-06, "loss": 0.567, "step": 6656 }, { "epoch": 3.011535851617281, "grad_norm": 0.3905881941318512, "learning_rate": 8.19183755838323e-06, "loss": 0.6627, "step": 6657 }, { "epoch": 3.011988237955214, "grad_norm": 0.32509347796440125, "learning_rate": 8.191273503137867e-06, "loss": 0.4709, "step": 6658 }, { "epoch": 3.0124406242931463, "grad_norm": 0.33577004075050354, "learning_rate": 8.190709379353821e-06, "loss": 0.4446, "step": 6659 }, { "epoch": 3.0128930106310787, "grad_norm": 0.3461117744445801, "learning_rate": 8.190145187043207e-06, "loss": 0.5013, "step": 6660 }, { "epoch": 3.0133453969690116, "grad_norm": 0.34722188115119934, "learning_rate": 8.189580926218145e-06, "loss": 0.4498, "step": 6661 }, { "epoch": 3.013797783306944, "grad_norm": 0.3781244158744812, "learning_rate": 8.18901659689075e-06, "loss": 0.6001, "step": 6662 }, { "epoch": 3.014250169644877, "grad_norm": 0.3759401738643646, "learning_rate": 8.188452199073143e-06, "loss": 0.522, "step": 6663 }, { "epoch": 3.0147025559828093, "grad_norm": 0.36077651381492615, "learning_rate": 8.187887732777447e-06, "loss": 0.524, "step": 6664 }, { "epoch": 3.0151549423207418, "grad_norm": 0.3976369798183441, "learning_rate": 8.187323198015781e-06, "loss": 0.4823, "step": 6665 }, { "epoch": 3.0156073286586746, "grad_norm": 0.3800893723964691, "learning_rate": 8.186758594800276e-06, "loss": 0.6265, "step": 6666 }, { "epoch": 3.016059714996607, "grad_norm": 0.39209842681884766, "learning_rate": 8.186193923143053e-06, "loss": 0.48, "step": 6667 }, { "epoch": 3.0165121013345395, "grad_norm": 0.45583102107048035, "learning_rate": 8.18562918305624e-06, "loss": 0.552, "step": 6668 }, { "epoch": 3.0169644876724724, "grad_norm": 0.3828054666519165, "learning_rate": 8.185064374551966e-06, "loss": 0.4462, "step": 6669 }, { "epoch": 3.017416874010405, "grad_norm": 0.4164743423461914, "learning_rate": 8.184499497642363e-06, "loss": 0.492, "step": 6670 }, { "epoch": 3.0178692603483377, "grad_norm": 0.42672199010849, "learning_rate": 8.18393455233956e-06, "loss": 0.4905, "step": 6671 }, { "epoch": 3.01832164668627, "grad_norm": 0.409456729888916, "learning_rate": 8.183369538655694e-06, "loss": 0.4919, "step": 6672 }, { "epoch": 3.0187740330242026, "grad_norm": 0.4236372113227844, "learning_rate": 8.182804456602897e-06, "loss": 0.4908, "step": 6673 }, { "epoch": 3.0192264193621354, "grad_norm": 0.4421939551830292, "learning_rate": 8.182239306193308e-06, "loss": 0.5145, "step": 6674 }, { "epoch": 3.019678805700068, "grad_norm": 0.4206116795539856, "learning_rate": 8.18167408743906e-06, "loss": 0.5055, "step": 6675 }, { "epoch": 3.0201311920380003, "grad_norm": 0.4531896412372589, "learning_rate": 8.181108800352295e-06, "loss": 0.5687, "step": 6676 }, { "epoch": 3.020583578375933, "grad_norm": 0.4158667027950287, "learning_rate": 8.180543444945154e-06, "loss": 0.4526, "step": 6677 }, { "epoch": 3.0210359647138656, "grad_norm": 0.49377691745758057, "learning_rate": 8.179978021229777e-06, "loss": 0.5157, "step": 6678 }, { "epoch": 3.021488351051798, "grad_norm": 0.4762725234031677, "learning_rate": 8.179412529218311e-06, "loss": 0.4993, "step": 6679 }, { "epoch": 3.021940737389731, "grad_norm": 0.5078946352005005, "learning_rate": 8.178846968922897e-06, "loss": 0.6109, "step": 6680 }, { "epoch": 3.0223931237276633, "grad_norm": 0.6239053010940552, "learning_rate": 8.178281340355686e-06, "loss": 0.6133, "step": 6681 }, { "epoch": 3.022845510065596, "grad_norm": 0.39838993549346924, "learning_rate": 8.177715643528822e-06, "loss": 1.0474, "step": 6682 }, { "epoch": 3.0232978964035286, "grad_norm": 0.192171111702919, "learning_rate": 8.177149878454455e-06, "loss": 0.8491, "step": 6683 }, { "epoch": 3.023750282741461, "grad_norm": 0.2017846256494522, "learning_rate": 8.176584045144739e-06, "loss": 0.552, "step": 6684 }, { "epoch": 3.024202669079394, "grad_norm": 0.24217641353607178, "learning_rate": 8.176018143611824e-06, "loss": 0.633, "step": 6685 }, { "epoch": 3.0246550554173264, "grad_norm": 0.2568002939224243, "learning_rate": 8.175452173867861e-06, "loss": 0.6144, "step": 6686 }, { "epoch": 3.025107441755259, "grad_norm": 0.2598837912082672, "learning_rate": 8.17488613592501e-06, "loss": 0.6144, "step": 6687 }, { "epoch": 3.0255598280931917, "grad_norm": 0.2959272265434265, "learning_rate": 8.174320029795426e-06, "loss": 0.6906, "step": 6688 }, { "epoch": 3.026012214431124, "grad_norm": 0.25840866565704346, "learning_rate": 8.173753855491269e-06, "loss": 0.5954, "step": 6689 }, { "epoch": 3.026464600769057, "grad_norm": 0.29833945631980896, "learning_rate": 8.173187613024694e-06, "loss": 0.6729, "step": 6690 }, { "epoch": 3.0269169871069894, "grad_norm": 0.3040110766887665, "learning_rate": 8.172621302407867e-06, "loss": 0.6081, "step": 6691 }, { "epoch": 3.027369373444922, "grad_norm": 0.3349413275718689, "learning_rate": 8.172054923652947e-06, "loss": 0.619, "step": 6692 }, { "epoch": 3.0278217597828547, "grad_norm": 0.3330981135368347, "learning_rate": 8.1714884767721e-06, "loss": 0.6271, "step": 6693 }, { "epoch": 3.028274146120787, "grad_norm": 0.3134107291698456, "learning_rate": 8.170921961777491e-06, "loss": 0.574, "step": 6694 }, { "epoch": 3.0287265324587196, "grad_norm": 0.3294215202331543, "learning_rate": 8.170355378681288e-06, "loss": 0.6287, "step": 6695 }, { "epoch": 3.0291789187966525, "grad_norm": 0.2764366567134857, "learning_rate": 8.169788727495658e-06, "loss": 0.4561, "step": 6696 }, { "epoch": 3.029631305134585, "grad_norm": 0.3674245774745941, "learning_rate": 8.16922200823277e-06, "loss": 0.6373, "step": 6697 }, { "epoch": 3.0300836914725173, "grad_norm": 0.3086090683937073, "learning_rate": 8.1686552209048e-06, "loss": 0.5251, "step": 6698 }, { "epoch": 3.03053607781045, "grad_norm": 0.33876273036003113, "learning_rate": 8.168088365523917e-06, "loss": 0.6363, "step": 6699 }, { "epoch": 3.0309884641483826, "grad_norm": 0.31736937165260315, "learning_rate": 8.167521442102296e-06, "loss": 0.5104, "step": 6700 }, { "epoch": 3.0314408504863155, "grad_norm": 0.31880220770835876, "learning_rate": 8.166954450652112e-06, "loss": 0.4916, "step": 6701 }, { "epoch": 3.031893236824248, "grad_norm": 0.31220853328704834, "learning_rate": 8.166387391185543e-06, "loss": 0.5049, "step": 6702 }, { "epoch": 3.0323456231621804, "grad_norm": 0.38079699873924255, "learning_rate": 8.165820263714767e-06, "loss": 0.667, "step": 6703 }, { "epoch": 3.0327980095001132, "grad_norm": 0.35596829652786255, "learning_rate": 8.165253068251966e-06, "loss": 0.6747, "step": 6704 }, { "epoch": 3.0332503958380457, "grad_norm": 0.34778493642807007, "learning_rate": 8.164685804809321e-06, "loss": 0.5344, "step": 6705 }, { "epoch": 3.033702782175978, "grad_norm": 0.3669811487197876, "learning_rate": 8.164118473399013e-06, "loss": 0.5759, "step": 6706 }, { "epoch": 3.034155168513911, "grad_norm": 0.3247598707675934, "learning_rate": 8.16355107403323e-06, "loss": 0.527, "step": 6707 }, { "epoch": 3.0346075548518434, "grad_norm": 0.3741704225540161, "learning_rate": 8.162983606724154e-06, "loss": 0.5461, "step": 6708 }, { "epoch": 3.0350599411897763, "grad_norm": 0.33467206358909607, "learning_rate": 8.162416071483974e-06, "loss": 0.535, "step": 6709 }, { "epoch": 3.0355123275277087, "grad_norm": 0.3402162194252014, "learning_rate": 8.16184846832488e-06, "loss": 0.5366, "step": 6710 }, { "epoch": 3.035964713865641, "grad_norm": 0.34793946146965027, "learning_rate": 8.161280797259063e-06, "loss": 0.5051, "step": 6711 }, { "epoch": 3.036417100203574, "grad_norm": 0.3517147898674011, "learning_rate": 8.160713058298713e-06, "loss": 0.4934, "step": 6712 }, { "epoch": 3.0368694865415065, "grad_norm": 0.35389775037765503, "learning_rate": 8.160145251456025e-06, "loss": 0.5032, "step": 6713 }, { "epoch": 3.037321872879439, "grad_norm": 0.5155638456344604, "learning_rate": 8.15957737674319e-06, "loss": 0.6426, "step": 6714 }, { "epoch": 3.0377742592173718, "grad_norm": 0.37746214866638184, "learning_rate": 8.159009434172406e-06, "loss": 0.4853, "step": 6715 }, { "epoch": 3.038226645555304, "grad_norm": 0.3766145408153534, "learning_rate": 8.158441423755874e-06, "loss": 0.6009, "step": 6716 }, { "epoch": 3.0386790318932366, "grad_norm": 0.3830404281616211, "learning_rate": 8.157873345505791e-06, "loss": 0.4694, "step": 6717 }, { "epoch": 3.0391314182311695, "grad_norm": 0.38858547806739807, "learning_rate": 8.157305199434356e-06, "loss": 0.5334, "step": 6718 }, { "epoch": 3.039583804569102, "grad_norm": 0.3670560121536255, "learning_rate": 8.15673698555377e-06, "loss": 0.428, "step": 6719 }, { "epoch": 3.040036190907035, "grad_norm": 0.3969941735267639, "learning_rate": 8.156168703876242e-06, "loss": 0.5389, "step": 6720 }, { "epoch": 3.0404885772449672, "grad_norm": 0.42988651990890503, "learning_rate": 8.155600354413971e-06, "loss": 0.5231, "step": 6721 }, { "epoch": 3.0409409635828997, "grad_norm": 0.37679746747016907, "learning_rate": 8.155031937179168e-06, "loss": 0.386, "step": 6722 }, { "epoch": 3.0413933499208325, "grad_norm": 0.4185118079185486, "learning_rate": 8.154463452184038e-06, "loss": 0.4911, "step": 6723 }, { "epoch": 3.041845736258765, "grad_norm": 0.4372599124908447, "learning_rate": 8.15389489944079e-06, "loss": 0.5174, "step": 6724 }, { "epoch": 3.0422981225966974, "grad_norm": 0.4358530342578888, "learning_rate": 8.153326278961636e-06, "loss": 0.5149, "step": 6725 }, { "epoch": 3.0427505089346303, "grad_norm": 0.43394026160240173, "learning_rate": 8.152757590758789e-06, "loss": 0.4176, "step": 6726 }, { "epoch": 3.0432028952725627, "grad_norm": 0.4614505171775818, "learning_rate": 8.152188834844462e-06, "loss": 0.5136, "step": 6727 }, { "epoch": 3.043655281610495, "grad_norm": 0.5113537311553955, "learning_rate": 8.151620011230867e-06, "loss": 0.5404, "step": 6728 }, { "epoch": 3.044107667948428, "grad_norm": 0.4802349805831909, "learning_rate": 8.151051119930226e-06, "loss": 0.4779, "step": 6729 }, { "epoch": 3.0445600542863605, "grad_norm": 0.5031489133834839, "learning_rate": 8.150482160954754e-06, "loss": 0.5108, "step": 6730 }, { "epoch": 3.0450124406242933, "grad_norm": 0.571185827255249, "learning_rate": 8.14991313431667e-06, "loss": 0.4914, "step": 6731 }, { "epoch": 3.0454648269622258, "grad_norm": 0.4650340974330902, "learning_rate": 8.149344040028196e-06, "loss": 1.0064, "step": 6732 }, { "epoch": 3.045917213300158, "grad_norm": 0.23221848905086517, "learning_rate": 8.148774878101555e-06, "loss": 0.7344, "step": 6733 }, { "epoch": 3.046369599638091, "grad_norm": 0.24376578629016876, "learning_rate": 8.148205648548968e-06, "loss": 0.4807, "step": 6734 }, { "epoch": 3.0468219859760235, "grad_norm": 0.2446979135274887, "learning_rate": 8.147636351382663e-06, "loss": 0.6297, "step": 6735 }, { "epoch": 3.047274372313956, "grad_norm": 0.25164905190467834, "learning_rate": 8.147066986614867e-06, "loss": 0.4757, "step": 6736 }, { "epoch": 3.047726758651889, "grad_norm": 0.29900044202804565, "learning_rate": 8.146497554257807e-06, "loss": 0.6737, "step": 6737 }, { "epoch": 3.0481791449898212, "grad_norm": 0.27857252955436707, "learning_rate": 8.145928054323714e-06, "loss": 0.619, "step": 6738 }, { "epoch": 3.048631531327754, "grad_norm": 0.2973421812057495, "learning_rate": 8.145358486824816e-06, "loss": 0.6265, "step": 6739 }, { "epoch": 3.0490839176656865, "grad_norm": 0.292044073343277, "learning_rate": 8.14478885177335e-06, "loss": 0.6865, "step": 6740 }, { "epoch": 3.049536304003619, "grad_norm": 0.29792526364326477, "learning_rate": 8.144219149181544e-06, "loss": 0.6312, "step": 6741 }, { "epoch": 3.049988690341552, "grad_norm": 0.34347227215766907, "learning_rate": 8.143649379061639e-06, "loss": 0.6973, "step": 6742 }, { "epoch": 3.0504410766794843, "grad_norm": 0.32506445050239563, "learning_rate": 8.14307954142587e-06, "loss": 0.7334, "step": 6743 }, { "epoch": 3.0508934630174167, "grad_norm": 0.31515151262283325, "learning_rate": 8.142509636286473e-06, "loss": 0.5973, "step": 6744 }, { "epoch": 3.0513458493553496, "grad_norm": 0.3151567280292511, "learning_rate": 8.141939663655692e-06, "loss": 0.5652, "step": 6745 }, { "epoch": 3.051798235693282, "grad_norm": 0.3202228546142578, "learning_rate": 8.141369623545765e-06, "loss": 0.6187, "step": 6746 }, { "epoch": 3.0522506220312144, "grad_norm": 0.3109385073184967, "learning_rate": 8.14079951596894e-06, "loss": 0.4816, "step": 6747 }, { "epoch": 3.0527030083691473, "grad_norm": 0.31806954741477966, "learning_rate": 8.140229340937453e-06, "loss": 0.515, "step": 6748 }, { "epoch": 3.0531553947070798, "grad_norm": 0.38942596316337585, "learning_rate": 8.139659098463556e-06, "loss": 0.6663, "step": 6749 }, { "epoch": 3.0536077810450126, "grad_norm": 0.3525657653808594, "learning_rate": 8.13908878855949e-06, "loss": 0.5547, "step": 6750 }, { "epoch": 3.054060167382945, "grad_norm": 0.36437031626701355, "learning_rate": 8.13851841123751e-06, "loss": 0.6137, "step": 6751 }, { "epoch": 3.0545125537208775, "grad_norm": 0.34379348158836365, "learning_rate": 8.137947966509864e-06, "loss": 0.5653, "step": 6752 }, { "epoch": 3.0549649400588104, "grad_norm": 0.3465386629104614, "learning_rate": 8.137377454388802e-06, "loss": 0.5715, "step": 6753 }, { "epoch": 3.055417326396743, "grad_norm": 0.38035881519317627, "learning_rate": 8.136806874886577e-06, "loss": 0.6124, "step": 6754 }, { "epoch": 3.0558697127346752, "grad_norm": 0.340809166431427, "learning_rate": 8.136236228015444e-06, "loss": 0.5126, "step": 6755 }, { "epoch": 3.056322099072608, "grad_norm": 0.3682800829410553, "learning_rate": 8.135665513787656e-06, "loss": 0.5866, "step": 6756 }, { "epoch": 3.0567744854105405, "grad_norm": 0.32496726512908936, "learning_rate": 8.135094732215476e-06, "loss": 0.4964, "step": 6757 }, { "epoch": 3.0572268717484734, "grad_norm": 0.3313734233379364, "learning_rate": 8.134523883311158e-06, "loss": 0.4778, "step": 6758 }, { "epoch": 3.057679258086406, "grad_norm": 0.34502604603767395, "learning_rate": 8.133952967086963e-06, "loss": 0.5197, "step": 6759 }, { "epoch": 3.0581316444243383, "grad_norm": 0.3789712190628052, "learning_rate": 8.133381983555153e-06, "loss": 0.5261, "step": 6760 }, { "epoch": 3.058584030762271, "grad_norm": 0.380206823348999, "learning_rate": 8.13281093272799e-06, "loss": 0.5729, "step": 6761 }, { "epoch": 3.0590364171002036, "grad_norm": 0.3700765073299408, "learning_rate": 8.13223981461774e-06, "loss": 0.5336, "step": 6762 }, { "epoch": 3.059488803438136, "grad_norm": 0.3828321695327759, "learning_rate": 8.13166862923667e-06, "loss": 0.5692, "step": 6763 }, { "epoch": 3.059941189776069, "grad_norm": 0.39243537187576294, "learning_rate": 8.131097376597041e-06, "loss": 0.5312, "step": 6764 }, { "epoch": 3.0603935761140013, "grad_norm": 0.42924362421035767, "learning_rate": 8.130526056711128e-06, "loss": 0.554, "step": 6765 }, { "epoch": 3.0608459624519337, "grad_norm": 0.4019638001918793, "learning_rate": 8.129954669591201e-06, "loss": 0.547, "step": 6766 }, { "epoch": 3.0612983487898666, "grad_norm": 0.3562328517436981, "learning_rate": 8.129383215249526e-06, "loss": 0.4847, "step": 6767 }, { "epoch": 3.061750735127799, "grad_norm": 0.4119063913822174, "learning_rate": 8.128811693698384e-06, "loss": 0.5871, "step": 6768 }, { "epoch": 3.062203121465732, "grad_norm": 0.44470730423927307, "learning_rate": 8.128240104950044e-06, "loss": 0.6289, "step": 6769 }, { "epoch": 3.0626555078036644, "grad_norm": 0.41833317279815674, "learning_rate": 8.127668449016783e-06, "loss": 0.4903, "step": 6770 }, { "epoch": 3.063107894141597, "grad_norm": 0.3813905715942383, "learning_rate": 8.12709672591088e-06, "loss": 0.4875, "step": 6771 }, { "epoch": 3.0635602804795297, "grad_norm": 0.4346943497657776, "learning_rate": 8.126524935644611e-06, "loss": 0.5653, "step": 6772 }, { "epoch": 3.064012666817462, "grad_norm": 0.4248143136501312, "learning_rate": 8.125953078230258e-06, "loss": 0.5034, "step": 6773 }, { "epoch": 3.0644650531553945, "grad_norm": 0.44171783328056335, "learning_rate": 8.125381153680103e-06, "loss": 0.5108, "step": 6774 }, { "epoch": 3.0649174394933274, "grad_norm": 0.42741525173187256, "learning_rate": 8.124809162006429e-06, "loss": 0.5214, "step": 6775 }, { "epoch": 3.06536982583126, "grad_norm": 0.41437292098999023, "learning_rate": 8.124237103221519e-06, "loss": 0.4614, "step": 6776 }, { "epoch": 3.0658222121691923, "grad_norm": 0.5044060349464417, "learning_rate": 8.123664977337663e-06, "loss": 0.7045, "step": 6777 }, { "epoch": 3.066274598507125, "grad_norm": 0.4577660858631134, "learning_rate": 8.123092784367144e-06, "loss": 0.5202, "step": 6778 }, { "epoch": 3.0667269848450576, "grad_norm": 0.5113386511802673, "learning_rate": 8.122520524322255e-06, "loss": 0.5568, "step": 6779 }, { "epoch": 3.0671793711829904, "grad_norm": 0.45200324058532715, "learning_rate": 8.121948197215283e-06, "loss": 0.41, "step": 6780 }, { "epoch": 3.067631757520923, "grad_norm": 0.5096612572669983, "learning_rate": 8.121375803058522e-06, "loss": 0.5473, "step": 6781 }, { "epoch": 3.0680841438588553, "grad_norm": 0.36396080255508423, "learning_rate": 8.120803341864263e-06, "loss": 0.8381, "step": 6782 }, { "epoch": 3.068536530196788, "grad_norm": 0.2137652337551117, "learning_rate": 8.120230813644803e-06, "loss": 1.2906, "step": 6783 }, { "epoch": 3.0689889165347206, "grad_norm": 0.18168070912361145, "learning_rate": 8.119658218412436e-06, "loss": 0.5995, "step": 6784 }, { "epoch": 3.069441302872653, "grad_norm": 0.2611026465892792, "learning_rate": 8.119085556179462e-06, "loss": 0.7116, "step": 6785 }, { "epoch": 3.069893689210586, "grad_norm": 0.27018287777900696, "learning_rate": 8.11851282695818e-06, "loss": 0.6608, "step": 6786 }, { "epoch": 3.0703460755485183, "grad_norm": 0.27952441573143005, "learning_rate": 8.117940030760888e-06, "loss": 0.5822, "step": 6787 }, { "epoch": 3.0707984618864512, "grad_norm": 0.2769932746887207, "learning_rate": 8.117367167599888e-06, "loss": 0.5456, "step": 6788 }, { "epoch": 3.0712508482243837, "grad_norm": 0.2718324363231659, "learning_rate": 8.116794237487485e-06, "loss": 0.6247, "step": 6789 }, { "epoch": 3.071703234562316, "grad_norm": 0.2861984074115753, "learning_rate": 8.116221240435983e-06, "loss": 0.5976, "step": 6790 }, { "epoch": 3.072155620900249, "grad_norm": 0.3407355844974518, "learning_rate": 8.115648176457688e-06, "loss": 0.6349, "step": 6791 }, { "epoch": 3.0726080072381814, "grad_norm": 0.3580301105976105, "learning_rate": 8.11507504556491e-06, "loss": 0.6681, "step": 6792 }, { "epoch": 3.073060393576114, "grad_norm": 0.3414112627506256, "learning_rate": 8.114501847769955e-06, "loss": 0.6565, "step": 6793 }, { "epoch": 3.0735127799140467, "grad_norm": 0.31688475608825684, "learning_rate": 8.113928583085135e-06, "loss": 0.5807, "step": 6794 }, { "epoch": 3.073965166251979, "grad_norm": 0.3390893042087555, "learning_rate": 8.113355251522764e-06, "loss": 0.5424, "step": 6795 }, { "epoch": 3.074417552589912, "grad_norm": 0.3273243010044098, "learning_rate": 8.112781853095148e-06, "loss": 0.5039, "step": 6796 }, { "epoch": 3.0748699389278444, "grad_norm": 0.3532792925834656, "learning_rate": 8.112208387814611e-06, "loss": 0.5883, "step": 6797 }, { "epoch": 3.075322325265777, "grad_norm": 0.3340378999710083, "learning_rate": 8.111634855693463e-06, "loss": 0.4731, "step": 6798 }, { "epoch": 3.0757747116037097, "grad_norm": 0.3785868287086487, "learning_rate": 8.111061256744024e-06, "loss": 0.5634, "step": 6799 }, { "epoch": 3.076227097941642, "grad_norm": 0.3449472486972809, "learning_rate": 8.110487590978613e-06, "loss": 0.5499, "step": 6800 }, { "epoch": 3.076227097941642, "eval_loss": 0.5931933522224426, "eval_runtime": 25.2683, "eval_samples_per_second": 29.444, "eval_steps_per_second": 7.361, "step": 6800 }, { "epoch": 3.0766794842795746, "grad_norm": 0.32121574878692627, "learning_rate": 8.10991385840955e-06, "loss": 0.5042, "step": 6801 }, { "epoch": 3.0771318706175075, "grad_norm": 0.3340110182762146, "learning_rate": 8.10934005904916e-06, "loss": 0.551, "step": 6802 }, { "epoch": 3.07758425695544, "grad_norm": 0.34923186898231506, "learning_rate": 8.108766192909761e-06, "loss": 0.5358, "step": 6803 }, { "epoch": 3.0780366432933723, "grad_norm": 0.34324079751968384, "learning_rate": 8.108192260003683e-06, "loss": 0.495, "step": 6804 }, { "epoch": 3.078489029631305, "grad_norm": 0.35035088658332825, "learning_rate": 8.107618260343248e-06, "loss": 0.5986, "step": 6805 }, { "epoch": 3.0789414159692376, "grad_norm": 0.3883579969406128, "learning_rate": 8.107044193940788e-06, "loss": 0.6014, "step": 6806 }, { "epoch": 3.0793938023071705, "grad_norm": 0.408519983291626, "learning_rate": 8.106470060808628e-06, "loss": 0.6194, "step": 6807 }, { "epoch": 3.079846188645103, "grad_norm": 0.32977113127708435, "learning_rate": 8.105895860959102e-06, "loss": 0.4455, "step": 6808 }, { "epoch": 3.0802985749830354, "grad_norm": 0.3631770610809326, "learning_rate": 8.10532159440454e-06, "loss": 0.4918, "step": 6809 }, { "epoch": 3.0807509613209683, "grad_norm": 0.3539304733276367, "learning_rate": 8.104747261157277e-06, "loss": 0.5514, "step": 6810 }, { "epoch": 3.0812033476589007, "grad_norm": 0.3329235911369324, "learning_rate": 8.104172861229646e-06, "loss": 0.4204, "step": 6811 }, { "epoch": 3.081655733996833, "grad_norm": 0.3776697814464569, "learning_rate": 8.103598394633984e-06, "loss": 0.5299, "step": 6812 }, { "epoch": 3.082108120334766, "grad_norm": 0.4275754988193512, "learning_rate": 8.103023861382629e-06, "loss": 0.6454, "step": 6813 }, { "epoch": 3.0825605066726984, "grad_norm": 0.4430869221687317, "learning_rate": 8.102449261487922e-06, "loss": 0.5956, "step": 6814 }, { "epoch": 3.083012893010631, "grad_norm": 0.42302581667900085, "learning_rate": 8.1018745949622e-06, "loss": 0.6127, "step": 6815 }, { "epoch": 3.0834652793485637, "grad_norm": 0.37497004866600037, "learning_rate": 8.101299861817808e-06, "loss": 0.4753, "step": 6816 }, { "epoch": 3.083917665686496, "grad_norm": 0.4601093530654907, "learning_rate": 8.10072506206709e-06, "loss": 0.6036, "step": 6817 }, { "epoch": 3.084370052024429, "grad_norm": 0.4211452603340149, "learning_rate": 8.100150195722388e-06, "loss": 0.5472, "step": 6818 }, { "epoch": 3.0848224383623615, "grad_norm": 0.45734214782714844, "learning_rate": 8.099575262796049e-06, "loss": 0.5831, "step": 6819 }, { "epoch": 3.085274824700294, "grad_norm": 0.38912686705589294, "learning_rate": 8.099000263300424e-06, "loss": 0.5, "step": 6820 }, { "epoch": 3.085727211038227, "grad_norm": 0.4258025288581848, "learning_rate": 8.098425197247856e-06, "loss": 0.5164, "step": 6821 }, { "epoch": 3.086179597376159, "grad_norm": 0.43958672881126404, "learning_rate": 8.0978500646507e-06, "loss": 0.5355, "step": 6822 }, { "epoch": 3.0866319837140916, "grad_norm": 0.4365125596523285, "learning_rate": 8.09727486552131e-06, "loss": 0.5118, "step": 6823 }, { "epoch": 3.0870843700520245, "grad_norm": 0.4000256359577179, "learning_rate": 8.096699599872036e-06, "loss": 0.4432, "step": 6824 }, { "epoch": 3.087536756389957, "grad_norm": 0.4863925278186798, "learning_rate": 8.096124267715233e-06, "loss": 0.6545, "step": 6825 }, { "epoch": 3.08798914272789, "grad_norm": 0.4171436131000519, "learning_rate": 8.095548869063258e-06, "loss": 0.4959, "step": 6826 }, { "epoch": 3.0884415290658223, "grad_norm": 0.43550604581832886, "learning_rate": 8.094973403928472e-06, "loss": 0.4688, "step": 6827 }, { "epoch": 3.0888939154037547, "grad_norm": 0.47028788924217224, "learning_rate": 8.094397872323227e-06, "loss": 0.4612, "step": 6828 }, { "epoch": 3.0893463017416876, "grad_norm": 0.49634990096092224, "learning_rate": 8.09382227425989e-06, "loss": 0.4887, "step": 6829 }, { "epoch": 3.08979868807962, "grad_norm": 0.5486912131309509, "learning_rate": 8.093246609750823e-06, "loss": 0.5726, "step": 6830 }, { "epoch": 3.0902510744175524, "grad_norm": 0.5328325033187866, "learning_rate": 8.092670878808385e-06, "loss": 0.5327, "step": 6831 }, { "epoch": 3.0907034607554853, "grad_norm": 0.38543179631233215, "learning_rate": 8.092095081444943e-06, "loss": 1.1942, "step": 6832 }, { "epoch": 3.0911558470934177, "grad_norm": 0.16348548233509064, "learning_rate": 8.091519217672865e-06, "loss": 0.4354, "step": 6833 }, { "epoch": 3.09160823343135, "grad_norm": 0.2630755603313446, "learning_rate": 8.090943287504516e-06, "loss": 0.6986, "step": 6834 }, { "epoch": 3.092060619769283, "grad_norm": 0.2632649540901184, "learning_rate": 8.090367290952268e-06, "loss": 0.6836, "step": 6835 }, { "epoch": 3.0925130061072155, "grad_norm": 0.3134084939956665, "learning_rate": 8.08979122802849e-06, "loss": 0.5934, "step": 6836 }, { "epoch": 3.0929653924451483, "grad_norm": 0.2829670011997223, "learning_rate": 8.089215098745554e-06, "loss": 0.5978, "step": 6837 }, { "epoch": 3.0934177787830808, "grad_norm": 0.24772188067436218, "learning_rate": 8.088638903115835e-06, "loss": 0.5307, "step": 6838 }, { "epoch": 3.093870165121013, "grad_norm": 0.2789740264415741, "learning_rate": 8.088062641151705e-06, "loss": 0.5732, "step": 6839 }, { "epoch": 3.094322551458946, "grad_norm": 0.27235719561576843, "learning_rate": 8.087486312865543e-06, "loss": 0.4377, "step": 6840 }, { "epoch": 3.0947749377968785, "grad_norm": 0.31865793466567993, "learning_rate": 8.086909918269726e-06, "loss": 0.5339, "step": 6841 }, { "epoch": 3.095227324134811, "grad_norm": 0.28711238503456116, "learning_rate": 8.086333457376632e-06, "loss": 0.5523, "step": 6842 }, { "epoch": 3.095679710472744, "grad_norm": 0.3385606110095978, "learning_rate": 8.085756930198643e-06, "loss": 0.8013, "step": 6843 }, { "epoch": 3.0961320968106762, "grad_norm": 0.32857954502105713, "learning_rate": 8.08518033674814e-06, "loss": 0.6114, "step": 6844 }, { "epoch": 3.096584483148609, "grad_norm": 0.3014777898788452, "learning_rate": 8.084603677037508e-06, "loss": 0.5386, "step": 6845 }, { "epoch": 3.0970368694865416, "grad_norm": 0.3338097631931305, "learning_rate": 8.08402695107913e-06, "loss": 0.6355, "step": 6846 }, { "epoch": 3.097489255824474, "grad_norm": 0.34926921129226685, "learning_rate": 8.083450158885395e-06, "loss": 0.6054, "step": 6847 }, { "epoch": 3.097941642162407, "grad_norm": 0.3141983151435852, "learning_rate": 8.082873300468688e-06, "loss": 0.4656, "step": 6848 }, { "epoch": 3.0983940285003393, "grad_norm": 0.3484163284301758, "learning_rate": 8.082296375841398e-06, "loss": 0.6146, "step": 6849 }, { "epoch": 3.0988464148382717, "grad_norm": 0.3291020393371582, "learning_rate": 8.081719385015919e-06, "loss": 0.5277, "step": 6850 }, { "epoch": 3.0992988011762046, "grad_norm": 0.3716105818748474, "learning_rate": 8.081142328004638e-06, "loss": 0.6406, "step": 6851 }, { "epoch": 3.099751187514137, "grad_norm": 0.35620906949043274, "learning_rate": 8.08056520481995e-06, "loss": 0.6028, "step": 6852 }, { "epoch": 3.1002035738520695, "grad_norm": 0.3387952148914337, "learning_rate": 8.079988015474255e-06, "loss": 0.5027, "step": 6853 }, { "epoch": 3.1006559601900023, "grad_norm": 0.35806503891944885, "learning_rate": 8.079410759979942e-06, "loss": 0.5884, "step": 6854 }, { "epoch": 3.1011083465279348, "grad_norm": 0.37269943952560425, "learning_rate": 8.078833438349411e-06, "loss": 0.5566, "step": 6855 }, { "epoch": 3.1015607328658676, "grad_norm": 0.3644399344921112, "learning_rate": 8.078256050595063e-06, "loss": 0.5703, "step": 6856 }, { "epoch": 3.1020131192038, "grad_norm": 0.32508349418640137, "learning_rate": 8.077678596729298e-06, "loss": 0.5336, "step": 6857 }, { "epoch": 3.1024655055417325, "grad_norm": 0.38305333256721497, "learning_rate": 8.077101076764518e-06, "loss": 0.5643, "step": 6858 }, { "epoch": 3.1029178918796654, "grad_norm": 0.3434520959854126, "learning_rate": 8.076523490713123e-06, "loss": 0.4588, "step": 6859 }, { "epoch": 3.103370278217598, "grad_norm": 0.41532203555107117, "learning_rate": 8.07594583858752e-06, "loss": 0.622, "step": 6860 }, { "epoch": 3.1038226645555302, "grad_norm": 0.35696956515312195, "learning_rate": 8.075368120400117e-06, "loss": 0.5062, "step": 6861 }, { "epoch": 3.104275050893463, "grad_norm": 0.36520859599113464, "learning_rate": 8.074790336163321e-06, "loss": 0.5322, "step": 6862 }, { "epoch": 3.1047274372313955, "grad_norm": 0.40325719118118286, "learning_rate": 8.07421248588954e-06, "loss": 0.5933, "step": 6863 }, { "epoch": 3.105179823569328, "grad_norm": 0.41172271966934204, "learning_rate": 8.073634569591183e-06, "loss": 0.5703, "step": 6864 }, { "epoch": 3.105632209907261, "grad_norm": 0.37423792481422424, "learning_rate": 8.073056587280664e-06, "loss": 0.5771, "step": 6865 }, { "epoch": 3.1060845962451933, "grad_norm": 0.3712892532348633, "learning_rate": 8.072478538970394e-06, "loss": 0.4341, "step": 6866 }, { "epoch": 3.106536982583126, "grad_norm": 0.38154277205467224, "learning_rate": 8.071900424672792e-06, "loss": 0.5313, "step": 6867 }, { "epoch": 3.1069893689210586, "grad_norm": 0.3947354853153229, "learning_rate": 8.07132224440027e-06, "loss": 0.5299, "step": 6868 }, { "epoch": 3.107441755258991, "grad_norm": 0.4181290864944458, "learning_rate": 8.070743998165246e-06, "loss": 0.5336, "step": 6869 }, { "epoch": 3.107894141596924, "grad_norm": 0.39085903763771057, "learning_rate": 8.070165685980143e-06, "loss": 0.4789, "step": 6870 }, { "epoch": 3.1083465279348563, "grad_norm": 0.36226218938827515, "learning_rate": 8.069587307857377e-06, "loss": 0.3934, "step": 6871 }, { "epoch": 3.1087989142727888, "grad_norm": 0.3849199712276459, "learning_rate": 8.06900886380937e-06, "loss": 0.4106, "step": 6872 }, { "epoch": 3.1092513006107216, "grad_norm": 0.43923377990722656, "learning_rate": 8.068430353848547e-06, "loss": 0.5589, "step": 6873 }, { "epoch": 3.109703686948654, "grad_norm": 0.4446803629398346, "learning_rate": 8.067851777987331e-06, "loss": 0.5201, "step": 6874 }, { "epoch": 3.110156073286587, "grad_norm": 0.45933303236961365, "learning_rate": 8.06727313623815e-06, "loss": 0.5028, "step": 6875 }, { "epoch": 3.1106084596245194, "grad_norm": 0.4451215863227844, "learning_rate": 8.06669442861343e-06, "loss": 0.5283, "step": 6876 }, { "epoch": 3.111060845962452, "grad_norm": 0.5161502361297607, "learning_rate": 8.0661156551256e-06, "loss": 0.4692, "step": 6877 }, { "epoch": 3.1115132323003847, "grad_norm": 0.4900444447994232, "learning_rate": 8.06553681578709e-06, "loss": 0.531, "step": 6878 }, { "epoch": 3.111965618638317, "grad_norm": 0.5274783968925476, "learning_rate": 8.064957910610335e-06, "loss": 0.5403, "step": 6879 }, { "epoch": 3.1124180049762495, "grad_norm": 0.49520814418792725, "learning_rate": 8.064378939607762e-06, "loss": 0.4531, "step": 6880 }, { "epoch": 3.1128703913141824, "grad_norm": 0.47939804196357727, "learning_rate": 8.06379990279181e-06, "loss": 0.4493, "step": 6881 }, { "epoch": 3.113322777652115, "grad_norm": 0.3630225658416748, "learning_rate": 8.06322080017491e-06, "loss": 0.9097, "step": 6882 }, { "epoch": 3.1137751639900477, "grad_norm": 0.1942109763622284, "learning_rate": 8.062641631769507e-06, "loss": 0.7007, "step": 6883 }, { "epoch": 3.11422755032798, "grad_norm": 0.23993158340454102, "learning_rate": 8.062062397588036e-06, "loss": 0.6041, "step": 6884 }, { "epoch": 3.1146799366659126, "grad_norm": 0.25432339310646057, "learning_rate": 8.061483097642935e-06, "loss": 0.6489, "step": 6885 }, { "epoch": 3.1151323230038455, "grad_norm": 0.2708612382411957, "learning_rate": 8.060903731946648e-06, "loss": 0.6292, "step": 6886 }, { "epoch": 3.115584709341778, "grad_norm": 0.3297748863697052, "learning_rate": 8.060324300511617e-06, "loss": 0.8659, "step": 6887 }, { "epoch": 3.1160370956797103, "grad_norm": 0.25759515166282654, "learning_rate": 8.059744803350286e-06, "loss": 0.6762, "step": 6888 }, { "epoch": 3.116489482017643, "grad_norm": 0.3026386499404907, "learning_rate": 8.059165240475105e-06, "loss": 0.5898, "step": 6889 }, { "epoch": 3.1169418683555756, "grad_norm": 0.274992972612381, "learning_rate": 8.058585611898514e-06, "loss": 0.5314, "step": 6890 }, { "epoch": 3.117394254693508, "grad_norm": 0.2768142521381378, "learning_rate": 8.058005917632967e-06, "loss": 0.5136, "step": 6891 }, { "epoch": 3.117846641031441, "grad_norm": 0.29141396284103394, "learning_rate": 8.057426157690912e-06, "loss": 0.5827, "step": 6892 }, { "epoch": 3.1182990273693734, "grad_norm": 0.30183881521224976, "learning_rate": 8.056846332084803e-06, "loss": 0.6068, "step": 6893 }, { "epoch": 3.1187514137073062, "grad_norm": 0.3151332437992096, "learning_rate": 8.05626644082709e-06, "loss": 0.6265, "step": 6894 }, { "epoch": 3.1192038000452387, "grad_norm": 0.35716086626052856, "learning_rate": 8.055686483930224e-06, "loss": 0.5614, "step": 6895 }, { "epoch": 3.119656186383171, "grad_norm": 0.33306947350502014, "learning_rate": 8.05510646140667e-06, "loss": 0.5053, "step": 6896 }, { "epoch": 3.120108572721104, "grad_norm": 0.31318172812461853, "learning_rate": 8.054526373268878e-06, "loss": 0.5222, "step": 6897 }, { "epoch": 3.1205609590590364, "grad_norm": 0.3654864430427551, "learning_rate": 8.053946219529307e-06, "loss": 0.5989, "step": 6898 }, { "epoch": 3.121013345396969, "grad_norm": 0.3355353772640228, "learning_rate": 8.05336600020042e-06, "loss": 0.6224, "step": 6899 }, { "epoch": 3.1214657317349017, "grad_norm": 0.33339160680770874, "learning_rate": 8.052785715294676e-06, "loss": 0.486, "step": 6900 }, { "epoch": 3.121918118072834, "grad_norm": 0.3309531807899475, "learning_rate": 8.052205364824538e-06, "loss": 0.7166, "step": 6901 }, { "epoch": 3.1223705044107666, "grad_norm": 0.33880844712257385, "learning_rate": 8.051624948802472e-06, "loss": 0.4906, "step": 6902 }, { "epoch": 3.1228228907486995, "grad_norm": 0.36526042222976685, "learning_rate": 8.051044467240941e-06, "loss": 0.5829, "step": 6903 }, { "epoch": 3.123275277086632, "grad_norm": 0.36187154054641724, "learning_rate": 8.050463920152412e-06, "loss": 0.5108, "step": 6904 }, { "epoch": 3.1237276634245648, "grad_norm": 0.39799681305885315, "learning_rate": 8.049883307549355e-06, "loss": 0.5949, "step": 6905 }, { "epoch": 3.124180049762497, "grad_norm": 0.3772258460521698, "learning_rate": 8.049302629444239e-06, "loss": 0.531, "step": 6906 }, { "epoch": 3.1246324361004296, "grad_norm": 0.3999866247177124, "learning_rate": 8.048721885849534e-06, "loss": 0.474, "step": 6907 }, { "epoch": 3.1250848224383625, "grad_norm": 0.36747726798057556, "learning_rate": 8.048141076777716e-06, "loss": 0.5705, "step": 6908 }, { "epoch": 3.125537208776295, "grad_norm": 0.3428710103034973, "learning_rate": 8.047560202241255e-06, "loss": 0.4716, "step": 6909 }, { "epoch": 3.1259895951142274, "grad_norm": 0.395895391702652, "learning_rate": 8.046979262252627e-06, "loss": 0.4849, "step": 6910 }, { "epoch": 3.1264419814521602, "grad_norm": 0.3584251403808594, "learning_rate": 8.046398256824312e-06, "loss": 0.4721, "step": 6911 }, { "epoch": 3.1268943677900927, "grad_norm": 0.39600083231925964, "learning_rate": 8.045817185968787e-06, "loss": 0.5475, "step": 6912 }, { "epoch": 3.1273467541280255, "grad_norm": 0.38075822591781616, "learning_rate": 8.045236049698529e-06, "loss": 0.5681, "step": 6913 }, { "epoch": 3.127799140465958, "grad_norm": 0.4311618506908417, "learning_rate": 8.04465484802602e-06, "loss": 0.5306, "step": 6914 }, { "epoch": 3.1282515268038904, "grad_norm": 0.4227982759475708, "learning_rate": 8.044073580963747e-06, "loss": 0.5443, "step": 6915 }, { "epoch": 3.1287039131418233, "grad_norm": 0.42311328649520874, "learning_rate": 8.043492248524186e-06, "loss": 0.5597, "step": 6916 }, { "epoch": 3.1291562994797557, "grad_norm": 0.4390213191509247, "learning_rate": 8.042910850719827e-06, "loss": 0.5939, "step": 6917 }, { "epoch": 3.129608685817688, "grad_norm": 0.4241529405117035, "learning_rate": 8.042329387563158e-06, "loss": 0.5766, "step": 6918 }, { "epoch": 3.130061072155621, "grad_norm": 0.42601141333580017, "learning_rate": 8.041747859066663e-06, "loss": 0.524, "step": 6919 }, { "epoch": 3.1305134584935534, "grad_norm": 0.3979688286781311, "learning_rate": 8.041166265242833e-06, "loss": 0.5124, "step": 6920 }, { "epoch": 3.1309658448314863, "grad_norm": 0.4732106029987335, "learning_rate": 8.04058460610416e-06, "loss": 0.662, "step": 6921 }, { "epoch": 3.1314182311694188, "grad_norm": 0.41427430510520935, "learning_rate": 8.040002881663135e-06, "loss": 0.4975, "step": 6922 }, { "epoch": 3.131870617507351, "grad_norm": 0.3875648081302643, "learning_rate": 8.039421091932252e-06, "loss": 0.4822, "step": 6923 }, { "epoch": 3.132323003845284, "grad_norm": 0.4752960503101349, "learning_rate": 8.038839236924006e-06, "loss": 0.5237, "step": 6924 }, { "epoch": 3.1327753901832165, "grad_norm": 0.4554615914821625, "learning_rate": 8.038257316650895e-06, "loss": 0.5689, "step": 6925 }, { "epoch": 3.133227776521149, "grad_norm": 0.427712082862854, "learning_rate": 8.037675331125412e-06, "loss": 0.4613, "step": 6926 }, { "epoch": 3.133680162859082, "grad_norm": 0.4924871623516083, "learning_rate": 8.037093280360063e-06, "loss": 0.5502, "step": 6927 }, { "epoch": 3.1341325491970142, "grad_norm": 0.4643838703632355, "learning_rate": 8.036511164367343e-06, "loss": 0.4988, "step": 6928 }, { "epoch": 3.1345849355349467, "grad_norm": 0.5069987177848816, "learning_rate": 8.035928983159758e-06, "loss": 0.5273, "step": 6929 }, { "epoch": 3.1350373218728795, "grad_norm": 0.5288658142089844, "learning_rate": 8.035346736749808e-06, "loss": 0.5311, "step": 6930 }, { "epoch": 3.135489708210812, "grad_norm": 0.5331446528434753, "learning_rate": 8.034764425150002e-06, "loss": 0.4731, "step": 6931 }, { "epoch": 3.135942094548745, "grad_norm": 0.3709419071674347, "learning_rate": 8.034182048372841e-06, "loss": 1.185, "step": 6932 }, { "epoch": 3.1363944808866773, "grad_norm": 0.2004835456609726, "learning_rate": 8.033599606430837e-06, "loss": 0.9884, "step": 6933 }, { "epoch": 3.1368468672246097, "grad_norm": 0.26864421367645264, "learning_rate": 8.033017099336498e-06, "loss": 0.7484, "step": 6934 }, { "epoch": 3.1372992535625426, "grad_norm": 0.23171517252922058, "learning_rate": 8.032434527102333e-06, "loss": 0.5248, "step": 6935 }, { "epoch": 3.137751639900475, "grad_norm": 0.2798386812210083, "learning_rate": 8.031851889740856e-06, "loss": 0.6663, "step": 6936 }, { "epoch": 3.1382040262384074, "grad_norm": 0.2543222904205322, "learning_rate": 8.031269187264577e-06, "loss": 0.4947, "step": 6937 }, { "epoch": 3.1386564125763403, "grad_norm": 0.30305248498916626, "learning_rate": 8.030686419686015e-06, "loss": 0.6254, "step": 6938 }, { "epoch": 3.1391087989142727, "grad_norm": 0.2605748176574707, "learning_rate": 8.030103587017683e-06, "loss": 0.5403, "step": 6939 }, { "epoch": 3.139561185252205, "grad_norm": 0.30901357531547546, "learning_rate": 8.029520689272098e-06, "loss": 0.5522, "step": 6940 }, { "epoch": 3.140013571590138, "grad_norm": 0.307638943195343, "learning_rate": 8.028937726461781e-06, "loss": 0.5648, "step": 6941 }, { "epoch": 3.1404659579280705, "grad_norm": 0.3432576358318329, "learning_rate": 8.028354698599253e-06, "loss": 0.7332, "step": 6942 }, { "epoch": 3.1409183442660034, "grad_norm": 0.33052611351013184, "learning_rate": 8.027771605697032e-06, "loss": 0.5618, "step": 6943 }, { "epoch": 3.141370730603936, "grad_norm": 0.3047221601009369, "learning_rate": 8.027188447767642e-06, "loss": 0.5504, "step": 6944 }, { "epoch": 3.141823116941868, "grad_norm": 0.38693124055862427, "learning_rate": 8.026605224823612e-06, "loss": 0.63, "step": 6945 }, { "epoch": 3.142275503279801, "grad_norm": 0.3177262544631958, "learning_rate": 8.02602193687746e-06, "loss": 0.5882, "step": 6946 }, { "epoch": 3.1427278896177335, "grad_norm": 0.3704546391963959, "learning_rate": 8.02543858394172e-06, "loss": 0.5417, "step": 6947 }, { "epoch": 3.143180275955666, "grad_norm": 0.3199019730091095, "learning_rate": 8.024855166028917e-06, "loss": 0.575, "step": 6948 }, { "epoch": 3.143632662293599, "grad_norm": 0.32693207263946533, "learning_rate": 8.024271683151584e-06, "loss": 0.5271, "step": 6949 }, { "epoch": 3.1440850486315313, "grad_norm": 0.35695508122444153, "learning_rate": 8.02368813532225e-06, "loss": 0.6077, "step": 6950 }, { "epoch": 3.1445374349694637, "grad_norm": 0.3475067615509033, "learning_rate": 8.023104522553447e-06, "loss": 0.5419, "step": 6951 }, { "epoch": 3.1449898213073966, "grad_norm": 0.3238171637058258, "learning_rate": 8.02252084485771e-06, "loss": 0.4811, "step": 6952 }, { "epoch": 3.145442207645329, "grad_norm": 0.3863169252872467, "learning_rate": 8.021937102247576e-06, "loss": 0.5334, "step": 6953 }, { "epoch": 3.145894593983262, "grad_norm": 0.40878379344940186, "learning_rate": 8.02135329473558e-06, "loss": 0.6744, "step": 6954 }, { "epoch": 3.1463469803211943, "grad_norm": 0.36418062448501587, "learning_rate": 8.020769422334264e-06, "loss": 0.5192, "step": 6955 }, { "epoch": 3.1467993666591267, "grad_norm": 0.38859322667121887, "learning_rate": 8.020185485056164e-06, "loss": 0.4885, "step": 6956 }, { "epoch": 3.1472517529970596, "grad_norm": 0.3227870464324951, "learning_rate": 8.019601482913822e-06, "loss": 0.3889, "step": 6957 }, { "epoch": 3.147704139334992, "grad_norm": 0.3647085428237915, "learning_rate": 8.019017415919781e-06, "loss": 0.5308, "step": 6958 }, { "epoch": 3.1481565256729245, "grad_norm": 0.3703622817993164, "learning_rate": 8.018433284086585e-06, "loss": 0.4847, "step": 6959 }, { "epoch": 3.1486089120108574, "grad_norm": 0.3756011128425598, "learning_rate": 8.017849087426779e-06, "loss": 0.5171, "step": 6960 }, { "epoch": 3.14906129834879, "grad_norm": 0.37776681780815125, "learning_rate": 8.017264825952911e-06, "loss": 0.5282, "step": 6961 }, { "epoch": 3.1495136846867227, "grad_norm": 0.420879065990448, "learning_rate": 8.016680499677527e-06, "loss": 0.5396, "step": 6962 }, { "epoch": 3.149966071024655, "grad_norm": 0.4001430869102478, "learning_rate": 8.016096108613178e-06, "loss": 0.5364, "step": 6963 }, { "epoch": 3.1504184573625875, "grad_norm": 0.382823646068573, "learning_rate": 8.015511652772414e-06, "loss": 0.5009, "step": 6964 }, { "epoch": 3.1508708437005204, "grad_norm": 0.3435631990432739, "learning_rate": 8.01492713216779e-06, "loss": 0.3945, "step": 6965 }, { "epoch": 3.151323230038453, "grad_norm": 0.4195325970649719, "learning_rate": 8.014342546811855e-06, "loss": 0.5713, "step": 6966 }, { "epoch": 3.1517756163763853, "grad_norm": 0.41728419065475464, "learning_rate": 8.013757896717168e-06, "loss": 0.5116, "step": 6967 }, { "epoch": 3.152228002714318, "grad_norm": 0.4349029064178467, "learning_rate": 8.013173181896283e-06, "loss": 0.514, "step": 6968 }, { "epoch": 3.1526803890522506, "grad_norm": 0.464617520570755, "learning_rate": 8.012588402361759e-06, "loss": 0.5323, "step": 6969 }, { "epoch": 3.1531327753901834, "grad_norm": 0.43149492144584656, "learning_rate": 8.012003558126155e-06, "loss": 0.6156, "step": 6970 }, { "epoch": 3.153585161728116, "grad_norm": 0.39568814635276794, "learning_rate": 8.011418649202035e-06, "loss": 0.4167, "step": 6971 }, { "epoch": 3.1540375480660483, "grad_norm": 0.39569327235221863, "learning_rate": 8.010833675601954e-06, "loss": 0.4443, "step": 6972 }, { "epoch": 3.154489934403981, "grad_norm": 0.4421551823616028, "learning_rate": 8.01024863733848e-06, "loss": 0.5563, "step": 6973 }, { "epoch": 3.1549423207419136, "grad_norm": 0.39105111360549927, "learning_rate": 8.009663534424179e-06, "loss": 0.4351, "step": 6974 }, { "epoch": 3.155394707079846, "grad_norm": 0.46496108174324036, "learning_rate": 8.009078366871614e-06, "loss": 0.5105, "step": 6975 }, { "epoch": 3.155847093417779, "grad_norm": 0.5543685555458069, "learning_rate": 8.008493134693355e-06, "loss": 0.6559, "step": 6976 }, { "epoch": 3.1562994797557113, "grad_norm": 0.4545900821685791, "learning_rate": 8.007907837901969e-06, "loss": 0.5242, "step": 6977 }, { "epoch": 3.1567518660936438, "grad_norm": 0.46894022822380066, "learning_rate": 8.007322476510027e-06, "loss": 0.507, "step": 6978 }, { "epoch": 3.1572042524315767, "grad_norm": 0.48870187997817993, "learning_rate": 8.006737050530103e-06, "loss": 0.4399, "step": 6979 }, { "epoch": 3.157656638769509, "grad_norm": 0.587676465511322, "learning_rate": 8.006151559974764e-06, "loss": 0.5486, "step": 6980 }, { "epoch": 3.158109025107442, "grad_norm": 0.6422447562217712, "learning_rate": 8.005566004856593e-06, "loss": 0.5331, "step": 6981 }, { "epoch": 3.1585614114453744, "grad_norm": 0.47243064641952515, "learning_rate": 8.004980385188159e-06, "loss": 0.9817, "step": 6982 }, { "epoch": 3.159013797783307, "grad_norm": 0.2167821079492569, "learning_rate": 8.004394700982044e-06, "loss": 0.8052, "step": 6983 }, { "epoch": 3.1594661841212397, "grad_norm": 0.2080824077129364, "learning_rate": 8.003808952250822e-06, "loss": 0.6008, "step": 6984 }, { "epoch": 3.159918570459172, "grad_norm": 0.2340872883796692, "learning_rate": 8.003223139007076e-06, "loss": 0.4248, "step": 6985 }, { "epoch": 3.1603709567971046, "grad_norm": 0.27524295449256897, "learning_rate": 8.002637261263389e-06, "loss": 0.6908, "step": 6986 }, { "epoch": 3.1608233431350374, "grad_norm": 0.32265907526016235, "learning_rate": 8.002051319032342e-06, "loss": 0.6962, "step": 6987 }, { "epoch": 3.16127572947297, "grad_norm": 0.28266558051109314, "learning_rate": 8.001465312326518e-06, "loss": 0.5797, "step": 6988 }, { "epoch": 3.1617281158109023, "grad_norm": 0.2918812334537506, "learning_rate": 8.000879241158504e-06, "loss": 0.5965, "step": 6989 }, { "epoch": 3.162180502148835, "grad_norm": 0.32772496342658997, "learning_rate": 8.000293105540887e-06, "loss": 0.6556, "step": 6990 }, { "epoch": 3.1626328884867676, "grad_norm": 0.3082197904586792, "learning_rate": 7.999706905486256e-06, "loss": 0.5926, "step": 6991 }, { "epoch": 3.1630852748247005, "grad_norm": 0.31491950154304504, "learning_rate": 7.999120641007198e-06, "loss": 0.5835, "step": 6992 }, { "epoch": 3.163537661162633, "grad_norm": 0.3310619294643402, "learning_rate": 7.998534312116307e-06, "loss": 0.567, "step": 6993 }, { "epoch": 3.1639900475005653, "grad_norm": 0.3487052917480469, "learning_rate": 7.997947918826174e-06, "loss": 0.568, "step": 6994 }, { "epoch": 3.164442433838498, "grad_norm": 0.3268755376338959, "learning_rate": 7.997361461149396e-06, "loss": 0.5352, "step": 6995 }, { "epoch": 3.1648948201764306, "grad_norm": 0.3095226287841797, "learning_rate": 7.996774939098563e-06, "loss": 0.4644, "step": 6996 }, { "epoch": 3.165347206514363, "grad_norm": 0.3546241819858551, "learning_rate": 7.996188352686274e-06, "loss": 0.6518, "step": 6997 }, { "epoch": 3.165799592852296, "grad_norm": 0.3349514305591583, "learning_rate": 7.99560170192513e-06, "loss": 0.5215, "step": 6998 }, { "epoch": 3.1662519791902284, "grad_norm": 0.3694009780883789, "learning_rate": 7.995014986827728e-06, "loss": 0.4758, "step": 6999 }, { "epoch": 3.166704365528161, "grad_norm": 0.3551805317401886, "learning_rate": 7.994428207406667e-06, "loss": 0.61, "step": 7000 }, { "epoch": 3.166704365528161, "eval_loss": 0.5925046801567078, "eval_runtime": 25.5285, "eval_samples_per_second": 29.144, "eval_steps_per_second": 7.286, "step": 7000 }, { "epoch": 3.1671567518660937, "grad_norm": 0.35683301091194153, "learning_rate": 7.993841363674551e-06, "loss": 0.564, "step": 7001 }, { "epoch": 3.167609138204026, "grad_norm": 0.35105594992637634, "learning_rate": 7.993254455643985e-06, "loss": 0.5878, "step": 7002 }, { "epoch": 3.168061524541959, "grad_norm": 0.3679022192955017, "learning_rate": 7.992667483327573e-06, "loss": 0.6125, "step": 7003 }, { "epoch": 3.1685139108798914, "grad_norm": 0.3466130793094635, "learning_rate": 7.99208044673792e-06, "loss": 0.5599, "step": 7004 }, { "epoch": 3.168966297217824, "grad_norm": 0.4128170311450958, "learning_rate": 7.991493345887634e-06, "loss": 0.6964, "step": 7005 }, { "epoch": 3.1694186835557567, "grad_norm": 0.33330920338630676, "learning_rate": 7.990906180789325e-06, "loss": 0.4222, "step": 7006 }, { "epoch": 3.169871069893689, "grad_norm": 0.33921971917152405, "learning_rate": 7.990318951455604e-06, "loss": 0.6764, "step": 7007 }, { "epoch": 3.170323456231622, "grad_norm": 0.3546331822872162, "learning_rate": 7.989731657899083e-06, "loss": 0.4459, "step": 7008 }, { "epoch": 3.1707758425695545, "grad_norm": 0.3443077802658081, "learning_rate": 7.989144300132372e-06, "loss": 0.5007, "step": 7009 }, { "epoch": 3.171228228907487, "grad_norm": 0.37011444568634033, "learning_rate": 7.98855687816809e-06, "loss": 0.5103, "step": 7010 }, { "epoch": 3.1716806152454198, "grad_norm": 0.36087900400161743, "learning_rate": 7.98796939201885e-06, "loss": 0.5164, "step": 7011 }, { "epoch": 3.172133001583352, "grad_norm": 0.3509998917579651, "learning_rate": 7.98738184169727e-06, "loss": 0.5245, "step": 7012 }, { "epoch": 3.1725853879212846, "grad_norm": 0.3948846757411957, "learning_rate": 7.986794227215971e-06, "loss": 0.4863, "step": 7013 }, { "epoch": 3.1730377742592175, "grad_norm": 0.3443911373615265, "learning_rate": 7.98620654858757e-06, "loss": 0.4197, "step": 7014 }, { "epoch": 3.17349016059715, "grad_norm": 0.3779710531234741, "learning_rate": 7.985618805824689e-06, "loss": 0.5428, "step": 7015 }, { "epoch": 3.1739425469350824, "grad_norm": 0.40349137783050537, "learning_rate": 7.985030998939954e-06, "loss": 0.5143, "step": 7016 }, { "epoch": 3.1743949332730153, "grad_norm": 0.41628026962280273, "learning_rate": 7.984443127945984e-06, "loss": 0.4785, "step": 7017 }, { "epoch": 3.1748473196109477, "grad_norm": 0.5081429481506348, "learning_rate": 7.98385519285541e-06, "loss": 0.6412, "step": 7018 }, { "epoch": 3.1752997059488806, "grad_norm": 0.4193955957889557, "learning_rate": 7.983267193680856e-06, "loss": 0.5235, "step": 7019 }, { "epoch": 3.175752092286813, "grad_norm": 0.4552347660064697, "learning_rate": 7.982679130434952e-06, "loss": 0.5468, "step": 7020 }, { "epoch": 3.1762044786247454, "grad_norm": 0.4040985405445099, "learning_rate": 7.982091003130325e-06, "loss": 0.4674, "step": 7021 }, { "epoch": 3.1766568649626783, "grad_norm": 0.48456066846847534, "learning_rate": 7.981502811779609e-06, "loss": 0.6356, "step": 7022 }, { "epoch": 3.1771092513006107, "grad_norm": 0.4297106862068176, "learning_rate": 7.980914556395436e-06, "loss": 0.5336, "step": 7023 }, { "epoch": 3.177561637638543, "grad_norm": 0.4489736258983612, "learning_rate": 7.980326236990439e-06, "loss": 0.4926, "step": 7024 }, { "epoch": 3.178014023976476, "grad_norm": 0.47221097350120544, "learning_rate": 7.979737853577255e-06, "loss": 0.498, "step": 7025 }, { "epoch": 3.1784664103144085, "grad_norm": 0.4437166154384613, "learning_rate": 7.979149406168516e-06, "loss": 0.506, "step": 7026 }, { "epoch": 3.178918796652341, "grad_norm": 0.5408547520637512, "learning_rate": 7.978560894776866e-06, "loss": 0.682, "step": 7027 }, { "epoch": 3.1793711829902738, "grad_norm": 0.4593098759651184, "learning_rate": 7.977972319414941e-06, "loss": 0.4846, "step": 7028 }, { "epoch": 3.179823569328206, "grad_norm": 0.4756871163845062, "learning_rate": 7.977383680095384e-06, "loss": 0.4631, "step": 7029 }, { "epoch": 3.180275955666139, "grad_norm": 0.4937851130962372, "learning_rate": 7.976794976830834e-06, "loss": 0.585, "step": 7030 }, { "epoch": 3.1807283420040715, "grad_norm": 0.49688711762428284, "learning_rate": 7.976206209633937e-06, "loss": 0.4614, "step": 7031 }, { "epoch": 3.181180728342004, "grad_norm": 0.4355946183204651, "learning_rate": 7.975617378517337e-06, "loss": 1.1168, "step": 7032 }, { "epoch": 3.181633114679937, "grad_norm": 0.21012917160987854, "learning_rate": 7.975028483493681e-06, "loss": 0.9723, "step": 7033 }, { "epoch": 3.1820855010178692, "grad_norm": 0.24078357219696045, "learning_rate": 7.974439524575615e-06, "loss": 0.5688, "step": 7034 }, { "epoch": 3.1825378873558017, "grad_norm": 0.2467380166053772, "learning_rate": 7.97385050177579e-06, "loss": 0.5689, "step": 7035 }, { "epoch": 3.1829902736937346, "grad_norm": 0.2751285135746002, "learning_rate": 7.973261415106855e-06, "loss": 0.6116, "step": 7036 }, { "epoch": 3.183442660031667, "grad_norm": 0.2901551127433777, "learning_rate": 7.972672264581462e-06, "loss": 0.629, "step": 7037 }, { "epoch": 3.1838950463695994, "grad_norm": 0.289562463760376, "learning_rate": 7.972083050212267e-06, "loss": 0.5657, "step": 7038 }, { "epoch": 3.1843474327075323, "grad_norm": 0.3104276955127716, "learning_rate": 7.971493772011919e-06, "loss": 0.6643, "step": 7039 }, { "epoch": 3.1847998190454647, "grad_norm": 0.3093893826007843, "learning_rate": 7.970904429993078e-06, "loss": 0.5141, "step": 7040 }, { "epoch": 3.1852522053833976, "grad_norm": 0.3246336877346039, "learning_rate": 7.9703150241684e-06, "loss": 0.6671, "step": 7041 }, { "epoch": 3.18570459172133, "grad_norm": 0.34785333275794983, "learning_rate": 7.969725554550542e-06, "loss": 0.6385, "step": 7042 }, { "epoch": 3.1861569780592625, "grad_norm": 0.3495787978172302, "learning_rate": 7.969136021152166e-06, "loss": 0.5731, "step": 7043 }, { "epoch": 3.1866093643971953, "grad_norm": 0.317902147769928, "learning_rate": 7.968546423985934e-06, "loss": 0.568, "step": 7044 }, { "epoch": 3.1870617507351278, "grad_norm": 0.3667433559894562, "learning_rate": 7.967956763064508e-06, "loss": 0.6981, "step": 7045 }, { "epoch": 3.18751413707306, "grad_norm": 0.36692625284194946, "learning_rate": 7.967367038400551e-06, "loss": 0.5528, "step": 7046 }, { "epoch": 3.187966523410993, "grad_norm": 0.3536999225616455, "learning_rate": 7.96677725000673e-06, "loss": 0.515, "step": 7047 }, { "epoch": 3.1884189097489255, "grad_norm": 0.333442747592926, "learning_rate": 7.96618739789571e-06, "loss": 0.5414, "step": 7048 }, { "epoch": 3.1888712960868584, "grad_norm": 0.33520907163619995, "learning_rate": 7.965597482080161e-06, "loss": 0.5275, "step": 7049 }, { "epoch": 3.189323682424791, "grad_norm": 0.3598330616950989, "learning_rate": 7.965007502572752e-06, "loss": 0.508, "step": 7050 }, { "epoch": 3.1897760687627232, "grad_norm": 0.30597826838493347, "learning_rate": 7.964417459386153e-06, "loss": 0.4915, "step": 7051 }, { "epoch": 3.190228455100656, "grad_norm": 0.36792996525764465, "learning_rate": 7.963827352533038e-06, "loss": 0.5435, "step": 7052 }, { "epoch": 3.1906808414385885, "grad_norm": 0.3765811622142792, "learning_rate": 7.963237182026079e-06, "loss": 0.5574, "step": 7053 }, { "epoch": 3.191133227776521, "grad_norm": 0.37886157631874084, "learning_rate": 7.962646947877952e-06, "loss": 0.6654, "step": 7054 }, { "epoch": 3.191585614114454, "grad_norm": 0.35497185587882996, "learning_rate": 7.962056650101334e-06, "loss": 0.4636, "step": 7055 }, { "epoch": 3.1920380004523863, "grad_norm": 0.3490341901779175, "learning_rate": 7.9614662887089e-06, "loss": 0.5459, "step": 7056 }, { "epoch": 3.192490386790319, "grad_norm": 0.4163520336151123, "learning_rate": 7.960875863713332e-06, "loss": 0.7377, "step": 7057 }, { "epoch": 3.1929427731282516, "grad_norm": 0.4047759473323822, "learning_rate": 7.96028537512731e-06, "loss": 0.564, "step": 7058 }, { "epoch": 3.193395159466184, "grad_norm": 0.3783060908317566, "learning_rate": 7.959694822963515e-06, "loss": 0.49, "step": 7059 }, { "epoch": 3.193847545804117, "grad_norm": 0.37968510389328003, "learning_rate": 7.959104207234631e-06, "loss": 0.5587, "step": 7060 }, { "epoch": 3.1942999321420493, "grad_norm": 0.4128599762916565, "learning_rate": 7.958513527953341e-06, "loss": 0.6161, "step": 7061 }, { "epoch": 3.1947523184799818, "grad_norm": 0.4821203649044037, "learning_rate": 7.957922785132334e-06, "loss": 0.6288, "step": 7062 }, { "epoch": 3.1952047048179146, "grad_norm": 0.3755888044834137, "learning_rate": 7.957331978784295e-06, "loss": 0.5336, "step": 7063 }, { "epoch": 3.195657091155847, "grad_norm": 0.4381701350212097, "learning_rate": 7.956741108921913e-06, "loss": 0.5469, "step": 7064 }, { "epoch": 3.1961094774937795, "grad_norm": 0.42571115493774414, "learning_rate": 7.95615017555788e-06, "loss": 0.5357, "step": 7065 }, { "epoch": 3.1965618638317124, "grad_norm": 0.4026544988155365, "learning_rate": 7.955559178704884e-06, "loss": 0.4878, "step": 7066 }, { "epoch": 3.197014250169645, "grad_norm": 0.39848875999450684, "learning_rate": 7.954968118375619e-06, "loss": 0.5069, "step": 7067 }, { "epoch": 3.1974666365075777, "grad_norm": 0.4170195758342743, "learning_rate": 7.95437699458278e-06, "loss": 0.5, "step": 7068 }, { "epoch": 3.19791902284551, "grad_norm": 0.424089252948761, "learning_rate": 7.953785807339063e-06, "loss": 0.5307, "step": 7069 }, { "epoch": 3.1983714091834425, "grad_norm": 0.4416940212249756, "learning_rate": 7.953194556657165e-06, "loss": 0.574, "step": 7070 }, { "epoch": 3.1988237955213754, "grad_norm": 0.44232508540153503, "learning_rate": 7.95260324254978e-06, "loss": 0.5955, "step": 7071 }, { "epoch": 3.199276181859308, "grad_norm": 0.4115673005580902, "learning_rate": 7.952011865029614e-06, "loss": 0.4287, "step": 7072 }, { "epoch": 3.1997285681972403, "grad_norm": 0.5004408359527588, "learning_rate": 7.951420424109364e-06, "loss": 0.5628, "step": 7073 }, { "epoch": 3.200180954535173, "grad_norm": 0.4287615418434143, "learning_rate": 7.950828919801733e-06, "loss": 0.488, "step": 7074 }, { "epoch": 3.2006333408731056, "grad_norm": 0.47287437319755554, "learning_rate": 7.950237352119425e-06, "loss": 0.5956, "step": 7075 }, { "epoch": 3.201085727211038, "grad_norm": 0.4809645414352417, "learning_rate": 7.949645721075144e-06, "loss": 0.5008, "step": 7076 }, { "epoch": 3.201538113548971, "grad_norm": 0.47195765376091003, "learning_rate": 7.949054026681598e-06, "loss": 0.5196, "step": 7077 }, { "epoch": 3.2019904998869033, "grad_norm": 0.5021160244941711, "learning_rate": 7.948462268951495e-06, "loss": 0.5176, "step": 7078 }, { "epoch": 3.202442886224836, "grad_norm": 0.44220584630966187, "learning_rate": 7.947870447897541e-06, "loss": 0.4869, "step": 7079 }, { "epoch": 3.2028952725627686, "grad_norm": 0.5089640021324158, "learning_rate": 7.94727856353245e-06, "loss": 0.5628, "step": 7080 }, { "epoch": 3.203347658900701, "grad_norm": 0.5095834136009216, "learning_rate": 7.946686615868935e-06, "loss": 0.5268, "step": 7081 }, { "epoch": 3.203800045238634, "grad_norm": 0.421769380569458, "learning_rate": 7.946094604919703e-06, "loss": 1.2974, "step": 7082 }, { "epoch": 3.2042524315765664, "grad_norm": 0.21908463537693024, "learning_rate": 7.945502530697472e-06, "loss": 0.5171, "step": 7083 }, { "epoch": 3.204704817914499, "grad_norm": 0.23803231120109558, "learning_rate": 7.944910393214962e-06, "loss": 0.6183, "step": 7084 }, { "epoch": 3.2051572042524317, "grad_norm": 0.2699604332447052, "learning_rate": 7.944318192484883e-06, "loss": 0.7583, "step": 7085 }, { "epoch": 3.205609590590364, "grad_norm": 0.2953305244445801, "learning_rate": 7.943725928519958e-06, "loss": 0.4107, "step": 7086 }, { "epoch": 3.2060619769282965, "grad_norm": 0.2860432267189026, "learning_rate": 7.943133601332908e-06, "loss": 0.6119, "step": 7087 }, { "epoch": 3.2065143632662294, "grad_norm": 0.2757575809955597, "learning_rate": 7.942541210936452e-06, "loss": 0.6181, "step": 7088 }, { "epoch": 3.206966749604162, "grad_norm": 0.32480141520500183, "learning_rate": 7.941948757343312e-06, "loss": 0.5536, "step": 7089 }, { "epoch": 3.2074191359420947, "grad_norm": 0.2877108156681061, "learning_rate": 7.941356240566213e-06, "loss": 0.5226, "step": 7090 }, { "epoch": 3.207871522280027, "grad_norm": 0.3150126039981842, "learning_rate": 7.940763660617881e-06, "loss": 0.5744, "step": 7091 }, { "epoch": 3.2083239086179596, "grad_norm": 0.3385179340839386, "learning_rate": 7.940171017511042e-06, "loss": 0.5304, "step": 7092 }, { "epoch": 3.2087762949558924, "grad_norm": 0.28169405460357666, "learning_rate": 7.939578311258427e-06, "loss": 0.4673, "step": 7093 }, { "epoch": 3.209228681293825, "grad_norm": 0.34563952684402466, "learning_rate": 7.938985541872762e-06, "loss": 0.653, "step": 7094 }, { "epoch": 3.2096810676317578, "grad_norm": 0.3248763382434845, "learning_rate": 7.938392709366778e-06, "loss": 0.6039, "step": 7095 }, { "epoch": 3.21013345396969, "grad_norm": 0.32633519172668457, "learning_rate": 7.937799813753208e-06, "loss": 0.6074, "step": 7096 }, { "epoch": 3.2105858403076226, "grad_norm": 0.32448533177375793, "learning_rate": 7.937206855044785e-06, "loss": 0.5827, "step": 7097 }, { "epoch": 3.2110382266455555, "grad_norm": 0.3601732850074768, "learning_rate": 7.936613833254247e-06, "loss": 0.5569, "step": 7098 }, { "epoch": 3.211490612983488, "grad_norm": 0.31240981817245483, "learning_rate": 7.936020748394327e-06, "loss": 0.4607, "step": 7099 }, { "epoch": 3.2119429993214204, "grad_norm": 0.34914615750312805, "learning_rate": 7.935427600477766e-06, "loss": 0.5494, "step": 7100 }, { "epoch": 3.2123953856593532, "grad_norm": 0.3506433069705963, "learning_rate": 7.934834389517297e-06, "loss": 0.5836, "step": 7101 }, { "epoch": 3.2128477719972857, "grad_norm": 0.3431846797466278, "learning_rate": 7.934241115525665e-06, "loss": 0.5146, "step": 7102 }, { "epoch": 3.213300158335218, "grad_norm": 0.3474538326263428, "learning_rate": 7.93364777851561e-06, "loss": 0.5081, "step": 7103 }, { "epoch": 3.213752544673151, "grad_norm": 0.38559192419052124, "learning_rate": 7.933054378499875e-06, "loss": 0.5849, "step": 7104 }, { "epoch": 3.2142049310110834, "grad_norm": 0.42221328616142273, "learning_rate": 7.932460915491208e-06, "loss": 0.6475, "step": 7105 }, { "epoch": 3.2146573173490163, "grad_norm": 0.396502286195755, "learning_rate": 7.93186738950235e-06, "loss": 0.6114, "step": 7106 }, { "epoch": 3.2151097036869487, "grad_norm": 0.3846208155155182, "learning_rate": 7.93127380054605e-06, "loss": 0.5729, "step": 7107 }, { "epoch": 3.215562090024881, "grad_norm": 0.4037102162837982, "learning_rate": 7.930680148635057e-06, "loss": 0.6311, "step": 7108 }, { "epoch": 3.216014476362814, "grad_norm": 0.38412246108055115, "learning_rate": 7.930086433782119e-06, "loss": 0.538, "step": 7109 }, { "epoch": 3.2164668627007464, "grad_norm": 0.3830138146877289, "learning_rate": 7.929492655999988e-06, "loss": 0.5637, "step": 7110 }, { "epoch": 3.216919249038679, "grad_norm": 0.43438729643821716, "learning_rate": 7.928898815301418e-06, "loss": 0.5301, "step": 7111 }, { "epoch": 3.2173716353766117, "grad_norm": 0.39908310770988464, "learning_rate": 7.928304911699162e-06, "loss": 0.5186, "step": 7112 }, { "epoch": 3.217824021714544, "grad_norm": 0.379911333322525, "learning_rate": 7.927710945205973e-06, "loss": 0.5764, "step": 7113 }, { "epoch": 3.2182764080524766, "grad_norm": 0.38805434107780457, "learning_rate": 7.927116915834612e-06, "loss": 0.4939, "step": 7114 }, { "epoch": 3.2187287943904095, "grad_norm": 0.38646721839904785, "learning_rate": 7.926522823597832e-06, "loss": 0.6249, "step": 7115 }, { "epoch": 3.219181180728342, "grad_norm": 0.3925068974494934, "learning_rate": 7.925928668508395e-06, "loss": 0.4875, "step": 7116 }, { "epoch": 3.219633567066275, "grad_norm": 0.42352762818336487, "learning_rate": 7.925334450579063e-06, "loss": 0.5758, "step": 7117 }, { "epoch": 3.2200859534042072, "grad_norm": 0.3941341042518616, "learning_rate": 7.924740169822594e-06, "loss": 0.504, "step": 7118 }, { "epoch": 3.2205383397421397, "grad_norm": 0.4070132076740265, "learning_rate": 7.924145826251755e-06, "loss": 0.5388, "step": 7119 }, { "epoch": 3.2209907260800725, "grad_norm": 0.4091927111148834, "learning_rate": 7.923551419879308e-06, "loss": 0.5271, "step": 7120 }, { "epoch": 3.221443112418005, "grad_norm": 0.40489861369132996, "learning_rate": 7.922956950718021e-06, "loss": 0.4604, "step": 7121 }, { "epoch": 3.2218954987559374, "grad_norm": 0.40168946981430054, "learning_rate": 7.92236241878066e-06, "loss": 0.4474, "step": 7122 }, { "epoch": 3.2223478850938703, "grad_norm": 0.46359455585479736, "learning_rate": 7.921767824079995e-06, "loss": 0.6108, "step": 7123 }, { "epoch": 3.2228002714318027, "grad_norm": 0.4670944809913635, "learning_rate": 7.921173166628795e-06, "loss": 0.575, "step": 7124 }, { "epoch": 3.223252657769735, "grad_norm": 0.40727174282073975, "learning_rate": 7.920578446439831e-06, "loss": 0.5397, "step": 7125 }, { "epoch": 3.223705044107668, "grad_norm": 0.40843504667282104, "learning_rate": 7.919983663525877e-06, "loss": 0.4206, "step": 7126 }, { "epoch": 3.2241574304456004, "grad_norm": 0.4914346933364868, "learning_rate": 7.919388817899708e-06, "loss": 0.5157, "step": 7127 }, { "epoch": 3.2246098167835333, "grad_norm": 0.49659448862075806, "learning_rate": 7.918793909574096e-06, "loss": 0.5787, "step": 7128 }, { "epoch": 3.2250622031214657, "grad_norm": 0.5285300016403198, "learning_rate": 7.918198938561821e-06, "loss": 0.6054, "step": 7129 }, { "epoch": 3.225514589459398, "grad_norm": 0.5412314534187317, "learning_rate": 7.917603904875658e-06, "loss": 0.5393, "step": 7130 }, { "epoch": 3.225966975797331, "grad_norm": 0.6232995986938477, "learning_rate": 7.91700880852839e-06, "loss": 0.6141, "step": 7131 }, { "epoch": 3.2264193621352635, "grad_norm": 0.43472862243652344, "learning_rate": 7.916413649532796e-06, "loss": 0.9863, "step": 7132 }, { "epoch": 3.226871748473196, "grad_norm": 0.23308013379573822, "learning_rate": 7.915818427901659e-06, "loss": 1.1672, "step": 7133 }, { "epoch": 3.227324134811129, "grad_norm": 0.20874746143817902, "learning_rate": 7.915223143647761e-06, "loss": 0.4954, "step": 7134 }, { "epoch": 3.227776521149061, "grad_norm": 0.2727172076702118, "learning_rate": 7.914627796783888e-06, "loss": 0.7329, "step": 7135 }, { "epoch": 3.228228907486994, "grad_norm": 0.2506895363330841, "learning_rate": 7.914032387322825e-06, "loss": 0.5855, "step": 7136 }, { "epoch": 3.2286812938249265, "grad_norm": 0.292080283164978, "learning_rate": 7.913436915277361e-06, "loss": 0.6097, "step": 7137 }, { "epoch": 3.229133680162859, "grad_norm": 0.2680940628051758, "learning_rate": 7.912841380660285e-06, "loss": 0.5558, "step": 7138 }, { "epoch": 3.229586066500792, "grad_norm": 0.2722710967063904, "learning_rate": 7.912245783484388e-06, "loss": 0.5171, "step": 7139 }, { "epoch": 3.2300384528387243, "grad_norm": 0.3144720792770386, "learning_rate": 7.91165012376246e-06, "loss": 0.6357, "step": 7140 }, { "epoch": 3.2304908391766567, "grad_norm": 0.36167648434638977, "learning_rate": 7.911054401507292e-06, "loss": 0.6169, "step": 7141 }, { "epoch": 3.2309432255145896, "grad_norm": 0.37933972477912903, "learning_rate": 7.910458616731682e-06, "loss": 0.7088, "step": 7142 }, { "epoch": 3.231395611852522, "grad_norm": 0.33637067675590515, "learning_rate": 7.909862769448424e-06, "loss": 0.607, "step": 7143 }, { "epoch": 3.231847998190455, "grad_norm": 0.3162451982498169, "learning_rate": 7.909266859670316e-06, "loss": 0.5627, "step": 7144 }, { "epoch": 3.2323003845283873, "grad_norm": 0.34916749596595764, "learning_rate": 7.908670887410153e-06, "loss": 0.5618, "step": 7145 }, { "epoch": 3.2327527708663197, "grad_norm": 0.3457392454147339, "learning_rate": 7.908074852680738e-06, "loss": 0.5076, "step": 7146 }, { "epoch": 3.2332051572042526, "grad_norm": 0.3462238907814026, "learning_rate": 7.90747875549487e-06, "loss": 0.4469, "step": 7147 }, { "epoch": 3.233657543542185, "grad_norm": 0.325696736574173, "learning_rate": 7.906882595865355e-06, "loss": 0.5042, "step": 7148 }, { "epoch": 3.2341099298801175, "grad_norm": 0.31266433000564575, "learning_rate": 7.906286373804992e-06, "loss": 0.4811, "step": 7149 }, { "epoch": 3.2345623162180503, "grad_norm": 0.36549097299575806, "learning_rate": 7.905690089326587e-06, "loss": 0.6098, "step": 7150 }, { "epoch": 3.235014702555983, "grad_norm": 0.2939645051956177, "learning_rate": 7.90509374244295e-06, "loss": 0.4128, "step": 7151 }, { "epoch": 3.235467088893915, "grad_norm": 0.3440958261489868, "learning_rate": 7.904497333166885e-06, "loss": 0.5392, "step": 7152 }, { "epoch": 3.235919475231848, "grad_norm": 0.3494541049003601, "learning_rate": 7.9039008615112e-06, "loss": 0.5764, "step": 7153 }, { "epoch": 3.2363718615697805, "grad_norm": 0.3787574768066406, "learning_rate": 7.90330432748871e-06, "loss": 0.5457, "step": 7154 }, { "epoch": 3.2368242479077134, "grad_norm": 0.3424752652645111, "learning_rate": 7.902707731112222e-06, "loss": 0.4691, "step": 7155 }, { "epoch": 3.237276634245646, "grad_norm": 0.35327470302581787, "learning_rate": 7.902111072394554e-06, "loss": 0.5785, "step": 7156 }, { "epoch": 3.2377290205835783, "grad_norm": 0.4000433087348938, "learning_rate": 7.901514351348515e-06, "loss": 0.5835, "step": 7157 }, { "epoch": 3.238181406921511, "grad_norm": 0.3543972671031952, "learning_rate": 7.900917567986924e-06, "loss": 0.528, "step": 7158 }, { "epoch": 3.2386337932594436, "grad_norm": 0.37374886870384216, "learning_rate": 7.900320722322597e-06, "loss": 0.554, "step": 7159 }, { "epoch": 3.239086179597376, "grad_norm": 0.3929511606693268, "learning_rate": 7.899723814368353e-06, "loss": 0.6246, "step": 7160 }, { "epoch": 3.239538565935309, "grad_norm": 0.3711974024772644, "learning_rate": 7.899126844137013e-06, "loss": 0.5109, "step": 7161 }, { "epoch": 3.2399909522732413, "grad_norm": 0.43197202682495117, "learning_rate": 7.898529811641393e-06, "loss": 0.6289, "step": 7162 }, { "epoch": 3.2404433386111737, "grad_norm": 0.37375229597091675, "learning_rate": 7.897932716894321e-06, "loss": 0.5847, "step": 7163 }, { "epoch": 3.2408957249491066, "grad_norm": 0.3945968449115753, "learning_rate": 7.89733555990862e-06, "loss": 0.521, "step": 7164 }, { "epoch": 3.241348111287039, "grad_norm": 0.4234285354614258, "learning_rate": 7.896738340697111e-06, "loss": 0.525, "step": 7165 }, { "epoch": 3.241800497624972, "grad_norm": 0.43108704686164856, "learning_rate": 7.896141059272626e-06, "loss": 0.5737, "step": 7166 }, { "epoch": 3.2422528839629043, "grad_norm": 0.41351380944252014, "learning_rate": 7.895543715647986e-06, "loss": 0.5094, "step": 7167 }, { "epoch": 3.2427052703008368, "grad_norm": 0.3776361048221588, "learning_rate": 7.894946309836028e-06, "loss": 0.4682, "step": 7168 }, { "epoch": 3.2431576566387696, "grad_norm": 0.4097554683685303, "learning_rate": 7.894348841849578e-06, "loss": 0.5308, "step": 7169 }, { "epoch": 3.243610042976702, "grad_norm": 0.4296043813228607, "learning_rate": 7.893751311701465e-06, "loss": 0.4842, "step": 7170 }, { "epoch": 3.2440624293146345, "grad_norm": 0.43637341260910034, "learning_rate": 7.893153719404529e-06, "loss": 0.4907, "step": 7171 }, { "epoch": 3.2445148156525674, "grad_norm": 0.40865519642829895, "learning_rate": 7.892556064971599e-06, "loss": 0.4288, "step": 7172 }, { "epoch": 3.2449672019905, "grad_norm": 0.4275365173816681, "learning_rate": 7.891958348415512e-06, "loss": 0.5151, "step": 7173 }, { "epoch": 3.2454195883284322, "grad_norm": 0.5663906335830688, "learning_rate": 7.891360569749107e-06, "loss": 0.6109, "step": 7174 }, { "epoch": 3.245871974666365, "grad_norm": 0.4664932191371918, "learning_rate": 7.89076272898522e-06, "loss": 0.4775, "step": 7175 }, { "epoch": 3.2463243610042976, "grad_norm": 0.43828439712524414, "learning_rate": 7.890164826136693e-06, "loss": 0.5294, "step": 7176 }, { "epoch": 3.2467767473422304, "grad_norm": 0.49869346618652344, "learning_rate": 7.889566861216365e-06, "loss": 0.5337, "step": 7177 }, { "epoch": 3.247229133680163, "grad_norm": 0.4927443563938141, "learning_rate": 7.888968834237079e-06, "loss": 0.4753, "step": 7178 }, { "epoch": 3.2476815200180953, "grad_norm": 0.48605582118034363, "learning_rate": 7.88837074521168e-06, "loss": 0.4648, "step": 7179 }, { "epoch": 3.248133906356028, "grad_norm": 0.5063513517379761, "learning_rate": 7.887772594153012e-06, "loss": 0.4823, "step": 7180 }, { "epoch": 3.2485862926939606, "grad_norm": 0.6222195625305176, "learning_rate": 7.887174381073921e-06, "loss": 0.5667, "step": 7181 }, { "epoch": 3.249038679031893, "grad_norm": 0.49310699105262756, "learning_rate": 7.886576105987255e-06, "loss": 0.946, "step": 7182 }, { "epoch": 3.249491065369826, "grad_norm": 0.20313602685928345, "learning_rate": 7.885977768905866e-06, "loss": 0.8113, "step": 7183 }, { "epoch": 3.2499434517077583, "grad_norm": 0.275950163602829, "learning_rate": 7.885379369842601e-06, "loss": 0.6504, "step": 7184 }, { "epoch": 3.250395838045691, "grad_norm": 0.2736874222755432, "learning_rate": 7.884780908810312e-06, "loss": 0.5994, "step": 7185 }, { "epoch": 3.2508482243836236, "grad_norm": 0.267783522605896, "learning_rate": 7.884182385821853e-06, "loss": 0.6307, "step": 7186 }, { "epoch": 3.251300610721556, "grad_norm": 0.2876068353652954, "learning_rate": 7.883583800890077e-06, "loss": 0.5674, "step": 7187 }, { "epoch": 3.251752997059489, "grad_norm": 0.3041163384914398, "learning_rate": 7.882985154027844e-06, "loss": 0.6633, "step": 7188 }, { "epoch": 3.2522053833974214, "grad_norm": 0.27684563398361206, "learning_rate": 7.882386445248004e-06, "loss": 0.6366, "step": 7189 }, { "epoch": 3.252657769735354, "grad_norm": 0.32287663221359253, "learning_rate": 7.881787674563423e-06, "loss": 0.7601, "step": 7190 }, { "epoch": 3.2531101560732867, "grad_norm": 0.32892367243766785, "learning_rate": 7.881188841986957e-06, "loss": 0.6174, "step": 7191 }, { "epoch": 3.253562542411219, "grad_norm": 0.3214976191520691, "learning_rate": 7.880589947531467e-06, "loss": 0.6095, "step": 7192 }, { "epoch": 3.254014928749152, "grad_norm": 0.3052436411380768, "learning_rate": 7.879990991209813e-06, "loss": 0.4945, "step": 7193 }, { "epoch": 3.2544673150870844, "grad_norm": 0.3317328989505768, "learning_rate": 7.879391973034865e-06, "loss": 0.6058, "step": 7194 }, { "epoch": 3.254919701425017, "grad_norm": 0.3458389937877655, "learning_rate": 7.878792893019482e-06, "loss": 0.5109, "step": 7195 }, { "epoch": 3.2553720877629497, "grad_norm": 0.3456990718841553, "learning_rate": 7.878193751176535e-06, "loss": 0.5822, "step": 7196 }, { "epoch": 3.255824474100882, "grad_norm": 0.3691173195838928, "learning_rate": 7.877594547518888e-06, "loss": 0.6913, "step": 7197 }, { "epoch": 3.2562768604388146, "grad_norm": 0.36643320322036743, "learning_rate": 7.876995282059412e-06, "loss": 0.6281, "step": 7198 }, { "epoch": 3.2567292467767475, "grad_norm": 0.43074408173561096, "learning_rate": 7.876395954810977e-06, "loss": 0.5543, "step": 7199 }, { "epoch": 3.25718163311468, "grad_norm": 0.3590477406978607, "learning_rate": 7.875796565786456e-06, "loss": 0.5993, "step": 7200 }, { "epoch": 3.25718163311468, "eval_loss": 0.592366099357605, "eval_runtime": 28.4625, "eval_samples_per_second": 26.14, "eval_steps_per_second": 6.535, "step": 7200 }, { "epoch": 3.2576340194526123, "grad_norm": 0.3628638982772827, "learning_rate": 7.87519711499872e-06, "loss": 0.5278, "step": 7201 }, { "epoch": 3.258086405790545, "grad_norm": 0.3636283278465271, "learning_rate": 7.874597602460643e-06, "loss": 0.5978, "step": 7202 }, { "epoch": 3.2585387921284776, "grad_norm": 0.3636658787727356, "learning_rate": 7.873998028185103e-06, "loss": 0.5741, "step": 7203 }, { "epoch": 3.2589911784664105, "grad_norm": 0.33867791295051575, "learning_rate": 7.873398392184975e-06, "loss": 0.5488, "step": 7204 }, { "epoch": 3.259443564804343, "grad_norm": 0.3434220850467682, "learning_rate": 7.872798694473138e-06, "loss": 0.5039, "step": 7205 }, { "epoch": 3.2598959511422754, "grad_norm": 0.37071093916893005, "learning_rate": 7.872198935062471e-06, "loss": 0.546, "step": 7206 }, { "epoch": 3.2603483374802082, "grad_norm": 0.3743361830711365, "learning_rate": 7.871599113965857e-06, "loss": 0.5194, "step": 7207 }, { "epoch": 3.2608007238181407, "grad_norm": 0.40214601159095764, "learning_rate": 7.870999231196176e-06, "loss": 0.5728, "step": 7208 }, { "epoch": 3.261253110156073, "grad_norm": 0.36718660593032837, "learning_rate": 7.870399286766315e-06, "loss": 0.5392, "step": 7209 }, { "epoch": 3.261705496494006, "grad_norm": 0.3746246099472046, "learning_rate": 7.869799280689154e-06, "loss": 0.5564, "step": 7210 }, { "epoch": 3.2621578828319384, "grad_norm": 0.36062175035476685, "learning_rate": 7.869199212977583e-06, "loss": 0.458, "step": 7211 }, { "epoch": 3.262610269169871, "grad_norm": 0.34163063764572144, "learning_rate": 7.868599083644488e-06, "loss": 0.4234, "step": 7212 }, { "epoch": 3.2630626555078037, "grad_norm": 0.37995070219039917, "learning_rate": 7.86799889270276e-06, "loss": 0.5445, "step": 7213 }, { "epoch": 3.263515041845736, "grad_norm": 0.3409864604473114, "learning_rate": 7.867398640165285e-06, "loss": 0.4896, "step": 7214 }, { "epoch": 3.263967428183669, "grad_norm": 0.3824792802333832, "learning_rate": 7.866798326044957e-06, "loss": 0.5476, "step": 7215 }, { "epoch": 3.2644198145216015, "grad_norm": 0.40190285444259644, "learning_rate": 7.866197950354673e-06, "loss": 0.4663, "step": 7216 }, { "epoch": 3.264872200859534, "grad_norm": 0.4995054304599762, "learning_rate": 7.86559751310732e-06, "loss": 0.6356, "step": 7217 }, { "epoch": 3.2653245871974668, "grad_norm": 0.40711018443107605, "learning_rate": 7.864997014315797e-06, "loss": 0.5007, "step": 7218 }, { "epoch": 3.265776973535399, "grad_norm": 0.3956906497478485, "learning_rate": 7.864396453993001e-06, "loss": 0.5373, "step": 7219 }, { "epoch": 3.266229359873332, "grad_norm": 0.3735887408256531, "learning_rate": 7.86379583215183e-06, "loss": 0.4608, "step": 7220 }, { "epoch": 3.2666817462112645, "grad_norm": 0.3960055410861969, "learning_rate": 7.863195148805184e-06, "loss": 0.4479, "step": 7221 }, { "epoch": 3.267134132549197, "grad_norm": 0.44478243589401245, "learning_rate": 7.862594403965962e-06, "loss": 0.5076, "step": 7222 }, { "epoch": 3.2675865188871294, "grad_norm": 0.38092535734176636, "learning_rate": 7.861993597647069e-06, "loss": 0.3914, "step": 7223 }, { "epoch": 3.2680389052250622, "grad_norm": 0.4856853783130646, "learning_rate": 7.861392729861406e-06, "loss": 0.6528, "step": 7224 }, { "epoch": 3.2684912915629947, "grad_norm": 0.38503655791282654, "learning_rate": 7.860791800621877e-06, "loss": 0.3947, "step": 7225 }, { "epoch": 3.2689436779009275, "grad_norm": 0.3584558665752411, "learning_rate": 7.860190809941392e-06, "loss": 0.374, "step": 7226 }, { "epoch": 3.26939606423886, "grad_norm": 0.4933010935783386, "learning_rate": 7.859589757832855e-06, "loss": 0.5951, "step": 7227 }, { "epoch": 3.2698484505767924, "grad_norm": 0.5180219411849976, "learning_rate": 7.858988644309175e-06, "loss": 0.5249, "step": 7228 }, { "epoch": 3.2703008369147253, "grad_norm": 0.4870653748512268, "learning_rate": 7.858387469383264e-06, "loss": 0.5251, "step": 7229 }, { "epoch": 3.2707532232526577, "grad_norm": 0.5028033256530762, "learning_rate": 7.857786233068032e-06, "loss": 0.4593, "step": 7230 }, { "epoch": 3.2712056095905906, "grad_norm": 0.5528517961502075, "learning_rate": 7.857184935376393e-06, "loss": 0.5221, "step": 7231 }, { "epoch": 3.271657995928523, "grad_norm": 0.4459100067615509, "learning_rate": 7.856583576321257e-06, "loss": 0.9672, "step": 7232 }, { "epoch": 3.2721103822664555, "grad_norm": 0.15741173923015594, "learning_rate": 7.855982155915545e-06, "loss": 0.6686, "step": 7233 }, { "epoch": 3.2725627686043883, "grad_norm": 0.2439800500869751, "learning_rate": 7.85538067417217e-06, "loss": 0.7754, "step": 7234 }, { "epoch": 3.2730151549423208, "grad_norm": 0.29884833097457886, "learning_rate": 7.854779131104052e-06, "loss": 0.7012, "step": 7235 }, { "epoch": 3.273467541280253, "grad_norm": 0.3551517724990845, "learning_rate": 7.854177526724107e-06, "loss": 0.6831, "step": 7236 }, { "epoch": 3.273919927618186, "grad_norm": 0.3079351782798767, "learning_rate": 7.85357586104526e-06, "loss": 0.6473, "step": 7237 }, { "epoch": 3.2743723139561185, "grad_norm": 0.2746163606643677, "learning_rate": 7.85297413408043e-06, "loss": 0.5203, "step": 7238 }, { "epoch": 3.274824700294051, "grad_norm": 0.3286083936691284, "learning_rate": 7.85237234584254e-06, "loss": 0.6776, "step": 7239 }, { "epoch": 3.275277086631984, "grad_norm": 0.2988455891609192, "learning_rate": 7.851770496344517e-06, "loss": 0.6892, "step": 7240 }, { "epoch": 3.2757294729699162, "grad_norm": 0.3257042169570923, "learning_rate": 7.851168585599285e-06, "loss": 0.4598, "step": 7241 }, { "epoch": 3.276181859307849, "grad_norm": 0.2994218170642853, "learning_rate": 7.85056661361977e-06, "loss": 0.6272, "step": 7242 }, { "epoch": 3.2766342456457815, "grad_norm": 0.27514493465423584, "learning_rate": 7.849964580418903e-06, "loss": 0.463, "step": 7243 }, { "epoch": 3.277086631983714, "grad_norm": 0.3153175413608551, "learning_rate": 7.849362486009613e-06, "loss": 0.4932, "step": 7244 }, { "epoch": 3.277539018321647, "grad_norm": 0.3121564984321594, "learning_rate": 7.848760330404833e-06, "loss": 0.5759, "step": 7245 }, { "epoch": 3.2779914046595793, "grad_norm": 0.30584439635276794, "learning_rate": 7.84815811361749e-06, "loss": 0.5957, "step": 7246 }, { "epoch": 3.2784437909975117, "grad_norm": 0.3281686305999756, "learning_rate": 7.847555835660523e-06, "loss": 0.5918, "step": 7247 }, { "epoch": 3.2788961773354446, "grad_norm": 0.3723118007183075, "learning_rate": 7.846953496546866e-06, "loss": 0.683, "step": 7248 }, { "epoch": 3.279348563673377, "grad_norm": 0.3369726538658142, "learning_rate": 7.846351096289453e-06, "loss": 0.5865, "step": 7249 }, { "epoch": 3.2798009500113094, "grad_norm": 0.3214588463306427, "learning_rate": 7.845748634901223e-06, "loss": 0.3902, "step": 7250 }, { "epoch": 3.2802533363492423, "grad_norm": 0.3913593292236328, "learning_rate": 7.845146112395117e-06, "loss": 0.5938, "step": 7251 }, { "epoch": 3.2807057226871748, "grad_norm": 0.35091108083724976, "learning_rate": 7.844543528784072e-06, "loss": 0.518, "step": 7252 }, { "epoch": 3.2811581090251076, "grad_norm": 0.3787446916103363, "learning_rate": 7.843940884081032e-06, "loss": 0.6096, "step": 7253 }, { "epoch": 3.28161049536304, "grad_norm": 0.4102432429790497, "learning_rate": 7.84333817829894e-06, "loss": 0.671, "step": 7254 }, { "epoch": 3.2820628817009725, "grad_norm": 0.3882600963115692, "learning_rate": 7.842735411450738e-06, "loss": 0.5802, "step": 7255 }, { "epoch": 3.2825152680389054, "grad_norm": 0.43181461095809937, "learning_rate": 7.842132583549375e-06, "loss": 0.6446, "step": 7256 }, { "epoch": 3.282967654376838, "grad_norm": 0.3816452622413635, "learning_rate": 7.841529694607795e-06, "loss": 0.606, "step": 7257 }, { "epoch": 3.2834200407147702, "grad_norm": 0.3963305950164795, "learning_rate": 7.840926744638945e-06, "loss": 0.601, "step": 7258 }, { "epoch": 3.283872427052703, "grad_norm": 0.3627757132053375, "learning_rate": 7.84032373365578e-06, "loss": 0.6063, "step": 7259 }, { "epoch": 3.2843248133906355, "grad_norm": 0.4028054177761078, "learning_rate": 7.839720661671246e-06, "loss": 0.619, "step": 7260 }, { "epoch": 3.284777199728568, "grad_norm": 0.3860757648944855, "learning_rate": 7.839117528698295e-06, "loss": 0.5287, "step": 7261 }, { "epoch": 3.285229586066501, "grad_norm": 0.42140161991119385, "learning_rate": 7.838514334749885e-06, "loss": 0.5719, "step": 7262 }, { "epoch": 3.2856819724044333, "grad_norm": 0.38060441613197327, "learning_rate": 7.837911079838966e-06, "loss": 0.5614, "step": 7263 }, { "epoch": 3.286134358742366, "grad_norm": 0.39942631125450134, "learning_rate": 7.837307763978495e-06, "loss": 0.4939, "step": 7264 }, { "epoch": 3.2865867450802986, "grad_norm": 0.41307884454727173, "learning_rate": 7.83670438718143e-06, "loss": 0.5266, "step": 7265 }, { "epoch": 3.287039131418231, "grad_norm": 0.4317207932472229, "learning_rate": 7.836100949460733e-06, "loss": 0.5468, "step": 7266 }, { "epoch": 3.287491517756164, "grad_norm": 0.406695693731308, "learning_rate": 7.835497450829358e-06, "loss": 0.5271, "step": 7267 }, { "epoch": 3.2879439040940963, "grad_norm": 0.43726497888565063, "learning_rate": 7.83489389130027e-06, "loss": 0.6003, "step": 7268 }, { "epoch": 3.288396290432029, "grad_norm": 0.366828590631485, "learning_rate": 7.83429027088643e-06, "loss": 0.4896, "step": 7269 }, { "epoch": 3.2888486767699616, "grad_norm": 0.4140554368495941, "learning_rate": 7.833686589600803e-06, "loss": 0.4894, "step": 7270 }, { "epoch": 3.289301063107894, "grad_norm": 0.48474374413490295, "learning_rate": 7.833082847456352e-06, "loss": 0.6196, "step": 7271 }, { "epoch": 3.289753449445827, "grad_norm": 0.4138162434101105, "learning_rate": 7.832479044466047e-06, "loss": 0.4848, "step": 7272 }, { "epoch": 3.2902058357837594, "grad_norm": 0.4439361095428467, "learning_rate": 7.831875180642854e-06, "loss": 0.5334, "step": 7273 }, { "epoch": 3.290658222121692, "grad_norm": 0.46134546399116516, "learning_rate": 7.831271255999743e-06, "loss": 0.5465, "step": 7274 }, { "epoch": 3.2911106084596247, "grad_norm": 0.42431163787841797, "learning_rate": 7.830667270549683e-06, "loss": 0.5103, "step": 7275 }, { "epoch": 3.291562994797557, "grad_norm": 0.46521443128585815, "learning_rate": 7.830063224305646e-06, "loss": 0.4833, "step": 7276 }, { "epoch": 3.2920153811354895, "grad_norm": 0.4350901246070862, "learning_rate": 7.829459117280605e-06, "loss": 0.5291, "step": 7277 }, { "epoch": 3.2924677674734224, "grad_norm": 0.5400170087814331, "learning_rate": 7.828854949487535e-06, "loss": 0.5286, "step": 7278 }, { "epoch": 3.292920153811355, "grad_norm": 0.4895123243331909, "learning_rate": 7.828250720939412e-06, "loss": 0.4818, "step": 7279 }, { "epoch": 3.2933725401492877, "grad_norm": 0.4863109290599823, "learning_rate": 7.827646431649213e-06, "loss": 0.5038, "step": 7280 }, { "epoch": 3.29382492648722, "grad_norm": 0.5571334958076477, "learning_rate": 7.827042081629915e-06, "loss": 0.4818, "step": 7281 }, { "epoch": 3.2942773128251526, "grad_norm": 0.479444295167923, "learning_rate": 7.826437670894497e-06, "loss": 1.0436, "step": 7282 }, { "epoch": 3.2947296991630854, "grad_norm": 0.21356967091560364, "learning_rate": 7.825833199455943e-06, "loss": 0.7188, "step": 7283 }, { "epoch": 3.295182085501018, "grad_norm": 0.24996916949748993, "learning_rate": 7.825228667327233e-06, "loss": 0.7455, "step": 7284 }, { "epoch": 3.2956344718389503, "grad_norm": 0.3454699218273163, "learning_rate": 7.824624074521349e-06, "loss": 0.6798, "step": 7285 }, { "epoch": 3.296086858176883, "grad_norm": 0.2448417842388153, "learning_rate": 7.82401942105128e-06, "loss": 0.5569, "step": 7286 }, { "epoch": 3.2965392445148156, "grad_norm": 0.3272230923175812, "learning_rate": 7.823414706930007e-06, "loss": 0.7136, "step": 7287 }, { "epoch": 3.296991630852748, "grad_norm": 0.3133159279823303, "learning_rate": 7.822809932170521e-06, "loss": 0.579, "step": 7288 }, { "epoch": 3.297444017190681, "grad_norm": 0.27693814039230347, "learning_rate": 7.822205096785811e-06, "loss": 0.5353, "step": 7289 }, { "epoch": 3.2978964035286134, "grad_norm": 0.3200450837612152, "learning_rate": 7.821600200788865e-06, "loss": 0.5878, "step": 7290 }, { "epoch": 3.2983487898665462, "grad_norm": 0.34997785091400146, "learning_rate": 7.820995244192673e-06, "loss": 0.7017, "step": 7291 }, { "epoch": 3.2988011762044787, "grad_norm": 0.36704879999160767, "learning_rate": 7.820390227010232e-06, "loss": 0.7334, "step": 7292 }, { "epoch": 3.299253562542411, "grad_norm": 0.3157632648944855, "learning_rate": 7.819785149254534e-06, "loss": 0.5437, "step": 7293 }, { "epoch": 3.299705948880344, "grad_norm": 0.31977584958076477, "learning_rate": 7.819180010938572e-06, "loss": 0.6191, "step": 7294 }, { "epoch": 3.3001583352182764, "grad_norm": 0.35376793146133423, "learning_rate": 7.818574812075344e-06, "loss": 0.528, "step": 7295 }, { "epoch": 3.300610721556209, "grad_norm": 0.35863393545150757, "learning_rate": 7.817969552677848e-06, "loss": 0.5348, "step": 7296 }, { "epoch": 3.3010631078941417, "grad_norm": 0.34212154150009155, "learning_rate": 7.817364232759084e-06, "loss": 0.4919, "step": 7297 }, { "epoch": 3.301515494232074, "grad_norm": 0.3588115870952606, "learning_rate": 7.816758852332051e-06, "loss": 0.5628, "step": 7298 }, { "epoch": 3.3019678805700066, "grad_norm": 0.4095693528652191, "learning_rate": 7.816153411409753e-06, "loss": 0.6124, "step": 7299 }, { "epoch": 3.3024202669079394, "grad_norm": 0.3912900388240814, "learning_rate": 7.815547910005188e-06, "loss": 0.6174, "step": 7300 }, { "epoch": 3.302872653245872, "grad_norm": 0.4085979759693146, "learning_rate": 7.814942348131367e-06, "loss": 0.6612, "step": 7301 }, { "epoch": 3.3033250395838047, "grad_norm": 0.39512622356414795, "learning_rate": 7.81433672580129e-06, "loss": 0.5782, "step": 7302 }, { "epoch": 3.303777425921737, "grad_norm": 0.3742673695087433, "learning_rate": 7.813731043027965e-06, "loss": 0.5245, "step": 7303 }, { "epoch": 3.3042298122596696, "grad_norm": 0.34361082315444946, "learning_rate": 7.813125299824403e-06, "loss": 0.4736, "step": 7304 }, { "epoch": 3.3046821985976025, "grad_norm": 0.3850691616535187, "learning_rate": 7.81251949620361e-06, "loss": 0.5966, "step": 7305 }, { "epoch": 3.305134584935535, "grad_norm": 0.35997164249420166, "learning_rate": 7.811913632178598e-06, "loss": 0.5357, "step": 7306 }, { "epoch": 3.305586971273468, "grad_norm": 0.3912207782268524, "learning_rate": 7.811307707762382e-06, "loss": 0.613, "step": 7307 }, { "epoch": 3.3060393576114, "grad_norm": 0.3957541882991791, "learning_rate": 7.810701722967971e-06, "loss": 0.5825, "step": 7308 }, { "epoch": 3.3064917439493327, "grad_norm": 0.3544884920120239, "learning_rate": 7.810095677808383e-06, "loss": 0.5503, "step": 7309 }, { "epoch": 3.306944130287265, "grad_norm": 0.39831286668777466, "learning_rate": 7.809489572296632e-06, "loss": 0.4835, "step": 7310 }, { "epoch": 3.307396516625198, "grad_norm": 0.3925761282444, "learning_rate": 7.808883406445735e-06, "loss": 0.5227, "step": 7311 }, { "epoch": 3.3078489029631304, "grad_norm": 0.39174601435661316, "learning_rate": 7.808277180268713e-06, "loss": 0.5326, "step": 7312 }, { "epoch": 3.3083012893010633, "grad_norm": 0.3789520561695099, "learning_rate": 7.807670893778583e-06, "loss": 0.5293, "step": 7313 }, { "epoch": 3.3087536756389957, "grad_norm": 0.3989674150943756, "learning_rate": 7.807064546988368e-06, "loss": 0.4967, "step": 7314 }, { "epoch": 3.309206061976928, "grad_norm": 0.4058832824230194, "learning_rate": 7.80645813991109e-06, "loss": 0.5266, "step": 7315 }, { "epoch": 3.309658448314861, "grad_norm": 0.39998137950897217, "learning_rate": 7.805851672559773e-06, "loss": 0.4711, "step": 7316 }, { "epoch": 3.3101108346527934, "grad_norm": 0.45871424674987793, "learning_rate": 7.80524514494744e-06, "loss": 0.5581, "step": 7317 }, { "epoch": 3.3105632209907263, "grad_norm": 0.4315376579761505, "learning_rate": 7.80463855708712e-06, "loss": 0.4816, "step": 7318 }, { "epoch": 3.3110156073286587, "grad_norm": 0.48722684383392334, "learning_rate": 7.804031908991839e-06, "loss": 0.5206, "step": 7319 }, { "epoch": 3.311467993666591, "grad_norm": 0.4781135618686676, "learning_rate": 7.803425200674627e-06, "loss": 0.6152, "step": 7320 }, { "epoch": 3.311920380004524, "grad_norm": 0.4511440396308899, "learning_rate": 7.802818432148514e-06, "loss": 0.5816, "step": 7321 }, { "epoch": 3.3123727663424565, "grad_norm": 0.4721631705760956, "learning_rate": 7.80221160342653e-06, "loss": 0.5418, "step": 7322 }, { "epoch": 3.312825152680389, "grad_norm": 0.4606338143348694, "learning_rate": 7.801604714521711e-06, "loss": 0.4692, "step": 7323 }, { "epoch": 3.313277539018322, "grad_norm": 0.4636628031730652, "learning_rate": 7.80099776544709e-06, "loss": 0.5659, "step": 7324 }, { "epoch": 3.313729925356254, "grad_norm": 0.45776697993278503, "learning_rate": 7.8003907562157e-06, "loss": 0.5037, "step": 7325 }, { "epoch": 3.3141823116941866, "grad_norm": 0.4523719549179077, "learning_rate": 7.799783686840577e-06, "loss": 0.4045, "step": 7326 }, { "epoch": 3.3146346980321195, "grad_norm": 0.5249732136726379, "learning_rate": 7.799176557334765e-06, "loss": 0.5764, "step": 7327 }, { "epoch": 3.315087084370052, "grad_norm": 0.5075933337211609, "learning_rate": 7.798569367711297e-06, "loss": 0.4784, "step": 7328 }, { "epoch": 3.315539470707985, "grad_norm": 0.5482316613197327, "learning_rate": 7.797962117983217e-06, "loss": 0.5277, "step": 7329 }, { "epoch": 3.3159918570459173, "grad_norm": 0.6540105938911438, "learning_rate": 7.797354808163566e-06, "loss": 0.6332, "step": 7330 }, { "epoch": 3.3164442433838497, "grad_norm": 0.5369443297386169, "learning_rate": 7.796747438265387e-06, "loss": 0.4343, "step": 7331 }, { "epoch": 3.3168966297217826, "grad_norm": 0.4338793456554413, "learning_rate": 7.796140008301723e-06, "loss": 1.1579, "step": 7332 }, { "epoch": 3.317349016059715, "grad_norm": 0.1975252628326416, "learning_rate": 7.795532518285623e-06, "loss": 0.5352, "step": 7333 }, { "epoch": 3.3178014023976474, "grad_norm": 0.2705066502094269, "learning_rate": 7.794924968230133e-06, "loss": 0.5369, "step": 7334 }, { "epoch": 3.3182537887355803, "grad_norm": 0.3047500252723694, "learning_rate": 7.794317358148297e-06, "loss": 0.6383, "step": 7335 }, { "epoch": 3.3187061750735127, "grad_norm": 0.3045019805431366, "learning_rate": 7.793709688053172e-06, "loss": 0.5832, "step": 7336 }, { "epoch": 3.319158561411445, "grad_norm": 0.30795931816101074, "learning_rate": 7.793101957957803e-06, "loss": 0.5836, "step": 7337 }, { "epoch": 3.319610947749378, "grad_norm": 0.32961326837539673, "learning_rate": 7.792494167875244e-06, "loss": 0.6497, "step": 7338 }, { "epoch": 3.3200633340873105, "grad_norm": 0.3380035161972046, "learning_rate": 7.791886317818548e-06, "loss": 0.6698, "step": 7339 }, { "epoch": 3.3205157204252433, "grad_norm": 0.30816128849983215, "learning_rate": 7.791278407800771e-06, "loss": 0.5669, "step": 7340 }, { "epoch": 3.3209681067631758, "grad_norm": 0.32133668661117554, "learning_rate": 7.790670437834969e-06, "loss": 0.5877, "step": 7341 }, { "epoch": 3.321420493101108, "grad_norm": 0.30544498562812805, "learning_rate": 7.790062407934198e-06, "loss": 0.6159, "step": 7342 }, { "epoch": 3.321872879439041, "grad_norm": 0.3336131274700165, "learning_rate": 7.789454318111517e-06, "loss": 0.685, "step": 7343 }, { "epoch": 3.3223252657769735, "grad_norm": 0.3559618592262268, "learning_rate": 7.788846168379987e-06, "loss": 0.5598, "step": 7344 }, { "epoch": 3.322777652114906, "grad_norm": 0.34893789887428284, "learning_rate": 7.788237958752667e-06, "loss": 0.6668, "step": 7345 }, { "epoch": 3.323230038452839, "grad_norm": 0.3505030870437622, "learning_rate": 7.787629689242621e-06, "loss": 0.5472, "step": 7346 }, { "epoch": 3.3236824247907713, "grad_norm": 0.3455961346626282, "learning_rate": 7.787021359862913e-06, "loss": 0.5628, "step": 7347 }, { "epoch": 3.3241348111287037, "grad_norm": 0.3294720947742462, "learning_rate": 7.786412970626606e-06, "loss": 0.5053, "step": 7348 }, { "epoch": 3.3245871974666366, "grad_norm": 0.3519960939884186, "learning_rate": 7.78580452154677e-06, "loss": 0.4865, "step": 7349 }, { "epoch": 3.325039583804569, "grad_norm": 0.3409973084926605, "learning_rate": 7.785196012636468e-06, "loss": 0.3643, "step": 7350 }, { "epoch": 3.325491970142502, "grad_norm": 0.3486344516277313, "learning_rate": 7.784587443908774e-06, "loss": 0.5452, "step": 7351 }, { "epoch": 3.3259443564804343, "grad_norm": 0.37178173661231995, "learning_rate": 7.783978815376756e-06, "loss": 0.5783, "step": 7352 }, { "epoch": 3.3263967428183667, "grad_norm": 0.37824228405952454, "learning_rate": 7.783370127053482e-06, "loss": 0.5673, "step": 7353 }, { "epoch": 3.3268491291562996, "grad_norm": 0.36828315258026123, "learning_rate": 7.78276137895203e-06, "loss": 0.5796, "step": 7354 }, { "epoch": 3.327301515494232, "grad_norm": 0.33946192264556885, "learning_rate": 7.782152571085471e-06, "loss": 0.3902, "step": 7355 }, { "epoch": 3.327753901832165, "grad_norm": 0.35613682866096497, "learning_rate": 7.781543703466881e-06, "loss": 0.4704, "step": 7356 }, { "epoch": 3.3282062881700973, "grad_norm": 0.4181819260120392, "learning_rate": 7.780934776109337e-06, "loss": 0.5896, "step": 7357 }, { "epoch": 3.3286586745080298, "grad_norm": 0.4233212471008301, "learning_rate": 7.780325789025916e-06, "loss": 0.5213, "step": 7358 }, { "epoch": 3.3291110608459626, "grad_norm": 0.39034730195999146, "learning_rate": 7.779716742229698e-06, "loss": 0.6309, "step": 7359 }, { "epoch": 3.329563447183895, "grad_norm": 0.39525097608566284, "learning_rate": 7.779107635733762e-06, "loss": 0.5872, "step": 7360 }, { "epoch": 3.3300158335218275, "grad_norm": 0.4081462025642395, "learning_rate": 7.778498469551193e-06, "loss": 0.507, "step": 7361 }, { "epoch": 3.3304682198597604, "grad_norm": 0.40427371859550476, "learning_rate": 7.777889243695069e-06, "loss": 0.4959, "step": 7362 }, { "epoch": 3.330920606197693, "grad_norm": 0.4060494601726532, "learning_rate": 7.777279958178478e-06, "loss": 0.488, "step": 7363 }, { "epoch": 3.3313729925356252, "grad_norm": 0.41295716166496277, "learning_rate": 7.776670613014506e-06, "loss": 0.4739, "step": 7364 }, { "epoch": 3.331825378873558, "grad_norm": 0.42345407605171204, "learning_rate": 7.77606120821624e-06, "loss": 0.5151, "step": 7365 }, { "epoch": 3.3322777652114906, "grad_norm": 0.3948121666908264, "learning_rate": 7.775451743796763e-06, "loss": 0.52, "step": 7366 }, { "epoch": 3.3327301515494234, "grad_norm": 0.4210667908191681, "learning_rate": 7.77484221976917e-06, "loss": 0.482, "step": 7367 }, { "epoch": 3.333182537887356, "grad_norm": 0.45163169503211975, "learning_rate": 7.77423263614655e-06, "loss": 0.544, "step": 7368 }, { "epoch": 3.3336349242252883, "grad_norm": 0.5180023312568665, "learning_rate": 7.773622992941994e-06, "loss": 0.6089, "step": 7369 }, { "epoch": 3.334087310563221, "grad_norm": 0.4298311471939087, "learning_rate": 7.773013290168596e-06, "loss": 0.5419, "step": 7370 }, { "epoch": 3.3345396969011536, "grad_norm": 0.471540629863739, "learning_rate": 7.77240352783945e-06, "loss": 0.4983, "step": 7371 }, { "epoch": 3.334992083239086, "grad_norm": 0.43345460295677185, "learning_rate": 7.771793705967653e-06, "loss": 0.4872, "step": 7372 }, { "epoch": 3.335444469577019, "grad_norm": 0.4746280610561371, "learning_rate": 7.771183824566303e-06, "loss": 0.5374, "step": 7373 }, { "epoch": 3.3358968559149513, "grad_norm": 0.4997951090335846, "learning_rate": 7.770573883648495e-06, "loss": 0.5983, "step": 7374 }, { "epoch": 3.3363492422528838, "grad_norm": 0.5132347345352173, "learning_rate": 7.769963883227331e-06, "loss": 0.4775, "step": 7375 }, { "epoch": 3.3368016285908166, "grad_norm": 0.4234570264816284, "learning_rate": 7.76935382331591e-06, "loss": 0.3913, "step": 7376 }, { "epoch": 3.337254014928749, "grad_norm": 0.5815085768699646, "learning_rate": 7.768743703927337e-06, "loss": 0.5989, "step": 7377 }, { "epoch": 3.337706401266682, "grad_norm": 0.5293532609939575, "learning_rate": 7.768133525074714e-06, "loss": 0.6242, "step": 7378 }, { "epoch": 3.3381587876046144, "grad_norm": 0.5091747045516968, "learning_rate": 7.767523286771145e-06, "loss": 0.5329, "step": 7379 }, { "epoch": 3.338611173942547, "grad_norm": 0.6029506921768188, "learning_rate": 7.766912989029738e-06, "loss": 0.5301, "step": 7380 }, { "epoch": 3.3390635602804797, "grad_norm": 0.5868857502937317, "learning_rate": 7.7663026318636e-06, "loss": 0.6284, "step": 7381 }, { "epoch": 3.339515946618412, "grad_norm": 0.45688900351524353, "learning_rate": 7.765692215285837e-06, "loss": 0.8484, "step": 7382 }, { "epoch": 3.3399683329563445, "grad_norm": 0.17125873267650604, "learning_rate": 7.76508173930956e-06, "loss": 1.3316, "step": 7383 }, { "epoch": 3.3404207192942774, "grad_norm": 0.19708530604839325, "learning_rate": 7.76447120394788e-06, "loss": 0.8624, "step": 7384 }, { "epoch": 3.34087310563221, "grad_norm": 0.22090460360050201, "learning_rate": 7.763860609213913e-06, "loss": 0.6109, "step": 7385 }, { "epoch": 3.3413254919701423, "grad_norm": 0.2501892149448395, "learning_rate": 7.763249955120768e-06, "loss": 0.618, "step": 7386 }, { "epoch": 3.341777878308075, "grad_norm": 0.29315847158432007, "learning_rate": 7.762639241681562e-06, "loss": 0.5011, "step": 7387 }, { "epoch": 3.3422302646460076, "grad_norm": 0.32744765281677246, "learning_rate": 7.762028468909411e-06, "loss": 0.7064, "step": 7388 }, { "epoch": 3.3426826509839405, "grad_norm": 0.3014523386955261, "learning_rate": 7.761417636817435e-06, "loss": 0.5597, "step": 7389 }, { "epoch": 3.343135037321873, "grad_norm": 0.3307400643825531, "learning_rate": 7.760806745418747e-06, "loss": 0.5224, "step": 7390 }, { "epoch": 3.3435874236598053, "grad_norm": 0.30044883489608765, "learning_rate": 7.760195794726471e-06, "loss": 0.6031, "step": 7391 }, { "epoch": 3.344039809997738, "grad_norm": 0.41117092967033386, "learning_rate": 7.759584784753729e-06, "loss": 0.7684, "step": 7392 }, { "epoch": 3.3444921963356706, "grad_norm": 0.3389795124530792, "learning_rate": 7.758973715513641e-06, "loss": 0.6315, "step": 7393 }, { "epoch": 3.3449445826736035, "grad_norm": 0.346284955739975, "learning_rate": 7.758362587019333e-06, "loss": 0.5403, "step": 7394 }, { "epoch": 3.345396969011536, "grad_norm": 0.350093811750412, "learning_rate": 7.757751399283928e-06, "loss": 0.5773, "step": 7395 }, { "epoch": 3.3458493553494684, "grad_norm": 0.3474940359592438, "learning_rate": 7.757140152320554e-06, "loss": 0.5294, "step": 7396 }, { "epoch": 3.346301741687401, "grad_norm": 0.37667348980903625, "learning_rate": 7.756528846142339e-06, "loss": 0.5697, "step": 7397 }, { "epoch": 3.3467541280253337, "grad_norm": 0.35103386640548706, "learning_rate": 7.755917480762412e-06, "loss": 0.6518, "step": 7398 }, { "epoch": 3.347206514363266, "grad_norm": 0.33676478266716003, "learning_rate": 7.755306056193902e-06, "loss": 0.4972, "step": 7399 }, { "epoch": 3.347658900701199, "grad_norm": 0.3244917392730713, "learning_rate": 7.75469457244994e-06, "loss": 0.5393, "step": 7400 }, { "epoch": 3.347658900701199, "eval_loss": 0.5921218991279602, "eval_runtime": 25.828, "eval_samples_per_second": 28.806, "eval_steps_per_second": 7.201, "step": 7400 }, { "epoch": 3.3481112870391314, "grad_norm": 0.38965046405792236, "learning_rate": 7.754083029543661e-06, "loss": 0.577, "step": 7401 }, { "epoch": 3.348563673377064, "grad_norm": 0.3383985757827759, "learning_rate": 7.7534714274882e-06, "loss": 0.5667, "step": 7402 }, { "epoch": 3.3490160597149967, "grad_norm": 0.40551266074180603, "learning_rate": 7.752859766296688e-06, "loss": 0.6203, "step": 7403 }, { "epoch": 3.349468446052929, "grad_norm": 0.3405687212944031, "learning_rate": 7.752248045982264e-06, "loss": 0.5308, "step": 7404 }, { "epoch": 3.349920832390862, "grad_norm": 0.3189205229282379, "learning_rate": 7.751636266558066e-06, "loss": 0.5609, "step": 7405 }, { "epoch": 3.3503732187287945, "grad_norm": 0.39242491126060486, "learning_rate": 7.751024428037232e-06, "loss": 0.5404, "step": 7406 }, { "epoch": 3.350825605066727, "grad_norm": 0.4135778844356537, "learning_rate": 7.750412530432905e-06, "loss": 0.6553, "step": 7407 }, { "epoch": 3.3512779914046598, "grad_norm": 0.37131592631340027, "learning_rate": 7.749800573758224e-06, "loss": 0.5449, "step": 7408 }, { "epoch": 3.351730377742592, "grad_norm": 0.36161094903945923, "learning_rate": 7.749188558026333e-06, "loss": 0.4762, "step": 7409 }, { "epoch": 3.3521827640805246, "grad_norm": 0.3954141139984131, "learning_rate": 7.748576483250375e-06, "loss": 0.5747, "step": 7410 }, { "epoch": 3.3526351504184575, "grad_norm": 0.4189464747905731, "learning_rate": 7.747964349443497e-06, "loss": 0.6608, "step": 7411 }, { "epoch": 3.35308753675639, "grad_norm": 0.39670807123184204, "learning_rate": 7.747352156618847e-06, "loss": 0.5084, "step": 7412 }, { "epoch": 3.3535399230943224, "grad_norm": 0.3605146110057831, "learning_rate": 7.74673990478957e-06, "loss": 0.47, "step": 7413 }, { "epoch": 3.3539923094322552, "grad_norm": 0.41853970289230347, "learning_rate": 7.746127593968818e-06, "loss": 0.6403, "step": 7414 }, { "epoch": 3.3544446957701877, "grad_norm": 0.3811572790145874, "learning_rate": 7.745515224169738e-06, "loss": 0.448, "step": 7415 }, { "epoch": 3.3548970821081205, "grad_norm": 0.40881386399269104, "learning_rate": 7.744902795405484e-06, "loss": 0.4942, "step": 7416 }, { "epoch": 3.355349468446053, "grad_norm": 0.3776860535144806, "learning_rate": 7.744290307689211e-06, "loss": 0.5306, "step": 7417 }, { "epoch": 3.3558018547839854, "grad_norm": 0.4032444357872009, "learning_rate": 7.74367776103407e-06, "loss": 0.4869, "step": 7418 }, { "epoch": 3.3562542411219183, "grad_norm": 0.4157989025115967, "learning_rate": 7.743065155453217e-06, "loss": 0.5541, "step": 7419 }, { "epoch": 3.3567066274598507, "grad_norm": 0.43974488973617554, "learning_rate": 7.742452490959811e-06, "loss": 0.5637, "step": 7420 }, { "epoch": 3.357159013797783, "grad_norm": 0.4884379804134369, "learning_rate": 7.741839767567009e-06, "loss": 0.5892, "step": 7421 }, { "epoch": 3.357611400135716, "grad_norm": 0.40848875045776367, "learning_rate": 7.741226985287971e-06, "loss": 0.5187, "step": 7422 }, { "epoch": 3.3580637864736484, "grad_norm": 0.519853413105011, "learning_rate": 7.740614144135856e-06, "loss": 0.5568, "step": 7423 }, { "epoch": 3.358516172811581, "grad_norm": 0.43987205624580383, "learning_rate": 7.740001244123826e-06, "loss": 0.4639, "step": 7424 }, { "epoch": 3.3589685591495138, "grad_norm": 0.43293890357017517, "learning_rate": 7.739388285265048e-06, "loss": 0.5156, "step": 7425 }, { "epoch": 3.359420945487446, "grad_norm": 0.4110514223575592, "learning_rate": 7.738775267572681e-06, "loss": 0.4425, "step": 7426 }, { "epoch": 3.359873331825379, "grad_norm": 0.4647606611251831, "learning_rate": 7.738162191059895e-06, "loss": 0.468, "step": 7427 }, { "epoch": 3.3603257181633115, "grad_norm": 0.4986777603626251, "learning_rate": 7.737549055739854e-06, "loss": 0.5363, "step": 7428 }, { "epoch": 3.360778104501244, "grad_norm": 0.49137863516807556, "learning_rate": 7.736935861625729e-06, "loss": 0.4529, "step": 7429 }, { "epoch": 3.361230490839177, "grad_norm": 0.5748875141143799, "learning_rate": 7.736322608730686e-06, "loss": 0.6303, "step": 7430 }, { "epoch": 3.3616828771771092, "grad_norm": 0.6010419130325317, "learning_rate": 7.735709297067901e-06, "loss": 0.5075, "step": 7431 }, { "epoch": 3.3621352635150417, "grad_norm": 0.34303170442581177, "learning_rate": 7.73509592665054e-06, "loss": 0.8761, "step": 7432 }, { "epoch": 3.3625876498529745, "grad_norm": 0.19831664860248566, "learning_rate": 7.73448249749178e-06, "loss": 1.0749, "step": 7433 }, { "epoch": 3.363040036190907, "grad_norm": 0.20542488992214203, "learning_rate": 7.733869009604795e-06, "loss": 0.5597, "step": 7434 }, { "epoch": 3.3634924225288394, "grad_norm": 0.2792004644870758, "learning_rate": 7.733255463002762e-06, "loss": 0.6222, "step": 7435 }, { "epoch": 3.3639448088667723, "grad_norm": 0.26043397188186646, "learning_rate": 7.732641857698856e-06, "loss": 0.629, "step": 7436 }, { "epoch": 3.3643971952047047, "grad_norm": 0.2965107858181, "learning_rate": 7.732028193706255e-06, "loss": 0.5461, "step": 7437 }, { "epoch": 3.3648495815426376, "grad_norm": 0.3207358121871948, "learning_rate": 7.73141447103814e-06, "loss": 0.5369, "step": 7438 }, { "epoch": 3.36530196788057, "grad_norm": 0.3311494290828705, "learning_rate": 7.730800689707691e-06, "loss": 0.6582, "step": 7439 }, { "epoch": 3.3657543542185024, "grad_norm": 0.2892623543739319, "learning_rate": 7.730186849728092e-06, "loss": 0.5524, "step": 7440 }, { "epoch": 3.3662067405564353, "grad_norm": 0.37549343705177307, "learning_rate": 7.729572951112524e-06, "loss": 0.7049, "step": 7441 }, { "epoch": 3.3666591268943677, "grad_norm": 0.29520371556282043, "learning_rate": 7.728958993874171e-06, "loss": 0.4777, "step": 7442 }, { "epoch": 3.3671115132323006, "grad_norm": 0.34810343384742737, "learning_rate": 7.728344978026224e-06, "loss": 0.6047, "step": 7443 }, { "epoch": 3.367563899570233, "grad_norm": 0.32997867465019226, "learning_rate": 7.727730903581866e-06, "loss": 0.59, "step": 7444 }, { "epoch": 3.3680162859081655, "grad_norm": 0.32164376974105835, "learning_rate": 7.727116770554285e-06, "loss": 0.6951, "step": 7445 }, { "epoch": 3.3684686722460984, "grad_norm": 0.3315337002277374, "learning_rate": 7.726502578956669e-06, "loss": 0.5624, "step": 7446 }, { "epoch": 3.368921058584031, "grad_norm": 0.34121763706207275, "learning_rate": 7.725888328802216e-06, "loss": 0.5712, "step": 7447 }, { "epoch": 3.3693734449219632, "grad_norm": 0.3322295546531677, "learning_rate": 7.725274020104113e-06, "loss": 0.5613, "step": 7448 }, { "epoch": 3.369825831259896, "grad_norm": 0.3384768068790436, "learning_rate": 7.724659652875554e-06, "loss": 0.5665, "step": 7449 }, { "epoch": 3.3702782175978285, "grad_norm": 0.36652469635009766, "learning_rate": 7.724045227129732e-06, "loss": 0.6124, "step": 7450 }, { "epoch": 3.370730603935761, "grad_norm": 0.32294997572898865, "learning_rate": 7.723430742879848e-06, "loss": 0.515, "step": 7451 }, { "epoch": 3.371182990273694, "grad_norm": 0.37442514300346375, "learning_rate": 7.722816200139097e-06, "loss": 0.5549, "step": 7452 }, { "epoch": 3.3716353766116263, "grad_norm": 0.3858637809753418, "learning_rate": 7.722201598920673e-06, "loss": 0.5065, "step": 7453 }, { "epoch": 3.372087762949559, "grad_norm": 0.332539826631546, "learning_rate": 7.721586939237783e-06, "loss": 0.4729, "step": 7454 }, { "epoch": 3.3725401492874916, "grad_norm": 0.3986014127731323, "learning_rate": 7.720972221103622e-06, "loss": 0.6507, "step": 7455 }, { "epoch": 3.372992535625424, "grad_norm": 0.38538599014282227, "learning_rate": 7.720357444531397e-06, "loss": 0.6238, "step": 7456 }, { "epoch": 3.373444921963357, "grad_norm": 0.38559669256210327, "learning_rate": 7.719742609534307e-06, "loss": 0.5672, "step": 7457 }, { "epoch": 3.3738973083012893, "grad_norm": 0.3585031032562256, "learning_rate": 7.719127716125561e-06, "loss": 0.4629, "step": 7458 }, { "epoch": 3.3743496946392217, "grad_norm": 0.38418155908584595, "learning_rate": 7.718512764318362e-06, "loss": 0.5159, "step": 7459 }, { "epoch": 3.3748020809771546, "grad_norm": 0.4583515524864197, "learning_rate": 7.717897754125918e-06, "loss": 0.6401, "step": 7460 }, { "epoch": 3.375254467315087, "grad_norm": 0.35242053866386414, "learning_rate": 7.717282685561438e-06, "loss": 0.4208, "step": 7461 }, { "epoch": 3.3757068536530195, "grad_norm": 0.39222022891044617, "learning_rate": 7.71666755863813e-06, "loss": 0.522, "step": 7462 }, { "epoch": 3.3761592399909524, "grad_norm": 0.3895947337150574, "learning_rate": 7.716052373369209e-06, "loss": 0.4658, "step": 7463 }, { "epoch": 3.376611626328885, "grad_norm": 0.42968466877937317, "learning_rate": 7.715437129767884e-06, "loss": 0.5559, "step": 7464 }, { "epoch": 3.3770640126668177, "grad_norm": 0.4311535358428955, "learning_rate": 7.714821827847368e-06, "loss": 0.6086, "step": 7465 }, { "epoch": 3.37751639900475, "grad_norm": 0.4406832158565521, "learning_rate": 7.714206467620879e-06, "loss": 0.5639, "step": 7466 }, { "epoch": 3.3779687853426825, "grad_norm": 0.4158409535884857, "learning_rate": 7.71359104910163e-06, "loss": 0.5574, "step": 7467 }, { "epoch": 3.3784211716806154, "grad_norm": 0.41443026065826416, "learning_rate": 7.71297557230284e-06, "loss": 0.4786, "step": 7468 }, { "epoch": 3.378873558018548, "grad_norm": 0.40602409839630127, "learning_rate": 7.712360037237726e-06, "loss": 0.4534, "step": 7469 }, { "epoch": 3.3793259443564803, "grad_norm": 0.3948574960231781, "learning_rate": 7.711744443919512e-06, "loss": 0.4867, "step": 7470 }, { "epoch": 3.379778330694413, "grad_norm": 0.4697231948375702, "learning_rate": 7.711128792361414e-06, "loss": 0.5486, "step": 7471 }, { "epoch": 3.3802307170323456, "grad_norm": 0.46135610342025757, "learning_rate": 7.710513082576653e-06, "loss": 0.5535, "step": 7472 }, { "epoch": 3.380683103370278, "grad_norm": 0.4511546790599823, "learning_rate": 7.709897314578459e-06, "loss": 0.5477, "step": 7473 }, { "epoch": 3.381135489708211, "grad_norm": 0.38590189814567566, "learning_rate": 7.709281488380053e-06, "loss": 0.4336, "step": 7474 }, { "epoch": 3.3815878760461433, "grad_norm": 0.4494282007217407, "learning_rate": 7.70866560399466e-06, "loss": 0.528, "step": 7475 }, { "epoch": 3.382040262384076, "grad_norm": 0.4631700813770294, "learning_rate": 7.70804966143551e-06, "loss": 0.5488, "step": 7476 }, { "epoch": 3.3824926487220086, "grad_norm": 0.4500862956047058, "learning_rate": 7.70743366071583e-06, "loss": 0.4784, "step": 7477 }, { "epoch": 3.382945035059941, "grad_norm": 0.49583205580711365, "learning_rate": 7.706817601848851e-06, "loss": 0.541, "step": 7478 }, { "epoch": 3.383397421397874, "grad_norm": 0.5369263291358948, "learning_rate": 7.706201484847804e-06, "loss": 0.6082, "step": 7479 }, { "epoch": 3.3838498077358063, "grad_norm": 0.5480797290802002, "learning_rate": 7.705585309725919e-06, "loss": 0.6239, "step": 7480 }, { "epoch": 3.3843021940737392, "grad_norm": 0.5330764651298523, "learning_rate": 7.70496907649643e-06, "loss": 0.4814, "step": 7481 }, { "epoch": 3.3847545804116717, "grad_norm": 0.43048566579818726, "learning_rate": 7.704352785172575e-06, "loss": 1.2616, "step": 7482 }, { "epoch": 3.385206966749604, "grad_norm": 0.2165258377790451, "learning_rate": 7.703736435767586e-06, "loss": 1.3087, "step": 7483 }, { "epoch": 3.3856593530875365, "grad_norm": 0.2250078022480011, "learning_rate": 7.703120028294702e-06, "loss": 0.6737, "step": 7484 }, { "epoch": 3.3861117394254694, "grad_norm": 0.28054067492485046, "learning_rate": 7.702503562767162e-06, "loss": 0.7556, "step": 7485 }, { "epoch": 3.386564125763402, "grad_norm": 0.27210813760757446, "learning_rate": 7.701887039198206e-06, "loss": 0.5752, "step": 7486 }, { "epoch": 3.3870165121013347, "grad_norm": 0.2597596347332001, "learning_rate": 7.701270457601073e-06, "loss": 0.5646, "step": 7487 }, { "epoch": 3.387468898439267, "grad_norm": 0.2837333381175995, "learning_rate": 7.700653817989008e-06, "loss": 0.6269, "step": 7488 }, { "epoch": 3.3879212847771996, "grad_norm": 0.33012712001800537, "learning_rate": 7.700037120375252e-06, "loss": 0.4495, "step": 7489 }, { "epoch": 3.3883736711151324, "grad_norm": 0.3380446434020996, "learning_rate": 7.699420364773052e-06, "loss": 0.5781, "step": 7490 }, { "epoch": 3.388826057453065, "grad_norm": 0.3204764723777771, "learning_rate": 7.698803551195653e-06, "loss": 0.5296, "step": 7491 }, { "epoch": 3.3892784437909977, "grad_norm": 0.30633002519607544, "learning_rate": 7.698186679656301e-06, "loss": 0.5885, "step": 7492 }, { "epoch": 3.38973083012893, "grad_norm": 0.3451721668243408, "learning_rate": 7.697569750168247e-06, "loss": 0.5777, "step": 7493 }, { "epoch": 3.3901832164668626, "grad_norm": 0.3278988301753998, "learning_rate": 7.696952762744739e-06, "loss": 0.5057, "step": 7494 }, { "epoch": 3.3906356028047955, "grad_norm": 0.331032931804657, "learning_rate": 7.696335717399029e-06, "loss": 0.5387, "step": 7495 }, { "epoch": 3.391087989142728, "grad_norm": 0.30699196457862854, "learning_rate": 7.695718614144366e-06, "loss": 0.496, "step": 7496 }, { "epoch": 3.3915403754806603, "grad_norm": 0.3410186767578125, "learning_rate": 7.695101452994008e-06, "loss": 0.5961, "step": 7497 }, { "epoch": 3.391992761818593, "grad_norm": 0.341678261756897, "learning_rate": 7.694484233961208e-06, "loss": 0.5716, "step": 7498 }, { "epoch": 3.3924451481565256, "grad_norm": 0.35623013973236084, "learning_rate": 7.693866957059222e-06, "loss": 0.5954, "step": 7499 }, { "epoch": 3.392897534494458, "grad_norm": 0.34171125292778015, "learning_rate": 7.693249622301305e-06, "loss": 0.5076, "step": 7500 }, { "epoch": 3.393349920832391, "grad_norm": 0.4247296154499054, "learning_rate": 7.692632229700718e-06, "loss": 0.5424, "step": 7501 }, { "epoch": 3.3938023071703234, "grad_norm": 0.385483980178833, "learning_rate": 7.692014779270722e-06, "loss": 0.6455, "step": 7502 }, { "epoch": 3.3942546935082563, "grad_norm": 0.3865795135498047, "learning_rate": 7.691397271024574e-06, "loss": 0.604, "step": 7503 }, { "epoch": 3.3947070798461887, "grad_norm": 0.38720595836639404, "learning_rate": 7.690779704975539e-06, "loss": 0.6458, "step": 7504 }, { "epoch": 3.395159466184121, "grad_norm": 0.3484162390232086, "learning_rate": 7.69016208113688e-06, "loss": 0.4121, "step": 7505 }, { "epoch": 3.395611852522054, "grad_norm": 0.36973583698272705, "learning_rate": 7.68954439952186e-06, "loss": 0.5681, "step": 7506 }, { "epoch": 3.3960642388599864, "grad_norm": 0.384880930185318, "learning_rate": 7.688926660143746e-06, "loss": 0.5732, "step": 7507 }, { "epoch": 3.396516625197919, "grad_norm": 0.3466717600822449, "learning_rate": 7.688308863015808e-06, "loss": 0.4894, "step": 7508 }, { "epoch": 3.3969690115358517, "grad_norm": 0.38361966609954834, "learning_rate": 7.687691008151308e-06, "loss": 0.5359, "step": 7509 }, { "epoch": 3.397421397873784, "grad_norm": 0.3900966942310333, "learning_rate": 7.687073095563522e-06, "loss": 0.6029, "step": 7510 }, { "epoch": 3.3978737842117166, "grad_norm": 0.36461424827575684, "learning_rate": 7.686455125265716e-06, "loss": 0.5446, "step": 7511 }, { "epoch": 3.3983261705496495, "grad_norm": 0.37303656339645386, "learning_rate": 7.685837097271164e-06, "loss": 0.4638, "step": 7512 }, { "epoch": 3.398778556887582, "grad_norm": 0.47832363843917847, "learning_rate": 7.685219011593141e-06, "loss": 0.5263, "step": 7513 }, { "epoch": 3.399230943225515, "grad_norm": 0.39439424872398376, "learning_rate": 7.68460086824492e-06, "loss": 0.5648, "step": 7514 }, { "epoch": 3.399683329563447, "grad_norm": 0.38609543442726135, "learning_rate": 7.683982667239776e-06, "loss": 0.4836, "step": 7515 }, { "epoch": 3.4001357159013796, "grad_norm": 0.4419952630996704, "learning_rate": 7.683364408590987e-06, "loss": 0.5435, "step": 7516 }, { "epoch": 3.4005881022393125, "grad_norm": 0.45660778880119324, "learning_rate": 7.682746092311832e-06, "loss": 0.6208, "step": 7517 }, { "epoch": 3.401040488577245, "grad_norm": 0.41515564918518066, "learning_rate": 7.68212771841559e-06, "loss": 0.5244, "step": 7518 }, { "epoch": 3.4014928749151774, "grad_norm": 0.3982800245285034, "learning_rate": 7.68150928691554e-06, "loss": 0.4096, "step": 7519 }, { "epoch": 3.4019452612531103, "grad_norm": 0.4706244170665741, "learning_rate": 7.680890797824965e-06, "loss": 0.5849, "step": 7520 }, { "epoch": 3.4023976475910427, "grad_norm": 0.44784560799598694, "learning_rate": 7.680272251157151e-06, "loss": 0.5326, "step": 7521 }, { "epoch": 3.402850033928975, "grad_norm": 0.4214221239089966, "learning_rate": 7.679653646925378e-06, "loss": 0.4619, "step": 7522 }, { "epoch": 3.403302420266908, "grad_norm": 0.48982441425323486, "learning_rate": 7.679034985142935e-06, "loss": 0.5978, "step": 7523 }, { "epoch": 3.4037548066048404, "grad_norm": 0.4159504771232605, "learning_rate": 7.67841626582311e-06, "loss": 0.4916, "step": 7524 }, { "epoch": 3.4042071929427733, "grad_norm": 0.46341022849082947, "learning_rate": 7.677797488979186e-06, "loss": 0.511, "step": 7525 }, { "epoch": 3.4046595792807057, "grad_norm": 0.4449201822280884, "learning_rate": 7.677178654624455e-06, "loss": 0.4991, "step": 7526 }, { "epoch": 3.405111965618638, "grad_norm": 0.4867554008960724, "learning_rate": 7.676559762772208e-06, "loss": 0.558, "step": 7527 }, { "epoch": 3.405564351956571, "grad_norm": 0.5198003053665161, "learning_rate": 7.67594081343574e-06, "loss": 0.5151, "step": 7528 }, { "epoch": 3.4060167382945035, "grad_norm": 0.5601162314414978, "learning_rate": 7.675321806628339e-06, "loss": 0.6076, "step": 7529 }, { "epoch": 3.4064691246324363, "grad_norm": 0.5355836153030396, "learning_rate": 7.6747027423633e-06, "loss": 0.5338, "step": 7530 }, { "epoch": 3.4069215109703688, "grad_norm": 0.5361890196800232, "learning_rate": 7.674083620653922e-06, "loss": 0.4383, "step": 7531 }, { "epoch": 3.407373897308301, "grad_norm": 0.3719709515571594, "learning_rate": 7.673464441513498e-06, "loss": 1.288, "step": 7532 }, { "epoch": 3.407826283646234, "grad_norm": 0.18113645911216736, "learning_rate": 7.672845204955331e-06, "loss": 0.5455, "step": 7533 }, { "epoch": 3.4082786699841665, "grad_norm": 0.22733895480632782, "learning_rate": 7.672225910992714e-06, "loss": 0.4598, "step": 7534 }, { "epoch": 3.408731056322099, "grad_norm": 0.2669048309326172, "learning_rate": 7.671606559638954e-06, "loss": 0.4934, "step": 7535 }, { "epoch": 3.409183442660032, "grad_norm": 0.29308268427848816, "learning_rate": 7.670987150907345e-06, "loss": 0.5828, "step": 7536 }, { "epoch": 3.4096358289979642, "grad_norm": 0.30752888321876526, "learning_rate": 7.670367684811198e-06, "loss": 0.581, "step": 7537 }, { "epoch": 3.4100882153358967, "grad_norm": 0.32056015729904175, "learning_rate": 7.669748161363812e-06, "loss": 0.5198, "step": 7538 }, { "epoch": 3.4105406016738296, "grad_norm": 0.31142666935920715, "learning_rate": 7.669128580578494e-06, "loss": 0.6018, "step": 7539 }, { "epoch": 3.410992988011762, "grad_norm": 0.3537661135196686, "learning_rate": 7.66850894246855e-06, "loss": 0.6676, "step": 7540 }, { "epoch": 3.411445374349695, "grad_norm": 0.3200874924659729, "learning_rate": 7.667889247047291e-06, "loss": 0.5091, "step": 7541 }, { "epoch": 3.4118977606876273, "grad_norm": 0.34407198429107666, "learning_rate": 7.667269494328022e-06, "loss": 0.5674, "step": 7542 }, { "epoch": 3.4123501470255597, "grad_norm": 0.3229787051677704, "learning_rate": 7.666649684324057e-06, "loss": 0.6005, "step": 7543 }, { "epoch": 3.4128025333634926, "grad_norm": 0.3567276895046234, "learning_rate": 7.666029817048704e-06, "loss": 0.6374, "step": 7544 }, { "epoch": 3.413254919701425, "grad_norm": 0.34217870235443115, "learning_rate": 7.665409892515278e-06, "loss": 0.4446, "step": 7545 }, { "epoch": 3.4137073060393575, "grad_norm": 0.3491359055042267, "learning_rate": 7.664789910737094e-06, "loss": 0.5818, "step": 7546 }, { "epoch": 3.4141596923772903, "grad_norm": 0.3741770386695862, "learning_rate": 7.664169871727465e-06, "loss": 0.6314, "step": 7547 }, { "epoch": 3.4146120787152228, "grad_norm": 0.3449215888977051, "learning_rate": 7.663549775499708e-06, "loss": 0.5146, "step": 7548 }, { "epoch": 3.415064465053155, "grad_norm": 0.33195042610168457, "learning_rate": 7.662929622067141e-06, "loss": 0.4323, "step": 7549 }, { "epoch": 3.415516851391088, "grad_norm": 0.37023812532424927, "learning_rate": 7.662309411443084e-06, "loss": 0.6337, "step": 7550 }, { "epoch": 3.4159692377290205, "grad_norm": 0.3510170876979828, "learning_rate": 7.661689143640855e-06, "loss": 0.5049, "step": 7551 }, { "epoch": 3.4164216240669534, "grad_norm": 0.35672473907470703, "learning_rate": 7.661068818673779e-06, "loss": 0.5319, "step": 7552 }, { "epoch": 3.416874010404886, "grad_norm": 0.3642180562019348, "learning_rate": 7.660448436555175e-06, "loss": 0.5367, "step": 7553 }, { "epoch": 3.4173263967428182, "grad_norm": 0.44556158781051636, "learning_rate": 7.659827997298369e-06, "loss": 0.5411, "step": 7554 }, { "epoch": 3.417778783080751, "grad_norm": 0.42466047406196594, "learning_rate": 7.659207500916683e-06, "loss": 0.644, "step": 7555 }, { "epoch": 3.4182311694186835, "grad_norm": 0.3961670994758606, "learning_rate": 7.65858694742345e-06, "loss": 0.5808, "step": 7556 }, { "epoch": 3.418683555756616, "grad_norm": 0.32473036646842957, "learning_rate": 7.65796633683199e-06, "loss": 0.4292, "step": 7557 }, { "epoch": 3.419135942094549, "grad_norm": 0.3428947329521179, "learning_rate": 7.657345669155639e-06, "loss": 0.4431, "step": 7558 }, { "epoch": 3.4195883284324813, "grad_norm": 0.4567035734653473, "learning_rate": 7.656724944407719e-06, "loss": 0.6385, "step": 7559 }, { "epoch": 3.4200407147704137, "grad_norm": 0.42306360602378845, "learning_rate": 7.656104162601568e-06, "loss": 0.6516, "step": 7560 }, { "epoch": 3.4204931011083466, "grad_norm": 0.38362380862236023, "learning_rate": 7.655483323750514e-06, "loss": 0.4908, "step": 7561 }, { "epoch": 3.420945487446279, "grad_norm": 0.41863057017326355, "learning_rate": 7.654862427867895e-06, "loss": 0.5627, "step": 7562 }, { "epoch": 3.421397873784212, "grad_norm": 0.4145425260066986, "learning_rate": 7.654241474967041e-06, "loss": 0.5137, "step": 7563 }, { "epoch": 3.4218502601221443, "grad_norm": 0.374805212020874, "learning_rate": 7.653620465061294e-06, "loss": 0.4658, "step": 7564 }, { "epoch": 3.4223026464600768, "grad_norm": 0.4461342394351959, "learning_rate": 7.652999398163985e-06, "loss": 0.6158, "step": 7565 }, { "epoch": 3.4227550327980096, "grad_norm": 0.42558860778808594, "learning_rate": 7.652378274288457e-06, "loss": 0.5144, "step": 7566 }, { "epoch": 3.423207419135942, "grad_norm": 0.4156671464443207, "learning_rate": 7.651757093448049e-06, "loss": 0.5066, "step": 7567 }, { "epoch": 3.4236598054738745, "grad_norm": 0.423531711101532, "learning_rate": 7.651135855656103e-06, "loss": 0.4961, "step": 7568 }, { "epoch": 3.4241121918118074, "grad_norm": 0.41475579142570496, "learning_rate": 7.650514560925957e-06, "loss": 0.4717, "step": 7569 }, { "epoch": 3.42456457814974, "grad_norm": 0.4336296021938324, "learning_rate": 7.649893209270957e-06, "loss": 0.5481, "step": 7570 }, { "epoch": 3.4250169644876722, "grad_norm": 0.45021986961364746, "learning_rate": 7.64927180070445e-06, "loss": 0.519, "step": 7571 }, { "epoch": 3.425469350825605, "grad_norm": 0.43533504009246826, "learning_rate": 7.648650335239777e-06, "loss": 0.5041, "step": 7572 }, { "epoch": 3.4259217371635375, "grad_norm": 0.4501439034938812, "learning_rate": 7.648028812890291e-06, "loss": 0.5063, "step": 7573 }, { "epoch": 3.4263741235014704, "grad_norm": 0.451605886220932, "learning_rate": 7.647407233669337e-06, "loss": 0.4865, "step": 7574 }, { "epoch": 3.426826509839403, "grad_norm": 0.47848737239837646, "learning_rate": 7.646785597590265e-06, "loss": 0.5241, "step": 7575 }, { "epoch": 3.4272788961773353, "grad_norm": 0.4451713263988495, "learning_rate": 7.646163904666426e-06, "loss": 0.4997, "step": 7576 }, { "epoch": 3.427731282515268, "grad_norm": 0.4842848777770996, "learning_rate": 7.64554215491117e-06, "loss": 0.5331, "step": 7577 }, { "epoch": 3.4281836688532006, "grad_norm": 0.5091652274131775, "learning_rate": 7.644920348337856e-06, "loss": 0.5582, "step": 7578 }, { "epoch": 3.4286360551911335, "grad_norm": 0.5070539712905884, "learning_rate": 7.644298484959832e-06, "loss": 0.5669, "step": 7579 }, { "epoch": 3.429088441529066, "grad_norm": 0.5061094164848328, "learning_rate": 7.643676564790457e-06, "loss": 0.4436, "step": 7580 }, { "epoch": 3.4295408278669983, "grad_norm": 0.6469835042953491, "learning_rate": 7.643054587843087e-06, "loss": 0.5733, "step": 7581 }, { "epoch": 3.429993214204931, "grad_norm": 0.39371272921562195, "learning_rate": 7.642432554131081e-06, "loss": 0.8272, "step": 7582 }, { "epoch": 3.4304456005428636, "grad_norm": 0.2148093581199646, "learning_rate": 7.641810463667796e-06, "loss": 1.3191, "step": 7583 }, { "epoch": 3.430897986880796, "grad_norm": 0.219925656914711, "learning_rate": 7.641188316466598e-06, "loss": 0.6369, "step": 7584 }, { "epoch": 3.431350373218729, "grad_norm": 0.2887856960296631, "learning_rate": 7.64056611254084e-06, "loss": 0.6587, "step": 7585 }, { "epoch": 3.4318027595566614, "grad_norm": 0.2925126254558563, "learning_rate": 7.639943851903895e-06, "loss": 0.6853, "step": 7586 }, { "epoch": 3.432255145894594, "grad_norm": 0.3287741541862488, "learning_rate": 7.639321534569119e-06, "loss": 0.6288, "step": 7587 }, { "epoch": 3.4327075322325267, "grad_norm": 0.27718037366867065, "learning_rate": 7.638699160549883e-06, "loss": 0.443, "step": 7588 }, { "epoch": 3.433159918570459, "grad_norm": 0.31844308972358704, "learning_rate": 7.63807672985955e-06, "loss": 0.5522, "step": 7589 }, { "epoch": 3.433612304908392, "grad_norm": 0.3107489347457886, "learning_rate": 7.637454242511493e-06, "loss": 0.5087, "step": 7590 }, { "epoch": 3.4340646912463244, "grad_norm": 0.3380063772201538, "learning_rate": 7.636831698519075e-06, "loss": 0.7705, "step": 7591 }, { "epoch": 3.434517077584257, "grad_norm": 0.33277347683906555, "learning_rate": 7.636209097895668e-06, "loss": 0.5479, "step": 7592 }, { "epoch": 3.4349694639221897, "grad_norm": 0.32744988799095154, "learning_rate": 7.635586440654646e-06, "loss": 0.617, "step": 7593 }, { "epoch": 3.435421850260122, "grad_norm": 0.37342432141304016, "learning_rate": 7.63496372680938e-06, "loss": 0.6935, "step": 7594 }, { "epoch": 3.4358742365980546, "grad_norm": 0.33818212151527405, "learning_rate": 7.634340956373242e-06, "loss": 0.6432, "step": 7595 }, { "epoch": 3.4363266229359875, "grad_norm": 0.34954267740249634, "learning_rate": 7.63371812935961e-06, "loss": 0.6257, "step": 7596 }, { "epoch": 3.43677900927392, "grad_norm": 0.4090612530708313, "learning_rate": 7.63309524578186e-06, "loss": 0.6227, "step": 7597 }, { "epoch": 3.4372313956118523, "grad_norm": 0.3539426624774933, "learning_rate": 7.63247230565337e-06, "loss": 0.5844, "step": 7598 }, { "epoch": 3.437683781949785, "grad_norm": 0.35535240173339844, "learning_rate": 7.631849308987519e-06, "loss": 0.6627, "step": 7599 }, { "epoch": 3.4381361682877176, "grad_norm": 0.3430209457874298, "learning_rate": 7.631226255797686e-06, "loss": 0.5043, "step": 7600 }, { "epoch": 3.4381361682877176, "eval_loss": 0.5923088788986206, "eval_runtime": 25.6496, "eval_samples_per_second": 29.006, "eval_steps_per_second": 7.252, "step": 7600 }, { "epoch": 3.4385885546256505, "grad_norm": 0.3582300841808319, "learning_rate": 7.630603146097252e-06, "loss": 0.5051, "step": 7601 }, { "epoch": 3.439040940963583, "grad_norm": 0.38807734847068787, "learning_rate": 7.629979979899598e-06, "loss": 0.5295, "step": 7602 }, { "epoch": 3.4394933273015154, "grad_norm": 0.32387131452560425, "learning_rate": 7.629356757218112e-06, "loss": 0.4205, "step": 7603 }, { "epoch": 3.4399457136394482, "grad_norm": 0.3834383487701416, "learning_rate": 7.6287334780661745e-06, "loss": 0.5559, "step": 7604 }, { "epoch": 3.4403980999773807, "grad_norm": 0.3605841398239136, "learning_rate": 7.628110142457175e-06, "loss": 0.4732, "step": 7605 }, { "epoch": 3.440850486315313, "grad_norm": 0.35365235805511475, "learning_rate": 7.6274867504044996e-06, "loss": 0.5841, "step": 7606 }, { "epoch": 3.441302872653246, "grad_norm": 0.3644017279148102, "learning_rate": 7.626863301921536e-06, "loss": 0.4916, "step": 7607 }, { "epoch": 3.4417552589911784, "grad_norm": 0.35616379976272583, "learning_rate": 7.626239797021674e-06, "loss": 0.5144, "step": 7608 }, { "epoch": 3.442207645329111, "grad_norm": 0.34673646092414856, "learning_rate": 7.625616235718305e-06, "loss": 0.49, "step": 7609 }, { "epoch": 3.4426600316670437, "grad_norm": 0.3763176202774048, "learning_rate": 7.624992618024822e-06, "loss": 0.3908, "step": 7610 }, { "epoch": 3.443112418004976, "grad_norm": 0.3679545521736145, "learning_rate": 7.624368943954619e-06, "loss": 0.4559, "step": 7611 }, { "epoch": 3.443564804342909, "grad_norm": 0.4265907108783722, "learning_rate": 7.623745213521088e-06, "loss": 0.5981, "step": 7612 }, { "epoch": 3.4440171906808414, "grad_norm": 0.36530518531799316, "learning_rate": 7.623121426737626e-06, "loss": 0.4589, "step": 7613 }, { "epoch": 3.444469577018774, "grad_norm": 0.39422133564949036, "learning_rate": 7.62249758361763e-06, "loss": 0.4933, "step": 7614 }, { "epoch": 3.4449219633567068, "grad_norm": 0.3989088535308838, "learning_rate": 7.621873684174499e-06, "loss": 0.4646, "step": 7615 }, { "epoch": 3.445374349694639, "grad_norm": 0.418876975774765, "learning_rate": 7.621249728421632e-06, "loss": 0.534, "step": 7616 }, { "epoch": 3.445826736032572, "grad_norm": 0.39675888419151306, "learning_rate": 7.620625716372428e-06, "loss": 0.4714, "step": 7617 }, { "epoch": 3.4462791223705045, "grad_norm": 0.4007391631603241, "learning_rate": 7.620001648040289e-06, "loss": 0.4574, "step": 7618 }, { "epoch": 3.446731508708437, "grad_norm": 0.3899446725845337, "learning_rate": 7.619377523438623e-06, "loss": 0.459, "step": 7619 }, { "epoch": 3.4471838950463694, "grad_norm": 0.46272289752960205, "learning_rate": 7.618753342580827e-06, "loss": 0.5731, "step": 7620 }, { "epoch": 3.4476362813843022, "grad_norm": 0.41357123851776123, "learning_rate": 7.618129105480312e-06, "loss": 0.4463, "step": 7621 }, { "epoch": 3.4480886677222347, "grad_norm": 0.46607649326324463, "learning_rate": 7.617504812150483e-06, "loss": 0.5266, "step": 7622 }, { "epoch": 3.4485410540601675, "grad_norm": 0.4190835654735565, "learning_rate": 7.616880462604746e-06, "loss": 0.4096, "step": 7623 }, { "epoch": 3.4489934403981, "grad_norm": 0.4822009205818176, "learning_rate": 7.616256056856515e-06, "loss": 0.5544, "step": 7624 }, { "epoch": 3.4494458267360324, "grad_norm": 0.5182948708534241, "learning_rate": 7.615631594919193e-06, "loss": 0.6421, "step": 7625 }, { "epoch": 3.4498982130739653, "grad_norm": 0.43914496898651123, "learning_rate": 7.615007076806198e-06, "loss": 0.4126, "step": 7626 }, { "epoch": 3.4503505994118977, "grad_norm": 0.47739875316619873, "learning_rate": 7.6143825025309395e-06, "loss": 0.4506, "step": 7627 }, { "epoch": 3.4508029857498306, "grad_norm": 0.5690363645553589, "learning_rate": 7.613757872106831e-06, "loss": 0.5483, "step": 7628 }, { "epoch": 3.451255372087763, "grad_norm": 0.5020372867584229, "learning_rate": 7.6131331855472904e-06, "loss": 0.5334, "step": 7629 }, { "epoch": 3.4517077584256954, "grad_norm": 0.5682223439216614, "learning_rate": 7.6125084428657315e-06, "loss": 0.6739, "step": 7630 }, { "epoch": 3.4521601447636283, "grad_norm": 0.5338121652603149, "learning_rate": 7.611883644075573e-06, "loss": 0.5285, "step": 7631 }, { "epoch": 3.4526125311015607, "grad_norm": 0.4282796382904053, "learning_rate": 7.611258789190232e-06, "loss": 1.102, "step": 7632 }, { "epoch": 3.453064917439493, "grad_norm": 0.21386250853538513, "learning_rate": 7.610633878223133e-06, "loss": 0.5694, "step": 7633 }, { "epoch": 3.453517303777426, "grad_norm": 0.2957074046134949, "learning_rate": 7.610008911187692e-06, "loss": 0.5925, "step": 7634 }, { "epoch": 3.4539696901153585, "grad_norm": 0.29524385929107666, "learning_rate": 7.609383888097334e-06, "loss": 0.5686, "step": 7635 }, { "epoch": 3.454422076453291, "grad_norm": 0.2947036921977997, "learning_rate": 7.608758808965482e-06, "loss": 0.6602, "step": 7636 }, { "epoch": 3.454874462791224, "grad_norm": 0.30692458152770996, "learning_rate": 7.60813367380556e-06, "loss": 0.5798, "step": 7637 }, { "epoch": 3.455326849129156, "grad_norm": 0.3321322202682495, "learning_rate": 7.607508482630994e-06, "loss": 0.6585, "step": 7638 }, { "epoch": 3.455779235467089, "grad_norm": 0.3170749545097351, "learning_rate": 7.6068832354552135e-06, "loss": 0.58, "step": 7639 }, { "epoch": 3.4562316218050215, "grad_norm": 0.3606642484664917, "learning_rate": 7.606257932291643e-06, "loss": 0.6352, "step": 7640 }, { "epoch": 3.456684008142954, "grad_norm": 0.36650243401527405, "learning_rate": 7.605632573153716e-06, "loss": 0.758, "step": 7641 }, { "epoch": 3.457136394480887, "grad_norm": 0.313693642616272, "learning_rate": 7.6050071580548614e-06, "loss": 0.5535, "step": 7642 }, { "epoch": 3.4575887808188193, "grad_norm": 0.37419581413269043, "learning_rate": 7.604381687008511e-06, "loss": 0.5967, "step": 7643 }, { "epoch": 3.4580411671567517, "grad_norm": 0.3628145456314087, "learning_rate": 7.603756160028099e-06, "loss": 0.6796, "step": 7644 }, { "epoch": 3.4584935534946846, "grad_norm": 0.36081212759017944, "learning_rate": 7.603130577127058e-06, "loss": 0.6056, "step": 7645 }, { "epoch": 3.458945939832617, "grad_norm": 0.3144899308681488, "learning_rate": 7.602504938318826e-06, "loss": 0.5272, "step": 7646 }, { "epoch": 3.4593983261705494, "grad_norm": 0.4012282192707062, "learning_rate": 7.601879243616838e-06, "loss": 0.5705, "step": 7647 }, { "epoch": 3.4598507125084823, "grad_norm": 0.37300366163253784, "learning_rate": 7.601253493034533e-06, "loss": 0.7218, "step": 7648 }, { "epoch": 3.4603030988464147, "grad_norm": 0.3787477910518646, "learning_rate": 7.600627686585348e-06, "loss": 0.5718, "step": 7649 }, { "epoch": 3.4607554851843476, "grad_norm": 0.354285329580307, "learning_rate": 7.600001824282726e-06, "loss": 0.4967, "step": 7650 }, { "epoch": 3.46120787152228, "grad_norm": 0.40608906745910645, "learning_rate": 7.599375906140107e-06, "loss": 0.6001, "step": 7651 }, { "epoch": 3.4616602578602125, "grad_norm": 0.35617414116859436, "learning_rate": 7.598749932170935e-06, "loss": 0.5762, "step": 7652 }, { "epoch": 3.4621126441981454, "grad_norm": 0.4436100125312805, "learning_rate": 7.598123902388654e-06, "loss": 0.6791, "step": 7653 }, { "epoch": 3.462565030536078, "grad_norm": 0.3970871865749359, "learning_rate": 7.597497816806707e-06, "loss": 0.5413, "step": 7654 }, { "epoch": 3.46301741687401, "grad_norm": 0.3311270773410797, "learning_rate": 7.5968716754385444e-06, "loss": 0.5112, "step": 7655 }, { "epoch": 3.463469803211943, "grad_norm": 0.39850807189941406, "learning_rate": 7.596245478297608e-06, "loss": 0.5442, "step": 7656 }, { "epoch": 3.4639221895498755, "grad_norm": 0.39392077922821045, "learning_rate": 7.595619225397353e-06, "loss": 0.5075, "step": 7657 }, { "epoch": 3.464374575887808, "grad_norm": 0.3676722049713135, "learning_rate": 7.5949929167512245e-06, "loss": 0.3924, "step": 7658 }, { "epoch": 3.464826962225741, "grad_norm": 0.37904053926467896, "learning_rate": 7.594366552372676e-06, "loss": 0.4622, "step": 7659 }, { "epoch": 3.4652793485636733, "grad_norm": 0.41062307357788086, "learning_rate": 7.5937401322751605e-06, "loss": 0.5766, "step": 7660 }, { "epoch": 3.465731734901606, "grad_norm": 0.37613558769226074, "learning_rate": 7.593113656472128e-06, "loss": 0.5385, "step": 7661 }, { "epoch": 3.4661841212395386, "grad_norm": 0.3760749101638794, "learning_rate": 7.592487124977037e-06, "loss": 0.4501, "step": 7662 }, { "epoch": 3.466636507577471, "grad_norm": 0.4451247751712799, "learning_rate": 7.5918605378033426e-06, "loss": 0.5878, "step": 7663 }, { "epoch": 3.467088893915404, "grad_norm": 0.3778825104236603, "learning_rate": 7.591233894964502e-06, "loss": 0.4605, "step": 7664 }, { "epoch": 3.4675412802533363, "grad_norm": 0.4398171007633209, "learning_rate": 7.590607196473972e-06, "loss": 0.5688, "step": 7665 }, { "epoch": 3.467993666591269, "grad_norm": 0.4647059440612793, "learning_rate": 7.589980442345213e-06, "loss": 0.6976, "step": 7666 }, { "epoch": 3.4684460529292016, "grad_norm": 0.4044337868690491, "learning_rate": 7.589353632591685e-06, "loss": 0.4979, "step": 7667 }, { "epoch": 3.468898439267134, "grad_norm": 0.4105020761489868, "learning_rate": 7.5887267672268524e-06, "loss": 0.4837, "step": 7668 }, { "epoch": 3.469350825605067, "grad_norm": 0.45321041345596313, "learning_rate": 7.588099846264177e-06, "loss": 0.5751, "step": 7669 }, { "epoch": 3.4698032119429993, "grad_norm": 0.42310479283332825, "learning_rate": 7.587472869717122e-06, "loss": 0.4721, "step": 7670 }, { "epoch": 3.4702555982809318, "grad_norm": 0.43764209747314453, "learning_rate": 7.586845837599154e-06, "loss": 0.501, "step": 7671 }, { "epoch": 3.4707079846188646, "grad_norm": 0.42933306097984314, "learning_rate": 7.586218749923739e-06, "loss": 0.4777, "step": 7672 }, { "epoch": 3.471160370956797, "grad_norm": 0.4404713213443756, "learning_rate": 7.585591606704345e-06, "loss": 0.5718, "step": 7673 }, { "epoch": 3.4716127572947295, "grad_norm": 0.4195696711540222, "learning_rate": 7.584964407954444e-06, "loss": 0.4739, "step": 7674 }, { "epoch": 3.4720651436326624, "grad_norm": 0.4812304973602295, "learning_rate": 7.584337153687502e-06, "loss": 0.4587, "step": 7675 }, { "epoch": 3.472517529970595, "grad_norm": 0.4320559799671173, "learning_rate": 7.583709843916993e-06, "loss": 0.432, "step": 7676 }, { "epoch": 3.4729699163085277, "grad_norm": 0.49290481209754944, "learning_rate": 7.5830824786563875e-06, "loss": 0.5198, "step": 7677 }, { "epoch": 3.47342230264646, "grad_norm": 0.5457162857055664, "learning_rate": 7.582455057919163e-06, "loss": 0.5813, "step": 7678 }, { "epoch": 3.4738746889843926, "grad_norm": 0.5818607211112976, "learning_rate": 7.5818275817187904e-06, "loss": 0.4815, "step": 7679 }, { "epoch": 3.4743270753223254, "grad_norm": 0.6143090128898621, "learning_rate": 7.58120005006875e-06, "loss": 0.6437, "step": 7680 }, { "epoch": 3.474779461660258, "grad_norm": 0.7052140235900879, "learning_rate": 7.580572462982516e-06, "loss": 0.623, "step": 7681 }, { "epoch": 3.4752318479981903, "grad_norm": 0.45011797547340393, "learning_rate": 7.579944820473568e-06, "loss": 0.9891, "step": 7682 }, { "epoch": 3.475684234336123, "grad_norm": 0.21183112263679504, "learning_rate": 7.579317122555386e-06, "loss": 1.4062, "step": 7683 }, { "epoch": 3.4761366206740556, "grad_norm": 0.19848884642124176, "learning_rate": 7.578689369241451e-06, "loss": 0.434, "step": 7684 }, { "epoch": 3.476589007011988, "grad_norm": 0.274231493473053, "learning_rate": 7.578061560545246e-06, "loss": 0.6642, "step": 7685 }, { "epoch": 3.477041393349921, "grad_norm": 0.31871911883354187, "learning_rate": 7.577433696480252e-06, "loss": 0.6151, "step": 7686 }, { "epoch": 3.4774937796878533, "grad_norm": 0.3319917619228363, "learning_rate": 7.5768057770599565e-06, "loss": 0.4809, "step": 7687 }, { "epoch": 3.477946166025786, "grad_norm": 0.2993091642856598, "learning_rate": 7.576177802297844e-06, "loss": 0.5713, "step": 7688 }, { "epoch": 3.4783985523637186, "grad_norm": 0.3237152695655823, "learning_rate": 7.575549772207401e-06, "loss": 0.5917, "step": 7689 }, { "epoch": 3.478850938701651, "grad_norm": 0.3377673923969269, "learning_rate": 7.574921686802117e-06, "loss": 0.6623, "step": 7690 }, { "epoch": 3.479303325039584, "grad_norm": 0.3316591680049896, "learning_rate": 7.574293546095481e-06, "loss": 0.4962, "step": 7691 }, { "epoch": 3.4797557113775164, "grad_norm": 0.3226824700832367, "learning_rate": 7.573665350100981e-06, "loss": 0.6082, "step": 7692 }, { "epoch": 3.480208097715449, "grad_norm": 0.3569064140319824, "learning_rate": 7.57303709883211e-06, "loss": 0.6476, "step": 7693 }, { "epoch": 3.4806604840533817, "grad_norm": 0.3321855962276459, "learning_rate": 7.572408792302364e-06, "loss": 0.5757, "step": 7694 }, { "epoch": 3.481112870391314, "grad_norm": 0.33903032541275024, "learning_rate": 7.571780430525233e-06, "loss": 0.5052, "step": 7695 }, { "epoch": 3.4815652567292465, "grad_norm": 0.3588285744190216, "learning_rate": 7.571152013514213e-06, "loss": 0.5381, "step": 7696 }, { "epoch": 3.4820176430671794, "grad_norm": 0.3468092381954193, "learning_rate": 7.5705235412828035e-06, "loss": 0.576, "step": 7697 }, { "epoch": 3.482470029405112, "grad_norm": 0.3727263808250427, "learning_rate": 7.569895013844499e-06, "loss": 0.6276, "step": 7698 }, { "epoch": 3.4829224157430447, "grad_norm": 0.3702167868614197, "learning_rate": 7.569266431212798e-06, "loss": 0.5523, "step": 7699 }, { "epoch": 3.483374802080977, "grad_norm": 0.4007321894168854, "learning_rate": 7.568637793401203e-06, "loss": 0.6824, "step": 7700 }, { "epoch": 3.4838271884189096, "grad_norm": 0.3652675747871399, "learning_rate": 7.568009100423215e-06, "loss": 0.5009, "step": 7701 }, { "epoch": 3.4842795747568425, "grad_norm": 0.35707104206085205, "learning_rate": 7.567380352292335e-06, "loss": 0.6031, "step": 7702 }, { "epoch": 3.484731961094775, "grad_norm": 0.37036946415901184, "learning_rate": 7.5667515490220665e-06, "loss": 0.5844, "step": 7703 }, { "epoch": 3.4851843474327078, "grad_norm": 0.3842995762825012, "learning_rate": 7.566122690625914e-06, "loss": 0.5355, "step": 7704 }, { "epoch": 3.48563673377064, "grad_norm": 0.36899837851524353, "learning_rate": 7.565493777117386e-06, "loss": 0.478, "step": 7705 }, { "epoch": 3.4860891201085726, "grad_norm": 0.3356296718120575, "learning_rate": 7.5648648085099854e-06, "loss": 0.4919, "step": 7706 }, { "epoch": 3.486541506446505, "grad_norm": 0.35724663734436035, "learning_rate": 7.564235784817225e-06, "loss": 0.5087, "step": 7707 }, { "epoch": 3.486993892784438, "grad_norm": 0.395342618227005, "learning_rate": 7.5636067060526095e-06, "loss": 0.5207, "step": 7708 }, { "epoch": 3.4874462791223704, "grad_norm": 0.4543476104736328, "learning_rate": 7.562977572229654e-06, "loss": 0.6469, "step": 7709 }, { "epoch": 3.4878986654603032, "grad_norm": 0.37931710481643677, "learning_rate": 7.562348383361868e-06, "loss": 0.4841, "step": 7710 }, { "epoch": 3.4883510517982357, "grad_norm": 0.4097425639629364, "learning_rate": 7.561719139462765e-06, "loss": 0.5402, "step": 7711 }, { "epoch": 3.488803438136168, "grad_norm": 0.40344366431236267, "learning_rate": 7.5610898405458584e-06, "loss": 0.6002, "step": 7712 }, { "epoch": 3.489255824474101, "grad_norm": 0.3475983738899231, "learning_rate": 7.5604604866246655e-06, "loss": 0.4385, "step": 7713 }, { "epoch": 3.4897082108120334, "grad_norm": 0.43900394439697266, "learning_rate": 7.559831077712704e-06, "loss": 0.5361, "step": 7714 }, { "epoch": 3.4901605971499663, "grad_norm": 0.35305941104888916, "learning_rate": 7.559201613823487e-06, "loss": 0.454, "step": 7715 }, { "epoch": 3.4906129834878987, "grad_norm": 0.38436418771743774, "learning_rate": 7.558572094970537e-06, "loss": 0.5108, "step": 7716 }, { "epoch": 3.491065369825831, "grad_norm": 0.4527551829814911, "learning_rate": 7.557942521167372e-06, "loss": 0.499, "step": 7717 }, { "epoch": 3.491517756163764, "grad_norm": 0.42283520102500916, "learning_rate": 7.557312892427514e-06, "loss": 0.4854, "step": 7718 }, { "epoch": 3.4919701425016965, "grad_norm": 0.4364098608493805, "learning_rate": 7.556683208764487e-06, "loss": 0.521, "step": 7719 }, { "epoch": 3.492422528839629, "grad_norm": 0.3850577473640442, "learning_rate": 7.5560534701918135e-06, "loss": 0.4537, "step": 7720 }, { "epoch": 3.4928749151775618, "grad_norm": 0.41148126125335693, "learning_rate": 7.555423676723017e-06, "loss": 0.485, "step": 7721 }, { "epoch": 3.493327301515494, "grad_norm": 0.4910885691642761, "learning_rate": 7.554793828371626e-06, "loss": 0.5433, "step": 7722 }, { "epoch": 3.4937796878534266, "grad_norm": 0.41571593284606934, "learning_rate": 7.554163925151166e-06, "loss": 0.4154, "step": 7723 }, { "epoch": 3.4942320741913595, "grad_norm": 0.406485915184021, "learning_rate": 7.553533967075166e-06, "loss": 0.4259, "step": 7724 }, { "epoch": 3.494684460529292, "grad_norm": 0.4634496867656708, "learning_rate": 7.552903954157157e-06, "loss": 0.4392, "step": 7725 }, { "epoch": 3.495136846867225, "grad_norm": 0.44189324975013733, "learning_rate": 7.552273886410666e-06, "loss": 0.4217, "step": 7726 }, { "epoch": 3.4955892332051572, "grad_norm": 0.4574565589427948, "learning_rate": 7.551643763849228e-06, "loss": 0.4654, "step": 7727 }, { "epoch": 3.4960416195430897, "grad_norm": 0.5545201301574707, "learning_rate": 7.551013586486376e-06, "loss": 0.5913, "step": 7728 }, { "epoch": 3.4964940058810225, "grad_norm": 0.48827192187309265, "learning_rate": 7.550383354335642e-06, "loss": 0.5253, "step": 7729 }, { "epoch": 3.496946392218955, "grad_norm": 0.48920804262161255, "learning_rate": 7.549753067410564e-06, "loss": 0.4836, "step": 7730 }, { "epoch": 3.4973987785568874, "grad_norm": 0.5467035174369812, "learning_rate": 7.549122725724676e-06, "loss": 0.4531, "step": 7731 }, { "epoch": 3.4978511648948203, "grad_norm": 0.4379018545150757, "learning_rate": 7.548492329291518e-06, "loss": 0.8744, "step": 7732 }, { "epoch": 3.4983035512327527, "grad_norm": 0.20630763471126556, "learning_rate": 7.547861878124628e-06, "loss": 0.9556, "step": 7733 }, { "epoch": 3.498755937570685, "grad_norm": 0.26467904448509216, "learning_rate": 7.547231372237546e-06, "loss": 0.6455, "step": 7734 }, { "epoch": 3.499208323908618, "grad_norm": 0.29534226655960083, "learning_rate": 7.546600811643816e-06, "loss": 0.6583, "step": 7735 }, { "epoch": 3.4996607102465505, "grad_norm": 0.2944108843803406, "learning_rate": 7.545970196356975e-06, "loss": 0.5304, "step": 7736 }, { "epoch": 3.5001130965844833, "grad_norm": 0.2824802100658417, "learning_rate": 7.545339526390574e-06, "loss": 0.5679, "step": 7737 }, { "epoch": 3.5005654829224158, "grad_norm": 0.30133405327796936, "learning_rate": 7.54470880175815e-06, "loss": 0.5083, "step": 7738 }, { "epoch": 3.501017869260348, "grad_norm": 0.3756164014339447, "learning_rate": 7.544078022473255e-06, "loss": 0.5255, "step": 7739 }, { "epoch": 3.501470255598281, "grad_norm": 0.30346840620040894, "learning_rate": 7.543447188549432e-06, "loss": 0.5667, "step": 7740 }, { "epoch": 3.5019226419362135, "grad_norm": 0.3469946086406708, "learning_rate": 7.542816300000233e-06, "loss": 0.6635, "step": 7741 }, { "epoch": 3.5023750282741464, "grad_norm": 0.3731743097305298, "learning_rate": 7.542185356839205e-06, "loss": 0.6754, "step": 7742 }, { "epoch": 3.502827414612079, "grad_norm": 0.3610062599182129, "learning_rate": 7.541554359079899e-06, "loss": 0.6329, "step": 7743 }, { "epoch": 3.5032798009500112, "grad_norm": 0.30833858251571655, "learning_rate": 7.540923306735868e-06, "loss": 0.5819, "step": 7744 }, { "epoch": 3.5037321872879437, "grad_norm": 0.35905057191848755, "learning_rate": 7.540292199820662e-06, "loss": 0.5251, "step": 7745 }, { "epoch": 3.5041845736258765, "grad_norm": 0.3675276041030884, "learning_rate": 7.539661038347841e-06, "loss": 0.5634, "step": 7746 }, { "epoch": 3.504636959963809, "grad_norm": 0.40648749470710754, "learning_rate": 7.539029822330955e-06, "loss": 0.5433, "step": 7747 }, { "epoch": 3.505089346301742, "grad_norm": 0.3535742461681366, "learning_rate": 7.538398551783566e-06, "loss": 0.6345, "step": 7748 }, { "epoch": 3.5055417326396743, "grad_norm": 0.34426337480545044, "learning_rate": 7.5377672267192245e-06, "loss": 0.4743, "step": 7749 }, { "epoch": 3.5059941189776067, "grad_norm": 0.3339046239852905, "learning_rate": 7.5371358471514956e-06, "loss": 0.5415, "step": 7750 }, { "epoch": 3.5064465053155396, "grad_norm": 0.39424458146095276, "learning_rate": 7.536504413093935e-06, "loss": 0.5831, "step": 7751 }, { "epoch": 3.506898891653472, "grad_norm": 0.35320329666137695, "learning_rate": 7.535872924560108e-06, "loss": 0.4927, "step": 7752 }, { "epoch": 3.507351277991405, "grad_norm": 0.3810546100139618, "learning_rate": 7.535241381563574e-06, "loss": 0.5366, "step": 7753 }, { "epoch": 3.5078036643293373, "grad_norm": 0.3942110538482666, "learning_rate": 7.5346097841178966e-06, "loss": 0.599, "step": 7754 }, { "epoch": 3.5082560506672698, "grad_norm": 0.3791007399559021, "learning_rate": 7.533978132236643e-06, "loss": 0.5686, "step": 7755 }, { "epoch": 3.508708437005202, "grad_norm": 0.3928525447845459, "learning_rate": 7.533346425933377e-06, "loss": 0.6122, "step": 7756 }, { "epoch": 3.509160823343135, "grad_norm": 0.3474729061126709, "learning_rate": 7.532714665221668e-06, "loss": 0.4912, "step": 7757 }, { "epoch": 3.5096132096810675, "grad_norm": 0.350673109292984, "learning_rate": 7.532082850115081e-06, "loss": 0.4836, "step": 7758 }, { "epoch": 3.5100655960190004, "grad_norm": 0.3861469030380249, "learning_rate": 7.531450980627188e-06, "loss": 0.5134, "step": 7759 }, { "epoch": 3.510517982356933, "grad_norm": 0.3646329939365387, "learning_rate": 7.5308190567715586e-06, "loss": 0.4849, "step": 7760 }, { "epoch": 3.5109703686948652, "grad_norm": 0.42073529958724976, "learning_rate": 7.5301870785617635e-06, "loss": 0.5939, "step": 7761 }, { "epoch": 3.511422755032798, "grad_norm": 0.38973137736320496, "learning_rate": 7.529555046011378e-06, "loss": 0.5438, "step": 7762 }, { "epoch": 3.5118751413707305, "grad_norm": 0.4068644642829895, "learning_rate": 7.528922959133974e-06, "loss": 0.5983, "step": 7763 }, { "epoch": 3.5123275277086634, "grad_norm": 0.40271785855293274, "learning_rate": 7.528290817943129e-06, "loss": 0.5219, "step": 7764 }, { "epoch": 3.512779914046596, "grad_norm": 0.43821048736572266, "learning_rate": 7.527658622452416e-06, "loss": 0.4885, "step": 7765 }, { "epoch": 3.5132323003845283, "grad_norm": 0.39222341775894165, "learning_rate": 7.527026372675417e-06, "loss": 0.5175, "step": 7766 }, { "epoch": 3.513684686722461, "grad_norm": 0.4045216739177704, "learning_rate": 7.526394068625709e-06, "loss": 0.4757, "step": 7767 }, { "epoch": 3.5141370730603936, "grad_norm": 0.41462597250938416, "learning_rate": 7.52576171031687e-06, "loss": 0.5613, "step": 7768 }, { "epoch": 3.514589459398326, "grad_norm": 0.42328545451164246, "learning_rate": 7.525129297762484e-06, "loss": 0.5617, "step": 7769 }, { "epoch": 3.515041845736259, "grad_norm": 0.4381188154220581, "learning_rate": 7.524496830976131e-06, "loss": 0.5057, "step": 7770 }, { "epoch": 3.5154942320741913, "grad_norm": 0.45641425251960754, "learning_rate": 7.5238643099713944e-06, "loss": 0.5511, "step": 7771 }, { "epoch": 3.5159466184121237, "grad_norm": 0.480421245098114, "learning_rate": 7.523231734761861e-06, "loss": 0.5632, "step": 7772 }, { "epoch": 3.5163990047500566, "grad_norm": 0.40769162774086, "learning_rate": 7.522599105361115e-06, "loss": 0.4476, "step": 7773 }, { "epoch": 3.516851391087989, "grad_norm": 0.46440279483795166, "learning_rate": 7.5219664217827425e-06, "loss": 0.5558, "step": 7774 }, { "epoch": 3.517303777425922, "grad_norm": 0.5294941663742065, "learning_rate": 7.521333684040335e-06, "loss": 0.5781, "step": 7775 }, { "epoch": 3.5177561637638544, "grad_norm": 0.44980567693710327, "learning_rate": 7.520700892147478e-06, "loss": 0.4283, "step": 7776 }, { "epoch": 3.518208550101787, "grad_norm": 0.4532598853111267, "learning_rate": 7.5200680461177636e-06, "loss": 0.4547, "step": 7777 }, { "epoch": 3.5186609364397197, "grad_norm": 0.4850282073020935, "learning_rate": 7.519435145964782e-06, "loss": 0.4516, "step": 7778 }, { "epoch": 3.519113322777652, "grad_norm": 0.5739508271217346, "learning_rate": 7.518802191702128e-06, "loss": 0.5373, "step": 7779 }, { "epoch": 3.519565709115585, "grad_norm": 0.5289998650550842, "learning_rate": 7.518169183343395e-06, "loss": 0.5017, "step": 7780 }, { "epoch": 3.5200180954535174, "grad_norm": 0.5938728451728821, "learning_rate": 7.517536120902177e-06, "loss": 0.5506, "step": 7781 }, { "epoch": 3.52047048179145, "grad_norm": 0.4280725121498108, "learning_rate": 7.516903004392071e-06, "loss": 1.0275, "step": 7782 }, { "epoch": 3.5209228681293823, "grad_norm": 0.20280113816261292, "learning_rate": 7.516269833826675e-06, "loss": 1.1036, "step": 7783 }, { "epoch": 3.521375254467315, "grad_norm": 0.2594439387321472, "learning_rate": 7.5156366092195855e-06, "loss": 0.5868, "step": 7784 }, { "epoch": 3.5218276408052476, "grad_norm": 0.29454344511032104, "learning_rate": 7.515003330584405e-06, "loss": 0.5815, "step": 7785 }, { "epoch": 3.5222800271431804, "grad_norm": 0.29167690873146057, "learning_rate": 7.514369997934732e-06, "loss": 0.6355, "step": 7786 }, { "epoch": 3.522732413481113, "grad_norm": 0.3160916864871979, "learning_rate": 7.513736611284168e-06, "loss": 0.6214, "step": 7787 }, { "epoch": 3.5231847998190453, "grad_norm": 0.2834990620613098, "learning_rate": 7.513103170646319e-06, "loss": 0.4712, "step": 7788 }, { "epoch": 3.523637186156978, "grad_norm": 0.330904483795166, "learning_rate": 7.512469676034788e-06, "loss": 0.7095, "step": 7789 }, { "epoch": 3.5240895724949106, "grad_norm": 0.35136696696281433, "learning_rate": 7.511836127463179e-06, "loss": 0.6462, "step": 7790 }, { "epoch": 3.5245419588328435, "grad_norm": 0.3281747102737427, "learning_rate": 7.511202524945101e-06, "loss": 0.6586, "step": 7791 }, { "epoch": 3.524994345170776, "grad_norm": 0.3146061599254608, "learning_rate": 7.5105688684941614e-06, "loss": 0.5236, "step": 7792 }, { "epoch": 3.5254467315087084, "grad_norm": 0.3231216073036194, "learning_rate": 7.509935158123969e-06, "loss": 0.5348, "step": 7793 }, { "epoch": 3.525899117846641, "grad_norm": 0.3426530063152313, "learning_rate": 7.509301393848133e-06, "loss": 0.5804, "step": 7794 }, { "epoch": 3.5263515041845737, "grad_norm": 0.3202544152736664, "learning_rate": 7.508667575680265e-06, "loss": 0.5761, "step": 7795 }, { "epoch": 3.526803890522506, "grad_norm": 0.35904115438461304, "learning_rate": 7.508033703633978e-06, "loss": 0.5618, "step": 7796 }, { "epoch": 3.527256276860439, "grad_norm": 0.3232153356075287, "learning_rate": 7.507399777722886e-06, "loss": 0.4779, "step": 7797 }, { "epoch": 3.5277086631983714, "grad_norm": 0.3740919530391693, "learning_rate": 7.5067657979606025e-06, "loss": 0.5603, "step": 7798 }, { "epoch": 3.528161049536304, "grad_norm": 0.3506416380405426, "learning_rate": 7.5061317643607425e-06, "loss": 0.5844, "step": 7799 }, { "epoch": 3.5286134358742367, "grad_norm": 0.3600442409515381, "learning_rate": 7.505497676936926e-06, "loss": 0.444, "step": 7800 }, { "epoch": 3.5286134358742367, "eval_loss": 0.591353178024292, "eval_runtime": 25.8643, "eval_samples_per_second": 28.766, "eval_steps_per_second": 7.191, "step": 7800 }, { "epoch": 3.529065822212169, "grad_norm": 0.36109238862991333, "learning_rate": 7.504863535702771e-06, "loss": 0.4975, "step": 7801 }, { "epoch": 3.529518208550102, "grad_norm": 0.35169148445129395, "learning_rate": 7.504229340671894e-06, "loss": 0.6073, "step": 7802 }, { "epoch": 3.5299705948880344, "grad_norm": 0.3541252613067627, "learning_rate": 7.503595091857918e-06, "loss": 0.5179, "step": 7803 }, { "epoch": 3.530422981225967, "grad_norm": 0.3551771640777588, "learning_rate": 7.502960789274465e-06, "loss": 0.5367, "step": 7804 }, { "epoch": 3.5308753675638993, "grad_norm": 0.3728635311126709, "learning_rate": 7.502326432935156e-06, "loss": 0.486, "step": 7805 }, { "epoch": 3.531327753901832, "grad_norm": 0.4000107944011688, "learning_rate": 7.501692022853616e-06, "loss": 0.498, "step": 7806 }, { "epoch": 3.5317801402397646, "grad_norm": 0.40849754214286804, "learning_rate": 7.50105755904347e-06, "loss": 0.551, "step": 7807 }, { "epoch": 3.5322325265776975, "grad_norm": 0.43760743737220764, "learning_rate": 7.500423041518345e-06, "loss": 0.6409, "step": 7808 }, { "epoch": 3.53268491291563, "grad_norm": 0.3703501522541046, "learning_rate": 7.499788470291867e-06, "loss": 0.4968, "step": 7809 }, { "epoch": 3.5331372992535623, "grad_norm": 0.3661748766899109, "learning_rate": 7.499153845377664e-06, "loss": 0.4895, "step": 7810 }, { "epoch": 3.533589685591495, "grad_norm": 0.37354370951652527, "learning_rate": 7.49851916678937e-06, "loss": 0.4521, "step": 7811 }, { "epoch": 3.5340420719294277, "grad_norm": 0.39818328619003296, "learning_rate": 7.497884434540612e-06, "loss": 0.6024, "step": 7812 }, { "epoch": 3.5344944582673605, "grad_norm": 0.4034680426120758, "learning_rate": 7.497249648645023e-06, "loss": 0.5446, "step": 7813 }, { "epoch": 3.534946844605293, "grad_norm": 0.4386666715145111, "learning_rate": 7.496614809116236e-06, "loss": 0.5658, "step": 7814 }, { "epoch": 3.5353992309432254, "grad_norm": 0.4551374018192291, "learning_rate": 7.495979915967887e-06, "loss": 0.5675, "step": 7815 }, { "epoch": 3.5358516172811583, "grad_norm": 0.44130730628967285, "learning_rate": 7.4953449692136116e-06, "loss": 0.5913, "step": 7816 }, { "epoch": 3.5363040036190907, "grad_norm": 0.41376617550849915, "learning_rate": 7.494709968867043e-06, "loss": 0.4787, "step": 7817 }, { "epoch": 3.5367563899570236, "grad_norm": 0.445721834897995, "learning_rate": 7.494074914941822e-06, "loss": 0.5398, "step": 7818 }, { "epoch": 3.537208776294956, "grad_norm": 0.5044659376144409, "learning_rate": 7.493439807451588e-06, "loss": 0.5119, "step": 7819 }, { "epoch": 3.5376611626328884, "grad_norm": 0.4469051659107208, "learning_rate": 7.492804646409979e-06, "loss": 0.5176, "step": 7820 }, { "epoch": 3.538113548970821, "grad_norm": 0.48811715841293335, "learning_rate": 7.492169431830638e-06, "loss": 0.5982, "step": 7821 }, { "epoch": 3.5385659353087537, "grad_norm": 0.4218328893184662, "learning_rate": 7.491534163727206e-06, "loss": 0.4505, "step": 7822 }, { "epoch": 3.539018321646686, "grad_norm": 0.45245087146759033, "learning_rate": 7.4908988421133275e-06, "loss": 0.5464, "step": 7823 }, { "epoch": 3.539470707984619, "grad_norm": 0.4551563858985901, "learning_rate": 7.4902634670026485e-06, "loss": 0.5128, "step": 7824 }, { "epoch": 3.5399230943225515, "grad_norm": 0.42751091718673706, "learning_rate": 7.4896280384088125e-06, "loss": 0.3929, "step": 7825 }, { "epoch": 3.540375480660484, "grad_norm": 0.45258229970932007, "learning_rate": 7.488992556345468e-06, "loss": 0.4187, "step": 7826 }, { "epoch": 3.540827866998417, "grad_norm": 0.45667821168899536, "learning_rate": 7.488357020826263e-06, "loss": 0.4903, "step": 7827 }, { "epoch": 3.541280253336349, "grad_norm": 0.5283269882202148, "learning_rate": 7.4877214318648476e-06, "loss": 0.5986, "step": 7828 }, { "epoch": 3.541732639674282, "grad_norm": 0.5945267677307129, "learning_rate": 7.487085789474869e-06, "loss": 0.6073, "step": 7829 }, { "epoch": 3.5421850260122145, "grad_norm": 0.47323158383369446, "learning_rate": 7.486450093669982e-06, "loss": 0.4547, "step": 7830 }, { "epoch": 3.542637412350147, "grad_norm": 0.5265317559242249, "learning_rate": 7.485814344463839e-06, "loss": 0.444, "step": 7831 }, { "epoch": 3.5430897986880794, "grad_norm": 0.44553616642951965, "learning_rate": 7.4851785418700925e-06, "loss": 1.1305, "step": 7832 }, { "epoch": 3.5435421850260123, "grad_norm": 0.25731053948402405, "learning_rate": 7.484542685902401e-06, "loss": 0.9972, "step": 7833 }, { "epoch": 3.5439945713639447, "grad_norm": 0.2680342197418213, "learning_rate": 7.483906776574416e-06, "loss": 0.8159, "step": 7834 }, { "epoch": 3.5444469577018776, "grad_norm": 0.2907456159591675, "learning_rate": 7.483270813899798e-06, "loss": 0.6797, "step": 7835 }, { "epoch": 3.54489934403981, "grad_norm": 0.28136366605758667, "learning_rate": 7.482634797892203e-06, "loss": 0.6873, "step": 7836 }, { "epoch": 3.5453517303777424, "grad_norm": 0.2777754068374634, "learning_rate": 7.481998728565294e-06, "loss": 0.4391, "step": 7837 }, { "epoch": 3.5458041167156753, "grad_norm": 0.2741187810897827, "learning_rate": 7.481362605932731e-06, "loss": 0.5912, "step": 7838 }, { "epoch": 3.5462565030536077, "grad_norm": 0.33914366364479065, "learning_rate": 7.480726430008174e-06, "loss": 0.6888, "step": 7839 }, { "epoch": 3.5467088893915406, "grad_norm": 0.3464469015598297, "learning_rate": 7.480090200805287e-06, "loss": 0.566, "step": 7840 }, { "epoch": 3.547161275729473, "grad_norm": 0.30492618680000305, "learning_rate": 7.479453918337733e-06, "loss": 0.5396, "step": 7841 }, { "epoch": 3.5476136620674055, "grad_norm": 0.3454711139202118, "learning_rate": 7.47881758261918e-06, "loss": 0.6882, "step": 7842 }, { "epoch": 3.548066048405338, "grad_norm": 0.3158089220523834, "learning_rate": 7.478181193663294e-06, "loss": 0.5236, "step": 7843 }, { "epoch": 3.5485184347432708, "grad_norm": 0.3346734642982483, "learning_rate": 7.477544751483741e-06, "loss": 0.5279, "step": 7844 }, { "epoch": 3.548970821081203, "grad_norm": 0.3329574763774872, "learning_rate": 7.476908256094191e-06, "loss": 0.5044, "step": 7845 }, { "epoch": 3.549423207419136, "grad_norm": 0.38855063915252686, "learning_rate": 7.476271707508314e-06, "loss": 0.7397, "step": 7846 }, { "epoch": 3.5498755937570685, "grad_norm": 0.31691670417785645, "learning_rate": 7.475635105739781e-06, "loss": 0.4979, "step": 7847 }, { "epoch": 3.550327980095001, "grad_norm": 0.3952770233154297, "learning_rate": 7.474998450802262e-06, "loss": 0.5274, "step": 7848 }, { "epoch": 3.550780366432934, "grad_norm": 0.3592427968978882, "learning_rate": 7.474361742709434e-06, "loss": 0.6084, "step": 7849 }, { "epoch": 3.5512327527708663, "grad_norm": 0.4145718514919281, "learning_rate": 7.47372498147497e-06, "loss": 0.6967, "step": 7850 }, { "epoch": 3.551685139108799, "grad_norm": 0.40121152997016907, "learning_rate": 7.473088167112546e-06, "loss": 0.6079, "step": 7851 }, { "epoch": 3.5521375254467316, "grad_norm": 0.3512495160102844, "learning_rate": 7.472451299635838e-06, "loss": 0.4518, "step": 7852 }, { "epoch": 3.552589911784664, "grad_norm": 0.4129463732242584, "learning_rate": 7.471814379058524e-06, "loss": 0.6633, "step": 7853 }, { "epoch": 3.5530422981225964, "grad_norm": 0.369660884141922, "learning_rate": 7.4711774053942835e-06, "loss": 0.5681, "step": 7854 }, { "epoch": 3.5534946844605293, "grad_norm": 0.4359782636165619, "learning_rate": 7.4705403786567966e-06, "loss": 0.7249, "step": 7855 }, { "epoch": 3.5539470707984617, "grad_norm": 0.3747178018093109, "learning_rate": 7.4699032988597445e-06, "loss": 0.4569, "step": 7856 }, { "epoch": 3.5543994571363946, "grad_norm": 0.4468427896499634, "learning_rate": 7.469266166016811e-06, "loss": 0.6199, "step": 7857 }, { "epoch": 3.554851843474327, "grad_norm": 0.372683048248291, "learning_rate": 7.468628980141677e-06, "loss": 0.4701, "step": 7858 }, { "epoch": 3.5553042298122595, "grad_norm": 0.4095923900604248, "learning_rate": 7.46799174124803e-06, "loss": 0.5219, "step": 7859 }, { "epoch": 3.5557566161501923, "grad_norm": 0.4335615038871765, "learning_rate": 7.4673544493495545e-06, "loss": 0.5554, "step": 7860 }, { "epoch": 3.5562090024881248, "grad_norm": 0.38557755947113037, "learning_rate": 7.466717104459938e-06, "loss": 0.5077, "step": 7861 }, { "epoch": 3.5566613888260576, "grad_norm": 0.4125659167766571, "learning_rate": 7.466079706592869e-06, "loss": 0.524, "step": 7862 }, { "epoch": 3.55711377516399, "grad_norm": 0.3829527497291565, "learning_rate": 7.465442255762037e-06, "loss": 0.5301, "step": 7863 }, { "epoch": 3.5575661615019225, "grad_norm": 0.460994154214859, "learning_rate": 7.464804751981131e-06, "loss": 0.5988, "step": 7864 }, { "epoch": 3.5580185478398554, "grad_norm": 0.4046197831630707, "learning_rate": 7.464167195263842e-06, "loss": 0.4486, "step": 7865 }, { "epoch": 3.558470934177788, "grad_norm": 0.44295361638069153, "learning_rate": 7.463529585623865e-06, "loss": 0.5323, "step": 7866 }, { "epoch": 3.5589233205157207, "grad_norm": 0.4535368084907532, "learning_rate": 7.462891923074894e-06, "loss": 0.5458, "step": 7867 }, { "epoch": 3.559375706853653, "grad_norm": 0.4275914132595062, "learning_rate": 7.462254207630624e-06, "loss": 0.4974, "step": 7868 }, { "epoch": 3.5598280931915856, "grad_norm": 0.42901554703712463, "learning_rate": 7.461616439304749e-06, "loss": 0.5409, "step": 7869 }, { "epoch": 3.560280479529518, "grad_norm": 0.3893338143825531, "learning_rate": 7.460978618110969e-06, "loss": 0.4158, "step": 7870 }, { "epoch": 3.560732865867451, "grad_norm": 0.4333467483520508, "learning_rate": 7.460340744062979e-06, "loss": 0.5472, "step": 7871 }, { "epoch": 3.5611852522053833, "grad_norm": 0.4248405396938324, "learning_rate": 7.459702817174482e-06, "loss": 0.4472, "step": 7872 }, { "epoch": 3.561637638543316, "grad_norm": 0.4778003394603729, "learning_rate": 7.459064837459178e-06, "loss": 0.5632, "step": 7873 }, { "epoch": 3.5620900248812486, "grad_norm": 0.46231138706207275, "learning_rate": 7.458426804930767e-06, "loss": 0.4571, "step": 7874 }, { "epoch": 3.562542411219181, "grad_norm": 0.4509841501712799, "learning_rate": 7.457788719602953e-06, "loss": 0.4922, "step": 7875 }, { "epoch": 3.562994797557114, "grad_norm": 0.4416079819202423, "learning_rate": 7.457150581489441e-06, "loss": 0.4745, "step": 7876 }, { "epoch": 3.5634471838950463, "grad_norm": 0.5360206961631775, "learning_rate": 7.456512390603934e-06, "loss": 0.5963, "step": 7877 }, { "epoch": 3.563899570232979, "grad_norm": 0.6151277422904968, "learning_rate": 7.455874146960141e-06, "loss": 0.541, "step": 7878 }, { "epoch": 3.5643519565709116, "grad_norm": 0.510564386844635, "learning_rate": 7.455235850571767e-06, "loss": 0.5074, "step": 7879 }, { "epoch": 3.564804342908844, "grad_norm": 0.5321109294891357, "learning_rate": 7.454597501452523e-06, "loss": 0.4829, "step": 7880 }, { "epoch": 3.5652567292467765, "grad_norm": 0.5838386416435242, "learning_rate": 7.453959099616119e-06, "loss": 0.4344, "step": 7881 }, { "epoch": 3.5657091155847094, "grad_norm": 0.5860344767570496, "learning_rate": 7.453320645076263e-06, "loss": 1.2154, "step": 7882 }, { "epoch": 3.566161501922642, "grad_norm": 0.23131053149700165, "learning_rate": 7.4526821378466684e-06, "loss": 1.0723, "step": 7883 }, { "epoch": 3.5666138882605747, "grad_norm": 0.23410776257514954, "learning_rate": 7.452043577941049e-06, "loss": 0.5079, "step": 7884 }, { "epoch": 3.567066274598507, "grad_norm": 0.26063936948776245, "learning_rate": 7.451404965373119e-06, "loss": 0.5637, "step": 7885 }, { "epoch": 3.5675186609364395, "grad_norm": 0.31369006633758545, "learning_rate": 7.4507663001565936e-06, "loss": 0.625, "step": 7886 }, { "epoch": 3.5679710472743724, "grad_norm": 0.3084163963794708, "learning_rate": 7.450127582305188e-06, "loss": 0.6965, "step": 7887 }, { "epoch": 3.568423433612305, "grad_norm": 0.32046180963516235, "learning_rate": 7.449488811832621e-06, "loss": 0.6321, "step": 7888 }, { "epoch": 3.5688758199502377, "grad_norm": 0.3253014087677002, "learning_rate": 7.4488499887526136e-06, "loss": 0.5525, "step": 7889 }, { "epoch": 3.56932820628817, "grad_norm": 0.36920690536499023, "learning_rate": 7.4482111130788826e-06, "loss": 0.6547, "step": 7890 }, { "epoch": 3.5697805926261026, "grad_norm": 0.3267059326171875, "learning_rate": 7.447572184825149e-06, "loss": 0.5716, "step": 7891 }, { "epoch": 3.570232978964035, "grad_norm": 0.33083897829055786, "learning_rate": 7.446933204005138e-06, "loss": 0.5323, "step": 7892 }, { "epoch": 3.570685365301968, "grad_norm": 0.3465590178966522, "learning_rate": 7.446294170632571e-06, "loss": 0.5598, "step": 7893 }, { "epoch": 3.5711377516399003, "grad_norm": 0.3885561227798462, "learning_rate": 7.445655084721171e-06, "loss": 0.6116, "step": 7894 }, { "epoch": 3.571590137977833, "grad_norm": 0.35864371061325073, "learning_rate": 7.445015946284666e-06, "loss": 0.5854, "step": 7895 }, { "epoch": 3.5720425243157656, "grad_norm": 0.4037393033504486, "learning_rate": 7.444376755336782e-06, "loss": 0.6561, "step": 7896 }, { "epoch": 3.572494910653698, "grad_norm": 0.42541560530662537, "learning_rate": 7.443737511891247e-06, "loss": 0.8011, "step": 7897 }, { "epoch": 3.572947296991631, "grad_norm": 0.3746158480644226, "learning_rate": 7.443098215961789e-06, "loss": 0.6013, "step": 7898 }, { "epoch": 3.5733996833295634, "grad_norm": 0.37682774662971497, "learning_rate": 7.442458867562139e-06, "loss": 0.6033, "step": 7899 }, { "epoch": 3.5738520696674962, "grad_norm": 0.3471643924713135, "learning_rate": 7.441819466706029e-06, "loss": 0.5552, "step": 7900 }, { "epoch": 3.5743044560054287, "grad_norm": 0.41367030143737793, "learning_rate": 7.441180013407188e-06, "loss": 0.604, "step": 7901 }, { "epoch": 3.574756842343361, "grad_norm": 0.4088820219039917, "learning_rate": 7.440540507679353e-06, "loss": 0.5049, "step": 7902 }, { "epoch": 3.575209228681294, "grad_norm": 0.42918860912323, "learning_rate": 7.439900949536257e-06, "loss": 0.6789, "step": 7903 }, { "epoch": 3.5756616150192264, "grad_norm": 0.35539183020591736, "learning_rate": 7.439261338991639e-06, "loss": 0.4627, "step": 7904 }, { "epoch": 3.5761140013571593, "grad_norm": 0.3515028953552246, "learning_rate": 7.43862167605923e-06, "loss": 0.4925, "step": 7905 }, { "epoch": 3.5765663876950917, "grad_norm": 0.38492003083229065, "learning_rate": 7.437981960752771e-06, "loss": 0.4487, "step": 7906 }, { "epoch": 3.577018774033024, "grad_norm": 0.40286073088645935, "learning_rate": 7.437342193086003e-06, "loss": 0.6635, "step": 7907 }, { "epoch": 3.5774711603709566, "grad_norm": 0.36717450618743896, "learning_rate": 7.436702373072663e-06, "loss": 0.5436, "step": 7908 }, { "epoch": 3.5779235467088895, "grad_norm": 0.4561291038990021, "learning_rate": 7.436062500726493e-06, "loss": 0.5968, "step": 7909 }, { "epoch": 3.578375933046822, "grad_norm": 0.38725998997688293, "learning_rate": 7.435422576061238e-06, "loss": 0.4792, "step": 7910 }, { "epoch": 3.5788283193847548, "grad_norm": 0.42389166355133057, "learning_rate": 7.434782599090639e-06, "loss": 0.5828, "step": 7911 }, { "epoch": 3.579280705722687, "grad_norm": 0.44588491320610046, "learning_rate": 7.434142569828441e-06, "loss": 0.6011, "step": 7912 }, { "epoch": 3.5797330920606196, "grad_norm": 0.4180043637752533, "learning_rate": 7.433502488288391e-06, "loss": 0.5866, "step": 7913 }, { "epoch": 3.5801854783985525, "grad_norm": 0.3901429772377014, "learning_rate": 7.4328623544842335e-06, "loss": 0.4716, "step": 7914 }, { "epoch": 3.580637864736485, "grad_norm": 0.48011380434036255, "learning_rate": 7.4322221684297215e-06, "loss": 0.5669, "step": 7915 }, { "epoch": 3.581090251074418, "grad_norm": 0.4555760622024536, "learning_rate": 7.4315819301386e-06, "loss": 0.5561, "step": 7916 }, { "epoch": 3.5815426374123502, "grad_norm": 0.4428521990776062, "learning_rate": 7.43094163962462e-06, "loss": 0.5583, "step": 7917 }, { "epoch": 3.5819950237502827, "grad_norm": 0.4382707476615906, "learning_rate": 7.430301296901533e-06, "loss": 0.4735, "step": 7918 }, { "epoch": 3.582447410088215, "grad_norm": 0.45344677567481995, "learning_rate": 7.429660901983092e-06, "loss": 0.6289, "step": 7919 }, { "epoch": 3.582899796426148, "grad_norm": 0.4293152391910553, "learning_rate": 7.4290204548830516e-06, "loss": 0.557, "step": 7920 }, { "epoch": 3.5833521827640804, "grad_norm": 0.46544885635375977, "learning_rate": 7.428379955615166e-06, "loss": 0.6016, "step": 7921 }, { "epoch": 3.5838045691020133, "grad_norm": 0.4226466417312622, "learning_rate": 7.427739404193189e-06, "loss": 0.5303, "step": 7922 }, { "epoch": 3.5842569554399457, "grad_norm": 0.4181806743144989, "learning_rate": 7.427098800630881e-06, "loss": 0.4307, "step": 7923 }, { "epoch": 3.584709341777878, "grad_norm": 0.445878803730011, "learning_rate": 7.426458144941998e-06, "loss": 0.4754, "step": 7924 }, { "epoch": 3.585161728115811, "grad_norm": 0.49634766578674316, "learning_rate": 7.4258174371403014e-06, "loss": 0.5936, "step": 7925 }, { "epoch": 3.5856141144537435, "grad_norm": 0.47695693373680115, "learning_rate": 7.42517667723955e-06, "loss": 0.5816, "step": 7926 }, { "epoch": 3.5860665007916763, "grad_norm": 0.49549004435539246, "learning_rate": 7.424535865253505e-06, "loss": 0.5091, "step": 7927 }, { "epoch": 3.5865188871296088, "grad_norm": 0.5408452749252319, "learning_rate": 7.423895001195929e-06, "loss": 0.4876, "step": 7928 }, { "epoch": 3.586971273467541, "grad_norm": 0.5154067277908325, "learning_rate": 7.423254085080587e-06, "loss": 0.485, "step": 7929 }, { "epoch": 3.5874236598054736, "grad_norm": 0.5343160629272461, "learning_rate": 7.422613116921244e-06, "loss": 0.4934, "step": 7930 }, { "epoch": 3.5878760461434065, "grad_norm": 0.5917600989341736, "learning_rate": 7.4219720967316635e-06, "loss": 0.5215, "step": 7931 }, { "epoch": 3.588328432481339, "grad_norm": 0.49615421891212463, "learning_rate": 7.421331024525616e-06, "loss": 1.1529, "step": 7932 }, { "epoch": 3.588780818819272, "grad_norm": 0.2635614573955536, "learning_rate": 7.4206899003168676e-06, "loss": 1.0612, "step": 7933 }, { "epoch": 3.5892332051572042, "grad_norm": 0.2773010730743408, "learning_rate": 7.420048724119188e-06, "loss": 0.6099, "step": 7934 }, { "epoch": 3.5896855914951367, "grad_norm": 0.31865784525871277, "learning_rate": 7.419407495946348e-06, "loss": 0.6777, "step": 7935 }, { "epoch": 3.5901379778330695, "grad_norm": 0.3671024441719055, "learning_rate": 7.41876621581212e-06, "loss": 0.6777, "step": 7936 }, { "epoch": 3.590590364171002, "grad_norm": 0.31000280380249023, "learning_rate": 7.418124883730275e-06, "loss": 0.5088, "step": 7937 }, { "epoch": 3.591042750508935, "grad_norm": 0.32886332273483276, "learning_rate": 7.417483499714589e-06, "loss": 0.728, "step": 7938 }, { "epoch": 3.5914951368468673, "grad_norm": 0.3591380715370178, "learning_rate": 7.4168420637788355e-06, "loss": 0.6617, "step": 7939 }, { "epoch": 3.5919475231847997, "grad_norm": 0.3317001163959503, "learning_rate": 7.416200575936789e-06, "loss": 0.5774, "step": 7940 }, { "epoch": 3.592399909522732, "grad_norm": 0.3423134684562683, "learning_rate": 7.415559036202229e-06, "loss": 0.6498, "step": 7941 }, { "epoch": 3.592852295860665, "grad_norm": 0.31594839692115784, "learning_rate": 7.414917444588934e-06, "loss": 0.4757, "step": 7942 }, { "epoch": 3.5933046821985974, "grad_norm": 0.32970407605171204, "learning_rate": 7.414275801110682e-06, "loss": 0.48, "step": 7943 }, { "epoch": 3.5937570685365303, "grad_norm": 0.33815574645996094, "learning_rate": 7.413634105781253e-06, "loss": 0.5756, "step": 7944 }, { "epoch": 3.5942094548744628, "grad_norm": 0.38061806559562683, "learning_rate": 7.412992358614431e-06, "loss": 0.6537, "step": 7945 }, { "epoch": 3.594661841212395, "grad_norm": 0.35719606280326843, "learning_rate": 7.412350559623998e-06, "loss": 0.5404, "step": 7946 }, { "epoch": 3.595114227550328, "grad_norm": 0.3897092044353485, "learning_rate": 7.411708708823737e-06, "loss": 0.5605, "step": 7947 }, { "epoch": 3.5955666138882605, "grad_norm": 0.3299182057380676, "learning_rate": 7.411066806227433e-06, "loss": 0.5115, "step": 7948 }, { "epoch": 3.5960190002261934, "grad_norm": 0.4048079550266266, "learning_rate": 7.410424851848872e-06, "loss": 0.5976, "step": 7949 }, { "epoch": 3.596471386564126, "grad_norm": 0.39450380206108093, "learning_rate": 7.409782845701843e-06, "loss": 0.6, "step": 7950 }, { "epoch": 3.5969237729020582, "grad_norm": 0.36188963055610657, "learning_rate": 7.409140787800131e-06, "loss": 0.4855, "step": 7951 }, { "epoch": 3.597376159239991, "grad_norm": 0.4007405638694763, "learning_rate": 7.408498678157528e-06, "loss": 0.5083, "step": 7952 }, { "epoch": 3.5978285455779235, "grad_norm": 0.40864884853363037, "learning_rate": 7.407856516787824e-06, "loss": 0.591, "step": 7953 }, { "epoch": 3.5982809319158564, "grad_norm": 0.4020937979221344, "learning_rate": 7.407214303704809e-06, "loss": 0.4901, "step": 7954 }, { "epoch": 3.598733318253789, "grad_norm": 0.3531137704849243, "learning_rate": 7.40657203892228e-06, "loss": 0.5099, "step": 7955 }, { "epoch": 3.5991857045917213, "grad_norm": 0.4110317826271057, "learning_rate": 7.405929722454026e-06, "loss": 0.5607, "step": 7956 }, { "epoch": 3.5996380909296537, "grad_norm": 0.39981508255004883, "learning_rate": 7.405287354313844e-06, "loss": 0.5651, "step": 7957 }, { "epoch": 3.6000904772675866, "grad_norm": 0.40240707993507385, "learning_rate": 7.4046449345155305e-06, "loss": 0.473, "step": 7958 }, { "epoch": 3.600542863605519, "grad_norm": 0.36526355147361755, "learning_rate": 7.404002463072882e-06, "loss": 0.4768, "step": 7959 }, { "epoch": 3.600995249943452, "grad_norm": 0.4620055854320526, "learning_rate": 7.403359939999699e-06, "loss": 0.709, "step": 7960 }, { "epoch": 3.6014476362813843, "grad_norm": 0.39952588081359863, "learning_rate": 7.402717365309777e-06, "loss": 0.5791, "step": 7961 }, { "epoch": 3.6019000226193167, "grad_norm": 0.36832794547080994, "learning_rate": 7.40207473901692e-06, "loss": 0.4372, "step": 7962 }, { "epoch": 3.6023524089572496, "grad_norm": 0.43917912244796753, "learning_rate": 7.4014320611349266e-06, "loss": 0.5801, "step": 7963 }, { "epoch": 3.602804795295182, "grad_norm": 0.4057979881763458, "learning_rate": 7.4007893316776025e-06, "loss": 0.4725, "step": 7964 }, { "epoch": 3.603257181633115, "grad_norm": 0.425838440656662, "learning_rate": 7.400146550658749e-06, "loss": 0.538, "step": 7965 }, { "epoch": 3.6037095679710474, "grad_norm": 0.47238224744796753, "learning_rate": 7.399503718092173e-06, "loss": 0.5905, "step": 7966 }, { "epoch": 3.60416195430898, "grad_norm": 0.4194088578224182, "learning_rate": 7.398860833991679e-06, "loss": 0.5139, "step": 7967 }, { "epoch": 3.604614340646912, "grad_norm": 0.45221641659736633, "learning_rate": 7.398217898371074e-06, "loss": 0.5418, "step": 7968 }, { "epoch": 3.605066726984845, "grad_norm": 0.4252311587333679, "learning_rate": 7.3975749112441685e-06, "loss": 0.4595, "step": 7969 }, { "epoch": 3.6055191133227775, "grad_norm": 0.3855815827846527, "learning_rate": 7.3969318726247705e-06, "loss": 0.4476, "step": 7970 }, { "epoch": 3.6059714996607104, "grad_norm": 0.4757993817329407, "learning_rate": 7.396288782526692e-06, "loss": 0.516, "step": 7971 }, { "epoch": 3.606423885998643, "grad_norm": 0.4653041362762451, "learning_rate": 7.395645640963741e-06, "loss": 0.515, "step": 7972 }, { "epoch": 3.6068762723365753, "grad_norm": 0.5263321399688721, "learning_rate": 7.395002447949733e-06, "loss": 0.5645, "step": 7973 }, { "epoch": 3.607328658674508, "grad_norm": 0.42678168416023254, "learning_rate": 7.39435920349848e-06, "loss": 0.4987, "step": 7974 }, { "epoch": 3.6077810450124406, "grad_norm": 0.47067171335220337, "learning_rate": 7.393715907623799e-06, "loss": 0.5143, "step": 7975 }, { "epoch": 3.6082334313503734, "grad_norm": 0.48431333899497986, "learning_rate": 7.393072560339505e-06, "loss": 0.5694, "step": 7976 }, { "epoch": 3.608685817688306, "grad_norm": 0.5195887088775635, "learning_rate": 7.392429161659414e-06, "loss": 0.5684, "step": 7977 }, { "epoch": 3.6091382040262383, "grad_norm": 0.5419626832008362, "learning_rate": 7.391785711597347e-06, "loss": 0.4918, "step": 7978 }, { "epoch": 3.6095905903641707, "grad_norm": 0.5012105703353882, "learning_rate": 7.391142210167121e-06, "loss": 0.4299, "step": 7979 }, { "epoch": 3.6100429767021036, "grad_norm": 0.5166043639183044, "learning_rate": 7.390498657382557e-06, "loss": 0.4918, "step": 7980 }, { "epoch": 3.610495363040036, "grad_norm": 0.637766420841217, "learning_rate": 7.3898550532574765e-06, "loss": 0.5829, "step": 7981 }, { "epoch": 3.610947749377969, "grad_norm": 0.49396124482154846, "learning_rate": 7.389211397805702e-06, "loss": 1.2799, "step": 7982 }, { "epoch": 3.6114001357159013, "grad_norm": 0.25178423523902893, "learning_rate": 7.388567691041059e-06, "loss": 0.7907, "step": 7983 }, { "epoch": 3.611852522053834, "grad_norm": 0.2226138561964035, "learning_rate": 7.3879239329773685e-06, "loss": 0.5291, "step": 7984 }, { "epoch": 3.6123049083917667, "grad_norm": 0.28889238834381104, "learning_rate": 7.38728012362846e-06, "loss": 0.5678, "step": 7985 }, { "epoch": 3.612757294729699, "grad_norm": 0.3109738826751709, "learning_rate": 7.386636263008159e-06, "loss": 0.6746, "step": 7986 }, { "epoch": 3.613209681067632, "grad_norm": 0.309466153383255, "learning_rate": 7.385992351130293e-06, "loss": 0.5859, "step": 7987 }, { "epoch": 3.6136620674055644, "grad_norm": 0.3221946060657501, "learning_rate": 7.385348388008693e-06, "loss": 0.718, "step": 7988 }, { "epoch": 3.614114453743497, "grad_norm": 0.3153648376464844, "learning_rate": 7.384704373657189e-06, "loss": 0.4933, "step": 7989 }, { "epoch": 3.6145668400814297, "grad_norm": 0.3063473701477051, "learning_rate": 7.384060308089611e-06, "loss": 0.5333, "step": 7990 }, { "epoch": 3.615019226419362, "grad_norm": 0.32574746012687683, "learning_rate": 7.3834161913197925e-06, "loss": 0.5417, "step": 7991 }, { "epoch": 3.615471612757295, "grad_norm": 0.3421747386455536, "learning_rate": 7.382772023361566e-06, "loss": 0.5904, "step": 7992 }, { "epoch": 3.6159239990952274, "grad_norm": 0.33860382437705994, "learning_rate": 7.382127804228769e-06, "loss": 0.5472, "step": 7993 }, { "epoch": 3.61637638543316, "grad_norm": 0.4078325033187866, "learning_rate": 7.381483533935236e-06, "loss": 0.7291, "step": 7994 }, { "epoch": 3.6168287717710923, "grad_norm": 0.37942567467689514, "learning_rate": 7.380839212494803e-06, "loss": 0.5852, "step": 7995 }, { "epoch": 3.617281158109025, "grad_norm": 0.3817750811576843, "learning_rate": 7.380194839921307e-06, "loss": 0.5719, "step": 7996 }, { "epoch": 3.6177335444469576, "grad_norm": 0.3546896278858185, "learning_rate": 7.379550416228591e-06, "loss": 0.5058, "step": 7997 }, { "epoch": 3.6181859307848905, "grad_norm": 0.3706839084625244, "learning_rate": 7.378905941430491e-06, "loss": 0.5493, "step": 7998 }, { "epoch": 3.618638317122823, "grad_norm": 0.4078243374824524, "learning_rate": 7.3782614155408505e-06, "loss": 0.5855, "step": 7999 }, { "epoch": 3.6190907034607553, "grad_norm": 0.40602192282676697, "learning_rate": 7.377616838573511e-06, "loss": 0.6123, "step": 8000 }, { "epoch": 3.6190907034607553, "eval_loss": 0.5907747745513916, "eval_runtime": 25.9912, "eval_samples_per_second": 28.625, "eval_steps_per_second": 7.156, "step": 8000 }, { "epoch": 3.619543089798688, "grad_norm": 0.3787628710269928, "learning_rate": 7.376972210542319e-06, "loss": 0.5238, "step": 8001 }, { "epoch": 3.6199954761366206, "grad_norm": 0.3679898679256439, "learning_rate": 7.376327531461116e-06, "loss": 0.5485, "step": 8002 }, { "epoch": 3.6204478624745535, "grad_norm": 0.4177369773387909, "learning_rate": 7.375682801343747e-06, "loss": 0.5901, "step": 8003 }, { "epoch": 3.620900248812486, "grad_norm": 0.40289631485939026, "learning_rate": 7.375038020204062e-06, "loss": 0.5775, "step": 8004 }, { "epoch": 3.6213526351504184, "grad_norm": 0.382504940032959, "learning_rate": 7.374393188055908e-06, "loss": 0.5215, "step": 8005 }, { "epoch": 3.621805021488351, "grad_norm": 0.38878870010375977, "learning_rate": 7.373748304913131e-06, "loss": 0.5317, "step": 8006 }, { "epoch": 3.6222574078262837, "grad_norm": 0.37624138593673706, "learning_rate": 7.3731033707895845e-06, "loss": 0.5174, "step": 8007 }, { "epoch": 3.622709794164216, "grad_norm": 0.4515194594860077, "learning_rate": 7.372458385699118e-06, "loss": 0.6567, "step": 8008 }, { "epoch": 3.623162180502149, "grad_norm": 0.3813580572605133, "learning_rate": 7.371813349655585e-06, "loss": 0.5051, "step": 8009 }, { "epoch": 3.6236145668400814, "grad_norm": 0.42061060667037964, "learning_rate": 7.371168262672837e-06, "loss": 0.6026, "step": 8010 }, { "epoch": 3.624066953178014, "grad_norm": 0.4293801784515381, "learning_rate": 7.3705231247647314e-06, "loss": 0.497, "step": 8011 }, { "epoch": 3.6245193395159467, "grad_norm": 0.39885246753692627, "learning_rate": 7.3698779359451215e-06, "loss": 0.504, "step": 8012 }, { "epoch": 3.624971725853879, "grad_norm": 0.38914036750793457, "learning_rate": 7.369232696227863e-06, "loss": 0.5282, "step": 8013 }, { "epoch": 3.625424112191812, "grad_norm": 0.40935206413269043, "learning_rate": 7.368587405626817e-06, "loss": 0.546, "step": 8014 }, { "epoch": 3.6258764985297445, "grad_norm": 0.4580225348472595, "learning_rate": 7.36794206415584e-06, "loss": 0.5436, "step": 8015 }, { "epoch": 3.626328884867677, "grad_norm": 0.4203220009803772, "learning_rate": 7.367296671828792e-06, "loss": 0.4743, "step": 8016 }, { "epoch": 3.6267812712056093, "grad_norm": 0.4460139274597168, "learning_rate": 7.366651228659535e-06, "loss": 0.5304, "step": 8017 }, { "epoch": 3.627233657543542, "grad_norm": 0.4083365499973297, "learning_rate": 7.366005734661931e-06, "loss": 0.5196, "step": 8018 }, { "epoch": 3.6276860438814746, "grad_norm": 0.47105827927589417, "learning_rate": 7.365360189849842e-06, "loss": 0.5743, "step": 8019 }, { "epoch": 3.6281384302194075, "grad_norm": 0.41839680075645447, "learning_rate": 7.364714594237132e-06, "loss": 0.5245, "step": 8020 }, { "epoch": 3.62859081655734, "grad_norm": 0.43675798177719116, "learning_rate": 7.3640689478376695e-06, "loss": 0.5128, "step": 8021 }, { "epoch": 3.6290432028952724, "grad_norm": 0.4142839312553406, "learning_rate": 7.363423250665317e-06, "loss": 0.5143, "step": 8022 }, { "epoch": 3.6294955892332053, "grad_norm": 0.4801170527935028, "learning_rate": 7.3627775027339455e-06, "loss": 0.5366, "step": 8023 }, { "epoch": 3.6299479755711377, "grad_norm": 0.46761390566825867, "learning_rate": 7.362131704057421e-06, "loss": 0.5009, "step": 8024 }, { "epoch": 3.6304003619090706, "grad_norm": 0.5002704858779907, "learning_rate": 7.361485854649616e-06, "loss": 0.4727, "step": 8025 }, { "epoch": 3.630852748247003, "grad_norm": 0.4677130877971649, "learning_rate": 7.3608399545243996e-06, "loss": 0.4618, "step": 8026 }, { "epoch": 3.6313051345849354, "grad_norm": 0.4767751097679138, "learning_rate": 7.360194003695643e-06, "loss": 0.5429, "step": 8027 }, { "epoch": 3.631757520922868, "grad_norm": 0.507372260093689, "learning_rate": 7.359548002177221e-06, "loss": 0.5672, "step": 8028 }, { "epoch": 3.6322099072608007, "grad_norm": 0.5577520728111267, "learning_rate": 7.358901949983007e-06, "loss": 0.5524, "step": 8029 }, { "epoch": 3.632662293598733, "grad_norm": 0.5295104384422302, "learning_rate": 7.358255847126876e-06, "loss": 0.4909, "step": 8030 }, { "epoch": 3.633114679936666, "grad_norm": 0.5974236726760864, "learning_rate": 7.357609693622704e-06, "loss": 0.5191, "step": 8031 }, { "epoch": 3.6335670662745985, "grad_norm": 0.4920061230659485, "learning_rate": 7.356963489484369e-06, "loss": 0.9042, "step": 8032 }, { "epoch": 3.634019452612531, "grad_norm": 0.17678789794445038, "learning_rate": 7.3563172347257495e-06, "loss": 0.7593, "step": 8033 }, { "epoch": 3.6344718389504638, "grad_norm": 0.23165816068649292, "learning_rate": 7.355670929360724e-06, "loss": 0.6115, "step": 8034 }, { "epoch": 3.634924225288396, "grad_norm": 0.2954132854938507, "learning_rate": 7.355024573403174e-06, "loss": 0.5985, "step": 8035 }, { "epoch": 3.635376611626329, "grad_norm": 0.2756202816963196, "learning_rate": 7.354378166866982e-06, "loss": 0.527, "step": 8036 }, { "epoch": 3.6358289979642615, "grad_norm": 0.3155917525291443, "learning_rate": 7.35373170976603e-06, "loss": 0.6297, "step": 8037 }, { "epoch": 3.636281384302194, "grad_norm": 0.32882311940193176, "learning_rate": 7.353085202114201e-06, "loss": 0.6448, "step": 8038 }, { "epoch": 3.636733770640127, "grad_norm": 0.2992655038833618, "learning_rate": 7.352438643925382e-06, "loss": 0.627, "step": 8039 }, { "epoch": 3.6371861569780592, "grad_norm": 0.3205466866493225, "learning_rate": 7.351792035213458e-06, "loss": 0.542, "step": 8040 }, { "epoch": 3.637638543315992, "grad_norm": 0.30229902267456055, "learning_rate": 7.351145375992315e-06, "loss": 0.5785, "step": 8041 }, { "epoch": 3.6380909296539246, "grad_norm": 0.3184596300125122, "learning_rate": 7.350498666275842e-06, "loss": 0.4646, "step": 8042 }, { "epoch": 3.638543315991857, "grad_norm": 0.3284457325935364, "learning_rate": 7.349851906077929e-06, "loss": 0.7688, "step": 8043 }, { "epoch": 3.6389957023297894, "grad_norm": 0.34900757670402527, "learning_rate": 7.349205095412466e-06, "loss": 0.4925, "step": 8044 }, { "epoch": 3.6394480886677223, "grad_norm": 0.3589272201061249, "learning_rate": 7.348558234293346e-06, "loss": 0.6525, "step": 8045 }, { "epoch": 3.6399004750056547, "grad_norm": 0.310517817735672, "learning_rate": 7.347911322734459e-06, "loss": 0.4618, "step": 8046 }, { "epoch": 3.6403528613435876, "grad_norm": 0.37712520360946655, "learning_rate": 7.3472643607497e-06, "loss": 0.5573, "step": 8047 }, { "epoch": 3.64080524768152, "grad_norm": 0.38127273321151733, "learning_rate": 7.346617348352962e-06, "loss": 0.6181, "step": 8048 }, { "epoch": 3.6412576340194525, "grad_norm": 0.35615721344947815, "learning_rate": 7.345970285558144e-06, "loss": 0.5675, "step": 8049 }, { "epoch": 3.6417100203573853, "grad_norm": 0.3499981760978699, "learning_rate": 7.345323172379142e-06, "loss": 0.4755, "step": 8050 }, { "epoch": 3.6421624066953178, "grad_norm": 0.4087279438972473, "learning_rate": 7.344676008829852e-06, "loss": 0.5561, "step": 8051 }, { "epoch": 3.6426147930332506, "grad_norm": 0.40525156259536743, "learning_rate": 7.344028794924175e-06, "loss": 0.6857, "step": 8052 }, { "epoch": 3.643067179371183, "grad_norm": 0.3773857057094574, "learning_rate": 7.343381530676011e-06, "loss": 0.5332, "step": 8053 }, { "epoch": 3.6435195657091155, "grad_norm": 0.37969616055488586, "learning_rate": 7.342734216099258e-06, "loss": 0.5029, "step": 8054 }, { "epoch": 3.643971952047048, "grad_norm": 0.3832426071166992, "learning_rate": 7.342086851207824e-06, "loss": 0.5505, "step": 8055 }, { "epoch": 3.644424338384981, "grad_norm": 0.40210285782814026, "learning_rate": 7.3414394360156075e-06, "loss": 0.5233, "step": 8056 }, { "epoch": 3.6448767247229132, "grad_norm": 0.37602442502975464, "learning_rate": 7.340791970536516e-06, "loss": 0.4573, "step": 8057 }, { "epoch": 3.645329111060846, "grad_norm": 0.4019063413143158, "learning_rate": 7.340144454784452e-06, "loss": 0.5923, "step": 8058 }, { "epoch": 3.6457814973987785, "grad_norm": 0.4260135591030121, "learning_rate": 7.339496888773326e-06, "loss": 0.6866, "step": 8059 }, { "epoch": 3.646233883736711, "grad_norm": 0.40958094596862793, "learning_rate": 7.338849272517043e-06, "loss": 0.5151, "step": 8060 }, { "epoch": 3.646686270074644, "grad_norm": 0.3870633840560913, "learning_rate": 7.338201606029514e-06, "loss": 0.4578, "step": 8061 }, { "epoch": 3.6471386564125763, "grad_norm": 0.38912516832351685, "learning_rate": 7.337553889324646e-06, "loss": 0.4529, "step": 8062 }, { "epoch": 3.647591042750509, "grad_norm": 0.40352484583854675, "learning_rate": 7.336906122416353e-06, "loss": 0.5744, "step": 8063 }, { "epoch": 3.6480434290884416, "grad_norm": 0.41110724210739136, "learning_rate": 7.336258305318544e-06, "loss": 0.5476, "step": 8064 }, { "epoch": 3.648495815426374, "grad_norm": 0.4541831612586975, "learning_rate": 7.335610438045135e-06, "loss": 0.6679, "step": 8065 }, { "epoch": 3.6489482017643065, "grad_norm": 0.4319208264350891, "learning_rate": 7.334962520610037e-06, "loss": 0.5355, "step": 8066 }, { "epoch": 3.6494005881022393, "grad_norm": 0.4598018527030945, "learning_rate": 7.334314553027168e-06, "loss": 0.5968, "step": 8067 }, { "epoch": 3.6498529744401718, "grad_norm": 0.4662656784057617, "learning_rate": 7.333666535310442e-06, "loss": 0.581, "step": 8068 }, { "epoch": 3.6503053607781046, "grad_norm": 0.41126206517219543, "learning_rate": 7.3330184674737795e-06, "loss": 0.5556, "step": 8069 }, { "epoch": 3.650757747116037, "grad_norm": 0.410910040140152, "learning_rate": 7.332370349531096e-06, "loss": 0.4482, "step": 8070 }, { "epoch": 3.6512101334539695, "grad_norm": 0.40691912174224854, "learning_rate": 7.3317221814963125e-06, "loss": 0.4974, "step": 8071 }, { "epoch": 3.6516625197919024, "grad_norm": 0.4743020832538605, "learning_rate": 7.331073963383351e-06, "loss": 0.5449, "step": 8072 }, { "epoch": 3.652114906129835, "grad_norm": 0.45938006043434143, "learning_rate": 7.33042569520613e-06, "loss": 0.5533, "step": 8073 }, { "epoch": 3.6525672924677677, "grad_norm": 0.443852961063385, "learning_rate": 7.329777376978575e-06, "loss": 0.5052, "step": 8074 }, { "epoch": 3.6530196788057, "grad_norm": 0.47358375787734985, "learning_rate": 7.329129008714609e-06, "loss": 0.5377, "step": 8075 }, { "epoch": 3.6534720651436325, "grad_norm": 0.4921081066131592, "learning_rate": 7.328480590428155e-06, "loss": 0.5328, "step": 8076 }, { "epoch": 3.6539244514815654, "grad_norm": 0.5169853568077087, "learning_rate": 7.3278321221331415e-06, "loss": 0.5105, "step": 8077 }, { "epoch": 3.654376837819498, "grad_norm": 0.525782585144043, "learning_rate": 7.327183603843495e-06, "loss": 0.526, "step": 8078 }, { "epoch": 3.6548292241574307, "grad_norm": 0.48911771178245544, "learning_rate": 7.326535035573144e-06, "loss": 0.4886, "step": 8079 }, { "epoch": 3.655281610495363, "grad_norm": 0.5510890483856201, "learning_rate": 7.325886417336017e-06, "loss": 0.5422, "step": 8080 }, { "epoch": 3.6557339968332956, "grad_norm": 0.5326504111289978, "learning_rate": 7.325237749146045e-06, "loss": 0.4882, "step": 8081 }, { "epoch": 3.656186383171228, "grad_norm": 0.6171855926513672, "learning_rate": 7.324589031017158e-06, "loss": 1.1935, "step": 8082 }, { "epoch": 3.656638769509161, "grad_norm": 0.22983230650424957, "learning_rate": 7.323940262963292e-06, "loss": 0.5625, "step": 8083 }, { "epoch": 3.6570911558470933, "grad_norm": 0.22876764833927155, "learning_rate": 7.323291444998376e-06, "loss": 0.6021, "step": 8084 }, { "epoch": 3.657543542185026, "grad_norm": 0.3138095438480377, "learning_rate": 7.3226425771363475e-06, "loss": 0.633, "step": 8085 }, { "epoch": 3.6579959285229586, "grad_norm": 0.32829660177230835, "learning_rate": 7.32199365939114e-06, "loss": 0.6732, "step": 8086 }, { "epoch": 3.658448314860891, "grad_norm": 0.3030560314655304, "learning_rate": 7.321344691776692e-06, "loss": 0.5394, "step": 8087 }, { "epoch": 3.658900701198824, "grad_norm": 0.30835896730422974, "learning_rate": 7.320695674306941e-06, "loss": 0.6309, "step": 8088 }, { "epoch": 3.6593530875367564, "grad_norm": 0.33463191986083984, "learning_rate": 7.320046606995825e-06, "loss": 0.607, "step": 8089 }, { "epoch": 3.6598054738746892, "grad_norm": 0.31024298071861267, "learning_rate": 7.3193974898572866e-06, "loss": 0.5793, "step": 8090 }, { "epoch": 3.6602578602126217, "grad_norm": 0.3066910207271576, "learning_rate": 7.318748322905263e-06, "loss": 0.5623, "step": 8091 }, { "epoch": 3.660710246550554, "grad_norm": 0.39231589436531067, "learning_rate": 7.318099106153699e-06, "loss": 0.7165, "step": 8092 }, { "epoch": 3.6611626328884865, "grad_norm": 0.32408860325813293, "learning_rate": 7.317449839616537e-06, "loss": 0.4728, "step": 8093 }, { "epoch": 3.6616150192264194, "grad_norm": 0.3502448797225952, "learning_rate": 7.3168005233077214e-06, "loss": 0.601, "step": 8094 }, { "epoch": 3.662067405564352, "grad_norm": 0.3578130006790161, "learning_rate": 7.316151157241197e-06, "loss": 0.6642, "step": 8095 }, { "epoch": 3.6625197919022847, "grad_norm": 0.36550846695899963, "learning_rate": 7.31550174143091e-06, "loss": 0.5288, "step": 8096 }, { "epoch": 3.662972178240217, "grad_norm": 0.34662291407585144, "learning_rate": 7.31485227589081e-06, "loss": 0.5714, "step": 8097 }, { "epoch": 3.6634245645781496, "grad_norm": 0.3620257079601288, "learning_rate": 7.314202760634843e-06, "loss": 0.5602, "step": 8098 }, { "epoch": 3.6638769509160825, "grad_norm": 0.39462295174598694, "learning_rate": 7.313553195676959e-06, "loss": 0.6496, "step": 8099 }, { "epoch": 3.664329337254015, "grad_norm": 0.3862362205982208, "learning_rate": 7.312903581031109e-06, "loss": 0.6782, "step": 8100 }, { "epoch": 3.6647817235919478, "grad_norm": 0.38081520795822144, "learning_rate": 7.312253916711245e-06, "loss": 0.6229, "step": 8101 }, { "epoch": 3.66523410992988, "grad_norm": 0.3608816862106323, "learning_rate": 7.311604202731319e-06, "loss": 0.5134, "step": 8102 }, { "epoch": 3.6656864962678126, "grad_norm": 0.3380156457424164, "learning_rate": 7.310954439105287e-06, "loss": 0.4367, "step": 8103 }, { "epoch": 3.666138882605745, "grad_norm": 0.38391658663749695, "learning_rate": 7.3103046258471e-06, "loss": 0.5435, "step": 8104 }, { "epoch": 3.666591268943678, "grad_norm": 0.4017661511898041, "learning_rate": 7.309654762970718e-06, "loss": 0.593, "step": 8105 }, { "epoch": 3.6670436552816104, "grad_norm": 0.446852445602417, "learning_rate": 7.309004850490096e-06, "loss": 0.6003, "step": 8106 }, { "epoch": 3.6674960416195432, "grad_norm": 0.38102754950523376, "learning_rate": 7.308354888419193e-06, "loss": 0.5509, "step": 8107 }, { "epoch": 3.6679484279574757, "grad_norm": 0.36832159757614136, "learning_rate": 7.307704876771967e-06, "loss": 0.5164, "step": 8108 }, { "epoch": 3.668400814295408, "grad_norm": 0.34982866048812866, "learning_rate": 7.30705481556238e-06, "loss": 0.4239, "step": 8109 }, { "epoch": 3.668853200633341, "grad_norm": 0.4251924753189087, "learning_rate": 7.306404704804391e-06, "loss": 0.5924, "step": 8110 }, { "epoch": 3.6693055869712734, "grad_norm": 0.36819538474082947, "learning_rate": 7.305754544511963e-06, "loss": 0.442, "step": 8111 }, { "epoch": 3.6697579733092063, "grad_norm": 0.39804840087890625, "learning_rate": 7.30510433469906e-06, "loss": 0.4842, "step": 8112 }, { "epoch": 3.6702103596471387, "grad_norm": 0.36821407079696655, "learning_rate": 7.3044540753796474e-06, "loss": 0.4794, "step": 8113 }, { "epoch": 3.670662745985071, "grad_norm": 0.39177945256233215, "learning_rate": 7.30380376656769e-06, "loss": 0.4638, "step": 8114 }, { "epoch": 3.6711151323230036, "grad_norm": 0.42073002457618713, "learning_rate": 7.303153408277154e-06, "loss": 0.5026, "step": 8115 }, { "epoch": 3.6715675186609364, "grad_norm": 0.37539178133010864, "learning_rate": 7.302503000522007e-06, "loss": 0.4879, "step": 8116 }, { "epoch": 3.672019904998869, "grad_norm": 0.49487096071243286, "learning_rate": 7.3018525433162175e-06, "loss": 0.6283, "step": 8117 }, { "epoch": 3.6724722913368018, "grad_norm": 0.37581101059913635, "learning_rate": 7.301202036673758e-06, "loss": 0.3964, "step": 8118 }, { "epoch": 3.672924677674734, "grad_norm": 0.40887850522994995, "learning_rate": 7.300551480608595e-06, "loss": 0.4288, "step": 8119 }, { "epoch": 3.6733770640126666, "grad_norm": 0.41293781995773315, "learning_rate": 7.299900875134705e-06, "loss": 0.4714, "step": 8120 }, { "epoch": 3.6738294503505995, "grad_norm": 0.4613214135169983, "learning_rate": 7.2992502202660564e-06, "loss": 0.5582, "step": 8121 }, { "epoch": 3.674281836688532, "grad_norm": 0.4642995297908783, "learning_rate": 7.298599516016627e-06, "loss": 0.5575, "step": 8122 }, { "epoch": 3.674734223026465, "grad_norm": 0.4262576699256897, "learning_rate": 7.297948762400389e-06, "loss": 0.46, "step": 8123 }, { "epoch": 3.6751866093643972, "grad_norm": 0.4506663680076599, "learning_rate": 7.2972979594313205e-06, "loss": 0.4456, "step": 8124 }, { "epoch": 3.6756389957023297, "grad_norm": 0.5317422151565552, "learning_rate": 7.296647107123399e-06, "loss": 0.5748, "step": 8125 }, { "epoch": 3.6760913820402625, "grad_norm": 0.47956573963165283, "learning_rate": 7.2959962054906004e-06, "loss": 0.5869, "step": 8126 }, { "epoch": 3.676543768378195, "grad_norm": 0.4973604083061218, "learning_rate": 7.295345254546908e-06, "loss": 0.4829, "step": 8127 }, { "epoch": 3.676996154716128, "grad_norm": 0.48647892475128174, "learning_rate": 7.2946942543062985e-06, "loss": 0.4893, "step": 8128 }, { "epoch": 3.6774485410540603, "grad_norm": 0.504148006439209, "learning_rate": 7.294043204782756e-06, "loss": 0.534, "step": 8129 }, { "epoch": 3.6779009273919927, "grad_norm": 0.6241506338119507, "learning_rate": 7.293392105990261e-06, "loss": 0.6634, "step": 8130 }, { "epoch": 3.678353313729925, "grad_norm": 0.5080744624137878, "learning_rate": 7.292740957942798e-06, "loss": 0.4543, "step": 8131 }, { "epoch": 3.678805700067858, "grad_norm": 0.3929700255393982, "learning_rate": 7.292089760654352e-06, "loss": 1.032, "step": 8132 }, { "epoch": 3.6792580864057904, "grad_norm": 0.19751910865306854, "learning_rate": 7.291438514138907e-06, "loss": 1.0368, "step": 8133 }, { "epoch": 3.6797104727437233, "grad_norm": 0.22376325726509094, "learning_rate": 7.290787218410452e-06, "loss": 0.5466, "step": 8134 }, { "epoch": 3.6801628590816557, "grad_norm": 0.2763814628124237, "learning_rate": 7.290135873482974e-06, "loss": 0.5928, "step": 8135 }, { "epoch": 3.680615245419588, "grad_norm": 0.3452487885951996, "learning_rate": 7.289484479370461e-06, "loss": 0.6747, "step": 8136 }, { "epoch": 3.681067631757521, "grad_norm": 0.3109634518623352, "learning_rate": 7.288833036086904e-06, "loss": 0.6104, "step": 8137 }, { "epoch": 3.6815200180954535, "grad_norm": 0.3379506766796112, "learning_rate": 7.288181543646293e-06, "loss": 0.5998, "step": 8138 }, { "epoch": 3.6819724044333864, "grad_norm": 0.28279730677604675, "learning_rate": 7.2875300020626215e-06, "loss": 0.5268, "step": 8139 }, { "epoch": 3.682424790771319, "grad_norm": 0.3457562029361725, "learning_rate": 7.286878411349882e-06, "loss": 0.5739, "step": 8140 }, { "epoch": 3.682877177109251, "grad_norm": 0.3364446759223938, "learning_rate": 7.286226771522069e-06, "loss": 0.521, "step": 8141 }, { "epoch": 3.6833295634471837, "grad_norm": 0.33139559626579285, "learning_rate": 7.285575082593176e-06, "loss": 0.5807, "step": 8142 }, { "epoch": 3.6837819497851165, "grad_norm": 0.32579880952835083, "learning_rate": 7.284923344577201e-06, "loss": 0.5882, "step": 8143 }, { "epoch": 3.684234336123049, "grad_norm": 0.38515767455101013, "learning_rate": 7.284271557488141e-06, "loss": 0.654, "step": 8144 }, { "epoch": 3.684686722460982, "grad_norm": 0.36165884137153625, "learning_rate": 7.283619721339992e-06, "loss": 0.6241, "step": 8145 }, { "epoch": 3.6851391087989143, "grad_norm": 0.3360612094402313, "learning_rate": 7.282967836146757e-06, "loss": 0.487, "step": 8146 }, { "epoch": 3.6855914951368467, "grad_norm": 0.38984301686286926, "learning_rate": 7.282315901922435e-06, "loss": 0.4746, "step": 8147 }, { "epoch": 3.6860438814747796, "grad_norm": 0.3708780109882355, "learning_rate": 7.281663918681027e-06, "loss": 0.6338, "step": 8148 }, { "epoch": 3.686496267812712, "grad_norm": 0.3515796959400177, "learning_rate": 7.281011886436537e-06, "loss": 0.4611, "step": 8149 }, { "epoch": 3.686948654150645, "grad_norm": 0.3958394229412079, "learning_rate": 7.2803598052029664e-06, "loss": 0.5062, "step": 8150 }, { "epoch": 3.6874010404885773, "grad_norm": 0.3832267224788666, "learning_rate": 7.2797076749943214e-06, "loss": 0.5751, "step": 8151 }, { "epoch": 3.6878534268265097, "grad_norm": 0.3886428773403168, "learning_rate": 7.27905549582461e-06, "loss": 0.5232, "step": 8152 }, { "epoch": 3.688305813164442, "grad_norm": 0.40804523229599, "learning_rate": 7.278403267707834e-06, "loss": 0.5784, "step": 8153 }, { "epoch": 3.688758199502375, "grad_norm": 0.36921975016593933, "learning_rate": 7.277750990658004e-06, "loss": 0.5357, "step": 8154 }, { "epoch": 3.6892105858403075, "grad_norm": 0.3455341160297394, "learning_rate": 7.27709866468913e-06, "loss": 0.5246, "step": 8155 }, { "epoch": 3.6896629721782404, "grad_norm": 0.39762529730796814, "learning_rate": 7.27644628981522e-06, "loss": 0.5127, "step": 8156 }, { "epoch": 3.690115358516173, "grad_norm": 0.3827136754989624, "learning_rate": 7.275793866050284e-06, "loss": 0.5271, "step": 8157 }, { "epoch": 3.690567744854105, "grad_norm": 0.3919585645198822, "learning_rate": 7.275141393408338e-06, "loss": 0.4645, "step": 8158 }, { "epoch": 3.691020131192038, "grad_norm": 0.37656113505363464, "learning_rate": 7.274488871903392e-06, "loss": 0.4378, "step": 8159 }, { "epoch": 3.6914725175299705, "grad_norm": 0.41130581498146057, "learning_rate": 7.273836301549461e-06, "loss": 0.5884, "step": 8160 }, { "epoch": 3.6919249038679034, "grad_norm": 0.46281588077545166, "learning_rate": 7.27318368236056e-06, "loss": 0.6675, "step": 8161 }, { "epoch": 3.692377290205836, "grad_norm": 0.3858863413333893, "learning_rate": 7.272531014350706e-06, "loss": 0.4913, "step": 8162 }, { "epoch": 3.6928296765437683, "grad_norm": 0.4089476764202118, "learning_rate": 7.271878297533914e-06, "loss": 0.4577, "step": 8163 }, { "epoch": 3.693282062881701, "grad_norm": 0.4508623778820038, "learning_rate": 7.271225531924206e-06, "loss": 0.5414, "step": 8164 }, { "epoch": 3.6937344492196336, "grad_norm": 0.43295741081237793, "learning_rate": 7.270572717535598e-06, "loss": 0.5398, "step": 8165 }, { "epoch": 3.694186835557566, "grad_norm": 0.4518735110759735, "learning_rate": 7.269919854382112e-06, "loss": 0.5763, "step": 8166 }, { "epoch": 3.694639221895499, "grad_norm": 0.38555800914764404, "learning_rate": 7.269266942477769e-06, "loss": 0.4946, "step": 8167 }, { "epoch": 3.6950916082334313, "grad_norm": 0.4276670813560486, "learning_rate": 7.2686139818365915e-06, "loss": 0.5536, "step": 8168 }, { "epoch": 3.6955439945713637, "grad_norm": 0.4449087083339691, "learning_rate": 7.267960972472604e-06, "loss": 0.538, "step": 8169 }, { "epoch": 3.6959963809092966, "grad_norm": 0.3875552713871002, "learning_rate": 7.2673079143998304e-06, "loss": 0.4282, "step": 8170 }, { "epoch": 3.696448767247229, "grad_norm": 0.4211287498474121, "learning_rate": 7.266654807632296e-06, "loss": 0.4504, "step": 8171 }, { "epoch": 3.696901153585162, "grad_norm": 0.4309324324131012, "learning_rate": 7.266001652184029e-06, "loss": 0.4997, "step": 8172 }, { "epoch": 3.6973535399230943, "grad_norm": 0.43027549982070923, "learning_rate": 7.265348448069055e-06, "loss": 0.4642, "step": 8173 }, { "epoch": 3.6978059262610268, "grad_norm": 0.4464740753173828, "learning_rate": 7.264695195301405e-06, "loss": 0.45, "step": 8174 }, { "epoch": 3.6982583125989597, "grad_norm": 0.4446644186973572, "learning_rate": 7.2640418938951085e-06, "loss": 0.4774, "step": 8175 }, { "epoch": 3.698710698936892, "grad_norm": 0.43956032395362854, "learning_rate": 7.263388543864194e-06, "loss": 0.4609, "step": 8176 }, { "epoch": 3.699163085274825, "grad_norm": 0.4608322083950043, "learning_rate": 7.262735145222696e-06, "loss": 0.469, "step": 8177 }, { "epoch": 3.6996154716127574, "grad_norm": 0.5173629522323608, "learning_rate": 7.262081697984646e-06, "loss": 0.5192, "step": 8178 }, { "epoch": 3.70006785795069, "grad_norm": 0.49961185455322266, "learning_rate": 7.261428202164078e-06, "loss": 0.4813, "step": 8179 }, { "epoch": 3.7005202442886223, "grad_norm": 0.5001221895217896, "learning_rate": 7.260774657775029e-06, "loss": 0.4624, "step": 8180 }, { "epoch": 3.700972630626555, "grad_norm": 0.6048153042793274, "learning_rate": 7.2601210648315334e-06, "loss": 0.4951, "step": 8181 }, { "epoch": 3.7014250169644876, "grad_norm": 0.445112019777298, "learning_rate": 7.259467423347628e-06, "loss": 1.0524, "step": 8182 }, { "epoch": 3.7018774033024204, "grad_norm": 0.22720745205879211, "learning_rate": 7.258813733337353e-06, "loss": 1.0224, "step": 8183 }, { "epoch": 3.702329789640353, "grad_norm": 0.2756362855434418, "learning_rate": 7.2581599948147455e-06, "loss": 0.6468, "step": 8184 }, { "epoch": 3.7027821759782853, "grad_norm": 0.2903575301170349, "learning_rate": 7.2575062077938475e-06, "loss": 0.5992, "step": 8185 }, { "epoch": 3.703234562316218, "grad_norm": 0.38198861479759216, "learning_rate": 7.2568523722887e-06, "loss": 0.6103, "step": 8186 }, { "epoch": 3.7036869486541506, "grad_norm": 0.24975961446762085, "learning_rate": 7.256198488313343e-06, "loss": 0.5106, "step": 8187 }, { "epoch": 3.7041393349920835, "grad_norm": 0.3039921522140503, "learning_rate": 7.2555445558818225e-06, "loss": 0.6317, "step": 8188 }, { "epoch": 3.704591721330016, "grad_norm": 0.3337215483188629, "learning_rate": 7.254890575008182e-06, "loss": 0.6704, "step": 8189 }, { "epoch": 3.7050441076679483, "grad_norm": 0.3186250925064087, "learning_rate": 7.254236545706468e-06, "loss": 0.6, "step": 8190 }, { "epoch": 3.7054964940058808, "grad_norm": 0.3390290439128876, "learning_rate": 7.253582467990725e-06, "loss": 0.4448, "step": 8191 }, { "epoch": 3.7059488803438136, "grad_norm": 0.3390836715698242, "learning_rate": 7.252928341875003e-06, "loss": 0.6005, "step": 8192 }, { "epoch": 3.706401266681746, "grad_norm": 0.35222724080085754, "learning_rate": 7.252274167373348e-06, "loss": 0.6491, "step": 8193 }, { "epoch": 3.706853653019679, "grad_norm": 0.34441348910331726, "learning_rate": 7.251619944499813e-06, "loss": 0.6914, "step": 8194 }, { "epoch": 3.7073060393576114, "grad_norm": 0.35375285148620605, "learning_rate": 7.250965673268445e-06, "loss": 0.6495, "step": 8195 }, { "epoch": 3.707758425695544, "grad_norm": 0.3539772927761078, "learning_rate": 7.250311353693299e-06, "loss": 0.5161, "step": 8196 }, { "epoch": 3.7082108120334767, "grad_norm": 0.37576398253440857, "learning_rate": 7.249656985788425e-06, "loss": 0.6258, "step": 8197 }, { "epoch": 3.708663198371409, "grad_norm": 0.3857179880142212, "learning_rate": 7.249002569567879e-06, "loss": 0.6427, "step": 8198 }, { "epoch": 3.709115584709342, "grad_norm": 0.3892945647239685, "learning_rate": 7.248348105045715e-06, "loss": 0.6804, "step": 8199 }, { "epoch": 3.7095679710472744, "grad_norm": 0.34496191143989563, "learning_rate": 7.247693592235987e-06, "loss": 0.4655, "step": 8200 }, { "epoch": 3.7095679710472744, "eval_loss": 0.5902590155601501, "eval_runtime": 25.1503, "eval_samples_per_second": 29.582, "eval_steps_per_second": 7.396, "step": 8200 }, { "epoch": 3.710020357385207, "grad_norm": 0.3573683500289917, "learning_rate": 7.247039031152754e-06, "loss": 0.5554, "step": 8201 }, { "epoch": 3.7104727437231393, "grad_norm": 0.38485121726989746, "learning_rate": 7.246384421810074e-06, "loss": 0.5198, "step": 8202 }, { "epoch": 3.710925130061072, "grad_norm": 0.32242417335510254, "learning_rate": 7.2457297642220055e-06, "loss": 0.4082, "step": 8203 }, { "epoch": 3.7113775163990046, "grad_norm": 0.4399104416370392, "learning_rate": 7.2450750584026086e-06, "loss": 0.5452, "step": 8204 }, { "epoch": 3.7118299027369375, "grad_norm": 0.3744443655014038, "learning_rate": 7.244420304365944e-06, "loss": 0.5338, "step": 8205 }, { "epoch": 3.71228228907487, "grad_norm": 0.3823913335800171, "learning_rate": 7.243765502126077e-06, "loss": 0.5315, "step": 8206 }, { "epoch": 3.7127346754128023, "grad_norm": 0.4451451599597931, "learning_rate": 7.243110651697065e-06, "loss": 0.6293, "step": 8207 }, { "epoch": 3.713187061750735, "grad_norm": 0.4149760603904724, "learning_rate": 7.242455753092975e-06, "loss": 0.5636, "step": 8208 }, { "epoch": 3.7136394480886676, "grad_norm": 0.3791710138320923, "learning_rate": 7.2418008063278725e-06, "loss": 0.502, "step": 8209 }, { "epoch": 3.7140918344266005, "grad_norm": 0.44070184230804443, "learning_rate": 7.241145811415824e-06, "loss": 0.5929, "step": 8210 }, { "epoch": 3.714544220764533, "grad_norm": 0.3770029842853546, "learning_rate": 7.240490768370896e-06, "loss": 0.5967, "step": 8211 }, { "epoch": 3.7149966071024654, "grad_norm": 0.40416643023490906, "learning_rate": 7.2398356772071566e-06, "loss": 0.5532, "step": 8212 }, { "epoch": 3.7154489934403983, "grad_norm": 0.4239855706691742, "learning_rate": 7.239180537938675e-06, "loss": 0.5339, "step": 8213 }, { "epoch": 3.7159013797783307, "grad_norm": 0.3992176353931427, "learning_rate": 7.238525350579524e-06, "loss": 0.5233, "step": 8214 }, { "epoch": 3.7163537661162636, "grad_norm": 0.3762281537055969, "learning_rate": 7.237870115143771e-06, "loss": 0.463, "step": 8215 }, { "epoch": 3.716806152454196, "grad_norm": 0.44136884808540344, "learning_rate": 7.237214831645492e-06, "loss": 0.5373, "step": 8216 }, { "epoch": 3.7172585387921284, "grad_norm": 0.444926381111145, "learning_rate": 7.236559500098758e-06, "loss": 0.5867, "step": 8217 }, { "epoch": 3.717710925130061, "grad_norm": 0.40108373761177063, "learning_rate": 7.235904120517646e-06, "loss": 0.5048, "step": 8218 }, { "epoch": 3.7181633114679937, "grad_norm": 0.43736112117767334, "learning_rate": 7.235248692916229e-06, "loss": 0.5547, "step": 8219 }, { "epoch": 3.718615697805926, "grad_norm": 0.4117565155029297, "learning_rate": 7.234593217308584e-06, "loss": 0.4852, "step": 8220 }, { "epoch": 3.719068084143859, "grad_norm": 0.4770458936691284, "learning_rate": 7.23393769370879e-06, "loss": 0.5619, "step": 8221 }, { "epoch": 3.7195204704817915, "grad_norm": 0.4651924669742584, "learning_rate": 7.233282122130925e-06, "loss": 0.5608, "step": 8222 }, { "epoch": 3.719972856819724, "grad_norm": 0.4542667269706726, "learning_rate": 7.232626502589067e-06, "loss": 0.4947, "step": 8223 }, { "epoch": 3.7204252431576568, "grad_norm": 0.45238375663757324, "learning_rate": 7.231970835097299e-06, "loss": 0.5553, "step": 8224 }, { "epoch": 3.720877629495589, "grad_norm": 0.4477149546146393, "learning_rate": 7.2313151196697015e-06, "loss": 0.4859, "step": 8225 }, { "epoch": 3.721330015833522, "grad_norm": 0.45403844118118286, "learning_rate": 7.230659356320358e-06, "loss": 0.4803, "step": 8226 }, { "epoch": 3.7217824021714545, "grad_norm": 0.48034217953681946, "learning_rate": 7.230003545063352e-06, "loss": 0.4455, "step": 8227 }, { "epoch": 3.722234788509387, "grad_norm": 0.5632690787315369, "learning_rate": 7.229347685912767e-06, "loss": 0.6032, "step": 8228 }, { "epoch": 3.7226871748473194, "grad_norm": 0.5370433926582336, "learning_rate": 7.2286917788826926e-06, "loss": 0.5059, "step": 8229 }, { "epoch": 3.7231395611852522, "grad_norm": 0.5404873490333557, "learning_rate": 7.22803582398721e-06, "loss": 0.5105, "step": 8230 }, { "epoch": 3.7235919475231847, "grad_norm": 0.5668332576751709, "learning_rate": 7.2273798212404125e-06, "loss": 0.5796, "step": 8231 }, { "epoch": 3.7240443338611176, "grad_norm": 0.4186602830886841, "learning_rate": 7.2267237706563856e-06, "loss": 0.9385, "step": 8232 }, { "epoch": 3.72449672019905, "grad_norm": 0.2175871878862381, "learning_rate": 7.22606767224922e-06, "loss": 1.0535, "step": 8233 }, { "epoch": 3.7249491065369824, "grad_norm": 0.25064530968666077, "learning_rate": 7.225411526033006e-06, "loss": 0.5276, "step": 8234 }, { "epoch": 3.7254014928749153, "grad_norm": 0.26845234632492065, "learning_rate": 7.2247553320218365e-06, "loss": 0.524, "step": 8235 }, { "epoch": 3.7258538792128477, "grad_norm": 0.3151288628578186, "learning_rate": 7.224099090229806e-06, "loss": 0.6895, "step": 8236 }, { "epoch": 3.7263062655507806, "grad_norm": 0.31690967082977295, "learning_rate": 7.2234428006710055e-06, "loss": 0.5946, "step": 8237 }, { "epoch": 3.726758651888713, "grad_norm": 0.3063041865825653, "learning_rate": 7.222786463359533e-06, "loss": 0.7142, "step": 8238 }, { "epoch": 3.7272110382266455, "grad_norm": 0.32736310362815857, "learning_rate": 7.222130078309483e-06, "loss": 0.5833, "step": 8239 }, { "epoch": 3.727663424564578, "grad_norm": 0.3504396975040436, "learning_rate": 7.221473645534953e-06, "loss": 0.5748, "step": 8240 }, { "epoch": 3.7281158109025108, "grad_norm": 0.35020211338996887, "learning_rate": 7.22081716505004e-06, "loss": 0.6244, "step": 8241 }, { "epoch": 3.728568197240443, "grad_norm": 0.33123400807380676, "learning_rate": 7.220160636868845e-06, "loss": 0.5727, "step": 8242 }, { "epoch": 3.729020583578376, "grad_norm": 0.4035811722278595, "learning_rate": 7.219504061005467e-06, "loss": 0.545, "step": 8243 }, { "epoch": 3.7294729699163085, "grad_norm": 0.40569546818733215, "learning_rate": 7.218847437474007e-06, "loss": 0.5969, "step": 8244 }, { "epoch": 3.729925356254241, "grad_norm": 0.3340725004673004, "learning_rate": 7.218190766288568e-06, "loss": 0.4785, "step": 8245 }, { "epoch": 3.730377742592174, "grad_norm": 0.3793618083000183, "learning_rate": 7.217534047463254e-06, "loss": 0.5739, "step": 8246 }, { "epoch": 3.7308301289301062, "grad_norm": 0.42513343691825867, "learning_rate": 7.216877281012167e-06, "loss": 0.6542, "step": 8247 }, { "epoch": 3.731282515268039, "grad_norm": 0.41041991114616394, "learning_rate": 7.2162204669494155e-06, "loss": 0.5593, "step": 8248 }, { "epoch": 3.7317349016059715, "grad_norm": 0.3794940114021301, "learning_rate": 7.215563605289102e-06, "loss": 0.5691, "step": 8249 }, { "epoch": 3.732187287943904, "grad_norm": 0.33614081144332886, "learning_rate": 7.2149066960453365e-06, "loss": 0.4511, "step": 8250 }, { "epoch": 3.732639674281837, "grad_norm": 0.359695166349411, "learning_rate": 7.214249739232229e-06, "loss": 0.4677, "step": 8251 }, { "epoch": 3.7330920606197693, "grad_norm": 0.36473122239112854, "learning_rate": 7.213592734863884e-06, "loss": 0.5229, "step": 8252 }, { "epoch": 3.7335444469577017, "grad_norm": 0.35336121916770935, "learning_rate": 7.212935682954415e-06, "loss": 0.4316, "step": 8253 }, { "epoch": 3.7339968332956346, "grad_norm": 0.4326109290122986, "learning_rate": 7.212278583517933e-06, "loss": 0.6602, "step": 8254 }, { "epoch": 3.734449219633567, "grad_norm": 0.3535705804824829, "learning_rate": 7.21162143656855e-06, "loss": 0.4265, "step": 8255 }, { "epoch": 3.7349016059714994, "grad_norm": 0.3863328993320465, "learning_rate": 7.210964242120381e-06, "loss": 0.5063, "step": 8256 }, { "epoch": 3.7353539923094323, "grad_norm": 0.4393877685070038, "learning_rate": 7.210307000187538e-06, "loss": 0.5449, "step": 8257 }, { "epoch": 3.7358063786473648, "grad_norm": 0.39229902625083923, "learning_rate": 7.20964971078414e-06, "loss": 0.5405, "step": 8258 }, { "epoch": 3.7362587649852976, "grad_norm": 0.4148174822330475, "learning_rate": 7.2089923739243e-06, "loss": 0.5603, "step": 8259 }, { "epoch": 3.73671115132323, "grad_norm": 0.4738655686378479, "learning_rate": 7.208334989622138e-06, "loss": 0.5968, "step": 8260 }, { "epoch": 3.7371635376611625, "grad_norm": 0.38732558488845825, "learning_rate": 7.207677557891771e-06, "loss": 0.4289, "step": 8261 }, { "epoch": 3.7376159239990954, "grad_norm": 0.41261187195777893, "learning_rate": 7.20702007874732e-06, "loss": 0.4649, "step": 8262 }, { "epoch": 3.738068310337028, "grad_norm": 0.4683370590209961, "learning_rate": 7.206362552202905e-06, "loss": 0.6037, "step": 8263 }, { "epoch": 3.7385206966749607, "grad_norm": 0.4125347137451172, "learning_rate": 7.205704978272646e-06, "loss": 0.505, "step": 8264 }, { "epoch": 3.738973083012893, "grad_norm": 0.4769308865070343, "learning_rate": 7.205047356970669e-06, "loss": 0.644, "step": 8265 }, { "epoch": 3.7394254693508255, "grad_norm": 0.4393939971923828, "learning_rate": 7.204389688311094e-06, "loss": 0.5491, "step": 8266 }, { "epoch": 3.739877855688758, "grad_norm": 0.40839889645576477, "learning_rate": 7.203731972308049e-06, "loss": 0.4688, "step": 8267 }, { "epoch": 3.740330242026691, "grad_norm": 0.4222867488861084, "learning_rate": 7.203074208975658e-06, "loss": 0.4339, "step": 8268 }, { "epoch": 3.7407826283646233, "grad_norm": 0.4377709925174713, "learning_rate": 7.202416398328047e-06, "loss": 0.5171, "step": 8269 }, { "epoch": 3.741235014702556, "grad_norm": 0.4435172975063324, "learning_rate": 7.201758540379345e-06, "loss": 0.4736, "step": 8270 }, { "epoch": 3.7416874010404886, "grad_norm": 0.47911518812179565, "learning_rate": 7.201100635143681e-06, "loss": 0.5072, "step": 8271 }, { "epoch": 3.742139787378421, "grad_norm": 0.4551388919353485, "learning_rate": 7.200442682635183e-06, "loss": 0.4846, "step": 8272 }, { "epoch": 3.742592173716354, "grad_norm": 0.42938581109046936, "learning_rate": 7.199784682867985e-06, "loss": 0.4565, "step": 8273 }, { "epoch": 3.7430445600542863, "grad_norm": 0.5289057493209839, "learning_rate": 7.199126635856216e-06, "loss": 0.5692, "step": 8274 }, { "epoch": 3.743496946392219, "grad_norm": 0.5290199518203735, "learning_rate": 7.19846854161401e-06, "loss": 0.6261, "step": 8275 }, { "epoch": 3.7439493327301516, "grad_norm": 0.5512881278991699, "learning_rate": 7.1978104001554995e-06, "loss": 0.4282, "step": 8276 }, { "epoch": 3.744401719068084, "grad_norm": 0.573341965675354, "learning_rate": 7.19715221149482e-06, "loss": 0.5863, "step": 8277 }, { "epoch": 3.7448541054060165, "grad_norm": 0.5084636211395264, "learning_rate": 7.196493975646109e-06, "loss": 0.4375, "step": 8278 }, { "epoch": 3.7453064917439494, "grad_norm": 0.48470595479011536, "learning_rate": 7.195835692623501e-06, "loss": 0.4192, "step": 8279 }, { "epoch": 3.745758878081882, "grad_norm": 0.6067512035369873, "learning_rate": 7.195177362441135e-06, "loss": 0.4999, "step": 8280 }, { "epoch": 3.7462112644198147, "grad_norm": 0.568464994430542, "learning_rate": 7.194518985113149e-06, "loss": 0.508, "step": 8281 }, { "epoch": 3.746663650757747, "grad_norm": 0.4751073718070984, "learning_rate": 7.193860560653685e-06, "loss": 1.1666, "step": 8282 }, { "epoch": 3.7471160370956795, "grad_norm": 0.28039872646331787, "learning_rate": 7.193202089076883e-06, "loss": 0.5408, "step": 8283 }, { "epoch": 3.7475684234336124, "grad_norm": 0.3161298632621765, "learning_rate": 7.192543570396883e-06, "loss": 0.636, "step": 8284 }, { "epoch": 3.748020809771545, "grad_norm": 0.3209848999977112, "learning_rate": 7.191885004627831e-06, "loss": 0.5635, "step": 8285 }, { "epoch": 3.7484731961094777, "grad_norm": 0.3683524429798126, "learning_rate": 7.191226391783868e-06, "loss": 0.8909, "step": 8286 }, { "epoch": 3.74892558244741, "grad_norm": 0.29573872685432434, "learning_rate": 7.1905677318791425e-06, "loss": 0.5404, "step": 8287 }, { "epoch": 3.7493779687853426, "grad_norm": 0.3599741458892822, "learning_rate": 7.189909024927797e-06, "loss": 0.7415, "step": 8288 }, { "epoch": 3.749830355123275, "grad_norm": 0.31552019715309143, "learning_rate": 7.189250270943979e-06, "loss": 0.5974, "step": 8289 }, { "epoch": 3.750282741461208, "grad_norm": 0.37245848774909973, "learning_rate": 7.188591469941839e-06, "loss": 0.5716, "step": 8290 }, { "epoch": 3.7507351277991403, "grad_norm": 0.348312109708786, "learning_rate": 7.187932621935522e-06, "loss": 0.6664, "step": 8291 }, { "epoch": 3.751187514137073, "grad_norm": 0.38040560483932495, "learning_rate": 7.187273726939184e-06, "loss": 0.6113, "step": 8292 }, { "epoch": 3.7516399004750056, "grad_norm": 0.33154839277267456, "learning_rate": 7.18661478496697e-06, "loss": 0.5783, "step": 8293 }, { "epoch": 3.752092286812938, "grad_norm": 0.36522412300109863, "learning_rate": 7.185955796033035e-06, "loss": 0.5432, "step": 8294 }, { "epoch": 3.752544673150871, "grad_norm": 0.34985068440437317, "learning_rate": 7.185296760151531e-06, "loss": 0.4919, "step": 8295 }, { "epoch": 3.7529970594888034, "grad_norm": 0.3572457730770111, "learning_rate": 7.184637677336613e-06, "loss": 0.5489, "step": 8296 }, { "epoch": 3.7534494458267362, "grad_norm": 0.35944801568984985, "learning_rate": 7.183978547602437e-06, "loss": 0.48, "step": 8297 }, { "epoch": 3.7539018321646687, "grad_norm": 0.39526471495628357, "learning_rate": 7.183319370963155e-06, "loss": 0.5745, "step": 8298 }, { "epoch": 3.754354218502601, "grad_norm": 0.35799524188041687, "learning_rate": 7.18266014743293e-06, "loss": 0.5274, "step": 8299 }, { "epoch": 3.754806604840534, "grad_norm": 0.386322021484375, "learning_rate": 7.182000877025914e-06, "loss": 0.6155, "step": 8300 }, { "epoch": 3.7552589911784664, "grad_norm": 0.373526930809021, "learning_rate": 7.18134155975627e-06, "loss": 0.5447, "step": 8301 }, { "epoch": 3.7557113775163993, "grad_norm": 0.3723032474517822, "learning_rate": 7.180682195638157e-06, "loss": 0.528, "step": 8302 }, { "epoch": 3.7561637638543317, "grad_norm": 0.4439416229724884, "learning_rate": 7.180022784685736e-06, "loss": 0.6498, "step": 8303 }, { "epoch": 3.756616150192264, "grad_norm": 0.41486087441444397, "learning_rate": 7.179363326913169e-06, "loss": 0.5946, "step": 8304 }, { "epoch": 3.7570685365301966, "grad_norm": 0.3723852336406708, "learning_rate": 7.17870382233462e-06, "loss": 0.5744, "step": 8305 }, { "epoch": 3.7575209228681294, "grad_norm": 0.38894906640052795, "learning_rate": 7.178044270964252e-06, "loss": 0.5079, "step": 8306 }, { "epoch": 3.757973309206062, "grad_norm": 0.42509156465530396, "learning_rate": 7.17738467281623e-06, "loss": 0.5843, "step": 8307 }, { "epoch": 3.7584256955439947, "grad_norm": 0.4052128195762634, "learning_rate": 7.176725027904723e-06, "loss": 0.5499, "step": 8308 }, { "epoch": 3.758878081881927, "grad_norm": 0.3450085520744324, "learning_rate": 7.176065336243894e-06, "loss": 0.4705, "step": 8309 }, { "epoch": 3.7593304682198596, "grad_norm": 0.41422000527381897, "learning_rate": 7.175405597847913e-06, "loss": 0.52, "step": 8310 }, { "epoch": 3.7597828545577925, "grad_norm": 0.4707091152667999, "learning_rate": 7.17474581273095e-06, "loss": 0.6302, "step": 8311 }, { "epoch": 3.760235240895725, "grad_norm": 0.4192534387111664, "learning_rate": 7.174085980907174e-06, "loss": 0.5544, "step": 8312 }, { "epoch": 3.760687627233658, "grad_norm": 0.4440588355064392, "learning_rate": 7.173426102390756e-06, "loss": 0.5336, "step": 8313 }, { "epoch": 3.7611400135715902, "grad_norm": 0.44367772340774536, "learning_rate": 7.172766177195869e-06, "loss": 0.5265, "step": 8314 }, { "epoch": 3.7615923999095227, "grad_norm": 0.41882771253585815, "learning_rate": 7.1721062053366844e-06, "loss": 0.5032, "step": 8315 }, { "epoch": 3.762044786247455, "grad_norm": 0.41704773902893066, "learning_rate": 7.1714461868273785e-06, "loss": 0.5133, "step": 8316 }, { "epoch": 3.762497172585388, "grad_norm": 0.43299761414527893, "learning_rate": 7.170786121682126e-06, "loss": 0.5549, "step": 8317 }, { "epoch": 3.7629495589233204, "grad_norm": 0.41863691806793213, "learning_rate": 7.170126009915103e-06, "loss": 0.5083, "step": 8318 }, { "epoch": 3.7634019452612533, "grad_norm": 0.4592602550983429, "learning_rate": 7.169465851540487e-06, "loss": 0.5028, "step": 8319 }, { "epoch": 3.7638543315991857, "grad_norm": 0.4738897681236267, "learning_rate": 7.168805646572454e-06, "loss": 0.5479, "step": 8320 }, { "epoch": 3.764306717937118, "grad_norm": 0.4239608645439148, "learning_rate": 7.1681453950251845e-06, "loss": 0.457, "step": 8321 }, { "epoch": 3.764759104275051, "grad_norm": 0.46712958812713623, "learning_rate": 7.16748509691286e-06, "loss": 0.5523, "step": 8322 }, { "epoch": 3.7652114906129834, "grad_norm": 0.4865667223930359, "learning_rate": 7.166824752249659e-06, "loss": 0.4932, "step": 8323 }, { "epoch": 3.7656638769509163, "grad_norm": 0.44135063886642456, "learning_rate": 7.1661643610497665e-06, "loss": 0.4573, "step": 8324 }, { "epoch": 3.7661162632888487, "grad_norm": 0.5035232901573181, "learning_rate": 7.165503923327364e-06, "loss": 0.4963, "step": 8325 }, { "epoch": 3.766568649626781, "grad_norm": 0.5648856163024902, "learning_rate": 7.1648434390966356e-06, "loss": 0.6139, "step": 8326 }, { "epoch": 3.7670210359647136, "grad_norm": 0.5601599216461182, "learning_rate": 7.164182908371766e-06, "loss": 0.5477, "step": 8327 }, { "epoch": 3.7674734223026465, "grad_norm": 0.48998749256134033, "learning_rate": 7.163522331166943e-06, "loss": 0.4641, "step": 8328 }, { "epoch": 3.767925808640579, "grad_norm": 0.5446048974990845, "learning_rate": 7.1628617074963534e-06, "loss": 0.5122, "step": 8329 }, { "epoch": 3.768378194978512, "grad_norm": 0.5397573113441467, "learning_rate": 7.162201037374185e-06, "loss": 0.4636, "step": 8330 }, { "epoch": 3.768830581316444, "grad_norm": 0.5747941136360168, "learning_rate": 7.1615403208146285e-06, "loss": 0.5195, "step": 8331 }, { "epoch": 3.7692829676543766, "grad_norm": 0.4157159626483917, "learning_rate": 7.16087955783187e-06, "loss": 1.0651, "step": 8332 }, { "epoch": 3.7697353539923095, "grad_norm": 0.2132076472043991, "learning_rate": 7.160218748440103e-06, "loss": 1.067, "step": 8333 }, { "epoch": 3.770187740330242, "grad_norm": 0.27709031105041504, "learning_rate": 7.15955789265352e-06, "loss": 0.8061, "step": 8334 }, { "epoch": 3.770640126668175, "grad_norm": 0.27907106280326843, "learning_rate": 7.1588969904863134e-06, "loss": 0.6732, "step": 8335 }, { "epoch": 3.7710925130061073, "grad_norm": 0.281758576631546, "learning_rate": 7.158236041952678e-06, "loss": 0.6056, "step": 8336 }, { "epoch": 3.7715448993440397, "grad_norm": 0.31771451234817505, "learning_rate": 7.157575047066809e-06, "loss": 0.6527, "step": 8337 }, { "epoch": 3.7719972856819726, "grad_norm": 0.28965824842453003, "learning_rate": 7.1569140058429036e-06, "loss": 0.5095, "step": 8338 }, { "epoch": 3.772449672019905, "grad_norm": 0.300359308719635, "learning_rate": 7.156252918295155e-06, "loss": 0.6181, "step": 8339 }, { "epoch": 3.7729020583578374, "grad_norm": 0.36391833424568176, "learning_rate": 7.155591784437766e-06, "loss": 0.7038, "step": 8340 }, { "epoch": 3.7733544446957703, "grad_norm": 0.32095521688461304, "learning_rate": 7.154930604284931e-06, "loss": 0.5404, "step": 8341 }, { "epoch": 3.7738068310337027, "grad_norm": 0.356838583946228, "learning_rate": 7.154269377850855e-06, "loss": 0.69, "step": 8342 }, { "epoch": 3.774259217371635, "grad_norm": 0.31165245175361633, "learning_rate": 7.1536081051497364e-06, "loss": 0.4248, "step": 8343 }, { "epoch": 3.774711603709568, "grad_norm": 0.3518347442150116, "learning_rate": 7.152946786195777e-06, "loss": 0.5622, "step": 8344 }, { "epoch": 3.7751639900475005, "grad_norm": 0.3207526206970215, "learning_rate": 7.15228542100318e-06, "loss": 0.5325, "step": 8345 }, { "epoch": 3.7756163763854333, "grad_norm": 0.35458746552467346, "learning_rate": 7.1516240095861515e-06, "loss": 0.5865, "step": 8346 }, { "epoch": 3.776068762723366, "grad_norm": 0.4001976549625397, "learning_rate": 7.150962551958893e-06, "loss": 0.7077, "step": 8347 }, { "epoch": 3.776521149061298, "grad_norm": 0.37373268604278564, "learning_rate": 7.150301048135615e-06, "loss": 0.6266, "step": 8348 }, { "epoch": 3.776973535399231, "grad_norm": 0.36830443143844604, "learning_rate": 7.14963949813052e-06, "loss": 0.5606, "step": 8349 }, { "epoch": 3.7774259217371635, "grad_norm": 0.3746813237667084, "learning_rate": 7.14897790195782e-06, "loss": 0.5824, "step": 8350 }, { "epoch": 3.7778783080750964, "grad_norm": 0.3446361720561981, "learning_rate": 7.148316259631721e-06, "loss": 0.4815, "step": 8351 }, { "epoch": 3.778330694413029, "grad_norm": 0.4075472056865692, "learning_rate": 7.147654571166436e-06, "loss": 0.5226, "step": 8352 }, { "epoch": 3.7787830807509613, "grad_norm": 0.39408838748931885, "learning_rate": 7.146992836576173e-06, "loss": 0.6117, "step": 8353 }, { "epoch": 3.7792354670888937, "grad_norm": 0.3640475571155548, "learning_rate": 7.146331055875146e-06, "loss": 0.463, "step": 8354 }, { "epoch": 3.7796878534268266, "grad_norm": 0.3875674605369568, "learning_rate": 7.145669229077567e-06, "loss": 0.4396, "step": 8355 }, { "epoch": 3.780140239764759, "grad_norm": 0.39405205845832825, "learning_rate": 7.145007356197651e-06, "loss": 0.5539, "step": 8356 }, { "epoch": 3.780592626102692, "grad_norm": 0.38166385889053345, "learning_rate": 7.144345437249611e-06, "loss": 0.4829, "step": 8357 }, { "epoch": 3.7810450124406243, "grad_norm": 0.39485469460487366, "learning_rate": 7.143683472247664e-06, "loss": 0.4896, "step": 8358 }, { "epoch": 3.7814973987785567, "grad_norm": 0.4146787226200104, "learning_rate": 7.143021461206029e-06, "loss": 0.5423, "step": 8359 }, { "epoch": 3.7819497851164896, "grad_norm": 0.3545907735824585, "learning_rate": 7.142359404138921e-06, "loss": 0.4603, "step": 8360 }, { "epoch": 3.782402171454422, "grad_norm": 0.39911720156669617, "learning_rate": 7.14169730106056e-06, "loss": 0.5136, "step": 8361 }, { "epoch": 3.782854557792355, "grad_norm": 0.404948353767395, "learning_rate": 7.141035151985167e-06, "loss": 0.4971, "step": 8362 }, { "epoch": 3.7833069441302873, "grad_norm": 0.4496859014034271, "learning_rate": 7.140372956926962e-06, "loss": 0.6082, "step": 8363 }, { "epoch": 3.7837593304682198, "grad_norm": 0.43646082282066345, "learning_rate": 7.139710715900168e-06, "loss": 0.5282, "step": 8364 }, { "epoch": 3.784211716806152, "grad_norm": 0.46143391728401184, "learning_rate": 7.139048428919004e-06, "loss": 0.5688, "step": 8365 }, { "epoch": 3.784664103144085, "grad_norm": 0.4692995548248291, "learning_rate": 7.138386095997698e-06, "loss": 0.5258, "step": 8366 }, { "epoch": 3.7851164894820175, "grad_norm": 0.44347497820854187, "learning_rate": 7.1377237171504735e-06, "loss": 0.4742, "step": 8367 }, { "epoch": 3.7855688758199504, "grad_norm": 0.5455581545829773, "learning_rate": 7.137061292391556e-06, "loss": 0.6007, "step": 8368 }, { "epoch": 3.786021262157883, "grad_norm": 0.4814540445804596, "learning_rate": 7.136398821735173e-06, "loss": 0.6655, "step": 8369 }, { "epoch": 3.7864736484958152, "grad_norm": 0.45057937502861023, "learning_rate": 7.135736305195553e-06, "loss": 0.5731, "step": 8370 }, { "epoch": 3.786926034833748, "grad_norm": 0.4840839207172394, "learning_rate": 7.135073742786922e-06, "loss": 0.5717, "step": 8371 }, { "epoch": 3.7873784211716806, "grad_norm": 0.42736339569091797, "learning_rate": 7.1344111345235136e-06, "loss": 0.3963, "step": 8372 }, { "epoch": 3.7878308075096134, "grad_norm": 0.5326136350631714, "learning_rate": 7.1337484804195546e-06, "loss": 0.6295, "step": 8373 }, { "epoch": 3.788283193847546, "grad_norm": 0.47930964827537537, "learning_rate": 7.133085780489279e-06, "loss": 0.5285, "step": 8374 }, { "epoch": 3.7887355801854783, "grad_norm": 0.5800706148147583, "learning_rate": 7.1324230347469204e-06, "loss": 0.6055, "step": 8375 }, { "epoch": 3.7891879665234107, "grad_norm": 0.5342775583267212, "learning_rate": 7.131760243206712e-06, "loss": 0.6051, "step": 8376 }, { "epoch": 3.7896403528613436, "grad_norm": 0.517083466053009, "learning_rate": 7.131097405882888e-06, "loss": 0.5485, "step": 8377 }, { "epoch": 3.790092739199276, "grad_norm": 0.5495998859405518, "learning_rate": 7.130434522789682e-06, "loss": 0.5468, "step": 8378 }, { "epoch": 3.790545125537209, "grad_norm": 0.5999481678009033, "learning_rate": 7.129771593941333e-06, "loss": 0.5458, "step": 8379 }, { "epoch": 3.7909975118751413, "grad_norm": 0.499796599149704, "learning_rate": 7.129108619352079e-06, "loss": 0.4597, "step": 8380 }, { "epoch": 3.7914498982130738, "grad_norm": 0.5168105959892273, "learning_rate": 7.1284455990361574e-06, "loss": 0.4931, "step": 8381 }, { "epoch": 3.7919022845510066, "grad_norm": 0.5184217691421509, "learning_rate": 7.1277825330078095e-06, "loss": 1.0785, "step": 8382 }, { "epoch": 3.792354670888939, "grad_norm": 0.19884102046489716, "learning_rate": 7.127119421281274e-06, "loss": 0.7107, "step": 8383 }, { "epoch": 3.792807057226872, "grad_norm": 0.3167625367641449, "learning_rate": 7.1264562638707945e-06, "loss": 0.5726, "step": 8384 }, { "epoch": 3.7932594435648044, "grad_norm": 0.2989059090614319, "learning_rate": 7.1257930607906115e-06, "loss": 0.563, "step": 8385 }, { "epoch": 3.793711829902737, "grad_norm": 0.3410821557044983, "learning_rate": 7.12512981205497e-06, "loss": 0.6615, "step": 8386 }, { "epoch": 3.7941642162406697, "grad_norm": 0.2772086262702942, "learning_rate": 7.124466517678114e-06, "loss": 0.5977, "step": 8387 }, { "epoch": 3.794616602578602, "grad_norm": 0.29538407921791077, "learning_rate": 7.1238031776742885e-06, "loss": 0.4858, "step": 8388 }, { "epoch": 3.795068988916535, "grad_norm": 0.3374304473400116, "learning_rate": 7.1231397920577405e-06, "loss": 0.576, "step": 8389 }, { "epoch": 3.7955213752544674, "grad_norm": 0.3472336232662201, "learning_rate": 7.122476360842717e-06, "loss": 0.6524, "step": 8390 }, { "epoch": 3.7959737615924, "grad_norm": 0.3501876890659332, "learning_rate": 7.121812884043467e-06, "loss": 0.5774, "step": 8391 }, { "epoch": 3.7964261479303323, "grad_norm": 0.3679836392402649, "learning_rate": 7.121149361674241e-06, "loss": 0.5639, "step": 8392 }, { "epoch": 3.796878534268265, "grad_norm": 0.3962157964706421, "learning_rate": 7.120485793749287e-06, "loss": 0.6051, "step": 8393 }, { "epoch": 3.7973309206061976, "grad_norm": 0.372096449136734, "learning_rate": 7.119822180282859e-06, "loss": 0.6196, "step": 8394 }, { "epoch": 3.7977833069441305, "grad_norm": 0.3468775153160095, "learning_rate": 7.119158521289207e-06, "loss": 0.4444, "step": 8395 }, { "epoch": 3.798235693282063, "grad_norm": 0.35112103819847107, "learning_rate": 7.118494816782585e-06, "loss": 0.5866, "step": 8396 }, { "epoch": 3.7986880796199953, "grad_norm": 0.3495842218399048, "learning_rate": 7.117831066777247e-06, "loss": 0.4928, "step": 8397 }, { "epoch": 3.799140465957928, "grad_norm": 0.3785644769668579, "learning_rate": 7.117167271287453e-06, "loss": 0.5646, "step": 8398 }, { "epoch": 3.7995928522958606, "grad_norm": 0.3706865906715393, "learning_rate": 7.116503430327451e-06, "loss": 0.5033, "step": 8399 }, { "epoch": 3.8000452386337935, "grad_norm": 0.4106360077857971, "learning_rate": 7.1158395439115045e-06, "loss": 0.638, "step": 8400 }, { "epoch": 3.8000452386337935, "eval_loss": 0.590426504611969, "eval_runtime": 25.4265, "eval_samples_per_second": 29.261, "eval_steps_per_second": 7.315, "step": 8400 }, { "epoch": 3.800497624971726, "grad_norm": 0.40649402141571045, "learning_rate": 7.115175612053868e-06, "loss": 0.5826, "step": 8401 }, { "epoch": 3.8009500113096584, "grad_norm": 0.3902415931224823, "learning_rate": 7.114511634768803e-06, "loss": 0.5449, "step": 8402 }, { "epoch": 3.801402397647591, "grad_norm": 0.4066977798938751, "learning_rate": 7.1138476120705704e-06, "loss": 0.5852, "step": 8403 }, { "epoch": 3.8018547839855237, "grad_norm": 0.46067264676094055, "learning_rate": 7.11318354397343e-06, "loss": 0.6491, "step": 8404 }, { "epoch": 3.802307170323456, "grad_norm": 0.3664512634277344, "learning_rate": 7.112519430491642e-06, "loss": 0.4859, "step": 8405 }, { "epoch": 3.802759556661389, "grad_norm": 0.39542654156684875, "learning_rate": 7.111855271639473e-06, "loss": 0.4959, "step": 8406 }, { "epoch": 3.8032119429993214, "grad_norm": 0.46381843090057373, "learning_rate": 7.111191067431186e-06, "loss": 0.6342, "step": 8407 }, { "epoch": 3.803664329337254, "grad_norm": 0.4594956040382385, "learning_rate": 7.110526817881044e-06, "loss": 0.6025, "step": 8408 }, { "epoch": 3.8041167156751867, "grad_norm": 0.38493505120277405, "learning_rate": 7.109862523003316e-06, "loss": 0.5001, "step": 8409 }, { "epoch": 3.804569102013119, "grad_norm": 0.4823687672615051, "learning_rate": 7.1091981828122665e-06, "loss": 0.7192, "step": 8410 }, { "epoch": 3.805021488351052, "grad_norm": 0.43195539712905884, "learning_rate": 7.108533797322166e-06, "loss": 0.5707, "step": 8411 }, { "epoch": 3.8054738746889845, "grad_norm": 0.4173101484775543, "learning_rate": 7.107869366547281e-06, "loss": 0.5248, "step": 8412 }, { "epoch": 3.805926261026917, "grad_norm": 0.4161129295825958, "learning_rate": 7.107204890501883e-06, "loss": 0.496, "step": 8413 }, { "epoch": 3.8063786473648493, "grad_norm": 0.5160880088806152, "learning_rate": 7.106540369200241e-06, "loss": 0.7248, "step": 8414 }, { "epoch": 3.806831033702782, "grad_norm": 0.407747745513916, "learning_rate": 7.10587580265663e-06, "loss": 0.4727, "step": 8415 }, { "epoch": 3.8072834200407146, "grad_norm": 0.4517309069633484, "learning_rate": 7.105211190885319e-06, "loss": 0.5346, "step": 8416 }, { "epoch": 3.8077358063786475, "grad_norm": 0.40690264105796814, "learning_rate": 7.104546533900586e-06, "loss": 0.4957, "step": 8417 }, { "epoch": 3.80818819271658, "grad_norm": 0.502209484577179, "learning_rate": 7.103881831716704e-06, "loss": 0.5893, "step": 8418 }, { "epoch": 3.8086405790545124, "grad_norm": 0.4326380491256714, "learning_rate": 7.1032170843479465e-06, "loss": 0.5302, "step": 8419 }, { "epoch": 3.8090929653924452, "grad_norm": 0.4431098997592926, "learning_rate": 7.102552291808592e-06, "loss": 0.4908, "step": 8420 }, { "epoch": 3.8095453517303777, "grad_norm": 0.4674176871776581, "learning_rate": 7.10188745411292e-06, "loss": 0.6585, "step": 8421 }, { "epoch": 3.8099977380683105, "grad_norm": 0.5155872106552124, "learning_rate": 7.101222571275206e-06, "loss": 0.5948, "step": 8422 }, { "epoch": 3.810450124406243, "grad_norm": 0.4992597699165344, "learning_rate": 7.100557643309732e-06, "loss": 0.6051, "step": 8423 }, { "epoch": 3.8109025107441754, "grad_norm": 0.4952602684497833, "learning_rate": 7.099892670230777e-06, "loss": 0.5657, "step": 8424 }, { "epoch": 3.8113548970821083, "grad_norm": 0.499919056892395, "learning_rate": 7.099227652052624e-06, "loss": 0.523, "step": 8425 }, { "epoch": 3.8118072834200407, "grad_norm": 0.5105465054512024, "learning_rate": 7.098562588789554e-06, "loss": 0.4947, "step": 8426 }, { "epoch": 3.812259669757973, "grad_norm": 0.4902293086051941, "learning_rate": 7.097897480455851e-06, "loss": 0.5134, "step": 8427 }, { "epoch": 3.812712056095906, "grad_norm": 0.5530281662940979, "learning_rate": 7.0972323270658e-06, "loss": 0.592, "step": 8428 }, { "epoch": 3.8131644424338385, "grad_norm": 0.5600841045379639, "learning_rate": 7.096567128633687e-06, "loss": 0.5354, "step": 8429 }, { "epoch": 3.813616828771771, "grad_norm": 0.5288351774215698, "learning_rate": 7.095901885173796e-06, "loss": 0.5261, "step": 8430 }, { "epoch": 3.8140692151097038, "grad_norm": 0.5728168487548828, "learning_rate": 7.095236596700416e-06, "loss": 0.4929, "step": 8431 }, { "epoch": 3.814521601447636, "grad_norm": 0.431043416261673, "learning_rate": 7.094571263227839e-06, "loss": 0.9819, "step": 8432 }, { "epoch": 3.814973987785569, "grad_norm": 0.1921032965183258, "learning_rate": 7.0939058847703465e-06, "loss": 0.9484, "step": 8433 }, { "epoch": 3.8154263741235015, "grad_norm": 0.21151228249073029, "learning_rate": 7.093240461342235e-06, "loss": 0.6511, "step": 8434 }, { "epoch": 3.815878760461434, "grad_norm": 0.2644018232822418, "learning_rate": 7.092574992957791e-06, "loss": 0.6028, "step": 8435 }, { "epoch": 3.816331146799367, "grad_norm": 0.27947086095809937, "learning_rate": 7.0919094796313114e-06, "loss": 0.565, "step": 8436 }, { "epoch": 3.8167835331372992, "grad_norm": 0.2814664840698242, "learning_rate": 7.091243921377087e-06, "loss": 0.5728, "step": 8437 }, { "epoch": 3.817235919475232, "grad_norm": 0.28889480233192444, "learning_rate": 7.090578318209411e-06, "loss": 0.5166, "step": 8438 }, { "epoch": 3.8176883058131645, "grad_norm": 0.35217276215553284, "learning_rate": 7.089912670142581e-06, "loss": 0.6939, "step": 8439 }, { "epoch": 3.818140692151097, "grad_norm": 0.33717456459999084, "learning_rate": 7.089246977190892e-06, "loss": 0.5235, "step": 8440 }, { "epoch": 3.8185930784890294, "grad_norm": 0.3588196933269501, "learning_rate": 7.08858123936864e-06, "loss": 0.6777, "step": 8441 }, { "epoch": 3.8190454648269623, "grad_norm": 0.3419398069381714, "learning_rate": 7.087915456690124e-06, "loss": 0.572, "step": 8442 }, { "epoch": 3.8194978511648947, "grad_norm": 0.3540392518043518, "learning_rate": 7.087249629169642e-06, "loss": 0.5865, "step": 8443 }, { "epoch": 3.8199502375028276, "grad_norm": 0.34014636278152466, "learning_rate": 7.086583756821497e-06, "loss": 0.5524, "step": 8444 }, { "epoch": 3.82040262384076, "grad_norm": 0.31076645851135254, "learning_rate": 7.085917839659986e-06, "loss": 0.4466, "step": 8445 }, { "epoch": 3.8208550101786924, "grad_norm": 0.3663107752799988, "learning_rate": 7.085251877699412e-06, "loss": 0.5446, "step": 8446 }, { "epoch": 3.8213073965166253, "grad_norm": 0.3540785610675812, "learning_rate": 7.084585870954078e-06, "loss": 0.6014, "step": 8447 }, { "epoch": 3.8217597828545578, "grad_norm": 0.391757607460022, "learning_rate": 7.083919819438289e-06, "loss": 0.5749, "step": 8448 }, { "epoch": 3.8222121691924906, "grad_norm": 0.36265432834625244, "learning_rate": 7.0832537231663475e-06, "loss": 0.5523, "step": 8449 }, { "epoch": 3.822664555530423, "grad_norm": 0.38054361939430237, "learning_rate": 7.082587582152561e-06, "loss": 0.5747, "step": 8450 }, { "epoch": 3.8231169418683555, "grad_norm": 0.4056057929992676, "learning_rate": 7.081921396411235e-06, "loss": 0.7029, "step": 8451 }, { "epoch": 3.823569328206288, "grad_norm": 0.40458858013153076, "learning_rate": 7.081255165956677e-06, "loss": 0.4831, "step": 8452 }, { "epoch": 3.824021714544221, "grad_norm": 0.37585529685020447, "learning_rate": 7.080588890803197e-06, "loss": 0.554, "step": 8453 }, { "epoch": 3.8244741008821532, "grad_norm": 0.41456741094589233, "learning_rate": 7.079922570965104e-06, "loss": 0.6417, "step": 8454 }, { "epoch": 3.824926487220086, "grad_norm": 0.388065367937088, "learning_rate": 7.079256206456709e-06, "loss": 0.5657, "step": 8455 }, { "epoch": 3.8253788735580185, "grad_norm": 0.3909775912761688, "learning_rate": 7.0785897972923225e-06, "loss": 0.5118, "step": 8456 }, { "epoch": 3.825831259895951, "grad_norm": 0.41105082631111145, "learning_rate": 7.077923343486255e-06, "loss": 0.555, "step": 8457 }, { "epoch": 3.826283646233884, "grad_norm": 0.3841765820980072, "learning_rate": 7.0772568450528236e-06, "loss": 0.404, "step": 8458 }, { "epoch": 3.8267360325718163, "grad_norm": 0.39709535241127014, "learning_rate": 7.07659030200634e-06, "loss": 0.5563, "step": 8459 }, { "epoch": 3.827188418909749, "grad_norm": 0.4500335156917572, "learning_rate": 7.075923714361121e-06, "loss": 0.6071, "step": 8460 }, { "epoch": 3.8276408052476816, "grad_norm": 0.4525498151779175, "learning_rate": 7.075257082131481e-06, "loss": 0.641, "step": 8461 }, { "epoch": 3.828093191585614, "grad_norm": 0.4092669188976288, "learning_rate": 7.07459040533174e-06, "loss": 0.4912, "step": 8462 }, { "epoch": 3.8285455779235464, "grad_norm": 0.4388555884361267, "learning_rate": 7.073923683976214e-06, "loss": 0.5759, "step": 8463 }, { "epoch": 3.8289979642614793, "grad_norm": 0.4232099652290344, "learning_rate": 7.073256918079223e-06, "loss": 0.551, "step": 8464 }, { "epoch": 3.8294503505994117, "grad_norm": 0.5034275054931641, "learning_rate": 7.0725901076550864e-06, "loss": 0.5839, "step": 8465 }, { "epoch": 3.8299027369373446, "grad_norm": 0.4207353889942169, "learning_rate": 7.071923252718127e-06, "loss": 0.4965, "step": 8466 }, { "epoch": 3.830355123275277, "grad_norm": 0.4787149131298065, "learning_rate": 7.071256353282664e-06, "loss": 0.5042, "step": 8467 }, { "epoch": 3.8308075096132095, "grad_norm": 0.41899868845939636, "learning_rate": 7.070589409363022e-06, "loss": 0.4314, "step": 8468 }, { "epoch": 3.8312598959511424, "grad_norm": 0.43993327021598816, "learning_rate": 7.069922420973524e-06, "loss": 0.5213, "step": 8469 }, { "epoch": 3.831712282289075, "grad_norm": 0.42049336433410645, "learning_rate": 7.0692553881284955e-06, "loss": 0.4821, "step": 8470 }, { "epoch": 3.8321646686270077, "grad_norm": 0.3949207663536072, "learning_rate": 7.068588310842264e-06, "loss": 0.4263, "step": 8471 }, { "epoch": 3.83261705496494, "grad_norm": 0.4907822012901306, "learning_rate": 7.067921189129152e-06, "loss": 0.5279, "step": 8472 }, { "epoch": 3.8330694413028725, "grad_norm": 0.4660489857196808, "learning_rate": 7.067254023003491e-06, "loss": 0.5473, "step": 8473 }, { "epoch": 3.8335218276408054, "grad_norm": 0.5197551846504211, "learning_rate": 7.066586812479609e-06, "loss": 0.5983, "step": 8474 }, { "epoch": 3.833974213978738, "grad_norm": 0.5531418323516846, "learning_rate": 7.0659195575718346e-06, "loss": 0.5574, "step": 8475 }, { "epoch": 3.8344266003166707, "grad_norm": 0.47328975796699524, "learning_rate": 7.065252258294498e-06, "loss": 0.5136, "step": 8476 }, { "epoch": 3.834878986654603, "grad_norm": 0.46627095341682434, "learning_rate": 7.064584914661934e-06, "loss": 0.4714, "step": 8477 }, { "epoch": 3.8353313729925356, "grad_norm": 0.5540060997009277, "learning_rate": 7.06391752668847e-06, "loss": 0.5538, "step": 8478 }, { "epoch": 3.835783759330468, "grad_norm": 0.4649556875228882, "learning_rate": 7.063250094388443e-06, "loss": 0.4215, "step": 8479 }, { "epoch": 3.836236145668401, "grad_norm": 0.5929507613182068, "learning_rate": 7.062582617776186e-06, "loss": 0.5205, "step": 8480 }, { "epoch": 3.8366885320063333, "grad_norm": 0.5884465575218201, "learning_rate": 7.061915096866034e-06, "loss": 0.4828, "step": 8481 }, { "epoch": 3.837140918344266, "grad_norm": 0.47426995635032654, "learning_rate": 7.0612475316723265e-06, "loss": 0.8924, "step": 8482 }, { "epoch": 3.8375933046821986, "grad_norm": 0.22525371611118317, "learning_rate": 7.060579922209396e-06, "loss": 0.7768, "step": 8483 }, { "epoch": 3.838045691020131, "grad_norm": 0.2684219479560852, "learning_rate": 7.059912268491583e-06, "loss": 0.6807, "step": 8484 }, { "epoch": 3.838498077358064, "grad_norm": 0.2813349962234497, "learning_rate": 7.059244570533228e-06, "loss": 0.5947, "step": 8485 }, { "epoch": 3.8389504636959964, "grad_norm": 0.34152886271476746, "learning_rate": 7.058576828348669e-06, "loss": 0.6964, "step": 8486 }, { "epoch": 3.8394028500339292, "grad_norm": 0.3377821743488312, "learning_rate": 7.057909041952247e-06, "loss": 0.5906, "step": 8487 }, { "epoch": 3.8398552363718617, "grad_norm": 0.3312196433544159, "learning_rate": 7.057241211358306e-06, "loss": 0.5956, "step": 8488 }, { "epoch": 3.840307622709794, "grad_norm": 0.29598522186279297, "learning_rate": 7.056573336581188e-06, "loss": 0.5066, "step": 8489 }, { "epoch": 3.8407600090477265, "grad_norm": 0.32711711525917053, "learning_rate": 7.055905417635233e-06, "loss": 0.5763, "step": 8490 }, { "epoch": 3.8412123953856594, "grad_norm": 0.3487837314605713, "learning_rate": 7.055237454534793e-06, "loss": 0.5428, "step": 8491 }, { "epoch": 3.841664781723592, "grad_norm": 0.3644358515739441, "learning_rate": 7.054569447294208e-06, "loss": 0.6156, "step": 8492 }, { "epoch": 3.8421171680615247, "grad_norm": 0.3592225909233093, "learning_rate": 7.053901395927829e-06, "loss": 0.5766, "step": 8493 }, { "epoch": 3.842569554399457, "grad_norm": 0.34288519620895386, "learning_rate": 7.0532333004499996e-06, "loss": 0.5827, "step": 8494 }, { "epoch": 3.8430219407373896, "grad_norm": 0.3545849323272705, "learning_rate": 7.052565160875071e-06, "loss": 0.5303, "step": 8495 }, { "epoch": 3.8434743270753224, "grad_norm": 0.3255024254322052, "learning_rate": 7.0518969772173916e-06, "loss": 0.4914, "step": 8496 }, { "epoch": 3.843926713413255, "grad_norm": 0.4385387599468231, "learning_rate": 7.051228749491313e-06, "loss": 0.6982, "step": 8497 }, { "epoch": 3.8443790997511877, "grad_norm": 0.3534526228904724, "learning_rate": 7.050560477711186e-06, "loss": 0.5148, "step": 8498 }, { "epoch": 3.84483148608912, "grad_norm": 0.4213140606880188, "learning_rate": 7.049892161891363e-06, "loss": 0.6518, "step": 8499 }, { "epoch": 3.8452838724270526, "grad_norm": 0.3858993351459503, "learning_rate": 7.049223802046199e-06, "loss": 0.5648, "step": 8500 }, { "epoch": 3.845736258764985, "grad_norm": 0.36116480827331543, "learning_rate": 7.048555398190045e-06, "loss": 0.581, "step": 8501 }, { "epoch": 3.846188645102918, "grad_norm": 0.35795098543167114, "learning_rate": 7.047886950337259e-06, "loss": 0.4905, "step": 8502 }, { "epoch": 3.8466410314408503, "grad_norm": 0.4023960828781128, "learning_rate": 7.047218458502196e-06, "loss": 0.6245, "step": 8503 }, { "epoch": 3.847093417778783, "grad_norm": 0.43698424100875854, "learning_rate": 7.046549922699213e-06, "loss": 0.5964, "step": 8504 }, { "epoch": 3.8475458041167157, "grad_norm": 0.37037262320518494, "learning_rate": 7.045881342942669e-06, "loss": 0.4855, "step": 8505 }, { "epoch": 3.847998190454648, "grad_norm": 0.4171759784221649, "learning_rate": 7.045212719246921e-06, "loss": 0.5236, "step": 8506 }, { "epoch": 3.848450576792581, "grad_norm": 0.4119621813297272, "learning_rate": 7.044544051626332e-06, "loss": 0.558, "step": 8507 }, { "epoch": 3.8489029631305134, "grad_norm": 0.38228675723075867, "learning_rate": 7.043875340095259e-06, "loss": 0.5145, "step": 8508 }, { "epoch": 3.8493553494684463, "grad_norm": 0.3698817789554596, "learning_rate": 7.043206584668068e-06, "loss": 0.5079, "step": 8509 }, { "epoch": 3.8498077358063787, "grad_norm": 0.42301177978515625, "learning_rate": 7.04253778535912e-06, "loss": 0.5967, "step": 8510 }, { "epoch": 3.850260122144311, "grad_norm": 0.39944908022880554, "learning_rate": 7.04186894218278e-06, "loss": 0.5243, "step": 8511 }, { "epoch": 3.850712508482244, "grad_norm": 0.41257333755493164, "learning_rate": 7.04120005515341e-06, "loss": 0.5106, "step": 8512 }, { "epoch": 3.8511648948201764, "grad_norm": 0.43097788095474243, "learning_rate": 7.040531124285377e-06, "loss": 0.5303, "step": 8513 }, { "epoch": 3.851617281158109, "grad_norm": 0.4307059347629547, "learning_rate": 7.0398621495930475e-06, "loss": 0.553, "step": 8514 }, { "epoch": 3.8520696674960417, "grad_norm": 0.42678210139274597, "learning_rate": 7.039193131090789e-06, "loss": 0.6096, "step": 8515 }, { "epoch": 3.852522053833974, "grad_norm": 0.40469813346862793, "learning_rate": 7.038524068792971e-06, "loss": 0.5193, "step": 8516 }, { "epoch": 3.8529744401719066, "grad_norm": 0.4625941514968872, "learning_rate": 7.037854962713961e-06, "loss": 0.5427, "step": 8517 }, { "epoch": 3.8534268265098395, "grad_norm": 0.4212914705276489, "learning_rate": 7.03718581286813e-06, "loss": 0.4695, "step": 8518 }, { "epoch": 3.853879212847772, "grad_norm": 0.46481239795684814, "learning_rate": 7.036516619269851e-06, "loss": 0.5658, "step": 8519 }, { "epoch": 3.854331599185705, "grad_norm": 0.49594396352767944, "learning_rate": 7.035847381933494e-06, "loss": 0.5954, "step": 8520 }, { "epoch": 3.854783985523637, "grad_norm": 0.4942263662815094, "learning_rate": 7.035178100873432e-06, "loss": 0.551, "step": 8521 }, { "epoch": 3.8552363718615696, "grad_norm": 0.5234791040420532, "learning_rate": 7.034508776104041e-06, "loss": 0.6283, "step": 8522 }, { "epoch": 3.8556887581995025, "grad_norm": 0.44325920939445496, "learning_rate": 7.033839407639695e-06, "loss": 0.4508, "step": 8523 }, { "epoch": 3.856141144537435, "grad_norm": 0.5092695951461792, "learning_rate": 7.033169995494769e-06, "loss": 0.5322, "step": 8524 }, { "epoch": 3.856593530875368, "grad_norm": 0.45881789922714233, "learning_rate": 7.032500539683642e-06, "loss": 0.473, "step": 8525 }, { "epoch": 3.8570459172133003, "grad_norm": 0.4367724657058716, "learning_rate": 7.031831040220691e-06, "loss": 0.4672, "step": 8526 }, { "epoch": 3.8574983035512327, "grad_norm": 0.5791262984275818, "learning_rate": 7.031161497120293e-06, "loss": 0.6342, "step": 8527 }, { "epoch": 3.857950689889165, "grad_norm": 0.5112612247467041, "learning_rate": 7.03049191039683e-06, "loss": 0.5652, "step": 8528 }, { "epoch": 3.858403076227098, "grad_norm": 0.5638890266418457, "learning_rate": 7.029822280064682e-06, "loss": 0.5751, "step": 8529 }, { "epoch": 3.8588554625650304, "grad_norm": 0.506472110748291, "learning_rate": 7.029152606138231e-06, "loss": 0.5011, "step": 8530 }, { "epoch": 3.8593078489029633, "grad_norm": 0.5703691840171814, "learning_rate": 7.028482888631858e-06, "loss": 0.5401, "step": 8531 }, { "epoch": 3.8597602352408957, "grad_norm": 0.4407188296318054, "learning_rate": 7.027813127559948e-06, "loss": 0.9285, "step": 8532 }, { "epoch": 3.860212621578828, "grad_norm": 0.2579137980937958, "learning_rate": 7.027143322936885e-06, "loss": 0.574, "step": 8533 }, { "epoch": 3.860665007916761, "grad_norm": 0.2931665778160095, "learning_rate": 7.026473474777053e-06, "loss": 0.6723, "step": 8534 }, { "epoch": 3.8611173942546935, "grad_norm": 0.33046096563339233, "learning_rate": 7.02580358309484e-06, "loss": 0.701, "step": 8535 }, { "epoch": 3.8615697805926263, "grad_norm": 0.3063443899154663, "learning_rate": 7.0251336479046325e-06, "loss": 0.6277, "step": 8536 }, { "epoch": 3.8620221669305588, "grad_norm": 0.31064364314079285, "learning_rate": 7.02446366922082e-06, "loss": 0.5135, "step": 8537 }, { "epoch": 3.862474553268491, "grad_norm": 0.3545629382133484, "learning_rate": 7.023793647057789e-06, "loss": 0.7331, "step": 8538 }, { "epoch": 3.8629269396064236, "grad_norm": 0.3581921458244324, "learning_rate": 7.0231235814299315e-06, "loss": 0.5702, "step": 8539 }, { "epoch": 3.8633793259443565, "grad_norm": 0.31612905859947205, "learning_rate": 7.022453472351637e-06, "loss": 0.5815, "step": 8540 }, { "epoch": 3.863831712282289, "grad_norm": 0.3697175681591034, "learning_rate": 7.021783319837298e-06, "loss": 0.6943, "step": 8541 }, { "epoch": 3.864284098620222, "grad_norm": 0.3894196152687073, "learning_rate": 7.021113123901309e-06, "loss": 0.6959, "step": 8542 }, { "epoch": 3.8647364849581543, "grad_norm": 0.3523353636264801, "learning_rate": 7.020442884558062e-06, "loss": 0.5449, "step": 8543 }, { "epoch": 3.8651888712960867, "grad_norm": 0.37519529461860657, "learning_rate": 7.019772601821952e-06, "loss": 0.6133, "step": 8544 }, { "epoch": 3.8656412576340196, "grad_norm": 0.37897124886512756, "learning_rate": 7.019102275707373e-06, "loss": 0.5498, "step": 8545 }, { "epoch": 3.866093643971952, "grad_norm": 0.37865719199180603, "learning_rate": 7.018431906228723e-06, "loss": 0.4807, "step": 8546 }, { "epoch": 3.866546030309885, "grad_norm": 0.38198530673980713, "learning_rate": 7.0177614934004e-06, "loss": 0.5959, "step": 8547 }, { "epoch": 3.8669984166478173, "grad_norm": 0.3819615840911865, "learning_rate": 7.017091037236802e-06, "loss": 0.6064, "step": 8548 }, { "epoch": 3.8674508029857497, "grad_norm": 0.3681950569152832, "learning_rate": 7.0164205377523284e-06, "loss": 0.529, "step": 8549 }, { "epoch": 3.867903189323682, "grad_norm": 0.373684287071228, "learning_rate": 7.0157499949613785e-06, "loss": 0.5976, "step": 8550 }, { "epoch": 3.868355575661615, "grad_norm": 0.3650743365287781, "learning_rate": 7.015079408878354e-06, "loss": 0.5268, "step": 8551 }, { "epoch": 3.8688079619995475, "grad_norm": 0.40508538484573364, "learning_rate": 7.014408779517658e-06, "loss": 0.5388, "step": 8552 }, { "epoch": 3.8692603483374803, "grad_norm": 0.39549994468688965, "learning_rate": 7.013738106893693e-06, "loss": 0.5568, "step": 8553 }, { "epoch": 3.8697127346754128, "grad_norm": 0.399370014667511, "learning_rate": 7.013067391020864e-06, "loss": 0.4919, "step": 8554 }, { "epoch": 3.870165121013345, "grad_norm": 0.3892631530761719, "learning_rate": 7.012396631913572e-06, "loss": 0.4934, "step": 8555 }, { "epoch": 3.870617507351278, "grad_norm": 0.3841637969017029, "learning_rate": 7.011725829586227e-06, "loss": 0.5614, "step": 8556 }, { "epoch": 3.8710698936892105, "grad_norm": 0.44260716438293457, "learning_rate": 7.0110549840532325e-06, "loss": 0.6141, "step": 8557 }, { "epoch": 3.8715222800271434, "grad_norm": 0.4264954924583435, "learning_rate": 7.010384095328999e-06, "loss": 0.6038, "step": 8558 }, { "epoch": 3.871974666365076, "grad_norm": 0.3886229693889618, "learning_rate": 7.009713163427935e-06, "loss": 0.5235, "step": 8559 }, { "epoch": 3.8724270527030082, "grad_norm": 0.4097563326358795, "learning_rate": 7.009042188364448e-06, "loss": 0.527, "step": 8560 }, { "epoch": 3.872879439040941, "grad_norm": 0.3668091595172882, "learning_rate": 7.008371170152949e-06, "loss": 0.4448, "step": 8561 }, { "epoch": 3.8733318253788735, "grad_norm": 0.39737531542778015, "learning_rate": 7.007700108807851e-06, "loss": 0.4213, "step": 8562 }, { "epoch": 3.8737842117168064, "grad_norm": 0.4507256746292114, "learning_rate": 7.007029004343564e-06, "loss": 0.5326, "step": 8563 }, { "epoch": 3.874236598054739, "grad_norm": 0.41676434874534607, "learning_rate": 7.006357856774504e-06, "loss": 0.5532, "step": 8564 }, { "epoch": 3.8746889843926713, "grad_norm": 0.4131351113319397, "learning_rate": 7.005686666115083e-06, "loss": 0.5492, "step": 8565 }, { "epoch": 3.8751413707306037, "grad_norm": 0.4043789505958557, "learning_rate": 7.005015432379718e-06, "loss": 0.4501, "step": 8566 }, { "epoch": 3.8755937570685366, "grad_norm": 0.4381093680858612, "learning_rate": 7.004344155582823e-06, "loss": 0.5295, "step": 8567 }, { "epoch": 3.876046143406469, "grad_norm": 0.4410848915576935, "learning_rate": 7.003672835738815e-06, "loss": 0.564, "step": 8568 }, { "epoch": 3.876498529744402, "grad_norm": 0.48082655668258667, "learning_rate": 7.003001472862112e-06, "loss": 0.5983, "step": 8569 }, { "epoch": 3.8769509160823343, "grad_norm": 0.4249984622001648, "learning_rate": 7.002330066967136e-06, "loss": 0.4856, "step": 8570 }, { "epoch": 3.8774033024202668, "grad_norm": 0.48186197876930237, "learning_rate": 7.001658618068301e-06, "loss": 0.5049, "step": 8571 }, { "epoch": 3.8778556887581996, "grad_norm": 0.48458045721054077, "learning_rate": 7.000987126180033e-06, "loss": 0.5727, "step": 8572 }, { "epoch": 3.878308075096132, "grad_norm": 0.511160135269165, "learning_rate": 7.0003155913167506e-06, "loss": 0.5512, "step": 8573 }, { "epoch": 3.878760461434065, "grad_norm": 0.4873487949371338, "learning_rate": 6.999644013492877e-06, "loss": 0.519, "step": 8574 }, { "epoch": 3.8792128477719974, "grad_norm": 0.44018858671188354, "learning_rate": 6.998972392722836e-06, "loss": 0.4536, "step": 8575 }, { "epoch": 3.87966523410993, "grad_norm": 0.47599077224731445, "learning_rate": 6.998300729021051e-06, "loss": 0.4483, "step": 8576 }, { "epoch": 3.8801176204478622, "grad_norm": 0.5028440952301025, "learning_rate": 6.9976290224019495e-06, "loss": 0.4645, "step": 8577 }, { "epoch": 3.880570006785795, "grad_norm": 0.6016746759414673, "learning_rate": 6.996957272879955e-06, "loss": 0.5845, "step": 8578 }, { "epoch": 3.8810223931237275, "grad_norm": 0.5754581093788147, "learning_rate": 6.996285480469496e-06, "loss": 0.5885, "step": 8579 }, { "epoch": 3.8814747794616604, "grad_norm": 0.5643505454063416, "learning_rate": 6.995613645184999e-06, "loss": 0.6131, "step": 8580 }, { "epoch": 3.881927165799593, "grad_norm": 0.6563587784767151, "learning_rate": 6.994941767040895e-06, "loss": 0.5962, "step": 8581 }, { "epoch": 3.8823795521375253, "grad_norm": 0.5130084156990051, "learning_rate": 6.9942698460516125e-06, "loss": 0.9798, "step": 8582 }, { "epoch": 3.882831938475458, "grad_norm": 0.18360038101673126, "learning_rate": 6.993597882231583e-06, "loss": 1.0598, "step": 8583 }, { "epoch": 3.8832843248133906, "grad_norm": 0.2541874051094055, "learning_rate": 6.9929258755952375e-06, "loss": 0.7095, "step": 8584 }, { "epoch": 3.8837367111513235, "grad_norm": 0.29707270860671997, "learning_rate": 6.992253826157009e-06, "loss": 0.8052, "step": 8585 }, { "epoch": 3.884189097489256, "grad_norm": 0.326960027217865, "learning_rate": 6.991581733931332e-06, "loss": 0.554, "step": 8586 }, { "epoch": 3.8846414838271883, "grad_norm": 0.2514895796775818, "learning_rate": 6.990909598932641e-06, "loss": 0.4354, "step": 8587 }, { "epoch": 3.8850938701651208, "grad_norm": 0.2857106328010559, "learning_rate": 6.99023742117537e-06, "loss": 0.5744, "step": 8588 }, { "epoch": 3.8855462565030536, "grad_norm": 0.2761799693107605, "learning_rate": 6.989565200673954e-06, "loss": 0.5666, "step": 8589 }, { "epoch": 3.885998642840986, "grad_norm": 0.30420026183128357, "learning_rate": 6.9888929374428324e-06, "loss": 0.6077, "step": 8590 }, { "epoch": 3.886451029178919, "grad_norm": 0.3204323649406433, "learning_rate": 6.988220631496444e-06, "loss": 0.6009, "step": 8591 }, { "epoch": 3.8869034155168514, "grad_norm": 0.37106528878211975, "learning_rate": 6.987548282849226e-06, "loss": 0.6622, "step": 8592 }, { "epoch": 3.887355801854784, "grad_norm": 0.3118378520011902, "learning_rate": 6.98687589151562e-06, "loss": 0.5545, "step": 8593 }, { "epoch": 3.8878081881927167, "grad_norm": 0.36343830823898315, "learning_rate": 6.986203457510066e-06, "loss": 0.6648, "step": 8594 }, { "epoch": 3.888260574530649, "grad_norm": 0.37937572598457336, "learning_rate": 6.985530980847004e-06, "loss": 0.577, "step": 8595 }, { "epoch": 3.888712960868582, "grad_norm": 0.3569967448711395, "learning_rate": 6.9848584615408795e-06, "loss": 0.617, "step": 8596 }, { "epoch": 3.8891653472065144, "grad_norm": 0.40582209825515747, "learning_rate": 6.9841858996061365e-06, "loss": 0.6952, "step": 8597 }, { "epoch": 3.889617733544447, "grad_norm": 0.3509305417537689, "learning_rate": 6.983513295057217e-06, "loss": 0.5553, "step": 8598 }, { "epoch": 3.8900701198823797, "grad_norm": 0.3363732099533081, "learning_rate": 6.982840647908568e-06, "loss": 0.4992, "step": 8599 }, { "epoch": 3.890522506220312, "grad_norm": 0.39114412665367126, "learning_rate": 6.982167958174636e-06, "loss": 0.6118, "step": 8600 }, { "epoch": 3.890522506220312, "eval_loss": 0.5914932489395142, "eval_runtime": 25.4961, "eval_samples_per_second": 29.181, "eval_steps_per_second": 7.295, "step": 8600 }, { "epoch": 3.8909748925582446, "grad_norm": 0.39393365383148193, "learning_rate": 6.981495225869868e-06, "loss": 0.6511, "step": 8601 }, { "epoch": 3.8914272788961775, "grad_norm": 0.3840406537055969, "learning_rate": 6.98082245100871e-06, "loss": 0.5436, "step": 8602 }, { "epoch": 3.89187966523411, "grad_norm": 0.3928453028202057, "learning_rate": 6.980149633605615e-06, "loss": 0.5149, "step": 8603 }, { "epoch": 3.8923320515720423, "grad_norm": 0.392561137676239, "learning_rate": 6.979476773675031e-06, "loss": 0.529, "step": 8604 }, { "epoch": 3.892784437909975, "grad_norm": 0.3860798478126526, "learning_rate": 6.9788038712314096e-06, "loss": 0.5416, "step": 8605 }, { "epoch": 3.8932368242479076, "grad_norm": 0.41528552770614624, "learning_rate": 6.9781309262892015e-06, "loss": 0.6063, "step": 8606 }, { "epoch": 3.8936892105858405, "grad_norm": 0.4284402132034302, "learning_rate": 6.977457938862861e-06, "loss": 0.6042, "step": 8607 }, { "epoch": 3.894141596923773, "grad_norm": 0.39089667797088623, "learning_rate": 6.97678490896684e-06, "loss": 0.5333, "step": 8608 }, { "epoch": 3.8945939832617054, "grad_norm": 0.40115970373153687, "learning_rate": 6.976111836615594e-06, "loss": 0.5217, "step": 8609 }, { "epoch": 3.8950463695996382, "grad_norm": 0.42595329880714417, "learning_rate": 6.97543872182358e-06, "loss": 0.6122, "step": 8610 }, { "epoch": 3.8954987559375707, "grad_norm": 0.39366766810417175, "learning_rate": 6.974765564605252e-06, "loss": 0.5647, "step": 8611 }, { "epoch": 3.8959511422755035, "grad_norm": 0.42707642912864685, "learning_rate": 6.974092364975069e-06, "loss": 0.5033, "step": 8612 }, { "epoch": 3.896403528613436, "grad_norm": 0.40340280532836914, "learning_rate": 6.973419122947489e-06, "loss": 0.5814, "step": 8613 }, { "epoch": 3.8968559149513684, "grad_norm": 0.39714109897613525, "learning_rate": 6.9727458385369696e-06, "loss": 0.506, "step": 8614 }, { "epoch": 3.897308301289301, "grad_norm": 0.396867036819458, "learning_rate": 6.972072511757972e-06, "loss": 0.348, "step": 8615 }, { "epoch": 3.8977606876272337, "grad_norm": 0.41898876428604126, "learning_rate": 6.971399142624958e-06, "loss": 0.4653, "step": 8616 }, { "epoch": 3.898213073965166, "grad_norm": 0.4218512177467346, "learning_rate": 6.970725731152389e-06, "loss": 0.4667, "step": 8617 }, { "epoch": 3.898665460303099, "grad_norm": 0.4381503760814667, "learning_rate": 6.970052277354727e-06, "loss": 0.5371, "step": 8618 }, { "epoch": 3.8991178466410314, "grad_norm": 0.3868419826030731, "learning_rate": 6.969378781246436e-06, "loss": 0.4719, "step": 8619 }, { "epoch": 3.899570232978964, "grad_norm": 0.4479169249534607, "learning_rate": 6.968705242841981e-06, "loss": 0.5499, "step": 8620 }, { "epoch": 3.9000226193168968, "grad_norm": 0.4389864206314087, "learning_rate": 6.968031662155828e-06, "loss": 0.4891, "step": 8621 }, { "epoch": 3.900475005654829, "grad_norm": 0.4662887752056122, "learning_rate": 6.967358039202443e-06, "loss": 0.4757, "step": 8622 }, { "epoch": 3.900927391992762, "grad_norm": 0.5197649002075195, "learning_rate": 6.966684373996292e-06, "loss": 0.6536, "step": 8623 }, { "epoch": 3.9013797783306945, "grad_norm": 0.5003644824028015, "learning_rate": 6.966010666551845e-06, "loss": 0.5132, "step": 8624 }, { "epoch": 3.901832164668627, "grad_norm": 0.5327630639076233, "learning_rate": 6.965336916883569e-06, "loss": 0.6636, "step": 8625 }, { "epoch": 3.9022845510065594, "grad_norm": 0.4389488101005554, "learning_rate": 6.964663125005937e-06, "loss": 0.4734, "step": 8626 }, { "epoch": 3.9027369373444922, "grad_norm": 0.5012379884719849, "learning_rate": 6.96398929093342e-06, "loss": 0.4885, "step": 8627 }, { "epoch": 3.9031893236824247, "grad_norm": 0.5440966486930847, "learning_rate": 6.963315414680486e-06, "loss": 0.5883, "step": 8628 }, { "epoch": 3.9036417100203575, "grad_norm": 0.5713669657707214, "learning_rate": 6.962641496261611e-06, "loss": 0.4976, "step": 8629 }, { "epoch": 3.90409409635829, "grad_norm": 0.5828353762626648, "learning_rate": 6.961967535691268e-06, "loss": 0.5054, "step": 8630 }, { "epoch": 3.9045464826962224, "grad_norm": 0.5504095554351807, "learning_rate": 6.96129353298393e-06, "loss": 0.4859, "step": 8631 }, { "epoch": 3.9049988690341553, "grad_norm": 0.4090718924999237, "learning_rate": 6.960619488154076e-06, "loss": 0.7639, "step": 8632 }, { "epoch": 3.9054512553720877, "grad_norm": 0.256238728761673, "learning_rate": 6.9599454012161805e-06, "loss": 0.8868, "step": 8633 }, { "epoch": 3.9059036417100206, "grad_norm": 0.24180589616298676, "learning_rate": 6.959271272184719e-06, "loss": 0.6595, "step": 8634 }, { "epoch": 3.906356028047953, "grad_norm": 0.285194993019104, "learning_rate": 6.958597101074172e-06, "loss": 0.4662, "step": 8635 }, { "epoch": 3.9068084143858854, "grad_norm": 0.27872538566589355, "learning_rate": 6.957922887899018e-06, "loss": 0.5215, "step": 8636 }, { "epoch": 3.907260800723818, "grad_norm": 0.28237849473953247, "learning_rate": 6.957248632673736e-06, "loss": 0.5338, "step": 8637 }, { "epoch": 3.9077131870617507, "grad_norm": 0.32940346002578735, "learning_rate": 6.95657433541281e-06, "loss": 0.6141, "step": 8638 }, { "epoch": 3.908165573399683, "grad_norm": 0.3589392304420471, "learning_rate": 6.9558999961307185e-06, "loss": 0.5525, "step": 8639 }, { "epoch": 3.908617959737616, "grad_norm": 0.34083619713783264, "learning_rate": 6.955225614841945e-06, "loss": 0.6317, "step": 8640 }, { "epoch": 3.9090703460755485, "grad_norm": 0.3378809094429016, "learning_rate": 6.954551191560975e-06, "loss": 0.5907, "step": 8641 }, { "epoch": 3.909522732413481, "grad_norm": 0.3069833219051361, "learning_rate": 6.953876726302291e-06, "loss": 0.5302, "step": 8642 }, { "epoch": 3.909975118751414, "grad_norm": 0.372175395488739, "learning_rate": 6.953202219080379e-06, "loss": 0.6752, "step": 8643 }, { "epoch": 3.9104275050893462, "grad_norm": 0.36018818616867065, "learning_rate": 6.9525276699097255e-06, "loss": 0.6146, "step": 8644 }, { "epoch": 3.910879891427279, "grad_norm": 0.37127038836479187, "learning_rate": 6.9518530788048165e-06, "loss": 0.6452, "step": 8645 }, { "epoch": 3.9113322777652115, "grad_norm": 0.3619563579559326, "learning_rate": 6.951178445780142e-06, "loss": 0.5786, "step": 8646 }, { "epoch": 3.911784664103144, "grad_norm": 0.3470170497894287, "learning_rate": 6.9505037708501895e-06, "loss": 0.5339, "step": 8647 }, { "epoch": 3.912237050441077, "grad_norm": 0.3607289791107178, "learning_rate": 6.949829054029451e-06, "loss": 0.6147, "step": 8648 }, { "epoch": 3.9126894367790093, "grad_norm": 0.33020713925361633, "learning_rate": 6.949154295332416e-06, "loss": 0.4963, "step": 8649 }, { "epoch": 3.913141823116942, "grad_norm": 0.37593308091163635, "learning_rate": 6.948479494773575e-06, "loss": 0.5158, "step": 8650 }, { "epoch": 3.9135942094548746, "grad_norm": 0.3808322548866272, "learning_rate": 6.947804652367421e-06, "loss": 0.5842, "step": 8651 }, { "epoch": 3.914046595792807, "grad_norm": 0.41797834634780884, "learning_rate": 6.947129768128451e-06, "loss": 0.5333, "step": 8652 }, { "epoch": 3.9144989821307394, "grad_norm": 0.4437936842441559, "learning_rate": 6.946454842071156e-06, "loss": 0.6139, "step": 8653 }, { "epoch": 3.9149513684686723, "grad_norm": 0.44527074694633484, "learning_rate": 6.94577987421003e-06, "loss": 0.6125, "step": 8654 }, { "epoch": 3.9154037548066047, "grad_norm": 0.41073399782180786, "learning_rate": 6.945104864559573e-06, "loss": 0.548, "step": 8655 }, { "epoch": 3.9158561411445376, "grad_norm": 0.40458256006240845, "learning_rate": 6.944429813134281e-06, "loss": 0.5922, "step": 8656 }, { "epoch": 3.91630852748247, "grad_norm": 0.3593719005584717, "learning_rate": 6.943754719948652e-06, "loss": 0.5948, "step": 8657 }, { "epoch": 3.9167609138204025, "grad_norm": 0.38026130199432373, "learning_rate": 6.9430795850171825e-06, "loss": 0.4644, "step": 8658 }, { "epoch": 3.9172133001583354, "grad_norm": 0.4390026032924652, "learning_rate": 6.942404408354376e-06, "loss": 0.5309, "step": 8659 }, { "epoch": 3.917665686496268, "grad_norm": 0.3243671655654907, "learning_rate": 6.941729189974731e-06, "loss": 0.3854, "step": 8660 }, { "epoch": 3.9181180728342007, "grad_norm": 0.444742888212204, "learning_rate": 6.9410539298927486e-06, "loss": 0.601, "step": 8661 }, { "epoch": 3.918570459172133, "grad_norm": 0.43769198656082153, "learning_rate": 6.940378628122932e-06, "loss": 0.5677, "step": 8662 }, { "epoch": 3.9190228455100655, "grad_norm": 0.4081299901008606, "learning_rate": 6.939703284679787e-06, "loss": 0.4773, "step": 8663 }, { "epoch": 3.919475231847998, "grad_norm": 0.4812665581703186, "learning_rate": 6.9390278995778125e-06, "loss": 0.6502, "step": 8664 }, { "epoch": 3.919927618185931, "grad_norm": 0.4345477819442749, "learning_rate": 6.938352472831519e-06, "loss": 0.4689, "step": 8665 }, { "epoch": 3.9203800045238633, "grad_norm": 0.4506515562534332, "learning_rate": 6.9376770044554095e-06, "loss": 0.641, "step": 8666 }, { "epoch": 3.920832390861796, "grad_norm": 0.42521223425865173, "learning_rate": 6.937001494463994e-06, "loss": 0.4587, "step": 8667 }, { "epoch": 3.9212847771997286, "grad_norm": 0.43745097517967224, "learning_rate": 6.936325942871777e-06, "loss": 0.4747, "step": 8668 }, { "epoch": 3.921737163537661, "grad_norm": 0.4335764944553375, "learning_rate": 6.935650349693268e-06, "loss": 0.4757, "step": 8669 }, { "epoch": 3.922189549875594, "grad_norm": 0.44203802943229675, "learning_rate": 6.934974714942977e-06, "loss": 0.5079, "step": 8670 }, { "epoch": 3.9226419362135263, "grad_norm": 0.44495299458503723, "learning_rate": 6.934299038635414e-06, "loss": 0.5504, "step": 8671 }, { "epoch": 3.923094322551459, "grad_norm": 0.4502077102661133, "learning_rate": 6.9336233207850935e-06, "loss": 0.454, "step": 8672 }, { "epoch": 3.9235467088893916, "grad_norm": 0.44224122166633606, "learning_rate": 6.932947561406524e-06, "loss": 0.5129, "step": 8673 }, { "epoch": 3.923999095227324, "grad_norm": 0.41981762647628784, "learning_rate": 6.932271760514221e-06, "loss": 0.4087, "step": 8674 }, { "epoch": 3.9244514815652565, "grad_norm": 0.5063013434410095, "learning_rate": 6.9315959181226974e-06, "loss": 0.539, "step": 8675 }, { "epoch": 3.9249038679031893, "grad_norm": 0.5132131576538086, "learning_rate": 6.930920034246469e-06, "loss": 0.5261, "step": 8676 }, { "epoch": 3.925356254241122, "grad_norm": 0.4932524561882019, "learning_rate": 6.930244108900051e-06, "loss": 0.544, "step": 8677 }, { "epoch": 3.9258086405790547, "grad_norm": 0.5767809152603149, "learning_rate": 6.929568142097962e-06, "loss": 0.6124, "step": 8678 }, { "epoch": 3.926261026916987, "grad_norm": 0.628997266292572, "learning_rate": 6.928892133854716e-06, "loss": 0.6381, "step": 8679 }, { "epoch": 3.9267134132549195, "grad_norm": 0.5599125027656555, "learning_rate": 6.928216084184836e-06, "loss": 0.5629, "step": 8680 }, { "epoch": 3.9271657995928524, "grad_norm": 0.5857692360877991, "learning_rate": 6.9275399931028364e-06, "loss": 0.4727, "step": 8681 }, { "epoch": 3.927618185930785, "grad_norm": 0.439083456993103, "learning_rate": 6.926863860623242e-06, "loss": 0.9696, "step": 8682 }, { "epoch": 3.9280705722687177, "grad_norm": 0.20799873769283295, "learning_rate": 6.926187686760572e-06, "loss": 0.5657, "step": 8683 }, { "epoch": 3.92852295860665, "grad_norm": 0.2894785404205322, "learning_rate": 6.925511471529349e-06, "loss": 0.4438, "step": 8684 }, { "epoch": 3.9289753449445826, "grad_norm": 0.2819901704788208, "learning_rate": 6.924835214944096e-06, "loss": 0.521, "step": 8685 }, { "epoch": 3.929427731282515, "grad_norm": 0.2967236638069153, "learning_rate": 6.924158917019338e-06, "loss": 0.4968, "step": 8686 }, { "epoch": 3.929880117620448, "grad_norm": 0.32793569564819336, "learning_rate": 6.923482577769597e-06, "loss": 0.6456, "step": 8687 }, { "epoch": 3.9303325039583803, "grad_norm": 0.3580394983291626, "learning_rate": 6.9228061972094005e-06, "loss": 0.6648, "step": 8688 }, { "epoch": 3.930784890296313, "grad_norm": 0.32689204812049866, "learning_rate": 6.922129775353276e-06, "loss": 0.5872, "step": 8689 }, { "epoch": 3.9312372766342456, "grad_norm": 0.3004072308540344, "learning_rate": 6.92145331221575e-06, "loss": 0.516, "step": 8690 }, { "epoch": 3.931689662972178, "grad_norm": 0.3611636161804199, "learning_rate": 6.920776807811349e-06, "loss": 0.585, "step": 8691 }, { "epoch": 3.932142049310111, "grad_norm": 0.33640462160110474, "learning_rate": 6.920100262154605e-06, "loss": 0.6185, "step": 8692 }, { "epoch": 3.9325944356480433, "grad_norm": 0.3585173189640045, "learning_rate": 6.9194236752600465e-06, "loss": 0.5856, "step": 8693 }, { "epoch": 3.933046821985976, "grad_norm": 0.3988986909389496, "learning_rate": 6.918747047142205e-06, "loss": 0.6135, "step": 8694 }, { "epoch": 3.9334992083239086, "grad_norm": 0.39185449481010437, "learning_rate": 6.9180703778156134e-06, "loss": 0.5613, "step": 8695 }, { "epoch": 3.933951594661841, "grad_norm": 0.35660678148269653, "learning_rate": 6.917393667294801e-06, "loss": 0.5889, "step": 8696 }, { "epoch": 3.934403980999774, "grad_norm": 0.36133185029029846, "learning_rate": 6.916716915594306e-06, "loss": 0.5504, "step": 8697 }, { "epoch": 3.9348563673377064, "grad_norm": 0.3541906774044037, "learning_rate": 6.91604012272866e-06, "loss": 0.5447, "step": 8698 }, { "epoch": 3.9353087536756393, "grad_norm": 0.3476216197013855, "learning_rate": 6.9153632887124e-06, "loss": 0.5041, "step": 8699 }, { "epoch": 3.9357611400135717, "grad_norm": 0.400136798620224, "learning_rate": 6.91468641356006e-06, "loss": 0.5457, "step": 8700 }, { "epoch": 3.936213526351504, "grad_norm": 0.4052641987800598, "learning_rate": 6.9140094972861805e-06, "loss": 0.621, "step": 8701 }, { "epoch": 3.9366659126894366, "grad_norm": 0.36892130970954895, "learning_rate": 6.913332539905298e-06, "loss": 0.4668, "step": 8702 }, { "epoch": 3.9371182990273694, "grad_norm": 0.45320242643356323, "learning_rate": 6.91265554143195e-06, "loss": 0.6532, "step": 8703 }, { "epoch": 3.937570685365302, "grad_norm": 0.40880346298217773, "learning_rate": 6.911978501880676e-06, "loss": 0.5377, "step": 8704 }, { "epoch": 3.9380230717032347, "grad_norm": 0.4204370081424713, "learning_rate": 6.911301421266021e-06, "loss": 0.503, "step": 8705 }, { "epoch": 3.938475458041167, "grad_norm": 0.40697765350341797, "learning_rate": 6.910624299602522e-06, "loss": 0.5294, "step": 8706 }, { "epoch": 3.9389278443790996, "grad_norm": 0.39393407106399536, "learning_rate": 6.909947136904725e-06, "loss": 0.4709, "step": 8707 }, { "epoch": 3.9393802307170325, "grad_norm": 0.4119998812675476, "learning_rate": 6.909269933187171e-06, "loss": 0.5572, "step": 8708 }, { "epoch": 3.939832617054965, "grad_norm": 0.4863649010658264, "learning_rate": 6.9085926884644036e-06, "loss": 0.6564, "step": 8709 }, { "epoch": 3.940285003392898, "grad_norm": 0.451484352350235, "learning_rate": 6.907915402750971e-06, "loss": 0.5101, "step": 8710 }, { "epoch": 3.94073738973083, "grad_norm": 0.428641140460968, "learning_rate": 6.907238076061417e-06, "loss": 0.607, "step": 8711 }, { "epoch": 3.9411897760687626, "grad_norm": 0.4075620770454407, "learning_rate": 6.9065607084102895e-06, "loss": 0.498, "step": 8712 }, { "epoch": 3.941642162406695, "grad_norm": 0.42536208033561707, "learning_rate": 6.905883299812135e-06, "loss": 0.6184, "step": 8713 }, { "epoch": 3.942094548744628, "grad_norm": 0.40416714549064636, "learning_rate": 6.905205850281502e-06, "loss": 0.5242, "step": 8714 }, { "epoch": 3.9425469350825604, "grad_norm": 0.4184442162513733, "learning_rate": 6.904528359832942e-06, "loss": 0.4822, "step": 8715 }, { "epoch": 3.9429993214204933, "grad_norm": 0.5042157173156738, "learning_rate": 6.903850828481005e-06, "loss": 0.6306, "step": 8716 }, { "epoch": 3.9434517077584257, "grad_norm": 0.41068708896636963, "learning_rate": 6.903173256240241e-06, "loss": 0.4265, "step": 8717 }, { "epoch": 3.943904094096358, "grad_norm": 0.4783579707145691, "learning_rate": 6.9024956431252024e-06, "loss": 0.5747, "step": 8718 }, { "epoch": 3.944356480434291, "grad_norm": 0.42834436893463135, "learning_rate": 6.901817989150441e-06, "loss": 0.5349, "step": 8719 }, { "epoch": 3.9448088667722234, "grad_norm": 0.4031572937965393, "learning_rate": 6.901140294330515e-06, "loss": 0.4317, "step": 8720 }, { "epoch": 3.9452612531101563, "grad_norm": 0.4926771819591522, "learning_rate": 6.9004625586799745e-06, "loss": 0.5137, "step": 8721 }, { "epoch": 3.9457136394480887, "grad_norm": 0.48894545435905457, "learning_rate": 6.899784782213378e-06, "loss": 0.5595, "step": 8722 }, { "epoch": 3.946166025786021, "grad_norm": 0.5095982551574707, "learning_rate": 6.899106964945281e-06, "loss": 0.4937, "step": 8723 }, { "epoch": 3.9466184121239536, "grad_norm": 0.4710530638694763, "learning_rate": 6.898429106890241e-06, "loss": 0.462, "step": 8724 }, { "epoch": 3.9470707984618865, "grad_norm": 0.4875253438949585, "learning_rate": 6.897751208062817e-06, "loss": 0.4647, "step": 8725 }, { "epoch": 3.947523184799819, "grad_norm": 0.46331921219825745, "learning_rate": 6.897073268477567e-06, "loss": 0.4873, "step": 8726 }, { "epoch": 3.9479755711377518, "grad_norm": 0.5661497116088867, "learning_rate": 6.896395288149051e-06, "loss": 0.4715, "step": 8727 }, { "epoch": 3.948427957475684, "grad_norm": 0.562507152557373, "learning_rate": 6.895717267091832e-06, "loss": 0.591, "step": 8728 }, { "epoch": 3.9488803438136166, "grad_norm": 0.5324230790138245, "learning_rate": 6.895039205320469e-06, "loss": 0.4968, "step": 8729 }, { "epoch": 3.9493327301515495, "grad_norm": 0.5886062383651733, "learning_rate": 6.894361102849525e-06, "loss": 0.5377, "step": 8730 }, { "epoch": 3.949785116489482, "grad_norm": 0.6444167494773865, "learning_rate": 6.893682959693566e-06, "loss": 0.5508, "step": 8731 }, { "epoch": 3.950237502827415, "grad_norm": 0.4250379204750061, "learning_rate": 6.893004775867155e-06, "loss": 0.9094, "step": 8732 }, { "epoch": 3.9506898891653472, "grad_norm": 0.2006959766149521, "learning_rate": 6.8923265513848565e-06, "loss": 0.6347, "step": 8733 }, { "epoch": 3.9511422755032797, "grad_norm": 0.26974305510520935, "learning_rate": 6.891648286261238e-06, "loss": 0.6497, "step": 8734 }, { "epoch": 3.9515946618412126, "grad_norm": 0.2771059572696686, "learning_rate": 6.8909699805108664e-06, "loss": 0.5998, "step": 8735 }, { "epoch": 3.952047048179145, "grad_norm": 0.308523952960968, "learning_rate": 6.8902916341483085e-06, "loss": 0.6084, "step": 8736 }, { "epoch": 3.952499434517078, "grad_norm": 0.2975044846534729, "learning_rate": 6.889613247188134e-06, "loss": 0.5897, "step": 8737 }, { "epoch": 3.9529518208550103, "grad_norm": 0.3001428246498108, "learning_rate": 6.888934819644911e-06, "loss": 0.5236, "step": 8738 }, { "epoch": 3.9534042071929427, "grad_norm": 0.33121439814567566, "learning_rate": 6.888256351533213e-06, "loss": 0.6678, "step": 8739 }, { "epoch": 3.953856593530875, "grad_norm": 0.3991074562072754, "learning_rate": 6.887577842867608e-06, "loss": 0.6985, "step": 8740 }, { "epoch": 3.954308979868808, "grad_norm": 0.3706502318382263, "learning_rate": 6.88689929366267e-06, "loss": 0.6271, "step": 8741 }, { "epoch": 3.9547613662067405, "grad_norm": 0.3850741684436798, "learning_rate": 6.886220703932973e-06, "loss": 0.7119, "step": 8742 }, { "epoch": 3.9552137525446733, "grad_norm": 0.3451414108276367, "learning_rate": 6.88554207369309e-06, "loss": 0.5438, "step": 8743 }, { "epoch": 3.9556661388826058, "grad_norm": 0.32969772815704346, "learning_rate": 6.884863402957595e-06, "loss": 0.4656, "step": 8744 }, { "epoch": 3.956118525220538, "grad_norm": 0.356003999710083, "learning_rate": 6.884184691741067e-06, "loss": 0.5735, "step": 8745 }, { "epoch": 3.956570911558471, "grad_norm": 0.4072325527667999, "learning_rate": 6.8835059400580796e-06, "loss": 0.629, "step": 8746 }, { "epoch": 3.9570232978964035, "grad_norm": 0.3602656126022339, "learning_rate": 6.88282714792321e-06, "loss": 0.5337, "step": 8747 }, { "epoch": 3.9574756842343364, "grad_norm": 0.397825688123703, "learning_rate": 6.882148315351038e-06, "loss": 0.5838, "step": 8748 }, { "epoch": 3.957928070572269, "grad_norm": 0.36001867055892944, "learning_rate": 6.881469442356143e-06, "loss": 0.5579, "step": 8749 }, { "epoch": 3.9583804569102012, "grad_norm": 0.4068639874458313, "learning_rate": 6.880790528953103e-06, "loss": 0.483, "step": 8750 }, { "epoch": 3.9588328432481337, "grad_norm": 0.4095443785190582, "learning_rate": 6.880111575156502e-06, "loss": 0.6085, "step": 8751 }, { "epoch": 3.9592852295860665, "grad_norm": 0.406906396150589, "learning_rate": 6.879432580980919e-06, "loss": 0.5733, "step": 8752 }, { "epoch": 3.959737615923999, "grad_norm": 0.3479066789150238, "learning_rate": 6.878753546440938e-06, "loss": 0.4211, "step": 8753 }, { "epoch": 3.960190002261932, "grad_norm": 0.4027039408683777, "learning_rate": 6.878074471551144e-06, "loss": 0.4361, "step": 8754 }, { "epoch": 3.9606423885998643, "grad_norm": 0.38023361563682556, "learning_rate": 6.877395356326119e-06, "loss": 0.4978, "step": 8755 }, { "epoch": 3.9610947749377967, "grad_norm": 0.4105130434036255, "learning_rate": 6.876716200780449e-06, "loss": 0.5772, "step": 8756 }, { "epoch": 3.9615471612757296, "grad_norm": 0.3728322982788086, "learning_rate": 6.8760370049287215e-06, "loss": 0.4382, "step": 8757 }, { "epoch": 3.961999547613662, "grad_norm": 0.374832421541214, "learning_rate": 6.875357768785521e-06, "loss": 0.5337, "step": 8758 }, { "epoch": 3.962451933951595, "grad_norm": 0.41324418783187866, "learning_rate": 6.874678492365438e-06, "loss": 0.6119, "step": 8759 }, { "epoch": 3.9629043202895273, "grad_norm": 0.36714333295822144, "learning_rate": 6.873999175683059e-06, "loss": 0.4928, "step": 8760 }, { "epoch": 3.9633567066274598, "grad_norm": 0.45780041813850403, "learning_rate": 6.873319818752975e-06, "loss": 0.5421, "step": 8761 }, { "epoch": 3.963809092965392, "grad_norm": 0.3994154632091522, "learning_rate": 6.872640421589777e-06, "loss": 0.4944, "step": 8762 }, { "epoch": 3.964261479303325, "grad_norm": 0.4455246329307556, "learning_rate": 6.8719609842080545e-06, "loss": 0.5822, "step": 8763 }, { "epoch": 3.9647138656412575, "grad_norm": 0.4914417266845703, "learning_rate": 6.871281506622402e-06, "loss": 0.6145, "step": 8764 }, { "epoch": 3.9651662519791904, "grad_norm": 0.38838115334510803, "learning_rate": 6.87060198884741e-06, "loss": 0.4591, "step": 8765 }, { "epoch": 3.965618638317123, "grad_norm": 0.46410393714904785, "learning_rate": 6.869922430897674e-06, "loss": 0.6226, "step": 8766 }, { "epoch": 3.9660710246550552, "grad_norm": 0.4660816788673401, "learning_rate": 6.8692428327877895e-06, "loss": 0.5461, "step": 8767 }, { "epoch": 3.966523410992988, "grad_norm": 0.49017688632011414, "learning_rate": 6.868563194532351e-06, "loss": 0.5122, "step": 8768 }, { "epoch": 3.9669757973309205, "grad_norm": 0.507847785949707, "learning_rate": 6.8678835161459555e-06, "loss": 0.5593, "step": 8769 }, { "epoch": 3.9674281836688534, "grad_norm": 0.455375999212265, "learning_rate": 6.867203797643201e-06, "loss": 0.4603, "step": 8770 }, { "epoch": 3.967880570006786, "grad_norm": 0.45524269342422485, "learning_rate": 6.866524039038682e-06, "loss": 0.4918, "step": 8771 }, { "epoch": 3.9683329563447183, "grad_norm": 0.4365828037261963, "learning_rate": 6.865844240347003e-06, "loss": 0.5019, "step": 8772 }, { "epoch": 3.9687853426826507, "grad_norm": 0.48991528153419495, "learning_rate": 6.86516440158276e-06, "loss": 0.546, "step": 8773 }, { "epoch": 3.9692377290205836, "grad_norm": 0.4434996247291565, "learning_rate": 6.8644845227605564e-06, "loss": 0.4217, "step": 8774 }, { "epoch": 3.969690115358516, "grad_norm": 0.5034658908843994, "learning_rate": 6.863804603894991e-06, "loss": 0.5512, "step": 8775 }, { "epoch": 3.970142501696449, "grad_norm": 0.4853594899177551, "learning_rate": 6.863124645000669e-06, "loss": 0.4545, "step": 8776 }, { "epoch": 3.9705948880343813, "grad_norm": 0.508147656917572, "learning_rate": 6.8624446460921946e-06, "loss": 0.5402, "step": 8777 }, { "epoch": 3.9710472743723138, "grad_norm": 0.56357741355896, "learning_rate": 6.86176460718417e-06, "loss": 0.6269, "step": 8778 }, { "epoch": 3.9714996607102466, "grad_norm": 0.5201089382171631, "learning_rate": 6.8610845282912e-06, "loss": 0.5344, "step": 8779 }, { "epoch": 3.971952047048179, "grad_norm": 0.563106894493103, "learning_rate": 6.860404409427892e-06, "loss": 0.489, "step": 8780 }, { "epoch": 3.972404433386112, "grad_norm": 0.604664146900177, "learning_rate": 6.859724250608851e-06, "loss": 0.5355, "step": 8781 }, { "epoch": 3.9728568197240444, "grad_norm": 0.4759438931941986, "learning_rate": 6.859044051848687e-06, "loss": 1.0619, "step": 8782 }, { "epoch": 3.973309206061977, "grad_norm": 0.2520522177219391, "learning_rate": 6.858363813162008e-06, "loss": 0.9433, "step": 8783 }, { "epoch": 3.9737615923999097, "grad_norm": 0.2832954227924347, "learning_rate": 6.857683534563422e-06, "loss": 0.6565, "step": 8784 }, { "epoch": 3.974213978737842, "grad_norm": 0.23423799872398376, "learning_rate": 6.85700321606754e-06, "loss": 0.4688, "step": 8785 }, { "epoch": 3.974666365075775, "grad_norm": 0.2789730429649353, "learning_rate": 6.856322857688974e-06, "loss": 0.6217, "step": 8786 }, { "epoch": 3.9751187514137074, "grad_norm": 0.32263851165771484, "learning_rate": 6.855642459442335e-06, "loss": 0.6281, "step": 8787 }, { "epoch": 3.97557113775164, "grad_norm": 0.3149986267089844, "learning_rate": 6.854962021342237e-06, "loss": 0.5997, "step": 8788 }, { "epoch": 3.9760235240895723, "grad_norm": 0.33049076795578003, "learning_rate": 6.854281543403291e-06, "loss": 0.5397, "step": 8789 }, { "epoch": 3.976475910427505, "grad_norm": 0.32681992650032043, "learning_rate": 6.853601025640115e-06, "loss": 0.5497, "step": 8790 }, { "epoch": 3.9769282967654376, "grad_norm": 0.33137965202331543, "learning_rate": 6.852920468067323e-06, "loss": 0.6623, "step": 8791 }, { "epoch": 3.9773806831033705, "grad_norm": 0.42920398712158203, "learning_rate": 6.852239870699531e-06, "loss": 0.6759, "step": 8792 }, { "epoch": 3.977833069441303, "grad_norm": 0.35255029797554016, "learning_rate": 6.851559233551356e-06, "loss": 0.6056, "step": 8793 }, { "epoch": 3.9782854557792353, "grad_norm": 0.3439638912677765, "learning_rate": 6.850878556637416e-06, "loss": 0.4667, "step": 8794 }, { "epoch": 3.978737842117168, "grad_norm": 0.3284856081008911, "learning_rate": 6.85019783997233e-06, "loss": 0.4763, "step": 8795 }, { "epoch": 3.9791902284551006, "grad_norm": 0.3506464660167694, "learning_rate": 6.849517083570718e-06, "loss": 0.5918, "step": 8796 }, { "epoch": 3.9796426147930335, "grad_norm": 0.3700243830680847, "learning_rate": 6.8488362874472e-06, "loss": 0.4576, "step": 8797 }, { "epoch": 3.980095001130966, "grad_norm": 0.34790533781051636, "learning_rate": 6.848155451616399e-06, "loss": 0.5478, "step": 8798 }, { "epoch": 3.9805473874688984, "grad_norm": 0.4316231608390808, "learning_rate": 6.847474576092934e-06, "loss": 0.5745, "step": 8799 }, { "epoch": 3.980999773806831, "grad_norm": 0.3766200542449951, "learning_rate": 6.846793660891431e-06, "loss": 0.5443, "step": 8800 }, { "epoch": 3.980999773806831, "eval_loss": 0.5891403555870056, "eval_runtime": 25.5487, "eval_samples_per_second": 29.121, "eval_steps_per_second": 7.28, "step": 8800 }, { "epoch": 3.9814521601447637, "grad_norm": 0.4342774450778961, "learning_rate": 6.8461127060265135e-06, "loss": 0.5954, "step": 8801 }, { "epoch": 3.981904546482696, "grad_norm": 0.3660278618335724, "learning_rate": 6.845431711512806e-06, "loss": 0.6218, "step": 8802 }, { "epoch": 3.982356932820629, "grad_norm": 0.379599392414093, "learning_rate": 6.844750677364934e-06, "loss": 0.5328, "step": 8803 }, { "epoch": 3.9828093191585614, "grad_norm": 0.3804551064968109, "learning_rate": 6.844069603597523e-06, "loss": 0.5044, "step": 8804 }, { "epoch": 3.983261705496494, "grad_norm": 0.4391680359840393, "learning_rate": 6.843388490225202e-06, "loss": 0.4806, "step": 8805 }, { "epoch": 3.9837140918344267, "grad_norm": 0.36156585812568665, "learning_rate": 6.842707337262598e-06, "loss": 0.5714, "step": 8806 }, { "epoch": 3.984166478172359, "grad_norm": 0.35630369186401367, "learning_rate": 6.842026144724341e-06, "loss": 0.5433, "step": 8807 }, { "epoch": 3.984618864510292, "grad_norm": 0.38786453008651733, "learning_rate": 6.841344912625061e-06, "loss": 0.4936, "step": 8808 }, { "epoch": 3.9850712508482244, "grad_norm": 0.3945057988166809, "learning_rate": 6.840663640979387e-06, "loss": 0.491, "step": 8809 }, { "epoch": 3.985523637186157, "grad_norm": 0.41903090476989746, "learning_rate": 6.839982329801953e-06, "loss": 0.5723, "step": 8810 }, { "epoch": 3.9859760235240893, "grad_norm": 0.37555304169654846, "learning_rate": 6.8393009791073895e-06, "loss": 0.4808, "step": 8811 }, { "epoch": 3.986428409862022, "grad_norm": 0.4661801755428314, "learning_rate": 6.838619588910331e-06, "loss": 0.6952, "step": 8812 }, { "epoch": 3.9868807961999546, "grad_norm": 0.3725803792476654, "learning_rate": 6.837938159225411e-06, "loss": 0.4558, "step": 8813 }, { "epoch": 3.9873331825378875, "grad_norm": 0.42090684175491333, "learning_rate": 6.837256690067266e-06, "loss": 0.5018, "step": 8814 }, { "epoch": 3.98778556887582, "grad_norm": 0.4676920175552368, "learning_rate": 6.83657518145053e-06, "loss": 0.5898, "step": 8815 }, { "epoch": 3.9882379552137524, "grad_norm": 0.45496755838394165, "learning_rate": 6.8358936333898394e-06, "loss": 0.6093, "step": 8816 }, { "epoch": 3.9886903415516852, "grad_norm": 0.4502725601196289, "learning_rate": 6.835212045899834e-06, "loss": 0.5203, "step": 8817 }, { "epoch": 3.9891427278896177, "grad_norm": 0.5499309301376343, "learning_rate": 6.83453041899515e-06, "loss": 0.7082, "step": 8818 }, { "epoch": 3.9895951142275505, "grad_norm": 0.48883482813835144, "learning_rate": 6.833848752690427e-06, "loss": 0.6146, "step": 8819 }, { "epoch": 3.990047500565483, "grad_norm": 0.4171922206878662, "learning_rate": 6.833167047000306e-06, "loss": 0.4853, "step": 8820 }, { "epoch": 3.9904998869034154, "grad_norm": 0.45083290338516235, "learning_rate": 6.832485301939429e-06, "loss": 0.544, "step": 8821 }, { "epoch": 3.9909522732413483, "grad_norm": 0.5242631435394287, "learning_rate": 6.831803517522435e-06, "loss": 0.5359, "step": 8822 }, { "epoch": 3.9914046595792807, "grad_norm": 0.46893641352653503, "learning_rate": 6.831121693763968e-06, "loss": 0.4676, "step": 8823 }, { "epoch": 3.9918570459172136, "grad_norm": 0.5555372834205627, "learning_rate": 6.830439830678671e-06, "loss": 0.6331, "step": 8824 }, { "epoch": 3.992309432255146, "grad_norm": 0.5475244522094727, "learning_rate": 6.829757928281191e-06, "loss": 0.5795, "step": 8825 }, { "epoch": 3.9927618185930784, "grad_norm": 0.5146301984786987, "learning_rate": 6.829075986586169e-06, "loss": 0.481, "step": 8826 }, { "epoch": 3.993214204931011, "grad_norm": 0.489535927772522, "learning_rate": 6.828394005608252e-06, "loss": 0.4721, "step": 8827 }, { "epoch": 3.9936665912689437, "grad_norm": 0.5908418893814087, "learning_rate": 6.827711985362088e-06, "loss": 0.495, "step": 8828 }, { "epoch": 3.994118977606876, "grad_norm": 0.5216425657272339, "learning_rate": 6.827029925862325e-06, "loss": 0.5521, "step": 8829 }, { "epoch": 3.994571363944809, "grad_norm": 0.6131598949432373, "learning_rate": 6.826347827123612e-06, "loss": 0.5598, "step": 8830 }, { "epoch": 3.9950237502827415, "grad_norm": 0.6428058743476868, "learning_rate": 6.825665689160596e-06, "loss": 0.5484, "step": 8831 }, { "epoch": 3.995476136620674, "grad_norm": 0.45117658376693726, "learning_rate": 6.824983511987928e-06, "loss": 0.4641, "step": 8832 }, { "epoch": 3.995928522958607, "grad_norm": 0.26869094371795654, "learning_rate": 6.82430129562026e-06, "loss": 0.4787, "step": 8833 }, { "epoch": 3.996380909296539, "grad_norm": 0.340031236410141, "learning_rate": 6.823619040072243e-06, "loss": 0.4894, "step": 8834 }, { "epoch": 3.996833295634472, "grad_norm": 0.3917492628097534, "learning_rate": 6.822936745358531e-06, "loss": 0.6306, "step": 8835 }, { "epoch": 3.9972856819724045, "grad_norm": 0.3909585177898407, "learning_rate": 6.822254411493777e-06, "loss": 0.5345, "step": 8836 }, { "epoch": 3.997738068310337, "grad_norm": 0.47218990325927734, "learning_rate": 6.8215720384926365e-06, "loss": 0.609, "step": 8837 }, { "epoch": 3.9981904546482694, "grad_norm": 0.41827353835105896, "learning_rate": 6.820889626369761e-06, "loss": 0.4737, "step": 8838 }, { "epoch": 3.9986428409862023, "grad_norm": 0.48106077313423157, "learning_rate": 6.82020717513981e-06, "loss": 0.5961, "step": 8839 }, { "epoch": 3.9990952273241347, "grad_norm": 0.47111356258392334, "learning_rate": 6.819524684817439e-06, "loss": 0.5641, "step": 8840 }, { "epoch": 3.9995476136620676, "grad_norm": 0.5255725383758545, "learning_rate": 6.818842155417306e-06, "loss": 0.5524, "step": 8841 }, { "epoch": 4.0, "grad_norm": 1.6010491847991943, "learning_rate": 6.818159586954071e-06, "loss": 0.7261, "step": 8842 }, { "epoch": 4.000452386337932, "grad_norm": 0.22755447030067444, "learning_rate": 6.817476979442391e-06, "loss": 1.0515, "step": 8843 }, { "epoch": 4.000904772675865, "grad_norm": 0.26336631178855896, "learning_rate": 6.816794332896927e-06, "loss": 0.578, "step": 8844 }, { "epoch": 4.001357159013798, "grad_norm": 0.2865493893623352, "learning_rate": 6.816111647332344e-06, "loss": 0.5709, "step": 8845 }, { "epoch": 4.001809545351731, "grad_norm": 0.30078527331352234, "learning_rate": 6.815428922763298e-06, "loss": 0.5143, "step": 8846 }, { "epoch": 4.002261931689663, "grad_norm": 0.3401489555835724, "learning_rate": 6.814746159204455e-06, "loss": 0.6538, "step": 8847 }, { "epoch": 4.0027143180275955, "grad_norm": 0.3105497658252716, "learning_rate": 6.814063356670481e-06, "loss": 0.7076, "step": 8848 }, { "epoch": 4.003166704365528, "grad_norm": 0.30288609862327576, "learning_rate": 6.813380515176034e-06, "loss": 0.5509, "step": 8849 }, { "epoch": 4.00361909070346, "grad_norm": 0.32960933446884155, "learning_rate": 6.812697634735785e-06, "loss": 0.5862, "step": 8850 }, { "epoch": 4.004071477041394, "grad_norm": 0.3304262161254883, "learning_rate": 6.812014715364396e-06, "loss": 0.5487, "step": 8851 }, { "epoch": 4.004523863379326, "grad_norm": 0.3396478295326233, "learning_rate": 6.811331757076539e-06, "loss": 0.6848, "step": 8852 }, { "epoch": 4.0049762497172585, "grad_norm": 0.3552911877632141, "learning_rate": 6.810648759886877e-06, "loss": 0.538, "step": 8853 }, { "epoch": 4.005428636055191, "grad_norm": 0.34826651215553284, "learning_rate": 6.8099657238100814e-06, "loss": 0.5019, "step": 8854 }, { "epoch": 4.005881022393123, "grad_norm": 0.39688074588775635, "learning_rate": 6.809282648860822e-06, "loss": 0.579, "step": 8855 }, { "epoch": 4.006333408731057, "grad_norm": 0.3283554017543793, "learning_rate": 6.808599535053767e-06, "loss": 0.5037, "step": 8856 }, { "epoch": 4.006785795068989, "grad_norm": 0.3648102581501007, "learning_rate": 6.807916382403589e-06, "loss": 0.5365, "step": 8857 }, { "epoch": 4.007238181406922, "grad_norm": 0.381466805934906, "learning_rate": 6.807233190924959e-06, "loss": 0.594, "step": 8858 }, { "epoch": 4.007690567744854, "grad_norm": 0.3500804603099823, "learning_rate": 6.8065499606325514e-06, "loss": 0.4395, "step": 8859 }, { "epoch": 4.008142954082786, "grad_norm": 0.3584540784358978, "learning_rate": 6.805866691541039e-06, "loss": 0.4743, "step": 8860 }, { "epoch": 4.008595340420719, "grad_norm": 0.37672901153564453, "learning_rate": 6.805183383665096e-06, "loss": 0.5324, "step": 8861 }, { "epoch": 4.009047726758652, "grad_norm": 0.3884756565093994, "learning_rate": 6.804500037019398e-06, "loss": 0.68, "step": 8862 }, { "epoch": 4.009500113096585, "grad_norm": 0.4045882225036621, "learning_rate": 6.803816651618621e-06, "loss": 0.594, "step": 8863 }, { "epoch": 4.009952499434517, "grad_norm": 0.37356066703796387, "learning_rate": 6.803133227477442e-06, "loss": 0.4881, "step": 8864 }, { "epoch": 4.0104048857724495, "grad_norm": 0.4114995300769806, "learning_rate": 6.802449764610539e-06, "loss": 0.6244, "step": 8865 }, { "epoch": 4.010857272110382, "grad_norm": 0.39945781230926514, "learning_rate": 6.801766263032592e-06, "loss": 0.5382, "step": 8866 }, { "epoch": 4.011309658448315, "grad_norm": 0.4378843605518341, "learning_rate": 6.801082722758279e-06, "loss": 0.593, "step": 8867 }, { "epoch": 4.011762044786248, "grad_norm": 0.40910425782203674, "learning_rate": 6.800399143802278e-06, "loss": 0.5026, "step": 8868 }, { "epoch": 4.01221443112418, "grad_norm": 0.3799869120121002, "learning_rate": 6.7997155261792755e-06, "loss": 0.551, "step": 8869 }, { "epoch": 4.0126668174621125, "grad_norm": 0.43716713786125183, "learning_rate": 6.79903186990395e-06, "loss": 0.5967, "step": 8870 }, { "epoch": 4.013119203800045, "grad_norm": 0.36612653732299805, "learning_rate": 6.798348174990986e-06, "loss": 0.4543, "step": 8871 }, { "epoch": 4.013571590137978, "grad_norm": 0.38993915915489197, "learning_rate": 6.7976644414550644e-06, "loss": 0.441, "step": 8872 }, { "epoch": 4.014023976475911, "grad_norm": 0.4608413875102997, "learning_rate": 6.796980669310872e-06, "loss": 0.6192, "step": 8873 }, { "epoch": 4.014476362813843, "grad_norm": 0.4553779065608978, "learning_rate": 6.796296858573094e-06, "loss": 0.5711, "step": 8874 }, { "epoch": 4.0149287491517756, "grad_norm": 0.41853800415992737, "learning_rate": 6.795613009256416e-06, "loss": 0.488, "step": 8875 }, { "epoch": 4.015381135489708, "grad_norm": 0.4568899869918823, "learning_rate": 6.794929121375525e-06, "loss": 0.5273, "step": 8876 }, { "epoch": 4.01583352182764, "grad_norm": 0.46571552753448486, "learning_rate": 6.794245194945108e-06, "loss": 0.5809, "step": 8877 }, { "epoch": 4.016285908165574, "grad_norm": 0.48209384083747864, "learning_rate": 6.793561229979856e-06, "loss": 0.55, "step": 8878 }, { "epoch": 4.016738294503506, "grad_norm": 0.42410507798194885, "learning_rate": 6.792877226494457e-06, "loss": 0.5061, "step": 8879 }, { "epoch": 4.017190680841439, "grad_norm": 0.42249107360839844, "learning_rate": 6.7921931845036015e-06, "loss": 0.4366, "step": 8880 }, { "epoch": 4.017643067179371, "grad_norm": 0.45268791913986206, "learning_rate": 6.79150910402198e-06, "loss": 0.5104, "step": 8881 }, { "epoch": 4.0180954535173035, "grad_norm": 0.4709303379058838, "learning_rate": 6.790824985064284e-06, "loss": 0.457, "step": 8882 }, { "epoch": 4.018547839855237, "grad_norm": 0.45826253294944763, "learning_rate": 6.790140827645207e-06, "loss": 0.5282, "step": 8883 }, { "epoch": 4.019000226193169, "grad_norm": 0.45866647362709045, "learning_rate": 6.789456631779443e-06, "loss": 0.4642, "step": 8884 }, { "epoch": 4.019452612531102, "grad_norm": 0.5279093384742737, "learning_rate": 6.7887723974816854e-06, "loss": 0.5581, "step": 8885 }, { "epoch": 4.019904998869034, "grad_norm": 0.42083844542503357, "learning_rate": 6.788088124766631e-06, "loss": 0.3839, "step": 8886 }, { "epoch": 4.0203573852069665, "grad_norm": 0.5742693543434143, "learning_rate": 6.7874038136489735e-06, "loss": 0.5982, "step": 8887 }, { "epoch": 4.020809771544899, "grad_norm": 0.5128586292266846, "learning_rate": 6.786719464143412e-06, "loss": 0.5143, "step": 8888 }, { "epoch": 4.021262157882832, "grad_norm": 0.5480242371559143, "learning_rate": 6.786035076264644e-06, "loss": 0.5465, "step": 8889 }, { "epoch": 4.021714544220765, "grad_norm": 0.6024248600006104, "learning_rate": 6.7853506500273666e-06, "loss": 0.5771, "step": 8890 }, { "epoch": 4.022166930558697, "grad_norm": 0.5947443246841431, "learning_rate": 6.784666185446282e-06, "loss": 0.5285, "step": 8891 }, { "epoch": 4.0226193168966295, "grad_norm": 0.6053540110588074, "learning_rate": 6.783981682536087e-06, "loss": 0.462, "step": 8892 }, { "epoch": 4.023071703234562, "grad_norm": 0.157732293009758, "learning_rate": 6.783297141311484e-06, "loss": 1.2268, "step": 8893 }, { "epoch": 4.023524089572495, "grad_norm": 0.19738173484802246, "learning_rate": 6.782612561787175e-06, "loss": 0.7011, "step": 8894 }, { "epoch": 4.023976475910428, "grad_norm": 0.2754000723361969, "learning_rate": 6.781927943977862e-06, "loss": 0.6523, "step": 8895 }, { "epoch": 4.02442886224836, "grad_norm": 0.3252013921737671, "learning_rate": 6.781243287898249e-06, "loss": 0.5398, "step": 8896 }, { "epoch": 4.024881248586293, "grad_norm": 0.3271414041519165, "learning_rate": 6.7805585935630406e-06, "loss": 0.5308, "step": 8897 }, { "epoch": 4.025333634924225, "grad_norm": 0.348251610994339, "learning_rate": 6.779873860986942e-06, "loss": 0.6956, "step": 8898 }, { "epoch": 4.0257860212621575, "grad_norm": 0.34111547470092773, "learning_rate": 6.779189090184658e-06, "loss": 0.5614, "step": 8899 }, { "epoch": 4.026238407600091, "grad_norm": 0.34790706634521484, "learning_rate": 6.778504281170896e-06, "loss": 0.6928, "step": 8900 }, { "epoch": 4.026690793938023, "grad_norm": 0.3854224681854248, "learning_rate": 6.7778194339603645e-06, "loss": 0.7229, "step": 8901 }, { "epoch": 4.027143180275956, "grad_norm": 0.35751256346702576, "learning_rate": 6.777134548567771e-06, "loss": 0.6601, "step": 8902 }, { "epoch": 4.027595566613888, "grad_norm": 0.3797284662723541, "learning_rate": 6.776449625007824e-06, "loss": 0.5647, "step": 8903 }, { "epoch": 4.0280479529518205, "grad_norm": 0.3775121569633484, "learning_rate": 6.775764663295235e-06, "loss": 0.5835, "step": 8904 }, { "epoch": 4.028500339289754, "grad_norm": 0.3646544814109802, "learning_rate": 6.775079663444713e-06, "loss": 0.5641, "step": 8905 }, { "epoch": 4.028952725627686, "grad_norm": 0.37205949425697327, "learning_rate": 6.774394625470972e-06, "loss": 0.5086, "step": 8906 }, { "epoch": 4.029405111965619, "grad_norm": 0.4035301208496094, "learning_rate": 6.773709549388722e-06, "loss": 0.4819, "step": 8907 }, { "epoch": 4.029857498303551, "grad_norm": 0.37840715050697327, "learning_rate": 6.773024435212678e-06, "loss": 0.5701, "step": 8908 }, { "epoch": 4.0303098846414835, "grad_norm": 0.4360729157924652, "learning_rate": 6.772339282957554e-06, "loss": 0.5485, "step": 8909 }, { "epoch": 4.030762270979417, "grad_norm": 0.4333612024784088, "learning_rate": 6.7716540926380655e-06, "loss": 0.5561, "step": 8910 }, { "epoch": 4.031214657317349, "grad_norm": 0.4094587564468384, "learning_rate": 6.770968864268928e-06, "loss": 0.5514, "step": 8911 }, { "epoch": 4.031667043655282, "grad_norm": 0.38754644989967346, "learning_rate": 6.770283597864855e-06, "loss": 0.5326, "step": 8912 }, { "epoch": 4.032119429993214, "grad_norm": 0.43203526735305786, "learning_rate": 6.769598293440569e-06, "loss": 0.563, "step": 8913 }, { "epoch": 4.032571816331147, "grad_norm": 0.38789939880371094, "learning_rate": 6.768912951010785e-06, "loss": 0.5033, "step": 8914 }, { "epoch": 4.033024202669079, "grad_norm": 0.4363321363925934, "learning_rate": 6.768227570590222e-06, "loss": 0.5356, "step": 8915 }, { "epoch": 4.033476589007012, "grad_norm": 0.36771973967552185, "learning_rate": 6.7675421521936025e-06, "loss": 0.4893, "step": 8916 }, { "epoch": 4.033928975344945, "grad_norm": 0.4036610424518585, "learning_rate": 6.7668566958356426e-06, "loss": 0.4613, "step": 8917 }, { "epoch": 4.034381361682877, "grad_norm": 0.33289748430252075, "learning_rate": 6.766171201531069e-06, "loss": 0.351, "step": 8918 }, { "epoch": 4.03483374802081, "grad_norm": 0.44165125489234924, "learning_rate": 6.765485669294599e-06, "loss": 0.6585, "step": 8919 }, { "epoch": 4.035286134358742, "grad_norm": 0.3839176595211029, "learning_rate": 6.76480009914096e-06, "loss": 0.4529, "step": 8920 }, { "epoch": 4.035738520696675, "grad_norm": 0.4767869710922241, "learning_rate": 6.764114491084873e-06, "loss": 0.5833, "step": 8921 }, { "epoch": 4.036190907034608, "grad_norm": 0.3605995178222656, "learning_rate": 6.763428845141065e-06, "loss": 0.3843, "step": 8922 }, { "epoch": 4.03664329337254, "grad_norm": 0.43688341975212097, "learning_rate": 6.762743161324259e-06, "loss": 0.5577, "step": 8923 }, { "epoch": 4.037095679710473, "grad_norm": 0.44227665662765503, "learning_rate": 6.762057439649183e-06, "loss": 0.5292, "step": 8924 }, { "epoch": 4.037548066048405, "grad_norm": 0.4900573194026947, "learning_rate": 6.761371680130565e-06, "loss": 0.578, "step": 8925 }, { "epoch": 4.0380004523863375, "grad_norm": 0.46940702199935913, "learning_rate": 6.760685882783131e-06, "loss": 0.5416, "step": 8926 }, { "epoch": 4.038452838724271, "grad_norm": 0.41222408413887024, "learning_rate": 6.760000047621611e-06, "loss": 0.4486, "step": 8927 }, { "epoch": 4.038905225062203, "grad_norm": 0.43176600337028503, "learning_rate": 6.759314174660735e-06, "loss": 0.4692, "step": 8928 }, { "epoch": 4.039357611400136, "grad_norm": 0.5101643800735474, "learning_rate": 6.758628263915232e-06, "loss": 0.5554, "step": 8929 }, { "epoch": 4.039809997738068, "grad_norm": 0.4648932218551636, "learning_rate": 6.757942315399834e-06, "loss": 0.415, "step": 8930 }, { "epoch": 4.040262384076001, "grad_norm": 0.433493435382843, "learning_rate": 6.7572563291292724e-06, "loss": 0.5014, "step": 8931 }, { "epoch": 4.040714770413934, "grad_norm": 0.5208778381347656, "learning_rate": 6.756570305118282e-06, "loss": 0.5213, "step": 8932 }, { "epoch": 4.041167156751866, "grad_norm": 0.4524594843387604, "learning_rate": 6.755884243381594e-06, "loss": 0.4521, "step": 8933 }, { "epoch": 4.041619543089799, "grad_norm": 0.49624940752983093, "learning_rate": 6.755198143933945e-06, "loss": 0.5103, "step": 8934 }, { "epoch": 4.042071929427731, "grad_norm": 0.5365965962409973, "learning_rate": 6.754512006790068e-06, "loss": 0.4815, "step": 8935 }, { "epoch": 4.042524315765664, "grad_norm": 0.5198737382888794, "learning_rate": 6.7538258319647034e-06, "loss": 0.5973, "step": 8936 }, { "epoch": 4.042976702103596, "grad_norm": 0.4876384139060974, "learning_rate": 6.753139619472584e-06, "loss": 0.4605, "step": 8937 }, { "epoch": 4.043429088441529, "grad_norm": 0.65754234790802, "learning_rate": 6.752453369328448e-06, "loss": 0.5435, "step": 8938 }, { "epoch": 4.043881474779462, "grad_norm": 0.520637035369873, "learning_rate": 6.751767081547034e-06, "loss": 0.4856, "step": 8939 }, { "epoch": 4.044333861117394, "grad_norm": 0.5070751905441284, "learning_rate": 6.751080756143082e-06, "loss": 0.4523, "step": 8940 }, { "epoch": 4.044786247455327, "grad_norm": 0.506813108921051, "learning_rate": 6.750394393131333e-06, "loss": 0.4056, "step": 8941 }, { "epoch": 4.045238633793259, "grad_norm": 0.5744426250457764, "learning_rate": 6.749707992526528e-06, "loss": 0.4404, "step": 8942 }, { "epoch": 4.045691020131192, "grad_norm": 0.14721420407295227, "learning_rate": 6.749021554343406e-06, "loss": 1.1443, "step": 8943 }, { "epoch": 4.046143406469125, "grad_norm": 0.2925342917442322, "learning_rate": 6.7483350785967125e-06, "loss": 0.904, "step": 8944 }, { "epoch": 4.046595792807057, "grad_norm": 0.25470465421676636, "learning_rate": 6.747648565301189e-06, "loss": 0.4642, "step": 8945 }, { "epoch": 4.04704817914499, "grad_norm": 0.31298160552978516, "learning_rate": 6.746962014471581e-06, "loss": 0.6878, "step": 8946 }, { "epoch": 4.047500565482922, "grad_norm": 0.33891522884368896, "learning_rate": 6.746275426122634e-06, "loss": 0.4761, "step": 8947 }, { "epoch": 4.047952951820855, "grad_norm": 0.35501575469970703, "learning_rate": 6.745588800269092e-06, "loss": 0.6996, "step": 8948 }, { "epoch": 4.048405338158788, "grad_norm": 0.32178494334220886, "learning_rate": 6.744902136925702e-06, "loss": 0.582, "step": 8949 }, { "epoch": 4.04885772449672, "grad_norm": 0.3293164074420929, "learning_rate": 6.744215436107211e-06, "loss": 0.646, "step": 8950 }, { "epoch": 4.049310110834653, "grad_norm": 0.3122200071811676, "learning_rate": 6.743528697828369e-06, "loss": 0.5126, "step": 8951 }, { "epoch": 4.049762497172585, "grad_norm": 0.3037220537662506, "learning_rate": 6.742841922103923e-06, "loss": 0.5404, "step": 8952 }, { "epoch": 4.050214883510518, "grad_norm": 0.36735036969184875, "learning_rate": 6.742155108948625e-06, "loss": 0.6197, "step": 8953 }, { "epoch": 4.050667269848451, "grad_norm": 0.3202469050884247, "learning_rate": 6.741468258377224e-06, "loss": 0.5219, "step": 8954 }, { "epoch": 4.051119656186383, "grad_norm": 0.36336976289749146, "learning_rate": 6.740781370404471e-06, "loss": 0.621, "step": 8955 }, { "epoch": 4.051572042524316, "grad_norm": 0.39434435963630676, "learning_rate": 6.74009444504512e-06, "loss": 0.893, "step": 8956 }, { "epoch": 4.052024428862248, "grad_norm": 0.38370826840400696, "learning_rate": 6.739407482313923e-06, "loss": 0.655, "step": 8957 }, { "epoch": 4.052476815200181, "grad_norm": 0.38143935799598694, "learning_rate": 6.738720482225635e-06, "loss": 0.4804, "step": 8958 }, { "epoch": 4.052929201538114, "grad_norm": 0.3503269553184509, "learning_rate": 6.738033444795008e-06, "loss": 0.4543, "step": 8959 }, { "epoch": 4.053381587876046, "grad_norm": 0.36184510588645935, "learning_rate": 6.7373463700367995e-06, "loss": 0.483, "step": 8960 }, { "epoch": 4.053833974213979, "grad_norm": 0.4138580560684204, "learning_rate": 6.736659257965765e-06, "loss": 0.7174, "step": 8961 }, { "epoch": 4.054286360551911, "grad_norm": 0.39256253838539124, "learning_rate": 6.735972108596662e-06, "loss": 0.5039, "step": 8962 }, { "epoch": 4.054738746889844, "grad_norm": 0.40680551528930664, "learning_rate": 6.735284921944248e-06, "loss": 0.5375, "step": 8963 }, { "epoch": 4.055191133227776, "grad_norm": 0.36440369486808777, "learning_rate": 6.734597698023282e-06, "loss": 0.4858, "step": 8964 }, { "epoch": 4.0556435195657095, "grad_norm": 0.4285513460636139, "learning_rate": 6.733910436848523e-06, "loss": 0.549, "step": 8965 }, { "epoch": 4.056095905903642, "grad_norm": 0.4270610511302948, "learning_rate": 6.733223138434732e-06, "loss": 0.5976, "step": 8966 }, { "epoch": 4.056548292241574, "grad_norm": 0.38607141375541687, "learning_rate": 6.732535802796669e-06, "loss": 0.4933, "step": 8967 }, { "epoch": 4.057000678579507, "grad_norm": 0.4409521222114563, "learning_rate": 6.731848429949097e-06, "loss": 0.5665, "step": 8968 }, { "epoch": 4.057453064917439, "grad_norm": 0.44536978006362915, "learning_rate": 6.731161019906778e-06, "loss": 0.6142, "step": 8969 }, { "epoch": 4.0579054512553725, "grad_norm": 0.4278338849544525, "learning_rate": 6.730473572684475e-06, "loss": 0.5466, "step": 8970 }, { "epoch": 4.058357837593305, "grad_norm": 0.4434773027896881, "learning_rate": 6.729786088296953e-06, "loss": 0.4827, "step": 8971 }, { "epoch": 4.058810223931237, "grad_norm": 0.41205769777297974, "learning_rate": 6.729098566758976e-06, "loss": 0.5388, "step": 8972 }, { "epoch": 4.05926261026917, "grad_norm": 0.4620945453643799, "learning_rate": 6.728411008085311e-06, "loss": 0.5768, "step": 8973 }, { "epoch": 4.059714996607102, "grad_norm": 0.4520975649356842, "learning_rate": 6.727723412290724e-06, "loss": 0.5686, "step": 8974 }, { "epoch": 4.060167382945035, "grad_norm": 0.467867910861969, "learning_rate": 6.727035779389983e-06, "loss": 0.6017, "step": 8975 }, { "epoch": 4.060619769282968, "grad_norm": 0.4153282344341278, "learning_rate": 6.726348109397856e-06, "loss": 0.4826, "step": 8976 }, { "epoch": 4.0610721556209, "grad_norm": 0.4389496147632599, "learning_rate": 6.72566040232911e-06, "loss": 0.5797, "step": 8977 }, { "epoch": 4.061524541958833, "grad_norm": 0.4793166220188141, "learning_rate": 6.724972658198519e-06, "loss": 0.5994, "step": 8978 }, { "epoch": 4.061976928296765, "grad_norm": 0.40304481983184814, "learning_rate": 6.72428487702085e-06, "loss": 0.4137, "step": 8979 }, { "epoch": 4.062429314634698, "grad_norm": 0.46227824687957764, "learning_rate": 6.723597058810877e-06, "loss": 0.5264, "step": 8980 }, { "epoch": 4.062881700972631, "grad_norm": 0.47521263360977173, "learning_rate": 6.722909203583371e-06, "loss": 0.4989, "step": 8981 }, { "epoch": 4.0633340873105634, "grad_norm": 0.49304044246673584, "learning_rate": 6.722221311353104e-06, "loss": 0.4809, "step": 8982 }, { "epoch": 4.063786473648496, "grad_norm": 0.49757590889930725, "learning_rate": 6.72153338213485e-06, "loss": 0.5437, "step": 8983 }, { "epoch": 4.064238859986428, "grad_norm": 0.48888224363327026, "learning_rate": 6.720845415943386e-06, "loss": 0.4986, "step": 8984 }, { "epoch": 4.064691246324361, "grad_norm": 0.5714057683944702, "learning_rate": 6.720157412793484e-06, "loss": 0.5622, "step": 8985 }, { "epoch": 4.065143632662293, "grad_norm": 0.5163483023643494, "learning_rate": 6.719469372699923e-06, "loss": 0.5432, "step": 8986 }, { "epoch": 4.0655960190002265, "grad_norm": 0.5030855536460876, "learning_rate": 6.718781295677479e-06, "loss": 0.5233, "step": 8987 }, { "epoch": 4.066048405338159, "grad_norm": 0.584238588809967, "learning_rate": 6.718093181740929e-06, "loss": 0.5705, "step": 8988 }, { "epoch": 4.066500791676091, "grad_norm": 0.6202609539031982, "learning_rate": 6.717405030905052e-06, "loss": 0.618, "step": 8989 }, { "epoch": 4.066953178014024, "grad_norm": 0.5871252417564392, "learning_rate": 6.716716843184628e-06, "loss": 0.6177, "step": 8990 }, { "epoch": 4.067405564351956, "grad_norm": 0.5783814191818237, "learning_rate": 6.716028618594437e-06, "loss": 0.5821, "step": 8991 }, { "epoch": 4.0678579506898895, "grad_norm": 0.6535913348197937, "learning_rate": 6.71534035714926e-06, "loss": 0.4733, "step": 8992 }, { "epoch": 4.068310337027822, "grad_norm": 0.1629963517189026, "learning_rate": 6.714652058863878e-06, "loss": 1.3151, "step": 8993 }, { "epoch": 4.068762723365754, "grad_norm": 0.22391116619110107, "learning_rate": 6.713963723753073e-06, "loss": 0.9034, "step": 8994 }, { "epoch": 4.069215109703687, "grad_norm": 0.26107820868492126, "learning_rate": 6.7132753518316294e-06, "loss": 0.5506, "step": 8995 }, { "epoch": 4.069667496041619, "grad_norm": 0.306374192237854, "learning_rate": 6.712586943114332e-06, "loss": 0.5773, "step": 8996 }, { "epoch": 4.070119882379553, "grad_norm": 0.320566862821579, "learning_rate": 6.711898497615963e-06, "loss": 0.6794, "step": 8997 }, { "epoch": 4.070572268717485, "grad_norm": 0.2867784798145294, "learning_rate": 6.71121001535131e-06, "loss": 0.49, "step": 8998 }, { "epoch": 4.071024655055417, "grad_norm": 0.3203221559524536, "learning_rate": 6.71052149633516e-06, "loss": 0.5051, "step": 8999 }, { "epoch": 4.07147704139335, "grad_norm": 0.3210018575191498, "learning_rate": 6.7098329405823014e-06, "loss": 0.5506, "step": 9000 }, { "epoch": 4.07147704139335, "eval_loss": 0.5913072824478149, "eval_runtime": 26.0671, "eval_samples_per_second": 28.542, "eval_steps_per_second": 7.135, "step": 9000 }, { "epoch": 4.071929427731282, "grad_norm": 0.3589558005332947, "learning_rate": 6.709144348107518e-06, "loss": 0.6807, "step": 9001 }, { "epoch": 4.072381814069215, "grad_norm": 0.3256528675556183, "learning_rate": 6.708455718925602e-06, "loss": 0.5394, "step": 9002 }, { "epoch": 4.072834200407148, "grad_norm": 0.3559659719467163, "learning_rate": 6.707767053051341e-06, "loss": 0.6392, "step": 9003 }, { "epoch": 4.0732865867450805, "grad_norm": 0.35807549953460693, "learning_rate": 6.707078350499529e-06, "loss": 0.6564, "step": 9004 }, { "epoch": 4.073738973083013, "grad_norm": 0.3610268533229828, "learning_rate": 6.706389611284953e-06, "loss": 0.6183, "step": 9005 }, { "epoch": 4.074191359420945, "grad_norm": 0.31714102625846863, "learning_rate": 6.705700835422407e-06, "loss": 0.4331, "step": 9006 }, { "epoch": 4.074643745758878, "grad_norm": 0.43939408659935, "learning_rate": 6.705012022926683e-06, "loss": 0.6837, "step": 9007 }, { "epoch": 4.075096132096811, "grad_norm": 0.35289546847343445, "learning_rate": 6.704323173812574e-06, "loss": 0.5751, "step": 9008 }, { "epoch": 4.0755485184347435, "grad_norm": 0.3460550606250763, "learning_rate": 6.703634288094878e-06, "loss": 0.4844, "step": 9009 }, { "epoch": 4.076000904772676, "grad_norm": 0.3719133138656616, "learning_rate": 6.702945365788386e-06, "loss": 0.5569, "step": 9010 }, { "epoch": 4.076453291110608, "grad_norm": 0.34720826148986816, "learning_rate": 6.702256406907895e-06, "loss": 0.5638, "step": 9011 }, { "epoch": 4.076905677448541, "grad_norm": 0.39279359579086304, "learning_rate": 6.701567411468202e-06, "loss": 0.4844, "step": 9012 }, { "epoch": 4.077358063786473, "grad_norm": 0.3923804759979248, "learning_rate": 6.700878379484106e-06, "loss": 0.6015, "step": 9013 }, { "epoch": 4.077810450124407, "grad_norm": 0.420977920293808, "learning_rate": 6.700189310970402e-06, "loss": 0.5989, "step": 9014 }, { "epoch": 4.078262836462339, "grad_norm": 0.41333895921707153, "learning_rate": 6.699500205941894e-06, "loss": 0.4869, "step": 9015 }, { "epoch": 4.078715222800271, "grad_norm": 0.44021815061569214, "learning_rate": 6.698811064413376e-06, "loss": 0.5896, "step": 9016 }, { "epoch": 4.079167609138204, "grad_norm": 0.42071107029914856, "learning_rate": 6.6981218863996524e-06, "loss": 0.5475, "step": 9017 }, { "epoch": 4.079619995476136, "grad_norm": 0.3915330767631531, "learning_rate": 6.697432671915523e-06, "loss": 0.4957, "step": 9018 }, { "epoch": 4.08007238181407, "grad_norm": 0.4193454682826996, "learning_rate": 6.6967434209757906e-06, "loss": 0.4843, "step": 9019 }, { "epoch": 4.080524768152002, "grad_norm": 0.39866751432418823, "learning_rate": 6.696054133595259e-06, "loss": 0.5159, "step": 9020 }, { "epoch": 4.0809771544899345, "grad_norm": 0.4310080409049988, "learning_rate": 6.695364809788732e-06, "loss": 0.5589, "step": 9021 }, { "epoch": 4.081429540827867, "grad_norm": 0.4260406792163849, "learning_rate": 6.694675449571011e-06, "loss": 0.5259, "step": 9022 }, { "epoch": 4.081881927165799, "grad_norm": 0.43296605348587036, "learning_rate": 6.6939860529569046e-06, "loss": 0.5743, "step": 9023 }, { "epoch": 4.082334313503732, "grad_norm": 0.39095258712768555, "learning_rate": 6.6932966199612186e-06, "loss": 0.4128, "step": 9024 }, { "epoch": 4.082786699841665, "grad_norm": 0.45127713680267334, "learning_rate": 6.692607150598757e-06, "loss": 0.5623, "step": 9025 }, { "epoch": 4.0832390861795975, "grad_norm": 0.3664487600326538, "learning_rate": 6.691917644884334e-06, "loss": 0.345, "step": 9026 }, { "epoch": 4.08369147251753, "grad_norm": 0.4891457259654999, "learning_rate": 6.691228102832751e-06, "loss": 0.5218, "step": 9027 }, { "epoch": 4.084143858855462, "grad_norm": 0.45395809412002563, "learning_rate": 6.690538524458821e-06, "loss": 0.5572, "step": 9028 }, { "epoch": 4.084596245193395, "grad_norm": 0.46820321679115295, "learning_rate": 6.689848909777351e-06, "loss": 0.4742, "step": 9029 }, { "epoch": 4.085048631531328, "grad_norm": 0.4379742741584778, "learning_rate": 6.6891592588031554e-06, "loss": 0.4848, "step": 9030 }, { "epoch": 4.085501017869261, "grad_norm": 0.49389901757240295, "learning_rate": 6.688469571551043e-06, "loss": 0.4951, "step": 9031 }, { "epoch": 4.085953404207193, "grad_norm": 0.45154890418052673, "learning_rate": 6.687779848035828e-06, "loss": 0.4421, "step": 9032 }, { "epoch": 4.086405790545125, "grad_norm": 0.4925287365913391, "learning_rate": 6.687090088272323e-06, "loss": 0.5069, "step": 9033 }, { "epoch": 4.086858176883058, "grad_norm": 0.4707770347595215, "learning_rate": 6.6864002922753415e-06, "loss": 0.5447, "step": 9034 }, { "epoch": 4.08731056322099, "grad_norm": 0.5050384998321533, "learning_rate": 6.685710460059698e-06, "loss": 0.4708, "step": 9035 }, { "epoch": 4.087762949558924, "grad_norm": 0.5428707003593445, "learning_rate": 6.685020591640209e-06, "loss": 0.5459, "step": 9036 }, { "epoch": 4.088215335896856, "grad_norm": 0.47834160923957825, "learning_rate": 6.684330687031689e-06, "loss": 0.4335, "step": 9037 }, { "epoch": 4.0886677222347885, "grad_norm": 0.48910605907440186, "learning_rate": 6.683640746248958e-06, "loss": 0.5351, "step": 9038 }, { "epoch": 4.089120108572721, "grad_norm": 0.581983745098114, "learning_rate": 6.682950769306831e-06, "loss": 0.5836, "step": 9039 }, { "epoch": 4.089572494910653, "grad_norm": 0.5501769781112671, "learning_rate": 6.682260756220128e-06, "loss": 0.4488, "step": 9040 }, { "epoch": 4.090024881248587, "grad_norm": 0.6326491832733154, "learning_rate": 6.681570707003667e-06, "loss": 0.5425, "step": 9041 }, { "epoch": 4.090477267586519, "grad_norm": 0.738612711429596, "learning_rate": 6.680880621672269e-06, "loss": 0.4705, "step": 9042 }, { "epoch": 4.0909296539244515, "grad_norm": 0.2119298130273819, "learning_rate": 6.680190500240755e-06, "loss": 1.3659, "step": 9043 }, { "epoch": 4.091382040262384, "grad_norm": 0.31018659472465515, "learning_rate": 6.679500342723947e-06, "loss": 0.5331, "step": 9044 }, { "epoch": 4.091834426600316, "grad_norm": 0.2664101719856262, "learning_rate": 6.678810149136666e-06, "loss": 0.5406, "step": 9045 }, { "epoch": 4.09228681293825, "grad_norm": 0.31718966364860535, "learning_rate": 6.678119919493738e-06, "loss": 0.5631, "step": 9046 }, { "epoch": 4.092739199276182, "grad_norm": 0.3469271659851074, "learning_rate": 6.677429653809983e-06, "loss": 0.6837, "step": 9047 }, { "epoch": 4.093191585614115, "grad_norm": 0.32890236377716064, "learning_rate": 6.676739352100229e-06, "loss": 0.5703, "step": 9048 }, { "epoch": 4.093643971952047, "grad_norm": 0.3745458126068115, "learning_rate": 6.6760490143793025e-06, "loss": 0.5633, "step": 9049 }, { "epoch": 4.094096358289979, "grad_norm": 0.36159011721611023, "learning_rate": 6.675358640662025e-06, "loss": 0.6397, "step": 9050 }, { "epoch": 4.094548744627912, "grad_norm": 0.4086318910121918, "learning_rate": 6.674668230963227e-06, "loss": 0.6404, "step": 9051 }, { "epoch": 4.095001130965845, "grad_norm": 0.31039589643478394, "learning_rate": 6.6739777852977365e-06, "loss": 0.5345, "step": 9052 }, { "epoch": 4.095453517303778, "grad_norm": 0.4099605679512024, "learning_rate": 6.67328730368038e-06, "loss": 0.5766, "step": 9053 }, { "epoch": 4.09590590364171, "grad_norm": 0.33940669894218445, "learning_rate": 6.67259678612599e-06, "loss": 0.4349, "step": 9054 }, { "epoch": 4.0963582899796425, "grad_norm": 0.3789840042591095, "learning_rate": 6.671906232649394e-06, "loss": 0.6002, "step": 9055 }, { "epoch": 4.096810676317575, "grad_norm": 0.40436026453971863, "learning_rate": 6.671215643265425e-06, "loss": 0.6187, "step": 9056 }, { "epoch": 4.097263062655508, "grad_norm": 0.3640940487384796, "learning_rate": 6.670525017988912e-06, "loss": 0.5679, "step": 9057 }, { "epoch": 4.097715448993441, "grad_norm": 0.4031279683113098, "learning_rate": 6.66983435683469e-06, "loss": 0.6043, "step": 9058 }, { "epoch": 4.098167835331373, "grad_norm": 0.42706626653671265, "learning_rate": 6.669143659817592e-06, "loss": 0.4672, "step": 9059 }, { "epoch": 4.0986202216693055, "grad_norm": 0.4557705819606781, "learning_rate": 6.668452926952453e-06, "loss": 0.6696, "step": 9060 }, { "epoch": 4.099072608007238, "grad_norm": 0.3567633032798767, "learning_rate": 6.667762158254104e-06, "loss": 0.481, "step": 9061 }, { "epoch": 4.09952499434517, "grad_norm": 0.4400884211063385, "learning_rate": 6.6670713537373835e-06, "loss": 0.6403, "step": 9062 }, { "epoch": 4.099977380683104, "grad_norm": 0.40164265036582947, "learning_rate": 6.6663805134171275e-06, "loss": 0.5276, "step": 9063 }, { "epoch": 4.100429767021036, "grad_norm": 0.40159016847610474, "learning_rate": 6.665689637308173e-06, "loss": 0.5148, "step": 9064 }, { "epoch": 4.1008821533589686, "grad_norm": 0.39258575439453125, "learning_rate": 6.6649987254253554e-06, "loss": 0.4752, "step": 9065 }, { "epoch": 4.101334539696901, "grad_norm": 0.43841078877449036, "learning_rate": 6.664307777783519e-06, "loss": 0.6943, "step": 9066 }, { "epoch": 4.101786926034833, "grad_norm": 0.4171494245529175, "learning_rate": 6.663616794397497e-06, "loss": 0.5931, "step": 9067 }, { "epoch": 4.102239312372767, "grad_norm": 0.46724703907966614, "learning_rate": 6.662925775282134e-06, "loss": 0.5299, "step": 9068 }, { "epoch": 4.102691698710699, "grad_norm": 0.41519272327423096, "learning_rate": 6.66223472045227e-06, "loss": 0.5478, "step": 9069 }, { "epoch": 4.103144085048632, "grad_norm": 0.43479683995246887, "learning_rate": 6.661543629922744e-06, "loss": 0.5713, "step": 9070 }, { "epoch": 4.103596471386564, "grad_norm": 0.4414137899875641, "learning_rate": 6.6608525037084035e-06, "loss": 0.5958, "step": 9071 }, { "epoch": 4.1040488577244965, "grad_norm": 0.4194486141204834, "learning_rate": 6.660161341824087e-06, "loss": 0.4741, "step": 9072 }, { "epoch": 4.104501244062429, "grad_norm": 0.3938842713832855, "learning_rate": 6.659470144284641e-06, "loss": 0.4117, "step": 9073 }, { "epoch": 4.104953630400362, "grad_norm": 0.3916451930999756, "learning_rate": 6.658778911104909e-06, "loss": 0.4342, "step": 9074 }, { "epoch": 4.105406016738295, "grad_norm": 0.43872955441474915, "learning_rate": 6.6580876422997375e-06, "loss": 0.5407, "step": 9075 }, { "epoch": 4.105858403076227, "grad_norm": 0.5281492471694946, "learning_rate": 6.6573963378839735e-06, "loss": 0.6197, "step": 9076 }, { "epoch": 4.1063107894141595, "grad_norm": 0.46645504236221313, "learning_rate": 6.656704997872462e-06, "loss": 0.5188, "step": 9077 }, { "epoch": 4.106763175752092, "grad_norm": 0.48467329144477844, "learning_rate": 6.656013622280052e-06, "loss": 0.5088, "step": 9078 }, { "epoch": 4.107215562090025, "grad_norm": 0.43272092938423157, "learning_rate": 6.655322211121593e-06, "loss": 0.4723, "step": 9079 }, { "epoch": 4.107667948427958, "grad_norm": 0.4462508261203766, "learning_rate": 6.6546307644119324e-06, "loss": 0.475, "step": 9080 }, { "epoch": 4.10812033476589, "grad_norm": 0.48935022950172424, "learning_rate": 6.653939282165922e-06, "loss": 0.477, "step": 9081 }, { "epoch": 4.1085727211038225, "grad_norm": 0.4673260450363159, "learning_rate": 6.653247764398411e-06, "loss": 0.4705, "step": 9082 }, { "epoch": 4.109025107441755, "grad_norm": 0.520735502243042, "learning_rate": 6.652556211124255e-06, "loss": 0.5294, "step": 9083 }, { "epoch": 4.109477493779687, "grad_norm": 0.4910898506641388, "learning_rate": 6.651864622358301e-06, "loss": 0.4689, "step": 9084 }, { "epoch": 4.109929880117621, "grad_norm": 0.4763193428516388, "learning_rate": 6.651172998115406e-06, "loss": 0.4274, "step": 9085 }, { "epoch": 4.110382266455553, "grad_norm": 0.5455310940742493, "learning_rate": 6.650481338410421e-06, "loss": 0.5092, "step": 9086 }, { "epoch": 4.110834652793486, "grad_norm": 0.5499915480613708, "learning_rate": 6.6497896432582045e-06, "loss": 0.5363, "step": 9087 }, { "epoch": 4.111287039131418, "grad_norm": 0.5465049147605896, "learning_rate": 6.6490979126736085e-06, "loss": 0.5444, "step": 9088 }, { "epoch": 4.1117394254693505, "grad_norm": 0.558255136013031, "learning_rate": 6.648406146671491e-06, "loss": 0.5032, "step": 9089 }, { "epoch": 4.112191811807284, "grad_norm": 0.49428245425224304, "learning_rate": 6.647714345266709e-06, "loss": 0.4406, "step": 9090 }, { "epoch": 4.112644198145216, "grad_norm": 0.5770502686500549, "learning_rate": 6.64702250847412e-06, "loss": 0.4222, "step": 9091 }, { "epoch": 4.113096584483149, "grad_norm": 0.5956503748893738, "learning_rate": 6.6463306363085825e-06, "loss": 0.4543, "step": 9092 }, { "epoch": 4.113548970821081, "grad_norm": 0.16313058137893677, "learning_rate": 6.645638728784954e-06, "loss": 1.4286, "step": 9093 }, { "epoch": 4.1140013571590135, "grad_norm": 0.20983043313026428, "learning_rate": 6.644946785918099e-06, "loss": 0.6113, "step": 9094 }, { "epoch": 4.114453743496947, "grad_norm": 0.2770179212093353, "learning_rate": 6.644254807722874e-06, "loss": 0.6369, "step": 9095 }, { "epoch": 4.114906129834879, "grad_norm": 0.31334835290908813, "learning_rate": 6.643562794214142e-06, "loss": 0.6768, "step": 9096 }, { "epoch": 4.115358516172812, "grad_norm": 0.3042267858982086, "learning_rate": 6.642870745406765e-06, "loss": 0.6056, "step": 9097 }, { "epoch": 4.115810902510744, "grad_norm": 0.3350878059864044, "learning_rate": 6.642178661315607e-06, "loss": 0.6235, "step": 9098 }, { "epoch": 4.1162632888486765, "grad_norm": 0.2980155944824219, "learning_rate": 6.641486541955531e-06, "loss": 0.5478, "step": 9099 }, { "epoch": 4.116715675186609, "grad_norm": 0.31785908341407776, "learning_rate": 6.640794387341402e-06, "loss": 0.553, "step": 9100 }, { "epoch": 4.117168061524542, "grad_norm": 0.3463713824748993, "learning_rate": 6.640102197488085e-06, "loss": 0.6037, "step": 9101 }, { "epoch": 4.117620447862475, "grad_norm": 0.36866945028305054, "learning_rate": 6.639409972410446e-06, "loss": 0.6015, "step": 9102 }, { "epoch": 4.118072834200407, "grad_norm": 0.33507850766181946, "learning_rate": 6.638717712123353e-06, "loss": 0.5349, "step": 9103 }, { "epoch": 4.11852522053834, "grad_norm": 0.375373899936676, "learning_rate": 6.638025416641671e-06, "loss": 0.6172, "step": 9104 }, { "epoch": 4.118977606876272, "grad_norm": 0.3971033990383148, "learning_rate": 6.637333085980273e-06, "loss": 0.5539, "step": 9105 }, { "epoch": 4.119429993214205, "grad_norm": 0.4906928539276123, "learning_rate": 6.636640720154023e-06, "loss": 0.7682, "step": 9106 }, { "epoch": 4.119882379552138, "grad_norm": 0.3790417015552521, "learning_rate": 6.635948319177794e-06, "loss": 0.5473, "step": 9107 }, { "epoch": 4.12033476589007, "grad_norm": 0.4143795371055603, "learning_rate": 6.635255883066455e-06, "loss": 0.6134, "step": 9108 }, { "epoch": 4.120787152228003, "grad_norm": 0.41347283124923706, "learning_rate": 6.6345634118348775e-06, "loss": 0.5169, "step": 9109 }, { "epoch": 4.121239538565935, "grad_norm": 0.4088606834411621, "learning_rate": 6.633870905497936e-06, "loss": 0.5697, "step": 9110 }, { "epoch": 4.1216919249038675, "grad_norm": 0.385924756526947, "learning_rate": 6.6331783640705e-06, "loss": 0.5732, "step": 9111 }, { "epoch": 4.122144311241801, "grad_norm": 0.42971423268318176, "learning_rate": 6.632485787567447e-06, "loss": 0.5725, "step": 9112 }, { "epoch": 4.122596697579733, "grad_norm": 0.3749372959136963, "learning_rate": 6.631793176003647e-06, "loss": 0.4795, "step": 9113 }, { "epoch": 4.123049083917666, "grad_norm": 0.42711886763572693, "learning_rate": 6.6311005293939775e-06, "loss": 0.6229, "step": 9114 }, { "epoch": 4.123501470255598, "grad_norm": 0.3922116458415985, "learning_rate": 6.630407847753315e-06, "loss": 0.4307, "step": 9115 }, { "epoch": 4.1239538565935305, "grad_norm": 0.4351806044578552, "learning_rate": 6.629715131096535e-06, "loss": 0.4575, "step": 9116 }, { "epoch": 4.124406242931464, "grad_norm": 0.43186554312705994, "learning_rate": 6.629022379438517e-06, "loss": 0.5459, "step": 9117 }, { "epoch": 4.124858629269396, "grad_norm": 0.4584064781665802, "learning_rate": 6.628329592794136e-06, "loss": 0.6234, "step": 9118 }, { "epoch": 4.125311015607329, "grad_norm": 0.4306950271129608, "learning_rate": 6.627636771178272e-06, "loss": 0.5758, "step": 9119 }, { "epoch": 4.125763401945261, "grad_norm": 0.4300723373889923, "learning_rate": 6.6269439146058055e-06, "loss": 0.5029, "step": 9120 }, { "epoch": 4.126215788283194, "grad_norm": 0.4667360484600067, "learning_rate": 6.626251023091616e-06, "loss": 0.6271, "step": 9121 }, { "epoch": 4.126668174621127, "grad_norm": 0.4423859417438507, "learning_rate": 6.625558096650586e-06, "loss": 0.5146, "step": 9122 }, { "epoch": 4.127120560959059, "grad_norm": 0.4667820334434509, "learning_rate": 6.624865135297597e-06, "loss": 0.585, "step": 9123 }, { "epoch": 4.127572947296992, "grad_norm": 0.4350406229496002, "learning_rate": 6.62417213904753e-06, "loss": 0.4991, "step": 9124 }, { "epoch": 4.128025333634924, "grad_norm": 0.4258823096752167, "learning_rate": 6.623479107915271e-06, "loss": 0.474, "step": 9125 }, { "epoch": 4.128477719972857, "grad_norm": 0.4404004216194153, "learning_rate": 6.622786041915701e-06, "loss": 0.5183, "step": 9126 }, { "epoch": 4.128930106310789, "grad_norm": 0.4794539213180542, "learning_rate": 6.622092941063708e-06, "loss": 0.476, "step": 9127 }, { "epoch": 4.129382492648722, "grad_norm": 0.4917868971824646, "learning_rate": 6.621399805374177e-06, "loss": 0.5918, "step": 9128 }, { "epoch": 4.129834878986655, "grad_norm": 0.4619574248790741, "learning_rate": 6.620706634861993e-06, "loss": 0.4523, "step": 9129 }, { "epoch": 4.130287265324587, "grad_norm": 0.4955630302429199, "learning_rate": 6.620013429542044e-06, "loss": 0.4852, "step": 9130 }, { "epoch": 4.13073965166252, "grad_norm": 0.4454694986343384, "learning_rate": 6.619320189429217e-06, "loss": 0.4473, "step": 9131 }, { "epoch": 4.131192038000452, "grad_norm": 0.5039477944374084, "learning_rate": 6.618626914538402e-06, "loss": 0.5795, "step": 9132 }, { "epoch": 4.1316444243383845, "grad_norm": 0.4563736617565155, "learning_rate": 6.617933604884489e-06, "loss": 0.4199, "step": 9133 }, { "epoch": 4.132096810676318, "grad_norm": 0.4993060529232025, "learning_rate": 6.617240260482366e-06, "loss": 0.4572, "step": 9134 }, { "epoch": 4.13254919701425, "grad_norm": 0.5100919604301453, "learning_rate": 6.616546881346925e-06, "loss": 0.492, "step": 9135 }, { "epoch": 4.133001583352183, "grad_norm": 0.5726776123046875, "learning_rate": 6.615853467493057e-06, "loss": 0.6462, "step": 9136 }, { "epoch": 4.133453969690115, "grad_norm": 0.5385697484016418, "learning_rate": 6.615160018935656e-06, "loss": 0.5604, "step": 9137 }, { "epoch": 4.133906356028048, "grad_norm": 0.6392991542816162, "learning_rate": 6.614466535689615e-06, "loss": 0.5695, "step": 9138 }, { "epoch": 4.134358742365981, "grad_norm": 0.5418950319290161, "learning_rate": 6.613773017769826e-06, "loss": 0.4887, "step": 9139 }, { "epoch": 4.134811128703913, "grad_norm": 0.6282740831375122, "learning_rate": 6.613079465191184e-06, "loss": 0.5408, "step": 9140 }, { "epoch": 4.135263515041846, "grad_norm": 0.6692485213279724, "learning_rate": 6.6123858779685855e-06, "loss": 0.5282, "step": 9141 }, { "epoch": 4.135715901379778, "grad_norm": 0.6231878995895386, "learning_rate": 6.611692256116926e-06, "loss": 0.4984, "step": 9142 }, { "epoch": 4.136168287717711, "grad_norm": 0.15744389593601227, "learning_rate": 6.610998599651102e-06, "loss": 1.2564, "step": 9143 }, { "epoch": 4.136620674055644, "grad_norm": 0.29370054602622986, "learning_rate": 6.610304908586012e-06, "loss": 0.9001, "step": 9144 }, { "epoch": 4.137073060393576, "grad_norm": 0.27131786942481995, "learning_rate": 6.609611182936554e-06, "loss": 0.6468, "step": 9145 }, { "epoch": 4.137525446731509, "grad_norm": 0.32712894678115845, "learning_rate": 6.608917422717627e-06, "loss": 0.6404, "step": 9146 }, { "epoch": 4.137977833069441, "grad_norm": 0.3015301823616028, "learning_rate": 6.60822362794413e-06, "loss": 0.5846, "step": 9147 }, { "epoch": 4.138430219407374, "grad_norm": 0.32065367698669434, "learning_rate": 6.607529798630965e-06, "loss": 0.5493, "step": 9148 }, { "epoch": 4.138882605745306, "grad_norm": 0.37320369482040405, "learning_rate": 6.606835934793032e-06, "loss": 0.5405, "step": 9149 }, { "epoch": 4.139334992083239, "grad_norm": 0.3448481559753418, "learning_rate": 6.606142036445236e-06, "loss": 0.6153, "step": 9150 }, { "epoch": 4.139787378421172, "grad_norm": 0.35362985730171204, "learning_rate": 6.605448103602474e-06, "loss": 0.6349, "step": 9151 }, { "epoch": 4.140239764759104, "grad_norm": 0.3703480362892151, "learning_rate": 6.604754136279655e-06, "loss": 0.605, "step": 9152 }, { "epoch": 4.140692151097037, "grad_norm": 0.38893264532089233, "learning_rate": 6.604060134491681e-06, "loss": 0.6978, "step": 9153 }, { "epoch": 4.141144537434969, "grad_norm": 0.3638404607772827, "learning_rate": 6.6033660982534575e-06, "loss": 0.6412, "step": 9154 }, { "epoch": 4.1415969237729024, "grad_norm": 0.348911315202713, "learning_rate": 6.6026720275798895e-06, "loss": 0.5334, "step": 9155 }, { "epoch": 4.142049310110835, "grad_norm": 0.4051847755908966, "learning_rate": 6.601977922485884e-06, "loss": 0.5363, "step": 9156 }, { "epoch": 4.142501696448767, "grad_norm": 0.3401660621166229, "learning_rate": 6.601283782986348e-06, "loss": 0.4749, "step": 9157 }, { "epoch": 4.1429540827867, "grad_norm": 0.4174799621105194, "learning_rate": 6.600589609096191e-06, "loss": 0.5569, "step": 9158 }, { "epoch": 4.143406469124632, "grad_norm": 0.4058816432952881, "learning_rate": 6.59989540083032e-06, "loss": 0.5212, "step": 9159 }, { "epoch": 4.143858855462565, "grad_norm": 0.40461859107017517, "learning_rate": 6.599201158203644e-06, "loss": 0.5425, "step": 9160 }, { "epoch": 4.144311241800498, "grad_norm": 0.37000855803489685, "learning_rate": 6.598506881231075e-06, "loss": 0.5368, "step": 9161 }, { "epoch": 4.14476362813843, "grad_norm": 0.3567774295806885, "learning_rate": 6.597812569927523e-06, "loss": 0.5378, "step": 9162 }, { "epoch": 4.145216014476363, "grad_norm": 0.38867366313934326, "learning_rate": 6.597118224307899e-06, "loss": 0.6047, "step": 9163 }, { "epoch": 4.145668400814295, "grad_norm": 0.384263813495636, "learning_rate": 6.596423844387117e-06, "loss": 0.516, "step": 9164 }, { "epoch": 4.146120787152228, "grad_norm": 0.4664509892463684, "learning_rate": 6.595729430180088e-06, "loss": 0.631, "step": 9165 }, { "epoch": 4.146573173490161, "grad_norm": 0.37664395570755005, "learning_rate": 6.595034981701728e-06, "loss": 0.4799, "step": 9166 }, { "epoch": 4.147025559828093, "grad_norm": 0.40053460001945496, "learning_rate": 6.59434049896695e-06, "loss": 0.4771, "step": 9167 }, { "epoch": 4.147477946166026, "grad_norm": 0.416946142911911, "learning_rate": 6.593645981990671e-06, "loss": 0.5425, "step": 9168 }, { "epoch": 4.147930332503958, "grad_norm": 0.43832656741142273, "learning_rate": 6.592951430787804e-06, "loss": 0.5272, "step": 9169 }, { "epoch": 4.148382718841891, "grad_norm": 0.4129659831523895, "learning_rate": 6.592256845373269e-06, "loss": 0.4181, "step": 9170 }, { "epoch": 4.148835105179824, "grad_norm": 0.4328583776950836, "learning_rate": 6.591562225761983e-06, "loss": 0.4556, "step": 9171 }, { "epoch": 4.149287491517756, "grad_norm": 0.4203912615776062, "learning_rate": 6.5908675719688635e-06, "loss": 0.5118, "step": 9172 }, { "epoch": 4.149739877855689, "grad_norm": 0.42889127135276794, "learning_rate": 6.59017288400883e-06, "loss": 0.556, "step": 9173 }, { "epoch": 4.150192264193621, "grad_norm": 0.40963470935821533, "learning_rate": 6.589478161896802e-06, "loss": 0.4445, "step": 9174 }, { "epoch": 4.150644650531554, "grad_norm": 0.4756423532962799, "learning_rate": 6.588783405647699e-06, "loss": 0.6065, "step": 9175 }, { "epoch": 4.151097036869486, "grad_norm": 0.49229705333709717, "learning_rate": 6.588088615276445e-06, "loss": 0.5528, "step": 9176 }, { "epoch": 4.1515494232074195, "grad_norm": 0.4514625370502472, "learning_rate": 6.587393790797958e-06, "loss": 0.4283, "step": 9177 }, { "epoch": 4.152001809545352, "grad_norm": 0.4703383445739746, "learning_rate": 6.586698932227162e-06, "loss": 0.5524, "step": 9178 }, { "epoch": 4.152454195883284, "grad_norm": 0.4701790511608124, "learning_rate": 6.5860040395789835e-06, "loss": 0.5316, "step": 9179 }, { "epoch": 4.152906582221217, "grad_norm": 0.41231390833854675, "learning_rate": 6.585309112868343e-06, "loss": 0.4397, "step": 9180 }, { "epoch": 4.153358968559149, "grad_norm": 0.4592895209789276, "learning_rate": 6.584614152110168e-06, "loss": 0.4849, "step": 9181 }, { "epoch": 4.1538113548970825, "grad_norm": 0.5076760649681091, "learning_rate": 6.583919157319383e-06, "loss": 0.5891, "step": 9182 }, { "epoch": 4.154263741235015, "grad_norm": 0.5098790526390076, "learning_rate": 6.583224128510912e-06, "loss": 0.5112, "step": 9183 }, { "epoch": 4.154716127572947, "grad_norm": 0.45688554644584656, "learning_rate": 6.582529065699687e-06, "loss": 0.4776, "step": 9184 }, { "epoch": 4.15516851391088, "grad_norm": 0.5251797437667847, "learning_rate": 6.581833968900631e-06, "loss": 0.5425, "step": 9185 }, { "epoch": 4.155620900248812, "grad_norm": 0.46185821294784546, "learning_rate": 6.581138838128675e-06, "loss": 0.477, "step": 9186 }, { "epoch": 4.156073286586745, "grad_norm": 0.5564541816711426, "learning_rate": 6.5804436733987484e-06, "loss": 0.5304, "step": 9187 }, { "epoch": 4.156525672924678, "grad_norm": 0.5272897481918335, "learning_rate": 6.579748474725781e-06, "loss": 0.4949, "step": 9188 }, { "epoch": 4.15697805926261, "grad_norm": 0.5696784257888794, "learning_rate": 6.5790532421247026e-06, "loss": 0.4709, "step": 9189 }, { "epoch": 4.157430445600543, "grad_norm": 0.5359762907028198, "learning_rate": 6.578357975610444e-06, "loss": 0.4586, "step": 9190 }, { "epoch": 4.157882831938475, "grad_norm": 0.7397006750106812, "learning_rate": 6.57766267519794e-06, "loss": 0.5388, "step": 9191 }, { "epoch": 4.158335218276408, "grad_norm": 0.6063573360443115, "learning_rate": 6.576967340902122e-06, "loss": 0.4691, "step": 9192 }, { "epoch": 4.158787604614341, "grad_norm": 0.18295057117938995, "learning_rate": 6.576271972737923e-06, "loss": 1.2326, "step": 9193 }, { "epoch": 4.1592399909522735, "grad_norm": 0.2782609760761261, "learning_rate": 6.5755765707202776e-06, "loss": 0.6803, "step": 9194 }, { "epoch": 4.159692377290206, "grad_norm": 0.32770398259162903, "learning_rate": 6.574881134864123e-06, "loss": 0.5087, "step": 9195 }, { "epoch": 4.160144763628138, "grad_norm": 0.29775330424308777, "learning_rate": 6.574185665184393e-06, "loss": 0.5386, "step": 9196 }, { "epoch": 4.160597149966071, "grad_norm": 0.3262799382209778, "learning_rate": 6.573490161696023e-06, "loss": 0.5776, "step": 9197 }, { "epoch": 4.161049536304003, "grad_norm": 0.31988492608070374, "learning_rate": 6.572794624413954e-06, "loss": 0.5937, "step": 9198 }, { "epoch": 4.1615019226419365, "grad_norm": 0.3219819962978363, "learning_rate": 6.57209905335312e-06, "loss": 0.5629, "step": 9199 }, { "epoch": 4.161954308979869, "grad_norm": 0.31714409589767456, "learning_rate": 6.571403448528461e-06, "loss": 0.5168, "step": 9200 }, { "epoch": 4.161954308979869, "eval_loss": 0.5921995639801025, "eval_runtime": 26.7088, "eval_samples_per_second": 27.856, "eval_steps_per_second": 6.964, "step": 9200 }, { "epoch": 4.162406695317801, "grad_norm": 0.3399795889854431, "learning_rate": 6.570707809954918e-06, "loss": 0.6203, "step": 9201 }, { "epoch": 4.162859081655734, "grad_norm": 0.34163618087768555, "learning_rate": 6.5700121376474306e-06, "loss": 0.6328, "step": 9202 }, { "epoch": 4.163311467993666, "grad_norm": 0.39225703477859497, "learning_rate": 6.569316431620938e-06, "loss": 0.5321, "step": 9203 }, { "epoch": 4.1637638543316, "grad_norm": 0.347525954246521, "learning_rate": 6.568620691890383e-06, "loss": 0.5716, "step": 9204 }, { "epoch": 4.164216240669532, "grad_norm": 0.387200266122818, "learning_rate": 6.5679249184707074e-06, "loss": 0.7945, "step": 9205 }, { "epoch": 4.164668627007464, "grad_norm": 0.3773006498813629, "learning_rate": 6.567229111376856e-06, "loss": 0.6216, "step": 9206 }, { "epoch": 4.165121013345397, "grad_norm": 0.3770281672477722, "learning_rate": 6.566533270623771e-06, "loss": 0.5485, "step": 9207 }, { "epoch": 4.165573399683329, "grad_norm": 0.3919973373413086, "learning_rate": 6.565837396226396e-06, "loss": 0.5985, "step": 9208 }, { "epoch": 4.166025786021262, "grad_norm": 0.4146258234977722, "learning_rate": 6.5651414881996785e-06, "loss": 0.6693, "step": 9209 }, { "epoch": 4.166478172359195, "grad_norm": 0.4150277078151703, "learning_rate": 6.564445546558564e-06, "loss": 0.5226, "step": 9210 }, { "epoch": 4.1669305586971275, "grad_norm": 0.40302574634552, "learning_rate": 6.563749571317998e-06, "loss": 0.4706, "step": 9211 }, { "epoch": 4.16738294503506, "grad_norm": 0.3936697840690613, "learning_rate": 6.563053562492929e-06, "loss": 0.5758, "step": 9212 }, { "epoch": 4.167835331372992, "grad_norm": 0.41873204708099365, "learning_rate": 6.562357520098303e-06, "loss": 0.5627, "step": 9213 }, { "epoch": 4.168287717710925, "grad_norm": 0.38459527492523193, "learning_rate": 6.561661444149072e-06, "loss": 0.5488, "step": 9214 }, { "epoch": 4.168740104048858, "grad_norm": 0.4695434868335724, "learning_rate": 6.560965334660183e-06, "loss": 0.5311, "step": 9215 }, { "epoch": 4.1691924903867905, "grad_norm": 0.445616751909256, "learning_rate": 6.56026919164659e-06, "loss": 0.6421, "step": 9216 }, { "epoch": 4.169644876724723, "grad_norm": 0.43061891198158264, "learning_rate": 6.559573015123238e-06, "loss": 0.5693, "step": 9217 }, { "epoch": 4.170097263062655, "grad_norm": 0.3953515589237213, "learning_rate": 6.558876805105083e-06, "loss": 0.5411, "step": 9218 }, { "epoch": 4.170549649400588, "grad_norm": 0.4585588872432709, "learning_rate": 6.558180561607077e-06, "loss": 0.4823, "step": 9219 }, { "epoch": 4.171002035738521, "grad_norm": 0.39511021971702576, "learning_rate": 6.5574842846441724e-06, "loss": 0.4543, "step": 9220 }, { "epoch": 4.171454422076454, "grad_norm": 0.4460030794143677, "learning_rate": 6.556787974231323e-06, "loss": 0.5067, "step": 9221 }, { "epoch": 4.171906808414386, "grad_norm": 0.41156846284866333, "learning_rate": 6.556091630383484e-06, "loss": 0.5114, "step": 9222 }, { "epoch": 4.172359194752318, "grad_norm": 0.431941419839859, "learning_rate": 6.555395253115609e-06, "loss": 0.4857, "step": 9223 }, { "epoch": 4.172811581090251, "grad_norm": 0.43444451689720154, "learning_rate": 6.554698842442657e-06, "loss": 0.4843, "step": 9224 }, { "epoch": 4.173263967428183, "grad_norm": 0.45430293679237366, "learning_rate": 6.554002398379583e-06, "loss": 0.5275, "step": 9225 }, { "epoch": 4.173716353766117, "grad_norm": 0.4781428277492523, "learning_rate": 6.553305920941344e-06, "loss": 0.5158, "step": 9226 }, { "epoch": 4.174168740104049, "grad_norm": 0.4971734285354614, "learning_rate": 6.5526094101429e-06, "loss": 0.5703, "step": 9227 }, { "epoch": 4.1746211264419815, "grad_norm": 0.5024170875549316, "learning_rate": 6.551912865999208e-06, "loss": 0.5728, "step": 9228 }, { "epoch": 4.175073512779914, "grad_norm": 0.4263124167919159, "learning_rate": 6.551216288525228e-06, "loss": 0.4786, "step": 9229 }, { "epoch": 4.175525899117846, "grad_norm": 0.4411052465438843, "learning_rate": 6.55051967773592e-06, "loss": 0.4034, "step": 9230 }, { "epoch": 4.17597828545578, "grad_norm": 0.4800819754600525, "learning_rate": 6.5498230336462476e-06, "loss": 0.496, "step": 9231 }, { "epoch": 4.176430671793712, "grad_norm": 0.49966204166412354, "learning_rate": 6.549126356271169e-06, "loss": 0.6006, "step": 9232 }, { "epoch": 4.1768830581316445, "grad_norm": 0.5056126117706299, "learning_rate": 6.548429645625649e-06, "loss": 0.5748, "step": 9233 }, { "epoch": 4.177335444469577, "grad_norm": 0.47295308113098145, "learning_rate": 6.547732901724649e-06, "loss": 0.4477, "step": 9234 }, { "epoch": 4.177787830807509, "grad_norm": 0.49144110083580017, "learning_rate": 6.547036124583135e-06, "loss": 0.4732, "step": 9235 }, { "epoch": 4.178240217145442, "grad_norm": 0.5488713383674622, "learning_rate": 6.54633931421607e-06, "loss": 0.5774, "step": 9236 }, { "epoch": 4.178692603483375, "grad_norm": 0.4610040783882141, "learning_rate": 6.54564247063842e-06, "loss": 0.4704, "step": 9237 }, { "epoch": 4.1791449898213076, "grad_norm": 0.5313790440559387, "learning_rate": 6.544945593865153e-06, "loss": 0.517, "step": 9238 }, { "epoch": 4.17959737615924, "grad_norm": 0.5477973818778992, "learning_rate": 6.544248683911232e-06, "loss": 0.4667, "step": 9239 }, { "epoch": 4.180049762497172, "grad_norm": 0.564037024974823, "learning_rate": 6.543551740791627e-06, "loss": 0.5629, "step": 9240 }, { "epoch": 4.180502148835105, "grad_norm": 0.6299627423286438, "learning_rate": 6.542854764521305e-06, "loss": 0.54, "step": 9241 }, { "epoch": 4.180954535173038, "grad_norm": 0.626824140548706, "learning_rate": 6.542157755115235e-06, "loss": 0.498, "step": 9242 }, { "epoch": 4.181406921510971, "grad_norm": 0.16041602194309235, "learning_rate": 6.541460712588387e-06, "loss": 1.1268, "step": 9243 }, { "epoch": 4.181859307848903, "grad_norm": 0.2847653031349182, "learning_rate": 6.540763636955732e-06, "loss": 0.4555, "step": 9244 }, { "epoch": 4.1823116941868355, "grad_norm": 0.3706931173801422, "learning_rate": 6.540066528232239e-06, "loss": 0.949, "step": 9245 }, { "epoch": 4.182764080524768, "grad_norm": 0.3013375997543335, "learning_rate": 6.539369386432881e-06, "loss": 0.5928, "step": 9246 }, { "epoch": 4.1832164668627, "grad_norm": 0.31228476762771606, "learning_rate": 6.5386722115726305e-06, "loss": 0.5288, "step": 9247 }, { "epoch": 4.183668853200634, "grad_norm": 0.3005448877811432, "learning_rate": 6.537975003666462e-06, "loss": 0.5702, "step": 9248 }, { "epoch": 4.184121239538566, "grad_norm": 0.33203378319740295, "learning_rate": 6.537277762729346e-06, "loss": 0.5812, "step": 9249 }, { "epoch": 4.1845736258764985, "grad_norm": 0.33303046226501465, "learning_rate": 6.536580488776262e-06, "loss": 0.5014, "step": 9250 }, { "epoch": 4.185026012214431, "grad_norm": 0.41984662413597107, "learning_rate": 6.53588318182218e-06, "loss": 0.6379, "step": 9251 }, { "epoch": 4.185478398552363, "grad_norm": 0.35008344054222107, "learning_rate": 6.535185841882079e-06, "loss": 0.5281, "step": 9252 }, { "epoch": 4.185930784890297, "grad_norm": 0.3550383448600769, "learning_rate": 6.534488468970934e-06, "loss": 0.538, "step": 9253 }, { "epoch": 4.186383171228229, "grad_norm": 0.3907996416091919, "learning_rate": 6.533791063103725e-06, "loss": 0.5407, "step": 9254 }, { "epoch": 4.1868355575661615, "grad_norm": 0.4243888854980469, "learning_rate": 6.533093624295428e-06, "loss": 0.6335, "step": 9255 }, { "epoch": 4.187287943904094, "grad_norm": 0.4066236913204193, "learning_rate": 6.532396152561021e-06, "loss": 0.474, "step": 9256 }, { "epoch": 4.187740330242026, "grad_norm": 0.37382519245147705, "learning_rate": 6.531698647915486e-06, "loss": 0.474, "step": 9257 }, { "epoch": 4.188192716579959, "grad_norm": 0.407083123922348, "learning_rate": 6.531001110373802e-06, "loss": 0.6233, "step": 9258 }, { "epoch": 4.188645102917892, "grad_norm": 0.4279669523239136, "learning_rate": 6.530303539950951e-06, "loss": 0.5452, "step": 9259 }, { "epoch": 4.189097489255825, "grad_norm": 0.44528695940971375, "learning_rate": 6.5296059366619135e-06, "loss": 0.5745, "step": 9260 }, { "epoch": 4.189549875593757, "grad_norm": 0.4136759638786316, "learning_rate": 6.5289083005216735e-06, "loss": 0.5443, "step": 9261 }, { "epoch": 4.1900022619316895, "grad_norm": 0.40911999344825745, "learning_rate": 6.528210631545211e-06, "loss": 0.54, "step": 9262 }, { "epoch": 4.190454648269622, "grad_norm": 0.41354089975357056, "learning_rate": 6.527512929747511e-06, "loss": 0.4744, "step": 9263 }, { "epoch": 4.190907034607555, "grad_norm": 0.3555707335472107, "learning_rate": 6.526815195143559e-06, "loss": 0.4587, "step": 9264 }, { "epoch": 4.191359420945488, "grad_norm": 0.40972888469696045, "learning_rate": 6.52611742774834e-06, "loss": 0.4635, "step": 9265 }, { "epoch": 4.19181180728342, "grad_norm": 0.41060012578964233, "learning_rate": 6.525419627576839e-06, "loss": 0.5064, "step": 9266 }, { "epoch": 4.1922641936213525, "grad_norm": 0.42850685119628906, "learning_rate": 6.524721794644044e-06, "loss": 0.5667, "step": 9267 }, { "epoch": 4.192716579959285, "grad_norm": 0.4059579074382782, "learning_rate": 6.52402392896494e-06, "loss": 0.4326, "step": 9268 }, { "epoch": 4.193168966297218, "grad_norm": 0.4753672480583191, "learning_rate": 6.523326030554517e-06, "loss": 0.6099, "step": 9269 }, { "epoch": 4.193621352635151, "grad_norm": 0.4292205572128296, "learning_rate": 6.522628099427764e-06, "loss": 0.4926, "step": 9270 }, { "epoch": 4.194073738973083, "grad_norm": 0.42827844619750977, "learning_rate": 6.521930135599668e-06, "loss": 0.4614, "step": 9271 }, { "epoch": 4.1945261253110155, "grad_norm": 0.4388393461704254, "learning_rate": 6.521232139085223e-06, "loss": 0.5055, "step": 9272 }, { "epoch": 4.194978511648948, "grad_norm": 0.4575609564781189, "learning_rate": 6.520534109899416e-06, "loss": 0.5553, "step": 9273 }, { "epoch": 4.19543089798688, "grad_norm": 0.4477483928203583, "learning_rate": 6.51983604805724e-06, "loss": 0.5573, "step": 9274 }, { "epoch": 4.195883284324814, "grad_norm": 0.4507652223110199, "learning_rate": 6.519137953573687e-06, "loss": 0.5009, "step": 9275 }, { "epoch": 4.196335670662746, "grad_norm": 0.4672459065914154, "learning_rate": 6.518439826463749e-06, "loss": 0.5969, "step": 9276 }, { "epoch": 4.196788057000679, "grad_norm": 0.4982643723487854, "learning_rate": 6.517741666742421e-06, "loss": 0.5805, "step": 9277 }, { "epoch": 4.197240443338611, "grad_norm": 0.5321710705757141, "learning_rate": 6.517043474424698e-06, "loss": 0.6623, "step": 9278 }, { "epoch": 4.1976928296765434, "grad_norm": 0.4300493597984314, "learning_rate": 6.516345249525573e-06, "loss": 0.4575, "step": 9279 }, { "epoch": 4.198145216014477, "grad_norm": 0.5545474886894226, "learning_rate": 6.515646992060043e-06, "loss": 0.6809, "step": 9280 }, { "epoch": 4.198597602352409, "grad_norm": 0.46634694933891296, "learning_rate": 6.514948702043105e-06, "loss": 0.4883, "step": 9281 }, { "epoch": 4.199049988690342, "grad_norm": 0.5236040353775024, "learning_rate": 6.514250379489754e-06, "loss": 0.5278, "step": 9282 }, { "epoch": 4.199502375028274, "grad_norm": 0.5410577654838562, "learning_rate": 6.51355202441499e-06, "loss": 0.5181, "step": 9283 }, { "epoch": 4.1999547613662065, "grad_norm": 0.3966304659843445, "learning_rate": 6.51285363683381e-06, "loss": 0.3727, "step": 9284 }, { "epoch": 4.200407147704139, "grad_norm": 0.5211005806922913, "learning_rate": 6.5121552167612135e-06, "loss": 0.4694, "step": 9285 }, { "epoch": 4.200859534042072, "grad_norm": 0.5728071928024292, "learning_rate": 6.511456764212201e-06, "loss": 0.532, "step": 9286 }, { "epoch": 4.201311920380005, "grad_norm": 0.4865688681602478, "learning_rate": 6.510758279201773e-06, "loss": 0.4586, "step": 9287 }, { "epoch": 4.201764306717937, "grad_norm": 0.5828778147697449, "learning_rate": 6.5100597617449305e-06, "loss": 0.5726, "step": 9288 }, { "epoch": 4.2022166930558695, "grad_norm": 0.5592635869979858, "learning_rate": 6.509361211856674e-06, "loss": 0.4746, "step": 9289 }, { "epoch": 4.202669079393802, "grad_norm": 0.5741302371025085, "learning_rate": 6.5086626295520096e-06, "loss": 0.5432, "step": 9290 }, { "epoch": 4.203121465731735, "grad_norm": 0.6242907643318176, "learning_rate": 6.507964014845938e-06, "loss": 0.4895, "step": 9291 }, { "epoch": 4.203573852069668, "grad_norm": 0.7893347144126892, "learning_rate": 6.5072653677534635e-06, "loss": 0.4299, "step": 9292 }, { "epoch": 4.2040262384076, "grad_norm": 0.15292079746723175, "learning_rate": 6.506566688289592e-06, "loss": 1.0215, "step": 9293 }, { "epoch": 4.204478624745533, "grad_norm": 0.27795156836509705, "learning_rate": 6.505867976469329e-06, "loss": 0.6948, "step": 9294 }, { "epoch": 4.204931011083465, "grad_norm": 0.3481391668319702, "learning_rate": 6.50516923230768e-06, "loss": 0.6151, "step": 9295 }, { "epoch": 4.205383397421398, "grad_norm": 0.3065759539604187, "learning_rate": 6.504470455819651e-06, "loss": 0.5865, "step": 9296 }, { "epoch": 4.205835783759331, "grad_norm": 0.34425657987594604, "learning_rate": 6.503771647020251e-06, "loss": 0.6495, "step": 9297 }, { "epoch": 4.206288170097263, "grad_norm": 0.3587922155857086, "learning_rate": 6.503072805924488e-06, "loss": 0.6251, "step": 9298 }, { "epoch": 4.206740556435196, "grad_norm": 0.3793257474899292, "learning_rate": 6.502373932547371e-06, "loss": 0.6355, "step": 9299 }, { "epoch": 4.207192942773128, "grad_norm": 0.3807002305984497, "learning_rate": 6.5016750269039084e-06, "loss": 0.585, "step": 9300 }, { "epoch": 4.2076453291110605, "grad_norm": 0.439798504114151, "learning_rate": 6.5009760890091125e-06, "loss": 0.672, "step": 9301 }, { "epoch": 4.208097715448994, "grad_norm": 0.4094761908054352, "learning_rate": 6.5002771188779925e-06, "loss": 0.5505, "step": 9302 }, { "epoch": 4.208550101786926, "grad_norm": 0.42955684661865234, "learning_rate": 6.499578116525561e-06, "loss": 0.6624, "step": 9303 }, { "epoch": 4.209002488124859, "grad_norm": 0.3848553001880646, "learning_rate": 6.49887908196683e-06, "loss": 0.643, "step": 9304 }, { "epoch": 4.209454874462791, "grad_norm": 0.4204871654510498, "learning_rate": 6.498180015216814e-06, "loss": 0.6322, "step": 9305 }, { "epoch": 4.2099072608007235, "grad_norm": 0.3949516713619232, "learning_rate": 6.497480916290526e-06, "loss": 0.5628, "step": 9306 }, { "epoch": 4.210359647138656, "grad_norm": 0.43041256070137024, "learning_rate": 6.496781785202979e-06, "loss": 0.6185, "step": 9307 }, { "epoch": 4.210812033476589, "grad_norm": 0.40276944637298584, "learning_rate": 6.496082621969191e-06, "loss": 0.677, "step": 9308 }, { "epoch": 4.211264419814522, "grad_norm": 0.3981199264526367, "learning_rate": 6.495383426604174e-06, "loss": 0.6649, "step": 9309 }, { "epoch": 4.211716806152454, "grad_norm": 0.4644860029220581, "learning_rate": 6.494684199122948e-06, "loss": 0.6787, "step": 9310 }, { "epoch": 4.212169192490387, "grad_norm": 0.4028988182544708, "learning_rate": 6.493984939540529e-06, "loss": 0.5054, "step": 9311 }, { "epoch": 4.212621578828319, "grad_norm": 0.39348670840263367, "learning_rate": 6.493285647871934e-06, "loss": 0.546, "step": 9312 }, { "epoch": 4.213073965166252, "grad_norm": 0.5339503288269043, "learning_rate": 6.492586324132184e-06, "loss": 0.5326, "step": 9313 }, { "epoch": 4.213526351504185, "grad_norm": 0.5042878985404968, "learning_rate": 6.4918869683362964e-06, "loss": 0.6275, "step": 9314 }, { "epoch": 4.213978737842117, "grad_norm": 0.4165917634963989, "learning_rate": 6.491187580499292e-06, "loss": 0.5291, "step": 9315 }, { "epoch": 4.21443112418005, "grad_norm": 0.3598771095275879, "learning_rate": 6.4904881606361905e-06, "loss": 0.3467, "step": 9316 }, { "epoch": 4.214883510517982, "grad_norm": 0.3901095986366272, "learning_rate": 6.489788708762014e-06, "loss": 0.5567, "step": 9317 }, { "epoch": 4.215335896855915, "grad_norm": 0.4140357971191406, "learning_rate": 6.489089224891786e-06, "loss": 0.4852, "step": 9318 }, { "epoch": 4.215788283193848, "grad_norm": 0.4373982846736908, "learning_rate": 6.4883897090405265e-06, "loss": 0.4993, "step": 9319 }, { "epoch": 4.21624066953178, "grad_norm": 0.4344203472137451, "learning_rate": 6.48769016122326e-06, "loss": 0.5478, "step": 9320 }, { "epoch": 4.216693055869713, "grad_norm": 0.45195460319519043, "learning_rate": 6.486990581455011e-06, "loss": 0.5206, "step": 9321 }, { "epoch": 4.217145442207645, "grad_norm": 0.46055495738983154, "learning_rate": 6.486290969750805e-06, "loss": 0.5994, "step": 9322 }, { "epoch": 4.2175978285455775, "grad_norm": 0.5065718293190002, "learning_rate": 6.485591326125665e-06, "loss": 0.5356, "step": 9323 }, { "epoch": 4.218050214883511, "grad_norm": 0.48788467049598694, "learning_rate": 6.48489165059462e-06, "loss": 0.5614, "step": 9324 }, { "epoch": 4.218502601221443, "grad_norm": 0.4452280104160309, "learning_rate": 6.484191943172695e-06, "loss": 0.4595, "step": 9325 }, { "epoch": 4.218954987559376, "grad_norm": 0.4548358619213104, "learning_rate": 6.4834922038749194e-06, "loss": 0.5644, "step": 9326 }, { "epoch": 4.219407373897308, "grad_norm": 0.4327777028083801, "learning_rate": 6.482792432716319e-06, "loss": 0.4569, "step": 9327 }, { "epoch": 4.219859760235241, "grad_norm": 0.47565528750419617, "learning_rate": 6.482092629711925e-06, "loss": 0.5241, "step": 9328 }, { "epoch": 4.220312146573174, "grad_norm": 0.45295360684394836, "learning_rate": 6.4813927948767666e-06, "loss": 0.5244, "step": 9329 }, { "epoch": 4.220764532911106, "grad_norm": 0.5257610082626343, "learning_rate": 6.480692928225873e-06, "loss": 0.5468, "step": 9330 }, { "epoch": 4.221216919249039, "grad_norm": 0.48847952485084534, "learning_rate": 6.479993029774274e-06, "loss": 0.4792, "step": 9331 }, { "epoch": 4.221669305586971, "grad_norm": 0.4817238450050354, "learning_rate": 6.479293099537005e-06, "loss": 0.4634, "step": 9332 }, { "epoch": 4.222121691924904, "grad_norm": 0.5953919887542725, "learning_rate": 6.478593137529094e-06, "loss": 0.5768, "step": 9333 }, { "epoch": 4.222574078262836, "grad_norm": 0.5014532208442688, "learning_rate": 6.477893143765579e-06, "loss": 0.5732, "step": 9334 }, { "epoch": 4.223026464600769, "grad_norm": 0.5362128615379333, "learning_rate": 6.477193118261489e-06, "loss": 0.5225, "step": 9335 }, { "epoch": 4.223478850938702, "grad_norm": 0.5347446799278259, "learning_rate": 6.476493061031861e-06, "loss": 0.4798, "step": 9336 }, { "epoch": 4.223931237276634, "grad_norm": 0.5217540860176086, "learning_rate": 6.475792972091729e-06, "loss": 0.531, "step": 9337 }, { "epoch": 4.224383623614567, "grad_norm": 0.48443230986595154, "learning_rate": 6.4750928514561294e-06, "loss": 0.3945, "step": 9338 }, { "epoch": 4.224836009952499, "grad_norm": 0.6199737191200256, "learning_rate": 6.4743926991400985e-06, "loss": 0.5851, "step": 9339 }, { "epoch": 4.225288396290432, "grad_norm": 0.6394362449645996, "learning_rate": 6.4736925151586734e-06, "loss": 0.571, "step": 9340 }, { "epoch": 4.225740782628365, "grad_norm": 0.6135337352752686, "learning_rate": 6.472992299526892e-06, "loss": 0.5704, "step": 9341 }, { "epoch": 4.226193168966297, "grad_norm": 0.7530930638313293, "learning_rate": 6.472292052259792e-06, "loss": 0.5887, "step": 9342 }, { "epoch": 4.22664555530423, "grad_norm": 0.1635914146900177, "learning_rate": 6.471591773372413e-06, "loss": 1.048, "step": 9343 }, { "epoch": 4.227097941642162, "grad_norm": 0.27993324398994446, "learning_rate": 6.470891462879796e-06, "loss": 0.779, "step": 9344 }, { "epoch": 4.2275503279800954, "grad_norm": 0.3487805128097534, "learning_rate": 6.4701911207969795e-06, "loss": 0.6273, "step": 9345 }, { "epoch": 4.228002714318028, "grad_norm": 0.3023255467414856, "learning_rate": 6.469490747139006e-06, "loss": 0.6649, "step": 9346 }, { "epoch": 4.22845510065596, "grad_norm": 0.3245726227760315, "learning_rate": 6.4687903419209165e-06, "loss": 0.5787, "step": 9347 }, { "epoch": 4.228907486993893, "grad_norm": 0.31329837441444397, "learning_rate": 6.468089905157753e-06, "loss": 0.6125, "step": 9348 }, { "epoch": 4.229359873331825, "grad_norm": 0.31389233469963074, "learning_rate": 6.467389436864562e-06, "loss": 0.4676, "step": 9349 }, { "epoch": 4.229812259669758, "grad_norm": 0.3947190046310425, "learning_rate": 6.466688937056384e-06, "loss": 0.6186, "step": 9350 }, { "epoch": 4.230264646007691, "grad_norm": 0.3832712769508362, "learning_rate": 6.4659884057482644e-06, "loss": 0.5841, "step": 9351 }, { "epoch": 4.230717032345623, "grad_norm": 0.3740543723106384, "learning_rate": 6.465287842955249e-06, "loss": 0.5782, "step": 9352 }, { "epoch": 4.231169418683556, "grad_norm": 0.29586684703826904, "learning_rate": 6.464587248692384e-06, "loss": 0.3985, "step": 9353 }, { "epoch": 4.231621805021488, "grad_norm": 0.3643980324268341, "learning_rate": 6.463886622974715e-06, "loss": 0.544, "step": 9354 }, { "epoch": 4.232074191359421, "grad_norm": 0.42518436908721924, "learning_rate": 6.463185965817289e-06, "loss": 0.5945, "step": 9355 }, { "epoch": 4.232526577697354, "grad_norm": 0.3960179090499878, "learning_rate": 6.462485277235155e-06, "loss": 0.5711, "step": 9356 }, { "epoch": 4.232978964035286, "grad_norm": 0.4050392508506775, "learning_rate": 6.461784557243361e-06, "loss": 0.4099, "step": 9357 }, { "epoch": 4.233431350373219, "grad_norm": 0.39921531081199646, "learning_rate": 6.461083805856957e-06, "loss": 0.4967, "step": 9358 }, { "epoch": 4.233883736711151, "grad_norm": 0.41561925411224365, "learning_rate": 6.4603830230909925e-06, "loss": 0.5168, "step": 9359 }, { "epoch": 4.234336123049084, "grad_norm": 0.39184844493865967, "learning_rate": 6.4596822089605184e-06, "loss": 0.5175, "step": 9360 }, { "epoch": 4.234788509387016, "grad_norm": 0.3864988386631012, "learning_rate": 6.458981363480585e-06, "loss": 0.5358, "step": 9361 }, { "epoch": 4.235240895724949, "grad_norm": 0.35885387659072876, "learning_rate": 6.458280486666247e-06, "loss": 0.38, "step": 9362 }, { "epoch": 4.235693282062882, "grad_norm": 0.42401909828186035, "learning_rate": 6.457579578532553e-06, "loss": 0.4884, "step": 9363 }, { "epoch": 4.236145668400814, "grad_norm": 0.3961191773414612, "learning_rate": 6.45687863909456e-06, "loss": 0.4492, "step": 9364 }, { "epoch": 4.236598054738747, "grad_norm": 0.40308448672294617, "learning_rate": 6.456177668367321e-06, "loss": 0.5002, "step": 9365 }, { "epoch": 4.237050441076679, "grad_norm": 0.4162027835845947, "learning_rate": 6.455476666365889e-06, "loss": 0.493, "step": 9366 }, { "epoch": 4.2375028274146125, "grad_norm": 0.4352666139602661, "learning_rate": 6.4547756331053205e-06, "loss": 0.5378, "step": 9367 }, { "epoch": 4.237955213752545, "grad_norm": 0.3857254683971405, "learning_rate": 6.4540745686006715e-06, "loss": 0.4482, "step": 9368 }, { "epoch": 4.238407600090477, "grad_norm": 0.48064547777175903, "learning_rate": 6.4533734728669995e-06, "loss": 0.6057, "step": 9369 }, { "epoch": 4.23885998642841, "grad_norm": 0.41244596242904663, "learning_rate": 6.452672345919359e-06, "loss": 0.4214, "step": 9370 }, { "epoch": 4.239312372766342, "grad_norm": 0.4481479525566101, "learning_rate": 6.451971187772813e-06, "loss": 0.5109, "step": 9371 }, { "epoch": 4.239764759104275, "grad_norm": 0.48064613342285156, "learning_rate": 6.4512699984424165e-06, "loss": 0.5579, "step": 9372 }, { "epoch": 4.240217145442208, "grad_norm": 0.41006043553352356, "learning_rate": 6.450568777943229e-06, "loss": 0.4849, "step": 9373 }, { "epoch": 4.24066953178014, "grad_norm": 0.5241140127182007, "learning_rate": 6.449867526290313e-06, "loss": 0.6491, "step": 9374 }, { "epoch": 4.241121918118073, "grad_norm": 0.4884836673736572, "learning_rate": 6.449166243498727e-06, "loss": 0.5184, "step": 9375 }, { "epoch": 4.241574304456005, "grad_norm": 0.4274267554283142, "learning_rate": 6.448464929583532e-06, "loss": 0.5087, "step": 9376 }, { "epoch": 4.242026690793938, "grad_norm": 0.4925827980041504, "learning_rate": 6.447763584559792e-06, "loss": 0.5346, "step": 9377 }, { "epoch": 4.242479077131871, "grad_norm": 0.513483464717865, "learning_rate": 6.447062208442567e-06, "loss": 0.5462, "step": 9378 }, { "epoch": 4.242931463469803, "grad_norm": 0.4739459455013275, "learning_rate": 6.446360801246924e-06, "loss": 0.4742, "step": 9379 }, { "epoch": 4.243383849807736, "grad_norm": 0.46799299120903015, "learning_rate": 6.445659362987924e-06, "loss": 0.4241, "step": 9380 }, { "epoch": 4.243836236145668, "grad_norm": 0.4837290942668915, "learning_rate": 6.444957893680632e-06, "loss": 0.4905, "step": 9381 }, { "epoch": 4.244288622483601, "grad_norm": 0.5309267640113831, "learning_rate": 6.444256393340116e-06, "loss": 0.5453, "step": 9382 }, { "epoch": 4.244741008821533, "grad_norm": 0.4814933240413666, "learning_rate": 6.44355486198144e-06, "loss": 0.5065, "step": 9383 }, { "epoch": 4.2451933951594665, "grad_norm": 0.4830528497695923, "learning_rate": 6.442853299619671e-06, "loss": 0.4629, "step": 9384 }, { "epoch": 4.245645781497399, "grad_norm": 0.48826712369918823, "learning_rate": 6.442151706269877e-06, "loss": 0.4995, "step": 9385 }, { "epoch": 4.246098167835331, "grad_norm": 0.5254212617874146, "learning_rate": 6.441450081947125e-06, "loss": 0.5064, "step": 9386 }, { "epoch": 4.246550554173264, "grad_norm": 0.5094698071479797, "learning_rate": 6.440748426666483e-06, "loss": 0.3978, "step": 9387 }, { "epoch": 4.247002940511196, "grad_norm": 0.5543383359909058, "learning_rate": 6.440046740443022e-06, "loss": 0.4711, "step": 9388 }, { "epoch": 4.2474553268491295, "grad_norm": 0.5881347060203552, "learning_rate": 6.439345023291813e-06, "loss": 0.5272, "step": 9389 }, { "epoch": 4.247907713187062, "grad_norm": 0.5792377591133118, "learning_rate": 6.438643275227925e-06, "loss": 0.5032, "step": 9390 }, { "epoch": 4.248360099524994, "grad_norm": 0.5647850632667542, "learning_rate": 6.437941496266429e-06, "loss": 0.4651, "step": 9391 }, { "epoch": 4.248812485862927, "grad_norm": 0.7414628863334656, "learning_rate": 6.437239686422398e-06, "loss": 0.5752, "step": 9392 }, { "epoch": 4.249264872200859, "grad_norm": 0.17794516682624817, "learning_rate": 6.436537845710904e-06, "loss": 1.0998, "step": 9393 }, { "epoch": 4.249717258538793, "grad_norm": 0.2615014910697937, "learning_rate": 6.435835974147021e-06, "loss": 0.6324, "step": 9394 }, { "epoch": 4.250169644876725, "grad_norm": 0.26682576537132263, "learning_rate": 6.4351340717458236e-06, "loss": 0.6206, "step": 9395 }, { "epoch": 4.250622031214657, "grad_norm": 0.3222082555294037, "learning_rate": 6.434432138522385e-06, "loss": 0.583, "step": 9396 }, { "epoch": 4.25107441755259, "grad_norm": 0.3442196846008301, "learning_rate": 6.4337301744917826e-06, "loss": 0.5984, "step": 9397 }, { "epoch": 4.251526803890522, "grad_norm": 0.3589189946651459, "learning_rate": 6.43302817966909e-06, "loss": 0.4778, "step": 9398 }, { "epoch": 4.251979190228455, "grad_norm": 0.3809727728366852, "learning_rate": 6.432326154069385e-06, "loss": 0.6737, "step": 9399 }, { "epoch": 4.252431576566388, "grad_norm": 0.3311525583267212, "learning_rate": 6.431624097707745e-06, "loss": 0.5141, "step": 9400 }, { "epoch": 4.252431576566388, "eval_loss": 0.590677797794342, "eval_runtime": 25.6767, "eval_samples_per_second": 28.976, "eval_steps_per_second": 7.244, "step": 9400 }, { "epoch": 4.2528839629043205, "grad_norm": 0.3879922926425934, "learning_rate": 6.430922010599248e-06, "loss": 0.5822, "step": 9401 }, { "epoch": 4.253336349242253, "grad_norm": 0.3480162024497986, "learning_rate": 6.430219892758973e-06, "loss": 0.5235, "step": 9402 }, { "epoch": 4.253788735580185, "grad_norm": 0.3758719861507416, "learning_rate": 6.429517744201999e-06, "loss": 0.6609, "step": 9403 }, { "epoch": 4.254241121918118, "grad_norm": 0.35254207253456116, "learning_rate": 6.4288155649434044e-06, "loss": 0.5627, "step": 9404 }, { "epoch": 4.254693508256051, "grad_norm": 0.36780375242233276, "learning_rate": 6.4281133549982735e-06, "loss": 0.5642, "step": 9405 }, { "epoch": 4.2551458945939835, "grad_norm": 0.3738195598125458, "learning_rate": 6.427411114381684e-06, "loss": 0.5636, "step": 9406 }, { "epoch": 4.255598280931916, "grad_norm": 0.3821106255054474, "learning_rate": 6.42670884310872e-06, "loss": 0.5544, "step": 9407 }, { "epoch": 4.256050667269848, "grad_norm": 0.4125317633152008, "learning_rate": 6.426006541194464e-06, "loss": 0.4621, "step": 9408 }, { "epoch": 4.256503053607781, "grad_norm": 0.4406026303768158, "learning_rate": 6.425304208653998e-06, "loss": 0.5683, "step": 9409 }, { "epoch": 4.256955439945713, "grad_norm": 0.3466985523700714, "learning_rate": 6.424601845502406e-06, "loss": 0.4627, "step": 9410 }, { "epoch": 4.257407826283647, "grad_norm": 0.37722688913345337, "learning_rate": 6.4238994517547735e-06, "loss": 0.496, "step": 9411 }, { "epoch": 4.257860212621579, "grad_norm": 0.4155440330505371, "learning_rate": 6.4231970274261856e-06, "loss": 0.5712, "step": 9412 }, { "epoch": 4.258312598959511, "grad_norm": 0.41080421209335327, "learning_rate": 6.422494572531728e-06, "loss": 0.5186, "step": 9413 }, { "epoch": 4.258764985297444, "grad_norm": 0.4664965867996216, "learning_rate": 6.421792087086487e-06, "loss": 0.6243, "step": 9414 }, { "epoch": 4.259217371635376, "grad_norm": 0.41265636682510376, "learning_rate": 6.421089571105551e-06, "loss": 0.5296, "step": 9415 }, { "epoch": 4.25966975797331, "grad_norm": 0.4473910629749298, "learning_rate": 6.420387024604006e-06, "loss": 0.6065, "step": 9416 }, { "epoch": 4.260122144311242, "grad_norm": 0.4180319607257843, "learning_rate": 6.419684447596942e-06, "loss": 0.5538, "step": 9417 }, { "epoch": 4.2605745306491745, "grad_norm": 0.3847956359386444, "learning_rate": 6.418981840099448e-06, "loss": 0.4839, "step": 9418 }, { "epoch": 4.261026916987107, "grad_norm": 0.39798712730407715, "learning_rate": 6.418279202126613e-06, "loss": 0.4579, "step": 9419 }, { "epoch": 4.261479303325039, "grad_norm": 0.43870317935943604, "learning_rate": 6.417576533693529e-06, "loss": 0.529, "step": 9420 }, { "epoch": 4.261931689662973, "grad_norm": 0.44168439507484436, "learning_rate": 6.416873834815286e-06, "loss": 0.6141, "step": 9421 }, { "epoch": 4.262384076000905, "grad_norm": 0.4506489336490631, "learning_rate": 6.416171105506974e-06, "loss": 0.5328, "step": 9422 }, { "epoch": 4.2628364623388375, "grad_norm": 0.4989471733570099, "learning_rate": 6.41546834578369e-06, "loss": 0.5678, "step": 9423 }, { "epoch": 4.26328884867677, "grad_norm": 0.43657007813453674, "learning_rate": 6.414765555660523e-06, "loss": 0.5368, "step": 9424 }, { "epoch": 4.263741235014702, "grad_norm": 0.4281748831272125, "learning_rate": 6.414062735152569e-06, "loss": 0.4984, "step": 9425 }, { "epoch": 4.264193621352635, "grad_norm": 0.5281638503074646, "learning_rate": 6.4133598842749215e-06, "loss": 0.5696, "step": 9426 }, { "epoch": 4.264646007690568, "grad_norm": 0.4501369595527649, "learning_rate": 6.412657003042675e-06, "loss": 0.5313, "step": 9427 }, { "epoch": 4.2650983940285006, "grad_norm": 0.5080687999725342, "learning_rate": 6.411954091470927e-06, "loss": 0.5351, "step": 9428 }, { "epoch": 4.265550780366433, "grad_norm": 0.5115650296211243, "learning_rate": 6.411251149574771e-06, "loss": 0.6489, "step": 9429 }, { "epoch": 4.266003166704365, "grad_norm": 0.4990519881248474, "learning_rate": 6.410548177369308e-06, "loss": 0.5015, "step": 9430 }, { "epoch": 4.266455553042298, "grad_norm": 0.512000322341919, "learning_rate": 6.409845174869633e-06, "loss": 0.5334, "step": 9431 }, { "epoch": 4.26690793938023, "grad_norm": 0.4820273220539093, "learning_rate": 6.409142142090845e-06, "loss": 0.4923, "step": 9432 }, { "epoch": 4.267360325718164, "grad_norm": 0.5441705584526062, "learning_rate": 6.408439079048042e-06, "loss": 0.5139, "step": 9433 }, { "epoch": 4.267812712056096, "grad_norm": 0.43915989995002747, "learning_rate": 6.407735985756325e-06, "loss": 0.4246, "step": 9434 }, { "epoch": 4.2682650983940285, "grad_norm": 0.4954736530780792, "learning_rate": 6.407032862230793e-06, "loss": 0.4566, "step": 9435 }, { "epoch": 4.268717484731961, "grad_norm": 0.5393627882003784, "learning_rate": 6.406329708486548e-06, "loss": 0.4929, "step": 9436 }, { "epoch": 4.269169871069893, "grad_norm": 0.6403639316558838, "learning_rate": 6.405626524538692e-06, "loss": 0.5962, "step": 9437 }, { "epoch": 4.269622257407827, "grad_norm": 0.515656590461731, "learning_rate": 6.404923310402326e-06, "loss": 0.4609, "step": 9438 }, { "epoch": 4.270074643745759, "grad_norm": 0.6072582602500916, "learning_rate": 6.404220066092552e-06, "loss": 0.5061, "step": 9439 }, { "epoch": 4.2705270300836915, "grad_norm": 0.589463472366333, "learning_rate": 6.403516791624477e-06, "loss": 0.5494, "step": 9440 }, { "epoch": 4.270979416421624, "grad_norm": 0.5676060914993286, "learning_rate": 6.4028134870132e-06, "loss": 0.4528, "step": 9441 }, { "epoch": 4.271431802759556, "grad_norm": 0.679216206073761, "learning_rate": 6.402110152273833e-06, "loss": 0.5237, "step": 9442 }, { "epoch": 4.27188418909749, "grad_norm": 0.16828308999538422, "learning_rate": 6.401406787421474e-06, "loss": 0.9847, "step": 9443 }, { "epoch": 4.272336575435422, "grad_norm": 0.27996155619621277, "learning_rate": 6.400703392471232e-06, "loss": 0.6968, "step": 9444 }, { "epoch": 4.2727889617733545, "grad_norm": 0.3196926712989807, "learning_rate": 6.399999967438216e-06, "loss": 0.5703, "step": 9445 }, { "epoch": 4.273241348111287, "grad_norm": 0.32774192094802856, "learning_rate": 6.399296512337531e-06, "loss": 0.6088, "step": 9446 }, { "epoch": 4.273693734449219, "grad_norm": 0.28679516911506653, "learning_rate": 6.398593027184284e-06, "loss": 0.513, "step": 9447 }, { "epoch": 4.274146120787152, "grad_norm": 0.3232564926147461, "learning_rate": 6.3978895119935865e-06, "loss": 0.643, "step": 9448 }, { "epoch": 4.274598507125085, "grad_norm": 0.30647867918014526, "learning_rate": 6.3971859667805456e-06, "loss": 0.527, "step": 9449 }, { "epoch": 4.275050893463018, "grad_norm": 0.3724159002304077, "learning_rate": 6.396482391560272e-06, "loss": 0.5968, "step": 9450 }, { "epoch": 4.27550327980095, "grad_norm": 0.40064525604248047, "learning_rate": 6.395778786347878e-06, "loss": 0.6027, "step": 9451 }, { "epoch": 4.2759556661388824, "grad_norm": 0.3690699636936188, "learning_rate": 6.3950751511584715e-06, "loss": 0.6179, "step": 9452 }, { "epoch": 4.276408052476815, "grad_norm": 0.39614593982696533, "learning_rate": 6.3943714860071695e-06, "loss": 0.7397, "step": 9453 }, { "epoch": 4.276860438814748, "grad_norm": 0.3978283405303955, "learning_rate": 6.393667790909079e-06, "loss": 0.5781, "step": 9454 }, { "epoch": 4.277312825152681, "grad_norm": 0.38431915640830994, "learning_rate": 6.392964065879314e-06, "loss": 0.6361, "step": 9455 }, { "epoch": 4.277765211490613, "grad_norm": 0.38779088854789734, "learning_rate": 6.392260310932993e-06, "loss": 0.6484, "step": 9456 }, { "epoch": 4.2782175978285455, "grad_norm": 0.4409182369709015, "learning_rate": 6.391556526085225e-06, "loss": 0.5184, "step": 9457 }, { "epoch": 4.278669984166478, "grad_norm": 0.3719801604747772, "learning_rate": 6.390852711351128e-06, "loss": 0.4922, "step": 9458 }, { "epoch": 4.27912237050441, "grad_norm": 0.4281558394432068, "learning_rate": 6.390148866745818e-06, "loss": 0.5297, "step": 9459 }, { "epoch": 4.279574756842344, "grad_norm": 0.3707234561443329, "learning_rate": 6.389444992284409e-06, "loss": 0.4354, "step": 9460 }, { "epoch": 4.280027143180276, "grad_norm": 0.366910845041275, "learning_rate": 6.388741087982021e-06, "loss": 0.5184, "step": 9461 }, { "epoch": 4.2804795295182085, "grad_norm": 0.39085444808006287, "learning_rate": 6.38803715385377e-06, "loss": 0.6028, "step": 9462 }, { "epoch": 4.280931915856141, "grad_norm": 0.4335061013698578, "learning_rate": 6.387333189914775e-06, "loss": 0.6048, "step": 9463 }, { "epoch": 4.281384302194073, "grad_norm": 0.42619845271110535, "learning_rate": 6.386629196180155e-06, "loss": 0.4323, "step": 9464 }, { "epoch": 4.281836688532007, "grad_norm": 0.43683722615242004, "learning_rate": 6.385925172665029e-06, "loss": 0.5441, "step": 9465 }, { "epoch": 4.282289074869939, "grad_norm": 0.39671167731285095, "learning_rate": 6.385221119384517e-06, "loss": 0.5175, "step": 9466 }, { "epoch": 4.282741461207872, "grad_norm": 0.4346619248390198, "learning_rate": 6.3845170363537404e-06, "loss": 0.4643, "step": 9467 }, { "epoch": 4.283193847545804, "grad_norm": 0.4925006628036499, "learning_rate": 6.38381292358782e-06, "loss": 0.5767, "step": 9468 }, { "epoch": 4.283646233883736, "grad_norm": 0.41179490089416504, "learning_rate": 6.38310878110188e-06, "loss": 0.501, "step": 9469 }, { "epoch": 4.28409862022167, "grad_norm": 0.41052132844924927, "learning_rate": 6.382404608911041e-06, "loss": 0.5105, "step": 9470 }, { "epoch": 4.284551006559602, "grad_norm": 0.48512348532676697, "learning_rate": 6.381700407030428e-06, "loss": 0.6325, "step": 9471 }, { "epoch": 4.285003392897535, "grad_norm": 0.5004212260246277, "learning_rate": 6.380996175475164e-06, "loss": 0.5464, "step": 9472 }, { "epoch": 4.285455779235467, "grad_norm": 0.46014606952667236, "learning_rate": 6.3802919142603735e-06, "loss": 0.6143, "step": 9473 }, { "epoch": 4.2859081655733995, "grad_norm": 0.531721830368042, "learning_rate": 6.379587623401183e-06, "loss": 0.6138, "step": 9474 }, { "epoch": 4.286360551911332, "grad_norm": 0.46753424406051636, "learning_rate": 6.3788833029127185e-06, "loss": 0.4616, "step": 9475 }, { "epoch": 4.286812938249265, "grad_norm": 0.4567864239215851, "learning_rate": 6.378178952810106e-06, "loss": 0.4905, "step": 9476 }, { "epoch": 4.287265324587198, "grad_norm": 0.5077362060546875, "learning_rate": 6.377474573108473e-06, "loss": 0.5252, "step": 9477 }, { "epoch": 4.28771771092513, "grad_norm": 0.45201417803764343, "learning_rate": 6.376770163822946e-06, "loss": 0.5246, "step": 9478 }, { "epoch": 4.2881700972630625, "grad_norm": 0.5146055817604065, "learning_rate": 6.3760657249686554e-06, "loss": 0.5524, "step": 9479 }, { "epoch": 4.288622483600995, "grad_norm": 0.4845193326473236, "learning_rate": 6.375361256560731e-06, "loss": 0.5068, "step": 9480 }, { "epoch": 4.289074869938927, "grad_norm": 0.4560285210609436, "learning_rate": 6.374656758614299e-06, "loss": 0.4415, "step": 9481 }, { "epoch": 4.289527256276861, "grad_norm": 0.5068943500518799, "learning_rate": 6.373952231144493e-06, "loss": 0.5818, "step": 9482 }, { "epoch": 4.289979642614793, "grad_norm": 0.502072811126709, "learning_rate": 6.373247674166444e-06, "loss": 0.5149, "step": 9483 }, { "epoch": 4.290432028952726, "grad_norm": 0.4768492579460144, "learning_rate": 6.3725430876952824e-06, "loss": 0.605, "step": 9484 }, { "epoch": 4.290884415290658, "grad_norm": 0.5243869423866272, "learning_rate": 6.371838471746141e-06, "loss": 0.493, "step": 9485 }, { "epoch": 4.29133680162859, "grad_norm": 0.5030243992805481, "learning_rate": 6.371133826334152e-06, "loss": 0.4872, "step": 9486 }, { "epoch": 4.291789187966524, "grad_norm": 0.5233740210533142, "learning_rate": 6.370429151474453e-06, "loss": 0.5647, "step": 9487 }, { "epoch": 4.292241574304456, "grad_norm": 0.4915124475955963, "learning_rate": 6.369724447182173e-06, "loss": 0.4353, "step": 9488 }, { "epoch": 4.292693960642389, "grad_norm": 0.5875189900398254, "learning_rate": 6.369019713472449e-06, "loss": 0.5703, "step": 9489 }, { "epoch": 4.293146346980321, "grad_norm": 0.5325825810432434, "learning_rate": 6.368314950360416e-06, "loss": 0.4632, "step": 9490 }, { "epoch": 4.2935987333182535, "grad_norm": 0.5978353023529053, "learning_rate": 6.367610157861211e-06, "loss": 0.5726, "step": 9491 }, { "epoch": 4.294051119656187, "grad_norm": 0.6559750437736511, "learning_rate": 6.36690533598997e-06, "loss": 0.4805, "step": 9492 }, { "epoch": 4.294503505994119, "grad_norm": 0.19027405977249146, "learning_rate": 6.3662004847618305e-06, "loss": 1.2, "step": 9493 }, { "epoch": 4.294955892332052, "grad_norm": 0.27732372283935547, "learning_rate": 6.365495604191932e-06, "loss": 0.6459, "step": 9494 }, { "epoch": 4.295408278669984, "grad_norm": 0.30353328585624695, "learning_rate": 6.364790694295411e-06, "loss": 0.5581, "step": 9495 }, { "epoch": 4.2958606650079165, "grad_norm": 0.30438604950904846, "learning_rate": 6.364085755087408e-06, "loss": 0.5432, "step": 9496 }, { "epoch": 4.296313051345849, "grad_norm": 0.31991544365882874, "learning_rate": 6.363380786583062e-06, "loss": 0.668, "step": 9497 }, { "epoch": 4.296765437683782, "grad_norm": 0.32935354113578796, "learning_rate": 6.362675788797516e-06, "loss": 0.6068, "step": 9498 }, { "epoch": 4.297217824021715, "grad_norm": 0.3432694673538208, "learning_rate": 6.361970761745908e-06, "loss": 0.6171, "step": 9499 }, { "epoch": 4.297670210359647, "grad_norm": 0.32451966404914856, "learning_rate": 6.361265705443381e-06, "loss": 0.4326, "step": 9500 }, { "epoch": 4.29812259669758, "grad_norm": 0.3626355826854706, "learning_rate": 6.3605606199050785e-06, "loss": 0.6334, "step": 9501 }, { "epoch": 4.298574983035512, "grad_norm": 0.35539209842681885, "learning_rate": 6.359855505146142e-06, "loss": 0.6339, "step": 9502 }, { "epoch": 4.299027369373445, "grad_norm": 0.347305566072464, "learning_rate": 6.3591503611817155e-06, "loss": 0.5448, "step": 9503 }, { "epoch": 4.299479755711378, "grad_norm": 0.3999626040458679, "learning_rate": 6.358445188026944e-06, "loss": 0.6012, "step": 9504 }, { "epoch": 4.29993214204931, "grad_norm": 0.41051554679870605, "learning_rate": 6.3577399856969715e-06, "loss": 0.5931, "step": 9505 }, { "epoch": 4.300384528387243, "grad_norm": 0.4002001881599426, "learning_rate": 6.3570347542069445e-06, "loss": 0.6458, "step": 9506 }, { "epoch": 4.300836914725175, "grad_norm": 0.41110649704933167, "learning_rate": 6.356329493572009e-06, "loss": 0.6033, "step": 9507 }, { "epoch": 4.3012893010631075, "grad_norm": 0.4032600224018097, "learning_rate": 6.355624203807311e-06, "loss": 0.581, "step": 9508 }, { "epoch": 4.301741687401041, "grad_norm": 0.4055093824863434, "learning_rate": 6.354918884927999e-06, "loss": 0.6028, "step": 9509 }, { "epoch": 4.302194073738973, "grad_norm": 0.4183690547943115, "learning_rate": 6.354213536949222e-06, "loss": 0.6972, "step": 9510 }, { "epoch": 4.302646460076906, "grad_norm": 0.37992411851882935, "learning_rate": 6.353508159886126e-06, "loss": 0.5239, "step": 9511 }, { "epoch": 4.303098846414838, "grad_norm": 0.3640613853931427, "learning_rate": 6.3528027537538616e-06, "loss": 0.4818, "step": 9512 }, { "epoch": 4.3035512327527705, "grad_norm": 0.46169784665107727, "learning_rate": 6.352097318567579e-06, "loss": 0.8323, "step": 9513 }, { "epoch": 4.304003619090704, "grad_norm": 0.3716890811920166, "learning_rate": 6.351391854342428e-06, "loss": 0.5122, "step": 9514 }, { "epoch": 4.304456005428636, "grad_norm": 0.4531923234462738, "learning_rate": 6.350686361093561e-06, "loss": 0.6329, "step": 9515 }, { "epoch": 4.304908391766569, "grad_norm": 0.4867245554924011, "learning_rate": 6.3499808388361294e-06, "loss": 0.6313, "step": 9516 }, { "epoch": 4.305360778104501, "grad_norm": 0.4387843608856201, "learning_rate": 6.349275287585285e-06, "loss": 0.7229, "step": 9517 }, { "epoch": 4.305813164442434, "grad_norm": 0.4490761160850525, "learning_rate": 6.348569707356182e-06, "loss": 0.6179, "step": 9518 }, { "epoch": 4.306265550780367, "grad_norm": 0.418201744556427, "learning_rate": 6.347864098163972e-06, "loss": 0.5158, "step": 9519 }, { "epoch": 4.306717937118299, "grad_norm": 0.4556402266025543, "learning_rate": 6.34715846002381e-06, "loss": 0.6134, "step": 9520 }, { "epoch": 4.307170323456232, "grad_norm": 0.4130723476409912, "learning_rate": 6.346452792950855e-06, "loss": 0.4685, "step": 9521 }, { "epoch": 4.307622709794164, "grad_norm": 0.4561479389667511, "learning_rate": 6.3457470969602565e-06, "loss": 0.6057, "step": 9522 }, { "epoch": 4.308075096132097, "grad_norm": 0.4773060977458954, "learning_rate": 6.345041372067173e-06, "loss": 0.5501, "step": 9523 }, { "epoch": 4.308527482470029, "grad_norm": 0.510735034942627, "learning_rate": 6.344335618286762e-06, "loss": 0.6267, "step": 9524 }, { "epoch": 4.308979868807962, "grad_norm": 0.4337863028049469, "learning_rate": 6.343629835634181e-06, "loss": 0.5076, "step": 9525 }, { "epoch": 4.309432255145895, "grad_norm": 0.4999764859676361, "learning_rate": 6.342924024124587e-06, "loss": 0.5561, "step": 9526 }, { "epoch": 4.309884641483827, "grad_norm": 0.4762069582939148, "learning_rate": 6.342218183773139e-06, "loss": 0.491, "step": 9527 }, { "epoch": 4.31033702782176, "grad_norm": 0.4355912208557129, "learning_rate": 6.341512314594997e-06, "loss": 0.5171, "step": 9528 }, { "epoch": 4.310789414159692, "grad_norm": 0.5108531713485718, "learning_rate": 6.340806416605321e-06, "loss": 0.6151, "step": 9529 }, { "epoch": 4.3112418004976245, "grad_norm": 0.5373982787132263, "learning_rate": 6.340100489819269e-06, "loss": 0.6966, "step": 9530 }, { "epoch": 4.311694186835558, "grad_norm": 0.5157392621040344, "learning_rate": 6.339394534252005e-06, "loss": 0.5799, "step": 9531 }, { "epoch": 4.31214657317349, "grad_norm": 0.5259252786636353, "learning_rate": 6.33868854991869e-06, "loss": 0.5568, "step": 9532 }, { "epoch": 4.312598959511423, "grad_norm": 0.5057282447814941, "learning_rate": 6.337982536834486e-06, "loss": 0.5041, "step": 9533 }, { "epoch": 4.313051345849355, "grad_norm": 0.4944709241390228, "learning_rate": 6.337276495014555e-06, "loss": 0.4086, "step": 9534 }, { "epoch": 4.3135037321872876, "grad_norm": 0.4323752522468567, "learning_rate": 6.336570424474062e-06, "loss": 0.3539, "step": 9535 }, { "epoch": 4.313956118525221, "grad_norm": 0.4804614782333374, "learning_rate": 6.335864325228172e-06, "loss": 0.412, "step": 9536 }, { "epoch": 4.314408504863153, "grad_norm": 0.718823254108429, "learning_rate": 6.335158197292047e-06, "loss": 0.6784, "step": 9537 }, { "epoch": 4.314860891201086, "grad_norm": 0.6140152812004089, "learning_rate": 6.334452040680854e-06, "loss": 0.5935, "step": 9538 }, { "epoch": 4.315313277539018, "grad_norm": 0.6690399050712585, "learning_rate": 6.33374585540976e-06, "loss": 0.6292, "step": 9539 }, { "epoch": 4.315765663876951, "grad_norm": 0.5646594762802124, "learning_rate": 6.333039641493929e-06, "loss": 0.4766, "step": 9540 }, { "epoch": 4.316218050214884, "grad_norm": 0.5626845359802246, "learning_rate": 6.332333398948533e-06, "loss": 0.4618, "step": 9541 }, { "epoch": 4.316670436552816, "grad_norm": 0.6548497080802917, "learning_rate": 6.331627127788735e-06, "loss": 0.4969, "step": 9542 }, { "epoch": 4.317122822890749, "grad_norm": 0.19176962971687317, "learning_rate": 6.330920828029707e-06, "loss": 0.9911, "step": 9543 }, { "epoch": 4.317575209228681, "grad_norm": 0.27091339230537415, "learning_rate": 6.330214499686615e-06, "loss": 0.7229, "step": 9544 }, { "epoch": 4.318027595566614, "grad_norm": 0.3467707633972168, "learning_rate": 6.329508142774632e-06, "loss": 0.6802, "step": 9545 }, { "epoch": 4.318479981904546, "grad_norm": 0.325131893157959, "learning_rate": 6.328801757308925e-06, "loss": 0.643, "step": 9546 }, { "epoch": 4.318932368242479, "grad_norm": 0.3233196437358856, "learning_rate": 6.328095343304669e-06, "loss": 0.6956, "step": 9547 }, { "epoch": 4.319384754580412, "grad_norm": 0.32218867540359497, "learning_rate": 6.327388900777031e-06, "loss": 0.622, "step": 9548 }, { "epoch": 4.319837140918344, "grad_norm": 0.3258589804172516, "learning_rate": 6.326682429741187e-06, "loss": 0.5652, "step": 9549 }, { "epoch": 4.320289527256277, "grad_norm": 0.2942110300064087, "learning_rate": 6.325975930212308e-06, "loss": 0.4854, "step": 9550 }, { "epoch": 4.320741913594209, "grad_norm": 0.3594760298728943, "learning_rate": 6.325269402205568e-06, "loss": 0.6912, "step": 9551 }, { "epoch": 4.321194299932142, "grad_norm": 0.36236995458602905, "learning_rate": 6.324562845736142e-06, "loss": 0.5051, "step": 9552 }, { "epoch": 4.321646686270075, "grad_norm": 0.42704373598098755, "learning_rate": 6.323856260819202e-06, "loss": 0.6136, "step": 9553 }, { "epoch": 4.322099072608007, "grad_norm": 0.37870267033576965, "learning_rate": 6.323149647469925e-06, "loss": 0.5191, "step": 9554 }, { "epoch": 4.32255145894594, "grad_norm": 0.3969556987285614, "learning_rate": 6.322443005703486e-06, "loss": 0.5442, "step": 9555 }, { "epoch": 4.323003845283872, "grad_norm": 0.3645499050617218, "learning_rate": 6.321736335535062e-06, "loss": 0.5413, "step": 9556 }, { "epoch": 4.323456231621805, "grad_norm": 0.41993388533592224, "learning_rate": 6.321029636979832e-06, "loss": 0.5927, "step": 9557 }, { "epoch": 4.323908617959738, "grad_norm": 0.34963542222976685, "learning_rate": 6.32032291005297e-06, "loss": 0.485, "step": 9558 }, { "epoch": 4.32436100429767, "grad_norm": 0.38389208912849426, "learning_rate": 6.319616154769657e-06, "loss": 0.5701, "step": 9559 }, { "epoch": 4.324813390635603, "grad_norm": 0.3847079277038574, "learning_rate": 6.31890937114507e-06, "loss": 0.5021, "step": 9560 }, { "epoch": 4.325265776973535, "grad_norm": 0.4090445041656494, "learning_rate": 6.318202559194391e-06, "loss": 0.5882, "step": 9561 }, { "epoch": 4.325718163311468, "grad_norm": 0.41982460021972656, "learning_rate": 6.3174957189327976e-06, "loss": 0.524, "step": 9562 }, { "epoch": 4.326170549649401, "grad_norm": 0.38852328062057495, "learning_rate": 6.316788850375471e-06, "loss": 0.4542, "step": 9563 }, { "epoch": 4.326622935987333, "grad_norm": 0.38403213024139404, "learning_rate": 6.316081953537596e-06, "loss": 0.5811, "step": 9564 }, { "epoch": 4.327075322325266, "grad_norm": 0.4136190712451935, "learning_rate": 6.315375028434351e-06, "loss": 0.5023, "step": 9565 }, { "epoch": 4.327527708663198, "grad_norm": 0.43211081624031067, "learning_rate": 6.314668075080919e-06, "loss": 0.487, "step": 9566 }, { "epoch": 4.327980095001131, "grad_norm": 0.5021094083786011, "learning_rate": 6.313961093492482e-06, "loss": 0.6397, "step": 9567 }, { "epoch": 4.328432481339064, "grad_norm": 0.41391798853874207, "learning_rate": 6.313254083684226e-06, "loss": 0.4521, "step": 9568 }, { "epoch": 4.328884867676996, "grad_norm": 0.4562036693096161, "learning_rate": 6.312547045671336e-06, "loss": 0.5537, "step": 9569 }, { "epoch": 4.329337254014929, "grad_norm": 0.4331320524215698, "learning_rate": 6.311839979468994e-06, "loss": 0.6067, "step": 9570 }, { "epoch": 4.329789640352861, "grad_norm": 0.4179893434047699, "learning_rate": 6.311132885092388e-06, "loss": 0.4439, "step": 9571 }, { "epoch": 4.330242026690794, "grad_norm": 0.44743409752845764, "learning_rate": 6.310425762556704e-06, "loss": 0.5465, "step": 9572 }, { "epoch": 4.330694413028726, "grad_norm": 0.4374428987503052, "learning_rate": 6.309718611877127e-06, "loss": 0.4583, "step": 9573 }, { "epoch": 4.3311467993666595, "grad_norm": 0.4260733425617218, "learning_rate": 6.309011433068847e-06, "loss": 0.4741, "step": 9574 }, { "epoch": 4.331599185704592, "grad_norm": 0.5087081789970398, "learning_rate": 6.308304226147051e-06, "loss": 0.6357, "step": 9575 }, { "epoch": 4.332051572042524, "grad_norm": 0.4785284996032715, "learning_rate": 6.307596991126926e-06, "loss": 0.5028, "step": 9576 }, { "epoch": 4.332503958380457, "grad_norm": 0.5150300860404968, "learning_rate": 6.306889728023663e-06, "loss": 0.5345, "step": 9577 }, { "epoch": 4.332956344718389, "grad_norm": 0.4417511522769928, "learning_rate": 6.3061824368524515e-06, "loss": 0.4514, "step": 9578 }, { "epoch": 4.333408731056322, "grad_norm": 0.39970341324806213, "learning_rate": 6.305475117628481e-06, "loss": 0.3893, "step": 9579 }, { "epoch": 4.333861117394255, "grad_norm": 0.5062258243560791, "learning_rate": 6.304767770366944e-06, "loss": 0.5405, "step": 9580 }, { "epoch": 4.334313503732187, "grad_norm": 0.520774781703949, "learning_rate": 6.304060395083031e-06, "loss": 0.554, "step": 9581 }, { "epoch": 4.33476589007012, "grad_norm": 0.5778136253356934, "learning_rate": 6.303352991791936e-06, "loss": 0.6068, "step": 9582 }, { "epoch": 4.335218276408052, "grad_norm": 0.49525585770606995, "learning_rate": 6.302645560508849e-06, "loss": 0.4796, "step": 9583 }, { "epoch": 4.335670662745985, "grad_norm": 0.5812645554542542, "learning_rate": 6.301938101248966e-06, "loss": 0.5365, "step": 9584 }, { "epoch": 4.336123049083918, "grad_norm": 0.5080732107162476, "learning_rate": 6.30123061402748e-06, "loss": 0.4483, "step": 9585 }, { "epoch": 4.33657543542185, "grad_norm": 0.6259636878967285, "learning_rate": 6.300523098859586e-06, "loss": 0.4033, "step": 9586 }, { "epoch": 4.337027821759783, "grad_norm": 0.5579864978790283, "learning_rate": 6.299815555760478e-06, "loss": 0.5844, "step": 9587 }, { "epoch": 4.337480208097715, "grad_norm": 0.7331888675689697, "learning_rate": 6.2991079847453514e-06, "loss": 0.6988, "step": 9588 }, { "epoch": 4.337932594435648, "grad_norm": 0.5646861791610718, "learning_rate": 6.298400385829406e-06, "loss": 0.4783, "step": 9589 }, { "epoch": 4.338384980773581, "grad_norm": 0.6087921261787415, "learning_rate": 6.297692759027836e-06, "loss": 0.4632, "step": 9590 }, { "epoch": 4.3388373671115135, "grad_norm": 0.6353251934051514, "learning_rate": 6.29698510435584e-06, "loss": 0.6138, "step": 9591 }, { "epoch": 4.339289753449446, "grad_norm": 0.7610289454460144, "learning_rate": 6.2962774218286145e-06, "loss": 0.5657, "step": 9592 }, { "epoch": 4.339742139787378, "grad_norm": 0.14982011914253235, "learning_rate": 6.295569711461362e-06, "loss": 1.1473, "step": 9593 }, { "epoch": 4.340194526125311, "grad_norm": 0.19349117577075958, "learning_rate": 6.294861973269278e-06, "loss": 0.9989, "step": 9594 }, { "epoch": 4.340646912463244, "grad_norm": 0.3146960437297821, "learning_rate": 6.294154207267567e-06, "loss": 0.6954, "step": 9595 }, { "epoch": 4.3410992988011765, "grad_norm": 0.326325923204422, "learning_rate": 6.293446413471425e-06, "loss": 0.597, "step": 9596 }, { "epoch": 4.341551685139109, "grad_norm": 0.2944960594177246, "learning_rate": 6.292738591896055e-06, "loss": 0.4459, "step": 9597 }, { "epoch": 4.342004071477041, "grad_norm": 0.30525654554367065, "learning_rate": 6.292030742556661e-06, "loss": 0.5145, "step": 9598 }, { "epoch": 4.342456457814974, "grad_norm": 0.3273196220397949, "learning_rate": 6.291322865468441e-06, "loss": 0.5745, "step": 9599 }, { "epoch": 4.342908844152906, "grad_norm": 0.30909937620162964, "learning_rate": 6.290614960646602e-06, "loss": 0.4629, "step": 9600 }, { "epoch": 4.342908844152906, "eval_loss": 0.5903434157371521, "eval_runtime": 25.1006, "eval_samples_per_second": 29.641, "eval_steps_per_second": 7.41, "step": 9600 }, { "epoch": 4.3433612304908396, "grad_norm": 0.3185810446739197, "learning_rate": 6.289907028106344e-06, "loss": 0.4829, "step": 9601 }, { "epoch": 4.343813616828772, "grad_norm": 0.3353137969970703, "learning_rate": 6.289199067862874e-06, "loss": 0.5986, "step": 9602 }, { "epoch": 4.344266003166704, "grad_norm": 0.3316313326358795, "learning_rate": 6.288491079931397e-06, "loss": 0.5087, "step": 9603 }, { "epoch": 4.344718389504637, "grad_norm": 0.42018651962280273, "learning_rate": 6.287783064327116e-06, "loss": 0.7892, "step": 9604 }, { "epoch": 4.345170775842569, "grad_norm": 0.37061643600463867, "learning_rate": 6.2870750210652396e-06, "loss": 0.491, "step": 9605 }, { "epoch": 4.345623162180502, "grad_norm": 0.4045735001564026, "learning_rate": 6.286366950160974e-06, "loss": 0.6654, "step": 9606 }, { "epoch": 4.346075548518435, "grad_norm": 0.3811456561088562, "learning_rate": 6.285658851629524e-06, "loss": 0.5356, "step": 9607 }, { "epoch": 4.3465279348563675, "grad_norm": 0.44076842069625854, "learning_rate": 6.2849507254860985e-06, "loss": 0.6416, "step": 9608 }, { "epoch": 4.3469803211943, "grad_norm": 0.4046318531036377, "learning_rate": 6.2842425717459085e-06, "loss": 0.5542, "step": 9609 }, { "epoch": 4.347432707532232, "grad_norm": 0.40570905804634094, "learning_rate": 6.28353439042416e-06, "loss": 0.528, "step": 9610 }, { "epoch": 4.347885093870165, "grad_norm": 0.46737805008888245, "learning_rate": 6.282826181536061e-06, "loss": 0.5708, "step": 9611 }, { "epoch": 4.348337480208098, "grad_norm": 0.3769506812095642, "learning_rate": 6.282117945096826e-06, "loss": 0.4657, "step": 9612 }, { "epoch": 4.3487898665460305, "grad_norm": 0.38415852189064026, "learning_rate": 6.281409681121663e-06, "loss": 0.5039, "step": 9613 }, { "epoch": 4.349242252883963, "grad_norm": 0.4444813132286072, "learning_rate": 6.280701389625784e-06, "loss": 0.5722, "step": 9614 }, { "epoch": 4.349694639221895, "grad_norm": 0.3807510733604431, "learning_rate": 6.279993070624402e-06, "loss": 0.4507, "step": 9615 }, { "epoch": 4.350147025559828, "grad_norm": 0.401235967874527, "learning_rate": 6.279284724132726e-06, "loss": 0.4714, "step": 9616 }, { "epoch": 4.350599411897761, "grad_norm": 0.41926002502441406, "learning_rate": 6.278576350165972e-06, "loss": 0.5184, "step": 9617 }, { "epoch": 4.3510517982356935, "grad_norm": 0.42556852102279663, "learning_rate": 6.2778679487393546e-06, "loss": 0.4659, "step": 9618 }, { "epoch": 4.351504184573626, "grad_norm": 0.4153200089931488, "learning_rate": 6.277159519868086e-06, "loss": 0.4934, "step": 9619 }, { "epoch": 4.351956570911558, "grad_norm": 0.4535750448703766, "learning_rate": 6.2764510635673825e-06, "loss": 0.5551, "step": 9620 }, { "epoch": 4.352408957249491, "grad_norm": 0.4319263994693756, "learning_rate": 6.275742579852458e-06, "loss": 0.5066, "step": 9621 }, { "epoch": 4.352861343587423, "grad_norm": 0.4783836603164673, "learning_rate": 6.2750340687385284e-06, "loss": 0.5988, "step": 9622 }, { "epoch": 4.353313729925357, "grad_norm": 0.45879340171813965, "learning_rate": 6.274325530240811e-06, "loss": 0.5774, "step": 9623 }, { "epoch": 4.353766116263289, "grad_norm": 0.45953604578971863, "learning_rate": 6.273616964374525e-06, "loss": 0.4857, "step": 9624 }, { "epoch": 4.3542185026012215, "grad_norm": 0.4757315516471863, "learning_rate": 6.272908371154885e-06, "loss": 0.5709, "step": 9625 }, { "epoch": 4.354670888939154, "grad_norm": 0.45069122314453125, "learning_rate": 6.272199750597111e-06, "loss": 0.4562, "step": 9626 }, { "epoch": 4.355123275277086, "grad_norm": 0.4426519572734833, "learning_rate": 6.2714911027164214e-06, "loss": 0.5463, "step": 9627 }, { "epoch": 4.35557566161502, "grad_norm": 0.4562075138092041, "learning_rate": 6.270782427528037e-06, "loss": 0.5491, "step": 9628 }, { "epoch": 4.356028047952952, "grad_norm": 0.5407863259315491, "learning_rate": 6.270073725047177e-06, "loss": 0.5651, "step": 9629 }, { "epoch": 4.3564804342908845, "grad_norm": 0.4621252417564392, "learning_rate": 6.269364995289061e-06, "loss": 0.4283, "step": 9630 }, { "epoch": 4.356932820628817, "grad_norm": 0.486773818731308, "learning_rate": 6.268656238268913e-06, "loss": 0.4371, "step": 9631 }, { "epoch": 4.357385206966749, "grad_norm": 0.5151461362838745, "learning_rate": 6.267947454001953e-06, "loss": 0.4905, "step": 9632 }, { "epoch": 4.357837593304682, "grad_norm": 0.5353973507881165, "learning_rate": 6.267238642503404e-06, "loss": 0.5504, "step": 9633 }, { "epoch": 4.358289979642615, "grad_norm": 0.49180033802986145, "learning_rate": 6.266529803788489e-06, "loss": 0.4522, "step": 9634 }, { "epoch": 4.3587423659805475, "grad_norm": 0.5681300163269043, "learning_rate": 6.2658209378724315e-06, "loss": 0.4841, "step": 9635 }, { "epoch": 4.35919475231848, "grad_norm": 0.5013583302497864, "learning_rate": 6.265112044770456e-06, "loss": 0.4793, "step": 9636 }, { "epoch": 4.359647138656412, "grad_norm": 0.5714869499206543, "learning_rate": 6.2644031244977864e-06, "loss": 0.4874, "step": 9637 }, { "epoch": 4.360099524994345, "grad_norm": 0.46920937299728394, "learning_rate": 6.26369417706965e-06, "loss": 0.4296, "step": 9638 }, { "epoch": 4.360551911332278, "grad_norm": 0.5997452735900879, "learning_rate": 6.262985202501272e-06, "loss": 0.6034, "step": 9639 }, { "epoch": 4.361004297670211, "grad_norm": 0.5494278073310852, "learning_rate": 6.262276200807877e-06, "loss": 0.5026, "step": 9640 }, { "epoch": 4.361456684008143, "grad_norm": 0.6712788343429565, "learning_rate": 6.261567172004697e-06, "loss": 0.5536, "step": 9641 }, { "epoch": 4.3619090703460754, "grad_norm": 0.6720883846282959, "learning_rate": 6.260858116106954e-06, "loss": 0.524, "step": 9642 }, { "epoch": 4.362361456684008, "grad_norm": 0.16257670521736145, "learning_rate": 6.2601490331298805e-06, "loss": 1.1529, "step": 9643 }, { "epoch": 4.362813843021941, "grad_norm": 0.2525354027748108, "learning_rate": 6.259439923088704e-06, "loss": 0.9629, "step": 9644 }, { "epoch": 4.363266229359874, "grad_norm": 0.2858593463897705, "learning_rate": 6.258730785998653e-06, "loss": 0.6356, "step": 9645 }, { "epoch": 4.363718615697806, "grad_norm": 0.3232974708080292, "learning_rate": 6.258021621874959e-06, "loss": 0.5495, "step": 9646 }, { "epoch": 4.3641710020357385, "grad_norm": 0.32354217767715454, "learning_rate": 6.257312430732852e-06, "loss": 0.6273, "step": 9647 }, { "epoch": 4.364623388373671, "grad_norm": 0.33866578340530396, "learning_rate": 6.256603212587562e-06, "loss": 0.7227, "step": 9648 }, { "epoch": 4.365075774711603, "grad_norm": 0.37698689103126526, "learning_rate": 6.2558939674543226e-06, "loss": 0.7667, "step": 9649 }, { "epoch": 4.365528161049537, "grad_norm": 0.3517102599143982, "learning_rate": 6.255184695348367e-06, "loss": 0.5107, "step": 9650 }, { "epoch": 4.365980547387469, "grad_norm": 0.35771897435188293, "learning_rate": 6.254475396284927e-06, "loss": 0.5606, "step": 9651 }, { "epoch": 4.3664329337254015, "grad_norm": 0.3613768219947815, "learning_rate": 6.253766070279235e-06, "loss": 0.5215, "step": 9652 }, { "epoch": 4.366885320063334, "grad_norm": 0.39668309688568115, "learning_rate": 6.253056717346526e-06, "loss": 0.6247, "step": 9653 }, { "epoch": 4.367337706401266, "grad_norm": 0.4130971431732178, "learning_rate": 6.252347337502036e-06, "loss": 0.6286, "step": 9654 }, { "epoch": 4.367790092739199, "grad_norm": 0.35469865798950195, "learning_rate": 6.251637930760999e-06, "loss": 0.66, "step": 9655 }, { "epoch": 4.368242479077132, "grad_norm": 0.3841998279094696, "learning_rate": 6.250928497138651e-06, "loss": 0.5934, "step": 9656 }, { "epoch": 4.368694865415065, "grad_norm": 0.3816935122013092, "learning_rate": 6.250219036650228e-06, "loss": 0.5526, "step": 9657 }, { "epoch": 4.369147251752997, "grad_norm": 0.40709152817726135, "learning_rate": 6.249509549310968e-06, "loss": 0.4658, "step": 9658 }, { "epoch": 4.369599638090929, "grad_norm": 0.4092249572277069, "learning_rate": 6.248800035136107e-06, "loss": 0.639, "step": 9659 }, { "epoch": 4.370052024428862, "grad_norm": 0.3887004852294922, "learning_rate": 6.248090494140885e-06, "loss": 0.5359, "step": 9660 }, { "epoch": 4.370504410766795, "grad_norm": 0.38975194096565247, "learning_rate": 6.2473809263405404e-06, "loss": 0.456, "step": 9661 }, { "epoch": 4.370956797104728, "grad_norm": 0.43881285190582275, "learning_rate": 6.246671331750311e-06, "loss": 0.6822, "step": 9662 }, { "epoch": 4.37140918344266, "grad_norm": 0.39535295963287354, "learning_rate": 6.24596171038544e-06, "loss": 0.505, "step": 9663 }, { "epoch": 4.3718615697805925, "grad_norm": 0.36196961998939514, "learning_rate": 6.245252062261163e-06, "loss": 0.4209, "step": 9664 }, { "epoch": 4.372313956118525, "grad_norm": 0.43436703085899353, "learning_rate": 6.244542387392727e-06, "loss": 0.5923, "step": 9665 }, { "epoch": 4.372766342456458, "grad_norm": 0.38554292917251587, "learning_rate": 6.243832685795368e-06, "loss": 0.419, "step": 9666 }, { "epoch": 4.373218728794391, "grad_norm": 0.4197028577327728, "learning_rate": 6.2431229574843315e-06, "loss": 0.5369, "step": 9667 }, { "epoch": 4.373671115132323, "grad_norm": 0.46558472514152527, "learning_rate": 6.2424132024748585e-06, "loss": 0.5359, "step": 9668 }, { "epoch": 4.3741235014702555, "grad_norm": 0.4768615663051605, "learning_rate": 6.2417034207821934e-06, "loss": 0.6274, "step": 9669 }, { "epoch": 4.374575887808188, "grad_norm": 0.46692386269569397, "learning_rate": 6.2409936124215795e-06, "loss": 0.672, "step": 9670 }, { "epoch": 4.37502827414612, "grad_norm": 0.4485618472099304, "learning_rate": 6.240283777408262e-06, "loss": 0.5106, "step": 9671 }, { "epoch": 4.375480660484054, "grad_norm": 0.5020697712898254, "learning_rate": 6.239573915757486e-06, "loss": 0.5811, "step": 9672 }, { "epoch": 4.375933046821986, "grad_norm": 0.39436742663383484, "learning_rate": 6.238864027484497e-06, "loss": 0.51, "step": 9673 }, { "epoch": 4.376385433159919, "grad_norm": 0.46213966608047485, "learning_rate": 6.2381541126045395e-06, "loss": 0.5164, "step": 9674 }, { "epoch": 4.376837819497851, "grad_norm": 0.4744856059551239, "learning_rate": 6.237444171132864e-06, "loss": 0.4771, "step": 9675 }, { "epoch": 4.377290205835783, "grad_norm": 0.5297808647155762, "learning_rate": 6.236734203084713e-06, "loss": 0.6208, "step": 9676 }, { "epoch": 4.377742592173717, "grad_norm": 0.4792250990867615, "learning_rate": 6.236024208475341e-06, "loss": 0.5879, "step": 9677 }, { "epoch": 4.378194978511649, "grad_norm": 0.41004276275634766, "learning_rate": 6.23531418731999e-06, "loss": 0.4171, "step": 9678 }, { "epoch": 4.378647364849582, "grad_norm": 0.5409497022628784, "learning_rate": 6.2346041396339104e-06, "loss": 0.6624, "step": 9679 }, { "epoch": 4.379099751187514, "grad_norm": 0.4960959255695343, "learning_rate": 6.233894065432354e-06, "loss": 0.5186, "step": 9680 }, { "epoch": 4.3795521375254465, "grad_norm": 0.5728989243507385, "learning_rate": 6.23318396473057e-06, "loss": 0.5587, "step": 9681 }, { "epoch": 4.380004523863379, "grad_norm": 0.4768582582473755, "learning_rate": 6.2324738375438095e-06, "loss": 0.4862, "step": 9682 }, { "epoch": 4.380456910201312, "grad_norm": 0.47301825881004333, "learning_rate": 6.231763683887323e-06, "loss": 0.4856, "step": 9683 }, { "epoch": 4.380909296539245, "grad_norm": 0.5394904017448425, "learning_rate": 6.231053503776363e-06, "loss": 0.5606, "step": 9684 }, { "epoch": 4.381361682877177, "grad_norm": 0.43606045842170715, "learning_rate": 6.2303432972261825e-06, "loss": 0.4194, "step": 9685 }, { "epoch": 4.3818140692151095, "grad_norm": 0.5826749205589294, "learning_rate": 6.229633064252033e-06, "loss": 0.6169, "step": 9686 }, { "epoch": 4.382266455553042, "grad_norm": 0.5822718143463135, "learning_rate": 6.22892280486917e-06, "loss": 0.5724, "step": 9687 }, { "epoch": 4.382718841890975, "grad_norm": 0.5199810862541199, "learning_rate": 6.228212519092847e-06, "loss": 0.5321, "step": 9688 }, { "epoch": 4.383171228228908, "grad_norm": 0.6205268502235413, "learning_rate": 6.2275022069383175e-06, "loss": 0.4772, "step": 9689 }, { "epoch": 4.38362361456684, "grad_norm": 0.6059772968292236, "learning_rate": 6.226791868420837e-06, "loss": 0.5112, "step": 9690 }, { "epoch": 4.384076000904773, "grad_norm": 0.6268749833106995, "learning_rate": 6.226081503555663e-06, "loss": 0.4408, "step": 9691 }, { "epoch": 4.384528387242705, "grad_norm": 0.7871192097663879, "learning_rate": 6.225371112358051e-06, "loss": 0.4954, "step": 9692 }, { "epoch": 4.384980773580638, "grad_norm": 0.1597602367401123, "learning_rate": 6.2246606948432575e-06, "loss": 1.0616, "step": 9693 }, { "epoch": 4.385433159918571, "grad_norm": 0.24772252142429352, "learning_rate": 6.223950251026542e-06, "loss": 1.0722, "step": 9694 }, { "epoch": 4.385885546256503, "grad_norm": 0.26664021611213684, "learning_rate": 6.223239780923161e-06, "loss": 0.6627, "step": 9695 }, { "epoch": 4.386337932594436, "grad_norm": 0.26869526505470276, "learning_rate": 6.222529284548372e-06, "loss": 0.6819, "step": 9696 }, { "epoch": 4.386790318932368, "grad_norm": 0.2943112552165985, "learning_rate": 6.221818761917437e-06, "loss": 0.5609, "step": 9697 }, { "epoch": 4.3872427052703005, "grad_norm": 0.30495986342430115, "learning_rate": 6.221108213045613e-06, "loss": 0.6453, "step": 9698 }, { "epoch": 4.387695091608234, "grad_norm": 0.3403323292732239, "learning_rate": 6.220397637948165e-06, "loss": 0.5685, "step": 9699 }, { "epoch": 4.388147477946166, "grad_norm": 0.31414794921875, "learning_rate": 6.219687036640348e-06, "loss": 0.503, "step": 9700 }, { "epoch": 4.388599864284099, "grad_norm": 0.3604378402233124, "learning_rate": 6.218976409137427e-06, "loss": 0.5736, "step": 9701 }, { "epoch": 4.389052250622031, "grad_norm": 0.33611029386520386, "learning_rate": 6.218265755454664e-06, "loss": 0.5209, "step": 9702 }, { "epoch": 4.3895046369599635, "grad_norm": 0.3663256764411926, "learning_rate": 6.217555075607321e-06, "loss": 0.5946, "step": 9703 }, { "epoch": 4.389957023297896, "grad_norm": 0.3550020754337311, "learning_rate": 6.2168443696106615e-06, "loss": 0.5886, "step": 9704 }, { "epoch": 4.390409409635829, "grad_norm": 0.36069655418395996, "learning_rate": 6.216133637479949e-06, "loss": 0.5353, "step": 9705 }, { "epoch": 4.390861795973762, "grad_norm": 0.43620821833610535, "learning_rate": 6.2154228792304475e-06, "loss": 0.6, "step": 9706 }, { "epoch": 4.391314182311694, "grad_norm": 0.39045456051826477, "learning_rate": 6.214712094877423e-06, "loss": 0.6029, "step": 9707 }, { "epoch": 4.391766568649627, "grad_norm": 0.37199416756629944, "learning_rate": 6.21400128443614e-06, "loss": 0.4836, "step": 9708 }, { "epoch": 4.392218954987559, "grad_norm": 0.39387062191963196, "learning_rate": 6.213290447921865e-06, "loss": 0.5476, "step": 9709 }, { "epoch": 4.392671341325492, "grad_norm": 0.4311682879924774, "learning_rate": 6.212579585349864e-06, "loss": 0.614, "step": 9710 }, { "epoch": 4.393123727663425, "grad_norm": 0.4175962507724762, "learning_rate": 6.211868696735405e-06, "loss": 0.5599, "step": 9711 }, { "epoch": 4.393576114001357, "grad_norm": 0.42498108744621277, "learning_rate": 6.211157782093755e-06, "loss": 0.5307, "step": 9712 }, { "epoch": 4.39402850033929, "grad_norm": 0.3839572072029114, "learning_rate": 6.210446841440182e-06, "loss": 0.4292, "step": 9713 }, { "epoch": 4.394480886677222, "grad_norm": 0.410866916179657, "learning_rate": 6.209735874789955e-06, "loss": 0.5018, "step": 9714 }, { "epoch": 4.394933273015155, "grad_norm": 0.4114008843898773, "learning_rate": 6.209024882158344e-06, "loss": 0.5009, "step": 9715 }, { "epoch": 4.395385659353088, "grad_norm": 0.40269559621810913, "learning_rate": 6.208313863560618e-06, "loss": 0.5124, "step": 9716 }, { "epoch": 4.39583804569102, "grad_norm": 0.4361442029476166, "learning_rate": 6.20760281901205e-06, "loss": 0.4475, "step": 9717 }, { "epoch": 4.396290432028953, "grad_norm": 0.3820638060569763, "learning_rate": 6.2068917485279065e-06, "loss": 0.4417, "step": 9718 }, { "epoch": 4.396742818366885, "grad_norm": 0.5164209604263306, "learning_rate": 6.206180652123463e-06, "loss": 0.5948, "step": 9719 }, { "epoch": 4.3971952047048175, "grad_norm": 0.5362248420715332, "learning_rate": 6.20546952981399e-06, "loss": 0.6615, "step": 9720 }, { "epoch": 4.397647591042751, "grad_norm": 0.5037355422973633, "learning_rate": 6.20475838161476e-06, "loss": 0.5956, "step": 9721 }, { "epoch": 4.398099977380683, "grad_norm": 0.42335113883018494, "learning_rate": 6.204047207541048e-06, "loss": 0.4109, "step": 9722 }, { "epoch": 4.398552363718616, "grad_norm": 0.4739815592765808, "learning_rate": 6.203336007608126e-06, "loss": 0.482, "step": 9723 }, { "epoch": 4.399004750056548, "grad_norm": 0.4296318292617798, "learning_rate": 6.202624781831269e-06, "loss": 0.4782, "step": 9724 }, { "epoch": 4.3994571363944805, "grad_norm": 0.5259279608726501, "learning_rate": 6.201913530225752e-06, "loss": 0.5607, "step": 9725 }, { "epoch": 4.399909522732414, "grad_norm": 0.5070255398750305, "learning_rate": 6.201202252806849e-06, "loss": 0.5417, "step": 9726 }, { "epoch": 4.400361909070346, "grad_norm": 0.4669192433357239, "learning_rate": 6.200490949589839e-06, "loss": 0.4827, "step": 9727 }, { "epoch": 4.400814295408279, "grad_norm": 0.4893803596496582, "learning_rate": 6.199779620589996e-06, "loss": 0.5503, "step": 9728 }, { "epoch": 4.401266681746211, "grad_norm": 0.4936221241950989, "learning_rate": 6.199068265822599e-06, "loss": 0.5772, "step": 9729 }, { "epoch": 4.401719068084144, "grad_norm": 0.42649319767951965, "learning_rate": 6.198356885302924e-06, "loss": 0.5032, "step": 9730 }, { "epoch": 4.402171454422076, "grad_norm": 0.543303370475769, "learning_rate": 6.197645479046251e-06, "loss": 0.5361, "step": 9731 }, { "epoch": 4.402623840760009, "grad_norm": 0.5213505029678345, "learning_rate": 6.196934047067856e-06, "loss": 0.544, "step": 9732 }, { "epoch": 4.403076227097942, "grad_norm": 0.5291633605957031, "learning_rate": 6.196222589383023e-06, "loss": 0.4998, "step": 9733 }, { "epoch": 4.403528613435874, "grad_norm": 0.5102645754814148, "learning_rate": 6.1955111060070275e-06, "loss": 0.471, "step": 9734 }, { "epoch": 4.403980999773807, "grad_norm": 0.5760107040405273, "learning_rate": 6.194799596955152e-06, "loss": 0.4626, "step": 9735 }, { "epoch": 4.404433386111739, "grad_norm": 0.5737799406051636, "learning_rate": 6.194088062242677e-06, "loss": 0.545, "step": 9736 }, { "epoch": 4.404885772449672, "grad_norm": 0.49999552965164185, "learning_rate": 6.1933765018848835e-06, "loss": 0.509, "step": 9737 }, { "epoch": 4.405338158787605, "grad_norm": 0.49492642283439636, "learning_rate": 6.192664915897055e-06, "loss": 0.4514, "step": 9738 }, { "epoch": 4.405790545125537, "grad_norm": 0.5417995452880859, "learning_rate": 6.191953304294474e-06, "loss": 0.5312, "step": 9739 }, { "epoch": 4.40624293146347, "grad_norm": 0.6523023247718811, "learning_rate": 6.191241667092422e-06, "loss": 0.6558, "step": 9740 }, { "epoch": 4.406695317801402, "grad_norm": 0.6903699636459351, "learning_rate": 6.190530004306184e-06, "loss": 0.5912, "step": 9741 }, { "epoch": 4.407147704139335, "grad_norm": 0.7045551538467407, "learning_rate": 6.189818315951045e-06, "loss": 0.5392, "step": 9742 }, { "epoch": 4.407600090477268, "grad_norm": 0.20782196521759033, "learning_rate": 6.18910660204229e-06, "loss": 1.1266, "step": 9743 }, { "epoch": 4.4080524768152, "grad_norm": 0.2642132043838501, "learning_rate": 6.1883948625952025e-06, "loss": 0.6719, "step": 9744 }, { "epoch": 4.408504863153133, "grad_norm": 0.3218730390071869, "learning_rate": 6.18768309762507e-06, "loss": 0.6053, "step": 9745 }, { "epoch": 4.408957249491065, "grad_norm": 0.3182435929775238, "learning_rate": 6.186971307147178e-06, "loss": 0.5964, "step": 9746 }, { "epoch": 4.409409635828998, "grad_norm": 0.38036277890205383, "learning_rate": 6.186259491176814e-06, "loss": 0.6308, "step": 9747 }, { "epoch": 4.409862022166931, "grad_norm": 0.34894654154777527, "learning_rate": 6.185547649729266e-06, "loss": 0.6243, "step": 9748 }, { "epoch": 4.410314408504863, "grad_norm": 0.37292101979255676, "learning_rate": 6.184835782819821e-06, "loss": 0.4978, "step": 9749 }, { "epoch": 4.410766794842796, "grad_norm": 0.3437928855419159, "learning_rate": 6.18412389046377e-06, "loss": 0.5757, "step": 9750 }, { "epoch": 4.411219181180728, "grad_norm": 0.37482166290283203, "learning_rate": 6.1834119726764e-06, "loss": 0.5746, "step": 9751 }, { "epoch": 4.411671567518661, "grad_norm": 0.3897918462753296, "learning_rate": 6.182700029473001e-06, "loss": 0.5932, "step": 9752 }, { "epoch": 4.412123953856593, "grad_norm": 0.3793432414531708, "learning_rate": 6.181988060868866e-06, "loss": 0.473, "step": 9753 }, { "epoch": 4.412576340194526, "grad_norm": 0.3698672950267792, "learning_rate": 6.181276066879282e-06, "loss": 0.4737, "step": 9754 }, { "epoch": 4.413028726532459, "grad_norm": 0.3888765573501587, "learning_rate": 6.1805640475195415e-06, "loss": 0.6271, "step": 9755 }, { "epoch": 4.413481112870391, "grad_norm": 0.36427533626556396, "learning_rate": 6.1798520028049406e-06, "loss": 0.5234, "step": 9756 }, { "epoch": 4.413933499208324, "grad_norm": 0.36356040835380554, "learning_rate": 6.179139932750765e-06, "loss": 0.4957, "step": 9757 }, { "epoch": 4.414385885546256, "grad_norm": 0.40096256136894226, "learning_rate": 6.1784278373723126e-06, "loss": 0.551, "step": 9758 }, { "epoch": 4.414838271884189, "grad_norm": 0.3809586763381958, "learning_rate": 6.177715716684875e-06, "loss": 0.5492, "step": 9759 }, { "epoch": 4.415290658222122, "grad_norm": 0.40836620330810547, "learning_rate": 6.177003570703747e-06, "loss": 0.5295, "step": 9760 }, { "epoch": 4.415743044560054, "grad_norm": 0.405942440032959, "learning_rate": 6.176291399444223e-06, "loss": 0.5206, "step": 9761 }, { "epoch": 4.416195430897987, "grad_norm": 0.4174205958843231, "learning_rate": 6.1755792029216e-06, "loss": 0.5984, "step": 9762 }, { "epoch": 4.416647817235919, "grad_norm": 0.39799532294273376, "learning_rate": 6.174866981151171e-06, "loss": 0.5297, "step": 9763 }, { "epoch": 4.4171002035738525, "grad_norm": 0.43038657307624817, "learning_rate": 6.174154734148233e-06, "loss": 0.5664, "step": 9764 }, { "epoch": 4.417552589911785, "grad_norm": 0.4805659353733063, "learning_rate": 6.1734424619280845e-06, "loss": 0.6732, "step": 9765 }, { "epoch": 4.418004976249717, "grad_norm": 0.4164363443851471, "learning_rate": 6.172730164506021e-06, "loss": 0.4269, "step": 9766 }, { "epoch": 4.41845736258765, "grad_norm": 0.4010190963745117, "learning_rate": 6.1720178418973435e-06, "loss": 0.4657, "step": 9767 }, { "epoch": 4.418909748925582, "grad_norm": 0.4686838388442993, "learning_rate": 6.171305494117346e-06, "loss": 0.5571, "step": 9768 }, { "epoch": 4.4193621352635155, "grad_norm": 0.5042645931243896, "learning_rate": 6.17059312118133e-06, "loss": 0.653, "step": 9769 }, { "epoch": 4.419814521601448, "grad_norm": 0.46781742572784424, "learning_rate": 6.1698807231045955e-06, "loss": 0.5697, "step": 9770 }, { "epoch": 4.42026690793938, "grad_norm": 0.4714716970920563, "learning_rate": 6.169168299902441e-06, "loss": 0.5372, "step": 9771 }, { "epoch": 4.420719294277313, "grad_norm": 0.45139849185943604, "learning_rate": 6.16845585159017e-06, "loss": 0.5035, "step": 9772 }, { "epoch": 4.421171680615245, "grad_norm": 0.4201582372188568, "learning_rate": 6.16774337818308e-06, "loss": 0.3958, "step": 9773 }, { "epoch": 4.421624066953178, "grad_norm": 0.4810764491558075, "learning_rate": 6.167030879696474e-06, "loss": 0.5057, "step": 9774 }, { "epoch": 4.422076453291111, "grad_norm": 0.49437564611434937, "learning_rate": 6.1663183561456565e-06, "loss": 0.5382, "step": 9775 }, { "epoch": 4.422528839629043, "grad_norm": 0.4721686840057373, "learning_rate": 6.165605807545928e-06, "loss": 0.4885, "step": 9776 }, { "epoch": 4.422981225966976, "grad_norm": 0.5229479670524597, "learning_rate": 6.164893233912592e-06, "loss": 0.5896, "step": 9777 }, { "epoch": 4.423433612304908, "grad_norm": 0.4796239137649536, "learning_rate": 6.1641806352609534e-06, "loss": 0.4819, "step": 9778 }, { "epoch": 4.423885998642841, "grad_norm": 0.5425348281860352, "learning_rate": 6.163468011606316e-06, "loss": 0.5505, "step": 9779 }, { "epoch": 4.424338384980773, "grad_norm": 0.5165950655937195, "learning_rate": 6.162755362963985e-06, "loss": 0.5335, "step": 9780 }, { "epoch": 4.4247907713187065, "grad_norm": 0.550978422164917, "learning_rate": 6.1620426893492645e-06, "loss": 0.5412, "step": 9781 }, { "epoch": 4.425243157656639, "grad_norm": 0.5353013873100281, "learning_rate": 6.161329990777462e-06, "loss": 0.5325, "step": 9782 }, { "epoch": 4.425695543994571, "grad_norm": 0.48407238721847534, "learning_rate": 6.160617267263884e-06, "loss": 0.4466, "step": 9783 }, { "epoch": 4.426147930332504, "grad_norm": 0.5003272294998169, "learning_rate": 6.159904518823838e-06, "loss": 0.5045, "step": 9784 }, { "epoch": 4.426600316670436, "grad_norm": 0.5375664830207825, "learning_rate": 6.15919174547263e-06, "loss": 0.4982, "step": 9785 }, { "epoch": 4.4270527030083695, "grad_norm": 0.4979151785373688, "learning_rate": 6.15847894722557e-06, "loss": 0.4976, "step": 9786 }, { "epoch": 4.427505089346302, "grad_norm": 0.532320499420166, "learning_rate": 6.1577661240979645e-06, "loss": 0.49, "step": 9787 }, { "epoch": 4.427957475684234, "grad_norm": 0.5959937572479248, "learning_rate": 6.157053276105125e-06, "loss": 0.5307, "step": 9788 }, { "epoch": 4.428409862022167, "grad_norm": 0.6274888515472412, "learning_rate": 6.156340403262361e-06, "loss": 0.5591, "step": 9789 }, { "epoch": 4.428862248360099, "grad_norm": 0.6673392653465271, "learning_rate": 6.155627505584982e-06, "loss": 0.5764, "step": 9790 }, { "epoch": 4.4293146346980325, "grad_norm": 0.6747393608093262, "learning_rate": 6.154914583088298e-06, "loss": 0.6002, "step": 9791 }, { "epoch": 4.429767021035965, "grad_norm": 0.6716331839561462, "learning_rate": 6.1542016357876225e-06, "loss": 0.4481, "step": 9792 }, { "epoch": 4.430219407373897, "grad_norm": 0.17206725478172302, "learning_rate": 6.153488663698265e-06, "loss": 0.9979, "step": 9793 }, { "epoch": 4.43067179371183, "grad_norm": 0.33583301305770874, "learning_rate": 6.15277566683554e-06, "loss": 0.8258, "step": 9794 }, { "epoch": 4.431124180049762, "grad_norm": 0.27282142639160156, "learning_rate": 6.152062645214759e-06, "loss": 0.5638, "step": 9795 }, { "epoch": 4.431576566387695, "grad_norm": 0.3223588168621063, "learning_rate": 6.151349598851237e-06, "loss": 0.6517, "step": 9796 }, { "epoch": 4.432028952725628, "grad_norm": 0.29638543725013733, "learning_rate": 6.150636527760287e-06, "loss": 0.493, "step": 9797 }, { "epoch": 4.4324813390635605, "grad_norm": 0.33730438351631165, "learning_rate": 6.149923431957223e-06, "loss": 0.5981, "step": 9798 }, { "epoch": 4.432933725401493, "grad_norm": 0.3453002870082855, "learning_rate": 6.149210311457361e-06, "loss": 0.533, "step": 9799 }, { "epoch": 4.433386111739425, "grad_norm": 0.3885340094566345, "learning_rate": 6.148497166276017e-06, "loss": 0.5711, "step": 9800 }, { "epoch": 4.433386111739425, "eval_loss": 0.5898838639259338, "eval_runtime": 25.7062, "eval_samples_per_second": 28.942, "eval_steps_per_second": 7.236, "step": 9800 }, { "epoch": 4.433838498077358, "grad_norm": 0.3791637420654297, "learning_rate": 6.147783996428507e-06, "loss": 0.5965, "step": 9801 }, { "epoch": 4.434290884415291, "grad_norm": 0.42010340094566345, "learning_rate": 6.1470708019301475e-06, "loss": 0.6381, "step": 9802 }, { "epoch": 4.4347432707532235, "grad_norm": 0.3948909640312195, "learning_rate": 6.146357582796254e-06, "loss": 0.5594, "step": 9803 }, { "epoch": 4.435195657091156, "grad_norm": 0.35153746604919434, "learning_rate": 6.145644339042146e-06, "loss": 0.5591, "step": 9804 }, { "epoch": 4.435648043429088, "grad_norm": 0.3671838343143463, "learning_rate": 6.144931070683142e-06, "loss": 0.5621, "step": 9805 }, { "epoch": 4.436100429767021, "grad_norm": 0.43365317583084106, "learning_rate": 6.1442177777345605e-06, "loss": 0.6355, "step": 9806 }, { "epoch": 4.436552816104953, "grad_norm": 0.39680755138397217, "learning_rate": 6.14350446021172e-06, "loss": 0.578, "step": 9807 }, { "epoch": 4.4370052024428865, "grad_norm": 0.41937094926834106, "learning_rate": 6.142791118129942e-06, "loss": 0.5762, "step": 9808 }, { "epoch": 4.437457588780819, "grad_norm": 0.35673603415489197, "learning_rate": 6.142077751504545e-06, "loss": 0.432, "step": 9809 }, { "epoch": 4.437909975118751, "grad_norm": 0.4016277492046356, "learning_rate": 6.141364360350851e-06, "loss": 0.5186, "step": 9810 }, { "epoch": 4.438362361456684, "grad_norm": 0.38953661918640137, "learning_rate": 6.140650944684181e-06, "loss": 0.5796, "step": 9811 }, { "epoch": 4.438814747794616, "grad_norm": 0.43455612659454346, "learning_rate": 6.13993750451986e-06, "loss": 0.5327, "step": 9812 }, { "epoch": 4.43926713413255, "grad_norm": 0.43431922793388367, "learning_rate": 6.139224039873204e-06, "loss": 0.5709, "step": 9813 }, { "epoch": 4.439719520470482, "grad_norm": 0.383708119392395, "learning_rate": 6.138510550759541e-06, "loss": 0.5013, "step": 9814 }, { "epoch": 4.4401719068084144, "grad_norm": 0.4013531804084778, "learning_rate": 6.137797037194193e-06, "loss": 0.5014, "step": 9815 }, { "epoch": 4.440624293146347, "grad_norm": 0.509264349937439, "learning_rate": 6.1370834991924845e-06, "loss": 0.6151, "step": 9816 }, { "epoch": 4.441076679484279, "grad_norm": 0.4717050790786743, "learning_rate": 6.136369936769739e-06, "loss": 0.5941, "step": 9817 }, { "epoch": 4.441529065822213, "grad_norm": 0.44880974292755127, "learning_rate": 6.1356563499412835e-06, "loss": 0.5222, "step": 9818 }, { "epoch": 4.441981452160145, "grad_norm": 0.44416192173957825, "learning_rate": 6.134942738722442e-06, "loss": 0.5305, "step": 9819 }, { "epoch": 4.4424338384980775, "grad_norm": 0.45629334449768066, "learning_rate": 6.134229103128541e-06, "loss": 0.6222, "step": 9820 }, { "epoch": 4.44288622483601, "grad_norm": 0.4450448155403137, "learning_rate": 6.133515443174909e-06, "loss": 0.5337, "step": 9821 }, { "epoch": 4.443338611173942, "grad_norm": 0.38655534386634827, "learning_rate": 6.13280175887687e-06, "loss": 0.434, "step": 9822 }, { "epoch": 4.443790997511875, "grad_norm": 0.49116528034210205, "learning_rate": 6.132088050249755e-06, "loss": 0.5419, "step": 9823 }, { "epoch": 4.444243383849808, "grad_norm": 0.45637187361717224, "learning_rate": 6.131374317308891e-06, "loss": 0.4558, "step": 9824 }, { "epoch": 4.4446957701877405, "grad_norm": 0.4857898950576782, "learning_rate": 6.1306605600696055e-06, "loss": 0.5924, "step": 9825 }, { "epoch": 4.445148156525673, "grad_norm": 0.4280603528022766, "learning_rate": 6.129946778547229e-06, "loss": 0.4245, "step": 9826 }, { "epoch": 4.445600542863605, "grad_norm": 0.45148470997810364, "learning_rate": 6.12923297275709e-06, "loss": 0.4507, "step": 9827 }, { "epoch": 4.446052929201538, "grad_norm": 0.4553334712982178, "learning_rate": 6.128519142714521e-06, "loss": 0.4738, "step": 9828 }, { "epoch": 4.44650531553947, "grad_norm": 0.4768848121166229, "learning_rate": 6.12780528843485e-06, "loss": 0.4827, "step": 9829 }, { "epoch": 4.446957701877404, "grad_norm": 0.5084120631217957, "learning_rate": 6.127091409933412e-06, "loss": 0.5302, "step": 9830 }, { "epoch": 4.447410088215336, "grad_norm": 0.49861401319503784, "learning_rate": 6.126377507225536e-06, "loss": 0.4937, "step": 9831 }, { "epoch": 4.447862474553268, "grad_norm": 0.47446227073669434, "learning_rate": 6.125663580326557e-06, "loss": 0.4691, "step": 9832 }, { "epoch": 4.448314860891201, "grad_norm": 0.50810307264328, "learning_rate": 6.124949629251804e-06, "loss": 0.5377, "step": 9833 }, { "epoch": 4.448767247229133, "grad_norm": 0.44806623458862305, "learning_rate": 6.124235654016615e-06, "loss": 0.4286, "step": 9834 }, { "epoch": 4.449219633567067, "grad_norm": 0.46667811274528503, "learning_rate": 6.1235216546363215e-06, "loss": 0.3871, "step": 9835 }, { "epoch": 4.449672019904999, "grad_norm": 0.5311443209648132, "learning_rate": 6.122807631126258e-06, "loss": 0.5276, "step": 9836 }, { "epoch": 4.4501244062429315, "grad_norm": 0.639561116695404, "learning_rate": 6.122093583501761e-06, "loss": 0.5937, "step": 9837 }, { "epoch": 4.450576792580864, "grad_norm": 0.5131572484970093, "learning_rate": 6.121379511778163e-06, "loss": 0.4666, "step": 9838 }, { "epoch": 4.451029178918796, "grad_norm": 0.560533881187439, "learning_rate": 6.120665415970802e-06, "loss": 0.4659, "step": 9839 }, { "epoch": 4.45148156525673, "grad_norm": 0.6030118465423584, "learning_rate": 6.1199512960950144e-06, "loss": 0.5336, "step": 9840 }, { "epoch": 4.451933951594662, "grad_norm": 0.5914849042892456, "learning_rate": 6.119237152166139e-06, "loss": 0.4598, "step": 9841 }, { "epoch": 4.4523863379325945, "grad_norm": 0.7211952209472656, "learning_rate": 6.1185229841995105e-06, "loss": 0.54, "step": 9842 }, { "epoch": 4.452838724270527, "grad_norm": 0.1799927055835724, "learning_rate": 6.117808792210469e-06, "loss": 1.1095, "step": 9843 }, { "epoch": 4.453291110608459, "grad_norm": 0.2597552537918091, "learning_rate": 6.117094576214352e-06, "loss": 0.5639, "step": 9844 }, { "epoch": 4.453743496946392, "grad_norm": 0.2897356450557709, "learning_rate": 6.116380336226499e-06, "loss": 0.6654, "step": 9845 }, { "epoch": 4.454195883284325, "grad_norm": 0.31078121066093445, "learning_rate": 6.115666072262252e-06, "loss": 0.542, "step": 9846 }, { "epoch": 4.454648269622258, "grad_norm": 0.3663017749786377, "learning_rate": 6.114951784336947e-06, "loss": 0.61, "step": 9847 }, { "epoch": 4.45510065596019, "grad_norm": 0.3679484724998474, "learning_rate": 6.114237472465927e-06, "loss": 0.511, "step": 9848 }, { "epoch": 4.455553042298122, "grad_norm": 0.40205004811286926, "learning_rate": 6.113523136664532e-06, "loss": 0.6747, "step": 9849 }, { "epoch": 4.456005428636055, "grad_norm": 0.35429003834724426, "learning_rate": 6.112808776948105e-06, "loss": 0.5956, "step": 9850 }, { "epoch": 4.456457814973988, "grad_norm": 0.36752554774284363, "learning_rate": 6.112094393331988e-06, "loss": 0.6096, "step": 9851 }, { "epoch": 4.456910201311921, "grad_norm": 0.39123326539993286, "learning_rate": 6.111379985831524e-06, "loss": 0.4802, "step": 9852 }, { "epoch": 4.457362587649853, "grad_norm": 0.4177555441856384, "learning_rate": 6.110665554462056e-06, "loss": 0.6024, "step": 9853 }, { "epoch": 4.4578149739877855, "grad_norm": 0.36189329624176025, "learning_rate": 6.109951099238926e-06, "loss": 0.6032, "step": 9854 }, { "epoch": 4.458267360325718, "grad_norm": 0.38266727328300476, "learning_rate": 6.109236620177481e-06, "loss": 0.5936, "step": 9855 }, { "epoch": 4.45871974666365, "grad_norm": 0.3842596113681793, "learning_rate": 6.108522117293065e-06, "loss": 0.4679, "step": 9856 }, { "epoch": 4.459172133001584, "grad_norm": 0.3942267894744873, "learning_rate": 6.107807590601022e-06, "loss": 0.5737, "step": 9857 }, { "epoch": 4.459624519339516, "grad_norm": 0.3943744897842407, "learning_rate": 6.1070930401167005e-06, "loss": 0.549, "step": 9858 }, { "epoch": 4.4600769056774485, "grad_norm": 0.39649197459220886, "learning_rate": 6.106378465855444e-06, "loss": 0.4363, "step": 9859 }, { "epoch": 4.460529292015381, "grad_norm": 0.4211181700229645, "learning_rate": 6.1056638678326005e-06, "loss": 0.5259, "step": 9860 }, { "epoch": 4.460981678353313, "grad_norm": 0.4020215570926666, "learning_rate": 6.104949246063517e-06, "loss": 0.4319, "step": 9861 }, { "epoch": 4.461434064691247, "grad_norm": 0.4743976891040802, "learning_rate": 6.104234600563542e-06, "loss": 0.6644, "step": 9862 }, { "epoch": 4.461886451029179, "grad_norm": 0.4206840395927429, "learning_rate": 6.103519931348024e-06, "loss": 0.4929, "step": 9863 }, { "epoch": 4.462338837367112, "grad_norm": 0.45738890767097473, "learning_rate": 6.102805238432312e-06, "loss": 0.5124, "step": 9864 }, { "epoch": 4.462791223705044, "grad_norm": 0.46292048692703247, "learning_rate": 6.1020905218317535e-06, "loss": 0.5034, "step": 9865 }, { "epoch": 4.463243610042976, "grad_norm": 0.43926525115966797, "learning_rate": 6.1013757815617004e-06, "loss": 0.4721, "step": 9866 }, { "epoch": 4.46369599638091, "grad_norm": 0.3754102289676666, "learning_rate": 6.100661017637503e-06, "loss": 0.4037, "step": 9867 }, { "epoch": 4.464148382718842, "grad_norm": 0.5123727917671204, "learning_rate": 6.099946230074512e-06, "loss": 0.6298, "step": 9868 }, { "epoch": 4.464600769056775, "grad_norm": 0.4768806993961334, "learning_rate": 6.099231418888078e-06, "loss": 0.5853, "step": 9869 }, { "epoch": 4.465053155394707, "grad_norm": 0.4071371257305145, "learning_rate": 6.098516584093553e-06, "loss": 0.4899, "step": 9870 }, { "epoch": 4.4655055417326395, "grad_norm": 0.45609936118125916, "learning_rate": 6.0978017257062905e-06, "loss": 0.4357, "step": 9871 }, { "epoch": 4.465957928070572, "grad_norm": 0.44572126865386963, "learning_rate": 6.097086843741642e-06, "loss": 0.4558, "step": 9872 }, { "epoch": 4.466410314408505, "grad_norm": 0.532867431640625, "learning_rate": 6.096371938214962e-06, "loss": 0.6405, "step": 9873 }, { "epoch": 4.466862700746438, "grad_norm": 0.46392199397087097, "learning_rate": 6.0956570091416054e-06, "loss": 0.4682, "step": 9874 }, { "epoch": 4.46731508708437, "grad_norm": 0.46660882234573364, "learning_rate": 6.094942056536923e-06, "loss": 0.5028, "step": 9875 }, { "epoch": 4.4677674734223025, "grad_norm": 0.473160982131958, "learning_rate": 6.094227080416275e-06, "loss": 0.4648, "step": 9876 }, { "epoch": 4.468219859760235, "grad_norm": 0.45532873272895813, "learning_rate": 6.093512080795013e-06, "loss": 0.428, "step": 9877 }, { "epoch": 4.468672246098167, "grad_norm": 0.527583658695221, "learning_rate": 6.092797057688496e-06, "loss": 0.5295, "step": 9878 }, { "epoch": 4.469124632436101, "grad_norm": 0.5007955431938171, "learning_rate": 6.092082011112076e-06, "loss": 0.5165, "step": 9879 }, { "epoch": 4.469577018774033, "grad_norm": 0.49602657556533813, "learning_rate": 6.091366941081114e-06, "loss": 0.4767, "step": 9880 }, { "epoch": 4.470029405111966, "grad_norm": 0.4928325116634369, "learning_rate": 6.090651847610965e-06, "loss": 0.472, "step": 9881 }, { "epoch": 4.470481791449898, "grad_norm": 0.5879489183425903, "learning_rate": 6.08993673071699e-06, "loss": 0.6381, "step": 9882 }, { "epoch": 4.47093417778783, "grad_norm": 0.490418016910553, "learning_rate": 6.0892215904145445e-06, "loss": 0.5183, "step": 9883 }, { "epoch": 4.471386564125764, "grad_norm": 0.5114513039588928, "learning_rate": 6.088506426718988e-06, "loss": 0.4802, "step": 9884 }, { "epoch": 4.471838950463696, "grad_norm": 0.5922156572341919, "learning_rate": 6.087791239645682e-06, "loss": 0.5474, "step": 9885 }, { "epoch": 4.472291336801629, "grad_norm": 0.587253212928772, "learning_rate": 6.087076029209984e-06, "loss": 0.5602, "step": 9886 }, { "epoch": 4.472743723139561, "grad_norm": 0.6044721007347107, "learning_rate": 6.086360795427256e-06, "loss": 0.5318, "step": 9887 }, { "epoch": 4.4731961094774935, "grad_norm": 0.6315537691116333, "learning_rate": 6.085645538312859e-06, "loss": 0.499, "step": 9888 }, { "epoch": 4.473648495815427, "grad_norm": 0.5861734747886658, "learning_rate": 6.084930257882154e-06, "loss": 0.4953, "step": 9889 }, { "epoch": 4.474100882153359, "grad_norm": 0.653201162815094, "learning_rate": 6.084214954150503e-06, "loss": 0.5863, "step": 9890 }, { "epoch": 4.474553268491292, "grad_norm": 0.6724738478660583, "learning_rate": 6.0834996271332695e-06, "loss": 0.5414, "step": 9891 }, { "epoch": 4.475005654829224, "grad_norm": 0.7810421586036682, "learning_rate": 6.082784276845815e-06, "loss": 0.5488, "step": 9892 }, { "epoch": 4.4754580411671565, "grad_norm": 0.1706179529428482, "learning_rate": 6.082068903303503e-06, "loss": 1.0936, "step": 9893 }, { "epoch": 4.475910427505089, "grad_norm": 0.29635903239250183, "learning_rate": 6.081353506521699e-06, "loss": 0.57, "step": 9894 }, { "epoch": 4.476362813843022, "grad_norm": 0.30289846658706665, "learning_rate": 6.080638086515767e-06, "loss": 0.5497, "step": 9895 }, { "epoch": 4.476815200180955, "grad_norm": 0.33681467175483704, "learning_rate": 6.07992264330107e-06, "loss": 0.5671, "step": 9896 }, { "epoch": 4.477267586518887, "grad_norm": 0.3436667025089264, "learning_rate": 6.079207176892978e-06, "loss": 0.5252, "step": 9897 }, { "epoch": 4.4777199728568196, "grad_norm": 0.3704262971878052, "learning_rate": 6.0784916873068524e-06, "loss": 0.6938, "step": 9898 }, { "epoch": 4.478172359194752, "grad_norm": 0.3991837799549103, "learning_rate": 6.077776174558062e-06, "loss": 0.5241, "step": 9899 }, { "epoch": 4.478624745532685, "grad_norm": 0.3697640597820282, "learning_rate": 6.077060638661974e-06, "loss": 0.5863, "step": 9900 }, { "epoch": 4.479077131870618, "grad_norm": 0.3794926702976227, "learning_rate": 6.076345079633955e-06, "loss": 0.6757, "step": 9901 }, { "epoch": 4.47952951820855, "grad_norm": 0.3911217153072357, "learning_rate": 6.075629497489373e-06, "loss": 0.6029, "step": 9902 }, { "epoch": 4.479981904546483, "grad_norm": 0.40260088443756104, "learning_rate": 6.074913892243596e-06, "loss": 0.5607, "step": 9903 }, { "epoch": 4.480434290884415, "grad_norm": 0.429691880941391, "learning_rate": 6.074198263911995e-06, "loss": 0.6219, "step": 9904 }, { "epoch": 4.4808866772223475, "grad_norm": 0.4146384000778198, "learning_rate": 6.073482612509936e-06, "loss": 0.6122, "step": 9905 }, { "epoch": 4.481339063560281, "grad_norm": 0.402592271566391, "learning_rate": 6.072766938052793e-06, "loss": 0.5918, "step": 9906 }, { "epoch": 4.481791449898213, "grad_norm": 0.40438732504844666, "learning_rate": 6.072051240555934e-06, "loss": 0.5909, "step": 9907 }, { "epoch": 4.482243836236146, "grad_norm": 0.3924556374549866, "learning_rate": 6.071335520034732e-06, "loss": 0.4709, "step": 9908 }, { "epoch": 4.482696222574078, "grad_norm": 0.41745850443840027, "learning_rate": 6.070619776504555e-06, "loss": 0.551, "step": 9909 }, { "epoch": 4.4831486089120105, "grad_norm": 0.43811851739883423, "learning_rate": 6.069904009980778e-06, "loss": 0.5311, "step": 9910 }, { "epoch": 4.483600995249944, "grad_norm": 0.3901914358139038, "learning_rate": 6.069188220478772e-06, "loss": 0.4392, "step": 9911 }, { "epoch": 4.484053381587876, "grad_norm": 0.4279731512069702, "learning_rate": 6.068472408013912e-06, "loss": 0.6018, "step": 9912 }, { "epoch": 4.484505767925809, "grad_norm": 0.4209972321987152, "learning_rate": 6.06775657260157e-06, "loss": 0.5201, "step": 9913 }, { "epoch": 4.484958154263741, "grad_norm": 0.5563433766365051, "learning_rate": 6.067040714257118e-06, "loss": 0.5992, "step": 9914 }, { "epoch": 4.4854105406016735, "grad_norm": 0.4527778923511505, "learning_rate": 6.066324832995934e-06, "loss": 0.5158, "step": 9915 }, { "epoch": 4.485862926939607, "grad_norm": 0.3862810730934143, "learning_rate": 6.065608928833388e-06, "loss": 0.4849, "step": 9916 }, { "epoch": 4.486315313277539, "grad_norm": 0.4065494239330292, "learning_rate": 6.064893001784862e-06, "loss": 0.4834, "step": 9917 }, { "epoch": 4.486767699615472, "grad_norm": 0.43560341000556946, "learning_rate": 6.064177051865726e-06, "loss": 0.5542, "step": 9918 }, { "epoch": 4.487220085953404, "grad_norm": 0.44174763560295105, "learning_rate": 6.0634610790913606e-06, "loss": 0.558, "step": 9919 }, { "epoch": 4.487672472291337, "grad_norm": 0.4599493145942688, "learning_rate": 6.062745083477139e-06, "loss": 0.5335, "step": 9920 }, { "epoch": 4.488124858629269, "grad_norm": 0.46286168694496155, "learning_rate": 6.062029065038443e-06, "loss": 0.4861, "step": 9921 }, { "epoch": 4.488577244967202, "grad_norm": 0.4519326388835907, "learning_rate": 6.061313023790646e-06, "loss": 0.4389, "step": 9922 }, { "epoch": 4.489029631305135, "grad_norm": 0.3749350905418396, "learning_rate": 6.06059695974913e-06, "loss": 0.3684, "step": 9923 }, { "epoch": 4.489482017643067, "grad_norm": 0.4827781021595001, "learning_rate": 6.059880872929272e-06, "loss": 0.5449, "step": 9924 }, { "epoch": 4.489934403981, "grad_norm": 0.4555209279060364, "learning_rate": 6.05916476334645e-06, "loss": 0.4673, "step": 9925 }, { "epoch": 4.490386790318932, "grad_norm": 0.45220455527305603, "learning_rate": 6.058448631016045e-06, "loss": 0.4637, "step": 9926 }, { "epoch": 4.4908391766568645, "grad_norm": 0.46861788630485535, "learning_rate": 6.057732475953439e-06, "loss": 0.5196, "step": 9927 }, { "epoch": 4.491291562994798, "grad_norm": 0.41119879484176636, "learning_rate": 6.057016298174011e-06, "loss": 0.4276, "step": 9928 }, { "epoch": 4.49174394933273, "grad_norm": 0.4965476095676422, "learning_rate": 6.056300097693142e-06, "loss": 0.5207, "step": 9929 }, { "epoch": 4.492196335670663, "grad_norm": 0.49483349919319153, "learning_rate": 6.055583874526215e-06, "loss": 0.4913, "step": 9930 }, { "epoch": 4.492648722008595, "grad_norm": 0.45022305846214294, "learning_rate": 6.0548676286886125e-06, "loss": 0.4933, "step": 9931 }, { "epoch": 4.4931011083465275, "grad_norm": 0.5027831792831421, "learning_rate": 6.054151360195715e-06, "loss": 0.5355, "step": 9932 }, { "epoch": 4.493553494684461, "grad_norm": 0.450530469417572, "learning_rate": 6.053435069062907e-06, "loss": 0.4173, "step": 9933 }, { "epoch": 4.494005881022393, "grad_norm": 0.45792558789253235, "learning_rate": 6.052718755305575e-06, "loss": 0.4732, "step": 9934 }, { "epoch": 4.494458267360326, "grad_norm": 0.49557384848594666, "learning_rate": 6.052002418939098e-06, "loss": 0.5156, "step": 9935 }, { "epoch": 4.494910653698258, "grad_norm": 0.5315952897071838, "learning_rate": 6.051286059978865e-06, "loss": 0.508, "step": 9936 }, { "epoch": 4.495363040036191, "grad_norm": 0.4995218813419342, "learning_rate": 6.050569678440258e-06, "loss": 0.4251, "step": 9937 }, { "epoch": 4.495815426374124, "grad_norm": 0.48501482605934143, "learning_rate": 6.049853274338664e-06, "loss": 0.4552, "step": 9938 }, { "epoch": 4.496267812712056, "grad_norm": 0.5237612128257751, "learning_rate": 6.049136847689471e-06, "loss": 0.4542, "step": 9939 }, { "epoch": 4.496720199049989, "grad_norm": 0.5895063877105713, "learning_rate": 6.048420398508062e-06, "loss": 0.5127, "step": 9940 }, { "epoch": 4.497172585387921, "grad_norm": 0.5569533705711365, "learning_rate": 6.047703926809827e-06, "loss": 0.4526, "step": 9941 }, { "epoch": 4.497624971725854, "grad_norm": 0.5760993361473083, "learning_rate": 6.046987432610152e-06, "loss": 0.4547, "step": 9942 }, { "epoch": 4.498077358063786, "grad_norm": 0.15529276430606842, "learning_rate": 6.046270915924426e-06, "loss": 1.2998, "step": 9943 }, { "epoch": 4.498529744401719, "grad_norm": 0.2560938596725464, "learning_rate": 6.045554376768038e-06, "loss": 0.5252, "step": 9944 }, { "epoch": 4.498982130739652, "grad_norm": 0.2793765068054199, "learning_rate": 6.044837815156377e-06, "loss": 0.5314, "step": 9945 }, { "epoch": 4.499434517077584, "grad_norm": 0.32594427466392517, "learning_rate": 6.04412123110483e-06, "loss": 0.6402, "step": 9946 }, { "epoch": 4.499886903415517, "grad_norm": 0.32826000452041626, "learning_rate": 6.043404624628789e-06, "loss": 0.5202, "step": 9947 }, { "epoch": 4.500339289753449, "grad_norm": 0.32133617997169495, "learning_rate": 6.0426879957436445e-06, "loss": 0.6166, "step": 9948 }, { "epoch": 4.500791676091382, "grad_norm": 0.3481982946395874, "learning_rate": 6.0419713444647875e-06, "loss": 0.586, "step": 9949 }, { "epoch": 4.501244062429315, "grad_norm": 0.30087393522262573, "learning_rate": 6.041254670807609e-06, "loss": 0.4637, "step": 9950 }, { "epoch": 4.501696448767247, "grad_norm": 0.366440087556839, "learning_rate": 6.040537974787502e-06, "loss": 0.5981, "step": 9951 }, { "epoch": 4.50214883510518, "grad_norm": 0.3659916818141937, "learning_rate": 6.039821256419857e-06, "loss": 0.5033, "step": 9952 }, { "epoch": 4.502601221443112, "grad_norm": 0.407204806804657, "learning_rate": 6.039104515720069e-06, "loss": 0.6231, "step": 9953 }, { "epoch": 4.503053607781045, "grad_norm": 0.41534852981567383, "learning_rate": 6.03838775270353e-06, "loss": 0.5746, "step": 9954 }, { "epoch": 4.503505994118978, "grad_norm": 0.38312193751335144, "learning_rate": 6.037670967385633e-06, "loss": 0.4964, "step": 9955 }, { "epoch": 4.50395838045691, "grad_norm": 0.40562063455581665, "learning_rate": 6.036954159781775e-06, "loss": 0.7106, "step": 9956 }, { "epoch": 4.504410766794843, "grad_norm": 0.46263983845710754, "learning_rate": 6.0362373299073494e-06, "loss": 0.5715, "step": 9957 }, { "epoch": 4.504863153132775, "grad_norm": 0.42351582646369934, "learning_rate": 6.035520477777751e-06, "loss": 0.6044, "step": 9958 }, { "epoch": 4.505315539470708, "grad_norm": 0.43251848220825195, "learning_rate": 6.034803603408376e-06, "loss": 0.4967, "step": 9959 }, { "epoch": 4.505767925808641, "grad_norm": 0.4104975163936615, "learning_rate": 6.034086706814621e-06, "loss": 0.5542, "step": 9960 }, { "epoch": 4.506220312146573, "grad_norm": 0.44554805755615234, "learning_rate": 6.033369788011881e-06, "loss": 0.5452, "step": 9961 }, { "epoch": 4.506672698484506, "grad_norm": 0.4018062651157379, "learning_rate": 6.032652847015555e-06, "loss": 0.4826, "step": 9962 }, { "epoch": 4.507125084822438, "grad_norm": 0.5101608633995056, "learning_rate": 6.0319358838410415e-06, "loss": 0.6936, "step": 9963 }, { "epoch": 4.507577471160371, "grad_norm": 0.41784316301345825, "learning_rate": 6.031218898503737e-06, "loss": 0.5046, "step": 9964 }, { "epoch": 4.508029857498304, "grad_norm": 0.46833109855651855, "learning_rate": 6.030501891019039e-06, "loss": 0.5952, "step": 9965 }, { "epoch": 4.508482243836236, "grad_norm": 0.4071108102798462, "learning_rate": 6.02978486140235e-06, "loss": 0.4764, "step": 9966 }, { "epoch": 4.508934630174169, "grad_norm": 0.4181317090988159, "learning_rate": 6.029067809669067e-06, "loss": 0.5005, "step": 9967 }, { "epoch": 4.509387016512101, "grad_norm": 0.44984176754951477, "learning_rate": 6.028350735834592e-06, "loss": 0.5435, "step": 9968 }, { "epoch": 4.509839402850034, "grad_norm": 0.40992942452430725, "learning_rate": 6.027633639914322e-06, "loss": 0.5073, "step": 9969 }, { "epoch": 4.510291789187966, "grad_norm": 0.42489197850227356, "learning_rate": 6.0269165219236614e-06, "loss": 0.4356, "step": 9970 }, { "epoch": 4.5107441755258995, "grad_norm": 0.4358978569507599, "learning_rate": 6.026199381878011e-06, "loss": 0.5052, "step": 9971 }, { "epoch": 4.511196561863832, "grad_norm": 0.5026026964187622, "learning_rate": 6.025482219792771e-06, "loss": 0.5277, "step": 9972 }, { "epoch": 4.511648948201764, "grad_norm": 0.4321710467338562, "learning_rate": 6.024765035683346e-06, "loss": 0.4602, "step": 9973 }, { "epoch": 4.512101334539697, "grad_norm": 0.4897453486919403, "learning_rate": 6.024047829565139e-06, "loss": 0.6596, "step": 9974 }, { "epoch": 4.512553720877629, "grad_norm": 0.46743881702423096, "learning_rate": 6.0233306014535505e-06, "loss": 0.5489, "step": 9975 }, { "epoch": 4.513006107215562, "grad_norm": 0.4612450897693634, "learning_rate": 6.022613351363987e-06, "loss": 0.473, "step": 9976 }, { "epoch": 4.513458493553495, "grad_norm": 0.4425530731678009, "learning_rate": 6.021896079311852e-06, "loss": 0.47, "step": 9977 }, { "epoch": 4.513910879891427, "grad_norm": 0.4823836088180542, "learning_rate": 6.0211787853125506e-06, "loss": 0.5607, "step": 9978 }, { "epoch": 4.51436326622936, "grad_norm": 0.4976250231266022, "learning_rate": 6.0204614693814885e-06, "loss": 0.5182, "step": 9979 }, { "epoch": 4.514815652567292, "grad_norm": 0.49324649572372437, "learning_rate": 6.01974413153407e-06, "loss": 0.5431, "step": 9980 }, { "epoch": 4.515268038905225, "grad_norm": 0.4537407159805298, "learning_rate": 6.019026771785702e-06, "loss": 0.3854, "step": 9981 }, { "epoch": 4.515720425243158, "grad_norm": 0.44208309054374695, "learning_rate": 6.018309390151791e-06, "loss": 0.4671, "step": 9982 }, { "epoch": 4.51617281158109, "grad_norm": 0.47942155599594116, "learning_rate": 6.017591986647744e-06, "loss": 0.5214, "step": 9983 }, { "epoch": 4.516625197919023, "grad_norm": 0.5257421135902405, "learning_rate": 6.016874561288969e-06, "loss": 0.5369, "step": 9984 }, { "epoch": 4.517077584256955, "grad_norm": 0.5314010977745056, "learning_rate": 6.016157114090875e-06, "loss": 0.5295, "step": 9985 }, { "epoch": 4.517529970594888, "grad_norm": 0.48702070116996765, "learning_rate": 6.015439645068868e-06, "loss": 0.4578, "step": 9986 }, { "epoch": 4.517982356932821, "grad_norm": 0.4990609288215637, "learning_rate": 6.014722154238359e-06, "loss": 0.4409, "step": 9987 }, { "epoch": 4.5184347432707535, "grad_norm": 0.5921383500099182, "learning_rate": 6.014004641614758e-06, "loss": 0.5479, "step": 9988 }, { "epoch": 4.518887129608686, "grad_norm": 0.6251603364944458, "learning_rate": 6.013287107213474e-06, "loss": 0.6558, "step": 9989 }, { "epoch": 4.519339515946618, "grad_norm": 0.557854175567627, "learning_rate": 6.012569551049917e-06, "loss": 0.4698, "step": 9990 }, { "epoch": 4.519791902284551, "grad_norm": 0.6268952488899231, "learning_rate": 6.011851973139499e-06, "loss": 0.527, "step": 9991 }, { "epoch": 4.520244288622484, "grad_norm": 0.8573987483978271, "learning_rate": 6.01113437349763e-06, "loss": 0.6258, "step": 9992 }, { "epoch": 4.5206966749604165, "grad_norm": 0.13928444683551788, "learning_rate": 6.010416752139723e-06, "loss": 1.309, "step": 9993 }, { "epoch": 4.521149061298349, "grad_norm": 0.2732396721839905, "learning_rate": 6.009699109081188e-06, "loss": 1.2097, "step": 9994 }, { "epoch": 4.521601447636281, "grad_norm": 0.3134301006793976, "learning_rate": 6.0089814443374406e-06, "loss": 0.6978, "step": 9995 }, { "epoch": 4.522053833974214, "grad_norm": 0.34808140993118286, "learning_rate": 6.008263757923893e-06, "loss": 0.6596, "step": 9996 }, { "epoch": 4.522506220312146, "grad_norm": 0.3715035915374756, "learning_rate": 6.007546049855959e-06, "loss": 0.8632, "step": 9997 }, { "epoch": 4.5229586066500795, "grad_norm": 0.3715048134326935, "learning_rate": 6.006828320149053e-06, "loss": 0.6479, "step": 9998 }, { "epoch": 4.523410992988012, "grad_norm": 0.3722665309906006, "learning_rate": 6.006110568818588e-06, "loss": 0.6437, "step": 9999 }, { "epoch": 4.523863379325944, "grad_norm": 0.3575993776321411, "learning_rate": 6.005392795879982e-06, "loss": 0.5238, "step": 10000 }, { "epoch": 4.523863379325944, "eval_loss": 0.5891706347465515, "eval_runtime": 25.6117, "eval_samples_per_second": 29.049, "eval_steps_per_second": 7.262, "step": 10000 }, { "epoch": 4.524315765663877, "grad_norm": 0.34885239601135254, "learning_rate": 6.004675001348648e-06, "loss": 0.5604, "step": 10001 }, { "epoch": 4.524768152001809, "grad_norm": 0.37109819054603577, "learning_rate": 6.003957185240002e-06, "loss": 0.5917, "step": 10002 }, { "epoch": 4.525220538339742, "grad_norm": 0.3513300120830536, "learning_rate": 6.003239347569463e-06, "loss": 0.5848, "step": 10003 }, { "epoch": 4.525672924677675, "grad_norm": 0.3374817371368408, "learning_rate": 6.002521488352444e-06, "loss": 0.411, "step": 10004 }, { "epoch": 4.5261253110156074, "grad_norm": 0.38370418548583984, "learning_rate": 6.001803607604366e-06, "loss": 0.5386, "step": 10005 }, { "epoch": 4.52657769735354, "grad_norm": 0.3966556787490845, "learning_rate": 6.0010857053406455e-06, "loss": 0.5748, "step": 10006 }, { "epoch": 4.527030083691472, "grad_norm": 0.3322100043296814, "learning_rate": 6.0003677815767e-06, "loss": 0.5133, "step": 10007 }, { "epoch": 4.527482470029405, "grad_norm": 0.39096397161483765, "learning_rate": 5.99964983632795e-06, "loss": 0.5698, "step": 10008 }, { "epoch": 4.527934856367338, "grad_norm": 0.39198189973831177, "learning_rate": 5.9989318696098135e-06, "loss": 0.5363, "step": 10009 }, { "epoch": 4.5283872427052705, "grad_norm": 0.42353782057762146, "learning_rate": 5.99821388143771e-06, "loss": 0.49, "step": 10010 }, { "epoch": 4.528839629043203, "grad_norm": 0.45940884947776794, "learning_rate": 5.9974958718270605e-06, "loss": 0.5772, "step": 10011 }, { "epoch": 4.529292015381135, "grad_norm": 0.43306952714920044, "learning_rate": 5.996777840793285e-06, "loss": 0.5344, "step": 10012 }, { "epoch": 4.529744401719068, "grad_norm": 0.41102859377861023, "learning_rate": 5.996059788351807e-06, "loss": 0.575, "step": 10013 }, { "epoch": 4.530196788057001, "grad_norm": 0.41413286328315735, "learning_rate": 5.995341714518044e-06, "loss": 0.5058, "step": 10014 }, { "epoch": 4.5306491743949335, "grad_norm": 0.451887845993042, "learning_rate": 5.99462361930742e-06, "loss": 0.5224, "step": 10015 }, { "epoch": 4.531101560732866, "grad_norm": 0.41956183314323425, "learning_rate": 5.993905502735358e-06, "loss": 0.4558, "step": 10016 }, { "epoch": 4.531553947070798, "grad_norm": 0.445143461227417, "learning_rate": 5.99318736481728e-06, "loss": 0.5754, "step": 10017 }, { "epoch": 4.532006333408731, "grad_norm": 0.4652597904205322, "learning_rate": 5.99246920556861e-06, "loss": 0.5458, "step": 10018 }, { "epoch": 4.532458719746664, "grad_norm": 0.4612543284893036, "learning_rate": 5.991751025004771e-06, "loss": 0.534, "step": 10019 }, { "epoch": 4.532911106084597, "grad_norm": 0.44210606813430786, "learning_rate": 5.991032823141188e-06, "loss": 0.5114, "step": 10020 }, { "epoch": 4.533363492422529, "grad_norm": 0.45035359263420105, "learning_rate": 5.990314599993285e-06, "loss": 0.5473, "step": 10021 }, { "epoch": 4.533815878760461, "grad_norm": 0.4951249957084656, "learning_rate": 5.989596355576489e-06, "loss": 0.4671, "step": 10022 }, { "epoch": 4.534268265098394, "grad_norm": 0.5251184701919556, "learning_rate": 5.988878089906224e-06, "loss": 0.6467, "step": 10023 }, { "epoch": 4.534720651436326, "grad_norm": 0.4804021418094635, "learning_rate": 5.988159802997916e-06, "loss": 0.5033, "step": 10024 }, { "epoch": 4.535173037774259, "grad_norm": 0.48714062571525574, "learning_rate": 5.9874414948669946e-06, "loss": 0.5428, "step": 10025 }, { "epoch": 4.535625424112192, "grad_norm": 0.5311936140060425, "learning_rate": 5.986723165528883e-06, "loss": 0.5587, "step": 10026 }, { "epoch": 4.5360778104501245, "grad_norm": 0.5290337800979614, "learning_rate": 5.986004814999009e-06, "loss": 0.5495, "step": 10027 }, { "epoch": 4.536530196788057, "grad_norm": 0.47442427277565, "learning_rate": 5.9852864432928025e-06, "loss": 0.4818, "step": 10028 }, { "epoch": 4.536982583125989, "grad_norm": 0.5636610984802246, "learning_rate": 5.984568050425691e-06, "loss": 0.6318, "step": 10029 }, { "epoch": 4.537434969463922, "grad_norm": 0.5188373923301697, "learning_rate": 5.9838496364131025e-06, "loss": 0.5299, "step": 10030 }, { "epoch": 4.537887355801855, "grad_norm": 0.5258698463439941, "learning_rate": 5.983131201270469e-06, "loss": 0.5224, "step": 10031 }, { "epoch": 4.5383397421397875, "grad_norm": 0.5125499367713928, "learning_rate": 5.982412745013218e-06, "loss": 0.4775, "step": 10032 }, { "epoch": 4.53879212847772, "grad_norm": 0.5663819909095764, "learning_rate": 5.981694267656779e-06, "loss": 0.5337, "step": 10033 }, { "epoch": 4.539244514815652, "grad_norm": 0.5003456473350525, "learning_rate": 5.980975769216585e-06, "loss": 0.3984, "step": 10034 }, { "epoch": 4.539696901153585, "grad_norm": 0.5273736715316772, "learning_rate": 5.980257249708066e-06, "loss": 0.5058, "step": 10035 }, { "epoch": 4.540149287491518, "grad_norm": 0.531892716884613, "learning_rate": 5.979538709146654e-06, "loss": 0.5687, "step": 10036 }, { "epoch": 4.540601673829451, "grad_norm": 0.5757437944412231, "learning_rate": 5.97882014754778e-06, "loss": 0.5458, "step": 10037 }, { "epoch": 4.541054060167383, "grad_norm": 0.5137304663658142, "learning_rate": 5.978101564926877e-06, "loss": 0.3921, "step": 10038 }, { "epoch": 4.541506446505315, "grad_norm": 0.6627729535102844, "learning_rate": 5.977382961299378e-06, "loss": 0.4495, "step": 10039 }, { "epoch": 4.541958832843248, "grad_norm": 0.6003199219703674, "learning_rate": 5.9766643366807164e-06, "loss": 0.5069, "step": 10040 }, { "epoch": 4.542411219181181, "grad_norm": 0.5856605172157288, "learning_rate": 5.975945691086327e-06, "loss": 0.4731, "step": 10041 }, { "epoch": 4.542863605519114, "grad_norm": 0.6295278668403625, "learning_rate": 5.975227024531642e-06, "loss": 0.5208, "step": 10042 }, { "epoch": 4.543315991857046, "grad_norm": 0.1713848114013672, "learning_rate": 5.974508337032097e-06, "loss": 1.261, "step": 10043 }, { "epoch": 4.5437683781949785, "grad_norm": 0.333859920501709, "learning_rate": 5.9737896286031285e-06, "loss": 0.7344, "step": 10044 }, { "epoch": 4.544220764532911, "grad_norm": 0.31355175375938416, "learning_rate": 5.9730708992601704e-06, "loss": 0.6276, "step": 10045 }, { "epoch": 4.544673150870843, "grad_norm": 0.33160412311553955, "learning_rate": 5.97235214901866e-06, "loss": 0.569, "step": 10046 }, { "epoch": 4.545125537208777, "grad_norm": 0.3488249182701111, "learning_rate": 5.971633377894034e-06, "loss": 0.7131, "step": 10047 }, { "epoch": 4.545577923546709, "grad_norm": 0.3770263195037842, "learning_rate": 5.970914585901727e-06, "loss": 0.6904, "step": 10048 }, { "epoch": 4.5460303098846415, "grad_norm": 0.3531179428100586, "learning_rate": 5.970195773057179e-06, "loss": 0.6231, "step": 10049 }, { "epoch": 4.546482696222574, "grad_norm": 0.39356639981269836, "learning_rate": 5.9694769393758265e-06, "loss": 0.6805, "step": 10050 }, { "epoch": 4.546935082560506, "grad_norm": 0.3521098792552948, "learning_rate": 5.968758084873108e-06, "loss": 0.6162, "step": 10051 }, { "epoch": 4.547387468898439, "grad_norm": 0.3627796173095703, "learning_rate": 5.968039209564463e-06, "loss": 0.4937, "step": 10052 }, { "epoch": 4.547839855236372, "grad_norm": 0.4002125561237335, "learning_rate": 5.967320313465331e-06, "loss": 0.6022, "step": 10053 }, { "epoch": 4.548292241574305, "grad_norm": 0.43219614028930664, "learning_rate": 5.96660139659115e-06, "loss": 0.6264, "step": 10054 }, { "epoch": 4.548744627912237, "grad_norm": 0.4342018663883209, "learning_rate": 5.965882458957362e-06, "loss": 0.5781, "step": 10055 }, { "epoch": 4.549197014250169, "grad_norm": 0.40671291947364807, "learning_rate": 5.965163500579406e-06, "loss": 0.5817, "step": 10056 }, { "epoch": 4.549649400588102, "grad_norm": 0.40924713015556335, "learning_rate": 5.964444521472724e-06, "loss": 0.5792, "step": 10057 }, { "epoch": 4.550101786926035, "grad_norm": 0.39717599749565125, "learning_rate": 5.963725521652758e-06, "loss": 0.4724, "step": 10058 }, { "epoch": 4.550554173263968, "grad_norm": 0.40348923206329346, "learning_rate": 5.963006501134948e-06, "loss": 0.5601, "step": 10059 }, { "epoch": 4.5510065596019, "grad_norm": 0.45137184858322144, "learning_rate": 5.962287459934737e-06, "loss": 0.5804, "step": 10060 }, { "epoch": 4.5514589459398325, "grad_norm": 0.4677363932132721, "learning_rate": 5.9615683980675676e-06, "loss": 0.4817, "step": 10061 }, { "epoch": 4.551911332277765, "grad_norm": 0.42262643575668335, "learning_rate": 5.960849315548884e-06, "loss": 0.484, "step": 10062 }, { "epoch": 4.552363718615698, "grad_norm": 0.4415736198425293, "learning_rate": 5.960130212394129e-06, "loss": 0.5186, "step": 10063 }, { "epoch": 4.552816104953631, "grad_norm": 0.5272606015205383, "learning_rate": 5.959411088618749e-06, "loss": 0.6883, "step": 10064 }, { "epoch": 4.553268491291563, "grad_norm": 0.391618937253952, "learning_rate": 5.958691944238184e-06, "loss": 0.4274, "step": 10065 }, { "epoch": 4.5537208776294955, "grad_norm": 0.5056571960449219, "learning_rate": 5.9579727792678835e-06, "loss": 0.605, "step": 10066 }, { "epoch": 4.554173263967428, "grad_norm": 0.45232945680618286, "learning_rate": 5.95725359372329e-06, "loss": 0.5075, "step": 10067 }, { "epoch": 4.554625650305361, "grad_norm": 0.45145082473754883, "learning_rate": 5.956534387619851e-06, "loss": 0.603, "step": 10068 }, { "epoch": 4.555078036643294, "grad_norm": 0.4623699486255646, "learning_rate": 5.955815160973013e-06, "loss": 0.515, "step": 10069 }, { "epoch": 4.555530422981226, "grad_norm": 0.5196733474731445, "learning_rate": 5.9550959137982225e-06, "loss": 0.5383, "step": 10070 }, { "epoch": 4.5559828093191586, "grad_norm": 0.44723188877105713, "learning_rate": 5.954376646110924e-06, "loss": 0.4815, "step": 10071 }, { "epoch": 4.556435195657091, "grad_norm": 0.47676733136177063, "learning_rate": 5.953657357926569e-06, "loss": 0.5541, "step": 10072 }, { "epoch": 4.556887581995023, "grad_norm": 0.4349079132080078, "learning_rate": 5.952938049260604e-06, "loss": 0.456, "step": 10073 }, { "epoch": 4.557339968332956, "grad_norm": 0.47196245193481445, "learning_rate": 5.952218720128477e-06, "loss": 0.4874, "step": 10074 }, { "epoch": 4.557792354670889, "grad_norm": 0.47847989201545715, "learning_rate": 5.9514993705456355e-06, "loss": 0.4833, "step": 10075 }, { "epoch": 4.558244741008822, "grad_norm": 0.49476924538612366, "learning_rate": 5.950780000527534e-06, "loss": 0.5643, "step": 10076 }, { "epoch": 4.558697127346754, "grad_norm": 0.49198997020721436, "learning_rate": 5.950060610089618e-06, "loss": 0.5248, "step": 10077 }, { "epoch": 4.5591495136846865, "grad_norm": 0.5438183546066284, "learning_rate": 5.949341199247338e-06, "loss": 0.5774, "step": 10078 }, { "epoch": 4.559601900022619, "grad_norm": 0.4206569790840149, "learning_rate": 5.948621768016147e-06, "loss": 0.408, "step": 10079 }, { "epoch": 4.560054286360552, "grad_norm": 0.4680294096469879, "learning_rate": 5.947902316411494e-06, "loss": 0.5417, "step": 10080 }, { "epoch": 4.560506672698485, "grad_norm": 0.5016375184059143, "learning_rate": 5.947182844448833e-06, "loss": 0.4777, "step": 10081 }, { "epoch": 4.560959059036417, "grad_norm": 0.5569865703582764, "learning_rate": 5.946463352143614e-06, "loss": 0.5929, "step": 10082 }, { "epoch": 4.5614114453743495, "grad_norm": 0.4748811423778534, "learning_rate": 5.94574383951129e-06, "loss": 0.4509, "step": 10083 }, { "epoch": 4.561863831712282, "grad_norm": 0.5621930360794067, "learning_rate": 5.945024306567313e-06, "loss": 0.5469, "step": 10084 }, { "epoch": 4.562316218050215, "grad_norm": 0.5036601424217224, "learning_rate": 5.944304753327138e-06, "loss": 0.5092, "step": 10085 }, { "epoch": 4.562768604388148, "grad_norm": 0.6028850078582764, "learning_rate": 5.943585179806217e-06, "loss": 0.5845, "step": 10086 }, { "epoch": 4.56322099072608, "grad_norm": 0.5713685154914856, "learning_rate": 5.942865586020006e-06, "loss": 0.4668, "step": 10087 }, { "epoch": 4.5636733770640125, "grad_norm": 0.5975900888442993, "learning_rate": 5.9421459719839595e-06, "loss": 0.5142, "step": 10088 }, { "epoch": 4.564125763401945, "grad_norm": 0.6559189558029175, "learning_rate": 5.941426337713531e-06, "loss": 0.5469, "step": 10089 }, { "epoch": 4.564578149739878, "grad_norm": 0.680534839630127, "learning_rate": 5.940706683224177e-06, "loss": 0.5193, "step": 10090 }, { "epoch": 4.565030536077811, "grad_norm": 0.6219462156295776, "learning_rate": 5.939987008531352e-06, "loss": 0.496, "step": 10091 }, { "epoch": 4.565482922415743, "grad_norm": 0.7197852730751038, "learning_rate": 5.9392673136505174e-06, "loss": 0.579, "step": 10092 }, { "epoch": 4.565935308753676, "grad_norm": 0.17401224374771118, "learning_rate": 5.938547598597124e-06, "loss": 0.9028, "step": 10093 }, { "epoch": 4.566387695091608, "grad_norm": 0.28875526785850525, "learning_rate": 5.9378278633866325e-06, "loss": 0.5975, "step": 10094 }, { "epoch": 4.5668400814295405, "grad_norm": 0.33505871891975403, "learning_rate": 5.9371081080344984e-06, "loss": 0.584, "step": 10095 }, { "epoch": 4.567292467767474, "grad_norm": 0.3374595046043396, "learning_rate": 5.936388332556182e-06, "loss": 0.5961, "step": 10096 }, { "epoch": 4.567744854105406, "grad_norm": 0.3588670492172241, "learning_rate": 5.93566853696714e-06, "loss": 0.6442, "step": 10097 }, { "epoch": 4.568197240443339, "grad_norm": 0.3952876031398773, "learning_rate": 5.9349487212828325e-06, "loss": 0.625, "step": 10098 }, { "epoch": 4.568649626781271, "grad_norm": 0.36095520853996277, "learning_rate": 5.934228885518718e-06, "loss": 0.5944, "step": 10099 }, { "epoch": 4.5691020131192035, "grad_norm": 0.3455774188041687, "learning_rate": 5.933509029690258e-06, "loss": 0.623, "step": 10100 }, { "epoch": 4.569554399457136, "grad_norm": 0.3871525824069977, "learning_rate": 5.932789153812911e-06, "loss": 0.6109, "step": 10101 }, { "epoch": 4.570006785795069, "grad_norm": 0.3682929575443268, "learning_rate": 5.9320692579021385e-06, "loss": 0.5303, "step": 10102 }, { "epoch": 4.570459172133002, "grad_norm": 0.39240312576293945, "learning_rate": 5.931349341973402e-06, "loss": 0.4883, "step": 10103 }, { "epoch": 4.570911558470934, "grad_norm": 0.4036731719970703, "learning_rate": 5.930629406042164e-06, "loss": 0.5467, "step": 10104 }, { "epoch": 4.5713639448088665, "grad_norm": 0.3831310570240021, "learning_rate": 5.9299094501238844e-06, "loss": 0.4975, "step": 10105 }, { "epoch": 4.571816331146799, "grad_norm": 0.39943572878837585, "learning_rate": 5.929189474234025e-06, "loss": 0.5168, "step": 10106 }, { "epoch": 4.572268717484732, "grad_norm": 0.4538988769054413, "learning_rate": 5.92846947838805e-06, "loss": 0.6345, "step": 10107 }, { "epoch": 4.572721103822665, "grad_norm": 0.4047839641571045, "learning_rate": 5.927749462601424e-06, "loss": 0.519, "step": 10108 }, { "epoch": 4.573173490160597, "grad_norm": 0.3976536691188812, "learning_rate": 5.927029426889609e-06, "loss": 0.519, "step": 10109 }, { "epoch": 4.57362587649853, "grad_norm": 0.4398157298564911, "learning_rate": 5.9263093712680695e-06, "loss": 0.6124, "step": 10110 }, { "epoch": 4.574078262836462, "grad_norm": 0.40613114833831787, "learning_rate": 5.92558929575227e-06, "loss": 0.5102, "step": 10111 }, { "epoch": 4.574530649174395, "grad_norm": 0.43361690640449524, "learning_rate": 5.924869200357676e-06, "loss": 0.6681, "step": 10112 }, { "epoch": 4.574983035512328, "grad_norm": 0.41694164276123047, "learning_rate": 5.9241490850997515e-06, "loss": 0.4978, "step": 10113 }, { "epoch": 4.57543542185026, "grad_norm": 0.4589563012123108, "learning_rate": 5.923428949993964e-06, "loss": 0.5503, "step": 10114 }, { "epoch": 4.575887808188193, "grad_norm": 0.39144396781921387, "learning_rate": 5.92270879505578e-06, "loss": 0.4469, "step": 10115 }, { "epoch": 4.576340194526125, "grad_norm": 0.48813050985336304, "learning_rate": 5.921988620300666e-06, "loss": 0.6121, "step": 10116 }, { "epoch": 4.576792580864058, "grad_norm": 0.47286364436149597, "learning_rate": 5.921268425744087e-06, "loss": 0.4908, "step": 10117 }, { "epoch": 4.577244967201991, "grad_norm": 0.5156825184822083, "learning_rate": 5.920548211401512e-06, "loss": 0.6916, "step": 10118 }, { "epoch": 4.577697353539923, "grad_norm": 0.47133737802505493, "learning_rate": 5.9198279772884094e-06, "loss": 0.4869, "step": 10119 }, { "epoch": 4.578149739877856, "grad_norm": 0.5232285857200623, "learning_rate": 5.919107723420248e-06, "loss": 0.6617, "step": 10120 }, { "epoch": 4.578602126215788, "grad_norm": 0.49059203267097473, "learning_rate": 5.918387449812494e-06, "loss": 0.5847, "step": 10121 }, { "epoch": 4.5790545125537205, "grad_norm": 0.44826212525367737, "learning_rate": 5.91766715648062e-06, "loss": 0.471, "step": 10122 }, { "epoch": 4.579506898891654, "grad_norm": 0.486880898475647, "learning_rate": 5.916946843440094e-06, "loss": 0.5461, "step": 10123 }, { "epoch": 4.579959285229586, "grad_norm": 0.5225138664245605, "learning_rate": 5.916226510706386e-06, "loss": 0.5527, "step": 10124 }, { "epoch": 4.580411671567519, "grad_norm": 0.49267348647117615, "learning_rate": 5.915506158294966e-06, "loss": 0.5446, "step": 10125 }, { "epoch": 4.580864057905451, "grad_norm": 0.46246665716171265, "learning_rate": 5.914785786221307e-06, "loss": 0.4498, "step": 10126 }, { "epoch": 4.581316444243384, "grad_norm": 0.547082245349884, "learning_rate": 5.914065394500879e-06, "loss": 0.5805, "step": 10127 }, { "epoch": 4.581768830581316, "grad_norm": 0.4884147644042969, "learning_rate": 5.913344983149154e-06, "loss": 0.471, "step": 10128 }, { "epoch": 4.582221216919249, "grad_norm": 0.4204467833042145, "learning_rate": 5.912624552181603e-06, "loss": 0.4045, "step": 10129 }, { "epoch": 4.582673603257182, "grad_norm": 0.5079936385154724, "learning_rate": 5.9119041016137e-06, "loss": 0.5627, "step": 10130 }, { "epoch": 4.583125989595114, "grad_norm": 0.5083721280097961, "learning_rate": 5.911183631460918e-06, "loss": 0.4904, "step": 10131 }, { "epoch": 4.583578375933047, "grad_norm": 0.4504120945930481, "learning_rate": 5.91046314173873e-06, "loss": 0.4677, "step": 10132 }, { "epoch": 4.584030762270979, "grad_norm": 0.5830187797546387, "learning_rate": 5.9097426324626104e-06, "loss": 0.6005, "step": 10133 }, { "epoch": 4.584483148608912, "grad_norm": 0.500262439250946, "learning_rate": 5.909022103648034e-06, "loss": 0.4762, "step": 10134 }, { "epoch": 4.584935534946845, "grad_norm": 0.563623309135437, "learning_rate": 5.908301555310474e-06, "loss": 0.5667, "step": 10135 }, { "epoch": 4.585387921284777, "grad_norm": 0.528846800327301, "learning_rate": 5.907580987465407e-06, "loss": 0.4914, "step": 10136 }, { "epoch": 4.58584030762271, "grad_norm": 0.5165499448776245, "learning_rate": 5.906860400128307e-06, "loss": 0.4311, "step": 10137 }, { "epoch": 4.586292693960642, "grad_norm": 0.550814151763916, "learning_rate": 5.9061397933146515e-06, "loss": 0.5335, "step": 10138 }, { "epoch": 4.586745080298575, "grad_norm": 0.5924322009086609, "learning_rate": 5.9054191670399164e-06, "loss": 0.5325, "step": 10139 }, { "epoch": 4.587197466636508, "grad_norm": 0.5872438549995422, "learning_rate": 5.9046985213195775e-06, "loss": 0.5318, "step": 10140 }, { "epoch": 4.58764985297444, "grad_norm": 0.700289785861969, "learning_rate": 5.903977856169113e-06, "loss": 0.5041, "step": 10141 }, { "epoch": 4.588102239312373, "grad_norm": 0.6801844835281372, "learning_rate": 5.903257171604001e-06, "loss": 0.4761, "step": 10142 }, { "epoch": 4.588554625650305, "grad_norm": 0.17123708128929138, "learning_rate": 5.902536467639719e-06, "loss": 1.1283, "step": 10143 }, { "epoch": 4.589007011988238, "grad_norm": 0.28115883469581604, "learning_rate": 5.901815744291746e-06, "loss": 0.5751, "step": 10144 }, { "epoch": 4.589459398326171, "grad_norm": 0.3305279612541199, "learning_rate": 5.90109500157556e-06, "loss": 0.639, "step": 10145 }, { "epoch": 4.589911784664103, "grad_norm": 0.29794061183929443, "learning_rate": 5.90037423950664e-06, "loss": 0.5443, "step": 10146 }, { "epoch": 4.590364171002036, "grad_norm": 0.3347508907318115, "learning_rate": 5.899653458100467e-06, "loss": 0.586, "step": 10147 }, { "epoch": 4.590816557339968, "grad_norm": 0.34328585863113403, "learning_rate": 5.8989326573725214e-06, "loss": 0.5749, "step": 10148 }, { "epoch": 4.591268943677901, "grad_norm": 0.4296923577785492, "learning_rate": 5.898211837338283e-06, "loss": 0.7891, "step": 10149 }, { "epoch": 4.591721330015833, "grad_norm": 0.35226738452911377, "learning_rate": 5.897490998013234e-06, "loss": 0.5615, "step": 10150 }, { "epoch": 4.592173716353766, "grad_norm": 0.4113437831401825, "learning_rate": 5.8967701394128526e-06, "loss": 0.5718, "step": 10151 }, { "epoch": 4.592626102691699, "grad_norm": 0.398546427488327, "learning_rate": 5.896049261552624e-06, "loss": 0.6308, "step": 10152 }, { "epoch": 4.593078489029631, "grad_norm": 0.3428436815738678, "learning_rate": 5.895328364448027e-06, "loss": 0.5113, "step": 10153 }, { "epoch": 4.593530875367564, "grad_norm": 0.43134886026382446, "learning_rate": 5.894607448114549e-06, "loss": 0.6332, "step": 10154 }, { "epoch": 4.593983261705496, "grad_norm": 0.33703041076660156, "learning_rate": 5.893886512567668e-06, "loss": 0.4405, "step": 10155 }, { "epoch": 4.594435648043429, "grad_norm": 0.37538814544677734, "learning_rate": 5.8931655578228706e-06, "loss": 0.5842, "step": 10156 }, { "epoch": 4.594888034381362, "grad_norm": 0.3950912058353424, "learning_rate": 5.892444583895641e-06, "loss": 0.5793, "step": 10157 }, { "epoch": 4.595340420719294, "grad_norm": 0.4132331609725952, "learning_rate": 5.891723590801462e-06, "loss": 0.5255, "step": 10158 }, { "epoch": 4.595792807057227, "grad_norm": 0.37824028730392456, "learning_rate": 5.891002578555818e-06, "loss": 0.5496, "step": 10159 }, { "epoch": 4.596245193395159, "grad_norm": 0.4205934405326843, "learning_rate": 5.890281547174196e-06, "loss": 0.596, "step": 10160 }, { "epoch": 4.5966975797330925, "grad_norm": 0.41115298867225647, "learning_rate": 5.88956049667208e-06, "loss": 0.5333, "step": 10161 }, { "epoch": 4.597149966071025, "grad_norm": 0.4336353838443756, "learning_rate": 5.888839427064955e-06, "loss": 0.5361, "step": 10162 }, { "epoch": 4.597602352408957, "grad_norm": 0.42018094658851624, "learning_rate": 5.888118338368309e-06, "loss": 0.5191, "step": 10163 }, { "epoch": 4.59805473874689, "grad_norm": 0.4162658751010895, "learning_rate": 5.88739723059763e-06, "loss": 0.4947, "step": 10164 }, { "epoch": 4.598507125084822, "grad_norm": 0.47793111205101013, "learning_rate": 5.886676103768401e-06, "loss": 0.5678, "step": 10165 }, { "epoch": 4.5989595114227555, "grad_norm": 0.4428809583187103, "learning_rate": 5.885954957896115e-06, "loss": 0.551, "step": 10166 }, { "epoch": 4.599411897760688, "grad_norm": 0.41978394985198975, "learning_rate": 5.885233792996255e-06, "loss": 0.4983, "step": 10167 }, { "epoch": 4.59986428409862, "grad_norm": 0.4423864781856537, "learning_rate": 5.884512609084312e-06, "loss": 0.5035, "step": 10168 }, { "epoch": 4.600316670436553, "grad_norm": 0.4128256142139435, "learning_rate": 5.883791406175775e-06, "loss": 0.4535, "step": 10169 }, { "epoch": 4.600769056774485, "grad_norm": 0.4276486337184906, "learning_rate": 5.883070184286132e-06, "loss": 0.4765, "step": 10170 }, { "epoch": 4.601221443112418, "grad_norm": 0.45964276790618896, "learning_rate": 5.8823489434308736e-06, "loss": 0.5802, "step": 10171 }, { "epoch": 4.601673829450351, "grad_norm": 0.45015600323677063, "learning_rate": 5.881627683625489e-06, "loss": 0.5287, "step": 10172 }, { "epoch": 4.602126215788283, "grad_norm": 0.46528565883636475, "learning_rate": 5.8809064048854694e-06, "loss": 0.5749, "step": 10173 }, { "epoch": 4.602578602126216, "grad_norm": 0.45137670636177063, "learning_rate": 5.880185107226304e-06, "loss": 0.5187, "step": 10174 }, { "epoch": 4.603030988464148, "grad_norm": 0.4885404706001282, "learning_rate": 5.8794637906634855e-06, "loss": 0.5041, "step": 10175 }, { "epoch": 4.603483374802081, "grad_norm": 0.5453711748123169, "learning_rate": 5.878742455212506e-06, "loss": 0.591, "step": 10176 }, { "epoch": 4.603935761140013, "grad_norm": 0.5408828854560852, "learning_rate": 5.878021100888856e-06, "loss": 0.5307, "step": 10177 }, { "epoch": 4.6043881474779464, "grad_norm": 0.5025967955589294, "learning_rate": 5.87729972770803e-06, "loss": 0.4982, "step": 10178 }, { "epoch": 4.604840533815879, "grad_norm": 0.44453686475753784, "learning_rate": 5.876578335685519e-06, "loss": 0.4662, "step": 10179 }, { "epoch": 4.605292920153811, "grad_norm": 0.5703954696655273, "learning_rate": 5.875856924836816e-06, "loss": 0.6275, "step": 10180 }, { "epoch": 4.605745306491744, "grad_norm": 0.4998542368412018, "learning_rate": 5.8751354951774165e-06, "loss": 0.4945, "step": 10181 }, { "epoch": 4.606197692829676, "grad_norm": 0.4444061815738678, "learning_rate": 5.874414046722814e-06, "loss": 0.3902, "step": 10182 }, { "epoch": 4.6066500791676095, "grad_norm": 0.48808160424232483, "learning_rate": 5.873692579488504e-06, "loss": 0.4727, "step": 10183 }, { "epoch": 4.607102465505542, "grad_norm": 0.5506872534751892, "learning_rate": 5.8729710934899785e-06, "loss": 0.5221, "step": 10184 }, { "epoch": 4.607554851843474, "grad_norm": 0.5296390056610107, "learning_rate": 5.872249588742734e-06, "loss": 0.5077, "step": 10185 }, { "epoch": 4.608007238181407, "grad_norm": 0.5618607997894287, "learning_rate": 5.871528065262266e-06, "loss": 0.5203, "step": 10186 }, { "epoch": 4.608459624519339, "grad_norm": 0.6189718842506409, "learning_rate": 5.870806523064072e-06, "loss": 0.5536, "step": 10187 }, { "epoch": 4.6089120108572725, "grad_norm": 0.5870020985603333, "learning_rate": 5.870084962163648e-06, "loss": 0.5305, "step": 10188 }, { "epoch": 4.609364397195205, "grad_norm": 0.5995709896087646, "learning_rate": 5.86936338257649e-06, "loss": 0.5242, "step": 10189 }, { "epoch": 4.609816783533137, "grad_norm": 0.6366062164306641, "learning_rate": 5.868641784318096e-06, "loss": 0.4705, "step": 10190 }, { "epoch": 4.61026916987107, "grad_norm": 0.6728687882423401, "learning_rate": 5.867920167403965e-06, "loss": 0.5594, "step": 10191 }, { "epoch": 4.610721556209002, "grad_norm": 0.6991404294967651, "learning_rate": 5.86719853184959e-06, "loss": 0.5571, "step": 10192 }, { "epoch": 4.611173942546936, "grad_norm": 0.1688733845949173, "learning_rate": 5.866476877670476e-06, "loss": 1.1502, "step": 10193 }, { "epoch": 4.611626328884868, "grad_norm": 0.2788890600204468, "learning_rate": 5.86575520488212e-06, "loss": 0.8393, "step": 10194 }, { "epoch": 4.6120787152228, "grad_norm": 0.2921971380710602, "learning_rate": 5.8650335135000185e-06, "loss": 0.5548, "step": 10195 }, { "epoch": 4.612531101560733, "grad_norm": 0.4056224226951599, "learning_rate": 5.864311803539673e-06, "loss": 0.522, "step": 10196 }, { "epoch": 4.612983487898665, "grad_norm": 0.32986748218536377, "learning_rate": 5.863590075016583e-06, "loss": 0.5213, "step": 10197 }, { "epoch": 4.613435874236598, "grad_norm": 0.35246673226356506, "learning_rate": 5.86286832794625e-06, "loss": 0.6145, "step": 10198 }, { "epoch": 4.61388826057453, "grad_norm": 0.371656209230423, "learning_rate": 5.862146562344175e-06, "loss": 0.5394, "step": 10199 }, { "epoch": 4.6143406469124635, "grad_norm": 0.40422093868255615, "learning_rate": 5.861424778225858e-06, "loss": 0.691, "step": 10200 }, { "epoch": 4.6143406469124635, "eval_loss": 0.5910388231277466, "eval_runtime": 25.8631, "eval_samples_per_second": 28.767, "eval_steps_per_second": 7.192, "step": 10200 }, { "epoch": 4.614793033250396, "grad_norm": 0.3874368965625763, "learning_rate": 5.8607029756068025e-06, "loss": 0.5913, "step": 10201 }, { "epoch": 4.615245419588328, "grad_norm": 0.35648396611213684, "learning_rate": 5.85998115450251e-06, "loss": 0.5081, "step": 10202 }, { "epoch": 4.615697805926261, "grad_norm": 0.38960209488868713, "learning_rate": 5.859259314928481e-06, "loss": 0.6133, "step": 10203 }, { "epoch": 4.616150192264193, "grad_norm": 0.37077754735946655, "learning_rate": 5.858537456900221e-06, "loss": 0.516, "step": 10204 }, { "epoch": 4.6166025786021265, "grad_norm": 0.39913925528526306, "learning_rate": 5.857815580433233e-06, "loss": 0.6549, "step": 10205 }, { "epoch": 4.617054964940059, "grad_norm": 0.38054150342941284, "learning_rate": 5.8570936855430185e-06, "loss": 0.5666, "step": 10206 }, { "epoch": 4.617507351277991, "grad_norm": 0.42642295360565186, "learning_rate": 5.856371772245083e-06, "loss": 0.57, "step": 10207 }, { "epoch": 4.617959737615924, "grad_norm": 0.3833701014518738, "learning_rate": 5.855649840554931e-06, "loss": 0.5167, "step": 10208 }, { "epoch": 4.618412123953856, "grad_norm": 0.6404155492782593, "learning_rate": 5.854927890488067e-06, "loss": 0.5626, "step": 10209 }, { "epoch": 4.61886451029179, "grad_norm": 0.4495488405227661, "learning_rate": 5.854205922059995e-06, "loss": 0.5211, "step": 10210 }, { "epoch": 4.619316896629722, "grad_norm": 0.445095419883728, "learning_rate": 5.853483935286225e-06, "loss": 0.4833, "step": 10211 }, { "epoch": 4.619769282967654, "grad_norm": 0.46465185284614563, "learning_rate": 5.852761930182259e-06, "loss": 0.6039, "step": 10212 }, { "epoch": 4.620221669305587, "grad_norm": 0.45122867822647095, "learning_rate": 5.852039906763606e-06, "loss": 0.5864, "step": 10213 }, { "epoch": 4.620674055643519, "grad_norm": 0.4283941388130188, "learning_rate": 5.851317865045771e-06, "loss": 0.4602, "step": 10214 }, { "epoch": 4.621126441981453, "grad_norm": 0.4264289140701294, "learning_rate": 5.850595805044261e-06, "loss": 0.5544, "step": 10215 }, { "epoch": 4.621578828319385, "grad_norm": 0.40401995182037354, "learning_rate": 5.849873726774584e-06, "loss": 0.507, "step": 10216 }, { "epoch": 4.6220312146573175, "grad_norm": 0.4551631808280945, "learning_rate": 5.84915163025225e-06, "loss": 0.5507, "step": 10217 }, { "epoch": 4.62248360099525, "grad_norm": 0.4079771637916565, "learning_rate": 5.848429515492764e-06, "loss": 0.5637, "step": 10218 }, { "epoch": 4.622935987333182, "grad_norm": 0.4368396997451782, "learning_rate": 5.847707382511637e-06, "loss": 0.579, "step": 10219 }, { "epoch": 4.623388373671115, "grad_norm": 0.44071683287620544, "learning_rate": 5.846985231324378e-06, "loss": 0.5359, "step": 10220 }, { "epoch": 4.623840760009048, "grad_norm": 0.44526904821395874, "learning_rate": 5.846263061946497e-06, "loss": 0.5475, "step": 10221 }, { "epoch": 4.6242931463469805, "grad_norm": 0.43300169706344604, "learning_rate": 5.845540874393502e-06, "loss": 0.4067, "step": 10222 }, { "epoch": 4.624745532684913, "grad_norm": 0.4365268051624298, "learning_rate": 5.844818668680905e-06, "loss": 0.5052, "step": 10223 }, { "epoch": 4.625197919022845, "grad_norm": 0.4735153913497925, "learning_rate": 5.8440964448242155e-06, "loss": 0.5107, "step": 10224 }, { "epoch": 4.625650305360778, "grad_norm": 0.41640087962150574, "learning_rate": 5.843374202838947e-06, "loss": 0.4609, "step": 10225 }, { "epoch": 4.62610269169871, "grad_norm": 0.5006853938102722, "learning_rate": 5.8426519427406095e-06, "loss": 0.5596, "step": 10226 }, { "epoch": 4.626555078036644, "grad_norm": 0.49434319138526917, "learning_rate": 5.841929664544714e-06, "loss": 0.5424, "step": 10227 }, { "epoch": 4.627007464374576, "grad_norm": 0.46722298860549927, "learning_rate": 5.841207368266774e-06, "loss": 0.4842, "step": 10228 }, { "epoch": 4.627459850712508, "grad_norm": 0.46391311287879944, "learning_rate": 5.840485053922302e-06, "loss": 0.5011, "step": 10229 }, { "epoch": 4.627912237050441, "grad_norm": 0.4501776695251465, "learning_rate": 5.839762721526811e-06, "loss": 0.4278, "step": 10230 }, { "epoch": 4.628364623388373, "grad_norm": 0.4455527365207672, "learning_rate": 5.839040371095814e-06, "loss": 0.3933, "step": 10231 }, { "epoch": 4.628817009726307, "grad_norm": 0.5344359278678894, "learning_rate": 5.838318002644826e-06, "loss": 0.4938, "step": 10232 }, { "epoch": 4.629269396064239, "grad_norm": 0.5420944690704346, "learning_rate": 5.83759561618936e-06, "loss": 0.562, "step": 10233 }, { "epoch": 4.6297217824021715, "grad_norm": 0.4972292184829712, "learning_rate": 5.836873211744931e-06, "loss": 0.5008, "step": 10234 }, { "epoch": 4.630174168740104, "grad_norm": 0.5379239916801453, "learning_rate": 5.836150789327055e-06, "loss": 0.5306, "step": 10235 }, { "epoch": 4.630626555078036, "grad_norm": 0.5466299653053284, "learning_rate": 5.835428348951247e-06, "loss": 0.5473, "step": 10236 }, { "epoch": 4.63107894141597, "grad_norm": 0.5668150782585144, "learning_rate": 5.834705890633021e-06, "loss": 0.5433, "step": 10237 }, { "epoch": 4.631531327753902, "grad_norm": 0.5913309454917908, "learning_rate": 5.833983414387896e-06, "loss": 0.4852, "step": 10238 }, { "epoch": 4.6319837140918345, "grad_norm": 0.5832398533821106, "learning_rate": 5.833260920231385e-06, "loss": 0.5096, "step": 10239 }, { "epoch": 4.632436100429767, "grad_norm": 0.5957128405570984, "learning_rate": 5.832538408179009e-06, "loss": 0.5135, "step": 10240 }, { "epoch": 4.632888486767699, "grad_norm": 0.6494279503822327, "learning_rate": 5.831815878246282e-06, "loss": 0.4853, "step": 10241 }, { "epoch": 4.633340873105633, "grad_norm": 0.7945911884307861, "learning_rate": 5.8310933304487234e-06, "loss": 0.5351, "step": 10242 }, { "epoch": 4.633793259443565, "grad_norm": 0.1954634189605713, "learning_rate": 5.830370764801852e-06, "loss": 1.0534, "step": 10243 }, { "epoch": 4.634245645781498, "grad_norm": 0.3557758927345276, "learning_rate": 5.829648181321183e-06, "loss": 0.7278, "step": 10244 }, { "epoch": 4.63469803211943, "grad_norm": 0.2835122346878052, "learning_rate": 5.82892558002224e-06, "loss": 0.5769, "step": 10245 }, { "epoch": 4.635150418457362, "grad_norm": 0.3988575339317322, "learning_rate": 5.8282029609205386e-06, "loss": 0.5529, "step": 10246 }, { "epoch": 4.635602804795295, "grad_norm": 0.3640868365764618, "learning_rate": 5.827480324031599e-06, "loss": 0.5686, "step": 10247 }, { "epoch": 4.636055191133227, "grad_norm": 0.3705771267414093, "learning_rate": 5.826757669370944e-06, "loss": 0.4838, "step": 10248 }, { "epoch": 4.636507577471161, "grad_norm": 0.4020390808582306, "learning_rate": 5.8260349969540895e-06, "loss": 0.64, "step": 10249 }, { "epoch": 4.636959963809093, "grad_norm": 0.37588077783584595, "learning_rate": 5.825312306796559e-06, "loss": 0.5895, "step": 10250 }, { "epoch": 4.6374123501470255, "grad_norm": 0.38234058022499084, "learning_rate": 5.824589598913872e-06, "loss": 0.5795, "step": 10251 }, { "epoch": 4.637864736484958, "grad_norm": 0.3710545301437378, "learning_rate": 5.823866873321553e-06, "loss": 0.5438, "step": 10252 }, { "epoch": 4.63831712282289, "grad_norm": 0.44081827998161316, "learning_rate": 5.823144130035122e-06, "loss": 0.5915, "step": 10253 }, { "epoch": 4.638769509160824, "grad_norm": 0.38713932037353516, "learning_rate": 5.822421369070101e-06, "loss": 0.4928, "step": 10254 }, { "epoch": 4.639221895498756, "grad_norm": 0.3866440951824188, "learning_rate": 5.821698590442012e-06, "loss": 0.6491, "step": 10255 }, { "epoch": 4.6396742818366885, "grad_norm": 0.47719287872314453, "learning_rate": 5.82097579416638e-06, "loss": 0.5857, "step": 10256 }, { "epoch": 4.640126668174621, "grad_norm": 0.4253702461719513, "learning_rate": 5.820252980258728e-06, "loss": 0.5625, "step": 10257 }, { "epoch": 4.640579054512553, "grad_norm": 0.4072794020175934, "learning_rate": 5.819530148734579e-06, "loss": 0.4894, "step": 10258 }, { "epoch": 4.641031440850487, "grad_norm": 0.4448128938674927, "learning_rate": 5.818807299609459e-06, "loss": 0.5196, "step": 10259 }, { "epoch": 4.641483827188419, "grad_norm": 0.4175281226634979, "learning_rate": 5.81808443289889e-06, "loss": 0.5759, "step": 10260 }, { "epoch": 4.6419362135263516, "grad_norm": 0.4606083631515503, "learning_rate": 5.817361548618396e-06, "loss": 0.6427, "step": 10261 }, { "epoch": 4.642388599864284, "grad_norm": 0.4114099442958832, "learning_rate": 5.8166386467835065e-06, "loss": 0.4569, "step": 10262 }, { "epoch": 4.642840986202216, "grad_norm": 0.45912110805511475, "learning_rate": 5.815915727409744e-06, "loss": 0.4988, "step": 10263 }, { "epoch": 4.64329337254015, "grad_norm": 0.4338507652282715, "learning_rate": 5.815192790512636e-06, "loss": 0.4687, "step": 10264 }, { "epoch": 4.643745758878082, "grad_norm": 0.4723161458969116, "learning_rate": 5.814469836107709e-06, "loss": 0.56, "step": 10265 }, { "epoch": 4.644198145216015, "grad_norm": 0.42817485332489014, "learning_rate": 5.813746864210489e-06, "loss": 0.509, "step": 10266 }, { "epoch": 4.644650531553947, "grad_norm": 0.42779991030693054, "learning_rate": 5.813023874836504e-06, "loss": 0.4669, "step": 10267 }, { "epoch": 4.6451029178918795, "grad_norm": 0.4265895187854767, "learning_rate": 5.81230086800128e-06, "loss": 0.4742, "step": 10268 }, { "epoch": 4.645555304229812, "grad_norm": 0.42975571751594543, "learning_rate": 5.811577843720347e-06, "loss": 0.5337, "step": 10269 }, { "epoch": 4.646007690567745, "grad_norm": 0.4711938500404358, "learning_rate": 5.8108548020092325e-06, "loss": 0.5297, "step": 10270 }, { "epoch": 4.646460076905678, "grad_norm": 0.512342095375061, "learning_rate": 5.810131742883467e-06, "loss": 0.5387, "step": 10271 }, { "epoch": 4.64691246324361, "grad_norm": 0.4965580105781555, "learning_rate": 5.809408666358575e-06, "loss": 0.6021, "step": 10272 }, { "epoch": 4.6473648495815425, "grad_norm": 0.4630354046821594, "learning_rate": 5.80868557245009e-06, "loss": 0.4913, "step": 10273 }, { "epoch": 4.647817235919475, "grad_norm": 0.5151192545890808, "learning_rate": 5.8079624611735395e-06, "loss": 0.5255, "step": 10274 }, { "epoch": 4.648269622257407, "grad_norm": 0.4915732145309448, "learning_rate": 5.807239332544455e-06, "loss": 0.473, "step": 10275 }, { "epoch": 4.648722008595341, "grad_norm": 0.4785947799682617, "learning_rate": 5.806516186578367e-06, "loss": 0.4802, "step": 10276 }, { "epoch": 4.649174394933273, "grad_norm": 0.5370710492134094, "learning_rate": 5.805793023290805e-06, "loss": 0.5275, "step": 10277 }, { "epoch": 4.6496267812712055, "grad_norm": 0.5349414944648743, "learning_rate": 5.805069842697303e-06, "loss": 0.5472, "step": 10278 }, { "epoch": 4.650079167609138, "grad_norm": 0.5464057326316833, "learning_rate": 5.80434664481339e-06, "loss": 0.553, "step": 10279 }, { "epoch": 4.65053155394707, "grad_norm": 0.5014979243278503, "learning_rate": 5.8036234296546e-06, "loss": 0.5486, "step": 10280 }, { "epoch": 4.650983940285004, "grad_norm": 0.5824980139732361, "learning_rate": 5.802900197236465e-06, "loss": 0.5559, "step": 10281 }, { "epoch": 4.651436326622936, "grad_norm": 0.5972913503646851, "learning_rate": 5.802176947574518e-06, "loss": 0.602, "step": 10282 }, { "epoch": 4.651888712960869, "grad_norm": 0.48642778396606445, "learning_rate": 5.80145368068429e-06, "loss": 0.4438, "step": 10283 }, { "epoch": 4.652341099298801, "grad_norm": 0.5229275822639465, "learning_rate": 5.800730396581318e-06, "loss": 0.4664, "step": 10284 }, { "epoch": 4.6527934856367335, "grad_norm": 0.5681953430175781, "learning_rate": 5.8000070952811315e-06, "loss": 0.4984, "step": 10285 }, { "epoch": 4.653245871974667, "grad_norm": 0.5205143690109253, "learning_rate": 5.79928377679927e-06, "loss": 0.4464, "step": 10286 }, { "epoch": 4.653698258312599, "grad_norm": 0.5431142449378967, "learning_rate": 5.798560441151263e-06, "loss": 0.4675, "step": 10287 }, { "epoch": 4.654150644650532, "grad_norm": 0.5591283440589905, "learning_rate": 5.797837088352649e-06, "loss": 0.5441, "step": 10288 }, { "epoch": 4.654603030988464, "grad_norm": 0.6896749138832092, "learning_rate": 5.797113718418963e-06, "loss": 0.5229, "step": 10289 }, { "epoch": 4.6550554173263965, "grad_norm": 0.6269506812095642, "learning_rate": 5.7963903313657385e-06, "loss": 0.4877, "step": 10290 }, { "epoch": 4.65550780366433, "grad_norm": 0.6110018491744995, "learning_rate": 5.795666927208514e-06, "loss": 0.5246, "step": 10291 }, { "epoch": 4.655960190002262, "grad_norm": 0.9113428592681885, "learning_rate": 5.794943505962826e-06, "loss": 0.6385, "step": 10292 }, { "epoch": 4.656412576340195, "grad_norm": 0.2004762887954712, "learning_rate": 5.7942200676442105e-06, "loss": 1.0694, "step": 10293 }, { "epoch": 4.656864962678127, "grad_norm": 0.2881444990634918, "learning_rate": 5.793496612268204e-06, "loss": 0.789, "step": 10294 }, { "epoch": 4.6573173490160595, "grad_norm": 0.3257797062397003, "learning_rate": 5.7927731398503464e-06, "loss": 0.5677, "step": 10295 }, { "epoch": 4.657769735353992, "grad_norm": 0.3471468687057495, "learning_rate": 5.792049650406173e-06, "loss": 0.6844, "step": 10296 }, { "epoch": 4.658222121691925, "grad_norm": 0.3502825200557709, "learning_rate": 5.791326143951223e-06, "loss": 0.6174, "step": 10297 }, { "epoch": 4.658674508029858, "grad_norm": 0.34854596853256226, "learning_rate": 5.790602620501034e-06, "loss": 0.5509, "step": 10298 }, { "epoch": 4.65912689436779, "grad_norm": 0.37644562125205994, "learning_rate": 5.789879080071148e-06, "loss": 0.7092, "step": 10299 }, { "epoch": 4.659579280705723, "grad_norm": 0.3836134076118469, "learning_rate": 5.7891555226771035e-06, "loss": 0.6621, "step": 10300 }, { "epoch": 4.660031667043655, "grad_norm": 0.38137269020080566, "learning_rate": 5.788431948334437e-06, "loss": 0.5496, "step": 10301 }, { "epoch": 4.660484053381587, "grad_norm": 0.35430195927619934, "learning_rate": 5.787708357058694e-06, "loss": 0.4796, "step": 10302 }, { "epoch": 4.660936439719521, "grad_norm": 0.39061036705970764, "learning_rate": 5.786984748865411e-06, "loss": 0.6097, "step": 10303 }, { "epoch": 4.661388826057453, "grad_norm": 0.3995383083820343, "learning_rate": 5.78626112377013e-06, "loss": 0.6442, "step": 10304 }, { "epoch": 4.661841212395386, "grad_norm": 0.43330883979797363, "learning_rate": 5.785537481788393e-06, "loss": 0.4843, "step": 10305 }, { "epoch": 4.662293598733318, "grad_norm": 0.41972219944000244, "learning_rate": 5.78481382293574e-06, "loss": 0.5499, "step": 10306 }, { "epoch": 4.6627459850712505, "grad_norm": 0.3875519335269928, "learning_rate": 5.784090147227713e-06, "loss": 0.5107, "step": 10307 }, { "epoch": 4.663198371409184, "grad_norm": 0.4057663679122925, "learning_rate": 5.783366454679857e-06, "loss": 0.4916, "step": 10308 }, { "epoch": 4.663650757747116, "grad_norm": 0.4199879467487335, "learning_rate": 5.782642745307712e-06, "loss": 0.4444, "step": 10309 }, { "epoch": 4.664103144085049, "grad_norm": 0.42971259355545044, "learning_rate": 5.7819190191268204e-06, "loss": 0.6035, "step": 10310 }, { "epoch": 4.664555530422981, "grad_norm": 0.42529985308647156, "learning_rate": 5.781195276152729e-06, "loss": 0.5212, "step": 10311 }, { "epoch": 4.6650079167609135, "grad_norm": 0.4919140040874481, "learning_rate": 5.780471516400978e-06, "loss": 0.7057, "step": 10312 }, { "epoch": 4.665460303098847, "grad_norm": 0.4182828366756439, "learning_rate": 5.779747739887115e-06, "loss": 0.5146, "step": 10313 }, { "epoch": 4.665912689436779, "grad_norm": 0.41852325201034546, "learning_rate": 5.779023946626682e-06, "loss": 0.526, "step": 10314 }, { "epoch": 4.666365075774712, "grad_norm": 0.43994036316871643, "learning_rate": 5.778300136635223e-06, "loss": 0.5776, "step": 10315 }, { "epoch": 4.666817462112644, "grad_norm": 0.39069753885269165, "learning_rate": 5.777576309928288e-06, "loss": 0.4745, "step": 10316 }, { "epoch": 4.667269848450577, "grad_norm": 0.4769641160964966, "learning_rate": 5.776852466521416e-06, "loss": 0.6943, "step": 10317 }, { "epoch": 4.667722234788509, "grad_norm": 0.41937047243118286, "learning_rate": 5.776128606430157e-06, "loss": 0.4542, "step": 10318 }, { "epoch": 4.668174621126442, "grad_norm": 0.4847988188266754, "learning_rate": 5.775404729670057e-06, "loss": 0.5204, "step": 10319 }, { "epoch": 4.668627007464375, "grad_norm": 0.49321040511131287, "learning_rate": 5.774680836256661e-06, "loss": 0.5585, "step": 10320 }, { "epoch": 4.669079393802307, "grad_norm": 0.39663198590278625, "learning_rate": 5.773956926205517e-06, "loss": 0.4249, "step": 10321 }, { "epoch": 4.66953178014024, "grad_norm": 0.4813256859779358, "learning_rate": 5.773232999532173e-06, "loss": 0.51, "step": 10322 }, { "epoch": 4.669984166478172, "grad_norm": 0.45462551712989807, "learning_rate": 5.772509056252177e-06, "loss": 0.5232, "step": 10323 }, { "epoch": 4.6704365528161045, "grad_norm": 0.5099487900733948, "learning_rate": 5.771785096381075e-06, "loss": 0.5572, "step": 10324 }, { "epoch": 4.670888939154038, "grad_norm": 0.4156635105609894, "learning_rate": 5.771061119934417e-06, "loss": 0.4102, "step": 10325 }, { "epoch": 4.67134132549197, "grad_norm": 0.4880199134349823, "learning_rate": 5.770337126927751e-06, "loss": 0.5218, "step": 10326 }, { "epoch": 4.671793711829903, "grad_norm": 0.5001962780952454, "learning_rate": 5.769613117376627e-06, "loss": 0.509, "step": 10327 }, { "epoch": 4.672246098167835, "grad_norm": 0.5278817415237427, "learning_rate": 5.768889091296593e-06, "loss": 0.5274, "step": 10328 }, { "epoch": 4.6726984845057675, "grad_norm": 0.5577713847160339, "learning_rate": 5.7681650487032e-06, "loss": 0.608, "step": 10329 }, { "epoch": 4.673150870843701, "grad_norm": 0.48404309153556824, "learning_rate": 5.767440989611999e-06, "loss": 0.4671, "step": 10330 }, { "epoch": 4.673603257181633, "grad_norm": 0.5314190983772278, "learning_rate": 5.7667169140385395e-06, "loss": 0.5133, "step": 10331 }, { "epoch": 4.674055643519566, "grad_norm": 0.5266849398612976, "learning_rate": 5.765992821998371e-06, "loss": 0.5655, "step": 10332 }, { "epoch": 4.674508029857498, "grad_norm": 0.5694075226783752, "learning_rate": 5.7652687135070475e-06, "loss": 0.5525, "step": 10333 }, { "epoch": 4.674960416195431, "grad_norm": 0.501131534576416, "learning_rate": 5.764544588580118e-06, "loss": 0.4568, "step": 10334 }, { "epoch": 4.675412802533364, "grad_norm": 0.534630537033081, "learning_rate": 5.7638204472331364e-06, "loss": 0.4787, "step": 10335 }, { "epoch": 4.675865188871296, "grad_norm": 0.521767258644104, "learning_rate": 5.763096289481655e-06, "loss": 0.5373, "step": 10336 }, { "epoch": 4.676317575209229, "grad_norm": 0.677047073841095, "learning_rate": 5.7623721153412245e-06, "loss": 0.5938, "step": 10337 }, { "epoch": 4.676769961547161, "grad_norm": 0.6176376342773438, "learning_rate": 5.761647924827402e-06, "loss": 0.5201, "step": 10338 }, { "epoch": 4.677222347885094, "grad_norm": 0.5313944220542908, "learning_rate": 5.760923717955735e-06, "loss": 0.4405, "step": 10339 }, { "epoch": 4.677674734223027, "grad_norm": 0.576235830783844, "learning_rate": 5.760199494741783e-06, "loss": 0.4803, "step": 10340 }, { "epoch": 4.678127120560959, "grad_norm": 0.656134307384491, "learning_rate": 5.759475255201096e-06, "loss": 0.5334, "step": 10341 }, { "epoch": 4.678579506898892, "grad_norm": 0.8130826950073242, "learning_rate": 5.75875099934923e-06, "loss": 0.6417, "step": 10342 }, { "epoch": 4.679031893236824, "grad_norm": 0.2367280274629593, "learning_rate": 5.7580267272017385e-06, "loss": 1.062, "step": 10343 }, { "epoch": 4.679484279574757, "grad_norm": 0.2826990485191345, "learning_rate": 5.757302438774179e-06, "loss": 0.5073, "step": 10344 }, { "epoch": 4.679936665912689, "grad_norm": 0.3405104875564575, "learning_rate": 5.756578134082106e-06, "loss": 0.7133, "step": 10345 }, { "epoch": 4.680389052250622, "grad_norm": 0.38421037793159485, "learning_rate": 5.7558538131410745e-06, "loss": 0.6515, "step": 10346 }, { "epoch": 4.680841438588555, "grad_norm": 0.37728026509284973, "learning_rate": 5.755129475966641e-06, "loss": 0.5705, "step": 10347 }, { "epoch": 4.681293824926487, "grad_norm": 0.34661999344825745, "learning_rate": 5.754405122574362e-06, "loss": 0.5469, "step": 10348 }, { "epoch": 4.68174621126442, "grad_norm": 0.35070300102233887, "learning_rate": 5.753680752979795e-06, "loss": 0.564, "step": 10349 }, { "epoch": 4.682198597602352, "grad_norm": 0.35490450263023376, "learning_rate": 5.752956367198499e-06, "loss": 0.5392, "step": 10350 }, { "epoch": 4.682650983940285, "grad_norm": 0.39209190011024475, "learning_rate": 5.752231965246028e-06, "loss": 0.548, "step": 10351 }, { "epoch": 4.683103370278218, "grad_norm": 0.3862757682800293, "learning_rate": 5.751507547137941e-06, "loss": 0.546, "step": 10352 }, { "epoch": 4.68355575661615, "grad_norm": 0.35770002007484436, "learning_rate": 5.750783112889796e-06, "loss": 0.5771, "step": 10353 }, { "epoch": 4.684008142954083, "grad_norm": 0.4220709800720215, "learning_rate": 5.750058662517153e-06, "loss": 0.5545, "step": 10354 }, { "epoch": 4.684460529292015, "grad_norm": 0.4050266742706299, "learning_rate": 5.7493341960355696e-06, "loss": 0.5413, "step": 10355 }, { "epoch": 4.684912915629948, "grad_norm": 0.41682198643684387, "learning_rate": 5.748609713460606e-06, "loss": 0.6273, "step": 10356 }, { "epoch": 4.685365301967881, "grad_norm": 0.4342922270298004, "learning_rate": 5.747885214807821e-06, "loss": 0.5778, "step": 10357 }, { "epoch": 4.685817688305813, "grad_norm": 0.39068037271499634, "learning_rate": 5.7471607000927755e-06, "loss": 0.5174, "step": 10358 }, { "epoch": 4.686270074643746, "grad_norm": 0.42002055048942566, "learning_rate": 5.74643616933103e-06, "loss": 0.5992, "step": 10359 }, { "epoch": 4.686722460981678, "grad_norm": 0.4303332567214966, "learning_rate": 5.745711622538144e-06, "loss": 0.5088, "step": 10360 }, { "epoch": 4.687174847319611, "grad_norm": 0.42684444785118103, "learning_rate": 5.74498705972968e-06, "loss": 0.503, "step": 10361 }, { "epoch": 4.687627233657544, "grad_norm": 0.4266350567340851, "learning_rate": 5.744262480921198e-06, "loss": 0.5003, "step": 10362 }, { "epoch": 4.688079619995476, "grad_norm": 0.46242114901542664, "learning_rate": 5.743537886128258e-06, "loss": 0.6265, "step": 10363 }, { "epoch": 4.688532006333409, "grad_norm": 0.3925979435443878, "learning_rate": 5.742813275366428e-06, "loss": 0.5161, "step": 10364 }, { "epoch": 4.688984392671341, "grad_norm": 0.4232487082481384, "learning_rate": 5.742088648651264e-06, "loss": 0.495, "step": 10365 }, { "epoch": 4.689436779009274, "grad_norm": 0.46796661615371704, "learning_rate": 5.741364005998332e-06, "loss": 0.6084, "step": 10366 }, { "epoch": 4.689889165347207, "grad_norm": 0.4439244866371155, "learning_rate": 5.740639347423195e-06, "loss": 0.5174, "step": 10367 }, { "epoch": 4.690341551685139, "grad_norm": 0.40180960297584534, "learning_rate": 5.739914672941416e-06, "loss": 0.4451, "step": 10368 }, { "epoch": 4.690793938023072, "grad_norm": 0.4784121513366699, "learning_rate": 5.739189982568557e-06, "loss": 0.4917, "step": 10369 }, { "epoch": 4.691246324361004, "grad_norm": 0.47534629702568054, "learning_rate": 5.738465276320185e-06, "loss": 0.5713, "step": 10370 }, { "epoch": 4.691698710698937, "grad_norm": 0.5040174722671509, "learning_rate": 5.737740554211864e-06, "loss": 0.6248, "step": 10371 }, { "epoch": 4.692151097036869, "grad_norm": 0.45789840817451477, "learning_rate": 5.7370158162591574e-06, "loss": 0.5233, "step": 10372 }, { "epoch": 4.692603483374802, "grad_norm": 0.4435354173183441, "learning_rate": 5.7362910624776305e-06, "loss": 0.4531, "step": 10373 }, { "epoch": 4.693055869712735, "grad_norm": 0.48420450091362, "learning_rate": 5.73556629288285e-06, "loss": 0.526, "step": 10374 }, { "epoch": 4.693508256050667, "grad_norm": 0.5439074635505676, "learning_rate": 5.73484150749038e-06, "loss": 0.6153, "step": 10375 }, { "epoch": 4.6939606423886, "grad_norm": 0.5269020795822144, "learning_rate": 5.7341167063157876e-06, "loss": 0.6405, "step": 10376 }, { "epoch": 4.694413028726532, "grad_norm": 0.446377158164978, "learning_rate": 5.73339188937464e-06, "loss": 0.4258, "step": 10377 }, { "epoch": 4.694865415064465, "grad_norm": 0.46483859419822693, "learning_rate": 5.732667056682503e-06, "loss": 0.5354, "step": 10378 }, { "epoch": 4.695317801402398, "grad_norm": 0.4927051067352295, "learning_rate": 5.731942208254943e-06, "loss": 0.5622, "step": 10379 }, { "epoch": 4.69577018774033, "grad_norm": 0.48064708709716797, "learning_rate": 5.7312173441075305e-06, "loss": 0.4767, "step": 10380 }, { "epoch": 4.696222574078263, "grad_norm": 0.48852208256721497, "learning_rate": 5.73049246425583e-06, "loss": 0.4664, "step": 10381 }, { "epoch": 4.696674960416195, "grad_norm": 0.5727304220199585, "learning_rate": 5.729767568715411e-06, "loss": 0.6115, "step": 10382 }, { "epoch": 4.697127346754128, "grad_norm": 0.48583850264549255, "learning_rate": 5.729042657501842e-06, "loss": 0.4395, "step": 10383 }, { "epoch": 4.697579733092061, "grad_norm": 0.5207260251045227, "learning_rate": 5.728317730630694e-06, "loss": 0.4545, "step": 10384 }, { "epoch": 4.698032119429993, "grad_norm": 0.53764808177948, "learning_rate": 5.727592788117533e-06, "loss": 0.4967, "step": 10385 }, { "epoch": 4.698484505767926, "grad_norm": 0.4721095561981201, "learning_rate": 5.726867829977929e-06, "loss": 0.4116, "step": 10386 }, { "epoch": 4.698936892105858, "grad_norm": 0.5381777882575989, "learning_rate": 5.726142856227453e-06, "loss": 0.4372, "step": 10387 }, { "epoch": 4.699389278443791, "grad_norm": 0.5147114396095276, "learning_rate": 5.725417866881674e-06, "loss": 0.4329, "step": 10388 }, { "epoch": 4.699841664781724, "grad_norm": 0.652927577495575, "learning_rate": 5.724692861956164e-06, "loss": 0.5248, "step": 10389 }, { "epoch": 4.7002940511196565, "grad_norm": 0.6580521464347839, "learning_rate": 5.723967841466493e-06, "loss": 0.5418, "step": 10390 }, { "epoch": 4.700746437457589, "grad_norm": 0.618255615234375, "learning_rate": 5.723242805428231e-06, "loss": 0.537, "step": 10391 }, { "epoch": 4.701198823795521, "grad_norm": 0.6549615263938904, "learning_rate": 5.722517753856951e-06, "loss": 0.4793, "step": 10392 }, { "epoch": 4.701651210133454, "grad_norm": 0.15653038024902344, "learning_rate": 5.721792686768226e-06, "loss": 1.0517, "step": 10393 }, { "epoch": 4.702103596471386, "grad_norm": 0.2787667214870453, "learning_rate": 5.721067604177626e-06, "loss": 0.8004, "step": 10394 }, { "epoch": 4.7025559828093195, "grad_norm": 0.337444931268692, "learning_rate": 5.720342506100726e-06, "loss": 0.5685, "step": 10395 }, { "epoch": 4.703008369147252, "grad_norm": 0.3575666546821594, "learning_rate": 5.719617392553096e-06, "loss": 0.7677, "step": 10396 }, { "epoch": 4.703460755485184, "grad_norm": 0.3923971354961395, "learning_rate": 5.71889226355031e-06, "loss": 0.6815, "step": 10397 }, { "epoch": 4.703913141823117, "grad_norm": 0.35128751397132874, "learning_rate": 5.718167119107942e-06, "loss": 0.5355, "step": 10398 }, { "epoch": 4.704365528161049, "grad_norm": 0.3494136333465576, "learning_rate": 5.717441959241566e-06, "loss": 0.5246, "step": 10399 }, { "epoch": 4.704817914498982, "grad_norm": 0.3693385422229767, "learning_rate": 5.716716783966757e-06, "loss": 0.6843, "step": 10400 }, { "epoch": 4.704817914498982, "eval_loss": 0.5900002717971802, "eval_runtime": 25.8274, "eval_samples_per_second": 28.807, "eval_steps_per_second": 7.202, "step": 10400 }, { "epoch": 4.705270300836915, "grad_norm": 0.35986167192459106, "learning_rate": 5.7159915932990885e-06, "loss": 0.5246, "step": 10401 }, { "epoch": 4.705722687174847, "grad_norm": 0.3462739884853363, "learning_rate": 5.715266387254135e-06, "loss": 0.5881, "step": 10402 }, { "epoch": 4.70617507351278, "grad_norm": 0.44416823983192444, "learning_rate": 5.714541165847471e-06, "loss": 0.6846, "step": 10403 }, { "epoch": 4.706627459850712, "grad_norm": 0.4260360896587372, "learning_rate": 5.713815929094674e-06, "loss": 0.6579, "step": 10404 }, { "epoch": 4.707079846188645, "grad_norm": 0.40326058864593506, "learning_rate": 5.713090677011319e-06, "loss": 0.5843, "step": 10405 }, { "epoch": 4.707532232526578, "grad_norm": 0.39271053671836853, "learning_rate": 5.712365409612983e-06, "loss": 0.5543, "step": 10406 }, { "epoch": 4.7079846188645105, "grad_norm": 0.4245187044143677, "learning_rate": 5.711640126915241e-06, "loss": 0.5278, "step": 10407 }, { "epoch": 4.708437005202443, "grad_norm": 0.43510112166404724, "learning_rate": 5.71091482893367e-06, "loss": 0.5859, "step": 10408 }, { "epoch": 4.708889391540375, "grad_norm": 0.4173460006713867, "learning_rate": 5.710189515683847e-06, "loss": 0.4793, "step": 10409 }, { "epoch": 4.709341777878308, "grad_norm": 0.41906678676605225, "learning_rate": 5.709464187181351e-06, "loss": 0.5042, "step": 10410 }, { "epoch": 4.709794164216241, "grad_norm": 0.40107011795043945, "learning_rate": 5.708738843441759e-06, "loss": 0.4497, "step": 10411 }, { "epoch": 4.7102465505541735, "grad_norm": 0.4580257534980774, "learning_rate": 5.708013484480649e-06, "loss": 0.6087, "step": 10412 }, { "epoch": 4.710698936892106, "grad_norm": 0.3979369103908539, "learning_rate": 5.707288110313599e-06, "loss": 0.4917, "step": 10413 }, { "epoch": 4.711151323230038, "grad_norm": 0.4798024892807007, "learning_rate": 5.706562720956189e-06, "loss": 0.5648, "step": 10414 }, { "epoch": 4.711603709567971, "grad_norm": 0.4238053560256958, "learning_rate": 5.705837316423997e-06, "loss": 0.5114, "step": 10415 }, { "epoch": 4.712056095905904, "grad_norm": 0.3913829028606415, "learning_rate": 5.705111896732603e-06, "loss": 0.472, "step": 10416 }, { "epoch": 4.712508482243837, "grad_norm": 0.4670548737049103, "learning_rate": 5.704386461897586e-06, "loss": 0.5136, "step": 10417 }, { "epoch": 4.712960868581769, "grad_norm": 0.5032027363777161, "learning_rate": 5.703661011934528e-06, "loss": 0.5658, "step": 10418 }, { "epoch": 4.713413254919701, "grad_norm": 0.5096268653869629, "learning_rate": 5.7029355468590075e-06, "loss": 0.6067, "step": 10419 }, { "epoch": 4.713865641257634, "grad_norm": 0.4704240560531616, "learning_rate": 5.702210066686607e-06, "loss": 0.5167, "step": 10420 }, { "epoch": 4.714318027595566, "grad_norm": 0.5115222334861755, "learning_rate": 5.7014845714329046e-06, "loss": 0.5607, "step": 10421 }, { "epoch": 4.714770413933499, "grad_norm": 0.4122966229915619, "learning_rate": 5.700759061113485e-06, "loss": 0.4758, "step": 10422 }, { "epoch": 4.715222800271432, "grad_norm": 0.5467329621315002, "learning_rate": 5.700033535743928e-06, "loss": 0.59, "step": 10423 }, { "epoch": 4.7156751866093645, "grad_norm": 0.4513695240020752, "learning_rate": 5.6993079953398145e-06, "loss": 0.5065, "step": 10424 }, { "epoch": 4.716127572947297, "grad_norm": 0.47833430767059326, "learning_rate": 5.69858243991673e-06, "loss": 0.5, "step": 10425 }, { "epoch": 4.716579959285229, "grad_norm": 0.47777271270751953, "learning_rate": 5.697856869490256e-06, "loss": 0.53, "step": 10426 }, { "epoch": 4.717032345623162, "grad_norm": 0.49898195266723633, "learning_rate": 5.697131284075974e-06, "loss": 0.5401, "step": 10427 }, { "epoch": 4.717484731961095, "grad_norm": 0.546677827835083, "learning_rate": 5.69640568368947e-06, "loss": 0.5846, "step": 10428 }, { "epoch": 4.7179371182990275, "grad_norm": 0.533136248588562, "learning_rate": 5.695680068346327e-06, "loss": 0.4676, "step": 10429 }, { "epoch": 4.71838950463696, "grad_norm": 0.5356099605560303, "learning_rate": 5.694954438062125e-06, "loss": 0.5163, "step": 10430 }, { "epoch": 4.718841890974892, "grad_norm": 0.5261856913566589, "learning_rate": 5.694228792852454e-06, "loss": 0.4985, "step": 10431 }, { "epoch": 4.719294277312825, "grad_norm": 0.561566948890686, "learning_rate": 5.693503132732895e-06, "loss": 0.5177, "step": 10432 }, { "epoch": 4.719746663650758, "grad_norm": 0.5078989267349243, "learning_rate": 5.692777457719034e-06, "loss": 0.5056, "step": 10433 }, { "epoch": 4.7201990499886906, "grad_norm": 0.5834830403327942, "learning_rate": 5.692051767826456e-06, "loss": 0.645, "step": 10434 }, { "epoch": 4.720651436326623, "grad_norm": 0.45661240816116333, "learning_rate": 5.691326063070748e-06, "loss": 0.4034, "step": 10435 }, { "epoch": 4.721103822664555, "grad_norm": 0.5053005814552307, "learning_rate": 5.690600343467493e-06, "loss": 0.4461, "step": 10436 }, { "epoch": 4.721556209002488, "grad_norm": 0.5580257773399353, "learning_rate": 5.68987460903228e-06, "loss": 0.5266, "step": 10437 }, { "epoch": 4.722008595340421, "grad_norm": 0.5354418158531189, "learning_rate": 5.689148859780694e-06, "loss": 0.448, "step": 10438 }, { "epoch": 4.722460981678354, "grad_norm": 0.6048482060432434, "learning_rate": 5.688423095728323e-06, "loss": 0.4963, "step": 10439 }, { "epoch": 4.722913368016286, "grad_norm": 0.6329414248466492, "learning_rate": 5.687697316890753e-06, "loss": 0.5496, "step": 10440 }, { "epoch": 4.7233657543542185, "grad_norm": 0.5137094259262085, "learning_rate": 5.686971523283572e-06, "loss": 0.3911, "step": 10441 }, { "epoch": 4.723818140692151, "grad_norm": 0.6658916473388672, "learning_rate": 5.686245714922368e-06, "loss": 0.4979, "step": 10442 }, { "epoch": 4.724270527030083, "grad_norm": 0.16253651678562164, "learning_rate": 5.6855198918227285e-06, "loss": 1.1958, "step": 10443 }, { "epoch": 4.724722913368017, "grad_norm": 0.2341982126235962, "learning_rate": 5.684794054000242e-06, "loss": 0.9914, "step": 10444 }, { "epoch": 4.725175299705949, "grad_norm": 0.28808486461639404, "learning_rate": 5.6840682014704985e-06, "loss": 0.7336, "step": 10445 }, { "epoch": 4.7256276860438815, "grad_norm": 0.33567097783088684, "learning_rate": 5.683342334249084e-06, "loss": 0.6921, "step": 10446 }, { "epoch": 4.726080072381814, "grad_norm": 0.3267536163330078, "learning_rate": 5.682616452351593e-06, "loss": 0.6592, "step": 10447 }, { "epoch": 4.726532458719746, "grad_norm": 0.40095284581184387, "learning_rate": 5.68189055579361e-06, "loss": 0.7224, "step": 10448 }, { "epoch": 4.726984845057679, "grad_norm": 0.33427703380584717, "learning_rate": 5.6811646445907275e-06, "loss": 0.5519, "step": 10449 }, { "epoch": 4.727437231395612, "grad_norm": 0.33993956446647644, "learning_rate": 5.6804387187585366e-06, "loss": 0.5216, "step": 10450 }, { "epoch": 4.7278896177335445, "grad_norm": 0.40715882182121277, "learning_rate": 5.679712778312627e-06, "loss": 0.6444, "step": 10451 }, { "epoch": 4.728342004071477, "grad_norm": 0.4276224374771118, "learning_rate": 5.6789868232685886e-06, "loss": 0.5641, "step": 10452 }, { "epoch": 4.728794390409409, "grad_norm": 0.35620352625846863, "learning_rate": 5.678260853642013e-06, "loss": 0.635, "step": 10453 }, { "epoch": 4.729246776747342, "grad_norm": 0.4154244363307953, "learning_rate": 5.677534869448494e-06, "loss": 0.601, "step": 10454 }, { "epoch": 4.729699163085275, "grad_norm": 0.3584596812725067, "learning_rate": 5.676808870703621e-06, "loss": 0.4458, "step": 10455 }, { "epoch": 4.730151549423208, "grad_norm": 0.398055762052536, "learning_rate": 5.676082857422988e-06, "loss": 0.5876, "step": 10456 }, { "epoch": 4.73060393576114, "grad_norm": 0.42571696639060974, "learning_rate": 5.675356829622185e-06, "loss": 0.6583, "step": 10457 }, { "epoch": 4.7310563220990725, "grad_norm": 0.4298786222934723, "learning_rate": 5.674630787316807e-06, "loss": 0.6213, "step": 10458 }, { "epoch": 4.731508708437005, "grad_norm": 0.40126562118530273, "learning_rate": 5.673904730522447e-06, "loss": 0.5953, "step": 10459 }, { "epoch": 4.731961094774938, "grad_norm": 0.42114055156707764, "learning_rate": 5.673178659254698e-06, "loss": 0.5098, "step": 10460 }, { "epoch": 4.732413481112871, "grad_norm": 0.4843766987323761, "learning_rate": 5.672452573529153e-06, "loss": 0.7127, "step": 10461 }, { "epoch": 4.732865867450803, "grad_norm": 0.421867311000824, "learning_rate": 5.671726473361408e-06, "loss": 0.5217, "step": 10462 }, { "epoch": 4.7333182537887355, "grad_norm": 0.4637346863746643, "learning_rate": 5.6710003587670555e-06, "loss": 0.5932, "step": 10463 }, { "epoch": 4.733770640126668, "grad_norm": 0.41853252053260803, "learning_rate": 5.670274229761692e-06, "loss": 0.5742, "step": 10464 }, { "epoch": 4.734223026464601, "grad_norm": 0.37615448236465454, "learning_rate": 5.669548086360911e-06, "loss": 0.4432, "step": 10465 }, { "epoch": 4.734675412802534, "grad_norm": 0.4333910644054413, "learning_rate": 5.668821928580307e-06, "loss": 0.5745, "step": 10466 }, { "epoch": 4.735127799140466, "grad_norm": 0.4589308500289917, "learning_rate": 5.6680957564354785e-06, "loss": 0.5542, "step": 10467 }, { "epoch": 4.7355801854783985, "grad_norm": 0.42484548687934875, "learning_rate": 5.6673695699420186e-06, "loss": 0.4567, "step": 10468 }, { "epoch": 4.736032571816331, "grad_norm": 0.4051879346370697, "learning_rate": 5.666643369115525e-06, "loss": 0.4801, "step": 10469 }, { "epoch": 4.736484958154263, "grad_norm": 0.4249517321586609, "learning_rate": 5.665917153971595e-06, "loss": 0.4884, "step": 10470 }, { "epoch": 4.736937344492197, "grad_norm": 0.48640012741088867, "learning_rate": 5.665190924525824e-06, "loss": 0.5511, "step": 10471 }, { "epoch": 4.737389730830129, "grad_norm": 0.47885867953300476, "learning_rate": 5.664464680793808e-06, "loss": 0.5677, "step": 10472 }, { "epoch": 4.737842117168062, "grad_norm": 0.447162002325058, "learning_rate": 5.663738422791149e-06, "loss": 0.459, "step": 10473 }, { "epoch": 4.738294503505994, "grad_norm": 0.5174841284751892, "learning_rate": 5.663012150533441e-06, "loss": 0.588, "step": 10474 }, { "epoch": 4.7387468898439264, "grad_norm": 0.4410720467567444, "learning_rate": 5.662285864036282e-06, "loss": 0.4714, "step": 10475 }, { "epoch": 4.739199276181859, "grad_norm": 0.4759259521961212, "learning_rate": 5.661559563315272e-06, "loss": 0.5694, "step": 10476 }, { "epoch": 4.739651662519792, "grad_norm": 0.5683112740516663, "learning_rate": 5.6608332483860086e-06, "loss": 0.5914, "step": 10477 }, { "epoch": 4.740104048857725, "grad_norm": 0.5000482201576233, "learning_rate": 5.660106919264093e-06, "loss": 0.5751, "step": 10478 }, { "epoch": 4.740556435195657, "grad_norm": 0.5587843060493469, "learning_rate": 5.65938057596512e-06, "loss": 0.5892, "step": 10479 }, { "epoch": 4.7410088215335895, "grad_norm": 0.5077690482139587, "learning_rate": 5.658654218504693e-06, "loss": 0.5295, "step": 10480 }, { "epoch": 4.741461207871522, "grad_norm": 0.4605028033256531, "learning_rate": 5.6579278468984115e-06, "loss": 0.4844, "step": 10481 }, { "epoch": 4.741913594209455, "grad_norm": 0.4845638871192932, "learning_rate": 5.657201461161874e-06, "loss": 0.5748, "step": 10482 }, { "epoch": 4.742365980547388, "grad_norm": 0.5079182982444763, "learning_rate": 5.656475061310682e-06, "loss": 0.4323, "step": 10483 }, { "epoch": 4.74281836688532, "grad_norm": 0.5327150821685791, "learning_rate": 5.655748647360437e-06, "loss": 0.5358, "step": 10484 }, { "epoch": 4.7432707532232525, "grad_norm": 0.4945853054523468, "learning_rate": 5.6550222193267416e-06, "loss": 0.4544, "step": 10485 }, { "epoch": 4.743723139561185, "grad_norm": 0.6082181334495544, "learning_rate": 5.654295777225193e-06, "loss": 0.6802, "step": 10486 }, { "epoch": 4.744175525899118, "grad_norm": 0.5080341696739197, "learning_rate": 5.653569321071395e-06, "loss": 0.478, "step": 10487 }, { "epoch": 4.744627912237051, "grad_norm": 0.6487542390823364, "learning_rate": 5.6528428508809495e-06, "loss": 0.6647, "step": 10488 }, { "epoch": 4.745080298574983, "grad_norm": 0.5593520402908325, "learning_rate": 5.65211636666946e-06, "loss": 0.4721, "step": 10489 }, { "epoch": 4.745532684912916, "grad_norm": 0.6108453869819641, "learning_rate": 5.651389868452528e-06, "loss": 0.5177, "step": 10490 }, { "epoch": 4.745985071250848, "grad_norm": 0.6029545664787292, "learning_rate": 5.650663356245756e-06, "loss": 0.5021, "step": 10491 }, { "epoch": 4.74643745758878, "grad_norm": 0.6997426748275757, "learning_rate": 5.649936830064749e-06, "loss": 0.4977, "step": 10492 }, { "epoch": 4.746889843926714, "grad_norm": 0.1723536103963852, "learning_rate": 5.649210289925108e-06, "loss": 1.0121, "step": 10493 }, { "epoch": 4.747342230264646, "grad_norm": 0.28384292125701904, "learning_rate": 5.648483735842438e-06, "loss": 0.7492, "step": 10494 }, { "epoch": 4.747794616602579, "grad_norm": 0.3011513650417328, "learning_rate": 5.647757167832345e-06, "loss": 0.5063, "step": 10495 }, { "epoch": 4.748247002940511, "grad_norm": 0.3208230435848236, "learning_rate": 5.6470305859104314e-06, "loss": 0.7705, "step": 10496 }, { "epoch": 4.7486993892784435, "grad_norm": 0.3618986904621124, "learning_rate": 5.646303990092302e-06, "loss": 0.8049, "step": 10497 }, { "epoch": 4.749151775616376, "grad_norm": 0.38172101974487305, "learning_rate": 5.645577380393562e-06, "loss": 0.6566, "step": 10498 }, { "epoch": 4.749604161954309, "grad_norm": 0.31979838013648987, "learning_rate": 5.6448507568298175e-06, "loss": 0.4948, "step": 10499 }, { "epoch": 4.750056548292242, "grad_norm": 0.34502124786376953, "learning_rate": 5.644124119416673e-06, "loss": 0.4993, "step": 10500 }, { "epoch": 4.750508934630174, "grad_norm": 0.4170891046524048, "learning_rate": 5.643397468169734e-06, "loss": 0.6515, "step": 10501 }, { "epoch": 4.7509613209681065, "grad_norm": 0.34096091985702515, "learning_rate": 5.64267080310461e-06, "loss": 0.5217, "step": 10502 }, { "epoch": 4.751413707306039, "grad_norm": 0.39230629801750183, "learning_rate": 5.641944124236903e-06, "loss": 0.5582, "step": 10503 }, { "epoch": 4.751866093643972, "grad_norm": 0.4245571494102478, "learning_rate": 5.641217431582223e-06, "loss": 0.5811, "step": 10504 }, { "epoch": 4.752318479981905, "grad_norm": 0.46160298585891724, "learning_rate": 5.640490725156176e-06, "loss": 0.7131, "step": 10505 }, { "epoch": 4.752770866319837, "grad_norm": 0.4228353500366211, "learning_rate": 5.63976400497437e-06, "loss": 0.5744, "step": 10506 }, { "epoch": 4.75322325265777, "grad_norm": 0.34550076723098755, "learning_rate": 5.639037271052411e-06, "loss": 0.4425, "step": 10507 }, { "epoch": 4.753675638995702, "grad_norm": 0.35832133889198303, "learning_rate": 5.63831052340591e-06, "loss": 0.4442, "step": 10508 }, { "epoch": 4.754128025333635, "grad_norm": 0.39288169145584106, "learning_rate": 5.6375837620504715e-06, "loss": 0.5123, "step": 10509 }, { "epoch": 4.754580411671568, "grad_norm": 0.40881747007369995, "learning_rate": 5.636856987001708e-06, "loss": 0.5183, "step": 10510 }, { "epoch": 4.7550327980095, "grad_norm": 0.40766453742980957, "learning_rate": 5.6361301982752245e-06, "loss": 0.4977, "step": 10511 }, { "epoch": 4.755485184347433, "grad_norm": 0.4271490275859833, "learning_rate": 5.635403395886633e-06, "loss": 0.5642, "step": 10512 }, { "epoch": 4.755937570685365, "grad_norm": 0.47363168001174927, "learning_rate": 5.634676579851542e-06, "loss": 0.7315, "step": 10513 }, { "epoch": 4.756389957023298, "grad_norm": 0.4225083887577057, "learning_rate": 5.633949750185561e-06, "loss": 0.5409, "step": 10514 }, { "epoch": 4.756842343361231, "grad_norm": 0.4500342607498169, "learning_rate": 5.633222906904301e-06, "loss": 0.4971, "step": 10515 }, { "epoch": 4.757294729699163, "grad_norm": 0.46829211711883545, "learning_rate": 5.632496050023372e-06, "loss": 0.541, "step": 10516 }, { "epoch": 4.757747116037096, "grad_norm": 0.4764615595340729, "learning_rate": 5.631769179558384e-06, "loss": 0.4823, "step": 10517 }, { "epoch": 4.758199502375028, "grad_norm": 0.4096536338329315, "learning_rate": 5.631042295524949e-06, "loss": 0.4278, "step": 10518 }, { "epoch": 4.7586518887129605, "grad_norm": 0.4135589301586151, "learning_rate": 5.630315397938677e-06, "loss": 0.4797, "step": 10519 }, { "epoch": 4.759104275050894, "grad_norm": 0.4802151918411255, "learning_rate": 5.6295884868151805e-06, "loss": 0.6177, "step": 10520 }, { "epoch": 4.759556661388826, "grad_norm": 0.46656668186187744, "learning_rate": 5.62886156217007e-06, "loss": 0.5688, "step": 10521 }, { "epoch": 4.760009047726759, "grad_norm": 0.4853883683681488, "learning_rate": 5.62813462401896e-06, "loss": 0.4485, "step": 10522 }, { "epoch": 4.760461434064691, "grad_norm": 0.4213210642337799, "learning_rate": 5.6274076723774595e-06, "loss": 0.4076, "step": 10523 }, { "epoch": 4.760913820402624, "grad_norm": 0.5030396580696106, "learning_rate": 5.626680707261183e-06, "loss": 0.6336, "step": 10524 }, { "epoch": 4.761366206740556, "grad_norm": 0.4483703374862671, "learning_rate": 5.625953728685745e-06, "loss": 0.4825, "step": 10525 }, { "epoch": 4.761818593078489, "grad_norm": 0.47321978211402893, "learning_rate": 5.625226736666756e-06, "loss": 0.4663, "step": 10526 }, { "epoch": 4.762270979416422, "grad_norm": 0.5389536619186401, "learning_rate": 5.624499731219831e-06, "loss": 0.6314, "step": 10527 }, { "epoch": 4.762723365754354, "grad_norm": 0.521740734577179, "learning_rate": 5.623772712360583e-06, "loss": 0.5335, "step": 10528 }, { "epoch": 4.763175752092287, "grad_norm": 0.5178649425506592, "learning_rate": 5.623045680104629e-06, "loss": 0.4648, "step": 10529 }, { "epoch": 4.763628138430219, "grad_norm": 0.4952029287815094, "learning_rate": 5.622318634467578e-06, "loss": 0.5065, "step": 10530 }, { "epoch": 4.764080524768152, "grad_norm": 0.4462752342224121, "learning_rate": 5.621591575465051e-06, "loss": 0.4704, "step": 10531 }, { "epoch": 4.764532911106085, "grad_norm": 0.5342493057250977, "learning_rate": 5.620864503112658e-06, "loss": 0.5292, "step": 10532 }, { "epoch": 4.764985297444017, "grad_norm": 0.5278305411338806, "learning_rate": 5.6201374174260155e-06, "loss": 0.5476, "step": 10533 }, { "epoch": 4.76543768378195, "grad_norm": 0.5626249313354492, "learning_rate": 5.619410318420742e-06, "loss": 0.5213, "step": 10534 }, { "epoch": 4.765890070119882, "grad_norm": 0.5798815488815308, "learning_rate": 5.618683206112448e-06, "loss": 0.5202, "step": 10535 }, { "epoch": 4.766342456457815, "grad_norm": 0.5481441020965576, "learning_rate": 5.617956080516755e-06, "loss": 0.497, "step": 10536 }, { "epoch": 4.766794842795748, "grad_norm": 0.5616064071655273, "learning_rate": 5.617228941649275e-06, "loss": 0.5242, "step": 10537 }, { "epoch": 4.76724722913368, "grad_norm": 0.5871011018753052, "learning_rate": 5.616501789525627e-06, "loss": 0.5216, "step": 10538 }, { "epoch": 4.767699615471613, "grad_norm": 0.6023361086845398, "learning_rate": 5.615774624161428e-06, "loss": 0.5359, "step": 10539 }, { "epoch": 4.768152001809545, "grad_norm": 0.6359946131706238, "learning_rate": 5.615047445572295e-06, "loss": 0.6313, "step": 10540 }, { "epoch": 4.7686043881474784, "grad_norm": 0.6195305585861206, "learning_rate": 5.614320253773845e-06, "loss": 0.4378, "step": 10541 }, { "epoch": 4.769056774485411, "grad_norm": 0.700482189655304, "learning_rate": 5.613593048781698e-06, "loss": 0.496, "step": 10542 }, { "epoch": 4.769509160823343, "grad_norm": 0.16517873108386993, "learning_rate": 5.612865830611469e-06, "loss": 1.1615, "step": 10543 }, { "epoch": 4.769961547161276, "grad_norm": 0.24674750864505768, "learning_rate": 5.612138599278778e-06, "loss": 0.9028, "step": 10544 }, { "epoch": 4.770413933499208, "grad_norm": 0.3033297657966614, "learning_rate": 5.611411354799242e-06, "loss": 0.5767, "step": 10545 }, { "epoch": 4.770866319837141, "grad_norm": 0.3126260042190552, "learning_rate": 5.610684097188483e-06, "loss": 0.5597, "step": 10546 }, { "epoch": 4.771318706175073, "grad_norm": 0.33509308099746704, "learning_rate": 5.609956826462119e-06, "loss": 0.6587, "step": 10547 }, { "epoch": 4.771771092513006, "grad_norm": 0.32653793692588806, "learning_rate": 5.609229542635768e-06, "loss": 0.5809, "step": 10548 }, { "epoch": 4.772223478850939, "grad_norm": 0.343241810798645, "learning_rate": 5.608502245725052e-06, "loss": 0.5507, "step": 10549 }, { "epoch": 4.772675865188871, "grad_norm": 0.30600807070732117, "learning_rate": 5.607774935745589e-06, "loss": 0.4462, "step": 10550 }, { "epoch": 4.773128251526804, "grad_norm": 0.3920212388038635, "learning_rate": 5.607047612713003e-06, "loss": 0.6029, "step": 10551 }, { "epoch": 4.773580637864736, "grad_norm": 0.3687971234321594, "learning_rate": 5.60632027664291e-06, "loss": 0.6174, "step": 10552 }, { "epoch": 4.774033024202669, "grad_norm": 0.3496926724910736, "learning_rate": 5.6055929275509335e-06, "loss": 0.4138, "step": 10553 }, { "epoch": 4.774485410540602, "grad_norm": 0.3611300587654114, "learning_rate": 5.604865565452694e-06, "loss": 0.4942, "step": 10554 }, { "epoch": 4.774937796878534, "grad_norm": 0.4948473870754242, "learning_rate": 5.6041381903638135e-06, "loss": 0.6608, "step": 10555 }, { "epoch": 4.775390183216467, "grad_norm": 0.43777164816856384, "learning_rate": 5.603410802299914e-06, "loss": 0.6408, "step": 10556 }, { "epoch": 4.775842569554399, "grad_norm": 0.3913147449493408, "learning_rate": 5.6026834012766155e-06, "loss": 0.5736, "step": 10557 }, { "epoch": 4.776294955892332, "grad_norm": 0.4134710431098938, "learning_rate": 5.601955987309543e-06, "loss": 0.5836, "step": 10558 }, { "epoch": 4.776747342230265, "grad_norm": 0.45180854201316833, "learning_rate": 5.601228560414318e-06, "loss": 0.6035, "step": 10559 }, { "epoch": 4.777199728568197, "grad_norm": 0.5035602450370789, "learning_rate": 5.600501120606563e-06, "loss": 0.7183, "step": 10560 }, { "epoch": 4.77765211490613, "grad_norm": 0.4646380543708801, "learning_rate": 5.599773667901901e-06, "loss": 0.5906, "step": 10561 }, { "epoch": 4.778104501244062, "grad_norm": 0.40289121866226196, "learning_rate": 5.5990462023159575e-06, "loss": 0.4399, "step": 10562 }, { "epoch": 4.7785568875819955, "grad_norm": 0.4268597960472107, "learning_rate": 5.5983187238643544e-06, "loss": 0.6247, "step": 10563 }, { "epoch": 4.779009273919928, "grad_norm": 0.37487396597862244, "learning_rate": 5.5975912325627144e-06, "loss": 0.4426, "step": 10564 }, { "epoch": 4.77946166025786, "grad_norm": 0.43827173113822937, "learning_rate": 5.596863728426664e-06, "loss": 0.567, "step": 10565 }, { "epoch": 4.779914046595793, "grad_norm": 0.39297086000442505, "learning_rate": 5.596136211471828e-06, "loss": 0.4391, "step": 10566 }, { "epoch": 4.780366432933725, "grad_norm": 0.4338574707508087, "learning_rate": 5.59540868171383e-06, "loss": 0.5047, "step": 10567 }, { "epoch": 4.780818819271658, "grad_norm": 0.43377217650413513, "learning_rate": 5.594681139168294e-06, "loss": 0.5549, "step": 10568 }, { "epoch": 4.781271205609591, "grad_norm": 0.4426785707473755, "learning_rate": 5.593953583850847e-06, "loss": 0.4264, "step": 10569 }, { "epoch": 4.781723591947523, "grad_norm": 0.5187472701072693, "learning_rate": 5.5932260157771156e-06, "loss": 0.6552, "step": 10570 }, { "epoch": 4.782175978285456, "grad_norm": 0.49665042757987976, "learning_rate": 5.5924984349627235e-06, "loss": 0.6014, "step": 10571 }, { "epoch": 4.782628364623388, "grad_norm": 0.46717485785484314, "learning_rate": 5.591770841423299e-06, "loss": 0.4749, "step": 10572 }, { "epoch": 4.783080750961321, "grad_norm": 0.4613783359527588, "learning_rate": 5.591043235174467e-06, "loss": 0.5303, "step": 10573 }, { "epoch": 4.783533137299253, "grad_norm": 0.4838919937610626, "learning_rate": 5.590315616231854e-06, "loss": 0.562, "step": 10574 }, { "epoch": 4.783985523637186, "grad_norm": 0.5061687231063843, "learning_rate": 5.589587984611088e-06, "loss": 0.5871, "step": 10575 }, { "epoch": 4.784437909975119, "grad_norm": 0.48294609785079956, "learning_rate": 5.5888603403277955e-06, "loss": 0.43, "step": 10576 }, { "epoch": 4.784890296313051, "grad_norm": 0.4526299834251404, "learning_rate": 5.588132683397605e-06, "loss": 0.4531, "step": 10577 }, { "epoch": 4.785342682650984, "grad_norm": 0.47398412227630615, "learning_rate": 5.587405013836142e-06, "loss": 0.4378, "step": 10578 }, { "epoch": 4.785795068988916, "grad_norm": 0.4958421289920807, "learning_rate": 5.5866773316590385e-06, "loss": 0.4682, "step": 10579 }, { "epoch": 4.7862474553268495, "grad_norm": 0.5518966317176819, "learning_rate": 5.58594963688192e-06, "loss": 0.5523, "step": 10580 }, { "epoch": 4.786699841664782, "grad_norm": 0.4883647859096527, "learning_rate": 5.585221929520416e-06, "loss": 0.473, "step": 10581 }, { "epoch": 4.787152228002714, "grad_norm": 0.5138705372810364, "learning_rate": 5.584494209590154e-06, "loss": 0.5562, "step": 10582 }, { "epoch": 4.787604614340647, "grad_norm": 0.6028603315353394, "learning_rate": 5.583766477106767e-06, "loss": 0.5829, "step": 10583 }, { "epoch": 4.788057000678579, "grad_norm": 0.5583069920539856, "learning_rate": 5.583038732085881e-06, "loss": 0.581, "step": 10584 }, { "epoch": 4.7885093870165125, "grad_norm": 0.4964533746242523, "learning_rate": 5.582310974543126e-06, "loss": 0.5056, "step": 10585 }, { "epoch": 4.788961773354445, "grad_norm": 0.5421659350395203, "learning_rate": 5.581583204494133e-06, "loss": 0.4907, "step": 10586 }, { "epoch": 4.789414159692377, "grad_norm": 0.5071086883544922, "learning_rate": 5.580855421954531e-06, "loss": 0.4517, "step": 10587 }, { "epoch": 4.78986654603031, "grad_norm": 0.6023347973823547, "learning_rate": 5.580127626939952e-06, "loss": 0.5792, "step": 10588 }, { "epoch": 4.790318932368242, "grad_norm": 0.5822795629501343, "learning_rate": 5.579399819466026e-06, "loss": 0.449, "step": 10589 }, { "epoch": 4.790771318706176, "grad_norm": 0.6174877882003784, "learning_rate": 5.578671999548384e-06, "loss": 0.4956, "step": 10590 }, { "epoch": 4.791223705044108, "grad_norm": 0.5854802131652832, "learning_rate": 5.577944167202658e-06, "loss": 0.4374, "step": 10591 }, { "epoch": 4.79167609138204, "grad_norm": 0.6309381723403931, "learning_rate": 5.577216322444478e-06, "loss": 0.5153, "step": 10592 }, { "epoch": 4.792128477719973, "grad_norm": 0.18901711702346802, "learning_rate": 5.576488465289478e-06, "loss": 1.1624, "step": 10593 }, { "epoch": 4.792580864057905, "grad_norm": 0.3137471377849579, "learning_rate": 5.575760595753289e-06, "loss": 0.7977, "step": 10594 }, { "epoch": 4.793033250395838, "grad_norm": 0.34872886538505554, "learning_rate": 5.575032713851544e-06, "loss": 0.7375, "step": 10595 }, { "epoch": 4.79348563673377, "grad_norm": 0.3384619355201721, "learning_rate": 5.574304819599875e-06, "loss": 0.6574, "step": 10596 }, { "epoch": 4.7939380230717035, "grad_norm": 0.32387152314186096, "learning_rate": 5.573576913013914e-06, "loss": 0.4354, "step": 10597 }, { "epoch": 4.794390409409636, "grad_norm": 0.32389116287231445, "learning_rate": 5.572848994109296e-06, "loss": 0.5901, "step": 10598 }, { "epoch": 4.794842795747568, "grad_norm": 0.3207359313964844, "learning_rate": 5.572121062901653e-06, "loss": 0.5648, "step": 10599 }, { "epoch": 4.795295182085501, "grad_norm": 0.3549423813819885, "learning_rate": 5.57139311940662e-06, "loss": 0.5865, "step": 10600 }, { "epoch": 4.795295182085501, "eval_loss": 0.5894350409507751, "eval_runtime": 25.5791, "eval_samples_per_second": 29.086, "eval_steps_per_second": 7.272, "step": 10600 }, { "epoch": 4.795747568423433, "grad_norm": 0.40710315108299255, "learning_rate": 5.570665163639829e-06, "loss": 0.7546, "step": 10601 }, { "epoch": 4.7961999547613665, "grad_norm": 0.326852411031723, "learning_rate": 5.569937195616917e-06, "loss": 0.5155, "step": 10602 }, { "epoch": 4.796652341099299, "grad_norm": 0.3395182490348816, "learning_rate": 5.569209215353516e-06, "loss": 0.5345, "step": 10603 }, { "epoch": 4.797104727437231, "grad_norm": 0.3730272948741913, "learning_rate": 5.568481222865263e-06, "loss": 0.5782, "step": 10604 }, { "epoch": 4.797557113775164, "grad_norm": 0.44009390473365784, "learning_rate": 5.567753218167791e-06, "loss": 0.8552, "step": 10605 }, { "epoch": 4.798009500113096, "grad_norm": 0.3569817543029785, "learning_rate": 5.567025201276736e-06, "loss": 0.4912, "step": 10606 }, { "epoch": 4.79846188645103, "grad_norm": 0.34989210963249207, "learning_rate": 5.5662971722077356e-06, "loss": 0.4339, "step": 10607 }, { "epoch": 4.798914272788962, "grad_norm": 0.43517908453941345, "learning_rate": 5.5655691309764225e-06, "loss": 0.6074, "step": 10608 }, { "epoch": 4.799366659126894, "grad_norm": 0.437010258436203, "learning_rate": 5.564841077598433e-06, "loss": 0.5767, "step": 10609 }, { "epoch": 4.799819045464827, "grad_norm": 0.4478224217891693, "learning_rate": 5.564113012089406e-06, "loss": 0.6112, "step": 10610 }, { "epoch": 4.800271431802759, "grad_norm": 0.4307814836502075, "learning_rate": 5.563384934464975e-06, "loss": 0.6174, "step": 10611 }, { "epoch": 4.800723818140693, "grad_norm": 0.4392803907394409, "learning_rate": 5.562656844740779e-06, "loss": 0.6313, "step": 10612 }, { "epoch": 4.801176204478625, "grad_norm": 0.4293891489505768, "learning_rate": 5.561928742932456e-06, "loss": 0.56, "step": 10613 }, { "epoch": 4.8016285908165575, "grad_norm": 0.4617120921611786, "learning_rate": 5.56120062905564e-06, "loss": 0.5584, "step": 10614 }, { "epoch": 4.80208097715449, "grad_norm": 0.4010477066040039, "learning_rate": 5.56047250312597e-06, "loss": 0.5151, "step": 10615 }, { "epoch": 4.802533363492422, "grad_norm": 0.4509068727493286, "learning_rate": 5.559744365159086e-06, "loss": 0.605, "step": 10616 }, { "epoch": 4.802985749830355, "grad_norm": 0.4670351445674896, "learning_rate": 5.559016215170623e-06, "loss": 0.5443, "step": 10617 }, { "epoch": 4.803438136168288, "grad_norm": 0.4176243543624878, "learning_rate": 5.558288053176221e-06, "loss": 0.5251, "step": 10618 }, { "epoch": 4.8038905225062205, "grad_norm": 0.4113883376121521, "learning_rate": 5.5575598791915216e-06, "loss": 0.4556, "step": 10619 }, { "epoch": 4.804342908844153, "grad_norm": 0.45242902636528015, "learning_rate": 5.556831693232159e-06, "loss": 0.5664, "step": 10620 }, { "epoch": 4.804795295182085, "grad_norm": 0.532323956489563, "learning_rate": 5.556103495313774e-06, "loss": 0.4572, "step": 10621 }, { "epoch": 4.805247681520018, "grad_norm": 0.4518415331840515, "learning_rate": 5.555375285452005e-06, "loss": 0.5413, "step": 10622 }, { "epoch": 4.80570006785795, "grad_norm": 0.5011792182922363, "learning_rate": 5.5546470636624936e-06, "loss": 0.5679, "step": 10623 }, { "epoch": 4.8061524541958835, "grad_norm": 0.4611097574234009, "learning_rate": 5.553918829960879e-06, "loss": 0.5348, "step": 10624 }, { "epoch": 4.806604840533816, "grad_norm": 0.5087568163871765, "learning_rate": 5.553190584362802e-06, "loss": 0.4957, "step": 10625 }, { "epoch": 4.807057226871748, "grad_norm": 0.43072202801704407, "learning_rate": 5.5524623268839015e-06, "loss": 0.466, "step": 10626 }, { "epoch": 4.807509613209681, "grad_norm": 0.48183533549308777, "learning_rate": 5.551734057539821e-06, "loss": 0.5039, "step": 10627 }, { "epoch": 4.807961999547613, "grad_norm": 0.46585381031036377, "learning_rate": 5.551005776346198e-06, "loss": 0.4265, "step": 10628 }, { "epoch": 4.808414385885547, "grad_norm": 0.4941990375518799, "learning_rate": 5.550277483318676e-06, "loss": 0.5127, "step": 10629 }, { "epoch": 4.808866772223479, "grad_norm": 0.5438737273216248, "learning_rate": 5.549549178472896e-06, "loss": 0.6031, "step": 10630 }, { "epoch": 4.8093191585614115, "grad_norm": 0.5102017521858215, "learning_rate": 5.548820861824499e-06, "loss": 0.5767, "step": 10631 }, { "epoch": 4.809771544899344, "grad_norm": 0.5016269683837891, "learning_rate": 5.548092533389128e-06, "loss": 0.459, "step": 10632 }, { "epoch": 4.810223931237276, "grad_norm": 0.5231689214706421, "learning_rate": 5.547364193182425e-06, "loss": 0.5028, "step": 10633 }, { "epoch": 4.81067631757521, "grad_norm": 0.5030825734138489, "learning_rate": 5.5466358412200315e-06, "loss": 0.4975, "step": 10634 }, { "epoch": 4.811128703913142, "grad_norm": 0.5493812561035156, "learning_rate": 5.545907477517593e-06, "loss": 0.5053, "step": 10635 }, { "epoch": 4.8115810902510745, "grad_norm": 0.6150582432746887, "learning_rate": 5.545179102090748e-06, "loss": 0.5648, "step": 10636 }, { "epoch": 4.812033476589007, "grad_norm": 0.5212494134902954, "learning_rate": 5.544450714955144e-06, "loss": 0.4374, "step": 10637 }, { "epoch": 4.812485862926939, "grad_norm": 0.5714514851570129, "learning_rate": 5.543722316126422e-06, "loss": 0.5057, "step": 10638 }, { "epoch": 4.812938249264873, "grad_norm": 0.5146077275276184, "learning_rate": 5.5429939056202275e-06, "loss": 0.4649, "step": 10639 }, { "epoch": 4.813390635602805, "grad_norm": 0.6208315491676331, "learning_rate": 5.542265483452203e-06, "loss": 0.4864, "step": 10640 }, { "epoch": 4.8138430219407375, "grad_norm": 0.6151208877563477, "learning_rate": 5.541537049637995e-06, "loss": 0.5041, "step": 10641 }, { "epoch": 4.81429540827867, "grad_norm": 0.6455155611038208, "learning_rate": 5.5408086041932445e-06, "loss": 0.4901, "step": 10642 }, { "epoch": 4.814747794616602, "grad_norm": 0.1701783984899521, "learning_rate": 5.540080147133599e-06, "loss": 1.3796, "step": 10643 }, { "epoch": 4.815200180954535, "grad_norm": 0.2885948121547699, "learning_rate": 5.539351678474701e-06, "loss": 0.5807, "step": 10644 }, { "epoch": 4.815652567292468, "grad_norm": 0.30852243304252625, "learning_rate": 5.538623198232198e-06, "loss": 0.626, "step": 10645 }, { "epoch": 4.816104953630401, "grad_norm": 0.30924493074417114, "learning_rate": 5.537894706421736e-06, "loss": 0.5727, "step": 10646 }, { "epoch": 4.816557339968333, "grad_norm": 0.3705731928348541, "learning_rate": 5.537166203058959e-06, "loss": 0.579, "step": 10647 }, { "epoch": 4.8170097263062654, "grad_norm": 0.32766759395599365, "learning_rate": 5.536437688159513e-06, "loss": 0.5396, "step": 10648 }, { "epoch": 4.817462112644198, "grad_norm": 0.3397770822048187, "learning_rate": 5.5357091617390454e-06, "loss": 0.5998, "step": 10649 }, { "epoch": 4.81791449898213, "grad_norm": 0.36553072929382324, "learning_rate": 5.534980623813202e-06, "loss": 0.5723, "step": 10650 }, { "epoch": 4.818366885320064, "grad_norm": 0.4258970022201538, "learning_rate": 5.53425207439763e-06, "loss": 0.6403, "step": 10651 }, { "epoch": 4.818819271657996, "grad_norm": 0.42697691917419434, "learning_rate": 5.533523513507978e-06, "loss": 0.7012, "step": 10652 }, { "epoch": 4.8192716579959285, "grad_norm": 0.37158283591270447, "learning_rate": 5.532794941159889e-06, "loss": 0.5238, "step": 10653 }, { "epoch": 4.819724044333861, "grad_norm": 0.42049577832221985, "learning_rate": 5.532066357369012e-06, "loss": 0.6326, "step": 10654 }, { "epoch": 4.820176430671793, "grad_norm": 0.5167102813720703, "learning_rate": 5.5313377621509975e-06, "loss": 0.5492, "step": 10655 }, { "epoch": 4.820628817009727, "grad_norm": 0.4208316206932068, "learning_rate": 5.5306091555214906e-06, "loss": 0.488, "step": 10656 }, { "epoch": 4.821081203347659, "grad_norm": 0.48333486914634705, "learning_rate": 5.5298805374961404e-06, "loss": 0.7112, "step": 10657 }, { "epoch": 4.8215335896855915, "grad_norm": 0.4688965678215027, "learning_rate": 5.529151908090595e-06, "loss": 0.5655, "step": 10658 }, { "epoch": 4.821985976023524, "grad_norm": 0.47764426469802856, "learning_rate": 5.528423267320504e-06, "loss": 0.7405, "step": 10659 }, { "epoch": 4.822438362361456, "grad_norm": 0.48138415813446045, "learning_rate": 5.527694615201515e-06, "loss": 0.6868, "step": 10660 }, { "epoch": 4.82289074869939, "grad_norm": 0.37993505597114563, "learning_rate": 5.52696595174928e-06, "loss": 0.5209, "step": 10661 }, { "epoch": 4.823343135037322, "grad_norm": 0.35909372568130493, "learning_rate": 5.526237276979445e-06, "loss": 0.3535, "step": 10662 }, { "epoch": 4.823795521375255, "grad_norm": 0.4783843755722046, "learning_rate": 5.525508590907661e-06, "loss": 0.5753, "step": 10663 }, { "epoch": 4.824247907713187, "grad_norm": 0.4129186272621155, "learning_rate": 5.52477989354958e-06, "loss": 0.4613, "step": 10664 }, { "epoch": 4.824700294051119, "grad_norm": 0.46761783957481384, "learning_rate": 5.524051184920849e-06, "loss": 0.575, "step": 10665 }, { "epoch": 4.825152680389052, "grad_norm": 0.4259720742702484, "learning_rate": 5.5233224650371185e-06, "loss": 0.4977, "step": 10666 }, { "epoch": 4.825605066726985, "grad_norm": 0.45797133445739746, "learning_rate": 5.522593733914041e-06, "loss": 0.5406, "step": 10667 }, { "epoch": 4.826057453064918, "grad_norm": 0.5024190545082092, "learning_rate": 5.5218649915672675e-06, "loss": 0.5584, "step": 10668 }, { "epoch": 4.82650983940285, "grad_norm": 0.46718376874923706, "learning_rate": 5.521136238012448e-06, "loss": 0.5559, "step": 10669 }, { "epoch": 4.8269622257407825, "grad_norm": 0.45739543437957764, "learning_rate": 5.520407473265233e-06, "loss": 0.5124, "step": 10670 }, { "epoch": 4.827414612078715, "grad_norm": 0.4382256865501404, "learning_rate": 5.519678697341276e-06, "loss": 0.4747, "step": 10671 }, { "epoch": 4.827866998416647, "grad_norm": 0.5244066119194031, "learning_rate": 5.518949910256228e-06, "loss": 0.5831, "step": 10672 }, { "epoch": 4.828319384754581, "grad_norm": 0.4642459452152252, "learning_rate": 5.5182211120257425e-06, "loss": 0.536, "step": 10673 }, { "epoch": 4.828771771092513, "grad_norm": 0.48291295766830444, "learning_rate": 5.517492302665469e-06, "loss": 0.5495, "step": 10674 }, { "epoch": 4.8292241574304455, "grad_norm": 0.5425096750259399, "learning_rate": 5.516763482191063e-06, "loss": 0.6374, "step": 10675 }, { "epoch": 4.829676543768378, "grad_norm": 0.49008890986442566, "learning_rate": 5.516034650618175e-06, "loss": 0.5005, "step": 10676 }, { "epoch": 4.83012893010631, "grad_norm": 0.4834480285644531, "learning_rate": 5.51530580796246e-06, "loss": 0.51, "step": 10677 }, { "epoch": 4.830581316444244, "grad_norm": 0.5267611742019653, "learning_rate": 5.514576954239569e-06, "loss": 0.5193, "step": 10678 }, { "epoch": 4.831033702782176, "grad_norm": 0.48962894082069397, "learning_rate": 5.5138480894651565e-06, "loss": 0.4441, "step": 10679 }, { "epoch": 4.831486089120109, "grad_norm": 0.5639973282814026, "learning_rate": 5.513119213654878e-06, "loss": 0.5269, "step": 10680 }, { "epoch": 4.831938475458041, "grad_norm": 0.5151032209396362, "learning_rate": 5.512390326824385e-06, "loss": 0.502, "step": 10681 }, { "epoch": 4.832390861795973, "grad_norm": 0.5460888743400574, "learning_rate": 5.511661428989333e-06, "loss": 0.5934, "step": 10682 }, { "epoch": 4.832843248133907, "grad_norm": 0.5449122786521912, "learning_rate": 5.5109325201653775e-06, "loss": 0.4874, "step": 10683 }, { "epoch": 4.833295634471839, "grad_norm": 0.5229772925376892, "learning_rate": 5.51020360036817e-06, "loss": 0.4699, "step": 10684 }, { "epoch": 4.833748020809772, "grad_norm": 0.6125257611274719, "learning_rate": 5.5094746696133694e-06, "loss": 0.6124, "step": 10685 }, { "epoch": 4.834200407147704, "grad_norm": 0.6167307496070862, "learning_rate": 5.508745727916629e-06, "loss": 0.5259, "step": 10686 }, { "epoch": 4.8346527934856365, "grad_norm": 0.5658557415008545, "learning_rate": 5.508016775293602e-06, "loss": 0.4927, "step": 10687 }, { "epoch": 4.83510517982357, "grad_norm": 0.6092032790184021, "learning_rate": 5.5072878117599475e-06, "loss": 0.5289, "step": 10688 }, { "epoch": 4.835557566161502, "grad_norm": 0.623598039150238, "learning_rate": 5.506558837331319e-06, "loss": 0.5448, "step": 10689 }, { "epoch": 4.836009952499435, "grad_norm": 0.6051155924797058, "learning_rate": 5.505829852023376e-06, "loss": 0.57, "step": 10690 }, { "epoch": 4.836462338837367, "grad_norm": 0.5330275893211365, "learning_rate": 5.505100855851771e-06, "loss": 0.3758, "step": 10691 }, { "epoch": 4.8369147251752995, "grad_norm": 0.6673328876495361, "learning_rate": 5.5043718488321605e-06, "loss": 0.5205, "step": 10692 }, { "epoch": 4.837367111513232, "grad_norm": 0.16444748640060425, "learning_rate": 5.503642830980205e-06, "loss": 1.208, "step": 10693 }, { "epoch": 4.837819497851165, "grad_norm": 0.2379164695739746, "learning_rate": 5.502913802311558e-06, "loss": 0.8758, "step": 10694 }, { "epoch": 4.838271884189098, "grad_norm": 0.29769963026046753, "learning_rate": 5.502184762841879e-06, "loss": 0.6486, "step": 10695 }, { "epoch": 4.83872427052703, "grad_norm": 0.29672953486442566, "learning_rate": 5.501455712586825e-06, "loss": 0.4952, "step": 10696 }, { "epoch": 4.839176656864963, "grad_norm": 0.36255648732185364, "learning_rate": 5.500726651562054e-06, "loss": 0.6169, "step": 10697 }, { "epoch": 4.839629043202895, "grad_norm": 0.38637417554855347, "learning_rate": 5.499997579783223e-06, "loss": 0.672, "step": 10698 }, { "epoch": 4.840081429540827, "grad_norm": 0.40008366107940674, "learning_rate": 5.499268497265991e-06, "loss": 0.7002, "step": 10699 }, { "epoch": 4.840533815878761, "grad_norm": 0.3774268329143524, "learning_rate": 5.498539404026015e-06, "loss": 0.617, "step": 10700 }, { "epoch": 4.840986202216693, "grad_norm": 0.36988943815231323, "learning_rate": 5.497810300078955e-06, "loss": 0.5644, "step": 10701 }, { "epoch": 4.841438588554626, "grad_norm": 0.36563077569007874, "learning_rate": 5.497081185440471e-06, "loss": 0.5736, "step": 10702 }, { "epoch": 4.841890974892558, "grad_norm": 0.3946436047554016, "learning_rate": 5.496352060126219e-06, "loss": 0.4779, "step": 10703 }, { "epoch": 4.8423433612304905, "grad_norm": 0.4781540334224701, "learning_rate": 5.495622924151861e-06, "loss": 0.5986, "step": 10704 }, { "epoch": 4.842795747568424, "grad_norm": 0.4073505401611328, "learning_rate": 5.494893777533056e-06, "loss": 0.6344, "step": 10705 }, { "epoch": 4.843248133906356, "grad_norm": 0.46101632714271545, "learning_rate": 5.494164620285463e-06, "loss": 0.5848, "step": 10706 }, { "epoch": 4.843700520244289, "grad_norm": 0.41103339195251465, "learning_rate": 5.493435452424742e-06, "loss": 0.5519, "step": 10707 }, { "epoch": 4.844152906582221, "grad_norm": 0.4129759967327118, "learning_rate": 5.4927062739665545e-06, "loss": 0.5165, "step": 10708 }, { "epoch": 4.8446052929201535, "grad_norm": 0.40174996852874756, "learning_rate": 5.491977084926562e-06, "loss": 0.5057, "step": 10709 }, { "epoch": 4.845057679258087, "grad_norm": 0.42792901396751404, "learning_rate": 5.491247885320422e-06, "loss": 0.5334, "step": 10710 }, { "epoch": 4.845510065596019, "grad_norm": 0.39393603801727295, "learning_rate": 5.490518675163797e-06, "loss": 0.4926, "step": 10711 }, { "epoch": 4.845962451933952, "grad_norm": 0.4895305037498474, "learning_rate": 5.489789454472349e-06, "loss": 0.6485, "step": 10712 }, { "epoch": 4.846414838271884, "grad_norm": 0.4719812572002411, "learning_rate": 5.489060223261738e-06, "loss": 0.602, "step": 10713 }, { "epoch": 4.846867224609817, "grad_norm": 0.5158295035362244, "learning_rate": 5.488330981547626e-06, "loss": 0.641, "step": 10714 }, { "epoch": 4.847319610947749, "grad_norm": 0.4700779914855957, "learning_rate": 5.487601729345676e-06, "loss": 0.603, "step": 10715 }, { "epoch": 4.847771997285682, "grad_norm": 0.44300007820129395, "learning_rate": 5.486872466671549e-06, "loss": 0.5357, "step": 10716 }, { "epoch": 4.848224383623615, "grad_norm": 0.37552890181541443, "learning_rate": 5.486143193540906e-06, "loss": 0.4387, "step": 10717 }, { "epoch": 4.848676769961547, "grad_norm": 0.45301711559295654, "learning_rate": 5.485413909969413e-06, "loss": 0.4983, "step": 10718 }, { "epoch": 4.84912915629948, "grad_norm": 0.419474333524704, "learning_rate": 5.484684615972729e-06, "loss": 0.546, "step": 10719 }, { "epoch": 4.849581542637412, "grad_norm": 0.5253953337669373, "learning_rate": 5.483955311566521e-06, "loss": 0.6527, "step": 10720 }, { "epoch": 4.8500339289753445, "grad_norm": 0.4899841547012329, "learning_rate": 5.483225996766449e-06, "loss": 0.5731, "step": 10721 }, { "epoch": 4.850486315313278, "grad_norm": 0.4678986668586731, "learning_rate": 5.482496671588177e-06, "loss": 0.5362, "step": 10722 }, { "epoch": 4.85093870165121, "grad_norm": 0.447091281414032, "learning_rate": 5.4817673360473685e-06, "loss": 0.4977, "step": 10723 }, { "epoch": 4.851391087989143, "grad_norm": 0.5095344185829163, "learning_rate": 5.481037990159689e-06, "loss": 0.5273, "step": 10724 }, { "epoch": 4.851843474327075, "grad_norm": 0.4744347333908081, "learning_rate": 5.4803086339408005e-06, "loss": 0.5577, "step": 10725 }, { "epoch": 4.8522958606650075, "grad_norm": 0.43966221809387207, "learning_rate": 5.4795792674063694e-06, "loss": 0.4607, "step": 10726 }, { "epoch": 4.852748247002941, "grad_norm": 0.48340317606925964, "learning_rate": 5.478849890572059e-06, "loss": 0.4653, "step": 10727 }, { "epoch": 4.853200633340873, "grad_norm": 0.4366174042224884, "learning_rate": 5.478120503453533e-06, "loss": 0.3906, "step": 10728 }, { "epoch": 4.853653019678806, "grad_norm": 0.550967812538147, "learning_rate": 5.477391106066459e-06, "loss": 0.5324, "step": 10729 }, { "epoch": 4.854105406016738, "grad_norm": 0.5361813902854919, "learning_rate": 5.4766616984265e-06, "loss": 0.5651, "step": 10730 }, { "epoch": 4.8545577923546706, "grad_norm": 0.5160470008850098, "learning_rate": 5.475932280549322e-06, "loss": 0.4895, "step": 10731 }, { "epoch": 4.855010178692604, "grad_norm": 0.5530928373336792, "learning_rate": 5.475202852450593e-06, "loss": 0.5345, "step": 10732 }, { "epoch": 4.855462565030536, "grad_norm": 0.5396880507469177, "learning_rate": 5.4744734141459744e-06, "loss": 0.5058, "step": 10733 }, { "epoch": 4.855914951368469, "grad_norm": 0.6169857978820801, "learning_rate": 5.473743965651136e-06, "loss": 0.4829, "step": 10734 }, { "epoch": 4.856367337706401, "grad_norm": 0.5665090084075928, "learning_rate": 5.473014506981741e-06, "loss": 0.5833, "step": 10735 }, { "epoch": 4.856819724044334, "grad_norm": 0.6095009446144104, "learning_rate": 5.472285038153459e-06, "loss": 0.5793, "step": 10736 }, { "epoch": 4.857272110382267, "grad_norm": 0.5633830428123474, "learning_rate": 5.471555559181955e-06, "loss": 0.5028, "step": 10737 }, { "epoch": 4.857724496720199, "grad_norm": 0.635465145111084, "learning_rate": 5.470826070082895e-06, "loss": 0.6011, "step": 10738 }, { "epoch": 4.858176883058132, "grad_norm": 0.6760136485099792, "learning_rate": 5.470096570871949e-06, "loss": 0.6034, "step": 10739 }, { "epoch": 4.858629269396064, "grad_norm": 0.6307496428489685, "learning_rate": 5.469367061564783e-06, "loss": 0.5441, "step": 10740 }, { "epoch": 4.859081655733997, "grad_norm": 0.6303584575653076, "learning_rate": 5.468637542177063e-06, "loss": 0.5195, "step": 10741 }, { "epoch": 4.859534042071929, "grad_norm": 0.6690105199813843, "learning_rate": 5.46790801272446e-06, "loss": 0.4595, "step": 10742 }, { "epoch": 4.859986428409862, "grad_norm": 0.17297790944576263, "learning_rate": 5.467178473222641e-06, "loss": 1.1436, "step": 10743 }, { "epoch": 4.860438814747795, "grad_norm": 0.2716812491416931, "learning_rate": 5.466448923687271e-06, "loss": 0.6727, "step": 10744 }, { "epoch": 4.860891201085727, "grad_norm": 0.34095290303230286, "learning_rate": 5.465719364134022e-06, "loss": 0.6283, "step": 10745 }, { "epoch": 4.86134358742366, "grad_norm": 0.32295793294906616, "learning_rate": 5.464989794578563e-06, "loss": 0.5312, "step": 10746 }, { "epoch": 4.861795973761592, "grad_norm": 0.3254340887069702, "learning_rate": 5.46426021503656e-06, "loss": 0.5219, "step": 10747 }, { "epoch": 4.8622483600995245, "grad_norm": 0.3782033622264862, "learning_rate": 5.463530625523683e-06, "loss": 0.581, "step": 10748 }, { "epoch": 4.862700746437458, "grad_norm": 0.40180033445358276, "learning_rate": 5.462801026055604e-06, "loss": 0.636, "step": 10749 }, { "epoch": 4.86315313277539, "grad_norm": 0.37669265270233154, "learning_rate": 5.462071416647989e-06, "loss": 0.5945, "step": 10750 }, { "epoch": 4.863605519113323, "grad_norm": 0.4196114242076874, "learning_rate": 5.46134179731651e-06, "loss": 0.654, "step": 10751 }, { "epoch": 4.864057905451255, "grad_norm": 0.40112727880477905, "learning_rate": 5.460612168076837e-06, "loss": 0.5081, "step": 10752 }, { "epoch": 4.864510291789188, "grad_norm": 0.35975903272628784, "learning_rate": 5.459882528944639e-06, "loss": 0.4454, "step": 10753 }, { "epoch": 4.864962678127121, "grad_norm": 0.4164198935031891, "learning_rate": 5.459152879935588e-06, "loss": 0.5864, "step": 10754 }, { "epoch": 4.865415064465053, "grad_norm": 0.35770654678344727, "learning_rate": 5.458423221065352e-06, "loss": 0.4376, "step": 10755 }, { "epoch": 4.865867450802986, "grad_norm": 0.4196317791938782, "learning_rate": 5.457693552349603e-06, "loss": 0.5902, "step": 10756 }, { "epoch": 4.866319837140918, "grad_norm": 0.4495839774608612, "learning_rate": 5.456963873804013e-06, "loss": 0.5468, "step": 10757 }, { "epoch": 4.866772223478851, "grad_norm": 0.4423122704029083, "learning_rate": 5.456234185444252e-06, "loss": 0.5692, "step": 10758 }, { "epoch": 4.867224609816784, "grad_norm": 0.42846328020095825, "learning_rate": 5.455504487285993e-06, "loss": 0.6801, "step": 10759 }, { "epoch": 4.867676996154716, "grad_norm": 0.45947232842445374, "learning_rate": 5.454774779344906e-06, "loss": 0.6272, "step": 10760 }, { "epoch": 4.868129382492649, "grad_norm": 0.45724818110466003, "learning_rate": 5.4540450616366635e-06, "loss": 0.6435, "step": 10761 }, { "epoch": 4.868581768830581, "grad_norm": 0.4259015917778015, "learning_rate": 5.4533153341769374e-06, "loss": 0.5228, "step": 10762 }, { "epoch": 4.869034155168514, "grad_norm": 0.4900253415107727, "learning_rate": 5.4525855969814e-06, "loss": 0.637, "step": 10763 }, { "epoch": 4.869486541506447, "grad_norm": 0.41377559304237366, "learning_rate": 5.451855850065725e-06, "loss": 0.5179, "step": 10764 }, { "epoch": 4.869938927844379, "grad_norm": 0.40965908765792847, "learning_rate": 5.451126093445584e-06, "loss": 0.494, "step": 10765 }, { "epoch": 4.870391314182312, "grad_norm": 0.4458225965499878, "learning_rate": 5.45039632713665e-06, "loss": 0.5373, "step": 10766 }, { "epoch": 4.870843700520244, "grad_norm": 0.4430021643638611, "learning_rate": 5.449666551154595e-06, "loss": 0.5553, "step": 10767 }, { "epoch": 4.871296086858177, "grad_norm": 0.4292522966861725, "learning_rate": 5.448936765515095e-06, "loss": 0.4977, "step": 10768 }, { "epoch": 4.871748473196109, "grad_norm": 0.47845083475112915, "learning_rate": 5.448206970233821e-06, "loss": 0.5202, "step": 10769 }, { "epoch": 4.872200859534042, "grad_norm": 0.4952782988548279, "learning_rate": 5.447477165326448e-06, "loss": 0.5105, "step": 10770 }, { "epoch": 4.872653245871975, "grad_norm": 0.5097358226776123, "learning_rate": 5.4467473508086496e-06, "loss": 0.5621, "step": 10771 }, { "epoch": 4.873105632209907, "grad_norm": 0.4741162061691284, "learning_rate": 5.4460175266961004e-06, "loss": 0.547, "step": 10772 }, { "epoch": 4.87355801854784, "grad_norm": 0.4885275363922119, "learning_rate": 5.445287693004474e-06, "loss": 0.5165, "step": 10773 }, { "epoch": 4.874010404885772, "grad_norm": 0.4771646559238434, "learning_rate": 5.444557849749446e-06, "loss": 0.4374, "step": 10774 }, { "epoch": 4.874462791223705, "grad_norm": 0.49059394001960754, "learning_rate": 5.44382799694669e-06, "loss": 0.4684, "step": 10775 }, { "epoch": 4.874915177561638, "grad_norm": 0.46176809072494507, "learning_rate": 5.443098134611882e-06, "loss": 0.4254, "step": 10776 }, { "epoch": 4.87536756389957, "grad_norm": 0.6018497943878174, "learning_rate": 5.4423682627606975e-06, "loss": 0.6508, "step": 10777 }, { "epoch": 4.875819950237503, "grad_norm": 0.4846925139427185, "learning_rate": 5.44163838140881e-06, "loss": 0.4767, "step": 10778 }, { "epoch": 4.876272336575435, "grad_norm": 0.48891979455947876, "learning_rate": 5.440908490571897e-06, "loss": 0.5086, "step": 10779 }, { "epoch": 4.876724722913368, "grad_norm": 0.46268412470817566, "learning_rate": 5.4401785902656326e-06, "loss": 0.4655, "step": 10780 }, { "epoch": 4.877177109251301, "grad_norm": 0.4944525361061096, "learning_rate": 5.4394486805056934e-06, "loss": 0.4805, "step": 10781 }, { "epoch": 4.877629495589233, "grad_norm": 0.5008891224861145, "learning_rate": 5.438718761307757e-06, "loss": 0.4995, "step": 10782 }, { "epoch": 4.878081881927166, "grad_norm": 0.5625344514846802, "learning_rate": 5.437988832687497e-06, "loss": 0.4753, "step": 10783 }, { "epoch": 4.878534268265098, "grad_norm": 0.50958651304245, "learning_rate": 5.437258894660594e-06, "loss": 0.452, "step": 10784 }, { "epoch": 4.878986654603031, "grad_norm": 0.5639791488647461, "learning_rate": 5.436528947242721e-06, "loss": 0.5034, "step": 10785 }, { "epoch": 4.879439040940964, "grad_norm": 0.6078188419342041, "learning_rate": 5.435798990449558e-06, "loss": 0.5134, "step": 10786 }, { "epoch": 4.8798914272788965, "grad_norm": 0.624315083026886, "learning_rate": 5.435069024296779e-06, "loss": 0.5436, "step": 10787 }, { "epoch": 4.880343813616829, "grad_norm": 0.5767595767974854, "learning_rate": 5.434339048800066e-06, "loss": 0.4406, "step": 10788 }, { "epoch": 4.880796199954761, "grad_norm": 0.579104483127594, "learning_rate": 5.433609063975092e-06, "loss": 0.4954, "step": 10789 }, { "epoch": 4.881248586292694, "grad_norm": 0.575593888759613, "learning_rate": 5.4328790698375355e-06, "loss": 0.4597, "step": 10790 }, { "epoch": 4.881700972630626, "grad_norm": 0.5822521448135376, "learning_rate": 5.4321490664030775e-06, "loss": 0.4822, "step": 10791 }, { "epoch": 4.8821533589685595, "grad_norm": 0.8448026776313782, "learning_rate": 5.431419053687393e-06, "loss": 0.5958, "step": 10792 }, { "epoch": 4.882605745306492, "grad_norm": 0.21772585809230804, "learning_rate": 5.430689031706162e-06, "loss": 1.2254, "step": 10793 }, { "epoch": 4.883058131644424, "grad_norm": 0.3237381875514984, "learning_rate": 5.4299590004750635e-06, "loss": 0.7507, "step": 10794 }, { "epoch": 4.883510517982357, "grad_norm": 0.3535354733467102, "learning_rate": 5.429228960009776e-06, "loss": 0.6783, "step": 10795 }, { "epoch": 4.883962904320289, "grad_norm": 0.3884057104587555, "learning_rate": 5.428498910325977e-06, "loss": 0.6008, "step": 10796 }, { "epoch": 4.884415290658222, "grad_norm": 0.31087735295295715, "learning_rate": 5.427768851439348e-06, "loss": 0.5712, "step": 10797 }, { "epoch": 4.884867676996155, "grad_norm": 0.421701580286026, "learning_rate": 5.4270387833655655e-06, "loss": 0.6586, "step": 10798 }, { "epoch": 4.885320063334087, "grad_norm": 0.3880407512187958, "learning_rate": 5.426308706120314e-06, "loss": 0.6025, "step": 10799 }, { "epoch": 4.88577244967202, "grad_norm": 0.37353551387786865, "learning_rate": 5.425578619719268e-06, "loss": 0.5811, "step": 10800 }, { "epoch": 4.88577244967202, "eval_loss": 0.5917639136314392, "eval_runtime": 25.9931, "eval_samples_per_second": 28.623, "eval_steps_per_second": 7.156, "step": 10800 }, { "epoch": 4.886224836009952, "grad_norm": 0.4688951075077057, "learning_rate": 5.424848524178111e-06, "loss": 0.684, "step": 10801 }, { "epoch": 4.886677222347885, "grad_norm": 0.3975757658481598, "learning_rate": 5.4241184195125205e-06, "loss": 0.5691, "step": 10802 }, { "epoch": 4.887129608685818, "grad_norm": 0.43729346990585327, "learning_rate": 5.42338830573818e-06, "loss": 0.588, "step": 10803 }, { "epoch": 4.8875819950237505, "grad_norm": 0.3720282018184662, "learning_rate": 5.422658182870767e-06, "loss": 0.4906, "step": 10804 }, { "epoch": 4.888034381361683, "grad_norm": 0.3992096781730652, "learning_rate": 5.421928050925964e-06, "loss": 0.575, "step": 10805 }, { "epoch": 4.888486767699615, "grad_norm": 0.5181128978729248, "learning_rate": 5.4211979099194515e-06, "loss": 0.6261, "step": 10806 }, { "epoch": 4.888939154037548, "grad_norm": 0.4332115054130554, "learning_rate": 5.420467759866911e-06, "loss": 0.511, "step": 10807 }, { "epoch": 4.889391540375481, "grad_norm": 0.48030176758766174, "learning_rate": 5.419737600784024e-06, "loss": 0.5097, "step": 10808 }, { "epoch": 4.8898439267134135, "grad_norm": 0.4467189311981201, "learning_rate": 5.41900743268647e-06, "loss": 0.53, "step": 10809 }, { "epoch": 4.890296313051346, "grad_norm": 0.3780410587787628, "learning_rate": 5.418277255589935e-06, "loss": 0.5056, "step": 10810 }, { "epoch": 4.890748699389278, "grad_norm": 0.48517027497291565, "learning_rate": 5.417547069510098e-06, "loss": 0.6443, "step": 10811 }, { "epoch": 4.891201085727211, "grad_norm": 0.43830952048301697, "learning_rate": 5.416816874462641e-06, "loss": 0.5479, "step": 10812 }, { "epoch": 4.891653472065144, "grad_norm": 0.4581489861011505, "learning_rate": 5.416086670463247e-06, "loss": 0.5434, "step": 10813 }, { "epoch": 4.8921058584030765, "grad_norm": 0.491134911775589, "learning_rate": 5.415356457527599e-06, "loss": 0.5413, "step": 10814 }, { "epoch": 4.892558244741009, "grad_norm": 0.40182146430015564, "learning_rate": 5.414626235671379e-06, "loss": 0.4265, "step": 10815 }, { "epoch": 4.893010631078941, "grad_norm": 0.454787939786911, "learning_rate": 5.4138960049102685e-06, "loss": 0.4647, "step": 10816 }, { "epoch": 4.893463017416874, "grad_norm": 0.41244015097618103, "learning_rate": 5.413165765259953e-06, "loss": 0.4958, "step": 10817 }, { "epoch": 4.893915403754806, "grad_norm": 0.4927079379558563, "learning_rate": 5.412435516736115e-06, "loss": 0.6049, "step": 10818 }, { "epoch": 4.894367790092739, "grad_norm": 0.4579727351665497, "learning_rate": 5.411705259354438e-06, "loss": 0.5226, "step": 10819 }, { "epoch": 4.894820176430672, "grad_norm": 0.428519070148468, "learning_rate": 5.4109749931306055e-06, "loss": 0.4465, "step": 10820 }, { "epoch": 4.8952725627686045, "grad_norm": 0.4848102927207947, "learning_rate": 5.410244718080302e-06, "loss": 0.5262, "step": 10821 }, { "epoch": 4.895724949106537, "grad_norm": 0.4202897548675537, "learning_rate": 5.409514434219212e-06, "loss": 0.4271, "step": 10822 }, { "epoch": 4.896177335444469, "grad_norm": 0.4920751452445984, "learning_rate": 5.4087841415630175e-06, "loss": 0.5274, "step": 10823 }, { "epoch": 4.896629721782402, "grad_norm": 0.5120900273323059, "learning_rate": 5.408053840127403e-06, "loss": 0.4785, "step": 10824 }, { "epoch": 4.897082108120335, "grad_norm": 0.46851930022239685, "learning_rate": 5.407323529928056e-06, "loss": 0.5585, "step": 10825 }, { "epoch": 4.8975344944582675, "grad_norm": 0.47497329115867615, "learning_rate": 5.406593210980658e-06, "loss": 0.4898, "step": 10826 }, { "epoch": 4.8979868807962, "grad_norm": 0.4848705530166626, "learning_rate": 5.405862883300898e-06, "loss": 0.5159, "step": 10827 }, { "epoch": 4.898439267134132, "grad_norm": 0.5704187750816345, "learning_rate": 5.405132546904457e-06, "loss": 0.5635, "step": 10828 }, { "epoch": 4.898891653472065, "grad_norm": 0.5056565999984741, "learning_rate": 5.404402201807022e-06, "loss": 0.4938, "step": 10829 }, { "epoch": 4.899344039809998, "grad_norm": 0.48169004917144775, "learning_rate": 5.403671848024279e-06, "loss": 0.4808, "step": 10830 }, { "epoch": 4.8997964261479305, "grad_norm": 0.49501198530197144, "learning_rate": 5.402941485571913e-06, "loss": 0.4589, "step": 10831 }, { "epoch": 4.900248812485863, "grad_norm": 0.5590320825576782, "learning_rate": 5.402211114465611e-06, "loss": 0.5128, "step": 10832 }, { "epoch": 4.900701198823795, "grad_norm": 0.5160160064697266, "learning_rate": 5.401480734721059e-06, "loss": 0.5224, "step": 10833 }, { "epoch": 4.901153585161728, "grad_norm": 0.45225951075553894, "learning_rate": 5.400750346353942e-06, "loss": 0.4104, "step": 10834 }, { "epoch": 4.901605971499661, "grad_norm": 0.5540781617164612, "learning_rate": 5.400019949379948e-06, "loss": 0.5346, "step": 10835 }, { "epoch": 4.902058357837594, "grad_norm": 0.6130911111831665, "learning_rate": 5.399289543814762e-06, "loss": 0.5828, "step": 10836 }, { "epoch": 4.902510744175526, "grad_norm": 0.575437068939209, "learning_rate": 5.398559129674071e-06, "loss": 0.4638, "step": 10837 }, { "epoch": 4.9029631305134584, "grad_norm": 0.6476883888244629, "learning_rate": 5.3978287069735635e-06, "loss": 0.5499, "step": 10838 }, { "epoch": 4.903415516851391, "grad_norm": 0.6263096332550049, "learning_rate": 5.397098275728927e-06, "loss": 0.4526, "step": 10839 }, { "epoch": 4.903867903189323, "grad_norm": 0.5645332932472229, "learning_rate": 5.396367835955847e-06, "loss": 0.4151, "step": 10840 }, { "epoch": 4.904320289527257, "grad_norm": 0.5569474101066589, "learning_rate": 5.395637387670012e-06, "loss": 0.4738, "step": 10841 }, { "epoch": 4.904772675865189, "grad_norm": 0.7037100791931152, "learning_rate": 5.39490693088711e-06, "loss": 0.4977, "step": 10842 }, { "epoch": 4.9052250622031215, "grad_norm": 0.16857962310314178, "learning_rate": 5.394176465622828e-06, "loss": 1.2187, "step": 10843 }, { "epoch": 4.905677448541054, "grad_norm": 0.3268122673034668, "learning_rate": 5.393445991892856e-06, "loss": 0.854, "step": 10844 }, { "epoch": 4.906129834878986, "grad_norm": 0.3005451560020447, "learning_rate": 5.392715509712881e-06, "loss": 0.5702, "step": 10845 }, { "epoch": 4.906582221216919, "grad_norm": 0.38587677478790283, "learning_rate": 5.3919850190985914e-06, "loss": 0.6381, "step": 10846 }, { "epoch": 4.907034607554852, "grad_norm": 0.4002133905887604, "learning_rate": 5.391254520065676e-06, "loss": 0.7235, "step": 10847 }, { "epoch": 4.9074869938927845, "grad_norm": 0.34511715173721313, "learning_rate": 5.390524012629824e-06, "loss": 0.5236, "step": 10848 }, { "epoch": 4.907939380230717, "grad_norm": 0.3468313217163086, "learning_rate": 5.389793496806724e-06, "loss": 0.5096, "step": 10849 }, { "epoch": 4.908391766568649, "grad_norm": 0.37203285098075867, "learning_rate": 5.389062972612066e-06, "loss": 0.5835, "step": 10850 }, { "epoch": 4.908844152906582, "grad_norm": 0.4311887323856354, "learning_rate": 5.388332440061539e-06, "loss": 0.6994, "step": 10851 }, { "epoch": 4.909296539244515, "grad_norm": 0.41790953278541565, "learning_rate": 5.387601899170831e-06, "loss": 0.5024, "step": 10852 }, { "epoch": 4.909748925582448, "grad_norm": 0.4225893020629883, "learning_rate": 5.386871349955635e-06, "loss": 0.6248, "step": 10853 }, { "epoch": 4.91020131192038, "grad_norm": 0.40217363834381104, "learning_rate": 5.3861407924316375e-06, "loss": 0.7275, "step": 10854 }, { "epoch": 4.910653698258312, "grad_norm": 0.37839025259017944, "learning_rate": 5.3854102266145315e-06, "loss": 0.4506, "step": 10855 }, { "epoch": 4.911106084596245, "grad_norm": 0.42242562770843506, "learning_rate": 5.384679652520006e-06, "loss": 0.5793, "step": 10856 }, { "epoch": 4.911558470934178, "grad_norm": 0.4596608579158783, "learning_rate": 5.383949070163751e-06, "loss": 0.6641, "step": 10857 }, { "epoch": 4.912010857272111, "grad_norm": 0.38736066222190857, "learning_rate": 5.3832184795614585e-06, "loss": 0.4419, "step": 10858 }, { "epoch": 4.912463243610043, "grad_norm": 0.4156990051269531, "learning_rate": 5.3824878807288174e-06, "loss": 0.536, "step": 10859 }, { "epoch": 4.9129156299479755, "grad_norm": 0.4800722301006317, "learning_rate": 5.38175727368152e-06, "loss": 0.6461, "step": 10860 }, { "epoch": 4.913368016285908, "grad_norm": 0.40084752440452576, "learning_rate": 5.381026658435258e-06, "loss": 0.4614, "step": 10861 }, { "epoch": 4.913820402623841, "grad_norm": 0.496852308511734, "learning_rate": 5.380296035005721e-06, "loss": 0.6138, "step": 10862 }, { "epoch": 4.914272788961774, "grad_norm": 0.45779576897621155, "learning_rate": 5.379565403408602e-06, "loss": 0.5122, "step": 10863 }, { "epoch": 4.914725175299706, "grad_norm": 0.4302186071872711, "learning_rate": 5.378834763659593e-06, "loss": 0.6003, "step": 10864 }, { "epoch": 4.9151775616376385, "grad_norm": 0.46880480647087097, "learning_rate": 5.378104115774384e-06, "loss": 0.5599, "step": 10865 }, { "epoch": 4.915629947975571, "grad_norm": 0.4151909351348877, "learning_rate": 5.377373459768669e-06, "loss": 0.4774, "step": 10866 }, { "epoch": 4.916082334313503, "grad_norm": 0.4569517970085144, "learning_rate": 5.37664279565814e-06, "loss": 0.569, "step": 10867 }, { "epoch": 4.916534720651437, "grad_norm": 0.4233965575695038, "learning_rate": 5.375912123458488e-06, "loss": 0.4723, "step": 10868 }, { "epoch": 4.916987106989369, "grad_norm": 0.4430537819862366, "learning_rate": 5.3751814431854065e-06, "loss": 0.5929, "step": 10869 }, { "epoch": 4.917439493327302, "grad_norm": 0.4368269443511963, "learning_rate": 5.374450754854589e-06, "loss": 0.4742, "step": 10870 }, { "epoch": 4.917891879665234, "grad_norm": 0.4923884868621826, "learning_rate": 5.373720058481726e-06, "loss": 0.5106, "step": 10871 }, { "epoch": 4.918344266003166, "grad_norm": 0.45423394441604614, "learning_rate": 5.372989354082513e-06, "loss": 0.4792, "step": 10872 }, { "epoch": 4.918796652341099, "grad_norm": 0.4989909529685974, "learning_rate": 5.372258641672642e-06, "loss": 0.6639, "step": 10873 }, { "epoch": 4.919249038679032, "grad_norm": 0.44260475039482117, "learning_rate": 5.371527921267807e-06, "loss": 0.5024, "step": 10874 }, { "epoch": 4.919701425016965, "grad_norm": 0.4409477114677429, "learning_rate": 5.370797192883702e-06, "loss": 0.4432, "step": 10875 }, { "epoch": 4.920153811354897, "grad_norm": 0.4798477590084076, "learning_rate": 5.37006645653602e-06, "loss": 0.5047, "step": 10876 }, { "epoch": 4.9206061976928295, "grad_norm": 0.5031596422195435, "learning_rate": 5.369335712240457e-06, "loss": 0.5057, "step": 10877 }, { "epoch": 4.921058584030762, "grad_norm": 0.5076724886894226, "learning_rate": 5.368604960012705e-06, "loss": 0.5269, "step": 10878 }, { "epoch": 4.921510970368695, "grad_norm": 0.5050603747367859, "learning_rate": 5.367874199868457e-06, "loss": 0.4998, "step": 10879 }, { "epoch": 4.921963356706628, "grad_norm": 0.519494891166687, "learning_rate": 5.367143431823409e-06, "loss": 0.6982, "step": 10880 }, { "epoch": 4.92241574304456, "grad_norm": 0.5515509843826294, "learning_rate": 5.366412655893256e-06, "loss": 0.5585, "step": 10881 }, { "epoch": 4.9228681293824925, "grad_norm": 0.5193607807159424, "learning_rate": 5.365681872093693e-06, "loss": 0.4782, "step": 10882 }, { "epoch": 4.923320515720425, "grad_norm": 0.5410118103027344, "learning_rate": 5.364951080440415e-06, "loss": 0.5648, "step": 10883 }, { "epoch": 4.923772902058358, "grad_norm": 0.5212827324867249, "learning_rate": 5.364220280949116e-06, "loss": 0.4508, "step": 10884 }, { "epoch": 4.924225288396291, "grad_norm": 0.5132308602333069, "learning_rate": 5.363489473635492e-06, "loss": 0.4362, "step": 10885 }, { "epoch": 4.924677674734223, "grad_norm": 0.5185028910636902, "learning_rate": 5.36275865851524e-06, "loss": 0.4471, "step": 10886 }, { "epoch": 4.925130061072156, "grad_norm": 0.530794620513916, "learning_rate": 5.362027835604055e-06, "loss": 0.4749, "step": 10887 }, { "epoch": 4.925582447410088, "grad_norm": 0.6666979789733887, "learning_rate": 5.361297004917629e-06, "loss": 0.5973, "step": 10888 }, { "epoch": 4.92603483374802, "grad_norm": 0.6161625385284424, "learning_rate": 5.360566166471662e-06, "loss": 0.5589, "step": 10889 }, { "epoch": 4.926487220085954, "grad_norm": 0.6785427331924438, "learning_rate": 5.3598353202818495e-06, "loss": 0.5542, "step": 10890 }, { "epoch": 4.926939606423886, "grad_norm": 0.5838181972503662, "learning_rate": 5.359104466363887e-06, "loss": 0.4479, "step": 10891 }, { "epoch": 4.927391992761819, "grad_norm": 0.7195177674293518, "learning_rate": 5.358373604733472e-06, "loss": 0.5209, "step": 10892 }, { "epoch": 4.927844379099751, "grad_norm": 0.18576164543628693, "learning_rate": 5.3576427354062995e-06, "loss": 1.0642, "step": 10893 }, { "epoch": 4.9282967654376835, "grad_norm": 0.26736581325531006, "learning_rate": 5.356911858398068e-06, "loss": 0.7628, "step": 10894 }, { "epoch": 4.928749151775616, "grad_norm": 0.28973355889320374, "learning_rate": 5.356180973724474e-06, "loss": 0.7055, "step": 10895 }, { "epoch": 4.929201538113549, "grad_norm": 0.28625181317329407, "learning_rate": 5.355450081401214e-06, "loss": 0.5062, "step": 10896 }, { "epoch": 4.929653924451482, "grad_norm": 0.30687808990478516, "learning_rate": 5.354719181443987e-06, "loss": 0.5519, "step": 10897 }, { "epoch": 4.930106310789414, "grad_norm": 0.3088024854660034, "learning_rate": 5.3539882738684875e-06, "loss": 0.5738, "step": 10898 }, { "epoch": 4.9305586971273465, "grad_norm": 0.36716386675834656, "learning_rate": 5.353257358690418e-06, "loss": 0.5967, "step": 10899 }, { "epoch": 4.931011083465279, "grad_norm": 0.39508935809135437, "learning_rate": 5.35252643592547e-06, "loss": 0.6299, "step": 10900 }, { "epoch": 4.931463469803212, "grad_norm": 0.4008781313896179, "learning_rate": 5.351795505589346e-06, "loss": 0.7259, "step": 10901 }, { "epoch": 4.931915856141145, "grad_norm": 0.4078456461429596, "learning_rate": 5.351064567697743e-06, "loss": 0.5043, "step": 10902 }, { "epoch": 4.932368242479077, "grad_norm": 0.35466691851615906, "learning_rate": 5.3503336222663584e-06, "loss": 0.5836, "step": 10903 }, { "epoch": 4.93282062881701, "grad_norm": 0.3724474608898163, "learning_rate": 5.349602669310891e-06, "loss": 0.5586, "step": 10904 }, { "epoch": 4.933273015154942, "grad_norm": 0.375456303358078, "learning_rate": 5.34887170884704e-06, "loss": 0.4882, "step": 10905 }, { "epoch": 4.933725401492875, "grad_norm": 0.40098971128463745, "learning_rate": 5.348140740890504e-06, "loss": 0.5087, "step": 10906 }, { "epoch": 4.934177787830808, "grad_norm": 0.37627726793289185, "learning_rate": 5.347409765456982e-06, "loss": 0.4154, "step": 10907 }, { "epoch": 4.93463017416874, "grad_norm": 0.44332361221313477, "learning_rate": 5.3466787825621735e-06, "loss": 0.6786, "step": 10908 }, { "epoch": 4.935082560506673, "grad_norm": 0.39529114961624146, "learning_rate": 5.345947792221777e-06, "loss": 0.4926, "step": 10909 }, { "epoch": 4.935534946844605, "grad_norm": 0.39232584834098816, "learning_rate": 5.345216794451491e-06, "loss": 0.5023, "step": 10910 }, { "epoch": 4.935987333182538, "grad_norm": 0.3877122104167938, "learning_rate": 5.344485789267018e-06, "loss": 0.5302, "step": 10911 }, { "epoch": 4.936439719520471, "grad_norm": 0.3890119791030884, "learning_rate": 5.3437547766840535e-06, "loss": 0.5384, "step": 10912 }, { "epoch": 4.936892105858403, "grad_norm": 0.41657885909080505, "learning_rate": 5.343023756718301e-06, "loss": 0.4645, "step": 10913 }, { "epoch": 4.937344492196336, "grad_norm": 0.40144848823547363, "learning_rate": 5.342292729385459e-06, "loss": 0.4604, "step": 10914 }, { "epoch": 4.937796878534268, "grad_norm": 0.4456392526626587, "learning_rate": 5.341561694701228e-06, "loss": 0.5348, "step": 10915 }, { "epoch": 4.9382492648722005, "grad_norm": 0.45605775713920593, "learning_rate": 5.340830652681309e-06, "loss": 0.5571, "step": 10916 }, { "epoch": 4.938701651210134, "grad_norm": 0.41678544878959656, "learning_rate": 5.340099603341401e-06, "loss": 0.5056, "step": 10917 }, { "epoch": 4.939154037548066, "grad_norm": 0.43193989992141724, "learning_rate": 5.339368546697206e-06, "loss": 0.4863, "step": 10918 }, { "epoch": 4.939606423885999, "grad_norm": 0.48485279083251953, "learning_rate": 5.338637482764424e-06, "loss": 0.5111, "step": 10919 }, { "epoch": 4.940058810223931, "grad_norm": 0.41528868675231934, "learning_rate": 5.337906411558758e-06, "loss": 0.4675, "step": 10920 }, { "epoch": 4.9405111965618635, "grad_norm": 0.44142523407936096, "learning_rate": 5.337175333095907e-06, "loss": 0.5124, "step": 10921 }, { "epoch": 4.940963582899796, "grad_norm": 0.45268353819847107, "learning_rate": 5.336444247391573e-06, "loss": 0.5207, "step": 10922 }, { "epoch": 4.941415969237729, "grad_norm": 0.4753705561161041, "learning_rate": 5.335713154461455e-06, "loss": 0.5708, "step": 10923 }, { "epoch": 4.941868355575662, "grad_norm": 0.5122848153114319, "learning_rate": 5.3349820543212585e-06, "loss": 0.5106, "step": 10924 }, { "epoch": 4.942320741913594, "grad_norm": 0.49648621678352356, "learning_rate": 5.334250946986685e-06, "loss": 0.4425, "step": 10925 }, { "epoch": 4.942773128251527, "grad_norm": 0.48175227642059326, "learning_rate": 5.333519832473433e-06, "loss": 0.5409, "step": 10926 }, { "epoch": 4.943225514589459, "grad_norm": 0.5322744846343994, "learning_rate": 5.332788710797208e-06, "loss": 0.5675, "step": 10927 }, { "epoch": 4.943677900927392, "grad_norm": 0.49587661027908325, "learning_rate": 5.33205758197371e-06, "loss": 0.5718, "step": 10928 }, { "epoch": 4.944130287265325, "grad_norm": 0.5018343925476074, "learning_rate": 5.331326446018642e-06, "loss": 0.5158, "step": 10929 }, { "epoch": 4.944582673603257, "grad_norm": 0.5316690802574158, "learning_rate": 5.330595302947707e-06, "loss": 0.4709, "step": 10930 }, { "epoch": 4.94503505994119, "grad_norm": 0.47567248344421387, "learning_rate": 5.329864152776609e-06, "loss": 0.4672, "step": 10931 }, { "epoch": 4.945487446279122, "grad_norm": 0.5380827188491821, "learning_rate": 5.3291329955210505e-06, "loss": 0.4989, "step": 10932 }, { "epoch": 4.945939832617055, "grad_norm": 0.5225793123245239, "learning_rate": 5.328401831196731e-06, "loss": 0.4843, "step": 10933 }, { "epoch": 4.946392218954988, "grad_norm": 0.4965612292289734, "learning_rate": 5.327670659819356e-06, "loss": 0.4247, "step": 10934 }, { "epoch": 4.94684460529292, "grad_norm": 0.501266598701477, "learning_rate": 5.3269394814046295e-06, "loss": 0.4948, "step": 10935 }, { "epoch": 4.947296991630853, "grad_norm": 0.5725453495979309, "learning_rate": 5.326208295968254e-06, "loss": 0.5289, "step": 10936 }, { "epoch": 4.947749377968785, "grad_norm": 0.5587712526321411, "learning_rate": 5.3254771035259335e-06, "loss": 0.4341, "step": 10937 }, { "epoch": 4.948201764306718, "grad_norm": 0.5219336748123169, "learning_rate": 5.324745904093372e-06, "loss": 0.4332, "step": 10938 }, { "epoch": 4.948654150644651, "grad_norm": 0.659622073173523, "learning_rate": 5.3240146976862725e-06, "loss": 0.5399, "step": 10939 }, { "epoch": 4.949106536982583, "grad_norm": 0.6046470403671265, "learning_rate": 5.3232834843203395e-06, "loss": 0.5421, "step": 10940 }, { "epoch": 4.949558923320516, "grad_norm": 0.65834641456604, "learning_rate": 5.322552264011278e-06, "loss": 0.4978, "step": 10941 }, { "epoch": 4.950011309658448, "grad_norm": 0.7453722357749939, "learning_rate": 5.321821036774791e-06, "loss": 0.6276, "step": 10942 }, { "epoch": 4.950463695996381, "grad_norm": 0.1623193621635437, "learning_rate": 5.321089802626585e-06, "loss": 1.1852, "step": 10943 }, { "epoch": 4.950916082334313, "grad_norm": 0.265330970287323, "learning_rate": 5.320358561582363e-06, "loss": 1.2117, "step": 10944 }, { "epoch": 4.951368468672246, "grad_norm": 0.2996785342693329, "learning_rate": 5.319627313657829e-06, "loss": 0.5952, "step": 10945 }, { "epoch": 4.951820855010179, "grad_norm": 0.3634742200374603, "learning_rate": 5.318896058868689e-06, "loss": 0.6445, "step": 10946 }, { "epoch": 4.952273241348111, "grad_norm": 0.34998804330825806, "learning_rate": 5.318164797230648e-06, "loss": 0.4858, "step": 10947 }, { "epoch": 4.952725627686044, "grad_norm": 0.39114439487457275, "learning_rate": 5.317433528759412e-06, "loss": 0.6598, "step": 10948 }, { "epoch": 4.953178014023976, "grad_norm": 0.3922719657421112, "learning_rate": 5.316702253470686e-06, "loss": 0.5878, "step": 10949 }, { "epoch": 4.953630400361909, "grad_norm": 0.3973436951637268, "learning_rate": 5.315970971380174e-06, "loss": 0.5697, "step": 10950 }, { "epoch": 4.954082786699842, "grad_norm": 0.3923318684101105, "learning_rate": 5.315239682503585e-06, "loss": 0.5678, "step": 10951 }, { "epoch": 4.954535173037774, "grad_norm": 0.3619314432144165, "learning_rate": 5.314508386856621e-06, "loss": 0.4966, "step": 10952 }, { "epoch": 4.954987559375707, "grad_norm": 0.4243536591529846, "learning_rate": 5.313777084454991e-06, "loss": 0.6428, "step": 10953 }, { "epoch": 4.955439945713639, "grad_norm": 0.3833235204219818, "learning_rate": 5.313045775314399e-06, "loss": 0.5661, "step": 10954 }, { "epoch": 4.955892332051572, "grad_norm": 0.40710052847862244, "learning_rate": 5.312314459450553e-06, "loss": 0.5682, "step": 10955 }, { "epoch": 4.956344718389505, "grad_norm": 0.36594754457473755, "learning_rate": 5.311583136879158e-06, "loss": 0.5148, "step": 10956 }, { "epoch": 4.956797104727437, "grad_norm": 0.4318734407424927, "learning_rate": 5.310851807615921e-06, "loss": 0.5461, "step": 10957 }, { "epoch": 4.95724949106537, "grad_norm": 0.44563883543014526, "learning_rate": 5.3101204716765485e-06, "loss": 0.6329, "step": 10958 }, { "epoch": 4.957701877403302, "grad_norm": 0.42472043633461, "learning_rate": 5.309389129076747e-06, "loss": 0.4449, "step": 10959 }, { "epoch": 4.9581542637412355, "grad_norm": 0.46592381596565247, "learning_rate": 5.308657779832225e-06, "loss": 0.4789, "step": 10960 }, { "epoch": 4.958606650079168, "grad_norm": 0.5094093084335327, "learning_rate": 5.307926423958689e-06, "loss": 0.6886, "step": 10961 }, { "epoch": 4.9590590364171, "grad_norm": 0.43462127447128296, "learning_rate": 5.307195061471845e-06, "loss": 0.5293, "step": 10962 }, { "epoch": 4.959511422755033, "grad_norm": 0.4563942551612854, "learning_rate": 5.3064636923874015e-06, "loss": 0.5143, "step": 10963 }, { "epoch": 4.959963809092965, "grad_norm": 0.44206345081329346, "learning_rate": 5.305732316721066e-06, "loss": 0.6099, "step": 10964 }, { "epoch": 4.960416195430898, "grad_norm": 0.4892324209213257, "learning_rate": 5.305000934488547e-06, "loss": 0.6111, "step": 10965 }, { "epoch": 4.960868581768831, "grad_norm": 0.4590640962123871, "learning_rate": 5.3042695457055515e-06, "loss": 0.6359, "step": 10966 }, { "epoch": 4.961320968106763, "grad_norm": 0.469776451587677, "learning_rate": 5.303538150387787e-06, "loss": 0.586, "step": 10967 }, { "epoch": 4.961773354444696, "grad_norm": 0.4439322054386139, "learning_rate": 5.302806748550962e-06, "loss": 0.5352, "step": 10968 }, { "epoch": 4.962225740782628, "grad_norm": 0.4546920955181122, "learning_rate": 5.302075340210784e-06, "loss": 0.5296, "step": 10969 }, { "epoch": 4.962678127120561, "grad_norm": 0.4396827518939972, "learning_rate": 5.301343925382963e-06, "loss": 0.5185, "step": 10970 }, { "epoch": 4.963130513458493, "grad_norm": 0.4623573422431946, "learning_rate": 5.300612504083204e-06, "loss": 0.5288, "step": 10971 }, { "epoch": 4.963582899796426, "grad_norm": 0.5080932378768921, "learning_rate": 5.299881076327221e-06, "loss": 0.6243, "step": 10972 }, { "epoch": 4.964035286134359, "grad_norm": 0.48444125056266785, "learning_rate": 5.29914964213072e-06, "loss": 0.5606, "step": 10973 }, { "epoch": 4.964487672472291, "grad_norm": 0.48585188388824463, "learning_rate": 5.29841820150941e-06, "loss": 0.4746, "step": 10974 }, { "epoch": 4.964940058810224, "grad_norm": 0.4576486051082611, "learning_rate": 5.297686754478999e-06, "loss": 0.4782, "step": 10975 }, { "epoch": 4.965392445148156, "grad_norm": 0.47596848011016846, "learning_rate": 5.2969553010551976e-06, "loss": 0.5489, "step": 10976 }, { "epoch": 4.9658448314860895, "grad_norm": 0.4662688374519348, "learning_rate": 5.296223841253716e-06, "loss": 0.4818, "step": 10977 }, { "epoch": 4.966297217824022, "grad_norm": 0.521182656288147, "learning_rate": 5.2954923750902636e-06, "loss": 0.4961, "step": 10978 }, { "epoch": 4.966749604161954, "grad_norm": 0.5395450592041016, "learning_rate": 5.294760902580547e-06, "loss": 0.5762, "step": 10979 }, { "epoch": 4.967201990499887, "grad_norm": 0.48154231905937195, "learning_rate": 5.294029423740279e-06, "loss": 0.4894, "step": 10980 }, { "epoch": 4.967654376837819, "grad_norm": 0.5309399366378784, "learning_rate": 5.293297938585168e-06, "loss": 0.5687, "step": 10981 }, { "epoch": 4.9681067631757525, "grad_norm": 0.5354779958724976, "learning_rate": 5.292566447130925e-06, "loss": 0.5123, "step": 10982 }, { "epoch": 4.968559149513685, "grad_norm": 0.55443274974823, "learning_rate": 5.291834949393258e-06, "loss": 0.6033, "step": 10983 }, { "epoch": 4.969011535851617, "grad_norm": 0.5400132536888123, "learning_rate": 5.2911034453878815e-06, "loss": 0.5, "step": 10984 }, { "epoch": 4.96946392218955, "grad_norm": 0.5953063368797302, "learning_rate": 5.290371935130503e-06, "loss": 0.5951, "step": 10985 }, { "epoch": 4.969916308527482, "grad_norm": 0.5303640961647034, "learning_rate": 5.289640418636832e-06, "loss": 0.4768, "step": 10986 }, { "epoch": 4.9703686948654155, "grad_norm": 0.5284261107444763, "learning_rate": 5.288908895922582e-06, "loss": 0.431, "step": 10987 }, { "epoch": 4.970821081203348, "grad_norm": 0.5899472832679749, "learning_rate": 5.288177367003462e-06, "loss": 0.4744, "step": 10988 }, { "epoch": 4.97127346754128, "grad_norm": 0.6040835976600647, "learning_rate": 5.287445831895186e-06, "loss": 0.5008, "step": 10989 }, { "epoch": 4.971725853879213, "grad_norm": 0.628912627696991, "learning_rate": 5.286714290613461e-06, "loss": 0.461, "step": 10990 }, { "epoch": 4.972178240217145, "grad_norm": 0.6741276979446411, "learning_rate": 5.285982743173999e-06, "loss": 0.5534, "step": 10991 }, { "epoch": 4.972630626555078, "grad_norm": 0.6722972989082336, "learning_rate": 5.285251189592513e-06, "loss": 0.4637, "step": 10992 }, { "epoch": 4.97308301289301, "grad_norm": 0.19333693385124207, "learning_rate": 5.284519629884713e-06, "loss": 1.2905, "step": 10993 }, { "epoch": 4.9735353992309435, "grad_norm": 0.2761077582836151, "learning_rate": 5.283788064066313e-06, "loss": 0.6907, "step": 10994 }, { "epoch": 4.973987785568876, "grad_norm": 0.3163740038871765, "learning_rate": 5.283056492153023e-06, "loss": 0.6825, "step": 10995 }, { "epoch": 4.974440171906808, "grad_norm": 0.28540629148483276, "learning_rate": 5.282324914160554e-06, "loss": 0.4807, "step": 10996 }, { "epoch": 4.974892558244741, "grad_norm": 0.33492645621299744, "learning_rate": 5.281593330104619e-06, "loss": 0.5658, "step": 10997 }, { "epoch": 4.975344944582673, "grad_norm": 0.3380201756954193, "learning_rate": 5.280861740000932e-06, "loss": 0.5289, "step": 10998 }, { "epoch": 4.9757973309206065, "grad_norm": 0.3737388551235199, "learning_rate": 5.280130143865203e-06, "loss": 0.643, "step": 10999 }, { "epoch": 4.976249717258539, "grad_norm": 0.3723691701889038, "learning_rate": 5.279398541713146e-06, "loss": 0.5156, "step": 11000 }, { "epoch": 4.976249717258539, "eval_loss": 0.5891911387443542, "eval_runtime": 25.7766, "eval_samples_per_second": 28.863, "eval_steps_per_second": 7.216, "step": 11000 }, { "epoch": 4.976702103596471, "grad_norm": 0.3713177442550659, "learning_rate": 5.2786669335604726e-06, "loss": 0.5599, "step": 11001 }, { "epoch": 4.977154489934404, "grad_norm": 0.3799741864204407, "learning_rate": 5.277935319422893e-06, "loss": 0.5058, "step": 11002 }, { "epoch": 4.977606876272336, "grad_norm": 0.40765079855918884, "learning_rate": 5.277203699316125e-06, "loss": 0.5615, "step": 11003 }, { "epoch": 4.9780592626102695, "grad_norm": 0.40144816040992737, "learning_rate": 5.276472073255877e-06, "loss": 0.6396, "step": 11004 }, { "epoch": 4.978511648948202, "grad_norm": 0.4206066429615021, "learning_rate": 5.275740441257866e-06, "loss": 0.5661, "step": 11005 }, { "epoch": 4.978964035286134, "grad_norm": 0.38980063796043396, "learning_rate": 5.275008803337802e-06, "loss": 0.5398, "step": 11006 }, { "epoch": 4.979416421624067, "grad_norm": 0.44628041982650757, "learning_rate": 5.2742771595114e-06, "loss": 0.6703, "step": 11007 }, { "epoch": 4.979868807961999, "grad_norm": 0.3464527726173401, "learning_rate": 5.273545509794373e-06, "loss": 0.4118, "step": 11008 }, { "epoch": 4.980321194299933, "grad_norm": 0.4139171540737152, "learning_rate": 5.272813854202434e-06, "loss": 0.6411, "step": 11009 }, { "epoch": 4.980773580637865, "grad_norm": 0.4918636381626129, "learning_rate": 5.272082192751299e-06, "loss": 0.6911, "step": 11010 }, { "epoch": 4.9812259669757974, "grad_norm": 0.46424809098243713, "learning_rate": 5.271350525456679e-06, "loss": 0.5175, "step": 11011 }, { "epoch": 4.98167835331373, "grad_norm": 0.43220311403274536, "learning_rate": 5.27061885233429e-06, "loss": 0.5931, "step": 11012 }, { "epoch": 4.982130739651662, "grad_norm": 0.420343816280365, "learning_rate": 5.2698871733998445e-06, "loss": 0.4755, "step": 11013 }, { "epoch": 4.982583125989595, "grad_norm": 0.5096726417541504, "learning_rate": 5.2691554886690575e-06, "loss": 0.6025, "step": 11014 }, { "epoch": 4.983035512327528, "grad_norm": 0.44434741139411926, "learning_rate": 5.268423798157643e-06, "loss": 0.5888, "step": 11015 }, { "epoch": 4.9834878986654605, "grad_norm": 0.40489280223846436, "learning_rate": 5.267692101881315e-06, "loss": 0.4842, "step": 11016 }, { "epoch": 4.983940285003393, "grad_norm": 0.4801941215991974, "learning_rate": 5.26696039985579e-06, "loss": 0.547, "step": 11017 }, { "epoch": 4.984392671341325, "grad_norm": 0.45092159509658813, "learning_rate": 5.266228692096781e-06, "loss": 0.5459, "step": 11018 }, { "epoch": 4.984845057679258, "grad_norm": 0.4236691892147064, "learning_rate": 5.265496978620002e-06, "loss": 0.555, "step": 11019 }, { "epoch": 4.98529744401719, "grad_norm": 0.4935833215713501, "learning_rate": 5.2647652594411714e-06, "loss": 0.6076, "step": 11020 }, { "epoch": 4.9857498303551235, "grad_norm": 0.4697783589363098, "learning_rate": 5.264033534576001e-06, "loss": 0.5426, "step": 11021 }, { "epoch": 4.986202216693056, "grad_norm": 0.4867292046546936, "learning_rate": 5.263301804040207e-06, "loss": 0.5073, "step": 11022 }, { "epoch": 4.986654603030988, "grad_norm": 0.48339298367500305, "learning_rate": 5.2625700678495065e-06, "loss": 0.4904, "step": 11023 }, { "epoch": 4.987106989368921, "grad_norm": 0.48322591185569763, "learning_rate": 5.26183832601961e-06, "loss": 0.5747, "step": 11024 }, { "epoch": 4.987559375706853, "grad_norm": 0.48684147000312805, "learning_rate": 5.261106578566238e-06, "loss": 0.5146, "step": 11025 }, { "epoch": 4.988011762044787, "grad_norm": 0.468508243560791, "learning_rate": 5.2603748255051045e-06, "loss": 0.5077, "step": 11026 }, { "epoch": 4.988464148382719, "grad_norm": 0.5784341096878052, "learning_rate": 5.2596430668519235e-06, "loss": 0.6255, "step": 11027 }, { "epoch": 4.988916534720651, "grad_norm": 0.5198858976364136, "learning_rate": 5.2589113026224145e-06, "loss": 0.5794, "step": 11028 }, { "epoch": 4.989368921058584, "grad_norm": 0.4815399944782257, "learning_rate": 5.258179532832292e-06, "loss": 0.4871, "step": 11029 }, { "epoch": 4.989821307396516, "grad_norm": 0.47918596863746643, "learning_rate": 5.2574477574972695e-06, "loss": 0.4543, "step": 11030 }, { "epoch": 4.99027369373445, "grad_norm": 0.552367627620697, "learning_rate": 5.256715976633067e-06, "loss": 0.5071, "step": 11031 }, { "epoch": 4.990726080072382, "grad_norm": 0.5615236759185791, "learning_rate": 5.255984190255399e-06, "loss": 0.5956, "step": 11032 }, { "epoch": 4.9911784664103145, "grad_norm": 0.5150805115699768, "learning_rate": 5.255252398379983e-06, "loss": 0.4961, "step": 11033 }, { "epoch": 4.991630852748247, "grad_norm": 0.5669829249382019, "learning_rate": 5.254520601022536e-06, "loss": 0.5203, "step": 11034 }, { "epoch": 4.992083239086179, "grad_norm": 0.5103144645690918, "learning_rate": 5.253788798198772e-06, "loss": 0.5111, "step": 11035 }, { "epoch": 4.992535625424113, "grad_norm": 0.4886048436164856, "learning_rate": 5.25305698992441e-06, "loss": 0.42, "step": 11036 }, { "epoch": 4.992988011762045, "grad_norm": 0.5348808169364929, "learning_rate": 5.252325176215167e-06, "loss": 0.433, "step": 11037 }, { "epoch": 4.9934403980999775, "grad_norm": 0.5999260544776917, "learning_rate": 5.251593357086759e-06, "loss": 0.4838, "step": 11038 }, { "epoch": 4.99389278443791, "grad_norm": 0.6115787625312805, "learning_rate": 5.2508615325549055e-06, "loss": 0.4715, "step": 11039 }, { "epoch": 4.994345170775842, "grad_norm": 0.6215089559555054, "learning_rate": 5.250129702635322e-06, "loss": 0.4926, "step": 11040 }, { "epoch": 4.994797557113775, "grad_norm": 0.6302504539489746, "learning_rate": 5.249397867343725e-06, "loss": 0.4827, "step": 11041 }, { "epoch": 4.995249943451708, "grad_norm": 0.7418894171714783, "learning_rate": 5.248666026695835e-06, "loss": 0.516, "step": 11042 }, { "epoch": 4.995702329789641, "grad_norm": 0.34970706701278687, "learning_rate": 5.247934180707366e-06, "loss": 0.7173, "step": 11043 }, { "epoch": 4.996154716127573, "grad_norm": 0.3531058430671692, "learning_rate": 5.247202329394039e-06, "loss": 0.4895, "step": 11044 }, { "epoch": 4.996607102465505, "grad_norm": 0.3872791528701782, "learning_rate": 5.246470472771571e-06, "loss": 0.5514, "step": 11045 }, { "epoch": 4.997059488803438, "grad_norm": 0.5171129703521729, "learning_rate": 5.245738610855678e-06, "loss": 0.6497, "step": 11046 }, { "epoch": 4.99751187514137, "grad_norm": 0.48055198788642883, "learning_rate": 5.245006743662082e-06, "loss": 0.6432, "step": 11047 }, { "epoch": 4.997964261479304, "grad_norm": 0.474361389875412, "learning_rate": 5.244274871206497e-06, "loss": 0.5258, "step": 11048 }, { "epoch": 4.998416647817236, "grad_norm": 0.5070958733558655, "learning_rate": 5.243542993504645e-06, "loss": 0.5609, "step": 11049 }, { "epoch": 4.9988690341551685, "grad_norm": 0.5364562273025513, "learning_rate": 5.242811110572243e-06, "loss": 0.5537, "step": 11050 }, { "epoch": 4.999321420493101, "grad_norm": 0.5682131052017212, "learning_rate": 5.242079222425007e-06, "loss": 0.4488, "step": 11051 }, { "epoch": 4.999773806831033, "grad_norm": 0.6325240135192871, "learning_rate": 5.24134732907866e-06, "loss": 0.5729, "step": 11052 }, { "epoch": 5.000226193168967, "grad_norm": 2.1420929431915283, "learning_rate": 5.2406154305489186e-06, "loss": 1.1117, "step": 11053 }, { "epoch": 5.000678579506899, "grad_norm": 0.22338718175888062, "learning_rate": 5.239883526851502e-06, "loss": 1.3402, "step": 11054 }, { "epoch": 5.0011309658448315, "grad_norm": 0.2530868649482727, "learning_rate": 5.23915161800213e-06, "loss": 0.6396, "step": 11055 }, { "epoch": 5.001583352182764, "grad_norm": 0.3327220380306244, "learning_rate": 5.2384197040165205e-06, "loss": 0.5769, "step": 11056 }, { "epoch": 5.002035738520696, "grad_norm": 0.3431169390678406, "learning_rate": 5.237687784910395e-06, "loss": 0.6734, "step": 11057 }, { "epoch": 5.00248812485863, "grad_norm": 0.2948142886161804, "learning_rate": 5.236955860699468e-06, "loss": 0.4178, "step": 11058 }, { "epoch": 5.002940511196562, "grad_norm": 0.33396273851394653, "learning_rate": 5.236223931399465e-06, "loss": 0.4796, "step": 11059 }, { "epoch": 5.003392897534495, "grad_norm": 0.3646664321422577, "learning_rate": 5.235491997026099e-06, "loss": 0.5509, "step": 11060 }, { "epoch": 5.003845283872427, "grad_norm": 0.37587249279022217, "learning_rate": 5.234760057595096e-06, "loss": 0.5311, "step": 11061 }, { "epoch": 5.004297670210359, "grad_norm": 0.3792160451412201, "learning_rate": 5.2340281131221735e-06, "loss": 0.4932, "step": 11062 }, { "epoch": 5.004750056548292, "grad_norm": 0.412463515996933, "learning_rate": 5.2332961636230504e-06, "loss": 0.5266, "step": 11063 }, { "epoch": 5.005202442886225, "grad_norm": 0.43022772669792175, "learning_rate": 5.232564209113446e-06, "loss": 0.5567, "step": 11064 }, { "epoch": 5.005654829224158, "grad_norm": 0.4043780565261841, "learning_rate": 5.2318322496090845e-06, "loss": 0.4998, "step": 11065 }, { "epoch": 5.00610721556209, "grad_norm": 0.46131637692451477, "learning_rate": 5.231100285125682e-06, "loss": 0.6203, "step": 11066 }, { "epoch": 5.0065596019000225, "grad_norm": 0.36665964126586914, "learning_rate": 5.23036831567896e-06, "loss": 0.4726, "step": 11067 }, { "epoch": 5.007011988237955, "grad_norm": 0.4426225423812866, "learning_rate": 5.229636341284641e-06, "loss": 0.5015, "step": 11068 }, { "epoch": 5.007464374575888, "grad_norm": 0.45153820514678955, "learning_rate": 5.2289043619584416e-06, "loss": 0.5621, "step": 11069 }, { "epoch": 5.007916760913821, "grad_norm": 0.40827375650405884, "learning_rate": 5.228172377716085e-06, "loss": 0.5136, "step": 11070 }, { "epoch": 5.008369147251753, "grad_norm": 0.44314801692962646, "learning_rate": 5.227440388573292e-06, "loss": 0.4938, "step": 11071 }, { "epoch": 5.0088215335896855, "grad_norm": 0.4724438190460205, "learning_rate": 5.226708394545783e-06, "loss": 0.5219, "step": 11072 }, { "epoch": 5.009273919927618, "grad_norm": 0.4564777612686157, "learning_rate": 5.225976395649278e-06, "loss": 0.504, "step": 11073 }, { "epoch": 5.00972630626555, "grad_norm": 0.4484506845474243, "learning_rate": 5.2252443918995e-06, "loss": 0.6022, "step": 11074 }, { "epoch": 5.010178692603484, "grad_norm": 0.39846864342689514, "learning_rate": 5.224512383312169e-06, "loss": 0.4672, "step": 11075 }, { "epoch": 5.010631078941416, "grad_norm": 0.4999361038208008, "learning_rate": 5.223780369903007e-06, "loss": 0.6244, "step": 11076 }, { "epoch": 5.011083465279349, "grad_norm": 0.47640058398246765, "learning_rate": 5.2230483516877335e-06, "loss": 0.5608, "step": 11077 }, { "epoch": 5.011535851617281, "grad_norm": 0.44651809334754944, "learning_rate": 5.222316328682073e-06, "loss": 0.4553, "step": 11078 }, { "epoch": 5.011988237955213, "grad_norm": 0.4212394058704376, "learning_rate": 5.2215843009017465e-06, "loss": 0.4247, "step": 11079 }, { "epoch": 5.012440624293147, "grad_norm": 0.4633857011795044, "learning_rate": 5.220852268362472e-06, "loss": 0.4348, "step": 11080 }, { "epoch": 5.012893010631079, "grad_norm": 0.48977211117744446, "learning_rate": 5.220120231079975e-06, "loss": 0.5453, "step": 11081 }, { "epoch": 5.013345396969012, "grad_norm": 0.4645065665245056, "learning_rate": 5.219388189069976e-06, "loss": 0.5029, "step": 11082 }, { "epoch": 5.013797783306944, "grad_norm": 0.4685192108154297, "learning_rate": 5.218656142348198e-06, "loss": 0.465, "step": 11083 }, { "epoch": 5.0142501696448765, "grad_norm": 0.538470983505249, "learning_rate": 5.217924090930362e-06, "loss": 0.4931, "step": 11084 }, { "epoch": 5.014702555982809, "grad_norm": 0.5017645359039307, "learning_rate": 5.21719203483219e-06, "loss": 0.5026, "step": 11085 }, { "epoch": 5.015154942320742, "grad_norm": 0.4500964283943176, "learning_rate": 5.2164599740694065e-06, "loss": 0.5, "step": 11086 }, { "epoch": 5.015607328658675, "grad_norm": 0.5193464159965515, "learning_rate": 5.215727908657732e-06, "loss": 0.4408, "step": 11087 }, { "epoch": 5.016059714996607, "grad_norm": 0.502839207649231, "learning_rate": 5.214995838612889e-06, "loss": 0.4995, "step": 11088 }, { "epoch": 5.0165121013345395, "grad_norm": 0.5409420728683472, "learning_rate": 5.2142637639506e-06, "loss": 0.5724, "step": 11089 }, { "epoch": 5.016964487672472, "grad_norm": 0.5517786145210266, "learning_rate": 5.213531684686588e-06, "loss": 0.547, "step": 11090 }, { "epoch": 5.017416874010405, "grad_norm": 0.490444153547287, "learning_rate": 5.212799600836578e-06, "loss": 0.4447, "step": 11091 }, { "epoch": 5.017869260348338, "grad_norm": 0.5303735136985779, "learning_rate": 5.212067512416289e-06, "loss": 0.5027, "step": 11092 }, { "epoch": 5.01832164668627, "grad_norm": 0.5072153806686401, "learning_rate": 5.211335419441446e-06, "loss": 0.4571, "step": 11093 }, { "epoch": 5.0187740330242026, "grad_norm": 0.48754119873046875, "learning_rate": 5.2106033219277715e-06, "loss": 0.4662, "step": 11094 }, { "epoch": 5.019226419362135, "grad_norm": 0.538137674331665, "learning_rate": 5.20987121989099e-06, "loss": 0.5224, "step": 11095 }, { "epoch": 5.019678805700067, "grad_norm": 0.5836107730865479, "learning_rate": 5.209139113346824e-06, "loss": 0.5274, "step": 11096 }, { "epoch": 5.020131192038001, "grad_norm": 0.597597599029541, "learning_rate": 5.2084070023109955e-06, "loss": 0.4698, "step": 11097 }, { "epoch": 5.020583578375933, "grad_norm": 0.49035578966140747, "learning_rate": 5.207674886799231e-06, "loss": 0.3947, "step": 11098 }, { "epoch": 5.021035964713866, "grad_norm": 0.6282534599304199, "learning_rate": 5.206942766827251e-06, "loss": 0.547, "step": 11099 }, { "epoch": 5.021488351051798, "grad_norm": 0.5365346074104309, "learning_rate": 5.206210642410782e-06, "loss": 0.4736, "step": 11100 }, { "epoch": 5.0219407373897305, "grad_norm": 0.5553884506225586, "learning_rate": 5.205478513565544e-06, "loss": 0.4135, "step": 11101 }, { "epoch": 5.022393123727664, "grad_norm": 0.6696778535842896, "learning_rate": 5.204746380307266e-06, "loss": 0.4846, "step": 11102 }, { "epoch": 5.022845510065596, "grad_norm": 0.5399975180625916, "learning_rate": 5.204014242651667e-06, "loss": 1.1306, "step": 11103 }, { "epoch": 5.023297896403529, "grad_norm": 0.2230461984872818, "learning_rate": 5.203282100614474e-06, "loss": 0.7444, "step": 11104 }, { "epoch": 5.023750282741461, "grad_norm": 0.28511911630630493, "learning_rate": 5.20254995421141e-06, "loss": 0.6251, "step": 11105 }, { "epoch": 5.0242026690793935, "grad_norm": 0.32260870933532715, "learning_rate": 5.2018178034582e-06, "loss": 0.5467, "step": 11106 }, { "epoch": 5.024655055417327, "grad_norm": 0.33098357915878296, "learning_rate": 5.201085648370567e-06, "loss": 0.5554, "step": 11107 }, { "epoch": 5.025107441755259, "grad_norm": 0.35620036721229553, "learning_rate": 5.200353488964237e-06, "loss": 0.6467, "step": 11108 }, { "epoch": 5.025559828093192, "grad_norm": 0.40850430727005005, "learning_rate": 5.1996213252549335e-06, "loss": 0.6452, "step": 11109 }, { "epoch": 5.026012214431124, "grad_norm": 0.37147122621536255, "learning_rate": 5.198889157258382e-06, "loss": 0.6344, "step": 11110 }, { "epoch": 5.0264646007690565, "grad_norm": 0.40168967843055725, "learning_rate": 5.1981569849903055e-06, "loss": 0.6583, "step": 11111 }, { "epoch": 5.026916987106989, "grad_norm": 0.33720704913139343, "learning_rate": 5.197424808466431e-06, "loss": 0.4726, "step": 11112 }, { "epoch": 5.027369373444922, "grad_norm": 0.3999301493167877, "learning_rate": 5.196692627702484e-06, "loss": 0.5123, "step": 11113 }, { "epoch": 5.027821759782855, "grad_norm": 0.3929619789123535, "learning_rate": 5.195960442714185e-06, "loss": 0.5124, "step": 11114 }, { "epoch": 5.028274146120787, "grad_norm": 0.4502856433391571, "learning_rate": 5.195228253517262e-06, "loss": 0.6735, "step": 11115 }, { "epoch": 5.02872653245872, "grad_norm": 0.4124628007411957, "learning_rate": 5.194496060127441e-06, "loss": 0.5162, "step": 11116 }, { "epoch": 5.029178918796652, "grad_norm": 0.4772239327430725, "learning_rate": 5.193763862560446e-06, "loss": 0.7008, "step": 11117 }, { "epoch": 5.029631305134585, "grad_norm": 0.46153637766838074, "learning_rate": 5.193031660832003e-06, "loss": 0.5724, "step": 11118 }, { "epoch": 5.030083691472518, "grad_norm": 0.3993755877017975, "learning_rate": 5.192299454957837e-06, "loss": 0.54, "step": 11119 }, { "epoch": 5.03053607781045, "grad_norm": 0.44974061846733093, "learning_rate": 5.1915672449536724e-06, "loss": 0.522, "step": 11120 }, { "epoch": 5.030988464148383, "grad_norm": 0.451530784368515, "learning_rate": 5.190835030835236e-06, "loss": 0.4853, "step": 11121 }, { "epoch": 5.031440850486315, "grad_norm": 0.39740777015686035, "learning_rate": 5.190102812618255e-06, "loss": 0.4443, "step": 11122 }, { "epoch": 5.0318932368242475, "grad_norm": 0.4309099018573761, "learning_rate": 5.189370590318452e-06, "loss": 0.4713, "step": 11123 }, { "epoch": 5.032345623162181, "grad_norm": 0.4968660771846771, "learning_rate": 5.188638363951556e-06, "loss": 0.5946, "step": 11124 }, { "epoch": 5.032798009500113, "grad_norm": 0.4568040668964386, "learning_rate": 5.187906133533291e-06, "loss": 0.4987, "step": 11125 }, { "epoch": 5.033250395838046, "grad_norm": 0.45496949553489685, "learning_rate": 5.187173899079384e-06, "loss": 0.7275, "step": 11126 }, { "epoch": 5.033702782175978, "grad_norm": 0.4535413980484009, "learning_rate": 5.18644166060556e-06, "loss": 0.5294, "step": 11127 }, { "epoch": 5.0341551685139105, "grad_norm": 0.4579276740550995, "learning_rate": 5.185709418127546e-06, "loss": 0.5167, "step": 11128 }, { "epoch": 5.034607554851844, "grad_norm": 0.48764869570732117, "learning_rate": 5.184977171661068e-06, "loss": 0.6056, "step": 11129 }, { "epoch": 5.035059941189776, "grad_norm": 0.4506661891937256, "learning_rate": 5.184244921221852e-06, "loss": 0.4421, "step": 11130 }, { "epoch": 5.035512327527709, "grad_norm": 0.5263172388076782, "learning_rate": 5.183512666825626e-06, "loss": 0.7122, "step": 11131 }, { "epoch": 5.035964713865641, "grad_norm": 0.44751793146133423, "learning_rate": 5.182780408488116e-06, "loss": 0.4836, "step": 11132 }, { "epoch": 5.036417100203574, "grad_norm": 0.5055407285690308, "learning_rate": 5.182048146225047e-06, "loss": 0.5152, "step": 11133 }, { "epoch": 5.036869486541506, "grad_norm": 0.4727499485015869, "learning_rate": 5.181315880052148e-06, "loss": 0.5444, "step": 11134 }, { "epoch": 5.037321872879439, "grad_norm": 0.5025814175605774, "learning_rate": 5.1805836099851435e-06, "loss": 0.5109, "step": 11135 }, { "epoch": 5.037774259217372, "grad_norm": 0.5246191024780273, "learning_rate": 5.179851336039764e-06, "loss": 0.5683, "step": 11136 }, { "epoch": 5.038226645555304, "grad_norm": 0.442655473947525, "learning_rate": 5.179119058231732e-06, "loss": 0.4774, "step": 11137 }, { "epoch": 5.038679031893237, "grad_norm": 0.5204102396965027, "learning_rate": 5.178386776576777e-06, "loss": 0.4874, "step": 11138 }, { "epoch": 5.039131418231169, "grad_norm": 0.5684295296669006, "learning_rate": 5.177654491090627e-06, "loss": 0.6151, "step": 11139 }, { "epoch": 5.039583804569102, "grad_norm": 0.4521027207374573, "learning_rate": 5.176922201789006e-06, "loss": 0.3984, "step": 11140 }, { "epoch": 5.040036190907035, "grad_norm": 0.5296181440353394, "learning_rate": 5.176189908687646e-06, "loss": 0.6172, "step": 11141 }, { "epoch": 5.040488577244967, "grad_norm": 0.5330727100372314, "learning_rate": 5.175457611802269e-06, "loss": 0.4972, "step": 11142 }, { "epoch": 5.0409409635829, "grad_norm": 0.4942012429237366, "learning_rate": 5.1747253111486075e-06, "loss": 0.4075, "step": 11143 }, { "epoch": 5.041393349920832, "grad_norm": 0.46329841017723083, "learning_rate": 5.1739930067423864e-06, "loss": 0.4457, "step": 11144 }, { "epoch": 5.041845736258765, "grad_norm": 0.5674512982368469, "learning_rate": 5.173260698599333e-06, "loss": 0.5242, "step": 11145 }, { "epoch": 5.042298122596698, "grad_norm": 0.618108868598938, "learning_rate": 5.172528386735176e-06, "loss": 0.5399, "step": 11146 }, { "epoch": 5.04275050893463, "grad_norm": 0.5502834320068359, "learning_rate": 5.171796071165643e-06, "loss": 0.5194, "step": 11147 }, { "epoch": 5.043202895272563, "grad_norm": 0.5361664891242981, "learning_rate": 5.171063751906462e-06, "loss": 0.467, "step": 11148 }, { "epoch": 5.043655281610495, "grad_norm": 0.5657768845558167, "learning_rate": 5.170331428973361e-06, "loss": 0.4976, "step": 11149 }, { "epoch": 5.044107667948428, "grad_norm": 0.6206517219543457, "learning_rate": 5.169599102382067e-06, "loss": 0.5124, "step": 11150 }, { "epoch": 5.044560054286361, "grad_norm": 0.5824640393257141, "learning_rate": 5.16886677214831e-06, "loss": 0.4937, "step": 11151 }, { "epoch": 5.045012440624293, "grad_norm": 0.6867227554321289, "learning_rate": 5.168134438287816e-06, "loss": 0.4508, "step": 11152 }, { "epoch": 5.045464826962226, "grad_norm": 0.6266620755195618, "learning_rate": 5.167402100816315e-06, "loss": 1.1879, "step": 11153 }, { "epoch": 5.045917213300158, "grad_norm": 0.24679219722747803, "learning_rate": 5.1666697597495345e-06, "loss": 0.4973, "step": 11154 }, { "epoch": 5.046369599638091, "grad_norm": 0.33673399686813354, "learning_rate": 5.165937415103202e-06, "loss": 0.6595, "step": 11155 }, { "epoch": 5.046821985976024, "grad_norm": 0.3549950420856476, "learning_rate": 5.1652050668930485e-06, "loss": 0.6649, "step": 11156 }, { "epoch": 5.047274372313956, "grad_norm": 0.36855581402778625, "learning_rate": 5.1644727151348016e-06, "loss": 0.6326, "step": 11157 }, { "epoch": 5.047726758651889, "grad_norm": 0.34228336811065674, "learning_rate": 5.16374035984419e-06, "loss": 0.5192, "step": 11158 }, { "epoch": 5.048179144989821, "grad_norm": 0.40931206941604614, "learning_rate": 5.163008001036941e-06, "loss": 0.6553, "step": 11159 }, { "epoch": 5.048631531327754, "grad_norm": 0.3837679922580719, "learning_rate": 5.162275638728784e-06, "loss": 0.6363, "step": 11160 }, { "epoch": 5.049083917665686, "grad_norm": 0.41350969672203064, "learning_rate": 5.161543272935449e-06, "loss": 0.6857, "step": 11161 }, { "epoch": 5.049536304003619, "grad_norm": 0.34125036001205444, "learning_rate": 5.160810903672664e-06, "loss": 0.4601, "step": 11162 }, { "epoch": 5.049988690341552, "grad_norm": 0.3888476490974426, "learning_rate": 5.160078530956158e-06, "loss": 0.4492, "step": 11163 }, { "epoch": 5.050441076679484, "grad_norm": 0.40076369047164917, "learning_rate": 5.15934615480166e-06, "loss": 0.5887, "step": 11164 }, { "epoch": 5.050893463017417, "grad_norm": 0.4496789574623108, "learning_rate": 5.1586137752249e-06, "loss": 0.6188, "step": 11165 }, { "epoch": 5.051345849355349, "grad_norm": 0.4515194594860077, "learning_rate": 5.157881392241606e-06, "loss": 0.5905, "step": 11166 }, { "epoch": 5.0517982356932825, "grad_norm": 0.424949049949646, "learning_rate": 5.157149005867509e-06, "loss": 0.5675, "step": 11167 }, { "epoch": 5.052250622031215, "grad_norm": 0.45735132694244385, "learning_rate": 5.156416616118337e-06, "loss": 0.503, "step": 11168 }, { "epoch": 5.052703008369147, "grad_norm": 0.5002506375312805, "learning_rate": 5.15568422300982e-06, "loss": 0.6026, "step": 11169 }, { "epoch": 5.05315539470708, "grad_norm": 0.4140632450580597, "learning_rate": 5.154951826557689e-06, "loss": 0.4898, "step": 11170 }, { "epoch": 5.053607781045012, "grad_norm": 0.4519929587841034, "learning_rate": 5.154219426777672e-06, "loss": 0.5341, "step": 11171 }, { "epoch": 5.054060167382945, "grad_norm": 0.4691585898399353, "learning_rate": 5.153487023685496e-06, "loss": 0.5988, "step": 11172 }, { "epoch": 5.054512553720878, "grad_norm": 0.49247199296951294, "learning_rate": 5.152754617296895e-06, "loss": 0.6611, "step": 11173 }, { "epoch": 5.05496494005881, "grad_norm": 0.4677028954029083, "learning_rate": 5.152022207627597e-06, "loss": 0.6452, "step": 11174 }, { "epoch": 5.055417326396743, "grad_norm": 0.5035292506217957, "learning_rate": 5.151289794693333e-06, "loss": 0.6085, "step": 11175 }, { "epoch": 5.055869712734675, "grad_norm": 0.4946669936180115, "learning_rate": 5.150557378509832e-06, "loss": 0.5278, "step": 11176 }, { "epoch": 5.056322099072608, "grad_norm": 0.4440668523311615, "learning_rate": 5.149824959092824e-06, "loss": 0.4838, "step": 11177 }, { "epoch": 5.056774485410541, "grad_norm": 0.49286070466041565, "learning_rate": 5.149092536458039e-06, "loss": 0.5764, "step": 11178 }, { "epoch": 5.057226871748473, "grad_norm": 0.47086209058761597, "learning_rate": 5.148360110621208e-06, "loss": 0.5213, "step": 11179 }, { "epoch": 5.057679258086406, "grad_norm": 0.4780083894729614, "learning_rate": 5.147627681598061e-06, "loss": 0.5384, "step": 11180 }, { "epoch": 5.058131644424338, "grad_norm": 0.4990505874156952, "learning_rate": 5.146895249404329e-06, "loss": 0.513, "step": 11181 }, { "epoch": 5.058584030762271, "grad_norm": 0.4447266161441803, "learning_rate": 5.14616281405574e-06, "loss": 0.4535, "step": 11182 }, { "epoch": 5.059036417100204, "grad_norm": 0.5437628030776978, "learning_rate": 5.145430375568026e-06, "loss": 0.5387, "step": 11183 }, { "epoch": 5.0594888034381365, "grad_norm": 0.480020135641098, "learning_rate": 5.144697933956917e-06, "loss": 0.4982, "step": 11184 }, { "epoch": 5.059941189776069, "grad_norm": 0.5100601315498352, "learning_rate": 5.143965489238145e-06, "loss": 0.4603, "step": 11185 }, { "epoch": 5.060393576114001, "grad_norm": 0.5041719079017639, "learning_rate": 5.143233041427439e-06, "loss": 0.5166, "step": 11186 }, { "epoch": 5.060845962451934, "grad_norm": 0.5079004168510437, "learning_rate": 5.14250059054053e-06, "loss": 0.5461, "step": 11187 }, { "epoch": 5.061298348789866, "grad_norm": 0.49869006872177124, "learning_rate": 5.14176813659315e-06, "loss": 0.4821, "step": 11188 }, { "epoch": 5.0617507351277995, "grad_norm": 0.47215303778648376, "learning_rate": 5.141035679601028e-06, "loss": 0.4728, "step": 11189 }, { "epoch": 5.062203121465732, "grad_norm": 0.5842257142066956, "learning_rate": 5.140303219579897e-06, "loss": 0.6209, "step": 11190 }, { "epoch": 5.062655507803664, "grad_norm": 0.5456023812294006, "learning_rate": 5.139570756545487e-06, "loss": 0.489, "step": 11191 }, { "epoch": 5.063107894141597, "grad_norm": 0.5435435771942139, "learning_rate": 5.138838290513528e-06, "loss": 0.4926, "step": 11192 }, { "epoch": 5.063560280479529, "grad_norm": 0.5493932366371155, "learning_rate": 5.138105821499753e-06, "loss": 0.4894, "step": 11193 }, { "epoch": 5.0640126668174625, "grad_norm": 0.5304822325706482, "learning_rate": 5.137373349519891e-06, "loss": 0.4713, "step": 11194 }, { "epoch": 5.064465053155395, "grad_norm": 0.5544705390930176, "learning_rate": 5.136640874589675e-06, "loss": 0.4756, "step": 11195 }, { "epoch": 5.064917439493327, "grad_norm": 0.6315935850143433, "learning_rate": 5.135908396724835e-06, "loss": 0.5668, "step": 11196 }, { "epoch": 5.06536982583126, "grad_norm": 0.5585578680038452, "learning_rate": 5.135175915941103e-06, "loss": 0.5145, "step": 11197 }, { "epoch": 5.065822212169192, "grad_norm": 0.6001442670822144, "learning_rate": 5.134443432254211e-06, "loss": 0.554, "step": 11198 }, { "epoch": 5.066274598507125, "grad_norm": 0.6028825640678406, "learning_rate": 5.13371094567989e-06, "loss": 0.4981, "step": 11199 }, { "epoch": 5.066726984845058, "grad_norm": 0.6207860112190247, "learning_rate": 5.132978456233871e-06, "loss": 0.5447, "step": 11200 }, { "epoch": 5.066726984845058, "eval_loss": 0.5898146033287048, "eval_runtime": 32.0744, "eval_samples_per_second": 23.196, "eval_steps_per_second": 5.799, "step": 11200 }, { "epoch": 5.06717937118299, "grad_norm": 0.5940231084823608, "learning_rate": 5.132245963931888e-06, "loss": 0.4634, "step": 11201 }, { "epoch": 5.067631757520923, "grad_norm": 0.7920205593109131, "learning_rate": 5.1315134687896686e-06, "loss": 0.5343, "step": 11202 }, { "epoch": 5.068084143858855, "grad_norm": 0.6104041934013367, "learning_rate": 5.130780970822947e-06, "loss": 1.0493, "step": 11203 }, { "epoch": 5.068536530196788, "grad_norm": 0.23928707838058472, "learning_rate": 5.1300484700474574e-06, "loss": 0.7618, "step": 11204 }, { "epoch": 5.068988916534721, "grad_norm": 0.25619328022003174, "learning_rate": 5.129315966478926e-06, "loss": 0.5197, "step": 11205 }, { "epoch": 5.0694413028726535, "grad_norm": 0.304675430059433, "learning_rate": 5.128583460133088e-06, "loss": 0.5312, "step": 11206 }, { "epoch": 5.069893689210586, "grad_norm": 0.3208312690258026, "learning_rate": 5.127850951025675e-06, "loss": 0.5671, "step": 11207 }, { "epoch": 5.070346075548518, "grad_norm": 0.36443161964416504, "learning_rate": 5.12711843917242e-06, "loss": 0.5648, "step": 11208 }, { "epoch": 5.070798461886451, "grad_norm": 0.3316135108470917, "learning_rate": 5.126385924589054e-06, "loss": 0.4786, "step": 11209 }, { "epoch": 5.071250848224383, "grad_norm": 0.34507182240486145, "learning_rate": 5.125653407291309e-06, "loss": 0.5566, "step": 11210 }, { "epoch": 5.0717032345623165, "grad_norm": 0.41240161657333374, "learning_rate": 5.124920887294917e-06, "loss": 0.6091, "step": 11211 }, { "epoch": 5.072155620900249, "grad_norm": 0.4083606004714966, "learning_rate": 5.1241883646156106e-06, "loss": 0.6652, "step": 11212 }, { "epoch": 5.072608007238181, "grad_norm": 0.38300564885139465, "learning_rate": 5.123455839269123e-06, "loss": 0.5772, "step": 11213 }, { "epoch": 5.073060393576114, "grad_norm": 0.39346805214881897, "learning_rate": 5.122723311271186e-06, "loss": 0.6267, "step": 11214 }, { "epoch": 5.073512779914046, "grad_norm": 0.3768056035041809, "learning_rate": 5.121990780637532e-06, "loss": 0.5262, "step": 11215 }, { "epoch": 5.07396516625198, "grad_norm": 0.39622801542282104, "learning_rate": 5.121258247383893e-06, "loss": 0.4719, "step": 11216 }, { "epoch": 5.074417552589912, "grad_norm": 0.4575894773006439, "learning_rate": 5.120525711526002e-06, "loss": 0.6296, "step": 11217 }, { "epoch": 5.074869938927844, "grad_norm": 0.43721601366996765, "learning_rate": 5.119793173079592e-06, "loss": 0.4795, "step": 11218 }, { "epoch": 5.075322325265777, "grad_norm": 0.4416942596435547, "learning_rate": 5.1190606320603945e-06, "loss": 0.5229, "step": 11219 }, { "epoch": 5.075774711603709, "grad_norm": 0.419691801071167, "learning_rate": 5.118328088484144e-06, "loss": 0.5031, "step": 11220 }, { "epoch": 5.076227097941642, "grad_norm": 0.37360724806785583, "learning_rate": 5.1175955423665705e-06, "loss": 0.418, "step": 11221 }, { "epoch": 5.076679484279575, "grad_norm": 0.381850004196167, "learning_rate": 5.116862993723409e-06, "loss": 0.4499, "step": 11222 }, { "epoch": 5.0771318706175075, "grad_norm": 0.432659387588501, "learning_rate": 5.116130442570393e-06, "loss": 0.5497, "step": 11223 }, { "epoch": 5.07758425695544, "grad_norm": 0.53816157579422, "learning_rate": 5.115397888923255e-06, "loss": 0.5711, "step": 11224 }, { "epoch": 5.078036643293372, "grad_norm": 0.4909137487411499, "learning_rate": 5.114665332797725e-06, "loss": 0.6134, "step": 11225 }, { "epoch": 5.078489029631305, "grad_norm": 0.5549939274787903, "learning_rate": 5.113932774209539e-06, "loss": 0.633, "step": 11226 }, { "epoch": 5.078941415969238, "grad_norm": 0.4671017527580261, "learning_rate": 5.113200213174431e-06, "loss": 0.5098, "step": 11227 }, { "epoch": 5.0793938023071705, "grad_norm": 0.467974454164505, "learning_rate": 5.11246764970813e-06, "loss": 0.5394, "step": 11228 }, { "epoch": 5.079846188645103, "grad_norm": 0.509114682674408, "learning_rate": 5.111735083826374e-06, "loss": 0.5945, "step": 11229 }, { "epoch": 5.080298574983035, "grad_norm": 0.4726940989494324, "learning_rate": 5.111002515544893e-06, "loss": 0.5171, "step": 11230 }, { "epoch": 5.080750961320968, "grad_norm": 0.4360769987106323, "learning_rate": 5.110269944879421e-06, "loss": 0.4365, "step": 11231 }, { "epoch": 5.081203347658901, "grad_norm": 0.5217816829681396, "learning_rate": 5.109537371845692e-06, "loss": 0.5974, "step": 11232 }, { "epoch": 5.081655733996834, "grad_norm": 0.4889044165611267, "learning_rate": 5.10880479645944e-06, "loss": 0.495, "step": 11233 }, { "epoch": 5.082108120334766, "grad_norm": 0.5498167276382446, "learning_rate": 5.108072218736396e-06, "loss": 0.5726, "step": 11234 }, { "epoch": 5.082560506672698, "grad_norm": 0.5115059614181519, "learning_rate": 5.107339638692298e-06, "loss": 0.5462, "step": 11235 }, { "epoch": 5.083012893010631, "grad_norm": 0.4920312464237213, "learning_rate": 5.1066070563428736e-06, "loss": 0.4531, "step": 11236 }, { "epoch": 5.083465279348563, "grad_norm": 0.44324931502342224, "learning_rate": 5.1058744717038614e-06, "loss": 0.4844, "step": 11237 }, { "epoch": 5.083917665686497, "grad_norm": 0.5210042595863342, "learning_rate": 5.105141884790992e-06, "loss": 0.5073, "step": 11238 }, { "epoch": 5.084370052024429, "grad_norm": 0.5140693187713623, "learning_rate": 5.1044092956199995e-06, "loss": 0.5498, "step": 11239 }, { "epoch": 5.0848224383623615, "grad_norm": 0.5273185968399048, "learning_rate": 5.1036767042066194e-06, "loss": 0.4895, "step": 11240 }, { "epoch": 5.085274824700294, "grad_norm": 0.52720046043396, "learning_rate": 5.102944110566584e-06, "loss": 0.4945, "step": 11241 }, { "epoch": 5.085727211038226, "grad_norm": 0.5258522033691406, "learning_rate": 5.102211514715627e-06, "loss": 0.4474, "step": 11242 }, { "epoch": 5.08617959737616, "grad_norm": 0.5797314047813416, "learning_rate": 5.101478916669483e-06, "loss": 0.6069, "step": 11243 }, { "epoch": 5.086631983714092, "grad_norm": 0.576310932636261, "learning_rate": 5.100746316443887e-06, "loss": 0.5089, "step": 11244 }, { "epoch": 5.0870843700520245, "grad_norm": 0.5367217659950256, "learning_rate": 5.10001371405457e-06, "loss": 0.5445, "step": 11245 }, { "epoch": 5.087536756389957, "grad_norm": 0.5989963412284851, "learning_rate": 5.09928110951727e-06, "loss": 0.5545, "step": 11246 }, { "epoch": 5.087989142727889, "grad_norm": 0.6175532341003418, "learning_rate": 5.098548502847718e-06, "loss": 0.4966, "step": 11247 }, { "epoch": 5.088441529065822, "grad_norm": 0.626684308052063, "learning_rate": 5.097815894061649e-06, "loss": 0.5487, "step": 11248 }, { "epoch": 5.088893915403755, "grad_norm": 0.6634669303894043, "learning_rate": 5.097083283174797e-06, "loss": 0.4597, "step": 11249 }, { "epoch": 5.089346301741688, "grad_norm": 0.5949984788894653, "learning_rate": 5.096350670202895e-06, "loss": 0.4998, "step": 11250 }, { "epoch": 5.08979868807962, "grad_norm": 0.6241093873977661, "learning_rate": 5.09561805516168e-06, "loss": 0.5132, "step": 11251 }, { "epoch": 5.090251074417552, "grad_norm": 0.8041269183158875, "learning_rate": 5.094885438066884e-06, "loss": 0.6217, "step": 11252 }, { "epoch": 5.090703460755485, "grad_norm": 0.4792484641075134, "learning_rate": 5.0941528189342435e-06, "loss": 1.0178, "step": 11253 }, { "epoch": 5.091155847093418, "grad_norm": 0.29778629541397095, "learning_rate": 5.093420197779492e-06, "loss": 0.6346, "step": 11254 }, { "epoch": 5.091608233431351, "grad_norm": 0.33185574412345886, "learning_rate": 5.092687574618362e-06, "loss": 0.6552, "step": 11255 }, { "epoch": 5.092060619769283, "grad_norm": 0.3687180280685425, "learning_rate": 5.091954949466591e-06, "loss": 0.6281, "step": 11256 }, { "epoch": 5.0925130061072155, "grad_norm": 0.34076422452926636, "learning_rate": 5.091222322339912e-06, "loss": 0.6779, "step": 11257 }, { "epoch": 5.092965392445148, "grad_norm": 0.33449801802635193, "learning_rate": 5.0904896932540595e-06, "loss": 0.5051, "step": 11258 }, { "epoch": 5.09341777878308, "grad_norm": 0.3967413306236267, "learning_rate": 5.089757062224768e-06, "loss": 0.612, "step": 11259 }, { "epoch": 5.093870165121014, "grad_norm": 0.3929999768733978, "learning_rate": 5.089024429267773e-06, "loss": 0.5431, "step": 11260 }, { "epoch": 5.094322551458946, "grad_norm": 0.4553225636482239, "learning_rate": 5.088291794398808e-06, "loss": 0.6317, "step": 11261 }, { "epoch": 5.0947749377968785, "grad_norm": 0.37984615564346313, "learning_rate": 5.087559157633609e-06, "loss": 0.4986, "step": 11262 }, { "epoch": 5.095227324134811, "grad_norm": 0.43019911646842957, "learning_rate": 5.08682651898791e-06, "loss": 0.6787, "step": 11263 }, { "epoch": 5.095679710472743, "grad_norm": 0.4206899404525757, "learning_rate": 5.086093878477445e-06, "loss": 0.5049, "step": 11264 }, { "epoch": 5.096132096810677, "grad_norm": 0.4327152371406555, "learning_rate": 5.085361236117952e-06, "loss": 0.5764, "step": 11265 }, { "epoch": 5.096584483148609, "grad_norm": 0.414976567029953, "learning_rate": 5.084628591925161e-06, "loss": 0.4856, "step": 11266 }, { "epoch": 5.0970368694865416, "grad_norm": 0.4124453663825989, "learning_rate": 5.0838959459148106e-06, "loss": 0.3874, "step": 11267 }, { "epoch": 5.097489255824474, "grad_norm": 0.4651114046573639, "learning_rate": 5.083163298102635e-06, "loss": 0.4754, "step": 11268 }, { "epoch": 5.097941642162406, "grad_norm": 0.40738943219184875, "learning_rate": 5.082430648504371e-06, "loss": 0.4864, "step": 11269 }, { "epoch": 5.098394028500339, "grad_norm": 0.4315110743045807, "learning_rate": 5.081697997135749e-06, "loss": 0.5012, "step": 11270 }, { "epoch": 5.098846414838272, "grad_norm": 0.4673481285572052, "learning_rate": 5.080965344012509e-06, "loss": 0.5374, "step": 11271 }, { "epoch": 5.099298801176205, "grad_norm": 0.4162807762622833, "learning_rate": 5.080232689150382e-06, "loss": 0.4493, "step": 11272 }, { "epoch": 5.099751187514137, "grad_norm": 0.4847480356693268, "learning_rate": 5.079500032565106e-06, "loss": 0.6162, "step": 11273 }, { "epoch": 5.1002035738520695, "grad_norm": 0.4133059084415436, "learning_rate": 5.078767374272414e-06, "loss": 0.4581, "step": 11274 }, { "epoch": 5.100655960190002, "grad_norm": 0.43820786476135254, "learning_rate": 5.078034714288043e-06, "loss": 0.5305, "step": 11275 }, { "epoch": 5.101108346527935, "grad_norm": 0.471911758184433, "learning_rate": 5.0773020526277275e-06, "loss": 0.5301, "step": 11276 }, { "epoch": 5.101560732865868, "grad_norm": 0.41854506731033325, "learning_rate": 5.076569389307203e-06, "loss": 0.4364, "step": 11277 }, { "epoch": 5.1020131192038, "grad_norm": 0.4517832100391388, "learning_rate": 5.075836724342206e-06, "loss": 0.5323, "step": 11278 }, { "epoch": 5.1024655055417325, "grad_norm": 0.4384348392486572, "learning_rate": 5.07510405774847e-06, "loss": 0.502, "step": 11279 }, { "epoch": 5.102917891879665, "grad_norm": 0.46632611751556396, "learning_rate": 5.074371389541732e-06, "loss": 0.4406, "step": 11280 }, { "epoch": 5.103370278217598, "grad_norm": 0.47859498858451843, "learning_rate": 5.0736387197377254e-06, "loss": 0.462, "step": 11281 }, { "epoch": 5.103822664555531, "grad_norm": 0.5206245183944702, "learning_rate": 5.072906048352187e-06, "loss": 0.5353, "step": 11282 }, { "epoch": 5.104275050893463, "grad_norm": 0.4792967140674591, "learning_rate": 5.072173375400853e-06, "loss": 0.4794, "step": 11283 }, { "epoch": 5.1047274372313955, "grad_norm": 0.4509256184101105, "learning_rate": 5.0714407008994585e-06, "loss": 0.4467, "step": 11284 }, { "epoch": 5.105179823569328, "grad_norm": 0.48994943499565125, "learning_rate": 5.070708024863737e-06, "loss": 0.4846, "step": 11285 }, { "epoch": 5.10563220990726, "grad_norm": 0.523347795009613, "learning_rate": 5.069975347309427e-06, "loss": 0.6042, "step": 11286 }, { "epoch": 5.106084596245194, "grad_norm": 0.5563266277313232, "learning_rate": 5.069242668252262e-06, "loss": 0.5778, "step": 11287 }, { "epoch": 5.106536982583126, "grad_norm": 0.4774409532546997, "learning_rate": 5.06850998770798e-06, "loss": 0.4611, "step": 11288 }, { "epoch": 5.106989368921059, "grad_norm": 0.5026154518127441, "learning_rate": 5.067777305692315e-06, "loss": 0.4929, "step": 11289 }, { "epoch": 5.107441755258991, "grad_norm": 0.5803771018981934, "learning_rate": 5.067044622221002e-06, "loss": 0.5615, "step": 11290 }, { "epoch": 5.1078941415969235, "grad_norm": 0.5928201079368591, "learning_rate": 5.066311937309779e-06, "loss": 0.5491, "step": 11291 }, { "epoch": 5.108346527934857, "grad_norm": 0.49827784299850464, "learning_rate": 5.065579250974382e-06, "loss": 0.497, "step": 11292 }, { "epoch": 5.108798914272789, "grad_norm": 0.6251811981201172, "learning_rate": 5.064846563230544e-06, "loss": 0.6369, "step": 11293 }, { "epoch": 5.109251300610722, "grad_norm": 0.5373079180717468, "learning_rate": 5.064113874094003e-06, "loss": 0.478, "step": 11294 }, { "epoch": 5.109703686948654, "grad_norm": 0.5352738499641418, "learning_rate": 5.063381183580493e-06, "loss": 0.4769, "step": 11295 }, { "epoch": 5.1101560732865865, "grad_norm": 0.6540150046348572, "learning_rate": 5.062648491705753e-06, "loss": 0.4959, "step": 11296 }, { "epoch": 5.110608459624519, "grad_norm": 0.559669554233551, "learning_rate": 5.0619157984855164e-06, "loss": 0.5125, "step": 11297 }, { "epoch": 5.111060845962452, "grad_norm": 0.589368999004364, "learning_rate": 5.06118310393552e-06, "loss": 0.5114, "step": 11298 }, { "epoch": 5.111513232300385, "grad_norm": 0.6968478560447693, "learning_rate": 5.060450408071499e-06, "loss": 0.5235, "step": 11299 }, { "epoch": 5.111965618638317, "grad_norm": 0.5888974070549011, "learning_rate": 5.059717710909191e-06, "loss": 0.4516, "step": 11300 }, { "epoch": 5.1124180049762495, "grad_norm": 0.7550285458564758, "learning_rate": 5.058985012464331e-06, "loss": 0.5373, "step": 11301 }, { "epoch": 5.112870391314182, "grad_norm": 0.6591219305992126, "learning_rate": 5.058252312752656e-06, "loss": 0.4656, "step": 11302 }, { "epoch": 5.113322777652115, "grad_norm": 0.502524733543396, "learning_rate": 5.057519611789902e-06, "loss": 0.8948, "step": 11303 }, { "epoch": 5.113775163990048, "grad_norm": 0.2148352861404419, "learning_rate": 5.0567869095918036e-06, "loss": 1.1272, "step": 11304 }, { "epoch": 5.11422755032798, "grad_norm": 0.28833022713661194, "learning_rate": 5.056054206174098e-06, "loss": 0.573, "step": 11305 }, { "epoch": 5.114679936665913, "grad_norm": 0.31127554178237915, "learning_rate": 5.055321501552522e-06, "loss": 0.618, "step": 11306 }, { "epoch": 5.115132323003845, "grad_norm": 0.34127873182296753, "learning_rate": 5.054588795742811e-06, "loss": 0.5306, "step": 11307 }, { "epoch": 5.1155847093417774, "grad_norm": 0.3579685091972351, "learning_rate": 5.053856088760702e-06, "loss": 0.7197, "step": 11308 }, { "epoch": 5.116037095679711, "grad_norm": 0.3587843179702759, "learning_rate": 5.053123380621929e-06, "loss": 0.5095, "step": 11309 }, { "epoch": 5.116489482017643, "grad_norm": 0.3601358234882355, "learning_rate": 5.0523906713422325e-06, "loss": 0.6212, "step": 11310 }, { "epoch": 5.116941868355576, "grad_norm": 0.4516037404537201, "learning_rate": 5.051657960937345e-06, "loss": 0.5901, "step": 11311 }, { "epoch": 5.117394254693508, "grad_norm": 0.3732326626777649, "learning_rate": 5.050925249423005e-06, "loss": 0.5542, "step": 11312 }, { "epoch": 5.1178466410314405, "grad_norm": 0.37238556146621704, "learning_rate": 5.050192536814948e-06, "loss": 0.6186, "step": 11313 }, { "epoch": 5.118299027369374, "grad_norm": 0.383610337972641, "learning_rate": 5.04945982312891e-06, "loss": 0.5283, "step": 11314 }, { "epoch": 5.118751413707306, "grad_norm": 0.3928011655807495, "learning_rate": 5.048727108380629e-06, "loss": 0.5684, "step": 11315 }, { "epoch": 5.119203800045239, "grad_norm": 0.4134840965270996, "learning_rate": 5.04799439258584e-06, "loss": 0.5849, "step": 11316 }, { "epoch": 5.119656186383171, "grad_norm": 0.4083605110645294, "learning_rate": 5.04726167576028e-06, "loss": 0.7003, "step": 11317 }, { "epoch": 5.1201085727211035, "grad_norm": 0.46620091795921326, "learning_rate": 5.0465289579196844e-06, "loss": 0.6164, "step": 11318 }, { "epoch": 5.120560959059037, "grad_norm": 0.42597126960754395, "learning_rate": 5.045796239079792e-06, "loss": 0.6185, "step": 11319 }, { "epoch": 5.121013345396969, "grad_norm": 0.4338490962982178, "learning_rate": 5.045063519256337e-06, "loss": 0.5803, "step": 11320 }, { "epoch": 5.121465731734902, "grad_norm": 0.4944349229335785, "learning_rate": 5.044330798465058e-06, "loss": 0.7332, "step": 11321 }, { "epoch": 5.121918118072834, "grad_norm": 0.41094520688056946, "learning_rate": 5.0435980767216895e-06, "loss": 0.5436, "step": 11322 }, { "epoch": 5.122370504410767, "grad_norm": 0.42765262722969055, "learning_rate": 5.04286535404197e-06, "loss": 0.5628, "step": 11323 }, { "epoch": 5.122822890748699, "grad_norm": 0.4412253499031067, "learning_rate": 5.042132630441635e-06, "loss": 0.5997, "step": 11324 }, { "epoch": 5.123275277086632, "grad_norm": 0.4651359021663666, "learning_rate": 5.041399905936422e-06, "loss": 0.5103, "step": 11325 }, { "epoch": 5.123727663424565, "grad_norm": 0.4575479030609131, "learning_rate": 5.040667180542067e-06, "loss": 0.4765, "step": 11326 }, { "epoch": 5.124180049762497, "grad_norm": 0.5057826638221741, "learning_rate": 5.039934454274307e-06, "loss": 0.5423, "step": 11327 }, { "epoch": 5.12463243610043, "grad_norm": 0.4432235360145569, "learning_rate": 5.039201727148877e-06, "loss": 0.494, "step": 11328 }, { "epoch": 5.125084822438362, "grad_norm": 0.4424418807029724, "learning_rate": 5.038468999181516e-06, "loss": 0.4409, "step": 11329 }, { "epoch": 5.125537208776295, "grad_norm": 0.4664880037307739, "learning_rate": 5.03773627038796e-06, "loss": 0.5244, "step": 11330 }, { "epoch": 5.125989595114228, "grad_norm": 0.47675684094429016, "learning_rate": 5.037003540783946e-06, "loss": 0.5384, "step": 11331 }, { "epoch": 5.12644198145216, "grad_norm": 0.3995063304901123, "learning_rate": 5.0362708103852096e-06, "loss": 0.4204, "step": 11332 }, { "epoch": 5.126894367790093, "grad_norm": 0.46066945791244507, "learning_rate": 5.035538079207488e-06, "loss": 0.5162, "step": 11333 }, { "epoch": 5.127346754128025, "grad_norm": 0.49793997406959534, "learning_rate": 5.034805347266519e-06, "loss": 0.4983, "step": 11334 }, { "epoch": 5.1277991404659575, "grad_norm": 0.469579815864563, "learning_rate": 5.03407261457804e-06, "loss": 0.4661, "step": 11335 }, { "epoch": 5.128251526803891, "grad_norm": 0.5620110034942627, "learning_rate": 5.033339881157785e-06, "loss": 0.5836, "step": 11336 }, { "epoch": 5.128703913141823, "grad_norm": 0.4936693608760834, "learning_rate": 5.032607147021494e-06, "loss": 0.4336, "step": 11337 }, { "epoch": 5.129156299479756, "grad_norm": 0.4965991973876953, "learning_rate": 5.031874412184901e-06, "loss": 0.4942, "step": 11338 }, { "epoch": 5.129608685817688, "grad_norm": 0.5022698044776917, "learning_rate": 5.031141676663745e-06, "loss": 0.4035, "step": 11339 }, { "epoch": 5.130061072155621, "grad_norm": 0.47949910163879395, "learning_rate": 5.0304089404737624e-06, "loss": 0.4579, "step": 11340 }, { "epoch": 5.130513458493554, "grad_norm": 0.4783257246017456, "learning_rate": 5.029676203630689e-06, "loss": 0.4364, "step": 11341 }, { "epoch": 5.130965844831486, "grad_norm": 0.5761313438415527, "learning_rate": 5.028943466150262e-06, "loss": 0.5534, "step": 11342 }, { "epoch": 5.131418231169419, "grad_norm": 0.5358985662460327, "learning_rate": 5.028210728048219e-06, "loss": 0.4614, "step": 11343 }, { "epoch": 5.131870617507351, "grad_norm": 0.6173979043960571, "learning_rate": 5.027477989340298e-06, "loss": 0.7194, "step": 11344 }, { "epoch": 5.132323003845284, "grad_norm": 0.5705219507217407, "learning_rate": 5.026745250042234e-06, "loss": 0.5049, "step": 11345 }, { "epoch": 5.132775390183216, "grad_norm": 0.6251544952392578, "learning_rate": 5.026012510169765e-06, "loss": 0.6064, "step": 11346 }, { "epoch": 5.133227776521149, "grad_norm": 0.5919802784919739, "learning_rate": 5.025279769738628e-06, "loss": 0.5435, "step": 11347 }, { "epoch": 5.133680162859082, "grad_norm": 0.5382701754570007, "learning_rate": 5.0245470287645606e-06, "loss": 0.4156, "step": 11348 }, { "epoch": 5.134132549197014, "grad_norm": 0.5824554562568665, "learning_rate": 5.0238142872632976e-06, "loss": 0.5506, "step": 11349 }, { "epoch": 5.134584935534947, "grad_norm": 0.6605470180511475, "learning_rate": 5.0230815452505774e-06, "loss": 0.5379, "step": 11350 }, { "epoch": 5.135037321872879, "grad_norm": 0.6393526792526245, "learning_rate": 5.022348802742137e-06, "loss": 0.565, "step": 11351 }, { "epoch": 5.135489708210812, "grad_norm": 0.5969440340995789, "learning_rate": 5.021616059753715e-06, "loss": 0.5286, "step": 11352 }, { "epoch": 5.135942094548745, "grad_norm": 0.455068975687027, "learning_rate": 5.020883316301045e-06, "loss": 1.1188, "step": 11353 }, { "epoch": 5.136394480886677, "grad_norm": 0.19209785759449005, "learning_rate": 5.020150572399867e-06, "loss": 0.4537, "step": 11354 }, { "epoch": 5.13684686722461, "grad_norm": 0.2939070463180542, "learning_rate": 5.019417828065916e-06, "loss": 0.5476, "step": 11355 }, { "epoch": 5.137299253562542, "grad_norm": 0.32683703303337097, "learning_rate": 5.01868508331493e-06, "loss": 0.6309, "step": 11356 }, { "epoch": 5.1377516399004755, "grad_norm": 0.4328486919403076, "learning_rate": 5.017952338162647e-06, "loss": 0.6327, "step": 11357 }, { "epoch": 5.138204026238408, "grad_norm": 0.436262309551239, "learning_rate": 5.017219592624804e-06, "loss": 0.6357, "step": 11358 }, { "epoch": 5.13865641257634, "grad_norm": 0.3848155736923218, "learning_rate": 5.016486846717137e-06, "loss": 0.5727, "step": 11359 }, { "epoch": 5.139108798914273, "grad_norm": 0.3779255151748657, "learning_rate": 5.015754100455383e-06, "loss": 0.5983, "step": 11360 }, { "epoch": 5.139561185252205, "grad_norm": 0.4744502902030945, "learning_rate": 5.015021353855281e-06, "loss": 0.6631, "step": 11361 }, { "epoch": 5.140013571590138, "grad_norm": 0.420970618724823, "learning_rate": 5.014288606932565e-06, "loss": 0.5857, "step": 11362 }, { "epoch": 5.140465957928071, "grad_norm": 0.41290175914764404, "learning_rate": 5.0135558597029745e-06, "loss": 0.5379, "step": 11363 }, { "epoch": 5.140918344266003, "grad_norm": 0.43473854660987854, "learning_rate": 5.0128231121822456e-06, "loss": 0.6781, "step": 11364 }, { "epoch": 5.141370730603936, "grad_norm": 0.47883352637290955, "learning_rate": 5.012090364386117e-06, "loss": 0.6771, "step": 11365 }, { "epoch": 5.141823116941868, "grad_norm": 0.3833886384963989, "learning_rate": 5.011357616330324e-06, "loss": 0.4349, "step": 11366 }, { "epoch": 5.142275503279801, "grad_norm": 0.4222332835197449, "learning_rate": 5.010624868030605e-06, "loss": 0.4853, "step": 11367 }, { "epoch": 5.142727889617734, "grad_norm": 0.43958282470703125, "learning_rate": 5.009892119502696e-06, "loss": 0.5773, "step": 11368 }, { "epoch": 5.143180275955666, "grad_norm": 0.42851871252059937, "learning_rate": 5.009159370762336e-06, "loss": 0.5773, "step": 11369 }, { "epoch": 5.143632662293599, "grad_norm": 0.444776326417923, "learning_rate": 5.00842662182526e-06, "loss": 0.5281, "step": 11370 }, { "epoch": 5.144085048631531, "grad_norm": 0.4196228086948395, "learning_rate": 5.007693872707207e-06, "loss": 0.3408, "step": 11371 }, { "epoch": 5.144537434969464, "grad_norm": 0.42875832319259644, "learning_rate": 5.006961123423913e-06, "loss": 0.56, "step": 11372 }, { "epoch": 5.144989821307396, "grad_norm": 0.5335195064544678, "learning_rate": 5.006228373991116e-06, "loss": 0.6916, "step": 11373 }, { "epoch": 5.1454422076453294, "grad_norm": 0.4099080264568329, "learning_rate": 5.005495624424552e-06, "loss": 0.4648, "step": 11374 }, { "epoch": 5.145894593983262, "grad_norm": 0.4881769120693207, "learning_rate": 5.004762874739959e-06, "loss": 0.5247, "step": 11375 }, { "epoch": 5.146346980321194, "grad_norm": 0.4987201988697052, "learning_rate": 5.004030124953075e-06, "loss": 0.5835, "step": 11376 }, { "epoch": 5.146799366659127, "grad_norm": 0.42289137840270996, "learning_rate": 5.003297375079636e-06, "loss": 0.4816, "step": 11377 }, { "epoch": 5.147251752997059, "grad_norm": 0.5082607269287109, "learning_rate": 5.002564625135379e-06, "loss": 0.4918, "step": 11378 }, { "epoch": 5.1477041393349925, "grad_norm": 0.4519132077693939, "learning_rate": 5.001831875136043e-06, "loss": 0.5125, "step": 11379 }, { "epoch": 5.148156525672925, "grad_norm": 0.5055415034294128, "learning_rate": 5.001099125097363e-06, "loss": 0.4838, "step": 11380 }, { "epoch": 5.148608912010857, "grad_norm": 0.4696647524833679, "learning_rate": 5.0003663750350774e-06, "loss": 0.5248, "step": 11381 }, { "epoch": 5.14906129834879, "grad_norm": 0.5585653781890869, "learning_rate": 4.999633624964924e-06, "loss": 0.6446, "step": 11382 }, { "epoch": 5.149513684686722, "grad_norm": 0.5328734517097473, "learning_rate": 4.998900874902638e-06, "loss": 0.4789, "step": 11383 }, { "epoch": 5.149966071024655, "grad_norm": 0.4978470802307129, "learning_rate": 4.998168124863959e-06, "loss": 0.5856, "step": 11384 }, { "epoch": 5.150418457362588, "grad_norm": 0.49888119101524353, "learning_rate": 4.997435374864622e-06, "loss": 0.5476, "step": 11385 }, { "epoch": 5.15087084370052, "grad_norm": 0.4767812490463257, "learning_rate": 4.996702624920366e-06, "loss": 0.4608, "step": 11386 }, { "epoch": 5.151323230038453, "grad_norm": 0.47151437401771545, "learning_rate": 4.995969875046927e-06, "loss": 0.4503, "step": 11387 }, { "epoch": 5.151775616376385, "grad_norm": 0.5385220646858215, "learning_rate": 4.9952371252600424e-06, "loss": 0.5355, "step": 11388 }, { "epoch": 5.152228002714318, "grad_norm": 0.5046237707138062, "learning_rate": 4.994504375575449e-06, "loss": 0.4894, "step": 11389 }, { "epoch": 5.152680389052251, "grad_norm": 0.507832407951355, "learning_rate": 4.993771626008886e-06, "loss": 0.4271, "step": 11390 }, { "epoch": 5.153132775390183, "grad_norm": 0.5974923968315125, "learning_rate": 4.993038876576089e-06, "loss": 0.529, "step": 11391 }, { "epoch": 5.153585161728116, "grad_norm": 0.520656406879425, "learning_rate": 4.992306127292794e-06, "loss": 0.4756, "step": 11392 }, { "epoch": 5.154037548066048, "grad_norm": 0.5302023887634277, "learning_rate": 4.9915733781747425e-06, "loss": 0.4275, "step": 11393 }, { "epoch": 5.154489934403981, "grad_norm": 0.5587235689163208, "learning_rate": 4.990840629237667e-06, "loss": 0.4784, "step": 11394 }, { "epoch": 5.154942320741913, "grad_norm": 0.5316705107688904, "learning_rate": 4.990107880497305e-06, "loss": 0.4123, "step": 11395 }, { "epoch": 5.1553947070798465, "grad_norm": 0.5891151428222656, "learning_rate": 4.989375131969397e-06, "loss": 0.5211, "step": 11396 }, { "epoch": 5.155847093417779, "grad_norm": 0.6504048109054565, "learning_rate": 4.988642383669678e-06, "loss": 0.5177, "step": 11397 }, { "epoch": 5.156299479755711, "grad_norm": 0.6224930286407471, "learning_rate": 4.987909635613884e-06, "loss": 0.4968, "step": 11398 }, { "epoch": 5.156751866093644, "grad_norm": 0.6419300436973572, "learning_rate": 4.987176887817755e-06, "loss": 0.5196, "step": 11399 }, { "epoch": 5.157204252431576, "grad_norm": 0.6132867932319641, "learning_rate": 4.986444140297027e-06, "loss": 0.4798, "step": 11400 }, { "epoch": 5.157204252431576, "eval_loss": 0.5920748114585876, "eval_runtime": 26.3664, "eval_samples_per_second": 28.218, "eval_steps_per_second": 7.054, "step": 11400 }, { "epoch": 5.1576566387695095, "grad_norm": 0.6377370953559875, "learning_rate": 4.985711393067437e-06, "loss": 0.5242, "step": 11401 }, { "epoch": 5.158109025107442, "grad_norm": 0.8104985356330872, "learning_rate": 4.984978646144721e-06, "loss": 0.4753, "step": 11402 }, { "epoch": 5.158561411445374, "grad_norm": 0.5156714916229248, "learning_rate": 4.984245899544618e-06, "loss": 0.885, "step": 11403 }, { "epoch": 5.159013797783307, "grad_norm": 0.20342448353767395, "learning_rate": 4.983513153282864e-06, "loss": 0.8151, "step": 11404 }, { "epoch": 5.159466184121239, "grad_norm": 0.3518577218055725, "learning_rate": 4.982780407375198e-06, "loss": 0.7891, "step": 11405 }, { "epoch": 5.159918570459173, "grad_norm": 0.3270866274833679, "learning_rate": 4.982047661837355e-06, "loss": 0.5611, "step": 11406 }, { "epoch": 5.160370956797105, "grad_norm": 0.3713703453540802, "learning_rate": 4.981314916685071e-06, "loss": 0.7371, "step": 11407 }, { "epoch": 5.160823343135037, "grad_norm": 0.331501841545105, "learning_rate": 4.980582171934086e-06, "loss": 0.4811, "step": 11408 }, { "epoch": 5.16127572947297, "grad_norm": 0.4010835886001587, "learning_rate": 4.9798494276001355e-06, "loss": 0.6609, "step": 11409 }, { "epoch": 5.161728115810902, "grad_norm": 0.41787418723106384, "learning_rate": 4.979116683698957e-06, "loss": 0.5279, "step": 11410 }, { "epoch": 5.162180502148835, "grad_norm": 0.36907845735549927, "learning_rate": 4.978383940246287e-06, "loss": 0.5491, "step": 11411 }, { "epoch": 5.162632888486768, "grad_norm": 0.43170881271362305, "learning_rate": 4.977651197257864e-06, "loss": 0.6109, "step": 11412 }, { "epoch": 5.1630852748247005, "grad_norm": 0.4171883761882782, "learning_rate": 4.976918454749423e-06, "loss": 0.5966, "step": 11413 }, { "epoch": 5.163537661162633, "grad_norm": 0.45532652735710144, "learning_rate": 4.976185712736703e-06, "loss": 0.6338, "step": 11414 }, { "epoch": 5.163990047500565, "grad_norm": 0.4469536542892456, "learning_rate": 4.97545297123544e-06, "loss": 0.7443, "step": 11415 }, { "epoch": 5.164442433838498, "grad_norm": 0.37647780776023865, "learning_rate": 4.974720230261373e-06, "loss": 0.4441, "step": 11416 }, { "epoch": 5.164894820176431, "grad_norm": 0.5083298683166504, "learning_rate": 4.973987489830236e-06, "loss": 0.6408, "step": 11417 }, { "epoch": 5.1653472065143635, "grad_norm": 0.48606234788894653, "learning_rate": 4.973254749957767e-06, "loss": 0.6623, "step": 11418 }, { "epoch": 5.165799592852296, "grad_norm": 0.4736420810222626, "learning_rate": 4.972522010659703e-06, "loss": 0.5747, "step": 11419 }, { "epoch": 5.166251979190228, "grad_norm": 0.4644400179386139, "learning_rate": 4.971789271951781e-06, "loss": 0.6174, "step": 11420 }, { "epoch": 5.166704365528161, "grad_norm": 0.4773406386375427, "learning_rate": 4.971056533849739e-06, "loss": 0.6845, "step": 11421 }, { "epoch": 5.167156751866093, "grad_norm": 0.5290935635566711, "learning_rate": 4.970323796369313e-06, "loss": 0.5579, "step": 11422 }, { "epoch": 5.167609138204027, "grad_norm": 0.5356165170669556, "learning_rate": 4.969591059526239e-06, "loss": 0.6302, "step": 11423 }, { "epoch": 5.168061524541959, "grad_norm": 0.47106626629829407, "learning_rate": 4.9688583233362565e-06, "loss": 0.5445, "step": 11424 }, { "epoch": 5.168513910879891, "grad_norm": 0.5113558173179626, "learning_rate": 4.9681255878150995e-06, "loss": 0.5946, "step": 11425 }, { "epoch": 5.168966297217824, "grad_norm": 0.44522830843925476, "learning_rate": 4.967392852978507e-06, "loss": 0.4918, "step": 11426 }, { "epoch": 5.169418683555756, "grad_norm": 0.5061670541763306, "learning_rate": 4.9666601188422165e-06, "loss": 0.6592, "step": 11427 }, { "epoch": 5.16987106989369, "grad_norm": 0.4327445924282074, "learning_rate": 4.965927385421962e-06, "loss": 0.4631, "step": 11428 }, { "epoch": 5.170323456231622, "grad_norm": 0.482847660779953, "learning_rate": 4.965194652733481e-06, "loss": 0.4811, "step": 11429 }, { "epoch": 5.1707758425695545, "grad_norm": 0.4288040101528168, "learning_rate": 4.964461920792512e-06, "loss": 0.4353, "step": 11430 }, { "epoch": 5.171228228907487, "grad_norm": 0.45230022072792053, "learning_rate": 4.963729189614792e-06, "loss": 0.4509, "step": 11431 }, { "epoch": 5.171680615245419, "grad_norm": 0.5205665230751038, "learning_rate": 4.962996459216056e-06, "loss": 0.5227, "step": 11432 }, { "epoch": 5.172133001583352, "grad_norm": 0.5110481381416321, "learning_rate": 4.962263729612041e-06, "loss": 0.4564, "step": 11433 }, { "epoch": 5.172585387921285, "grad_norm": 0.5879647731781006, "learning_rate": 4.961531000818485e-06, "loss": 0.5347, "step": 11434 }, { "epoch": 5.1730377742592175, "grad_norm": 0.45964884757995605, "learning_rate": 4.960798272851124e-06, "loss": 0.4309, "step": 11435 }, { "epoch": 5.17349016059715, "grad_norm": 0.5393146276473999, "learning_rate": 4.960065545725695e-06, "loss": 0.5829, "step": 11436 }, { "epoch": 5.173942546935082, "grad_norm": 0.5240101218223572, "learning_rate": 4.959332819457935e-06, "loss": 0.5268, "step": 11437 }, { "epoch": 5.174394933273015, "grad_norm": 0.5430912971496582, "learning_rate": 4.95860009406358e-06, "loss": 0.5791, "step": 11438 }, { "epoch": 5.174847319610948, "grad_norm": 0.5171389579772949, "learning_rate": 4.957867369558366e-06, "loss": 0.5024, "step": 11439 }, { "epoch": 5.175299705948881, "grad_norm": 0.5693656206130981, "learning_rate": 4.957134645958031e-06, "loss": 0.5391, "step": 11440 }, { "epoch": 5.175752092286813, "grad_norm": 0.5313798189163208, "learning_rate": 4.956401923278311e-06, "loss": 0.5027, "step": 11441 }, { "epoch": 5.176204478624745, "grad_norm": 0.5070506930351257, "learning_rate": 4.955669201534944e-06, "loss": 0.4533, "step": 11442 }, { "epoch": 5.176656864962678, "grad_norm": 0.588525652885437, "learning_rate": 4.954936480743664e-06, "loss": 0.5793, "step": 11443 }, { "epoch": 5.17710925130061, "grad_norm": 0.6135499477386475, "learning_rate": 4.95420376092021e-06, "loss": 0.55, "step": 11444 }, { "epoch": 5.177561637638544, "grad_norm": 0.6704609990119934, "learning_rate": 4.953471042080316e-06, "loss": 0.5275, "step": 11445 }, { "epoch": 5.178014023976476, "grad_norm": 0.5584989786148071, "learning_rate": 4.952738324239722e-06, "loss": 0.4774, "step": 11446 }, { "epoch": 5.1784664103144085, "grad_norm": 0.623134434223175, "learning_rate": 4.952005607414162e-06, "loss": 0.5341, "step": 11447 }, { "epoch": 5.178918796652341, "grad_norm": 0.636631965637207, "learning_rate": 4.951272891619373e-06, "loss": 0.539, "step": 11448 }, { "epoch": 5.179371182990273, "grad_norm": 0.5724186897277832, "learning_rate": 4.950540176871091e-06, "loss": 0.4917, "step": 11449 }, { "epoch": 5.179823569328207, "grad_norm": 0.6110941171646118, "learning_rate": 4.949807463185055e-06, "loss": 0.5044, "step": 11450 }, { "epoch": 5.180275955666139, "grad_norm": 0.6201316118240356, "learning_rate": 4.949074750576997e-06, "loss": 0.5127, "step": 11451 }, { "epoch": 5.1807283420040715, "grad_norm": 0.7628627419471741, "learning_rate": 4.948342039062657e-06, "loss": 0.5406, "step": 11452 }, { "epoch": 5.181180728342004, "grad_norm": 0.4956250488758087, "learning_rate": 4.94760932865777e-06, "loss": 0.949, "step": 11453 }, { "epoch": 5.181633114679936, "grad_norm": 0.2074546217918396, "learning_rate": 4.946876619378072e-06, "loss": 1.0379, "step": 11454 }, { "epoch": 5.18208550101787, "grad_norm": 0.26155388355255127, "learning_rate": 4.9461439112393e-06, "loss": 0.627, "step": 11455 }, { "epoch": 5.182537887355802, "grad_norm": 0.3257097601890564, "learning_rate": 4.94541120425719e-06, "loss": 0.5841, "step": 11456 }, { "epoch": 5.1829902736937346, "grad_norm": 0.33006978034973145, "learning_rate": 4.944678498447479e-06, "loss": 0.51, "step": 11457 }, { "epoch": 5.183442660031667, "grad_norm": 0.3760795295238495, "learning_rate": 4.9439457938259035e-06, "loss": 0.5285, "step": 11458 }, { "epoch": 5.183895046369599, "grad_norm": 0.36477532982826233, "learning_rate": 4.943213090408197e-06, "loss": 0.5894, "step": 11459 }, { "epoch": 5.184347432707532, "grad_norm": 0.38402584195137024, "learning_rate": 4.9424803882101e-06, "loss": 0.5224, "step": 11460 }, { "epoch": 5.184799819045465, "grad_norm": 0.3763779401779175, "learning_rate": 4.941747687247346e-06, "loss": 0.5682, "step": 11461 }, { "epoch": 5.185252205383398, "grad_norm": 0.41705626249313354, "learning_rate": 4.9410149875356705e-06, "loss": 0.5943, "step": 11462 }, { "epoch": 5.18570459172133, "grad_norm": 0.38883379101753235, "learning_rate": 4.94028228909081e-06, "loss": 0.5583, "step": 11463 }, { "epoch": 5.1861569780592625, "grad_norm": 0.4061135947704315, "learning_rate": 4.939549591928502e-06, "loss": 0.6104, "step": 11464 }, { "epoch": 5.186609364397195, "grad_norm": 0.4254995584487915, "learning_rate": 4.9388168960644824e-06, "loss": 0.5706, "step": 11465 }, { "epoch": 5.187061750735128, "grad_norm": 0.42549481987953186, "learning_rate": 4.938084201514485e-06, "loss": 0.4212, "step": 11466 }, { "epoch": 5.187514137073061, "grad_norm": 0.4537915885448456, "learning_rate": 4.9373515082942485e-06, "loss": 0.522, "step": 11467 }, { "epoch": 5.187966523410993, "grad_norm": 0.4179390072822571, "learning_rate": 4.936618816419507e-06, "loss": 0.4902, "step": 11468 }, { "epoch": 5.1884189097489255, "grad_norm": 0.48151615262031555, "learning_rate": 4.935886125905998e-06, "loss": 0.5384, "step": 11469 }, { "epoch": 5.188871296086858, "grad_norm": 0.49757710099220276, "learning_rate": 4.935153436769457e-06, "loss": 0.6706, "step": 11470 }, { "epoch": 5.18932368242479, "grad_norm": 0.45713669061660767, "learning_rate": 4.934420749025619e-06, "loss": 0.6918, "step": 11471 }, { "epoch": 5.189776068762724, "grad_norm": 0.41622644662857056, "learning_rate": 4.933688062690222e-06, "loss": 0.4634, "step": 11472 }, { "epoch": 5.190228455100656, "grad_norm": 0.4608928859233856, "learning_rate": 4.932955377778999e-06, "loss": 0.5234, "step": 11473 }, { "epoch": 5.1906808414385885, "grad_norm": 0.4246842563152313, "learning_rate": 4.932222694307687e-06, "loss": 0.4627, "step": 11474 }, { "epoch": 5.191133227776521, "grad_norm": 0.47424694895744324, "learning_rate": 4.931490012292022e-06, "loss": 0.6378, "step": 11475 }, { "epoch": 5.191585614114453, "grad_norm": 0.5108854174613953, "learning_rate": 4.930757331747739e-06, "loss": 0.578, "step": 11476 }, { "epoch": 5.192038000452387, "grad_norm": 0.48846375942230225, "learning_rate": 4.930024652690575e-06, "loss": 0.524, "step": 11477 }, { "epoch": 5.192490386790319, "grad_norm": 0.4859755039215088, "learning_rate": 4.9292919751362645e-06, "loss": 0.5652, "step": 11478 }, { "epoch": 5.192942773128252, "grad_norm": 0.4657761752605438, "learning_rate": 4.928559299100544e-06, "loss": 0.4397, "step": 11479 }, { "epoch": 5.193395159466184, "grad_norm": 0.5364787578582764, "learning_rate": 4.927826624599148e-06, "loss": 0.6065, "step": 11480 }, { "epoch": 5.1938475458041165, "grad_norm": 0.5182528495788574, "learning_rate": 4.927093951647814e-06, "loss": 0.5373, "step": 11481 }, { "epoch": 5.194299932142049, "grad_norm": 0.49366793036460876, "learning_rate": 4.926361280262275e-06, "loss": 0.4959, "step": 11482 }, { "epoch": 5.194752318479982, "grad_norm": 0.47125861048698425, "learning_rate": 4.9256286104582695e-06, "loss": 0.4691, "step": 11483 }, { "epoch": 5.195204704817915, "grad_norm": 0.5620824098587036, "learning_rate": 4.924895942251532e-06, "loss": 0.6456, "step": 11484 }, { "epoch": 5.195657091155847, "grad_norm": 0.5055307149887085, "learning_rate": 4.924163275657796e-06, "loss": 0.5388, "step": 11485 }, { "epoch": 5.1961094774937795, "grad_norm": 0.518951952457428, "learning_rate": 4.9234306106927984e-06, "loss": 0.587, "step": 11486 }, { "epoch": 5.196561863831712, "grad_norm": 0.48293057084083557, "learning_rate": 4.922697947372273e-06, "loss": 0.4563, "step": 11487 }, { "epoch": 5.197014250169645, "grad_norm": 0.5289762616157532, "learning_rate": 4.921965285711959e-06, "loss": 0.5248, "step": 11488 }, { "epoch": 5.197466636507578, "grad_norm": 0.5515347719192505, "learning_rate": 4.921232625727587e-06, "loss": 0.5411, "step": 11489 }, { "epoch": 5.19791902284551, "grad_norm": 0.5221287608146667, "learning_rate": 4.9204999674348965e-06, "loss": 0.499, "step": 11490 }, { "epoch": 5.1983714091834425, "grad_norm": 0.4889715909957886, "learning_rate": 4.91976731084962e-06, "loss": 0.4547, "step": 11491 }, { "epoch": 5.198823795521375, "grad_norm": 0.546612024307251, "learning_rate": 4.919034655987493e-06, "loss": 0.502, "step": 11492 }, { "epoch": 5.199276181859307, "grad_norm": 0.4972887933254242, "learning_rate": 4.918302002864252e-06, "loss": 0.4255, "step": 11493 }, { "epoch": 5.199728568197241, "grad_norm": 0.5207933187484741, "learning_rate": 4.91756935149563e-06, "loss": 0.5578, "step": 11494 }, { "epoch": 5.200180954535173, "grad_norm": 0.5826173424720764, "learning_rate": 4.9168367018973655e-06, "loss": 0.4673, "step": 11495 }, { "epoch": 5.200633340873106, "grad_norm": 0.47563889622688293, "learning_rate": 4.91610405408519e-06, "loss": 0.3873, "step": 11496 }, { "epoch": 5.201085727211038, "grad_norm": 0.6118376851081848, "learning_rate": 4.91537140807484e-06, "loss": 0.5309, "step": 11497 }, { "epoch": 5.20153811354897, "grad_norm": 0.5992552042007446, "learning_rate": 4.914638763882051e-06, "loss": 0.4371, "step": 11498 }, { "epoch": 5.201990499886904, "grad_norm": 0.5987685322761536, "learning_rate": 4.913906121522556e-06, "loss": 0.4651, "step": 11499 }, { "epoch": 5.202442886224836, "grad_norm": 0.7171632647514343, "learning_rate": 4.913173481012092e-06, "loss": 0.54, "step": 11500 }, { "epoch": 5.202895272562769, "grad_norm": 0.5853802561759949, "learning_rate": 4.912440842366393e-06, "loss": 0.405, "step": 11501 }, { "epoch": 5.203347658900701, "grad_norm": 0.7331240177154541, "learning_rate": 4.911708205601193e-06, "loss": 0.4355, "step": 11502 }, { "epoch": 5.2038000452386335, "grad_norm": 0.47631317377090454, "learning_rate": 4.910975570732228e-06, "loss": 1.0668, "step": 11503 }, { "epoch": 5.204252431576567, "grad_norm": 0.25547635555267334, "learning_rate": 4.910242937775233e-06, "loss": 0.7865, "step": 11504 }, { "epoch": 5.204704817914499, "grad_norm": 0.3352929949760437, "learning_rate": 4.909510306745941e-06, "loss": 0.6954, "step": 11505 }, { "epoch": 5.205157204252432, "grad_norm": 0.33899152278900146, "learning_rate": 4.90877767766009e-06, "loss": 0.5261, "step": 11506 }, { "epoch": 5.205609590590364, "grad_norm": 0.37976008653640747, "learning_rate": 4.908045050533411e-06, "loss": 0.6165, "step": 11507 }, { "epoch": 5.2060619769282965, "grad_norm": 0.4082273542881012, "learning_rate": 4.907312425381639e-06, "loss": 0.539, "step": 11508 }, { "epoch": 5.206514363266229, "grad_norm": 0.38463643193244934, "learning_rate": 4.906579802220511e-06, "loss": 0.4544, "step": 11509 }, { "epoch": 5.206966749604162, "grad_norm": 0.38946738839149475, "learning_rate": 4.905847181065757e-06, "loss": 0.584, "step": 11510 }, { "epoch": 5.207419135942095, "grad_norm": 0.37036600708961487, "learning_rate": 4.905114561933117e-06, "loss": 0.5632, "step": 11511 }, { "epoch": 5.207871522280027, "grad_norm": 0.38310346007347107, "learning_rate": 4.904381944838322e-06, "loss": 0.543, "step": 11512 }, { "epoch": 5.20832390861796, "grad_norm": 0.3568848669528961, "learning_rate": 4.903649329797107e-06, "loss": 0.5101, "step": 11513 }, { "epoch": 5.208776294955892, "grad_norm": 0.4619874656200409, "learning_rate": 4.902916716825206e-06, "loss": 0.5297, "step": 11514 }, { "epoch": 5.209228681293825, "grad_norm": 0.3958577811717987, "learning_rate": 4.902184105938354e-06, "loss": 0.5538, "step": 11515 }, { "epoch": 5.209681067631758, "grad_norm": 0.36965423822402954, "learning_rate": 4.901451497152284e-06, "loss": 0.4303, "step": 11516 }, { "epoch": 5.21013345396969, "grad_norm": 0.4763779938220978, "learning_rate": 4.900718890482731e-06, "loss": 0.6334, "step": 11517 }, { "epoch": 5.210585840307623, "grad_norm": 0.42267054319381714, "learning_rate": 4.899986285945431e-06, "loss": 0.5667, "step": 11518 }, { "epoch": 5.211038226645555, "grad_norm": 0.41151607036590576, "learning_rate": 4.899253683556115e-06, "loss": 0.4958, "step": 11519 }, { "epoch": 5.2114906129834875, "grad_norm": 0.43005090951919556, "learning_rate": 4.898521083330518e-06, "loss": 0.5299, "step": 11520 }, { "epoch": 5.211942999321421, "grad_norm": 0.5162132382392883, "learning_rate": 4.897788485284374e-06, "loss": 0.6306, "step": 11521 }, { "epoch": 5.212395385659353, "grad_norm": 0.45844897627830505, "learning_rate": 4.897055889433418e-06, "loss": 0.5159, "step": 11522 }, { "epoch": 5.212847771997286, "grad_norm": 0.49396058917045593, "learning_rate": 4.896323295793382e-06, "loss": 0.58, "step": 11523 }, { "epoch": 5.213300158335218, "grad_norm": 0.46440550684928894, "learning_rate": 4.895590704380002e-06, "loss": 0.5458, "step": 11524 }, { "epoch": 5.2137525446731505, "grad_norm": 0.5114407539367676, "learning_rate": 4.894858115209011e-06, "loss": 0.5455, "step": 11525 }, { "epoch": 5.214204931011084, "grad_norm": 0.48472142219543457, "learning_rate": 4.894125528296141e-06, "loss": 0.5215, "step": 11526 }, { "epoch": 5.214657317349016, "grad_norm": 0.5066761374473572, "learning_rate": 4.893392943657127e-06, "loss": 0.5445, "step": 11527 }, { "epoch": 5.215109703686949, "grad_norm": 0.4369858503341675, "learning_rate": 4.892660361307704e-06, "loss": 0.4552, "step": 11528 }, { "epoch": 5.215562090024881, "grad_norm": 0.5169899463653564, "learning_rate": 4.891927781263605e-06, "loss": 0.5466, "step": 11529 }, { "epoch": 5.216014476362814, "grad_norm": 0.5035265684127808, "learning_rate": 4.8911952035405615e-06, "loss": 0.5608, "step": 11530 }, { "epoch": 5.216466862700747, "grad_norm": 0.44068631529808044, "learning_rate": 4.890462628154309e-06, "loss": 0.4939, "step": 11531 }, { "epoch": 5.216919249038679, "grad_norm": 0.44151708483695984, "learning_rate": 4.8897300551205805e-06, "loss": 0.4873, "step": 11532 }, { "epoch": 5.217371635376612, "grad_norm": 0.4986400604248047, "learning_rate": 4.888997484455109e-06, "loss": 0.5087, "step": 11533 }, { "epoch": 5.217824021714544, "grad_norm": 0.4905974566936493, "learning_rate": 4.888264916173628e-06, "loss": 0.5562, "step": 11534 }, { "epoch": 5.218276408052477, "grad_norm": 0.47934651374816895, "learning_rate": 4.887532350291871e-06, "loss": 0.4233, "step": 11535 }, { "epoch": 5.218728794390409, "grad_norm": 0.5525146126747131, "learning_rate": 4.886799786825571e-06, "loss": 0.5719, "step": 11536 }, { "epoch": 5.219181180728342, "grad_norm": 0.49034443497657776, "learning_rate": 4.886067225790462e-06, "loss": 0.46, "step": 11537 }, { "epoch": 5.219633567066275, "grad_norm": 0.5379946231842041, "learning_rate": 4.885334667202276e-06, "loss": 0.5354, "step": 11538 }, { "epoch": 5.220085953404207, "grad_norm": 0.5423034429550171, "learning_rate": 4.8846021110767475e-06, "loss": 0.5245, "step": 11539 }, { "epoch": 5.22053833974214, "grad_norm": 0.5655099749565125, "learning_rate": 4.883869557429609e-06, "loss": 0.6322, "step": 11540 }, { "epoch": 5.220990726080072, "grad_norm": 0.5037591457366943, "learning_rate": 4.883137006276592e-06, "loss": 0.4331, "step": 11541 }, { "epoch": 5.221443112418005, "grad_norm": 0.5488454699516296, "learning_rate": 4.882404457633431e-06, "loss": 0.4943, "step": 11542 }, { "epoch": 5.221895498755938, "grad_norm": 0.5211799740791321, "learning_rate": 4.881671911515859e-06, "loss": 0.4526, "step": 11543 }, { "epoch": 5.22234788509387, "grad_norm": 0.5419963598251343, "learning_rate": 4.880939367939607e-06, "loss": 0.5398, "step": 11544 }, { "epoch": 5.222800271431803, "grad_norm": 0.5432047843933105, "learning_rate": 4.88020682692041e-06, "loss": 0.4045, "step": 11545 }, { "epoch": 5.223252657769735, "grad_norm": 0.5726087689399719, "learning_rate": 4.879474288473999e-06, "loss": 0.5149, "step": 11546 }, { "epoch": 5.223705044107668, "grad_norm": 0.5524846315383911, "learning_rate": 4.878741752616108e-06, "loss": 0.4793, "step": 11547 }, { "epoch": 5.224157430445601, "grad_norm": 0.6094163060188293, "learning_rate": 4.878009219362469e-06, "loss": 0.4881, "step": 11548 }, { "epoch": 5.224609816783533, "grad_norm": 0.6436564326286316, "learning_rate": 4.877276688728815e-06, "loss": 0.5503, "step": 11549 }, { "epoch": 5.225062203121466, "grad_norm": 0.6154261231422424, "learning_rate": 4.8765441607308775e-06, "loss": 0.4309, "step": 11550 }, { "epoch": 5.225514589459398, "grad_norm": 0.6461963057518005, "learning_rate": 4.875811635384391e-06, "loss": 0.4841, "step": 11551 }, { "epoch": 5.225966975797331, "grad_norm": 0.6747749447822571, "learning_rate": 4.875079112705085e-06, "loss": 0.5003, "step": 11552 }, { "epoch": 5.226419362135264, "grad_norm": 0.584540069103241, "learning_rate": 4.8743465927086935e-06, "loss": 0.8948, "step": 11553 }, { "epoch": 5.226871748473196, "grad_norm": 0.302324116230011, "learning_rate": 4.873614075410949e-06, "loss": 1.1964, "step": 11554 }, { "epoch": 5.227324134811129, "grad_norm": 0.3240862190723419, "learning_rate": 4.872881560827583e-06, "loss": 0.7026, "step": 11555 }, { "epoch": 5.227776521149061, "grad_norm": 0.3191392719745636, "learning_rate": 4.872149048974327e-06, "loss": 0.6207, "step": 11556 }, { "epoch": 5.228228907486994, "grad_norm": 0.36456605792045593, "learning_rate": 4.871416539866914e-06, "loss": 0.7566, "step": 11557 }, { "epoch": 5.228681293824926, "grad_norm": 0.38580575585365295, "learning_rate": 4.870684033521077e-06, "loss": 0.5432, "step": 11558 }, { "epoch": 5.229133680162859, "grad_norm": 0.3383632004261017, "learning_rate": 4.869951529952546e-06, "loss": 0.516, "step": 11559 }, { "epoch": 5.229586066500792, "grad_norm": 0.3301604688167572, "learning_rate": 4.869219029177054e-06, "loss": 0.427, "step": 11560 }, { "epoch": 5.230038452838724, "grad_norm": 0.415791779756546, "learning_rate": 4.868486531210333e-06, "loss": 0.7038, "step": 11561 }, { "epoch": 5.230490839176657, "grad_norm": 0.400198757648468, "learning_rate": 4.867754036068114e-06, "loss": 0.5442, "step": 11562 }, { "epoch": 5.230943225514589, "grad_norm": 0.40703076124191284, "learning_rate": 4.86702154376613e-06, "loss": 0.5371, "step": 11563 }, { "epoch": 5.231395611852522, "grad_norm": 0.4240109324455261, "learning_rate": 4.866289054320112e-06, "loss": 0.5194, "step": 11564 }, { "epoch": 5.231847998190455, "grad_norm": 0.4289350211620331, "learning_rate": 4.8655565677457915e-06, "loss": 0.5293, "step": 11565 }, { "epoch": 5.232300384528387, "grad_norm": 0.3994061350822449, "learning_rate": 4.8648240840588985e-06, "loss": 0.4745, "step": 11566 }, { "epoch": 5.23275277086632, "grad_norm": 0.4419114589691162, "learning_rate": 4.864091603275167e-06, "loss": 0.5706, "step": 11567 }, { "epoch": 5.233205157204252, "grad_norm": 0.4230806529521942, "learning_rate": 4.863359125410328e-06, "loss": 0.5469, "step": 11568 }, { "epoch": 5.233657543542185, "grad_norm": 0.49692341685295105, "learning_rate": 4.8626266504801116e-06, "loss": 0.6021, "step": 11569 }, { "epoch": 5.234109929880118, "grad_norm": 0.4275183379650116, "learning_rate": 4.86189417850025e-06, "loss": 0.5539, "step": 11570 }, { "epoch": 5.23456231621805, "grad_norm": 0.44598615169525146, "learning_rate": 4.861161709486474e-06, "loss": 0.414, "step": 11571 }, { "epoch": 5.235014702555983, "grad_norm": 0.4219793379306793, "learning_rate": 4.860429243454515e-06, "loss": 0.5527, "step": 11572 }, { "epoch": 5.235467088893915, "grad_norm": 0.465819388628006, "learning_rate": 4.859696780420105e-06, "loss": 0.5227, "step": 11573 }, { "epoch": 5.235919475231848, "grad_norm": 0.4643312096595764, "learning_rate": 4.858964320398974e-06, "loss": 0.5916, "step": 11574 }, { "epoch": 5.236371861569781, "grad_norm": 0.48369941115379333, "learning_rate": 4.858231863406852e-06, "loss": 0.5104, "step": 11575 }, { "epoch": 5.236824247907713, "grad_norm": 0.45210546255111694, "learning_rate": 4.8574994094594725e-06, "loss": 0.6388, "step": 11576 }, { "epoch": 5.237276634245646, "grad_norm": 0.4715324938297272, "learning_rate": 4.856766958572563e-06, "loss": 0.5508, "step": 11577 }, { "epoch": 5.237729020583578, "grad_norm": 0.5186764597892761, "learning_rate": 4.856034510761858e-06, "loss": 0.4945, "step": 11578 }, { "epoch": 5.238181406921511, "grad_norm": 0.5112282037734985, "learning_rate": 4.855302066043085e-06, "loss": 0.5618, "step": 11579 }, { "epoch": 5.238633793259444, "grad_norm": 0.4874875247478485, "learning_rate": 4.854569624431977e-06, "loss": 0.5427, "step": 11580 }, { "epoch": 5.239086179597376, "grad_norm": 0.48272979259490967, "learning_rate": 4.853837185944263e-06, "loss": 0.5177, "step": 11581 }, { "epoch": 5.239538565935309, "grad_norm": 0.5105332136154175, "learning_rate": 4.853104750595674e-06, "loss": 0.5355, "step": 11582 }, { "epoch": 5.239990952273241, "grad_norm": 0.4669618010520935, "learning_rate": 4.852372318401941e-06, "loss": 0.5213, "step": 11583 }, { "epoch": 5.240443338611174, "grad_norm": 0.4841897189617157, "learning_rate": 4.851639889378793e-06, "loss": 0.5165, "step": 11584 }, { "epoch": 5.240895724949106, "grad_norm": 0.5166888236999512, "learning_rate": 4.850907463541963e-06, "loss": 0.4865, "step": 11585 }, { "epoch": 5.2413481112870395, "grad_norm": 0.5521143674850464, "learning_rate": 4.850175040907179e-06, "loss": 0.5865, "step": 11586 }, { "epoch": 5.241800497624972, "grad_norm": 0.5081901550292969, "learning_rate": 4.84944262149017e-06, "loss": 0.502, "step": 11587 }, { "epoch": 5.242252883962904, "grad_norm": 0.4819912314414978, "learning_rate": 4.8487102053066695e-06, "loss": 0.4288, "step": 11588 }, { "epoch": 5.242705270300837, "grad_norm": 0.5208309292793274, "learning_rate": 4.847977792372405e-06, "loss": 0.4776, "step": 11589 }, { "epoch": 5.243157656638769, "grad_norm": 0.5768367648124695, "learning_rate": 4.847245382703107e-06, "loss": 0.553, "step": 11590 }, { "epoch": 5.2436100429767025, "grad_norm": 0.537319540977478, "learning_rate": 4.8465129763145065e-06, "loss": 0.4754, "step": 11591 }, { "epoch": 5.244062429314635, "grad_norm": 0.6245038509368896, "learning_rate": 4.845780573222332e-06, "loss": 0.5481, "step": 11592 }, { "epoch": 5.244514815652567, "grad_norm": 0.5701978206634521, "learning_rate": 4.845048173442313e-06, "loss": 0.4976, "step": 11593 }, { "epoch": 5.2449672019905, "grad_norm": 0.523451030254364, "learning_rate": 4.844315776990181e-06, "loss": 0.3666, "step": 11594 }, { "epoch": 5.245419588328432, "grad_norm": 0.5270732045173645, "learning_rate": 4.843583383881664e-06, "loss": 0.4547, "step": 11595 }, { "epoch": 5.245871974666365, "grad_norm": 0.5746989250183105, "learning_rate": 4.842850994132493e-06, "loss": 0.4564, "step": 11596 }, { "epoch": 5.246324361004298, "grad_norm": 0.5908270478248596, "learning_rate": 4.842118607758396e-06, "loss": 0.4547, "step": 11597 }, { "epoch": 5.24677674734223, "grad_norm": 0.6363797187805176, "learning_rate": 4.841386224775103e-06, "loss": 0.4878, "step": 11598 }, { "epoch": 5.247229133680163, "grad_norm": 0.5872918963432312, "learning_rate": 4.840653845198342e-06, "loss": 0.487, "step": 11599 }, { "epoch": 5.247681520018095, "grad_norm": 0.7048015594482422, "learning_rate": 4.839921469043845e-06, "loss": 0.5126, "step": 11600 }, { "epoch": 5.247681520018095, "eval_loss": 0.5909072160720825, "eval_runtime": 25.8995, "eval_samples_per_second": 28.726, "eval_steps_per_second": 7.182, "step": 11600 }, { "epoch": 5.248133906356028, "grad_norm": 0.6963675022125244, "learning_rate": 4.839189096327339e-06, "loss": 0.4688, "step": 11601 }, { "epoch": 5.248586292693961, "grad_norm": 0.6974955201148987, "learning_rate": 4.838456727064554e-06, "loss": 0.5551, "step": 11602 }, { "epoch": 5.2490386790318935, "grad_norm": 0.4341066777706146, "learning_rate": 4.837724361271218e-06, "loss": 1.0046, "step": 11603 }, { "epoch": 5.249491065369826, "grad_norm": 0.25946587324142456, "learning_rate": 4.836991998963062e-06, "loss": 0.7651, "step": 11604 }, { "epoch": 5.249943451707758, "grad_norm": 0.33230915665626526, "learning_rate": 4.8362596401558124e-06, "loss": 0.6671, "step": 11605 }, { "epoch": 5.250395838045691, "grad_norm": 0.33219707012176514, "learning_rate": 4.835527284865199e-06, "loss": 0.4825, "step": 11606 }, { "epoch": 5.250848224383624, "grad_norm": 0.3610875606536865, "learning_rate": 4.834794933106952e-06, "loss": 0.6611, "step": 11607 }, { "epoch": 5.2513006107215565, "grad_norm": 0.37142083048820496, "learning_rate": 4.8340625848968e-06, "loss": 0.648, "step": 11608 }, { "epoch": 5.251752997059489, "grad_norm": 0.37817248702049255, "learning_rate": 4.833330240250468e-06, "loss": 0.5239, "step": 11609 }, { "epoch": 5.252205383397421, "grad_norm": 0.3540915548801422, "learning_rate": 4.832597899183688e-06, "loss": 0.5006, "step": 11610 }, { "epoch": 5.252657769735354, "grad_norm": 0.3828842043876648, "learning_rate": 4.831865561712186e-06, "loss": 0.6857, "step": 11611 }, { "epoch": 5.253110156073286, "grad_norm": 0.4608341455459595, "learning_rate": 4.831133227851693e-06, "loss": 0.5941, "step": 11612 }, { "epoch": 5.25356254241122, "grad_norm": 0.4309341311454773, "learning_rate": 4.830400897617935e-06, "loss": 0.5584, "step": 11613 }, { "epoch": 5.254014928749152, "grad_norm": 0.423275887966156, "learning_rate": 4.829668571026642e-06, "loss": 0.5435, "step": 11614 }, { "epoch": 5.254467315087084, "grad_norm": 0.4166775345802307, "learning_rate": 4.82893624809354e-06, "loss": 0.4858, "step": 11615 }, { "epoch": 5.254919701425017, "grad_norm": 0.42359426617622375, "learning_rate": 4.828203928834359e-06, "loss": 0.466, "step": 11616 }, { "epoch": 5.255372087762949, "grad_norm": 0.44839000701904297, "learning_rate": 4.827471613264826e-06, "loss": 0.5218, "step": 11617 }, { "epoch": 5.255824474100882, "grad_norm": 0.43325188755989075, "learning_rate": 4.8267393014006695e-06, "loss": 0.49, "step": 11618 }, { "epoch": 5.256276860438815, "grad_norm": 0.5065491795539856, "learning_rate": 4.826006993257617e-06, "loss": 0.6109, "step": 11619 }, { "epoch": 5.2567292467767475, "grad_norm": 0.49763497710227966, "learning_rate": 4.825274688851395e-06, "loss": 0.6744, "step": 11620 }, { "epoch": 5.25718163311468, "grad_norm": 0.5054024457931519, "learning_rate": 4.824542388197732e-06, "loss": 0.4135, "step": 11621 }, { "epoch": 5.257634019452612, "grad_norm": 0.4393102526664734, "learning_rate": 4.8238100913123575e-06, "loss": 0.5135, "step": 11622 }, { "epoch": 5.258086405790545, "grad_norm": 0.4451279640197754, "learning_rate": 4.8230777982109965e-06, "loss": 0.4362, "step": 11623 }, { "epoch": 5.258538792128478, "grad_norm": 0.45198938250541687, "learning_rate": 4.822345508909376e-06, "loss": 0.5015, "step": 11624 }, { "epoch": 5.2589911784664105, "grad_norm": 0.4315536916255951, "learning_rate": 4.821613223423225e-06, "loss": 0.4171, "step": 11625 }, { "epoch": 5.259443564804343, "grad_norm": 0.44859951734542847, "learning_rate": 4.820880941768271e-06, "loss": 0.4959, "step": 11626 }, { "epoch": 5.259895951142275, "grad_norm": 0.4859320819377899, "learning_rate": 4.820148663960239e-06, "loss": 0.4773, "step": 11627 }, { "epoch": 5.260348337480208, "grad_norm": 0.472032368183136, "learning_rate": 4.819416390014858e-06, "loss": 0.4977, "step": 11628 }, { "epoch": 5.260800723818141, "grad_norm": 0.4612728953361511, "learning_rate": 4.818684119947854e-06, "loss": 0.4831, "step": 11629 }, { "epoch": 5.2612531101560736, "grad_norm": 0.48033109307289124, "learning_rate": 4.817951853774954e-06, "loss": 0.5601, "step": 11630 }, { "epoch": 5.261705496494006, "grad_norm": 0.5276734232902527, "learning_rate": 4.817219591511887e-06, "loss": 0.5989, "step": 11631 }, { "epoch": 5.262157882831938, "grad_norm": 0.5200439691543579, "learning_rate": 4.816487333174376e-06, "loss": 0.5663, "step": 11632 }, { "epoch": 5.262610269169871, "grad_norm": 0.4763776957988739, "learning_rate": 4.81575507877815e-06, "loss": 0.4383, "step": 11633 }, { "epoch": 5.263062655507803, "grad_norm": 0.5164760947227478, "learning_rate": 4.815022828338934e-06, "loss": 0.5813, "step": 11634 }, { "epoch": 5.263515041845737, "grad_norm": 0.5175319314002991, "learning_rate": 4.814290581872456e-06, "loss": 0.4916, "step": 11635 }, { "epoch": 5.263967428183669, "grad_norm": 0.4762960970401764, "learning_rate": 4.813558339394443e-06, "loss": 0.4461, "step": 11636 }, { "epoch": 5.2644198145216015, "grad_norm": 0.4601941406726837, "learning_rate": 4.812826100920618e-06, "loss": 0.4695, "step": 11637 }, { "epoch": 5.264872200859534, "grad_norm": 0.5168612003326416, "learning_rate": 4.8120938664667105e-06, "loss": 0.5089, "step": 11638 }, { "epoch": 5.265324587197466, "grad_norm": 0.5997986793518066, "learning_rate": 4.811361636048446e-06, "loss": 0.5522, "step": 11639 }, { "epoch": 5.2657769735354, "grad_norm": 0.6185656189918518, "learning_rate": 4.8106294096815485e-06, "loss": 0.5489, "step": 11640 }, { "epoch": 5.266229359873332, "grad_norm": 0.5896387100219727, "learning_rate": 4.809897187381747e-06, "loss": 0.5336, "step": 11641 }, { "epoch": 5.2666817462112645, "grad_norm": 0.5904713273048401, "learning_rate": 4.809164969164766e-06, "loss": 0.5037, "step": 11642 }, { "epoch": 5.267134132549197, "grad_norm": 0.5620483160018921, "learning_rate": 4.80843275504633e-06, "loss": 0.4271, "step": 11643 }, { "epoch": 5.267586518887129, "grad_norm": 0.49031147360801697, "learning_rate": 4.8077005450421665e-06, "loss": 0.4296, "step": 11644 }, { "epoch": 5.268038905225062, "grad_norm": 0.7034366726875305, "learning_rate": 4.806968339168e-06, "loss": 0.7251, "step": 11645 }, { "epoch": 5.268491291562995, "grad_norm": 0.5484309792518616, "learning_rate": 4.806236137439557e-06, "loss": 0.4941, "step": 11646 }, { "epoch": 5.2689436779009275, "grad_norm": 0.5807098150253296, "learning_rate": 4.805503939872562e-06, "loss": 0.4703, "step": 11647 }, { "epoch": 5.26939606423886, "grad_norm": 0.6072129011154175, "learning_rate": 4.80477174648274e-06, "loss": 0.4737, "step": 11648 }, { "epoch": 5.269848450576792, "grad_norm": 0.6240144968032837, "learning_rate": 4.804039557285818e-06, "loss": 0.5106, "step": 11649 }, { "epoch": 5.270300836914725, "grad_norm": 0.6428773403167725, "learning_rate": 4.803307372297519e-06, "loss": 0.5683, "step": 11650 }, { "epoch": 5.270753223252658, "grad_norm": 0.6272387504577637, "learning_rate": 4.8025751915335705e-06, "loss": 0.4761, "step": 11651 }, { "epoch": 5.271205609590591, "grad_norm": 0.7183663249015808, "learning_rate": 4.801843015009695e-06, "loss": 0.4823, "step": 11652 }, { "epoch": 5.271657995928523, "grad_norm": 0.5466617345809937, "learning_rate": 4.801110842741621e-06, "loss": 1.0343, "step": 11653 }, { "epoch": 5.2721103822664555, "grad_norm": 0.24446268379688263, "learning_rate": 4.800378674745069e-06, "loss": 0.3822, "step": 11654 }, { "epoch": 5.272562768604388, "grad_norm": 0.31848421692848206, "learning_rate": 4.7996465110357656e-06, "loss": 0.4974, "step": 11655 }, { "epoch": 5.273015154942321, "grad_norm": 0.3562057614326477, "learning_rate": 4.798914351629435e-06, "loss": 0.6072, "step": 11656 }, { "epoch": 5.273467541280254, "grad_norm": 0.3748127520084381, "learning_rate": 4.798182196541803e-06, "loss": 0.7151, "step": 11657 }, { "epoch": 5.273919927618186, "grad_norm": 0.4190349280834198, "learning_rate": 4.797450045788592e-06, "loss": 0.6506, "step": 11658 }, { "epoch": 5.2743723139561185, "grad_norm": 0.3894152045249939, "learning_rate": 4.796717899385529e-06, "loss": 0.473, "step": 11659 }, { "epoch": 5.274824700294051, "grad_norm": 0.4144047796726227, "learning_rate": 4.795985757348335e-06, "loss": 0.5781, "step": 11660 }, { "epoch": 5.275277086631983, "grad_norm": 0.45776820182800293, "learning_rate": 4.795253619692737e-06, "loss": 0.6964, "step": 11661 }, { "epoch": 5.275729472969917, "grad_norm": 0.4361797273159027, "learning_rate": 4.7945214864344566e-06, "loss": 0.6004, "step": 11662 }, { "epoch": 5.276181859307849, "grad_norm": 0.42916616797447205, "learning_rate": 4.793789357589221e-06, "loss": 0.6172, "step": 11663 }, { "epoch": 5.2766342456457815, "grad_norm": 0.4631486237049103, "learning_rate": 4.793057233172752e-06, "loss": 0.5896, "step": 11664 }, { "epoch": 5.277086631983714, "grad_norm": 0.47762787342071533, "learning_rate": 4.792325113200772e-06, "loss": 0.4921, "step": 11665 }, { "epoch": 5.277539018321646, "grad_norm": 0.48602503538131714, "learning_rate": 4.791592997689006e-06, "loss": 0.644, "step": 11666 }, { "epoch": 5.277991404659579, "grad_norm": 0.4654475450515747, "learning_rate": 4.790860886653178e-06, "loss": 0.5803, "step": 11667 }, { "epoch": 5.278443790997512, "grad_norm": 0.4644676744937897, "learning_rate": 4.790128780109012e-06, "loss": 0.6006, "step": 11668 }, { "epoch": 5.278896177335445, "grad_norm": 0.4276752173900604, "learning_rate": 4.789396678072231e-06, "loss": 0.4741, "step": 11669 }, { "epoch": 5.279348563673377, "grad_norm": 0.4085302948951721, "learning_rate": 4.788664580558557e-06, "loss": 0.5431, "step": 11670 }, { "epoch": 5.2798009500113094, "grad_norm": 0.42134201526641846, "learning_rate": 4.787932487583713e-06, "loss": 0.4306, "step": 11671 }, { "epoch": 5.280253336349242, "grad_norm": 0.4408581852912903, "learning_rate": 4.787200399163425e-06, "loss": 0.4566, "step": 11672 }, { "epoch": 5.280705722687175, "grad_norm": 0.47220203280448914, "learning_rate": 4.786468315313413e-06, "loss": 0.5243, "step": 11673 }, { "epoch": 5.281158109025108, "grad_norm": 0.437772661447525, "learning_rate": 4.785736236049402e-06, "loss": 0.4213, "step": 11674 }, { "epoch": 5.28161049536304, "grad_norm": 0.5086797475814819, "learning_rate": 4.785004161387113e-06, "loss": 0.5523, "step": 11675 }, { "epoch": 5.2820628817009725, "grad_norm": 0.45358434319496155, "learning_rate": 4.784272091342271e-06, "loss": 0.4572, "step": 11676 }, { "epoch": 5.282515268038905, "grad_norm": 0.4216698110103607, "learning_rate": 4.783540025930596e-06, "loss": 0.4663, "step": 11677 }, { "epoch": 5.282967654376838, "grad_norm": 0.5353884100914001, "learning_rate": 4.782807965167812e-06, "loss": 0.5727, "step": 11678 }, { "epoch": 5.283420040714771, "grad_norm": 0.4968247711658478, "learning_rate": 4.782075909069641e-06, "loss": 0.5113, "step": 11679 }, { "epoch": 5.283872427052703, "grad_norm": 0.5273851156234741, "learning_rate": 4.781343857651804e-06, "loss": 0.5604, "step": 11680 }, { "epoch": 5.2843248133906355, "grad_norm": 0.5277435779571533, "learning_rate": 4.780611810930026e-06, "loss": 0.5578, "step": 11681 }, { "epoch": 5.284777199728568, "grad_norm": 0.5267757773399353, "learning_rate": 4.779879768920027e-06, "loss": 0.5365, "step": 11682 }, { "epoch": 5.2852295860665, "grad_norm": 0.5100605487823486, "learning_rate": 4.7791477316375305e-06, "loss": 0.5183, "step": 11683 }, { "epoch": 5.285681972404434, "grad_norm": 0.6126734614372253, "learning_rate": 4.778415699098257e-06, "loss": 0.628, "step": 11684 }, { "epoch": 5.286134358742366, "grad_norm": 0.49836021661758423, "learning_rate": 4.7776836713179285e-06, "loss": 0.4761, "step": 11685 }, { "epoch": 5.286586745080299, "grad_norm": 0.5330991148948669, "learning_rate": 4.776951648312267e-06, "loss": 0.4941, "step": 11686 }, { "epoch": 5.287039131418231, "grad_norm": 0.5782977938652039, "learning_rate": 4.776219630096996e-06, "loss": 0.5781, "step": 11687 }, { "epoch": 5.287491517756163, "grad_norm": 0.6037880778312683, "learning_rate": 4.775487616687833e-06, "loss": 0.6196, "step": 11688 }, { "epoch": 5.287943904094097, "grad_norm": 0.5434638857841492, "learning_rate": 4.7747556081005025e-06, "loss": 0.5134, "step": 11689 }, { "epoch": 5.288396290432029, "grad_norm": 0.5238656401634216, "learning_rate": 4.774023604350724e-06, "loss": 0.4817, "step": 11690 }, { "epoch": 5.288848676769962, "grad_norm": 0.561734139919281, "learning_rate": 4.77329160545422e-06, "loss": 0.5575, "step": 11691 }, { "epoch": 5.289301063107894, "grad_norm": 0.5205011367797852, "learning_rate": 4.772559611426711e-06, "loss": 0.5597, "step": 11692 }, { "epoch": 5.2897534494458265, "grad_norm": 0.5720407366752625, "learning_rate": 4.771827622283917e-06, "loss": 0.4773, "step": 11693 }, { "epoch": 5.290205835783759, "grad_norm": 0.5833950042724609, "learning_rate": 4.771095638041561e-06, "loss": 0.5225, "step": 11694 }, { "epoch": 5.290658222121692, "grad_norm": 0.5860521793365479, "learning_rate": 4.770363658715363e-06, "loss": 0.553, "step": 11695 }, { "epoch": 5.291110608459625, "grad_norm": 0.6185488700866699, "learning_rate": 4.769631684321042e-06, "loss": 0.4924, "step": 11696 }, { "epoch": 5.291562994797557, "grad_norm": 0.5884940028190613, "learning_rate": 4.76889971487432e-06, "loss": 0.5461, "step": 11697 }, { "epoch": 5.2920153811354895, "grad_norm": 0.5353162884712219, "learning_rate": 4.768167750390919e-06, "loss": 0.4542, "step": 11698 }, { "epoch": 5.292467767473422, "grad_norm": 0.5960798263549805, "learning_rate": 4.767435790886555e-06, "loss": 0.5195, "step": 11699 }, { "epoch": 5.292920153811355, "grad_norm": 0.6545383334159851, "learning_rate": 4.766703836376952e-06, "loss": 0.5241, "step": 11700 }, { "epoch": 5.293372540149288, "grad_norm": 0.648199200630188, "learning_rate": 4.765971886877829e-06, "loss": 0.4864, "step": 11701 }, { "epoch": 5.29382492648722, "grad_norm": 0.6972231864929199, "learning_rate": 4.7652399424049054e-06, "loss": 0.478, "step": 11702 }, { "epoch": 5.294277312825153, "grad_norm": 0.5694881677627563, "learning_rate": 4.764508002973902e-06, "loss": 0.9991, "step": 11703 }, { "epoch": 5.294729699163085, "grad_norm": 0.26450055837631226, "learning_rate": 4.763776068600539e-06, "loss": 0.7721, "step": 11704 }, { "epoch": 5.295182085501018, "grad_norm": 0.33340680599212646, "learning_rate": 4.763044139300534e-06, "loss": 0.732, "step": 11705 }, { "epoch": 5.295634471838951, "grad_norm": 0.36460450291633606, "learning_rate": 4.7623122150896085e-06, "loss": 0.7777, "step": 11706 }, { "epoch": 5.296086858176883, "grad_norm": 0.34640440344810486, "learning_rate": 4.761580295983481e-06, "loss": 0.6062, "step": 11707 }, { "epoch": 5.296539244514816, "grad_norm": 0.3974425792694092, "learning_rate": 4.760848381997872e-06, "loss": 0.7148, "step": 11708 }, { "epoch": 5.296991630852748, "grad_norm": 0.4039503335952759, "learning_rate": 4.760116473148499e-06, "loss": 0.4855, "step": 11709 }, { "epoch": 5.2974440171906805, "grad_norm": 0.39254030585289, "learning_rate": 4.759384569451084e-06, "loss": 0.6294, "step": 11710 }, { "epoch": 5.297896403528614, "grad_norm": 0.4404766261577606, "learning_rate": 4.758652670921342e-06, "loss": 0.5768, "step": 11711 }, { "epoch": 5.298348789866546, "grad_norm": 0.4124361276626587, "learning_rate": 4.757920777574995e-06, "loss": 0.7161, "step": 11712 }, { "epoch": 5.298801176204479, "grad_norm": 0.4404434859752655, "learning_rate": 4.757188889427761e-06, "loss": 0.6661, "step": 11713 }, { "epoch": 5.299253562542411, "grad_norm": 0.41808459162712097, "learning_rate": 4.756457006495358e-06, "loss": 0.6282, "step": 11714 }, { "epoch": 5.2997059488803435, "grad_norm": 0.42965278029441833, "learning_rate": 4.7557251287935044e-06, "loss": 0.5705, "step": 11715 }, { "epoch": 5.300158335218276, "grad_norm": 0.40924355387687683, "learning_rate": 4.754993256337921e-06, "loss": 0.6067, "step": 11716 }, { "epoch": 5.300610721556209, "grad_norm": 0.43209806084632874, "learning_rate": 4.7542613891443225e-06, "loss": 0.4249, "step": 11717 }, { "epoch": 5.301063107894142, "grad_norm": 0.4599345922470093, "learning_rate": 4.753529527228431e-06, "loss": 0.5153, "step": 11718 }, { "epoch": 5.301515494232074, "grad_norm": 0.4216509461402893, "learning_rate": 4.752797670605962e-06, "loss": 0.4605, "step": 11719 }, { "epoch": 5.301967880570007, "grad_norm": 0.4356498718261719, "learning_rate": 4.752065819292634e-06, "loss": 0.5123, "step": 11720 }, { "epoch": 5.302420266907939, "grad_norm": 0.43116477131843567, "learning_rate": 4.751333973304166e-06, "loss": 0.5668, "step": 11721 }, { "epoch": 5.302872653245872, "grad_norm": 0.4864574372768402, "learning_rate": 4.750602132656275e-06, "loss": 0.6466, "step": 11722 }, { "epoch": 5.303325039583805, "grad_norm": 0.4132589101791382, "learning_rate": 4.749870297364678e-06, "loss": 0.6319, "step": 11723 }, { "epoch": 5.303777425921737, "grad_norm": 0.45587918162345886, "learning_rate": 4.7491384674450945e-06, "loss": 0.5563, "step": 11724 }, { "epoch": 5.30422981225967, "grad_norm": 0.5275869965553284, "learning_rate": 4.74840664291324e-06, "loss": 0.6436, "step": 11725 }, { "epoch": 5.304682198597602, "grad_norm": 0.43456193804740906, "learning_rate": 4.747674823784833e-06, "loss": 0.4909, "step": 11726 }, { "epoch": 5.305134584935535, "grad_norm": 0.5391337871551514, "learning_rate": 4.746943010075591e-06, "loss": 0.7726, "step": 11727 }, { "epoch": 5.305586971273468, "grad_norm": 0.5136274099349976, "learning_rate": 4.7462112018012295e-06, "loss": 0.5371, "step": 11728 }, { "epoch": 5.3060393576114, "grad_norm": 0.45826077461242676, "learning_rate": 4.745479398977466e-06, "loss": 0.5177, "step": 11729 }, { "epoch": 5.306491743949333, "grad_norm": 0.5145021677017212, "learning_rate": 4.744747601620018e-06, "loss": 0.454, "step": 11730 }, { "epoch": 5.306944130287265, "grad_norm": 0.4909900426864624, "learning_rate": 4.744015809744601e-06, "loss": 0.514, "step": 11731 }, { "epoch": 5.3073965166251975, "grad_norm": 0.5048425197601318, "learning_rate": 4.743284023366934e-06, "loss": 0.5139, "step": 11732 }, { "epoch": 5.307848902963131, "grad_norm": 0.4636618494987488, "learning_rate": 4.7425522425027305e-06, "loss": 0.4968, "step": 11733 }, { "epoch": 5.308301289301063, "grad_norm": 0.49733254313468933, "learning_rate": 4.741820467167709e-06, "loss": 0.4662, "step": 11734 }, { "epoch": 5.308753675638996, "grad_norm": 0.5387334227561951, "learning_rate": 4.7410886973775855e-06, "loss": 0.5637, "step": 11735 }, { "epoch": 5.309206061976928, "grad_norm": 0.532562255859375, "learning_rate": 4.740356933148076e-06, "loss": 0.5633, "step": 11736 }, { "epoch": 5.309658448314861, "grad_norm": 0.47693586349487305, "learning_rate": 4.739625174494896e-06, "loss": 0.4622, "step": 11737 }, { "epoch": 5.310110834652794, "grad_norm": 0.5341755151748657, "learning_rate": 4.738893421433762e-06, "loss": 0.4721, "step": 11738 }, { "epoch": 5.310563220990726, "grad_norm": 0.5709365606307983, "learning_rate": 4.738161673980391e-06, "loss": 0.5008, "step": 11739 }, { "epoch": 5.311015607328659, "grad_norm": 0.482405424118042, "learning_rate": 4.737429932150496e-06, "loss": 0.4548, "step": 11740 }, { "epoch": 5.311467993666591, "grad_norm": 0.5560604333877563, "learning_rate": 4.736698195959794e-06, "loss": 0.518, "step": 11741 }, { "epoch": 5.311920380004524, "grad_norm": 0.5301807522773743, "learning_rate": 4.735966465423999e-06, "loss": 0.4715, "step": 11742 }, { "epoch": 5.312372766342456, "grad_norm": 0.6156931519508362, "learning_rate": 4.7352347405588285e-06, "loss": 0.4903, "step": 11743 }, { "epoch": 5.312825152680389, "grad_norm": 0.5691151022911072, "learning_rate": 4.734503021379997e-06, "loss": 0.5704, "step": 11744 }, { "epoch": 5.313277539018322, "grad_norm": 0.5903979539871216, "learning_rate": 4.73377130790322e-06, "loss": 0.5026, "step": 11745 }, { "epoch": 5.313729925356254, "grad_norm": 0.554105281829834, "learning_rate": 4.733039600144211e-06, "loss": 0.4988, "step": 11746 }, { "epoch": 5.314182311694187, "grad_norm": 0.6212618947029114, "learning_rate": 4.732307898118685e-06, "loss": 0.4775, "step": 11747 }, { "epoch": 5.314634698032119, "grad_norm": 0.5773317813873291, "learning_rate": 4.7315762018423574e-06, "loss": 0.464, "step": 11748 }, { "epoch": 5.315087084370052, "grad_norm": 0.7014885544776917, "learning_rate": 4.7308445113309424e-06, "loss": 0.5375, "step": 11749 }, { "epoch": 5.315539470707985, "grad_norm": 0.6677647829055786, "learning_rate": 4.730112826600156e-06, "loss": 0.5965, "step": 11750 }, { "epoch": 5.315991857045917, "grad_norm": 0.6894204020500183, "learning_rate": 4.7293811476657115e-06, "loss": 0.5416, "step": 11751 }, { "epoch": 5.31644424338385, "grad_norm": 0.7112298011779785, "learning_rate": 4.728649474543321e-06, "loss": 0.4762, "step": 11752 }, { "epoch": 5.316896629721782, "grad_norm": 0.469687819480896, "learning_rate": 4.7279178072487016e-06, "loss": 0.9276, "step": 11753 }, { "epoch": 5.317349016059715, "grad_norm": 0.2509496212005615, "learning_rate": 4.727186145797566e-06, "loss": 0.8632, "step": 11754 }, { "epoch": 5.317801402397648, "grad_norm": 0.3727937936782837, "learning_rate": 4.726454490205628e-06, "loss": 0.6728, "step": 11755 }, { "epoch": 5.31825378873558, "grad_norm": 0.3080894947052002, "learning_rate": 4.7257228404886e-06, "loss": 0.5289, "step": 11756 }, { "epoch": 5.318706175073513, "grad_norm": 0.40942278504371643, "learning_rate": 4.724991196662198e-06, "loss": 0.6857, "step": 11757 }, { "epoch": 5.319158561411445, "grad_norm": 0.3767319917678833, "learning_rate": 4.7242595587421346e-06, "loss": 0.6217, "step": 11758 }, { "epoch": 5.319610947749378, "grad_norm": 0.3498280644416809, "learning_rate": 4.723527926744122e-06, "loss": 0.5249, "step": 11759 }, { "epoch": 5.320063334087311, "grad_norm": 0.2966021001338959, "learning_rate": 4.722796300683875e-06, "loss": 0.4068, "step": 11760 }, { "epoch": 5.320515720425243, "grad_norm": 0.4373069703578949, "learning_rate": 4.722064680577108e-06, "loss": 0.6271, "step": 11761 }, { "epoch": 5.320968106763176, "grad_norm": 0.4183332622051239, "learning_rate": 4.72133306643953e-06, "loss": 0.5887, "step": 11762 }, { "epoch": 5.321420493101108, "grad_norm": 0.4080407917499542, "learning_rate": 4.7206014582868555e-06, "loss": 0.588, "step": 11763 }, { "epoch": 5.321872879439041, "grad_norm": 0.4395535886287689, "learning_rate": 4.719869856134797e-06, "loss": 0.5845, "step": 11764 }, { "epoch": 5.322325265776973, "grad_norm": 0.4194961488246918, "learning_rate": 4.719138259999068e-06, "loss": 0.623, "step": 11765 }, { "epoch": 5.322777652114906, "grad_norm": 0.4187387526035309, "learning_rate": 4.71840666989538e-06, "loss": 0.5226, "step": 11766 }, { "epoch": 5.323230038452839, "grad_norm": 0.44159621000289917, "learning_rate": 4.717675085839446e-06, "loss": 0.5909, "step": 11767 }, { "epoch": 5.323682424790771, "grad_norm": 0.44801124930381775, "learning_rate": 4.716943507846978e-06, "loss": 0.5563, "step": 11768 }, { "epoch": 5.324134811128704, "grad_norm": 0.4526616930961609, "learning_rate": 4.716211935933687e-06, "loss": 0.5093, "step": 11769 }, { "epoch": 5.324587197466636, "grad_norm": 0.45532625913619995, "learning_rate": 4.715480370115287e-06, "loss": 0.5095, "step": 11770 }, { "epoch": 5.325039583804569, "grad_norm": 0.4740544557571411, "learning_rate": 4.714748810407487e-06, "loss": 0.5588, "step": 11771 }, { "epoch": 5.325491970142502, "grad_norm": 0.447907030582428, "learning_rate": 4.714017256826002e-06, "loss": 0.5532, "step": 11772 }, { "epoch": 5.325944356480434, "grad_norm": 0.5320127606391907, "learning_rate": 4.713285709386542e-06, "loss": 0.6245, "step": 11773 }, { "epoch": 5.326396742818367, "grad_norm": 0.42173174023628235, "learning_rate": 4.712554168104816e-06, "loss": 0.4636, "step": 11774 }, { "epoch": 5.326849129156299, "grad_norm": 0.46697136759757996, "learning_rate": 4.711822632996538e-06, "loss": 0.4992, "step": 11775 }, { "epoch": 5.3273015154942325, "grad_norm": 0.4523071348667145, "learning_rate": 4.711091104077418e-06, "loss": 0.4884, "step": 11776 }, { "epoch": 5.327753901832165, "grad_norm": 0.4628504514694214, "learning_rate": 4.710359581363168e-06, "loss": 0.5216, "step": 11777 }, { "epoch": 5.328206288170097, "grad_norm": 0.4274104833602905, "learning_rate": 4.709628064869498e-06, "loss": 0.4245, "step": 11778 }, { "epoch": 5.32865867450803, "grad_norm": 0.46445897221565247, "learning_rate": 4.708896554612119e-06, "loss": 0.499, "step": 11779 }, { "epoch": 5.329111060845962, "grad_norm": 0.5138843655586243, "learning_rate": 4.708165050606741e-06, "loss": 0.5868, "step": 11780 }, { "epoch": 5.329563447183895, "grad_norm": 0.4188888370990753, "learning_rate": 4.707433552869076e-06, "loss": 0.3231, "step": 11781 }, { "epoch": 5.330015833521828, "grad_norm": 0.5321623682975769, "learning_rate": 4.706702061414833e-06, "loss": 0.6878, "step": 11782 }, { "epoch": 5.33046821985976, "grad_norm": 0.4935370087623596, "learning_rate": 4.705970576259721e-06, "loss": 0.5003, "step": 11783 }, { "epoch": 5.330920606197693, "grad_norm": 0.4881081283092499, "learning_rate": 4.705239097419454e-06, "loss": 0.4761, "step": 11784 }, { "epoch": 5.331372992535625, "grad_norm": 0.5012545585632324, "learning_rate": 4.704507624909738e-06, "loss": 0.5314, "step": 11785 }, { "epoch": 5.331825378873558, "grad_norm": 0.5103927850723267, "learning_rate": 4.703776158746284e-06, "loss": 0.4866, "step": 11786 }, { "epoch": 5.332277765211491, "grad_norm": 0.4955727756023407, "learning_rate": 4.7030446989448016e-06, "loss": 0.5133, "step": 11787 }, { "epoch": 5.332730151549423, "grad_norm": 0.5659480094909668, "learning_rate": 4.702313245521001e-06, "loss": 0.575, "step": 11788 }, { "epoch": 5.333182537887356, "grad_norm": 0.6043127179145813, "learning_rate": 4.701581798490591e-06, "loss": 0.5379, "step": 11789 }, { "epoch": 5.333634924225288, "grad_norm": 0.5707105994224548, "learning_rate": 4.700850357869281e-06, "loss": 0.5244, "step": 11790 }, { "epoch": 5.334087310563221, "grad_norm": 0.5681530237197876, "learning_rate": 4.700118923672779e-06, "loss": 0.4838, "step": 11791 }, { "epoch": 5.334539696901153, "grad_norm": 0.5323688983917236, "learning_rate": 4.699387495916796e-06, "loss": 0.4151, "step": 11792 }, { "epoch": 5.3349920832390865, "grad_norm": 0.6262521147727966, "learning_rate": 4.698656074617038e-06, "loss": 0.5377, "step": 11793 }, { "epoch": 5.335444469577019, "grad_norm": 0.6031540632247925, "learning_rate": 4.697924659789217e-06, "loss": 0.5841, "step": 11794 }, { "epoch": 5.335896855914951, "grad_norm": 0.6189396381378174, "learning_rate": 4.69719325144904e-06, "loss": 0.4831, "step": 11795 }, { "epoch": 5.336349242252884, "grad_norm": 0.5864156484603882, "learning_rate": 4.696461849612215e-06, "loss": 0.4393, "step": 11796 }, { "epoch": 5.336801628590816, "grad_norm": 0.5444428324699402, "learning_rate": 4.69573045429445e-06, "loss": 0.4936, "step": 11797 }, { "epoch": 5.3372540149287495, "grad_norm": 0.6226509809494019, "learning_rate": 4.694999065511453e-06, "loss": 0.5412, "step": 11798 }, { "epoch": 5.337706401266682, "grad_norm": 0.5485130548477173, "learning_rate": 4.694267683278934e-06, "loss": 0.4842, "step": 11799 }, { "epoch": 5.338158787604614, "grad_norm": 0.6055540442466736, "learning_rate": 4.6935363076125985e-06, "loss": 0.4657, "step": 11800 }, { "epoch": 5.338158787604614, "eval_loss": 0.5904834866523743, "eval_runtime": 25.9748, "eval_samples_per_second": 28.643, "eval_steps_per_second": 7.161, "step": 11800 }, { "epoch": 5.338611173942547, "grad_norm": 0.6598460078239441, "learning_rate": 4.692804938528156e-06, "loss": 0.4962, "step": 11801 }, { "epoch": 5.339063560280479, "grad_norm": 0.68326336145401, "learning_rate": 4.692073576041312e-06, "loss": 0.5408, "step": 11802 }, { "epoch": 5.339515946618413, "grad_norm": 0.47592103481292725, "learning_rate": 4.691342220167775e-06, "loss": 0.9573, "step": 11803 }, { "epoch": 5.339968332956345, "grad_norm": 0.2868410646915436, "learning_rate": 4.690610870923253e-06, "loss": 1.0862, "step": 11804 }, { "epoch": 5.340420719294277, "grad_norm": 0.28638505935668945, "learning_rate": 4.689879528323452e-06, "loss": 0.4937, "step": 11805 }, { "epoch": 5.34087310563221, "grad_norm": 0.3387026786804199, "learning_rate": 4.6891481923840795e-06, "loss": 0.5926, "step": 11806 }, { "epoch": 5.341325491970142, "grad_norm": 0.35295021533966064, "learning_rate": 4.688416863120843e-06, "loss": 0.5108, "step": 11807 }, { "epoch": 5.341777878308075, "grad_norm": 0.3718339204788208, "learning_rate": 4.687685540549449e-06, "loss": 0.6416, "step": 11808 }, { "epoch": 5.342230264646008, "grad_norm": 0.3642714023590088, "learning_rate": 4.686954224685602e-06, "loss": 0.5926, "step": 11809 }, { "epoch": 5.3426826509839405, "grad_norm": 0.328040212392807, "learning_rate": 4.68622291554501e-06, "loss": 0.4687, "step": 11810 }, { "epoch": 5.343135037321873, "grad_norm": 0.38276827335357666, "learning_rate": 4.685491613143379e-06, "loss": 0.6464, "step": 11811 }, { "epoch": 5.343587423659805, "grad_norm": 0.3954848051071167, "learning_rate": 4.684760317496416e-06, "loss": 0.6002, "step": 11812 }, { "epoch": 5.344039809997738, "grad_norm": 0.3289879858493805, "learning_rate": 4.684029028619826e-06, "loss": 0.4518, "step": 11813 }, { "epoch": 5.344492196335671, "grad_norm": 0.4187178313732147, "learning_rate": 4.683297746529314e-06, "loss": 0.521, "step": 11814 }, { "epoch": 5.3449445826736035, "grad_norm": 0.4278266727924347, "learning_rate": 4.682566471240589e-06, "loss": 0.4958, "step": 11815 }, { "epoch": 5.345396969011536, "grad_norm": 0.507861316204071, "learning_rate": 4.681835202769352e-06, "loss": 0.6897, "step": 11816 }, { "epoch": 5.345849355349468, "grad_norm": 0.3627113103866577, "learning_rate": 4.681103941131311e-06, "loss": 0.4711, "step": 11817 }, { "epoch": 5.346301741687401, "grad_norm": 0.46992629766464233, "learning_rate": 4.680372686342173e-06, "loss": 0.6492, "step": 11818 }, { "epoch": 5.346754128025333, "grad_norm": 0.4632377028465271, "learning_rate": 4.6796414384176396e-06, "loss": 0.5926, "step": 11819 }, { "epoch": 5.3472065143632665, "grad_norm": 0.45063894987106323, "learning_rate": 4.678910197373416e-06, "loss": 0.5852, "step": 11820 }, { "epoch": 5.347658900701199, "grad_norm": 0.3841070830821991, "learning_rate": 4.678178963225209e-06, "loss": 0.5401, "step": 11821 }, { "epoch": 5.348111287039131, "grad_norm": 0.42813947796821594, "learning_rate": 4.6774477359887224e-06, "loss": 0.4292, "step": 11822 }, { "epoch": 5.348563673377064, "grad_norm": 0.5096036195755005, "learning_rate": 4.6767165156796604e-06, "loss": 0.5253, "step": 11823 }, { "epoch": 5.349016059714996, "grad_norm": 0.46804317831993103, "learning_rate": 4.6759853023137275e-06, "loss": 0.5923, "step": 11824 }, { "epoch": 5.34946844605293, "grad_norm": 0.45031681656837463, "learning_rate": 4.6752540959066285e-06, "loss": 0.4124, "step": 11825 }, { "epoch": 5.349920832390862, "grad_norm": 0.5272080302238464, "learning_rate": 4.6745228964740665e-06, "loss": 0.5985, "step": 11826 }, { "epoch": 5.3503732187287945, "grad_norm": 0.4464329183101654, "learning_rate": 4.673791704031746e-06, "loss": 0.456, "step": 11827 }, { "epoch": 5.350825605066727, "grad_norm": 0.4868883192539215, "learning_rate": 4.6730605185953705e-06, "loss": 0.4425, "step": 11828 }, { "epoch": 5.351277991404659, "grad_norm": 0.46014419198036194, "learning_rate": 4.672329340180645e-06, "loss": 0.5171, "step": 11829 }, { "epoch": 5.351730377742593, "grad_norm": 0.41373389959335327, "learning_rate": 4.6715981688032706e-06, "loss": 0.4377, "step": 11830 }, { "epoch": 5.352182764080525, "grad_norm": 0.5021573305130005, "learning_rate": 4.670867004478952e-06, "loss": 0.5847, "step": 11831 }, { "epoch": 5.3526351504184575, "grad_norm": 0.5217959880828857, "learning_rate": 4.670135847223391e-06, "loss": 0.5228, "step": 11832 }, { "epoch": 5.35308753675639, "grad_norm": 0.47759681940078735, "learning_rate": 4.6694046970522926e-06, "loss": 0.4422, "step": 11833 }, { "epoch": 5.353539923094322, "grad_norm": 0.4824780523777008, "learning_rate": 4.668673553981358e-06, "loss": 0.5012, "step": 11834 }, { "epoch": 5.353992309432255, "grad_norm": 0.5478725433349609, "learning_rate": 4.66794241802629e-06, "loss": 0.4915, "step": 11835 }, { "epoch": 5.354444695770188, "grad_norm": 0.4834541976451874, "learning_rate": 4.667211289202793e-06, "loss": 0.4182, "step": 11836 }, { "epoch": 5.3548970821081205, "grad_norm": 0.5928758978843689, "learning_rate": 4.666480167526567e-06, "loss": 0.6165, "step": 11837 }, { "epoch": 5.355349468446053, "grad_norm": 0.5254517197608948, "learning_rate": 4.665749053013316e-06, "loss": 0.5057, "step": 11838 }, { "epoch": 5.355801854783985, "grad_norm": 0.48656389117240906, "learning_rate": 4.6650179456787415e-06, "loss": 0.4481, "step": 11839 }, { "epoch": 5.356254241121918, "grad_norm": 0.5380948185920715, "learning_rate": 4.664286845538546e-06, "loss": 0.5201, "step": 11840 }, { "epoch": 5.35670662745985, "grad_norm": 0.5640146136283875, "learning_rate": 4.66355575260843e-06, "loss": 0.5473, "step": 11841 }, { "epoch": 5.357159013797784, "grad_norm": 0.574850857257843, "learning_rate": 4.662824666904095e-06, "loss": 0.5797, "step": 11842 }, { "epoch": 5.357611400135716, "grad_norm": 0.574211597442627, "learning_rate": 4.662093588441243e-06, "loss": 0.4874, "step": 11843 }, { "epoch": 5.3580637864736484, "grad_norm": 0.5683149695396423, "learning_rate": 4.661362517235576e-06, "loss": 0.4997, "step": 11844 }, { "epoch": 5.358516172811581, "grad_norm": 0.5842364430427551, "learning_rate": 4.660631453302794e-06, "loss": 0.5314, "step": 11845 }, { "epoch": 5.358968559149513, "grad_norm": 0.5129888653755188, "learning_rate": 4.6599003966586e-06, "loss": 0.4433, "step": 11846 }, { "epoch": 5.359420945487447, "grad_norm": 0.5586932897567749, "learning_rate": 4.659169347318692e-06, "loss": 0.5204, "step": 11847 }, { "epoch": 5.359873331825379, "grad_norm": 0.558185338973999, "learning_rate": 4.658438305298772e-06, "loss": 0.4814, "step": 11848 }, { "epoch": 5.3603257181633115, "grad_norm": 0.6026747822761536, "learning_rate": 4.657707270614541e-06, "loss": 0.434, "step": 11849 }, { "epoch": 5.360778104501244, "grad_norm": 0.6513333916664124, "learning_rate": 4.656976243281699e-06, "loss": 0.521, "step": 11850 }, { "epoch": 5.361230490839176, "grad_norm": 0.681979238986969, "learning_rate": 4.6562452233159465e-06, "loss": 0.4986, "step": 11851 }, { "epoch": 5.36168287717711, "grad_norm": 0.7303825616836548, "learning_rate": 4.6555142107329844e-06, "loss": 0.585, "step": 11852 }, { "epoch": 5.362135263515042, "grad_norm": 0.44803470373153687, "learning_rate": 4.65478320554851e-06, "loss": 1.0661, "step": 11853 }, { "epoch": 5.3625876498529745, "grad_norm": 0.2692503035068512, "learning_rate": 4.654052207778224e-06, "loss": 0.4084, "step": 11854 }, { "epoch": 5.363040036190907, "grad_norm": 0.33714163303375244, "learning_rate": 4.653321217437827e-06, "loss": 0.5782, "step": 11855 }, { "epoch": 5.363492422528839, "grad_norm": 0.3319648802280426, "learning_rate": 4.652590234543018e-06, "loss": 0.5301, "step": 11856 }, { "epoch": 5.363944808866772, "grad_norm": 0.3918002247810364, "learning_rate": 4.651859259109496e-06, "loss": 0.6438, "step": 11857 }, { "epoch": 5.364397195204705, "grad_norm": 0.38748496770858765, "learning_rate": 4.65112829115296e-06, "loss": 0.6419, "step": 11858 }, { "epoch": 5.364849581542638, "grad_norm": 0.40749597549438477, "learning_rate": 4.650397330689109e-06, "loss": 0.6253, "step": 11859 }, { "epoch": 5.36530196788057, "grad_norm": 0.3773185908794403, "learning_rate": 4.649666377733642e-06, "loss": 0.7076, "step": 11860 }, { "epoch": 5.365754354218502, "grad_norm": 0.4350575804710388, "learning_rate": 4.648935432302258e-06, "loss": 0.6707, "step": 11861 }, { "epoch": 5.366206740556435, "grad_norm": 0.3747543394565582, "learning_rate": 4.6482044944106545e-06, "loss": 0.5962, "step": 11862 }, { "epoch": 5.366659126894368, "grad_norm": 0.39287662506103516, "learning_rate": 4.647473564074531e-06, "loss": 0.528, "step": 11863 }, { "epoch": 5.367111513232301, "grad_norm": 0.3710073232650757, "learning_rate": 4.646742641309584e-06, "loss": 0.5162, "step": 11864 }, { "epoch": 5.367563899570233, "grad_norm": 0.38070061802864075, "learning_rate": 4.646011726131512e-06, "loss": 0.4784, "step": 11865 }, { "epoch": 5.3680162859081655, "grad_norm": 0.39604833722114563, "learning_rate": 4.6452808185560135e-06, "loss": 0.4696, "step": 11866 }, { "epoch": 5.368468672246098, "grad_norm": 0.4226330518722534, "learning_rate": 4.6445499185987856e-06, "loss": 0.6459, "step": 11867 }, { "epoch": 5.36892105858403, "grad_norm": 0.37581923604011536, "learning_rate": 4.643819026275526e-06, "loss": 0.4913, "step": 11868 }, { "epoch": 5.369373444921964, "grad_norm": 0.4503959119319916, "learning_rate": 4.643088141601932e-06, "loss": 0.5745, "step": 11869 }, { "epoch": 5.369825831259896, "grad_norm": 0.4558180868625641, "learning_rate": 4.6423572645937e-06, "loss": 0.5599, "step": 11870 }, { "epoch": 5.3702782175978285, "grad_norm": 0.5313894748687744, "learning_rate": 4.641626395266529e-06, "loss": 0.6115, "step": 11871 }, { "epoch": 5.370730603935761, "grad_norm": 0.414660781621933, "learning_rate": 4.640895533636113e-06, "loss": 0.4613, "step": 11872 }, { "epoch": 5.371182990273693, "grad_norm": 0.47522830963134766, "learning_rate": 4.640164679718151e-06, "loss": 0.5048, "step": 11873 }, { "epoch": 5.371635376611627, "grad_norm": 0.5098809599876404, "learning_rate": 4.639433833528339e-06, "loss": 0.6478, "step": 11874 }, { "epoch": 5.372087762949559, "grad_norm": 0.4564526379108429, "learning_rate": 4.638702995082372e-06, "loss": 0.5273, "step": 11875 }, { "epoch": 5.372540149287492, "grad_norm": 0.4268338084220886, "learning_rate": 4.637972164395948e-06, "loss": 0.4477, "step": 11876 }, { "epoch": 5.372992535625424, "grad_norm": 0.4859517216682434, "learning_rate": 4.6372413414847605e-06, "loss": 0.6606, "step": 11877 }, { "epoch": 5.373444921963356, "grad_norm": 0.4784713387489319, "learning_rate": 4.636510526364508e-06, "loss": 0.4957, "step": 11878 }, { "epoch": 5.37389730830129, "grad_norm": 0.466874897480011, "learning_rate": 4.635779719050884e-06, "loss": 0.4805, "step": 11879 }, { "epoch": 5.374349694639222, "grad_norm": 0.4854165315628052, "learning_rate": 4.635048919559585e-06, "loss": 0.5149, "step": 11880 }, { "epoch": 5.374802080977155, "grad_norm": 0.4990781247615814, "learning_rate": 4.634318127906307e-06, "loss": 0.4854, "step": 11881 }, { "epoch": 5.375254467315087, "grad_norm": 0.48672232031822205, "learning_rate": 4.633587344106744e-06, "loss": 0.5657, "step": 11882 }, { "epoch": 5.3757068536530195, "grad_norm": 0.4895070493221283, "learning_rate": 4.632856568176592e-06, "loss": 0.5097, "step": 11883 }, { "epoch": 5.376159239990952, "grad_norm": 0.49501746892929077, "learning_rate": 4.632125800131545e-06, "loss": 0.5447, "step": 11884 }, { "epoch": 5.376611626328885, "grad_norm": 0.42612606287002563, "learning_rate": 4.631395039987297e-06, "loss": 0.3801, "step": 11885 }, { "epoch": 5.377064012666818, "grad_norm": 0.4639551043510437, "learning_rate": 4.630664287759546e-06, "loss": 0.3979, "step": 11886 }, { "epoch": 5.37751639900475, "grad_norm": 0.5734292268753052, "learning_rate": 4.629933543463981e-06, "loss": 0.553, "step": 11887 }, { "epoch": 5.3779687853426825, "grad_norm": 0.507258415222168, "learning_rate": 4.629202807116299e-06, "loss": 0.4837, "step": 11888 }, { "epoch": 5.378421171680615, "grad_norm": 0.4659164249897003, "learning_rate": 4.6284720787321935e-06, "loss": 0.4531, "step": 11889 }, { "epoch": 5.378873558018547, "grad_norm": 0.5402355790138245, "learning_rate": 4.627741358327359e-06, "loss": 0.5502, "step": 11890 }, { "epoch": 5.379325944356481, "grad_norm": 0.49646973609924316, "learning_rate": 4.627010645917489e-06, "loss": 0.4497, "step": 11891 }, { "epoch": 5.379778330694413, "grad_norm": 0.5327067375183105, "learning_rate": 4.626279941518275e-06, "loss": 0.4946, "step": 11892 }, { "epoch": 5.380230717032346, "grad_norm": 0.575542151927948, "learning_rate": 4.625549245145413e-06, "loss": 0.5708, "step": 11893 }, { "epoch": 5.380683103370278, "grad_norm": 0.5692362785339355, "learning_rate": 4.624818556814595e-06, "loss": 0.4492, "step": 11894 }, { "epoch": 5.38113548970821, "grad_norm": 0.5611720681190491, "learning_rate": 4.624087876541513e-06, "loss": 0.4633, "step": 11895 }, { "epoch": 5.381587876046144, "grad_norm": 0.5729765892028809, "learning_rate": 4.6233572043418615e-06, "loss": 0.4974, "step": 11896 }, { "epoch": 5.382040262384076, "grad_norm": 0.647585928440094, "learning_rate": 4.622626540231333e-06, "loss": 0.4473, "step": 11897 }, { "epoch": 5.382492648722009, "grad_norm": 0.6106710433959961, "learning_rate": 4.621895884225617e-06, "loss": 0.4554, "step": 11898 }, { "epoch": 5.382945035059941, "grad_norm": 0.7463243007659912, "learning_rate": 4.6211652363404086e-06, "loss": 0.6083, "step": 11899 }, { "epoch": 5.3833974213978735, "grad_norm": 0.7023301720619202, "learning_rate": 4.6204345965914e-06, "loss": 0.5231, "step": 11900 }, { "epoch": 5.383849807735807, "grad_norm": 0.5419899225234985, "learning_rate": 4.619703964994281e-06, "loss": 0.3829, "step": 11901 }, { "epoch": 5.384302194073739, "grad_norm": 0.6446231603622437, "learning_rate": 4.618973341564744e-06, "loss": 0.442, "step": 11902 }, { "epoch": 5.384754580411672, "grad_norm": 0.4746617376804352, "learning_rate": 4.618242726318481e-06, "loss": 0.9702, "step": 11903 }, { "epoch": 5.385206966749604, "grad_norm": 0.2692151963710785, "learning_rate": 4.617512119271184e-06, "loss": 0.8329, "step": 11904 }, { "epoch": 5.3856593530875365, "grad_norm": 0.3055379390716553, "learning_rate": 4.616781520438543e-06, "loss": 0.649, "step": 11905 }, { "epoch": 5.386111739425469, "grad_norm": 0.3534815013408661, "learning_rate": 4.61605092983625e-06, "loss": 0.6295, "step": 11906 }, { "epoch": 5.386564125763402, "grad_norm": 0.3399909436702728, "learning_rate": 4.615320347479995e-06, "loss": 0.5109, "step": 11907 }, { "epoch": 5.387016512101335, "grad_norm": 0.38373005390167236, "learning_rate": 4.61458977338547e-06, "loss": 0.6868, "step": 11908 }, { "epoch": 5.387468898439267, "grad_norm": 0.378460168838501, "learning_rate": 4.613859207568364e-06, "loss": 0.597, "step": 11909 }, { "epoch": 5.3879212847772, "grad_norm": 0.3450831472873688, "learning_rate": 4.613128650044367e-06, "loss": 0.5506, "step": 11910 }, { "epoch": 5.388373671115132, "grad_norm": 0.3842152953147888, "learning_rate": 4.61239810082917e-06, "loss": 0.6093, "step": 11911 }, { "epoch": 5.388826057453065, "grad_norm": 0.39579370617866516, "learning_rate": 4.611667559938463e-06, "loss": 0.4893, "step": 11912 }, { "epoch": 5.389278443790998, "grad_norm": 0.3658047616481781, "learning_rate": 4.610937027387936e-06, "loss": 0.4358, "step": 11913 }, { "epoch": 5.38973083012893, "grad_norm": 0.44411394000053406, "learning_rate": 4.610206503193277e-06, "loss": 0.5322, "step": 11914 }, { "epoch": 5.390183216466863, "grad_norm": 0.41826343536376953, "learning_rate": 4.609475987370177e-06, "loss": 0.5906, "step": 11915 }, { "epoch": 5.390635602804795, "grad_norm": 0.4489770531654358, "learning_rate": 4.608745479934325e-06, "loss": 0.6348, "step": 11916 }, { "epoch": 5.3910879891427275, "grad_norm": 0.385219931602478, "learning_rate": 4.608014980901409e-06, "loss": 0.4734, "step": 11917 }, { "epoch": 5.391540375480661, "grad_norm": 0.4358963370323181, "learning_rate": 4.60728449028712e-06, "loss": 0.5728, "step": 11918 }, { "epoch": 5.391992761818593, "grad_norm": 0.4487397372722626, "learning_rate": 4.6065540081071455e-06, "loss": 0.5719, "step": 11919 }, { "epoch": 5.392445148156526, "grad_norm": 0.40170639753341675, "learning_rate": 4.605823534377173e-06, "loss": 0.5449, "step": 11920 }, { "epoch": 5.392897534494458, "grad_norm": 0.5115556716918945, "learning_rate": 4.605093069112892e-06, "loss": 0.5881, "step": 11921 }, { "epoch": 5.3933499208323905, "grad_norm": 0.44628846645355225, "learning_rate": 4.6043626123299894e-06, "loss": 0.5295, "step": 11922 }, { "epoch": 5.393802307170324, "grad_norm": 0.4862349033355713, "learning_rate": 4.603632164044155e-06, "loss": 0.5488, "step": 11923 }, { "epoch": 5.394254693508256, "grad_norm": 0.456013947725296, "learning_rate": 4.602901724271075e-06, "loss": 0.4926, "step": 11924 }, { "epoch": 5.394707079846189, "grad_norm": 0.5605694651603699, "learning_rate": 4.602171293026437e-06, "loss": 0.4617, "step": 11925 }, { "epoch": 5.395159466184121, "grad_norm": 0.46838316321372986, "learning_rate": 4.60144087032593e-06, "loss": 0.5523, "step": 11926 }, { "epoch": 5.3956118525220536, "grad_norm": 0.48592615127563477, "learning_rate": 4.60071045618524e-06, "loss": 0.5706, "step": 11927 }, { "epoch": 5.396064238859987, "grad_norm": 0.5163471698760986, "learning_rate": 4.599980050620055e-06, "loss": 0.6519, "step": 11928 }, { "epoch": 5.396516625197919, "grad_norm": 0.4498104155063629, "learning_rate": 4.5992496536460595e-06, "loss": 0.423, "step": 11929 }, { "epoch": 5.396969011535852, "grad_norm": 0.4773201048374176, "learning_rate": 4.598519265278942e-06, "loss": 0.5493, "step": 11930 }, { "epoch": 5.397421397873784, "grad_norm": 0.44499656558036804, "learning_rate": 4.59778888553439e-06, "loss": 0.4262, "step": 11931 }, { "epoch": 5.397873784211717, "grad_norm": 0.4768975079059601, "learning_rate": 4.597058514428088e-06, "loss": 0.4615, "step": 11932 }, { "epoch": 5.398326170549649, "grad_norm": 0.5045543909072876, "learning_rate": 4.596328151975723e-06, "loss": 0.4684, "step": 11933 }, { "epoch": 5.398778556887582, "grad_norm": 0.540963888168335, "learning_rate": 4.59559779819298e-06, "loss": 0.5191, "step": 11934 }, { "epoch": 5.399230943225515, "grad_norm": 0.5428220629692078, "learning_rate": 4.594867453095545e-06, "loss": 0.6482, "step": 11935 }, { "epoch": 5.399683329563447, "grad_norm": 0.5242665410041809, "learning_rate": 4.594137116699105e-06, "loss": 0.5325, "step": 11936 }, { "epoch": 5.40013571590138, "grad_norm": 0.535467267036438, "learning_rate": 4.593406789019343e-06, "loss": 0.5348, "step": 11937 }, { "epoch": 5.400588102239312, "grad_norm": 0.49106886982917786, "learning_rate": 4.592676470071946e-06, "loss": 0.4893, "step": 11938 }, { "epoch": 5.4010404885772445, "grad_norm": 0.5434895157814026, "learning_rate": 4.5919461598725975e-06, "loss": 0.5897, "step": 11939 }, { "epoch": 5.401492874915178, "grad_norm": 0.5488916635513306, "learning_rate": 4.591215858436984e-06, "loss": 0.4824, "step": 11940 }, { "epoch": 5.40194526125311, "grad_norm": 0.5951372981071472, "learning_rate": 4.5904855657807894e-06, "loss": 0.5871, "step": 11941 }, { "epoch": 5.402397647591043, "grad_norm": 0.5593612194061279, "learning_rate": 4.5897552819196994e-06, "loss": 0.4876, "step": 11942 }, { "epoch": 5.402850033928975, "grad_norm": 0.6198183298110962, "learning_rate": 4.589025006869395e-06, "loss": 0.5226, "step": 11943 }, { "epoch": 5.4033024202669075, "grad_norm": 0.5210949778556824, "learning_rate": 4.588294740645563e-06, "loss": 0.4361, "step": 11944 }, { "epoch": 5.403754806604841, "grad_norm": 0.6055765151977539, "learning_rate": 4.587564483263886e-06, "loss": 0.6381, "step": 11945 }, { "epoch": 5.404207192942773, "grad_norm": 0.5566126704216003, "learning_rate": 4.586834234740049e-06, "loss": 0.4878, "step": 11946 }, { "epoch": 5.404659579280706, "grad_norm": 0.5839557647705078, "learning_rate": 4.586103995089733e-06, "loss": 0.5242, "step": 11947 }, { "epoch": 5.405111965618638, "grad_norm": 0.608671247959137, "learning_rate": 4.585373764328624e-06, "loss": 0.5303, "step": 11948 }, { "epoch": 5.405564351956571, "grad_norm": 0.6411340832710266, "learning_rate": 4.584643542472403e-06, "loss": 0.5564, "step": 11949 }, { "epoch": 5.406016738294504, "grad_norm": 0.87263023853302, "learning_rate": 4.5839133295367544e-06, "loss": 0.5788, "step": 11950 }, { "epoch": 5.406469124632436, "grad_norm": 0.6668619513511658, "learning_rate": 4.5831831255373595e-06, "loss": 0.5072, "step": 11951 }, { "epoch": 5.406921510970369, "grad_norm": 0.7258706092834473, "learning_rate": 4.5824529304899026e-06, "loss": 0.5223, "step": 11952 }, { "epoch": 5.407373897308301, "grad_norm": 0.5739092230796814, "learning_rate": 4.5817227444100665e-06, "loss": 1.0074, "step": 11953 }, { "epoch": 5.407826283646234, "grad_norm": 0.30979734659194946, "learning_rate": 4.580992567313531e-06, "loss": 0.5429, "step": 11954 }, { "epoch": 5.408278669984166, "grad_norm": 0.3380374014377594, "learning_rate": 4.580262399215978e-06, "loss": 0.6314, "step": 11955 }, { "epoch": 5.408731056322099, "grad_norm": 0.349404513835907, "learning_rate": 4.579532240133091e-06, "loss": 0.5399, "step": 11956 }, { "epoch": 5.409183442660032, "grad_norm": 0.36090540885925293, "learning_rate": 4.57880209008055e-06, "loss": 0.654, "step": 11957 }, { "epoch": 5.409635828997964, "grad_norm": 0.36574259400367737, "learning_rate": 4.5780719490740375e-06, "loss": 0.564, "step": 11958 }, { "epoch": 5.410088215335897, "grad_norm": 0.3133825957775116, "learning_rate": 4.577341817129235e-06, "loss": 0.4619, "step": 11959 }, { "epoch": 5.410540601673829, "grad_norm": 0.3786498010158539, "learning_rate": 4.576611694261822e-06, "loss": 0.6625, "step": 11960 }, { "epoch": 5.410992988011762, "grad_norm": 0.3378855586051941, "learning_rate": 4.57588158048748e-06, "loss": 0.4972, "step": 11961 }, { "epoch": 5.411445374349695, "grad_norm": 0.4180574417114258, "learning_rate": 4.5751514758218906e-06, "loss": 0.6388, "step": 11962 }, { "epoch": 5.411897760687627, "grad_norm": 0.43174007534980774, "learning_rate": 4.574421380280733e-06, "loss": 0.5671, "step": 11963 }, { "epoch": 5.41235014702556, "grad_norm": 0.4496487081050873, "learning_rate": 4.573691293879687e-06, "loss": 0.6369, "step": 11964 }, { "epoch": 5.412802533363492, "grad_norm": 0.38715860247612, "learning_rate": 4.572961216634435e-06, "loss": 0.4794, "step": 11965 }, { "epoch": 5.413254919701425, "grad_norm": 0.43408989906311035, "learning_rate": 4.5722311485606544e-06, "loss": 0.5318, "step": 11966 }, { "epoch": 5.413707306039358, "grad_norm": 0.479142427444458, "learning_rate": 4.571501089674025e-06, "loss": 0.5981, "step": 11967 }, { "epoch": 5.41415969237729, "grad_norm": 0.43533268570899963, "learning_rate": 4.570771039990226e-06, "loss": 0.6204, "step": 11968 }, { "epoch": 5.414612078715223, "grad_norm": 0.4508664608001709, "learning_rate": 4.570040999524938e-06, "loss": 0.5104, "step": 11969 }, { "epoch": 5.415064465053155, "grad_norm": 0.474536657333374, "learning_rate": 4.5693109682938395e-06, "loss": 0.6867, "step": 11970 }, { "epoch": 5.415516851391088, "grad_norm": 0.43575137853622437, "learning_rate": 4.568580946312608e-06, "loss": 0.5116, "step": 11971 }, { "epoch": 5.415969237729021, "grad_norm": 0.47652900218963623, "learning_rate": 4.567850933596924e-06, "loss": 0.5687, "step": 11972 }, { "epoch": 5.416421624066953, "grad_norm": 0.4468309283256531, "learning_rate": 4.567120930162466e-06, "loss": 0.4496, "step": 11973 }, { "epoch": 5.416874010404886, "grad_norm": 0.5208975672721863, "learning_rate": 4.5663909360249106e-06, "loss": 0.5318, "step": 11974 }, { "epoch": 5.417326396742818, "grad_norm": 0.46486255526542664, "learning_rate": 4.565660951199936e-06, "loss": 0.5125, "step": 11975 }, { "epoch": 5.417778783080751, "grad_norm": 0.4584416449069977, "learning_rate": 4.564930975703222e-06, "loss": 0.477, "step": 11976 }, { "epoch": 5.418231169418684, "grad_norm": 0.5060110688209534, "learning_rate": 4.564201009550444e-06, "loss": 0.7169, "step": 11977 }, { "epoch": 5.418683555756616, "grad_norm": 0.5113174915313721, "learning_rate": 4.56347105275728e-06, "loss": 0.5477, "step": 11978 }, { "epoch": 5.419135942094549, "grad_norm": 0.5336417555809021, "learning_rate": 4.562741105339408e-06, "loss": 0.5701, "step": 11979 }, { "epoch": 5.419588328432481, "grad_norm": 0.4735555350780487, "learning_rate": 4.5620111673125036e-06, "loss": 0.5342, "step": 11980 }, { "epoch": 5.420040714770414, "grad_norm": 0.5287697911262512, "learning_rate": 4.561281238692245e-06, "loss": 0.5673, "step": 11981 }, { "epoch": 5.420493101108346, "grad_norm": 0.480523020029068, "learning_rate": 4.560551319494308e-06, "loss": 0.4835, "step": 11982 }, { "epoch": 5.4209454874462795, "grad_norm": 0.49670401215553284, "learning_rate": 4.55982140973437e-06, "loss": 0.4348, "step": 11983 }, { "epoch": 5.421397873784212, "grad_norm": 0.6179952621459961, "learning_rate": 4.559091509428105e-06, "loss": 0.602, "step": 11984 }, { "epoch": 5.421850260122144, "grad_norm": 0.5230650305747986, "learning_rate": 4.5583616185911915e-06, "loss": 0.4496, "step": 11985 }, { "epoch": 5.422302646460077, "grad_norm": 0.5577432513237, "learning_rate": 4.557631737239304e-06, "loss": 0.5306, "step": 11986 }, { "epoch": 5.422755032798009, "grad_norm": 0.5632128119468689, "learning_rate": 4.55690186538812e-06, "loss": 0.5431, "step": 11987 }, { "epoch": 5.4232074191359425, "grad_norm": 0.5047194361686707, "learning_rate": 4.556172003053311e-06, "loss": 0.3899, "step": 11988 }, { "epoch": 5.423659805473875, "grad_norm": 0.6191260814666748, "learning_rate": 4.555442150250555e-06, "loss": 0.5761, "step": 11989 }, { "epoch": 5.424112191811807, "grad_norm": 0.4949401915073395, "learning_rate": 4.554712306995527e-06, "loss": 0.4641, "step": 11990 }, { "epoch": 5.42456457814974, "grad_norm": 0.5123977065086365, "learning_rate": 4.5539824733039e-06, "loss": 0.4831, "step": 11991 }, { "epoch": 5.425016964487672, "grad_norm": 0.6344079375267029, "learning_rate": 4.553252649191351e-06, "loss": 0.6041, "step": 11992 }, { "epoch": 5.425469350825605, "grad_norm": 0.5315906405448914, "learning_rate": 4.5525228346735535e-06, "loss": 0.4808, "step": 11993 }, { "epoch": 5.425921737163538, "grad_norm": 0.5499710440635681, "learning_rate": 4.55179302976618e-06, "loss": 0.4529, "step": 11994 }, { "epoch": 5.42637412350147, "grad_norm": 0.5773559808731079, "learning_rate": 4.551063234484907e-06, "loss": 0.5293, "step": 11995 }, { "epoch": 5.426826509839403, "grad_norm": 0.5711239576339722, "learning_rate": 4.5503334488454054e-06, "loss": 0.4553, "step": 11996 }, { "epoch": 5.427278896177335, "grad_norm": 0.6461744904518127, "learning_rate": 4.549603672863351e-06, "loss": 0.5436, "step": 11997 }, { "epoch": 5.427731282515268, "grad_norm": 0.7167932987213135, "learning_rate": 4.548873906554417e-06, "loss": 0.6138, "step": 11998 }, { "epoch": 5.428183668853201, "grad_norm": 0.6789971590042114, "learning_rate": 4.548144149934276e-06, "loss": 0.572, "step": 11999 }, { "epoch": 5.4286360551911335, "grad_norm": 0.632144033908844, "learning_rate": 4.547414403018601e-06, "loss": 0.427, "step": 12000 }, { "epoch": 5.4286360551911335, "eval_loss": 0.5912463068962097, "eval_runtime": 26.5362, "eval_samples_per_second": 28.037, "eval_steps_per_second": 7.009, "step": 12000 }, { "epoch": 5.429088441529066, "grad_norm": 0.5825356245040894, "learning_rate": 4.546684665823064e-06, "loss": 0.4169, "step": 12001 }, { "epoch": 5.429540827866998, "grad_norm": 0.6522572636604309, "learning_rate": 4.545954938363338e-06, "loss": 0.4197, "step": 12002 }, { "epoch": 5.429993214204931, "grad_norm": 0.5454887747764587, "learning_rate": 4.545225220655096e-06, "loss": 1.165, "step": 12003 }, { "epoch": 5.430445600542864, "grad_norm": 0.18524283170700073, "learning_rate": 4.544495512714009e-06, "loss": 0.5832, "step": 12004 }, { "epoch": 5.4308979868807965, "grad_norm": 0.3089163899421692, "learning_rate": 4.543765814555749e-06, "loss": 0.614, "step": 12005 }, { "epoch": 5.431350373218729, "grad_norm": 0.3105488419532776, "learning_rate": 4.5430361261959885e-06, "loss": 0.6108, "step": 12006 }, { "epoch": 5.431802759556661, "grad_norm": 0.3516070246696472, "learning_rate": 4.542306447650398e-06, "loss": 0.4765, "step": 12007 }, { "epoch": 5.432255145894594, "grad_norm": 0.37842416763305664, "learning_rate": 4.54157677893465e-06, "loss": 0.4869, "step": 12008 }, { "epoch": 5.432707532232526, "grad_norm": 0.3894091844558716, "learning_rate": 4.540847120064414e-06, "loss": 0.5274, "step": 12009 }, { "epoch": 5.4331599185704595, "grad_norm": 0.40441977977752686, "learning_rate": 4.540117471055363e-06, "loss": 0.4872, "step": 12010 }, { "epoch": 5.433612304908392, "grad_norm": 0.40378981828689575, "learning_rate": 4.539387831923164e-06, "loss": 0.6226, "step": 12011 }, { "epoch": 5.434064691246324, "grad_norm": 0.4212174713611603, "learning_rate": 4.53865820268349e-06, "loss": 0.6603, "step": 12012 }, { "epoch": 5.434517077584257, "grad_norm": 0.4681386947631836, "learning_rate": 4.537928583352012e-06, "loss": 0.6923, "step": 12013 }, { "epoch": 5.434969463922189, "grad_norm": 0.4259994328022003, "learning_rate": 4.537198973944398e-06, "loss": 0.4988, "step": 12014 }, { "epoch": 5.435421850260122, "grad_norm": 0.4359464943408966, "learning_rate": 4.536469374476318e-06, "loss": 0.5815, "step": 12015 }, { "epoch": 5.435874236598055, "grad_norm": 0.4349699318408966, "learning_rate": 4.535739784963442e-06, "loss": 0.5959, "step": 12016 }, { "epoch": 5.4363266229359875, "grad_norm": 0.5231044292449951, "learning_rate": 4.535010205421439e-06, "loss": 0.4781, "step": 12017 }, { "epoch": 5.43677900927392, "grad_norm": 0.482532262802124, "learning_rate": 4.5342806358659795e-06, "loss": 0.6074, "step": 12018 }, { "epoch": 5.437231395611852, "grad_norm": 0.4680534601211548, "learning_rate": 4.53355107631273e-06, "loss": 0.5572, "step": 12019 }, { "epoch": 5.437683781949785, "grad_norm": 0.46020349860191345, "learning_rate": 4.5328215267773615e-06, "loss": 0.4741, "step": 12020 }, { "epoch": 5.438136168287718, "grad_norm": 0.4439718723297119, "learning_rate": 4.5320919872755415e-06, "loss": 0.5458, "step": 12021 }, { "epoch": 5.4385885546256505, "grad_norm": 0.5073080658912659, "learning_rate": 4.531362457822937e-06, "loss": 0.5972, "step": 12022 }, { "epoch": 5.439040940963583, "grad_norm": 0.46889442205429077, "learning_rate": 4.530632938435218e-06, "loss": 0.5534, "step": 12023 }, { "epoch": 5.439493327301515, "grad_norm": 0.4613328278064728, "learning_rate": 4.529903429128052e-06, "loss": 0.5469, "step": 12024 }, { "epoch": 5.439945713639448, "grad_norm": 0.4856417775154114, "learning_rate": 4.529173929917106e-06, "loss": 0.5509, "step": 12025 }, { "epoch": 5.440398099977381, "grad_norm": 0.4709959626197815, "learning_rate": 4.528444440818047e-06, "loss": 0.5173, "step": 12026 }, { "epoch": 5.4408504863153135, "grad_norm": 0.46487492322921753, "learning_rate": 4.527714961846543e-06, "loss": 0.5262, "step": 12027 }, { "epoch": 5.441302872653246, "grad_norm": 0.44885003566741943, "learning_rate": 4.52698549301826e-06, "loss": 0.4807, "step": 12028 }, { "epoch": 5.441755258991178, "grad_norm": 0.5564752221107483, "learning_rate": 4.526256034348866e-06, "loss": 0.6422, "step": 12029 }, { "epoch": 5.442207645329111, "grad_norm": 0.5156490206718445, "learning_rate": 4.525526585854026e-06, "loss": 0.5564, "step": 12030 }, { "epoch": 5.442660031667043, "grad_norm": 0.5251885652542114, "learning_rate": 4.524797147549409e-06, "loss": 0.6234, "step": 12031 }, { "epoch": 5.443112418004977, "grad_norm": 0.46530231833457947, "learning_rate": 4.524067719450679e-06, "loss": 0.5268, "step": 12032 }, { "epoch": 5.443564804342909, "grad_norm": 0.5296192765235901, "learning_rate": 4.523338301573502e-06, "loss": 0.451, "step": 12033 }, { "epoch": 5.4440171906808414, "grad_norm": 0.5518990755081177, "learning_rate": 4.522608893933543e-06, "loss": 0.5983, "step": 12034 }, { "epoch": 5.444469577018774, "grad_norm": 0.5161492228507996, "learning_rate": 4.5218794965464676e-06, "loss": 0.4968, "step": 12035 }, { "epoch": 5.444921963356706, "grad_norm": 0.48175883293151855, "learning_rate": 4.521150109427943e-06, "loss": 0.4866, "step": 12036 }, { "epoch": 5.44537434969464, "grad_norm": 0.5466291904449463, "learning_rate": 4.520420732593632e-06, "loss": 0.5358, "step": 12037 }, { "epoch": 5.445826736032572, "grad_norm": 0.4994567930698395, "learning_rate": 4.5196913660592e-06, "loss": 0.3983, "step": 12038 }, { "epoch": 5.4462791223705045, "grad_norm": 0.5925190448760986, "learning_rate": 4.518962009840313e-06, "loss": 0.5704, "step": 12039 }, { "epoch": 5.446731508708437, "grad_norm": 0.5689609050750732, "learning_rate": 4.518232663952632e-06, "loss": 0.5106, "step": 12040 }, { "epoch": 5.447183895046369, "grad_norm": 0.5246638655662537, "learning_rate": 4.517503328411825e-06, "loss": 0.5067, "step": 12041 }, { "epoch": 5.447636281384302, "grad_norm": 0.6246397495269775, "learning_rate": 4.516774003233553e-06, "loss": 0.524, "step": 12042 }, { "epoch": 5.448088667722235, "grad_norm": 0.6222879886627197, "learning_rate": 4.51604468843348e-06, "loss": 0.5571, "step": 12043 }, { "epoch": 5.4485410540601675, "grad_norm": 0.5901985168457031, "learning_rate": 4.515315384027272e-06, "loss": 0.5205, "step": 12044 }, { "epoch": 5.4489934403981, "grad_norm": 0.6069658994674683, "learning_rate": 4.514586090030589e-06, "loss": 0.5469, "step": 12045 }, { "epoch": 5.449445826736032, "grad_norm": 0.6060872077941895, "learning_rate": 4.513856806459095e-06, "loss": 0.4735, "step": 12046 }, { "epoch": 5.449898213073965, "grad_norm": 0.6137279272079468, "learning_rate": 4.513127533328453e-06, "loss": 0.4843, "step": 12047 }, { "epoch": 5.450350599411898, "grad_norm": 0.6192247271537781, "learning_rate": 4.512398270654325e-06, "loss": 0.4865, "step": 12048 }, { "epoch": 5.450802985749831, "grad_norm": 0.6843118667602539, "learning_rate": 4.5116690184523745e-06, "loss": 0.6092, "step": 12049 }, { "epoch": 5.451255372087763, "grad_norm": 0.686653733253479, "learning_rate": 4.5109397767382635e-06, "loss": 0.4822, "step": 12050 }, { "epoch": 5.451707758425695, "grad_norm": 0.6377617716789246, "learning_rate": 4.5102105455276525e-06, "loss": 0.5066, "step": 12051 }, { "epoch": 5.452160144763628, "grad_norm": 0.7791646122932434, "learning_rate": 4.509481324836204e-06, "loss": 0.4715, "step": 12052 }, { "epoch": 5.452612531101561, "grad_norm": 0.5244854688644409, "learning_rate": 4.508752114679578e-06, "loss": 1.1632, "step": 12053 }, { "epoch": 5.453064917439494, "grad_norm": 0.23541945219039917, "learning_rate": 4.508022915073439e-06, "loss": 0.5139, "step": 12054 }, { "epoch": 5.453517303777426, "grad_norm": 0.3245640695095062, "learning_rate": 4.507293726033446e-06, "loss": 0.6333, "step": 12055 }, { "epoch": 5.4539696901153585, "grad_norm": 0.3882005214691162, "learning_rate": 4.506564547575259e-06, "loss": 0.586, "step": 12056 }, { "epoch": 5.454422076453291, "grad_norm": 0.32435959577560425, "learning_rate": 4.505835379714539e-06, "loss": 0.5373, "step": 12057 }, { "epoch": 5.454874462791223, "grad_norm": 0.41498589515686035, "learning_rate": 4.505106222466946e-06, "loss": 0.743, "step": 12058 }, { "epoch": 5.455326849129157, "grad_norm": 0.362608939409256, "learning_rate": 4.50437707584814e-06, "loss": 0.6054, "step": 12059 }, { "epoch": 5.455779235467089, "grad_norm": 0.41803795099258423, "learning_rate": 4.503647939873782e-06, "loss": 0.5537, "step": 12060 }, { "epoch": 5.4562316218050215, "grad_norm": 0.40346983075141907, "learning_rate": 4.502918814559531e-06, "loss": 0.5584, "step": 12061 }, { "epoch": 5.456684008142954, "grad_norm": 0.4507363736629486, "learning_rate": 4.5021896999210466e-06, "loss": 0.7004, "step": 12062 }, { "epoch": 5.457136394480886, "grad_norm": 0.3988656997680664, "learning_rate": 4.5014605959739866e-06, "loss": 0.5223, "step": 12063 }, { "epoch": 5.457588780818819, "grad_norm": 0.41327667236328125, "learning_rate": 4.500731502734011e-06, "loss": 0.5307, "step": 12064 }, { "epoch": 5.458041167156752, "grad_norm": 0.41610774397850037, "learning_rate": 4.500002420216778e-06, "loss": 0.53, "step": 12065 }, { "epoch": 5.458493553494685, "grad_norm": 0.47894883155822754, "learning_rate": 4.499273348437948e-06, "loss": 0.5883, "step": 12066 }, { "epoch": 5.458945939832617, "grad_norm": 0.4121507704257965, "learning_rate": 4.498544287413176e-06, "loss": 0.4856, "step": 12067 }, { "epoch": 5.459398326170549, "grad_norm": 0.3951616883277893, "learning_rate": 4.497815237158122e-06, "loss": 0.3936, "step": 12068 }, { "epoch": 5.459850712508482, "grad_norm": 0.4177693724632263, "learning_rate": 4.4970861976884425e-06, "loss": 0.4478, "step": 12069 }, { "epoch": 5.460303098846415, "grad_norm": 0.4567721486091614, "learning_rate": 4.4963571690197975e-06, "loss": 0.6203, "step": 12070 }, { "epoch": 5.460755485184348, "grad_norm": 0.4937022626399994, "learning_rate": 4.49562815116784e-06, "loss": 0.5856, "step": 12071 }, { "epoch": 5.46120787152228, "grad_norm": 0.4214365482330322, "learning_rate": 4.494899144148232e-06, "loss": 0.4987, "step": 12072 }, { "epoch": 5.4616602578602125, "grad_norm": 0.5307006239891052, "learning_rate": 4.494170147976627e-06, "loss": 0.6446, "step": 12073 }, { "epoch": 5.462112644198145, "grad_norm": 0.4891810715198517, "learning_rate": 4.493441162668682e-06, "loss": 0.5565, "step": 12074 }, { "epoch": 5.462565030536078, "grad_norm": 0.43320539593696594, "learning_rate": 4.492712188240053e-06, "loss": 0.4811, "step": 12075 }, { "epoch": 5.463017416874011, "grad_norm": 0.4461999833583832, "learning_rate": 4.491983224706399e-06, "loss": 0.5746, "step": 12076 }, { "epoch": 5.463469803211943, "grad_norm": 0.49900245666503906, "learning_rate": 4.491254272083373e-06, "loss": 0.5248, "step": 12077 }, { "epoch": 5.4639221895498755, "grad_norm": 0.47037121653556824, "learning_rate": 4.490525330386632e-06, "loss": 0.4761, "step": 12078 }, { "epoch": 5.464374575887808, "grad_norm": 0.4546007513999939, "learning_rate": 4.489796399631831e-06, "loss": 0.5086, "step": 12079 }, { "epoch": 5.46482696222574, "grad_norm": 0.41565975546836853, "learning_rate": 4.489067479834624e-06, "loss": 0.4273, "step": 12080 }, { "epoch": 5.465279348563674, "grad_norm": 0.5555649399757385, "learning_rate": 4.488338571010668e-06, "loss": 0.5744, "step": 12081 }, { "epoch": 5.465731734901606, "grad_norm": 0.5018900632858276, "learning_rate": 4.487609673175616e-06, "loss": 0.466, "step": 12082 }, { "epoch": 5.466184121239539, "grad_norm": 0.5121945738792419, "learning_rate": 4.486880786345123e-06, "loss": 0.542, "step": 12083 }, { "epoch": 5.466636507577471, "grad_norm": 0.507585883140564, "learning_rate": 4.486151910534844e-06, "loss": 0.5089, "step": 12084 }, { "epoch": 5.467088893915403, "grad_norm": 0.6016163229942322, "learning_rate": 4.485423045760432e-06, "loss": 0.6541, "step": 12085 }, { "epoch": 5.467541280253337, "grad_norm": 0.4724242389202118, "learning_rate": 4.484694192037542e-06, "loss": 0.4181, "step": 12086 }, { "epoch": 5.467993666591269, "grad_norm": 0.5300173759460449, "learning_rate": 4.483965349381826e-06, "loss": 0.4749, "step": 12087 }, { "epoch": 5.468446052929202, "grad_norm": 0.5617977976799011, "learning_rate": 4.483236517808938e-06, "loss": 0.5233, "step": 12088 }, { "epoch": 5.468898439267134, "grad_norm": 0.5088356733322144, "learning_rate": 4.482507697334532e-06, "loss": 0.4661, "step": 12089 }, { "epoch": 5.4693508256050665, "grad_norm": 0.6092919707298279, "learning_rate": 4.48177888797426e-06, "loss": 0.542, "step": 12090 }, { "epoch": 5.469803211942999, "grad_norm": 0.5517929196357727, "learning_rate": 4.481050089743773e-06, "loss": 0.4993, "step": 12091 }, { "epoch": 5.470255598280932, "grad_norm": 0.532884418964386, "learning_rate": 4.480321302658725e-06, "loss": 0.5324, "step": 12092 }, { "epoch": 5.470707984618865, "grad_norm": 0.5262811183929443, "learning_rate": 4.4795925267347685e-06, "loss": 0.4099, "step": 12093 }, { "epoch": 5.471160370956797, "grad_norm": 0.5850948095321655, "learning_rate": 4.478863761987554e-06, "loss": 0.5614, "step": 12094 }, { "epoch": 5.4716127572947295, "grad_norm": 0.5702711343765259, "learning_rate": 4.478135008432734e-06, "loss": 0.5089, "step": 12095 }, { "epoch": 5.472065143632662, "grad_norm": 0.6161372065544128, "learning_rate": 4.47740626608596e-06, "loss": 0.5006, "step": 12096 }, { "epoch": 5.472517529970595, "grad_norm": 0.5738009810447693, "learning_rate": 4.476677534962882e-06, "loss": 0.4879, "step": 12097 }, { "epoch": 5.472969916308528, "grad_norm": 0.7336105108261108, "learning_rate": 4.475948815079153e-06, "loss": 0.5023, "step": 12098 }, { "epoch": 5.47342230264646, "grad_norm": 0.632944643497467, "learning_rate": 4.4752201064504215e-06, "loss": 0.5224, "step": 12099 }, { "epoch": 5.4738746889843926, "grad_norm": 0.7102434635162354, "learning_rate": 4.47449140909234e-06, "loss": 0.5634, "step": 12100 }, { "epoch": 5.474327075322325, "grad_norm": 0.7557467818260193, "learning_rate": 4.4737627230205565e-06, "loss": 0.5748, "step": 12101 }, { "epoch": 5.474779461660258, "grad_norm": 0.7233670353889465, "learning_rate": 4.473034048250722e-06, "loss": 0.544, "step": 12102 }, { "epoch": 5.475231847998191, "grad_norm": 0.5028302669525146, "learning_rate": 4.472305384798486e-06, "loss": 0.9464, "step": 12103 }, { "epoch": 5.475684234336123, "grad_norm": 0.2613687515258789, "learning_rate": 4.471576732679497e-06, "loss": 1.0396, "step": 12104 }, { "epoch": 5.476136620674056, "grad_norm": 0.3348219096660614, "learning_rate": 4.4708480919094064e-06, "loss": 0.7828, "step": 12105 }, { "epoch": 5.476589007011988, "grad_norm": 0.3606390058994293, "learning_rate": 4.470119462503861e-06, "loss": 0.6847, "step": 12106 }, { "epoch": 5.4770413933499205, "grad_norm": 0.3369266092777252, "learning_rate": 4.469390844478511e-06, "loss": 0.585, "step": 12107 }, { "epoch": 5.477493779687854, "grad_norm": 0.37172630429267883, "learning_rate": 4.468662237849004e-06, "loss": 0.5491, "step": 12108 }, { "epoch": 5.477946166025786, "grad_norm": 0.3562529683113098, "learning_rate": 4.467933642630989e-06, "loss": 0.5289, "step": 12109 }, { "epoch": 5.478398552363719, "grad_norm": 0.4071696996688843, "learning_rate": 4.467205058840112e-06, "loss": 0.5772, "step": 12110 }, { "epoch": 5.478850938701651, "grad_norm": 0.39530396461486816, "learning_rate": 4.466476486492024e-06, "loss": 0.4976, "step": 12111 }, { "epoch": 5.4793033250395835, "grad_norm": 0.43233051896095276, "learning_rate": 4.465747925602371e-06, "loss": 0.4454, "step": 12112 }, { "epoch": 5.479755711377516, "grad_norm": 0.39728718996047974, "learning_rate": 4.465019376186799e-06, "loss": 0.5841, "step": 12113 }, { "epoch": 5.480208097715449, "grad_norm": 0.40092039108276367, "learning_rate": 4.464290838260955e-06, "loss": 0.4973, "step": 12114 }, { "epoch": 5.480660484053382, "grad_norm": 0.41191595792770386, "learning_rate": 4.463562311840488e-06, "loss": 0.5282, "step": 12115 }, { "epoch": 5.481112870391314, "grad_norm": 0.4277861416339874, "learning_rate": 4.462833796941043e-06, "loss": 0.5674, "step": 12116 }, { "epoch": 5.4815652567292465, "grad_norm": 0.4786350727081299, "learning_rate": 4.462105293578266e-06, "loss": 0.5391, "step": 12117 }, { "epoch": 5.482017643067179, "grad_norm": 0.5019985437393188, "learning_rate": 4.4613768017678025e-06, "loss": 0.5817, "step": 12118 }, { "epoch": 5.482470029405112, "grad_norm": 0.46719399094581604, "learning_rate": 4.4606483215253006e-06, "loss": 0.5224, "step": 12119 }, { "epoch": 5.482922415743045, "grad_norm": 0.5189313292503357, "learning_rate": 4.459919852866404e-06, "loss": 0.593, "step": 12120 }, { "epoch": 5.483374802080977, "grad_norm": 0.46362370252609253, "learning_rate": 4.459191395806757e-06, "loss": 0.5619, "step": 12121 }, { "epoch": 5.48382718841891, "grad_norm": 0.451350599527359, "learning_rate": 4.458462950362007e-06, "loss": 0.5038, "step": 12122 }, { "epoch": 5.484279574756842, "grad_norm": 0.5385102033615112, "learning_rate": 4.457734516547799e-06, "loss": 0.5842, "step": 12123 }, { "epoch": 5.484731961094775, "grad_norm": 0.4717753827571869, "learning_rate": 4.457006094379774e-06, "loss": 0.5657, "step": 12124 }, { "epoch": 5.485184347432708, "grad_norm": 0.4440454840660095, "learning_rate": 4.4562776838735794e-06, "loss": 0.5228, "step": 12125 }, { "epoch": 5.48563673377064, "grad_norm": 0.45457664132118225, "learning_rate": 4.455549285044858e-06, "loss": 0.4968, "step": 12126 }, { "epoch": 5.486089120108573, "grad_norm": 0.4384876787662506, "learning_rate": 4.4548208979092534e-06, "loss": 0.4391, "step": 12127 }, { "epoch": 5.486541506446505, "grad_norm": 0.514248251914978, "learning_rate": 4.454092522482409e-06, "loss": 0.5725, "step": 12128 }, { "epoch": 5.4869938927844375, "grad_norm": 0.46167850494384766, "learning_rate": 4.453364158779969e-06, "loss": 0.5084, "step": 12129 }, { "epoch": 5.487446279122371, "grad_norm": 0.5095555782318115, "learning_rate": 4.4526358068175765e-06, "loss": 0.5162, "step": 12130 }, { "epoch": 5.487898665460303, "grad_norm": 0.559241771697998, "learning_rate": 4.451907466610873e-06, "loss": 0.5836, "step": 12131 }, { "epoch": 5.488351051798236, "grad_norm": 0.48699408769607544, "learning_rate": 4.451179138175502e-06, "loss": 0.5022, "step": 12132 }, { "epoch": 5.488803438136168, "grad_norm": 0.5627172589302063, "learning_rate": 4.4504508215271054e-06, "loss": 0.5373, "step": 12133 }, { "epoch": 5.4892558244741005, "grad_norm": 0.4729514718055725, "learning_rate": 4.449722516681326e-06, "loss": 0.4895, "step": 12134 }, { "epoch": 5.489708210812034, "grad_norm": 0.5711072683334351, "learning_rate": 4.448994223653804e-06, "loss": 0.5545, "step": 12135 }, { "epoch": 5.490160597149966, "grad_norm": 0.5297709107398987, "learning_rate": 4.448265942460182e-06, "loss": 0.4935, "step": 12136 }, { "epoch": 5.490612983487899, "grad_norm": 0.5648270845413208, "learning_rate": 4.447537673116099e-06, "loss": 0.5559, "step": 12137 }, { "epoch": 5.491065369825831, "grad_norm": 0.5569305419921875, "learning_rate": 4.4468094156371995e-06, "loss": 0.5066, "step": 12138 }, { "epoch": 5.491517756163764, "grad_norm": 0.5689657926559448, "learning_rate": 4.4460811700391225e-06, "loss": 0.4808, "step": 12139 }, { "epoch": 5.491970142501696, "grad_norm": 0.5298765301704407, "learning_rate": 4.445352936337507e-06, "loss": 0.4616, "step": 12140 }, { "epoch": 5.492422528839629, "grad_norm": 0.5591469407081604, "learning_rate": 4.444624714547997e-06, "loss": 0.5062, "step": 12141 }, { "epoch": 5.492874915177562, "grad_norm": 0.5807911157608032, "learning_rate": 4.443896504686228e-06, "loss": 0.5395, "step": 12142 }, { "epoch": 5.493327301515494, "grad_norm": 0.5828961730003357, "learning_rate": 4.443168306767843e-06, "loss": 0.5022, "step": 12143 }, { "epoch": 5.493779687853427, "grad_norm": 0.5403826832771301, "learning_rate": 4.44244012080848e-06, "loss": 0.5318, "step": 12144 }, { "epoch": 5.494232074191359, "grad_norm": 0.5594722032546997, "learning_rate": 4.4417119468237795e-06, "loss": 0.4118, "step": 12145 }, { "epoch": 5.494684460529292, "grad_norm": 0.5911008715629578, "learning_rate": 4.440983784829378e-06, "loss": 0.5188, "step": 12146 }, { "epoch": 5.495136846867225, "grad_norm": 0.6723851561546326, "learning_rate": 4.440255634840916e-06, "loss": 0.4864, "step": 12147 }, { "epoch": 5.495589233205157, "grad_norm": 0.6134000420570374, "learning_rate": 4.439527496874031e-06, "loss": 0.5109, "step": 12148 }, { "epoch": 5.49604161954309, "grad_norm": 0.5711175203323364, "learning_rate": 4.438799370944362e-06, "loss": 0.4246, "step": 12149 }, { "epoch": 5.496494005881022, "grad_norm": 0.6671885251998901, "learning_rate": 4.438071257067547e-06, "loss": 0.4865, "step": 12150 }, { "epoch": 5.496946392218955, "grad_norm": 0.7407957315444946, "learning_rate": 4.437343155259222e-06, "loss": 0.5887, "step": 12151 }, { "epoch": 5.497398778556888, "grad_norm": 0.6661505699157715, "learning_rate": 4.436615065535026e-06, "loss": 0.4926, "step": 12152 }, { "epoch": 5.49785116489482, "grad_norm": 0.5067893862724304, "learning_rate": 4.435886987910596e-06, "loss": 0.9007, "step": 12153 }, { "epoch": 5.498303551232753, "grad_norm": 0.2256462424993515, "learning_rate": 4.4351589224015675e-06, "loss": 1.2336, "step": 12154 }, { "epoch": 5.498755937570685, "grad_norm": 0.2787858843803406, "learning_rate": 4.434430869023579e-06, "loss": 0.6967, "step": 12155 }, { "epoch": 5.499208323908618, "grad_norm": 0.3427899479866028, "learning_rate": 4.433702827792265e-06, "loss": 0.6446, "step": 12156 }, { "epoch": 5.499660710246551, "grad_norm": 0.3493773341178894, "learning_rate": 4.432974798723265e-06, "loss": 0.664, "step": 12157 }, { "epoch": 5.500113096584483, "grad_norm": 0.39014172554016113, "learning_rate": 4.43224678183221e-06, "loss": 0.7244, "step": 12158 }, { "epoch": 5.500565482922416, "grad_norm": 0.38154318928718567, "learning_rate": 4.431518777134738e-06, "loss": 0.611, "step": 12159 }, { "epoch": 5.501017869260348, "grad_norm": 0.4146938920021057, "learning_rate": 4.430790784646485e-06, "loss": 0.6395, "step": 12160 }, { "epoch": 5.501470255598281, "grad_norm": 0.39968016743659973, "learning_rate": 4.4300628043830845e-06, "loss": 0.5543, "step": 12161 }, { "epoch": 5.501922641936213, "grad_norm": 0.3988839387893677, "learning_rate": 4.429334836360172e-06, "loss": 0.573, "step": 12162 }, { "epoch": 5.502375028274146, "grad_norm": 0.4529499113559723, "learning_rate": 4.428606880593382e-06, "loss": 0.5572, "step": 12163 }, { "epoch": 5.502827414612079, "grad_norm": 0.4296439290046692, "learning_rate": 4.427878937098349e-06, "loss": 0.4995, "step": 12164 }, { "epoch": 5.503279800950011, "grad_norm": 0.44527965784072876, "learning_rate": 4.427151005890706e-06, "loss": 0.6022, "step": 12165 }, { "epoch": 5.503732187287944, "grad_norm": 0.4720144271850586, "learning_rate": 4.4264230869860876e-06, "loss": 0.5907, "step": 12166 }, { "epoch": 5.504184573625876, "grad_norm": 0.41915300488471985, "learning_rate": 4.425695180400127e-06, "loss": 0.5092, "step": 12167 }, { "epoch": 5.504636959963809, "grad_norm": 0.41940101981163025, "learning_rate": 4.424967286148458e-06, "loss": 0.4935, "step": 12168 }, { "epoch": 5.505089346301742, "grad_norm": 0.4228875935077667, "learning_rate": 4.424239404246712e-06, "loss": 0.475, "step": 12169 }, { "epoch": 5.505541732639674, "grad_norm": 0.4581698775291443, "learning_rate": 4.423511534710523e-06, "loss": 0.5659, "step": 12170 }, { "epoch": 5.505994118977607, "grad_norm": 0.46432358026504517, "learning_rate": 4.4227836775555225e-06, "loss": 0.5628, "step": 12171 }, { "epoch": 5.506446505315539, "grad_norm": 0.46843594312667847, "learning_rate": 4.422055832797344e-06, "loss": 0.6567, "step": 12172 }, { "epoch": 5.5068988916534725, "grad_norm": 0.45346248149871826, "learning_rate": 4.421328000451617e-06, "loss": 0.5002, "step": 12173 }, { "epoch": 5.507351277991405, "grad_norm": 0.5026775002479553, "learning_rate": 4.420600180533975e-06, "loss": 0.5317, "step": 12174 }, { "epoch": 5.507803664329337, "grad_norm": 0.5348524451255798, "learning_rate": 4.419872373060049e-06, "loss": 0.5396, "step": 12175 }, { "epoch": 5.50825605066727, "grad_norm": 0.5393784046173096, "learning_rate": 4.41914457804547e-06, "loss": 0.602, "step": 12176 }, { "epoch": 5.508708437005202, "grad_norm": 0.5301290154457092, "learning_rate": 4.418416795505868e-06, "loss": 0.451, "step": 12177 }, { "epoch": 5.5091608233431355, "grad_norm": 0.4354546070098877, "learning_rate": 4.4176890254568756e-06, "loss": 0.4776, "step": 12178 }, { "epoch": 5.509613209681068, "grad_norm": 0.4692286252975464, "learning_rate": 4.416961267914122e-06, "loss": 0.491, "step": 12179 }, { "epoch": 5.510065596019, "grad_norm": 0.4840169847011566, "learning_rate": 4.416233522893235e-06, "loss": 0.5508, "step": 12180 }, { "epoch": 5.510517982356933, "grad_norm": 0.49416443705558777, "learning_rate": 4.415505790409847e-06, "loss": 0.5516, "step": 12181 }, { "epoch": 5.510970368694865, "grad_norm": 0.46762654185295105, "learning_rate": 4.414778070479586e-06, "loss": 0.4589, "step": 12182 }, { "epoch": 5.511422755032798, "grad_norm": 0.5584492683410645, "learning_rate": 4.414050363118082e-06, "loss": 0.6446, "step": 12183 }, { "epoch": 5.511875141370731, "grad_norm": 0.494698166847229, "learning_rate": 4.413322668340963e-06, "loss": 0.435, "step": 12184 }, { "epoch": 5.512327527708663, "grad_norm": 0.48032277822494507, "learning_rate": 4.412594986163859e-06, "loss": 0.4844, "step": 12185 }, { "epoch": 5.512779914046596, "grad_norm": 0.5159932971000671, "learning_rate": 4.411867316602398e-06, "loss": 0.4941, "step": 12186 }, { "epoch": 5.513232300384528, "grad_norm": 0.5072252154350281, "learning_rate": 4.411139659672206e-06, "loss": 0.4838, "step": 12187 }, { "epoch": 5.513684686722461, "grad_norm": 0.5469373464584351, "learning_rate": 4.410412015388914e-06, "loss": 0.5171, "step": 12188 }, { "epoch": 5.514137073060393, "grad_norm": 0.5958544015884399, "learning_rate": 4.409684383768148e-06, "loss": 0.5755, "step": 12189 }, { "epoch": 5.5145894593983265, "grad_norm": 0.6130548715591431, "learning_rate": 4.408956764825535e-06, "loss": 0.5471, "step": 12190 }, { "epoch": 5.515041845736259, "grad_norm": 0.5229876041412354, "learning_rate": 4.408229158576704e-06, "loss": 0.5156, "step": 12191 }, { "epoch": 5.515494232074191, "grad_norm": 0.5872633457183838, "learning_rate": 4.407501565037277e-06, "loss": 0.4884, "step": 12192 }, { "epoch": 5.515946618412124, "grad_norm": 0.5577894449234009, "learning_rate": 4.406773984222886e-06, "loss": 0.5145, "step": 12193 }, { "epoch": 5.516399004750056, "grad_norm": 0.5596465468406677, "learning_rate": 4.406046416149154e-06, "loss": 0.5435, "step": 12194 }, { "epoch": 5.5168513910879895, "grad_norm": 0.49853336811065674, "learning_rate": 4.405318860831708e-06, "loss": 0.4031, "step": 12195 }, { "epoch": 5.517303777425922, "grad_norm": 0.602692186832428, "learning_rate": 4.404591318286172e-06, "loss": 0.4732, "step": 12196 }, { "epoch": 5.517756163763854, "grad_norm": 0.5543642044067383, "learning_rate": 4.403863788528174e-06, "loss": 0.4888, "step": 12197 }, { "epoch": 5.518208550101787, "grad_norm": 0.6414245367050171, "learning_rate": 4.4031362715733365e-06, "loss": 0.5706, "step": 12198 }, { "epoch": 5.518660936439719, "grad_norm": 0.7292016744613647, "learning_rate": 4.402408767437286e-06, "loss": 0.5992, "step": 12199 }, { "epoch": 5.5191133227776525, "grad_norm": 0.6707420945167542, "learning_rate": 4.401681276135647e-06, "loss": 0.4774, "step": 12200 }, { "epoch": 5.5191133227776525, "eval_loss": 0.5905720591545105, "eval_runtime": 25.962, "eval_samples_per_second": 28.657, "eval_steps_per_second": 7.164, "step": 12200 }, { "epoch": 5.519565709115585, "grad_norm": 0.7670555114746094, "learning_rate": 4.400953797684044e-06, "loss": 0.6109, "step": 12201 }, { "epoch": 5.520018095453517, "grad_norm": 0.736048698425293, "learning_rate": 4.4002263320981e-06, "loss": 0.4804, "step": 12202 }, { "epoch": 5.52047048179145, "grad_norm": 0.5806955099105835, "learning_rate": 4.399498879393439e-06, "loss": 1.0684, "step": 12203 }, { "epoch": 5.520922868129382, "grad_norm": 0.23485714197158813, "learning_rate": 4.398771439585684e-06, "loss": 0.9042, "step": 12204 }, { "epoch": 5.521375254467316, "grad_norm": 0.3028041124343872, "learning_rate": 4.398044012690459e-06, "loss": 0.8782, "step": 12205 }, { "epoch": 5.521827640805248, "grad_norm": 0.38700130581855774, "learning_rate": 4.397316598723385e-06, "loss": 0.6425, "step": 12206 }, { "epoch": 5.5222800271431804, "grad_norm": 0.34966471791267395, "learning_rate": 4.396589197700089e-06, "loss": 0.615, "step": 12207 }, { "epoch": 5.522732413481113, "grad_norm": 0.3863559663295746, "learning_rate": 4.395861809636188e-06, "loss": 0.6623, "step": 12208 }, { "epoch": 5.523184799819045, "grad_norm": 0.3619322180747986, "learning_rate": 4.395134434547308e-06, "loss": 0.5434, "step": 12209 }, { "epoch": 5.523637186156978, "grad_norm": 0.3404700458049774, "learning_rate": 4.394407072449068e-06, "loss": 0.4555, "step": 12210 }, { "epoch": 5.52408957249491, "grad_norm": 0.3999933898448944, "learning_rate": 4.3936797233570915e-06, "loss": 0.5895, "step": 12211 }, { "epoch": 5.5245419588328435, "grad_norm": 0.4372783303260803, "learning_rate": 4.392952387286999e-06, "loss": 0.5588, "step": 12212 }, { "epoch": 5.524994345170776, "grad_norm": 0.4403042495250702, "learning_rate": 4.392225064254412e-06, "loss": 0.5133, "step": 12213 }, { "epoch": 5.525446731508708, "grad_norm": 0.412529855966568, "learning_rate": 4.39149775427495e-06, "loss": 0.57, "step": 12214 }, { "epoch": 5.525899117846641, "grad_norm": 0.3818627595901489, "learning_rate": 4.390770457364234e-06, "loss": 0.4452, "step": 12215 }, { "epoch": 5.526351504184573, "grad_norm": 0.4831121265888214, "learning_rate": 4.390043173537883e-06, "loss": 0.5666, "step": 12216 }, { "epoch": 5.5268038905225065, "grad_norm": 0.40000638365745544, "learning_rate": 4.3893159028115185e-06, "loss": 0.457, "step": 12217 }, { "epoch": 5.527256276860439, "grad_norm": 0.5291526317596436, "learning_rate": 4.3885886452007595e-06, "loss": 0.602, "step": 12218 }, { "epoch": 5.527708663198371, "grad_norm": 0.44915971159935, "learning_rate": 4.3878614007212245e-06, "loss": 0.5713, "step": 12219 }, { "epoch": 5.528161049536304, "grad_norm": 0.4958095848560333, "learning_rate": 4.387134169388534e-06, "loss": 0.624, "step": 12220 }, { "epoch": 5.528613435874236, "grad_norm": 0.49921464920043945, "learning_rate": 4.3864069512183045e-06, "loss": 0.5818, "step": 12221 }, { "epoch": 5.52906582221217, "grad_norm": 0.48054444789886475, "learning_rate": 4.3856797462261555e-06, "loss": 0.5078, "step": 12222 }, { "epoch": 5.529518208550102, "grad_norm": 0.5076774954795837, "learning_rate": 4.384952554427706e-06, "loss": 0.5672, "step": 12223 }, { "epoch": 5.529970594888034, "grad_norm": 0.41847676038742065, "learning_rate": 4.384225375838573e-06, "loss": 0.4373, "step": 12224 }, { "epoch": 5.530422981225967, "grad_norm": 0.4528375566005707, "learning_rate": 4.383498210474375e-06, "loss": 0.5125, "step": 12225 }, { "epoch": 5.530875367563899, "grad_norm": 0.4693014621734619, "learning_rate": 4.382771058350728e-06, "loss": 0.4791, "step": 12226 }, { "epoch": 5.531327753901833, "grad_norm": 0.48121002316474915, "learning_rate": 4.382043919483249e-06, "loss": 0.5085, "step": 12227 }, { "epoch": 5.531780140239765, "grad_norm": 0.5055458545684814, "learning_rate": 4.381316793887554e-06, "loss": 0.5633, "step": 12228 }, { "epoch": 5.5322325265776975, "grad_norm": 0.4745938181877136, "learning_rate": 4.3805896815792615e-06, "loss": 0.4929, "step": 12229 }, { "epoch": 5.53268491291563, "grad_norm": 0.5412952899932861, "learning_rate": 4.379862582573986e-06, "loss": 0.6294, "step": 12230 }, { "epoch": 5.533137299253562, "grad_norm": 0.4896758198738098, "learning_rate": 4.379135496887344e-06, "loss": 0.4604, "step": 12231 }, { "epoch": 5.533589685591495, "grad_norm": 0.5101252198219299, "learning_rate": 4.378408424534951e-06, "loss": 0.543, "step": 12232 }, { "epoch": 5.534042071929428, "grad_norm": 0.553519070148468, "learning_rate": 4.3776813655324226e-06, "loss": 0.4562, "step": 12233 }, { "epoch": 5.5344944582673605, "grad_norm": 0.5360565781593323, "learning_rate": 4.376954319895374e-06, "loss": 0.5442, "step": 12234 }, { "epoch": 5.534946844605293, "grad_norm": 0.5159940719604492, "learning_rate": 4.376227287639418e-06, "loss": 0.4659, "step": 12235 }, { "epoch": 5.535399230943225, "grad_norm": 0.5987151265144348, "learning_rate": 4.375500268780172e-06, "loss": 0.7518, "step": 12236 }, { "epoch": 5.535851617281158, "grad_norm": 0.5126054286956787, "learning_rate": 4.374773263333247e-06, "loss": 0.4279, "step": 12237 }, { "epoch": 5.53630400361909, "grad_norm": 0.5202808976173401, "learning_rate": 4.3740462713142575e-06, "loss": 0.4651, "step": 12238 }, { "epoch": 5.536756389957024, "grad_norm": 0.5730465054512024, "learning_rate": 4.373319292738819e-06, "loss": 0.5263, "step": 12239 }, { "epoch": 5.537208776294956, "grad_norm": 0.5980619788169861, "learning_rate": 4.372592327622544e-06, "loss": 0.5609, "step": 12240 }, { "epoch": 5.537661162632888, "grad_norm": 0.6724846363067627, "learning_rate": 4.371865375981044e-06, "loss": 0.5754, "step": 12241 }, { "epoch": 5.538113548970821, "grad_norm": 0.55109703540802, "learning_rate": 4.371138437829932e-06, "loss": 0.5069, "step": 12242 }, { "epoch": 5.538565935308753, "grad_norm": 0.5729483962059021, "learning_rate": 4.370411513184822e-06, "loss": 0.5348, "step": 12243 }, { "epoch": 5.539018321646687, "grad_norm": 0.5592821836471558, "learning_rate": 4.369684602061325e-06, "loss": 0.4595, "step": 12244 }, { "epoch": 5.539470707984619, "grad_norm": 0.6553311347961426, "learning_rate": 4.368957704475053e-06, "loss": 0.5085, "step": 12245 }, { "epoch": 5.5399230943225515, "grad_norm": 0.5650274753570557, "learning_rate": 4.368230820441617e-06, "loss": 0.481, "step": 12246 }, { "epoch": 5.540375480660484, "grad_norm": 0.6606951355934143, "learning_rate": 4.367503949976631e-06, "loss": 0.5019, "step": 12247 }, { "epoch": 5.540827866998416, "grad_norm": 0.6842470765113831, "learning_rate": 4.366777093095701e-06, "loss": 0.5645, "step": 12248 }, { "epoch": 5.54128025333635, "grad_norm": 0.6800990700721741, "learning_rate": 4.366050249814441e-06, "loss": 0.4761, "step": 12249 }, { "epoch": 5.541732639674282, "grad_norm": 0.6912761330604553, "learning_rate": 4.3653234201484605e-06, "loss": 0.5484, "step": 12250 }, { "epoch": 5.5421850260122145, "grad_norm": 0.6643879413604736, "learning_rate": 4.364596604113369e-06, "loss": 0.4661, "step": 12251 }, { "epoch": 5.542637412350147, "grad_norm": 0.6688820719718933, "learning_rate": 4.363869801724778e-06, "loss": 0.5462, "step": 12252 }, { "epoch": 5.543089798688079, "grad_norm": 0.5813589096069336, "learning_rate": 4.363143012998296e-06, "loss": 1.0426, "step": 12253 }, { "epoch": 5.543542185026013, "grad_norm": 0.24613769352436066, "learning_rate": 4.36241623794953e-06, "loss": 0.6435, "step": 12254 }, { "epoch": 5.543994571363945, "grad_norm": 0.29591619968414307, "learning_rate": 4.361689476594093e-06, "loss": 0.6145, "step": 12255 }, { "epoch": 5.544446957701878, "grad_norm": 0.42187827825546265, "learning_rate": 4.360962728947591e-06, "loss": 0.5961, "step": 12256 }, { "epoch": 5.54489934403981, "grad_norm": 0.4354674518108368, "learning_rate": 4.3602359950256325e-06, "loss": 0.8438, "step": 12257 }, { "epoch": 5.545351730377742, "grad_norm": 0.381778359413147, "learning_rate": 4.359509274843827e-06, "loss": 0.6005, "step": 12258 }, { "epoch": 5.545804116715675, "grad_norm": 0.420886367559433, "learning_rate": 4.3587825684177795e-06, "loss": 0.6543, "step": 12259 }, { "epoch": 5.546256503053607, "grad_norm": 0.3938908576965332, "learning_rate": 4.3580558757630996e-06, "loss": 0.6316, "step": 12260 }, { "epoch": 5.546708889391541, "grad_norm": 0.46391594409942627, "learning_rate": 4.357329196895394e-06, "loss": 0.5255, "step": 12261 }, { "epoch": 5.547161275729473, "grad_norm": 0.4125136733055115, "learning_rate": 4.3566025318302675e-06, "loss": 0.5399, "step": 12262 }, { "epoch": 5.5476136620674055, "grad_norm": 0.4249415993690491, "learning_rate": 4.355875880583329e-06, "loss": 0.5411, "step": 12263 }, { "epoch": 5.548066048405338, "grad_norm": 0.43961718678474426, "learning_rate": 4.355149243170185e-06, "loss": 0.5316, "step": 12264 }, { "epoch": 5.54851843474327, "grad_norm": 0.4488430321216583, "learning_rate": 4.35442261960644e-06, "loss": 0.6481, "step": 12265 }, { "epoch": 5.548970821081204, "grad_norm": 0.3828078508377075, "learning_rate": 4.3536960099077006e-06, "loss": 0.4492, "step": 12266 }, { "epoch": 5.549423207419136, "grad_norm": 0.42163410782814026, "learning_rate": 4.35296941408957e-06, "loss": 0.5065, "step": 12267 }, { "epoch": 5.5498755937570685, "grad_norm": 0.4763459861278534, "learning_rate": 4.352242832167657e-06, "loss": 0.4385, "step": 12268 }, { "epoch": 5.550327980095001, "grad_norm": 0.4766327738761902, "learning_rate": 4.351516264157563e-06, "loss": 0.5, "step": 12269 }, { "epoch": 5.550780366432933, "grad_norm": 0.43633297085762024, "learning_rate": 4.350789710074895e-06, "loss": 0.497, "step": 12270 }, { "epoch": 5.551232752770867, "grad_norm": 0.47559669613838196, "learning_rate": 4.350063169935255e-06, "loss": 0.6044, "step": 12271 }, { "epoch": 5.551685139108799, "grad_norm": 0.46085986495018005, "learning_rate": 4.3493366437542465e-06, "loss": 0.561, "step": 12272 }, { "epoch": 5.552137525446732, "grad_norm": 0.4992946982383728, "learning_rate": 4.348610131547475e-06, "loss": 0.5546, "step": 12273 }, { "epoch": 5.552589911784664, "grad_norm": 0.5039486289024353, "learning_rate": 4.3478836333305425e-06, "loss": 0.5068, "step": 12274 }, { "epoch": 5.553042298122596, "grad_norm": 0.51252681016922, "learning_rate": 4.347157149119052e-06, "loss": 0.4423, "step": 12275 }, { "epoch": 5.55349468446053, "grad_norm": 0.4942265748977661, "learning_rate": 4.346430678928607e-06, "loss": 0.5515, "step": 12276 }, { "epoch": 5.553947070798462, "grad_norm": 0.440589964389801, "learning_rate": 4.34570422277481e-06, "loss": 0.3706, "step": 12277 }, { "epoch": 5.554399457136395, "grad_norm": 0.46782469749450684, "learning_rate": 4.344977780673261e-06, "loss": 0.5053, "step": 12278 }, { "epoch": 5.554851843474327, "grad_norm": 0.5036591291427612, "learning_rate": 4.344251352639564e-06, "loss": 0.5366, "step": 12279 }, { "epoch": 5.5553042298122595, "grad_norm": 0.49800577759742737, "learning_rate": 4.343524938689318e-06, "loss": 0.5199, "step": 12280 }, { "epoch": 5.555756616150192, "grad_norm": 0.5171424150466919, "learning_rate": 4.342798538838129e-06, "loss": 0.5541, "step": 12281 }, { "epoch": 5.556209002488125, "grad_norm": 0.5653961896896362, "learning_rate": 4.342072153101592e-06, "loss": 0.6342, "step": 12282 }, { "epoch": 5.556661388826058, "grad_norm": 0.5081943869590759, "learning_rate": 4.3413457814953095e-06, "loss": 0.4688, "step": 12283 }, { "epoch": 5.55711377516399, "grad_norm": 0.5042452216148376, "learning_rate": 4.340619424034882e-06, "loss": 0.4738, "step": 12284 }, { "epoch": 5.5575661615019225, "grad_norm": 0.5128776431083679, "learning_rate": 4.339893080735911e-06, "loss": 0.4231, "step": 12285 }, { "epoch": 5.558018547839855, "grad_norm": 0.4771510660648346, "learning_rate": 4.339166751613993e-06, "loss": 0.3826, "step": 12286 }, { "epoch": 5.558470934177787, "grad_norm": 0.5499930381774902, "learning_rate": 4.338440436684731e-06, "loss": 0.4868, "step": 12287 }, { "epoch": 5.558923320515721, "grad_norm": 0.5870126485824585, "learning_rate": 4.3377141359637195e-06, "loss": 0.5596, "step": 12288 }, { "epoch": 5.559375706853653, "grad_norm": 0.5292516350746155, "learning_rate": 4.3369878494665614e-06, "loss": 0.5114, "step": 12289 }, { "epoch": 5.5598280931915856, "grad_norm": 0.5527605414390564, "learning_rate": 4.336261577208853e-06, "loss": 0.4788, "step": 12290 }, { "epoch": 5.560280479529518, "grad_norm": 0.5226048231124878, "learning_rate": 4.3355353192061925e-06, "loss": 0.4394, "step": 12291 }, { "epoch": 5.56073286586745, "grad_norm": 0.5403909683227539, "learning_rate": 4.334809075474179e-06, "loss": 0.5452, "step": 12292 }, { "epoch": 5.561185252205384, "grad_norm": 0.5485232472419739, "learning_rate": 4.334082846028408e-06, "loss": 0.4989, "step": 12293 }, { "epoch": 5.561637638543316, "grad_norm": 0.5645416378974915, "learning_rate": 4.333356630884477e-06, "loss": 0.4936, "step": 12294 }, { "epoch": 5.562090024881249, "grad_norm": 0.5332038402557373, "learning_rate": 4.332630430057984e-06, "loss": 0.4908, "step": 12295 }, { "epoch": 5.562542411219181, "grad_norm": 0.6003257036209106, "learning_rate": 4.331904243564524e-06, "loss": 0.5146, "step": 12296 }, { "epoch": 5.5629947975571135, "grad_norm": 0.5991036295890808, "learning_rate": 4.3311780714196946e-06, "loss": 0.479, "step": 12297 }, { "epoch": 5.563447183895047, "grad_norm": 0.5941183567047119, "learning_rate": 4.330451913639092e-06, "loss": 0.5009, "step": 12298 }, { "epoch": 5.563899570232979, "grad_norm": 0.636645495891571, "learning_rate": 4.3297257702383105e-06, "loss": 0.5157, "step": 12299 }, { "epoch": 5.564351956570912, "grad_norm": 0.6420783400535583, "learning_rate": 4.328999641232946e-06, "loss": 0.5018, "step": 12300 }, { "epoch": 5.564804342908844, "grad_norm": 0.6575440764427185, "learning_rate": 4.328273526638594e-06, "loss": 0.4782, "step": 12301 }, { "epoch": 5.5652567292467765, "grad_norm": 0.793915331363678, "learning_rate": 4.327547426470848e-06, "loss": 0.5709, "step": 12302 }, { "epoch": 5.56570911558471, "grad_norm": 0.5257952213287354, "learning_rate": 4.326821340745304e-06, "loss": 1.0596, "step": 12303 }, { "epoch": 5.566161501922642, "grad_norm": 0.2156299352645874, "learning_rate": 4.326095269477556e-06, "loss": 0.6862, "step": 12304 }, { "epoch": 5.566613888260575, "grad_norm": 0.41354796290397644, "learning_rate": 4.325369212683196e-06, "loss": 0.5855, "step": 12305 }, { "epoch": 5.567066274598507, "grad_norm": 0.3473964035511017, "learning_rate": 4.324643170377818e-06, "loss": 0.6174, "step": 12306 }, { "epoch": 5.5675186609364395, "grad_norm": 0.3771124482154846, "learning_rate": 4.323917142577016e-06, "loss": 0.6133, "step": 12307 }, { "epoch": 5.567971047274372, "grad_norm": 0.3593766689300537, "learning_rate": 4.323191129296381e-06, "loss": 0.5746, "step": 12308 }, { "epoch": 5.568423433612305, "grad_norm": 0.4230033755302429, "learning_rate": 4.322465130551508e-06, "loss": 0.6304, "step": 12309 }, { "epoch": 5.568875819950238, "grad_norm": 0.39099133014678955, "learning_rate": 4.321739146357988e-06, "loss": 0.607, "step": 12310 }, { "epoch": 5.56932820628817, "grad_norm": 0.4889873266220093, "learning_rate": 4.321013176731414e-06, "loss": 0.6838, "step": 12311 }, { "epoch": 5.569780592626103, "grad_norm": 0.39440229535102844, "learning_rate": 4.320287221687375e-06, "loss": 0.5953, "step": 12312 }, { "epoch": 5.570232978964035, "grad_norm": 0.45542484521865845, "learning_rate": 4.319561281241465e-06, "loss": 0.5471, "step": 12313 }, { "epoch": 5.5706853653019675, "grad_norm": 0.386478990316391, "learning_rate": 4.318835355409273e-06, "loss": 0.4584, "step": 12314 }, { "epoch": 5.571137751639901, "grad_norm": 0.44811001420021057, "learning_rate": 4.3181094442063924e-06, "loss": 0.5739, "step": 12315 }, { "epoch": 5.571590137977833, "grad_norm": 0.4148143529891968, "learning_rate": 4.31738354764841e-06, "loss": 0.5455, "step": 12316 }, { "epoch": 5.572042524315766, "grad_norm": 0.48711737990379333, "learning_rate": 4.316657665750917e-06, "loss": 0.6553, "step": 12317 }, { "epoch": 5.572494910653698, "grad_norm": 0.44002845883369446, "learning_rate": 4.315931798529505e-06, "loss": 0.4512, "step": 12318 }, { "epoch": 5.5729472969916305, "grad_norm": 0.5132304430007935, "learning_rate": 4.31520594599976e-06, "loss": 0.4739, "step": 12319 }, { "epoch": 5.573399683329564, "grad_norm": 0.4467042088508606, "learning_rate": 4.314480108177274e-06, "loss": 0.5515, "step": 12320 }, { "epoch": 5.573852069667496, "grad_norm": 0.4459393620491028, "learning_rate": 4.313754285077635e-06, "loss": 0.5328, "step": 12321 }, { "epoch": 5.574304456005429, "grad_norm": 0.4290298521518707, "learning_rate": 4.31302847671643e-06, "loss": 0.5125, "step": 12322 }, { "epoch": 5.574756842343361, "grad_norm": 0.4469105899333954, "learning_rate": 4.312302683109249e-06, "loss": 0.5295, "step": 12323 }, { "epoch": 5.5752092286812935, "grad_norm": 0.47174254059791565, "learning_rate": 4.3115769042716795e-06, "loss": 0.5427, "step": 12324 }, { "epoch": 5.575661615019227, "grad_norm": 0.41694965958595276, "learning_rate": 4.310851140219308e-06, "loss": 0.4491, "step": 12325 }, { "epoch": 5.576114001357159, "grad_norm": 0.4293383061885834, "learning_rate": 4.310125390967723e-06, "loss": 0.4539, "step": 12326 }, { "epoch": 5.576566387695092, "grad_norm": 0.540142297744751, "learning_rate": 4.3093996565325095e-06, "loss": 0.7471, "step": 12327 }, { "epoch": 5.577018774033024, "grad_norm": 0.5323593020439148, "learning_rate": 4.3086739369292555e-06, "loss": 0.5899, "step": 12328 }, { "epoch": 5.577471160370957, "grad_norm": 0.5053077936172485, "learning_rate": 4.307948232173546e-06, "loss": 0.521, "step": 12329 }, { "epoch": 5.577923546708889, "grad_norm": 0.5247970223426819, "learning_rate": 4.307222542280969e-06, "loss": 0.5734, "step": 12330 }, { "epoch": 5.578375933046822, "grad_norm": 0.5388393998146057, "learning_rate": 4.3064968672671075e-06, "loss": 0.61, "step": 12331 }, { "epoch": 5.578828319384755, "grad_norm": 0.48096832633018494, "learning_rate": 4.305771207147549e-06, "loss": 0.4463, "step": 12332 }, { "epoch": 5.579280705722687, "grad_norm": 0.5452231168746948, "learning_rate": 4.305045561937877e-06, "loss": 0.5042, "step": 12333 }, { "epoch": 5.57973309206062, "grad_norm": 0.5310642719268799, "learning_rate": 4.304319931653676e-06, "loss": 0.5549, "step": 12334 }, { "epoch": 5.580185478398552, "grad_norm": 0.5895252823829651, "learning_rate": 4.3035943163105315e-06, "loss": 0.6357, "step": 12335 }, { "epoch": 5.5806378647364845, "grad_norm": 0.5549164414405823, "learning_rate": 4.302868715924027e-06, "loss": 0.5009, "step": 12336 }, { "epoch": 5.581090251074418, "grad_norm": 0.49749940633773804, "learning_rate": 4.3021431305097455e-06, "loss": 0.4492, "step": 12337 }, { "epoch": 5.58154263741235, "grad_norm": 0.484605610370636, "learning_rate": 4.301417560083272e-06, "loss": 0.4337, "step": 12338 }, { "epoch": 5.581995023750283, "grad_norm": 0.5848215818405151, "learning_rate": 4.300692004660187e-06, "loss": 0.6031, "step": 12339 }, { "epoch": 5.582447410088215, "grad_norm": 0.6001344323158264, "learning_rate": 4.299966464256076e-06, "loss": 0.5269, "step": 12340 }, { "epoch": 5.5828997964261475, "grad_norm": 0.5382207632064819, "learning_rate": 4.2992409388865186e-06, "loss": 0.4135, "step": 12341 }, { "epoch": 5.583352182764081, "grad_norm": 0.5520532727241516, "learning_rate": 4.298515428567098e-06, "loss": 0.4781, "step": 12342 }, { "epoch": 5.583804569102013, "grad_norm": 0.615330696105957, "learning_rate": 4.297789933313397e-06, "loss": 0.5191, "step": 12343 }, { "epoch": 5.584256955439946, "grad_norm": 0.596545398235321, "learning_rate": 4.297064453140994e-06, "loss": 0.5161, "step": 12344 }, { "epoch": 5.584709341777878, "grad_norm": 0.5462815165519714, "learning_rate": 4.296338988065474e-06, "loss": 0.473, "step": 12345 }, { "epoch": 5.585161728115811, "grad_norm": 0.5274084806442261, "learning_rate": 4.295613538102415e-06, "loss": 0.3569, "step": 12346 }, { "epoch": 5.585614114453744, "grad_norm": 0.552669882774353, "learning_rate": 4.294888103267399e-06, "loss": 0.4757, "step": 12347 }, { "epoch": 5.586066500791676, "grad_norm": 0.6176888942718506, "learning_rate": 4.294162683576005e-06, "loss": 0.5133, "step": 12348 }, { "epoch": 5.586518887129609, "grad_norm": 0.7095707058906555, "learning_rate": 4.293437279043814e-06, "loss": 0.5545, "step": 12349 }, { "epoch": 5.586971273467541, "grad_norm": 0.6352253556251526, "learning_rate": 4.292711889686403e-06, "loss": 0.4977, "step": 12350 }, { "epoch": 5.587423659805474, "grad_norm": 0.8233181834220886, "learning_rate": 4.291986515519353e-06, "loss": 0.6752, "step": 12351 }, { "epoch": 5.587876046143407, "grad_norm": 0.7636365294456482, "learning_rate": 4.291261156558244e-06, "loss": 0.5277, "step": 12352 }, { "epoch": 5.588328432481339, "grad_norm": 0.5362386703491211, "learning_rate": 4.290535812818651e-06, "loss": 0.822, "step": 12353 }, { "epoch": 5.588780818819272, "grad_norm": 0.21423505246639252, "learning_rate": 4.2898104843161545e-06, "loss": 1.024, "step": 12354 }, { "epoch": 5.589233205157204, "grad_norm": 0.29773613810539246, "learning_rate": 4.289085171066333e-06, "loss": 0.6722, "step": 12355 }, { "epoch": 5.589685591495137, "grad_norm": 0.3416917622089386, "learning_rate": 4.288359873084762e-06, "loss": 0.6257, "step": 12356 }, { "epoch": 5.590137977833069, "grad_norm": 0.36339622735977173, "learning_rate": 4.28763459038702e-06, "loss": 0.4996, "step": 12357 }, { "epoch": 5.590590364171002, "grad_norm": 0.33017945289611816, "learning_rate": 4.286909322988682e-06, "loss": 0.5316, "step": 12358 }, { "epoch": 5.591042750508935, "grad_norm": 0.3765254318714142, "learning_rate": 4.286184070905328e-06, "loss": 0.5329, "step": 12359 }, { "epoch": 5.591495136846867, "grad_norm": 0.3768974542617798, "learning_rate": 4.285458834152531e-06, "loss": 0.5647, "step": 12360 }, { "epoch": 5.5919475231848, "grad_norm": 0.36089733242988586, "learning_rate": 4.284733612745869e-06, "loss": 0.435, "step": 12361 }, { "epoch": 5.592399909522732, "grad_norm": 0.3915646970272064, "learning_rate": 4.284008406700915e-06, "loss": 0.5124, "step": 12362 }, { "epoch": 5.592852295860665, "grad_norm": 0.3481360971927643, "learning_rate": 4.2832832160332454e-06, "loss": 0.4953, "step": 12363 }, { "epoch": 5.593304682198598, "grad_norm": 0.44029855728149414, "learning_rate": 4.2825580407584355e-06, "loss": 0.5907, "step": 12364 }, { "epoch": 5.59375706853653, "grad_norm": 0.4330346882343292, "learning_rate": 4.2818328808920605e-06, "loss": 0.5937, "step": 12365 }, { "epoch": 5.594209454874463, "grad_norm": 0.43685391545295715, "learning_rate": 4.281107736449692e-06, "loss": 0.5687, "step": 12366 }, { "epoch": 5.594661841212395, "grad_norm": 0.43657422065734863, "learning_rate": 4.280382607446907e-06, "loss": 0.5825, "step": 12367 }, { "epoch": 5.595114227550328, "grad_norm": 0.4911344051361084, "learning_rate": 4.279657493899277e-06, "loss": 0.6516, "step": 12368 }, { "epoch": 5.595566613888261, "grad_norm": 0.4338876008987427, "learning_rate": 4.2789323958223755e-06, "loss": 0.5426, "step": 12369 }, { "epoch": 5.596019000226193, "grad_norm": 0.4444217383861542, "learning_rate": 4.278207313231776e-06, "loss": 0.5933, "step": 12370 }, { "epoch": 5.596471386564126, "grad_norm": 0.4290120601654053, "learning_rate": 4.277482246143051e-06, "loss": 0.4274, "step": 12371 }, { "epoch": 5.596923772902058, "grad_norm": 0.47246161103248596, "learning_rate": 4.276757194571772e-06, "loss": 0.5993, "step": 12372 }, { "epoch": 5.597376159239991, "grad_norm": 0.42811620235443115, "learning_rate": 4.276032158533511e-06, "loss": 0.4591, "step": 12373 }, { "epoch": 5.597828545577924, "grad_norm": 0.4707452058792114, "learning_rate": 4.27530713804384e-06, "loss": 0.5723, "step": 12374 }, { "epoch": 5.598280931915856, "grad_norm": 0.4910784959793091, "learning_rate": 4.274582133118329e-06, "loss": 0.5741, "step": 12375 }, { "epoch": 5.598733318253789, "grad_norm": 0.47611740231513977, "learning_rate": 4.27385714377255e-06, "loss": 0.4331, "step": 12376 }, { "epoch": 5.599185704591721, "grad_norm": 0.45846816897392273, "learning_rate": 4.273132170022074e-06, "loss": 0.4823, "step": 12377 }, { "epoch": 5.599638090929654, "grad_norm": 0.4515421688556671, "learning_rate": 4.27240721188247e-06, "loss": 0.5926, "step": 12378 }, { "epoch": 5.600090477267586, "grad_norm": 0.513606607913971, "learning_rate": 4.271682269369308e-06, "loss": 0.5739, "step": 12379 }, { "epoch": 5.6005428636055195, "grad_norm": 0.45612430572509766, "learning_rate": 4.270957342498159e-06, "loss": 0.4654, "step": 12380 }, { "epoch": 5.600995249943452, "grad_norm": 0.5109691619873047, "learning_rate": 4.27023243128459e-06, "loss": 0.5069, "step": 12381 }, { "epoch": 5.601447636281384, "grad_norm": 0.486925333738327, "learning_rate": 4.269507535744172e-06, "loss": 0.4957, "step": 12382 }, { "epoch": 5.601900022619317, "grad_norm": 0.535290539264679, "learning_rate": 4.268782655892473e-06, "loss": 0.5691, "step": 12383 }, { "epoch": 5.602352408957249, "grad_norm": 0.5468198657035828, "learning_rate": 4.268057791745059e-06, "loss": 0.4609, "step": 12384 }, { "epoch": 5.602804795295182, "grad_norm": 0.5017691254615784, "learning_rate": 4.2673329433175e-06, "loss": 0.5144, "step": 12385 }, { "epoch": 5.603257181633115, "grad_norm": 0.5617280006408691, "learning_rate": 4.266608110625363e-06, "loss": 0.5034, "step": 12386 }, { "epoch": 5.603709567971047, "grad_norm": 0.6257621645927429, "learning_rate": 4.265883293684212e-06, "loss": 0.5318, "step": 12387 }, { "epoch": 5.60416195430898, "grad_norm": 0.5995649099349976, "learning_rate": 4.26515849250962e-06, "loss": 0.6265, "step": 12388 }, { "epoch": 5.604614340646912, "grad_norm": 0.6005342602729797, "learning_rate": 4.264433707117151e-06, "loss": 0.5302, "step": 12389 }, { "epoch": 5.605066726984845, "grad_norm": 0.5494483113288879, "learning_rate": 4.26370893752237e-06, "loss": 0.4213, "step": 12390 }, { "epoch": 5.605519113322778, "grad_norm": 0.6054033041000366, "learning_rate": 4.262984183740843e-06, "loss": 0.6335, "step": 12391 }, { "epoch": 5.60597149966071, "grad_norm": 0.508073091506958, "learning_rate": 4.262259445788137e-06, "loss": 0.4641, "step": 12392 }, { "epoch": 5.606423885998643, "grad_norm": 0.5284201502799988, "learning_rate": 4.2615347236798156e-06, "loss": 0.4243, "step": 12393 }, { "epoch": 5.606876272336575, "grad_norm": 0.5226340293884277, "learning_rate": 4.260810017431443e-06, "loss": 0.4448, "step": 12394 }, { "epoch": 5.607328658674508, "grad_norm": 0.5744267106056213, "learning_rate": 4.260085327058585e-06, "loss": 0.4892, "step": 12395 }, { "epoch": 5.607781045012441, "grad_norm": 0.5591321587562561, "learning_rate": 4.259360652576805e-06, "loss": 0.4327, "step": 12396 }, { "epoch": 5.608233431350373, "grad_norm": 0.583874523639679, "learning_rate": 4.258635994001669e-06, "loss": 0.5154, "step": 12397 }, { "epoch": 5.608685817688306, "grad_norm": 0.5864120125770569, "learning_rate": 4.257911351348737e-06, "loss": 0.5265, "step": 12398 }, { "epoch": 5.609138204026238, "grad_norm": 0.6905863881111145, "learning_rate": 4.257186724633573e-06, "loss": 0.4793, "step": 12399 }, { "epoch": 5.609590590364171, "grad_norm": 0.6085776686668396, "learning_rate": 4.256462113871741e-06, "loss": 0.4553, "step": 12400 }, { "epoch": 5.609590590364171, "eval_loss": 0.5900253057479858, "eval_runtime": 25.8978, "eval_samples_per_second": 28.728, "eval_steps_per_second": 7.182, "step": 12400 }, { "epoch": 5.610042976702104, "grad_norm": 0.6310389041900635, "learning_rate": 4.2557375190788046e-06, "loss": 0.4885, "step": 12401 }, { "epoch": 5.6104953630400365, "grad_norm": 0.6622512340545654, "learning_rate": 4.255012940270322e-06, "loss": 0.4812, "step": 12402 }, { "epoch": 5.610947749377969, "grad_norm": 0.5744020342826843, "learning_rate": 4.254288377461856e-06, "loss": 1.0921, "step": 12403 }, { "epoch": 5.611400135715901, "grad_norm": 0.21909870207309723, "learning_rate": 4.253563830668971e-06, "loss": 0.532, "step": 12404 }, { "epoch": 5.611852522053834, "grad_norm": 0.3484543561935425, "learning_rate": 4.2528392999072245e-06, "loss": 0.4902, "step": 12405 }, { "epoch": 5.612304908391766, "grad_norm": 0.3253363370895386, "learning_rate": 4.252114785192179e-06, "loss": 0.4681, "step": 12406 }, { "epoch": 5.6127572947296995, "grad_norm": 0.4507794976234436, "learning_rate": 4.251390286539394e-06, "loss": 0.5639, "step": 12407 }, { "epoch": 5.613209681067632, "grad_norm": 0.3644734025001526, "learning_rate": 4.25066580396443e-06, "loss": 0.5181, "step": 12408 }, { "epoch": 5.613662067405564, "grad_norm": 0.3523419201374054, "learning_rate": 4.249941337482848e-06, "loss": 0.5211, "step": 12409 }, { "epoch": 5.614114453743497, "grad_norm": 0.41171887516975403, "learning_rate": 4.249216887110204e-06, "loss": 0.5646, "step": 12410 }, { "epoch": 5.614566840081429, "grad_norm": 0.4408193528652191, "learning_rate": 4.2484924528620595e-06, "loss": 0.5744, "step": 12411 }, { "epoch": 5.615019226419362, "grad_norm": 0.4517500698566437, "learning_rate": 4.247768034753974e-06, "loss": 0.649, "step": 12412 }, { "epoch": 5.615471612757295, "grad_norm": 0.4800946116447449, "learning_rate": 4.247043632801502e-06, "loss": 0.6094, "step": 12413 }, { "epoch": 5.615923999095227, "grad_norm": 0.43346089124679565, "learning_rate": 4.246319247020204e-06, "loss": 0.4618, "step": 12414 }, { "epoch": 5.61637638543316, "grad_norm": 0.49738815426826477, "learning_rate": 4.245594877425637e-06, "loss": 0.6951, "step": 12415 }, { "epoch": 5.616828771771092, "grad_norm": 0.40777555108070374, "learning_rate": 4.2448705240333596e-06, "loss": 0.5214, "step": 12416 }, { "epoch": 5.617281158109025, "grad_norm": 0.47172996401786804, "learning_rate": 4.244146186858926e-06, "loss": 0.5064, "step": 12417 }, { "epoch": 5.617733544446958, "grad_norm": 0.47804635763168335, "learning_rate": 4.243421865917895e-06, "loss": 0.5894, "step": 12418 }, { "epoch": 5.6181859307848905, "grad_norm": 0.4591115713119507, "learning_rate": 4.2426975612258215e-06, "loss": 0.6225, "step": 12419 }, { "epoch": 5.618638317122823, "grad_norm": 0.4642426073551178, "learning_rate": 4.2419732727982615e-06, "loss": 0.5797, "step": 12420 }, { "epoch": 5.619090703460755, "grad_norm": 0.4751468896865845, "learning_rate": 4.241249000650771e-06, "loss": 0.5501, "step": 12421 }, { "epoch": 5.619543089798688, "grad_norm": 0.5049278140068054, "learning_rate": 4.240524744798905e-06, "loss": 0.5127, "step": 12422 }, { "epoch": 5.619995476136621, "grad_norm": 0.44664695858955383, "learning_rate": 4.239800505258219e-06, "loss": 0.433, "step": 12423 }, { "epoch": 5.6204478624745535, "grad_norm": 0.46168890595436096, "learning_rate": 4.239076282044266e-06, "loss": 0.5371, "step": 12424 }, { "epoch": 5.620900248812486, "grad_norm": 0.5185437202453613, "learning_rate": 4.2383520751726e-06, "loss": 0.547, "step": 12425 }, { "epoch": 5.621352635150418, "grad_norm": 0.47060298919677734, "learning_rate": 4.2376278846587755e-06, "loss": 0.4774, "step": 12426 }, { "epoch": 5.621805021488351, "grad_norm": 0.5502009391784668, "learning_rate": 4.236903710518346e-06, "loss": 0.6188, "step": 12427 }, { "epoch": 5.622257407826284, "grad_norm": 0.45363855361938477, "learning_rate": 4.2361795527668635e-06, "loss": 0.5059, "step": 12428 }, { "epoch": 5.622709794164217, "grad_norm": 0.5703052282333374, "learning_rate": 4.235455411419882e-06, "loss": 0.6229, "step": 12429 }, { "epoch": 5.623162180502149, "grad_norm": 0.5414186716079712, "learning_rate": 4.234731286492953e-06, "loss": 0.537, "step": 12430 }, { "epoch": 5.623614566840081, "grad_norm": 0.49227213859558105, "learning_rate": 4.23400717800163e-06, "loss": 0.5206, "step": 12431 }, { "epoch": 5.624066953178014, "grad_norm": 0.49307918548583984, "learning_rate": 4.233283085961461e-06, "loss": 0.4584, "step": 12432 }, { "epoch": 5.624519339515946, "grad_norm": 0.5564000010490417, "learning_rate": 4.232559010388001e-06, "loss": 0.5381, "step": 12433 }, { "epoch": 5.624971725853879, "grad_norm": 0.5951627492904663, "learning_rate": 4.2318349512968005e-06, "loss": 0.5311, "step": 12434 }, { "epoch": 5.625424112191812, "grad_norm": 0.5480487942695618, "learning_rate": 4.231110908703408e-06, "loss": 0.4619, "step": 12435 }, { "epoch": 5.6258764985297445, "grad_norm": 0.542133092880249, "learning_rate": 4.230386882623374e-06, "loss": 0.4656, "step": 12436 }, { "epoch": 5.626328884867677, "grad_norm": 0.5053402781486511, "learning_rate": 4.22966287307225e-06, "loss": 0.4384, "step": 12437 }, { "epoch": 5.626781271205609, "grad_norm": 0.5883341431617737, "learning_rate": 4.228938880065584e-06, "loss": 0.5986, "step": 12438 }, { "epoch": 5.627233657543542, "grad_norm": 0.5696621537208557, "learning_rate": 4.228214903618926e-06, "loss": 0.5193, "step": 12439 }, { "epoch": 5.627686043881475, "grad_norm": 0.5247088670730591, "learning_rate": 4.227490943747824e-06, "loss": 0.4683, "step": 12440 }, { "epoch": 5.6281384302194075, "grad_norm": 0.561400830745697, "learning_rate": 4.226767000467826e-06, "loss": 0.4439, "step": 12441 }, { "epoch": 5.62859081655734, "grad_norm": 0.5809794664382935, "learning_rate": 4.226043073794483e-06, "loss": 0.551, "step": 12442 }, { "epoch": 5.629043202895272, "grad_norm": 0.5477252006530762, "learning_rate": 4.2253191637433385e-06, "loss": 0.4765, "step": 12443 }, { "epoch": 5.629495589233205, "grad_norm": 0.553064227104187, "learning_rate": 4.224595270329944e-06, "loss": 0.5158, "step": 12444 }, { "epoch": 5.629947975571138, "grad_norm": 0.5769237875938416, "learning_rate": 4.2238713935698425e-06, "loss": 0.5233, "step": 12445 }, { "epoch": 5.630400361909071, "grad_norm": 0.6046124696731567, "learning_rate": 4.223147533478585e-06, "loss": 0.4529, "step": 12446 }, { "epoch": 5.630852748247003, "grad_norm": 0.6516922116279602, "learning_rate": 4.222423690071714e-06, "loss": 0.5554, "step": 12447 }, { "epoch": 5.631305134584935, "grad_norm": 0.6273292899131775, "learning_rate": 4.221699863364776e-06, "loss": 0.4873, "step": 12448 }, { "epoch": 5.631757520922868, "grad_norm": 0.728441059589386, "learning_rate": 4.220976053373319e-06, "loss": 0.5657, "step": 12449 }, { "epoch": 5.632209907260801, "grad_norm": 0.6580464839935303, "learning_rate": 4.220252260112885e-06, "loss": 0.5088, "step": 12450 }, { "epoch": 5.632662293598734, "grad_norm": 0.6670337319374084, "learning_rate": 4.219528483599021e-06, "loss": 0.5096, "step": 12451 }, { "epoch": 5.633114679936666, "grad_norm": 0.721570611000061, "learning_rate": 4.2188047238472714e-06, "loss": 0.47, "step": 12452 }, { "epoch": 5.6335670662745985, "grad_norm": 0.49893471598625183, "learning_rate": 4.2180809808731795e-06, "loss": 1.0323, "step": 12453 }, { "epoch": 5.634019452612531, "grad_norm": 0.23338371515274048, "learning_rate": 4.217357254692289e-06, "loss": 0.624, "step": 12454 }, { "epoch": 5.634471838950463, "grad_norm": 0.34657469391822815, "learning_rate": 4.216633545320143e-06, "loss": 0.5655, "step": 12455 }, { "epoch": 5.634924225288397, "grad_norm": 0.3437705338001251, "learning_rate": 4.215909852772287e-06, "loss": 0.7189, "step": 12456 }, { "epoch": 5.635376611626329, "grad_norm": 0.35679927468299866, "learning_rate": 4.215186177064262e-06, "loss": 0.67, "step": 12457 }, { "epoch": 5.6358289979642615, "grad_norm": 0.3739597201347351, "learning_rate": 4.214462518211609e-06, "loss": 0.4647, "step": 12458 }, { "epoch": 5.636281384302194, "grad_norm": 0.48372548818588257, "learning_rate": 4.213738876229871e-06, "loss": 0.6543, "step": 12459 }, { "epoch": 5.636733770640126, "grad_norm": 0.39088907837867737, "learning_rate": 4.21301525113459e-06, "loss": 0.5418, "step": 12460 }, { "epoch": 5.637186156978059, "grad_norm": 0.44126302003860474, "learning_rate": 4.212291642941307e-06, "loss": 0.5411, "step": 12461 }, { "epoch": 5.637638543315992, "grad_norm": 0.38188326358795166, "learning_rate": 4.211568051665562e-06, "loss": 0.4958, "step": 12462 }, { "epoch": 5.6380909296539246, "grad_norm": 0.4061121642589569, "learning_rate": 4.210844477322897e-06, "loss": 0.583, "step": 12463 }, { "epoch": 5.638543315991857, "grad_norm": 0.404807984828949, "learning_rate": 4.210120919928852e-06, "loss": 0.5844, "step": 12464 }, { "epoch": 5.638995702329789, "grad_norm": 0.4535755217075348, "learning_rate": 4.209397379498966e-06, "loss": 0.5755, "step": 12465 }, { "epoch": 5.639448088667722, "grad_norm": 0.4517979025840759, "learning_rate": 4.208673856048778e-06, "loss": 0.5548, "step": 12466 }, { "epoch": 5.639900475005655, "grad_norm": 0.4781966209411621, "learning_rate": 4.207950349593828e-06, "loss": 0.6548, "step": 12467 }, { "epoch": 5.640352861343588, "grad_norm": 0.4873236119747162, "learning_rate": 4.207226860149656e-06, "loss": 0.6005, "step": 12468 }, { "epoch": 5.64080524768152, "grad_norm": 0.4551929831504822, "learning_rate": 4.206503387731797e-06, "loss": 0.477, "step": 12469 }, { "epoch": 5.6412576340194525, "grad_norm": 0.4369319975376129, "learning_rate": 4.20577993235579e-06, "loss": 0.4779, "step": 12470 }, { "epoch": 5.641710020357385, "grad_norm": 0.5164049863815308, "learning_rate": 4.205056494037174e-06, "loss": 0.6568, "step": 12471 }, { "epoch": 5.642162406695318, "grad_norm": 0.5072240233421326, "learning_rate": 4.204333072791486e-06, "loss": 0.5216, "step": 12472 }, { "epoch": 5.642614793033251, "grad_norm": 0.44312766194343567, "learning_rate": 4.2036096686342614e-06, "loss": 0.4382, "step": 12473 }, { "epoch": 5.643067179371183, "grad_norm": 0.4818101227283478, "learning_rate": 4.202886281581038e-06, "loss": 0.4042, "step": 12474 }, { "epoch": 5.6435195657091155, "grad_norm": 0.491172194480896, "learning_rate": 4.202162911647351e-06, "loss": 0.5698, "step": 12475 }, { "epoch": 5.643971952047048, "grad_norm": 0.5215734243392944, "learning_rate": 4.201439558848737e-06, "loss": 0.5861, "step": 12476 }, { "epoch": 5.644424338384981, "grad_norm": 0.6347359418869019, "learning_rate": 4.200716223200731e-06, "loss": 0.7216, "step": 12477 }, { "epoch": 5.644876724722914, "grad_norm": 0.5795104503631592, "learning_rate": 4.1999929047188685e-06, "loss": 0.6357, "step": 12478 }, { "epoch": 5.645329111060846, "grad_norm": 0.5117807984352112, "learning_rate": 4.199269603418683e-06, "loss": 0.5285, "step": 12479 }, { "epoch": 5.6457814973987785, "grad_norm": 0.5001311302185059, "learning_rate": 4.198546319315711e-06, "loss": 0.524, "step": 12480 }, { "epoch": 5.646233883736711, "grad_norm": 0.5883253216743469, "learning_rate": 4.197823052425483e-06, "loss": 0.6767, "step": 12481 }, { "epoch": 5.646686270074643, "grad_norm": 0.47095516324043274, "learning_rate": 4.197099802763536e-06, "loss": 0.4632, "step": 12482 }, { "epoch": 5.647138656412576, "grad_norm": 0.5288903713226318, "learning_rate": 4.196376570345399e-06, "loss": 0.5011, "step": 12483 }, { "epoch": 5.647591042750509, "grad_norm": 0.49417707324028015, "learning_rate": 4.19565335518661e-06, "loss": 0.4216, "step": 12484 }, { "epoch": 5.648043429088442, "grad_norm": 0.6244030594825745, "learning_rate": 4.194930157302698e-06, "loss": 0.5914, "step": 12485 }, { "epoch": 5.648495815426374, "grad_norm": 0.5818678140640259, "learning_rate": 4.194206976709195e-06, "loss": 0.5677, "step": 12486 }, { "epoch": 5.6489482017643065, "grad_norm": 0.5488852858543396, "learning_rate": 4.193483813421634e-06, "loss": 0.5619, "step": 12487 }, { "epoch": 5.649400588102239, "grad_norm": 0.565551221370697, "learning_rate": 4.192760667455546e-06, "loss": 0.4827, "step": 12488 }, { "epoch": 5.649852974440172, "grad_norm": 0.6180415749549866, "learning_rate": 4.192037538826461e-06, "loss": 0.5854, "step": 12489 }, { "epoch": 5.650305360778105, "grad_norm": 0.5735427141189575, "learning_rate": 4.191314427549911e-06, "loss": 0.4187, "step": 12490 }, { "epoch": 5.650757747116037, "grad_norm": 0.6073324680328369, "learning_rate": 4.190591333641426e-06, "loss": 0.5355, "step": 12491 }, { "epoch": 5.6512101334539695, "grad_norm": 0.5997166633605957, "learning_rate": 4.189868257116535e-06, "loss": 0.5175, "step": 12492 }, { "epoch": 5.651662519791902, "grad_norm": 0.5826224088668823, "learning_rate": 4.1891451979907674e-06, "loss": 0.4505, "step": 12493 }, { "epoch": 5.652114906129835, "grad_norm": 0.5224854946136475, "learning_rate": 4.188422156279653e-06, "loss": 0.456, "step": 12494 }, { "epoch": 5.652567292467768, "grad_norm": 0.5945069193840027, "learning_rate": 4.18769913199872e-06, "loss": 0.4932, "step": 12495 }, { "epoch": 5.6530196788057, "grad_norm": 0.6782915592193604, "learning_rate": 4.186976125163496e-06, "loss": 0.5571, "step": 12496 }, { "epoch": 5.6534720651436325, "grad_norm": 0.6085245609283447, "learning_rate": 4.186253135789511e-06, "loss": 0.5115, "step": 12497 }, { "epoch": 5.653924451481565, "grad_norm": 0.6719927787780762, "learning_rate": 4.185530163892291e-06, "loss": 0.5347, "step": 12498 }, { "epoch": 5.654376837819498, "grad_norm": 0.5670915842056274, "learning_rate": 4.184807209487364e-06, "loss": 0.4459, "step": 12499 }, { "epoch": 5.654829224157431, "grad_norm": 0.6950914263725281, "learning_rate": 4.184084272590256e-06, "loss": 0.4932, "step": 12500 }, { "epoch": 5.655281610495363, "grad_norm": 0.6432456374168396, "learning_rate": 4.183361353216494e-06, "loss": 0.4644, "step": 12501 }, { "epoch": 5.655733996833296, "grad_norm": 1.0547856092453003, "learning_rate": 4.182638451381605e-06, "loss": 0.569, "step": 12502 }, { "epoch": 5.656186383171228, "grad_norm": 0.6114215850830078, "learning_rate": 4.181915567101113e-06, "loss": 1.1014, "step": 12503 }, { "epoch": 5.6566387695091604, "grad_norm": 0.2810978889465332, "learning_rate": 4.181192700390542e-06, "loss": 0.7915, "step": 12504 }, { "epoch": 5.657091155847094, "grad_norm": 0.39961346983909607, "learning_rate": 4.180469851265421e-06, "loss": 0.724, "step": 12505 }, { "epoch": 5.657543542185026, "grad_norm": 0.2867352068424225, "learning_rate": 4.179747019741272e-06, "loss": 0.3958, "step": 12506 }, { "epoch": 5.657995928522959, "grad_norm": 0.3600502610206604, "learning_rate": 4.17902420583362e-06, "loss": 0.6185, "step": 12507 }, { "epoch": 5.658448314860891, "grad_norm": 0.4294719398021698, "learning_rate": 4.178301409557988e-06, "loss": 0.6681, "step": 12508 }, { "epoch": 5.6589007011988235, "grad_norm": 0.4230382442474365, "learning_rate": 4.1775786309299e-06, "loss": 0.7226, "step": 12509 }, { "epoch": 5.659353087536756, "grad_norm": 0.41439661383628845, "learning_rate": 4.176855869964879e-06, "loss": 0.7134, "step": 12510 }, { "epoch": 5.659805473874689, "grad_norm": 0.4029291570186615, "learning_rate": 4.176133126678447e-06, "loss": 0.5648, "step": 12511 }, { "epoch": 5.660257860212622, "grad_norm": 0.48580434918403625, "learning_rate": 4.175410401086128e-06, "loss": 0.7055, "step": 12512 }, { "epoch": 5.660710246550554, "grad_norm": 0.42129191756248474, "learning_rate": 4.174687693203443e-06, "loss": 0.5364, "step": 12513 }, { "epoch": 5.6611626328884865, "grad_norm": 0.4566245675086975, "learning_rate": 4.173965003045912e-06, "loss": 0.6433, "step": 12514 }, { "epoch": 5.661615019226419, "grad_norm": 0.46056726574897766, "learning_rate": 4.1732423306290585e-06, "loss": 0.7284, "step": 12515 }, { "epoch": 5.662067405564352, "grad_norm": 0.47518986463546753, "learning_rate": 4.1725196759684015e-06, "loss": 0.5931, "step": 12516 }, { "epoch": 5.662519791902285, "grad_norm": 0.43445926904678345, "learning_rate": 4.171797039079462e-06, "loss": 0.4957, "step": 12517 }, { "epoch": 5.662972178240217, "grad_norm": 0.450774222612381, "learning_rate": 4.171074419977761e-06, "loss": 0.5734, "step": 12518 }, { "epoch": 5.66342456457815, "grad_norm": 0.4221005141735077, "learning_rate": 4.170351818678817e-06, "loss": 0.471, "step": 12519 }, { "epoch": 5.663876950916082, "grad_norm": 0.45204123854637146, "learning_rate": 4.169629235198149e-06, "loss": 0.4256, "step": 12520 }, { "epoch": 5.664329337254015, "grad_norm": 0.5164569020271301, "learning_rate": 4.168906669551277e-06, "loss": 0.6107, "step": 12521 }, { "epoch": 5.664781723591948, "grad_norm": 0.4208088517189026, "learning_rate": 4.168184121753718e-06, "loss": 0.4648, "step": 12522 }, { "epoch": 5.66523410992988, "grad_norm": 0.4795852303504944, "learning_rate": 4.167461591820991e-06, "loss": 0.5435, "step": 12523 }, { "epoch": 5.665686496267813, "grad_norm": 0.46276846528053284, "learning_rate": 4.166739079768615e-06, "loss": 0.4949, "step": 12524 }, { "epoch": 5.666138882605745, "grad_norm": 0.471352219581604, "learning_rate": 4.166016585612106e-06, "loss": 0.5336, "step": 12525 }, { "epoch": 5.666591268943678, "grad_norm": 0.4662570059299469, "learning_rate": 4.16529410936698e-06, "loss": 0.4542, "step": 12526 }, { "epoch": 5.667043655281611, "grad_norm": 0.44319772720336914, "learning_rate": 4.164571651048754e-06, "loss": 0.4433, "step": 12527 }, { "epoch": 5.667496041619543, "grad_norm": 0.5246009230613708, "learning_rate": 4.163849210672946e-06, "loss": 0.5398, "step": 12528 }, { "epoch": 5.667948427957476, "grad_norm": 0.5326296091079712, "learning_rate": 4.163126788255069e-06, "loss": 0.6078, "step": 12529 }, { "epoch": 5.668400814295408, "grad_norm": 0.5762249827384949, "learning_rate": 4.162404383810641e-06, "loss": 0.6197, "step": 12530 }, { "epoch": 5.6688532006333405, "grad_norm": 0.4864983558654785, "learning_rate": 4.161681997355175e-06, "loss": 0.4685, "step": 12531 }, { "epoch": 5.669305586971274, "grad_norm": 0.4938550293445587, "learning_rate": 4.160959628904186e-06, "loss": 0.5397, "step": 12532 }, { "epoch": 5.669757973309206, "grad_norm": 0.5347737669944763, "learning_rate": 4.160237278473189e-06, "loss": 0.4966, "step": 12533 }, { "epoch": 5.670210359647139, "grad_norm": 0.5312497615814209, "learning_rate": 4.159514946077699e-06, "loss": 0.5372, "step": 12534 }, { "epoch": 5.670662745985071, "grad_norm": 0.5388982892036438, "learning_rate": 4.158792631733226e-06, "loss": 0.488, "step": 12535 }, { "epoch": 5.671115132323004, "grad_norm": 0.5143072009086609, "learning_rate": 4.158070335455287e-06, "loss": 0.515, "step": 12536 }, { "epoch": 5.671567518660936, "grad_norm": 0.47405484318733215, "learning_rate": 4.157348057259391e-06, "loss": 0.4367, "step": 12537 }, { "epoch": 5.672019904998869, "grad_norm": 0.5685434341430664, "learning_rate": 4.156625797161054e-06, "loss": 0.5017, "step": 12538 }, { "epoch": 5.672472291336802, "grad_norm": 0.5010643005371094, "learning_rate": 4.1559035551757845e-06, "loss": 0.5716, "step": 12539 }, { "epoch": 5.672924677674734, "grad_norm": 0.5420850515365601, "learning_rate": 4.155181331319096e-06, "loss": 0.4395, "step": 12540 }, { "epoch": 5.673377064012667, "grad_norm": 0.5583322048187256, "learning_rate": 4.154459125606499e-06, "loss": 0.5548, "step": 12541 }, { "epoch": 5.673829450350599, "grad_norm": 0.5053964853286743, "learning_rate": 4.153736938053504e-06, "loss": 0.4442, "step": 12542 }, { "epoch": 5.674281836688532, "grad_norm": 0.5668990612030029, "learning_rate": 4.153014768675622e-06, "loss": 0.5305, "step": 12543 }, { "epoch": 5.674734223026465, "grad_norm": 0.5535005927085876, "learning_rate": 4.1522926174883635e-06, "loss": 0.4361, "step": 12544 }, { "epoch": 5.675186609364397, "grad_norm": 0.546942412853241, "learning_rate": 4.151570484507237e-06, "loss": 0.4447, "step": 12545 }, { "epoch": 5.67563899570233, "grad_norm": 0.6410817503929138, "learning_rate": 4.150848369747751e-06, "loss": 0.4762, "step": 12546 }, { "epoch": 5.676091382040262, "grad_norm": 0.5614113211631775, "learning_rate": 4.150126273225417e-06, "loss": 0.4958, "step": 12547 }, { "epoch": 5.676543768378195, "grad_norm": 0.556631863117218, "learning_rate": 4.14940419495574e-06, "loss": 0.4362, "step": 12548 }, { "epoch": 5.676996154716128, "grad_norm": 0.6517258286476135, "learning_rate": 4.1486821349542304e-06, "loss": 0.5104, "step": 12549 }, { "epoch": 5.67744854105406, "grad_norm": 0.6132650375366211, "learning_rate": 4.147960093236395e-06, "loss": 0.4375, "step": 12550 }, { "epoch": 5.677900927391993, "grad_norm": 0.7219163179397583, "learning_rate": 4.147238069817741e-06, "loss": 0.5638, "step": 12551 }, { "epoch": 5.678353313729925, "grad_norm": 0.7585386633872986, "learning_rate": 4.146516064713775e-06, "loss": 0.5291, "step": 12552 }, { "epoch": 5.678805700067858, "grad_norm": 0.5436564683914185, "learning_rate": 4.145794077940004e-06, "loss": 1.0849, "step": 12553 }, { "epoch": 5.679258086405791, "grad_norm": 0.31760430335998535, "learning_rate": 4.145072109511934e-06, "loss": 0.8072, "step": 12554 }, { "epoch": 5.679710472743723, "grad_norm": 0.3022230267524719, "learning_rate": 4.144350159445071e-06, "loss": 0.5798, "step": 12555 }, { "epoch": 5.680162859081656, "grad_norm": 0.37468773126602173, "learning_rate": 4.1436282277549186e-06, "loss": 0.6863, "step": 12556 }, { "epoch": 5.680615245419588, "grad_norm": 0.34599658846855164, "learning_rate": 4.142906314456983e-06, "loss": 0.544, "step": 12557 }, { "epoch": 5.681067631757521, "grad_norm": 0.3493625819683075, "learning_rate": 4.1421844195667695e-06, "loss": 0.5508, "step": 12558 }, { "epoch": 5.681520018095453, "grad_norm": 0.38399171829223633, "learning_rate": 4.141462543099781e-06, "loss": 0.5899, "step": 12559 }, { "epoch": 5.681972404433386, "grad_norm": 0.3990771174430847, "learning_rate": 4.14074068507152e-06, "loss": 0.5841, "step": 12560 }, { "epoch": 5.682424790771319, "grad_norm": 0.42657017707824707, "learning_rate": 4.140018845497492e-06, "loss": 0.5434, "step": 12561 }, { "epoch": 5.682877177109251, "grad_norm": 0.38601571321487427, "learning_rate": 4.139297024393198e-06, "loss": 0.5858, "step": 12562 }, { "epoch": 5.683329563447184, "grad_norm": 0.4351204037666321, "learning_rate": 4.138575221774143e-06, "loss": 0.6703, "step": 12563 }, { "epoch": 5.683781949785116, "grad_norm": 0.413465291261673, "learning_rate": 4.1378534376558265e-06, "loss": 0.5703, "step": 12564 }, { "epoch": 5.684234336123049, "grad_norm": 0.4619792103767395, "learning_rate": 4.137131672053751e-06, "loss": 0.6026, "step": 12565 }, { "epoch": 5.684686722460982, "grad_norm": 0.48235997557640076, "learning_rate": 4.136409924983419e-06, "loss": 0.6609, "step": 12566 }, { "epoch": 5.685139108798914, "grad_norm": 0.46129828691482544, "learning_rate": 4.135688196460329e-06, "loss": 0.553, "step": 12567 }, { "epoch": 5.685591495136847, "grad_norm": 0.43514299392700195, "learning_rate": 4.134966486499984e-06, "loss": 0.4679, "step": 12568 }, { "epoch": 5.686043881474779, "grad_norm": 0.47233307361602783, "learning_rate": 4.134244795117882e-06, "loss": 0.6346, "step": 12569 }, { "epoch": 5.6864962678127124, "grad_norm": 0.43426522612571716, "learning_rate": 4.133523122329526e-06, "loss": 0.4602, "step": 12570 }, { "epoch": 5.686948654150645, "grad_norm": 0.5263886451721191, "learning_rate": 4.1328014681504105e-06, "loss": 0.6301, "step": 12571 }, { "epoch": 5.687401040488577, "grad_norm": 0.5037388205528259, "learning_rate": 4.132079832596038e-06, "loss": 0.6859, "step": 12572 }, { "epoch": 5.68785342682651, "grad_norm": 0.448112428188324, "learning_rate": 4.131358215681905e-06, "loss": 0.4634, "step": 12573 }, { "epoch": 5.688305813164442, "grad_norm": 0.43756306171417236, "learning_rate": 4.1306366174235114e-06, "loss": 0.4605, "step": 12574 }, { "epoch": 5.6887581995023755, "grad_norm": 0.48915955424308777, "learning_rate": 4.129915037836353e-06, "loss": 0.5426, "step": 12575 }, { "epoch": 5.689210585840308, "grad_norm": 0.44951769709587097, "learning_rate": 4.129193476935929e-06, "loss": 0.5745, "step": 12576 }, { "epoch": 5.68966297217824, "grad_norm": 0.523809015750885, "learning_rate": 4.128471934737735e-06, "loss": 0.515, "step": 12577 }, { "epoch": 5.690115358516173, "grad_norm": 0.5296573042869568, "learning_rate": 4.127750411257267e-06, "loss": 0.5734, "step": 12578 }, { "epoch": 5.690567744854105, "grad_norm": 0.5253081321716309, "learning_rate": 4.127028906510023e-06, "loss": 0.5392, "step": 12579 }, { "epoch": 5.691020131192038, "grad_norm": 0.4883047342300415, "learning_rate": 4.126307420511498e-06, "loss": 0.4803, "step": 12580 }, { "epoch": 5.691472517529971, "grad_norm": 0.5410770773887634, "learning_rate": 4.125585953277187e-06, "loss": 0.5295, "step": 12581 }, { "epoch": 5.691924903867903, "grad_norm": 0.5533269643783569, "learning_rate": 4.124864504822584e-06, "loss": 0.5226, "step": 12582 }, { "epoch": 5.692377290205836, "grad_norm": 0.5569542646408081, "learning_rate": 4.124143075163185e-06, "loss": 0.6115, "step": 12583 }, { "epoch": 5.692829676543768, "grad_norm": 0.46602699160575867, "learning_rate": 4.123421664314483e-06, "loss": 0.468, "step": 12584 }, { "epoch": 5.693282062881701, "grad_norm": 0.5321898460388184, "learning_rate": 4.122700272291972e-06, "loss": 0.4837, "step": 12585 }, { "epoch": 5.693734449219633, "grad_norm": 0.5093689560890198, "learning_rate": 4.1219788991111454e-06, "loss": 0.5204, "step": 12586 }, { "epoch": 5.694186835557566, "grad_norm": 0.4932047128677368, "learning_rate": 4.121257544787496e-06, "loss": 0.472, "step": 12587 }, { "epoch": 5.694639221895499, "grad_norm": 0.625179648399353, "learning_rate": 4.120536209336516e-06, "loss": 0.5289, "step": 12588 }, { "epoch": 5.695091608233431, "grad_norm": 0.5698224902153015, "learning_rate": 4.119814892773698e-06, "loss": 0.4799, "step": 12589 }, { "epoch": 5.695543994571364, "grad_norm": 0.5355980396270752, "learning_rate": 4.119093595114533e-06, "loss": 0.4852, "step": 12590 }, { "epoch": 5.695996380909296, "grad_norm": 0.5314083695411682, "learning_rate": 4.118372316374512e-06, "loss": 0.4579, "step": 12591 }, { "epoch": 5.6964487672472295, "grad_norm": 0.5134862661361694, "learning_rate": 4.117651056569127e-06, "loss": 0.3884, "step": 12592 }, { "epoch": 5.696901153585162, "grad_norm": 0.5843636989593506, "learning_rate": 4.11692981571387e-06, "loss": 0.5674, "step": 12593 }, { "epoch": 5.697353539923094, "grad_norm": 0.6342141628265381, "learning_rate": 4.116208593824227e-06, "loss": 0.5285, "step": 12594 }, { "epoch": 5.697805926261027, "grad_norm": 0.5529889464378357, "learning_rate": 4.115487390915689e-06, "loss": 0.4972, "step": 12595 }, { "epoch": 5.698258312598959, "grad_norm": 0.5970746874809265, "learning_rate": 4.114766207003746e-06, "loss": 0.4916, "step": 12596 }, { "epoch": 5.6987106989368925, "grad_norm": 0.6226773262023926, "learning_rate": 4.1140450421038865e-06, "loss": 0.4996, "step": 12597 }, { "epoch": 5.699163085274825, "grad_norm": 0.6787109375, "learning_rate": 4.1133238962315995e-06, "loss": 0.4984, "step": 12598 }, { "epoch": 5.699615471612757, "grad_norm": 0.7458872199058533, "learning_rate": 4.1126027694023716e-06, "loss": 0.6611, "step": 12599 }, { "epoch": 5.70006785795069, "grad_norm": 0.6676209568977356, "learning_rate": 4.111881661631692e-06, "loss": 0.47, "step": 12600 }, { "epoch": 5.70006785795069, "eval_loss": 0.5888630151748657, "eval_runtime": 25.063, "eval_samples_per_second": 29.685, "eval_steps_per_second": 7.421, "step": 12600 }, { "epoch": 5.700520244288622, "grad_norm": 0.6418895125389099, "learning_rate": 4.111160572935046e-06, "loss": 0.4203, "step": 12601 }, { "epoch": 5.700972630626556, "grad_norm": 0.8164469599723816, "learning_rate": 4.1104395033279215e-06, "loss": 0.5252, "step": 12602 }, { "epoch": 5.701425016964488, "grad_norm": 0.5437166690826416, "learning_rate": 4.109718452825805e-06, "loss": 0.7506, "step": 12603 }, { "epoch": 5.70187740330242, "grad_norm": 0.2472403347492218, "learning_rate": 4.108997421444184e-06, "loss": 1.216, "step": 12604 }, { "epoch": 5.702329789640353, "grad_norm": 0.29648005962371826, "learning_rate": 4.10827640919854e-06, "loss": 0.6446, "step": 12605 }, { "epoch": 5.702782175978285, "grad_norm": 0.46411368250846863, "learning_rate": 4.107555416104361e-06, "loss": 0.4685, "step": 12606 }, { "epoch": 5.703234562316218, "grad_norm": 0.3953568637371063, "learning_rate": 4.10683444217713e-06, "loss": 0.6218, "step": 12607 }, { "epoch": 5.70368694865415, "grad_norm": 0.40240421891212463, "learning_rate": 4.106113487432333e-06, "loss": 0.6508, "step": 12608 }, { "epoch": 5.7041393349920835, "grad_norm": 0.4608251750469208, "learning_rate": 4.105392551885453e-06, "loss": 0.6436, "step": 12609 }, { "epoch": 5.704591721330016, "grad_norm": 0.44558557868003845, "learning_rate": 4.104671635551974e-06, "loss": 0.6421, "step": 12610 }, { "epoch": 5.705044107667948, "grad_norm": 0.434039831161499, "learning_rate": 4.103950738447378e-06, "loss": 0.5395, "step": 12611 }, { "epoch": 5.705496494005881, "grad_norm": 0.4298863410949707, "learning_rate": 4.103229860587148e-06, "loss": 0.8527, "step": 12612 }, { "epoch": 5.705948880343813, "grad_norm": 0.46330809593200684, "learning_rate": 4.102509001986767e-06, "loss": 0.6795, "step": 12613 }, { "epoch": 5.7064012666817465, "grad_norm": 0.49018505215644836, "learning_rate": 4.101788162661718e-06, "loss": 0.6349, "step": 12614 }, { "epoch": 5.706853653019679, "grad_norm": 0.48233842849731445, "learning_rate": 4.101067342627479e-06, "loss": 0.6039, "step": 12615 }, { "epoch": 5.707306039357611, "grad_norm": 0.4722541272640228, "learning_rate": 4.1003465418995334e-06, "loss": 0.6069, "step": 12616 }, { "epoch": 5.707758425695544, "grad_norm": 0.5278608202934265, "learning_rate": 4.099625760493361e-06, "loss": 0.5416, "step": 12617 }, { "epoch": 5.708210812033476, "grad_norm": 0.4972821772098541, "learning_rate": 4.098904998424442e-06, "loss": 0.6029, "step": 12618 }, { "epoch": 5.70866319837141, "grad_norm": 0.5001148581504822, "learning_rate": 4.098184255708256e-06, "loss": 0.5509, "step": 12619 }, { "epoch": 5.709115584709342, "grad_norm": 0.4281996786594391, "learning_rate": 4.0974635323602825e-06, "loss": 0.4335, "step": 12620 }, { "epoch": 5.709567971047274, "grad_norm": 0.4268491566181183, "learning_rate": 4.096742828396001e-06, "loss": 0.4499, "step": 12621 }, { "epoch": 5.710020357385207, "grad_norm": 0.42278385162353516, "learning_rate": 4.0960221438308874e-06, "loss": 0.492, "step": 12622 }, { "epoch": 5.710472743723139, "grad_norm": 0.5123460292816162, "learning_rate": 4.095301478680424e-06, "loss": 0.5624, "step": 12623 }, { "epoch": 5.710925130061073, "grad_norm": 0.507178008556366, "learning_rate": 4.094580832960085e-06, "loss": 0.5497, "step": 12624 }, { "epoch": 5.711377516399005, "grad_norm": 0.43536606431007385, "learning_rate": 4.093860206685349e-06, "loss": 0.4852, "step": 12625 }, { "epoch": 5.7118299027369375, "grad_norm": 0.5110580325126648, "learning_rate": 4.093139599871695e-06, "loss": 0.6403, "step": 12626 }, { "epoch": 5.71228228907487, "grad_norm": 0.5083260536193848, "learning_rate": 4.0924190125345954e-06, "loss": 0.4995, "step": 12627 }, { "epoch": 5.712734675412802, "grad_norm": 0.4597178101539612, "learning_rate": 4.091698444689527e-06, "loss": 0.428, "step": 12628 }, { "epoch": 5.713187061750735, "grad_norm": 0.5698702335357666, "learning_rate": 4.090977896351967e-06, "loss": 0.6385, "step": 12629 }, { "epoch": 5.713639448088668, "grad_norm": 0.4643033444881439, "learning_rate": 4.09025736753739e-06, "loss": 0.4585, "step": 12630 }, { "epoch": 5.7140918344266005, "grad_norm": 0.49027442932128906, "learning_rate": 4.089536858261271e-06, "loss": 0.481, "step": 12631 }, { "epoch": 5.714544220764533, "grad_norm": 0.5341509580612183, "learning_rate": 4.088816368539083e-06, "loss": 0.5289, "step": 12632 }, { "epoch": 5.714996607102465, "grad_norm": 0.45834779739379883, "learning_rate": 4.088095898386301e-06, "loss": 0.5162, "step": 12633 }, { "epoch": 5.715448993440398, "grad_norm": 0.4617919623851776, "learning_rate": 4.087375447818399e-06, "loss": 0.3946, "step": 12634 }, { "epoch": 5.71590137977833, "grad_norm": 0.5812906622886658, "learning_rate": 4.086655016850848e-06, "loss": 0.5607, "step": 12635 }, { "epoch": 5.716353766116264, "grad_norm": 0.5375627875328064, "learning_rate": 4.085934605499122e-06, "loss": 0.5962, "step": 12636 }, { "epoch": 5.716806152454196, "grad_norm": 0.5004717111587524, "learning_rate": 4.085214213778694e-06, "loss": 0.5703, "step": 12637 }, { "epoch": 5.717258538792128, "grad_norm": 0.5254408717155457, "learning_rate": 4.084493841705036e-06, "loss": 0.4776, "step": 12638 }, { "epoch": 5.717710925130061, "grad_norm": 0.581995964050293, "learning_rate": 4.083773489293617e-06, "loss": 0.5367, "step": 12639 }, { "epoch": 5.718163311467993, "grad_norm": 0.4954926371574402, "learning_rate": 4.083053156559909e-06, "loss": 0.4635, "step": 12640 }, { "epoch": 5.718615697805927, "grad_norm": 0.5400475859642029, "learning_rate": 4.0823328435193825e-06, "loss": 0.4536, "step": 12641 }, { "epoch": 5.719068084143859, "grad_norm": 0.5104039311408997, "learning_rate": 4.081612550187508e-06, "loss": 0.4598, "step": 12642 }, { "epoch": 5.7195204704817915, "grad_norm": 0.5342324376106262, "learning_rate": 4.080892276579754e-06, "loss": 0.4587, "step": 12643 }, { "epoch": 5.719972856819724, "grad_norm": 0.6132945418357849, "learning_rate": 4.080172022711592e-06, "loss": 0.5547, "step": 12644 }, { "epoch": 5.720425243157656, "grad_norm": 0.6007161140441895, "learning_rate": 4.0794517885984894e-06, "loss": 0.5253, "step": 12645 }, { "epoch": 5.72087762949559, "grad_norm": 0.5269535183906555, "learning_rate": 4.078731574255915e-06, "loss": 0.4089, "step": 12646 }, { "epoch": 5.721330015833522, "grad_norm": 0.6078847646713257, "learning_rate": 4.078011379699336e-06, "loss": 0.4869, "step": 12647 }, { "epoch": 5.7217824021714545, "grad_norm": 0.6286530494689941, "learning_rate": 4.0772912049442204e-06, "loss": 0.4414, "step": 12648 }, { "epoch": 5.722234788509387, "grad_norm": 0.7306817173957825, "learning_rate": 4.076571050006037e-06, "loss": 0.5343, "step": 12649 }, { "epoch": 5.722687174847319, "grad_norm": 0.666853129863739, "learning_rate": 4.075850914900249e-06, "loss": 0.5514, "step": 12650 }, { "epoch": 5.723139561185253, "grad_norm": 0.6384944915771484, "learning_rate": 4.075130799642326e-06, "loss": 0.4354, "step": 12651 }, { "epoch": 5.723591947523185, "grad_norm": 0.7648922801017761, "learning_rate": 4.074410704247732e-06, "loss": 0.4635, "step": 12652 }, { "epoch": 5.7240443338611176, "grad_norm": 0.5668854117393494, "learning_rate": 4.073690628731932e-06, "loss": 0.9074, "step": 12653 }, { "epoch": 5.72449672019905, "grad_norm": 0.2901831865310669, "learning_rate": 4.072970573110393e-06, "loss": 0.7967, "step": 12654 }, { "epoch": 5.724949106536982, "grad_norm": 0.28067874908447266, "learning_rate": 4.072250537398578e-06, "loss": 0.5019, "step": 12655 }, { "epoch": 5.725401492874915, "grad_norm": 0.3462749421596527, "learning_rate": 4.071530521611951e-06, "loss": 0.5244, "step": 12656 }, { "epoch": 5.725853879212847, "grad_norm": 0.3735601305961609, "learning_rate": 4.070810525765976e-06, "loss": 0.6209, "step": 12657 }, { "epoch": 5.726306265550781, "grad_norm": 0.37457290291786194, "learning_rate": 4.070090549876118e-06, "loss": 0.6254, "step": 12658 }, { "epoch": 5.726758651888713, "grad_norm": 0.412557989358902, "learning_rate": 4.0693705939578375e-06, "loss": 0.6801, "step": 12659 }, { "epoch": 5.7272110382266455, "grad_norm": 0.37050333619117737, "learning_rate": 4.068650658026599e-06, "loss": 0.5522, "step": 12660 }, { "epoch": 5.727663424564578, "grad_norm": 0.42420247197151184, "learning_rate": 4.067930742097862e-06, "loss": 0.6311, "step": 12661 }, { "epoch": 5.72811581090251, "grad_norm": 0.41390469670295715, "learning_rate": 4.0672108461870905e-06, "loss": 0.6483, "step": 12662 }, { "epoch": 5.728568197240444, "grad_norm": 0.37566086649894714, "learning_rate": 4.066490970309743e-06, "loss": 0.5051, "step": 12663 }, { "epoch": 5.729020583578376, "grad_norm": 0.4175930619239807, "learning_rate": 4.0657711144812826e-06, "loss": 0.5503, "step": 12664 }, { "epoch": 5.7294729699163085, "grad_norm": 0.4229668080806732, "learning_rate": 4.065051278717169e-06, "loss": 0.5852, "step": 12665 }, { "epoch": 5.729925356254241, "grad_norm": 0.4265505075454712, "learning_rate": 4.064331463032862e-06, "loss": 0.5111, "step": 12666 }, { "epoch": 5.730377742592173, "grad_norm": 0.40735480189323425, "learning_rate": 4.0636116674438194e-06, "loss": 0.5164, "step": 12667 }, { "epoch": 5.730830128930107, "grad_norm": 0.4292065501213074, "learning_rate": 4.062891891965503e-06, "loss": 0.5448, "step": 12668 }, { "epoch": 5.731282515268039, "grad_norm": 0.42439091205596924, "learning_rate": 4.062172136613369e-06, "loss": 0.5029, "step": 12669 }, { "epoch": 5.7317349016059715, "grad_norm": 0.45090529322624207, "learning_rate": 4.061452401402876e-06, "loss": 0.5149, "step": 12670 }, { "epoch": 5.732187287943904, "grad_norm": 0.48279446363449097, "learning_rate": 4.060732686349484e-06, "loss": 0.5046, "step": 12671 }, { "epoch": 5.732639674281836, "grad_norm": 0.4678773283958435, "learning_rate": 4.060012991468648e-06, "loss": 0.5115, "step": 12672 }, { "epoch": 5.73309206061977, "grad_norm": 0.4754822552204132, "learning_rate": 4.059293316775825e-06, "loss": 0.4709, "step": 12673 }, { "epoch": 5.733544446957702, "grad_norm": 0.49802306294441223, "learning_rate": 4.058573662286471e-06, "loss": 0.5858, "step": 12674 }, { "epoch": 5.733996833295635, "grad_norm": 0.5545689463615417, "learning_rate": 4.057854028016043e-06, "loss": 0.6266, "step": 12675 }, { "epoch": 5.734449219633567, "grad_norm": 0.5061941742897034, "learning_rate": 4.057134413979995e-06, "loss": 0.544, "step": 12676 }, { "epoch": 5.7349016059714994, "grad_norm": 0.5620335340499878, "learning_rate": 4.056414820193784e-06, "loss": 0.5871, "step": 12677 }, { "epoch": 5.735353992309432, "grad_norm": 0.5410709381103516, "learning_rate": 4.055695246672864e-06, "loss": 0.5716, "step": 12678 }, { "epoch": 5.735806378647365, "grad_norm": 0.6091393232345581, "learning_rate": 4.054975693432688e-06, "loss": 0.6724, "step": 12679 }, { "epoch": 5.736258764985298, "grad_norm": 0.513572096824646, "learning_rate": 4.054256160488711e-06, "loss": 0.5489, "step": 12680 }, { "epoch": 5.73671115132323, "grad_norm": 0.5077426433563232, "learning_rate": 4.053536647856387e-06, "loss": 0.5196, "step": 12681 }, { "epoch": 5.7371635376611625, "grad_norm": 0.5141040086746216, "learning_rate": 4.052817155551168e-06, "loss": 0.5367, "step": 12682 }, { "epoch": 5.737615923999095, "grad_norm": 0.5041226148605347, "learning_rate": 4.0520976835885065e-06, "loss": 0.4936, "step": 12683 }, { "epoch": 5.738068310337027, "grad_norm": 0.5824228525161743, "learning_rate": 4.051378231983854e-06, "loss": 0.5416, "step": 12684 }, { "epoch": 5.738520696674961, "grad_norm": 0.5313569903373718, "learning_rate": 4.050658800752663e-06, "loss": 0.536, "step": 12685 }, { "epoch": 5.738973083012893, "grad_norm": 0.4517059028148651, "learning_rate": 4.049939389910384e-06, "loss": 0.4226, "step": 12686 }, { "epoch": 5.7394254693508255, "grad_norm": 0.6143425703048706, "learning_rate": 4.049219999472468e-06, "loss": 0.5415, "step": 12687 }, { "epoch": 5.739877855688758, "grad_norm": 0.5364498496055603, "learning_rate": 4.048500629454365e-06, "loss": 0.5598, "step": 12688 }, { "epoch": 5.74033024202669, "grad_norm": 0.5236214995384216, "learning_rate": 4.047781279871526e-06, "loss": 0.5169, "step": 12689 }, { "epoch": 5.740782628364624, "grad_norm": 0.48070162534713745, "learning_rate": 4.047061950739398e-06, "loss": 0.4308, "step": 12690 }, { "epoch": 5.741235014702556, "grad_norm": 0.5340002179145813, "learning_rate": 4.046342642073433e-06, "loss": 0.5365, "step": 12691 }, { "epoch": 5.741687401040489, "grad_norm": 0.579314112663269, "learning_rate": 4.045623353889077e-06, "loss": 0.557, "step": 12692 }, { "epoch": 5.742139787378421, "grad_norm": 0.5436702966690063, "learning_rate": 4.04490408620178e-06, "loss": 0.4931, "step": 12693 }, { "epoch": 5.742592173716353, "grad_norm": 0.5770688652992249, "learning_rate": 4.044184839026989e-06, "loss": 0.5038, "step": 12694 }, { "epoch": 5.743044560054287, "grad_norm": 0.5657736659049988, "learning_rate": 4.043465612380151e-06, "loss": 0.4832, "step": 12695 }, { "epoch": 5.743496946392219, "grad_norm": 0.5539020895957947, "learning_rate": 4.042746406276711e-06, "loss": 0.4481, "step": 12696 }, { "epoch": 5.743949332730152, "grad_norm": 0.8043603301048279, "learning_rate": 4.042027220732117e-06, "loss": 0.7307, "step": 12697 }, { "epoch": 5.744401719068084, "grad_norm": 0.5663145184516907, "learning_rate": 4.041308055761817e-06, "loss": 0.3913, "step": 12698 }, { "epoch": 5.7448541054060165, "grad_norm": 0.7284775376319885, "learning_rate": 4.040588911381253e-06, "loss": 0.556, "step": 12699 }, { "epoch": 5.74530649174395, "grad_norm": 0.5810743570327759, "learning_rate": 4.0398697876058715e-06, "loss": 0.4309, "step": 12700 }, { "epoch": 5.745758878081882, "grad_norm": 0.6253424286842346, "learning_rate": 4.039150684451117e-06, "loss": 0.4769, "step": 12701 }, { "epoch": 5.746211264419815, "grad_norm": 0.7449482679367065, "learning_rate": 4.038431601932433e-06, "loss": 0.424, "step": 12702 }, { "epoch": 5.746663650757747, "grad_norm": 0.5120706558227539, "learning_rate": 4.037712540065265e-06, "loss": 0.9915, "step": 12703 }, { "epoch": 5.7471160370956795, "grad_norm": 0.21635718643665314, "learning_rate": 4.036993498865054e-06, "loss": 1.2312, "step": 12704 }, { "epoch": 5.747568423433612, "grad_norm": 0.28858715295791626, "learning_rate": 4.0362744783472434e-06, "loss": 0.6528, "step": 12705 }, { "epoch": 5.748020809771545, "grad_norm": 0.28432148694992065, "learning_rate": 4.035555478527278e-06, "loss": 0.4723, "step": 12706 }, { "epoch": 5.748473196109478, "grad_norm": 0.35745319724082947, "learning_rate": 4.034836499420595e-06, "loss": 0.6641, "step": 12707 }, { "epoch": 5.74892558244741, "grad_norm": 0.4002746641635895, "learning_rate": 4.034117541042639e-06, "loss": 0.6757, "step": 12708 }, { "epoch": 5.749377968785343, "grad_norm": 0.4174477756023407, "learning_rate": 4.0333986034088504e-06, "loss": 0.6394, "step": 12709 }, { "epoch": 5.749830355123275, "grad_norm": 0.40395912528038025, "learning_rate": 4.03267968653467e-06, "loss": 0.6486, "step": 12710 }, { "epoch": 5.750282741461207, "grad_norm": 0.4218120574951172, "learning_rate": 4.0319607904355375e-06, "loss": 0.6226, "step": 12711 }, { "epoch": 5.750735127799141, "grad_norm": 0.4095368981361389, "learning_rate": 4.0312419151268925e-06, "loss": 0.5824, "step": 12712 }, { "epoch": 5.751187514137073, "grad_norm": 0.4128633737564087, "learning_rate": 4.030523060624174e-06, "loss": 0.5152, "step": 12713 }, { "epoch": 5.751639900475006, "grad_norm": 0.46463683247566223, "learning_rate": 4.029804226942823e-06, "loss": 0.6105, "step": 12714 }, { "epoch": 5.752092286812938, "grad_norm": 0.43848705291748047, "learning_rate": 4.029085414098274e-06, "loss": 0.5495, "step": 12715 }, { "epoch": 5.7525446731508705, "grad_norm": 0.4634358286857605, "learning_rate": 4.028366622105968e-06, "loss": 0.5741, "step": 12716 }, { "epoch": 5.752997059488804, "grad_norm": 0.4405931532382965, "learning_rate": 4.027647850981342e-06, "loss": 0.5682, "step": 12717 }, { "epoch": 5.753449445826736, "grad_norm": 0.42698273062705994, "learning_rate": 4.026929100739831e-06, "loss": 0.5761, "step": 12718 }, { "epoch": 5.753901832164669, "grad_norm": 0.4900575578212738, "learning_rate": 4.026210371396874e-06, "loss": 0.6181, "step": 12719 }, { "epoch": 5.754354218502601, "grad_norm": 0.4313044846057892, "learning_rate": 4.025491662967904e-06, "loss": 0.5881, "step": 12720 }, { "epoch": 5.7548066048405335, "grad_norm": 0.4725438356399536, "learning_rate": 4.024772975468359e-06, "loss": 0.5432, "step": 12721 }, { "epoch": 5.755258991178467, "grad_norm": 0.4433285892009735, "learning_rate": 4.024054308913675e-06, "loss": 0.5687, "step": 12722 }, { "epoch": 5.755711377516399, "grad_norm": 0.4523036479949951, "learning_rate": 4.023335663319285e-06, "loss": 0.4522, "step": 12723 }, { "epoch": 5.756163763854332, "grad_norm": 0.48974835872650146, "learning_rate": 4.022617038700624e-06, "loss": 0.5956, "step": 12724 }, { "epoch": 5.756616150192264, "grad_norm": 0.4047718346118927, "learning_rate": 4.021898435073124e-06, "loss": 0.3864, "step": 12725 }, { "epoch": 5.757068536530197, "grad_norm": 0.5405908823013306, "learning_rate": 4.0211798524522215e-06, "loss": 0.5339, "step": 12726 }, { "epoch": 5.757520922868129, "grad_norm": 0.44068533182144165, "learning_rate": 4.0204612908533474e-06, "loss": 0.4657, "step": 12727 }, { "epoch": 5.757973309206062, "grad_norm": 0.4773191511631012, "learning_rate": 4.019742750291936e-06, "loss": 0.5013, "step": 12728 }, { "epoch": 5.758425695543995, "grad_norm": 0.49966129660606384, "learning_rate": 4.0190242307834165e-06, "loss": 0.5066, "step": 12729 }, { "epoch": 5.758878081881927, "grad_norm": 0.49855759739875793, "learning_rate": 4.018305732343222e-06, "loss": 0.5731, "step": 12730 }, { "epoch": 5.75933046821986, "grad_norm": 0.5186036229133606, "learning_rate": 4.017587254986784e-06, "loss": 0.5029, "step": 12731 }, { "epoch": 5.759782854557792, "grad_norm": 0.5026910901069641, "learning_rate": 4.016868798729533e-06, "loss": 0.5218, "step": 12732 }, { "epoch": 5.7602352408957245, "grad_norm": 0.47546762228012085, "learning_rate": 4.016150363586898e-06, "loss": 0.5001, "step": 12733 }, { "epoch": 5.760687627233658, "grad_norm": 0.5227771401405334, "learning_rate": 4.015431949574311e-06, "loss": 0.5308, "step": 12734 }, { "epoch": 5.76114001357159, "grad_norm": 0.5503045320510864, "learning_rate": 4.014713556707199e-06, "loss": 0.6077, "step": 12735 }, { "epoch": 5.761592399909523, "grad_norm": 0.5359815955162048, "learning_rate": 4.0139951850009925e-06, "loss": 0.4992, "step": 12736 }, { "epoch": 5.762044786247455, "grad_norm": 0.5084894299507141, "learning_rate": 4.0132768344711195e-06, "loss": 0.4612, "step": 12737 }, { "epoch": 5.7624971725853875, "grad_norm": 0.5161052346229553, "learning_rate": 4.012558505133007e-06, "loss": 0.4795, "step": 12738 }, { "epoch": 5.762949558923321, "grad_norm": 0.6572481989860535, "learning_rate": 4.011840197002086e-06, "loss": 0.5923, "step": 12739 }, { "epoch": 5.763401945261253, "grad_norm": 0.5692017078399658, "learning_rate": 4.011121910093777e-06, "loss": 0.4568, "step": 12740 }, { "epoch": 5.763854331599186, "grad_norm": 0.5032594799995422, "learning_rate": 4.010403644423513e-06, "loss": 0.4587, "step": 12741 }, { "epoch": 5.764306717937118, "grad_norm": 0.5494136810302734, "learning_rate": 4.0096854000067164e-06, "loss": 0.4755, "step": 12742 }, { "epoch": 5.764759104275051, "grad_norm": 0.5676823854446411, "learning_rate": 4.008967176858814e-06, "loss": 0.5532, "step": 12743 }, { "epoch": 5.765211490612984, "grad_norm": 0.553425133228302, "learning_rate": 4.008248974995231e-06, "loss": 0.4793, "step": 12744 }, { "epoch": 5.765663876950916, "grad_norm": 0.5600208640098572, "learning_rate": 4.007530794431392e-06, "loss": 0.4844, "step": 12745 }, { "epoch": 5.766116263288849, "grad_norm": 0.5867162346839905, "learning_rate": 4.006812635182722e-06, "loss": 0.5055, "step": 12746 }, { "epoch": 5.766568649626781, "grad_norm": 0.5596165657043457, "learning_rate": 4.006094497264644e-06, "loss": 0.4812, "step": 12747 }, { "epoch": 5.767021035964714, "grad_norm": 0.6868941187858582, "learning_rate": 4.005376380692581e-06, "loss": 0.4996, "step": 12748 }, { "epoch": 5.767473422302647, "grad_norm": 0.596089243888855, "learning_rate": 4.004658285481957e-06, "loss": 0.4438, "step": 12749 }, { "epoch": 5.767925808640579, "grad_norm": 0.6786696910858154, "learning_rate": 4.003940211648195e-06, "loss": 0.4899, "step": 12750 }, { "epoch": 5.768378194978512, "grad_norm": 0.6656180024147034, "learning_rate": 4.0032221592067155e-06, "loss": 0.4829, "step": 12751 }, { "epoch": 5.768830581316444, "grad_norm": 0.8223446607589722, "learning_rate": 4.00250412817294e-06, "loss": 0.4922, "step": 12752 }, { "epoch": 5.769282967654377, "grad_norm": 0.6956403255462646, "learning_rate": 4.001786118562291e-06, "loss": 0.9332, "step": 12753 }, { "epoch": 5.769735353992309, "grad_norm": 0.28784802556037903, "learning_rate": 4.001068130390188e-06, "loss": 1.1731, "step": 12754 }, { "epoch": 5.770187740330242, "grad_norm": 0.30946093797683716, "learning_rate": 4.000350163672052e-06, "loss": 0.5814, "step": 12755 }, { "epoch": 5.770640126668175, "grad_norm": 0.4091433584690094, "learning_rate": 3.999632218423301e-06, "loss": 0.6034, "step": 12756 }, { "epoch": 5.771092513006107, "grad_norm": 0.3296380937099457, "learning_rate": 3.998914294659356e-06, "loss": 0.4542, "step": 12757 }, { "epoch": 5.77154489934404, "grad_norm": 0.3763772249221802, "learning_rate": 3.998196392395635e-06, "loss": 0.4836, "step": 12758 }, { "epoch": 5.771997285681972, "grad_norm": 0.43541231751441956, "learning_rate": 3.997478511647557e-06, "loss": 0.6386, "step": 12759 }, { "epoch": 5.7724496720199046, "grad_norm": 0.39478105306625366, "learning_rate": 3.996760652430539e-06, "loss": 0.513, "step": 12760 }, { "epoch": 5.772902058357838, "grad_norm": 0.41900381445884705, "learning_rate": 3.996042814759999e-06, "loss": 0.6703, "step": 12761 }, { "epoch": 5.77335444469577, "grad_norm": 0.40089771151542664, "learning_rate": 3.995324998651354e-06, "loss": 0.4748, "step": 12762 }, { "epoch": 5.773806831033703, "grad_norm": 0.41946941614151, "learning_rate": 3.9946072041200205e-06, "loss": 0.5732, "step": 12763 }, { "epoch": 5.774259217371635, "grad_norm": 0.4874124228954315, "learning_rate": 3.993889431181413e-06, "loss": 0.6482, "step": 12764 }, { "epoch": 5.774711603709568, "grad_norm": 0.470865935087204, "learning_rate": 3.993171679850948e-06, "loss": 0.6095, "step": 12765 }, { "epoch": 5.775163990047501, "grad_norm": 0.41176921129226685, "learning_rate": 3.992453950144042e-06, "loss": 0.4918, "step": 12766 }, { "epoch": 5.775616376385433, "grad_norm": 0.4417150914669037, "learning_rate": 3.991736242076108e-06, "loss": 0.5244, "step": 12767 }, { "epoch": 5.776068762723366, "grad_norm": 0.4361085593700409, "learning_rate": 3.99101855566256e-06, "loss": 0.5001, "step": 12768 }, { "epoch": 5.776521149061298, "grad_norm": 0.46396347880363464, "learning_rate": 3.990300890918813e-06, "loss": 0.5457, "step": 12769 }, { "epoch": 5.776973535399231, "grad_norm": 0.5242601037025452, "learning_rate": 3.9895832478602795e-06, "loss": 0.6484, "step": 12770 }, { "epoch": 5.777425921737164, "grad_norm": 0.4812123477458954, "learning_rate": 3.9888656265023715e-06, "loss": 0.4988, "step": 12771 }, { "epoch": 5.777878308075096, "grad_norm": 0.4286130368709564, "learning_rate": 3.988148026860502e-06, "loss": 0.4267, "step": 12772 }, { "epoch": 5.778330694413029, "grad_norm": 0.5051121115684509, "learning_rate": 3.987430448950085e-06, "loss": 0.5485, "step": 12773 }, { "epoch": 5.778783080750961, "grad_norm": 0.4886068105697632, "learning_rate": 3.986712892786527e-06, "loss": 0.5186, "step": 12774 }, { "epoch": 5.779235467088894, "grad_norm": 0.5318618416786194, "learning_rate": 3.985995358385243e-06, "loss": 0.5325, "step": 12775 }, { "epoch": 5.779687853426827, "grad_norm": 0.5633957386016846, "learning_rate": 3.985277845761642e-06, "loss": 0.5531, "step": 12776 }, { "epoch": 5.780140239764759, "grad_norm": 0.4498642385005951, "learning_rate": 3.984560354931133e-06, "loss": 0.4322, "step": 12777 }, { "epoch": 5.780592626102692, "grad_norm": 0.5132616758346558, "learning_rate": 3.9838428859091276e-06, "loss": 0.5789, "step": 12778 }, { "epoch": 5.781045012440624, "grad_norm": 0.5548256635665894, "learning_rate": 3.983125438711032e-06, "loss": 0.5177, "step": 12779 }, { "epoch": 5.781497398778557, "grad_norm": 0.5627488493919373, "learning_rate": 3.982408013352258e-06, "loss": 0.5717, "step": 12780 }, { "epoch": 5.781949785116489, "grad_norm": 0.49540841579437256, "learning_rate": 3.981690609848211e-06, "loss": 0.4235, "step": 12781 }, { "epoch": 5.782402171454422, "grad_norm": 0.530044436454773, "learning_rate": 3.9809732282143e-06, "loss": 0.5042, "step": 12782 }, { "epoch": 5.782854557792355, "grad_norm": 0.5870112180709839, "learning_rate": 3.980255868465932e-06, "loss": 0.495, "step": 12783 }, { "epoch": 5.783306944130287, "grad_norm": 0.6142293810844421, "learning_rate": 3.979538530618513e-06, "loss": 0.5524, "step": 12784 }, { "epoch": 5.78375933046822, "grad_norm": 0.5440499782562256, "learning_rate": 3.978821214687451e-06, "loss": 0.5173, "step": 12785 }, { "epoch": 5.784211716806152, "grad_norm": 0.5521963834762573, "learning_rate": 3.97810392068815e-06, "loss": 0.5308, "step": 12786 }, { "epoch": 5.784664103144085, "grad_norm": 0.5388118624687195, "learning_rate": 3.977386648636014e-06, "loss": 0.5089, "step": 12787 }, { "epoch": 5.785116489482018, "grad_norm": 0.502413272857666, "learning_rate": 3.976669398546451e-06, "loss": 0.4078, "step": 12788 }, { "epoch": 5.78556887581995, "grad_norm": 0.5826531052589417, "learning_rate": 3.975952170434864e-06, "loss": 0.5056, "step": 12789 }, { "epoch": 5.786021262157883, "grad_norm": 0.5984066128730774, "learning_rate": 3.9752349643166555e-06, "loss": 0.5355, "step": 12790 }, { "epoch": 5.786473648495815, "grad_norm": 0.5319923758506775, "learning_rate": 3.9745177802072295e-06, "loss": 0.4624, "step": 12791 }, { "epoch": 5.786926034833748, "grad_norm": 0.5595119595527649, "learning_rate": 3.973800618121991e-06, "loss": 0.4743, "step": 12792 }, { "epoch": 5.787378421171681, "grad_norm": 0.6450970768928528, "learning_rate": 3.973083478076339e-06, "loss": 0.5483, "step": 12793 }, { "epoch": 5.787830807509613, "grad_norm": 0.5355997085571289, "learning_rate": 3.9723663600856785e-06, "loss": 0.483, "step": 12794 }, { "epoch": 5.788283193847546, "grad_norm": 0.6080219745635986, "learning_rate": 3.97164926416541e-06, "loss": 0.4554, "step": 12795 }, { "epoch": 5.788735580185478, "grad_norm": 0.6515663266181946, "learning_rate": 3.970932190330935e-06, "loss": 0.5095, "step": 12796 }, { "epoch": 5.789187966523411, "grad_norm": 0.6216298341751099, "learning_rate": 3.970215138597651e-06, "loss": 0.5452, "step": 12797 }, { "epoch": 5.789640352861344, "grad_norm": 0.6502141356468201, "learning_rate": 3.969498108980962e-06, "loss": 0.5028, "step": 12798 }, { "epoch": 5.7900927391992765, "grad_norm": 0.6272109746932983, "learning_rate": 3.9687811014962656e-06, "loss": 0.5023, "step": 12799 }, { "epoch": 5.790545125537209, "grad_norm": 0.6606152653694153, "learning_rate": 3.96806411615896e-06, "loss": 0.5626, "step": 12800 }, { "epoch": 5.790545125537209, "eval_loss": 0.5883487462997437, "eval_runtime": 25.9018, "eval_samples_per_second": 28.724, "eval_steps_per_second": 7.181, "step": 12800 } ], "logging_steps": 1, "max_steps": 22100, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.300286264224645e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }