{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.0, "eval_steps": 500, "global_step": 330, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.045454545454545456, "grad_norm": 257.23835842552523, "learning_rate": 2.9411764705882355e-06, "loss": 8.3183, "step": 1 }, { "epoch": 0.09090909090909091, "grad_norm": 212.69538455789322, "learning_rate": 5.882352941176471e-06, "loss": 8.7163, "step": 2 }, { "epoch": 0.13636363636363635, "grad_norm": 112.61346458118909, "learning_rate": 8.823529411764707e-06, "loss": 7.6494, "step": 3 }, { "epoch": 0.18181818181818182, "grad_norm": 784.8951183910573, "learning_rate": 1.1764705882352942e-05, "loss": 8.0969, "step": 4 }, { "epoch": 0.22727272727272727, "grad_norm": 185.45732108671288, "learning_rate": 1.4705882352941177e-05, "loss": 7.311, "step": 5 }, { "epoch": 0.2727272727272727, "grad_norm": 195.28031105021546, "learning_rate": 1.7647058823529414e-05, "loss": 7.8658, "step": 6 }, { "epoch": 0.3181818181818182, "grad_norm": 36.48221664801266, "learning_rate": 2.058823529411765e-05, "loss": 5.2603, "step": 7 }, { "epoch": 0.36363636363636365, "grad_norm": 9.55998838923863, "learning_rate": 2.3529411764705884e-05, "loss": 4.8036, "step": 8 }, { "epoch": 0.4090909090909091, "grad_norm": 9.615632721341097, "learning_rate": 2.647058823529412e-05, "loss": 4.8279, "step": 9 }, { "epoch": 0.45454545454545453, "grad_norm": 7.669264353119744, "learning_rate": 2.9411764705882354e-05, "loss": 4.6123, "step": 10 }, { "epoch": 0.5, "grad_norm": 6.9967605072507215, "learning_rate": 3.235294117647059e-05, "loss": 4.3857, "step": 11 }, { "epoch": 0.5454545454545454, "grad_norm": 6.587199773697142, "learning_rate": 3.529411764705883e-05, "loss": 4.4648, "step": 12 }, { "epoch": 0.5909090909090909, "grad_norm": 5.505554834249296, "learning_rate": 3.8235294117647055e-05, "loss": 4.1377, "step": 13 }, { "epoch": 0.6363636363636364, "grad_norm": 4.9970053869195965, "learning_rate": 4.11764705882353e-05, "loss": 3.8989, "step": 14 }, { "epoch": 0.6818181818181818, "grad_norm": 3.79904253928739, "learning_rate": 4.411764705882353e-05, "loss": 3.8294, "step": 15 }, { "epoch": 0.7272727272727273, "grad_norm": 3.508407692792644, "learning_rate": 4.705882352941177e-05, "loss": 4.0649, "step": 16 }, { "epoch": 0.7727272727272727, "grad_norm": 3.2871677206858054, "learning_rate": 5e-05, "loss": 3.7045, "step": 17 }, { "epoch": 0.8181818181818182, "grad_norm": 3.3241161598281, "learning_rate": 4.999886666070519e-05, "loss": 3.7635, "step": 18 }, { "epoch": 0.8636363636363636, "grad_norm": 3.1152646953619567, "learning_rate": 4.9995466756994795e-05, "loss": 3.6178, "step": 19 }, { "epoch": 0.9090909090909091, "grad_norm": 3.273528675012558, "learning_rate": 4.9989800631379443e-05, "loss": 3.5688, "step": 20 }, { "epoch": 0.9545454545454546, "grad_norm": 3.115834430389958, "learning_rate": 4.998186885467182e-05, "loss": 3.2943, "step": 21 }, { "epoch": 1.0, "grad_norm": 2.661570245411782, "learning_rate": 4.99716722259292e-05, "loss": 3.0865, "step": 22 }, { "epoch": 1.0454545454545454, "grad_norm": 2.849664680666923, "learning_rate": 4.99592117723729e-05, "loss": 3.1392, "step": 23 }, { "epoch": 1.0909090909090908, "grad_norm": 2.4018072414305895, "learning_rate": 4.994448874928487e-05, "loss": 2.9298, "step": 24 }, { "epoch": 1.1363636363636362, "grad_norm": 2.0206665673120896, "learning_rate": 4.992750463988114e-05, "loss": 2.8627, "step": 25 }, { "epoch": 1.1818181818181819, "grad_norm": 1.9920698277851823, "learning_rate": 4.990826115516248e-05, "loss": 2.8083, "step": 26 }, { "epoch": 1.2272727272727273, "grad_norm": 1.7918724410197877, "learning_rate": 4.9886760233742e-05, "loss": 2.6199, "step": 27 }, { "epoch": 1.2727272727272727, "grad_norm": 1.5505702505545602, "learning_rate": 4.986300404164984e-05, "loss": 2.6521, "step": 28 }, { "epoch": 1.3181818181818181, "grad_norm": 1.2616722287294364, "learning_rate": 4.9836994972114974e-05, "loss": 2.6015, "step": 29 }, { "epoch": 1.3636363636363638, "grad_norm": 1.1638338779341943, "learning_rate": 4.9808735645324125e-05, "loss": 2.5141, "step": 30 }, { "epoch": 1.4090909090909092, "grad_norm": 1.0794908268185501, "learning_rate": 4.9778228908157766e-05, "loss": 2.4714, "step": 31 }, { "epoch": 1.4545454545454546, "grad_norm": 0.9412858812798977, "learning_rate": 4.9745477833903364e-05, "loss": 2.4086, "step": 32 }, { "epoch": 1.5, "grad_norm": 0.9703710030327403, "learning_rate": 4.971048572194577e-05, "loss": 2.3233, "step": 33 }, { "epoch": 1.5454545454545454, "grad_norm": 0.8923121257838885, "learning_rate": 4.9673256097434793e-05, "loss": 2.3252, "step": 34 }, { "epoch": 1.5909090909090908, "grad_norm": 0.7408466397481557, "learning_rate": 4.963379271093012e-05, "loss": 2.3592, "step": 35 }, { "epoch": 1.6363636363636362, "grad_norm": 0.8332937369940602, "learning_rate": 4.959209953802344e-05, "loss": 2.3153, "step": 36 }, { "epoch": 1.6818181818181817, "grad_norm": 0.8198715006056391, "learning_rate": 4.954818077893798e-05, "loss": 2.14, "step": 37 }, { "epoch": 1.7272727272727273, "grad_norm": 0.7030061091284978, "learning_rate": 4.950204085810533e-05, "loss": 2.1745, "step": 38 }, { "epoch": 1.7727272727272727, "grad_norm": 0.7045320084558211, "learning_rate": 4.945368442371974e-05, "loss": 2.0868, "step": 39 }, { "epoch": 1.8181818181818183, "grad_norm": 0.6475997413192687, "learning_rate": 4.9403116347269866e-05, "loss": 2.0927, "step": 40 }, { "epoch": 1.8636363636363638, "grad_norm": 0.6818237942981833, "learning_rate": 4.935034172304797e-05, "loss": 2.1228, "step": 41 }, { "epoch": 1.9090909090909092, "grad_norm": 0.6349632576540991, "learning_rate": 4.9295365867636766e-05, "loss": 2.1594, "step": 42 }, { "epoch": 1.9545454545454546, "grad_norm": 0.6123700654773722, "learning_rate": 4.923819431937377e-05, "loss": 1.9419, "step": 43 }, { "epoch": 2.0, "grad_norm": 0.628265874333403, "learning_rate": 4.9178832837793415e-05, "loss": 1.9591, "step": 44 }, { "epoch": 2.0454545454545454, "grad_norm": 0.6079957574674165, "learning_rate": 4.9117287403046766e-05, "loss": 1.9066, "step": 45 }, { "epoch": 2.090909090909091, "grad_norm": 0.5913194294689115, "learning_rate": 4.9053564215299135e-05, "loss": 1.9269, "step": 46 }, { "epoch": 2.1363636363636362, "grad_norm": 0.524599692265706, "learning_rate": 4.898766969410542e-05, "loss": 1.8848, "step": 47 }, { "epoch": 2.1818181818181817, "grad_norm": 0.5449852882597904, "learning_rate": 4.891961047776342e-05, "loss": 1.8835, "step": 48 }, { "epoch": 2.227272727272727, "grad_norm": 0.5842912439698547, "learning_rate": 4.8849393422645054e-05, "loss": 1.8353, "step": 49 }, { "epoch": 2.2727272727272725, "grad_norm": 0.5004654366250076, "learning_rate": 4.87770256025057e-05, "loss": 1.8698, "step": 50 }, { "epoch": 2.3181818181818183, "grad_norm": 0.5000460896838382, "learning_rate": 4.870251430777148e-05, "loss": 1.8355, "step": 51 }, { "epoch": 2.3636363636363638, "grad_norm": 0.47646909638568696, "learning_rate": 4.862586704480494e-05, "loss": 1.8062, "step": 52 }, { "epoch": 2.409090909090909, "grad_norm": 0.4775044778243631, "learning_rate": 4.8547091535148725e-05, "loss": 1.7511, "step": 53 }, { "epoch": 2.4545454545454546, "grad_norm": 0.4271462696410255, "learning_rate": 4.846619571474777e-05, "loss": 1.819, "step": 54 }, { "epoch": 2.5, "grad_norm": 0.4687848016567625, "learning_rate": 4.8383187733149814e-05, "loss": 1.7687, "step": 55 }, { "epoch": 2.5454545454545454, "grad_norm": 0.49290753360843537, "learning_rate": 4.8298075952684406e-05, "loss": 1.7602, "step": 56 }, { "epoch": 2.590909090909091, "grad_norm": 0.4145717434905905, "learning_rate": 4.821086894762045e-05, "loss": 1.6849, "step": 57 }, { "epoch": 2.6363636363636362, "grad_norm": 0.47758526019077047, "learning_rate": 4.812157550330246e-05, "loss": 1.8031, "step": 58 }, { "epoch": 2.6818181818181817, "grad_norm": 0.43090399608171975, "learning_rate": 4.8030204615265445e-05, "loss": 1.6979, "step": 59 }, { "epoch": 2.7272727272727275, "grad_norm": 0.42558498758567664, "learning_rate": 4.7936765488328794e-05, "loss": 1.6167, "step": 60 }, { "epoch": 2.7727272727272725, "grad_norm": 0.42771373575336713, "learning_rate": 4.7841267535668876e-05, "loss": 1.6126, "step": 61 }, { "epoch": 2.8181818181818183, "grad_norm": 0.45605982497696307, "learning_rate": 4.7743720377870786e-05, "loss": 1.7051, "step": 62 }, { "epoch": 2.8636363636363638, "grad_norm": 0.36383290803582924, "learning_rate": 4.764413384195915e-05, "loss": 1.5355, "step": 63 }, { "epoch": 2.909090909090909, "grad_norm": 0.35888633901842737, "learning_rate": 4.7542517960408125e-05, "loss": 1.6037, "step": 64 }, { "epoch": 2.9545454545454546, "grad_norm": 0.35247547310658606, "learning_rate": 4.7438882970130756e-05, "loss": 1.6403, "step": 65 }, { "epoch": 3.0, "grad_norm": 0.4123547479714445, "learning_rate": 4.7333239311447634e-05, "loss": 1.6687, "step": 66 }, { "epoch": 3.0454545454545454, "grad_norm": 0.3575991218498864, "learning_rate": 4.7225597627035176e-05, "loss": 1.6531, "step": 67 }, { "epoch": 3.090909090909091, "grad_norm": 0.3303944518163955, "learning_rate": 4.711596876085344e-05, "loss": 1.4537, "step": 68 }, { "epoch": 3.1363636363636362, "grad_norm": 0.43041449387205766, "learning_rate": 4.70043637570537e-05, "loss": 1.6096, "step": 69 }, { "epoch": 3.1818181818181817, "grad_norm": 0.3646696726581941, "learning_rate": 4.6890793858865865e-05, "loss": 1.586, "step": 70 }, { "epoch": 3.227272727272727, "grad_norm": 0.3260419946678917, "learning_rate": 4.677527050746577e-05, "loss": 1.5422, "step": 71 }, { "epoch": 3.2727272727272725, "grad_norm": 0.33570338878732947, "learning_rate": 4.665780534082264e-05, "loss": 1.5553, "step": 72 }, { "epoch": 3.3181818181818183, "grad_norm": 0.3085499618355785, "learning_rate": 4.6538410192526613e-05, "loss": 1.5067, "step": 73 }, { "epoch": 3.3636363636363638, "grad_norm": 0.30727638027509513, "learning_rate": 4.6417097090596637e-05, "loss": 1.5667, "step": 74 }, { "epoch": 3.409090909090909, "grad_norm": 0.3021730689826977, "learning_rate": 4.629387825626875e-05, "loss": 1.5795, "step": 75 }, { "epoch": 3.4545454545454546, "grad_norm": 0.32502917714553514, "learning_rate": 4.6168766102764874e-05, "loss": 1.5154, "step": 76 }, { "epoch": 3.5, "grad_norm": 0.3301265717537239, "learning_rate": 4.604177323404235e-05, "loss": 1.5048, "step": 77 }, { "epoch": 3.5454545454545454, "grad_norm": 0.323165145487666, "learning_rate": 4.591291244352413e-05, "loss": 1.5117, "step": 78 }, { "epoch": 3.590909090909091, "grad_norm": 0.3082703852698703, "learning_rate": 4.578219671280998e-05, "loss": 1.4521, "step": 79 }, { "epoch": 3.6363636363636362, "grad_norm": 0.29479913403691677, "learning_rate": 4.5649639210368714e-05, "loss": 1.4487, "step": 80 }, { "epoch": 3.6818181818181817, "grad_norm": 0.3092257716704811, "learning_rate": 4.551525329021155e-05, "loss": 1.447, "step": 81 }, { "epoch": 3.7272727272727275, "grad_norm": 0.37229811196636947, "learning_rate": 4.5379052490546855e-05, "loss": 1.4919, "step": 82 }, { "epoch": 3.7727272727272725, "grad_norm": 0.31138376308603494, "learning_rate": 4.524105053241625e-05, "loss": 1.4865, "step": 83 }, { "epoch": 3.8181818181818183, "grad_norm": 0.3584821617778458, "learning_rate": 4.510126131831234e-05, "loss": 1.5092, "step": 84 }, { "epoch": 3.8636363636363638, "grad_norm": 0.32342053983138597, "learning_rate": 4.4959698930778184e-05, "loss": 1.3528, "step": 85 }, { "epoch": 3.909090909090909, "grad_norm": 0.3755258228072684, "learning_rate": 4.481637763098858e-05, "loss": 1.5071, "step": 86 }, { "epoch": 3.9545454545454546, "grad_norm": 0.31577787863565876, "learning_rate": 4.4671311857313376e-05, "loss": 1.5149, "step": 87 }, { "epoch": 4.0, "grad_norm": 0.3274660630382459, "learning_rate": 4.452451622386294e-05, "loss": 1.5101, "step": 88 }, { "epoch": 4.045454545454546, "grad_norm": 0.295499868623689, "learning_rate": 4.437600551901591e-05, "loss": 1.4591, "step": 89 }, { "epoch": 4.090909090909091, "grad_norm": 0.28617863379643455, "learning_rate": 4.422579470392941e-05, "loss": 1.4866, "step": 90 }, { "epoch": 4.136363636363637, "grad_norm": 0.27613544331064876, "learning_rate": 4.40738989110318e-05, "loss": 1.4244, "step": 91 }, { "epoch": 4.181818181818182, "grad_norm": 0.2969006794655833, "learning_rate": 4.392033344249827e-05, "loss": 1.3955, "step": 92 }, { "epoch": 4.2272727272727275, "grad_norm": 0.2833654953485509, "learning_rate": 4.376511376870925e-05, "loss": 1.4366, "step": 93 }, { "epoch": 4.2727272727272725, "grad_norm": 0.32286555195443095, "learning_rate": 4.36082555266919e-05, "loss": 1.3779, "step": 94 }, { "epoch": 4.318181818181818, "grad_norm": 0.30339867592392933, "learning_rate": 4.3449774518544837e-05, "loss": 1.3523, "step": 95 }, { "epoch": 4.363636363636363, "grad_norm": 0.3678773849209462, "learning_rate": 4.328968670984621e-05, "loss": 1.4055, "step": 96 }, { "epoch": 4.409090909090909, "grad_norm": 0.2809671108473524, "learning_rate": 4.3128008228045264e-05, "loss": 1.3742, "step": 97 }, { "epoch": 4.454545454545454, "grad_norm": 0.3264760569541278, "learning_rate": 4.296475536083769e-05, "loss": 1.3946, "step": 98 }, { "epoch": 4.5, "grad_norm": 0.258862105020995, "learning_rate": 4.279994455452478e-05, "loss": 1.3284, "step": 99 }, { "epoch": 4.545454545454545, "grad_norm": 0.3134553840821824, "learning_rate": 4.263359241235657e-05, "loss": 1.3851, "step": 100 }, { "epoch": 4.590909090909091, "grad_norm": 0.30170722376996695, "learning_rate": 4.246571569285925e-05, "loss": 1.371, "step": 101 }, { "epoch": 4.636363636363637, "grad_norm": 0.2887395932931785, "learning_rate": 4.229633130814685e-05, "loss": 1.3292, "step": 102 }, { "epoch": 4.681818181818182, "grad_norm": 0.3218428316599217, "learning_rate": 4.212545632221751e-05, "loss": 1.4558, "step": 103 }, { "epoch": 4.7272727272727275, "grad_norm": 0.27631578638146514, "learning_rate": 4.1953107949234414e-05, "loss": 1.2989, "step": 104 }, { "epoch": 4.7727272727272725, "grad_norm": 0.2688375043457644, "learning_rate": 4.1779303551791695e-05, "loss": 1.3677, "step": 105 }, { "epoch": 4.818181818181818, "grad_norm": 0.28592384506961227, "learning_rate": 4.160406063916517e-05, "loss": 1.2749, "step": 106 }, { "epoch": 4.863636363636363, "grad_norm": 0.30087535430968293, "learning_rate": 4.142739686554853e-05, "loss": 1.405, "step": 107 }, { "epoch": 4.909090909090909, "grad_norm": 0.27354449180224716, "learning_rate": 4.124933002827481e-05, "loss": 1.3642, "step": 108 }, { "epoch": 4.954545454545455, "grad_norm": 0.29727660205166156, "learning_rate": 4.106987806602345e-05, "loss": 1.3686, "step": 109 }, { "epoch": 5.0, "grad_norm": 0.29032585014056134, "learning_rate": 4.088905905701316e-05, "loss": 1.3651, "step": 110 }, { "epoch": 5.045454545454546, "grad_norm": 0.2900324279431226, "learning_rate": 4.070689121718066e-05, "loss": 1.3556, "step": 111 }, { "epoch": 5.090909090909091, "grad_norm": 0.24863713928813203, "learning_rate": 4.0523392898345604e-05, "loss": 1.341, "step": 112 }, { "epoch": 5.136363636363637, "grad_norm": 0.2790657225717466, "learning_rate": 4.03385825863618e-05, "loss": 1.3565, "step": 113 }, { "epoch": 5.181818181818182, "grad_norm": 0.2665012607059084, "learning_rate": 4.0152478899254906e-05, "loss": 1.2776, "step": 114 }, { "epoch": 5.2272727272727275, "grad_norm": 0.29328265354243366, "learning_rate": 3.996510058534682e-05, "loss": 1.3697, "step": 115 }, { "epoch": 5.2727272727272725, "grad_norm": 0.2636658932396027, "learning_rate": 3.9776466521366995e-05, "loss": 1.3208, "step": 116 }, { "epoch": 5.318181818181818, "grad_norm": 0.23509864156431806, "learning_rate": 3.958659571055071e-05, "loss": 1.2671, "step": 117 }, { "epoch": 5.363636363636363, "grad_norm": 0.2621476324195016, "learning_rate": 3.939550728072473e-05, "loss": 1.3526, "step": 118 }, { "epoch": 5.409090909090909, "grad_norm": 0.24664500701059844, "learning_rate": 3.920322048238024e-05, "loss": 1.2985, "step": 119 }, { "epoch": 5.454545454545454, "grad_norm": 0.28092507999952776, "learning_rate": 3.900975468673368e-05, "loss": 1.2591, "step": 120 }, { "epoch": 5.5, "grad_norm": 0.2604137388452376, "learning_rate": 3.8815129383775104e-05, "loss": 1.3022, "step": 121 }, { "epoch": 5.545454545454545, "grad_norm": 0.25485436564236713, "learning_rate": 3.861936418030483e-05, "loss": 1.3511, "step": 122 }, { "epoch": 5.590909090909091, "grad_norm": 0.22709249270142517, "learning_rate": 3.842247879795822e-05, "loss": 1.3479, "step": 123 }, { "epoch": 5.636363636363637, "grad_norm": 0.23971939043085813, "learning_rate": 3.822449307121886e-05, "loss": 1.2734, "step": 124 }, { "epoch": 5.681818181818182, "grad_norm": 0.27819081228375214, "learning_rate": 3.8025426945420426e-05, "loss": 1.2971, "step": 125 }, { "epoch": 5.7272727272727275, "grad_norm": 0.25456906150670894, "learning_rate": 3.782530047473739e-05, "loss": 1.2765, "step": 126 }, { "epoch": 5.7727272727272725, "grad_norm": 0.2235373692230836, "learning_rate": 3.762413382016467e-05, "loss": 1.2652, "step": 127 }, { "epoch": 5.818181818181818, "grad_norm": 0.29051536045091336, "learning_rate": 3.742194724748668e-05, "loss": 1.2158, "step": 128 }, { "epoch": 5.863636363636363, "grad_norm": 0.25505139923240183, "learning_rate": 3.721876112523566e-05, "loss": 1.2749, "step": 129 }, { "epoch": 5.909090909090909, "grad_norm": 0.2297174194118098, "learning_rate": 3.701459592263974e-05, "loss": 1.2686, "step": 130 }, { "epoch": 5.954545454545455, "grad_norm": 1.1819543960335286, "learning_rate": 3.680947220756086e-05, "loss": 1.2669, "step": 131 }, { "epoch": 6.0, "grad_norm": 0.3596649944995116, "learning_rate": 3.6603410644422703e-05, "loss": 1.2553, "step": 132 }, { "epoch": 6.045454545454546, "grad_norm": 0.2373357658133507, "learning_rate": 3.639643199212899e-05, "loss": 1.2475, "step": 133 }, { "epoch": 6.090909090909091, "grad_norm": 0.22493113890489372, "learning_rate": 3.618855710197212e-05, "loss": 1.2343, "step": 134 }, { "epoch": 6.136363636363637, "grad_norm": 0.2186099012077051, "learning_rate": 3.59798069155327e-05, "loss": 1.2583, "step": 135 }, { "epoch": 6.181818181818182, "grad_norm": 0.2198359168498248, "learning_rate": 3.577020246256974e-05, "loss": 1.2124, "step": 136 }, { "epoch": 6.2272727272727275, "grad_norm": 0.2418275850989451, "learning_rate": 3.555976485890216e-05, "loss": 1.2652, "step": 137 }, { "epoch": 6.2727272727272725, "grad_norm": 0.24177652899109378, "learning_rate": 3.5348515304281567e-05, "loss": 1.2718, "step": 138 }, { "epoch": 6.318181818181818, "grad_norm": 0.23176107126497403, "learning_rate": 3.5136475080256504e-05, "loss": 1.2815, "step": 139 }, { "epoch": 6.363636363636363, "grad_norm": 0.2186114898802362, "learning_rate": 3.492366554802856e-05, "loss": 1.2278, "step": 140 }, { "epoch": 6.409090909090909, "grad_norm": 0.24894955084370854, "learning_rate": 3.471010814630044e-05, "loss": 1.2528, "step": 141 }, { "epoch": 6.454545454545454, "grad_norm": 0.26036666060205077, "learning_rate": 3.449582438911613e-05, "loss": 1.3011, "step": 142 }, { "epoch": 6.5, "grad_norm": 0.21464875894128543, "learning_rate": 3.428083586369362e-05, "loss": 1.2153, "step": 143 }, { "epoch": 6.545454545454545, "grad_norm": 0.24924104575584335, "learning_rate": 3.406516422825013e-05, "loss": 1.2149, "step": 144 }, { "epoch": 6.590909090909091, "grad_norm": 0.22796512903628247, "learning_rate": 3.384883120982027e-05, "loss": 1.2057, "step": 145 }, { "epoch": 6.636363636363637, "grad_norm": 0.22412816101017885, "learning_rate": 3.363185860206719e-05, "loss": 1.2879, "step": 146 }, { "epoch": 6.681818181818182, "grad_norm": 0.22898070560796288, "learning_rate": 3.341426826308708e-05, "loss": 1.2407, "step": 147 }, { "epoch": 6.7272727272727275, "grad_norm": 0.24599569608907884, "learning_rate": 3.319608211320719e-05, "loss": 1.193, "step": 148 }, { "epoch": 6.7727272727272725, "grad_norm": 0.22837138512494187, "learning_rate": 3.29773221327775e-05, "loss": 1.2173, "step": 149 }, { "epoch": 6.818181818181818, "grad_norm": 0.23242892820855598, "learning_rate": 3.2758010359956376e-05, "loss": 1.3222, "step": 150 }, { "epoch": 6.863636363636363, "grad_norm": 0.2543864237654759, "learning_rate": 3.253816888849051e-05, "loss": 1.24, "step": 151 }, { "epoch": 6.909090909090909, "grad_norm": 0.24092106206033628, "learning_rate": 3.2317819865489066e-05, "loss": 1.2964, "step": 152 }, { "epoch": 6.954545454545455, "grad_norm": 0.24386711464365543, "learning_rate": 3.209698548919262e-05, "loss": 1.2041, "step": 153 }, { "epoch": 7.0, "grad_norm": 0.2507114565585548, "learning_rate": 3.187568800673682e-05, "loss": 1.2057, "step": 154 }, { "epoch": 7.045454545454546, "grad_norm": 0.21813780408934713, "learning_rate": 3.165394971191125e-05, "loss": 1.1822, "step": 155 }, { "epoch": 7.090909090909091, "grad_norm": 0.22737082010889867, "learning_rate": 3.143179294291351e-05, "loss": 1.2516, "step": 156 }, { "epoch": 7.136363636363637, "grad_norm": 0.2168145915465791, "learning_rate": 3.120924008009875e-05, "loss": 1.1801, "step": 157 }, { "epoch": 7.181818181818182, "grad_norm": 0.19136333990689988, "learning_rate": 3.0986313543725174e-05, "loss": 1.1683, "step": 158 }, { "epoch": 7.2272727272727275, "grad_norm": 0.2261635672146715, "learning_rate": 3.0763035791695335e-05, "loss": 1.2801, "step": 159 }, { "epoch": 7.2727272727272725, "grad_norm": 0.1900863256471995, "learning_rate": 3.053942931729365e-05, "loss": 1.2395, "step": 160 }, { "epoch": 7.318181818181818, "grad_norm": 0.20083348886939187, "learning_rate": 3.0315516646920494e-05, "loss": 1.1789, "step": 161 }, { "epoch": 7.363636363636363, "grad_norm": 0.22527651747823768, "learning_rate": 3.0091320337822793e-05, "loss": 1.1912, "step": 162 }, { "epoch": 7.409090909090909, "grad_norm": 0.22935986524053414, "learning_rate": 2.9866862975821596e-05, "loss": 1.2043, "step": 163 }, { "epoch": 7.454545454545454, "grad_norm": 0.25053211749891174, "learning_rate": 2.9642167173036768e-05, "loss": 1.2245, "step": 164 }, { "epoch": 7.5, "grad_norm": 0.24058101769072435, "learning_rate": 2.9417255565608982e-05, "loss": 1.1887, "step": 165 }, { "epoch": 7.545454545454545, "grad_norm": 0.2236093293557457, "learning_rate": 2.9192150811419343e-05, "loss": 1.1546, "step": 166 }, { "epoch": 7.590909090909091, "grad_norm": 0.2536710657188106, "learning_rate": 2.8966875587806842e-05, "loss": 1.2302, "step": 167 }, { "epoch": 7.636363636363637, "grad_norm": 0.23635026185440464, "learning_rate": 2.8741452589283747e-05, "loss": 1.2491, "step": 168 }, { "epoch": 7.681818181818182, "grad_norm": 0.20829672573008348, "learning_rate": 2.8515904525249342e-05, "loss": 1.1821, "step": 169 }, { "epoch": 7.7272727272727275, "grad_norm": 0.19887830172000012, "learning_rate": 2.8290254117702204e-05, "loss": 1.2327, "step": 170 }, { "epoch": 7.7727272727272725, "grad_norm": 0.22301542864724377, "learning_rate": 2.8064524098951122e-05, "loss": 1.1883, "step": 171 }, { "epoch": 7.818181818181818, "grad_norm": 0.20156843827561532, "learning_rate": 2.7838737209324995e-05, "loss": 1.2065, "step": 172 }, { "epoch": 7.863636363636363, "grad_norm": 0.2196470118065061, "learning_rate": 2.761291619488198e-05, "loss": 1.2109, "step": 173 }, { "epoch": 7.909090909090909, "grad_norm": 0.23170457369155842, "learning_rate": 2.738708380511803e-05, "loss": 1.211, "step": 174 }, { "epoch": 7.954545454545455, "grad_norm": 0.20233640061480676, "learning_rate": 2.7161262790675013e-05, "loss": 1.1566, "step": 175 }, { "epoch": 8.0, "grad_norm": 0.22428142795845282, "learning_rate": 2.6935475901048884e-05, "loss": 1.2359, "step": 176 }, { "epoch": 8.045454545454545, "grad_norm": 0.21470110790573302, "learning_rate": 2.6709745882297805e-05, "loss": 1.2061, "step": 177 }, { "epoch": 8.090909090909092, "grad_norm": 0.19724648690029864, "learning_rate": 2.6484095474750663e-05, "loss": 1.2481, "step": 178 }, { "epoch": 8.136363636363637, "grad_norm": 0.20720915390036856, "learning_rate": 2.6258547410716272e-05, "loss": 1.1459, "step": 179 }, { "epoch": 8.181818181818182, "grad_norm": 0.2279215538554465, "learning_rate": 2.6033124412193167e-05, "loss": 1.1456, "step": 180 }, { "epoch": 8.227272727272727, "grad_norm": 0.1919565575948159, "learning_rate": 2.580784918858066e-05, "loss": 1.2178, "step": 181 }, { "epoch": 8.272727272727273, "grad_norm": 0.20123788080080415, "learning_rate": 2.558274443439103e-05, "loss": 1.1572, "step": 182 }, { "epoch": 8.318181818181818, "grad_norm": 0.18813695162235894, "learning_rate": 2.535783282696324e-05, "loss": 1.1923, "step": 183 }, { "epoch": 8.363636363636363, "grad_norm": 0.20823032710382397, "learning_rate": 2.5133137024178406e-05, "loss": 1.1843, "step": 184 }, { "epoch": 8.409090909090908, "grad_norm": 0.2099319254835431, "learning_rate": 2.4908679662177216e-05, "loss": 1.1993, "step": 185 }, { "epoch": 8.454545454545455, "grad_norm": 0.19824053523947052, "learning_rate": 2.468448335307951e-05, "loss": 1.1826, "step": 186 }, { "epoch": 8.5, "grad_norm": 0.19231852836204918, "learning_rate": 2.4460570682706362e-05, "loss": 1.1279, "step": 187 }, { "epoch": 8.545454545454545, "grad_norm": 0.20741037413832103, "learning_rate": 2.4236964208304673e-05, "loss": 1.0622, "step": 188 }, { "epoch": 8.590909090909092, "grad_norm": 0.3975271395657707, "learning_rate": 2.4013686456274824e-05, "loss": 1.1743, "step": 189 }, { "epoch": 8.636363636363637, "grad_norm": 0.1911090472778124, "learning_rate": 2.379075991990126e-05, "loss": 1.1241, "step": 190 }, { "epoch": 8.681818181818182, "grad_norm": 0.19428196767319525, "learning_rate": 2.35682070570865e-05, "loss": 1.2169, "step": 191 }, { "epoch": 8.727272727272727, "grad_norm": 0.1929543238663036, "learning_rate": 2.3346050288088743e-05, "loss": 1.1213, "step": 192 }, { "epoch": 8.772727272727273, "grad_norm": 0.1859295707214649, "learning_rate": 2.3124311993263192e-05, "loss": 1.2022, "step": 193 }, { "epoch": 8.818181818181818, "grad_norm": 0.19215762964127076, "learning_rate": 2.2903014510807392e-05, "loss": 1.1756, "step": 194 }, { "epoch": 8.863636363636363, "grad_norm": 0.19577206240809536, "learning_rate": 2.2682180134510943e-05, "loss": 1.1492, "step": 195 }, { "epoch": 8.909090909090908, "grad_norm": 0.18354313338272446, "learning_rate": 2.2461831111509496e-05, "loss": 1.1474, "step": 196 }, { "epoch": 8.954545454545455, "grad_norm": 0.1959877870143329, "learning_rate": 2.2241989640043633e-05, "loss": 1.2621, "step": 197 }, { "epoch": 9.0, "grad_norm": 0.1948482879201238, "learning_rate": 2.202267786722252e-05, "loss": 1.2449, "step": 198 }, { "epoch": 9.045454545454545, "grad_norm": 0.21120390875412814, "learning_rate": 2.1803917886792812e-05, "loss": 1.1333, "step": 199 }, { "epoch": 9.090909090909092, "grad_norm": 0.18499690032381164, "learning_rate": 2.1585731736912922e-05, "loss": 1.2015, "step": 200 }, { "epoch": 9.136363636363637, "grad_norm": 0.19403738840593351, "learning_rate": 2.136814139793282e-05, "loss": 1.1961, "step": 201 }, { "epoch": 9.181818181818182, "grad_norm": 0.2116507051866016, "learning_rate": 2.1151168790179738e-05, "loss": 1.215, "step": 202 }, { "epoch": 9.227272727272727, "grad_norm": 0.2091130488246949, "learning_rate": 2.0934835771749872e-05, "loss": 1.1493, "step": 203 }, { "epoch": 9.272727272727273, "grad_norm": 0.21656692423046117, "learning_rate": 2.0719164136306386e-05, "loss": 1.1132, "step": 204 }, { "epoch": 9.318181818181818, "grad_norm": 0.20613565196879666, "learning_rate": 2.0504175610883876e-05, "loss": 1.2056, "step": 205 }, { "epoch": 9.363636363636363, "grad_norm": 0.18361410512148918, "learning_rate": 2.0289891853699573e-05, "loss": 1.2396, "step": 206 }, { "epoch": 9.409090909090908, "grad_norm": 0.1970968023390117, "learning_rate": 2.0076334451971447e-05, "loss": 1.1505, "step": 207 }, { "epoch": 9.454545454545455, "grad_norm": 0.2802489746470572, "learning_rate": 1.9863524919743505e-05, "loss": 1.0803, "step": 208 }, { "epoch": 9.5, "grad_norm": 0.18361667226221806, "learning_rate": 1.9651484695718435e-05, "loss": 1.1293, "step": 209 }, { "epoch": 9.545454545454545, "grad_norm": 0.1836624775996522, "learning_rate": 1.944023514109784e-05, "loss": 1.1646, "step": 210 }, { "epoch": 9.590909090909092, "grad_norm": 0.7985028121765728, "learning_rate": 1.922979753743027e-05, "loss": 1.1631, "step": 211 }, { "epoch": 9.636363636363637, "grad_norm": 0.18982874728183255, "learning_rate": 1.9020193084467303e-05, "loss": 1.0795, "step": 212 }, { "epoch": 9.681818181818182, "grad_norm": 0.2738998625073182, "learning_rate": 1.881144289802788e-05, "loss": 1.0812, "step": 213 }, { "epoch": 9.727272727272727, "grad_norm": 0.19287056890311244, "learning_rate": 1.8603568007871025e-05, "loss": 1.1318, "step": 214 }, { "epoch": 9.772727272727273, "grad_norm": 0.19551935271275803, "learning_rate": 1.83965893555773e-05, "loss": 1.1903, "step": 215 }, { "epoch": 9.818181818181818, "grad_norm": 0.20289464182378333, "learning_rate": 1.8190527792439145e-05, "loss": 1.1716, "step": 216 }, { "epoch": 9.863636363636363, "grad_norm": 0.18427534137801654, "learning_rate": 1.7985404077360258e-05, "loss": 1.181, "step": 217 }, { "epoch": 9.909090909090908, "grad_norm": 0.17269516793495363, "learning_rate": 1.7781238874764337e-05, "loss": 1.1443, "step": 218 }, { "epoch": 9.954545454545455, "grad_norm": 0.20393617981346993, "learning_rate": 1.757805275251333e-05, "loss": 1.1991, "step": 219 }, { "epoch": 10.0, "grad_norm": 0.19442135716601172, "learning_rate": 1.737586617983534e-05, "loss": 1.1121, "step": 220 }, { "epoch": 10.045454545454545, "grad_norm": 0.17057632115064372, "learning_rate": 1.717469952526262e-05, "loss": 1.162, "step": 221 }, { "epoch": 10.090909090909092, "grad_norm": 0.17688084966953885, "learning_rate": 1.6974573054579582e-05, "loss": 1.177, "step": 222 }, { "epoch": 10.136363636363637, "grad_norm": 0.17485623392128088, "learning_rate": 1.6775506928781146e-05, "loss": 1.1594, "step": 223 }, { "epoch": 10.181818181818182, "grad_norm": 0.1915709146213454, "learning_rate": 1.6577521202041775e-05, "loss": 1.1637, "step": 224 }, { "epoch": 10.227272727272727, "grad_norm": 0.18779150088359975, "learning_rate": 1.6380635819695172e-05, "loss": 1.1325, "step": 225 }, { "epoch": 10.272727272727273, "grad_norm": 0.17675767122639172, "learning_rate": 1.6184870616224905e-05, "loss": 1.1283, "step": 226 }, { "epoch": 10.318181818181818, "grad_norm": 0.20187810956460173, "learning_rate": 1.599024531326632e-05, "loss": 1.1362, "step": 227 }, { "epoch": 10.363636363636363, "grad_norm": 0.18429771837794573, "learning_rate": 1.5796779517619757e-05, "loss": 1.0782, "step": 228 }, { "epoch": 10.409090909090908, "grad_norm": 0.18129745466430086, "learning_rate": 1.560449271927528e-05, "loss": 1.1556, "step": 229 }, { "epoch": 10.454545454545455, "grad_norm": 0.18156111868915045, "learning_rate": 1.541340428944929e-05, "loss": 1.1472, "step": 230 }, { "epoch": 10.5, "grad_norm": 0.2058178822209493, "learning_rate": 1.5223533478633012e-05, "loss": 1.1436, "step": 231 }, { "epoch": 10.545454545454545, "grad_norm": 0.19420543695117135, "learning_rate": 1.5034899414653183e-05, "loss": 1.1632, "step": 232 }, { "epoch": 10.590909090909092, "grad_norm": 0.1816721312324971, "learning_rate": 1.4847521100745101e-05, "loss": 1.0919, "step": 233 }, { "epoch": 10.636363636363637, "grad_norm": 0.17503046181743187, "learning_rate": 1.4661417413638206e-05, "loss": 1.177, "step": 234 }, { "epoch": 10.681818181818182, "grad_norm": 0.1996340286183738, "learning_rate": 1.44766071016544e-05, "loss": 1.1901, "step": 235 }, { "epoch": 10.727272727272727, "grad_norm": 0.18843938813925634, "learning_rate": 1.4293108782819345e-05, "loss": 1.1081, "step": 236 }, { "epoch": 10.772727272727273, "grad_norm": 0.18750865793808888, "learning_rate": 1.4110940942986844e-05, "loss": 1.0781, "step": 237 }, { "epoch": 10.818181818181818, "grad_norm": 0.17785144309042253, "learning_rate": 1.3930121933976556e-05, "loss": 1.1961, "step": 238 }, { "epoch": 10.863636363636363, "grad_norm": 0.17015756605905757, "learning_rate": 1.37506699717252e-05, "loss": 1.142, "step": 239 }, { "epoch": 10.909090909090908, "grad_norm": 0.1976824831465818, "learning_rate": 1.3572603134451479e-05, "loss": 1.1024, "step": 240 }, { "epoch": 10.954545454545455, "grad_norm": 0.17152795538031174, "learning_rate": 1.3395939360834845e-05, "loss": 1.136, "step": 241 }, { "epoch": 11.0, "grad_norm": 0.1667419474833486, "learning_rate": 1.3220696448208308e-05, "loss": 1.1114, "step": 242 }, { "epoch": 11.045454545454545, "grad_norm": 0.16490649981297714, "learning_rate": 1.304689205076558e-05, "loss": 1.1569, "step": 243 }, { "epoch": 11.090909090909092, "grad_norm": 0.17142788488665067, "learning_rate": 1.2874543677782508e-05, "loss": 1.1667, "step": 244 }, { "epoch": 11.136363636363637, "grad_norm": 0.1747032605597451, "learning_rate": 1.2703668691853155e-05, "loss": 1.1422, "step": 245 }, { "epoch": 11.181818181818182, "grad_norm": 0.1680960418576758, "learning_rate": 1.253428430714076e-05, "loss": 1.1462, "step": 246 }, { "epoch": 11.227272727272727, "grad_norm": 0.1893549688172363, "learning_rate": 1.2366407587643432e-05, "loss": 1.1496, "step": 247 }, { "epoch": 11.272727272727273, "grad_norm": 0.2930049956267172, "learning_rate": 1.220005544547522e-05, "loss": 1.0953, "step": 248 }, { "epoch": 11.318181818181818, "grad_norm": 0.16093892837183954, "learning_rate": 1.2035244639162319e-05, "loss": 1.1001, "step": 249 }, { "epoch": 11.363636363636363, "grad_norm": 0.16689857435832806, "learning_rate": 1.1871991771954748e-05, "loss": 1.0471, "step": 250 }, { "epoch": 11.409090909090908, "grad_norm": 0.17166817028161555, "learning_rate": 1.1710313290153795e-05, "loss": 1.0986, "step": 251 }, { "epoch": 11.454545454545455, "grad_norm": 0.18190627031196485, "learning_rate": 1.1550225481455165e-05, "loss": 1.1788, "step": 252 }, { "epoch": 11.5, "grad_norm": 0.17512791533622662, "learning_rate": 1.1391744473308106e-05, "loss": 1.1673, "step": 253 }, { "epoch": 11.545454545454545, "grad_norm": 0.16757254457143245, "learning_rate": 1.1234886231290759e-05, "loss": 1.1746, "step": 254 }, { "epoch": 11.590909090909092, "grad_norm": 0.1668004234347932, "learning_rate": 1.1079666557501736e-05, "loss": 1.1107, "step": 255 }, { "epoch": 11.636363636363637, "grad_norm": 0.15359971441629242, "learning_rate": 1.0926101088968207e-05, "loss": 1.1658, "step": 256 }, { "epoch": 11.681818181818182, "grad_norm": 0.16049551854978608, "learning_rate": 1.0774205296070597e-05, "loss": 1.0853, "step": 257 }, { "epoch": 11.727272727272727, "grad_norm": 0.20099872095589202, "learning_rate": 1.062399448098409e-05, "loss": 1.0978, "step": 258 }, { "epoch": 11.772727272727273, "grad_norm": 0.18578304835004, "learning_rate": 1.0475483776137062e-05, "loss": 1.1296, "step": 259 }, { "epoch": 11.818181818181818, "grad_norm": 0.16704809777000973, "learning_rate": 1.0328688142686627e-05, "loss": 1.0695, "step": 260 }, { "epoch": 11.863636363636363, "grad_norm": 0.17323425454468377, "learning_rate": 1.0183622369011422e-05, "loss": 1.077, "step": 261 }, { "epoch": 11.909090909090908, "grad_norm": 0.17800221938786567, "learning_rate": 1.0040301069221823e-05, "loss": 1.122, "step": 262 }, { "epoch": 11.954545454545455, "grad_norm": 0.17418177714949332, "learning_rate": 9.89873868168766e-06, "loss": 1.1375, "step": 263 }, { "epoch": 12.0, "grad_norm": 0.16676746363627873, "learning_rate": 9.758949467583754e-06, "loss": 1.1737, "step": 264 }, { "epoch": 12.045454545454545, "grad_norm": 0.1527614833387276, "learning_rate": 9.620947509453155e-06, "loss": 1.1136, "step": 265 }, { "epoch": 12.090909090909092, "grad_norm": 0.17699034639073358, "learning_rate": 9.484746709788451e-06, "loss": 1.1231, "step": 266 }, { "epoch": 12.136363636363637, "grad_norm": 0.17421157455050928, "learning_rate": 9.350360789631291e-06, "loss": 1.148, "step": 267 }, { "epoch": 12.181818181818182, "grad_norm": 0.15462065956553298, "learning_rate": 9.217803287190029e-06, "loss": 1.1435, "step": 268 }, { "epoch": 12.227272727272727, "grad_norm": 0.16543045040679552, "learning_rate": 9.087087556475873e-06, "loss": 1.1312, "step": 269 }, { "epoch": 12.272727272727273, "grad_norm": 0.17092139127270728, "learning_rate": 8.958226765957655e-06, "loss": 1.1164, "step": 270 }, { "epoch": 12.318181818181818, "grad_norm": 0.16435344514564526, "learning_rate": 8.831233897235128e-06, "loss": 1.075, "step": 271 }, { "epoch": 12.363636363636363, "grad_norm": 0.16450679267899113, "learning_rate": 8.706121743731256e-06, "loss": 1.1508, "step": 272 }, { "epoch": 12.409090909090908, "grad_norm": 0.15282580972945572, "learning_rate": 8.58290290940337e-06, "loss": 1.135, "step": 273 }, { "epoch": 12.454545454545455, "grad_norm": 0.1555918103689812, "learning_rate": 8.461589807473392e-06, "loss": 1.121, "step": 274 }, { "epoch": 12.5, "grad_norm": 0.17353062952008638, "learning_rate": 8.342194659177358e-06, "loss": 1.1849, "step": 275 }, { "epoch": 12.545454545454545, "grad_norm": 0.15700310300217593, "learning_rate": 8.224729492534231e-06, "loss": 1.1479, "step": 276 }, { "epoch": 12.590909090909092, "grad_norm": 0.23929770557768484, "learning_rate": 8.109206141134142e-06, "loss": 1.0834, "step": 277 }, { "epoch": 12.636363636363637, "grad_norm": 0.18493048427935435, "learning_rate": 7.995636242946305e-06, "loss": 1.1398, "step": 278 }, { "epoch": 12.681818181818182, "grad_norm": 0.16961784748611264, "learning_rate": 7.884031239146569e-06, "loss": 1.0651, "step": 279 }, { "epoch": 12.727272727272727, "grad_norm": 0.1597693846586007, "learning_rate": 7.774402372964833e-06, "loss": 1.0952, "step": 280 }, { "epoch": 12.772727272727273, "grad_norm": 0.16022321839887022, "learning_rate": 7.666760688552371e-06, "loss": 1.1269, "step": 281 }, { "epoch": 12.818181818181818, "grad_norm": 0.1661994031922762, "learning_rate": 7.5611170298692466e-06, "loss": 1.0682, "step": 282 }, { "epoch": 12.863636363636363, "grad_norm": 0.16932158388771293, "learning_rate": 7.4574820395918735e-06, "loss": 1.163, "step": 283 }, { "epoch": 12.909090909090908, "grad_norm": 0.15828697644374046, "learning_rate": 7.3558661580408545e-06, "loss": 1.1466, "step": 284 }, { "epoch": 12.954545454545455, "grad_norm": 0.25782602954860684, "learning_rate": 7.256279622129215e-06, "loss": 0.9785, "step": 285 }, { "epoch": 13.0, "grad_norm": 0.15598380461908062, "learning_rate": 7.15873246433113e-06, "loss": 1.1248, "step": 286 }, { "epoch": 13.045454545454545, "grad_norm": 0.16456745649594778, "learning_rate": 7.063234511671206e-06, "loss": 1.1426, "step": 287 }, { "epoch": 13.090909090909092, "grad_norm": 0.17833577442195056, "learning_rate": 6.969795384734556e-06, "loss": 1.1278, "step": 288 }, { "epoch": 13.136363636363637, "grad_norm": 0.16618049623992226, "learning_rate": 6.878424496697554e-06, "loss": 1.0637, "step": 289 }, { "epoch": 13.181818181818182, "grad_norm": 0.1548002033964038, "learning_rate": 6.789131052379549e-06, "loss": 1.1438, "step": 290 }, { "epoch": 13.227272727272727, "grad_norm": 0.15277655748023802, "learning_rate": 6.7019240473155924e-06, "loss": 1.0657, "step": 291 }, { "epoch": 13.272727272727273, "grad_norm": 0.24777282879928914, "learning_rate": 6.616812266850187e-06, "loss": 1.159, "step": 292 }, { "epoch": 13.318181818181818, "grad_norm": 0.19128988519206985, "learning_rate": 6.5338042852522305e-06, "loss": 1.1772, "step": 293 }, { "epoch": 13.363636363636363, "grad_norm": 0.17365339407446134, "learning_rate": 6.4529084648512815e-06, "loss": 1.0807, "step": 294 }, { "epoch": 13.409090909090908, "grad_norm": 0.16321204192747293, "learning_rate": 6.374132955195062e-06, "loss": 1.1293, "step": 295 }, { "epoch": 13.454545454545455, "grad_norm": 0.1754755861725117, "learning_rate": 6.297485692228512e-06, "loss": 1.1434, "step": 296 }, { "epoch": 13.5, "grad_norm": 0.14452663905974894, "learning_rate": 6.222974397494309e-06, "loss": 1.0709, "step": 297 }, { "epoch": 13.545454545454545, "grad_norm": 0.14708535357644448, "learning_rate": 6.150606577354948e-06, "loss": 1.0964, "step": 298 }, { "epoch": 13.590909090909092, "grad_norm": 0.16471061622026806, "learning_rate": 6.080389522236585e-06, "loss": 1.114, "step": 299 }, { "epoch": 13.636363636363637, "grad_norm": 0.175358096168744, "learning_rate": 6.012330305894584e-06, "loss": 1.0573, "step": 300 }, { "epoch": 13.681818181818182, "grad_norm": 0.15744915474687687, "learning_rate": 5.946435784700869e-06, "loss": 1.1256, "step": 301 }, { "epoch": 13.727272727272727, "grad_norm": 0.1594724312088306, "learning_rate": 5.8827125969532365e-06, "loss": 1.0757, "step": 302 }, { "epoch": 13.772727272727273, "grad_norm": 0.16045998825705504, "learning_rate": 5.82116716220659e-06, "loss": 1.1302, "step": 303 }, { "epoch": 13.818181818181818, "grad_norm": 0.15174228657227892, "learning_rate": 5.76180568062623e-06, "loss": 1.1555, "step": 304 }, { "epoch": 13.863636363636363, "grad_norm": 0.16291888537269678, "learning_rate": 5.704634132363239e-06, "loss": 1.131, "step": 305 }, { "epoch": 13.909090909090908, "grad_norm": 0.15722592315890294, "learning_rate": 5.649658276952029e-06, "loss": 1.0328, "step": 306 }, { "epoch": 13.954545454545455, "grad_norm": 0.15455642700386937, "learning_rate": 5.596883652730137e-06, "loss": 1.0786, "step": 307 }, { "epoch": 14.0, "grad_norm": 0.16018814750974167, "learning_rate": 5.546315576280258e-06, "loss": 1.0977, "step": 308 }, { "epoch": 14.045454545454545, "grad_norm": 0.15691439028906481, "learning_rate": 5.497959141894671e-06, "loss": 1.1606, "step": 309 }, { "epoch": 14.090909090909092, "grad_norm": 0.15808702196585686, "learning_rate": 5.451819221062024e-06, "loss": 1.1181, "step": 310 }, { "epoch": 14.136363636363637, "grad_norm": 0.15115950620778784, "learning_rate": 5.4079004619765614e-06, "loss": 1.1576, "step": 311 }, { "epoch": 14.181818181818182, "grad_norm": 0.15083892267087215, "learning_rate": 5.3662072890698845e-06, "loss": 1.1048, "step": 312 }, { "epoch": 14.227272727272727, "grad_norm": 0.1481235847517611, "learning_rate": 5.326743902565208e-06, "loss": 1.0597, "step": 313 }, { "epoch": 14.272727272727273, "grad_norm": 0.17487949113995144, "learning_rate": 5.289514278054232e-06, "loss": 1.1048, "step": 314 }, { "epoch": 14.318181818181818, "grad_norm": 0.1565612302630643, "learning_rate": 5.254522166096635e-06, "loss": 1.1404, "step": 315 }, { "epoch": 14.363636363636363, "grad_norm": 0.15463375307382266, "learning_rate": 5.221771091842242e-06, "loss": 1.0867, "step": 316 }, { "epoch": 14.409090909090908, "grad_norm": 0.14777974752978296, "learning_rate": 5.191264354675882e-06, "loss": 1.1297, "step": 317 }, { "epoch": 14.454545454545455, "grad_norm": 0.15705272728049427, "learning_rate": 5.1630050278850275e-06, "loss": 1.1302, "step": 318 }, { "epoch": 14.5, "grad_norm": 0.15391726300293704, "learning_rate": 5.136995958350162e-06, "loss": 1.1421, "step": 319 }, { "epoch": 14.545454545454545, "grad_norm": 0.15288254329961756, "learning_rate": 5.113239766257999e-06, "loss": 1.1455, "step": 320 }, { "epoch": 14.590909090909092, "grad_norm": 0.14947122968152077, "learning_rate": 5.091738844837518e-06, "loss": 1.0706, "step": 321 }, { "epoch": 14.636363636363637, "grad_norm": 0.16745909284503774, "learning_rate": 5.0724953601188635e-06, "loss": 1.0375, "step": 322 }, { "epoch": 14.681818181818182, "grad_norm": 0.1515771191801389, "learning_rate": 5.0555112507151364e-06, "loss": 1.1166, "step": 323 }, { "epoch": 14.727272727272727, "grad_norm": 0.15386111047539605, "learning_rate": 5.0407882276271015e-06, "loss": 1.0891, "step": 324 }, { "epoch": 14.772727272727273, "grad_norm": 1.3735036578655186, "learning_rate": 5.028327774070807e-06, "loss": 1.0836, "step": 325 }, { "epoch": 14.818181818181818, "grad_norm": 0.1570470593366731, "learning_rate": 5.018131145328181e-06, "loss": 1.0566, "step": 326 }, { "epoch": 14.863636363636363, "grad_norm": 0.15950636214118602, "learning_rate": 5.0101993686205585e-06, "loss": 1.1019, "step": 327 }, { "epoch": 14.909090909090908, "grad_norm": 0.15627633379629186, "learning_rate": 5.004533243005204e-06, "loss": 1.0666, "step": 328 }, { "epoch": 14.954545454545455, "grad_norm": 0.157736145851946, "learning_rate": 5.0011333392948126e-06, "loss": 1.1023, "step": 329 }, { "epoch": 15.0, "grad_norm": 0.15122659418177306, "learning_rate": 5e-06, "loss": 1.1027, "step": 330 }, { "epoch": 15.0, "step": 330, "total_flos": 103645451026432.0, "train_loss": 1.592864108988733, "train_runtime": 2075.5919, "train_samples_per_second": 2.544, "train_steps_per_second": 0.159 } ], "logging_steps": 1, "max_steps": 330, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 103645451026432.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }