{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 6564, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009140767824497258, "grad_norm": 12.370736122131348, "learning_rate": 2.6109660574412534e-06, "loss": 36.987, "step": 10 }, { "epoch": 0.018281535648994516, "grad_norm": 35.588836669921875, "learning_rate": 5.221932114882507e-06, "loss": 15.4689, "step": 20 }, { "epoch": 0.027422303473491772, "grad_norm": 135.9927978515625, "learning_rate": 7.832898172323761e-06, "loss": 35.3912, "step": 30 }, { "epoch": 0.03656307129798903, "grad_norm": 20.429269790649414, "learning_rate": 1.0443864229765014e-05, "loss": 11.1572, "step": 40 }, { "epoch": 0.04570383912248629, "grad_norm": 23.74187469482422, "learning_rate": 1.3054830287206268e-05, "loss": 12.816, "step": 50 }, { "epoch": 0.054844606946983544, "grad_norm": 24.825340270996094, "learning_rate": 1.5665796344647522e-05, "loss": 9.497, "step": 60 }, { "epoch": 0.06398537477148081, "grad_norm": 28.620555877685547, "learning_rate": 1.8276762402088773e-05, "loss": 27.8313, "step": 70 }, { "epoch": 0.07312614259597806, "grad_norm": 31.348176956176758, "learning_rate": 2.0887728459530027e-05, "loss": 31.1205, "step": 80 }, { "epoch": 0.08226691042047532, "grad_norm": 25.55410385131836, "learning_rate": 2.349869451697128e-05, "loss": 13.5872, "step": 90 }, { "epoch": 0.09140767824497258, "grad_norm": 61.1695671081543, "learning_rate": 2.6109660574412536e-05, "loss": 23.0605, "step": 100 }, { "epoch": 0.10054844606946983, "grad_norm": 22.535160064697266, "learning_rate": 2.8720626631853787e-05, "loss": 23.0383, "step": 110 }, { "epoch": 0.10968921389396709, "grad_norm": 41.989540100097656, "learning_rate": 3.1331592689295045e-05, "loss": 17.7472, "step": 120 }, { "epoch": 0.11882998171846434, "grad_norm": 26.50091552734375, "learning_rate": 3.394255874673629e-05, "loss": 23.2046, "step": 130 }, { "epoch": 0.12797074954296161, "grad_norm": 11.607914924621582, "learning_rate": 3.6553524804177546e-05, "loss": 12.0288, "step": 140 }, { "epoch": 0.13711151736745886, "grad_norm": 7.147351264953613, "learning_rate": 3.91644908616188e-05, "loss": 10.2779, "step": 150 }, { "epoch": 0.14625228519195613, "grad_norm": 11.11351490020752, "learning_rate": 4.1775456919060055e-05, "loss": 35.3618, "step": 160 }, { "epoch": 0.15539305301645337, "grad_norm": 12.485090255737305, "learning_rate": 4.438642297650131e-05, "loss": 6.2166, "step": 170 }, { "epoch": 0.16453382084095064, "grad_norm": 145.72616577148438, "learning_rate": 4.699738903394256e-05, "loss": 27.5459, "step": 180 }, { "epoch": 0.1736745886654479, "grad_norm": 23.479036331176758, "learning_rate": 4.960835509138381e-05, "loss": 10.0974, "step": 190 }, { "epoch": 0.18281535648994515, "grad_norm": 126.79408264160156, "learning_rate": 5.221932114882507e-05, "loss": 22.6112, "step": 200 }, { "epoch": 0.19195612431444242, "grad_norm": 39.95475769042969, "learning_rate": 5.483028720626632e-05, "loss": 19.3106, "step": 210 }, { "epoch": 0.20109689213893966, "grad_norm": 44.722843170166016, "learning_rate": 5.7441253263707574e-05, "loss": 10.5616, "step": 220 }, { "epoch": 0.21023765996343693, "grad_norm": 3.150900363922119, "learning_rate": 6.005221932114883e-05, "loss": 13.019, "step": 230 }, { "epoch": 0.21937842778793418, "grad_norm": 83.21868896484375, "learning_rate": 6.266318537859009e-05, "loss": 17.95, "step": 240 }, { "epoch": 0.22851919561243145, "grad_norm": 
46.631446838378906, "learning_rate": 6.527415143603134e-05, "loss": 17.4659, "step": 250 }, { "epoch": 0.2376599634369287, "grad_norm": 4.102182865142822, "learning_rate": 6.788511749347258e-05, "loss": 15.8148, "step": 260 }, { "epoch": 0.24680073126142596, "grad_norm": 55.98577117919922, "learning_rate": 7.049608355091384e-05, "loss": 6.8381, "step": 270 }, { "epoch": 0.25594149908592323, "grad_norm": 31.68951416015625, "learning_rate": 7.310704960835509e-05, "loss": 11.6472, "step": 280 }, { "epoch": 0.26508226691042047, "grad_norm": 37.748939514160156, "learning_rate": 7.571801566579635e-05, "loss": 15.4919, "step": 290 }, { "epoch": 0.2742230347349177, "grad_norm": 32.00756072998047, "learning_rate": 7.83289817232376e-05, "loss": 10.5245, "step": 300 }, { "epoch": 0.283363802559415, "grad_norm": 32.15166473388672, "learning_rate": 8.093994778067886e-05, "loss": 13.0103, "step": 310 }, { "epoch": 0.29250457038391225, "grad_norm": 59.19680404663086, "learning_rate": 8.355091383812011e-05, "loss": 12.5773, "step": 320 }, { "epoch": 0.3016453382084095, "grad_norm": 50.93272018432617, "learning_rate": 8.616187989556136e-05, "loss": 16.9995, "step": 330 }, { "epoch": 0.31078610603290674, "grad_norm": 4.630652904510498, "learning_rate": 8.877284595300262e-05, "loss": 25.0165, "step": 340 }, { "epoch": 0.31992687385740404, "grad_norm": 3.24698543548584, "learning_rate": 9.138381201044387e-05, "loss": 16.605, "step": 350 }, { "epoch": 0.3290676416819013, "grad_norm": 42.776981353759766, "learning_rate": 9.399477806788513e-05, "loss": 16.2757, "step": 360 }, { "epoch": 0.3382084095063985, "grad_norm": 89.18607330322266, "learning_rate": 9.660574412532638e-05, "loss": 26.5941, "step": 370 }, { "epoch": 0.3473491773308958, "grad_norm": 67.55815887451172, "learning_rate": 9.921671018276762e-05, "loss": 16.5563, "step": 380 }, { "epoch": 0.35648994515539306, "grad_norm": 12.19714641571045, "learning_rate": 9.990378006872852e-05, "loss": 9.3716, "step": 390 }, { "epoch": 0.3656307129798903, "grad_norm": 3.0744447708129883, "learning_rate": 9.976632302405498e-05, "loss": 7.0558, "step": 400 }, { "epoch": 0.37477148080438755, "grad_norm": 18.78617286682129, "learning_rate": 9.962886597938144e-05, "loss": 7.6828, "step": 410 }, { "epoch": 0.38391224862888484, "grad_norm": 13.532540321350098, "learning_rate": 9.949140893470791e-05, "loss": 6.9473, "step": 420 }, { "epoch": 0.3930530164533821, "grad_norm": 14.925369262695312, "learning_rate": 9.935395189003437e-05, "loss": 11.9202, "step": 430 }, { "epoch": 0.40219378427787933, "grad_norm": 80.48348236083984, "learning_rate": 9.921649484536083e-05, "loss": 12.2047, "step": 440 }, { "epoch": 0.4113345521023766, "grad_norm": 3.9793624877929688, "learning_rate": 9.90790378006873e-05, "loss": 9.5004, "step": 450 }, { "epoch": 0.42047531992687387, "grad_norm": 111.30896759033203, "learning_rate": 9.894158075601375e-05, "loss": 12.7129, "step": 460 }, { "epoch": 0.4296160877513711, "grad_norm": 45.249576568603516, "learning_rate": 9.880412371134022e-05, "loss": 10.4175, "step": 470 }, { "epoch": 0.43875685557586835, "grad_norm": 13.036559104919434, "learning_rate": 9.866666666666668e-05, "loss": 14.1077, "step": 480 }, { "epoch": 0.44789762340036565, "grad_norm": 46.61274337768555, "learning_rate": 9.852920962199314e-05, "loss": 9.0448, "step": 490 }, { "epoch": 0.4570383912248629, "grad_norm": 6.358148097991943, "learning_rate": 9.83917525773196e-05, "loss": 12.5849, "step": 500 }, { "epoch": 0.46617915904936014, "grad_norm": 15.417573928833008, 
"learning_rate": 9.825429553264606e-05, "loss": 6.0911, "step": 510 }, { "epoch": 0.4753199268738574, "grad_norm": 61.236305236816406, "learning_rate": 9.811683848797252e-05, "loss": 9.3876, "step": 520 }, { "epoch": 0.4844606946983547, "grad_norm": 25.130023956298828, "learning_rate": 9.797938144329898e-05, "loss": 5.829, "step": 530 }, { "epoch": 0.4936014625228519, "grad_norm": 46.94304656982422, "learning_rate": 9.784192439862544e-05, "loss": 9.5231, "step": 540 }, { "epoch": 0.5027422303473492, "grad_norm": 52.86389923095703, "learning_rate": 9.770446735395188e-05, "loss": 9.6375, "step": 550 }, { "epoch": 0.5118829981718465, "grad_norm": 22.981060028076172, "learning_rate": 9.756701030927836e-05, "loss": 14.71, "step": 560 }, { "epoch": 0.5210237659963437, "grad_norm": 11.894240379333496, "learning_rate": 9.742955326460482e-05, "loss": 5.0481, "step": 570 }, { "epoch": 0.5301645338208409, "grad_norm": 8.134305000305176, "learning_rate": 9.729209621993128e-05, "loss": 5.92, "step": 580 }, { "epoch": 0.5393053016453382, "grad_norm": 24.126920700073242, "learning_rate": 9.715463917525774e-05, "loss": 4.9919, "step": 590 }, { "epoch": 0.5484460694698354, "grad_norm": 11.994281768798828, "learning_rate": 9.70171821305842e-05, "loss": 7.8652, "step": 600 }, { "epoch": 0.5575868372943327, "grad_norm": 25.653837203979492, "learning_rate": 9.687972508591066e-05, "loss": 10.3586, "step": 610 }, { "epoch": 0.56672760511883, "grad_norm": 63.768672943115234, "learning_rate": 9.674226804123712e-05, "loss": 13.0511, "step": 620 }, { "epoch": 0.5758683729433273, "grad_norm": 20.820335388183594, "learning_rate": 9.660481099656358e-05, "loss": 6.11, "step": 630 }, { "epoch": 0.5850091407678245, "grad_norm": 14.367292404174805, "learning_rate": 9.646735395189004e-05, "loss": 10.9518, "step": 640 }, { "epoch": 0.5941499085923218, "grad_norm": 60.2218132019043, "learning_rate": 9.63298969072165e-05, "loss": 7.7834, "step": 650 }, { "epoch": 0.603290676416819, "grad_norm": 12.209604263305664, "learning_rate": 9.619243986254296e-05, "loss": 10.9247, "step": 660 }, { "epoch": 0.6124314442413162, "grad_norm": 7.226526260375977, "learning_rate": 9.605498281786942e-05, "loss": 6.1566, "step": 670 }, { "epoch": 0.6215722120658135, "grad_norm": 22.709333419799805, "learning_rate": 9.591752577319588e-05, "loss": 6.3354, "step": 680 }, { "epoch": 0.6307129798903108, "grad_norm": 73.90802001953125, "learning_rate": 9.578006872852234e-05, "loss": 3.818, "step": 690 }, { "epoch": 0.6398537477148081, "grad_norm": 5.468135833740234, "learning_rate": 9.56426116838488e-05, "loss": 6.2398, "step": 700 }, { "epoch": 0.6489945155393053, "grad_norm": 42.02013397216797, "learning_rate": 9.550515463917526e-05, "loss": 5.9792, "step": 710 }, { "epoch": 0.6581352833638026, "grad_norm": 22.250171661376953, "learning_rate": 9.536769759450172e-05, "loss": 7.2283, "step": 720 }, { "epoch": 0.6672760511882998, "grad_norm": 13.502397537231445, "learning_rate": 9.523024054982819e-05, "loss": 4.2156, "step": 730 }, { "epoch": 0.676416819012797, "grad_norm": 24.216949462890625, "learning_rate": 9.509278350515465e-05, "loss": 4.7861, "step": 740 }, { "epoch": 0.6855575868372943, "grad_norm": 45.543880462646484, "learning_rate": 9.495532646048111e-05, "loss": 8.098, "step": 750 }, { "epoch": 0.6946983546617916, "grad_norm": 28.0845947265625, "learning_rate": 9.481786941580757e-05, "loss": 4.5235, "step": 760 }, { "epoch": 0.7038391224862889, "grad_norm": 99.67831420898438, "learning_rate": 9.468041237113402e-05, "loss": 10.3558, "step": 
770 }, { "epoch": 0.7129798903107861, "grad_norm": 55.37455749511719, "learning_rate": 9.454295532646048e-05, "loss": 7.9366, "step": 780 }, { "epoch": 0.7221206581352834, "grad_norm": 69.7910385131836, "learning_rate": 9.440549828178694e-05, "loss": 10.8708, "step": 790 }, { "epoch": 0.7312614259597806, "grad_norm": 57.91436004638672, "learning_rate": 9.42680412371134e-05, "loss": 9.6494, "step": 800 }, { "epoch": 0.7404021937842779, "grad_norm": 23.30112648010254, "learning_rate": 9.413058419243986e-05, "loss": 5.2776, "step": 810 }, { "epoch": 0.7495429616087751, "grad_norm": 3.633213758468628, "learning_rate": 9.399312714776632e-05, "loss": 3.9024, "step": 820 }, { "epoch": 0.7586837294332724, "grad_norm": 28.030031204223633, "learning_rate": 9.385567010309278e-05, "loss": 7.8747, "step": 830 }, { "epoch": 0.7678244972577697, "grad_norm": 13.434077262878418, "learning_rate": 9.371821305841924e-05, "loss": 5.245, "step": 840 }, { "epoch": 0.7769652650822669, "grad_norm": 16.447040557861328, "learning_rate": 9.35807560137457e-05, "loss": 5.2299, "step": 850 }, { "epoch": 0.7861060329067642, "grad_norm": 77.3619384765625, "learning_rate": 9.344329896907216e-05, "loss": 8.3412, "step": 860 }, { "epoch": 0.7952468007312614, "grad_norm": 13.566094398498535, "learning_rate": 9.330584192439864e-05, "loss": 3.8435, "step": 870 }, { "epoch": 0.8043875685557587, "grad_norm": 34.86231994628906, "learning_rate": 9.31683848797251e-05, "loss": 6.3295, "step": 880 }, { "epoch": 0.8135283363802559, "grad_norm": 51.13885498046875, "learning_rate": 9.303092783505156e-05, "loss": 7.9717, "step": 890 }, { "epoch": 0.8226691042047533, "grad_norm": 14.308514595031738, "learning_rate": 9.289347079037802e-05, "loss": 4.3259, "step": 900 }, { "epoch": 0.8318098720292505, "grad_norm": 20.286693572998047, "learning_rate": 9.275601374570448e-05, "loss": 2.3497, "step": 910 }, { "epoch": 0.8409506398537477, "grad_norm": 3.1538331508636475, "learning_rate": 9.261855670103094e-05, "loss": 4.4724, "step": 920 }, { "epoch": 0.850091407678245, "grad_norm": 3.950300931930542, "learning_rate": 9.24810996563574e-05, "loss": 3.6357, "step": 930 }, { "epoch": 0.8592321755027422, "grad_norm": 107.22858428955078, "learning_rate": 9.234364261168386e-05, "loss": 10.6746, "step": 940 }, { "epoch": 0.8683729433272395, "grad_norm": 35.789302825927734, "learning_rate": 9.220618556701032e-05, "loss": 5.7225, "step": 950 }, { "epoch": 0.8775137111517367, "grad_norm": 18.202207565307617, "learning_rate": 9.206872852233678e-05, "loss": 3.3507, "step": 960 }, { "epoch": 0.886654478976234, "grad_norm": 44.243858337402344, "learning_rate": 9.193127147766324e-05, "loss": 6.0851, "step": 970 }, { "epoch": 0.8957952468007313, "grad_norm": 10.109786033630371, "learning_rate": 9.17938144329897e-05, "loss": 4.3643, "step": 980 }, { "epoch": 0.9049360146252285, "grad_norm": 40.355831146240234, "learning_rate": 9.165635738831616e-05, "loss": 3.4289, "step": 990 }, { "epoch": 0.9140767824497258, "grad_norm": 68.06707763671875, "learning_rate": 9.151890034364262e-05, "loss": 3.8808, "step": 1000 }, { "epoch": 0.923217550274223, "grad_norm": 51.31651306152344, "learning_rate": 9.138144329896908e-05, "loss": 7.954, "step": 1010 }, { "epoch": 0.9323583180987203, "grad_norm": 19.272785186767578, "learning_rate": 9.124398625429554e-05, "loss": 3.5655, "step": 1020 }, { "epoch": 0.9414990859232175, "grad_norm": 48.537540435791016, "learning_rate": 9.1106529209622e-05, "loss": 5.4733, "step": 1030 }, { "epoch": 0.9506398537477148, "grad_norm": 
25.16614532470703, "learning_rate": 9.096907216494846e-05, "loss": 4.6179, "step": 1040 }, { "epoch": 0.9597806215722121, "grad_norm": 24.13741111755371, "learning_rate": 9.083161512027492e-05, "loss": 5.0826, "step": 1050 }, { "epoch": 0.9689213893967094, "grad_norm": 11.616049766540527, "learning_rate": 9.069415807560138e-05, "loss": 4.038, "step": 1060 }, { "epoch": 0.9780621572212066, "grad_norm": 33.03008270263672, "learning_rate": 9.055670103092784e-05, "loss": 3.9123, "step": 1070 }, { "epoch": 0.9872029250457038, "grad_norm": 31.290311813354492, "learning_rate": 9.04192439862543e-05, "loss": 2.4426, "step": 1080 }, { "epoch": 0.9963436928702011, "grad_norm": 12.836657524108887, "learning_rate": 9.028178694158076e-05, "loss": 3.4412, "step": 1090 }, { "epoch": 1.0054844606946984, "grad_norm": 10.609333038330078, "learning_rate": 9.014432989690722e-05, "loss": 3.9651, "step": 1100 }, { "epoch": 1.0146252285191957, "grad_norm": 27.414073944091797, "learning_rate": 9.000687285223368e-05, "loss": 9.0338, "step": 1110 }, { "epoch": 1.023765996343693, "grad_norm": 41.76542282104492, "learning_rate": 8.986941580756014e-05, "loss": 3.8182, "step": 1120 }, { "epoch": 1.0329067641681902, "grad_norm": 40.475032806396484, "learning_rate": 8.97319587628866e-05, "loss": 7.0833, "step": 1130 }, { "epoch": 1.0420475319926874, "grad_norm": 3.250760793685913, "learning_rate": 8.959450171821306e-05, "loss": 5.4448, "step": 1140 }, { "epoch": 1.0511882998171846, "grad_norm": 63.5485725402832, "learning_rate": 8.945704467353952e-05, "loss": 4.7609, "step": 1150 }, { "epoch": 1.0603290676416819, "grad_norm": 6.022167682647705, "learning_rate": 8.931958762886598e-05, "loss": 6.1579, "step": 1160 }, { "epoch": 1.0694698354661791, "grad_norm": 10.743149757385254, "learning_rate": 8.918213058419244e-05, "loss": 5.3672, "step": 1170 }, { "epoch": 1.0786106032906764, "grad_norm": 40.21040725708008, "learning_rate": 8.904467353951891e-05, "loss": 4.6747, "step": 1180 }, { "epoch": 1.0877513711151736, "grad_norm": 49.98366165161133, "learning_rate": 8.890721649484537e-05, "loss": 2.6949, "step": 1190 }, { "epoch": 1.0968921389396709, "grad_norm": 14.809346199035645, "learning_rate": 8.876975945017183e-05, "loss": 3.5252, "step": 1200 }, { "epoch": 1.106032906764168, "grad_norm": 22.286317825317383, "learning_rate": 8.86323024054983e-05, "loss": 1.6055, "step": 1210 }, { "epoch": 1.1151736745886653, "grad_norm": 17.506837844848633, "learning_rate": 8.849484536082475e-05, "loss": 2.8383, "step": 1220 }, { "epoch": 1.1243144424131628, "grad_norm": 16.956575393676758, "learning_rate": 8.83573883161512e-05, "loss": 3.4014, "step": 1230 }, { "epoch": 1.13345521023766, "grad_norm": 19.66242790222168, "learning_rate": 8.821993127147766e-05, "loss": 3.5567, "step": 1240 }, { "epoch": 1.1425959780621573, "grad_norm": 5.644603252410889, "learning_rate": 8.808247422680412e-05, "loss": 2.3742, "step": 1250 }, { "epoch": 1.1517367458866545, "grad_norm": 16.857858657836914, "learning_rate": 8.794501718213058e-05, "loss": 4.808, "step": 1260 }, { "epoch": 1.1608775137111518, "grad_norm": 5.14100456237793, "learning_rate": 8.780756013745704e-05, "loss": 3.4968, "step": 1270 }, { "epoch": 1.170018281535649, "grad_norm": 49.16345977783203, "learning_rate": 8.76701030927835e-05, "loss": 3.07, "step": 1280 }, { "epoch": 1.1791590493601463, "grad_norm": 4.4499430656433105, "learning_rate": 8.753264604810996e-05, "loss": 5.6824, "step": 1290 }, { "epoch": 1.1882998171846435, "grad_norm": 27.19160270690918, "learning_rate": 
8.739518900343642e-05, "loss": 3.0071, "step": 1300 }, { "epoch": 1.1974405850091407, "grad_norm": 43.31160354614258, "learning_rate": 8.72577319587629e-05, "loss": 5.9902, "step": 1310 }, { "epoch": 1.206581352833638, "grad_norm": 28.998746871948242, "learning_rate": 8.712027491408936e-05, "loss": 4.6574, "step": 1320 }, { "epoch": 1.2157221206581352, "grad_norm": 28.488924026489258, "learning_rate": 8.698281786941582e-05, "loss": 2.8268, "step": 1330 }, { "epoch": 1.2248628884826325, "grad_norm": 5.329763412475586, "learning_rate": 8.684536082474228e-05, "loss": 3.9106, "step": 1340 }, { "epoch": 1.2340036563071297, "grad_norm": 9.086085319519043, "learning_rate": 8.670790378006874e-05, "loss": 4.6363, "step": 1350 }, { "epoch": 1.2431444241316272, "grad_norm": 13.057124137878418, "learning_rate": 8.65704467353952e-05, "loss": 3.9971, "step": 1360 }, { "epoch": 1.2522851919561244, "grad_norm": 31.13848876953125, "learning_rate": 8.643298969072166e-05, "loss": 1.9081, "step": 1370 }, { "epoch": 1.2614259597806217, "grad_norm": 4.629405498504639, "learning_rate": 8.629553264604812e-05, "loss": 2.8613, "step": 1380 }, { "epoch": 1.270566727605119, "grad_norm": 2.575673818588257, "learning_rate": 8.615807560137458e-05, "loss": 5.9426, "step": 1390 }, { "epoch": 1.2797074954296161, "grad_norm": 28.09403419494629, "learning_rate": 8.602061855670104e-05, "loss": 2.8701, "step": 1400 }, { "epoch": 1.2888482632541134, "grad_norm": 15.765434265136719, "learning_rate": 8.58831615120275e-05, "loss": 3.4717, "step": 1410 }, { "epoch": 1.2979890310786106, "grad_norm": 10.838272094726562, "learning_rate": 8.574570446735396e-05, "loss": 4.087, "step": 1420 }, { "epoch": 1.3071297989031079, "grad_norm": 35.35599899291992, "learning_rate": 8.560824742268042e-05, "loss": 6.6286, "step": 1430 }, { "epoch": 1.3162705667276051, "grad_norm": 19.76988983154297, "learning_rate": 8.547079037800688e-05, "loss": 2.7263, "step": 1440 }, { "epoch": 1.3254113345521024, "grad_norm": 8.665532112121582, "learning_rate": 8.533333333333334e-05, "loss": 2.8431, "step": 1450 }, { "epoch": 1.3345521023765996, "grad_norm": 14.710444450378418, "learning_rate": 8.51958762886598e-05, "loss": 2.9496, "step": 1460 }, { "epoch": 1.3436928702010968, "grad_norm": 25.26740264892578, "learning_rate": 8.505841924398626e-05, "loss": 3.061, "step": 1470 }, { "epoch": 1.352833638025594, "grad_norm": 23.122854232788086, "learning_rate": 8.492096219931272e-05, "loss": 2.2172, "step": 1480 }, { "epoch": 1.3619744058500913, "grad_norm": 2.3405656814575195, "learning_rate": 8.478350515463918e-05, "loss": 3.5349, "step": 1490 }, { "epoch": 1.3711151736745886, "grad_norm": 29.025020599365234, "learning_rate": 8.464604810996564e-05, "loss": 3.1462, "step": 1500 }, { "epoch": 1.3802559414990858, "grad_norm": 8.402873992919922, "learning_rate": 8.45085910652921e-05, "loss": 6.8825, "step": 1510 }, { "epoch": 1.389396709323583, "grad_norm": 19.748918533325195, "learning_rate": 8.437113402061856e-05, "loss": 2.6068, "step": 1520 }, { "epoch": 1.3985374771480805, "grad_norm": 82.48213195800781, "learning_rate": 8.423367697594502e-05, "loss": 8.0128, "step": 1530 }, { "epoch": 1.4076782449725778, "grad_norm": 20.15019989013672, "learning_rate": 8.409621993127148e-05, "loss": 3.7551, "step": 1540 }, { "epoch": 1.416819012797075, "grad_norm": 27.346290588378906, "learning_rate": 8.395876288659794e-05, "loss": 4.6377, "step": 1550 }, { "epoch": 1.4259597806215722, "grad_norm": 29.291259765625, "learning_rate": 8.38213058419244e-05, "loss": 3.3131, "step": 
1560 }, { "epoch": 1.4351005484460695, "grad_norm": 20.125619888305664, "learning_rate": 8.368384879725086e-05, "loss": 3.6933, "step": 1570 }, { "epoch": 1.4442413162705667, "grad_norm": 2.6610958576202393, "learning_rate": 8.354639175257732e-05, "loss": 7.4656, "step": 1580 }, { "epoch": 1.453382084095064, "grad_norm": 20.6624698638916, "learning_rate": 8.340893470790378e-05, "loss": 5.808, "step": 1590 }, { "epoch": 1.4625228519195612, "grad_norm": 44.88820266723633, "learning_rate": 8.327147766323024e-05, "loss": 4.7827, "step": 1600 }, { "epoch": 1.4716636197440585, "grad_norm": 17.63262176513672, "learning_rate": 8.31340206185567e-05, "loss": 2.8129, "step": 1610 }, { "epoch": 1.4808043875685557, "grad_norm": 35.04880142211914, "learning_rate": 8.299656357388317e-05, "loss": 3.4804, "step": 1620 }, { "epoch": 1.489945155393053, "grad_norm": 37.83504104614258, "learning_rate": 8.285910652920964e-05, "loss": 5.5062, "step": 1630 }, { "epoch": 1.4990859232175504, "grad_norm": 20.243568420410156, "learning_rate": 8.27216494845361e-05, "loss": 3.1748, "step": 1640 }, { "epoch": 1.5082266910420477, "grad_norm": 34.09272384643555, "learning_rate": 8.258419243986256e-05, "loss": 4.6195, "step": 1650 }, { "epoch": 1.517367458866545, "grad_norm": 35.49079895019531, "learning_rate": 8.244673539518902e-05, "loss": 5.0, "step": 1660 }, { "epoch": 1.5265082266910421, "grad_norm": 38.871978759765625, "learning_rate": 8.230927835051548e-05, "loss": 4.5287, "step": 1670 }, { "epoch": 1.5356489945155394, "grad_norm": 51.830039978027344, "learning_rate": 8.217182130584192e-05, "loss": 4.0049, "step": 1680 }, { "epoch": 1.5447897623400366, "grad_norm": 52.82420349121094, "learning_rate": 8.203436426116838e-05, "loss": 3.8007, "step": 1690 }, { "epoch": 1.5539305301645339, "grad_norm": 44.84202575683594, "learning_rate": 8.189690721649484e-05, "loss": 5.2886, "step": 1700 }, { "epoch": 1.563071297989031, "grad_norm": 60.55004119873047, "learning_rate": 8.17594501718213e-05, "loss": 3.9711, "step": 1710 }, { "epoch": 1.5722120658135283, "grad_norm": 17.61454200744629, "learning_rate": 8.162199312714776e-05, "loss": 3.8944, "step": 1720 }, { "epoch": 1.5813528336380256, "grad_norm": 11.059786796569824, "learning_rate": 8.148453608247422e-05, "loss": 3.2088, "step": 1730 }, { "epoch": 1.5904936014625228, "grad_norm": 5.978787899017334, "learning_rate": 8.134707903780068e-05, "loss": 3.1343, "step": 1740 }, { "epoch": 1.59963436928702, "grad_norm": 13.30731201171875, "learning_rate": 8.120962199312714e-05, "loss": 1.7047, "step": 1750 }, { "epoch": 1.6087751371115173, "grad_norm": 44.732967376708984, "learning_rate": 8.107216494845362e-05, "loss": 4.8081, "step": 1760 }, { "epoch": 1.6179159049360146, "grad_norm": 3.613630533218384, "learning_rate": 8.093470790378008e-05, "loss": 3.5862, "step": 1770 }, { "epoch": 1.6270566727605118, "grad_norm": 3.538996458053589, "learning_rate": 8.079725085910654e-05, "loss": 3.0854, "step": 1780 }, { "epoch": 1.636197440585009, "grad_norm": 30.810333251953125, "learning_rate": 8.0659793814433e-05, "loss": 3.2235, "step": 1790 }, { "epoch": 1.6453382084095063, "grad_norm": 15.83150577545166, "learning_rate": 8.052233676975946e-05, "loss": 2.8161, "step": 1800 }, { "epoch": 1.6544789762340035, "grad_norm": 55.83029556274414, "learning_rate": 8.038487972508592e-05, "loss": 3.5098, "step": 1810 }, { "epoch": 1.6636197440585008, "grad_norm": 56.56729507446289, "learning_rate": 8.024742268041238e-05, "loss": 5.0419, "step": 1820 }, { "epoch": 1.672760511882998, "grad_norm": 
2.367525339126587, "learning_rate": 8.010996563573884e-05, "loss": 3.8107, "step": 1830 }, { "epoch": 1.6819012797074955, "grad_norm": 9.977028846740723, "learning_rate": 7.99725085910653e-05, "loss": 5.5031, "step": 1840 }, { "epoch": 1.6910420475319927, "grad_norm": 51.213600158691406, "learning_rate": 7.983505154639176e-05, "loss": 5.6861, "step": 1850 }, { "epoch": 1.70018281535649, "grad_norm": 33.10087966918945, "learning_rate": 7.969759450171822e-05, "loss": 2.7235, "step": 1860 }, { "epoch": 1.7093235831809872, "grad_norm": 5.704224586486816, "learning_rate": 7.956013745704468e-05, "loss": 4.4562, "step": 1870 }, { "epoch": 1.7184643510054844, "grad_norm": 2.6250569820404053, "learning_rate": 7.942268041237114e-05, "loss": 2.5911, "step": 1880 }, { "epoch": 1.7276051188299817, "grad_norm": 2.4583847522735596, "learning_rate": 7.92852233676976e-05, "loss": 4.2191, "step": 1890 }, { "epoch": 1.736745886654479, "grad_norm": 7.257296562194824, "learning_rate": 7.914776632302406e-05, "loss": 3.3345, "step": 1900 }, { "epoch": 1.7458866544789764, "grad_norm": 16.38401985168457, "learning_rate": 7.901030927835052e-05, "loss": 2.407, "step": 1910 }, { "epoch": 1.7550274223034736, "grad_norm": 58.603084564208984, "learning_rate": 7.887285223367698e-05, "loss": 5.0646, "step": 1920 }, { "epoch": 1.7641681901279709, "grad_norm": 3.5304384231567383, "learning_rate": 7.873539518900344e-05, "loss": 3.9886, "step": 1930 }, { "epoch": 1.7733089579524681, "grad_norm": 13.106943130493164, "learning_rate": 7.85979381443299e-05, "loss": 3.3263, "step": 1940 }, { "epoch": 1.7824497257769654, "grad_norm": 19.20189666748047, "learning_rate": 7.846048109965636e-05, "loss": 2.7366, "step": 1950 }, { "epoch": 1.7915904936014626, "grad_norm": 41.548553466796875, "learning_rate": 7.832302405498282e-05, "loss": 4.3702, "step": 1960 }, { "epoch": 1.8007312614259599, "grad_norm": 33.38926696777344, "learning_rate": 7.818556701030928e-05, "loss": 3.7298, "step": 1970 }, { "epoch": 1.809872029250457, "grad_norm": 18.56389808654785, "learning_rate": 7.804810996563574e-05, "loss": 4.0066, "step": 1980 }, { "epoch": 1.8190127970749543, "grad_norm": 48.816986083984375, "learning_rate": 7.79106529209622e-05, "loss": 4.9297, "step": 1990 }, { "epoch": 1.8281535648994516, "grad_norm": 15.528534889221191, "learning_rate": 7.777319587628866e-05, "loss": 2.3897, "step": 2000 }, { "epoch": 1.8372943327239488, "grad_norm": 9.369296073913574, "learning_rate": 7.763573883161512e-05, "loss": 5.4963, "step": 2010 }, { "epoch": 1.846435100548446, "grad_norm": 20.032461166381836, "learning_rate": 7.749828178694158e-05, "loss": 2.735, "step": 2020 }, { "epoch": 1.8555758683729433, "grad_norm": 38.112083435058594, "learning_rate": 7.736082474226804e-05, "loss": 3.9523, "step": 2030 }, { "epoch": 1.8647166361974405, "grad_norm": 5.362397193908691, "learning_rate": 7.72233676975945e-05, "loss": 3.9655, "step": 2040 }, { "epoch": 1.8738574040219378, "grad_norm": 8.610513687133789, "learning_rate": 7.708591065292096e-05, "loss": 4.1931, "step": 2050 }, { "epoch": 1.882998171846435, "grad_norm": 18.511356353759766, "learning_rate": 7.694845360824742e-05, "loss": 1.9007, "step": 2060 }, { "epoch": 1.8921389396709323, "grad_norm": 15.115946769714355, "learning_rate": 7.68109965635739e-05, "loss": 2.6706, "step": 2070 }, { "epoch": 1.9012797074954295, "grad_norm": 33.0804557800293, "learning_rate": 7.667353951890036e-05, "loss": 3.7002, "step": 2080 }, { "epoch": 1.9104204753199268, "grad_norm": 8.883389472961426, "learning_rate": 
7.653608247422682e-05, "loss": 4.2629, "step": 2090 }, { "epoch": 1.919561243144424, "grad_norm": 22.31747817993164, "learning_rate": 7.639862542955328e-05, "loss": 3.6954, "step": 2100 }, { "epoch": 1.9287020109689212, "grad_norm": 24.325546264648438, "learning_rate": 7.626116838487974e-05, "loss": 5.954, "step": 2110 }, { "epoch": 1.9378427787934185, "grad_norm": 13.499019622802734, "learning_rate": 7.61237113402062e-05, "loss": 2.2838, "step": 2120 }, { "epoch": 1.946983546617916, "grad_norm": 37.1631965637207, "learning_rate": 7.598625429553266e-05, "loss": 3.9877, "step": 2130 }, { "epoch": 1.9561243144424132, "grad_norm": 18.608469009399414, "learning_rate": 7.58487972508591e-05, "loss": 2.9827, "step": 2140 }, { "epoch": 1.9652650822669104, "grad_norm": 2.4131762981414795, "learning_rate": 7.571134020618556e-05, "loss": 2.631, "step": 2150 }, { "epoch": 1.9744058500914077, "grad_norm": 16.322826385498047, "learning_rate": 7.557388316151202e-05, "loss": 3.3235, "step": 2160 }, { "epoch": 1.983546617915905, "grad_norm": 50.81422805786133, "learning_rate": 7.543642611683848e-05, "loss": 2.0557, "step": 2170 }, { "epoch": 1.9926873857404022, "grad_norm": 20.819671630859375, "learning_rate": 7.529896907216494e-05, "loss": 2.1771, "step": 2180 }, { "epoch": 2.0018281535648996, "grad_norm": 13.76378059387207, "learning_rate": 7.51615120274914e-05, "loss": 2.8785, "step": 2190 }, { "epoch": 2.010968921389397, "grad_norm": 40.454368591308594, "learning_rate": 7.502405498281786e-05, "loss": 3.4294, "step": 2200 }, { "epoch": 2.020109689213894, "grad_norm": 2.7516205310821533, "learning_rate": 7.488659793814434e-05, "loss": 2.0466, "step": 2210 }, { "epoch": 2.0292504570383914, "grad_norm": 16.869720458984375, "learning_rate": 7.47491408934708e-05, "loss": 2.6307, "step": 2220 }, { "epoch": 2.0383912248628886, "grad_norm": 21.431907653808594, "learning_rate": 7.461168384879726e-05, "loss": 3.386, "step": 2230 }, { "epoch": 2.047531992687386, "grad_norm": 15.011380195617676, "learning_rate": 7.447422680412372e-05, "loss": 3.7787, "step": 2240 }, { "epoch": 2.056672760511883, "grad_norm": 5.384696960449219, "learning_rate": 7.433676975945018e-05, "loss": 3.6865, "step": 2250 }, { "epoch": 2.0658135283363803, "grad_norm": 34.95271682739258, "learning_rate": 7.419931271477664e-05, "loss": 3.9777, "step": 2260 }, { "epoch": 2.0749542961608776, "grad_norm": 1.3080244064331055, "learning_rate": 7.40618556701031e-05, "loss": 3.667, "step": 2270 }, { "epoch": 2.084095063985375, "grad_norm": 75.9090805053711, "learning_rate": 7.392439862542956e-05, "loss": 5.1135, "step": 2280 }, { "epoch": 2.093235831809872, "grad_norm": 10.378287315368652, "learning_rate": 7.378694158075602e-05, "loss": 3.4623, "step": 2290 }, { "epoch": 2.1023765996343693, "grad_norm": 8.285221099853516, "learning_rate": 7.364948453608248e-05, "loss": 1.8549, "step": 2300 }, { "epoch": 2.1115173674588665, "grad_norm": 8.914546012878418, "learning_rate": 7.351202749140894e-05, "loss": 2.7094, "step": 2310 }, { "epoch": 2.1206581352833638, "grad_norm": 22.9365291595459, "learning_rate": 7.33745704467354e-05, "loss": 2.3973, "step": 2320 }, { "epoch": 2.129798903107861, "grad_norm": 73.36719512939453, "learning_rate": 7.323711340206186e-05, "loss": 4.5362, "step": 2330 }, { "epoch": 2.1389396709323583, "grad_norm": 24.444332122802734, "learning_rate": 7.309965635738832e-05, "loss": 4.2144, "step": 2340 }, { "epoch": 2.1480804387568555, "grad_norm": 74.73833465576172, "learning_rate": 7.296219931271478e-05, "loss": 6.8727, "step": 2350 
}, { "epoch": 2.1572212065813527, "grad_norm": 23.257946014404297, "learning_rate": 7.282474226804124e-05, "loss": 2.1952, "step": 2360 }, { "epoch": 2.16636197440585, "grad_norm": 6.300850868225098, "learning_rate": 7.26872852233677e-05, "loss": 2.6599, "step": 2370 }, { "epoch": 2.1755027422303472, "grad_norm": 4.111196994781494, "learning_rate": 7.254982817869416e-05, "loss": 3.4409, "step": 2380 }, { "epoch": 2.1846435100548445, "grad_norm": 16.30699920654297, "learning_rate": 7.241237113402062e-05, "loss": 2.7772, "step": 2390 }, { "epoch": 2.1937842778793417, "grad_norm": 19.61358070373535, "learning_rate": 7.227491408934708e-05, "loss": 1.6474, "step": 2400 }, { "epoch": 2.202925045703839, "grad_norm": 14.636984825134277, "learning_rate": 7.213745704467354e-05, "loss": 3.5608, "step": 2410 }, { "epoch": 2.212065813528336, "grad_norm": 55.98421096801758, "learning_rate": 7.2e-05, "loss": 4.4748, "step": 2420 }, { "epoch": 2.2212065813528334, "grad_norm": 4.317901611328125, "learning_rate": 7.186254295532646e-05, "loss": 2.4601, "step": 2430 }, { "epoch": 2.2303473491773307, "grad_norm": 55.478309631347656, "learning_rate": 7.172508591065292e-05, "loss": 3.7106, "step": 2440 }, { "epoch": 2.2394881170018284, "grad_norm": 28.277690887451172, "learning_rate": 7.158762886597938e-05, "loss": 2.8326, "step": 2450 }, { "epoch": 2.2486288848263256, "grad_norm": 60.390987396240234, "learning_rate": 7.145017182130584e-05, "loss": 7.7422, "step": 2460 }, { "epoch": 2.257769652650823, "grad_norm": 4.666689872741699, "learning_rate": 7.13127147766323e-05, "loss": 1.6734, "step": 2470 }, { "epoch": 2.26691042047532, "grad_norm": 22.921472549438477, "learning_rate": 7.117525773195876e-05, "loss": 4.4896, "step": 2480 }, { "epoch": 2.2760511882998173, "grad_norm": 23.485576629638672, "learning_rate": 7.103780068728522e-05, "loss": 4.6333, "step": 2490 }, { "epoch": 2.2851919561243146, "grad_norm": 67.08316040039062, "learning_rate": 7.090034364261168e-05, "loss": 3.1863, "step": 2500 }, { "epoch": 2.294332723948812, "grad_norm": 5.972626209259033, "learning_rate": 7.076288659793814e-05, "loss": 3.3179, "step": 2510 }, { "epoch": 2.303473491773309, "grad_norm": 25.892724990844727, "learning_rate": 7.062542955326462e-05, "loss": 3.1055, "step": 2520 }, { "epoch": 2.3126142595978063, "grad_norm": 23.75788116455078, "learning_rate": 7.048797250859108e-05, "loss": 2.6144, "step": 2530 }, { "epoch": 2.3217550274223036, "grad_norm": 5.538463115692139, "learning_rate": 7.035051546391754e-05, "loss": 2.6044, "step": 2540 }, { "epoch": 2.330895795246801, "grad_norm": 7.502548694610596, "learning_rate": 7.0213058419244e-05, "loss": 3.8262, "step": 2550 }, { "epoch": 2.340036563071298, "grad_norm": 3.2050673961639404, "learning_rate": 7.007560137457046e-05, "loss": 2.1746, "step": 2560 }, { "epoch": 2.3491773308957953, "grad_norm": 6.752432346343994, "learning_rate": 6.993814432989692e-05, "loss": 4.0177, "step": 2570 }, { "epoch": 2.3583180987202925, "grad_norm": 4.2431254386901855, "learning_rate": 6.980068728522338e-05, "loss": 1.5508, "step": 2580 }, { "epoch": 2.3674588665447898, "grad_norm": 39.168800354003906, "learning_rate": 6.966323024054982e-05, "loss": 4.2122, "step": 2590 }, { "epoch": 2.376599634369287, "grad_norm": 13.839384078979492, "learning_rate": 6.952577319587628e-05, "loss": 4.2187, "step": 2600 }, { "epoch": 2.3857404021937842, "grad_norm": 45.029415130615234, "learning_rate": 6.938831615120274e-05, "loss": 5.6072, "step": 2610 }, { "epoch": 2.3948811700182815, "grad_norm": 
50.613033294677734, "learning_rate": 6.92508591065292e-05, "loss": 2.9785, "step": 2620 }, { "epoch": 2.4040219378427787, "grad_norm": 33.60123062133789, "learning_rate": 6.911340206185567e-05, "loss": 3.5613, "step": 2630 }, { "epoch": 2.413162705667276, "grad_norm": 9.174614906311035, "learning_rate": 6.897594501718213e-05, "loss": 2.8326, "step": 2640 }, { "epoch": 2.422303473491773, "grad_norm": 6.342416763305664, "learning_rate": 6.883848797250859e-05, "loss": 5.5385, "step": 2650 }, { "epoch": 2.4314442413162705, "grad_norm": 25.321182250976562, "learning_rate": 6.870103092783506e-05, "loss": 3.4805, "step": 2660 }, { "epoch": 2.4405850091407677, "grad_norm": 6.249077796936035, "learning_rate": 6.856357388316152e-05, "loss": 3.3259, "step": 2670 }, { "epoch": 2.449725776965265, "grad_norm": 33.98255920410156, "learning_rate": 6.842611683848798e-05, "loss": 3.7863, "step": 2680 }, { "epoch": 2.458866544789762, "grad_norm": 16.39696502685547, "learning_rate": 6.828865979381444e-05, "loss": 3.5162, "step": 2690 }, { "epoch": 2.4680073126142594, "grad_norm": 15.858969688415527, "learning_rate": 6.81512027491409e-05, "loss": 2.9297, "step": 2700 }, { "epoch": 2.4771480804387567, "grad_norm": 24.053613662719727, "learning_rate": 6.801374570446736e-05, "loss": 4.5092, "step": 2710 }, { "epoch": 2.4862888482632544, "grad_norm": 6.913681983947754, "learning_rate": 6.787628865979382e-05, "loss": 4.0109, "step": 2720 }, { "epoch": 2.495429616087751, "grad_norm": 4.633556842803955, "learning_rate": 6.773883161512028e-05, "loss": 2.3454, "step": 2730 }, { "epoch": 2.504570383912249, "grad_norm": 4.404022693634033, "learning_rate": 6.760137457044674e-05, "loss": 2.6667, "step": 2740 }, { "epoch": 2.5137111517367456, "grad_norm": 2.593783378601074, "learning_rate": 6.74639175257732e-05, "loss": 2.7669, "step": 2750 }, { "epoch": 2.5228519195612433, "grad_norm": 6.139989852905273, "learning_rate": 6.732646048109966e-05, "loss": 3.1637, "step": 2760 }, { "epoch": 2.53199268738574, "grad_norm": 35.844322204589844, "learning_rate": 6.718900343642612e-05, "loss": 2.919, "step": 2770 }, { "epoch": 2.541133455210238, "grad_norm": 9.154361724853516, "learning_rate": 6.705154639175258e-05, "loss": 4.3191, "step": 2780 }, { "epoch": 2.550274223034735, "grad_norm": 45.13698196411133, "learning_rate": 6.691408934707904e-05, "loss": 5.3821, "step": 2790 }, { "epoch": 2.5594149908592323, "grad_norm": 1.6518701314926147, "learning_rate": 6.67766323024055e-05, "loss": 3.5547, "step": 2800 }, { "epoch": 2.5685557586837295, "grad_norm": 21.438737869262695, "learning_rate": 6.663917525773196e-05, "loss": 3.7097, "step": 2810 }, { "epoch": 2.577696526508227, "grad_norm": 10.967202186584473, "learning_rate": 6.650171821305842e-05, "loss": 2.5573, "step": 2820 }, { "epoch": 2.586837294332724, "grad_norm": 9.448858261108398, "learning_rate": 6.636426116838488e-05, "loss": 4.3857, "step": 2830 }, { "epoch": 2.5959780621572213, "grad_norm": 24.12337875366211, "learning_rate": 6.622680412371134e-05, "loss": 4.7079, "step": 2840 }, { "epoch": 2.6051188299817185, "grad_norm": 19.77309226989746, "learning_rate": 6.60893470790378e-05, "loss": 3.0497, "step": 2850 }, { "epoch": 2.6142595978062158, "grad_norm": 25.181673049926758, "learning_rate": 6.595189003436426e-05, "loss": 3.7283, "step": 2860 }, { "epoch": 2.623400365630713, "grad_norm": 3.535970687866211, "learning_rate": 6.581443298969072e-05, "loss": 3.4058, "step": 2870 }, { "epoch": 2.6325411334552102, "grad_norm": 13.15378189086914, "learning_rate": 
6.567697594501718e-05, "loss": 2.5594, "step": 2880 }, { "epoch": 2.6416819012797075, "grad_norm": 29.685325622558594, "learning_rate": 6.553951890034364e-05, "loss": 3.0905, "step": 2890 }, { "epoch": 2.6508226691042047, "grad_norm": 60.87052917480469, "learning_rate": 6.54020618556701e-05, "loss": 4.1474, "step": 2900 }, { "epoch": 2.659963436928702, "grad_norm": 21.232072830200195, "learning_rate": 6.526460481099656e-05, "loss": 2.8324, "step": 2910 }, { "epoch": 2.669104204753199, "grad_norm": 2.662925958633423, "learning_rate": 6.512714776632302e-05, "loss": 3.4585, "step": 2920 }, { "epoch": 2.6782449725776964, "grad_norm": 29.561481475830078, "learning_rate": 6.498969072164948e-05, "loss": 3.7242, "step": 2930 }, { "epoch": 2.6873857404021937, "grad_norm": 11.558247566223145, "learning_rate": 6.485223367697594e-05, "loss": 3.001, "step": 2940 }, { "epoch": 2.696526508226691, "grad_norm": 24.327394485473633, "learning_rate": 6.47147766323024e-05, "loss": 1.1454, "step": 2950 }, { "epoch": 2.705667276051188, "grad_norm": 26.96755027770996, "learning_rate": 6.457731958762886e-05, "loss": 4.0081, "step": 2960 }, { "epoch": 2.7148080438756854, "grad_norm": 8.340194702148438, "learning_rate": 6.443986254295534e-05, "loss": 2.1115, "step": 2970 }, { "epoch": 2.7239488117001827, "grad_norm": 26.23448944091797, "learning_rate": 6.43024054982818e-05, "loss": 4.0345, "step": 2980 }, { "epoch": 2.7330895795246803, "grad_norm": 42.25297927856445, "learning_rate": 6.416494845360826e-05, "loss": 3.1889, "step": 2990 }, { "epoch": 2.742230347349177, "grad_norm": 24.40770721435547, "learning_rate": 6.402749140893472e-05, "loss": 1.5217, "step": 3000 }, { "epoch": 2.751371115173675, "grad_norm": 11.226517677307129, "learning_rate": 6.389003436426118e-05, "loss": 1.5382, "step": 3010 }, { "epoch": 2.7605118829981716, "grad_norm": 42.00068283081055, "learning_rate": 6.375257731958764e-05, "loss": 3.674, "step": 3020 }, { "epoch": 2.7696526508226693, "grad_norm": 14.041068077087402, "learning_rate": 6.36151202749141e-05, "loss": 3.0446, "step": 3030 }, { "epoch": 2.778793418647166, "grad_norm": 34.71745300292969, "learning_rate": 6.347766323024056e-05, "loss": 2.3853, "step": 3040 }, { "epoch": 2.787934186471664, "grad_norm": 8.145952224731445, "learning_rate": 6.3340206185567e-05, "loss": 2.3362, "step": 3050 }, { "epoch": 2.797074954296161, "grad_norm": 11.472765922546387, "learning_rate": 6.320274914089347e-05, "loss": 3.4714, "step": 3060 }, { "epoch": 2.8062157221206583, "grad_norm": 43.87724685668945, "learning_rate": 6.306529209621993e-05, "loss": 6.6035, "step": 3070 }, { "epoch": 2.8153564899451555, "grad_norm": 21.57341766357422, "learning_rate": 6.292783505154639e-05, "loss": 3.2473, "step": 3080 }, { "epoch": 2.8244972577696528, "grad_norm": 2.9774398803710938, "learning_rate": 6.279037800687285e-05, "loss": 6.4152, "step": 3090 }, { "epoch": 2.83363802559415, "grad_norm": 55.68925857543945, "learning_rate": 6.26529209621993e-05, "loss": 2.7846, "step": 3100 }, { "epoch": 2.8427787934186473, "grad_norm": 19.855960845947266, "learning_rate": 6.251546391752578e-05, "loss": 2.6609, "step": 3110 }, { "epoch": 2.8519195612431445, "grad_norm": 21.656904220581055, "learning_rate": 6.237800687285224e-05, "loss": 4.2597, "step": 3120 }, { "epoch": 2.8610603290676417, "grad_norm": 20.271787643432617, "learning_rate": 6.22405498281787e-05, "loss": 4.6657, "step": 3130 }, { "epoch": 2.870201096892139, "grad_norm": 39.23398971557617, "learning_rate": 6.210309278350516e-05, "loss": 4.0414, "step": 3140 }, 
{ "epoch": 2.8793418647166362, "grad_norm": 37.48088836669922, "learning_rate": 6.196563573883162e-05, "loss": 2.6495, "step": 3150 }, { "epoch": 2.8884826325411335, "grad_norm": 2.584395408630371, "learning_rate": 6.182817869415808e-05, "loss": 3.6445, "step": 3160 }, { "epoch": 2.8976234003656307, "grad_norm": 1.601783275604248, "learning_rate": 6.169072164948454e-05, "loss": 3.1495, "step": 3170 }, { "epoch": 2.906764168190128, "grad_norm": 24.405302047729492, "learning_rate": 6.1553264604811e-05, "loss": 1.7035, "step": 3180 }, { "epoch": 2.915904936014625, "grad_norm": 7.433406829833984, "learning_rate": 6.141580756013746e-05, "loss": 2.6272, "step": 3190 }, { "epoch": 2.9250457038391224, "grad_norm": 43.664215087890625, "learning_rate": 6.127835051546392e-05, "loss": 2.511, "step": 3200 }, { "epoch": 2.9341864716636197, "grad_norm": 24.487712860107422, "learning_rate": 6.114089347079038e-05, "loss": 3.7379, "step": 3210 }, { "epoch": 2.943327239488117, "grad_norm": 12.161733627319336, "learning_rate": 6.100343642611684e-05, "loss": 4.0161, "step": 3220 }, { "epoch": 2.952468007312614, "grad_norm": 45.96884536743164, "learning_rate": 6.08659793814433e-05, "loss": 3.7486, "step": 3230 }, { "epoch": 2.9616087751371114, "grad_norm": 39.76556396484375, "learning_rate": 6.072852233676977e-05, "loss": 2.87, "step": 3240 }, { "epoch": 2.9707495429616086, "grad_norm": 11.144150733947754, "learning_rate": 6.059106529209623e-05, "loss": 2.2462, "step": 3250 }, { "epoch": 2.979890310786106, "grad_norm": 20.62501335144043, "learning_rate": 6.045360824742269e-05, "loss": 1.9589, "step": 3260 }, { "epoch": 2.989031078610603, "grad_norm": 14.595650672912598, "learning_rate": 6.031615120274915e-05, "loss": 3.0963, "step": 3270 }, { "epoch": 2.998171846435101, "grad_norm": 5.07867956161499, "learning_rate": 6.0178694158075597e-05, "loss": 1.5925, "step": 3280 }, { "epoch": 3.0073126142595976, "grad_norm": 8.165665626525879, "learning_rate": 6.0041237113402063e-05, "loss": 2.0654, "step": 3290 }, { "epoch": 3.016453382084095, "grad_norm": 38.654449462890625, "learning_rate": 5.9903780068728524e-05, "loss": 3.0234, "step": 3300 }, { "epoch": 3.025594149908592, "grad_norm": 11.510053634643555, "learning_rate": 5.9766323024054984e-05, "loss": 3.0066, "step": 3310 }, { "epoch": 3.03473491773309, "grad_norm": 37.88251495361328, "learning_rate": 5.9628865979381444e-05, "loss": 3.2684, "step": 3320 }, { "epoch": 3.043875685557587, "grad_norm": 18.768789291381836, "learning_rate": 5.9491408934707904e-05, "loss": 2.8714, "step": 3330 }, { "epoch": 3.0530164533820843, "grad_norm": 29.653430938720703, "learning_rate": 5.9353951890034364e-05, "loss": 3.5212, "step": 3340 }, { "epoch": 3.0621572212065815, "grad_norm": 1.0434975624084473, "learning_rate": 5.9216494845360824e-05, "loss": 1.4941, "step": 3350 }, { "epoch": 3.0712979890310788, "grad_norm": 24.58734703063965, "learning_rate": 5.9079037800687285e-05, "loss": 2.0394, "step": 3360 }, { "epoch": 3.080438756855576, "grad_norm": 14.098194122314453, "learning_rate": 5.8941580756013745e-05, "loss": 3.1014, "step": 3370 }, { "epoch": 3.0895795246800732, "grad_norm": 40.32368850708008, "learning_rate": 5.880412371134021e-05, "loss": 1.8304, "step": 3380 }, { "epoch": 3.0987202925045705, "grad_norm": 34.59912872314453, "learning_rate": 5.866666666666667e-05, "loss": 2.0508, "step": 3390 }, { "epoch": 3.1078610603290677, "grad_norm": 10.865257263183594, "learning_rate": 5.852920962199313e-05, "loss": 1.9609, "step": 3400 }, { "epoch": 3.117001828153565, 
"grad_norm": 15.05291748046875, "learning_rate": 5.839175257731959e-05, "loss": 2.226, "step": 3410 }, { "epoch": 3.126142595978062, "grad_norm": 36.58625411987305, "learning_rate": 5.825429553264605e-05, "loss": 1.7122, "step": 3420 }, { "epoch": 3.1352833638025595, "grad_norm": 10.816834449768066, "learning_rate": 5.811683848797251e-05, "loss": 3.0545, "step": 3430 }, { "epoch": 3.1444241316270567, "grad_norm": 1.6258448362350464, "learning_rate": 5.797938144329897e-05, "loss": 3.2232, "step": 3440 }, { "epoch": 3.153564899451554, "grad_norm": 7.278636932373047, "learning_rate": 5.784192439862543e-05, "loss": 3.6959, "step": 3450 }, { "epoch": 3.162705667276051, "grad_norm": 16.253158569335938, "learning_rate": 5.77044673539519e-05, "loss": 3.5702, "step": 3460 }, { "epoch": 3.1718464351005484, "grad_norm": 3.6805758476257324, "learning_rate": 5.756701030927836e-05, "loss": 2.7302, "step": 3470 }, { "epoch": 3.1809872029250457, "grad_norm": 26.246538162231445, "learning_rate": 5.742955326460482e-05, "loss": 3.0514, "step": 3480 }, { "epoch": 3.190127970749543, "grad_norm": 51.66022872924805, "learning_rate": 5.729209621993128e-05, "loss": 3.538, "step": 3490 }, { "epoch": 3.19926873857404, "grad_norm": 5.166927814483643, "learning_rate": 5.715463917525773e-05, "loss": 1.9841, "step": 3500 }, { "epoch": 3.2084095063985374, "grad_norm": 11.577645301818848, "learning_rate": 5.701718213058419e-05, "loss": 1.9164, "step": 3510 }, { "epoch": 3.2175502742230346, "grad_norm": 15.531632423400879, "learning_rate": 5.6879725085910654e-05, "loss": 3.0735, "step": 3520 }, { "epoch": 3.226691042047532, "grad_norm": 6.023890018463135, "learning_rate": 5.6742268041237114e-05, "loss": 3.2461, "step": 3530 }, { "epoch": 3.235831809872029, "grad_norm": 31.277076721191406, "learning_rate": 5.6604810996563574e-05, "loss": 4.4504, "step": 3540 }, { "epoch": 3.2449725776965264, "grad_norm": 22.392810821533203, "learning_rate": 5.6467353951890035e-05, "loss": 2.6456, "step": 3550 }, { "epoch": 3.2541133455210236, "grad_norm": 1.9706804752349854, "learning_rate": 5.6329896907216495e-05, "loss": 1.6753, "step": 3560 }, { "epoch": 3.263254113345521, "grad_norm": 33.714908599853516, "learning_rate": 5.6192439862542955e-05, "loss": 5.1575, "step": 3570 }, { "epoch": 3.272394881170018, "grad_norm": 13.837388038635254, "learning_rate": 5.6054982817869415e-05, "loss": 3.0931, "step": 3580 }, { "epoch": 3.2815356489945158, "grad_norm": 63.96002197265625, "learning_rate": 5.5917525773195875e-05, "loss": 3.9908, "step": 3590 }, { "epoch": 3.2906764168190126, "grad_norm": 35.695343017578125, "learning_rate": 5.578006872852234e-05, "loss": 5.7375, "step": 3600 }, { "epoch": 3.2998171846435103, "grad_norm": 9.524042129516602, "learning_rate": 5.56426116838488e-05, "loss": 3.211, "step": 3610 }, { "epoch": 3.3089579524680075, "grad_norm": 12.215673446655273, "learning_rate": 5.550515463917526e-05, "loss": 3.764, "step": 3620 }, { "epoch": 3.3180987202925047, "grad_norm": 27.301197052001953, "learning_rate": 5.536769759450172e-05, "loss": 3.0869, "step": 3630 }, { "epoch": 3.327239488117002, "grad_norm": 3.850611686706543, "learning_rate": 5.523024054982818e-05, "loss": 2.3149, "step": 3640 }, { "epoch": 3.3363802559414992, "grad_norm": 10.277817726135254, "learning_rate": 5.509278350515464e-05, "loss": 2.5227, "step": 3650 }, { "epoch": 3.3455210237659965, "grad_norm": 30.70311164855957, "learning_rate": 5.49553264604811e-05, "loss": 3.2279, "step": 3660 }, { "epoch": 3.3546617915904937, "grad_norm": 8.507287979125977, 
"learning_rate": 5.481786941580756e-05, "loss": 2.1448, "step": 3670 }, { "epoch": 3.363802559414991, "grad_norm": 14.576974868774414, "learning_rate": 5.4680412371134024e-05, "loss": 2.5505, "step": 3680 }, { "epoch": 3.372943327239488, "grad_norm": 28.560914993286133, "learning_rate": 5.454295532646049e-05, "loss": 3.3888, "step": 3690 }, { "epoch": 3.3820840950639854, "grad_norm": 23.68754768371582, "learning_rate": 5.440549828178695e-05, "loss": 3.0566, "step": 3700 }, { "epoch": 3.3912248628884827, "grad_norm": 7.420950412750244, "learning_rate": 5.426804123711341e-05, "loss": 3.1078, "step": 3710 }, { "epoch": 3.40036563071298, "grad_norm": 11.084020614624023, "learning_rate": 5.413058419243987e-05, "loss": 2.0873, "step": 3720 }, { "epoch": 3.409506398537477, "grad_norm": 12.480317115783691, "learning_rate": 5.399312714776632e-05, "loss": 1.9018, "step": 3730 }, { "epoch": 3.4186471663619744, "grad_norm": 18.72062110900879, "learning_rate": 5.3855670103092785e-05, "loss": 2.9614, "step": 3740 }, { "epoch": 3.4277879341864717, "grad_norm": 14.07093620300293, "learning_rate": 5.3718213058419245e-05, "loss": 1.4104, "step": 3750 }, { "epoch": 3.436928702010969, "grad_norm": 39.07556915283203, "learning_rate": 5.3580756013745705e-05, "loss": 2.6931, "step": 3760 }, { "epoch": 3.446069469835466, "grad_norm": 12.356401443481445, "learning_rate": 5.3443298969072165e-05, "loss": 1.0451, "step": 3770 }, { "epoch": 3.4552102376599634, "grad_norm": 3.6911463737487793, "learning_rate": 5.3305841924398625e-05, "loss": 1.6828, "step": 3780 }, { "epoch": 3.4643510054844606, "grad_norm": 34.00398254394531, "learning_rate": 5.3168384879725085e-05, "loss": 2.0529, "step": 3790 }, { "epoch": 3.473491773308958, "grad_norm": 8.145764350891113, "learning_rate": 5.3030927835051546e-05, "loss": 1.5271, "step": 3800 }, { "epoch": 3.482632541133455, "grad_norm": 47.7993049621582, "learning_rate": 5.2893470790378006e-05, "loss": 2.4896, "step": 3810 }, { "epoch": 3.4917733089579523, "grad_norm": 25.864011764526367, "learning_rate": 5.2756013745704466e-05, "loss": 3.5647, "step": 3820 }, { "epoch": 3.5009140767824496, "grad_norm": 13.714553833007812, "learning_rate": 5.261855670103093e-05, "loss": 1.8352, "step": 3830 }, { "epoch": 3.510054844606947, "grad_norm": 26.42018699645996, "learning_rate": 5.248109965635739e-05, "loss": 3.2156, "step": 3840 }, { "epoch": 3.519195612431444, "grad_norm": 13.820340156555176, "learning_rate": 5.234364261168385e-05, "loss": 2.1797, "step": 3850 }, { "epoch": 3.5283363802559418, "grad_norm": 34.22669219970703, "learning_rate": 5.220618556701031e-05, "loss": 3.8938, "step": 3860 }, { "epoch": 3.5374771480804386, "grad_norm": 5.3498735427856445, "learning_rate": 5.2068728522336773e-05, "loss": 3.9223, "step": 3870 }, { "epoch": 3.5466179159049362, "grad_norm": 10.008206367492676, "learning_rate": 5.1931271477663234e-05, "loss": 3.342, "step": 3880 }, { "epoch": 3.555758683729433, "grad_norm": 57.29923629760742, "learning_rate": 5.1793814432989694e-05, "loss": 4.0716, "step": 3890 }, { "epoch": 3.5648994515539307, "grad_norm": 32.30663299560547, "learning_rate": 5.1656357388316154e-05, "loss": 2.5384, "step": 3900 }, { "epoch": 3.5740402193784275, "grad_norm": 41.93818283081055, "learning_rate": 5.151890034364262e-05, "loss": 2.8209, "step": 3910 }, { "epoch": 3.583180987202925, "grad_norm": 31.53754997253418, "learning_rate": 5.138144329896908e-05, "loss": 1.9222, "step": 3920 }, { "epoch": 3.5923217550274225, "grad_norm": 13.321298599243164, "learning_rate": 
5.124398625429554e-05, "loss": 2.8538, "step": 3930 }, { "epoch": 3.6014625228519197, "grad_norm": 14.536643981933594, "learning_rate": 5.1106529209622e-05, "loss": 2.5409, "step": 3940 }, { "epoch": 3.610603290676417, "grad_norm": 2.4003069400787354, "learning_rate": 5.096907216494846e-05, "loss": 2.3861, "step": 3950 }, { "epoch": 3.619744058500914, "grad_norm": 24.93983268737793, "learning_rate": 5.083161512027491e-05, "loss": 2.8332, "step": 3960 }, { "epoch": 3.6288848263254114, "grad_norm": 25.520532608032227, "learning_rate": 5.0694158075601375e-05, "loss": 2.5675, "step": 3970 }, { "epoch": 3.6380255941499087, "grad_norm": 40.56070327758789, "learning_rate": 5.0556701030927835e-05, "loss": 2.5476, "step": 3980 }, { "epoch": 3.647166361974406, "grad_norm": 66.18084716796875, "learning_rate": 5.0419243986254295e-05, "loss": 4.0243, "step": 3990 }, { "epoch": 3.656307129798903, "grad_norm": 3.627523899078369, "learning_rate": 5.0281786941580756e-05, "loss": 2.8749, "step": 4000 }, { "epoch": 3.6654478976234004, "grad_norm": 16.89777946472168, "learning_rate": 5.0144329896907216e-05, "loss": 1.5864, "step": 4010 }, { "epoch": 3.6745886654478976, "grad_norm": 4.770555019378662, "learning_rate": 5.0006872852233676e-05, "loss": 2.3256, "step": 4020 }, { "epoch": 3.683729433272395, "grad_norm": 40.325416564941406, "learning_rate": 4.9869415807560136e-05, "loss": 4.9124, "step": 4030 }, { "epoch": 3.692870201096892, "grad_norm": 35.87617492675781, "learning_rate": 4.9731958762886596e-05, "loss": 3.26, "step": 4040 }, { "epoch": 3.7020109689213894, "grad_norm": 9.53622055053711, "learning_rate": 4.959450171821306e-05, "loss": 2.1098, "step": 4050 }, { "epoch": 3.7111517367458866, "grad_norm": 9.944204330444336, "learning_rate": 4.9457044673539523e-05, "loss": 2.2796, "step": 4060 }, { "epoch": 3.720292504570384, "grad_norm": 68.9666519165039, "learning_rate": 4.9319587628865984e-05, "loss": 3.669, "step": 4070 }, { "epoch": 3.729433272394881, "grad_norm": 44.379486083984375, "learning_rate": 4.9182130584192444e-05, "loss": 4.0846, "step": 4080 }, { "epoch": 3.7385740402193783, "grad_norm": 2.863612651824951, "learning_rate": 4.9044673539518904e-05, "loss": 1.7643, "step": 4090 }, { "epoch": 3.7477148080438756, "grad_norm": 7.91416072845459, "learning_rate": 4.8907216494845364e-05, "loss": 1.8332, "step": 4100 }, { "epoch": 3.756855575868373, "grad_norm": 18.587059020996094, "learning_rate": 4.8769759450171824e-05, "loss": 2.2393, "step": 4110 }, { "epoch": 3.76599634369287, "grad_norm": 62.433101654052734, "learning_rate": 4.8632302405498284e-05, "loss": 4.2538, "step": 4120 }, { "epoch": 3.7751371115173673, "grad_norm": 17.794198989868164, "learning_rate": 4.8494845360824745e-05, "loss": 2.9599, "step": 4130 }, { "epoch": 3.7842778793418645, "grad_norm": 8.640511512756348, "learning_rate": 4.8357388316151205e-05, "loss": 1.0206, "step": 4140 }, { "epoch": 3.7934186471663622, "grad_norm": 27.117431640625, "learning_rate": 4.8219931271477665e-05, "loss": 2.2921, "step": 4150 }, { "epoch": 3.802559414990859, "grad_norm": 3.005143404006958, "learning_rate": 4.8082474226804125e-05, "loss": 4.388, "step": 4160 }, { "epoch": 3.8117001828153567, "grad_norm": 4.682939052581787, "learning_rate": 4.7945017182130585e-05, "loss": 1.6005, "step": 4170 }, { "epoch": 3.8208409506398535, "grad_norm": 6.531580924987793, "learning_rate": 4.7807560137457045e-05, "loss": 3.0279, "step": 4180 }, { "epoch": 3.829981718464351, "grad_norm": 4.0620903968811035, "learning_rate": 4.7670103092783506e-05, "loss": 
1.7796, "step": 4190 }, { "epoch": 3.839122486288848, "grad_norm": 36.2795295715332, "learning_rate": 4.7532646048109966e-05, "loss": 2.904, "step": 4200 }, { "epoch": 3.8482632541133457, "grad_norm": 17.953227996826172, "learning_rate": 4.739518900343643e-05, "loss": 1.8322, "step": 4210 }, { "epoch": 3.857404021937843, "grad_norm": 24.8377742767334, "learning_rate": 4.725773195876289e-05, "loss": 2.5231, "step": 4220 }, { "epoch": 3.86654478976234, "grad_norm": 2.082951068878174, "learning_rate": 4.712027491408935e-05, "loss": 1.6397, "step": 4230 }, { "epoch": 3.8756855575868374, "grad_norm": 3.8803048133850098, "learning_rate": 4.6982817869415806e-05, "loss": 1.4773, "step": 4240 }, { "epoch": 3.8848263254113347, "grad_norm": 10.262565612792969, "learning_rate": 4.684536082474227e-05, "loss": 3.4798, "step": 4250 }, { "epoch": 3.893967093235832, "grad_norm": 13.233234405517578, "learning_rate": 4.670790378006873e-05, "loss": 2.4207, "step": 4260 }, { "epoch": 3.903107861060329, "grad_norm": 10.368006706237793, "learning_rate": 4.657044673539519e-05, "loss": 2.2587, "step": 4270 }, { "epoch": 3.9122486288848264, "grad_norm": 34.14609909057617, "learning_rate": 4.6432989690721654e-05, "loss": 3.1058, "step": 4280 }, { "epoch": 3.9213893967093236, "grad_norm": 34.448646545410156, "learning_rate": 4.6295532646048114e-05, "loss": 3.0052, "step": 4290 }, { "epoch": 3.930530164533821, "grad_norm": 4.581418991088867, "learning_rate": 4.6158075601374574e-05, "loss": 2.1071, "step": 4300 }, { "epoch": 3.939670932358318, "grad_norm": 7.447966575622559, "learning_rate": 4.6020618556701034e-05, "loss": 2.6064, "step": 4310 }, { "epoch": 3.9488117001828154, "grad_norm": 9.63040542602539, "learning_rate": 4.5883161512027495e-05, "loss": 3.4821, "step": 4320 }, { "epoch": 3.9579524680073126, "grad_norm": 4.5407867431640625, "learning_rate": 4.5745704467353955e-05, "loss": 1.7947, "step": 4330 }, { "epoch": 3.96709323583181, "grad_norm": 18.322980880737305, "learning_rate": 4.5608247422680415e-05, "loss": 3.3161, "step": 4340 }, { "epoch": 3.976234003656307, "grad_norm": 5.55999231338501, "learning_rate": 4.5470790378006875e-05, "loss": 3.6557, "step": 4350 }, { "epoch": 3.9853747714808043, "grad_norm": 11.15638542175293, "learning_rate": 4.5333333333333335e-05, "loss": 2.2201, "step": 4360 }, { "epoch": 3.9945155393053016, "grad_norm": 17.33359718322754, "learning_rate": 4.5195876288659795e-05, "loss": 2.6641, "step": 4370 }, { "epoch": 4.003656307129799, "grad_norm": 5.207797527313232, "learning_rate": 4.5058419243986256e-05, "loss": 3.4421, "step": 4380 }, { "epoch": 4.012797074954296, "grad_norm": 39.71084213256836, "learning_rate": 4.4920962199312716e-05, "loss": 2.4747, "step": 4390 }, { "epoch": 4.021937842778794, "grad_norm": 2.674152374267578, "learning_rate": 4.4783505154639176e-05, "loss": 2.3405, "step": 4400 }, { "epoch": 4.0310786106032905, "grad_norm": 5.832704544067383, "learning_rate": 4.4646048109965636e-05, "loss": 2.0246, "step": 4410 }, { "epoch": 4.040219378427788, "grad_norm": 16.360095977783203, "learning_rate": 4.4508591065292096e-05, "loss": 2.4537, "step": 4420 }, { "epoch": 4.049360146252285, "grad_norm": 7.281402111053467, "learning_rate": 4.437113402061856e-05, "loss": 2.6443, "step": 4430 }, { "epoch": 4.058500914076783, "grad_norm": 4.582732677459717, "learning_rate": 4.423367697594502e-05, "loss": 2.9415, "step": 4440 }, { "epoch": 4.0676416819012795, "grad_norm": 29.319988250732422, "learning_rate": 4.4096219931271484e-05, "loss": 3.7679, "step": 4450 }, { "epoch": 
4.076782449725777, "grad_norm": 14.167167663574219, "learning_rate": 4.3958762886597944e-05, "loss": 3.3404, "step": 4460 }, { "epoch": 4.085923217550274, "grad_norm": 6.3118743896484375, "learning_rate": 4.38213058419244e-05, "loss": 3.2941, "step": 4470 }, { "epoch": 4.095063985374772, "grad_norm": 24.48611068725586, "learning_rate": 4.368384879725086e-05, "loss": 2.0331, "step": 4480 }, { "epoch": 4.1042047531992685, "grad_norm": 48.02914047241211, "learning_rate": 4.354639175257732e-05, "loss": 1.9154, "step": 4490 }, { "epoch": 4.113345521023766, "grad_norm": 3.718224287033081, "learning_rate": 4.3408934707903784e-05, "loss": 2.1786, "step": 4500 }, { "epoch": 4.122486288848263, "grad_norm": 5.512245178222656, "learning_rate": 4.3271477663230245e-05, "loss": 3.2471, "step": 4510 }, { "epoch": 4.131627056672761, "grad_norm": 2.0834078788757324, "learning_rate": 4.3134020618556705e-05, "loss": 3.5136, "step": 4520 }, { "epoch": 4.140767824497257, "grad_norm": 6.958035469055176, "learning_rate": 4.2996563573883165e-05, "loss": 1.9014, "step": 4530 }, { "epoch": 4.149908592321755, "grad_norm": 3.1806819438934326, "learning_rate": 4.2859106529209625e-05, "loss": 2.6987, "step": 4540 }, { "epoch": 4.159049360146252, "grad_norm": 40.1801643371582, "learning_rate": 4.2721649484536085e-05, "loss": 4.0997, "step": 4550 }, { "epoch": 4.16819012797075, "grad_norm": 22.242502212524414, "learning_rate": 4.2584192439862545e-05, "loss": 3.1252, "step": 4560 }, { "epoch": 4.177330895795246, "grad_norm": 40.568275451660156, "learning_rate": 4.2446735395189006e-05, "loss": 1.8563, "step": 4570 }, { "epoch": 4.186471663619744, "grad_norm": 37.774497985839844, "learning_rate": 4.2309278350515466e-05, "loss": 3.1248, "step": 4580 }, { "epoch": 4.195612431444241, "grad_norm": 61.567317962646484, "learning_rate": 4.2171821305841926e-05, "loss": 2.5985, "step": 4590 }, { "epoch": 4.204753199268739, "grad_norm": 7.0761332511901855, "learning_rate": 4.2034364261168386e-05, "loss": 1.7983, "step": 4600 }, { "epoch": 4.213893967093236, "grad_norm": 30.95400047302246, "learning_rate": 4.1896907216494846e-05, "loss": 2.0101, "step": 4610 }, { "epoch": 4.223034734917733, "grad_norm": 2.7616946697235107, "learning_rate": 4.1759450171821306e-05, "loss": 2.627, "step": 4620 }, { "epoch": 4.232175502742231, "grad_norm": 27.2283992767334, "learning_rate": 4.1621993127147767e-05, "loss": 3.5313, "step": 4630 }, { "epoch": 4.2413162705667276, "grad_norm": 19.318359375, "learning_rate": 4.148453608247423e-05, "loss": 2.0158, "step": 4640 }, { "epoch": 4.250457038391225, "grad_norm": 1.8010936975479126, "learning_rate": 4.134707903780069e-05, "loss": 2.2471, "step": 4650 }, { "epoch": 4.259597806215722, "grad_norm": 29.99203109741211, "learning_rate": 4.1209621993127154e-05, "loss": 2.1975, "step": 4660 }, { "epoch": 4.26873857404022, "grad_norm": 23.430566787719727, "learning_rate": 4.1072164948453614e-05, "loss": 2.4257, "step": 4670 }, { "epoch": 4.2778793418647165, "grad_norm": 3.085381507873535, "learning_rate": 4.0934707903780074e-05, "loss": 1.7805, "step": 4680 }, { "epoch": 4.287020109689214, "grad_norm": 15.781076431274414, "learning_rate": 4.0797250859106534e-05, "loss": 3.6163, "step": 4690 }, { "epoch": 4.296160877513711, "grad_norm": 9.538421630859375, "learning_rate": 4.065979381443299e-05, "loss": 1.6564, "step": 4700 }, { "epoch": 4.305301645338209, "grad_norm": 8.264079093933105, "learning_rate": 4.052233676975945e-05, "loss": 1.9412, "step": 4710 }, { "epoch": 4.3144424131627055, "grad_norm": 
22.3348445892334, "learning_rate": 4.038487972508591e-05, "loss": 2.6973, "step": 4720 }, { "epoch": 4.323583180987203, "grad_norm": 3.4497017860412598, "learning_rate": 4.0247422680412375e-05, "loss": 1.9969, "step": 4730 }, { "epoch": 4.3327239488117, "grad_norm": 24.479812622070312, "learning_rate": 4.0109965635738835e-05, "loss": 1.6668, "step": 4740 }, { "epoch": 4.341864716636198, "grad_norm": 9.487496376037598, "learning_rate": 3.9972508591065295e-05, "loss": 1.7239, "step": 4750 }, { "epoch": 4.3510054844606945, "grad_norm": 4.498852729797363, "learning_rate": 3.9835051546391755e-05, "loss": 2.6517, "step": 4760 }, { "epoch": 4.360146252285192, "grad_norm": 4.84251594543457, "learning_rate": 3.9697594501718216e-05, "loss": 1.4224, "step": 4770 }, { "epoch": 4.369287020109689, "grad_norm": 4.723317623138428, "learning_rate": 3.9560137457044676e-05, "loss": 2.7588, "step": 4780 }, { "epoch": 4.378427787934187, "grad_norm": 17.655303955078125, "learning_rate": 3.9422680412371136e-05, "loss": 1.2121, "step": 4790 }, { "epoch": 4.387568555758683, "grad_norm": 14.442437171936035, "learning_rate": 3.9285223367697596e-05, "loss": 2.2341, "step": 4800 }, { "epoch": 4.396709323583181, "grad_norm": 8.936553001403809, "learning_rate": 3.9147766323024056e-05, "loss": 1.6042, "step": 4810 }, { "epoch": 4.405850091407678, "grad_norm": 40.44393539428711, "learning_rate": 3.9010309278350516e-05, "loss": 2.3249, "step": 4820 }, { "epoch": 4.414990859232176, "grad_norm": 3.048468589782715, "learning_rate": 3.887285223367698e-05, "loss": 1.5854, "step": 4830 }, { "epoch": 4.424131627056672, "grad_norm": 42.918243408203125, "learning_rate": 3.873539518900344e-05, "loss": 1.4826, "step": 4840 }, { "epoch": 4.43327239488117, "grad_norm": 6.540756702423096, "learning_rate": 3.85979381443299e-05, "loss": 1.7846, "step": 4850 }, { "epoch": 4.442413162705667, "grad_norm": 9.93269157409668, "learning_rate": 3.846048109965636e-05, "loss": 2.4916, "step": 4860 }, { "epoch": 4.451553930530165, "grad_norm": 33.895362854003906, "learning_rate": 3.832302405498282e-05, "loss": 2.2142, "step": 4870 }, { "epoch": 4.460694698354661, "grad_norm": 15.547473907470703, "learning_rate": 3.8185567010309284e-05, "loss": 1.7403, "step": 4880 }, { "epoch": 4.469835466179159, "grad_norm": 12.812091827392578, "learning_rate": 3.8048109965635744e-05, "loss": 1.8979, "step": 4890 }, { "epoch": 4.478976234003657, "grad_norm": 9.93725299835205, "learning_rate": 3.7910652920962205e-05, "loss": 2.3295, "step": 4900 }, { "epoch": 4.4881170018281535, "grad_norm": 13.730470657348633, "learning_rate": 3.7773195876288665e-05, "loss": 1.595, "step": 4910 }, { "epoch": 4.497257769652651, "grad_norm": 1.367891788482666, "learning_rate": 3.763573883161512e-05, "loss": 1.2266, "step": 4920 }, { "epoch": 4.506398537477148, "grad_norm": 10.396485328674316, "learning_rate": 3.749828178694158e-05, "loss": 1.7501, "step": 4930 }, { "epoch": 4.515539305301646, "grad_norm": 34.7637825012207, "learning_rate": 3.736082474226804e-05, "loss": 2.6249, "step": 4940 }, { "epoch": 4.5246800731261425, "grad_norm": 29.076396942138672, "learning_rate": 3.7223367697594505e-05, "loss": 2.1935, "step": 4950 }, { "epoch": 4.53382084095064, "grad_norm": 6.217894554138184, "learning_rate": 3.7085910652920966e-05, "loss": 2.298, "step": 4960 }, { "epoch": 4.542961608775137, "grad_norm": 54.704471588134766, "learning_rate": 3.6948453608247426e-05, "loss": 3.7784, "step": 4970 }, { "epoch": 4.552102376599635, "grad_norm": 3.752734422683716, "learning_rate": 
3.6810996563573886e-05, "loss": 1.5304, "step": 4980 }, { "epoch": 4.5612431444241315, "grad_norm": 37.39411926269531, "learning_rate": 3.6673539518900346e-05, "loss": 4.5318, "step": 4990 }, { "epoch": 4.570383912248629, "grad_norm": 47.03571319580078, "learning_rate": 3.6536082474226806e-05, "loss": 5.2689, "step": 5000 }, { "epoch": 4.579524680073126, "grad_norm": 7.162147521972656, "learning_rate": 3.6398625429553266e-05, "loss": 1.7271, "step": 5010 }, { "epoch": 4.588665447897624, "grad_norm": 49.802093505859375, "learning_rate": 3.6261168384879727e-05, "loss": 2.2074, "step": 5020 }, { "epoch": 4.5978062157221204, "grad_norm": 4.2565226554870605, "learning_rate": 3.612371134020619e-05, "loss": 2.4533, "step": 5030 }, { "epoch": 4.606946983546618, "grad_norm": 14.877213478088379, "learning_rate": 3.598625429553265e-05, "loss": 2.3329, "step": 5040 }, { "epoch": 4.616087751371115, "grad_norm": 48.06449508666992, "learning_rate": 3.584879725085911e-05, "loss": 2.6687, "step": 5050 }, { "epoch": 4.625228519195613, "grad_norm": 9.72822093963623, "learning_rate": 3.571134020618557e-05, "loss": 1.2115, "step": 5060 }, { "epoch": 4.634369287020109, "grad_norm": 3.450927257537842, "learning_rate": 3.557388316151203e-05, "loss": 2.6099, "step": 5070 }, { "epoch": 4.643510054844607, "grad_norm": 8.899778366088867, "learning_rate": 3.543642611683849e-05, "loss": 2.4046, "step": 5080 }, { "epoch": 4.652650822669104, "grad_norm": 2.5108563899993896, "learning_rate": 3.529896907216495e-05, "loss": 1.2432, "step": 5090 }, { "epoch": 4.661791590493602, "grad_norm": 33.27580642700195, "learning_rate": 3.516151202749141e-05, "loss": 1.4312, "step": 5100 }, { "epoch": 4.670932358318098, "grad_norm": 2.61483097076416, "learning_rate": 3.5024054982817875e-05, "loss": 2.4889, "step": 5110 }, { "epoch": 4.680073126142596, "grad_norm": 6.73372745513916, "learning_rate": 3.4886597938144335e-05, "loss": 1.4196, "step": 5120 }, { "epoch": 4.689213893967093, "grad_norm": 19.291534423828125, "learning_rate": 3.4749140893470795e-05, "loss": 2.946, "step": 5130 }, { "epoch": 4.698354661791591, "grad_norm": 13.349567413330078, "learning_rate": 3.4611683848797255e-05, "loss": 2.1376, "step": 5140 }, { "epoch": 4.707495429616088, "grad_norm": 18.800390243530273, "learning_rate": 3.447422680412371e-05, "loss": 0.985, "step": 5150 }, { "epoch": 4.716636197440585, "grad_norm": 51.60750961303711, "learning_rate": 3.433676975945017e-05, "loss": 4.7212, "step": 5160 }, { "epoch": 4.725776965265082, "grad_norm": 19.25632095336914, "learning_rate": 3.419931271477663e-05, "loss": 2.5584, "step": 5170 }, { "epoch": 4.7349177330895795, "grad_norm": 10.047552108764648, "learning_rate": 3.4061855670103096e-05, "loss": 1.3071, "step": 5180 }, { "epoch": 4.744058500914077, "grad_norm": 18.213903427124023, "learning_rate": 3.3924398625429556e-05, "loss": 2.2691, "step": 5190 }, { "epoch": 4.753199268738574, "grad_norm": 1.1614028215408325, "learning_rate": 3.3786941580756016e-05, "loss": 2.1536, "step": 5200 }, { "epoch": 4.762340036563071, "grad_norm": 2.6994235515594482, "learning_rate": 3.3649484536082477e-05, "loss": 3.1792, "step": 5210 }, { "epoch": 4.7714808043875685, "grad_norm": 7.141355991363525, "learning_rate": 3.351202749140894e-05, "loss": 2.1517, "step": 5220 }, { "epoch": 4.780621572212066, "grad_norm": 16.924476623535156, "learning_rate": 3.33745704467354e-05, "loss": 1.6004, "step": 5230 }, { "epoch": 4.789762340036563, "grad_norm": 30.610586166381836, "learning_rate": 3.323711340206186e-05, "loss": 3.5606, "step": 
5240 }, { "epoch": 4.798903107861061, "grad_norm": 6.229728698730469, "learning_rate": 3.309965635738832e-05, "loss": 1.983, "step": 5250 }, { "epoch": 4.8080438756855575, "grad_norm": 31.79667091369629, "learning_rate": 3.296219931271478e-05, "loss": 2.0817, "step": 5260 }, { "epoch": 4.817184643510055, "grad_norm": 19.74639129638672, "learning_rate": 3.282474226804124e-05, "loss": 1.2709, "step": 5270 }, { "epoch": 4.826325411334552, "grad_norm": 21.854433059692383, "learning_rate": 3.26872852233677e-05, "loss": 1.7519, "step": 5280 }, { "epoch": 4.83546617915905, "grad_norm": 39.07857894897461, "learning_rate": 3.254982817869416e-05, "loss": 1.8227, "step": 5290 }, { "epoch": 4.844606946983546, "grad_norm": 17.42938995361328, "learning_rate": 3.241237113402062e-05, "loss": 1.6379, "step": 5300 }, { "epoch": 4.853747714808044, "grad_norm": 15.212745666503906, "learning_rate": 3.227491408934708e-05, "loss": 1.9336, "step": 5310 }, { "epoch": 4.862888482632541, "grad_norm": 27.362951278686523, "learning_rate": 3.213745704467354e-05, "loss": 2.4916, "step": 5320 }, { "epoch": 4.872029250457039, "grad_norm": 21.680028915405273, "learning_rate": 3.2000000000000005e-05, "loss": 2.6826, "step": 5330 }, { "epoch": 4.881170018281535, "grad_norm": 7.119617938995361, "learning_rate": 3.1862542955326465e-05, "loss": 2.6695, "step": 5340 }, { "epoch": 4.890310786106033, "grad_norm": 9.521164894104004, "learning_rate": 3.1725085910652926e-05, "loss": 1.6305, "step": 5350 }, { "epoch": 4.89945155393053, "grad_norm": 14.29737663269043, "learning_rate": 3.1587628865979386e-05, "loss": 2.2219, "step": 5360 }, { "epoch": 4.908592321755028, "grad_norm": 4.072351455688477, "learning_rate": 3.1450171821305846e-05, "loss": 1.7012, "step": 5370 }, { "epoch": 4.917733089579524, "grad_norm": 2.581660032272339, "learning_rate": 3.13127147766323e-05, "loss": 1.324, "step": 5380 }, { "epoch": 4.926873857404022, "grad_norm": 6.261230945587158, "learning_rate": 3.117525773195876e-05, "loss": 2.2284, "step": 5390 }, { "epoch": 4.936014625228519, "grad_norm": 14.056791305541992, "learning_rate": 3.1037800687285226e-05, "loss": 1.9313, "step": 5400 }, { "epoch": 4.9451553930530165, "grad_norm": 10.7229642868042, "learning_rate": 3.090034364261169e-05, "loss": 1.3747, "step": 5410 }, { "epoch": 4.954296160877513, "grad_norm": 13.983070373535156, "learning_rate": 3.076288659793815e-05, "loss": 1.9437, "step": 5420 }, { "epoch": 4.963436928702011, "grad_norm": 6.611279487609863, "learning_rate": 3.062542955326461e-05, "loss": 1.9377, "step": 5430 }, { "epoch": 4.972577696526509, "grad_norm": 9.25185775756836, "learning_rate": 3.0487972508591067e-05, "loss": 2.5634, "step": 5440 }, { "epoch": 4.9817184643510055, "grad_norm": 16.942989349365234, "learning_rate": 3.0350515463917527e-05, "loss": 1.5676, "step": 5450 }, { "epoch": 4.990859232175502, "grad_norm": 3.1517653465270996, "learning_rate": 3.0213058419243988e-05, "loss": 1.911, "step": 5460 }, { "epoch": 5.0, "grad_norm": 4.196774959564209, "learning_rate": 3.007560137457045e-05, "loss": 1.6157, "step": 5470 }, { "epoch": 5.009140767824498, "grad_norm": 34.476051330566406, "learning_rate": 2.993814432989691e-05, "loss": 2.5793, "step": 5480 }, { "epoch": 5.0182815356489945, "grad_norm": 2.9003474712371826, "learning_rate": 2.9800687285223368e-05, "loss": 1.984, "step": 5490 }, { "epoch": 5.027422303473492, "grad_norm": 4.331759929656982, "learning_rate": 2.9663230240549828e-05, "loss": 1.859, "step": 5500 }, { "epoch": 5.036563071297989, "grad_norm": 14.341668128967285, 
"learning_rate": 2.952577319587629e-05, "loss": 2.1915, "step": 5510 }, { "epoch": 5.045703839122487, "grad_norm": 9.266731262207031, "learning_rate": 2.938831615120275e-05, "loss": 1.7988, "step": 5520 }, { "epoch": 5.0548446069469835, "grad_norm": 9.505987167358398, "learning_rate": 2.925085910652921e-05, "loss": 1.8552, "step": 5530 }, { "epoch": 5.063985374771481, "grad_norm": 16.4996280670166, "learning_rate": 2.9113402061855672e-05, "loss": 1.7587, "step": 5540 }, { "epoch": 5.073126142595978, "grad_norm": 19.993389129638672, "learning_rate": 2.8975945017182132e-05, "loss": 2.5448, "step": 5550 }, { "epoch": 5.082266910420476, "grad_norm": 20.46304702758789, "learning_rate": 2.8838487972508593e-05, "loss": 2.4968, "step": 5560 }, { "epoch": 5.091407678244972, "grad_norm": 23.583526611328125, "learning_rate": 2.8701030927835053e-05, "loss": 2.0548, "step": 5570 }, { "epoch": 5.10054844606947, "grad_norm": 7.041038990020752, "learning_rate": 2.8563573883161516e-05, "loss": 1.2609, "step": 5580 }, { "epoch": 5.109689213893967, "grad_norm": 14.106550216674805, "learning_rate": 2.8426116838487976e-05, "loss": 1.437, "step": 5590 }, { "epoch": 5.118829981718465, "grad_norm": 61.84626007080078, "learning_rate": 2.8288659793814437e-05, "loss": 2.8303, "step": 5600 }, { "epoch": 5.127970749542961, "grad_norm": 6.59751558303833, "learning_rate": 2.8151202749140893e-05, "loss": 2.3817, "step": 5610 }, { "epoch": 5.137111517367459, "grad_norm": 20.792953491210938, "learning_rate": 2.8013745704467354e-05, "loss": 1.9271, "step": 5620 }, { "epoch": 5.146252285191956, "grad_norm": 38.42577362060547, "learning_rate": 2.7876288659793814e-05, "loss": 1.9873, "step": 5630 }, { "epoch": 5.155393053016454, "grad_norm": 4.840715408325195, "learning_rate": 2.7738831615120274e-05, "loss": 1.8765, "step": 5640 }, { "epoch": 5.16453382084095, "grad_norm": 19.28848648071289, "learning_rate": 2.7601374570446737e-05, "loss": 1.8097, "step": 5650 }, { "epoch": 5.173674588665448, "grad_norm": 32.625, "learning_rate": 2.7463917525773198e-05, "loss": 1.9237, "step": 5660 }, { "epoch": 5.182815356489945, "grad_norm": 24.589351654052734, "learning_rate": 2.7326460481099658e-05, "loss": 3.3319, "step": 5670 }, { "epoch": 5.1919561243144425, "grad_norm": 4.235184192657471, "learning_rate": 2.7189003436426118e-05, "loss": 1.3542, "step": 5680 }, { "epoch": 5.201096892138939, "grad_norm": 3.8742153644561768, "learning_rate": 2.705154639175258e-05, "loss": 1.3745, "step": 5690 }, { "epoch": 5.210237659963437, "grad_norm": 6.135552406311035, "learning_rate": 2.691408934707904e-05, "loss": 2.2472, "step": 5700 }, { "epoch": 5.219378427787934, "grad_norm": 5.5752034187316895, "learning_rate": 2.6776632302405502e-05, "loss": 1.3138, "step": 5710 }, { "epoch": 5.2285191956124315, "grad_norm": 11.66261100769043, "learning_rate": 2.663917525773196e-05, "loss": 2.3336, "step": 5720 }, { "epoch": 5.237659963436928, "grad_norm": 26.470853805541992, "learning_rate": 2.650171821305842e-05, "loss": 1.9246, "step": 5730 }, { "epoch": 5.246800731261426, "grad_norm": 5.959348201751709, "learning_rate": 2.636426116838488e-05, "loss": 2.0815, "step": 5740 }, { "epoch": 5.255941499085923, "grad_norm": 8.588202476501465, "learning_rate": 2.622680412371134e-05, "loss": 2.047, "step": 5750 }, { "epoch": 5.2650822669104205, "grad_norm": 40.6922607421875, "learning_rate": 2.6089347079037803e-05, "loss": 2.9851, "step": 5760 }, { "epoch": 5.274223034734918, "grad_norm": 15.263986587524414, "learning_rate": 2.5951890034364263e-05, "loss": 1.5784, 
"step": 5770 }, { "epoch": 5.283363802559415, "grad_norm": 22.796846389770508, "learning_rate": 2.5814432989690723e-05, "loss": 2.4052, "step": 5780 }, { "epoch": 5.292504570383913, "grad_norm": 31.491392135620117, "learning_rate": 2.5676975945017183e-05, "loss": 2.2536, "step": 5790 }, { "epoch": 5.301645338208409, "grad_norm": 9.189494132995605, "learning_rate": 2.5539518900343647e-05, "loss": 2.0999, "step": 5800 }, { "epoch": 5.310786106032907, "grad_norm": 32.402259826660156, "learning_rate": 2.5402061855670107e-05, "loss": 2.0654, "step": 5810 }, { "epoch": 5.319926873857404, "grad_norm": 6.688931941986084, "learning_rate": 2.5264604810996567e-05, "loss": 2.5549, "step": 5820 }, { "epoch": 5.329067641681902, "grad_norm": 1.9148147106170654, "learning_rate": 2.5127147766323024e-05, "loss": 2.5326, "step": 5830 }, { "epoch": 5.338208409506398, "grad_norm": 35.11110305786133, "learning_rate": 2.4989690721649487e-05, "loss": 1.7518, "step": 5840 }, { "epoch": 5.347349177330896, "grad_norm": 14.587748527526855, "learning_rate": 2.4852233676975948e-05, "loss": 1.9119, "step": 5850 }, { "epoch": 5.356489945155393, "grad_norm": 24.35093879699707, "learning_rate": 2.4714776632302404e-05, "loss": 1.4344, "step": 5860 }, { "epoch": 5.365630712979891, "grad_norm": 6.264184951782227, "learning_rate": 2.4577319587628868e-05, "loss": 2.6135, "step": 5870 }, { "epoch": 5.374771480804387, "grad_norm": 19.515151977539062, "learning_rate": 2.4439862542955328e-05, "loss": 2.3661, "step": 5880 }, { "epoch": 5.383912248628885, "grad_norm": 22.518278121948242, "learning_rate": 2.4302405498281788e-05, "loss": 1.313, "step": 5890 }, { "epoch": 5.393053016453382, "grad_norm": 4.795875072479248, "learning_rate": 2.416494845360825e-05, "loss": 3.0295, "step": 5900 }, { "epoch": 5.4021937842778796, "grad_norm": 9.725578308105469, "learning_rate": 2.4027491408934712e-05, "loss": 1.9971, "step": 5910 }, { "epoch": 5.411334552102376, "grad_norm": 33.24148178100586, "learning_rate": 2.389003436426117e-05, "loss": 2.0029, "step": 5920 }, { "epoch": 5.420475319926874, "grad_norm": 14.862298965454102, "learning_rate": 2.375257731958763e-05, "loss": 1.3386, "step": 5930 }, { "epoch": 5.429616087751371, "grad_norm": 57.17320251464844, "learning_rate": 2.361512027491409e-05, "loss": 3.2821, "step": 5940 }, { "epoch": 5.4387568555758685, "grad_norm": 16.02284049987793, "learning_rate": 2.3477663230240553e-05, "loss": 2.3394, "step": 5950 }, { "epoch": 5.447897623400365, "grad_norm": 6.794984817504883, "learning_rate": 2.3340206185567013e-05, "loss": 2.3584, "step": 5960 }, { "epoch": 5.457038391224863, "grad_norm": 20.631656646728516, "learning_rate": 2.320274914089347e-05, "loss": 2.7751, "step": 5970 }, { "epoch": 5.46617915904936, "grad_norm": 40.044891357421875, "learning_rate": 2.3065292096219933e-05, "loss": 2.2999, "step": 5980 }, { "epoch": 5.4753199268738575, "grad_norm": 5.686783313751221, "learning_rate": 2.2927835051546393e-05, "loss": 1.2177, "step": 5990 }, { "epoch": 5.484460694698354, "grad_norm": 5.030197620391846, "learning_rate": 2.2790378006872853e-05, "loss": 1.4006, "step": 6000 }, { "epoch": 5.493601462522852, "grad_norm": 30.809846878051758, "learning_rate": 2.2652920962199314e-05, "loss": 1.5069, "step": 6010 }, { "epoch": 5.50274223034735, "grad_norm": 6.307195663452148, "learning_rate": 2.2515463917525774e-05, "loss": 1.5458, "step": 6020 }, { "epoch": 5.5118829981718465, "grad_norm": 41.10477828979492, "learning_rate": 2.2378006872852234e-05, "loss": 3.6876, "step": 6030 }, { "epoch": 
5.521023765996343, "grad_norm": 11.78812313079834, "learning_rate": 2.2240549828178694e-05, "loss": 2.5693, "step": 6040 }, { "epoch": 5.530164533820841, "grad_norm": 14.058636665344238, "learning_rate": 2.2103092783505154e-05, "loss": 2.6255, "step": 6050 }, { "epoch": 5.539305301645339, "grad_norm": 7.14555025100708, "learning_rate": 2.1965635738831618e-05, "loss": 1.8024, "step": 6060 }, { "epoch": 5.548446069469835, "grad_norm": 10.551641464233398, "learning_rate": 2.1828178694158078e-05, "loss": 1.4299, "step": 6070 }, { "epoch": 5.557586837294332, "grad_norm": 13.216389656066895, "learning_rate": 2.1690721649484538e-05, "loss": 2.1603, "step": 6080 }, { "epoch": 5.56672760511883, "grad_norm": 1.5080305337905884, "learning_rate": 2.1553264604810995e-05, "loss": 2.2961, "step": 6090 }, { "epoch": 5.575868372943328, "grad_norm": 7.247092247009277, "learning_rate": 2.141580756013746e-05, "loss": 1.7767, "step": 6100 }, { "epoch": 5.585009140767824, "grad_norm": 2.7271177768707275, "learning_rate": 2.127835051546392e-05, "loss": 1.175, "step": 6110 }, { "epoch": 5.594149908592322, "grad_norm": 45.61552810668945, "learning_rate": 2.114089347079038e-05, "loss": 1.3511, "step": 6120 }, { "epoch": 5.603290676416819, "grad_norm": 14.556281089782715, "learning_rate": 2.100343642611684e-05, "loss": 1.5322, "step": 6130 }, { "epoch": 5.612431444241317, "grad_norm": 61.991111755371094, "learning_rate": 2.0865979381443303e-05, "loss": 4.3124, "step": 6140 }, { "epoch": 5.621572212065813, "grad_norm": 31.479997634887695, "learning_rate": 2.072852233676976e-05, "loss": 1.4807, "step": 6150 }, { "epoch": 5.630712979890311, "grad_norm": 3.9369869232177734, "learning_rate": 2.059106529209622e-05, "loss": 3.2076, "step": 6160 }, { "epoch": 5.639853747714808, "grad_norm": 22.462398529052734, "learning_rate": 2.0453608247422683e-05, "loss": 2.1722, "step": 6170 }, { "epoch": 5.6489945155393055, "grad_norm": 59.8381233215332, "learning_rate": 2.0316151202749143e-05, "loss": 2.4083, "step": 6180 }, { "epoch": 5.658135283363802, "grad_norm": 11.01894474029541, "learning_rate": 2.0178694158075603e-05, "loss": 1.7211, "step": 6190 }, { "epoch": 5.6672760511883, "grad_norm": 19.811832427978516, "learning_rate": 2.004123711340206e-05, "loss": 2.2812, "step": 6200 }, { "epoch": 5.676416819012797, "grad_norm": 12.10692310333252, "learning_rate": 1.9903780068728524e-05, "loss": 3.0, "step": 6210 }, { "epoch": 5.6855575868372945, "grad_norm": 21.908916473388672, "learning_rate": 1.9766323024054984e-05, "loss": 1.5615, "step": 6220 }, { "epoch": 5.694698354661791, "grad_norm": 10.701665878295898, "learning_rate": 1.9628865979381444e-05, "loss": 1.1985, "step": 6230 }, { "epoch": 5.703839122486289, "grad_norm": 17.514066696166992, "learning_rate": 1.9491408934707904e-05, "loss": 1.5525, "step": 6240 }, { "epoch": 5.712979890310786, "grad_norm": 3.509481430053711, "learning_rate": 1.9353951890034368e-05, "loss": 1.6935, "step": 6250 }, { "epoch": 5.7221206581352835, "grad_norm": 19.565475463867188, "learning_rate": 1.9216494845360825e-05, "loss": 1.6022, "step": 6260 }, { "epoch": 5.73126142595978, "grad_norm": 21.839176177978516, "learning_rate": 1.9079037800687285e-05, "loss": 1.8085, "step": 6270 }, { "epoch": 5.740402193784278, "grad_norm": 7.026022911071777, "learning_rate": 1.8941580756013745e-05, "loss": 2.0553, "step": 6280 }, { "epoch": 5.749542961608775, "grad_norm": 32.81854248046875, "learning_rate": 1.880412371134021e-05, "loss": 2.1562, "step": 6290 }, { "epoch": 5.7586837294332724, "grad_norm": 
39.547950744628906, "learning_rate": 1.866666666666667e-05, "loss": 1.6854, "step": 6300 }, { "epoch": 5.76782449725777, "grad_norm": 15.32168960571289, "learning_rate": 1.852920962199313e-05, "loss": 2.5072, "step": 6310 }, { "epoch": 5.776965265082267, "grad_norm": 13.045056343078613, "learning_rate": 1.839175257731959e-05, "loss": 1.9899, "step": 6320 }, { "epoch": 5.786106032906764, "grad_norm": 19.206762313842773, "learning_rate": 1.825429553264605e-05, "loss": 1.3674, "step": 6330 }, { "epoch": 5.795246800731261, "grad_norm": 8.894610404968262, "learning_rate": 1.811683848797251e-05, "loss": 1.6348, "step": 6340 }, { "epoch": 5.804387568555759, "grad_norm": 2.9649221897125244, "learning_rate": 1.797938144329897e-05, "loss": 1.5468, "step": 6350 }, { "epoch": 5.813528336380256, "grad_norm": 19.463268280029297, "learning_rate": 1.7841924398625433e-05, "loss": 2.7073, "step": 6360 }, { "epoch": 5.822669104204754, "grad_norm": 6.29571008682251, "learning_rate": 1.770446735395189e-05, "loss": 1.6744, "step": 6370 }, { "epoch": 5.83180987202925, "grad_norm": 3.524350166320801, "learning_rate": 1.756701030927835e-05, "loss": 2.6163, "step": 6380 }, { "epoch": 5.840950639853748, "grad_norm": 16.762889862060547, "learning_rate": 1.742955326460481e-05, "loss": 2.4136, "step": 6390 }, { "epoch": 5.850091407678245, "grad_norm": 8.522496223449707, "learning_rate": 1.7292096219931274e-05, "loss": 1.2524, "step": 6400 }, { "epoch": 5.859232175502743, "grad_norm": 30.195547103881836, "learning_rate": 1.7154639175257734e-05, "loss": 2.856, "step": 6410 }, { "epoch": 5.868372943327239, "grad_norm": 7.831188201904297, "learning_rate": 1.7017182130584194e-05, "loss": 2.9353, "step": 6420 }, { "epoch": 5.877513711151737, "grad_norm": 38.5179443359375, "learning_rate": 1.6879725085910654e-05, "loss": 2.0612, "step": 6430 }, { "epoch": 5.886654478976234, "grad_norm": 13.353597640991211, "learning_rate": 1.6742268041237114e-05, "loss": 2.1692, "step": 6440 }, { "epoch": 5.8957952468007315, "grad_norm": 12.689579010009766, "learning_rate": 1.6604810996563575e-05, "loss": 2.1301, "step": 6450 }, { "epoch": 5.904936014625228, "grad_norm": 5.47307825088501, "learning_rate": 1.6467353951890035e-05, "loss": 1.6086, "step": 6460 }, { "epoch": 5.914076782449726, "grad_norm": 3.7523372173309326, "learning_rate": 1.6329896907216495e-05, "loss": 2.2775, "step": 6470 }, { "epoch": 5.923217550274223, "grad_norm": 4.5170578956604, "learning_rate": 1.619243986254296e-05, "loss": 1.2349, "step": 6480 }, { "epoch": 5.9323583180987205, "grad_norm": 2.7021877765655518, "learning_rate": 1.6054982817869415e-05, "loss": 1.8774, "step": 6490 }, { "epoch": 5.941499085923217, "grad_norm": 2.1031246185302734, "learning_rate": 1.5917525773195875e-05, "loss": 1.9034, "step": 6500 }, { "epoch": 5.950639853747715, "grad_norm": 1.3965234756469727, "learning_rate": 1.578006872852234e-05, "loss": 1.5506, "step": 6510 }, { "epoch": 5.959780621572212, "grad_norm": 26.129735946655273, "learning_rate": 1.56426116838488e-05, "loss": 1.5488, "step": 6520 }, { "epoch": 5.9689213893967095, "grad_norm": 3.006767988204956, "learning_rate": 1.550515463917526e-05, "loss": 2.9622, "step": 6530 }, { "epoch": 5.978062157221206, "grad_norm": 9.74594783782959, "learning_rate": 1.5367697594501716e-05, "loss": 1.7, "step": 6540 }, { "epoch": 5.987202925045704, "grad_norm": 56.580047607421875, "learning_rate": 1.5230240549828178e-05, "loss": 2.8964, "step": 6550 }, { "epoch": 5.996343692870201, "grad_norm": 14.416129112243652, "learning_rate": 
1.509278350515464e-05, "loss": 2.2022, "step": 6560 } ], "logging_steps": 10, "max_steps": 7658, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.1107170304e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }
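
Note (not part of the checkpoint itself): the object above is the state the Hugging Face Trainer periodically writes alongside checkpoints; its "log_history" list holds one dict per logged step. The sketch below is a minimal, hedged illustration of how such a file could be inspected — the filename "trainer_state.json" is an assumption about where this JSON is saved, and the optional plotting step is only attempted if matplotlib happens to be installed.

# Minimal sketch: read the Trainer state and summarize the logged training loss.
# Assumes the JSON above is saved as "trainer_state.json" in the working directory.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]                      # list of per-step log dicts
steps   = [h["step"] for h in history if "loss" in h]
losses  = [h["loss"] for h in history if "loss" in h]

print(f"logged points: {len(steps)}")
print(f"last logged step: {steps[-1]}, last loss: {losses[-1]}")
print(f"max_steps: {state['max_steps']}, epochs so far: {state['epoch']}")

# Optional: plot the loss curve if matplotlib is available.
try:
    import matplotlib.pyplot as plt
    plt.plot(steps, losses)
    plt.xlabel("step")
    plt.ylabel("training loss")
    plt.title("Loss from trainer_state.json")
    plt.show()
except ImportError:
    pass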