{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.08, "eval_steps": 500, "global_step": 19000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032, "grad_norm": 70.8677749633789, "learning_rate": 9.140767824497258e-07, "loss": 372.4206, "step": 10 }, { "epoch": 0.0064, "grad_norm": 330.7235412597656, "learning_rate": 1.8281535648994516e-06, "loss": 211.1453, "step": 20 }, { "epoch": 0.0096, "grad_norm": 120.3741226196289, "learning_rate": 2.7422303473491773e-06, "loss": 440.9232, "step": 30 }, { "epoch": 0.0128, "grad_norm": 133.67770385742188, "learning_rate": 3.6563071297989032e-06, "loss": 412.1671, "step": 40 }, { "epoch": 0.016, "grad_norm": 164.37681579589844, "learning_rate": 4.570383912248629e-06, "loss": 233.8384, "step": 50 }, { "epoch": 0.0192, "grad_norm": 184.6650848388672, "learning_rate": 5.484460694698355e-06, "loss": 152.7614, "step": 60 }, { "epoch": 0.0224, "grad_norm": 92.71810913085938, "learning_rate": 6.398537477148081e-06, "loss": 258.3155, "step": 70 }, { "epoch": 0.0256, "grad_norm": 348.5384216308594, "learning_rate": 7.3126142595978065e-06, "loss": 258.0102, "step": 80 }, { "epoch": 0.0288, "grad_norm": 49.261295318603516, "learning_rate": 8.226691042047533e-06, "loss": 161.7314, "step": 90 }, { "epoch": 0.032, "grad_norm": 830.3086547851562, "learning_rate": 9.140767824497258e-06, "loss": 315.4532, "step": 100 }, { "epoch": 0.0352, "grad_norm": 481.1623840332031, "learning_rate": 1.0054844606946984e-05, "loss": 365.2337, "step": 110 }, { "epoch": 0.0384, "grad_norm": 610.8396606445312, "learning_rate": 1.096892138939671e-05, "loss": 358.488, "step": 120 }, { "epoch": 0.0416, "grad_norm": 370.4792785644531, "learning_rate": 1.1882998171846435e-05, "loss": 350.3711, "step": 130 }, { "epoch": 0.0448, "grad_norm": 368.6134338378906, "learning_rate": 1.2797074954296162e-05, "loss": 179.6351, "step": 140 }, { "epoch": 0.048, "grad_norm": 321.947021484375, "learning_rate": 1.3711151736745886e-05, "loss": 265.5494, "step": 150 }, { "epoch": 0.0512, "grad_norm": 415.5428161621094, "learning_rate": 1.4625228519195613e-05, "loss": 339.6797, "step": 160 }, { "epoch": 0.0544, "grad_norm": 109.42345428466797, "learning_rate": 1.553930530164534e-05, "loss": 322.6206, "step": 170 }, { "epoch": 0.0576, "grad_norm": 886.9024047851562, "learning_rate": 1.6453382084095066e-05, "loss": 239.7657, "step": 180 }, { "epoch": 0.0608, "grad_norm": 699.3433837890625, "learning_rate": 1.7367458866544793e-05, "loss": 323.6489, "step": 190 }, { "epoch": 0.064, "grad_norm": 241.30528259277344, "learning_rate": 1.8281535648994517e-05, "loss": 304.5519, "step": 200 }, { "epoch": 0.0672, "grad_norm": 441.35968017578125, "learning_rate": 1.9195612431444244e-05, "loss": 139.2091, "step": 210 }, { "epoch": 0.0704, "grad_norm": 270.9167785644531, "learning_rate": 2.0109689213893968e-05, "loss": 481.9218, "step": 220 }, { "epoch": 0.0736, "grad_norm": 40.07669448852539, "learning_rate": 2.1023765996343695e-05, "loss": 332.3318, "step": 230 }, { "epoch": 0.0768, "grad_norm": 893.2574462890625, "learning_rate": 2.193784277879342e-05, "loss": 282.2632, "step": 240 }, { "epoch": 0.08, "grad_norm": 1040.603271484375, "learning_rate": 2.2851919561243146e-05, "loss": 285.0945, "step": 250 }, { "epoch": 0.0832, "grad_norm": 1304.2822265625, "learning_rate": 2.376599634369287e-05, "loss": 268.8489, "step": 260 }, { "epoch": 0.0864, "grad_norm": 560.0037231445312, "learning_rate": 2.4680073126142597e-05, "loss": 256.9847, "step": 270 }, { "epoch": 0.0896, "grad_norm": 327.4929504394531, "learning_rate": 2.5594149908592324e-05, "loss": 303.0919, "step": 280 }, { "epoch": 0.0928, "grad_norm": 463.27801513671875, "learning_rate": 2.6508226691042048e-05, "loss": 417.4169, "step": 290 }, { "epoch": 0.096, "grad_norm": 607.85107421875, "learning_rate": 2.742230347349177e-05, "loss": 240.9904, "step": 300 }, { "epoch": 0.0992, "grad_norm": 321.6316833496094, "learning_rate": 2.8336380255941502e-05, "loss": 156.5909, "step": 310 }, { "epoch": 0.1024, "grad_norm": 741.380615234375, "learning_rate": 2.9250457038391226e-05, "loss": 275.7987, "step": 320 }, { "epoch": 0.1056, "grad_norm": 193.89871215820312, "learning_rate": 3.016453382084095e-05, "loss": 86.3223, "step": 330 }, { "epoch": 0.1088, "grad_norm": 142.398681640625, "learning_rate": 3.107861060329068e-05, "loss": 161.4423, "step": 340 }, { "epoch": 0.112, "grad_norm": 101.83248138427734, "learning_rate": 3.1992687385740404e-05, "loss": 211.2831, "step": 350 }, { "epoch": 0.1152, "grad_norm": 180.22805786132812, "learning_rate": 3.290676416819013e-05, "loss": 148.6474, "step": 360 }, { "epoch": 0.1184, "grad_norm": 280.3041687011719, "learning_rate": 3.382084095063985e-05, "loss": 237.924, "step": 370 }, { "epoch": 0.1216, "grad_norm": 322.2187805175781, "learning_rate": 3.4734917733089586e-05, "loss": 126.5145, "step": 380 }, { "epoch": 0.1248, "grad_norm": 679.7227783203125, "learning_rate": 3.5648994515539306e-05, "loss": 144.3784, "step": 390 }, { "epoch": 0.128, "grad_norm": 1676.8643798828125, "learning_rate": 3.656307129798903e-05, "loss": 279.3074, "step": 400 }, { "epoch": 0.1312, "grad_norm": 681.2305908203125, "learning_rate": 3.7477148080438754e-05, "loss": 210.6748, "step": 410 }, { "epoch": 0.1344, "grad_norm": 950.00537109375, "learning_rate": 3.839122486288849e-05, "loss": 373.9897, "step": 420 }, { "epoch": 0.1376, "grad_norm": 88.00889587402344, "learning_rate": 3.930530164533821e-05, "loss": 119.6067, "step": 430 }, { "epoch": 0.1408, "grad_norm": 1166.579833984375, "learning_rate": 4.0219378427787935e-05, "loss": 135.5659, "step": 440 }, { "epoch": 0.144, "grad_norm": 826.1378784179688, "learning_rate": 4.113345521023766e-05, "loss": 165.0297, "step": 450 }, { "epoch": 0.1472, "grad_norm": 230.793701171875, "learning_rate": 4.204753199268739e-05, "loss": 195.6006, "step": 460 }, { "epoch": 0.1504, "grad_norm": 620.4400024414062, "learning_rate": 4.296160877513711e-05, "loss": 279.1812, "step": 470 }, { "epoch": 0.1536, "grad_norm": 427.4312744140625, "learning_rate": 4.387568555758684e-05, "loss": 84.7935, "step": 480 }, { "epoch": 0.1568, "grad_norm": 136.93243408203125, "learning_rate": 4.4789762340036564e-05, "loss": 143.6062, "step": 490 }, { "epoch": 0.16, "grad_norm": 229.9365234375, "learning_rate": 4.570383912248629e-05, "loss": 198.6888, "step": 500 }, { "epoch": 0.1632, "grad_norm": 235.56333923339844, "learning_rate": 4.661791590493602e-05, "loss": 294.0395, "step": 510 }, { "epoch": 0.1664, "grad_norm": 322.97625732421875, "learning_rate": 4.753199268738574e-05, "loss": 142.6408, "step": 520 }, { "epoch": 0.1696, "grad_norm": 286.3608703613281, "learning_rate": 4.844606946983547e-05, "loss": 208.4502, "step": 530 }, { "epoch": 0.1728, "grad_norm": 140.10496520996094, "learning_rate": 4.936014625228519e-05, "loss": 87.6955, "step": 540 }, { "epoch": 0.176, "grad_norm": 221.54261779785156, "learning_rate": 5.027422303473493e-05, "loss": 96.5447, "step": 550 }, { "epoch": 0.1792, "grad_norm": 369.0118408203125, "learning_rate": 5.118829981718465e-05, "loss": 73.7012, "step": 560 }, { "epoch": 0.1824, "grad_norm": 390.25335693359375, "learning_rate": 5.2102376599634375e-05, "loss": 105.7585, "step": 570 }, { "epoch": 0.1856, "grad_norm": 608.348388671875, "learning_rate": 5.3016453382084095e-05, "loss": 63.3395, "step": 580 }, { "epoch": 0.1888, "grad_norm": 573.05615234375, "learning_rate": 5.393053016453382e-05, "loss": 207.9913, "step": 590 }, { "epoch": 0.192, "grad_norm": 72.8682861328125, "learning_rate": 5.484460694698354e-05, "loss": 144.4198, "step": 600 }, { "epoch": 0.1952, "grad_norm": 831.5779418945312, "learning_rate": 5.575868372943327e-05, "loss": 125.4837, "step": 610 }, { "epoch": 0.1984, "grad_norm": 1477.1602783203125, "learning_rate": 5.6672760511883004e-05, "loss": 159.8333, "step": 620 }, { "epoch": 0.2016, "grad_norm": 436.246337890625, "learning_rate": 5.758683729433273e-05, "loss": 91.322, "step": 630 }, { "epoch": 0.2048, "grad_norm": 94.6619644165039, "learning_rate": 5.850091407678245e-05, "loss": 87.9035, "step": 640 }, { "epoch": 0.208, "grad_norm": 118.78590393066406, "learning_rate": 5.941499085923218e-05, "loss": 70.5239, "step": 650 }, { "epoch": 0.2112, "grad_norm": 328.2958984375, "learning_rate": 6.03290676416819e-05, "loss": 116.9846, "step": 660 }, { "epoch": 0.2144, "grad_norm": 505.57452392578125, "learning_rate": 6.124314442413163e-05, "loss": 129.6089, "step": 670 }, { "epoch": 0.2176, "grad_norm": 844.416015625, "learning_rate": 6.215722120658135e-05, "loss": 146.8645, "step": 680 }, { "epoch": 0.2208, "grad_norm": 876.8177490234375, "learning_rate": 6.307129798903108e-05, "loss": 133.5625, "step": 690 }, { "epoch": 0.224, "grad_norm": 150.03578186035156, "learning_rate": 6.398537477148081e-05, "loss": 95.1602, "step": 700 }, { "epoch": 0.2272, "grad_norm": 352.86932373046875, "learning_rate": 6.489945155393054e-05, "loss": 76.4034, "step": 710 }, { "epoch": 0.2304, "grad_norm": 452.97027587890625, "learning_rate": 6.581352833638026e-05, "loss": 122.3771, "step": 720 }, { "epoch": 0.2336, "grad_norm": 185.1142120361328, "learning_rate": 6.672760511882999e-05, "loss": 44.6558, "step": 730 }, { "epoch": 0.2368, "grad_norm": 56.88755416870117, "learning_rate": 6.76416819012797e-05, "loss": 87.8657, "step": 740 }, { "epoch": 0.24, "grad_norm": 497.0704345703125, "learning_rate": 6.855575868372943e-05, "loss": 78.2252, "step": 750 }, { "epoch": 0.2432, "grad_norm": 826.345458984375, "learning_rate": 6.946983546617917e-05, "loss": 109.3264, "step": 760 }, { "epoch": 0.2464, "grad_norm": 871.9296264648438, "learning_rate": 7.03839122486289e-05, "loss": 80.0283, "step": 770 }, { "epoch": 0.2496, "grad_norm": 572.2880859375, "learning_rate": 7.129798903107861e-05, "loss": 78.3177, "step": 780 }, { "epoch": 0.2528, "grad_norm": 424.81597900390625, "learning_rate": 7.221206581352834e-05, "loss": 63.2078, "step": 790 }, { "epoch": 0.256, "grad_norm": 539.1710815429688, "learning_rate": 7.312614259597807e-05, "loss": 59.1713, "step": 800 }, { "epoch": 0.2592, "grad_norm": 269.7329406738281, "learning_rate": 7.40402193784278e-05, "loss": 46.6312, "step": 810 }, { "epoch": 0.2624, "grad_norm": 403.2044982910156, "learning_rate": 7.495429616087751e-05, "loss": 108.5579, "step": 820 }, { "epoch": 0.2656, "grad_norm": 419.28912353515625, "learning_rate": 7.586837294332725e-05, "loss": 115.3796, "step": 830 }, { "epoch": 0.2688, "grad_norm": 325.571044921875, "learning_rate": 7.678244972577697e-05, "loss": 76.451, "step": 840 }, { "epoch": 0.272, "grad_norm": 419.87457275390625, "learning_rate": 7.76965265082267e-05, "loss": 89.4141, "step": 850 }, { "epoch": 0.2752, "grad_norm": 529.0945434570312, "learning_rate": 7.861060329067642e-05, "loss": 69.1431, "step": 860 }, { "epoch": 0.2784, "grad_norm": 130.44972229003906, "learning_rate": 7.952468007312614e-05, "loss": 75.1188, "step": 870 }, { "epoch": 0.2816, "grad_norm": 247.48854064941406, "learning_rate": 8.043875685557587e-05, "loss": 69.9392, "step": 880 }, { "epoch": 0.2848, "grad_norm": 379.2283020019531, "learning_rate": 8.13528336380256e-05, "loss": 67.0566, "step": 890 }, { "epoch": 0.288, "grad_norm": 293.61126708984375, "learning_rate": 8.226691042047532e-05, "loss": 56.9224, "step": 900 }, { "epoch": 0.2912, "grad_norm": 159.09547424316406, "learning_rate": 8.318098720292505e-05, "loss": 72.8887, "step": 910 }, { "epoch": 0.2944, "grad_norm": 183.39781188964844, "learning_rate": 8.409506398537478e-05, "loss": 73.3354, "step": 920 }, { "epoch": 0.2976, "grad_norm": 237.7224578857422, "learning_rate": 8.50091407678245e-05, "loss": 41.4987, "step": 930 }, { "epoch": 0.3008, "grad_norm": 584.767578125, "learning_rate": 8.592321755027422e-05, "loss": 65.0617, "step": 940 }, { "epoch": 0.304, "grad_norm": 459.34320068359375, "learning_rate": 8.683729433272395e-05, "loss": 73.5751, "step": 950 }, { "epoch": 0.3072, "grad_norm": 762.7192993164062, "learning_rate": 8.775137111517367e-05, "loss": 73.2722, "step": 960 }, { "epoch": 0.3104, "grad_norm": 603.3504028320312, "learning_rate": 8.86654478976234e-05, "loss": 96.0344, "step": 970 }, { "epoch": 0.3136, "grad_norm": 248.93045043945312, "learning_rate": 8.957952468007313e-05, "loss": 59.8658, "step": 980 }, { "epoch": 0.3168, "grad_norm": 279.07110595703125, "learning_rate": 9.049360146252286e-05, "loss": 38.864, "step": 990 }, { "epoch": 0.32, "grad_norm": 218.828125, "learning_rate": 9.140767824497258e-05, "loss": 48.5516, "step": 1000 }, { "epoch": 0.3232, "grad_norm": 92.16239929199219, "learning_rate": 9.232175502742231e-05, "loss": 50.7601, "step": 1010 }, { "epoch": 0.3264, "grad_norm": 131.48045349121094, "learning_rate": 9.323583180987204e-05, "loss": 69.0272, "step": 1020 }, { "epoch": 0.3296, "grad_norm": 94.53346252441406, "learning_rate": 9.414990859232175e-05, "loss": 65.3211, "step": 1030 }, { "epoch": 0.3328, "grad_norm": 358.3692932128906, "learning_rate": 9.506398537477148e-05, "loss": 55.4857, "step": 1040 }, { "epoch": 0.336, "grad_norm": 292.1886291503906, "learning_rate": 9.597806215722122e-05, "loss": 45.9116, "step": 1050 }, { "epoch": 0.3392, "grad_norm": 294.4053955078125, "learning_rate": 9.689213893967095e-05, "loss": 54.4099, "step": 1060 }, { "epoch": 0.3424, "grad_norm": 536.5247192382812, "learning_rate": 9.780621572212066e-05, "loss": 39.3349, "step": 1070 }, { "epoch": 0.3456, "grad_norm": 442.44207763671875, "learning_rate": 9.872029250457039e-05, "loss": 42.7796, "step": 1080 }, { "epoch": 0.3488, "grad_norm": 301.01690673828125, "learning_rate": 9.963436928702011e-05, "loss": 49.6293, "step": 1090 }, { "epoch": 0.352, "grad_norm": 627.5016479492188, "learning_rate": 9.99711274722102e-05, "loss": 58.6378, "step": 1100 }, { "epoch": 0.3552, "grad_norm": 190.20114135742188, "learning_rate": 9.992300659256052e-05, "loss": 60.2339, "step": 1110 }, { "epoch": 0.3584, "grad_norm": 106.81483459472656, "learning_rate": 9.987488571291084e-05, "loss": 71.9868, "step": 1120 }, { "epoch": 0.3616, "grad_norm": 584.4236450195312, "learning_rate": 9.982676483326115e-05, "loss": 56.3242, "step": 1130 }, { "epoch": 0.3648, "grad_norm": 457.242919921875, "learning_rate": 9.977864395361147e-05, "loss": 62.0991, "step": 1140 }, { "epoch": 0.368, "grad_norm": 320.54620361328125, "learning_rate": 9.97305230739618e-05, "loss": 56.8969, "step": 1150 }, { "epoch": 0.3712, "grad_norm": 202.88552856445312, "learning_rate": 9.968240219431211e-05, "loss": 43.3129, "step": 1160 }, { "epoch": 0.3744, "grad_norm": 599.7297973632812, "learning_rate": 9.963428131466243e-05, "loss": 112.1979, "step": 1170 }, { "epoch": 0.3776, "grad_norm": 409.6940612792969, "learning_rate": 9.958616043501276e-05, "loss": 35.5901, "step": 1180 }, { "epoch": 0.3808, "grad_norm": 476.78387451171875, "learning_rate": 9.953803955536308e-05, "loss": 58.1074, "step": 1190 }, { "epoch": 0.384, "grad_norm": 194.54656982421875, "learning_rate": 9.94899186757134e-05, "loss": 50.6098, "step": 1200 }, { "epoch": 0.3872, "grad_norm": 186.1998291015625, "learning_rate": 9.944179779606372e-05, "loss": 24.7077, "step": 1210 }, { "epoch": 0.3904, "grad_norm": 62.49644470214844, "learning_rate": 9.939367691641404e-05, "loss": 59.4432, "step": 1220 }, { "epoch": 0.3936, "grad_norm": 207.9137725830078, "learning_rate": 9.934555603676435e-05, "loss": 24.3741, "step": 1230 }, { "epoch": 0.3968, "grad_norm": 334.8984375, "learning_rate": 9.929743515711468e-05, "loss": 67.5187, "step": 1240 }, { "epoch": 0.4, "grad_norm": 98.1480941772461, "learning_rate": 9.9249314277465e-05, "loss": 40.4422, "step": 1250 }, { "epoch": 0.4032, "grad_norm": 222.9722442626953, "learning_rate": 9.920119339781531e-05, "loss": 37.1549, "step": 1260 }, { "epoch": 0.4064, "grad_norm": 162.55287170410156, "learning_rate": 9.915307251816563e-05, "loss": 44.5279, "step": 1270 }, { "epoch": 0.4096, "grad_norm": 125.35118865966797, "learning_rate": 9.910495163851595e-05, "loss": 35.3092, "step": 1280 }, { "epoch": 0.4128, "grad_norm": 144.71304321289062, "learning_rate": 9.905683075886628e-05, "loss": 39.4383, "step": 1290 }, { "epoch": 0.416, "grad_norm": 39.62452697753906, "learning_rate": 9.90087098792166e-05, "loss": 33.2722, "step": 1300 }, { "epoch": 0.4192, "grad_norm": 25.904735565185547, "learning_rate": 9.896058899956692e-05, "loss": 34.3183, "step": 1310 }, { "epoch": 0.4224, "grad_norm": 125.3044204711914, "learning_rate": 9.891246811991724e-05, "loss": 59.9542, "step": 1320 }, { "epoch": 0.4256, "grad_norm": 415.9193420410156, "learning_rate": 9.886434724026755e-05, "loss": 38.8972, "step": 1330 }, { "epoch": 0.4288, "grad_norm": 152.35000610351562, "learning_rate": 9.881622636061788e-05, "loss": 45.3212, "step": 1340 }, { "epoch": 0.432, "grad_norm": 115.28887176513672, "learning_rate": 9.87681054809682e-05, "loss": 28.9608, "step": 1350 }, { "epoch": 0.4352, "grad_norm": 234.9051055908203, "learning_rate": 9.871998460131851e-05, "loss": 40.3441, "step": 1360 }, { "epoch": 0.4384, "grad_norm": 604.304443359375, "learning_rate": 9.867186372166883e-05, "loss": 51.7925, "step": 1370 }, { "epoch": 0.4416, "grad_norm": 193.7539520263672, "learning_rate": 9.862374284201916e-05, "loss": 30.6552, "step": 1380 }, { "epoch": 0.4448, "grad_norm": 100.42778015136719, "learning_rate": 9.857562196236948e-05, "loss": 29.9894, "step": 1390 }, { "epoch": 0.448, "grad_norm": 288.8160095214844, "learning_rate": 9.852750108271979e-05, "loss": 40.8629, "step": 1400 }, { "epoch": 0.4512, "grad_norm": 66.43569946289062, "learning_rate": 9.847938020307012e-05, "loss": 26.2656, "step": 1410 }, { "epoch": 0.4544, "grad_norm": 78.16374969482422, "learning_rate": 9.843125932342044e-05, "loss": 30.1275, "step": 1420 }, { "epoch": 0.4576, "grad_norm": 120.88524627685547, "learning_rate": 9.838313844377077e-05, "loss": 20.4762, "step": 1430 }, { "epoch": 0.4608, "grad_norm": 177.96958923339844, "learning_rate": 9.833501756412108e-05, "loss": 24.2266, "step": 1440 }, { "epoch": 0.464, "grad_norm": 130.786376953125, "learning_rate": 9.82868966844714e-05, "loss": 47.8729, "step": 1450 }, { "epoch": 0.4672, "grad_norm": 444.28851318359375, "learning_rate": 9.823877580482171e-05, "loss": 30.919, "step": 1460 }, { "epoch": 0.4704, "grad_norm": 113.35100555419922, "learning_rate": 9.819065492517203e-05, "loss": 51.097, "step": 1470 }, { "epoch": 0.4736, "grad_norm": 245.37928771972656, "learning_rate": 9.814253404552236e-05, "loss": 31.1144, "step": 1480 }, { "epoch": 0.4768, "grad_norm": 101.38189697265625, "learning_rate": 9.809441316587268e-05, "loss": 41.1514, "step": 1490 }, { "epoch": 0.48, "grad_norm": 148.2977294921875, "learning_rate": 9.804629228622299e-05, "loss": 32.8587, "step": 1500 }, { "epoch": 0.4832, "grad_norm": 62.406951904296875, "learning_rate": 9.799817140657331e-05, "loss": 22.1627, "step": 1510 }, { "epoch": 0.4864, "grad_norm": 433.7176208496094, "learning_rate": 9.795005052692364e-05, "loss": 35.2771, "step": 1520 }, { "epoch": 0.4896, "grad_norm": 139.73338317871094, "learning_rate": 9.790192964727395e-05, "loss": 28.6561, "step": 1530 }, { "epoch": 0.4928, "grad_norm": 127.1927261352539, "learning_rate": 9.785380876762428e-05, "loss": 26.2607, "step": 1540 }, { "epoch": 0.496, "grad_norm": 225.4915771484375, "learning_rate": 9.78056878879746e-05, "loss": 31.8774, "step": 1550 }, { "epoch": 0.4992, "grad_norm": 152.98448181152344, "learning_rate": 9.775756700832491e-05, "loss": 29.1978, "step": 1560 }, { "epoch": 0.5024, "grad_norm": 423.86236572265625, "learning_rate": 9.770944612867524e-05, "loss": 23.1176, "step": 1570 }, { "epoch": 0.5056, "grad_norm": 579.5122680664062, "learning_rate": 9.766132524902556e-05, "loss": 36.9734, "step": 1580 }, { "epoch": 0.5088, "grad_norm": 106.73033142089844, "learning_rate": 9.761320436937588e-05, "loss": 26.1043, "step": 1590 }, { "epoch": 0.512, "grad_norm": 149.6201171875, "learning_rate": 9.756508348972619e-05, "loss": 32.0093, "step": 1600 }, { "epoch": 0.5152, "grad_norm": 472.0378112792969, "learning_rate": 9.751696261007651e-05, "loss": 35.9758, "step": 1610 }, { "epoch": 0.5184, "grad_norm": 48.845401763916016, "learning_rate": 9.746884173042684e-05, "loss": 24.3892, "step": 1620 }, { "epoch": 0.5216, "grad_norm": 200.67312622070312, "learning_rate": 9.742072085077715e-05, "loss": 33.8457, "step": 1630 }, { "epoch": 0.5248, "grad_norm": 207.95066833496094, "learning_rate": 9.737259997112748e-05, "loss": 35.7563, "step": 1640 }, { "epoch": 0.528, "grad_norm": 247.0340118408203, "learning_rate": 9.73244790914778e-05, "loss": 29.0463, "step": 1650 }, { "epoch": 0.5312, "grad_norm": 285.4220275878906, "learning_rate": 9.727635821182811e-05, "loss": 26.6152, "step": 1660 }, { "epoch": 0.5344, "grad_norm": 253.2611083984375, "learning_rate": 9.722823733217844e-05, "loss": 25.4378, "step": 1670 }, { "epoch": 0.5376, "grad_norm": 472.8829040527344, "learning_rate": 9.718011645252876e-05, "loss": 62.3821, "step": 1680 }, { "epoch": 0.5408, "grad_norm": 89.76939392089844, "learning_rate": 9.713199557287908e-05, "loss": 24.1752, "step": 1690 }, { "epoch": 0.544, "grad_norm": 275.97509765625, "learning_rate": 9.708387469322939e-05, "loss": 45.5173, "step": 1700 }, { "epoch": 0.5472, "grad_norm": 458.5698547363281, "learning_rate": 9.703575381357972e-05, "loss": 26.1641, "step": 1710 }, { "epoch": 0.5504, "grad_norm": 34.091243743896484, "learning_rate": 9.698763293393004e-05, "loss": 22.7095, "step": 1720 }, { "epoch": 0.5536, "grad_norm": 89.35597229003906, "learning_rate": 9.693951205428035e-05, "loss": 38.0242, "step": 1730 }, { "epoch": 0.5568, "grad_norm": 147.58323669433594, "learning_rate": 9.689139117463067e-05, "loss": 29.0062, "step": 1740 }, { "epoch": 0.56, "grad_norm": 524.7474365234375, "learning_rate": 9.6843270294981e-05, "loss": 59.098, "step": 1750 }, { "epoch": 0.5632, "grad_norm": 77.78461456298828, "learning_rate": 9.679514941533131e-05, "loss": 23.434, "step": 1760 }, { "epoch": 0.5664, "grad_norm": 63.40346908569336, "learning_rate": 9.674702853568164e-05, "loss": 22.6146, "step": 1770 }, { "epoch": 0.5696, "grad_norm": 52.35934066772461, "learning_rate": 9.669890765603196e-05, "loss": 23.4179, "step": 1780 }, { "epoch": 0.5728, "grad_norm": 213.41380310058594, "learning_rate": 9.665078677638228e-05, "loss": 33.7234, "step": 1790 }, { "epoch": 0.576, "grad_norm": 286.4679870605469, "learning_rate": 9.660266589673259e-05, "loss": 24.4579, "step": 1800 }, { "epoch": 0.5792, "grad_norm": 372.3683166503906, "learning_rate": 9.655454501708292e-05, "loss": 36.6747, "step": 1810 }, { "epoch": 0.5824, "grad_norm": 367.9760437011719, "learning_rate": 9.650642413743324e-05, "loss": 35.3778, "step": 1820 }, { "epoch": 0.5856, "grad_norm": 454.5294494628906, "learning_rate": 9.645830325778355e-05, "loss": 33.0155, "step": 1830 }, { "epoch": 0.5888, "grad_norm": 110.4859848022461, "learning_rate": 9.641018237813387e-05, "loss": 24.3229, "step": 1840 }, { "epoch": 0.592, "grad_norm": 234.80357360839844, "learning_rate": 9.63620614984842e-05, "loss": 21.1816, "step": 1850 }, { "epoch": 0.5952, "grad_norm": 77.17384338378906, "learning_rate": 9.631394061883451e-05, "loss": 20.9283, "step": 1860 }, { "epoch": 0.5984, "grad_norm": 458.8993835449219, "learning_rate": 9.626581973918483e-05, "loss": 31.8687, "step": 1870 }, { "epoch": 0.6016, "grad_norm": 42.81018829345703, "learning_rate": 9.621769885953516e-05, "loss": 34.5272, "step": 1880 }, { "epoch": 0.6048, "grad_norm": 220.99127197265625, "learning_rate": 9.616957797988548e-05, "loss": 26.7147, "step": 1890 }, { "epoch": 0.608, "grad_norm": 54.44512176513672, "learning_rate": 9.61214571002358e-05, "loss": 24.3494, "step": 1900 }, { "epoch": 0.6112, "grad_norm": 105.65538787841797, "learning_rate": 9.607333622058612e-05, "loss": 31.0569, "step": 1910 }, { "epoch": 0.6144, "grad_norm": 23.5866641998291, "learning_rate": 9.602521534093644e-05, "loss": 35.5666, "step": 1920 }, { "epoch": 0.6176, "grad_norm": 318.65399169921875, "learning_rate": 9.597709446128675e-05, "loss": 29.6396, "step": 1930 }, { "epoch": 0.6208, "grad_norm": 41.01874542236328, "learning_rate": 9.592897358163707e-05, "loss": 32.9048, "step": 1940 }, { "epoch": 0.624, "grad_norm": 45.38637924194336, "learning_rate": 9.58808527019874e-05, "loss": 22.4149, "step": 1950 }, { "epoch": 0.6272, "grad_norm": 301.2698059082031, "learning_rate": 9.583273182233771e-05, "loss": 27.3668, "step": 1960 }, { "epoch": 0.6304, "grad_norm": 109.63639831542969, "learning_rate": 9.578461094268803e-05, "loss": 22.9627, "step": 1970 }, { "epoch": 0.6336, "grad_norm": 192.92343139648438, "learning_rate": 9.573649006303835e-05, "loss": 19.1204, "step": 1980 }, { "epoch": 0.6368, "grad_norm": 205.81973266601562, "learning_rate": 9.568836918338868e-05, "loss": 27.6961, "step": 1990 }, { "epoch": 0.64, "grad_norm": 187.46075439453125, "learning_rate": 9.5640248303739e-05, "loss": 22.1607, "step": 2000 }, { "epoch": 0.6432, "grad_norm": 151.6815948486328, "learning_rate": 9.559212742408932e-05, "loss": 19.8078, "step": 2010 }, { "epoch": 0.6464, "grad_norm": 74.60163116455078, "learning_rate": 9.554400654443964e-05, "loss": 23.4414, "step": 2020 }, { "epoch": 0.6496, "grad_norm": 61.8299674987793, "learning_rate": 9.549588566478995e-05, "loss": 17.4522, "step": 2030 }, { "epoch": 0.6528, "grad_norm": 318.5955505371094, "learning_rate": 9.544776478514028e-05, "loss": 22.7987, "step": 2040 }, { "epoch": 0.656, "grad_norm": 311.2384948730469, "learning_rate": 9.53996439054906e-05, "loss": 35.9233, "step": 2050 }, { "epoch": 0.6592, "grad_norm": 158.7857208251953, "learning_rate": 9.535152302584091e-05, "loss": 26.4328, "step": 2060 }, { "epoch": 0.6624, "grad_norm": 100.60624694824219, "learning_rate": 9.530340214619123e-05, "loss": 17.902, "step": 2070 }, { "epoch": 0.6656, "grad_norm": 104.47111511230469, "learning_rate": 9.525528126654155e-05, "loss": 35.7127, "step": 2080 }, { "epoch": 0.6688, "grad_norm": 254.18075561523438, "learning_rate": 9.520716038689188e-05, "loss": 28.0673, "step": 2090 }, { "epoch": 0.672, "grad_norm": 176.0093231201172, "learning_rate": 9.515903950724219e-05, "loss": 45.0393, "step": 2100 }, { "epoch": 0.6752, "grad_norm": 156.06594848632812, "learning_rate": 9.511091862759252e-05, "loss": 25.5019, "step": 2110 }, { "epoch": 0.6784, "grad_norm": 70.44853210449219, "learning_rate": 9.506279774794284e-05, "loss": 29.3255, "step": 2120 }, { "epoch": 0.6816, "grad_norm": 340.83062744140625, "learning_rate": 9.501467686829317e-05, "loss": 27.4794, "step": 2130 }, { "epoch": 0.6848, "grad_norm": 156.46664428710938, "learning_rate": 9.496655598864348e-05, "loss": 31.8008, "step": 2140 }, { "epoch": 0.688, "grad_norm": 18.300678253173828, "learning_rate": 9.49184351089938e-05, "loss": 20.808, "step": 2150 }, { "epoch": 0.6912, "grad_norm": 168.41383361816406, "learning_rate": 9.487031422934411e-05, "loss": 27.5098, "step": 2160 }, { "epoch": 0.6944, "grad_norm": 219.61073303222656, "learning_rate": 9.482219334969443e-05, "loss": 16.9947, "step": 2170 }, { "epoch": 0.6976, "grad_norm": 132.65806579589844, "learning_rate": 9.477407247004476e-05, "loss": 21.8476, "step": 2180 }, { "epoch": 0.7008, "grad_norm": 206.9647674560547, "learning_rate": 9.472595159039508e-05, "loss": 26.942, "step": 2190 }, { "epoch": 0.704, "grad_norm": 189.9781951904297, "learning_rate": 9.467783071074539e-05, "loss": 31.8326, "step": 2200 }, { "epoch": 0.7072, "grad_norm": 109.49040985107422, "learning_rate": 9.462970983109571e-05, "loss": 29.6914, "step": 2210 }, { "epoch": 0.7104, "grad_norm": 255.90353393554688, "learning_rate": 9.458158895144604e-05, "loss": 23.2996, "step": 2220 }, { "epoch": 0.7136, "grad_norm": 54.25808334350586, "learning_rate": 9.453346807179637e-05, "loss": 25.4372, "step": 2230 }, { "epoch": 0.7168, "grad_norm": 38.8013801574707, "learning_rate": 9.448534719214668e-05, "loss": 39.1475, "step": 2240 }, { "epoch": 0.72, "grad_norm": 223.67648315429688, "learning_rate": 9.4437226312497e-05, "loss": 19.257, "step": 2250 }, { "epoch": 0.7232, "grad_norm": 95.73957824707031, "learning_rate": 9.438910543284732e-05, "loss": 14.9857, "step": 2260 }, { "epoch": 0.7264, "grad_norm": 221.9759063720703, "learning_rate": 9.434098455319764e-05, "loss": 18.8402, "step": 2270 }, { "epoch": 0.7296, "grad_norm": 143.08071899414062, "learning_rate": 9.429286367354796e-05, "loss": 29.2019, "step": 2280 }, { "epoch": 0.7328, "grad_norm": 47.351165771484375, "learning_rate": 9.424474279389828e-05, "loss": 23.1339, "step": 2290 }, { "epoch": 0.736, "grad_norm": 119.9102554321289, "learning_rate": 9.419662191424859e-05, "loss": 26.1662, "step": 2300 }, { "epoch": 0.7392, "grad_norm": 16.750450134277344, "learning_rate": 9.414850103459891e-05, "loss": 14.8211, "step": 2310 }, { "epoch": 0.7424, "grad_norm": 85.4156265258789, "learning_rate": 9.410038015494924e-05, "loss": 22.9083, "step": 2320 }, { "epoch": 0.7456, "grad_norm": 81.16226959228516, "learning_rate": 9.405225927529955e-05, "loss": 31.724, "step": 2330 }, { "epoch": 0.7488, "grad_norm": 113.119873046875, "learning_rate": 9.400413839564988e-05, "loss": 21.8308, "step": 2340 }, { "epoch": 0.752, "grad_norm": 243.59823608398438, "learning_rate": 9.39560175160002e-05, "loss": 23.0452, "step": 2350 }, { "epoch": 0.7552, "grad_norm": 163.2970428466797, "learning_rate": 9.390789663635052e-05, "loss": 21.6552, "step": 2360 }, { "epoch": 0.7584, "grad_norm": 263.91802978515625, "learning_rate": 9.385977575670084e-05, "loss": 19.8966, "step": 2370 }, { "epoch": 0.7616, "grad_norm": 217.38804626464844, "learning_rate": 9.381165487705116e-05, "loss": 25.8317, "step": 2380 }, { "epoch": 0.7648, "grad_norm": 201.3759765625, "learning_rate": 9.376353399740148e-05, "loss": 25.6379, "step": 2390 }, { "epoch": 0.768, "grad_norm": 154.11007690429688, "learning_rate": 9.371541311775179e-05, "loss": 16.3964, "step": 2400 }, { "epoch": 0.7712, "grad_norm": 73.16317749023438, "learning_rate": 9.366729223810212e-05, "loss": 28.1211, "step": 2410 }, { "epoch": 0.7744, "grad_norm": 160.62786865234375, "learning_rate": 9.361917135845244e-05, "loss": 43.5907, "step": 2420 }, { "epoch": 0.7776, "grad_norm": 436.85064697265625, "learning_rate": 9.357105047880275e-05, "loss": 37.3683, "step": 2430 }, { "epoch": 0.7808, "grad_norm": 122.0400390625, "learning_rate": 9.352292959915307e-05, "loss": 18.7893, "step": 2440 }, { "epoch": 0.784, "grad_norm": 57.079673767089844, "learning_rate": 9.34748087195034e-05, "loss": 13.8807, "step": 2450 }, { "epoch": 0.7872, "grad_norm": 22.007753372192383, "learning_rate": 9.342668783985372e-05, "loss": 30.5707, "step": 2460 }, { "epoch": 0.7904, "grad_norm": 169.28701782226562, "learning_rate": 9.337856696020404e-05, "loss": 24.963, "step": 2470 }, { "epoch": 0.7936, "grad_norm": 59.228389739990234, "learning_rate": 9.333044608055436e-05, "loss": 18.7066, "step": 2480 }, { "epoch": 0.7968, "grad_norm": 177.11756896972656, "learning_rate": 9.328232520090468e-05, "loss": 32.5552, "step": 2490 }, { "epoch": 0.8, "grad_norm": 64.83467102050781, "learning_rate": 9.323420432125499e-05, "loss": 23.9, "step": 2500 }, { "epoch": 0.8032, "grad_norm": 84.04242706298828, "learning_rate": 9.318608344160532e-05, "loss": 15.9287, "step": 2510 }, { "epoch": 0.8064, "grad_norm": 139.7073974609375, "learning_rate": 9.313796256195564e-05, "loss": 16.9659, "step": 2520 }, { "epoch": 0.8096, "grad_norm": 308.4088134765625, "learning_rate": 9.308984168230595e-05, "loss": 21.4233, "step": 2530 }, { "epoch": 0.8128, "grad_norm": 267.69232177734375, "learning_rate": 9.304172080265627e-05, "loss": 22.2255, "step": 2540 }, { "epoch": 0.816, "grad_norm": 65.49011993408203, "learning_rate": 9.29935999230066e-05, "loss": 16.4713, "step": 2550 }, { "epoch": 0.8192, "grad_norm": 212.91941833496094, "learning_rate": 9.294547904335692e-05, "loss": 26.7151, "step": 2560 }, { "epoch": 0.8224, "grad_norm": 388.1001892089844, "learning_rate": 9.289735816370723e-05, "loss": 28.3798, "step": 2570 }, { "epoch": 0.8256, "grad_norm": 87.90804290771484, "learning_rate": 9.284923728405756e-05, "loss": 14.7084, "step": 2580 }, { "epoch": 0.8288, "grad_norm": 42.13310241699219, "learning_rate": 9.280111640440788e-05, "loss": 28.8467, "step": 2590 }, { "epoch": 0.832, "grad_norm": 212.66700744628906, "learning_rate": 9.27529955247582e-05, "loss": 19.1156, "step": 2600 }, { "epoch": 0.8352, "grad_norm": 98.9093246459961, "learning_rate": 9.270487464510852e-05, "loss": 27.698, "step": 2610 }, { "epoch": 0.8384, "grad_norm": 98.71244049072266, "learning_rate": 9.265675376545884e-05, "loss": 15.3549, "step": 2620 }, { "epoch": 0.8416, "grad_norm": 28.624130249023438, "learning_rate": 9.260863288580915e-05, "loss": 19.0902, "step": 2630 }, { "epoch": 0.8448, "grad_norm": 100.19732666015625, "learning_rate": 9.256051200615947e-05, "loss": 17.0595, "step": 2640 }, { "epoch": 0.848, "grad_norm": 159.31724548339844, "learning_rate": 9.25123911265098e-05, "loss": 14.5163, "step": 2650 }, { "epoch": 0.8512, "grad_norm": 111.39456176757812, "learning_rate": 9.246427024686012e-05, "loss": 26.1437, "step": 2660 }, { "epoch": 0.8544, "grad_norm": 99.31816864013672, "learning_rate": 9.241614936721043e-05, "loss": 26.4835, "step": 2670 }, { "epoch": 0.8576, "grad_norm": 134.98931884765625, "learning_rate": 9.236802848756075e-05, "loss": 23.0558, "step": 2680 }, { "epoch": 0.8608, "grad_norm": 127.19244384765625, "learning_rate": 9.231990760791108e-05, "loss": 23.4113, "step": 2690 }, { "epoch": 0.864, "grad_norm": 86.95189666748047, "learning_rate": 9.22717867282614e-05, "loss": 16.0389, "step": 2700 }, { "epoch": 0.8672, "grad_norm": 333.9284973144531, "learning_rate": 9.222366584861172e-05, "loss": 27.8456, "step": 2710 }, { "epoch": 0.8704, "grad_norm": 74.68511199951172, "learning_rate": 9.217554496896204e-05, "loss": 22.6915, "step": 2720 }, { "epoch": 0.8736, "grad_norm": 182.12875366210938, "learning_rate": 9.212742408931235e-05, "loss": 23.0509, "step": 2730 }, { "epoch": 0.8768, "grad_norm": 23.309284210205078, "learning_rate": 9.207930320966268e-05, "loss": 24.2455, "step": 2740 }, { "epoch": 0.88, "grad_norm": 100.41490173339844, "learning_rate": 9.2031182330013e-05, "loss": 20.7198, "step": 2750 }, { "epoch": 0.8832, "grad_norm": 179.3391876220703, "learning_rate": 9.198306145036332e-05, "loss": 18.6303, "step": 2760 }, { "epoch": 0.8864, "grad_norm": 280.602294921875, "learning_rate": 9.193494057071363e-05, "loss": 21.324, "step": 2770 }, { "epoch": 0.8896, "grad_norm": 96.27948760986328, "learning_rate": 9.188681969106395e-05, "loss": 20.0374, "step": 2780 }, { "epoch": 0.8928, "grad_norm": 112.8154067993164, "learning_rate": 9.183869881141428e-05, "loss": 15.0929, "step": 2790 }, { "epoch": 0.896, "grad_norm": 83.93199920654297, "learning_rate": 9.179057793176459e-05, "loss": 18.6614, "step": 2800 }, { "epoch": 0.8992, "grad_norm": 64.36573791503906, "learning_rate": 9.174245705211492e-05, "loss": 12.9272, "step": 2810 }, { "epoch": 0.9024, "grad_norm": 135.02993774414062, "learning_rate": 9.169433617246524e-05, "loss": 16.2795, "step": 2820 }, { "epoch": 0.9056, "grad_norm": 107.19320678710938, "learning_rate": 9.164621529281555e-05, "loss": 19.5821, "step": 2830 }, { "epoch": 0.9088, "grad_norm": 236.80886840820312, "learning_rate": 9.159809441316588e-05, "loss": 18.8913, "step": 2840 }, { "epoch": 0.912, "grad_norm": 87.29410552978516, "learning_rate": 9.15499735335162e-05, "loss": 19.1247, "step": 2850 }, { "epoch": 0.9152, "grad_norm": 133.20993041992188, "learning_rate": 9.150185265386652e-05, "loss": 22.8197, "step": 2860 }, { "epoch": 0.9184, "grad_norm": 24.7292537689209, "learning_rate": 9.145373177421683e-05, "loss": 13.8677, "step": 2870 }, { "epoch": 0.9216, "grad_norm": 65.50430297851562, "learning_rate": 9.140561089456716e-05, "loss": 18.5532, "step": 2880 }, { "epoch": 0.9248, "grad_norm": 138.55653381347656, "learning_rate": 9.135749001491748e-05, "loss": 18.4098, "step": 2890 }, { "epoch": 0.928, "grad_norm": 126.76996612548828, "learning_rate": 9.130936913526779e-05, "loss": 21.2464, "step": 2900 }, { "epoch": 0.9312, "grad_norm": 186.4793701171875, "learning_rate": 9.126124825561811e-05, "loss": 19.276, "step": 2910 }, { "epoch": 0.9344, "grad_norm": 166.5569610595703, "learning_rate": 9.121312737596844e-05, "loss": 21.0127, "step": 2920 }, { "epoch": 0.9376, "grad_norm": 279.2941589355469, "learning_rate": 9.116500649631877e-05, "loss": 16.0693, "step": 2930 }, { "epoch": 0.9408, "grad_norm": 202.0535430908203, "learning_rate": 9.111688561666908e-05, "loss": 21.0857, "step": 2940 }, { "epoch": 0.944, "grad_norm": 117.5189208984375, "learning_rate": 9.10687647370194e-05, "loss": 14.9517, "step": 2950 }, { "epoch": 0.9472, "grad_norm": 132.30059814453125, "learning_rate": 9.102064385736972e-05, "loss": 18.1718, "step": 2960 }, { "epoch": 0.9504, "grad_norm": 24.629837036132812, "learning_rate": 9.097252297772003e-05, "loss": 19.3962, "step": 2970 }, { "epoch": 0.9536, "grad_norm": 92.62589263916016, "learning_rate": 9.092440209807036e-05, "loss": 19.6293, "step": 2980 }, { "epoch": 0.9568, "grad_norm": 146.61370849609375, "learning_rate": 9.087628121842068e-05, "loss": 18.724, "step": 2990 }, { "epoch": 0.96, "grad_norm": 57.64328384399414, "learning_rate": 9.082816033877099e-05, "loss": 17.2081, "step": 3000 }, { "epoch": 0.9632, "grad_norm": 43.548831939697266, "learning_rate": 9.078003945912131e-05, "loss": 19.166, "step": 3010 }, { "epoch": 0.9664, "grad_norm": 41.80573272705078, "learning_rate": 9.073191857947164e-05, "loss": 13.5203, "step": 3020 }, { "epoch": 0.9696, "grad_norm": 139.7488555908203, "learning_rate": 9.068379769982195e-05, "loss": 12.6262, "step": 3030 }, { "epoch": 0.9728, "grad_norm": 51.73686981201172, "learning_rate": 9.063567682017228e-05, "loss": 17.4564, "step": 3040 }, { "epoch": 0.976, "grad_norm": 61.7197380065918, "learning_rate": 9.05875559405226e-05, "loss": 24.205, "step": 3050 }, { "epoch": 0.9792, "grad_norm": 229.83740234375, "learning_rate": 9.053943506087292e-05, "loss": 18.3157, "step": 3060 }, { "epoch": 0.9824, "grad_norm": 75.02870178222656, "learning_rate": 9.049131418122324e-05, "loss": 13.9577, "step": 3070 }, { "epoch": 0.9856, "grad_norm": 56.74820327758789, "learning_rate": 9.044319330157356e-05, "loss": 14.7094, "step": 3080 }, { "epoch": 0.9888, "grad_norm": 418.1128845214844, "learning_rate": 9.039507242192388e-05, "loss": 22.2907, "step": 3090 }, { "epoch": 0.992, "grad_norm": 75.13136291503906, "learning_rate": 9.034695154227419e-05, "loss": 20.1825, "step": 3100 }, { "epoch": 0.9952, "grad_norm": 191.5671844482422, "learning_rate": 9.029883066262451e-05, "loss": 18.452, "step": 3110 }, { "epoch": 0.9984, "grad_norm": 170.02565002441406, "learning_rate": 9.025070978297484e-05, "loss": 16.8112, "step": 3120 }, { "epoch": 1.0016, "grad_norm": 28.247150421142578, "learning_rate": 9.020258890332515e-05, "loss": 20.343, "step": 3130 }, { "epoch": 1.0048, "grad_norm": 160.43968200683594, "learning_rate": 9.015446802367547e-05, "loss": 18.3176, "step": 3140 }, { "epoch": 1.008, "grad_norm": 102.76715850830078, "learning_rate": 9.01063471440258e-05, "loss": 22.6881, "step": 3150 }, { "epoch": 1.0112, "grad_norm": 159.62071228027344, "learning_rate": 9.005822626437612e-05, "loss": 18.1613, "step": 3160 }, { "epoch": 1.0144, "grad_norm": 90.15779113769531, "learning_rate": 9.001010538472645e-05, "loss": 18.2287, "step": 3170 }, { "epoch": 1.0176, "grad_norm": 33.28681564331055, "learning_rate": 8.996198450507676e-05, "loss": 19.9894, "step": 3180 }, { "epoch": 1.0208, "grad_norm": 74.93292999267578, "learning_rate": 8.991386362542708e-05, "loss": 20.6377, "step": 3190 }, { "epoch": 1.024, "grad_norm": 62.466548919677734, "learning_rate": 8.986574274577739e-05, "loss": 15.5018, "step": 3200 }, { "epoch": 1.0272, "grad_norm": 210.1294403076172, "learning_rate": 8.981762186612772e-05, "loss": 27.1131, "step": 3210 }, { "epoch": 1.0304, "grad_norm": 577.5269165039062, "learning_rate": 8.976950098647804e-05, "loss": 33.2962, "step": 3220 }, { "epoch": 1.0336, "grad_norm": 171.1616668701172, "learning_rate": 8.972138010682835e-05, "loss": 18.4719, "step": 3230 }, { "epoch": 1.0368, "grad_norm": 265.5197448730469, "learning_rate": 8.967325922717867e-05, "loss": 18.8867, "step": 3240 }, { "epoch": 1.04, "grad_norm": 66.31884765625, "learning_rate": 8.962513834752899e-05, "loss": 20.626, "step": 3250 }, { "epoch": 1.0432, "grad_norm": 321.4315490722656, "learning_rate": 8.957701746787932e-05, "loss": 23.7732, "step": 3260 }, { "epoch": 1.0464, "grad_norm": 24.0924015045166, "learning_rate": 8.952889658822963e-05, "loss": 19.7098, "step": 3270 }, { "epoch": 1.0496, "grad_norm": 56.18558883666992, "learning_rate": 8.948077570857996e-05, "loss": 14.8469, "step": 3280 }, { "epoch": 1.0528, "grad_norm": 211.67799377441406, "learning_rate": 8.943265482893028e-05, "loss": 15.8526, "step": 3290 }, { "epoch": 1.056, "grad_norm": 29.273334503173828, "learning_rate": 8.93845339492806e-05, "loss": 21.7563, "step": 3300 }, { "epoch": 1.0592, "grad_norm": 34.773399353027344, "learning_rate": 8.933641306963092e-05, "loss": 20.5681, "step": 3310 }, { "epoch": 1.0624, "grad_norm": 128.06903076171875, "learning_rate": 8.928829218998124e-05, "loss": 17.9385, "step": 3320 }, { "epoch": 1.0656, "grad_norm": 53.50063705444336, "learning_rate": 8.924017131033155e-05, "loss": 17.889, "step": 3330 }, { "epoch": 1.0688, "grad_norm": 90.24520874023438, "learning_rate": 8.919205043068187e-05, "loss": 16.4801, "step": 3340 }, { "epoch": 1.072, "grad_norm": 48.26347732543945, "learning_rate": 8.91439295510322e-05, "loss": 12.4998, "step": 3350 }, { "epoch": 1.0752, "grad_norm": 226.14881896972656, "learning_rate": 8.909580867138252e-05, "loss": 17.8058, "step": 3360 }, { "epoch": 1.0784, "grad_norm": 240.18943786621094, "learning_rate": 8.904768779173283e-05, "loss": 24.4458, "step": 3370 }, { "epoch": 1.0816, "grad_norm": 256.17041015625, "learning_rate": 8.899956691208316e-05, "loss": 15.7776, "step": 3380 }, { "epoch": 1.0848, "grad_norm": 127.46939849853516, "learning_rate": 8.895144603243348e-05, "loss": 18.8639, "step": 3390 }, { "epoch": 1.088, "grad_norm": 150.3739013671875, "learning_rate": 8.89033251527838e-05, "loss": 21.9335, "step": 3400 }, { "epoch": 1.0912, "grad_norm": 39.739906311035156, "learning_rate": 8.885520427313412e-05, "loss": 24.4284, "step": 3410 }, { "epoch": 1.0944, "grad_norm": 42.24324035644531, "learning_rate": 8.880708339348444e-05, "loss": 21.3751, "step": 3420 }, { "epoch": 1.0976, "grad_norm": 80.6647720336914, "learning_rate": 8.875896251383475e-05, "loss": 14.6389, "step": 3430 }, { "epoch": 1.1008, "grad_norm": 198.7286376953125, "learning_rate": 8.871084163418508e-05, "loss": 17.9525, "step": 3440 }, { "epoch": 1.104, "grad_norm": 163.33241271972656, "learning_rate": 8.86627207545354e-05, "loss": 17.7861, "step": 3450 }, { "epoch": 1.1072, "grad_norm": 67.50294494628906, "learning_rate": 8.861459987488572e-05, "loss": 15.8094, "step": 3460 }, { "epoch": 1.1104, "grad_norm": 117.59965515136719, "learning_rate": 8.856647899523603e-05, "loss": 19.9422, "step": 3470 }, { "epoch": 1.1136, "grad_norm": 55.91126251220703, "learning_rate": 8.851835811558635e-05, "loss": 23.4646, "step": 3480 }, { "epoch": 1.1168, "grad_norm": 205.85218811035156, "learning_rate": 8.847023723593668e-05, "loss": 19.3409, "step": 3490 }, { "epoch": 1.12, "grad_norm": 99.56281280517578, "learning_rate": 8.842211635628699e-05, "loss": 14.2356, "step": 3500 }, { "epoch": 1.1232, "grad_norm": 108.86479187011719, "learning_rate": 8.837399547663732e-05, "loss": 13.2339, "step": 3510 }, { "epoch": 1.1264, "grad_norm": 191.55271911621094, "learning_rate": 8.832587459698764e-05, "loss": 20.5198, "step": 3520 }, { "epoch": 1.1296, "grad_norm": 171.61331176757812, "learning_rate": 8.827775371733795e-05, "loss": 19.1445, "step": 3530 }, { "epoch": 1.1328, "grad_norm": 243.066650390625, "learning_rate": 8.822963283768828e-05, "loss": 20.2084, "step": 3540 }, { "epoch": 1.1360000000000001, "grad_norm": 24.227903366088867, "learning_rate": 8.81815119580386e-05, "loss": 16.574, "step": 3550 }, { "epoch": 1.1392, "grad_norm": 140.19219970703125, "learning_rate": 8.813339107838892e-05, "loss": 11.8627, "step": 3560 }, { "epoch": 1.1424, "grad_norm": 25.654165267944336, "learning_rate": 8.808527019873923e-05, "loss": 11.637, "step": 3570 }, { "epoch": 1.1456, "grad_norm": 86.61192321777344, "learning_rate": 8.803714931908956e-05, "loss": 22.8422, "step": 3580 }, { "epoch": 1.1488, "grad_norm": 78.49467468261719, "learning_rate": 8.798902843943988e-05, "loss": 21.4586, "step": 3590 }, { "epoch": 1.152, "grad_norm": 49.73566818237305, "learning_rate": 8.794090755979019e-05, "loss": 13.4097, "step": 3600 }, { "epoch": 1.1552, "grad_norm": 118.85221862792969, "learning_rate": 8.789278668014051e-05, "loss": 19.4218, "step": 3610 }, { "epoch": 1.1584, "grad_norm": 44.99506378173828, "learning_rate": 8.784466580049084e-05, "loss": 10.6387, "step": 3620 }, { "epoch": 1.1616, "grad_norm": 73.66319274902344, "learning_rate": 8.779654492084117e-05, "loss": 20.6615, "step": 3630 }, { "epoch": 1.1648, "grad_norm": 162.18310546875, "learning_rate": 8.774842404119148e-05, "loss": 16.1592, "step": 3640 }, { "epoch": 1.168, "grad_norm": 138.8058319091797, "learning_rate": 8.77003031615418e-05, "loss": 21.1501, "step": 3650 }, { "epoch": 1.1712, "grad_norm": 81.72513580322266, "learning_rate": 8.765218228189212e-05, "loss": 18.0231, "step": 3660 }, { "epoch": 1.1743999999999999, "grad_norm": 54.952762603759766, "learning_rate": 8.760406140224243e-05, "loss": 15.1313, "step": 3670 }, { "epoch": 1.1776, "grad_norm": 153.96243286132812, "learning_rate": 8.755594052259276e-05, "loss": 12.756, "step": 3680 }, { "epoch": 1.1808, "grad_norm": 52.13134002685547, "learning_rate": 8.750781964294308e-05, "loss": 20.7255, "step": 3690 }, { "epoch": 1.184, "grad_norm": 150.67257690429688, "learning_rate": 8.745969876329339e-05, "loss": 13.2773, "step": 3700 }, { "epoch": 1.1872, "grad_norm": 65.9190673828125, "learning_rate": 8.741157788364371e-05, "loss": 19.1161, "step": 3710 }, { "epoch": 1.1904, "grad_norm": 162.93846130371094, "learning_rate": 8.736345700399404e-05, "loss": 13.5696, "step": 3720 }, { "epoch": 1.1936, "grad_norm": 72.31463623046875, "learning_rate": 8.731533612434435e-05, "loss": 11.577, "step": 3730 }, { "epoch": 1.1968, "grad_norm": 65.74739074707031, "learning_rate": 8.726721524469468e-05, "loss": 13.338, "step": 3740 }, { "epoch": 1.2, "grad_norm": 94.97151184082031, "learning_rate": 8.7219094365045e-05, "loss": 16.9566, "step": 3750 }, { "epoch": 1.2032, "grad_norm": 56.01218795776367, "learning_rate": 8.717097348539532e-05, "loss": 16.3883, "step": 3760 }, { "epoch": 1.2064, "grad_norm": 157.9155731201172, "learning_rate": 8.712285260574565e-05, "loss": 16.785, "step": 3770 }, { "epoch": 1.2096, "grad_norm": 89.81455993652344, "learning_rate": 8.707473172609596e-05, "loss": 19.2455, "step": 3780 }, { "epoch": 1.2128, "grad_norm": 167.51266479492188, "learning_rate": 8.702661084644628e-05, "loss": 15.4239, "step": 3790 }, { "epoch": 1.216, "grad_norm": 72.16557312011719, "learning_rate": 8.697848996679659e-05, "loss": 16.9929, "step": 3800 }, { "epoch": 1.2192, "grad_norm": 232.61080932617188, "learning_rate": 8.693036908714691e-05, "loss": 19.7777, "step": 3810 }, { "epoch": 1.2224, "grad_norm": 84.61380004882812, "learning_rate": 8.688224820749724e-05, "loss": 21.2231, "step": 3820 }, { "epoch": 1.2256, "grad_norm": 166.76876831054688, "learning_rate": 8.683412732784755e-05, "loss": 19.0521, "step": 3830 }, { "epoch": 1.2288000000000001, "grad_norm": 208.75253295898438, "learning_rate": 8.678600644819787e-05, "loss": 9.0883, "step": 3840 }, { "epoch": 1.232, "grad_norm": 39.65126419067383, "learning_rate": 8.67378855685482e-05, "loss": 16.3794, "step": 3850 }, { "epoch": 1.2352, "grad_norm": 150.71987915039062, "learning_rate": 8.668976468889852e-05, "loss": 17.0459, "step": 3860 }, { "epoch": 1.2384, "grad_norm": 47.50828552246094, "learning_rate": 8.664164380924885e-05, "loss": 20.5055, "step": 3870 }, { "epoch": 1.2416, "grad_norm": 129.40567016601562, "learning_rate": 8.659352292959916e-05, "loss": 14.9571, "step": 3880 }, { "epoch": 1.2448, "grad_norm": 62.31793212890625, "learning_rate": 8.654540204994948e-05, "loss": 11.0652, "step": 3890 }, { "epoch": 1.248, "grad_norm": 24.129987716674805, "learning_rate": 8.64972811702998e-05, "loss": 18.4998, "step": 3900 }, { "epoch": 1.2511999999999999, "grad_norm": 184.7411346435547, "learning_rate": 8.644916029065012e-05, "loss": 18.3784, "step": 3910 }, { "epoch": 1.2544, "grad_norm": 184.86978149414062, "learning_rate": 8.640103941100044e-05, "loss": 12.3013, "step": 3920 }, { "epoch": 1.2576, "grad_norm": 233.11602783203125, "learning_rate": 8.635291853135075e-05, "loss": 16.2423, "step": 3930 }, { "epoch": 1.2608, "grad_norm": 87.3406982421875, "learning_rate": 8.630479765170107e-05, "loss": 16.4923, "step": 3940 }, { "epoch": 1.264, "grad_norm": 313.2857666015625, "learning_rate": 8.625667677205139e-05, "loss": 15.6605, "step": 3950 }, { "epoch": 1.2671999999999999, "grad_norm": 433.8872985839844, "learning_rate": 8.620855589240172e-05, "loss": 19.1776, "step": 3960 }, { "epoch": 1.2704, "grad_norm": 161.95138549804688, "learning_rate": 8.616043501275205e-05, "loss": 10.7976, "step": 3970 }, { "epoch": 1.2736, "grad_norm": 40.16414260864258, "learning_rate": 8.611231413310236e-05, "loss": 11.6908, "step": 3980 }, { "epoch": 1.2768, "grad_norm": 114.56854248046875, "learning_rate": 8.606419325345268e-05, "loss": 13.705, "step": 3990 }, { "epoch": 1.28, "grad_norm": 156.34515380859375, "learning_rate": 8.601607237380301e-05, "loss": 16.7319, "step": 4000 }, { "epoch": 1.2832, "grad_norm": 85.69612121582031, "learning_rate": 8.596795149415332e-05, "loss": 15.025, "step": 4010 }, { "epoch": 1.2864, "grad_norm": 13.940338134765625, "learning_rate": 8.591983061450364e-05, "loss": 13.3147, "step": 4020 }, { "epoch": 1.2896, "grad_norm": 89.14633178710938, "learning_rate": 8.587170973485395e-05, "loss": 11.4895, "step": 4030 }, { "epoch": 1.2928, "grad_norm": 41.638023376464844, "learning_rate": 8.582358885520427e-05, "loss": 22.741, "step": 4040 }, { "epoch": 1.296, "grad_norm": 56.797183990478516, "learning_rate": 8.57754679755546e-05, "loss": 14.4201, "step": 4050 }, { "epoch": 1.2992, "grad_norm": 327.9649353027344, "learning_rate": 8.572734709590492e-05, "loss": 16.501, "step": 4060 }, { "epoch": 1.3024, "grad_norm": 44.324920654296875, "learning_rate": 8.567922621625523e-05, "loss": 16.5338, "step": 4070 }, { "epoch": 1.3056, "grad_norm": 142.70382690429688, "learning_rate": 8.563110533660556e-05, "loss": 11.7481, "step": 4080 }, { "epoch": 1.3088, "grad_norm": 290.443359375, "learning_rate": 8.558298445695588e-05, "loss": 19.714, "step": 4090 }, { "epoch": 1.312, "grad_norm": 163.5287628173828, "learning_rate": 8.553486357730621e-05, "loss": 16.3256, "step": 4100 }, { "epoch": 1.3152, "grad_norm": 259.9360046386719, "learning_rate": 8.548674269765652e-05, "loss": 19.712, "step": 4110 }, { "epoch": 1.3184, "grad_norm": 197.34959411621094, "learning_rate": 8.543862181800684e-05, "loss": 11.5704, "step": 4120 }, { "epoch": 1.3216, "grad_norm": 284.62811279296875, "learning_rate": 8.539050093835715e-05, "loss": 16.0154, "step": 4130 }, { "epoch": 1.3248, "grad_norm": 115.84011840820312, "learning_rate": 8.534238005870747e-05, "loss": 15.6927, "step": 4140 }, { "epoch": 1.328, "grad_norm": 38.8075065612793, "learning_rate": 8.52942591790578e-05, "loss": 15.451, "step": 4150 }, { "epoch": 1.3312, "grad_norm": 51.029197692871094, "learning_rate": 8.524613829940812e-05, "loss": 13.48, "step": 4160 }, { "epoch": 1.3344, "grad_norm": 222.28623962402344, "learning_rate": 8.519801741975843e-05, "loss": 12.3188, "step": 4170 }, { "epoch": 1.3376000000000001, "grad_norm": 54.21427917480469, "learning_rate": 8.514989654010875e-05, "loss": 15.1296, "step": 4180 }, { "epoch": 1.3408, "grad_norm": 210.58763122558594, "learning_rate": 8.510177566045908e-05, "loss": 16.5921, "step": 4190 }, { "epoch": 1.3439999999999999, "grad_norm": 120.01280212402344, "learning_rate": 8.50536547808094e-05, "loss": 15.9001, "step": 4200 }, { "epoch": 1.3472, "grad_norm": 204.13780212402344, "learning_rate": 8.500553390115972e-05, "loss": 15.023, "step": 4210 }, { "epoch": 1.3504, "grad_norm": 93.78736877441406, "learning_rate": 8.495741302151004e-05, "loss": 11.2663, "step": 4220 }, { "epoch": 1.3536000000000001, "grad_norm": 103.1590576171875, "learning_rate": 8.490929214186035e-05, "loss": 10.2798, "step": 4230 }, { "epoch": 1.3568, "grad_norm": 299.2051086425781, "learning_rate": 8.486117126221068e-05, "loss": 13.5543, "step": 4240 }, { "epoch": 1.3599999999999999, "grad_norm": 373.3537292480469, "learning_rate": 8.4813050382561e-05, "loss": 16.2461, "step": 4250 }, { "epoch": 1.3632, "grad_norm": 156.3833465576172, "learning_rate": 8.476492950291132e-05, "loss": 15.5678, "step": 4260 }, { "epoch": 1.3664, "grad_norm": 127.47845458984375, "learning_rate": 8.471680862326163e-05, "loss": 23.0998, "step": 4270 }, { "epoch": 1.3696, "grad_norm": 112.68706512451172, "learning_rate": 8.466868774361195e-05, "loss": 13.1495, "step": 4280 }, { "epoch": 1.3728, "grad_norm": 92.29225158691406, "learning_rate": 8.462056686396228e-05, "loss": 9.5102, "step": 4290 }, { "epoch": 1.376, "grad_norm": 281.71502685546875, "learning_rate": 8.45724459843126e-05, "loss": 19.3678, "step": 4300 }, { "epoch": 1.3792, "grad_norm": 138.14439392089844, "learning_rate": 8.452432510466291e-05, "loss": 11.9201, "step": 4310 }, { "epoch": 1.3824, "grad_norm": 167.27597045898438, "learning_rate": 8.447620422501324e-05, "loss": 11.4982, "step": 4320 }, { "epoch": 1.3856, "grad_norm": 79.44098663330078, "learning_rate": 8.442808334536357e-05, "loss": 11.6135, "step": 4330 }, { "epoch": 1.3888, "grad_norm": 129.193359375, "learning_rate": 8.437996246571388e-05, "loss": 16.6837, "step": 4340 }, { "epoch": 1.392, "grad_norm": 318.9241027832031, "learning_rate": 8.43318415860642e-05, "loss": 14.042, "step": 4350 }, { "epoch": 1.3952, "grad_norm": 341.84051513671875, "learning_rate": 8.428372070641452e-05, "loss": 17.6114, "step": 4360 }, { "epoch": 1.3984, "grad_norm": 91.1358413696289, "learning_rate": 8.423559982676483e-05, "loss": 15.0694, "step": 4370 }, { "epoch": 1.4016, "grad_norm": 225.77487182617188, "learning_rate": 8.418747894711516e-05, "loss": 12.3426, "step": 4380 }, { "epoch": 1.4048, "grad_norm": 218.9681396484375, "learning_rate": 8.413935806746548e-05, "loss": 14.3222, "step": 4390 }, { "epoch": 1.408, "grad_norm": 220.1842498779297, "learning_rate": 8.40912371878158e-05, "loss": 21.1482, "step": 4400 }, { "epoch": 1.4112, "grad_norm": 578.4465942382812, "learning_rate": 8.404311630816611e-05, "loss": 19.2856, "step": 4410 }, { "epoch": 1.4144, "grad_norm": 19.313934326171875, "learning_rate": 8.399499542851643e-05, "loss": 10.2682, "step": 4420 }, { "epoch": 1.4176, "grad_norm": 135.8042755126953, "learning_rate": 8.394687454886675e-05, "loss": 10.3577, "step": 4430 }, { "epoch": 1.4208, "grad_norm": 62.049766540527344, "learning_rate": 8.389875366921708e-05, "loss": 11.0704, "step": 4440 }, { "epoch": 1.424, "grad_norm": 54.49662399291992, "learning_rate": 8.38506327895674e-05, "loss": 11.2212, "step": 4450 }, { "epoch": 1.4272, "grad_norm": 133.63314819335938, "learning_rate": 8.380251190991772e-05, "loss": 14.5858, "step": 4460 }, { "epoch": 1.4304000000000001, "grad_norm": 123.1229476928711, "learning_rate": 8.375439103026805e-05, "loss": 17.0966, "step": 4470 }, { "epoch": 1.4336, "grad_norm": 158.63372802734375, "learning_rate": 8.370627015061836e-05, "loss": 17.3195, "step": 4480 }, { "epoch": 1.4368, "grad_norm": 46.22249984741211, "learning_rate": 8.365814927096868e-05, "loss": 11.6803, "step": 4490 }, { "epoch": 1.44, "grad_norm": 18.912309646606445, "learning_rate": 8.3610028391319e-05, "loss": 14.2524, "step": 4500 }, { "epoch": 1.4432, "grad_norm": 50.050113677978516, "learning_rate": 8.356190751166931e-05, "loss": 9.8441, "step": 4510 }, { "epoch": 1.4464000000000001, "grad_norm": 89.43431091308594, "learning_rate": 8.351378663201964e-05, "loss": 19.4293, "step": 4520 }, { "epoch": 1.4496, "grad_norm": 78.4156265258789, "learning_rate": 8.346566575236995e-05, "loss": 13.2139, "step": 4530 }, { "epoch": 1.4527999999999999, "grad_norm": 146.5263214111328, "learning_rate": 8.341754487272027e-05, "loss": 14.3972, "step": 4540 }, { "epoch": 1.456, "grad_norm": 77.80128479003906, "learning_rate": 8.33694239930706e-05, "loss": 12.2828, "step": 4550 }, { "epoch": 1.4592, "grad_norm": 61.81273651123047, "learning_rate": 8.332130311342092e-05, "loss": 16.5848, "step": 4560 }, { "epoch": 1.4624, "grad_norm": 81.45936584472656, "learning_rate": 8.327318223377125e-05, "loss": 17.5102, "step": 4570 }, { "epoch": 1.4656, "grad_norm": 85.84539031982422, "learning_rate": 8.322506135412156e-05, "loss": 11.685, "step": 4580 }, { "epoch": 1.4687999999999999, "grad_norm": 283.8833923339844, "learning_rate": 8.317694047447188e-05, "loss": 18.6031, "step": 4590 }, { "epoch": 1.472, "grad_norm": 127.67681884765625, "learning_rate": 8.31288195948222e-05, "loss": 12.9313, "step": 4600 }, { "epoch": 1.4752, "grad_norm": 57.71700668334961, "learning_rate": 8.308069871517252e-05, "loss": 11.3301, "step": 4610 }, { "epoch": 1.4784, "grad_norm": 210.92361450195312, "learning_rate": 8.303257783552284e-05, "loss": 11.0869, "step": 4620 }, { "epoch": 1.4816, "grad_norm": 129.86866760253906, "learning_rate": 8.298445695587315e-05, "loss": 11.6906, "step": 4630 }, { "epoch": 1.4848, "grad_norm": 158.3607940673828, "learning_rate": 8.293633607622347e-05, "loss": 13.9119, "step": 4640 }, { "epoch": 1.488, "grad_norm": 123.92117309570312, "learning_rate": 8.288821519657379e-05, "loss": 8.3928, "step": 4650 }, { "epoch": 1.4912, "grad_norm": 212.11981201171875, "learning_rate": 8.284009431692412e-05, "loss": 15.3008, "step": 4660 }, { "epoch": 1.4944, "grad_norm": 20.583595275878906, "learning_rate": 8.279197343727445e-05, "loss": 12.5589, "step": 4670 }, { "epoch": 1.4976, "grad_norm": 169.9720916748047, "learning_rate": 8.274385255762476e-05, "loss": 12.0134, "step": 4680 }, { "epoch": 1.5008, "grad_norm": 84.78213500976562, "learning_rate": 8.269573167797508e-05, "loss": 14.0137, "step": 4690 }, { "epoch": 1.504, "grad_norm": 137.66647338867188, "learning_rate": 8.26476107983254e-05, "loss": 12.6496, "step": 4700 }, { "epoch": 1.5072, "grad_norm": 64.67051696777344, "learning_rate": 8.259948991867572e-05, "loss": 10.9616, "step": 4710 }, { "epoch": 1.5104, "grad_norm": 75.29645538330078, "learning_rate": 8.255136903902604e-05, "loss": 15.7158, "step": 4720 }, { "epoch": 1.5135999999999998, "grad_norm": 248.16354370117188, "learning_rate": 8.250324815937636e-05, "loss": 17.8274, "step": 4730 }, { "epoch": 1.5168, "grad_norm": 22.718576431274414, "learning_rate": 8.245512727972667e-05, "loss": 12.8601, "step": 4740 }, { "epoch": 1.52, "grad_norm": 199.01705932617188, "learning_rate": 8.2407006400077e-05, "loss": 11.5606, "step": 4750 }, { "epoch": 1.5232, "grad_norm": 110.53349304199219, "learning_rate": 8.235888552042732e-05, "loss": 16.377, "step": 4760 }, { "epoch": 1.5264, "grad_norm": 119.91397094726562, "learning_rate": 8.231076464077763e-05, "loss": 11.5748, "step": 4770 }, { "epoch": 1.5295999999999998, "grad_norm": 62.261474609375, "learning_rate": 8.226264376112796e-05, "loss": 16.8108, "step": 4780 }, { "epoch": 1.5328, "grad_norm": 430.2541198730469, "learning_rate": 8.221452288147828e-05, "loss": 26.1672, "step": 4790 }, { "epoch": 1.536, "grad_norm": 21.06357192993164, "learning_rate": 8.216640200182861e-05, "loss": 8.549, "step": 4800 }, { "epoch": 1.5392000000000001, "grad_norm": 168.61270141601562, "learning_rate": 8.211828112217892e-05, "loss": 13.2334, "step": 4810 }, { "epoch": 1.5424, "grad_norm": 199.2380828857422, "learning_rate": 8.207016024252924e-05, "loss": 19.5848, "step": 4820 }, { "epoch": 1.5455999999999999, "grad_norm": 79.18196868896484, "learning_rate": 8.202203936287956e-05, "loss": 9.7126, "step": 4830 }, { "epoch": 1.5488, "grad_norm": 253.09219360351562, "learning_rate": 8.197391848322987e-05, "loss": 10.9486, "step": 4840 }, { "epoch": 1.552, "grad_norm": 315.3786315917969, "learning_rate": 8.19257976035802e-05, "loss": 18.2413, "step": 4850 }, { "epoch": 1.5552000000000001, "grad_norm": 268.1244812011719, "learning_rate": 8.187767672393052e-05, "loss": 18.7632, "step": 4860 }, { "epoch": 1.5584, "grad_norm": 47.586700439453125, "learning_rate": 8.182955584428083e-05, "loss": 6.8808, "step": 4870 }, { "epoch": 1.5615999999999999, "grad_norm": 45.04067611694336, "learning_rate": 8.178143496463115e-05, "loss": 14.237, "step": 4880 }, { "epoch": 1.5648, "grad_norm": 46.79035186767578, "learning_rate": 8.173331408498148e-05, "loss": 10.811, "step": 4890 }, { "epoch": 1.568, "grad_norm": 55.57338333129883, "learning_rate": 8.16851932053318e-05, "loss": 10.1705, "step": 4900 }, { "epoch": 1.5712000000000002, "grad_norm": 23.716787338256836, "learning_rate": 8.163707232568212e-05, "loss": 19.2927, "step": 4910 }, { "epoch": 1.5744, "grad_norm": 193.51046752929688, "learning_rate": 8.158895144603244e-05, "loss": 15.805, "step": 4920 }, { "epoch": 1.5776, "grad_norm": 121.63523864746094, "learning_rate": 8.154083056638276e-05, "loss": 13.0191, "step": 4930 }, { "epoch": 1.5808, "grad_norm": 25.50160789489746, "learning_rate": 8.149270968673308e-05, "loss": 19.8822, "step": 4940 }, { "epoch": 1.584, "grad_norm": 83.73605346679688, "learning_rate": 8.14445888070834e-05, "loss": 15.3651, "step": 4950 }, { "epoch": 1.5872000000000002, "grad_norm": 38.56587219238281, "learning_rate": 8.139646792743372e-05, "loss": 11.4676, "step": 4960 }, { "epoch": 1.5904, "grad_norm": 39.88987350463867, "learning_rate": 8.134834704778403e-05, "loss": 8.4299, "step": 4970 }, { "epoch": 1.5936, "grad_norm": 36.31617736816406, "learning_rate": 8.130022616813435e-05, "loss": 12.0971, "step": 4980 }, { "epoch": 1.5968, "grad_norm": 353.7848205566406, "learning_rate": 8.125210528848468e-05, "loss": 20.2348, "step": 4990 }, { "epoch": 1.6, "grad_norm": 60.40487289428711, "learning_rate": 8.1203984408835e-05, "loss": 9.8344, "step": 5000 }, { "epoch": 1.6032, "grad_norm": 57.07439041137695, "learning_rate": 8.115586352918531e-05, "loss": 15.2207, "step": 5010 }, { "epoch": 1.6064, "grad_norm": 46.06114196777344, "learning_rate": 8.110774264953564e-05, "loss": 17.9066, "step": 5020 }, { "epoch": 1.6096, "grad_norm": 51.93063735961914, "learning_rate": 8.105962176988597e-05, "loss": 16.0372, "step": 5030 }, { "epoch": 1.6128, "grad_norm": 176.78553771972656, "learning_rate": 8.101150089023628e-05, "loss": 13.6432, "step": 5040 }, { "epoch": 1.616, "grad_norm": 139.7910919189453, "learning_rate": 8.09633800105866e-05, "loss": 11.4176, "step": 5050 }, { "epoch": 1.6192, "grad_norm": 210.73585510253906, "learning_rate": 8.091525913093692e-05, "loss": 7.967, "step": 5060 }, { "epoch": 1.6223999999999998, "grad_norm": 19.885711669921875, "learning_rate": 8.086713825128723e-05, "loss": 10.6299, "step": 5070 }, { "epoch": 1.6256, "grad_norm": 95.74657440185547, "learning_rate": 8.081901737163756e-05, "loss": 12.9132, "step": 5080 }, { "epoch": 1.6288, "grad_norm": 62.29002380371094, "learning_rate": 8.077089649198788e-05, "loss": 8.7284, "step": 5090 }, { "epoch": 1.6320000000000001, "grad_norm": 50.070159912109375, "learning_rate": 8.07227756123382e-05, "loss": 11.1506, "step": 5100 }, { "epoch": 1.6352, "grad_norm": 159.37423706054688, "learning_rate": 8.067465473268851e-05, "loss": 23.2039, "step": 5110 }, { "epoch": 1.6383999999999999, "grad_norm": 118.30895233154297, "learning_rate": 8.062653385303883e-05, "loss": 9.3489, "step": 5120 }, { "epoch": 1.6416, "grad_norm": 94.68067932128906, "learning_rate": 8.057841297338916e-05, "loss": 10.3403, "step": 5130 }, { "epoch": 1.6448, "grad_norm": 99.9928970336914, "learning_rate": 8.053029209373948e-05, "loss": 12.7561, "step": 5140 }, { "epoch": 1.6480000000000001, "grad_norm": 66.11470031738281, "learning_rate": 8.04821712140898e-05, "loss": 15.6002, "step": 5150 }, { "epoch": 1.6512, "grad_norm": 123.01483154296875, "learning_rate": 8.043405033444012e-05, "loss": 13.9927, "step": 5160 }, { "epoch": 1.6543999999999999, "grad_norm": 167.7794647216797, "learning_rate": 8.038592945479045e-05, "loss": 13.4633, "step": 5170 }, { "epoch": 1.6576, "grad_norm": 82.57764434814453, "learning_rate": 8.033780857514076e-05, "loss": 6.405, "step": 5180 }, { "epoch": 1.6608, "grad_norm": 217.13636779785156, "learning_rate": 8.028968769549108e-05, "loss": 8.4071, "step": 5190 }, { "epoch": 1.6640000000000001, "grad_norm": 144.3599395751953, "learning_rate": 8.02415668158414e-05, "loss": 27.8519, "step": 5200 }, { "epoch": 1.6672, "grad_norm": 123.50732421875, "learning_rate": 8.019344593619171e-05, "loss": 13.6996, "step": 5210 }, { "epoch": 1.6703999999999999, "grad_norm": 214.61167907714844, "learning_rate": 8.014532505654204e-05, "loss": 13.5145, "step": 5220 }, { "epoch": 1.6736, "grad_norm": 98.19036102294922, "learning_rate": 8.009720417689236e-05, "loss": 7.3298, "step": 5230 }, { "epoch": 1.6768, "grad_norm": 318.9629211425781, "learning_rate": 8.004908329724267e-05, "loss": 15.3252, "step": 5240 }, { "epoch": 1.6800000000000002, "grad_norm": 114.79318237304688, "learning_rate": 8.0000962417593e-05, "loss": 8.879, "step": 5250 }, { "epoch": 1.6832, "grad_norm": 93.5780029296875, "learning_rate": 7.995284153794332e-05, "loss": 7.1264, "step": 5260 }, { "epoch": 1.6864, "grad_norm": 151.0817108154297, "learning_rate": 7.990472065829365e-05, "loss": 12.6877, "step": 5270 }, { "epoch": 1.6896, "grad_norm": 398.4906921386719, "learning_rate": 7.985659977864396e-05, "loss": 10.0716, "step": 5280 }, { "epoch": 1.6928, "grad_norm": 252.71217346191406, "learning_rate": 7.980847889899428e-05, "loss": 14.1426, "step": 5290 }, { "epoch": 1.696, "grad_norm": 25.260385513305664, "learning_rate": 7.97603580193446e-05, "loss": 8.477, "step": 5300 }, { "epoch": 1.6992, "grad_norm": 403.61474609375, "learning_rate": 7.971223713969491e-05, "loss": 24.3042, "step": 5310 }, { "epoch": 1.7024, "grad_norm": 106.37423706054688, "learning_rate": 7.966411626004524e-05, "loss": 10.2918, "step": 5320 }, { "epoch": 1.7056, "grad_norm": 75.35301971435547, "learning_rate": 7.961599538039556e-05, "loss": 5.7908, "step": 5330 }, { "epoch": 1.7088, "grad_norm": 105.8494873046875, "learning_rate": 7.956787450074587e-05, "loss": 8.7775, "step": 5340 }, { "epoch": 1.712, "grad_norm": 107.4222183227539, "learning_rate": 7.951975362109619e-05, "loss": 11.0194, "step": 5350 }, { "epoch": 1.7151999999999998, "grad_norm": 132.2693634033203, "learning_rate": 7.947163274144652e-05, "loss": 15.9559, "step": 5360 }, { "epoch": 1.7184, "grad_norm": 16.42701530456543, "learning_rate": 7.942351186179685e-05, "loss": 7.3651, "step": 5370 }, { "epoch": 1.7216, "grad_norm": 101.45223999023438, "learning_rate": 7.937539098214716e-05, "loss": 9.4495, "step": 5380 }, { "epoch": 1.7248, "grad_norm": 366.22613525390625, "learning_rate": 7.932727010249748e-05, "loss": 18.8372, "step": 5390 }, { "epoch": 1.728, "grad_norm": 243.88165283203125, "learning_rate": 7.92791492228478e-05, "loss": 16.7303, "step": 5400 }, { "epoch": 1.7311999999999999, "grad_norm": 58.66535568237305, "learning_rate": 7.923102834319812e-05, "loss": 20.8765, "step": 5410 }, { "epoch": 1.7344, "grad_norm": 328.5350036621094, "learning_rate": 7.918290746354844e-05, "loss": 13.662, "step": 5420 }, { "epoch": 1.7376, "grad_norm": 76.87786102294922, "learning_rate": 7.913478658389876e-05, "loss": 9.665, "step": 5430 }, { "epoch": 1.7408000000000001, "grad_norm": 43.58374786376953, "learning_rate": 7.908666570424907e-05, "loss": 15.5587, "step": 5440 }, { "epoch": 1.744, "grad_norm": 293.5721130371094, "learning_rate": 7.903854482459939e-05, "loss": 11.5139, "step": 5450 }, { "epoch": 1.7471999999999999, "grad_norm": 235.2281494140625, "learning_rate": 7.899042394494972e-05, "loss": 12.9634, "step": 5460 }, { "epoch": 1.7504, "grad_norm": 302.9085693359375, "learning_rate": 7.894230306530003e-05, "loss": 15.1702, "step": 5470 }, { "epoch": 1.7536, "grad_norm": 145.3501434326172, "learning_rate": 7.889418218565036e-05, "loss": 18.516, "step": 5480 }, { "epoch": 1.7568000000000001, "grad_norm": 193.26490783691406, "learning_rate": 7.884606130600068e-05, "loss": 10.5385, "step": 5490 }, { "epoch": 1.76, "grad_norm": 93.55962371826172, "learning_rate": 7.879794042635101e-05, "loss": 10.7848, "step": 5500 }, { "epoch": 1.7631999999999999, "grad_norm": 145.73129272460938, "learning_rate": 7.874981954670132e-05, "loss": 23.1037, "step": 5510 }, { "epoch": 1.7664, "grad_norm": 195.6656951904297, "learning_rate": 7.870169866705164e-05, "loss": 11.4764, "step": 5520 }, { "epoch": 1.7696, "grad_norm": 82.76277160644531, "learning_rate": 7.865357778740196e-05, "loss": 7.373, "step": 5530 }, { "epoch": 1.7728000000000002, "grad_norm": 15.001042366027832, "learning_rate": 7.860545690775227e-05, "loss": 6.8925, "step": 5540 }, { "epoch": 1.776, "grad_norm": 509.9377746582031, "learning_rate": 7.85573360281026e-05, "loss": 24.2378, "step": 5550 }, { "epoch": 1.7792, "grad_norm": 316.38140869140625, "learning_rate": 7.850921514845292e-05, "loss": 12.8201, "step": 5560 }, { "epoch": 1.7824, "grad_norm": 174.23080444335938, "learning_rate": 7.846109426880323e-05, "loss": 11.2291, "step": 5570 }, { "epoch": 1.7856, "grad_norm": 129.2643585205078, "learning_rate": 7.841297338915355e-05, "loss": 14.9766, "step": 5580 }, { "epoch": 1.7888, "grad_norm": 182.66383361816406, "learning_rate": 7.836485250950388e-05, "loss": 14.7635, "step": 5590 }, { "epoch": 1.792, "grad_norm": 94.5778579711914, "learning_rate": 7.83167316298542e-05, "loss": 18.0659, "step": 5600 }, { "epoch": 1.7952, "grad_norm": 316.46533203125, "learning_rate": 7.826861075020452e-05, "loss": 7.6631, "step": 5610 }, { "epoch": 1.7984, "grad_norm": 89.08241271972656, "learning_rate": 7.822048987055484e-05, "loss": 12.239, "step": 5620 }, { "epoch": 1.8016, "grad_norm": 199.45790100097656, "learning_rate": 7.817236899090516e-05, "loss": 10.6874, "step": 5630 }, { "epoch": 1.8048, "grad_norm": 400.2660217285156, "learning_rate": 7.812424811125549e-05, "loss": 20.561, "step": 5640 }, { "epoch": 1.808, "grad_norm": 355.1072082519531, "learning_rate": 7.80761272316058e-05, "loss": 14.0241, "step": 5650 }, { "epoch": 1.8112, "grad_norm": 236.64321899414062, "learning_rate": 7.802800635195612e-05, "loss": 15.7424, "step": 5660 }, { "epoch": 1.8144, "grad_norm": 358.04168701171875, "learning_rate": 7.797988547230643e-05, "loss": 13.1021, "step": 5670 }, { "epoch": 1.8176, "grad_norm": 82.75016021728516, "learning_rate": 7.793176459265675e-05, "loss": 8.3815, "step": 5680 }, { "epoch": 1.8208, "grad_norm": 152.54981994628906, "learning_rate": 7.788364371300708e-05, "loss": 7.1334, "step": 5690 }, { "epoch": 1.8239999999999998, "grad_norm": 52.3581428527832, "learning_rate": 7.78355228333574e-05, "loss": 12.3181, "step": 5700 }, { "epoch": 1.8272, "grad_norm": 40.6826286315918, "learning_rate": 7.778740195370772e-05, "loss": 8.4415, "step": 5710 }, { "epoch": 1.8304, "grad_norm": 28.89336585998535, "learning_rate": 7.773928107405804e-05, "loss": 8.5133, "step": 5720 }, { "epoch": 1.8336000000000001, "grad_norm": 147.1861114501953, "learning_rate": 7.769116019440836e-05, "loss": 8.8364, "step": 5730 }, { "epoch": 1.8368, "grad_norm": 38.074947357177734, "learning_rate": 7.764303931475869e-05, "loss": 11.028, "step": 5740 }, { "epoch": 1.8399999999999999, "grad_norm": 106.05870056152344, "learning_rate": 7.7594918435109e-05, "loss": 13.719, "step": 5750 }, { "epoch": 1.8432, "grad_norm": 104.74298858642578, "learning_rate": 7.754679755545932e-05, "loss": 18.2022, "step": 5760 }, { "epoch": 1.8464, "grad_norm": 83.45960235595703, "learning_rate": 7.749867667580963e-05, "loss": 7.8557, "step": 5770 }, { "epoch": 1.8496000000000001, "grad_norm": 292.71502685546875, "learning_rate": 7.745055579615996e-05, "loss": 12.3575, "step": 5780 }, { "epoch": 1.8528, "grad_norm": 79.79765319824219, "learning_rate": 7.740243491651028e-05, "loss": 7.6825, "step": 5790 }, { "epoch": 1.8559999999999999, "grad_norm": 165.64764404296875, "learning_rate": 7.73543140368606e-05, "loss": 10.9088, "step": 5800 }, { "epoch": 1.8592, "grad_norm": 157.21531677246094, "learning_rate": 7.730619315721091e-05, "loss": 11.7163, "step": 5810 }, { "epoch": 1.8624, "grad_norm": 63.153770446777344, "learning_rate": 7.725807227756124e-05, "loss": 9.0825, "step": 5820 }, { "epoch": 1.8656000000000001, "grad_norm": 105.34776306152344, "learning_rate": 7.720995139791156e-05, "loss": 6.3151, "step": 5830 }, { "epoch": 1.8688, "grad_norm": 119.96544647216797, "learning_rate": 7.716183051826189e-05, "loss": 7.3588, "step": 5840 }, { "epoch": 1.8719999999999999, "grad_norm": 159.51512145996094, "learning_rate": 7.71137096386122e-05, "loss": 10.4566, "step": 5850 }, { "epoch": 1.8752, "grad_norm": 143.95925903320312, "learning_rate": 7.706558875896252e-05, "loss": 12.0627, "step": 5860 }, { "epoch": 1.8784, "grad_norm": 359.7722473144531, "learning_rate": 7.701746787931283e-05, "loss": 16.8758, "step": 5870 }, { "epoch": 1.8816000000000002, "grad_norm": 115.46981811523438, "learning_rate": 7.696934699966316e-05, "loss": 10.9226, "step": 5880 }, { "epoch": 1.8848, "grad_norm": 45.141239166259766, "learning_rate": 7.692122612001348e-05, "loss": 16.9141, "step": 5890 }, { "epoch": 1.888, "grad_norm": 504.2868347167969, "learning_rate": 7.68731052403638e-05, "loss": 16.3973, "step": 5900 }, { "epoch": 1.8912, "grad_norm": 12.496575355529785, "learning_rate": 7.682498436071411e-05, "loss": 11.9035, "step": 5910 }, { "epoch": 1.8944, "grad_norm": 285.81646728515625, "learning_rate": 7.677686348106444e-05, "loss": 10.2374, "step": 5920 }, { "epoch": 1.8976, "grad_norm": 381.9243469238281, "learning_rate": 7.672874260141476e-05, "loss": 14.271, "step": 5930 }, { "epoch": 1.9008, "grad_norm": 142.6376190185547, "learning_rate": 7.668062172176507e-05, "loss": 9.3042, "step": 5940 }, { "epoch": 1.904, "grad_norm": 240.14691162109375, "learning_rate": 7.66325008421154e-05, "loss": 16.9515, "step": 5950 }, { "epoch": 1.9072, "grad_norm": 532.143310546875, "learning_rate": 7.658437996246572e-05, "loss": 17.9042, "step": 5960 }, { "epoch": 1.9104, "grad_norm": 45.8946533203125, "learning_rate": 7.653625908281605e-05, "loss": 14.6533, "step": 5970 }, { "epoch": 1.9136, "grad_norm": 724.5422973632812, "learning_rate": 7.648813820316636e-05, "loss": 19.5838, "step": 5980 }, { "epoch": 1.9167999999999998, "grad_norm": 379.60247802734375, "learning_rate": 7.644001732351668e-05, "loss": 16.9698, "step": 5990 }, { "epoch": 1.92, "grad_norm": 58.040245056152344, "learning_rate": 7.6391896443867e-05, "loss": 17.4524, "step": 6000 }, { "epoch": 1.9232, "grad_norm": 379.4302978515625, "learning_rate": 7.634377556421731e-05, "loss": 13.6254, "step": 6010 }, { "epoch": 1.9264000000000001, "grad_norm": 141.63548278808594, "learning_rate": 7.629565468456764e-05, "loss": 8.9876, "step": 6020 }, { "epoch": 1.9296, "grad_norm": 390.8727111816406, "learning_rate": 7.624753380491796e-05, "loss": 14.0292, "step": 6030 }, { "epoch": 1.9327999999999999, "grad_norm": 308.1633605957031, "learning_rate": 7.619941292526827e-05, "loss": 15.0683, "step": 6040 }, { "epoch": 1.936, "grad_norm": 75.39464569091797, "learning_rate": 7.615129204561859e-05, "loss": 6.7009, "step": 6050 }, { "epoch": 1.9392, "grad_norm": 186.804443359375, "learning_rate": 7.610317116596892e-05, "loss": 13.6385, "step": 6060 }, { "epoch": 1.9424000000000001, "grad_norm": 44.96998596191406, "learning_rate": 7.605505028631925e-05, "loss": 8.7478, "step": 6070 }, { "epoch": 1.9456, "grad_norm": 208.7914276123047, "learning_rate": 7.600692940666956e-05, "loss": 6.0505, "step": 6080 }, { "epoch": 1.9487999999999999, "grad_norm": 68.26963806152344, "learning_rate": 7.595880852701988e-05, "loss": 21.5444, "step": 6090 }, { "epoch": 1.952, "grad_norm": 113.08411407470703, "learning_rate": 7.59106876473702e-05, "loss": 13.2148, "step": 6100 }, { "epoch": 1.9552, "grad_norm": 110.25634002685547, "learning_rate": 7.586256676772052e-05, "loss": 29.8494, "step": 6110 }, { "epoch": 1.9584000000000001, "grad_norm": 364.66357421875, "learning_rate": 7.581444588807084e-05, "loss": 9.8007, "step": 6120 }, { "epoch": 1.9616, "grad_norm": 237.5818328857422, "learning_rate": 7.576632500842116e-05, "loss": 17.9669, "step": 6130 }, { "epoch": 1.9647999999999999, "grad_norm": 44.8785400390625, "learning_rate": 7.571820412877147e-05, "loss": 14.2484, "step": 6140 }, { "epoch": 1.968, "grad_norm": 92.72181701660156, "learning_rate": 7.567008324912179e-05, "loss": 14.5794, "step": 6150 }, { "epoch": 1.9712, "grad_norm": 16.53237533569336, "learning_rate": 7.562196236947212e-05, "loss": 23.5858, "step": 6160 }, { "epoch": 1.9744000000000002, "grad_norm": 23.033843994140625, "learning_rate": 7.557384148982243e-05, "loss": 12.0332, "step": 6170 }, { "epoch": 1.9776, "grad_norm": 839.6536254882812, "learning_rate": 7.552572061017276e-05, "loss": 18.865, "step": 6180 }, { "epoch": 1.9808, "grad_norm": 235.6468963623047, "learning_rate": 7.547759973052308e-05, "loss": 12.3309, "step": 6190 }, { "epoch": 1.984, "grad_norm": 217.9586944580078, "learning_rate": 7.542947885087341e-05, "loss": 19.9604, "step": 6200 }, { "epoch": 1.9872, "grad_norm": 202.22488403320312, "learning_rate": 7.538135797122372e-05, "loss": 12.4974, "step": 6210 }, { "epoch": 1.9904, "grad_norm": 210.09681701660156, "learning_rate": 7.533323709157404e-05, "loss": 14.6642, "step": 6220 }, { "epoch": 1.9936, "grad_norm": 230.638427734375, "learning_rate": 7.528511621192436e-05, "loss": 12.7861, "step": 6230 }, { "epoch": 1.9968, "grad_norm": 64.64010620117188, "learning_rate": 7.523699533227467e-05, "loss": 14.9273, "step": 6240 }, { "epoch": 2.0, "grad_norm": 75.06802368164062, "learning_rate": 7.5188874452625e-05, "loss": 9.6899, "step": 6250 }, { "epoch": 2.0032, "grad_norm": 96.08731842041016, "learning_rate": 7.514075357297532e-05, "loss": 6.4337, "step": 6260 }, { "epoch": 2.0064, "grad_norm": 166.83889770507812, "learning_rate": 7.509263269332563e-05, "loss": 16.0556, "step": 6270 }, { "epoch": 2.0096, "grad_norm": 236.71409606933594, "learning_rate": 7.504451181367595e-05, "loss": 17.5883, "step": 6280 }, { "epoch": 2.0128, "grad_norm": 201.16754150390625, "learning_rate": 7.499639093402628e-05, "loss": 9.6581, "step": 6290 }, { "epoch": 2.016, "grad_norm": 209.53012084960938, "learning_rate": 7.494827005437661e-05, "loss": 11.0374, "step": 6300 }, { "epoch": 2.0192, "grad_norm": 375.9708557128906, "learning_rate": 7.490014917472692e-05, "loss": 15.5587, "step": 6310 }, { "epoch": 2.0224, "grad_norm": 124.85608673095703, "learning_rate": 7.485202829507724e-05, "loss": 10.65, "step": 6320 }, { "epoch": 2.0256, "grad_norm": 88.7503662109375, "learning_rate": 7.480390741542756e-05, "loss": 10.928, "step": 6330 }, { "epoch": 2.0288, "grad_norm": 251.53952026367188, "learning_rate": 7.475578653577789e-05, "loss": 10.7625, "step": 6340 }, { "epoch": 2.032, "grad_norm": 166.08445739746094, "learning_rate": 7.47076656561282e-05, "loss": 6.7231, "step": 6350 }, { "epoch": 2.0352, "grad_norm": 147.50840759277344, "learning_rate": 7.465954477647852e-05, "loss": 9.798, "step": 6360 }, { "epoch": 2.0384, "grad_norm": 74.1734390258789, "learning_rate": 7.461142389682883e-05, "loss": 24.7704, "step": 6370 }, { "epoch": 2.0416, "grad_norm": 77.3690414428711, "learning_rate": 7.456330301717915e-05, "loss": 8.2234, "step": 6380 }, { "epoch": 2.0448, "grad_norm": 14.62825870513916, "learning_rate": 7.451518213752948e-05, "loss": 9.8217, "step": 6390 }, { "epoch": 2.048, "grad_norm": 239.0679473876953, "learning_rate": 7.44670612578798e-05, "loss": 10.236, "step": 6400 }, { "epoch": 2.0512, "grad_norm": 227.8649444580078, "learning_rate": 7.441894037823012e-05, "loss": 5.7911, "step": 6410 }, { "epoch": 2.0544, "grad_norm": 242.1506805419922, "learning_rate": 7.437081949858044e-05, "loss": 10.785, "step": 6420 }, { "epoch": 2.0576, "grad_norm": 225.8619384765625, "learning_rate": 7.432269861893076e-05, "loss": 10.192, "step": 6430 }, { "epoch": 2.0608, "grad_norm": 103.14022827148438, "learning_rate": 7.427457773928109e-05, "loss": 21.4979, "step": 6440 }, { "epoch": 2.064, "grad_norm": 154.09381103515625, "learning_rate": 7.42264568596314e-05, "loss": 7.138, "step": 6450 }, { "epoch": 2.0672, "grad_norm": 402.90435791015625, "learning_rate": 7.417833597998172e-05, "loss": 13.7608, "step": 6460 }, { "epoch": 2.0704, "grad_norm": 24.620616912841797, "learning_rate": 7.413021510033203e-05, "loss": 8.4864, "step": 6470 }, { "epoch": 2.0736, "grad_norm": 510.222412109375, "learning_rate": 7.408209422068236e-05, "loss": 15.3918, "step": 6480 }, { "epoch": 2.0768, "grad_norm": 284.45703125, "learning_rate": 7.403397334103268e-05, "loss": 24.4673, "step": 6490 }, { "epoch": 2.08, "grad_norm": 83.74275207519531, "learning_rate": 7.3985852461383e-05, "loss": 7.5365, "step": 6500 }, { "epoch": 2.0832, "grad_norm": 638.9994506835938, "learning_rate": 7.393773158173331e-05, "loss": 24.3156, "step": 6510 }, { "epoch": 2.0864, "grad_norm": 263.9012451171875, "learning_rate": 7.388961070208364e-05, "loss": 11.5295, "step": 6520 }, { "epoch": 2.0896, "grad_norm": 50.43157958984375, "learning_rate": 7.384148982243396e-05, "loss": 14.3107, "step": 6530 }, { "epoch": 2.0928, "grad_norm": 38.867401123046875, "learning_rate": 7.379336894278429e-05, "loss": 14.6974, "step": 6540 }, { "epoch": 2.096, "grad_norm": 434.9025573730469, "learning_rate": 7.37452480631346e-05, "loss": 19.3049, "step": 6550 }, { "epoch": 2.0992, "grad_norm": 74.7429428100586, "learning_rate": 7.369712718348492e-05, "loss": 15.6472, "step": 6560 }, { "epoch": 2.1024, "grad_norm": 48.39684295654297, "learning_rate": 7.364900630383523e-05, "loss": 11.5215, "step": 6570 }, { "epoch": 2.1056, "grad_norm": 380.85369873046875, "learning_rate": 7.360088542418556e-05, "loss": 11.4086, "step": 6580 }, { "epoch": 2.1088, "grad_norm": 414.107421875, "learning_rate": 7.355276454453588e-05, "loss": 8.8353, "step": 6590 }, { "epoch": 2.112, "grad_norm": 33.47442626953125, "learning_rate": 7.35046436648862e-05, "loss": 14.7972, "step": 6600 }, { "epoch": 2.1152, "grad_norm": 72.98236846923828, "learning_rate": 7.345652278523651e-05, "loss": 11.1523, "step": 6610 }, { "epoch": 2.1184, "grad_norm": 217.662353515625, "learning_rate": 7.340840190558683e-05, "loss": 16.38, "step": 6620 }, { "epoch": 2.1216, "grad_norm": 61.013065338134766, "learning_rate": 7.336028102593716e-05, "loss": 10.5283, "step": 6630 }, { "epoch": 2.1248, "grad_norm": 217.58145141601562, "learning_rate": 7.331216014628747e-05, "loss": 7.2979, "step": 6640 }, { "epoch": 2.128, "grad_norm": 459.1943054199219, "learning_rate": 7.32640392666378e-05, "loss": 20.6165, "step": 6650 }, { "epoch": 2.1312, "grad_norm": 105.99282836914062, "learning_rate": 7.321591838698812e-05, "loss": 14.0473, "step": 6660 }, { "epoch": 2.1344, "grad_norm": 167.48902893066406, "learning_rate": 7.316779750733845e-05, "loss": 10.7388, "step": 6670 }, { "epoch": 2.1376, "grad_norm": 324.2806091308594, "learning_rate": 7.311967662768876e-05, "loss": 15.4278, "step": 6680 }, { "epoch": 2.1408, "grad_norm": 42.593727111816406, "learning_rate": 7.307155574803908e-05, "loss": 6.7883, "step": 6690 }, { "epoch": 2.144, "grad_norm": 404.28533935546875, "learning_rate": 7.30234348683894e-05, "loss": 16.19, "step": 6700 }, { "epoch": 2.1471999999999998, "grad_norm": 26.503576278686523, "learning_rate": 7.297531398873971e-05, "loss": 14.5914, "step": 6710 }, { "epoch": 2.1504, "grad_norm": 137.45455932617188, "learning_rate": 7.292719310909004e-05, "loss": 13.4639, "step": 6720 }, { "epoch": 2.1536, "grad_norm": 549.45654296875, "learning_rate": 7.287907222944036e-05, "loss": 10.4226, "step": 6730 }, { "epoch": 2.1568, "grad_norm": 66.96205139160156, "learning_rate": 7.283095134979067e-05, "loss": 15.6968, "step": 6740 }, { "epoch": 2.16, "grad_norm": 194.8579559326172, "learning_rate": 7.278283047014099e-05, "loss": 9.3199, "step": 6750 }, { "epoch": 2.1632, "grad_norm": 182.88906860351562, "learning_rate": 7.273470959049132e-05, "loss": 5.1033, "step": 6760 }, { "epoch": 2.1664, "grad_norm": 348.4930419921875, "learning_rate": 7.268658871084165e-05, "loss": 17.7213, "step": 6770 }, { "epoch": 2.1696, "grad_norm": 410.0494689941406, "learning_rate": 7.263846783119196e-05, "loss": 9.7019, "step": 6780 }, { "epoch": 2.1728, "grad_norm": 339.5379638671875, "learning_rate": 7.259034695154228e-05, "loss": 7.6859, "step": 6790 }, { "epoch": 2.176, "grad_norm": 560.112548828125, "learning_rate": 7.25422260718926e-05, "loss": 11.329, "step": 6800 }, { "epoch": 2.1792, "grad_norm": 67.06986236572266, "learning_rate": 7.249410519224292e-05, "loss": 6.5245, "step": 6810 }, { "epoch": 2.1824, "grad_norm": 51.85016632080078, "learning_rate": 7.244598431259324e-05, "loss": 8.7249, "step": 6820 }, { "epoch": 2.1856, "grad_norm": 120.45693969726562, "learning_rate": 7.239786343294356e-05, "loss": 15.1652, "step": 6830 }, { "epoch": 2.1888, "grad_norm": 177.74664306640625, "learning_rate": 7.234974255329387e-05, "loss": 12.9736, "step": 6840 }, { "epoch": 2.192, "grad_norm": 90.53146362304688, "learning_rate": 7.230162167364419e-05, "loss": 10.2584, "step": 6850 }, { "epoch": 2.1952, "grad_norm": 406.0667419433594, "learning_rate": 7.225350079399452e-05, "loss": 9.9424, "step": 6860 }, { "epoch": 2.1984, "grad_norm": 378.0879211425781, "learning_rate": 7.220537991434483e-05, "loss": 11.9188, "step": 6870 }, { "epoch": 2.2016, "grad_norm": 172.00131225585938, "learning_rate": 7.215725903469516e-05, "loss": 7.5667, "step": 6880 }, { "epoch": 2.2048, "grad_norm": 48.922607421875, "learning_rate": 7.210913815504548e-05, "loss": 11.326, "step": 6890 }, { "epoch": 2.208, "grad_norm": 41.334754943847656, "learning_rate": 7.20610172753958e-05, "loss": 11.9216, "step": 6900 }, { "epoch": 2.2112, "grad_norm": 174.93580627441406, "learning_rate": 7.201289639574612e-05, "loss": 11.5841, "step": 6910 }, { "epoch": 2.2144, "grad_norm": 20.004573822021484, "learning_rate": 7.196477551609644e-05, "loss": 13.5695, "step": 6920 }, { "epoch": 2.2176, "grad_norm": 94.86103820800781, "learning_rate": 7.191665463644676e-05, "loss": 2.5417, "step": 6930 }, { "epoch": 2.2208, "grad_norm": 118.7885513305664, "learning_rate": 7.186853375679707e-05, "loss": 15.4059, "step": 6940 }, { "epoch": 2.224, "grad_norm": 136.67779541015625, "learning_rate": 7.18204128771474e-05, "loss": 10.0703, "step": 6950 }, { "epoch": 2.2272, "grad_norm": 474.792724609375, "learning_rate": 7.177229199749772e-05, "loss": 12.0873, "step": 6960 }, { "epoch": 2.2304, "grad_norm": 51.49345397949219, "learning_rate": 7.172417111784803e-05, "loss": 10.2686, "step": 6970 }, { "epoch": 2.2336, "grad_norm": 140.35585021972656, "learning_rate": 7.167605023819835e-05, "loss": 17.8834, "step": 6980 }, { "epoch": 2.2368, "grad_norm": 108.5957260131836, "learning_rate": 7.162792935854868e-05, "loss": 9.0994, "step": 6990 }, { "epoch": 2.24, "grad_norm": 25.200002670288086, "learning_rate": 7.157980847889901e-05, "loss": 5.7606, "step": 7000 }, { "epoch": 2.2432, "grad_norm": 629.4432373046875, "learning_rate": 7.153168759924932e-05, "loss": 10.5185, "step": 7010 }, { "epoch": 2.2464, "grad_norm": 23.01872444152832, "learning_rate": 7.148356671959964e-05, "loss": 10.9264, "step": 7020 }, { "epoch": 2.2496, "grad_norm": 66.02039337158203, "learning_rate": 7.143544583994996e-05, "loss": 17.1559, "step": 7030 }, { "epoch": 2.2528, "grad_norm": 424.1953430175781, "learning_rate": 7.138732496030027e-05, "loss": 8.4546, "step": 7040 }, { "epoch": 2.2560000000000002, "grad_norm": 20.050071716308594, "learning_rate": 7.13392040806506e-05, "loss": 9.5869, "step": 7050 }, { "epoch": 2.2592, "grad_norm": 21.311527252197266, "learning_rate": 7.129108320100092e-05, "loss": 15.6254, "step": 7060 }, { "epoch": 2.2624, "grad_norm": 352.6675720214844, "learning_rate": 7.124296232135123e-05, "loss": 13.0083, "step": 7070 }, { "epoch": 2.2656, "grad_norm": 259.0144958496094, "learning_rate": 7.119484144170155e-05, "loss": 15.8531, "step": 7080 }, { "epoch": 2.2688, "grad_norm": 209.40565490722656, "learning_rate": 7.114672056205188e-05, "loss": 11.605, "step": 7090 }, { "epoch": 2.2720000000000002, "grad_norm": 473.474365234375, "learning_rate": 7.10985996824022e-05, "loss": 12.3493, "step": 7100 }, { "epoch": 2.2752, "grad_norm": 312.6822204589844, "learning_rate": 7.105047880275252e-05, "loss": 8.752, "step": 7110 }, { "epoch": 2.2784, "grad_norm": 32.932064056396484, "learning_rate": 7.100235792310284e-05, "loss": 7.7577, "step": 7120 }, { "epoch": 2.2816, "grad_norm": 257.11834716796875, "learning_rate": 7.095423704345316e-05, "loss": 22.7839, "step": 7130 }, { "epoch": 2.2848, "grad_norm": 129.80516052246094, "learning_rate": 7.090611616380349e-05, "loss": 13.8956, "step": 7140 }, { "epoch": 2.288, "grad_norm": 64.81372833251953, "learning_rate": 7.08579952841538e-05, "loss": 11.3041, "step": 7150 }, { "epoch": 2.2912, "grad_norm": 148.7347869873047, "learning_rate": 7.080987440450412e-05, "loss": 12.355, "step": 7160 }, { "epoch": 2.2944, "grad_norm": 337.8056335449219, "learning_rate": 7.076175352485443e-05, "loss": 25.4572, "step": 7170 }, { "epoch": 2.2976, "grad_norm": 567.4310302734375, "learning_rate": 7.071363264520475e-05, "loss": 13.1897, "step": 7180 }, { "epoch": 2.3008, "grad_norm": 301.4020080566406, "learning_rate": 7.066551176555508e-05, "loss": 14.877, "step": 7190 }, { "epoch": 2.304, "grad_norm": 481.36700439453125, "learning_rate": 7.06173908859054e-05, "loss": 10.5529, "step": 7200 }, { "epoch": 2.3072, "grad_norm": 336.3973388671875, "learning_rate": 7.056927000625571e-05, "loss": 16.3895, "step": 7210 }, { "epoch": 2.3104, "grad_norm": 304.0165710449219, "learning_rate": 7.052114912660604e-05, "loss": 16.1288, "step": 7220 }, { "epoch": 2.3136, "grad_norm": 593.84765625, "learning_rate": 7.047302824695636e-05, "loss": 7.8308, "step": 7230 }, { "epoch": 2.3168, "grad_norm": 25.797414779663086, "learning_rate": 7.042490736730669e-05, "loss": 6.6659, "step": 7240 }, { "epoch": 2.32, "grad_norm": 205.01246643066406, "learning_rate": 7.0376786487657e-05, "loss": 7.7454, "step": 7250 }, { "epoch": 2.3232, "grad_norm": 270.94287109375, "learning_rate": 7.032866560800732e-05, "loss": 7.0633, "step": 7260 }, { "epoch": 2.3264, "grad_norm": 197.09603881835938, "learning_rate": 7.028054472835763e-05, "loss": 10.8385, "step": 7270 }, { "epoch": 2.3296, "grad_norm": 158.2281951904297, "learning_rate": 7.023242384870796e-05, "loss": 10.457, "step": 7280 }, { "epoch": 2.3327999999999998, "grad_norm": 34.9866943359375, "learning_rate": 7.018430296905828e-05, "loss": 9.5866, "step": 7290 }, { "epoch": 2.336, "grad_norm": 198.6848602294922, "learning_rate": 7.01361820894086e-05, "loss": 20.9451, "step": 7300 }, { "epoch": 2.3392, "grad_norm": 128.42063903808594, "learning_rate": 7.008806120975891e-05, "loss": 10.1311, "step": 7310 }, { "epoch": 2.3424, "grad_norm": 324.71209716796875, "learning_rate": 7.003994033010923e-05, "loss": 6.7654, "step": 7320 }, { "epoch": 2.3456, "grad_norm": 282.8023681640625, "learning_rate": 6.999181945045956e-05, "loss": 21.1262, "step": 7330 }, { "epoch": 2.3487999999999998, "grad_norm": 134.40771484375, "learning_rate": 6.994369857080987e-05, "loss": 6.8578, "step": 7340 }, { "epoch": 2.352, "grad_norm": 393.96295166015625, "learning_rate": 6.98955776911602e-05, "loss": 14.6084, "step": 7350 }, { "epoch": 2.3552, "grad_norm": 486.47509765625, "learning_rate": 6.984745681151052e-05, "loss": 11.4547, "step": 7360 }, { "epoch": 2.3584, "grad_norm": 32.58149719238281, "learning_rate": 6.979933593186085e-05, "loss": 13.6346, "step": 7370 }, { "epoch": 2.3616, "grad_norm": 321.4623718261719, "learning_rate": 6.975121505221116e-05, "loss": 7.7223, "step": 7380 }, { "epoch": 2.3648, "grad_norm": 5.947575569152832, "learning_rate": 6.970309417256148e-05, "loss": 10.565, "step": 7390 }, { "epoch": 2.368, "grad_norm": 397.53253173828125, "learning_rate": 6.96549732929118e-05, "loss": 11.6187, "step": 7400 }, { "epoch": 2.3712, "grad_norm": 149.7398681640625, "learning_rate": 6.960685241326211e-05, "loss": 15.2403, "step": 7410 }, { "epoch": 2.3744, "grad_norm": 388.9798889160156, "learning_rate": 6.955873153361244e-05, "loss": 8.6925, "step": 7420 }, { "epoch": 2.3776, "grad_norm": 171.44644165039062, "learning_rate": 6.951061065396276e-05, "loss": 9.8957, "step": 7430 }, { "epoch": 2.3808, "grad_norm": 315.3659362792969, "learning_rate": 6.946248977431307e-05, "loss": 10.2292, "step": 7440 }, { "epoch": 2.384, "grad_norm": 317.8998107910156, "learning_rate": 6.941436889466339e-05, "loss": 19.9849, "step": 7450 }, { "epoch": 2.3872, "grad_norm": 268.6977844238281, "learning_rate": 6.936624801501372e-05, "loss": 14.4996, "step": 7460 }, { "epoch": 2.3904, "grad_norm": 249.7406463623047, "learning_rate": 6.931812713536405e-05, "loss": 21.2362, "step": 7470 }, { "epoch": 2.3936, "grad_norm": 165.2587127685547, "learning_rate": 6.927000625571436e-05, "loss": 22.6905, "step": 7480 }, { "epoch": 2.3968, "grad_norm": 173.2901611328125, "learning_rate": 6.922188537606468e-05, "loss": 9.8161, "step": 7490 }, { "epoch": 2.4, "grad_norm": 251.69595336914062, "learning_rate": 6.9173764496415e-05, "loss": 7.2397, "step": 7500 }, { "epoch": 2.4032, "grad_norm": 31.20841407775879, "learning_rate": 6.912564361676532e-05, "loss": 12.2418, "step": 7510 }, { "epoch": 2.4064, "grad_norm": 291.97259521484375, "learning_rate": 6.907752273711564e-05, "loss": 18.2746, "step": 7520 }, { "epoch": 2.4096, "grad_norm": 328.2070007324219, "learning_rate": 6.902940185746596e-05, "loss": 9.7591, "step": 7530 }, { "epoch": 2.4128, "grad_norm": 106.48345947265625, "learning_rate": 6.898128097781627e-05, "loss": 15.9682, "step": 7540 }, { "epoch": 2.416, "grad_norm": 96.59558868408203, "learning_rate": 6.893316009816659e-05, "loss": 5.3462, "step": 7550 }, { "epoch": 2.4192, "grad_norm": 349.659912109375, "learning_rate": 6.888503921851692e-05, "loss": 16.0209, "step": 7560 }, { "epoch": 2.4224, "grad_norm": 123.92171478271484, "learning_rate": 6.883691833886723e-05, "loss": 5.7432, "step": 7570 }, { "epoch": 2.4256, "grad_norm": 109.02745056152344, "learning_rate": 6.878879745921756e-05, "loss": 8.8852, "step": 7580 }, { "epoch": 2.4288, "grad_norm": 224.05465698242188, "learning_rate": 6.874067657956788e-05, "loss": 15.0121, "step": 7590 }, { "epoch": 2.432, "grad_norm": 35.6044807434082, "learning_rate": 6.86925556999182e-05, "loss": 21.4395, "step": 7600 }, { "epoch": 2.4352, "grad_norm": 333.714111328125, "learning_rate": 6.864443482026852e-05, "loss": 7.833, "step": 7610 }, { "epoch": 2.4384, "grad_norm": 418.23187255859375, "learning_rate": 6.859631394061884e-05, "loss": 8.662, "step": 7620 }, { "epoch": 2.4416, "grad_norm": 56.24296569824219, "learning_rate": 6.854819306096916e-05, "loss": 10.2676, "step": 7630 }, { "epoch": 2.4448, "grad_norm": 333.6312255859375, "learning_rate": 6.850007218131947e-05, "loss": 8.8738, "step": 7640 }, { "epoch": 2.448, "grad_norm": 244.0286407470703, "learning_rate": 6.84519513016698e-05, "loss": 8.8871, "step": 7650 }, { "epoch": 2.4512, "grad_norm": 181.86865234375, "learning_rate": 6.840383042202012e-05, "loss": 7.7143, "step": 7660 }, { "epoch": 2.4544, "grad_norm": 131.95101928710938, "learning_rate": 6.835570954237043e-05, "loss": 28.127, "step": 7670 }, { "epoch": 2.4576000000000002, "grad_norm": 412.29644775390625, "learning_rate": 6.830758866272075e-05, "loss": 19.8561, "step": 7680 }, { "epoch": 2.4608, "grad_norm": 118.61441040039062, "learning_rate": 6.825946778307108e-05, "loss": 11.0783, "step": 7690 }, { "epoch": 2.464, "grad_norm": 581.0807495117188, "learning_rate": 6.821134690342141e-05, "loss": 20.0144, "step": 7700 }, { "epoch": 2.4672, "grad_norm": 18.782302856445312, "learning_rate": 6.816322602377172e-05, "loss": 13.3374, "step": 7710 }, { "epoch": 2.4704, "grad_norm": 167.26947021484375, "learning_rate": 6.811510514412204e-05, "loss": 10.8831, "step": 7720 }, { "epoch": 2.4736000000000002, "grad_norm": 272.6399841308594, "learning_rate": 6.806698426447236e-05, "loss": 8.6469, "step": 7730 }, { "epoch": 2.4768, "grad_norm": 279.4835205078125, "learning_rate": 6.801886338482267e-05, "loss": 9.8967, "step": 7740 }, { "epoch": 2.48, "grad_norm": 219.4610137939453, "learning_rate": 6.7970742505173e-05, "loss": 12.8179, "step": 7750 }, { "epoch": 2.4832, "grad_norm": 144.40087890625, "learning_rate": 6.792262162552332e-05, "loss": 9.4323, "step": 7760 }, { "epoch": 2.4864, "grad_norm": 97.81302642822266, "learning_rate": 6.787450074587363e-05, "loss": 7.6604, "step": 7770 }, { "epoch": 2.4896, "grad_norm": 198.28985595703125, "learning_rate": 6.782637986622395e-05, "loss": 9.8568, "step": 7780 }, { "epoch": 2.4928, "grad_norm": 10.893805503845215, "learning_rate": 6.777825898657427e-05, "loss": 8.8436, "step": 7790 }, { "epoch": 2.496, "grad_norm": 37.64440155029297, "learning_rate": 6.77301381069246e-05, "loss": 7.437, "step": 7800 }, { "epoch": 2.4992, "grad_norm": 55.20491409301758, "learning_rate": 6.768201722727492e-05, "loss": 14.1631, "step": 7810 }, { "epoch": 2.5023999999999997, "grad_norm": 395.33062744140625, "learning_rate": 6.763389634762524e-05, "loss": 20.8171, "step": 7820 }, { "epoch": 2.5056000000000003, "grad_norm": 413.8629150390625, "learning_rate": 6.758577546797556e-05, "loss": 14.5598, "step": 7830 }, { "epoch": 2.5088, "grad_norm": 57.88976287841797, "learning_rate": 6.753765458832589e-05, "loss": 11.6303, "step": 7840 }, { "epoch": 2.512, "grad_norm": 97.69367980957031, "learning_rate": 6.74895337086762e-05, "loss": 14.0876, "step": 7850 }, { "epoch": 2.5152, "grad_norm": 111.28333282470703, "learning_rate": 6.744141282902652e-05, "loss": 12.9791, "step": 7860 }, { "epoch": 2.5183999999999997, "grad_norm": 203.21102905273438, "learning_rate": 6.739329194937683e-05, "loss": 8.3333, "step": 7870 }, { "epoch": 2.5216, "grad_norm": 281.72674560546875, "learning_rate": 6.734517106972715e-05, "loss": 10.106, "step": 7880 }, { "epoch": 2.5248, "grad_norm": 15.85966968536377, "learning_rate": 6.729705019007748e-05, "loss": 4.5757, "step": 7890 }, { "epoch": 2.528, "grad_norm": 111.71348571777344, "learning_rate": 6.72489293104278e-05, "loss": 17.3042, "step": 7900 }, { "epoch": 2.5312, "grad_norm": 261.8768005371094, "learning_rate": 6.720080843077811e-05, "loss": 8.3276, "step": 7910 }, { "epoch": 2.5343999999999998, "grad_norm": 100.83056640625, "learning_rate": 6.715268755112844e-05, "loss": 19.3323, "step": 7920 }, { "epoch": 2.5376, "grad_norm": 251.13912963867188, "learning_rate": 6.710456667147876e-05, "loss": 5.9124, "step": 7930 }, { "epoch": 2.5408, "grad_norm": 68.4140853881836, "learning_rate": 6.705644579182909e-05, "loss": 12.9803, "step": 7940 }, { "epoch": 2.544, "grad_norm": 327.96771240234375, "learning_rate": 6.70083249121794e-05, "loss": 22.6083, "step": 7950 }, { "epoch": 2.5472, "grad_norm": 25.544214248657227, "learning_rate": 6.696020403252972e-05, "loss": 16.6521, "step": 7960 }, { "epoch": 2.5504, "grad_norm": 98.6743392944336, "learning_rate": 6.691208315288003e-05, "loss": 11.9375, "step": 7970 }, { "epoch": 2.5536, "grad_norm": 309.0210876464844, "learning_rate": 6.686396227323036e-05, "loss": 6.8649, "step": 7980 }, { "epoch": 2.5568, "grad_norm": 54.98996353149414, "learning_rate": 6.681584139358068e-05, "loss": 3.2887, "step": 7990 }, { "epoch": 2.56, "grad_norm": 225.3155975341797, "learning_rate": 6.6767720513931e-05, "loss": 8.6671, "step": 8000 }, { "epoch": 2.5632, "grad_norm": 373.3511962890625, "learning_rate": 6.671959963428131e-05, "loss": 12.7867, "step": 8010 }, { "epoch": 2.5664, "grad_norm": 42.7872200012207, "learning_rate": 6.667147875463163e-05, "loss": 9.3848, "step": 8020 }, { "epoch": 2.5696, "grad_norm": 122.23094940185547, "learning_rate": 6.662335787498196e-05, "loss": 10.8837, "step": 8030 }, { "epoch": 2.5728, "grad_norm": 23.231321334838867, "learning_rate": 6.657523699533229e-05, "loss": 7.9574, "step": 8040 }, { "epoch": 2.576, "grad_norm": 286.73297119140625, "learning_rate": 6.65271161156826e-05, "loss": 14.7203, "step": 8050 }, { "epoch": 2.5792, "grad_norm": 505.94854736328125, "learning_rate": 6.647899523603292e-05, "loss": 15.2922, "step": 8060 }, { "epoch": 2.5824, "grad_norm": 12.066452026367188, "learning_rate": 6.643087435638323e-05, "loss": 4.7122, "step": 8070 }, { "epoch": 2.5856, "grad_norm": 194.5050048828125, "learning_rate": 6.638275347673356e-05, "loss": 22.2663, "step": 8080 }, { "epoch": 2.5888, "grad_norm": 9.79727840423584, "learning_rate": 6.633463259708388e-05, "loss": 21.1014, "step": 8090 }, { "epoch": 2.592, "grad_norm": 278.9139709472656, "learning_rate": 6.62865117174342e-05, "loss": 7.2773, "step": 8100 }, { "epoch": 2.5952, "grad_norm": 264.5290222167969, "learning_rate": 6.623839083778451e-05, "loss": 10.2388, "step": 8110 }, { "epoch": 2.5984, "grad_norm": 305.080810546875, "learning_rate": 6.619026995813484e-05, "loss": 8.111, "step": 8120 }, { "epoch": 2.6016, "grad_norm": 230.17572021484375, "learning_rate": 6.614214907848516e-05, "loss": 11.7224, "step": 8130 }, { "epoch": 2.6048, "grad_norm": 44.37038803100586, "learning_rate": 6.609402819883547e-05, "loss": 12.7029, "step": 8140 }, { "epoch": 2.608, "grad_norm": 50.765777587890625, "learning_rate": 6.60459073191858e-05, "loss": 10.0183, "step": 8150 }, { "epoch": 2.6112, "grad_norm": 309.2338562011719, "learning_rate": 6.599778643953612e-05, "loss": 6.1279, "step": 8160 }, { "epoch": 2.6144, "grad_norm": 49.92375183105469, "learning_rate": 6.594966555988645e-05, "loss": 4.2627, "step": 8170 }, { "epoch": 2.6176, "grad_norm": 240.57290649414062, "learning_rate": 6.590154468023676e-05, "loss": 11.6809, "step": 8180 }, { "epoch": 2.6208, "grad_norm": 371.12274169921875, "learning_rate": 6.585342380058708e-05, "loss": 11.3286, "step": 8190 }, { "epoch": 2.624, "grad_norm": 207.53726196289062, "learning_rate": 6.58053029209374e-05, "loss": 5.0364, "step": 8200 }, { "epoch": 2.6272, "grad_norm": 90.33932495117188, "learning_rate": 6.575718204128771e-05, "loss": 6.589, "step": 8210 }, { "epoch": 2.6304, "grad_norm": 374.23406982421875, "learning_rate": 6.570906116163804e-05, "loss": 16.1078, "step": 8220 }, { "epoch": 2.6336, "grad_norm": 459.0262145996094, "learning_rate": 6.566094028198836e-05, "loss": 9.9947, "step": 8230 }, { "epoch": 2.6368, "grad_norm": 371.4980163574219, "learning_rate": 6.561281940233867e-05, "loss": 12.3441, "step": 8240 }, { "epoch": 2.64, "grad_norm": 10.977595329284668, "learning_rate": 6.556469852268899e-05, "loss": 8.8908, "step": 8250 }, { "epoch": 2.6432, "grad_norm": 74.25899505615234, "learning_rate": 6.551657764303932e-05, "loss": 11.2549, "step": 8260 }, { "epoch": 2.6464, "grad_norm": 121.06352996826172, "learning_rate": 6.546845676338963e-05, "loss": 11.4914, "step": 8270 }, { "epoch": 2.6496, "grad_norm": 109.97881317138672, "learning_rate": 6.542033588373996e-05, "loss": 15.5407, "step": 8280 }, { "epoch": 2.6528, "grad_norm": 226.41842651367188, "learning_rate": 6.537221500409028e-05, "loss": 4.2782, "step": 8290 }, { "epoch": 2.656, "grad_norm": 542.8214111328125, "learning_rate": 6.53240941244406e-05, "loss": 9.6553, "step": 8300 }, { "epoch": 2.6592000000000002, "grad_norm": 5.239200592041016, "learning_rate": 6.527597324479093e-05, "loss": 10.8204, "step": 8310 }, { "epoch": 2.6624, "grad_norm": 182.9786376953125, "learning_rate": 6.522785236514124e-05, "loss": 11.0442, "step": 8320 }, { "epoch": 2.6656, "grad_norm": 189.10028076171875, "learning_rate": 6.517973148549156e-05, "loss": 20.1628, "step": 8330 }, { "epoch": 2.6688, "grad_norm": 145.64425659179688, "learning_rate": 6.513161060584187e-05, "loss": 13.0871, "step": 8340 }, { "epoch": 2.672, "grad_norm": 99.07398986816406, "learning_rate": 6.508348972619219e-05, "loss": 10.3763, "step": 8350 }, { "epoch": 2.6752000000000002, "grad_norm": 176.4828643798828, "learning_rate": 6.503536884654252e-05, "loss": 7.5651, "step": 8360 }, { "epoch": 2.6784, "grad_norm": 126.48267364501953, "learning_rate": 6.498724796689283e-05, "loss": 7.4398, "step": 8370 }, { "epoch": 2.6816, "grad_norm": 56.89348602294922, "learning_rate": 6.493912708724315e-05, "loss": 13.3988, "step": 8380 }, { "epoch": 2.6848, "grad_norm": 159.89732360839844, "learning_rate": 6.489100620759348e-05, "loss": 11.4889, "step": 8390 }, { "epoch": 2.6879999999999997, "grad_norm": 13.585084915161133, "learning_rate": 6.484288532794381e-05, "loss": 15.0281, "step": 8400 }, { "epoch": 2.6912000000000003, "grad_norm": 677.8982543945312, "learning_rate": 6.479476444829413e-05, "loss": 16.4007, "step": 8410 }, { "epoch": 2.6944, "grad_norm": 72.86627197265625, "learning_rate": 6.474664356864444e-05, "loss": 16.6471, "step": 8420 }, { "epoch": 2.6976, "grad_norm": 86.93643188476562, "learning_rate": 6.469852268899476e-05, "loss": 6.6994, "step": 8430 }, { "epoch": 2.7008, "grad_norm": 143.4864501953125, "learning_rate": 6.465040180934507e-05, "loss": 8.4952, "step": 8440 }, { "epoch": 2.7039999999999997, "grad_norm": 37.4204216003418, "learning_rate": 6.46022809296954e-05, "loss": 11.4054, "step": 8450 }, { "epoch": 2.7072000000000003, "grad_norm": 123.00093078613281, "learning_rate": 6.455416005004572e-05, "loss": 10.3167, "step": 8460 }, { "epoch": 2.7104, "grad_norm": 342.58544921875, "learning_rate": 6.450603917039603e-05, "loss": 29.3602, "step": 8470 }, { "epoch": 2.7136, "grad_norm": 10.590736389160156, "learning_rate": 6.445791829074635e-05, "loss": 8.8204, "step": 8480 }, { "epoch": 2.7168, "grad_norm": 522.9834594726562, "learning_rate": 6.440979741109667e-05, "loss": 17.5884, "step": 8490 }, { "epoch": 2.7199999999999998, "grad_norm": 57.40034484863281, "learning_rate": 6.4361676531447e-05, "loss": 17.5016, "step": 8500 }, { "epoch": 2.7232, "grad_norm": 76.77796173095703, "learning_rate": 6.431355565179733e-05, "loss": 6.2128, "step": 8510 }, { "epoch": 2.7264, "grad_norm": 362.51629638671875, "learning_rate": 6.426543477214764e-05, "loss": 10.1377, "step": 8520 }, { "epoch": 2.7296, "grad_norm": 226.9800567626953, "learning_rate": 6.421731389249796e-05, "loss": 5.3617, "step": 8530 }, { "epoch": 2.7328, "grad_norm": 22.264591217041016, "learning_rate": 6.416919301284829e-05, "loss": 7.6716, "step": 8540 }, { "epoch": 2.7359999999999998, "grad_norm": 384.7442321777344, "learning_rate": 6.41210721331986e-05, "loss": 6.8894, "step": 8550 }, { "epoch": 2.7392, "grad_norm": 329.1286315917969, "learning_rate": 6.407295125354892e-05, "loss": 11.5627, "step": 8560 }, { "epoch": 2.7424, "grad_norm": 24.393489837646484, "learning_rate": 6.402483037389923e-05, "loss": 5.8641, "step": 8570 }, { "epoch": 2.7456, "grad_norm": 181.37246704101562, "learning_rate": 6.397670949424955e-05, "loss": 11.8038, "step": 8580 }, { "epoch": 2.7488, "grad_norm": 8.99964427947998, "learning_rate": 6.392858861459988e-05, "loss": 11.0608, "step": 8590 }, { "epoch": 2.752, "grad_norm": 101.2455062866211, "learning_rate": 6.38804677349502e-05, "loss": 19.2332, "step": 8600 }, { "epoch": 2.7552, "grad_norm": 360.11236572265625, "learning_rate": 6.383234685530051e-05, "loss": 15.1417, "step": 8610 }, { "epoch": 2.7584, "grad_norm": 10.317300796508789, "learning_rate": 6.378422597565084e-05, "loss": 16.4146, "step": 8620 }, { "epoch": 2.7616, "grad_norm": 63.12464141845703, "learning_rate": 6.373610509600116e-05, "loss": 14.2052, "step": 8630 }, { "epoch": 2.7648, "grad_norm": 71.79895782470703, "learning_rate": 6.368798421635149e-05, "loss": 8.9316, "step": 8640 }, { "epoch": 2.768, "grad_norm": 28.541255950927734, "learning_rate": 6.36398633367018e-05, "loss": 13.8679, "step": 8650 }, { "epoch": 2.7712, "grad_norm": 141.2139129638672, "learning_rate": 6.359174245705212e-05, "loss": 16.0401, "step": 8660 }, { "epoch": 2.7744, "grad_norm": 103.69583129882812, "learning_rate": 6.354362157740243e-05, "loss": 5.3904, "step": 8670 }, { "epoch": 2.7776, "grad_norm": 161.27169799804688, "learning_rate": 6.349550069775276e-05, "loss": 6.8157, "step": 8680 }, { "epoch": 2.7808, "grad_norm": 340.5076599121094, "learning_rate": 6.344737981810308e-05, "loss": 12.8139, "step": 8690 }, { "epoch": 2.784, "grad_norm": 126.8208236694336, "learning_rate": 6.33992589384534e-05, "loss": 7.0034, "step": 8700 }, { "epoch": 2.7872, "grad_norm": 49.4276008605957, "learning_rate": 6.335113805880371e-05, "loss": 14.2766, "step": 8710 }, { "epoch": 2.7904, "grad_norm": 526.3049926757812, "learning_rate": 6.330301717915403e-05, "loss": 9.3528, "step": 8720 }, { "epoch": 2.7936, "grad_norm": 105.29026794433594, "learning_rate": 6.325489629950436e-05, "loss": 15.3314, "step": 8730 }, { "epoch": 2.7968, "grad_norm": 236.982421875, "learning_rate": 6.320677541985469e-05, "loss": 9.0033, "step": 8740 }, { "epoch": 2.8, "grad_norm": 381.7471618652344, "learning_rate": 6.3158654540205e-05, "loss": 12.5759, "step": 8750 }, { "epoch": 2.8032, "grad_norm": 433.3175354003906, "learning_rate": 6.311053366055532e-05, "loss": 9.0194, "step": 8760 }, { "epoch": 2.8064, "grad_norm": 310.0291748046875, "learning_rate": 6.306241278090563e-05, "loss": 7.7929, "step": 8770 }, { "epoch": 2.8096, "grad_norm": 71.15445709228516, "learning_rate": 6.301429190125596e-05, "loss": 6.3988, "step": 8780 }, { "epoch": 2.8128, "grad_norm": 400.8841552734375, "learning_rate": 6.296617102160628e-05, "loss": 9.0553, "step": 8790 }, { "epoch": 2.816, "grad_norm": 479.6634826660156, "learning_rate": 6.29180501419566e-05, "loss": 7.9714, "step": 8800 }, { "epoch": 2.8192, "grad_norm": 366.2032470703125, "learning_rate": 6.286992926230691e-05, "loss": 17.1902, "step": 8810 }, { "epoch": 2.8224, "grad_norm": 63.369388580322266, "learning_rate": 6.282180838265724e-05, "loss": 7.883, "step": 8820 }, { "epoch": 2.8256, "grad_norm": 324.8609313964844, "learning_rate": 6.277368750300756e-05, "loss": 12.4161, "step": 8830 }, { "epoch": 2.8288, "grad_norm": 534.8110961914062, "learning_rate": 6.272556662335787e-05, "loss": 11.3921, "step": 8840 }, { "epoch": 2.832, "grad_norm": 46.13950729370117, "learning_rate": 6.26774457437082e-05, "loss": 17.1585, "step": 8850 }, { "epoch": 2.8352, "grad_norm": 82.06448364257812, "learning_rate": 6.262932486405852e-05, "loss": 12.3841, "step": 8860 }, { "epoch": 2.8384, "grad_norm": 62.40217971801758, "learning_rate": 6.258120398440885e-05, "loss": 8.4796, "step": 8870 }, { "epoch": 2.8416, "grad_norm": 119.8001708984375, "learning_rate": 6.253308310475916e-05, "loss": 10.8421, "step": 8880 }, { "epoch": 2.8448, "grad_norm": 638.8903198242188, "learning_rate": 6.248496222510948e-05, "loss": 13.5369, "step": 8890 }, { "epoch": 2.848, "grad_norm": 291.4001770019531, "learning_rate": 6.24368413454598e-05, "loss": 28.597, "step": 8900 }, { "epoch": 2.8512, "grad_norm": 123.43546295166016, "learning_rate": 6.238872046581011e-05, "loss": 13.6578, "step": 8910 }, { "epoch": 2.8544, "grad_norm": 231.50169372558594, "learning_rate": 6.234059958616044e-05, "loss": 6.1267, "step": 8920 }, { "epoch": 2.8576, "grad_norm": 90.33557891845703, "learning_rate": 6.229247870651076e-05, "loss": 3.3846, "step": 8930 }, { "epoch": 2.8608000000000002, "grad_norm": 76.51793670654297, "learning_rate": 6.224435782686107e-05, "loss": 7.1172, "step": 8940 }, { "epoch": 2.864, "grad_norm": 74.56710052490234, "learning_rate": 6.219623694721139e-05, "loss": 7.6084, "step": 8950 }, { "epoch": 2.8672, "grad_norm": 34.432769775390625, "learning_rate": 6.214811606756172e-05, "loss": 4.7307, "step": 8960 }, { "epoch": 2.8704, "grad_norm": 101.9024429321289, "learning_rate": 6.209999518791203e-05, "loss": 11.793, "step": 8970 }, { "epoch": 2.8736, "grad_norm": 93.39012908935547, "learning_rate": 6.205187430826236e-05, "loss": 15.9775, "step": 8980 }, { "epoch": 2.8768000000000002, "grad_norm": 58.27486801147461, "learning_rate": 6.200375342861268e-05, "loss": 10.1296, "step": 8990 }, { "epoch": 2.88, "grad_norm": 60.625587463378906, "learning_rate": 6.1955632548963e-05, "loss": 2.8233, "step": 9000 }, { "epoch": 2.8832, "grad_norm": 134.53567504882812, "learning_rate": 6.190751166931333e-05, "loss": 12.5368, "step": 9010 }, { "epoch": 2.8864, "grad_norm": 9.814436912536621, "learning_rate": 6.185939078966364e-05, "loss": 6.7769, "step": 9020 }, { "epoch": 2.8895999999999997, "grad_norm": 388.83203125, "learning_rate": 6.181126991001396e-05, "loss": 10.0599, "step": 9030 }, { "epoch": 2.8928000000000003, "grad_norm": 227.12034606933594, "learning_rate": 6.176314903036427e-05, "loss": 13.9625, "step": 9040 }, { "epoch": 2.896, "grad_norm": 312.0540771484375, "learning_rate": 6.171502815071459e-05, "loss": 11.7044, "step": 9050 }, { "epoch": 2.8992, "grad_norm": 425.62567138671875, "learning_rate": 6.166690727106492e-05, "loss": 16.4232, "step": 9060 }, { "epoch": 2.9024, "grad_norm": 123.69522094726562, "learning_rate": 6.161878639141523e-05, "loss": 9.4824, "step": 9070 }, { "epoch": 2.9055999999999997, "grad_norm": 100.07563781738281, "learning_rate": 6.157066551176555e-05, "loss": 9.001, "step": 9080 }, { "epoch": 2.9088000000000003, "grad_norm": 51.00460433959961, "learning_rate": 6.152254463211588e-05, "loss": 11.279, "step": 9090 }, { "epoch": 2.912, "grad_norm": 148.40980529785156, "learning_rate": 6.14744237524662e-05, "loss": 6.6653, "step": 9100 }, { "epoch": 2.9152, "grad_norm": 57.62782287597656, "learning_rate": 6.142630287281653e-05, "loss": 5.9029, "step": 9110 }, { "epoch": 2.9184, "grad_norm": 79.69389343261719, "learning_rate": 6.137818199316684e-05, "loss": 12.0028, "step": 9120 }, { "epoch": 2.9215999999999998, "grad_norm": 198.62611389160156, "learning_rate": 6.133006111351716e-05, "loss": 11.2852, "step": 9130 }, { "epoch": 2.9248, "grad_norm": 215.56948852539062, "learning_rate": 6.128194023386747e-05, "loss": 8.2257, "step": 9140 }, { "epoch": 2.928, "grad_norm": 118.26122283935547, "learning_rate": 6.12338193542178e-05, "loss": 5.5315, "step": 9150 }, { "epoch": 2.9312, "grad_norm": 215.2784423828125, "learning_rate": 6.118569847456812e-05, "loss": 10.541, "step": 9160 }, { "epoch": 2.9344, "grad_norm": 321.5195617675781, "learning_rate": 6.113757759491843e-05, "loss": 12.1365, "step": 9170 }, { "epoch": 2.9375999999999998, "grad_norm": 246.36727905273438, "learning_rate": 6.108945671526875e-05, "loss": 17.8036, "step": 9180 }, { "epoch": 2.9408, "grad_norm": 589.6262817382812, "learning_rate": 6.104133583561907e-05, "loss": 9.8468, "step": 9190 }, { "epoch": 2.944, "grad_norm": 158.98361206054688, "learning_rate": 6.09932149559694e-05, "loss": 11.5522, "step": 9200 }, { "epoch": 2.9472, "grad_norm": 170.11764526367188, "learning_rate": 6.094509407631972e-05, "loss": 4.5428, "step": 9210 }, { "epoch": 2.9504, "grad_norm": 114.11456298828125, "learning_rate": 6.0896973196670035e-05, "loss": 6.8285, "step": 9220 }, { "epoch": 2.9536, "grad_norm": 214.0742645263672, "learning_rate": 6.084885231702036e-05, "loss": 12.7053, "step": 9230 }, { "epoch": 2.9568, "grad_norm": 87.31275177001953, "learning_rate": 6.0800731437370674e-05, "loss": 8.631, "step": 9240 }, { "epoch": 2.96, "grad_norm": 137.9563446044922, "learning_rate": 6.0752610557721e-05, "loss": 7.4718, "step": 9250 }, { "epoch": 2.9632, "grad_norm": 22.895612716674805, "learning_rate": 6.070448967807132e-05, "loss": 7.7254, "step": 9260 }, { "epoch": 2.9664, "grad_norm": 477.72430419921875, "learning_rate": 6.0656368798421635e-05, "loss": 13.8774, "step": 9270 }, { "epoch": 2.9696, "grad_norm": 200.23208618164062, "learning_rate": 6.060824791877196e-05, "loss": 12.0605, "step": 9280 }, { "epoch": 2.9728, "grad_norm": 190.87347412109375, "learning_rate": 6.056012703912228e-05, "loss": 13.0139, "step": 9290 }, { "epoch": 2.976, "grad_norm": 165.8858184814453, "learning_rate": 6.05120061594726e-05, "loss": 9.4468, "step": 9300 }, { "epoch": 2.9792, "grad_norm": 252.0768585205078, "learning_rate": 6.046388527982292e-05, "loss": 8.5149, "step": 9310 }, { "epoch": 2.9824, "grad_norm": 191.50491333007812, "learning_rate": 6.0415764400173235e-05, "loss": 11.8007, "step": 9320 }, { "epoch": 2.9856, "grad_norm": 14.657149314880371, "learning_rate": 6.036764352052355e-05, "loss": 7.9021, "step": 9330 }, { "epoch": 2.9888, "grad_norm": 298.49639892578125, "learning_rate": 6.031952264087388e-05, "loss": 12.5622, "step": 9340 }, { "epoch": 2.992, "grad_norm": 169.65924072265625, "learning_rate": 6.02714017612242e-05, "loss": 12.6842, "step": 9350 }, { "epoch": 2.9952, "grad_norm": 106.19922637939453, "learning_rate": 6.022328088157452e-05, "loss": 10.1612, "step": 9360 }, { "epoch": 2.9984, "grad_norm": 438.81011962890625, "learning_rate": 6.0175160001924835e-05, "loss": 5.5419, "step": 9370 }, { "epoch": 3.0016, "grad_norm": 196.07020568847656, "learning_rate": 6.012703912227515e-05, "loss": 5.5144, "step": 9380 }, { "epoch": 3.0048, "grad_norm": 77.8779525756836, "learning_rate": 6.007891824262548e-05, "loss": 9.8386, "step": 9390 }, { "epoch": 3.008, "grad_norm": 86.0557861328125, "learning_rate": 6.0030797362975796e-05, "loss": 12.2776, "step": 9400 }, { "epoch": 3.0112, "grad_norm": 121.39236450195312, "learning_rate": 5.998267648332612e-05, "loss": 14.5853, "step": 9410 }, { "epoch": 3.0144, "grad_norm": 52.361026763916016, "learning_rate": 5.9934555603676435e-05, "loss": 8.938, "step": 9420 }, { "epoch": 3.0176, "grad_norm": 33.73550033569336, "learning_rate": 5.9886434724026764e-05, "loss": 2.9664, "step": 9430 }, { "epoch": 3.0208, "grad_norm": 196.17001342773438, "learning_rate": 5.983831384437708e-05, "loss": 15.761, "step": 9440 }, { "epoch": 3.024, "grad_norm": 4.483468532562256, "learning_rate": 5.9790192964727396e-05, "loss": 6.4856, "step": 9450 }, { "epoch": 3.0272, "grad_norm": 257.7809753417969, "learning_rate": 5.974207208507772e-05, "loss": 18.8768, "step": 9460 }, { "epoch": 3.0304, "grad_norm": 34.08185958862305, "learning_rate": 5.9693951205428035e-05, "loss": 5.8329, "step": 9470 }, { "epoch": 3.0336, "grad_norm": 176.45867919921875, "learning_rate": 5.9645830325778365e-05, "loss": 8.4861, "step": 9480 }, { "epoch": 3.0368, "grad_norm": 283.4518127441406, "learning_rate": 5.959770944612868e-05, "loss": 6.1794, "step": 9490 }, { "epoch": 3.04, "grad_norm": 211.9278564453125, "learning_rate": 5.9549588566478996e-05, "loss": 14.5167, "step": 9500 }, { "epoch": 3.0432, "grad_norm": 526.6793212890625, "learning_rate": 5.950146768682931e-05, "loss": 8.149, "step": 9510 }, { "epoch": 3.0464, "grad_norm": 142.95130920410156, "learning_rate": 5.9453346807179635e-05, "loss": 8.76, "step": 9520 }, { "epoch": 3.0496, "grad_norm": 57.34198760986328, "learning_rate": 5.9405225927529965e-05, "loss": 16.0683, "step": 9530 }, { "epoch": 3.0528, "grad_norm": 9.715970993041992, "learning_rate": 5.935710504788028e-05, "loss": 10.4715, "step": 9540 }, { "epoch": 3.056, "grad_norm": 105.94384765625, "learning_rate": 5.9308984168230596e-05, "loss": 1.8357, "step": 9550 }, { "epoch": 3.0592, "grad_norm": 143.14691162109375, "learning_rate": 5.926086328858091e-05, "loss": 13.621, "step": 9560 }, { "epoch": 3.0624, "grad_norm": 77.60493469238281, "learning_rate": 5.921274240893124e-05, "loss": 5.46, "step": 9570 }, { "epoch": 3.0656, "grad_norm": 260.6083984375, "learning_rate": 5.916462152928156e-05, "loss": 7.1788, "step": 9580 }, { "epoch": 3.0688, "grad_norm": 58.237548828125, "learning_rate": 5.911650064963188e-05, "loss": 7.8488, "step": 9590 }, { "epoch": 3.072, "grad_norm": 79.41767120361328, "learning_rate": 5.9068379769982196e-05, "loss": 12.8096, "step": 9600 }, { "epoch": 3.0752, "grad_norm": 227.4895477294922, "learning_rate": 5.902025889033251e-05, "loss": 6.1908, "step": 9610 }, { "epoch": 3.0784, "grad_norm": 192.41859436035156, "learning_rate": 5.897213801068284e-05, "loss": 12.3739, "step": 9620 }, { "epoch": 3.0816, "grad_norm": 35.24849319458008, "learning_rate": 5.892401713103316e-05, "loss": 9.8204, "step": 9630 }, { "epoch": 3.0848, "grad_norm": 391.391845703125, "learning_rate": 5.887589625138348e-05, "loss": 8.8886, "step": 9640 }, { "epoch": 3.088, "grad_norm": 217.22225952148438, "learning_rate": 5.8827775371733796e-05, "loss": 8.5184, "step": 9650 }, { "epoch": 3.0912, "grad_norm": 259.8196105957031, "learning_rate": 5.877965449208411e-05, "loss": 8.491, "step": 9660 }, { "epoch": 3.0944, "grad_norm": 595.046630859375, "learning_rate": 5.873153361243444e-05, "loss": 16.5511, "step": 9670 }, { "epoch": 3.0976, "grad_norm": 250.98202514648438, "learning_rate": 5.868341273278476e-05, "loss": 10.0545, "step": 9680 }, { "epoch": 3.1008, "grad_norm": 51.79408264160156, "learning_rate": 5.8635291853135074e-05, "loss": 12.4288, "step": 9690 }, { "epoch": 3.104, "grad_norm": 123.24848175048828, "learning_rate": 5.8587170973485397e-05, "loss": 13.6866, "step": 9700 }, { "epoch": 3.1072, "grad_norm": 120.61489868164062, "learning_rate": 5.8539050093835726e-05, "loss": 10.3918, "step": 9710 }, { "epoch": 3.1104, "grad_norm": 51.93746566772461, "learning_rate": 5.849092921418604e-05, "loss": 11.0926, "step": 9720 }, { "epoch": 3.1136, "grad_norm": 335.9434509277344, "learning_rate": 5.844280833453636e-05, "loss": 7.6178, "step": 9730 }, { "epoch": 3.1168, "grad_norm": 221.61582946777344, "learning_rate": 5.8394687454886674e-05, "loss": 6.6475, "step": 9740 }, { "epoch": 3.12, "grad_norm": 83.53640747070312, "learning_rate": 5.8346566575236997e-05, "loss": 6.6486, "step": 9750 }, { "epoch": 3.1232, "grad_norm": 118.57586669921875, "learning_rate": 5.829844569558732e-05, "loss": 4.6365, "step": 9760 }, { "epoch": 3.1264, "grad_norm": 86.72565460205078, "learning_rate": 5.825032481593764e-05, "loss": 7.7041, "step": 9770 }, { "epoch": 3.1296, "grad_norm": 427.0348205566406, "learning_rate": 5.820220393628796e-05, "loss": 12.082, "step": 9780 }, { "epoch": 3.1328, "grad_norm": 294.4844055175781, "learning_rate": 5.8154083056638274e-05, "loss": 7.5574, "step": 9790 }, { "epoch": 3.136, "grad_norm": 277.423828125, "learning_rate": 5.810596217698859e-05, "loss": 10.0281, "step": 9800 }, { "epoch": 3.1391999999999998, "grad_norm": 510.573486328125, "learning_rate": 5.805784129733892e-05, "loss": 15.4906, "step": 9810 }, { "epoch": 3.1424, "grad_norm": 169.46841430664062, "learning_rate": 5.800972041768924e-05, "loss": 8.88, "step": 9820 }, { "epoch": 3.1456, "grad_norm": 95.35476684570312, "learning_rate": 5.796159953803956e-05, "loss": 11.3824, "step": 9830 }, { "epoch": 3.1488, "grad_norm": 133.87979125976562, "learning_rate": 5.7913478658389874e-05, "loss": 7.7908, "step": 9840 }, { "epoch": 3.152, "grad_norm": 92.72872161865234, "learning_rate": 5.78653577787402e-05, "loss": 5.4827, "step": 9850 }, { "epoch": 3.1552, "grad_norm": 186.6851806640625, "learning_rate": 5.781723689909052e-05, "loss": 11.7598, "step": 9860 }, { "epoch": 3.1584, "grad_norm": 526.7644653320312, "learning_rate": 5.776911601944084e-05, "loss": 7.5514, "step": 9870 }, { "epoch": 3.1616, "grad_norm": 453.7850341796875, "learning_rate": 5.772099513979116e-05, "loss": 6.7458, "step": 9880 }, { "epoch": 3.1648, "grad_norm": 113.1493911743164, "learning_rate": 5.7672874260141474e-05, "loss": 13.0659, "step": 9890 }, { "epoch": 3.168, "grad_norm": 88.48123168945312, "learning_rate": 5.7624753380491803e-05, "loss": 14.8817, "step": 9900 }, { "epoch": 3.1712, "grad_norm": 260.2159423828125, "learning_rate": 5.757663250084212e-05, "loss": 11.9646, "step": 9910 }, { "epoch": 3.1744, "grad_norm": 404.491943359375, "learning_rate": 5.7528511621192435e-05, "loss": 13.7903, "step": 9920 }, { "epoch": 3.1776, "grad_norm": 122.68408203125, "learning_rate": 5.748039074154276e-05, "loss": 11.6551, "step": 9930 }, { "epoch": 3.1808, "grad_norm": 36.266178131103516, "learning_rate": 5.7432269861893074e-05, "loss": 9.2347, "step": 9940 }, { "epoch": 3.184, "grad_norm": 249.447021484375, "learning_rate": 5.7384148982243403e-05, "loss": 14.1425, "step": 9950 }, { "epoch": 3.1872, "grad_norm": 156.2315216064453, "learning_rate": 5.733602810259372e-05, "loss": 6.5789, "step": 9960 }, { "epoch": 3.1904, "grad_norm": 215.3241424560547, "learning_rate": 5.7287907222944035e-05, "loss": 7.8892, "step": 9970 }, { "epoch": 3.1936, "grad_norm": 289.9732666015625, "learning_rate": 5.723978634329436e-05, "loss": 18.2574, "step": 9980 }, { "epoch": 3.1968, "grad_norm": 361.384521484375, "learning_rate": 5.719166546364468e-05, "loss": 15.7979, "step": 9990 }, { "epoch": 3.2, "grad_norm": 206.41119384765625, "learning_rate": 5.7143544583995003e-05, "loss": 6.6047, "step": 10000 }, { "epoch": 3.2032, "grad_norm": 194.72552490234375, "learning_rate": 5.709542370434532e-05, "loss": 13.4965, "step": 10010 }, { "epoch": 3.2064, "grad_norm": 107.0044937133789, "learning_rate": 5.7047302824695635e-05, "loss": 9.4915, "step": 10020 }, { "epoch": 3.2096, "grad_norm": 87.2500991821289, "learning_rate": 5.699918194504595e-05, "loss": 12.0628, "step": 10030 }, { "epoch": 3.2128, "grad_norm": 37.524967193603516, "learning_rate": 5.695106106539628e-05, "loss": 10.7553, "step": 10040 }, { "epoch": 3.216, "grad_norm": 563.2716674804688, "learning_rate": 5.6902940185746603e-05, "loss": 8.6043, "step": 10050 }, { "epoch": 3.2192, "grad_norm": 286.1072082519531, "learning_rate": 5.685481930609692e-05, "loss": 9.7686, "step": 10060 }, { "epoch": 3.2224, "grad_norm": 59.3211784362793, "learning_rate": 5.6806698426447235e-05, "loss": 12.8586, "step": 10070 }, { "epoch": 3.2256, "grad_norm": 25.64715003967285, "learning_rate": 5.675857754679755e-05, "loss": 7.7884, "step": 10080 }, { "epoch": 3.2288, "grad_norm": 226.26136779785156, "learning_rate": 5.671045666714788e-05, "loss": 11.6915, "step": 10090 }, { "epoch": 3.232, "grad_norm": 120.35904693603516, "learning_rate": 5.66623357874982e-05, "loss": 8.4681, "step": 10100 }, { "epoch": 3.2352, "grad_norm": 88.03390502929688, "learning_rate": 5.661421490784852e-05, "loss": 5.2364, "step": 10110 }, { "epoch": 3.2384, "grad_norm": 164.8140411376953, "learning_rate": 5.6566094028198835e-05, "loss": 8.9016, "step": 10120 }, { "epoch": 3.2416, "grad_norm": 10.532370567321777, "learning_rate": 5.6517973148549165e-05, "loss": 7.9191, "step": 10130 }, { "epoch": 3.2448, "grad_norm": 553.1741943359375, "learning_rate": 5.646985226889948e-05, "loss": 13.3998, "step": 10140 }, { "epoch": 3.248, "grad_norm": 120.67591857910156, "learning_rate": 5.64217313892498e-05, "loss": 10.1969, "step": 10150 }, { "epoch": 3.2512, "grad_norm": 80.25688934326172, "learning_rate": 5.637361050960012e-05, "loss": 6.9874, "step": 10160 }, { "epoch": 3.2544, "grad_norm": 76.02688598632812, "learning_rate": 5.6325489629950435e-05, "loss": 9.6038, "step": 10170 }, { "epoch": 3.2576, "grad_norm": 90.9759292602539, "learning_rate": 5.6277368750300765e-05, "loss": 6.3408, "step": 10180 }, { "epoch": 3.2608, "grad_norm": 142.73646545410156, "learning_rate": 5.622924787065108e-05, "loss": 5.385, "step": 10190 }, { "epoch": 3.2640000000000002, "grad_norm": 379.7109069824219, "learning_rate": 5.61811269910014e-05, "loss": 19.6901, "step": 10200 }, { "epoch": 3.2672, "grad_norm": 65.48287963867188, "learning_rate": 5.613300611135171e-05, "loss": 11.9598, "step": 10210 }, { "epoch": 3.2704, "grad_norm": 61.98320388793945, "learning_rate": 5.6084885231702035e-05, "loss": 9.7209, "step": 10220 }, { "epoch": 3.2736, "grad_norm": 104.31575775146484, "learning_rate": 5.6036764352052365e-05, "loss": 6.2051, "step": 10230 }, { "epoch": 3.2768, "grad_norm": 87.57137298583984, "learning_rate": 5.598864347240268e-05, "loss": 6.3533, "step": 10240 }, { "epoch": 3.2800000000000002, "grad_norm": 115.27996826171875, "learning_rate": 5.5940522592753e-05, "loss": 13.3796, "step": 10250 }, { "epoch": 3.2832, "grad_norm": 117.89083099365234, "learning_rate": 5.589240171310331e-05, "loss": 6.7985, "step": 10260 }, { "epoch": 3.2864, "grad_norm": 243.5779571533203, "learning_rate": 5.5844280833453635e-05, "loss": 6.4181, "step": 10270 }, { "epoch": 3.2896, "grad_norm": 24.41347312927246, "learning_rate": 5.579615995380396e-05, "loss": 8.45, "step": 10280 }, { "epoch": 3.2928, "grad_norm": 25.959936141967773, "learning_rate": 5.574803907415428e-05, "loss": 5.3181, "step": 10290 }, { "epoch": 3.296, "grad_norm": 493.1497497558594, "learning_rate": 5.56999181945046e-05, "loss": 18.3033, "step": 10300 }, { "epoch": 3.2992, "grad_norm": 85.99234008789062, "learning_rate": 5.565179731485491e-05, "loss": 11.5838, "step": 10310 }, { "epoch": 3.3024, "grad_norm": 685.2490234375, "learning_rate": 5.560367643520524e-05, "loss": 12.0466, "step": 10320 }, { "epoch": 3.3056, "grad_norm": 32.04133605957031, "learning_rate": 5.555555555555556e-05, "loss": 6.0094, "step": 10330 }, { "epoch": 3.3088, "grad_norm": 471.3772277832031, "learning_rate": 5.550743467590588e-05, "loss": 8.1759, "step": 10340 }, { "epoch": 3.312, "grad_norm": 78.36851501464844, "learning_rate": 5.54593137962562e-05, "loss": 8.19, "step": 10350 }, { "epoch": 3.3152, "grad_norm": 32.85549545288086, "learning_rate": 5.541119291660651e-05, "loss": 11.6489, "step": 10360 }, { "epoch": 3.3184, "grad_norm": 404.6401672363281, "learning_rate": 5.536307203695684e-05, "loss": 13.6418, "step": 10370 }, { "epoch": 3.3216, "grad_norm": 199.50653076171875, "learning_rate": 5.531495115730716e-05, "loss": 5.9807, "step": 10380 }, { "epoch": 3.3247999999999998, "grad_norm": 284.4595031738281, "learning_rate": 5.5266830277657474e-05, "loss": 5.6027, "step": 10390 }, { "epoch": 3.328, "grad_norm": 230.23902893066406, "learning_rate": 5.52187093980078e-05, "loss": 14.2283, "step": 10400 }, { "epoch": 3.3312, "grad_norm": 89.29363250732422, "learning_rate": 5.517058851835811e-05, "loss": 7.5047, "step": 10410 }, { "epoch": 3.3344, "grad_norm": 326.7586364746094, "learning_rate": 5.512246763870844e-05, "loss": 7.0058, "step": 10420 }, { "epoch": 3.3376, "grad_norm": 111.60249328613281, "learning_rate": 5.507434675905876e-05, "loss": 9.9023, "step": 10430 }, { "epoch": 3.3407999999999998, "grad_norm": 374.09100341796875, "learning_rate": 5.5026225879409074e-05, "loss": 9.0989, "step": 10440 }, { "epoch": 3.344, "grad_norm": 119.5875244140625, "learning_rate": 5.49781049997594e-05, "loss": 5.111, "step": 10450 }, { "epoch": 3.3472, "grad_norm": 66.82816314697266, "learning_rate": 5.4929984120109726e-05, "loss": 12.2477, "step": 10460 }, { "epoch": 3.3504, "grad_norm": 508.3679504394531, "learning_rate": 5.488186324046004e-05, "loss": 10.9882, "step": 10470 }, { "epoch": 3.3536, "grad_norm": 63.36748123168945, "learning_rate": 5.483374236081036e-05, "loss": 6.9128, "step": 10480 }, { "epoch": 3.3568, "grad_norm": 13.34519100189209, "learning_rate": 5.4785621481160674e-05, "loss": 4.3515, "step": 10490 }, { "epoch": 3.36, "grad_norm": 64.90548706054688, "learning_rate": 5.4737500601511e-05, "loss": 4.3658, "step": 10500 }, { "epoch": 3.3632, "grad_norm": 181.36500549316406, "learning_rate": 5.468937972186132e-05, "loss": 12.8478, "step": 10510 }, { "epoch": 3.3664, "grad_norm": 29.35219383239746, "learning_rate": 5.464125884221164e-05, "loss": 10.0206, "step": 10520 }, { "epoch": 3.3696, "grad_norm": 377.886962890625, "learning_rate": 5.459313796256196e-05, "loss": 12.2786, "step": 10530 }, { "epoch": 3.3728, "grad_norm": 119.6103515625, "learning_rate": 5.4545017082912274e-05, "loss": 6.8772, "step": 10540 }, { "epoch": 3.376, "grad_norm": 242.71517944335938, "learning_rate": 5.449689620326259e-05, "loss": 6.4386, "step": 10550 }, { "epoch": 3.3792, "grad_norm": 366.473388671875, "learning_rate": 5.444877532361292e-05, "loss": 8.0935, "step": 10560 }, { "epoch": 3.3824, "grad_norm": 42.36399841308594, "learning_rate": 5.440065444396324e-05, "loss": 14.8502, "step": 10570 }, { "epoch": 3.3856, "grad_norm": 14.890262603759766, "learning_rate": 5.435253356431356e-05, "loss": 7.4552, "step": 10580 }, { "epoch": 3.3888, "grad_norm": 214.78651428222656, "learning_rate": 5.4304412684663874e-05, "loss": 10.09, "step": 10590 }, { "epoch": 3.392, "grad_norm": 56.47807312011719, "learning_rate": 5.4256291805014204e-05, "loss": 13.6934, "step": 10600 }, { "epoch": 3.3952, "grad_norm": 494.8249206542969, "learning_rate": 5.420817092536452e-05, "loss": 18.7878, "step": 10610 }, { "epoch": 3.3984, "grad_norm": 202.69296264648438, "learning_rate": 5.4160050045714836e-05, "loss": 6.4601, "step": 10620 }, { "epoch": 3.4016, "grad_norm": 166.670166015625, "learning_rate": 5.411192916606516e-05, "loss": 7.9081, "step": 10630 }, { "epoch": 3.4048, "grad_norm": 546.6384887695312, "learning_rate": 5.4063808286415474e-05, "loss": 9.5603, "step": 10640 }, { "epoch": 3.408, "grad_norm": 1243.3924560546875, "learning_rate": 5.4015687406765804e-05, "loss": 17.8355, "step": 10650 }, { "epoch": 3.4112, "grad_norm": 134.7431182861328, "learning_rate": 5.396756652711612e-05, "loss": 12.9075, "step": 10660 }, { "epoch": 3.4144, "grad_norm": 197.9523162841797, "learning_rate": 5.3919445647466436e-05, "loss": 7.779, "step": 10670 }, { "epoch": 3.4176, "grad_norm": 24.598529815673828, "learning_rate": 5.387132476781676e-05, "loss": 13.5682, "step": 10680 }, { "epoch": 3.4208, "grad_norm": 69.93231964111328, "learning_rate": 5.3823203888167074e-05, "loss": 6.4275, "step": 10690 }, { "epoch": 3.424, "grad_norm": 113.85429382324219, "learning_rate": 5.3775083008517404e-05, "loss": 9.9581, "step": 10700 }, { "epoch": 3.4272, "grad_norm": 92.768310546875, "learning_rate": 5.372696212886772e-05, "loss": 7.5728, "step": 10710 }, { "epoch": 3.4304, "grad_norm": 162.51593017578125, "learning_rate": 5.3678841249218036e-05, "loss": 5.58, "step": 10720 }, { "epoch": 3.4336, "grad_norm": 309.459228515625, "learning_rate": 5.363072036956835e-05, "loss": 9.3868, "step": 10730 }, { "epoch": 3.4368, "grad_norm": 47.860679626464844, "learning_rate": 5.358259948991868e-05, "loss": 6.5203, "step": 10740 }, { "epoch": 3.44, "grad_norm": 453.8770446777344, "learning_rate": 5.3534478610269004e-05, "loss": 14.036, "step": 10750 }, { "epoch": 3.4432, "grad_norm": 154.74610900878906, "learning_rate": 5.348635773061932e-05, "loss": 7.4603, "step": 10760 }, { "epoch": 3.4464, "grad_norm": 50.46474075317383, "learning_rate": 5.3438236850969636e-05, "loss": 11.4914, "step": 10770 }, { "epoch": 3.4496, "grad_norm": 145.1925048828125, "learning_rate": 5.339011597131995e-05, "loss": 8.4293, "step": 10780 }, { "epoch": 3.4528, "grad_norm": 114.92239379882812, "learning_rate": 5.334199509167028e-05, "loss": 14.4253, "step": 10790 }, { "epoch": 3.456, "grad_norm": 49.29560089111328, "learning_rate": 5.32938742120206e-05, "loss": 7.24, "step": 10800 }, { "epoch": 3.4592, "grad_norm": 47.80524826049805, "learning_rate": 5.324575333237092e-05, "loss": 11.6426, "step": 10810 }, { "epoch": 3.4624, "grad_norm": 126.1449966430664, "learning_rate": 5.3197632452721236e-05, "loss": 9.9049, "step": 10820 }, { "epoch": 3.4656000000000002, "grad_norm": 351.4503479003906, "learning_rate": 5.314951157307155e-05, "loss": 11.8798, "step": 10830 }, { "epoch": 3.4688, "grad_norm": 144.12814331054688, "learning_rate": 5.310139069342188e-05, "loss": 17.3417, "step": 10840 }, { "epoch": 3.472, "grad_norm": 125.06886291503906, "learning_rate": 5.30532698137722e-05, "loss": 8.5307, "step": 10850 }, { "epoch": 3.4752, "grad_norm": 451.85821533203125, "learning_rate": 5.300514893412252e-05, "loss": 11.6163, "step": 10860 }, { "epoch": 3.4784, "grad_norm": 259.54736328125, "learning_rate": 5.2957028054472836e-05, "loss": 20.1392, "step": 10870 }, { "epoch": 3.4816, "grad_norm": 613.2321166992188, "learning_rate": 5.2908907174823165e-05, "loss": 15.7588, "step": 10880 }, { "epoch": 3.4848, "grad_norm": 121.89942169189453, "learning_rate": 5.286078629517348e-05, "loss": 14.5637, "step": 10890 }, { "epoch": 3.488, "grad_norm": 264.4930114746094, "learning_rate": 5.28126654155238e-05, "loss": 21.2455, "step": 10900 }, { "epoch": 3.4912, "grad_norm": 137.4848175048828, "learning_rate": 5.276454453587411e-05, "loss": 7.4833, "step": 10910 }, { "epoch": 3.4944, "grad_norm": 35.75732421875, "learning_rate": 5.2716423656224436e-05, "loss": 5.426, "step": 10920 }, { "epoch": 3.4976, "grad_norm": 151.94093322753906, "learning_rate": 5.2668302776574765e-05, "loss": 6.9642, "step": 10930 }, { "epoch": 3.5008, "grad_norm": 328.2507019042969, "learning_rate": 5.262018189692508e-05, "loss": 11.0252, "step": 10940 }, { "epoch": 3.504, "grad_norm": 109.06185150146484, "learning_rate": 5.25720610172754e-05, "loss": 2.5377, "step": 10950 }, { "epoch": 3.5072, "grad_norm": 104.96216583251953, "learning_rate": 5.252394013762571e-05, "loss": 8.2971, "step": 10960 }, { "epoch": 3.5103999999999997, "grad_norm": 168.24444580078125, "learning_rate": 5.2475819257976036e-05, "loss": 14.4615, "step": 10970 }, { "epoch": 3.5136, "grad_norm": 374.3692932128906, "learning_rate": 5.2427698378326365e-05, "loss": 7.7937, "step": 10980 }, { "epoch": 3.5168, "grad_norm": 123.0724868774414, "learning_rate": 5.237957749867668e-05, "loss": 9.3746, "step": 10990 }, { "epoch": 3.52, "grad_norm": 413.34228515625, "learning_rate": 5.2331456619027e-05, "loss": 7.4134, "step": 11000 }, { "epoch": 3.5232, "grad_norm": 163.57693481445312, "learning_rate": 5.228333573937731e-05, "loss": 7.235, "step": 11010 }, { "epoch": 3.5263999999999998, "grad_norm": 193.9126739501953, "learning_rate": 5.223521485972764e-05, "loss": 11.2848, "step": 11020 }, { "epoch": 3.5296, "grad_norm": 57.687740325927734, "learning_rate": 5.218709398007796e-05, "loss": 17.3143, "step": 11030 }, { "epoch": 3.5328, "grad_norm": 51.359535217285156, "learning_rate": 5.213897310042828e-05, "loss": 5.1285, "step": 11040 }, { "epoch": 3.536, "grad_norm": 55.714202880859375, "learning_rate": 5.20908522207786e-05, "loss": 9.8882, "step": 11050 }, { "epoch": 3.5392, "grad_norm": 117.07864379882812, "learning_rate": 5.204273134112891e-05, "loss": 12.6358, "step": 11060 }, { "epoch": 3.5423999999999998, "grad_norm": 256.4263916015625, "learning_rate": 5.199461046147924e-05, "loss": 9.222, "step": 11070 }, { "epoch": 3.5456, "grad_norm": 78.67411041259766, "learning_rate": 5.194648958182956e-05, "loss": 9.3217, "step": 11080 }, { "epoch": 3.5488, "grad_norm": 10.878169059753418, "learning_rate": 5.189836870217988e-05, "loss": 8.9555, "step": 11090 }, { "epoch": 3.552, "grad_norm": 103.51618194580078, "learning_rate": 5.18502478225302e-05, "loss": 3.6041, "step": 11100 }, { "epoch": 3.5552, "grad_norm": 135.8647918701172, "learning_rate": 5.180212694288051e-05, "loss": 6.7043, "step": 11110 }, { "epoch": 3.5584, "grad_norm": 315.6936950683594, "learning_rate": 5.175400606323084e-05, "loss": 10.6699, "step": 11120 }, { "epoch": 3.5616, "grad_norm": 730.7633056640625, "learning_rate": 5.170588518358116e-05, "loss": 12.7053, "step": 11130 }, { "epoch": 3.5648, "grad_norm": 81.5301513671875, "learning_rate": 5.1657764303931475e-05, "loss": 7.995, "step": 11140 }, { "epoch": 3.568, "grad_norm": 110.9262924194336, "learning_rate": 5.16096434242818e-05, "loss": 5.3926, "step": 11150 }, { "epoch": 3.5712, "grad_norm": 363.3927307128906, "learning_rate": 5.156152254463213e-05, "loss": 9.6087, "step": 11160 }, { "epoch": 3.5744, "grad_norm": 87.52037811279297, "learning_rate": 5.151340166498244e-05, "loss": 5.4234, "step": 11170 }, { "epoch": 3.5776, "grad_norm": 30.732654571533203, "learning_rate": 5.146528078533276e-05, "loss": 3.9724, "step": 11180 }, { "epoch": 3.5808, "grad_norm": 84.24311065673828, "learning_rate": 5.1417159905683075e-05, "loss": 3.7602, "step": 11190 }, { "epoch": 3.584, "grad_norm": 37.27490234375, "learning_rate": 5.13690390260334e-05, "loss": 6.3771, "step": 11200 }, { "epoch": 3.5872, "grad_norm": 179.7301483154297, "learning_rate": 5.132091814638372e-05, "loss": 11.2909, "step": 11210 }, { "epoch": 3.5904, "grad_norm": 133.4637451171875, "learning_rate": 5.127279726673404e-05, "loss": 12.0138, "step": 11220 }, { "epoch": 3.5936, "grad_norm": 348.7917175292969, "learning_rate": 5.122467638708436e-05, "loss": 6.1237, "step": 11230 }, { "epoch": 3.5968, "grad_norm": 145.38336181640625, "learning_rate": 5.1176555507434675e-05, "loss": 15.1322, "step": 11240 }, { "epoch": 3.6, "grad_norm": 74.31438446044922, "learning_rate": 5.112843462778499e-05, "loss": 11.6989, "step": 11250 }, { "epoch": 3.6032, "grad_norm": 181.16127014160156, "learning_rate": 5.108031374813532e-05, "loss": 7.7492, "step": 11260 }, { "epoch": 3.6064, "grad_norm": 82.90265655517578, "learning_rate": 5.103219286848564e-05, "loss": 6.1079, "step": 11270 }, { "epoch": 3.6096, "grad_norm": 67.36904907226562, "learning_rate": 5.098407198883596e-05, "loss": 6.9229, "step": 11280 }, { "epoch": 3.6128, "grad_norm": 90.85781860351562, "learning_rate": 5.0935951109186275e-05, "loss": 7.4309, "step": 11290 }, { "epoch": 3.616, "grad_norm": 327.0402526855469, "learning_rate": 5.0887830229536604e-05, "loss": 20.7902, "step": 11300 }, { "epoch": 3.6192, "grad_norm": 99.4246597290039, "learning_rate": 5.083970934988692e-05, "loss": 6.7063, "step": 11310 }, { "epoch": 3.6224, "grad_norm": 199.2280731201172, "learning_rate": 5.0791588470237236e-05, "loss": 6.7176, "step": 11320 }, { "epoch": 3.6256, "grad_norm": 123.94599914550781, "learning_rate": 5.074346759058756e-05, "loss": 9.6116, "step": 11330 }, { "epoch": 3.6288, "grad_norm": 175.60699462890625, "learning_rate": 5.0695346710937875e-05, "loss": 9.9806, "step": 11340 }, { "epoch": 3.632, "grad_norm": 49.826377868652344, "learning_rate": 5.0647225831288204e-05, "loss": 9.7228, "step": 11350 }, { "epoch": 3.6352, "grad_norm": 183.75050354003906, "learning_rate": 5.059910495163852e-05, "loss": 11.8958, "step": 11360 }, { "epoch": 3.6384, "grad_norm": 29.84248924255371, "learning_rate": 5.0550984071988836e-05, "loss": 4.0395, "step": 11370 }, { "epoch": 3.6416, "grad_norm": 194.1902313232422, "learning_rate": 5.050286319233916e-05, "loss": 8.661, "step": 11380 }, { "epoch": 3.6448, "grad_norm": 105.26177215576172, "learning_rate": 5.0454742312689475e-05, "loss": 10.23, "step": 11390 }, { "epoch": 3.648, "grad_norm": 350.8613586425781, "learning_rate": 5.0406621433039804e-05, "loss": 8.1449, "step": 11400 }, { "epoch": 3.6512000000000002, "grad_norm": 178.09446716308594, "learning_rate": 5.035850055339012e-05, "loss": 7.9261, "step": 11410 }, { "epoch": 3.6544, "grad_norm": 55.64959716796875, "learning_rate": 5.0310379673740436e-05, "loss": 4.3709, "step": 11420 }, { "epoch": 3.6576, "grad_norm": 218.20883178710938, "learning_rate": 5.026225879409075e-05, "loss": 2.6882, "step": 11430 }, { "epoch": 3.6608, "grad_norm": 246.2463836669922, "learning_rate": 5.0214137914441075e-05, "loss": 8.0737, "step": 11440 }, { "epoch": 3.664, "grad_norm": 26.20545768737793, "learning_rate": 5.0166017034791404e-05, "loss": 10.1952, "step": 11450 }, { "epoch": 3.6672000000000002, "grad_norm": 51.214988708496094, "learning_rate": 5.011789615514172e-05, "loss": 10.8203, "step": 11460 }, { "epoch": 3.6704, "grad_norm": 28.66921043395996, "learning_rate": 5.0069775275492036e-05, "loss": 3.5207, "step": 11470 }, { "epoch": 3.6736, "grad_norm": 274.5750427246094, "learning_rate": 5.002165439584235e-05, "loss": 9.9648, "step": 11480 }, { "epoch": 3.6768, "grad_norm": 293.16998291015625, "learning_rate": 4.9973533516192675e-05, "loss": 11.7132, "step": 11490 }, { "epoch": 3.68, "grad_norm": 116.18402862548828, "learning_rate": 4.9925412636543e-05, "loss": 12.1652, "step": 11500 }, { "epoch": 3.6832000000000003, "grad_norm": 61.41278839111328, "learning_rate": 4.987729175689332e-05, "loss": 2.4297, "step": 11510 }, { "epoch": 3.6864, "grad_norm": 31.924379348754883, "learning_rate": 4.9829170877243636e-05, "loss": 7.9264, "step": 11520 }, { "epoch": 3.6896, "grad_norm": 113.23978424072266, "learning_rate": 4.978104999759396e-05, "loss": 8.5131, "step": 11530 }, { "epoch": 3.6928, "grad_norm": 121.65467834472656, "learning_rate": 4.9732929117944275e-05, "loss": 5.9042, "step": 11540 }, { "epoch": 3.6959999999999997, "grad_norm": 670.8251953125, "learning_rate": 4.96848082382946e-05, "loss": 13.039, "step": 11550 }, { "epoch": 3.6992000000000003, "grad_norm": 44.61491012573242, "learning_rate": 4.963668735864492e-05, "loss": 6.3408, "step": 11560 }, { "epoch": 3.7024, "grad_norm": 64.76012420654297, "learning_rate": 4.958856647899524e-05, "loss": 3.7327, "step": 11570 }, { "epoch": 3.7056, "grad_norm": 12.21462345123291, "learning_rate": 4.954044559934556e-05, "loss": 5.1006, "step": 11580 }, { "epoch": 3.7088, "grad_norm": 240.92086791992188, "learning_rate": 4.9492324719695875e-05, "loss": 7.1545, "step": 11590 }, { "epoch": 3.7119999999999997, "grad_norm": 328.66839599609375, "learning_rate": 4.94442038400462e-05, "loss": 8.6411, "step": 11600 }, { "epoch": 3.7152, "grad_norm": 22.784469604492188, "learning_rate": 4.939608296039652e-05, "loss": 10.7476, "step": 11610 }, { "epoch": 3.7184, "grad_norm": 43.62549591064453, "learning_rate": 4.934796208074684e-05, "loss": 6.5361, "step": 11620 }, { "epoch": 3.7216, "grad_norm": 34.11656188964844, "learning_rate": 4.929984120109716e-05, "loss": 4.4015, "step": 11630 }, { "epoch": 3.7248, "grad_norm": 149.90037536621094, "learning_rate": 4.925172032144748e-05, "loss": 3.7327, "step": 11640 }, { "epoch": 3.7279999999999998, "grad_norm": 207.9757843017578, "learning_rate": 4.92035994417978e-05, "loss": 5.6256, "step": 11650 }, { "epoch": 3.7312, "grad_norm": 123.30134582519531, "learning_rate": 4.9155478562148114e-05, "loss": 9.3971, "step": 11660 }, { "epoch": 3.7344, "grad_norm": 411.8901672363281, "learning_rate": 4.9107357682498436e-05, "loss": 9.9118, "step": 11670 }, { "epoch": 3.7376, "grad_norm": 149.00994873046875, "learning_rate": 4.905923680284876e-05, "loss": 3.9755, "step": 11680 }, { "epoch": 3.7408, "grad_norm": 173.2017364501953, "learning_rate": 4.901111592319908e-05, "loss": 4.2737, "step": 11690 }, { "epoch": 3.7439999999999998, "grad_norm": 173.57420349121094, "learning_rate": 4.89629950435494e-05, "loss": 4.2272, "step": 11700 }, { "epoch": 3.7472, "grad_norm": 84.92804718017578, "learning_rate": 4.891487416389972e-05, "loss": 15.6724, "step": 11710 }, { "epoch": 3.7504, "grad_norm": 180.1131134033203, "learning_rate": 4.8866753284250036e-05, "loss": 8.848, "step": 11720 }, { "epoch": 3.7536, "grad_norm": 519.44140625, "learning_rate": 4.881863240460036e-05, "loss": 16.7353, "step": 11730 }, { "epoch": 3.7568, "grad_norm": 154.0474395751953, "learning_rate": 4.877051152495068e-05, "loss": 8.8156, "step": 11740 }, { "epoch": 3.76, "grad_norm": 241.169921875, "learning_rate": 4.8722390645301e-05, "loss": 6.3165, "step": 11750 }, { "epoch": 3.7632, "grad_norm": 248.59170532226562, "learning_rate": 4.867426976565132e-05, "loss": 6.6341, "step": 11760 }, { "epoch": 3.7664, "grad_norm": 188.39158630371094, "learning_rate": 4.8626148886001636e-05, "loss": 14.5356, "step": 11770 }, { "epoch": 3.7696, "grad_norm": 260.808837890625, "learning_rate": 4.857802800635196e-05, "loss": 4.5516, "step": 11780 }, { "epoch": 3.7728, "grad_norm": 552.4074096679688, "learning_rate": 4.852990712670228e-05, "loss": 15.6111, "step": 11790 }, { "epoch": 3.776, "grad_norm": 243.57078552246094, "learning_rate": 4.84817862470526e-05, "loss": 8.1961, "step": 11800 }, { "epoch": 3.7792, "grad_norm": 162.5610809326172, "learning_rate": 4.843366536740292e-05, "loss": 9.514, "step": 11810 }, { "epoch": 3.7824, "grad_norm": 98.18606567382812, "learning_rate": 4.8385544487753236e-05, "loss": 11.3489, "step": 11820 }, { "epoch": 3.7856, "grad_norm": 68.80675506591797, "learning_rate": 4.833742360810356e-05, "loss": 14.565, "step": 11830 }, { "epoch": 3.7888, "grad_norm": 69.52263641357422, "learning_rate": 4.8289302728453875e-05, "loss": 7.2866, "step": 11840 }, { "epoch": 3.792, "grad_norm": 114.2996826171875, "learning_rate": 4.82411818488042e-05, "loss": 9.7929, "step": 11850 }, { "epoch": 3.7952, "grad_norm": 294.47686767578125, "learning_rate": 4.819306096915452e-05, "loss": 6.5934, "step": 11860 }, { "epoch": 3.7984, "grad_norm": 17.562715530395508, "learning_rate": 4.8144940089504836e-05, "loss": 17.7711, "step": 11870 }, { "epoch": 3.8016, "grad_norm": 394.4798889160156, "learning_rate": 4.809681920985516e-05, "loss": 7.3666, "step": 11880 }, { "epoch": 3.8048, "grad_norm": 72.10729217529297, "learning_rate": 4.8048698330205475e-05, "loss": 4.5319, "step": 11890 }, { "epoch": 3.808, "grad_norm": 159.09017944335938, "learning_rate": 4.80005774505558e-05, "loss": 5.5631, "step": 11900 }, { "epoch": 3.8112, "grad_norm": 90.21721649169922, "learning_rate": 4.795245657090612e-05, "loss": 9.6178, "step": 11910 }, { "epoch": 3.8144, "grad_norm": 43.22314453125, "learning_rate": 4.790433569125644e-05, "loss": 12.6967, "step": 11920 }, { "epoch": 3.8176, "grad_norm": 189.50782775878906, "learning_rate": 4.785621481160676e-05, "loss": 13.9557, "step": 11930 }, { "epoch": 3.8208, "grad_norm": 60.87796401977539, "learning_rate": 4.7808093931957075e-05, "loss": 4.2569, "step": 11940 }, { "epoch": 3.824, "grad_norm": 65.7997817993164, "learning_rate": 4.77599730523074e-05, "loss": 6.1646, "step": 11950 }, { "epoch": 3.8272, "grad_norm": 143.854248046875, "learning_rate": 4.771185217265772e-05, "loss": 16.4387, "step": 11960 }, { "epoch": 3.8304, "grad_norm": 277.49261474609375, "learning_rate": 4.766373129300804e-05, "loss": 5.0689, "step": 11970 }, { "epoch": 3.8336, "grad_norm": 332.9490051269531, "learning_rate": 4.761561041335836e-05, "loss": 10.1769, "step": 11980 }, { "epoch": 3.8368, "grad_norm": 42.79552459716797, "learning_rate": 4.756748953370868e-05, "loss": 5.4904, "step": 11990 }, { "epoch": 3.84, "grad_norm": 199.64788818359375, "learning_rate": 4.7519368654059e-05, "loss": 10.0774, "step": 12000 }, { "epoch": 3.8432, "grad_norm": 294.695068359375, "learning_rate": 4.7471247774409314e-05, "loss": 16.7321, "step": 12010 }, { "epoch": 3.8464, "grad_norm": 354.9795837402344, "learning_rate": 4.7423126894759636e-05, "loss": 5.2691, "step": 12020 }, { "epoch": 3.8496, "grad_norm": 188.9573974609375, "learning_rate": 4.737500601510996e-05, "loss": 3.613, "step": 12030 }, { "epoch": 3.8528000000000002, "grad_norm": 96.00435638427734, "learning_rate": 4.732688513546028e-05, "loss": 7.6285, "step": 12040 }, { "epoch": 3.856, "grad_norm": 132.21182250976562, "learning_rate": 4.72787642558106e-05, "loss": 4.3558, "step": 12050 }, { "epoch": 3.8592, "grad_norm": 212.25498962402344, "learning_rate": 4.7230643376160914e-05, "loss": 13.4022, "step": 12060 }, { "epoch": 3.8624, "grad_norm": 247.55589294433594, "learning_rate": 4.7182522496511236e-05, "loss": 6.3433, "step": 12070 }, { "epoch": 3.8656, "grad_norm": 211.2296600341797, "learning_rate": 4.713440161686156e-05, "loss": 9.1397, "step": 12080 }, { "epoch": 3.8688000000000002, "grad_norm": 284.04241943359375, "learning_rate": 4.708628073721188e-05, "loss": 5.6531, "step": 12090 }, { "epoch": 3.872, "grad_norm": 50.027225494384766, "learning_rate": 4.70381598575622e-05, "loss": 6.1604, "step": 12100 }, { "epoch": 3.8752, "grad_norm": 6.891795635223389, "learning_rate": 4.699003897791252e-05, "loss": 7.4541, "step": 12110 }, { "epoch": 3.8784, "grad_norm": 342.19976806640625, "learning_rate": 4.6941918098262837e-05, "loss": 4.5675, "step": 12120 }, { "epoch": 3.8816, "grad_norm": 60.52157211303711, "learning_rate": 4.689379721861315e-05, "loss": 6.8015, "step": 12130 }, { "epoch": 3.8848000000000003, "grad_norm": 12.727437019348145, "learning_rate": 4.684567633896348e-05, "loss": 10.297, "step": 12140 }, { "epoch": 3.888, "grad_norm": 202.27330017089844, "learning_rate": 4.67975554593138e-05, "loss": 7.2501, "step": 12150 }, { "epoch": 3.8912, "grad_norm": 152.308349609375, "learning_rate": 4.674943457966412e-05, "loss": 6.7754, "step": 12160 }, { "epoch": 3.8944, "grad_norm": 482.5093994140625, "learning_rate": 4.6701313700014437e-05, "loss": 4.6819, "step": 12170 }, { "epoch": 3.8975999999999997, "grad_norm": 178.71385192871094, "learning_rate": 4.665319282036476e-05, "loss": 3.4214, "step": 12180 }, { "epoch": 3.9008000000000003, "grad_norm": 23.884092330932617, "learning_rate": 4.6605071940715075e-05, "loss": 8.208, "step": 12190 }, { "epoch": 3.904, "grad_norm": 174.35052490234375, "learning_rate": 4.65569510610654e-05, "loss": 4.5435, "step": 12200 }, { "epoch": 3.9072, "grad_norm": 423.03570556640625, "learning_rate": 4.650883018141572e-05, "loss": 15.2718, "step": 12210 }, { "epoch": 3.9104, "grad_norm": 102.40347290039062, "learning_rate": 4.6460709301766037e-05, "loss": 9.9615, "step": 12220 }, { "epoch": 3.9135999999999997, "grad_norm": 179.13671875, "learning_rate": 4.641258842211636e-05, "loss": 8.1626, "step": 12230 }, { "epoch": 3.9168, "grad_norm": 107.88333892822266, "learning_rate": 4.6364467542466675e-05, "loss": 4.9037, "step": 12240 }, { "epoch": 3.92, "grad_norm": 61.00058364868164, "learning_rate": 4.6316346662817e-05, "loss": 7.4888, "step": 12250 }, { "epoch": 3.9232, "grad_norm": 31.088891983032227, "learning_rate": 4.626822578316732e-05, "loss": 10.0471, "step": 12260 }, { "epoch": 3.9264, "grad_norm": 333.03424072265625, "learning_rate": 4.6220104903517637e-05, "loss": 6.7883, "step": 12270 }, { "epoch": 3.9295999999999998, "grad_norm": 224.67445373535156, "learning_rate": 4.617198402386796e-05, "loss": 7.6882, "step": 12280 }, { "epoch": 3.9328, "grad_norm": 28.072439193725586, "learning_rate": 4.6123863144218275e-05, "loss": 5.1695, "step": 12290 }, { "epoch": 3.936, "grad_norm": 37.69227600097656, "learning_rate": 4.60757422645686e-05, "loss": 2.9096, "step": 12300 }, { "epoch": 3.9392, "grad_norm": 354.0428161621094, "learning_rate": 4.602762138491892e-05, "loss": 12.1898, "step": 12310 }, { "epoch": 3.9424, "grad_norm": 194.1611328125, "learning_rate": 4.5979500505269243e-05, "loss": 9.1539, "step": 12320 }, { "epoch": 3.9455999999999998, "grad_norm": 104.0475845336914, "learning_rate": 4.593137962561956e-05, "loss": 10.9145, "step": 12330 }, { "epoch": 3.9488, "grad_norm": 95.49259948730469, "learning_rate": 4.5883258745969875e-05, "loss": 7.8727, "step": 12340 }, { "epoch": 3.952, "grad_norm": 241.5102081298828, "learning_rate": 4.58351378663202e-05, "loss": 8.9394, "step": 12350 }, { "epoch": 3.9552, "grad_norm": 27.74616050720215, "learning_rate": 4.5787016986670514e-05, "loss": 11.4749, "step": 12360 }, { "epoch": 3.9584, "grad_norm": 21.461673736572266, "learning_rate": 4.573889610702084e-05, "loss": 7.4508, "step": 12370 }, { "epoch": 3.9616, "grad_norm": 109.84790802001953, "learning_rate": 4.569077522737116e-05, "loss": 8.5757, "step": 12380 }, { "epoch": 3.9648, "grad_norm": 40.0219612121582, "learning_rate": 4.564265434772148e-05, "loss": 9.5256, "step": 12390 }, { "epoch": 3.968, "grad_norm": 203.36526489257812, "learning_rate": 4.55945334680718e-05, "loss": 3.3688, "step": 12400 }, { "epoch": 3.9712, "grad_norm": 96.9083023071289, "learning_rate": 4.5546412588422114e-05, "loss": 13.2995, "step": 12410 }, { "epoch": 3.9744, "grad_norm": 445.0245361328125, "learning_rate": 4.549829170877244e-05, "loss": 7.9941, "step": 12420 }, { "epoch": 3.9776, "grad_norm": 316.97528076171875, "learning_rate": 4.545017082912276e-05, "loss": 7.6404, "step": 12430 }, { "epoch": 3.9808, "grad_norm": 32.34640121459961, "learning_rate": 4.540204994947308e-05, "loss": 9.0225, "step": 12440 }, { "epoch": 3.984, "grad_norm": 31.805898666381836, "learning_rate": 4.53539290698234e-05, "loss": 8.1072, "step": 12450 }, { "epoch": 3.9872, "grad_norm": 255.33908081054688, "learning_rate": 4.530580819017372e-05, "loss": 7.07, "step": 12460 }, { "epoch": 3.9904, "grad_norm": 39.463809967041016, "learning_rate": 4.525768731052404e-05, "loss": 7.4113, "step": 12470 }, { "epoch": 3.9936, "grad_norm": 99.33548736572266, "learning_rate": 4.520956643087436e-05, "loss": 6.1012, "step": 12480 }, { "epoch": 3.9968, "grad_norm": 262.13861083984375, "learning_rate": 4.516144555122468e-05, "loss": 3.2721, "step": 12490 }, { "epoch": 4.0, "grad_norm": 201.76150512695312, "learning_rate": 4.5113324671575e-05, "loss": 4.0394, "step": 12500 }, { "epoch": 4.0032, "grad_norm": 219.48187255859375, "learning_rate": 4.506520379192532e-05, "loss": 14.5706, "step": 12510 }, { "epoch": 4.0064, "grad_norm": 86.71796417236328, "learning_rate": 4.501708291227564e-05, "loss": 6.7415, "step": 12520 }, { "epoch": 4.0096, "grad_norm": 277.3252868652344, "learning_rate": 4.496896203262596e-05, "loss": 4.9949, "step": 12530 }, { "epoch": 4.0128, "grad_norm": 44.04762268066406, "learning_rate": 4.4920841152976275e-05, "loss": 3.9624, "step": 12540 }, { "epoch": 4.016, "grad_norm": 17.924755096435547, "learning_rate": 4.48727202733266e-05, "loss": 3.2712, "step": 12550 }, { "epoch": 4.0192, "grad_norm": 312.2597351074219, "learning_rate": 4.482459939367692e-05, "loss": 14.5068, "step": 12560 }, { "epoch": 4.0224, "grad_norm": 192.3851776123047, "learning_rate": 4.477647851402724e-05, "loss": 8.9248, "step": 12570 }, { "epoch": 4.0256, "grad_norm": 342.3024597167969, "learning_rate": 4.472835763437756e-05, "loss": 16.6702, "step": 12580 }, { "epoch": 4.0288, "grad_norm": 9.51805591583252, "learning_rate": 4.4680236754727875e-05, "loss": 6.7774, "step": 12590 }, { "epoch": 4.032, "grad_norm": 438.2104797363281, "learning_rate": 4.46321158750782e-05, "loss": 11.5915, "step": 12600 }, { "epoch": 4.0352, "grad_norm": 77.5681381225586, "learning_rate": 4.458399499542852e-05, "loss": 10.4559, "step": 12610 }, { "epoch": 4.0384, "grad_norm": 163.0925750732422, "learning_rate": 4.453587411577884e-05, "loss": 6.7243, "step": 12620 }, { "epoch": 4.0416, "grad_norm": 389.1625061035156, "learning_rate": 4.448775323612916e-05, "loss": 14.9318, "step": 12630 }, { "epoch": 4.0448, "grad_norm": 55.324527740478516, "learning_rate": 4.4439632356479475e-05, "loss": 5.5316, "step": 12640 }, { "epoch": 4.048, "grad_norm": 168.31138610839844, "learning_rate": 4.43915114768298e-05, "loss": 8.4634, "step": 12650 }, { "epoch": 4.0512, "grad_norm": 118.34691619873047, "learning_rate": 4.434339059718012e-05, "loss": 8.4052, "step": 12660 }, { "epoch": 4.0544, "grad_norm": 541.5223999023438, "learning_rate": 4.4295269717530444e-05, "loss": 10.4972, "step": 12670 }, { "epoch": 4.0576, "grad_norm": 388.40521240234375, "learning_rate": 4.424714883788076e-05, "loss": 10.2888, "step": 12680 }, { "epoch": 4.0608, "grad_norm": 130.67112731933594, "learning_rate": 4.4199027958231075e-05, "loss": 5.858, "step": 12690 }, { "epoch": 4.064, "grad_norm": 155.96279907226562, "learning_rate": 4.41509070785814e-05, "loss": 4.5343, "step": 12700 }, { "epoch": 4.0672, "grad_norm": 737.684814453125, "learning_rate": 4.4102786198931714e-05, "loss": 8.6321, "step": 12710 }, { "epoch": 4.0704, "grad_norm": 200.27117919921875, "learning_rate": 4.405466531928204e-05, "loss": 7.9248, "step": 12720 }, { "epoch": 4.0736, "grad_norm": 37.11867904663086, "learning_rate": 4.400654443963236e-05, "loss": 11.421, "step": 12730 }, { "epoch": 4.0768, "grad_norm": 236.78366088867188, "learning_rate": 4.395842355998268e-05, "loss": 9.5104, "step": 12740 }, { "epoch": 4.08, "grad_norm": 563.0734252929688, "learning_rate": 4.3910302680333e-05, "loss": 14.7617, "step": 12750 }, { "epoch": 4.0832, "grad_norm": 53.46348190307617, "learning_rate": 4.3862181800683314e-05, "loss": 7.9012, "step": 12760 }, { "epoch": 4.0864, "grad_norm": 150.35806274414062, "learning_rate": 4.381406092103364e-05, "loss": 7.0907, "step": 12770 }, { "epoch": 4.0896, "grad_norm": 106.21488952636719, "learning_rate": 4.376594004138396e-05, "loss": 7.4574, "step": 12780 }, { "epoch": 4.0928, "grad_norm": 147.40090942382812, "learning_rate": 4.371781916173428e-05, "loss": 7.6415, "step": 12790 }, { "epoch": 4.096, "grad_norm": 220.6534881591797, "learning_rate": 4.36696982820846e-05, "loss": 9.2389, "step": 12800 }, { "epoch": 4.0992, "grad_norm": 71.61075592041016, "learning_rate": 4.362157740243492e-05, "loss": 3.7425, "step": 12810 }, { "epoch": 4.1024, "grad_norm": 134.68701171875, "learning_rate": 4.357345652278524e-05, "loss": 2.014, "step": 12820 }, { "epoch": 4.1056, "grad_norm": 251.7549285888672, "learning_rate": 4.352533564313556e-05, "loss": 7.9257, "step": 12830 }, { "epoch": 4.1088, "grad_norm": 360.9670104980469, "learning_rate": 4.347721476348588e-05, "loss": 7.5368, "step": 12840 }, { "epoch": 4.112, "grad_norm": 387.5711975097656, "learning_rate": 4.34290938838362e-05, "loss": 6.8107, "step": 12850 }, { "epoch": 4.1152, "grad_norm": 46.75333786010742, "learning_rate": 4.338097300418652e-05, "loss": 6.4655, "step": 12860 }, { "epoch": 4.1184, "grad_norm": 9.210946083068848, "learning_rate": 4.333285212453684e-05, "loss": 4.3271, "step": 12870 }, { "epoch": 4.1216, "grad_norm": 186.88442993164062, "learning_rate": 4.328473124488716e-05, "loss": 4.7776, "step": 12880 }, { "epoch": 4.1248, "grad_norm": 168.9029083251953, "learning_rate": 4.3236610365237476e-05, "loss": 7.5121, "step": 12890 }, { "epoch": 4.128, "grad_norm": 108.74742126464844, "learning_rate": 4.31884894855878e-05, "loss": 8.7816, "step": 12900 }, { "epoch": 4.1312, "grad_norm": 112.93949890136719, "learning_rate": 4.314036860593812e-05, "loss": 12.4958, "step": 12910 }, { "epoch": 4.1344, "grad_norm": 62.16154098510742, "learning_rate": 4.309224772628844e-05, "loss": 11.5903, "step": 12920 }, { "epoch": 4.1376, "grad_norm": 651.5052490234375, "learning_rate": 4.304412684663876e-05, "loss": 14.6368, "step": 12930 }, { "epoch": 4.1408, "grad_norm": 292.4139099121094, "learning_rate": 4.2996005966989076e-05, "loss": 3.8382, "step": 12940 }, { "epoch": 4.144, "grad_norm": 103.50984954833984, "learning_rate": 4.29478850873394e-05, "loss": 8.6125, "step": 12950 }, { "epoch": 4.1472, "grad_norm": 93.3908920288086, "learning_rate": 4.289976420768972e-05, "loss": 13.0246, "step": 12960 }, { "epoch": 4.1504, "grad_norm": 24.3245792388916, "learning_rate": 4.285164332804004e-05, "loss": 8.5953, "step": 12970 }, { "epoch": 4.1536, "grad_norm": 166.63409423828125, "learning_rate": 4.280352244839036e-05, "loss": 4.2516, "step": 12980 }, { "epoch": 4.1568, "grad_norm": 43.23161315917969, "learning_rate": 4.2755401568740676e-05, "loss": 22.4351, "step": 12990 }, { "epoch": 4.16, "grad_norm": 82.63404083251953, "learning_rate": 4.2707280689091e-05, "loss": 10.9908, "step": 13000 }, { "epoch": 4.1632, "grad_norm": 173.07992553710938, "learning_rate": 4.265915980944132e-05, "loss": 5.5241, "step": 13010 }, { "epoch": 4.1664, "grad_norm": 50.02283477783203, "learning_rate": 4.2611038929791644e-05, "loss": 6.9723, "step": 13020 }, { "epoch": 4.1696, "grad_norm": 193.55470275878906, "learning_rate": 4.256291805014196e-05, "loss": 5.6653, "step": 13030 }, { "epoch": 4.1728, "grad_norm": 34.09096908569336, "learning_rate": 4.2514797170492276e-05, "loss": 12.0383, "step": 13040 }, { "epoch": 4.176, "grad_norm": 155.21815490722656, "learning_rate": 4.24666762908426e-05, "loss": 8.282, "step": 13050 }, { "epoch": 4.1792, "grad_norm": 141.95018005371094, "learning_rate": 4.2418555411192914e-05, "loss": 5.8353, "step": 13060 }, { "epoch": 4.1824, "grad_norm": 28.69383430480957, "learning_rate": 4.2370434531543244e-05, "loss": 11.6757, "step": 13070 }, { "epoch": 4.1856, "grad_norm": 379.13037109375, "learning_rate": 4.232231365189356e-05, "loss": 11.8019, "step": 13080 }, { "epoch": 4.1888, "grad_norm": 101.4684829711914, "learning_rate": 4.227419277224388e-05, "loss": 2.9229, "step": 13090 }, { "epoch": 4.192, "grad_norm": 148.60357666015625, "learning_rate": 4.22260718925942e-05, "loss": 8.8491, "step": 13100 }, { "epoch": 4.1952, "grad_norm": 79.58724975585938, "learning_rate": 4.2177951012944514e-05, "loss": 3.9053, "step": 13110 }, { "epoch": 4.1984, "grad_norm": 280.4392395019531, "learning_rate": 4.212983013329484e-05, "loss": 6.0691, "step": 13120 }, { "epoch": 4.2016, "grad_norm": 17.856550216674805, "learning_rate": 4.208170925364516e-05, "loss": 3.3693, "step": 13130 }, { "epoch": 4.2048, "grad_norm": 303.6913146972656, "learning_rate": 4.203358837399548e-05, "loss": 6.093, "step": 13140 }, { "epoch": 4.208, "grad_norm": 113.54812622070312, "learning_rate": 4.19854674943458e-05, "loss": 5.526, "step": 13150 }, { "epoch": 4.2112, "grad_norm": 207.98643493652344, "learning_rate": 4.193734661469612e-05, "loss": 5.3908, "step": 13160 }, { "epoch": 4.2144, "grad_norm": 290.43157958984375, "learning_rate": 4.188922573504644e-05, "loss": 12.4042, "step": 13170 }, { "epoch": 4.2176, "grad_norm": 29.755868911743164, "learning_rate": 4.184110485539676e-05, "loss": 3.9674, "step": 13180 }, { "epoch": 4.2208, "grad_norm": 261.7047424316406, "learning_rate": 4.179298397574708e-05, "loss": 10.5314, "step": 13190 }, { "epoch": 4.224, "grad_norm": 56.67449188232422, "learning_rate": 4.17448630960974e-05, "loss": 8.2155, "step": 13200 }, { "epoch": 4.2272, "grad_norm": 15.337433815002441, "learning_rate": 4.169674221644772e-05, "loss": 10.4281, "step": 13210 }, { "epoch": 4.2304, "grad_norm": 19.147184371948242, "learning_rate": 4.164862133679804e-05, "loss": 8.1588, "step": 13220 }, { "epoch": 4.2336, "grad_norm": 387.998779296875, "learning_rate": 4.160050045714836e-05, "loss": 5.3021, "step": 13230 }, { "epoch": 4.2368, "grad_norm": 192.4482421875, "learning_rate": 4.1552379577498676e-05, "loss": 6.8171, "step": 13240 }, { "epoch": 4.24, "grad_norm": 145.72079467773438, "learning_rate": 4.1504258697849e-05, "loss": 4.3195, "step": 13250 }, { "epoch": 4.2432, "grad_norm": 131.11184692382812, "learning_rate": 4.145613781819932e-05, "loss": 2.4877, "step": 13260 }, { "epoch": 4.2464, "grad_norm": 109.23413848876953, "learning_rate": 4.140801693854964e-05, "loss": 5.0755, "step": 13270 }, { "epoch": 4.2496, "grad_norm": 266.8868408203125, "learning_rate": 4.135989605889996e-05, "loss": 8.1896, "step": 13280 }, { "epoch": 4.2528, "grad_norm": 57.84616470336914, "learning_rate": 4.1311775179250276e-05, "loss": 4.6097, "step": 13290 }, { "epoch": 4.256, "grad_norm": 133.28659057617188, "learning_rate": 4.12636542996006e-05, "loss": 11.7546, "step": 13300 }, { "epoch": 4.2592, "grad_norm": 341.3699951171875, "learning_rate": 4.121553341995092e-05, "loss": 5.6101, "step": 13310 }, { "epoch": 4.2624, "grad_norm": 194.50201416015625, "learning_rate": 4.116741254030124e-05, "loss": 7.7583, "step": 13320 }, { "epoch": 4.2656, "grad_norm": 220.2288360595703, "learning_rate": 4.111929166065156e-05, "loss": 5.9201, "step": 13330 }, { "epoch": 4.2688, "grad_norm": 150.18211364746094, "learning_rate": 4.1071170781001876e-05, "loss": 11.1982, "step": 13340 }, { "epoch": 4.272, "grad_norm": 110.22770690917969, "learning_rate": 4.10230499013522e-05, "loss": 6.6394, "step": 13350 }, { "epoch": 4.2752, "grad_norm": 36.75579071044922, "learning_rate": 4.097492902170252e-05, "loss": 14.3122, "step": 13360 }, { "epoch": 4.2783999999999995, "grad_norm": 32.37786865234375, "learning_rate": 4.092680814205284e-05, "loss": 7.2745, "step": 13370 }, { "epoch": 4.2816, "grad_norm": 73.69268798828125, "learning_rate": 4.087868726240316e-05, "loss": 6.0325, "step": 13380 }, { "epoch": 4.2848, "grad_norm": 1.6859294176101685, "learning_rate": 4.0830566382753476e-05, "loss": 7.0528, "step": 13390 }, { "epoch": 4.288, "grad_norm": 233.9347381591797, "learning_rate": 4.07824455031038e-05, "loss": 10.5471, "step": 13400 }, { "epoch": 4.2912, "grad_norm": 239.86483764648438, "learning_rate": 4.0734324623454115e-05, "loss": 3.968, "step": 13410 }, { "epoch": 4.2943999999999996, "grad_norm": 216.93885803222656, "learning_rate": 4.0686203743804444e-05, "loss": 9.2287, "step": 13420 }, { "epoch": 4.2976, "grad_norm": 2.2190887928009033, "learning_rate": 4.063808286415476e-05, "loss": 5.3801, "step": 13430 }, { "epoch": 4.3008, "grad_norm": 173.89349365234375, "learning_rate": 4.0589961984505076e-05, "loss": 9.0231, "step": 13440 }, { "epoch": 4.304, "grad_norm": 305.8917236328125, "learning_rate": 4.05418411048554e-05, "loss": 4.5123, "step": 13450 }, { "epoch": 4.3072, "grad_norm": 476.81927490234375, "learning_rate": 4.0493720225205715e-05, "loss": 12.9005, "step": 13460 }, { "epoch": 4.3104, "grad_norm": 23.600570678710938, "learning_rate": 4.044559934555604e-05, "loss": 5.57, "step": 13470 }, { "epoch": 4.3136, "grad_norm": 89.55452728271484, "learning_rate": 4.039747846590636e-05, "loss": 4.3115, "step": 13480 }, { "epoch": 4.3168, "grad_norm": 40.7599983215332, "learning_rate": 4.034935758625668e-05, "loss": 7.0678, "step": 13490 }, { "epoch": 4.32, "grad_norm": 53.80952835083008, "learning_rate": 4.0301236706607e-05, "loss": 13.1652, "step": 13500 }, { "epoch": 4.3232, "grad_norm": 146.87252807617188, "learning_rate": 4.0253115826957315e-05, "loss": 7.537, "step": 13510 }, { "epoch": 4.3264, "grad_norm": 184.13804626464844, "learning_rate": 4.020499494730764e-05, "loss": 4.9385, "step": 13520 }, { "epoch": 4.3296, "grad_norm": 253.9748077392578, "learning_rate": 4.015687406765796e-05, "loss": 4.8251, "step": 13530 }, { "epoch": 4.3328, "grad_norm": 75.63909912109375, "learning_rate": 4.010875318800828e-05, "loss": 4.3444, "step": 13540 }, { "epoch": 4.336, "grad_norm": 5.252464771270752, "learning_rate": 4.00606323083586e-05, "loss": 9.6419, "step": 13550 }, { "epoch": 4.3392, "grad_norm": 212.344482421875, "learning_rate": 4.001251142870892e-05, "loss": 8.6965, "step": 13560 }, { "epoch": 4.3424, "grad_norm": 285.918212890625, "learning_rate": 3.996439054905924e-05, "loss": 6.7412, "step": 13570 }, { "epoch": 4.3456, "grad_norm": 371.56781005859375, "learning_rate": 3.991626966940955e-05, "loss": 9.2269, "step": 13580 }, { "epoch": 4.3488, "grad_norm": 387.47894287109375, "learning_rate": 3.9868148789759876e-05, "loss": 9.4567, "step": 13590 }, { "epoch": 4.352, "grad_norm": 196.92633056640625, "learning_rate": 3.98200279101102e-05, "loss": 5.5254, "step": 13600 }, { "epoch": 4.3552, "grad_norm": 523.7937622070312, "learning_rate": 3.977190703046052e-05, "loss": 10.9692, "step": 13610 }, { "epoch": 4.3584, "grad_norm": 113.77444458007812, "learning_rate": 3.972378615081084e-05, "loss": 8.478, "step": 13620 }, { "epoch": 4.3616, "grad_norm": 164.98135375976562, "learning_rate": 3.967566527116116e-05, "loss": 5.4628, "step": 13630 }, { "epoch": 4.3648, "grad_norm": 680.1068115234375, "learning_rate": 3.9627544391511476e-05, "loss": 9.6591, "step": 13640 }, { "epoch": 4.368, "grad_norm": 233.43727111816406, "learning_rate": 3.95794235118618e-05, "loss": 7.0272, "step": 13650 }, { "epoch": 4.3712, "grad_norm": 35.061988830566406, "learning_rate": 3.953130263221212e-05, "loss": 6.9641, "step": 13660 }, { "epoch": 4.3744, "grad_norm": 152.36106872558594, "learning_rate": 3.948318175256244e-05, "loss": 10.0238, "step": 13670 }, { "epoch": 4.3776, "grad_norm": 108.11162567138672, "learning_rate": 3.943506087291276e-05, "loss": 10.0065, "step": 13680 }, { "epoch": 4.3808, "grad_norm": 420.056640625, "learning_rate": 3.9386939993263076e-05, "loss": 16.2647, "step": 13690 }, { "epoch": 4.384, "grad_norm": 118.71830749511719, "learning_rate": 3.93388191136134e-05, "loss": 8.4024, "step": 13700 }, { "epoch": 4.3872, "grad_norm": 169.36265563964844, "learning_rate": 3.929069823396372e-05, "loss": 6.7452, "step": 13710 }, { "epoch": 4.3904, "grad_norm": 137.5408935546875, "learning_rate": 3.924257735431404e-05, "loss": 8.2784, "step": 13720 }, { "epoch": 4.3936, "grad_norm": 129.11793518066406, "learning_rate": 3.919445647466436e-05, "loss": 6.3966, "step": 13730 }, { "epoch": 4.3968, "grad_norm": 80.49801635742188, "learning_rate": 3.9146335595014676e-05, "loss": 8.3062, "step": 13740 }, { "epoch": 4.4, "grad_norm": 298.2650146484375, "learning_rate": 3.9098214715365e-05, "loss": 11.8754, "step": 13750 }, { "epoch": 4.4032, "grad_norm": 16.916446685791016, "learning_rate": 3.9050093835715315e-05, "loss": 5.9079, "step": 13760 }, { "epoch": 4.4064, "grad_norm": 160.86131286621094, "learning_rate": 3.9001972956065644e-05, "loss": 3.6422, "step": 13770 }, { "epoch": 4.4096, "grad_norm": 328.84552001953125, "learning_rate": 3.895385207641596e-05, "loss": 8.1059, "step": 13780 }, { "epoch": 4.4128, "grad_norm": 500.2921447753906, "learning_rate": 3.8905731196766276e-05, "loss": 9.1363, "step": 13790 }, { "epoch": 4.416, "grad_norm": 57.35462951660156, "learning_rate": 3.88576103171166e-05, "loss": 4.9691, "step": 13800 }, { "epoch": 4.4192, "grad_norm": 310.38055419921875, "learning_rate": 3.8809489437466915e-05, "loss": 5.9616, "step": 13810 }, { "epoch": 4.4224, "grad_norm": 33.3964958190918, "learning_rate": 3.876136855781724e-05, "loss": 7.361, "step": 13820 }, { "epoch": 4.4256, "grad_norm": 282.300048828125, "learning_rate": 3.871324767816756e-05, "loss": 8.5378, "step": 13830 }, { "epoch": 4.4288, "grad_norm": 180.30618286132812, "learning_rate": 3.866512679851788e-05, "loss": 9.9501, "step": 13840 }, { "epoch": 4.432, "grad_norm": 47.000694274902344, "learning_rate": 3.86170059188682e-05, "loss": 9.8571, "step": 13850 }, { "epoch": 4.4352, "grad_norm": 65.16790008544922, "learning_rate": 3.8568885039218515e-05, "loss": 10.8796, "step": 13860 }, { "epoch": 4.4384, "grad_norm": 98.3447494506836, "learning_rate": 3.852076415956884e-05, "loss": 9.202, "step": 13870 }, { "epoch": 4.4416, "grad_norm": 179.40518188476562, "learning_rate": 3.847264327991916e-05, "loss": 4.4862, "step": 13880 }, { "epoch": 4.4448, "grad_norm": 76.96600341796875, "learning_rate": 3.842452240026948e-05, "loss": 4.0793, "step": 13890 }, { "epoch": 4.448, "grad_norm": 62.65483856201172, "learning_rate": 3.83764015206198e-05, "loss": 10.0258, "step": 13900 }, { "epoch": 4.4512, "grad_norm": 133.31536865234375, "learning_rate": 3.832828064097012e-05, "loss": 4.3569, "step": 13910 }, { "epoch": 4.4544, "grad_norm": 438.76055908203125, "learning_rate": 3.828015976132044e-05, "loss": 6.3647, "step": 13920 }, { "epoch": 4.4576, "grad_norm": 129.13429260253906, "learning_rate": 3.8232038881670753e-05, "loss": 6.9756, "step": 13930 }, { "epoch": 4.4608, "grad_norm": 314.2818908691406, "learning_rate": 3.818391800202108e-05, "loss": 9.676, "step": 13940 }, { "epoch": 4.464, "grad_norm": 412.3567810058594, "learning_rate": 3.81357971223714e-05, "loss": 5.9988, "step": 13950 }, { "epoch": 4.4672, "grad_norm": 30.61244773864746, "learning_rate": 3.808767624272172e-05, "loss": 2.6265, "step": 13960 }, { "epoch": 4.4704, "grad_norm": 355.9007568359375, "learning_rate": 3.803955536307204e-05, "loss": 12.762, "step": 13970 }, { "epoch": 4.4736, "grad_norm": 177.19375610351562, "learning_rate": 3.799143448342236e-05, "loss": 5.0347, "step": 13980 }, { "epoch": 4.4768, "grad_norm": 125.07095336914062, "learning_rate": 3.7943313603772676e-05, "loss": 5.1654, "step": 13990 }, { "epoch": 4.48, "grad_norm": 322.7609558105469, "learning_rate": 3.7895192724123e-05, "loss": 5.1124, "step": 14000 }, { "epoch": 4.4832, "grad_norm": 28.07305145263672, "learning_rate": 3.784707184447332e-05, "loss": 8.616, "step": 14010 }, { "epoch": 4.4864, "grad_norm": 455.0904541015625, "learning_rate": 3.779895096482364e-05, "loss": 9.7516, "step": 14020 }, { "epoch": 4.4896, "grad_norm": 117.10187530517578, "learning_rate": 3.775083008517396e-05, "loss": 5.9951, "step": 14030 }, { "epoch": 4.4928, "grad_norm": 334.0278015136719, "learning_rate": 3.7702709205524276e-05, "loss": 8.8093, "step": 14040 }, { "epoch": 4.496, "grad_norm": 54.20822525024414, "learning_rate": 3.76545883258746e-05, "loss": 4.7542, "step": 14050 }, { "epoch": 4.4992, "grad_norm": 512.4501953125, "learning_rate": 3.760646744622492e-05, "loss": 11.4651, "step": 14060 }, { "epoch": 4.5024, "grad_norm": 122.72032165527344, "learning_rate": 3.755834656657524e-05, "loss": 3.751, "step": 14070 }, { "epoch": 4.5056, "grad_norm": 375.9181823730469, "learning_rate": 3.751022568692556e-05, "loss": 5.939, "step": 14080 }, { "epoch": 4.5088, "grad_norm": 147.89892578125, "learning_rate": 3.7462104807275876e-05, "loss": 7.6764, "step": 14090 }, { "epoch": 4.5120000000000005, "grad_norm": 50.360198974609375, "learning_rate": 3.74139839276262e-05, "loss": 9.7923, "step": 14100 }, { "epoch": 4.5152, "grad_norm": 460.80767822265625, "learning_rate": 3.7365863047976515e-05, "loss": 6.7647, "step": 14110 }, { "epoch": 4.5184, "grad_norm": 148.6534881591797, "learning_rate": 3.7317742168326844e-05, "loss": 6.2943, "step": 14120 }, { "epoch": 4.5216, "grad_norm": 80.185302734375, "learning_rate": 3.726962128867716e-05, "loss": 7.201, "step": 14130 }, { "epoch": 4.5248, "grad_norm": 216.1837615966797, "learning_rate": 3.7221500409027476e-05, "loss": 8.0806, "step": 14140 }, { "epoch": 4.5280000000000005, "grad_norm": 128.46861267089844, "learning_rate": 3.71733795293778e-05, "loss": 5.039, "step": 14150 }, { "epoch": 4.5312, "grad_norm": 22.085586547851562, "learning_rate": 3.7125258649728115e-05, "loss": 2.4226, "step": 14160 }, { "epoch": 4.5344, "grad_norm": 107.95946502685547, "learning_rate": 3.707713777007844e-05, "loss": 5.8418, "step": 14170 }, { "epoch": 4.5376, "grad_norm": 122.21377563476562, "learning_rate": 3.702901689042876e-05, "loss": 3.9343, "step": 14180 }, { "epoch": 4.5408, "grad_norm": 223.5470733642578, "learning_rate": 3.698089601077908e-05, "loss": 6.3904, "step": 14190 }, { "epoch": 4.5440000000000005, "grad_norm": 835.0013427734375, "learning_rate": 3.69327751311294e-05, "loss": 9.0871, "step": 14200 }, { "epoch": 4.5472, "grad_norm": 236.5133819580078, "learning_rate": 3.6884654251479715e-05, "loss": 13.2097, "step": 14210 }, { "epoch": 4.5504, "grad_norm": 119.03888702392578, "learning_rate": 3.683653337183004e-05, "loss": 5.8527, "step": 14220 }, { "epoch": 4.5536, "grad_norm": 115.69915771484375, "learning_rate": 3.678841249218036e-05, "loss": 7.2131, "step": 14230 }, { "epoch": 4.5568, "grad_norm": 17.163619995117188, "learning_rate": 3.674029161253068e-05, "loss": 4.7103, "step": 14240 }, { "epoch": 4.5600000000000005, "grad_norm": 197.39285278320312, "learning_rate": 3.6692170732881e-05, "loss": 3.6521, "step": 14250 }, { "epoch": 4.5632, "grad_norm": 213.7352752685547, "learning_rate": 3.664404985323132e-05, "loss": 4.6944, "step": 14260 }, { "epoch": 4.5664, "grad_norm": 256.00408935546875, "learning_rate": 3.659592897358164e-05, "loss": 4.434, "step": 14270 }, { "epoch": 4.5696, "grad_norm": 274.5691833496094, "learning_rate": 3.6547808093931954e-05, "loss": 5.7881, "step": 14280 }, { "epoch": 4.5728, "grad_norm": 319.9366455078125, "learning_rate": 3.649968721428228e-05, "loss": 7.5774, "step": 14290 }, { "epoch": 4.576, "grad_norm": 157.98048400878906, "learning_rate": 3.64515663346326e-05, "loss": 9.713, "step": 14300 }, { "epoch": 4.5792, "grad_norm": 94.29521942138672, "learning_rate": 3.640344545498292e-05, "loss": 4.8292, "step": 14310 }, { "epoch": 4.5824, "grad_norm": 229.71499633789062, "learning_rate": 3.635532457533324e-05, "loss": 5.03, "step": 14320 }, { "epoch": 4.5856, "grad_norm": 178.13717651367188, "learning_rate": 3.630720369568356e-05, "loss": 6.285, "step": 14330 }, { "epoch": 4.5888, "grad_norm": 131.4956512451172, "learning_rate": 3.6259082816033876e-05, "loss": 4.6723, "step": 14340 }, { "epoch": 4.592, "grad_norm": 521.5806884765625, "learning_rate": 3.62109619363842e-05, "loss": 10.6425, "step": 14350 }, { "epoch": 4.5952, "grad_norm": 203.82778930664062, "learning_rate": 3.616284105673452e-05, "loss": 11.1627, "step": 14360 }, { "epoch": 4.5984, "grad_norm": 130.14695739746094, "learning_rate": 3.611472017708484e-05, "loss": 2.3538, "step": 14370 }, { "epoch": 4.6016, "grad_norm": 133.6646270751953, "learning_rate": 3.606659929743516e-05, "loss": 5.514, "step": 14380 }, { "epoch": 4.6048, "grad_norm": 139.2952423095703, "learning_rate": 3.6018478417785476e-05, "loss": 4.8333, "step": 14390 }, { "epoch": 4.608, "grad_norm": 48.05717468261719, "learning_rate": 3.59703575381358e-05, "loss": 8.0364, "step": 14400 }, { "epoch": 4.6112, "grad_norm": 59.98562240600586, "learning_rate": 3.592223665848612e-05, "loss": 3.8792, "step": 14410 }, { "epoch": 4.6144, "grad_norm": 185.75938415527344, "learning_rate": 3.587411577883644e-05, "loss": 6.7528, "step": 14420 }, { "epoch": 4.6176, "grad_norm": 96.98120880126953, "learning_rate": 3.582599489918676e-05, "loss": 6.6684, "step": 14430 }, { "epoch": 4.6208, "grad_norm": 277.8514404296875, "learning_rate": 3.5777874019537076e-05, "loss": 6.8233, "step": 14440 }, { "epoch": 4.624, "grad_norm": 126.861328125, "learning_rate": 3.57297531398874e-05, "loss": 11.0075, "step": 14450 }, { "epoch": 4.6272, "grad_norm": 4.527242183685303, "learning_rate": 3.5681632260237715e-05, "loss": 7.3927, "step": 14460 }, { "epoch": 4.6304, "grad_norm": 133.99301147460938, "learning_rate": 3.5633511380588045e-05, "loss": 4.6586, "step": 14470 }, { "epoch": 4.6336, "grad_norm": 159.46630859375, "learning_rate": 3.558539050093836e-05, "loss": 5.3181, "step": 14480 }, { "epoch": 4.6368, "grad_norm": 159.32662963867188, "learning_rate": 3.5537269621288677e-05, "loss": 7.3141, "step": 14490 }, { "epoch": 4.64, "grad_norm": 153.9884033203125, "learning_rate": 3.5489148741639e-05, "loss": 7.8734, "step": 14500 }, { "epoch": 4.6432, "grad_norm": 220.59725952148438, "learning_rate": 3.5441027861989315e-05, "loss": 5.19, "step": 14510 }, { "epoch": 4.6464, "grad_norm": 231.93405151367188, "learning_rate": 3.539290698233964e-05, "loss": 12.618, "step": 14520 }, { "epoch": 4.6495999999999995, "grad_norm": 262.6766357421875, "learning_rate": 3.534478610268996e-05, "loss": 8.8075, "step": 14530 }, { "epoch": 4.6528, "grad_norm": 264.6095886230469, "learning_rate": 3.5296665223040277e-05, "loss": 6.4145, "step": 14540 }, { "epoch": 4.656, "grad_norm": 249.39218139648438, "learning_rate": 3.52485443433906e-05, "loss": 5.1372, "step": 14550 }, { "epoch": 4.6592, "grad_norm": 361.8636779785156, "learning_rate": 3.5200423463740915e-05, "loss": 8.7648, "step": 14560 }, { "epoch": 4.6624, "grad_norm": 55.58082962036133, "learning_rate": 3.515230258409124e-05, "loss": 6.4507, "step": 14570 }, { "epoch": 4.6655999999999995, "grad_norm": 248.04177856445312, "learning_rate": 3.510418170444156e-05, "loss": 9.8757, "step": 14580 }, { "epoch": 4.6688, "grad_norm": 225.97276306152344, "learning_rate": 3.505606082479188e-05, "loss": 9.6997, "step": 14590 }, { "epoch": 4.672, "grad_norm": 74.68299102783203, "learning_rate": 3.50079399451422e-05, "loss": 9.7402, "step": 14600 }, { "epoch": 4.6752, "grad_norm": 96.77436065673828, "learning_rate": 3.4959819065492515e-05, "loss": 6.9416, "step": 14610 }, { "epoch": 4.6784, "grad_norm": 5.356565952301025, "learning_rate": 3.491169818584284e-05, "loss": 6.4418, "step": 14620 }, { "epoch": 4.6815999999999995, "grad_norm": 212.08383178710938, "learning_rate": 3.4863577306193154e-05, "loss": 4.468, "step": 14630 }, { "epoch": 4.6848, "grad_norm": 113.68955993652344, "learning_rate": 3.481545642654348e-05, "loss": 5.7937, "step": 14640 }, { "epoch": 4.688, "grad_norm": 31.04882049560547, "learning_rate": 3.47673355468938e-05, "loss": 8.4099, "step": 14650 }, { "epoch": 4.6912, "grad_norm": 52.2400016784668, "learning_rate": 3.471921466724412e-05, "loss": 8.0731, "step": 14660 }, { "epoch": 4.6944, "grad_norm": 63.41189956665039, "learning_rate": 3.467109378759444e-05, "loss": 3.78, "step": 14670 }, { "epoch": 4.6975999999999996, "grad_norm": 24.254396438598633, "learning_rate": 3.4622972907944754e-05, "loss": 6.303, "step": 14680 }, { "epoch": 4.7008, "grad_norm": 10.205245018005371, "learning_rate": 3.457485202829508e-05, "loss": 10.226, "step": 14690 }, { "epoch": 4.704, "grad_norm": 506.66888427734375, "learning_rate": 3.45267311486454e-05, "loss": 5.2381, "step": 14700 }, { "epoch": 4.7072, "grad_norm": 88.07456970214844, "learning_rate": 3.447861026899572e-05, "loss": 7.1336, "step": 14710 }, { "epoch": 4.7104, "grad_norm": 51.9757080078125, "learning_rate": 3.443048938934604e-05, "loss": 9.3062, "step": 14720 }, { "epoch": 4.7136, "grad_norm": 57.67353057861328, "learning_rate": 3.438236850969636e-05, "loss": 9.7328, "step": 14730 }, { "epoch": 4.7168, "grad_norm": 85.74404907226562, "learning_rate": 3.433424763004668e-05, "loss": 4.8255, "step": 14740 }, { "epoch": 4.72, "grad_norm": 322.3530578613281, "learning_rate": 3.4286126750397e-05, "loss": 9.7766, "step": 14750 }, { "epoch": 4.7232, "grad_norm": 43.43214416503906, "learning_rate": 3.423800587074732e-05, "loss": 6.9049, "step": 14760 }, { "epoch": 4.7264, "grad_norm": 101.64360809326172, "learning_rate": 3.418988499109764e-05, "loss": 5.1876, "step": 14770 }, { "epoch": 4.7296, "grad_norm": 5.733304977416992, "learning_rate": 3.414176411144796e-05, "loss": 4.8686, "step": 14780 }, { "epoch": 4.7328, "grad_norm": 62.27647018432617, "learning_rate": 3.409364323179828e-05, "loss": 9.8903, "step": 14790 }, { "epoch": 4.736, "grad_norm": 180.34808349609375, "learning_rate": 3.40455223521486e-05, "loss": 4.1282, "step": 14800 }, { "epoch": 4.7392, "grad_norm": 480.96051025390625, "learning_rate": 3.3997401472498915e-05, "loss": 5.0375, "step": 14810 }, { "epoch": 4.7424, "grad_norm": 100.28192138671875, "learning_rate": 3.394928059284924e-05, "loss": 5.7724, "step": 14820 }, { "epoch": 4.7456, "grad_norm": 103.45751190185547, "learning_rate": 3.390115971319956e-05, "loss": 2.8814, "step": 14830 }, { "epoch": 4.7488, "grad_norm": 62.27669143676758, "learning_rate": 3.385303883354988e-05, "loss": 10.955, "step": 14840 }, { "epoch": 4.752, "grad_norm": 453.1050720214844, "learning_rate": 3.38049179539002e-05, "loss": 11.9186, "step": 14850 }, { "epoch": 4.7552, "grad_norm": 96.87089538574219, "learning_rate": 3.3756797074250515e-05, "loss": 2.8798, "step": 14860 }, { "epoch": 4.7584, "grad_norm": 147.32489013671875, "learning_rate": 3.370867619460084e-05, "loss": 8.2673, "step": 14870 }, { "epoch": 4.7616, "grad_norm": 206.2452392578125, "learning_rate": 3.366055531495116e-05, "loss": 6.7432, "step": 14880 }, { "epoch": 4.7648, "grad_norm": 127.63615417480469, "learning_rate": 3.361243443530148e-05, "loss": 9.191, "step": 14890 }, { "epoch": 4.768, "grad_norm": 427.1709899902344, "learning_rate": 3.35643135556518e-05, "loss": 11.4983, "step": 14900 }, { "epoch": 4.7712, "grad_norm": 26.90435218811035, "learning_rate": 3.3516192676002115e-05, "loss": 5.1363, "step": 14910 }, { "epoch": 4.7744, "grad_norm": 148.94894409179688, "learning_rate": 3.346807179635244e-05, "loss": 7.5392, "step": 14920 }, { "epoch": 4.7776, "grad_norm": 10.521403312683105, "learning_rate": 3.341995091670276e-05, "loss": 5.3794, "step": 14930 }, { "epoch": 4.7808, "grad_norm": 62.37763595581055, "learning_rate": 3.3371830037053084e-05, "loss": 4.7531, "step": 14940 }, { "epoch": 4.784, "grad_norm": 28.024364471435547, "learning_rate": 3.33237091574034e-05, "loss": 6.1647, "step": 14950 }, { "epoch": 4.7872, "grad_norm": 51.651031494140625, "learning_rate": 3.3275588277753715e-05, "loss": 8.9008, "step": 14960 }, { "epoch": 4.7904, "grad_norm": 294.5870666503906, "learning_rate": 3.322746739810404e-05, "loss": 7.9682, "step": 14970 }, { "epoch": 4.7936, "grad_norm": 17.49493980407715, "learning_rate": 3.3179346518454354e-05, "loss": 10.4694, "step": 14980 }, { "epoch": 4.7968, "grad_norm": 270.0805358886719, "learning_rate": 3.3131225638804684e-05, "loss": 8.8056, "step": 14990 }, { "epoch": 4.8, "grad_norm": 411.599609375, "learning_rate": 3.3083104759155e-05, "loss": 8.4568, "step": 15000 }, { "epoch": 4.8032, "grad_norm": 78.6136474609375, "learning_rate": 3.303498387950532e-05, "loss": 6.0771, "step": 15010 }, { "epoch": 4.8064, "grad_norm": 263.6440734863281, "learning_rate": 3.298686299985564e-05, "loss": 7.1224, "step": 15020 }, { "epoch": 4.8096, "grad_norm": 20.406938552856445, "learning_rate": 3.2938742120205954e-05, "loss": 10.1784, "step": 15030 }, { "epoch": 4.8128, "grad_norm": 167.03245544433594, "learning_rate": 3.289062124055628e-05, "loss": 3.4428, "step": 15040 }, { "epoch": 4.816, "grad_norm": 78.43927001953125, "learning_rate": 3.28425003609066e-05, "loss": 4.9948, "step": 15050 }, { "epoch": 4.8192, "grad_norm": 51.499488830566406, "learning_rate": 3.279437948125692e-05, "loss": 6.0939, "step": 15060 }, { "epoch": 4.8224, "grad_norm": 134.2278289794922, "learning_rate": 3.274625860160724e-05, "loss": 11.6047, "step": 15070 }, { "epoch": 4.8256, "grad_norm": 105.29154968261719, "learning_rate": 3.269813772195756e-05, "loss": 8.0474, "step": 15080 }, { "epoch": 4.8288, "grad_norm": 142.3481903076172, "learning_rate": 3.265001684230788e-05, "loss": 4.2899, "step": 15090 }, { "epoch": 4.832, "grad_norm": 211.79403686523438, "learning_rate": 3.26018959626582e-05, "loss": 7.3181, "step": 15100 }, { "epoch": 4.8352, "grad_norm": 443.5145263671875, "learning_rate": 3.255377508300852e-05, "loss": 10.6205, "step": 15110 }, { "epoch": 4.8384, "grad_norm": 391.74725341796875, "learning_rate": 3.250565420335884e-05, "loss": 9.0033, "step": 15120 }, { "epoch": 4.8416, "grad_norm": 250.6675567626953, "learning_rate": 3.245753332370916e-05, "loss": 7.0266, "step": 15130 }, { "epoch": 4.8448, "grad_norm": 97.15452575683594, "learning_rate": 3.240941244405948e-05, "loss": 7.5582, "step": 15140 }, { "epoch": 4.848, "grad_norm": 93.81172180175781, "learning_rate": 3.23612915644098e-05, "loss": 3.1094, "step": 15150 }, { "epoch": 4.8512, "grad_norm": 90.86038208007812, "learning_rate": 3.231317068476012e-05, "loss": 4.9252, "step": 15160 }, { "epoch": 4.8544, "grad_norm": 276.02178955078125, "learning_rate": 3.226504980511044e-05, "loss": 13.4206, "step": 15170 }, { "epoch": 4.8576, "grad_norm": 360.1201477050781, "learning_rate": 3.221692892546076e-05, "loss": 8.8526, "step": 15180 }, { "epoch": 4.8608, "grad_norm": 48.165550231933594, "learning_rate": 3.216880804581108e-05, "loss": 4.4287, "step": 15190 }, { "epoch": 4.864, "grad_norm": 26.654298782348633, "learning_rate": 3.21206871661614e-05, "loss": 7.8631, "step": 15200 }, { "epoch": 4.8672, "grad_norm": 37.91269302368164, "learning_rate": 3.2072566286511716e-05, "loss": 7.369, "step": 15210 }, { "epoch": 4.8704, "grad_norm": 53.42951202392578, "learning_rate": 3.202444540686204e-05, "loss": 3.431, "step": 15220 }, { "epoch": 4.8736, "grad_norm": 165.07635498046875, "learning_rate": 3.197632452721236e-05, "loss": 13.6166, "step": 15230 }, { "epoch": 4.8768, "grad_norm": 10.949719429016113, "learning_rate": 3.192820364756268e-05, "loss": 12.9307, "step": 15240 }, { "epoch": 4.88, "grad_norm": 193.47561645507812, "learning_rate": 3.1880082767913e-05, "loss": 10.8231, "step": 15250 }, { "epoch": 4.8832, "grad_norm": 177.36154174804688, "learning_rate": 3.1831961888263316e-05, "loss": 7.0789, "step": 15260 }, { "epoch": 4.8864, "grad_norm": 59.642601013183594, "learning_rate": 3.178384100861364e-05, "loss": 7.0001, "step": 15270 }, { "epoch": 4.8896, "grad_norm": 427.9485778808594, "learning_rate": 3.173572012896396e-05, "loss": 11.0439, "step": 15280 }, { "epoch": 4.8928, "grad_norm": 288.674072265625, "learning_rate": 3.1687599249314284e-05, "loss": 6.1042, "step": 15290 }, { "epoch": 4.896, "grad_norm": 147.27413940429688, "learning_rate": 3.16394783696646e-05, "loss": 3.6784, "step": 15300 }, { "epoch": 4.8992, "grad_norm": 7.880171775817871, "learning_rate": 3.1591357490014916e-05, "loss": 5.7472, "step": 15310 }, { "epoch": 4.9024, "grad_norm": 130.74253845214844, "learning_rate": 3.154323661036524e-05, "loss": 10.4308, "step": 15320 }, { "epoch": 4.9056, "grad_norm": 216.4311981201172, "learning_rate": 3.1495115730715554e-05, "loss": 3.1505, "step": 15330 }, { "epoch": 4.9088, "grad_norm": 228.6086883544922, "learning_rate": 3.1446994851065884e-05, "loss": 10.7767, "step": 15340 }, { "epoch": 4.912, "grad_norm": 41.463932037353516, "learning_rate": 3.13988739714162e-05, "loss": 3.5023, "step": 15350 }, { "epoch": 4.9152000000000005, "grad_norm": 132.22169494628906, "learning_rate": 3.135075309176652e-05, "loss": 4.437, "step": 15360 }, { "epoch": 4.9184, "grad_norm": 123.64639282226562, "learning_rate": 3.130263221211684e-05, "loss": 5.7249, "step": 15370 }, { "epoch": 4.9216, "grad_norm": 22.774639129638672, "learning_rate": 3.1254511332467154e-05, "loss": 7.2843, "step": 15380 }, { "epoch": 4.9248, "grad_norm": 103.03235626220703, "learning_rate": 3.120639045281748e-05, "loss": 5.1156, "step": 15390 }, { "epoch": 4.928, "grad_norm": 375.8827209472656, "learning_rate": 3.11582695731678e-05, "loss": 7.3892, "step": 15400 }, { "epoch": 4.9312000000000005, "grad_norm": 609.2612915039062, "learning_rate": 3.111014869351812e-05, "loss": 8.6952, "step": 15410 }, { "epoch": 4.9344, "grad_norm": 119.58415985107422, "learning_rate": 3.106202781386844e-05, "loss": 2.5376, "step": 15420 }, { "epoch": 4.9376, "grad_norm": 851.2537231445312, "learning_rate": 3.101390693421876e-05, "loss": 12.8027, "step": 15430 }, { "epoch": 4.9408, "grad_norm": 141.66998291015625, "learning_rate": 3.096578605456908e-05, "loss": 6.4847, "step": 15440 }, { "epoch": 4.944, "grad_norm": 263.7648010253906, "learning_rate": 3.09176651749194e-05, "loss": 7.2153, "step": 15450 }, { "epoch": 4.9472000000000005, "grad_norm": 203.18482971191406, "learning_rate": 3.086954429526972e-05, "loss": 5.8634, "step": 15460 }, { "epoch": 4.9504, "grad_norm": 56.58214569091797, "learning_rate": 3.082142341562004e-05, "loss": 6.1852, "step": 15470 }, { "epoch": 4.9536, "grad_norm": 201.8409423828125, "learning_rate": 3.077330253597036e-05, "loss": 7.1284, "step": 15480 }, { "epoch": 4.9568, "grad_norm": 423.6837463378906, "learning_rate": 3.072518165632068e-05, "loss": 8.0504, "step": 15490 }, { "epoch": 4.96, "grad_norm": 45.82731628417969, "learning_rate": 3.0677060776671e-05, "loss": 11.7172, "step": 15500 }, { "epoch": 4.9632, "grad_norm": 109.822509765625, "learning_rate": 3.062893989702132e-05, "loss": 6.9922, "step": 15510 }, { "epoch": 4.9664, "grad_norm": 187.40000915527344, "learning_rate": 3.058081901737164e-05, "loss": 6.279, "step": 15520 }, { "epoch": 4.9696, "grad_norm": 211.17147827148438, "learning_rate": 3.053269813772196e-05, "loss": 4.3334, "step": 15530 }, { "epoch": 4.9728, "grad_norm": 652.0032348632812, "learning_rate": 3.0484577258072277e-05, "loss": 11.9485, "step": 15540 }, { "epoch": 4.976, "grad_norm": 257.0616149902344, "learning_rate": 3.04364563784226e-05, "loss": 6.2261, "step": 15550 }, { "epoch": 4.9792, "grad_norm": 85.20001983642578, "learning_rate": 3.038833549877292e-05, "loss": 5.4241, "step": 15560 }, { "epoch": 4.9824, "grad_norm": 91.3134765625, "learning_rate": 3.0340214619123242e-05, "loss": 7.2377, "step": 15570 }, { "epoch": 4.9856, "grad_norm": 183.97195434570312, "learning_rate": 3.0292093739473558e-05, "loss": 5.2757, "step": 15580 }, { "epoch": 4.9888, "grad_norm": 30.434110641479492, "learning_rate": 3.0243972859823877e-05, "loss": 4.1833, "step": 15590 }, { "epoch": 4.992, "grad_norm": 28.06619644165039, "learning_rate": 3.01958519801742e-05, "loss": 4.3259, "step": 15600 }, { "epoch": 4.9952, "grad_norm": 28.83574676513672, "learning_rate": 3.014773110052452e-05, "loss": 9.7587, "step": 15610 }, { "epoch": 4.9984, "grad_norm": 184.25595092773438, "learning_rate": 3.0099610220874842e-05, "loss": 4.3558, "step": 15620 }, { "epoch": 5.0016, "grad_norm": 72.25971984863281, "learning_rate": 3.0051489341225158e-05, "loss": 3.9499, "step": 15630 }, { "epoch": 5.0048, "grad_norm": 133.2681121826172, "learning_rate": 3.000336846157548e-05, "loss": 6.6277, "step": 15640 }, { "epoch": 5.008, "grad_norm": 62.12648391723633, "learning_rate": 2.99552475819258e-05, "loss": 9.8365, "step": 15650 }, { "epoch": 5.0112, "grad_norm": 117.30449676513672, "learning_rate": 2.9907126702276116e-05, "loss": 4.1696, "step": 15660 }, { "epoch": 5.0144, "grad_norm": 31.59276580810547, "learning_rate": 2.985900582262644e-05, "loss": 4.1672, "step": 15670 }, { "epoch": 5.0176, "grad_norm": 105.5892333984375, "learning_rate": 2.9810884942976758e-05, "loss": 6.9072, "step": 15680 }, { "epoch": 5.0208, "grad_norm": 407.95068359375, "learning_rate": 2.976276406332708e-05, "loss": 6.7035, "step": 15690 }, { "epoch": 5.024, "grad_norm": 354.446533203125, "learning_rate": 2.97146431836774e-05, "loss": 3.68, "step": 15700 }, { "epoch": 5.0272, "grad_norm": 54.057979583740234, "learning_rate": 2.9666522304027723e-05, "loss": 9.3417, "step": 15710 }, { "epoch": 5.0304, "grad_norm": 121.83612060546875, "learning_rate": 2.961840142437804e-05, "loss": 11.677, "step": 15720 }, { "epoch": 5.0336, "grad_norm": 137.72088623046875, "learning_rate": 2.9570280544728358e-05, "loss": 5.3047, "step": 15730 }, { "epoch": 5.0368, "grad_norm": 100.73807525634766, "learning_rate": 2.952215966507868e-05, "loss": 3.3448, "step": 15740 }, { "epoch": 5.04, "grad_norm": 359.5600280761719, "learning_rate": 2.9474038785428997e-05, "loss": 9.701, "step": 15750 }, { "epoch": 5.0432, "grad_norm": 87.38190460205078, "learning_rate": 2.9425917905779323e-05, "loss": 6.8862, "step": 15760 }, { "epoch": 5.0464, "grad_norm": 132.5592498779297, "learning_rate": 2.937779702612964e-05, "loss": 6.9687, "step": 15770 }, { "epoch": 5.0496, "grad_norm": 54.74911117553711, "learning_rate": 2.9329676146479958e-05, "loss": 8.484, "step": 15780 }, { "epoch": 5.0528, "grad_norm": 53.047908782958984, "learning_rate": 2.928155526683028e-05, "loss": 3.8589, "step": 15790 }, { "epoch": 5.056, "grad_norm": 138.95635986328125, "learning_rate": 2.9233434387180597e-05, "loss": 4.1625, "step": 15800 }, { "epoch": 5.0592, "grad_norm": 74.47620391845703, "learning_rate": 2.918531350753092e-05, "loss": 4.9843, "step": 15810 }, { "epoch": 5.0624, "grad_norm": 387.7304382324219, "learning_rate": 2.913719262788124e-05, "loss": 8.8081, "step": 15820 }, { "epoch": 5.0656, "grad_norm": 35.267024993896484, "learning_rate": 2.908907174823156e-05, "loss": 3.5744, "step": 15830 }, { "epoch": 5.0688, "grad_norm": 121.54290771484375, "learning_rate": 2.9040950868581877e-05, "loss": 3.4639, "step": 15840 }, { "epoch": 5.072, "grad_norm": 464.52728271484375, "learning_rate": 2.8992829988932197e-05, "loss": 8.777, "step": 15850 }, { "epoch": 5.0752, "grad_norm": 164.7660675048828, "learning_rate": 2.894470910928252e-05, "loss": 6.1059, "step": 15860 }, { "epoch": 5.0784, "grad_norm": 322.4814453125, "learning_rate": 2.889658822963284e-05, "loss": 6.5032, "step": 15870 }, { "epoch": 5.0816, "grad_norm": 151.1011199951172, "learning_rate": 2.884846734998316e-05, "loss": 3.286, "step": 15880 }, { "epoch": 5.0848, "grad_norm": 137.9677734375, "learning_rate": 2.8800346470333477e-05, "loss": 7.1731, "step": 15890 }, { "epoch": 5.088, "grad_norm": 350.030029296875, "learning_rate": 2.87522255906838e-05, "loss": 3.6659, "step": 15900 }, { "epoch": 5.0912, "grad_norm": 469.53765869140625, "learning_rate": 2.870410471103412e-05, "loss": 9.7411, "step": 15910 }, { "epoch": 5.0944, "grad_norm": 236.94566345214844, "learning_rate": 2.8655983831384435e-05, "loss": 7.4653, "step": 15920 }, { "epoch": 5.0976, "grad_norm": 63.707130432128906, "learning_rate": 2.8607862951734758e-05, "loss": 2.6429, "step": 15930 }, { "epoch": 5.1008, "grad_norm": 295.5698547363281, "learning_rate": 2.8559742072085077e-05, "loss": 10.0168, "step": 15940 }, { "epoch": 5.104, "grad_norm": 19.73590660095215, "learning_rate": 2.85116211924354e-05, "loss": 5.8276, "step": 15950 }, { "epoch": 5.1072, "grad_norm": 159.47750854492188, "learning_rate": 2.846350031278572e-05, "loss": 4.3727, "step": 15960 }, { "epoch": 5.1104, "grad_norm": 114.60530090332031, "learning_rate": 2.8415379433136042e-05, "loss": 4.447, "step": 15970 }, { "epoch": 5.1136, "grad_norm": 97.3194351196289, "learning_rate": 2.8367258553486358e-05, "loss": 9.4209, "step": 15980 }, { "epoch": 5.1168, "grad_norm": 141.7789764404297, "learning_rate": 2.8319137673836677e-05, "loss": 6.2586, "step": 15990 }, { "epoch": 5.12, "grad_norm": 117.36822509765625, "learning_rate": 2.8271016794187e-05, "loss": 4.039, "step": 16000 }, { "epoch": 5.1232, "grad_norm": 67.18170166015625, "learning_rate": 2.8222895914537316e-05, "loss": 11.1775, "step": 16010 }, { "epoch": 5.1264, "grad_norm": 215.93138122558594, "learning_rate": 2.8174775034887642e-05, "loss": 9.0792, "step": 16020 }, { "epoch": 5.1296, "grad_norm": 93.301513671875, "learning_rate": 2.8126654155237958e-05, "loss": 4.5495, "step": 16030 }, { "epoch": 5.1328, "grad_norm": 148.17263793945312, "learning_rate": 2.807853327558828e-05, "loss": 9.195, "step": 16040 }, { "epoch": 5.136, "grad_norm": 351.7143859863281, "learning_rate": 2.80304123959386e-05, "loss": 14.0712, "step": 16050 }, { "epoch": 5.1392, "grad_norm": 112.32396697998047, "learning_rate": 2.7982291516288916e-05, "loss": 3.8382, "step": 16060 }, { "epoch": 5.1424, "grad_norm": 370.50823974609375, "learning_rate": 2.793417063663924e-05, "loss": 7.9652, "step": 16070 }, { "epoch": 5.1456, "grad_norm": 206.6117706298828, "learning_rate": 2.7886049756989558e-05, "loss": 4.6328, "step": 16080 }, { "epoch": 5.1488, "grad_norm": 239.50857543945312, "learning_rate": 2.783792887733988e-05, "loss": 8.1652, "step": 16090 }, { "epoch": 5.152, "grad_norm": 67.14512634277344, "learning_rate": 2.7789807997690197e-05, "loss": 3.8, "step": 16100 }, { "epoch": 5.1552, "grad_norm": 372.5460510253906, "learning_rate": 2.7741687118040523e-05, "loss": 7.2759, "step": 16110 }, { "epoch": 5.1584, "grad_norm": 65.01236724853516, "learning_rate": 2.769356623839084e-05, "loss": 6.8679, "step": 16120 }, { "epoch": 5.1616, "grad_norm": 146.1551055908203, "learning_rate": 2.7645445358741158e-05, "loss": 5.063, "step": 16130 }, { "epoch": 5.1648, "grad_norm": 47.821205139160156, "learning_rate": 2.759732447909148e-05, "loss": 7.3481, "step": 16140 }, { "epoch": 5.168, "grad_norm": 85.73245239257812, "learning_rate": 2.7549203599441797e-05, "loss": 4.788, "step": 16150 }, { "epoch": 5.1712, "grad_norm": 11.634099006652832, "learning_rate": 2.750108271979212e-05, "loss": 2.1871, "step": 16160 }, { "epoch": 5.1744, "grad_norm": 248.97140502929688, "learning_rate": 2.745296184014244e-05, "loss": 10.6587, "step": 16170 }, { "epoch": 5.1776, "grad_norm": 87.56939697265625, "learning_rate": 2.740484096049276e-05, "loss": 6.0361, "step": 16180 }, { "epoch": 5.1808, "grad_norm": 143.6980743408203, "learning_rate": 2.7356720080843077e-05, "loss": 5.4118, "step": 16190 }, { "epoch": 5.184, "grad_norm": 244.07899475097656, "learning_rate": 2.7308599201193397e-05, "loss": 15.6558, "step": 16200 }, { "epoch": 5.1872, "grad_norm": 147.19642639160156, "learning_rate": 2.726047832154372e-05, "loss": 5.5982, "step": 16210 }, { "epoch": 5.1904, "grad_norm": 177.24131774902344, "learning_rate": 2.721235744189404e-05, "loss": 5.7205, "step": 16220 }, { "epoch": 5.1936, "grad_norm": 445.0991516113281, "learning_rate": 2.716423656224436e-05, "loss": 10.0954, "step": 16230 }, { "epoch": 5.1968, "grad_norm": 67.33574676513672, "learning_rate": 2.7116115682594677e-05, "loss": 4.4138, "step": 16240 }, { "epoch": 5.2, "grad_norm": 251.82997131347656, "learning_rate": 2.7067994802945e-05, "loss": 10.537, "step": 16250 }, { "epoch": 5.2032, "grad_norm": 572.82373046875, "learning_rate": 2.701987392329532e-05, "loss": 8.8704, "step": 16260 }, { "epoch": 5.2064, "grad_norm": 65.88017272949219, "learning_rate": 2.6971753043645635e-05, "loss": 3.6974, "step": 16270 }, { "epoch": 5.2096, "grad_norm": 71.66238403320312, "learning_rate": 2.692363216399596e-05, "loss": 4.5059, "step": 16280 }, { "epoch": 5.2128, "grad_norm": 319.1573791503906, "learning_rate": 2.6875511284346278e-05, "loss": 6.7203, "step": 16290 }, { "epoch": 5.216, "grad_norm": 99.7453384399414, "learning_rate": 2.68273904046966e-05, "loss": 6.6855, "step": 16300 }, { "epoch": 5.2192, "grad_norm": 331.1013488769531, "learning_rate": 2.677926952504692e-05, "loss": 5.9585, "step": 16310 }, { "epoch": 5.2224, "grad_norm": 236.64344787597656, "learning_rate": 2.6731148645397242e-05, "loss": 7.9318, "step": 16320 }, { "epoch": 5.2256, "grad_norm": 73.85679626464844, "learning_rate": 2.6683027765747558e-05, "loss": 8.5088, "step": 16330 }, { "epoch": 5.2288, "grad_norm": 88.76898956298828, "learning_rate": 2.6634906886097878e-05, "loss": 5.2926, "step": 16340 }, { "epoch": 5.232, "grad_norm": 19.761035919189453, "learning_rate": 2.65867860064482e-05, "loss": 3.2963, "step": 16350 }, { "epoch": 5.2352, "grad_norm": 399.1698303222656, "learning_rate": 2.6538665126798516e-05, "loss": 5.4284, "step": 16360 }, { "epoch": 5.2384, "grad_norm": 271.735595703125, "learning_rate": 2.6490544247148842e-05, "loss": 4.9333, "step": 16370 }, { "epoch": 5.2416, "grad_norm": 88.03325653076172, "learning_rate": 2.6442423367499158e-05, "loss": 6.0586, "step": 16380 }, { "epoch": 5.2448, "grad_norm": 117.71830749511719, "learning_rate": 2.639430248784948e-05, "loss": 4.3421, "step": 16390 }, { "epoch": 5.248, "grad_norm": 80.27143096923828, "learning_rate": 2.63461816081998e-05, "loss": 1.927, "step": 16400 }, { "epoch": 5.2512, "grad_norm": 135.8401336669922, "learning_rate": 2.6298060728550116e-05, "loss": 8.4009, "step": 16410 }, { "epoch": 5.2544, "grad_norm": 14.429817199707031, "learning_rate": 2.624993984890044e-05, "loss": 5.5504, "step": 16420 }, { "epoch": 5.2576, "grad_norm": 470.201904296875, "learning_rate": 2.6201818969250758e-05, "loss": 11.718, "step": 16430 }, { "epoch": 5.2608, "grad_norm": 312.2998352050781, "learning_rate": 2.615369808960108e-05, "loss": 11.6401, "step": 16440 }, { "epoch": 5.264, "grad_norm": 42.97299575805664, "learning_rate": 2.6105577209951397e-05, "loss": 1.6013, "step": 16450 }, { "epoch": 5.2672, "grad_norm": 163.26837158203125, "learning_rate": 2.6057456330301723e-05, "loss": 6.4067, "step": 16460 }, { "epoch": 5.2704, "grad_norm": 184.8193817138672, "learning_rate": 2.600933545065204e-05, "loss": 7.5856, "step": 16470 }, { "epoch": 5.2736, "grad_norm": 4.910395622253418, "learning_rate": 2.5961214571002358e-05, "loss": 2.8182, "step": 16480 }, { "epoch": 5.2768, "grad_norm": 16.06243896484375, "learning_rate": 2.591309369135268e-05, "loss": 9.6371, "step": 16490 }, { "epoch": 5.28, "grad_norm": 624.3032836914062, "learning_rate": 2.5864972811702997e-05, "loss": 6.3539, "step": 16500 }, { "epoch": 5.2832, "grad_norm": 161.3603973388672, "learning_rate": 2.581685193205332e-05, "loss": 5.5155, "step": 16510 }, { "epoch": 5.2864, "grad_norm": 223.1274871826172, "learning_rate": 2.576873105240364e-05, "loss": 6.6144, "step": 16520 }, { "epoch": 5.2896, "grad_norm": 11.653890609741211, "learning_rate": 2.5720610172753962e-05, "loss": 5.0938, "step": 16530 }, { "epoch": 5.2928, "grad_norm": 77.67524719238281, "learning_rate": 2.5672489293104278e-05, "loss": 7.8173, "step": 16540 }, { "epoch": 5.296, "grad_norm": 54.87876510620117, "learning_rate": 2.5624368413454597e-05, "loss": 5.5919, "step": 16550 }, { "epoch": 5.2992, "grad_norm": 121.10255432128906, "learning_rate": 2.557624753380492e-05, "loss": 4.8316, "step": 16560 }, { "epoch": 5.3024000000000004, "grad_norm": 245.53805541992188, "learning_rate": 2.552812665415524e-05, "loss": 5.6025, "step": 16570 }, { "epoch": 5.3056, "grad_norm": 165.28176879882812, "learning_rate": 2.5480005774505562e-05, "loss": 9.3643, "step": 16580 }, { "epoch": 5.3088, "grad_norm": 253.56057739257812, "learning_rate": 2.5431884894855878e-05, "loss": 3.018, "step": 16590 }, { "epoch": 5.312, "grad_norm": 328.3662414550781, "learning_rate": 2.53837640152062e-05, "loss": 6.4544, "step": 16600 }, { "epoch": 5.3152, "grad_norm": 100.15255737304688, "learning_rate": 2.533564313555652e-05, "loss": 8.8217, "step": 16610 }, { "epoch": 5.3184000000000005, "grad_norm": 72.20362091064453, "learning_rate": 2.5287522255906836e-05, "loss": 11.4448, "step": 16620 }, { "epoch": 5.3216, "grad_norm": 154.09164428710938, "learning_rate": 2.5239401376257162e-05, "loss": 4.7238, "step": 16630 }, { "epoch": 5.3248, "grad_norm": 118.145751953125, "learning_rate": 2.5191280496607478e-05, "loss": 8.5312, "step": 16640 }, { "epoch": 5.328, "grad_norm": 295.76202392578125, "learning_rate": 2.51431596169578e-05, "loss": 6.5859, "step": 16650 }, { "epoch": 5.3312, "grad_norm": 399.7045593261719, "learning_rate": 2.509503873730812e-05, "loss": 10.0417, "step": 16660 }, { "epoch": 5.3344, "grad_norm": 101.16209411621094, "learning_rate": 2.5046917857658442e-05, "loss": 8.5092, "step": 16670 }, { "epoch": 5.3376, "grad_norm": 73.08101654052734, "learning_rate": 2.499879697800876e-05, "loss": 3.6922, "step": 16680 }, { "epoch": 5.3408, "grad_norm": 168.78477478027344, "learning_rate": 2.495067609835908e-05, "loss": 8.4038, "step": 16690 }, { "epoch": 5.344, "grad_norm": 42.43104553222656, "learning_rate": 2.49025552187094e-05, "loss": 6.5543, "step": 16700 }, { "epoch": 5.3472, "grad_norm": 280.9566955566406, "learning_rate": 2.4854434339059716e-05, "loss": 10.3781, "step": 16710 }, { "epoch": 5.3504, "grad_norm": 118.36164855957031, "learning_rate": 2.480631345941004e-05, "loss": 1.5687, "step": 16720 }, { "epoch": 5.3536, "grad_norm": 257.1197814941406, "learning_rate": 2.475819257976036e-05, "loss": 4.0468, "step": 16730 }, { "epoch": 5.3568, "grad_norm": 75.32530212402344, "learning_rate": 2.4710071700110678e-05, "loss": 6.8949, "step": 16740 }, { "epoch": 5.36, "grad_norm": 252.7229766845703, "learning_rate": 2.4661950820461e-05, "loss": 7.5414, "step": 16750 }, { "epoch": 5.3632, "grad_norm": 132.25466918945312, "learning_rate": 2.461382994081132e-05, "loss": 4.4817, "step": 16760 }, { "epoch": 5.3664, "grad_norm": 167.68414306640625, "learning_rate": 2.456570906116164e-05, "loss": 4.9337, "step": 16770 }, { "epoch": 5.3696, "grad_norm": 239.80227661132812, "learning_rate": 2.451758818151196e-05, "loss": 7.1544, "step": 16780 }, { "epoch": 5.3728, "grad_norm": 308.13311767578125, "learning_rate": 2.4469467301862278e-05, "loss": 3.8484, "step": 16790 }, { "epoch": 5.376, "grad_norm": 571.2833862304688, "learning_rate": 2.4421346422212597e-05, "loss": 11.5687, "step": 16800 }, { "epoch": 5.3792, "grad_norm": 152.44236755371094, "learning_rate": 2.437322554256292e-05, "loss": 12.5365, "step": 16810 }, { "epoch": 5.3824, "grad_norm": 11.518658638000488, "learning_rate": 2.432510466291324e-05, "loss": 5.2198, "step": 16820 }, { "epoch": 5.3856, "grad_norm": 162.59210205078125, "learning_rate": 2.427698378326356e-05, "loss": 7.7696, "step": 16830 }, { "epoch": 5.3888, "grad_norm": 64.32059478759766, "learning_rate": 2.422886290361388e-05, "loss": 9.992, "step": 16840 }, { "epoch": 5.392, "grad_norm": 35.79432678222656, "learning_rate": 2.4180742023964197e-05, "loss": 2.8258, "step": 16850 }, { "epoch": 5.3952, "grad_norm": 102.675048828125, "learning_rate": 2.413262114431452e-05, "loss": 6.8104, "step": 16860 }, { "epoch": 5.3984, "grad_norm": 331.57086181640625, "learning_rate": 2.408450026466484e-05, "loss": 7.1477, "step": 16870 }, { "epoch": 5.4016, "grad_norm": 160.4903106689453, "learning_rate": 2.403637938501516e-05, "loss": 3.92, "step": 16880 }, { "epoch": 5.4048, "grad_norm": 974.5618286132812, "learning_rate": 2.398825850536548e-05, "loss": 18.1932, "step": 16890 }, { "epoch": 5.408, "grad_norm": 248.79446411132812, "learning_rate": 2.39401376257158e-05, "loss": 13.3369, "step": 16900 }, { "epoch": 5.4112, "grad_norm": 202.8837432861328, "learning_rate": 2.389201674606612e-05, "loss": 6.3833, "step": 16910 }, { "epoch": 5.4144, "grad_norm": 33.91128158569336, "learning_rate": 2.384389586641644e-05, "loss": 2.8662, "step": 16920 }, { "epoch": 5.4176, "grad_norm": 143.8839569091797, "learning_rate": 2.379577498676676e-05, "loss": 2.679, "step": 16930 }, { "epoch": 5.4208, "grad_norm": 142.1279296875, "learning_rate": 2.3747654107117078e-05, "loss": 5.563, "step": 16940 }, { "epoch": 5.424, "grad_norm": 5.882359027862549, "learning_rate": 2.36995332274674e-05, "loss": 7.8976, "step": 16950 }, { "epoch": 5.4272, "grad_norm": 128.54779052734375, "learning_rate": 2.365141234781772e-05, "loss": 3.6569, "step": 16960 }, { "epoch": 5.4304, "grad_norm": 178.67684936523438, "learning_rate": 2.360329146816804e-05, "loss": 3.5193, "step": 16970 }, { "epoch": 5.4336, "grad_norm": 24.382736206054688, "learning_rate": 2.3555170588518362e-05, "loss": 9.8714, "step": 16980 }, { "epoch": 5.4368, "grad_norm": 52.97625732421875, "learning_rate": 2.3507049708868678e-05, "loss": 6.3641, "step": 16990 }, { "epoch": 5.44, "grad_norm": 216.1295623779297, "learning_rate": 2.3458928829218997e-05, "loss": 4.2272, "step": 17000 }, { "epoch": 5.4432, "grad_norm": 545.7445068359375, "learning_rate": 2.341080794956932e-05, "loss": 9.1847, "step": 17010 }, { "epoch": 5.4464, "grad_norm": 99.55867767333984, "learning_rate": 2.336268706991964e-05, "loss": 10.188, "step": 17020 }, { "epoch": 5.4496, "grad_norm": 215.9521026611328, "learning_rate": 2.331456619026996e-05, "loss": 6.0431, "step": 17030 }, { "epoch": 5.4528, "grad_norm": 52.24707794189453, "learning_rate": 2.326644531062028e-05, "loss": 4.1778, "step": 17040 }, { "epoch": 5.456, "grad_norm": 276.9158630371094, "learning_rate": 2.32183244309706e-05, "loss": 8.9711, "step": 17050 }, { "epoch": 5.4592, "grad_norm": 163.34182739257812, "learning_rate": 2.3170203551320917e-05, "loss": 5.3662, "step": 17060 }, { "epoch": 5.4624, "grad_norm": 61.05720901489258, "learning_rate": 2.312208267167124e-05, "loss": 3.1632, "step": 17070 }, { "epoch": 5.4656, "grad_norm": 117.36095428466797, "learning_rate": 2.307396179202156e-05, "loss": 5.2404, "step": 17080 }, { "epoch": 5.4688, "grad_norm": 37.871891021728516, "learning_rate": 2.3025840912371878e-05, "loss": 5.6053, "step": 17090 }, { "epoch": 5.4719999999999995, "grad_norm": 48.21548843383789, "learning_rate": 2.29777200327222e-05, "loss": 4.4834, "step": 17100 }, { "epoch": 5.4752, "grad_norm": 76.3283920288086, "learning_rate": 2.292959915307252e-05, "loss": 5.2274, "step": 17110 }, { "epoch": 5.4784, "grad_norm": 375.5204162597656, "learning_rate": 2.288147827342284e-05, "loss": 7.1106, "step": 17120 }, { "epoch": 5.4816, "grad_norm": 136.9579620361328, "learning_rate": 2.283335739377316e-05, "loss": 6.0955, "step": 17130 }, { "epoch": 5.4848, "grad_norm": 135.28114318847656, "learning_rate": 2.2785236514123478e-05, "loss": 13.9789, "step": 17140 }, { "epoch": 5.4879999999999995, "grad_norm": 17.562471389770508, "learning_rate": 2.2737115634473797e-05, "loss": 7.481, "step": 17150 }, { "epoch": 5.4912, "grad_norm": 82.12932586669922, "learning_rate": 2.268899475482412e-05, "loss": 5.4491, "step": 17160 }, { "epoch": 5.4944, "grad_norm": 139.13844299316406, "learning_rate": 2.264087387517444e-05, "loss": 4.9945, "step": 17170 }, { "epoch": 5.4976, "grad_norm": 8.68923568725586, "learning_rate": 2.259275299552476e-05, "loss": 5.2018, "step": 17180 }, { "epoch": 5.5008, "grad_norm": 419.0538635253906, "learning_rate": 2.254463211587508e-05, "loss": 5.2193, "step": 17190 }, { "epoch": 5.504, "grad_norm": 125.63003540039062, "learning_rate": 2.2496511236225397e-05, "loss": 4.6555, "step": 17200 }, { "epoch": 5.5072, "grad_norm": 66.9253921508789, "learning_rate": 2.244839035657572e-05, "loss": 5.3293, "step": 17210 }, { "epoch": 5.5104, "grad_norm": 132.5989227294922, "learning_rate": 2.240026947692604e-05, "loss": 3.6147, "step": 17220 }, { "epoch": 5.5136, "grad_norm": 658.3170166015625, "learning_rate": 2.235214859727636e-05, "loss": 5.8501, "step": 17230 }, { "epoch": 5.5168, "grad_norm": 127.98309326171875, "learning_rate": 2.230402771762668e-05, "loss": 10.026, "step": 17240 }, { "epoch": 5.52, "grad_norm": 120.48301696777344, "learning_rate": 2.2255906837977e-05, "loss": 5.0785, "step": 17250 }, { "epoch": 5.5232, "grad_norm": 245.26979064941406, "learning_rate": 2.220778595832732e-05, "loss": 12.0439, "step": 17260 }, { "epoch": 5.5264, "grad_norm": 144.724365234375, "learning_rate": 2.215966507867764e-05, "loss": 4.2044, "step": 17270 }, { "epoch": 5.5296, "grad_norm": 84.62095642089844, "learning_rate": 2.211154419902796e-05, "loss": 4.278, "step": 17280 }, { "epoch": 5.5328, "grad_norm": 273.14617919921875, "learning_rate": 2.2063423319378278e-05, "loss": 5.6814, "step": 17290 }, { "epoch": 5.536, "grad_norm": 24.513885498046875, "learning_rate": 2.20153024397286e-05, "loss": 6.2836, "step": 17300 }, { "epoch": 5.5392, "grad_norm": 89.3847427368164, "learning_rate": 2.196718156007892e-05, "loss": 7.1967, "step": 17310 }, { "epoch": 5.5424, "grad_norm": 42.9315299987793, "learning_rate": 2.191906068042924e-05, "loss": 6.9065, "step": 17320 }, { "epoch": 5.5456, "grad_norm": 314.33123779296875, "learning_rate": 2.187093980077956e-05, "loss": 6.1874, "step": 17330 }, { "epoch": 5.5488, "grad_norm": 132.9884490966797, "learning_rate": 2.1822818921129878e-05, "loss": 4.366, "step": 17340 }, { "epoch": 5.552, "grad_norm": 39.1591682434082, "learning_rate": 2.1774698041480197e-05, "loss": 4.9835, "step": 17350 }, { "epoch": 5.5552, "grad_norm": 162.68557739257812, "learning_rate": 2.172657716183052e-05, "loss": 4.4605, "step": 17360 }, { "epoch": 5.5584, "grad_norm": 460.33563232421875, "learning_rate": 2.167845628218084e-05, "loss": 5.4169, "step": 17370 }, { "epoch": 5.5616, "grad_norm": 17.75435447692871, "learning_rate": 2.163033540253116e-05, "loss": 6.2572, "step": 17380 }, { "epoch": 5.5648, "grad_norm": 315.34564208984375, "learning_rate": 2.158221452288148e-05, "loss": 6.6178, "step": 17390 }, { "epoch": 5.568, "grad_norm": 144.3094024658203, "learning_rate": 2.1534093643231797e-05, "loss": 6.4261, "step": 17400 }, { "epoch": 5.5712, "grad_norm": 80.86283111572266, "learning_rate": 2.1485972763582117e-05, "loss": 5.8925, "step": 17410 }, { "epoch": 5.5744, "grad_norm": 152.36431884765625, "learning_rate": 2.143785188393244e-05, "loss": 3.0784, "step": 17420 }, { "epoch": 5.5776, "grad_norm": 89.33140563964844, "learning_rate": 2.138973100428276e-05, "loss": 4.4339, "step": 17430 }, { "epoch": 5.5808, "grad_norm": 591.7390747070312, "learning_rate": 2.1341610124633078e-05, "loss": 5.2465, "step": 17440 }, { "epoch": 5.584, "grad_norm": 107.24054718017578, "learning_rate": 2.12934892449834e-05, "loss": 4.7794, "step": 17450 }, { "epoch": 5.5872, "grad_norm": 44.13363265991211, "learning_rate": 2.124536836533372e-05, "loss": 5.4835, "step": 17460 }, { "epoch": 5.5904, "grad_norm": 33.71746826171875, "learning_rate": 2.119724748568404e-05, "loss": 5.5547, "step": 17470 }, { "epoch": 5.5936, "grad_norm": 47.45794677734375, "learning_rate": 2.114912660603436e-05, "loss": 3.9636, "step": 17480 }, { "epoch": 5.5968, "grad_norm": 25.6744327545166, "learning_rate": 2.1101005726384678e-05, "loss": 5.9998, "step": 17490 }, { "epoch": 5.6, "grad_norm": 110.42631530761719, "learning_rate": 2.1052884846735e-05, "loss": 4.2139, "step": 17500 }, { "epoch": 5.6032, "grad_norm": 131.73936462402344, "learning_rate": 2.100476396708532e-05, "loss": 7.7626, "step": 17510 }, { "epoch": 5.6064, "grad_norm": 133.59164428710938, "learning_rate": 2.095664308743564e-05, "loss": 7.4239, "step": 17520 }, { "epoch": 5.6096, "grad_norm": 279.53448486328125, "learning_rate": 2.090852220778596e-05, "loss": 7.0999, "step": 17530 }, { "epoch": 5.6128, "grad_norm": 230.70094299316406, "learning_rate": 2.0860401328136278e-05, "loss": 5.2878, "step": 17540 }, { "epoch": 5.616, "grad_norm": 67.16010284423828, "learning_rate": 2.0812280448486598e-05, "loss": 5.909, "step": 17550 }, { "epoch": 5.6192, "grad_norm": 253.69671630859375, "learning_rate": 2.076415956883692e-05, "loss": 6.0945, "step": 17560 }, { "epoch": 5.6224, "grad_norm": 221.26583862304688, "learning_rate": 2.071603868918724e-05, "loss": 5.8222, "step": 17570 }, { "epoch": 5.6256, "grad_norm": 320.3901672363281, "learning_rate": 2.066791780953756e-05, "loss": 5.8375, "step": 17580 }, { "epoch": 5.6288, "grad_norm": 331.9903869628906, "learning_rate": 2.061979692988788e-05, "loss": 5.979, "step": 17590 }, { "epoch": 5.632, "grad_norm": 28.595483779907227, "learning_rate": 2.05716760502382e-05, "loss": 4.1591, "step": 17600 }, { "epoch": 5.6352, "grad_norm": 20.17335319519043, "learning_rate": 2.0523555170588517e-05, "loss": 6.5196, "step": 17610 }, { "epoch": 5.6384, "grad_norm": 46.73622512817383, "learning_rate": 2.047543429093884e-05, "loss": 2.3983, "step": 17620 }, { "epoch": 5.6416, "grad_norm": 352.5174865722656, "learning_rate": 2.042731341128916e-05, "loss": 10.9599, "step": 17630 }, { "epoch": 5.6448, "grad_norm": 156.60694885253906, "learning_rate": 2.0379192531639478e-05, "loss": 5.6494, "step": 17640 }, { "epoch": 5.648, "grad_norm": 171.55755615234375, "learning_rate": 2.03310716519898e-05, "loss": 4.0468, "step": 17650 }, { "epoch": 5.6512, "grad_norm": 595.6713256835938, "learning_rate": 2.028295077234012e-05, "loss": 8.3896, "step": 17660 }, { "epoch": 5.6544, "grad_norm": 124.84902954101562, "learning_rate": 2.023482989269044e-05, "loss": 4.7206, "step": 17670 }, { "epoch": 5.6576, "grad_norm": 59.092464447021484, "learning_rate": 2.018670901304076e-05, "loss": 10.5288, "step": 17680 }, { "epoch": 5.6608, "grad_norm": 148.74388122558594, "learning_rate": 2.013858813339108e-05, "loss": 6.7232, "step": 17690 }, { "epoch": 5.664, "grad_norm": 104.44282531738281, "learning_rate": 2.0090467253741398e-05, "loss": 5.9889, "step": 17700 }, { "epoch": 5.6672, "grad_norm": 14.767074584960938, "learning_rate": 2.004234637409172e-05, "loss": 6.4026, "step": 17710 }, { "epoch": 5.6704, "grad_norm": 168.45936584472656, "learning_rate": 1.999422549444204e-05, "loss": 4.4146, "step": 17720 }, { "epoch": 5.6736, "grad_norm": 259.3738708496094, "learning_rate": 1.994610461479236e-05, "loss": 4.0013, "step": 17730 }, { "epoch": 5.6768, "grad_norm": 209.3070831298828, "learning_rate": 1.9897983735142682e-05, "loss": 8.4221, "step": 17740 }, { "epoch": 5.68, "grad_norm": 14.505404472351074, "learning_rate": 1.9849862855492998e-05, "loss": 5.0816, "step": 17750 }, { "epoch": 5.6832, "grad_norm": 66.14643859863281, "learning_rate": 1.980174197584332e-05, "loss": 11.3057, "step": 17760 }, { "epoch": 5.6864, "grad_norm": 210.3777313232422, "learning_rate": 1.975362109619364e-05, "loss": 13.4353, "step": 17770 }, { "epoch": 5.6896, "grad_norm": 64.52182006835938, "learning_rate": 1.970550021654396e-05, "loss": 3.7719, "step": 17780 }, { "epoch": 5.6928, "grad_norm": 399.90264892578125, "learning_rate": 1.965737933689428e-05, "loss": 10.9653, "step": 17790 }, { "epoch": 5.696, "grad_norm": 309.4091491699219, "learning_rate": 1.96092584572446e-05, "loss": 3.2319, "step": 17800 }, { "epoch": 5.6992, "grad_norm": 16.168378829956055, "learning_rate": 1.956113757759492e-05, "loss": 3.8206, "step": 17810 }, { "epoch": 5.7024, "grad_norm": 183.8621826171875, "learning_rate": 1.951301669794524e-05, "loss": 8.292, "step": 17820 }, { "epoch": 5.7056000000000004, "grad_norm": 290.3883972167969, "learning_rate": 1.946489581829556e-05, "loss": 10.862, "step": 17830 }, { "epoch": 5.7088, "grad_norm": 82.94297790527344, "learning_rate": 1.941677493864588e-05, "loss": 3.9342, "step": 17840 }, { "epoch": 5.712, "grad_norm": 17.132610321044922, "learning_rate": 1.93686540589962e-05, "loss": 8.0398, "step": 17850 }, { "epoch": 5.7152, "grad_norm": 78.22087860107422, "learning_rate": 1.932053317934652e-05, "loss": 11.6153, "step": 17860 }, { "epoch": 5.7184, "grad_norm": 222.96234130859375, "learning_rate": 1.927241229969684e-05, "loss": 3.2612, "step": 17870 }, { "epoch": 5.7216000000000005, "grad_norm": 173.70079040527344, "learning_rate": 1.922429142004716e-05, "loss": 11.3988, "step": 17880 }, { "epoch": 5.7248, "grad_norm": 155.9319610595703, "learning_rate": 1.917617054039748e-05, "loss": 7.31, "step": 17890 }, { "epoch": 5.728, "grad_norm": 25.57274627685547, "learning_rate": 1.9128049660747798e-05, "loss": 4.5636, "step": 17900 }, { "epoch": 5.7312, "grad_norm": 185.43978881835938, "learning_rate": 1.907992878109812e-05, "loss": 4.891, "step": 17910 }, { "epoch": 5.7344, "grad_norm": 141.6038818359375, "learning_rate": 1.903180790144844e-05, "loss": 3.9273, "step": 17920 }, { "epoch": 5.7376000000000005, "grad_norm": 137.5728302001953, "learning_rate": 1.898368702179876e-05, "loss": 13.2236, "step": 17930 }, { "epoch": 5.7408, "grad_norm": 31.237668991088867, "learning_rate": 1.8935566142149082e-05, "loss": 5.3887, "step": 17940 }, { "epoch": 5.744, "grad_norm": 87.02256774902344, "learning_rate": 1.8887445262499398e-05, "loss": 5.2575, "step": 17950 }, { "epoch": 5.7472, "grad_norm": 141.08407592773438, "learning_rate": 1.8839324382849717e-05, "loss": 5.0405, "step": 17960 }, { "epoch": 5.7504, "grad_norm": 203.0397186279297, "learning_rate": 1.879120350320004e-05, "loss": 3.8057, "step": 17970 }, { "epoch": 5.7536000000000005, "grad_norm": 376.3076477050781, "learning_rate": 1.874308262355036e-05, "loss": 5.8243, "step": 17980 }, { "epoch": 5.7568, "grad_norm": 545.3963623046875, "learning_rate": 1.869496174390068e-05, "loss": 8.5602, "step": 17990 }, { "epoch": 5.76, "grad_norm": 152.22384643554688, "learning_rate": 1.8646840864251e-05, "loss": 5.8367, "step": 18000 }, { "epoch": 5.7632, "grad_norm": 127.8596420288086, "learning_rate": 1.859871998460132e-05, "loss": 2.2566, "step": 18010 }, { "epoch": 5.7664, "grad_norm": 76.36260986328125, "learning_rate": 1.8550599104951636e-05, "loss": 2.1901, "step": 18020 }, { "epoch": 5.7696, "grad_norm": 299.13702392578125, "learning_rate": 1.850247822530196e-05, "loss": 9.5776, "step": 18030 }, { "epoch": 5.7728, "grad_norm": 25.609708786010742, "learning_rate": 1.845435734565228e-05, "loss": 3.5249, "step": 18040 }, { "epoch": 5.776, "grad_norm": 194.51499938964844, "learning_rate": 1.8406236466002598e-05, "loss": 6.3563, "step": 18050 }, { "epoch": 5.7792, "grad_norm": 38.76519775390625, "learning_rate": 1.835811558635292e-05, "loss": 6.4866, "step": 18060 }, { "epoch": 5.7824, "grad_norm": 391.5928955078125, "learning_rate": 1.830999470670324e-05, "loss": 6.1177, "step": 18070 }, { "epoch": 5.7856, "grad_norm": 179.78688049316406, "learning_rate": 1.826187382705356e-05, "loss": 4.7081, "step": 18080 }, { "epoch": 5.7888, "grad_norm": 277.09368896484375, "learning_rate": 1.821375294740388e-05, "loss": 6.4989, "step": 18090 }, { "epoch": 5.792, "grad_norm": 157.92308044433594, "learning_rate": 1.8165632067754198e-05, "loss": 2.4596, "step": 18100 }, { "epoch": 5.7952, "grad_norm": 165.11151123046875, "learning_rate": 1.811751118810452e-05, "loss": 6.712, "step": 18110 }, { "epoch": 5.7984, "grad_norm": 150.787841796875, "learning_rate": 1.806939030845484e-05, "loss": 3.6624, "step": 18120 }, { "epoch": 5.8016, "grad_norm": 69.14986419677734, "learning_rate": 1.802126942880516e-05, "loss": 7.1376, "step": 18130 }, { "epoch": 5.8048, "grad_norm": 110.26704406738281, "learning_rate": 1.797314854915548e-05, "loss": 6.7027, "step": 18140 }, { "epoch": 5.808, "grad_norm": 210.7619171142578, "learning_rate": 1.79250276695058e-05, "loss": 8.2289, "step": 18150 }, { "epoch": 5.8112, "grad_norm": 13.080667495727539, "learning_rate": 1.7876906789856117e-05, "loss": 7.4753, "step": 18160 }, { "epoch": 5.8144, "grad_norm": 75.03619384765625, "learning_rate": 1.782878591020644e-05, "loss": 8.6267, "step": 18170 }, { "epoch": 5.8176, "grad_norm": 73.8773422241211, "learning_rate": 1.778066503055676e-05, "loss": 7.4141, "step": 18180 }, { "epoch": 5.8208, "grad_norm": 413.8924560546875, "learning_rate": 1.773254415090708e-05, "loss": 9.0137, "step": 18190 }, { "epoch": 5.824, "grad_norm": 3.4073636531829834, "learning_rate": 1.76844232712574e-05, "loss": 4.2835, "step": 18200 }, { "epoch": 5.8272, "grad_norm": 90.74120330810547, "learning_rate": 1.763630239160772e-05, "loss": 5.2425, "step": 18210 }, { "epoch": 5.8304, "grad_norm": 39.07492446899414, "learning_rate": 1.758818151195804e-05, "loss": 3.7187, "step": 18220 }, { "epoch": 5.8336, "grad_norm": 41.2070198059082, "learning_rate": 1.754006063230836e-05, "loss": 4.878, "step": 18230 }, { "epoch": 5.8368, "grad_norm": 178.09432983398438, "learning_rate": 1.749193975265868e-05, "loss": 6.2356, "step": 18240 }, { "epoch": 5.84, "grad_norm": 105.47174072265625, "learning_rate": 1.7443818873008998e-05, "loss": 6.5983, "step": 18250 }, { "epoch": 5.8431999999999995, "grad_norm": 14.411127090454102, "learning_rate": 1.739569799335932e-05, "loss": 6.5114, "step": 18260 }, { "epoch": 5.8464, "grad_norm": 259.1484680175781, "learning_rate": 1.734757711370964e-05, "loss": 5.1548, "step": 18270 }, { "epoch": 5.8496, "grad_norm": 136.5657958984375, "learning_rate": 1.729945623405996e-05, "loss": 8.0494, "step": 18280 }, { "epoch": 5.8528, "grad_norm": 274.432861328125, "learning_rate": 1.7251335354410282e-05, "loss": 5.3024, "step": 18290 }, { "epoch": 5.856, "grad_norm": 184.41543579101562, "learning_rate": 1.7203214474760598e-05, "loss": 6.1458, "step": 18300 }, { "epoch": 5.8591999999999995, "grad_norm": 264.2789001464844, "learning_rate": 1.7155093595110917e-05, "loss": 6.1656, "step": 18310 }, { "epoch": 5.8624, "grad_norm": 113.28950500488281, "learning_rate": 1.710697271546124e-05, "loss": 2.1757, "step": 18320 }, { "epoch": 5.8656, "grad_norm": 14.294161796569824, "learning_rate": 1.705885183581156e-05, "loss": 2.0296, "step": 18330 }, { "epoch": 5.8688, "grad_norm": 52.15556716918945, "learning_rate": 1.701073095616188e-05, "loss": 5.5319, "step": 18340 }, { "epoch": 5.872, "grad_norm": 6.450396537780762, "learning_rate": 1.69626100765122e-05, "loss": 9.0948, "step": 18350 }, { "epoch": 5.8751999999999995, "grad_norm": 274.41131591796875, "learning_rate": 1.691448919686252e-05, "loss": 7.8572, "step": 18360 }, { "epoch": 5.8784, "grad_norm": 454.1431884765625, "learning_rate": 1.686636831721284e-05, "loss": 6.0475, "step": 18370 }, { "epoch": 5.8816, "grad_norm": 140.90158081054688, "learning_rate": 1.681824743756316e-05, "loss": 4.0573, "step": 18380 }, { "epoch": 5.8848, "grad_norm": 259.6550598144531, "learning_rate": 1.677012655791348e-05, "loss": 7.7685, "step": 18390 }, { "epoch": 5.888, "grad_norm": 112.15701293945312, "learning_rate": 1.6722005678263798e-05, "loss": 7.5503, "step": 18400 }, { "epoch": 5.8911999999999995, "grad_norm": 114.3333969116211, "learning_rate": 1.667388479861412e-05, "loss": 3.5866, "step": 18410 }, { "epoch": 5.8944, "grad_norm": 101.40161895751953, "learning_rate": 1.662576391896444e-05, "loss": 4.6037, "step": 18420 }, { "epoch": 5.8976, "grad_norm": 112.75108337402344, "learning_rate": 1.657764303931476e-05, "loss": 4.6755, "step": 18430 }, { "epoch": 5.9008, "grad_norm": 100.20002746582031, "learning_rate": 1.652952215966508e-05, "loss": 6.0844, "step": 18440 }, { "epoch": 5.904, "grad_norm": 79.26084899902344, "learning_rate": 1.6481401280015398e-05, "loss": 3.6824, "step": 18450 }, { "epoch": 5.9072, "grad_norm": 165.93650817871094, "learning_rate": 1.643328040036572e-05, "loss": 3.9552, "step": 18460 }, { "epoch": 5.9104, "grad_norm": 78.83628845214844, "learning_rate": 1.638515952071604e-05, "loss": 9.2101, "step": 18470 }, { "epoch": 5.9136, "grad_norm": 41.739009857177734, "learning_rate": 1.633703864106636e-05, "loss": 10.2251, "step": 18480 }, { "epoch": 5.9168, "grad_norm": 92.99891662597656, "learning_rate": 1.6288917761416682e-05, "loss": 4.7544, "step": 18490 }, { "epoch": 5.92, "grad_norm": 33.587867736816406, "learning_rate": 1.6240796881766998e-05, "loss": 4.5398, "step": 18500 }, { "epoch": 5.9232, "grad_norm": 244.25970458984375, "learning_rate": 1.6192676002117317e-05, "loss": 7.3689, "step": 18510 }, { "epoch": 5.9264, "grad_norm": 94.53414916992188, "learning_rate": 1.614455512246764e-05, "loss": 10.3478, "step": 18520 }, { "epoch": 5.9296, "grad_norm": 50.227272033691406, "learning_rate": 1.609643424281796e-05, "loss": 1.8595, "step": 18530 }, { "epoch": 5.9328, "grad_norm": 22.723535537719727, "learning_rate": 1.604831336316828e-05, "loss": 4.9988, "step": 18540 }, { "epoch": 5.936, "grad_norm": 291.2261047363281, "learning_rate": 1.60001924835186e-05, "loss": 7.5086, "step": 18550 }, { "epoch": 5.9392, "grad_norm": 156.45794677734375, "learning_rate": 1.595207160386892e-05, "loss": 5.4618, "step": 18560 }, { "epoch": 5.9424, "grad_norm": 166.38754272460938, "learning_rate": 1.5903950724219237e-05, "loss": 4.0989, "step": 18570 }, { "epoch": 5.9456, "grad_norm": 208.31015014648438, "learning_rate": 1.585582984456956e-05, "loss": 2.909, "step": 18580 }, { "epoch": 5.9488, "grad_norm": 68.99541473388672, "learning_rate": 1.580770896491988e-05, "loss": 3.0505, "step": 18590 }, { "epoch": 5.952, "grad_norm": 535.3999633789062, "learning_rate": 1.5759588085270198e-05, "loss": 6.2428, "step": 18600 }, { "epoch": 5.9552, "grad_norm": 178.76937866210938, "learning_rate": 1.571146720562052e-05, "loss": 3.5947, "step": 18610 }, { "epoch": 5.9584, "grad_norm": 248.93653869628906, "learning_rate": 1.566334632597084e-05, "loss": 8.2842, "step": 18620 }, { "epoch": 5.9616, "grad_norm": 130.55413818359375, "learning_rate": 1.561522544632116e-05, "loss": 5.6656, "step": 18630 }, { "epoch": 5.9648, "grad_norm": 227.6366729736328, "learning_rate": 1.556710456667148e-05, "loss": 6.5515, "step": 18640 }, { "epoch": 5.968, "grad_norm": 191.78555297851562, "learning_rate": 1.5518983687021798e-05, "loss": 4.6734, "step": 18650 }, { "epoch": 5.9712, "grad_norm": 144.41883850097656, "learning_rate": 1.5470862807372118e-05, "loss": 6.6394, "step": 18660 }, { "epoch": 5.9744, "grad_norm": 321.1352844238281, "learning_rate": 1.542274192772244e-05, "loss": 5.6482, "step": 18670 }, { "epoch": 5.9776, "grad_norm": 149.00071716308594, "learning_rate": 1.537462104807276e-05, "loss": 8.4415, "step": 18680 }, { "epoch": 5.9808, "grad_norm": 30.27177619934082, "learning_rate": 1.532650016842308e-05, "loss": 5.3603, "step": 18690 }, { "epoch": 5.984, "grad_norm": 94.55560302734375, "learning_rate": 1.52783792887734e-05, "loss": 4.1931, "step": 18700 }, { "epoch": 5.9872, "grad_norm": 87.25016021728516, "learning_rate": 1.5230258409123718e-05, "loss": 1.7037, "step": 18710 }, { "epoch": 5.9904, "grad_norm": 271.5228271484375, "learning_rate": 1.5182137529474039e-05, "loss": 5.8877, "step": 18720 }, { "epoch": 5.9936, "grad_norm": 260.53094482421875, "learning_rate": 1.513401664982436e-05, "loss": 7.8267, "step": 18730 }, { "epoch": 5.9968, "grad_norm": 102.48533630371094, "learning_rate": 1.5085895770174679e-05, "loss": 4.2328, "step": 18740 }, { "epoch": 6.0, "grad_norm": 189.56979370117188, "learning_rate": 1.5037774890525e-05, "loss": 3.8888, "step": 18750 }, { "epoch": 6.0032, "grad_norm": 7.992267608642578, "learning_rate": 1.4989654010875321e-05, "loss": 3.3318, "step": 18760 }, { "epoch": 6.0064, "grad_norm": 179.93560791015625, "learning_rate": 1.494153313122564e-05, "loss": 4.5963, "step": 18770 }, { "epoch": 6.0096, "grad_norm": 121.27680969238281, "learning_rate": 1.4893412251575958e-05, "loss": 4.7287, "step": 18780 }, { "epoch": 6.0128, "grad_norm": 509.8040466308594, "learning_rate": 1.4845291371926279e-05, "loss": 5.5393, "step": 18790 }, { "epoch": 6.016, "grad_norm": 241.3214569091797, "learning_rate": 1.47971704922766e-05, "loss": 8.3187, "step": 18800 }, { "epoch": 6.0192, "grad_norm": 129.79518127441406, "learning_rate": 1.474904961262692e-05, "loss": 1.7463, "step": 18810 }, { "epoch": 6.0224, "grad_norm": 284.64910888671875, "learning_rate": 1.470092873297724e-05, "loss": 7.9822, "step": 18820 }, { "epoch": 6.0256, "grad_norm": 20.635343551635742, "learning_rate": 1.465280785332756e-05, "loss": 1.7924, "step": 18830 }, { "epoch": 6.0288, "grad_norm": 242.31417846679688, "learning_rate": 1.460468697367788e-05, "loss": 7.7418, "step": 18840 }, { "epoch": 6.032, "grad_norm": 4.880585193634033, "learning_rate": 1.4556566094028198e-05, "loss": 2.0081, "step": 18850 }, { "epoch": 6.0352, "grad_norm": 15.383512496948242, "learning_rate": 1.450844521437852e-05, "loss": 6.0562, "step": 18860 }, { "epoch": 6.0384, "grad_norm": 5.576357841491699, "learning_rate": 1.4460324334728839e-05, "loss": 4.435, "step": 18870 }, { "epoch": 6.0416, "grad_norm": 305.27435302734375, "learning_rate": 1.441220345507916e-05, "loss": 7.8157, "step": 18880 }, { "epoch": 6.0448, "grad_norm": 25.318906784057617, "learning_rate": 1.436408257542948e-05, "loss": 5.704, "step": 18890 }, { "epoch": 6.048, "grad_norm": 180.80438232421875, "learning_rate": 1.43159616957798e-05, "loss": 7.267, "step": 18900 }, { "epoch": 6.0512, "grad_norm": 99.02574920654297, "learning_rate": 1.4267840816130121e-05, "loss": 3.9751, "step": 18910 }, { "epoch": 6.0544, "grad_norm": 64.50300598144531, "learning_rate": 1.4219719936480439e-05, "loss": 3.6183, "step": 18920 }, { "epoch": 6.0576, "grad_norm": 219.17709350585938, "learning_rate": 1.4171599056830758e-05, "loss": 3.454, "step": 18930 }, { "epoch": 6.0608, "grad_norm": 137.03749084472656, "learning_rate": 1.4123478177181079e-05, "loss": 4.6317, "step": 18940 }, { "epoch": 6.064, "grad_norm": 177.88040161132812, "learning_rate": 1.40753572975314e-05, "loss": 4.7447, "step": 18950 }, { "epoch": 6.0672, "grad_norm": 218.0112762451172, "learning_rate": 1.402723641788172e-05, "loss": 6.0236, "step": 18960 }, { "epoch": 6.0704, "grad_norm": 188.44822692871094, "learning_rate": 1.397911553823204e-05, "loss": 4.9825, "step": 18970 }, { "epoch": 6.0736, "grad_norm": 351.8241271972656, "learning_rate": 1.3930994658582361e-05, "loss": 11.5429, "step": 18980 }, { "epoch": 6.0768, "grad_norm": 18.24509048461914, "learning_rate": 1.3882873778932679e-05, "loss": 6.4644, "step": 18990 }, { "epoch": 6.08, "grad_norm": 91.41239166259766, "learning_rate": 1.3834752899282998e-05, "loss": 6.6279, "step": 19000 } ], "logging_steps": 10, "max_steps": 21875, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.625447268352e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }