{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9987956849648527, "eval_steps": 100, "global_step": 1828, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010932788971549125, "grad_norm": 181.14852905273438, "learning_rate": 0.0005076923076923076, "loss": 9.7153, "step": 10 }, { "epoch": 0.02186557794309825, "grad_norm": 110.58919525146484, "learning_rate": 0.0006, "loss": 8.5026, "step": 20 }, { "epoch": 0.032798366914647374, "grad_norm": 243.64418029785156, "learning_rate": 0.0006, "loss": 7.987, "step": 30 }, { "epoch": 0.0437311558861965, "grad_norm": 91.78092193603516, "learning_rate": 0.0006, "loss": 7.8078, "step": 40 }, { "epoch": 0.054663944857745624, "grad_norm": 277.1399841308594, "learning_rate": 0.0006, "loss": 7.8083, "step": 50 }, { "epoch": 0.06559673382929475, "grad_norm": 240.4947052001953, "learning_rate": 0.0006, "loss": 7.6807, "step": 60 }, { "epoch": 0.07652952280084388, "grad_norm": 152.3121795654297, "learning_rate": 0.0006, "loss": 7.5963, "step": 70 }, { "epoch": 0.087462311772393, "grad_norm": 195.2014923095703, "learning_rate": 0.0006, "loss": 7.4986, "step": 80 }, { "epoch": 0.09839510074394213, "grad_norm": 165.33566284179688, "learning_rate": 0.0006, "loss": 7.4302, "step": 90 }, { "epoch": 0.10932788971549125, "grad_norm": 169.95648193359375, "learning_rate": 0.0006, "loss": 7.3611, "step": 100 }, { "epoch": 0.10932788971549125, "eval_loss": 7.343780040740967, "eval_runtime": 87.0849, "eval_samples_per_second": 107.55, "eval_steps_per_second": 13.447, "step": 100 }, { "epoch": 0.12026067868704038, "grad_norm": 220.66673278808594, "learning_rate": 0.0006, "loss": 7.2779, "step": 110 }, { "epoch": 0.1311934676585895, "grad_norm": 270.6573791503906, "learning_rate": 0.0006, "loss": 7.2641, "step": 120 }, { "epoch": 0.14212625663013861, "grad_norm": 247.8153839111328, "learning_rate": 0.0006, "loss": 7.2793, "step": 130 }, { "epoch": 0.15305904560168776, "grad_norm": 193.0130615234375, "learning_rate": 0.0006, "loss": 7.2593, "step": 140 }, { "epoch": 0.16399183457323688, "grad_norm": 146.72889709472656, "learning_rate": 0.0006, "loss": 7.2119, "step": 150 }, { "epoch": 0.174924623544786, "grad_norm": 219.57923889160156, "learning_rate": 0.0006, "loss": 7.1735, "step": 160 }, { "epoch": 0.1858574125163351, "grad_norm": 141.38900756835938, "learning_rate": 0.0006, "loss": 7.2062, "step": 170 }, { "epoch": 0.19679020148788426, "grad_norm": 123.53499603271484, "learning_rate": 0.0006, "loss": 7.1545, "step": 180 }, { "epoch": 0.20772299045943338, "grad_norm": 144.73370361328125, "learning_rate": 0.0006, "loss": 7.1246, "step": 190 }, { "epoch": 0.2186557794309825, "grad_norm": 126.77777862548828, "learning_rate": 0.0006, "loss": 7.0969, "step": 200 }, { "epoch": 0.2186557794309825, "eval_loss": 7.094976902008057, "eval_runtime": 86.8787, "eval_samples_per_second": 107.805, "eval_steps_per_second": 13.479, "step": 200 }, { "epoch": 0.2295885684025316, "grad_norm": 82.89041137695312, "learning_rate": 0.0006, "loss": 7.0632, "step": 210 }, { "epoch": 0.24052135737408076, "grad_norm": 174.27378845214844, "learning_rate": 0.0006, "loss": 7.0664, "step": 220 }, { "epoch": 0.25145414634562985, "grad_norm": 159.48312377929688, "learning_rate": 0.0006, "loss": 7.1705, "step": 230 }, { "epoch": 0.262386935317179, "grad_norm": 132.21905517578125, "learning_rate": 0.0006, "loss": 7.1723, "step": 240 }, { "epoch": 0.27331972428872814, "grad_norm": 125.35407257080078, "learning_rate": 0.0006, "loss": 7.0974, "step": 250 }, { "epoch": 0.28425251326027723, "grad_norm": 106.2899169921875, "learning_rate": 0.0006, "loss": 7.0344, "step": 260 }, { "epoch": 0.2951853022318264, "grad_norm": 121.89833068847656, "learning_rate": 0.0006, "loss": 7.0553, "step": 270 }, { "epoch": 0.3061180912033755, "grad_norm": 239.16209411621094, "learning_rate": 0.0006, "loss": 7.122, "step": 280 }, { "epoch": 0.3170508801749246, "grad_norm": 92.66609191894531, "learning_rate": 0.0006, "loss": 7.144, "step": 290 }, { "epoch": 0.32798366914647376, "grad_norm": 121.50637817382812, "learning_rate": 0.0006, "loss": 7.0404, "step": 300 }, { "epoch": 0.32798366914647376, "eval_loss": 7.017237186431885, "eval_runtime": 86.6501, "eval_samples_per_second": 108.09, "eval_steps_per_second": 13.514, "step": 300 }, { "epoch": 0.3389164581180229, "grad_norm": 111.2243881225586, "learning_rate": 0.0006, "loss": 7.0229, "step": 310 }, { "epoch": 0.349849247089572, "grad_norm": 113.8078842163086, "learning_rate": 0.0006, "loss": 7.0651, "step": 320 }, { "epoch": 0.36078203606112114, "grad_norm": 134.9191131591797, "learning_rate": 0.0006, "loss": 7.0728, "step": 330 }, { "epoch": 0.3717148250326702, "grad_norm": 123.53280639648438, "learning_rate": 0.0006, "loss": 7.0312, "step": 340 }, { "epoch": 0.38264761400421937, "grad_norm": 153.8329315185547, "learning_rate": 0.0006, "loss": 6.9934, "step": 350 }, { "epoch": 0.3935804029757685, "grad_norm": 112.699951171875, "learning_rate": 0.0006, "loss": 7.0051, "step": 360 }, { "epoch": 0.4045131919473176, "grad_norm": 298.7499694824219, "learning_rate": 0.0006, "loss": 7.0912, "step": 370 }, { "epoch": 0.41544598091886675, "grad_norm": 163.08180236816406, "learning_rate": 0.0006, "loss": 7.2592, "step": 380 }, { "epoch": 0.4263787698904159, "grad_norm": 95.96805572509766, "learning_rate": 0.0006, "loss": 7.1868, "step": 390 }, { "epoch": 0.437311558861965, "grad_norm": 112.89175415039062, "learning_rate": 0.0006, "loss": 7.041, "step": 400 }, { "epoch": 0.437311558861965, "eval_loss": 7.021984577178955, "eval_runtime": 104.9558, "eval_samples_per_second": 89.238, "eval_steps_per_second": 11.157, "step": 400 }, { "epoch": 0.44824434783351413, "grad_norm": 89.9395751953125, "learning_rate": 0.0006, "loss": 6.9804, "step": 410 }, { "epoch": 0.4591771368050632, "grad_norm": 148.31468200683594, "learning_rate": 0.0006, "loss": 6.9625, "step": 420 }, { "epoch": 0.47010992577661237, "grad_norm": 203.39456176757812, "learning_rate": 0.0006, "loss": 6.9835, "step": 430 }, { "epoch": 0.4810427147481615, "grad_norm": 125.79383087158203, "learning_rate": 0.0006, "loss": 7.0087, "step": 440 }, { "epoch": 0.4919755037197106, "grad_norm": 92.43452453613281, "learning_rate": 0.0006, "loss": 6.993, "step": 450 }, { "epoch": 0.5029082926912597, "grad_norm": 153.0977325439453, "learning_rate": 0.0006, "loss": 6.9634, "step": 460 }, { "epoch": 0.5138410816628088, "grad_norm": 106.5129623413086, "learning_rate": 0.0006, "loss": 6.9674, "step": 470 }, { "epoch": 0.524773870634358, "grad_norm": 212.9166717529297, "learning_rate": 0.0006, "loss": 7.0303, "step": 480 }, { "epoch": 0.5357066596059071, "grad_norm": 145.5367431640625, "learning_rate": 0.0006, "loss": 7.1604, "step": 490 }, { "epoch": 0.5466394485774563, "grad_norm": 113.21528625488281, "learning_rate": 0.0006, "loss": 7.1792, "step": 500 }, { "epoch": 0.5466394485774563, "eval_loss": 7.15646505355835, "eval_runtime": 99.5651, "eval_samples_per_second": 94.069, "eval_steps_per_second": 11.761, "step": 500 }, { "epoch": 0.5575722375490054, "grad_norm": 198.23634338378906, "learning_rate": 0.0006, "loss": 7.1151, "step": 510 }, { "epoch": 0.5685050265205545, "grad_norm": 141.17764282226562, "learning_rate": 0.0006, "loss": 7.0219, "step": 520 }, { "epoch": 0.5794378154921036, "grad_norm": 89.62096405029297, "learning_rate": 0.0006, "loss": 6.9803, "step": 530 }, { "epoch": 0.5903706044636527, "grad_norm": 158.40231323242188, "learning_rate": 0.0006, "loss": 6.9783, "step": 540 }, { "epoch": 0.6013033934352019, "grad_norm": 233.17726135253906, "learning_rate": 0.0006, "loss": 6.9843, "step": 550 }, { "epoch": 0.612236182406751, "grad_norm": 147.8563995361328, "learning_rate": 0.0006, "loss": 7.0332, "step": 560 }, { "epoch": 0.6231689713783001, "grad_norm": 193.6658477783203, "learning_rate": 0.0006, "loss": 7.0557, "step": 570 }, { "epoch": 0.6341017603498492, "grad_norm": 162.1654510498047, "learning_rate": 0.0006, "loss": 7.0636, "step": 580 }, { "epoch": 0.6450345493213984, "grad_norm": 154.3310546875, "learning_rate": 0.0006, "loss": 7.086, "step": 590 }, { "epoch": 0.6559673382929475, "grad_norm": 104.41985321044922, "learning_rate": 0.0006, "loss": 7.0261, "step": 600 }, { "epoch": 0.6559673382929475, "eval_loss": 6.992109775543213, "eval_runtime": 99.6342, "eval_samples_per_second": 94.004, "eval_steps_per_second": 11.753, "step": 600 }, { "epoch": 0.6669001272644967, "grad_norm": 124.9378433227539, "learning_rate": 0.0006, "loss": 6.9647, "step": 610 }, { "epoch": 0.6778329162360458, "grad_norm": 110.5478744506836, "learning_rate": 0.0006, "loss": 7.0055, "step": 620 }, { "epoch": 0.6887657052075948, "grad_norm": 159.1732177734375, "learning_rate": 0.0006, "loss": 7.0735, "step": 630 }, { "epoch": 0.699698494179144, "grad_norm": 157.9123077392578, "learning_rate": 0.0006, "loss": 7.1532, "step": 640 }, { "epoch": 0.7106312831506931, "grad_norm": 118.69967651367188, "learning_rate": 0.0006, "loss": 7.1615, "step": 650 }, { "epoch": 0.7215640721222423, "grad_norm": 152.35418701171875, "learning_rate": 0.0006, "loss": 7.1013, "step": 660 }, { "epoch": 0.7324968610937914, "grad_norm": 137.92417907714844, "learning_rate": 0.0006, "loss": 7.0749, "step": 670 }, { "epoch": 0.7434296500653405, "grad_norm": 136.46435546875, "learning_rate": 0.0006, "loss": 7.0868, "step": 680 }, { "epoch": 0.7543624390368896, "grad_norm": 94.5594711303711, "learning_rate": 0.0006, "loss": 7.0402, "step": 690 }, { "epoch": 0.7652952280084387, "grad_norm": 104.29204559326172, "learning_rate": 0.0006, "loss": 6.969, "step": 700 }, { "epoch": 0.7652952280084387, "eval_loss": 6.94205379486084, "eval_runtime": 99.7317, "eval_samples_per_second": 93.912, "eval_steps_per_second": 11.742, "step": 700 }, { "epoch": 0.7762280169799879, "grad_norm": 97.6023178100586, "learning_rate": 0.0006, "loss": 6.9172, "step": 710 }, { "epoch": 0.787160805951537, "grad_norm": 106.64608001708984, "learning_rate": 0.0006, "loss": 6.9255, "step": 720 }, { "epoch": 0.7980935949230861, "grad_norm": 247.9333038330078, "learning_rate": 0.0006, "loss": 7.1781, "step": 730 }, { "epoch": 0.8090263838946352, "grad_norm": 400.5404357910156, "learning_rate": 6e-05, "loss": 7.2559, "step": 740 }, { "epoch": 0.8199591728661844, "grad_norm": 178.59230041503906, "learning_rate": 6e-05, "loss": 7.2795, "step": 750 }, { "epoch": 0.8308919618377335, "grad_norm": 97.08971405029297, "learning_rate": 6e-05, "loss": 7.1765, "step": 760 }, { "epoch": 0.8418247508092827, "grad_norm": 97.70082092285156, "learning_rate": 6e-05, "loss": 7.0649, "step": 770 }, { "epoch": 0.8527575397808318, "grad_norm": 91.7701644897461, "learning_rate": 6e-05, "loss": 6.9831, "step": 780 }, { "epoch": 0.8636903287523808, "grad_norm": 75.20551300048828, "learning_rate": 6e-05, "loss": 6.9241, "step": 790 }, { "epoch": 0.87462311772393, "grad_norm": 88.81455993652344, "learning_rate": 6e-05, "loss": 6.9251, "step": 800 }, { "epoch": 0.87462311772393, "eval_loss": 6.933653831481934, "eval_runtime": 99.6151, "eval_samples_per_second": 94.022, "eval_steps_per_second": 11.755, "step": 800 }, { "epoch": 0.8855559066954791, "grad_norm": 193.87106323242188, "learning_rate": 6e-05, "loss": 6.9471, "step": 810 }, { "epoch": 0.8964886956670283, "grad_norm": 297.5222473144531, "learning_rate": 6e-05, "loss": 7.2936, "step": 820 }, { "epoch": 0.9074214846385774, "grad_norm": 1132.71923828125, "learning_rate": 6e-05, "loss": 7.5122, "step": 830 }, { "epoch": 0.9183542736101264, "grad_norm": 411.77130126953125, "learning_rate": 6e-05, "loss": 7.9047, "step": 840 }, { "epoch": 0.9292870625816756, "grad_norm": 175.99290466308594, "learning_rate": 6e-05, "loss": 7.8735, "step": 850 }, { "epoch": 0.9402198515532247, "grad_norm": 139.72422790527344, "learning_rate": 6e-05, "loss": 7.538, "step": 860 }, { "epoch": 0.9511526405247739, "grad_norm": 64.29164123535156, "learning_rate": 6e-05, "loss": 7.2554, "step": 870 }, { "epoch": 0.962085429496323, "grad_norm": 53.71916198730469, "learning_rate": 6e-05, "loss": 7.074, "step": 880 }, { "epoch": 0.9730182184678722, "grad_norm": 51.395530700683594, "learning_rate": 6e-05, "loss": 6.9628, "step": 890 }, { "epoch": 0.9839510074394212, "grad_norm": 67.36868286132812, "learning_rate": 6e-05, "loss": 6.8868, "step": 900 }, { "epoch": 0.9839510074394212, "eval_loss": 6.888969421386719, "eval_runtime": 99.7453, "eval_samples_per_second": 93.899, "eval_steps_per_second": 11.74, "step": 900 }, { "epoch": 0.9948837964109704, "grad_norm": 80.71182250976562, "learning_rate": 6e-05, "loss": 6.8807, "step": 910 }, { "epoch": 1.0060984463481923, "grad_norm": 106.76873016357422, "learning_rate": 6e-05, "loss": 6.8982, "step": 920 }, { "epoch": 1.0170312353197413, "grad_norm": 401.31884765625, "learning_rate": 6e-05, "loss": 7.0132, "step": 930 }, { "epoch": 1.0279640242912904, "grad_norm": 225.1197967529297, "learning_rate": 6e-05, "loss": 7.2337, "step": 940 }, { "epoch": 1.0388968132628396, "grad_norm": 142.52125549316406, "learning_rate": 6e-05, "loss": 7.2578, "step": 950 }, { "epoch": 1.0498296022343887, "grad_norm": 143.7019500732422, "learning_rate": 6e-05, "loss": 7.1791, "step": 960 }, { "epoch": 1.0607623912059378, "grad_norm": 94.52357482910156, "learning_rate": 6e-05, "loss": 7.0675, "step": 970 }, { "epoch": 1.071695180177487, "grad_norm": 81.73490142822266, "learning_rate": 6e-05, "loss": 6.9916, "step": 980 }, { "epoch": 1.0826279691490361, "grad_norm": 82.44967651367188, "learning_rate": 6e-05, "loss": 6.9204, "step": 990 }, { "epoch": 1.0935607581205853, "grad_norm": 86.71012115478516, "learning_rate": 6e-05, "loss": 6.8979, "step": 1000 }, { "epoch": 1.0935607581205853, "eval_loss": 6.89885950088501, "eval_runtime": 99.3939, "eval_samples_per_second": 94.231, "eval_steps_per_second": 11.781, "step": 1000 }, { "epoch": 1.1044935470921344, "grad_norm": 97.32078552246094, "learning_rate": 6e-05, "loss": 6.8888, "step": 1010 }, { "epoch": 1.1154263360636836, "grad_norm": 220.74195861816406, "learning_rate": 6e-05, "loss": 6.9171, "step": 1020 }, { "epoch": 1.1263591250352327, "grad_norm": 255.6461944580078, "learning_rate": 6e-05, "loss": 7.0676, "step": 1030 }, { "epoch": 1.1372919140067816, "grad_norm": 471.5797119140625, "learning_rate": 6e-05, "loss": 7.3424, "step": 1040 }, { "epoch": 1.1482247029783308, "grad_norm": 451.72308349609375, "learning_rate": 6e-05, "loss": 7.4555, "step": 1050 }, { "epoch": 1.15915749194988, "grad_norm": 362.2579650878906, "learning_rate": 6e-05, "loss": 7.5882, "step": 1060 }, { "epoch": 1.170090280921429, "grad_norm": 174.1190185546875, "learning_rate": 6e-05, "loss": 7.5085, "step": 1070 }, { "epoch": 1.1810230698929782, "grad_norm": 69.2359848022461, "learning_rate": 6e-05, "loss": 7.2459, "step": 1080 }, { "epoch": 1.1919558588645274, "grad_norm": 50.8225212097168, "learning_rate": 6e-05, "loss": 7.0644, "step": 1090 }, { "epoch": 1.2028886478360765, "grad_norm": 64.34941101074219, "learning_rate": 6e-05, "loss": 6.9687, "step": 1100 }, { "epoch": 1.2028886478360765, "eval_loss": 6.9256205558776855, "eval_runtime": 99.68, "eval_samples_per_second": 93.961, "eval_steps_per_second": 11.748, "step": 1100 }, { "epoch": 1.2138214368076257, "grad_norm": 48.78596115112305, "learning_rate": 6e-05, "loss": 6.8985, "step": 1110 }, { "epoch": 1.2247542257791748, "grad_norm": 88.06245422363281, "learning_rate": 6e-05, "loss": 6.8822, "step": 1120 }, { "epoch": 1.235687014750724, "grad_norm": 90.59126281738281, "learning_rate": 6e-05, "loss": 6.8739, "step": 1130 }, { "epoch": 1.246619803722273, "grad_norm": 172.49806213378906, "learning_rate": 6e-05, "loss": 6.9651, "step": 1140 }, { "epoch": 1.257552592693822, "grad_norm": 124.38924407958984, "learning_rate": 6e-05, "loss": 7.0716, "step": 1150 }, { "epoch": 1.2684853816653712, "grad_norm": 278.1981506347656, "learning_rate": 6e-05, "loss": 7.1942, "step": 1160 }, { "epoch": 1.2794181706369203, "grad_norm": 147.80160522460938, "learning_rate": 6e-05, "loss": 7.1944, "step": 1170 }, { "epoch": 1.2903509596084695, "grad_norm": 77.38485717773438, "learning_rate": 6e-05, "loss": 7.1041, "step": 1180 }, { "epoch": 1.3012837485800186, "grad_norm": 70.65458679199219, "learning_rate": 6e-05, "loss": 7.0189, "step": 1190 }, { "epoch": 1.3122165375515678, "grad_norm": 67.25665283203125, "learning_rate": 6e-05, "loss": 6.9569, "step": 1200 }, { "epoch": 1.3122165375515678, "eval_loss": 6.939022064208984, "eval_runtime": 99.5561, "eval_samples_per_second": 94.078, "eval_steps_per_second": 11.762, "step": 1200 }, { "epoch": 1.323149326523117, "grad_norm": 92.24222564697266, "learning_rate": 6e-05, "loss": 6.9035, "step": 1210 }, { "epoch": 1.334082115494666, "grad_norm": 99.06700134277344, "learning_rate": 6e-05, "loss": 6.8957, "step": 1220 }, { "epoch": 1.3450149044662152, "grad_norm": 86.9864501953125, "learning_rate": 6e-05, "loss": 6.918, "step": 1230 }, { "epoch": 1.3559476934377641, "grad_norm": 169.9635467529297, "learning_rate": 6e-05, "loss": 6.9808, "step": 1240 }, { "epoch": 1.3668804824093135, "grad_norm": 874.7172241210938, "learning_rate": 6e-05, "loss": 7.1926, "step": 1250 }, { "epoch": 1.3778132713808624, "grad_norm": 550.202392578125, "learning_rate": 0.0006, "loss": 7.9011, "step": 1260 }, { "epoch": 1.3887460603524115, "grad_norm": 327.4295654296875, "learning_rate": 0.0006, "loss": 7.7866, "step": 1270 }, { "epoch": 1.3996788493239607, "grad_norm": 158.1693115234375, "learning_rate": 0.0006, "loss": 7.751, "step": 1280 }, { "epoch": 1.4106116382955098, "grad_norm": 92.38487243652344, "learning_rate": 0.0006, "loss": 7.3469, "step": 1290 }, { "epoch": 1.421544427267059, "grad_norm": 59.317264556884766, "learning_rate": 0.0006, "loss": 7.1729, "step": 1300 }, { "epoch": 1.421544427267059, "eval_loss": 7.10260534286499, "eval_runtime": 99.8559, "eval_samples_per_second": 93.795, "eval_steps_per_second": 11.727, "step": 1300 }, { "epoch": 1.4324772162386081, "grad_norm": 79.95414733886719, "learning_rate": 0.0006, "loss": 7.0548, "step": 1310 }, { "epoch": 1.4434100052101573, "grad_norm": 58.8936653137207, "learning_rate": 0.0006, "loss": 6.9769, "step": 1320 }, { "epoch": 1.4543427941817064, "grad_norm": 65.88204193115234, "learning_rate": 0.0006, "loss": 6.9327, "step": 1330 }, { "epoch": 1.4652755831532556, "grad_norm": 96.34090423583984, "learning_rate": 0.0006, "loss": 6.9195, "step": 1340 }, { "epoch": 1.4762083721248045, "grad_norm": 174.13668823242188, "learning_rate": 0.0006, "loss": 6.9118, "step": 1350 }, { "epoch": 1.4871411610963539, "grad_norm": 151.93190002441406, "learning_rate": 0.0006, "loss": 6.9575, "step": 1360 }, { "epoch": 1.4980739500679028, "grad_norm": 292.9132995605469, "learning_rate": 0.0006, "loss": 7.1444, "step": 1370 }, { "epoch": 1.5090067390394521, "grad_norm": 158.8959503173828, "learning_rate": 0.0006, "loss": 7.1306, "step": 1380 }, { "epoch": 1.519939528011001, "grad_norm": 195.31460571289062, "learning_rate": 0.0006, "loss": 7.1245, "step": 1390 }, { "epoch": 1.5308723169825502, "grad_norm": 150.30398559570312, "learning_rate": 0.0006, "loss": 7.0727, "step": 1400 }, { "epoch": 1.5308723169825502, "eval_loss": 7.0534749031066895, "eval_runtime": 99.5179, "eval_samples_per_second": 94.114, "eval_steps_per_second": 11.767, "step": 1400 }, { "epoch": 1.5418051059540994, "grad_norm": 96.65521240234375, "learning_rate": 0.0006, "loss": 7.0267, "step": 1410 }, { "epoch": 1.5527378949256485, "grad_norm": 96.80887603759766, "learning_rate": 0.0006, "loss": 6.9712, "step": 1420 }, { "epoch": 1.5636706838971977, "grad_norm": 85.50247955322266, "learning_rate": 0.0006, "loss": 6.9328, "step": 1430 }, { "epoch": 1.5746034728687466, "grad_norm": 133.37132263183594, "learning_rate": 0.0006, "loss": 6.913, "step": 1440 }, { "epoch": 1.585536261840296, "grad_norm": 449.5869445800781, "learning_rate": 0.0006, "loss": 6.9241, "step": 1450 }, { "epoch": 1.5964690508118449, "grad_norm": 1678.7078857421875, "learning_rate": 0.0006, "loss": 7.1521, "step": 1460 }, { "epoch": 1.6074018397833942, "grad_norm": 297.7402648925781, "learning_rate": 0.0006, "loss": 7.309, "step": 1470 }, { "epoch": 1.6183346287549432, "grad_norm": 252.57696533203125, "learning_rate": 0.0006, "loss": 7.4165, "step": 1480 }, { "epoch": 1.6292674177264925, "grad_norm": 231.01919555664062, "learning_rate": 0.0006, "loss": 7.2324, "step": 1490 }, { "epoch": 1.6402002066980415, "grad_norm": 95.04192352294922, "learning_rate": 0.0006, "loss": 7.1383, "step": 1500 }, { "epoch": 1.6402002066980415, "eval_loss": 7.079530715942383, "eval_runtime": 97.3013, "eval_samples_per_second": 96.258, "eval_steps_per_second": 12.035, "step": 1500 }, { "epoch": 1.6511329956695906, "grad_norm": 92.854736328125, "learning_rate": 0.0006, "loss": 7.0281, "step": 1510 }, { "epoch": 1.6620657846411397, "grad_norm": 115.00365447998047, "learning_rate": 0.0006, "loss": 6.9758, "step": 1520 }, { "epoch": 1.6729985736126889, "grad_norm": 78.67362976074219, "learning_rate": 0.0006, "loss": 6.9319, "step": 1530 }, { "epoch": 1.683931362584238, "grad_norm": 94.60012817382812, "learning_rate": 0.0006, "loss": 6.9229, "step": 1540 }, { "epoch": 1.694864151555787, "grad_norm": 188.86849975585938, "learning_rate": 0.0006, "loss": 6.8954, "step": 1550 }, { "epoch": 1.7057969405273363, "grad_norm": 165.15492248535156, "learning_rate": 0.0006, "loss": 6.9594, "step": 1560 }, { "epoch": 1.7167297294988852, "grad_norm": 356.9344482421875, "learning_rate": 0.0006, "loss": 6.989, "step": 1570 }, { "epoch": 1.7276625184704346, "grad_norm": 420.25567626953125, "learning_rate": 0.0006, "loss": 7.0395, "step": 1580 }, { "epoch": 1.7385953074419835, "grad_norm": 166.69720458984375, "learning_rate": 0.0006, "loss": 7.0574, "step": 1590 }, { "epoch": 1.749528096413533, "grad_norm": 270.54522705078125, "learning_rate": 0.0006, "loss": 7.0753, "step": 1600 }, { "epoch": 1.749528096413533, "eval_loss": 7.091196060180664, "eval_runtime": 89.5183, "eval_samples_per_second": 104.627, "eval_steps_per_second": 13.081, "step": 1600 }, { "epoch": 1.7604608853850818, "grad_norm": 153.78634643554688, "learning_rate": 0.0006, "loss": 7.0849, "step": 1610 }, { "epoch": 1.771393674356631, "grad_norm": 391.0542907714844, "learning_rate": 0.0006, "loss": 7.0725, "step": 1620 }, { "epoch": 1.7823264633281801, "grad_norm": 255.31056213378906, "learning_rate": 0.0006, "loss": 7.2, "step": 1630 }, { "epoch": 1.7932592522997293, "grad_norm": 174.11514282226562, "learning_rate": 0.0006, "loss": 7.2102, "step": 1640 }, { "epoch": 1.8041920412712784, "grad_norm": 175.83531188964844, "learning_rate": 0.0006, "loss": 7.1926, "step": 1650 }, { "epoch": 1.8151248302428273, "grad_norm": 107.8171157836914, "learning_rate": 0.0006, "loss": 7.171, "step": 1660 }, { "epoch": 1.8260576192143767, "grad_norm": 126.21243286132812, "learning_rate": 0.0006, "loss": 7.084, "step": 1670 }, { "epoch": 1.8369904081859256, "grad_norm": 77.04115295410156, "learning_rate": 0.0006, "loss": 6.9972, "step": 1680 }, { "epoch": 1.847923197157475, "grad_norm": 96.28236389160156, "learning_rate": 0.0006, "loss": 6.9581, "step": 1690 }, { "epoch": 1.858855986129024, "grad_norm": 96.95925903320312, "learning_rate": 0.0006, "loss": 6.9092, "step": 1700 }, { "epoch": 1.858855986129024, "eval_loss": 6.905356407165527, "eval_runtime": 97.882, "eval_samples_per_second": 95.687, "eval_steps_per_second": 11.963, "step": 1700 }, { "epoch": 1.869788775100573, "grad_norm": 130.61839294433594, "learning_rate": 0.0006, "loss": 6.9007, "step": 1710 }, { "epoch": 1.8807215640721222, "grad_norm": 122.97721862792969, "learning_rate": 0.0006, "loss": 6.8936, "step": 1720 }, { "epoch": 1.8916543530436714, "grad_norm": 143.2695770263672, "learning_rate": 0.0006, "loss": 6.8846, "step": 1730 }, { "epoch": 1.9025871420152205, "grad_norm": 218.91848754882812, "learning_rate": 0.0006, "loss": 6.9299, "step": 1740 }, { "epoch": 1.9135199309867696, "grad_norm": 374.58160400390625, "learning_rate": 6e-05, "loss": 7.1832, "step": 1750 }, { "epoch": 1.9244527199583188, "grad_norm": 386.4493713378906, "learning_rate": 6e-05, "loss": 7.2035, "step": 1760 }, { "epoch": 1.9353855089298677, "grad_norm": 250.43450927734375, "learning_rate": 6e-05, "loss": 7.3685, "step": 1770 }, { "epoch": 1.946318297901417, "grad_norm": 130.10150146484375, "learning_rate": 6e-05, "loss": 7.2494, "step": 1780 }, { "epoch": 1.957251086872966, "grad_norm": 131.78555297851562, "learning_rate": 6e-05, "loss": 7.1311, "step": 1790 }, { "epoch": 1.9681838758445154, "grad_norm": 113.94293212890625, "learning_rate": 6e-05, "loss": 7.0392, "step": 1800 }, { "epoch": 1.9681838758445154, "eval_loss": 6.994777679443359, "eval_runtime": 98.1103, "eval_samples_per_second": 95.464, "eval_steps_per_second": 11.936, "step": 1800 }, { "epoch": 1.9791166648160643, "grad_norm": 100.18412017822266, "learning_rate": 6e-05, "loss": 6.9709, "step": 1810 }, { "epoch": 1.9900494537876134, "grad_norm": 82.470458984375, "learning_rate": 6e-05, "loss": 6.9195, "step": 1820 } ], "logging_steps": 10, "max_steps": 1828, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4715685271028367e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }