{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 12332, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008108984755108661, "grad_norm": 1.9572986364364624, "learning_rate": 9.980942340442787e-06, "loss": 1.3961, "step": 50 }, { "epoch": 0.016217969510217322, "grad_norm": 2.140582323074341, "learning_rate": 9.960668234530858e-06, "loss": 1.2721, "step": 100 }, { "epoch": 0.02432695426532598, "grad_norm": 2.600647211074829, "learning_rate": 9.940394128618928e-06, "loss": 1.2162, "step": 150 }, { "epoch": 0.032435939020434644, "grad_norm": 2.521652936935425, "learning_rate": 9.920120022707e-06, "loss": 1.2469, "step": 200 }, { "epoch": 0.0405449237755433, "grad_norm": 2.509162425994873, "learning_rate": 9.89984591679507e-06, "loss": 1.2162, "step": 250 }, { "epoch": 0.04865390853065196, "grad_norm": 4.292358875274658, "learning_rate": 9.879571810883141e-06, "loss": 1.2096, "step": 300 }, { "epoch": 0.056762893285760625, "grad_norm": 2.3972127437591553, "learning_rate": 9.859297704971211e-06, "loss": 1.1857, "step": 350 }, { "epoch": 0.06487187804086929, "grad_norm": 1.8880646228790283, "learning_rate": 9.839023599059281e-06, "loss": 1.1973, "step": 400 }, { "epoch": 0.07298086279597794, "grad_norm": 1.4280647039413452, "learning_rate": 9.818749493147353e-06, "loss": 1.2305, "step": 450 }, { "epoch": 0.0810898475510866, "grad_norm": 1.4675335884094238, "learning_rate": 9.798475387235423e-06, "loss": 1.2302, "step": 500 }, { "epoch": 0.08919883230619527, "grad_norm": 3.1858043670654297, "learning_rate": 9.778201281323494e-06, "loss": 1.1004, "step": 550 }, { "epoch": 0.09730781706130393, "grad_norm": 1.4137463569641113, "learning_rate": 9.757927175411566e-06, "loss": 1.1911, "step": 600 }, { "epoch": 0.10541680181641258, "grad_norm": 1.389324426651001, "learning_rate": 9.737653069499636e-06, "loss": 1.1752, "step": 650 }, { "epoch": 0.11352578657152125, "grad_norm": 2.291736602783203, "learning_rate": 9.717378963587707e-06, "loss": 1.135, "step": 700 }, { "epoch": 0.1216347713266299, "grad_norm": 1.8395874500274658, "learning_rate": 9.697104857675777e-06, "loss": 1.1551, "step": 750 }, { "epoch": 0.12974375608173858, "grad_norm": 1.9675475358963013, "learning_rate": 9.676830751763849e-06, "loss": 1.1685, "step": 800 }, { "epoch": 0.13785274083684723, "grad_norm": 2.3232228755950928, "learning_rate": 9.656556645851919e-06, "loss": 1.1895, "step": 850 }, { "epoch": 0.1459617255919559, "grad_norm": 1.8441636562347412, "learning_rate": 9.636282539939989e-06, "loss": 1.1697, "step": 900 }, { "epoch": 0.15407071034706454, "grad_norm": 2.008300542831421, "learning_rate": 9.61600843402806e-06, "loss": 1.1553, "step": 950 }, { "epoch": 0.1621796951021732, "grad_norm": 1.5163062810897827, "learning_rate": 9.59573432811613e-06, "loss": 1.1739, "step": 1000 }, { "epoch": 0.17028867985728185, "grad_norm": 1.4669376611709595, "learning_rate": 9.575460222204202e-06, "loss": 1.1582, "step": 1050 }, { "epoch": 0.17839766461239054, "grad_norm": 2.10261607170105, "learning_rate": 9.555186116292272e-06, "loss": 1.1284, "step": 1100 }, { "epoch": 0.1865066493674992, "grad_norm": 1.8615520000457764, "learning_rate": 9.534912010380343e-06, "loss": 1.0916, "step": 1150 }, { "epoch": 0.19461563412260785, "grad_norm": 3.1653287410736084, "learning_rate": 9.514637904468413e-06, "loss": 1.2177, "step": 1200 }, { "epoch": 0.2027246188777165, "grad_norm": 2.7635200023651123, "learning_rate": 9.494363798556485e-06, "loss": 1.1217, "step": 1250 }, { "epoch": 0.21083360363282516, "grad_norm": 1.2704397439956665, "learning_rate": 9.474089692644555e-06, "loss": 1.2341, "step": 1300 }, { "epoch": 0.21894258838793382, "grad_norm": 5.202953815460205, "learning_rate": 9.453815586732625e-06, "loss": 1.2126, "step": 1350 }, { "epoch": 0.2270515731430425, "grad_norm": 2.3350253105163574, "learning_rate": 9.433541480820698e-06, "loss": 1.1721, "step": 1400 }, { "epoch": 0.23516055789815116, "grad_norm": 1.807605266571045, "learning_rate": 9.413267374908768e-06, "loss": 1.1475, "step": 1450 }, { "epoch": 0.2432695426532598, "grad_norm": 1.765973448753357, "learning_rate": 9.392993268996838e-06, "loss": 1.1572, "step": 1500 }, { "epoch": 0.25137852740836847, "grad_norm": 1.1422791481018066, "learning_rate": 9.37271916308491e-06, "loss": 1.1987, "step": 1550 }, { "epoch": 0.25948751216347715, "grad_norm": 2.6164450645446777, "learning_rate": 9.35244505717298e-06, "loss": 1.2152, "step": 1600 }, { "epoch": 0.2675964969185858, "grad_norm": 0.7747199535369873, "learning_rate": 9.332170951261051e-06, "loss": 1.1686, "step": 1650 }, { "epoch": 0.27570548167369446, "grad_norm": 2.8744866847991943, "learning_rate": 9.31189684534912e-06, "loss": 1.1571, "step": 1700 }, { "epoch": 0.2838144664288031, "grad_norm": 3.669771671295166, "learning_rate": 9.291622739437192e-06, "loss": 1.1609, "step": 1750 }, { "epoch": 0.2919234511839118, "grad_norm": 1.5796583890914917, "learning_rate": 9.271348633525262e-06, "loss": 1.1873, "step": 1800 }, { "epoch": 0.30003243593902046, "grad_norm": 24.162195205688477, "learning_rate": 9.251074527613332e-06, "loss": 1.071, "step": 1850 }, { "epoch": 0.3081414206941291, "grad_norm": 2.8964436054229736, "learning_rate": 9.230800421701404e-06, "loss": 1.1767, "step": 1900 }, { "epoch": 0.31625040544923777, "grad_norm": 2.9506802558898926, "learning_rate": 9.210526315789474e-06, "loss": 1.1566, "step": 1950 }, { "epoch": 0.3243593902043464, "grad_norm": 3.0490458011627197, "learning_rate": 9.190252209877545e-06, "loss": 1.1563, "step": 2000 }, { "epoch": 0.3324683749594551, "grad_norm": 2.236199378967285, "learning_rate": 9.169978103965615e-06, "loss": 1.1493, "step": 2050 }, { "epoch": 0.3405773597145637, "grad_norm": 2.1622684001922607, "learning_rate": 9.149703998053687e-06, "loss": 1.1613, "step": 2100 }, { "epoch": 0.3486863444696724, "grad_norm": 1.634074330329895, "learning_rate": 9.129429892141757e-06, "loss": 1.2061, "step": 2150 }, { "epoch": 0.3567953292247811, "grad_norm": 1.8448489904403687, "learning_rate": 9.109155786229828e-06, "loss": 1.1047, "step": 2200 }, { "epoch": 0.3649043139798897, "grad_norm": 2.2948291301727295, "learning_rate": 9.0888816803179e-06, "loss": 1.0898, "step": 2250 }, { "epoch": 0.3730132987349984, "grad_norm": 2.0072033405303955, "learning_rate": 9.06860757440597e-06, "loss": 1.1275, "step": 2300 }, { "epoch": 0.381122283490107, "grad_norm": 1.8813329935073853, "learning_rate": 9.04833346849404e-06, "loss": 1.143, "step": 2350 }, { "epoch": 0.3892312682452157, "grad_norm": 2.1367673873901367, "learning_rate": 9.028059362582111e-06, "loss": 1.1655, "step": 2400 }, { "epoch": 0.3973402530003244, "grad_norm": 0.8832216858863831, "learning_rate": 9.007785256670181e-06, "loss": 1.2366, "step": 2450 }, { "epoch": 0.405449237755433, "grad_norm": 2.3302135467529297, "learning_rate": 8.987511150758253e-06, "loss": 1.1629, "step": 2500 }, { "epoch": 0.4135582225105417, "grad_norm": 2.1263036727905273, "learning_rate": 8.967237044846323e-06, "loss": 1.1454, "step": 2550 }, { "epoch": 0.4216672072656503, "grad_norm": 1.9923527240753174, "learning_rate": 8.946962938934394e-06, "loss": 1.1205, "step": 2600 }, { "epoch": 0.429776192020759, "grad_norm": 3.215719699859619, "learning_rate": 8.926688833022464e-06, "loss": 1.0715, "step": 2650 }, { "epoch": 0.43788517677586763, "grad_norm": 0.8249936103820801, "learning_rate": 8.906414727110536e-06, "loss": 1.1257, "step": 2700 }, { "epoch": 0.4459941615309763, "grad_norm": 7.653429985046387, "learning_rate": 8.886140621198606e-06, "loss": 1.1224, "step": 2750 }, { "epoch": 0.454103146286085, "grad_norm": 1.5797380208969116, "learning_rate": 8.865866515286676e-06, "loss": 1.1456, "step": 2800 }, { "epoch": 0.46221213104119363, "grad_norm": 1.7436014413833618, "learning_rate": 8.845592409374747e-06, "loss": 1.1848, "step": 2850 }, { "epoch": 0.4703211157963023, "grad_norm": 2.171922206878662, "learning_rate": 8.825318303462817e-06, "loss": 1.1075, "step": 2900 }, { "epoch": 0.47843010055141094, "grad_norm": 2.2190613746643066, "learning_rate": 8.805044197550889e-06, "loss": 1.11, "step": 2950 }, { "epoch": 0.4865390853065196, "grad_norm": 1.689220905303955, "learning_rate": 8.784770091638959e-06, "loss": 1.0315, "step": 3000 }, { "epoch": 0.4946480700616283, "grad_norm": 2.075385332107544, "learning_rate": 8.76449598572703e-06, "loss": 1.0846, "step": 3050 }, { "epoch": 0.5027570548167369, "grad_norm": 1.8431355953216553, "learning_rate": 8.744221879815102e-06, "loss": 1.1823, "step": 3100 }, { "epoch": 0.5108660395718456, "grad_norm": 2.7447328567504883, "learning_rate": 8.723947773903172e-06, "loss": 1.1195, "step": 3150 }, { "epoch": 0.5189750243269543, "grad_norm": 1.8737456798553467, "learning_rate": 8.703673667991243e-06, "loss": 1.1918, "step": 3200 }, { "epoch": 0.5270840090820629, "grad_norm": 3.270134210586548, "learning_rate": 8.683399562079313e-06, "loss": 1.0992, "step": 3250 }, { "epoch": 0.5351929938371716, "grad_norm": 2.3992574214935303, "learning_rate": 8.663125456167383e-06, "loss": 1.168, "step": 3300 }, { "epoch": 0.5433019785922802, "grad_norm": 2.5086565017700195, "learning_rate": 8.642851350255455e-06, "loss": 1.104, "step": 3350 }, { "epoch": 0.5514109633473889, "grad_norm": 2.5543065071105957, "learning_rate": 8.622577244343525e-06, "loss": 1.1699, "step": 3400 }, { "epoch": 0.5595199481024976, "grad_norm": 1.8610668182373047, "learning_rate": 8.602303138431596e-06, "loss": 1.0648, "step": 3450 }, { "epoch": 0.5676289328576062, "grad_norm": 0.530300498008728, "learning_rate": 8.582029032519666e-06, "loss": 1.0437, "step": 3500 }, { "epoch": 0.5757379176127149, "grad_norm": 2.2418153285980225, "learning_rate": 8.561754926607738e-06, "loss": 1.0542, "step": 3550 }, { "epoch": 0.5838469023678236, "grad_norm": 3.173265218734741, "learning_rate": 8.541480820695808e-06, "loss": 1.1204, "step": 3600 }, { "epoch": 0.5919558871229322, "grad_norm": 1.4262479543685913, "learning_rate": 8.521206714783878e-06, "loss": 1.1228, "step": 3650 }, { "epoch": 0.6000648718780409, "grad_norm": 6.2367377281188965, "learning_rate": 8.50093260887195e-06, "loss": 1.1236, "step": 3700 }, { "epoch": 0.6081738566331495, "grad_norm": 1.1879503726959229, "learning_rate": 8.480658502960019e-06, "loss": 1.1394, "step": 3750 }, { "epoch": 0.6162828413882582, "grad_norm": 1.846981167793274, "learning_rate": 8.46038439704809e-06, "loss": 1.1356, "step": 3800 }, { "epoch": 0.6243918261433669, "grad_norm": 1.8748732805252075, "learning_rate": 8.440110291136162e-06, "loss": 1.2168, "step": 3850 }, { "epoch": 0.6325008108984755, "grad_norm": 1.6056501865386963, "learning_rate": 8.419836185224232e-06, "loss": 1.0998, "step": 3900 }, { "epoch": 0.6406097956535842, "grad_norm": 3.1764864921569824, "learning_rate": 8.399562079312304e-06, "loss": 1.1459, "step": 3950 }, { "epoch": 0.6487187804086928, "grad_norm": 2.427140712738037, "learning_rate": 8.379287973400374e-06, "loss": 1.1625, "step": 4000 }, { "epoch": 0.6568277651638015, "grad_norm": 3.784374237060547, "learning_rate": 8.359013867488445e-06, "loss": 1.1198, "step": 4050 }, { "epoch": 0.6649367499189102, "grad_norm": 1.0050832033157349, "learning_rate": 8.338739761576515e-06, "loss": 1.156, "step": 4100 }, { "epoch": 0.6730457346740188, "grad_norm": 1.9474921226501465, "learning_rate": 8.318465655664587e-06, "loss": 1.1186, "step": 4150 }, { "epoch": 0.6811547194291274, "grad_norm": 2.666841983795166, "learning_rate": 8.298191549752657e-06, "loss": 1.1655, "step": 4200 }, { "epoch": 0.6892637041842361, "grad_norm": 3.144240379333496, "learning_rate": 8.277917443840727e-06, "loss": 1.1691, "step": 4250 }, { "epoch": 0.6973726889393448, "grad_norm": 130.09506225585938, "learning_rate": 8.257643337928798e-06, "loss": 1.2045, "step": 4300 }, { "epoch": 0.7054816736944535, "grad_norm": 2.074591636657715, "learning_rate": 8.237369232016868e-06, "loss": 1.1461, "step": 4350 }, { "epoch": 0.7135906584495622, "grad_norm": 2.799001932144165, "learning_rate": 8.21709512610494e-06, "loss": 1.1341, "step": 4400 }, { "epoch": 0.7216996432046707, "grad_norm": 2.7407402992248535, "learning_rate": 8.19682102019301e-06, "loss": 1.1195, "step": 4450 }, { "epoch": 0.7298086279597794, "grad_norm": 0.21362553536891937, "learning_rate": 8.176546914281081e-06, "loss": 1.1311, "step": 4500 }, { "epoch": 0.7379176127148881, "grad_norm": 1.1738766431808472, "learning_rate": 8.156272808369151e-06, "loss": 1.0796, "step": 4550 }, { "epoch": 0.7460265974699968, "grad_norm": 2.821784257888794, "learning_rate": 8.135998702457221e-06, "loss": 1.116, "step": 4600 }, { "epoch": 0.7541355822251055, "grad_norm": 2.0731353759765625, "learning_rate": 8.115724596545294e-06, "loss": 1.1121, "step": 4650 }, { "epoch": 0.762244566980214, "grad_norm": 2.632211685180664, "learning_rate": 8.095450490633364e-06, "loss": 1.0747, "step": 4700 }, { "epoch": 0.7703535517353227, "grad_norm": 2.5461819171905518, "learning_rate": 8.075176384721434e-06, "loss": 1.0444, "step": 4750 }, { "epoch": 0.7784625364904314, "grad_norm": 5.323375225067139, "learning_rate": 8.054902278809506e-06, "loss": 1.1097, "step": 4800 }, { "epoch": 0.7865715212455401, "grad_norm": 3.063811779022217, "learning_rate": 8.034628172897576e-06, "loss": 1.1502, "step": 4850 }, { "epoch": 0.7946805060006488, "grad_norm": 2.1988182067871094, "learning_rate": 8.014354066985647e-06, "loss": 1.1024, "step": 4900 }, { "epoch": 0.8027894907557573, "grad_norm": 2.6995670795440674, "learning_rate": 7.994079961073717e-06, "loss": 1.1678, "step": 4950 }, { "epoch": 0.810898475510866, "grad_norm": 0.982642412185669, "learning_rate": 7.973805855161789e-06, "loss": 1.1194, "step": 5000 }, { "epoch": 0.8190074602659747, "grad_norm": 1.8992295265197754, "learning_rate": 7.953531749249859e-06, "loss": 1.1394, "step": 5050 }, { "epoch": 0.8271164450210834, "grad_norm": 1.253397822380066, "learning_rate": 7.933257643337929e-06, "loss": 1.0602, "step": 5100 }, { "epoch": 0.8352254297761921, "grad_norm": 2.562586784362793, "learning_rate": 7.912983537426e-06, "loss": 1.1393, "step": 5150 }, { "epoch": 0.8433344145313006, "grad_norm": 3.5401248931884766, "learning_rate": 7.89270943151407e-06, "loss": 1.0614, "step": 5200 }, { "epoch": 0.8514433992864093, "grad_norm": 1.610593318939209, "learning_rate": 7.872435325602142e-06, "loss": 1.0493, "step": 5250 }, { "epoch": 0.859552384041518, "grad_norm": 1.8910722732543945, "learning_rate": 7.852161219690212e-06, "loss": 1.1287, "step": 5300 }, { "epoch": 0.8676613687966267, "grad_norm": 1.915490746498108, "learning_rate": 7.831887113778283e-06, "loss": 1.0797, "step": 5350 }, { "epoch": 0.8757703535517353, "grad_norm": 3.8769915103912354, "learning_rate": 7.811613007866353e-06, "loss": 1.1077, "step": 5400 }, { "epoch": 0.883879338306844, "grad_norm": 0.7856437563896179, "learning_rate": 7.791338901954425e-06, "loss": 1.0537, "step": 5450 }, { "epoch": 0.8919883230619526, "grad_norm": 1.956763744354248, "learning_rate": 7.771064796042496e-06, "loss": 1.0882, "step": 5500 }, { "epoch": 0.9000973078170613, "grad_norm": 2.763761043548584, "learning_rate": 7.750790690130566e-06, "loss": 1.1859, "step": 5550 }, { "epoch": 0.90820629257217, "grad_norm": 1.743696689605713, "learning_rate": 7.730516584218636e-06, "loss": 1.0828, "step": 5600 }, { "epoch": 0.9163152773272786, "grad_norm": 1.8948084115982056, "learning_rate": 7.710242478306708e-06, "loss": 1.1487, "step": 5650 }, { "epoch": 0.9244242620823873, "grad_norm": 1.9413537979125977, "learning_rate": 7.689968372394778e-06, "loss": 1.0365, "step": 5700 }, { "epoch": 0.9325332468374959, "grad_norm": 1.3905330896377563, "learning_rate": 7.66969426648285e-06, "loss": 1.1288, "step": 5750 }, { "epoch": 0.9406422315926046, "grad_norm": 2.7509984970092773, "learning_rate": 7.649420160570919e-06, "loss": 1.1158, "step": 5800 }, { "epoch": 0.9487512163477133, "grad_norm": 2.0269293785095215, "learning_rate": 7.62914605465899e-06, "loss": 1.0666, "step": 5850 }, { "epoch": 0.9568602011028219, "grad_norm": 2.457383632659912, "learning_rate": 7.608871948747061e-06, "loss": 1.0743, "step": 5900 }, { "epoch": 0.9649691858579306, "grad_norm": 1.9480834007263184, "learning_rate": 7.588597842835131e-06, "loss": 1.0856, "step": 5950 }, { "epoch": 0.9730781706130393, "grad_norm": 2.421600103378296, "learning_rate": 7.568323736923202e-06, "loss": 1.1044, "step": 6000 }, { "epoch": 0.9811871553681479, "grad_norm": 1.9701539278030396, "learning_rate": 7.548049631011273e-06, "loss": 1.0493, "step": 6050 }, { "epoch": 0.9892961401232566, "grad_norm": 3.0008904933929443, "learning_rate": 7.527775525099344e-06, "loss": 1.057, "step": 6100 }, { "epoch": 0.9974051248783652, "grad_norm": 2.9297544956207275, "learning_rate": 7.507501419187414e-06, "loss": 1.0695, "step": 6150 }, { "epoch": 1.0055141096334739, "grad_norm": 8.883905410766602, "learning_rate": 7.487227313275484e-06, "loss": 1.1178, "step": 6200 }, { "epoch": 1.0136230943885824, "grad_norm": 1.9629335403442383, "learning_rate": 7.466953207363555e-06, "loss": 1.1246, "step": 6250 }, { "epoch": 1.0217320791436912, "grad_norm": 1.3846949338912964, "learning_rate": 7.4466791014516275e-06, "loss": 1.1218, "step": 6300 }, { "epoch": 1.0298410638987998, "grad_norm": 1.8666070699691772, "learning_rate": 7.426404995539697e-06, "loss": 1.072, "step": 6350 }, { "epoch": 1.0379500486539086, "grad_norm": 2.517359972000122, "learning_rate": 7.406130889627768e-06, "loss": 1.1204, "step": 6400 }, { "epoch": 1.0460590334090172, "grad_norm": 10.04233455657959, "learning_rate": 7.385856783715839e-06, "loss": 1.1025, "step": 6450 }, { "epoch": 1.0541680181641258, "grad_norm": 2.2324578762054443, "learning_rate": 7.36558267780391e-06, "loss": 1.0799, "step": 6500 }, { "epoch": 1.0622770029192345, "grad_norm": 1.6139917373657227, "learning_rate": 7.34530857189198e-06, "loss": 1.1512, "step": 6550 }, { "epoch": 1.0703859876743431, "grad_norm": 1.7317317724227905, "learning_rate": 7.325034465980051e-06, "loss": 1.0947, "step": 6600 }, { "epoch": 1.078494972429452, "grad_norm": 1.3901951313018799, "learning_rate": 7.304760360068122e-06, "loss": 1.1165, "step": 6650 }, { "epoch": 1.0866039571845605, "grad_norm": 2.5908029079437256, "learning_rate": 7.284486254156193e-06, "loss": 1.0484, "step": 6700 }, { "epoch": 1.094712941939669, "grad_norm": 2.977522850036621, "learning_rate": 7.2642121482442626e-06, "loss": 1.0471, "step": 6750 }, { "epoch": 1.1028219266947779, "grad_norm": 1.2937211990356445, "learning_rate": 7.243938042332333e-06, "loss": 1.1202, "step": 6800 }, { "epoch": 1.1109309114498864, "grad_norm": 2.9973866939544678, "learning_rate": 7.223663936420404e-06, "loss": 1.0414, "step": 6850 }, { "epoch": 1.1190398962049952, "grad_norm": 5.904523849487305, "learning_rate": 7.203389830508475e-06, "loss": 1.0944, "step": 6900 }, { "epoch": 1.1271488809601038, "grad_norm": 4.078853130340576, "learning_rate": 7.1831157245965456e-06, "loss": 1.0421, "step": 6950 }, { "epoch": 1.1352578657152124, "grad_norm": 2.3049638271331787, "learning_rate": 7.162841618684616e-06, "loss": 1.1185, "step": 7000 }, { "epoch": 1.1433668504703212, "grad_norm": 1.740246057510376, "learning_rate": 7.142567512772687e-06, "loss": 1.0862, "step": 7050 }, { "epoch": 1.1514758352254297, "grad_norm": 7.726000785827637, "learning_rate": 7.122293406860759e-06, "loss": 1.0821, "step": 7100 }, { "epoch": 1.1595848199805385, "grad_norm": 0.24267837405204773, "learning_rate": 7.1020193009488294e-06, "loss": 1.0419, "step": 7150 }, { "epoch": 1.167693804735647, "grad_norm": 1.9721111059188843, "learning_rate": 7.0817451950369e-06, "loss": 1.0959, "step": 7200 }, { "epoch": 1.1758027894907557, "grad_norm": 1.5573512315750122, "learning_rate": 7.06147108912497e-06, "loss": 1.0827, "step": 7250 }, { "epoch": 1.1839117742458645, "grad_norm": 2.4766602516174316, "learning_rate": 7.041196983213041e-06, "loss": 1.0144, "step": 7300 }, { "epoch": 1.192020759000973, "grad_norm": 3.357357978820801, "learning_rate": 7.020922877301112e-06, "loss": 1.0686, "step": 7350 }, { "epoch": 1.2001297437560818, "grad_norm": 1.6499881744384766, "learning_rate": 7.000648771389182e-06, "loss": 1.0918, "step": 7400 }, { "epoch": 1.2082387285111904, "grad_norm": 2.088510751724243, "learning_rate": 6.980374665477253e-06, "loss": 1.0963, "step": 7450 }, { "epoch": 1.216347713266299, "grad_norm": 2.040971040725708, "learning_rate": 6.960100559565324e-06, "loss": 1.1578, "step": 7500 }, { "epoch": 1.2244566980214078, "grad_norm": 1.381493091583252, "learning_rate": 6.939826453653395e-06, "loss": 1.1057, "step": 7550 }, { "epoch": 1.2325656827765163, "grad_norm": 1.6742016077041626, "learning_rate": 6.919552347741465e-06, "loss": 1.1118, "step": 7600 }, { "epoch": 1.240674667531625, "grad_norm": 0.9264686107635498, "learning_rate": 6.899278241829535e-06, "loss": 1.0471, "step": 7650 }, { "epoch": 1.2487836522867337, "grad_norm": 3.0064830780029297, "learning_rate": 6.879004135917606e-06, "loss": 1.0854, "step": 7700 }, { "epoch": 1.2568926370418423, "grad_norm": 0.9564074873924255, "learning_rate": 6.858730030005677e-06, "loss": 1.1668, "step": 7750 }, { "epoch": 1.265001621796951, "grad_norm": 7.573803901672363, "learning_rate": 6.8384559240937475e-06, "loss": 1.1087, "step": 7800 }, { "epoch": 1.2731106065520597, "grad_norm": 6.813046455383301, "learning_rate": 6.818181818181818e-06, "loss": 1.1263, "step": 7850 }, { "epoch": 1.2812195913071682, "grad_norm": 1.5467007160186768, "learning_rate": 6.79790771226989e-06, "loss": 1.1001, "step": 7900 }, { "epoch": 1.289328576062277, "grad_norm": 1.5642696619033813, "learning_rate": 6.777633606357961e-06, "loss": 1.1252, "step": 7950 }, { "epoch": 1.2974375608173856, "grad_norm": 5.454095363616943, "learning_rate": 6.757359500446031e-06, "loss": 1.0533, "step": 8000 }, { "epoch": 1.3055465455724944, "grad_norm": 2.569298028945923, "learning_rate": 6.737085394534102e-06, "loss": 1.1265, "step": 8050 }, { "epoch": 1.313655530327603, "grad_norm": 0.9834175705909729, "learning_rate": 6.716811288622173e-06, "loss": 1.1177, "step": 8100 }, { "epoch": 1.3217645150827115, "grad_norm": 1.9268743991851807, "learning_rate": 6.696537182710244e-06, "loss": 1.0928, "step": 8150 }, { "epoch": 1.3298734998378203, "grad_norm": 2.7169127464294434, "learning_rate": 6.6762630767983135e-06, "loss": 1.0943, "step": 8200 }, { "epoch": 1.337982484592929, "grad_norm": 1.8307722806930542, "learning_rate": 6.655988970886384e-06, "loss": 1.0697, "step": 8250 }, { "epoch": 1.3460914693480377, "grad_norm": 1.8044813871383667, "learning_rate": 6.635714864974455e-06, "loss": 1.1663, "step": 8300 }, { "epoch": 1.3542004541031463, "grad_norm": 0.26450178027153015, "learning_rate": 6.615440759062526e-06, "loss": 1.0665, "step": 8350 }, { "epoch": 1.3623094388582548, "grad_norm": 5.571292877197266, "learning_rate": 6.5951666531505966e-06, "loss": 1.0099, "step": 8400 }, { "epoch": 1.3704184236133636, "grad_norm": 3.6548831462860107, "learning_rate": 6.574892547238667e-06, "loss": 1.0948, "step": 8450 }, { "epoch": 1.3785274083684722, "grad_norm": 1.9964606761932373, "learning_rate": 6.554618441326738e-06, "loss": 1.079, "step": 8500 }, { "epoch": 1.386636393123581, "grad_norm": 9.50430965423584, "learning_rate": 6.534344335414808e-06, "loss": 1.093, "step": 8550 }, { "epoch": 1.3947453778786896, "grad_norm": 2.573894500732422, "learning_rate": 6.514070229502879e-06, "loss": 1.1444, "step": 8600 }, { "epoch": 1.4028543626337981, "grad_norm": 2.370356559753418, "learning_rate": 6.4937961235909495e-06, "loss": 1.1342, "step": 8650 }, { "epoch": 1.410963347388907, "grad_norm": 2.698498487472534, "learning_rate": 6.47352201767902e-06, "loss": 1.1387, "step": 8700 }, { "epoch": 1.4190723321440155, "grad_norm": 1.4973636865615845, "learning_rate": 6.453247911767092e-06, "loss": 1.077, "step": 8750 }, { "epoch": 1.4271813168991243, "grad_norm": 2.350057601928711, "learning_rate": 6.432973805855163e-06, "loss": 1.0281, "step": 8800 }, { "epoch": 1.4352903016542329, "grad_norm": 2.5107436180114746, "learning_rate": 6.412699699943233e-06, "loss": 1.0517, "step": 8850 }, { "epoch": 1.4433992864093415, "grad_norm": 2.506737470626831, "learning_rate": 6.392425594031304e-06, "loss": 1.0565, "step": 8900 }, { "epoch": 1.4515082711644502, "grad_norm": 2.0631542205810547, "learning_rate": 6.372151488119375e-06, "loss": 1.0925, "step": 8950 }, { "epoch": 1.4596172559195588, "grad_norm": 1.4057486057281494, "learning_rate": 6.351877382207446e-06, "loss": 1.1257, "step": 9000 }, { "epoch": 1.4677262406746676, "grad_norm": 1.5711629390716553, "learning_rate": 6.331603276295516e-06, "loss": 1.0615, "step": 9050 }, { "epoch": 1.4758352254297762, "grad_norm": 3.042393922805786, "learning_rate": 6.311329170383586e-06, "loss": 1.0664, "step": 9100 }, { "epoch": 1.4839442101848848, "grad_norm": 2.0653626918792725, "learning_rate": 6.291055064471657e-06, "loss": 1.0039, "step": 9150 }, { "epoch": 1.4920531949399936, "grad_norm": 2.374119281768799, "learning_rate": 6.270780958559728e-06, "loss": 1.0053, "step": 9200 }, { "epoch": 1.5001621796951021, "grad_norm": 2.568708896636963, "learning_rate": 6.2505068526477985e-06, "loss": 1.0484, "step": 9250 }, { "epoch": 1.508271164450211, "grad_norm": 2.4763758182525635, "learning_rate": 6.230232746735869e-06, "loss": 1.0998, "step": 9300 }, { "epoch": 1.5163801492053195, "grad_norm": 1.8199087381362915, "learning_rate": 6.20995864082394e-06, "loss": 1.1189, "step": 9350 }, { "epoch": 1.524489133960428, "grad_norm": 4.159176826477051, "learning_rate": 6.189684534912011e-06, "loss": 1.1606, "step": 9400 }, { "epoch": 1.5325981187155369, "grad_norm": 1.266381025314331, "learning_rate": 6.1694104290000815e-06, "loss": 1.039, "step": 9450 }, { "epoch": 1.5407071034706454, "grad_norm": 1.114743947982788, "learning_rate": 6.1491363230881514e-06, "loss": 1.0937, "step": 9500 }, { "epoch": 1.5488160882257542, "grad_norm": 4.52326774597168, "learning_rate": 6.128862217176224e-06, "loss": 1.0782, "step": 9550 }, { "epoch": 1.5569250729808628, "grad_norm": 2.066561460494995, "learning_rate": 6.108588111264294e-06, "loss": 1.0033, "step": 9600 }, { "epoch": 1.5650340577359714, "grad_norm": 3.260119676589966, "learning_rate": 6.0883140053523645e-06, "loss": 1.063, "step": 9650 }, { "epoch": 1.5731430424910802, "grad_norm": 3.0911264419555664, "learning_rate": 6.068039899440435e-06, "loss": 1.0468, "step": 9700 }, { "epoch": 1.5812520272461887, "grad_norm": 6.644060134887695, "learning_rate": 6.047765793528506e-06, "loss": 1.0236, "step": 9750 }, { "epoch": 1.5893610120012975, "grad_norm": 2.7163946628570557, "learning_rate": 6.027491687616577e-06, "loss": 1.0351, "step": 9800 }, { "epoch": 1.597469996756406, "grad_norm": 2.4623427391052246, "learning_rate": 6.0072175817046475e-06, "loss": 1.1016, "step": 9850 }, { "epoch": 1.6055789815115147, "grad_norm": 2.7548983097076416, "learning_rate": 5.986943475792718e-06, "loss": 1.0571, "step": 9900 }, { "epoch": 1.6136879662666233, "grad_norm": 2.269378423690796, "learning_rate": 5.966669369880789e-06, "loss": 1.0962, "step": 9950 }, { "epoch": 1.621796951021732, "grad_norm": 2.0521297454833984, "learning_rate": 5.946395263968859e-06, "loss": 1.1202, "step": 10000 }, { "epoch": 1.6299059357768408, "grad_norm": 2.488682985305786, "learning_rate": 5.92612115805693e-06, "loss": 1.0879, "step": 10050 }, { "epoch": 1.6380149205319494, "grad_norm": 2.8953864574432373, "learning_rate": 5.9058470521450005e-06, "loss": 1.1048, "step": 10100 }, { "epoch": 1.646123905287058, "grad_norm": 2.272630214691162, "learning_rate": 5.885572946233071e-06, "loss": 1.0903, "step": 10150 }, { "epoch": 1.6542328900421666, "grad_norm": 2.1369316577911377, "learning_rate": 5.865298840321142e-06, "loss": 1.0911, "step": 10200 }, { "epoch": 1.6623418747972754, "grad_norm": 3.1257848739624023, "learning_rate": 5.845024734409213e-06, "loss": 1.066, "step": 10250 }, { "epoch": 1.6704508595523841, "grad_norm": 2.81978440284729, "learning_rate": 5.8247506284972835e-06, "loss": 1.1032, "step": 10300 }, { "epoch": 1.6785598443074927, "grad_norm": 1.2515383958816528, "learning_rate": 5.804476522585355e-06, "loss": 1.0646, "step": 10350 }, { "epoch": 1.6866688290626013, "grad_norm": 2.6792616844177246, "learning_rate": 5.784202416673426e-06, "loss": 1.0804, "step": 10400 }, { "epoch": 1.6947778138177099, "grad_norm": 2.989546537399292, "learning_rate": 5.763928310761497e-06, "loss": 1.1245, "step": 10450 }, { "epoch": 1.7028867985728187, "grad_norm": 3.0825021266937256, "learning_rate": 5.743654204849567e-06, "loss": 1.1059, "step": 10500 }, { "epoch": 1.7109957833279275, "grad_norm": 2.1985509395599365, "learning_rate": 5.723380098937637e-06, "loss": 1.1493, "step": 10550 }, { "epoch": 1.719104768083036, "grad_norm": 2.780210256576538, "learning_rate": 5.703105993025708e-06, "loss": 1.1207, "step": 10600 }, { "epoch": 1.7272137528381446, "grad_norm": 1.8562790155410767, "learning_rate": 5.682831887113779e-06, "loss": 1.0158, "step": 10650 }, { "epoch": 1.7353227375932532, "grad_norm": 1.9536188840866089, "learning_rate": 5.6625577812018495e-06, "loss": 1.113, "step": 10700 }, { "epoch": 1.743431722348362, "grad_norm": 7.557844638824463, "learning_rate": 5.64228367528992e-06, "loss": 1.0133, "step": 10750 }, { "epoch": 1.7515407071034708, "grad_norm": 6.065642356872559, "learning_rate": 5.622009569377991e-06, "loss": 1.1057, "step": 10800 }, { "epoch": 1.7596496918585793, "grad_norm": 2.0582070350646973, "learning_rate": 5.601735463466062e-06, "loss": 1.0018, "step": 10850 }, { "epoch": 1.767758676613688, "grad_norm": 1.545466423034668, "learning_rate": 5.5814613575541325e-06, "loss": 0.9689, "step": 10900 }, { "epoch": 1.7758676613687965, "grad_norm": 2.50724196434021, "learning_rate": 5.561187251642202e-06, "loss": 1.0578, "step": 10950 }, { "epoch": 1.7839766461239053, "grad_norm": 3.886679172515869, "learning_rate": 5.540913145730273e-06, "loss": 1.0745, "step": 11000 }, { "epoch": 1.792085630879014, "grad_norm": 3.0689311027526855, "learning_rate": 5.520639039818344e-06, "loss": 1.0816, "step": 11050 }, { "epoch": 1.8001946156341226, "grad_norm": 2.43038010597229, "learning_rate": 5.500364933906415e-06, "loss": 1.1257, "step": 11100 }, { "epoch": 1.8083036003892312, "grad_norm": 1.9623647928237915, "learning_rate": 5.480090827994486e-06, "loss": 1.072, "step": 11150 }, { "epoch": 1.8164125851443398, "grad_norm": 2.903964042663574, "learning_rate": 5.459816722082557e-06, "loss": 1.1159, "step": 11200 }, { "epoch": 1.8245215698994486, "grad_norm": 3.9385204315185547, "learning_rate": 5.439542616170628e-06, "loss": 1.0296, "step": 11250 }, { "epoch": 1.8326305546545574, "grad_norm": 3.2969629764556885, "learning_rate": 5.4192685102586985e-06, "loss": 1.0818, "step": 11300 }, { "epoch": 1.840739539409666, "grad_norm": 6.509484767913818, "learning_rate": 5.398994404346769e-06, "loss": 1.0848, "step": 11350 }, { "epoch": 1.8488485241647745, "grad_norm": 3.701723098754883, "learning_rate": 5.37872029843484e-06, "loss": 0.9775, "step": 11400 }, { "epoch": 1.856957508919883, "grad_norm": 2.0968496799468994, "learning_rate": 5.35844619252291e-06, "loss": 1.1417, "step": 11450 }, { "epoch": 1.8650664936749919, "grad_norm": 2.2569050788879395, "learning_rate": 5.338172086610981e-06, "loss": 1.0199, "step": 11500 }, { "epoch": 1.8731754784301007, "grad_norm": 8.650045394897461, "learning_rate": 5.3178979806990514e-06, "loss": 1.1163, "step": 11550 }, { "epoch": 1.8812844631852093, "grad_norm": 3.9758262634277344, "learning_rate": 5.297623874787122e-06, "loss": 0.9833, "step": 11600 }, { "epoch": 1.8893934479403178, "grad_norm": 2.467902183532715, "learning_rate": 5.277349768875193e-06, "loss": 1.0529, "step": 11650 }, { "epoch": 1.8975024326954264, "grad_norm": 3.978503465652466, "learning_rate": 5.257075662963264e-06, "loss": 1.2041, "step": 11700 }, { "epoch": 1.9056114174505352, "grad_norm": 2.431842803955078, "learning_rate": 5.2368015570513345e-06, "loss": 1.0279, "step": 11750 }, { "epoch": 1.913720402205644, "grad_norm": 2.4601986408233643, "learning_rate": 5.216527451139405e-06, "loss": 1.1013, "step": 11800 }, { "epoch": 1.9218293869607526, "grad_norm": 2.702439546585083, "learning_rate": 5.196253345227475e-06, "loss": 1.0153, "step": 11850 }, { "epoch": 1.9299383717158611, "grad_norm": 1.80119788646698, "learning_rate": 5.175979239315546e-06, "loss": 1.0995, "step": 11900 }, { "epoch": 1.9380473564709697, "grad_norm": 2.78031587600708, "learning_rate": 5.155705133403617e-06, "loss": 1.0488, "step": 11950 }, { "epoch": 1.9461563412260785, "grad_norm": 1.3408476114273071, "learning_rate": 5.135431027491688e-06, "loss": 1.0176, "step": 12000 }, { "epoch": 1.9542653259811873, "grad_norm": 2.418548107147217, "learning_rate": 5.115156921579759e-06, "loss": 1.0323, "step": 12050 }, { "epoch": 1.9623743107362959, "grad_norm": 2.1113944053649902, "learning_rate": 5.09488281566783e-06, "loss": 1.0717, "step": 12100 }, { "epoch": 1.9704832954914044, "grad_norm": 1.4384586811065674, "learning_rate": 5.0746087097559005e-06, "loss": 1.053, "step": 12150 }, { "epoch": 1.978592280246513, "grad_norm": 2.836148738861084, "learning_rate": 5.054334603843971e-06, "loss": 0.9988, "step": 12200 }, { "epoch": 1.9867012650016218, "grad_norm": 1.2631614208221436, "learning_rate": 5.034060497932042e-06, "loss": 1.0348, "step": 12250 }, { "epoch": 1.9948102497567306, "grad_norm": 5.070575714111328, "learning_rate": 5.013786392020113e-06, "loss": 1.0272, "step": 12300 } ], "logging_steps": 50, "max_steps": 24664, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.8392191071174525e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }