|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 2.0, |
|
"eval_steps": 500, |
|
"global_step": 12332, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008108984755108661, |
|
"grad_norm": 1.9572986364364624, |
|
"learning_rate": 9.980942340442787e-06, |
|
"loss": 1.3961, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.016217969510217322, |
|
"grad_norm": 2.140582323074341, |
|
"learning_rate": 9.960668234530858e-06, |
|
"loss": 1.2721, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.02432695426532598, |
|
"grad_norm": 2.600647211074829, |
|
"learning_rate": 9.940394128618928e-06, |
|
"loss": 1.2162, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.032435939020434644, |
|
"grad_norm": 2.521652936935425, |
|
"learning_rate": 9.920120022707e-06, |
|
"loss": 1.2469, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0405449237755433, |
|
"grad_norm": 2.509162425994873, |
|
"learning_rate": 9.89984591679507e-06, |
|
"loss": 1.2162, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.04865390853065196, |
|
"grad_norm": 4.292358875274658, |
|
"learning_rate": 9.879571810883141e-06, |
|
"loss": 1.2096, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.056762893285760625, |
|
"grad_norm": 2.3972127437591553, |
|
"learning_rate": 9.859297704971211e-06, |
|
"loss": 1.1857, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.06487187804086929, |
|
"grad_norm": 1.8880646228790283, |
|
"learning_rate": 9.839023599059281e-06, |
|
"loss": 1.1973, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.07298086279597794, |
|
"grad_norm": 1.4280647039413452, |
|
"learning_rate": 9.818749493147353e-06, |
|
"loss": 1.2305, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.0810898475510866, |
|
"grad_norm": 1.4675335884094238, |
|
"learning_rate": 9.798475387235423e-06, |
|
"loss": 1.2302, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08919883230619527, |
|
"grad_norm": 3.1858043670654297, |
|
"learning_rate": 9.778201281323494e-06, |
|
"loss": 1.1004, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.09730781706130393, |
|
"grad_norm": 1.4137463569641113, |
|
"learning_rate": 9.757927175411566e-06, |
|
"loss": 1.1911, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.10541680181641258, |
|
"grad_norm": 1.389324426651001, |
|
"learning_rate": 9.737653069499636e-06, |
|
"loss": 1.1752, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.11352578657152125, |
|
"grad_norm": 2.291736602783203, |
|
"learning_rate": 9.717378963587707e-06, |
|
"loss": 1.135, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.1216347713266299, |
|
"grad_norm": 1.8395874500274658, |
|
"learning_rate": 9.697104857675777e-06, |
|
"loss": 1.1551, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.12974375608173858, |
|
"grad_norm": 1.9675475358963013, |
|
"learning_rate": 9.676830751763849e-06, |
|
"loss": 1.1685, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.13785274083684723, |
|
"grad_norm": 2.3232228755950928, |
|
"learning_rate": 9.656556645851919e-06, |
|
"loss": 1.1895, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.1459617255919559, |
|
"grad_norm": 1.8441636562347412, |
|
"learning_rate": 9.636282539939989e-06, |
|
"loss": 1.1697, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.15407071034706454, |
|
"grad_norm": 2.008300542831421, |
|
"learning_rate": 9.61600843402806e-06, |
|
"loss": 1.1553, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.1621796951021732, |
|
"grad_norm": 1.5163062810897827, |
|
"learning_rate": 9.59573432811613e-06, |
|
"loss": 1.1739, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.17028867985728185, |
|
"grad_norm": 1.4669376611709595, |
|
"learning_rate": 9.575460222204202e-06, |
|
"loss": 1.1582, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.17839766461239054, |
|
"grad_norm": 2.10261607170105, |
|
"learning_rate": 9.555186116292272e-06, |
|
"loss": 1.1284, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.1865066493674992, |
|
"grad_norm": 1.8615520000457764, |
|
"learning_rate": 9.534912010380343e-06, |
|
"loss": 1.0916, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.19461563412260785, |
|
"grad_norm": 3.1653287410736084, |
|
"learning_rate": 9.514637904468413e-06, |
|
"loss": 1.2177, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.2027246188777165, |
|
"grad_norm": 2.7635200023651123, |
|
"learning_rate": 9.494363798556485e-06, |
|
"loss": 1.1217, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.21083360363282516, |
|
"grad_norm": 1.2704397439956665, |
|
"learning_rate": 9.474089692644555e-06, |
|
"loss": 1.2341, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.21894258838793382, |
|
"grad_norm": 5.202953815460205, |
|
"learning_rate": 9.453815586732625e-06, |
|
"loss": 1.2126, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.2270515731430425, |
|
"grad_norm": 2.3350253105163574, |
|
"learning_rate": 9.433541480820698e-06, |
|
"loss": 1.1721, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.23516055789815116, |
|
"grad_norm": 1.807605266571045, |
|
"learning_rate": 9.413267374908768e-06, |
|
"loss": 1.1475, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.2432695426532598, |
|
"grad_norm": 1.765973448753357, |
|
"learning_rate": 9.392993268996838e-06, |
|
"loss": 1.1572, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.25137852740836847, |
|
"grad_norm": 1.1422791481018066, |
|
"learning_rate": 9.37271916308491e-06, |
|
"loss": 1.1987, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.25948751216347715, |
|
"grad_norm": 2.6164450645446777, |
|
"learning_rate": 9.35244505717298e-06, |
|
"loss": 1.2152, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.2675964969185858, |
|
"grad_norm": 0.7747199535369873, |
|
"learning_rate": 9.332170951261051e-06, |
|
"loss": 1.1686, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.27570548167369446, |
|
"grad_norm": 2.8744866847991943, |
|
"learning_rate": 9.31189684534912e-06, |
|
"loss": 1.1571, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.2838144664288031, |
|
"grad_norm": 3.669771671295166, |
|
"learning_rate": 9.291622739437192e-06, |
|
"loss": 1.1609, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.2919234511839118, |
|
"grad_norm": 1.5796583890914917, |
|
"learning_rate": 9.271348633525262e-06, |
|
"loss": 1.1873, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.30003243593902046, |
|
"grad_norm": 24.162195205688477, |
|
"learning_rate": 9.251074527613332e-06, |
|
"loss": 1.071, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.3081414206941291, |
|
"grad_norm": 2.8964436054229736, |
|
"learning_rate": 9.230800421701404e-06, |
|
"loss": 1.1767, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.31625040544923777, |
|
"grad_norm": 2.9506802558898926, |
|
"learning_rate": 9.210526315789474e-06, |
|
"loss": 1.1566, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.3243593902043464, |
|
"grad_norm": 3.0490458011627197, |
|
"learning_rate": 9.190252209877545e-06, |
|
"loss": 1.1563, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3324683749594551, |
|
"grad_norm": 2.236199378967285, |
|
"learning_rate": 9.169978103965615e-06, |
|
"loss": 1.1493, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.3405773597145637, |
|
"grad_norm": 2.1622684001922607, |
|
"learning_rate": 9.149703998053687e-06, |
|
"loss": 1.1613, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.3486863444696724, |
|
"grad_norm": 1.634074330329895, |
|
"learning_rate": 9.129429892141757e-06, |
|
"loss": 1.2061, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.3567953292247811, |
|
"grad_norm": 1.8448489904403687, |
|
"learning_rate": 9.109155786229828e-06, |
|
"loss": 1.1047, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.3649043139798897, |
|
"grad_norm": 2.2948291301727295, |
|
"learning_rate": 9.0888816803179e-06, |
|
"loss": 1.0898, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.3730132987349984, |
|
"grad_norm": 2.0072033405303955, |
|
"learning_rate": 9.06860757440597e-06, |
|
"loss": 1.1275, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.381122283490107, |
|
"grad_norm": 1.8813329935073853, |
|
"learning_rate": 9.04833346849404e-06, |
|
"loss": 1.143, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.3892312682452157, |
|
"grad_norm": 2.1367673873901367, |
|
"learning_rate": 9.028059362582111e-06, |
|
"loss": 1.1655, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.3973402530003244, |
|
"grad_norm": 0.8832216858863831, |
|
"learning_rate": 9.007785256670181e-06, |
|
"loss": 1.2366, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.405449237755433, |
|
"grad_norm": 2.3302135467529297, |
|
"learning_rate": 8.987511150758253e-06, |
|
"loss": 1.1629, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4135582225105417, |
|
"grad_norm": 2.1263036727905273, |
|
"learning_rate": 8.967237044846323e-06, |
|
"loss": 1.1454, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.4216672072656503, |
|
"grad_norm": 1.9923527240753174, |
|
"learning_rate": 8.946962938934394e-06, |
|
"loss": 1.1205, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.429776192020759, |
|
"grad_norm": 3.215719699859619, |
|
"learning_rate": 8.926688833022464e-06, |
|
"loss": 1.0715, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.43788517677586763, |
|
"grad_norm": 0.8249936103820801, |
|
"learning_rate": 8.906414727110536e-06, |
|
"loss": 1.1257, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.4459941615309763, |
|
"grad_norm": 7.653429985046387, |
|
"learning_rate": 8.886140621198606e-06, |
|
"loss": 1.1224, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.454103146286085, |
|
"grad_norm": 1.5797380208969116, |
|
"learning_rate": 8.865866515286676e-06, |
|
"loss": 1.1456, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.46221213104119363, |
|
"grad_norm": 1.7436014413833618, |
|
"learning_rate": 8.845592409374747e-06, |
|
"loss": 1.1848, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.4703211157963023, |
|
"grad_norm": 2.171922206878662, |
|
"learning_rate": 8.825318303462817e-06, |
|
"loss": 1.1075, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.47843010055141094, |
|
"grad_norm": 2.2190613746643066, |
|
"learning_rate": 8.805044197550889e-06, |
|
"loss": 1.11, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.4865390853065196, |
|
"grad_norm": 1.689220905303955, |
|
"learning_rate": 8.784770091638959e-06, |
|
"loss": 1.0315, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.4946480700616283, |
|
"grad_norm": 2.075385332107544, |
|
"learning_rate": 8.76449598572703e-06, |
|
"loss": 1.0846, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.5027570548167369, |
|
"grad_norm": 1.8431355953216553, |
|
"learning_rate": 8.744221879815102e-06, |
|
"loss": 1.1823, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.5108660395718456, |
|
"grad_norm": 2.7447328567504883, |
|
"learning_rate": 8.723947773903172e-06, |
|
"loss": 1.1195, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.5189750243269543, |
|
"grad_norm": 1.8737456798553467, |
|
"learning_rate": 8.703673667991243e-06, |
|
"loss": 1.1918, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.5270840090820629, |
|
"grad_norm": 3.270134210586548, |
|
"learning_rate": 8.683399562079313e-06, |
|
"loss": 1.0992, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.5351929938371716, |
|
"grad_norm": 2.3992574214935303, |
|
"learning_rate": 8.663125456167383e-06, |
|
"loss": 1.168, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.5433019785922802, |
|
"grad_norm": 2.5086565017700195, |
|
"learning_rate": 8.642851350255455e-06, |
|
"loss": 1.104, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.5514109633473889, |
|
"grad_norm": 2.5543065071105957, |
|
"learning_rate": 8.622577244343525e-06, |
|
"loss": 1.1699, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.5595199481024976, |
|
"grad_norm": 1.8610668182373047, |
|
"learning_rate": 8.602303138431596e-06, |
|
"loss": 1.0648, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.5676289328576062, |
|
"grad_norm": 0.530300498008728, |
|
"learning_rate": 8.582029032519666e-06, |
|
"loss": 1.0437, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.5757379176127149, |
|
"grad_norm": 2.2418153285980225, |
|
"learning_rate": 8.561754926607738e-06, |
|
"loss": 1.0542, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.5838469023678236, |
|
"grad_norm": 3.173265218734741, |
|
"learning_rate": 8.541480820695808e-06, |
|
"loss": 1.1204, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.5919558871229322, |
|
"grad_norm": 1.4262479543685913, |
|
"learning_rate": 8.521206714783878e-06, |
|
"loss": 1.1228, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.6000648718780409, |
|
"grad_norm": 6.2367377281188965, |
|
"learning_rate": 8.50093260887195e-06, |
|
"loss": 1.1236, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.6081738566331495, |
|
"grad_norm": 1.1879503726959229, |
|
"learning_rate": 8.480658502960019e-06, |
|
"loss": 1.1394, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.6162828413882582, |
|
"grad_norm": 1.846981167793274, |
|
"learning_rate": 8.46038439704809e-06, |
|
"loss": 1.1356, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.6243918261433669, |
|
"grad_norm": 1.8748732805252075, |
|
"learning_rate": 8.440110291136162e-06, |
|
"loss": 1.2168, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.6325008108984755, |
|
"grad_norm": 1.6056501865386963, |
|
"learning_rate": 8.419836185224232e-06, |
|
"loss": 1.0998, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.6406097956535842, |
|
"grad_norm": 3.1764864921569824, |
|
"learning_rate": 8.399562079312304e-06, |
|
"loss": 1.1459, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.6487187804086928, |
|
"grad_norm": 2.427140712738037, |
|
"learning_rate": 8.379287973400374e-06, |
|
"loss": 1.1625, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6568277651638015, |
|
"grad_norm": 3.784374237060547, |
|
"learning_rate": 8.359013867488445e-06, |
|
"loss": 1.1198, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.6649367499189102, |
|
"grad_norm": 1.0050832033157349, |
|
"learning_rate": 8.338739761576515e-06, |
|
"loss": 1.156, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.6730457346740188, |
|
"grad_norm": 1.9474921226501465, |
|
"learning_rate": 8.318465655664587e-06, |
|
"loss": 1.1186, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.6811547194291274, |
|
"grad_norm": 2.666841983795166, |
|
"learning_rate": 8.298191549752657e-06, |
|
"loss": 1.1655, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.6892637041842361, |
|
"grad_norm": 3.144240379333496, |
|
"learning_rate": 8.277917443840727e-06, |
|
"loss": 1.1691, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.6973726889393448, |
|
"grad_norm": 130.09506225585938, |
|
"learning_rate": 8.257643337928798e-06, |
|
"loss": 1.2045, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.7054816736944535, |
|
"grad_norm": 2.074591636657715, |
|
"learning_rate": 8.237369232016868e-06, |
|
"loss": 1.1461, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.7135906584495622, |
|
"grad_norm": 2.799001932144165, |
|
"learning_rate": 8.21709512610494e-06, |
|
"loss": 1.1341, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.7216996432046707, |
|
"grad_norm": 2.7407402992248535, |
|
"learning_rate": 8.19682102019301e-06, |
|
"loss": 1.1195, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.7298086279597794, |
|
"grad_norm": 0.21362553536891937, |
|
"learning_rate": 8.176546914281081e-06, |
|
"loss": 1.1311, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.7379176127148881, |
|
"grad_norm": 1.1738766431808472, |
|
"learning_rate": 8.156272808369151e-06, |
|
"loss": 1.0796, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.7460265974699968, |
|
"grad_norm": 2.821784257888794, |
|
"learning_rate": 8.135998702457221e-06, |
|
"loss": 1.116, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.7541355822251055, |
|
"grad_norm": 2.0731353759765625, |
|
"learning_rate": 8.115724596545294e-06, |
|
"loss": 1.1121, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.762244566980214, |
|
"grad_norm": 2.632211685180664, |
|
"learning_rate": 8.095450490633364e-06, |
|
"loss": 1.0747, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.7703535517353227, |
|
"grad_norm": 2.5461819171905518, |
|
"learning_rate": 8.075176384721434e-06, |
|
"loss": 1.0444, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.7784625364904314, |
|
"grad_norm": 5.323375225067139, |
|
"learning_rate": 8.054902278809506e-06, |
|
"loss": 1.1097, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.7865715212455401, |
|
"grad_norm": 3.063811779022217, |
|
"learning_rate": 8.034628172897576e-06, |
|
"loss": 1.1502, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.7946805060006488, |
|
"grad_norm": 2.1988182067871094, |
|
"learning_rate": 8.014354066985647e-06, |
|
"loss": 1.1024, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.8027894907557573, |
|
"grad_norm": 2.6995670795440674, |
|
"learning_rate": 7.994079961073717e-06, |
|
"loss": 1.1678, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.810898475510866, |
|
"grad_norm": 0.982642412185669, |
|
"learning_rate": 7.973805855161789e-06, |
|
"loss": 1.1194, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8190074602659747, |
|
"grad_norm": 1.8992295265197754, |
|
"learning_rate": 7.953531749249859e-06, |
|
"loss": 1.1394, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.8271164450210834, |
|
"grad_norm": 1.253397822380066, |
|
"learning_rate": 7.933257643337929e-06, |
|
"loss": 1.0602, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.8352254297761921, |
|
"grad_norm": 2.562586784362793, |
|
"learning_rate": 7.912983537426e-06, |
|
"loss": 1.1393, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.8433344145313006, |
|
"grad_norm": 3.5401248931884766, |
|
"learning_rate": 7.89270943151407e-06, |
|
"loss": 1.0614, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.8514433992864093, |
|
"grad_norm": 1.610593318939209, |
|
"learning_rate": 7.872435325602142e-06, |
|
"loss": 1.0493, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.859552384041518, |
|
"grad_norm": 1.8910722732543945, |
|
"learning_rate": 7.852161219690212e-06, |
|
"loss": 1.1287, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.8676613687966267, |
|
"grad_norm": 1.915490746498108, |
|
"learning_rate": 7.831887113778283e-06, |
|
"loss": 1.0797, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.8757703535517353, |
|
"grad_norm": 3.8769915103912354, |
|
"learning_rate": 7.811613007866353e-06, |
|
"loss": 1.1077, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.883879338306844, |
|
"grad_norm": 0.7856437563896179, |
|
"learning_rate": 7.791338901954425e-06, |
|
"loss": 1.0537, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.8919883230619526, |
|
"grad_norm": 1.956763744354248, |
|
"learning_rate": 7.771064796042496e-06, |
|
"loss": 1.0882, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.9000973078170613, |
|
"grad_norm": 2.763761043548584, |
|
"learning_rate": 7.750790690130566e-06, |
|
"loss": 1.1859, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.90820629257217, |
|
"grad_norm": 1.743696689605713, |
|
"learning_rate": 7.730516584218636e-06, |
|
"loss": 1.0828, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.9163152773272786, |
|
"grad_norm": 1.8948084115982056, |
|
"learning_rate": 7.710242478306708e-06, |
|
"loss": 1.1487, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.9244242620823873, |
|
"grad_norm": 1.9413537979125977, |
|
"learning_rate": 7.689968372394778e-06, |
|
"loss": 1.0365, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.9325332468374959, |
|
"grad_norm": 1.3905330896377563, |
|
"learning_rate": 7.66969426648285e-06, |
|
"loss": 1.1288, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.9406422315926046, |
|
"grad_norm": 2.7509984970092773, |
|
"learning_rate": 7.649420160570919e-06, |
|
"loss": 1.1158, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.9487512163477133, |
|
"grad_norm": 2.0269293785095215, |
|
"learning_rate": 7.62914605465899e-06, |
|
"loss": 1.0666, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.9568602011028219, |
|
"grad_norm": 2.457383632659912, |
|
"learning_rate": 7.608871948747061e-06, |
|
"loss": 1.0743, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.9649691858579306, |
|
"grad_norm": 1.9480834007263184, |
|
"learning_rate": 7.588597842835131e-06, |
|
"loss": 1.0856, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.9730781706130393, |
|
"grad_norm": 2.421600103378296, |
|
"learning_rate": 7.568323736923202e-06, |
|
"loss": 1.1044, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.9811871553681479, |
|
"grad_norm": 1.9701539278030396, |
|
"learning_rate": 7.548049631011273e-06, |
|
"loss": 1.0493, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.9892961401232566, |
|
"grad_norm": 3.0008904933929443, |
|
"learning_rate": 7.527775525099344e-06, |
|
"loss": 1.057, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.9974051248783652, |
|
"grad_norm": 2.9297544956207275, |
|
"learning_rate": 7.507501419187414e-06, |
|
"loss": 1.0695, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 1.0055141096334739, |
|
"grad_norm": 8.883905410766602, |
|
"learning_rate": 7.487227313275484e-06, |
|
"loss": 1.1178, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.0136230943885824, |
|
"grad_norm": 1.9629335403442383, |
|
"learning_rate": 7.466953207363555e-06, |
|
"loss": 1.1246, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 1.0217320791436912, |
|
"grad_norm": 1.3846949338912964, |
|
"learning_rate": 7.4466791014516275e-06, |
|
"loss": 1.1218, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.0298410638987998, |
|
"grad_norm": 1.8666070699691772, |
|
"learning_rate": 7.426404995539697e-06, |
|
"loss": 1.072, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 1.0379500486539086, |
|
"grad_norm": 2.517359972000122, |
|
"learning_rate": 7.406130889627768e-06, |
|
"loss": 1.1204, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.0460590334090172, |
|
"grad_norm": 10.04233455657959, |
|
"learning_rate": 7.385856783715839e-06, |
|
"loss": 1.1025, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 1.0541680181641258, |
|
"grad_norm": 2.2324578762054443, |
|
"learning_rate": 7.36558267780391e-06, |
|
"loss": 1.0799, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.0622770029192345, |
|
"grad_norm": 1.6139917373657227, |
|
"learning_rate": 7.34530857189198e-06, |
|
"loss": 1.1512, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 1.0703859876743431, |
|
"grad_norm": 1.7317317724227905, |
|
"learning_rate": 7.325034465980051e-06, |
|
"loss": 1.0947, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.078494972429452, |
|
"grad_norm": 1.3901951313018799, |
|
"learning_rate": 7.304760360068122e-06, |
|
"loss": 1.1165, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 1.0866039571845605, |
|
"grad_norm": 2.5908029079437256, |
|
"learning_rate": 7.284486254156193e-06, |
|
"loss": 1.0484, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.094712941939669, |
|
"grad_norm": 2.977522850036621, |
|
"learning_rate": 7.2642121482442626e-06, |
|
"loss": 1.0471, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 1.1028219266947779, |
|
"grad_norm": 1.2937211990356445, |
|
"learning_rate": 7.243938042332333e-06, |
|
"loss": 1.1202, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.1109309114498864, |
|
"grad_norm": 2.9973866939544678, |
|
"learning_rate": 7.223663936420404e-06, |
|
"loss": 1.0414, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 1.1190398962049952, |
|
"grad_norm": 5.904523849487305, |
|
"learning_rate": 7.203389830508475e-06, |
|
"loss": 1.0944, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.1271488809601038, |
|
"grad_norm": 4.078853130340576, |
|
"learning_rate": 7.1831157245965456e-06, |
|
"loss": 1.0421, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 1.1352578657152124, |
|
"grad_norm": 2.3049638271331787, |
|
"learning_rate": 7.162841618684616e-06, |
|
"loss": 1.1185, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.1433668504703212, |
|
"grad_norm": 1.740246057510376, |
|
"learning_rate": 7.142567512772687e-06, |
|
"loss": 1.0862, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 1.1514758352254297, |
|
"grad_norm": 7.726000785827637, |
|
"learning_rate": 7.122293406860759e-06, |
|
"loss": 1.0821, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.1595848199805385, |
|
"grad_norm": 0.24267837405204773, |
|
"learning_rate": 7.1020193009488294e-06, |
|
"loss": 1.0419, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 1.167693804735647, |
|
"grad_norm": 1.9721111059188843, |
|
"learning_rate": 7.0817451950369e-06, |
|
"loss": 1.0959, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.1758027894907557, |
|
"grad_norm": 1.5573512315750122, |
|
"learning_rate": 7.06147108912497e-06, |
|
"loss": 1.0827, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 1.1839117742458645, |
|
"grad_norm": 2.4766602516174316, |
|
"learning_rate": 7.041196983213041e-06, |
|
"loss": 1.0144, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.192020759000973, |
|
"grad_norm": 3.357357978820801, |
|
"learning_rate": 7.020922877301112e-06, |
|
"loss": 1.0686, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 1.2001297437560818, |
|
"grad_norm": 1.6499881744384766, |
|
"learning_rate": 7.000648771389182e-06, |
|
"loss": 1.0918, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.2082387285111904, |
|
"grad_norm": 2.088510751724243, |
|
"learning_rate": 6.980374665477253e-06, |
|
"loss": 1.0963, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 1.216347713266299, |
|
"grad_norm": 2.040971040725708, |
|
"learning_rate": 6.960100559565324e-06, |
|
"loss": 1.1578, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.2244566980214078, |
|
"grad_norm": 1.381493091583252, |
|
"learning_rate": 6.939826453653395e-06, |
|
"loss": 1.1057, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 1.2325656827765163, |
|
"grad_norm": 1.6742016077041626, |
|
"learning_rate": 6.919552347741465e-06, |
|
"loss": 1.1118, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.240674667531625, |
|
"grad_norm": 0.9264686107635498, |
|
"learning_rate": 6.899278241829535e-06, |
|
"loss": 1.0471, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 1.2487836522867337, |
|
"grad_norm": 3.0064830780029297, |
|
"learning_rate": 6.879004135917606e-06, |
|
"loss": 1.0854, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.2568926370418423, |
|
"grad_norm": 0.9564074873924255, |
|
"learning_rate": 6.858730030005677e-06, |
|
"loss": 1.1668, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 1.265001621796951, |
|
"grad_norm": 7.573803901672363, |
|
"learning_rate": 6.8384559240937475e-06, |
|
"loss": 1.1087, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.2731106065520597, |
|
"grad_norm": 6.813046455383301, |
|
"learning_rate": 6.818181818181818e-06, |
|
"loss": 1.1263, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 1.2812195913071682, |
|
"grad_norm": 1.5467007160186768, |
|
"learning_rate": 6.79790771226989e-06, |
|
"loss": 1.1001, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.289328576062277, |
|
"grad_norm": 1.5642696619033813, |
|
"learning_rate": 6.777633606357961e-06, |
|
"loss": 1.1252, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 1.2974375608173856, |
|
"grad_norm": 5.454095363616943, |
|
"learning_rate": 6.757359500446031e-06, |
|
"loss": 1.0533, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.3055465455724944, |
|
"grad_norm": 2.569298028945923, |
|
"learning_rate": 6.737085394534102e-06, |
|
"loss": 1.1265, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 1.313655530327603, |
|
"grad_norm": 0.9834175705909729, |
|
"learning_rate": 6.716811288622173e-06, |
|
"loss": 1.1177, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.3217645150827115, |
|
"grad_norm": 1.9268743991851807, |
|
"learning_rate": 6.696537182710244e-06, |
|
"loss": 1.0928, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 1.3298734998378203, |
|
"grad_norm": 2.7169127464294434, |
|
"learning_rate": 6.6762630767983135e-06, |
|
"loss": 1.0943, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.337982484592929, |
|
"grad_norm": 1.8307722806930542, |
|
"learning_rate": 6.655988970886384e-06, |
|
"loss": 1.0697, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 1.3460914693480377, |
|
"grad_norm": 1.8044813871383667, |
|
"learning_rate": 6.635714864974455e-06, |
|
"loss": 1.1663, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.3542004541031463, |
|
"grad_norm": 0.26450178027153015, |
|
"learning_rate": 6.615440759062526e-06, |
|
"loss": 1.0665, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 1.3623094388582548, |
|
"grad_norm": 5.571292877197266, |
|
"learning_rate": 6.5951666531505966e-06, |
|
"loss": 1.0099, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.3704184236133636, |
|
"grad_norm": 3.6548831462860107, |
|
"learning_rate": 6.574892547238667e-06, |
|
"loss": 1.0948, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 1.3785274083684722, |
|
"grad_norm": 1.9964606761932373, |
|
"learning_rate": 6.554618441326738e-06, |
|
"loss": 1.079, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.386636393123581, |
|
"grad_norm": 9.50430965423584, |
|
"learning_rate": 6.534344335414808e-06, |
|
"loss": 1.093, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 1.3947453778786896, |
|
"grad_norm": 2.573894500732422, |
|
"learning_rate": 6.514070229502879e-06, |
|
"loss": 1.1444, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 1.4028543626337981, |
|
"grad_norm": 2.370356559753418, |
|
"learning_rate": 6.4937961235909495e-06, |
|
"loss": 1.1342, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 1.410963347388907, |
|
"grad_norm": 2.698498487472534, |
|
"learning_rate": 6.47352201767902e-06, |
|
"loss": 1.1387, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 1.4190723321440155, |
|
"grad_norm": 1.4973636865615845, |
|
"learning_rate": 6.453247911767092e-06, |
|
"loss": 1.077, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 1.4271813168991243, |
|
"grad_norm": 2.350057601928711, |
|
"learning_rate": 6.432973805855163e-06, |
|
"loss": 1.0281, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 1.4352903016542329, |
|
"grad_norm": 2.5107436180114746, |
|
"learning_rate": 6.412699699943233e-06, |
|
"loss": 1.0517, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 1.4433992864093415, |
|
"grad_norm": 2.506737470626831, |
|
"learning_rate": 6.392425594031304e-06, |
|
"loss": 1.0565, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 1.4515082711644502, |
|
"grad_norm": 2.0631542205810547, |
|
"learning_rate": 6.372151488119375e-06, |
|
"loss": 1.0925, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 1.4596172559195588, |
|
"grad_norm": 1.4057486057281494, |
|
"learning_rate": 6.351877382207446e-06, |
|
"loss": 1.1257, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.4677262406746676, |
|
"grad_norm": 1.5711629390716553, |
|
"learning_rate": 6.331603276295516e-06, |
|
"loss": 1.0615, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 1.4758352254297762, |
|
"grad_norm": 3.042393922805786, |
|
"learning_rate": 6.311329170383586e-06, |
|
"loss": 1.0664, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 1.4839442101848848, |
|
"grad_norm": 2.0653626918792725, |
|
"learning_rate": 6.291055064471657e-06, |
|
"loss": 1.0039, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 1.4920531949399936, |
|
"grad_norm": 2.374119281768799, |
|
"learning_rate": 6.270780958559728e-06, |
|
"loss": 1.0053, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 1.5001621796951021, |
|
"grad_norm": 2.568708896636963, |
|
"learning_rate": 6.2505068526477985e-06, |
|
"loss": 1.0484, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 1.508271164450211, |
|
"grad_norm": 2.4763758182525635, |
|
"learning_rate": 6.230232746735869e-06, |
|
"loss": 1.0998, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 1.5163801492053195, |
|
"grad_norm": 1.8199087381362915, |
|
"learning_rate": 6.20995864082394e-06, |
|
"loss": 1.1189, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 1.524489133960428, |
|
"grad_norm": 4.159176826477051, |
|
"learning_rate": 6.189684534912011e-06, |
|
"loss": 1.1606, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 1.5325981187155369, |
|
"grad_norm": 1.266381025314331, |
|
"learning_rate": 6.1694104290000815e-06, |
|
"loss": 1.039, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 1.5407071034706454, |
|
"grad_norm": 1.114743947982788, |
|
"learning_rate": 6.1491363230881514e-06, |
|
"loss": 1.0937, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.5488160882257542, |
|
"grad_norm": 4.52326774597168, |
|
"learning_rate": 6.128862217176224e-06, |
|
"loss": 1.0782, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 1.5569250729808628, |
|
"grad_norm": 2.066561460494995, |
|
"learning_rate": 6.108588111264294e-06, |
|
"loss": 1.0033, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 1.5650340577359714, |
|
"grad_norm": 3.260119676589966, |
|
"learning_rate": 6.0883140053523645e-06, |
|
"loss": 1.063, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 1.5731430424910802, |
|
"grad_norm": 3.0911264419555664, |
|
"learning_rate": 6.068039899440435e-06, |
|
"loss": 1.0468, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 1.5812520272461887, |
|
"grad_norm": 6.644060134887695, |
|
"learning_rate": 6.047765793528506e-06, |
|
"loss": 1.0236, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 1.5893610120012975, |
|
"grad_norm": 2.7163946628570557, |
|
"learning_rate": 6.027491687616577e-06, |
|
"loss": 1.0351, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 1.597469996756406, |
|
"grad_norm": 2.4623427391052246, |
|
"learning_rate": 6.0072175817046475e-06, |
|
"loss": 1.1016, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 1.6055789815115147, |
|
"grad_norm": 2.7548983097076416, |
|
"learning_rate": 5.986943475792718e-06, |
|
"loss": 1.0571, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 1.6136879662666233, |
|
"grad_norm": 2.269378423690796, |
|
"learning_rate": 5.966669369880789e-06, |
|
"loss": 1.0962, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 1.621796951021732, |
|
"grad_norm": 2.0521297454833984, |
|
"learning_rate": 5.946395263968859e-06, |
|
"loss": 1.1202, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.6299059357768408, |
|
"grad_norm": 2.488682985305786, |
|
"learning_rate": 5.92612115805693e-06, |
|
"loss": 1.0879, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 1.6380149205319494, |
|
"grad_norm": 2.8953864574432373, |
|
"learning_rate": 5.9058470521450005e-06, |
|
"loss": 1.1048, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 1.646123905287058, |
|
"grad_norm": 2.272630214691162, |
|
"learning_rate": 5.885572946233071e-06, |
|
"loss": 1.0903, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 1.6542328900421666, |
|
"grad_norm": 2.1369316577911377, |
|
"learning_rate": 5.865298840321142e-06, |
|
"loss": 1.0911, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 1.6623418747972754, |
|
"grad_norm": 3.1257848739624023, |
|
"learning_rate": 5.845024734409213e-06, |
|
"loss": 1.066, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 1.6704508595523841, |
|
"grad_norm": 2.81978440284729, |
|
"learning_rate": 5.8247506284972835e-06, |
|
"loss": 1.1032, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 1.6785598443074927, |
|
"grad_norm": 1.2515383958816528, |
|
"learning_rate": 5.804476522585355e-06, |
|
"loss": 1.0646, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 1.6866688290626013, |
|
"grad_norm": 2.6792616844177246, |
|
"learning_rate": 5.784202416673426e-06, |
|
"loss": 1.0804, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 1.6947778138177099, |
|
"grad_norm": 2.989546537399292, |
|
"learning_rate": 5.763928310761497e-06, |
|
"loss": 1.1245, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 1.7028867985728187, |
|
"grad_norm": 3.0825021266937256, |
|
"learning_rate": 5.743654204849567e-06, |
|
"loss": 1.1059, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.7109957833279275, |
|
"grad_norm": 2.1985509395599365, |
|
"learning_rate": 5.723380098937637e-06, |
|
"loss": 1.1493, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 1.719104768083036, |
|
"grad_norm": 2.780210256576538, |
|
"learning_rate": 5.703105993025708e-06, |
|
"loss": 1.1207, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 1.7272137528381446, |
|
"grad_norm": 1.8562790155410767, |
|
"learning_rate": 5.682831887113779e-06, |
|
"loss": 1.0158, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 1.7353227375932532, |
|
"grad_norm": 1.9536188840866089, |
|
"learning_rate": 5.6625577812018495e-06, |
|
"loss": 1.113, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 1.743431722348362, |
|
"grad_norm": 7.557844638824463, |
|
"learning_rate": 5.64228367528992e-06, |
|
"loss": 1.0133, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 1.7515407071034708, |
|
"grad_norm": 6.065642356872559, |
|
"learning_rate": 5.622009569377991e-06, |
|
"loss": 1.1057, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 1.7596496918585793, |
|
"grad_norm": 2.0582070350646973, |
|
"learning_rate": 5.601735463466062e-06, |
|
"loss": 1.0018, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 1.767758676613688, |
|
"grad_norm": 1.545466423034668, |
|
"learning_rate": 5.5814613575541325e-06, |
|
"loss": 0.9689, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 1.7758676613687965, |
|
"grad_norm": 2.50724196434021, |
|
"learning_rate": 5.561187251642202e-06, |
|
"loss": 1.0578, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 1.7839766461239053, |
|
"grad_norm": 3.886679172515869, |
|
"learning_rate": 5.540913145730273e-06, |
|
"loss": 1.0745, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.792085630879014, |
|
"grad_norm": 3.0689311027526855, |
|
"learning_rate": 5.520639039818344e-06, |
|
"loss": 1.0816, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 1.8001946156341226, |
|
"grad_norm": 2.43038010597229, |
|
"learning_rate": 5.500364933906415e-06, |
|
"loss": 1.1257, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 1.8083036003892312, |
|
"grad_norm": 1.9623647928237915, |
|
"learning_rate": 5.480090827994486e-06, |
|
"loss": 1.072, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 1.8164125851443398, |
|
"grad_norm": 2.903964042663574, |
|
"learning_rate": 5.459816722082557e-06, |
|
"loss": 1.1159, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 1.8245215698994486, |
|
"grad_norm": 3.9385204315185547, |
|
"learning_rate": 5.439542616170628e-06, |
|
"loss": 1.0296, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 1.8326305546545574, |
|
"grad_norm": 3.2969629764556885, |
|
"learning_rate": 5.4192685102586985e-06, |
|
"loss": 1.0818, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 1.840739539409666, |
|
"grad_norm": 6.509484767913818, |
|
"learning_rate": 5.398994404346769e-06, |
|
"loss": 1.0848, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 1.8488485241647745, |
|
"grad_norm": 3.701723098754883, |
|
"learning_rate": 5.37872029843484e-06, |
|
"loss": 0.9775, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 1.856957508919883, |
|
"grad_norm": 2.0968496799468994, |
|
"learning_rate": 5.35844619252291e-06, |
|
"loss": 1.1417, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 1.8650664936749919, |
|
"grad_norm": 2.2569050788879395, |
|
"learning_rate": 5.338172086610981e-06, |
|
"loss": 1.0199, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.8731754784301007, |
|
"grad_norm": 8.650045394897461, |
|
"learning_rate": 5.3178979806990514e-06, |
|
"loss": 1.1163, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 1.8812844631852093, |
|
"grad_norm": 3.9758262634277344, |
|
"learning_rate": 5.297623874787122e-06, |
|
"loss": 0.9833, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 1.8893934479403178, |
|
"grad_norm": 2.467902183532715, |
|
"learning_rate": 5.277349768875193e-06, |
|
"loss": 1.0529, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 1.8975024326954264, |
|
"grad_norm": 3.978503465652466, |
|
"learning_rate": 5.257075662963264e-06, |
|
"loss": 1.2041, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 1.9056114174505352, |
|
"grad_norm": 2.431842803955078, |
|
"learning_rate": 5.2368015570513345e-06, |
|
"loss": 1.0279, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 1.913720402205644, |
|
"grad_norm": 2.4601986408233643, |
|
"learning_rate": 5.216527451139405e-06, |
|
"loss": 1.1013, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 1.9218293869607526, |
|
"grad_norm": 2.702439546585083, |
|
"learning_rate": 5.196253345227475e-06, |
|
"loss": 1.0153, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 1.9299383717158611, |
|
"grad_norm": 1.80119788646698, |
|
"learning_rate": 5.175979239315546e-06, |
|
"loss": 1.0995, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 1.9380473564709697, |
|
"grad_norm": 2.78031587600708, |
|
"learning_rate": 5.155705133403617e-06, |
|
"loss": 1.0488, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 1.9461563412260785, |
|
"grad_norm": 1.3408476114273071, |
|
"learning_rate": 5.135431027491688e-06, |
|
"loss": 1.0176, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.9542653259811873, |
|
"grad_norm": 2.418548107147217, |
|
"learning_rate": 5.115156921579759e-06, |
|
"loss": 1.0323, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 1.9623743107362959, |
|
"grad_norm": 2.1113944053649902, |
|
"learning_rate": 5.09488281566783e-06, |
|
"loss": 1.0717, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 1.9704832954914044, |
|
"grad_norm": 1.4384586811065674, |
|
"learning_rate": 5.0746087097559005e-06, |
|
"loss": 1.053, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 1.978592280246513, |
|
"grad_norm": 2.836148738861084, |
|
"learning_rate": 5.054334603843971e-06, |
|
"loss": 0.9988, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 1.9867012650016218, |
|
"grad_norm": 1.2631614208221436, |
|
"learning_rate": 5.034060497932042e-06, |
|
"loss": 1.0348, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 1.9948102497567306, |
|
"grad_norm": 5.070575714111328, |
|
"learning_rate": 5.013786392020113e-06, |
|
"loss": 1.0272, |
|
"step": 12300 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 24664, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.8392191071174525e+19, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|