ACRE-Qwen-2.5-7B-instruct / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 12332,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008108984755108661,
"grad_norm": 1.9572986364364624,
"learning_rate": 9.980942340442787e-06,
"loss": 1.3961,
"step": 50
},
{
"epoch": 0.016217969510217322,
"grad_norm": 2.140582323074341,
"learning_rate": 9.960668234530858e-06,
"loss": 1.2721,
"step": 100
},
{
"epoch": 0.02432695426532598,
"grad_norm": 2.600647211074829,
"learning_rate": 9.940394128618928e-06,
"loss": 1.2162,
"step": 150
},
{
"epoch": 0.032435939020434644,
"grad_norm": 2.521652936935425,
"learning_rate": 9.920120022707e-06,
"loss": 1.2469,
"step": 200
},
{
"epoch": 0.0405449237755433,
"grad_norm": 2.509162425994873,
"learning_rate": 9.89984591679507e-06,
"loss": 1.2162,
"step": 250
},
{
"epoch": 0.04865390853065196,
"grad_norm": 4.292358875274658,
"learning_rate": 9.879571810883141e-06,
"loss": 1.2096,
"step": 300
},
{
"epoch": 0.056762893285760625,
"grad_norm": 2.3972127437591553,
"learning_rate": 9.859297704971211e-06,
"loss": 1.1857,
"step": 350
},
{
"epoch": 0.06487187804086929,
"grad_norm": 1.8880646228790283,
"learning_rate": 9.839023599059281e-06,
"loss": 1.1973,
"step": 400
},
{
"epoch": 0.07298086279597794,
"grad_norm": 1.4280647039413452,
"learning_rate": 9.818749493147353e-06,
"loss": 1.2305,
"step": 450
},
{
"epoch": 0.0810898475510866,
"grad_norm": 1.4675335884094238,
"learning_rate": 9.798475387235423e-06,
"loss": 1.2302,
"step": 500
},
{
"epoch": 0.08919883230619527,
"grad_norm": 3.1858043670654297,
"learning_rate": 9.778201281323494e-06,
"loss": 1.1004,
"step": 550
},
{
"epoch": 0.09730781706130393,
"grad_norm": 1.4137463569641113,
"learning_rate": 9.757927175411566e-06,
"loss": 1.1911,
"step": 600
},
{
"epoch": 0.10541680181641258,
"grad_norm": 1.389324426651001,
"learning_rate": 9.737653069499636e-06,
"loss": 1.1752,
"step": 650
},
{
"epoch": 0.11352578657152125,
"grad_norm": 2.291736602783203,
"learning_rate": 9.717378963587707e-06,
"loss": 1.135,
"step": 700
},
{
"epoch": 0.1216347713266299,
"grad_norm": 1.8395874500274658,
"learning_rate": 9.697104857675777e-06,
"loss": 1.1551,
"step": 750
},
{
"epoch": 0.12974375608173858,
"grad_norm": 1.9675475358963013,
"learning_rate": 9.676830751763849e-06,
"loss": 1.1685,
"step": 800
},
{
"epoch": 0.13785274083684723,
"grad_norm": 2.3232228755950928,
"learning_rate": 9.656556645851919e-06,
"loss": 1.1895,
"step": 850
},
{
"epoch": 0.1459617255919559,
"grad_norm": 1.8441636562347412,
"learning_rate": 9.636282539939989e-06,
"loss": 1.1697,
"step": 900
},
{
"epoch": 0.15407071034706454,
"grad_norm": 2.008300542831421,
"learning_rate": 9.61600843402806e-06,
"loss": 1.1553,
"step": 950
},
{
"epoch": 0.1621796951021732,
"grad_norm": 1.5163062810897827,
"learning_rate": 9.59573432811613e-06,
"loss": 1.1739,
"step": 1000
},
{
"epoch": 0.17028867985728185,
"grad_norm": 1.4669376611709595,
"learning_rate": 9.575460222204202e-06,
"loss": 1.1582,
"step": 1050
},
{
"epoch": 0.17839766461239054,
"grad_norm": 2.10261607170105,
"learning_rate": 9.555186116292272e-06,
"loss": 1.1284,
"step": 1100
},
{
"epoch": 0.1865066493674992,
"grad_norm": 1.8615520000457764,
"learning_rate": 9.534912010380343e-06,
"loss": 1.0916,
"step": 1150
},
{
"epoch": 0.19461563412260785,
"grad_norm": 3.1653287410736084,
"learning_rate": 9.514637904468413e-06,
"loss": 1.2177,
"step": 1200
},
{
"epoch": 0.2027246188777165,
"grad_norm": 2.7635200023651123,
"learning_rate": 9.494363798556485e-06,
"loss": 1.1217,
"step": 1250
},
{
"epoch": 0.21083360363282516,
"grad_norm": 1.2704397439956665,
"learning_rate": 9.474089692644555e-06,
"loss": 1.2341,
"step": 1300
},
{
"epoch": 0.21894258838793382,
"grad_norm": 5.202953815460205,
"learning_rate": 9.453815586732625e-06,
"loss": 1.2126,
"step": 1350
},
{
"epoch": 0.2270515731430425,
"grad_norm": 2.3350253105163574,
"learning_rate": 9.433541480820698e-06,
"loss": 1.1721,
"step": 1400
},
{
"epoch": 0.23516055789815116,
"grad_norm": 1.807605266571045,
"learning_rate": 9.413267374908768e-06,
"loss": 1.1475,
"step": 1450
},
{
"epoch": 0.2432695426532598,
"grad_norm": 1.765973448753357,
"learning_rate": 9.392993268996838e-06,
"loss": 1.1572,
"step": 1500
},
{
"epoch": 0.25137852740836847,
"grad_norm": 1.1422791481018066,
"learning_rate": 9.37271916308491e-06,
"loss": 1.1987,
"step": 1550
},
{
"epoch": 0.25948751216347715,
"grad_norm": 2.6164450645446777,
"learning_rate": 9.35244505717298e-06,
"loss": 1.2152,
"step": 1600
},
{
"epoch": 0.2675964969185858,
"grad_norm": 0.7747199535369873,
"learning_rate": 9.332170951261051e-06,
"loss": 1.1686,
"step": 1650
},
{
"epoch": 0.27570548167369446,
"grad_norm": 2.8744866847991943,
"learning_rate": 9.31189684534912e-06,
"loss": 1.1571,
"step": 1700
},
{
"epoch": 0.2838144664288031,
"grad_norm": 3.669771671295166,
"learning_rate": 9.291622739437192e-06,
"loss": 1.1609,
"step": 1750
},
{
"epoch": 0.2919234511839118,
"grad_norm": 1.5796583890914917,
"learning_rate": 9.271348633525262e-06,
"loss": 1.1873,
"step": 1800
},
{
"epoch": 0.30003243593902046,
"grad_norm": 24.162195205688477,
"learning_rate": 9.251074527613332e-06,
"loss": 1.071,
"step": 1850
},
{
"epoch": 0.3081414206941291,
"grad_norm": 2.8964436054229736,
"learning_rate": 9.230800421701404e-06,
"loss": 1.1767,
"step": 1900
},
{
"epoch": 0.31625040544923777,
"grad_norm": 2.9506802558898926,
"learning_rate": 9.210526315789474e-06,
"loss": 1.1566,
"step": 1950
},
{
"epoch": 0.3243593902043464,
"grad_norm": 3.0490458011627197,
"learning_rate": 9.190252209877545e-06,
"loss": 1.1563,
"step": 2000
},
{
"epoch": 0.3324683749594551,
"grad_norm": 2.236199378967285,
"learning_rate": 9.169978103965615e-06,
"loss": 1.1493,
"step": 2050
},
{
"epoch": 0.3405773597145637,
"grad_norm": 2.1622684001922607,
"learning_rate": 9.149703998053687e-06,
"loss": 1.1613,
"step": 2100
},
{
"epoch": 0.3486863444696724,
"grad_norm": 1.634074330329895,
"learning_rate": 9.129429892141757e-06,
"loss": 1.2061,
"step": 2150
},
{
"epoch": 0.3567953292247811,
"grad_norm": 1.8448489904403687,
"learning_rate": 9.109155786229828e-06,
"loss": 1.1047,
"step": 2200
},
{
"epoch": 0.3649043139798897,
"grad_norm": 2.2948291301727295,
"learning_rate": 9.0888816803179e-06,
"loss": 1.0898,
"step": 2250
},
{
"epoch": 0.3730132987349984,
"grad_norm": 2.0072033405303955,
"learning_rate": 9.06860757440597e-06,
"loss": 1.1275,
"step": 2300
},
{
"epoch": 0.381122283490107,
"grad_norm": 1.8813329935073853,
"learning_rate": 9.04833346849404e-06,
"loss": 1.143,
"step": 2350
},
{
"epoch": 0.3892312682452157,
"grad_norm": 2.1367673873901367,
"learning_rate": 9.028059362582111e-06,
"loss": 1.1655,
"step": 2400
},
{
"epoch": 0.3973402530003244,
"grad_norm": 0.8832216858863831,
"learning_rate": 9.007785256670181e-06,
"loss": 1.2366,
"step": 2450
},
{
"epoch": 0.405449237755433,
"grad_norm": 2.3302135467529297,
"learning_rate": 8.987511150758253e-06,
"loss": 1.1629,
"step": 2500
},
{
"epoch": 0.4135582225105417,
"grad_norm": 2.1263036727905273,
"learning_rate": 8.967237044846323e-06,
"loss": 1.1454,
"step": 2550
},
{
"epoch": 0.4216672072656503,
"grad_norm": 1.9923527240753174,
"learning_rate": 8.946962938934394e-06,
"loss": 1.1205,
"step": 2600
},
{
"epoch": 0.429776192020759,
"grad_norm": 3.215719699859619,
"learning_rate": 8.926688833022464e-06,
"loss": 1.0715,
"step": 2650
},
{
"epoch": 0.43788517677586763,
"grad_norm": 0.8249936103820801,
"learning_rate": 8.906414727110536e-06,
"loss": 1.1257,
"step": 2700
},
{
"epoch": 0.4459941615309763,
"grad_norm": 7.653429985046387,
"learning_rate": 8.886140621198606e-06,
"loss": 1.1224,
"step": 2750
},
{
"epoch": 0.454103146286085,
"grad_norm": 1.5797380208969116,
"learning_rate": 8.865866515286676e-06,
"loss": 1.1456,
"step": 2800
},
{
"epoch": 0.46221213104119363,
"grad_norm": 1.7436014413833618,
"learning_rate": 8.845592409374747e-06,
"loss": 1.1848,
"step": 2850
},
{
"epoch": 0.4703211157963023,
"grad_norm": 2.171922206878662,
"learning_rate": 8.825318303462817e-06,
"loss": 1.1075,
"step": 2900
},
{
"epoch": 0.47843010055141094,
"grad_norm": 2.2190613746643066,
"learning_rate": 8.805044197550889e-06,
"loss": 1.11,
"step": 2950
},
{
"epoch": 0.4865390853065196,
"grad_norm": 1.689220905303955,
"learning_rate": 8.784770091638959e-06,
"loss": 1.0315,
"step": 3000
},
{
"epoch": 0.4946480700616283,
"grad_norm": 2.075385332107544,
"learning_rate": 8.76449598572703e-06,
"loss": 1.0846,
"step": 3050
},
{
"epoch": 0.5027570548167369,
"grad_norm": 1.8431355953216553,
"learning_rate": 8.744221879815102e-06,
"loss": 1.1823,
"step": 3100
},
{
"epoch": 0.5108660395718456,
"grad_norm": 2.7447328567504883,
"learning_rate": 8.723947773903172e-06,
"loss": 1.1195,
"step": 3150
},
{
"epoch": 0.5189750243269543,
"grad_norm": 1.8737456798553467,
"learning_rate": 8.703673667991243e-06,
"loss": 1.1918,
"step": 3200
},
{
"epoch": 0.5270840090820629,
"grad_norm": 3.270134210586548,
"learning_rate": 8.683399562079313e-06,
"loss": 1.0992,
"step": 3250
},
{
"epoch": 0.5351929938371716,
"grad_norm": 2.3992574214935303,
"learning_rate": 8.663125456167383e-06,
"loss": 1.168,
"step": 3300
},
{
"epoch": 0.5433019785922802,
"grad_norm": 2.5086565017700195,
"learning_rate": 8.642851350255455e-06,
"loss": 1.104,
"step": 3350
},
{
"epoch": 0.5514109633473889,
"grad_norm": 2.5543065071105957,
"learning_rate": 8.622577244343525e-06,
"loss": 1.1699,
"step": 3400
},
{
"epoch": 0.5595199481024976,
"grad_norm": 1.8610668182373047,
"learning_rate": 8.602303138431596e-06,
"loss": 1.0648,
"step": 3450
},
{
"epoch": 0.5676289328576062,
"grad_norm": 0.530300498008728,
"learning_rate": 8.582029032519666e-06,
"loss": 1.0437,
"step": 3500
},
{
"epoch": 0.5757379176127149,
"grad_norm": 2.2418153285980225,
"learning_rate": 8.561754926607738e-06,
"loss": 1.0542,
"step": 3550
},
{
"epoch": 0.5838469023678236,
"grad_norm": 3.173265218734741,
"learning_rate": 8.541480820695808e-06,
"loss": 1.1204,
"step": 3600
},
{
"epoch": 0.5919558871229322,
"grad_norm": 1.4262479543685913,
"learning_rate": 8.521206714783878e-06,
"loss": 1.1228,
"step": 3650
},
{
"epoch": 0.6000648718780409,
"grad_norm": 6.2367377281188965,
"learning_rate": 8.50093260887195e-06,
"loss": 1.1236,
"step": 3700
},
{
"epoch": 0.6081738566331495,
"grad_norm": 1.1879503726959229,
"learning_rate": 8.480658502960019e-06,
"loss": 1.1394,
"step": 3750
},
{
"epoch": 0.6162828413882582,
"grad_norm": 1.846981167793274,
"learning_rate": 8.46038439704809e-06,
"loss": 1.1356,
"step": 3800
},
{
"epoch": 0.6243918261433669,
"grad_norm": 1.8748732805252075,
"learning_rate": 8.440110291136162e-06,
"loss": 1.2168,
"step": 3850
},
{
"epoch": 0.6325008108984755,
"grad_norm": 1.6056501865386963,
"learning_rate": 8.419836185224232e-06,
"loss": 1.0998,
"step": 3900
},
{
"epoch": 0.6406097956535842,
"grad_norm": 3.1764864921569824,
"learning_rate": 8.399562079312304e-06,
"loss": 1.1459,
"step": 3950
},
{
"epoch": 0.6487187804086928,
"grad_norm": 2.427140712738037,
"learning_rate": 8.379287973400374e-06,
"loss": 1.1625,
"step": 4000
},
{
"epoch": 0.6568277651638015,
"grad_norm": 3.784374237060547,
"learning_rate": 8.359013867488445e-06,
"loss": 1.1198,
"step": 4050
},
{
"epoch": 0.6649367499189102,
"grad_norm": 1.0050832033157349,
"learning_rate": 8.338739761576515e-06,
"loss": 1.156,
"step": 4100
},
{
"epoch": 0.6730457346740188,
"grad_norm": 1.9474921226501465,
"learning_rate": 8.318465655664587e-06,
"loss": 1.1186,
"step": 4150
},
{
"epoch": 0.6811547194291274,
"grad_norm": 2.666841983795166,
"learning_rate": 8.298191549752657e-06,
"loss": 1.1655,
"step": 4200
},
{
"epoch": 0.6892637041842361,
"grad_norm": 3.144240379333496,
"learning_rate": 8.277917443840727e-06,
"loss": 1.1691,
"step": 4250
},
{
"epoch": 0.6973726889393448,
"grad_norm": 130.09506225585938,
"learning_rate": 8.257643337928798e-06,
"loss": 1.2045,
"step": 4300
},
{
"epoch": 0.7054816736944535,
"grad_norm": 2.074591636657715,
"learning_rate": 8.237369232016868e-06,
"loss": 1.1461,
"step": 4350
},
{
"epoch": 0.7135906584495622,
"grad_norm": 2.799001932144165,
"learning_rate": 8.21709512610494e-06,
"loss": 1.1341,
"step": 4400
},
{
"epoch": 0.7216996432046707,
"grad_norm": 2.7407402992248535,
"learning_rate": 8.19682102019301e-06,
"loss": 1.1195,
"step": 4450
},
{
"epoch": 0.7298086279597794,
"grad_norm": 0.21362553536891937,
"learning_rate": 8.176546914281081e-06,
"loss": 1.1311,
"step": 4500
},
{
"epoch": 0.7379176127148881,
"grad_norm": 1.1738766431808472,
"learning_rate": 8.156272808369151e-06,
"loss": 1.0796,
"step": 4550
},
{
"epoch": 0.7460265974699968,
"grad_norm": 2.821784257888794,
"learning_rate": 8.135998702457221e-06,
"loss": 1.116,
"step": 4600
},
{
"epoch": 0.7541355822251055,
"grad_norm": 2.0731353759765625,
"learning_rate": 8.115724596545294e-06,
"loss": 1.1121,
"step": 4650
},
{
"epoch": 0.762244566980214,
"grad_norm": 2.632211685180664,
"learning_rate": 8.095450490633364e-06,
"loss": 1.0747,
"step": 4700
},
{
"epoch": 0.7703535517353227,
"grad_norm": 2.5461819171905518,
"learning_rate": 8.075176384721434e-06,
"loss": 1.0444,
"step": 4750
},
{
"epoch": 0.7784625364904314,
"grad_norm": 5.323375225067139,
"learning_rate": 8.054902278809506e-06,
"loss": 1.1097,
"step": 4800
},
{
"epoch": 0.7865715212455401,
"grad_norm": 3.063811779022217,
"learning_rate": 8.034628172897576e-06,
"loss": 1.1502,
"step": 4850
},
{
"epoch": 0.7946805060006488,
"grad_norm": 2.1988182067871094,
"learning_rate": 8.014354066985647e-06,
"loss": 1.1024,
"step": 4900
},
{
"epoch": 0.8027894907557573,
"grad_norm": 2.6995670795440674,
"learning_rate": 7.994079961073717e-06,
"loss": 1.1678,
"step": 4950
},
{
"epoch": 0.810898475510866,
"grad_norm": 0.982642412185669,
"learning_rate": 7.973805855161789e-06,
"loss": 1.1194,
"step": 5000
},
{
"epoch": 0.8190074602659747,
"grad_norm": 1.8992295265197754,
"learning_rate": 7.953531749249859e-06,
"loss": 1.1394,
"step": 5050
},
{
"epoch": 0.8271164450210834,
"grad_norm": 1.253397822380066,
"learning_rate": 7.933257643337929e-06,
"loss": 1.0602,
"step": 5100
},
{
"epoch": 0.8352254297761921,
"grad_norm": 2.562586784362793,
"learning_rate": 7.912983537426e-06,
"loss": 1.1393,
"step": 5150
},
{
"epoch": 0.8433344145313006,
"grad_norm": 3.5401248931884766,
"learning_rate": 7.89270943151407e-06,
"loss": 1.0614,
"step": 5200
},
{
"epoch": 0.8514433992864093,
"grad_norm": 1.610593318939209,
"learning_rate": 7.872435325602142e-06,
"loss": 1.0493,
"step": 5250
},
{
"epoch": 0.859552384041518,
"grad_norm": 1.8910722732543945,
"learning_rate": 7.852161219690212e-06,
"loss": 1.1287,
"step": 5300
},
{
"epoch": 0.8676613687966267,
"grad_norm": 1.915490746498108,
"learning_rate": 7.831887113778283e-06,
"loss": 1.0797,
"step": 5350
},
{
"epoch": 0.8757703535517353,
"grad_norm": 3.8769915103912354,
"learning_rate": 7.811613007866353e-06,
"loss": 1.1077,
"step": 5400
},
{
"epoch": 0.883879338306844,
"grad_norm": 0.7856437563896179,
"learning_rate": 7.791338901954425e-06,
"loss": 1.0537,
"step": 5450
},
{
"epoch": 0.8919883230619526,
"grad_norm": 1.956763744354248,
"learning_rate": 7.771064796042496e-06,
"loss": 1.0882,
"step": 5500
},
{
"epoch": 0.9000973078170613,
"grad_norm": 2.763761043548584,
"learning_rate": 7.750790690130566e-06,
"loss": 1.1859,
"step": 5550
},
{
"epoch": 0.90820629257217,
"grad_norm": 1.743696689605713,
"learning_rate": 7.730516584218636e-06,
"loss": 1.0828,
"step": 5600
},
{
"epoch": 0.9163152773272786,
"grad_norm": 1.8948084115982056,
"learning_rate": 7.710242478306708e-06,
"loss": 1.1487,
"step": 5650
},
{
"epoch": 0.9244242620823873,
"grad_norm": 1.9413537979125977,
"learning_rate": 7.689968372394778e-06,
"loss": 1.0365,
"step": 5700
},
{
"epoch": 0.9325332468374959,
"grad_norm": 1.3905330896377563,
"learning_rate": 7.66969426648285e-06,
"loss": 1.1288,
"step": 5750
},
{
"epoch": 0.9406422315926046,
"grad_norm": 2.7509984970092773,
"learning_rate": 7.649420160570919e-06,
"loss": 1.1158,
"step": 5800
},
{
"epoch": 0.9487512163477133,
"grad_norm": 2.0269293785095215,
"learning_rate": 7.62914605465899e-06,
"loss": 1.0666,
"step": 5850
},
{
"epoch": 0.9568602011028219,
"grad_norm": 2.457383632659912,
"learning_rate": 7.608871948747061e-06,
"loss": 1.0743,
"step": 5900
},
{
"epoch": 0.9649691858579306,
"grad_norm": 1.9480834007263184,
"learning_rate": 7.588597842835131e-06,
"loss": 1.0856,
"step": 5950
},
{
"epoch": 0.9730781706130393,
"grad_norm": 2.421600103378296,
"learning_rate": 7.568323736923202e-06,
"loss": 1.1044,
"step": 6000
},
{
"epoch": 0.9811871553681479,
"grad_norm": 1.9701539278030396,
"learning_rate": 7.548049631011273e-06,
"loss": 1.0493,
"step": 6050
},
{
"epoch": 0.9892961401232566,
"grad_norm": 3.0008904933929443,
"learning_rate": 7.527775525099344e-06,
"loss": 1.057,
"step": 6100
},
{
"epoch": 0.9974051248783652,
"grad_norm": 2.9297544956207275,
"learning_rate": 7.507501419187414e-06,
"loss": 1.0695,
"step": 6150
},
{
"epoch": 1.0055141096334739,
"grad_norm": 8.883905410766602,
"learning_rate": 7.487227313275484e-06,
"loss": 1.1178,
"step": 6200
},
{
"epoch": 1.0136230943885824,
"grad_norm": 1.9629335403442383,
"learning_rate": 7.466953207363555e-06,
"loss": 1.1246,
"step": 6250
},
{
"epoch": 1.0217320791436912,
"grad_norm": 1.3846949338912964,
"learning_rate": 7.4466791014516275e-06,
"loss": 1.1218,
"step": 6300
},
{
"epoch": 1.0298410638987998,
"grad_norm": 1.8666070699691772,
"learning_rate": 7.426404995539697e-06,
"loss": 1.072,
"step": 6350
},
{
"epoch": 1.0379500486539086,
"grad_norm": 2.517359972000122,
"learning_rate": 7.406130889627768e-06,
"loss": 1.1204,
"step": 6400
},
{
"epoch": 1.0460590334090172,
"grad_norm": 10.04233455657959,
"learning_rate": 7.385856783715839e-06,
"loss": 1.1025,
"step": 6450
},
{
"epoch": 1.0541680181641258,
"grad_norm": 2.2324578762054443,
"learning_rate": 7.36558267780391e-06,
"loss": 1.0799,
"step": 6500
},
{
"epoch": 1.0622770029192345,
"grad_norm": 1.6139917373657227,
"learning_rate": 7.34530857189198e-06,
"loss": 1.1512,
"step": 6550
},
{
"epoch": 1.0703859876743431,
"grad_norm": 1.7317317724227905,
"learning_rate": 7.325034465980051e-06,
"loss": 1.0947,
"step": 6600
},
{
"epoch": 1.078494972429452,
"grad_norm": 1.3901951313018799,
"learning_rate": 7.304760360068122e-06,
"loss": 1.1165,
"step": 6650
},
{
"epoch": 1.0866039571845605,
"grad_norm": 2.5908029079437256,
"learning_rate": 7.284486254156193e-06,
"loss": 1.0484,
"step": 6700
},
{
"epoch": 1.094712941939669,
"grad_norm": 2.977522850036621,
"learning_rate": 7.2642121482442626e-06,
"loss": 1.0471,
"step": 6750
},
{
"epoch": 1.1028219266947779,
"grad_norm": 1.2937211990356445,
"learning_rate": 7.243938042332333e-06,
"loss": 1.1202,
"step": 6800
},
{
"epoch": 1.1109309114498864,
"grad_norm": 2.9973866939544678,
"learning_rate": 7.223663936420404e-06,
"loss": 1.0414,
"step": 6850
},
{
"epoch": 1.1190398962049952,
"grad_norm": 5.904523849487305,
"learning_rate": 7.203389830508475e-06,
"loss": 1.0944,
"step": 6900
},
{
"epoch": 1.1271488809601038,
"grad_norm": 4.078853130340576,
"learning_rate": 7.1831157245965456e-06,
"loss": 1.0421,
"step": 6950
},
{
"epoch": 1.1352578657152124,
"grad_norm": 2.3049638271331787,
"learning_rate": 7.162841618684616e-06,
"loss": 1.1185,
"step": 7000
},
{
"epoch": 1.1433668504703212,
"grad_norm": 1.740246057510376,
"learning_rate": 7.142567512772687e-06,
"loss": 1.0862,
"step": 7050
},
{
"epoch": 1.1514758352254297,
"grad_norm": 7.726000785827637,
"learning_rate": 7.122293406860759e-06,
"loss": 1.0821,
"step": 7100
},
{
"epoch": 1.1595848199805385,
"grad_norm": 0.24267837405204773,
"learning_rate": 7.1020193009488294e-06,
"loss": 1.0419,
"step": 7150
},
{
"epoch": 1.167693804735647,
"grad_norm": 1.9721111059188843,
"learning_rate": 7.0817451950369e-06,
"loss": 1.0959,
"step": 7200
},
{
"epoch": 1.1758027894907557,
"grad_norm": 1.5573512315750122,
"learning_rate": 7.06147108912497e-06,
"loss": 1.0827,
"step": 7250
},
{
"epoch": 1.1839117742458645,
"grad_norm": 2.4766602516174316,
"learning_rate": 7.041196983213041e-06,
"loss": 1.0144,
"step": 7300
},
{
"epoch": 1.192020759000973,
"grad_norm": 3.357357978820801,
"learning_rate": 7.020922877301112e-06,
"loss": 1.0686,
"step": 7350
},
{
"epoch": 1.2001297437560818,
"grad_norm": 1.6499881744384766,
"learning_rate": 7.000648771389182e-06,
"loss": 1.0918,
"step": 7400
},
{
"epoch": 1.2082387285111904,
"grad_norm": 2.088510751724243,
"learning_rate": 6.980374665477253e-06,
"loss": 1.0963,
"step": 7450
},
{
"epoch": 1.216347713266299,
"grad_norm": 2.040971040725708,
"learning_rate": 6.960100559565324e-06,
"loss": 1.1578,
"step": 7500
},
{
"epoch": 1.2244566980214078,
"grad_norm": 1.381493091583252,
"learning_rate": 6.939826453653395e-06,
"loss": 1.1057,
"step": 7550
},
{
"epoch": 1.2325656827765163,
"grad_norm": 1.6742016077041626,
"learning_rate": 6.919552347741465e-06,
"loss": 1.1118,
"step": 7600
},
{
"epoch": 1.240674667531625,
"grad_norm": 0.9264686107635498,
"learning_rate": 6.899278241829535e-06,
"loss": 1.0471,
"step": 7650
},
{
"epoch": 1.2487836522867337,
"grad_norm": 3.0064830780029297,
"learning_rate": 6.879004135917606e-06,
"loss": 1.0854,
"step": 7700
},
{
"epoch": 1.2568926370418423,
"grad_norm": 0.9564074873924255,
"learning_rate": 6.858730030005677e-06,
"loss": 1.1668,
"step": 7750
},
{
"epoch": 1.265001621796951,
"grad_norm": 7.573803901672363,
"learning_rate": 6.8384559240937475e-06,
"loss": 1.1087,
"step": 7800
},
{
"epoch": 1.2731106065520597,
"grad_norm": 6.813046455383301,
"learning_rate": 6.818181818181818e-06,
"loss": 1.1263,
"step": 7850
},
{
"epoch": 1.2812195913071682,
"grad_norm": 1.5467007160186768,
"learning_rate": 6.79790771226989e-06,
"loss": 1.1001,
"step": 7900
},
{
"epoch": 1.289328576062277,
"grad_norm": 1.5642696619033813,
"learning_rate": 6.777633606357961e-06,
"loss": 1.1252,
"step": 7950
},
{
"epoch": 1.2974375608173856,
"grad_norm": 5.454095363616943,
"learning_rate": 6.757359500446031e-06,
"loss": 1.0533,
"step": 8000
},
{
"epoch": 1.3055465455724944,
"grad_norm": 2.569298028945923,
"learning_rate": 6.737085394534102e-06,
"loss": 1.1265,
"step": 8050
},
{
"epoch": 1.313655530327603,
"grad_norm": 0.9834175705909729,
"learning_rate": 6.716811288622173e-06,
"loss": 1.1177,
"step": 8100
},
{
"epoch": 1.3217645150827115,
"grad_norm": 1.9268743991851807,
"learning_rate": 6.696537182710244e-06,
"loss": 1.0928,
"step": 8150
},
{
"epoch": 1.3298734998378203,
"grad_norm": 2.7169127464294434,
"learning_rate": 6.6762630767983135e-06,
"loss": 1.0943,
"step": 8200
},
{
"epoch": 1.337982484592929,
"grad_norm": 1.8307722806930542,
"learning_rate": 6.655988970886384e-06,
"loss": 1.0697,
"step": 8250
},
{
"epoch": 1.3460914693480377,
"grad_norm": 1.8044813871383667,
"learning_rate": 6.635714864974455e-06,
"loss": 1.1663,
"step": 8300
},
{
"epoch": 1.3542004541031463,
"grad_norm": 0.26450178027153015,
"learning_rate": 6.615440759062526e-06,
"loss": 1.0665,
"step": 8350
},
{
"epoch": 1.3623094388582548,
"grad_norm": 5.571292877197266,
"learning_rate": 6.5951666531505966e-06,
"loss": 1.0099,
"step": 8400
},
{
"epoch": 1.3704184236133636,
"grad_norm": 3.6548831462860107,
"learning_rate": 6.574892547238667e-06,
"loss": 1.0948,
"step": 8450
},
{
"epoch": 1.3785274083684722,
"grad_norm": 1.9964606761932373,
"learning_rate": 6.554618441326738e-06,
"loss": 1.079,
"step": 8500
},
{
"epoch": 1.386636393123581,
"grad_norm": 9.50430965423584,
"learning_rate": 6.534344335414808e-06,
"loss": 1.093,
"step": 8550
},
{
"epoch": 1.3947453778786896,
"grad_norm": 2.573894500732422,
"learning_rate": 6.514070229502879e-06,
"loss": 1.1444,
"step": 8600
},
{
"epoch": 1.4028543626337981,
"grad_norm": 2.370356559753418,
"learning_rate": 6.4937961235909495e-06,
"loss": 1.1342,
"step": 8650
},
{
"epoch": 1.410963347388907,
"grad_norm": 2.698498487472534,
"learning_rate": 6.47352201767902e-06,
"loss": 1.1387,
"step": 8700
},
{
"epoch": 1.4190723321440155,
"grad_norm": 1.4973636865615845,
"learning_rate": 6.453247911767092e-06,
"loss": 1.077,
"step": 8750
},
{
"epoch": 1.4271813168991243,
"grad_norm": 2.350057601928711,
"learning_rate": 6.432973805855163e-06,
"loss": 1.0281,
"step": 8800
},
{
"epoch": 1.4352903016542329,
"grad_norm": 2.5107436180114746,
"learning_rate": 6.412699699943233e-06,
"loss": 1.0517,
"step": 8850
},
{
"epoch": 1.4433992864093415,
"grad_norm": 2.506737470626831,
"learning_rate": 6.392425594031304e-06,
"loss": 1.0565,
"step": 8900
},
{
"epoch": 1.4515082711644502,
"grad_norm": 2.0631542205810547,
"learning_rate": 6.372151488119375e-06,
"loss": 1.0925,
"step": 8950
},
{
"epoch": 1.4596172559195588,
"grad_norm": 1.4057486057281494,
"learning_rate": 6.351877382207446e-06,
"loss": 1.1257,
"step": 9000
},
{
"epoch": 1.4677262406746676,
"grad_norm": 1.5711629390716553,
"learning_rate": 6.331603276295516e-06,
"loss": 1.0615,
"step": 9050
},
{
"epoch": 1.4758352254297762,
"grad_norm": 3.042393922805786,
"learning_rate": 6.311329170383586e-06,
"loss": 1.0664,
"step": 9100
},
{
"epoch": 1.4839442101848848,
"grad_norm": 2.0653626918792725,
"learning_rate": 6.291055064471657e-06,
"loss": 1.0039,
"step": 9150
},
{
"epoch": 1.4920531949399936,
"grad_norm": 2.374119281768799,
"learning_rate": 6.270780958559728e-06,
"loss": 1.0053,
"step": 9200
},
{
"epoch": 1.5001621796951021,
"grad_norm": 2.568708896636963,
"learning_rate": 6.2505068526477985e-06,
"loss": 1.0484,
"step": 9250
},
{
"epoch": 1.508271164450211,
"grad_norm": 2.4763758182525635,
"learning_rate": 6.230232746735869e-06,
"loss": 1.0998,
"step": 9300
},
{
"epoch": 1.5163801492053195,
"grad_norm": 1.8199087381362915,
"learning_rate": 6.20995864082394e-06,
"loss": 1.1189,
"step": 9350
},
{
"epoch": 1.524489133960428,
"grad_norm": 4.159176826477051,
"learning_rate": 6.189684534912011e-06,
"loss": 1.1606,
"step": 9400
},
{
"epoch": 1.5325981187155369,
"grad_norm": 1.266381025314331,
"learning_rate": 6.1694104290000815e-06,
"loss": 1.039,
"step": 9450
},
{
"epoch": 1.5407071034706454,
"grad_norm": 1.114743947982788,
"learning_rate": 6.1491363230881514e-06,
"loss": 1.0937,
"step": 9500
},
{
"epoch": 1.5488160882257542,
"grad_norm": 4.52326774597168,
"learning_rate": 6.128862217176224e-06,
"loss": 1.0782,
"step": 9550
},
{
"epoch": 1.5569250729808628,
"grad_norm": 2.066561460494995,
"learning_rate": 6.108588111264294e-06,
"loss": 1.0033,
"step": 9600
},
{
"epoch": 1.5650340577359714,
"grad_norm": 3.260119676589966,
"learning_rate": 6.0883140053523645e-06,
"loss": 1.063,
"step": 9650
},
{
"epoch": 1.5731430424910802,
"grad_norm": 3.0911264419555664,
"learning_rate": 6.068039899440435e-06,
"loss": 1.0468,
"step": 9700
},
{
"epoch": 1.5812520272461887,
"grad_norm": 6.644060134887695,
"learning_rate": 6.047765793528506e-06,
"loss": 1.0236,
"step": 9750
},
{
"epoch": 1.5893610120012975,
"grad_norm": 2.7163946628570557,
"learning_rate": 6.027491687616577e-06,
"loss": 1.0351,
"step": 9800
},
{
"epoch": 1.597469996756406,
"grad_norm": 2.4623427391052246,
"learning_rate": 6.0072175817046475e-06,
"loss": 1.1016,
"step": 9850
},
{
"epoch": 1.6055789815115147,
"grad_norm": 2.7548983097076416,
"learning_rate": 5.986943475792718e-06,
"loss": 1.0571,
"step": 9900
},
{
"epoch": 1.6136879662666233,
"grad_norm": 2.269378423690796,
"learning_rate": 5.966669369880789e-06,
"loss": 1.0962,
"step": 9950
},
{
"epoch": 1.621796951021732,
"grad_norm": 2.0521297454833984,
"learning_rate": 5.946395263968859e-06,
"loss": 1.1202,
"step": 10000
},
{
"epoch": 1.6299059357768408,
"grad_norm": 2.488682985305786,
"learning_rate": 5.92612115805693e-06,
"loss": 1.0879,
"step": 10050
},
{
"epoch": 1.6380149205319494,
"grad_norm": 2.8953864574432373,
"learning_rate": 5.9058470521450005e-06,
"loss": 1.1048,
"step": 10100
},
{
"epoch": 1.646123905287058,
"grad_norm": 2.272630214691162,
"learning_rate": 5.885572946233071e-06,
"loss": 1.0903,
"step": 10150
},
{
"epoch": 1.6542328900421666,
"grad_norm": 2.1369316577911377,
"learning_rate": 5.865298840321142e-06,
"loss": 1.0911,
"step": 10200
},
{
"epoch": 1.6623418747972754,
"grad_norm": 3.1257848739624023,
"learning_rate": 5.845024734409213e-06,
"loss": 1.066,
"step": 10250
},
{
"epoch": 1.6704508595523841,
"grad_norm": 2.81978440284729,
"learning_rate": 5.8247506284972835e-06,
"loss": 1.1032,
"step": 10300
},
{
"epoch": 1.6785598443074927,
"grad_norm": 1.2515383958816528,
"learning_rate": 5.804476522585355e-06,
"loss": 1.0646,
"step": 10350
},
{
"epoch": 1.6866688290626013,
"grad_norm": 2.6792616844177246,
"learning_rate": 5.784202416673426e-06,
"loss": 1.0804,
"step": 10400
},
{
"epoch": 1.6947778138177099,
"grad_norm": 2.989546537399292,
"learning_rate": 5.763928310761497e-06,
"loss": 1.1245,
"step": 10450
},
{
"epoch": 1.7028867985728187,
"grad_norm": 3.0825021266937256,
"learning_rate": 5.743654204849567e-06,
"loss": 1.1059,
"step": 10500
},
{
"epoch": 1.7109957833279275,
"grad_norm": 2.1985509395599365,
"learning_rate": 5.723380098937637e-06,
"loss": 1.1493,
"step": 10550
},
{
"epoch": 1.719104768083036,
"grad_norm": 2.780210256576538,
"learning_rate": 5.703105993025708e-06,
"loss": 1.1207,
"step": 10600
},
{
"epoch": 1.7272137528381446,
"grad_norm": 1.8562790155410767,
"learning_rate": 5.682831887113779e-06,
"loss": 1.0158,
"step": 10650
},
{
"epoch": 1.7353227375932532,
"grad_norm": 1.9536188840866089,
"learning_rate": 5.6625577812018495e-06,
"loss": 1.113,
"step": 10700
},
{
"epoch": 1.743431722348362,
"grad_norm": 7.557844638824463,
"learning_rate": 5.64228367528992e-06,
"loss": 1.0133,
"step": 10750
},
{
"epoch": 1.7515407071034708,
"grad_norm": 6.065642356872559,
"learning_rate": 5.622009569377991e-06,
"loss": 1.1057,
"step": 10800
},
{
"epoch": 1.7596496918585793,
"grad_norm": 2.0582070350646973,
"learning_rate": 5.601735463466062e-06,
"loss": 1.0018,
"step": 10850
},
{
"epoch": 1.767758676613688,
"grad_norm": 1.545466423034668,
"learning_rate": 5.5814613575541325e-06,
"loss": 0.9689,
"step": 10900
},
{
"epoch": 1.7758676613687965,
"grad_norm": 2.50724196434021,
"learning_rate": 5.561187251642202e-06,
"loss": 1.0578,
"step": 10950
},
{
"epoch": 1.7839766461239053,
"grad_norm": 3.886679172515869,
"learning_rate": 5.540913145730273e-06,
"loss": 1.0745,
"step": 11000
},
{
"epoch": 1.792085630879014,
"grad_norm": 3.0689311027526855,
"learning_rate": 5.520639039818344e-06,
"loss": 1.0816,
"step": 11050
},
{
"epoch": 1.8001946156341226,
"grad_norm": 2.43038010597229,
"learning_rate": 5.500364933906415e-06,
"loss": 1.1257,
"step": 11100
},
{
"epoch": 1.8083036003892312,
"grad_norm": 1.9623647928237915,
"learning_rate": 5.480090827994486e-06,
"loss": 1.072,
"step": 11150
},
{
"epoch": 1.8164125851443398,
"grad_norm": 2.903964042663574,
"learning_rate": 5.459816722082557e-06,
"loss": 1.1159,
"step": 11200
},
{
"epoch": 1.8245215698994486,
"grad_norm": 3.9385204315185547,
"learning_rate": 5.439542616170628e-06,
"loss": 1.0296,
"step": 11250
},
{
"epoch": 1.8326305546545574,
"grad_norm": 3.2969629764556885,
"learning_rate": 5.4192685102586985e-06,
"loss": 1.0818,
"step": 11300
},
{
"epoch": 1.840739539409666,
"grad_norm": 6.509484767913818,
"learning_rate": 5.398994404346769e-06,
"loss": 1.0848,
"step": 11350
},
{
"epoch": 1.8488485241647745,
"grad_norm": 3.701723098754883,
"learning_rate": 5.37872029843484e-06,
"loss": 0.9775,
"step": 11400
},
{
"epoch": 1.856957508919883,
"grad_norm": 2.0968496799468994,
"learning_rate": 5.35844619252291e-06,
"loss": 1.1417,
"step": 11450
},
{
"epoch": 1.8650664936749919,
"grad_norm": 2.2569050788879395,
"learning_rate": 5.338172086610981e-06,
"loss": 1.0199,
"step": 11500
},
{
"epoch": 1.8731754784301007,
"grad_norm": 8.650045394897461,
"learning_rate": 5.3178979806990514e-06,
"loss": 1.1163,
"step": 11550
},
{
"epoch": 1.8812844631852093,
"grad_norm": 3.9758262634277344,
"learning_rate": 5.297623874787122e-06,
"loss": 0.9833,
"step": 11600
},
{
"epoch": 1.8893934479403178,
"grad_norm": 2.467902183532715,
"learning_rate": 5.277349768875193e-06,
"loss": 1.0529,
"step": 11650
},
{
"epoch": 1.8975024326954264,
"grad_norm": 3.978503465652466,
"learning_rate": 5.257075662963264e-06,
"loss": 1.2041,
"step": 11700
},
{
"epoch": 1.9056114174505352,
"grad_norm": 2.431842803955078,
"learning_rate": 5.2368015570513345e-06,
"loss": 1.0279,
"step": 11750
},
{
"epoch": 1.913720402205644,
"grad_norm": 2.4601986408233643,
"learning_rate": 5.216527451139405e-06,
"loss": 1.1013,
"step": 11800
},
{
"epoch": 1.9218293869607526,
"grad_norm": 2.702439546585083,
"learning_rate": 5.196253345227475e-06,
"loss": 1.0153,
"step": 11850
},
{
"epoch": 1.9299383717158611,
"grad_norm": 1.80119788646698,
"learning_rate": 5.175979239315546e-06,
"loss": 1.0995,
"step": 11900
},
{
"epoch": 1.9380473564709697,
"grad_norm": 2.78031587600708,
"learning_rate": 5.155705133403617e-06,
"loss": 1.0488,
"step": 11950
},
{
"epoch": 1.9461563412260785,
"grad_norm": 1.3408476114273071,
"learning_rate": 5.135431027491688e-06,
"loss": 1.0176,
"step": 12000
},
{
"epoch": 1.9542653259811873,
"grad_norm": 2.418548107147217,
"learning_rate": 5.115156921579759e-06,
"loss": 1.0323,
"step": 12050
},
{
"epoch": 1.9623743107362959,
"grad_norm": 2.1113944053649902,
"learning_rate": 5.09488281566783e-06,
"loss": 1.0717,
"step": 12100
},
{
"epoch": 1.9704832954914044,
"grad_norm": 1.4384586811065674,
"learning_rate": 5.0746087097559005e-06,
"loss": 1.053,
"step": 12150
},
{
"epoch": 1.978592280246513,
"grad_norm": 2.836148738861084,
"learning_rate": 5.054334603843971e-06,
"loss": 0.9988,
"step": 12200
},
{
"epoch": 1.9867012650016218,
"grad_norm": 1.2631614208221436,
"learning_rate": 5.034060497932042e-06,
"loss": 1.0348,
"step": 12250
},
{
"epoch": 1.9948102497567306,
"grad_norm": 5.070575714111328,
"learning_rate": 5.013786392020113e-06,
"loss": 1.0272,
"step": 12300
}
],
"logging_steps": 50,
"max_steps": 24664,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.8392191071174525e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
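
The JSON above is the standard Hugging Face Trainer state: log_history holds one entry per logging step (epoch, grad_norm, learning_rate, loss, step), followed by run-level fields such as logging_steps, max_steps, and num_train_epochs. As a minimal sketch of how one might inspect it, the Python below loads the file and plots the logged loss alongside the learning-rate schedule; the local file path and the plotting choices are assumptions for illustration, not part of the original upload.

    # Sketch: parse a trainer_state.json and plot loss and learning rate.
    # Assumes the file sits in the working directory and matplotlib is installed.
    import json

    import matplotlib.pyplot as plt

    with open("trainer_state.json") as f:  # assumed local copy of the file above
        state = json.load(f)

    # Keep only entries that carry a training loss (all entries in this file do).
    history = [e for e in state["log_history"] if "loss" in e]
    steps = [e["step"] for e in history]
    losses = [e["loss"] for e in history]
    lrs = [e["learning_rate"] for e in history]

    fig, ax_loss = plt.subplots()
    ax_loss.plot(steps, losses, label="training loss")
    ax_loss.set_xlabel("global step")
    ax_loss.set_ylabel("loss")

    # Learning rate on a secondary axis, since it is orders of magnitude smaller.
    ax_lr = ax_loss.twinx()
    ax_lr.plot(steps, lrs, color="tab:orange", label="learning rate")
    ax_lr.set_ylabel("learning rate")

    fig.tight_layout()
    plt.show()

Running this on the state above would show the loss drifting from roughly 1.4 toward 1.0 over the first two epochs while the learning rate decays approximately linearly from about 1e-5.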