ACRE-Qwen-2.5-3B-instruct / trainer_state.json
TommyChien's picture
Upload folder using huggingface_hub
e1d722a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 18498,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008108984755108661,
"grad_norm": 1.6712403297424316,
"learning_rate": 9.980942340442787e-06,
"loss": 1.5976,
"step": 50
},
{
"epoch": 0.016217969510217322,
"grad_norm": 1.9675096273422241,
"learning_rate": 9.960668234530858e-06,
"loss": 1.4389,
"step": 100
},
{
"epoch": 0.02432695426532598,
"grad_norm": 4.630291938781738,
"learning_rate": 9.940394128618928e-06,
"loss": 1.387,
"step": 150
},
{
"epoch": 0.032435939020434644,
"grad_norm": 2.186373710632324,
"learning_rate": 9.920120022707e-06,
"loss": 1.4025,
"step": 200
},
{
"epoch": 0.0405449237755433,
"grad_norm": 1.8459932804107666,
"learning_rate": 9.89984591679507e-06,
"loss": 1.3795,
"step": 250
},
{
"epoch": 0.04865390853065196,
"grad_norm": 1.8652907609939575,
"learning_rate": 9.879571810883141e-06,
"loss": 1.3835,
"step": 300
},
{
"epoch": 0.056762893285760625,
"grad_norm": 1.97219979763031,
"learning_rate": 9.859297704971211e-06,
"loss": 1.3549,
"step": 350
},
{
"epoch": 0.06487187804086929,
"grad_norm": 1.764737844467163,
"learning_rate": 9.839023599059281e-06,
"loss": 1.3785,
"step": 400
},
{
"epoch": 0.07298086279597794,
"grad_norm": 1.37200927734375,
"learning_rate": 9.818749493147353e-06,
"loss": 1.397,
"step": 450
},
{
"epoch": 0.0810898475510866,
"grad_norm": 0.783941388130188,
"learning_rate": 9.798475387235423e-06,
"loss": 1.4055,
"step": 500
},
{
"epoch": 0.08919883230619527,
"grad_norm": 2.4082376956939697,
"learning_rate": 9.778201281323494e-06,
"loss": 1.2851,
"step": 550
},
{
"epoch": 0.09730781706130393,
"grad_norm": 1.2580335140228271,
"learning_rate": 9.757927175411566e-06,
"loss": 1.3557,
"step": 600
},
{
"epoch": 0.10541680181641258,
"grad_norm": 1.688684105873108,
"learning_rate": 9.737653069499636e-06,
"loss": 1.3462,
"step": 650
},
{
"epoch": 0.11352578657152125,
"grad_norm": 1.7642041444778442,
"learning_rate": 9.717378963587707e-06,
"loss": 1.3122,
"step": 700
},
{
"epoch": 0.1216347713266299,
"grad_norm": 1.5700275897979736,
"learning_rate": 9.697104857675777e-06,
"loss": 1.3248,
"step": 750
},
{
"epoch": 0.12974375608173858,
"grad_norm": 1.8756022453308105,
"learning_rate": 9.676830751763849e-06,
"loss": 1.3364,
"step": 800
},
{
"epoch": 0.13785274083684723,
"grad_norm": 2.0151844024658203,
"learning_rate": 9.656556645851919e-06,
"loss": 1.3566,
"step": 850
},
{
"epoch": 0.1459617255919559,
"grad_norm": 1.8269509077072144,
"learning_rate": 9.636282539939989e-06,
"loss": 1.3291,
"step": 900
},
{
"epoch": 0.15407071034706454,
"grad_norm": 2.033515453338623,
"learning_rate": 9.61600843402806e-06,
"loss": 1.3239,
"step": 950
},
{
"epoch": 0.1621796951021732,
"grad_norm": 1.2932145595550537,
"learning_rate": 9.59573432811613e-06,
"loss": 1.3458,
"step": 1000
},
{
"epoch": 0.17028867985728185,
"grad_norm": 1.7336113452911377,
"learning_rate": 9.575460222204202e-06,
"loss": 1.3227,
"step": 1050
},
{
"epoch": 0.17839766461239054,
"grad_norm": 2.2469635009765625,
"learning_rate": 9.555186116292272e-06,
"loss": 1.3055,
"step": 1100
},
{
"epoch": 0.1865066493674992,
"grad_norm": 1.9851100444793701,
"learning_rate": 9.534912010380343e-06,
"loss": 1.2713,
"step": 1150
},
{
"epoch": 0.19461563412260785,
"grad_norm": 1.896209716796875,
"learning_rate": 9.514637904468413e-06,
"loss": 1.3926,
"step": 1200
},
{
"epoch": 0.2027246188777165,
"grad_norm": 1.941974401473999,
"learning_rate": 9.494363798556485e-06,
"loss": 1.2845,
"step": 1250
},
{
"epoch": 0.21083360363282516,
"grad_norm": 1.7393105030059814,
"learning_rate": 9.474089692644555e-06,
"loss": 1.3989,
"step": 1300
},
{
"epoch": 0.21894258838793382,
"grad_norm": 2.3268821239471436,
"learning_rate": 9.453815586732625e-06,
"loss": 1.3939,
"step": 1350
},
{
"epoch": 0.2270515731430425,
"grad_norm": 2.2540266513824463,
"learning_rate": 9.433541480820698e-06,
"loss": 1.3421,
"step": 1400
},
{
"epoch": 0.23516055789815116,
"grad_norm": 1.5483736991882324,
"learning_rate": 9.413267374908768e-06,
"loss": 1.3236,
"step": 1450
},
{
"epoch": 0.2432695426532598,
"grad_norm": 1.2663439512252808,
"learning_rate": 9.392993268996838e-06,
"loss": 1.3333,
"step": 1500
},
{
"epoch": 0.25137852740836847,
"grad_norm": 0.978643536567688,
"learning_rate": 9.37271916308491e-06,
"loss": 1.3522,
"step": 1550
},
{
"epoch": 0.25948751216347715,
"grad_norm": 2.664464235305786,
"learning_rate": 9.35244505717298e-06,
"loss": 1.3969,
"step": 1600
},
{
"epoch": 0.2675964969185858,
"grad_norm": 0.9333709478378296,
"learning_rate": 9.332170951261051e-06,
"loss": 1.3423,
"step": 1650
},
{
"epoch": 0.27570548167369446,
"grad_norm": 2.0592761039733887,
"learning_rate": 9.31189684534912e-06,
"loss": 1.3274,
"step": 1700
},
{
"epoch": 0.2838144664288031,
"grad_norm": 1.6379183530807495,
"learning_rate": 9.291622739437192e-06,
"loss": 1.321,
"step": 1750
},
{
"epoch": 0.2919234511839118,
"grad_norm": 1.659201979637146,
"learning_rate": 9.271348633525262e-06,
"loss": 1.3662,
"step": 1800
},
{
"epoch": 0.30003243593902046,
"grad_norm": 2.4258599281311035,
"learning_rate": 9.251074527613332e-06,
"loss": 1.2489,
"step": 1850
},
{
"epoch": 0.3081414206941291,
"grad_norm": 2.314232349395752,
"learning_rate": 9.230800421701404e-06,
"loss": 1.3549,
"step": 1900
},
{
"epoch": 0.31625040544923777,
"grad_norm": 1.78170645236969,
"learning_rate": 9.210526315789474e-06,
"loss": 1.3405,
"step": 1950
},
{
"epoch": 0.3243593902043464,
"grad_norm": 2.3508291244506836,
"learning_rate": 9.190252209877545e-06,
"loss": 1.335,
"step": 2000
},
{
"epoch": 0.3324683749594551,
"grad_norm": 3.0449280738830566,
"learning_rate": 9.169978103965615e-06,
"loss": 1.3188,
"step": 2050
},
{
"epoch": 0.3405773597145637,
"grad_norm": 2.0919177532196045,
"learning_rate": 9.149703998053687e-06,
"loss": 1.3369,
"step": 2100
},
{
"epoch": 0.3486863444696724,
"grad_norm": 1.5090256929397583,
"learning_rate": 9.129429892141757e-06,
"loss": 1.3782,
"step": 2150
},
{
"epoch": 0.3567953292247811,
"grad_norm": 1.5943760871887207,
"learning_rate": 9.109155786229828e-06,
"loss": 1.2826,
"step": 2200
},
{
"epoch": 0.3649043139798897,
"grad_norm": 1.4445804357528687,
"learning_rate": 9.0888816803179e-06,
"loss": 1.2706,
"step": 2250
},
{
"epoch": 0.3730132987349984,
"grad_norm": 1.366866111755371,
"learning_rate": 9.06860757440597e-06,
"loss": 1.3081,
"step": 2300
},
{
"epoch": 0.381122283490107,
"grad_norm": 1.5148597955703735,
"learning_rate": 9.04833346849404e-06,
"loss": 1.3117,
"step": 2350
},
{
"epoch": 0.3892312682452157,
"grad_norm": 1.9457615613937378,
"learning_rate": 9.028059362582111e-06,
"loss": 1.3385,
"step": 2400
},
{
"epoch": 0.3973402530003244,
"grad_norm": 1.0214955806732178,
"learning_rate": 9.007785256670181e-06,
"loss": 1.4173,
"step": 2450
},
{
"epoch": 0.405449237755433,
"grad_norm": 2.3479509353637695,
"learning_rate": 8.987511150758253e-06,
"loss": 1.338,
"step": 2500
},
{
"epoch": 0.4135582225105417,
"grad_norm": 1.7548096179962158,
"learning_rate": 8.967237044846323e-06,
"loss": 1.3245,
"step": 2550
},
{
"epoch": 0.4216672072656503,
"grad_norm": 1.3919732570648193,
"learning_rate": 8.946962938934394e-06,
"loss": 1.2934,
"step": 2600
},
{
"epoch": 0.429776192020759,
"grad_norm": 2.419706344604492,
"learning_rate": 8.926688833022464e-06,
"loss": 1.2451,
"step": 2650
},
{
"epoch": 0.43788517677586763,
"grad_norm": 0.7026298642158508,
"learning_rate": 8.906414727110536e-06,
"loss": 1.3075,
"step": 2700
},
{
"epoch": 0.4459941615309763,
"grad_norm": 1.9528323411941528,
"learning_rate": 8.886140621198606e-06,
"loss": 1.3071,
"step": 2750
},
{
"epoch": 0.454103146286085,
"grad_norm": 1.322129487991333,
"learning_rate": 8.865866515286676e-06,
"loss": 1.3234,
"step": 2800
},
{
"epoch": 0.46221213104119363,
"grad_norm": 3.2143826484680176,
"learning_rate": 8.845592409374747e-06,
"loss": 1.3553,
"step": 2850
},
{
"epoch": 0.4703211157963023,
"grad_norm": 2.3696305751800537,
"learning_rate": 8.825318303462817e-06,
"loss": 1.2905,
"step": 2900
},
{
"epoch": 0.47843010055141094,
"grad_norm": 1.7011082172393799,
"learning_rate": 8.805044197550889e-06,
"loss": 1.2826,
"step": 2950
},
{
"epoch": 0.4865390853065196,
"grad_norm": 1.8602455854415894,
"learning_rate": 8.784770091638959e-06,
"loss": 1.2119,
"step": 3000
},
{
"epoch": 0.4946480700616283,
"grad_norm": 1.8766978979110718,
"learning_rate": 8.76449598572703e-06,
"loss": 1.25,
"step": 3050
},
{
"epoch": 0.5027570548167369,
"grad_norm": 1.2416331768035889,
"learning_rate": 8.744221879815102e-06,
"loss": 1.3494,
"step": 3100
},
{
"epoch": 0.5108660395718456,
"grad_norm": 2.160327434539795,
"learning_rate": 8.723947773903172e-06,
"loss": 1.2821,
"step": 3150
},
{
"epoch": 0.5189750243269543,
"grad_norm": 1.5392768383026123,
"learning_rate": 8.703673667991243e-06,
"loss": 1.3716,
"step": 3200
},
{
"epoch": 0.5270840090820629,
"grad_norm": 1.9121636152267456,
"learning_rate": 8.683399562079313e-06,
"loss": 1.2615,
"step": 3250
},
{
"epoch": 0.5351929938371716,
"grad_norm": 1.8084639310836792,
"learning_rate": 8.663125456167383e-06,
"loss": 1.3391,
"step": 3300
},
{
"epoch": 0.5433019785922802,
"grad_norm": 2.516788959503174,
"learning_rate": 8.642851350255455e-06,
"loss": 1.2747,
"step": 3350
},
{
"epoch": 0.5514109633473889,
"grad_norm": 5.785072326660156,
"learning_rate": 8.622577244343525e-06,
"loss": 1.3414,
"step": 3400
},
{
"epoch": 0.5595199481024976,
"grad_norm": 3.8145599365234375,
"learning_rate": 8.602303138431596e-06,
"loss": 1.2274,
"step": 3450
},
{
"epoch": 0.5676289328576062,
"grad_norm": 0.36742010712623596,
"learning_rate": 8.582029032519666e-06,
"loss": 1.2184,
"step": 3500
},
{
"epoch": 0.5757379176127149,
"grad_norm": 2.2382278442382812,
"learning_rate": 8.561754926607738e-06,
"loss": 1.2359,
"step": 3550
},
{
"epoch": 0.5838469023678236,
"grad_norm": 3.001042127609253,
"learning_rate": 8.541480820695808e-06,
"loss": 1.2839,
"step": 3600
},
{
"epoch": 0.5919558871229322,
"grad_norm": 1.2254408597946167,
"learning_rate": 8.521206714783878e-06,
"loss": 1.3025,
"step": 3650
},
{
"epoch": 0.6000648718780409,
"grad_norm": 2.408430576324463,
"learning_rate": 8.50093260887195e-06,
"loss": 1.2959,
"step": 3700
},
{
"epoch": 0.6081738566331495,
"grad_norm": 1.125710129737854,
"learning_rate": 8.480658502960019e-06,
"loss": 1.3165,
"step": 3750
},
{
"epoch": 0.6162828413882582,
"grad_norm": 1.7280784845352173,
"learning_rate": 8.46038439704809e-06,
"loss": 1.3033,
"step": 3800
},
{
"epoch": 0.6243918261433669,
"grad_norm": 1.7289584875106812,
"learning_rate": 8.440110291136162e-06,
"loss": 1.3818,
"step": 3850
},
{
"epoch": 0.6325008108984755,
"grad_norm": 1.4574753046035767,
"learning_rate": 8.419836185224232e-06,
"loss": 1.2858,
"step": 3900
},
{
"epoch": 0.6406097956535842,
"grad_norm": 1.9952958822250366,
"learning_rate": 8.399562079312304e-06,
"loss": 1.323,
"step": 3950
},
{
"epoch": 0.6487187804086928,
"grad_norm": 2.3702497482299805,
"learning_rate": 8.379287973400374e-06,
"loss": 1.3362,
"step": 4000
},
{
"epoch": 0.6568277651638015,
"grad_norm": 2.1261913776397705,
"learning_rate": 8.359013867488445e-06,
"loss": 1.2978,
"step": 4050
},
{
"epoch": 0.6649367499189102,
"grad_norm": 0.6903754472732544,
"learning_rate": 8.338739761576515e-06,
"loss": 1.3273,
"step": 4100
},
{
"epoch": 0.6730457346740188,
"grad_norm": 1.848915934562683,
"learning_rate": 8.318465655664587e-06,
"loss": 1.294,
"step": 4150
},
{
"epoch": 0.6811547194291274,
"grad_norm": 2.0583913326263428,
"learning_rate": 8.298191549752657e-06,
"loss": 1.3353,
"step": 4200
},
{
"epoch": 0.6892637041842361,
"grad_norm": 1.5055549144744873,
"learning_rate": 8.277917443840727e-06,
"loss": 1.3305,
"step": 4250
},
{
"epoch": 0.6973726889393448,
"grad_norm": 1.6197587251663208,
"learning_rate": 8.257643337928798e-06,
"loss": 1.3765,
"step": 4300
},
{
"epoch": 0.7054816736944535,
"grad_norm": 1.927742838859558,
"learning_rate": 8.237369232016868e-06,
"loss": 1.3372,
"step": 4350
},
{
"epoch": 0.7135906584495622,
"grad_norm": 2.3434853553771973,
"learning_rate": 8.21709512610494e-06,
"loss": 1.3041,
"step": 4400
},
{
"epoch": 0.7216996432046707,
"grad_norm": 2.09249210357666,
"learning_rate": 8.19682102019301e-06,
"loss": 1.2946,
"step": 4450
},
{
"epoch": 0.7298086279597794,
"grad_norm": 0.19756704568862915,
"learning_rate": 8.176546914281081e-06,
"loss": 1.3014,
"step": 4500
},
{
"epoch": 0.7379176127148881,
"grad_norm": 1.2464430332183838,
"learning_rate": 8.156272808369151e-06,
"loss": 1.2552,
"step": 4550
},
{
"epoch": 0.7460265974699968,
"grad_norm": 1.7760344743728638,
"learning_rate": 8.135998702457221e-06,
"loss": 1.2963,
"step": 4600
},
{
"epoch": 0.7541355822251055,
"grad_norm": 1.922969937324524,
"learning_rate": 8.115724596545294e-06,
"loss": 1.2849,
"step": 4650
},
{
"epoch": 0.762244566980214,
"grad_norm": 3.757589340209961,
"learning_rate": 8.095450490633364e-06,
"loss": 1.2511,
"step": 4700
},
{
"epoch": 0.7703535517353227,
"grad_norm": 2.4983339309692383,
"learning_rate": 8.075176384721434e-06,
"loss": 1.2169,
"step": 4750
},
{
"epoch": 0.7784625364904314,
"grad_norm": 2.1676225662231445,
"learning_rate": 8.054902278809506e-06,
"loss": 1.2711,
"step": 4800
},
{
"epoch": 0.7865715212455401,
"grad_norm": 2.8060524463653564,
"learning_rate": 8.034628172897576e-06,
"loss": 1.3246,
"step": 4850
},
{
"epoch": 0.7946805060006488,
"grad_norm": 1.917485237121582,
"learning_rate": 8.014354066985647e-06,
"loss": 1.2669,
"step": 4900
},
{
"epoch": 0.8027894907557573,
"grad_norm": 2.1371405124664307,
"learning_rate": 7.994079961073717e-06,
"loss": 1.3554,
"step": 4950
},
{
"epoch": 0.810898475510866,
"grad_norm": 0.9637095928192139,
"learning_rate": 7.973805855161789e-06,
"loss": 1.2862,
"step": 5000
},
{
"epoch": 0.8190074602659747,
"grad_norm": 1.3636858463287354,
"learning_rate": 7.953531749249859e-06,
"loss": 1.3134,
"step": 5050
},
{
"epoch": 0.8271164450210834,
"grad_norm": 0.9230815768241882,
"learning_rate": 7.933257643337929e-06,
"loss": 1.2352,
"step": 5100
},
{
"epoch": 0.8352254297761921,
"grad_norm": 1.7373725175857544,
"learning_rate": 7.912983537426e-06,
"loss": 1.318,
"step": 5150
},
{
"epoch": 0.8433344145313006,
"grad_norm": 2.0280568599700928,
"learning_rate": 7.89270943151407e-06,
"loss": 1.2464,
"step": 5200
},
{
"epoch": 0.8514433992864093,
"grad_norm": 1.1584951877593994,
"learning_rate": 7.872435325602142e-06,
"loss": 1.2292,
"step": 5250
},
{
"epoch": 0.859552384041518,
"grad_norm": 1.7846872806549072,
"learning_rate": 7.852161219690212e-06,
"loss": 1.2976,
"step": 5300
},
{
"epoch": 0.8676613687966267,
"grad_norm": 1.7553609609603882,
"learning_rate": 7.831887113778283e-06,
"loss": 1.2684,
"step": 5350
},
{
"epoch": 0.8757703535517353,
"grad_norm": 2.1919806003570557,
"learning_rate": 7.811613007866353e-06,
"loss": 1.2956,
"step": 5400
},
{
"epoch": 0.883879338306844,
"grad_norm": 0.717258870601654,
"learning_rate": 7.791338901954425e-06,
"loss": 1.2186,
"step": 5450
},
{
"epoch": 0.8919883230619526,
"grad_norm": 1.4364169836044312,
"learning_rate": 7.771064796042496e-06,
"loss": 1.2596,
"step": 5500
},
{
"epoch": 0.9000973078170613,
"grad_norm": 1.99948251247406,
"learning_rate": 7.750790690130566e-06,
"loss": 1.3616,
"step": 5550
},
{
"epoch": 0.90820629257217,
"grad_norm": 1.5464040040969849,
"learning_rate": 7.730516584218636e-06,
"loss": 1.2595,
"step": 5600
},
{
"epoch": 0.9163152773272786,
"grad_norm": 1.500651240348816,
"learning_rate": 7.710242478306708e-06,
"loss": 1.3223,
"step": 5650
},
{
"epoch": 0.9244242620823873,
"grad_norm": 1.6872031688690186,
"learning_rate": 7.689968372394778e-06,
"loss": 1.2099,
"step": 5700
},
{
"epoch": 0.9325332468374959,
"grad_norm": 1.5728824138641357,
"learning_rate": 7.66969426648285e-06,
"loss": 1.2975,
"step": 5750
},
{
"epoch": 0.9406422315926046,
"grad_norm": 2.0520365238189697,
"learning_rate": 7.649420160570919e-06,
"loss": 1.2895,
"step": 5800
},
{
"epoch": 0.9487512163477133,
"grad_norm": 1.6994986534118652,
"learning_rate": 7.62914605465899e-06,
"loss": 1.2421,
"step": 5850
},
{
"epoch": 0.9568602011028219,
"grad_norm": 2.1418871879577637,
"learning_rate": 7.608871948747061e-06,
"loss": 1.2464,
"step": 5900
},
{
"epoch": 0.9649691858579306,
"grad_norm": 3.202744960784912,
"learning_rate": 7.588597842835131e-06,
"loss": 1.2565,
"step": 5950
},
{
"epoch": 0.9730781706130393,
"grad_norm": 2.3648183345794678,
"learning_rate": 7.568323736923202e-06,
"loss": 1.262,
"step": 6000
},
{
"epoch": 0.9811871553681479,
"grad_norm": 1.500482439994812,
"learning_rate": 7.548049631011273e-06,
"loss": 1.2281,
"step": 6050
},
{
"epoch": 0.9892961401232566,
"grad_norm": 2.1810970306396484,
"learning_rate": 7.527775525099344e-06,
"loss": 1.2318,
"step": 6100
},
{
"epoch": 0.9974051248783652,
"grad_norm": 1.3932199478149414,
"learning_rate": 7.507501419187414e-06,
"loss": 1.2344,
"step": 6150
},
{
"epoch": 1.0055141096334739,
"grad_norm": 2.48954701423645,
"learning_rate": 7.487227313275484e-06,
"loss": 1.3035,
"step": 6200
},
{
"epoch": 1.0136230943885824,
"grad_norm": 1.4061833620071411,
"learning_rate": 7.466953207363555e-06,
"loss": 1.308,
"step": 6250
},
{
"epoch": 1.0217320791436912,
"grad_norm": 1.1377198696136475,
"learning_rate": 7.4466791014516275e-06,
"loss": 1.2877,
"step": 6300
},
{
"epoch": 1.0298410638987998,
"grad_norm": 1.90823233127594,
"learning_rate": 7.426404995539697e-06,
"loss": 1.2371,
"step": 6350
},
{
"epoch": 1.0379500486539086,
"grad_norm": 1.5908371210098267,
"learning_rate": 7.406130889627768e-06,
"loss": 1.2861,
"step": 6400
},
{
"epoch": 1.0460590334090172,
"grad_norm": 2.892045259475708,
"learning_rate": 7.385856783715839e-06,
"loss": 1.2856,
"step": 6450
},
{
"epoch": 1.0541680181641258,
"grad_norm": 1.4647297859191895,
"learning_rate": 7.36558267780391e-06,
"loss": 1.2431,
"step": 6500
},
{
"epoch": 1.0622770029192345,
"grad_norm": 1.2635786533355713,
"learning_rate": 7.34530857189198e-06,
"loss": 1.3269,
"step": 6550
},
{
"epoch": 1.0703859876743431,
"grad_norm": 1.0140317678451538,
"learning_rate": 7.325034465980051e-06,
"loss": 1.2724,
"step": 6600
},
{
"epoch": 1.078494972429452,
"grad_norm": 1.1980386972427368,
"learning_rate": 7.304760360068122e-06,
"loss": 1.2909,
"step": 6650
},
{
"epoch": 1.0866039571845605,
"grad_norm": 2.5311107635498047,
"learning_rate": 7.284486254156193e-06,
"loss": 1.2296,
"step": 6700
},
{
"epoch": 1.094712941939669,
"grad_norm": 1.5901424884796143,
"learning_rate": 7.2642121482442626e-06,
"loss": 1.2136,
"step": 6750
},
{
"epoch": 1.1028219266947779,
"grad_norm": 1.0336142778396606,
"learning_rate": 7.243938042332333e-06,
"loss": 1.2908,
"step": 6800
},
{
"epoch": 1.1109309114498864,
"grad_norm": 2.5491976737976074,
"learning_rate": 7.223663936420404e-06,
"loss": 1.2256,
"step": 6850
},
{
"epoch": 1.1190398962049952,
"grad_norm": 2.592517137527466,
"learning_rate": 7.203389830508475e-06,
"loss": 1.2808,
"step": 6900
},
{
"epoch": 1.1271488809601038,
"grad_norm": 1.5294160842895508,
"learning_rate": 7.1831157245965456e-06,
"loss": 1.2279,
"step": 6950
},
{
"epoch": 1.1352578657152124,
"grad_norm": 2.665705680847168,
"learning_rate": 7.162841618684616e-06,
"loss": 1.2813,
"step": 7000
},
{
"epoch": 1.1433668504703212,
"grad_norm": 5.743466854095459,
"learning_rate": 7.142567512772687e-06,
"loss": 1.2601,
"step": 7050
},
{
"epoch": 1.1514758352254297,
"grad_norm": 1.979543924331665,
"learning_rate": 7.122293406860759e-06,
"loss": 1.2568,
"step": 7100
},
{
"epoch": 1.1595848199805385,
"grad_norm": 0.2160961776971817,
"learning_rate": 7.1020193009488294e-06,
"loss": 1.2234,
"step": 7150
},
{
"epoch": 1.167693804735647,
"grad_norm": 2.09420108795166,
"learning_rate": 7.0817451950369e-06,
"loss": 1.2727,
"step": 7200
},
{
"epoch": 1.1758027894907557,
"grad_norm": 1.4437772035598755,
"learning_rate": 7.06147108912497e-06,
"loss": 1.2535,
"step": 7250
},
{
"epoch": 1.1839117742458645,
"grad_norm": 2.6284403800964355,
"learning_rate": 7.041196983213041e-06,
"loss": 1.1969,
"step": 7300
},
{
"epoch": 1.192020759000973,
"grad_norm": 1.897250771522522,
"learning_rate": 7.020922877301112e-06,
"loss": 1.2579,
"step": 7350
},
{
"epoch": 1.2001297437560818,
"grad_norm": 1.591044545173645,
"learning_rate": 7.000648771389182e-06,
"loss": 1.2553,
"step": 7400
},
{
"epoch": 1.2082387285111904,
"grad_norm": 2.323927402496338,
"learning_rate": 6.980374665477253e-06,
"loss": 1.2802,
"step": 7450
},
{
"epoch": 1.216347713266299,
"grad_norm": 2.145848035812378,
"learning_rate": 6.960100559565324e-06,
"loss": 1.3271,
"step": 7500
},
{
"epoch": 1.2244566980214078,
"grad_norm": 1.1199519634246826,
"learning_rate": 6.939826453653395e-06,
"loss": 1.2948,
"step": 7550
},
{
"epoch": 1.2325656827765163,
"grad_norm": 1.5974798202514648,
"learning_rate": 6.919552347741465e-06,
"loss": 1.2925,
"step": 7600
},
{
"epoch": 1.240674667531625,
"grad_norm": 1.1883361339569092,
"learning_rate": 6.899278241829535e-06,
"loss": 1.2288,
"step": 7650
},
{
"epoch": 1.2487836522867337,
"grad_norm": 2.2361881732940674,
"learning_rate": 6.879004135917606e-06,
"loss": 1.2732,
"step": 7700
},
{
"epoch": 1.2568926370418423,
"grad_norm": 0.9524820446968079,
"learning_rate": 6.858730030005677e-06,
"loss": 1.3427,
"step": 7750
},
{
"epoch": 1.265001621796951,
"grad_norm": 2.439042091369629,
"learning_rate": 6.8384559240937475e-06,
"loss": 1.2915,
"step": 7800
},
{
"epoch": 1.2731106065520597,
"grad_norm": 2.7487218379974365,
"learning_rate": 6.818181818181818e-06,
"loss": 1.3025,
"step": 7850
},
{
"epoch": 1.2812195913071682,
"grad_norm": 1.3138405084609985,
"learning_rate": 6.79790771226989e-06,
"loss": 1.2678,
"step": 7900
},
{
"epoch": 1.289328576062277,
"grad_norm": 2.2130439281463623,
"learning_rate": 6.777633606357961e-06,
"loss": 1.2955,
"step": 7950
},
{
"epoch": 1.2974375608173856,
"grad_norm": 2.6008100509643555,
"learning_rate": 6.757359500446031e-06,
"loss": 1.2413,
"step": 8000
},
{
"epoch": 1.3055465455724944,
"grad_norm": 2.495473861694336,
"learning_rate": 6.737085394534102e-06,
"loss": 1.3061,
"step": 8050
},
{
"epoch": 1.313655530327603,
"grad_norm": 0.8150402307510376,
"learning_rate": 6.716811288622173e-06,
"loss": 1.3055,
"step": 8100
},
{
"epoch": 1.3217645150827115,
"grad_norm": 1.7135632038116455,
"learning_rate": 6.696537182710244e-06,
"loss": 1.2723,
"step": 8150
},
{
"epoch": 1.3298734998378203,
"grad_norm": 2.367229700088501,
"learning_rate": 6.6762630767983135e-06,
"loss": 1.2817,
"step": 8200
},
{
"epoch": 1.337982484592929,
"grad_norm": 1.9759521484375,
"learning_rate": 6.655988970886384e-06,
"loss": 1.2453,
"step": 8250
},
{
"epoch": 1.3460914693480377,
"grad_norm": 1.2894783020019531,
"learning_rate": 6.635714864974455e-06,
"loss": 1.338,
"step": 8300
},
{
"epoch": 1.3542004541031463,
"grad_norm": 0.24668912589550018,
"learning_rate": 6.615440759062526e-06,
"loss": 1.255,
"step": 8350
},
{
"epoch": 1.3623094388582548,
"grad_norm": 2.6564314365386963,
"learning_rate": 6.5951666531505966e-06,
"loss": 1.1793,
"step": 8400
},
{
"epoch": 1.3704184236133636,
"grad_norm": 1.1451148986816406,
"learning_rate": 6.574892547238667e-06,
"loss": 1.2857,
"step": 8450
},
{
"epoch": 1.3785274083684722,
"grad_norm": 2.693976879119873,
"learning_rate": 6.554618441326738e-06,
"loss": 1.2497,
"step": 8500
},
{
"epoch": 1.386636393123581,
"grad_norm": 2.3329060077667236,
"learning_rate": 6.534344335414808e-06,
"loss": 1.2665,
"step": 8550
},
{
"epoch": 1.3947453778786896,
"grad_norm": 2.251298427581787,
"learning_rate": 6.514070229502879e-06,
"loss": 1.33,
"step": 8600
},
{
"epoch": 1.4028543626337981,
"grad_norm": 3.0650289058685303,
"learning_rate": 6.4937961235909495e-06,
"loss": 1.2996,
"step": 8650
},
{
"epoch": 1.410963347388907,
"grad_norm": 2.1476645469665527,
"learning_rate": 6.47352201767902e-06,
"loss": 1.3232,
"step": 8700
},
{
"epoch": 1.4190723321440155,
"grad_norm": 1.236338496208191,
"learning_rate": 6.453247911767092e-06,
"loss": 1.2547,
"step": 8750
},
{
"epoch": 1.4271813168991243,
"grad_norm": 2.276726722717285,
"learning_rate": 6.432973805855163e-06,
"loss": 1.2067,
"step": 8800
},
{
"epoch": 1.4352903016542329,
"grad_norm": 1.871100664138794,
"learning_rate": 6.412699699943233e-06,
"loss": 1.238,
"step": 8850
},
{
"epoch": 1.4433992864093415,
"grad_norm": 1.9131008386611938,
"learning_rate": 6.392425594031304e-06,
"loss": 1.2436,
"step": 8900
},
{
"epoch": 1.4515082711644502,
"grad_norm": 1.6729109287261963,
"learning_rate": 6.372151488119375e-06,
"loss": 1.2732,
"step": 8950
},
{
"epoch": 1.4596172559195588,
"grad_norm": 1.4220985174179077,
"learning_rate": 6.351877382207446e-06,
"loss": 1.3043,
"step": 9000
},
{
"epoch": 1.4677262406746676,
"grad_norm": 1.5014866590499878,
"learning_rate": 6.331603276295516e-06,
"loss": 1.2501,
"step": 9050
},
{
"epoch": 1.4758352254297762,
"grad_norm": 1.6406453847885132,
"learning_rate": 6.311329170383586e-06,
"loss": 1.2457,
"step": 9100
},
{
"epoch": 1.4839442101848848,
"grad_norm": 2.01636004447937,
"learning_rate": 6.291055064471657e-06,
"loss": 1.1894,
"step": 9150
},
{
"epoch": 1.4920531949399936,
"grad_norm": 1.5200086832046509,
"learning_rate": 6.270780958559728e-06,
"loss": 1.1833,
"step": 9200
},
{
"epoch": 1.5001621796951021,
"grad_norm": 1.9229196310043335,
"learning_rate": 6.2505068526477985e-06,
"loss": 1.2219,
"step": 9250
},
{
"epoch": 1.508271164450211,
"grad_norm": 2.19616961479187,
"learning_rate": 6.230232746735869e-06,
"loss": 1.2788,
"step": 9300
},
{
"epoch": 1.5163801492053195,
"grad_norm": 1.4525929689407349,
"learning_rate": 6.20995864082394e-06,
"loss": 1.2963,
"step": 9350
},
{
"epoch": 1.524489133960428,
"grad_norm": 2.227957248687744,
"learning_rate": 6.189684534912011e-06,
"loss": 1.324,
"step": 9400
},
{
"epoch": 1.5325981187155369,
"grad_norm": 1.158480167388916,
"learning_rate": 6.1694104290000815e-06,
"loss": 1.2185,
"step": 9450
},
{
"epoch": 1.5407071034706454,
"grad_norm": 1.3199654817581177,
"learning_rate": 6.1491363230881514e-06,
"loss": 1.2714,
"step": 9500
},
{
"epoch": 1.5488160882257542,
"grad_norm": 2.110074043273926,
"learning_rate": 6.128862217176224e-06,
"loss": 1.256,
"step": 9550
},
{
"epoch": 1.5569250729808628,
"grad_norm": 1.322704553604126,
"learning_rate": 6.108588111264294e-06,
"loss": 1.1785,
"step": 9600
},
{
"epoch": 1.5650340577359714,
"grad_norm": 2.572434425354004,
"learning_rate": 6.0883140053523645e-06,
"loss": 1.2349,
"step": 9650
},
{
"epoch": 1.5731430424910802,
"grad_norm": 2.033780574798584,
"learning_rate": 6.068039899440435e-06,
"loss": 1.2203,
"step": 9700
},
{
"epoch": 1.5812520272461887,
"grad_norm": 1.9049453735351562,
"learning_rate": 6.047765793528506e-06,
"loss": 1.2043,
"step": 9750
},
{
"epoch": 1.5893610120012975,
"grad_norm": 2.882568597793579,
"learning_rate": 6.027491687616577e-06,
"loss": 1.217,
"step": 9800
},
{
"epoch": 1.597469996756406,
"grad_norm": 2.430227279663086,
"learning_rate": 6.0072175817046475e-06,
"loss": 1.2795,
"step": 9850
},
{
"epoch": 1.6055789815115147,
"grad_norm": 1.9852076768875122,
"learning_rate": 5.986943475792718e-06,
"loss": 1.2434,
"step": 9900
},
{
"epoch": 1.6136879662666233,
"grad_norm": 2.1857733726501465,
"learning_rate": 5.966669369880789e-06,
"loss": 1.2756,
"step": 9950
},
{
"epoch": 1.621796951021732,
"grad_norm": 2.7323496341705322,
"learning_rate": 5.946395263968859e-06,
"loss": 1.2921,
"step": 10000
},
{
"epoch": 1.6299059357768408,
"grad_norm": 2.2632415294647217,
"learning_rate": 5.92612115805693e-06,
"loss": 1.2763,
"step": 10050
},
{
"epoch": 1.6380149205319494,
"grad_norm": 2.0098931789398193,
"learning_rate": 5.9058470521450005e-06,
"loss": 1.292,
"step": 10100
},
{
"epoch": 1.646123905287058,
"grad_norm": 2.09377384185791,
"learning_rate": 5.885572946233071e-06,
"loss": 1.2738,
"step": 10150
},
{
"epoch": 1.6542328900421666,
"grad_norm": 2.292084217071533,
"learning_rate": 5.865298840321142e-06,
"loss": 1.2726,
"step": 10200
},
{
"epoch": 1.6623418747972754,
"grad_norm": 2.7795863151550293,
"learning_rate": 5.845024734409213e-06,
"loss": 1.2523,
"step": 10250
},
{
"epoch": 1.6704508595523841,
"grad_norm": 3.0921523571014404,
"learning_rate": 5.8247506284972835e-06,
"loss": 1.2684,
"step": 10300
},
{
"epoch": 1.6785598443074927,
"grad_norm": 1.036535382270813,
"learning_rate": 5.804476522585355e-06,
"loss": 1.2404,
"step": 10350
},
{
"epoch": 1.6866688290626013,
"grad_norm": 1.3650333881378174,
"learning_rate": 5.784202416673426e-06,
"loss": 1.247,
"step": 10400
},
{
"epoch": 1.6947778138177099,
"grad_norm": 2.558058977127075,
"learning_rate": 5.763928310761497e-06,
"loss": 1.2938,
"step": 10450
},
{
"epoch": 1.7028867985728187,
"grad_norm": 2.6575851440429688,
"learning_rate": 5.743654204849567e-06,
"loss": 1.2974,
"step": 10500
},
{
"epoch": 1.7109957833279275,
"grad_norm": 1.657400369644165,
"learning_rate": 5.723380098937637e-06,
"loss": 1.3268,
"step": 10550
},
{
"epoch": 1.719104768083036,
"grad_norm": 2.027801275253296,
"learning_rate": 5.703105993025708e-06,
"loss": 1.3029,
"step": 10600
},
{
"epoch": 1.7272137528381446,
"grad_norm": 1.3793126344680786,
"learning_rate": 5.682831887113779e-06,
"loss": 1.1812,
"step": 10650
},
{
"epoch": 1.7353227375932532,
"grad_norm": 1.5861361026763916,
"learning_rate": 5.6625577812018495e-06,
"loss": 1.2871,
"step": 10700
},
{
"epoch": 1.743431722348362,
"grad_norm": 2.3719797134399414,
"learning_rate": 5.64228367528992e-06,
"loss": 1.1947,
"step": 10750
},
{
"epoch": 1.7515407071034708,
"grad_norm": 2.675689697265625,
"learning_rate": 5.622009569377991e-06,
"loss": 1.2902,
"step": 10800
},
{
"epoch": 1.7596496918585793,
"grad_norm": 2.018069267272949,
"learning_rate": 5.601735463466062e-06,
"loss": 1.1827,
"step": 10850
},
{
"epoch": 1.767758676613688,
"grad_norm": 1.1704685688018799,
"learning_rate": 5.5814613575541325e-06,
"loss": 1.1498,
"step": 10900
},
{
"epoch": 1.7758676613687965,
"grad_norm": 2.0111939907073975,
"learning_rate": 5.561187251642202e-06,
"loss": 1.2358,
"step": 10950
},
{
"epoch": 1.7839766461239053,
"grad_norm": 3.043297529220581,
"learning_rate": 5.540913145730273e-06,
"loss": 1.26,
"step": 11000
},
{
"epoch": 1.792085630879014,
"grad_norm": 3.0858490467071533,
"learning_rate": 5.520639039818344e-06,
"loss": 1.2519,
"step": 11050
},
{
"epoch": 1.8001946156341226,
"grad_norm": 2.669131278991699,
"learning_rate": 5.500364933906415e-06,
"loss": 1.3174,
"step": 11100
},
{
"epoch": 1.8083036003892312,
"grad_norm": 1.4813052415847778,
"learning_rate": 5.480090827994486e-06,
"loss": 1.2528,
"step": 11150
},
{
"epoch": 1.8164125851443398,
"grad_norm": 2.2234039306640625,
"learning_rate": 5.459816722082557e-06,
"loss": 1.2984,
"step": 11200
},
{
"epoch": 1.8245215698994486,
"grad_norm": 3.4711413383483887,
"learning_rate": 5.439542616170628e-06,
"loss": 1.2174,
"step": 11250
},
{
"epoch": 1.8326305546545574,
"grad_norm": 1.9171231985092163,
"learning_rate": 5.4192685102586985e-06,
"loss": 1.2657,
"step": 11300
},
{
"epoch": 1.840739539409666,
"grad_norm": 1.7139352560043335,
"learning_rate": 5.398994404346769e-06,
"loss": 1.274,
"step": 11350
},
{
"epoch": 1.8488485241647745,
"grad_norm": 2.9915032386779785,
"learning_rate": 5.37872029843484e-06,
"loss": 1.1745,
"step": 11400
},
{
"epoch": 1.856957508919883,
"grad_norm": 3.0832788944244385,
"learning_rate": 5.35844619252291e-06,
"loss": 1.3089,
"step": 11450
},
{
"epoch": 1.8650664936749919,
"grad_norm": 2.364255905151367,
"learning_rate": 5.338172086610981e-06,
"loss": 1.2146,
"step": 11500
},
{
"epoch": 1.8731754784301007,
"grad_norm": 3.9903273582458496,
"learning_rate": 5.3178979806990514e-06,
"loss": 1.3103,
"step": 11550
},
{
"epoch": 1.8812844631852093,
"grad_norm": 3.4954657554626465,
"learning_rate": 5.297623874787122e-06,
"loss": 1.1682,
"step": 11600
},
{
"epoch": 1.8893934479403178,
"grad_norm": 1.927339792251587,
"learning_rate": 5.277349768875193e-06,
"loss": 1.224,
"step": 11650
},
{
"epoch": 1.8975024326954264,
"grad_norm": 1.446385145187378,
"learning_rate": 5.257075662963264e-06,
"loss": 1.3907,
"step": 11700
},
{
"epoch": 1.9056114174505352,
"grad_norm": 2.112168312072754,
"learning_rate": 5.2368015570513345e-06,
"loss": 1.2041,
"step": 11750
},
{
"epoch": 1.913720402205644,
"grad_norm": 2.601565361022949,
"learning_rate": 5.216527451139405e-06,
"loss": 1.2827,
"step": 11800
},
{
"epoch": 1.9218293869607526,
"grad_norm": 1.9448506832122803,
"learning_rate": 5.196253345227475e-06,
"loss": 1.1899,
"step": 11850
},
{
"epoch": 1.9299383717158611,
"grad_norm": 1.7583825588226318,
"learning_rate": 5.175979239315546e-06,
"loss": 1.2681,
"step": 11900
},
{
"epoch": 1.9380473564709697,
"grad_norm": 1.319887399673462,
"learning_rate": 5.155705133403617e-06,
"loss": 1.2311,
"step": 11950
},
{
"epoch": 1.9461563412260785,
"grad_norm": 1.5955997705459595,
"learning_rate": 5.135431027491688e-06,
"loss": 1.2043,
"step": 12000
},
{
"epoch": 1.9542653259811873,
"grad_norm": 2.112924337387085,
"learning_rate": 5.115156921579759e-06,
"loss": 1.21,
"step": 12050
},
{
"epoch": 1.9623743107362959,
"grad_norm": 2.8387506008148193,
"learning_rate": 5.09488281566783e-06,
"loss": 1.253,
"step": 12100
},
{
"epoch": 1.9704832954914044,
"grad_norm": 2.55635142326355,
"learning_rate": 5.0746087097559005e-06,
"loss": 1.2277,
"step": 12150
},
{
"epoch": 1.978592280246513,
"grad_norm": 2.2216525077819824,
"learning_rate": 5.054334603843971e-06,
"loss": 1.1755,
"step": 12200
},
{
"epoch": 1.9867012650016218,
"grad_norm": 1.1123380661010742,
"learning_rate": 5.034060497932042e-06,
"loss": 1.229,
"step": 12250
},
{
"epoch": 1.9948102497567306,
"grad_norm": 2.5723652839660645,
"learning_rate": 5.013786392020113e-06,
"loss": 1.2028,
"step": 12300
},
{
"epoch": 2.002919234511839,
"grad_norm": 2.5151309967041016,
"learning_rate": 4.993512286108183e-06,
"loss": 1.2314,
"step": 12350
},
{
"epoch": 2.0110282192669477,
"grad_norm": 1.5693440437316895,
"learning_rate": 4.973238180196253e-06,
"loss": 1.2771,
"step": 12400
},
{
"epoch": 2.0191372040220563,
"grad_norm": 4.562756538391113,
"learning_rate": 4.952964074284324e-06,
"loss": 1.2614,
"step": 12450
},
{
"epoch": 2.027246188777165,
"grad_norm": 0.9000415802001953,
"learning_rate": 4.932689968372395e-06,
"loss": 1.2383,
"step": 12500
},
{
"epoch": 2.035355173532274,
"grad_norm": 1.2384685277938843,
"learning_rate": 4.912415862460466e-06,
"loss": 1.2501,
"step": 12550
},
{
"epoch": 2.0434641582873825,
"grad_norm": 1.771003246307373,
"learning_rate": 4.892141756548537e-06,
"loss": 1.2603,
"step": 12600
},
{
"epoch": 2.051573143042491,
"grad_norm": 2.1732709407806396,
"learning_rate": 4.871867650636607e-06,
"loss": 1.1763,
"step": 12650
},
{
"epoch": 2.0596821277975996,
"grad_norm": 1.5497926473617554,
"learning_rate": 4.851593544724678e-06,
"loss": 1.2558,
"step": 12700
},
{
"epoch": 2.067791112552708,
"grad_norm": 1.6834354400634766,
"learning_rate": 4.831319438812749e-06,
"loss": 1.2592,
"step": 12750
},
{
"epoch": 2.075900097307817,
"grad_norm": 1.920637845993042,
"learning_rate": 4.811045332900819e-06,
"loss": 1.279,
"step": 12800
},
{
"epoch": 2.084009082062926,
"grad_norm": 1.851440191268921,
"learning_rate": 4.79077122698889e-06,
"loss": 1.2904,
"step": 12850
},
{
"epoch": 2.0921180668180344,
"grad_norm": 3.262347936630249,
"learning_rate": 4.770497121076961e-06,
"loss": 1.1763,
"step": 12900
},
{
"epoch": 2.100227051573143,
"grad_norm": 1.7690410614013672,
"learning_rate": 4.750223015165032e-06,
"loss": 1.246,
"step": 12950
},
{
"epoch": 2.1083360363282515,
"grad_norm": 2.3499839305877686,
"learning_rate": 4.7299489092531024e-06,
"loss": 1.1969,
"step": 13000
},
{
"epoch": 2.1164450210833605,
"grad_norm": 2.815704822540283,
"learning_rate": 4.709674803341173e-06,
"loss": 1.2299,
"step": 13050
},
{
"epoch": 2.124554005838469,
"grad_norm": 1.6346577405929565,
"learning_rate": 4.689400697429244e-06,
"loss": 1.1941,
"step": 13100
},
{
"epoch": 2.1326629905935777,
"grad_norm": 1.3435848951339722,
"learning_rate": 4.669126591517315e-06,
"loss": 1.2193,
"step": 13150
},
{
"epoch": 2.1407719753486862,
"grad_norm": 2.11517596244812,
"learning_rate": 4.6488524856053854e-06,
"loss": 1.2868,
"step": 13200
},
{
"epoch": 2.148880960103795,
"grad_norm": 1.5125885009765625,
"learning_rate": 4.628578379693456e-06,
"loss": 1.2034,
"step": 13250
},
{
"epoch": 2.156989944858904,
"grad_norm": 2.0965402126312256,
"learning_rate": 4.608304273781526e-06,
"loss": 1.2484,
"step": 13300
},
{
"epoch": 2.1650989296140124,
"grad_norm": 1.9137533903121948,
"learning_rate": 4.588030167869597e-06,
"loss": 1.2531,
"step": 13350
},
{
"epoch": 2.173207914369121,
"grad_norm": 2.630784749984741,
"learning_rate": 4.5677560619576685e-06,
"loss": 1.2074,
"step": 13400
},
{
"epoch": 2.1813168991242295,
"grad_norm": 1.4360569715499878,
"learning_rate": 4.547481956045739e-06,
"loss": 1.22,
"step": 13450
},
{
"epoch": 2.189425883879338,
"grad_norm": 2.066938638687134,
"learning_rate": 4.52720785013381e-06,
"loss": 1.1438,
"step": 13500
},
{
"epoch": 2.197534868634447,
"grad_norm": 1.7182413339614868,
"learning_rate": 4.50693374422188e-06,
"loss": 1.2681,
"step": 13550
},
{
"epoch": 2.2056438533895557,
"grad_norm": 1.706874132156372,
"learning_rate": 4.486659638309951e-06,
"loss": 1.1888,
"step": 13600
},
{
"epoch": 2.2137528381446643,
"grad_norm": 1.8634135723114014,
"learning_rate": 4.466385532398021e-06,
"loss": 1.2953,
"step": 13650
},
{
"epoch": 2.221861822899773,
"grad_norm": 1.77712881565094,
"learning_rate": 4.446111426486092e-06,
"loss": 1.2461,
"step": 13700
},
{
"epoch": 2.2299708076548814,
"grad_norm": 1.9737837314605713,
"learning_rate": 4.425837320574163e-06,
"loss": 1.2856,
"step": 13750
},
{
"epoch": 2.2380797924099904,
"grad_norm": 2.7296142578125,
"learning_rate": 4.405563214662234e-06,
"loss": 1.2124,
"step": 13800
},
{
"epoch": 2.246188777165099,
"grad_norm": 3.5112380981445312,
"learning_rate": 4.385289108750304e-06,
"loss": 1.2143,
"step": 13850
},
{
"epoch": 2.2542977619202076,
"grad_norm": 3.4935994148254395,
"learning_rate": 4.365015002838375e-06,
"loss": 1.3156,
"step": 13900
},
{
"epoch": 2.262406746675316,
"grad_norm": 2.2354025840759277,
"learning_rate": 4.344740896926446e-06,
"loss": 1.3025,
"step": 13950
},
{
"epoch": 2.2705157314304247,
"grad_norm": 2.089087724685669,
"learning_rate": 4.324466791014517e-06,
"loss": 1.2249,
"step": 14000
},
{
"epoch": 2.2786247161855337,
"grad_norm": 2.6738228797912598,
"learning_rate": 4.304192685102587e-06,
"loss": 1.1766,
"step": 14050
},
{
"epoch": 2.2867337009406423,
"grad_norm": 1.6028488874435425,
"learning_rate": 4.283918579190658e-06,
"loss": 1.2806,
"step": 14100
},
{
"epoch": 2.294842685695751,
"grad_norm": 2.742100954055786,
"learning_rate": 4.263644473278729e-06,
"loss": 1.2241,
"step": 14150
},
{
"epoch": 2.3029516704508595,
"grad_norm": 1.2533172369003296,
"learning_rate": 4.2433703673668e-06,
"loss": 1.2563,
"step": 14200
},
{
"epoch": 2.311060655205968,
"grad_norm": 2.783311128616333,
"learning_rate": 4.22309626145487e-06,
"loss": 1.2382,
"step": 14250
},
{
"epoch": 2.319169639961077,
"grad_norm": 1.9947431087493896,
"learning_rate": 4.202822155542941e-06,
"loss": 1.2533,
"step": 14300
},
{
"epoch": 2.3272786247161856,
"grad_norm": 2.8825254440307617,
"learning_rate": 4.182548049631012e-06,
"loss": 1.2332,
"step": 14350
},
{
"epoch": 2.335387609471294,
"grad_norm": 1.9132847785949707,
"learning_rate": 4.162273943719083e-06,
"loss": 1.225,
"step": 14400
},
{
"epoch": 2.3434965942264028,
"grad_norm": 2.9740896224975586,
"learning_rate": 4.1419998378071526e-06,
"loss": 1.302,
"step": 14450
},
{
"epoch": 2.3516055789815113,
"grad_norm": 2.5434772968292236,
"learning_rate": 4.121725731895223e-06,
"loss": 1.2857,
"step": 14500
},
{
"epoch": 2.3597145637366204,
"grad_norm": 0.8146458864212036,
"learning_rate": 4.101451625983294e-06,
"loss": 1.1667,
"step": 14550
},
{
"epoch": 2.367823548491729,
"grad_norm": 2.3713395595550537,
"learning_rate": 4.081177520071365e-06,
"loss": 1.1799,
"step": 14600
},
{
"epoch": 2.3759325332468375,
"grad_norm": 2.5800139904022217,
"learning_rate": 4.0609034141594364e-06,
"loss": 1.2639,
"step": 14650
},
{
"epoch": 2.384041518001946,
"grad_norm": 8.0242338180542,
"learning_rate": 4.040629308247507e-06,
"loss": 1.2168,
"step": 14700
},
{
"epoch": 2.3921505027570547,
"grad_norm": 0.9941585063934326,
"learning_rate": 4.020355202335577e-06,
"loss": 1.2966,
"step": 14750
},
{
"epoch": 2.4002594875121637,
"grad_norm": 0.7390837669372559,
"learning_rate": 4.000081096423648e-06,
"loss": 1.3237,
"step": 14800
},
{
"epoch": 2.4083684722672722,
"grad_norm": 0.5450477004051208,
"learning_rate": 3.979806990511719e-06,
"loss": 1.284,
"step": 14850
},
{
"epoch": 2.416477457022381,
"grad_norm": 1.7690049409866333,
"learning_rate": 3.959532884599789e-06,
"loss": 1.286,
"step": 14900
},
{
"epoch": 2.4245864417774894,
"grad_norm": 2.655095100402832,
"learning_rate": 3.93925877868786e-06,
"loss": 1.1513,
"step": 14950
},
{
"epoch": 2.432695426532598,
"grad_norm": 1.2846322059631348,
"learning_rate": 3.918984672775931e-06,
"loss": 1.2024,
"step": 15000
},
{
"epoch": 2.440804411287707,
"grad_norm": 6.425904273986816,
"learning_rate": 3.898710566864002e-06,
"loss": 1.2462,
"step": 15050
},
{
"epoch": 2.4489133960428155,
"grad_norm": 2.2578928470611572,
"learning_rate": 3.878436460952072e-06,
"loss": 1.2289,
"step": 15100
},
{
"epoch": 2.457022380797924,
"grad_norm": 3.1006276607513428,
"learning_rate": 3.858162355040143e-06,
"loss": 1.2839,
"step": 15150
},
{
"epoch": 2.4651313655530327,
"grad_norm": 2.598376512527466,
"learning_rate": 3.837888249128214e-06,
"loss": 1.2496,
"step": 15200
},
{
"epoch": 2.4732403503081413,
"grad_norm": 0.7084365487098694,
"learning_rate": 3.817614143216285e-06,
"loss": 1.2325,
"step": 15250
},
{
"epoch": 2.48134933506325,
"grad_norm": 2.280824661254883,
"learning_rate": 3.797340037304355e-06,
"loss": 1.1541,
"step": 15300
},
{
"epoch": 2.489458319818359,
"grad_norm": 10.274874687194824,
"learning_rate": 3.7770659313924257e-06,
"loss": 1.1489,
"step": 15350
},
{
"epoch": 2.4975673045734674,
"grad_norm": 2.2401506900787354,
"learning_rate": 3.7567918254804964e-06,
"loss": 1.1696,
"step": 15400
},
{
"epoch": 2.505676289328576,
"grad_norm": 2.0671870708465576,
"learning_rate": 3.7365177195685676e-06,
"loss": 1.2512,
"step": 15450
},
{
"epoch": 2.5137852740836846,
"grad_norm": 2.2912776470184326,
"learning_rate": 3.7162436136566384e-06,
"loss": 1.2278,
"step": 15500
},
{
"epoch": 2.5218942588387936,
"grad_norm": 1.7719197273254395,
"learning_rate": 3.6959695077447087e-06,
"loss": 1.2984,
"step": 15550
},
{
"epoch": 2.530003243593902,
"grad_norm": 1.9428201913833618,
"learning_rate": 3.6756954018327795e-06,
"loss": 1.2511,
"step": 15600
},
{
"epoch": 2.5381122283490107,
"grad_norm": 2.804948091506958,
"learning_rate": 3.6554212959208502e-06,
"loss": 1.2599,
"step": 15650
},
{
"epoch": 2.5462212131041193,
"grad_norm": 2.306248188018799,
"learning_rate": 3.635147190008921e-06,
"loss": 1.2252,
"step": 15700
},
{
"epoch": 2.554330197859228,
"grad_norm": 2.2706100940704346,
"learning_rate": 3.6148730840969913e-06,
"loss": 1.1874,
"step": 15750
},
{
"epoch": 2.5624391826143365,
"grad_norm": 1.6053426265716553,
"learning_rate": 3.594598978185062e-06,
"loss": 1.1885,
"step": 15800
},
{
"epoch": 2.5705481673694455,
"grad_norm": 2.0183823108673096,
"learning_rate": 3.5743248722731332e-06,
"loss": 1.2176,
"step": 15850
},
{
"epoch": 2.578657152124554,
"grad_norm": 1.6887139081954956,
"learning_rate": 3.554050766361204e-06,
"loss": 1.1165,
"step": 15900
},
{
"epoch": 2.5867661368796626,
"grad_norm": 1.6139768362045288,
"learning_rate": 3.5337766604492747e-06,
"loss": 1.2512,
"step": 15950
},
{
"epoch": 2.594875121634771,
"grad_norm": 1.026294231414795,
"learning_rate": 3.513502554537345e-06,
"loss": 1.2485,
"step": 16000
},
{
"epoch": 2.60298410638988,
"grad_norm": 2.261563777923584,
"learning_rate": 3.493228448625416e-06,
"loss": 1.2493,
"step": 16050
},
{
"epoch": 2.6110930911449888,
"grad_norm": 2.9573357105255127,
"learning_rate": 3.4729543427134866e-06,
"loss": 1.2542,
"step": 16100
},
{
"epoch": 2.6192020759000973,
"grad_norm": 2.836587905883789,
"learning_rate": 3.4526802368015573e-06,
"loss": 1.2571,
"step": 16150
},
{
"epoch": 2.627311060655206,
"grad_norm": 5.188553333282471,
"learning_rate": 3.4324061308896276e-06,
"loss": 1.3193,
"step": 16200
},
{
"epoch": 2.6354200454103145,
"grad_norm": 1.9180452823638916,
"learning_rate": 3.412132024977699e-06,
"loss": 1.2382,
"step": 16250
},
{
"epoch": 2.643529030165423,
"grad_norm": 2.1819140911102295,
"learning_rate": 3.3918579190657696e-06,
"loss": 1.2826,
"step": 16300
},
{
"epoch": 2.651638014920532,
"grad_norm": 2.264775037765503,
"learning_rate": 3.3715838131538403e-06,
"loss": 1.2507,
"step": 16350
},
{
"epoch": 2.6597469996756407,
"grad_norm": 1.7436145544052124,
"learning_rate": 3.351309707241911e-06,
"loss": 1.2057,
"step": 16400
},
{
"epoch": 2.6678559844307492,
"grad_norm": 2.5168802738189697,
"learning_rate": 3.3310356013299814e-06,
"loss": 1.215,
"step": 16450
},
{
"epoch": 2.675964969185858,
"grad_norm": 1.952141284942627,
"learning_rate": 3.310761495418052e-06,
"loss": 1.255,
"step": 16500
},
{
"epoch": 2.684073953940967,
"grad_norm": 1.5202018022537231,
"learning_rate": 3.290487389506123e-06,
"loss": 1.224,
"step": 16550
},
{
"epoch": 2.6921829386960754,
"grad_norm": 1.8313968181610107,
"learning_rate": 3.2702132835941937e-06,
"loss": 1.3158,
"step": 16600
},
{
"epoch": 2.700291923451184,
"grad_norm": 2.5669538974761963,
"learning_rate": 3.249939177682265e-06,
"loss": 1.2905,
"step": 16650
},
{
"epoch": 2.7084009082062925,
"grad_norm": 2.714341878890991,
"learning_rate": 3.2296650717703356e-06,
"loss": 1.2508,
"step": 16700
},
{
"epoch": 2.716509892961401,
"grad_norm": 2.8722524642944336,
"learning_rate": 3.209390965858406e-06,
"loss": 1.2905,
"step": 16750
},
{
"epoch": 2.7246188777165097,
"grad_norm": 2.822148323059082,
"learning_rate": 3.1891168599464767e-06,
"loss": 1.1907,
"step": 16800
},
{
"epoch": 2.7327278624716187,
"grad_norm": 0.6179723143577576,
"learning_rate": 3.1688427540345474e-06,
"loss": 1.276,
"step": 16850
},
{
"epoch": 2.7408368472267273,
"grad_norm": 1.800058126449585,
"learning_rate": 3.148568648122618e-06,
"loss": 1.1746,
"step": 16900
},
{
"epoch": 2.748945831981836,
"grad_norm": 2.2209925651550293,
"learning_rate": 3.1282945422106885e-06,
"loss": 1.245,
"step": 16950
},
{
"epoch": 2.7570548167369444,
"grad_norm": 2.067692995071411,
"learning_rate": 3.1080204362987593e-06,
"loss": 1.2182,
"step": 17000
},
{
"epoch": 2.7651638014920534,
"grad_norm": 2.8044259548187256,
"learning_rate": 3.0877463303868304e-06,
"loss": 1.1653,
"step": 17050
},
{
"epoch": 2.773272786247162,
"grad_norm": 2.077935218811035,
"learning_rate": 3.067472224474901e-06,
"loss": 1.2223,
"step": 17100
},
{
"epoch": 2.7813817710022706,
"grad_norm": 0.14890266954898834,
"learning_rate": 3.047198118562972e-06,
"loss": 1.2318,
"step": 17150
},
{
"epoch": 2.789490755757379,
"grad_norm": 1.6987643241882324,
"learning_rate": 3.0269240126510423e-06,
"loss": 1.2607,
"step": 17200
},
{
"epoch": 2.7975997405124877,
"grad_norm": 2.667273759841919,
"learning_rate": 3.006649906739113e-06,
"loss": 1.2486,
"step": 17250
},
{
"epoch": 2.8057087252675963,
"grad_norm": 1.8006951808929443,
"learning_rate": 2.9863758008271838e-06,
"loss": 1.2674,
"step": 17300
},
{
"epoch": 2.8138177100227053,
"grad_norm": 3.4938597679138184,
"learning_rate": 2.9661016949152545e-06,
"loss": 1.2782,
"step": 17350
},
{
"epoch": 2.821926694777814,
"grad_norm": 3.029115915298462,
"learning_rate": 2.945827589003325e-06,
"loss": 1.2397,
"step": 17400
},
{
"epoch": 2.8300356795329225,
"grad_norm": 1.7525807619094849,
"learning_rate": 2.9255534830913956e-06,
"loss": 1.1576,
"step": 17450
},
{
"epoch": 2.838144664288031,
"grad_norm": 2.326188564300537,
"learning_rate": 2.905279377179467e-06,
"loss": 1.2581,
"step": 17500
},
{
"epoch": 2.84625364904314,
"grad_norm": 2.2817983627319336,
"learning_rate": 2.8850052712675375e-06,
"loss": 1.1471,
"step": 17550
},
{
"epoch": 2.8543626337982486,
"grad_norm": 1.9472275972366333,
"learning_rate": 2.8647311653556083e-06,
"loss": 1.2784,
"step": 17600
},
{
"epoch": 2.862471618553357,
"grad_norm": 2.6982924938201904,
"learning_rate": 2.8444570594436786e-06,
"loss": 1.181,
"step": 17650
},
{
"epoch": 2.8705806033084658,
"grad_norm": 2.204470634460449,
"learning_rate": 2.8241829535317494e-06,
"loss": 1.3165,
"step": 17700
},
{
"epoch": 2.8786895880635743,
"grad_norm": 1.8621634244918823,
"learning_rate": 2.80390884761982e-06,
"loss": 1.1949,
"step": 17750
},
{
"epoch": 2.886798572818683,
"grad_norm": 2.0357561111450195,
"learning_rate": 2.783634741707891e-06,
"loss": 1.2433,
"step": 17800
},
{
"epoch": 2.894907557573792,
"grad_norm": 1.873806118965149,
"learning_rate": 2.7633606357959612e-06,
"loss": 1.2737,
"step": 17850
},
{
"epoch": 2.9030165423289005,
"grad_norm": 2.299546241760254,
"learning_rate": 2.7430865298840324e-06,
"loss": 1.2296,
"step": 17900
},
{
"epoch": 2.911125527084009,
"grad_norm": 2.267756938934326,
"learning_rate": 2.722812423972103e-06,
"loss": 1.2348,
"step": 17950
},
{
"epoch": 2.9192345118391176,
"grad_norm": 3.639319658279419,
"learning_rate": 2.702538318060174e-06,
"loss": 1.2168,
"step": 18000
},
{
"epoch": 2.9273434965942267,
"grad_norm": 2.825929641723633,
"learning_rate": 2.6822642121482447e-06,
"loss": 1.2045,
"step": 18050
},
{
"epoch": 2.9354524813493352,
"grad_norm": 1.6339542865753174,
"learning_rate": 2.661990106236315e-06,
"loss": 1.2303,
"step": 18100
},
{
"epoch": 2.943561466104444,
"grad_norm": 1.8570992946624756,
"learning_rate": 2.6417160003243857e-06,
"loss": 1.2268,
"step": 18150
},
{
"epoch": 2.9516704508595524,
"grad_norm": 2.2465381622314453,
"learning_rate": 2.6214418944124565e-06,
"loss": 1.1506,
"step": 18200
},
{
"epoch": 2.959779435614661,
"grad_norm": 2.456606149673462,
"learning_rate": 2.6011677885005272e-06,
"loss": 1.2371,
"step": 18250
},
{
"epoch": 2.9678884203697695,
"grad_norm": 1.7189408540725708,
"learning_rate": 2.5808936825885984e-06,
"loss": 1.1677,
"step": 18300
},
{
"epoch": 2.975997405124878,
"grad_norm": 1.5492075681686401,
"learning_rate": 2.5606195766766687e-06,
"loss": 1.2146,
"step": 18350
},
{
"epoch": 2.984106389879987,
"grad_norm": 2.5447022914886475,
"learning_rate": 2.5403454707647395e-06,
"loss": 1.1574,
"step": 18400
},
{
"epoch": 2.9922153746350957,
"grad_norm": 1.8842716217041016,
"learning_rate": 2.5200713648528103e-06,
"loss": 1.2093,
"step": 18450
}
],
"logging_steps": 50,
"max_steps": 24664,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.035905534827063e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}