{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 18498, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008108984755108661, "grad_norm": 1.6712403297424316, "learning_rate": 9.980942340442787e-06, "loss": 1.5976, "step": 50 }, { "epoch": 0.016217969510217322, "grad_norm": 1.9675096273422241, "learning_rate": 9.960668234530858e-06, "loss": 1.4389, "step": 100 }, { "epoch": 0.02432695426532598, "grad_norm": 4.630291938781738, "learning_rate": 9.940394128618928e-06, "loss": 1.387, "step": 150 }, { "epoch": 0.032435939020434644, "grad_norm": 2.186373710632324, "learning_rate": 9.920120022707e-06, "loss": 1.4025, "step": 200 }, { "epoch": 0.0405449237755433, "grad_norm": 1.8459932804107666, "learning_rate": 9.89984591679507e-06, "loss": 1.3795, "step": 250 }, { "epoch": 0.04865390853065196, "grad_norm": 1.8652907609939575, "learning_rate": 9.879571810883141e-06, "loss": 1.3835, "step": 300 }, { "epoch": 0.056762893285760625, "grad_norm": 1.97219979763031, "learning_rate": 9.859297704971211e-06, "loss": 1.3549, "step": 350 }, { "epoch": 0.06487187804086929, "grad_norm": 1.764737844467163, "learning_rate": 9.839023599059281e-06, "loss": 1.3785, "step": 400 }, { "epoch": 0.07298086279597794, "grad_norm": 1.37200927734375, "learning_rate": 9.818749493147353e-06, "loss": 1.397, "step": 450 }, { "epoch": 0.0810898475510866, "grad_norm": 0.783941388130188, "learning_rate": 9.798475387235423e-06, "loss": 1.4055, "step": 500 }, { "epoch": 0.08919883230619527, "grad_norm": 2.4082376956939697, "learning_rate": 9.778201281323494e-06, "loss": 1.2851, "step": 550 }, { "epoch": 0.09730781706130393, "grad_norm": 1.2580335140228271, "learning_rate": 9.757927175411566e-06, "loss": 1.3557, "step": 600 }, { "epoch": 0.10541680181641258, "grad_norm": 1.688684105873108, "learning_rate": 9.737653069499636e-06, "loss": 1.3462, "step": 650 }, { "epoch": 0.11352578657152125, "grad_norm": 1.7642041444778442, "learning_rate": 9.717378963587707e-06, "loss": 1.3122, "step": 700 }, { "epoch": 0.1216347713266299, "grad_norm": 1.5700275897979736, "learning_rate": 9.697104857675777e-06, "loss": 1.3248, "step": 750 }, { "epoch": 0.12974375608173858, "grad_norm": 1.8756022453308105, "learning_rate": 9.676830751763849e-06, "loss": 1.3364, "step": 800 }, { "epoch": 0.13785274083684723, "grad_norm": 2.0151844024658203, "learning_rate": 9.656556645851919e-06, "loss": 1.3566, "step": 850 }, { "epoch": 0.1459617255919559, "grad_norm": 1.8269509077072144, "learning_rate": 9.636282539939989e-06, "loss": 1.3291, "step": 900 }, { "epoch": 0.15407071034706454, "grad_norm": 2.033515453338623, "learning_rate": 9.61600843402806e-06, "loss": 1.3239, "step": 950 }, { "epoch": 0.1621796951021732, "grad_norm": 1.2932145595550537, "learning_rate": 9.59573432811613e-06, "loss": 1.3458, "step": 1000 }, { "epoch": 0.17028867985728185, "grad_norm": 1.7336113452911377, "learning_rate": 9.575460222204202e-06, "loss": 1.3227, "step": 1050 }, { "epoch": 0.17839766461239054, "grad_norm": 2.2469635009765625, "learning_rate": 9.555186116292272e-06, "loss": 1.3055, "step": 1100 }, { "epoch": 0.1865066493674992, "grad_norm": 1.9851100444793701, "learning_rate": 9.534912010380343e-06, "loss": 1.2713, "step": 1150 }, { "epoch": 0.19461563412260785, "grad_norm": 1.896209716796875, "learning_rate": 9.514637904468413e-06, "loss": 1.3926, "step": 1200 }, { "epoch": 0.2027246188777165, "grad_norm": 1.941974401473999, "learning_rate": 9.494363798556485e-06, "loss": 1.2845, "step": 1250 }, { "epoch": 0.21083360363282516, "grad_norm": 1.7393105030059814, "learning_rate": 9.474089692644555e-06, "loss": 1.3989, "step": 1300 }, { "epoch": 0.21894258838793382, "grad_norm": 2.3268821239471436, "learning_rate": 9.453815586732625e-06, "loss": 1.3939, "step": 1350 }, { "epoch": 0.2270515731430425, "grad_norm": 2.2540266513824463, "learning_rate": 9.433541480820698e-06, "loss": 1.3421, "step": 1400 }, { "epoch": 0.23516055789815116, "grad_norm": 1.5483736991882324, "learning_rate": 9.413267374908768e-06, "loss": 1.3236, "step": 1450 }, { "epoch": 0.2432695426532598, "grad_norm": 1.2663439512252808, "learning_rate": 9.392993268996838e-06, "loss": 1.3333, "step": 1500 }, { "epoch": 0.25137852740836847, "grad_norm": 0.978643536567688, "learning_rate": 9.37271916308491e-06, "loss": 1.3522, "step": 1550 }, { "epoch": 0.25948751216347715, "grad_norm": 2.664464235305786, "learning_rate": 9.35244505717298e-06, "loss": 1.3969, "step": 1600 }, { "epoch": 0.2675964969185858, "grad_norm": 0.9333709478378296, "learning_rate": 9.332170951261051e-06, "loss": 1.3423, "step": 1650 }, { "epoch": 0.27570548167369446, "grad_norm": 2.0592761039733887, "learning_rate": 9.31189684534912e-06, "loss": 1.3274, "step": 1700 }, { "epoch": 0.2838144664288031, "grad_norm": 1.6379183530807495, "learning_rate": 9.291622739437192e-06, "loss": 1.321, "step": 1750 }, { "epoch": 0.2919234511839118, "grad_norm": 1.659201979637146, "learning_rate": 9.271348633525262e-06, "loss": 1.3662, "step": 1800 }, { "epoch": 0.30003243593902046, "grad_norm": 2.4258599281311035, "learning_rate": 9.251074527613332e-06, "loss": 1.2489, "step": 1850 }, { "epoch": 0.3081414206941291, "grad_norm": 2.314232349395752, "learning_rate": 9.230800421701404e-06, "loss": 1.3549, "step": 1900 }, { "epoch": 0.31625040544923777, "grad_norm": 1.78170645236969, "learning_rate": 9.210526315789474e-06, "loss": 1.3405, "step": 1950 }, { "epoch": 0.3243593902043464, "grad_norm": 2.3508291244506836, "learning_rate": 9.190252209877545e-06, "loss": 1.335, "step": 2000 }, { "epoch": 0.3324683749594551, "grad_norm": 3.0449280738830566, "learning_rate": 9.169978103965615e-06, "loss": 1.3188, "step": 2050 }, { "epoch": 0.3405773597145637, "grad_norm": 2.0919177532196045, "learning_rate": 9.149703998053687e-06, "loss": 1.3369, "step": 2100 }, { "epoch": 0.3486863444696724, "grad_norm": 1.5090256929397583, "learning_rate": 9.129429892141757e-06, "loss": 1.3782, "step": 2150 }, { "epoch": 0.3567953292247811, "grad_norm": 1.5943760871887207, "learning_rate": 9.109155786229828e-06, "loss": 1.2826, "step": 2200 }, { "epoch": 0.3649043139798897, "grad_norm": 1.4445804357528687, "learning_rate": 9.0888816803179e-06, "loss": 1.2706, "step": 2250 }, { "epoch": 0.3730132987349984, "grad_norm": 1.366866111755371, "learning_rate": 9.06860757440597e-06, "loss": 1.3081, "step": 2300 }, { "epoch": 0.381122283490107, "grad_norm": 1.5148597955703735, "learning_rate": 9.04833346849404e-06, "loss": 1.3117, "step": 2350 }, { "epoch": 0.3892312682452157, "grad_norm": 1.9457615613937378, "learning_rate": 9.028059362582111e-06, "loss": 1.3385, "step": 2400 }, { "epoch": 0.3973402530003244, "grad_norm": 1.0214955806732178, "learning_rate": 9.007785256670181e-06, "loss": 1.4173, "step": 2450 }, { "epoch": 0.405449237755433, "grad_norm": 2.3479509353637695, "learning_rate": 8.987511150758253e-06, "loss": 1.338, "step": 2500 }, { "epoch": 0.4135582225105417, "grad_norm": 1.7548096179962158, "learning_rate": 8.967237044846323e-06, "loss": 1.3245, "step": 2550 }, { "epoch": 0.4216672072656503, "grad_norm": 1.3919732570648193, "learning_rate": 8.946962938934394e-06, "loss": 1.2934, "step": 2600 }, { "epoch": 0.429776192020759, "grad_norm": 2.419706344604492, "learning_rate": 8.926688833022464e-06, "loss": 1.2451, "step": 2650 }, { "epoch": 0.43788517677586763, "grad_norm": 0.7026298642158508, "learning_rate": 8.906414727110536e-06, "loss": 1.3075, "step": 2700 }, { "epoch": 0.4459941615309763, "grad_norm": 1.9528323411941528, "learning_rate": 8.886140621198606e-06, "loss": 1.3071, "step": 2750 }, { "epoch": 0.454103146286085, "grad_norm": 1.322129487991333, "learning_rate": 8.865866515286676e-06, "loss": 1.3234, "step": 2800 }, { "epoch": 0.46221213104119363, "grad_norm": 3.2143826484680176, "learning_rate": 8.845592409374747e-06, "loss": 1.3553, "step": 2850 }, { "epoch": 0.4703211157963023, "grad_norm": 2.3696305751800537, "learning_rate": 8.825318303462817e-06, "loss": 1.2905, "step": 2900 }, { "epoch": 0.47843010055141094, "grad_norm": 1.7011082172393799, "learning_rate": 8.805044197550889e-06, "loss": 1.2826, "step": 2950 }, { "epoch": 0.4865390853065196, "grad_norm": 1.8602455854415894, "learning_rate": 8.784770091638959e-06, "loss": 1.2119, "step": 3000 }, { "epoch": 0.4946480700616283, "grad_norm": 1.8766978979110718, "learning_rate": 8.76449598572703e-06, "loss": 1.25, "step": 3050 }, { "epoch": 0.5027570548167369, "grad_norm": 1.2416331768035889, "learning_rate": 8.744221879815102e-06, "loss": 1.3494, "step": 3100 }, { "epoch": 0.5108660395718456, "grad_norm": 2.160327434539795, "learning_rate": 8.723947773903172e-06, "loss": 1.2821, "step": 3150 }, { "epoch": 0.5189750243269543, "grad_norm": 1.5392768383026123, "learning_rate": 8.703673667991243e-06, "loss": 1.3716, "step": 3200 }, { "epoch": 0.5270840090820629, "grad_norm": 1.9121636152267456, "learning_rate": 8.683399562079313e-06, "loss": 1.2615, "step": 3250 }, { "epoch": 0.5351929938371716, "grad_norm": 1.8084639310836792, "learning_rate": 8.663125456167383e-06, "loss": 1.3391, "step": 3300 }, { "epoch": 0.5433019785922802, "grad_norm": 2.516788959503174, "learning_rate": 8.642851350255455e-06, "loss": 1.2747, "step": 3350 }, { "epoch": 0.5514109633473889, "grad_norm": 5.785072326660156, "learning_rate": 8.622577244343525e-06, "loss": 1.3414, "step": 3400 }, { "epoch": 0.5595199481024976, "grad_norm": 3.8145599365234375, "learning_rate": 8.602303138431596e-06, "loss": 1.2274, "step": 3450 }, { "epoch": 0.5676289328576062, "grad_norm": 0.36742010712623596, "learning_rate": 8.582029032519666e-06, "loss": 1.2184, "step": 3500 }, { "epoch": 0.5757379176127149, "grad_norm": 2.2382278442382812, "learning_rate": 8.561754926607738e-06, "loss": 1.2359, "step": 3550 }, { "epoch": 0.5838469023678236, "grad_norm": 3.001042127609253, "learning_rate": 8.541480820695808e-06, "loss": 1.2839, "step": 3600 }, { "epoch": 0.5919558871229322, "grad_norm": 1.2254408597946167, "learning_rate": 8.521206714783878e-06, "loss": 1.3025, "step": 3650 }, { "epoch": 0.6000648718780409, "grad_norm": 2.408430576324463, "learning_rate": 8.50093260887195e-06, "loss": 1.2959, "step": 3700 }, { "epoch": 0.6081738566331495, "grad_norm": 1.125710129737854, "learning_rate": 8.480658502960019e-06, "loss": 1.3165, "step": 3750 }, { "epoch": 0.6162828413882582, "grad_norm": 1.7280784845352173, "learning_rate": 8.46038439704809e-06, "loss": 1.3033, "step": 3800 }, { "epoch": 0.6243918261433669, "grad_norm": 1.7289584875106812, "learning_rate": 8.440110291136162e-06, "loss": 1.3818, "step": 3850 }, { "epoch": 0.6325008108984755, "grad_norm": 1.4574753046035767, "learning_rate": 8.419836185224232e-06, "loss": 1.2858, "step": 3900 }, { "epoch": 0.6406097956535842, "grad_norm": 1.9952958822250366, "learning_rate": 8.399562079312304e-06, "loss": 1.323, "step": 3950 }, { "epoch": 0.6487187804086928, "grad_norm": 2.3702497482299805, "learning_rate": 8.379287973400374e-06, "loss": 1.3362, "step": 4000 }, { "epoch": 0.6568277651638015, "grad_norm": 2.1261913776397705, "learning_rate": 8.359013867488445e-06, "loss": 1.2978, "step": 4050 }, { "epoch": 0.6649367499189102, "grad_norm": 0.6903754472732544, "learning_rate": 8.338739761576515e-06, "loss": 1.3273, "step": 4100 }, { "epoch": 0.6730457346740188, "grad_norm": 1.848915934562683, "learning_rate": 8.318465655664587e-06, "loss": 1.294, "step": 4150 }, { "epoch": 0.6811547194291274, "grad_norm": 2.0583913326263428, "learning_rate": 8.298191549752657e-06, "loss": 1.3353, "step": 4200 }, { "epoch": 0.6892637041842361, "grad_norm": 1.5055549144744873, "learning_rate": 8.277917443840727e-06, "loss": 1.3305, "step": 4250 }, { "epoch": 0.6973726889393448, "grad_norm": 1.6197587251663208, "learning_rate": 8.257643337928798e-06, "loss": 1.3765, "step": 4300 }, { "epoch": 0.7054816736944535, "grad_norm": 1.927742838859558, "learning_rate": 8.237369232016868e-06, "loss": 1.3372, "step": 4350 }, { "epoch": 0.7135906584495622, "grad_norm": 2.3434853553771973, "learning_rate": 8.21709512610494e-06, "loss": 1.3041, "step": 4400 }, { "epoch": 0.7216996432046707, "grad_norm": 2.09249210357666, "learning_rate": 8.19682102019301e-06, "loss": 1.2946, "step": 4450 }, { "epoch": 0.7298086279597794, "grad_norm": 0.19756704568862915, "learning_rate": 8.176546914281081e-06, "loss": 1.3014, "step": 4500 }, { "epoch": 0.7379176127148881, "grad_norm": 1.2464430332183838, "learning_rate": 8.156272808369151e-06, "loss": 1.2552, "step": 4550 }, { "epoch": 0.7460265974699968, "grad_norm": 1.7760344743728638, "learning_rate": 8.135998702457221e-06, "loss": 1.2963, "step": 4600 }, { "epoch": 0.7541355822251055, "grad_norm": 1.922969937324524, "learning_rate": 8.115724596545294e-06, "loss": 1.2849, "step": 4650 }, { "epoch": 0.762244566980214, "grad_norm": 3.757589340209961, "learning_rate": 8.095450490633364e-06, "loss": 1.2511, "step": 4700 }, { "epoch": 0.7703535517353227, "grad_norm": 2.4983339309692383, "learning_rate": 8.075176384721434e-06, "loss": 1.2169, "step": 4750 }, { "epoch": 0.7784625364904314, "grad_norm": 2.1676225662231445, "learning_rate": 8.054902278809506e-06, "loss": 1.2711, "step": 4800 }, { "epoch": 0.7865715212455401, "grad_norm": 2.8060524463653564, "learning_rate": 8.034628172897576e-06, "loss": 1.3246, "step": 4850 }, { "epoch": 0.7946805060006488, "grad_norm": 1.917485237121582, "learning_rate": 8.014354066985647e-06, "loss": 1.2669, "step": 4900 }, { "epoch": 0.8027894907557573, "grad_norm": 2.1371405124664307, "learning_rate": 7.994079961073717e-06, "loss": 1.3554, "step": 4950 }, { "epoch": 0.810898475510866, "grad_norm": 0.9637095928192139, "learning_rate": 7.973805855161789e-06, "loss": 1.2862, "step": 5000 }, { "epoch": 0.8190074602659747, "grad_norm": 1.3636858463287354, "learning_rate": 7.953531749249859e-06, "loss": 1.3134, "step": 5050 }, { "epoch": 0.8271164450210834, "grad_norm": 0.9230815768241882, "learning_rate": 7.933257643337929e-06, "loss": 1.2352, "step": 5100 }, { "epoch": 0.8352254297761921, "grad_norm": 1.7373725175857544, "learning_rate": 7.912983537426e-06, "loss": 1.318, "step": 5150 }, { "epoch": 0.8433344145313006, "grad_norm": 2.0280568599700928, "learning_rate": 7.89270943151407e-06, "loss": 1.2464, "step": 5200 }, { "epoch": 0.8514433992864093, "grad_norm": 1.1584951877593994, "learning_rate": 7.872435325602142e-06, "loss": 1.2292, "step": 5250 }, { "epoch": 0.859552384041518, "grad_norm": 1.7846872806549072, "learning_rate": 7.852161219690212e-06, "loss": 1.2976, "step": 5300 }, { "epoch": 0.8676613687966267, "grad_norm": 1.7553609609603882, "learning_rate": 7.831887113778283e-06, "loss": 1.2684, "step": 5350 }, { "epoch": 0.8757703535517353, "grad_norm": 2.1919806003570557, "learning_rate": 7.811613007866353e-06, "loss": 1.2956, "step": 5400 }, { "epoch": 0.883879338306844, "grad_norm": 0.717258870601654, "learning_rate": 7.791338901954425e-06, "loss": 1.2186, "step": 5450 }, { "epoch": 0.8919883230619526, "grad_norm": 1.4364169836044312, "learning_rate": 7.771064796042496e-06, "loss": 1.2596, "step": 5500 }, { "epoch": 0.9000973078170613, "grad_norm": 1.99948251247406, "learning_rate": 7.750790690130566e-06, "loss": 1.3616, "step": 5550 }, { "epoch": 0.90820629257217, "grad_norm": 1.5464040040969849, "learning_rate": 7.730516584218636e-06, "loss": 1.2595, "step": 5600 }, { "epoch": 0.9163152773272786, "grad_norm": 1.500651240348816, "learning_rate": 7.710242478306708e-06, "loss": 1.3223, "step": 5650 }, { "epoch": 0.9244242620823873, "grad_norm": 1.6872031688690186, "learning_rate": 7.689968372394778e-06, "loss": 1.2099, "step": 5700 }, { "epoch": 0.9325332468374959, "grad_norm": 1.5728824138641357, "learning_rate": 7.66969426648285e-06, "loss": 1.2975, "step": 5750 }, { "epoch": 0.9406422315926046, "grad_norm": 2.0520365238189697, "learning_rate": 7.649420160570919e-06, "loss": 1.2895, "step": 5800 }, { "epoch": 0.9487512163477133, "grad_norm": 1.6994986534118652, "learning_rate": 7.62914605465899e-06, "loss": 1.2421, "step": 5850 }, { "epoch": 0.9568602011028219, "grad_norm": 2.1418871879577637, "learning_rate": 7.608871948747061e-06, "loss": 1.2464, "step": 5900 }, { "epoch": 0.9649691858579306, "grad_norm": 3.202744960784912, "learning_rate": 7.588597842835131e-06, "loss": 1.2565, "step": 5950 }, { "epoch": 0.9730781706130393, "grad_norm": 2.3648183345794678, "learning_rate": 7.568323736923202e-06, "loss": 1.262, "step": 6000 }, { "epoch": 0.9811871553681479, "grad_norm": 1.500482439994812, "learning_rate": 7.548049631011273e-06, "loss": 1.2281, "step": 6050 }, { "epoch": 0.9892961401232566, "grad_norm": 2.1810970306396484, "learning_rate": 7.527775525099344e-06, "loss": 1.2318, "step": 6100 }, { "epoch": 0.9974051248783652, "grad_norm": 1.3932199478149414, "learning_rate": 7.507501419187414e-06, "loss": 1.2344, "step": 6150 }, { "epoch": 1.0055141096334739, "grad_norm": 2.48954701423645, "learning_rate": 7.487227313275484e-06, "loss": 1.3035, "step": 6200 }, { "epoch": 1.0136230943885824, "grad_norm": 1.4061833620071411, "learning_rate": 7.466953207363555e-06, "loss": 1.308, "step": 6250 }, { "epoch": 1.0217320791436912, "grad_norm": 1.1377198696136475, "learning_rate": 7.4466791014516275e-06, "loss": 1.2877, "step": 6300 }, { "epoch": 1.0298410638987998, "grad_norm": 1.90823233127594, "learning_rate": 7.426404995539697e-06, "loss": 1.2371, "step": 6350 }, { "epoch": 1.0379500486539086, "grad_norm": 1.5908371210098267, "learning_rate": 7.406130889627768e-06, "loss": 1.2861, "step": 6400 }, { "epoch": 1.0460590334090172, "grad_norm": 2.892045259475708, "learning_rate": 7.385856783715839e-06, "loss": 1.2856, "step": 6450 }, { "epoch": 1.0541680181641258, "grad_norm": 1.4647297859191895, "learning_rate": 7.36558267780391e-06, "loss": 1.2431, "step": 6500 }, { "epoch": 1.0622770029192345, "grad_norm": 1.2635786533355713, "learning_rate": 7.34530857189198e-06, "loss": 1.3269, "step": 6550 }, { "epoch": 1.0703859876743431, "grad_norm": 1.0140317678451538, "learning_rate": 7.325034465980051e-06, "loss": 1.2724, "step": 6600 }, { "epoch": 1.078494972429452, "grad_norm": 1.1980386972427368, "learning_rate": 7.304760360068122e-06, "loss": 1.2909, "step": 6650 }, { "epoch": 1.0866039571845605, "grad_norm": 2.5311107635498047, "learning_rate": 7.284486254156193e-06, "loss": 1.2296, "step": 6700 }, { "epoch": 1.094712941939669, "grad_norm": 1.5901424884796143, "learning_rate": 7.2642121482442626e-06, "loss": 1.2136, "step": 6750 }, { "epoch": 1.1028219266947779, "grad_norm": 1.0336142778396606, "learning_rate": 7.243938042332333e-06, "loss": 1.2908, "step": 6800 }, { "epoch": 1.1109309114498864, "grad_norm": 2.5491976737976074, "learning_rate": 7.223663936420404e-06, "loss": 1.2256, "step": 6850 }, { "epoch": 1.1190398962049952, "grad_norm": 2.592517137527466, "learning_rate": 7.203389830508475e-06, "loss": 1.2808, "step": 6900 }, { "epoch": 1.1271488809601038, "grad_norm": 1.5294160842895508, "learning_rate": 7.1831157245965456e-06, "loss": 1.2279, "step": 6950 }, { "epoch": 1.1352578657152124, "grad_norm": 2.665705680847168, "learning_rate": 7.162841618684616e-06, "loss": 1.2813, "step": 7000 }, { "epoch": 1.1433668504703212, "grad_norm": 5.743466854095459, "learning_rate": 7.142567512772687e-06, "loss": 1.2601, "step": 7050 }, { "epoch": 1.1514758352254297, "grad_norm": 1.979543924331665, "learning_rate": 7.122293406860759e-06, "loss": 1.2568, "step": 7100 }, { "epoch": 1.1595848199805385, "grad_norm": 0.2160961776971817, "learning_rate": 7.1020193009488294e-06, "loss": 1.2234, "step": 7150 }, { "epoch": 1.167693804735647, "grad_norm": 2.09420108795166, "learning_rate": 7.0817451950369e-06, "loss": 1.2727, "step": 7200 }, { "epoch": 1.1758027894907557, "grad_norm": 1.4437772035598755, "learning_rate": 7.06147108912497e-06, "loss": 1.2535, "step": 7250 }, { "epoch": 1.1839117742458645, "grad_norm": 2.6284403800964355, "learning_rate": 7.041196983213041e-06, "loss": 1.1969, "step": 7300 }, { "epoch": 1.192020759000973, "grad_norm": 1.897250771522522, "learning_rate": 7.020922877301112e-06, "loss": 1.2579, "step": 7350 }, { "epoch": 1.2001297437560818, "grad_norm": 1.591044545173645, "learning_rate": 7.000648771389182e-06, "loss": 1.2553, "step": 7400 }, { "epoch": 1.2082387285111904, "grad_norm": 2.323927402496338, "learning_rate": 6.980374665477253e-06, "loss": 1.2802, "step": 7450 }, { "epoch": 1.216347713266299, "grad_norm": 2.145848035812378, "learning_rate": 6.960100559565324e-06, "loss": 1.3271, "step": 7500 }, { "epoch": 1.2244566980214078, "grad_norm": 1.1199519634246826, "learning_rate": 6.939826453653395e-06, "loss": 1.2948, "step": 7550 }, { "epoch": 1.2325656827765163, "grad_norm": 1.5974798202514648, "learning_rate": 6.919552347741465e-06, "loss": 1.2925, "step": 7600 }, { "epoch": 1.240674667531625, "grad_norm": 1.1883361339569092, "learning_rate": 6.899278241829535e-06, "loss": 1.2288, "step": 7650 }, { "epoch": 1.2487836522867337, "grad_norm": 2.2361881732940674, "learning_rate": 6.879004135917606e-06, "loss": 1.2732, "step": 7700 }, { "epoch": 1.2568926370418423, "grad_norm": 0.9524820446968079, "learning_rate": 6.858730030005677e-06, "loss": 1.3427, "step": 7750 }, { "epoch": 1.265001621796951, "grad_norm": 2.439042091369629, "learning_rate": 6.8384559240937475e-06, "loss": 1.2915, "step": 7800 }, { "epoch": 1.2731106065520597, "grad_norm": 2.7487218379974365, "learning_rate": 6.818181818181818e-06, "loss": 1.3025, "step": 7850 }, { "epoch": 1.2812195913071682, "grad_norm": 1.3138405084609985, "learning_rate": 6.79790771226989e-06, "loss": 1.2678, "step": 7900 }, { "epoch": 1.289328576062277, "grad_norm": 2.2130439281463623, "learning_rate": 6.777633606357961e-06, "loss": 1.2955, "step": 7950 }, { "epoch": 1.2974375608173856, "grad_norm": 2.6008100509643555, "learning_rate": 6.757359500446031e-06, "loss": 1.2413, "step": 8000 }, { "epoch": 1.3055465455724944, "grad_norm": 2.495473861694336, "learning_rate": 6.737085394534102e-06, "loss": 1.3061, "step": 8050 }, { "epoch": 1.313655530327603, "grad_norm": 0.8150402307510376, "learning_rate": 6.716811288622173e-06, "loss": 1.3055, "step": 8100 }, { "epoch": 1.3217645150827115, "grad_norm": 1.7135632038116455, "learning_rate": 6.696537182710244e-06, "loss": 1.2723, "step": 8150 }, { "epoch": 1.3298734998378203, "grad_norm": 2.367229700088501, "learning_rate": 6.6762630767983135e-06, "loss": 1.2817, "step": 8200 }, { "epoch": 1.337982484592929, "grad_norm": 1.9759521484375, "learning_rate": 6.655988970886384e-06, "loss": 1.2453, "step": 8250 }, { "epoch": 1.3460914693480377, "grad_norm": 1.2894783020019531, "learning_rate": 6.635714864974455e-06, "loss": 1.338, "step": 8300 }, { "epoch": 1.3542004541031463, "grad_norm": 0.24668912589550018, "learning_rate": 6.615440759062526e-06, "loss": 1.255, "step": 8350 }, { "epoch": 1.3623094388582548, "grad_norm": 2.6564314365386963, "learning_rate": 6.5951666531505966e-06, "loss": 1.1793, "step": 8400 }, { "epoch": 1.3704184236133636, "grad_norm": 1.1451148986816406, "learning_rate": 6.574892547238667e-06, "loss": 1.2857, "step": 8450 }, { "epoch": 1.3785274083684722, "grad_norm": 2.693976879119873, "learning_rate": 6.554618441326738e-06, "loss": 1.2497, "step": 8500 }, { "epoch": 1.386636393123581, "grad_norm": 2.3329060077667236, "learning_rate": 6.534344335414808e-06, "loss": 1.2665, "step": 8550 }, { "epoch": 1.3947453778786896, "grad_norm": 2.251298427581787, "learning_rate": 6.514070229502879e-06, "loss": 1.33, "step": 8600 }, { "epoch": 1.4028543626337981, "grad_norm": 3.0650289058685303, "learning_rate": 6.4937961235909495e-06, "loss": 1.2996, "step": 8650 }, { "epoch": 1.410963347388907, "grad_norm": 2.1476645469665527, "learning_rate": 6.47352201767902e-06, "loss": 1.3232, "step": 8700 }, { "epoch": 1.4190723321440155, "grad_norm": 1.236338496208191, "learning_rate": 6.453247911767092e-06, "loss": 1.2547, "step": 8750 }, { "epoch": 1.4271813168991243, "grad_norm": 2.276726722717285, "learning_rate": 6.432973805855163e-06, "loss": 1.2067, "step": 8800 }, { "epoch": 1.4352903016542329, "grad_norm": 1.871100664138794, "learning_rate": 6.412699699943233e-06, "loss": 1.238, "step": 8850 }, { "epoch": 1.4433992864093415, "grad_norm": 1.9131008386611938, "learning_rate": 6.392425594031304e-06, "loss": 1.2436, "step": 8900 }, { "epoch": 1.4515082711644502, "grad_norm": 1.6729109287261963, "learning_rate": 6.372151488119375e-06, "loss": 1.2732, "step": 8950 }, { "epoch": 1.4596172559195588, "grad_norm": 1.4220985174179077, "learning_rate": 6.351877382207446e-06, "loss": 1.3043, "step": 9000 }, { "epoch": 1.4677262406746676, "grad_norm": 1.5014866590499878, "learning_rate": 6.331603276295516e-06, "loss": 1.2501, "step": 9050 }, { "epoch": 1.4758352254297762, "grad_norm": 1.6406453847885132, "learning_rate": 6.311329170383586e-06, "loss": 1.2457, "step": 9100 }, { "epoch": 1.4839442101848848, "grad_norm": 2.01636004447937, "learning_rate": 6.291055064471657e-06, "loss": 1.1894, "step": 9150 }, { "epoch": 1.4920531949399936, "grad_norm": 1.5200086832046509, "learning_rate": 6.270780958559728e-06, "loss": 1.1833, "step": 9200 }, { "epoch": 1.5001621796951021, "grad_norm": 1.9229196310043335, "learning_rate": 6.2505068526477985e-06, "loss": 1.2219, "step": 9250 }, { "epoch": 1.508271164450211, "grad_norm": 2.19616961479187, "learning_rate": 6.230232746735869e-06, "loss": 1.2788, "step": 9300 }, { "epoch": 1.5163801492053195, "grad_norm": 1.4525929689407349, "learning_rate": 6.20995864082394e-06, "loss": 1.2963, "step": 9350 }, { "epoch": 1.524489133960428, "grad_norm": 2.227957248687744, "learning_rate": 6.189684534912011e-06, "loss": 1.324, "step": 9400 }, { "epoch": 1.5325981187155369, "grad_norm": 1.158480167388916, "learning_rate": 6.1694104290000815e-06, "loss": 1.2185, "step": 9450 }, { "epoch": 1.5407071034706454, "grad_norm": 1.3199654817581177, "learning_rate": 6.1491363230881514e-06, "loss": 1.2714, "step": 9500 }, { "epoch": 1.5488160882257542, "grad_norm": 2.110074043273926, "learning_rate": 6.128862217176224e-06, "loss": 1.256, "step": 9550 }, { "epoch": 1.5569250729808628, "grad_norm": 1.322704553604126, "learning_rate": 6.108588111264294e-06, "loss": 1.1785, "step": 9600 }, { "epoch": 1.5650340577359714, "grad_norm": 2.572434425354004, "learning_rate": 6.0883140053523645e-06, "loss": 1.2349, "step": 9650 }, { "epoch": 1.5731430424910802, "grad_norm": 2.033780574798584, "learning_rate": 6.068039899440435e-06, "loss": 1.2203, "step": 9700 }, { "epoch": 1.5812520272461887, "grad_norm": 1.9049453735351562, "learning_rate": 6.047765793528506e-06, "loss": 1.2043, "step": 9750 }, { "epoch": 1.5893610120012975, "grad_norm": 2.882568597793579, "learning_rate": 6.027491687616577e-06, "loss": 1.217, "step": 9800 }, { "epoch": 1.597469996756406, "grad_norm": 2.430227279663086, "learning_rate": 6.0072175817046475e-06, "loss": 1.2795, "step": 9850 }, { "epoch": 1.6055789815115147, "grad_norm": 1.9852076768875122, "learning_rate": 5.986943475792718e-06, "loss": 1.2434, "step": 9900 }, { "epoch": 1.6136879662666233, "grad_norm": 2.1857733726501465, "learning_rate": 5.966669369880789e-06, "loss": 1.2756, "step": 9950 }, { "epoch": 1.621796951021732, "grad_norm": 2.7323496341705322, "learning_rate": 5.946395263968859e-06, "loss": 1.2921, "step": 10000 }, { "epoch": 1.6299059357768408, "grad_norm": 2.2632415294647217, "learning_rate": 5.92612115805693e-06, "loss": 1.2763, "step": 10050 }, { "epoch": 1.6380149205319494, "grad_norm": 2.0098931789398193, "learning_rate": 5.9058470521450005e-06, "loss": 1.292, "step": 10100 }, { "epoch": 1.646123905287058, "grad_norm": 2.09377384185791, "learning_rate": 5.885572946233071e-06, "loss": 1.2738, "step": 10150 }, { "epoch": 1.6542328900421666, "grad_norm": 2.292084217071533, "learning_rate": 5.865298840321142e-06, "loss": 1.2726, "step": 10200 }, { "epoch": 1.6623418747972754, "grad_norm": 2.7795863151550293, "learning_rate": 5.845024734409213e-06, "loss": 1.2523, "step": 10250 }, { "epoch": 1.6704508595523841, "grad_norm": 3.0921523571014404, "learning_rate": 5.8247506284972835e-06, "loss": 1.2684, "step": 10300 }, { "epoch": 1.6785598443074927, "grad_norm": 1.036535382270813, "learning_rate": 5.804476522585355e-06, "loss": 1.2404, "step": 10350 }, { "epoch": 1.6866688290626013, "grad_norm": 1.3650333881378174, "learning_rate": 5.784202416673426e-06, "loss": 1.247, "step": 10400 }, { "epoch": 1.6947778138177099, "grad_norm": 2.558058977127075, "learning_rate": 5.763928310761497e-06, "loss": 1.2938, "step": 10450 }, { "epoch": 1.7028867985728187, "grad_norm": 2.6575851440429688, "learning_rate": 5.743654204849567e-06, "loss": 1.2974, "step": 10500 }, { "epoch": 1.7109957833279275, "grad_norm": 1.657400369644165, "learning_rate": 5.723380098937637e-06, "loss": 1.3268, "step": 10550 }, { "epoch": 1.719104768083036, "grad_norm": 2.027801275253296, "learning_rate": 5.703105993025708e-06, "loss": 1.3029, "step": 10600 }, { "epoch": 1.7272137528381446, "grad_norm": 1.3793126344680786, "learning_rate": 5.682831887113779e-06, "loss": 1.1812, "step": 10650 }, { "epoch": 1.7353227375932532, "grad_norm": 1.5861361026763916, "learning_rate": 5.6625577812018495e-06, "loss": 1.2871, "step": 10700 }, { "epoch": 1.743431722348362, "grad_norm": 2.3719797134399414, "learning_rate": 5.64228367528992e-06, "loss": 1.1947, "step": 10750 }, { "epoch": 1.7515407071034708, "grad_norm": 2.675689697265625, "learning_rate": 5.622009569377991e-06, "loss": 1.2902, "step": 10800 }, { "epoch": 1.7596496918585793, "grad_norm": 2.018069267272949, "learning_rate": 5.601735463466062e-06, "loss": 1.1827, "step": 10850 }, { "epoch": 1.767758676613688, "grad_norm": 1.1704685688018799, "learning_rate": 5.5814613575541325e-06, "loss": 1.1498, "step": 10900 }, { "epoch": 1.7758676613687965, "grad_norm": 2.0111939907073975, "learning_rate": 5.561187251642202e-06, "loss": 1.2358, "step": 10950 }, { "epoch": 1.7839766461239053, "grad_norm": 3.043297529220581, "learning_rate": 5.540913145730273e-06, "loss": 1.26, "step": 11000 }, { "epoch": 1.792085630879014, "grad_norm": 3.0858490467071533, "learning_rate": 5.520639039818344e-06, "loss": 1.2519, "step": 11050 }, { "epoch": 1.8001946156341226, "grad_norm": 2.669131278991699, "learning_rate": 5.500364933906415e-06, "loss": 1.3174, "step": 11100 }, { "epoch": 1.8083036003892312, "grad_norm": 1.4813052415847778, "learning_rate": 5.480090827994486e-06, "loss": 1.2528, "step": 11150 }, { "epoch": 1.8164125851443398, "grad_norm": 2.2234039306640625, "learning_rate": 5.459816722082557e-06, "loss": 1.2984, "step": 11200 }, { "epoch": 1.8245215698994486, "grad_norm": 3.4711413383483887, "learning_rate": 5.439542616170628e-06, "loss": 1.2174, "step": 11250 }, { "epoch": 1.8326305546545574, "grad_norm": 1.9171231985092163, "learning_rate": 5.4192685102586985e-06, "loss": 1.2657, "step": 11300 }, { "epoch": 1.840739539409666, "grad_norm": 1.7139352560043335, "learning_rate": 5.398994404346769e-06, "loss": 1.274, "step": 11350 }, { "epoch": 1.8488485241647745, "grad_norm": 2.9915032386779785, "learning_rate": 5.37872029843484e-06, "loss": 1.1745, "step": 11400 }, { "epoch": 1.856957508919883, "grad_norm": 3.0832788944244385, "learning_rate": 5.35844619252291e-06, "loss": 1.3089, "step": 11450 }, { "epoch": 1.8650664936749919, "grad_norm": 2.364255905151367, "learning_rate": 5.338172086610981e-06, "loss": 1.2146, "step": 11500 }, { "epoch": 1.8731754784301007, "grad_norm": 3.9903273582458496, "learning_rate": 5.3178979806990514e-06, "loss": 1.3103, "step": 11550 }, { "epoch": 1.8812844631852093, "grad_norm": 3.4954657554626465, "learning_rate": 5.297623874787122e-06, "loss": 1.1682, "step": 11600 }, { "epoch": 1.8893934479403178, "grad_norm": 1.927339792251587, "learning_rate": 5.277349768875193e-06, "loss": 1.224, "step": 11650 }, { "epoch": 1.8975024326954264, "grad_norm": 1.446385145187378, "learning_rate": 5.257075662963264e-06, "loss": 1.3907, "step": 11700 }, { "epoch": 1.9056114174505352, "grad_norm": 2.112168312072754, "learning_rate": 5.2368015570513345e-06, "loss": 1.2041, "step": 11750 }, { "epoch": 1.913720402205644, "grad_norm": 2.601565361022949, "learning_rate": 5.216527451139405e-06, "loss": 1.2827, "step": 11800 }, { "epoch": 1.9218293869607526, "grad_norm": 1.9448506832122803, "learning_rate": 5.196253345227475e-06, "loss": 1.1899, "step": 11850 }, { "epoch": 1.9299383717158611, "grad_norm": 1.7583825588226318, "learning_rate": 5.175979239315546e-06, "loss": 1.2681, "step": 11900 }, { "epoch": 1.9380473564709697, "grad_norm": 1.319887399673462, "learning_rate": 5.155705133403617e-06, "loss": 1.2311, "step": 11950 }, { "epoch": 1.9461563412260785, "grad_norm": 1.5955997705459595, "learning_rate": 5.135431027491688e-06, "loss": 1.2043, "step": 12000 }, { "epoch": 1.9542653259811873, "grad_norm": 2.112924337387085, "learning_rate": 5.115156921579759e-06, "loss": 1.21, "step": 12050 }, { "epoch": 1.9623743107362959, "grad_norm": 2.8387506008148193, "learning_rate": 5.09488281566783e-06, "loss": 1.253, "step": 12100 }, { "epoch": 1.9704832954914044, "grad_norm": 2.55635142326355, "learning_rate": 5.0746087097559005e-06, "loss": 1.2277, "step": 12150 }, { "epoch": 1.978592280246513, "grad_norm": 2.2216525077819824, "learning_rate": 5.054334603843971e-06, "loss": 1.1755, "step": 12200 }, { "epoch": 1.9867012650016218, "grad_norm": 1.1123380661010742, "learning_rate": 5.034060497932042e-06, "loss": 1.229, "step": 12250 }, { "epoch": 1.9948102497567306, "grad_norm": 2.5723652839660645, "learning_rate": 5.013786392020113e-06, "loss": 1.2028, "step": 12300 }, { "epoch": 2.002919234511839, "grad_norm": 2.5151309967041016, "learning_rate": 4.993512286108183e-06, "loss": 1.2314, "step": 12350 }, { "epoch": 2.0110282192669477, "grad_norm": 1.5693440437316895, "learning_rate": 4.973238180196253e-06, "loss": 1.2771, "step": 12400 }, { "epoch": 2.0191372040220563, "grad_norm": 4.562756538391113, "learning_rate": 4.952964074284324e-06, "loss": 1.2614, "step": 12450 }, { "epoch": 2.027246188777165, "grad_norm": 0.9000415802001953, "learning_rate": 4.932689968372395e-06, "loss": 1.2383, "step": 12500 }, { "epoch": 2.035355173532274, "grad_norm": 1.2384685277938843, "learning_rate": 4.912415862460466e-06, "loss": 1.2501, "step": 12550 }, { "epoch": 2.0434641582873825, "grad_norm": 1.771003246307373, "learning_rate": 4.892141756548537e-06, "loss": 1.2603, "step": 12600 }, { "epoch": 2.051573143042491, "grad_norm": 2.1732709407806396, "learning_rate": 4.871867650636607e-06, "loss": 1.1763, "step": 12650 }, { "epoch": 2.0596821277975996, "grad_norm": 1.5497926473617554, "learning_rate": 4.851593544724678e-06, "loss": 1.2558, "step": 12700 }, { "epoch": 2.067791112552708, "grad_norm": 1.6834354400634766, "learning_rate": 4.831319438812749e-06, "loss": 1.2592, "step": 12750 }, { "epoch": 2.075900097307817, "grad_norm": 1.920637845993042, "learning_rate": 4.811045332900819e-06, "loss": 1.279, "step": 12800 }, { "epoch": 2.084009082062926, "grad_norm": 1.851440191268921, "learning_rate": 4.79077122698889e-06, "loss": 1.2904, "step": 12850 }, { "epoch": 2.0921180668180344, "grad_norm": 3.262347936630249, "learning_rate": 4.770497121076961e-06, "loss": 1.1763, "step": 12900 }, { "epoch": 2.100227051573143, "grad_norm": 1.7690410614013672, "learning_rate": 4.750223015165032e-06, "loss": 1.246, "step": 12950 }, { "epoch": 2.1083360363282515, "grad_norm": 2.3499839305877686, "learning_rate": 4.7299489092531024e-06, "loss": 1.1969, "step": 13000 }, { "epoch": 2.1164450210833605, "grad_norm": 2.815704822540283, "learning_rate": 4.709674803341173e-06, "loss": 1.2299, "step": 13050 }, { "epoch": 2.124554005838469, "grad_norm": 1.6346577405929565, "learning_rate": 4.689400697429244e-06, "loss": 1.1941, "step": 13100 }, { "epoch": 2.1326629905935777, "grad_norm": 1.3435848951339722, "learning_rate": 4.669126591517315e-06, "loss": 1.2193, "step": 13150 }, { "epoch": 2.1407719753486862, "grad_norm": 2.11517596244812, "learning_rate": 4.6488524856053854e-06, "loss": 1.2868, "step": 13200 }, { "epoch": 2.148880960103795, "grad_norm": 1.5125885009765625, "learning_rate": 4.628578379693456e-06, "loss": 1.2034, "step": 13250 }, { "epoch": 2.156989944858904, "grad_norm": 2.0965402126312256, "learning_rate": 4.608304273781526e-06, "loss": 1.2484, "step": 13300 }, { "epoch": 2.1650989296140124, "grad_norm": 1.9137533903121948, "learning_rate": 4.588030167869597e-06, "loss": 1.2531, "step": 13350 }, { "epoch": 2.173207914369121, "grad_norm": 2.630784749984741, "learning_rate": 4.5677560619576685e-06, "loss": 1.2074, "step": 13400 }, { "epoch": 2.1813168991242295, "grad_norm": 1.4360569715499878, "learning_rate": 4.547481956045739e-06, "loss": 1.22, "step": 13450 }, { "epoch": 2.189425883879338, "grad_norm": 2.066938638687134, "learning_rate": 4.52720785013381e-06, "loss": 1.1438, "step": 13500 }, { "epoch": 2.197534868634447, "grad_norm": 1.7182413339614868, "learning_rate": 4.50693374422188e-06, "loss": 1.2681, "step": 13550 }, { "epoch": 2.2056438533895557, "grad_norm": 1.706874132156372, "learning_rate": 4.486659638309951e-06, "loss": 1.1888, "step": 13600 }, { "epoch": 2.2137528381446643, "grad_norm": 1.8634135723114014, "learning_rate": 4.466385532398021e-06, "loss": 1.2953, "step": 13650 }, { "epoch": 2.221861822899773, "grad_norm": 1.77712881565094, "learning_rate": 4.446111426486092e-06, "loss": 1.2461, "step": 13700 }, { "epoch": 2.2299708076548814, "grad_norm": 1.9737837314605713, "learning_rate": 4.425837320574163e-06, "loss": 1.2856, "step": 13750 }, { "epoch": 2.2380797924099904, "grad_norm": 2.7296142578125, "learning_rate": 4.405563214662234e-06, "loss": 1.2124, "step": 13800 }, { "epoch": 2.246188777165099, "grad_norm": 3.5112380981445312, "learning_rate": 4.385289108750304e-06, "loss": 1.2143, "step": 13850 }, { "epoch": 2.2542977619202076, "grad_norm": 3.4935994148254395, "learning_rate": 4.365015002838375e-06, "loss": 1.3156, "step": 13900 }, { "epoch": 2.262406746675316, "grad_norm": 2.2354025840759277, "learning_rate": 4.344740896926446e-06, "loss": 1.3025, "step": 13950 }, { "epoch": 2.2705157314304247, "grad_norm": 2.089087724685669, "learning_rate": 4.324466791014517e-06, "loss": 1.2249, "step": 14000 }, { "epoch": 2.2786247161855337, "grad_norm": 2.6738228797912598, "learning_rate": 4.304192685102587e-06, "loss": 1.1766, "step": 14050 }, { "epoch": 2.2867337009406423, "grad_norm": 1.6028488874435425, "learning_rate": 4.283918579190658e-06, "loss": 1.2806, "step": 14100 }, { "epoch": 2.294842685695751, "grad_norm": 2.742100954055786, "learning_rate": 4.263644473278729e-06, "loss": 1.2241, "step": 14150 }, { "epoch": 2.3029516704508595, "grad_norm": 1.2533172369003296, "learning_rate": 4.2433703673668e-06, "loss": 1.2563, "step": 14200 }, { "epoch": 2.311060655205968, "grad_norm": 2.783311128616333, "learning_rate": 4.22309626145487e-06, "loss": 1.2382, "step": 14250 }, { "epoch": 2.319169639961077, "grad_norm": 1.9947431087493896, "learning_rate": 4.202822155542941e-06, "loss": 1.2533, "step": 14300 }, { "epoch": 2.3272786247161856, "grad_norm": 2.8825254440307617, "learning_rate": 4.182548049631012e-06, "loss": 1.2332, "step": 14350 }, { "epoch": 2.335387609471294, "grad_norm": 1.9132847785949707, "learning_rate": 4.162273943719083e-06, "loss": 1.225, "step": 14400 }, { "epoch": 2.3434965942264028, "grad_norm": 2.9740896224975586, "learning_rate": 4.1419998378071526e-06, "loss": 1.302, "step": 14450 }, { "epoch": 2.3516055789815113, "grad_norm": 2.5434772968292236, "learning_rate": 4.121725731895223e-06, "loss": 1.2857, "step": 14500 }, { "epoch": 2.3597145637366204, "grad_norm": 0.8146458864212036, "learning_rate": 4.101451625983294e-06, "loss": 1.1667, "step": 14550 }, { "epoch": 2.367823548491729, "grad_norm": 2.3713395595550537, "learning_rate": 4.081177520071365e-06, "loss": 1.1799, "step": 14600 }, { "epoch": 2.3759325332468375, "grad_norm": 2.5800139904022217, "learning_rate": 4.0609034141594364e-06, "loss": 1.2639, "step": 14650 }, { "epoch": 2.384041518001946, "grad_norm": 8.0242338180542, "learning_rate": 4.040629308247507e-06, "loss": 1.2168, "step": 14700 }, { "epoch": 2.3921505027570547, "grad_norm": 0.9941585063934326, "learning_rate": 4.020355202335577e-06, "loss": 1.2966, "step": 14750 }, { "epoch": 2.4002594875121637, "grad_norm": 0.7390837669372559, "learning_rate": 4.000081096423648e-06, "loss": 1.3237, "step": 14800 }, { "epoch": 2.4083684722672722, "grad_norm": 0.5450477004051208, "learning_rate": 3.979806990511719e-06, "loss": 1.284, "step": 14850 }, { "epoch": 2.416477457022381, "grad_norm": 1.7690049409866333, "learning_rate": 3.959532884599789e-06, "loss": 1.286, "step": 14900 }, { "epoch": 2.4245864417774894, "grad_norm": 2.655095100402832, "learning_rate": 3.93925877868786e-06, "loss": 1.1513, "step": 14950 }, { "epoch": 2.432695426532598, "grad_norm": 1.2846322059631348, "learning_rate": 3.918984672775931e-06, "loss": 1.2024, "step": 15000 }, { "epoch": 2.440804411287707, "grad_norm": 6.425904273986816, "learning_rate": 3.898710566864002e-06, "loss": 1.2462, "step": 15050 }, { "epoch": 2.4489133960428155, "grad_norm": 2.2578928470611572, "learning_rate": 3.878436460952072e-06, "loss": 1.2289, "step": 15100 }, { "epoch": 2.457022380797924, "grad_norm": 3.1006276607513428, "learning_rate": 3.858162355040143e-06, "loss": 1.2839, "step": 15150 }, { "epoch": 2.4651313655530327, "grad_norm": 2.598376512527466, "learning_rate": 3.837888249128214e-06, "loss": 1.2496, "step": 15200 }, { "epoch": 2.4732403503081413, "grad_norm": 0.7084365487098694, "learning_rate": 3.817614143216285e-06, "loss": 1.2325, "step": 15250 }, { "epoch": 2.48134933506325, "grad_norm": 2.280824661254883, "learning_rate": 3.797340037304355e-06, "loss": 1.1541, "step": 15300 }, { "epoch": 2.489458319818359, "grad_norm": 10.274874687194824, "learning_rate": 3.7770659313924257e-06, "loss": 1.1489, "step": 15350 }, { "epoch": 2.4975673045734674, "grad_norm": 2.2401506900787354, "learning_rate": 3.7567918254804964e-06, "loss": 1.1696, "step": 15400 }, { "epoch": 2.505676289328576, "grad_norm": 2.0671870708465576, "learning_rate": 3.7365177195685676e-06, "loss": 1.2512, "step": 15450 }, { "epoch": 2.5137852740836846, "grad_norm": 2.2912776470184326, "learning_rate": 3.7162436136566384e-06, "loss": 1.2278, "step": 15500 }, { "epoch": 2.5218942588387936, "grad_norm": 1.7719197273254395, "learning_rate": 3.6959695077447087e-06, "loss": 1.2984, "step": 15550 }, { "epoch": 2.530003243593902, "grad_norm": 1.9428201913833618, "learning_rate": 3.6756954018327795e-06, "loss": 1.2511, "step": 15600 }, { "epoch": 2.5381122283490107, "grad_norm": 2.804948091506958, "learning_rate": 3.6554212959208502e-06, "loss": 1.2599, "step": 15650 }, { "epoch": 2.5462212131041193, "grad_norm": 2.306248188018799, "learning_rate": 3.635147190008921e-06, "loss": 1.2252, "step": 15700 }, { "epoch": 2.554330197859228, "grad_norm": 2.2706100940704346, "learning_rate": 3.6148730840969913e-06, "loss": 1.1874, "step": 15750 }, { "epoch": 2.5624391826143365, "grad_norm": 1.6053426265716553, "learning_rate": 3.594598978185062e-06, "loss": 1.1885, "step": 15800 }, { "epoch": 2.5705481673694455, "grad_norm": 2.0183823108673096, "learning_rate": 3.5743248722731332e-06, "loss": 1.2176, "step": 15850 }, { "epoch": 2.578657152124554, "grad_norm": 1.6887139081954956, "learning_rate": 3.554050766361204e-06, "loss": 1.1165, "step": 15900 }, { "epoch": 2.5867661368796626, "grad_norm": 1.6139768362045288, "learning_rate": 3.5337766604492747e-06, "loss": 1.2512, "step": 15950 }, { "epoch": 2.594875121634771, "grad_norm": 1.026294231414795, "learning_rate": 3.513502554537345e-06, "loss": 1.2485, "step": 16000 }, { "epoch": 2.60298410638988, "grad_norm": 2.261563777923584, "learning_rate": 3.493228448625416e-06, "loss": 1.2493, "step": 16050 }, { "epoch": 2.6110930911449888, "grad_norm": 2.9573357105255127, "learning_rate": 3.4729543427134866e-06, "loss": 1.2542, "step": 16100 }, { "epoch": 2.6192020759000973, "grad_norm": 2.836587905883789, "learning_rate": 3.4526802368015573e-06, "loss": 1.2571, "step": 16150 }, { "epoch": 2.627311060655206, "grad_norm": 5.188553333282471, "learning_rate": 3.4324061308896276e-06, "loss": 1.3193, "step": 16200 }, { "epoch": 2.6354200454103145, "grad_norm": 1.9180452823638916, "learning_rate": 3.412132024977699e-06, "loss": 1.2382, "step": 16250 }, { "epoch": 2.643529030165423, "grad_norm": 2.1819140911102295, "learning_rate": 3.3918579190657696e-06, "loss": 1.2826, "step": 16300 }, { "epoch": 2.651638014920532, "grad_norm": 2.264775037765503, "learning_rate": 3.3715838131538403e-06, "loss": 1.2507, "step": 16350 }, { "epoch": 2.6597469996756407, "grad_norm": 1.7436145544052124, "learning_rate": 3.351309707241911e-06, "loss": 1.2057, "step": 16400 }, { "epoch": 2.6678559844307492, "grad_norm": 2.5168802738189697, "learning_rate": 3.3310356013299814e-06, "loss": 1.215, "step": 16450 }, { "epoch": 2.675964969185858, "grad_norm": 1.952141284942627, "learning_rate": 3.310761495418052e-06, "loss": 1.255, "step": 16500 }, { "epoch": 2.684073953940967, "grad_norm": 1.5202018022537231, "learning_rate": 3.290487389506123e-06, "loss": 1.224, "step": 16550 }, { "epoch": 2.6921829386960754, "grad_norm": 1.8313968181610107, "learning_rate": 3.2702132835941937e-06, "loss": 1.3158, "step": 16600 }, { "epoch": 2.700291923451184, "grad_norm": 2.5669538974761963, "learning_rate": 3.249939177682265e-06, "loss": 1.2905, "step": 16650 }, { "epoch": 2.7084009082062925, "grad_norm": 2.714341878890991, "learning_rate": 3.2296650717703356e-06, "loss": 1.2508, "step": 16700 }, { "epoch": 2.716509892961401, "grad_norm": 2.8722524642944336, "learning_rate": 3.209390965858406e-06, "loss": 1.2905, "step": 16750 }, { "epoch": 2.7246188777165097, "grad_norm": 2.822148323059082, "learning_rate": 3.1891168599464767e-06, "loss": 1.1907, "step": 16800 }, { "epoch": 2.7327278624716187, "grad_norm": 0.6179723143577576, "learning_rate": 3.1688427540345474e-06, "loss": 1.276, "step": 16850 }, { "epoch": 2.7408368472267273, "grad_norm": 1.800058126449585, "learning_rate": 3.148568648122618e-06, "loss": 1.1746, "step": 16900 }, { "epoch": 2.748945831981836, "grad_norm": 2.2209925651550293, "learning_rate": 3.1282945422106885e-06, "loss": 1.245, "step": 16950 }, { "epoch": 2.7570548167369444, "grad_norm": 2.067692995071411, "learning_rate": 3.1080204362987593e-06, "loss": 1.2182, "step": 17000 }, { "epoch": 2.7651638014920534, "grad_norm": 2.8044259548187256, "learning_rate": 3.0877463303868304e-06, "loss": 1.1653, "step": 17050 }, { "epoch": 2.773272786247162, "grad_norm": 2.077935218811035, "learning_rate": 3.067472224474901e-06, "loss": 1.2223, "step": 17100 }, { "epoch": 2.7813817710022706, "grad_norm": 0.14890266954898834, "learning_rate": 3.047198118562972e-06, "loss": 1.2318, "step": 17150 }, { "epoch": 2.789490755757379, "grad_norm": 1.6987643241882324, "learning_rate": 3.0269240126510423e-06, "loss": 1.2607, "step": 17200 }, { "epoch": 2.7975997405124877, "grad_norm": 2.667273759841919, "learning_rate": 3.006649906739113e-06, "loss": 1.2486, "step": 17250 }, { "epoch": 2.8057087252675963, "grad_norm": 1.8006951808929443, "learning_rate": 2.9863758008271838e-06, "loss": 1.2674, "step": 17300 }, { "epoch": 2.8138177100227053, "grad_norm": 3.4938597679138184, "learning_rate": 2.9661016949152545e-06, "loss": 1.2782, "step": 17350 }, { "epoch": 2.821926694777814, "grad_norm": 3.029115915298462, "learning_rate": 2.945827589003325e-06, "loss": 1.2397, "step": 17400 }, { "epoch": 2.8300356795329225, "grad_norm": 1.7525807619094849, "learning_rate": 2.9255534830913956e-06, "loss": 1.1576, "step": 17450 }, { "epoch": 2.838144664288031, "grad_norm": 2.326188564300537, "learning_rate": 2.905279377179467e-06, "loss": 1.2581, "step": 17500 }, { "epoch": 2.84625364904314, "grad_norm": 2.2817983627319336, "learning_rate": 2.8850052712675375e-06, "loss": 1.1471, "step": 17550 }, { "epoch": 2.8543626337982486, "grad_norm": 1.9472275972366333, "learning_rate": 2.8647311653556083e-06, "loss": 1.2784, "step": 17600 }, { "epoch": 2.862471618553357, "grad_norm": 2.6982924938201904, "learning_rate": 2.8444570594436786e-06, "loss": 1.181, "step": 17650 }, { "epoch": 2.8705806033084658, "grad_norm": 2.204470634460449, "learning_rate": 2.8241829535317494e-06, "loss": 1.3165, "step": 17700 }, { "epoch": 2.8786895880635743, "grad_norm": 1.8621634244918823, "learning_rate": 2.80390884761982e-06, "loss": 1.1949, "step": 17750 }, { "epoch": 2.886798572818683, "grad_norm": 2.0357561111450195, "learning_rate": 2.783634741707891e-06, "loss": 1.2433, "step": 17800 }, { "epoch": 2.894907557573792, "grad_norm": 1.873806118965149, "learning_rate": 2.7633606357959612e-06, "loss": 1.2737, "step": 17850 }, { "epoch": 2.9030165423289005, "grad_norm": 2.299546241760254, "learning_rate": 2.7430865298840324e-06, "loss": 1.2296, "step": 17900 }, { "epoch": 2.911125527084009, "grad_norm": 2.267756938934326, "learning_rate": 2.722812423972103e-06, "loss": 1.2348, "step": 17950 }, { "epoch": 2.9192345118391176, "grad_norm": 3.639319658279419, "learning_rate": 2.702538318060174e-06, "loss": 1.2168, "step": 18000 }, { "epoch": 2.9273434965942267, "grad_norm": 2.825929641723633, "learning_rate": 2.6822642121482447e-06, "loss": 1.2045, "step": 18050 }, { "epoch": 2.9354524813493352, "grad_norm": 1.6339542865753174, "learning_rate": 2.661990106236315e-06, "loss": 1.2303, "step": 18100 }, { "epoch": 2.943561466104444, "grad_norm": 1.8570992946624756, "learning_rate": 2.6417160003243857e-06, "loss": 1.2268, "step": 18150 }, { "epoch": 2.9516704508595524, "grad_norm": 2.2465381622314453, "learning_rate": 2.6214418944124565e-06, "loss": 1.1506, "step": 18200 }, { "epoch": 2.959779435614661, "grad_norm": 2.456606149673462, "learning_rate": 2.6011677885005272e-06, "loss": 1.2371, "step": 18250 }, { "epoch": 2.9678884203697695, "grad_norm": 1.7189408540725708, "learning_rate": 2.5808936825885984e-06, "loss": 1.1677, "step": 18300 }, { "epoch": 2.975997405124878, "grad_norm": 1.5492075681686401, "learning_rate": 2.5606195766766687e-06, "loss": 1.2146, "step": 18350 }, { "epoch": 2.984106389879987, "grad_norm": 2.5447022914886475, "learning_rate": 2.5403454707647395e-06, "loss": 1.1574, "step": 18400 }, { "epoch": 2.9922153746350957, "grad_norm": 1.8842716217041016, "learning_rate": 2.5200713648528103e-06, "loss": 1.2093, "step": 18450 } ], "logging_steps": 50, "max_steps": 24664, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.035905534827063e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }